├── .gitattributes ├── .gitignore ├── CHANGELOG.md ├── CMakeLists.txt ├── LICENSE ├── README.md ├── cmake_uninstall.cmake.in ├── msvc ├── vsTCanny.sln ├── vsTCanny.vcxproj └── vsTCanny.vcxproj.filters └── src ├── VCL2 ├── LICENSE ├── instrset.h ├── instrset_detect.cpp ├── vector_convert.h ├── vectorclass.h ├── vectorf128.h ├── vectorf256.h ├── vectorf256e.h ├── vectorf512.h ├── vectorf512e.h ├── vectori128.h ├── vectori256.h ├── vectori256e.h ├── vectori512.h ├── vectori512e.h ├── vectori512s.h ├── vectori512se.h ├── vectormath_common.h ├── vectormath_exp.h ├── vectormath_hyp.h ├── vectormath_lib.h └── vectormath_trig.h ├── vsTCanny.cpp ├── vsTCanny.h ├── vsTCanny.rc ├── vsTCanny_AVX2.cpp ├── vsTCanny_AVX512.cpp └── vsTCanny_SSE2.cpp /.gitattributes: -------------------------------------------------------------------------------- 1 | #sources 2 | *.c text 3 | *.cc text 4 | *.cxx text 5 | *.cpp text 6 | *.c++ text 7 | *.hpp text 8 | *.h text 9 | *.h++ text 10 | *.hh text 11 | 12 | # Compiled Object files 13 | *.slo binary 14 | *.lo binary 15 | *.o binary 16 | *.obj binary 17 | 18 | # Precompiled Headers 19 | *.gch binary 20 | *.pch binary 21 | 22 | # Compiled Dynamic libraries 23 | *.so binary 24 | *.dylib binary 25 | *.dll binary 26 | 27 | # Compiled Static libraries 28 | *.lai binary 29 | *.la binary 30 | *.a binary 31 | *.lib binary 32 | 33 | # Executables 34 | *.exe binary 35 | *.out binary 36 | *.app binary 37 | ############################################################################### 38 | # Set default behavior to automatically normalize line endings. 
39 | ############################################################################### 40 | * text=auto 41 | 42 | ############################################################################### 43 | # Set the merge driver for project and solution files 44 | # 45 | # Merging from the command prompt will add diff markers to the files if there 46 | # are conflicts (Merging from VS is not affected by the settings below, in VS 47 | # the diff markers are never inserted). Diff markers may cause the following 48 | # file extensions to fail to load in VS. An alternative would be to treat 49 | # these files as binary and thus will always conflict and require user 50 | # intervention with every merge. To do so, just comment the entries below and 51 | # uncomment the group further below 52 | ############################################################################### 53 | 54 | *.sln text eol=crlf 55 | *.csproj text eol=crlf 56 | *.vbproj text eol=crlf 57 | *.vcxproj text eol=crlf 58 | *.vcproj text eol=crlf 59 | *.dbproj text eol=crlf 60 | *.fsproj text eol=crlf 61 | *.lsproj text eol=crlf 62 | *.wixproj text eol=crlf 63 | *.modelproj text eol=crlf 64 | *.sqlproj text eol=crlf 65 | *.wmaproj text eol=crlf 66 | 67 | *.xproj text eol=crlf 68 | *.props text eol=crlf 69 | *.filters text eol=crlf 70 | *.vcxitems text eol=crlf 71 | 72 | 73 | #*.sln merge=binary 74 | #*.csproj merge=binary 75 | #*.vbproj merge=binary 76 | #*.vcxproj merge=binary 77 | #*.vcproj merge=binary 78 | #*.dbproj merge=binary 79 | #*.fsproj merge=binary 80 | #*.lsproj merge=binary 81 | #*.wixproj merge=binary 82 | #*.modelproj merge=binary 83 | #*.sqlproj merge=binary 84 | #*.wwaproj merge=binary 85 | 86 | #*.xproj merge=binary 87 | #*.props merge=binary 88 | #*.filters merge=binary 89 | #*.vcxitems merge=binary 90 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | ## Ignore 
Visual Studio temporary files, build results, and 2 | ## files generated by popular Visual Studio add-ons. 3 | ## 4 | ## Get latest from https://github.com/github/gitignore/blob/master/VisualStudio.gitignore 5 | 6 | # User-specific files 7 | *.rsuser 8 | *.suo 9 | *.user 10 | *.userosscache 11 | *.sln.docstates 12 | 13 | # User-specific files (MonoDevelop/Xamarin Studio) 14 | *.userprefs 15 | 16 | # Build results 17 | [Dd]ebug/ 18 | [Dd]ebugPublic/ 19 | [Rr]elease/ 20 | [Rr]eleases/ 21 | x64/ 22 | x86/ 23 | [Aa][Rr][Mm]/ 24 | [Aa][Rr][Mm]64/ 25 | bld/ 26 | [Bb]in/ 27 | [Oo]bj/ 28 | [Ll]og/ 29 | 30 | # Visual Studio 2015/2017 cache/options directory 31 | .vs/ 32 | # Uncomment if you have tasks that create the project's static files in wwwroot 33 | #wwwroot/ 34 | 35 | # Visual Studio 2017 auto generated files 36 | Generated\ Files/ 37 | 38 | # MSTest test Results 39 | [Tt]est[Rr]esult*/ 40 | [Bb]uild[Ll]og.* 41 | 42 | # NUNIT 43 | *.VisualState.xml 44 | TestResult.xml 45 | 46 | # Build Results of an ATL Project 47 | [Dd]ebugPS/ 48 | [Rr]eleasePS/ 49 | dlldata.c 50 | 51 | # Benchmark Results 52 | BenchmarkDotNet.Artifacts/ 53 | 54 | # .NET Core 55 | project.lock.json 56 | project.fragment.lock.json 57 | artifacts/ 58 | 59 | # StyleCop 60 | StyleCopReport.xml 61 | 62 | # Files built by Visual Studio 63 | *_i.c 64 | *_p.c 65 | *_h.h 66 | *.ilk 67 | *.meta 68 | *.obj 69 | *.iobj 70 | *.pch 71 | *.pdb 72 | *.ipdb 73 | *.pgc 74 | *.pgd 75 | *.rsp 76 | *.sbr 77 | *.tlb 78 | *.tli 79 | *.tlh 80 | *.tmp 81 | *.tmp_proj 82 | *_wpftmp.csproj 83 | *.log 84 | *.vspscc 85 | *.vssscc 86 | .builds 87 | *.pidb 88 | *.svclog 89 | *.scc 90 | 91 | # Chutzpah Test files 92 | _Chutzpah* 93 | 94 | # Visual C++ cache files 95 | ipch/ 96 | *.aps 97 | *.ncb 98 | *.opendb 99 | *.opensdf 100 | *.sdf 101 | *.cachefile 102 | *.VC.db 103 | *.VC.VC.opendb 104 | 105 | # Visual Studio profiler 106 | *.psess 107 | *.vsp 108 | *.vspx 109 | *.sap 110 | 111 | # Visual Studio Trace Files 112 | *.e2e 113 | 
114 | # TFS 2012 Local Workspace 115 | $tf/ 116 | 117 | # Guidance Automation Toolkit 118 | *.gpState 119 | 120 | # ReSharper is a .NET coding add-in 121 | _ReSharper*/ 122 | *.[Rr]e[Ss]harper 123 | *.DotSettings.user 124 | 125 | # JustCode is a .NET coding add-in 126 | .JustCode 127 | 128 | # TeamCity is a build add-in 129 | _TeamCity* 130 | 131 | # DotCover is a Code Coverage Tool 132 | *.dotCover 133 | 134 | # AxoCover is a Code Coverage Tool 135 | .axoCover/* 136 | !.axoCover/settings.json 137 | 138 | # Visual Studio code coverage results 139 | *.coverage 140 | *.coveragexml 141 | 142 | # NCrunch 143 | _NCrunch_* 144 | .*crunch*.local.xml 145 | nCrunchTemp_* 146 | 147 | # MightyMoose 148 | *.mm.* 149 | AutoTest.Net/ 150 | 151 | # Web workbench (sass) 152 | .sass-cache/ 153 | 154 | # Installshield output folder 155 | [Ee]xpress/ 156 | 157 | # DocProject is a documentation generator add-in 158 | DocProject/buildhelp/ 159 | DocProject/Help/*.HxT 160 | DocProject/Help/*.HxC 161 | DocProject/Help/*.hhc 162 | DocProject/Help/*.hhk 163 | DocProject/Help/*.hhp 164 | DocProject/Help/Html2 165 | DocProject/Help/html 166 | 167 | # Click-Once directory 168 | publish/ 169 | 170 | # Publish Web Output 171 | *.[Pp]ublish.xml 172 | *.azurePubxml 173 | # Note: Comment the next line if you want to checkin your web deploy settings, 174 | # but database connection strings (with potential passwords) will be unencrypted 175 | *.pubxml 176 | *.publishproj 177 | 178 | # Microsoft Azure Web App publish settings. Comment the next line if you want to 179 | # checkin your Azure Web App publish settings, but sensitive information contained 180 | # in these scripts will be unencrypted 181 | PublishScripts/ 182 | 183 | # NuGet Packages 184 | *.nupkg 185 | # The packages folder can be ignored because of Package Restore 186 | **/[Pp]ackages/* 187 | # except build/, which is used as an MSBuild target. 
188 | !**/[Pp]ackages/build/ 189 | # Uncomment if necessary however generally it will be regenerated when needed 190 | #!**/[Pp]ackages/repositories.config 191 | # NuGet v3's project.json files produces more ignorable files 192 | *.nuget.props 193 | *.nuget.targets 194 | 195 | # Microsoft Azure Build Output 196 | csx/ 197 | *.build.csdef 198 | 199 | # Microsoft Azure Emulator 200 | ecf/ 201 | rcf/ 202 | 203 | # Windows Store app package directories and files 204 | AppPackages/ 205 | BundleArtifacts/ 206 | Package.StoreAssociation.xml 207 | _pkginfo.txt 208 | *.appx 209 | 210 | # Visual Studio cache files 211 | # files ending in .cache can be ignored 212 | *.[Cc]ache 213 | # but keep track of directories ending in .cache 214 | !*.[Cc]ache/ 215 | 216 | # Others 217 | ClientBin/ 218 | ~$* 219 | *~ 220 | *.dbmdl 221 | *.dbproj.schemaview 222 | *.jfm 223 | *.pfx 224 | *.publishsettings 225 | orleans.codegen.cs 226 | 227 | # Including strong name files can present a security risk 228 | # (https://github.com/github/gitignore/pull/2483#issue-259490424) 229 | #*.snk 230 | 231 | # Since there are multiple workflows, uncomment next line to ignore bower_components 232 | # (https://github.com/github/gitignore/pull/1529#issuecomment-104372622) 233 | #bower_components/ 234 | # ASP.NET Core default setup: bower directory is configured as wwwroot/lib/ and bower restore is true 235 | **/wwwroot/lib/ 236 | 237 | # RIA/Silverlight projects 238 | Generated_Code/ 239 | 240 | # Backup & report files from converting an old project file 241 | # to a newer Visual Studio version. 
Backup files are not needed, 242 | # because we have git ;-) 243 | _UpgradeReport_Files/ 244 | Backup*/ 245 | UpgradeLog*.XML 246 | UpgradeLog*.htm 247 | ServiceFabricBackup/ 248 | *.rptproj.bak 249 | 250 | # SQL Server files 251 | *.mdf 252 | *.ldf 253 | *.ndf 254 | 255 | # Business Intelligence projects 256 | *.rdl.data 257 | *.bim.layout 258 | *.bim_*.settings 259 | *.rptproj.rsuser 260 | 261 | # Microsoft Fakes 262 | FakesAssemblies/ 263 | 264 | # GhostDoc plugin setting file 265 | *.GhostDoc.xml 266 | 267 | # Node.js Tools for Visual Studio 268 | .ntvs_analysis.dat 269 | node_modules/ 270 | 271 | # Visual Studio 6 build log 272 | *.plg 273 | 274 | # Visual Studio 6 workspace options file 275 | *.opt 276 | 277 | # Visual Studio 6 auto-generated workspace file (contains which files were open etc.) 278 | *.vbw 279 | 280 | # Visual Studio LightSwitch build output 281 | **/*.HTMLClient/GeneratedArtifacts 282 | **/*.DesktopClient/GeneratedArtifacts 283 | **/*.DesktopClient/ModelManifest.xml 284 | **/*.Server/GeneratedArtifacts 285 | **/*.Server/ModelManifest.xml 286 | _Pvt_Extensions 287 | 288 | # Paket dependency manager 289 | .paket/paket.exe 290 | paket-files/ 291 | 292 | # FAKE - F# Make 293 | .fake/ 294 | 295 | # JetBrains Rider 296 | .idea/ 297 | *.sln.iml 298 | 299 | # CodeRush personal settings 300 | .cr/personal 301 | 302 | # Python Tools for Visual Studio (PTVS) 303 | __pycache__/ 304 | *.pyc 305 | 306 | # Cake - Uncomment if you are using it 307 | # tools/** 308 | # !tools/packages.config 309 | 310 | # Tabs Studio 311 | *.tss 312 | 313 | # Telerik's JustMock configuration file 314 | *.jmconfig 315 | 316 | # BizTalk build output 317 | *.btp.cs 318 | *.btm.cs 319 | *.odx.cs 320 | *.xsd.cs 321 | 322 | # OpenCover UI analysis results 323 | OpenCover/ 324 | 325 | # Azure Stream Analytics local run output 326 | ASALocalRun/ 327 | 328 | # MSBuild Binary and Structured Log 329 | *.binlog 330 | 331 | # NVidia Nsight GPU debugger configuration file 332 | *.nvuser 
333 | 334 | # MFractors (Xamarin productivity tool) working folder 335 | .mfractor/ 336 | 337 | # Local History for Visual Studio 338 | .localhistory/ 339 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | ##### 1.1.8: 2 | Fixed default sigma_vY when the clip has only one plane. (regression from 1.1.7) 3 | 4 | ##### 1.1.7: 5 | Changed the behavior of default sigma_vU. 6 | 7 | ##### 1.1.6: 8 | Fixed default sigma_U/V/vU/vV for RGB formats. 9 | Changed default sigma_vU/vV. Now they are inherited from sigmaU/V. 10 | 11 | ##### 1.1.5: 12 | Fixed the processing of planes for RGB formats. 13 | Properly clamped float mask to 0-1 range in mode=1. (VS plugin r14) 14 | 15 | ##### 1.1.4: 16 | Fixed the behavior when y/u/v=1. 17 | 18 | ##### 1.1.3: 19 | Fixed the uninitialized variables when the clip has only one plane. 20 | 21 | ##### 1.1.2: 22 | Fixed a bug when sigma=0 and the plane is not processed. 23 | 24 | ##### 1.1.1: 25 | Fixed the processing of clips with one plane. 26 | 27 | ##### 1.1.0: 28 | Changed chroma planes range from -0.5..0.5 to 0.0..1.0 (float clips). (VS plugin r13) 29 | Added AVX512 code. (VS plugin r13) 30 | Added Kroon, Kirsch and FDoG operators. (VS plugin r13) 31 | Renamed `gmmax` parameter to `scale` and changed its default to 1.0. (VS plugin r13) 32 | Changed default sigma_vY from 1.5 to sigmaY. 33 | 34 | ##### 1.0.1: 35 | Fixed sigma for RGB clips. 36 | 37 | ##### 1.0.0: 38 | Port of the VapourSynth plugin TCanny r12. 
39 | -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.16) 2 | 3 | project(libvstcanny LANGUAGES CXX) 4 | 5 | add_library(vstcanny SHARED 6 | src/vsTCanny.cpp 7 | src/vsTCanny_SSE2.cpp 8 | src/vsTCanny_AVX2.cpp 9 | src/vsTCanny_AVX512.cpp 10 | ) 11 | 12 | target_include_directories(vstcanny PRIVATE 13 | ${CMAKE_CURRENT_SOURCE_DIR}/src 14 | /usr/local/include/avisynth 15 | ) 16 | 17 | if (NOT CMAKE_BUILD_TYPE) 18 | set(CMAKE_BUILD_TYPE "Release" CACHE STRING "" FORCE) 19 | endif() 20 | 21 | string(TOLOWER ${CMAKE_BUILD_TYPE} build_type) 22 | if (build_type STREQUAL debug) 23 | target_compile_definitions(vstcanny PRIVATE DEBUG_BUILD) 24 | else (build_type STREQUAL release) 25 | target_compile_definitions(vstcanny PRIVATE RELEASE_BUILD) 26 | endif () 27 | 28 | message(STATUS "Build type - ${CMAKE_BUILD_TYPE}") 29 | 30 | target_compile_features(vstcanny PRIVATE cxx_std_17) 31 | 32 | set_source_files_properties(src/vsTCanny_SSE2.cpp PROPERTIES COMPILE_OPTIONS "-mfpmath=sse;-msse2") 33 | set_source_files_properties(src/vsTCanny_AVX2.cpp PROPERTIES COMPILE_OPTIONS "-mavx2;-mfma") 34 | set_source_files_properties(src/vsTCanny_AVX512.cpp PROPERTIES COMPILE_OPTIONS "-mavx512f;-mavx512bw;-mavx512dq;-mavx512vl;-mfma") 35 | 36 | find_package (Git) 37 | 38 | if (GIT_FOUND) 39 | execute_process (COMMAND ${GIT_EXECUTABLE} describe --tags --abbrev=0 40 | OUTPUT_VARIABLE ver 41 | OUTPUT_STRIP_TRAILING_WHITESPACE 42 | ) 43 | set_target_properties(vstcanny PROPERTIES OUTPUT_NAME "vstcanny.${ver}") 44 | else () 45 | message (STATUS "GIT not found") 46 | endif () 47 | 48 | include(GNUInstallDirs) 49 | 50 | INSTALL(TARGETS vstcanny LIBRARY DESTINATION "${CMAKE_INSTALL_LIBDIR}/avisynth") 51 | 52 | # uninstall target 53 | if(NOT TARGET uninstall) 54 | configure_file( 55 | "${CMAKE_CURRENT_SOURCE_DIR}/cmake_uninstall.cmake.in" 
56 | "${CMAKE_CURRENT_BINARY_DIR}/cmake_uninstall.cmake" 57 | IMMEDIATE @ONLY) 58 | 59 | add_custom_target(uninstall 60 | COMMAND ${CMAKE_COMMAND} -P ${CMAKE_CURRENT_BINARY_DIR}/cmake_uninstall.cmake) 61 | endif() 62 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | GNU GENERAL PUBLIC LICENSE 2 | Version 2, June 1991 3 | 4 | Copyright (C) 1989, 1991 Free Software Foundation, Inc., 5 | 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 6 | Everyone is permitted to copy and distribute verbatim copies 7 | of this license document, but changing it is not allowed. 8 | 9 | Preamble 10 | 11 | The licenses for most software are designed to take away your 12 | freedom to share and change it. By contrast, the GNU General Public 13 | License is intended to guarantee your freedom to share and change free 14 | software--to make sure the software is free for all its users. This 15 | General Public License applies to most of the Free Software 16 | Foundation's software and to any other program whose authors commit to 17 | using it. (Some other Free Software Foundation software is covered by 18 | the GNU Lesser General Public License instead.) You can apply it to 19 | your programs, too. 20 | 21 | When we speak of free software, we are referring to freedom, not 22 | price. Our General Public Licenses are designed to make sure that you 23 | have the freedom to distribute copies of free software (and charge for 24 | this service if you wish), that you receive source code or can get it 25 | if you want it, that you can change the software or use pieces of it 26 | in new free programs; and that you know you can do these things. 27 | 28 | To protect your rights, we need to make restrictions that forbid 29 | anyone to deny you these rights or to ask you to surrender the rights. 
30 | These restrictions translate to certain responsibilities for you if you 31 | distribute copies of the software, or if you modify it. 32 | 33 | For example, if you distribute copies of such a program, whether 34 | gratis or for a fee, you must give the recipients all the rights that 35 | you have. You must make sure that they, too, receive or can get the 36 | source code. And you must show them these terms so they know their 37 | rights. 38 | 39 | We protect your rights with two steps: (1) copyright the software, and 40 | (2) offer you this license which gives you legal permission to copy, 41 | distribute and/or modify the software. 42 | 43 | Also, for each author's protection and ours, we want to make certain 44 | that everyone understands that there is no warranty for this free 45 | software. If the software is modified by someone else and passed on, we 46 | want its recipients to know that what they have is not the original, so 47 | that any problems introduced by others will not reflect on the original 48 | authors' reputations. 49 | 50 | Finally, any free program is threatened constantly by software 51 | patents. We wish to avoid the danger that redistributors of a free 52 | program will individually obtain patent licenses, in effect making the 53 | program proprietary. To prevent this, we have made it clear that any 54 | patent must be licensed for everyone's free use or not licensed at all. 55 | 56 | The precise terms and conditions for copying, distribution and 57 | modification follow. 58 | 59 | GNU GENERAL PUBLIC LICENSE 60 | TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION 61 | 62 | 0. This License applies to any program or other work which contains 63 | a notice placed by the copyright holder saying it may be distributed 64 | under the terms of this General Public License. 
The "Program", below, 65 | refers to any such program or work, and a "work based on the Program" 66 | means either the Program or any derivative work under copyright law: 67 | that is to say, a work containing the Program or a portion of it, 68 | either verbatim or with modifications and/or translated into another 69 | language. (Hereinafter, translation is included without limitation in 70 | the term "modification".) Each licensee is addressed as "you". 71 | 72 | Activities other than copying, distribution and modification are not 73 | covered by this License; they are outside its scope. The act of 74 | running the Program is not restricted, and the output from the Program 75 | is covered only if its contents constitute a work based on the 76 | Program (independent of having been made by running the Program). 77 | Whether that is true depends on what the Program does. 78 | 79 | 1. You may copy and distribute verbatim copies of the Program's 80 | source code as you receive it, in any medium, provided that you 81 | conspicuously and appropriately publish on each copy an appropriate 82 | copyright notice and disclaimer of warranty; keep intact all the 83 | notices that refer to this License and to the absence of any warranty; 84 | and give any other recipients of the Program a copy of this License 85 | along with the Program. 86 | 87 | You may charge a fee for the physical act of transferring a copy, and 88 | you may at your option offer warranty protection in exchange for a fee. 89 | 90 | 2. You may modify your copy or copies of the Program or any portion 91 | of it, thus forming a work based on the Program, and copy and 92 | distribute such modifications or work under the terms of Section 1 93 | above, provided that you also meet all of these conditions: 94 | 95 | a) You must cause the modified files to carry prominent notices 96 | stating that you changed the files and the date of any change. 
97 | 98 | b) You must cause any work that you distribute or publish, that in 99 | whole or in part contains or is derived from the Program or any 100 | part thereof, to be licensed as a whole at no charge to all third 101 | parties under the terms of this License. 102 | 103 | c) If the modified program normally reads commands interactively 104 | when run, you must cause it, when started running for such 105 | interactive use in the most ordinary way, to print or display an 106 | announcement including an appropriate copyright notice and a 107 | notice that there is no warranty (or else, saying that you provide 108 | a warranty) and that users may redistribute the program under 109 | these conditions, and telling the user how to view a copy of this 110 | License. (Exception: if the Program itself is interactive but 111 | does not normally print such an announcement, your work based on 112 | the Program is not required to print an announcement.) 113 | 114 | These requirements apply to the modified work as a whole. If 115 | identifiable sections of that work are not derived from the Program, 116 | and can be reasonably considered independent and separate works in 117 | themselves, then this License, and its terms, do not apply to those 118 | sections when you distribute them as separate works. But when you 119 | distribute the same sections as part of a whole which is a work based 120 | on the Program, the distribution of the whole must be on the terms of 121 | this License, whose permissions for other licensees extend to the 122 | entire whole, and thus to each and every part regardless of who wrote it. 123 | 124 | Thus, it is not the intent of this section to claim rights or contest 125 | your rights to work written entirely by you; rather, the intent is to 126 | exercise the right to control the distribution of derivative or 127 | collective works based on the Program. 
128 | 129 | In addition, mere aggregation of another work not based on the Program 130 | with the Program (or with a work based on the Program) on a volume of 131 | a storage or distribution medium does not bring the other work under 132 | the scope of this License. 133 | 134 | 3. You may copy and distribute the Program (or a work based on it, 135 | under Section 2) in object code or executable form under the terms of 136 | Sections 1 and 2 above provided that you also do one of the following: 137 | 138 | a) Accompany it with the complete corresponding machine-readable 139 | source code, which must be distributed under the terms of Sections 140 | 1 and 2 above on a medium customarily used for software interchange; or, 141 | 142 | b) Accompany it with a written offer, valid for at least three 143 | years, to give any third party, for a charge no more than your 144 | cost of physically performing source distribution, a complete 145 | machine-readable copy of the corresponding source code, to be 146 | distributed under the terms of Sections 1 and 2 above on a medium 147 | customarily used for software interchange; or, 148 | 149 | c) Accompany it with the information you received as to the offer 150 | to distribute corresponding source code. (This alternative is 151 | allowed only for noncommercial distribution and only if you 152 | received the program in object code or executable form with such 153 | an offer, in accord with Subsection b above.) 154 | 155 | The source code for a work means the preferred form of the work for 156 | making modifications to it. For an executable work, complete source 157 | code means all the source code for all modules it contains, plus any 158 | associated interface definition files, plus the scripts used to 159 | control compilation and installation of the executable. 
However, as a 160 | special exception, the source code distributed need not include 161 | anything that is normally distributed (in either source or binary 162 | form) with the major components (compiler, kernel, and so on) of the 163 | operating system on which the executable runs, unless that component 164 | itself accompanies the executable. 165 | 166 | If distribution of executable or object code is made by offering 167 | access to copy from a designated place, then offering equivalent 168 | access to copy the source code from the same place counts as 169 | distribution of the source code, even though third parties are not 170 | compelled to copy the source along with the object code. 171 | 172 | 4. You may not copy, modify, sublicense, or distribute the Program 173 | except as expressly provided under this License. Any attempt 174 | otherwise to copy, modify, sublicense or distribute the Program is 175 | void, and will automatically terminate your rights under this License. 176 | However, parties who have received copies, or rights, from you under 177 | this License will not have their licenses terminated so long as such 178 | parties remain in full compliance. 179 | 180 | 5. You are not required to accept this License, since you have not 181 | signed it. However, nothing else grants you permission to modify or 182 | distribute the Program or its derivative works. These actions are 183 | prohibited by law if you do not accept this License. Therefore, by 184 | modifying or distributing the Program (or any work based on the 185 | Program), you indicate your acceptance of this License to do so, and 186 | all its terms and conditions for copying, distributing or modifying 187 | the Program or works based on it. 188 | 189 | 6. 
Each time you redistribute the Program (or any work based on the 190 | Program), the recipient automatically receives a license from the 191 | original licensor to copy, distribute or modify the Program subject to 192 | these terms and conditions. You may not impose any further 193 | restrictions on the recipients' exercise of the rights granted herein. 194 | You are not responsible for enforcing compliance by third parties to 195 | this License. 196 | 197 | 7. If, as a consequence of a court judgment or allegation of patent 198 | infringement or for any other reason (not limited to patent issues), 199 | conditions are imposed on you (whether by court order, agreement or 200 | otherwise) that contradict the conditions of this License, they do not 201 | excuse you from the conditions of this License. If you cannot 202 | distribute so as to satisfy simultaneously your obligations under this 203 | License and any other pertinent obligations, then as a consequence you 204 | may not distribute the Program at all. For example, if a patent 205 | license would not permit royalty-free redistribution of the Program by 206 | all those who receive copies directly or indirectly through you, then 207 | the only way you could satisfy both it and this License would be to 208 | refrain entirely from distribution of the Program. 209 | 210 | If any portion of this section is held invalid or unenforceable under 211 | any particular circumstance, the balance of the section is intended to 212 | apply and the section as a whole is intended to apply in other 213 | circumstances. 214 | 215 | It is not the purpose of this section to induce you to infringe any 216 | patents or other property right claims or to contest validity of any 217 | such claims; this section has the sole purpose of protecting the 218 | integrity of the free software distribution system, which is 219 | implemented by public license practices. 
Many people have made 220 | generous contributions to the wide range of software distributed 221 | through that system in reliance on consistent application of that 222 | system; it is up to the author/donor to decide if he or she is willing 223 | to distribute software through any other system and a licensee cannot 224 | impose that choice. 225 | 226 | This section is intended to make thoroughly clear what is believed to 227 | be a consequence of the rest of this License. 228 | 229 | 8. If the distribution and/or use of the Program is restricted in 230 | certain countries either by patents or by copyrighted interfaces, the 231 | original copyright holder who places the Program under this License 232 | may add an explicit geographical distribution limitation excluding 233 | those countries, so that distribution is permitted only in or among 234 | countries not thus excluded. In such case, this License incorporates 235 | the limitation as if written in the body of this License. 236 | 237 | 9. The Free Software Foundation may publish revised and/or new versions 238 | of the General Public License from time to time. Such new versions will 239 | be similar in spirit to the present version, but may differ in detail to 240 | address new problems or concerns. 241 | 242 | Each version is given a distinguishing version number. If the Program 243 | specifies a version number of this License which applies to it and "any 244 | later version", you have the option of following the terms and conditions 245 | either of that version or of any later version published by the Free 246 | Software Foundation. If the Program does not specify a version number of 247 | this License, you may choose any version ever published by the Free Software 248 | Foundation. 249 | 250 | 10. If you wish to incorporate parts of the Program into other free 251 | programs whose distribution conditions are different, write to the author 252 | to ask for permission. 
For software which is copyrighted by the Free 253 | Software Foundation, write to the Free Software Foundation; we sometimes 254 | make exceptions for this. Our decision will be guided by the two goals 255 | of preserving the free status of all derivatives of our free software and 256 | of promoting the sharing and reuse of software generally. 257 | 258 | NO WARRANTY 259 | 260 | 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY 261 | FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN 262 | OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES 263 | PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED 264 | OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF 265 | MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS 266 | TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE 267 | PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, 268 | REPAIR OR CORRECTION. 269 | 270 | 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING 271 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR 272 | REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, 273 | INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING 274 | OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED 275 | TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY 276 | YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER 277 | PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE 278 | POSSIBILITY OF SUCH DAMAGES. 
279 | 280 | END OF TERMS AND CONDITIONS 281 | 282 | How to Apply These Terms to Your New Programs 283 | 284 | If you develop a new program, and you want it to be of the greatest 285 | possible use to the public, the best way to achieve this is to make it 286 | free software which everyone can redistribute and change under these terms. 287 | 288 | To do so, attach the following notices to the program. It is safest 289 | to attach them to the start of each source file to most effectively 290 | convey the exclusion of warranty; and each file should have at least 291 | the "copyright" line and a pointer to where the full notice is found. 292 | 293 | MSharpen 294 | Copyright (C) 2020 AvisynthPlus plugins 295 | 296 | This program is free software; you can redistribute it and/or modify 297 | it under the terms of the GNU General Public License as published by 298 | the Free Software Foundation; either version 2 of the License, or 299 | (at your option) any later version. 300 | 301 | This program is distributed in the hope that it will be useful, 302 | but WITHOUT ANY WARRANTY; without even the implied warranty of 303 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 304 | GNU General Public License for more details. 305 | 306 | You should have received a copy of the GNU General Public License along 307 | with this program; if not, write to the Free Software Foundation, Inc., 308 | 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 309 | 310 | Also add information on how to contact you by electronic and paper mail. 311 | 312 | If the program is interactive, make it output a short notice like this 313 | when it starts in an interactive mode: 314 | 315 | Gnomovision version 69, Copyright (C) year name of author 316 | Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. 317 | This is free software, and you are welcome to redistribute it 318 | under certain conditions; type `show c' for details. 
319 | 320 | The hypothetical commands `show w' and `show c' should show the appropriate 321 | parts of the General Public License. Of course, the commands you use may 322 | be called something other than `show w' and `show c'; they could even be 323 | mouse-clicks or menu items--whatever suits your program. 324 | 325 | You should also get your employer (if you work as a programmer) or your 326 | school, if any, to sign a "copyright disclaimer" for the program, if 327 | necessary. Here is a sample; alter the names: 328 | 329 | Yoyodyne, Inc., hereby disclaims all copyright interest in the program 330 | `Gnomovision' (which makes passes at compilers) written by James Hacker. 331 | 332 | {signature of Ty Coon}, 1 April 1989 333 | Ty Coon, President of Vice 334 | 335 | This General Public License does not permit incorporating your program into 336 | proprietary programs. If your program is a subroutine library, you may 337 | consider it more useful to permit linking proprietary applications with the 338 | library. If this is what you want to do, use the GNU Lesser General 339 | Public License instead of this License. 340 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## Description 2 | 3 | Builds an edge map using canny edge detection. 4 | 5 | This is [a port of the VapourSynth plugin TCanny](https://github.com/HomeOfVapourSynthEvolution/VapourSynth-TCanny). 
6 | 7 | ### Requirements: 8 | 9 | - AviSynth 2.60 / AviSynth+ 3.4 or later 10 | 11 | - Microsoft VisualC++ Redistributable Package 2022 (can be downloaded from [here](https://github.com/abbodi1406/vcredist/releases)) (Windows only) 12 | 13 | ### Usage: 14 | 15 | ``` 16 | vsTCanny (clip, float "sigmaY", float "sigmaU", float "sigmaV", float "sigma_vY", float "sigma_vU", float "sigma_vV", float "t_h", float "t_l", int "mode", int "op", float "scale", int "y", int "u", int "v", int "opt") 17 | ``` 18 | 19 | ### Parameters: 20 | 21 | - clip\ 22 | A clip to process. All planar formats are supported. 23 | 24 | - sigmaY, sigmaU, sigmaV\ 25 | Standard deviation of horizontal gaussian blur.\ 26 | Must be positive value.\ 27 | Setting to 0 disables gaussian blur.\ 28 | Default: 29 | - sigmaY = 1.5 30 | - sigmaU = sigmaY / horizontal_chroma_subsampling_factor 31 | - sigmaV = sigmaU 32 | 33 | - sigma_vY, sigma_vU, sigma_vV\ 34 | Standard deviation of vertical gaussian blur.\ 35 | Must be positive value.\ 36 | Setting to 0 disables gaussian blur.\ 37 | Default: 38 | - sigma_vY = sigmaY 39 | - if sigma_vY not defined: if horizontal and vertical subsampling factors are equal `sigma_vU = sigmaU` else `sigma_vU = sigmaU * horizontal_chroma_subsampling_factor` 40 | - if sigma_vY defined: sigma_vU = sigma_vY / vertical_chroma_subsampling_factor 41 | - sigma_vV = sigma_vU 42 | 43 | - t_h\ 44 | High gradient magnitude threshold for hysteresis.\ 45 | Default: 8.0. 46 | 47 | - t_l\ 48 | Low gradient magnitude threshold for hysteresis.\ 49 | Must be lower than t_h.\ 50 | Default: 1.0. 51 | 52 | - mode\ 53 | Sets output format.\ 54 | -1: Gaussian blur only.\ 55 | 0: Thresholded edge map (2^bitdepth-1 for edge, 0 for non-edge).\ 56 | 1: Gradient magnitude map.\ 57 | Default: 0. 58 | 59 | - op\ 60 | Sets the operator for edge detection.\ 61 | 0: The operator used in tritical's original filter.\ 62 | 1: The Prewitt operator whose use is proposed by P. Zhou et al.
[1]\ 63 | 2: The Sobel operator.\ 64 | 3: The Scharr operator.\ 65 | 4: The Kroon operator.\ 66 | 5: The Kirsch operator.\ 67 | 6: The FDoG operator.\ 68 | Default: 1. 69 | 70 | - scale\ 71 | Multiplies the gradient by `scale`.\ 72 | This can be used to increase or decrease the intensity of edges in the output.\ 73 | Must be greater than 0.0.\ 74 | Default: 1.0. 75 | 76 | - y, u, v\ 77 | Planes to process.\ 78 | 1: Return garbage.\ 79 | 2: Copy plane.\ 80 | 3: Process plane. Always process planes when the clip is RGB.\ 81 | Default: y = u = v = 3. 82 | 83 | - opt\ 84 | Sets which cpu optimizations to use.\ 85 | -1: Auto-detect.\ 86 | 0: Use C++ code.\ 87 | 1: Use SSE2 code.\ 88 | 2: Use AVX2 code.\ 89 | 3: Use AVX512 code.\ 90 | Default: -1. 91 | 92 | [1]: Zhou, P., Ye, W., & Wang, Q. (2011). An Improved Canny Algorithm for Edge Detection. Journal of Computational Information Systems, 7(5), 1516-1523. 93 | 94 | ### Building: 95 | 96 | - Windows\ 97 | Use solution files. 98 | 99 | - Linux 100 | ``` 101 | Requirements: 102 | - Git 103 | - C++17 compiler 104 | - CMake >= 3.16 105 | ``` 106 | ``` 107 | git clone https://github.com/Asd-g/AviSynth-vsTCanny && \ 108 | cd AviSynth-vsTCanny && \ 109 | mkdir build && \ 110 | cd build && \ 111 | cmake .. 
&& \ 112 | make -j$(nproc) && \ 113 | sudo make install 114 | ``` 115 | -------------------------------------------------------------------------------- /cmake_uninstall.cmake.in: -------------------------------------------------------------------------------- 1 | if(NOT EXISTS "@CMAKE_BINARY_DIR@/install_manifest.txt") 2 | message(FATAL_ERROR "Cannot find install manifest: @CMAKE_BINARY_DIR@/install_manifest.txt") 3 | endif() 4 | 5 | file(READ "@CMAKE_BINARY_DIR@/install_manifest.txt" files) 6 | string(REGEX REPLACE "\n" ";" files "${files}") 7 | foreach(file ${files}) 8 | message(STATUS "Uninstalling $ENV{DESTDIR}${file}") 9 | if(IS_SYMLINK "$ENV{DESTDIR}${file}" OR EXISTS "$ENV{DESTDIR}${file}") 10 | exec_program( 11 | "@CMAKE_COMMAND@" ARGS "-E remove \"$ENV{DESTDIR}${file}\"" 12 | OUTPUT_VARIABLE rm_out 13 | RETURN_VALUE rm_retval 14 | ) 15 | if(NOT "${rm_retval}" STREQUAL 0) 16 | message(FATAL_ERROR "Problem when removing $ENV{DESTDIR}${file}") 17 | endif() 18 | else(IS_SYMLINK "$ENV{DESTDIR}${file}" OR EXISTS "$ENV{DESTDIR}${file}") 19 | message(STATUS "File $ENV{DESTDIR}${file} does not exist.") 20 | endif() 21 | endforeach() 22 | -------------------------------------------------------------------------------- /msvc/vsTCanny.sln: -------------------------------------------------------------------------------- 1 |  2 | Microsoft Visual Studio Solution File, Format Version 12.00 3 | # Visual Studio Version 16 4 | VisualStudioVersion = 16.0.30503.244 5 | MinimumVisualStudioVersion = 10.0.40219.1 6 | Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "vsTCanny", "vsTCanny.vcxproj", "{A8044448-4796-42AD-8EFD-B42DE2639A78}" 7 | EndProject 8 | Global 9 | GlobalSection(SolutionConfigurationPlatforms) = preSolution 10 | Debug|x64 = Debug|x64 11 | Debug|x86 = Debug|x86 12 | Release|x64 = Release|x64 13 | Release|x86 = Release|x86 14 | EndGlobalSection 15 | GlobalSection(ProjectConfigurationPlatforms) = postSolution 16 | 
{A8044448-4796-42AD-8EFD-B42DE2639A78}.Debug|x64.ActiveCfg = Debug|x64 17 | {A8044448-4796-42AD-8EFD-B42DE2639A78}.Debug|x64.Build.0 = Debug|x64 18 | {A8044448-4796-42AD-8EFD-B42DE2639A78}.Debug|x86.ActiveCfg = Debug|Win32 19 | {A8044448-4796-42AD-8EFD-B42DE2639A78}.Debug|x86.Build.0 = Debug|Win32 20 | {A8044448-4796-42AD-8EFD-B42DE2639A78}.Release|x64.ActiveCfg = Release|x64 21 | {A8044448-4796-42AD-8EFD-B42DE2639A78}.Release|x64.Build.0 = Release|x64 22 | {A8044448-4796-42AD-8EFD-B42DE2639A78}.Release|x86.ActiveCfg = Release|Win32 23 | {A8044448-4796-42AD-8EFD-B42DE2639A78}.Release|x86.Build.0 = Release|Win32 24 | EndGlobalSection 25 | GlobalSection(SolutionProperties) = preSolution 26 | HideSolutionNode = FALSE 27 | EndGlobalSection 28 | GlobalSection(ExtensibilityGlobals) = postSolution 29 | SolutionGuid = {C1C68C5C-D0F3-4014-8988-9AFE3E9C5787} 30 | EndGlobalSection 31 | EndGlobal 32 | -------------------------------------------------------------------------------- /msvc/vsTCanny.vcxproj: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Debug 6 | Win32 7 | 8 | 9 | Release 10 | Win32 11 | 12 | 13 | Debug 14 | x64 15 | 16 | 17 | Release 18 | x64 19 | 20 | 21 | 22 | 16.0 23 | {A8044448-4796-42AD-8EFD-B42DE2639A78} 24 | Win32Proj 25 | 10.0 26 | 27 | 28 | 29 | Application 30 | true 31 | v142 32 | 33 | 34 | DynamicLibrary 35 | false 36 | llvm 37 | 38 | 39 | DynamicLibrary 40 | true 41 | v142 42 | 43 | 44 | DynamicLibrary 45 | false 46 | llvm 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | true 68 | 69 | 70 | false 71 | ..\..\AviSynthPlus\avs_core\include;$(IncludePath) 72 | 73 | 74 | ..\..\AviSynthPlus\avs_core\include;$(IncludePath) 75 | 76 | 77 | ..\..\AviSynthPlus\avs_core\include;$(IncludePath) 78 | false 79 | 80 | 81 | 82 | 83 | true 84 | 85 | 86 | true 87 | 88 | 89 | 90 | WIN32;_DEBUG;_WINDOWS;%(PreprocessorDefinitions) 91 | MultiThreadedDebugDLL 92 | 
Level3 93 | ProgramDatabase 94 | Disabled 95 | 96 | 97 | MachineX86 98 | true 99 | Windows 100 | 101 | 102 | 103 | 104 | WIN32;NDEBUG;_WINDOWS;%(PreprocessorDefinitions) 105 | MultiThreadedDLL 106 | Level3 107 | ProgramDatabase 108 | stdcpp17 109 | true 110 | AnySuitable 111 | true 112 | Speed 113 | true 114 | true 115 | Precise 116 | 117 | 118 | MachineX86 119 | true 120 | Windows 121 | true 122 | 123 | 124 | 125 | 126 | stdcpp17 127 | 128 | 129 | 130 | 131 | stdcpp17 132 | AnySuitable 133 | true 134 | Speed 135 | true 136 | true 137 | Precise 138 | MultiFile 139 | true 140 | 141 | 142 | true 143 | 144 | 145 | true 146 | kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) 147 | 148 | 149 | 150 | 151 | 152 | AdvancedVectorExtensions2 153 | AdvancedVectorExtensions2 154 | AVX2 155 | AdvancedVectorExtensions2 156 | 157 | 158 | AdvancedVectorExtensions512 159 | AdvancedVectorExtensions512 160 | CORE512 161 | AdvancedVectorExtensions512 162 | 163 | 164 | 165 | 166 | 167 | 168 | 169 | 170 | 171 | 172 | 173 | 174 | -------------------------------------------------------------------------------- /msvc/vsTCanny.vcxproj.filters: -------------------------------------------------------------------------------- 1 |  2 | 3 | 4 | 5 | {4FC737F1-C7A5-4376-A066-2A32D752A2FF} 6 | cpp;c;cc;cxx;def;odl;idl;hpj;bat;asm;asmx 7 | 8 | 9 | {93995380-89BD-4b04-88EB-625FBE52EBFB} 10 | h;hh;hpp;hxx;hm;inl;inc;xsd 11 | 12 | 13 | {67DA6AB6-F800-4c08-8B7A-83BB121AAD01} 14 | rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav 15 | 16 | 17 | 18 | 19 | Source Files 20 | 21 | 22 | Source Files 23 | 24 | 25 | Source Files 26 | 27 | 28 | Source Files 29 | 30 | 31 | 32 | 33 | Header Files 34 | 35 | 36 | 37 | 38 | Resource Files 39 | 40 | 41 | -------------------------------------------------------------------------------- /src/VCL2/LICENSE: 
-------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 
39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. 
Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | 179 | Copyright 2012-2019 Agner Fog. 180 | 181 | Licensed under the Apache License, Version 2.0 (the "License"); 182 | you may not use this file except in compliance with the License. 
183 | You may obtain a copy of the License at 184 | 185 | http://www.apache.org/licenses/LICENSE-2.0 186 | 187 | Unless required by applicable law or agreed to in writing, software 188 | distributed under the License is distributed on an "AS IS" BASIS, 189 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 190 | See the License for the specific language governing permissions and 191 | limitations under the License. 192 | -------------------------------------------------------------------------------- /src/VCL2/instrset_detect.cpp: -------------------------------------------------------------------------------- 1 | /************************** instrset_detect.cpp **************************** 2 | * Author: Agner Fog 3 | * Date created: 2012-05-30 4 | * Last modified: 2019-08-01 5 | * Version: 2.00.00 6 | * Project: vector class library 7 | * Description: 8 | * Functions for checking which instruction sets are supported. 9 | * 10 | * (c) Copyright 2012-2019 Agner Fog. 11 | * Apache License version 2.0 or later. 12 | ******************************************************************************/ 13 | 14 | #include "instrset.h" 15 | 16 | #ifdef VCL_NAMESPACE 17 | namespace VCL_NAMESPACE { 18 | #endif 19 | 20 | 21 | // Define interface to xgetbv instruction 22 | static inline uint64_t xgetbv (int ctr) { 23 | #if (defined (_MSC_FULL_VER) && _MSC_FULL_VER >= 160040000) || (defined (__INTEL_COMPILER) && __INTEL_COMPILER >= 1200) 24 | // Microsoft or Intel compiler supporting _xgetbv intrinsic 25 | 26 | return uint64_t(_xgetbv(ctr)); // intrinsic function for XGETBV 27 | 28 | #elif defined(__GNUC__) || defined (__clang__) // use inline assembly, Gnu/AT&T syntax 29 | 30 | uint32_t a, d; 31 | __asm("xgetbv" : "=a"(a),"=d"(d) : "c"(ctr) : ); 32 | return a | (uint64_t(d) << 32); 33 | 34 | #else // #elif defined (_WIN32) // other compiler. 
try inline assembly with masm/intel/MS syntax 35 | uint32_t a, d; 36 | __asm { 37 | mov ecx, ctr 38 | _emit 0x0f 39 | _emit 0x01 40 | _emit 0xd0 ; // xgetbv 41 | mov a, eax 42 | mov d, edx 43 | } 44 | return a | (uint64_t(d) << 32); 45 | 46 | #endif 47 | } 48 | 49 | /* find supported instruction set 50 | return value: 51 | 0 = 80386 instruction set 52 | 1 or above = SSE (XMM) supported by CPU (not testing for OS support) 53 | 2 or above = SSE2 54 | 3 or above = SSE3 55 | 4 or above = Supplementary SSE3 (SSSE3) 56 | 5 or above = SSE4.1 57 | 6 or above = SSE4.2 58 | 7 or above = AVX supported by CPU and operating system 59 | 8 or above = AVX2 60 | 9 or above = AVX512F 61 | 10 or above = AVX512VL, AVX512BW, AVX512DQ 62 | */ 63 | int instrset_detect(void) { 64 | 65 | static int iset = -1; // remember value for next call 66 | if (iset >= 0) { 67 | return iset; // called before 68 | } 69 | iset = 0; // default value 70 | int abcd[4] = {0,0,0,0}; // cpuid results 71 | cpuid(abcd, 0); // call cpuid function 0 72 | if (abcd[0] == 0) return iset; // no further cpuid function supported 73 | cpuid(abcd, 1); // call cpuid function 1 for feature flags 74 | if ((abcd[3] & (1 << 0)) == 0) return iset; // no floating point 75 | if ((abcd[3] & (1 << 23)) == 0) return iset; // no MMX 76 | if ((abcd[3] & (1 << 15)) == 0) return iset; // no conditional move 77 | if ((abcd[3] & (1 << 24)) == 0) return iset; // no FXSAVE 78 | if ((abcd[3] & (1 << 25)) == 0) return iset; // no SSE 79 | iset = 1; // 1: SSE supported 80 | if ((abcd[3] & (1 << 26)) == 0) return iset; // no SSE2 81 | iset = 2; // 2: SSE2 supported 82 | if ((abcd[2] & (1 << 0)) == 0) return iset; // no SSE3 83 | iset = 3; // 3: SSE3 supported 84 | if ((abcd[2] & (1 << 9)) == 0) return iset; // no SSSE3 85 | iset = 4; // 4: SSSE3 supported 86 | if ((abcd[2] & (1 << 19)) == 0) return iset; // no SSE4.1 87 | iset = 5; // 5: SSE4.1 supported 88 | if ((abcd[2] & (1 << 23)) == 0) return iset; // no POPCNT 89 | if ((abcd[2] & (1 << 
20)) == 0) return iset; // no SSE4.2 90 | iset = 6; // 6: SSE4.2 supported 91 | if ((abcd[2] & (1 << 27)) == 0) return iset; // no OSXSAVE 92 | if ((xgetbv(0) & 6) != 6) return iset; // AVX not enabled in O.S. 93 | if ((abcd[2] & (1 << 28)) == 0) return iset; // no AVX 94 | iset = 7; // 7: AVX supported 95 | cpuid(abcd, 7); // call cpuid leaf 7 for feature flags 96 | if ((abcd[1] & (1 << 5)) == 0) return iset; // no AVX2 97 | iset = 8; 98 | if ((abcd[1] & (1 << 16)) == 0) return iset; // no AVX512 99 | cpuid(abcd, 0xD); // call cpuid leaf 0xD for feature flags 100 | if ((abcd[0] & 0x60) != 0x60) return iset; // no AVX512 101 | iset = 9; 102 | cpuid(abcd, 7); // call cpuid leaf 7 for feature flags 103 | if ((abcd[1] & (1 << 31)) == 0) return iset; // no AVX512VL 104 | if ((abcd[1] & 0x40020000) != 0x40020000) return iset; // no AVX512BW, AVX512DQ 105 | iset = 10; 106 | return iset; 107 | } 108 | 109 | // detect if CPU supports the FMA3 instruction set 110 | bool hasFMA3(void) { 111 | if (instrset_detect() < 7) return false; // must have AVX 112 | int abcd[4]; // cpuid results 113 | cpuid(abcd, 1); // call cpuid function 1 114 | return ((abcd[2] & (1 << 12)) != 0); // ecx bit 12 indicates FMA3 115 | } 116 | 117 | // detect if CPU supports the FMA4 instruction set 118 | bool hasFMA4(void) { 119 | if (instrset_detect() < 7) return false; // must have AVX 120 | int abcd[4]; // cpuid results 121 | cpuid(abcd, 0x80000001); // call cpuid function 0x80000001 122 | return ((abcd[2] & (1 << 16)) != 0); // ecx bit 16 indicates FMA4 123 | } 124 | 125 | // detect if CPU supports the XOP instruction set 126 | bool hasXOP(void) { 127 | if (instrset_detect() < 7) return false; // must have AVX 128 | int abcd[4]; // cpuid results 129 | cpuid(abcd, 0x80000001); // call cpuid function 0x80000001 130 | return ((abcd[2] & (1 << 11)) != 0); // ecx bit 11 indicates XOP 131 | } 132 | 133 | // detect if CPU supports the F16C instruction set 134 | bool hasF16C(void) { 135 | if 
(instrset_detect() < 7) return false; // must have AVX 136 | int abcd[4]; // cpuid results 137 | cpuid(abcd, 1); // call cpuid function 1 138 | return ((abcd[2] & (1 << 29)) != 0); // ecx bit 29 indicates F16C 139 | } 140 | 141 | // detect if CPU supports the AVX512ER instruction set 142 | bool hasAVX512ER(void) { 143 | if (instrset_detect() < 9) return false; // must have AVX512F 144 | int abcd[4]; // cpuid results 145 | cpuid(abcd, 7); // call cpuid function 7 146 | return ((abcd[1] & (1 << 27)) != 0); // ebx bit 27 indicates AVX512ER 147 | } 148 | 149 | // detect if CPU supports the AVX512VBMI instruction set 150 | bool hasAVX512VBMI(void) { 151 | if (instrset_detect() < 10) return false; // must have AVX512BW 152 | int abcd[4]; // cpuid results 153 | cpuid(abcd, 7); // call cpuid function 7 154 | return ((abcd[2] & (1 << 1)) != 0); // ecx bit 1 indicates AVX512VBMI 155 | } 156 | 157 | // detect if CPU supports the AVX512VBMI2 instruction set 158 | bool hasAVX512VBMI2(void) { 159 | if (instrset_detect() < 10) return false; // must have AVX512BW 160 | int abcd[4]; // cpuid results 161 | cpuid(abcd, 7); // call cpuid function 7 162 | return ((abcd[2] & (1 << 6)) != 0); // ecx bit 6 indicates AVX512VBMI2 163 | } 164 | 165 | #ifdef VCL_NAMESPACE 166 | } 167 | #endif 168 | -------------------------------------------------------------------------------- /src/VCL2/vector_convert.h: -------------------------------------------------------------------------------- 1 | /************************** vector_convert.h ******************************* 2 | * Author: Agner Fog 3 | * Date created: 2014-07-23 4 | * Last modified: 2019-11-17 5 | * Version: 2.01.00 6 | * Project: vector class library 7 | * Description: 8 | * Header file for conversion between different vector classes with different 9 | * sizes. Also includes various generic template functions. 10 | * 11 | * (c) Copyright 2012-2019 Agner Fog. 12 | * Apache License version 2.0 or later.
13 | *****************************************************************************/ 14 | 15 | #ifndef VECTOR_CONVERT_H 16 | #define VECTOR_CONVERT_H 17 | 18 | #ifndef VECTORCLASS_H 19 | #include "vectorclass.h" 20 | #endif 21 | 22 | #if VECTORCLASS_H < 20100 23 | #error Incompatible versions of vector class library mixed 24 | #endif 25 | 26 | #ifdef VCL_NAMESPACE 27 | namespace VCL_NAMESPACE { 28 | #endif 29 | 30 | #if MAX_VECTOR_SIZE >= 256 31 | 32 | /***************************************************************************** 33 | * 34 | * Extend from 128 to 256 bit vectors 35 | * 36 | *****************************************************************************/ 37 | 38 | #if INSTRSET >= 8 // AVX2. 256 bit integer vectors 39 | 40 | // sign extend 41 | static inline Vec16s extend (Vec16c const a) { 42 | return _mm256_cvtepi8_epi16(a); 43 | } 44 | 45 | // zero extend 46 | static inline Vec16us extend (Vec16uc const a) { 47 | return _mm256_cvtepu8_epi16(a); 48 | } 49 | 50 | // sign extend 51 | static inline Vec8i extend (Vec8s const a) { 52 | return _mm256_cvtepi16_epi32(a); 53 | } 54 | 55 | // zero extend 56 | static inline Vec8ui extend (Vec8us const a) { 57 | return _mm256_cvtepu16_epi32(a); 58 | } 59 | 60 | // sign extend 61 | static inline Vec4q extend (Vec4i const a) { 62 | return _mm256_cvtepi32_epi64(a); 63 | } 64 | 65 | // zero extend 66 | static inline Vec4uq extend (Vec4ui const a) { 67 | return _mm256_cvtepu32_epi64(a); 68 | } 69 | 70 | 71 | #else // no AVX2. 
256 bit integer vectors are emulated 72 | 73 | // sign extend and zero extend functions: 74 | static inline Vec16s extend (Vec16c const a) { 75 | return Vec16s(extend_low(a), extend_high(a)); 76 | } 77 | 78 | static inline Vec16us extend (Vec16uc const a) { 79 | return Vec16us(extend_low(a), extend_high(a)); 80 | } 81 | 82 | static inline Vec8i extend (Vec8s const a) { 83 | return Vec8i(extend_low(a), extend_high(a)); 84 | } 85 | 86 | static inline Vec8ui extend (Vec8us const a) { 87 | return Vec8ui(extend_low(a), extend_high(a)); 88 | } 89 | 90 | static inline Vec4q extend (Vec4i const a) { 91 | return Vec4q(extend_low(a), extend_high(a)); 92 | } 93 | 94 | static inline Vec4uq extend (Vec4ui const a) { 95 | return Vec4uq(extend_low(a), extend_high(a)); 96 | } 97 | 98 | #endif // AVX2 99 | 100 | /***************************************************************************** 101 | * 102 | * Conversions between float and double 103 | * 104 | *****************************************************************************/ 105 | #if INSTRSET >= 7 // AVX. 256 bit float vectors 106 | 107 | // float to double 108 | static inline Vec4d to_double (Vec4f const a) { 109 | return _mm256_cvtps_pd(a); 110 | } 111 | 112 | // double to float 113 | static inline Vec4f to_float (Vec4d const a) { 114 | return _mm256_cvtpd_ps(a); 115 | } 116 | 117 | #else // no AVX.
256 bit float vectors are emulated 118 | 119 | // float to double 120 | static inline Vec4d to_double (Vec4f const a) { 121 | Vec2d lo = _mm_cvtps_pd(a); 122 | Vec2d hi = _mm_cvtps_pd(_mm_movehl_ps(a, a)); // move the two upper floats down, then convert them 123 | return Vec4d(lo,hi); 124 | } 125 | 126 | // double to float 127 | static inline Vec4f to_float (Vec4d const a) { 128 | Vec4f lo = _mm_cvtpd_ps(a.get_low()); 129 | Vec4f hi = _mm_cvtpd_ps(a.get_high()); 130 | return _mm_movelh_ps(lo, hi); // join the lower halves of lo and hi 131 | } 132 | 133 | #endif // AVX 134 | 135 | /***************************************************************************** 136 | * 137 | * Reduce from 256 to 128 bit vectors 138 | * 139 | *****************************************************************************/ 140 | #if INSTRSET >= 10 // AVX512VL 141 | 142 | // compress functions. overflow wraps around 143 | static inline Vec16c compress (Vec16s const a) { 144 | return _mm256_cvtepi16_epi8(a); 145 | } 146 | 147 | static inline Vec16uc compress (Vec16us const a) { 148 | return _mm256_cvtepi16_epi8(a); 149 | } 150 | 151 | static inline Vec8s compress (Vec8i const a) { 152 | return _mm256_cvtepi32_epi16(a); 153 | } 154 | 155 | static inline Vec8us compress (Vec8ui const a) { 156 | return _mm256_cvtepi32_epi16(a); 157 | } 158 | 159 | static inline Vec4i compress (Vec4q const a) { 160 | return _mm256_cvtepi64_epi32(a); 161 | } 162 | 163 | static inline Vec4ui compress (Vec4uq const a) { 164 | return _mm256_cvtepi64_epi32(a); 165 | } 166 | 167 | #else // no AVX512VL 168 | 169 | // compress functions.
overflow wraps around 170 | static inline Vec16c compress (Vec16s const a) { 171 | return compress(a.get_low(), a.get_high()); 172 | } 173 | 174 | static inline Vec16uc compress (Vec16us const a) { 175 | return compress(a.get_low(), a.get_high()); 176 | } 177 | 178 | static inline Vec8s compress (Vec8i const a) { 179 | return compress(a.get_low(), a.get_high()); 180 | } 181 | 182 | static inline Vec8us compress (Vec8ui const a) { 183 | return compress(a.get_low(), a.get_high()); 184 | } 185 | 186 | static inline Vec4i compress (Vec4q const a) { 187 | return compress(a.get_low(), a.get_high()); 188 | } 189 | 190 | static inline Vec4ui compress (Vec4uq const a) { 191 | return compress(a.get_low(), a.get_high()); 192 | } 193 | 194 | #endif // AVX512VL 195 | 196 | #endif // MAX_VECTOR_SIZE >= 256 197 | 198 | 199 | #if MAX_VECTOR_SIZE >= 512 200 | 201 | /***************************************************************************** 202 | * 203 | * Extend from 256 to 512 bit vectors 204 | * 205 | *****************************************************************************/ 206 | 207 | #if INSTRSET >= 9 // AVX512.
512 bit integer vectors 208 | 209 | // sign extend 210 | static inline Vec32s extend (Vec32c const a) { 211 | #if INSTRSET >= 10 // AVX512BW 212 | return _mm512_cvtepi8_epi16(a); 213 | #else 214 | return Vec32s(extend_low(a), extend_high(a)); 215 | #endif 216 | } 217 | 218 | // zero extend 219 | static inline Vec32us extend (Vec32uc const a) { 220 | #if INSTRSET >= 10 // AVX512BW 221 | return _mm512_cvtepu8_epi16(a); 222 | #else 223 | return Vec32us(extend_low(a), extend_high(a)); 224 | #endif 225 | } 226 | 227 | // sign extend 228 | static inline Vec16i extend (Vec16s const a) { 229 | return _mm512_cvtepi16_epi32(a); 230 | } 231 | 232 | // zero extend 233 | static inline Vec16ui extend (Vec16us const a) { 234 | return _mm512_cvtepu16_epi32(a); 235 | } 236 | 237 | // sign extend 238 | static inline Vec8q extend (Vec8i const a) { 239 | return _mm512_cvtepi32_epi64(a); 240 | } 241 | 242 | // zero extend 243 | static inline Vec8uq extend (Vec8ui const a) { 244 | return _mm512_cvtepu32_epi64(a); 245 | } 246 | 247 | #else // no AVX512.
512 bit vectors are emulated 248 | 249 | 250 | 251 | // sign extend 252 | static inline Vec32s extend (Vec32c const a) { 253 | return Vec32s(extend_low(a), extend_high(a)); 254 | } 255 | 256 | // zero extend 257 | static inline Vec32us extend (Vec32uc const a) { 258 | return Vec32us(extend_low(a), extend_high(a)); 259 | } 260 | 261 | // sign extend 262 | static inline Vec16i extend (Vec16s const a) { 263 | return Vec16i(extend_low(a), extend_high(a)); 264 | } 265 | 266 | // zero extend 267 | static inline Vec16ui extend (Vec16us const a) { 268 | return Vec16ui(extend_low(a), extend_high(a)); 269 | } 270 | 271 | // sign extend 272 | static inline Vec8q extend (Vec8i const a) { 273 | return Vec8q(extend_low(a), extend_high(a)); 274 | } 275 | 276 | // zero extend 277 | static inline Vec8uq extend (Vec8ui const a) { 278 | return Vec8uq(extend_low(a), extend_high(a)); 279 | } 280 | 281 | #endif // AVX512 282 | 283 | 284 | /***************************************************************************** 285 | * 286 | * Reduce from 512 to 256 bit vectors 287 | * 288 | *****************************************************************************/ 289 | #if INSTRSET >= 9 // AVX512F 290 | 291 | // compress functions.
overflow wraps around 292 | static inline Vec32c compress (Vec32s const a) { 293 | #if INSTRSET >= 10 // AVX512BW 294 | return _mm512_cvtepi16_epi8(a); 295 | #else 296 | return compress(a.get_low(), a.get_high()); 297 | #endif 298 | } 299 | 300 | static inline Vec32uc compress (Vec32us const a) { 301 | return Vec32uc(compress(Vec32s(a))); 302 | } 303 | 304 | static inline Vec16s compress (Vec16i const a) { 305 | return _mm512_cvtepi32_epi16(a); 306 | } 307 | 308 | static inline Vec16us compress (Vec16ui const a) { 309 | return _mm512_cvtepi32_epi16(a); 310 | } 311 | 312 | static inline Vec8i compress (Vec8q const a) { 313 | return _mm512_cvtepi64_epi32(a); 314 | } 315 | 316 | static inline Vec8ui compress (Vec8uq const a) { 317 | return _mm512_cvtepi64_epi32(a); 318 | } 319 | 320 | #else // no AVX512 321 | 322 | // compress functions. overflow wraps around 323 | static inline Vec32c compress (Vec32s const a) { 324 | return compress(a.get_low(), a.get_high()); 325 | } 326 | 327 | static inline Vec32uc compress (Vec32us const a) { 328 | return compress(a.get_low(), a.get_high()); 329 | } 330 | 331 | static inline Vec16s compress (Vec16i const a) { 332 | return compress(a.get_low(), a.get_high()); 333 | } 334 | 335 | static inline Vec16us compress (Vec16ui const a) { 336 | return compress(a.get_low(), a.get_high()); 337 | } 338 | 339 | static inline Vec8i compress (Vec8q const a) { 340 | return compress(a.get_low(), a.get_high()); 341 | } 342 | 343 | static inline Vec8ui compress (Vec8uq const a) { 344 | return compress(a.get_low(), a.get_high()); 345 | } 346 | 347 | #endif // AVX512 348 | 349 | /***************************************************************************** 350 | * 351 | * Conversions between float and double 352 | * 353 | *****************************************************************************/ 354 | 355 | #if INSTRSET >= 9 // AVX512.
512 bit float vectors 356 | 357 | // float to double 358 | static inline Vec8d to_double (Vec8f const a) { 359 | return _mm512_cvtps_pd(a); 360 | } 361 | 362 | // double to float 363 | static inline Vec8f to_float (Vec8d const a) { 364 | return _mm512_cvtpd_ps(a); 365 | } 366 | 367 | #else // no AVX512. 512 bit float vectors are emulated 368 | 369 | // float to double 370 | static inline Vec8d to_double (Vec8f const a) { 371 | Vec4d lo = to_double(a.get_low()); 372 | Vec4d hi = to_double(a.get_high()); 373 | return Vec8d(lo,hi); 374 | } 375 | 376 | // double to float 377 | static inline Vec8f to_float (Vec8d const a) { 378 | Vec4f lo = to_float(a.get_low()); 379 | Vec4f hi = to_float(a.get_high()); 380 | return Vec8f(lo, hi); 381 | } 382 | 383 | #endif 384 | 385 | #endif // MAX_VECTOR_SIZE >= 512 386 | 387 | // double to float 388 | static inline Vec4f to_float (Vec2d const a) { 389 | return _mm_cvtpd_ps(a); 390 | } 391 | 392 | 393 | /***************************************************************************** 394 | * 395 | * Generic template functions 396 | * 397 | * These templates define functions for multiple vector types in one template 398 | * 399 | *****************************************************************************/ 400 | 401 | // horizontal min/max of vector elements 402 | // implemented with universal template, works for all vector types: 403 | // horizontal_min/horizontal_max return the first NAN element if any element is NAN 404 | template <typename T> auto horizontal_min(T const x) { 405 | if constexpr ((T::elementtype() & 16) != 0) { 406 | // T is a float or double vector 407 | if (horizontal_or(is_nan(x))) { 408 | // check for NAN because min does not guarantee NAN propagation 409 | return x[horizontal_find_first(is_nan(x))]; 410 | } 411 | } 412 | return horizontal_min1(x); 413 | } 414 | // horizontal_min1: reduction helper without the NAN check; boolean vectors reduce with horizontal_and 415 | template <typename T> auto horizontal_min1(T const x) { 416 | if constexpr (T::elementtype() <= 3) { // boolean vector type 417 | return horizontal_and(x); 418 | } 419 | else if constexpr (sizeof(T) >= 32) { 420 | // split recursively into smaller vectors 421
| return horizontal_min1(min(x.get_low(), x.get_high())); 422 | } 423 | else if constexpr (T::size() == 2) { 424 | T a = permute2 <1, V_DC>(x); // high half 425 | T b = min(a, x); 426 | return b[0]; 427 | } 428 | else if constexpr (T::size() == 4) { 429 | T a = permute4<2, 3, V_DC, V_DC>(x); // high half 430 | T b = min(a, x); 431 | a = permute4<1, V_DC, V_DC, V_DC>(b); 432 | b = min(a, b); 433 | return b[0]; 434 | } 435 | else if constexpr (T::size() == 8) { 436 | T a = permute8<4, 5, 6, 7, V_DC, V_DC, V_DC, V_DC>(x); // high half 437 | T b = min(a, x); 438 | a = permute8<2, 3, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC>(b); 439 | b = min(a, b); 440 | a = permute8<1, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC>(b); 441 | b = min(a, b); 442 | return b[0]; 443 | } 444 | else { 445 | static_assert(T::size() == 16); // no other size is allowed 446 | T a = permute16<8, 9, 10, 11, 12, 13, 14, 15, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC >(x); // high half 447 | T b = min(a, x); 448 | a = permute16<4, 5, 6, 7, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC>(b); 449 | b = min(a, b); 450 | a = permute16<2, 3, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC>(b); 451 | b = min(a, b); 452 | a = permute16<1, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC>(b); 453 | b = min(a, b); 454 | return b[0]; 455 | } 456 | } 457 | 458 | template <typename T> auto horizontal_max(T const x) { 459 | if constexpr ((T::elementtype() & 16) != 0) { 460 | // T is a float or double vector 461 | if (horizontal_or(is_nan(x))) { 462 | // check for NAN because max does not guarantee NAN propagation 463 | return x[horizontal_find_first(is_nan(x))]; 464 | } 465 | } 466 | return horizontal_max1(x); 467 | } 468 | // horizontal_max1: reduction helper without the NAN check; boolean vectors reduce with horizontal_or 469 | template <typename T> auto horizontal_max1(T const x) { 470 | if constexpr (T::elementtype() <= 3) { // boolean vector type 471 | return horizontal_or(x); 472 | } 473 | else if constexpr (sizeof(T) >= 32) { 474 | //
split recursively into smaller vectors 475 | return horizontal_max1(max(x.get_low(), x.get_high())); 476 | } 477 | else if constexpr (T::size() == 2) { 478 | T a = permute2 <1, V_DC>(x); // high half 479 | T b = max(a, x); 480 | return b[0]; 481 | } 482 | else if constexpr (T::size() == 4) { 483 | T a = permute4<2, 3, V_DC, V_DC>(x); // high half 484 | T b = max(a, x); 485 | a = permute4<1, V_DC, V_DC, V_DC>(b); 486 | b = max(a, b); 487 | return b[0]; 488 | } 489 | else if constexpr (T::size() == 8) { 490 | T a = permute8<4, 5, 6, 7, V_DC, V_DC, V_DC, V_DC>(x); // high half 491 | T b = max(a, x); 492 | a = permute8<2, 3, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC>(b); 493 | b = max(a, b); 494 | a = permute8<1, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC>(b); 495 | b = max(a, b); 496 | return b[0]; 497 | } 498 | else { 499 | static_assert(T::size() == 16); // no other size is allowed 500 | T a = permute16<8, 9, 10, 11, 12, 13, 14, 15, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC >(x); // high half 501 | T b = max(a, x); 502 | a = permute16<4, 5, 6, 7, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC>(b); 503 | b = max(a, b); 504 | a = permute16<2, 3, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC>(b); 505 | b = max(a, b); 506 | a = permute16<1, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC>(b); 507 | b = max(a, b); 508 | return b[0]; 509 | } 510 | } 511 | 512 | // Find first element that is true in a boolean vector. Returns -1 if no element is true 513 | template <typename V> 514 | static inline int horizontal_find_first(V const x) { 515 | static_assert(V::elementtype() == 2 || V::elementtype() == 3, "Boolean vector expected"); 516 | auto bits = to_bits(x); // convert to bits 517 | if (bits == 0) return -1; 518 | if constexpr (V::size() < 32) { 519 | return bit_scan_forward((uint32_t)bits); 520 | } 521 | else { 522 | return bit_scan_forward(bits); 523 | } 524 | } 525 | 526 | // Count the number of elements that are true
in a boolean vector 527 | template <typename V> 528 | static inline int horizontal_count(V const x) { 529 | static_assert(V::elementtype() == 2 || V::elementtype() == 3, "Boolean vector expected"); 530 | auto bits = to_bits(x); // convert to bits 531 | if constexpr (V::size() < 32) { 532 | return vml_popcnt((uint32_t)bits); 533 | } 534 | else { 535 | return (int)vml_popcnt(bits); 536 | } 537 | } 538 | 539 | // maximum and minimum functions. This version is sure to propagate NANs, 540 | // conforming to the new IEEE-754 2019 standard 541 | template <typename V> 542 | static inline V maximum(V const a, V const b) { 543 | if constexpr (V::elementtype() < 16) { 544 | return max(a, b); // integer type 545 | } 546 | else { // float or double vector 547 | V y = select(is_nan(a), a, max(a, b)); 548 | #ifdef SIGNED_ZERO // pedantic about signed zero 549 | y = select(a == b, a & b, y); // maximum(+0, -0) = +0 550 | #endif 551 | return y; 552 | } 553 | } 554 | 555 | template <typename V> 556 | static inline V minimum(V const a, V const b) { 557 | if constexpr (V::elementtype() < 16) { 558 | return min(a, b); // integer type 559 | } 560 | else { // float or double vector 561 | V y = select(is_nan(a), a, min(a, b)); 562 | #ifdef SIGNED_ZERO // pedantic about signed zero 563 | y = select(a == b, a | b, y); // minimum(+0, -0) = -0 564 | #endif 565 | return y; 566 | } 567 | } 568 | 569 | 570 | #ifdef VCL_NAMESPACE 571 | } 572 | #endif 573 | 574 | #endif // VECTOR_CONVERT_H 575 | -------------------------------------------------------------------------------- /src/VCL2/vectorclass.h: -------------------------------------------------------------------------------- 1 | /**************************** vectorclass.h ******************************** 2 | * Author: Agner Fog 3 | * Date created: 2012-05-30 4 | * Last modified: 2021-08-18 5 | * Version: 2.01.04 6 | * Project: vector class library 7 | * Home: https://github.com/vectorclass 8 | * Description: 9 | * Header file defining vector classes as interface to intrinsic
functions 10 | * in x86 and x86-64 microprocessors with SSE2 and later instruction sets. 11 | * 12 | * Instructions: 13 | * Use Gnu, Clang, Intel or Microsoft C++ compiler. Compile for the desired 14 | * instruction set, which must be at least SSE2. Specify the supported 15 | * instruction set by a command line define, e.g. __SSE4_1__ if the 16 | * compiler does not automatically do so. 17 | * For detailed instructions, see vcl_manual.pdf 18 | * 19 | * Each vector object is represented internally in the CPU as a vector 20 | * register with 128, 256 or 512 bits. 21 | * 22 | * This header file includes the appropriate header files depending on the 23 | * selected instruction set. 24 | * 25 | * (c) Copyright 2012-2021 Agner Fog. 26 | * Apache License version 2.0 or later. 27 | ******************************************************************************/ 28 | #ifndef VECTORCLASS_H 29 | #define VECTORCLASS_H 20103 30 | 31 | // Maximum vector size, bits. Allowed values are 128, 256, 512 32 | #ifndef MAX_VECTOR_SIZE 33 | #define MAX_VECTOR_SIZE 512 34 | #endif 35 | 36 | // Determine instruction set, and define platform-dependent functions 37 | #include "instrset.h" // Select supported instruction set 38 | 39 | #if INSTRSET < 2 // instruction set SSE2 is the minimum 40 | #error Please compile for the SSE2 instruction set or higher 41 | #else 42 | 43 | // Select appropriate .h files depending on instruction set 44 | #include "vectori128.h" // 128-bit integer vectors 45 | #include "vectorf128.h" // 128-bit floating point vectors 46 | 47 | #if MAX_VECTOR_SIZE >= 256 48 | #if INSTRSET >= 8 49 | #include "vectori256.h" // 256-bit integer vectors, requires AVX2 instruction set 50 | #else 51 | #include "vectori256e.h" // 256-bit integer vectors, emulated 52 | #endif // INSTRSET >= 8 53 | #if INSTRSET >= 7 54 | #include "vectorf256.h" // 256-bit floating point vectors, requires AVX instruction set 55 | #else 56 | #include "vectorf256e.h" // 256-bit floating point vectors, 
emulated 57 | #endif // INSTRSET >= 7 58 | #endif // MAX_VECTOR_SIZE >= 256 59 | 60 | #if MAX_VECTOR_SIZE >= 512 61 | #if INSTRSET >= 9 62 | #include "vectori512.h" // 512-bit vectors of 32 and 64 bit integers, requires AVX512F instruction set 63 | #include "vectorf512.h" // 512-bit floating point vectors, requires AVX512F instruction set 64 | #else 65 | #include "vectori512e.h" // 512-bit integer vectors, emulated 66 | #include "vectorf512e.h" // 512-bit floating point vectors, emulated 67 | #endif // INSTRSET >= 9 68 | #if INSTRSET >= 10 69 | #include "vectori512s.h" // 512-bit vectors of 8 and 16 bit integers, requires AVX512BW instruction set 70 | #else 71 | #include "vectori512se.h" // 512-bit vectors of 8 and 16 bit integers, emulated 72 | #endif 73 | #endif // MAX_VECTOR_SIZE >= 512 74 | 75 | #include "vector_convert.h" // conversion between different vector sizes 76 | 77 | #endif // INSTRSET >= 2 78 | 79 | 80 | #else // VECTORCLASS_H 81 | 82 | #if VECTORCLASS_H < 20000 83 | #error Mixed versions of vector class library 84 | #endif 85 | 86 | #endif // VECTORCLASS_H 87 | -------------------------------------------------------------------------------- /src/VCL2/vectormath_common.h: -------------------------------------------------------------------------------- 1 | /*************************** vectormath_common.h **************************** 2 | * Author: Agner Fog 3 | * Date created: 2014-04-18 4 | * Last modified: 2020-06-08 5 | * Version: 2.01.03 6 | * Project: vector classes 7 | * Description: 8 | * Header file containing common code for inline version of mathematical functions. 9 | * 10 | * For detailed instructions, see VectorClass.pdf 11 | * 12 | * (c) Copyright 2014-2020 Agner Fog. 13 | * Apache License version 2.0 or later. 14 | ******************************************************************************/ 15 | 16 | #ifndef VECTORMATH_COMMON_H 17 | #define VECTORMATH_COMMON_H 2 18 | 19 | #ifdef VECTORMATH_LIB_H 20 | #error conflicting header files. 
More than one implementation of mathematical functions included 21 | #endif 22 | 23 | #include 24 | 25 | #ifndef VECTORCLASS_H 26 | #include "vectorclass.h" 27 | #endif 28 | 29 | #if VECTORCLASS_H < 20000 30 | #error Incompatible versions of vector class library mixed 31 | #endif 32 | 33 | 34 | /****************************************************************************** 35 | Define NAN payload values 36 | ******************************************************************************/ 37 | #define NAN_LOG 0x101 // logarithm for x<0 38 | #define NAN_POW 0x102 // negative number raised to non-integer power 39 | #define NAN_HYP 0x104 // acosh for x<1 and atanh for abs(x)>1 40 | 41 | 42 | /****************************************************************************** 43 | Define mathematical constants 44 | ******************************************************************************/ 45 | #define VM_PI 3.14159265358979323846 // pi 46 | #define VM_PI_2 1.57079632679489661923 // pi / 2 47 | #define VM_PI_4 0.785398163397448309616 // pi / 4 48 | #define VM_SQRT2 1.41421356237309504880 // sqrt(2) 49 | #define VM_LOG2E 1.44269504088896340736 // 1/log(2) 50 | #define VM_LOG10E 0.434294481903251827651 // 1/log(10) 51 | #define VM_LOG210 3.321928094887362347808 // log2(10) 52 | #define VM_LN2 0.693147180559945309417 // log(2) 53 | #define VM_LN10 2.30258509299404568402 // log(10) 54 | #define VM_SMALLEST_NORMAL 2.2250738585072014E-308 // smallest normal number, double 55 | #define VM_SMALLEST_NORMALF 1.17549435E-38f // smallest normal number, float 56 | 57 | 58 | #ifdef VCL_NAMESPACE 59 | namespace VCL_NAMESPACE { 60 | #endif 61 | 62 | /****************************************************************************** 63 | templates for producing infinite and nan in desired vector type 64 | ******************************************************************************/ 65 | template 66 | static inline VTYPE infinite_vec(); 67 | 68 | template <> 69 | inline Vec2d infinite_vec() 
{ 70 | return infinite2d(); 71 | } 72 | 73 | template <> 74 | inline Vec4f infinite_vec() { 75 | return infinite4f(); 76 | } 77 | 78 | #if MAX_VECTOR_SIZE >= 256 79 | 80 | template <> 81 | inline Vec4d infinite_vec() { 82 | return infinite4d(); 83 | } 84 | 85 | template <> 86 | inline Vec8f infinite_vec() { 87 | return infinite8f(); 88 | } 89 | 90 | #endif // MAX_VECTOR_SIZE >= 256 91 | 92 | #if MAX_VECTOR_SIZE >= 512 93 | 94 | template <> 95 | inline Vec8d infinite_vec() { 96 | return infinite8d(); 97 | } 98 | 99 | template <> 100 | inline Vec16f infinite_vec() { 101 | return infinite16f(); 102 | } 103 | 104 | #endif // MAX_VECTOR_SIZE >= 512 105 | 106 | 107 | 108 | /****************************************************************************** 109 | * Detect NAN codes 110 | * 111 | * These functions return the code hidden in a NAN. The sign bit is ignored 112 | ******************************************************************************/ 113 | 114 | static inline Vec4ui nan_code(Vec4f const x) { 115 | Vec4ui a = Vec4ui(reinterpret_i(x)); 116 | Vec4ui const n = 0x007FFFFF; 117 | return select(Vec4ib(is_nan(x)), a & n, 0); 118 | } 119 | 120 | // This function returns the code hidden in a NAN. The sign bit is ignored 121 | static inline Vec2uq nan_code(Vec2d const x) { 122 | Vec2uq a = Vec2uq(reinterpret_i(x)); 123 | return select(Vec2qb(is_nan(x)), a << 12 >> (12+29), 0); 124 | } 125 | 126 | #if MAX_VECTOR_SIZE >= 256 127 | 128 | // This function returns the code hidden in a NAN. The sign bit is ignored 129 | static inline Vec8ui nan_code(Vec8f const x) { 130 | Vec8ui a = Vec8ui(reinterpret_i(x)); 131 | Vec8ui const n = 0x007FFFFF; 132 | return select(Vec8ib(is_nan(x)), a & n, 0); 133 | } 134 | 135 | // This function returns the code hidden in a NAN. 
The sign bit is ignored 136 | static inline Vec4uq nan_code(Vec4d const x) { 137 | Vec4uq a = Vec4uq(reinterpret_i(x)); 138 | return select(Vec4qb(is_nan(x)), a << 12 >> (12+29), 0); // shift out the top 12 bits, keep the next 23 bits 139 | } 140 | 141 | #endif // MAX_VECTOR_SIZE >= 256 142 | #if MAX_VECTOR_SIZE >= 512 143 | 144 | // This function returns the code hidden in a NAN. The sign bit is ignored 145 | static inline Vec16ui nan_code(Vec16f const x) { 146 | Vec16ui a = Vec16ui(reinterpret_i(x)); 147 | Vec16ui const n = 0x007FFFFF; 148 | return select(Vec16ib(is_nan(x)), a & n, 0); 149 | } 150 | 151 | // This function returns the code hidden in a NAN. The sign bit is ignored 152 | static inline Vec8uq nan_code(Vec8d const x) { 153 | Vec8uq a = Vec8uq(reinterpret_i(x)); 154 | return select(Vec8qb(is_nan(x)), a << 12 >> (12+29), 0); // shift out the top 12 bits, keep the next 23 bits 155 | } 156 | 157 | #endif // MAX_VECTOR_SIZE >= 512 158 | 159 | 160 | /****************************************************************************** 161 | templates for polynomials 162 | Using Estrin's scheme to make shorter dependency chains and use FMA, starting 163 | longest dependency chains first.
164 | ******************************************************************************/ 165 | 166 | // template 167 | template <typename VTYPE, typename CTYPE> 168 | static inline VTYPE polynomial_2(VTYPE const x, CTYPE c0, CTYPE c1, CTYPE c2) { 169 | // calculates polynomial c2*x^2 + c1*x + c0 170 | // VTYPE may be a vector type, CTYPE is a scalar type 171 | VTYPE x2 = x * x; 172 | //return x2 * c2 + (x * c1 + c0); 173 | return mul_add(x2, c2, mul_add(x, c1, c0)); 174 | } 175 | 176 | template <typename VTYPE, typename CTYPE> 177 | static inline VTYPE polynomial_3(VTYPE const x, CTYPE c0, CTYPE c1, CTYPE c2, CTYPE c3) { 178 | // calculates polynomial c3*x^3 + c2*x^2 + c1*x + c0 179 | // VTYPE may be a vector type, CTYPE is a scalar type 180 | VTYPE x2 = x * x; 181 | //return (c2 + c3*x)*x2 + (c1*x + c0); 182 | return mul_add(mul_add(c3, x, c2), x2, mul_add(c1, x, c0)); 183 | } 184 | 185 | template <typename VTYPE, typename CTYPE> 186 | static inline VTYPE polynomial_4(VTYPE const x, CTYPE c0, CTYPE c1, CTYPE c2, CTYPE c3, CTYPE c4) { 187 | // calculates polynomial c4*x^4 + c3*x^3 + c2*x^2 + c1*x + c0 188 | // VTYPE may be a vector type, CTYPE is a scalar type 189 | VTYPE x2 = x * x; 190 | VTYPE x4 = x2 * x2; 191 | //return (c2+c3*x)*x2 + ((c0+c1*x) + c4*x4); 192 | return mul_add(mul_add(c3, x, c2), x2, mul_add(c1, x, c0) + c4*x4); 193 | } 194 | 195 | template <typename VTYPE, typename CTYPE> 196 | static inline VTYPE polynomial_4n(VTYPE const x, CTYPE c0, CTYPE c1, CTYPE c2, CTYPE c3) { 197 | // calculates polynomial 1*x^4 + c3*x^3 + c2*x^2 + c1*x + c0 198 | // VTYPE may be a vector type, CTYPE is a scalar type 199 | VTYPE x2 = x * x; 200 | VTYPE x4 = x2 * x2; 201 | //return (c2+c3*x)*x2 + ((c0+c1*x) + x4); 202 | return mul_add(mul_add(c3, x, c2), x2, mul_add(c1, x, c0) + x4); 203 | } 204 | 205 | template <typename VTYPE, typename CTYPE> 206 | static inline VTYPE polynomial_5(VTYPE const x, CTYPE c0, CTYPE c1, CTYPE c2, CTYPE c3, CTYPE c4, CTYPE c5) { 207 | // calculates polynomial c5*x^5 + c4*x^4 + c3*x^3 + c2*x^2 + c1*x + c0 208 | // VTYPE may be a vector type, CTYPE is a scalar type 209 | VTYPE x2 = x * x; 210 | VTYPE x4 =
x2 * x2; 211 | //return (c2+c3*x)*x2 + ((c4+c5*x)*x4 + (c0+c1*x)); 212 | return mul_add(mul_add(c3, x, c2), x2, mul_add(mul_add(c5, x, c4), x4, mul_add(c1, x, c0))); 213 | } 214 | 215 | template <typename VTYPE, typename CTYPE> 216 | static inline VTYPE polynomial_5n(VTYPE const x, CTYPE c0, CTYPE c1, CTYPE c2, CTYPE c3, CTYPE c4) { 217 | // calculates polynomial 1*x^5 + c4*x^4 + c3*x^3 + c2*x^2 + c1*x + c0 218 | // VTYPE may be a vector type, CTYPE is a scalar type 219 | VTYPE x2 = x * x; 220 | VTYPE x4 = x2 * x2; 221 | //return (c2+c3*x)*x2 + ((c4+x)*x4 + (c0+c1*x)); 222 | return mul_add(mul_add(c3, x, c2), x2, mul_add(c4 + x, x4, mul_add(c1, x, c0))); 223 | } 224 | 225 | template <typename VTYPE, typename CTYPE> 226 | static inline VTYPE polynomial_6(VTYPE const x, CTYPE c0, CTYPE c1, CTYPE c2, CTYPE c3, CTYPE c4, CTYPE c5, CTYPE c6) { 227 | // calculates polynomial c6*x^6 + c5*x^5 + c4*x^4 + c3*x^3 + c2*x^2 + c1*x + c0 228 | // VTYPE may be a vector type, CTYPE is a scalar type 229 | VTYPE x2 = x * x; 230 | VTYPE x4 = x2 * x2; 231 | //return (c4+c5*x+c6*x2)*x4 + ((c2+c3*x)*x2 + (c0+c1*x)); 232 | return mul_add(mul_add(c6, x2, mul_add(c5, x, c4)), x4, mul_add(mul_add(c3, x, c2), x2, mul_add(c1, x, c0))); 233 | } 234 | 235 | template <typename VTYPE, typename CTYPE> 236 | static inline VTYPE polynomial_6n(VTYPE const x, CTYPE c0, CTYPE c1, CTYPE c2, CTYPE c3, CTYPE c4, CTYPE c5) { 237 | // calculates polynomial 1*x^6 + c5*x^5 + c4*x^4 + c3*x^3 + c2*x^2 + c1*x + c0 238 | // VTYPE may be a vector type, CTYPE is a scalar type 239 | VTYPE x2 = x * x; 240 | VTYPE x4 = x2 * x2; 241 | //return (c4+c5*x+x2)*x4 + ((c2+c3*x)*x2 + (c0+c1*x)); 242 | return mul_add(mul_add(c5, x, c4 + x2), x4, mul_add(mul_add(c3, x, c2), x2, mul_add(c1, x, c0))); 243 | } 244 | 245 | template <typename VTYPE, typename CTYPE> 246 | static inline VTYPE polynomial_7(VTYPE const x, CTYPE c0, CTYPE c1, CTYPE c2, CTYPE c3, CTYPE c4, CTYPE c5, CTYPE c6, CTYPE c7) { 247 | // calculates polynomial c7*x^7 + c6*x^6 + c5*x^5 + c4*x^4 + c3*x^3 + c2*x^2 + c1*x + c0 248 | // VTYPE may be a vector type, CTYPE is a scalar type 249 | VTYPE
x2 = x * x; 250 | VTYPE x4 = x2 * x2; 251 | //return ((c6+c7*x)*x2 + (c4+c5*x))*x4 + ((c2+c3*x)*x2 + (c0+c1*x)); 252 | return mul_add(mul_add(mul_add(c7, x, c6), x2, mul_add(c5, x, c4)), x4, mul_add(mul_add(c3, x, c2), x2, mul_add(c1, x, c0))); 253 | } 254 | 255 | template <typename VTYPE, typename CTYPE> 256 | static inline VTYPE polynomial_8(VTYPE const x, CTYPE c0, CTYPE c1, CTYPE c2, CTYPE c3, CTYPE c4, CTYPE c5, CTYPE c6, CTYPE c7, CTYPE c8) { 257 | // calculates polynomial c8*x^8 + c7*x^7 + c6*x^6 + c5*x^5 + c4*x^4 + c3*x^3 + c2*x^2 + c1*x + c0 258 | // VTYPE may be a vector type, CTYPE is a scalar type 259 | VTYPE x2 = x * x; 260 | VTYPE x4 = x2 * x2; 261 | VTYPE x8 = x4 * x4; 262 | //return ((c6+c7*x)*x2 + (c4+c5*x))*x4 + (c8*x8 + (c2+c3*x)*x2 + (c0+c1*x)); 263 | return mul_add(mul_add(mul_add(c7, x, c6), x2, mul_add(c5, x, c4)), x4, 264 | mul_add(mul_add(c3, x, c2), x2, mul_add(c1, x, c0) + c8*x8)); 265 | } 266 | 267 | template <typename VTYPE, typename CTYPE> 268 | static inline VTYPE polynomial_9(VTYPE const x, CTYPE c0, CTYPE c1, CTYPE c2, CTYPE c3, CTYPE c4, CTYPE c5, CTYPE c6, CTYPE c7, CTYPE c8, CTYPE c9) { 269 | // calculates polynomial c9*x^9 + c8*x^8 + c7*x^7 + c6*x^6 + c5*x^5 + c4*x^4 + c3*x^3 + c2*x^2 + c1*x + c0 270 | // VTYPE may be a vector type, CTYPE is a scalar type 271 | VTYPE x2 = x * x; 272 | VTYPE x4 = x2 * x2; 273 | VTYPE x8 = x4 * x4; 274 | //return (((c6+c7*x)*x2 + (c4+c5*x))*x4 + (c8+c9*x)*x8) + ((c2+c3*x)*x2 + (c0+c1*x)); 275 | return mul_add(mul_add(c9, x, c8), x8, mul_add( 276 | mul_add(mul_add(c7, x, c6), x2, mul_add(c5, x, c4)), x4, 277 | mul_add(mul_add(c3, x, c2), x2, mul_add(c1, x, c0)))); 278 | } 279 | 280 | template <typename VTYPE, typename CTYPE> 281 | static inline VTYPE polynomial_10(VTYPE const x, CTYPE c0, CTYPE c1, CTYPE c2, CTYPE c3, CTYPE c4, CTYPE c5, CTYPE c6, CTYPE c7, CTYPE c8, CTYPE c9, CTYPE c10) { 282 | // calculates polynomial c10*x^10 + c9*x^9 + c8*x^8 + c7*x^7 + c6*x^6 + c5*x^5 + c4*x^4 + c3*x^3 + c2*x^2 + c1*x + c0 283 | // VTYPE may be a vector type, CTYPE is a scalar type 284 | VTYPE x2 = x * x;
285 | VTYPE x4 = x2 * x2; 286 | VTYPE x8 = x4 * x4; 287 | //return (((c6+c7*x)*x2 + (c4+c5*x))*x4 + (c8+c9*x+c10*x2)*x8) + ((c2+c3*x)*x2 + (c0+c1*x)); 288 | return mul_add(mul_add(x2, c10, mul_add(c9, x, c8)), x8, 289 | mul_add(mul_add(mul_add(c7, x, c6), x2, mul_add(c5, x, c4)), x4, 290 | mul_add(mul_add(c3, x, c2), x2, mul_add(c1, x, c0)))); 291 | } 292 | 293 | template <typename VTYPE, typename CTYPE> 294 | static inline VTYPE polynomial_13(VTYPE const x, CTYPE c0, CTYPE c1, CTYPE c2, CTYPE c3, CTYPE c4, CTYPE c5, CTYPE c6, CTYPE c7, CTYPE c8, CTYPE c9, CTYPE c10, CTYPE c11, CTYPE c12, CTYPE c13) { 295 | // calculates polynomial c13*x^13 + c12*x^12 + ... + c1*x + c0 296 | // VTYPE may be a vector type, CTYPE is a scalar type 297 | VTYPE x2 = x * x; 298 | VTYPE x4 = x2 * x2; 299 | VTYPE x8 = x4 * x4; 300 | return mul_add( 301 | mul_add( 302 | mul_add(c13, x, c12), x4, 303 | mul_add(mul_add(c11, x, c10), x2, mul_add(c9, x, c8))), x8, 304 | mul_add( 305 | mul_add(mul_add(c7, x, c6), x2, mul_add(c5, x, c4)), x4, 306 | mul_add(mul_add(c3, x, c2), x2, mul_add(c1, x, c0)))); 307 | } 308 | 309 | 310 | template <typename VTYPE, typename CTYPE> 311 | static inline VTYPE polynomial_13m(VTYPE const x, CTYPE c2, CTYPE c3, CTYPE c4, CTYPE c5, CTYPE c6, CTYPE c7, CTYPE c8, CTYPE c9, CTYPE c10, CTYPE c11, CTYPE c12, CTYPE c13) { 312 | // calculates polynomial c13*x^13 + c12*x^12 + ...
+ x + 0 313 | // VTYPE may be a vector type, CTYPE is a scalar type 314 | VTYPE x2 = x * x; 315 | VTYPE x4 = x2 * x2; 316 | VTYPE x8 = x4 * x4; 317 | // return ((c8+c9*x) + (c10+c11*x)*x2 + (c12+c13*x)*x4)*x8 + (((c6+c7*x)*x2 + (c4+c5*x))*x4 + ((c2+c3*x)*x2 + x)); 318 | return mul_add( 319 | mul_add(mul_add(c13, x, c12), x4, mul_add(mul_add(c11, x, c10), x2, mul_add(c9, x, c8))), x8, 320 | mul_add(mul_add(mul_add(c7, x, c6), x2, mul_add(c5, x, c4)), x4, mul_add(mul_add(c3, x, c2), x2, x))); 321 | } 322 | 323 | #ifdef VCL_NAMESPACE 324 | } 325 | #endif 326 | 327 | #endif 328 | -------------------------------------------------------------------------------- /src/VCL2/vectormath_hyp.h: -------------------------------------------------------------------------------- 1 | /**************************** vectormath_hyp.h ****************************** 2 | * Author: Agner Fog 3 | * Date created: 2014-07-09 4 | * Last modified: 2019-08-01 5 | * Version: 2.00.00 6 | * Project: vector class library 7 | * Description: 8 | * Header file containing inline vector functions of hyperbolic and inverse 9 | * hyperbolic functions: 10 | * sinh hyperbolic sine 11 | * cosh hyperbolic cosine 12 | * tanh hyperbolic tangent 13 | * asinh inverse hyperbolic sine 14 | * acosh inverse hyperbolic cosine 15 | * atanh inverse hyperbolic tangent 16 | * 17 | * Theory, methods and inspiration based partially on these sources: 18 | * > Moshier, Stephen Lloyd Baluk: Methods and programs for mathematical functions. 19 | * Ellis Horwood, 1989. 20 | * > VDT library developed on CERN by Danilo Piparo, Thomas Hauth and 21 | * Vincenzo Innocente, 2012, https://svnweb.cern.ch/trac/vdt 22 | * > Cephes math library by Stephen L. Moshier 1992, 23 | * http://www.netlib.org/cephes/ 24 | * 25 | * For detailed instructions, see vectormath_common.h and vcl_manual.pdf 26 | * 27 | * (c) Copyright 2014-2019 Agner Fog. 28 | * Apache License version 2.0 or later.
29 | ******************************************************************************/ 30 | 31 | #ifndef VECTORMATH_HYP_H 32 | #define VECTORMATH_HYP_H 1 33 | 34 | #include "vectormath_exp.h" 35 | 36 | #ifdef VCL_NAMESPACE 37 | namespace VCL_NAMESPACE { 38 | #endif 39 | 40 | /****************************************************************************** 41 | * Hyperbolic functions 42 | ******************************************************************************/ 43 | 44 | // Template for sinh function, double precision 45 | // This function does not produce denormals 46 | // Template parameters: 47 | // VTYPE: double vector type 48 | template <typename VTYPE> 49 | static inline VTYPE sinh_d(VTYPE const x0) { 50 | // The limit of abs(x) is 709.7, as defined by max_x in vectormath_exp.h for 0.5*exp(x). 51 | 52 | // Coefficients 53 | const double p0 = -3.51754964808151394800E5; 54 | const double p1 = -1.15614435765005216044E4; 55 | const double p2 = -1.63725857525983828727E2; 56 | const double p3 = -7.89474443963537015605E-1; 57 | 58 | const double q0 = -2.11052978884890840399E6; 59 | const double q1 = 3.61578279834431989373E4; 60 | const double q2 = -2.77711081420602794433E2; 61 | const double q3 = 1.0; 62 | 63 | // data vectors 64 | VTYPE x, x2, y1, y2; 65 | 66 | x = abs(x0); 67 | auto x_small = x <= 1.0; // use Pade approximation if abs(x) <= 1 68 | 69 | if (horizontal_or(x_small)) { 70 | // At least one element needs small method 71 | x2 = x*x; 72 | y1 = polynomial_3(x2, p0, p1, p2, p3) / polynomial_3(x2, q0, q1, q2, q3); 73 | y1 = mul_add(y1, x*x2, x); // y1 = x + x2*(x*y1); 74 | } 75 | if (!horizontal_and(x_small)) { 76 | // At least one element needs big method 77 | y2 = exp_d(x); // 0.5 * exp(x) -- NOTE(review): explicit template arguments of exp_d (selecting the 0.5*exp variant) look stripped here; confirm against vectormath_exp.h 78 | y2 -= 0.25 / y2; // - 0.5 * exp(-x) 79 | } 80 | y1 = select(x_small, y1, y2); // choose method 81 | y1 = sign_combine(y1, x0); // get original sign 82 | // you can avoid the sign_combine by replacing x by x0 above, but at a loss of precision 83 | 84 | return y1; 85 | } 86 | 87 |
// instances of sinh_d template 88 | static inline Vec2d sinh(Vec2d const x) { 89 | return sinh_d(x); 90 | } 91 | 92 | #if MAX_VECTOR_SIZE >= 256 93 | static inline Vec4d sinh(Vec4d const x) { 94 | return sinh_d(x); 95 | } 96 | #endif // MAX_VECTOR_SIZE >= 256 97 | 98 | #if MAX_VECTOR_SIZE >= 512 99 | static inline Vec8d sinh(Vec8d const x) { 100 | return sinh_d(x); 101 | } 102 | #endif // MAX_VECTOR_SIZE >= 512 103 | 104 | 105 | // Template for sinh function, single precision 106 | // This function does not produce denormals 107 | // Template parameters: 108 | // VTYPE: float vector type 109 | template 110 | static inline VTYPE sinh_f(VTYPE const x0) { 111 | // The limit of abs(x) is 89.0, as defined by max_x in vectormath_exp.h for 0.5*exp(x). 112 | 113 | // Coefficients 114 | const float r0 = 1.66667160211E-1f; 115 | const float r1 = 8.33028376239E-3f; 116 | const float r2 = 2.03721912945E-4f; 117 | 118 | // data vectors 119 | VTYPE x, x2, y1, y2; 120 | 121 | x = abs(x0); 122 | auto x_small = x <= 1.0f; // use polynomial approximation if abs(x) <= 1 123 | 124 | if (horizontal_or(x_small)) { 125 | // At least one element needs small method 126 | x2 = x*x; 127 | y1 = polynomial_2(x2, r0, r1, r2); 128 | y1 = mul_add(y1, x2*x, x); // y1 = x + x2*(x*y1); 129 | } 130 | if (!horizontal_and(x_small)) { 131 | // At least one element needs big method 132 | y2 = exp_f(x); // 0.5 * exp(x) 133 | y2 -= 0.25f / y2; // - 0.5 * exp(-x) 134 | } 135 | y1 = select(x_small, y1, y2); // choose method 136 | y1 = sign_combine(y1, x0); // get original sign 137 | // you can avoid the sign_combine by replacing x by x0 above, but at a loss of precision 138 | 139 | return y1; 140 | } 141 | 142 | // instances of sinh_f template 143 | static inline Vec4f sinh(Vec4f const x) { 144 | return sinh_f(x); 145 | } 146 | 147 | #if MAX_VECTOR_SIZE >= 256 148 | static inline Vec8f sinh(Vec8f const x) { 149 | return sinh_f(x); 150 | } 151 | #endif // MAX_VECTOR_SIZE >= 256 152 | 153 | #if
MAX_VECTOR_SIZE >= 512 154 | static inline Vec16f sinh(Vec16f const x) { 155 | return sinh_f(x); 156 | } 157 | #endif // MAX_VECTOR_SIZE >= 512 158 | 159 | 160 | // Template for cosh function, double precision 161 | // This function does not produce denormals 162 | // Template parameters: 163 | // VTYPE: double vector type 164 | template 165 | static inline VTYPE cosh_d(VTYPE const x0) { 166 | // The limit of abs(x) is 709.7, as defined by max_x in vectormath_exp.h for 0.5*exp(x). 167 | 168 | // data vectors 169 | VTYPE x, y; 170 | x = abs(x0); 171 | y = exp_d(x); // 0.5 * exp(x) 172 | y += 0.25 / y; // + 0.5 * exp(-x) 173 | return y; 174 | } 175 | 176 | // instances of cosh_d template 177 | static inline Vec2d cosh(Vec2d const x) { 178 | return cosh_d(x); 179 | } 180 | 181 | #if MAX_VECTOR_SIZE >= 256 182 | static inline Vec4d cosh(Vec4d const x) { 183 | return cosh_d(x); 184 | } 185 | #endif // MAX_VECTOR_SIZE >= 256 186 | 187 | #if MAX_VECTOR_SIZE >= 512 188 | static inline Vec8d cosh(Vec8d const x) { 189 | return cosh_d(x); 190 | } 191 | #endif // MAX_VECTOR_SIZE >= 512 192 | 193 | 194 | // Template for cosh function, single precision 195 | // This function does not produce denormals 196 | // Template parameters: 197 | // VTYPE: float vector type 198 | template 199 | static inline VTYPE cosh_f(VTYPE const x0) { 200 | // The limit of abs(x) is 89.0, as defined by max_x in vectormath_exp.h for 0.5*exp(x). 
201 | 202 | // data vectors 203 | VTYPE x, y; 204 | x = abs(x0); 205 | y = exp_f(x); // 0.5 * exp(x) 206 | y += 0.25f / y; // + 0.5 * exp(-x) 207 | return y; 208 | } 209 | 210 | // instances of cosh_f template 211 | static inline Vec4f cosh(Vec4f const x) { 212 | return cosh_f(x); 213 | } 214 | 215 | #if MAX_VECTOR_SIZE >= 256 216 | static inline Vec8f cosh(Vec8f const x) { 217 | return cosh_f(x); 218 | } 219 | #endif // MAX_VECTOR_SIZE >= 256 220 | 221 | #if MAX_VECTOR_SIZE >= 512 222 | static inline Vec16f cosh(Vec16f const x) { 223 | return cosh_f(x); 224 | } 225 | #endif // MAX_VECTOR_SIZE >= 512 226 | 227 | 228 | // Template for tanh function, double precision 229 | // This function does not produce denormals 230 | // Template parameters: 231 | // VTYPE: double vector type 232 | template 233 | static inline VTYPE tanh_d(VTYPE const x0) { 234 | 235 | // Coefficients 236 | const double p0 = -1.61468768441708447952E3; 237 | const double p1 = -9.92877231001918586564E1; 238 | const double p2 = -9.64399179425052238628E-1; 239 | 240 | const double q0 = 4.84406305325125486048E3; 241 | const double q1 = 2.23548839060100448583E3; 242 | const double q2 = 1.12811678491632931402E2; 243 | const double q3 = 1.0; 244 | 245 | // data vectors 246 | VTYPE x, x2, y1, y2; 247 | 248 | x = abs(x0); 249 | auto x_small = x <= 0.625; // use Pade approximation if abs(x) <= 5/8 250 | 251 | if (horizontal_or(x_small)) { 252 | // At least one element needs small method 253 | x2 = x*x; 254 | y1 = polynomial_2(x2, p0, p1, p2) / polynomial_3(x2, q0, q1, q2, q3); 255 | y1 = mul_add(y1, x2*x, x); // y1 = x + x2*(x*y1); 256 | } 257 | if (!horizontal_and(x_small)) { 258 | // At least one element needs big method 259 | y2 = exp(x+x); // exp(2*x) 260 | y2 = 1.0 - 2.0 / (y2 + 1.0); // tanh(x) 261 | } 262 | auto x_big = x > 350.; 263 | y1 = select(x_small, y1, y2); // choose method 264 | y1 = select(x_big, 1.0, y1); // avoid overflow 265 | y1 = sign_combine(y1, x0); // get original sign 266 | return
y1; 267 | } 268 | 269 | // instances of tanh_d template 270 | static inline Vec2d tanh(Vec2d const x) { 271 | return tanh_d(x); 272 | } 273 | 274 | #if MAX_VECTOR_SIZE >= 256 275 | static inline Vec4d tanh(Vec4d const x) { 276 | return tanh_d(x); 277 | } 278 | #endif // MAX_VECTOR_SIZE >= 256 279 | 280 | #if MAX_VECTOR_SIZE >= 512 281 | static inline Vec8d tanh(Vec8d const x) { 282 | return tanh_d(x); 283 | } 284 | #endif // MAX_VECTOR_SIZE >= 512 285 | 286 | 287 | // Template for tanh function, single precision 288 | // This function does not produce denormals 289 | // Template parameters: 290 | // VTYPE: float vector type 291 | template 292 | static inline VTYPE tanh_f(VTYPE const x0) { 293 | // The limit of abs(x) is 89.0, as defined by max_x in vectormath_exp.h for 0.5*exp(x). 294 | 295 | // Coefficients 296 | const float r0 = -3.33332819422E-1f; 297 | const float r1 = 1.33314422036E-1f; 298 | const float r2 = -5.37397155531E-2f; 299 | const float r3 = 2.06390887954E-2f; 300 | const float r4 = -5.70498872745E-3f; 301 | 302 | // data vectors 303 | VTYPE x, x2, y1, y2; 304 | 305 | x = abs(x0); 306 | auto x_small = x <= 0.625f; // use polynomial approximation if abs(x) <= 5/8 307 | 308 | if (horizontal_or(x_small)) { 309 | // At least one element needs small method 310 | x2 = x*x; 311 | y1 = polynomial_4(x2, r0, r1, r2, r3, r4); 312 | y1 = mul_add(y1, x2*x, x); // y1 = x + (x2*x)*y1; 313 | } 314 | if (!horizontal_and(x_small)) { 315 | // At least one element needs big method 316 | y2 = exp(x+x); // exp(2*x) 317 | y2 = 1.0f - 2.0f / (y2 + 1.0f); // tanh(x) 318 | } 319 | auto x_big = x > 44.4f; // elements above this bound are replaced by 1.0 below 320 | y1 = select(x_small, y1, y2); // choose method 321 | y1 = select(x_big, 1.0f, y1); // avoid overflow 322 | y1 = sign_combine(y1, x0); // get original sign 323 | return y1; 324 | } 325 | 326 | // instances of tanh_f template 327 | static inline Vec4f tanh(Vec4f const x) { 328 | return tanh_f(x); 329 | } 330 | 331 | #if MAX_VECTOR_SIZE >= 256 332 | static inline Vec8f
tanh(Vec8f const x) { 333 | return tanh_f(x); 334 | } 335 | #endif // MAX_VECTOR_SIZE >= 256 336 | 337 | #if MAX_VECTOR_SIZE >= 512 338 | static inline Vec16f tanh(Vec16f const x) { 339 | return tanh_f(x); 340 | } 341 | #endif // MAX_VECTOR_SIZE >= 512 342 | 343 | 344 | 345 | /****************************************************************************** 346 | * Inverse hyperbolic functions 347 | ******************************************************************************/ 348 | 349 | // Template for asinh function, double precision 350 | // This function does not produce denormals 351 | // Template parameters: 352 | // VTYPE: double vector type 353 | template 354 | static inline VTYPE asinh_d(VTYPE const x0) { 355 | 356 | // Coefficients 357 | const double p0 = -5.56682227230859640450E0; 358 | const double p1 = -9.09030533308377316566E0; 359 | const double p2 = -4.37390226194356683570E0; 360 | const double p3 = -5.91750212056387121207E-1; 361 | const double p4 = -4.33231683752342103572E-3; 362 | 363 | const double q0 = 3.34009336338516356383E1; 364 | const double q1 = 6.95722521337257608734E1; 365 | const double q2 = 4.86042483805291788324E1; 366 | const double q3 = 1.28757002067426453537E1; 367 | const double q4 = 1.0; 368 | 369 | // data vectors 370 | VTYPE x, x2, y1, y2; 371 | 372 | x2 = x0 * x0; 373 | x = abs(x0); 374 | auto x_small = x <= 0.533; // use Pade approximation if abs(x) <= 0.5 375 | // Both methods give the highest error close to 0.5. 
376 | // This limit is adjusted for minimum error 377 | auto x_huge = x > 1.E20; // simple approximation, avoid overflow 378 | 379 | if (horizontal_or(x_small)) { 380 | // At least one element needs small method 381 | y1 = polynomial_4(x2, p0, p1, p2, p3, p4) / polynomial_4(x2, q0, q1, q2, q3, q4); 382 | y1 = mul_add(y1, x2*x, x); // y1 = x + (x2*x)*y1; 383 | } 384 | if (!horizontal_and(x_small)) { 385 | // At least one element needs big method 386 | y2 = log(x + sqrt(x2 + 1.0)); 387 | if (horizontal_or(x_huge)) { 388 | // At least one element needs huge method to avoid overflow 389 | y2 = select(x_huge, log(x) + VM_LN2, y2); 390 | } 391 | } 392 | y1 = select(x_small, y1, y2); // choose method 393 | y1 = sign_combine(y1, x0); // get original sign 394 | return y1; 395 | } 396 | 397 | // instances of asinh_d template 398 | static inline Vec2d asinh(Vec2d const x) { 399 | return asinh_d(x); 400 | } 401 | 402 | #if MAX_VECTOR_SIZE >= 256 403 | static inline Vec4d asinh(Vec4d const x) { 404 | return asinh_d(x); 405 | } 406 | #endif // MAX_VECTOR_SIZE >= 256 407 | 408 | #if MAX_VECTOR_SIZE >= 512 409 | static inline Vec8d asinh(Vec8d const x) { 410 | return asinh_d(x); 411 | } 412 | #endif // MAX_VECTOR_SIZE >= 512 413 | 414 | 415 | // Template for asinh function, single precision 416 | // This function does not produce denormals 417 | // Template parameters: 418 | // VTYPE: float vector type 419 | template 420 | static inline VTYPE asinh_f(VTYPE const x0) { 421 | 422 | // Coefficients 423 | const float r0 = -1.6666288134E-1f; 424 | const float r1 = 7.4847586088E-2f; 425 | const float r2 = -4.2699340972E-2f; 426 | const float r3 = 2.0122003309E-2f; 427 | 428 | // data vectors 429 | VTYPE x, x2, y1, y2; 430 | 431 | x2 = x0 * x0; 432 | x = abs(x0); 433 | auto x_small = x <= 0.51f; // use polynomial approximation if abs(x) <= 0.5 434 | auto x_huge = x > 1.E10f; // simple approximation, avoid overflow 435 | 436 | if (horizontal_or(x_small)) { 437 | // At least one element
needs small method 438 | y1 = polynomial_3(x2, r0, r1, r2, r3); 439 | y1 = mul_add(y1, x2*x, x); // y1 = x + (x2*x)*y1; 440 | } 441 | if (!horizontal_and(x_small)) { 442 | // At least one element needs big method 443 | y2 = log(x + sqrt(x2 + 1.0f)); 444 | if (horizontal_or(x_huge)) { 445 | // At least one element needs huge method to avoid overflow 446 | y2 = select(x_huge, log(x) + (float)VM_LN2, y2); 447 | } 448 | } 449 | y1 = select(x_small, y1, y2); // choose method 450 | y1 = sign_combine(y1, x0); // get original sign 451 | return y1; 452 | } 453 | 454 | // instances of asinh_f template 455 | static inline Vec4f asinh(Vec4f const x) { 456 | return asinh_f(x); 457 | } 458 | 459 | #if MAX_VECTOR_SIZE >= 256 460 | static inline Vec8f asinh(Vec8f const x) { 461 | return asinh_f(x); 462 | } 463 | #endif // MAX_VECTOR_SIZE >= 256 464 | 465 | #if MAX_VECTOR_SIZE >= 512 466 | static inline Vec16f asinh(Vec16f const x) { 467 | return asinh_f(x); 468 | } 469 | #endif // MAX_VECTOR_SIZE >= 512 470 | 471 | 472 | // Template for acosh function, double precision 473 | // This function does not produce denormals 474 | // Template parameters: 475 | // VTYPE: double vector type 476 | template 477 | static inline VTYPE acosh_d(VTYPE const x0) { 478 | 479 | // Coefficients 480 | const double p0 = 1.10855947270161294369E5; 481 | const double p1 = 1.08102874834699867335E5; 482 | const double p2 = 3.43989375926195455866E4; 483 | const double p3 = 3.94726656571334401102E3; 484 | const double p4 = 1.18801130533544501356E2; 485 | 486 | const double q0 = 7.83869920495893927727E4; 487 | const double q1 = 8.29725251988426222434E4; 488 | const double q2 = 2.97683430363289370382E4; 489 | const double q3 = 4.15352677227719831579E3; 490 | const double q4 = 1.86145380837903397292E2; 491 | const double q5 = 1.0; 492 | 493 | // data vectors 494 | VTYPE x1, y1, y2; 495 | 496 | x1 = x0 - 1.0; 497 | auto undef = x0 < 1.0; // result is NAN 498 | auto x_small = x1 < 0.49; // use Pade approximation 
if abs(x-1) < 0.5 499 | auto x_huge = x1 > 1.E20; // simple approximation, avoid overflow 500 | 501 | if (horizontal_or(x_small)) { 502 | // At least one element needs small method 503 | y1 = sqrt(x1) * (polynomial_4(x1, p0, p1, p2, p3, p4) / polynomial_5(x1, q0, q1, q2, q3, q4, q5)); 504 | // x < 1 generates NAN 505 | y1 = select(undef, nan_vec(NAN_HYP), y1); 506 | } 507 | if (!horizontal_and(x_small)) { 508 | // At least one element needs big method 509 | y2 = log(x0 + sqrt(mul_sub(x0,x0,1.0))); 510 | if (horizontal_or(x_huge)) { 511 | // At least one element needs huge method to avoid overflow 512 | y2 = select(x_huge, log(x0) + VM_LN2, y2); 513 | } 514 | } 515 | y1 = select(x_small, y1, y2); // choose method 516 | return y1; 517 | } 518 | 519 | // instances of acosh_d template 520 | static inline Vec2d acosh(Vec2d const x) { 521 | return acosh_d(x); 522 | } 523 | 524 | #if MAX_VECTOR_SIZE >= 256 525 | static inline Vec4d acosh(Vec4d const x) { 526 | return acosh_d(x); 527 | } 528 | #endif // MAX_VECTOR_SIZE >= 256 529 | 530 | #if MAX_VECTOR_SIZE >= 512 531 | static inline Vec8d acosh(Vec8d const x) { 532 | return acosh_d(x); 533 | } 534 | #endif // MAX_VECTOR_SIZE >= 512 535 | 536 | 537 | // Template for acosh function, single precision 538 | // This function does not produce denormals 539 | // Template parameters: 540 | // VTYPE: float vector type 541 | template 542 | static inline VTYPE acosh_f(VTYPE const x0) { 543 | 544 | // Coefficients 545 | const float r0 = 1.4142135263E0f; 546 | const float r1 = -1.1784741703E-1f; 547 | const float r2 = 2.6454905019E-2f; 548 | const float r3 = -7.5272886713E-3f; 549 | const float r4 = 1.7596881071E-3f; 550 | 551 | // data vectors 552 | VTYPE x1, y1, y2; 553 | 554 | x1 = x0 - 1.0f; 555 | auto undef = x0 < 1.0f; // result is NAN 556 | auto x_small = x1 < 0.49f; // use polynomial approximation if abs(x-1) < 0.5 557 | auto x_huge = x1 > 1.E10f; // simple approximation, avoid overflow 558 | 559 | if (horizontal_or(x_small)) { 560
| // At least one element needs small method 561 | y1 = sqrt(x1) * polynomial_4(x1, r0, r1, r2, r3, r4); 562 | // x < 1 generates NAN 563 | y1 = select(undef, nan_vec(NAN_HYP), y1); 564 | } 565 | if (!horizontal_and(x_small)) { 566 | // At least one element needs big method 567 | y2 = log(x0 + sqrt(mul_sub(x0,x0,1.0))); 568 | if (horizontal_or(x_huge)) { 569 | // At least one element needs huge method to avoid overflow 570 | y2 = select(x_huge, log(x0) + (float)VM_LN2, y2); 571 | } 572 | } 573 | y1 = select(x_small, y1, y2); // choose method 574 | return y1; 575 | } 576 | 577 | // instances of acosh_f template 578 | static inline Vec4f acosh(Vec4f const x) { 579 | return acosh_f(x); 580 | } 581 | 582 | #if MAX_VECTOR_SIZE >= 256 583 | static inline Vec8f acosh(Vec8f const x) { 584 | return acosh_f(x); 585 | } 586 | #endif // MAX_VECTOR_SIZE >= 256 587 | 588 | #if MAX_VECTOR_SIZE >= 512 589 | static inline Vec16f acosh(Vec16f const x) { 590 | return acosh_f(x); 591 | } 592 | #endif // MAX_VECTOR_SIZE >= 512 593 | 594 | 595 | // Template for atanh function, double precision 596 | // This function does not produce denormals 597 | // Template parameters: 598 | // VTYPE: double vector type 599 | template 600 | static inline VTYPE atanh_d(VTYPE const x0) { 601 | 602 | // Coefficients 603 | const double p0 = -3.09092539379866942570E1; 604 | const double p1 = 6.54566728676544377376E1; 605 | const double p2 = -4.61252884198732692637E1; 606 | const double p3 = 1.20426861384072379242E1; 607 | const double p4 = -8.54074331929669305196E-1; 608 | 609 | const double q0 = -9.27277618139601130017E1; 610 | const double q1 = 2.52006675691344555838E2; 611 | const double q2 = -2.49839401325893582852E2; 612 | const double q3 = 1.08938092147140262656E2; 613 | const double q4 = -1.95638849376911654834E1; 614 | const double q5 = 1.0; 615 | 616 | // data vectors 617 | VTYPE x, x2, y1, y2, y3; 618 | 619 | x = abs(x0); 620 | auto x_small = x < 0.5; // use Pade approximation if abs(x) < 0.5 
621 | 622 | if (horizontal_or(x_small)) { 623 | // At least one element needs small method 624 | x2 = x * x; 625 | y1 = polynomial_4(x2, p0, p1, p2, p3, p4) / polynomial_5(x2, q0, q1, q2, q3, q4, q5); 626 | y1 = mul_add(y1, x2*x, x); 627 | } 628 | if (!horizontal_and(x_small)) { 629 | // At least one element needs big method 630 | y2 = log((1.0+x)/(1.0-x)) * 0.5; 631 | // check if out of range 632 | y3 = select(x == 1.0, infinite_vec(), nan_vec(NAN_HYP)); 633 | y2 = select(x >= 1.0, y3, y2); 634 | } 635 | y1 = select(x_small, y1, y2); // choose method 636 | y1 = sign_combine(y1, x0); // get original sign 637 | return y1; 638 | } 639 | 640 | // instances of atanh_d template 641 | static inline Vec2d atanh(Vec2d const x) { 642 | return atanh_d(x); 643 | } 644 | 645 | #if MAX_VECTOR_SIZE >= 256 646 | static inline Vec4d atanh(Vec4d const x) { 647 | return atanh_d(x); 648 | } 649 | #endif // MAX_VECTOR_SIZE >= 256 650 | 651 | #if MAX_VECTOR_SIZE >= 512 652 | static inline Vec8d atanh(Vec8d const x) { 653 | return atanh_d(x); 654 | } 655 | #endif // MAX_VECTOR_SIZE >= 512 656 | 657 | 658 | // Template for atanh function, single precision 659 | // This function does not produce denormals 660 | // Template parameters: 661 | // VTYPE: float vector type 662 | template 663 | static inline VTYPE atanh_f(VTYPE const x0) { 664 | 665 | // Coefficients 666 | const float r0 = 3.33337300303E-1f; 667 | const float r1 = 1.99782164500E-1f; 668 | const float r2 = 1.46691431730E-1f; 669 | const float r3 = 8.24370301058E-2f; 670 | const float r4 = 1.81740078349E-1f; 671 | 672 | // data vectors 673 | VTYPE x, x2, y1, y2, y3; 674 | 675 | x = abs(x0); 676 | auto x_small = x < 0.5f; // use polynomial approximation if abs(x) < 0.5 677 | 678 | if (horizontal_or(x_small)) { 679 | // At least one element needs small method 680 | x2 = x * x; 681 | y1 = polynomial_4(x2, r0, r1, r2, r3, r4); 682 | y1 = mul_add(y1, x2*x, x); 683 | } 684 | if (!horizontal_and(x_small)) { 685 | // At least one
element needs big method 686 | y2 = log((1.0f+x)/(1.0f-x)) * 0.5f; 687 | // check if out of range 688 | y3 = select(x == 1.0f, infinite_vec(), nan_vec(NAN_HYP)); 689 | y2 = select(x >= 1.0f, y3, y2); 690 | } 691 | y1 = select(x_small, y1, y2); // choose method 692 | y1 = sign_combine(y1, x0); // get original sign 693 | return y1; 694 | } 695 | 696 | // instances of atanh_f template 697 | static inline Vec4f atanh(Vec4f const x) { 698 | return atanh_f(x); 699 | } 700 | 701 | #if MAX_VECTOR_SIZE >= 256 702 | static inline Vec8f atanh(Vec8f const x) { 703 | return atanh_f(x); 704 | } 705 | #endif // MAX_VECTOR_SIZE >= 256 706 | 707 | #if MAX_VECTOR_SIZE >= 512 708 | static inline Vec16f atanh(Vec16f const x) { 709 | return atanh_f(x); 710 | } 711 | #endif // MAX_VECTOR_SIZE >= 512 712 | 713 | #ifdef VCL_NAMESPACE 714 | } 715 | #endif 716 | 717 | #endif 718 | -------------------------------------------------------------------------------- /src/vsTCanny.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | #include "vsTCanny.h" 5 | 6 | AVS_FORCEINLINE void* aligned_malloc(size_t size, size_t align) 7 | { 8 | void* result = [&]() { 9 | #ifdef _WIN32 10 | return _aligned_malloc(size, align); 11 | #else 12 | if (posix_memalign(&result, align, size)) 13 | return result = nullptr; 14 | else 15 | return result; 16 | #endif 17 | }(); 18 | 19 | return result; 20 | } 21 | 22 | AVS_FORCEINLINE void aligned_free(void* ptr) 23 | { 24 | #ifdef _WIN32 25 | _aligned_free(ptr); 26 | #else 27 | free(ptr); 28 | #endif 29 | } 30 | 31 | template 32 | static void copyPlane(const T* srcp, float* __restrict dstp, const int width, const int height, const int srcStride, const int dstStride) noexcept 33 | { 34 | for (int y{ 0 }; y < height; ++y) 35 | { 36 | for (int x{ 0 }; x < width; ++x) 37 | dstp[x] = srcp[x]; 38 | 39 | srcp += srcStride; 40 | dstp += dstStride; 41 | } 42 | } 43 | 44 | template 45 | static void 
gaussianBlur(const T* _srcp, float* __restrict temp, float* __restrict dstp, const float* weightsH, const float* weightsV, const int width, const int height, const int srcStride, const int dstStride, const int radiusH, const int radiusV) noexcept 46 | { 47 | const int diameter{ radiusV * 2 + 1 }; 48 | std::unique_ptr srcp{ std::make_unique(diameter) }; 49 | 50 | srcp[radiusV] = _srcp; 51 | for (int i{ 1 }; i <= radiusV; ++i) 52 | srcp[radiusV - i] = srcp[radiusV + i] = srcp[radiusV] + srcStride * i; 53 | 54 | weightsH += radiusH; 55 | 56 | for (int y{ 0 }; y < height; ++y) 57 | { 58 | for (int x{ 0 }; x < width; ++x) 59 | { 60 | float sum{ 0.0f }; 61 | 62 | for (int i{ 0 }; i < diameter; ++i) 63 | sum += srcp[i][x] * weightsV[i]; 64 | 65 | temp[x] = sum; 66 | } 67 | 68 | for (int i{ 1 }; i <= radiusH; ++i) 69 | { 70 | temp[-i] = temp[i]; 71 | temp[width - 1 + i] = temp[width - 1 - i]; 72 | } 73 | 74 | for (int x{ 0 }; x < width; ++x) 75 | { 76 | float sum{ 0.0f }; 77 | 78 | for (int i = -radiusH; i <= radiusH; ++i) 79 | sum += temp[x + i] * weightsH[i]; 80 | 81 | dstp[x] = sum; 82 | } 83 | 84 | for (int i{ 0 }; i < diameter - 1; ++i) 85 | srcp[i] = srcp[i + 1]; 86 | 87 | srcp[diameter - 1] += (y < height - 1 - radiusV) ? 
srcStride : -srcStride;; 88 | 89 | dstp += dstStride; 90 | } 91 | } 92 | 93 | template 94 | static void gaussianBlurV(const T* _srcp, float* __restrict dstp, const float* weights, const int width, const int height, const int srcStride, const int dstStride, const int radius) noexcept 95 | { 96 | const int diameter{ radius * 2 + 1 }; 97 | std::unique_ptr srcp{ std::make_unique(diameter) }; 98 | 99 | srcp[radius] = _srcp; 100 | for (int i{ 1 }; i <= radius; ++i) 101 | srcp[radius - i] = srcp[radius + i] = srcp[radius] + srcStride * i; 102 | 103 | for (int y{ 0 }; y < height; ++y) 104 | { 105 | for (int x{ 0 }; x < width; ++x) 106 | { 107 | float sum{ 0.0f }; 108 | 109 | for (int i{ 0 }; i < diameter; ++i) 110 | sum += srcp[i][x] * weights[i]; 111 | 112 | dstp[x] = sum; 113 | } 114 | 115 | for (int i{ 0 }; i < diameter - 1; ++i) 116 | srcp[i] = srcp[i + 1]; 117 | 118 | srcp[diameter - 1] += (y < height - 1 - radius) ? srcStride : -srcStride; 119 | 120 | dstp += dstStride; 121 | } 122 | } 123 | 124 | template 125 | static void gaussianBlurH(const T* srcp, float* __restrict temp, float* __restrict dstp, const float* weights, const int width, const int height, const int srcStride, const int dstStride, const int radius) noexcept 126 | { 127 | weights += radius; 128 | 129 | for (int y{ 0 }; y < height; ++y) 130 | { 131 | for (int x{ 0 }; x < width; ++x) 132 | temp[x] = srcp[x]; 133 | 134 | for (int i{ 1 }; i <= radius; ++i) 135 | { 136 | temp[-i] = temp[i]; 137 | temp[width - 1 + i] = temp[width - 1 - i]; 138 | } 139 | 140 | for (int x{ 0 }; x < width; ++x) 141 | { 142 | float sum{ 0.0f }; 143 | 144 | for (int i{ -radius }; i <= radius; ++i) 145 | sum += temp[x + i] * weights[i]; 146 | 147 | dstp[x] = sum; 148 | } 149 | 150 | srcp += srcStride; 151 | dstp += dstStride; 152 | } 153 | } 154 | 155 | static void detectEdge(float* __restrict blur, float* __restrict gradient, int* __restrict direction, const int width, const int height, const int stride, const int bgStride, const 
int mode, const int op, const float scale) noexcept 156 | { 157 | float* __restrict cur{ blur }; 158 | float* __restrict next{ blur + bgStride }; 159 | float* __restrict next2{ blur + bgStride * 2 }; 160 | float* __restrict prev{ next }; 161 | float* __restrict prev2{ next2 }; 162 | 163 | cur[-1] = cur[1]; 164 | cur[width] = cur[width - 2]; 165 | 166 | if (op == FDOG) 167 | { 168 | cur[-2] = cur[2]; 169 | cur[width + 1] = cur[width - 3]; 170 | } 171 | 172 | for (int y{ 0 }; y < height; ++y) 173 | { 174 | next[-1] = next[1]; 175 | next[width] = next[width - 2]; 176 | 177 | if (op == FDOG) 178 | { 179 | next[-2] = next[2]; 180 | next[width + 1] = next[width - 3]; 181 | 182 | next2[-1] = next2[1]; 183 | next2[-2] = next2[2]; 184 | next2[width] = next2[width - 2]; 185 | next2[width + 1] = next2[width - 3]; 186 | } 187 | 188 | for (int x{ 0 }; x < width; ++x) 189 | { 190 | float gx, gy; 191 | 192 | if (op != FDOG) 193 | { 194 | const float c1{ prev[x - 1] }; 195 | const float c2{ prev[x] }; 196 | const float c3{ prev[x + 1] }; 197 | const float c4{ cur[x - 1] }; 198 | const float c6{ cur[x + 1] }; 199 | const float c7{ next[x - 1] }; 200 | const float c8{ next[x] }; 201 | const float c9{ next[x + 1] }; 202 | 203 | switch (op) 204 | { 205 | case TRITICAL: 206 | { 207 | gx = c6 - c4; 208 | gy = c2 - c8; 209 | break; 210 | } 211 | case PREWITT: 212 | { 213 | gx = (c3 + c6 + c9 - c1 - c4 - c7) / 2.0f; 214 | gy = (c1 + c2 + c3 - c7 - c8 - c9) / 2.0f; 215 | break; 216 | } 217 | case SOBEL: 218 | { 219 | gx = c3 + 2.0f * c6 + c9 - c1 - 2.0f * c4 - c7; 220 | gy = c1 + 2.0f * c2 + c3 - c7 - 2.0f * c8 - c9; 221 | break; 222 | } 223 | case SCHARR: 224 | { 225 | gx = 3.0f * c3 + 10.0f * c6 + 3.0f * c9 - 3.0f * c1 - 10.0f * c4 - 3.0f * c7; 226 | gy = 3.0f * c1 + 10.0f * c2 + 3.0f * c3 - 3.0f * c7 - 10.0f * c8 - 3.0f * c9; 227 | break; 228 | } 229 | case KROON: 230 | { 231 | gx = 17.0f * c3 + 61.0f * c6 + 17.0f * c9 - 17.0f * c1 - 61.0f * c4 - 17.0f * c7; 232 | gy = 17.0f * c1 + 
61.0f * c2 + 17.0f * c3 - 17.0f * c7 - 61.0f * c8 - 17.0f * c9; 233 | break; 234 | } 235 | case KIRSCH: 236 | { 237 | const float g1{ 5.0f * c1 + 5.0f * c2 + 5.0f * c3 - 3.0f * c4 - 3.0f * c6 - 3.0f * c7 - 3.0f * c8 - 3.0f * c9 }; 238 | const float g2{ 5.0f * c1 + 5.0f * c2 - 3.0f * c3 + 5.0f * c4 - 3.0f * c6 - 3.0f * c7 - 3.0f * c8 - 3.0f * c9 }; 239 | const float g3{ 5.0f * c1 - 3.0f * c2 - 3.0f * c3 + 5.0f * c4 - 3.0f * c6 + 5.0f * c7 - 3.0f * c8 - 3.0f * c9 }; 240 | const float g4{ -3.0f * c1 - 3.0f * c2 - 3.0f * c3 + 5.0f * c4 - 3.0f * c6 + 5.0f * c7 + 5.0f * c8 - 3.0f * c9 }; 241 | const float g5{ -3.0f * c1 - 3.0f * c2 - 3.0f * c3 - 3.0f * c4 - 3.0f * c6 + 5.0f * c7 + 5.0f * c8 + 5.0f * c9 }; 242 | const float g6{ -3.0f * c1 - 3.0f * c2 - 3.0f * c3 - 3.0f * c4 + 5.0f * c6 - 3.0f * c7 + 5.0f * c8 + 5.0f * c9 }; 243 | const float g7{ -3.0f * c1 - 3.0f * c2 + 5.0f * c3 - 3.0f * c4 + 5.0f * c6 - 3.0f * c7 - 3.0f * c8 + 5.0f * c9 }; 244 | const float g8{ -3.0f * c1 + 5.0f * c2 + 5.0f * c3 - 3.0f * c4 + 5.0f * c6 - 3.0f * c7 - 3.0f * c8 - 3.0f * c9 }; 245 | const float g{ std::max({ std::abs(g1), std::abs(g2), std::abs(g3), std::abs(g4), std::abs(g5), std::abs(g6), std::abs(g7), std::abs(g8) }) }; 246 | gradient[x] = g * scale; 247 | break; 248 | } 249 | } 250 | } 251 | else 252 | { 253 | const float c1{ prev2[x - 2] }; 254 | const float c2{ prev2[x - 1] }; 255 | const float c3{ prev2[x] }; 256 | const float c4{ prev2[x + 1] }; 257 | const float c5{ prev2[x + 2] }; 258 | const float c6{ prev[x - 2] }; 259 | const float c7{ prev[x - 1] }; 260 | const float c8{ prev[x] }; 261 | const float c9{ prev[x + 1] }; 262 | const float c10{ prev[x + 2] }; 263 | const float c11{ cur[x - 2] }; 264 | const float c12{ cur[x - 1] }; 265 | const float c14{ cur[x + 1] }; 266 | const float c15{ cur[x + 2] }; 267 | const float c16{ next[x - 2] }; 268 | const float c17{ next[x - 1] }; 269 | const float c18{ next[x] }; 270 | const float c19{ next[x + 1] }; 271 | const float c20{ next[x 
+ 2] }; 272 | const float c21{ next2[x - 2] }; 273 | const float c22{ next2[x - 1] }; 274 | const float c23{ next2[x] }; 275 | const float c24{ next2[x + 1] }; 276 | const float c25{ next2[x + 2] }; 277 | 278 | gx = c5 + 2.0f * c10 + 3.0f * c15 + 2.0f * c20 + c25 + c4 + 2.0f * c9 + 3.0f * c14 + 2.0f * c19 + c24 279 | - c2 - 2.0f * c7 - 3.0f * c12 - 2.0f * c17 - c22 - c1 - 2.0f * c6 - 3.0f * c11 - 2.0f * c16 - c21; 280 | gy = c1 + 2.0f * c2 + 3.0f * c3 + 2.0f * c4 + c5 + c6 + 2.0f * c7 + 3.0f * c8 + 2.0f * c9 + c10 281 | - c16 - 2.0f * c17 - 3.0f * c18 - 2.0f * c19 - c20 - c21 - 2.0f * c22 - 3.0f * c23 - 2.0f * c24 - c25; 282 | } 283 | 284 | if (op != KIRSCH) 285 | { 286 | gx *= scale; 287 | gy *= scale; 288 | gradient[x] = std::sqrt(gx * gx + gy * gy); 289 | } 290 | 291 | if (mode == 0) 292 | { 293 | float dr{ std::atan2(gy, gx) }; 294 | 295 | if (dr < 0.0f) 296 | dr += M_PIF; 297 | 298 | const int bin{ static_cast(dr * 4.0f * M_1_PIF + 0.5f) }; 299 | direction[x] = (bin >= 4) ? 0 : bin; 300 | } 301 | } 302 | 303 | prev2 = prev; 304 | prev = cur; 305 | cur = next; 306 | 307 | if (op != FDOG) 308 | next += (y < height - 2) ? bgStride : -bgStride; 309 | else 310 | { 311 | next = next2; 312 | next2 += (y < height - 3) ? 
bgStride : -bgStride; 313 | } 314 | 315 | gradient += bgStride; 316 | direction += stride; 317 | } 318 | } 319 | 320 | static void nonMaximumSuppression(const int* direction, float* __restrict gradient, float* __restrict blur, const int width, const int height, const int stride, const int bgStride, const int radiusAlign) noexcept 321 | { 322 | const int offsets[]{ 1, -bgStride + 1, -bgStride, -bgStride - 1 }; 323 | 324 | gradient[-1] = gradient[1]; 325 | gradient[-1 + bgStride * (height - 1)] = gradient[1 + bgStride * (height - 1)]; 326 | gradient[width] = gradient[width - 2]; 327 | gradient[width + bgStride * (height - 1)] = gradient[width - 2 + bgStride * (height - 1)]; 328 | std::copy_n(gradient - radiusAlign + bgStride, width + radiusAlign * 2, gradient - radiusAlign - bgStride); 329 | std::copy_n(gradient - radiusAlign + bgStride * (height - 2), width + radiusAlign * 2, gradient - radiusAlign + bgStride * height); 330 | 331 | for (int y{ 0 }; y < height; ++y) 332 | { 333 | for (int x{ 0 }; x < width; ++x) 334 | { 335 | const int offset{ offsets[direction[x]] }; 336 | blur[x] = (gradient[x] >= std::max(gradient[x + offset], gradient[x - offset])) ? gradient[x] : fltLowest; 337 | } 338 | 339 | direction += stride; 340 | gradient += bgStride; 341 | blur += bgStride; 342 | } 343 | } 344 | 345 | template 346 | static void binarizeCE(const float* srcp, T* __restrict dstp, const int width, const int height, const int srcStride, const int dstStride, const int peak) noexcept 347 | { 348 | for (int y{ 0 }; y < height; ++y) 349 | { 350 | for (int x{ 0 }; x < width; ++x) 351 | { 352 | if constexpr (std::is_integral_v) 353 | dstp[x] = (srcp[x] == fltMax) ? static_cast(peak) : 0; 354 | else 355 | dstp[x] = (srcp[x] == fltMax) ? 
1.0f : 0.0f; 356 | } 357 | 358 | srcp += srcStride; 359 | dstp += dstStride; 360 | } 361 | } 362 | 363 | template 364 | static void discretizeGM(const float* srcp, T* __restrict dstp, const int width, const int height, const int srcStride, const int dstStride, const int peak) noexcept 365 | { 366 | for (int y{ 0 }; y < height; ++y) 367 | { 368 | for (int x{ 0 }; x < width; ++x) 369 | { 370 | if constexpr (std::is_integral_v) 371 | dstp[x] = static_cast(std::min(static_cast(srcp[x] + 0.5f), peak)); 372 | else if constexpr (clampFP) 373 | dstp[x] = std::clamp(srcp[x], 0.0f, 1.0f); 374 | else 375 | dstp[x] = srcp[x]; 376 | } 377 | 378 | srcp += srcStride; 379 | dstp += dstStride; 380 | } 381 | } 382 | 383 | template 384 | void vsTCanny::filter_c(PVideoFrame& src, PVideoFrame& dst, IScriptEnvironment* env) noexcept 385 | { 386 | const int planes_y[3]{ PLANAR_Y, PLANAR_U, PLANAR_V }; 387 | const int planes_r[3]{ PLANAR_G, PLANAR_B, PLANAR_R }; 388 | const int* current_planes{ (vi.IsRGB()) ? planes_r : planes_y }; 389 | const int planecount{ std::min(vi.NumComponents(), 3) }; 390 | 391 | for (int i{ 0 }; i < planecount; ++i) 392 | { 393 | const int height{ src->GetHeight(current_planes[i]) }; 394 | 395 | if (process[i] == 3) 396 | { 397 | const size_t stride{ src->GetPitch(current_planes[i]) / sizeof(T) }; 398 | const size_t bgStride{ stride + radiusAlign * 2 }; 399 | const size_t dst_stride{ dst->GetPitch(current_planes[i]) / sizeof(T) }; 400 | const size_t width{ src->GetRowSize(current_planes[i]) / sizeof(T) }; 401 | const T* srcp{ reinterpret_cast(src->GetReadPtr(current_planes[i])) }; 402 | T* dstp{ reinterpret_cast(dst->GetWritePtr(current_planes[i])) }; 403 | 404 | float* blur{ vsTCanny::blur + radiusAlign }; 405 | float* gradient{ vsTCanny::gradient + bgStride + radiusAlign }; 406 | 407 | if (radiusV[i] && radiusH[i]) 408 | gaussianBlur(srcp, gradient, blur, weightsH[i].get(), weightsV[i].get(), width, height, stride, bgStride, radiusH[i], radiusV[i]); 409 | else 
/// Builds a normalised 1-D Gaussian kernel for the given sigma.
///
/// The kernel radius is round(3 * sigma), but never less than 1, and is
/// written to `radius`. The kernel has `2 * radius + 1` taps that sum to 1.
///
/// @param sigma   standard deviation of the Gaussian; callers pass a value > 0
/// @param radius  receives the kernel radius
/// @return        array allocated with new[]; the caller takes ownership and
///                must release it with delete[]
static float* gaussianWeights(const float sigma, int& radius) noexcept
{
    radius = std::max(static_cast<int>(sigma * 3.0f + 0.5f), 1);
    const int taps{ radius * 2 + 1 };

    float* const kernel{ new float[taps]() };
    float total{ 0.0f };

    // Tap i corresponds to the signed offset k = i - radius from the centre.
    for (int i{ 0 }; i < taps; ++i)
    {
        const int k{ i - radius };
        const float w{ std::exp(-(k * k) / (2.0f * sigma * sigma)) };
        kernel[i] = w;
        total += w;
    }

    // Normalise so that the blur preserves overall brightness.
    std::for_each(kernel, kernel + taps, [total](float& w) noexcept { w /= total; });

    return kernel;
}
op, float scale_, int y, int u, int v, int opt, IScriptEnvironment* env) 461 | : GenericVideoFilter(_child), t_h_(t_h), t_l_(t_l), mode_(mode), op_(op), scale(scale_), process{ 0, 0, 0 }, radiusH{ 0, 0, 0 }, radiusV{ 0, 0, 0 } 462 | { 463 | if (!vi.IsPlanar()) 464 | env->ThrowError("vsTCanny: the clip is not in planar format."); 465 | 466 | const int height{ vi.height }; 467 | const int width{ vi.width }; 468 | 469 | if (height < 3) 470 | env->ThrowError("vsTCanny: the clip's height must be at least 3."); 471 | if (t_l_ >= t_h_) 472 | env->ThrowError("vsTCanny: t_h must be greater than t_l."); 473 | if (mode_ < -1 || mode_ > 1) 474 | env->ThrowError("vsTCanny: mode must be -1, 0, or 1."); 475 | if (op_ < 0 || op_ > 6) 476 | env->ThrowError("vsTCanny: op must be 0, 1, 2, 3, 4, 5 or 6."); 477 | if (op_ == 5 && mode == 0) 478 | env->ThrowError("vsTCanny: op=5 cannot be used when mode=0."); 479 | if (scale <= 0.0f) 480 | env->ThrowError("vsTCanny: scale must be greater than 0.0."); 481 | if (opt < -1 || opt > 3) 482 | env->ThrowError("vsTCanny: opt must be between -1..3."); 483 | 484 | const bool avx512{ !!(env->GetCPUFlags() & CPUF_AVX512F) }; 485 | const bool avx2{ !!(env->GetCPUFlags() & CPUF_AVX2) }; 486 | const bool sse2{ !!(env->GetCPUFlags() & CPUF_SSE2) }; 487 | 488 | if (!avx512 && opt == 3) 489 | env->ThrowError("vsTCanny: opt=3 requires AVX512."); 490 | if (!avx2 && opt == 2) 491 | env->ThrowError("vsTCanny: opt=2 requires AVX2."); 492 | if (!sse2 && opt == 1) 493 | env->ThrowError("vsTCanny: opt=1 requires SSE2."); 494 | 495 | const bool rgb{ vi.IsRGB() }; 496 | int sw{ 0 }; 497 | int sh{ 0 }; 498 | const int planecount{ std::min(vi.NumComponents(), 3) }; 499 | 500 | if (planecount > 1) 501 | { 502 | sw = vi.GetPlaneWidthSubsampling((rgb) ? PLANAR_R : PLANAR_U); 503 | sh = vi.GetPlaneHeightSubsampling((rgb) ? PLANAR_R : PLANAR_U); 504 | 505 | if (sigmaU == -1354.4f) 506 | sigmaU = (rgb) ? 
sigmaY : (sigmaY / (1 << sw)); 507 | if (sigmaV == -1354.4f) 508 | sigmaV = sigmaU; 509 | if (sigma_vY == -1354.4f) 510 | { 511 | sigma_vY = sigmaY; 512 | 513 | if (sigma_vU == -1354.4f) 514 | sigma_vU = (sw == sh) ? sigmaU : (sigmaU * (1 << sw)); 515 | } 516 | else 517 | { 518 | if (sigma_vU == -1354.4f) 519 | sigma_vU = (rgb) ? sigma_vY : (sigma_vY / (1 << sh)); 520 | } 521 | if (sigma_vV == -1354.4f) 522 | sigma_vV = sigma_vU; 523 | } 524 | else 525 | { 526 | if (sigma_vY == -1354.4f) 527 | sigma_vY = sigmaY; 528 | } 529 | 530 | const float sigmaH[3]{ sigmaY, sigmaU, sigmaV }; 531 | const float sigmaV_[3]{ sigma_vY, sigma_vU, sigma_vV }; 532 | const int planes[3]{ y, u, v }; 533 | 534 | for (int i{ 0 }; i < planecount; ++i) 535 | { 536 | if (rgb) 537 | process[i] = 3; 538 | else 539 | { 540 | switch (planes[i]) 541 | { 542 | case 3: process[i] = 3; break; 543 | case 2: process[i] = 2; break; 544 | default: process[i] = 1; break; 545 | } 546 | } 547 | 548 | if (sigmaH[i] < 0.0f) 549 | { 550 | const std::string sigmaOrder[3]{ "sigmaY", "sigmaU", "sigmaV" }; 551 | env->ThrowError(std::string{ "vsTCanny: " + sigmaOrder[i] + " must be greater than or equal to 0.0." }.c_str()); 552 | } 553 | if (sigmaV_[i] < 0.0f) 554 | { 555 | const std::string sigmaVOrder[3]{ "sigma_vY", "sigma_vU", "sigma_vV" }; 556 | env->ThrowError(std::string{ "vsTCanny: " + sigmaVOrder[i] + " must be greater than or equal to 0.0." }.c_str()); 557 | } 558 | if (planes[i] < 1 || planes[i] > 3) 559 | env->ThrowError("vsTCanny: y, u, v must be between 1..3."); 560 | 561 | if (process[i] == 3) 562 | { 563 | if (sigmaH[i]) 564 | { 565 | weightsH[i].reset(gaussianWeights(sigmaH[i], radiusH[i])); 566 | 567 | const int width_{ (i && !rgb) ? 
(width >> sw) : width }; 568 | if (width_ < radiusH[i] + 1) 569 | { 570 | const std::string planeOrder[3]{ "first", "second", "third" }; 571 | env->ThrowError(std::string{ "vsTCanny: the " + planeOrder[i] + " plane's width must be greater than or equal to " + std::to_string(radiusH[i] + 1) + " for specified sigma." }.c_str()); 572 | } 573 | } 574 | else 575 | radiusH[i] = 0; 576 | 577 | if (sigmaV_[i]) 578 | { 579 | weightsV[i].reset(gaussianWeights(sigmaV_[i], radiusV[i])); 580 | 581 | const int height_{ (i && !rgb) ? (height >> sh) : height }; 582 | if (height_ < radiusV[i] + 1) 583 | { 584 | const std::string planeOrder[3]{ "first", "second", "third" }; 585 | env->ThrowError(std::string{ "vsTCanny: the " + planeOrder[i] + " plane's height must be greater than or equal to " + std::to_string(radiusV[i] + 1) + " for specified sigma_v." }.c_str()); 586 | } 587 | } 588 | else 589 | radiusV[i] = 0; 590 | } 591 | else 592 | { 593 | radiusH[i] = 0; 594 | radiusV[i] = 0; 595 | } 596 | } 597 | 598 | const int comp_size{ vi.ComponentSize() }; 599 | if (comp_size < 4) 600 | { 601 | peak = (1 << vi.BitsPerComponent()) - 1; 602 | const float scale_{ peak / 255.0f }; 603 | t_h_ *= scale_; 604 | t_l_ *= scale_; 605 | } 606 | else 607 | { 608 | peak = 0; 609 | t_h_ /= 255.0f; 610 | t_l_ /= 255.0f; 611 | } 612 | 613 | int vectorSize; 614 | 615 | if ((avx512 && opt < 0) || opt == 3) 616 | { 617 | vectorSize = 16; 618 | alignment = 64; 619 | 620 | switch (comp_size) 621 | { 622 | case 1: filter = &vsTCanny::filter_avx512; break; 623 | case 2: filter = &vsTCanny::filter_avx512; break; 624 | default: filter = &vsTCanny::filter_avx512; break; 625 | } 626 | } 627 | else if ((avx2 && opt < 0) || opt == 2) 628 | { 629 | vectorSize = 8; 630 | alignment = 32; 631 | 632 | switch (comp_size) 633 | { 634 | case 1: filter = &vsTCanny::filter_avx2; break; 635 | case 2: filter = &vsTCanny::filter_avx2; break; 636 | default: filter = &vsTCanny::filter_avx2; break; 637 | } 638 | } 639 | else if 
((sse2 && opt < 0) || opt == 1) 640 | { 641 | vectorSize = 4; 642 | alignment = 16; 643 | 644 | switch (comp_size) 645 | { 646 | case 1: filter = &vsTCanny::filter_sse2; break; 647 | case 2: filter = &vsTCanny::filter_sse2; break; 648 | default: filter = &vsTCanny::filter_sse2; break; 649 | } 650 | } 651 | else 652 | { 653 | vectorSize = 1; 654 | alignment = 4; 655 | 656 | switch (comp_size) 657 | { 658 | case 1: filter = &vsTCanny::filter_c; break; 659 | case 2: filter = &vsTCanny::filter_c; break; 660 | default: filter = &vsTCanny::filter_c; break; 661 | } 662 | } 663 | 664 | radiusAlign = (std::max({ radiusH[0], radiusH[1], radiusH[2], (op == FDOG) ? 2 : 1 }) + vectorSize - 1) & ~(vectorSize - 1); 665 | 666 | const int pitch{ child->GetFrame(0, env)->GetPitch() / comp_size }; 667 | 668 | blur = reinterpret_cast(aligned_malloc((pitch + radiusAlign * 2) * height * sizeof(float), alignment)); 669 | if (!blur) 670 | env->ThrowError("vsTCanny: malloc failure (blur)."); 671 | 672 | gradient = reinterpret_cast(aligned_malloc((pitch + radiusAlign * 2) * (height + 2) * sizeof(float), alignment)); 673 | if (!gradient) 674 | env->ThrowError("vsTCanny: malloc failure (gradient)."); 675 | 676 | if (mode_ == 0) 677 | { 678 | direction = reinterpret_cast(aligned_malloc(pitch * height * sizeof(int), alignment)); 679 | if (!direction) 680 | env->ThrowError("vsTCanny: malloc failure (direction)."); 681 | 682 | found = std::make_unique(width * height); 683 | } 684 | else 685 | direction = nullptr; 686 | 687 | has_at_least_v8 = true; 688 | try { env->CheckVersion(8); } 689 | catch (const AvisynthError&) { has_at_least_v8 = false; } 690 | } 691 | 692 | vsTCanny::~vsTCanny() 693 | { 694 | aligned_free(blur); 695 | aligned_free(gradient); 696 | 697 | if (direction) 698 | aligned_free(direction); 699 | } 700 | 701 | PVideoFrame __stdcall vsTCanny::GetFrame(int n, IScriptEnvironment* env) 702 | { 703 | PVideoFrame src{ child->GetFrame(n, env) }; 704 | PVideoFrame dst{ (has_at_least_v8) 
? env->NewVideoFrameP(vi, &src) : env->NewVideoFrame(vi) }; 705 | 706 | (this->*filter)(src, dst, env); 707 | 708 | return dst; 709 | } 710 | 711 | AVSValue __cdecl Create_vsTCanny(AVSValue args, void* user_data, IScriptEnvironment* env) 712 | { 713 | const float sigmaY{ args[1].AsFloatf(1.5f) }; 714 | if (sigmaY < 0.0f) 715 | env->ThrowError("vsTCanny: sigmaY must be greater than or equal to 0.0."); 716 | 717 | const float sigmaU{ args[2].AsFloatf(-1354.4f) }; 718 | if (sigmaU < 0.0f && sigmaU != -1354.4f) 719 | env->ThrowError("vsTCanny: sigmaU must be greater than or equal to 0.0."); 720 | 721 | const float sigmaV{ args[3].AsFloatf(sigmaU) }; 722 | if (sigmaV < 0.0f && sigmaV != -1354.4f) 723 | env->ThrowError("vsTCanny: sigmaV must be greater than or equal to 0.0."); 724 | 725 | const float sigma_vY{ args[4].AsFloatf(-1354.4f) }; 726 | if (sigma_vY < 0.0f && sigma_vY != -1354.4f) 727 | env->ThrowError("vsTCanny: sigma_vY must be greater than or equal to 0.0."); 728 | 729 | const float sigma_vU{ args[5].AsFloatf(-1354.4f) }; 730 | if (sigma_vU < 0.0f && sigma_vU != -1354.4f) 731 | env->ThrowError("vsTCanny: sigma_vU must be greater than or equal to 0.0."); 732 | 733 | const float sigma_vV{ args[6].AsFloatf(sigma_vU) }; 734 | if (sigma_vV < 0.0f && sigma_vV != -1354.4f) 735 | env->ThrowError("vsTCanny: sigma_vV must be greater than or equal to 0.0."); 736 | 737 | return new vsTCanny( 738 | args[0].AsClip(), 739 | sigmaY, 740 | sigmaU, 741 | sigmaV, 742 | sigma_vY, 743 | sigma_vU, 744 | sigma_vV, 745 | args[7].AsFloatf(8.0f), 746 | args[8].AsFloatf(1.0f), 747 | args[9].AsInt(0), 748 | args[10].AsInt(1), 749 | args[11].AsFloatf(1.0f), 750 | args[12].AsInt(3), 751 | args[13].AsInt(3), 752 | args[14].AsInt(3), 753 | args[15].AsInt(-1), 754 | env); 755 | } 756 | 757 | const AVS_Linkage* AVS_linkage; 758 | 759 | extern "C" __declspec(dllexport) 760 | const char* __stdcall AvisynthPluginInit3(IScriptEnvironment * env, const AVS_Linkage* const vectors) 761 | { 762 | 
AVS_linkage = vectors; 763 | 764 | env->AddFunction("vsTCanny", "c[sigmaY]f[sigmaU]f[sigmaV]f[sigma_vY]f[sigma_vU]f[sigma_vV]f[t_h]f[t_l]f[mode]i[op]i[scale]f[y]i[u]i[v]i[opt]i", Create_vsTCanny, 0); 765 | 766 | return "vsTCanny"; 767 | } 768 | -------------------------------------------------------------------------------- /src/vsTCanny.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | #include "avisynth.h" 9 | 10 | static constexpr float M_PIF{ 3.14159265358979323846f }; 11 | static constexpr float M_1_PIF{ 0.318309886183790671538f }; 12 | static constexpr float fltMax{ std::numeric_limits::max() }; 13 | static constexpr float fltLowest{ std::numeric_limits::lowest() }; 14 | 15 | enum Operator 16 | { 17 | TRITICAL, 18 | PREWITT, 19 | SOBEL, 20 | SCHARR, 21 | KROON, 22 | KIRSCH, 23 | FDOG 24 | }; 25 | 26 | class vsTCanny : public GenericVideoFilter 27 | { 28 | float t_h_; 29 | float t_l_; 30 | int mode_; 31 | float op_; 32 | float scale; 33 | int process[3]; 34 | int radiusH[3]; 35 | int radiusV[3]; 36 | std::unique_ptr weightsH[3]; 37 | std::unique_ptr weightsV[3]; 38 | int peak; 39 | int alignment; 40 | int radiusAlign; 41 | float* blur; 42 | float* gradient; 43 | int* direction; 44 | std::unique_ptr found; 45 | bool has_at_least_v8; 46 | 47 | template 48 | void filter_c(PVideoFrame& src, PVideoFrame& dst, IScriptEnvironment* env) noexcept; 49 | template 50 | void filter_sse2(PVideoFrame& src, PVideoFrame& dst, IScriptEnvironment* env) noexcept; 51 | template 52 | void filter_avx2(PVideoFrame& src, PVideoFrame& dst, IScriptEnvironment* env) noexcept; 53 | template 54 | void filter_avx512(PVideoFrame& src, PVideoFrame& dst, IScriptEnvironment* env) noexcept; 55 | 56 | void (vsTCanny::* filter)(PVideoFrame& src, PVideoFrame& dst, IScriptEnvironment* env) noexcept; 57 | 58 | public: 59 | vsTCanny(PClip _child, float sigmaY, float sigmaU, float sigmaV, 
float sigma_vY, float sigma_vU, float sigma_vV, float t_h, float t_l, int mode, int op, float gmmax, int y, int u, int v, int opt, IScriptEnvironment* env); 60 | PVideoFrame __stdcall GetFrame(int n, IScriptEnvironment* env); 61 | int __stdcall SetCacheHints(int cachehints, int frame_range) 62 | { 63 | return cachehints == CACHE_GET_MTMODE ? MT_MULTI_INSTANCE : 0; 64 | } 65 | ~vsTCanny(); 66 | }; 67 | 68 | static void hysteresis(float* __restrict srcp, bool* __restrict found, const int width, const int height, const int stride, const float t_h, const float t_l) noexcept 69 | { 70 | std::fill_n(found, width * height, false); 71 | std::vector> coordinates; 72 | 73 | for (int y{ 0 }; y < height; ++y) 74 | { 75 | for (int x{ 0 }; x < width; ++x) 76 | { 77 | if (!found[width * y + x] && srcp[stride * y + x] >= t_h) 78 | { 79 | srcp[stride * y + x] = fltMax; 80 | found[width * y + x] = true; 81 | 82 | coordinates.emplace_back(std::make_pair(x, y)); 83 | 84 | while (!coordinates.empty()) 85 | { 86 | const auto pos = coordinates.back(); 87 | coordinates.pop_back(); 88 | 89 | const int xxStart{ std::max(pos.first - 1, 0) }; 90 | const int xxStop{ std::min(pos.first + 1, width - 1) }; 91 | const int yyStart{ std::max(pos.second - 1, 0) }; 92 | const int yyStop{ std::min(pos.second + 1, height - 1) }; 93 | 94 | for (int yy{ yyStart }; yy <= yyStop; ++yy) 95 | { 96 | for (int xx{ xxStart }; xx <= xxStop; ++xx) 97 | { 98 | if (!found[width * yy + xx] && srcp[stride * yy + xx] >= t_l) 99 | { 100 | srcp[stride * yy + xx] = fltMax; 101 | found[width * yy + xx] = true; 102 | 103 | coordinates.emplace_back(std::make_pair(xx, yy)); 104 | } 105 | } 106 | } 107 | } 108 | } 109 | } 110 | } 111 | } 112 | -------------------------------------------------------------------------------- /src/vsTCanny.rc: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | VS_VERSION_INFO VERSIONINFO 4 | FILEVERSION 1,1,8,0 5 | PRODUCTVERSION 1,1,8,0 6 | 
FILEFLAGSMASK VS_FFI_FILEFLAGSMASK 7 | FILEFLAGS 0x0L 8 | FILEOS VOS__WINDOWS32 9 | FILETYPE VFT_DLL 10 | FILESUBTYPE VFT2_UNKNOWN 11 | BEGIN 12 | BLOCK "StringFileInfo" 13 | BEGIN 14 | BLOCK "040904E4" 15 | BEGIN 16 | VALUE "Comments", "Canny edge detection filter." 17 | VALUE "FileDescription", "TCanny for AviSynth 2.6 / AviSynth+" 18 | VALUE "FileVersion", "1.1.8" 19 | VALUE "InternalName", "vsTCanny" 20 | VALUE "OriginalFilename", "vsTCanny.dll" 21 | VALUE "ProductName", "vsTCanny" 22 | VALUE "ProductVersion", "1.1.8" 23 | END 24 | END 25 | BLOCK "VarFileInfo" 26 | BEGIN 27 | VALUE "Translation", 0x409, 1252 28 | END 29 | END 30 | -------------------------------------------------------------------------------- /src/vsTCanny_AVX2.cpp: -------------------------------------------------------------------------------- 1 | #include "VCL2/vectormath_trig.h" 2 | #include "vsTCanny.h" 3 | 4 | template 5 | static void copyPlane(const T* srcp, float* dstp, const int width, const int height, const int srcStride, const int dstStride) noexcept 6 | { 7 | for (int y{ 0 }; y < height; ++y) 8 | { 9 | for (int x{ 0 }; x < width; x += 8) 10 | { 11 | if constexpr (std::is_same_v) 12 | to_float(Vec8i().load_8uc(srcp + x)).store_nt(dstp + x); 13 | else if constexpr (std::is_same_v) 14 | to_float(Vec8i().load_8us(srcp + x)).store_nt(dstp + x); 15 | else 16 | Vec8f().load_a(srcp + x).store_nt(dstp + x); 17 | } 18 | 19 | srcp += srcStride; 20 | dstp += dstStride; 21 | } 22 | } 23 | 24 | template 25 | static void gaussianBlur(const T* __srcp, float* temp, float* dstp, const float* weightsH, const float* weightsV, const int width, const int height, const int srcStride, const int dstStride, const int radiusH, const int radiusV) noexcept 26 | { 27 | const int diameter{ radiusV * 2 + 1 }; 28 | std::unique_ptr _srcp{ std::make_unique(diameter) }; 29 | 30 | _srcp[radiusV] = __srcp; 31 | for (int i{ 1 }; i <= radiusV; ++i) 32 | _srcp[radiusV - i] = _srcp[radiusV + i] = _srcp[radiusV] + 
srcStride * i; 33 | 34 | weightsH += radiusH; 35 | 36 | for (int y{ 0 }; y < height; ++y) 37 | { 38 | for (int x{ 0 }; x < width; x += 8) 39 | { 40 | Vec8f sum{ zero_8f() }; 41 | 42 | for (int i{ 0 }; i < diameter; ++i) 43 | { 44 | if constexpr (std::is_same_v) 45 | { 46 | const Vec8f srcp{ to_float(Vec8i().load_8uc(_srcp[i] + x)) }; 47 | sum = mul_add(srcp, weightsV[i], sum); 48 | } 49 | else if constexpr (std::is_same_v) 50 | { 51 | const Vec8f srcp{ to_float(Vec8i().load_8us(_srcp[i] + x)) }; 52 | sum = mul_add(srcp, weightsV[i], sum); 53 | } 54 | else 55 | { 56 | const Vec8f srcp{ Vec8f().load_a(_srcp[i] + x) }; 57 | sum = mul_add(srcp, weightsV[i], sum); 58 | } 59 | } 60 | 61 | sum.store_a(temp + x); 62 | } 63 | 64 | for (int i{ 1 }; i <= radiusH; ++i) 65 | { 66 | temp[-i] = temp[i]; 67 | temp[width - 1 + i] = temp[width - 1 - i]; 68 | } 69 | 70 | for (int x{ 0 }; x < width; x += 8) 71 | { 72 | Vec8f sum{ zero_8f() }; 73 | 74 | for (int i{ -radiusH }; i <= radiusH; ++i) 75 | { 76 | const Vec8f srcp{ Vec8f().load(temp + x + i) }; 77 | sum = mul_add(srcp, weightsH[i], sum); 78 | } 79 | 80 | sum.store_nt(dstp + x); 81 | } 82 | 83 | for (int i{ 0 }; i < diameter - 1; ++i) 84 | _srcp[i] = _srcp[i + 1]; 85 | 86 | _srcp[diameter - 1] += (y < height - 1 - radiusV) ? 
srcStride : -srcStride; 87 | 88 | dstp += dstStride; 89 | } 90 | } 91 | 92 | template 93 | static void gaussianBlurV(const T* __srcp, float* dstp, const float* weights, const int width, const int height, const int srcStride, const int dstStride, const int radius) noexcept 94 | { 95 | const int diameter{ radius * 2 + 1 }; 96 | std::unique_ptr _srcp{ std::make_unique(diameter) }; 97 | 98 | _srcp[radius] = __srcp; 99 | for (int i{ 1 }; i <= radius; ++i) 100 | _srcp[radius - i] = _srcp[radius + i] = _srcp[radius] + srcStride * i; 101 | 102 | for (int y{ 0 }; y < height; ++y) 103 | { 104 | for (int x{ 0 }; x < width; x += 8) 105 | { 106 | Vec8f sum{ zero_8f() }; 107 | 108 | for (int i{ 0 }; i < diameter; ++i) 109 | { 110 | if constexpr (std::is_same_v) 111 | { 112 | const Vec8f srcp{ to_float(Vec8i().load_8uc(_srcp[i] + x)) }; 113 | sum = mul_add(srcp, weights[i], sum); 114 | } 115 | else if constexpr (std::is_same_v) 116 | { 117 | const Vec8f srcp{ to_float(Vec8i().load_8us(_srcp[i] + x)) }; 118 | sum = mul_add(srcp, weights[i], sum); 119 | } 120 | else 121 | { 122 | const Vec8f srcp{ Vec8f().load_a(_srcp[i] + x) }; 123 | sum = mul_add(srcp, weights[i], sum); 124 | } 125 | } 126 | 127 | sum.store_nt(dstp + x); 128 | } 129 | 130 | for (int i{ 0 }; i < diameter - 1; ++i) 131 | _srcp[i] = _srcp[i + 1]; 132 | 133 | _srcp[diameter - 1] += (y < height - 1 - radius) ? 
srcStride : -srcStride; 134 | 135 | dstp += dstStride; 136 | } 137 | } 138 | 139 | template 140 | static void gaussianBlurH(const T* _srcp, float* temp, float* dstp, const float* weights, const int width, const int height, const int srcStride, const int dstStride, const int radius) noexcept 141 | { 142 | weights += radius; 143 | 144 | for (int y{ 0 }; y < height; ++y) 145 | { 146 | for (int x{ 0 }; x < width; x += 8) 147 | { 148 | if constexpr (std::is_same_v) 149 | to_float(Vec8i().load_8uc(_srcp + x)).store_a(temp + x); 150 | else if constexpr (std::is_same_v) 151 | to_float(Vec8i().load_8us(_srcp + x)).store_a(temp + x); 152 | else 153 | Vec8f().load_a(_srcp + x).store_a(temp + x); 154 | } 155 | 156 | for (int i{ 1 }; i <= radius; ++i) 157 | { 158 | temp[-i] = temp[i]; 159 | temp[width - 1 + i] = temp[width - 1 - i]; 160 | } 161 | 162 | for (int x{ 0 }; x < width; x += 8) 163 | { 164 | Vec8f sum{ zero_8f() }; 165 | 166 | for (int i{ -radius }; i <= radius; ++i) 167 | { 168 | const Vec8f srcp{ Vec8f().load(temp + x + i) }; 169 | sum = mul_add(srcp, weights[i], sum); 170 | } 171 | 172 | sum.store_nt(dstp + x); 173 | } 174 | 175 | _srcp += srcStride; 176 | dstp += dstStride; 177 | } 178 | } 179 | 180 | static void detectEdge(float* blur, float* gradient, int* direction, const int width, const int height, const int stride, const int bgStride, const int mode, const int op, const float scale) noexcept 181 | { 182 | float* __restrict cur{ blur }; 183 | float* __restrict next{ blur + bgStride }; 184 | float* __restrict next2{ blur + bgStride * 2 }; 185 | float* __restrict prev{ next }; 186 | float* __restrict prev2{ next2 }; 187 | 188 | cur[-1] = cur[1]; 189 | cur[width] = cur[width - 2]; 190 | 191 | if (op == FDOG) 192 | { 193 | cur[-2] = cur[2]; 194 | cur[width + 1] = cur[width - 3]; 195 | } 196 | 197 | for (int y{ 0 }; y < height; ++y) 198 | { 199 | next[-1] = next[1]; 200 | next[width] = next[width - 2]; 201 | 202 | if (op == FDOG) 203 | { 204 | next[-2] = next[2]; 
205 | next[width + 1] = next[width - 3]; 206 | 207 | next2[-1] = next2[1]; 208 | next2[-2] = next2[2]; 209 | next2[width] = next2[width - 2]; 210 | next2[width + 1] = next2[width - 3]; 211 | } 212 | 213 | for (int x{ 0 }; x < width; x += 8) 214 | { 215 | Vec8f gx, gy; 216 | 217 | if (op != FDOG) 218 | { 219 | const Vec8f c1{ Vec8f().load(prev + x - 1) }; 220 | const Vec8f c2{ Vec8f().load_a(prev + x) }; 221 | const Vec8f c3{ Vec8f().load(prev + x + 1) }; 222 | const Vec8f c4{ Vec8f().load(cur + x - 1) }; 223 | const Vec8f c6{ Vec8f().load(cur + x + 1) }; 224 | const Vec8f c7{ Vec8f().load(next + x - 1) }; 225 | const Vec8f c8{ Vec8f().load_a(next + x) }; 226 | const Vec8f c9{ Vec8f().load(next + x + 1) }; 227 | 228 | switch (op) 229 | { 230 | case TRITICAL: 231 | { 232 | gx = c6 - c4; 233 | gy = c2 - c8; 234 | break; 235 | } 236 | case PREWITT: 237 | { 238 | gx = (c3 + c6 + c9 - c1 - c4 - c7) * 0.5f; 239 | gy = (c1 + c2 + c3 - c7 - c8 - c9) * 0.5f; 240 | break; 241 | } 242 | case SOBEL: 243 | { 244 | gx = c3 + mul_add(2.0f, c6, c9) - c1 - mul_add(2.0f, c4, c7); 245 | gy = c1 + mul_add(2.0f, c2, c3) - c7 - mul_add(2.0f, c8, c9); 246 | break; 247 | } 248 | case SCHARR: 249 | { 250 | gx = mul_add(3.0f, c3 + c9, 10.0f * c6) - mul_add(3.0f, c1 + c7, 10.0f * c4); 251 | gy = mul_add(3.0f, c1 + c3, 10.0f * c2) - mul_add(3.0f, c7 + c9, 10.0f * c8); 252 | break; 253 | } 254 | case KROON: 255 | { 256 | gx = mul_add(17.0f, c3 + c9, 61.0f * c6) - mul_add(17.0f, c1 + c7, 61.0f * c4); 257 | gy = mul_add(17.0f, c1 + c3, 61.0f * c2) - mul_add(17.0f, c7 + c9, 61.0f * c8); 258 | break; 259 | } 260 | case KIRSCH: 261 | { 262 | const Vec8f g1{ mul_sub(5.0f, c1 + c2 + c3, 3.0f * (c4 + c6 + c7 + c8 + c9)) }; 263 | const Vec8f g2{ mul_sub(5.0f, c1 + c2 + c4, 3.0f * (c3 + c6 + c7 + c8 + c9)) }; 264 | const Vec8f g3{ mul_sub(5.0f, c1 + c4 + c7, 3.0f * (c2 + c3 + c6 + c8 + c9)) }; 265 | const Vec8f g4{ mul_sub(5.0f, c4 + c7 + c8, 3.0f * (c1 + c2 + c3 + c6 + c9)) }; 266 | const Vec8f g5{ 
mul_sub(5.0f, c7 + c8 + c9, 3.0f * (c1 + c2 + c3 + c4 + c6)) }; 267 | const Vec8f g6{ mul_sub(5.0f, c6 + c8 + c9, 3.0f * (c1 + c2 + c3 + c4 + c7)) }; 268 | const Vec8f g7{ mul_sub(5.0f, c3 + c6 + c9, 3.0f * (c1 + c2 + c4 + c7 + c8)) }; 269 | const Vec8f g8{ mul_sub(5.0f, c2 + c3 + c6, 3.0f * (c1 + c4 + c7 + c8 + c9)) }; 270 | const Vec8f g{ max(max(max(abs(g1), abs(g2)), max(abs(g3), abs(g4))), max(max(abs(g5), abs(g6)), max(abs(g7), abs(g8)))) }; 271 | (g * scale).store_nt(gradient + x); 272 | break; 273 | } 274 | } 275 | } 276 | else 277 | { 278 | const Vec8f c1{ Vec8f().load(prev2 + x - 2) }; 279 | const Vec8f c2{ Vec8f().load(prev2 + x - 1) }; 280 | const Vec8f c3{ Vec8f().load(prev2 + x) }; 281 | const Vec8f c4{ Vec8f().load(prev2 + x + 1) }; 282 | const Vec8f c5{ Vec8f().load(prev2 + x + 2) }; 283 | const Vec8f c6{ Vec8f().load(prev + x - 2) }; 284 | const Vec8f c7{ Vec8f().load(prev + x - 1) }; 285 | const Vec8f c8{ Vec8f().load(prev + x) }; 286 | const Vec8f c9{ Vec8f().load(prev + x + 1) }; 287 | const Vec8f c10{ Vec8f().load(prev + x + 2) }; 288 | const Vec8f c11{ Vec8f().load(cur + x - 2) }; 289 | const Vec8f c12{ Vec8f().load(cur + x - 1) }; 290 | const Vec8f c14{ Vec8f().load(cur + x + 1) }; 291 | const Vec8f c15{ Vec8f().load(cur + x + 2) }; 292 | const Vec8f c16{ Vec8f().load(next + x - 2) }; 293 | const Vec8f c17{ Vec8f().load(next + x - 1) }; 294 | const Vec8f c18{ Vec8f().load(next + x) }; 295 | const Vec8f c19{ Vec8f().load(next + x + 1) }; 296 | const Vec8f c20{ Vec8f().load(next + x + 2) }; 297 | const Vec8f c21{ Vec8f().load(next2 + x - 2) }; 298 | const Vec8f c22{ Vec8f().load(next2 + x - 1) }; 299 | const Vec8f c23{ Vec8f().load(next2 + x) }; 300 | const Vec8f c24{ Vec8f().load(next2 + x + 1) }; 301 | const Vec8f c25{ Vec8f().load(next2 + x + 2) }; 302 | 303 | gx = c5 + c25 + c4 + c24 + mul_add(2.0f, c10 + c20 + c9 + c19, 3.0f * (c15 + c14)) 304 | - c2 - c22 - c1 - c21 - mul_add(2.0f, c7 + c17 + c6 + c16, 3.0f * (c12 + c11)); 305 | gy = c1 + 
c5 + c6 + c10 + mul_add(2.0f, c2 + c4 + c7 + c9, 3.0f * (c3 + c8)) 306 | - c16 - c20 - c21 - c25 - mul_add(2.0f, c17 + c19 + c22 + c24, 3.0f * (c18 + c23)); 307 | } 308 | 309 | if (op != KIRSCH) 310 | { 311 | gx *= scale; 312 | gy *= scale; 313 | sqrt(mul_add(gx, gx, gy * gy)).store_nt(gradient + x); 314 | } 315 | 316 | if (mode == 0) 317 | { 318 | Vec8f dr{ atan2(gy, gx) }; 319 | dr = if_add(dr < 0.0f, dr, M_PIF); 320 | 321 | const Vec8i bin{ truncatei(mul_add(dr, 4.0f * M_1_PIF, 0.5f)) }; 322 | select(bin >= 4, zero_si256(), bin).store_nt(direction + x); 323 | } 324 | } 325 | 326 | prev2 = prev; 327 | prev = cur; 328 | cur = next; 329 | 330 | if (op != FDOG) 331 | next += (y < height - 2) ? bgStride : -bgStride; 332 | else 333 | { 334 | next = next2; 335 | next2 += (y < height - 3) ? bgStride : -bgStride; 336 | } 337 | 338 | gradient += bgStride; 339 | direction += stride; 340 | } 341 | } 342 | 343 | static void nonMaximumSuppression(const int* _direction, float* _gradient, float* blur, const int width, const int height, const int stride, const int bgStride, const int radiusAlign) noexcept 344 | { 345 | _gradient[-1] = _gradient[1]; 346 | _gradient[-1 + bgStride * (height - 1)] = _gradient[1 + bgStride * (height - 1)]; 347 | _gradient[width] = _gradient[width - 2]; 348 | _gradient[width + bgStride * (height - 1)] = _gradient[width - 2 + bgStride * (height - 1)]; 349 | std::copy_n(_gradient - radiusAlign + bgStride, width + radiusAlign * 2, _gradient - radiusAlign - bgStride); 350 | std::copy_n(_gradient - radiusAlign + bgStride * (height - 2), width + radiusAlign * 2, _gradient - radiusAlign + bgStride * static_cast(height)); 351 | 352 | for (int y{ 0 }; y < height; ++y) 353 | { 354 | for (int x{ 0 }; x < width; x += 8) 355 | { 356 | const Vec8ui direction{ Vec8ui().load_a(_direction + x) }; 357 | 358 | Vec8fb mask{ Vec8fb(direction == 0) }; 359 | Vec8f gradient{ max(Vec8f().load(_gradient + x + 1), Vec8f().load(_gradient + x - 1)) }; 360 | Vec8f result{ gradient 
& mask }; 361 | 362 | mask = Vec8fb(direction == 1); 363 | gradient = max(Vec8f().load(_gradient + x - bgStride + 1), Vec8f().load(_gradient + x + bgStride - 1)); 364 | result |= gradient & mask; 365 | 366 | mask = Vec8fb(direction == 2); 367 | gradient = max(Vec8f().load_a(_gradient + x - bgStride), Vec8f().load_a(_gradient + x + bgStride)); 368 | result |= gradient & mask; 369 | 370 | mask = Vec8fb(direction == 3); 371 | gradient = max(Vec8f().load(_gradient + x - bgStride - 1), Vec8f().load(_gradient + x + bgStride + 1)); 372 | result |= gradient & mask; 373 | 374 | gradient = Vec8f().load_a(_gradient + x); 375 | select(gradient >= result, gradient, fltLowest).store_nt(blur + x); 376 | } 377 | 378 | _direction += stride; 379 | _gradient += bgStride; 380 | blur += bgStride; 381 | } 382 | } 383 | 384 | template 385 | static void binarizeCE(const float* _srcp, T* dstp, const int width, const int height, const int srcStride, const int dstStride, const int peak) noexcept 386 | { 387 | for (int y{ 0 }; y < height; ++y) 388 | { 389 | for (int x{ 0 }; x < width; x += 8) 390 | { 391 | const Vec8f srcp{ Vec8f().load_a(_srcp + x) }; 392 | 393 | if constexpr (std::is_same_v) 394 | { 395 | const Vec16cb mask{ Vec16cb(compress_saturated(compress_saturated(Vec8ib(srcp == fltMax), zero_si256()), zero_si256()).get_low()) }; 396 | select(mask, Vec16uc(255), zero_si128()).storel(dstp + x); 397 | } 398 | else if constexpr (std::is_same_v) 399 | { 400 | const Vec8sb mask{ Vec8sb(compress_saturated(Vec8ib(srcp == fltMax), zero_si256()).get_low()) }; 401 | select(mask, Vec8us(peak), zero_si128()).store_nt(dstp + x); 402 | } 403 | else 404 | { 405 | const Vec8fb mask{ srcp == fltMax }; 406 | select(mask, Vec8f(1.0f), Vec8f(0.0f)).store_nt(dstp + x); 407 | } 408 | } 409 | 410 | _srcp += srcStride; 411 | dstp += dstStride; 412 | } 413 | } 414 | 415 | template 416 | static void discretizeGM(const float* _srcp, T* dstp, const int width, const int height, const int srcStride, const int 
dstStride, const int peak) noexcept 417 | { 418 | for (int y{ 0 }; y < height; ++y) 419 | { 420 | for (int x{ 0 }; x < width; x += 8) 421 | { 422 | const Vec8f srcp{ Vec8f().load_a(_srcp + x) }; 423 | 424 | if constexpr (std::is_same_v) 425 | { 426 | const Vec16uc result{ compress_saturated_s2u(compress_saturated(truncatei(srcp + 0.5f), zero_si256()), zero_si256()).get_low() }; 427 | result.storel(dstp + x); 428 | } 429 | else if constexpr (std::is_same_v) 430 | { 431 | const Vec8us result{ compress_saturated_s2u(truncatei(srcp + 0.5f), zero_si256()).get_low() }; 432 | min(result, peak).store_nt(dstp + x); 433 | } 434 | else if constexpr (clampFP) 435 | min(max(srcp, 0.0f), 1.0f).store_nt(dstp + x); 436 | else 437 | srcp.store_nt(dstp + x); 438 | } 439 | 440 | _srcp += srcStride; 441 | dstp += dstStride; 442 | } 443 | } 444 | 445 | template 446 | void vsTCanny::filter_avx2(PVideoFrame& src, PVideoFrame& dst, IScriptEnvironment* env) noexcept 447 | { 448 | const int planes_y[3]{ PLANAR_Y, PLANAR_U, PLANAR_V }; 449 | const int planes_r[3]{ PLANAR_G, PLANAR_B, PLANAR_R }; 450 | const int* current_planes{ (vi.IsRGB()) ? 
planes_r : planes_y }; 451 | const int planecount{ std::min(vi.NumComponents(), 3) }; 452 | 453 | for (int i{ 0 }; i < planecount; ++i) 454 | { 455 | const int height{ src->GetHeight(current_planes[i]) }; 456 | 457 | if (process[i] == 3) 458 | { 459 | const size_t stride{ src->GetPitch(current_planes[i]) / sizeof(T) }; 460 | const size_t bgStride{ stride + radiusAlign * 2 }; 461 | const size_t dst_stride{ dst->GetPitch(current_planes[i]) / sizeof(T) }; 462 | const size_t width{ src->GetRowSize(current_planes[i]) / sizeof(T) }; 463 | const T* srcp{ reinterpret_cast(src->GetReadPtr(current_planes[i])) }; 464 | T* dstp{ reinterpret_cast(dst->GetWritePtr(current_planes[i])) }; 465 | 466 | float* blur{ vsTCanny::blur + radiusAlign }; 467 | float* gradient{ vsTCanny::gradient + bgStride + radiusAlign }; 468 | 469 | if (radiusV[i] && radiusH[i]) 470 | gaussianBlur(srcp, gradient, blur, weightsH[i].get(), weightsV[i].get(), width, height, stride, bgStride, radiusH[i], radiusV[i]); 471 | else if (radiusV[i]) 472 | gaussianBlurV(srcp, blur, weightsV[i].get(), width, height, stride, bgStride, radiusV[i]); 473 | else if (radiusH[i]) 474 | gaussianBlurH(srcp, gradient, blur, weightsH[i].get(), width, height, stride, bgStride, radiusH[i]); 475 | else 476 | copyPlane(srcp, blur, width, height, stride, bgStride); 477 | 478 | if (mode_ != -1) 479 | { 480 | detectEdge(blur, gradient, direction, width, height, stride, bgStride, mode_, op_, scale); 481 | 482 | if (mode_ == 0) 483 | { 484 | nonMaximumSuppression(direction, gradient, blur, width, height, stride, bgStride, radiusAlign); 485 | hysteresis(blur, found.get(), width, height, bgStride, t_h_, t_l_); 486 | } 487 | } 488 | 489 | switch (mode_) 490 | { 491 | case 0: binarizeCE(blur, dstp, width, height, bgStride, dst_stride, peak); break; 492 | case 1: discretizeGM(gradient, dstp, width, height, bgStride, stride, peak); break; 493 | default: discretizeGM(blur, dstp, width, height, bgStride, dst_stride, peak); break; 494 | } 495 | 
} 496 | else if (process[i] == 2) 497 | env->BitBlt(dst->GetWritePtr(current_planes[i]), dst->GetPitch(current_planes[i]), src->GetReadPtr(current_planes[i]), src->GetPitch(current_planes[i]), src->GetRowSize(current_planes[i]), height); 498 | } 499 | } 500 | 501 | template void vsTCanny::filter_avx2(PVideoFrame& src, PVideoFrame& dst, IScriptEnvironment* env) noexcept; 502 | template void vsTCanny::filter_avx2(PVideoFrame& src, PVideoFrame& dst, IScriptEnvironment* env) noexcept; 503 | template void vsTCanny::filter_avx2(PVideoFrame& src, PVideoFrame& dst, IScriptEnvironment* env) noexcept; 504 | -------------------------------------------------------------------------------- /src/vsTCanny_AVX512.cpp: -------------------------------------------------------------------------------- 1 | #include "VCL2/vectormath_trig.h" 2 | #include "vsTCanny.h" 3 | 4 | template 5 | static void copyPlane(const T* srcp, float* dstp, const int width, const int height, const int srcStride, const int dstStride) noexcept 6 | { 7 | for (int y{ 0 }; y < height; ++y) 8 | { 9 | for (int x{ 0 }; x < width; x += 16) 10 | { 11 | if constexpr (std::is_same_v) 12 | to_float(Vec16i().load_16uc(srcp + x)).store_nt(dstp + x); 13 | else if constexpr (std::is_same_v) 14 | to_float(Vec16i().load_16us(srcp + x)).store_nt(dstp + x); 15 | else 16 | Vec16f().load_a(srcp + x).store_nt(dstp + x); 17 | } 18 | 19 | srcp += srcStride; 20 | dstp += dstStride; 21 | } 22 | } 23 | 24 | template 25 | static void gaussianBlur(const T* __srcp, float* temp, float* dstp, const float* weightsH, const float* weightsV, const int width, const int height, const int srcStride, const int dstStride, const int radiusH, const int radiusV) noexcept 26 | { 27 | const int diameter{ radiusV * 2 + 1 }; 28 | std::unique_ptr _srcp{ std::make_unique(diameter) }; 29 | 30 | _srcp[radiusV] = __srcp; 31 | for (int i{ 1 }; i <= radiusV; ++i) 32 | _srcp[radiusV - i] = _srcp[radiusV + i] = _srcp[radiusV] + srcStride * i; 33 | 34 | weightsH += 
radiusH; 35 | 36 | for (int y{ 0 }; y < height; ++y) 37 | { 38 | for (int x{ 0 }; x < width; x += 16) 39 | { 40 | Vec16f sum{ zero_16f() }; 41 | 42 | for (int i{ 0 }; i < diameter; ++i) 43 | { 44 | if constexpr (std::is_same_v) 45 | { 46 | const Vec16f srcp{ to_float(Vec16i().load_16uc(_srcp[i] + x)) }; 47 | sum = mul_add(srcp, weightsV[i], sum); 48 | } 49 | else if constexpr (std::is_same_v) 50 | { 51 | const Vec16f srcp{ to_float(Vec16i().load_16us(_srcp[i] + x)) }; 52 | sum = mul_add(srcp, weightsV[i], sum); 53 | } 54 | else 55 | { 56 | const Vec16f srcp{ Vec16f().load_a(_srcp[i] + x) }; 57 | sum = mul_add(srcp, weightsV[i], sum); 58 | } 59 | } 60 | 61 | sum.store_a(temp + x); 62 | } 63 | 64 | for (int i{ 1 }; i <= radiusH; ++i) 65 | { 66 | temp[-i] = temp[i]; 67 | temp[width - 1 + i] = temp[width - 1 - i]; 68 | } 69 | 70 | for (int x{ 0 }; x < width; x += 16) 71 | { 72 | Vec16f sum{ zero_16f() }; 73 | 74 | for (int i{ -radiusH }; i <= radiusH; ++i) 75 | { 76 | const Vec16f srcp{ Vec16f().load(temp + x + i) }; 77 | sum = mul_add(srcp, weightsH[i], sum); 78 | } 79 | 80 | sum.store_nt(dstp + x); 81 | } 82 | 83 | for (int i{ 0 }; i < diameter - 1; ++i) 84 | _srcp[i] = _srcp[i + 1]; 85 | 86 | _srcp[diameter - 1] += (y < height - 1 - radiusV) ? 
srcStride : -srcStride; 87 | 88 | dstp += dstStride; 89 | } 90 | } 91 | 92 | template 93 | static void gaussianBlurV(const T* __srcp, float* dstp, const float* weights, const int width, const int height, const int srcStride, const int dstStride, const int radius) noexcept 94 | { 95 | const int diameter{ radius * 2 + 1 }; 96 | std::unique_ptr _srcp{ std::make_unique(diameter) }; 97 | 98 | _srcp[radius] = __srcp; 99 | for (int i{ 1 }; i <= radius; ++i) 100 | _srcp[radius - i] = _srcp[radius + i] = _srcp[radius] + srcStride * i; 101 | 102 | for (int y{ 0 }; y < height; ++y) 103 | { 104 | for (int x{ 0 }; x < width; x += 16) 105 | { 106 | Vec16f sum{ zero_16f() }; 107 | 108 | for (int i{ 0 }; i < diameter; ++i) 109 | { 110 | if constexpr (std::is_same_v) 111 | { 112 | const Vec16f srcp{ to_float(Vec16i().load_16uc(_srcp[i] + x)) }; 113 | sum = mul_add(srcp, weights[i], sum); 114 | } 115 | else if constexpr (std::is_same_v) 116 | { 117 | const Vec16f srcp{ to_float(Vec16i().load_16us(_srcp[i] + x)) }; 118 | sum = mul_add(srcp, weights[i], sum); 119 | } 120 | else 121 | { 122 | const Vec16f srcp{ Vec16f().load_a(_srcp[i] + x) }; 123 | sum = mul_add(srcp, weights[i], sum); 124 | } 125 | } 126 | 127 | sum.store_nt(dstp + x); 128 | } 129 | 130 | for (int i{ 0 }; i < diameter - 1; ++i) 131 | _srcp[i] = _srcp[i + 1]; 132 | 133 | _srcp[diameter - 1] += (y < height - 1 - radius) ? 
srcStride : -srcStride; 134 | 135 | dstp += dstStride; 136 | } 137 | } 138 | 139 | template 140 | static void gaussianBlurH(const T* _srcp, float* temp, float* dstp, const float* weights, const int width, const int height, const int srcStride, const int dstStride, const int radius) noexcept 141 | { 142 | weights += radius; 143 | 144 | for (int y{ 0 }; y < height; ++y) 145 | { 146 | for (int x{ 0 }; x < width; x += 16) 147 | { 148 | if constexpr (std::is_same_v) 149 | to_float(Vec16i().load_16uc(_srcp + x)).store_a(temp + x); 150 | else if constexpr (std::is_same_v) 151 | to_float(Vec16i().load_16us(_srcp + x)).store_a(temp + x); 152 | else 153 | Vec16f().load_a(_srcp + x).store_a(temp + x); 154 | } 155 | 156 | for (int i{ 1 }; i <= radius; ++i) 157 | { 158 | temp[-i] = temp[i]; 159 | temp[width - 1 + i] = temp[width - 1 - i]; 160 | } 161 | 162 | for (int x{ 0 }; x < width; x += 16) 163 | { 164 | Vec16f sum{ zero_16f() }; 165 | 166 | for (int i{ -radius }; i <= radius; ++i) 167 | { 168 | const Vec16f srcp{ Vec16f().load(temp + x + i) }; 169 | sum = mul_add(srcp, weights[i], sum); 170 | } 171 | 172 | sum.store_nt(dstp + x); 173 | } 174 | 175 | _srcp += srcStride; 176 | dstp += dstStride; 177 | } 178 | } 179 | 180 | static void detectEdge(float* blur, float* gradient, int* direction, const int width, const int height, const int stride, const int bgStride, const int mode, const int op, const float scale) noexcept 181 | { 182 | float* __restrict cur{ blur }; 183 | float* __restrict next{ blur + bgStride }; 184 | float* __restrict next2{ blur + bgStride * 2 }; 185 | float* __restrict prev{ next }; 186 | float* __restrict prev2{ next2 }; 187 | 188 | cur[-1] = cur[1]; 189 | cur[width] = cur[width - 2]; 190 | 191 | if (op == FDOG) 192 | { 193 | cur[-2] = cur[2]; 194 | cur[width + 1] = cur[width - 3]; 195 | } 196 | 197 | for (int y{ 0 }; y < height; ++y) 198 | { 199 | next[-1] = next[1]; 200 | next[width] = next[width - 2]; 201 | 202 | if (op == FDOG) 203 | { 204 | next[-2] 
= next[2]; 205 | next[width + 1] = next[width - 3]; 206 | 207 | next2[-1] = next2[1]; 208 | next2[-2] = next2[2]; 209 | next2[width] = next2[width - 2]; 210 | next2[width + 1] = next2[width - 3]; 211 | } 212 | 213 | for (int x{ 0 }; x < width; x += 16) 214 | { 215 | Vec16f gx, gy; 216 | 217 | if (op != FDOG) 218 | { 219 | const Vec16f c1{ Vec16f().load(prev + x - 1) }; 220 | const Vec16f c2{ Vec16f().load_a(prev + x) }; 221 | const Vec16f c3{ Vec16f().load(prev + x + 1) }; 222 | const Vec16f c4{ Vec16f().load(cur + x - 1) }; 223 | const Vec16f c6{ Vec16f().load(cur + x + 1) }; 224 | const Vec16f c7{ Vec16f().load(next + x - 1) }; 225 | const Vec16f c8{ Vec16f().load_a(next + x) }; 226 | const Vec16f c9{ Vec16f().load(next + x + 1) }; 227 | 228 | switch (op) 229 | { 230 | case TRITICAL: 231 | { 232 | gx = c6 - c4; 233 | gy = c2 - c8; 234 | break; 235 | } 236 | case PREWITT: 237 | { 238 | gx = (c3 + c6 + c9 - c1 - c4 - c7) * 0.5f; 239 | gy = (c1 + c2 + c3 - c7 - c8 - c9) * 0.5f; 240 | break; 241 | } 242 | case SOBEL: 243 | { 244 | gx = c3 + mul_add(2.0f, c6, c9) - c1 - mul_add(2.0f, c4, c7); 245 | gy = c1 + mul_add(2.0f, c2, c3) - c7 - mul_add(2.0f, c8, c9); 246 | break; 247 | } 248 | case SCHARR: 249 | { 250 | gx = mul_add(3.0f, c3 + c9, 10.0f * c6) - mul_add(3.0f, c1 + c7, 10.0f * c4); 251 | gy = mul_add(3.0f, c1 + c3, 10.0f * c2) - mul_add(3.0f, c7 + c9, 10.0f * c8); 252 | break; 253 | } 254 | case KROON: 255 | { 256 | gx = mul_add(17.0f, c3 + c9, 61.0f * c6) - mul_add(17.0f, c1 + c7, 61.0f * c4); 257 | gy = mul_add(17.0f, c1 + c3, 61.0f * c2) - mul_add(17.0f, c7 + c9, 61.0f * c8); 258 | break; 259 | } 260 | case KIRSCH: 261 | { 262 | const Vec16f g1{ mul_sub(5.0f, c1 + c2 + c3, 3.0f * (c4 + c6 + c7 + c8 + c9)) }; 263 | const Vec16f g2{ mul_sub(5.0f, c1 + c2 + c4, 3.0f * (c3 + c6 + c7 + c8 + c9)) }; 264 | const Vec16f g3{ mul_sub(5.0f, c1 + c4 + c7, 3.0f * (c2 + c3 + c6 + c8 + c9)) }; 265 | const Vec16f g4{ mul_sub(5.0f, c4 + c7 + c8, 3.0f * (c1 + c2 + c3 + c6 + 
c9)) }; 266 | const Vec16f g5{ mul_sub(5.0f, c7 + c8 + c9, 3.0f * (c1 + c2 + c3 + c4 + c6)) }; 267 | const Vec16f g6{ mul_sub(5.0f, c6 + c8 + c9, 3.0f * (c1 + c2 + c3 + c4 + c7)) }; 268 | const Vec16f g7{ mul_sub(5.0f, c3 + c6 + c9, 3.0f * (c1 + c2 + c4 + c7 + c8)) }; 269 | const Vec16f g8{ mul_sub(5.0f, c2 + c3 + c6, 3.0f * (c1 + c4 + c7 + c8 + c9)) }; 270 | const Vec16f g{ max(max(max(abs(g1), abs(g2)), max(abs(g3), abs(g4))), max(max(abs(g5), abs(g6)), max(abs(g7), abs(g8)))) }; 271 | (g * scale).store_nt(gradient + x); 272 | break; 273 | } 274 | } 275 | } 276 | else 277 | { 278 | const Vec16f c1{ Vec16f().load(prev2 + x - 2) }; 279 | const Vec16f c2{ Vec16f().load(prev2 + x - 1) }; 280 | const Vec16f c3{ Vec16f().load(prev2 + x) }; 281 | const Vec16f c4{ Vec16f().load(prev2 + x + 1) }; 282 | const Vec16f c5{ Vec16f().load(prev2 + x + 2) }; 283 | const Vec16f c6{ Vec16f().load(prev + x - 2) }; 284 | const Vec16f c7{ Vec16f().load(prev + x - 1) }; 285 | const Vec16f c8{ Vec16f().load(prev + x) }; 286 | const Vec16f c9{ Vec16f().load(prev + x + 1) }; 287 | const Vec16f c10{ Vec16f().load(prev + x + 2) }; 288 | const Vec16f c11{ Vec16f().load(cur + x - 2) }; 289 | const Vec16f c12{ Vec16f().load(cur + x - 1) }; 290 | const Vec16f c14{ Vec16f().load(cur + x + 1) }; 291 | const Vec16f c15{ Vec16f().load(cur + x + 2) }; 292 | const Vec16f c16{ Vec16f().load(next + x - 2) }; 293 | const Vec16f c17{ Vec16f().load(next + x - 1) }; 294 | const Vec16f c18{ Vec16f().load(next + x) }; 295 | const Vec16f c19{ Vec16f().load(next + x + 1) }; 296 | const Vec16f c20{ Vec16f().load(next + x + 2) }; 297 | const Vec16f c21{ Vec16f().load(next2 + x - 2) }; 298 | const Vec16f c22{ Vec16f().load(next2 + x - 1) }; 299 | const Vec16f c23{ Vec16f().load(next2 + x) }; 300 | const Vec16f c24{ Vec16f().load(next2 + x + 1) }; 301 | const Vec16f c25{ Vec16f().load(next2 + x + 2) }; 302 | 303 | gx = c5 + c25 + c4 + c24 + mul_add(2.0f, c10 + c20 + c9 + c19, 3.0f * (c15 + c14)) 304 | - c2 - c22 - 
c1 - c21 - mul_add(2.0f, c7 + c17 + c6 + c16, 3.0f * (c12 + c11)); 305 | gy = c1 + c5 + c6 + c10 + mul_add(2.0f, c2 + c4 + c7 + c9, 3.0f * (c3 + c8)) 306 | - c16 - c20 - c21 - c25 - mul_add(2.0f, c17 + c19 + c22 + c24, 3.0f * (c18 + c23)); 307 | } 308 | 309 | if (op != KIRSCH) 310 | { 311 | gx *= scale; 312 | gy *= scale; 313 | sqrt(mul_add(gx, gx, gy * gy)).store_nt(gradient + x); 314 | } 315 | 316 | if (mode == 0) 317 | { 318 | Vec16f dr{ atan2(gy, gx) }; 319 | dr = if_add(dr < 0.0f, dr, M_PIF); 320 | 321 | const Vec16i bin{ truncatei(mul_add(dr, 4.0f * M_1_PIF, 0.5f)) }; 322 | select(bin >= 4, zero_si512(), bin).store_nt(direction + x); 323 | } 324 | } 325 | 326 | prev2 = prev; 327 | prev = cur; 328 | cur = next; 329 | 330 | if (op != FDOG) 331 | next += (y < height - 2) ? bgStride : -bgStride; 332 | else 333 | { 334 | next = next2; 335 | next2 += (y < height - 3) ? bgStride : -bgStride; 336 | } 337 | 338 | gradient += bgStride; 339 | direction += stride; 340 | } 341 | } 342 | 343 | static void nonMaximumSuppression(const int* _direction, float* _gradient, float* blur, const int width, const int height, const int stride, const int bgStride, const int radiusAlign) noexcept 344 | { 345 | _gradient[-1] = _gradient[1]; 346 | _gradient[-1 + bgStride * (height - 1)] = _gradient[1 + bgStride * (height - 1)]; 347 | _gradient[width] = _gradient[width - 2]; 348 | _gradient[width + bgStride * (height - 1)] = _gradient[width - 2 + bgStride * (height - 1)]; 349 | std::copy_n(_gradient - radiusAlign + bgStride, width + radiusAlign * 2, _gradient - radiusAlign - bgStride); 350 | std::copy_n(_gradient - radiusAlign + bgStride * (height - 2), width + radiusAlign * 2, _gradient - radiusAlign + bgStride * static_cast(height)); 351 | 352 | for (int y{ 0 }; y < height; ++y) 353 | { 354 | for (int x{ 0 }; x < width; x += 16) 355 | { 356 | const Vec16ui direction{ Vec16ui().load_a(_direction + x) }; 357 | 358 | Vec16fb mask{ Vec16fb(direction == 0) }; 359 | Vec16f gradient{ 
max(Vec16f().load(_gradient + x + 1), Vec16f().load(_gradient + x - 1)) }; 360 | Vec16f result{ gradient & mask }; 361 | 362 | mask = Vec16fb(direction == 1); 363 | gradient = max(Vec16f().load(_gradient + x - bgStride + 1), Vec16f().load(_gradient + x + bgStride - 1)); 364 | result |= gradient & mask; 365 | 366 | mask = Vec16fb(direction == 2); 367 | gradient = max(Vec16f().load_a(_gradient + x - bgStride), Vec16f().load_a(_gradient + x + bgStride)); 368 | result |= gradient & mask; 369 | 370 | mask = Vec16fb(direction == 3); 371 | gradient = max(Vec16f().load(_gradient + x - bgStride - 1), Vec16f().load(_gradient + x + bgStride + 1)); 372 | result |= gradient & mask; 373 | 374 | gradient = Vec16f().load_a(_gradient + x); 375 | select(gradient >= result, gradient, fltLowest).store_nt(blur + x); 376 | } 377 | 378 | _direction += stride; 379 | _gradient += bgStride; 380 | blur += bgStride; 381 | } 382 | } 383 | 384 | template 385 | static void binarizeCE(const float* _srcp, T* dstp, const int width, const int height, const int srcStride, const int dstStride, const int peak) noexcept 386 | { 387 | for (int y{ 0 }; y < height; ++y) 388 | { 389 | for (int x{ 0 }; x < width; x += 16) 390 | { 391 | const Vec16f srcp{ Vec16f().load_a(_srcp + x) }; 392 | 393 | if constexpr (std::is_same_v) 394 | { 395 | const Vec16cb mask{ Vec16cb(srcp == fltMax) }; 396 | select(mask, Vec16uc(255), zero_si128()).store_nt(dstp + x); 397 | } 398 | else if constexpr (std::is_same_v) 399 | { 400 | const Vec16sb mask{ Vec16sb(srcp == fltMax) }; 401 | select(mask, Vec16us(peak), zero_si256()).store_nt(dstp + x); 402 | } 403 | else 404 | { 405 | const Vec16fb mask{ srcp == fltMax }; 406 | select(mask, Vec16f(1.0f), Vec16f(0.0f)).store_nt(dstp + x); 407 | } 408 | } 409 | 410 | _srcp += srcStride; 411 | dstp += dstStride; 412 | } 413 | } 414 | 415 | template 416 | static void discretizeGM(const float* _srcp, T* dstp, const int width, const int height, const int srcStride, const int dstStride, const 
int peak) noexcept 417 | { 418 | for (int y{ 0 }; y < height; ++y) 419 | { 420 | for (int x{ 0 }; x < width; x += 16) 421 | { 422 | const Vec16f srcp{ Vec16f().load_a(_srcp + x) }; 423 | 424 | if constexpr (std::is_same_v) 425 | { 426 | const Vec16uc result{ compress_saturated_s2u(compress_saturated(truncatei(srcp + 0.5f), zero_si512()), zero_si512()).get_low().get_low() }; 427 | result.store_nt(dstp + x); 428 | } 429 | else if constexpr (std::is_same_v) 430 | { 431 | const Vec16us result{ compress_saturated_s2u(truncatei(srcp + 0.5f), zero_si512()).get_low() }; 432 | min(result, peak).store_nt(dstp + x); 433 | } 434 | else if constexpr (clampFP) 435 | min(max(srcp, 0.0f), 1.0f).store_nt(dstp + x); 436 | else 437 | srcp.store_nt(dstp + x); 438 | } 439 | 440 | _srcp += srcStride; 441 | dstp += dstStride; 442 | } 443 | } 444 | 445 | template 446 | void vsTCanny::filter_avx512(PVideoFrame& src, PVideoFrame& dst, IScriptEnvironment* env) noexcept 447 | { 448 | const int planes_y[3]{ PLANAR_Y, PLANAR_U, PLANAR_V }; 449 | const int planes_r[3]{ PLANAR_G, PLANAR_B, PLANAR_R }; 450 | const int* current_planes{ (vi.IsRGB()) ? 
planes_r : planes_y }; 451 | const int planecount{ std::min(vi.NumComponents(), 3) }; 452 | 453 | for (int i{ 0 }; i < planecount; ++i) 454 | { 455 | const int height{ src->GetHeight(current_planes[i]) }; 456 | 457 | if (process[i] == 3) 458 | { 459 | const size_t stride{ src->GetPitch(current_planes[i]) / sizeof(T) }; 460 | const size_t bgStride{ stride + radiusAlign * 2 }; 461 | const size_t dst_stride{ dst->GetPitch(current_planes[i]) / sizeof(T) }; 462 | const size_t width{ src->GetRowSize(current_planes[i]) / sizeof(T) }; 463 | const T* srcp{ reinterpret_cast(src->GetReadPtr(current_planes[i])) }; 464 | T* dstp{ reinterpret_cast(dst->GetWritePtr(current_planes[i])) }; 465 | 466 | float* blur{ vsTCanny::blur + radiusAlign }; 467 | float* gradient{ vsTCanny::gradient + bgStride + radiusAlign }; 468 | 469 | if (radiusV[i] && radiusH[i]) 470 | gaussianBlur(srcp, gradient, blur, weightsH[i].get(), weightsV[i].get(), width, height, stride, bgStride, radiusH[i], radiusV[i]); 471 | else if (radiusV[i]) 472 | gaussianBlurV(srcp, blur, weightsV[i].get(), width, height, stride, bgStride, radiusV[i]); 473 | else if (radiusH[i]) 474 | gaussianBlurH(srcp, gradient, blur, weightsH[i].get(), width, height, stride, bgStride, radiusH[i]); 475 | else 476 | copyPlane(srcp, blur, width, height, stride, bgStride); 477 | 478 | if (mode_ != -1) 479 | { 480 | detectEdge(blur, gradient, direction, width, height, stride, bgStride, mode_, op_, scale); 481 | 482 | if (mode_ == 0) 483 | { 484 | nonMaximumSuppression(direction, gradient, blur, width, height, stride, bgStride, radiusAlign); 485 | hysteresis(blur, found.get(), width, height, bgStride, t_h_, t_l_); 486 | } 487 | } 488 | 489 | switch (mode_) 490 | { 491 | case 0: binarizeCE(blur, dstp, width, height, bgStride, dst_stride, peak); break; 492 | case 1: discretizeGM(gradient, dstp, width, height, bgStride, stride, peak); break; 493 | default: discretizeGM(blur, dstp, width, height, bgStride, dst_stride, peak); break; 494 | } 495 | 
} 496 | else if (process[i] == 2) 497 | env->BitBlt(dst->GetWritePtr(current_planes[i]), dst->GetPitch(current_planes[i]), src->GetReadPtr(current_planes[i]), src->GetPitch(current_planes[i]), src->GetRowSize(current_planes[i]), height); 498 | } 499 | } 500 | 501 | template void vsTCanny::filter_avx512(PVideoFrame& src, PVideoFrame& dst, IScriptEnvironment* env) noexcept; 502 | template void vsTCanny::filter_avx512(PVideoFrame& src, PVideoFrame& dst, IScriptEnvironment* env) noexcept; 503 | template void vsTCanny::filter_avx512(PVideoFrame& src, PVideoFrame& dst, IScriptEnvironment* env) noexcept; 504 | -------------------------------------------------------------------------------- /src/vsTCanny_SSE2.cpp: -------------------------------------------------------------------------------- 1 | #include "VCL2/vectormath_trig.h" 2 | #include "vsTCanny.h" 3 | 4 | template 5 | static void copyPlane(const T* srcp, float* dstp, const int width, const int height, const int srcStride, const int dstStride) noexcept 6 | { 7 | for (int y{ 0 }; y < height; ++y) 8 | { 9 | for (int x{ 0 }; x < width; x += 4) 10 | { 11 | if constexpr (std::is_same_v) 12 | to_float(Vec4i().load_4uc(srcp + x)).store_nt(dstp + x); 13 | else if constexpr (std::is_same_v) 14 | to_float(Vec4i().load_4us(srcp + x)).store_nt(dstp + x); 15 | else 16 | Vec4f().load_a(srcp + x).store_nt(dstp + x); 17 | } 18 | 19 | srcp += srcStride; 20 | dstp += dstStride; 21 | } 22 | } 23 | 24 | template 25 | static void gaussianBlur(const T* __srcp, float* temp, float* dstp, const float* weightsH, const float* weightsV, const int width, const int height, const int srcStride, const int dstStride, const int radiusH, const int radiusV) noexcept 26 | { 27 | const int diameter{ radiusV * 2 + 1 }; 28 | std::unique_ptr _srcp{ std::make_unique(diameter) }; 29 | 30 | _srcp[radiusV] = __srcp; 31 | for (int i{ 1 }; i <= radiusV; ++i) 32 | _srcp[radiusV - i] = _srcp[radiusV + i] = _srcp[radiusV] + srcStride * i; 33 | 34 | weightsH += 
radiusH; 35 | 36 | for (int y{ 0 }; y < height; ++y) 37 | { 38 | for (int x{ 0 }; x < width; x += 4) 39 | { 40 | Vec4f sum{ zero_4f() }; 41 | 42 | for (int i{ 0 }; i < diameter; ++i) 43 | { 44 | if constexpr (std::is_same_v) 45 | { 46 | const Vec4f srcp{ to_float(Vec4i().load_4uc(_srcp[i] + x)) }; 47 | sum = mul_add(srcp, weightsV[i], sum); 48 | } 49 | else if constexpr (std::is_same_v) 50 | { 51 | const Vec4f srcp{ to_float(Vec4i().load_4us(_srcp[i] + x)) }; 52 | sum = mul_add(srcp, weightsV[i], sum); 53 | } 54 | else 55 | { 56 | const Vec4f srcp{ Vec4f().load_a(_srcp[i] + x) }; 57 | sum = mul_add(srcp, weightsV[i], sum); 58 | } 59 | } 60 | 61 | sum.store_a(temp + x); 62 | } 63 | 64 | for (int i{ 1 }; i <= radiusH; ++i) 65 | { 66 | temp[-i] = temp[i]; 67 | temp[width - 1 + i] = temp[width - 1 - i]; 68 | } 69 | 70 | for (int x{ 0 }; x < width; x += 4) 71 | { 72 | Vec4f sum{ zero_4f() }; 73 | 74 | for (int i{ -radiusH }; i <= radiusH; ++i) 75 | { 76 | const Vec4f srcp{ Vec4f().load(temp + x + i) }; 77 | sum = mul_add(srcp, weightsH[i], sum); 78 | } 79 | 80 | sum.store_nt(dstp + x); 81 | } 82 | 83 | for (int i{ 0 }; i < diameter - 1; ++i) 84 | _srcp[i] = _srcp[i + 1]; 85 | 86 | _srcp[diameter - 1] += (y < height - 1 - radiusV) ? 
srcStride : -srcStride; 87 | 88 | dstp += dstStride; 89 | } 90 | } 91 | 92 | template 93 | static void gaussianBlurV(const T* __srcp, float* dstp, const float* weights, const int width, const int height, const int srcStride, const int dstStride, const int radius) noexcept 94 | { 95 | const int diameter{ radius * 2 + 1 }; 96 | std::unique_ptr _srcp{ std::make_unique(diameter) }; 97 | 98 | _srcp[radius] = __srcp; 99 | for (int i{ 1 }; i <= radius; ++i) 100 | _srcp[radius - i] = _srcp[radius + i] = _srcp[radius] + srcStride * i; 101 | 102 | for (int y{ 0 }; y < height; ++y) 103 | { 104 | for (int x{ 0 }; x < width; x += 4) 105 | { 106 | Vec4f sum{ zero_4f() }; 107 | 108 | for (int i{ 0 }; i < diameter; ++i) 109 | { 110 | if constexpr (std::is_same_v) 111 | { 112 | const Vec4f srcp{ to_float(Vec4i().load_4uc(_srcp[i] + x)) }; 113 | sum = mul_add(srcp, weights[i], sum); 114 | } 115 | else if constexpr (std::is_same_v) 116 | { 117 | const Vec4f srcp{ to_float(Vec4i().load_4us(_srcp[i] + x)) }; 118 | sum = mul_add(srcp, weights[i], sum); 119 | } 120 | else 121 | { 122 | const Vec4f srcp{ Vec4f().load_a(_srcp[i] + x) }; 123 | sum = mul_add(srcp, weights[i], sum); 124 | } 125 | } 126 | 127 | sum.store_nt(dstp + x); 128 | } 129 | 130 | for (int i{ 0 }; i < diameter - 1; ++i) 131 | _srcp[i] = _srcp[i + 1]; 132 | 133 | _srcp[diameter - 1] += (y < height - 1 - radius) ? 
srcStride : -srcStride; 134 | 135 | dstp += dstStride; 136 | } 137 | } 138 | 139 | template 140 | static void gaussianBlurH(const T* _srcp, float* temp, float* dstp, const float* weights, const int width, const int height, const int srcStride, const int dstStride, const int radius) noexcept 141 | { 142 | weights += radius; 143 | 144 | for (int y{ 0 }; y < height; ++y) 145 | { 146 | for (int x{ 0 }; x < width; x += 4) 147 | { 148 | if constexpr (std::is_same_v) 149 | to_float(Vec4i().load_4uc(_srcp + x)).store_a(temp + x); 150 | else if constexpr (std::is_same_v) 151 | to_float(Vec4i().load_4us(_srcp + x)).store_a(temp + x); 152 | else 153 | Vec4f().load_a(_srcp + x).store_a(temp + x); 154 | } 155 | 156 | for (int i{ 1 }; i <= radius; ++i) 157 | { 158 | temp[-i] = temp[i]; 159 | temp[width - 1 + i] = temp[width - 1 - i]; 160 | } 161 | 162 | for (int x{ 0 }; x < width; x += 4) 163 | { 164 | Vec4f sum{ zero_4f() }; 165 | 166 | for (int i{ -radius }; i <= radius; ++i) 167 | { 168 | const Vec4f srcp{ Vec4f().load(temp + x + i) }; 169 | sum = mul_add(srcp, weights[i], sum); 170 | } 171 | 172 | sum.store_nt(dstp + x); 173 | } 174 | 175 | _srcp += srcStride; 176 | dstp += dstStride; 177 | } 178 | } 179 | 180 | static void detectEdge(float* blur, float* gradient, int* direction, const int width, const int height, const int stride, const int bgStride, const int mode, const int op, const float scale) noexcept 181 | { 182 | float* __restrict cur{ blur }; 183 | float* __restrict next{ blur + bgStride }; 184 | float* __restrict next2{ blur + bgStride * 2 }; 185 | float* __restrict prev{ next }; 186 | float* __restrict prev2{ next2 }; 187 | 188 | cur[-1] = cur[1]; 189 | cur[width] = cur[width - 2]; 190 | 191 | if (op == FDOG) 192 | { 193 | cur[-2] = cur[2]; 194 | cur[width + 1] = cur[width - 3]; 195 | } 196 | 197 | for (int y{ 0 }; y < height; ++y) 198 | { 199 | next[-1] = next[1]; 200 | next[width] = next[width - 2]; 201 | 202 | if (op == FDOG) 203 | { 204 | next[-2] = next[2]; 
205 | next[width + 1] = next[width - 3]; 206 | 207 | next2[-1] = next2[1]; 208 | next2[-2] = next2[2]; 209 | next2[width] = next2[width - 2]; 210 | next2[width + 1] = next2[width - 3]; 211 | } 212 | 213 | for (int x{ 0 }; x < width; x += 4) 214 | { 215 | Vec4f gx, gy; 216 | 217 | if (op != FDOG) 218 | { 219 | const Vec4f c1{ Vec4f().load(prev + x - 1) }; 220 | const Vec4f c2{ Vec4f().load_a(prev + x) }; 221 | const Vec4f c3{ Vec4f().load(prev + x + 1) }; 222 | const Vec4f c4{ Vec4f().load(cur + x - 1) }; 223 | const Vec4f c6{ Vec4f().load(cur + x + 1) }; 224 | const Vec4f c7{ Vec4f().load(next + x - 1) }; 225 | const Vec4f c8{ Vec4f().load_a(next + x) }; 226 | const Vec4f c9{ Vec4f().load(next + x + 1) }; 227 | 228 | switch (op) 229 | { 230 | case TRITICAL: 231 | { 232 | gx = c6 - c4; 233 | gy = c2 - c8; 234 | break; 235 | } 236 | case PREWITT: 237 | { 238 | gx = (c3 + c6 + c9 - c1 - c4 - c7) * 0.5f; 239 | gy = (c1 + c2 + c3 - c7 - c8 - c9) * 0.5f; 240 | break; 241 | } 242 | case SOBEL: 243 | { 244 | gx = c3 + mul_add(2.0f, c6, c9) - c1 - mul_add(2.0f, c4, c7); 245 | gy = c1 + mul_add(2.0f, c2, c3) - c7 - mul_add(2.0f, c8, c9); 246 | break; 247 | } 248 | case SCHARR: 249 | { 250 | gx = mul_add(3.0f, c3 + c9, 10.0f * c6) - mul_add(3.0f, c1 + c7, 10.0f * c4); 251 | gy = mul_add(3.0f, c1 + c3, 10.0f * c2) - mul_add(3.0f, c7 + c9, 10.0f * c8); 252 | break; 253 | } 254 | case KROON: 255 | { 256 | gx = mul_add(17.0f, c3 + c9, 61.0f * c6) - mul_add(17.0f, c1 + c7, 61.0f * c4); 257 | gy = mul_add(17.0f, c1 + c3, 61.0f * c2) - mul_add(17.0f, c7 + c9, 61.0f * c8); 258 | break; 259 | } 260 | case KIRSCH: 261 | { 262 | const Vec4f g1{ mul_sub(5.0f, c1 + c2 + c3, 3.0f * (c4 + c6 + c7 + c8 + c9)) }; 263 | const Vec4f g2{ mul_sub(5.0f, c1 + c2 + c4, 3.0f * (c3 + c6 + c7 + c8 + c9)) }; 264 | const Vec4f g3{ mul_sub(5.0f, c1 + c4 + c7, 3.0f * (c2 + c3 + c6 + c8 + c9)) }; 265 | const Vec4f g4{ mul_sub(5.0f, c4 + c7 + c8, 3.0f * (c1 + c2 + c3 + c6 + c9)) }; 266 | const Vec4f g5{ 
mul_sub(5.0f, c7 + c8 + c9, 3.0f * (c1 + c2 + c3 + c4 + c6)) }; 267 | const Vec4f g6{ mul_sub(5.0f, c6 + c8 + c9, 3.0f * (c1 + c2 + c3 + c4 + c7)) }; 268 | const Vec4f g7{ mul_sub(5.0f, c3 + c6 + c9, 3.0f * (c1 + c2 + c4 + c7 + c8)) }; 269 | const Vec4f g8{ mul_sub(5.0f, c2 + c3 + c6, 3.0f * (c1 + c4 + c7 + c8 + c9)) }; 270 | const Vec4f g{ max(max(max(abs(g1), abs(g2)), max(abs(g3), abs(g4))), max(max(abs(g5), abs(g6)), max(abs(g7), abs(g8)))) }; 271 | (g * scale).store_nt(gradient + x); 272 | break; 273 | } 274 | } 275 | } 276 | else 277 | { 278 | const Vec4f c1{ Vec4f().load(prev2 + x - 2) }; 279 | const Vec4f c2{ Vec4f().load(prev2 + x - 1) }; 280 | const Vec4f c3{ Vec4f().load(prev2 + x) }; 281 | const Vec4f c4{ Vec4f().load(prev2 + x + 1) }; 282 | const Vec4f c5{ Vec4f().load(prev2 + x + 2) }; 283 | const Vec4f c6{ Vec4f().load(prev + x - 2) }; 284 | const Vec4f c7{ Vec4f().load(prev + x - 1) }; 285 | const Vec4f c8{ Vec4f().load(prev + x) }; 286 | const Vec4f c9{ Vec4f().load(prev + x + 1) }; 287 | const Vec4f c10{ Vec4f().load(prev + x + 2) }; 288 | const Vec4f c11{ Vec4f().load(cur + x - 2) }; 289 | const Vec4f c12{ Vec4f().load(cur + x - 1) }; 290 | const Vec4f c14{ Vec4f().load(cur + x + 1) }; 291 | const Vec4f c15{ Vec4f().load(cur + x + 2) }; 292 | const Vec4f c16{ Vec4f().load(next + x - 2) }; 293 | const Vec4f c17{ Vec4f().load(next + x - 1) }; 294 | const Vec4f c18{ Vec4f().load(next + x) }; 295 | const Vec4f c19{ Vec4f().load(next + x + 1) }; 296 | const Vec4f c20{ Vec4f().load(next + x + 2) }; 297 | const Vec4f c21{ Vec4f().load(next2 + x - 2) }; 298 | const Vec4f c22{ Vec4f().load(next2 + x - 1) }; 299 | const Vec4f c23{ Vec4f().load(next2 + x) }; 300 | const Vec4f c24{ Vec4f().load(next2 + x + 1) }; 301 | const Vec4f c25{ Vec4f().load(next2 + x + 2) }; 302 | 303 | gx = c5 + c25 + c4 + c24 + mul_add(2.0f, c10 + c20 + c9 + c19, 3.0f * (c15 + c14)) 304 | - c2 - c22 - c1 - c21 - mul_add(2.0f, c7 + c17 + c6 + c16, 3.0f * (c12 + c11)); 305 | gy = c1 + 
c5 + c6 + c10 + mul_add(2.0f, c2 + c4 + c7 + c9, 3.0f * (c3 + c8)) 306 | - c16 - c20 - c21 - c25 - mul_add(2.0f, c17 + c19 + c22 + c24, 3.0f * (c18 + c23)); 307 | } 308 | 309 | if (op != KIRSCH) 310 | { 311 | gx *= scale; 312 | gy *= scale; 313 | sqrt(mul_add(gx, gx, gy * gy)).store_nt(gradient + x); 314 | } 315 | 316 | if (mode == 0) 317 | { 318 | Vec4f dr{ atan2(gy, gx) }; 319 | dr = if_add(dr < 0.0f, dr, M_PIF); 320 | 321 | const Vec4i bin{ truncatei(mul_add(dr, 4.0f * M_1_PIF, 0.5f)) }; 322 | select(bin >= 4, zero_si128(), bin).store_nt(direction + x); 323 | } 324 | } 325 | 326 | prev2 = prev; 327 | prev = cur; 328 | cur = next; 329 | 330 | if (op != FDOG) 331 | next += (y < height - 2) ? bgStride : -bgStride; 332 | else 333 | { 334 | next = next2; 335 | next2 += (y < height - 3) ? bgStride : -bgStride; 336 | } 337 | 338 | gradient += bgStride; 339 | direction += stride; 340 | } 341 | } 342 | 343 | static void nonMaximumSuppression(const int* _direction, float* _gradient, float* blur, const int width, const int height, const int stride, const int bgStride, const int radiusAlign) noexcept 344 | { 345 | _gradient[-1] = _gradient[1]; 346 | _gradient[-1 + bgStride * (height - 1)] = _gradient[1 + bgStride * (height - 1)]; 347 | _gradient[width] = _gradient[width - 2]; 348 | _gradient[width + bgStride * (height - 1)] = _gradient[width - 2 + bgStride * (height - 1)]; 349 | std::copy_n(_gradient - radiusAlign + bgStride, width + radiusAlign * 2, _gradient - radiusAlign - bgStride); 350 | std::copy_n(_gradient - radiusAlign + bgStride * (height - 2), width + radiusAlign * 2, _gradient - radiusAlign + bgStride * static_cast(height)); 351 | 352 | for (int y{ 0 }; y < height; ++y) 353 | { 354 | for (int x{ 0 }; x < width; x += 4) 355 | { 356 | const Vec4ui direction{ Vec4ui().load_a(_direction + x) }; 357 | 358 | Vec4fb mask{ Vec4fb(direction == 0) }; 359 | Vec4f gradient{ max(Vec4f().load(_gradient + x + 1), Vec4f().load(_gradient + x - 1)) }; 360 | Vec4f result{ gradient 
& mask }; 361 | 362 | mask = Vec4fb(direction == 1); 363 | gradient = max(Vec4f().load(_gradient + x - bgStride + 1), Vec4f().load(_gradient + x + bgStride - 1)); 364 | result |= gradient & mask; 365 | 366 | mask = Vec4fb(direction == 2); 367 | gradient = max(Vec4f().load_a(_gradient + x - bgStride), Vec4f().load_a(_gradient + x + bgStride)); 368 | result |= gradient & mask; 369 | 370 | mask = Vec4fb(direction == 3); 371 | gradient = max(Vec4f().load(_gradient + x - bgStride - 1), Vec4f().load(_gradient + x + bgStride + 1)); 372 | result |= gradient & mask; 373 | 374 | gradient = Vec4f().load_a(_gradient + x); 375 | select(gradient >= result, gradient, fltLowest).store_nt(blur + x); 376 | } 377 | 378 | _direction += stride; 379 | _gradient += bgStride; 380 | blur += bgStride; 381 | } 382 | } 383 | 384 | template 385 | static void binarizeCE(const float* _srcp, T* dstp, const int width, const int height, const int srcStride, const int dstStride, const int peak) noexcept 386 | { 387 | for (int y{ 0 }; y < height; ++y) 388 | { 389 | for (int x{ 0 }; x < width; x += 4) 390 | { 391 | const Vec4f srcp{ Vec4f().load_a(_srcp + x) }; 392 | 393 | if constexpr (std::is_same_v) 394 | { 395 | const Vec16cb mask{ Vec16cb(compress_saturated(compress_saturated(Vec4ib(srcp == fltMax), zero_si128()), zero_si128())) }; 396 | select(mask, Vec16uc(255), zero_si128()).store_si32(dstp + x); 397 | } 398 | else if constexpr (std::is_same_v) 399 | { 400 | const Vec8sb mask{ Vec8sb(compress_saturated(Vec4ib(srcp == fltMax), zero_si128())) }; 401 | select(mask, Vec8us(peak), zero_si128()).storel(dstp + x); 402 | } 403 | else 404 | { 405 | const Vec4fb mask{ srcp == fltMax }; 406 | select(mask, Vec4f(1.0f), Vec4f(0.0f)).store_nt(dstp + x); 407 | } 408 | } 409 | 410 | _srcp += srcStride; 411 | dstp += dstStride; 412 | } 413 | } 414 | 415 | template 416 | static void discretizeGM(const float* _srcp, T* dstp, const int width, const int height, const int srcStride, const int dstStride, const int 
peak) noexcept 417 | { 418 | for (int y{ 0 }; y < height; ++y) 419 | { 420 | for (int x{ 0 }; x < width; x += 4) 421 | { 422 | const Vec4f srcp{ Vec4f().load_a(_srcp + x) }; 423 | 424 | if constexpr (std::is_same_v) 425 | { 426 | /* +0.5f then truncate; two saturating packs narrow the four results to bytes, of which four are stored */ const Vec16uc result{ compress_saturated_s2u(compress_saturated(truncatei(srcp + 0.5f), zero_si128()), zero_si128()) }; 427 | result.store_si32(dstp + x); 428 | } 429 | else if constexpr (std::is_same_v) 430 | { 431 | /* +0.5f then truncate; one saturating pack to 16-bit lanes, then limit to peak */ const Vec8us result{ compress_saturated_s2u(truncatei(srcp + 0.5f), zero_si128()) }; 432 | min(result, peak).storel(dstp + x); 433 | } 434 | else if constexpr (clampFP) 435 | min(max(srcp, 0.0f), 1.0f).store_nt(dstp + x); 436 | else 437 | srcp.store_nt(dstp + x); 438 | } 439 | 440 | _srcp += srcStride; 441 | dstp += dstStride; 442 | } 443 | } 444 | 445 | /* Per-frame driver for this code path. Iterates over at most three planes (G/B/R order for RGB clips, Y/U/V otherwise). For a plane with process[i] == 3 it runs: blur stage (gaussianBlur / gaussianBlurV / gaussianBlurH / copyPlane, chosen by which of radiusV[i] and radiusH[i] are non-zero), then detectEdge unless mode_ == -1, then nonMaximumSuppression and hysteresis when mode_ == 0, then the output conversion selected by mode_. For process[i] == 2 the source plane is copied to dst with BitBlt. Any other process[i] value leaves the destination plane untouched here. */ template 446 | void vsTCanny::filter_sse2(PVideoFrame& src, PVideoFrame& dst, IScriptEnvironment* env) noexcept 447 | { 448 | const int planes_y[3]{ PLANAR_Y, PLANAR_U, PLANAR_V }; 449 | const int planes_r[3]{ PLANAR_G, PLANAR_B, PLANAR_R }; 450 | const int* current_planes{ (vi.IsRGB()) ? 
planes_r : planes_y }; 451 | const int planecount{ std::min(vi.NumComponents(), 3) }; 452 | 453 | for (int i{ 0 }; i < planecount; ++i) 454 | { 455 | const int height{ src->GetHeight(current_planes[i]) }; 456 | 457 | if (process[i] == 3) 458 | { 459 | /* stride / dst_stride / width are in samples (bytes divided by sizeof(T)); bgStride is the row length of the padded float work planes */ const size_t stride{ src->GetPitch(current_planes[i]) / sizeof(T) }; 460 | const size_t bgStride{ stride + radiusAlign * 2 }; 461 | const size_t dst_stride{ dst->GetPitch(current_planes[i]) / sizeof(T) }; 462 | const size_t width{ src->GetRowSize(current_planes[i]) / sizeof(T) }; 463 | const T* srcp{ reinterpret_cast(src->GetReadPtr(current_planes[i])) }; 464 | T* dstp{ reinterpret_cast(dst->GetWritePtr(current_planes[i])) }; 465 | 466 | /* work-plane pointers are offset past the left padding (and, for gradient, past one padding row) */ float* blur{ vsTCanny::blur + radiusAlign }; 467 | float* gradient{ vsTCanny::gradient + bgStride + radiusAlign }; 468 | 469 | if (radiusV[i] && radiusH[i]) 470 | gaussianBlur(srcp, gradient, blur, weightsH[i].get(), weightsV[i].get(), width, height, stride, bgStride, radiusH[i], radiusV[i]); 471 | else if (radiusV[i]) 472 | gaussianBlurV(srcp, blur, weightsV[i].get(), width, height, stride, bgStride, radiusV[i]); 473 | else if (radiusH[i]) 474 | gaussianBlurH(srcp, gradient, blur, weightsH[i].get(), width, height, stride, bgStride, radiusH[i]); 475 | else 476 | copyPlane(srcp, blur, width, height, stride, bgStride); 477 | 478 | if (mode_ != -1) 479 | { 480 | detectEdge(blur, gradient, direction, width, height, stride, bgStride, mode_, op_, scale); 481 | 482 | if (mode_ == 0) 483 | { 484 | nonMaximumSuppression(direction, gradient, blur, width, height, stride, bgStride, radiusAlign); 485 | hysteresis(blur, found.get(), width, height, bgStride, t_h_, t_l_); 486 | } 487 | } 488 | 489 | /* Output stage: dstp points into dst, so every branch must step destination rows by dst_stride (dst's own pitch), which is not guaranteed to equal the source pitch held in stride. */ switch (mode_) 490 | { 491 | case 0: binarizeCE(blur, dstp, width, height, bgStride, dst_stride, peak); break; 492 | case 1: discretizeGM(gradient, dstp, width, height, bgStride, dst_stride, peak); break; 493 | default: discretizeGM(blur, dstp, width, height, bgStride, dst_stride, peak); break; 494 | } 495 | 
} 496 | /* process[i] == 2: copy the source plane into dst unchanged, each frame using its own pitch */ else if (process[i] == 2) 497 | env->BitBlt(dst->GetWritePtr(current_planes[i]), dst->GetPitch(current_planes[i]), src->GetReadPtr(current_planes[i]), src->GetPitch(current_planes[i]), src->GetRowSize(current_planes[i]), height); 498 | } 499 | } 500 | 501 | /* Explicit instantiations of filter_sse2. NOTE(review): the template argument lists are missing from this listing, so the three lines read identically here - presumably one per supported sample type; confirm against the repository. */ template void vsTCanny::filter_sse2(PVideoFrame& src, PVideoFrame& dst, IScriptEnvironment* env) noexcept; 502 | template void vsTCanny::filter_sse2(PVideoFrame& src, PVideoFrame& dst, IScriptEnvironment* env) noexcept; 503 | template void vsTCanny::filter_sse2(PVideoFrame& src, PVideoFrame& dst, IScriptEnvironment* env) noexcept; 504 | --------------------------------------------------------------------------------