├── .gitattributes
├── .gitignore
├── CHANGELOG.md
├── CMakeLists.txt
├── LICENSE
├── README.md
├── cmake_uninstall.cmake.in
├── msvc
    ├── vsTCanny.sln
    ├── vsTCanny.vcxproj
    └── vsTCanny.vcxproj.filters
└── src
    ├── VCL2
        ├── LICENSE
        ├── instrset.h
        ├── instrset_detect.cpp
        ├── vector_convert.h
        ├── vectorclass.h
        ├── vectorf128.h
        ├── vectorf256.h
        ├── vectorf256e.h
        ├── vectorf512.h
        ├── vectorf512e.h
        ├── vectori128.h
        ├── vectori256.h
        ├── vectori256e.h
        ├── vectori512.h
        ├── vectori512e.h
        ├── vectori512s.h
        ├── vectori512se.h
        ├── vectormath_common.h
        ├── vectormath_exp.h
        ├── vectormath_hyp.h
        ├── vectormath_lib.h
        └── vectormath_trig.h
    ├── vsTCanny.cpp
    ├── vsTCanny.h
    ├── vsTCanny.rc
    ├── vsTCanny_AVX2.cpp
    ├── vsTCanny_AVX512.cpp
    └── vsTCanny_SSE2.cpp


/.gitattributes:
--------------------------------------------------------------------------------
 1 | #sources
 2 | *.c text
 3 | *.cc text
 4 | *.cxx text
 5 | *.cpp text
 6 | *.c++ text
 7 | *.hpp text
 8 | *.h text
 9 | *.h++ text
10 | *.hh text
11 | 
12 | # Compiled Object files
13 | *.slo binary
14 | *.lo binary
15 | *.o binary
16 | *.obj binary
17 | 
18 | # Precompiled Headers
19 | *.gch binary
20 | *.pch binary
21 | 
22 | # Compiled Dynamic libraries
23 | *.so binary
24 | *.dylib binary
25 | *.dll binary
26 | 
27 | # Compiled Static libraries
28 | *.lai binary
29 | *.la binary
30 | *.a binary
31 | *.lib binary
32 | 
33 | # Executables
34 | *.exe binary
35 | *.out binary
36 | *.app binary
37 | ###############################################################################
38 | # Set default behavior to automatically normalize line endings.
39 | ###############################################################################
40 | * text=auto
41 | 
42 | ###############################################################################
43 | # Set the merge driver for project and solution files
44 | #
45 | # Merging from the command prompt will add diff markers to the files if there
46 | # are conflicts (Merging from VS is not affected by the settings below, in VS
47 | # the diff markers are never inserted). Diff markers may cause the following 
48 | # file extensions to fail to load in VS. An alternative would be to treat
49 | # these files as binary and thus will always conflict and require user
50 | # intervention with every merge. To do so, just comment the entries below and
51 | # uncomment the group further below
52 | ###############################################################################
53 | 
54 | *.sln        text eol=crlf
55 | *.csproj     text eol=crlf
56 | *.vbproj     text eol=crlf
57 | *.vcxproj    text eol=crlf
58 | *.vcproj     text eol=crlf
59 | *.dbproj     text eol=crlf
60 | *.fsproj     text eol=crlf
61 | *.lsproj     text eol=crlf
62 | *.wixproj    text eol=crlf
63 | *.modelproj  text eol=crlf
64 | *.sqlproj    text eol=crlf
65 | *.wwaproj    text eol=crlf
66 | 
67 | *.xproj      text eol=crlf
68 | *.props      text eol=crlf
69 | *.filters    text eol=crlf
70 | *.vcxitems   text eol=crlf
71 | 
72 | 
73 | #*.sln       merge=binary
74 | #*.csproj    merge=binary
75 | #*.vbproj    merge=binary
76 | #*.vcxproj   merge=binary
77 | #*.vcproj    merge=binary
78 | #*.dbproj    merge=binary
79 | #*.fsproj    merge=binary
80 | #*.lsproj    merge=binary
81 | #*.wixproj   merge=binary
82 | #*.modelproj merge=binary
83 | #*.sqlproj   merge=binary
84 | #*.wwaproj   merge=binary
85 | 
86 | #*.xproj     merge=binary
87 | #*.props     merge=binary
88 | #*.filters   merge=binary
89 | #*.vcxitems  merge=binary
90 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
  1 | ## Ignore Visual Studio temporary files, build results, and
  2 | ## files generated by popular Visual Studio add-ons.
  3 | ##
  4 | ## Get latest from https://github.com/github/gitignore/blob/master/VisualStudio.gitignore
  5 | 
  6 | # User-specific files
  7 | *.rsuser
  8 | *.suo
  9 | *.user
 10 | *.userosscache
 11 | *.sln.docstates
 12 | 
 13 | # User-specific files (MonoDevelop/Xamarin Studio)
 14 | *.userprefs
 15 | 
 16 | # Build results
 17 | [Dd]ebug/
 18 | [Dd]ebugPublic/
 19 | [Rr]elease/
 20 | [Rr]eleases/
 21 | x64/
 22 | x86/
 23 | [Aa][Rr][Mm]/
 24 | [Aa][Rr][Mm]64/
 25 | bld/
 26 | [Bb]in/
 27 | [Oo]bj/
 28 | [Ll]og/
 29 | 
 30 | # Visual Studio 2015/2017 cache/options directory
 31 | .vs/
 32 | # Uncomment if you have tasks that create the project's static files in wwwroot
 33 | #wwwroot/
 34 | 
 35 | # Visual Studio 2017 auto generated files
 36 | Generated\ Files/
 37 | 
 38 | # MSTest test Results
 39 | [Tt]est[Rr]esult*/
 40 | [Bb]uild[Ll]og.*
 41 | 
 42 | # NUNIT
 43 | *.VisualState.xml
 44 | TestResult.xml
 45 | 
 46 | # Build Results of an ATL Project
 47 | [Dd]ebugPS/
 48 | [Rr]eleasePS/
 49 | dlldata.c
 50 | 
 51 | # Benchmark Results
 52 | BenchmarkDotNet.Artifacts/
 53 | 
 54 | # .NET Core
 55 | project.lock.json
 56 | project.fragment.lock.json
 57 | artifacts/
 58 | 
 59 | # StyleCop
 60 | StyleCopReport.xml
 61 | 
 62 | # Files built by Visual Studio
 63 | *_i.c
 64 | *_p.c
 65 | *_h.h
 66 | *.ilk
 67 | *.meta
 68 | *.obj
 69 | *.iobj
 70 | *.pch
 71 | *.pdb
 72 | *.ipdb
 73 | *.pgc
 74 | *.pgd
 75 | *.rsp
 76 | *.sbr
 77 | *.tlb
 78 | *.tli
 79 | *.tlh
 80 | *.tmp
 81 | *.tmp_proj
 82 | *_wpftmp.csproj
 83 | *.log
 84 | *.vspscc
 85 | *.vssscc
 86 | .builds
 87 | *.pidb
 88 | *.svclog
 89 | *.scc
 90 | 
 91 | # Chutzpah Test files
 92 | _Chutzpah*
 93 | 
 94 | # Visual C++ cache files
 95 | ipch/
 96 | *.aps
 97 | *.ncb
 98 | *.opendb
 99 | *.opensdf
100 | *.sdf
101 | *.cachefile
102 | *.VC.db
103 | *.VC.VC.opendb
104 | 
105 | # Visual Studio profiler
106 | *.psess
107 | *.vsp
108 | *.vspx
109 | *.sap
110 | 
111 | # Visual Studio Trace Files
112 | *.e2e
113 | 
114 | # TFS 2012 Local Workspace
115 | $tf/
116 | 
117 | # Guidance Automation Toolkit
118 | *.gpState
119 | 
120 | # ReSharper is a .NET coding add-in
121 | _ReSharper*/
122 | *.[Rr]e[Ss]harper
123 | *.DotSettings.user
124 | 
125 | # JustCode is a .NET coding add-in
126 | .JustCode
127 | 
128 | # TeamCity is a build add-in
129 | _TeamCity*
130 | 
131 | # DotCover is a Code Coverage Tool
132 | *.dotCover
133 | 
134 | # AxoCover is a Code Coverage Tool
135 | .axoCover/*
136 | !.axoCover/settings.json
137 | 
138 | # Visual Studio code coverage results
139 | *.coverage
140 | *.coveragexml
141 | 
142 | # NCrunch
143 | _NCrunch_*
144 | .*crunch*.local.xml
145 | nCrunchTemp_*
146 | 
147 | # MightyMoose
148 | *.mm.*
149 | AutoTest.Net/
150 | 
151 | # Web workbench (sass)
152 | .sass-cache/
153 | 
154 | # Installshield output folder
155 | [Ee]xpress/
156 | 
157 | # DocProject is a documentation generator add-in
158 | DocProject/buildhelp/
159 | DocProject/Help/*.HxT
160 | DocProject/Help/*.HxC
161 | DocProject/Help/*.hhc
162 | DocProject/Help/*.hhk
163 | DocProject/Help/*.hhp
164 | DocProject/Help/Html2
165 | DocProject/Help/html
166 | 
167 | # Click-Once directory
168 | publish/
169 | 
170 | # Publish Web Output
171 | *.[Pp]ublish.xml
172 | *.azurePubxml
173 | # Note: Comment the next line if you want to checkin your web deploy settings,
174 | # but database connection strings (with potential passwords) will be unencrypted
175 | *.pubxml
176 | *.publishproj
177 | 
178 | # Microsoft Azure Web App publish settings. Comment the next line if you want to
179 | # checkin your Azure Web App publish settings, but sensitive information contained
180 | # in these scripts will be unencrypted
181 | PublishScripts/
182 | 
183 | # NuGet Packages
184 | *.nupkg
185 | # The packages folder can be ignored because of Package Restore
186 | **/[Pp]ackages/*
187 | # except build/, which is used as an MSBuild target.
188 | !**/[Pp]ackages/build/
189 | # Uncomment if necessary however generally it will be regenerated when needed
190 | #!**/[Pp]ackages/repositories.config
191 | # NuGet v3's project.json files produces more ignorable files
192 | *.nuget.props
193 | *.nuget.targets
194 | 
195 | # Microsoft Azure Build Output
196 | csx/
197 | *.build.csdef
198 | 
199 | # Microsoft Azure Emulator
200 | ecf/
201 | rcf/
202 | 
203 | # Windows Store app package directories and files
204 | AppPackages/
205 | BundleArtifacts/
206 | Package.StoreAssociation.xml
207 | _pkginfo.txt
208 | *.appx
209 | 
210 | # Visual Studio cache files
211 | # files ending in .cache can be ignored
212 | *.[Cc]ache
213 | # but keep track of directories ending in .cache
214 | !*.[Cc]ache/
215 | 
216 | # Others
217 | ClientBin/
218 | ~$*
219 | *~
220 | *.dbmdl
221 | *.dbproj.schemaview
222 | *.jfm
223 | *.pfx
224 | *.publishsettings
225 | orleans.codegen.cs
226 | 
227 | # Including strong name files can present a security risk
228 | # (https://github.com/github/gitignore/pull/2483#issue-259490424)
229 | #*.snk
230 | 
231 | # Since there are multiple workflows, uncomment next line to ignore bower_components
232 | # (https://github.com/github/gitignore/pull/1529#issuecomment-104372622)
233 | #bower_components/
234 | # ASP.NET Core default setup: bower directory is configured as wwwroot/lib/ and bower restore is true
235 | **/wwwroot/lib/
236 | 
237 | # RIA/Silverlight projects
238 | Generated_Code/
239 | 
240 | # Backup & report files from converting an old project file
241 | # to a newer Visual Studio version. Backup files are not needed,
242 | # because we have git ;-)
243 | _UpgradeReport_Files/
244 | Backup*/
245 | UpgradeLog*.XML
246 | UpgradeLog*.htm
247 | ServiceFabricBackup/
248 | *.rptproj.bak
249 | 
250 | # SQL Server files
251 | *.mdf
252 | *.ldf
253 | *.ndf
254 | 
255 | # Business Intelligence projects
256 | *.rdl.data
257 | *.bim.layout
258 | *.bim_*.settings
259 | *.rptproj.rsuser
260 | 
261 | # Microsoft Fakes
262 | FakesAssemblies/
263 | 
264 | # GhostDoc plugin setting file
265 | *.GhostDoc.xml
266 | 
267 | # Node.js Tools for Visual Studio
268 | .ntvs_analysis.dat
269 | node_modules/
270 | 
271 | # Visual Studio 6 build log
272 | *.plg
273 | 
274 | # Visual Studio 6 workspace options file
275 | *.opt
276 | 
277 | # Visual Studio 6 auto-generated workspace file (contains which files were open etc.)
278 | *.vbw
279 | 
280 | # Visual Studio LightSwitch build output
281 | **/*.HTMLClient/GeneratedArtifacts
282 | **/*.DesktopClient/GeneratedArtifacts
283 | **/*.DesktopClient/ModelManifest.xml
284 | **/*.Server/GeneratedArtifacts
285 | **/*.Server/ModelManifest.xml
286 | _Pvt_Extensions
287 | 
288 | # Paket dependency manager
289 | .paket/paket.exe
290 | paket-files/
291 | 
292 | # FAKE - F# Make
293 | .fake/
294 | 
295 | # JetBrains Rider
296 | .idea/
297 | *.sln.iml
298 | 
299 | # CodeRush personal settings
300 | .cr/personal
301 | 
302 | # Python Tools for Visual Studio (PTVS)
303 | __pycache__/
304 | *.pyc
305 | 
306 | # Cake - Uncomment if you are using it
307 | # tools/**
308 | # !tools/packages.config
309 | 
310 | # Tabs Studio
311 | *.tss
312 | 
313 | # Telerik's JustMock configuration file
314 | *.jmconfig
315 | 
316 | # BizTalk build output
317 | *.btp.cs
318 | *.btm.cs
319 | *.odx.cs
320 | *.xsd.cs
321 | 
322 | # OpenCover UI analysis results
323 | OpenCover/
324 | 
325 | # Azure Stream Analytics local run output
326 | ASALocalRun/
327 | 
328 | # MSBuild Binary and Structured Log
329 | *.binlog
330 | 
331 | # NVidia Nsight GPU debugger configuration file
332 | *.nvuser
333 | 
334 | # MFractors (Xamarin productivity tool) working folder
335 | .mfractor/
336 | 
337 | # Local History for Visual Studio
338 | .localhistory/
339 | 


--------------------------------------------------------------------------------
/CHANGELOG.md:
--------------------------------------------------------------------------------
 1 | ##### 1.1.8:
 2 |     Fixed default sigma_vY when the clip has only one plane. (regression from 1.1.7)
 3 | 
 4 | ##### 1.1.7:
 5 |     Changed the behavior of default sigma_vU.
 6 | 
 7 | ##### 1.1.6:
 8 |     Fixed default sigma_U/V/vU/vV for RGB formats.
 9 |     Changed default sigma_vU/vV. Now they are inherited from sigmaU/V.
10 | 
11 | ##### 1.1.5:
12 |     Fixed the processing of planes for RGB formats.
13 |     Properly clamped float mask to 0-1 range in mode=1. (VS plugin r14)
14 | 
15 | ##### 1.1.4:
16 |     Fixed the behavior when y/u/v=1.
17 | 
18 | ##### 1.1.3:
19 |     Fixed the uninitialized variables when the clip has only one plane.
20 | 
21 | ##### 1.1.2:
22 |     Fixed a bug when sigma=0 and the plane is not processed.
23 | 
24 | ##### 1.1.1:
25 |     Fixed the processing of clips with one plane.
26 | 
27 | ##### 1.1.0:
28 |     Changed chroma planes range from -0.5..0.5 to 0.0..1.0 (float clips). (VS plugin r13)
29 |     Added AVX512 code. (VS plugin r13)
30 |     Added Kroon, Kirsch and FDoG operators. (VS plugin r13)
31 |     Renamed `gmmax` parameter to `scale` and changed its default to 1.0. (VS plugin r13)
32 |     Changed default sigma_vY from 1.5 to sigmaY.
33 | 
34 | ##### 1.0.1:
35 |     Fixed sigma for RGB clips.
36 | 
37 | ##### 1.0.0:
38 |     Port of the VapourSynth plugin TCanny r12.
39 | 


--------------------------------------------------------------------------------
/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | cmake_minimum_required(VERSION 3.16)
 2 | 
 3 | project(libvstcanny LANGUAGES CXX)
 4 | 
 5 | # AviSynth plugin built as a shared library. Each SIMD variant lives in its own
 6 | # source file so that it can be compiled with its own instruction-set flags below.
 7 | add_library(vstcanny SHARED
 8 |     src/vsTCanny.cpp
 9 |     src/vsTCanny_SSE2.cpp
10 |     src/vsTCanny_AVX2.cpp
11 |     src/vsTCanny_AVX512.cpp
12 | )
13 | 
14 | target_include_directories(vstcanny PRIVATE
15 |     ${CMAKE_CURRENT_SOURCE_DIR}/src
16 |     /usr/local/include/avisynth
17 | )
18 | 
19 | # Default to a Release build when no build type was requested.
20 | if (NOT CMAKE_BUILD_TYPE)
21 |     set(CMAKE_BUILD_TYPE "Release" CACHE STRING "" FORCE)
22 | endif()
23 | 
24 | # Debug (compared case-insensitively) defines DEBUG_BUILD; every other build type
25 | # defines RELEASE_BUILD.
26 | string(TOLOWER "${CMAKE_BUILD_TYPE}" build_type)
27 | if (build_type STREQUAL "debug")
28 |     target_compile_definitions(vstcanny PRIVATE DEBUG_BUILD)
29 | else ()
30 |     target_compile_definitions(vstcanny PRIVATE RELEASE_BUILD)
31 | endif ()
32 | 
33 | message(STATUS "Build type - ${CMAKE_BUILD_TYPE}")
34 | 
35 | target_compile_features(vstcanny PRIVATE cxx_std_17)
36 | 
37 | # Per-file instruction-set flags (GCC-style option syntax).
38 | set_source_files_properties(src/vsTCanny_SSE2.cpp PROPERTIES COMPILE_OPTIONS "-mfpmath=sse;-msse2")
39 | set_source_files_properties(src/vsTCanny_AVX2.cpp PROPERTIES COMPILE_OPTIONS "-mavx2;-mfma")
40 | set_source_files_properties(src/vsTCanny_AVX512.cpp PROPERTIES COMPILE_OPTIONS "-mavx512f;-mavx512bw;-mavx512dq;-mavx512vl;-mfma")
41 | 
42 | find_package (Git)
43 | 
44 | # Append the most recent git tag to the library file name when one is available.
45 | if (GIT_FOUND)
46 |     # Run git inside the source tree: the build directory may be outside the repository.
47 |     execute_process (COMMAND ${GIT_EXECUTABLE} describe --tags --abbrev=0
48 |         WORKING_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}"
49 |         RESULT_VARIABLE git_result
50 |         OUTPUT_VARIABLE ver
51 |         ERROR_QUIET
52 |         OUTPUT_STRIP_TRAILING_WHITESPACE
53 |     )
54 |     # Without a usable tag, keep the default name rather than producing "vstcanny.".
55 |     if (git_result EQUAL 0 AND NOT "${ver}" STREQUAL "")
56 |         set_target_properties(vstcanny PROPERTIES OUTPUT_NAME "vstcanny.${ver}")
57 |     else ()
58 |         message (STATUS "No git tag found - using the default library name")
59 |     endif ()
60 | else ()
61 |     message (STATUS "GIT not found")
62 | endif ()
63 | 
64 | include(GNUInstallDirs)
65 | 
66 | INSTALL(TARGETS vstcanny LIBRARY DESTINATION "${CMAKE_INSTALL_LIBDIR}/avisynth")
67 | 
68 | # uninstall target
69 | if(NOT TARGET uninstall)
70 |   configure_file(
71 |     "${CMAKE_CURRENT_SOURCE_DIR}/cmake_uninstall.cmake.in"
72 |     "${CMAKE_CURRENT_BINARY_DIR}/cmake_uninstall.cmake"
73 |     IMMEDIATE @ONLY)
74 | 
75 |   add_custom_target(uninstall
76 |     COMMAND ${CMAKE_COMMAND} -P ${CMAKE_CURRENT_BINARY_DIR}/cmake_uninstall.cmake)
77 | endif()
78 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
  1 |                     GNU GENERAL PUBLIC LICENSE
  2 |                        Version 2, June 1991
  3 | 
  4 |  Copyright (C) 1989, 1991 Free Software Foundation, Inc., <http://fsf.org/>
  5 |  51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  6 |  Everyone is permitted to copy and distribute verbatim copies
  7 |  of this license document, but changing it is not allowed.
  8 | 
  9 |                             Preamble
 10 | 
 11 |   The licenses for most software are designed to take away your
 12 | freedom to share and change it.  By contrast, the GNU General Public
 13 | License is intended to guarantee your freedom to share and change free
 14 | software--to make sure the software is free for all its users.  This
 15 | General Public License applies to most of the Free Software
 16 | Foundation's software and to any other program whose authors commit to
 17 | using it.  (Some other Free Software Foundation software is covered by
 18 | the GNU Lesser General Public License instead.)  You can apply it to
 19 | your programs, too.
 20 | 
 21 |   When we speak of free software, we are referring to freedom, not
 22 | price.  Our General Public Licenses are designed to make sure that you
 23 | have the freedom to distribute copies of free software (and charge for
 24 | this service if you wish), that you receive source code or can get it
 25 | if you want it, that you can change the software or use pieces of it
 26 | in new free programs; and that you know you can do these things.
 27 | 
 28 |   To protect your rights, we need to make restrictions that forbid
 29 | anyone to deny you these rights or to ask you to surrender the rights.
 30 | These restrictions translate to certain responsibilities for you if you
 31 | distribute copies of the software, or if you modify it.
 32 | 
 33 |   For example, if you distribute copies of such a program, whether
 34 | gratis or for a fee, you must give the recipients all the rights that
 35 | you have.  You must make sure that they, too, receive or can get the
 36 | source code.  And you must show them these terms so they know their
 37 | rights.
 38 | 
 39 |   We protect your rights with two steps: (1) copyright the software, and
 40 | (2) offer you this license which gives you legal permission to copy,
 41 | distribute and/or modify the software.
 42 | 
 43 |   Also, for each author's protection and ours, we want to make certain
 44 | that everyone understands that there is no warranty for this free
 45 | software.  If the software is modified by someone else and passed on, we
 46 | want its recipients to know that what they have is not the original, so
 47 | that any problems introduced by others will not reflect on the original
 48 | authors' reputations.
 49 | 
 50 |   Finally, any free program is threatened constantly by software
 51 | patents.  We wish to avoid the danger that redistributors of a free
 52 | program will individually obtain patent licenses, in effect making the
 53 | program proprietary.  To prevent this, we have made it clear that any
 54 | patent must be licensed for everyone's free use or not licensed at all.
 55 | 
 56 |   The precise terms and conditions for copying, distribution and
 57 | modification follow.
 58 | 
 59 |                     GNU GENERAL PUBLIC LICENSE
 60 |    TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
 61 | 
 62 |   0. This License applies to any program or other work which contains
 63 | a notice placed by the copyright holder saying it may be distributed
 64 | under the terms of this General Public License.  The "Program", below,
 65 | refers to any such program or work, and a "work based on the Program"
 66 | means either the Program or any derivative work under copyright law:
 67 | that is to say, a work containing the Program or a portion of it,
 68 | either verbatim or with modifications and/or translated into another
 69 | language.  (Hereinafter, translation is included without limitation in
 70 | the term "modification".)  Each licensee is addressed as "you".
 71 | 
 72 | Activities other than copying, distribution and modification are not
 73 | covered by this License; they are outside its scope.  The act of
 74 | running the Program is not restricted, and the output from the Program
 75 | is covered only if its contents constitute a work based on the
 76 | Program (independent of having been made by running the Program).
 77 | Whether that is true depends on what the Program does.
 78 | 
 79 |   1. You may copy and distribute verbatim copies of the Program's
 80 | source code as you receive it, in any medium, provided that you
 81 | conspicuously and appropriately publish on each copy an appropriate
 82 | copyright notice and disclaimer of warranty; keep intact all the
 83 | notices that refer to this License and to the absence of any warranty;
 84 | and give any other recipients of the Program a copy of this License
 85 | along with the Program.
 86 | 
 87 | You may charge a fee for the physical act of transferring a copy, and
 88 | you may at your option offer warranty protection in exchange for a fee.
 89 | 
 90 |   2. You may modify your copy or copies of the Program or any portion
 91 | of it, thus forming a work based on the Program, and copy and
 92 | distribute such modifications or work under the terms of Section 1
 93 | above, provided that you also meet all of these conditions:
 94 | 
 95 |     a) You must cause the modified files to carry prominent notices
 96 |     stating that you changed the files and the date of any change.
 97 | 
 98 |     b) You must cause any work that you distribute or publish, that in
 99 |     whole or in part contains or is derived from the Program or any
100 |     part thereof, to be licensed as a whole at no charge to all third
101 |     parties under the terms of this License.
102 | 
103 |     c) If the modified program normally reads commands interactively
104 |     when run, you must cause it, when started running for such
105 |     interactive use in the most ordinary way, to print or display an
106 |     announcement including an appropriate copyright notice and a
107 |     notice that there is no warranty (or else, saying that you provide
108 |     a warranty) and that users may redistribute the program under
109 |     these conditions, and telling the user how to view a copy of this
110 |     License.  (Exception: if the Program itself is interactive but
111 |     does not normally print such an announcement, your work based on
112 |     the Program is not required to print an announcement.)
113 | 
114 | These requirements apply to the modified work as a whole.  If
115 | identifiable sections of that work are not derived from the Program,
116 | and can be reasonably considered independent and separate works in
117 | themselves, then this License, and its terms, do not apply to those
118 | sections when you distribute them as separate works.  But when you
119 | distribute the same sections as part of a whole which is a work based
120 | on the Program, the distribution of the whole must be on the terms of
121 | this License, whose permissions for other licensees extend to the
122 | entire whole, and thus to each and every part regardless of who wrote it.
123 | 
124 | Thus, it is not the intent of this section to claim rights or contest
125 | your rights to work written entirely by you; rather, the intent is to
126 | exercise the right to control the distribution of derivative or
127 | collective works based on the Program.
128 | 
129 | In addition, mere aggregation of another work not based on the Program
130 | with the Program (or with a work based on the Program) on a volume of
131 | a storage or distribution medium does not bring the other work under
132 | the scope of this License.
133 | 
134 |   3. You may copy and distribute the Program (or a work based on it,
135 | under Section 2) in object code or executable form under the terms of
136 | Sections 1 and 2 above provided that you also do one of the following:
137 | 
138 |     a) Accompany it with the complete corresponding machine-readable
139 |     source code, which must be distributed under the terms of Sections
140 |     1 and 2 above on a medium customarily used for software interchange; or,
141 | 
142 |     b) Accompany it with a written offer, valid for at least three
143 |     years, to give any third party, for a charge no more than your
144 |     cost of physically performing source distribution, a complete
145 |     machine-readable copy of the corresponding source code, to be
146 |     distributed under the terms of Sections 1 and 2 above on a medium
147 |     customarily used for software interchange; or,
148 | 
149 |     c) Accompany it with the information you received as to the offer
150 |     to distribute corresponding source code.  (This alternative is
151 |     allowed only for noncommercial distribution and only if you
152 |     received the program in object code or executable form with such
153 |     an offer, in accord with Subsection b above.)
154 | 
155 | The source code for a work means the preferred form of the work for
156 | making modifications to it.  For an executable work, complete source
157 | code means all the source code for all modules it contains, plus any
158 | associated interface definition files, plus the scripts used to
159 | control compilation and installation of the executable.  However, as a
160 | special exception, the source code distributed need not include
161 | anything that is normally distributed (in either source or binary
162 | form) with the major components (compiler, kernel, and so on) of the
163 | operating system on which the executable runs, unless that component
164 | itself accompanies the executable.
165 | 
166 | If distribution of executable or object code is made by offering
167 | access to copy from a designated place, then offering equivalent
168 | access to copy the source code from the same place counts as
169 | distribution of the source code, even though third parties are not
170 | compelled to copy the source along with the object code.
171 | 
172 |   4. You may not copy, modify, sublicense, or distribute the Program
173 | except as expressly provided under this License.  Any attempt
174 | otherwise to copy, modify, sublicense or distribute the Program is
175 | void, and will automatically terminate your rights under this License.
176 | However, parties who have received copies, or rights, from you under
177 | this License will not have their licenses terminated so long as such
178 | parties remain in full compliance.
179 | 
180 |   5. You are not required to accept this License, since you have not
181 | signed it.  However, nothing else grants you permission to modify or
182 | distribute the Program or its derivative works.  These actions are
183 | prohibited by law if you do not accept this License.  Therefore, by
184 | modifying or distributing the Program (or any work based on the
185 | Program), you indicate your acceptance of this License to do so, and
186 | all its terms and conditions for copying, distributing or modifying
187 | the Program or works based on it.
188 | 
189 |   6. Each time you redistribute the Program (or any work based on the
190 | Program), the recipient automatically receives a license from the
191 | original licensor to copy, distribute or modify the Program subject to
192 | these terms and conditions.  You may not impose any further
193 | restrictions on the recipients' exercise of the rights granted herein.
194 | You are not responsible for enforcing compliance by third parties to
195 | this License.
196 | 
197 |   7. If, as a consequence of a court judgment or allegation of patent
198 | infringement or for any other reason (not limited to patent issues),
199 | conditions are imposed on you (whether by court order, agreement or
200 | otherwise) that contradict the conditions of this License, they do not
201 | excuse you from the conditions of this License.  If you cannot
202 | distribute so as to satisfy simultaneously your obligations under this
203 | License and any other pertinent obligations, then as a consequence you
204 | may not distribute the Program at all.  For example, if a patent
205 | license would not permit royalty-free redistribution of the Program by
206 | all those who receive copies directly or indirectly through you, then
207 | the only way you could satisfy both it and this License would be to
208 | refrain entirely from distribution of the Program.
209 | 
210 | If any portion of this section is held invalid or unenforceable under
211 | any particular circumstance, the balance of the section is intended to
212 | apply and the section as a whole is intended to apply in other
213 | circumstances.
214 | 
215 | It is not the purpose of this section to induce you to infringe any
216 | patents or other property right claims or to contest validity of any
217 | such claims; this section has the sole purpose of protecting the
218 | integrity of the free software distribution system, which is
219 | implemented by public license practices.  Many people have made
220 | generous contributions to the wide range of software distributed
221 | through that system in reliance on consistent application of that
222 | system; it is up to the author/donor to decide if he or she is willing
223 | to distribute software through any other system and a licensee cannot
224 | impose that choice.
225 | 
226 | This section is intended to make thoroughly clear what is believed to
227 | be a consequence of the rest of this License.
228 | 
229 |   8. If the distribution and/or use of the Program is restricted in
230 | certain countries either by patents or by copyrighted interfaces, the
231 | original copyright holder who places the Program under this License
232 | may add an explicit geographical distribution limitation excluding
233 | those countries, so that distribution is permitted only in or among
234 | countries not thus excluded.  In such case, this License incorporates
235 | the limitation as if written in the body of this License.
236 | 
237 |   9. The Free Software Foundation may publish revised and/or new versions
238 | of the General Public License from time to time.  Such new versions will
239 | be similar in spirit to the present version, but may differ in detail to
240 | address new problems or concerns.
241 | 
242 | Each version is given a distinguishing version number.  If the Program
243 | specifies a version number of this License which applies to it and "any
244 | later version", you have the option of following the terms and conditions
245 | either of that version or of any later version published by the Free
246 | Software Foundation.  If the Program does not specify a version number of
247 | this License, you may choose any version ever published by the Free Software
248 | Foundation.
249 | 
250 |   10. If you wish to incorporate parts of the Program into other free
251 | programs whose distribution conditions are different, write to the author
252 | to ask for permission.  For software which is copyrighted by the Free
253 | Software Foundation, write to the Free Software Foundation; we sometimes
254 | make exceptions for this.  Our decision will be guided by the two goals
255 | of preserving the free status of all derivatives of our free software and
256 | of promoting the sharing and reuse of software generally.
257 | 
258 |                             NO WARRANTY
259 | 
260 |   11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY
261 | FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW.  EXCEPT WHEN
262 | OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES
263 | PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED
264 | OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
265 | MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.  THE ENTIRE RISK AS
266 | TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU.  SHOULD THE
267 | PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING,
268 | REPAIR OR CORRECTION.
269 | 
270 |   12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
271 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR
272 | REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES,
273 | INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING
274 | OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED
275 | TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY
276 | YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER
277 | PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE
278 | POSSIBILITY OF SUCH DAMAGES.
279 | 
280 |                      END OF TERMS AND CONDITIONS
281 | 
282 |             How to Apply These Terms to Your New Programs
283 | 
284 |   If you develop a new program, and you want it to be of the greatest
285 | possible use to the public, the best way to achieve this is to make it
286 | free software which everyone can redistribute and change under these terms.
287 | 
288 |   To do so, attach the following notices to the program.  It is safest
289 | to attach them to the start of each source file to most effectively
290 | convey the exclusion of warranty; and each file should have at least
291 | the "copyright" line and a pointer to where the full notice is found.
292 | 
293 |     MSharpen
294 |     Copyright (C) 2020  AvisynthPlus plugins
295 | 
296 |     This program is free software; you can redistribute it and/or modify
297 |     it under the terms of the GNU General Public License as published by
298 |     the Free Software Foundation; either version 2 of the License, or
299 |     (at your option) any later version.
300 | 
301 |     This program is distributed in the hope that it will be useful,
302 |     but WITHOUT ANY WARRANTY; without even the implied warranty of
303 |     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
304 |     GNU General Public License for more details.
305 | 
306 |     You should have received a copy of the GNU General Public License along
307 |     with this program; if not, write to the Free Software Foundation, Inc.,
308 |     51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
309 | 
310 | Also add information on how to contact you by electronic and paper mail.
311 | 
312 | If the program is interactive, make it output a short notice like this
313 | when it starts in an interactive mode:
314 | 
315 |     Gnomovision version 69, Copyright (C) year name of author
316 |     Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
317 |     This is free software, and you are welcome to redistribute it
318 |     under certain conditions; type `show c' for details.
319 | 
320 | The hypothetical commands `show w' and `show c' should show the appropriate
321 | parts of the General Public License.  Of course, the commands you use may
322 | be called something other than `show w' and `show c'; they could even be
323 | mouse-clicks or menu items--whatever suits your program.
324 | 
325 | You should also get your employer (if you work as a programmer) or your
326 | school, if any, to sign a "copyright disclaimer" for the program, if
327 | necessary.  Here is a sample; alter the names:
328 | 
329 |   Yoyodyne, Inc., hereby disclaims all copyright interest in the program
330 |   `Gnomovision' (which makes passes at compilers) written by James Hacker.
331 | 
332 |   {signature of Ty Coon}, 1 April 1989
333 |   Ty Coon, President of Vice
334 | 
335 | This General Public License does not permit incorporating your program into
336 | proprietary programs.  If your program is a subroutine library, you may
337 | consider it more useful to permit linking proprietary applications with the
338 | library.  If this is what you want to do, use the GNU Lesser General
339 | Public License instead of this License.
340 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | ## Description
  2 | 
  3 | Builds an edge map using canny edge detection.
  4 | 
  5 | This is [a port of the VapourSynth plugin TCanny](https://github.com/HomeOfVapourSynthEvolution/VapourSynth-TCanny).
  6 | 
  7 | ### Requirements:
  8 | 
  9 | - AviSynth 2.60 / AviSynth+ 3.4 or later
 10 | 
 11 | - Microsoft VisualC++ Redistributable Package 2022 (can be downloaded from [here](https://github.com/abbodi1406/vcredist/releases)) (Windows only)
 12 | 
 13 | ### Usage:
 14 | 
 15 | ```
 16 | vsTCanny (clip, float "sigmaY", float "sigmaU", float "sigmaV", float "sigma_vY", float "sigma_vU", float "sigma_vV", float "t_h", float "t_l", int "mode", int "op", float "scale", int "y", int "u", int "v", int "opt")
 17 | ```
 18 | 
 19 | ### Parameters:
 20 | 
 21 | - clip\
 22 |     A clip to process. All planar formats are supported.
 23 | 
 24 | - sigmaY, sigmaU, sigmaV\
 25 |     Standard deviation of horizontal gaussian blur.\
 26 |     Must be a non-negative value.\
 27 |     Setting to 0 disables gaussian blur.\
 28 |     Default:
 29 |     - sigmaY = 1.5
 30 |     - sigmaU = sigmaY / horizontal_chroma_subsampling_factor
 31 |     - sigmaV = sigmaU
 32 | 
 33 | - sigma_vY, sigma_vU, sigma_vV\
 34 |     Standard deviation of vertical gaussian blur.\
 35 |     Must be a non-negative value.\
 36 |     Setting to 0 disables gaussian blur.\
 37 |     Default:
 38 |     - sigma_vY = sigmaY
 39 |     - if sigma_vY is not defined: `sigma_vU = sigmaU` when the horizontal and vertical subsampling factors are equal, otherwise `sigma_vU = sigmaU * horizontal_chroma_subsampling_factor`
 40 |     - if sigma_vY is defined: `sigma_vU = sigma_vY / vertical_chroma_subsampling_factor`
 41 |     - sigma_vV = sigma_vU
 42 | 
 43 | - t_h\
 44 |     High gradient magnitude threshold for hysteresis.\
 45 |     Default: 8.0.
 46 | 
 47 | - t_l\
 48 |     Low gradient magnitude threshold for hysteresis.\
 49 |     Must be lower than t_h.\
 50 |     Default: 1.0.
 51 | 
 52 | - mode\
 53 |     Sets output format.\
 54 |     -1: Gaussian blur only.\
 55 |     0: Thresholded edge map (2^bitdepth-1 for edge, 0 for non-edge).\
 56 |     1: Gradient magnitude map.\
 57 |     Default: 0.
 58 | 
 59 | - op\
 60 |     Sets the operator for edge detection.\
 61 |     0: The operator used in tritical's original filter.\
 62 |     1: The Prewitt operator whose use is proposed by P. Zhou et al. [1]\
 63 |     2: The Sobel operator.\
 64 |     3: The Scharr operator.\
 65 |     4: The Kroon operator.\
 66 |     5: The Kirsch operator.\
 67 |     6: The FDoG operator.\
 68 |     Default: 1.
 69 | 
 70 | - scale\
 71 |     Multiplies the gradient by `scale`.\
 72 |     This can be used to increase or decrease the intensity of edges in the output.\
 73 |     Must be greater than 0.0.\
 74 |     Default: 1.0.
 75 | 
 76 | - y, u, v\
 77 |     Planes to process.\
 78 |     1: Return garbage.\
 79 |     2: Copy plane.\
 80 |     3: Process plane. Always process planes when the clip is RGB.\
 81 |     Default: y = u = v = 3.
 82 | 
 83 | - opt\
 84 |     Sets which cpu optimizations to use.\
 85 |     -1: Auto-detect.\
 86 |     0: Use C++ code.\
 87 |     1: Use SSE2 code.\
 88 |     2: Use AVX2 code.\
 89 |     3: Use AVX512 code.\
 90 |     Default: -1.
 91 | 
 92 | [1]: Zhou, P., Ye, W., & Wang, Q. (2011). An Improved Canny Algorithm for Edge Detection. Journal of Computational Information Systems, 7(5), 1516-1523.
 93 | 
 94 | ### Building:
 95 | 
 96 | - Windows\
 97 |     Use solution files.
 98 | 
 99 | - Linux
100 |     ```
101 |     Requirements:
102 |         - Git
103 |         - C++17 compiler
104 |         - CMake >= 3.16
105 |     ```
106 |     ```
107 |     git clone https://github.com/Asd-g/AviSynth-vsTCanny && \
108 |     cd AviSynth-vsTCanny && \
109 |     mkdir build && \
110 |     cd build && \
111 |     cmake .. && \
112 |     make -j$(nproc) && \
113 |     sudo make install
114 |     ```
115 | 


--------------------------------------------------------------------------------
/cmake_uninstall.cmake.in:
--------------------------------------------------------------------------------
 1 | # Uninstall script template: deletes every file recorded in the install manifest.
 2 | # The @...@ placeholders are presumably filled in by configure_file() -- confirm in CMakeLists.txt.
 3 | if(NOT EXISTS "@CMAKE_BINARY_DIR@/install_manifest.txt")
 4 |   message(FATAL_ERROR "Cannot find install manifest: @CMAKE_BINARY_DIR@/install_manifest.txt")
 5 | endif()
 6 | 
 7 | # Split the manifest on newlines so it can be iterated as a CMake list.
 8 | file(READ "@CMAKE_BINARY_DIR@/install_manifest.txt" files)
 9 | string(REGEX REPLACE "\n" ";" files "${files}")
10 | foreach(file ${files})
11 |   # Each path is prefixed with the DESTDIR environment variable (empty when unset).
12 |   message(STATUS "Uninstalling $ENV{DESTDIR}${file}")
13 |   if(IS_SYMLINK "$ENV{DESTDIR}${file}" OR EXISTS "$ENV{DESTDIR}${file}")
14 |     # execute_process replaces the deprecated exec_program command (policy CMP0153)
15 |     # and passes the path as a single argument, so paths with spaces are handled.
16 |     execute_process(
17 |       COMMAND "@CMAKE_COMMAND@" -E remove "$ENV{DESTDIR}${file}"
18 |       OUTPUT_VARIABLE rm_out
19 |       RESULT_VARIABLE rm_retval
20 |       )
21 |     # Abort the whole uninstall on the first file that cannot be removed.
22 |     if(NOT "${rm_retval}" STREQUAL 0)
23 |       message(FATAL_ERROR "Problem when removing $ENV{DESTDIR}${file}")
24 |     endif()
25 |   else()
26 |     message(STATUS "File $ENV{DESTDIR}${file} does not exist.")
27 |   endif()
28 | endforeach()
29 | 


--------------------------------------------------------------------------------
/msvc/vsTCanny.sln:
--------------------------------------------------------------------------------
 1 | ﻿
 2 | Microsoft Visual Studio Solution File, Format Version 12.00
 3 | # Visual Studio Version 16
 4 | VisualStudioVersion = 16.0.30503.244
 5 | MinimumVisualStudioVersion = 10.0.40219.1
 6 | Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "vsTCanny", "vsTCanny.vcxproj", "{A8044448-4796-42AD-8EFD-B42DE2639A78}"
 7 | EndProject
 8 | Global
 9 | 	GlobalSection(SolutionConfigurationPlatforms) = preSolution
10 | 		Debug|x64 = Debug|x64
11 | 		Debug|x86 = Debug|x86
12 | 		Release|x64 = Release|x64
13 | 		Release|x86 = Release|x86
14 | 	EndGlobalSection
15 | 	GlobalSection(ProjectConfigurationPlatforms) = postSolution
16 | 		{A8044448-4796-42AD-8EFD-B42DE2639A78}.Debug|x64.ActiveCfg = Debug|x64
17 | 		{A8044448-4796-42AD-8EFD-B42DE2639A78}.Debug|x64.Build.0 = Debug|x64
18 | 		{A8044448-4796-42AD-8EFD-B42DE2639A78}.Debug|x86.ActiveCfg = Debug|Win32
19 | 		{A8044448-4796-42AD-8EFD-B42DE2639A78}.Debug|x86.Build.0 = Debug|Win32
20 | 		{A8044448-4796-42AD-8EFD-B42DE2639A78}.Release|x64.ActiveCfg = Release|x64
21 | 		{A8044448-4796-42AD-8EFD-B42DE2639A78}.Release|x64.Build.0 = Release|x64
22 | 		{A8044448-4796-42AD-8EFD-B42DE2639A78}.Release|x86.ActiveCfg = Release|Win32
23 | 		{A8044448-4796-42AD-8EFD-B42DE2639A78}.Release|x86.Build.0 = Release|Win32
24 | 	EndGlobalSection
25 | 	GlobalSection(SolutionProperties) = preSolution
26 | 		HideSolutionNode = FALSE
27 | 	EndGlobalSection
28 | 	GlobalSection(ExtensibilityGlobals) = postSolution
29 | 		SolutionGuid = {C1C68C5C-D0F3-4014-8988-9AFE3E9C5787}
30 | 	EndGlobalSection
31 | EndGlobal
32 | 


--------------------------------------------------------------------------------
/msvc/vsTCanny.vcxproj:
--------------------------------------------------------------------------------
  1 | <?xml version="1.0" encoding="utf-8"?>
  2 | <Project DefaultTargets="Build" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
  3 |   <ItemGroup Label="ProjectConfigurations">
  4 |     <ProjectConfiguration Include="Debug|Win32">
  5 |       <Configuration>Debug</Configuration>
  6 |       <Platform>Win32</Platform>
  7 |     </ProjectConfiguration>
  8 |     <ProjectConfiguration Include="Release|Win32">
  9 |       <Configuration>Release</Configuration>
 10 |       <Platform>Win32</Platform>
 11 |     </ProjectConfiguration>
 12 |     <ProjectConfiguration Include="Debug|x64">
 13 |       <Configuration>Debug</Configuration>
 14 |       <Platform>x64</Platform>
 15 |     </ProjectConfiguration>
 16 |     <ProjectConfiguration Include="Release|x64">
 17 |       <Configuration>Release</Configuration>
 18 |       <Platform>x64</Platform>
 19 |     </ProjectConfiguration>
 20 |   </ItemGroup>
 21 |   <PropertyGroup Label="Globals">
 22 |     <VCProjectVersion>16.0</VCProjectVersion>
 23 |     <ProjectGuid>{A8044448-4796-42AD-8EFD-B42DE2639A78}</ProjectGuid>
 24 |     <Keyword>Win32Proj</Keyword>
 25 |     <WindowsTargetPlatformVersion>10.0</WindowsTargetPlatformVersion>
 26 |   </PropertyGroup>
 27 |   <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
 28 |   <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration">
 29 |     <ConfigurationType>Application</ConfigurationType>
 30 |     <UseDebugLibraries>true</UseDebugLibraries>
 31 |     <PlatformToolset>v142</PlatformToolset>
 32 |   </PropertyGroup>
 33 |   <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration">
 34 |     <ConfigurationType>DynamicLibrary</ConfigurationType>
 35 |     <UseDebugLibraries>false</UseDebugLibraries>
 36 |     <PlatformToolset>llvm</PlatformToolset>
 37 |   </PropertyGroup>
 38 |   <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
 39 |     <ConfigurationType>DynamicLibrary</ConfigurationType>
 40 |     <UseDebugLibraries>true</UseDebugLibraries>
 41 |     <PlatformToolset>v142</PlatformToolset>
 42 |   </PropertyGroup>
 43 |   <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
 44 |     <ConfigurationType>DynamicLibrary</ConfigurationType>
 45 |     <UseDebugLibraries>false</UseDebugLibraries>
 46 |     <PlatformToolset>llvm</PlatformToolset>
 47 |   </PropertyGroup>
 48 |   <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
 49 |   <ImportGroup Label="ExtensionSettings">
 50 |   </ImportGroup>
 51 |   <ImportGroup Label="Shared">
 52 |   </ImportGroup>
 53 |   <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
 54 |     <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
 55 |   </ImportGroup>
 56 |   <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
 57 |     <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
 58 |   </ImportGroup>
 59 |   <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
 60 |     <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
 61 |   </ImportGroup>
 62 |   <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
 63 |     <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
 64 |   </ImportGroup>
 65 |   <PropertyGroup Label="UserMacros" />
 66 |   <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
 67 |     <LinkIncremental>true</LinkIncremental>
 68 |   </PropertyGroup>
 69 |   <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
 70 |     <LinkIncremental>false</LinkIncremental>
 71 |     <IncludePath>..\..\AviSynthPlus\avs_core\include;$(IncludePath)</IncludePath>
 72 |   </PropertyGroup>
 73 |   <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
 74 |     <IncludePath>..\..\AviSynthPlus\avs_core\include;$(IncludePath)</IncludePath>
 75 |   </PropertyGroup>
 76 |   <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
 77 |     <IncludePath>..\..\AviSynthPlus\avs_core\include;$(IncludePath)</IncludePath>
 78 |     <LinkIncremental>false</LinkIncremental>
 79 |   </PropertyGroup>
 80 |   <PropertyGroup Label="LLVM" Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
 81 |     <ClangClAdditionalOptions>
 82 |     </ClangClAdditionalOptions>
 83 |     <UseLldLink>true</UseLldLink>
 84 |   </PropertyGroup>
 85 |   <PropertyGroup Label="LLVM" Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
 86 |     <UseLldLink>true</UseLldLink>
 87 |   </PropertyGroup>
 88 |   <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
 89 |     <ClCompile>
 90 |       <PreprocessorDefinitions>WIN32;_DEBUG;_WINDOWS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
 91 |       <RuntimeLibrary>MultiThreadedDebugDLL</RuntimeLibrary>
 92 |       <WarningLevel>Level3</WarningLevel>
 93 |       <DebugInformationFormat>ProgramDatabase</DebugInformationFormat>
 94 |       <Optimization>Disabled</Optimization>
 95 |     </ClCompile>
 96 |     <Link>
 97 |       <TargetMachine>MachineX86</TargetMachine>
 98 |       <GenerateDebugInformation>true</GenerateDebugInformation>
 99 |       <SubSystem>Windows</SubSystem>
100 |     </Link>
101 |   </ItemDefinitionGroup>
102 |   <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
103 |     <ClCompile>
104 |       <PreprocessorDefinitions>WIN32;NDEBUG;_WINDOWS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
105 |       <RuntimeLibrary>MultiThreadedDLL</RuntimeLibrary>
106 |       <WarningLevel>Level3</WarningLevel>
107 |       <DebugInformationFormat>ProgramDatabase</DebugInformationFormat>
108 |       <LanguageStandard>stdcpp17</LanguageStandard>
109 |       <MultiProcessorCompilation>true</MultiProcessorCompilation>
110 |       <InlineFunctionExpansion>AnySuitable</InlineFunctionExpansion>
111 |       <IntrinsicFunctions>true</IntrinsicFunctions>
112 |       <FavorSizeOrSpeed>Speed</FavorSizeOrSpeed>
113 |       <OmitFramePointers>true</OmitFramePointers>
114 |       <FunctionLevelLinking>true</FunctionLevelLinking>
115 |       <FloatingPointModel>Precise</FloatingPointModel>
116 |     </ClCompile>
117 |     <Link>
118 |       <TargetMachine>MachineX86</TargetMachine>
119 |       <GenerateDebugInformation>true</GenerateDebugInformation>
120 |       <SubSystem>Windows</SubSystem>
121 |       <OptimizeReferences>true</OptimizeReferences>
122 |     </Link>
123 |   </ItemDefinitionGroup>
124 |   <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
125 |     <ClCompile>
126 |       <LanguageStandard>stdcpp17</LanguageStandard>
127 |     </ClCompile>
128 |   </ItemDefinitionGroup>
129 |   <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
130 |     <ClCompile>
131 |       <LanguageStandard>stdcpp17</LanguageStandard>
132 |       <InlineFunctionExpansion>AnySuitable</InlineFunctionExpansion>
133 |       <IntrinsicFunctions>true</IntrinsicFunctions>
134 |       <FavorSizeOrSpeed>Speed</FavorSizeOrSpeed>
135 |       <FunctionLevelLinking>true</FunctionLevelLinking>
136 |       <MultiProcessorCompilation>true</MultiProcessorCompilation>
137 |       <FloatingPointModel>Precise</FloatingPointModel>
138 |       <InterproceduralOptimization>MultiFile</InterproceduralOptimization>
139 |       <OptimizeForWindowsApplication>true</OptimizeForWindowsApplication>
140 |     </ClCompile>
141 |     <Link>
142 |       <OptimizeReferences>true</OptimizeReferences>
143 |     </Link>
144 |     <Link>
145 |       <EnableCOMDATFolding>true</EnableCOMDATFolding>
146 |       <AdditionalDependencies>kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
147 |     </Link>
148 |   </ItemDefinitionGroup>
149 |   <ItemGroup>
150 |     <ClCompile Include="..\src\vsTCanny.cpp" />
151 |     <ClCompile Include="..\src\vsTCanny_AVX2.cpp">
152 |       <EnableEnhancedInstructionSet Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">AdvancedVectorExtensions2</EnableEnhancedInstructionSet>
153 |       <EnableEnhancedInstructionSet Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">AdvancedVectorExtensions2</EnableEnhancedInstructionSet>
154 |       <UseProcessorExtensions Condition="'$(Configuration)|$(Platform)'=='Release|x64'">AVX2</UseProcessorExtensions>
155 |       <EnableEnhancedInstructionSet Condition="'$(Configuration)|$(Platform)'=='Release|x64'">AdvancedVectorExtensions2</EnableEnhancedInstructionSet>
156 |     </ClCompile>
157 |     <ClCompile Include="..\src\vsTCanny_AVX512.cpp">
158 |       <EnableEnhancedInstructionSet Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">AdvancedVectorExtensions512</EnableEnhancedInstructionSet>
159 |       <EnableEnhancedInstructionSet Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">AdvancedVectorExtensions512</EnableEnhancedInstructionSet>
160 |       <UseProcessorExtensions Condition="'$(Configuration)|$(Platform)'=='Release|x64'">CORE512</UseProcessorExtensions>
161 |       <EnableEnhancedInstructionSet Condition="'$(Configuration)|$(Platform)'=='Release|x64'">AdvancedVectorExtensions512</EnableEnhancedInstructionSet>
162 |     </ClCompile>
163 |     <ClCompile Include="..\src\vsTCanny_SSE2.cpp" />
164 |   </ItemGroup>
165 |   <ItemGroup>
166 |     <ClInclude Include="..\src\vsTCanny.h" />
167 |   </ItemGroup>
168 |   <ItemGroup>
169 |     <ResourceCompile Include="..\src\vsTCanny.rc" />
170 |   </ItemGroup>
171 |   <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
172 |   <ImportGroup Label="ExtensionTargets">
173 |   </ImportGroup>
174 | </Project>


--------------------------------------------------------------------------------
/msvc/vsTCanny.vcxproj.filters:
--------------------------------------------------------------------------------
 1 | ﻿<?xml version="1.0" encoding="utf-8"?>
 2 | <Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
 3 |   <ItemGroup>
 4 |     <Filter Include="Source Files">
 5 |       <UniqueIdentifier>{4FC737F1-C7A5-4376-A066-2A32D752A2FF}</UniqueIdentifier>
 6 |       <Extensions>cpp;c;cc;cxx;def;odl;idl;hpj;bat;asm;asmx</Extensions>
 7 |     </Filter>
 8 |     <Filter Include="Header Files">
 9 |       <UniqueIdentifier>{93995380-89BD-4b04-88EB-625FBE52EBFB}</UniqueIdentifier>
10 |       <Extensions>h;hh;hpp;hxx;hm;inl;inc;xsd</Extensions>
11 |     </Filter>
12 |     <Filter Include="Resource Files">
13 |       <UniqueIdentifier>{67DA6AB6-F800-4c08-8B7A-83BB121AAD01}</UniqueIdentifier>
14 |       <Extensions>rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav</Extensions>
15 |     </Filter>
16 |   </ItemGroup>
17 |   <ItemGroup>
18 |     <ClCompile Include="..\src\vsTCanny.cpp">
19 |       <Filter>Source Files</Filter>
20 |     </ClCompile>
21 |     <ClCompile Include="..\src\vsTCanny_SSE2.cpp">
22 |       <Filter>Source Files</Filter>
23 |     </ClCompile>
24 |     <ClCompile Include="..\src\vsTCanny_AVX2.cpp">
25 |       <Filter>Source Files</Filter>
26 |     </ClCompile>
27 |     <ClCompile Include="..\src\vsTCanny_AVX512.cpp">
28 |       <Filter>Source Files</Filter>
29 |     </ClCompile>
30 |   </ItemGroup>
31 |   <ItemGroup>
32 |     <ClInclude Include="..\src\vsTCanny.h">
33 |       <Filter>Header Files</Filter>
34 |     </ClInclude>
35 |   </ItemGroup>
36 |   <ItemGroup>
37 |     <ResourceCompile Include="..\src\vsTCanny.rc">
38 |       <Filter>Resource Files</Filter>
39 |     </ResourceCompile>
40 |   </ItemGroup>
41 | </Project>


--------------------------------------------------------------------------------
/src/VCL2/LICENSE:
--------------------------------------------------------------------------------
  1 |                                  Apache License
  2 |                            Version 2.0, January 2004
  3 |                         http://www.apache.org/licenses/
  4 | 
  5 |    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
  6 | 
  7 |    1. Definitions.
  8 | 
  9 |       "License" shall mean the terms and conditions for use, reproduction,
 10 |       and distribution as defined by Sections 1 through 9 of this document.
 11 | 
 12 |       "Licensor" shall mean the copyright owner or entity authorized by
 13 |       the copyright owner that is granting the License.
 14 | 
 15 |       "Legal Entity" shall mean the union of the acting entity and all
 16 |       other entities that control, are controlled by, or are under common
 17 |       control with that entity. For the purposes of this definition,
 18 |       "control" means (i) the power, direct or indirect, to cause the
 19 |       direction or management of such entity, whether by contract or
 20 |       otherwise, or (ii) ownership of fifty percent (50%) or more of the
 21 |       outstanding shares, or (iii) beneficial ownership of such entity.
 22 | 
 23 |       "You" (or "Your") shall mean an individual or Legal Entity
 24 |       exercising permissions granted by this License.
 25 | 
 26 |       "Source" form shall mean the preferred form for making modifications,
 27 |       including but not limited to software source code, documentation
 28 |       source, and configuration files.
 29 | 
 30 |       "Object" form shall mean any form resulting from mechanical
 31 |       transformation or translation of a Source form, including but
 32 |       not limited to compiled object code, generated documentation,
 33 |       and conversions to other media types.
 34 | 
 35 |       "Work" shall mean the work of authorship, whether in Source or
 36 |       Object form, made available under the License, as indicated by a
 37 |       copyright notice that is included in or attached to the work
 38 |       (an example is provided in the Appendix below).
 39 | 
 40 |       "Derivative Works" shall mean any work, whether in Source or Object
 41 |       form, that is based on (or derived from) the Work and for which the
 42 |       editorial revisions, annotations, elaborations, or other modifications
 43 |       represent, as a whole, an original work of authorship. For the purposes
 44 |       of this License, Derivative Works shall not include works that remain
 45 |       separable from, or merely link (or bind by name) to the interfaces of,
 46 |       the Work and Derivative Works thereof.
 47 | 
 48 |       "Contribution" shall mean any work of authorship, including
 49 |       the original version of the Work and any modifications or additions
 50 |       to that Work or Derivative Works thereof, that is intentionally
 51 |       submitted to Licensor for inclusion in the Work by the copyright owner
 52 |       or by an individual or Legal Entity authorized to submit on behalf of
 53 |       the copyright owner. For the purposes of this definition, "submitted"
 54 |       means any form of electronic, verbal, or written communication sent
 55 |       to the Licensor or its representatives, including but not limited to
 56 |       communication on electronic mailing lists, source code control systems,
 57 |       and issue tracking systems that are managed by, or on behalf of, the
 58 |       Licensor for the purpose of discussing and improving the Work, but
 59 |       excluding communication that is conspicuously marked or otherwise
 60 |       designated in writing by the copyright owner as "Not a Contribution."
 61 | 
 62 |       "Contributor" shall mean Licensor and any individual or Legal Entity
 63 |       on behalf of whom a Contribution has been received by Licensor and
 64 |       subsequently incorporated within the Work.
 65 | 
 66 |    2. Grant of Copyright License. Subject to the terms and conditions of
 67 |       this License, each Contributor hereby grants to You a perpetual,
 68 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 69 |       copyright license to reproduce, prepare Derivative Works of,
 70 |       publicly display, publicly perform, sublicense, and distribute the
 71 |       Work and such Derivative Works in Source or Object form.
 72 | 
 73 |    3. Grant of Patent License. Subject to the terms and conditions of
 74 |       this License, each Contributor hereby grants to You a perpetual,
 75 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 76 |       (except as stated in this section) patent license to make, have made,
 77 |       use, offer to sell, sell, import, and otherwise transfer the Work,
 78 |       where such license applies only to those patent claims licensable
 79 |       by such Contributor that are necessarily infringed by their
 80 |       Contribution(s) alone or by combination of their Contribution(s)
 81 |       with the Work to which such Contribution(s) was submitted. If You
 82 |       institute patent litigation against any entity (including a
 83 |       cross-claim or counterclaim in a lawsuit) alleging that the Work
 84 |       or a Contribution incorporated within the Work constitutes direct
 85 |       or contributory patent infringement, then any patent licenses
 86 |       granted to You under this License for that Work shall terminate
 87 |       as of the date such litigation is filed.
 88 | 
 89 |    4. Redistribution. You may reproduce and distribute copies of the
 90 |       Work or Derivative Works thereof in any medium, with or without
 91 |       modifications, and in Source or Object form, provided that You
 92 |       meet the following conditions:
 93 | 
 94 |       (a) You must give any other recipients of the Work or
 95 |           Derivative Works a copy of this License; and
 96 | 
 97 |       (b) You must cause any modified files to carry prominent notices
 98 |           stating that You changed the files; and
 99 | 
100 |       (c) You must retain, in the Source form of any Derivative Works
101 |           that You distribute, all copyright, patent, trademark, and
102 |           attribution notices from the Source form of the Work,
103 |           excluding those notices that do not pertain to any part of
104 |           the Derivative Works; and
105 | 
106 |       (d) If the Work includes a "NOTICE" text file as part of its
107 |           distribution, then any Derivative Works that You distribute must
108 |           include a readable copy of the attribution notices contained
109 |           within such NOTICE file, excluding those notices that do not
110 |           pertain to any part of the Derivative Works, in at least one
111 |           of the following places: within a NOTICE text file distributed
112 |           as part of the Derivative Works; within the Source form or
113 |           documentation, if provided along with the Derivative Works; or,
114 |           within a display generated by the Derivative Works, if and
115 |           wherever such third-party notices normally appear. The contents
116 |           of the NOTICE file are for informational purposes only and
117 |           do not modify the License. You may add Your own attribution
118 |           notices within Derivative Works that You distribute, alongside
119 |           or as an addendum to the NOTICE text from the Work, provided
120 |           that such additional attribution notices cannot be construed
121 |           as modifying the License.
122 | 
123 |       You may add Your own copyright statement to Your modifications and
124 |       may provide additional or different license terms and conditions
125 |       for use, reproduction, or distribution of Your modifications, or
126 |       for any such Derivative Works as a whole, provided Your use,
127 |       reproduction, and distribution of the Work otherwise complies with
128 |       the conditions stated in this License.
129 | 
130 |    5. Submission of Contributions. Unless You explicitly state otherwise,
131 |       any Contribution intentionally submitted for inclusion in the Work
132 |       by You to the Licensor shall be under the terms and conditions of
133 |       this License, without any additional terms or conditions.
134 |       Notwithstanding the above, nothing herein shall supersede or modify
135 |       the terms of any separate license agreement you may have executed
136 |       with Licensor regarding such Contributions.
137 | 
138 |    6. Trademarks. This License does not grant permission to use the trade
139 |       names, trademarks, service marks, or product names of the Licensor,
140 |       except as required for reasonable and customary use in describing the
141 |       origin of the Work and reproducing the content of the NOTICE file.
142 | 
143 |    7. Disclaimer of Warranty. Unless required by applicable law or
144 |       agreed to in writing, Licensor provides the Work (and each
145 |       Contributor provides its Contributions) on an "AS IS" BASIS,
146 |       WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 |       implied, including, without limitation, any warranties or conditions
148 |       of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 |       PARTICULAR PURPOSE. You are solely responsible for determining the
150 |       appropriateness of using or redistributing the Work and assume any
151 |       risks associated with Your exercise of permissions under this License.
152 | 
153 |    8. Limitation of Liability. In no event and under no legal theory,
154 |       whether in tort (including negligence), contract, or otherwise,
155 |       unless required by applicable law (such as deliberate and grossly
156 |       negligent acts) or agreed to in writing, shall any Contributor be
157 |       liable to You for damages, including any direct, indirect, special,
158 |       incidental, or consequential damages of any character arising as a
159 |       result of this License or out of the use or inability to use the
160 |       Work (including but not limited to damages for loss of goodwill,
161 |       work stoppage, computer failure or malfunction, or any and all
162 |       other commercial damages or losses), even if such Contributor
163 |       has been advised of the possibility of such damages.
164 | 
165 |    9. Accepting Warranty or Additional Liability. While redistributing
166 |       the Work or Derivative Works thereof, You may choose to offer,
167 |       and charge a fee for, acceptance of support, warranty, indemnity,
168 |       or other liability obligations and/or rights consistent with this
169 |       License. However, in accepting such obligations, You may act only
170 |       on Your own behalf and on Your sole responsibility, not on behalf
171 |       of any other Contributor, and only if You agree to indemnify,
172 |       defend, and hold each Contributor harmless for any liability
173 |       incurred by, or claims asserted against, such Contributor by reason
174 |       of your accepting any such warranty or additional liability.
175 | 
176 |    END OF TERMS AND CONDITIONS
177 | 
178 |   
179 |    Copyright 2012-2019 Agner Fog.
180 | 
181 |    Licensed under the Apache License, Version 2.0 (the "License");
182 |    you may not use this file except in compliance with the License.
183 |    You may obtain a copy of the License at
184 | 
185 |        http://www.apache.org/licenses/LICENSE-2.0
186 | 
187 |    Unless required by applicable law or agreed to in writing, software
188 |    distributed under the License is distributed on an "AS IS" BASIS,
189 |    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
190 |    See the License for the specific language governing permissions and
191 |    limitations under the License.
192 | 


--------------------------------------------------------------------------------
/src/VCL2/instrset_detect.cpp:
--------------------------------------------------------------------------------
  1 | /**************************  instrset_detect.cpp   ****************************
  2 | * Author:        Agner Fog
  3 | * Date created:  2012-05-30
  4 | * Last modified: 2019-08-01
  5 | * Version:       2.00.00
  6 | * Project:       vector class library
  7 | * Description:
  8 | * Functions for checking which instruction sets are supported.
  9 | *
 10 | * (c) Copyright 2012-2019 Agner Fog.
 11 | * Apache License version 2.0 or later.
 12 | ******************************************************************************/
 13 | 
 14 | #include "instrset.h"
 15 | 
 16 | #ifdef VCL_NAMESPACE
 17 | namespace VCL_NAMESPACE {
 18 | #endif
 19 | 
 20 | 
 21 | // Read extended control register number ctr with the XGETBV instruction; returns its 64 bit value
 22 | static inline uint64_t xgetbv (int ctr) {
 23 | #if (defined (_MSC_FULL_VER) && _MSC_FULL_VER >= 160040000) || (defined (__INTEL_COMPILER) && __INTEL_COMPILER >= 1200)
 24 |     // Microsoft or Intel compiler supporting _xgetbv intrinsic
 25 | 
 26 |     return uint64_t(_xgetbv(ctr));                    // intrinsic function for XGETBV
 27 | 
 28 | #elif defined(__GNUC__) ||  defined (__clang__)       // use inline assembly, Gnu/AT&T syntax
 29 |    // ctr is passed in ecx; the result comes back with the low half in eax and the high half in edx
 30 |    uint32_t a, d;
 31 |    __asm("xgetbv" : "=a"(a),"=d"(d) : "c"(ctr) : );
 32 |    return a | (uint64_t(d) << 32);
 33 | 
 34 | #else  // #elif defined (_WIN32)                      // other compiler. try inline assembly with masm/intel/MS syntax
 35 |    uint32_t a, d;
 36 |     __asm {
 37 |         mov ecx, ctr
 38 |         _emit 0x0f
 39 |         _emit 0x01
 40 |         _emit 0xd0 ; // xgetbv
 41 |         mov a, eax
 42 |         mov d, edx
 43 |     }
 44 |    return a | (uint64_t(d) << 32);
 45 | 
 46 | #endif
 47 | }
 48 | 
 49 | /* find supported instruction set (detection runs once; the result is cached thread-safely)
 50 |     return value:
 51 |     0           = 80386 instruction set
 52 |     1  or above = SSE (XMM) supported by CPU (not testing for OS support)
 53 |     2  or above = SSE2
 54 |     3  or above = SSE3
 55 |     4  or above = Supplementary SSE3 (SSSE3)
 56 |     5  or above = SSE4.1
 57 |     6  or above = SSE4.2
 58 |     7  or above = AVX supported by CPU and operating system
 59 |     8  or above = AVX2
 60 |     9  or above = AVX512F
 61 |    10  or above = AVX512VL, AVX512BW, AVX512DQ
 62 | */
 63 | int instrset_detect(void) {
 64 |     // The level is computed in a local variable and published once through a static with
 65 |     // thread-safe initialization, so concurrent callers cannot read a half-finished value
 66 |     static const int cached = []() -> int {
 67 |         int iset = 0;                                      // default value
 68 |         int abcd[4] = {0,0,0,0};                           // cpuid results
 69 |         cpuid(abcd, 0);                                    // call cpuid function 0
 70 |         if (abcd[0] == 0) return iset;                     // no further cpuid function supported
 71 |         cpuid(abcd, 1);                                    // call cpuid function 1 for feature flags
 72 |         if ((abcd[3] & (1 <<  0)) == 0) return iset;       // no floating point
 73 |         if ((abcd[3] & (1 << 23)) == 0) return iset;       // no MMX
 74 |         if ((abcd[3] & (1 << 15)) == 0) return iset;       // no conditional move
 75 |         if ((abcd[3] & (1 << 24)) == 0) return iset;       // no FXSAVE
 76 |         if ((abcd[3] & (1 << 25)) == 0) return iset;       // no SSE
 77 |         iset = 1;                                          // 1: SSE supported
 78 |         if ((abcd[3] & (1 << 26)) == 0) return iset;       // no SSE2
 79 |         iset = 2;                                          // 2: SSE2 supported
 80 |         if ((abcd[2] & (1 <<  0)) == 0) return iset;       // no SSE3
 81 |         iset = 3;                                          // 3: SSE3 supported
 82 |         if ((abcd[2] & (1 <<  9)) == 0) return iset;       // no SSSE3
 83 |         iset = 4;                                          // 4: SSSE3 supported
 84 |         if ((abcd[2] & (1 << 19)) == 0) return iset;       // no SSE4.1
 85 |         iset = 5;                                          // 5: SSE4.1 supported
 86 |         if ((abcd[2] & (1 << 23)) == 0) return iset;       // no POPCNT
 87 |         if ((abcd[2] & (1 << 20)) == 0) return iset;       // no SSE4.2
 88 |         iset = 6;                                          // 6: SSE4.2 supported
 89 |         if ((abcd[2] & (1 << 27)) == 0) return iset;       // no OSXSAVE
 90 |         if ((xgetbv(0) & 6) != 6)       return iset;       // AVX not enabled in O.S.
 91 |         if ((abcd[2] & (1 << 28)) == 0) return iset;       // no AVX
 92 |         iset = 7;                                          // 7: AVX supported
 93 |         cpuid(abcd, 7);                                    // call cpuid leaf 7 for feature flags
 94 |         if ((abcd[1] & (1 <<  5)) == 0) return iset;       // no AVX2
 95 |         iset = 8;                                          // 8: AVX2 supported
 96 |         if ((abcd[1] & (1 << 16)) == 0) return iset;       // no AVX512
 97 |         cpuid(abcd, 0xD);                                  // call cpuid leaf 0xD for feature flags
 98 |         if ((abcd[0] & 0x60) != 0x60)   return iset;       // no AVX512
 99 |         iset = 9;                                          // 9: AVX512F supported
100 |         cpuid(abcd, 7);                                    // call cpuid leaf 7 for feature flags
101 |         if ((abcd[1] & (1u << 31)) == 0) return iset;      // no AVX512VL (unsigned mask avoids shifting into the sign bit)
102 |         if ((abcd[1] & 0x40020000) != 0x40020000) return iset; // no AVX512BW, AVX512DQ
103 |         iset = 10;                                         // 10: AVX512VL, AVX512BW, AVX512DQ supported
104 |         return iset;
105 |     }();
106 |     return cached;                                         // same value on every later call
107 | }
108 | 
109 | // Tell whether the FMA3 instruction set can be used.
110 | // AVX support is a prerequisite; the flag itself is cpuid leaf 1, ecx bit 12
111 | bool hasFMA3(void) {
112 |     int regs[4] = {0, 0, 0, 0};                            // eax, ebx, ecx, edx; stays zero if not queried
113 |     if (instrset_detect() >= 7) cpuid(regs, 1);            // query only when AVX is available
114 |     return ((regs[2] >> 12) & 1) != 0;
115 | }
116 | 
117 | // Tell whether the FMA4 instruction set can be used.
118 | // AVX support is a prerequisite; the flag itself is cpuid leaf 0x80000001, ecx bit 16
119 | bool hasFMA4(void) {
120 |     int regs[4] = {0, 0, 0, 0};                            // eax, ebx, ecx, edx; stays zero if not queried
121 |     if (instrset_detect() >= 7) cpuid(regs, 0x80000001);   // query only when AVX is available
122 |     return ((regs[2] >> 16) & 1) != 0;
123 | }
124 | 
125 | // Tell whether the XOP instruction set can be used.
126 | // AVX support is a prerequisite; the flag itself is cpuid leaf 0x80000001, ecx bit 11
127 | bool hasXOP(void) {
128 |     int regs[4] = {0, 0, 0, 0};                            // eax, ebx, ecx, edx; stays zero if not queried
129 |     if (instrset_detect() >= 7) cpuid(regs, 0x80000001);   // query only when AVX is available
130 |     return ((regs[2] >> 11) & 1) != 0;
131 | }
132 | 
133 | // Tell whether the F16C instruction set can be used.
134 | // AVX support is a prerequisite; the flag itself is cpuid leaf 1, ecx bit 29
135 | bool hasF16C(void) {
136 |     int regs[4] = {0, 0, 0, 0};                            // eax, ebx, ecx, edx; stays zero if not queried
137 |     if (instrset_detect() >= 7) cpuid(regs, 1);            // query only when AVX is available
138 |     return ((regs[2] >> 29) & 1) != 0;
139 | }
140 | 
141 | // Tell whether the AVX512ER instruction set can be used.
142 | // AVX512F support is a prerequisite; the flag itself is cpuid leaf 7, ebx bit 27
143 | bool hasAVX512ER(void) {
144 |     int regs[4] = {0, 0, 0, 0};                            // eax, ebx, ecx, edx; stays zero if not queried
145 |     if (instrset_detect() >= 9) cpuid(regs, 7);            // query only when AVX512F is available
146 |     return ((regs[1] >> 27) & 1) != 0;
147 | }
148 | 
149 | // Tell whether the AVX512VBMI instruction set can be used.
150 | // AVX512BW support is a prerequisite; the flag itself is cpuid leaf 7, ecx bit 1
151 | bool hasAVX512VBMI(void) {
152 |     int regs[4] = {0, 0, 0, 0};                            // eax, ebx, ecx, edx; stays zero if not queried
153 |     if (instrset_detect() >= 10) cpuid(regs, 7);           // query only when AVX512BW is available
154 |     return ((regs[2] >> 1) & 1) != 0;
155 | }
156 | 
157 | // Tell whether the AVX512VBMI2 instruction set can be used.
158 | // AVX512BW support is a prerequisite; the flag itself is cpuid leaf 7, ecx bit 6
159 | bool hasAVX512VBMI2(void) {
160 |     int regs[4] = {0, 0, 0, 0};                            // eax, ebx, ecx, edx; stays zero if not queried
161 |     if (instrset_detect() >= 10) cpuid(regs, 7);           // query only when AVX512BW is available
162 |     return ((regs[2] >> 6) & 1) != 0;
163 | }
164 | 
165 | #ifdef VCL_NAMESPACE
166 | }
167 | #endif
168 | 


--------------------------------------------------------------------------------
/src/VCL2/vector_convert.h:
--------------------------------------------------------------------------------
  1 | /**************************  vector_convert.h   *******************************
  2 | * Author:        Agner Fog
  3 | * Date created:  2014-07-23
  4 | * Last modified: 2019-11-17
  5 | * Version:       2.01.00
  6 | * Project:       vector class library
  7 | * Description:
  8 | * Header file for conversion between different vector classes with different
  9 | * sizes. Also includes various generic template functions.
 10 | *
 11 | * (c) Copyright 2012-2019 Agner Fog.
 12 | * Apache License version 2.0 or later.
 13 | *****************************************************************************/
 14 | 
 15 | #ifndef VECTOR_CONVERT_H
 16 | #define VECTOR_CONVERT_H
 17 | 
 18 | #ifndef VECTORCLASS_H
 19 | #include "vectorclass.h"
 20 | #endif
 21 | 
 22 | #if VECTORCLASS_H < 20100
 23 | #error Incompatible versions of vector class library mixed
 24 | #endif
 25 | 
 26 | #ifdef VCL_NAMESPACE
 27 | namespace VCL_NAMESPACE {
 28 | #endif
 29 | 
 30 | #if MAX_VECTOR_SIZE >= 256
 31 | 
 32 | /*****************************************************************************
 33 | *
 34 | *          Extend from 128 to 256 bit vectors
 35 | *
 36 | *****************************************************************************/
 37 | 
 38 | #if INSTRSET >= 8  // AVX2. 256 bit integer vectors
 39 | 
 40 | // sign extend 16 elements from 8 to 16 bits each
 41 | static inline Vec16s extend (Vec16c const a) {
 42 |     return _mm256_cvtepi8_epi16(a);
 43 | }
 44 | 
 45 | // zero extend 16 elements from 8 to 16 bits each
 46 | static inline Vec16us extend (Vec16uc const a) {
 47 |     return _mm256_cvtepu8_epi16(a);
 48 | }
 49 | 
 50 | // sign extend 8 elements from 16 to 32 bits each
 51 | static inline Vec8i extend (Vec8s const a) {
 52 |     return _mm256_cvtepi16_epi32(a);
 53 | }
 54 | 
 55 | // zero extend 8 elements from 16 to 32 bits each
 56 | static inline Vec8ui extend (Vec8us const a) {
 57 |     return _mm256_cvtepu16_epi32(a);
 58 | }
 59 | 
 60 | // sign extend 4 elements from 32 to 64 bits each
 61 | static inline Vec4q extend (Vec4i const a) {
 62 |     return _mm256_cvtepi32_epi64(a);
 63 | }
 64 | 
 65 | // zero extend 4 elements from 32 to 64 bits each
 66 | static inline Vec4uq extend (Vec4ui const a) {
 67 |     return _mm256_cvtepu32_epi64(a);
 68 | }
 69 | 
 70 | 
 71 | #else  // no AVX2. 256 bit integer vectors are emulated
 72 | 
 73 | // sign extend and zero extend functions, assembled from the halves given by extend_low and extend_high:
 74 | static inline Vec16s extend (Vec16c const a) {           // sign extend
 75 |     return Vec16s(extend_low(a), extend_high(a));
 76 | }
 77 | 
 78 | static inline Vec16us extend (Vec16uc const a) {         // zero extend
 79 |     return Vec16us(extend_low(a), extend_high(a));
 80 | }
 81 | 
 82 | static inline Vec8i extend (Vec8s const a) {             // sign extend
 83 |     return Vec8i(extend_low(a), extend_high(a));
 84 | }
 85 | 
 86 | static inline Vec8ui extend (Vec8us const a) {           // zero extend
 87 |     return Vec8ui(extend_low(a), extend_high(a));
 88 | }
 89 | 
 90 | static inline Vec4q extend (Vec4i const a) {             // sign extend
 91 |     return Vec4q(extend_low(a), extend_high(a));
 92 | }
 93 | 
 94 | static inline Vec4uq extend (Vec4ui const a) {           // zero extend
 95 |     return Vec4uq(extend_low(a), extend_high(a));
 96 | }
 97 | 
 98 | #endif  // AVX2
 99 | 
100 | /*****************************************************************************
101 | *
102 | *          Conversions between float and double
103 | *
104 | *****************************************************************************/
105 | #if INSTRSET >= 7  // AVX. 256 bit float vectors
106 | 
107 | // float to double: 4 floats become 4 doubles
108 | static inline Vec4d to_double (Vec4f const a) {
109 |     return _mm256_cvtps_pd(a);
110 | }
111 | 
112 | // double to float: 4 doubles become 4 floats
113 | static inline Vec4f to_float (Vec4d const a) {
114 |     return _mm256_cvtpd_ps(a);
115 | }
116 | 
117 | #else  // no AVX. 256 bit float vectors are emulated
118 | 
119 | // float to double, done as two 128 bit conversions
120 | static inline Vec4d to_double (Vec4f const a) {
121 |     Vec2d lo = _mm_cvtps_pd(a);                        // converts elements 0 and 1
122 |     Vec2d hi = _mm_cvtps_pd(_mm_movehl_ps(a, a));      // elements 2 and 3 are moved down first
123 |     return Vec4d(lo,hi);
124 | }
125 | 
126 | // double to float, done as two 128 bit conversions
127 | static inline Vec4f to_float (Vec4d const a) {
128 |     Vec4f lo = _mm_cvtpd_ps(a.get_low());
129 |     Vec4f hi = _mm_cvtpd_ps(a.get_high());
130 |     return _mm_movelh_ps(lo, hi);                      // join the lower halves of lo and hi
131 | }
132 | 
133 | #endif  // AVX
134 | 
135 | /*****************************************************************************
136 | *
137 | *          Reduce from 256 to 128 bit vectors
138 | *
139 | *****************************************************************************/
140 | #if INSTRSET >= 10  // AVX512VL
141 | 
142 | // compress functions: every element is narrowed to half its width. overflow wraps around
143 | static inline Vec16c compress (Vec16s const a) {         // 16 -> 8 bits
144 |     return _mm256_cvtepi16_epi8(a);
145 | }
146 | 
147 | static inline Vec16uc compress (Vec16us const a) {       // 16 -> 8 bits
148 |     return _mm256_cvtepi16_epi8(a);
149 | }
150 | 
151 | static inline Vec8s compress (Vec8i const a) {           // 32 -> 16 bits
152 |     return _mm256_cvtepi32_epi16(a);
153 | }
154 | 
155 | static inline Vec8us compress (Vec8ui const a) {         // 32 -> 16 bits
156 |     return _mm256_cvtepi32_epi16(a);
157 | }
158 | 
159 | static inline Vec4i compress (Vec4q const a) {           // 64 -> 32 bits
160 |     return _mm256_cvtepi64_epi32(a);
161 | }
162 | 
163 | static inline Vec4ui compress (Vec4uq const a) {         // 64 -> 32 bits
164 |     return _mm256_cvtepi64_epi32(a);
165 | }
166 | 
167 | #else  // no AVX512VL
168 | 
169 | // compress functions, forwarding both halves to the two-argument compress. overflow wraps around
170 | static inline Vec16c compress (Vec16s const a) {
171 |     return compress(a.get_low(), a.get_high());
172 | }
173 | 
174 | static inline Vec16uc compress (Vec16us const a) {
175 |     return compress(a.get_low(), a.get_high());
176 | }
177 | 
178 | static inline Vec8s compress (Vec8i const a) {
179 |     return compress(a.get_low(), a.get_high());
180 | }
181 | 
182 | static inline Vec8us compress (Vec8ui const a) {
183 |     return compress(a.get_low(), a.get_high());
184 | }
185 | 
186 | static inline Vec4i compress (Vec4q const a) {
187 |     return compress(a.get_low(), a.get_high());
188 | }
189 | 
190 | static inline Vec4ui compress (Vec4uq const a) {
191 |     return compress(a.get_low(), a.get_high());
192 | }
193 | 
194 | #endif  // AVX512VL
195 | 
196 | #endif // MAX_VECTOR_SIZE >= 256
197 | 
198 | 
199 | #if MAX_VECTOR_SIZE >= 512
200 | 
201 | /*****************************************************************************
202 | *
203 | *          Extend from 256 to 512 bit vectors
204 | *
205 | *****************************************************************************/
206 | 
207 | #if INSTRSET >= 9  // AVX512. 512 bit integer vectors
208 | 
209 | // sign extend 32 elements from 8 to 16 bits each
210 | static inline Vec32s extend (Vec32c const a) {
211 | #if INSTRSET >= 10
212 |     return _mm512_cvtepi8_epi16(a);
213 | #else
214 |     return Vec32s(extend_low(a), extend_high(a));        // fallback: extend each half separately
215 | #endif
216 | }
217 | 
218 | // zero extend 32 elements from 8 to 16 bits each
219 | static inline Vec32us extend (Vec32uc const a) {
220 | #if INSTRSET >= 10
221 |     return _mm512_cvtepu8_epi16(a);
222 | #else
223 |     return Vec32us(extend_low(a), extend_high(a));       // fallback: extend each half separately
224 | #endif
225 | }
226 | 
227 | // sign extend 16 elements from 16 to 32 bits each
228 | static inline Vec16i extend (Vec16s const a) {
229 |     return _mm512_cvtepi16_epi32(a);
230 | }
231 | 
232 | // zero extend 16 elements from 16 to 32 bits each
233 | static inline Vec16ui extend (Vec16us const a) {
234 |     return _mm512_cvtepu16_epi32(a);
235 | }
236 | 
237 | // sign extend 8 elements from 32 to 64 bits each
238 | static inline Vec8q extend (Vec8i const a) {
239 |     return _mm512_cvtepi32_epi64(a);
240 | }
241 | 
242 | // zero extend 8 elements from 32 to 64 bits each
243 | static inline Vec8uq extend (Vec8ui const a) {
244 |     return _mm512_cvtepu32_epi64(a);
245 | }
246 | 
247 | #else  // no AVX512. 512 bit vectors are emulated
248 | 
249 | // every function below assembles its result from the halves given by extend_low and extend_high
250 | 
251 | // sign extend
252 | static inline Vec32s extend (Vec32c const a) {
253 |     return Vec32s(extend_low(a), extend_high(a));
254 | }
255 | 
256 | // zero extend
257 | static inline Vec32us extend (Vec32uc const a) {
258 |     return Vec32us(extend_low(a), extend_high(a));
259 | }
260 | 
261 | // sign extend
262 | static inline Vec16i extend (Vec16s const a) {
263 |     return Vec16i(extend_low(a), extend_high(a));
264 | }
265 | 
266 | // zero extend
267 | static inline Vec16ui extend (Vec16us const a) {
268 |     return Vec16ui(extend_low(a), extend_high(a));
269 | }
270 | 
271 | // sign extend
272 | static inline Vec8q extend (Vec8i const a) {
273 |     return Vec8q(extend_low(a), extend_high(a));
274 | }
275 | 
276 | // zero extend
277 | static inline Vec8uq extend (Vec8ui const a) {
278 |     return Vec8uq(extend_low(a), extend_high(a));
279 | }
280 | 
281 | #endif  // AVX512
282 | 
283 | 
284 | /*****************************************************************************
285 | *
286 | *          Reduce from 512 to 256 bit vectors
287 | *
288 | *****************************************************************************/
289 | #if INSTRSET >= 9  // AVX512F
290 | 
291 | // compress functions: every element is narrowed to half its width. overflow wraps around
292 | static inline Vec32c compress (Vec32s const a) {
293 | #if INSTRSET >= 10  // AVX512BW
294 |     return _mm512_cvtepi16_epi8(a);
295 | #else
296 |     return compress(a.get_low(), a.get_high());          // fallback: two-argument compress on the halves
297 | #endif
298 | }
299 | 
300 | static inline Vec32uc compress (Vec32us const a) {
301 |     return Vec32uc(compress(Vec32s(a)));                 // delegates to the signed overload above
302 | }
303 | 
304 | static inline Vec16s compress (Vec16i const a) {         // 32 -> 16 bits
305 |     return _mm512_cvtepi32_epi16(a);
306 | }
307 | 
308 | static inline Vec16us compress (Vec16ui const a) {       // 32 -> 16 bits
309 |     return _mm512_cvtepi32_epi16(a);
310 | }
311 | 
312 | static inline Vec8i compress (Vec8q const a) {           // 64 -> 32 bits
313 |     return _mm512_cvtepi64_epi32(a);
314 | }
315 | 
316 | static inline Vec8ui compress (Vec8uq const a) {         // 64 -> 32 bits
317 |     return _mm512_cvtepi64_epi32(a);
318 | }
319 | 
320 | #else  // no AVX512
321 | 
322 | // compress functions, forwarding both halves to the two-argument compress. overflow wraps around
323 | static inline Vec32c compress (Vec32s const a) {
324 |     return compress(a.get_low(), a.get_high());
325 | }
326 | 
327 | static inline Vec32uc compress (Vec32us const a) {
328 |     return compress(a.get_low(), a.get_high());
329 | }
330 | 
331 | static inline Vec16s compress (Vec16i const a) {
332 |     return compress(a.get_low(), a.get_high());
333 | }
334 | 
335 | static inline Vec16us compress (Vec16ui const a) {
336 |     return compress(a.get_low(), a.get_high());
337 | }
338 | 
339 | static inline Vec8i compress (Vec8q const a) {
340 |     return compress(a.get_low(), a.get_high());
341 | }
342 | 
343 | static inline Vec8ui compress (Vec8uq const a) {
344 |     return compress(a.get_low(), a.get_high());
345 | }
346 | 
347 | #endif  // AVX512
348 | 
349 | /*****************************************************************************
350 | *
351 | *          Conversions between float and double
352 | *
353 | *****************************************************************************/
354 | 
355 | #if INSTRSET >= 9  // AVX512. 512 bit float vectors
356 | 
357 | // float to double: 8 floats become 8 doubles
358 | static inline Vec8d to_double (Vec8f const a) {
359 |     return _mm512_cvtps_pd(a);
360 | }
361 | 
362 | // double to float: 8 doubles become 8 floats
363 | static inline Vec8f to_float (Vec8d const a) {
364 |     return _mm512_cvtpd_ps(a);
365 | }
366 | 
367 | #else  // no AVX512. 512 bit float vectors are emulated
368 | 
369 | // float to double, converting the low and high halves separately
370 | static inline Vec8d to_double (Vec8f const a) {
371 |     Vec4d lo = to_double(a.get_low());
372 |     Vec4d hi = to_double(a.get_high());
373 |     return Vec8d(lo,hi);
374 | }
375 | 
376 | // double to float, converting the low and high halves separately
377 | static inline Vec8f to_float (Vec8d const a) {
378 |     Vec4f lo = to_float(a.get_low());
379 |     Vec4f hi = to_float(a.get_high());
380 |     return Vec8f(lo, hi);
381 | }
382 | 
383 | #endif  // AVX512
384 | 
385 | #endif // MAX_VECTOR_SIZE >= 512
386 | 
387 | // double to float for a 128 bit vector: the 2 doubles are converted and returned in a Vec4f
388 | static inline Vec4f to_float (Vec2d const a) {
389 |     return _mm_cvtpd_ps(a);
390 | }
391 | 
392 | 
393 | /*****************************************************************************
394 | *
395 | *          Generic template functions
396 | *
397 | *  These templates define functions for multiple vector types in one template
398 | *
399 | *****************************************************************************/
400 | 
401 | // horizontal min/max of vector elements
402 | // implemented with universal template, works for all vector types:
403 | 
404 | template <typename T> auto horizontal_min(T const x) {
405 |     // float or double vectors need a NAN test first, because min does not guarantee NAN propagation
406 |     if constexpr ((T::elementtype() & 16) != 0) {
407 |         auto const nan_mask = is_nan(x);
408 |         if (horizontal_or(nan_mask)) {
409 |             return x[horizontal_find_first(nan_mask)];   // hand back the first NAN element
410 |         }
411 |     }
412 |     return horizontal_min1(x);                           // no NAN involved: plain minimum
413 | }
414 | // horizontal minimum without the NAN test; horizontal_min calls this after its own NAN check
415 | template <typename T> auto horizontal_min1(T const x) {
416 |     if constexpr (T::elementtype() <= 3) {       // boolean vector type
417 |         return horizontal_and(x);                // minimum of booleans is true only if all are true
418 |     }
419 |     else if constexpr (sizeof(T) >= 32) {
420 |         // split recursively into smaller vectors
421 |         return horizontal_min1(min(x.get_low(), x.get_high()));
422 |     }
423 |     else if constexpr (T::size() == 2) {
424 |         T a = permute2 <1, V_DC>(x);             // high half
425 |         T b = min(a, x);                         // element 0 = min of both elements
426 |         return b[0];
427 |     }
428 |     else if constexpr (T::size() == 4) {
429 |         T a = permute4<2, 3, V_DC, V_DC>(x);     // high half
430 |         T b = min(a, x);                         // elements 0-1 hold the pairwise minima
431 |         a = permute4<1, V_DC, V_DC, V_DC>(b);    // bring element 1 down to position 0
432 |         b = min(a, b);
433 |         return b[0];                             // element 0 is the overall minimum
434 |     }
435 |     else if constexpr (T::size() == 8) {
436 |         T a = permute8<4, 5, 6, 7, V_DC, V_DC, V_DC, V_DC>(x);  // high half
437 |         T b = min(a, x);                         // 8 candidates reduced to 4
438 |         a = permute8<2, 3, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC>(b);
439 |         b = min(a, b);                           // 4 candidates reduced to 2
440 |         a = permute8<1, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC>(b);
441 |         b = min(a, b);                           // 2 candidates reduced to 1
442 |         return b[0];
443 |     }
444 |     else {
445 |         static_assert(T::size() == 16);          // no other size is allowed
446 |         T a = permute16<8, 9, 10, 11, 12, 13, 14, 15, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC >(x);  // high half
447 |         T b = min(a, x);                         // 16 candidates reduced to 8
448 |         a = permute16<4, 5, 6, 7, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC>(b);
449 |         b = min(a, b);                           // 8 candidates reduced to 4
450 |         a = permute16<2, 3, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC>(b);
451 |         b = min(a, b);                           // 4 candidates reduced to 2
452 |         a = permute16<1, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC>(b);
453 |         b = min(a, b);                           // 2 candidates reduced to 1
454 |         return b[0];
455 |     }
456 | }
457 | 
458 | template <typename T> auto horizontal_max(T const x) {
459 |     // float or double vectors need a NAN test first, because max does not guarantee NAN propagation
460 |     if constexpr ((T::elementtype() & 16) != 0) {
461 |         auto const nan_mask = is_nan(x);
462 |         if (horizontal_or(nan_mask)) {
463 |             return x[horizontal_find_first(nan_mask)];   // hand back the first NAN element
464 |         }
465 |     }
466 |     return horizontal_max1(x);                           // no NAN involved: plain maximum
467 | }
468 | // horizontal maximum without the NAN test; horizontal_max calls this after its own NAN check
469 | template <typename T> auto horizontal_max1(T const x) {
470 |     if constexpr (T::elementtype() <= 3) {       // boolean vector type
471 |         return horizontal_or(x);                 // maximum of booleans is true if any is true
472 |     }
473 |     else if constexpr (sizeof(T) >= 32) {
474 |         // split recursively into smaller vectors
475 |         return horizontal_max1(max(x.get_low(), x.get_high()));
476 |     }
477 |     else if constexpr (T::size() == 2) {
478 |         T a = permute2 <1, V_DC>(x);             // high half
479 |         T b = max(a, x);                         // element 0 = max of both elements
480 |         return b[0];
481 |     }
482 |     else if constexpr (T::size() == 4) {
483 |         T a = permute4<2, 3, V_DC, V_DC>(x);     // high half
484 |         T b = max(a, x);                         // elements 0-1 hold the pairwise maxima
485 |         a = permute4<1, V_DC, V_DC, V_DC>(b);    // bring element 1 down to position 0
486 |         b = max(a, b);
487 |         return b[0];                             // element 0 is the overall maximum
488 |     }
489 |     else if constexpr (T::size() == 8) {
490 |         T a = permute8<4, 5, 6, 7, V_DC, V_DC, V_DC, V_DC>(x);  // high half
491 |         T b = max(a, x);                         // 8 candidates reduced to 4
492 |         a = permute8<2, 3, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC>(b);
493 |         b = max(a, b);                           // 4 candidates reduced to 2
494 |         a = permute8<1, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC>(b);
495 |         b = max(a, b);                           // 2 candidates reduced to 1
496 |         return b[0];
497 |     }
498 |     else {
499 |         static_assert(T::size() == 16);          // no other size is allowed
500 |         T a = permute16<8, 9, 10, 11, 12, 13, 14, 15, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC >(x);  // high half
501 |         T b = max(a, x);                         // 16 candidates reduced to 8
502 |         a = permute16<4, 5, 6, 7, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC>(b);
503 |         b = max(a, b);                           // 8 candidates reduced to 4
504 |         a = permute16<2, 3, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC>(b);
505 |         b = max(a, b);                           // 4 candidates reduced to 2
506 |         a = permute16<1, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC>(b);
507 |         b = max(a, b);                           // 2 candidates reduced to 1
508 |         return b[0];
509 |     }
510 | }
511 | 
512 | // Index of the first true element of a boolean vector, or -1 if none is true
513 | template <typename V>
514 | static inline int horizontal_find_first(V const x) {
515 |     static_assert(V::elementtype() == 2 || V::elementtype() == 3, "Boolean vector expected");
516 |     auto const mask = to_bits(x);                // boolean vector converted to bits
517 |     if (mask == 0) return -1;                    // no element is true
518 |     if constexpr (V::size() >= 32) {
519 |         return bit_scan_forward(mask);           // scanned at the width to_bits returned
520 |     }
521 |     else {
522 |         return bit_scan_forward(static_cast<uint32_t>(mask));
523 |     }
524 | }
525 | 
526 | // Number of true elements in a boolean vector
527 | template <typename V>
528 | static inline int horizontal_count(V const x) {
529 |     static_assert(V::elementtype() == 2 || V::elementtype() == 3, "Boolean vector expected");
530 |     auto const mask = to_bits(x);                // boolean vector converted to bits
531 |     if constexpr (V::size() >= 32) {
532 |         return static_cast<int>(vml_popcnt(mask));
533 |     }
534 |     else {
535 |         return static_cast<int>(vml_popcnt(static_cast<uint32_t>(mask)));
536 |     }
537 | }
538 | 
539 | // Element by element maximum of two vectors. This version is sure to propagate NANs,
540 | // conforming to the new IEEE-754 2019 standard
541 | template <typename V>
542 | static inline V maximum(V const a, V const b) {
543 |     if constexpr (V::elementtype() >= 16) {              // float or double vector
544 |         V result = select(is_nan(a), a, max(a, b));      // keep a wherever a is NAN
545 | #ifdef SIGNED_ZERO                                       // pedantic about signed zero
546 |         result = select(a == b, a & b, result);          // maximum(+0, -0) = +0
547 | #endif
548 |         return result;
549 |     }
550 |     else {
551 |         return max(a, b);                                // integer type
552 |     }
553 | }
554 | 
555 | template <typename V>
556 | static inline V minimum(V const a, V const b) {
557 |     if constexpr (V::elementtype() >= 16) {              // float or double vector
558 |         V result = select(is_nan(a), a, min(a, b));      // keep a wherever a is NAN
559 | #ifdef SIGNED_ZERO                                       // pedantic about signed zero
560 |         result = select(a == b, a | b, result);          // minimum(+0, -0) = -0
561 | #endif
562 |         return result;
563 |     }
564 |     else {
565 |         return min(a, b);                                // integer type
566 |     }
567 | }
568 | 
569 | 
570 | #ifdef VCL_NAMESPACE
571 | }
572 | #endif
573 | 
574 | #endif // VECTOR_CONVERT_H
575 | 


--------------------------------------------------------------------------------
/src/VCL2/vectorclass.h:
--------------------------------------------------------------------------------
 1 | /****************************  vectorclass.h   ********************************
 2 | * Author:        Agner Fog
 3 | * Date created:  2012-05-30
 4 | * Last modified: 2021-08-18
 5 | * Version:       2.01.04
 6 | * Project:       vector class library
 7 | * Home:          https://github.com/vectorclass
 8 | * Description:
 9 | * Header file defining vector classes as interface to intrinsic functions
10 | * in x86 and x86-64 microprocessors with SSE2 and later instruction sets.
11 | *
12 | * Instructions:
13 | * Use Gnu, Clang, Intel or Microsoft C++ compiler. Compile for the desired
14 | * instruction set, which must be at least SSE2. Specify the supported
15 | * instruction set by a command line define, e.g. __SSE4_1__ if the
16 | * compiler does not automatically do so.
17 | * For detailed instructions, see vcl_manual.pdf
18 | *
19 | * Each vector object is represented internally in the CPU as a vector
20 | * register with 128, 256 or 512 bits.
21 | *
22 | * This header file includes the appropriate header files depending on the
23 | * selected instruction set.
24 | *
25 | * (c) Copyright 2012-2021 Agner Fog.
26 | * Apache License version 2.0 or later.
27 | ******************************************************************************/
28 | #ifndef VECTORCLASS_H
29 | #define VECTORCLASS_H  20103
30 | 
31 | // Maximum vector size, bits. Allowed values are 128, 256, 512
32 | #ifndef MAX_VECTOR_SIZE
33 | #define MAX_VECTOR_SIZE 512
34 | #endif
35 | 
36 | // Determine instruction set, and define platform-dependent functions
37 | #include "instrset.h"        // Select supported instruction set
38 | 
39 | #if INSTRSET < 2             // instruction set SSE2 is the minimum
40 | #error Please compile for the SSE2 instruction set or higher
41 | #else
42 | 
43 | // Select appropriate .h files depending on instruction set
44 | #include "vectori128.h"      // 128-bit integer vectors
45 | #include "vectorf128.h"      // 128-bit floating point vectors
46 | 
47 | #if MAX_VECTOR_SIZE >= 256
48 | #if INSTRSET >= 8
49 | #include "vectori256.h"      // 256-bit integer vectors, requires AVX2 instruction set
50 | #else
51 | #include "vectori256e.h"     // 256-bit integer vectors, emulated
52 | #endif  // INSTRSET >= 8
53 | #if INSTRSET >= 7
54 | #include "vectorf256.h"      // 256-bit floating point vectors, requires AVX instruction set
55 | #else
56 | #include "vectorf256e.h"     // 256-bit floating point vectors, emulated
57 | #endif  //  INSTRSET >= 7
58 | #endif  //  MAX_VECTOR_SIZE >= 256
59 | 
60 | #if MAX_VECTOR_SIZE >= 512
61 | #if INSTRSET >= 9
62 | #include "vectori512.h"      // 512-bit vectors of 32 and 64 bit integers, requires AVX512F instruction set
63 | #include "vectorf512.h"      // 512-bit floating point vectors, requires AVX512F instruction set
64 | #else
65 | #include "vectori512e.h"     // 512-bit integer vectors, emulated
66 | #include "vectorf512e.h"     // 512-bit floating point vectors, emulated
67 | #endif  //  INSTRSET >= 9
68 | #if INSTRSET >= 10
69 | #include "vectori512s.h"     // 512-bit vectors of 8 and 16 bit integers, requires AVX512BW instruction set
70 | #else
71 | #include "vectori512se.h"    // 512-bit vectors of 8 and 16 bit integers, emulated
72 | #endif
73 | #endif  //  MAX_VECTOR_SIZE >= 512
74 | 
75 | #include "vector_convert.h"  // conversion between different vector sizes
76 | 
77 | #endif  // INSTRSET >= 2
78 | 
79 | 
80 | #else   // VECTORCLASS_H
81 | 
82 | #if VECTORCLASS_H < 20000
83 | #error Mixed versions of vector class library
84 | #endif
85 | 
86 | #endif  // VECTORCLASS_H
87 | 


--------------------------------------------------------------------------------
/src/VCL2/vectormath_common.h:
--------------------------------------------------------------------------------
  1 | /***************************  vectormath_common.h   ****************************
  2 | * Author:        Agner Fog
  3 | * Date created:  2014-04-18
  4 | * Last modified: 2020-06-08
  5 | * Version:       2.01.03
  6 | * Project:       vector classes
  7 | * Description:
  8 | * Header file containing common code for inline version of mathematical functions.
  9 | *
 10 | * For detailed instructions, see VectorClass.pdf
 11 | *
 12 | * (c) Copyright 2014-2020 Agner Fog.
 13 | * Apache License version 2.0 or later.
 14 | ******************************************************************************/
 15 | 
 16 | #ifndef VECTORMATH_COMMON_H
 17 | #define VECTORMATH_COMMON_H  2
 18 | 
 19 | #ifdef VECTORMATH_LIB_H
 20 | #error conflicting header files. More than one implementation of mathematical functions included
 21 | #endif
 22 | 
 23 | #include <cmath>
 24 | 
 25 | #ifndef VECTORCLASS_H
 26 | #include "vectorclass.h"
 27 | #endif
 28 | 
 29 | #if VECTORCLASS_H < 20000
 30 | #error Incompatible versions of vector class library mixed
 31 | #endif
 32 | 
 33 | 
 34 | /******************************************************************************
 35 |                     Define NAN payload values
 36 | ******************************************************************************/
 37 | #define NAN_LOG 0x101  // logarithm for x<0
 38 | #define NAN_POW 0x102  // negative number raised to non-integer power
 39 | #define NAN_HYP 0x104  // acosh for x<1 and atanh for abs(x)>1
 40 | 
 41 | 
 42 | /******************************************************************************
 43 |                     Define mathematical constants
 44 | ******************************************************************************/
 45 | #define VM_PI       3.14159265358979323846           // pi
 46 | #define VM_PI_2     1.57079632679489661923           // pi / 2
 47 | #define VM_PI_4     0.785398163397448309616          // pi / 4
 48 | #define VM_SQRT2    1.41421356237309504880           // sqrt(2)
 49 | #define VM_LOG2E    1.44269504088896340736           // 1/log(2)
 50 | #define VM_LOG10E   0.434294481903251827651          // 1/log(10)
 51 | #define VM_LOG210   3.321928094887362347808          // log2(10)
 52 | #define VM_LN2      0.693147180559945309417          // log(2)
 53 | #define VM_LN10     2.30258509299404568402           // log(10)
 54 | #define VM_SMALLEST_NORMAL  2.2250738585072014E-308  // smallest normal number, double
 55 | #define VM_SMALLEST_NORMALF 1.17549435E-38f          // smallest normal number, float
 56 | 
 57 | 
 58 | #ifdef VCL_NAMESPACE
 59 | namespace VCL_NAMESPACE {
 60 | #endif
 61 | 
 62 | /******************************************************************************
 63 |       templates for producing infinite and nan in desired vector type
 64 | ******************************************************************************/
// Generic factory returning an "infinite" value in the vector type VTYPE.
// The primary template is only declared here; the explicit specializations
// forward to the matching infiniteNx() function for each vector type.
template <class VTYPE>
static inline VTYPE infinite_vec();

// Specialization for Vec2d: forwards to infinite2d()
template <>
inline Vec2d infinite_vec<Vec2d>() {
    return infinite2d();
}

// Specialization for Vec4f: forwards to infinite4f()
template <>
inline Vec4f infinite_vec<Vec4f>() {
    return infinite4f();
}
 77 | 
 78 | #if MAX_VECTOR_SIZE >= 256
 79 | 
// Specialization of infinite_vec for Vec4d: forwards to infinite4d()
template <>
inline Vec4d infinite_vec<Vec4d>() {
    return infinite4d();
}

// Specialization of infinite_vec for Vec8f: forwards to infinite8f()
template <>
inline Vec8f infinite_vec<Vec8f>() {
    return infinite8f();
}
 89 | 
 90 | #endif // MAX_VECTOR_SIZE >= 256
 91 | 
 92 | #if MAX_VECTOR_SIZE >= 512
 93 | 
// Specialization of infinite_vec for Vec8d: forwards to infinite8d()
template <>
inline Vec8d infinite_vec<Vec8d>() {
    return infinite8d();
}

// Specialization of infinite_vec for Vec16f: forwards to infinite16f()
template <>
inline Vec16f infinite_vec<Vec16f>() {
    return infinite16f();
}
103 | 
104 | #endif // MAX_VECTOR_SIZE >= 512
105 | 
106 | 
107 | 
108 | /******************************************************************************
109 | *                 Detect NAN codes
110 | *
111 | * These functions return the code hidden in a NAN. The sign bit is ignored
112 | ******************************************************************************/
113 | 
114 | static inline Vec4ui nan_code(Vec4f const x) {
115 |     Vec4ui a = Vec4ui(reinterpret_i(x));
116 |     Vec4ui const n = 0x007FFFFF;
117 |     return select(Vec4ib(is_nan(x)), a & n, 0);
118 | }
119 | 
120 | // This function returns the code hidden in a NAN. The sign bit is ignored
121 | static inline Vec2uq nan_code(Vec2d const x) {
122 |     Vec2uq a = Vec2uq(reinterpret_i(x));
123 |     return select(Vec2qb(is_nan(x)), a << 12 >> (12+29), 0);
124 | }
125 | 
126 | #if MAX_VECTOR_SIZE >= 256
127 | 
128 | // This function returns the code hidden in a NAN. The sign bit is ignored
129 | static inline Vec8ui nan_code(Vec8f const x) {
130 |     Vec8ui a = Vec8ui(reinterpret_i(x));
131 |     Vec8ui const n = 0x007FFFFF;
132 |     return select(Vec8ib(is_nan(x)), a & n, 0);
133 | }
134 | 
135 | // This function returns the code hidden in a NAN. The sign bit is ignored
136 | static inline Vec4uq nan_code(Vec4d const x) {
137 |     Vec4uq a = Vec4uq(reinterpret_i(x));
138 |     return select(Vec4qb(is_nan(x)), a << 12 >> (12+29), 0);
139 | }
140 | 
141 | #endif // MAX_VECTOR_SIZE >= 256
142 | #if MAX_VECTOR_SIZE >= 512
143 | 
144 | // This function returns the code hidden in a NAN. The sign bit is ignored
145 | static inline Vec16ui nan_code(Vec16f const x) {
146 |     Vec16ui a = Vec16ui(reinterpret_i(x));
147 |     Vec16ui const n = 0x007FFFFF;
148 |     return select(Vec16ib(is_nan(x)), a & n, 0);
149 | }
150 | 
151 | // This function returns the code hidden in a NAN. The sign bit is ignored
152 | static inline Vec8uq nan_code(Vec8d const x) {
153 |     Vec8uq a = Vec8uq(reinterpret_i(x));
154 |     return select(Vec8qb(is_nan(x)), a << 12 >> (12+29), 0);
155 | }
156 | 
157 | #endif // MAX_VECTOR_SIZE >= 512
158 | 
159 | 
160 | /******************************************************************************
161 |                   templates for polynomials
162 | Using Estrin's scheme to make shorter dependency chains and use FMA, starting
163 | longest dependency chains first.
164 | ******************************************************************************/
165 | 
template <class VTYPE, class CTYPE>
static inline VTYPE polynomial_2(VTYPE const x, CTYPE c0, CTYPE c1, CTYPE c2) {
    // Evaluates the polynomial c2*x^2 + c1*x + c0
    // VTYPE may be a vector type, CTYPE is a scalar type
    VTYPE const xp2 = x * x;
    auto const t01 = mul_add(x, c1, c0);         // c1*x + c0
    // result = xp2 * c2 + t01
    return mul_add(xp2, c2, t01);
}
175 | 
template<class VTYPE, class CTYPE>
static inline VTYPE polynomial_3(VTYPE const x, CTYPE c0, CTYPE c1, CTYPE c2, CTYPE c3) {
    // Evaluates the polynomial c3*x^3 + c2*x^2 + c1*x + c0
    // VTYPE may be a vector type, CTYPE is a scalar type
    VTYPE const xp2 = x * x;
    auto const t23 = mul_add(c3, x, c2);         // c3*x + c2
    auto const t01 = mul_add(c1, x, c0);         // c1*x + c0
    // result = t23 * xp2 + t01
    return mul_add(t23, xp2, t01);
}
184 | 
template<class VTYPE, class CTYPE>
static inline VTYPE polynomial_4(VTYPE const x, CTYPE c0, CTYPE c1, CTYPE c2, CTYPE c3, CTYPE c4) {
    // Evaluates the polynomial c4*x^4 + c3*x^3 + c2*x^2 + c1*x + c0
    // VTYPE may be a vector type, CTYPE is a scalar type
    VTYPE const xp2 = x * x;
    VTYPE const xp4 = xp2 * xp2;
    auto const t23 = mul_add(c3, x, c2);         // c3*x + c2
    auto const t01 = mul_add(c1, x, c0);         // c1*x + c0
    // result = t23 * xp2 + (t01 + c4*xp4)
    return mul_add(t23, xp2, t01 + c4 * xp4);
}
194 | 
template<class VTYPE, class CTYPE>
static inline VTYPE polynomial_4n(VTYPE const x, CTYPE c0, CTYPE c1, CTYPE c2, CTYPE c3) {
    // Evaluates the polynomial 1*x^4 + c3*x^3 + c2*x^2 + c1*x + c0
    // (leading coefficient is 1)
    // VTYPE may be a vector type, CTYPE is a scalar type
    VTYPE const xp2 = x * x;
    VTYPE const xp4 = xp2 * xp2;
    auto const t23 = mul_add(c3, x, c2);         // c3*x + c2
    auto const t01 = mul_add(c1, x, c0);         // c1*x + c0
    // result = t23 * xp2 + (t01 + xp4)
    return mul_add(t23, xp2, t01 + xp4);
}
204 | 
template<class VTYPE, class CTYPE>
static inline VTYPE polynomial_5(VTYPE const x, CTYPE c0, CTYPE c1, CTYPE c2, CTYPE c3, CTYPE c4, CTYPE c5) {
    // Evaluates the polynomial c5*x^5 + c4*x^4 + c3*x^3 + c2*x^2 + c1*x + c0
    // VTYPE may be a vector type, CTYPE is a scalar type
    VTYPE const xp2 = x * x;
    VTYPE const xp4 = xp2 * xp2;
    auto const t23 = mul_add(c3, x, c2);         // c3*x + c2
    auto const t45 = mul_add(c5, x, c4);         // c5*x + c4
    auto const t01 = mul_add(c1, x, c0);         // c1*x + c0
    // result = t23 * xp2 + (t45 * xp4 + t01)
    return mul_add(t23, xp2, mul_add(t45, xp4, t01));
}
214 | 
template<class VTYPE, class CTYPE>
static inline VTYPE polynomial_5n(VTYPE const x, CTYPE c0, CTYPE c1, CTYPE c2, CTYPE c3, CTYPE c4) {
    // Evaluates the polynomial 1*x^5 + c4*x^4 + c3*x^3 + c2*x^2 + c1*x + c0
    // (leading coefficient is 1)
    // VTYPE may be a vector type, CTYPE is a scalar type
    VTYPE const xp2 = x * x;
    VTYPE const xp4 = xp2 * xp2;
    auto const t23 = mul_add(c3, x, c2);         // c3*x + c2
    auto const t45 = c4 + x;                     // 1*x + c4
    auto const t01 = mul_add(c1, x, c0);         // c1*x + c0
    // result = t23 * xp2 + (t45 * xp4 + t01)
    return mul_add(t23, xp2, mul_add(t45, xp4, t01));
}
224 | 
template<class VTYPE, class CTYPE>
static inline VTYPE polynomial_6(VTYPE const x, CTYPE c0, CTYPE c1, CTYPE c2, CTYPE c3, CTYPE c4, CTYPE c5, CTYPE c6) {
    // Evaluates the polynomial c6*x^6 + c5*x^5 + c4*x^4 + c3*x^3 + c2*x^2 + c1*x + c0
    // VTYPE may be a vector type, CTYPE is a scalar type
    VTYPE const xp2 = x * x;
    VTYPE const xp4 = xp2 * xp2;
    // upper = c6*xp2 + (c5*x + c4)
    auto const upper = mul_add(c6, xp2, mul_add(c5, x, c4));
    // lower = (c3*x + c2)*xp2 + (c1*x + c0)
    auto const lower = mul_add(mul_add(c3, x, c2), xp2, mul_add(c1, x, c0));
    // result = upper * xp4 + lower
    return mul_add(upper, xp4, lower);
}
234 | 
template<class VTYPE, class CTYPE>
static inline VTYPE polynomial_6n(VTYPE const x, CTYPE c0, CTYPE c1, CTYPE c2, CTYPE c3, CTYPE c4, CTYPE c5) {
    // Evaluates the polynomial 1*x^6 + c5*x^5 + c4*x^4 + c3*x^3 + c2*x^2 + c1*x + c0
    // (leading coefficient is 1)
    // VTYPE may be a vector type, CTYPE is a scalar type
    VTYPE const xp2 = x * x;
    VTYPE const xp4 = xp2 * xp2;
    // upper = c5*x + (c4 + xp2)
    auto const upper = mul_add(c5, x, c4 + xp2);
    // lower = (c3*x + c2)*xp2 + (c1*x + c0)
    auto const lower = mul_add(mul_add(c3, x, c2), xp2, mul_add(c1, x, c0));
    // result = upper * xp4 + lower
    return mul_add(upper, xp4, lower);
}
244 | 
template<class VTYPE, class CTYPE>
static inline VTYPE polynomial_7(VTYPE const x, CTYPE c0, CTYPE c1, CTYPE c2, CTYPE c3, CTYPE c4, CTYPE c5, CTYPE c6, CTYPE c7) {
    // Evaluates the polynomial c7*x^7 + c6*x^6 + c5*x^5 + c4*x^4 + c3*x^3 + c2*x^2 + c1*x + c0
    // VTYPE may be a vector type, CTYPE is a scalar type
    VTYPE const xp2 = x * x;
    VTYPE const xp4 = xp2 * xp2;
    // upper = (c7*x + c6)*xp2 + (c5*x + c4)
    auto const upper = mul_add(mul_add(c7, x, c6), xp2, mul_add(c5, x, c4));
    // lower = (c3*x + c2)*xp2 + (c1*x + c0)
    auto const lower = mul_add(mul_add(c3, x, c2), xp2, mul_add(c1, x, c0));
    // result = upper * xp4 + lower
    return mul_add(upper, xp4, lower);
}
254 | 
template<class VTYPE, class CTYPE>
static inline VTYPE polynomial_8(VTYPE const x, CTYPE c0, CTYPE c1, CTYPE c2, CTYPE c3, CTYPE c4, CTYPE c5, CTYPE c6, CTYPE c7, CTYPE c8) {
    // Evaluates the polynomial c8*x^8 + c7*x^7 + c6*x^6 + c5*x^5 + c4*x^4 + c3*x^3 + c2*x^2 + c1*x + c0
    // VTYPE may be a vector type, CTYPE is a scalar type
    VTYPE const xp2 = x * x;
    VTYPE const xp4 = xp2 * xp2;
    VTYPE const xp8 = xp4 * xp4;
    // upper = (c7*x + c6)*xp2 + (c5*x + c4)
    auto const upper = mul_add(mul_add(c7, x, c6), xp2, mul_add(c5, x, c4));
    // lower = (c3*x + c2)*xp2 + ((c1*x + c0) + c8*xp8)
    auto const lower = mul_add(mul_add(c3, x, c2), xp2, mul_add(c1, x, c0) + c8 * xp8);
    // result = upper * xp4 + lower
    return mul_add(upper, xp4, lower);
}
266 | 
template<class VTYPE, class CTYPE>
static inline VTYPE polynomial_9(VTYPE const x, CTYPE c0, CTYPE c1, CTYPE c2, CTYPE c3, CTYPE c4, CTYPE c5, CTYPE c6, CTYPE c7, CTYPE c8, CTYPE c9) {
    // Evaluates the polynomial c9*x^9 + c8*x^8 + c7*x^7 + c6*x^6 + c5*x^5 + c4*x^4 + c3*x^3 + c2*x^2 + c1*x + c0
    // VTYPE may be a vector type, CTYPE is a scalar type
    VTYPE const xp2 = x * x;
    VTYPE const xp4 = xp2 * xp2;
    VTYPE const xp8 = xp4 * xp4;
    // top = c9*x + c8
    auto const top = mul_add(c9, x, c8);
    // upper = (c7*x + c6)*xp2 + (c5*x + c4)
    auto const upper = mul_add(mul_add(c7, x, c6), xp2, mul_add(c5, x, c4));
    // lower = (c3*x + c2)*xp2 + (c1*x + c0)
    auto const lower = mul_add(mul_add(c3, x, c2), xp2, mul_add(c1, x, c0));
    // result = top * xp8 + (upper * xp4 + lower)
    return mul_add(top, xp8, mul_add(upper, xp4, lower));
}
279 | 
template<class VTYPE, class CTYPE>
static inline VTYPE polynomial_10(VTYPE const x, CTYPE c0, CTYPE c1, CTYPE c2, CTYPE c3, CTYPE c4, CTYPE c5, CTYPE c6, CTYPE c7, CTYPE c8, CTYPE c9, CTYPE c10) {
    // Evaluates the polynomial c10*x^10 + c9*x^9 + c8*x^8 + c7*x^7 + c6*x^6 + c5*x^5 + c4*x^4 + c3*x^3 + c2*x^2 + c1*x + c0
    // VTYPE may be a vector type, CTYPE is a scalar type
    VTYPE const xp2 = x * x;
    VTYPE const xp4 = xp2 * xp2;
    VTYPE const xp8 = xp4 * xp4;
    // top = xp2*c10 + (c9*x + c8)
    auto const top = mul_add(xp2, c10, mul_add(c9, x, c8));
    // upper = (c7*x + c6)*xp2 + (c5*x + c4)
    auto const upper = mul_add(mul_add(c7, x, c6), xp2, mul_add(c5, x, c4));
    // lower = (c3*x + c2)*xp2 + (c1*x + c0)
    auto const lower = mul_add(mul_add(c3, x, c2), xp2, mul_add(c1, x, c0));
    // result = top * xp8 + (upper * xp4 + lower)
    return mul_add(top, xp8, mul_add(upper, xp4, lower));
}
292 | 
template<class VTYPE, class CTYPE>
static inline VTYPE polynomial_13(VTYPE const x, CTYPE c0, CTYPE c1, CTYPE c2, CTYPE c3, CTYPE c4, CTYPE c5, CTYPE c6, CTYPE c7, CTYPE c8, CTYPE c9, CTYPE c10, CTYPE c11, CTYPE c12, CTYPE c13) {
    // Evaluates the polynomial c13*x^13 + c12*x^12 + ... + c1*x + c0
    // VTYPE may be a vector type, CTYPE is a scalar type
    VTYPE const xp2 = x * x;
    VTYPE const xp4 = xp2 * xp2;
    VTYPE const xp8 = xp4 * xp4;
    // high = (c13*x + c12)*xp4 + ((c11*x + c10)*xp2 + (c9*x + c8))
    auto const high = mul_add(
        mul_add(c13, x, c12), xp4,
        mul_add(mul_add(c11, x, c10), xp2, mul_add(c9, x, c8)));
    // low = ((c7*x + c6)*xp2 + (c5*x + c4))*xp4 + ((c3*x + c2)*xp2 + (c1*x + c0))
    auto const low = mul_add(
        mul_add(mul_add(c7, x, c6), xp2, mul_add(c5, x, c4)), xp4,
        mul_add(mul_add(c3, x, c2), xp2, mul_add(c1, x, c0)));
    // result = high * xp8 + low
    return mul_add(high, xp8, low);
}
308 | 
309 | 
template<class VTYPE, class CTYPE>
static inline VTYPE polynomial_13m(VTYPE const x, CTYPE c2, CTYPE c3, CTYPE c4, CTYPE c5, CTYPE c6, CTYPE c7, CTYPE c8, CTYPE c9, CTYPE c10, CTYPE c11, CTYPE c12, CTYPE c13) {
    // Evaluates the polynomial c13*x^13 + c12*x^12 + ... + c2*x^2 + x + 0
    // (the x^1 coefficient is 1 and the constant term is 0, so there are no c1 and c0 parameters)
    // VTYPE may be a vector type, CTYPE is a scalar type
    VTYPE const xp2 = x * x;
    VTYPE const xp4 = xp2 * xp2;
    VTYPE const xp8 = xp4 * xp4;
    // high = (c13*x + c12)*xp4 + ((c11*x + c10)*xp2 + (c9*x + c8))
    auto const high = mul_add(
        mul_add(c13, x, c12), xp4,
        mul_add(mul_add(c11, x, c10), xp2, mul_add(c9, x, c8)));
    // low = ((c7*x + c6)*xp2 + (c5*x + c4))*xp4 + ((c3*x + c2)*xp2 + x)
    auto const low = mul_add(
        mul_add(mul_add(c7, x, c6), xp2, mul_add(c5, x, c4)), xp4,
        mul_add(mul_add(c3, x, c2), xp2, x));
    // result = high * xp8 + low
    return mul_add(high, xp8, low);
}
322 | 
323 | #ifdef VCL_NAMESPACE
324 | }
325 | #endif
326 | 
327 | #endif
328 | 


--------------------------------------------------------------------------------
/src/VCL2/vectormath_hyp.h:
--------------------------------------------------------------------------------
  1 | /****************************  vectormath_hyp.h   ******************************
  2 | * Author:        Agner Fog
  3 | * Date created:  2014-07-09
  4 | * Last modified: 2019-08-01
  5 | * Version:       2.00.00
  6 | * Project:       vector class library
  7 | * Description:
  8 | * Header file containing inline vector functions of hyperbolic and inverse
  9 | * hyperbolic functions:
 10 | * sinh        hyperbolic sine
 11 | * cosh        hyperbolic cosine
 12 | * tanh        hyperbolic tangent
 13 | * asinh       inverse hyperbolic sine
 14 | * acosh       inverse hyperbolic cosine
 15 | * atanh       inverse hyperbolic tangent
 16 | *
 17 | * Theory, methods and inspiration based partially on these sources:
 18 | * > Moshier, Stephen Lloyd Baluk: Methods and programs for mathematical functions.
 19 | *   Ellis Horwood, 1989.
 20 | * > VDT library developed on CERN by Danilo Piparo, Thomas Hauth and
 21 | *   Vincenzo Innocente, 2012, https://svnweb.cern.ch/trac/vdt
 22 | * > Cephes math library by Stephen L. Moshier 1992,
 23 | *   http://www.netlib.org/cephes/
 24 | *
 25 | * For detailed instructions, see vectormath_common.h and vcl_manual.pdf
 26 | *
 27 | * (c) Copyright 2014-2019 Agner Fog.
 28 | * Apache License version 2.0 or later.
 29 | ******************************************************************************/
 30 | 
 31 | #ifndef VECTORMATH_HYP_H
 32 | #define VECTORMATH_HYP_H  1
 33 | 
 34 | #include "vectormath_exp.h"
 35 | 
 36 | #ifdef VCL_NAMESPACE
 37 | namespace VCL_NAMESPACE {
 38 | #endif
 39 | 
 40 | /******************************************************************************
 41 | *                 Hyperbolic functions
 42 | ******************************************************************************/
 43 | 
 44 | // Template for sinh function, double precision
 45 | // This function does not produce denormals
 46 | // Template parameters:
 47 | // VTYPE:  double vector type
 48 | template<typename VTYPE>
 49 | static inline VTYPE sinh_d(VTYPE const x0) {
 50 | // The limit of abs(x) is 709.7, as defined by max_x in vectormath_exp.h for 0.5*exp(x).
 51 | 
 52 |     // Coefficients
 53 |     const double p0 = -3.51754964808151394800E5;
 54 |     const double p1 = -1.15614435765005216044E4;
 55 |     const double p2 = -1.63725857525983828727E2;
 56 |     const double p3 = -7.89474443963537015605E-1;
 57 | 
 58 |     const double q0 = -2.11052978884890840399E6;
 59 |     const double q1 =  3.61578279834431989373E4;
 60 |     const double q2 = -2.77711081420602794433E2;
 61 |     const double q3 =  1.0;
 62 | 
 63 |     // data vectors
 64 |     VTYPE  x, x2, y1, y2;
 65 | 
 66 |     x = abs(x0);
 67 |     auto x_small = x <= 1.0;                     // use Pade approximation if abs(x) <= 1
 68 | 
 69 |     if (horizontal_or(x_small)) {
 70 |         // At least one element needs small method
 71 |         x2 = x*x;
 72 |         y1 = polynomial_3(x2, p0, p1, p2, p3) / polynomial_3(x2, q0, q1, q2, q3);
 73 |         y1 = mul_add(y1, x*x2, x);               // y1 = x + x2*(x*y1);
 74 |     }
 75 |     if (!horizontal_and(x_small)) {
 76 |         // At least one element needs big method
 77 |         y2 =  exp_d<VTYPE, 0, 1>(x);             //   0.5 * exp(x)
 78 |         y2 -= 0.25 / y2;                         // - 0.5 * exp(-x)
 79 |     }
 80 |     y1 = select(x_small, y1, y2);                // choose method
 81 |     y1 = sign_combine(y1, x0);                   // get original sign
 82 |     // you can avoid the sign_combine by replacing x by x0 above, but at a loss of precision
 83 | 
 84 |     return y1;
 85 | }
 86 | 
// instances of sinh_d template
// One sinh overload per double vector type; each forwards to sinh_d.
// The 256-bit and 512-bit overloads are only compiled when MAX_VECTOR_SIZE
// allows vectors of that size.
static inline Vec2d sinh(Vec2d const x) {
    return sinh_d(x);
}

#if MAX_VECTOR_SIZE >= 256
static inline Vec4d sinh(Vec4d const x) {
    return sinh_d(x);
}
#endif // MAX_VECTOR_SIZE >= 256

#if MAX_VECTOR_SIZE >= 512
static inline Vec8d sinh(Vec8d const x) {
    return sinh_d(x);
}
#endif // MAX_VECTOR_SIZE >= 512
103 | 
104 | 
105 | // Template for sinh function, single precision
106 | // This function does not produce denormals
107 | // Template parameters:
108 | // VTYPE:  double vector type
109 | template<typename VTYPE>
110 | static inline VTYPE sinh_f(VTYPE const x0) {
111 | // The limit of abs(x) is 89.0, as defined by max_x in vectormath_exp.h for 0.5*exp(x).
112 | 
113 |     // Coefficients
114 |     const float r0 = 1.66667160211E-1f;
115 |     const float r1 = 8.33028376239E-3f;
116 |     const float r2 = 2.03721912945E-4f;
117 | 
118 |     // data vectors
119 |     VTYPE x, x2, y1, y2;
120 | 
121 |     x = abs(x0);
122 |     auto x_small = x <= 1.0f;                    // use polynomial approximation if abs(x) <= 1
123 | 
124 |     if (horizontal_or(x_small)) {
125 |         // At least one element needs small method
126 |         x2 = x*x;
127 |         y1 = polynomial_2(x2, r0, r1, r2);
128 |         y1 = mul_add(y1, x2*x, x);               // y1 = x + x2*(x*y1);
129 |     }
130 |     if (!horizontal_and(x_small)) {
131 |         // At least one element needs big method
132 |         y2 =  exp_f<VTYPE, 0, 1>(x);             //   0.5 * exp(x)
133 |         y2 -= 0.25f / y2;                        // - 0.5 * exp(-x)
134 |     }
135 |     y1 = select(x_small, y1, y2);                // choose method
136 |     y1 = sign_combine(y1, x0);                   // get original sign
137 |     // you can avoid the sign_combine by replacing x by x0 above, but at a loss of precision
138 | 
139 |     return y1;
140 | }
141 | 
// instances of sinh_f template
// One sinh overload per float vector type; each forwards to sinh_f.
// The 256-bit and 512-bit overloads are only compiled when MAX_VECTOR_SIZE
// allows vectors of that size.
static inline Vec4f sinh(Vec4f const x) {
    return sinh_f(x);
}

#if MAX_VECTOR_SIZE >= 256
static inline Vec8f sinh(Vec8f const x) {
    return sinh_f(x);
}
#endif // MAX_VECTOR_SIZE >= 256

#if MAX_VECTOR_SIZE >= 512
static inline Vec16f sinh(Vec16f const x) {
    return sinh_f(x);
}
#endif // MAX_VECTOR_SIZE >= 512
158 | 
159 | 
160 | // Template for cosh function, double precision
161 | // This function does not produce denormals
162 | // Template parameters:
163 | // VTYPE:  double vector type
164 | template<typename VTYPE>
165 | static inline VTYPE cosh_d(VTYPE const x0) {
166 | // The limit of abs(x) is 709.7, as defined by max_x in vectormath_exp.h for 0.5*exp(x).
167 | 
168 |     // data vectors
169 |     VTYPE x, y;
170 |     x  = abs(x0);
171 |     y  = exp_d<VTYPE, 0, 1>(x);                  //   0.5 * exp(x)
172 |     y += 0.25 / y;                               // + 0.5 * exp(-x)
173 |     return y;
174 | }
175 | 
// instances of cosh_d template
// One cosh overload per double vector type; each forwards to cosh_d.
// The 256-bit and 512-bit overloads are only compiled when MAX_VECTOR_SIZE
// allows vectors of that size.
static inline Vec2d cosh(Vec2d const x) {
    return cosh_d(x);
}

#if MAX_VECTOR_SIZE >= 256
static inline Vec4d cosh(Vec4d const x) {
    return cosh_d(x);
}
#endif // MAX_VECTOR_SIZE >= 256

#if MAX_VECTOR_SIZE >= 512
static inline Vec8d cosh(Vec8d const x) {
    return cosh_d(x);
}
#endif // MAX_VECTOR_SIZE >= 512
192 | 
193 | 
194 | // Template for cosh function, single precision
195 | // This function does not produce denormals
196 | // Template parameters:
197 | // VTYPE:  double vector type
198 | template<typename VTYPE>
199 | static inline VTYPE cosh_f(VTYPE const x0) {
200 | // The limit of abs(x) is 89.0, as defined by max_x in vectormath_exp.h for 0.5*exp(x).
201 | 
202 |     // data vectors
203 |     VTYPE x, y;
204 |     x  = abs(x0);
205 |     y  = exp_f<VTYPE, 0, 1>(x);                  //   0.5 * exp(x)
206 |     y += 0.25f / y;                              // + 0.5 * exp(-x)
207 |     return y;
208 | }
209 | 
// instances of cosh_f template
// One cosh overload per float vector type; each forwards to cosh_f.
// The 256-bit and 512-bit overloads are only compiled when MAX_VECTOR_SIZE
// allows vectors of that size.
static inline Vec4f cosh(Vec4f const x) {
    return cosh_f(x);
}

#if MAX_VECTOR_SIZE >= 256
static inline Vec8f cosh(Vec8f const x) {
    return cosh_f(x);
}
#endif // MAX_VECTOR_SIZE >= 256

#if MAX_VECTOR_SIZE >= 512
static inline Vec16f cosh(Vec16f const x) {
    return cosh_f(x);
}
#endif // MAX_VECTOR_SIZE >= 512
226 | 
227 | 
// Template for tanh function, double precision
// Elements with abs(x) <= 5/8 use a Pade approximation, the remaining
// elements use 1 - 2/(exp(2*x) + 1). Elements with abs(x) > 350 are set to 1
// to avoid overflow. The sign of the input is restored at the end.
// This function does not produce denormals
// Template parameters:
// VTYPE:  double vector type
template<typename VTYPE>
static inline VTYPE tanh_d(VTYPE const x0) {

    // Numerator coefficients of the Pade approximation
    const double p0 = -1.61468768441708447952E3;
    const double p1 = -9.92877231001918586564E1;
    const double p2 = -9.64399179425052238628E-1;

    // Denominator coefficients of the Pade approximation
    const double q0 =  4.84406305325125486048E3;
    const double q1 =  2.23548839060100448583E3;
    const double q2 =  1.12811678491632931402E2;
    const double q3 =  1.0;

    VTYPE const ax = abs(x0);                    // work on abs(x), sign is restored below
    auto const use_small = ax <= 0.625;          // elements that take the Pade approximation

    // Each result is only assigned when at least one element needs it
    VTYPE y_small, y_big;

    if (horizontal_or(use_small)) {
        // At least one element needs small method
        VTYPE const ax2 = ax * ax;
        VTYPE const ratio = polynomial_2(ax2, p0, p1, p2) / polynomial_3(ax2, q0, q1, q2, q3);
        y_small = mul_add(ratio, ax2 * ax, ax);  // x + (x2*x)*ratio
    }
    if (!horizontal_and(use_small)) {
        // At least one element needs big method
        VTYPE const exp2x = exp(ax + ax);        // exp(2*x)
        y_big = 1.0 - 2.0 / (exp2x + 1.0);       // tanh(x)
    }
    auto const saturated = ax > 350.;
    VTYPE y = select(use_small, y_small, y_big); // choose method
    y = select(saturated, 1.0, y);               // avoid overflow
    return sign_combine(y, x0);                  // get original sign
}
268 | 
// instances of tanh_d template
// One tanh overload per double vector type; each forwards to tanh_d.
// The 256-bit and 512-bit overloads are only compiled when MAX_VECTOR_SIZE
// allows vectors of that size.
static inline Vec2d tanh(Vec2d const x) {
    return tanh_d(x);
}

#if MAX_VECTOR_SIZE >= 256
static inline Vec4d tanh(Vec4d const x) {
    return tanh_d(x);
}
#endif // MAX_VECTOR_SIZE >= 256

#if MAX_VECTOR_SIZE >= 512
static inline Vec8d tanh(Vec8d const x) {
    return tanh_d(x);
}
#endif // MAX_VECTOR_SIZE >= 512
285 | 
286 | 
// Template for tanh function, single precision
// Elements with abs(x) <= 5/8 use a polynomial approximation, the remaining
// elements use 1 - 2/(exp(2*x) + 1). Elements with abs(x) > 44.4 are set to 1
// to avoid overflow. The sign of the input is restored at the end.
// This function does not produce denormals
// Template parameters:
// VTYPE:  float vector type
template<typename VTYPE>
static inline VTYPE tanh_f(VTYPE const x0) {
// NOTE(review): an earlier comment here quoted a limit of abs(x) = 89.0 for
// 0.5*exp(x); this body calls exp(2*x) and saturates above 44.4 instead - confirm.

    // Polynomial coefficients
    const float r0 = -3.33332819422E-1f;
    const float r1 =  1.33314422036E-1f;
    const float r2 = -5.37397155531E-2f;
    const float r3 =  2.06390887954E-2f;
    const float r4 = -5.70498872745E-3f;

    VTYPE const ax = abs(x0);                    // work on abs(x), sign is restored below
    auto const use_small = ax <= 0.625f;         // elements that take the polynomial approximation

    // Each result is only assigned when at least one element needs it
    VTYPE y_small, y_big;

    if (horizontal_or(use_small)) {
        // At least one element needs small method
        VTYPE const ax2 = ax * ax;
        VTYPE const poly = polynomial_4(ax2, r0, r1, r2, r3, r4);
        y_small = mul_add(poly, ax2 * ax, ax);   // x + (x2*x)*poly
    }
    if (!horizontal_and(use_small)) {
        // At least one element needs big method
        VTYPE const exp2x = exp(ax + ax);        // exp(2*x)
        y_big = 1.0f - 2.0f / (exp2x + 1.0f);    // tanh(x)
    }
    auto const saturated = ax > 44.4f;
    VTYPE y = select(use_small, y_small, y_big); // choose method
    y = select(saturated, 1.0f, y);              // avoid overflow
    return sign_combine(y, x0);                  // get original sign
}
325 | 
// instances of tanh_f template
// One tanh overload per float vector type; each forwards to tanh_f.
// The 256-bit and 512-bit overloads are only compiled when MAX_VECTOR_SIZE
// allows vectors of that size.
static inline Vec4f tanh(Vec4f const x) {
    return tanh_f(x);
}

#if MAX_VECTOR_SIZE >= 256
static inline Vec8f tanh(Vec8f const x) {
    return tanh_f(x);
}
#endif // MAX_VECTOR_SIZE >= 256

#if MAX_VECTOR_SIZE >= 512
static inline Vec16f tanh(Vec16f const x) {
    return tanh_f(x);
}
#endif // MAX_VECTOR_SIZE >= 512
342 | 
343 | 
344 | 
345 | /******************************************************************************
346 | *                 Inverse hyperbolic functions
347 | ******************************************************************************/
348 | 
349 | // Template for asinh function, double precision
350 | // This function does not produce denormals
351 | // Template parameters:
352 | // VTYPE:  double vector type
353 | template<typename VTYPE>
354 | static inline VTYPE asinh_d(VTYPE const x0) {
355 | 
356 |     // Coefficients
357 |     const double p0 = -5.56682227230859640450E0;
358 |     const double p1 = -9.09030533308377316566E0;
359 |     const double p2 = -4.37390226194356683570E0;
360 |     const double p3 = -5.91750212056387121207E-1;
361 |     const double p4 = -4.33231683752342103572E-3;
362 | 
363 |     const double q0 =  3.34009336338516356383E1;
364 |     const double q1 =  6.95722521337257608734E1;
365 |     const double q2 =  4.86042483805291788324E1;
366 |     const double q3 =  1.28757002067426453537E1;
367 |     const double q4 =  1.0;
368 | 
369 |     // data vectors
370 |     VTYPE  x, x2, y1, y2;
371 | 
372 |     x2 = x0 * x0;
373 |     x  = abs(x0);
374 |     auto x_small = x <= 0.533;                   // use Pade approximation if abs(x) <= 0.5
375 |     // Both methods give the highest error close to 0.5.
376 |     // This limit is adjusted for minimum error
377 |     auto x_huge  = x > 1.E20;                    // simple approximation, avoid overflow
378 | 
379 |     if (horizontal_or(x_small)) {
380 |         // At least one element needs small method
381 |         y1 = polynomial_4(x2, p0, p1, p2, p3, p4) / polynomial_4(x2, q0, q1, q2, q3, q4);
382 |         y1 = mul_add(y1, x2*x, x);               // y1 = x + (x2*x)*y1;
383 |     }
384 |     if (!horizontal_and(x_small)) {
385 |         // At least one element needs big method
386 |         y2 = log(x + sqrt(x2 + 1.0));
387 |         if (horizontal_or(x_huge)) {
388 |             // At least one element needs huge method to avoid overflow
389 |             y2 = select(x_huge, log(x) + VM_LN2, y2);
390 |         }
391 |     }
392 |     y1 = select(x_small, y1, y2);                // choose method
393 |     y1 = sign_combine(y1, x0);                   // get original sign
394 |     return y1;
395 | }
396 | 
// instances of asinh_d template
// Each overload simply forwards its argument to asinh_d, which is instantiated
// for the corresponding vector type.
static inline Vec2d asinh(Vec2d const x) {
    return asinh_d(x);
}

// Vec4d overload, only compiled when MAX_VECTOR_SIZE >= 256
#if MAX_VECTOR_SIZE >= 256
static inline Vec4d asinh(Vec4d const x) {
    return asinh_d(x);
}
#endif // MAX_VECTOR_SIZE >= 256

// Vec8d overload, only compiled when MAX_VECTOR_SIZE >= 512
#if MAX_VECTOR_SIZE >= 512
static inline Vec8d asinh(Vec8d const x) {
    return asinh_d(x);
}
#endif // MAX_VECTOR_SIZE >= 512
413 | 
414 | 
415 | // Template for asinh function, single precision
416 | // This function does not produce denormals
417 | // Template parameters:
418 | // VTYPE:  double vector type
419 | template<typename VTYPE>
420 | static inline VTYPE asinh_f(VTYPE const x0) {
421 | 
422 |     // Coefficients
423 |     const float r0 = -1.6666288134E-1f;
424 |     const float r1 =  7.4847586088E-2f;
425 |     const float r2 = -4.2699340972E-2f;
426 |     const float r3 =  2.0122003309E-2f;
427 | 
428 |     // data vectors
429 |     VTYPE  x, x2, y1, y2;
430 | 
431 |     x2 = x0 * x0;
432 |     x  = abs(x0);
433 |     auto x_small = x <= 0.51f;                   // use polynomial approximation if abs(x) <= 0.5
434 |     auto x_huge  = x > 1.E10f;                   // simple approximation, avoid overflow
435 | 
436 |     if (horizontal_or(x_small)) {
437 |         // At least one element needs small method
438 |         y1 = polynomial_3(x2, r0, r1, r2, r3);
439 |         y1 = mul_add(y1, x2*x, x);               // y1 = x + (x2*x)*y1;
440 |     }
441 |     if (!horizontal_and(x_small)) {
442 |         // At least one element needs big method
443 |         y2 = log(x + sqrt(x2 + 1.0f));
444 |         if (horizontal_or(x_huge)) {
445 |             // At least one element needs huge method to avoid overflow
446 |             y2 = select(x_huge, log(x) + (float)VM_LN2, y2);
447 |         }
448 |     }
449 |     y1 = select(x_small, y1, y2);                // choose method
450 |     y1 = sign_combine(y1, x0);                   // get original sign
451 |     return y1;
452 | }
453 | 
// instances of asinh_f template
// Each overload simply forwards its argument to asinh_f, which is instantiated
// for the corresponding vector type.
static inline Vec4f asinh(Vec4f const x) {
    return asinh_f(x);
}

// Vec8f overload, only compiled when MAX_VECTOR_SIZE >= 256
#if MAX_VECTOR_SIZE >= 256
static inline Vec8f asinh(Vec8f const x) {
    return asinh_f(x);
}
#endif // MAX_VECTOR_SIZE >= 256

// Vec16f overload, only compiled when MAX_VECTOR_SIZE >= 512
#if MAX_VECTOR_SIZE >= 512
static inline Vec16f asinh(Vec16f const x) {
    return asinh_f(x);
}
#endif // MAX_VECTOR_SIZE >= 512
470 | 
471 | 
472 | // Template for acosh function, double precision
473 | // This function does not produce denormals
474 | // Template parameters:
475 | // VTYPE:  double vector type
476 | template<typename VTYPE>
477 | static inline VTYPE acosh_d(VTYPE const x0) {
478 | 
479 |     // Coefficients
480 |     const double p0 = 1.10855947270161294369E5;
481 |     const double p1 = 1.08102874834699867335E5;
482 |     const double p2 = 3.43989375926195455866E4;
483 |     const double p3 = 3.94726656571334401102E3;
484 |     const double p4 = 1.18801130533544501356E2;
485 | 
486 |     const double q0 = 7.83869920495893927727E4;
487 |     const double q1 = 8.29725251988426222434E4;
488 |     const double q2 = 2.97683430363289370382E4;
489 |     const double q3 = 4.15352677227719831579E3;
490 |     const double q4 = 1.86145380837903397292E2;
491 |     const double q5 = 1.0;
492 | 
493 |     // data vectors
494 |     VTYPE  x1, y1, y2;
495 | 
496 |     x1      = x0 - 1.0;
497 |     auto undef   = x0 < 1.0;                     // result is NAN
498 |     auto x_small = x1 < 0.49;                    // use Pade approximation if abs(x-1) < 0.5
499 |     auto x_huge  = x1 > 1.E20;                   // simple approximation, avoid overflow
500 | 
501 |     if (horizontal_or(x_small)) {
502 |         // At least one element needs small method
503 |         y1 = sqrt(x1) * (polynomial_4(x1, p0, p1, p2, p3, p4) / polynomial_5(x1, q0, q1, q2, q3, q4, q5));
504 |         // x < 1 generates NAN
505 |         y1 = select(undef, nan_vec<VTYPE>(NAN_HYP), y1);
506 |     }
507 |     if (!horizontal_and(x_small)) {
508 |         // At least one element needs big method
509 |         y2 = log(x0 + sqrt(mul_sub(x0,x0,1.0)));
510 |         if (horizontal_or(x_huge)) {
511 |             // At least one element needs huge method to avoid overflow
512 |             y2 = select(x_huge, log(x0) + VM_LN2, y2);
513 |         }
514 |     }
515 |     y1 = select(x_small, y1, y2);                // choose method
516 |     return y1;
517 | }
518 | 
// instances of acosh_d template
// Each overload simply forwards its argument to acosh_d, which is instantiated
// for the corresponding vector type.
static inline Vec2d acosh(Vec2d const x) {
    return acosh_d(x);
}

// Vec4d overload, only compiled when MAX_VECTOR_SIZE >= 256
#if MAX_VECTOR_SIZE >= 256
static inline Vec4d acosh(Vec4d const x) {
    return acosh_d(x);
}
#endif // MAX_VECTOR_SIZE >= 256

// Vec8d overload, only compiled when MAX_VECTOR_SIZE >= 512
#if MAX_VECTOR_SIZE >= 512
static inline Vec8d acosh(Vec8d const x) {
    return acosh_d(x);
}
#endif // MAX_VECTOR_SIZE >= 512
535 | 
536 | 
537 | // Template for acosh function, single precision
538 | // This function does not produce denormals
539 | // Template parameters:
540 | // VTYPE:  double vector type
541 | template<typename VTYPE>
542 | static inline VTYPE acosh_f(VTYPE const x0) {
543 | 
544 |     // Coefficients
545 |     const float r0 =  1.4142135263E0f;
546 |     const float r1 = -1.1784741703E-1f;
547 |     const float r2 =  2.6454905019E-2f;
548 |     const float r3 = -7.5272886713E-3f;
549 |     const float r4 =  1.7596881071E-3f;
550 | 
551 |     // data vectors
552 |     VTYPE  x1, y1, y2;
553 | 
554 |     x1      = x0 - 1.0f;
555 |     auto undef   = x0 < 1.0f;                    // result is NAN
556 |     auto x_small = x1 < 0.49f;                   // use Pade approximation if abs(x-1) < 0.5
557 |     auto x_huge  = x1 > 1.E10f;                  // simple approximation, avoid overflow
558 | 
559 |     if (horizontal_or(x_small)) {
560 |         // At least one element needs small method
561 |         y1 = sqrt(x1) * polynomial_4(x1, r0, r1, r2, r3, r4);
562 |         // x < 1 generates NAN
563 |         y1 = select(undef, nan_vec<VTYPE>(NAN_HYP), y1);
564 |     }
565 |     if (!horizontal_and(x_small)) {
566 |         // At least one element needs big method
567 |         y2 = log(x0 + sqrt(mul_sub(x0,x0,1.0)));
568 |         if (horizontal_or(x_huge)) {
569 |             // At least one element needs huge method to avoid overflow
570 |             y2 = select(x_huge, log(x0) + (float)VM_LN2, y2);
571 |         }
572 |     }
573 |     y1 = select(x_small, y1, y2);                // choose method
574 |     return y1;
575 | }
576 | 
// instances of acosh_f template
// Each overload simply forwards its argument to acosh_f, which is instantiated
// for the corresponding vector type.
static inline Vec4f acosh(Vec4f const x) {
    return acosh_f(x);
}

// Vec8f overload, only compiled when MAX_VECTOR_SIZE >= 256
#if MAX_VECTOR_SIZE >= 256
static inline Vec8f acosh(Vec8f const x) {
    return acosh_f(x);
}
#endif // MAX_VECTOR_SIZE >= 256

// Vec16f overload, only compiled when MAX_VECTOR_SIZE >= 512
#if MAX_VECTOR_SIZE >= 512
static inline Vec16f acosh(Vec16f const x) {
    return acosh_f(x);
}
#endif // MAX_VECTOR_SIZE >= 512
593 | 
594 | 
595 | // Template for atanh function, double precision
596 | // This function does not produce denormals
597 | // Template parameters:
598 | // VTYPE:  double vector type
599 | template<typename VTYPE>
600 | static inline VTYPE atanh_d(VTYPE const x0) {
601 | 
602 |     // Coefficients
603 |     const double p0 = -3.09092539379866942570E1;
604 |     const double p1 =  6.54566728676544377376E1;
605 |     const double p2 = -4.61252884198732692637E1;
606 |     const double p3 =  1.20426861384072379242E1;
607 |     const double p4 = -8.54074331929669305196E-1;
608 | 
609 |     const double q0 = -9.27277618139601130017E1;
610 |     const double q1 =  2.52006675691344555838E2;
611 |     const double q2 = -2.49839401325893582852E2;
612 |     const double q3 =  1.08938092147140262656E2;
613 |     const double q4 = -1.95638849376911654834E1;
614 |     const double q5 =  1.0;
615 | 
616 |     // data vectors
617 |     VTYPE  x, x2, y1, y2, y3;
618 | 
619 |     x  = abs(x0);
620 |     auto x_small = x < 0.5;                      // use Pade approximation if abs(x) < 0.5
621 | 
622 |     if (horizontal_or(x_small)) {
623 |         // At least one element needs small method
624 |         x2 = x * x;
625 |         y1 = polynomial_4(x2, p0, p1, p2, p3, p4) / polynomial_5(x2, q0, q1, q2, q3, q4, q5);
626 |         y1 = mul_add(y1, x2*x, x);
627 |     }
628 |     if (!horizontal_and(x_small)) {
629 |         // At least one element needs big method
630 |         y2 = log((1.0+x)/(1.0-x)) * 0.5;
631 |         // check if out of range
632 |         y3 = select(x == 1.0, infinite_vec<VTYPE>(), nan_vec<VTYPE>(NAN_HYP));
633 |         y2 = select(x >= 1.0, y3, y2);
634 |     }
635 |     y1 = select(x_small, y1, y2);                // choose method
636 |     y1 = sign_combine(y1, x0);                   // get original sign
637 |     return y1;
638 | }
639 | 
// instances of atanh_d template
// Each overload simply forwards its argument to atanh_d, which is instantiated
// for the corresponding vector type.
static inline Vec2d atanh(Vec2d const x) {
    return atanh_d(x);
}

// Vec4d overload, only compiled when MAX_VECTOR_SIZE >= 256
#if MAX_VECTOR_SIZE >= 256
static inline Vec4d atanh(Vec4d const x) {
    return atanh_d(x);
}
#endif // MAX_VECTOR_SIZE >= 256

// Vec8d overload, only compiled when MAX_VECTOR_SIZE >= 512
#if MAX_VECTOR_SIZE >= 512
static inline Vec8d atanh(Vec8d const x) {
    return atanh_d(x);
}
#endif // MAX_VECTOR_SIZE >= 512
656 | 
657 | 
658 | // Template for atanh function, single precision
659 | // This function does not produce denormals
660 | // Template parameters:
661 | // VTYPE:  double vector type
662 | template<typename VTYPE>
663 | static inline VTYPE atanh_f(VTYPE const x0) {
664 | 
665 |     // Coefficients
666 |     const float r0 = 3.33337300303E-1f;
667 |     const float r1 = 1.99782164500E-1f;
668 |     const float r2 = 1.46691431730E-1f;
669 |     const float r3 = 8.24370301058E-2f;
670 |     const float r4 = 1.81740078349E-1f;
671 | 
672 |     // data vectors
673 |     VTYPE  x, x2, y1, y2, y3;
674 | 
675 |     x  = abs(x0);
676 |     auto x_small = x < 0.5f;                     // use polynomial approximation if abs(x) < 0.5
677 | 
678 |     if (horizontal_or(x_small)) {
679 |         // At least one element needs small method
680 |         x2 = x * x;
681 |         y1 = polynomial_4(x2, r0, r1, r2, r3, r4);
682 |         y1 = mul_add(y1, x2*x, x);
683 |     }
684 |     if (!horizontal_and(x_small)) {
685 |         // At least one element needs big method
686 |         y2 = log((1.0f+x)/(1.0f-x)) * 0.5f;
687 |         // check if out of range
688 |         y3 = select(x == 1.0f, infinite_vec<VTYPE>(), nan_vec<VTYPE>(NAN_HYP));
689 |         y2 = select(x >= 1.0f, y3, y2);
690 |     }
691 |     y1 = select(x_small, y1, y2);                // choose method
692 |     y1 = sign_combine(y1, x0);                   // get original sign
693 |     return y1;
694 | }
695 | 
// instances of atanh_f template
// Each overload simply forwards its argument to atanh_f, which is instantiated
// for the corresponding vector type.
static inline Vec4f atanh(Vec4f const x) {
    return atanh_f(x);
}

// Vec8f overload, only compiled when MAX_VECTOR_SIZE >= 256
#if MAX_VECTOR_SIZE >= 256
static inline Vec8f atanh(Vec8f const x) {
    return atanh_f(x);
}
#endif // MAX_VECTOR_SIZE >= 256

// Vec16f overload, only compiled when MAX_VECTOR_SIZE >= 512
#if MAX_VECTOR_SIZE >= 512
static inline Vec16f atanh(Vec16f const x) {
    return atanh_f(x);
}
#endif // MAX_VECTOR_SIZE >= 512
712 | 
713 | #ifdef VCL_NAMESPACE
714 | }
715 | #endif
716 | 
717 | #endif
718 | 


--------------------------------------------------------------------------------
/src/vsTCanny.cpp:
--------------------------------------------------------------------------------
  1 | #include <cmath>
  2 | #include <string>
  3 | 
  4 | #include "vsTCanny.h"
  5 | 
  6 | AVS_FORCEINLINE void* aligned_malloc(size_t size, size_t align)
  7 | {
  8 |     void* result = [&]() {
  9 | #ifdef _WIN32 
 10 |         return _aligned_malloc(size, align);
 11 | #else
 12 |         if (posix_memalign(&result, align, size))
 13 |             return result = nullptr;
 14 |         else
 15 |             return result;
 16 | #endif
 17 |     }();
 18 | 
 19 |     return result;
 20 | }
 21 | 
// Releases a block using the deallocator that matches the platform branch of
// aligned_malloc(): _aligned_free() on Windows, free() everywhere else.
AVS_FORCEINLINE void aligned_free(void* ptr)
{
#ifdef _WIN32 
    _aligned_free(ptr);
#else
    free(ptr);
#endif
}
 30 | 
 31 | template<typename T>
 32 | static void copyPlane(const T* srcp, float* __restrict dstp, const int width, const int height, const int srcStride, const int dstStride) noexcept
 33 | {
 34 |     for (int y{ 0 }; y < height; ++y)
 35 |     {
 36 |         for (int x{ 0 }; x < width; ++x)
 37 |             dstp[x] = srcp[x];
 38 | 
 39 |         srcp += srcStride;
 40 |         dstp += dstStride;
 41 |     }
 42 | }
 43 | 
 44 | template<typename T>
 45 | static void gaussianBlur(const T* _srcp, float* __restrict temp, float* __restrict dstp, const float* weightsH, const float* weightsV, const int width, const int height, const int srcStride, const int dstStride, const int radiusH, const int radiusV) noexcept
 46 | {
 47 |     const int diameter{ radiusV * 2 + 1 };
 48 |     std::unique_ptr<const T* []> srcp{ std::make_unique<const T* []>(diameter) };
 49 | 
 50 |     srcp[radiusV] = _srcp;
 51 |     for (int i{ 1 }; i <= radiusV; ++i)
 52 |         srcp[radiusV - i] = srcp[radiusV + i] = srcp[radiusV] + srcStride * i;
 53 | 
 54 |     weightsH += radiusH;
 55 | 
 56 |     for (int y{ 0 }; y < height; ++y)
 57 |     {
 58 |         for (int x{ 0 }; x < width; ++x)
 59 |         {
 60 |             float sum{ 0.0f };
 61 | 
 62 |             for (int i{ 0 }; i < diameter; ++i)
 63 |                 sum += srcp[i][x] * weightsV[i];
 64 | 
 65 |             temp[x] = sum;
 66 |         }
 67 | 
 68 |         for (int i{ 1 }; i <= radiusH; ++i)
 69 |         {
 70 |             temp[-i] = temp[i];
 71 |             temp[width - 1 + i] = temp[width - 1 - i];
 72 |         }
 73 | 
 74 |         for (int x{ 0 }; x < width; ++x)
 75 |         {
 76 |             float sum{ 0.0f };
 77 | 
 78 |             for (int i = -radiusH; i <= radiusH; ++i)
 79 |                 sum += temp[x + i] * weightsH[i];
 80 | 
 81 |             dstp[x] = sum;
 82 |         }
 83 | 
 84 |         for (int i{ 0 }; i < diameter - 1; ++i)
 85 |             srcp[i] = srcp[i + 1];
 86 | 
 87 |         srcp[diameter - 1] += (y < height - 1 - radiusV) ? srcStride : -srcStride;;
 88 | 
 89 |         dstp += dstStride;
 90 |     }
 91 | }
 92 | 
 93 | template<typename T>
 94 | static void gaussianBlurV(const T* _srcp, float* __restrict dstp, const float* weights, const int width, const int height, const int srcStride, const int dstStride, const int radius) noexcept
 95 | {
 96 |     const int diameter{ radius * 2 + 1 };
 97 |     std::unique_ptr<const T* []> srcp{ std::make_unique<const T* []>(diameter) };
 98 | 
 99 |     srcp[radius] = _srcp;
100 |     for (int i{ 1 }; i <= radius; ++i)
101 |         srcp[radius - i] = srcp[radius + i] = srcp[radius] + srcStride * i;
102 | 
103 |     for (int y{ 0 }; y < height; ++y)
104 |     {
105 |         for (int x{ 0 }; x < width; ++x)
106 |         {
107 |             float sum{ 0.0f };
108 | 
109 |             for (int i{ 0 }; i < diameter; ++i)
110 |                 sum += srcp[i][x] * weights[i];
111 | 
112 |             dstp[x] = sum;
113 |         }
114 | 
115 |         for (int i{ 0 }; i < diameter - 1; ++i)
116 |             srcp[i] = srcp[i + 1];
117 | 
118 |         srcp[diameter - 1] += (y < height - 1 - radius) ? srcStride : -srcStride;
119 | 
120 |         dstp += dstStride;
121 |     }
122 | }
123 | 
// Horizontal-only blur with 2 * radius + 1 taps taken from `weights`.
// Each source row is first converted to float in `temp`, its left and right
// borders are mirrored (the border sample itself is not repeated), and the
// row is then convolved into `dstp`.
// `temp` must be writable from temp[-radius] to temp[width - 1 + radius].
// Strides are counted in elements of the respective buffer.
template<typename T>
static void gaussianBlurH(const T* srcp, float* __restrict temp, float* __restrict dstp, const float* weights, const int width, const int height, const int srcStride, const int dstStride, const int radius) noexcept
{
    const int taps{ radius * 2 + 1 };

    for (int line{ 0 }; line < height; ++line, srcp += srcStride, dstp += dstStride)
    {
        // load the row as float
        for (int col{ 0 }; col < width; ++col)
            temp[col] = srcp[col];

        // mirror the left and right borders
        for (int d{ 1 }; d <= radius; ++d)
        {
            temp[-d] = temp[d];
            temp[width - 1 + d] = temp[width - 1 - d];
        }

        // convolve; window[t] is the sample under tap t
        for (int col{ 0 }; col < width; ++col)
        {
            const float* const window{ temp + col - radius };
            float acc{ 0.0f };

            for (int t{ 0 }; t < taps; ++t)
                acc += window[t] * weights[t];

            dstp[col] = acc;
        }
    }
}
154 | 
155 | static void detectEdge(float* __restrict blur, float* __restrict gradient, int* __restrict direction, const int width, const int height, const int stride, const int bgStride, const int mode, const int op, const float scale) noexcept
156 | {
157 |     float* __restrict cur{ blur };
158 |     float* __restrict next{ blur + bgStride };
159 |     float* __restrict next2{ blur + bgStride * 2 };
160 |     float* __restrict prev{ next };
161 |     float* __restrict prev2{ next2 };
162 | 
163 |     cur[-1] = cur[1];
164 |     cur[width] = cur[width - 2];
165 | 
166 |     if (op == FDOG)
167 |     {
168 |         cur[-2] = cur[2];
169 |         cur[width + 1] = cur[width - 3];
170 |     }
171 | 
172 |     for (int y{ 0 }; y < height; ++y)
173 |     {
174 |         next[-1] = next[1];
175 |         next[width] = next[width - 2];
176 | 
177 |         if (op == FDOG)
178 |         {
179 |             next[-2] = next[2];
180 |             next[width + 1] = next[width - 3];
181 | 
182 |             next2[-1] = next2[1];
183 |             next2[-2] = next2[2];
184 |             next2[width] = next2[width - 2];
185 |             next2[width + 1] = next2[width - 3];
186 |         }
187 | 
188 |         for (int x{ 0 }; x < width; ++x)
189 |         {
190 |             float gx, gy;
191 | 
192 |             if (op != FDOG)
193 |             {
194 |                 const float c1{ prev[x - 1] };
195 |                 const float c2{ prev[x] };
196 |                 const float c3{ prev[x + 1] };
197 |                 const float c4{ cur[x - 1] };
198 |                 const float c6{ cur[x + 1] };
199 |                 const float c7{ next[x - 1] };
200 |                 const float c8{ next[x] };
201 |                 const float c9{ next[x + 1] };
202 | 
203 |                 switch (op)
204 |                 {
205 |                     case TRITICAL:
206 |                     {
207 |                         gx = c6 - c4;
208 |                         gy = c2 - c8;
209 |                         break;
210 |                     }
211 |                     case PREWITT:
212 |                     {
213 |                         gx = (c3 + c6 + c9 - c1 - c4 - c7) / 2.0f;
214 |                         gy = (c1 + c2 + c3 - c7 - c8 - c9) / 2.0f;
215 |                         break;
216 |                     }
217 |                     case SOBEL:
218 |                     {
219 |                         gx = c3 + 2.0f * c6 + c9 - c1 - 2.0f * c4 - c7;
220 |                         gy = c1 + 2.0f * c2 + c3 - c7 - 2.0f * c8 - c9;
221 |                         break;
222 |                     }
223 |                     case SCHARR:
224 |                     {
225 |                         gx = 3.0f * c3 + 10.0f * c6 + 3.0f * c9 - 3.0f * c1 - 10.0f * c4 - 3.0f * c7;
226 |                         gy = 3.0f * c1 + 10.0f * c2 + 3.0f * c3 - 3.0f * c7 - 10.0f * c8 - 3.0f * c9;
227 |                         break;
228 |                     }
229 |                     case KROON:
230 |                     {
231 |                         gx = 17.0f * c3 + 61.0f * c6 + 17.0f * c9 - 17.0f * c1 - 61.0f * c4 - 17.0f * c7;
232 |                         gy = 17.0f * c1 + 61.0f * c2 + 17.0f * c3 - 17.0f * c7 - 61.0f * c8 - 17.0f * c9;
233 |                         break;
234 |                     }
235 |                     case KIRSCH:
236 |                     {
237 |                         const float g1{ 5.0f * c1 + 5.0f * c2 + 5.0f * c3 - 3.0f * c4 - 3.0f * c6 - 3.0f * c7 - 3.0f * c8 - 3.0f * c9 };
238 |                         const float g2{ 5.0f * c1 + 5.0f * c2 - 3.0f * c3 + 5.0f * c4 - 3.0f * c6 - 3.0f * c7 - 3.0f * c8 - 3.0f * c9 };
239 |                         const float g3{ 5.0f * c1 - 3.0f * c2 - 3.0f * c3 + 5.0f * c4 - 3.0f * c6 + 5.0f * c7 - 3.0f * c8 - 3.0f * c9 };
240 |                         const float g4{ -3.0f * c1 - 3.0f * c2 - 3.0f * c3 + 5.0f * c4 - 3.0f * c6 + 5.0f * c7 + 5.0f * c8 - 3.0f * c9 };
241 |                         const float g5{ -3.0f * c1 - 3.0f * c2 - 3.0f * c3 - 3.0f * c4 - 3.0f * c6 + 5.0f * c7 + 5.0f * c8 + 5.0f * c9 };
242 |                         const float g6{ -3.0f * c1 - 3.0f * c2 - 3.0f * c3 - 3.0f * c4 + 5.0f * c6 - 3.0f * c7 + 5.0f * c8 + 5.0f * c9 };
243 |                         const float g7{ -3.0f * c1 - 3.0f * c2 + 5.0f * c3 - 3.0f * c4 + 5.0f * c6 - 3.0f * c7 - 3.0f * c8 + 5.0f * c9 };
244 |                         const float g8{ -3.0f * c1 + 5.0f * c2 + 5.0f * c3 - 3.0f * c4 + 5.0f * c6 - 3.0f * c7 - 3.0f * c8 - 3.0f * c9 };
245 |                         const float g{ std::max({ std::abs(g1), std::abs(g2), std::abs(g3), std::abs(g4), std::abs(g5), std::abs(g6), std::abs(g7), std::abs(g8) }) };
246 |                         gradient[x] = g * scale;
247 |                         break;
248 |                     }
249 |                 }
250 |             }
251 |             else
252 |             {
253 |                 const float c1{ prev2[x - 2] };
254 |                 const float c2{ prev2[x - 1] };
255 |                 const float c3{ prev2[x] };
256 |                 const float c4{ prev2[x + 1] };
257 |                 const float c5{ prev2[x + 2] };
258 |                 const float c6{ prev[x - 2] };
259 |                 const float c7{ prev[x - 1] };
260 |                 const float c8{ prev[x] };
261 |                 const float c9{ prev[x + 1] };
262 |                 const float c10{ prev[x + 2] };
263 |                 const float c11{ cur[x - 2] };
264 |                 const float c12{ cur[x - 1] };
265 |                 const float c14{ cur[x + 1] };
266 |                 const float c15{ cur[x + 2] };
267 |                 const float c16{ next[x - 2] };
268 |                 const float c17{ next[x - 1] };
269 |                 const float c18{ next[x] };
270 |                 const float c19{ next[x + 1] };
271 |                 const float c20{ next[x + 2] };
272 |                 const float c21{ next2[x - 2] };
273 |                 const float c22{ next2[x - 1] };
274 |                 const float c23{ next2[x] };
275 |                 const float c24{ next2[x + 1] };
276 |                 const float c25{ next2[x + 2] };
277 | 
278 |                 gx = c5 + 2.0f * c10 + 3.0f * c15 + 2.0f * c20 + c25 + c4 + 2.0f * c9 + 3.0f * c14 + 2.0f * c19 + c24
279 |                     - c2 - 2.0f * c7 - 3.0f * c12 - 2.0f * c17 - c22 - c1 - 2.0f * c6 - 3.0f * c11 - 2.0f * c16 - c21;
280 |                 gy = c1 + 2.0f * c2 + 3.0f * c3 + 2.0f * c4 + c5 + c6 + 2.0f * c7 + 3.0f * c8 + 2.0f * c9 + c10
281 |                     - c16 - 2.0f * c17 - 3.0f * c18 - 2.0f * c19 - c20 - c21 - 2.0f * c22 - 3.0f * c23 - 2.0f * c24 - c25;
282 |             }
283 | 
284 |             if (op != KIRSCH)
285 |             {
286 |                 gx *= scale;
287 |                 gy *= scale;
288 |                 gradient[x] = std::sqrt(gx * gx + gy * gy);
289 |             }
290 | 
291 |             if (mode == 0)
292 |             {
293 |                 float dr{ std::atan2(gy, gx) };
294 | 
295 |                 if (dr < 0.0f)
296 |                     dr += M_PIF;
297 | 
298 |                 const int bin{ static_cast<int>(dr * 4.0f * M_1_PIF + 0.5f) };
299 |                 direction[x] = (bin >= 4) ? 0 : bin;
300 |             }
301 |         }
302 | 
303 |         prev2 = prev;
304 |         prev = cur;
305 |         cur = next;
306 | 
307 |         if (op != FDOG)
308 |             next += (y < height - 2) ? bgStride : -bgStride;
309 |         else
310 |         {
311 |             next = next2;
312 |             next2 += (y < height - 3) ? bgStride : -bgStride;
313 |         }
314 | 
315 |         gradient += bgStride;
316 |         direction += stride;
317 |     }
318 | }
319 | 
320 | static void nonMaximumSuppression(const int* direction, float* __restrict gradient, float* __restrict blur, const int width, const int height, const int stride, const int bgStride, const int radiusAlign) noexcept
321 | {
322 |     const int offsets[]{ 1, -bgStride + 1, -bgStride, -bgStride - 1 };
323 | 
324 |     gradient[-1] = gradient[1];
325 |     gradient[-1 + bgStride * (height - 1)] = gradient[1 + bgStride * (height - 1)];
326 |     gradient[width] = gradient[width - 2];
327 |     gradient[width + bgStride * (height - 1)] = gradient[width - 2 + bgStride * (height - 1)];
328 |     std::copy_n(gradient - radiusAlign + bgStride, width + radiusAlign * 2, gradient - radiusAlign - bgStride);
329 |     std::copy_n(gradient - radiusAlign + bgStride * (height - 2), width + radiusAlign * 2, gradient - radiusAlign + bgStride * height);
330 | 
331 |     for (int y{ 0 }; y < height; ++y)
332 |     {
333 |         for (int x{ 0 }; x < width; ++x)
334 |         {
335 |             const int offset{ offsets[direction[x]] };
336 |             blur[x] = (gradient[x] >= std::max(gradient[x + offset], gradient[x - offset])) ? gradient[x] : fltLowest;
337 |         }
338 | 
339 |         direction += stride;
340 |         gradient += bgStride;
341 |         blur += bgStride;
342 |     }
343 | }
344 | 
345 | template<typename T>
346 | static void binarizeCE(const float* srcp, T* __restrict dstp, const int width, const int height, const int srcStride, const int dstStride, const int peak) noexcept
347 | {
348 |     for (int y{ 0 }; y < height; ++y)
349 |     {
350 |         for (int x{ 0 }; x < width; ++x)
351 |         {
352 |             if constexpr (std::is_integral_v<T>)
353 |                 dstp[x] = (srcp[x] == fltMax) ? static_cast<T>(peak) : 0;
354 |             else
355 |                 dstp[x] = (srcp[x] == fltMax) ? 1.0f : 0.0f;
356 |         }
357 | 
358 |         srcp += srcStride;
359 |         dstp += dstStride;
360 |     }
361 | }
362 | 
// Converts a float plane to the output sample type.
//   integer T            : round half up (add 0.5, truncate) and cap at `peak`
//   float T, clampFP     : clamp to [0, 1]
//   float T, !clampFP    : copy unchanged
// Strides are counted in elements of the respective buffer.
template<typename T, bool clampFP = true>
static void discretizeGM(const float* srcp, T* __restrict dstp, const int width, const int height, const int srcStride, const int dstStride, const int peak) noexcept
{
    // per-sample conversion; the branch is resolved at compile time
    const auto convert = [&](const float value) noexcept -> T
    {
        if constexpr (std::is_integral_v<T>)
            return static_cast<T>(std::min(static_cast<int>(value + 0.5f), peak));
        else if constexpr (clampFP)
            return std::clamp(value, 0.0f, 1.0f);
        else
            return value;
    };

    for (int row{ 0 }; row < height; ++row, srcp += srcStride, dstp += dstStride)
        for (int col{ 0 }; col < width; ++col)
            dstp[col] = convert(srcp[col]);
}
382 | 
383 | template<typename T>
384 | void vsTCanny::filter_c(PVideoFrame& src, PVideoFrame& dst, IScriptEnvironment* env) noexcept
385 | {
386 |     const int planes_y[3]{ PLANAR_Y, PLANAR_U, PLANAR_V };
387 |     const int planes_r[3]{ PLANAR_G, PLANAR_B, PLANAR_R };
388 |     const int* current_planes{ (vi.IsRGB()) ? planes_r : planes_y };
389 |     const int planecount{ std::min(vi.NumComponents(), 3) };
390 | 
391 |     for (int i{ 0 }; i < planecount; ++i)
392 |     {
393 |         const int height{ src->GetHeight(current_planes[i]) };
394 | 
395 |         if (process[i] == 3)
396 |         {
397 |             const size_t stride{ src->GetPitch(current_planes[i]) / sizeof(T) };
398 |             const size_t bgStride{ stride + radiusAlign * 2 };
399 |             const size_t dst_stride{ dst->GetPitch(current_planes[i]) / sizeof(T) };
400 |             const size_t width{ src->GetRowSize(current_planes[i]) / sizeof(T) };
401 |             const T* srcp{ reinterpret_cast<const T*>(src->GetReadPtr(current_planes[i])) };
402 |             T* dstp{ reinterpret_cast<T*>(dst->GetWritePtr(current_planes[i])) };
403 | 
404 |             float* blur{ vsTCanny::blur + radiusAlign };
405 |             float* gradient{ vsTCanny::gradient + bgStride + radiusAlign };
406 | 
407 |             if (radiusV[i] && radiusH[i])
408 |                 gaussianBlur(srcp, gradient, blur, weightsH[i].get(), weightsV[i].get(), width, height, stride, bgStride, radiusH[i], radiusV[i]);
409 |             else if (radiusV[i])
410 |                 gaussianBlurV(srcp, blur, weightsV[i].get(), width, height, stride, bgStride, radiusV[i]);
411 |             else if (radiusH[i])
412 |                 gaussianBlurH(srcp, gradient, blur, weightsH[i].get(), width, height, stride, bgStride, radiusH[i]);
413 |             else
414 |                 copyPlane(srcp, blur, width, height, stride, bgStride);
415 | 
416 |             if (mode_ != -1)
417 |             {
418 |                 detectEdge(blur, gradient, direction, width, height, stride, bgStride, mode_, op_, scale);
419 | 
420 |                 if (mode_ == 0)
421 |                 {
422 |                     nonMaximumSuppression(direction, gradient, blur, width, height, stride, bgStride, radiusAlign);
423 |                     hysteresis(blur, found.get(), width, height, bgStride, t_h_, t_l_);
424 |                 }
425 |             }
426 | 
427 |             switch (mode_)
428 |             {
429 |                 case 0: binarizeCE(blur, dstp, width, height, bgStride, dst_stride, peak); break;
430 |                 case 1: discretizeGM(gradient, dstp, width, height, bgStride, stride, peak); break;
431 |                 default: discretizeGM<T, false>(blur, dstp, width, height, bgStride, dst_stride, peak); break;
432 |             }
433 |         }
434 |         else if (process[i] == 2)
435 |             env->BitBlt(dst->GetWritePtr(current_planes[i]), dst->GetPitch(current_planes[i]), src->GetReadPtr(current_planes[i]), src->GetPitch(current_planes[i]), src->GetRowSize(current_planes[i]), height);
436 |     }
437 | }
438 | 
// Builds a normalised 1-D Gaussian kernel for `sigma`.
//
// The kernel reaches 3 * sigma (rounded to nearest) taps to each side of the
// centre, but never fewer than one; that half-width is written to `radius`.
// Returns a new[]-allocated array of 2 * radius + 1 weights whose sum is 1
// (up to rounding). Ownership passes to the caller.
static float* gaussianWeights(const float sigma, int& radius) noexcept
{
    radius = std::max(static_cast<int>(sigma * 3.0f + 0.5f), 1);
    const int taps{ 2 * radius + 1 };

    float* kernel{ new float[taps]() };
    float total{ 0.0f };

    // Sample the unnormalised Gaussian at integer offsets -radius..radius.
    for (int i{ 0 }; i < taps; ++i)
    {
        const int offset{ i - radius };
        kernel[i] = std::exp(-(offset * offset) / (2.0f * sigma * sigma));
        total += kernel[i];
    }

    // Scale so the weights add up to one and the blur preserves brightness.
    for (float* w{ kernel }; w != kernel + taps; ++w)
        *w /= total;

    return kernel;
}
459 | 
460 | vsTCanny::vsTCanny(PClip _child, float sigmaY, float sigmaU, float sigmaV, float sigma_vY, float sigma_vU, float sigma_vV, float t_h, float t_l, int mode, int op, float scale_, int y, int u, int v, int opt, IScriptEnvironment* env)
461 |     : GenericVideoFilter(_child), t_h_(t_h), t_l_(t_l), mode_(mode), op_(op), scale(scale_), process{ 0, 0, 0 }, radiusH{ 0, 0, 0 }, radiusV{ 0, 0, 0 }
462 | {
463 |     if (!vi.IsPlanar())
464 |         env->ThrowError("vsTCanny: the clip is not in planar format.");
465 | 
466 |     const int height{ vi.height };
467 |     const int width{ vi.width };
468 | 
469 |     if (height < 3)
470 |         env->ThrowError("vsTCanny: the clip's height must be at least 3.");
471 |     if (t_l_ >= t_h_)
472 |         env->ThrowError("vsTCanny: t_h must be greater than t_l.");
473 |     if (mode_ < -1 || mode_ > 1)
474 |         env->ThrowError("vsTCanny: mode must be -1, 0, or 1.");
475 |     if (op_ < 0 || op_ > 6)
476 |         env->ThrowError("vsTCanny: op must be 0, 1, 2, 3, 4, 5 or 6.");
477 |     if (op_ == 5 && mode == 0)
478 |         env->ThrowError("vsTCanny: op=5 cannot be used when mode=0.");
479 |     if (scale <= 0.0f)
480 |         env->ThrowError("vsTCanny: scale must be greater than 0.0.");
481 |     if (opt < -1 || opt > 3)
482 |         env->ThrowError("vsTCanny: opt must be between -1..3.");
483 | 
484 |     const bool avx512{ !!(env->GetCPUFlags() & CPUF_AVX512F) };
485 |     const bool avx2{ !!(env->GetCPUFlags() & CPUF_AVX2) };
486 |     const bool sse2{ !!(env->GetCPUFlags() & CPUF_SSE2) };
487 | 
488 |     if (!avx512 && opt == 3)
489 |         env->ThrowError("vsTCanny: opt=3 requires AVX512.");
490 |     if (!avx2 && opt == 2)
491 |         env->ThrowError("vsTCanny: opt=2 requires AVX2.");
492 |     if (!sse2 && opt == 1)
493 |         env->ThrowError("vsTCanny: opt=1 requires SSE2.");
494 | 
495 |     const bool rgb{ vi.IsRGB() };
496 |     int sw{ 0 };
497 |     int sh{ 0 };
498 |     const int planecount{ std::min(vi.NumComponents(), 3) };
499 | 
500 |     if (planecount > 1)
501 |     {
502 |         sw = vi.GetPlaneWidthSubsampling((rgb) ? PLANAR_R : PLANAR_U);
503 |         sh = vi.GetPlaneHeightSubsampling((rgb) ? PLANAR_R : PLANAR_U);
504 | 
505 |         if (sigmaU == -1354.4f)
506 |             sigmaU = (rgb) ? sigmaY : (sigmaY / (1 << sw));
507 |         if (sigmaV == -1354.4f)
508 |             sigmaV = sigmaU;
509 |         if (sigma_vY == -1354.4f)
510 |         {
511 |             sigma_vY = sigmaY;
512 | 
513 |             if (sigma_vU == -1354.4f)
514 |                 sigma_vU = (sw == sh) ? sigmaU : (sigmaU * (1 << sw));
515 |         }
516 |         else
517 |         {
518 |             if (sigma_vU == -1354.4f)
519 |                 sigma_vU = (rgb) ? sigma_vY : (sigma_vY / (1 << sh));
520 |         }
521 |         if (sigma_vV == -1354.4f)
522 |             sigma_vV = sigma_vU;
523 |     }
524 |     else
525 |     {
526 |         if (sigma_vY == -1354.4f)
527 |             sigma_vY = sigmaY;
528 |     }
529 | 
530 |     const float sigmaH[3]{ sigmaY, sigmaU, sigmaV };
531 |     const float sigmaV_[3]{ sigma_vY, sigma_vU, sigma_vV };
532 |     const int planes[3]{ y, u, v };
533 | 
534 |     for (int i{ 0 }; i < planecount; ++i)
535 |     {
536 |         if (rgb)
537 |             process[i] = 3;
538 |         else
539 |         {
540 |             switch (planes[i])
541 |             {
542 |                 case 3: process[i] = 3; break;
543 |                 case 2: process[i] = 2; break;
544 |                 default: process[i] = 1; break;
545 |             }
546 |         }
547 | 
548 |         if (sigmaH[i] < 0.0f)
549 |         {
550 |             const std::string sigmaOrder[3]{ "sigmaY", "sigmaU", "sigmaV" };
551 |             env->ThrowError(std::string{ "vsTCanny: " + sigmaOrder[i] + " must be greater than or equal to 0.0." }.c_str());
552 |         }
553 |         if (sigmaV_[i] < 0.0f)
554 |         {
555 |             const std::string sigmaVOrder[3]{ "sigma_vY", "sigma_vU", "sigma_vV" };
556 |             env->ThrowError(std::string{ "vsTCanny: " + sigmaVOrder[i] + " must be greater than or equal to 0.0." }.c_str());
557 |         }
558 |         if (planes[i] < 1 || planes[i] > 3)
559 |             env->ThrowError("vsTCanny: y, u, v must be between 1..3.");
560 | 
561 |         if (process[i] == 3)
562 |         {
563 |             if (sigmaH[i])
564 |             {
565 |                 weightsH[i].reset(gaussianWeights(sigmaH[i], radiusH[i]));
566 | 
567 |                 const int width_{ (i && !rgb) ? (width >> sw) : width };
568 |                 if (width_ < radiusH[i] + 1)
569 |                 {
570 |                     const std::string planeOrder[3]{ "first", "second", "third" };
571 |                     env->ThrowError(std::string{ "vsTCanny: the " + planeOrder[i] + " plane's width must be greater than or equal to " + std::to_string(radiusH[i] + 1) + " for specified sigma." }.c_str());
572 |                 }
573 |             }
574 |             else
575 |                 radiusH[i] = 0;
576 | 
577 |             if (sigmaV_[i])
578 |             {
579 |                 weightsV[i].reset(gaussianWeights(sigmaV_[i], radiusV[i]));
580 | 
581 |                 const int height_{ (i && !rgb) ? (height >> sh) : height };
582 |                 if (height_ < radiusV[i] + 1)
583 |                 {
584 |                     const std::string planeOrder[3]{ "first", "second", "third" };
585 |                     env->ThrowError(std::string{ "vsTCanny: the " + planeOrder[i] + " plane's height must be greater than or equal to " + std::to_string(radiusV[i] + 1) + " for specified sigma_v." }.c_str());
586 |                 }
587 |             }
588 |             else
589 |                 radiusV[i] = 0;
590 |         }
591 |         else
592 |         {
593 |             radiusH[i] = 0;
594 |             radiusV[i] = 0;
595 |         }
596 |     }
597 | 
598 |     const int comp_size{ vi.ComponentSize() };
599 |     if (comp_size < 4)
600 |     {
601 |         peak = (1 << vi.BitsPerComponent()) - 1;
602 |         const float scale_{ peak / 255.0f };
603 |         t_h_ *= scale_;
604 |         t_l_ *= scale_;
605 |     }
606 |     else
607 |     {
608 |         peak = 0;
609 |         t_h_ /= 255.0f;
610 |         t_l_ /= 255.0f;
611 |     }
612 | 
613 |     int vectorSize;
614 | 
615 |     if ((avx512 && opt < 0) || opt == 3)
616 |     {
617 |         vectorSize = 16;
618 |         alignment = 64;
619 | 
620 |         switch (comp_size)
621 |         {
622 |             case 1: filter = &vsTCanny::filter_avx512<uint8_t>; break;
623 |             case 2: filter = &vsTCanny::filter_avx512<uint16_t>; break;
624 |             default: filter = &vsTCanny::filter_avx512<float>; break;
625 |         }
626 |     }
627 |     else if ((avx2 && opt < 0) || opt == 2)
628 |     {
629 |         vectorSize = 8;
630 |         alignment = 32;
631 | 
632 |         switch (comp_size)
633 |         {
634 |             case 1: filter = &vsTCanny::filter_avx2<uint8_t>; break;
635 |             case 2: filter = &vsTCanny::filter_avx2<uint16_t>; break;
636 |             default: filter = &vsTCanny::filter_avx2<float>; break;
637 |         }
638 |     }
639 |     else if ((sse2 && opt < 0) || opt == 1)
640 |     {
641 |         vectorSize = 4;
642 |         alignment = 16;
643 | 
644 |         switch (comp_size)
645 |         {
646 |             case 1: filter = &vsTCanny::filter_sse2<uint8_t>; break;
647 |             case 2: filter = &vsTCanny::filter_sse2<uint16_t>; break;
648 |             default: filter = &vsTCanny::filter_sse2<float>; break;
649 |         }
650 |     }
651 |     else
652 |     {
653 |         vectorSize = 1;
654 |         alignment = 4;
655 | 
656 |         switch (comp_size)
657 |         {
658 |             case 1: filter = &vsTCanny::filter_c<uint8_t>; break;
659 |             case 2: filter = &vsTCanny::filter_c<uint16_t>; break;
660 |             default: filter = &vsTCanny::filter_c<float>; break;
661 |         }
662 |     }
663 | 
664 |     radiusAlign = (std::max({ radiusH[0], radiusH[1], radiusH[2], (op == FDOG) ? 2 : 1 }) + vectorSize - 1) & ~(vectorSize - 1);
665 | 
666 |     const int pitch{ child->GetFrame(0, env)->GetPitch() / comp_size };
667 | 
668 |     blur = reinterpret_cast<float*>(aligned_malloc((pitch + radiusAlign * 2) * height * sizeof(float), alignment));
669 |     if (!blur)
670 |         env->ThrowError("vsTCanny: malloc failure (blur).");
671 | 
672 |     gradient = reinterpret_cast<float*>(aligned_malloc((pitch + radiusAlign * 2) * (height + 2) * sizeof(float), alignment));
673 |     if (!gradient)
674 |         env->ThrowError("vsTCanny: malloc failure (gradient).");
675 | 
676 |     if (mode_ == 0)
677 |     {
678 |         direction = reinterpret_cast<int*>(aligned_malloc(pitch * height * sizeof(int), alignment));
679 |         if (!direction)
680 |             env->ThrowError("vsTCanny: malloc failure (direction).");
681 | 
682 |         found = std::make_unique<bool[]>(width * height);
683 |     }
684 |     else
685 |         direction = nullptr;
686 | 
687 |     has_at_least_v8 = true;
688 |     try { env->CheckVersion(8); }
689 |     catch (const AvisynthError&) { has_at_least_v8 = false; }
690 | }
691 | 
692 | vsTCanny::~vsTCanny()
693 | {
694 |     aligned_free(blur);
695 |     aligned_free(gradient);
696 | 
697 |     if (direction)
698 |         aligned_free(direction);
699 | }
700 | 
701 | PVideoFrame __stdcall vsTCanny::GetFrame(int n, IScriptEnvironment* env)
702 | {
703 |     PVideoFrame src{ child->GetFrame(n, env) };
704 |     PVideoFrame dst{ (has_at_least_v8) ? env->NewVideoFrameP(vi, &src) : env->NewVideoFrame(vi) };
705 | 
706 |     (this->*filter)(src, dst, env);
707 | 
708 |     return dst;
709 | }
710 | 
711 | AVSValue __cdecl Create_vsTCanny(AVSValue args, void* user_data, IScriptEnvironment* env)
712 | {
713 |     const float sigmaY{ args[1].AsFloatf(1.5f) };
714 |     if (sigmaY < 0.0f)
715 |         env->ThrowError("vsTCanny: sigmaY must be greater than or equal to 0.0.");
716 | 
717 |     const float sigmaU{ args[2].AsFloatf(-1354.4f) };
718 |     if (sigmaU < 0.0f && sigmaU != -1354.4f)
719 |         env->ThrowError("vsTCanny: sigmaU must be greater than or equal to 0.0.");
720 | 
721 |     const float sigmaV{ args[3].AsFloatf(sigmaU) };
722 |     if (sigmaV < 0.0f && sigmaV != -1354.4f)
723 |         env->ThrowError("vsTCanny: sigmaV must be greater than or equal to 0.0.");
724 | 
725 |     const float sigma_vY{ args[4].AsFloatf(-1354.4f) };
726 |     if (sigma_vY < 0.0f && sigma_vY != -1354.4f)
727 |         env->ThrowError("vsTCanny: sigma_vY must be greater than or equal to 0.0.");
728 | 
729 |     const float sigma_vU{ args[5].AsFloatf(-1354.4f) };
730 |     if (sigma_vU < 0.0f && sigma_vU != -1354.4f)
731 |         env->ThrowError("vsTCanny: sigma_vU must be greater than or equal to 0.0.");
732 | 
733 |     const float sigma_vV{ args[6].AsFloatf(sigma_vU) };
734 |     if (sigma_vV < 0.0f && sigma_vV != -1354.4f)
735 |         env->ThrowError("vsTCanny: sigma_vV must be greater than or equal to 0.0.");
736 | 
737 |     return new vsTCanny(
738 |         args[0].AsClip(),
739 |         sigmaY,
740 |         sigmaU,
741 |         sigmaV,
742 |         sigma_vY,
743 |         sigma_vU,
744 |         sigma_vV,
745 |         args[7].AsFloatf(8.0f),
746 |         args[8].AsFloatf(1.0f),
747 |         args[9].AsInt(0),
748 |         args[10].AsInt(1),
749 |         args[11].AsFloatf(1.0f),
750 |         args[12].AsInt(3),
751 |         args[13].AsInt(3),
752 |         args[14].AsInt(3),
753 |         args[15].AsInt(-1),
754 |         env);
755 | }
756 | 
757 | const AVS_Linkage* AVS_linkage;
758 | 
759 | extern "C" __declspec(dllexport)
760 | const char* __stdcall AvisynthPluginInit3(IScriptEnvironment * env, const AVS_Linkage* const vectors)
761 | {
762 |     AVS_linkage = vectors;
763 | 
764 |     env->AddFunction("vsTCanny", "c[sigmaY]f[sigmaU]f[sigmaV]f[sigma_vY]f[sigma_vU]f[sigma_vV]f[t_h]f[t_l]f[mode]i[op]i[scale]f[y]i[u]i[v]i[opt]i", Create_vsTCanny, 0);
765 | 
766 |     return "vsTCanny";
767 | }
768 | 


--------------------------------------------------------------------------------
/src/vsTCanny.h:
--------------------------------------------------------------------------------
  1 | #pragma once
  2 | 
  3 | #include <algorithm>
  4 | #include <limits>
  5 | #include <memory>
  6 | #include <vector>
  7 | 
  8 | #include "avisynth.h"
  9 | 
 10 | static constexpr float M_PIF{ 3.14159265358979323846f };
 11 | static constexpr float M_1_PIF{ 0.318309886183790671538f };
 12 | static constexpr float fltMax{ std::numeric_limits<float>::max() };
 13 | static constexpr float fltLowest{ std::numeric_limits<float>::lowest() };
 14 | 
 15 | enum Operator
 16 | {
 17 |     TRITICAL,
 18 |     PREWITT,
 19 |     SOBEL,
 20 |     SCHARR,
 21 |     KROON,
 22 |     KIRSCH,
 23 |     FDOG
 24 | };
 25 | 
 26 | class vsTCanny : public GenericVideoFilter
 27 | {
 28 |     float t_h_;
 29 |     float t_l_;
 30 |     int mode_;
 31 |     float op_;
 32 |     float scale;
 33 |     int process[3];
 34 |     int radiusH[3];
 35 |     int radiusV[3];
 36 |     std::unique_ptr<float[]> weightsH[3];
 37 |     std::unique_ptr<float[]> weightsV[3];
 38 |     int peak;
 39 |     int alignment;
 40 |     int radiusAlign;
 41 |     float* blur;
 42 |     float* gradient;
 43 |     int* direction;
 44 |     std::unique_ptr<bool[]> found;
 45 |     bool has_at_least_v8;
 46 | 
 47 |     template<typename T>
 48 |     void filter_c(PVideoFrame& src, PVideoFrame& dst, IScriptEnvironment* env) noexcept;
 49 |     template<typename T>
 50 |     void filter_sse2(PVideoFrame& src, PVideoFrame& dst, IScriptEnvironment* env) noexcept;
 51 |     template<typename T>
 52 |     void filter_avx2(PVideoFrame& src, PVideoFrame& dst, IScriptEnvironment* env) noexcept;
 53 |     template<typename T>
 54 |     void filter_avx512(PVideoFrame& src, PVideoFrame& dst, IScriptEnvironment* env) noexcept;
 55 | 
 56 |     void (vsTCanny::* filter)(PVideoFrame& src, PVideoFrame& dst, IScriptEnvironment* env) noexcept;
 57 | 
 58 | public:
 59 |     vsTCanny(PClip _child, float sigmaY, float sigmaU, float sigmaV, float sigma_vY, float sigma_vU, float sigma_vV, float t_h, float t_l, int mode, int op, float gmmax, int y, int u, int v, int opt, IScriptEnvironment* env);
 60 |     PVideoFrame __stdcall GetFrame(int n, IScriptEnvironment* env);
 61 |     int __stdcall SetCacheHints(int cachehints, int frame_range)
 62 |     {
 63 |         return cachehints == CACHE_GET_MTMODE ? MT_MULTI_INSTANCE : 0;
 64 |     }
 65 |     ~vsTCanny();
 66 | };
 67 | 
 68 | static void hysteresis(float* __restrict srcp, bool* __restrict found, const int width, const int height, const int stride, const float t_h, const float t_l) noexcept
 69 | {
 70 |     std::fill_n(found, width * height, false);
 71 |     std::vector<std::pair<int, int>> coordinates;
 72 | 
 73 |     for (int y{ 0 }; y < height; ++y)
 74 |     {
 75 |         for (int x{ 0 }; x < width; ++x)
 76 |         {
 77 |             if (!found[width * y + x] && srcp[stride * y + x] >= t_h)
 78 |             {
 79 |                 srcp[stride * y + x] = fltMax;
 80 |                 found[width * y + x] = true;
 81 | 
 82 |                 coordinates.emplace_back(std::make_pair(x, y));
 83 | 
 84 |                 while (!coordinates.empty())
 85 |                 {
 86 |                     const auto pos = coordinates.back();
 87 |                     coordinates.pop_back();
 88 | 
 89 |                     const int xxStart{ std::max(pos.first - 1, 0) };
 90 |                     const int xxStop{ std::min(pos.first + 1, width - 1) };
 91 |                     const int yyStart{ std::max(pos.second - 1, 0) };
 92 |                     const int yyStop{ std::min(pos.second + 1, height - 1) };
 93 | 
 94 |                     for (int yy{ yyStart }; yy <= yyStop; ++yy)
 95 |                     {
 96 |                         for (int xx{ xxStart }; xx <= xxStop; ++xx)
 97 |                         {
 98 |                             if (!found[width * yy + xx] && srcp[stride * yy + xx] >= t_l)
 99 |                             {
100 |                                 srcp[stride * yy + xx] = fltMax;
101 |                                 found[width * yy + xx] = true;
102 | 
103 |                                 coordinates.emplace_back(std::make_pair(xx, yy));
104 |                             }
105 |                         }
106 |                     }
107 |                 }
108 |             }
109 |         }
110 |     }
111 | }
112 | 


--------------------------------------------------------------------------------
/src/vsTCanny.rc:
--------------------------------------------------------------------------------
 1 | #include <winver.h>
 2 | 
 3 | VS_VERSION_INFO VERSIONINFO
 4 | FILEVERSION             1,1,8,0
 5 | PRODUCTVERSION        	1,1,8,0
 6 | FILEFLAGSMASK           VS_FFI_FILEFLAGSMASK
 7 | FILEFLAGS               0x0L
 8 | FILEOS                  VOS__WINDOWS32
 9 | FILETYPE                VFT_DLL
10 | FILESUBTYPE             VFT2_UNKNOWN
11 | BEGIN
12 |     BLOCK "StringFileInfo"
13 |     BEGIN
14 |         BLOCK "040904E4"
15 |         BEGIN
16 |         VALUE "Comments",         "Canny edge detection filter."
17 |         VALUE "FileDescription",  "TCanny for AviSynth 2.6 / AviSynth+"
18 |         VALUE "FileVersion",      "1.1.8"
19 |         VALUE "InternalName",     "vsTCanny"
20 |         VALUE "OriginalFilename", "vsTCanny.dll"
21 |         VALUE "ProductName",      "vsTCanny"
22 |         VALUE "ProductVersion",   "1.1.8"
23 |         END
24 |     END
25 |     BLOCK "VarFileInfo"
26 |     BEGIN
27 |         VALUE "Translation", 0x409, 1252
28 |     END
29 | END
30 | 


--------------------------------------------------------------------------------
/src/vsTCanny_AVX2.cpp:
--------------------------------------------------------------------------------
  1 | #include "VCL2/vectormath_trig.h"
  2 | #include "vsTCanny.h"
  3 | 
  4 | template<typename T>
  5 | static void copyPlane(const T* srcp, float* dstp, const int width, const int height, const int srcStride, const int dstStride) noexcept
  6 | {
  7 |     for (int y{ 0 }; y < height; ++y)
  8 |     {
  9 |         for (int x{ 0 }; x < width; x += 8)
 10 |         {
 11 |             if constexpr (std::is_same_v<T, uint8_t>)
 12 |                 to_float(Vec8i().load_8uc(srcp + x)).store_nt(dstp + x);
 13 |             else if constexpr (std::is_same_v<T, uint16_t>)
 14 |                 to_float(Vec8i().load_8us(srcp + x)).store_nt(dstp + x);
 15 |             else
 16 |                 Vec8f().load_a(srcp + x).store_nt(dstp + x);
 17 |         }
 18 | 
 19 |         srcp += srcStride;
 20 |         dstp += dstStride;
 21 |     }
 22 | }
 23 | 
 24 | template<typename T>
 25 | static void gaussianBlur(const T* __srcp, float* temp, float* dstp, const float* weightsH, const float* weightsV, const int width, const int height, const int srcStride, const int dstStride, const int radiusH, const int radiusV) noexcept
 26 | {
 27 |     const int diameter{ radiusV * 2 + 1 };
 28 |     std::unique_ptr<const T* []> _srcp{ std::make_unique<const T* []>(diameter) };
 29 | 
 30 |     _srcp[radiusV] = __srcp;
 31 |     for (int i{ 1 }; i <= radiusV; ++i)
 32 |         _srcp[radiusV - i] = _srcp[radiusV + i] = _srcp[radiusV] + srcStride * i;
 33 | 
 34 |     weightsH += radiusH;
 35 | 
 36 |     for (int y{ 0 }; y < height; ++y)
 37 |     {
 38 |         for (int x{ 0 }; x < width; x += 8)
 39 |         {
 40 |             Vec8f sum{ zero_8f() };
 41 | 
 42 |             for (int i{ 0 }; i < diameter; ++i)
 43 |             {
 44 |                 if constexpr (std::is_same_v<T, uint8_t>)
 45 |                 {
 46 |                     const Vec8f srcp{ to_float(Vec8i().load_8uc(_srcp[i] + x)) };
 47 |                     sum = mul_add(srcp, weightsV[i], sum);
 48 |                 }
 49 |                 else if constexpr (std::is_same_v<T, uint16_t>)
 50 |                 {
 51 |                     const Vec8f srcp{ to_float(Vec8i().load_8us(_srcp[i] + x)) };
 52 |                     sum = mul_add(srcp, weightsV[i], sum);
 53 |                 }
 54 |                 else
 55 |                 {
 56 |                     const Vec8f srcp{ Vec8f().load_a(_srcp[i] + x) };
 57 |                     sum = mul_add(srcp, weightsV[i], sum);
 58 |                 }
 59 |             }
 60 | 
 61 |             sum.store_a(temp + x);
 62 |         }
 63 | 
 64 |         for (int i{ 1 }; i <= radiusH; ++i)
 65 |         {
 66 |             temp[-i] = temp[i];
 67 |             temp[width - 1 + i] = temp[width - 1 - i];
 68 |         }
 69 | 
 70 |         for (int x{ 0 }; x < width; x += 8)
 71 |         {
 72 |             Vec8f sum{ zero_8f() };
 73 | 
 74 |             for (int i{ -radiusH }; i <= radiusH; ++i)
 75 |             {
 76 |                 const Vec8f srcp{ Vec8f().load(temp + x + i) };
 77 |                 sum = mul_add(srcp, weightsH[i], sum);
 78 |             }
 79 | 
 80 |             sum.store_nt(dstp + x);
 81 |         }
 82 | 
 83 |         for (int i{ 0 }; i < diameter - 1; ++i)
 84 |             _srcp[i] = _srcp[i + 1];
 85 | 
 86 |         _srcp[diameter - 1] += (y < height - 1 - radiusV) ? srcStride : -srcStride;
 87 | 
 88 |         dstp += dstStride;
 89 |     }
 90 | }
 91 | 
 92 | template<typename T>
 93 | static void gaussianBlurV(const T* __srcp, float* dstp, const float* weights, const int width, const int height, const int srcStride, const int dstStride, const int radius) noexcept
 94 | {
 95 |     const int diameter{ radius * 2 + 1 };
 96 |     std::unique_ptr<const T* []> _srcp{ std::make_unique<const T* []>(diameter) };
 97 | 
 98 |     _srcp[radius] = __srcp;
 99 |     for (int i{ 1 }; i <= radius; ++i)
100 |         _srcp[radius - i] = _srcp[radius + i] = _srcp[radius] + srcStride * i;
101 | 
102 |     for (int y{ 0 }; y < height; ++y)
103 |     {
104 |         for (int x{ 0 }; x < width; x += 8)
105 |         {
106 |             Vec8f sum{ zero_8f() };
107 | 
108 |             for (int i{ 0 }; i < diameter; ++i)
109 |             {
110 |                 if constexpr (std::is_same_v<T, uint8_t>)
111 |                 {
112 |                     const Vec8f srcp{ to_float(Vec8i().load_8uc(_srcp[i] + x)) };
113 |                     sum = mul_add(srcp, weights[i], sum);
114 |                 }
115 |                 else if constexpr (std::is_same_v<T, uint16_t>)
116 |                 {
117 |                     const Vec8f srcp{ to_float(Vec8i().load_8us(_srcp[i] + x)) };
118 |                     sum = mul_add(srcp, weights[i], sum);
119 |                 }
120 |                 else
121 |                 {
122 |                     const Vec8f srcp{ Vec8f().load_a(_srcp[i] + x) };
123 |                     sum = mul_add(srcp, weights[i], sum);
124 |                 }
125 |             }
126 | 
127 |             sum.store_nt(dstp + x);
128 |         }
129 | 
130 |         for (int i{ 0 }; i < diameter - 1; ++i)
131 |             _srcp[i] = _srcp[i + 1];
132 | 
133 |         _srcp[diameter - 1] += (y < height - 1 - radius) ? srcStride : -srcStride;
134 | 
135 |         dstp += dstStride;
136 |     }
137 | }
138 | 
139 | template<typename T>
140 | static void gaussianBlurH(const T* _srcp, float* temp, float* dstp, const float* weights, const int width, const int height, const int srcStride, const int dstStride, const int radius) noexcept
141 | {
142 |     weights += radius;
143 | 
144 |     for (int y{ 0 }; y < height; ++y)
145 |     {
146 |         for (int x{ 0 }; x < width; x += 8)
147 |         {
148 |             if constexpr (std::is_same_v<T, uint8_t>)
149 |                 to_float(Vec8i().load_8uc(_srcp + x)).store_a(temp + x);
150 |             else if constexpr (std::is_same_v<T, uint16_t>)
151 |                 to_float(Vec8i().load_8us(_srcp + x)).store_a(temp + x);
152 |             else
153 |                 Vec8f().load_a(_srcp + x).store_a(temp + x);
154 |         }
155 | 
156 |         for (int i{ 1 }; i <= radius; ++i)
157 |         {
158 |             temp[-i] = temp[i];
159 |             temp[width - 1 + i] = temp[width - 1 - i];
160 |         }
161 | 
162 |         for (int x{ 0 }; x < width; x += 8)
163 |         {
164 |             Vec8f sum{ zero_8f() };
165 | 
166 |             for (int i{ -radius }; i <= radius; ++i)
167 |             {
168 |                 const Vec8f srcp{ Vec8f().load(temp + x + i) };
169 |                 sum = mul_add(srcp, weights[i], sum);
170 |             }
171 | 
172 |             sum.store_nt(dstp + x);
173 |         }
174 | 
175 |         _srcp += srcStride;
176 |         dstp += dstStride;
177 |     }
178 | }
179 | 
180 | static void detectEdge(float* blur, float* gradient, int* direction, const int width, const int height, const int stride, const int bgStride, const int mode, const int op, const float scale) noexcept
181 | {
182 |     float* __restrict cur{ blur };
183 |     float* __restrict next{ blur + bgStride };
184 |     float* __restrict next2{ blur + bgStride * 2 };
185 |     float* __restrict prev{ next };
186 |     float* __restrict prev2{ next2 };
187 | 
188 |     cur[-1] = cur[1];
189 |     cur[width] = cur[width - 2];
190 | 
191 |     if (op == FDOG)
192 |     {
193 |         cur[-2] = cur[2];
194 |         cur[width + 1] = cur[width - 3];
195 |     }
196 | 
197 |     for (int y{ 0 }; y < height; ++y)
198 |     {
199 |         next[-1] = next[1];
200 |         next[width] = next[width - 2];
201 | 
202 |         if (op == FDOG)
203 |         {
204 |             next[-2] = next[2];
205 |             next[width + 1] = next[width - 3];
206 | 
207 |             next2[-1] = next2[1];
208 |             next2[-2] = next2[2];
209 |             next2[width] = next2[width - 2];
210 |             next2[width + 1] = next2[width - 3];
211 |         }
212 | 
213 |         for (int x{ 0 }; x < width; x += 8)
214 |         {
215 |             Vec8f gx, gy;
216 | 
217 |             if (op != FDOG)
218 |             {
219 |                 const Vec8f c1{ Vec8f().load(prev + x - 1) };
220 |                 const Vec8f c2{ Vec8f().load_a(prev + x) };
221 |                 const Vec8f c3{ Vec8f().load(prev + x + 1) };
222 |                 const Vec8f c4{ Vec8f().load(cur + x - 1) };
223 |                 const Vec8f c6{ Vec8f().load(cur + x + 1) };
224 |                 const Vec8f c7{ Vec8f().load(next + x - 1) };
225 |                 const Vec8f c8{ Vec8f().load_a(next + x) };
226 |                 const Vec8f c9{ Vec8f().load(next + x + 1) };
227 | 
228 |                 switch (op)
229 |                 {
230 |                     case TRITICAL:
231 |                     {
232 |                         gx = c6 - c4;
233 |                         gy = c2 - c8;
234 |                         break;
235 |                     }
236 |                     case PREWITT:
237 |                     {
238 |                         gx = (c3 + c6 + c9 - c1 - c4 - c7) * 0.5f;
239 |                         gy = (c1 + c2 + c3 - c7 - c8 - c9) * 0.5f;
240 |                         break;
241 |                     }
242 |                     case SOBEL:
243 |                     {
244 |                         gx = c3 + mul_add(2.0f, c6, c9) - c1 - mul_add(2.0f, c4, c7);
245 |                         gy = c1 + mul_add(2.0f, c2, c3) - c7 - mul_add(2.0f, c8, c9);
246 |                         break;
247 |                     }
248 |                     case SCHARR:
249 |                     {
250 |                         gx = mul_add(3.0f, c3 + c9, 10.0f * c6) - mul_add(3.0f, c1 + c7, 10.0f * c4);
251 |                         gy = mul_add(3.0f, c1 + c3, 10.0f * c2) - mul_add(3.0f, c7 + c9, 10.0f * c8);
252 |                         break;
253 |                     }
254 |                     case KROON:
255 |                     {
256 |                         gx = mul_add(17.0f, c3 + c9, 61.0f * c6) - mul_add(17.0f, c1 + c7, 61.0f * c4);
257 |                         gy = mul_add(17.0f, c1 + c3, 61.0f * c2) - mul_add(17.0f, c7 + c9, 61.0f * c8);
258 |                         break;
259 |                     }
260 |                     case KIRSCH:
261 |                     {
262 |                         const Vec8f g1{ mul_sub(5.0f, c1 + c2 + c3, 3.0f * (c4 + c6 + c7 + c8 + c9)) };
263 |                         const Vec8f g2{ mul_sub(5.0f, c1 + c2 + c4, 3.0f * (c3 + c6 + c7 + c8 + c9)) };
264 |                         const Vec8f g3{ mul_sub(5.0f, c1 + c4 + c7, 3.0f * (c2 + c3 + c6 + c8 + c9)) };
265 |                         const Vec8f g4{ mul_sub(5.0f, c4 + c7 + c8, 3.0f * (c1 + c2 + c3 + c6 + c9)) };
266 |                         const Vec8f g5{ mul_sub(5.0f, c7 + c8 + c9, 3.0f * (c1 + c2 + c3 + c4 + c6)) };
267 |                         const Vec8f g6{ mul_sub(5.0f, c6 + c8 + c9, 3.0f * (c1 + c2 + c3 + c4 + c7)) };
268 |                         const Vec8f g7{ mul_sub(5.0f, c3 + c6 + c9, 3.0f * (c1 + c2 + c4 + c7 + c8)) };
269 |                         const Vec8f g8{ mul_sub(5.0f, c2 + c3 + c6, 3.0f * (c1 + c4 + c7 + c8 + c9)) };
270 |                         const Vec8f g{ max(max(max(abs(g1), abs(g2)), max(abs(g3), abs(g4))), max(max(abs(g5), abs(g6)), max(abs(g7), abs(g8)))) };
271 |                         (g * scale).store_nt(gradient + x);
272 |                         break;
273 |                     }
274 |                 }
275 |             }
276 |             else
277 |             {
278 |                 const Vec8f c1{ Vec8f().load(prev2 + x - 2) };
279 |                 const Vec8f c2{ Vec8f().load(prev2 + x - 1) };
280 |                 const Vec8f c3{ Vec8f().load(prev2 + x) };
281 |                 const Vec8f c4{ Vec8f().load(prev2 + x + 1) };
282 |                 const Vec8f c5{ Vec8f().load(prev2 + x + 2) };
283 |                 const Vec8f c6{ Vec8f().load(prev + x - 2) };
284 |                 const Vec8f c7{ Vec8f().load(prev + x - 1) };
285 |                 const Vec8f c8{ Vec8f().load(prev + x) };
286 |                 const Vec8f c9{ Vec8f().load(prev + x + 1) };
287 |                 const Vec8f c10{ Vec8f().load(prev + x + 2) };
288 |                 const Vec8f c11{ Vec8f().load(cur + x - 2) };
289 |                 const Vec8f c12{ Vec8f().load(cur + x - 1) };
290 |                 const Vec8f c14{ Vec8f().load(cur + x + 1) };
291 |                 const Vec8f c15{ Vec8f().load(cur + x + 2) };
292 |                 const Vec8f c16{ Vec8f().load(next + x - 2) };
293 |                 const Vec8f c17{ Vec8f().load(next + x - 1) };
294 |                 const Vec8f c18{ Vec8f().load(next + x) };
295 |                 const Vec8f c19{ Vec8f().load(next + x + 1) };
296 |                 const Vec8f c20{ Vec8f().load(next + x + 2) };
297 |                 const Vec8f c21{ Vec8f().load(next2 + x - 2) };
298 |                 const Vec8f c22{ Vec8f().load(next2 + x - 1) };
299 |                 const Vec8f c23{ Vec8f().load(next2 + x) };
300 |                 const Vec8f c24{ Vec8f().load(next2 + x + 1) };
301 |                 const Vec8f c25{ Vec8f().load(next2 + x + 2) };
302 | 
303 |                 gx = c5 + c25 + c4 + c24 + mul_add(2.0f, c10 + c20 + c9 + c19, 3.0f * (c15 + c14))
304 |                     - c2 - c22 - c1 - c21 - mul_add(2.0f, c7 + c17 + c6 + c16, 3.0f * (c12 + c11));
305 |                 gy = c1 + c5 + c6 + c10 + mul_add(2.0f, c2 + c4 + c7 + c9, 3.0f * (c3 + c8))
306 |                     - c16 - c20 - c21 - c25 - mul_add(2.0f, c17 + c19 + c22 + c24, 3.0f * (c18 + c23));
307 |             }
308 | 
309 |             if (op != KIRSCH)
310 |             {
311 |                 gx *= scale;
312 |                 gy *= scale;
313 |                 sqrt(mul_add(gx, gx, gy * gy)).store_nt(gradient + x);
314 |             }
315 | 
316 |             if (mode == 0)
317 |             {
318 |                 Vec8f dr{ atan2(gy, gx) };
319 |                 dr = if_add(dr < 0.0f, dr, M_PIF);
320 | 
321 |                 const Vec8i bin{ truncatei(mul_add(dr, 4.0f * M_1_PIF, 0.5f)) };
322 |                 select(bin >= 4, zero_si256(), bin).store_nt(direction + x);
323 |             }
324 |         }
325 | 
326 |         prev2 = prev;
327 |         prev = cur;
328 |         cur = next;
329 | 
330 |         if (op != FDOG)
331 |             next += (y < height - 2) ? bgStride : -bgStride;
332 |         else
333 |         {
334 |             next = next2;
335 |             next2 += (y < height - 3) ? bgStride : -bgStride;
336 |         }
337 | 
338 |         gradient += bgStride;
339 |         direction += stride;
340 |     }
341 | }
342 | 
343 | // Non-maximum suppression, AVX2 path (8 floats per iteration). A gradient sample is kept when it is >= the larger
344 | // of its two neighbours along its direction bin (0..3); otherwise fltLowest is stored. Results are written to blur.
345 | static void nonMaximumSuppression(const int* _direction, float* _gradient, float* blur, const int width, const int height, const int stride, const int bgStride, const int radiusAlign) noexcept
346 | {
347 |     const int lastRow{ bgStride * (height - 1) };
348 |     const int paddedWidth{ width + radiusAlign * 2 };
349 | 
350 |     // Mirror one column past each end of the first and last rows.
351 |     // NOTE(review): the rows in between are not mirrored here - confirm their padding columns are valid.
352 |     _gradient[-1] = _gradient[1];
353 |     _gradient[-1 + lastRow] = _gradient[1 + lastRow];
354 |     _gradient[width] = _gradient[width - 2];
355 |     _gradient[width + lastRow] = _gradient[width - 2 + lastRow];
356 | 
357 |     // Pad vertically: row -1 becomes a copy of row 1 and row `height` a copy of row height - 2.
358 |     std::copy_n(_gradient - radiusAlign + bgStride, paddedWidth, _gradient - radiusAlign - bgStride);
359 |     std::copy_n(_gradient - radiusAlign + bgStride * (height - 2), paddedWidth, _gradient - radiusAlign + bgStride * static_cast<int64_t>(height));
360 | 
361 |     for (int row{ 0 }; row < height; ++row, _direction += stride, _gradient += bgStride, blur += bgStride)
362 |     {
363 |         for (int col{ 0 }; col < width; col += 8)
364 |         {
365 |             const float* const mid{ _gradient + col };
366 |             const float* const above{ mid - bgStride };
367 |             const float* const below{ mid + bgStride };
368 |             const Vec8ui bin{ Vec8ui().load_a(_direction + col) };
369 | 
370 |             // Neighbour maximum for each bin, masked so that a lane only receives the value of its own bin.
371 |             const Vec8f horizontal{ max(Vec8f().load(mid + 1), Vec8f().load(mid - 1)) & Vec8fb(bin == 0) };
372 |             const Vec8f diagonalA{ max(Vec8f().load(above + 1), Vec8f().load(below - 1)) & Vec8fb(bin == 1) };
373 |             const Vec8f vertical{ max(Vec8f().load_a(above), Vec8f().load_a(below)) & Vec8fb(bin == 2) };
374 |             const Vec8f diagonalB{ max(Vec8f().load(above - 1), Vec8f().load(below + 1)) & Vec8fb(bin == 3) };
375 |             const Vec8f neighbour{ horizontal | diagonalA | vertical | diagonalB };
376 | 
377 |             // Keep the centre sample only where it is not smaller than that neighbour maximum.
378 |             const Vec8f centre{ Vec8f().load_a(mid) };
379 |             select(centre >= neighbour, centre, fltLowest).store_nt(blur + col);
380 |         }
381 |     }
382 | }
383 | 
384 | // Writes the binarized edge map (AVX2 path): samples equal to fltMax become "white", all others become zero.
385 | // "White" is 255 for uint8_t, `peak` for uint16_t and 1.0f otherwise. Strides are counted in elements.
386 | template<typename T>
387 | static void binarizeCE(const float* _srcp, T* dstp, const int width, const int height, const int srcStride, const int dstStride, const int peak) noexcept
388 | {
389 |     for (int row{ 0 }; row < height; ++row, _srcp += srcStride, dstp += dstStride)
390 |     {
391 |         for (int col{ 0 }; col < width; col += 8)
392 |         {
393 |             const Vec8fb isEdge{ Vec8f().load_a(_srcp + col) == fltMax };
394 |             T* const out{ dstp + col };
395 | 
396 |             if constexpr (std::is_same_v<T, uint8_t>)
397 |             {
398 |                 // Pack the 32-bit lane mask to 16 and then 8 bits; zeros fill the halves that get_low() discards.
399 |                 const auto mask16 = compress_saturated(Vec8ib(isEdge), zero_si256());
400 |                 const Vec16cb mask8{ Vec16cb(compress_saturated(mask16, zero_si256()).get_low()) };
401 |                 select(mask8, Vec16uc(255), zero_si128()).storel(out);
402 |             }
403 |             else if constexpr (std::is_same_v<T, uint16_t>)
404 |             {
405 |                 // Pack the 32-bit lane mask to 16 bits; only the low 128 bits hold the eight results.
406 |                 const Vec8sb mask16{ Vec8sb(compress_saturated(Vec8ib(isEdge), zero_si256()).get_low()) };
407 |                 select(mask16, Vec8us(peak), zero_si128()).store_nt(out);
408 |             }
409 |             else
410 |                 select(isEdge, Vec8f(1.0f), Vec8f(0.0f)).store_nt(out);
411 |         }
412 |     }
413 | }
414 | 
415 | // Converts float samples to the output type (AVX2 path). Integer outputs use truncatei(v + 0.5f) followed by
416 | // saturating packs (uint16_t is also limited to `peak`); float output is clamped to [0, 1] only if clampFP is set.
417 | template<typename T, bool clampFP = true>
418 | static void discretizeGM(const float* _srcp, T* dstp, const int width, const int height, const int srcStride, const int dstStride, const int peak) noexcept
419 | {
420 |     for (int row{ 0 }; row < height; ++row, _srcp += srcStride, dstp += dstStride)
421 |     {
422 |         for (int col{ 0 }; col < width; col += 8)
423 |         {
424 |             const Vec8f value{ Vec8f().load_a(_srcp + col) };
425 |             T* const out{ dstp + col };
426 |             if constexpr (std::is_same_v<T, uint8_t>)
427 |             {
428 |                 const auto words = compress_saturated(truncatei(value + 0.5f), zero_si256());
429 |                 const Vec16uc bytes{ compress_saturated_s2u(words, zero_si256()).get_low() };
430 |                 bytes.storel(out);
431 |             }
432 |             else if constexpr (std::is_same_v<T, uint16_t>)
433 |             {
434 |                 const Vec8us words{ compress_saturated_s2u(truncatei(value + 0.5f), zero_si256()).get_low() };
435 |                 min(words, peak).store_nt(out);
436 |             }
437 |             else if constexpr (clampFP)
438 |                 min(max(value, 0.0f), 1.0f).store_nt(out);
439 |             else
440 |                 value.store_nt(out);
441 |         }
442 |     }
443 | }
444 | 
445 | // Runs the AVX2 pipeline on each plane of `src` and writes the result to `dst`: Gaussian blur (or a plain float
446 | // copy), edge detection and, for mode 0, non-maximum suppression plus hysteresis. process[i] == 3 filters a plane,
447 | // process[i] == 2 copies it with env->BitBlt; any other value leaves that plane of `dst` untouched by this function.
448 | template<typename T>
449 | void vsTCanny::filter_avx2(PVideoFrame& src, PVideoFrame& dst, IScriptEnvironment* env) noexcept
450 | {
451 |     const int planes_y[3]{ PLANAR_Y, PLANAR_U, PLANAR_V };
452 |     const int planes_r[3]{ PLANAR_G, PLANAR_B, PLANAR_R };
453 |     const int* current_planes{ (vi.IsRGB()) ? planes_r : planes_y };
454 |     const int planecount{ std::min(vi.NumComponents(), 3) };
455 | 
456 |     for (int i{ 0 }; i < planecount; ++i)
457 |     {
458 |         const int plane{ current_planes[i] };
459 |         const int height{ src->GetHeight(plane) };
460 |         if (process[i] == 3)
461 |         {
462 |             const size_t stride{ src->GetPitch(plane) / sizeof(T) }; // pitches and row size: bytes -> elements of T
463 |             const size_t bgStride{ stride + radiusAlign * 2 };
464 |             const size_t dst_stride{ dst->GetPitch(plane) / sizeof(T) };
465 |             const size_t width{ src->GetRowSize(plane) / sizeof(T) };
466 |             const T* srcp{ reinterpret_cast<const T*>(src->GetReadPtr(plane)) };
467 |             T* dstp{ reinterpret_cast<T*>(dst->GetWritePtr(plane)) };
468 |             float* blur{ vsTCanny::blur + radiusAlign };
469 |             float* gradient{ vsTCanny::gradient + bgStride + radiusAlign };
470 | 
471 |             if (radiusV[i] && radiusH[i])
472 |                 gaussianBlur(srcp, gradient, blur, weightsH[i].get(), weightsV[i].get(), width, height, stride, bgStride, radiusH[i], radiusV[i]);
473 |             else if (radiusV[i])
474 |                 gaussianBlurV(srcp, blur, weightsV[i].get(), width, height, stride, bgStride, radiusV[i]);
475 |             else if (radiusH[i])
476 |                 gaussianBlurH(srcp, gradient, blur, weightsH[i].get(), width, height, stride, bgStride, radiusH[i]);
477 |             else
478 |                 copyPlane(srcp, blur, width, height, stride, bgStride);
479 |             if (mode_ != -1)
480 |             {
481 |                 detectEdge(blur, gradient, direction, width, height, stride, bgStride, mode_, op_, scale);
482 |                 if (mode_ == 0)
483 |                 {
484 |                     nonMaximumSuppression(direction, gradient, blur, width, height, stride, bgStride, radiusAlign);
485 |                     hysteresis(blur, found.get(), width, height, bgStride, t_h_, t_l_);
486 |                 }
487 |             }
488 |             // dst must be stepped with its own pitch (dst_stride): src and dst pitches are not guaranteed to be equal.
489 |             switch (mode_)
490 |             {
491 |                 case 0: binarizeCE(blur, dstp, width, height, bgStride, dst_stride, peak); break;
492 |                 case 1: discretizeGM(gradient, dstp, width, height, bgStride, dst_stride, peak); break;
493 |                 default: discretizeGM<T, false>(blur, dstp, width, height, bgStride, dst_stride, peak); break;
494 |             }
495 |         }
496 |         else if (process[i] == 2)
497 |             env->BitBlt(dst->GetWritePtr(plane), dst->GetPitch(plane), src->GetReadPtr(plane), src->GetPitch(plane), src->GetRowSize(plane), height);
498 |     }
499 | }
500 | // Explicit instantiations for the three sample types handled by the helpers above.
501 | template void vsTCanny::filter_avx2<uint8_t>(PVideoFrame& src, PVideoFrame& dst, IScriptEnvironment* env) noexcept;
502 | template void vsTCanny::filter_avx2<uint16_t>(PVideoFrame& src, PVideoFrame& dst, IScriptEnvironment* env) noexcept;
503 | template void vsTCanny::filter_avx2<float>(PVideoFrame& src, PVideoFrame& dst, IScriptEnvironment* env) noexcept;
504 | 


--------------------------------------------------------------------------------
/src/vsTCanny_AVX512.cpp:
--------------------------------------------------------------------------------
  1 | #include "VCL2/vectormath_trig.h"
  2 | #include "vsTCanny.h"
  3 | 
  4 | // Converts one plane to float (AVX-512 path, 16 samples per step). Strides are counted in elements.
  5 | template<typename T>
  6 | static void copyPlane(const T* srcp, float* dstp, const int width, const int height, const int srcStride, const int dstStride) noexcept
  7 | {
  8 |     for (int row{ 0 }; row < height; ++row, srcp += srcStride, dstp += dstStride)
  9 |     {
 10 |         for (int col{ 0 }; col < width; col += 16)
 11 |         {
 12 |             Vec16f pixels;
 13 |             if constexpr (std::is_same_v<T, uint8_t>)
 14 |                 pixels = to_float(Vec16i().load_16uc(srcp + col));
 15 |             else if constexpr (std::is_same_v<T, uint16_t>)
 16 |                 pixels = to_float(Vec16i().load_16us(srcp + col));
 17 |             else
 18 |                 pixels.load_a(srcp + col);
 19 |             pixels.store_nt(dstp + col);
 20 |         }
 21 |     }
 22 | }
 23 | 
 24 | // Separable Gaussian blur (AVX-512 path, 16 samples per step). For every output row a vertical pass over
 25 | // `diameter` source rows is stored to `temp`, then a horizontal pass over `temp` is stored to `dstp`.
 26 | // Source rows are mirrored at the top/bottom of the plane and `temp` is mirrored by radiusH samples at both ends.
 27 | template<typename T>
 28 | static void gaussianBlur(const T* __srcp, float* temp, float* dstp, const float* weightsH, const float* weightsV, const int width, const int height, const int srcStride, const int dstStride, const int radiusH, const int radiusV) noexcept
 29 | {
 30 |     // Sliding window of row pointers centred on the current row; rows above the first mirror the rows below it.
 31 |     const int diameter{ radiusV * 2 + 1 };
 32 |     std::unique_ptr<const T* []> rows{ std::make_unique<const T* []>(diameter) };
 33 |     rows[radiusV] = __srcp;
 34 |     for (int i{ 1 }; i <= radiusV; ++i)
 35 |     {
 36 |         rows[radiusV + i] = __srcp + srcStride * i;
 37 |         rows[radiusV - i] = rows[radiusV + i];
 38 |     }
 39 | 
 40 |     // Index the horizontal kernel from its centre tap so that offsets -radiusH..radiusH can be used directly.
 41 |     const float* const kernelH{ weightsH + radiusH };
 42 | 
 43 |     for (int y{ 0 }; y < height; ++y, dstp += dstStride)
 44 |     {
 45 |         // Vertical pass: weighted sum of the window rows, kept in temp.
 46 |         for (int x{ 0 }; x < width; x += 16)
 47 |         {
 48 |             Vec16f acc{ zero_16f() };
 49 |             for (int k{ 0 }; k < diameter; ++k)
 50 |             {
 51 |                 Vec16f row;
 52 |                 if constexpr (std::is_same_v<T, uint8_t>)
 53 |                     row = to_float(Vec16i().load_16uc(rows[k] + x));
 54 |                 else if constexpr (std::is_same_v<T, uint16_t>)
 55 |                     row = to_float(Vec16i().load_16us(rows[k] + x));
 56 |                 else
 57 |                     row.load_a(rows[k] + x);
 58 |                 acc = mul_add(row, weightsV[k], acc);
 59 |             }
 60 | 
 61 |             acc.store_a(temp + x);
 62 |         }
 63 | 
 64 |         // Mirror radiusH samples past both ends of temp for the horizontal taps.
 65 |         for (int i{ 1 }; i <= radiusH; ++i)
 66 |         {
 67 |             temp[-i] = temp[i];
 68 |             temp[width - 1 + i] = temp[width - 1 - i];
 69 |         }
 70 | 
 71 |         // Horizontal pass: unaligned loads slide across temp, the result goes to dstp.
 72 |         for (int x{ 0 }; x < width; x += 16)
 73 |         {
 74 |             Vec16f acc{ zero_16f() };
 75 | 
 76 |             for (int k{ -radiusH }; k <= radiusH; ++k)
 77 |             {
 78 |                 const Vec16f shifted{ Vec16f().load(temp + x + k) };
 79 |                 acc = mul_add(shifted, kernelH[k], acc);
 80 |             }
 81 | 
 82 |             acc.store_nt(dstp + x);
 83 |         }
 84 | 
 85 |         // Slide the window down one row. While rows remain below, the newest pointer advances;
 86 |         // afterwards it steps back up, mirroring at the bottom edge.
 87 |         std::copy(rows.get() + 1, rows.get() + diameter, rows.get());
 88 |         rows[diameter - 1] += (y < height - 1 - radiusV) ? srcStride : -srcStride;
 89 |     }
 90 | }
 91 | 
 92 | // Vertical-only Gaussian blur (AVX-512 path, 16 samples per step): every output row is the weighted sum of
 93 | // `diameter` source rows taken from a sliding window of row pointers, stored to `dstp` as float.
 94 | // Source rows are mirrored at the top and bottom of the plane.
 95 | template<typename T>
 96 | static void gaussianBlurV(const T* __srcp, float* dstp, const float* weights, const int width, const int height, const int srcStride, const int dstStride, const int radius) noexcept
 97 | {
 98 |     const int diameter{ radius * 2 + 1 };
 99 |     std::unique_ptr<const T* []> rows{ std::make_unique<const T* []>(diameter) };
100 | 
101 |     // Centre the window on the first row; rows above it mirror the rows below it.
102 |     rows[radius] = __srcp;
103 |     for (int i{ 1 }; i <= radius; ++i)
104 |     {
105 |         rows[radius + i] = __srcp + srcStride * i;
106 |         rows[radius - i] = rows[radius + i];
107 |     }
108 | 
109 |     for (int y{ 0 }; y < height; ++y, dstp += dstStride)
110 |     {
111 |         for (int x{ 0 }; x < width; x += 16)
112 |         {
113 |             Vec16f acc{ zero_16f() };
114 | 
115 |             for (int k{ 0 }; k < diameter; ++k)
116 |             {
117 |                 // Widen the k-th window row to float before accumulating it.
118 |                 Vec16f row;
119 |                 if constexpr (std::is_same_v<T, uint8_t>)
120 |                     row = to_float(Vec16i().load_16uc(rows[k] + x));
121 |                 else if constexpr (std::is_same_v<T, uint16_t>)
122 |                     row = to_float(Vec16i().load_16us(rows[k] + x));
123 |                 else
124 |                     row.load_a(rows[k] + x);
125 | 
126 |                 acc = mul_add(row, weights[k], acc);
127 |             }
128 | 
129 |             acc.store_nt(dstp + x);
130 |         }
131 | 
132 |         // Slide the window down one row. While rows remain below, the newest pointer advances;
133 |         // afterwards it steps back up, mirroring at the bottom edge.
134 |         std::copy(rows.get() + 1, rows.get() + diameter, rows.get());
135 |         rows[diameter - 1] += (y < height - 1 - radius) ? srcStride : -srcStride;
136 |     }
137 | }
138 | 
139 | // Horizontal-only Gaussian blur (AVX-512 path, 16 samples per step): each source row is widened to float in
140 | // `temp`, mirrored by `radius` samples at both ends, then convolved with `weights` into `dstp`.
141 | template<typename T>
142 | static void gaussianBlurH(const T* _srcp, float* temp, float* dstp, const float* weights, const int width, const int height, const int srcStride, const int dstStride, const int radius) noexcept
143 | {
144 |     const float* const kernel{ weights + radius }; // kernel[0] is the centre tap
145 | 
146 |     for (int y{ 0 }; y < height; ++y, _srcp += srcStride, dstp += dstStride)
147 |     {
148 |         // Stage the row as float so the convolution can read neighbours with plain offsets.
149 |         for (int x{ 0 }; x < width; x += 16)
150 |         {
151 |             Vec16f row;
152 |             if constexpr (std::is_same_v<T, uint8_t>)
153 |                 row = to_float(Vec16i().load_16uc(_srcp + x));
154 |             else if constexpr (std::is_same_v<T, uint16_t>)
155 |                 row = to_float(Vec16i().load_16us(_srcp + x));
156 |             else
157 |                 row.load_a(_srcp + x);
158 |             row.store_a(temp + x);
159 |         }
160 | 
161 |         // Mirror `radius` samples past both ends of temp.
162 |         for (int i{ 1 }; i <= radius; ++i)
163 |         {
164 |             temp[-i] = temp[i];
165 |             temp[width - 1 + i] = temp[width - 1 - i];
166 |         }
167 | 
168 |         for (int x{ 0 }; x < width; x += 16)
169 |         {
170 |             Vec16f acc{ zero_16f() };
171 | 
172 |             for (int k{ -radius }; k <= radius; ++k)
173 |                 acc = mul_add(Vec16f().load(temp + x + k), kernel[k], acc);
174 | 
175 |             acc.store_nt(dstp + x);
176 |         }
177 |     }
178 | }
179 | 
180 | static void detectEdge(float* blur, float* gradient, int* direction, const int width, const int height, const int stride, const int bgStride, const int mode, const int op, const float scale) noexcept
181 | { // Computes the gradient magnitude of `blur` into `gradient` using operator `op` (AVX-512 path, 16 samples per step); when mode == 0 a direction bin is also written to `direction`.
182 |     float* __restrict cur{ blur };
183 |     float* __restrict next{ blur + bgStride };
184 |     float* __restrict next2{ blur + bgStride * 2 };
185 |     float* __restrict prev{ next }; // prev/prev2 start on next/next2, so rows above the first row mirror the rows below it
186 |     float* __restrict prev2{ next2 };
187 |     // Mirror the border columns of the first row: one column per side, plus a second one per side for FDOG.
188 |     cur[-1] = cur[1];
189 |     cur[width] = cur[width - 2];
190 | 
191 |     if (op == FDOG)
192 |     {
193 |         cur[-2] = cur[2];
194 |         cur[width + 1] = cur[width - 3];
195 |     }
196 |     // One image row per iteration; the look-ahead rows get their border columns mirrored before they are read.
197 |     for (int y{ 0 }; y < height; ++y)
198 |     {
199 |         next[-1] = next[1];
200 |         next[width] = next[width - 2];
201 | 
202 |         if (op == FDOG)
203 |         {
204 |             next[-2] = next[2];
205 |             next[width + 1] = next[width - 3];
206 | 
207 |             next2[-1] = next2[1];
208 |             next2[-2] = next2[2];
209 |             next2[width] = next2[width - 2];
210 |             next2[width + 1] = next2[width - 3];
211 |         }
212 | 
213 |         for (int x{ 0 }; x < width; x += 16)
214 |         {
215 |             Vec16f gx, gy;
216 |             // NOTE(review): gx/gy are never assigned on the KIRSCH path, yet the mode == 0 branch below reads them - confirm the caller rejects that combination.
217 |             if (op != FDOG)
218 |             {
219 |                 const Vec16f c1{ Vec16f().load(prev + x - 1) };
220 |                 const Vec16f c2{ Vec16f().load_a(prev + x) };
221 |                 const Vec16f c3{ Vec16f().load(prev + x + 1) };
222 |                 const Vec16f c4{ Vec16f().load(cur + x - 1) };
223 |                 const Vec16f c6{ Vec16f().load(cur + x + 1) };
224 |                 const Vec16f c7{ Vec16f().load(next + x - 1) };
225 |                 const Vec16f c8{ Vec16f().load_a(next + x) };
226 |                 const Vec16f c9{ Vec16f().load(next + x + 1) };
227 |                 // c1..c9 are the 3x3 neighbourhood in row-major order (prev, cur, next); the centre c5 is not loaded.
228 |                 switch (op)
229 |                 {
230 |                     case TRITICAL:
231 |                     {
232 |                         gx = c6 - c4;
233 |                         gy = c2 - c8;
234 |                         break;
235 |                     }
236 |                     case PREWITT:
237 |                     {
238 |                         gx = (c3 + c6 + c9 - c1 - c4 - c7) * 0.5f;
239 |                         gy = (c1 + c2 + c3 - c7 - c8 - c9) * 0.5f;
240 |                         break;
241 |                     }
242 |                     case SOBEL:
243 |                     {
244 |                         gx = c3 + mul_add(2.0f, c6, c9) - c1 - mul_add(2.0f, c4, c7);
245 |                         gy = c1 + mul_add(2.0f, c2, c3) - c7 - mul_add(2.0f, c8, c9);
246 |                         break;
247 |                     }
248 |                     case SCHARR:
249 |                     {
250 |                         gx = mul_add(3.0f, c3 + c9, 10.0f * c6) - mul_add(3.0f, c1 + c7, 10.0f * c4);
251 |                         gy = mul_add(3.0f, c1 + c3, 10.0f * c2) - mul_add(3.0f, c7 + c9, 10.0f * c8);
252 |                         break;
253 |                     }
254 |                     case KROON:
255 |                     {
256 |                         gx = mul_add(17.0f, c3 + c9, 61.0f * c6) - mul_add(17.0f, c1 + c7, 61.0f * c4);
257 |                         gy = mul_add(17.0f, c1 + c3, 61.0f * c2) - mul_add(17.0f, c7 + c9, 61.0f * c8);
258 |                         break;
259 |                     }
260 |                     case KIRSCH: // stores the scaled maximum |response| of the eight kernels g1..g8 directly; gx/gy stay unset
261 |                     {
262 |                         const Vec16f g1{ mul_sub(5.0f, c1 + c2 + c3, 3.0f * (c4 + c6 + c7 + c8 + c9)) };
263 |                         const Vec16f g2{ mul_sub(5.0f, c1 + c2 + c4, 3.0f * (c3 + c6 + c7 + c8 + c9)) };
264 |                         const Vec16f g3{ mul_sub(5.0f, c1 + c4 + c7, 3.0f * (c2 + c3 + c6 + c8 + c9)) };
265 |                         const Vec16f g4{ mul_sub(5.0f, c4 + c7 + c8, 3.0f * (c1 + c2 + c3 + c6 + c9)) };
266 |                         const Vec16f g5{ mul_sub(5.0f, c7 + c8 + c9, 3.0f * (c1 + c2 + c3 + c4 + c6)) };
267 |                         const Vec16f g6{ mul_sub(5.0f, c6 + c8 + c9, 3.0f * (c1 + c2 + c3 + c4 + c7)) };
268 |                         const Vec16f g7{ mul_sub(5.0f, c3 + c6 + c9, 3.0f * (c1 + c2 + c4 + c7 + c8)) };
269 |                         const Vec16f g8{ mul_sub(5.0f, c2 + c3 + c6, 3.0f * (c1 + c4 + c7 + c8 + c9)) };
270 |                         const Vec16f g{ max(max(max(abs(g1), abs(g2)), max(abs(g3), abs(g4))), max(max(abs(g5), abs(g6)), max(abs(g7), abs(g8)))) };
271 |                         (g * scale).store_nt(gradient + x);
272 |                         break;
273 |                     }
274 |                 }
275 |             }
276 |             else // FDOG: c1..c25 are the 5x5 neighbourhood in row-major order (prev2, prev, cur, next, next2); the centre c13 is not loaded
277 |             {
278 |                 const Vec16f c1{ Vec16f().load(prev2 + x - 2) };
279 |                 const Vec16f c2{ Vec16f().load(prev2 + x - 1) };
280 |                 const Vec16f c3{ Vec16f().load(prev2 + x) };
281 |                 const Vec16f c4{ Vec16f().load(prev2 + x + 1) };
282 |                 const Vec16f c5{ Vec16f().load(prev2 + x + 2) };
283 |                 const Vec16f c6{ Vec16f().load(prev + x - 2) };
284 |                 const Vec16f c7{ Vec16f().load(prev + x - 1) };
285 |                 const Vec16f c8{ Vec16f().load(prev + x) };
286 |                 const Vec16f c9{ Vec16f().load(prev + x + 1) };
287 |                 const Vec16f c10{ Vec16f().load(prev + x + 2) };
288 |                 const Vec16f c11{ Vec16f().load(cur + x - 2) };
289 |                 const Vec16f c12{ Vec16f().load(cur + x - 1) };
290 |                 const Vec16f c14{ Vec16f().load(cur + x + 1) };
291 |                 const Vec16f c15{ Vec16f().load(cur + x + 2) };
292 |                 const Vec16f c16{ Vec16f().load(next + x - 2) };
293 |                 const Vec16f c17{ Vec16f().load(next + x - 1) };
294 |                 const Vec16f c18{ Vec16f().load(next + x) };
295 |                 const Vec16f c19{ Vec16f().load(next + x + 1) };
296 |                 const Vec16f c20{ Vec16f().load(next + x + 2) };
297 |                 const Vec16f c21{ Vec16f().load(next2 + x - 2) };
298 |                 const Vec16f c22{ Vec16f().load(next2 + x - 1) };
299 |                 const Vec16f c23{ Vec16f().load(next2 + x) };
300 |                 const Vec16f c24{ Vec16f().load(next2 + x + 1) };
301 |                 const Vec16f c25{ Vec16f().load(next2 + x + 2) };
302 | 
303 |                 gx = c5 + c25 + c4 + c24 + mul_add(2.0f, c10 + c20 + c9 + c19, 3.0f * (c15 + c14))
304 |                     - c2 - c22 - c1 - c21 - mul_add(2.0f, c7 + c17 + c6 + c16, 3.0f * (c12 + c11));
305 |                 gy = c1 + c5 + c6 + c10 + mul_add(2.0f, c2 + c4 + c7 + c9, 3.0f * (c3 + c8))
306 |                     - c16 - c20 - c21 - c25 - mul_add(2.0f, c17 + c19 + c22 + c24, 3.0f * (c18 + c23));
307 |             }
308 |             // Every operator except KIRSCH: magnitude = sqrt(gx^2 + gy^2) of the scaled derivatives.
309 |             if (op != KIRSCH)
310 |             {
311 |                 gx *= scale;
312 |                 gy *= scale;
313 |                 sqrt(mul_add(gx, gx, gy * gy)).store_nt(gradient + x);
314 |             }
315 |             // mode 0: shift negative angles up by M_PIF, then quantize to bins 0..3, with bin 4 wrapping to 0 (presumably M_PIF / M_1_PIF are pi and 1/pi - confirm).
316 |             if (mode == 0)
317 |             {
318 |                 Vec16f dr{ atan2(gy, gx) };
319 |                 dr = if_add(dr < 0.0f, dr, M_PIF);
320 | 
321 |                 const Vec16i bin{ truncatei(mul_add(dr, 4.0f * M_1_PIF, 0.5f)) };
322 |                 select(bin >= 4, zero_si512(), bin).store_nt(direction + x);
323 |             }
324 |         }
325 |         // Rotate the row pointers down by one row.
326 |         prev2 = prev;
327 |         prev = cur;
328 |         cur = next;
329 |         // Near the bottom edge the look-ahead pointer steps back instead of forward, mirroring the rows.
330 |         if (op != FDOG)
331 |             next += (y < height - 2) ? bgStride : -bgStride;
332 |         else
333 |         {
334 |             next = next2;
335 |             next2 += (y < height - 3) ? bgStride : -bgStride;
336 |         }
337 | 
338 |         gradient += bgStride;
339 |         direction += stride;
340 |     }
341 | }
342 | 
343 | // Non-maximum suppression, AVX-512 path (16 floats per iteration). A gradient sample is kept when it is >= the larger
344 | // of its two neighbours along its direction bin (0..3); otherwise fltLowest is stored. Results are written to blur.
345 | static void nonMaximumSuppression(const int* _direction, float* _gradient, float* blur, const int width, const int height, const int stride, const int bgStride, const int radiusAlign) noexcept
346 | {
347 |     const int lastRow{ bgStride * (height - 1) };
348 |     const int paddedWidth{ width + radiusAlign * 2 };
349 | 
350 |     // Mirror one column past each end of the first and last rows.
351 |     // NOTE(review): the rows in between are not mirrored here - confirm their padding columns are valid.
352 |     _gradient[-1] = _gradient[1];
353 |     _gradient[-1 + lastRow] = _gradient[1 + lastRow];
354 |     _gradient[width] = _gradient[width - 2];
355 |     _gradient[width + lastRow] = _gradient[width - 2 + lastRow];
356 | 
357 |     // Pad vertically: row -1 becomes a copy of row 1 and row `height` a copy of row height - 2.
358 |     std::copy_n(_gradient - radiusAlign + bgStride, paddedWidth, _gradient - radiusAlign - bgStride);
359 |     std::copy_n(_gradient - radiusAlign + bgStride * (height - 2), paddedWidth, _gradient - radiusAlign + bgStride * static_cast<int64_t>(height));
360 | 
361 |     for (int row{ 0 }; row < height; ++row, _direction += stride, _gradient += bgStride, blur += bgStride)
362 |     {
363 |         for (int col{ 0 }; col < width; col += 16)
364 |         {
365 |             const float* const mid{ _gradient + col };
366 |             const float* const above{ mid - bgStride };
367 |             const float* const below{ mid + bgStride };
368 |             const Vec16ui bin{ Vec16ui().load_a(_direction + col) };
369 | 
370 |             // Neighbour maximum for each bin, masked so that a lane only receives the value of its own bin.
371 |             const Vec16f horizontal{ max(Vec16f().load(mid + 1), Vec16f().load(mid - 1)) & Vec16fb(bin == 0) };
372 |             const Vec16f diagonalA{ max(Vec16f().load(above + 1), Vec16f().load(below - 1)) & Vec16fb(bin == 1) };
373 |             const Vec16f vertical{ max(Vec16f().load_a(above), Vec16f().load_a(below)) & Vec16fb(bin == 2) };
374 |             const Vec16f diagonalB{ max(Vec16f().load(above - 1), Vec16f().load(below + 1)) & Vec16fb(bin == 3) };
375 |             const Vec16f neighbour{ horizontal | diagonalA | vertical | diagonalB };
376 | 
377 |             // Keep the centre sample only where it is not smaller than that neighbour maximum.
378 |             const Vec16f centre{ Vec16f().load_a(mid) };
379 |             select(centre >= neighbour, centre, fltLowest).store_nt(blur + col);
380 |         }
381 |     }
382 | }
383 | 
384 | // Writes the binarized edge map (AVX-512 path, 16 samples per step): samples equal to fltMax become "white",
385 | // all others zero. "White" is 255 for uint8_t, `peak` for uint16_t and 1.0f otherwise. Strides are in elements.
386 | template<typename T>
387 | static void binarizeCE(const float* _srcp, T* dstp, const int width, const int height, const int srcStride, const int dstStride, const int peak) noexcept
388 | {
389 |     for (int row{ 0 }; row < height; ++row)
390 |     {
391 |         for (int col{ 0 }; col < width; col += 16)
392 |         {
393 |             const Vec16fb isEdge{ Vec16f().load_a(_srcp + col) == fltMax };
394 |             T* const out{ dstp + col };
395 | 
396 |             if constexpr (std::is_same_v<T, uint8_t>)
397 |             {
398 |                 const Vec16cb byteMask{ Vec16cb(isEdge) };
399 |                 select(byteMask, Vec16uc(255), zero_si128()).store_nt(out);
400 |             }
401 |             else if constexpr (std::is_same_v<T, uint16_t>)
402 |             {
403 |                 const Vec16sb wordMask{ Vec16sb(isEdge) };
404 |                 select(wordMask, Vec16us(peak), zero_si256()).store_nt(out);
405 |             }
406 |             else
407 |                 select(isEdge, Vec16f(1.0f), Vec16f(0.0f)).store_nt(out);
408 |         }
409 | 
410 |         _srcp += srcStride;
411 |         dstp += dstStride;
412 |     }
413 | }
414 | 
415 | // Converts float samples to the output type (AVX-512 path). Integer outputs use truncatei(v + 0.5f) followed by
416 | // saturating packs (uint16_t is also limited to `peak`); float output is clamped to [0, 1] only if clampFP is set.
417 | template<typename T, bool clampFP = true>
418 | static void discretizeGM(const float* _srcp, T* dstp, const int width, const int height, const int srcStride, const int dstStride, const int peak) noexcept
419 | {
420 |     for (int row{ 0 }; row < height; ++row, _srcp += srcStride, dstp += dstStride)
421 |     {
422 |         for (int col{ 0 }; col < width; col += 16)
423 |         {
424 |             const Vec16f value{ Vec16f().load_a(_srcp + col) };
425 |             T* const out{ dstp + col };
426 |             if constexpr (std::is_same_v<T, uint8_t>)
427 |             {
428 |                 const auto words = compress_saturated(truncatei(value + 0.5f), zero_si512());
429 |                 const Vec16uc bytes{ compress_saturated_s2u(words, zero_si512()).get_low().get_low() };
430 |                 bytes.store_nt(out);
431 |             }
432 |             else if constexpr (std::is_same_v<T, uint16_t>)
433 |             {
434 |                 const Vec16us words{ compress_saturated_s2u(truncatei(value + 0.5f), zero_si512()).get_low() };
435 |                 min(words, peak).store_nt(out);
436 |             }
437 |             else if constexpr (clampFP)
438 |                 min(max(value, 0.0f), 1.0f).store_nt(out);
439 |             else
440 |                 value.store_nt(out);
441 |         }
442 |     }
443 | }
444 | 
445 | template<typename T>
446 | void vsTCanny::filter_avx512(PVideoFrame& src, PVideoFrame& dst, IScriptEnvironment* env) noexcept
447 | {
448 |     const int planes_y[3]{ PLANAR_Y, PLANAR_U, PLANAR_V };
449 |     const int planes_r[3]{ PLANAR_G, PLANAR_B, PLANAR_R };
450 |     const int* current_planes{ (vi.IsRGB()) ? planes_r : planes_y };
451 |     const int planecount{ std::min(vi.NumComponents(), 3) };
452 | 
453 |     for (int i{ 0 }; i < planecount; ++i)
454 |     {
455 |         const int height{ src->GetHeight(current_planes[i]) };
456 | 
457 |         if (process[i] == 3)
458 |         {
459 |             const size_t stride{ src->GetPitch(current_planes[i]) / sizeof(T) };
460 |             const size_t bgStride{ stride + radiusAlign * 2 };
461 |             const size_t dst_stride{ dst->GetPitch(current_planes[i]) / sizeof(T) };
462 |             const size_t width{ src->GetRowSize(current_planes[i]) / sizeof(T) };
463 |             const T* srcp{ reinterpret_cast<const T*>(src->GetReadPtr(current_planes[i])) };
464 |             T* dstp{ reinterpret_cast<T*>(dst->GetWritePtr(current_planes[i])) };
465 | 
466 |             float* blur{ vsTCanny::blur + radiusAlign };
467 |             float* gradient{ vsTCanny::gradient + bgStride + radiusAlign };
468 | 
469 |             if (radiusV[i] && radiusH[i])
470 |                 gaussianBlur(srcp, gradient, blur, weightsH[i].get(), weightsV[i].get(), width, height, stride, bgStride, radiusH[i], radiusV[i]);
471 |             else if (radiusV[i])
472 |                 gaussianBlurV(srcp, blur, weightsV[i].get(), width, height, stride, bgStride, radiusV[i]);
473 |             else if (radiusH[i])
474 |                 gaussianBlurH(srcp, gradient, blur, weightsH[i].get(), width, height, stride, bgStride, radiusH[i]);
475 |             else
476 |                 copyPlane(srcp, blur, width, height, stride, bgStride);
477 | 
478 |             if (mode_ != -1)
479 |             {
480 |                 detectEdge(blur, gradient, direction, width, height, stride, bgStride, mode_, op_, scale);
481 | 
482 |                 if (mode_ == 0)
483 |                 {
484 |                     nonMaximumSuppression(direction, gradient, blur, width, height, stride, bgStride, radiusAlign);
485 |                     hysteresis(blur, found.get(), width, height, bgStride, t_h_, t_l_);
486 |                 }
487 |             }
488 | 
489 |             switch (mode_)
490 |             {
491 |                 case 0: binarizeCE(blur, dstp, width, height, bgStride, dst_stride, peak); break;
492 |                 case 1: discretizeGM(gradient, dstp, width, height, bgStride, stride, peak); break;
493 |                 default: discretizeGM<T, false>(blur, dstp, width, height, bgStride, dst_stride, peak); break;
494 |             }
495 |         }
496 |         else if (process[i] == 2)
497 |             env->BitBlt(dst->GetWritePtr(current_planes[i]), dst->GetPitch(current_planes[i]), src->GetReadPtr(current_planes[i]), src->GetPitch(current_planes[i]), src->GetRowSize(current_planes[i]), height);
498 |     }
499 | }
500 | 
// Explicit instantiations of filter_avx512 for the three sample types
// (uint8_t, uint16_t and float) so the definitions are emitted from this
// translation unit.
template void vsTCanny::filter_avx512<uint8_t>(PVideoFrame& src, PVideoFrame& dst, IScriptEnvironment* env) noexcept;
template void vsTCanny::filter_avx512<uint16_t>(PVideoFrame& src, PVideoFrame& dst, IScriptEnvironment* env) noexcept;
template void vsTCanny::filter_avx512<float>(PVideoFrame& src, PVideoFrame& dst, IScriptEnvironment* env) noexcept;
504 | 


--------------------------------------------------------------------------------
/src/vsTCanny_SSE2.cpp:
--------------------------------------------------------------------------------
  1 | #include "VCL2/vectormath_trig.h"
  2 | #include "vsTCanny.h"
  3 | 
  4 | template<typename T>
  5 | static void copyPlane(const T* srcp, float* dstp, const int width, const int height, const int srcStride, const int dstStride) noexcept
  6 | {
  7 |     for (int y{ 0 }; y < height; ++y)
  8 |     {
  9 |         for (int x{ 0 }; x < width; x += 4)
 10 |         {
 11 |             if constexpr (std::is_same_v<T, uint8_t>)
 12 |                 to_float(Vec4i().load_4uc(srcp + x)).store_nt(dstp + x);
 13 |             else if constexpr (std::is_same_v<T, uint16_t>)
 14 |                 to_float(Vec4i().load_4us(srcp + x)).store_nt(dstp + x);
 15 |             else
 16 |                 Vec4f().load_a(srcp + x).store_nt(dstp + x);
 17 |         }
 18 | 
 19 |         srcp += srcStride;
 20 |         dstp += dstStride;
 21 |     }
 22 | }
 23 | 
 24 | template<typename T>
 25 | static void gaussianBlur(const T* __srcp, float* temp, float* dstp, const float* weightsH, const float* weightsV, const int width, const int height, const int srcStride, const int dstStride, const int radiusH, const int radiusV) noexcept
 26 | {
 27 |     const int diameter{ radiusV * 2 + 1 };
 28 |     std::unique_ptr<const T* []> _srcp{ std::make_unique<const T* []>(diameter) };
 29 | 
 30 |     _srcp[radiusV] = __srcp;
 31 |     for (int i{ 1 }; i <= radiusV; ++i)
 32 |         _srcp[radiusV - i] = _srcp[radiusV + i] = _srcp[radiusV] + srcStride * i;
 33 | 
 34 |     weightsH += radiusH;
 35 | 
 36 |     for (int y{ 0 }; y < height; ++y)
 37 |     {
 38 |         for (int x{ 0 }; x < width; x += 4)
 39 |         {
 40 |             Vec4f sum{ zero_4f() };
 41 | 
 42 |             for (int i{ 0 }; i < diameter; ++i)
 43 |             {
 44 |                 if constexpr (std::is_same_v<T, uint8_t>)
 45 |                 {
 46 |                     const Vec4f srcp{ to_float(Vec4i().load_4uc(_srcp[i] + x)) };
 47 |                     sum = mul_add(srcp, weightsV[i], sum);
 48 |                 }
 49 |                 else if constexpr (std::is_same_v<T, uint16_t>)
 50 |                 {
 51 |                     const Vec4f srcp{ to_float(Vec4i().load_4us(_srcp[i] + x)) };
 52 |                     sum = mul_add(srcp, weightsV[i], sum);
 53 |                 }
 54 |                 else
 55 |                 {
 56 |                     const Vec4f srcp{ Vec4f().load_a(_srcp[i] + x) };
 57 |                     sum = mul_add(srcp, weightsV[i], sum);
 58 |                 }
 59 |             }
 60 | 
 61 |             sum.store_a(temp + x);
 62 |         }
 63 | 
 64 |         for (int i{ 1 }; i <= radiusH; ++i)
 65 |         {
 66 |             temp[-i] = temp[i];
 67 |             temp[width - 1 + i] = temp[width - 1 - i];
 68 |         }
 69 | 
 70 |         for (int x{ 0 }; x < width; x += 4)
 71 |         {
 72 |             Vec4f sum{ zero_4f() };
 73 | 
 74 |             for (int i{ -radiusH }; i <= radiusH; ++i)
 75 |             {
 76 |                 const Vec4f srcp{ Vec4f().load(temp + x + i) };
 77 |                 sum = mul_add(srcp, weightsH[i], sum);
 78 |             }
 79 | 
 80 |             sum.store_nt(dstp + x);
 81 |         }
 82 | 
 83 |         for (int i{ 0 }; i < diameter - 1; ++i)
 84 |             _srcp[i] = _srcp[i + 1];
 85 | 
 86 |         _srcp[diameter - 1] += (y < height - 1 - radiusV) ? srcStride : -srcStride;
 87 | 
 88 |         dstp += dstStride;
 89 |     }
 90 | }
 91 | 
 92 | template<typename T>
 93 | static void gaussianBlurV(const T* __srcp, float* dstp, const float* weights, const int width, const int height, const int srcStride, const int dstStride, const int radius) noexcept
 94 | {
 95 |     const int diameter{ radius * 2 + 1 };
 96 |     std::unique_ptr<const T* []> _srcp{ std::make_unique<const T* []>(diameter) };
 97 | 
 98 |     _srcp[radius] = __srcp;
 99 |     for (int i{ 1 }; i <= radius; ++i)
100 |         _srcp[radius - i] = _srcp[radius + i] = _srcp[radius] + srcStride * i;
101 | 
102 |     for (int y{ 0 }; y < height; ++y)
103 |     {
104 |         for (int x{ 0 }; x < width; x += 4)
105 |         {
106 |             Vec4f sum{ zero_4f() };
107 | 
108 |             for (int i{ 0 }; i < diameter; ++i)
109 |             {
110 |                 if constexpr (std::is_same_v<T, uint8_t>)
111 |                 {
112 |                     const Vec4f srcp{ to_float(Vec4i().load_4uc(_srcp[i] + x)) };
113 |                     sum = mul_add(srcp, weights[i], sum);
114 |                 }
115 |                 else if constexpr (std::is_same_v<T, uint16_t>)
116 |                 {
117 |                     const Vec4f srcp{ to_float(Vec4i().load_4us(_srcp[i] + x)) };
118 |                     sum = mul_add(srcp, weights[i], sum);
119 |                 }
120 |                 else
121 |                 {
122 |                     const Vec4f srcp{ Vec4f().load_a(_srcp[i] + x) };
123 |                     sum = mul_add(srcp, weights[i], sum);
124 |                 }
125 |             }
126 | 
127 |             sum.store_nt(dstp + x);
128 |         }
129 | 
130 |         for (int i{ 0 }; i < diameter - 1; ++i)
131 |             _srcp[i] = _srcp[i + 1];
132 | 
133 |         _srcp[diameter - 1] += (y < height - 1 - radius) ? srcStride : -srcStride;
134 | 
135 |         dstp += dstStride;
136 |     }
137 | }
138 | 
139 | template<typename T>
140 | static void gaussianBlurH(const T* _srcp, float* temp, float* dstp, const float* weights, const int width, const int height, const int srcStride, const int dstStride, const int radius) noexcept
141 | {
142 |     weights += radius;
143 | 
144 |     for (int y{ 0 }; y < height; ++y)
145 |     {
146 |         for (int x{ 0 }; x < width; x += 4)
147 |         {
148 |             if constexpr (std::is_same_v<T, uint8_t>)
149 |                 to_float(Vec4i().load_4uc(_srcp + x)).store_a(temp + x);
150 |             else if constexpr (std::is_same_v<T, uint16_t>)
151 |                 to_float(Vec4i().load_4us(_srcp + x)).store_a(temp + x);
152 |             else
153 |                 Vec4f().load_a(_srcp + x).store_a(temp + x);
154 |         }
155 | 
156 |         for (int i{ 1 }; i <= radius; ++i)
157 |         {
158 |             temp[-i] = temp[i];
159 |             temp[width - 1 + i] = temp[width - 1 - i];
160 |         }
161 | 
162 |         for (int x{ 0 }; x < width; x += 4)
163 |         {
164 |             Vec4f sum{ zero_4f() };
165 | 
166 |             for (int i{ -radius }; i <= radius; ++i)
167 |             {
168 |                 const Vec4f srcp{ Vec4f().load(temp + x + i) };
169 |                 sum = mul_add(srcp, weights[i], sum);
170 |             }
171 | 
172 |             sum.store_nt(dstp + x);
173 |         }
174 | 
175 |         _srcp += srcStride;
176 |         dstp += dstStride;
177 |     }
178 | }
179 | 
180 | static void detectEdge(float* blur, float* gradient, int* direction, const int width, const int height, const int stride, const int bgStride, const int mode, const int op, const float scale) noexcept
181 | {
182 |     float* __restrict cur{ blur };
183 |     float* __restrict next{ blur + bgStride };
184 |     float* __restrict next2{ blur + bgStride * 2 };
185 |     float* __restrict prev{ next };
186 |     float* __restrict prev2{ next2 };
187 | 
188 |     cur[-1] = cur[1];
189 |     cur[width] = cur[width - 2];
190 | 
191 |     if (op == FDOG)
192 |     {
193 |         cur[-2] = cur[2];
194 |         cur[width + 1] = cur[width - 3];
195 |     }
196 | 
197 |     for (int y{ 0 }; y < height; ++y)
198 |     {
199 |         next[-1] = next[1];
200 |         next[width] = next[width - 2];
201 | 
202 |         if (op == FDOG)
203 |         {
204 |             next[-2] = next[2];
205 |             next[width + 1] = next[width - 3];
206 | 
207 |             next2[-1] = next2[1];
208 |             next2[-2] = next2[2];
209 |             next2[width] = next2[width - 2];
210 |             next2[width + 1] = next2[width - 3];
211 |         }
212 | 
213 |         for (int x{ 0 }; x < width; x += 4)
214 |         {
215 |             Vec4f gx, gy;
216 | 
217 |             if (op != FDOG)
218 |             {
219 |                 const Vec4f c1{ Vec4f().load(prev + x - 1) };
220 |                 const Vec4f c2{ Vec4f().load_a(prev + x) };
221 |                 const Vec4f c3{ Vec4f().load(prev + x + 1) };
222 |                 const Vec4f c4{ Vec4f().load(cur + x - 1) };
223 |                 const Vec4f c6{ Vec4f().load(cur + x + 1) };
224 |                 const Vec4f c7{ Vec4f().load(next + x - 1) };
225 |                 const Vec4f c8{ Vec4f().load_a(next + x) };
226 |                 const Vec4f c9{ Vec4f().load(next + x + 1) };
227 | 
228 |                 switch (op)
229 |                 {
230 |                     case TRITICAL:
231 |                     {
232 |                         gx = c6 - c4;
233 |                         gy = c2 - c8;
234 |                         break;
235 |                     }
236 |                     case PREWITT:
237 |                     {
238 |                         gx = (c3 + c6 + c9 - c1 - c4 - c7) * 0.5f;
239 |                         gy = (c1 + c2 + c3 - c7 - c8 - c9) * 0.5f;
240 |                         break;
241 |                     }
242 |                     case SOBEL:
243 |                     {
244 |                         gx = c3 + mul_add(2.0f, c6, c9) - c1 - mul_add(2.0f, c4, c7);
245 |                         gy = c1 + mul_add(2.0f, c2, c3) - c7 - mul_add(2.0f, c8, c9);
246 |                         break;
247 |                     }
248 |                     case SCHARR:
249 |                     {
250 |                         gx = mul_add(3.0f, c3 + c9, 10.0f * c6) - mul_add(3.0f, c1 + c7, 10.0f * c4);
251 |                         gy = mul_add(3.0f, c1 + c3, 10.0f * c2) - mul_add(3.0f, c7 + c9, 10.0f * c8);
252 |                         break;
253 |                     }
254 |                     case KROON:
255 |                     {
256 |                         gx = mul_add(17.0f, c3 + c9, 61.0f * c6) - mul_add(17.0f, c1 + c7, 61.0f * c4);
257 |                         gy = mul_add(17.0f, c1 + c3, 61.0f * c2) - mul_add(17.0f, c7 + c9, 61.0f * c8);
258 |                         break;
259 |                     }
260 |                     case KIRSCH:
261 |                     {
262 |                         const Vec4f g1{ mul_sub(5.0f, c1 + c2 + c3, 3.0f * (c4 + c6 + c7 + c8 + c9)) };
263 |                         const Vec4f g2{ mul_sub(5.0f, c1 + c2 + c4, 3.0f * (c3 + c6 + c7 + c8 + c9)) };
264 |                         const Vec4f g3{ mul_sub(5.0f, c1 + c4 + c7, 3.0f * (c2 + c3 + c6 + c8 + c9)) };
265 |                         const Vec4f g4{ mul_sub(5.0f, c4 + c7 + c8, 3.0f * (c1 + c2 + c3 + c6 + c9)) };
266 |                         const Vec4f g5{ mul_sub(5.0f, c7 + c8 + c9, 3.0f * (c1 + c2 + c3 + c4 + c6)) };
267 |                         const Vec4f g6{ mul_sub(5.0f, c6 + c8 + c9, 3.0f * (c1 + c2 + c3 + c4 + c7)) };
268 |                         const Vec4f g7{ mul_sub(5.0f, c3 + c6 + c9, 3.0f * (c1 + c2 + c4 + c7 + c8)) };
269 |                         const Vec4f g8{ mul_sub(5.0f, c2 + c3 + c6, 3.0f * (c1 + c4 + c7 + c8 + c9)) };
270 |                         const Vec4f g{ max(max(max(abs(g1), abs(g2)), max(abs(g3), abs(g4))), max(max(abs(g5), abs(g6)), max(abs(g7), abs(g8)))) };
271 |                         (g * scale).store_nt(gradient + x);
272 |                         break;
273 |                     }
274 |                 }
275 |             }
276 |             else
277 |             {
278 |                 const Vec4f c1{ Vec4f().load(prev2 + x - 2) };
279 |                 const Vec4f c2{ Vec4f().load(prev2 + x - 1) };
280 |                 const Vec4f c3{ Vec4f().load(prev2 + x) };
281 |                 const Vec4f c4{ Vec4f().load(prev2 + x + 1) };
282 |                 const Vec4f c5{ Vec4f().load(prev2 + x + 2) };
283 |                 const Vec4f c6{ Vec4f().load(prev + x - 2) };
284 |                 const Vec4f c7{ Vec4f().load(prev + x - 1) };
285 |                 const Vec4f c8{ Vec4f().load(prev + x) };
286 |                 const Vec4f c9{ Vec4f().load(prev + x + 1) };
287 |                 const Vec4f c10{ Vec4f().load(prev + x + 2) };
288 |                 const Vec4f c11{ Vec4f().load(cur + x - 2) };
289 |                 const Vec4f c12{ Vec4f().load(cur + x - 1) };
290 |                 const Vec4f c14{ Vec4f().load(cur + x + 1) };
291 |                 const Vec4f c15{ Vec4f().load(cur + x + 2) };
292 |                 const Vec4f c16{ Vec4f().load(next + x - 2) };
293 |                 const Vec4f c17{ Vec4f().load(next + x - 1) };
294 |                 const Vec4f c18{ Vec4f().load(next + x) };
295 |                 const Vec4f c19{ Vec4f().load(next + x + 1) };
296 |                 const Vec4f c20{ Vec4f().load(next + x + 2) };
297 |                 const Vec4f c21{ Vec4f().load(next2 + x - 2) };
298 |                 const Vec4f c22{ Vec4f().load(next2 + x - 1) };
299 |                 const Vec4f c23{ Vec4f().load(next2 + x) };
300 |                 const Vec4f c24{ Vec4f().load(next2 + x + 1) };
301 |                 const Vec4f c25{ Vec4f().load(next2 + x + 2) };
302 | 
303 |                 gx = c5 + c25 + c4 + c24 + mul_add(2.0f, c10 + c20 + c9 + c19, 3.0f * (c15 + c14))
304 |                     - c2 - c22 - c1 - c21 - mul_add(2.0f, c7 + c17 + c6 + c16, 3.0f * (c12 + c11));
305 |                 gy = c1 + c5 + c6 + c10 + mul_add(2.0f, c2 + c4 + c7 + c9, 3.0f * (c3 + c8))
306 |                     - c16 - c20 - c21 - c25 - mul_add(2.0f, c17 + c19 + c22 + c24, 3.0f * (c18 + c23));
307 |             }
308 | 
309 |             if (op != KIRSCH)
310 |             {
311 |                 gx *= scale;
312 |                 gy *= scale;
313 |                 sqrt(mul_add(gx, gx, gy * gy)).store_nt(gradient + x);
314 |             }
315 | 
316 |             if (mode == 0)
317 |             {
318 |                 Vec4f dr{ atan2(gy, gx) };
319 |                 dr = if_add(dr < 0.0f, dr, M_PIF);
320 | 
321 |                 const Vec4i bin{ truncatei(mul_add(dr, 4.0f * M_1_PIF, 0.5f)) };
322 |                 select(bin >= 4, zero_si128(), bin).store_nt(direction + x);
323 |             }
324 |         }
325 | 
326 |         prev2 = prev;
327 |         prev = cur;
328 |         cur = next;
329 | 
330 |         if (op != FDOG)
331 |             next += (y < height - 2) ? bgStride : -bgStride;
332 |         else
333 |         {
334 |             next = next2;
335 |             next2 += (y < height - 3) ? bgStride : -bgStride;
336 |         }
337 | 
338 |         gradient += bgStride;
339 |         direction += stride;
340 |     }
341 | }
342 | 
343 | static void nonMaximumSuppression(const int* _direction, float* _gradient, float* blur, const int width, const int height, const int stride, const int bgStride, const int radiusAlign) noexcept
344 | {
345 |     _gradient[-1] = _gradient[1];
346 |     _gradient[-1 + bgStride * (height - 1)] = _gradient[1 + bgStride * (height - 1)];
347 |     _gradient[width] = _gradient[width - 2];
348 |     _gradient[width + bgStride * (height - 1)] = _gradient[width - 2 + bgStride * (height - 1)];
349 |     std::copy_n(_gradient - radiusAlign + bgStride, width + radiusAlign * 2, _gradient - radiusAlign - bgStride);
350 |     std::copy_n(_gradient - radiusAlign + bgStride * (height - 2), width + radiusAlign * 2, _gradient - radiusAlign + bgStride * static_cast<int64_t>(height));
351 | 
352 |     for (int y{ 0 }; y < height; ++y)
353 |     {
354 |         for (int x{ 0 }; x < width; x += 4)
355 |         {
356 |             const Vec4ui direction{ Vec4ui().load_a(_direction + x) };
357 | 
358 |             Vec4fb mask{ Vec4fb(direction == 0) };
359 |             Vec4f gradient{ max(Vec4f().load(_gradient + x + 1), Vec4f().load(_gradient + x - 1)) };
360 |             Vec4f result{ gradient & mask };
361 | 
362 |             mask = Vec4fb(direction == 1);
363 |             gradient = max(Vec4f().load(_gradient + x - bgStride + 1), Vec4f().load(_gradient + x + bgStride - 1));
364 |             result |= gradient & mask;
365 | 
366 |             mask = Vec4fb(direction == 2);
367 |             gradient = max(Vec4f().load_a(_gradient + x - bgStride), Vec4f().load_a(_gradient + x + bgStride));
368 |             result |= gradient & mask;
369 | 
370 |             mask = Vec4fb(direction == 3);
371 |             gradient = max(Vec4f().load(_gradient + x - bgStride - 1), Vec4f().load(_gradient + x + bgStride + 1));
372 |             result |= gradient & mask;
373 | 
374 |             gradient = Vec4f().load_a(_gradient + x);
375 |             select(gradient >= result, gradient, fltLowest).store_nt(blur + x);
376 |         }
377 | 
378 |         _direction += stride;
379 |         _gradient += bgStride;
380 |         blur += bgStride;
381 |     }
382 | }
383 | 
384 | template<typename T>
385 | static void binarizeCE(const float* _srcp, T* dstp, const int width, const int height, const int srcStride, const int dstStride, const int peak) noexcept
386 | {
387 |     for (int y{ 0 }; y < height; ++y)
388 |     {
389 |         for (int x{ 0 }; x < width; x += 4)
390 |         {
391 |             const Vec4f srcp{ Vec4f().load_a(_srcp + x) };
392 | 
393 |             if constexpr (std::is_same_v<T, uint8_t>)
394 |             {
395 |                 const Vec16cb mask{ Vec16cb(compress_saturated(compress_saturated(Vec4ib(srcp == fltMax), zero_si128()), zero_si128())) };
396 |                 select(mask, Vec16uc(255), zero_si128()).store_si32(dstp + x);
397 |             }
398 |             else if constexpr (std::is_same_v<T, uint16_t>)
399 |             {
400 |                 const Vec8sb mask{ Vec8sb(compress_saturated(Vec4ib(srcp == fltMax), zero_si128())) };
401 |                 select(mask, Vec8us(peak), zero_si128()).storel(dstp + x);
402 |             }
403 |             else
404 |             {
405 |                 const Vec4fb mask{ srcp == fltMax };
406 |                 select(mask, Vec4f(1.0f), Vec4f(0.0f)).store_nt(dstp + x);
407 |             }
408 |         }
409 | 
410 |         _srcp += srcStride;
411 |         dstp += dstStride;
412 |     }
413 | }
414 | 
415 | template<typename T, bool clampFP = true>
416 | static void discretizeGM(const float* _srcp, T* dstp, const int width, const int height, const int srcStride, const int dstStride, const int peak) noexcept
417 | {
418 |     for (int y{ 0 }; y < height; ++y)
419 |     {
420 |         for (int x{ 0 }; x < width; x += 4)
421 |         {
422 |             const Vec4f srcp{ Vec4f().load_a(_srcp + x) };
423 | 
424 |             if constexpr (std::is_same_v<T, uint8_t>)
425 |             {
426 |                 const Vec16uc result{ compress_saturated_s2u(compress_saturated(truncatei(srcp + 0.5f), zero_si128()), zero_si128()) };
427 |                 result.store_si32(dstp + x);
428 |             }
429 |             else if constexpr (std::is_same_v<T, uint16_t>)
430 |             {
431 |                 const Vec8us result{ compress_saturated_s2u(truncatei(srcp + 0.5f), zero_si128()) };
432 |                 min(result, peak).storel(dstp + x);
433 |             }
434 |             else if constexpr (clampFP)
435 |                 min(max(srcp, 0.0f), 1.0f).store_nt(dstp + x);
436 |             else
437 |                 srcp.store_nt(dstp + x);
438 |         }
439 | 
440 |         _srcp += srcStride;
441 |         dstp += dstStride;
442 |     }
443 | }
444 | 
445 | template<typename T>
446 | void vsTCanny::filter_sse2(PVideoFrame& src, PVideoFrame& dst, IScriptEnvironment* env) noexcept
447 | {
448 |     const int planes_y[3]{ PLANAR_Y, PLANAR_U, PLANAR_V };
449 |     const int planes_r[3]{ PLANAR_G, PLANAR_B, PLANAR_R };
450 |     const int* current_planes{ (vi.IsRGB()) ? planes_r : planes_y };
451 |     const int planecount{ std::min(vi.NumComponents(), 3) };
452 | 
453 |     for (int i{ 0 }; i < planecount; ++i)
454 |     {
455 |         const int height{ src->GetHeight(current_planes[i]) };
456 | 
457 |         if (process[i] == 3)
458 |         {
459 |             const size_t stride{ src->GetPitch(current_planes[i]) / sizeof(T) };
460 |             const size_t bgStride{ stride + radiusAlign * 2 };
461 |             const size_t dst_stride{ dst->GetPitch(current_planes[i]) / sizeof(T) };
462 |             const size_t width{ src->GetRowSize(current_planes[i]) / sizeof(T) };
463 |             const T* srcp{ reinterpret_cast<const T*>(src->GetReadPtr(current_planes[i])) };
464 |             T* dstp{ reinterpret_cast<T*>(dst->GetWritePtr(current_planes[i])) };
465 | 
466 |             float* blur{ vsTCanny::blur + radiusAlign };
467 |             float* gradient{ vsTCanny::gradient + bgStride + radiusAlign };
468 | 
469 |             if (radiusV[i] && radiusH[i])
470 |                 gaussianBlur(srcp, gradient, blur, weightsH[i].get(), weightsV[i].get(), width, height, stride, bgStride, radiusH[i], radiusV[i]);
471 |             else if (radiusV[i])
472 |                 gaussianBlurV(srcp, blur, weightsV[i].get(), width, height, stride, bgStride, radiusV[i]);
473 |             else if (radiusH[i])
474 |                 gaussianBlurH(srcp, gradient, blur, weightsH[i].get(), width, height, stride, bgStride, radiusH[i]);
475 |             else
476 |                 copyPlane(srcp, blur, width, height, stride, bgStride);
477 | 
478 |             if (mode_ != -1)
479 |             {
480 |                 detectEdge(blur, gradient, direction, width, height, stride, bgStride, mode_, op_, scale);
481 | 
482 |                 if (mode_ == 0)
483 |                 {
484 |                     nonMaximumSuppression(direction, gradient, blur, width, height, stride, bgStride, radiusAlign);
485 |                     hysteresis(blur, found.get(), width, height, bgStride, t_h_, t_l_);
486 |                 }
487 |             }
488 | 
489 |             switch (mode_)
490 |             {
491 |                 case 0: binarizeCE(blur, dstp, width, height, bgStride, dst_stride, peak); break;
492 |                 case 1: discretizeGM(gradient, dstp, width, height, bgStride, stride, peak); break;
493 |                 default: discretizeGM<T, false>(blur, dstp, width, height, bgStride, dst_stride, peak); break;
494 |             }
495 |         }
496 |         else if (process[i] == 2)
497 |             env->BitBlt(dst->GetWritePtr(current_planes[i]), dst->GetPitch(current_planes[i]), src->GetReadPtr(current_planes[i]), src->GetPitch(current_planes[i]), src->GetRowSize(current_planes[i]), height);
498 |     }
499 | }
500 | 
// Explicit instantiations of filter_sse2 for the three sample types
// (uint8_t, uint16_t and float) so the definitions are emitted from this
// translation unit.
template void vsTCanny::filter_sse2<uint8_t>(PVideoFrame& src, PVideoFrame& dst, IScriptEnvironment* env) noexcept;
template void vsTCanny::filter_sse2<uint16_t>(PVideoFrame& src, PVideoFrame& dst, IScriptEnvironment* env) noexcept;
template void vsTCanny::filter_sse2<float>(PVideoFrame& src, PVideoFrame& dst, IScriptEnvironment* env) noexcept;
504 | 


--------------------------------------------------------------------------------