├── .gitattributes
├── .gitignore
├── CAS
    ├── CAS.cpp
    ├── CAS.h
    ├── CAS.vcxproj
    ├── CAS.vcxproj.filters
    ├── CAS_AVX2.cpp
    ├── CAS_AVX512.cpp
    ├── CAS_SSE2.cpp
    └── VCL2
    │   ├── LICENSE
    │   ├── instrset.h
    │   ├── instrset_detect.cpp
    │   ├── vector_convert.h
    │   ├── vectorclass.h
    │   ├── vectorf128.h
    │   ├── vectorf256.h
    │   ├── vectorf256e.h
    │   ├── vectorf512.h
    │   ├── vectorf512e.h
    │   ├── vectori128.h
    │   ├── vectori256.h
    │   ├── vectori256e.h
    │   ├── vectori512.h
    │   ├── vectori512e.h
    │   ├── vectori512s.h
    │   ├── vectori512se.h
    │   ├── vectormath_common.h
    │   ├── vectormath_exp.h
    │   ├── vectormath_hyp.h
    │   ├── vectormath_lib.h
    │   └── vectormath_trig.h
├── LICENSE
├── README.md
└── meson.build


/.gitattributes:
--------------------------------------------------------------------------------
1 | # Auto detect text files and perform LF normalization
2 | * text=auto
3 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
  1 | # Prerequisites
  2 | *.d
  3 | 
  4 | # Compiled Object files
  5 | *.slo
  6 | *.lo
  7 | *.o
  8 | *.obj
  9 | 
 10 | # Precompiled Headers
 11 | *.gch
 12 | *.pch
 13 | 
 14 | # Compiled Dynamic libraries
 15 | *.so
 16 | *.dylib
 17 | *.dll
 18 | 
 19 | # Fortran module files
 20 | *.mod
 21 | *.smod
 22 | 
 23 | # Compiled Static libraries
 24 | *.lai
 25 | *.la
 26 | *.a
 27 | *.lib
 28 | 
 29 | # Executables
 30 | *.exe
 31 | *.out
 32 | *.app
 33 | 
 34 | # User-specific files
 35 | *.rsuser
 36 | *.suo
 37 | *.user
 38 | *.userosscache
 39 | *.sln.docstates
 40 | 
 41 | # Visual Studio 2015/2017 cache/options directory
 42 | .vs/
 43 | 
 44 | # Files built by Visual Studio
 45 | *_i.c
 46 | *_p.c
 47 | *_h.h
 48 | *.ilk
 49 | *.meta
 50 | *.obj
 51 | *.iobj
 52 | *.pch
 53 | *.pdb
 54 | *.ipdb
 55 | *.pgc
 56 | *.pgd
 57 | *.rsp
 58 | *.sbr
 59 | *.tlb
 60 | *.tli
 61 | *.tlh
 62 | *.tmp
 63 | *.tmp_proj
 64 | *_wpftmp.csproj
 65 | *.log
 66 | *.vspscc
 67 | *.vssscc
 68 | .builds
 69 | *.pidb
 70 | *.svclog
 71 | *.scc
 72 | 
 73 | # Visual C++ cache files
 74 | ipch/
 75 | *.aps
 76 | *.ncb
 77 | *.opendb
 78 | *.opensdf
 79 | *.sdf
 80 | *.cachefile
 81 | *.VC.db
 82 | *.VC.VC.opendb
 83 | 
 84 | # Visual Studio profiler
 85 | *.psess
 86 | *.vsp
 87 | *.vspx
 88 | *.sap
 89 | 
 90 | # Windows thumbnail cache files
 91 | Thumbs.db
 92 | Thumbs.db:encryptable
 93 | ehthumbs.db
 94 | ehthumbs_vista.db
 95 | 
 96 | # Dump file
 97 | *.stackdump
 98 | 
 99 | # Folder config file
100 | [Dd]esktop.ini
101 | 
102 | # Recycle Bin used on file shares
103 | $RECYCLE.BIN/
104 | 
105 | # Windows Installer files
106 | *.cab
107 | *.msi
108 | *.msix
109 | *.msm
110 | *.msp
111 | 
112 | # Windows shortcuts
113 | *.lnk
114 | 


--------------------------------------------------------------------------------
/CAS/CAS.cpp:
--------------------------------------------------------------------------------
  1 | /*
  2 |     MIT License
  3 | 
  4 |     Copyright (c) 2020 Holy Wu
  5 | 
  6 |     Permission is hereby granted, free of charge, to any person obtaining a copy
  7 |     of this software and associated documentation files (the "Software"), to deal
  8 |     in the Software without restriction, including without limitation the rights
  9 |     to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 10 |     copies of the Software, and to permit persons to whom the Software is
 11 |     furnished to do so, subject to the following conditions:
 12 | 
 13 |     The above copyright notice and this permission notice shall be included in all
 14 |     copies or substantial portions of the Software.
 15 | 
 16 |     THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 17 |     IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 18 |     FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 19 |     AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 20 |     LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 21 |     OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 22 |     SOFTWARE.
 23 | */
 24 | 
 25 | #include <cmath>
 26 | 
 27 | #include <algorithm>
 28 | #include <memory>
 29 | #include <string>
 30 | 
 31 | #include "CAS.h"
 32 | 
 33 | #ifdef CAS_X86
// Declarations of the SIMD kernels selected by casCreate(). filter_avx2 and
// filter_avx512 are defined and explicitly instantiated in CAS_AVX2.cpp and
// CAS_AVX512.cpp; filter_sse2 is presumably provided by CAS_SSE2.cpp in the
// same way (that file is not visible here - confirm).
template<typename pixel_t> extern void filter_sse2(const VSFrameRef * src, VSFrameRef * dst, const CASData * const VS_RESTRICT data, const VSAPI * vsapi) noexcept;
template<typename pixel_t> extern void filter_avx2(const VSFrameRef * src, VSFrameRef * dst, const CASData * const VS_RESTRICT data, const VSAPI * vsapi) noexcept;
template<typename pixel_t> extern void filter_avx512(const VSFrameRef * src, VSFrameRef * dst, const CASData * const VS_RESTRICT data, const VSAPI * vsapi) noexcept;
 37 | #endif
 38 | 
 39 | template<typename pixel_t>
 40 | static void filter_c(const VSFrameRef * src, VSFrameRef * dst, const CASData * const VS_RESTRICT data, const VSAPI * vsapi) noexcept {
 41 |     using var_t = std::conditional_t<std::is_integral_v<pixel_t>, int, float>;
 42 | 
 43 |     const var_t limit = std::any_cast<var_t>(data->limit);
 44 | 
 45 |     auto filtering = [&](const var_t a, const var_t b, const var_t c, const var_t d, const var_t e, const var_t f, const var_t g, const var_t h, const var_t i,
 46 |                          const float chromaOffset) noexcept {
 47 |         // Soft min and max.
 48 |         //  a b c             b
 49 |         //  d e f * 0.5  +  d e f * 0.5
 50 |         //  g h i             h
 51 |         // These are 2.0x bigger (factored out the extra multiply).
 52 |         var_t mn = std::min({ d, e, f, b, h });
 53 |         const var_t mn2 = std::min({ mn, a, c, g, i });
 54 |         mn += mn2;
 55 | 
 56 |         var_t mx = std::max({ d, e, f, b, h });
 57 |         const var_t mx2 = std::max({ mx, a, c, g, i });
 58 |         mx += mx2;
 59 | 
 60 |         if constexpr (std::is_floating_point_v<pixel_t>) {
 61 |             mn += chromaOffset;
 62 |             mx += chromaOffset;
 63 |         }
 64 | 
 65 |         // Smooth minimum distance to signal limit divided by smooth max.
 66 |         float amp = std::clamp(std::min(mn, limit - mx) / static_cast<float>(mx), 0.0f, 1.0f);
 67 | 
 68 |         // Shaping amount of sharpening.
 69 |         amp = std::sqrt(amp);
 70 | 
 71 |         // Filter shape.
 72 |         //  0 w 0
 73 |         //  w 1 w
 74 |         //  0 w 0
 75 |         const float weight = amp * data->sharpness;
 76 |         return ((b + d + f + h) * weight + e) / (1.0f + 4.0f * weight);
 77 |     };
 78 | 
 79 |     for (int plane = 0; plane < data->vi->format->numPlanes; plane++) {
 80 |         if (data->process[plane]) {
 81 |             const int width = vsapi->getFrameWidth(src, plane);
 82 |             const int height = vsapi->getFrameHeight(src, plane);
 83 |             const int stride = vsapi->getStride(src, plane) / sizeof(pixel_t);
 84 |             const pixel_t * srcp = reinterpret_cast<const pixel_t *>(vsapi->getReadPtr(src, plane));
 85 |             pixel_t * VS_RESTRICT dstp = reinterpret_cast<pixel_t *>(vsapi->getWritePtr(dst, plane));
 86 | 
 87 |             const float chromaOffset = plane ? 1.0f : 0.0f;
 88 | 
 89 |             for (int y = 0; y < height; y++) {
 90 |                 const pixel_t * above = srcp + (y == 0 ? stride : -stride);
 91 |                 const pixel_t * below = srcp + (y == height - 1 ? -stride : stride);
 92 | 
 93 |                 {
 94 |                     const float result = filtering(above[1], above[0], above[1],
 95 |                                                    srcp[1], srcp[0], srcp[1],
 96 |                                                    below[1], below[0], below[1],
 97 |                                                    chromaOffset);
 98 | 
 99 |                     if constexpr (std::is_integral_v<pixel_t>)
100 |                         dstp[0] = std::clamp(static_cast<int>(result + 0.5f), 0, data->peak);
101 |                     else
102 |                         dstp[0] = result;
103 |                 }
104 | 
105 |                 for (int x = 1; x < width - 1; x++) {
106 |                     const float result = filtering(above[x - 1], above[x], above[x + 1],
107 |                                                    srcp[x - 1], srcp[x], srcp[x + 1],
108 |                                                    below[x - 1], below[x], below[x + 1],
109 |                                                    chromaOffset);
110 | 
111 |                     if constexpr (std::is_integral_v<pixel_t>)
112 |                         dstp[x] = std::clamp(static_cast<int>(result + 0.5f), 0, data->peak);
113 |                     else
114 |                         dstp[x] = result;
115 |                 }
116 | 
117 |                 {
118 |                     const float result = filtering(above[width - 2], above[width - 1], above[width - 2],
119 |                                                    srcp[width - 2], srcp[width - 1], srcp[width - 2],
120 |                                                    below[width - 2], below[width - 1], below[width - 2],
121 |                                                    chromaOffset);
122 | 
123 |                     if constexpr (std::is_integral_v<pixel_t>)
124 |                         dstp[width - 1] = std::clamp(static_cast<int>(result + 0.5f), 0, data->peak);
125 |                     else
126 |                         dstp[width - 1] = result;
127 |                 }
128 | 
129 |                 srcp += stride;
130 |                 dstp += stride;
131 |             }
132 |         }
133 |     }
134 | }
135 | 
136 | static void VS_CC casInit(VSMap * in, VSMap * out, void ** instanceData, VSNode * node, VSCore * core, const VSAPI * vsapi) {
137 |     CASData * d = static_cast<CASData *>(*instanceData);
138 |     vsapi->setVideoInfo(d->vi, 1, node);
139 | }
140 | 
141 | static const VSFrameRef * VS_CC casGetFrame(int n, int activationReason, void ** instanceData, void ** frameData, VSFrameContext * frameCtx, VSCore * core, const VSAPI * vsapi) {
142 |     const CASData * d = static_cast<const CASData *>(*instanceData);
143 | 
144 |     if (activationReason == arInitial) {
145 |         vsapi->requestFrameFilter(n, d->node, frameCtx);
146 |     } else if (activationReason == arAllFramesReady) {
147 |         const VSFrameRef * src = vsapi->getFrameFilter(n, d->node, frameCtx);
148 |         const VSFrameRef * fr[] = { d->process[0] ? nullptr : src, d->process[1] ? nullptr : src, d->process[2] ? nullptr : src };
149 |         const int pl[] = { 0, 1, 2 };
150 |         VSFrameRef * dst = vsapi->newVideoFrame2(d->vi->format, d->vi->width, d->vi->height, fr, pl, src, core);
151 | 
152 |         d->filter(src, dst, d, vsapi);
153 | 
154 |         vsapi->freeFrame(src);
155 |         return dst;
156 |     }
157 | 
158 |     return nullptr;
159 | }
160 | 
161 | static void VS_CC casFree(void * instanceData, VSCore * core, const VSAPI * vsapi) {
162 |     CASData * d = static_cast<CASData *>(instanceData);
163 |     vsapi->freeNode(d->node);
164 |     delete d;
165 | }
166 | 
167 | static void VS_CC casCreate(const VSMap * in, VSMap * out, void * userData, VSCore * core, const VSAPI * vsapi) {
168 |     using namespace std::literals;
169 | 
170 |     std::unique_ptr<CASData> d = std::make_unique<CASData>();
171 | 
172 |     try {
173 |         d->node = vsapi->propGetNode(in, "clip", 0, nullptr);
174 |         d->vi = vsapi->getVideoInfo(d->node);
175 |         int err;
176 | 
177 |         if (!isConstantFormat(d->vi) ||
178 |             (d->vi->format->sampleType == stInteger && d->vi->format->bitsPerSample > 16) ||
179 |             (d->vi->format->sampleType == stFloat && d->vi->format->bitsPerSample != 32))
180 |             throw "only constant format 8-16 bit integer and 32 bit float input supported";
181 | 
182 |         for (int plane = 0; plane < d->vi->format->numPlanes; plane++) {
183 |             if (d->vi->width >> (plane ? d->vi->format->subSamplingW : 0) < 3)
184 |                 throw "plane's width must be greater than or equal to 3";
185 | 
186 |             if (d->vi->height >> (plane ? d->vi->format->subSamplingH : 0) < 3)
187 |                 throw "plane's height must be greater than or equal to 3";
188 |         }
189 | 
190 |         d->sharpness = static_cast<float>(vsapi->propGetFloat(in, "sharpness", 0, &err));
191 |         if (err)
192 |             d->sharpness = 0.5f;
193 | 
194 |         {
195 |             const int m = vsapi->propNumElements(in, "planes");
196 | 
197 |             if (m <= 0) {
198 |                 for (int i = 0; i < 3; i++) {
199 |                     d->process[i] = true;
200 |                     if (i == 0 && d->vi->format->colorFamily != cmRGB)
201 |                         break;
202 |                 }
203 |             }
204 | 
205 |             for (int i = 0; i < m; i++) {
206 |                 const int n = int64ToIntS(vsapi->propGetInt(in, "planes", i, nullptr));
207 | 
208 |                 if (n < 0 || n >= d->vi->format->numPlanes)
209 |                     throw "plane index out of range";
210 | 
211 |                 if (d->process[n])
212 |                     throw "plane specified twice";
213 | 
214 |                 d->process[n] = true;
215 |             }
216 |         }
217 | 
218 |         const int opt = int64ToIntS(vsapi->propGetInt(in, "opt", 0, &err));
219 | 
220 |         if (d->sharpness < 0.0f || d->sharpness > 1.0f)
221 |             throw "sharpness must be between 0.0 and 1.0 (inclusive)";
222 | 
223 |         if (opt < 0 || opt > 4)
224 |             throw "opt must be 0, 1, 2, 3, or 4";
225 | 
226 |         {
227 |             if (d->vi->format->bytesPerSample == 1)
228 |                 d->filter = filter_c<uint8_t>;
229 |             else if (d->vi->format->bytesPerSample == 2)
230 |                 d->filter = filter_c<uint16_t>;
231 |             else
232 |                 d->filter = filter_c<float>;
233 | 
234 | #ifdef CAS_X86
235 |             const int iset = instrset_detect();
236 |             if ((opt == 0 && iset >= 10) || opt == 4) {
237 |                 if (d->vi->format->bytesPerSample == 1)
238 |                     d->filter = filter_avx512<uint8_t>;
239 |                 else if (d->vi->format->bytesPerSample == 2)
240 |                     d->filter = filter_avx512<uint16_t>;
241 |                 else
242 |                     d->filter = filter_avx512<float>;
243 |             } else if ((opt == 0 && iset >= 8) || opt == 3) {
244 |                 if (d->vi->format->bytesPerSample == 1)
245 |                     d->filter = filter_avx2<uint8_t>;
246 |                 else if (d->vi->format->bytesPerSample == 2)
247 |                     d->filter = filter_avx2<uint16_t>;
248 |                 else
249 |                     d->filter = filter_avx2<float>;
250 |             } else if ((opt == 0 && iset >= 2) || opt == 2) {
251 |                 if (d->vi->format->bytesPerSample == 1)
252 |                     d->filter = filter_sse2<uint8_t>;
253 |                 else if (d->vi->format->bytesPerSample == 2)
254 |                     d->filter = filter_sse2<uint16_t>;
255 |                 else
256 |                     d->filter = filter_sse2<float>;
257 |             }
258 | #endif
259 |         }
260 | 
261 |         auto lerp = [](const float a, const float b, const float t) noexcept { return a + (b - a) * t; };
262 |         d->sharpness = -1.0f / lerp(16.0f, 5.0f, d->sharpness);
263 | 
264 |         if (d->vi->format->sampleType == stInteger) {
265 |             d->limit = (1 << (d->vi->format->bitsPerSample + 1)) - 1;
266 |             d->peak = (1 << d->vi->format->bitsPerSample) - 1;
267 |         } else {
268 |             d->limit = 2.0f;
269 |         }
270 |     } catch (const char * error) {
271 |         vsapi->setError(out, ("CAS: "s + error).c_str());
272 |         vsapi->freeNode(d->node);
273 |         return;
274 |     }
275 | 
276 |     vsapi->createFilter(in, out, "CAS", casInit, casGetFrame, casFree, fmParallel, 0, d.release(), core);
277 | }
278 | 
279 | //////////////////////////////////////////
280 | // Init
281 | 
282 | VS_EXTERNAL_API(void) VapourSynthPluginInit(VSConfigPlugin configFunc, VSRegisterFunction registerFunc, VSPlugin * plugin) {
283 |     configFunc("com.holywu.cas", "cas", "Contrast Adaptive Sharpening", VAPOURSYNTH_API_VERSION, 1, plugin);
284 |     registerFunc("CAS",
285 |                  "clip:clip;"
286 |                  "sharpness:float:opt;"
287 |                  "planes:int[]:opt;"
288 |                  "opt:int:opt;",
289 |                  casCreate, nullptr, plugin);
290 | }
291 | 


--------------------------------------------------------------------------------
/CAS/CAS.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | #include <any>
 4 | #include <type_traits>
 5 | 
 6 | #include <VapourSynth.h>
 7 | #include <VSHelper.h>
 8 | 
 9 | #ifdef CAS_X86
10 | #include "VCL2/vectorclass.h"
11 | #endif
12 | 
// Per-instance state of the CAS filter. It is filled in by casCreate, read by
// the frame callback and the kernels, and destroyed in casFree.
struct CASData final {
    // Input clip; released with freeNode in casFree (and on casCreate failure).
    VSNodeRef * node;
    // Video info of the input clip, also reported as the output format.
    const VSVideoInfo * vi;
    // Kernel weight factor. casCreate first stores the user value (0..1) and
    // then replaces it with -1 / lerp(16, 5, sharpness).
    float sharpness;
    // process[p] is true when plane p is filtered; other planes are taken
    // from the source frame unchanged.
    bool process[3];
    // Signal limit used by the amplitude computation. Holds an int for
    // integer formats ((1 << (bits + 1)) - 1) and a float (2.0f) for float
    // formats; the kernels any_cast it to the matching type.
    std::any limit;
    // Maximum integer sample value ((1 << bits) - 1); only assigned for
    // integer formats.
    int peak;
    // Kernel chosen in casCreate for the sample type and instruction set.
    void (*filter)(const VSFrameRef * src, VSFrameRef * dst, const CASData * const VS_RESTRICT data, const VSAPI * vsapi) noexcept;
};
22 | 


--------------------------------------------------------------------------------
/CAS/CAS.vcxproj:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" encoding="utf-8"?>
 2 | <Project DefaultTargets="Build" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
 3 |   <ItemGroup Label="ProjectConfigurations">
 4 |     <ProjectConfiguration Include="Release|x64">
 5 |       <Configuration>Release</Configuration>
 6 |       <Platform>x64</Platform>
 7 |     </ProjectConfiguration>
 8 |   </ItemGroup>
 9 |   <PropertyGroup Label="Globals">
10 |     <VCProjectVersion>16.0</VCProjectVersion>
11 |     <Keyword>Win32Proj</Keyword>
12 |     <ProjectGuid>{9bb46411-255f-4695-b047-3d09ecdd1e41}</ProjectGuid>
13 |     <RootNamespace>CAS</RootNamespace>
14 |     <WindowsTargetPlatformVersion>10.0</WindowsTargetPlatformVersion>
15 |   </PropertyGroup>
16 |   <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
17 |   <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
18 |     <ConfigurationType>DynamicLibrary</ConfigurationType>
19 |     <UseDebugLibraries>false</UseDebugLibraries>
20 |     <PlatformToolset>v142</PlatformToolset>
21 |     <WholeProgramOptimization>true</WholeProgramOptimization>
22 |     <CharacterSet>Unicode</CharacterSet>
23 |   </PropertyGroup>
24 |   <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
25 |   <ImportGroup Label="ExtensionSettings">
26 |   </ImportGroup>
27 |   <ImportGroup Label="Shared">
28 |   </ImportGroup>
29 |   <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
30 |     <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
31 |   </ImportGroup>
32 |   <PropertyGroup Label="UserMacros" />
33 |   <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
34 |     <IncludePath>C:\Program Files\VapourSynth\sdk\include\vapoursynth;$(IncludePath)</IncludePath>
35 |   </PropertyGroup>
36 |   <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
37 |     <ClCompile>
38 |       <PreprocessorDefinitions>CAS_X86;NDEBUG;%(PreprocessorDefinitions)</PreprocessorDefinitions>
39 |       <WarningLevel>Level3</WarningLevel>
40 |       <MultiProcessorCompilation>true</MultiProcessorCompilation>
41 |       <BufferSecurityCheck>false</BufferSecurityCheck>
42 |       <FloatingPointExceptions>false</FloatingPointExceptions>
43 |       <ConformanceMode>true</ConformanceMode>
44 |       <LanguageStandard>stdcpp17</LanguageStandard>
45 |     </ClCompile>
46 |     <Link>
47 |       <SubSystem>Windows</SubSystem>
48 |       <EnableCOMDATFolding>true</EnableCOMDATFolding>
49 |       <OptimizeReferences>true</OptimizeReferences>
50 |     </Link>
51 |   </ItemDefinitionGroup>
52 |   <ItemGroup>
53 |     <ClCompile Include="CAS.cpp" />
54 |     <ClCompile Include="CAS_AVX2.cpp">
55 |       <EnableEnhancedInstructionSet Condition="'$(Configuration)|$(Platform)'=='Release|x64'">AdvancedVectorExtensions2</EnableEnhancedInstructionSet>
56 |     </ClCompile>
57 |     <ClCompile Include="CAS_AVX512.cpp">
58 |       <EnableEnhancedInstructionSet Condition="'$(Configuration)|$(Platform)'=='Release|x64'">AdvancedVectorExtensions512</EnableEnhancedInstructionSet>
59 |     </ClCompile>
60 |     <ClCompile Include="CAS_SSE2.cpp" />
61 |     <ClCompile Include="VCL2\instrset_detect.cpp" />
62 |   </ItemGroup>
63 |   <ItemGroup>
64 |     <ClInclude Include="CAS.h" />
65 |   </ItemGroup>
66 |   <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
67 |   <ImportGroup Label="ExtensionTargets">
68 |   </ImportGroup>
69 | </Project>


--------------------------------------------------------------------------------
/CAS/CAS.vcxproj.filters:
--------------------------------------------------------------------------------
 1 | ﻿<?xml version="1.0" encoding="utf-8"?>
 2 | <Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
 3 |   <ItemGroup>
 4 |     <Filter Include="Source Files">
 5 |       <UniqueIdentifier>{4FC737F1-C7A5-4376-A066-2A32D752A2FF}</UniqueIdentifier>
 6 |       <Extensions>cpp;c;cc;cxx;c++;def;odl;idl;hpj;bat;asm;asmx</Extensions>
 7 |     </Filter>
 8 |     <Filter Include="Header Files">
 9 |       <UniqueIdentifier>{93995380-89BD-4b04-88EB-625FBE52EBFB}</UniqueIdentifier>
10 |       <Extensions>h;hh;hpp;hxx;h++;hm;inl;inc;ipp;xsd</Extensions>
11 |     </Filter>
12 |     <Filter Include="Resource Files">
13 |       <UniqueIdentifier>{67DA6AB6-F800-4c08-8B7A-83BB121AAD01}</UniqueIdentifier>
14 |       <Extensions>rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav;mfcribbon-ms</Extensions>
15 |     </Filter>
16 |   </ItemGroup>
17 |   <ItemGroup>
18 |     <ClCompile Include="CAS.cpp">
19 |       <Filter>Source Files</Filter>
20 |     </ClCompile>
21 |     <ClCompile Include="CAS_SSE2.cpp">
22 |       <Filter>Source Files</Filter>
23 |     </ClCompile>
24 |     <ClCompile Include="CAS_AVX2.cpp">
25 |       <Filter>Source Files</Filter>
26 |     </ClCompile>
27 |     <ClCompile Include="CAS_AVX512.cpp">
28 |       <Filter>Source Files</Filter>
29 |     </ClCompile>
30 |     <ClCompile Include="VCL2\instrset_detect.cpp">
31 |       <Filter>Source Files</Filter>
32 |     </ClCompile>
33 |   </ItemGroup>
34 |   <ItemGroup>
35 |     <ClInclude Include="CAS.h">
36 |       <Filter>Header Files</Filter>
37 |     </ClInclude>
38 |   </ItemGroup>
39 | </Project>


--------------------------------------------------------------------------------
/CAS/CAS_AVX2.cpp:
--------------------------------------------------------------------------------
  1 | #ifdef CAS_X86
  2 | #include "CAS.h"
  3 | 
  4 | template<typename pixel_t>
  5 | void filter_avx2(const VSFrameRef * src, VSFrameRef * dst, const CASData * const VS_RESTRICT data, const VSAPI * vsapi) noexcept {
  6 |     using var_t = std::conditional_t<std::is_integral_v<pixel_t>, int, float>;
  7 |     using vec_t = std::conditional_t<std::is_integral_v<pixel_t>, Vec8i, Vec8f>;
  8 | 
  9 |     const vec_t limit = std::any_cast<var_t>(data->limit);
 10 | 
 11 |     auto load = [](const pixel_t * srcp) noexcept {
 12 |         if constexpr (std::is_same_v<pixel_t, uint8_t>)
 13 |             return vec_t().load_8uc(srcp);
 14 |         else if constexpr (std::is_same_v<pixel_t, uint16_t>)
 15 |             return vec_t().load_8us(srcp);
 16 |         else
 17 |             return vec_t().load(srcp);
 18 |     };
 19 | 
 20 |     auto store = [&](const Vec8f srcp, pixel_t * dstp) noexcept {
 21 |         if constexpr (std::is_same_v<pixel_t, uint8_t>) {
 22 |             const auto result = compress_saturated_s2u(compress_saturated(truncatei(srcp + 0.5f), zero_si256()), zero_si256()).get_low();
 23 |             result.storel(dstp);
 24 |         } else if constexpr (std::is_same_v<pixel_t, uint16_t>) {
 25 |             const auto result = compress_saturated_s2u(truncatei(srcp + 0.5f), zero_si256()).get_low();
 26 |             min(result, data->peak).store_nt(dstp);
 27 |         } else {
 28 |             srcp.store_nt(dstp);
 29 |         }
 30 |     };
 31 | 
 32 |     auto filtering = [&](const vec_t a, const vec_t b, const vec_t c, const vec_t d, const vec_t e, const vec_t f, const vec_t g, const vec_t h, const vec_t i,
 33 |                          const Vec8f chromaOffset) noexcept {
 34 |         // Soft min and max.
 35 |         //  a b c             b
 36 |         //  d e f * 0.5  +  d e f * 0.5
 37 |         //  g h i             h
 38 |         // These are 2.0x bigger (factored out the extra multiply).
 39 |         vec_t mn = min(min(min(d, e), min(f, b)), h);
 40 |         const vec_t mn2 = min(min(min(mn, a), min(c, g)), i);
 41 |         mn += mn2;
 42 | 
 43 |         vec_t mx = max(max(max(d, e), max(f, b)), h);
 44 |         const vec_t mx2 = max(max(max(mx, a), max(c, g)), i);
 45 |         mx += mx2;
 46 | 
 47 |         if constexpr (std::is_floating_point_v<pixel_t>) {
 48 |             mn += chromaOffset;
 49 |             mx += chromaOffset;
 50 |         }
 51 | 
 52 |         // Smooth minimum distance to signal limit divided by smooth max.
 53 |         Vec8f amp;
 54 |         if constexpr (std::is_integral_v<pixel_t>)
 55 |             amp = min(max(to_float(min(mn, limit - mx)) / to_float(mx), 0.0f), 1.0f);
 56 |         else
 57 |             amp = min(max(min(mn, limit - mx) / mx, 0.0f), 1.0f);
 58 | 
 59 |         // Shaping amount of sharpening.
 60 |         amp = sqrt(amp);
 61 | 
 62 |         // Filter shape.
 63 |         //  0 w 0
 64 |         //  w 1 w
 65 |         //  0 w 0
 66 |         const Vec8f weight = amp * data->sharpness;
 67 |         if constexpr (std::is_integral_v<pixel_t>)
 68 |             return mul_add(to_float((b + d) + (f + h)), weight, to_float(e)) / mul_add(4.0f, weight, 1.0f);
 69 |         else
 70 |             return mul_add((b + d) + (f + h), weight, e) / mul_add(4.0f, weight, 1.0f);
 71 |     };
 72 | 
 73 |     for (int plane = 0; plane < data->vi->format->numPlanes; plane++) {
 74 |         if (data->process[plane]) {
 75 |             const int width = vsapi->getFrameWidth(src, plane);
 76 |             const int height = vsapi->getFrameHeight(src, plane);
 77 |             const int stride = vsapi->getStride(src, plane) / sizeof(pixel_t);
 78 |             const pixel_t * srcp = reinterpret_cast<const pixel_t *>(vsapi->getReadPtr(src, plane));
 79 |             pixel_t * dstp = reinterpret_cast<pixel_t *>(vsapi->getWritePtr(dst, plane));
 80 | 
 81 |             const Vec8f chromaOffset = plane ? 1.0f : 0.0f;
 82 | 
 83 |             const int regularPart = (width - 1) & ~(vec_t().size() - 1);
 84 | 
 85 |             for (int y = 0; y < height; y++) {
 86 |                 const pixel_t * above = srcp + (y == 0 ? stride : -stride);
 87 |                 const pixel_t * below = srcp + (y == height - 1 ? -stride : stride);
 88 | 
 89 |                 {
 90 |                     const vec_t b = load(above + 0);
 91 |                     const vec_t e = load(srcp + 0);
 92 |                     const vec_t h = load(below + 0);
 93 | 
 94 |                     const vec_t a = permute8<1, 0, 1, 2, 3, 4, 5, 6>(b);
 95 |                     const vec_t d = permute8<1, 0, 1, 2, 3, 4, 5, 6>(e);
 96 |                     const vec_t g = permute8<1, 0, 1, 2, 3, 4, 5, 6>(h);
 97 | 
 98 |                     vec_t c, f, i;
 99 |                     if (width > vec_t().size()) {
100 |                         c = load(above + 1);
101 |                         f = load(srcp + 1);
102 |                         i = load(below + 1);
103 |                     } else {
104 |                         c = permute8<1, 2, 3, 4, 5, 6, 7, 6>(b);
105 |                         f = permute8<1, 2, 3, 4, 5, 6, 7, 6>(e);
106 |                         i = permute8<1, 2, 3, 4, 5, 6, 7, 6>(h);
107 |                     }
108 | 
109 |                     const Vec8f result = filtering(a, b, c,
110 |                                                    d, e, f,
111 |                                                    g, h, i,
112 |                                                    chromaOffset);
113 | 
114 |                     store(result, dstp + 0);
115 |                 }
116 | 
117 |                 for (int x = vec_t().size(); x < regularPart; x += vec_t().size()) {
118 |                     const Vec8f result = filtering(load(above + x - 1), load(above + x), load(above + x + 1),
119 |                                                    load(srcp + x - 1), load(srcp + x), load(srcp + x + 1),
120 |                                                    load(below + x - 1), load(below + x), load(below + x + 1),
121 |                                                    chromaOffset);
122 | 
123 |                     store(result, dstp + x);
124 |                 }
125 | 
126 |                 if (regularPart >= vec_t().size()) {
127 |                     const vec_t a = load(above + regularPart - 1);
128 |                     const vec_t d = load(srcp + regularPart - 1);
129 |                     const vec_t g = load(below + regularPart - 1);
130 | 
131 |                     const vec_t b = load(above + regularPart);
132 |                     const vec_t e = load(srcp + regularPart);
133 |                     const vec_t h = load(below + regularPart);
134 | 
135 |                     const vec_t c = permute8<1, 2, 3, 4, 5, 6, 7, 6>(b);
136 |                     const vec_t f = permute8<1, 2, 3, 4, 5, 6, 7, 6>(e);
137 |                     const vec_t i = permute8<1, 2, 3, 4, 5, 6, 7, 6>(h);
138 | 
139 |                     const Vec8f result = filtering(a, b, c,
140 |                                                    d, e, f,
141 |                                                    g, h, i,
142 |                                                    chromaOffset);
143 | 
144 |                     store(result, dstp + regularPart);
145 |                 }
146 | 
147 |                 srcp += stride;
148 |                 dstp += stride;
149 |             }
150 |         }
151 |     }
152 | }
153 | 
// Explicit instantiations for the three sample types referenced from
// casCreate in CAS.cpp, which only sees the extern declarations.
template void filter_avx2<uint8_t>(const VSFrameRef * src, VSFrameRef * dst, const CASData * const VS_RESTRICT data, const VSAPI * vsapi) noexcept;
template void filter_avx2<uint16_t>(const VSFrameRef * src, VSFrameRef * dst, const CASData * const VS_RESTRICT data, const VSAPI * vsapi) noexcept;
template void filter_avx2<float>(const VSFrameRef * src, VSFrameRef * dst, const CASData * const VS_RESTRICT data, const VSAPI * vsapi) noexcept;
157 | #endif
158 | 


--------------------------------------------------------------------------------
/CAS/CAS_AVX512.cpp:
--------------------------------------------------------------------------------
  1 | #ifdef CAS_X86
  2 | #include "CAS.h"
  3 | 
  4 | template<typename pixel_t>
  5 | void filter_avx512(const VSFrameRef * src, VSFrameRef * dst, const CASData * const VS_RESTRICT data, const VSAPI * vsapi) noexcept {
  6 |     using var_t = std::conditional_t<std::is_integral_v<pixel_t>, int, float>;
  7 |     using vec_t = std::conditional_t<std::is_integral_v<pixel_t>, Vec16i, Vec16f>;
  8 | 
  9 |     const vec_t limit = std::any_cast<var_t>(data->limit);
 10 | 
 11 |     auto load = [](const pixel_t * srcp) noexcept {
 12 |         if constexpr (std::is_same_v<pixel_t, uint8_t>)
 13 |             return vec_t().load_16uc(srcp);
 14 |         else if constexpr (std::is_same_v<pixel_t, uint16_t>)
 15 |             return vec_t().load_16us(srcp);
 16 |         else
 17 |             return vec_t().load(srcp);
 18 |     };
 19 | 
 20 |     auto store = [&](const Vec16f srcp, pixel_t * dstp) noexcept {
 21 |         if constexpr (std::is_same_v<pixel_t, uint8_t>) {
 22 |             const auto result = compress_saturated_s2u(compress_saturated(truncatei(srcp + 0.5f), zero_si512()), zero_si512()).get_low().get_low();
 23 |             result.store_nt(dstp);
 24 |         } else if constexpr (std::is_same_v<pixel_t, uint16_t>) {
 25 |             const auto result = compress_saturated_s2u(truncatei(srcp + 0.5f), zero_si512()).get_low();
 26 |             min(result, data->peak).store_nt(dstp);
 27 |         } else {
 28 |             srcp.store_nt(dstp);
 29 |         }
 30 |     };
 31 | 
 32 |     auto filtering = [&](const vec_t a, const vec_t b, const vec_t c, const vec_t d, const vec_t e, const vec_t f, const vec_t g, const vec_t h, const vec_t i,
 33 |                          const Vec16f chromaOffset) noexcept {
 34 |         // Soft min and max.
 35 |         //  a b c             b
 36 |         //  d e f * 0.5  +  d e f * 0.5
 37 |         //  g h i             h
 38 |         // These are 2.0x bigger (factored out the extra multiply).
 39 |         vec_t mn = min(min(min(d, e), min(f, b)), h);
 40 |         const vec_t mn2 = min(min(min(mn, a), min(c, g)), i);
 41 |         mn += mn2;
 42 | 
 43 |         vec_t mx = max(max(max(d, e), max(f, b)), h);
 44 |         const vec_t mx2 = max(max(max(mx, a), max(c, g)), i);
 45 |         mx += mx2;
 46 | 
 47 |         if constexpr (std::is_floating_point_v<pixel_t>) {
 48 |             mn += chromaOffset;
 49 |             mx += chromaOffset;
 50 |         }
 51 | 
 52 |         // Smooth minimum distance to signal limit divided by smooth max.
 53 |         Vec16f amp;
 54 |         if constexpr (std::is_integral_v<pixel_t>)
 55 |             amp = min(max(to_float(min(mn, limit - mx)) / to_float(mx), 0.0f), 1.0f);
 56 |         else
 57 |             amp = min(max(min(mn, limit - mx) / mx, 0.0f), 1.0f);
 58 | 
 59 |         // Shaping amount of sharpening.
 60 |         amp = sqrt(amp);
 61 | 
 62 |         // Filter shape.
 63 |         //  0 w 0
 64 |         //  w 1 w
 65 |         //  0 w 0
 66 |         const Vec16f weight = amp * data->sharpness;
 67 |         if constexpr (std::is_integral_v<pixel_t>)
 68 |             return mul_add(to_float((b + d) + (f + h)), weight, to_float(e)) / mul_add(4.0f, weight, 1.0f);
 69 |         else
 70 |             return mul_add((b + d) + (f + h), weight, e) / mul_add(4.0f, weight, 1.0f);
 71 |     };
 72 | 
 73 |     for (int plane = 0; plane < data->vi->format->numPlanes; plane++) {
 74 |         if (data->process[plane]) {
 75 |             const int width = vsapi->getFrameWidth(src, plane);
 76 |             const int height = vsapi->getFrameHeight(src, plane);
 77 |             const int stride = vsapi->getStride(src, plane) / sizeof(pixel_t);
 78 |             const pixel_t * srcp = reinterpret_cast<const pixel_t *>(vsapi->getReadPtr(src, plane));
 79 |             pixel_t * dstp = reinterpret_cast<pixel_t *>(vsapi->getWritePtr(dst, plane));
 80 | 
 81 |             const Vec16f chromaOffset = plane ? 1.0f : 0.0f;
 82 | 
 83 |             const int regularPart = (width - 1) & ~(vec_t().size() - 1);
 84 | 
 85 |             for (int y = 0; y < height; y++) {
 86 |                 const pixel_t * above = srcp + (y == 0 ? stride : -stride);
 87 |                 const pixel_t * below = srcp + (y == height - 1 ? -stride : stride);
 88 | 
 89 |                 {
 90 |                     const vec_t b = load(above + 0);
 91 |                     const vec_t e = load(srcp + 0);
 92 |                     const vec_t h = load(below + 0);
 93 | 
 94 |                     const vec_t a = permute16<1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14>(b);
 95 |                     const vec_t d = permute16<1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14>(e);
 96 |                     const vec_t g = permute16<1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14>(h);
 97 | 
 98 |                     vec_t c, f, i;
 99 |                     if (width > vec_t().size()) {
100 |                         c = load(above + 1);
101 |                         f = load(srcp + 1);
102 |                         i = load(below + 1);
103 |                     } else {
104 |                         c = permute16<1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 14>(b);
105 |                         f = permute16<1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 14>(e);
106 |                         i = permute16<1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 14>(h);
107 |                     }
108 | 
109 |                     const Vec16f result = filtering(a, b, c,
110 |                                                     d, e, f,
111 |                                                     g, h, i,
112 |                                                     chromaOffset);
113 | 
114 |                     store(result, dstp + 0);
115 |                 }
116 | 
117 |                 for (int x = vec_t().size(); x < regularPart; x += vec_t().size()) {
118 |                     const Vec16f result = filtering(load(above + x - 1), load(above + x), load(above + x + 1),
119 |                                                     load(srcp + x - 1), load(srcp + x), load(srcp + x + 1),
120 |                                                     load(below + x - 1), load(below + x), load(below + x + 1),
121 |                                                     chromaOffset);
122 | 
123 |                     store(result, dstp + x);
124 |                 }
125 | 
126 |                 if (regularPart >= vec_t().size()) {
127 |                     const vec_t a = load(above + regularPart - 1);
128 |                     const vec_t d = load(srcp + regularPart - 1);
129 |                     const vec_t g = load(below + regularPart - 1);
130 | 
131 |                     const vec_t b = load(above + regularPart);
132 |                     const vec_t e = load(srcp + regularPart);
133 |                     const vec_t h = load(below + regularPart);
134 | 
135 |                     const vec_t c = permute16<1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 14>(b);
136 |                     const vec_t f = permute16<1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 14>(e);
137 |                     const vec_t i = permute16<1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 14>(h);
138 | 
139 |                     const Vec16f result = filtering(a, b, c,
140 |                                                     d, e, f,
141 |                                                     g, h, i,
142 |                                                     chromaOffset);
143 | 
144 |                     store(result, dstp + regularPart);
145 |                 }
146 | 
147 |                 srcp += stride;
148 |                 dstp += stride;
149 |             }
150 |         }
151 |     }
152 | }
153 | 
// Explicit instantiations: emit filter_avx512 for uint8_t, uint16_t and float samples from this translation unit.
template void filter_avx512<uint8_t>(const VSFrameRef * src, VSFrameRef * dst, const CASData * const VS_RESTRICT data, const VSAPI * vsapi) noexcept;
template void filter_avx512<uint16_t>(const VSFrameRef * src, VSFrameRef * dst, const CASData * const VS_RESTRICT data, const VSAPI * vsapi) noexcept;
template void filter_avx512<float>(const VSFrameRef * src, VSFrameRef * dst, const CASData * const VS_RESTRICT data, const VSAPI * vsapi) noexcept;
157 | #endif
158 | 


--------------------------------------------------------------------------------
/CAS/CAS_SSE2.cpp:
--------------------------------------------------------------------------------
  1 | #ifdef CAS_X86
  2 | #include "CAS.h"
  3 | 
  4 | template<typename pixel_t>
  5 | void filter_sse2(const VSFrameRef * src, VSFrameRef * dst, const CASData * const VS_RESTRICT data, const VSAPI * vsapi) noexcept {
  6 |     using var_t = std::conditional_t<std::is_integral_v<pixel_t>, int, float>;
  7 |     using vec_t = std::conditional_t<std::is_integral_v<pixel_t>, Vec4i, Vec4f>;
  8 | 
  9 |     const vec_t limit = std::any_cast<var_t>(data->limit);
 10 | 
 11 |     auto load = [](const pixel_t * srcp) noexcept {
 12 |         if constexpr (std::is_same_v<pixel_t, uint8_t>)
 13 |             return vec_t().load_4uc(srcp);
 14 |         else if constexpr (std::is_same_v<pixel_t, uint16_t>)
 15 |             return vec_t().load_4us(srcp);
 16 |         else
 17 |             return vec_t().load(srcp);
 18 |     };
 19 | 
 20 |     auto store = [&](const Vec4f srcp, pixel_t * dstp) noexcept {
 21 |         if constexpr (std::is_same_v<pixel_t, uint8_t>) {
 22 |             const auto result = compress_saturated_s2u(compress_saturated(truncatei(srcp + 0.5f), zero_si128()), zero_si128());
 23 |             result.store_si32(dstp);
 24 |         } else if constexpr (std::is_same_v<pixel_t, uint16_t>) {
 25 |             const auto result = compress_saturated_s2u(truncatei(srcp + 0.5f), zero_si128());
 26 |             min(result, data->peak).storel(dstp);
 27 |         } else {
 28 |             srcp.store_nt(dstp);
 29 |         }
 30 |     };
 31 | 
 32 |     auto filtering = [&](const vec_t a, const vec_t b, const vec_t c, const vec_t d, const vec_t e, const vec_t f, const vec_t g, const vec_t h, const vec_t i,
 33 |                          const Vec4f chromaOffset) noexcept {
 34 |         // Soft min and max.
 35 |         //  a b c             b
 36 |         //  d e f * 0.5  +  d e f * 0.5
 37 |         //  g h i             h
 38 |         // These are 2.0x bigger (factored out the extra multiply).
 39 |         vec_t mn = min(min(min(d, e), min(f, b)), h);
 40 |         const vec_t mn2 = min(min(min(mn, a), min(c, g)), i);
 41 |         mn += mn2;
 42 | 
 43 |         vec_t mx = max(max(max(d, e), max(f, b)), h);
 44 |         const vec_t mx2 = max(max(max(mx, a), max(c, g)), i);
 45 |         mx += mx2;
 46 | 
 47 |         if constexpr (std::is_floating_point_v<pixel_t>) {
 48 |             mn += chromaOffset;
 49 |             mx += chromaOffset;
 50 |         }
 51 | 
 52 |         // Smooth minimum distance to signal limit divided by smooth max.
 53 |         Vec4f amp;
 54 |         if constexpr (std::is_integral_v<pixel_t>)
 55 |             amp = min(max(to_float(min(mn, limit - mx)) / to_float(mx), 0.0f), 1.0f);
 56 |         else
 57 |             amp = min(max(min(mn, limit - mx) / mx, 0.0f), 1.0f);
 58 | 
 59 |         // Shaping amount of sharpening.
 60 |         amp = sqrt(amp);
 61 | 
 62 |         // Filter shape.
 63 |         //  0 w 0
 64 |         //  w 1 w
 65 |         //  0 w 0
 66 |         const Vec4f weight = amp * data->sharpness;
 67 |         if constexpr (std::is_integral_v<pixel_t>)
 68 |             return mul_add(to_float((b + d) + (f + h)), weight, to_float(e)) / mul_add(4.0f, weight, 1.0f);
 69 |         else
 70 |             return mul_add((b + d) + (f + h), weight, e) / mul_add(4.0f, weight, 1.0f);
 71 |     };
 72 | 
 73 |     for (int plane = 0; plane < data->vi->format->numPlanes; plane++) {
 74 |         if (data->process[plane]) {
 75 |             const int width = vsapi->getFrameWidth(src, plane);
 76 |             const int height = vsapi->getFrameHeight(src, plane);
 77 |             const int stride = vsapi->getStride(src, plane) / sizeof(pixel_t);
 78 |             const pixel_t * srcp = reinterpret_cast<const pixel_t *>(vsapi->getReadPtr(src, plane));
 79 |             pixel_t * dstp = reinterpret_cast<pixel_t *>(vsapi->getWritePtr(dst, plane));
 80 | 
 81 |             const Vec4f chromaOffset = plane ? 1.0f : 0.0f;
 82 | 
 83 |             const int regularPart = (width - 1) & ~(vec_t().size() - 1);
 84 | 
 85 |             for (int y = 0; y < height; y++) {
 86 |                 const pixel_t * above = srcp + (y == 0 ? stride : -stride);
 87 |                 const pixel_t * below = srcp + (y == height - 1 ? -stride : stride);
 88 | 
 89 |                 {
 90 |                     const vec_t b = load(above + 0);
 91 |                     const vec_t e = load(srcp + 0);
 92 |                     const vec_t h = load(below + 0);
 93 | 
 94 |                     const vec_t a = permute4<1, 0, 1, 2>(b);
 95 |                     const vec_t d = permute4<1, 0, 1, 2>(e);
 96 |                     const vec_t g = permute4<1, 0, 1, 2>(h);
 97 | 
 98 |                     vec_t c, f, i;
 99 |                     if (width > vec_t().size()) {
100 |                         c = load(above + 1);
101 |                         f = load(srcp + 1);
102 |                         i = load(below + 1);
103 |                     } else {
104 |                         c = permute4<1, 2, 3, 2>(b);
105 |                         f = permute4<1, 2, 3, 2>(e);
106 |                         i = permute4<1, 2, 3, 2>(h);
107 |                     }
108 | 
109 |                     const Vec4f result = filtering(a, b, c,
110 |                                                    d, e, f,
111 |                                                    g, h, i,
112 |                                                    chromaOffset);
113 | 
114 |                     store(result, dstp + 0);
115 |                 }
116 | 
117 |                 for (int x = vec_t().size(); x < regularPart; x += vec_t().size()) {
118 |                     const Vec4f result = filtering(load(above + x - 1), load(above + x), load(above + x + 1),
119 |                                                    load(srcp + x - 1), load(srcp + x), load(srcp + x + 1),
120 |                                                    load(below + x - 1), load(below + x), load(below + x + 1),
121 |                                                    chromaOffset);
122 | 
123 |                     store(result, dstp + x);
124 |                 }
125 | 
126 |                 if (regularPart >= vec_t().size()) {
127 |                     const vec_t a = load(above + regularPart - 1);
128 |                     const vec_t d = load(srcp + regularPart - 1);
129 |                     const vec_t g = load(below + regularPart - 1);
130 | 
131 |                     const vec_t b = load(above + regularPart);
132 |                     const vec_t e = load(srcp + regularPart);
133 |                     const vec_t h = load(below + regularPart);
134 | 
135 |                     const vec_t c = permute4<1, 2, 3, 2>(b);
136 |                     const vec_t f = permute4<1, 2, 3, 2>(e);
137 |                     const vec_t i = permute4<1, 2, 3, 2>(h);
138 | 
139 |                     const Vec4f result = filtering(a, b, c,
140 |                                                    d, e, f,
141 |                                                    g, h, i,
142 |                                                    chromaOffset);
143 | 
144 |                     store(result, dstp + regularPart);
145 |                 }
146 | 
147 |                 srcp += stride;
148 |                 dstp += stride;
149 |             }
150 |         }
151 |     }
152 | }
153 | 
// Explicit instantiations: emit filter_sse2 for uint8_t, uint16_t and float samples from this translation unit.
template void filter_sse2<uint8_t>(const VSFrameRef * src, VSFrameRef * dst, const CASData * const VS_RESTRICT data, const VSAPI * vsapi) noexcept;
template void filter_sse2<uint16_t>(const VSFrameRef * src, VSFrameRef * dst, const CASData * const VS_RESTRICT data, const VSAPI * vsapi) noexcept;
template void filter_sse2<float>(const VSFrameRef * src, VSFrameRef * dst, const CASData * const VS_RESTRICT data, const VSAPI * vsapi) noexcept;
157 | #endif
158 | 


--------------------------------------------------------------------------------
/CAS/VCL2/LICENSE:
--------------------------------------------------------------------------------
  1 |                                  Apache License
  2 |                            Version 2.0, January 2004
  3 |                         http://www.apache.org/licenses/
  4 | 
  5 |    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
  6 | 
  7 |    1. Definitions.
  8 | 
  9 |       "License" shall mean the terms and conditions for use, reproduction,
 10 |       and distribution as defined by Sections 1 through 9 of this document.
 11 | 
 12 |       "Licensor" shall mean the copyright owner or entity authorized by
 13 |       the copyright owner that is granting the License.
 14 | 
 15 |       "Legal Entity" shall mean the union of the acting entity and all
 16 |       other entities that control, are controlled by, or are under common
 17 |       control with that entity. For the purposes of this definition,
 18 |       "control" means (i) the power, direct or indirect, to cause the
 19 |       direction or management of such entity, whether by contract or
 20 |       otherwise, or (ii) ownership of fifty percent (50%) or more of the
 21 |       outstanding shares, or (iii) beneficial ownership of such entity.
 22 | 
 23 |       "You" (or "Your") shall mean an individual or Legal Entity
 24 |       exercising permissions granted by this License.
 25 | 
 26 |       "Source" form shall mean the preferred form for making modifications,
 27 |       including but not limited to software source code, documentation
 28 |       source, and configuration files.
 29 | 
 30 |       "Object" form shall mean any form resulting from mechanical
 31 |       transformation or translation of a Source form, including but
 32 |       not limited to compiled object code, generated documentation,
 33 |       and conversions to other media types.
 34 | 
 35 |       "Work" shall mean the work of authorship, whether in Source or
 36 |       Object form, made available under the License, as indicated by a
 37 |       copyright notice that is included in or attached to the work
 38 |       (an example is provided in the Appendix below).
 39 | 
 40 |       "Derivative Works" shall mean any work, whether in Source or Object
 41 |       form, that is based on (or derived from) the Work and for which the
 42 |       editorial revisions, annotations, elaborations, or other modifications
 43 |       represent, as a whole, an original work of authorship. For the purposes
 44 |       of this License, Derivative Works shall not include works that remain
 45 |       separable from, or merely link (or bind by name) to the interfaces of,
 46 |       the Work and Derivative Works thereof.
 47 | 
 48 |       "Contribution" shall mean any work of authorship, including
 49 |       the original version of the Work and any modifications or additions
 50 |       to that Work or Derivative Works thereof, that is intentionally
 51 |       submitted to Licensor for inclusion in the Work by the copyright owner
 52 |       or by an individual or Legal Entity authorized to submit on behalf of
 53 |       the copyright owner. For the purposes of this definition, "submitted"
 54 |       means any form of electronic, verbal, or written communication sent
 55 |       to the Licensor or its representatives, including but not limited to
 56 |       communication on electronic mailing lists, source code control systems,
 57 |       and issue tracking systems that are managed by, or on behalf of, the
 58 |       Licensor for the purpose of discussing and improving the Work, but
 59 |       excluding communication that is conspicuously marked or otherwise
 60 |       designated in writing by the copyright owner as "Not a Contribution."
 61 | 
 62 |       "Contributor" shall mean Licensor and any individual or Legal Entity
 63 |       on behalf of whom a Contribution has been received by Licensor and
 64 |       subsequently incorporated within the Work.
 65 | 
 66 |    2. Grant of Copyright License. Subject to the terms and conditions of
 67 |       this License, each Contributor hereby grants to You a perpetual,
 68 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 69 |       copyright license to reproduce, prepare Derivative Works of,
 70 |       publicly display, publicly perform, sublicense, and distribute the
 71 |       Work and such Derivative Works in Source or Object form.
 72 | 
 73 |    3. Grant of Patent License. Subject to the terms and conditions of
 74 |       this License, each Contributor hereby grants to You a perpetual,
 75 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 76 |       (except as stated in this section) patent license to make, have made,
 77 |       use, offer to sell, sell, import, and otherwise transfer the Work,
 78 |       where such license applies only to those patent claims licensable
 79 |       by such Contributor that are necessarily infringed by their
 80 |       Contribution(s) alone or by combination of their Contribution(s)
 81 |       with the Work to which such Contribution(s) was submitted. If You
 82 |       institute patent litigation against any entity (including a
 83 |       cross-claim or counterclaim in a lawsuit) alleging that the Work
 84 |       or a Contribution incorporated within the Work constitutes direct
 85 |       or contributory patent infringement, then any patent licenses
 86 |       granted to You under this License for that Work shall terminate
 87 |       as of the date such litigation is filed.
 88 | 
 89 |    4. Redistribution. You may reproduce and distribute copies of the
 90 |       Work or Derivative Works thereof in any medium, with or without
 91 |       modifications, and in Source or Object form, provided that You
 92 |       meet the following conditions:
 93 | 
 94 |       (a) You must give any other recipients of the Work or
 95 |           Derivative Works a copy of this License; and
 96 | 
 97 |       (b) You must cause any modified files to carry prominent notices
 98 |           stating that You changed the files; and
 99 | 
100 |       (c) You must retain, in the Source form of any Derivative Works
101 |           that You distribute, all copyright, patent, trademark, and
102 |           attribution notices from the Source form of the Work,
103 |           excluding those notices that do not pertain to any part of
104 |           the Derivative Works; and
105 | 
106 |       (d) If the Work includes a "NOTICE" text file as part of its
107 |           distribution, then any Derivative Works that You distribute must
108 |           include a readable copy of the attribution notices contained
109 |           within such NOTICE file, excluding those notices that do not
110 |           pertain to any part of the Derivative Works, in at least one
111 |           of the following places: within a NOTICE text file distributed
112 |           as part of the Derivative Works; within the Source form or
113 |           documentation, if provided along with the Derivative Works; or,
114 |           within a display generated by the Derivative Works, if and
115 |           wherever such third-party notices normally appear. The contents
116 |           of the NOTICE file are for informational purposes only and
117 |           do not modify the License. You may add Your own attribution
118 |           notices within Derivative Works that You distribute, alongside
119 |           or as an addendum to the NOTICE text from the Work, provided
120 |           that such additional attribution notices cannot be construed
121 |           as modifying the License.
122 | 
123 |       You may add Your own copyright statement to Your modifications and
124 |       may provide additional or different license terms and conditions
125 |       for use, reproduction, or distribution of Your modifications, or
126 |       for any such Derivative Works as a whole, provided Your use,
127 |       reproduction, and distribution of the Work otherwise complies with
128 |       the conditions stated in this License.
129 | 
130 |    5. Submission of Contributions. Unless You explicitly state otherwise,
131 |       any Contribution intentionally submitted for inclusion in the Work
132 |       by You to the Licensor shall be under the terms and conditions of
133 |       this License, without any additional terms or conditions.
134 |       Notwithstanding the above, nothing herein shall supersede or modify
135 |       the terms of any separate license agreement you may have executed
136 |       with Licensor regarding such Contributions.
137 | 
138 |    6. Trademarks. This License does not grant permission to use the trade
139 |       names, trademarks, service marks, or product names of the Licensor,
140 |       except as required for reasonable and customary use in describing the
141 |       origin of the Work and reproducing the content of the NOTICE file.
142 | 
143 |    7. Disclaimer of Warranty. Unless required by applicable law or
144 |       agreed to in writing, Licensor provides the Work (and each
145 |       Contributor provides its Contributions) on an "AS IS" BASIS,
146 |       WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 |       implied, including, without limitation, any warranties or conditions
148 |       of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 |       PARTICULAR PURPOSE. You are solely responsible for determining the
150 |       appropriateness of using or redistributing the Work and assume any
151 |       risks associated with Your exercise of permissions under this License.
152 | 
153 |    8. Limitation of Liability. In no event and under no legal theory,
154 |       whether in tort (including negligence), contract, or otherwise,
155 |       unless required by applicable law (such as deliberate and grossly
156 |       negligent acts) or agreed to in writing, shall any Contributor be
157 |       liable to You for damages, including any direct, indirect, special,
158 |       incidental, or consequential damages of any character arising as a
159 |       result of this License or out of the use or inability to use the
160 |       Work (including but not limited to damages for loss of goodwill,
161 |       work stoppage, computer failure or malfunction, or any and all
162 |       other commercial damages or losses), even if such Contributor
163 |       has been advised of the possibility of such damages.
164 | 
165 |    9. Accepting Warranty or Additional Liability. While redistributing
166 |       the Work or Derivative Works thereof, You may choose to offer,
167 |       and charge a fee for, acceptance of support, warranty, indemnity,
168 |       or other liability obligations and/or rights consistent with this
169 |       License. However, in accepting such obligations, You may act only
170 |       on Your own behalf and on Your sole responsibility, not on behalf
171 |       of any other Contributor, and only if You agree to indemnify,
172 |       defend, and hold each Contributor harmless for any liability
173 |       incurred by, or claims asserted against, such Contributor by reason
174 |       of your accepting any such warranty or additional liability.
175 | 
176 |    END OF TERMS AND CONDITIONS
177 | 
178 |   
179 |    Copyright 2012-2019 Agner Fog.
180 | 
181 |    Licensed under the Apache License, Version 2.0 (the "License");
182 |    you may not use this file except in compliance with the License.
183 |    You may obtain a copy of the License at
184 | 
185 |        http://www.apache.org/licenses/LICENSE-2.0
186 | 
187 |    Unless required by applicable law or agreed to in writing, software
188 |    distributed under the License is distributed on an "AS IS" BASIS,
189 |    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
190 |    See the License for the specific language governing permissions and
191 |    limitations under the License.
192 | 


--------------------------------------------------------------------------------
/CAS/VCL2/instrset.h:
--------------------------------------------------------------------------------
   1 | /****************************  instrset.h   **********************************
   2 | * Author:        Agner Fog
   3 | * Date created:  2012-05-30
   4 | * Last modified: 2020-06-08
   5 | * Version:       2.01.03
   6 | * Project:       vector class library
   7 | * Description:
   8 | * Header file for various compiler-specific tasks as well as common
   9 | * macros and templates. This file contains:
  10 | *
  11 | * > Selection of the supported instruction set
  12 | * > Defines compiler version macros
  13 | * > Undefines certain macros that prevent function overloading
  14 | * > Helper functions that depend on instruction set, compiler, or platform
  15 | * > Common templates for permute, blend, etc.
  16 | *
  17 | * For instructions, see vcl_manual.pdf
  18 | *
  19 | * (c) Copyright 2012-2020 Agner Fog.
  20 | * Apache License version 2.0 or later.
  21 | ******************************************************************************/
  22 | 
  23 | #ifndef INSTRSET_H
  24 | #define INSTRSET_H 20102
  25 | 
  26 | 
  27 | // Allow the use of floating point permute instructions on integer vectors.
// Some CPUs have an extra latency of 1 or 2 clock cycles for this, but
  29 | // it may still be faster than alternative implementations:
  30 | #define ALLOW_FP_PERMUTE  true
  31 | 
  32 | 
  33 | // Macro to indicate 64 bit mode
  34 | #if (defined(_M_AMD64) || defined(_M_X64) || defined(__amd64) ) && ! defined(__x86_64__)
  35 | #define __x86_64__ 1  // There are many different macros for this, decide on only one
  36 | #endif
  37 | 
  38 | // The following values of INSTRSET are currently defined:
  39 | // 2:  SSE2
  40 | // 3:  SSE3
  41 | // 4:  SSSE3
  42 | // 5:  SSE4.1
  43 | // 6:  SSE4.2
  44 | // 7:  AVX
  45 | // 8:  AVX2
  46 | // 9:  AVX512F
  47 | // 10: AVX512BW/DQ/VL
  48 | // In the future, INSTRSET = 11 may include AVX512VBMI and AVX512VBMI2, but this
  49 | // decision cannot be made before the market situation for CPUs with these
  50 | // instruction sets is known (these future instruction set extensions are already
  51 | // used in some VCL functions and tested with an emulator)
  52 | 
  53 | // Find instruction set from compiler macros if INSTRSET is not defined.
  54 | // Note: Most of these macros are not defined in Microsoft compilers
  55 | #ifndef INSTRSET
  56 | #if defined ( __AVX512VL__ ) && defined ( __AVX512BW__ ) && defined ( __AVX512DQ__ )
  57 | #define INSTRSET 10
  58 | #elif defined ( __AVX512F__ ) || defined ( __AVX512__ )
  59 | #define INSTRSET 9
  60 | #elif defined ( __AVX2__ )
  61 | #define INSTRSET 8
  62 | #elif defined ( __AVX__ )
  63 | #define INSTRSET 7
  64 | #elif defined ( __SSE4_2__ )
  65 | #define INSTRSET 6
  66 | #elif defined ( __SSE4_1__ )
  67 | #define INSTRSET 5
  68 | #elif defined ( __SSSE3__ )
  69 | #define INSTRSET 4
  70 | #elif defined ( __SSE3__ )
  71 | #define INSTRSET 3
  72 | #elif defined ( __SSE2__ ) || defined ( __x86_64__ )
  73 | #define INSTRSET 2
  74 | #elif defined ( __SSE__ )
  75 | #define INSTRSET 1
  76 | #elif defined ( _M_IX86_FP )           // Defined in MS compiler. 1: SSE, 2: SSE2
  77 | #define INSTRSET _M_IX86_FP
  78 | #else
  79 | #define INSTRSET 0
  80 | #endif // instruction set defines
  81 | #endif // INSTRSET
  82 | 
  83 | // Include the appropriate header file for intrinsic functions
  84 | #if INSTRSET > 7                       // AVX2 and later
  85 | #if defined (__GNUC__) && ! defined (__INTEL_COMPILER)
  86 | #include <x86intrin.h>                 // x86intrin.h includes header files for whatever instruction
  87 |                                        // sets are specified on the compiler command line, such as:
  88 |                                        // xopintrin.h, fma4intrin.h
  89 | #else
  90 | #include <immintrin.h>                 // MS/Intel version of immintrin.h covers AVX and later
  91 | #endif // __GNUC__
  92 | #elif INSTRSET == 7
  93 | #include <immintrin.h>                 // AVX
  94 | #elif INSTRSET == 6
  95 | #include <nmmintrin.h>                 // SSE4.2
  96 | #elif INSTRSET == 5
  97 | #include <smmintrin.h>                 // SSE4.1
  98 | #elif INSTRSET == 4
  99 | #include <tmmintrin.h>                 // SSSE3
 100 | #elif INSTRSET == 3
 101 | #include <pmmintrin.h>                 // SSE3
 102 | #elif INSTRSET == 2
 103 | #include <emmintrin.h>                 // SSE2
 104 | #elif INSTRSET == 1
 105 | #include <xmmintrin.h>                 // SSE
 106 | #endif // INSTRSET
 107 | 
 108 | #if INSTRSET >= 8 && !defined(__FMA__)
 109 | // Assume that all processors that have AVX2 also have FMA3
 110 | #if defined (__GNUC__) && ! defined (__INTEL_COMPILER)
 111 | // Prevent error message in g++ and Clang when using FMA intrinsics with avx2:
 112 | #if !defined(DISABLE_WARNING_AVX2_WITHOUT_FMA)
 113 | #pragma message "It is recommended to specify also option -mfma when using -mavx2 or higher"
 114 | #endif
 115 | #elif ! defined (__clang__)
 116 | #define __FMA__  1
 117 | #endif
 118 | #endif
 119 | 
 120 | // AMD  instruction sets
 121 | #if defined (__XOP__) || defined (__FMA4__)
 122 | #ifdef __GNUC__
 123 | #include <x86intrin.h>                 // AMD XOP (Gnu)
 124 | #else
 125 | #include <ammintrin.h>                 // AMD XOP (Microsoft)
 126 | #endif //  __GNUC__
 127 | #elif defined (__SSE4A__)              // AMD SSE4A
 128 | #include <ammintrin.h>
 129 | #endif // __XOP__
 130 | 
 131 | // FMA3 instruction set
 132 | #if defined (__FMA__) && (defined(__GNUC__) || defined(__clang__))  && ! defined (__INTEL_COMPILER)
 133 | #include <fmaintrin.h>
 134 | #endif // __FMA__
 135 | 
 136 | // FMA4 instruction set
 137 | #if defined (__FMA4__) && (defined(__GNUC__) || defined(__clang__))
 138 | #include <fma4intrin.h> // must have both x86intrin.h and fma4intrin.h, don't know why
 139 | #endif // __FMA4__
 140 | 
 141 | 
 142 | #include <stdint.h>                    // Define integer types with known size
 143 | #include <stdlib.h>                    // define abs(int)
 144 | 
 145 | #ifdef _MSC_VER                        // Microsoft compiler or compatible Intel compiler
 146 | #include <intrin.h>                    // define _BitScanReverse(int), __cpuid(int[4],int), _xgetbv(int)
 147 | #endif // _MSC_VER
 148 | 
 149 | 
 150 | // functions in instrset_detect.cpp:
 151 | #ifdef VCL_NAMESPACE
 152 | namespace VCL_NAMESPACE {
 153 | #endif
 154 |     int  instrset_detect(void);        // tells which instruction sets are supported
 155 |     bool hasFMA3(void);                // true if FMA3 instructions supported
 156 |     bool hasFMA4(void);                // true if FMA4 instructions supported
 157 |     bool hasXOP(void);                 // true if XOP  instructions supported
 158 |     bool hasAVX512ER(void);            // true if AVX512ER instructions supported
 159 |     bool hasAVX512VBMI(void);          // true if AVX512VBMI instructions supported
 160 |     bool hasAVX512VBMI2(void);         // true if AVX512VBMI2 instructions supported
 161 | #ifdef VCL_NAMESPACE
 162 | }
 163 | #endif
 164 | 
 165 | // functions in physical_processors.cpp:
 166 | int physicalProcessors(int * logical_processors = 0);
 167 | 
 168 | 
 169 | // GCC version
 170 | #if defined(__GNUC__) && !defined (GCC_VERSION) && !defined (__clang__)
 171 | #define GCC_VERSION  ((__GNUC__) * 10000 + (__GNUC_MINOR__) * 100 + (__GNUC_PATCHLEVEL__))
 172 | #endif
 173 | 
 174 | // Clang version
 175 | #if defined (__clang__)
 176 | #define CLANG_VERSION  ((__clang_major__) * 10000 + (__clang_minor__) * 100 + (__clang_patchlevel__))
 177 | // Problem: The version number is not consistent across platforms
 178 | // http://llvm.org/bugs/show_bug.cgi?id=12643
 179 | // Apple bug 18746972
 180 | #endif
 181 | 
 182 | // Fix problem with non-overloadable macros named min and max in WinDef.h
 183 | #ifdef _MSC_VER
 184 | #if defined (_WINDEF_) && defined(min) && defined(max)
 185 | #undef min
 186 | #undef max
 187 | #endif
 188 | #ifndef NOMINMAX
 189 | #define NOMINMAX
 190 | #endif
 191 | 
 192 | // warning for poor support for AVX512F in MS compiler
 193 | #ifndef __INTEL_COMPILER
 194 | #if INSTRSET == 9
 195 | #pragma message("Warning: MS compiler cannot generate code for AVX512F without AVX512DQ")
 196 | #endif
 197 | #if _MSC_VER < 1920 && INSTRSET > 8
 198 | #pragma message("Warning: Your compiler has poor support for AVX512. Code may be erroneous.\nPlease use a newer compiler version or a different compiler!")
 199 | #endif
 200 | #endif // __INTEL_COMPILER
 201 | #endif // _MSC_VER
 202 | 
 203 | /* Intel compiler problem:
 204 | The Intel compiler currently cannot compile version 2.00 of VCL. It seems to have
 205 | a problem with constexpr function returns not being constant enough.
 206 | */
 207 | #if defined(__INTEL_COMPILER) && __INTEL_COMPILER < 9999
 208 | #error The Intel compiler version 19.00 cannot compile VCL version 2. Use Version 1.xx of VCL instead
 209 | #endif
 210 | 
 211 | /* Clang problem:
 212 | The Clang compiler treats the intrinsic vector types __m128, __m128i, and __m128d as identical.
 213 | See the bug report at https://bugs.llvm.org/show_bug.cgi?id=17164
 214 | Additional problem: The version number is not consistent across platforms. The Apple build has
 215 | different version numbers. We have to rely on __apple_build_version__ on the Mac platform:
 216 | http://llvm.org/bugs/show_bug.cgi?id=12643
 217 | We have to make switches here when - hopefully - the error some day has been fixed.
  218 | We need different version checks with and without __apple_build_version__
 219 | */
 220 | #if (defined (__clang__) || defined(__apple_build_version__)) && !defined(__INTEL_COMPILER)
 221 | #define FIX_CLANG_VECTOR_ALIAS_AMBIGUITY
 222 | #endif
 223 | 
 224 | #if defined (GCC_VERSION) && GCC_VERSION < 99999 && !defined(__clang__)
 225 | #define ZEXT_MISSING  // Gcc 7.4.0 does not have _mm256_zextsi128_si256 and similar functions
 226 | #endif
 227 | 
 228 | 
 229 | #ifdef VCL_NAMESPACE
 230 | namespace VCL_NAMESPACE {
 231 | #endif
 232 | 
 233 | // Constant for indicating don't care in permute and blend functions.
 234 | // V_DC is -256 in Vector class library version 1.xx
 235 | // V_DC can be any value less than -1 in Vector class library version 2.00
 236 | constexpr int V_DC = -256;
 237 | 
 238 | 
 239 | /*****************************************************************************
 240 | *
 241 | *    Helper functions that depend on instruction set, compiler, or platform
 242 | *
 243 | *****************************************************************************/
 244 | 
// Define interface to cpuid instruction.
// input:  functionnumber = leaf (eax), ecxleaf = subleaf(ecx)
// output: output[0] = eax, output[1] = ebx, output[2] = ecx, output[3] = edx
// One of three implementations is selected at compile time, depending on the compiler.
static inline void cpuid(int output[4], int functionnumber, int ecxleaf = 0) {
#if defined(__GNUC__) || defined(__clang__)           // use inline assembly, Gnu/AT&T syntax
    int a, b, c, d;                                   // receive eax, ebx, ecx, edx
    // the register constraints load eax/ecx with leaf/subleaf and fetch all four result registers
    __asm("cpuid" : "=a"(a), "=b"(b), "=c"(c), "=d"(d) : "a"(functionnumber), "c"(ecxleaf) : );
    output[0] = a;
    output[1] = b;
    output[2] = c;
    output[3] = d;

#elif defined (_MSC_VER)                              // Microsoft compiler, intrin.h included
    __cpuidex(output, functionnumber, ecxleaf);       // intrinsic function for CPUID

#else                                                 // unknown platform. try inline assembly with masm/intel syntax
    // uses 32-bit registers only; esi holds the address of the output array
    __asm {
        mov eax, functionnumber
        mov ecx, ecxleaf
        cpuid;
        mov esi, output
        mov[esi], eax
        mov[esi + 4], ebx
        mov[esi + 8], ecx
        mov[esi + 12], edx
    }
#endif
}
 273 | 
 274 | 
 275 | // Define popcount function. Gives sum of bits
 276 | #if INSTRSET >= 6   // SSE4.2
 277 | // popcnt instruction is not officially part of the SSE4.2 instruction set,
 278 | // but available in all known processors with SSE4.2
 279 | static inline uint32_t vml_popcnt(uint32_t a) {
 280 |     return (uint32_t)_mm_popcnt_u32(a);  // Intel intrinsic. Supported by gcc and clang
 281 | }
 282 | #ifdef __x86_64__
 283 | static inline int64_t vml_popcnt(uint64_t a) {
 284 |     return _mm_popcnt_u64(a);            // Intel intrinsic.
 285 | }
 286 | #else   // 32 bit mode
 287 | static inline int64_t vml_popcnt(uint64_t a) {
 288 |     return _mm_popcnt_u32(uint32_t(a >> 32)) + _mm_popcnt_u32(uint32_t(a));
 289 | }
 290 | #endif
 291 | #else  // no SSE4.2
 292 | static inline uint32_t vml_popcnt(uint32_t a) {
 293 |     // popcnt instruction not available
 294 |     uint32_t b = a - ((a >> 1) & 0x55555555);
 295 |     uint32_t c = (b & 0x33333333) + ((b >> 2) & 0x33333333);
 296 |     uint32_t d = (c + (c >> 4)) & 0x0F0F0F0F;
 297 |     uint32_t e = d * 0x01010101;
 298 |     return   e >> 24;
 299 | }
 300 | 
 301 | static inline int32_t vml_popcnt(uint64_t a) {
 302 |     return vml_popcnt(uint32_t(a >> 32)) + vml_popcnt(uint32_t(a));
 303 | }
 304 | 
 305 | #endif
 306 | 
 307 | // Define bit-scan-forward function. Gives index to lowest set bit
 308 | #if defined (__GNUC__) || defined(__clang__)
 309 |     // gcc and Clang have no bit_scan_forward intrinsic
 310 | #if defined(__clang__)   // fix clang bug
 311 |     // Clang uses a k register as parameter a when inlined from horizontal_find_first
 312 | __attribute__((noinline))
 313 | #endif
 314 | static uint32_t bit_scan_forward(uint32_t a) {
 315 |     uint32_t r;
 316 |     __asm("bsfl %1, %0" : "=r"(r) : "r"(a) : );
 317 |     return r;
 318 | }
 319 | static inline uint32_t bit_scan_forward(uint64_t a) {
 320 |     uint32_t lo = uint32_t(a);
 321 |     if (lo) return bit_scan_forward(lo);
 322 |     uint32_t hi = uint32_t(a >> 32);
 323 |     return bit_scan_forward(hi) + 32;
 324 | }
 325 | 
 326 | #else  // other compilers
 327 | static inline uint32_t bit_scan_forward(uint32_t a) {
 328 |     unsigned long r;
 329 |     _BitScanForward(&r, a);            // defined in intrin.h for MS and Intel compilers
 330 |     return r;
 331 | }
 332 | #ifdef __x86_64__
 333 | static inline uint32_t bit_scan_forward(uint64_t a) {
 334 |     unsigned long r;
 335 |     _BitScanForward64(&r, a);          // defined in intrin.h for MS and Intel compilers
 336 |     return (uint32_t)r;
 337 | }
 338 | #else
 339 | static inline uint32_t bit_scan_forward(uint64_t a) {
 340 |     uint32_t lo = uint32_t(a);
 341 |     if (lo) return bit_scan_forward(lo);
 342 |     uint32_t hi = uint32_t(a >> 32);
 343 |     return bit_scan_forward(hi) + 32;
 344 | }
 345 | #endif
 346 | #endif
 347 | 
 348 | 
 349 | // Define bit-scan-reverse function. Gives index to highest set bit = floor(log2(a))
 350 | #if defined (__GNUC__) || defined(__clang__)
 351 | static inline uint32_t bit_scan_reverse(uint32_t a) __attribute__((pure));
 352 | static inline uint32_t bit_scan_reverse(uint32_t a) {
 353 |     uint32_t r;
 354 |     __asm("bsrl %1, %0" : "=r"(r) : "r"(a) : );
 355 |     return r;
 356 | }
 357 | #ifdef __x86_64__
 358 | static inline uint32_t bit_scan_reverse(uint64_t a) {
 359 |     uint64_t r;
 360 |     __asm("bsrq %1, %0" : "=r"(r) : "r"(a) : );
 361 |     return r;
 362 | }
 363 | #else   // 32 bit mode
 364 | static inline uint32_t bit_scan_reverse(uint64_t a) {
 365 |     uint64_t ahi = a >> 32;
 366 |     if (ahi == 0) return bit_scan_reverse(uint32_t(a));
 367 |     else return bit_scan_reverse(uint32_t(ahi)) + 32;
 368 | }
 369 | #endif
 370 | #else
 371 | static inline uint32_t bit_scan_reverse(uint32_t a) {
 372 |     unsigned long r;
 373 |     _BitScanReverse(&r, a);            // defined in intrin.h for MS and Intel compilers
 374 |     return r;
 375 | }
 376 | #ifdef __x86_64__
 377 | static inline uint32_t bit_scan_reverse(uint64_t a) {
 378 |     unsigned long r;
 379 |     _BitScanReverse64(&r, a);          // defined in intrin.h for MS and Intel compilers
 380 |     return r;
 381 | }
 382 | #else   // 32 bit mode
 383 | static inline uint32_t bit_scan_reverse(uint64_t a) {
 384 |     uint64_t ahi = a >> 32;
 385 |     if (ahi == 0) return bit_scan_reverse(uint32_t(a));
 386 |     else return bit_scan_reverse(uint32_t(ahi)) + 32;
 387 | }
 388 | #endif
 389 | #endif
 390 | 
 391 | // Same function, for compile-time constants
 392 | constexpr int bit_scan_reverse_const(uint64_t const n) {
 393 |     if (n == 0) return -1;
 394 |     uint64_t a = n, b = 0, j = 64, k = 0;
 395 |     do {
 396 |         j >>= 1;
 397 |         k = (uint64_t)1 << j;
 398 |         if (a >= k) {
 399 |             a >>= j;
 400 |             b += j;
 401 |         }
 402 |     } while (j > 0);
 403 |     return int(b);
 404 | }
 405 | 
 406 | 
 407 | /*****************************************************************************
 408 | *
 409 | *    Common templates
 410 | *
 411 | *****************************************************************************/
 412 | 
// Template classes to represent compile-time integer constants.
// The classes are empty: the value n is carried by the type only, so that it
// can be used for overload resolution and template argument deduction.
template <int32_t  n> class Const_int_t {};      // represent compile-time signed integer constant
template <uint32_t n> class Const_uint_t {};     // represent compile-time unsigned integer constant
// The macros below make a temporary object of the matching type:
#define const_int(n)  (Const_int_t <n>())        // n must be compile-time integer constant
#define const_uint(n) (Const_uint_t<n>())        // n must be compile-time unsigned integer constant
 418 | 
 419 | 
 420 | // template for producing quiet NAN
 421 | template <class VTYPE>
 422 | static inline VTYPE nan_vec(uint32_t payload = 0x100) {
 423 |     if constexpr ((VTYPE::elementtype() & 1) != 0) {  // double
 424 |         union {
 425 |             uint64_t q;
 426 |             double f;
 427 |         } ud;
 428 |         // n is left justified to avoid loss of NAN payload when converting to float
 429 |         ud.q = 0x7FF8000000000000 | uint64_t(payload) << 29;
 430 |         return VTYPE(ud.f);
 431 |     }
 432 |     // float will be converted to double if necessary
 433 |     union {
 434 |         uint32_t i;
 435 |         float f;
 436 |     } uf;
 437 |     uf.i = 0x7FC00000 | (payload & 0x003FFFFF);
 438 |     return VTYPE(uf.f);
 439 | }
 440 | 
 441 | 
 442 | // Test if a parameter is a compile-time constant
 443 | /* Unfortunately, this works only for macro parameters, not for inline function parameters.
 444 |    I hope that some solution will appear in the future, but for now it appears to be
 445 |    impossible to check if a function parameter is a compile-time constant.
 446 |    This would be useful in operator / and in function pow:
 447 |    #if defined(__GNUC__) || defined (__clang__)
 448 |    #define is_constant(a) __builtin_constant_p(a)
 449 |    #else
 450 |    #define is_constant(a) false
 451 |    #endif
 452 | */
 453 | 
 454 | 
 455 | /*****************************************************************************
 456 | *
 457 | *    Helper functions for permute and blend functions
 458 | *
 459 | ******************************************************************************
 460 | Rules for constexpr functions:
 461 | 
 462 | > All variable declarations must include initialization
 463 | 
 464 | > Do not put variable declarations inside a for-clause, e.g. avoid: for (int i=0; ..
 465 |   Instead, you have to declare the loop counter before the for-loop.
 466 | 
 467 | > Do not make constexpr functions that return vector types. This requires type
 468 |   punning with a union, which is not allowed in constexpr functions under C++17.
 469 |   It may be possible under C++20
 470 | 
 471 | *****************************************************************************/
 472 | 
// Define type for Encapsulated array to use as return type:
// A built-in array cannot be returned from a function, but a struct that
// contains an array can. T is the element type, N is the number of elements.
template <typename T, int N>
struct EList {
    T a[N];                            // the encapsulated array
};
 478 | 
 479 | 
 480 | // get_inttype: get an integer of a size that matches the element size
 481 | // of vector class V with the value -1
 482 | template <typename V>
 483 | constexpr auto get_inttype() {
 484 |     constexpr int elementsize = sizeof(V) / V::size();  // size of vector elements
 485 | 
 486 |     if constexpr (elementsize >= 8) {
 487 |         return -int64_t(1);
 488 |     }
 489 |     else if constexpr (elementsize >= 4) {
 490 |         return int32_t(-1);
 491 |     }
 492 |     else if constexpr (elementsize >= 2) {
 493 |         return int16_t(-1);
 494 |     }
 495 |     else {
 496 |         return int8_t(-1);
 497 |     }
 498 | }
 499 | 
 500 | 
 501 | // zero_mask: return a compact bit mask mask for zeroing using AVX512 mask.
 502 | // Parameter a is a reference to a constexpr int array of permutation indexes
 503 | template <int N>
 504 | constexpr auto zero_mask(int const (&a)[N]) {
 505 |     uint64_t mask = 0;
 506 |     int i = 0;
 507 | 
 508 |     for (i = 0; i < N; i++) {
 509 |         if (a[i] >= 0) mask |= uint64_t(1) << i;
 510 |     }
 511 |     if constexpr      (N <= 8 ) return uint8_t(mask);
 512 |     else if constexpr (N <= 16) return uint16_t(mask);
 513 |     else if constexpr (N <= 32) return uint32_t(mask);
 514 |     else return mask;
 515 | }
 516 | 
 517 | 
 518 | // zero_mask_broad: return a broad byte mask for zeroing.
 519 | // Parameter a is a reference to a constexpr int array of permutation indexes
 520 | template <typename V>
 521 | constexpr auto zero_mask_broad(int const (&A)[V::size()]) {
 522 |     constexpr int N = V::size();                 // number of vector elements
 523 |     typedef decltype(get_inttype<V>()) Etype;    // element type
 524 |     EList <Etype, N> u = {{0}};                  // list for return
 525 |     int i = 0;
 526 |     for (i = 0; i < N; i++) {
 527 |         u.a[i] = A[i] >= 0 ? get_inttype<V>() : 0;
 528 |     }
 529 |     return u;                                    // return encapsulated array
 530 | }
 531 | 
 532 | 
 533 | // make_bit_mask: return a compact mask of bits from a list of N indexes:
 534 | // B contains options indicating how to gather the mask
 535 | // bit 0-7 in B indicates which bit in each index to collect
 536 | // bit 8 = 0x100:  set 1 in the lower half of the bit mask if the indicated bit is 1.
 537 | // bit 8 = 0    :  set 1 in the lower half of the bit mask if the indicated bit is 0.
 538 | // bit 9 = 0x200:  set 1 in the upper half of the bit mask if the indicated bit is 1.
 539 | // bit 9 = 0    :  set 1 in the upper half of the bit mask if the indicated bit is 0.
 540 | // bit 10 = 0x400: set 1 in the bit mask if the corresponding index is -1 or V_DC
 541 | // Parameter a is a reference to a constexpr int array of permutation indexes
 542 | template <int N, int B>
 543 | constexpr uint64_t make_bit_mask(int const (&a)[N]) {
 544 |     uint64_t r = 0;                              // return value
 545 |     uint8_t  j = uint8_t(B & 0xFF);              // index to selected bit
 546 |     uint64_t s = 0;                              // bit number i in r
 547 |     uint64_t f = 0;                              // 1 if bit not flipped
 548 |     int i = 0;
 549 |     for (i = 0; i < N; i++) {
 550 |         int ix = a[i];
 551 |         if (ix < 0) {                            // -1 or V_DC
 552 |             s = (B >> 10) & 1;
 553 |         }
 554 |         else {
 555 |             s = ((uint32_t)ix >> j) & 1;         // extract selected bit
 556 |             if (i < N/2) {
 557 |                 f = (B >> 8) & 1;                // lower half
 558 |             }
 559 |             else {
 560 |                 f = (B >> 9) & 1;                // upper half
 561 |             }
 562 |             s ^= f ^ 1;                          // flip bit if needed
 563 |         }
 564 |         r |= uint64_t(s) << i;                   // set bit in return value
 565 |     }
 566 |     return r;
 567 | }
 568 | 
 569 | 
 570 | // make_broad_mask: Convert a bit mask m to a broad mask
 571 | // The return value will be a broad boolean mask with elementsize matching vector class V
 572 | template <typename V>
 573 | constexpr auto make_broad_mask(uint64_t const m) {
 574 |     constexpr int N = V::size();                 // number of vector elements
 575 |     typedef decltype(get_inttype<V>()) Etype;    // element type
 576 |     EList <Etype, N> u = {{0}};                  // list for returning
 577 |     int i = 0;
 578 |     for (i = 0; i < N; i++) {
 579 |         u.a[i] = ((m >> i) & 1) != 0 ? get_inttype<V>() : 0;
 580 |     }
 581 |     return u;                                    // return encapsulated array
 582 | }
 583 | 
 584 | 
 585 | // perm_mask_broad: return a mask for permutation by a vector register index.
 586 | // Parameter A is a reference to a constexpr int array of permutation indexes
 587 | template <typename V>
 588 | constexpr auto perm_mask_broad(int const (&A)[V::size()]) {
 589 |     constexpr int N = V::size();                 // number of vector elements
 590 |     typedef decltype(get_inttype<V>()) Etype;    // vector element type
 591 |     EList <Etype, N> u = {{0}};                  // list for returning
 592 |     int i = 0;
 593 |     for (i = 0; i < N; i++) {
 594 |         u.a[i] = Etype(A[i]);
 595 |     }
 596 |     return u;                                    // return encapsulated array
 597 | }
 598 | 
 599 | 
// perm_flags: returns information about how a permute can be implemented.
// The return value is a uint64_t composed of these flag bits.
// The last two constants, perm_rot_count and perm_ipattern, are not flags but
// the positions of multi-bit fields in the upper half of the return value.
const int perm_zeroing             = 1;  // needs zeroing
const int perm_perm                = 2;  // permutation needed
const int perm_allzero             = 4;  // all is zero or don't care
const int perm_largeblock          = 8;  // fits permute with a larger block size (e.g permute Vec2q instead of Vec4i)
const int perm_addz             = 0x10;  // additional zeroing needed after permute with larger block size or shift
const int perm_addz2            = 0x20;  // additional zeroing needed after perm_zext, perm_compress, or perm_expand
const int perm_cross_lane       = 0x40;  // permutation crossing 128-bit lanes
const int perm_same_pattern     = 0x80;  // same permute pattern in all 128-bit lanes
const int perm_punpckh         = 0x100;  // permutation pattern fits punpckh instruction
const int perm_punpckl         = 0x200;  // permutation pattern fits punpckl instruction
const int perm_rotate          = 0x400;  // permutation pattern fits rotation within lanes. 4 bit count returned in bits perm_rot_count to perm_rot_count+3
const int perm_shright        = 0x1000;  // permutation pattern fits shift right within lanes. 4 bit count returned in bits perm_rot_count to perm_rot_count+3
const int perm_shleft         = 0x2000;  // permutation pattern fits shift left within lanes. negative count returned in bits starting at perm_rot_count
const int perm_rotate_big     = 0x4000;  // permutation pattern fits rotation across lanes. 6 bit count returned in bits starting at perm_rot_count
const int perm_broadcast      = 0x8000;  // permutation pattern fits broadcast of a single element.
const int perm_zext          = 0x10000;  // permutation pattern fits zero extension
const int perm_compress      = 0x20000;  // permutation pattern fits vpcompress instruction
const int perm_expand        = 0x40000;  // permutation pattern fits vpexpand instruction
const int perm_outofrange = 0x10000000;  // index out of range
const int perm_rot_count          = 32;  // bit position, not a flag: rotate or shift count is in bits perm_rot_count to perm_rot_count+3
const int perm_ipattern           = 40;  // bit position, not a flag: pattern for pshufd is in bit perm_ipattern to perm_ipattern + 7 if perm_same_pattern and elementsize >= 4
 623 | 
 624 | template <typename V>
 625 | constexpr uint64_t perm_flags(int const (&a)[V::size()]) {
 626 |     // a is a reference to a constexpr array of permutation indexes
 627 |     // V is a vector class
 628 |     constexpr int N = V::size();                           // number of elements
 629 |     uint64_t r = perm_largeblock | perm_same_pattern | perm_allzero; // return value
 630 |     uint32_t i = 0;                                        // loop counter
 631 |     int      j = 0;                                        // loop counter
 632 |     int ix = 0;                                            // index number i
 633 |     const uint32_t nlanes = sizeof(V) / 16;                // number of 128-bit lanes
 634 |     const uint32_t lanesize = N / nlanes;                  // elements per lane
 635 |     const uint32_t elementsize = sizeof(V) / N;            // size of each vector element
 636 |     uint32_t lane = 0;                                     // current lane
 637 |     uint32_t rot = 999;                                    // rotate left count
 638 |     int32_t  broadc = 999;                                 // index to broadcasted element
 639 |     uint32_t patfail = 0;                                  // remember certain patterns that do not fit
 640 |     uint32_t addz2 = 0;                                    // remember certain patterns need extra zeroing
 641 |     int32_t  compresslasti = -1;                           // last index in perm_compress fit
 642 |     int32_t  compresslastp = -1;                           // last position in perm_compress fit
 643 |     int32_t  expandlasti = -1;                             // last index in perm_expand fit
 644 |     int32_t  expandlastp = -1;                             // last position in perm_expand fit
 645 | 
 646 |     int lanepattern[lanesize] = {0};                       // pattern in each lane
 647 | 
 648 |     for (i = 0; i < N; i++) {                              // loop through indexes
 649 |         ix = a[i];                                         // current index
 650 |         // meaning of ix: -1 = set to zero, V_DC = don't care, non-negative value = permute.
 651 |         if (ix == -1) {
 652 |             r |= perm_zeroing;                             // zeroing requested
 653 |         }
 654 |         else if (ix != V_DC && uint32_t(ix) >= N) {
 655 |             r |= perm_outofrange;                          // index out of range
 656 |         }
 657 |         if (ix >= 0) {
 658 |             r &= ~ perm_allzero;                           // not all zero
 659 |             if (ix != (int)i) r |= perm_perm;              // needs permutation
 660 |             if (broadc == 999) broadc = ix;                // remember broadcast index
 661 |             else if (broadc != ix) broadc = 1000;          // does not fit broadcast
 662 |         }
 663 |         // check if pattern fits a larger block size:
 664 |         // even indexes must be even, odd indexes must fit the preceding even index + 1
 665 |         if ((i & 1) == 0) {                                // even index
 666 |             if (ix >= 0 && (ix & 1)) r &= ~perm_largeblock;// not even. does not fit larger block size
 667 |             int iy = a[i + 1];                             // next odd index
 668 |             if (iy >= 0 && (iy & 1) == 0) r &= ~ perm_largeblock; // not odd. does not fit larger block size
 669 |             if (ix >= 0 && iy >= 0 && iy != ix+1) r &= ~ perm_largeblock; // does not fit preceding index + 1
 670 |             if (ix == -1 && iy >= 0) r |= perm_addz;       // needs additional zeroing at current block size
 671 |             if (iy == -1 && ix >= 0) r |= perm_addz;       // needs additional zeroing at current block size
 672 |         }
 673 |         lane = i / lanesize;                               // current lane
 674 |         if (lane == 0) {                                   // first lane, or no pattern yet
 675 |             lanepattern[i] = ix;                           // save pattern
 676 |         }
 677 |         // check if crossing lanes
 678 |         if (ix >= 0) {
 679 |             uint32_t lanei = (uint32_t)ix / lanesize;      // source lane
 680 |             if (lanei != lane) r |= perm_cross_lane;       // crossing lane
 681 |         }
 682 |         // check if same pattern in all lanes
 683 |         if (lane != 0 && ix >= 0) {                        // not first lane
 684 |             int j1  = i - int(lane * lanesize);            // index into lanepattern
 685 |             int jx = ix - int(lane * lanesize);            // pattern within lane
 686 |             if (jx < 0 || jx >= (int)lanesize) r &= ~perm_same_pattern; // source is in another lane
 687 |             if (lanepattern[j1] < 0) {
 688 |                 lanepattern[j1] = jx;                      // pattern not known from previous lane
 689 |             }
 690 |             else {
 691 |                 if (lanepattern[j1] != jx) r &= ~perm_same_pattern; // not same pattern
 692 |             }
 693 |         }
 694 |         if (ix >= 0) {
 695 |             // check if pattern fits zero extension (perm_zext)
 696 |             if (uint32_t(ix*2) != i) {
 697 |                 patfail |= 1;                              // does not fit zero extension
 698 |             }
 699 |             // check if pattern fits compress (perm_compress)
 700 |             if (ix > compresslasti && ix - compresslasti >= (int)i - compresslastp) {
 701 |                 if ((int)i - compresslastp > 1) addz2 |= 2;// perm_compress may need additional zeroing
 702 |                 compresslasti = ix;  compresslastp = i;
 703 |             }
 704 |             else {
 705 |                 patfail |= 2;                              // does not fit perm_compress
 706 |             }
 707 |             // check if pattern fits expand (perm_expand)
 708 |             if (ix > expandlasti && ix - expandlasti <= (int)i - expandlastp) {
 709 |                 if (ix - expandlasti > 1) addz2 |= 4;      // perm_expand may need additional zeroing
 710 |                 expandlasti = ix;  expandlastp = i;
 711 |             }
 712 |             else {
 713 |                 patfail |= 4;                              // does not fit perm_compress
 714 |             }
 715 |         }
 716 |         else if (ix == -1) {
 717 |             if ((i & 1) == 0) addz2 |= 1;                  // zero extension needs additional zeroing
 718 |         }
 719 |     }
 720 |     if (!(r & perm_perm)) return r;                        // more checks are superfluous
 721 | 
 722 |     if (!(r & perm_largeblock)) r &= ~ perm_addz;          // remove irrelevant flag
 723 |     if (r & perm_cross_lane) r &= ~ perm_same_pattern;     // remove irrelevant flag
 724 |     if ((patfail & 1) == 0) {
 725 |         r |= perm_zext;                                    // fits zero extension
 726 |         if ((addz2 & 1) != 0) r |= perm_addz2;
 727 |     }
 728 |     else if ((patfail & 2) == 0) {
 729 |         r |= perm_compress;                                // fits compression
 730 |         if ((addz2 & 2) != 0) {                            // check if additional zeroing needed
 731 |             for (j = 0; j < compresslastp; j++) {
 732 |                 if (a[j] == -1) r |= perm_addz2;
 733 |             }
 734 |         }
 735 |     }
 736 |     else if ((patfail & 4) == 0) {
 737 |         r |= perm_expand;                                  // fits expansion
 738 |         if ((addz2 & 4) != 0) {                            // check if additional zeroing needed
 739 |             for (j = 0; j < expandlastp; j++) {
 740 |                 if (a[j] == -1) r |= perm_addz2;
 741 |             }
 742 |         }
 743 |     }
 744 | 
 745 |     if (r & perm_same_pattern) {
 746 |         // same pattern in all lanes. check if it fits specific patterns
 747 |         bool fit = true;
 748 |         // fit shift or rotate
 749 |         for (i = 0; i < lanesize; i++) {
 750 |             if (lanepattern[i] >= 0) {
 751 |                 uint32_t rot1 = uint32_t(lanepattern[i] + lanesize - i) % lanesize;
 752 |                 if (rot == 999) {
 753 |                     rot = rot1;
 754 |                 }
 755 |                 else { // check if fit
 756 |                     if (rot != rot1) fit = false;
 757 |                 }
 758 |             }
 759 |         }
 760 |         rot &= lanesize-1;  // prevent out of range values
 761 |         if (fit) {   // fits rotate, and possibly shift
 762 |             uint64_t rot2 = (rot * elementsize) & 0xF;     // rotate right count in bytes
 763 |             r |= rot2 << perm_rot_count;                   // put shift/rotate count in output bit 16-19
 764 | #if INSTRSET >= 4  // SSSE3
 765 |             r |= perm_rotate;                              // allow palignr
 766 | #endif
 767 |             // fit shift left
 768 |             fit = true;
 769 |             for (i = 0; i < lanesize-rot; i++) {           // check if first rot elements are zero or don't care
 770 |                 if (lanepattern[i] >= 0) fit = false;
 771 |             }
 772 |             if (fit) {
 773 |                 r |= perm_shleft;
 774 |                 for (; i < lanesize; i++) if (lanepattern[i] == -1) r |= perm_addz; // additional zeroing needed
 775 |             }
 776 |             // fit shift right
 777 |             fit = true;
 778 |             for (i = lanesize-(uint32_t)rot; i < lanesize; i++) {    // check if last (lanesize-rot) elements are zero or don't care
 779 |                 if (lanepattern[i] >= 0) fit = false;
 780 |             }
 781 |             if (fit) {
 782 |                 r |= perm_shright;
 783 |                 for (i = 0; i < lanesize-rot; i++) {
 784 |                     if (lanepattern[i] == -1) r |= perm_addz; // additional zeroing needed
 785 |                 }
 786 |             }
 787 |         }
 788 |         // fit punpckhi
 789 |         fit = true;
 790 |         uint32_t j2 = lanesize / 2;
 791 |         for (i = 0; i < lanesize; i++) {
 792 |             if (lanepattern[i] >= 0 && lanepattern[i] != (int)j2) fit = false;
 793 |             if ((i & 1) != 0) j2++;
 794 |         }
 795 |         if (fit) r |= perm_punpckh;
 796 |         // fit punpcklo
 797 |         fit = true;
 798 |         j2 = 0;
 799 |         for (i = 0; i < lanesize; i++) {
 800 |             if (lanepattern[i] >= 0 && lanepattern[i] != (int)j2) fit = false;
 801 |             if ((i & 1) != 0) j2++;
 802 |         }
 803 |         if (fit) r |= perm_punpckl;
 804 |         // fit pshufd
 805 |         if (elementsize >= 4) {
 806 |             uint64_t p = 0;
 807 |             for (i = 0; i < lanesize; i++) {
 808 |                 if (lanesize == 4) {
 809 |                     p |= (lanepattern[i] & 3) << 2 * i;
 810 |                 }
 811 |                 else {  // lanesize = 2
 812 |                     p |= ((lanepattern[i] & 1) * 10 + 4) << 4 * i;
 813 |                 }
 814 |             }
 815 |             r |= p << perm_ipattern;
 816 |         }
 817 |     }
 818 | #if INSTRSET >= 7
 819 |     else {  // not same pattern in all lanes
 820 |         if constexpr (nlanes > 1) {                        // Try if it fits big rotate
 821 |             for (i = 0; i < N; i++) {
 822 |                 ix = a[i];
 823 |                 if (ix >= 0) {
 824 |                     uint32_t rot2 = (ix + N - i) % N;      // rotate count
 825 |                     if (rot == 999) {
 826 |                         rot = rot2;                        // save rotate count
 827 |                     }
 828 |                     else if (rot != rot2) {
 829 |                         rot = 1000; break;                 // does not fit big rotate
 830 |                     }
 831 |                 }
 832 |             }
 833 |             if (rot < N) {                                 // fits big rotate
 834 |                 r |= perm_rotate_big | (uint64_t)rot << perm_rot_count;
 835 |             }
 836 |         }
 837 |     }
 838 | #endif
 839 |     if (broadc < 999 && (r & (perm_rotate|perm_shright|perm_shleft|perm_rotate_big)) == 0) {
 840 |         r |= perm_broadcast | (uint64_t)broadc << perm_rot_count; // fits broadcast
 841 |     }
 842 |     return r;
 843 | }
 844 | 
 845 | 
 846 | // compress_mask: returns a bit mask to use for compression instruction.
 847 | // It is presupposed that perm_flags indicates perm_compress.
 848 | // Additional zeroing is needed if perm_flags indicates perm_addz2
 849 | template <int N>
 850 | constexpr uint64_t compress_mask(int const (&a)[N]) {
 851 |     // a is a reference to a constexpr array of permutation indexes
 852 |     int ix = 0, lasti = -1, lastp = -1;
 853 |     uint64_t m = 0;
 854 |     int i = 0; int j = 1;                                  // loop counters
 855 |     for (i = 0; i < N; i++) {
 856 |         ix = a[i];                                         // permutation index
 857 |         if (ix >= 0) {
 858 |             m |= (uint64_t)1 << ix;                        // mask for compression source
 859 |             for (j = 1; j < i - lastp; j++) {
 860 |                 m |= (uint64_t)1 << (lasti + j);           // dummy filling source
 861 |             }
 862 |             lastp = i; lasti = ix;
 863 |         }
 864 |     }
 865 |     return m;
 866 | }
 867 | 
 868 | // expand_mask: returns a bit mask to use for expansion instruction.
 869 | // It is presupposed that perm_flags indicates perm_expand.
 870 | // Additional zeroing is needed if perm_flags indicates perm_addz2
 871 | template <int N>
 872 | constexpr uint64_t expand_mask(int const (&a)[N]) {
 873 |     // a is a reference to a constexpr array of permutation indexes
 874 |     int ix = 0, lasti = -1, lastp = -1;
 875 |     uint64_t m = 0;
 876 |     int i = 0; int j = 1;
 877 |     for (i = 0; i < N; i++) {
 878 |         ix = a[i];                                         // permutation index
 879 |         if (ix >= 0) {
 880 |             m |= (uint64_t)1 << i;                         // mask for expansion destination
 881 |             for (j = 1; j < ix - lasti; j++) {
 882 |                 m |= (uint64_t)1 << (lastp + j);           // dummy filling destination
 883 |             }
 884 |             lastp = i; lasti = ix;
 885 |         }
 886 |     }
 887 |     return m;
 888 | }
 889 | 
 890 | // perm16_flags: returns information about how to permute a vector of 16-bit integers
 891 | // Note: It is presupposed that perm_flags reports perm_same_pattern
 892 | // The return value is composed of these bits:
 893 | // 1:  data from low  64 bits to low  64 bits. pattern in bit 32-39
 894 | // 2:  data from high 64 bits to high 64 bits. pattern in bit 40-47
 895 | // 4:  data from high 64 bits to low  64 bits. pattern in bit 48-55
 896 | // 8:  data from low  64 bits to high 64 bits. pattern in bit 56-63
 897 | template <typename V>
 898 | constexpr uint64_t perm16_flags(int const (&a)[V::size()]) {
 899 |     // a is a reference to a constexpr array of permutation indexes
 900 |     // V is a vector class
 901 |     constexpr int N = V::size();                           // number of elements
 902 | 
 903 |     uint64_t retval = 0;                                   // return value
 904 |     uint32_t pat[4] = {0,0,0,0};                           // permute patterns
 905 |     uint32_t i = 0;                                        // loop counter
 906 |     int ix = 0;                                            // index number i
 907 |     const uint32_t lanesize = 8;                           // elements per lane
 908 |     uint32_t lane = 0;                                     // current lane
 909 |     int lanepattern[lanesize] = {0};                       // pattern in each lane
 910 | 
 911 |     for (i = 0; i < N; i++) {
 912 |         ix = a[i];
 913 |         lane = i / lanesize;                               // current lane
 914 |         if (lane == 0) {
 915 |             lanepattern[i] = ix;                           // save pattern
 916 |         }
 917 |         else if (ix >= 0) {                                // not first lane
 918 |             uint32_t j = i - lane * lanesize;              // index into lanepattern
 919 |             int jx = ix - lane * lanesize;                 // pattern within lane
 920 |             if (lanepattern[j] < 0) {
 921 |                 lanepattern[j] = jx;                       // pattern not known from previous lane
 922 |             }
 923 |         }
 924 |     }
 925 |     // four patterns: low2low, high2high, high2low, low2high
 926 |     for (i = 0; i < 4; i++) {
 927 |         // loop through low pattern
 928 |         if (lanepattern[i] >= 0) {
 929 |             if (lanepattern[i] < 4) { // low2low
 930 |                 retval |= 1;
 931 |                 pat[0] |= uint32_t(lanepattern[i] & 3) << (2 * i);
 932 |             }
 933 |             else {  // high2low
 934 |                 retval |= 4;
 935 |                 pat[2] |= uint32_t(lanepattern[i] & 3) << (2 * i);
 936 |             }
 937 |         }
 938 |         // loop through high pattern
 939 |         if (lanepattern[i+4] >= 0) {
 940 |             if (lanepattern[i+4] < 4) { // low2high
 941 |                 retval |= 8;
 942 |                 pat[3] |= uint32_t(lanepattern[i+4] & 3) << (2 * i);
 943 |             }
 944 |             else {  // high2high
 945 |                 retval |= 2;
 946 |                 pat[1] |= uint32_t(lanepattern[i+4] & 3) << (2 * i);
 947 |             }
 948 |         }
 949 |     }
 950 |     // join return data
 951 |     for (i = 0; i < 4; i++) {
 952 |         retval |= (uint64_t)pat[i] << (32 + i*8);
 953 |     }
 954 |     return retval;
 955 | }
 956 | 
 957 | 
 958 | // pshufb_mask: return a broad byte mask for permutation within lanes
 959 | // for use with the pshufb instruction (_mm..._shuffle_epi8).
 960 | // The pshufb instruction provides fast permutation and zeroing,
 961 | // allowing different patterns in each lane but no crossing of lane boundaries
 962 | template <typename V, int oppos = 0>
 963 | constexpr auto pshufb_mask(int const (&A)[V::size()]) {
 964 |     // Parameter a is a reference to a constexpr array of permutation indexes
 965 |     // V is a vector class
 966 |     // oppos = 1 for data from the opposite 128-bit lane in 256-bit vectors
 967 |     constexpr uint32_t N = V::size();                      // number of vector elements
 968 |     constexpr uint32_t elementsize = sizeof(V) / N;        // size of each vector element
 969 |     constexpr uint32_t nlanes = sizeof(V) / 16;            // number of 128 bit lanes in vector
 970 |     constexpr uint32_t elements_per_lane = N / nlanes;     // number of vector elements per lane
 971 | 
 972 |     EList <int8_t, sizeof(V)> u = {{0}};                   // list for returning
 973 | 
 974 |     uint32_t i = 0;                                        // loop counters
 975 |     uint32_t j = 0;
 976 |     int m = 0;
 977 |     int k = 0;
 978 |     uint32_t lane = 0;
 979 | 
 980 |     for (lane = 0; lane < nlanes; lane++) {                // loop through lanes
 981 |         for (i = 0; i < elements_per_lane; i++) {          // loop through elements in lane
 982 |             // permutation index for element within lane
 983 |             int8_t p = -1;
 984 |             int ix = A[m];
 985 |             if (ix >= 0) {
 986 |                 ix ^= oppos * elements_per_lane;           // flip bit if opposite lane
 987 |             }
 988 |             ix -= int(lane * elements_per_lane);           // index relative to lane
 989 |             if (ix >= 0 && ix < (int)elements_per_lane) {  // index points to desired lane
 990 |                 p = ix * elementsize;
 991 |             }
 992 |             for (j = 0; j < elementsize; j++) {            // loop through bytes in element
 993 |                 u.a[k++] = p < 0 ? -1 : p + j;             // store byte permutation index
 994 |             }
 995 |             m++;
 996 |         }
 997 |     }
 998 |     return u;                                              // return encapsulated array
 999 | }
1000 | 
1001 | 
1002 | // largeblock_perm: return indexes for replacing a permute or blend with
1003 | // a certain block size by a permute or blend with the double block size.
1004 | // Note: it is presupposed that perm_flags() indicates perm_largeblock
1005 | // It is required that additional zeroing is added if perm_flags() indicates perm_addz
1006 | template <int N>
1007 | constexpr EList<int, N/2> largeblock_perm(int const (&a)[N]) {
1008 |     // Parameter a is a reference to a constexpr array of permutation indexes
1009 |     EList<int, N/2> list = {{0}};                 // result indexes
1010 |     int ix = 0;                                  // even index
1011 |     int iy = 0;                                  // odd index
1012 |     int iz = 0;                                  // combined index
1013 |     bool fit_addz = false;                       // additional zeroing needed at the lower block level
1014 |     int i = 0;                                   // loop counter
1015 | 
1016 |     // check if additional zeroing is needed at current block size
1017 |     for (i = 0; i < N; i += 2) {
1018 |         ix = a[i];                               // even index
1019 |         iy = a[i+1];                             // odd index
1020 |         if ((ix == -1 && iy >= 0) || (iy == -1 && ix >= 0)) {
1021 |             fit_addz = true;
1022 |         }
1023 |     }
1024 | 
1025 |     // loop through indexes
1026 |     for (i = 0; i < N; i += 2) {
1027 |         ix = a[i];                               // even index
1028 |         iy = a[i+1];                             // odd index
1029 |         if (ix >= 0) {
1030 |             iz = ix / 2;                         // half index
1031 |         }
1032 |         else if (iy >= 0) {
1033 |             iz = iy / 2;
1034 |         }
1035 |         else {
1036 |             iz = ix | iy;                        // -1 or V_DC. -1 takes precedence
1037 |             if (fit_addz) iz = V_DC;             // V_DC, because result will be zeroed later
1038 |         }
1039 |         list.a[i/2] = iz;                        // save to list
1040 |     }
1041 |     return list;
1042 | }
1043 | 
1044 | 
// blend_flags: returns information about how a blend function can be implemented
// The return value is composed of these flag bits:
const int blend_zeroing            = 1;  // needs zeroing
const int blend_allzero            = 2;  // all is zero or don't care
const int blend_largeblock         = 4;  // fits blend with a larger block size (e.g permute Vec2q instead of Vec4i)
const int blend_addz               = 8;  // additional zeroing needed after blend with larger block size or shift
const int blend_a               = 0x10;  // has data from a
const int blend_b               = 0x20;  // has data from b
const int blend_perma           = 0x40;  // permutation of a needed
const int blend_permb           = 0x80;  // permutation of b needed
const int blend_cross_lane     = 0x100;  // permutation crossing 128-bit lanes
const int blend_same_pattern   = 0x200;  // same permute/blend pattern in all 128-bit lanes
const int blend_punpckhab     = 0x1000;  // pattern fits punpckh(a,b)
const int blend_punpckhba     = 0x2000;  // pattern fits punpckh(b,a)
const int blend_punpcklab     = 0x4000;  // pattern fits punpckl(a,b)
const int blend_punpcklba     = 0x8000;  // pattern fits punpckl(b,a)
const int blend_rotateab     = 0x10000;  // pattern fits palignr(a,b)
const int blend_rotateba     = 0x20000;  // pattern fits palignr(b,a)
const int blend_shufab       = 0x40000;  // pattern fits shufps/shufpd(a,b)
const int blend_shufba       = 0x80000;  // pattern fits shufps/shufpd(b,a)
const int blend_rotate_big  = 0x100000;  // pattern fits rotation across lanes. count returned in bits blend_rotpattern
const int blend_outofrange= 0x10000000;  // index out of range
// The two constants below are bit positions (shift counts) into the return value, not flag masks:
const int blend_shufpattern       = 32;  // pattern for shufps/shufpd is in bit blend_shufpattern to blend_shufpattern + 7
const int blend_rotpattern        = 40;  // pattern for palignr is in bit blend_rotpattern to blend_rotpattern + 7
1069 | 
// blend_flags: analyze a constexpr list of blend indexes and report, as a set of
// blend_... flag bits, which implementation strategies the pattern fits.
// Index meaning, as checked below:
//   0 .. N-1     : element taken from a
//   N .. 2*N-1   : element taken from b
//   -1           : element is set to zero
//   V_DC         : accepted negative value (combined with -1 as "zero or don't care")
//   anything else: illegal; the function then returns exactly blend_outofrange
// If no permutation of a or b is needed, the function returns early and none of the
// unpack / rotate / shufps pattern checks are made.
// The shuffle pattern is stored at bit blend_shufpattern and the rotate count at
// bit blend_rotpattern of the return value.
template <typename V>
constexpr uint64_t blend_flags(int const (&a)[V::size()]) {
    // a is a reference to a constexpr array of permutation indexes
    // V is a vector class
    constexpr int N = V::size();                           // number of elements
    // start optimistic: these three flags are cleared again when an index contradicts them
    uint64_t r = blend_largeblock | blend_same_pattern | blend_allzero; // return value
    uint32_t iu = 0;                                       // loop counter
    int32_t ii = 0;                                        // loop counter
    int ix = 0;                                            // index number i
    const uint32_t nlanes = sizeof(V) / 16;                // number of 128-bit lanes
    const uint32_t lanesize = N / nlanes;                  // elements per lane
    uint32_t lane = 0;                                     // current lane
    uint32_t rot = 999;                                    // rotate left count. 999 = not yet known, 1000 = does not fit
    int lanepattern[lanesize] = {0};                       // pattern in each lane
    if (lanesize == 2 && N <= 8) {
        r |= blend_shufab | blend_shufba;                  // check if it fits shufpd
    }

    for (ii = 0; ii < N; ii++) {                           // loop through indexes
        ix = a[ii];                                        // index
        if (ix < 0) {
            if (ix == -1) r |= blend_zeroing;              // set to zero
            else if (ix != V_DC) {
                r = blend_outofrange;  break;              // illegal index
            }
        }
        else {  // ix >= 0
            r &= ~ blend_allzero;
            if (ix < N) {
                r |= blend_a;                              // data from a
                if (ix != ii) r |= blend_perma;            // permutation of a
            }
            else if (ix < 2*N) {
                r |= blend_b;                              // data from b
                if (ix != ii + N) r |= blend_permb;        // permutation of b
            }
            else {
                r = blend_outofrange;  break;              // illegal index
            }
        }
        // check if pattern fits a larger block size:
        // even indexes must be even, odd indexes must fit the preceding even index + 1
        if ((ii & 1) == 0) {                               // even index
            if (ix >= 0 && (ix&1)) r &= ~blend_largeblock; // not even. does not fit larger block size
            int iy = a[ii+1];                              // next odd index
            if (iy >= 0 && (iy & 1) == 0) r &= ~ blend_largeblock; // not odd. does not fit larger block size
            if (ix >= 0 && iy >= 0 && iy != ix+1) r &= ~ blend_largeblock; // does not fit preceding index + 1
            if (ix == -1 && iy >= 0) r |= blend_addz;      // needs additional zeroing at current block size
            if (iy == -1 && ix >= 0) r |= blend_addz;      // needs additional zeroing at current block size
        }
        lane = (uint32_t)ii / lanesize;                    // current lane
        if (lane == 0) {                                   // first lane
            lanepattern[ii] = ix;                          // save pattern
        }
        // check if crossing lanes
        if (ix >= 0) {
            // ix & ~N removes the bit that selects b, leaving the element position
            uint32_t lanei = uint32_t(ix & ~N) / lanesize; // source lane
            if (lanei != lane) {
                r |= blend_cross_lane;                     // crossing lane
            }
            if (lanesize == 2) {   // check if it fits shufpd
                if (lanei != lane) r &= ~(blend_shufab | blend_shufba);
                if ((((ix & N) != 0) ^ ii) & 1) r &= ~blend_shufab;
                else r &= ~blend_shufba;
            }
        }
        // check if same pattern in all lanes
        if (lane != 0 && ix >= 0) {                        // not first lane
            int j  = ii - int(lane * lanesize);            // index into lanepattern
            int jx = ix - int(lane * lanesize);            // pattern within lane
            if (jx < 0 || (jx & ~N) >= (int)lanesize) r &= ~blend_same_pattern; // source is in another lane
            if (lanepattern[j] < 0) {
                lanepattern[j] = jx;                       // pattern not known from previous lane
            }
            else {
                if (lanepattern[j] != jx) r &= ~blend_same_pattern; // not same pattern
            }
        }
    }
    if (!(r & blend_largeblock)) r &= ~ blend_addz;        // remove irrelevant flag
    if (r & blend_cross_lane) r &= ~ blend_same_pattern;   // remove irrelevant flag
    if (!(r & (blend_perma | blend_permb))) {
        return r;                                          // no permutation. more checks are superfluous
    }
    if (r & blend_same_pattern) {
        // same pattern in all lanes. check if it fits unpack patterns
        // all four unpack flags are set first and cleared by any entry that does not match
        r |= blend_punpckhab | blend_punpckhba | blend_punpcklab | blend_punpcklba;
        for (iu = 0; iu < lanesize; iu++) {                // loop through lanepattern
            ix = lanepattern[iu];
            if (ix >= 0) {
                if ((uint32_t)ix != iu / 2 + (iu & 1) * N)                    r &= ~ blend_punpcklab;
                if ((uint32_t)ix != iu / 2 + ((iu & 1) ^ 1) * N)              r &= ~ blend_punpcklba;
                if ((uint32_t)ix != (iu + lanesize) / 2 + (iu & 1) * N)       r &= ~ blend_punpckhab;
                if ((uint32_t)ix != (iu + lanesize) / 2 + ((iu & 1) ^ 1) * N) r &= ~ blend_punpckhba;
            }
        }
#if INSTRSET >= 4  // SSSE3. check if it fits palignr
        for (iu = 0; iu < lanesize; iu++) {
            ix = lanepattern[iu];
            if (ix >= 0) {
                // t = position in the 2*lanesize sequence where elements of b follow elements of a
                uint32_t t = ix & ~N;
                if (ix & N) t += lanesize;
                uint32_t tb = (t + 2*lanesize - iu) % (lanesize * 2);
                if (rot == 999) {
                    rot = tb;
                }
                else { // check if fit
                    if (rot != tb) rot = 1000;
                }
            }
        }
        if (rot < 999) { // fits palignr
            if (rot < lanesize) {
                r |= blend_rotateba;
            }
            else {
                r |= blend_rotateab;
            }
            const uint32_t elementsize = sizeof(V) / N;
            // store the rotate count, converted to bytes, at bit blend_rotpattern
            r |= uint64_t((rot & (lanesize - 1)) * elementsize) << blend_rotpattern;
        }
#endif
        if (lanesize == 4) {
            // check if it fits shufps
            // shufab: first two elements from a, last two from b. shufba: the opposite
            r |= blend_shufab | blend_shufba;
            for (ii = 0; ii < 2; ii++) {
                ix = lanepattern[ii];
                if (ix >= 0) {
                    if (ix & N) r &= ~ blend_shufab;
                    else        r &= ~ blend_shufba;
                }
            }
            for (; ii < 4; ii++) {
                ix = lanepattern[ii];
                if (ix >= 0) {
                    if (ix & N) r &= ~ blend_shufba;
                    else        r &= ~ blend_shufab;
                }
            }
            if (r & (blend_shufab | blend_shufba)) {       // fits shufps/shufpd
                uint8_t shufpattern = 0;                   // get pattern
                for (iu = 0; iu < lanesize; iu++) {
                    shufpattern |= (lanepattern[iu] & 3) << iu * 2;
                }
                r |= (uint64_t)shufpattern << blend_shufpattern; // return pattern
            }
        }
    }
    else if  (nlanes > 1) {  // not same pattern in all lanes
        rot = 999;                                         // check if it fits big rotate
        for (ii = 0; ii < N; ii++) {
            ix = a[ii];
            if (ix >= 0) {
                uint32_t rot2 = (ix + 2 * N - ii) % (2 * N);// rotate count
                if (rot == 999) {
                    rot = rot2;                            // save rotate count
                }
                else if (rot != rot2) {
                    rot = 1000; break;                     // does not fit big rotate
                }
            }
        }
        if (rot < 2 * N) {                                 // fits big rotate
            r |= blend_rotate_big | (uint64_t)rot << blend_rotpattern;
        }
    }
    if (lanesize == 2 && (r & (blend_shufab | blend_shufba))) {  // fits shufpd. Get pattern
        for (ii = 0; ii < N; ii++) {
            r |= uint64_t(a[ii] & 1) << (blend_shufpattern + ii); // one pattern bit per element
        }
    }
    return r;
}
1243 | 
1244 | // blend_perm_indexes: return an Indexlist for implementing a blend function as
1245 | // two permutations. N = vector size.
1246 | // dozero = 0: let unused elements be don't care. The two permutation results must be blended
1247 | // dozero = 1: zero unused elements in each permuation. The two permutation results can be OR'ed
1248 | // dozero = 2: indexes that are -1 or V_DC are preserved
1249 | template <int N, int dozero>
1250 | constexpr EList<int, 2*N> blend_perm_indexes(int const (&a)[N]) {
1251 |     // a is a reference to a constexpr array of permutation indexes
1252 |     EList<int, 2*N> list = {{0}};       // list to return
1253 |     int u = dozero ? -1 : V_DC;        // value to use for unused entries
1254 |     int j = 0;
1255 | 
1256 |     for (j = 0; j < N; j++) {          // loop through indexes
1257 |         int ix = a[j];                 // current index
1258 |         if (ix < 0) {                  // zero or don't care
1259 |             if (dozero == 2) {
1260 |                 // list.a[j] = list.a[j + N] = ix;  // fails in gcc in complicated cases
1261 |                 list.a[j] = ix;
1262 |                 list.a[j + N] = ix;
1263 |             }
1264 |             else {
1265 |                 // list.a[j] = list.a[j + N] = u;
1266 |                 list.a[j] = u;
1267 |                 list.a[j + N] = u;
1268 |             }
1269 |         }
1270 |         else if (ix < N) {             // value from a
1271 |             list.a[j]   = ix;
1272 |             list.a[j+N] = u;
1273 |         }
1274 |         else {
1275 |             list.a[j]   = u;           // value from b
1276 |             list.a[j+N] = ix - N;
1277 |         }
1278 |     }
1279 |     return list;
1280 | }
1281 | 
1282 | // largeblock_indexes: return indexes for replacing a permute or blend with a
1283 | // certain block size by a permute or blend with the double block size.
1284 | // Note: it is presupposed that perm_flags or blend_flags indicates _largeblock
1285 | // It is required that additional zeroing is added if perm_flags or blend_flags
1286 | // indicates _addz
1287 | template <int N>
1288 | constexpr EList<int, N/2> largeblock_indexes(int const (&a)[N]) {
1289 |     // Parameter a is a reference to a constexpr array of N permutation indexes
1290 |     EList<int, N/2> list = {{0}};                 // list to return
1291 | 
1292 |     bool fit_addz = false;                       // additional zeroing needed at the lower block level
1293 |     int ix = 0;                                  // even index
1294 |     int iy = 0;                                  // odd index
1295 |     int iz = 0;                                  // combined index
1296 |     int i  = 0;                                  // loop counter
1297 | 
1298 |     for (i = 0; i < N; i += 2) {
1299 |         ix = a[i];                               // even index
1300 |         iy = a[i+1];                             // odd index
1301 |         if (ix >= 0) {
1302 |             iz = ix / 2;                         // half index
1303 |         }
1304 |         else if (iy >= 0) {
1305 |             iz = iy / 2;                         // half index
1306 |         }
1307 |         else iz = ix | iy;                       // -1 or V_DC. -1 takes precedence
1308 |         list.a[i/2] = iz;                        // save to list
1309 |         // check if additional zeroing is needed at current block size
1310 |         if ((ix == -1 && iy >= 0) || (iy == -1 && ix >= 0)) {
1311 |             fit_addz = true;
1312 |         }
1313 |     }
1314 |     // replace -1 by V_DC if fit_addz
1315 |     if (fit_addz) {
1316 |         for (i = 0; i < N/2; i++) {
1317 |             if (list.a[i] < 0) list.a[i] = V_DC;
1318 |         }
1319 |     }
1320 |     return list;
1321 | }
1322 | 
1323 | 
1324 | /****************************************************************************************
1325 | *
1326 | *          Vector blend helper function templates
1327 | *
1328 | * These templates are for emulating a blend with a vector size that is not supported by
1329 | * the instruction set, using multiple blends or permutations of half the vector size
1330 | *
1331 | ****************************************************************************************/
1332 | 
// Dummy function templates named blend2 ... blend32. They exist to avoid error
// messages when the real blend functions are not yet defined.
template <typename Unused> void blend2()  {}
template <typename Unused> void blend4()  {}
template <typename Unused> void blend8()  {}
template <typename Unused> void blend16() {}
template <typename Unused> void blend32() {}
1339 | 
1340 | // blend_half_indexes: return an Indexlist for emulating a blend function as
1341 | // blends or permutations from multiple sources
1342 | // dozero = 0: let unused elements be don't care. Multiple permutation results must be blended
1343 | // dozero = 1: zero unused elements in each permuation. Multiple permutation results can be OR'ed
1344 | // dozero = 2: indexes that are -1 or V_DC are preserved
1345 | // src1, src2: sources to blend in a partial implementation
1346 | template <int N, int dozero, int src1, int src2>
1347 | constexpr EList<int, N> blend_half_indexes(int const (&a)[N]) {
1348 |     // a is a reference to a constexpr array of permutation indexes
1349 |     EList<int, N> list = {{0}};         // list to return
1350 |     int u = dozero ? -1 : V_DC;        // value to use for unused entries
1351 |     int j = 0;                         // loop counter
1352 | 
1353 |     for (j = 0; j < N; j++) {          // loop through indexes
1354 |         int ix = a[j];                 // current index
1355 |         if (ix < 0) {                  // zero or don't care
1356 |             list.a[j] = (dozero == 2) ? ix : u;
1357 |         }
1358 |         else {
1359 |             int src = ix / N;          // source
1360 |             if (src == src1) {
1361 |                 list.a[j] = ix & (N - 1);
1362 |             }
1363 |             else if (src == src2) {
1364 |                 list.a[j] = (ix & (N - 1)) + N;
1365 |             }
1366 |             else list.a[j] = u;
1367 |         }
1368 |     }
1369 |     return list;
1370 | }
1371 | 
// selectblend: pick one of the four half-size sources available to a blend.
// s = 0: low half of a,  s = 1: high half of a,
// s = 2: low half of b,  any other s: high half of b
template <typename W, int s>
static inline auto selectblend(W const a, W const b) {
    if constexpr (s == 0 || s == 1) {
        // source comes from the first vector
        if constexpr (s == 0) return a.get_low();
        else                  return a.get_high();
    }
    else {
        // source comes from the second vector
        if constexpr (s == 2) return b.get_low();
        else                  return b.get_high();
    }
}
1380 | 
1381 | // blend_half: Emulate a blend with a vector size that is not supported
1382 | // by multiple blends with half the vector size.
1383 | // blend_half is called twice, to give the low and high half of the result
1384 | // Parameters: W: type of full-size vector
1385 | // i0...: indexes for low or high half
1386 | // a, b: full size input vectors
1387 | // return value: half-size vector for lower or upper part
1388 | template <typename W, int ... i0>
1389 | auto blend_half(W const& a, W const& b) {
1390 |     typedef decltype(a.get_low()) V;             // type for half-size vector
1391 |     constexpr int N = V::size();                 // size of half-size vector
1392 |     static_assert(sizeof...(i0) == N, "wrong number of indexes in blend_half");
1393 |     constexpr int ind[N] = { i0... };            // array of indexes
1394 | 
1395 |     // lambda to find which of the four possible sources are used
1396 |     // return: EList<int, 5> containing a list of up to 4 sources. The last element is the number of sources used
1397 |     auto listsources = [](int const n, int const (&ind)[N]) constexpr {
1398 |         bool source_used[4] = { false,false,false,false }; // list of sources used
1399 |         int i = 0;
1400 |         for (i = 0; i < n; i++) {
1401 |             int ix = ind[i];                     // index
1402 |             if (ix >= 0) {
1403 |                 int src = ix / n;                // source used
1404 |                 source_used[src & 3] = true;
1405 |             }
1406 |         }
1407 |         // return a list of sources used. The last element is the number of sources used
1408 |         EList<int, 5> sources = {{0}};
1409 |         int nsrc = 0;                            // number of sources
1410 |         for (i = 0; i < 4; i++) {
1411 |             if (source_used[i]) {
1412 |                 sources.a[nsrc++] = i;
1413 |             }
1414 |         }
1415 |         sources.a[4] = nsrc;
1416 |         return sources;
1417 |     };
1418 |     // list of sources used
1419 |     constexpr EList<int, 5> sources = listsources(N, ind);
1420 |     constexpr int nsrc = sources.a[4];           // number of sources used
1421 | 
1422 |     if constexpr (nsrc == 0) {                   // no sources
1423 |         return V(0);
1424 |     }
1425 |     // get indexes for the first one or two sources
1426 |     constexpr int uindex = (nsrc > 2) ? 1 : 2;   // unused elements set to zero if two blends are combined
1427 |     constexpr EList<int, N> L = blend_half_indexes<N, uindex, sources.a[0], sources.a[1]>(ind);
1428 |     V x0;
1429 |     V src0 = selectblend<W, sources.a[0]>(a, b); // first source
1430 |     V src1 = selectblend<W, sources.a[1]>(a, b); // second source
1431 |     if constexpr (N == 2) {
1432 |         x0 = blend2  <L.a[0], L.a[1]> (src0, src1);
1433 |     }
1434 |     else if constexpr (N == 4) {
1435 |         x0 = blend4  <L.a[0], L.a[1], L.a[2], L.a[3]> (src0, src1);
1436 |     }
1437 |     else if constexpr (N == 8) {
1438 |         x0 = blend8  <L.a[0], L.a[1], L.a[2], L.a[3], L.a[4], L.a[5], L.a[6], L.a[7]> (src0, src1);
1439 |     }
1440 |     else if constexpr (N == 16) {
1441 |         x0 = blend16 <L.a[0], L.a[1], L.a[2],  L.a[3],  L.a[4],  L.a[5],  L.a[6],  L.a[7],
1442 |             L.a[8], L.a[9], L.a[10], L.a[11], L.a[12], L.a[13], L.a[14], L.a[15] > (src0, src1);
1443 |     }
1444 |     else if constexpr (N == 32) {
1445 |         x0 = blend32 <L.a[0], L.a[1],  L.a[2],  L.a[3],  L.a[4],  L.a[5],  L.a[6],  L.a[7],
1446 |             L.a[8],  L.a[9],  L.a[10], L.a[11], L.a[12], L.a[13], L.a[14], L.a[15],
1447 |             L.a[16], L.a[17], L.a[18], L.a[19], L.a[20], L.a[21], L.a[22], L.a[23],
1448 |             L.a[24], L.a[25], L.a[26], L.a[27], L.a[28], L.a[29], L.a[30], L.a[31] > (src0, src1);
1449 |     }
1450 |     if constexpr (nsrc > 2) {    // get last one or two sources
1451 |         constexpr EList<int, N> M = blend_half_indexes<N, 1, sources.a[2], sources.a[3]>(ind);
1452 |         V x1;
1453 |         V src2 = selectblend<W, sources.a[2]>(a, b);  // third source
1454 |         V src3 = selectblend<W, sources.a[3]>(a, b);  // fourth source
1455 |         if constexpr (N == 2) {
1456 |             x1 = blend2  <M.a[0], M.a[1]> (src0, src1);
1457 |         }
1458 |         else if constexpr (N == 4) {
1459 |             x1 = blend4  <M.a[0], M.a[1], M.a[2], M.a[3]> (src2, src3);
1460 |         }
1461 |         else if constexpr (N == 8) {
1462 |             x1 = blend8  <M.a[0], M.a[1], M.a[2], M.a[3], M.a[4], M.a[5], M.a[6], M.a[7]> (src2, src3);
1463 |         }
1464 |         else if constexpr (N == 16) {
1465 |             x1 = blend16 <M.a[0], M.a[1], M.a[2],  M.a[3],  M.a[4],  M.a[5],  M.a[6],  M.a[7],
1466 |                 M.a[8], M.a[9], M.a[10], M.a[11], M.a[12], M.a[13], M.a[14], M.a[15] > (src2, src3);
1467 |         }
1468 |         else if constexpr (N == 32) {
1469 |             x1 = blend32 <M.a[0], M.a[1],  M.a[2],   M.a[3],  M.a[4],  M.a[5],  M.a[6],  M.a[7],
1470 |                 M.a[8], M.a[9],  M.a[10],  M.a[11], M.a[12], M.a[13], M.a[14], M.a[15],
1471 |                 M.a[16], M.a[17], M.a[18], M.a[19], M.a[20], M.a[21], M.a[22], M.a[23],
1472 |                 M.a[24], M.a[25], M.a[26], M.a[27], M.a[28], M.a[29], M.a[30], M.a[31] > (src2, src3);
1473 |         }
1474 |         x0 |= x1;      // combine result of two blends. Unused elements are zero
1475 |     }
1476 |     return x0;
1477 | }
1478 | 
1479 | 
1480 | #ifdef VCL_NAMESPACE
1481 | }
1482 | #endif
1483 | 
1484 | 
1485 | #endif // INSTRSET_H
1486 | 


--------------------------------------------------------------------------------
/CAS/VCL2/instrset_detect.cpp:
--------------------------------------------------------------------------------
  1 | /**************************  instrset_detect.cpp   ****************************
  2 | * Author:        Agner Fog
  3 | * Date created:  2012-05-30
  4 | * Last modified: 2019-08-01
  5 | * Version:       2.00.00
  6 | * Project:       vector class library
  7 | * Description:
  8 | * Functions for checking which instruction sets are supported.
  9 | *
 10 | * (c) Copyright 2012-2019 Agner Fog.
 11 | * Apache License version 2.0 or later.
 12 | ******************************************************************************/
 13 | 
 14 | #include "instrset.h"
 15 | 
 16 | #ifdef VCL_NAMESPACE
 17 | namespace VCL_NAMESPACE {
 18 | #endif
 19 | 
 20 | 
// Define interface to xgetbv instruction
// Parameter ctr: value passed to the instruction in ecx (or to the _xgetbv intrinsic)
// Return value: the 64-bit result, combined from eax (low 32 bits) and edx (high 32 bits)
// One of three implementations is selected at compile time, depending on the compiler
static inline uint64_t xgetbv (int ctr) {
#if (defined (_MSC_FULL_VER) && _MSC_FULL_VER >= 160040000) || (defined (__INTEL_COMPILER) && __INTEL_COMPILER >= 1200)
    // Microsoft or Intel compiler supporting _xgetbv intrinsic

    return uint64_t(_xgetbv(ctr));                    // intrinsic function for XGETBV

#elif defined(__GNUC__) ||  defined (__clang__)       // use inline assembly, Gnu/AT&T syntax

   // ctr is placed in ecx; the result is read from eax (a) and edx (d)
   uint32_t a, d;
   __asm("xgetbv" : "=a"(a),"=d"(d) : "c"(ctr) : );
   return a | (uint64_t(d) << 32);

#else  // #elif defined (_WIN32)                      // other compiler. try inline assembly with masm/intel/MS syntax
   // the instruction is emitted as raw bytes 0f 01 d0 in the block below
   uint32_t a, d;
    __asm {
        mov ecx, ctr
        _emit 0x0f
        _emit 0x01
        _emit 0xd0 ; // xgetbv
        mov a, eax
        mov d, edx
    }
   return a | (uint64_t(d) << 32);

#endif
}
 48 | 
 49 | /* find supported instruction set
 50 |     return value:
 51 |     0           = 80386 instruction set
 52 |     1  or above = SSE (XMM) supported by CPU (not testing for OS support)
 53 |     2  or above = SSE2
 54 |     3  or above = SSE3
 55 |     4  or above = Supplementary SSE3 (SSSE3)
 56 |     5  or above = SSE4.1
 57 |     6  or above = SSE4.2
 58 |     7  or above = AVX supported by CPU and operating system
 59 |     8  or above = AVX2
 60 |     9  or above = AVX512F
 61 |    10  or above = AVX512VL, AVX512BW, AVX512DQ
 62 | */
 63 | int instrset_detect(void) {
 64 | 
 65 |     static int iset = -1;                                  // remember value for next call
 66 |     if (iset >= 0) {
 67 |         return iset;                                       // called before
 68 |     }
 69 |     iset = 0;                                              // default value
 70 |     int abcd[4] = {0,0,0,0};                               // cpuid results
 71 |     cpuid(abcd, 0);                                        // call cpuid function 0
 72 |     if (abcd[0] == 0) return iset;                         // no further cpuid function supported
 73 |     cpuid(abcd, 1);                                        // call cpuid function 1 for feature flags
 74 |     if ((abcd[3] & (1 <<  0)) == 0) return iset;           // no floating point
 75 |     if ((abcd[3] & (1 << 23)) == 0) return iset;           // no MMX
 76 |     if ((abcd[3] & (1 << 15)) == 0) return iset;           // no conditional move
 77 |     if ((abcd[3] & (1 << 24)) == 0) return iset;           // no FXSAVE
 78 |     if ((abcd[3] & (1 << 25)) == 0) return iset;           // no SSE
 79 |     iset = 1;                                              // 1: SSE supported
 80 |     if ((abcd[3] & (1 << 26)) == 0) return iset;           // no SSE2
 81 |     iset = 2;                                              // 2: SSE2 supported
 82 |     if ((abcd[2] & (1 <<  0)) == 0) return iset;           // no SSE3
 83 |     iset = 3;                                              // 3: SSE3 supported
 84 |     if ((abcd[2] & (1 <<  9)) == 0) return iset;           // no SSSE3
 85 |     iset = 4;                                              // 4: SSSE3 supported
 86 |     if ((abcd[2] & (1 << 19)) == 0) return iset;           // no SSE4.1
 87 |     iset = 5;                                              // 5: SSE4.1 supported
 88 |     if ((abcd[2] & (1 << 23)) == 0) return iset;           // no POPCNT
 89 |     if ((abcd[2] & (1 << 20)) == 0) return iset;           // no SSE4.2
 90 |     iset = 6;                                              // 6: SSE4.2 supported
 91 |     if ((abcd[2] & (1 << 27)) == 0) return iset;           // no OSXSAVE
 92 |     if ((xgetbv(0) & 6) != 6)       return iset;           // AVX not enabled in O.S.
 93 |     if ((abcd[2] & (1 << 28)) == 0) return iset;           // no AVX
 94 |     iset = 7;                                              // 7: AVX supported
 95 |     cpuid(abcd, 7);                                        // call cpuid leaf 7 for feature flags
 96 |     if ((abcd[1] & (1 <<  5)) == 0) return iset;           // no AVX2
 97 |     iset = 8;
 98 |     if ((abcd[1] & (1 << 16)) == 0) return iset;           // no AVX512
 99 |     cpuid(abcd, 0xD);                                      // call cpuid leaf 0xD for feature flags
100 |     if ((abcd[0] & 0x60) != 0x60)   return iset;           // no AVX512
101 |     iset = 9;
102 |     cpuid(abcd, 7);                                        // call cpuid leaf 7 for feature flags
103 |     if ((abcd[1] & (1 << 31)) == 0) return iset;           // no AVX512VL
104 |     if ((abcd[1] & 0x40020000) != 0x40020000) return iset; // no AVX512BW, AVX512DQ
105 |     iset = 10;
106 |     return iset;
107 | }
108 | 
109 | // detect if CPU supports the FMA3 instruction set
110 | bool hasFMA3(void) {
111 |     if (instrset_detect() < 7) return false;               // must have AVX
112 |     int abcd[4];                                           // cpuid results
113 |     cpuid(abcd, 1);                                        // call cpuid function 1
114 |     return ((abcd[2] & (1 << 12)) != 0);                   // ecx bit 12 indicates FMA3
115 | }
116 | 
117 | // detect if CPU supports the FMA4 instruction set
118 | bool hasFMA4(void) {
119 |     if (instrset_detect() < 7) return false;               // must have AVX
120 |     int abcd[4];                                           // cpuid results
121 |     cpuid(abcd, 0x80000001);                               // call cpuid function 0x80000001
122 |     return ((abcd[2] & (1 << 16)) != 0);                   // ecx bit 16 indicates FMA4
123 | }
124 | 
125 | // detect if CPU supports the XOP instruction set
126 | bool hasXOP(void) {
127 |     if (instrset_detect() < 7) return false;               // must have AVX
128 |     int abcd[4];                                           // cpuid results
129 |     cpuid(abcd, 0x80000001);                               // call cpuid function 0x80000001
130 |     return ((abcd[2] & (1 << 11)) != 0);                   // ecx bit 11 indicates XOP
131 | }
132 | 
133 | // detect if CPU supports the F16C instruction set
134 | bool hasF16C(void) {
135 |     if (instrset_detect() < 7) return false;               // must have AVX
136 |     int abcd[4];                                           // cpuid results
137 |     cpuid(abcd, 1);                                        // call cpuid function 1
138 |     return ((abcd[2] & (1 << 29)) != 0);                   // ecx bit 29 indicates F16C
139 | }
140 | 
141 | // detect if CPU supports the AVX512ER instruction set
142 | bool hasAVX512ER(void) {
143 |     if (instrset_detect() < 9) return false;               // must have AVX512F
144 |     int abcd[4];                                           // cpuid results
145 |     cpuid(abcd, 7);                                        // call cpuid function 7
146 |     return ((abcd[1] & (1 << 27)) != 0);                   // ebx bit 27 indicates AVX512ER
147 | }
148 | 
149 | // detect if CPU supports the AVX512VBMI instruction set
150 | bool hasAVX512VBMI(void) {
151 |     if (instrset_detect() < 10) return false;              // must have AVX512BW
152 |     int abcd[4];                                           // cpuid results
153 |     cpuid(abcd, 7);                                        // call cpuid function 7
154 |     return ((abcd[2] & (1 << 1)) != 0);                    // ecx bit 1 indicates AVX512VBMI
155 | }
156 | 
157 | // detect if CPU supports the AVX512VBMI2 instruction set
158 | bool hasAVX512VBMI2(void) {
159 |     if (instrset_detect() < 10) return false;              // must have AVX512BW
160 |     int abcd[4];                                           // cpuid results
161 |     cpuid(abcd, 7);                                        // call cpuid function 7
162 |     return ((abcd[2] & (1 << 6)) != 0);                    // ecx bit 6 indicates AVX512VBMI2
163 | }
164 | 
165 | #ifdef VCL_NAMESPACE
166 | }
167 | #endif
168 | 


--------------------------------------------------------------------------------
/CAS/VCL2/vector_convert.h:
--------------------------------------------------------------------------------
  1 | /**************************  vector_convert.h   *******************************
  2 | * Author:        Agner Fog
  3 | * Date created:  2014-07-23
  4 | * Last modified: 2019-11-17
  5 | * Version:       2.01.00
  6 | * Project:       vector class library
  7 | * Description:
  8 | * Header file for conversion between different vector classes with different
* sizes. Also includes various generic template functions.
 10 | *
 11 | * (c) Copyright 2012-2019 Agner Fog.
 12 | * Apache License version 2.0 or later.
 13 | *****************************************************************************/
 14 | 
 15 | #ifndef VECTOR_CONVERT_H
 16 | #define VECTOR_CONVERT_H
 17 | 
 18 | #ifndef VECTORCLASS_H
 19 | #include "vectorclass.h"
 20 | #endif
 21 | 
 22 | #if VECTORCLASS_H < 20100
 23 | #error Incompatible versions of vector class library mixed
 24 | #endif
 25 | 
 26 | #ifdef VCL_NAMESPACE
 27 | namespace VCL_NAMESPACE {
 28 | #endif
 29 | 
 30 | #if MAX_VECTOR_SIZE >= 256
 31 | 
 32 | /*****************************************************************************
 33 | *
 34 | *          Extend from 128 to 256 bit vectors
 35 | *
 36 | *****************************************************************************/
 37 | 
 38 | #if INSTRSET >= 8  // AVX2. 256 bit integer vectors
 39 | 
 40 | // sign extend
 41 | static inline Vec16s extend (Vec16c const a) {
 42 |     return _mm256_cvtepi8_epi16(a);
 43 | }
 44 | 
 45 | // zero extend
 46 | static inline Vec16us extend (Vec16uc const a) {
 47 |     return _mm256_cvtepu8_epi16(a);
 48 | }
 49 | 
 50 | // sign extend
 51 | static inline Vec8i extend (Vec8s const a) {
 52 |     return _mm256_cvtepi16_epi32(a);
 53 | }
 54 | 
 55 | // zero extend
 56 | static inline Vec8ui extend (Vec8us const a) {
 57 |     return _mm256_cvtepu16_epi32(a);
 58 | }
 59 | 
 60 | // sign extend
 61 | static inline Vec4q extend (Vec4i const a) {
 62 |     return _mm256_cvtepi32_epi64(a);
 63 | }
 64 | 
 65 | // zero extend
 66 | static inline Vec4uq extend (Vec4ui const a) {
 67 |     return _mm256_cvtepu32_epi64(a);
 68 | }
 69 | 
 70 | 
 71 | #else  // no AVX2. 256 bit integer vectors are emulated
 72 | 
 73 | // sign extend and zero extend functions:
 74 | static inline Vec16s extend (Vec16c const a) {
 75 |     return Vec16s(extend_low(a), extend_high(a));
 76 | }
 77 | 
 78 | static inline Vec16us extend (Vec16uc const a) {
 79 |     return Vec16us(extend_low(a), extend_high(a));
 80 | }
 81 | 
 82 | static inline Vec8i extend (Vec8s const a) {
 83 |     return Vec8i(extend_low(a), extend_high(a));
 84 | }
 85 | 
 86 | static inline Vec8ui extend (Vec8us const a) {
 87 |     return Vec8ui(extend_low(a), extend_high(a));
 88 | }
 89 | 
 90 | static inline Vec4q extend (Vec4i const a) {
 91 |     return Vec4q(extend_low(a), extend_high(a));
 92 | }
 93 | 
 94 | static inline Vec4uq extend (Vec4ui const a) {
 95 |     return Vec4uq(extend_low(a), extend_high(a));
 96 | }
 97 | 
 98 | #endif  // AVX2
 99 | 
100 | /*****************************************************************************
101 | *
102 | *          Conversions between float and double
103 | *
104 | *****************************************************************************/
105 | #if INSTRSET >= 7  // AVX. 256 bit float vectors
106 | 
107 | // float to double
108 | static inline Vec4d to_double (Vec4f const a) {
109 |     return _mm256_cvtps_pd(a);
110 | }
111 | 
112 | // double to float
113 | static inline Vec4f to_float (Vec4d const a) {
114 |     return _mm256_cvtpd_ps(a);
115 | }
116 | 
#else  // no AVX. 256 bit float vectors are emulated
118 | 
119 | // float to double
120 | static inline Vec4d to_double (Vec4f const a) {
121 |     Vec2d lo = _mm_cvtps_pd(a);
122 |     Vec2d hi = _mm_cvtps_pd(_mm_movehl_ps(a, a));
123 |     return Vec4d(lo,hi);
124 | }
125 | 
126 | // double to float
127 | static inline Vec4f to_float (Vec4d const a) {
128 |     Vec4f lo = _mm_cvtpd_ps(a.get_low());
129 |     Vec4f hi = _mm_cvtpd_ps(a.get_high());
130 |     return _mm_movelh_ps(lo, hi);
131 | }
132 | 
133 | #endif
134 | 
135 | /*****************************************************************************
136 | *
137 | *          Reduce from 256 to 128 bit vectors
138 | *
139 | *****************************************************************************/
140 | #if INSTRSET >= 10  // AVX512VL
141 | 
142 | // compress functions. overflow wraps around
143 | static inline Vec16c compress (Vec16s const a) {
144 |     return _mm256_cvtepi16_epi8(a);
145 | }
146 | 
147 | static inline Vec16uc compress (Vec16us const a) {
148 |     return _mm256_cvtepi16_epi8(a);
149 | }
150 | 
151 | static inline Vec8s compress (Vec8i const a) {
152 |     return _mm256_cvtepi32_epi16(a);
153 | }
154 | 
155 | static inline Vec8us compress (Vec8ui const a) {
156 |     return _mm256_cvtepi32_epi16(a);
157 | }
158 | 
159 | static inline Vec4i compress (Vec4q const a) {
160 |     return _mm256_cvtepi64_epi32(a);
161 | }
162 | 
163 | static inline Vec4ui compress (Vec4uq const a) {
164 |     return _mm256_cvtepi64_epi32(a);
165 | }
166 | 
167 | #else  // no AVX512
168 | 
169 | // compress functions. overflow wraps around
170 | static inline Vec16c compress (Vec16s const a) {
171 |     return compress(a.get_low(), a.get_high());
172 | }
173 | 
174 | static inline Vec16uc compress (Vec16us const a) {
175 |     return compress(a.get_low(), a.get_high());
176 | }
177 | 
178 | static inline Vec8s compress (Vec8i const a) {
179 |     return compress(a.get_low(), a.get_high());
180 | }
181 | 
182 | static inline Vec8us compress (Vec8ui const a) {
183 |     return compress(a.get_low(), a.get_high());
184 | }
185 | 
186 | static inline Vec4i compress (Vec4q const a) {
187 |     return compress(a.get_low(), a.get_high());
188 | }
189 | 
190 | static inline Vec4ui compress (Vec4uq const a) {
191 |     return compress(a.get_low(), a.get_high());
192 | }
193 | 
194 | #endif  // AVX512
195 | 
196 | #endif // MAX_VECTOR_SIZE >= 256
197 | 
198 | 
199 | #if MAX_VECTOR_SIZE >= 512
200 | 
201 | /*****************************************************************************
202 | *
203 | *          Extend from 256 to 512 bit vectors
204 | *
205 | *****************************************************************************/
206 | 
207 | #if INSTRSET >= 9  // AVX512. 512 bit integer vectors
208 | 
209 | // sign extend
210 | static inline Vec32s extend (Vec32c const a) {
211 | #if INSTRSET >= 10
212 |     return _mm512_cvtepi8_epi16(a);
213 | #else
214 |     return Vec32s(extend_low(a), extend_high(a));
215 | #endif
216 | }
217 | 
218 | // zero extend
219 | static inline Vec32us extend (Vec32uc const a) {
220 | #if INSTRSET >= 10
221 |     return _mm512_cvtepu8_epi16(a);
222 | #else
223 |     return Vec32us(extend_low(a), extend_high(a));
224 | #endif
225 | }
226 | 
227 | // sign extend
228 | static inline Vec16i extend (Vec16s const a) {
229 |     return _mm512_cvtepi16_epi32(a);
230 | }
231 | 
232 | // zero extend
233 | static inline Vec16ui extend (Vec16us const a) {
234 |     return _mm512_cvtepu16_epi32(a);
235 | }
236 | 
237 | // sign extend
238 | static inline Vec8q extend (Vec8i const a) {
239 |     return _mm512_cvtepi32_epi64(a);
240 | }
241 | 
242 | // zero extend
243 | static inline Vec8uq extend (Vec8ui const a) {
244 |     return _mm512_cvtepu32_epi64(a);
245 | }
246 | 
247 | #else  // no AVX512. 512 bit vectors are emulated
248 | 
249 | 
250 | 
251 | // sign extend
252 | static inline Vec32s extend (Vec32c const a) {
253 |     return Vec32s(extend_low(a), extend_high(a));
254 | }
255 | 
256 | // zero extend
257 | static inline Vec32us extend (Vec32uc const a) {
258 |     return Vec32us(extend_low(a), extend_high(a));
259 | }
260 | 
261 | // sign extend
262 | static inline Vec16i extend (Vec16s const a) {
263 |     return Vec16i(extend_low(a), extend_high(a));
264 | }
265 | 
266 | // zero extend
267 | static inline Vec16ui extend (Vec16us const a) {
268 |     return Vec16ui(extend_low(a), extend_high(a));
269 | }
270 | 
271 | // sign extend
272 | static inline Vec8q extend (Vec8i const a) {
273 |     return Vec8q(extend_low(a), extend_high(a));
274 | }
275 | 
276 | // zero extend
277 | static inline Vec8uq extend (Vec8ui const a) {
278 |     return Vec8uq(extend_low(a), extend_high(a));
279 | }
280 | 
281 | #endif  // AVX512
282 | 
283 | 
284 | /*****************************************************************************
285 | *
286 | *          Reduce from 512 to 256 bit vectors
287 | *
288 | *****************************************************************************/
289 | #if INSTRSET >= 9  // AVX512F
290 | 
291 | // compress functions. overflow wraps around
292 | static inline Vec32c compress (Vec32s const a) {
293 | #if INSTRSET >= 10  // AVVX512BW
294 |     return _mm512_cvtepi16_epi8(a);
295 | #else
296 |     return compress(a.get_low(), a.get_high());
297 | #endif
298 | }
299 | 
300 | static inline Vec32uc compress (Vec32us const a) {
301 |     return Vec32uc(compress(Vec32s(a)));
302 | }
303 | 
304 | static inline Vec16s compress (Vec16i const a) {
305 |     return _mm512_cvtepi32_epi16(a);
306 | }
307 | 
308 | static inline Vec16us compress (Vec16ui const a) {
309 |     return _mm512_cvtepi32_epi16(a);
310 | }
311 | 
312 | static inline Vec8i compress (Vec8q const a) {
313 |     return _mm512_cvtepi64_epi32(a);
314 | }
315 | 
316 | static inline Vec8ui compress (Vec8uq const a) {
317 |     return _mm512_cvtepi64_epi32(a);
318 | }
319 | 
320 | #else  // no AVX512
321 | 
322 | // compress functions. overflow wraps around
323 | static inline Vec32c compress (Vec32s const a) {
324 |     return compress(a.get_low(), a.get_high());
325 | }
326 | 
327 | static inline Vec32uc compress (Vec32us const a) {
328 |     return compress(a.get_low(), a.get_high());
329 | }
330 | 
331 | static inline Vec16s compress (Vec16i const a) {
332 |     return compress(a.get_low(), a.get_high());
333 | }
334 | 
335 | static inline Vec16us compress (Vec16ui const a) {
336 |     return compress(a.get_low(), a.get_high());
337 | }
338 | 
339 | static inline Vec8i compress (Vec8q const a) {
340 |     return compress(a.get_low(), a.get_high());
341 | }
342 | 
343 | static inline Vec8ui compress (Vec8uq const a) {
344 |     return compress(a.get_low(), a.get_high());
345 | }
346 | 
347 | #endif  // AVX512
348 | 
349 | /*****************************************************************************
350 | *
351 | *          Conversions between float and double
352 | *
353 | *****************************************************************************/
354 | 
355 | #if INSTRSET >= 9  // AVX512. 512 bit float vectors
356 | 
357 | // float to double
358 | static inline Vec8d to_double (Vec8f const a) {
359 |     return _mm512_cvtps_pd(a);
360 | }
361 | 
362 | // double to float
363 | static inline Vec8f to_float (Vec8d const a) {
364 |     return _mm512_cvtpd_ps(a);
365 | }
366 | 
367 | #else  // no AVX512. 512 bit float vectors are emulated
368 | 
369 | // float to double
370 | static inline Vec8d to_double (Vec8f const a) {
371 |     Vec4d lo = to_double(a.get_low());
372 |     Vec4d hi = to_double(a.get_high());
373 |     return Vec8d(lo,hi);
374 | }
375 | 
376 | // double to float
377 | static inline Vec8f to_float (Vec8d const a) {
378 |     Vec4f lo = to_float(a.get_low());
379 |     Vec4f hi = to_float(a.get_high());
380 |     return Vec8f(lo, hi);
381 | }
382 | 
383 | #endif
384 | 
385 | #endif // MAX_VECTOR_SIZE >= 512
386 | 
387 | // double to float
388 | static inline Vec4f to_float (Vec2d const a) {
389 |     return _mm_cvtpd_ps(a);
390 | }
391 | 
392 | 
393 | /*****************************************************************************
394 | *
395 | *          Generic template functions
396 | *
397 | *  These templates define functions for multiple vector types in one template
398 | *
399 | *****************************************************************************/
400 | 
// horizontal min/max of vector elements
// implemented with universal template, works for all vector types:

// horizontal_min: smallest element of x.
// For float and double vectors, the first NAN element is returned if there is
// one, because min does not guarantee NAN propagation
template <typename T> auto horizontal_min(T const x) {
    constexpr bool is_floating = (T::elementtype() & 16) != 0;  // T is a float or double vector
    if constexpr (is_floating) {
        auto const nan_mask = is_nan(x);
        if (horizontal_or(nan_mask)) {
            return x[horizontal_find_first(nan_mask)];
        }
    }
    return horizontal_min1(x);
}
414 | 
415 | template <typename T> auto horizontal_min1(T const x) {
416 |     if constexpr (T::elementtype() <= 3) {       // boolean vector type
417 |         return horizontal_and(x);
418 |     }
419 |     else if constexpr (sizeof(T) >= 32) {
420 |         // split recursively into smaller vectors
421 |         return horizontal_min1(min(x.get_low(), x.get_high()));
422 |     }
423 |     else if constexpr (T::size() == 2) {
424 |         T a = permute2 <1, V_DC>(x);             // high half
425 |         T b = min(a, x);
426 |         return b[0];
427 |     }
428 |     else if constexpr (T::size() == 4) {
429 |         T a = permute4<2, 3, V_DC, V_DC>(x);     // high half
430 |         T b = min(a, x);
431 |         a = permute4<1, V_DC, V_DC, V_DC>(b);
432 |         b = min(a, b);
433 |         return b[0];
434 |     }
435 |     else if constexpr (T::size() == 8) {
436 |         T a = permute8<4, 5, 6, 7, V_DC, V_DC, V_DC, V_DC>(x);  // high half
437 |         T b = min(a, x);
438 |         a = permute8<2, 3, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC>(b);
439 |         b = min(a, b);
440 |         a = permute8<1, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC>(b);
441 |         b = min(a, b);
442 |         return b[0];
443 |     }
444 |     else {
445 |         static_assert(T::size() == 16);          // no other size is allowed
446 |         T a = permute16<8, 9, 10, 11, 12, 13, 14, 15, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC >(x);  // high half
447 |         T b = min(a, x);
448 |         a = permute16<4, 5, 6, 7, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC>(b);
449 |         b = min(a, b);
450 |         a = permute16<2, 3, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC>(b);
451 |         b = min(a, b);
452 |         a = permute16<1, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC>(b);
453 |         b = min(a, b);
454 |         return b[0];
455 |     }
456 | }
457 | 
// horizontal_max: biggest element of x.
// For float and double vectors, the first NAN element is returned if there is
// one, because max does not guarantee NAN propagation
template <typename T> auto horizontal_max(T const x) {
    constexpr bool is_floating = (T::elementtype() & 16) != 0;  // T is a float or double vector
    if constexpr (is_floating) {
        auto const nan_mask = is_nan(x);
        if (horizontal_or(nan_mask)) {
            return x[horizontal_find_first(nan_mask)];
        }
    }
    return horizontal_max1(x);
}
468 | 
469 | template <typename T> auto horizontal_max1(T const x) {
470 |     if constexpr (T::elementtype() <= 3) {       // boolean vector type: reduce with OR
471 |         return horizontal_or(x);
472 |     }
473 |     else if constexpr (sizeof(T) >= 32) {
474 |         // wide vector: take the max of the two halves, then reduce the half-size result
475 |         return horizontal_max1(max(x.get_low(), x.get_high()));
476 |     }
477 |     else if constexpr (T::size() == 2) {
478 |         T shifted = permute2 <1, V_DC>(x);       // element 1 moved down to position 0
479 |         T acc = max(shifted, x);
480 |         return acc[0];
481 |     }
482 |     else if constexpr (T::size() == 4) {
483 |         T shifted = permute4<2, 3, V_DC, V_DC>(x);  // upper half moved down
484 |         T acc = max(shifted, x);
485 |         shifted = permute4<1, V_DC, V_DC, V_DC>(acc);
486 |         acc = max(shifted, acc);
487 |         return acc[0];
488 |     }
489 |     else if constexpr (T::size() == 8) {
490 |         T shifted = permute8<4, 5, 6, 7, V_DC, V_DC, V_DC, V_DC>(x);  // upper half moved down
491 |         T acc = max(shifted, x);
492 |         shifted = permute8<2, 3, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC>(acc);
493 |         acc = max(shifted, acc);
494 |         shifted = permute8<1, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC>(acc);
495 |         acc = max(shifted, acc);
496 |         return acc[0];
497 |     }
498 |     else {
499 |         static_assert(T::size() == 16);          // no other size is allowed
500 |         T shifted = permute16<8, 9, 10, 11, 12, 13, 14, 15, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC >(x);  // upper half moved down
501 |         T acc = max(shifted, x);
502 |         shifted = permute16<4, 5, 6, 7, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC>(acc);
503 |         acc = max(shifted, acc);
504 |         shifted = permute16<2, 3, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC>(acc);
505 |         acc = max(shifted, acc);
506 |         shifted = permute16<1, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC>(acc);
507 |         acc = max(shifted, acc);
508 |         return acc[0];                           // position 0 now holds the overall maximum
509 |     }
510 | }
511 | 
512 | // Index of the first true element in a boolean vector, or -1 if no element is true
513 | template <typename V>
514 | static inline int horizontal_find_first(V const x) {
515 |     static_assert(V::elementtype() == 2 || V::elementtype() == 3, "Boolean vector expected");
516 |     auto const mask = to_bits(x);                // one bit per element
517 |     if (mask == 0) return -1;                    // nothing is set
518 |     if constexpr (V::size() >= 32) {
519 |         return bit_scan_forward(mask);           // use the mask at its own width
520 |     }
521 |     else {
522 |         return bit_scan_forward((uint32_t)mask); // fewer than 32 elements: scan as 32 bits
523 |     }
524 | }
525 | 
526 | // Number of true elements in a boolean vector
527 | template <typename V>
528 | static inline int horizontal_count(V const x) {
529 |     static_assert(V::elementtype() == 2 || V::elementtype() == 3, "Boolean vector expected");
530 |     auto const mask = to_bits(x);                // one bit per element
531 |     if constexpr (V::size() >= 32) {
532 |         return (int)vml_popcnt(mask);            // count at the mask's own width
533 |     }
534 |     else {
535 |         return vml_popcnt((uint32_t)mask);       // fewer than 32 elements: count as 32 bits
536 |     }
537 | }
538 | 
539 | // Element-wise maximum that is sure to propagate NANs, conforming to the
540 | // IEEE-754 2019 standard. Integer vectors are passed straight to max
541 | template <typename V>
542 | static inline V maximum(V const a, V const b) {
543 |     if constexpr (V::elementtype() >= 16) {          // float or double vector
544 |         V result = select(is_nan(a), a, max(a, b));  // keep a wherever a is NAN
545 | #ifdef SIGNED_ZERO                                   // pedantic about signed zero
546 |         result = select(a == b, a & b, result);      // maximum(+0, -0) = +0
547 | #endif
548 |         return result;
549 |     }
550 |     else {
551 |         return max(a, b);                            // integer type
552 |     }
553 | }
554 | 
555 | template <typename V>
556 | static inline V minimum(V const a, V const b) {      // NAN-propagating counterpart of min
557 |     if constexpr (V::elementtype() >= 16) {          // float or double vector
558 |         V result = select(is_nan(a), a, min(a, b));  // keep a wherever a is NAN
559 | #ifdef SIGNED_ZERO                                   // pedantic about signed zero
560 |         result = select(a == b, a | b, result);      // minimum(+0, -0) = -0
561 | #endif
562 |         return result;
563 |     }
564 |     else {
565 |         return min(a, b);                            // integer type
566 |     }
567 | }
568 | 
569 | 
570 | #ifdef VCL_NAMESPACE
571 | }
572 | #endif
573 | 
574 | #endif // VECTOR_CONVERT_H
575 | 


--------------------------------------------------------------------------------
/CAS/VCL2/vectorclass.h:
--------------------------------------------------------------------------------
 1 | /****************************  vectorclass.h   ********************************
 2 | * Author:        Agner Fog
 3 | * Date created:  2012-05-30
 4 | * Last modified: 2020-04-11
 5 | * Version:       2.01.02
 6 | * Project:       vector class library
 7 | * Home:          https://github.com/vectorclass
 8 | * Description:
 9 | * Header file defining vector classes as interface to intrinsic functions
10 | * in x86 and x86-64 microprocessors with SSE2 and later instruction sets.
11 | *
12 | * Instructions:
13 | * Use Gnu, Clang, Intel or Microsoft C++ compiler. Compile for the desired
14 | * instruction set, which must be at least SSE2. Specify the supported
15 | * instruction set by a command line define, e.g. __SSE4_1__ if the
16 | * compiler does not automatically do so.
17 | * For detailed instructions, see vcl_manual.pdf
18 | *
19 | * Each vector object is represented internally in the CPU as a vector
20 | * register with 128, 256 or 512 bits.
21 | *
22 | * This header file includes the appropriate header files depending on the
23 | * selected instruction set.
24 | *
25 | * (c) Copyright 2012-2020 Agner Fog.
26 | * Apache License version 2.0 or later.
27 | ******************************************************************************/
28 | #ifndef VECTORCLASS_H
29 | #define VECTORCLASS_H  20102
30 | 
31 | // Maximum vector size, bits. Allowed values are 128, 256, 512
32 | #ifndef MAX_VECTOR_SIZE
33 | #define MAX_VECTOR_SIZE 512
34 | #endif
35 | 
36 | // Determine instruction set, and define platform-dependent functions
37 | #include "instrset.h"        // Select supported instruction set
38 | 
39 | #if INSTRSET < 2             // instruction set SSE2 is the minimum
40 | #error Please compile for the SSE2 instruction set or higher
41 | #else
42 | 
43 | // Select appropriate .h files depending on instruction set
44 | #include "vectori128.h"      // 128-bit integer vectors
45 | #include "vectorf128.h"      // 128-bit floating point vectors
46 | 
47 | #if MAX_VECTOR_SIZE >= 256
48 | #if INSTRSET >= 8
49 | #include "vectori256.h"      // 256-bit integer vectors, requires AVX2 instruction set
50 | #else
51 | #include "vectori256e.h"     // 256-bit integer vectors, emulated
52 | #endif  // INSTRSET >= 8
53 | #if INSTRSET >= 7
54 | #include "vectorf256.h"      // 256-bit floating point vectors, requires AVX instruction set
55 | #else
56 | #include "vectorf256e.h"     // 256-bit floating point vectors, emulated
57 | #endif  //  INSTRSET >= 7
58 | #endif  //  MAX_VECTOR_SIZE >= 256
59 | 
60 | #if MAX_VECTOR_SIZE >= 512
61 | #if INSTRSET >= 9
62 | #include "vectori512.h"      // 512-bit vectors of 32 and 64 bit integers, requires AVX512F instruction set
63 | #include "vectorf512.h"      // 512-bit floating point vectors, requires AVX512F instruction set
64 | #else
65 | #include "vectori512e.h"     // 512-bit integer vectors, emulated
66 | #include "vectorf512e.h"     // 512-bit floating point vectors, emulated
67 | #endif  //  INSTRSET >= 9
68 | #if INSTRSET >= 10
69 | #include "vectori512s.h"     // 512-bit vectors of 8 and 16 bit integers, requires AVX512BW instruction set
70 | #else
71 | #include "vectori512se.h"    // 512-bit vectors of 8 and 16 bit integers, emulated
72 | #endif
73 | #endif  //  MAX_VECTOR_SIZE >= 512
74 | 
75 | #include "vector_convert.h"  // conversion between different vector sizes
76 | 
77 | #endif  // INSTRSET >= 2
78 | 
79 | 
80 | #else   // VECTORCLASS_H
81 | 
82 | #if VECTORCLASS_H < 20000
83 | #error Mixed versions of vector class library
84 | #endif
85 | 
86 | #endif  // VECTORCLASS_H
87 | 


--------------------------------------------------------------------------------
/CAS/VCL2/vectormath_common.h:
--------------------------------------------------------------------------------
  1 | /***************************  vectormath_common.h   ****************************
  2 | * Author:        Agner Fog
  3 | * Date created:  2014-04-18
  4 | * Last modified: 2020-06-08
  5 | * Version:       2.01.03
  6 | * Project:       vector classes
  7 | * Description:
  8 | * Header file containing common code for inline version of mathematical functions.
  9 | *
 10 | * For detailed instructions, see vcl_manual.pdf
 11 | *
 12 | * (c) Copyright 2014-2020 Agner Fog.
 13 | * Apache License version 2.0 or later.
 14 | ******************************************************************************/
 15 | 
 16 | #ifndef VECTORMATH_COMMON_H
 17 | #define VECTORMATH_COMMON_H  2
 18 | 
 19 | #ifdef VECTORMATH_LIB_H
 20 | #error conflicting header files. More than one implementation of mathematical functions included
 21 | #endif
 22 | 
 23 | #include <cmath>
 24 | 
 25 | #ifndef VECTORCLASS_H
 26 | #include "vectorclass.h"
 27 | #endif
 28 | 
 29 | #if VECTORCLASS_H < 20000
 30 | #error Incompatible versions of vector class library mixed
 31 | #endif
 32 | 
 33 | 
 34 | /******************************************************************************
 35 |                     Define NAN payload values
 36 | ******************************************************************************/
 37 | #define NAN_LOG 0x101  // logarithm for x<0
 38 | #define NAN_POW 0x102  // negative number raised to non-integer power
 39 | #define NAN_HYP 0x104  // acosh for x<1 and atanh for abs(x)>1
 40 | 
 41 | 
 42 | /******************************************************************************
 43 |                     Define mathematical constants
 44 | ******************************************************************************/
 45 | #define VM_PI       3.14159265358979323846           // pi
 46 | #define VM_PI_2     1.57079632679489661923           // pi / 2
 47 | #define VM_PI_4     0.785398163397448309616          // pi / 4
 48 | #define VM_SQRT2    1.41421356237309504880           // sqrt(2)
 49 | #define VM_LOG2E    1.44269504088896340736           // 1/log(2)
 50 | #define VM_LOG10E   0.434294481903251827651          // 1/log(10)
 51 | #define VM_LOG210   3.321928094887362347808          // log2(10)
 52 | #define VM_LN2      0.693147180559945309417          // log(2)
 53 | #define VM_LN10     2.30258509299404568402           // log(10)
 54 | #define VM_SMALLEST_NORMAL  2.2250738585072014E-308  // smallest normal number, double
 55 | #define VM_SMALLEST_NORMALF 1.17549435E-38f          // smallest normal number, float
 56 | 
 57 | 
 58 | #ifdef VCL_NAMESPACE
 59 | namespace VCL_NAMESPACE {
 60 | #endif
 61 | 
 62 | /******************************************************************************
 63 |       templates for producing infinite and nan in desired vector type
 64 | ******************************************************************************/
 65 | template <class VTYPE>
 66 | static inline VTYPE infinite_vec();              // primary template: declared only, specialized per vector type below
 67 | 
 68 | template <>
 69 | inline Vec2d infinite_vec<Vec2d>() {             // 2 x double
 70 |     return infinite2d();
 71 | }
 72 | 
 73 | template <>
 74 | inline Vec4f infinite_vec<Vec4f>() {             // 4 x float
 75 |     return infinite4f();
 76 | }
 77 | 
 78 | #if MAX_VECTOR_SIZE >= 256
 79 | 
 80 | template <>
 81 | inline Vec4d infinite_vec<Vec4d>() {             // 4 x double
 82 |     return infinite4d();
 83 | }
 84 | 
 85 | template <>
 86 | inline Vec8f infinite_vec<Vec8f>() {             // 8 x float
 87 |     return infinite8f();
 88 | }
 89 | 
 90 | #endif // MAX_VECTOR_SIZE >= 256
 91 | 
 92 | #if MAX_VECTOR_SIZE >= 512
 93 | 
 94 | template <>
 95 | inline Vec8d infinite_vec<Vec8d>() {             // 8 x double
 96 |     return infinite8d();
 97 | }
 98 | 
 99 | template <>
100 | inline Vec16f infinite_vec<Vec16f>() {           // 16 x float
101 |     return infinite16f();
102 | }
103 | 
104 | #endif // MAX_VECTOR_SIZE >= 512
105 | 
106 | 
107 | 
108 | /******************************************************************************
109 | *                 Detect NAN codes
110 | *
111 | * These functions return the code hidden in a NAN. The sign bit is ignored
112 | ******************************************************************************/
113 | 
114 | static inline Vec4ui nan_code(Vec4f const x) {
115 |     Vec4ui raw = Vec4ui(reinterpret_i(x));       // bit pattern of each element
116 |     Vec4ui const payload_mask = 0x007FFFFF;      // low 23 bits, i.e. everything below the exponent
117 |     return select(Vec4ib(is_nan(x)), raw & payload_mask, 0);  // 0 for elements that are not NAN
118 | }
119 | 
120 | // Double precision: << 12 discards sign and exponent, >> (12+29) keeps the upper 23 fraction bits
121 | static inline Vec2uq nan_code(Vec2d const x) {
122 |     Vec2uq raw = Vec2uq(reinterpret_i(x));       // bit pattern of each element
123 |     return select(Vec2qb(is_nan(x)), raw << 12 >> (12+29), 0);  // 0 for elements that are not NAN
124 | }
125 | 
126 | #if MAX_VECTOR_SIZE >= 256
127 | 
128 | // Single precision, 8 elements: returns the code hidden in a NAN, sign bit ignored
129 | static inline Vec8ui nan_code(Vec8f const x) {
130 |     Vec8ui raw = Vec8ui(reinterpret_i(x));       // bit pattern of each element
131 |     Vec8ui const payload_mask = 0x007FFFFF;      // low 23 bits, i.e. everything below the exponent
132 |     return select(Vec8ib(is_nan(x)), raw & payload_mask, 0);  // 0 for elements that are not NAN
133 | }
134 | 
135 | // Double precision, 4 elements: << 12 discards sign and exponent, >> (12+29) keeps the upper 23 fraction bits
136 | static inline Vec4uq nan_code(Vec4d const x) {
137 |     Vec4uq raw = Vec4uq(reinterpret_i(x));       // bit pattern of each element
138 |     return select(Vec4qb(is_nan(x)), raw << 12 >> (12+29), 0);  // 0 for elements that are not NAN
139 | }
140 | 
141 | #endif // MAX_VECTOR_SIZE >= 256
142 | #if MAX_VECTOR_SIZE >= 512
143 | 
144 | // Single precision, 16 elements: returns the code hidden in a NAN, sign bit ignored
145 | static inline Vec16ui nan_code(Vec16f const x) {
146 |     Vec16ui raw = Vec16ui(reinterpret_i(x));     // bit pattern of each element
147 |     Vec16ui const payload_mask = 0x007FFFFF;     // low 23 bits, i.e. everything below the exponent
148 |     return select(Vec16ib(is_nan(x)), raw & payload_mask, 0);  // 0 for elements that are not NAN
149 | }
150 | 
151 | // Double precision, 8 elements: << 12 discards sign and exponent, >> (12+29) keeps the upper 23 fraction bits
152 | static inline Vec8uq nan_code(Vec8d const x) {
153 |     Vec8uq raw = Vec8uq(reinterpret_i(x));       // bit pattern of each element
154 |     return select(Vec8qb(is_nan(x)), raw << 12 >> (12+29), 0);  // 0 for elements that are not NAN
155 | }
156 | 
157 | #endif // MAX_VECTOR_SIZE >= 512
158 | 
159 | 
160 | /******************************************************************************
161 |                   templates for polynomials
162 | Using Estrin's scheme to make shorter dependency chains and use FMA, starting
163 | longest dependency chains first.
164 | ******************************************************************************/
165 | 
166 | // Polynomial of degree 2, evaluated with nested mul_add calls
167 | template <class VTYPE, class CTYPE>
168 | static inline VTYPE polynomial_2(VTYPE const x, CTYPE c0, CTYPE c1, CTYPE c2) {
169 |     // calculates polynomial c2*x^2 + c1*x + c0 (coefficients are passed lowest order first)
170 |     // VTYPE may be a vector type, CTYPE is a scalar type
171 |     VTYPE x2 = x * x;                            // x^2
172 |     // grouping used below: x2 * c2 + (x * c1 + c0)
173 |     return mul_add(x2, c2, mul_add(x, c1, c0));
174 | }
175 | 
176 | template<class VTYPE, class CTYPE>
177 | static inline VTYPE polynomial_3(VTYPE const x, CTYPE c0, CTYPE c1, CTYPE c2, CTYPE c3) {
178 |     // calculates polynomial c3*x^3 + c2*x^2 + c1*x + c0 (coefficients are passed lowest order first)
179 |     // VTYPE may be a vector type, CTYPE is a scalar type
180 |     VTYPE x2 = x * x;                            // x^2
181 |     // grouping used below: (c2 + c3*x)*x2 + (c1*x + c0)
182 |     return mul_add(mul_add(c3, x, c2), x2, mul_add(c1, x, c0));
183 | }
184 | 
185 | template<class VTYPE, class CTYPE>
186 | static inline VTYPE polynomial_4(VTYPE const x, CTYPE c0, CTYPE c1, CTYPE c2, CTYPE c3, CTYPE c4) {
187 |     // calculates polynomial c4*x^4 + c3*x^3 + c2*x^2 + c1*x + c0 (coefficients are passed lowest order first)
188 |     // VTYPE may be a vector type, CTYPE is a scalar type
189 |     VTYPE x2 = x * x;                            // x^2
190 |     VTYPE x4 = x2 * x2;                          // x^4
191 |     // grouping used below: (c2+c3*x)*x2 + ((c0+c1*x) + c4*x4)
192 |     return mul_add(mul_add(c3, x, c2), x2, mul_add(c1, x, c0) + c4*x4);
193 | }
194 | 
195 | template<class VTYPE, class CTYPE>
196 | static inline VTYPE polynomial_4n(VTYPE const x, CTYPE c0, CTYPE c1, CTYPE c2, CTYPE c3) {
197 |     // calculates polynomial 1*x^4 + c3*x^3 + c2*x^2 + c1*x + c0 (leading coefficient is fixed at 1)
198 |     // VTYPE may be a vector type, CTYPE is a scalar type
199 |     VTYPE x2 = x * x;                            // x^2
200 |     VTYPE x4 = x2 * x2;                          // x^4
201 |     // grouping used below: (c2+c3*x)*x2 + ((c0+c1*x) + x4)
202 |     return mul_add(mul_add(c3, x, c2), x2, mul_add(c1, x, c0) + x4);
203 | }
204 | 
205 | template<class VTYPE, class CTYPE>
206 | static inline VTYPE polynomial_5(VTYPE const x, CTYPE c0, CTYPE c1, CTYPE c2, CTYPE c3, CTYPE c4, CTYPE c5) {
207 |     // calculates polynomial c5*x^5 + c4*x^4 + c3*x^3 + c2*x^2 + c1*x + c0 (coefficients are passed lowest order first)
208 |     // VTYPE may be a vector type, CTYPE is a scalar type
209 |     VTYPE x2 = x * x;                            // x^2
210 |     VTYPE x4 = x2 * x2;                          // x^4
211 |     // grouping used below: (c2+c3*x)*x2 + ((c4+c5*x)*x4 + (c0+c1*x))
212 |     return mul_add(mul_add(c3, x, c2), x2, mul_add(mul_add(c5, x, c4), x4, mul_add(c1, x, c0)));
213 | }
214 | 
215 | template<class VTYPE, class CTYPE>
216 | static inline VTYPE polynomial_5n(VTYPE const x, CTYPE c0, CTYPE c1, CTYPE c2, CTYPE c3, CTYPE c4) {
217 |     // calculates polynomial 1*x^5 + c4*x^4 + c3*x^3 + c2*x^2 + c1*x + c0 (leading coefficient is fixed at 1)
218 |     // VTYPE may be a vector type, CTYPE is a scalar type
219 |     VTYPE x2 = x * x;                            // x^2
220 |     VTYPE x4 = x2 * x2;                          // x^4
221 |     // grouping used below: (c2+c3*x)*x2 + ((c4+x)*x4 + (c0+c1*x))
222 |     return mul_add(mul_add(c3, x, c2), x2, mul_add(c4 + x, x4, mul_add(c1, x, c0)));
223 | }
224 | 
225 | template<class VTYPE, class CTYPE>
226 | static inline VTYPE polynomial_6(VTYPE const x, CTYPE c0, CTYPE c1, CTYPE c2, CTYPE c3, CTYPE c4, CTYPE c5, CTYPE c6) {
227 |     // calculates polynomial c6*x^6 + c5*x^5 + c4*x^4 + c3*x^3 + c2*x^2 + c1*x + c0 (coefficients are passed lowest order first)
228 |     // VTYPE may be a vector type, CTYPE is a scalar type
229 |     VTYPE x2 = x * x;                            // x^2
230 |     VTYPE x4 = x2 * x2;                          // x^4
231 |     // grouping used below: (c6*x2 + (c4+c5*x))*x4 + ((c2+c3*x)*x2 + (c0+c1*x))
232 |     return mul_add(mul_add(c6, x2, mul_add(c5, x, c4)), x4, mul_add(mul_add(c3, x, c2), x2, mul_add(c1, x, c0)));
233 | }
234 | 
235 | template<class VTYPE, class CTYPE>
236 | static inline VTYPE polynomial_6n(VTYPE const x, CTYPE c0, CTYPE c1, CTYPE c2, CTYPE c3, CTYPE c4, CTYPE c5) {
237 |     // calculates polynomial 1*x^6 + c5*x^5 + c4*x^4 + c3*x^3 + c2*x^2 + c1*x + c0 (leading coefficient is fixed at 1)
238 |     // VTYPE may be a vector type, CTYPE is a scalar type
239 |     VTYPE x2 = x * x;                            // x^2
240 |     VTYPE x4 = x2 * x2;                          // x^4
241 |     // grouping used below: (c5*x + (c4+x2))*x4 + ((c2+c3*x)*x2 + (c0+c1*x))
242 |     return mul_add(mul_add(c5, x, c4 + x2), x4, mul_add(mul_add(c3, x, c2), x2, mul_add(c1, x, c0)));
243 | }
244 | 
245 | template<class VTYPE, class CTYPE>
246 | static inline VTYPE polynomial_7(VTYPE const x, CTYPE c0, CTYPE c1, CTYPE c2, CTYPE c3, CTYPE c4, CTYPE c5, CTYPE c6, CTYPE c7) {
247 |     // calculates polynomial c7*x^7 + c6*x^6 + c5*x^5 + c4*x^4 + c3*x^3 + c2*x^2 + c1*x + c0 (coefficients are passed lowest order first)
248 |     // VTYPE may be a vector type, CTYPE is a scalar type
249 |     VTYPE x2 = x * x;                            // x^2
250 |     VTYPE x4 = x2 * x2;                          // x^4
251 |     // grouping used below: ((c6+c7*x)*x2 + (c4+c5*x))*x4 + ((c2+c3*x)*x2 + (c0+c1*x))
252 |     return mul_add(mul_add(mul_add(c7, x, c6), x2, mul_add(c5, x, c4)), x4, mul_add(mul_add(c3, x, c2), x2, mul_add(c1, x, c0)));
253 | }
254 | 
255 | template<class VTYPE, class CTYPE>
256 | static inline VTYPE polynomial_8(VTYPE const x, CTYPE c0, CTYPE c1, CTYPE c2, CTYPE c3, CTYPE c4, CTYPE c5, CTYPE c6, CTYPE c7, CTYPE c8) {
257 |     // calculates polynomial c8*x^8 + c7*x^7 + c6*x^6 + c5*x^5 + c4*x^4 + c3*x^3 + c2*x^2 + c1*x + c0
258 |     // VTYPE may be a vector type, CTYPE is a scalar type
259 |     VTYPE x2 = x  * x;                           // x^2
260 |     VTYPE x4 = x2 * x2;                          // x^4
261 |     VTYPE x8 = x4 * x4;                          // x^8
262 |     // grouping used below: ((c6+c7*x)*x2 + (c4+c5*x))*x4 + ((c2+c3*x)*x2 + ((c0+c1*x) + c8*x8))
263 |     return mul_add(mul_add(mul_add(c7, x, c6), x2, mul_add(c5, x, c4)), x4,
264 |         mul_add(mul_add(c3, x, c2), x2, mul_add(c1, x, c0) + c8*x8));
265 | }
266 | 
267 | template<class VTYPE, class CTYPE>
268 | static inline VTYPE polynomial_9(VTYPE const x, CTYPE c0, CTYPE c1, CTYPE c2, CTYPE c3, CTYPE c4, CTYPE c5, CTYPE c6, CTYPE c7, CTYPE c8, CTYPE c9) {
269 |     // calculates polynomial c9*x^9 + c8*x^8 + c7*x^7 + c6*x^6 + c5*x^5 + c4*x^4 + c3*x^3 + c2*x^2 + c1*x + c0
270 |     // VTYPE may be a vector type, CTYPE is a scalar type
271 |     VTYPE x2 = x  * x;                           // x^2
272 |     VTYPE x4 = x2 * x2;                          // x^4
273 |     VTYPE x8 = x4 * x4;                          // x^8
274 |     // grouping used below: (c8+c9*x)*x8 + (((c6+c7*x)*x2 + (c4+c5*x))*x4 + ((c2+c3*x)*x2 + (c0+c1*x)))
275 |     return mul_add(mul_add(c9, x, c8), x8, mul_add(
276 |         mul_add(mul_add(c7, x, c6), x2, mul_add(c5, x, c4)), x4,
277 |         mul_add(mul_add(c3, x, c2), x2, mul_add(c1, x, c0))));
278 | }
279 | 
280 | template<class VTYPE, class CTYPE>
281 | static inline VTYPE polynomial_10(VTYPE const x, CTYPE c0, CTYPE c1, CTYPE c2, CTYPE c3, CTYPE c4, CTYPE c5, CTYPE c6, CTYPE c7, CTYPE c8, CTYPE c9, CTYPE c10) {
282 |     // calculates polynomial c10*x^10 + c9*x^9 + c8*x^8 + c7*x^7 + c6*x^6 + c5*x^5 + c4*x^4 + c3*x^3 + c2*x^2 + c1*x + c0
283 |     // VTYPE may be a vector type, CTYPE is a scalar type
284 |     VTYPE x2 = x  * x;                           // x^2
285 |     VTYPE x4 = x2 * x2;                          // x^4
286 |     VTYPE x8 = x4 * x4;                          // x^8
287 |     // grouping used below: (c10*x2 + (c8+c9*x))*x8 + (((c6+c7*x)*x2 + (c4+c5*x))*x4 + ((c2+c3*x)*x2 + (c0+c1*x)))
288 |     return mul_add(mul_add(x2, c10, mul_add(c9, x, c8)), x8,
289 |         mul_add(mul_add(mul_add(c7, x, c6), x2, mul_add(c5, x, c4)), x4,
290 |             mul_add(mul_add(c3, x, c2), x2, mul_add(c1, x, c0))));
291 | }
292 | 
293 | template<class VTYPE, class CTYPE>
294 | static inline VTYPE polynomial_13(VTYPE const x, CTYPE c0, CTYPE c1, CTYPE c2, CTYPE c3, CTYPE c4, CTYPE c5, CTYPE c6, CTYPE c7, CTYPE c8, CTYPE c9, CTYPE c10, CTYPE c11, CTYPE c12, CTYPE c13) {
295 |     // calculates polynomial c13*x^13 + c12*x^12 + ... + c1*x + c0 (coefficients are passed lowest order first)
296 |     // VTYPE may be a vector type, CTYPE is a scalar type
297 |     VTYPE x2 = x  * x;                           // x^2
298 |     VTYPE x4 = x2 * x2;                          // x^4
299 |     VTYPE x8 = x4 * x4;                          // x^8
300 |     return mul_add(                              // (terms c8..c13)*x8 + (terms c0..c7)
301 |         mul_add(
302 |             mul_add(c13, x, c12), x4,                                       // (c12+c13*x)*x4
303 |             mul_add(mul_add(c11, x, c10), x2, mul_add(c9, x, c8))), x8,     // + ((c10+c11*x)*x2 + (c8+c9*x)), all times x8
304 |         mul_add(
305 |             mul_add(mul_add(c7, x, c6), x2, mul_add(c5, x, c4)), x4,        // ((c6+c7*x)*x2 + (c4+c5*x))*x4
306 |             mul_add(mul_add(c3, x, c2), x2, mul_add(c1, x, c0))));          // + ((c2+c3*x)*x2 + (c0+c1*x))
307 | }
308 | 
309 | 
310 | template<class VTYPE, class CTYPE>
311 | static inline VTYPE polynomial_13m(VTYPE const x, CTYPE c2, CTYPE c3, CTYPE c4, CTYPE c5, CTYPE c6, CTYPE c7, CTYPE c8, CTYPE c9, CTYPE c10, CTYPE c11, CTYPE c12, CTYPE c13) {
312 |     // calculates polynomial c13*x^13 + c12*x^12 + ... + c2*x^2 + x, i.e. c1 = 1 and c0 = 0 are implied and not passed
313 |     // VTYPE may be a vector type, CTYPE is a scalar type
314 |     VTYPE x2 = x  * x;                           // x^2
315 |     VTYPE x4 = x2 * x2;                          // x^4
316 |     VTYPE x8 = x4 * x4;                          // x^8
317 |     // grouping used below: ((c8+c9*x) + (c10+c11*x)*x2 + (c12+c13*x)*x4)*x8 + (((c6+c7*x)*x2 + (c4+c5*x))*x4 + ((c2+c3*x)*x2 + x))
318 |     return mul_add(
319 |         mul_add(mul_add(c13, x, c12), x4, mul_add(mul_add(c11, x, c10), x2, mul_add(c9, x, c8))), x8,
320 |         mul_add(mul_add(mul_add(c7, x, c6), x2, mul_add(c5, x, c4)), x4, mul_add(mul_add(c3, x, c2), x2, x)));
321 | }
322 | 
323 | #ifdef VCL_NAMESPACE
324 | }
325 | #endif
326 | 
327 | #endif
328 | 


--------------------------------------------------------------------------------
/CAS/VCL2/vectormath_hyp.h:
--------------------------------------------------------------------------------
  1 | /****************************  vectormath_hyp.h   ******************************
  2 | * Author:        Agner Fog
  3 | * Date created:  2014-07-09
  4 | * Last modified: 2019-08-01
  5 | * Version:       2.00.00
  6 | * Project:       vector class library
  7 | * Description:
  8 | * Header file containing inline vector functions of hyperbolic and inverse
  9 | * hyperbolic functions:
 10 | * sinh        hyperbolic sine
 11 | * cosh        hyperbolic cosine
 12 | * tanh        hyperbolic tangent
 13 | * asinh       inverse hyperbolic sine
 14 | * acosh       inverse hyperbolic cosine
 15 | * atanh       inverse hyperbolic tangent
 16 | *
 17 | * Theory, methods and inspiration based partially on these sources:
 18 | * > Moshier, Stephen Lloyd Baluk: Methods and programs for mathematical functions.
 19 | *   Ellis Horwood, 1989.
 20 | * > VDT library developed on CERN by Danilo Piparo, Thomas Hauth and
 21 | *   Vincenzo Innocente, 2012, https://svnweb.cern.ch/trac/vdt
 22 | * > Cephes math library by Stephen L. Moshier 1992,
 23 | *   http://www.netlib.org/cephes/
 24 | *
 25 | * For detailed instructions, see vectormath_common.h and vcl_manual.pdf
 26 | *
 27 | * (c) Copyright 2014-2019 Agner Fog.
 28 | * Apache License version 2.0 or later.
 29 | ******************************************************************************/
 30 | 
 31 | #ifndef VECTORMATH_HYP_H
 32 | #define VECTORMATH_HYP_H  1
 33 | 
 34 | #include "vectormath_exp.h"
 35 | 
 36 | #ifdef VCL_NAMESPACE
 37 | namespace VCL_NAMESPACE {
 38 | #endif
 39 | 
 40 | /******************************************************************************
 41 | *                 Hyperbolic functions
 42 | ******************************************************************************/
 43 | 
 44 | // Template for sinh function, double precision
 45 | // This function does not produce denormals
 46 | // Template parameters:
 47 | // VTYPE:  double vector type
 48 | template<typename VTYPE>
 49 | static inline VTYPE sinh_d(VTYPE const x0) {
 50 | // The limit of abs(x) is 709.7, as defined by max_x in vectormath_exp.h for 0.5*exp(x).
 51 | 
 52 |     // Coefficients of the Pade approximation: p0..p3 form the numerator, q0..q3 the denominator
 53 |     const double p0 = -3.51754964808151394800E5;
 54 |     const double p1 = -1.15614435765005216044E4;
 55 |     const double p2 = -1.63725857525983828727E2;
 56 |     const double p3 = -7.89474443963537015605E-1;
 57 | 
 58 |     const double q0 = -2.11052978884890840399E6;
 59 |     const double q1 =  3.61578279834431989373E4;
 60 |     const double q2 = -2.77711081420602794433E2;
 61 |     const double q3 =  1.0;
 62 | 
 63 |     // data vectors. y1 and y2 are only assigned inside the branches below
 64 |     VTYPE  x, x2, y1, y2;
 65 | 
 66 |     x = abs(x0);                                 // work on the magnitude, the sign is restored at the end
 67 |     auto x_small = x <= 1.0;                     // use Pade approximation if abs(x) <= 1
 68 | 
 69 |     if (horizontal_or(x_small)) {
 70 |         // At least one element needs small method
 71 |         x2 = x*x;
 72 |         y1 = polynomial_3(x2, p0, p1, p2, p3) / polynomial_3(x2, q0, q1, q2, q3);
 73 |         y1 = mul_add(y1, x*x2, x);               // y1 = x + x2*(x*y1);
 74 |     }
 75 |     if (!horizontal_and(x_small)) {
 76 |         // At least one element needs big method
 77 |         y2 =  exp_d<VTYPE, 0, 1>(x);             //   0.5 * exp(x)
 78 |         y2 -= 0.25 / y2;                         // - 0.5 * exp(-x), since 0.25 / (0.5*exp(x)) = 0.5*exp(-x)
 79 |     }
 80 |     y1 = select(x_small, y1, y2);                // choose method per element; a skipped branch has no elements selected
 81 |     y1 = sign_combine(y1, x0);                   // get original sign
 82 |     // you can avoid the sign_combine by replacing x by x0 above, but at a loss of precision
 83 | 
 84 |     return y1;
 85 | }
 86 | 
 87 | // sinh for double precision vectors: one overload per vector size, each forwarding to sinh_d
 88 | static inline Vec2d sinh(Vec2d const x) {
 89 |     return sinh_d<Vec2d>(x);
 90 | }
 91 | 
 92 | #if MAX_VECTOR_SIZE >= 256
 93 | static inline Vec4d sinh(Vec4d const x) {
 94 |     return sinh_d<Vec4d>(x);
 95 | }
 96 | #endif // MAX_VECTOR_SIZE >= 256
 97 | 
 98 | #if MAX_VECTOR_SIZE >= 512
 99 | static inline Vec8d sinh(Vec8d const x) {
100 |     return sinh_d<Vec8d>(x);
101 | }
102 | #endif // MAX_VECTOR_SIZE >= 512
103 | 
104 | 
105 | // Template for sinh function, single precision
106 | // This function does not produce denormals
107 | // Template parameters:
108 | // VTYPE:  float vector type
109 | template<typename VTYPE>
110 | static inline VTYPE sinh_f(VTYPE const x0) {
111 | // The limit of abs(x) is 89.0, as defined by max_x in vectormath_exp.h for 0.5*exp(x).
112 | 
113 |     // Coefficients of the polynomial used for small abs(x)
114 |     const float r0 = 1.66667160211E-1f;
115 |     const float r1 = 8.33028376239E-3f;
116 |     const float r2 = 2.03721912945E-4f;
117 | 
118 |     // data vectors. y1 and y2 are only assigned inside the branches below
119 |     VTYPE x, x2, y1, y2;
120 | 
121 |     x = abs(x0);                                 // work on the magnitude, the sign is restored at the end
122 |     auto x_small = x <= 1.0f;                    // use polynomial approximation if abs(x) <= 1
123 | 
124 |     if (horizontal_or(x_small)) {
125 |         // At least one element needs small method
126 |         x2 = x*x;
127 |         y1 = polynomial_2(x2, r0, r1, r2);
128 |         y1 = mul_add(y1, x2*x, x);               // y1 = x + x2*(x*y1);
129 |     }
130 |     if (!horizontal_and(x_small)) {
131 |         // At least one element needs big method
132 |         y2 =  exp_f<VTYPE, 0, 1>(x);             //   0.5 * exp(x)
133 |         y2 -= 0.25f / y2;                        // - 0.5 * exp(-x), since 0.25 / (0.5*exp(x)) = 0.5*exp(-x)
134 |     }
135 |     y1 = select(x_small, y1, y2);                // choose method per element; a skipped branch has no elements selected
136 |     y1 = sign_combine(y1, x0);                   // get original sign
137 |     // you can avoid the sign_combine by replacing x by x0 above, but at a loss of precision
138 | 
139 |     return y1;
140 | }
141 | 
142 | // sinh for single precision vectors: one overload per vector size, each forwarding to sinh_f
143 | static inline Vec4f sinh(Vec4f const x) {
144 |     return sinh_f<Vec4f>(x);
145 | }
146 | 
147 | #if MAX_VECTOR_SIZE >= 256
148 | static inline Vec8f sinh(Vec8f const x) {
149 |     return sinh_f<Vec8f>(x);
150 | }
151 | #endif // MAX_VECTOR_SIZE >= 256
152 | 
153 | #if MAX_VECTOR_SIZE >= 512
154 | static inline Vec16f sinh(Vec16f const x) {
155 |     return sinh_f<Vec16f>(x);
156 | }
157 | #endif // MAX_VECTOR_SIZE >= 512
158 | 
159 | 
160 | // Hyperbolic cosine, double precision, computed as 0.5*exp(abs(x)) + 0.5*exp(-abs(x))
161 | // This function does not produce denormals
162 | // Template parameters:
163 | // VTYPE:  double vector type
164 | template<typename VTYPE>
165 | static inline VTYPE cosh_d(VTYPE const x0) {
166 |     // The limit of abs(x) is 709.7, as defined by max_x in vectormath_exp.h for 0.5*exp(x).
167 | 
168 |     // the sign of the input is dropped before the exponential is taken
169 |     VTYPE const magnitude = abs(x0);
170 |     VTYPE half_exp = exp_d<VTYPE, 0, 1>(magnitude);  // 0.5 * exp(x)
171 |     // 0.25 / (0.5 * exp(x)) adds the 0.5 * exp(-x) term
172 |     half_exp += 0.25 / half_exp;
173 |     return half_exp;
174 | }
175 | 
176 | // cosh for double precision vectors: one overload per vector size, each forwarding to cosh_d
177 | static inline Vec2d cosh(Vec2d const x) {
178 |     return cosh_d<Vec2d>(x);
179 | }
180 | 
181 | #if MAX_VECTOR_SIZE >= 256
182 | static inline Vec4d cosh(Vec4d const x) {
183 |     return cosh_d<Vec4d>(x);
184 | }
185 | #endif // MAX_VECTOR_SIZE >= 256
186 | 
187 | #if MAX_VECTOR_SIZE >= 512
188 | static inline Vec8d cosh(Vec8d const x) {
189 |     return cosh_d<Vec8d>(x);
190 | }
191 | #endif // MAX_VECTOR_SIZE >= 512
192 | 
193 | 
194 | // Hyperbolic cosine, single precision, computed as 0.5*exp(abs(x)) + 0.5*exp(-abs(x))
195 | // This function does not produce denormals
196 | // Template parameters:
197 | // VTYPE:  float vector type
198 | template<typename VTYPE>
199 | static inline VTYPE cosh_f(VTYPE const x0) {
200 |     // The limit of abs(x) is 89.0, as defined by max_x in vectormath_exp.h for 0.5*exp(x).
201 | 
202 |     // the sign of the input is dropped before the exponential is taken
203 |     VTYPE const magnitude = abs(x0);
204 |     VTYPE half_exp = exp_f<VTYPE, 0, 1>(magnitude);  // 0.5 * exp(x)
205 |     // 0.25 / (0.5 * exp(x)) adds the 0.5 * exp(-x) term
206 |     half_exp += 0.25f / half_exp;
207 |     return half_exp;
208 | }
209 | 
210 | // cosh for single precision vectors: one overload per vector size, each forwarding to cosh_f
211 | static inline Vec4f cosh(Vec4f const x) {
212 |     return cosh_f<Vec4f>(x);
213 | }
214 | 
215 | #if MAX_VECTOR_SIZE >= 256
216 | static inline Vec8f cosh(Vec8f const x) {
217 |     return cosh_f<Vec8f>(x);
218 | }
219 | #endif // MAX_VECTOR_SIZE >= 256
220 | 
221 | #if MAX_VECTOR_SIZE >= 512
222 | static inline Vec16f cosh(Vec16f const x) {
223 |     return cosh_f<Vec16f>(x);
224 | }
225 | #endif // MAX_VECTOR_SIZE >= 512
226 | 
227 | 
228 | // Template for tanh function, double precision
229 | // This function does not produce denormals
230 | // Template parameters:
231 | // VTYPE:  double vector type
232 | template<typename VTYPE>
233 | static inline VTYPE tanh_d(VTYPE const x0) {
234 | 
235 |     // Coefficients of the Pade approximation: p0..p2 form the numerator, q0..q3 the denominator
236 |     const double p0 = -1.61468768441708447952E3;
237 |     const double p1 = -9.92877231001918586564E1;
238 |     const double p2 = -9.64399179425052238628E-1;
239 | 
240 |     const double q0 =  4.84406305325125486048E3;
241 |     const double q1 =  2.23548839060100448583E3;
242 |     const double q2 =  1.12811678491632931402E2;
243 |     const double q3 =  1.0;
244 | 
245 |     // data vectors. y1 and y2 are only assigned inside the branches below
246 |     VTYPE  x, x2, y1, y2;
247 | 
248 |     x = abs(x0);                                 // work on the magnitude, the sign is restored at the end
249 |     auto x_small = x <= 0.625;                   // use Pade approximation if abs(x) <= 5/8
250 | 
251 |     if (horizontal_or(x_small)) {
252 |         // At least one element needs small method
253 |         x2 = x*x;
254 |         y1 = polynomial_2(x2, p0, p1, p2) / polynomial_3(x2, q0, q1, q2, q3);
255 |         y1 = mul_add(y1, x2*x, x);               // y1 = x + x2*(x*y1);
256 |     }
257 |     if (!horizontal_and(x_small)) {
258 |         // At least one element needs big method
259 |         y2 = exp(x+x);                           // exp(2*x)
260 |         y2 = 1.0 - 2.0 / (y2 + 1.0);             // tanh(x) = 1 - 2 / (exp(2*x) + 1)
261 |     }
262 |     auto x_big = x > 350.;                       // elements whose result is forced to 1.0 below
263 |     y1 = select(x_small, y1, y2);                // choose method per element; a skipped branch has no elements selected
264 |     y1 = select(x_big,  1.0, y1);                // avoid overflow
265 |     y1 = sign_combine(y1, x0);                   // get original sign
266 |     return y1;
267 | }
268 | 
269 | // tanh overloads for double vectors; each one forwards to the tanh_d template
270 | static inline Vec2d tanh(Vec2d const x) {
271 |     return tanh_d<Vec2d>(x);
272 | }
273 | 
274 | #if MAX_VECTOR_SIZE >= 256
275 | static inline Vec4d tanh(Vec4d const x) {
276 |     return tanh_d<Vec4d>(x);
277 | }
278 | #endif // MAX_VECTOR_SIZE >= 256
279 | 
280 | #if MAX_VECTOR_SIZE >= 512
281 | static inline Vec8d tanh(Vec8d const x) {
282 |     return tanh_d<Vec8d>(x);
283 | }
284 | #endif // MAX_VECTOR_SIZE >= 512
285 | 
286 | 
287 | // Template for tanh function, single precision
288 | // This function does not produce denormals
289 | // Template parameters:
290 | // VTYPE:  float vector type
291 | template<typename VTYPE>
292 | static inline VTYPE tanh_f(VTYPE const x0) {
293 | // Result is forced to +/-1 when abs(x0) > 44.4. NOTE(review): presumably chosen so that 2*x stays below the limit of 89.0 defined by max_x in vectormath_exp.h - confirm.
294 | 
295 |     // Coefficients of the polynomial in x*x used by the small method
296 |     const float r0 = -3.33332819422E-1f;
297 |     const float r1 =  1.33314422036E-1f;
298 |     const float r2 = -5.37397155531E-2f;
299 |     const float r3 =  2.06390887954E-2f;
300 |     const float r4 = -5.70498872745E-3f;
301 | 
302 |     // data vectors: x = abs(x0), x2 = x*x, y1 = small-method result, y2 = big-method result
303 |     VTYPE x, x2, y1, y2;
304 | 
305 |     x = abs(x0);
306 |     auto x_small = x <= 0.625f;                  // use polynomial approximation if abs(x) <= 5/8
307 | 
308 |     if (horizontal_or(x_small)) {
309 |         // At least one element needs small method (branch is skipped when no element is small)
310 |         x2 = x*x;
311 |         y1 = polynomial_4(x2, r0, r1, r2, r3, r4);
312 |         y1 = mul_add(y1, x2*x, x);               // y1 = x + (x2*x)*y1;
313 |     }
314 |     if (!horizontal_and(x_small)) {
315 |         // At least one element needs big method (branch is skipped when every element is small)
316 |         y2 = exp(x+x);                           // exp(2*x)
317 |         y2 = 1.0f - 2.0f / (y2 + 1.0f);          // tanh(x) = 1 - 2 / (exp(2*x) + 1)
318 |     }
319 |     auto x_big = x > 44.4f;                      // elements above this limit get the result 1
320 |     y1 = select(x_small, y1, y2);                // choose method; a skipped branch is never selected from
321 |     y1 = select(x_big,  1.0f, y1);               // avoid overflow
322 |     y1 = sign_combine(y1, x0);                   // get original sign
323 |     return y1;
324 | }
325 | 
326 | // tanh overloads for float vectors; each one forwards to the tanh_f template
327 | static inline Vec4f tanh(Vec4f const x) {
328 |     return tanh_f<Vec4f>(x);
329 | }
330 | 
331 | #if MAX_VECTOR_SIZE >= 256
332 | static inline Vec8f tanh(Vec8f const x) {
333 |     return tanh_f<Vec8f>(x);
334 | }
335 | #endif // MAX_VECTOR_SIZE >= 256
336 | 
337 | #if MAX_VECTOR_SIZE >= 512
338 | static inline Vec16f tanh(Vec16f const x) {
339 |     return tanh_f<Vec16f>(x);
340 | }
341 | #endif // MAX_VECTOR_SIZE >= 512
342 | 
343 | 
344 | 
345 | /******************************************************************************
346 | *                 Inverse hyperbolic functions
347 | ******************************************************************************/
348 | 
349 | // Template for asinh function, double precision
350 | // This function does not produce denormals
351 | // Template parameter VTYPE: double vector type (instantiated below for Vec2d, Vec4d and Vec8d)
352 | // Parameter x0: input vector. Returns asinh(x0) per element, computed on abs(x0) and given the sign of x0
353 | template<typename VTYPE>
354 | static inline VTYPE asinh_d(VTYPE const x0) {
355 | 
356 |     // Coefficients of the rational approximation: p = numerator, q = denominator
357 |     const double p0 = -5.56682227230859640450E0;
358 |     const double p1 = -9.09030533308377316566E0;
359 |     const double p2 = -4.37390226194356683570E0;
360 |     const double p3 = -5.91750212056387121207E-1;
361 |     const double p4 = -4.33231683752342103572E-3;
362 | 
363 |     const double q0 =  3.34009336338516356383E1;
364 |     const double q1 =  6.95722521337257608734E1;
365 |     const double q2 =  4.86042483805291788324E1;
366 |     const double q3 =  1.28757002067426453537E1;
367 |     const double q4 =  1.0;
368 | 
369 |     // data vectors: x = abs(x0), x2 = x0*x0, y1 = small-method result, y2 = big-method result
370 |     VTYPE  x, x2, y1, y2;
371 | 
372 |     x2 = x0 * x0;
373 |     x  = abs(x0);
374 |     auto x_small = x <= 0.533;                   // use Pade approximation if abs(x) <= 0.533 (nominally 0.5)
375 |     // Both methods give the highest error close to 0.5.
376 |     // This limit is adjusted for minimum error
377 |     auto x_huge  = x > 1.E20;                    // simple approximation, avoid overflow
378 | 
379 |     if (horizontal_or(x_small)) {
380 |         // At least one element needs small method (branch is skipped when no element is small)
381 |         y1 = polynomial_4(x2, p0, p1, p2, p3, p4) / polynomial_4(x2, q0, q1, q2, q3, q4);
382 |         y1 = mul_add(y1, x2*x, x);               // y1 = x + (x2*x)*y1;
383 |     }
384 |     if (!horizontal_and(x_small)) {
385 |         // At least one element needs big method: asinh(x) = log(x + sqrt(x*x + 1))
386 |         y2 = log(x + sqrt(x2 + 1.0));
387 |         if (horizontal_or(x_huge)) {
388 |             // At least one element needs huge method to avoid overflow of x2: log(x) + VM_LN2 (VM_LN2 is presumably ln 2, i.e. log(2*x))
389 |             y2 = select(x_huge, log(x) + VM_LN2, y2);
390 |         }
391 |     }
392 |     y1 = select(x_small, y1, y2);                // choose method; a skipped branch is never selected from
393 |     y1 = sign_combine(y1, x0);                   // get original sign
394 |     return y1;
395 | }
396 | 
397 | // asinh overloads for double vectors; each one forwards to the asinh_d template
398 | static inline Vec2d asinh(Vec2d const x) {
399 |     return asinh_d<Vec2d>(x);
400 | }
401 | 
402 | #if MAX_VECTOR_SIZE >= 256
403 | static inline Vec4d asinh(Vec4d const x) {
404 |     return asinh_d<Vec4d>(x);
405 | }
406 | #endif // MAX_VECTOR_SIZE >= 256
407 | 
408 | #if MAX_VECTOR_SIZE >= 512
409 | static inline Vec8d asinh(Vec8d const x) {
410 |     return asinh_d<Vec8d>(x);
411 | }
412 | #endif // MAX_VECTOR_SIZE >= 512
413 | 
414 | 
415 | // Template for asinh function, single precision
416 | // This function does not produce denormals
417 | // Template parameters:
418 | // VTYPE:  float vector type. Returns asinh(x0) per element, computed on abs(x0) and given the sign of x0
419 | template<typename VTYPE>
420 | static inline VTYPE asinh_f(VTYPE const x0) {
421 | 
422 |     // Coefficients of the polynomial in x*x used by the small method
423 |     const float r0 = -1.6666288134E-1f;
424 |     const float r1 =  7.4847586088E-2f;
425 |     const float r2 = -4.2699340972E-2f;
426 |     const float r3 =  2.0122003309E-2f;
427 | 
428 |     // data vectors: x = abs(x0), x2 = x0*x0, y1 = small-method result, y2 = big-method result
429 |     VTYPE  x, x2, y1, y2;
430 | 
431 |     x2 = x0 * x0;
432 |     x  = abs(x0);
433 |     auto x_small = x <= 0.51f;                   // use polynomial approximation if abs(x) <= 0.51 (nominally 0.5)
434 |     auto x_huge  = x > 1.E10f;                   // simple approximation, avoid overflow
435 | 
436 |     if (horizontal_or(x_small)) {
437 |         // At least one element needs small method (branch is skipped when no element is small)
438 |         y1 = polynomial_3(x2, r0, r1, r2, r3);
439 |         y1 = mul_add(y1, x2*x, x);               // y1 = x + (x2*x)*y1;
440 |     }
441 |     if (!horizontal_and(x_small)) {
442 |         // At least one element needs big method: asinh(x) = log(x + sqrt(x*x + 1))
443 |         y2 = log(x + sqrt(x2 + 1.0f));
444 |         if (horizontal_or(x_huge)) {
445 |             // At least one element needs huge method to avoid overflow of x2: log(x) + VM_LN2 (VM_LN2 is presumably ln 2, i.e. log(2*x))
446 |             y2 = select(x_huge, log(x) + (float)VM_LN2, y2);
447 |         }
448 |     }
449 |     y1 = select(x_small, y1, y2);                // choose method; a skipped branch is never selected from
450 |     y1 = sign_combine(y1, x0);                   // get original sign
451 |     return y1;
452 | }
453 | 
454 | // asinh overloads for float vectors; each one forwards to the asinh_f template
455 | static inline Vec4f asinh(Vec4f const x) {
456 |     return asinh_f<Vec4f>(x);
457 | }
458 | 
459 | #if MAX_VECTOR_SIZE >= 256
460 | static inline Vec8f asinh(Vec8f const x) {
461 |     return asinh_f<Vec8f>(x);
462 | }
463 | #endif // MAX_VECTOR_SIZE >= 256
464 | 
465 | #if MAX_VECTOR_SIZE >= 512
466 | static inline Vec16f asinh(Vec16f const x) {
467 |     return asinh_f<Vec16f>(x);
468 | }
469 | #endif // MAX_VECTOR_SIZE >= 512
470 | 
471 | 
472 | // Template for acosh function, double precision
473 | // This function does not produce denormals
474 | // Template parameter VTYPE: double vector type (instantiated below for Vec2d, Vec4d and Vec8d)
475 | // Parameter x0: input vector. Returns acosh(x0) per element; elements with x0 < 1 give NAN
476 | template<typename VTYPE>
477 | static inline VTYPE acosh_d(VTYPE const x0) {
478 | 
479 |     // Coefficients of the rational approximation in (x0 - 1): p = numerator, q = denominator
480 |     const double p0 = 1.10855947270161294369E5;
481 |     const double p1 = 1.08102874834699867335E5;
482 |     const double p2 = 3.43989375926195455866E4;
483 |     const double p3 = 3.94726656571334401102E3;
484 |     const double p4 = 1.18801130533544501356E2;
485 | 
486 |     const double q0 = 7.83869920495893927727E4;
487 |     const double q1 = 8.29725251988426222434E4;
488 |     const double q2 = 2.97683430363289370382E4;
489 |     const double q3 = 4.15352677227719831579E3;
490 |     const double q4 = 1.86145380837903397292E2;
491 |     const double q5 = 1.0;
492 | 
493 |     // data vectors: x1 = x0 - 1, y1 = small-method result, y2 = big-method result
494 |     VTYPE  x1, y1, y2;
495 | 
496 |     x1      = x0 - 1.0;
497 |     auto undef   = x0 < 1.0;                     // result is NAN
498 |     auto x_small = x1 < 0.49;                    // use Pade approximation if x-1 < 0.49 (nominally 0.5); includes all undef elements
499 |     auto x_huge  = x1 > 1.E20;                   // simple approximation, avoid overflow
500 | 
501 |     if (horizontal_or(x_small)) {
502 |         // At least one element needs small method: sqrt(x1) times a rational function of x1
503 |         y1 = sqrt(x1) * (polynomial_4(x1, p0, p1, p2, p3, p4) / polynomial_5(x1, q0, q1, q2, q3, q4, q5));
504 |         // x < 1 generates NAN (nan_vec presumably builds a NAN carrying the code NAN_HYP)
505 |         y1 = select(undef, nan_vec<VTYPE>(NAN_HYP), y1);
506 |     }
507 |     if (!horizontal_and(x_small)) {
508 |         // At least one element needs big method: acosh(x) = log(x + sqrt(x*x - 1))
509 |         y2 = log(x0 + sqrt(mul_sub(x0,x0,1.0)));
510 |         if (horizontal_or(x_huge)) {
511 |             // At least one element needs huge method to avoid overflow of x0*x0: log(x0) + VM_LN2 (VM_LN2 is presumably ln 2)
512 |             y2 = select(x_huge, log(x0) + VM_LN2, y2);
513 |         }
514 |     }
515 |     y1 = select(x_small, y1, y2);                // choose method; a skipped branch is never selected from
516 |     return y1;
517 | }
518 | 
519 | // acosh overloads for double vectors; each one forwards to the acosh_d template
520 | static inline Vec2d acosh(Vec2d const x) {
521 |     return acosh_d<Vec2d>(x);
522 | }
523 | 
524 | #if MAX_VECTOR_SIZE >= 256
525 | static inline Vec4d acosh(Vec4d const x) {
526 |     return acosh_d<Vec4d>(x);
527 | }
528 | #endif // MAX_VECTOR_SIZE >= 256
529 | 
530 | #if MAX_VECTOR_SIZE >= 512
531 | static inline Vec8d acosh(Vec8d const x) {
532 |     return acosh_d<Vec8d>(x);
533 | }
534 | #endif // MAX_VECTOR_SIZE >= 512
535 | 
536 | 
537 | // Template for acosh function, single precision
538 | // This function does not produce denormals
539 | // Template parameter VTYPE: float vector type (instantiated below for Vec4f, Vec8f and Vec16f)
540 | // Parameter x0: input vector. Returns acosh(x0) per element; elements with x0 < 1 give NAN
541 | template<typename VTYPE>
542 | static inline VTYPE acosh_f(VTYPE const x0) {
543 | 
544 |     // Coefficients of the polynomial in (x0 - 1) used by the small method
545 |     const float r0 =  1.4142135263E0f;
546 |     const float r1 = -1.1784741703E-1f;
547 |     const float r2 =  2.6454905019E-2f;
548 |     const float r3 = -7.5272886713E-3f;
549 |     const float r4 =  1.7596881071E-3f;
550 | 
551 |     // data vectors: x1 = x0 - 1, y1 = small-method result, y2 = big-method result
552 |     VTYPE  x1, y1, y2;
553 | 
554 |     x1      = x0 - 1.0f;
555 |     auto undef   = x0 < 1.0f;                    // result is NAN
556 |     auto x_small = x1 < 0.49f;                   // use polynomial approximation if x-1 < 0.49 (nominally 0.5); includes all undef elements
557 |     auto x_huge  = x1 > 1.E10f;                  // simple approximation, avoid overflow
558 | 
559 |     if (horizontal_or(x_small)) {
560 |         // At least one element needs small method: sqrt(x1) times a polynomial in x1
561 |         y1 = sqrt(x1) * polynomial_4(x1, r0, r1, r2, r3, r4);
562 |         // x < 1 generates NAN (nan_vec presumably builds a NAN carrying the code NAN_HYP)
563 |         y1 = select(undef, nan_vec<VTYPE>(NAN_HYP), y1);
564 |     }
565 |     if (!horizontal_and(x_small)) {
566 |         // At least one element needs big method: acosh(x) = log(x + sqrt(x*x - 1))
567 |         y2 = log(x0 + sqrt(mul_sub(x0,x0,1.0f)));  // float literal, consistent with the rest of this template
568 |         if (horizontal_or(x_huge)) {
569 |             // At least one element needs huge method to avoid overflow of x0*x0: log(x0) + VM_LN2 (VM_LN2 is presumably ln 2)
570 |             y2 = select(x_huge, log(x0) + (float)VM_LN2, y2);
571 |         }
572 |     }
573 |     y1 = select(x_small, y1, y2);                // choose method; a skipped branch is never selected from
574 |     return y1;
575 | }
576 | 
577 | // acosh overloads for float vectors; each one forwards to the acosh_f template
578 | static inline Vec4f acosh(Vec4f const x) {
579 |     return acosh_f<Vec4f>(x);
580 | }
581 | 
582 | #if MAX_VECTOR_SIZE >= 256
583 | static inline Vec8f acosh(Vec8f const x) {
584 |     return acosh_f<Vec8f>(x);
585 | }
586 | #endif // MAX_VECTOR_SIZE >= 256
587 | 
588 | #if MAX_VECTOR_SIZE >= 512
589 | static inline Vec16f acosh(Vec16f const x) {
590 |     return acosh_f<Vec16f>(x);
591 | }
592 | #endif // MAX_VECTOR_SIZE >= 512
593 | 
594 | 
595 | // Template for atanh function, double precision
596 | // This function does not produce denormals
597 | // Template parameter VTYPE: double vector type (instantiated below for Vec2d, Vec4d and Vec8d)
598 | // Parameter x0: input vector. Returns atanh(x0) per element: infinity for abs(x0) == 1, NAN for abs(x0) > 1, with the sign of x0 applied
599 | template<typename VTYPE>
600 | static inline VTYPE atanh_d(VTYPE const x0) {
601 | 
602 |     // Coefficients of the rational approximation: p = numerator, q = denominator
603 |     const double p0 = -3.09092539379866942570E1;
604 |     const double p1 =  6.54566728676544377376E1;
605 |     const double p2 = -4.61252884198732692637E1;
606 |     const double p3 =  1.20426861384072379242E1;
607 |     const double p4 = -8.54074331929669305196E-1;
608 | 
609 |     const double q0 = -9.27277618139601130017E1;
610 |     const double q1 =  2.52006675691344555838E2;
611 |     const double q2 = -2.49839401325893582852E2;
612 |     const double q3 =  1.08938092147140262656E2;
613 |     const double q4 = -1.95638849376911654834E1;
614 |     const double q5 =  1.0;
615 | 
616 |     // data vectors: x = abs(x0), x2 = x*x, y1 = small-method result, y2 = big-method result, y3 = out-of-range result
617 |     VTYPE  x, x2, y1, y2, y3;
618 | 
619 |     x  = abs(x0);
620 |     auto x_small = x < 0.5;                      // use Pade approximation if abs(x) < 0.5
621 | 
622 |     if (horizontal_or(x_small)) {
623 |         // At least one element needs small method (branch is skipped when no element is small)
624 |         x2 = x * x;
625 |         y1 = polynomial_4(x2, p0, p1, p2, p3, p4) / polynomial_5(x2, q0, q1, q2, q3, q4, q5);
626 |         y1 = mul_add(y1, x2*x, x);               // y1 = x + (x2*x)*y1;
627 |     }
628 |     if (!horizontal_and(x_small)) {
629 |         // At least one element needs big method: atanh(x) = 0.5 * log((1+x)/(1-x))
630 |         y2 = log((1.0+x)/(1.0-x)) * 0.5;
631 |         // check if out of range: infinity at abs(x) == 1, NAN beyond
632 |         y3 = select(x == 1.0, infinite_vec<VTYPE>(), nan_vec<VTYPE>(NAN_HYP));
633 |         y2 = select(x >= 1.0, y3, y2);
634 |     }
635 |     y1 = select(x_small, y1, y2);                // choose method; a skipped branch is never selected from
636 |     y1 = sign_combine(y1, x0);                   // get original sign
637 |     return y1;
638 | }
639 | 
640 | // atanh overloads for double vectors; each one forwards to the atanh_d template
641 | static inline Vec2d atanh(Vec2d const x) {
642 |     return atanh_d<Vec2d>(x);
643 | }
644 | 
645 | #if MAX_VECTOR_SIZE >= 256
646 | static inline Vec4d atanh(Vec4d const x) {
647 |     return atanh_d<Vec4d>(x);
648 | }
649 | #endif // MAX_VECTOR_SIZE >= 256
650 | 
651 | #if MAX_VECTOR_SIZE >= 512
652 | static inline Vec8d atanh(Vec8d const x) {
653 |     return atanh_d<Vec8d>(x);
654 | }
655 | #endif // MAX_VECTOR_SIZE >= 512
656 | 
657 | 
658 | // Template for atanh function, single precision
659 | // This function does not produce denormals
660 | // Template parameter VTYPE: float vector type (instantiated below for Vec4f, Vec8f and Vec16f)
661 | // Parameter x0: input vector. Returns atanh(x0) per element: infinity for abs(x0) == 1, NAN for abs(x0) > 1, with the sign of x0 applied
662 | template<typename VTYPE>
663 | static inline VTYPE atanh_f(VTYPE const x0) {
664 | 
665 |     // Coefficients of the polynomial in x*x used by the small method
666 |     const float r0 = 3.33337300303E-1f;
667 |     const float r1 = 1.99782164500E-1f;
668 |     const float r2 = 1.46691431730E-1f;
669 |     const float r3 = 8.24370301058E-2f;
670 |     const float r4 = 1.81740078349E-1f;
671 | 
672 |     // data vectors: x = abs(x0), x2 = x*x, y1 = small-method result, y2 = big-method result, y3 = out-of-range result
673 |     VTYPE  x, x2, y1, y2, y3;
674 | 
675 |     x  = abs(x0);
676 |     auto x_small = x < 0.5f;                     // use polynomial approximation if abs(x) < 0.5
677 | 
678 |     if (horizontal_or(x_small)) {
679 |         // At least one element needs small method (branch is skipped when no element is small)
680 |         x2 = x * x;
681 |         y1 = polynomial_4(x2, r0, r1, r2, r3, r4);
682 |         y1 = mul_add(y1, x2*x, x);               // y1 = x + (x2*x)*y1;
683 |     }
684 |     if (!horizontal_and(x_small)) {
685 |         // At least one element needs big method: atanh(x) = 0.5 * log((1+x)/(1-x))
686 |         y2 = log((1.0f+x)/(1.0f-x)) * 0.5f;
687 |         // check if out of range: infinity at abs(x) == 1, NAN beyond
688 |         y3 = select(x == 1.0f, infinite_vec<VTYPE>(), nan_vec<VTYPE>(NAN_HYP));
689 |         y2 = select(x >= 1.0f, y3, y2);
690 |     }
691 |     y1 = select(x_small, y1, y2);                // choose method; a skipped branch is never selected from
692 |     y1 = sign_combine(y1, x0);                   // get original sign
693 |     return y1;
694 | }
695 | 
696 | // atanh overloads for float vectors; each one forwards to the atanh_f template
697 | static inline Vec4f atanh(Vec4f const x) {
698 |     return atanh_f<Vec4f>(x);
699 | }
700 | 
701 | #if MAX_VECTOR_SIZE >= 256
702 | static inline Vec8f atanh(Vec8f const x) {
703 |     return atanh_f<Vec8f>(x);
704 | }
705 | #endif // MAX_VECTOR_SIZE >= 256
706 | 
707 | #if MAX_VECTOR_SIZE >= 512
708 | static inline Vec16f atanh(Vec16f const x) {
709 |     return atanh_f<Vec16f>(x);
710 | }
711 | #endif // MAX_VECTOR_SIZE >= 512
712 | 
713 | #ifdef VCL_NAMESPACE
714 | }
715 | #endif
716 | 
717 | #endif
718 | 


--------------------------------------------------------------------------------
/CAS/VCL2/vectormath_trig.h:
--------------------------------------------------------------------------------
  1 | /****************************  vectormath_trig.h   ******************************
  2 | * Author:        Agner Fog
  3 | * Date created:  2014-04-18
  4 | * Last modified: 2020-06-08
  5 | * Version:       2.00.03
  6 | * Project:       vector class library
  7 | * Description:
  8 | * Header file containing inline version of trigonometric functions
  9 | * and inverse trigonometric functions
 10 | * sin, cos, sincos, tan
 11 | * asin, acos, atan, atan2
 12 | *
 13 | * Theory, methods and inspiration based partially on these sources:
 14 | * > Moshier, Stephen Lloyd Baluk: Methods and programs for mathematical functions.
 15 | *   Ellis Horwood, 1989.
 16 | * > VDT library developed at CERN by Danilo Piparo, Thomas Hauth and
 17 | *   Vincenzo Innocente, 2012, https://svnweb.cern.ch/trac/vdt
 18 | * > Cephes math library by Stephen L. Moshier 1992,
 19 | *   http://www.netlib.org/cephes/
 20 | *
 21 | * For detailed instructions, see vectormath_common.h and vcl_manual.pdf
 22 | *
 23 | * (c) Copyright 2014-2020 Agner Fog.
 24 | * Apache License version 2.0 or later.
 25 | ******************************************************************************/
 26 | 
 27 | #ifndef VECTORMATH_TRIG_H
 28 | #define VECTORMATH_TRIG_H  1
 29 | 
 30 | #include "vectormath_common.h"
 31 | 
 32 | #ifdef VCL_NAMESPACE
 33 | namespace VCL_NAMESPACE {
 34 | #endif
 35 | 
 36 | 
 37 | // *************************************************************
 38 | //             sin/cos template, double precision
 39 | // *************************************************************
 40 | // Template parameters:
 41 | // VTYPE:  f.p. vector type
 42 | // SC:     1 = sin, 2 = cos, 3 = sincos
 43 | // Parameters:
 44 | // xx = input x (radians)
 45 | // cosret = return pointer for cos (only written if SC = 3). Return value: sin for SC = 1 or 3, cos for SC = 2
 46 | template<typename VTYPE, int SC>
 47 | static inline VTYPE sincos_d(VTYPE * cosret, VTYPE const xx) {
 48 | 
 49 |     // define constants: polynomial coefficients for sin and cos on the reduced argument
 50 |     const double P0sin = -1.66666666666666307295E-1;
 51 |     const double P1sin = 8.33333333332211858878E-3;
 52 |     const double P2sin = -1.98412698295895385996E-4;
 53 |     const double P3sin = 2.75573136213857245213E-6;
 54 |     const double P4sin = -2.50507477628578072866E-8;
 55 |     const double P5sin = 1.58962301576546568060E-10;
 56 | 
 57 |     const double P0cos = 4.16666666666665929218E-2;
 58 |     const double P1cos = -1.38888888888730564116E-3;
 59 |     const double P2cos = 2.48015872888517045348E-5;
 60 |     const double P3cos = -2.75573141792967388112E-7;
 61 |     const double P4cos = 2.08757008419747316778E-9;
 62 |     const double P5cos = -1.13585365213876817300E-11;
 63 | 
 64 |     const double DP1 = 7.853981554508209228515625E-1 * 2.;   // DP1 + DP2 + DP3 approximate pi/2, split into three parts
 65 |     const double DP2 = 7.94662735614792836714E-9 * 2.;       // for the extended precision argument reduction below
 66 |     const double DP3 = 3.06161699786838294307E-17 * 2.;
 67 |     /*
 68 |     const double DP1sc = 7.85398125648498535156E-1;
 69 |     const double DP2sc = 3.77489470793079817668E-8;
 70 |     const double DP3sc = 2.69515142907905952645E-15;
 71 |     */
 72 |     typedef decltype(roundi(xx)) ITYPE;          // integer vector type
 73 |     typedef decltype(nan_code(xx)) UITYPE;       // unsigned integer vector type
 74 |     typedef decltype(xx < xx) BVTYPE;            // boolean vector type
 75 | 
 76 |     VTYPE  xa, x, y, x2, s, c, sin1, cos1;       // data vectors
 77 |     ITYPE  q, signsin, signcos;                  // integer vectors, 64 bit
 78 | 
 79 |     BVTYPE swap, overflow;                       // boolean vectors
 80 | 
 81 |     xa = abs(xx);
 82 | 
 83 |     // Find quadrant
 84 |     y = round(xa * (double)(2. / VM_PI));        // quadrant, as floating point
 85 |     q = roundi(y);                               // quadrant, as integer
 86 |     // Quadrant number as a function of xa:
 87 |     //      0 -   pi/4 => 0
 88 |     //   pi/4 - 3*pi/4 => 1
 89 |     // 3*pi/4 - 5*pi/4 => 2
 90 |     // 5*pi/4 - 7*pi/4 => 3
 91 |     // 7*pi/4 - 8*pi/4 => 4
 92 | 
 93 |     // Reduce by extended precision modular arithmetic
 94 |     x = nmul_add(y, DP3, nmul_add(y, DP2, nmul_add(y, DP1, xa)));    // x = ((xa - y * DP1) - y * DP2) - y * DP3;
 95 | 
 96 |     // Expansion of sin and cos, valid for -pi/4 <= x <= pi/4
 97 |     x2 = x * x;
 98 |     s = polynomial_5(x2, P0sin, P1sin, P2sin, P3sin, P4sin, P5sin);
 99 |     c = polynomial_5(x2, P0cos, P1cos, P2cos, P3cos, P4cos, P5cos);
100 |     s = mul_add(x * x2, s, x);                                       // s = x + (x * x2) * s;
101 |     c = mul_add(x2 * x2, c, nmul_add(x2, 0.5, 1.0));                 // c = 1.0 - x2 * 0.5 + (x2 * x2) * c;
102 | 
103 |     // swap sin and cos if odd quadrant
104 |     swap = BVTYPE((q & 1) != 0);
105 | 
106 |     // check for overflow: finite inputs whose quadrant number is too big give sin = 0, cos = 1
107 |     overflow = BVTYPE(UITYPE(q) > 0x80000000000000);  // q big if overflow
108 |     overflow &= is_finite(xa);                   // non-finite inputs are not treated as overflow
109 |     s = select(overflow, 0.0, s);
110 |     c = select(overflow, 1.0, c);
111 | 
112 |     if constexpr ((SC & 1) != 0) {  // calculate sin
113 |         sin1 = select(swap, c, s);
114 |         signsin = ((q << 62) ^ ITYPE(reinterpret_i(xx)));   // bit 1 of q moved to the sign bit, combined with the sign of xx
115 |         sin1 = sign_combine(sin1, reinterpret_d(signsin));
116 |     }
117 |     if constexpr ((SC & 2) != 0) {  // calculate cos
118 |         cos1 = select(swap, s, c);
119 |         signcos = ((q + 1) & 2) << 62;                      // bit 1 of q+1 moved to the sign bit
120 |         cos1 ^= reinterpret_d(signcos);                     // flip the sign of cos where that bit is set
121 |     }
122 |     if constexpr (SC == 3) {  // calculate both. cos returned through pointer
123 |         *cosret = cos1;
124 |     }
125 |     if constexpr ((SC & 1) != 0) return sin1; else return cos1;
126 | }
127 | 
128 | // instantiations of sincos_d template (sin and cos pass nullptr: cosret is only written when SC = 3):
129 | 
130 | static inline Vec2d sin(Vec2d const x) {
131 |     return sincos_d<Vec2d, 1>(nullptr, x);
132 | }
133 | 
134 | static inline Vec2d cos(Vec2d const x) {
135 |     return sincos_d<Vec2d, 2>(nullptr, x);
136 | }
137 | 
138 | static inline Vec2d sincos(Vec2d * cosret, Vec2d const x) {
139 |     return sincos_d<Vec2d, 3>(cosret, x);
140 | }
141 | 
142 | #if MAX_VECTOR_SIZE >= 256
143 | static inline Vec4d sin(Vec4d const x) {
144 |     return sincos_d<Vec4d, 1>(nullptr, x);
145 | }
146 | 
147 | static inline Vec4d cos(Vec4d const x) {
148 |     return sincos_d<Vec4d, 2>(nullptr, x);
149 | }
150 | 
151 | static inline Vec4d sincos(Vec4d * cosret, Vec4d const x) {
152 |     return sincos_d<Vec4d, 3>(cosret, x);
153 | }
154 | #endif // MAX_VECTOR_SIZE >= 256
155 | 
156 | #if MAX_VECTOR_SIZE >= 512
157 | static inline Vec8d sin(Vec8d const x) {
158 |     return sincos_d<Vec8d, 1>(nullptr, x);
159 | }
160 | 
161 | static inline Vec8d cos(Vec8d const x) {
162 |     return sincos_d<Vec8d, 2>(nullptr, x);
163 | }
164 | 
165 | static inline Vec8d sincos(Vec8d * cosret, Vec8d const x) {
166 |     return sincos_d<Vec8d, 3>(cosret, x);
167 | }
168 | #endif // MAX_VECTOR_SIZE >= 512
169 | 
170 | 
171 | // *************************************************************
172 | //             sincos template, single precision
173 | // *************************************************************
174 | // Template parameters:
175 | // VTYPE:  f.p. vector type
176 | // SC:     1 = sin, 2 = cos, 3 = sincos, 4 = tan
177 | // Parameters:
178 | // xx = input x (radians)
179 | // cosret = return pointer for cos (only written if SC = 3). Return value: sin for SC = 1 or 3, cos for SC = 2, sin/cos for SC = 4
180 | template<typename VTYPE, int SC>
181 | static inline VTYPE sincos_f(VTYPE * cosret, VTYPE const xx) {
182 | 
183 |     // define constants: DP1F + DP2F + DP3F approximate pi/2, split into three parts for the argument reduction
184 |     const float DP1F = 0.78515625f * 2.f;
185 |     const float DP2F = 2.4187564849853515625E-4f * 2.f;
186 |     const float DP3F = 3.77489497744594108E-8f * 2.f;
187 | 
188 |     const float P0sinf = -1.6666654611E-1f;
189 |     const float P1sinf = 8.3321608736E-3f;
190 |     const float P2sinf = -1.9515295891E-4f;
191 | 
192 |     const float P0cosf = 4.166664568298827E-2f;
193 |     const float P1cosf = -1.388731625493765E-3f;
194 |     const float P2cosf = 2.443315711809948E-5f;
195 | 
196 |     typedef decltype(roundi(xx)) ITYPE;          // integer vector type
197 |     typedef decltype(nan_code(xx)) UITYPE;       // unsigned integer vector type
198 |     typedef decltype(xx < xx) BVTYPE;            // boolean vector type
199 | 
200 |     VTYPE  xa, x, y, x2, s, c, sin1, cos1;       // data vectors
201 |     ITYPE  q, signsin, signcos;                  // integer vectors
202 |     BVTYPE swap, overflow;                       // boolean vectors
203 | 
204 |     xa = abs(xx);
205 | 
206 |     // Find quadrant
207 |     y = round(xa * (float)(2. / VM_PI));         // quadrant, as float
208 |     q = roundi(y);                               // quadrant, as integer
209 |     //      0 -   pi/4 => 0
210 |     //   pi/4 - 3*pi/4 => 1
211 |     // 3*pi/4 - 5*pi/4 => 2
212 |     // 5*pi/4 - 7*pi/4 => 3
213 |     // 7*pi/4 - 8*pi/4 => 4
214 | 
215 |     // Reduce by extended precision modular arithmetic
216 |     // x = ((xa - y * DP1F) - y * DP2F) - y * DP3F;
217 |     x = nmul_add(y, DP3F, nmul_add(y, DP2F, nmul_add(y, DP1F, xa)));
218 | 
219 |     // A two-step reduction saves time at the cost of precision for very big x:
220 |     //x = (xa - y * DP1F) - y * (DP2F+DP3F);
221 | 
222 |     // Taylor expansion of sin and cos, valid for -pi/4 <= x <= pi/4
223 |     x2 = x * x;
224 |     s = polynomial_2(x2, P0sinf, P1sinf, P2sinf) * (x*x2) + x;
225 |     c = polynomial_2(x2, P0cosf, P1cosf, P2cosf) * (x2*x2) + nmul_add(0.5f, x2, 1.0f);
226 | 
227 |     // swap sin and cos if odd quadrant
228 |     swap = BVTYPE((q & 1) != 0);
229 | 
230 |     // check for overflow: finite inputs whose quadrant number is too big give sin = 0, cos = 1
231 |     overflow = BVTYPE(UITYPE(q) > 0x2000000);  // q big if overflow
232 |     overflow &= is_finite(xa);                   // non-finite inputs are not treated as overflow
233 |     s = select(overflow, 0.0f, s);
234 |     c = select(overflow, 1.0f, c);
235 | 
236 |     if constexpr ((SC & 5) != 0) {  // calculate sin (needed for SC = 1, 3 and 4)
237 |         sin1 = select(swap, c, s);
238 |         signsin = ((q << 30) ^ ITYPE(reinterpret_i(xx)));   // bit 1 of q moved to the sign bit, combined with the sign of xx
239 |         sin1 = sign_combine(sin1, reinterpret_f(signsin));
240 |     }
241 |     if constexpr ((SC & 6) != 0) {  // calculate cos (needed for SC = 2, 3 and 4)
242 |         cos1 = select(swap, s, c);
243 |         signcos = ((q + 1) & 2) << 30;                      // bit 1 of q+1 moved to the sign bit
244 |         cos1 ^= reinterpret_f(signcos);                     // flip the sign of cos where that bit is set
245 |     }
246 |     if constexpr (SC == 1) return sin1;
247 |     else if constexpr (SC == 2) return cos1;
248 |     else if constexpr (SC == 3) {  // calculate both. cos returned through pointer
249 |         *cosret = cos1;
250 |         return sin1;
251 |     }
252 |     else {  // SC == 4. tan
253 |         return sin1 / cos1;
254 |     }
255 | }
256 | 
257 | // instantiations of sincos_f template (sin, cos and tan pass nullptr: cosret is only written when SC = 3):
258 | 
259 | static inline Vec4f sin(Vec4f const x) {
260 |     return sincos_f<Vec4f, 1>(nullptr, x);
261 | }
262 | 
263 | static inline Vec4f cos(Vec4f const x) {
264 |     return sincos_f<Vec4f, 2>(nullptr, x);
265 | }
266 | 
267 | static inline Vec4f sincos(Vec4f * cosret, Vec4f const x) {
268 |     return sincos_f<Vec4f, 3>(cosret, x);
269 | }
270 | 
271 | static inline Vec4f tan(Vec4f const x) {
272 |     return sincos_f<Vec4f, 4>(nullptr, x);
273 | }
274 | 
275 | #if MAX_VECTOR_SIZE >= 256
276 | static inline Vec8f sin(Vec8f const x) {
277 |     return sincos_f<Vec8f, 1>(nullptr, x);
278 | }
279 | 
280 | static inline Vec8f cos(Vec8f const x) {
281 |     return sincos_f<Vec8f, 2>(nullptr, x);
282 | }
283 | 
284 | static inline Vec8f sincos(Vec8f * cosret, Vec8f const x) {
285 |     return sincos_f<Vec8f, 3>(cosret, x);
286 | }
287 | 
288 | static inline Vec8f tan(Vec8f const x) {
289 |     return sincos_f<Vec8f, 4>(nullptr, x);
290 | }
291 | #endif // MAX_VECTOR_SIZE >= 256
292 | 
293 | #if MAX_VECTOR_SIZE >= 512
294 | static inline Vec16f sin(Vec16f const x) {
295 |     return sincos_f<Vec16f, 1>(nullptr, x);
296 | }
297 | 
298 | static inline Vec16f cos(Vec16f const x) {
299 |     return sincos_f<Vec16f, 2>(nullptr, x);
300 | }
301 | 
302 | static inline Vec16f sincos(Vec16f * cosret, Vec16f const x) {
303 |     return sincos_f<Vec16f, 3>(cosret, x);
304 | }
305 | 
306 | static inline Vec16f tan(Vec16f const x) {
307 |     return sincos_f<Vec16f, 4>(nullptr, x);
308 | }
309 | #endif // MAX_VECTOR_SIZE >= 512
310 | 
311 | 
312 | // *************************************************************
313 | //             tan template, double precision
314 | // *************************************************************
315 | // Template parameters:
316 | // VTYPE:  f.p. vector type
317 | // Parameters:
318 | // x = input x (radians). Return value: tan(x) per element, 0 for finite inputs that overflow the quadrant number
319 | template<typename VTYPE>
320 | static inline VTYPE tan_d(VTYPE const x) {
321 | 
322 |     // define constants: DP1 + DP2 + DP3 approximate pi/2, split into three parts for the argument reduction
323 |     const double DP1 = 7.853981554508209228515625E-1 * 2.;
324 |     const double DP2 = 7.94662735614792836714E-9 * 2.;
325 |     const double DP3 = 3.06161699786838294307E-17 * 2.;
326 | 
327 |     const double P2tan = -1.30936939181383777646E4;
328 |     const double P1tan = 1.15351664838587416140E6;
329 |     const double P0tan = -1.79565251976484877988E7;
330 | 
331 |     const double Q3tan = 1.36812963470692954678E4;
332 |     const double Q2tan = -1.32089234440210967447E6;
333 |     const double Q1tan = 2.50083801823357915839E7;
334 |     const double Q0tan = -5.38695755929454629881E7;
335 | 
336 |     typedef decltype(x > x) BVTYPE;         // boolean vector type
337 |     VTYPE  xa, y, z, zz, px, qx, tn, recip; // data vectors
338 |     BVTYPE doinvert, xzero, overflow;       // boolean vectors
339 |     typedef decltype(nan_code(x)) UITYPE;   // unsigned integer vector type
340 | 
341 | 
342 |     xa = abs(x);
343 | 
344 |     // Find quadrant
345 |     y = round(xa * (double)(2. / VM_PI));   // quadrant, as floating point
346 |     auto q = roundi(y);                     // quadrant, as integer
347 |     // Quadrant number as a function of xa:
348 |     //      0 -   pi/4 => 0
349 |     //   pi/4 - 3*pi/4 => 1
350 |     // 3*pi/4 - 5*pi/4 => 2
351 |     // 5*pi/4 - 7*pi/4 => 3
352 |     // 7*pi/4 - 8*pi/4 => 4
353 | 
354 |     // Reduce by extended precision modular arithmetic
355 |     // z = ((xa - y * DP1) - y * DP2) - y * DP3;
356 |     z = nmul_add(y, DP3, nmul_add(y, DP2, nmul_add(y, DP1, xa)));
357 | 
358 |     // Pade expansion of tan, valid for -pi/4 <= x <= pi/4
359 |     zz = z * z;
360 |     px = polynomial_2(zz, P0tan, P1tan, P2tan);
361 |     qx = polynomial_4n(zz, Q0tan, Q1tan, Q2tan, Q3tan);
362 | 
363 |     // qx cannot be 0 for x <= pi/4
364 |     tn = mul_add(px / qx, z * zz, z);            // tn = z + z * zz * px / qx;
365 | 
366 |     // if (q&1) tn = -1/tn   (odd quadrant: tan(z + pi/2) = -1/tan(z))
367 |     doinvert = BVTYPE((q & 1) != 0);
368 |     xzero = (xa == 0.);
369 |     // avoid division by 0. We will not be using recip anyway if xa == 0.
370 |     // tn never becomes exactly 0 when x = pi/2 so we only have to make
371 |     // a special case for x == 0.
372 |     recip = (-1.) / select(xzero, VTYPE(-1.), tn);
373 |     tn = select(doinvert, recip, tn);
374 |     tn = sign_combine(tn, x);       // get original sign
375 | 
376 |     overflow = BVTYPE(UITYPE(q) > 0x80000000000000) & is_finite(xa);   // q big if overflow; non-finite inputs are excluded
377 |     tn = select(overflow, 0., tn);
378 | 
379 |     return tn;
380 | }
381 | 
382 | // tan overloads for double vectors; each one forwards to the tan_d template
383 | 
384 | static inline Vec2d tan(Vec2d const x) {
385 |     return tan_d<Vec2d>(x);
386 | }
387 | 
388 | #if MAX_VECTOR_SIZE >= 256
389 | static inline Vec4d tan(Vec4d const x) {
390 |     return tan_d<Vec4d>(x);
391 | }
392 | #endif // MAX_VECTOR_SIZE >= 256
393 | 
394 | #if MAX_VECTOR_SIZE >= 512
395 | static inline Vec8d tan(Vec8d const x) {
396 |     return tan_d<Vec8d>(x);
397 | }
398 | #endif // MAX_VECTOR_SIZE >= 512
399 | 
400 | 
401 | // *************************************************************
402 | //             tan template, single precision
403 | // *************************************************************
404 | // This is removed for the single precision version.
405 | // It is faster to use tan(x) = sin(x)/cos(x)
406 | 
407 | 
408 | 
409 | // *************************************************************
410 | //             asin/acos template, double precision
411 | // *************************************************************
412 | // Template parameters:
413 | // VTYPE:  f.p. vector type
414 | // AC: 0 = asin, 1 = acos
415 | // Parameters:
416 | // x = input x
417 | template<typename VTYPE, int AC>
418 | static inline VTYPE asin_d(VTYPE const x) {
419 |     // Returns asin(x) when AC == 0 and acos(x) when AC == 1, computed per element.
420 |     // Coefficients of the two rational approximations used below:
421 |     const double R4asin = 2.967721961301243206100E-3;    // R*: numerator for the "big" range (xa >= 0.625)
422 |     const double R3asin = -5.634242780008963776856E-1;
423 |     const double R2asin = 6.968710824104713396794E0;
424 |     const double R1asin = -2.556901049652824852289E1;
425 |     const double R0asin = 2.853665548261061424989E1;
426 | 
427 |     const double S3asin = -2.194779531642920639778E1;    // S*: denominator for the "big" range; the x1^4 term has an implicit coefficient of 1
428 |     const double S2asin = 1.470656354026814941758E2;
429 |     const double S1asin = -3.838770957603691357202E2;
430 |     const double S0asin = 3.424398657913078477438E2;
431 | 
432 |     const double P5asin = 4.253011369004428248960E-3;    // P*: numerator for the "small" range (xa < 0.625)
433 |     const double P4asin = -6.019598008014123785661E-1;
434 |     const double P3asin = 5.444622390564711410273E0;
435 |     const double P2asin = -1.626247967210700244449E1;
436 |     const double P1asin = 1.956261983317594739197E1;
437 |     const double P0asin = -8.198089802484824371615E0;
438 | 
439 |     const double Q4asin = -1.474091372988853791896E1;    // Q*: denominator for the "small" range; the x1^5 term has an implicit coefficient of 1
440 |     const double Q3asin = 7.049610280856842141659E1;
441 |     const double Q2asin = -1.471791292232726029859E2;
442 |     const double Q1asin = 1.395105614657485689735E2;
443 |     const double Q0asin = -4.918853881490881290097E1;
444 | 
445 |     VTYPE  xa, xb, x1, x2, x3, x4, x5, px, qx, rx, sx, vx, wx, y1, yb, z, z1, z2;   // NOTE(review): yb is declared but never used in this function
446 |     bool   dobig, dosmall;                   // whole-vector flags that let a range be skipped when no element needs it
447 | 
448 |     xa = abs(x);                             // work on |x|; the sign is handled in the final correction step
449 |     auto big = xa >= 0.625;  // boolean vector: true for elements that use the "big" formula
450 | 
451 |     /*
452 |     Small: xa < 0.625
453 |     ------------------
454 |     x = xa * xa;
455 |     px = PX(x);
456 |     qx = QX(x);
457 |     y1 = x*px/qx;
458 |     y1 = xa * y1 + xa;
459 | 
460 |     Big: xa >= 0.625
461 |     ------------------
462 |     x = 1.0 - xa;
463 |     rx = RX(x);
464 |     sx = SX(x);
465 |     y1 = x * rx/sx;
466 |     x3 = sqrt(x+x);
467 |     y3 = x3 * y1 - MOREBITS;
468 |     z = pi/2 - x3 - y3
469 |     */
470 | 
471 |     // select a common x for all polynomials: 1 - xa for big elements, xa^2 for small ones
472 |     // This allows sharing of powers of x through common subexpression elimination
473 |     x1 = select(big, 1.0 - xa, xa * xa);
474 | 
475 |     // calculate powers of x1 outside branches to make sure they are only calculated once
476 |     x2 = x1 * x1;
477 |     x4 = x2 * x2;
478 |     x5 = x4 * x1;
479 |     x3 = x2 * x1;
480 | 
481 |     dosmall = !horizontal_and(big);    // at least one element is small
482 |     dobig = horizontal_or(big);        // at least one element is big
483 | 
484 |     // calculate polynomials (reuse powers of x); each pair is skipped when no element needs it
485 |     if (dosmall) {
486 |         // px = polynomial_5 (x1, P0asin, P1asin, P2asin, P3asin, P4asin, P5asin);
487 |         // qx = polynomial_5n(x1, Q0asin, Q1asin, Q2asin, Q3asin, Q4asin);
488 |         px = mul_add(x3, P3asin, P0asin) + mul_add(x4, P4asin, x1*P1asin) + mul_add(x5, P5asin, x2*P2asin);
489 |         qx = mul_add(x4, Q4asin, x5) + mul_add(x3, Q3asin, x1*Q1asin) + mul_add(x2, Q2asin, Q0asin);
490 |     }
491 |     if (dobig) {
492 |         // rx = polynomial_4 (x1, R0asin, R1asin, R2asin, R3asin, R4asin);
493 |         // sx = polynomial_4n(x1, S0asin, S1asin, S2asin, S3asin);
494 |         rx = mul_add(x3, R3asin, x2*R2asin) + mul_add(x4, R4asin, mul_add(x1, R1asin, R0asin));
495 |         sx = mul_add(x3, S3asin, x4) + mul_add(x2, S2asin, mul_add(x1, S1asin, S0asin));
496 |     }
497 | 
498 |     // select and divide outside branches to avoid dividing twice.
499 |     vx = select(big, rx, px);                    // numerator: rx for big elements, px for small ones
500 |     wx = select(big, sx, qx);                    // denominator: sx for big elements, qx for small ones
501 |     y1 = vx / wx * x1;
502 | 
503 |     // results for big
504 |     if (dobig) {                                 // avoid square root if all are small
505 |         xb = sqrt(x1 + x1);                      // this produces NAN if xa > 1 so we don't need a special case for xa > 1
506 |         z1 = mul_add(xb, y1, xb);                // yb = xb * y1; z1 = xb + yb;
507 |     }
508 | 
509 |     // results for small
510 |     z2 = mul_add(xa, y1, xa);                    // z2 = xa * y1 + xa;
511 | 
512 |     // correct for sign. z1 is only assigned when dobig is true; otherwise no element is big and select(big, ...) never picks it
513 |     if constexpr (AC == 1) {                     // acos
514 |         z1 = select(x < 0., VM_PI - z1, z1);     // big range: use pi - z1 for negative x
515 |         z2 = VM_PI_2 - sign_combine(z2, x);      // small range: pi/2 minus the signed result (sign_combine presumably applies the sign of x - confirm)
516 |         z = select(big, z1, z2);
517 |     }
518 |     else {     // asin
519 |         z1 = VM_PI_2 - z1;                       // big range: pi/2 - z1
520 |         z = select(big, z1, z2);
521 |         z = sign_combine(z, x);                  // presumably gives the result the sign of x - confirm against sign_combine
522 |     }
523 |     return z;
524 | }
525 | 
526 | // instantiations of asin_d template:
527 | 
528 | static inline Vec2d asin(Vec2d const x) {
529 |     return asin_d<Vec2d, 0>(x);              // AC = 0 selects asin
530 | }
531 | 
532 | static inline Vec2d acos(Vec2d const x) {
533 |     return asin_d<Vec2d, 1>(x);              // AC = 1 selects acos
534 | }
535 | 
536 | #if MAX_VECTOR_SIZE >= 256
537 | static inline Vec4d asin(Vec4d const x) {
538 |     return asin_d<Vec4d, 0>(x);              // AC = 0 selects asin
539 | }
540 | 
541 | static inline Vec4d acos(Vec4d const x) {
542 |     return asin_d<Vec4d, 1>(x);              // AC = 1 selects acos
543 | }
544 | #endif // MAX_VECTOR_SIZE >= 256
545 | 
546 | #if MAX_VECTOR_SIZE >= 512
547 | static inline Vec8d asin(Vec8d const x) {
548 |     return asin_d<Vec8d, 0>(x);              // AC = 0 selects asin
549 | }
550 | 
551 | static inline Vec8d acos(Vec8d const x) {
552 |     return asin_d<Vec8d, 1>(x);              // AC = 1 selects acos
553 | }
554 | #endif // MAX_VECTOR_SIZE >= 512
555 | 
556 | 
557 | // *************************************************************
558 | //             asin/acos template, single precision
559 | // *************************************************************
560 | // Template parameters:
561 | // VTYPE:  f.p. vector type
562 | // AC: 0 = asin, 1 = acos
563 | // Parameters:
564 | // x = input x
565 | template<typename VTYPE, int AC>
566 | static inline VTYPE asin_f(VTYPE const x) {
567 |     // Returns asin(x) when AC == 0 and acos(x) when AC == 1, computed per element.
568 |     // Coefficients of the polynomial that is evaluated in x3 below
569 |     const float P4asinf = 4.2163199048E-2f;
570 |     const float P3asinf = 2.4181311049E-2f;
571 |     const float P2asinf = 4.5470025998E-2f;
572 |     const float P1asinf = 7.4953002686E-2f;
573 |     const float P0asinf = 1.6666752422E-1f;
574 | 
575 |     VTYPE  xa, x1, x2, x3, x4, xb, z, z1, z2;    // z2 is only used by the acos branch
576 | 
577 |     xa = abs(x);                                 // work on |x|; the sign is handled in the final correction step
578 |     auto big = xa > 0.5f;                        // boolean vector: true for elements that use the "big" formula
579 | 
580 |     x1 = 0.5f * (1.0f - xa);                     // polynomial argument for big elements
581 |     x2 = xa * xa;                                // polynomial argument for small elements
582 |     x3 = select(big, x1, x2);
583 | 
584 |     // The square root is computed unconditionally; the whole-vector test below is disabled
585 |     //if (horizontal_or(big))
586 |     {
587 |         xb = sqrt(x1);
588 |     }
589 |     x4 = select(big, xb, xa);                    // multiplier: sqrt(x1) for big elements, xa for small ones
590 | 
591 |     z = polynomial_4(x3, P0asinf, P1asinf, P2asinf, P3asinf, P4asinf);
592 |     z = mul_add(z, x3*x4, x4);                   // z = z * (x3*x4) + x4;
593 |     z1 = z + z;                                  // doubled value, used only for big elements
594 | 
595 |     // correct for sign
596 |     if constexpr (AC == 1) {                     // acos
597 |         z1 = select(x < 0., float(VM_PI) - z1, z1);   // big range: use pi - z1 for negative x
598 |         z2 = float(VM_PI_2) - sign_combine(z, x);     // small range: pi/2 minus the signed result (sign_combine presumably applies the sign of x - confirm)
599 |         z = select(big, z1, z2);
600 |     }
601 |     else {     // asin
602 |         z1 = float(VM_PI_2) - z1;                // big range: pi/2 - z1
603 |         z = select(big, z1, z);
604 |         z = sign_combine(z, x);                  // presumably gives the result the sign of x - confirm against sign_combine
605 |     }
606 | 
607 |     return z;
608 | }
608 | 
609 | // instantiations of asin_f template:
610 | 
611 | static inline Vec4f asin(Vec4f const x) {
612 |     return asin_f<Vec4f, 0>(x);              // AC = 0 selects asin
613 | }
614 | 
615 | static inline Vec4f acos(Vec4f const x) {
616 |     return asin_f<Vec4f, 1>(x);              // AC = 1 selects acos
617 | }
618 | 
619 | #if MAX_VECTOR_SIZE >= 256
620 | static inline Vec8f asin(Vec8f const x) {
621 |     return asin_f<Vec8f, 0>(x);              // AC = 0 selects asin
622 | }
623 | static inline Vec8f acos(Vec8f const x) {
624 |     return asin_f<Vec8f, 1>(x);              // AC = 1 selects acos
625 | }
626 | #endif // MAX_VECTOR_SIZE >= 256
627 | 
628 | #if MAX_VECTOR_SIZE >= 512
629 | static inline Vec16f asin(Vec16f const x) {
630 |     return asin_f<Vec16f, 0>(x);             // AC = 0 selects asin
631 | }
632 | static inline Vec16f acos(Vec16f const x) {
633 |     return asin_f<Vec16f, 1>(x);             // AC = 1 selects acos
634 | }
635 | #endif // MAX_VECTOR_SIZE >= 512
636 | 
637 | 
638 | // *************************************************************
639 | //             atan template, double precision
640 | // *************************************************************
641 | // Template parameters:
642 | // VTYPE:  f.p. vector type
643 | // T2:     0 = atan, 1 = atan2
644 | // Parameters:
645 | // y, x. calculate atan(y/x)
646 | // result is between -pi/2 and +pi/2 when x > 0
647 | // result is between -pi and -pi/2 or between pi/2 and pi when x < 0 for atan2
648 | template<typename VTYPE, int T2>
649 | static inline VTYPE atan_d(VTYPE const y, VTYPE const x) {
650 |     // Returns atan(y) when T2 == 0 (x is not read) and atan2(y, x) when T2 == 1, computed per element.
651 |     // define constants
652 |     //const double ONEOPIO4 = 4./VM_PI;
653 |     const double MOREBITS = 6.123233995736765886130E-17;   // small correction added to the result together with pi/2 (big range)
654 |     const double MOREBITSO2 = MOREBITS * 0.5;              // half of it, added together with pi/4 (medium range)
655 |     const double T3PO8 = VM_SQRT2 + 1.; // 2.41421356237309504880; threshold between the medium and big ranges
656 | 
657 |     const double P4atan = -8.750608600031904122785E-1;     // P*: numerator polynomial in zz
658 |     const double P3atan = -1.615753718733365076637E1;
659 |     const double P2atan = -7.500855792314704667340E1;
660 |     const double P1atan = -1.228866684490136173410E2;
661 |     const double P0atan = -6.485021904942025371773E1;
662 | 
663 |     const double Q4atan = 2.485846490142306297962E1;       // Q*: denominator polynomial in zz
664 |     const double Q3atan = 1.650270098316988542046E2;
665 |     const double Q2atan = 4.328810604912902668951E2;
666 |     const double Q1atan = 4.853903996359136964868E2;
667 |     const double Q0atan = 1.945506571482613964425E2;
668 | 
669 |     typedef decltype (x > x) BVTYPE;                            // boolean vector type
670 |     VTYPE  t, x1, x2, y1, y2, s, fac, a, b, z, zz, px, qx, re;  // data vectors
671 |     BVTYPE swapxy, notbig, notsmal;                             // boolean vectors
672 | 
673 |     if constexpr (T2 == 1) {  // atan2(y,x)
674 |         // move in first octant
675 |         x1 = abs(x);
676 |         y1 = abs(y);
677 |         swapxy = (y1 > x1);
678 |         // swap x and y if y1 > x1, so that x2 >= y2 and the quotient below is at most 1
679 |         x2 = select(swapxy, y1, x1);
680 |         y2 = select(swapxy, x1, y1);
681 | 
682 |         // check for special case: x and y are both +/- INF
683 |         BVTYPE both_infinite = is_inf(x) & is_inf(y);   // x and Y are both infinite
684 |         if (horizontal_or(both_infinite)) {             // at least one element has both infinite
685 |             VTYPE mone = VTYPE(-1.0);
686 |             x2 = select(both_infinite, x2 & mone, x2);  // replace INF by 1.0 (assuming & is a bitwise AND; x2 is already an absolute value here)
687 |             y2 = select(both_infinite, y2 & mone, y2);  // same replacement for y2
688 |         }
689 | 
690 |         t = y2 / x2;                  // x = y = 0 gives NAN here
691 |     }
692 |     else {    // atan(y)
693 |         t = abs(y);
694 |     }
695 | 
696 |     // small:  t < 0.66
697 |     // medium: 0.66 <= t <= 2.4142 (1+sqrt(2))
698 |     // big:    t > 2.4142
699 |     notbig  = t <= T3PO8;  // t <= 2.4142
700 |     notsmal = t >= 0.66;   // t >= 0.66
701 | 
702 |     // s is the offset added to the result: 0 (small), pi/4 (medium) or pi/2 (big); fac is the matching correction term
703 |     s   = select(notbig, VTYPE(VM_PI_4), VTYPE(VM_PI_2));
704 |     s   = notsmal & s;                   // select(notsmal, s, 0.);
705 |     fac = select(notbig, VTYPE(MOREBITSO2), VTYPE(MOREBITS));
706 |     fac = notsmal & fac;  //select(notsmal, fac, 0.);
707 |     // small:  z = t / 1.0;
708 |     // medium: z = (t-1.0) / (t+1.0);
709 |     // big:    z = -1.0 / t;
710 |     a = notbig & t;                    // select(notbig, t, 0.);
711 |     a = if_add(notsmal, a, -1.);       // presumably adds -1 only where notsmal is true - confirm against if_add
712 |     b = notbig & VTYPE(1.);            //  select(notbig, 1., 0.);
713 |     b = if_add(notsmal, b, t);         // presumably adds t only where notsmal is true - confirm against if_add
714 |     z = a / b;                         // division by 0 will not occur unless x and y are both 0
715 | 
716 |     zz = z * z;
717 | 
718 |     px = polynomial_4(zz, P0atan, P1atan, P2atan, P3atan, P4atan);
719 |     qx = polynomial_5n(zz, Q0atan, Q1atan, Q2atan, Q3atan, Q4atan);
720 | 
721 |     re = mul_add(px / qx, z * zz, z);  // re = (px / qx) * (z * zz) + z;
722 |     re += s + fac;                     // add the range offset and its correction term
723 | 
724 |     if constexpr (T2 == 1) {           // atan2(y,x)
725 |         // move back in place
726 |         re = select(swapxy, VM_PI_2 - re, re);   // undo the x/y swap
727 |         re = select((x | y) == 0., 0., re);      // atan2(0,0) = 0 by convention
728 |         re = select(sign_bit(x), VM_PI - re, re);// also for x = -0.
729 |     }
730 |     // get sign bit
731 |     re = sign_combine(re, y);          // presumably gives the result the sign of y - confirm against sign_combine
732 | 
733 |     return re;
734 | }
735 | 
736 | // instantiations of atan_d template:
737 | 
738 | static inline Vec2d atan2(Vec2d const y, Vec2d const x) {
739 |     return atan_d<Vec2d, 1>(y, x);           // T2 = 1 selects atan2
740 | }
741 | 
742 | static inline Vec2d atan(Vec2d const y) {
743 |     return atan_d<Vec2d, 0>(y, 0.);          // T2 = 0 selects atan; the 0. is a placeholder, atan_d only reads x when T2 == 1
744 | }
745 | 
746 | #if MAX_VECTOR_SIZE >= 256
747 | static inline Vec4d atan2(Vec4d const y, Vec4d const x) {
748 |     return atan_d<Vec4d, 1>(y, x);           // T2 = 1 selects atan2
749 | }
750 | 
751 | static inline Vec4d atan(Vec4d const y) {
752 |     return atan_d<Vec4d, 0>(y, 0.);          // T2 = 0 selects atan; the 0. is a placeholder
753 | }
754 | #endif // MAX_VECTOR_SIZE >= 256
755 | 
756 | #if MAX_VECTOR_SIZE >= 512
757 | static inline Vec8d atan2(Vec8d const y, Vec8d const x) {
758 |     return atan_d<Vec8d, 1>(y, x);           // T2 = 1 selects atan2
759 | }
760 | 
761 | static inline Vec8d atan(Vec8d const y) {
762 |     return atan_d<Vec8d, 0>(y, 0.);          // T2 = 0 selects atan; the 0. is a placeholder
763 | }
764 | #endif // MAX_VECTOR_SIZE >= 512
765 | 
766 | 
767 | 
768 | // *************************************************************
769 | //             atan template, single precision
770 | // *************************************************************
771 | // Template parameters:
772 | // VTYPE:  f.p. vector type
773 | // T2:     0 = atan, 1 = atan2
774 | // Parameters:
775 | // y, x. calculate atan(y/x)
776 | // result is between -pi/2 and +pi/2 when x > 0
777 | // result is between -pi and -pi/2 or between pi/2 and pi when x < 0 for atan2
778 | template<typename VTYPE, int T2>
779 | static inline VTYPE atan_f(VTYPE const y, VTYPE const x) {
780 |     // Returns atan(y) when T2 == 0 (x is not read) and atan2(y, x) when T2 == 1, computed per element.
781 |     // Coefficients of the polynomial in zz evaluated below
782 |     const float P3atanf = 8.05374449538E-2f;
783 |     const float P2atanf = -1.38776856032E-1f;
784 |     const float P1atanf = 1.99777106478E-1f;
785 |     const float P0atanf = -3.33329491539E-1f;
786 | 
787 |     typedef decltype (x > x) BVTYPE;             // boolean vector type
788 |     VTYPE  t, x1, x2, y1, y2, s, a, b, z, zz, re;// data vectors
789 |     BVTYPE swapxy, notbig, notsmal;              // boolean vectors
790 | 
791 |     if constexpr (T2 == 1) {  // atan2(y,x)
792 |         // move in first octant
793 |         x1 = abs(x);
794 |         y1 = abs(y);
795 |         swapxy = (y1 > x1);
796 |         // swap x and y if y1 > x1, so that x2 >= y2 and the quotient below is at most 1
797 |         x2 = select(swapxy, y1, x1);
798 |         y2 = select(swapxy, x1, y1);
799 | 
800 |         // check for special case: x and y are both +/- INF
801 |         BVTYPE both_infinite = is_inf(x) & is_inf(y);   // x and Y are both infinite
802 |         if (horizontal_or(both_infinite)) {             // at least one element has both infinite
803 |             VTYPE mone = VTYPE(-1.0f);
804 |             x2 = select(both_infinite, x2 & mone, x2);  // replace INF by 1.0 (assuming & is a bitwise AND; x2 is already an absolute value here)
805 |             y2 = select(both_infinite, y2 & mone, y2);  // same replacement for y2
806 |         }
807 | 
808 |         // x = y = 0 will produce NAN. No problem, fixed below
809 |         t = y2 / x2;
810 |     }
811 |     else {    // atan(y)
812 |         t = abs(y);
813 |     }
814 | 
815 |     // small:  t < 0.4142
816 |     // medium: 0.4142 <= t <= 2.4142
817 |     // big:    t > 2.4142  (not for atan2)
818 |     if constexpr (T2 == 0) {  // atan(y)
819 |         notsmal = t >= float(VM_SQRT2 - 1.);     // t >= tan  pi/8
820 |         notbig = t <= float(VM_SQRT2 + 1.);      // t <= tan 3pi/8
821 | 
822 |         s = select(notbig, VTYPE(float(VM_PI_4)), VTYPE(float(VM_PI_2)));   // offset: pi/4 (medium) or pi/2 (big)
823 |         s = notsmal & s;                         // select(notsmal, s, 0.);
824 | 
825 |         // small:  z = t / 1.0;
826 |         // medium: z = (t-1.0) / (t+1.0);
827 |         // big:    z = -1.0 / t;
828 |         a = notbig & t;                // select(notbig, t, 0.);
829 |         a = if_add(notsmal, a, -1.f);  // presumably adds -1 only where notsmal is true - confirm against if_add
830 |         b = notbig & VTYPE(1.f);       //  select(notbig, 1., 0.);
831 |         b = if_add(notsmal, b, t);     // presumably adds t only where notsmal is true - confirm against if_add
832 |         z = a / b;                     // division by 0 will not occur unless x and y are both 0
833 |     }
834 |     else {  // atan2(y,x)
835 |         // small:  z = t / 1.0;
836 |         // medium: z = (t-1.0) / (t+1.0);
837 |         notsmal = t >= float(VM_SQRT2 - 1.);
838 |         a = if_add(notsmal, t, -1.f);
839 |         b = if_add(notsmal, 1.f, t);
840 |         s = notsmal & VTYPE(float(VM_PI_4));     // offset: pi/4 for medium elements, 0 for small ones
841 |         z = a / b;
842 |     }
843 | 
844 |     zz = z * z;
845 | 
846 |     // Taylor expansion
847 |     re = polynomial_3(zz, P0atanf, P1atanf, P2atanf, P3atanf);
848 |     re = mul_add(re, zz * z, z) + s;                       // re = re * (zz * z) + z, plus the range offset
849 | 
850 |     if constexpr (T2 == 1) {                               // atan2(y,x)
851 |         // move back in place
852 |         re = select(swapxy, float(VM_PI_2) - re, re);      // undo the x/y swap
853 |         re = select((x | y) == 0.f, 0.f, re);              // atan2(0,+0) = 0 by convention
854 |         re = select(sign_bit(x), float(VM_PI) - re, re);   // also for x = -0.
855 |     }
856 |     // get sign bit
857 |     re = sign_combine(re, y);                              // presumably gives the result the sign of y - confirm against sign_combine
858 | 
859 |     return re;
860 | }
861 | 
862 | // instantiations of atan_f template:
863 | 
864 | static inline Vec4f atan2(Vec4f const y, Vec4f const x) {
865 |     return atan_f<Vec4f, 1>(y, x);           // T2 = 1 selects atan2
866 | }
867 | 
868 | static inline Vec4f atan(Vec4f const y) {
869 |     return atan_f<Vec4f, 0>(y, 0.);          // T2 = 0 selects atan; the 0. is a placeholder, atan_f only reads x when T2 == 1
870 | }
871 | 
872 | #if MAX_VECTOR_SIZE >= 256
873 | static inline Vec8f atan2(Vec8f const y, Vec8f const x) {
874 |     return atan_f<Vec8f, 1>(y, x);           // T2 = 1 selects atan2
875 | }
876 | 
877 | static inline Vec8f atan(Vec8f const y) {
878 |     return atan_f<Vec8f, 0>(y, 0.);          // T2 = 0 selects atan; the 0. is a placeholder
879 | }
880 | 
881 | #endif // MAX_VECTOR_SIZE >= 256
882 | 
883 | #if MAX_VECTOR_SIZE >= 512
884 | static inline Vec16f atan2(Vec16f const y, Vec16f const x) {
885 |     return atan_f<Vec16f, 1>(y, x);          // T2 = 1 selects atan2
886 | }
887 | 
888 | static inline Vec16f atan(Vec16f const y) {
889 |     return atan_f<Vec16f, 0>(y, 0.);         // T2 = 0 selects atan; the 0. is a placeholder
890 | }
891 | 
892 | #endif // MAX_VECTOR_SIZE >= 512
893 | 
894 | #ifdef VCL_NAMESPACE
895 | }
896 | #endif
897 | 
898 | #endif
899 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2020 HolyWu
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | Description
 2 | ===========
 3 | 
 4 | [Contrast Adaptive Sharpening](https://gpuopen.com/fidelityfx-cas/).
 5 | 
 6 | 
 7 | Usage
 8 | =====
 9 | 
10 |     cas.CAS(clip clip[, float sharpness=0.5, int planes, int opt=0])
11 | 
12 | * clip: Clip to process. Any planar format with either integer sample type of 8-16 bit depth or float sample type of 32 bit depth is supported.
13 | 
14 | * sharpness: Sharpening strength.
15 | 
16 | * planes: Sets which planes will be processed. Any unprocessed planes will be simply copied. By default only luma plane is processed for non-RGB formats.
17 | 
18 | * opt: Sets which cpu optimizations to use.
19 |   * 0 = auto detect
20 |   * 1 = use c
21 |   * 2 = use sse2
22 |   * 3 = use avx2
23 |   * 4 = use avx512
24 | 
25 | 
26 | Compilation
27 | ===========
28 | 
29 | ```
30 | meson build
31 | ninja -C build
32 | ninja -C build install
33 | ```
34 | 


--------------------------------------------------------------------------------
/meson.build:
--------------------------------------------------------------------------------
 1 | project('CAS', 'cpp',
 2 |   default_options: ['buildtype=release', 'b_ndebug=if-release', 'cpp_std=c++17'],  # release build and C++17 by default
 3 |   meson_version: '>=0.48.0',
 4 |   version: '2'
 5 | )
 6 | 
 7 | sources = [  # sources built on every architecture
 8 |   'CAS/CAS.cpp',
 9 |   'CAS/CAS.h'
10 | ]
11 | 
12 | vapoursynth_dep = dependency('vapoursynth').partial_dependency(compile_args: true, includes: true)  # compile flags and include paths only
13 | 
14 | libs = []  # per-instruction-set static libraries, filled in below on x86
15 | 
16 | if host_machine.cpu_family().startswith('x86')
17 |   add_project_arguments('-fno-math-errno', '-fno-trapping-math', '-DCAS_X86', '-mfpmath=sse', '-msse2', language: 'cpp')  # project-wide flags; SSE2 is the baseline
18 | 
19 |   sources += [  # SSE2 kernel plus the bundled VCL2 files
20 |     'CAS/CAS_SSE2.cpp',
21 |     'CAS/VCL2/instrset.h',
22 |     'CAS/VCL2/instrset_detect.cpp',
23 |     'CAS/VCL2/vector_convert.h',
24 |     'CAS/VCL2/vectorclass.h',
25 |     'CAS/VCL2/vectorf128.h',
26 |     'CAS/VCL2/vectorf256.h',
27 |     'CAS/VCL2/vectorf256e.h',
28 |     'CAS/VCL2/vectorf512.h',
29 |     'CAS/VCL2/vectorf512e.h',
30 |     'CAS/VCL2/vectori128.h',
31 |     'CAS/VCL2/vectori256.h',
32 |     'CAS/VCL2/vectori256e.h',
33 |     'CAS/VCL2/vectori512.h',
34 |     'CAS/VCL2/vectori512e.h',
35 |     'CAS/VCL2/vectori512s.h',
36 |     'CAS/VCL2/vectori512se.h',
37 |     'CAS/VCL2/vectormath_common.h',
38 |     'CAS/VCL2/vectormath_exp.h',
39 |     'CAS/VCL2/vectormath_hyp.h',
40 |     'CAS/VCL2/vectormath_lib.h',
41 |     'CAS/VCL2/vectormath_trig.h'
42 |   ]
43 | 
44 |   libs += static_library('avx2', 'CAS/CAS_AVX2.cpp',  # AVX2 kernel compiled separately with its own flags
45 |     dependencies: vapoursynth_dep,
46 |     cpp_args: ['-mavx2', '-mfma'],
47 |     gnu_symbol_visibility: 'hidden'
48 |   )
49 | 
50 |   libs += static_library('avx512', 'CAS/CAS_AVX512.cpp',  # AVX-512 kernel compiled separately with its own flags
51 |     dependencies: vapoursynth_dep,
52 |     cpp_args: ['-mavx512f', '-mavx512vl', '-mavx512bw', '-mavx512dq', '-mfma'],
53 |     gnu_symbol_visibility: 'hidden'
54 |   )
55 | endif
56 | 
57 | shared_module('cas', sources,  # the plugin module, linked with the static libraries above
58 |   dependencies: vapoursynth_dep,
59 |   link_with: libs,
60 |   install: true,
61 |   install_dir: join_paths(vapoursynth_dep.get_pkgconfig_variable('libdir'), 'vapoursynth'),  # installs into <libdir>/vapoursynth
62 |   gnu_symbol_visibility: 'hidden'
63 | )
64 | 


--------------------------------------------------------------------------------