├── .gitattributes ├── .gitignore ├── CAS ├── CAS.cpp ├── CAS.h ├── CAS.vcxproj ├── CAS.vcxproj.filters ├── CAS_AVX2.cpp ├── CAS_AVX512.cpp ├── CAS_SSE2.cpp └── VCL2 │ ├── LICENSE │ ├── instrset.h │ ├── instrset_detect.cpp │ ├── vector_convert.h │ ├── vectorclass.h │ ├── vectorf128.h │ ├── vectorf256.h │ ├── vectorf256e.h │ ├── vectorf512.h │ ├── vectorf512e.h │ ├── vectori128.h │ ├── vectori256.h │ ├── vectori256e.h │ ├── vectori512.h │ ├── vectori512e.h │ ├── vectori512s.h │ ├── vectori512se.h │ ├── vectormath_common.h │ ├── vectormath_exp.h │ ├── vectormath_hyp.h │ ├── vectormath_lib.h │ └── vectormath_trig.h ├── LICENSE ├── README.md └── meson.build /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Prerequisites 2 | *.d 3 | 4 | # Compiled Object files 5 | *.slo 6 | *.lo 7 | *.o 8 | *.obj 9 | 10 | # Precompiled Headers 11 | *.gch 12 | *.pch 13 | 14 | # Compiled Dynamic libraries 15 | *.so 16 | *.dylib 17 | *.dll 18 | 19 | # Fortran module files 20 | *.mod 21 | *.smod 22 | 23 | # Compiled Static libraries 24 | *.lai 25 | *.la 26 | *.a 27 | *.lib 28 | 29 | # Executables 30 | *.exe 31 | *.out 32 | *.app 33 | 34 | # User-specific files 35 | *.rsuser 36 | *.suo 37 | *.user 38 | *.userosscache 39 | *.sln.docstates 40 | 41 | # Visual Studio 2015/2017 cache/options directory 42 | .vs/ 43 | 44 | # Files built by Visual Studio 45 | *_i.c 46 | *_p.c 47 | *_h.h 48 | *.ilk 49 | *.meta 50 | *.obj 51 | *.iobj 52 | *.pch 53 | *.pdb 54 | *.ipdb 55 | *.pgc 56 | *.pgd 57 | *.rsp 58 | *.sbr 59 | *.tlb 60 | *.tli 61 | *.tlh 62 | *.tmp 63 | *.tmp_proj 64 | *_wpftmp.csproj 65 | *.log 66 | *.vspscc 67 | *.vssscc 68 | .builds 69 | *.pidb 70 | *.svclog 71 | *.scc 72 | 73 | # Visual C++ 
cache files 74 | ipch/ 75 | *.aps 76 | *.ncb 77 | *.opendb 78 | *.opensdf 79 | *.sdf 80 | *.cachefile 81 | *.VC.db 82 | *.VC.VC.opendb 83 | 84 | # Visual Studio profiler 85 | *.psess 86 | *.vsp 87 | *.vspx 88 | *.sap 89 | 90 | # Windows thumbnail cache files 91 | Thumbs.db 92 | Thumbs.db:encryptable 93 | ehthumbs.db 94 | ehthumbs_vista.db 95 | 96 | # Dump file 97 | *.stackdump 98 | 99 | # Folder config file 100 | [Dd]esktop.ini 101 | 102 | # Recycle Bin used on file shares 103 | $RECYCLE.BIN/ 104 | 105 | # Windows Installer files 106 | *.cab 107 | *.msi 108 | *.msix 109 | *.msm 110 | *.msp 111 | 112 | # Windows shortcuts 113 | *.lnk 114 | -------------------------------------------------------------------------------- /CAS/CAS.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | MIT License 3 | 4 | Copyright (c) 2020 Holy Wu 5 | 6 | Permission is hereby granted, free of charge, to any person obtaining a copy 7 | of this software and associated documentation files (the "Software"), to deal 8 | in the Software without restriction, including without limitation the rights 9 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 | copies of the Software, and to permit persons to whom the Software is 11 | furnished to do so, subject to the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be included in all 14 | copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 19 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 22 | SOFTWARE. 
23 | */ 24 | 25 | #include 26 | 27 | #include 28 | #include 29 | #include 30 | 31 | #include "CAS.h" 32 | 33 | #ifdef CAS_X86 34 | template extern void filter_sse2(const VSFrameRef * src, VSFrameRef * dst, const CASData * const VS_RESTRICT data, const VSAPI * vsapi) noexcept; 35 | template extern void filter_avx2(const VSFrameRef * src, VSFrameRef * dst, const CASData * const VS_RESTRICT data, const VSAPI * vsapi) noexcept; 36 | template extern void filter_avx512(const VSFrameRef * src, VSFrameRef * dst, const CASData * const VS_RESTRICT data, const VSAPI * vsapi) noexcept; 37 | #endif 38 | 39 | template 40 | static void filter_c(const VSFrameRef * src, VSFrameRef * dst, const CASData * const VS_RESTRICT data, const VSAPI * vsapi) noexcept { 41 | using var_t = std::conditional_t, int, float>; 42 | 43 | const var_t limit = std::any_cast(data->limit); 44 | 45 | auto filtering = [&](const var_t a, const var_t b, const var_t c, const var_t d, const var_t e, const var_t f, const var_t g, const var_t h, const var_t i, 46 | const float chromaOffset) noexcept { 47 | // Soft min and max. 48 | // a b c b 49 | // d e f * 0.5 + d e f * 0.5 50 | // g h i h 51 | // These are 2.0x bigger (factored out the extra multiply). 52 | var_t mn = std::min({ d, e, f, b, h }); 53 | const var_t mn2 = std::min({ mn, a, c, g, i }); 54 | mn += mn2; 55 | 56 | var_t mx = std::max({ d, e, f, b, h }); 57 | const var_t mx2 = std::max({ mx, a, c, g, i }); 58 | mx += mx2; 59 | 60 | if constexpr (std::is_floating_point_v) { 61 | mn += chromaOffset; 62 | mx += chromaOffset; 63 | } 64 | 65 | // Smooth minimum distance to signal limit divided by smooth max. 66 | float amp = std::clamp(std::min(mn, limit - mx) / static_cast(mx), 0.0f, 1.0f); 67 | 68 | // Shaping amount of sharpening. 69 | amp = std::sqrt(amp); 70 | 71 | // Filter shape. 
72 | // 0 w 0 73 | // w 1 w 74 | // 0 w 0 75 | const float weight = amp * data->sharpness; 76 | return ((b + d + f + h) * weight + e) / (1.0f + 4.0f * weight); 77 | }; 78 | 79 | for (int plane = 0; plane < data->vi->format->numPlanes; plane++) { 80 | if (data->process[plane]) { 81 | const int width = vsapi->getFrameWidth(src, plane); 82 | const int height = vsapi->getFrameHeight(src, plane); 83 | const int stride = vsapi->getStride(src, plane) / sizeof(pixel_t); 84 | const pixel_t * srcp = reinterpret_cast(vsapi->getReadPtr(src, plane)); 85 | pixel_t * VS_RESTRICT dstp = reinterpret_cast(vsapi->getWritePtr(dst, plane)); 86 | 87 | const float chromaOffset = plane ? 1.0f : 0.0f; 88 | 89 | for (int y = 0; y < height; y++) { 90 | const pixel_t * above = srcp + (y == 0 ? stride : -stride); 91 | const pixel_t * below = srcp + (y == height - 1 ? -stride : stride); 92 | 93 | { 94 | const float result = filtering(above[1], above[0], above[1], 95 | srcp[1], srcp[0], srcp[1], 96 | below[1], below[0], below[1], 97 | chromaOffset); 98 | 99 | if constexpr (std::is_integral_v) 100 | dstp[0] = std::clamp(static_cast(result + 0.5f), 0, data->peak); 101 | else 102 | dstp[0] = result; 103 | } 104 | 105 | for (int x = 1; x < width - 1; x++) { 106 | const float result = filtering(above[x - 1], above[x], above[x + 1], 107 | srcp[x - 1], srcp[x], srcp[x + 1], 108 | below[x - 1], below[x], below[x + 1], 109 | chromaOffset); 110 | 111 | if constexpr (std::is_integral_v) 112 | dstp[x] = std::clamp(static_cast(result + 0.5f), 0, data->peak); 113 | else 114 | dstp[x] = result; 115 | } 116 | 117 | { 118 | const float result = filtering(above[width - 2], above[width - 1], above[width - 2], 119 | srcp[width - 2], srcp[width - 1], srcp[width - 2], 120 | below[width - 2], below[width - 1], below[width - 2], 121 | chromaOffset); 122 | 123 | if constexpr (std::is_integral_v) 124 | dstp[width - 1] = std::clamp(static_cast(result + 0.5f), 0, data->peak); 125 | else 126 | dstp[width - 1] = result; 127 
| } 128 | 129 | srcp += stride; 130 | dstp += stride; 131 | } 132 | } 133 | } 134 | } 135 | 136 | static void VS_CC casInit(VSMap * in, VSMap * out, void ** instanceData, VSNode * node, VSCore * core, const VSAPI * vsapi) { 137 | CASData * d = static_cast(*instanceData); 138 | vsapi->setVideoInfo(d->vi, 1, node); 139 | } 140 | 141 | static const VSFrameRef * VS_CC casGetFrame(int n, int activationReason, void ** instanceData, void ** frameData, VSFrameContext * frameCtx, VSCore * core, const VSAPI * vsapi) { 142 | const CASData * d = static_cast(*instanceData); 143 | 144 | if (activationReason == arInitial) { 145 | vsapi->requestFrameFilter(n, d->node, frameCtx); 146 | } else if (activationReason == arAllFramesReady) { 147 | const VSFrameRef * src = vsapi->getFrameFilter(n, d->node, frameCtx); 148 | const VSFrameRef * fr[] = { d->process[0] ? nullptr : src, d->process[1] ? nullptr : src, d->process[2] ? nullptr : src }; 149 | const int pl[] = { 0, 1, 2 }; 150 | VSFrameRef * dst = vsapi->newVideoFrame2(d->vi->format, d->vi->width, d->vi->height, fr, pl, src, core); 151 | 152 | d->filter(src, dst, d, vsapi); 153 | 154 | vsapi->freeFrame(src); 155 | return dst; 156 | } 157 | 158 | return nullptr; 159 | } 160 | 161 | static void VS_CC casFree(void * instanceData, VSCore * core, const VSAPI * vsapi) { 162 | CASData * d = static_cast(instanceData); 163 | vsapi->freeNode(d->node); 164 | delete d; 165 | } 166 | 167 | static void VS_CC casCreate(const VSMap * in, VSMap * out, void * userData, VSCore * core, const VSAPI * vsapi) { 168 | using namespace std::literals; 169 | 170 | std::unique_ptr d = std::make_unique(); 171 | 172 | try { 173 | d->node = vsapi->propGetNode(in, "clip", 0, nullptr); 174 | d->vi = vsapi->getVideoInfo(d->node); 175 | int err; 176 | 177 | if (!isConstantFormat(d->vi) || 178 | (d->vi->format->sampleType == stInteger && d->vi->format->bitsPerSample > 16) || 179 | (d->vi->format->sampleType == stFloat && d->vi->format->bitsPerSample != 32)) 180 | throw 
"only constant format 8-16 bit integer and 32 bit float input supported"; 181 | 182 | for (int plane = 0; plane < d->vi->format->numPlanes; plane++) { 183 | if (d->vi->width >> (plane ? d->vi->format->subSamplingW : 0) < 3) 184 | throw "plane's width must be greater than or equal to 3"; 185 | 186 | if (d->vi->height >> (plane ? d->vi->format->subSamplingH : 0) < 3) 187 | throw "plane's height must be greater than or equal to 3"; 188 | } 189 | 190 | d->sharpness = static_cast(vsapi->propGetFloat(in, "sharpness", 0, &err)); 191 | if (err) 192 | d->sharpness = 0.5f; 193 | 194 | { 195 | const int m = vsapi->propNumElements(in, "planes"); 196 | 197 | if (m <= 0) { 198 | for (int i = 0; i < 3; i++) { 199 | d->process[i] = true; 200 | if (i == 0 && d->vi->format->colorFamily != cmRGB) 201 | break; 202 | } 203 | } 204 | 205 | for (int i = 0; i < m; i++) { 206 | const int n = int64ToIntS(vsapi->propGetInt(in, "planes", i, nullptr)); 207 | 208 | if (n < 0 || n >= d->vi->format->numPlanes) 209 | throw "plane index out of range"; 210 | 211 | if (d->process[n]) 212 | throw "plane specified twice"; 213 | 214 | d->process[n] = true; 215 | } 216 | } 217 | 218 | const int opt = int64ToIntS(vsapi->propGetInt(in, "opt", 0, &err)); 219 | 220 | if (d->sharpness < 0.0f || d->sharpness > 1.0f) 221 | throw "sharpness must be between 0.0 and 1.0 (inclusive)"; 222 | 223 | if (opt < 0 || opt > 4) 224 | throw "opt must be 0, 1, 2, 3, or 4"; 225 | 226 | { 227 | if (d->vi->format->bytesPerSample == 1) 228 | d->filter = filter_c; 229 | else if (d->vi->format->bytesPerSample == 2) 230 | d->filter = filter_c; 231 | else 232 | d->filter = filter_c; 233 | 234 | #ifdef CAS_X86 235 | const int iset = instrset_detect(); 236 | if ((opt == 0 && iset >= 10) || opt == 4) { 237 | if (d->vi->format->bytesPerSample == 1) 238 | d->filter = filter_avx512; 239 | else if (d->vi->format->bytesPerSample == 2) 240 | d->filter = filter_avx512; 241 | else 242 | d->filter = filter_avx512; 243 | } else if ((opt == 0 && 
iset >= 8) || opt == 3) { 244 | if (d->vi->format->bytesPerSample == 1) 245 | d->filter = filter_avx2; 246 | else if (d->vi->format->bytesPerSample == 2) 247 | d->filter = filter_avx2; 248 | else 249 | d->filter = filter_avx2; 250 | } else if ((opt == 0 && iset >= 2) || opt == 2) { 251 | if (d->vi->format->bytesPerSample == 1) 252 | d->filter = filter_sse2; 253 | else if (d->vi->format->bytesPerSample == 2) 254 | d->filter = filter_sse2; 255 | else 256 | d->filter = filter_sse2; 257 | } 258 | #endif 259 | } 260 | 261 | auto lerp = [](const float a, const float b, const float t) noexcept { return a + (b - a) * t; }; 262 | d->sharpness = -1.0f / lerp(16.0f, 5.0f, d->sharpness); 263 | 264 | if (d->vi->format->sampleType == stInteger) { 265 | d->limit = (1 << (d->vi->format->bitsPerSample + 1)) - 1; 266 | d->peak = (1 << d->vi->format->bitsPerSample) - 1; 267 | } else { 268 | d->limit = 2.0f; 269 | } 270 | } catch (const char * error) { 271 | vsapi->setError(out, ("CAS: "s + error).c_str()); 272 | vsapi->freeNode(d->node); 273 | return; 274 | } 275 | 276 | vsapi->createFilter(in, out, "CAS", casInit, casGetFrame, casFree, fmParallel, 0, d.release(), core); 277 | } 278 | 279 | ////////////////////////////////////////// 280 | // Init 281 | 282 | VS_EXTERNAL_API(void) VapourSynthPluginInit(VSConfigPlugin configFunc, VSRegisterFunction registerFunc, VSPlugin * plugin) { 283 | configFunc("com.holywu.cas", "cas", "Contrast Adaptive Sharpening", VAPOURSYNTH_API_VERSION, 1, plugin); 284 | registerFunc("CAS", 285 | "clip:clip;" 286 | "sharpness:float:opt;" 287 | "planes:int[]:opt;" 288 | "opt:int:opt;", 289 | casCreate, nullptr, plugin); 290 | } 291 | -------------------------------------------------------------------------------- /CAS/CAS.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | 6 | #include 7 | #include 8 | 9 | #ifdef CAS_X86 10 | #include "VCL2/vectorclass.h" 11 | #endif 12 | 13 | 
/// Per-instance state of the CAS filter. Allocated and filled in by casCreate(), read by the kernels and
/// released by casFree().
///
/// Every member has a default initialiser so that an instance is in a defined state however it is constructed;
/// casCreate() in particular depends on process[] starting out false.
struct CASData final {
    VSNodeRef * node = nullptr;         // input clip
    const VSVideoInfo * vi = nullptr;   // video info of the input clip, also published as the output format
    float sharpness = 0.0f;             // after casCreate(): negative lobe weight in -1/16 .. -1/5
    bool process[3] = {};               // per plane: true when the plane is to be sharpened
    std::any limit;                     // int for integer clips, float (2.0f) for float clips
    int peak = 0;                       // maximum sample value; only set for integer clips
    // Kernel chosen by casCreate() according to sample size and instruction set.
    void (*filter)(const VSFrameRef * src, VSFrameRef * dst, const CASData * const VS_RESTRICT data, const VSAPI * vsapi) noexcept = nullptr;
};
22 | -------------------------------------------------------------------------------- /CAS/CAS.vcxproj: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Release 6 | x64 7 | 8 | 9 | 10 | 16.0 11 | Win32Proj 12 | {9bb46411-255f-4695-b047-3d09ecdd1e41} 13 | CAS 14 | 10.0 15 | 16 | 17 | 18 | DynamicLibrary 19 | false 20 | v142 21 | true 22 | Unicode 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | C:\Program Files\VapourSynth\sdk\include\vapoursynth;$(IncludePath) 35 | 36 | 37 | 38 | CAS_X86;NDEBUG;%(PreprocessorDefinitions) 39 | Level3 40 | true 41 | false 42 | false 43 | true 44 | stdcpp17 45 | 46 | 47 | Windows 48 | true 49 | true 50 | 51 | 52 | 53 | 54 | 55 | AdvancedVectorExtensions2 56 | 57 | 58 | AdvancedVectorExtensions512 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | -------------------------------------------------------------------------------- /CAS/CAS.vcxproj.filters: -------------------------------------------------------------------------------- 1 |  2 | 3 | 4 | 5 | {4FC737F1-C7A5-4376-A066-2A32D752A2FF} 6 | cpp;c;cc;cxx;c++;def;odl;idl;hpj;bat;asm;asmx 7 | 8 | 9 | {93995380-89BD-4b04-88EB-625FBE52EBFB} 10 | h;hh;hpp;hxx;h++;hm;inl;inc;ipp;xsd 11 | 12 | 13 | {67DA6AB6-F800-4c08-8B7A-83BB121AAD01} 14 | rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav;mfcribbon-ms 15 | 16 | 17 | 18 | 19 | Source Files 20 | 21 | 22 | Source Files 23 | 24 | 25 | Source Files 26 | 27 | 28 | Source Files 29 | 30 | 31 | Source Files 32 | 33 | 34 | 35 | 36 | Header Files 37 | 38 | 39 | -------------------------------------------------------------------------------- /CAS/CAS_AVX2.cpp: 
-------------------------------------------------------------------------------- 1 | #ifdef CAS_X86 2 | #include "CAS.h" 3 | 4 | template 5 | void filter_avx2(const VSFrameRef * src, VSFrameRef * dst, const CASData * const VS_RESTRICT data, const VSAPI * vsapi) noexcept { 6 | using var_t = std::conditional_t, int, float>; 7 | using vec_t = std::conditional_t, Vec8i, Vec8f>; 8 | 9 | const vec_t limit = std::any_cast(data->limit); 10 | 11 | auto load = [](const pixel_t * srcp) noexcept { 12 | if constexpr (std::is_same_v) 13 | return vec_t().load_8uc(srcp); 14 | else if constexpr (std::is_same_v) 15 | return vec_t().load_8us(srcp); 16 | else 17 | return vec_t().load(srcp); 18 | }; 19 | 20 | auto store = [&](const Vec8f srcp, pixel_t * dstp) noexcept { 21 | if constexpr (std::is_same_v) { 22 | const auto result = compress_saturated_s2u(compress_saturated(truncatei(srcp + 0.5f), zero_si256()), zero_si256()).get_low(); 23 | result.storel(dstp); 24 | } else if constexpr (std::is_same_v) { 25 | const auto result = compress_saturated_s2u(truncatei(srcp + 0.5f), zero_si256()).get_low(); 26 | min(result, data->peak).store_nt(dstp); 27 | } else { 28 | srcp.store_nt(dstp); 29 | } 30 | }; 31 | 32 | auto filtering = [&](const vec_t a, const vec_t b, const vec_t c, const vec_t d, const vec_t e, const vec_t f, const vec_t g, const vec_t h, const vec_t i, 33 | const Vec8f chromaOffset) noexcept { 34 | // Soft min and max. 35 | // a b c b 36 | // d e f * 0.5 + d e f * 0.5 37 | // g h i h 38 | // These are 2.0x bigger (factored out the extra multiply). 
39 | vec_t mn = min(min(min(d, e), min(f, b)), h); 40 | const vec_t mn2 = min(min(min(mn, a), min(c, g)), i); 41 | mn += mn2; 42 | 43 | vec_t mx = max(max(max(d, e), max(f, b)), h); 44 | const vec_t mx2 = max(max(max(mx, a), max(c, g)), i); 45 | mx += mx2; 46 | 47 | if constexpr (std::is_floating_point_v) { 48 | mn += chromaOffset; 49 | mx += chromaOffset; 50 | } 51 | 52 | // Smooth minimum distance to signal limit divided by smooth max. 53 | Vec8f amp; 54 | if constexpr (std::is_integral_v) 55 | amp = min(max(to_float(min(mn, limit - mx)) / to_float(mx), 0.0f), 1.0f); 56 | else 57 | amp = min(max(min(mn, limit - mx) / mx, 0.0f), 1.0f); 58 | 59 | // Shaping amount of sharpening. 60 | amp = sqrt(amp); 61 | 62 | // Filter shape. 63 | // 0 w 0 64 | // w 1 w 65 | // 0 w 0 66 | const Vec8f weight = amp * data->sharpness; 67 | if constexpr (std::is_integral_v) 68 | return mul_add(to_float((b + d) + (f + h)), weight, to_float(e)) / mul_add(4.0f, weight, 1.0f); 69 | else 70 | return mul_add((b + d) + (f + h), weight, e) / mul_add(4.0f, weight, 1.0f); 71 | }; 72 | 73 | for (int plane = 0; plane < data->vi->format->numPlanes; plane++) { 74 | if (data->process[plane]) { 75 | const int width = vsapi->getFrameWidth(src, plane); 76 | const int height = vsapi->getFrameHeight(src, plane); 77 | const int stride = vsapi->getStride(src, plane) / sizeof(pixel_t); 78 | const pixel_t * srcp = reinterpret_cast(vsapi->getReadPtr(src, plane)); 79 | pixel_t * dstp = reinterpret_cast(vsapi->getWritePtr(dst, plane)); 80 | 81 | const Vec8f chromaOffset = plane ? 1.0f : 0.0f; 82 | 83 | const int regularPart = (width - 1) & ~(vec_t().size() - 1); 84 | 85 | for (int y = 0; y < height; y++) { 86 | const pixel_t * above = srcp + (y == 0 ? stride : -stride); 87 | const pixel_t * below = srcp + (y == height - 1 ? 
-stride : stride); 88 | 89 | { 90 | const vec_t b = load(above + 0); 91 | const vec_t e = load(srcp + 0); 92 | const vec_t h = load(below + 0); 93 | 94 | const vec_t a = permute8<1, 0, 1, 2, 3, 4, 5, 6>(b); 95 | const vec_t d = permute8<1, 0, 1, 2, 3, 4, 5, 6>(e); 96 | const vec_t g = permute8<1, 0, 1, 2, 3, 4, 5, 6>(h); 97 | 98 | vec_t c, f, i; 99 | if (width > vec_t().size()) { 100 | c = load(above + 1); 101 | f = load(srcp + 1); 102 | i = load(below + 1); 103 | } else { 104 | c = permute8<1, 2, 3, 4, 5, 6, 7, 6>(b); 105 | f = permute8<1, 2, 3, 4, 5, 6, 7, 6>(e); 106 | i = permute8<1, 2, 3, 4, 5, 6, 7, 6>(h); 107 | } 108 | 109 | const Vec8f result = filtering(a, b, c, 110 | d, e, f, 111 | g, h, i, 112 | chromaOffset); 113 | 114 | store(result, dstp + 0); 115 | } 116 | 117 | for (int x = vec_t().size(); x < regularPart; x += vec_t().size()) { 118 | const Vec8f result = filtering(load(above + x - 1), load(above + x), load(above + x + 1), 119 | load(srcp + x - 1), load(srcp + x), load(srcp + x + 1), 120 | load(below + x - 1), load(below + x), load(below + x + 1), 121 | chromaOffset); 122 | 123 | store(result, dstp + x); 124 | } 125 | 126 | if (regularPart >= vec_t().size()) { 127 | const vec_t a = load(above + regularPart - 1); 128 | const vec_t d = load(srcp + regularPart - 1); 129 | const vec_t g = load(below + regularPart - 1); 130 | 131 | const vec_t b = load(above + regularPart); 132 | const vec_t e = load(srcp + regularPart); 133 | const vec_t h = load(below + regularPart); 134 | 135 | const vec_t c = permute8<1, 2, 3, 4, 5, 6, 7, 6>(b); 136 | const vec_t f = permute8<1, 2, 3, 4, 5, 6, 7, 6>(e); 137 | const vec_t i = permute8<1, 2, 3, 4, 5, 6, 7, 6>(h); 138 | 139 | const Vec8f result = filtering(a, b, c, 140 | d, e, f, 141 | g, h, i, 142 | chromaOffset); 143 | 144 | store(result, dstp + regularPart); 145 | } 146 | 147 | srcp += stride; 148 | dstp += stride; 149 | } 150 | } 151 | } 152 | } 153 | 154 | template void filter_avx2(const VSFrameRef * src, VSFrameRef 
* dst, const CASData * const VS_RESTRICT data, const VSAPI * vsapi) noexcept; 155 | template void filter_avx2(const VSFrameRef * src, VSFrameRef * dst, const CASData * const VS_RESTRICT data, const VSAPI * vsapi) noexcept; 156 | template void filter_avx2(const VSFrameRef * src, VSFrameRef * dst, const CASData * const VS_RESTRICT data, const VSAPI * vsapi) noexcept; 157 | #endif 158 | -------------------------------------------------------------------------------- /CAS/CAS_AVX512.cpp: -------------------------------------------------------------------------------- 1 | #ifdef CAS_X86 2 | #include "CAS.h" 3 | 4 | template 5 | void filter_avx512(const VSFrameRef * src, VSFrameRef * dst, const CASData * const VS_RESTRICT data, const VSAPI * vsapi) noexcept { 6 | using var_t = std::conditional_t, int, float>; 7 | using vec_t = std::conditional_t, Vec16i, Vec16f>; 8 | 9 | const vec_t limit = std::any_cast(data->limit); 10 | 11 | auto load = [](const pixel_t * srcp) noexcept { 12 | if constexpr (std::is_same_v) 13 | return vec_t().load_16uc(srcp); 14 | else if constexpr (std::is_same_v) 15 | return vec_t().load_16us(srcp); 16 | else 17 | return vec_t().load(srcp); 18 | }; 19 | 20 | auto store = [&](const Vec16f srcp, pixel_t * dstp) noexcept { 21 | if constexpr (std::is_same_v) { 22 | const auto result = compress_saturated_s2u(compress_saturated(truncatei(srcp + 0.5f), zero_si512()), zero_si512()).get_low().get_low(); 23 | result.store_nt(dstp); 24 | } else if constexpr (std::is_same_v) { 25 | const auto result = compress_saturated_s2u(truncatei(srcp + 0.5f), zero_si512()).get_low(); 26 | min(result, data->peak).store_nt(dstp); 27 | } else { 28 | srcp.store_nt(dstp); 29 | } 30 | }; 31 | 32 | auto filtering = [&](const vec_t a, const vec_t b, const vec_t c, const vec_t d, const vec_t e, const vec_t f, const vec_t g, const vec_t h, const vec_t i, 33 | const Vec16f chromaOffset) noexcept { 34 | // Soft min and max. 
35 | // a b c b 36 | // d e f * 0.5 + d e f * 0.5 37 | // g h i h 38 | // These are 2.0x bigger (factored out the extra multiply). 39 | vec_t mn = min(min(min(d, e), min(f, b)), h); 40 | const vec_t mn2 = min(min(min(mn, a), min(c, g)), i); 41 | mn += mn2; 42 | 43 | vec_t mx = max(max(max(d, e), max(f, b)), h); 44 | const vec_t mx2 = max(max(max(mx, a), max(c, g)), i); 45 | mx += mx2; 46 | 47 | if constexpr (std::is_floating_point_v) { 48 | mn += chromaOffset; 49 | mx += chromaOffset; 50 | } 51 | 52 | // Smooth minimum distance to signal limit divided by smooth max. 53 | Vec16f amp; 54 | if constexpr (std::is_integral_v) 55 | amp = min(max(to_float(min(mn, limit - mx)) / to_float(mx), 0.0f), 1.0f); 56 | else 57 | amp = min(max(min(mn, limit - mx) / mx, 0.0f), 1.0f); 58 | 59 | // Shaping amount of sharpening. 60 | amp = sqrt(amp); 61 | 62 | // Filter shape. 63 | // 0 w 0 64 | // w 1 w 65 | // 0 w 0 66 | const Vec16f weight = amp * data->sharpness; 67 | if constexpr (std::is_integral_v) 68 | return mul_add(to_float((b + d) + (f + h)), weight, to_float(e)) / mul_add(4.0f, weight, 1.0f); 69 | else 70 | return mul_add((b + d) + (f + h), weight, e) / mul_add(4.0f, weight, 1.0f); 71 | }; 72 | 73 | for (int plane = 0; plane < data->vi->format->numPlanes; plane++) { 74 | if (data->process[plane]) { 75 | const int width = vsapi->getFrameWidth(src, plane); 76 | const int height = vsapi->getFrameHeight(src, plane); 77 | const int stride = vsapi->getStride(src, plane) / sizeof(pixel_t); 78 | const pixel_t * srcp = reinterpret_cast(vsapi->getReadPtr(src, plane)); 79 | pixel_t * dstp = reinterpret_cast(vsapi->getWritePtr(dst, plane)); 80 | 81 | const Vec16f chromaOffset = plane ? 1.0f : 0.0f; 82 | 83 | const int regularPart = (width - 1) & ~(vec_t().size() - 1); 84 | 85 | for (int y = 0; y < height; y++) { 86 | const pixel_t * above = srcp + (y == 0 ? stride : -stride); 87 | const pixel_t * below = srcp + (y == height - 1 ? 
-stride : stride); 88 | 89 | { 90 | const vec_t b = load(above + 0); 91 | const vec_t e = load(srcp + 0); 92 | const vec_t h = load(below + 0); 93 | 94 | const vec_t a = permute16<1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14>(b); 95 | const vec_t d = permute16<1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14>(e); 96 | const vec_t g = permute16<1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14>(h); 97 | 98 | vec_t c, f, i; 99 | if (width > vec_t().size()) { 100 | c = load(above + 1); 101 | f = load(srcp + 1); 102 | i = load(below + 1); 103 | } else { 104 | c = permute16<1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 14>(b); 105 | f = permute16<1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 14>(e); 106 | i = permute16<1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 14>(h); 107 | } 108 | 109 | const Vec16f result = filtering(a, b, c, 110 | d, e, f, 111 | g, h, i, 112 | chromaOffset); 113 | 114 | store(result, dstp + 0); 115 | } 116 | 117 | for (int x = vec_t().size(); x < regularPart; x += vec_t().size()) { 118 | const Vec16f result = filtering(load(above + x - 1), load(above + x), load(above + x + 1), 119 | load(srcp + x - 1), load(srcp + x), load(srcp + x + 1), 120 | load(below + x - 1), load(below + x), load(below + x + 1), 121 | chromaOffset); 122 | 123 | store(result, dstp + x); 124 | } 125 | 126 | if (regularPart >= vec_t().size()) { 127 | const vec_t a = load(above + regularPart - 1); 128 | const vec_t d = load(srcp + regularPart - 1); 129 | const vec_t g = load(below + regularPart - 1); 130 | 131 | const vec_t b = load(above + regularPart); 132 | const vec_t e = load(srcp + regularPart); 133 | const vec_t h = load(below + regularPart); 134 | 135 | const vec_t c = permute16<1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 14>(b); 136 | const vec_t f = permute16<1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 14>(e); 137 | const vec_t i = permute16<1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 14>(h); 138 | 139 | const Vec16f 
result = filtering(a, b, c, 140 | d, e, f, 141 | g, h, i, 142 | chromaOffset); 143 | 144 | store(result, dstp + regularPart); 145 | } 146 | 147 | srcp += stride; 148 | dstp += stride; 149 | } 150 | } 151 | } 152 | } 153 | 154 | template void filter_avx512(const VSFrameRef * src, VSFrameRef * dst, const CASData * const VS_RESTRICT data, const VSAPI * vsapi) noexcept; 155 | template void filter_avx512(const VSFrameRef * src, VSFrameRef * dst, const CASData * const VS_RESTRICT data, const VSAPI * vsapi) noexcept; 156 | template void filter_avx512(const VSFrameRef * src, VSFrameRef * dst, const CASData * const VS_RESTRICT data, const VSAPI * vsapi) noexcept; 157 | #endif 158 | -------------------------------------------------------------------------------- /CAS/CAS_SSE2.cpp: -------------------------------------------------------------------------------- 1 | #ifdef CAS_X86 2 | #include "CAS.h" 3 | 4 | template 5 | void filter_sse2(const VSFrameRef * src, VSFrameRef * dst, const CASData * const VS_RESTRICT data, const VSAPI * vsapi) noexcept { 6 | using var_t = std::conditional_t, int, float>; 7 | using vec_t = std::conditional_t, Vec4i, Vec4f>; 8 | 9 | const vec_t limit = std::any_cast(data->limit); 10 | 11 | auto load = [](const pixel_t * srcp) noexcept { 12 | if constexpr (std::is_same_v) 13 | return vec_t().load_4uc(srcp); 14 | else if constexpr (std::is_same_v) 15 | return vec_t().load_4us(srcp); 16 | else 17 | return vec_t().load(srcp); 18 | }; 19 | 20 | auto store = [&](const Vec4f srcp, pixel_t * dstp) noexcept { 21 | if constexpr (std::is_same_v) { 22 | const auto result = compress_saturated_s2u(compress_saturated(truncatei(srcp + 0.5f), zero_si128()), zero_si128()); 23 | result.store_si32(dstp); 24 | } else if constexpr (std::is_same_v) { 25 | const auto result = compress_saturated_s2u(truncatei(srcp + 0.5f), zero_si128()); 26 | min(result, data->peak).storel(dstp); 27 | } else { 28 | srcp.store_nt(dstp); 29 | } 30 | }; 31 | 32 | auto filtering = [&](const 
vec_t a, const vec_t b, const vec_t c, const vec_t d, const vec_t e, const vec_t f, const vec_t g, const vec_t h, const vec_t i, 33 | const Vec4f chromaOffset) noexcept { 34 | // Soft min and max. 35 | // a b c b 36 | // d e f * 0.5 + d e f * 0.5 37 | // g h i h 38 | // These are 2.0x bigger (factored out the extra multiply). 39 | vec_t mn = min(min(min(d, e), min(f, b)), h); 40 | const vec_t mn2 = min(min(min(mn, a), min(c, g)), i); 41 | mn += mn2; 42 | 43 | vec_t mx = max(max(max(d, e), max(f, b)), h); 44 | const vec_t mx2 = max(max(max(mx, a), max(c, g)), i); 45 | mx += mx2; 46 | 47 | if constexpr (std::is_floating_point_v) { 48 | mn += chromaOffset; 49 | mx += chromaOffset; 50 | } 51 | 52 | // Smooth minimum distance to signal limit divided by smooth max. 53 | Vec4f amp; 54 | if constexpr (std::is_integral_v) 55 | amp = min(max(to_float(min(mn, limit - mx)) / to_float(mx), 0.0f), 1.0f); 56 | else 57 | amp = min(max(min(mn, limit - mx) / mx, 0.0f), 1.0f); 58 | 59 | // Shaping amount of sharpening. 60 | amp = sqrt(amp); 61 | 62 | // Filter shape. 63 | // 0 w 0 64 | // w 1 w 65 | // 0 w 0 66 | const Vec4f weight = amp * data->sharpness; 67 | if constexpr (std::is_integral_v) 68 | return mul_add(to_float((b + d) + (f + h)), weight, to_float(e)) / mul_add(4.0f, weight, 1.0f); 69 | else 70 | return mul_add((b + d) + (f + h), weight, e) / mul_add(4.0f, weight, 1.0f); 71 | }; 72 | 73 | for (int plane = 0; plane < data->vi->format->numPlanes; plane++) { 74 | if (data->process[plane]) { 75 | const int width = vsapi->getFrameWidth(src, plane); 76 | const int height = vsapi->getFrameHeight(src, plane); 77 | const int stride = vsapi->getStride(src, plane) / sizeof(pixel_t); 78 | const pixel_t * srcp = reinterpret_cast(vsapi->getReadPtr(src, plane)); 79 | pixel_t * dstp = reinterpret_cast(vsapi->getWritePtr(dst, plane)); 80 | 81 | const Vec4f chromaOffset = plane ? 
1.0f : 0.0f; 82 | 83 | const int regularPart = (width - 1) & ~(vec_t().size() - 1); 84 | 85 | for (int y = 0; y < height; y++) { 86 | const pixel_t * above = srcp + (y == 0 ? stride : -stride); 87 | const pixel_t * below = srcp + (y == height - 1 ? -stride : stride); 88 | 89 | { 90 | const vec_t b = load(above + 0); 91 | const vec_t e = load(srcp + 0); 92 | const vec_t h = load(below + 0); 93 | 94 | const vec_t a = permute4<1, 0, 1, 2>(b); 95 | const vec_t d = permute4<1, 0, 1, 2>(e); 96 | const vec_t g = permute4<1, 0, 1, 2>(h); 97 | 98 | vec_t c, f, i; 99 | if (width > vec_t().size()) { 100 | c = load(above + 1); 101 | f = load(srcp + 1); 102 | i = load(below + 1); 103 | } else { 104 | c = permute4<1, 2, 3, 2>(b); 105 | f = permute4<1, 2, 3, 2>(e); 106 | i = permute4<1, 2, 3, 2>(h); 107 | } 108 | 109 | const Vec4f result = filtering(a, b, c, 110 | d, e, f, 111 | g, h, i, 112 | chromaOffset); 113 | 114 | store(result, dstp + 0); 115 | } 116 | 117 | for (int x = vec_t().size(); x < regularPart; x += vec_t().size()) { 118 | const Vec4f result = filtering(load(above + x - 1), load(above + x), load(above + x + 1), 119 | load(srcp + x - 1), load(srcp + x), load(srcp + x + 1), 120 | load(below + x - 1), load(below + x), load(below + x + 1), 121 | chromaOffset); 122 | 123 | store(result, dstp + x); 124 | } 125 | 126 | if (regularPart >= vec_t().size()) { 127 | const vec_t a = load(above + regularPart - 1); 128 | const vec_t d = load(srcp + regularPart - 1); 129 | const vec_t g = load(below + regularPart - 1); 130 | 131 | const vec_t b = load(above + regularPart); 132 | const vec_t e = load(srcp + regularPart); 133 | const vec_t h = load(below + regularPart); 134 | 135 | const vec_t c = permute4<1, 2, 3, 2>(b); 136 | const vec_t f = permute4<1, 2, 3, 2>(e); 137 | const vec_t i = permute4<1, 2, 3, 2>(h); 138 | 139 | const Vec4f result = filtering(a, b, c, 140 | d, e, f, 141 | g, h, i, 142 | chromaOffset); 143 | 144 | store(result, dstp + regularPart); 145 | } 146 | 147 | 
srcp += stride; 148 | dstp += stride; 149 | } 150 | } 151 | } 152 | } 153 | 154 | template void filter_sse2(const VSFrameRef * src, VSFrameRef * dst, const CASData * const VS_RESTRICT data, const VSAPI * vsapi) noexcept; 155 | template void filter_sse2(const VSFrameRef * src, VSFrameRef * dst, const CASData * const VS_RESTRICT data, const VSAPI * vsapi) noexcept; 156 | template void filter_sse2(const VSFrameRef * src, VSFrameRef * dst, const CASData * const VS_RESTRICT data, const VSAPI * vsapi) noexcept; 157 | #endif 158 | -------------------------------------------------------------------------------- /CAS/VCL2/LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 
29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 
61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | 179 | Copyright 2012-2019 Agner Fog. 180 | 181 | Licensed under the Apache License, Version 2.0 (the "License"); 182 | you may not use this file except in compliance with the License. 
183 | You may obtain a copy of the License at 184 | 185 | http://www.apache.org/licenses/LICENSE-2.0 186 | 187 | Unless required by applicable law or agreed to in writing, software 188 | distributed under the License is distributed on an "AS IS" BASIS, 189 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 190 | See the License for the specific language governing permissions and 191 | limitations under the License. 192 | -------------------------------------------------------------------------------- /CAS/VCL2/instrset.h: -------------------------------------------------------------------------------- 1 | /**************************** instrset.h ********************************** 2 | * Author: Agner Fog 3 | * Date created: 2012-05-30 4 | * Last modified: 2020-06-08 5 | * Version: 2.01.03 6 | * Project: vector class library 7 | * Description: 8 | * Header file for various compiler-specific tasks as well as common 9 | * macros and templates. This file contains: 10 | * 11 | * > Selection of the supported instruction set 12 | * > Defines compiler version macros 13 | * > Undefines certain macros that prevent function overloading 14 | * > Helper functions that depend on instruction set, compiler, or platform 15 | * > Common templates for permute, blend, etc. 16 | * 17 | * For instructions, see vcl_manual.pdf 18 | * 19 | * (c) Copyright 2012-2020 Agner Fog. 20 | * Apache License version 2.0 or later. 21 | ******************************************************************************/ 22 | 23 | #ifndef INSTRSET_H 24 | #define INSTRSET_H 20102 25 | 26 | 27 | // Allow the use of floating point permute instructions on integer vectors. 28 | // Some CPU's have an extra latency of 1 or 2 clock cycles for this, but 29 | // it may still be faster than alternative implementations: 30 | #define ALLOW_FP_PERMUTE true 31 | 32 | 33 | // Macro to indicate 64 bit mode 34 | #if (defined(_M_AMD64) || defined(_M_X64) || defined(__amd64) ) && ! 
defined(__x86_64__) 35 | #define __x86_64__ 1 // There are many different macros for this, decide on only one 36 | #endif 37 | 38 | // The following values of INSTRSET are currently defined: 39 | // 2: SSE2 40 | // 3: SSE3 41 | // 4: SSSE3 42 | // 5: SSE4.1 43 | // 6: SSE4.2 44 | // 7: AVX 45 | // 8: AVX2 46 | // 9: AVX512F 47 | // 10: AVX512BW/DQ/VL 48 | // In the future, INSTRSET = 11 may include AVX512VBMI and AVX512VBMI2, but this 49 | // decision cannot be made before the market situation for CPUs with these 50 | // instruction sets is known (these future instruction set extensions are already 51 | // used in some VCL functions and tested with an emulator) 52 | 53 | // Find instruction set from compiler macros if INSTRSET is not defined. 54 | // Note: Most of these macros are not defined in Microsoft compilers 55 | #ifndef INSTRSET 56 | #if defined ( __AVX512VL__ ) && defined ( __AVX512BW__ ) && defined ( __AVX512DQ__ ) 57 | #define INSTRSET 10 58 | #elif defined ( __AVX512F__ ) || defined ( __AVX512__ ) 59 | #define INSTRSET 9 60 | #elif defined ( __AVX2__ ) 61 | #define INSTRSET 8 62 | #elif defined ( __AVX__ ) 63 | #define INSTRSET 7 64 | #elif defined ( __SSE4_2__ ) 65 | #define INSTRSET 6 66 | #elif defined ( __SSE4_1__ ) 67 | #define INSTRSET 5 68 | #elif defined ( __SSSE3__ ) 69 | #define INSTRSET 4 70 | #elif defined ( __SSE3__ ) 71 | #define INSTRSET 3 72 | #elif defined ( __SSE2__ ) || defined ( __x86_64__ ) 73 | #define INSTRSET 2 74 | #elif defined ( __SSE__ ) 75 | #define INSTRSET 1 76 | #elif defined ( _M_IX86_FP ) // Defined in MS compiler. 1: SSE, 2: SSE2 77 | #define INSTRSET _M_IX86_FP 78 | #else 79 | #define INSTRSET 0 80 | #endif // instruction set defines 81 | #endif // INSTRSET 82 | 83 | // Include the appropriate header file for intrinsic functions 84 | #if INSTRSET > 7 // AVX2 and later 85 | #if defined (__GNUC__) && ! 
defined (__INTEL_COMPILER) 86 | #include // x86intrin.h includes header files for whatever instruction 87 | // sets are specified on the compiler command line, such as: 88 | // xopintrin.h, fma4intrin.h 89 | #else 90 | #include // MS/Intel version of immintrin.h covers AVX and later 91 | #endif // __GNUC__ 92 | #elif INSTRSET == 7 93 | #include // AVX 94 | #elif INSTRSET == 6 95 | #include // SSE4.2 96 | #elif INSTRSET == 5 97 | #include // SSE4.1 98 | #elif INSTRSET == 4 99 | #include // SSSE3 100 | #elif INSTRSET == 3 101 | #include // SSE3 102 | #elif INSTRSET == 2 103 | #include // SSE2 104 | #elif INSTRSET == 1 105 | #include // SSE 106 | #endif // INSTRSET 107 | 108 | #if INSTRSET >= 8 && !defined(__FMA__) 109 | // Assume that all processors that have AVX2 also have FMA3 110 | #if defined (__GNUC__) && ! defined (__INTEL_COMPILER) 111 | // Prevent error message in g++ and Clang when using FMA intrinsics with avx2: 112 | #if !defined(DISABLE_WARNING_AVX2_WITHOUT_FMA) 113 | #pragma message "It is recommended to specify also option -mfma when using -mavx2 or higher" 114 | #endif 115 | #elif ! defined (__clang__) 116 | #define __FMA__ 1 117 | #endif 118 | #endif 119 | 120 | // AMD instruction sets 121 | #if defined (__XOP__) || defined (__FMA4__) 122 | #ifdef __GNUC__ 123 | #include // AMD XOP (Gnu) 124 | #else 125 | #include // AMD XOP (Microsoft) 126 | #endif // __GNUC__ 127 | #elif defined (__SSE4A__) // AMD SSE4A 128 | #include 129 | #endif // __XOP__ 130 | 131 | // FMA3 instruction set 132 | #if defined (__FMA__) && (defined(__GNUC__) || defined(__clang__)) && ! 
defined (__INTEL_COMPILER) 133 | #include 134 | #endif // __FMA__ 135 | 136 | // FMA4 instruction set 137 | #if defined (__FMA4__) && (defined(__GNUC__) || defined(__clang__)) 138 | #include // must have both x86intrin.h and fma4intrin.h, don't know why 139 | #endif // __FMA4__ 140 | 141 | 142 | #include // Define integer types with known size 143 | #include // define abs(int) 144 | 145 | #ifdef _MSC_VER // Microsoft compiler or compatible Intel compiler 146 | #include // define _BitScanReverse(int), __cpuid(int[4],int), _xgetbv(int) 147 | #endif // _MSC_VER 148 | 149 | 150 | // functions in instrset_detect.cpp: 151 | #ifdef VCL_NAMESPACE 152 | namespace VCL_NAMESPACE { 153 | #endif 154 | int instrset_detect(void); // tells which instruction sets are supported 155 | bool hasFMA3(void); // true if FMA3 instructions supported 156 | bool hasFMA4(void); // true if FMA4 instructions supported 157 | bool hasXOP(void); // true if XOP instructions supported 158 | bool hasAVX512ER(void); // true if AVX512ER instructions supported 159 | bool hasAVX512VBMI(void); // true if AVX512VBMI instructions supported 160 | bool hasAVX512VBMI2(void); // true if AVX512VBMI2 instructions supported 161 | #ifdef VCL_NAMESPACE 162 | } 163 | #endif 164 | 165 | // functions in physical_processors.cpp: 166 | int physicalProcessors(int * logical_processors = 0); 167 | 168 | 169 | // GCC version 170 | #if defined(__GNUC__) && !defined (GCC_VERSION) && !defined (__clang__) 171 | #define GCC_VERSION ((__GNUC__) * 10000 + (__GNUC_MINOR__) * 100 + (__GNUC_PATCHLEVEL__)) 172 | #endif 173 | 174 | // Clang version 175 | #if defined (__clang__) 176 | #define CLANG_VERSION ((__clang_major__) * 10000 + (__clang_minor__) * 100 + (__clang_patchlevel__)) 177 | // Problem: The version number is not consistent across platforms 178 | // http://llvm.org/bugs/show_bug.cgi?id=12643 179 | // Apple bug 18746972 180 | #endif 181 | 182 | // Fix problem with non-overloadable macros named min and max in WinDef.h 183 | 
#ifdef _MSC_VER
#if defined (_WINDEF_) && defined(min) && defined(max)
#undef min
#undef max
#endif
#ifndef NOMINMAX
#define NOMINMAX
#endif

// warning for poor support for AVX512F in MS compiler
#ifndef __INTEL_COMPILER
#if INSTRSET == 9
#pragma message("Warning: MS compiler cannot generate code for AVX512F without AVX512DQ")
#endif
#if _MSC_VER < 1920 && INSTRSET > 8
#pragma message("Warning: Your compiler has poor support for AVX512. Code may be erroneous.\nPlease use a newer compiler version or a different compiler!")
#endif
#endif // __INTEL_COMPILER
#endif // _MSC_VER

/* Intel compiler problem:
The Intel compiler currently cannot compile version 2.00 of VCL. It seems to have
a problem with constexpr function returns not being constant enough.
*/
#if defined(__INTEL_COMPILER) && __INTEL_COMPILER < 9999
#error The Intel compiler version 19.00 cannot compile VCL version 2. Use Version 1.xx of VCL instead
#endif

/* Clang problem:
The Clang compiler treats the intrinsic vector types __m128, __m128i, and __m128d as identical.
See the bug report at https://bugs.llvm.org/show_bug.cgi?id=17164
Additional problem: The version number is not consistent across platforms. The Apple build has
different version numbers. We have to rely on __apple_build_version__ on the Mac platform:
http://llvm.org/bugs/show_bug.cgi?id=12643
We have to make switches here when - hopefully - the error some day has been fixed.
We need different version checks with and without __apple_build_version__
*/
#if (defined (__clang__) || defined(__apple_build_version__)) && !defined(__INTEL_COMPILER)
#define FIX_CLANG_VECTOR_ALIAS_AMBIGUITY
#endif

#if defined (GCC_VERSION) && GCC_VERSION < 99999 && !defined(__clang__)
#define ZEXT_MISSING  // Gcc 7.4.0 does not have _mm256_zextsi128_si256 and similar functions
#endif


// Optional namespace wrapper. The matching closing brace is not in this part
// of the file.
#ifdef VCL_NAMESPACE
namespace VCL_NAMESPACE {
#endif

// Constant for indicating don't care in permute and blend functions.
// V_DC is -256 in Vector class library version 1.xx
// V_DC can be any value less than -1 in Vector class library version 2.00
constexpr int V_DC = -256;


/*****************************************************************************
*
*    Helper functions that depend on instruction set, compiler, or platform
*
*****************************************************************************/

// Define interface to cpuid instruction.
// input:  functionnumber = leaf (eax), ecxleaf = subleaf(ecx)
// output: output[0] = eax, output[1] = ebx, output[2] = ecx, output[3] = edx
static inline void cpuid(int output[4], int functionnumber, int ecxleaf = 0) {
#if defined(__GNUC__) || defined(__clang__)  // use inline assembly, Gnu/AT&T syntax
    int a, b, c, d;
    __asm("cpuid" : "=a"(a), "=b"(b), "=c"(c), "=d"(d) : "a"(functionnumber), "c"(ecxleaf) : );
    output[0] = a;
    output[1] = b;
    output[2] = c;
    output[3] = d;

#elif defined (_MSC_VER)  // Microsoft compiler, intrin.h included
    __cpuidex(output, functionnumber, ecxleaf);  // intrinsic function for CPUID

#else  // unknown platform. try inline assembly with masm/intel syntax
    __asm {
        mov eax, functionnumber
        mov ecx, ecxleaf
        cpuid;
        mov esi, output
        mov[esi], eax
        mov[esi + 4], ebx
        mov[esi + 8], ecx
        mov[esi + 12], edx
    }
#endif
}


// Define popcount function. Gives sum of bits
// Note: the 64-bit overload returns int64_t in the SSE4.2 branch but int32_t
// in the fallback branch; the 32-bit overload returns uint32_t in both.
#if INSTRSET >= 6  // SSE4.2
// popcnt instruction is not officially part of the SSE4.2 instruction set,
// but available in all known processors with SSE4.2
static inline uint32_t vml_popcnt(uint32_t a) {
    return (uint32_t)_mm_popcnt_u32(a);  // Intel intrinsic. Supported by gcc and clang
}
#ifdef __x86_64__
static inline int64_t vml_popcnt(uint64_t a) {
    return _mm_popcnt_u64(a);  // Intel intrinsic.
}
#else  // 32 bit mode
static inline int64_t vml_popcnt(uint64_t a) {
    // count the two 32-bit halves separately
    return _mm_popcnt_u32(uint32_t(a >> 32)) + _mm_popcnt_u32(uint32_t(a));
}
#endif
#else  // no SSE4.2
static inline uint32_t vml_popcnt(uint32_t a) {
    // popcnt instruction not available
    uint32_t b = a - ((a >> 1) & 0x55555555);                 // bit count of each 2-bit group
    uint32_t c = (b & 0x33333333) + ((b >> 2) & 0x33333333);  // bit count of each 4-bit group
    uint32_t d = (c + (c >> 4)) & 0x0F0F0F0F;                 // bit count of each byte
    uint32_t e = d * 0x01010101;                              // sum of the four bytes ends up in the top byte
    return e >> 24;
}

static inline int32_t vml_popcnt(uint64_t a) {
    // count the two 32-bit halves separately
    return vml_popcnt(uint32_t(a >> 32)) + vml_popcnt(uint32_t(a));
}

#endif

// Define bit-scan-forward function.
// Gives index to lowest set bit
// NOTE(review): in the inline-assembly variant the result variable is never
// initialized before the instruction runs, so the value returned for a == 0
// is presumably not meaningful - callers should test for zero first.
#if defined (__GNUC__) || defined(__clang__)
// gcc and Clang have no bit_scan_forward intrinsic
#if defined(__clang__)  // fix clang bug
// Clang uses a k register as parameter a when inlined from horizontal_find_first
__attribute__((noinline))
#endif
static uint32_t bit_scan_forward(uint32_t a) {
    uint32_t r;
    __asm("bsfl %1, %0" : "=r"(r) : "r"(a) : );
    return r;
}
static inline uint32_t bit_scan_forward(uint64_t a) {
    // scan the low half first; only fall through to the high half when it is empty
    uint32_t lo = uint32_t(a);
    if (lo) return bit_scan_forward(lo);
    uint32_t hi = uint32_t(a >> 32);
    return bit_scan_forward(hi) + 32;
}

#else  // other compilers
static inline uint32_t bit_scan_forward(uint32_t a) {
    unsigned long r;
    _BitScanForward(&r, a);  // defined in intrin.h for MS and Intel compilers
    return r;
}
#ifdef __x86_64__
static inline uint32_t bit_scan_forward(uint64_t a) {
    unsigned long r;
    _BitScanForward64(&r, a);  // defined in intrin.h for MS and Intel compilers
    return (uint32_t)r;
}
#else
static inline uint32_t bit_scan_forward(uint64_t a) {
    // scan the low half first; only fall through to the high half when it is empty
    uint32_t lo = uint32_t(a);
    if (lo) return bit_scan_forward(lo);
    uint32_t hi = uint32_t(a >> 32);
    return bit_scan_forward(hi) + 32;
}
#endif
#endif


// Define bit-scan-reverse function.
// Gives index to highest set bit = floor(log2(a))
#if defined (__GNUC__) || defined(__clang__)
static inline uint32_t bit_scan_reverse(uint32_t a) __attribute__((pure));
static inline uint32_t bit_scan_reverse(uint32_t a) {
    uint32_t r;
    __asm("bsrl %1, %0" : "=r"(r) : "r"(a) : );
    return r;
}
#ifdef __x86_64__
static inline uint32_t bit_scan_reverse(uint64_t a) {
    uint64_t r;
    __asm("bsrq %1, %0" : "=r"(r) : "r"(a) : );
    return uint32_t(r);  // bit index; the narrowing is made explicit
}
#else  // 32 bit mode
static inline uint32_t bit_scan_reverse(uint64_t a) {
    // scan the high half first; use the low half only when the high half is empty
    uint64_t ahi = a >> 32;
    if (ahi == 0) return bit_scan_reverse(uint32_t(a));
    else return bit_scan_reverse(uint32_t(ahi)) + 32;
}
#endif
#else
static inline uint32_t bit_scan_reverse(uint32_t a) {
    unsigned long r;
    _BitScanReverse(&r, a);  // defined in intrin.h for MS and Intel compilers
    return r;
}
#ifdef __x86_64__
static inline uint32_t bit_scan_reverse(uint64_t a) {
    unsigned long r;
    _BitScanReverse64(&r, a);  // defined in intrin.h for MS and Intel compilers
    return r;
}
#else  // 32 bit mode
static inline uint32_t bit_scan_reverse(uint64_t a) {
    // scan the high half first; use the low half only when the high half is empty
    uint64_t ahi = a >> 32;
    if (ahi == 0) return bit_scan_reverse(uint32_t(a));
    else return bit_scan_reverse(uint32_t(ahi)) + 32;
}
#endif
#endif

// Same function, for compile-time constants
// Returns -1 for n == 0, otherwise the index of the highest set bit.
constexpr int bit_scan_reverse_const(uint64_t const n) {
    if (n == 0) return -1;
    uint64_t a = n, b = 0, j = 64, k = 0;
    // binary search: halve the step j each round and shift a down whenever
    // it still has a set bit at or above position j
    do {
        j >>= 1;
        k = (uint64_t)1 << j;
        if (a >= k) {
            a >>= j;
            b += j;
        }
    } while (j > 0);
    return int(b);
}


/*****************************************************************************
*
*    Common templates
*
*****************************************************************************/

// Template class to represent compile-time integer constant
template <int32_t  n> class Const_int_t {};   // represent compile-time signed integer constant
template <uint32_t n> class Const_uint_t {};  // represent compile-time unsigned integer constant
#define const_int(n)  (Const_int_t <n>())     // n must be compile-time integer constant
#define const_uint(n) (Const_uint_t<n>())     // n must be compile-time unsigned integer constant


// template for producing quiet NAN
// VTYPE must provide a static elementtype() whose lowest bit is set for double
// precision, and must be constructible from a float or double scalar.
template <class VTYPE>
static inline VTYPE nan_vec(uint32_t payload = 0x100) {
    if constexpr ((VTYPE::elementtype() & 1) != 0) {  // double
        union {
            uint64_t q;
            double f;
        } ud;
        // payload is left justified to avoid loss of NAN payload when converting to float
        ud.q = 0x7FF8000000000000 | uint64_t(payload) << 29;
        return VTYPE(ud.f);
    }
    // float will be converted to double if necessary
    union {
        uint32_t i;
        float f;
    } uf;
    uf.i = 0x7FC00000 | (payload & 0x003FFFFF);
    return VTYPE(uf.f);
}


// Test if a parameter is a compile-time constant
/* Unfortunately, this works only for macro parameters, not for inline function parameters.
   I hope that some solution will appear in the future, but for now it appears to be
   impossible to check if a function parameter is a compile-time constant.
   This would be useful in operator / and in function pow:
   #if defined(__GNUC__) || defined (__clang__)
   #define is_constant(a) __builtin_constant_p(a)
   #else
   #define is_constant(a) false
   #endif
*/


/*****************************************************************************
*
*    Helper functions for permute and blend functions
*
******************************************************************************
Rules for constexpr functions:

> All variable declarations must include initialization

> Do not put variable declarations inside a for-clause, e.g. avoid:  for (int i=0; ..
  Instead, you have to declare the loop counter before the for-loop.

> Do not make constexpr functions that return vector types. This requires type
  punning with a union, which is not allowed in constexpr functions under C++17.
  It may be possible under C++20

*****************************************************************************/

// Define type for Encapsulated array to use as return type:
template <typename T, int N>
struct EList {
    T a[N];
};


// get_inttype: get an integer of a size that matches the element size
// of vector class V with the value -1
template <typename V>
constexpr auto get_inttype() {
    constexpr int elementsize = sizeof(V) / V::size();  // size of vector elements

    if constexpr (elementsize >= 8) {
        return -int64_t(1);
    }
    else if constexpr (elementsize >= 4) {
        return int32_t(-1);
    }
    else if constexpr (elementsize >= 2) {
        return int16_t(-1);
    }
    else {
        return int8_t(-1);
    }
}


// zero_mask: return a compact bit mask for zeroing using AVX512 mask.
// Parameter a is a reference to a constexpr int array of permutation indexes
// Bit i of the result is 1 when a[i] >= 0. The return type is the smallest of
// uint8_t, uint16_t, uint32_t, uint64_t that has at least N bits.
template <int N>
constexpr auto zero_mask(int const (&a)[N]) {
    uint64_t mask = 0;
    int i = 0;

    for (i = 0; i < N; i++) {
        if (a[i] >= 0) mask |= uint64_t(1) << i;
    }
    if constexpr      (N <= 8 ) return uint8_t(mask);
    else if constexpr (N <= 16) return uint16_t(mask);
    else if constexpr (N <= 32) return uint32_t(mask);
    else return mask;
}


// zero_mask_broad: return a broad byte mask for zeroing.
// Parameter A is a reference to a constexpr int array of permutation indexes
// Element i of the result is -1 (all bits set) when A[i] >= 0, otherwise 0.
template <typename V>
constexpr auto zero_mask_broad(int const (&A)[V::size()]) {
    constexpr int N = V::size();                 // number of vector elements
    typedef decltype(get_inttype<V>()) Etype;    // element type
    EList <Etype, N> u = {{0}};                  // list for return
    int i = 0;
    for (i = 0; i < N; i++) {
        u.a[i] = A[i] >= 0 ? get_inttype<V>() : 0;
    }
    return u;                                    // return encapsulated array
}


// make_bit_mask: return a compact mask of bits from a list of N indexes:
// B contains options indicating how to gather the mask
// bit 0-7 in B indicates which bit in each index to collect
// bit 8 = 0x100:  set 1 in the lower half of the bit mask if the indicated bit is 1.
// bit 8 = 0    :  set 1 in the lower half of the bit mask if the indicated bit is 0.
// bit 9 = 0x200:  set 1 in the upper half of the bit mask if the indicated bit is 1.
// bit 9 = 0    :  set 1 in the upper half of the bit mask if the indicated bit is 0.
// bit 10 = 0x400: set 1 in the bit mask if the corresponding index is -1 or V_DC
// Parameter a is a reference to a constexpr int array of permutation indexes
template <int N, int B>
constexpr uint64_t make_bit_mask(int const (&a)[N]) {
    uint64_t r = 0;                    // return value
    uint8_t  j = uint8_t(B & 0xFF);    // index to selected bit
    uint64_t s = 0;                    // value of bit number i in r
    uint64_t f = 0;                    // 1 if bit not flipped
    int i = 0;
    for (i = 0; i < N; i++) {
        int ix = a[i];
        if (ix < 0) {                  // -1 or V_DC
            s = (B >> 10) & 1;
        }
        else {
            s = ((uint32_t)ix >> j) & 1;  // extract selected bit
            if (i < N/2) {
                f = (B >> 8) & 1;      // lower half
            }
            else {
                f = (B >> 9) & 1;      // upper half
            }
            s ^= f ^ 1;                // flip bit if needed
        }
        r |= uint64_t(s) << i;         // set bit in return value
    }
    return r;
}


// make_broad_mask: Convert a bit mask m to a broad mask
// The return value will be a broad boolean mask with elementsize matching vector class V
template <typename V>
constexpr auto make_broad_mask(uint64_t const m) {
    constexpr int N = V::size();                 // number of vector elements
    typedef decltype(get_inttype<V>()) Etype;    // element type
    EList <Etype, N> u = {{0}};                  // list for returning
    int i = 0;
    for (i = 0; i < N; i++) {
        u.a[i] = ((m >> i) & 1) != 0 ? get_inttype<V>() : 0;
    }
    return u;                                    // return encapsulated array
}


// perm_mask_broad: return a mask for permutation by a vector register index.
// Parameter A is a reference to a constexpr int array of permutation indexes
// Each index is converted to the integer type that matches the element size of V.
template <typename V>
constexpr auto perm_mask_broad(int const (&A)[V::size()]) {
    constexpr int N = V::size();                 // number of vector elements
    typedef decltype(get_inttype<V>()) Etype;    // vector element type
    EList <Etype, N> u = {{0}};                  // list for returning
    int i = 0;
    for (i = 0; i < N; i++) {
        u.a[i] = Etype(A[i]);
    }
    return u;                                    // return encapsulated array
}


// perm_flags: returns information about how a permute can be implemented.
// The return value is composed of these flag bits:
const int perm_zeroing        = 1;       // needs zeroing
const int perm_perm           = 2;       // permutation needed
const int perm_allzero        = 4;       // all is zero or don't care
const int perm_largeblock     = 8;       // fits permute with a larger block size (e.g permute Vec2q instead of Vec4i)
const int perm_addz           = 0x10;    // additional zeroing needed after permute with larger block size or shift
const int perm_addz2          = 0x20;    // additional zeroing needed after perm_zext, perm_compress, or perm_expand
const int perm_cross_lane     = 0x40;    // permutation crossing 128-bit lanes
const int perm_same_pattern   = 0x80;    // same permute pattern in all 128-bit lanes
const int perm_punpckh        = 0x100;   // permutation pattern fits punpckh instruction
const int perm_punpckl        = 0x200;   // permutation pattern fits punpckl instruction
const int perm_rotate         = 0x400;   // permutation pattern fits rotation within lanes. 4 bit count returned in bit perm_rot_count
const int perm_shright        = 0x1000;  // permutation pattern fits shift right within lanes. 4 bit count returned in bit perm_rot_count
const int perm_shleft         = 0x2000;  // permutation pattern fits shift left within lanes. negative count returned in bit perm_rot_count
const int perm_rotate_big     = 0x4000;  // permutation pattern fits rotation across lanes.
6 bit count returned in bit perm_rot_count 616 | const int perm_broadcast = 0x8000; // permutation pattern fits broadcast of a single element. 617 | const int perm_zext = 0x10000; // permutation pattern fits zero extension 618 | const int perm_compress = 0x20000; // permutation pattern fits vpcompress instruction 619 | const int perm_expand = 0x40000; // permutation pattern fits vpexpand instruction 620 | const int perm_outofrange = 0x10000000; // index out of range 621 | const int perm_rot_count = 32; // rotate or shift count is in bits perm_rot_count to perm_rot_count+3 622 | const int perm_ipattern = 40; // pattern for pshufd is in bit perm_ipattern to perm_ipattern + 7 if perm_same_pattern and elementsize >= 4 623 | 624 | template 625 | constexpr uint64_t perm_flags(int const (&a)[V::size()]) { 626 | // a is a reference to a constexpr array of permutation indexes 627 | // V is a vector class 628 | constexpr int N = V::size(); // number of elements 629 | uint64_t r = perm_largeblock | perm_same_pattern | perm_allzero; // return value 630 | uint32_t i = 0; // loop counter 631 | int j = 0; // loop counter 632 | int ix = 0; // index number i 633 | const uint32_t nlanes = sizeof(V) / 16; // number of 128-bit lanes 634 | const uint32_t lanesize = N / nlanes; // elements per lane 635 | const uint32_t elementsize = sizeof(V) / N; // size of each vector element 636 | uint32_t lane = 0; // current lane 637 | uint32_t rot = 999; // rotate left count 638 | int32_t broadc = 999; // index to broadcasted element 639 | uint32_t patfail = 0; // remember certain patterns that do not fit 640 | uint32_t addz2 = 0; // remember certain patterns need extra zeroing 641 | int32_t compresslasti = -1; // last index in perm_compress fit 642 | int32_t compresslastp = -1; // last position in perm_compress fit 643 | int32_t expandlasti = -1; // last index in perm_expand fit 644 | int32_t expandlastp = -1; // last position in perm_expand fit 645 | 646 | int lanepattern[lanesize] = {0}; // 
pattern in each lane 647 | 648 | for (i = 0; i < N; i++) { // loop through indexes 649 | ix = a[i]; // current index 650 | // meaning of ix: -1 = set to zero, V_DC = don't care, non-negative value = permute. 651 | if (ix == -1) { 652 | r |= perm_zeroing; // zeroing requested 653 | } 654 | else if (ix != V_DC && uint32_t(ix) >= N) { 655 | r |= perm_outofrange; // index out of range 656 | } 657 | if (ix >= 0) { 658 | r &= ~ perm_allzero; // not all zero 659 | if (ix != (int)i) r |= perm_perm; // needs permutation 660 | if (broadc == 999) broadc = ix; // remember broadcast index 661 | else if (broadc != ix) broadc = 1000; // does not fit broadcast 662 | } 663 | // check if pattern fits a larger block size: 664 | // even indexes must be even, odd indexes must fit the preceding even index + 1 665 | if ((i & 1) == 0) { // even index 666 | if (ix >= 0 && (ix & 1)) r &= ~perm_largeblock;// not even. does not fit larger block size 667 | int iy = a[i + 1]; // next odd index 668 | if (iy >= 0 && (iy & 1) == 0) r &= ~ perm_largeblock; // not odd. 
does not fit larger block size 669 | if (ix >= 0 && iy >= 0 && iy != ix+1) r &= ~ perm_largeblock; // does not fit preceding index + 1 670 | if (ix == -1 && iy >= 0) r |= perm_addz; // needs additional zeroing at current block size 671 | if (iy == -1 && ix >= 0) r |= perm_addz; // needs additional zeroing at current block size 672 | } 673 | lane = i / lanesize; // current lane 674 | if (lane == 0) { // first lane, or no pattern yet 675 | lanepattern[i] = ix; // save pattern 676 | } 677 | // check if crossing lanes 678 | if (ix >= 0) { 679 | uint32_t lanei = (uint32_t)ix / lanesize; // source lane 680 | if (lanei != lane) r |= perm_cross_lane; // crossing lane 681 | } 682 | // check if same pattern in all lanes 683 | if (lane != 0 && ix >= 0) { // not first lane 684 | int j1 = i - int(lane * lanesize); // index into lanepattern 685 | int jx = ix - int(lane * lanesize); // pattern within lane 686 | if (jx < 0 || jx >= (int)lanesize) r &= ~perm_same_pattern; // source is in another lane 687 | if (lanepattern[j1] < 0) { 688 | lanepattern[j1] = jx; // pattern not known from previous lane 689 | } 690 | else { 691 | if (lanepattern[j1] != jx) r &= ~perm_same_pattern; // not same pattern 692 | } 693 | } 694 | if (ix >= 0) { 695 | // check if pattern fits zero extension (perm_zext) 696 | if (uint32_t(ix*2) != i) { 697 | patfail |= 1; // does not fit zero extension 698 | } 699 | // check if pattern fits compress (perm_compress) 700 | if (ix > compresslasti && ix - compresslasti >= (int)i - compresslastp) { 701 | if ((int)i - compresslastp > 1) addz2 |= 2;// perm_compress may need additional zeroing 702 | compresslasti = ix; compresslastp = i; 703 | } 704 | else { 705 | patfail |= 2; // does not fit perm_compress 706 | } 707 | // check if pattern fits expand (perm_expand) 708 | if (ix > expandlasti && ix - expandlasti <= (int)i - expandlastp) { 709 | if (ix - expandlasti > 1) addz2 |= 4; // perm_expand may need additional zeroing 710 | expandlasti = ix; expandlastp = i; 711 | 
} 712 | else { 713 | patfail |= 4; // does not fit perm_compress 714 | } 715 | } 716 | else if (ix == -1) { 717 | if ((i & 1) == 0) addz2 |= 1; // zero extension needs additional zeroing 718 | } 719 | } 720 | if (!(r & perm_perm)) return r; // more checks are superfluous 721 | 722 | if (!(r & perm_largeblock)) r &= ~ perm_addz; // remove irrelevant flag 723 | if (r & perm_cross_lane) r &= ~ perm_same_pattern; // remove irrelevant flag 724 | if ((patfail & 1) == 0) { 725 | r |= perm_zext; // fits zero extension 726 | if ((addz2 & 1) != 0) r |= perm_addz2; 727 | } 728 | else if ((patfail & 2) == 0) { 729 | r |= perm_compress; // fits compression 730 | if ((addz2 & 2) != 0) { // check if additional zeroing needed 731 | for (j = 0; j < compresslastp; j++) { 732 | if (a[j] == -1) r |= perm_addz2; 733 | } 734 | } 735 | } 736 | else if ((patfail & 4) == 0) { 737 | r |= perm_expand; // fits expansion 738 | if ((addz2 & 4) != 0) { // check if additional zeroing needed 739 | for (j = 0; j < expandlastp; j++) { 740 | if (a[j] == -1) r |= perm_addz2; 741 | } 742 | } 743 | } 744 | 745 | if (r & perm_same_pattern) { 746 | // same pattern in all lanes. 
check if it fits specific patterns 747 | bool fit = true; 748 | // fit shift or rotate 749 | for (i = 0; i < lanesize; i++) { 750 | if (lanepattern[i] >= 0) { 751 | uint32_t rot1 = uint32_t(lanepattern[i] + lanesize - i) % lanesize; 752 | if (rot == 999) { 753 | rot = rot1; 754 | } 755 | else { // check if fit 756 | if (rot != rot1) fit = false; 757 | } 758 | } 759 | } 760 | rot &= lanesize-1; // prevent out of range values 761 | if (fit) { // fits rotate, and possibly shift 762 | uint64_t rot2 = (rot * elementsize) & 0xF; // rotate right count in bytes 763 | r |= rot2 << perm_rot_count; // put shift/rotate count in output bit 16-19 764 | #if INSTRSET >= 4 // SSSE3 765 | r |= perm_rotate; // allow palignr 766 | #endif 767 | // fit shift left 768 | fit = true; 769 | for (i = 0; i < lanesize-rot; i++) { // check if first rot elements are zero or don't care 770 | if (lanepattern[i] >= 0) fit = false; 771 | } 772 | if (fit) { 773 | r |= perm_shleft; 774 | for (; i < lanesize; i++) if (lanepattern[i] == -1) r |= perm_addz; // additional zeroing needed 775 | } 776 | // fit shift right 777 | fit = true; 778 | for (i = lanesize-(uint32_t)rot; i < lanesize; i++) { // check if last (lanesize-rot) elements are zero or don't care 779 | if (lanepattern[i] >= 0) fit = false; 780 | } 781 | if (fit) { 782 | r |= perm_shright; 783 | for (i = 0; i < lanesize-rot; i++) { 784 | if (lanepattern[i] == -1) r |= perm_addz; // additional zeroing needed 785 | } 786 | } 787 | } 788 | // fit punpckhi 789 | fit = true; 790 | uint32_t j2 = lanesize / 2; 791 | for (i = 0; i < lanesize; i++) { 792 | if (lanepattern[i] >= 0 && lanepattern[i] != (int)j2) fit = false; 793 | if ((i & 1) != 0) j2++; 794 | } 795 | if (fit) r |= perm_punpckh; 796 | // fit punpcklo 797 | fit = true; 798 | j2 = 0; 799 | for (i = 0; i < lanesize; i++) { 800 | if (lanepattern[i] >= 0 && lanepattern[i] != (int)j2) fit = false; 801 | if ((i & 1) != 0) j2++; 802 | } 803 | if (fit) r |= perm_punpckl; 804 | // fit pshufd 805 | 
if (elementsize >= 4) { 806 | uint64_t p = 0; 807 | for (i = 0; i < lanesize; i++) { 808 | if (lanesize == 4) { 809 | p |= (lanepattern[i] & 3) << 2 * i; 810 | } 811 | else { // lanesize = 2 812 | p |= ((lanepattern[i] & 1) * 10 + 4) << 4 * i; 813 | } 814 | } 815 | r |= p << perm_ipattern; 816 | } 817 | } 818 | #if INSTRSET >= 7 819 | else { // not same pattern in all lanes 820 | if constexpr (nlanes > 1) { // Try if it fits big rotate 821 | for (i = 0; i < N; i++) { 822 | ix = a[i]; 823 | if (ix >= 0) { 824 | uint32_t rot2 = (ix + N - i) % N; // rotate count 825 | if (rot == 999) { 826 | rot = rot2; // save rotate count 827 | } 828 | else if (rot != rot2) { 829 | rot = 1000; break; // does not fit big rotate 830 | } 831 | } 832 | } 833 | if (rot < N) { // fits big rotate 834 | r |= perm_rotate_big | (uint64_t)rot << perm_rot_count; 835 | } 836 | } 837 | } 838 | #endif 839 | if (broadc < 999 && (r & (perm_rotate|perm_shright|perm_shleft|perm_rotate_big)) == 0) { 840 | r |= perm_broadcast | (uint64_t)broadc << perm_rot_count; // fits broadcast 841 | } 842 | return r; 843 | } 844 | 845 | 846 | // compress_mask: returns a bit mask to use for compression instruction. 847 | // It is presupposed that perm_flags indicates perm_compress. 848 | // Additional zeroing is needed if perm_flags indicates perm_addz2 849 | template 850 | constexpr uint64_t compress_mask(int const (&a)[N]) { 851 | // a is a reference to a constexpr array of permutation indexes 852 | int ix = 0, lasti = -1, lastp = -1; 853 | uint64_t m = 0; 854 | int i = 0; int j = 1; // loop counters 855 | for (i = 0; i < N; i++) { 856 | ix = a[i]; // permutation index 857 | if (ix >= 0) { 858 | m |= (uint64_t)1 << ix; // mask for compression source 859 | for (j = 1; j < i - lastp; j++) { 860 | m |= (uint64_t)1 << (lasti + j); // dummy filling source 861 | } 862 | lastp = i; lasti = ix; 863 | } 864 | } 865 | return m; 866 | } 867 | 868 | // expand_mask: returns a bit mask to use for expansion instruction. 
869 | // It is presupposed that perm_flags indicates perm_expand. 870 | // Additional zeroing is needed if perm_flags indicates perm_addz2 871 | template 872 | constexpr uint64_t expand_mask(int const (&a)[N]) { 873 | // a is a reference to a constexpr array of permutation indexes 874 | int ix = 0, lasti = -1, lastp = -1; 875 | uint64_t m = 0; 876 | int i = 0; int j = 1; 877 | for (i = 0; i < N; i++) { 878 | ix = a[i]; // permutation index 879 | if (ix >= 0) { 880 | m |= (uint64_t)1 << i; // mask for expansion destination 881 | for (j = 1; j < ix - lasti; j++) { 882 | m |= (uint64_t)1 << (lastp + j); // dummy filling destination 883 | } 884 | lastp = i; lasti = ix; 885 | } 886 | } 887 | return m; 888 | } 889 | 890 | // perm16_flags: returns information about how to permute a vector of 16-bit integers 891 | // Note: It is presupposed that perm_flags reports perm_same_pattern 892 | // The return value is composed of these bits: 893 | // 1: data from low 64 bits to low 64 bits. pattern in bit 32-39 894 | // 2: data from high 64 bits to high 64 bits. pattern in bit 40-47 895 | // 4: data from high 64 bits to low 64 bits. pattern in bit 48-55 896 | // 8: data from low 64 bits to high 64 bits. 
pattern in bit 56-63 897 | template 898 | constexpr uint64_t perm16_flags(int const (&a)[V::size()]) { 899 | // a is a reference to a constexpr array of permutation indexes 900 | // V is a vector class 901 | constexpr int N = V::size(); // number of elements 902 | 903 | uint64_t retval = 0; // return value 904 | uint32_t pat[4] = {0,0,0,0}; // permute patterns 905 | uint32_t i = 0; // loop counter 906 | int ix = 0; // index number i 907 | const uint32_t lanesize = 8; // elements per lane 908 | uint32_t lane = 0; // current lane 909 | int lanepattern[lanesize] = {0}; // pattern in each lane 910 | 911 | for (i = 0; i < N; i++) { 912 | ix = a[i]; 913 | lane = i / lanesize; // current lane 914 | if (lane == 0) { 915 | lanepattern[i] = ix; // save pattern 916 | } 917 | else if (ix >= 0) { // not first lane 918 | uint32_t j = i - lane * lanesize; // index into lanepattern 919 | int jx = ix - lane * lanesize; // pattern within lane 920 | if (lanepattern[j] < 0) { 921 | lanepattern[j] = jx; // pattern not known from previous lane 922 | } 923 | } 924 | } 925 | // four patterns: low2low, high2high, high2low, low2high 926 | for (i = 0; i < 4; i++) { 927 | // loop through low pattern 928 | if (lanepattern[i] >= 0) { 929 | if (lanepattern[i] < 4) { // low2low 930 | retval |= 1; 931 | pat[0] |= uint32_t(lanepattern[i] & 3) << (2 * i); 932 | } 933 | else { // high2low 934 | retval |= 4; 935 | pat[2] |= uint32_t(lanepattern[i] & 3) << (2 * i); 936 | } 937 | } 938 | // loop through high pattern 939 | if (lanepattern[i+4] >= 0) { 940 | if (lanepattern[i+4] < 4) { // low2high 941 | retval |= 8; 942 | pat[3] |= uint32_t(lanepattern[i+4] & 3) << (2 * i); 943 | } 944 | else { // high2high 945 | retval |= 2; 946 | pat[1] |= uint32_t(lanepattern[i+4] & 3) << (2 * i); 947 | } 948 | } 949 | } 950 | // join return data 951 | for (i = 0; i < 4; i++) { 952 | retval |= (uint64_t)pat[i] << (32 + i*8); 953 | } 954 | return retval; 955 | } 956 | 957 | 958 | // pshufb_mask: return a broad byte mask 
for permutation within lanes 959 | // for use with the pshufb instruction (_mm..._shuffle_epi8). 960 | // The pshufb instruction provides fast permutation and zeroing, 961 | // allowing different patterns in each lane but no crossing of lane boundaries 962 | template 963 | constexpr auto pshufb_mask(int const (&A)[V::size()]) { 964 | // Parameter a is a reference to a constexpr array of permutation indexes 965 | // V is a vector class 966 | // oppos = 1 for data from the opposite 128-bit lane in 256-bit vectors 967 | constexpr uint32_t N = V::size(); // number of vector elements 968 | constexpr uint32_t elementsize = sizeof(V) / N; // size of each vector element 969 | constexpr uint32_t nlanes = sizeof(V) / 16; // number of 128 bit lanes in vector 970 | constexpr uint32_t elements_per_lane = N / nlanes; // number of vector elements per lane 971 | 972 | EList u = {{0}}; // list for returning 973 | 974 | uint32_t i = 0; // loop counters 975 | uint32_t j = 0; 976 | int m = 0; 977 | int k = 0; 978 | uint32_t lane = 0; 979 | 980 | for (lane = 0; lane < nlanes; lane++) { // loop through lanes 981 | for (i = 0; i < elements_per_lane; i++) { // loop through elements in lane 982 | // permutation index for element within lane 983 | int8_t p = -1; 984 | int ix = A[m]; 985 | if (ix >= 0) { 986 | ix ^= oppos * elements_per_lane; // flip bit if opposite lane 987 | } 988 | ix -= int(lane * elements_per_lane); // index relative to lane 989 | if (ix >= 0 && ix < (int)elements_per_lane) { // index points to desired lane 990 | p = ix * elementsize; 991 | } 992 | for (j = 0; j < elementsize; j++) { // loop through bytes in element 993 | u.a[k++] = p < 0 ? -1 : p + j; // store byte permutation index 994 | } 995 | m++; 996 | } 997 | } 998 | return u; // return encapsulated array 999 | } 1000 | 1001 | 1002 | // largeblock_perm: return indexes for replacing a permute or blend with 1003 | // a certain block size by a permute or blend with the double block size. 
1004 | // Note: it is presupposed that perm_flags() indicates perm_largeblock 1005 | // It is required that additional zeroing is added if perm_flags() indicates perm_addz 1006 | template 1007 | constexpr EList largeblock_perm(int const (&a)[N]) { 1008 | // Parameter a is a reference to a constexpr array of permutation indexes 1009 | EList list = {{0}}; // result indexes 1010 | int ix = 0; // even index 1011 | int iy = 0; // odd index 1012 | int iz = 0; // combined index 1013 | bool fit_addz = false; // additional zeroing needed at the lower block level 1014 | int i = 0; // loop counter 1015 | 1016 | // check if additional zeroing is needed at current block size 1017 | for (i = 0; i < N; i += 2) { 1018 | ix = a[i]; // even index 1019 | iy = a[i+1]; // odd index 1020 | if ((ix == -1 && iy >= 0) || (iy == -1 && ix >= 0)) { 1021 | fit_addz = true; 1022 | } 1023 | } 1024 | 1025 | // loop through indexes 1026 | for (i = 0; i < N; i += 2) { 1027 | ix = a[i]; // even index 1028 | iy = a[i+1]; // odd index 1029 | if (ix >= 0) { 1030 | iz = ix / 2; // half index 1031 | } 1032 | else if (iy >= 0) { 1033 | iz = iy / 2; 1034 | } 1035 | else { 1036 | iz = ix | iy; // -1 or V_DC. 
-1 takes precedence 1037 | if (fit_addz) iz = V_DC; // V_DC, because result will be zeroed later 1038 | } 1039 | list.a[i/2] = iz; // save to list 1040 | } 1041 | return list; 1042 | } 1043 | 1044 | 1045 | // blend_flags: returns information about how a blend function can be implemented 1046 | // The return value is composed of these flag bits: 1047 | const int blend_zeroing = 1; // needs zeroing 1048 | const int blend_allzero = 2; // all is zero or don't care 1049 | const int blend_largeblock = 4; // fits blend with a larger block size (e.g permute Vec2q instead of Vec4i) 1050 | const int blend_addz = 8; // additional zeroing needed after blend with larger block size or shift 1051 | const int blend_a = 0x10; // has data from a 1052 | const int blend_b = 0x20; // has data from b 1053 | const int blend_perma = 0x40; // permutation of a needed 1054 | const int blend_permb = 0x80; // permutation of b needed 1055 | const int blend_cross_lane = 0x100; // permutation crossing 128-bit lanes 1056 | const int blend_same_pattern = 0x200; // same permute/blend pattern in all 128-bit lanes 1057 | const int blend_punpckhab = 0x1000; // pattern fits punpckh(a,b) 1058 | const int blend_punpckhba = 0x2000; // pattern fits punpckh(b,a) 1059 | const int blend_punpcklab = 0x4000; // pattern fits punpckl(a,b) 1060 | const int blend_punpcklba = 0x8000; // pattern fits punpckl(b,a) 1061 | const int blend_rotateab = 0x10000; // pattern fits palignr(a,b) 1062 | const int blend_rotateba = 0x20000; // pattern fits palignr(b,a) 1063 | const int blend_shufab = 0x40000; // pattern fits shufps/shufpd(a,b) 1064 | const int blend_shufba = 0x80000; // pattern fits shufps/shufpd(b,a) 1065 | const int blend_rotate_big = 0x100000; // pattern fits rotation across lanes. 
count returned in bits blend_rotpattern 1066 | const int blend_outofrange= 0x10000000; // index out of range 1067 | const int blend_shufpattern = 32; // pattern for shufps/shufpd is in bit blend_shufpattern to blend_shufpattern + 7 1068 | const int blend_rotpattern = 40; // pattern for palignr is in bit blend_rotpattern to blend_rotpattern + 7 1069 | 1070 | template 1071 | constexpr uint64_t blend_flags(int const (&a)[V::size()]) { 1072 | // a is a reference to a constexpr array of permutation indexes 1073 | // V is a vector class 1074 | constexpr int N = V::size(); // number of elements 1075 | uint64_t r = blend_largeblock | blend_same_pattern | blend_allzero; // return value 1076 | uint32_t iu = 0; // loop counter 1077 | int32_t ii = 0; // loop counter 1078 | int ix = 0; // index number i 1079 | const uint32_t nlanes = sizeof(V) / 16; // number of 128-bit lanes 1080 | const uint32_t lanesize = N / nlanes; // elements per lane 1081 | uint32_t lane = 0; // current lane 1082 | uint32_t rot = 999; // rotate left count 1083 | int lanepattern[lanesize] = {0}; // pattern in each lane 1084 | if (lanesize == 2 && N <= 8) { 1085 | r |= blend_shufab | blend_shufba; // check if it fits shufpd 1086 | } 1087 | 1088 | for (ii = 0; ii < N; ii++) { // loop through indexes 1089 | ix = a[ii]; // index 1090 | if (ix < 0) { 1091 | if (ix == -1) r |= blend_zeroing; // set to zero 1092 | else if (ix != V_DC) { 1093 | r = blend_outofrange; break; // illegal index 1094 | } 1095 | } 1096 | else { // ix >= 0 1097 | r &= ~ blend_allzero; 1098 | if (ix < N) { 1099 | r |= blend_a; // data from a 1100 | if (ix != ii) r |= blend_perma; // permutation of a 1101 | } 1102 | else if (ix < 2*N) { 1103 | r |= blend_b; // data from b 1104 | if (ix != ii + N) r |= blend_permb; // permutation of b 1105 | } 1106 | else { 1107 | r = blend_outofrange; break; // illegal index 1108 | } 1109 | } 1110 | // check if pattern fits a larger block size: 1111 | // even indexes must be even, odd indexes must fit the 
preceding even index + 1 1112 | if ((ii & 1) == 0) { // even index 1113 | if (ix >= 0 && (ix&1)) r &= ~blend_largeblock; // not even. does not fit larger block size 1114 | int iy = a[ii+1]; // next odd index 1115 | if (iy >= 0 && (iy & 1) == 0) r &= ~ blend_largeblock; // not odd. does not fit larger block size 1116 | if (ix >= 0 && iy >= 0 && iy != ix+1) r &= ~ blend_largeblock; // does not fit preceding index + 1 1117 | if (ix == -1 && iy >= 0) r |= blend_addz; // needs additional zeroing at current block size 1118 | if (iy == -1 && ix >= 0) r |= blend_addz; // needs additional zeroing at current block size 1119 | } 1120 | lane = (uint32_t)ii / lanesize; // current lane 1121 | if (lane == 0) { // first lane, or no pattern yet 1122 | lanepattern[ii] = ix; // save pattern 1123 | } 1124 | // check if crossing lanes 1125 | if (ix >= 0) { 1126 | uint32_t lanei = uint32_t(ix & ~N) / lanesize; // source lane 1127 | if (lanei != lane) { 1128 | r |= blend_cross_lane; // crossing lane 1129 | } 1130 | if (lanesize == 2) { // check if it fits pshufd 1131 | if (lanei != lane) r &= ~(blend_shufab | blend_shufba); 1132 | if ((((ix & N) != 0) ^ ii) & 1) r &= ~blend_shufab; 1133 | else r &= ~blend_shufba; 1134 | } 1135 | } 1136 | // check if same pattern in all lanes 1137 | if (lane != 0 && ix >= 0) { // not first lane 1138 | int j = ii - int(lane * lanesize); // index into lanepattern 1139 | int jx = ix - int(lane * lanesize); // pattern within lane 1140 | if (jx < 0 || (jx & ~N) >= (int)lanesize) r &= ~blend_same_pattern; // source is in another lane 1141 | if (lanepattern[j] < 0) { 1142 | lanepattern[j] = jx; // pattern not known from previous lane 1143 | } 1144 | else { 1145 | if (lanepattern[j] != jx) r &= ~blend_same_pattern; // not same pattern 1146 | } 1147 | } 1148 | } 1149 | if (!(r & blend_largeblock)) r &= ~ blend_addz; // remove irrelevant flag 1150 | if (r & blend_cross_lane) r &= ~ blend_same_pattern; // remove irrelevant flag 1151 | if (!(r & (blend_perma | 
blend_permb))) { 1152 | return r; // no permutation. more checks are superfluous 1153 | } 1154 | if (r & blend_same_pattern) { 1155 | // same pattern in all lanes. check if it fits unpack patterns 1156 | r |= blend_punpckhab | blend_punpckhba | blend_punpcklab | blend_punpcklba; 1157 | for (iu = 0; iu < lanesize; iu++) { // loop through lanepattern 1158 | ix = lanepattern[iu]; 1159 | if (ix >= 0) { 1160 | if ((uint32_t)ix != iu / 2 + (iu & 1) * N) r &= ~ blend_punpcklab; 1161 | if ((uint32_t)ix != iu / 2 + ((iu & 1) ^ 1) * N) r &= ~ blend_punpcklba; 1162 | if ((uint32_t)ix != (iu + lanesize) / 2 + (iu & 1) * N) r &= ~ blend_punpckhab; 1163 | if ((uint32_t)ix != (iu + lanesize) / 2 + ((iu & 1) ^ 1) * N) r &= ~ blend_punpckhba; 1164 | } 1165 | } 1166 | #if INSTRSET >= 4 // SSSE3. check if it fits palignr 1167 | for (iu = 0; iu < lanesize; iu++) { 1168 | ix = lanepattern[iu]; 1169 | if (ix >= 0) { 1170 | uint32_t t = ix & ~N; 1171 | if (ix & N) t += lanesize; 1172 | uint32_t tb = (t + 2*lanesize - iu) % (lanesize * 2); 1173 | if (rot == 999) { 1174 | rot = tb; 1175 | } 1176 | else { // check if fit 1177 | if (rot != tb) rot = 1000; 1178 | } 1179 | } 1180 | } 1181 | if (rot < 999) { // firs palignr 1182 | if (rot < lanesize) { 1183 | r |= blend_rotateba; 1184 | } 1185 | else { 1186 | r |= blend_rotateab; 1187 | } 1188 | const uint32_t elementsize = sizeof(V) / N; 1189 | r |= uint64_t((rot & (lanesize - 1)) * elementsize) << blend_rotpattern; 1190 | } 1191 | #endif 1192 | if (lanesize == 4) { 1193 | // check if it fits shufps 1194 | r |= blend_shufab | blend_shufba; 1195 | for (ii = 0; ii < 2; ii++) { 1196 | ix = lanepattern[ii]; 1197 | if (ix >= 0) { 1198 | if (ix & N) r &= ~ blend_shufab; 1199 | else r &= ~ blend_shufba; 1200 | } 1201 | } 1202 | for (; ii < 4; ii++) { 1203 | ix = lanepattern[ii]; 1204 | if (ix >= 0) { 1205 | if (ix & N) r &= ~ blend_shufba; 1206 | else r &= ~ blend_shufab; 1207 | } 1208 | } 1209 | if (r & (blend_shufab | blend_shufba)) { // fits 
shufps/shufpd 1210 | uint8_t shufpattern = 0; // get pattern 1211 | for (iu = 0; iu < lanesize; iu++) { 1212 | shufpattern |= (lanepattern[iu] & 3) << iu * 2; 1213 | } 1214 | r |= (uint64_t)shufpattern << blend_shufpattern; // return pattern 1215 | } 1216 | } 1217 | } 1218 | else if (nlanes > 1) { // not same pattern in all lanes 1219 | rot = 999; // check if it fits big rotate 1220 | for (ii = 0; ii < N; ii++) { 1221 | ix = a[ii]; 1222 | if (ix >= 0) { 1223 | uint32_t rot2 = (ix + 2 * N - ii) % (2 * N);// rotate count 1224 | if (rot == 999) { 1225 | rot = rot2; // save rotate count 1226 | } 1227 | else if (rot != rot2) { 1228 | rot = 1000; break; // does not fit big rotate 1229 | } 1230 | } 1231 | } 1232 | if (rot < 2 * N) { // fits big rotate 1233 | r |= blend_rotate_big | (uint64_t)rot << blend_rotpattern; 1234 | } 1235 | } 1236 | if (lanesize == 2 && (r & (blend_shufab | blend_shufba))) { // fits shufpd. Get pattern 1237 | for (ii = 0; ii < N; ii++) { 1238 | r |= uint64_t(a[ii] & 1) << (blend_shufpattern + ii); 1239 | } 1240 | } 1241 | return r; 1242 | } 1243 | 1244 | // blend_perm_indexes: return an Indexlist for implementing a blend function as 1245 | // two permutations. N = vector size. 1246 | // dozero = 0: let unused elements be don't care. The two permutation results must be blended 1247 | // dozero = 1: zero unused elements in each permuation. The two permutation results can be OR'ed 1248 | // dozero = 2: indexes that are -1 or V_DC are preserved 1249 | template 1250 | constexpr EList blend_perm_indexes(int const (&a)[N]) { 1251 | // a is a reference to a constexpr array of permutation indexes 1252 | EList list = {{0}}; // list to return 1253 | int u = dozero ? 
-1 : V_DC; // value to use for unused entries 1254 | int j = 0; 1255 | 1256 | for (j = 0; j < N; j++) { // loop through indexes 1257 | int ix = a[j]; // current index 1258 | if (ix < 0) { // zero or don't care 1259 | if (dozero == 2) { 1260 | // list.a[j] = list.a[j + N] = ix; // fails in gcc in complicated cases 1261 | list.a[j] = ix; 1262 | list.a[j + N] = ix; 1263 | } 1264 | else { 1265 | // list.a[j] = list.a[j + N] = u; 1266 | list.a[j] = u; 1267 | list.a[j + N] = u; 1268 | } 1269 | } 1270 | else if (ix < N) { // value from a 1271 | list.a[j] = ix; 1272 | list.a[j+N] = u; 1273 | } 1274 | else { 1275 | list.a[j] = u; // value from b 1276 | list.a[j+N] = ix - N; 1277 | } 1278 | } 1279 | return list; 1280 | } 1281 | 1282 | // largeblock_indexes: return indexes for replacing a permute or blend with a 1283 | // certain block size by a permute or blend with the double block size. 1284 | // Note: it is presupposed that perm_flags or blend_flags indicates _largeblock 1285 | // It is required that additional zeroing is added if perm_flags or blend_flags 1286 | // indicates _addz 1287 | template 1288 | constexpr EList largeblock_indexes(int const (&a)[N]) { 1289 | // Parameter a is a reference to a constexpr array of N permutation indexes 1290 | EList list = {{0}}; // list to return 1291 | 1292 | bool fit_addz = false; // additional zeroing needed at the lower block level 1293 | int ix = 0; // even index 1294 | int iy = 0; // odd index 1295 | int iz = 0; // combined index 1296 | int i = 0; // loop counter 1297 | 1298 | for (i = 0; i < N; i += 2) { 1299 | ix = a[i]; // even index 1300 | iy = a[i+1]; // odd index 1301 | if (ix >= 0) { 1302 | iz = ix / 2; // half index 1303 | } 1304 | else if (iy >= 0) { 1305 | iz = iy / 2; // half index 1306 | } 1307 | else iz = ix | iy; // -1 or V_DC. 
-1 takes precedence 1308 | list.a[i/2] = iz; // save to list 1309 | // check if additional zeroing is needed at current block size 1310 | if ((ix == -1 && iy >= 0) || (iy == -1 && ix >= 0)) { 1311 | fit_addz = true; 1312 | } 1313 | } 1314 | // replace -1 by V_DC if fit_addz 1315 | if (fit_addz) { 1316 | for (i = 0; i < N/2; i++) { 1317 | if (list.a[i] < 0) list.a[i] = V_DC; 1318 | } 1319 | } 1320 | return list; 1321 | } 1322 | 1323 | 1324 | /**************************************************************************************** 1325 | * 1326 | * Vector blend helper function templates 1327 | * 1328 | * These templates are for emulating a blend with a vector size that is not supported by 1329 | * the instruction set, using multiple blends or permutations of half the vector size 1330 | * 1331 | ****************************************************************************************/ 1332 | 1333 | // Make dummy blend function templates to avoid error messages when the blend funtions are not yet defined 1334 | template void blend2(){} 1335 | template void blend4(){} 1336 | template void blend8(){} 1337 | template void blend16(){} 1338 | template void blend32(){} 1339 | 1340 | // blend_half_indexes: return an Indexlist for emulating a blend function as 1341 | // blends or permutations from multiple sources 1342 | // dozero = 0: let unused elements be don't care. Multiple permutation results must be blended 1343 | // dozero = 1: zero unused elements in each permuation. Multiple permutation results can be OR'ed 1344 | // dozero = 2: indexes that are -1 or V_DC are preserved 1345 | // src1, src2: sources to blend in a partial implementation 1346 | template 1347 | constexpr EList blend_half_indexes(int const (&a)[N]) { 1348 | // a is a reference to a constexpr array of permutation indexes 1349 | EList list = {{0}}; // list to return 1350 | int u = dozero ? 
-1 : V_DC; // value to use for unused entries 1351 | int j = 0; // loop counter 1352 | 1353 | for (j = 0; j < N; j++) { // loop through indexes 1354 | int ix = a[j]; // current index 1355 | if (ix < 0) { // zero or don't care 1356 | list.a[j] = (dozero == 2) ? ix : u; 1357 | } 1358 | else { 1359 | int src = ix / N; // source 1360 | if (src == src1) { 1361 | list.a[j] = ix & (N - 1); 1362 | } 1363 | else if (src == src2) { 1364 | list.a[j] = (ix & (N - 1)) + N; 1365 | } 1366 | else list.a[j] = u; 1367 | } 1368 | } 1369 | return list; 1370 | } 1371 | 1372 | // selectblend: select one of four sources for blending 1373 | template 1374 | static inline auto selectblend(W const a, W const b) { 1375 | if constexpr (s == 0) return a.get_low(); 1376 | else if constexpr (s == 1) return a.get_high(); 1377 | else if constexpr (s == 2) return b.get_low(); 1378 | else return b.get_high(); 1379 | } 1380 | 1381 | // blend_half: Emulate a blend with a vector size that is not supported 1382 | // by multiple blends with half the vector size. 1383 | // blend_half is called twice, to give the low and high half of the result 1384 | // Parameters: W: type of full-size vector 1385 | // i0...: indexes for low or high half 1386 | // a, b: full size input vectors 1387 | // return value: half-size vector for lower or upper part 1388 | template 1389 | auto blend_half(W const& a, W const& b) { 1390 | typedef decltype(a.get_low()) V; // type for half-size vector 1391 | constexpr int N = V::size(); // size of half-size vector 1392 | static_assert(sizeof...(i0) == N, "wrong number of indexes in blend_half"); 1393 | constexpr int ind[N] = { i0... }; // array of indexes 1394 | 1395 | // lambda to find which of the four possible sources are used 1396 | // return: EList containing a list of up to 4 sources. 
The last element is the number of sources used 1397 | auto listsources = [](int const n, int const (&ind)[N]) constexpr { 1398 | bool source_used[4] = { false,false,false,false }; // list of sources used 1399 | int i = 0; 1400 | for (i = 0; i < n; i++) { 1401 | int ix = ind[i]; // index 1402 | if (ix >= 0) { 1403 | int src = ix / n; // source used 1404 | source_used[src & 3] = true; 1405 | } 1406 | } 1407 | // return a list of sources used. The last element is the number of sources used 1408 | EList sources = {{0}}; 1409 | int nsrc = 0; // number of sources 1410 | for (i = 0; i < 4; i++) { 1411 | if (source_used[i]) { 1412 | sources.a[nsrc++] = i; 1413 | } 1414 | } 1415 | sources.a[4] = nsrc; 1416 | return sources; 1417 | }; 1418 | // list of sources used 1419 | constexpr EList sources = listsources(N, ind); 1420 | constexpr int nsrc = sources.a[4]; // number of sources used 1421 | 1422 | if constexpr (nsrc == 0) { // no sources 1423 | return V(0); 1424 | } 1425 | // get indexes for the first one or two sources 1426 | constexpr int uindex = (nsrc > 2) ? 
1 : 2; // unused elements set to zero if two blends are combined 1427 | constexpr EList L = blend_half_indexes(ind); 1428 | V x0; 1429 | V src0 = selectblend(a, b); // first source 1430 | V src1 = selectblend(a, b); // second source 1431 | if constexpr (N == 2) { 1432 | x0 = blend2 (src0, src1); 1433 | } 1434 | else if constexpr (N == 4) { 1435 | x0 = blend4 (src0, src1); 1436 | } 1437 | else if constexpr (N == 8) { 1438 | x0 = blend8 (src0, src1); 1439 | } 1440 | else if constexpr (N == 16) { 1441 | x0 = blend16 (src0, src1); 1443 | } 1444 | else if constexpr (N == 32) { 1445 | x0 = blend32 (src0, src1); 1449 | } 1450 | if constexpr (nsrc > 2) { // get last one or two sources 1451 | constexpr EList M = blend_half_indexes(ind); 1452 | V x1; 1453 | V src2 = selectblend(a, b); // third source 1454 | V src3 = selectblend(a, b); // fourth source 1455 | if constexpr (N == 2) { 1456 | x1 = blend2 (src0, src1); 1457 | } 1458 | else if constexpr (N == 4) { 1459 | x1 = blend4 (src2, src3); 1460 | } 1461 | else if constexpr (N == 8) { 1462 | x1 = blend8 (src2, src3); 1463 | } 1464 | else if constexpr (N == 16) { 1465 | x1 = blend16 (src2, src3); 1467 | } 1468 | else if constexpr (N == 32) { 1469 | x1 = blend32 (src2, src3); 1473 | } 1474 | x0 |= x1; // combine result of two blends. Unused elements are zero 1475 | } 1476 | return x0; 1477 | } 1478 | 1479 | 1480 | #ifdef VCL_NAMESPACE 1481 | } 1482 | #endif 1483 | 1484 | 1485 | #endif // INSTRSET_H 1486 | -------------------------------------------------------------------------------- /CAS/VCL2/instrset_detect.cpp: -------------------------------------------------------------------------------- 1 | /************************** instrset_detect.cpp **************************** 2 | * Author: Agner Fog 3 | * Date created: 2012-05-30 4 | * Last modified: 2019-08-01 5 | * Version: 2.00.00 6 | * Project: vector class library 7 | * Description: 8 | * Functions for checking which instruction sets are supported. 
9 | * 10 | * (c) Copyright 2012-2019 Agner Fog. 11 | * Apache License version 2.0 or later. 12 | ******************************************************************************/ 13 | 14 | #include "instrset.h" 15 | 16 | #ifdef VCL_NAMESPACE 17 | namespace VCL_NAMESPACE { 18 | #endif 19 | 20 | 21 | // Define interface to xgetbv instruction 22 | static inline uint64_t xgetbv (int ctr) { 23 | #if (defined (_MSC_FULL_VER) && _MSC_FULL_VER >= 160040000) || (defined (__INTEL_COMPILER) && __INTEL_COMPILER >= 1200) 24 | // Microsoft or Intel compiler supporting _xgetbv intrinsic 25 | 26 | return uint64_t(_xgetbv(ctr)); // intrinsic function for XGETBV 27 | 28 | #elif defined(__GNUC__) || defined (__clang__) // use inline assembly, Gnu/AT&T syntax 29 | 30 | uint32_t a, d; 31 | __asm("xgetbv" : "=a"(a),"=d"(d) : "c"(ctr) : ); 32 | return a | (uint64_t(d) << 32); 33 | 34 | #else // #elif defined (_WIN32) // other compiler. try inline assembly with masm/intel/MS syntax 35 | uint32_t a, d; 36 | __asm { 37 | mov ecx, ctr 38 | _emit 0x0f 39 | _emit 0x01 40 | _emit 0xd0 ; // xgetbv 41 | mov a, eax 42 | mov d, edx 43 | } 44 | return a | (uint64_t(d) << 32); 45 | 46 | #endif 47 | } 48 | 49 | /* find supported instruction set 50 | return value: 51 | 0 = 80386 instruction set 52 | 1 or above = SSE (XMM) supported by CPU (not testing for OS support) 53 | 2 or above = SSE2 54 | 3 or above = SSE3 55 | 4 or above = Supplementary SSE3 (SSSE3) 56 | 5 or above = SSE4.1 57 | 6 or above = SSE4.2 58 | 7 or above = AVX supported by CPU and operating system 59 | 8 or above = AVX2 60 | 9 or above = AVX512F 61 | 10 or above = AVX512VL, AVX512BW, AVX512DQ 62 | */ 63 | int instrset_detect(void) { 64 | 65 | static int iset = -1; // remember value for next call 66 | if (iset >= 0) { 67 | return iset; // called before 68 | } 69 | iset = 0; // default value 70 | int abcd[4] = {0,0,0,0}; // cpuid results 71 | cpuid(abcd, 0); // call cpuid function 0 72 | if (abcd[0] == 0) return iset; // no further cpuid 
function supported 73 | cpuid(abcd, 1); // call cpuid function 1 for feature flags 74 | if ((abcd[3] & (1 << 0)) == 0) return iset; // no floating point 75 | if ((abcd[3] & (1 << 23)) == 0) return iset; // no MMX 76 | if ((abcd[3] & (1 << 15)) == 0) return iset; // no conditional move 77 | if ((abcd[3] & (1 << 24)) == 0) return iset; // no FXSAVE 78 | if ((abcd[3] & (1 << 25)) == 0) return iset; // no SSE 79 | iset = 1; // 1: SSE supported 80 | if ((abcd[3] & (1 << 26)) == 0) return iset; // no SSE2 81 | iset = 2; // 2: SSE2 supported 82 | if ((abcd[2] & (1 << 0)) == 0) return iset; // no SSE3 83 | iset = 3; // 3: SSE3 supported 84 | if ((abcd[2] & (1 << 9)) == 0) return iset; // no SSSE3 85 | iset = 4; // 4: SSSE3 supported 86 | if ((abcd[2] & (1 << 19)) == 0) return iset; // no SSE4.1 87 | iset = 5; // 5: SSE4.1 supported 88 | if ((abcd[2] & (1 << 23)) == 0) return iset; // no POPCNT 89 | if ((abcd[2] & (1 << 20)) == 0) return iset; // no SSE4.2 90 | iset = 6; // 6: SSE4.2 supported 91 | if ((abcd[2] & (1 << 27)) == 0) return iset; // no OSXSAVE 92 | if ((xgetbv(0) & 6) != 6) return iset; // AVX not enabled in O.S. 
93 | if ((abcd[2] & (1 << 28)) == 0) return iset; // no AVX 94 | iset = 7; // 7: AVX supported 95 | cpuid(abcd, 7); // call cpuid leaf 7 for feature flags 96 | if ((abcd[1] & (1 << 5)) == 0) return iset; // no AVX2 97 | iset = 8; 98 | if ((abcd[1] & (1 << 16)) == 0) return iset; // no AVX512 99 | cpuid(abcd, 0xD); // call cpuid leaf 0xD for feature flags 100 | if ((abcd[0] & 0x60) != 0x60) return iset; // no AVX512 101 | iset = 9; 102 | cpuid(abcd, 7); // call cpuid leaf 7 for feature flags 103 | if ((abcd[1] & (1 << 31)) == 0) return iset; // no AVX512VL 104 | if ((abcd[1] & 0x40020000) != 0x40020000) return iset; // no AVX512BW, AVX512DQ 105 | iset = 10; 106 | return iset; 107 | } 108 | 109 | // detect if CPU supports the FMA3 instruction set 110 | bool hasFMA3(void) { 111 | if (instrset_detect() < 7) return false; // must have AVX 112 | int abcd[4]; // cpuid results 113 | cpuid(abcd, 1); // call cpuid function 1 114 | return ((abcd[2] & (1 << 12)) != 0); // ecx bit 12 indicates FMA3 115 | } 116 | 117 | // detect if CPU supports the FMA4 instruction set 118 | bool hasFMA4(void) { 119 | if (instrset_detect() < 7) return false; // must have AVX 120 | int abcd[4]; // cpuid results 121 | cpuid(abcd, 0x80000001); // call cpuid function 0x80000001 122 | return ((abcd[2] & (1 << 16)) != 0); // ecx bit 16 indicates FMA4 123 | } 124 | 125 | // detect if CPU supports the XOP instruction set 126 | bool hasXOP(void) { 127 | if (instrset_detect() < 7) return false; // must have AVX 128 | int abcd[4]; // cpuid results 129 | cpuid(abcd, 0x80000001); // call cpuid function 0x80000001 130 | return ((abcd[2] & (1 << 11)) != 0); // ecx bit 11 indicates XOP 131 | } 132 | 133 | // detect if CPU supports the F16C instruction set 134 | bool hasF16C(void) { 135 | if (instrset_detect() < 7) return false; // must have AVX 136 | int abcd[4]; // cpuid results 137 | cpuid(abcd, 1); // call cpuid function 1 138 | return ((abcd[2] & (1 << 29)) != 0); // ecx bit 29 indicates F16C 139 | } 140 | 
141 | // detect if CPU supports the AVX512ER instruction set 142 | bool hasAVX512ER(void) { 143 | if (instrset_detect() < 9) return false; // must have AVX512F 144 | int abcd[4]; // cpuid results 145 | cpuid(abcd, 7); // call cpuid function 7 146 | return ((abcd[1] & (1 << 27)) != 0); // ebx bit 27 indicates AVX512ER 147 | } 148 | 149 | // detect if CPU supports the AVX512VBMI instruction set 150 | bool hasAVX512VBMI(void) { 151 | if (instrset_detect() < 10) return false; // must have AVX512BW 152 | int abcd[4]; // cpuid results 153 | cpuid(abcd, 7); // call cpuid function 7 154 | return ((abcd[2] & (1 << 1)) != 0); // ecx bit 1 indicates AVX512VBMI 155 | } 156 | 157 | // detect if CPU supports the AVX512VBMI2 instruction set 158 | bool hasAVX512VBMI2(void) { 159 | if (instrset_detect() < 10) return false; // must have AVX512BW 160 | int abcd[4]; // cpuid results 161 | cpuid(abcd, 7); // call cpuid function 7 162 | return ((abcd[2] & (1 << 6)) != 0); // ecx bit 6 indicates AVX512VBMI2 163 | } 164 | 165 | #ifdef VCL_NAMESPACE 166 | } 167 | #endif 168 | -------------------------------------------------------------------------------- /CAS/VCL2/vector_convert.h: -------------------------------------------------------------------------------- 1 | /************************** vector_convert.h ******************************* 2 | * Author: Agner Fog 3 | * Date created: 2014-07-23 4 | * Last modified: 2019-11-17 5 | * Version: 2.01.00 6 | * Project: vector class library 7 | * Description: 8 | * Header file for conversion between different vector classes with different 9 | * sizes. Also includes verious generic template functions. 10 | * 11 | * (c) Copyright 2012-2019 Agner Fog. 12 | * Apache License version 2.0 or later. 
13 | *****************************************************************************/ 14 | 15 | #ifndef VECTOR_CONVERT_H 16 | #define VECTOR_CONVERT_H 17 | 18 | #ifndef VECTORCLASS_H 19 | #include "vectorclass.h" 20 | #endif 21 | 22 | #if VECTORCLASS_H < 20100 23 | #error Incompatible versions of vector class library mixed 24 | #endif 25 | 26 | #ifdef VCL_NAMESPACE 27 | namespace VCL_NAMESPACE { 28 | #endif 29 | 30 | #if MAX_VECTOR_SIZE >= 256 31 | 32 | /***************************************************************************** 33 | * 34 | * Extend from 128 to 256 bit vectors 35 | * 36 | *****************************************************************************/ 37 | 38 | #if INSTRSET >= 8 // AVX2. 256 bit integer vectors 39 | 40 | // sign extend 41 | static inline Vec16s extend (Vec16c const a) { 42 | return _mm256_cvtepi8_epi16(a); 43 | } 44 | 45 | // zero extend 46 | static inline Vec16us extend (Vec16uc const a) { 47 | return _mm256_cvtepu8_epi16(a); 48 | } 49 | 50 | // sign extend 51 | static inline Vec8i extend (Vec8s const a) { 52 | return _mm256_cvtepi16_epi32(a); 53 | } 54 | 55 | // zero extend 56 | static inline Vec8ui extend (Vec8us const a) { 57 | return _mm256_cvtepu16_epi32(a); 58 | } 59 | 60 | // sign extend 61 | static inline Vec4q extend (Vec4i const a) { 62 | return _mm256_cvtepi32_epi64(a); 63 | } 64 | 65 | // zero extend 66 | static inline Vec4uq extend (Vec4ui const a) { 67 | return _mm256_cvtepu32_epi64(a); 68 | } 69 | 70 | 71 | #else // no AVX2. 
256 bit integer vectors are emulated 72 | 73 | // sign extend and zero extend functions: 74 | static inline Vec16s extend (Vec16c const a) { 75 | return Vec16s(extend_low(a), extend_high(a)); 76 | } 77 | 78 | static inline Vec16us extend (Vec16uc const a) { 79 | return Vec16us(extend_low(a), extend_high(a)); 80 | } 81 | 82 | static inline Vec8i extend (Vec8s const a) { 83 | return Vec8i(extend_low(a), extend_high(a)); 84 | } 85 | 86 | static inline Vec8ui extend (Vec8us const a) { 87 | return Vec8ui(extend_low(a), extend_high(a)); 88 | } 89 | 90 | static inline Vec4q extend (Vec4i const a) { 91 | return Vec4q(extend_low(a), extend_high(a)); 92 | } 93 | 94 | static inline Vec4uq extend (Vec4ui const a) { 95 | return Vec4uq(extend_low(a), extend_high(a)); 96 | } 97 | 98 | #endif // AVX2 99 | 100 | /***************************************************************************** 101 | * 102 | * Conversions between float and double 103 | * 104 | *****************************************************************************/ 105 | #if INSTRSET >= 7 // AVX. 256 bit float vectors 106 | 107 | // float to double 108 | static inline Vec4d to_double (Vec4f const a) { 109 | return _mm256_cvtps_pd(a); 110 | } 111 | 112 | // double to float 113 | static inline Vec4f to_float (Vec4d const a) { 114 | return _mm256_cvtpd_ps(a); 115 | } 116 | 117 | #else // no AVX2. 
256 bit float vectors are emulated 118 | 119 | // float to double 120 | static inline Vec4d to_double (Vec4f const a) { 121 | Vec2d lo = _mm_cvtps_pd(a); 122 | Vec2d hi = _mm_cvtps_pd(_mm_movehl_ps(a, a)); 123 | return Vec4d(lo,hi); 124 | } 125 | 126 | // double to float 127 | static inline Vec4f to_float (Vec4d const a) { 128 | Vec4f lo = _mm_cvtpd_ps(a.get_low()); 129 | Vec4f hi = _mm_cvtpd_ps(a.get_high()); 130 | return _mm_movelh_ps(lo, hi); 131 | } 132 | 133 | #endif 134 | 135 | /***************************************************************************** 136 | * 137 | * Reduce from 256 to 128 bit vectors 138 | * 139 | *****************************************************************************/ 140 | #if INSTRSET >= 10 // AVX512VL 141 | 142 | // compress functions. overflow wraps around 143 | static inline Vec16c compress (Vec16s const a) { 144 | return _mm256_cvtepi16_epi8(a); 145 | } 146 | 147 | static inline Vec16uc compress (Vec16us const a) { 148 | return _mm256_cvtepi16_epi8(a); 149 | } 150 | 151 | static inline Vec8s compress (Vec8i const a) { 152 | return _mm256_cvtepi32_epi16(a); 153 | } 154 | 155 | static inline Vec8us compress (Vec8ui const a) { 156 | return _mm256_cvtepi32_epi16(a); 157 | } 158 | 159 | static inline Vec4i compress (Vec4q const a) { 160 | return _mm256_cvtepi64_epi32(a); 161 | } 162 | 163 | static inline Vec4ui compress (Vec4uq const a) { 164 | return _mm256_cvtepi64_epi32(a); 165 | } 166 | 167 | #else // no AVX512 168 | 169 | // compress functions. 
overflow wraps around 170 | static inline Vec16c compress (Vec16s const a) { 171 | return compress(a.get_low(), a.get_high()); 172 | } 173 | 174 | static inline Vec16uc compress (Vec16us const a) { 175 | return compress(a.get_low(), a.get_high()); 176 | } 177 | 178 | static inline Vec8s compress (Vec8i const a) { 179 | return compress(a.get_low(), a.get_high()); 180 | } 181 | 182 | static inline Vec8us compress (Vec8ui const a) { 183 | return compress(a.get_low(), a.get_high()); 184 | } 185 | 186 | static inline Vec4i compress (Vec4q const a) { 187 | return compress(a.get_low(), a.get_high()); 188 | } 189 | 190 | static inline Vec4ui compress (Vec4uq const a) { 191 | return compress(a.get_low(), a.get_high()); 192 | } 193 | 194 | #endif // AVX512 195 | 196 | #endif // MAX_VECTOR_SIZE >= 256 197 | 198 | 199 | #if MAX_VECTOR_SIZE >= 512 200 | 201 | /***************************************************************************** 202 | * 203 | * Extend from 256 to 512 bit vectors 204 | * 205 | *****************************************************************************/ 206 | 207 | #if INSTRSET >= 9 // AVX512. 
512 bit integer vectors 208 | 209 | // sign extend 210 | static inline Vec32s extend (Vec32c const a) { 211 | #if INSTRSET >= 10 212 | return _mm512_cvtepi8_epi16(a); 213 | #else 214 | return Vec32s(extend_low(a), extend_high(a)); 215 | #endif 216 | } 217 | 218 | // zero extend 219 | static inline Vec32us extend (Vec32uc const a) { 220 | #if INSTRSET >= 10 221 | return _mm512_cvtepu8_epi16(a); 222 | #else 223 | return Vec32us(extend_low(a), extend_high(a)); 224 | #endif 225 | } 226 | 227 | // sign extend 228 | static inline Vec16i extend (Vec16s const a) { 229 | return _mm512_cvtepi16_epi32(a); 230 | } 231 | 232 | // zero extend 233 | static inline Vec16ui extend (Vec16us const a) { 234 | return _mm512_cvtepu16_epi32(a); 235 | } 236 | 237 | // sign extend 238 | static inline Vec8q extend (Vec8i const a) { 239 | return _mm512_cvtepi32_epi64(a); 240 | } 241 | 242 | // zero extend 243 | static inline Vec8uq extend (Vec8ui const a) { 244 | return _mm512_cvtepu32_epi64(a); 245 | } 246 | 247 | #else // no AVX512. 
512 bit vectors are emulated 248 | 249 | 250 | 251 | // sign extend 252 | static inline Vec32s extend (Vec32c const a) { 253 | return Vec32s(extend_low(a), extend_high(a)); 254 | } 255 | 256 | // zero extend 257 | static inline Vec32us extend (Vec32uc const a) { 258 | return Vec32us(extend_low(a), extend_high(a)); 259 | } 260 | 261 | // sign extend 262 | static inline Vec16i extend (Vec16s const a) { 263 | return Vec16i(extend_low(a), extend_high(a)); 264 | } 265 | 266 | // zero extend 267 | static inline Vec16ui extend (Vec16us const a) { 268 | return Vec16ui(extend_low(a), extend_high(a)); 269 | } 270 | 271 | // sign extend 272 | static inline Vec8q extend (Vec8i const a) { 273 | return Vec8q(extend_low(a), extend_high(a)); 274 | } 275 | 276 | // zero extend 277 | static inline Vec8uq extend (Vec8ui const a) { 278 | return Vec8uq(extend_low(a), extend_high(a)); 279 | } 280 | 281 | #endif // AVX512 282 | 283 | 284 | /***************************************************************************** 285 | * 286 | * Reduce from 512 to 256 bit vectors 287 | * 288 | *****************************************************************************/ 289 | #if INSTRSET >= 9 // AVX512F 290 | 291 | // compress functions. 
overflow wraps around 292 | static inline Vec32c compress (Vec32s const a) { 293 | #if INSTRSET >= 10 // AVVX512BW 294 | return _mm512_cvtepi16_epi8(a); 295 | #else 296 | return compress(a.get_low(), a.get_high()); 297 | #endif 298 | } 299 | 300 | static inline Vec32uc compress (Vec32us const a) { 301 | return Vec32uc(compress(Vec32s(a))); 302 | } 303 | 304 | static inline Vec16s compress (Vec16i const a) { 305 | return _mm512_cvtepi32_epi16(a); 306 | } 307 | 308 | static inline Vec16us compress (Vec16ui const a) { 309 | return _mm512_cvtepi32_epi16(a); 310 | } 311 | 312 | static inline Vec8i compress (Vec8q const a) { 313 | return _mm512_cvtepi64_epi32(a); 314 | } 315 | 316 | static inline Vec8ui compress (Vec8uq const a) { 317 | return _mm512_cvtepi64_epi32(a); 318 | } 319 | 320 | #else // no AVX512 321 | 322 | // compress functions. overflow wraps around 323 | static inline Vec32c compress (Vec32s const a) { 324 | return compress(a.get_low(), a.get_high()); 325 | } 326 | 327 | static inline Vec32uc compress (Vec32us const a) { 328 | return compress(a.get_low(), a.get_high()); 329 | } 330 | 331 | static inline Vec16s compress (Vec16i const a) { 332 | return compress(a.get_low(), a.get_high()); 333 | } 334 | 335 | static inline Vec16us compress (Vec16ui const a) { 336 | return compress(a.get_low(), a.get_high()); 337 | } 338 | 339 | static inline Vec8i compress (Vec8q const a) { 340 | return compress(a.get_low(), a.get_high()); 341 | } 342 | 343 | static inline Vec8ui compress (Vec8uq const a) { 344 | return compress(a.get_low(), a.get_high()); 345 | } 346 | 347 | #endif // AVX512 348 | 349 | /***************************************************************************** 350 | * 351 | * Conversions between float and double 352 | * 353 | *****************************************************************************/ 354 | 355 | #if INSTRSET >= 9 // AVX512. 
512 bit float vectors 356 | 357 | // float to double 358 | static inline Vec8d to_double (Vec8f const a) { 359 | return _mm512_cvtps_pd(a); 360 | } 361 | 362 | // double to float 363 | static inline Vec8f to_float (Vec8d const a) { 364 | return _mm512_cvtpd_ps(a); 365 | } 366 | 367 | #else // no AVX512. 512 bit float vectors are emulated 368 | 369 | // float to double 370 | static inline Vec8d to_double (Vec8f const a) { 371 | Vec4d lo = to_double(a.get_low()); 372 | Vec4d hi = to_double(a.get_high()); 373 | return Vec8d(lo,hi); 374 | } 375 | 376 | // double to float 377 | static inline Vec8f to_float (Vec8d const a) { 378 | Vec4f lo = to_float(a.get_low()); 379 | Vec4f hi = to_float(a.get_high()); 380 | return Vec8f(lo, hi); 381 | } 382 | 383 | #endif 384 | 385 | #endif // MAX_VECTOR_SIZE >= 512 386 | 387 | // double to float 388 | static inline Vec4f to_float (Vec2d const a) { 389 | return _mm_cvtpd_ps(a); 390 | } 391 | 392 | 393 | /***************************************************************************** 394 | * 395 | * Generic template functions 396 | * 397 | * These templates define functions for multiple vector types in one template 398 | * 399 | *****************************************************************************/ 400 | 401 | // horizontal min/max of vector elements 402 | // implemented with universal template, works for all vector types: 403 | 404 | template auto horizontal_min(T const x) { 405 | if constexpr ((T::elementtype() & 16) != 0) { 406 | // T is a float or double vector 407 | if (horizontal_or(is_nan(x))) { 408 | // check for NAN because min does not guarantee NAN propagation 409 | return x[horizontal_find_first(is_nan(x))]; 410 | } 411 | } 412 | return horizontal_min1(x); 413 | } 414 | 415 | template auto horizontal_min1(T const x) { 416 | if constexpr (T::elementtype() <= 3) { // boolean vector type 417 | return horizontal_and(x); 418 | } 419 | else if constexpr (sizeof(T) >= 32) { 420 | // split recursively into smaller vectors 421 
| return horizontal_min1(min(x.get_low(), x.get_high())); 422 | } 423 | else if constexpr (T::size() == 2) { 424 | T a = permute2 <1, V_DC>(x); // high half 425 | T b = min(a, x); 426 | return b[0]; 427 | } 428 | else if constexpr (T::size() == 4) { 429 | T a = permute4<2, 3, V_DC, V_DC>(x); // high half 430 | T b = min(a, x); 431 | a = permute4<1, V_DC, V_DC, V_DC>(b); 432 | b = min(a, b); 433 | return b[0]; 434 | } 435 | else if constexpr (T::size() == 8) { 436 | T a = permute8<4, 5, 6, 7, V_DC, V_DC, V_DC, V_DC>(x); // high half 437 | T b = min(a, x); 438 | a = permute8<2, 3, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC>(b); 439 | b = min(a, b); 440 | a = permute8<1, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC>(b); 441 | b = min(a, b); 442 | return b[0]; 443 | } 444 | else { 445 | static_assert(T::size() == 16); // no other size is allowed 446 | T a = permute16<8, 9, 10, 11, 12, 13, 14, 15, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC >(x); // high half 447 | T b = min(a, x); 448 | a = permute16<4, 5, 6, 7, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC>(b); 449 | b = min(a, b); 450 | a = permute16<2, 3, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC>(b); 451 | b = min(a, b); 452 | a = permute16<1, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC>(b); 453 | b = min(a, b); 454 | return b[0]; 455 | } 456 | } 457 | 458 | template auto horizontal_max(T const x) { 459 | if constexpr ((T::elementtype() & 16) != 0) { 460 | // T is a float or double vector 461 | if (horizontal_or(is_nan(x))) { 462 | // check for NAN because max does not guarantee NAN propagation 463 | return x[horizontal_find_first(is_nan(x))]; 464 | } 465 | } 466 | return horizontal_max1(x); 467 | } 468 | 469 | template auto horizontal_max1(T const x) { 470 | if constexpr (T::elementtype() <= 3) { // boolean vector type 471 | return horizontal_or(x); 472 | } 473 | else if constexpr (sizeof(T) >= 32) { 474 | // 
split recursively into smaller vectors 475 | return horizontal_max1(max(x.get_low(), x.get_high())); 476 | } 477 | else if constexpr (T::size() == 2) { 478 | T a = permute2 <1, V_DC>(x); // high half 479 | T b = max(a, x); 480 | return b[0]; 481 | } 482 | else if constexpr (T::size() == 4) { 483 | T a = permute4<2, 3, V_DC, V_DC>(x); // high half 484 | T b = max(a, x); 485 | a = permute4<1, V_DC, V_DC, V_DC>(b); 486 | b = max(a, b); 487 | return b[0]; 488 | } 489 | else if constexpr (T::size() == 8) { 490 | T a = permute8<4, 5, 6, 7, V_DC, V_DC, V_DC, V_DC>(x); // high half 491 | T b = max(a, x); 492 | a = permute8<2, 3, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC>(b); 493 | b = max(a, b); 494 | a = permute8<1, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC>(b); 495 | b = max(a, b); 496 | return b[0]; 497 | } 498 | else { 499 | static_assert(T::size() == 16); // no other size is allowed 500 | T a = permute16<8, 9, 10, 11, 12, 13, 14, 15, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC >(x); // high half 501 | T b = max(a, x); 502 | a = permute16<4, 5, 6, 7, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC>(b); 503 | b = max(a, b); 504 | a = permute16<2, 3, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC>(b); 505 | b = max(a, b); 506 | a = permute16<1, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC>(b); 507 | b = max(a, b); 508 | return b[0]; 509 | } 510 | } 511 | 512 | // Find first element that is true in a boolean vector 513 | template 514 | static inline int horizontal_find_first(V const x) { 515 | static_assert(V::elementtype() == 2 || V::elementtype() == 3, "Boolean vector expected"); 516 | auto bits = to_bits(x); // convert to bits 517 | if (bits == 0) return -1; 518 | if constexpr (V::size() < 32) { 519 | return bit_scan_forward((uint32_t)bits); 520 | } 521 | else { 522 | return bit_scan_forward(bits); 523 | } 524 | } 525 | 526 | // Count the number of elements that are true 
in a boolean vector 527 | template 528 | static inline int horizontal_count(V const x) { 529 | static_assert(V::elementtype() == 2 || V::elementtype() == 3, "Boolean vector expected"); 530 | auto bits = to_bits(x); // convert to bits 531 | if constexpr (V::size() < 32) { 532 | return vml_popcnt((uint32_t)bits); 533 | } 534 | else { 535 | return (int)vml_popcnt(bits); 536 | } 537 | } 538 | 539 | // maximum and minimum functions. This version is sure to propagate NANs, 540 | // conforming to the new IEEE-754 2019 standard 541 | template 542 | static inline V maximum(V const a, V const b) { 543 | if constexpr (V::elementtype() < 16) { 544 | return max(a, b); // integer type 545 | } 546 | else { // float or double vector 547 | V y = select(is_nan(a), a, max(a, b)); 548 | #ifdef SIGNED_ZERO // pedantic about signed zero 549 | y = select(a == b, a & b, y); // maximum(+0, -0) = +0 550 | #endif 551 | return y; 552 | } 553 | } 554 | 555 | template 556 | static inline V minimum(V const a, V const b) { 557 | if constexpr (V::elementtype() < 16) { 558 | return min(a, b); // integer type 559 | } 560 | else { // float or double vector 561 | V y = select(is_nan(a), a, min(a, b)); 562 | #ifdef SIGNED_ZERO // pedantic about signed zero 563 | y = select(a == b, a | b, y); // minimum(+0, -0) = -0 564 | #endif 565 | return y; 566 | } 567 | } 568 | 569 | 570 | #ifdef VCL_NAMESPACE 571 | } 572 | #endif 573 | 574 | #endif // VECTOR_CONVERT_H 575 | -------------------------------------------------------------------------------- /CAS/VCL2/vectorclass.h: -------------------------------------------------------------------------------- 1 | /**************************** vectorclass.h ******************************** 2 | * Author: Agner Fog 3 | * Date created: 2012-05-30 4 | * Last modified: 2020-04-11 5 | * Version: 2.01.02 6 | * Project: vector class library 7 | * Home: https://github.com/vectorclass 8 | * Description: 9 | * Header file defining vector classes as interface to intrinsic 
functions 10 | * in x86 and x86-64 microprocessors with SSE2 and later instruction sets. 11 | * 12 | * Instructions: 13 | * Use Gnu, Clang, Intel or Microsoft C++ compiler. Compile for the desired 14 | * instruction set, which must be at least SSE2. Specify the supported 15 | * instruction set by a command line define, e.g. __SSE4_1__ if the 16 | * compiler does not automatically do so. 17 | * For detailed instructions, see vcl_manual.pdf 18 | * 19 | * Each vector object is represented internally in the CPU as a vector 20 | * register with 128, 256 or 512 bits. 21 | * 22 | * This header file includes the appropriate header files depending on the 23 | * selected instruction set. 24 | * 25 | * (c) Copyright 2012-2020 Agner Fog. 26 | * Apache License version 2.0 or later. 27 | ******************************************************************************/ 28 | #ifndef VECTORCLASS_H 29 | #define VECTORCLASS_H 20102 30 | 31 | // Maximum vector size, bits. Allowed values are 128, 256, 512 32 | #ifndef MAX_VECTOR_SIZE 33 | #define MAX_VECTOR_SIZE 512 34 | #endif 35 | 36 | // Determine instruction set, and define platform-dependent functions 37 | #include "instrset.h" // Select supported instruction set 38 | 39 | #if INSTRSET < 2 // instruction set SSE2 is the minimum 40 | #error Please compile for the SSE2 instruction set or higher 41 | #else 42 | 43 | // Select appropriate .h files depending on instruction set 44 | #include "vectori128.h" // 128-bit integer vectors 45 | #include "vectorf128.h" // 128-bit floating point vectors 46 | 47 | #if MAX_VECTOR_SIZE >= 256 48 | #if INSTRSET >= 8 49 | #include "vectori256.h" // 256-bit integer vectors, requires AVX2 instruction set 50 | #else 51 | #include "vectori256e.h" // 256-bit integer vectors, emulated 52 | #endif // INSTRSET >= 8 53 | #if INSTRSET >= 7 54 | #include "vectorf256.h" // 256-bit floating point vectors, requires AVX instruction set 55 | #else 56 | #include "vectorf256e.h" // 256-bit floating point vectors, 
emulated 57 | #endif // INSTRSET >= 7 58 | #endif // MAX_VECTOR_SIZE >= 256 59 | 60 | #if MAX_VECTOR_SIZE >= 512 61 | #if INSTRSET >= 9 62 | #include "vectori512.h" // 512-bit vectors of 32 and 64 bit integers, requires AVX512F instruction set 63 | #include "vectorf512.h" // 512-bit floating point vectors, requires AVX512F instruction set 64 | #else 65 | #include "vectori512e.h" // 512-bit integer vectors, emulated 66 | #include "vectorf512e.h" // 512-bit floating point vectors, emulated 67 | #endif // INSTRSET >= 9 68 | #if INSTRSET >= 10 69 | #include "vectori512s.h" // 512-bit vectors of 8 and 16 bit integers, requires AVX512BW instruction set 70 | #else 71 | #include "vectori512se.h" // 512-bit vectors of 8 and 16 bit integers, emulated 72 | #endif 73 | #endif // MAX_VECTOR_SIZE >= 512 74 | 75 | #include "vector_convert.h" // conversion between different vector sizes 76 | 77 | #endif // INSTRSET >= 2 78 | 79 | 80 | #else // VECTORCLASS_H 81 | 82 | #if VECTORCLASS_H < 20000 83 | #error Mixed versions of vector class library 84 | #endif 85 | 86 | #endif // VECTORCLASS_H 87 | -------------------------------------------------------------------------------- /CAS/VCL2/vectormath_common.h: -------------------------------------------------------------------------------- 1 | /*************************** vectormath_common.h **************************** 2 | * Author: Agner Fog 3 | * Date created: 2014-04-18 4 | * Last modified: 2020-06-08 5 | * Version: 2.01.03 6 | * Project: vector classes 7 | * Description: 8 | * Header file containing common code for inline version of mathematical functions. 9 | * 10 | * For detailed instructions, see VectorClass.pdf 11 | * 12 | * (c) Copyright 2014-2020 Agner Fog. 13 | * Apache License version 2.0 or later. 14 | ******************************************************************************/ 15 | 16 | #ifndef VECTORMATH_COMMON_H 17 | #define VECTORMATH_COMMON_H 2 18 | 19 | #ifdef VECTORMATH_LIB_H 20 | #error conflicting header files. 
More than one implementation of mathematical functions included 21 | #endif 22 | 23 | #include 24 | 25 | #ifndef VECTORCLASS_H 26 | #include "vectorclass.h" 27 | #endif 28 | 29 | #if VECTORCLASS_H < 20000 30 | #error Incompatible versions of vector class library mixed 31 | #endif 32 | 33 | 34 | /****************************************************************************** 35 | Define NAN payload values 36 | ******************************************************************************/ 37 | #define NAN_LOG 0x101 // logarithm for x<0 38 | #define NAN_POW 0x102 // negative number raised to non-integer power 39 | #define NAN_HYP 0x104 // acosh for x<1 and atanh for abs(x)>1 40 | 41 | 42 | /****************************************************************************** 43 | Define mathematical constants 44 | ******************************************************************************/ 45 | #define VM_PI 3.14159265358979323846 // pi 46 | #define VM_PI_2 1.57079632679489661923 // pi / 2 47 | #define VM_PI_4 0.785398163397448309616 // pi / 4 48 | #define VM_SQRT2 1.41421356237309504880 // sqrt(2) 49 | #define VM_LOG2E 1.44269504088896340736 // 1/log(2) 50 | #define VM_LOG10E 0.434294481903251827651 // 1/log(10) 51 | #define VM_LOG210 3.321928094887362347808 // log2(10) 52 | #define VM_LN2 0.693147180559945309417 // log(2) 53 | #define VM_LN10 2.30258509299404568402 // log(10) 54 | #define VM_SMALLEST_NORMAL 2.2250738585072014E-308 // smallest normal number, double 55 | #define VM_SMALLEST_NORMALF 1.17549435E-38f // smallest normal number, float 56 | 57 | 58 | #ifdef VCL_NAMESPACE 59 | namespace VCL_NAMESPACE { 60 | #endif 61 | 62 | /****************************************************************************** 63 | templates for producing infinite and nan in desired vector type 64 | ******************************************************************************/ 65 | template 66 | static inline VTYPE infinite_vec(); 67 | 68 | template <> 69 | inline Vec2d infinite_vec() 
{ 70 | return infinite2d(); 71 | } 72 | 73 | template <> 74 | inline Vec4f infinite_vec() { 75 | return infinite4f(); 76 | } 77 | 78 | #if MAX_VECTOR_SIZE >= 256 79 | 80 | template <> 81 | inline Vec4d infinite_vec() { 82 | return infinite4d(); 83 | } 84 | 85 | template <> 86 | inline Vec8f infinite_vec() { 87 | return infinite8f(); 88 | } 89 | 90 | #endif // MAX_VECTOR_SIZE >= 256 91 | 92 | #if MAX_VECTOR_SIZE >= 512 93 | 94 | template <> 95 | inline Vec8d infinite_vec() { 96 | return infinite8d(); 97 | } 98 | 99 | template <> 100 | inline Vec16f infinite_vec() { 101 | return infinite16f(); 102 | } 103 | 104 | #endif // MAX_VECTOR_SIZE >= 512 105 | 106 | 107 | 108 | /****************************************************************************** 109 | * Detect NAN codes 110 | * 111 | * These functions return the code hidden in a NAN. The sign bit is ignored 112 | ******************************************************************************/ 113 | 114 | static inline Vec4ui nan_code(Vec4f const x) { 115 | Vec4ui a = Vec4ui(reinterpret_i(x)); 116 | Vec4ui const n = 0x007FFFFF; 117 | return select(Vec4ib(is_nan(x)), a & n, 0); 118 | } 119 | 120 | // This function returns the code hidden in a NAN. The sign bit is ignored 121 | static inline Vec2uq nan_code(Vec2d const x) { 122 | Vec2uq a = Vec2uq(reinterpret_i(x)); 123 | return select(Vec2qb(is_nan(x)), a << 12 >> (12+29), 0); 124 | } 125 | 126 | #if MAX_VECTOR_SIZE >= 256 127 | 128 | // This function returns the code hidden in a NAN. The sign bit is ignored 129 | static inline Vec8ui nan_code(Vec8f const x) { 130 | Vec8ui a = Vec8ui(reinterpret_i(x)); 131 | Vec8ui const n = 0x007FFFFF; 132 | return select(Vec8ib(is_nan(x)), a & n, 0); 133 | } 134 | 135 | // This function returns the code hidden in a NAN. 
The sign bit is ignored 136 | static inline Vec4uq nan_code(Vec4d const x) { 137 | Vec4uq a = Vec4uq(reinterpret_i(x)); 138 | return select(Vec4qb(is_nan(x)), a << 12 >> (12+29), 0); 139 | } 140 | 141 | #endif // MAX_VECTOR_SIZE >= 256 142 | #if MAX_VECTOR_SIZE >= 512 143 | 144 | // This function returns the code hidden in a NAN. The sign bit is ignored 145 | static inline Vec16ui nan_code(Vec16f const x) { 146 | Vec16ui a = Vec16ui(reinterpret_i(x)); 147 | Vec16ui const n = 0x007FFFFF; 148 | return select(Vec16ib(is_nan(x)), a & n, 0); 149 | } 150 | 151 | // This function returns the code hidden in a NAN. The sign bit is ignored 152 | static inline Vec8uq nan_code(Vec8d const x) { 153 | Vec8uq a = Vec8uq(reinterpret_i(x)); 154 | return select(Vec8qb(is_nan(x)), a << 12 >> (12+29), 0); 155 | } 156 | 157 | #endif // MAX_VECTOR_SIZE >= 512 158 | 159 | 160 | /****************************************************************************** 161 | templates for polynomials 162 | Using Estrin's scheme to make shorter dependency chains and use FMA, starting 163 | longest dependency chains first. 
164 | ******************************************************************************/ 165 | 166 | // template 167 | template 168 | static inline VTYPE polynomial_2(VTYPE const x, CTYPE c0, CTYPE c1, CTYPE c2) { 169 | // calculates polynomial c2*x^2 + c1*x + c0 170 | // VTYPE may be a vector type, CTYPE is a scalar type 171 | VTYPE x2 = x * x; 172 | //return = x2 * c2 + (x * c1 + c0); 173 | return mul_add(x2, c2, mul_add(x, c1, c0)); 174 | } 175 | 176 | template 177 | static inline VTYPE polynomial_3(VTYPE const x, CTYPE c0, CTYPE c1, CTYPE c2, CTYPE c3) { 178 | // calculates polynomial c3*x^3 + c2*x^2 + c1*x + c0 179 | // VTYPE may be a vector type, CTYPE is a scalar type 180 | VTYPE x2 = x * x; 181 | //return (c2 + c3*x)*x2 + (c1*x + c0); 182 | return mul_add(mul_add(c3, x, c2), x2, mul_add(c1, x, c0)); 183 | } 184 | 185 | template 186 | static inline VTYPE polynomial_4(VTYPE const x, CTYPE c0, CTYPE c1, CTYPE c2, CTYPE c3, CTYPE c4) { 187 | // calculates polynomial c4*x^4 + c3*x^3 + c2*x^2 + c1*x + c0 188 | // VTYPE may be a vector type, CTYPE is a scalar type 189 | VTYPE x2 = x * x; 190 | VTYPE x4 = x2 * x2; 191 | //return (c2+c3*x)*x2 + ((c0+c1*x) + c4*x4); 192 | return mul_add(mul_add(c3, x, c2), x2, mul_add(c1, x, c0) + c4*x4); 193 | } 194 | 195 | template 196 | static inline VTYPE polynomial_4n(VTYPE const x, CTYPE c0, CTYPE c1, CTYPE c2, CTYPE c3) { 197 | // calculates polynomial 1*x^4 + c3*x^3 + c2*x^2 + c1*x + c0 198 | // VTYPE may be a vector type, CTYPE is a scalar type 199 | VTYPE x2 = x * x; 200 | VTYPE x4 = x2 * x2; 201 | //return (c2+c3*x)*x2 + ((c0+c1*x) + x4); 202 | return mul_add(mul_add(c3, x, c2), x2, mul_add(c1, x, c0) + x4); 203 | } 204 | 205 | template 206 | static inline VTYPE polynomial_5(VTYPE const x, CTYPE c0, CTYPE c1, CTYPE c2, CTYPE c3, CTYPE c4, CTYPE c5) { 207 | // calculates polynomial c5*x^5 + c4*x^4 + c3*x^3 + c2*x^2 + c1*x + c0 208 | // VTYPE may be a vector type, CTYPE is a scalar type 209 | VTYPE x2 = x * x; 210 | VTYPE x4 = 
x2 * x2; 211 | //return (c2+c3*x)*x2 + ((c4+c5*x)*x4 + (c0+c1*x)); 212 | return mul_add(mul_add(c3, x, c2), x2, mul_add(mul_add(c5, x, c4), x4, mul_add(c1, x, c0))); 213 | } 214 | 215 | template 216 | static inline VTYPE polynomial_5n(VTYPE const x, CTYPE c0, CTYPE c1, CTYPE c2, CTYPE c3, CTYPE c4) { 217 | // calculates polynomial 1*x^5 + c4*x^4 + c3*x^3 + c2*x^2 + c1*x + c0 218 | // VTYPE may be a vector type, CTYPE is a scalar type 219 | VTYPE x2 = x * x; 220 | VTYPE x4 = x2 * x2; 221 | //return (c2+c3*x)*x2 + ((c4+x)*x4 + (c0+c1*x)); 222 | return mul_add(mul_add(c3, x, c2), x2, mul_add(c4 + x, x4, mul_add(c1, x, c0))); 223 | } 224 | 225 | template 226 | static inline VTYPE polynomial_6(VTYPE const x, CTYPE c0, CTYPE c1, CTYPE c2, CTYPE c3, CTYPE c4, CTYPE c5, CTYPE c6) { 227 | // calculates polynomial c6*x^6 + c5*x^5 + c4*x^4 + c3*x^3 + c2*x^2 + c1*x + c0 228 | // VTYPE may be a vector type, CTYPE is a scalar type 229 | VTYPE x2 = x * x; 230 | VTYPE x4 = x2 * x2; 231 | //return (c4+c5*x+c6*x2)*x4 + ((c2+c3*x)*x2 + (c0+c1*x)); 232 | return mul_add(mul_add(c6, x2, mul_add(c5, x, c4)), x4, mul_add(mul_add(c3, x, c2), x2, mul_add(c1, x, c0))); 233 | } 234 | 235 | template 236 | static inline VTYPE polynomial_6n(VTYPE const x, CTYPE c0, CTYPE c1, CTYPE c2, CTYPE c3, CTYPE c4, CTYPE c5) { 237 | // calculates polynomial 1*x^6 + c5*x^5 + c4*x^4 + c3*x^3 + c2*x^2 + c1*x + c0 238 | // VTYPE may be a vector type, CTYPE is a scalar type 239 | VTYPE x2 = x * x; 240 | VTYPE x4 = x2 * x2; 241 | //return (c4+c5*x+x2)*x4 + ((c2+c3*x)*x2 + (c0+c1*x)); 242 | return mul_add(mul_add(c5, x, c4 + x2), x4, mul_add(mul_add(c3, x, c2), x2, mul_add(c1, x, c0))); 243 | } 244 | 245 | template 246 | static inline VTYPE polynomial_7(VTYPE const x, CTYPE c0, CTYPE c1, CTYPE c2, CTYPE c3, CTYPE c4, CTYPE c5, CTYPE c6, CTYPE c7) { 247 | // calculates polynomial c7*x^7 + c6*x^6 + c5*x^5 + c4*x^4 + c3*x^3 + c2*x^2 + c1*x + c0 248 | // VTYPE may be a vector type, CTYPE is a scalar type 249 | VTYPE 
x2 = x * x; 250 | VTYPE x4 = x2 * x2; 251 | //return ((c6+c7*x)*x2 + (c4+c5*x))*x4 + ((c2+c3*x)*x2 + (c0+c1*x)); 252 | return mul_add(mul_add(mul_add(c7, x, c6), x2, mul_add(c5, x, c4)), x4, mul_add(mul_add(c3, x, c2), x2, mul_add(c1, x, c0))); 253 | } 254 | 255 | template 256 | static inline VTYPE polynomial_8(VTYPE const x, CTYPE c0, CTYPE c1, CTYPE c2, CTYPE c3, CTYPE c4, CTYPE c5, CTYPE c6, CTYPE c7, CTYPE c8) { 257 | // calculates polynomial c8*x^8 + c7*x^7 + c6*x^6 + c5*x^5 + c4*x^4 + c3*x^3 + c2*x^2 + c1*x + c0 258 | // VTYPE may be a vector type, CTYPE is a scalar type 259 | VTYPE x2 = x * x; 260 | VTYPE x4 = x2 * x2; 261 | VTYPE x8 = x4 * x4; 262 | //return ((c6+c7*x)*x2 + (c4+c5*x))*x4 + (c8*x8 + (c2+c3*x)*x2 + (c0+c1*x)); 263 | return mul_add(mul_add(mul_add(c7, x, c6), x2, mul_add(c5, x, c4)), x4, 264 | mul_add(mul_add(c3, x, c2), x2, mul_add(c1, x, c0) + c8*x8)); 265 | } 266 | 267 | template 268 | static inline VTYPE polynomial_9(VTYPE const x, CTYPE c0, CTYPE c1, CTYPE c2, CTYPE c3, CTYPE c4, CTYPE c5, CTYPE c6, CTYPE c7, CTYPE c8, CTYPE c9) { 269 | // calculates polynomial c9*x^9 + c8*x^8 + c7*x^7 + c6*x^6 + c5*x^5 + c4*x^4 + c3*x^3 + c2*x^2 + c1*x + c0 270 | // VTYPE may be a vector type, CTYPE is a scalar type 271 | VTYPE x2 = x * x; 272 | VTYPE x4 = x2 * x2; 273 | VTYPE x8 = x4 * x4; 274 | //return (((c6+c7*x)*x2 + (c4+c5*x))*x4 + (c8+c9*x)*x8) + ((c2+c3*x)*x2 + (c0+c1*x)); 275 | return mul_add(mul_add(c9, x, c8), x8, mul_add( 276 | mul_add(mul_add(c7, x, c6), x2, mul_add(c5, x, c4)), x4, 277 | mul_add(mul_add(c3, x, c2), x2, mul_add(c1, x, c0)))); 278 | } 279 | 280 | template 281 | static inline VTYPE polynomial_10(VTYPE const x, CTYPE c0, CTYPE c1, CTYPE c2, CTYPE c3, CTYPE c4, CTYPE c5, CTYPE c6, CTYPE c7, CTYPE c8, CTYPE c9, CTYPE c10) { 282 | // calculates polynomial c10*x^10 + c9*x^9 + c8*x^8 + c7*x^7 + c6*x^6 + c5*x^5 + c4*x^4 + c3*x^3 + c2*x^2 + c1*x + c0 283 | // VTYPE may be a vector type, CTYPE is a scalar type 284 | VTYPE x2 = x * x; 
285 | VTYPE x4 = x2 * x2; 286 | VTYPE x8 = x4 * x4; 287 | //return (((c6+c7*x)*x2 + (c4+c5*x))*x4 + (c8+c9*x+c10*x2)*x8) + ((c2+c3*x)*x2 + (c0+c1*x)); 288 | return mul_add(mul_add(x2, c10, mul_add(c9, x, c8)), x8, 289 | mul_add(mul_add(mul_add(c7, x, c6), x2, mul_add(c5, x, c4)), x4, 290 | mul_add(mul_add(c3, x, c2), x2, mul_add(c1, x, c0)))); 291 | } 292 | 293 | template 294 | static inline VTYPE polynomial_13(VTYPE const x, CTYPE c0, CTYPE c1, CTYPE c2, CTYPE c3, CTYPE c4, CTYPE c5, CTYPE c6, CTYPE c7, CTYPE c8, CTYPE c9, CTYPE c10, CTYPE c11, CTYPE c12, CTYPE c13) { 295 | // calculates polynomial c13*x^13 + c12*x^12 + ... + c1*x + c0 296 | // VTYPE may be a vector type, CTYPE is a scalar type 297 | VTYPE x2 = x * x; 298 | VTYPE x4 = x2 * x2; 299 | VTYPE x8 = x4 * x4; 300 | return mul_add( 301 | mul_add( 302 | mul_add(c13, x, c12), x4, 303 | mul_add(mul_add(c11, x, c10), x2, mul_add(c9, x, c8))), x8, 304 | mul_add( 305 | mul_add(mul_add(c7, x, c6), x2, mul_add(c5, x, c4)), x4, 306 | mul_add(mul_add(c3, x, c2), x2, mul_add(c1, x, c0)))); 307 | } 308 | 309 | 310 | template 311 | static inline VTYPE polynomial_13m(VTYPE const x, CTYPE c2, CTYPE c3, CTYPE c4, CTYPE c5, CTYPE c6, CTYPE c7, CTYPE c8, CTYPE c9, CTYPE c10, CTYPE c11, CTYPE c12, CTYPE c13) { 312 | // calculates polynomial c13*x^13 + c12*x^12 + ... 
+ x + 0 313 | // VTYPE may be a vector type, CTYPE is a scalar type 314 | VTYPE x2 = x * x; 315 | VTYPE x4 = x2 * x2; 316 | VTYPE x8 = x4 * x4; 317 | // return ((c8+c9*x) + (c10+c11*x)*x2 + (c12+c13*x)*x4)*x8 + (((c6+c7*x)*x2 + (c4+c5*x))*x4 + ((c2+c3*x)*x2 + x)); 318 | return mul_add( 319 | mul_add(mul_add(c13, x, c12), x4, mul_add(mul_add(c11, x, c10), x2, mul_add(c9, x, c8))), x8, 320 | mul_add(mul_add(mul_add(c7, x, c6), x2, mul_add(c5, x, c4)), x4, mul_add(mul_add(c3, x, c2), x2, x))); 321 | } 322 | 323 | #ifdef VCL_NAMESPACE 324 | } 325 | #endif 326 | 327 | #endif 328 | -------------------------------------------------------------------------------- /CAS/VCL2/vectormath_hyp.h: -------------------------------------------------------------------------------- 1 | /**************************** vectormath_hyp.h ****************************** 2 | * Author: Agner Fog 3 | * Date created: 2014-07-09 4 | * Last modified: 2019-08-01 5 | * Version: 2.00.00 6 | * Project: vector class library 7 | * Description: 8 | * Header file containing inline vector functions of hyperbolic and inverse 9 | * hyperbolic functions: 10 | * sinh hyperbolic sine 11 | * cosh hyperbolic cosine 12 | * tanh hyperbolic tangent 13 | * asinh inverse hyperbolic sine 14 | * acosh inverse hyperbolic cosine 15 | * atanh inverse hyperbolic tangent 16 | * 17 | * Theory, methods and inspiration based partially on these sources: 18 | * > Moshier, Stephen Lloyd Baluk: Methods and programs for mathematical functions. 19 | * Ellis Horwood, 1989. 20 | * > VDT library developed on CERN by Danilo Piparo, Thomas Hauth and 21 | * Vincenzo Innocente, 2012, https://svnweb.cern.ch/trac/vdt 22 | * > Cephes math library by Stephen L. Moshier 1992, 23 | * http://www.netlib.org/cephes/ 24 | * 25 | * For detailed instructions, see vectormath_common.h and vcl_manual.pdf 26 | * 27 | * (c) Copyright 2014-2019 Agner Fog. 28 | * Apache License version 2.0 or later. 
29 | ******************************************************************************/ 30 | 31 | #ifndef VECTORMATH_HYP_H 32 | #define VECTORMATH_HYP_H 1 33 | 34 | #include "vectormath_exp.h" 35 | 36 | #ifdef VCL_NAMESPACE 37 | namespace VCL_NAMESPACE { 38 | #endif 39 | 40 | /****************************************************************************** 41 | * Hyperbolic functions 42 | ******************************************************************************/ 43 | 44 | // Template for sinh function, double precision 45 | // This function does not produce denormals 46 | // Template parameters: 47 | // VTYPE: double vector type 48 | template 49 | static inline VTYPE sinh_d(VTYPE const x0) { 50 | // The limit of abs(x) is 709.7, as defined by max_x in vectormath_exp.h for 0.5*exp(x). 51 | 52 | // Coefficients 53 | const double p0 = -3.51754964808151394800E5; 54 | const double p1 = -1.15614435765005216044E4; 55 | const double p2 = -1.63725857525983828727E2; 56 | const double p3 = -7.89474443963537015605E-1; 57 | 58 | const double q0 = -2.11052978884890840399E6; 59 | const double q1 = 3.61578279834431989373E4; 60 | const double q2 = -2.77711081420602794433E2; 61 | const double q3 = 1.0; 62 | 63 | // data vectors 64 | VTYPE x, x2, y1, y2; 65 | 66 | x = abs(x0); 67 | auto x_small = x <= 1.0; // use Pade approximation if abs(x) <= 1 68 | 69 | if (horizontal_or(x_small)) { 70 | // At least one element needs small method 71 | x2 = x*x; 72 | y1 = polynomial_3(x2, p0, p1, p2, p3) / polynomial_3(x2, q0, q1, q2, q3); 73 | y1 = mul_add(y1, x*x2, x); // y1 = x + x2*(x*y1); 74 | } 75 | if (!horizontal_and(x_small)) { 76 | // At least one element needs big method 77 | y2 = exp_d(x); // 0.5 * exp(x) 78 | y2 -= 0.25 / y2; // - 0.5 * exp(-x) 79 | } 80 | y1 = select(x_small, y1, y2); // choose method 81 | y1 = sign_combine(y1, x0); // get original sign 82 | // you can avoid the sign_combine by replacing x by x0 above, but at a loss of precision 83 | 84 | return y1; 85 | } 86 | 87 | 
// instances of sinh_d template 88 | static inline Vec2d sinh(Vec2d const x) { 89 | return sinh_d(x); 90 | } 91 | 92 | #if MAX_VECTOR_SIZE >= 256 93 | static inline Vec4d sinh(Vec4d const x) { 94 | return sinh_d(x); 95 | } 96 | #endif // MAX_VECTOR_SIZE >= 256 97 | 98 | #if MAX_VECTOR_SIZE >= 512 99 | static inline Vec8d sinh(Vec8d const x) { 100 | return sinh_d(x); 101 | } 102 | #endif // MAX_VECTOR_SIZE >= 512 103 | 104 | 105 | // Template for sinh function, single precision 106 | // This function does not produce denormals 107 | // Template parameters: 108 | // VTYPE: double vector type 109 | template 110 | static inline VTYPE sinh_f(VTYPE const x0) { 111 | // The limit of abs(x) is 89.0, as defined by max_x in vectormath_exp.h for 0.5*exp(x). 112 | 113 | // Coefficients 114 | const float r0 = 1.66667160211E-1f; 115 | const float r1 = 8.33028376239E-3f; 116 | const float r2 = 2.03721912945E-4f; 117 | 118 | // data vectors 119 | VTYPE x, x2, y1, y2; 120 | 121 | x = abs(x0); 122 | auto x_small = x <= 1.0f; // use polynomial approximation if abs(x) <= 1 123 | 124 | if (horizontal_or(x_small)) { 125 | // At least one element needs small method 126 | x2 = x*x; 127 | y1 = polynomial_2(x2, r0, r1, r2); 128 | y1 = mul_add(y1, x2*x, x); // y1 = x + x2*(x*y1); 129 | } 130 | if (!horizontal_and(x_small)) { 131 | // At least one element needs big method 132 | y2 = exp_f(x); // 0.5 * exp(x) 133 | y2 -= 0.25f / y2; // - 0.5 * exp(-x) 134 | } 135 | y1 = select(x_small, y1, y2); // choose method 136 | y1 = sign_combine(y1, x0); // get original sign 137 | // you can avoid the sign_combine by replacing x by x0 above, but at a loss of precision 138 | 139 | return y1; 140 | } 141 | 142 | // instances of sinh_f template 143 | static inline Vec4f sinh(Vec4f const x) { 144 | return sinh_f(x); 145 | } 146 | 147 | #if MAX_VECTOR_SIZE >= 256 148 | static inline Vec8f sinh(Vec8f const x) { 149 | return sinh_f(x); 150 | } 151 | #endif // MAX_VECTOR_SIZE >= 256 152 | 153 | #if 
MAX_VECTOR_SIZE >= 512 154 | static inline Vec16f sinh(Vec16f const x) { 155 | return sinh_f(x); 156 | } 157 | #endif // MAX_VECTOR_SIZE >= 512 158 | 159 | 160 | // Template for cosh function, double precision 161 | // This function does not produce denormals 162 | // Template parameters: 163 | // VTYPE: double vector type 164 | template 165 | static inline VTYPE cosh_d(VTYPE const x0) { 166 | // The limit of abs(x) is 709.7, as defined by max_x in vectormath_exp.h for 0.5*exp(x). 167 | 168 | // data vectors 169 | VTYPE x, y; 170 | x = abs(x0); 171 | y = exp_d(x); // 0.5 * exp(x) 172 | y += 0.25 / y; // + 0.5 * exp(-x) 173 | return y; 174 | } 175 | 176 | // instances of sinh_d template 177 | static inline Vec2d cosh(Vec2d const x) { 178 | return cosh_d(x); 179 | } 180 | 181 | #if MAX_VECTOR_SIZE >= 256 182 | static inline Vec4d cosh(Vec4d const x) { 183 | return cosh_d(x); 184 | } 185 | #endif // MAX_VECTOR_SIZE >= 256 186 | 187 | #if MAX_VECTOR_SIZE >= 512 188 | static inline Vec8d cosh(Vec8d const x) { 189 | return cosh_d(x); 190 | } 191 | #endif // MAX_VECTOR_SIZE >= 512 192 | 193 | 194 | // Template for cosh function, single precision 195 | // This function does not produce denormals 196 | // Template parameters: 197 | // VTYPE: double vector type 198 | template 199 | static inline VTYPE cosh_f(VTYPE const x0) { 200 | // The limit of abs(x) is 89.0, as defined by max_x in vectormath_exp.h for 0.5*exp(x). 
201 | 202 | // data vectors 203 | VTYPE x, y; 204 | x = abs(x0); 205 | y = exp_f(x); // 0.5 * exp(x) 206 | y += 0.25f / y; // + 0.5 * exp(-x) 207 | return y; 208 | } 209 | 210 | // instances of sinh_d template 211 | static inline Vec4f cosh(Vec4f const x) { 212 | return cosh_f(x); 213 | } 214 | 215 | #if MAX_VECTOR_SIZE >= 256 216 | static inline Vec8f cosh(Vec8f const x) { 217 | return cosh_f(x); 218 | } 219 | #endif // MAX_VECTOR_SIZE >= 256 220 | 221 | #if MAX_VECTOR_SIZE >= 512 222 | static inline Vec16f cosh(Vec16f const x) { 223 | return cosh_f(x); 224 | } 225 | #endif // MAX_VECTOR_SIZE >= 512 226 | 227 | 228 | // Template for tanh function, double precision 229 | // This function does not produce denormals 230 | // Template parameters: 231 | // VTYPE: double vector type 232 | template 233 | static inline VTYPE tanh_d(VTYPE const x0) { 234 | 235 | // Coefficients 236 | const double p0 = -1.61468768441708447952E3; 237 | const double p1 = -9.92877231001918586564E1; 238 | const double p2 = -9.64399179425052238628E-1; 239 | 240 | const double q0 = 4.84406305325125486048E3; 241 | const double q1 = 2.23548839060100448583E3; 242 | const double q2 = 1.12811678491632931402E2; 243 | const double q3 = 1.0; 244 | 245 | // data vectors 246 | VTYPE x, x2, y1, y2; 247 | 248 | x = abs(x0); 249 | auto x_small = x <= 0.625; // use Pade approximation if abs(x) <= 5/8 250 | 251 | if (horizontal_or(x_small)) { 252 | // At least one element needs small method 253 | x2 = x*x; 254 | y1 = polynomial_2(x2, p0, p1, p2) / polynomial_3(x2, q0, q1, q2, q3); 255 | y1 = mul_add(y1, x2*x, x); // y1 = x + x2*(x*y1); 256 | } 257 | if (!horizontal_and(x_small)) { 258 | // At least one element needs big method 259 | y2 = exp(x+x); // exp(2*x) 260 | y2 = 1.0 - 2.0 / (y2 + 1.0); // tanh(x) 261 | } 262 | auto x_big = x > 350.; 263 | y1 = select(x_small, y1, y2); // choose method 264 | y1 = select(x_big, 1.0, y1); // avoid overflow 265 | y1 = sign_combine(y1, x0); // get original sign 266 | return 
y1; 267 | } 268 | 269 | // instances of tanh_d template 270 | static inline Vec2d tanh(Vec2d const x) { 271 | return tanh_d(x); 272 | } 273 | 274 | #if MAX_VECTOR_SIZE >= 256 275 | static inline Vec4d tanh(Vec4d const x) { 276 | return tanh_d(x); 277 | } 278 | #endif // MAX_VECTOR_SIZE >= 256 279 | 280 | #if MAX_VECTOR_SIZE >= 512 281 | static inline Vec8d tanh(Vec8d const x) { 282 | return tanh_d(x); 283 | } 284 | #endif // MAX_VECTOR_SIZE >= 512 285 | 286 | 287 | // Template for tanh function, single precision 288 | // This function does not produce denormals 289 | // Template parameters: 290 | // VTYPE: double vector type 291 | template 292 | static inline VTYPE tanh_f(VTYPE const x0) { 293 | // The limit of abs(x) is 89.0, as defined by max_x in vectormath_exp.h for 0.5*exp(x). 294 | 295 | // Coefficients 296 | const float r0 = -3.33332819422E-1f; 297 | const float r1 = 1.33314422036E-1f; 298 | const float r2 = -5.37397155531E-2f; 299 | const float r3 = 2.06390887954E-2f; 300 | const float r4 = -5.70498872745E-3f; 301 | 302 | // data vectors 303 | VTYPE x, x2, y1, y2; 304 | 305 | x = abs(x0); 306 | auto x_small = x <= 0.625f; // use polynomial approximation if abs(x) <= 5/8 307 | 308 | if (horizontal_or(x_small)) { 309 | // At least one element needs small method 310 | x2 = x*x; 311 | y1 = polynomial_4(x2, r0, r1, r2, r3, r4); 312 | y1 = mul_add(y1, x2*x, x); // y1 = x + (x2*x)*y1; 313 | } 314 | if (!horizontal_and(x_small)) { 315 | // At least one element needs big method 316 | y2 = exp(x+x); // exp(2*x) 317 | y2 = 1.0f - 2.0f / (y2 + 1.0f); // tanh(x) 318 | } 319 | auto x_big = x > 44.4f; 320 | y1 = select(x_small, y1, y2); // choose method 321 | y1 = select(x_big, 1.0f, y1); // avoid overflow 322 | y1 = sign_combine(y1, x0); // get original sign 323 | return y1; 324 | } 325 | 326 | // instances of tanh_f template 327 | static inline Vec4f tanh(Vec4f const x) { 328 | return tanh_f(x); 329 | } 330 | 331 | #if MAX_VECTOR_SIZE >= 256 332 | static inline Vec8f 
tanh(Vec8f const x) { 333 | return tanh_f(x); 334 | } 335 | #endif // MAX_VECTOR_SIZE >= 256 336 | 337 | #if MAX_VECTOR_SIZE >= 512 338 | static inline Vec16f tanh(Vec16f const x) { 339 | return tanh_f(x); 340 | } 341 | #endif // MAX_VECTOR_SIZE >= 512 342 | 343 | 344 | 345 | /****************************************************************************** 346 | * Inverse hyperbolic functions 347 | ******************************************************************************/ 348 | 349 | // Template for asinh function, double precision 350 | // This function does not produce denormals 351 | // Template parameters: 352 | // VTYPE: double vector type 353 | template 354 | static inline VTYPE asinh_d(VTYPE const x0) { 355 | 356 | // Coefficients 357 | const double p0 = -5.56682227230859640450E0; 358 | const double p1 = -9.09030533308377316566E0; 359 | const double p2 = -4.37390226194356683570E0; 360 | const double p3 = -5.91750212056387121207E-1; 361 | const double p4 = -4.33231683752342103572E-3; 362 | 363 | const double q0 = 3.34009336338516356383E1; 364 | const double q1 = 6.95722521337257608734E1; 365 | const double q2 = 4.86042483805291788324E1; 366 | const double q3 = 1.28757002067426453537E1; 367 | const double q4 = 1.0; 368 | 369 | // data vectors 370 | VTYPE x, x2, y1, y2; 371 | 372 | x2 = x0 * x0; 373 | x = abs(x0); 374 | auto x_small = x <= 0.533; // use Pade approximation if abs(x) <= 0.5 375 | // Both methods give the highest error close to 0.5. 
376 | // This limit is adjusted for minimum error 377 | auto x_huge = x > 1.E20; // simple approximation, avoid overflow 378 | 379 | if (horizontal_or(x_small)) { 380 | // At least one element needs small method 381 | y1 = polynomial_4(x2, p0, p1, p2, p3, p4) / polynomial_4(x2, q0, q1, q2, q3, q4); 382 | y1 = mul_add(y1, x2*x, x); // y1 = x + (x2*x)*y1; 383 | } 384 | if (!horizontal_and(x_small)) { 385 | // At least one element needs big method 386 | y2 = log(x + sqrt(x2 + 1.0)); 387 | if (horizontal_or(x_huge)) { 388 | // At least one element needs huge method to avoid overflow 389 | y2 = select(x_huge, log(x) + VM_LN2, y2); 390 | } 391 | } 392 | y1 = select(x_small, y1, y2); // choose method 393 | y1 = sign_combine(y1, x0); // get original sign 394 | return y1; 395 | } 396 | 397 | // instances of asinh_d template 398 | static inline Vec2d asinh(Vec2d const x) { 399 | return asinh_d(x); 400 | } 401 | 402 | #if MAX_VECTOR_SIZE >= 256 403 | static inline Vec4d asinh(Vec4d const x) { 404 | return asinh_d(x); 405 | } 406 | #endif // MAX_VECTOR_SIZE >= 256 407 | 408 | #if MAX_VECTOR_SIZE >= 512 409 | static inline Vec8d asinh(Vec8d const x) { 410 | return asinh_d(x); 411 | } 412 | #endif // MAX_VECTOR_SIZE >= 512 413 | 414 | 415 | // Template for asinh function, single precision 416 | // This function does not produce denormals 417 | // Template parameters: 418 | // VTYPE: double vector type 419 | template 420 | static inline VTYPE asinh_f(VTYPE const x0) { 421 | 422 | // Coefficients 423 | const float r0 = -1.6666288134E-1f; 424 | const float r1 = 7.4847586088E-2f; 425 | const float r2 = -4.2699340972E-2f; 426 | const float r3 = 2.0122003309E-2f; 427 | 428 | // data vectors 429 | VTYPE x, x2, y1, y2; 430 | 431 | x2 = x0 * x0; 432 | x = abs(x0); 433 | auto x_small = x <= 0.51f; // use polynomial approximation if abs(x) <= 0.5 434 | auto x_huge = x > 1.E10f; // simple approximation, avoid overflow 435 | 436 | if (horizontal_or(x_small)) { 437 | // At least one element 
needs small method 438 | y1 = polynomial_3(x2, r0, r1, r2, r3); 439 | y1 = mul_add(y1, x2*x, x); // y1 = x + (x2*x)*y1; 440 | } 441 | if (!horizontal_and(x_small)) { 442 | // At least one element needs big method 443 | y2 = log(x + sqrt(x2 + 1.0f)); 444 | if (horizontal_or(x_huge)) { 445 | // At least one element needs huge method to avoid overflow 446 | y2 = select(x_huge, log(x) + (float)VM_LN2, y2); 447 | } 448 | } 449 | y1 = select(x_small, y1, y2); // choose method 450 | y1 = sign_combine(y1, x0); // get original sign 451 | return y1; 452 | } 453 | 454 | // instances of asinh_f template 455 | static inline Vec4f asinh(Vec4f const x) { 456 | return asinh_f(x); 457 | } 458 | 459 | #if MAX_VECTOR_SIZE >= 256 460 | static inline Vec8f asinh(Vec8f const x) { 461 | return asinh_f(x); 462 | } 463 | #endif // MAX_VECTOR_SIZE >= 256 464 | 465 | #if MAX_VECTOR_SIZE >= 512 466 | static inline Vec16f asinh(Vec16f const x) { 467 | return asinh_f(x); 468 | } 469 | #endif // MAX_VECTOR_SIZE >= 512 470 | 471 | 472 | // Template for acosh function, double precision 473 | // This function does not produce denormals 474 | // Template parameters: 475 | // VTYPE: double vector type 476 | template 477 | static inline VTYPE acosh_d(VTYPE const x0) { 478 | 479 | // Coefficients 480 | const double p0 = 1.10855947270161294369E5; 481 | const double p1 = 1.08102874834699867335E5; 482 | const double p2 = 3.43989375926195455866E4; 483 | const double p3 = 3.94726656571334401102E3; 484 | const double p4 = 1.18801130533544501356E2; 485 | 486 | const double q0 = 7.83869920495893927727E4; 487 | const double q1 = 8.29725251988426222434E4; 488 | const double q2 = 2.97683430363289370382E4; 489 | const double q3 = 4.15352677227719831579E3; 490 | const double q4 = 1.86145380837903397292E2; 491 | const double q5 = 1.0; 492 | 493 | // data vectors 494 | VTYPE x1, y1, y2; 495 | 496 | x1 = x0 - 1.0; 497 | auto undef = x0 < 1.0; // result is NAN 498 | auto x_small = x1 < 0.49; // use Pade approximation 
if abs(x-1) < 0.5 499 | auto x_huge = x1 > 1.E20; // simple approximation, avoid overflow 500 | 501 | if (horizontal_or(x_small)) { 502 | // At least one element needs small method 503 | y1 = sqrt(x1) * (polynomial_4(x1, p0, p1, p2, p3, p4) / polynomial_5(x1, q0, q1, q2, q3, q4, q5)); 504 | // x < 1 generates NAN 505 | y1 = select(undef, nan_vec(NAN_HYP), y1); 506 | } 507 | if (!horizontal_and(x_small)) { 508 | // At least one element needs big method 509 | y2 = log(x0 + sqrt(mul_sub(x0,x0,1.0))); 510 | if (horizontal_or(x_huge)) { 511 | // At least one element needs huge method to avoid overflow 512 | y2 = select(x_huge, log(x0) + VM_LN2, y2); 513 | } 514 | } 515 | y1 = select(x_small, y1, y2); // choose method 516 | return y1; 517 | } 518 | 519 | // instances of acosh_d template 520 | static inline Vec2d acosh(Vec2d const x) { 521 | return acosh_d(x); 522 | } 523 | 524 | #if MAX_VECTOR_SIZE >= 256 525 | static inline Vec4d acosh(Vec4d const x) { 526 | return acosh_d(x); 527 | } 528 | #endif // MAX_VECTOR_SIZE >= 256 529 | 530 | #if MAX_VECTOR_SIZE >= 512 531 | static inline Vec8d acosh(Vec8d const x) { 532 | return acosh_d(x); 533 | } 534 | #endif // MAX_VECTOR_SIZE >= 512 535 | 536 | 537 | // Template for acosh function, single precision 538 | // This function does not produce denormals 539 | // Template parameters: 540 | // VTYPE: double vector type 541 | template 542 | static inline VTYPE acosh_f(VTYPE const x0) { 543 | 544 | // Coefficients 545 | const float r0 = 1.4142135263E0f; 546 | const float r1 = -1.1784741703E-1f; 547 | const float r2 = 2.6454905019E-2f; 548 | const float r3 = -7.5272886713E-3f; 549 | const float r4 = 1.7596881071E-3f; 550 | 551 | // data vectors 552 | VTYPE x1, y1, y2; 553 | 554 | x1 = x0 - 1.0f; 555 | auto undef = x0 < 1.0f; // result is NAN 556 | auto x_small = x1 < 0.49f; // use Pade approximation if abs(x-1) < 0.5 557 | auto x_huge = x1 > 1.E10f; // simple approximation, avoid overflow 558 | 559 | if (horizontal_or(x_small)) { 560 
| // At least one element needs small method 561 | y1 = sqrt(x1) * polynomial_4(x1, r0, r1, r2, r3, r4); 562 | // x < 1 generates NAN 563 | y1 = select(undef, nan_vec(NAN_HYP), y1); 564 | } 565 | if (!horizontal_and(x_small)) { 566 | // At least one element needs big method 567 | y2 = log(x0 + sqrt(mul_sub(x0,x0,1.0))); 568 | if (horizontal_or(x_huge)) { 569 | // At least one element needs huge method to avoid overflow 570 | y2 = select(x_huge, log(x0) + (float)VM_LN2, y2); 571 | } 572 | } 573 | y1 = select(x_small, y1, y2); // choose method 574 | return y1; 575 | } 576 | 577 | // instances of acosh_f template 578 | static inline Vec4f acosh(Vec4f const x) { 579 | return acosh_f(x); 580 | } 581 | 582 | #if MAX_VECTOR_SIZE >= 256 583 | static inline Vec8f acosh(Vec8f const x) { 584 | return acosh_f(x); 585 | } 586 | #endif // MAX_VECTOR_SIZE >= 256 587 | 588 | #if MAX_VECTOR_SIZE >= 512 589 | static inline Vec16f acosh(Vec16f const x) { 590 | return acosh_f(x); 591 | } 592 | #endif // MAX_VECTOR_SIZE >= 512 593 | 594 | 595 | // Template for atanh function, double precision 596 | // This function does not produce denormals 597 | // Template parameters: 598 | // VTYPE: double vector type 599 | template 600 | static inline VTYPE atanh_d(VTYPE const x0) { 601 | 602 | // Coefficients 603 | const double p0 = -3.09092539379866942570E1; 604 | const double p1 = 6.54566728676544377376E1; 605 | const double p2 = -4.61252884198732692637E1; 606 | const double p3 = 1.20426861384072379242E1; 607 | const double p4 = -8.54074331929669305196E-1; 608 | 609 | const double q0 = -9.27277618139601130017E1; 610 | const double q1 = 2.52006675691344555838E2; 611 | const double q2 = -2.49839401325893582852E2; 612 | const double q3 = 1.08938092147140262656E2; 613 | const double q4 = -1.95638849376911654834E1; 614 | const double q5 = 1.0; 615 | 616 | // data vectors 617 | VTYPE x, x2, y1, y2, y3; 618 | 619 | x = abs(x0); 620 | auto x_small = x < 0.5; // use Pade approximation if abs(x) < 0.5 
621 | 622 | if (horizontal_or(x_small)) { 623 | // At least one element needs small method 624 | x2 = x * x; 625 | y1 = polynomial_4(x2, p0, p1, p2, p3, p4) / polynomial_5(x2, q0, q1, q2, q3, q4, q5); 626 | y1 = mul_add(y1, x2*x, x); 627 | } 628 | if (!horizontal_and(x_small)) { 629 | // At least one element needs big method 630 | y2 = log((1.0+x)/(1.0-x)) * 0.5; 631 | // check if out of range 632 | y3 = select(x == 1.0, infinite_vec(), nan_vec(NAN_HYP)); 633 | y2 = select(x >= 1.0, y3, y2); 634 | } 635 | y1 = select(x_small, y1, y2); // choose method 636 | y1 = sign_combine(y1, x0); // get original sign 637 | return y1; 638 | } 639 | 640 | // instances of atanh_d template 641 | static inline Vec2d atanh(Vec2d const x) { 642 | return atanh_d(x); 643 | } 644 | 645 | #if MAX_VECTOR_SIZE >= 256 646 | static inline Vec4d atanh(Vec4d const x) { 647 | return atanh_d(x); 648 | } 649 | #endif // MAX_VECTOR_SIZE >= 256 650 | 651 | #if MAX_VECTOR_SIZE >= 512 652 | static inline Vec8d atanh(Vec8d const x) { 653 | return atanh_d(x); 654 | } 655 | #endif // MAX_VECTOR_SIZE >= 512 656 | 657 | 658 | // Template for atanh function, single precision 659 | // This function does not produce denormals 660 | // Template parameters: 661 | // VTYPE: double vector type 662 | template 663 | static inline VTYPE atanh_f(VTYPE const x0) { 664 | 665 | // Coefficients 666 | const float r0 = 3.33337300303E-1f; 667 | const float r1 = 1.99782164500E-1f; 668 | const float r2 = 1.46691431730E-1f; 669 | const float r3 = 8.24370301058E-2f; 670 | const float r4 = 1.81740078349E-1f; 671 | 672 | // data vectors 673 | VTYPE x, x2, y1, y2, y3; 674 | 675 | x = abs(x0); 676 | auto x_small = x < 0.5f; // use polynomial approximation if abs(x) < 0.5 677 | 678 | if (horizontal_or(x_small)) { 679 | // At least one element needs small method 680 | x2 = x * x; 681 | y1 = polynomial_4(x2, r0, r1, r2, r3, r4); 682 | y1 = mul_add(y1, x2*x, x); 683 | } 684 | if (!horizontal_and(x_small)) { 685 | // At least one 
element needs big method 686 | y2 = log((1.0f+x)/(1.0f-x)) * 0.5f; 687 | // check if out of range 688 | y3 = select(x == 1.0f, infinite_vec(), nan_vec(NAN_HYP)); 689 | y2 = select(x >= 1.0f, y3, y2); 690 | } 691 | y1 = select(x_small, y1, y2); // choose method 692 | y1 = sign_combine(y1, x0); // get original sign 693 | return y1; 694 | } 695 | 696 | // instances of atanh_f template 697 | static inline Vec4f atanh(Vec4f const x) { 698 | return atanh_f(x); 699 | } 700 | 701 | #if MAX_VECTOR_SIZE >= 256 702 | static inline Vec8f atanh(Vec8f const x) { 703 | return atanh_f(x); 704 | } 705 | #endif // MAX_VECTOR_SIZE >= 256 706 | 707 | #if MAX_VECTOR_SIZE >= 512 708 | static inline Vec16f atanh(Vec16f const x) { 709 | return atanh_f(x); 710 | } 711 | #endif // MAX_VECTOR_SIZE >= 512 712 | 713 | #ifdef VCL_NAMESPACE 714 | } 715 | #endif 716 | 717 | #endif 718 | -------------------------------------------------------------------------------- /CAS/VCL2/vectormath_trig.h: -------------------------------------------------------------------------------- 1 | /**************************** vectormath_trig.h ****************************** 2 | * Author: Agner Fog 3 | * Date created: 2014-04-18 4 | * Last modified: 2020-06-08 5 | * Version: 2.00.03 6 | * Project: vector class library 7 | * Description: 8 | * Header file containing inline version of trigonometric functions 9 | * and inverse trigonometric functions 10 | * sin, cos, sincos, tan 11 | * asin, acos, atan, atan2 12 | * 13 | * Theory, methods and inspiration based partially on these sources: 14 | * > Moshier, Stephen Lloyd Baluk: Methods and programs for mathematical functions. 15 | * Ellis Horwood, 1989. 16 | * > VDT library developed on CERN by Danilo Piparo, Thomas Hauth and 17 | * Vincenzo Innocente, 2012, https://svnweb.cern.ch/trac/vdt 18 | * > Cephes math library by Stephen L. 
Moshier 1992, 19 | * http://www.netlib.org/cephes/ 20 | * 21 | * For detailed instructions, see vectormath_common.h and vcl_manual.pdf 22 | * 23 | * (c) Copyright 2014-2020 Agner Fog. 24 | * Apache License version 2.0 or later. 25 | ******************************************************************************/ 26 | 27 | #ifndef VECTORMATH_TRIG_H 28 | #define VECTORMATH_TRIG_H 1 29 | 30 | #include "vectormath_common.h" 31 | 32 | #ifdef VCL_NAMESPACE 33 | namespace VCL_NAMESPACE { 34 | #endif 35 | 36 | 37 | // ************************************************************* 38 | // sin/cos template, double precision 39 | // ************************************************************* 40 | // Template parameters: 41 | // VTYPE: f.p. vector type 42 | // SC: 1 = sin, 2 = cos, 3 = sincos 43 | // Parameters: 44 | // xx = input x (radians) 45 | // cosret = return pointer (only if SC = 3) 46 | template 47 | static inline VTYPE sincos_d(VTYPE * cosret, VTYPE const xx) { 48 | 49 | // define constants 50 | const double P0sin = -1.66666666666666307295E-1; 51 | const double P1sin = 8.33333333332211858878E-3; 52 | const double P2sin = -1.98412698295895385996E-4; 53 | const double P3sin = 2.75573136213857245213E-6; 54 | const double P4sin = -2.50507477628578072866E-8; 55 | const double P5sin = 1.58962301576546568060E-10; 56 | 57 | const double P0cos = 4.16666666666665929218E-2; 58 | const double P1cos = -1.38888888888730564116E-3; 59 | const double P2cos = 2.48015872888517045348E-5; 60 | const double P3cos = -2.75573141792967388112E-7; 61 | const double P4cos = 2.08757008419747316778E-9; 62 | const double P5cos = -1.13585365213876817300E-11; 63 | 64 | const double DP1 = 7.853981554508209228515625E-1 * 2.; 65 | const double DP2 = 7.94662735614792836714E-9 * 2.; 66 | const double DP3 = 3.06161699786838294307E-17 * 2.; 67 | /* 68 | const double DP1sc = 7.85398125648498535156E-1; 69 | const double DP2sc = 3.77489470793079817668E-8; 70 | const double DP3sc = 2.69515142907905952645E-15; 
71 | */ 72 | typedef decltype(roundi(xx)) ITYPE; // integer vector type 73 | typedef decltype(nan_code(xx)) UITYPE; // unsigned integer vector type 74 | typedef decltype(xx < xx) BVTYPE; // boolean vector type 75 | 76 | VTYPE xa, x, y, x2, s, c, sin1, cos1; // data vectors 77 | ITYPE q, qq, signsin, signcos; // integer vectors, 64 bit 78 | 79 | BVTYPE swap, overflow; // boolean vectors 80 | 81 | xa = abs(xx); 82 | 83 | // Find quadrant 84 | y = round(xa * (double)(2. / VM_PI)); // quadrant, as float 85 | q = roundi(y); // quadrant, as integer 86 | // Find quadrant 87 | // 0 - pi/4 => 0 88 | // pi/4 - 3*pi/4 => 1 89 | // 3*pi/4 - 5*pi/4 => 2 90 | // 5*pi/4 - 7*pi/4 => 3 91 | // 7*pi/4 - 8*pi/4 => 4 92 | 93 | // Reduce by extended precision modular arithmetic 94 | x = nmul_add(y, DP3, nmul_add(y, DP2, nmul_add(y, DP1, xa))); // x = ((xa - y * DP1) - y * DP2) - y * DP3; 95 | 96 | // Expansion of sin and cos, valid for -pi/4 <= x <= pi/4 97 | x2 = x * x; 98 | s = polynomial_5(x2, P0sin, P1sin, P2sin, P3sin, P4sin, P5sin); 99 | c = polynomial_5(x2, P0cos, P1cos, P2cos, P3cos, P4cos, P5cos); 100 | s = mul_add(x * x2, s, x); // s = x + (x * x2) * s; 101 | c = mul_add(x2 * x2, c, nmul_add(x2, 0.5, 1.0)); // c = 1.0 - x2 * 0.5 + (x2 * x2) * c; 102 | 103 | // swap sin and cos if odd quadrant 104 | swap = BVTYPE((q & 1) != 0); 105 | 106 | // check for overflow 107 | overflow = BVTYPE(UITYPE(q) > 0x80000000000000); // q big if overflow 108 | overflow &= is_finite(xa); 109 | s = select(overflow, 0.0, s); 110 | c = select(overflow, 1.0, c); 111 | 112 | if constexpr ((SC & 1) != 0) { // calculate sin 113 | sin1 = select(swap, c, s); 114 | signsin = ((q << 62) ^ ITYPE(reinterpret_i(xx))); 115 | sin1 = sign_combine(sin1, reinterpret_d(signsin)); 116 | } 117 | if constexpr ((SC & 2) != 0) { // calculate cos 118 | cos1 = select(swap, s, c); 119 | signcos = ((q + 1) & 2) << 62; 120 | cos1 ^= reinterpret_d(signcos); 121 | } 122 | if constexpr (SC == 3) { // calculate both. 
cos returned through pointer 123 | *cosret = cos1; 124 | } 125 | if constexpr ((SC & 1) != 0) return sin1; else return cos1; 126 | } 127 | 128 | // instantiations of sincos_d template: 129 | 130 | static inline Vec2d sin(Vec2d const x) { 131 | return sincos_d(0, x); 132 | } 133 | 134 | static inline Vec2d cos(Vec2d const x) { 135 | return sincos_d(0, x); 136 | } 137 | 138 | static inline Vec2d sincos(Vec2d * cosret, Vec2d const x) { 139 | return sincos_d(cosret, x); 140 | } 141 | 142 | #if MAX_VECTOR_SIZE >= 256 143 | static inline Vec4d sin(Vec4d const x) { 144 | return sincos_d(0, x); 145 | } 146 | 147 | static inline Vec4d cos(Vec4d const x) { 148 | return sincos_d(0, x); 149 | } 150 | 151 | static inline Vec4d sincos(Vec4d * cosret, Vec4d const x) { 152 | return sincos_d(cosret, x); 153 | } 154 | #endif // MAX_VECTOR_SIZE >= 256 155 | 156 | #if MAX_VECTOR_SIZE >= 512 157 | static inline Vec8d sin(Vec8d const x) { 158 | return sincos_d(0, x); 159 | } 160 | 161 | static inline Vec8d cos(Vec8d const x) { 162 | return sincos_d(0, x); 163 | } 164 | 165 | static inline Vec8d sincos(Vec8d * cosret, Vec8d const x) { 166 | return sincos_d(cosret, x); 167 | } 168 | #endif // MAX_VECTOR_SIZE >= 512 169 | 170 | 171 | // ************************************************************* 172 | // sincos template, single precision 173 | // ************************************************************* 174 | // Template parameters: 175 | // VTYPE: f.p. 
vector type 176 | // SC: 1 = sin, 2 = cos, 3 = sincos, 4 = tan 177 | // Paramterers: 178 | // xx = input x (radians) 179 | // cosret = return pointer (only if SC = 3) 180 | template 181 | static inline VTYPE sincos_f(VTYPE * cosret, VTYPE const xx) { 182 | 183 | // define constants 184 | const float DP1F = 0.78515625f * 2.f; 185 | const float DP2F = 2.4187564849853515625E-4f * 2.f; 186 | const float DP3F = 3.77489497744594108E-8f * 2.f; 187 | 188 | const float P0sinf = -1.6666654611E-1f; 189 | const float P1sinf = 8.3321608736E-3f; 190 | const float P2sinf = -1.9515295891E-4f; 191 | 192 | const float P0cosf = 4.166664568298827E-2f; 193 | const float P1cosf = -1.388731625493765E-3f; 194 | const float P2cosf = 2.443315711809948E-5f; 195 | 196 | typedef decltype(roundi(xx)) ITYPE; // integer vector type 197 | typedef decltype(nan_code(xx)) UITYPE; // unsigned integer vector type 198 | typedef decltype(xx < xx) BVTYPE; // boolean vector type 199 | 200 | VTYPE xa, x, y, x2, s, c, sin1, cos1; // data vectors 201 | ITYPE q, signsin, signcos; // integer vectors 202 | BVTYPE swap, overflow; // boolean vectors 203 | 204 | xa = abs(xx); 205 | 206 | // Find quadrant 207 | y = round(xa * (float)(2. 
/ VM_PI)); // quadrant, as float 208 | q = roundi(y); // quadrant, as integer 209 | // 0 - pi/4 => 0 210 | // pi/4 - 3*pi/4 => 1 211 | // 3*pi/4 - 5*pi/4 => 2 212 | // 5*pi/4 - 7*pi/4 => 3 213 | // 7*pi/4 - 8*pi/4 => 4 214 | 215 | // Reduce by extended precision modular arithmetic 216 | // x = ((xa - y * DP1F) - y * DP2F) - y * DP3F; 217 | x = nmul_add(y, DP3F, nmul_add(y, DP2F, nmul_add(y, DP1F, xa))); 218 | 219 | // A two-step reduction saves time at the cost of precision for very big x: 220 | //x = (xa - y * DP1F) - y * (DP2F+DP3F); 221 | 222 | // Taylor expansion of sin and cos, valid for -pi/4 <= x <= pi/4 223 | x2 = x * x; 224 | s = polynomial_2(x2, P0sinf, P1sinf, P2sinf) * (x*x2) + x; 225 | c = polynomial_2(x2, P0cosf, P1cosf, P2cosf) * (x2*x2) + nmul_add(0.5f, x2, 1.0f); 226 | 227 | // swap sin and cos if odd quadrant 228 | swap = BVTYPE((q & 1) != 0); 229 | 230 | // check for overflow 231 | overflow = BVTYPE(UITYPE(q) > 0x2000000); // q big if overflow 232 | overflow &= is_finite(xa); 233 | s = select(overflow, 0.0f, s); 234 | c = select(overflow, 1.0f, c); 235 | 236 | if constexpr ((SC & 5) != 0) { // calculate sin 237 | sin1 = select(swap, c, s); 238 | signsin = ((q << 30) ^ ITYPE(reinterpret_i(xx))); 239 | sin1 = sign_combine(sin1, reinterpret_f(signsin)); 240 | } 241 | if constexpr ((SC & 6) != 0) { // calculate cos 242 | cos1 = select(swap, s, c); 243 | signcos = ((q + 1) & 2) << 30; 244 | cos1 ^= reinterpret_f(signcos); 245 | } 246 | if constexpr (SC == 1) return sin1; 247 | else if constexpr (SC == 2) return cos1; 248 | else if constexpr (SC == 3) { // calculate both. cos returned through pointer 249 | *cosret = cos1; 250 | return sin1; 251 | } 252 | else { // SC == 4. 
tan 253 | return sin1 / cos1; 254 | } 255 | } 256 | 257 | // instantiations of sincos_f template: 258 | 259 | static inline Vec4f sin(Vec4f const x) { 260 | return sincos_f(0, x); 261 | } 262 | 263 | static inline Vec4f cos(Vec4f const x) { 264 | return sincos_f(0, x); 265 | } 266 | 267 | static inline Vec4f sincos(Vec4f * cosret, Vec4f const x) { 268 | return sincos_f(cosret, x); 269 | } 270 | 271 | static inline Vec4f tan(Vec4f const x) { 272 | return sincos_f(0, x); 273 | } 274 | 275 | #if MAX_VECTOR_SIZE >= 256 276 | static inline Vec8f sin(Vec8f const x) { 277 | return sincos_f(0, x); 278 | } 279 | 280 | static inline Vec8f cos(Vec8f const x) { 281 | return sincos_f(0, x); 282 | } 283 | 284 | static inline Vec8f sincos(Vec8f * cosret, Vec8f const x) { 285 | return sincos_f(cosret, x); 286 | } 287 | 288 | static inline Vec8f tan(Vec8f const x) { 289 | return sincos_f(0, x); 290 | } 291 | #endif // MAX_VECTOR_SIZE >= 256 292 | 293 | #if MAX_VECTOR_SIZE >= 512 294 | static inline Vec16f sin(Vec16f const x) { 295 | return sincos_f(0, x); 296 | } 297 | 298 | static inline Vec16f cos(Vec16f const x) { 299 | return sincos_f(0, x); 300 | } 301 | 302 | static inline Vec16f sincos(Vec16f * cosret, Vec16f const x) { 303 | return sincos_f(cosret, x); 304 | } 305 | 306 | static inline Vec16f tan(Vec16f const x) { 307 | return sincos_f(0, x); 308 | } 309 | #endif // MAX_VECTOR_SIZE >= 512 310 | 311 | 312 | // ************************************************************* 313 | // tan template, double precision 314 | // ************************************************************* 315 | // Template parameters: 316 | // VTYPE: f.p. 
vector type 317 | // Parameters: 318 | // x = input x (radians) 319 | template 320 | static inline VTYPE tan_d(VTYPE const x) { 321 | 322 | // define constants 323 | const double DP1 = 7.853981554508209228515625E-1 * 2.;; 324 | const double DP2 = 7.94662735614792836714E-9 * 2.;; 325 | const double DP3 = 3.06161699786838294307E-17 * 2.;; 326 | 327 | const double P2tan = -1.30936939181383777646E4; 328 | const double P1tan = 1.15351664838587416140E6; 329 | const double P0tan = -1.79565251976484877988E7; 330 | 331 | const double Q3tan = 1.36812963470692954678E4; 332 | const double Q2tan = -1.32089234440210967447E6; 333 | const double Q1tan = 2.50083801823357915839E7; 334 | const double Q0tan = -5.38695755929454629881E7; 335 | 336 | typedef decltype(x > x) BVTYPE; // boolean vector type 337 | VTYPE xa, y, z, zz, px, qx, tn, recip; // data vectors 338 | BVTYPE doinvert, xzero, overflow; // boolean vectors 339 | typedef decltype(nan_code(x)) UITYPE; // unsigned integer vector type 340 | 341 | 342 | xa = abs(x); 343 | 344 | // Find quadrant 345 | y = round(xa * (double)(2. / VM_PI)); // quadrant, as float 346 | auto q = roundi(y); // quadrant, as integer 347 | // Find quadrant 348 | // 0 - pi/4 => 0 349 | // pi/4 - 3*pi/4 => 1 350 | // 3*pi/4 - 5*pi/4 => 2 351 | // 5*pi/4 - 7*pi/4 => 3 352 | // 7*pi/4 - 8*pi/4 => 4 353 | 354 | // Reduce by extended precision modular arithmetic 355 | // z = ((xa - y * DP1) - y * DP2) - y * DP3; 356 | z = nmul_add(y, DP3, nmul_add(y, DP2, nmul_add(y, DP1, xa))); 357 | 358 | // Pade expansion of tan, valid for -pi/4 <= x <= pi/4 359 | zz = z * z; 360 | px = polynomial_2(zz, P0tan, P1tan, P2tan); 361 | qx = polynomial_4n(zz, Q0tan, Q1tan, Q2tan, Q3tan); 362 | 363 | // qx cannot be 0 for x <= pi/4 364 | tn = mul_add(px / qx, z * zz, z); // tn = z + z * zz * px / qx; 365 | 366 | // if (q&1) tn = -1/tn 367 | doinvert = BVTYPE((q & 1) != 0); 368 | xzero = (xa == 0.); 369 | // avoid division by 0. We will not be using recip anyway if xa == 0. 
370 | // tn never becomes exactly 0 when x = pi/2 so we only have to make 371 | // a special case for x == 0. 372 | recip = (-1.) / select(xzero, VTYPE(-1.), tn); 373 | tn = select(doinvert, recip, tn); 374 | tn = sign_combine(tn, x); // get original sign 375 | 376 | overflow = BVTYPE(UITYPE(q) > 0x80000000000000) & is_finite(xa); 377 | tn = select(overflow, 0., tn); 378 | 379 | return tn; 380 | } 381 | 382 | // instantiations of tan_d template: 383 | 384 | static inline Vec2d tan(Vec2d const x) { 385 | return tan_d(x); 386 | } 387 | 388 | #if MAX_VECTOR_SIZE >= 256 389 | static inline Vec4d tan(Vec4d const x) { 390 | return tan_d(x); 391 | } 392 | #endif // MAX_VECTOR_SIZE >= 256 393 | 394 | #if MAX_VECTOR_SIZE >= 512 395 | static inline Vec8d tan(Vec8d const x) { 396 | return tan_d(x); 397 | } 398 | #endif // MAX_VECTOR_SIZE >= 512 399 | 400 | 401 | // ************************************************************* 402 | // tan template, single precision 403 | // ************************************************************* 404 | // This is removed for the single precision version. 405 | // It is faster to use tan(x) = sin(x)/cos(x) 406 | 407 | 408 | 409 | // ************************************************************* 410 | // asin/acos template, double precision 411 | // ************************************************************* 412 | // Template parameters: 413 | // VTYPE: f.p. 
vector type 414 | // AC: 0 = asin, 1 = acos 415 | // Paramterers: 416 | // x = input x 417 | template 418 | static inline VTYPE asin_d(VTYPE const x) { 419 | 420 | // define constants 421 | const double R4asin = 2.967721961301243206100E-3; 422 | const double R3asin = -5.634242780008963776856E-1; 423 | const double R2asin = 6.968710824104713396794E0; 424 | const double R1asin = -2.556901049652824852289E1; 425 | const double R0asin = 2.853665548261061424989E1; 426 | 427 | const double S3asin = -2.194779531642920639778E1; 428 | const double S2asin = 1.470656354026814941758E2; 429 | const double S1asin = -3.838770957603691357202E2; 430 | const double S0asin = 3.424398657913078477438E2; 431 | 432 | const double P5asin = 4.253011369004428248960E-3; 433 | const double P4asin = -6.019598008014123785661E-1; 434 | const double P3asin = 5.444622390564711410273E0; 435 | const double P2asin = -1.626247967210700244449E1; 436 | const double P1asin = 1.956261983317594739197E1; 437 | const double P0asin = -8.198089802484824371615E0; 438 | 439 | const double Q4asin = -1.474091372988853791896E1; 440 | const double Q3asin = 7.049610280856842141659E1; 441 | const double Q2asin = -1.471791292232726029859E2; 442 | const double Q1asin = 1.395105614657485689735E2; 443 | const double Q0asin = -4.918853881490881290097E1; 444 | 445 | VTYPE xa, xb, x1, x2, x3, x4, x5, px, qx, rx, sx, vx, wx, y1, yb, z, z1, z2; 446 | bool dobig, dosmall; 447 | 448 | xa = abs(x); 449 | auto big = xa >= 0.625; // boolean vector 450 | 451 | /* 452 | Small: xa < 0.625 453 | ------------------ 454 | x = xa * xa; 455 | px = PX(x); 456 | qx = QX(x); 457 | y1 = x*px/qx; 458 | y1 = xa * y1 + xa; 459 | 460 | Big: xa >= 0.625 461 | ------------------ 462 | x = 1.0 - xa; 463 | rx = RX(x); 464 | sx = SX(x); 465 | y1 = x * rx/sx; 466 | x3 = sqrt(x+x); 467 | y3 = x3 * y1 - MOREBITS; 468 | z = pi/2 - x3 - y3 469 | */ 470 | 471 | // select a common x for all polynomials 472 | // This allows sharing of powers of x through common 
subexpression elimination 473 | x1 = select(big, 1.0 - xa, xa * xa); 474 | 475 | // calculate powers of x1 outside branches to make sure they are only calculated once 476 | x2 = x1 * x1; 477 | x4 = x2 * x2; 478 | x5 = x4 * x1; 479 | x3 = x2 * x1; 480 | 481 | dosmall = !horizontal_and(big); // at least one element is small 482 | dobig = horizontal_or(big); // at least one element is big 483 | 484 | // calculate polynomials (reuse powers of x) 485 | if (dosmall) { 486 | // px = polynomial_5 (x1, P0asin, P1asin, P2asin, P3asin, P4asin, P5asin); 487 | // qx = polynomial_5n(x1, Q0asin, Q1asin, Q2asin, Q3asin, Q4asin); 488 | px = mul_add(x3, P3asin, P0asin) + mul_add(x4, P4asin, x1*P1asin) + mul_add(x5, P5asin, x2*P2asin); 489 | qx = mul_add(x4, Q4asin, x5) + mul_add(x3, Q3asin, x1*Q1asin) + mul_add(x2, Q2asin, Q0asin); 490 | } 491 | if (dobig) { 492 | // rx = polynomial_4 (x1, R0asin, R1asin, R2asin, R3asin, R4asin); 493 | // sx = polynomial_4n(x1, S0asin, S1asin, S2asin, S3asin); 494 | rx = mul_add(x3, R3asin, x2*R2asin) + mul_add(x4, R4asin, mul_add(x1, R1asin, R0asin)); 495 | sx = mul_add(x3, S3asin, x4) + mul_add(x2, S2asin, mul_add(x1, S1asin, S0asin)); 496 | } 497 | 498 | // select and divide outside branches to avoid dividing twice 499 | vx = select(big, rx, px); 500 | wx = select(big, sx, qx); 501 | y1 = vx / wx * x1; 502 | 503 | // results for big 504 | if (dobig) { // avoid square root if all are small 505 | xb = sqrt(x1 + x1); // this produces NAN if xa > 1 so we don't need a special case for xa > 1 506 | z1 = mul_add(xb, y1, xb); // yb = xb * y1; z1 = xb + yb; 507 | } 508 | 509 | // results for small 510 | z2 = mul_add(xa, y1, xa); // z2 = xa * y1 + xa; 511 | 512 | // correct for sign 513 | if constexpr (AC == 1) { // acos 514 | z1 = select(x < 0., VM_PI - z1, z1); 515 | z2 = VM_PI_2 - sign_combine(z2, x); 516 | z = select(big, z1, z2); 517 | } 518 | else { // asin 519 | z1 = VM_PI_2 - z1; 520 | z = select(big, z1, z2); 521 | z = sign_combine(z, x); 522 | } 
523 | return z; 524 | } 525 | 526 | // instantiations of asin_d template: 527 | 528 | static inline Vec2d asin(Vec2d const x) { 529 | return asin_d(x); 530 | } 531 | 532 | static inline Vec2d acos(Vec2d const x) { 533 | return asin_d(x); 534 | } 535 | 536 | #if MAX_VECTOR_SIZE >= 256 537 | static inline Vec4d asin(Vec4d const x) { 538 | return asin_d(x); 539 | } 540 | 541 | static inline Vec4d acos(Vec4d const x) { 542 | return asin_d(x); 543 | } 544 | #endif // MAX_VECTOR_SIZE >= 256 545 | 546 | #if MAX_VECTOR_SIZE >= 512 547 | static inline Vec8d asin(Vec8d const x) { 548 | return asin_d(x); 549 | } 550 | 551 | static inline Vec8d acos(Vec8d const x) { 552 | return asin_d(x); 553 | } 554 | #endif // MAX_VECTOR_SIZE >= 512 555 | 556 | 557 | // ************************************************************* 558 | // asin/acos template, single precision 559 | // ************************************************************* 560 | // Template parameters: 561 | // VTYPE: f.p. vector type 562 | // AC: 0 = asin, 1 = acos 563 | // Paramterers: 564 | // x = input x 565 | template 566 | static inline VTYPE asin_f(VTYPE const x) { 567 | 568 | // define constants 569 | const float P4asinf = 4.2163199048E-2f; 570 | const float P3asinf = 2.4181311049E-2f; 571 | const float P2asinf = 4.5470025998E-2f; 572 | const float P1asinf = 7.4953002686E-2f; 573 | const float P0asinf = 1.6666752422E-1f; 574 | 575 | VTYPE xa, x1, x2, x3, x4, xb, z, z1, z2; 576 | 577 | xa = abs(x); 578 | auto big = xa > 0.5f; // boolean vector 579 | 580 | x1 = 0.5f * (1.0f - xa); 581 | x2 = xa * xa; 582 | x3 = select(big, x1, x2); 583 | 584 | //if (horizontal_or(big)) 585 | { 586 | xb = sqrt(x1); 587 | } 588 | x4 = select(big, xb, xa); 589 | 590 | z = polynomial_4(x3, P0asinf, P1asinf, P2asinf, P3asinf, P4asinf); 591 | z = mul_add(z, x3*x4, x4); // z = z * (x3*x4) + x4; 592 | z1 = z + z; 593 | 594 | // correct for sign 595 | if constexpr (AC == 1) { // acos 596 | z1 = select(x < 0., float(VM_PI) - z1, z1); 597 
| z2 = float(VM_PI_2) - sign_combine(z, x); 598 | z = select(big, z1, z2); 599 | } 600 | else { // asin 601 | z1 = float(VM_PI_2) - z1; 602 | z = select(big, z1, z); 603 | z = sign_combine(z, x); 604 | } 605 | 606 | return z; 607 | } 608 | 609 | // instantiations of asin_f template: 610 | 611 | static inline Vec4f asin(Vec4f const x) { 612 | return asin_f(x); 613 | } 614 | 615 | static inline Vec4f acos(Vec4f const x) { 616 | return asin_f(x); 617 | } 618 | 619 | #if MAX_VECTOR_SIZE >= 256 620 | static inline Vec8f asin(Vec8f const x) { 621 | return asin_f(x); 622 | } 623 | static inline Vec8f acos(Vec8f const x) { 624 | return asin_f(x); 625 | } 626 | #endif // MAX_VECTOR_SIZE >= 256 627 | 628 | #if MAX_VECTOR_SIZE >= 512 629 | static inline Vec16f asin(Vec16f const x) { 630 | return asin_f(x); 631 | } 632 | static inline Vec16f acos(Vec16f const x) { 633 | return asin_f(x); 634 | } 635 | #endif // MAX_VECTOR_SIZE >= 512 636 | 637 | 638 | // ************************************************************* 639 | // atan template, double precision 640 | // ************************************************************* 641 | // Template parameters: 642 | // VTYPE: f.p. vector type 643 | // T2: 0 = atan, 1 = atan2 644 | // Parameters: 645 | // y, x. 
calculate atan(y/x) 646 | // result is between -pi/2 and +pi/2 when x > 0 647 | // result is between -pi and -pi/2 or between pi/2 and pi when x < 0 for atan2 648 | template 649 | static inline VTYPE atan_d(VTYPE const y, VTYPE const x) { 650 | 651 | // define constants 652 | //const double ONEOPIO4 = 4./VM_PI; 653 | const double MOREBITS = 6.123233995736765886130E-17; 654 | const double MOREBITSO2 = MOREBITS * 0.5; 655 | const double T3PO8 = VM_SQRT2 + 1.; // 2.41421356237309504880; 656 | 657 | const double P4atan = -8.750608600031904122785E-1; 658 | const double P3atan = -1.615753718733365076637E1; 659 | const double P2atan = -7.500855792314704667340E1; 660 | const double P1atan = -1.228866684490136173410E2; 661 | const double P0atan = -6.485021904942025371773E1; 662 | 663 | const double Q4atan = 2.485846490142306297962E1; 664 | const double Q3atan = 1.650270098316988542046E2; 665 | const double Q2atan = 4.328810604912902668951E2; 666 | const double Q1atan = 4.853903996359136964868E2; 667 | const double Q0atan = 1.945506571482613964425E2; 668 | 669 | typedef decltype (x > x) BVTYPE; // boolean vector type 670 | VTYPE t, x1, x2, y1, y2, s, fac, a, b, z, zz, px, qx, re; // data vectors 671 | BVTYPE swapxy, notbig, notsmal; // boolean vectors 672 | 673 | if constexpr (T2 == 1) { // atan2(y,x) 674 | // move in first octant 675 | x1 = abs(x); 676 | y1 = abs(y); 677 | swapxy = (y1 > x1); 678 | // swap x and y if y1 > x1 679 | x2 = select(swapxy, y1, x1); 680 | y2 = select(swapxy, x1, y1); 681 | 682 | // check for special case: x and y are both +/- INF 683 | BVTYPE both_infinite = is_inf(x) & is_inf(y); // x and Y are both infinite 684 | if (horizontal_or(both_infinite)) { // at least one element has both infinite 685 | VTYPE mone = VTYPE(-1.0); 686 | x2 = select(both_infinite, x2 & mone, x2); // get 1.0 with the sign of x 687 | y2 = select(both_infinite, y2 & mone, y2); // get 1.0 with the sign of y 688 | } 689 | 690 | t = y2 / x2; // x = y = 0 gives NAN here 691 | } 
692 | else { // atan(y) 693 | t = abs(y); 694 | } 695 | 696 | // small: t < 0.66 697 | // medium: 0.66 <= t <= 2.4142 (1+sqrt(2)) 698 | // big: t > 2.4142 699 | notbig = t <= T3PO8; // t <= 2.4142 700 | notsmal = t >= 0.66; // t >= 0.66 701 | 702 | s = select(notbig, VTYPE(VM_PI_4), VTYPE(VM_PI_2)); 703 | s = notsmal & s; // select(notsmal, s, 0.); 704 | fac = select(notbig, VTYPE(MOREBITSO2), VTYPE(MOREBITS)); 705 | fac = notsmal & fac; //select(notsmal, fac, 0.); 706 | 707 | // small: z = t / 1.0; 708 | // medium: z = (t-1.0) / (t+1.0); 709 | // big: z = -1.0 / t; 710 | a = notbig & t; // select(notbig, t, 0.); 711 | a = if_add(notsmal, a, -1.); 712 | b = notbig & VTYPE(1.); // select(notbig, 1., 0.); 713 | b = if_add(notsmal, b, t); 714 | z = a / b; // division by 0 will not occur unless x and y are both 0 715 | 716 | zz = z * z; 717 | 718 | px = polynomial_4(zz, P0atan, P1atan, P2atan, P3atan, P4atan); 719 | qx = polynomial_5n(zz, Q0atan, Q1atan, Q2atan, Q3atan, Q4atan); 720 | 721 | re = mul_add(px / qx, z * zz, z); // re = (px / qx) * (z * zz) + z; 722 | re += s + fac; 723 | 724 | if constexpr (T2 == 1) { // atan2(y,x) 725 | // move back in place 726 | re = select(swapxy, VM_PI_2 - re, re); 727 | re = select((x | y) == 0., 0., re); // atan2(0,0) = 0 by convention 728 | re = select(sign_bit(x), VM_PI - re, re);// also for x = -0. 
729 | } 730 | // get sign bit 731 | re = sign_combine(re, y); 732 | 733 | return re; 734 | } 735 | 736 | // instantiations of atan_d template: 737 | 738 | static inline Vec2d atan2(Vec2d const y, Vec2d const x) { 739 | return atan_d(y, x); 740 | } 741 | 742 | static inline Vec2d atan(Vec2d const y) { 743 | return atan_d(y, 0.); 744 | } 745 | 746 | #if MAX_VECTOR_SIZE >= 256 747 | static inline Vec4d atan2(Vec4d const y, Vec4d const x) { 748 | return atan_d(y, x); 749 | } 750 | 751 | static inline Vec4d atan(Vec4d const y) { 752 | return atan_d(y, 0.); 753 | } 754 | #endif // MAX_VECTOR_SIZE >= 256 755 | 756 | #if MAX_VECTOR_SIZE >= 512 757 | static inline Vec8d atan2(Vec8d const y, Vec8d const x) { 758 | return atan_d(y, x); 759 | } 760 | 761 | static inline Vec8d atan(Vec8d const y) { 762 | return atan_d(y, 0.); 763 | } 764 | #endif // MAX_VECTOR_SIZE >= 512 765 | 766 | 767 | 768 | // ************************************************************* 769 | // atan template, single precision 770 | // ************************************************************* 771 | // Template parameters: 772 | // VTYPE: f.p. vector type 773 | // T2: 0 = atan, 1 = atan2 774 | // Parameters: 775 | // y, x. 
calculate atan(y/x) 776 | // result is between -pi/2 and +pi/2 when x > 0 777 | // result is between -pi and -pi/2 or between pi/2 and pi when x < 0 for atan2 778 | template 779 | static inline VTYPE atan_f(VTYPE const y, VTYPE const x) { 780 | 781 | // define constants 782 | const float P3atanf = 8.05374449538E-2f; 783 | const float P2atanf = -1.38776856032E-1f; 784 | const float P1atanf = 1.99777106478E-1f; 785 | const float P0atanf = -3.33329491539E-1f; 786 | 787 | typedef decltype (x > x) BVTYPE; // boolean vector type 788 | VTYPE t, x1, x2, y1, y2, s, a, b, z, zz, re;// data vectors 789 | BVTYPE swapxy, notbig, notsmal; // boolean vectors 790 | 791 | if constexpr (T2 == 1) { // atan2(y,x) 792 | // move in first octant 793 | x1 = abs(x); 794 | y1 = abs(y); 795 | swapxy = (y1 > x1); 796 | // swap x and y if y1 > x1 797 | x2 = select(swapxy, y1, x1); 798 | y2 = select(swapxy, x1, y1); 799 | 800 | // check for special case: x and y are both +/- INF 801 | BVTYPE both_infinite = is_inf(x) & is_inf(y); // x and Y are both infinite 802 | if (horizontal_or(both_infinite)) { // at least one element has both infinite 803 | VTYPE mone = VTYPE(-1.0f); 804 | x2 = select(both_infinite, x2 & mone, x2); // get 1.0 with the sign of x 805 | y2 = select(both_infinite, y2 & mone, y2); // get 1.0 with the sign of y 806 | } 807 | 808 | // x = y = 0 will produce NAN. 
No problem, fixed below 809 | t = y2 / x2; 810 | } 811 | else { // atan(y) 812 | t = abs(y); 813 | } 814 | 815 | // small: t < 0.4142 816 | // medium: 0.4142 <= t <= 2.4142 817 | // big: t > 2.4142 (not for atan2) 818 | if constexpr (T2 == 0) { // atan(y) 819 | notsmal = t >= float(VM_SQRT2 - 1.); // t >= tan pi/8 820 | notbig = t <= float(VM_SQRT2 + 1.); // t <= tan 3pi/8 821 | 822 | s = select(notbig, VTYPE(float(VM_PI_4)), VTYPE(float(VM_PI_2))); 823 | s = notsmal & s; // select(notsmal, s, 0.); 824 | 825 | // small: z = t / 1.0; 826 | // medium: z = (t-1.0) / (t+1.0); 827 | // big: z = -1.0 / t; 828 | a = notbig & t; // select(notbig, t, 0.); 829 | a = if_add(notsmal, a, -1.f); 830 | b = notbig & VTYPE(1.f); // select(notbig, 1., 0.); 831 | b = if_add(notsmal, b, t); 832 | z = a / b; // division by 0 will not occur unless x and y are both 0 833 | } 834 | else { // atan2(y,x) 835 | // small: z = t / 1.0; 836 | // medium: z = (t-1.0) / (t+1.0); 837 | notsmal = t >= float(VM_SQRT2 - 1.); 838 | a = if_add(notsmal, t, -1.f); 839 | b = if_add(notsmal, 1.f, t); 840 | s = notsmal & VTYPE(float(VM_PI_4)); 841 | z = a / b; 842 | } 843 | 844 | zz = z * z; 845 | 846 | // Taylor expansion 847 | re = polynomial_3(zz, P0atanf, P1atanf, P2atanf, P3atanf); 848 | re = mul_add(re, zz * z, z) + s; 849 | 850 | if constexpr (T2 == 1) { // atan2(y,x) 851 | // move back in place 852 | re = select(swapxy, float(VM_PI_2) - re, re); 853 | re = select((x | y) == 0.f, 0.f, re); // atan2(0,+0) = 0 by convention 854 | re = select(sign_bit(x), float(VM_PI) - re, re); // also for x = -0. 
855 | } 856 | // get sign bit 857 | re = sign_combine(re, y); 858 | 859 | return re; 860 | } 861 | 862 | // instantiations of atan_f template: 863 | 864 | static inline Vec4f atan2(Vec4f const y, Vec4f const x) { 865 | return atan_f(y, x); 866 | } 867 | 868 | static inline Vec4f atan(Vec4f const y) { 869 | return atan_f(y, 0.); 870 | } 871 | 872 | #if MAX_VECTOR_SIZE >= 256 873 | static inline Vec8f atan2(Vec8f const y, Vec8f const x) { 874 | return atan_f(y, x); 875 | } 876 | 877 | static inline Vec8f atan(Vec8f const y) { 878 | return atan_f(y, 0.); 879 | } 880 | 881 | #endif // MAX_VECTOR_SIZE >= 256 882 | 883 | #if MAX_VECTOR_SIZE >= 512 884 | static inline Vec16f atan2(Vec16f const y, Vec16f const x) { 885 | return atan_f(y, x); 886 | } 887 | 888 | static inline Vec16f atan(Vec16f const y) { 889 | return atan_f(y, 0.); 890 | } 891 | 892 | #endif // MAX_VECTOR_SIZE >= 512 893 | 894 | #ifdef VCL_NAMESPACE 895 | } 896 | #endif 897 | 898 | #endif 899 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 HolyWu 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Description 2 | =========== 3 | 4 | [Contrast Adaptive Sharpening](https://gpuopen.com/fidelityfx-cas/). 5 | 6 | 7 | Usage 8 | ===== 9 | 10 | cas.CAS(clip clip[, float sharpness=0.5, int planes, int opt=0]) 11 | 12 | * clip: Clip to process. Any planar format with either integer sample type of 8-16 bit depth or float sample type of 32 bit depth is supported. 13 | 14 | * sharpness: Sharpening strength. 15 | 16 | * planes: Sets which planes will be processed. Any unprocessed planes will be simply copied. By default only luma plane is processed for non-RGB formats. 17 | 18 | * opt: Sets which cpu optimizations to use. 
19 | * 0 = auto-detect 20 | * 1 = use C 21 | * 2 = use SSE2 22 | * 3 = use AVX2 23 | * 4 = use AVX-512 24 | 25 | 26 | Compilation 27 | =========== 28 | 29 | ``` 30 | meson build 31 | ninja -C build 32 | ninja -C build install 33 | ``` 34 | -------------------------------------------------------------------------------- /meson.build: -------------------------------------------------------------------------------- 1 | project('CAS', 'cpp', 2 | default_options: ['buildtype=release', 'b_ndebug=if-release', 'cpp_std=c++17'], 3 | meson_version: '>=0.48.0', 4 | version: '2' 5 | ) 6 | 7 | sources = [ 8 | 'CAS/CAS.cpp', 9 | 'CAS/CAS.h' 10 | ] 11 | 12 | vapoursynth_dep = dependency('vapoursynth').partial_dependency(compile_args: true, includes: true) 13 | 14 | libs = [] 15 | 16 | if host_machine.cpu_family().startswith('x86') 17 | add_project_arguments('-fno-math-errno', '-fno-trapping-math', '-DCAS_X86', '-mfpmath=sse', '-msse2', language: 'cpp') 18 | 19 | sources += [ 20 | 'CAS/CAS_SSE2.cpp', 21 | 'CAS/VCL2/instrset.h', 22 | 'CAS/VCL2/instrset_detect.cpp', 23 | 'CAS/VCL2/vector_convert.h', 24 | 'CAS/VCL2/vectorclass.h', 25 | 'CAS/VCL2/vectorf128.h', 26 | 'CAS/VCL2/vectorf256.h', 27 | 'CAS/VCL2/vectorf256e.h', 28 | 'CAS/VCL2/vectorf512.h', 29 | 'CAS/VCL2/vectorf512e.h', 30 | 'CAS/VCL2/vectori128.h', 31 | 'CAS/VCL2/vectori256.h', 32 | 'CAS/VCL2/vectori256e.h', 33 | 'CAS/VCL2/vectori512.h', 34 | 'CAS/VCL2/vectori512e.h', 35 | 'CAS/VCL2/vectori512s.h', 36 | 'CAS/VCL2/vectori512se.h', 37 | 'CAS/VCL2/vectormath_common.h', 38 | 'CAS/VCL2/vectormath_exp.h', 39 | 'CAS/VCL2/vectormath_hyp.h', 40 | 'CAS/VCL2/vectormath_lib.h', 41 | 'CAS/VCL2/vectormath_trig.h' 42 | ] 43 | 44 | libs += static_library('avx2', 'CAS/CAS_AVX2.cpp', 45 | dependencies: vapoursynth_dep, 46 | cpp_args: ['-mavx2', '-mfma'], 47 | gnu_symbol_visibility: 'hidden' 48 | ) 49 | 50 | libs += static_library('avx512', 'CAS/CAS_AVX512.cpp', 51 | dependencies: vapoursynth_dep, 52 | cpp_args: ['-mavx512f', '-mavx512vl', 
'-mavx512bw', '-mavx512dq', '-mfma'], 53 | gnu_symbol_visibility: 'hidden' 54 | ) 55 | endif 56 | 57 | shared_module('cas', sources, 58 | dependencies: vapoursynth_dep, 59 | link_with: libs, 60 | install: true, 61 | install_dir: join_paths(vapoursynth_dep.get_pkgconfig_variable('libdir'), 'vapoursynth'), 62 | gnu_symbol_visibility: 'hidden' 63 | ) 64 | --------------------------------------------------------------------------------