├── LICENSE ├── README.md ├── changelog.txt ├── dispatch_example1.cpp ├── dispatch_example2.cpp ├── instrset.h ├── instrset_detect.cpp ├── vector_convert.h ├── vectorclass.h ├── vectorf128.h ├── vectorf256.h ├── vectorf256e.h ├── vectorf512.h ├── vectorf512e.h ├── vectorfp16.h ├── vectorfp16e.h ├── vectori128.h ├── vectori256.h ├── vectori256e.h ├── vectori512.h ├── vectori512e.h ├── vectori512s.h ├── vectori512se.h ├── vectormath_common.h ├── vectormath_exp.h ├── vectormath_hyp.h ├── vectormath_lib.h └── vectormath_trig.h /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 
29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 
61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | 179 | Copyright 2012-2019 Agner Fog. 180 | 181 | Licensed under the Apache License, Version 2.0 (the "License"); 182 | you may not use this file except in compliance with the License. 
183 | You may obtain a copy of the License at 184 | 185 | http://www.apache.org/licenses/LICENSE-2.0 186 | 187 | Unless required by applicable law or agreed to in writing, software 188 | distributed under the License is distributed on an "AS IS" BASIS, 189 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 190 | See the License for the specific language governing permissions and 191 | limitations under the License. 192 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # version2 2 | Vector Class Library, latest version 3 | 4 | This is a C++ class library for using the Single Instruction Multiple Data (SIMD) instructions to improve performance on modern microprocessors with the x86 or x86/64 instruction set on Windows, Linux, and Mac platforms. There are no plans to support ARM or other instruction sets. 5 | 6 | [Latest release](https://github.com/vectorclass/version2/releases) 7 | 8 | [Download manual](https://github.com/vectorclass/manual/raw/master/vcl_manual.pdf) 9 | 10 | [Add-on packages for particular applications](https://github.com/vectorclass/add-on) 11 | 12 | [Getting-started video.](https://www.youtube.com/watch?v=TKjYdLIMTrI) Video blogger Christopher Rose has made this nice video telling how to get started with the Vector Class Library. 13 | 14 | **Help:** You may ask for programming help on [StackOverflow](https://stackoverflow.com) using the tag vector-class-library. 
15 | -------------------------------------------------------------------------------- /changelog.txt: -------------------------------------------------------------------------------- 1 | Change log for Vector class library 2 | ----------------------------------- 3 | 4 | 2023-07-04 version 2.02.02 5 | * remove various MS compiler warnings 6 | 7 | 2022-06-03 version 2.02.01 8 | * minor bug fixes and updates 9 | 10 | 2022-07-20 version 2.02.00 11 | * support half precision floating point vectors 12 | * new trigonometric functions sinpi, cospi, sincospi, tanpi 13 | * floating point modulo and remainder functions 14 | * functions extend_z and concatenate2 15 | * functions compress_saturated with one integer parameter 16 | * support for Intel compiler 17 | * fix bug for NAN inputs in operator > and >= 18 | * fix bug for underflow in pow function 19 | * fix unsafe optimization in store_partial function 20 | * new add-on package with container class and matrix class templates 21 | * major improvements in add-on package 'complex' 22 | * minor improvements in add-on packages 'random' and 'vector3d' 23 | 24 | 2021-08-18 version 2.01.04 25 | * fix bug in elementtype for Vec8uq 26 | * improved optimization of permute functions in MS compiler by using constexpr 27 | 28 | 2020-11-04 version 2.01.03 29 | * fix overflow in sin, cos, tan for large x 30 | * fix bug in is_nan for instruction sets prior to AVX 31 | * warning for MS compiler versions with poor support for AVX512 32 | 33 | 2020-04-11 version 2.01.02 34 | * only minor fixes 35 | 36 | 2020-02-25 version 2.01.01 37 | * added function store_nt 38 | * New dispatch_example1.cpp, dispatch_example2.cpp 39 | 40 | 2019-11-23 version 2.01.00 41 | * problem with performance of permute and blend functions fixed by avoiding 42 | unions in constexpr functions 43 | 44 | 2019-10-31 version 2.00.02 45 | * bug fix in permute function 46 | * is_nan function improved 47 | * templates constant4ui etc. 
improved 48 | 49 | 2019-08-02 version 2.00 50 | Derived from version 1.40 51 | * use C++17 52 | * use compact boolean vectors of all sizes if AVX512VL is enabled 53 | * permute and blend functions improved, using C++17 metaprogramming features 54 | * deprecated functions removed 55 | 56 | 2019-08-02 version 1.40 57 | * hosted on github 58 | * license changed to Apache 2.0. 59 | * added classes Vec64c, Vec64uc, Vec32s, Vec32us 60 | * test bench and scripts for automatic testing of VCL 61 | * new functions: maximum, minimum, to_float, to_double 62 | * conversion of bitfields to boolean vectors with load_bits. This replaces to_vec4ib etc. 63 | * shift_bytes_up/down functions changed to templates (old versions deprecated) 64 | * removed VECTORMATH define. vectormath_lib.h rewritten. svmlpatch.lib added. 65 | * many improvements and bug fixes 66 | * renamed functions: round_to_int and round_to_int64 functions renamed to roundi, 67 | * renamed functions: the type letter is removed from all permute and blend functions, 68 | e.g. permute4f renamed to permute4. 69 | These changes are made to facilitate generic template programming 70 | * renamed functions: to_Vec.. 
Replaced by load_bits member functions 71 | * deprecated functions: set_bit, get_bit 72 | * deprecated bit vector classes: Vec128b, Vec256b, Vec512b 73 | 74 | 2017-07-27 version 1.30 75 | * fixed bug in permute8f for a particular combination of indexes 76 | 77 | 2017-05-10 version 1.29 78 | * Reversed Apple Clang patch in version 1.28 because the problem has reoccurred in 79 | later versions of Clang 80 | 81 | 2017-05-02 version 1.28 82 | * Fixed problem with Apple Clang version 6.2 in vectorf128.h 83 | * Fixed return type for Vec8sb operator > (Vec8us, Vec8us) 84 | * cpuid function modified in instrset_detect.cpp 85 | 86 | 2017-02-19 version 1.27 87 | * fixed problem with scatter functions in MS Visual Studio 88 | 89 | 2016-12-21 version 1.26 90 | * added constant4ui template 91 | * fixed error for complexvec.h with clang 92 | * fixed error in vectormath_exp.h for MAX_VECTOR_SIZE < 512 93 | 94 | 2016-11-25 version 1.25 95 | * scatter functions 96 | * new functions to_float for unsigned integer vectors 97 | * instrset_detect function can detect AVX512VL, AVX512BW, AVX512DQ 98 | * functions hasF16C and hasAVX512ER for detecting instruction set extensions 99 | * fix bugs in horizontal_and and pow(0,0) for AVX512 100 | * functions improved for AVX512 and AVX512VL: pow, approx_recipr, 101 | approx_rsqrt 102 | * functions improved for AVX512DQ: 64 bit multiplication, to_double, 103 | 32 and 64 bit rotate_left, round_to_int64, truncate_to_int64 104 | * functions improved for AVX512ER: approx_recipr, approx_rsqrt, 105 | exponential functions 106 | 107 | 2016-10-31 version 1.24 108 | * fix bug in Vec8uq constructor in vectori512e.h 109 | 110 | 2016-09-27 version 1.23 111 | * temporary fix of a problem in Clang version 3.9 inserted in vectorf128.h 112 | 113 | 2016-05-03 version 1.22 114 | * added optional namespace 115 | * fixed problem with decimal.h 116 | 117 | 2016-04-24 version 1.21 118 | * fix problems with XOP option in gcc 119 | * improved horizontal_and/or for sse2 
120 | * improved Vec2q and Vec4q constructor on Microsoft Visual Studio 2015 121 | * removed warnings by gcc option -Wcast-qual 122 | 123 | 2015-12-04 version 1.20 124 | * round functions: suppress precision exception under SSE4.1 and higher 125 | * fix compiler problems with AVX512 multiplication in gcc version 5.1 126 | * fix compiler problems with pow function in Microsoft Visual Studio 2015 127 | 128 | 2015-11-14 version 1.19 129 | * fix various problems with Clang compiler 130 | 131 | 2015-09-25 version 1.18 132 | * fix compiler error for Vec8s divide_by_i(Vec8s const & x) under Clang compiler 133 | * fix error in Vec4d::size() in vectorf256e.h 134 | 135 | 2015-07-31 version 1.17 136 | * improved operator > for Vec4uq 137 | * more special cases in blend4q 138 | * nan_code functions made static inline 139 | * template parameter BTYPE renamed to BVTYPE in mathematical functions to avoid clash 140 | with macro named BTYPE in winnt.h 141 | * fixed bug in Vec4db constructor 142 | 143 | 2014-10-24 version 1.16 144 | * workaround for problem in Clang compiler extended to version 3.09 because not 145 | fixed yet by Clang (vectorf128.h line 134) 146 | * recognize problem with Apple version of Clang reporting wrong version number 147 | * remove various minor problems with Clang 148 | * function pow(vector, int) modified to strengthen type checking and avoid compiler warnings 149 | * manual discusses dynamic allocation of arrays of vectors 150 | * various minor changes 151 | 152 | 2014-10-17 version 1.15 153 | * added files ranvec1.h and ranvec1.cpp for random number generator 154 | * constructors to make boolean vectors from their elements 155 | * constructors and = operators to broadcast boolean scalar into boolean vectors 156 | * various lookup functions improved 157 | * operators &, |, ^, ~, etc. defined for various boolean vectors to avoid conversion 158 | to integer vectors 159 | * nmul_add functions 160 | * mul_add etc.
moved to main header files 161 | * explicit fused multiply-and-add used in math functions to improve performance 162 | on compilers that don't automatically insert FMA 163 | 164 | 2014-07-24 version 1.14 165 | * support for AVX-512f instruction set and 512-bit vectors: 166 | Vec16i, Vec16ui, Vec8q, Vec8uq, Vec16f, Vec8d, and corresponding boolean vectors 167 | * new define MAX_VECTOR_SIZE, valid values are 128, 256 and 512 168 | * added hyperbolic functions sinh, cosh, tanh, asinh, acosh, atanh 169 | * size() member function on all vector classes returns the number of elements 170 | * functions for conversion between boolean vectors and integer bitfields 171 | * extracting an element from a boolean vector now returns a bool, not an int 172 | * improved precision in exp2 and exp10 functions 173 | * various bug fixes 174 | 175 | 2014-05-11 version 1.13 176 | * pow function improved 177 | * mul_add, mul_sub, mul_sub_x functions 178 | * propagation of error codes through nan_code function 179 | * "denormal" renamed to "subnormal" everywhere, in accordance with IEEE 754-2008 standard 180 | 181 | 2014-04-20 version 1.12 182 | * inline implementation of mathematical functions added (vectormath_exp.h vectormath_trig.h 183 | vectormath_common.h) 184 | * vectormath.h renamed to vectormath_lib.h because a new alternative is added 185 | * gather functions with constant indexes 186 | * function sign_combine 187 | * function pow_const(vector, const int) 188 | * function pow_ratio(vector, const int, const int) 189 | * functions horizontal_find_first, horizontal_count 190 | * function recipr_sqrt removed 191 | * functions round_to_int64_limited, truncate_to_int64_limited, to_double_limited 192 | * function cubic_root renamed to cbrt 193 | * function atan(vector,vector) renamed to atan2 194 | * function if_mul 195 | * function Vec4i round_to_int(Vec2d) 196 | * operator & (float vector, boolean vector) 197 | * operator &= (int vector, int vector) 198 | * removed constructor 
Vec128b(int) and Vec256b(int) to avoid implicit conversion 199 | * removed signalling nan function 200 | * minor improvements in various blend and lookup functions 201 | 202 | 2014-03-01 version 1.11 203 | * fixed missing unsigned operators >>= in vectori256.h 204 | 205 | 2013-10-04 version 1.10 206 | * clear distinction between boolean vectors and integer vectors for the sake of 207 | compatibility with mask registers in forthcoming AVX512 instruction set 208 | * added function if_add 209 | * tentative support for clang version 3.3 with workaround for bugs 210 | * remove ambiguity for builtin m128i operator == in clang compiler. 211 | * problems in clang compiler, bug reports filed at clang 212 | (http://llvm.org/bugs/show_bug.cgi?id=17164, 17312) 213 | * instrset.h fixes problem with macros named min and max in MS windows.h 214 | * workaround problem in MS Visual Studio 11.0. Bug report 735861 and 804274 215 | * minor bug fixes 216 | 217 | 2013-03-31 version 1.03 beta 218 | * bug fix for Vec2d cos (Vec2d const & x), VECTORMATH = 1 219 | 220 | 2012-08-01 version 1.02 beta 221 | * added file vector3d.h for 3-dimensional vectors 222 | * added file complexvec.h for complex numbers and complex vectors 223 | * added file quaternion.h for quaternions 224 | * added function change_sign for floating point vectors 225 | * added operators +, -, *, / between floating point vectors and scalars to remove 226 | overloading ambiguity 227 | 228 | 2012-07-08 version 1.01 beta 229 | * added file decimal.h with Number <-> string conversion functions: 230 | bin2bcd, bin2ascii, bin2hex_ascii, ascii2bin 231 | * added andnot function for boolean vectors 232 | * added functions shift_bytes_up and shift_bytes_down 233 | * added operators for unsigned integer vector classes: >>=, &, &&, |, ||, ^, ~ 234 | * inteldispatchpatch.cpp removed. 
Use asmlib instead (www.agner.org/optimize/#asmlib) 235 | * prefix ++ and -- operators now return a reference, postfix operators return a value 236 | * various improvements in permute and blend functions 237 | * minor improvement in abs function 238 | * added version number to VECTORCLASS_H 239 | 240 | 2012-05-30 version 1.00 beta 241 | * first public release at www.agner.org 242 | -------------------------------------------------------------------------------- /dispatch_example1.cpp: -------------------------------------------------------------------------------- 1 | /************************* dispatch_example1.cpp *************************** 2 | Author: Agner Fog 3 | Date created: 2012-05-30 4 | Last modified: 2020-02-25 5 | Version: 2.02.00 6 | Project: vector class library 7 | 8 | Description: Example of automatic CPU dispatching. 9 | This shows how to compile vector code in multiple versions, each 10 | optimized for a different instruction set. The optimal version is 11 | selected by a dispatcher at run time. 12 | 13 | There are two examples of automatic dispatching: 14 | 15 | dispatch_example1.cpp: Uses separate function names for each version. 16 | This is useful for simple cases with one or a few functions. 17 | 18 | dispatch_example2.cpp: Uses separate namespaces for each version. 19 | This is the recommended method for cases with multiple functions, 20 | classes, objects, etc. 21 | 22 | The code has two sections: 23 | 24 | Dispatched code: This code is compiled multiple times to generate multiple instances 25 | of the compiled code, each one optimized for a different instruction set. The 26 | dispatched code section contains the speed-critical part of the program. 27 | 28 | Common code: This code is compiled only once, using the lowest instruction set. 29 | The common code section contains the dispatcher, startup code, user interface, and 30 | other parts of the program that do not need advanced optimization. 
31 | 32 | To compile this code, do as in this example: 33 | 34 | # Example of compiling dispatch example with Gnu or Clang compiler: 35 | # Compile dispatch_example1.cpp four times for different instruction sets: 36 | 37 | # Compile for AVX 38 | clang++ -O2 -m64 -mavx -std=c++17 -c dispatch_example1.cpp -od7.o 39 | 40 | # Compile for AVX2 41 | clang++ -O2 -m64 -mavx2 -mfma -std=c++17 -c dispatch_example1.cpp -od8.o 42 | 43 | # Compile for AVX512 44 | clang++ -O2 -m64 -mavx512f -mfma -mavx512vl -mavx512bw -mavx512dq -std=c++17 -c dispatch_example1.cpp -od10.o 45 | 46 | # The last compilation uses the lowest supported instruction set (SSE2) 47 | # This includes the main program, and links all versions together: 48 | # (Change test.exe to test in Linux and Mac) 49 | clang++ -O2 -m64 -msse2 -std=c++17 dispatch_example1.cpp instrset_detect.cpp d7.o d8.o d10.o -otest.exe 50 | 51 | # Run the program 52 | ./test.exe 53 | 54 | (c) Copyright 2012-2022 Agner Fog. 55 | Apache License version 2.0 or later. 56 | ******************************************************************************/ 57 | 58 | /* The different instruction sets are defined in instrset_detect.cpp: 59 | 2: SSE2 60 | 3: SSE3 61 | 4: SSSE3 (Supplementary SSE3) 62 | 5: SSE4.1 63 | 6: SSE4.2 64 | 7: AVX 65 | 8: AVX2 66 | 9: AVX512F 67 | 10: AVX512VL + AVX512BW + AVX512DQ 68 | */ 69 | 70 | 71 | #include <stdio.h> 72 | #include "vectorclass.h" 73 | 74 | // Define function type 75 | // Change this to fit the entry function.
Should not contain vector types: 76 | typedef float MyFuncType(float const []); 77 | 78 | // function prototypes for each version (MyFuncType is a function type, so these lines declare functions, not variables) 79 | MyFuncType myfunc_SSE2, myfunc_AVX, myfunc_AVX2, myfunc_AVX512; 80 | 81 | // function prototypes for common entry point and dispatcher 82 | MyFuncType myfunc, myfunc_dispatch; 83 | 84 | // Define name of entry function depending on which instruction set we compile for 85 | #if INSTRSET >= 10 // AVX512VL 86 | #define FUNCNAME myfunc_AVX512 87 | #elif INSTRSET >= 8 // AVX2 (INSTRSET 9, plain AVX512F, also lands here) 88 | #define FUNCNAME myfunc_AVX2 89 | #elif INSTRSET >= 7 // AVX 90 | #define FUNCNAME myfunc_AVX 91 | #elif INSTRSET == 2 92 | #define FUNCNAME myfunc_SSE2 // SSE2 93 | #else // INSTRSET below 2 or in the range 3..6 (SSE3..SSE4.2): no version is built for these 94 | #error Unsupported instruction set 95 | #endif 96 | 97 | /****************************************************************************** 98 | Dispatched code 99 | 100 | Everything in this section is compiled multiple times, with one version for 101 | each instruction set. Speed-critical vector code belongs here. 102 | ******************************************************************************/ 103 | 104 | // This is the dispatched function that is compiled in multiple versions with different names. 105 | // Make sure this function is static to prevent clash with other versions having the same name. 106 | // The function cannot be a member of a class. 107 | static float sum (float const f[]) { 108 | // This example adds 16 floats; presumably f must hold at least 16 elements (TODO confirm against Vec16f::load) 109 | Vec16f a; // vector of 16 floats 110 | a.load(f); // load array into vector 111 | return horizontal_add(a); // return sum of 16 elements 112 | } 113 | 114 | // ----------------------------------------------------------------------------- 115 | // Entry function 116 | // ----------------------------------------------------------------------------- 117 | // This is the entry function that is accessed through the dispatcher. 118 | // This serves as the interface between the common code and the dispatched code.
119 | // The entry function cannot be a member of a class. 120 | // The entry function must use arrays rather than vectors for input and output. 121 | float FUNCNAME (float const f[]) { 122 | return sum(f); 123 | } 124 | 125 | 126 | /********************************************************************************** 127 | Common code 128 | 129 | Everything in this section is compiled only once, using the lowest instruction set. 130 | 131 | The dispatcher must be placed here. Program main(), user interface, and other 132 | less critical parts of the code are also placed in the common code section. 133 | **********************************************************************************/ 134 | 135 | #if INSTRSET == 2 136 | // The common code is only included in the lowest of the compiled versions 137 | 138 | 139 | // --------------------------------------------------------------------------------- 140 | // Dispatcher 141 | // --------------------------------------------------------------------------------- 142 | // This function pointer initially points to the dispatcher. 143 | // After the first call, it points to the selected version of the entry function 144 | MyFuncType * myfunc_pointer = &myfunc_dispatch; // function pointer 145 | 146 | // Dispatcher: picks the best compiled version for the running CPU, stores it in myfunc_pointer, then forwards this call to it 147 | float myfunc_dispatch(float const f[]) { 148 | int iset = instrset_detect(); // Detect supported instruction set 149 | // Choose which version of the entry function we want to point to: 150 | if (iset >= 10) myfunc_pointer = &myfunc_AVX512; // AVX512 version 151 | else if (iset >= 8) myfunc_pointer = &myfunc_AVX2; // AVX2 version 152 | else if (iset >= 7) myfunc_pointer = &myfunc_AVX; // AVX version 153 | else if (iset >= 2) myfunc_pointer = &myfunc_SSE2; // SSE2 version 154 | else { 155 | // Error: lowest instruction set not supported.
156 | // Put any appropriate error handler here. myfunc_pointer still points to the dispatcher on this path, so the next call repeats the detection 157 | fprintf(stderr, "\nError: Instruction set SSE2 not supported on this computer"); 158 | return 0.f; 159 | } 160 | // continue in dispatched version of the function 161 | return (*myfunc_pointer)(f); 162 | } 163 | 164 | 165 | // Call the entry function through the function pointer. 166 | // The first time this function is called, it goes through the dispatcher. 167 | // The dispatcher will change the function pointer so that all subsequent 168 | // calls go directly to the optimal version of the entry function 169 | inline float myfunc(float const f[]) { 170 | return (*myfunc_pointer)(f); // go to dispatched version 171 | } 172 | 173 | 174 | // --------------------------------------------------------------------------------- 175 | // Program main 176 | // --------------------------------------------------------------------------------- 177 | int main() { 178 | 179 | // array of 16 floats 180 | float const a[16] = {1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16}; 181 | 182 | float sum = myfunc(a); // call function with dispatching (this local hides the static function sum inside main) 183 | 184 | printf("\nsum = %8.2f \n", sum); // print result (= 136.00) 185 | 186 | return 0; 187 | } 188 | 189 | #endif // INSTRSET == 2 190 | -------------------------------------------------------------------------------- /dispatch_example2.cpp: -------------------------------------------------------------------------------- 1 | /************************* dispatch_example2.cpp *************************** 2 | Author: Agner Fog 3 | Date created: 2012-05-30 4 | Last modified: 2023-06-03 5 | Version: 2.02.00 6 | Project: vector class library 7 | Description: Example of automatic CPU dispatching. 8 | This shows how to compile vector code in multiple versions, each 9 | optimized for a different instruction set. The optimal version is 10 | selected by a dispatcher at run time.
11 | 12 | There are two examples of automatic dispatching: 13 | 14 | dispatch_example1.cpp: Uses separate function names for each version. 15 | This is useful for simple cases with one or a few functions. 16 | 17 | dispatch_example2.cpp: Uses separate namespaces for each version. 18 | This is the recommended method for cases with multiple functions, 19 | classes, objects, etc. 20 | 21 | The code has two sections: 22 | 23 | Dispatched code: This code is compiled multiple times to generate multiple instances 24 | of the compiled code, each one optimized for a different instruction set. The 25 | dispatched code section contains the speed-critical part of the program. 26 | 27 | Common code: This code is compiled only once, using the lowest instruction set. 28 | The common code section contains the dispatcher, startup code, user interface, and 29 | other parts of the program that do not need advanced optimization. 30 | 31 | To compile this code, do as in this example: 32 | 33 | # Example of compiling dispatch example with Gnu or Clang compiler: 34 | # Compile dispatch_example2.cpp four times for different instruction sets: 35 | 36 | # Compile for AVX 37 | clang++ -O2 -m64 -mavx -std=c++17 -c dispatch_example2.cpp -od7.o 38 | 39 | # Compile for AVX2 40 | clang++ -O2 -m64 -mavx2 -mfma -std=c++17 -c dispatch_example2.cpp -od8.o 41 | 42 | # Compile for AVX512 43 | clang++ -O2 -m64 -mavx512f -mfma -mavx512vl -mavx512bw -mavx512dq -std=c++17 -c dispatch_example2.cpp -od10.o 44 | 45 | # The last compilation uses the lowest supported instruction set (SSE2) 46 | # This includes the main program, and links all versions together: 47 | clang++ -O2 -m64 -msse2 -std=c++17 dispatch_example2.cpp instrset_detect.cpp d7.o d8.o d10.o -otest.exe 48 | 49 | # Run the program 50 | ./test.exe 51 | 52 | (c) Copyright 2012-2023 Agner Fog. 53 | Apache License version 2.0 or later. 
54 | ******************************************************************************/ 55 | 56 | /* The different instruction sets are defined in instrset_detect.cpp: 57 | 2: SSE2 58 | 3: SSE3 59 | 4: SSSE3 (Supplementary SSE3) 60 | 5: SSE4.1 61 | 6: SSE4.2 62 | 7: AVX 63 | 8: AVX2 64 | 9: AVX512F 65 | 10: AVX512VL + AVX512BW + AVX512DQ 66 | */ 67 | 68 | #include 69 | #include "vectorclass.h" 70 | 71 | // Define function type 72 | // Change this to fit the entry function. Should not contain vector types: 73 | typedef float MyFuncType(float const []); 74 | 75 | // Define function prototypes for each version 76 | namespace Ns_SSE2{ // SSE2 instruction set 77 | MyFuncType myfunc; 78 | }; 79 | namespace Ns_AVX{ // AVX instruction set 80 | MyFuncType myfunc; 81 | }; 82 | namespace Ns_AVX2{ // AVX2 instruction set 83 | MyFuncType myfunc; 84 | }; 85 | namespace Ns_AVX512{ // AVX512 instruction set 86 | MyFuncType myfunc; 87 | }; 88 | 89 | // function prototypes for entry function and dispatcher, defined outside namespace 90 | MyFuncType myfunc, myfunc_dispatch; 91 | 92 | 93 | // ---------------------------------------------------------------------------- 94 | // Choose namespace name depending on which instruction set we compile for. 
95 | // (You may place this in a header file if it is used in multiple cpp files) 96 | // ---------------------------------------------------------------------------- 97 | #if INSTRSET >= 10 // AVX512VL 98 | #define DISPATCHED_NAMESPACE Ns_AVX512 99 | #elif INSTRSET >= 8 // AVX2 100 | #define DISPATCHED_NAMESPACE Ns_AVX2 101 | #elif INSTRSET >= 7 // AVX 102 | #define DISPATCHED_NAMESPACE Ns_AVX 103 | #elif INSTRSET == 2 104 | #define DISPATCHED_NAMESPACE Ns_SSE2 // SSE2 105 | #else 106 | #error Unsupported instruction set 107 | #endif 108 | // ---------------------------------------------------------------------------- 109 | 110 | 111 | /****************************************************************************** 112 | Dispatched code 113 | 114 | Everything in this section is compiled multiple times, with one version for 115 | each instruction set. Speed-critical vector code belongs here. 116 | ******************************************************************************/ 117 | 118 | // Enclose all multiversion code in the chosen namespace 119 | namespace DISPATCHED_NAMESPACE { 120 | 121 | // This section may contain vectors, functions, classes, objects, etc. 122 | 123 | class MyClass { // Just a silly example 124 | public: 125 | float sum(float const f[]) { // This function adds 16 floats 126 | Vec16f a; // Vector of 16 floats 127 | a.load(f); // Load array into vector 128 | return horizontal_add(a); // Return sum of 16 elements 129 | } 130 | }; 131 | 132 | // ----------------------------------------------------------------------------- 133 | // Entry function 134 | // ----------------------------------------------------------------------------- 135 | // This is the entry function that is accessed through the dispatcher. 136 | // This serves as the interface between the common code and the dispatched code. 137 | // The entry function cannot be member of a class. 138 | // The entry function must use arrays rather than vectors for input and output. 
139 | float myfunc(float const f[]) { 140 | MyClass myObject; 141 | return myObject.sum(f); 142 | } 143 | } 144 | 145 | /********************************************************************************** 146 | Common code 147 | 148 | Everything in this section is compiled only once, using the lowest instruction set. 149 | 150 | The dispatcher must be placed here. Program main(), user interface, and other 151 | less critical parts of the code are also placed in the common code section. 152 | **********************************************************************************/ 153 | 154 | #if INSTRSET == 2 155 | // The common code is only included in the lowest of the compiled versions 156 | 157 | 158 | // --------------------------------------------------------------------------------- 159 | // Dispacther 160 | // --------------------------------------------------------------------------------- 161 | // This function pointer initially points to the dispatcher. 162 | // After the first call, it points to the selected version of the entry function 163 | MyFuncType * myfunc_pointer = &myfunc_dispatch; // function pointer 164 | 165 | // Dispatch function 166 | float myfunc_dispatch(float const f[]) { 167 | int iset = instrset_detect(); // Detect supported instruction set 168 | // Choose which version of the entry function we want to point to: 169 | if (iset >= 10) myfunc_pointer = &Ns_AVX512::myfunc; // AVX512 version 170 | else if (iset >= 8) myfunc_pointer = &Ns_AVX2::myfunc; // AVX2 version 171 | else if (iset >= 7) myfunc_pointer = &Ns_AVX::myfunc; // AVX version 172 | else if (iset >= 2) myfunc_pointer = &Ns_SSE2::myfunc; // SSE2 version 173 | else { 174 | // Error: lowest instruction set not supported. 
175 | // Put any appropriate error handler here 176 | fprintf(stderr, "\nError: Instruction set SSE2 not supported on this computer"); 177 | return 0.f; 178 | } 179 | // continue in the dispatched version of the entry function 180 | return (*myfunc_pointer)(f); 181 | } 182 | 183 | 184 | // Call the entry function through the function pointer. 185 | // The first time this function is called, it goes through the dispatcher. 186 | // The dispatcher will change the function pointer so that all subsequent 187 | // calls go directly to the optimal version of the entry function 188 | inline float myfunc(float const f[]) { 189 | return (*myfunc_pointer)(f); // go to dispatched version 190 | } 191 | 192 | 193 | // --------------------------------------------------------------------------------- 194 | // Program main 195 | // --------------------------------------------------------------------------------- 196 | int main() { 197 | 198 | // Array of 16 floats 199 | float const a[16] = {1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16}; 200 | 201 | float sum = myfunc(a); // call function with dispatching 202 | 203 | printf("\nsum = %8.2f \n", sum); // print result (= 136.00) 204 | 205 | return 0; 206 | } 207 | 208 | #endif // INSTRSET == 2 209 | -------------------------------------------------------------------------------- /instrset_detect.cpp: -------------------------------------------------------------------------------- 1 | /************************** instrset_detect.cpp **************************** 2 | * Author: Agner Fog 3 | * Date created: 2012-05-30 4 | * Last modified: 2022-07-20 5 | * Version: 2.02.00 6 | * Project: vector class library 7 | * Description: 8 | * Functions for checking which instruction sets are supported. 9 | * 10 | * (c) Copyright 2012-2022 Agner Fog. 11 | * Apache License version 2.0 or later. 
12 | ******************************************************************************/ 13 | 14 | #include "instrset.h" 15 | 16 | #ifdef VCL_NAMESPACE 17 | namespace VCL_NAMESPACE { 18 | #endif 19 | 20 | 21 | // Define interface to xgetbv instruction 22 | static inline uint64_t xgetbv (int ctr) { 23 | #if (defined (_MSC_FULL_VER) && _MSC_FULL_VER >= 160040000) || (defined (__INTEL_COMPILER) && __INTEL_COMPILER >= 1200) 24 | // Microsoft or Intel compiler supporting _xgetbv intrinsic 25 | 26 | return uint64_t(_xgetbv(ctr)); // intrinsic function for XGETBV 27 | 28 | #elif defined(__GNUC__) || defined (__clang__) // use inline assembly, Gnu/AT&T syntax 29 | 30 | uint32_t a, d; 31 | __asm("xgetbv" : "=a"(a),"=d"(d) : "c"(ctr) : ); 32 | return a | (uint64_t(d) << 32); 33 | 34 | #else // #elif defined (_WIN32) // other compiler. try inline assembly with masm/intel/MS syntax 35 | uint32_t a, d; 36 | __asm { 37 | mov ecx, ctr 38 | _emit 0x0f 39 | _emit 0x01 40 | _emit 0xd0 ; // xgetbv 41 | mov a, eax 42 | mov d, edx 43 | } 44 | return a | (uint64_t(d) << 32); 45 | 46 | #endif 47 | } 48 | 49 | /* find supported instruction set 50 | return value: 51 | 0 = 80386 instruction set 52 | 1 or above = SSE (XMM) supported by CPU (not testing for OS support) 53 | 2 or above = SSE2 54 | 3 or above = SSE3 55 | 4 or above = Supplementary SSE3 (SSSE3) 56 | 5 or above = SSE4.1 57 | 6 or above = SSE4.2 58 | 7 or above = AVX supported by CPU and operating system 59 | 8 or above = AVX2 60 | 9 or above = AVX512F 61 | 10 or above = AVX512VL, AVX512BW, AVX512DQ 62 | */ 63 | int instrset_detect(void) { 64 | 65 | static int iset = -1; // remember value for next call 66 | if (iset >= 0) { 67 | return iset; // called before 68 | } 69 | iset = 0; // default value 70 | int abcd[4] = {0,0,0,0}; // cpuid results 71 | cpuid(abcd, 0); // call cpuid function 0 72 | if (abcd[0] == 0) return iset; // no further cpuid function supported 73 | cpuid(abcd, 1); // call cpuid function 1 for feature flags 74 | if 
((abcd[3] & (1 << 0)) == 0) return iset; // no floating point 75 | if ((abcd[3] & (1 << 23)) == 0) return iset; // no MMX 76 | if ((abcd[3] & (1 << 15)) == 0) return iset; // no conditional move 77 | if ((abcd[3] & (1 << 24)) == 0) return iset; // no FXSAVE 78 | if ((abcd[3] & (1 << 25)) == 0) return iset; // no SSE 79 | iset = 1; // 1: SSE supported 80 | if ((abcd[3] & (1 << 26)) == 0) return iset; // no SSE2 81 | iset = 2; // 2: SSE2 supported 82 | if ((abcd[2] & (1 << 0)) == 0) return iset; // no SSE3 83 | iset = 3; // 3: SSE3 supported 84 | if ((abcd[2] & (1 << 9)) == 0) return iset; // no SSSE3 85 | iset = 4; // 4: SSSE3 supported 86 | if ((abcd[2] & (1 << 19)) == 0) return iset; // no SSE4.1 87 | iset = 5; // 5: SSE4.1 supported 88 | if ((abcd[2] & (1 << 23)) == 0) return iset; // no POPCNT 89 | if ((abcd[2] & (1 << 20)) == 0) return iset; // no SSE4.2 90 | iset = 6; // 6: SSE4.2 supported 91 | if ((abcd[2] & (1 << 27)) == 0) return iset; // no OSXSAVE 92 | if ((xgetbv(0) & 6) != 6) return iset; // AVX not enabled in O.S. 
93 | if ((abcd[2] & (1 << 28)) == 0) return iset; // no AVX 94 | iset = 7; // 7: AVX supported 95 | cpuid(abcd, 7); // call cpuid leaf 7 for feature flags 96 | if ((abcd[1] & (1 << 5)) == 0) return iset; // no AVX2 97 | iset = 8; 98 | if ((abcd[1] & (1 << 16)) == 0) return iset; // no AVX512 99 | cpuid(abcd, 0xD); // call cpuid leaf 0xD for feature flags 100 | if ((abcd[0] & 0x60) != 0x60) return iset; // no AVX512 101 | iset = 9; 102 | cpuid(abcd, 7); // call cpuid leaf 7 for feature flags 103 | if ((abcd[1] & (1 << 31)) == 0) return iset; // no AVX512VL 104 | if ((abcd[1] & 0x40020000) != 0x40020000) return iset; // no AVX512BW, AVX512DQ 105 | iset = 10; 106 | return iset; 107 | } 108 | 109 | // detect if CPU supports the FMA3 instruction set 110 | bool hasFMA3(void) { 111 | if (instrset_detect() < 7) return false; // must have AVX 112 | int abcd[4]; // cpuid results 113 | cpuid(abcd, 1); // call cpuid function 1 114 | return ((abcd[2] & (1 << 12)) != 0); // ecx bit 12 indicates FMA3 115 | } 116 | 117 | // detect if CPU supports the FMA4 instruction set 118 | bool hasFMA4(void) { 119 | if (instrset_detect() < 7) return false; // must have AVX 120 | int abcd[4]; // cpuid results 121 | cpuid(abcd, 0x80000001); // call cpuid function 0x80000001 122 | return ((abcd[2] & (1 << 16)) != 0); // ecx bit 16 indicates FMA4 123 | } 124 | 125 | // detect if CPU supports the XOP instruction set 126 | bool hasXOP(void) { 127 | if (instrset_detect() < 7) return false; // must have AVX 128 | int abcd[4]; // cpuid results 129 | cpuid(abcd, 0x80000001); // call cpuid function 0x80000001 130 | return ((abcd[2] & (1 << 11)) != 0); // ecx bit 11 indicates XOP 131 | } 132 | 133 | // detect if CPU supports the AVX512ER instruction set 134 | bool hasAVX512ER(void) { 135 | if (instrset_detect() < 9) return false; // must have AVX512F 136 | int abcd[4]; // cpuid results 137 | cpuid(abcd, 7); // call cpuid function 7 138 | return ((abcd[1] & (1 << 27)) != 0); // ebx bit 27 indicates 
AVX512ER 139 | } 140 | 141 | // detect if CPU supports the AVX512VBMI instruction set 142 | bool hasAVX512VBMI(void) { 143 | if (instrset_detect() < 10) return false; // must have AVX512BW 144 | int abcd[4]; // cpuid results 145 | cpuid(abcd, 7); // call cpuid function 7 146 | return ((abcd[2] & (1 << 1)) != 0); // ecx bit 1 indicates AVX512VBMI 147 | } 148 | 149 | // detect if CPU supports the AVX512VBMI2 instruction set 150 | bool hasAVX512VBMI2(void) { 151 | if (instrset_detect() < 10) return false; // must have AVX512BW 152 | int abcd[4]; // cpuid results 153 | cpuid(abcd, 7); // call cpuid function 7 154 | return ((abcd[2] & (1 << 6)) != 0); // ecx bit 6 indicates AVX512VBMI2 155 | } 156 | 157 | // detect if CPU supports the F16C instruction set 158 | bool hasF16C(void) { 159 | if (instrset_detect() < 7) return false; // must have AVX 160 | int abcd[4]; // cpuid results 161 | cpuid(abcd, 1); // call cpuid function 1 162 | return ((abcd[2] & (1 << 29)) != 0); // ecx bit 29 indicates F16C 163 | } 164 | 165 | // detect if CPU supports the AVX512_FP16 instruction set 166 | bool hasAVX512FP16(void) { 167 | if (instrset_detect() < 10) return false; // must have AVX512 168 | int abcd[4]; // cpuid results 169 | cpuid(abcd, 7); // call cpuid function 1 170 | return ((abcd[3] & (1 << 23)) != 0); // edx bit 23 indicates AVX512_FP16 171 | } 172 | 173 | 174 | #ifdef VCL_NAMESPACE 175 | } 176 | #endif 177 | -------------------------------------------------------------------------------- /vector_convert.h: -------------------------------------------------------------------------------- 1 | /************************** vector_convert.h ******************************* 2 | * Author: Agner Fog 3 | * Date created: 2014-07-23 4 | * Last modified: 2022-07-20 5 | * Version: 2.02.00 6 | * Project: vector class library 7 | * Description: 8 | * Header file for conversion between different vector classes with different 9 | * sizes. Also includes verious generic template functions. 
*
* (c) Copyright 2012-2022 Agner Fog.
* Apache License version 2.0 or later.
*****************************************************************************/

#ifndef VECTOR_CONVERT_H
#define VECTOR_CONVERT_H

#ifndef VECTORCLASS_H
#include "vectorclass.h"
#endif

#if VECTORCLASS_H < 20200
#error Incompatible versions of vector class library mixed
#endif

#ifdef VCL_NAMESPACE
namespace VCL_NAMESPACE {
#endif

#if MAX_VECTOR_SIZE >= 256

/*****************************************************************************
*
*          Extend from 128 to 256 bit vectors
*
*****************************************************************************/

// Each extend() overload takes a 128-bit integer vector and returns a 256-bit
// vector with the same number of elements, each element twice as wide.
// Signed element types are sign extended, unsigned ones zero extended.

#if INSTRSET >= 8  // AVX2. 256 bit integer vectors

// sign extend 8-bit elements to 16 bits
static inline Vec16s extend (Vec16c const a) {
    return _mm256_cvtepi8_epi16(a);
}

// zero extend 8-bit elements to 16 bits
static inline Vec16us extend (Vec16uc const a) {
    return _mm256_cvtepu8_epi16(a);
}

// sign extend 16-bit elements to 32 bits
static inline Vec8i extend (Vec8s const a) {
    return _mm256_cvtepi16_epi32(a);
}

// zero extend 16-bit elements to 32 bits
static inline Vec8ui extend (Vec8us const a) {
    return _mm256_cvtepu16_epi32(a);
}

// sign extend 32-bit elements to 64 bits
static inline Vec4q extend (Vec4i const a) {
    return _mm256_cvtepi32_epi64(a);
}

// zero extend 32-bit elements to 64 bits
static inline Vec4uq extend (Vec4ui const a) {
    return _mm256_cvtepu32_epi64(a);
}


#else  // no AVX2.
// 256 bit integer vectors are emulated

// sign extend and zero extend functions:
// The result is built from two halves produced by extend_low / extend_high
// (defined elsewhere in the library) applied to the 128-bit input.
static inline Vec16s extend (Vec16c const a) {
    return Vec16s(extend_low(a), extend_high(a));
}

static inline Vec16us extend (Vec16uc const a) {
    return Vec16us(extend_low(a), extend_high(a));
}

static inline Vec8i extend (Vec8s const a) {
    return Vec8i(extend_low(a), extend_high(a));
}

static inline Vec8ui extend (Vec8us const a) {
    return Vec8ui(extend_low(a), extend_high(a));
}

static inline Vec4q extend (Vec4i const a) {
    return Vec4q(extend_low(a), extend_high(a));
}

static inline Vec4uq extend (Vec4ui const a) {
    return Vec4uq(extend_low(a), extend_high(a));
}

#endif  // AVX2

/*****************************************************************************
*
*          Conversions between float and double
*
*****************************************************************************/
#if INSTRSET >= 7  // AVX. 256 bit float vectors

// float to double: 4 floats -> 4 doubles
static inline Vec4d to_double (Vec4f const a) {
    return _mm256_cvtps_pd(a);
}

// double to float: 4 doubles -> 4 floats
static inline Vec4f to_float (Vec4d const a) {
    return _mm256_cvtpd_ps(a);
}

#else  // no AVX.
// 256 bit float vectors are emulated

// float to double: 4 floats -> 4 doubles, converted as two 128-bit halves
static inline Vec4d to_double (Vec4f const a) {
    Vec2d lo = _mm_cvtps_pd(a);                       // elements 0-1
    Vec2d hi = _mm_cvtps_pd(_mm_movehl_ps(a, a));     // elements 2-3 moved down first
    return Vec4d(lo,hi);
}

// double to float: 4 doubles -> 4 floats
static inline Vec4f to_float (Vec4d const a) {
    Vec4f lo = _mm_cvtpd_ps(a.get_low());             // result in the two low elements
    Vec4f hi = _mm_cvtpd_ps(a.get_high());            // result in the two low elements
    return _mm_movelh_ps(lo, hi);                     // low halves of lo and hi joined
}

#endif

/*****************************************************************************
*
*          Reduce from 256 to 128 bit vectors
*
*****************************************************************************/
#if INSTRSET >= 10  // AVX512VL

// compress functions. overflow wraps around
// Each overload narrows every element of a 256-bit integer vector to half its
// width and returns a 128-bit vector with the same number of elements.
static inline Vec16c compress (Vec16s const a) {
    return _mm256_cvtepi16_epi8(a);
}

static inline Vec16uc compress (Vec16us const a) {
    return _mm256_cvtepi16_epi8(a);
}

static inline Vec8s compress (Vec8i const a) {
    return _mm256_cvtepi32_epi16(a);
}

static inline Vec8us compress (Vec8ui const a) {
    return _mm256_cvtepi32_epi16(a);
}

static inline Vec4i compress (Vec4q const a) {
    return _mm256_cvtepi64_epi32(a);
}

static inline Vec4ui compress (Vec4uq const a) {
    return _mm256_cvtepi64_epi32(a);
}

// compress_saturated functions.
// overflow saturates
// Signed overloads use the signed-saturating conversions (cvtsepi*),
// unsigned overloads the unsigned-saturating ones (cvtusepi*).
static inline Vec16c compress_saturated (Vec16s const a) {
    return _mm256_cvtsepi16_epi8(a);
}

static inline Vec16uc compress_saturated (Vec16us const a) {
    return _mm256_cvtusepi16_epi8(a);
}

static inline Vec8s compress_saturated (Vec8i const a) {
    return _mm256_cvtsepi32_epi16(a);
}

static inline Vec8us compress_saturated (Vec8ui const a) {
    return _mm256_cvtusepi32_epi16(a);
}

static inline Vec4i compress_saturated (Vec4q const a) {
    return _mm256_cvtsepi64_epi32(a);
}

static inline Vec4ui compress_saturated (Vec4uq const a) {
    return _mm256_cvtusepi64_epi32(a);
}


#else  // no AVX512

// compress functions. overflow wraps around
// Without AVX512VL each overload forwards the two 128-bit halves to the
// two-argument compress() defined elsewhere in the library.
static inline Vec16c compress (Vec16s const a) {
    return compress(a.get_low(), a.get_high());
}

static inline Vec16uc compress (Vec16us const a) {
    return compress(a.get_low(), a.get_high());
}

static inline Vec8s compress (Vec8i const a) {
    return compress(a.get_low(), a.get_high());
}

static inline Vec8us compress (Vec8ui const a) {
    return compress(a.get_low(), a.get_high());
}

static inline Vec4i compress (Vec4q const a) {
    return compress(a.get_low(), a.get_high());
}

static inline Vec4ui compress (Vec4uq const a) {
    return compress(a.get_low(), a.get_high());
}

// compress_saturated functions.
// overflow saturates
// Without AVX512VL each overload forwards the two 128-bit halves to the
// two-argument compress_saturated() defined elsewhere in the library.
static inline Vec16c compress_saturated (Vec16s const a) {
    return compress_saturated(a.get_low(), a.get_high());
}

static inline Vec16uc compress_saturated (Vec16us const a) {
    return compress_saturated(a.get_low(), a.get_high());
}

static inline Vec8s compress_saturated (Vec8i const a) {
    return compress_saturated(a.get_low(), a.get_high());
}

static inline Vec8us compress_saturated (Vec8ui const a) {
    return compress_saturated(a.get_low(), a.get_high());
}

static inline Vec4i compress_saturated (Vec4q const a) {
    return compress_saturated(a.get_low(), a.get_high());
}

static inline Vec4ui compress_saturated (Vec4uq const a) {
    return compress_saturated(a.get_low(), a.get_high());
}

#endif  // AVX512

#endif // MAX_VECTOR_SIZE >= 256


#if MAX_VECTOR_SIZE >= 512

/*****************************************************************************
*
*          Reduce from 512 to 256 bit vectors
*
*****************************************************************************/
#if INSTRSET >= 10  // AVX512VL

// compress_saturated functions.
// overflow saturates
// Each overload narrows every element of a 512-bit integer vector to half its
// width with saturation and returns a 256-bit vector.
static inline Vec32c compress_saturated (Vec32s const a) {
    return _mm512_cvtsepi16_epi8(a);
}

static inline Vec32uc compress_saturated (Vec32us const a) {
    return _mm512_cvtusepi16_epi8(a);
}

static inline Vec16s compress_saturated (Vec16i const a) {
    return _mm512_cvtsepi32_epi16(a);
}

static inline Vec16us compress_saturated (Vec16ui const a) {
    return _mm512_cvtusepi32_epi16(a);
}

static inline Vec8i compress_saturated (Vec8q const a) {
    return _mm512_cvtsepi64_epi32(a);
}

static inline Vec8ui compress_saturated (Vec8uq const a) {
    return _mm512_cvtusepi64_epi32(a);
}

#else  // no AVX512

// compress_saturated functions. overflow saturates
// Each overload forwards the two 256-bit halves to the two-argument
// compress_saturated() defined elsewhere in the library.
static inline Vec32c compress_saturated (Vec32s const a) {
    return compress_saturated(a.get_low(), a.get_high());
}

static inline Vec32uc compress_saturated (Vec32us const a) {
    return compress_saturated(a.get_low(), a.get_high());
}

static inline Vec16s compress_saturated (Vec16i const a) {
    return compress_saturated(a.get_low(), a.get_high());
}

static inline Vec16us compress_saturated (Vec16ui const a) {
    return compress_saturated(a.get_low(), a.get_high());
}

static inline Vec8i compress_saturated (Vec8q const a) {
    return compress_saturated(a.get_low(), a.get_high());
}

static inline Vec8ui compress_saturated (Vec8uq const a) {
    return compress_saturated(a.get_low(), a.get_high());
}

#endif  // AVX512

/*****************************************************************************
*
*          Extend from 256 to 512 bit vectors
*
*****************************************************************************/

#if INSTRSET >= 9  // AVX512.
// 512 bit integer vectors

// Each extend() overload takes a 256-bit integer vector and returns a 512-bit
// vector with the same number of elements, each element twice as wide.

// sign extend 8-bit elements to 16 bits
static inline Vec32s extend (Vec32c const a) {
#if INSTRSET >= 10
    return _mm512_cvtepi8_epi16(a);
#else
    // below instruction set level 10 the result is assembled from two halves
    return Vec32s(extend_low(a), extend_high(a));
#endif
}

// zero extend 8-bit elements to 16 bits
static inline Vec32us extend (Vec32uc const a) {
#if INSTRSET >= 10
    return _mm512_cvtepu8_epi16(a);
#else
    // below instruction set level 10 the result is assembled from two halves
    return Vec32us(extend_low(a), extend_high(a));
#endif
}

// sign extend 16-bit elements to 32 bits
static inline Vec16i extend (Vec16s const a) {
    return _mm512_cvtepi16_epi32(a);
}

// zero extend 16-bit elements to 32 bits
static inline Vec16ui extend (Vec16us const a) {
    return _mm512_cvtepu16_epi32(a);
}

// sign extend 32-bit elements to 64 bits
static inline Vec8q extend (Vec8i const a) {
    return _mm512_cvtepi32_epi64(a);
}

// zero extend 32-bit elements to 64 bits
static inline Vec8uq extend (Vec8ui const a) {
    return _mm512_cvtepu32_epi64(a);
}

#else  // no AVX512.
// 512 bit vectors are emulated

// Every overload below builds the 512-bit result from two 256-bit halves
// produced by extend_low / extend_high (defined elsewhere in the library).

// sign extend
static inline Vec32s extend (Vec32c const a) {
    return Vec32s(extend_low(a), extend_high(a));
}

// zero extend
static inline Vec32us extend (Vec32uc const a) {
    return Vec32us(extend_low(a), extend_high(a));
}

// sign extend
static inline Vec16i extend (Vec16s const a) {
    return Vec16i(extend_low(a), extend_high(a));
}

// zero extend
static inline Vec16ui extend (Vec16us const a) {
    return Vec16ui(extend_low(a), extend_high(a));
}

// sign extend
static inline Vec8q extend (Vec8i const a) {
    return Vec8q(extend_low(a), extend_high(a));
}

// zero extend
static inline Vec8uq extend (Vec8ui const a) {
    return Vec8uq(extend_low(a), extend_high(a));
}

#endif  // AVX512


/*****************************************************************************
*
*          Reduce from 512 to 256 bit vectors
*
*****************************************************************************/
#if INSTRSET >= 9  // AVX512F

// compress functions.
// overflow wraps around
// Each overload narrows every element of a 512-bit integer vector to half its
// width and returns a 256-bit vector with the same number of elements.
static inline Vec32c compress (Vec32s const a) {
#if INSTRSET >= 10  // AVX512BW
    return _mm512_cvtepi16_epi8(a);
#else
    return compress(a.get_low(), a.get_high());
#endif
}

// unsigned 16 -> 8 bit: reuses the signed overload above via conversions
static inline Vec32uc compress (Vec32us const a) {
    return Vec32uc(compress(Vec32s(a)));
}

static inline Vec16s compress (Vec16i const a) {
    return _mm512_cvtepi32_epi16(a);
}

static inline Vec16us compress (Vec16ui const a) {
    return _mm512_cvtepi32_epi16(a);
}

static inline Vec8i compress (Vec8q const a) {
    return _mm512_cvtepi64_epi32(a);
}

static inline Vec8ui compress (Vec8uq const a) {
    return _mm512_cvtepi64_epi32(a);
}

#else  // no AVX512

// compress functions. overflow wraps around
// Each overload forwards the two 256-bit halves to the two-argument
// compress() defined elsewhere in the library.
static inline Vec32c compress (Vec32s const a) {
    return compress(a.get_low(), a.get_high());
}

static inline Vec32uc compress (Vec32us const a) {
    return compress(a.get_low(), a.get_high());
}

static inline Vec16s compress (Vec16i const a) {
    return compress(a.get_low(), a.get_high());
}

static inline Vec16us compress (Vec16ui const a) {
    return compress(a.get_low(), a.get_high());
}

static inline Vec8i compress (Vec8q const a) {
    return compress(a.get_low(), a.get_high());
}

static inline Vec8ui compress (Vec8uq const a) {
    return compress(a.get_low(), a.get_high());
}

#endif  // AVX512

/*****************************************************************************
*
*          Conversions between float and double
*
*****************************************************************************/

#if INSTRSET >= 9  // AVX512.
512 bit float vectors 468 | 469 | // float to double 470 | static inline Vec8d to_double (Vec8f const a) { 471 | return _mm512_cvtps_pd(a); 472 | } 473 | 474 | // double to float 475 | static inline Vec8f to_float (Vec8d const a) { 476 | return _mm512_cvtpd_ps(a); 477 | } 478 | 479 | #else // no AVX512. 512 bit float vectors are emulated 480 | 481 | // float to double 482 | static inline Vec8d to_double (Vec8f const a) { 483 | Vec4d lo = to_double(a.get_low()); 484 | Vec4d hi = to_double(a.get_high()); 485 | return Vec8d(lo,hi); 486 | } 487 | 488 | // double to float 489 | static inline Vec8f to_float (Vec8d const a) { 490 | Vec4f lo = to_float(a.get_low()); 491 | Vec4f hi = to_float(a.get_high()); 492 | return Vec8f(lo, hi); 493 | } 494 | 495 | #endif 496 | 497 | #endif // MAX_VECTOR_SIZE >= 512 498 | 499 | // double to float 500 | static inline Vec4f to_float (Vec2d const a) { 501 | return _mm_cvtpd_ps(a); 502 | } 503 | 504 | 505 | /***************************************************************************** 506 | * 507 | * Generic template functions 508 | * 509 | * These templates define functions for multiple vector types in one template 510 | * 511 | *****************************************************************************/ 512 | 513 | // concatenate two vectors into one vector of double size 514 | template auto concatenate2(T const a, T const b) { 515 | static_assert(sizeof(T) * 8 < MAX_VECTOR_SIZE, "Maximum vector size exceeded"); 516 | return decltype(extend_z(a))(a, b); // call constructor for double size vector type 517 | } 518 | 519 | 520 | // horizontal min/max of vector elements 521 | // implemented with universal template, works for all vector types: 522 | 523 | template auto horizontal_min(T const x) { 524 | if constexpr (T::elementtype() >= 15) { 525 | // T is a float or double vector 526 | if (horizontal_or(is_nan(x))) { 527 | // check for NAN because min does not guarantee NAN propagation 528 | return x[horizontal_find_first(is_nan(x))]; 529 
| } 530 | } 531 | return horizontal_min1(x); 532 | } 533 | 534 | template auto horizontal_min1(T const x) { 535 | if constexpr (T::elementtype() <= 3) { // boolean vector type 536 | return horizontal_and(x); 537 | } 538 | else if constexpr (sizeof(T) >= 32) { 539 | // split recursively into smaller vectors 540 | return horizontal_min1(min(x.get_low(), x.get_high())); 541 | } 542 | else if constexpr (T::size() == 2) { 543 | T a = permute2 <1, V_DC>(x); // high half 544 | T b = min(a, x); 545 | return b[0]; 546 | } 547 | else if constexpr (T::size() == 4) { 548 | T a = permute4<2, 3, V_DC, V_DC>(x); // high half 549 | T b = min(a, x); 550 | a = permute4<1, V_DC, V_DC, V_DC>(b); 551 | b = min(a, b); 552 | return b[0]; 553 | } 554 | else if constexpr (T::size() == 8) { 555 | T a = permute8<4, 5, 6, 7, V_DC, V_DC, V_DC, V_DC>(x); // high half 556 | T b = min(a, x); 557 | a = permute8<2, 3, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC>(b); 558 | b = min(a, b); 559 | a = permute8<1, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC>(b); 560 | b = min(a, b); 561 | return b[0]; 562 | } 563 | else { 564 | static_assert(T::size() == 16); // no other size is allowed 565 | T a = permute16<8, 9, 10, 11, 12, 13, 14, 15, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC >(x); // high half 566 | T b = min(a, x); 567 | a = permute16<4, 5, 6, 7, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC>(b); 568 | b = min(a, b); 569 | a = permute16<2, 3, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC>(b); 570 | b = min(a, b); 571 | a = permute16<1, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC>(b); 572 | b = min(a, b); 573 | return b[0]; 574 | } 575 | } 576 | 577 | template auto horizontal_max(T const x) { 578 | if constexpr (T::elementtype() >= 15) { 579 | // T is a float or double vector 580 | if (horizontal_or(is_nan(x))) { 581 | // check for NAN because max does not guarantee NAN propagation 582 | return 
x[horizontal_find_first(is_nan(x))]; 583 | } 584 | } 585 | return horizontal_max1(x); 586 | } 587 | 588 | template auto horizontal_max1(T const x) { 589 | if constexpr (T::elementtype() <= 3) { // boolean vector type 590 | return horizontal_or(x); 591 | } 592 | else if constexpr (sizeof(T) >= 32) { 593 | // split recursively into smaller vectors 594 | return horizontal_max1(max(x.get_low(), x.get_high())); 595 | } 596 | else if constexpr (T::size() == 2) { 597 | T a = permute2 <1, V_DC>(x); // high half 598 | T b = max(a, x); 599 | return b[0]; 600 | } 601 | else if constexpr (T::size() == 4) { 602 | T a = permute4<2, 3, V_DC, V_DC>(x); // high half 603 | T b = max(a, x); 604 | a = permute4<1, V_DC, V_DC, V_DC>(b); 605 | b = max(a, b); 606 | return b[0]; 607 | } 608 | else if constexpr (T::size() == 8) { 609 | T a = permute8<4, 5, 6, 7, V_DC, V_DC, V_DC, V_DC>(x); // high half 610 | T b = max(a, x); 611 | a = permute8<2, 3, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC>(b); 612 | b = max(a, b); 613 | a = permute8<1, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC>(b); 614 | b = max(a, b); 615 | return b[0]; 616 | } 617 | else { 618 | static_assert(T::size() == 16); // no other size is allowed 619 | T a = permute16<8, 9, 10, 11, 12, 13, 14, 15, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC >(x); // high half 620 | T b = max(a, x); 621 | a = permute16<4, 5, 6, 7, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC>(b); 622 | b = max(a, b); 623 | a = permute16<2, 3, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC>(b); 624 | b = max(a, b); 625 | a = permute16<1, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC>(b); 626 | b = max(a, b); 627 | return b[0]; 628 | } 629 | } 630 | 631 | // Find first element that is true in a boolean vector 632 | template 633 | static inline int horizontal_find_first(V const x) { 634 | static_assert(V::elementtype() == 2 || V::elementtype() == 3, "Boolean vector 
expected"); 635 | auto bits = to_bits(x); // convert to bits 636 | if (bits == 0) return -1; 637 | if constexpr (V::size() < 32) { 638 | return bit_scan_forward((uint32_t)bits); 639 | } 640 | else { 641 | return bit_scan_forward(bits); 642 | } 643 | } 644 | 645 | // Count the number of elements that are true in a boolean vector 646 | template 647 | static inline int horizontal_count(V const x) { 648 | static_assert(V::elementtype() == 2 || V::elementtype() == 3, "Boolean vector expected"); 649 | auto bits = to_bits(x); // convert to bits 650 | if constexpr (V::size() < 32) { 651 | return vml_popcnt((uint32_t)bits); 652 | } 653 | else { 654 | return (int)vml_popcnt(bits); 655 | } 656 | } 657 | 658 | // maximum and minimum functions. This version is sure to propagate NANs, 659 | // conforming to the new IEEE-754 2019 standard 660 | template 661 | static inline V maximum(V const a, V const b) { 662 | if constexpr (V::elementtype() < 15) { 663 | return max(a, b); // integer type 664 | } 665 | else { // float or double vector 666 | V y = select(is_nan(a), a, max(a, b)); 667 | #ifdef SIGNED_ZERO // pedantic about signed zero 668 | y = select(a == b, a & b, y); // maximum(+0, -0) = +0 669 | #endif 670 | return y; 671 | } 672 | } 673 | 674 | template 675 | static inline V minimum(V const a, V const b) { 676 | if constexpr (V::elementtype() < 15) { 677 | return min(a, b); // integer type 678 | } 679 | else { // float or double vector 680 | V y = select(is_nan(a), a, min(a, b)); 681 | #ifdef SIGNED_ZERO // pedantic about signed zero 682 | y = select(a == b, a | b, y); // minimum(+0, -0) = -0 683 | #endif 684 | return y; 685 | } 686 | } 687 | 688 | // floating point remainder 689 | // -denominator/2 <= result < denominator/2 690 | template 691 | static inline V fremainder(V const numerator, double const denominator) { 692 | // (Optimization notice: Calculation of 1/denominator and constants for extended precision reduction 693 | // may be optimized by a compiler moving 
loop-invariant code. This is intended) 694 | static_assert(V::elementtype() == 16 || V::elementtype() == 17, "wrong vector type"); // supports only float and double 695 | if (denominator > 0.) { // denominator must be positive 696 | if constexpr (V::elementtype() == 16) { // float 697 | #ifdef __FMA__ 698 | float recipd = float(1.0 / denominator); // reciprocal denominator 699 | float fd = float(denominator); // denominator rounded to single precision 700 | float d2 = float(denominator - fd); // remaining bits for double precision 701 | V q = round(numerator * recipd); // divide and round 702 | V m = nmul_add(q, d2, nmul_add(q, fd, numerator));// double precision reduction 703 | #else // no FMA. Use extended precision reduction 704 | union { 705 | float f; 706 | uint32_t i; 707 | } u; 708 | u.f = float(denominator); 709 | u.i &= 0xFFFFF000; // remove 12 least significant bits for extended precision reduction 710 | float d2 = denominator - u.f; // remaining bits 711 | float recipd = float(1.0 / denominator); // reciprocal 712 | V q = round(numerator * recipd); // divide and round 713 | V m = nmul_add(q, d2, nmul_add(q, u.f, numerator));// extended precision reduction 714 | #endif // FMA 715 | if (true) { // Check that result is within desired interval. This may be omitted if not essential: 716 | // This check may be needed in extreme cases of numerator > 1.E5 * denominator 717 | auto too_high = m >= float( denominator * 0.5); 718 | auto too_low = m < float(-denominator * 0.5); 719 | m = if_sub(too_high, m, float(denominator)); 720 | m = if_add(too_low, m, float(denominator)); 721 | } 722 | return m; 723 | } 724 | else if constexpr (V::elementtype() == 17) { // double precision 725 | #ifdef __FMA__ 726 | double recipd = 1.0 / denominator; // reciprocal 727 | V q = round(numerator * recipd); // divide and round 728 | V m = nmul_add(q, denominator, numerator); // nmul_add has extended precision 729 | #else // no FMA. 
Use extended precision reduction 730 | union { 731 | double f; 732 | uint64_t i; 733 | } u; 734 | u.f = denominator; 735 | u.i &= 0xFFFFFFFFFF000000; // remove 24 least significant bits for extended precision reduction 736 | double d2 = denominator - u.f; // remaining bits 737 | double recipd = 1.0 / denominator; // reciprocal 738 | V q = round(numerator * recipd); // divide and round 739 | V m = nmul_add(q, d2, nmul_add(q, u.f, numerator));// extended precision reduction 740 | #endif // FMA 741 | if (true) { // Check that result is within desired interval. This may be omitted if not essential: 742 | // This check is rarely needed except in extreme cases of numerator > 1.E14 * denominator 743 | auto too_high = m >= denominator * 0.5; 744 | auto too_low = m < -denominator * 0.5; 745 | m = if_sub(too_high, m, denominator); 746 | m = if_add(too_low, m, denominator); 747 | } 748 | return m; 749 | } 750 | } 751 | else { 752 | return nan_vec(1); // denominator is not positive 753 | } 754 | } 755 | 756 | // floating point modulo 757 | // 0 <= result < denominator 758 | template 759 | static inline V fmodulo(V const numerator, double const denominator) { 760 | // (Optimization notice: Calculation of 1/denominator and constants for extended precision reduction 761 | // may be optimized by a compiler moving loop-invariant code. This is intended) 762 | static_assert(V::elementtype() == 16 || V::elementtype() == 17, "wrong vector type"); // supports only float and double 763 | if (denominator > 0.) 
{ // denominator must be positive 764 | if constexpr (V::elementtype() == 16) { // float 765 | #ifdef __FMA__ 766 | float recipd = float(1.0 / denominator); // reciprocal denominator 767 | float fd = float(denominator); // denominator rounded to single precision 768 | float d2 = float(denominator - fd); // remaining bits for double precision 769 | V q = floor(numerator * recipd); // divide and floor 770 | V m = nmul_add(q, d2, nmul_add(q, fd, numerator));// double precision reduction 771 | #else // no FMA. Use extended precision reduction 772 | union { 773 | float f; 774 | uint32_t i; 775 | } u; 776 | u.f = float(denominator); 777 | u.i &= 0xFFFFF000; // remove 12 least significant bits for extended precision reduction 778 | float d2 = denominator - u.f; // remaining bits 779 | float recipd = float(1.0 / denominator); // reciprocal 780 | V q = floor(numerator * recipd); // divide and floor 781 | V m = nmul_add(q, d2, nmul_add(q, u.f, numerator));// extended precision reduction 782 | #endif // FMA 783 | if (true) { // Check that result is within desired interval. This may be omitted if not essential: 784 | // This check may be needed in extreme cases of numerator > 1.E5 * denominator 785 | auto too_high = m >= float(denominator); 786 | auto too_low = m < 0.f; 787 | m = if_sub(too_high, m, float(denominator)); 788 | m = if_add(too_low, m, float(denominator)); 789 | } 790 | return m; 791 | } 792 | else if constexpr (V::elementtype() == 17) { // double precision 793 | #ifdef __FMA__ 794 | double recipd = 1.0 / denominator; // reciprocal 795 | V q = floor(numerator * recipd); // divide and floor 796 | V m = nmul_add(q, denominator, numerator); // nmul_add has extended precision 797 | #else // no FMA. 
Use extended precision reduction 798 | union { 799 | double f; 800 | uint64_t i; 801 | } u; 802 | u.f = denominator; 803 | u.i &= 0xFFFFFFFFFF000000; // remove 24 least significant bits for extended precision reduction 804 | double d2 = denominator - u.f; // remaining bits 805 | double recipd = 1.0 / denominator; // reciprocal 806 | V q = floor(numerator * recipd); // divide and floor 807 | V m = nmul_add(q, d2, nmul_add(q, u.f, numerator));// extended precision reduction 808 | #endif // FMA 809 | if (true) { // Check that result is within desired interval. This may be omitted if not essential: 810 | // This check is rarely needed except in extreme cases of numerator > 1.E14 * denominator 811 | auto too_high = m >= denominator; 812 | auto too_low = m < 0.; 813 | m = if_sub(too_high, m, denominator); 814 | m = if_add(too_low, m, denominator); 815 | } 816 | return m; 817 | } 818 | } 819 | else { 820 | return nan_vec(1); // denominator is not positive 821 | } 822 | } 823 | 824 | #ifdef VCL_NAMESPACE 825 | } 826 | #endif 827 | 828 | #endif // VECTOR_CONVERT_H 829 | -------------------------------------------------------------------------------- /vectorclass.h: -------------------------------------------------------------------------------- 1 | /**************************** vectorclass.h ******************************** 2 | * Author: Agner Fog 3 | * Date created: 2012-05-30 4 | * Last modified: 2022-07-20 5 | * Version: 2.02.00 6 | * Project: vector class library 7 | * Home: https://github.com/vectorclass 8 | * Description: 9 | * Header file defining vector classes as interface to intrinsic functions 10 | * in x86 and x86-64 microprocessors with SSE2 and later instruction sets. 11 | * 12 | * Instructions: 13 | * Use Gnu, Clang, Microsoft, or Intel C++ compiler. Compile for the desired 14 | * instruction set, which must be at least SSE2. Specify the supported 15 | * instruction set by a command line define, e.g. 
__SSE4_1__ if the 16 | * compiler does not automatically do so. 17 | * For detailed instructions, see vcl_manual.pdf 18 | * 19 | * Each vector object is represented internally in the CPU as a vector 20 | * register with 128, 256 or 512 bits. 21 | * 22 | * This header file includes the appropriate header files depending on the 23 | * selected instruction set. 24 | * 25 | * (c) Copyright 2012-2022 Agner Fog. 26 | * Apache License version 2.0 or later. 27 | ******************************************************************************/ 28 | 29 | #ifndef VECTORCLASS_H 30 | #define VECTORCLASS_H 20200 31 | 32 | // Maximum vector size, bits. Allowed values are 128, 256, 512 33 | #ifndef MAX_VECTOR_SIZE 34 | #define MAX_VECTOR_SIZE 512 35 | #endif 36 | 37 | // Determine instruction set, and define platform-dependent functions 38 | #include "instrset.h" // Select supported instruction set 39 | 40 | #if INSTRSET < 2 // instruction set SSE2 is the minimum 41 | #error Please compile for the SSE2 instruction set or higher 42 | #else 43 | 44 | // Select appropriate .h files depending on instruction set 45 | #include "vectori128.h" // 128-bit integer vectors 46 | #include "vectorf128.h" // 128-bit floating point vectors 47 | 48 | #if MAX_VECTOR_SIZE >= 256 49 | #if INSTRSET >= 8 50 | #include "vectori256.h" // 256-bit integer vectors, requires AVX2 instruction set 51 | #else 52 | #include "vectori256e.h" // 256-bit integer vectors, emulated 53 | #endif // INSTRSET >= 8 54 | #if INSTRSET >= 7 55 | #include "vectorf256.h" // 256-bit floating point vectors, requires AVX instruction set 56 | #else 57 | #include "vectorf256e.h" // 256-bit floating point vectors, emulated 58 | #endif // INSTRSET >= 7 59 | #endif // MAX_VECTOR_SIZE >= 256 60 | 61 | #if MAX_VECTOR_SIZE >= 512 62 | #if INSTRSET >= 9 63 | #include "vectori512.h" // 512-bit vectors of 32 and 64 bit integers, requires AVX512F instruction set 64 | #include "vectorf512.h" // 512-bit floating point vectors, requires AVX512F 
instruction set 65 | #else 66 | #include "vectori512e.h" // 512-bit integer vectors, emulated 67 | #include "vectorf512e.h" // 512-bit floating point vectors, emulated 68 | #endif // INSTRSET >= 9 69 | #if INSTRSET >= 10 70 | #include "vectori512s.h" // 512-bit vectors of 8 and 16 bit integers, requires AVX512BW instruction set 71 | #else 72 | #include "vectori512se.h" // 512-bit vectors of 8 and 16 bit integers, emulated 73 | #endif 74 | #endif // MAX_VECTOR_SIZE >= 512 75 | 76 | #include "vector_convert.h" // conversion between different vector sizes, and common templates 77 | 78 | #endif // INSTRSET >= 2 79 | 80 | 81 | #else // VECTORCLASS_H 82 | 83 | #if VECTORCLASS_H < 20000 84 | #error Mixed versions of vector class library 85 | #endif 86 | 87 | #endif // VECTORCLASS_H 88 | -------------------------------------------------------------------------------- /vectormath_common.h: -------------------------------------------------------------------------------- 1 | /*************************** vectormath_common.h **************************** 2 | * Author: Agner Fog 3 | * Date created: 2014-04-18 4 | * Last modified: 2022-07-20 5 | * Version: 2.02.00 6 | * Project: vector classes 7 | * Description: 8 | * Header file containing common code for inline version of mathematical functions. 9 | * 10 | * For detailed instructions, see vcl_manual.pdf 11 | * 12 | * (c) Copyright 2014-2022 Agner Fog. 13 | * Apache License version 2.0 or later. 14 | ******************************************************************************/ 15 | 16 | #ifndef VECTORMATH_COMMON_H 17 | #define VECTORMATH_COMMON_H 2 18 | 19 | #ifdef VECTORMATH_LIB_H 20 | #error conflicting header files.
More than one implementation of mathematical functions included 21 | #endif 22 | 23 | #include 24 | 25 | #ifndef VECTORCLASS_H 26 | #include "vectorclass.h" 27 | #endif 28 | 29 | #if VECTORCLASS_H < 20200 30 | #error Incompatible versions of vector class library mixed 31 | #endif 32 | 33 | 34 | /****************************************************************************** 35 | Define NAN payload values 36 | ******************************************************************************/ 37 | #define NAN_LOG 0x101 // logarithm for x<0 38 | #define NAN_POW 0x102 // negative number raised to non-integer power 39 | #define NAN_HYP 0x104 // acosh for x<1 and atanh for abs(x)>1 40 | 41 | 42 | /****************************************************************************** 43 | Define mathematical constants 44 | ******************************************************************************/ 45 | #define VM_PI 3.14159265358979323846 // pi 46 | #define VM_PI_2 1.57079632679489661923 // pi / 2 47 | #define VM_PI_4 0.785398163397448309616 // pi / 4 48 | #define VM_SQRT2 1.41421356237309504880 // sqrt(2) 49 | #define VM_LOG2E 1.44269504088896340736 // 1/log(2) 50 | #define VM_LOG10E 0.434294481903251827651 // 1/log(10) 51 | #define VM_LOG210 3.321928094887362347808 // log2(10) 52 | #define VM_LN2 0.693147180559945309417 // log(2) 53 | #define VM_LN10 2.30258509299404568402 // log(10) 54 | #define VM_SMALLEST_NORMAL 2.2250738585072014E-308 // smallest normal number, double 55 | #define VM_SMALLEST_NORMALF 1.17549435E-38f // smallest normal number, float 56 | 57 | 58 | #ifdef VCL_NAMESPACE 59 | namespace VCL_NAMESPACE { 60 | #endif 61 | 62 | /****************************************************************************** 63 | templates for producing infinite and nan in desired vector type 64 | ******************************************************************************/ 65 | template 66 | static inline VTYPE infinite_vec(); 67 | 68 | template <> 69 | inline Vec2d infinite_vec() 
{ 70 | return infinite2d(); 71 | } 72 | 73 | template <> 74 | inline Vec4f infinite_vec() { 75 | return infinite4f(); 76 | } 77 | 78 | #if MAX_VECTOR_SIZE >= 256 79 | 80 | template <> 81 | inline Vec4d infinite_vec() { 82 | return infinite4d(); 83 | } 84 | 85 | template <> 86 | inline Vec8f infinite_vec() { 87 | return infinite8f(); 88 | } 89 | 90 | #endif // MAX_VECTOR_SIZE >= 256 91 | 92 | #if MAX_VECTOR_SIZE >= 512 93 | 94 | template <> 95 | inline Vec8d infinite_vec() { 96 | return infinite8d(); 97 | } 98 | 99 | template <> 100 | inline Vec16f infinite_vec() { 101 | return infinite16f(); 102 | } 103 | 104 | #endif // MAX_VECTOR_SIZE >= 512 105 | 106 | 107 | 108 | /****************************************************************************** 109 | * Detect NAN codes 110 | * 111 | * These functions return the code hidden in a NAN. The sign bit is ignored 112 | ******************************************************************************/ 113 | 114 | static inline Vec4ui nan_code(Vec4f const x) { 115 | Vec4ui a = Vec4ui(reinterpret_i(x)); 116 | Vec4ui const n = 0x007FFFFF; 117 | return select(Vec4ib(is_nan(x)), a & n, 0); 118 | } 119 | 120 | // This function returns the code hidden in a NAN. The sign bit is ignored 121 | static inline Vec2uq nan_code(Vec2d const x) { 122 | Vec2uq a = Vec2uq(reinterpret_i(x)); 123 | return select(Vec2qb(is_nan(x)), a << 12 >> (12+29), 0); 124 | } 125 | 126 | #if MAX_VECTOR_SIZE >= 256 127 | 128 | // This function returns the code hidden in a NAN. The sign bit is ignored 129 | static inline Vec8ui nan_code(Vec8f const x) { 130 | Vec8ui a = Vec8ui(reinterpret_i(x)); 131 | Vec8ui const n = 0x007FFFFF; 132 | return select(Vec8ib(is_nan(x)), a & n, 0); 133 | } 134 | 135 | // This function returns the code hidden in a NAN. 
The sign bit is ignored 136 | static inline Vec4uq nan_code(Vec4d const x) { 137 | Vec4uq a = Vec4uq(reinterpret_i(x)); 138 | return select(Vec4qb(is_nan(x)), a << 12 >> (12+29), 0); 139 | } 140 | 141 | #endif // MAX_VECTOR_SIZE >= 256 142 | #if MAX_VECTOR_SIZE >= 512 143 | 144 | // This function returns the code hidden in a NAN. The sign bit is ignored 145 | static inline Vec16ui nan_code(Vec16f const x) { 146 | Vec16ui a = Vec16ui(reinterpret_i(x)); 147 | Vec16ui const n = 0x007FFFFF; 148 | return select(Vec16ib(is_nan(x)), a & n, 0); 149 | } 150 | 151 | // This function returns the code hidden in a NAN. The sign bit is ignored 152 | static inline Vec8uq nan_code(Vec8d const x) { 153 | Vec8uq a = Vec8uq(reinterpret_i(x)); 154 | return select(Vec8qb(is_nan(x)), a << 12 >> (12+29), 0); 155 | } 156 | 157 | #endif // MAX_VECTOR_SIZE >= 512 158 | 159 | 160 | /****************************************************************************** 161 | templates for polynomials 162 | Using Estrin's scheme to make shorter dependency chains and use FMA, starting 163 | longest dependency chains first. 
164 | ******************************************************************************/ 165 | 166 | // template 167 | template 168 | static inline VTYPE polynomial_2(VTYPE const x, CTYPE c0, CTYPE c1, CTYPE c2) { 169 | // calculates polynomial c2*x^2 + c1*x + c0 170 | // VTYPE may be a vector type, CTYPE is a scalar type 171 | VTYPE x2 = x * x; 172 | //return x2 * c2 + (x * c1 + c0); 173 | return mul_add(x2, c2, mul_add(x, c1, c0)); 174 | } 175 | 176 | template 177 | static inline VTYPE polynomial_3(VTYPE const x, CTYPE c0, CTYPE c1, CTYPE c2, CTYPE c3) { 178 | // calculates polynomial c3*x^3 + c2*x^2 + c1*x + c0 179 | // VTYPE may be a vector type, CTYPE is a scalar type 180 | VTYPE x2 = x * x; 181 | //return (c2 + c3*x)*x2 + (c1*x + c0); 182 | return mul_add(mul_add(c3, x, c2), x2, mul_add(c1, x, c0)); 183 | } 184 | 185 | template 186 | static inline VTYPE polynomial_4(VTYPE const x, CTYPE c0, CTYPE c1, CTYPE c2, CTYPE c3, CTYPE c4) { 187 | // calculates polynomial c4*x^4 + c3*x^3 + c2*x^2 + c1*x + c0 188 | // VTYPE may be a vector type, CTYPE is a scalar type 189 | VTYPE x2 = x * x; 190 | VTYPE x4 = x2 * x2; 191 | //return (c2+c3*x)*x2 + ((c0+c1*x) + c4*x4); 192 | return mul_add(mul_add(c3, x, c2), x2, mul_add(c1, x, c0) + c4*x4); 193 | } 194 | 195 | template 196 | static inline VTYPE polynomial_4n(VTYPE const x, CTYPE c0, CTYPE c1, CTYPE c2, CTYPE c3) { 197 | // calculates polynomial 1*x^4 + c3*x^3 + c2*x^2 + c1*x + c0 198 | // VTYPE may be a vector type, CTYPE is a scalar type 199 | VTYPE x2 = x * x; 200 | VTYPE x4 = x2 * x2; 201 | //return (c2+c3*x)*x2 + ((c0+c1*x) + x4); 202 | return mul_add(mul_add(c3, x, c2), x2, mul_add(c1, x, c0) + x4); 203 | } 204 | 205 | template 206 | static inline VTYPE polynomial_5(VTYPE const x, CTYPE c0, CTYPE c1, CTYPE c2, CTYPE c3, CTYPE c4, CTYPE c5) { 207 | // calculates polynomial c5*x^5 + c4*x^4 + c3*x^3 + c2*x^2 + c1*x + c0 208 | // VTYPE may be a vector type, CTYPE is a scalar type 209 | VTYPE x2 = x * x; 210 | VTYPE x4 =
x2 * x2; 211 | //return (c2+c3*x)*x2 + ((c4+c5*x)*x4 + (c0+c1*x)); 212 | return mul_add(mul_add(c3, x, c2), x2, mul_add(mul_add(c5, x, c4), x4, mul_add(c1, x, c0))); 213 | } 214 | 215 | template 216 | static inline VTYPE polynomial_5n(VTYPE const x, CTYPE c0, CTYPE c1, CTYPE c2, CTYPE c3, CTYPE c4) { 217 | // calculates polynomial 1*x^5 + c4*x^4 + c3*x^3 + c2*x^2 + c1*x + c0 218 | // VTYPE may be a vector type, CTYPE is a scalar type 219 | VTYPE x2 = x * x; 220 | VTYPE x4 = x2 * x2; 221 | //return (c2+c3*x)*x2 + ((c4+x)*x4 + (c0+c1*x)); 222 | return mul_add(mul_add(c3, x, c2), x2, mul_add(c4 + x, x4, mul_add(c1, x, c0))); 223 | } 224 | 225 | template 226 | static inline VTYPE polynomial_6(VTYPE const x, CTYPE c0, CTYPE c1, CTYPE c2, CTYPE c3, CTYPE c4, CTYPE c5, CTYPE c6) { 227 | // calculates polynomial c6*x^6 + c5*x^5 + c4*x^4 + c3*x^3 + c2*x^2 + c1*x + c0 228 | // VTYPE may be a vector type, CTYPE is a scalar type 229 | VTYPE x2 = x * x; 230 | VTYPE x4 = x2 * x2; 231 | //return (c4+c5*x+c6*x2)*x4 + ((c2+c3*x)*x2 + (c0+c1*x)); 232 | return mul_add(mul_add(c6, x2, mul_add(c5, x, c4)), x4, mul_add(mul_add(c3, x, c2), x2, mul_add(c1, x, c0))); 233 | } 234 | 235 | template 236 | static inline VTYPE polynomial_6n(VTYPE const x, CTYPE c0, CTYPE c1, CTYPE c2, CTYPE c3, CTYPE c4, CTYPE c5) { 237 | // calculates polynomial 1*x^6 + c5*x^5 + c4*x^4 + c3*x^3 + c2*x^2 + c1*x + c0 238 | // VTYPE may be a vector type, CTYPE is a scalar type 239 | VTYPE x2 = x * x; 240 | VTYPE x4 = x2 * x2; 241 | //return (c4+c5*x+x2)*x4 + ((c2+c3*x)*x2 + (c0+c1*x)); 242 | return mul_add(mul_add(c5, x, c4 + x2), x4, mul_add(mul_add(c3, x, c2), x2, mul_add(c1, x, c0))); 243 | } 244 | 245 | template 246 | static inline VTYPE polynomial_7(VTYPE const x, CTYPE c0, CTYPE c1, CTYPE c2, CTYPE c3, CTYPE c4, CTYPE c5, CTYPE c6, CTYPE c7) { 247 | // calculates polynomial c7*x^7 + c6*x^6 + c5*x^5 + c4*x^4 + c3*x^3 + c2*x^2 + c1*x + c0 248 | // VTYPE may be a vector type, CTYPE is a scalar type 249 | VTYPE 
x2 = x * x; 250 | VTYPE x4 = x2 * x2; 251 | //return ((c6+c7*x)*x2 + (c4+c5*x))*x4 + ((c2+c3*x)*x2 + (c0+c1*x)); 252 | return mul_add(mul_add(mul_add(c7, x, c6), x2, mul_add(c5, x, c4)), x4, mul_add(mul_add(c3, x, c2), x2, mul_add(c1, x, c0))); 253 | } 254 | 255 | template 256 | static inline VTYPE polynomial_8(VTYPE const x, CTYPE c0, CTYPE c1, CTYPE c2, CTYPE c3, CTYPE c4, CTYPE c5, CTYPE c6, CTYPE c7, CTYPE c8) { 257 | // calculates polynomial c8*x^8 + c7*x^7 + c6*x^6 + c5*x^5 + c4*x^4 + c3*x^3 + c2*x^2 + c1*x + c0 258 | // VTYPE may be a vector type, CTYPE is a scalar type 259 | VTYPE x2 = x * x; 260 | VTYPE x4 = x2 * x2; 261 | VTYPE x8 = x4 * x4; 262 | //return ((c6+c7*x)*x2 + (c4+c5*x))*x4 + (c8*x8 + (c2+c3*x)*x2 + (c0+c1*x)); 263 | return mul_add(mul_add(mul_add(c7, x, c6), x2, mul_add(c5, x, c4)), x4, 264 | mul_add(mul_add(c3, x, c2), x2, mul_add(c1, x, c0) + c8*x8)); 265 | } 266 | 267 | template 268 | static inline VTYPE polynomial_9(VTYPE const x, CTYPE c0, CTYPE c1, CTYPE c2, CTYPE c3, CTYPE c4, CTYPE c5, CTYPE c6, CTYPE c7, CTYPE c8, CTYPE c9) { 269 | // calculates polynomial c9*x^9 + c8*x^8 + c7*x^7 + c6*x^6 + c5*x^5 + c4*x^4 + c3*x^3 + c2*x^2 + c1*x + c0 270 | // VTYPE may be a vector type, CTYPE is a scalar type 271 | VTYPE x2 = x * x; 272 | VTYPE x4 = x2 * x2; 273 | VTYPE x8 = x4 * x4; 274 | //return (((c6+c7*x)*x2 + (c4+c5*x))*x4 + (c8+c9*x)*x8) + ((c2+c3*x)*x2 + (c0+c1*x)); 275 | return mul_add(mul_add(c9, x, c8), x8, mul_add( 276 | mul_add(mul_add(c7, x, c6), x2, mul_add(c5, x, c4)), x4, 277 | mul_add(mul_add(c3, x, c2), x2, mul_add(c1, x, c0)))); 278 | } 279 | 280 | template 281 | static inline VTYPE polynomial_10(VTYPE const x, CTYPE c0, CTYPE c1, CTYPE c2, CTYPE c3, CTYPE c4, CTYPE c5, CTYPE c6, CTYPE c7, CTYPE c8, CTYPE c9, CTYPE c10) { 282 | // calculates polynomial c10*x^10 + c9*x^9 + c8*x^8 + c7*x^7 + c6*x^6 + c5*x^5 + c4*x^4 + c3*x^3 + c2*x^2 + c1*x + c0 283 | // VTYPE may be a vector type, CTYPE is a scalar type 284 | VTYPE x2 = x * x; 
285 | VTYPE x4 = x2 * x2; 286 | VTYPE x8 = x4 * x4; 287 | //return (((c6+c7*x)*x2 + (c4+c5*x))*x4 + (c8+c9*x+c10*x2)*x8) + ((c2+c3*x)*x2 + (c0+c1*x)); 288 | return mul_add(mul_add(x2, c10, mul_add(c9, x, c8)), x8, 289 | mul_add(mul_add(mul_add(c7, x, c6), x2, mul_add(c5, x, c4)), x4, 290 | mul_add(mul_add(c3, x, c2), x2, mul_add(c1, x, c0)))); 291 | } 292 | 293 | template 294 | static inline VTYPE polynomial_13(VTYPE const x, CTYPE c0, CTYPE c1, CTYPE c2, CTYPE c3, CTYPE c4, CTYPE c5, CTYPE c6, CTYPE c7, CTYPE c8, CTYPE c9, CTYPE c10, CTYPE c11, CTYPE c12, CTYPE c13) { 295 | // calculates polynomial c13*x^13 + c12*x^12 + ... + c1*x + c0 296 | // VTYPE may be a vector type, CTYPE is a scalar type 297 | VTYPE x2 = x * x; 298 | VTYPE x4 = x2 * x2; 299 | VTYPE x8 = x4 * x4; 300 | return mul_add( 301 | mul_add( 302 | mul_add(c13, x, c12), x4, 303 | mul_add(mul_add(c11, x, c10), x2, mul_add(c9, x, c8))), x8, 304 | mul_add( 305 | mul_add(mul_add(c7, x, c6), x2, mul_add(c5, x, c4)), x4, 306 | mul_add(mul_add(c3, x, c2), x2, mul_add(c1, x, c0)))); 307 | } 308 | 309 | 310 | template 311 | static inline VTYPE polynomial_13m(VTYPE const x, CTYPE c2, CTYPE c3, CTYPE c4, CTYPE c5, CTYPE c6, CTYPE c7, CTYPE c8, CTYPE c9, CTYPE c10, CTYPE c11, CTYPE c12, CTYPE c13) { 312 | // calculates polynomial c13*x^13 + c12*x^12 + ... 
+ x + 0 313 | // VTYPE may be a vector type, CTYPE is a scalar type 314 | VTYPE x2 = x * x; 315 | VTYPE x4 = x2 * x2; 316 | VTYPE x8 = x4 * x4; 317 | // return ((c8+c9*x) + (c10+c11*x)*x2 + (c12+c13*x)*x4)*x8 + (((c6+c7*x)*x2 + (c4+c5*x))*x4 + ((c2+c3*x)*x2 + x)); 318 | return mul_add( 319 | mul_add(mul_add(c13, x, c12), x4, mul_add(mul_add(c11, x, c10), x2, mul_add(c9, x, c8))), x8, 320 | mul_add(mul_add(mul_add(c7, x, c6), x2, mul_add(c5, x, c4)), x4, mul_add(mul_add(c3, x, c2), x2, x))); 321 | } 322 | 323 | #ifdef VCL_NAMESPACE 324 | } 325 | #endif 326 | 327 | #endif 328 | -------------------------------------------------------------------------------- /vectormath_hyp.h: -------------------------------------------------------------------------------- 1 | /**************************** vectormath_hyp.h ****************************** 2 | * Author: Agner Fog 3 | * Date created: 2014-07-09 4 | * Last modified: 2022-07-20 5 | * Version: 2.02.00 6 | * Project: vector class library 7 | * Description: 8 | * Header file containing inline vector functions of hyperbolic and inverse 9 | * hyperbolic functions: 10 | * sinh hyperbolic sine 11 | * cosh hyperbolic cosine 12 | * tanh hyperbolic tangent 13 | * asinh inverse hyperbolic sine 14 | * acosh inverse hyperbolic cosine 15 | * atanh inverse hyperbolic tangent 16 | * 17 | * Theory, methods and inspiration based partially on these sources: 18 | * > Moshier, Stephen Lloyd Baluk: Methods and programs for mathematical functions. 19 | * Ellis Horwood, 1989. 20 | * > VDT library developed on CERN by Danilo Piparo, Thomas Hauth and 21 | * Vincenzo Innocente, 2012, https://svnweb.cern.ch/trac/vdt 22 | * > Cephes math library by Stephen L. Moshier 1992, 23 | * http://www.netlib.org/cephes/ 24 | * 25 | * For detailed instructions, see vectormath_common.h and vcl_manual.pdf 26 | * 27 | * (c) Copyright 2014-2022 Agner Fog. 28 | * Apache License version 2.0 or later. 
29 | ******************************************************************************/ 30 | 31 | #ifndef VECTORMATH_HYP_H 32 | #define VECTORMATH_HYP_H 202 33 | 34 | #include "vectormath_exp.h" 35 | 36 | #ifdef VCL_NAMESPACE 37 | namespace VCL_NAMESPACE { 38 | #endif 39 | 40 | /****************************************************************************** 41 | * Hyperbolic functions 42 | ******************************************************************************/ 43 | 44 | // Template for sinh function, double precision 45 | // This function does not produce denormals 46 | // Template parameters: 47 | // VTYPE: double vector type 48 | template 49 | static inline VTYPE sinh_d(VTYPE const x0) { 50 | // The limit of abs(x) is 709.7, as defined by max_x in vectormath_exp.h for 0.5*exp(x). 51 | 52 | // Coefficients 53 | const double p0 = -3.51754964808151394800E5; 54 | const double p1 = -1.15614435765005216044E4; 55 | const double p2 = -1.63725857525983828727E2; 56 | const double p3 = -7.89474443963537015605E-1; 57 | 58 | const double q0 = -2.11052978884890840399E6; 59 | const double q1 = 3.61578279834431989373E4; 60 | const double q2 = -2.77711081420602794433E2; 61 | const double q3 = 1.0; 62 | 63 | // data vectors 64 | VTYPE x, x2, y1, y2; 65 | 66 | x = abs(x0); 67 | auto x_small = x <= 1.0; // use Pade approximation if abs(x) <= 1 68 | 69 | if (horizontal_or(x_small)) { 70 | // At least one element needs small method 71 | x2 = x*x; 72 | y1 = polynomial_3(x2, p0, p1, p2, p3) / polynomial_3(x2, q0, q1, q2, q3); 73 | y1 = mul_add(y1, x*x2, x); // y1 = x + x2*(x*y1); 74 | } 75 | if (!horizontal_and(x_small)) { 76 | // At least one element needs big method 77 | y2 = exp_d(x); // 0.5 * exp(x) 78 | y2 -= 0.25 / y2; // - 0.5 * exp(-x) 79 | } 80 | y1 = select(x_small, y1, y2); // choose method 81 | y1 = sign_combine(y1, x0); // get original sign 82 | // you can avoid the sign_combine by replacing x by x0 above, but at a loss of precision 83 | 84 | return y1; 85 | } 86 | 87 | 
// instances of sinh_d template 88 | static inline Vec2d sinh(Vec2d const x) { 89 | return sinh_d(x); 90 | } 91 | 92 | #if MAX_VECTOR_SIZE >= 256 93 | static inline Vec4d sinh(Vec4d const x) { 94 | return sinh_d(x); 95 | } 96 | #endif // MAX_VECTOR_SIZE >= 256 97 | 98 | #if MAX_VECTOR_SIZE >= 512 99 | static inline Vec8d sinh(Vec8d const x) { 100 | return sinh_d(x); 101 | } 102 | #endif // MAX_VECTOR_SIZE >= 512 103 | 104 | 105 | // Template for sinh function, single precision 106 | // This function does not produce denormals 107 | // Template parameters: 108 | // VTYPE: double vector type 109 | template 110 | static inline VTYPE sinh_f(VTYPE const x0) { 111 | // The limit of abs(x) is 89.0, as defined by max_x in vectormath_exp.h for 0.5*exp(x). 112 | 113 | // Coefficients 114 | const float r0 = 1.66667160211E-1f; 115 | const float r1 = 8.33028376239E-3f; 116 | const float r2 = 2.03721912945E-4f; 117 | 118 | // data vectors 119 | VTYPE x, x2, y1, y2; 120 | 121 | x = abs(x0); 122 | auto x_small = x <= 1.0f; // use polynomial approximation if abs(x) <= 1 123 | 124 | if (horizontal_or(x_small)) { 125 | // At least one element needs small method 126 | x2 = x*x; 127 | y1 = polynomial_2(x2, r0, r1, r2); 128 | y1 = mul_add(y1, x2*x, x); // y1 = x + x2*(x*y1); 129 | } 130 | if (!horizontal_and(x_small)) { 131 | // At least one element needs big method 132 | y2 = exp_f(x); // 0.5 * exp(x) 133 | y2 -= 0.25f / y2; // - 0.5 * exp(-x) 134 | } 135 | y1 = select(x_small, y1, y2); // choose method 136 | y1 = sign_combine(y1, x0); // get original sign 137 | // you can avoid the sign_combine by replacing x by x0 above, but at a loss of precision 138 | 139 | return y1; 140 | } 141 | 142 | // instances of sinh_f template 143 | static inline Vec4f sinh(Vec4f const x) { 144 | return sinh_f(x); 145 | } 146 | 147 | #if MAX_VECTOR_SIZE >= 256 148 | static inline Vec8f sinh(Vec8f const x) { 149 | return sinh_f(x); 150 | } 151 | #endif // MAX_VECTOR_SIZE >= 256 152 | 153 | #if 
MAX_VECTOR_SIZE >= 512 154 | static inline Vec16f sinh(Vec16f const x) { 155 | return sinh_f(x); 156 | } 157 | #endif // MAX_VECTOR_SIZE >= 512 158 | 159 | 160 | // Template for cosh function, double precision 161 | // This function does not produce denormals 162 | // Template parameters: 163 | // VTYPE: double vector type 164 | template 165 | static inline VTYPE cosh_d(VTYPE const x0) { 166 | // The limit of abs(x) is 709.7, as defined by max_x in vectormath_exp.h for 0.5*exp(x). 167 | 168 | // data vectors 169 | VTYPE x, y; 170 | x = abs(x0); 171 | y = exp_d(x); // 0.5 * exp(x) 172 | y += 0.25 / y; // + 0.5 * exp(-x) 173 | return y; 174 | } 175 | 176 | // instances of sinh_d template 177 | static inline Vec2d cosh(Vec2d const x) { 178 | return cosh_d(x); 179 | } 180 | 181 | #if MAX_VECTOR_SIZE >= 256 182 | static inline Vec4d cosh(Vec4d const x) { 183 | return cosh_d(x); 184 | } 185 | #endif // MAX_VECTOR_SIZE >= 256 186 | 187 | #if MAX_VECTOR_SIZE >= 512 188 | static inline Vec8d cosh(Vec8d const x) { 189 | return cosh_d(x); 190 | } 191 | #endif // MAX_VECTOR_SIZE >= 512 192 | 193 | 194 | // Template for cosh function, single precision 195 | // This function does not produce denormals 196 | // Template parameters: 197 | // VTYPE: double vector type 198 | template 199 | static inline VTYPE cosh_f(VTYPE const x0) { 200 | // The limit of abs(x) is 89.0, as defined by max_x in vectormath_exp.h for 0.5*exp(x). 
201 | 202 | // data vectors 203 | VTYPE x, y; 204 | x = abs(x0); 205 | y = exp_f(x); // 0.5 * exp(x) 206 | y += 0.25f / y; // + 0.5 * exp(-x) 207 | return y; 208 | } 209 | 210 | // instances of sinh_d template 211 | static inline Vec4f cosh(Vec4f const x) { 212 | return cosh_f(x); 213 | } 214 | 215 | #if MAX_VECTOR_SIZE >= 256 216 | static inline Vec8f cosh(Vec8f const x) { 217 | return cosh_f(x); 218 | } 219 | #endif // MAX_VECTOR_SIZE >= 256 220 | 221 | #if MAX_VECTOR_SIZE >= 512 222 | static inline Vec16f cosh(Vec16f const x) { 223 | return cosh_f(x); 224 | } 225 | #endif // MAX_VECTOR_SIZE >= 512 226 | 227 | 228 | // Template for tanh function, double precision 229 | // This function does not produce denormals 230 | // Template parameters: 231 | // VTYPE: double vector type 232 | template 233 | static inline VTYPE tanh_d(VTYPE const x0) { 234 | 235 | // Coefficients 236 | const double p0 = -1.61468768441708447952E3; 237 | const double p1 = -9.92877231001918586564E1; 238 | const double p2 = -9.64399179425052238628E-1; 239 | 240 | const double q0 = 4.84406305325125486048E3; 241 | const double q1 = 2.23548839060100448583E3; 242 | const double q2 = 1.12811678491632931402E2; 243 | const double q3 = 1.0; 244 | 245 | // data vectors 246 | VTYPE x, x2, y1, y2; 247 | 248 | x = abs(x0); 249 | auto x_small = x <= 0.625; // use Pade approximation if abs(x) <= 5/8 250 | 251 | if (horizontal_or(x_small)) { 252 | // At least one element needs small method 253 | x2 = x*x; 254 | y1 = polynomial_2(x2, p0, p1, p2) / polynomial_3(x2, q0, q1, q2, q3); 255 | y1 = mul_add(y1, x2*x, x); // y1 = x + x2*(x*y1); 256 | } 257 | if (!horizontal_and(x_small)) { 258 | // At least one element needs big method 259 | y2 = exp(x+x); // exp(2*x) 260 | y2 = 1.0 - 2.0 / (y2 + 1.0); // tanh(x) 261 | } 262 | auto x_big = x > 350.; 263 | y1 = select(x_small, y1, y2); // choose method 264 | y1 = select(x_big, 1.0, y1); // avoid overflow 265 | y1 = sign_combine(y1, x0); // get original sign 266 | return 
y1; 267 | } 268 | 269 | // instances of tanh_d template 270 | static inline Vec2d tanh(Vec2d const x) { 271 | return tanh_d(x); 272 | } 273 | 274 | #if MAX_VECTOR_SIZE >= 256 275 | static inline Vec4d tanh(Vec4d const x) { 276 | return tanh_d(x); 277 | } 278 | #endif // MAX_VECTOR_SIZE >= 256 279 | 280 | #if MAX_VECTOR_SIZE >= 512 281 | static inline Vec8d tanh(Vec8d const x) { 282 | return tanh_d(x); 283 | } 284 | #endif // MAX_VECTOR_SIZE >= 512 285 | 286 | 287 | // Template for tanh function, single precision 288 | // This function does not produce denormals 289 | // Template parameters: 290 | // VTYPE: double vector type 291 | template 292 | static inline VTYPE tanh_f(VTYPE const x0) { 293 | // The limit of abs(x) is 89.0, as defined by max_x in vectormath_exp.h for 0.5*exp(x). 294 | 295 | // Coefficients 296 | const float r0 = -3.33332819422E-1f; 297 | const float r1 = 1.33314422036E-1f; 298 | const float r2 = -5.37397155531E-2f; 299 | const float r3 = 2.06390887954E-2f; 300 | const float r4 = -5.70498872745E-3f; 301 | 302 | // data vectors 303 | VTYPE x, x2, y1, y2; 304 | 305 | x = abs(x0); 306 | auto x_small = x <= 0.625f; // use polynomial approximation if abs(x) <= 5/8 307 | 308 | if (horizontal_or(x_small)) { 309 | // At least one element needs small method 310 | x2 = x*x; 311 | y1 = polynomial_4(x2, r0, r1, r2, r3, r4); 312 | y1 = mul_add(y1, x2*x, x); // y1 = x + (x2*x)*y1; 313 | } 314 | if (!horizontal_and(x_small)) { 315 | // At least one element needs big method 316 | y2 = exp(x+x); // exp(2*x) 317 | y2 = 1.0f - 2.0f / (y2 + 1.0f); // tanh(x) 318 | } 319 | auto x_big = x > 44.4f; 320 | y1 = select(x_small, y1, y2); // choose method 321 | y1 = select(x_big, 1.0f, y1); // avoid overflow 322 | y1 = sign_combine(y1, x0); // get original sign 323 | return y1; 324 | } 325 | 326 | // instances of tanh_f template 327 | static inline Vec4f tanh(Vec4f const x) { 328 | return tanh_f(x); 329 | } 330 | 331 | #if MAX_VECTOR_SIZE >= 256 332 | static inline Vec8f 
tanh(Vec8f const x) { 333 | return tanh_f(x); 334 | } 335 | #endif // MAX_VECTOR_SIZE >= 256 336 | 337 | #if MAX_VECTOR_SIZE >= 512 338 | static inline Vec16f tanh(Vec16f const x) { 339 | return tanh_f(x); 340 | } 341 | #endif // MAX_VECTOR_SIZE >= 512 342 | 343 | 344 | 345 | /****************************************************************************** 346 | * Inverse hyperbolic functions 347 | ******************************************************************************/ 348 | 349 | // Template for asinh function, double precision 350 | // This function does not produce denormals 351 | // Template parameters: 352 | // VTYPE: double vector type 353 | template 354 | static inline VTYPE asinh_d(VTYPE const x0) { 355 | 356 | // Coefficients 357 | const double p0 = -5.56682227230859640450E0; 358 | const double p1 = -9.09030533308377316566E0; 359 | const double p2 = -4.37390226194356683570E0; 360 | const double p3 = -5.91750212056387121207E-1; 361 | const double p4 = -4.33231683752342103572E-3; 362 | 363 | const double q0 = 3.34009336338516356383E1; 364 | const double q1 = 6.95722521337257608734E1; 365 | const double q2 = 4.86042483805291788324E1; 366 | const double q3 = 1.28757002067426453537E1; 367 | const double q4 = 1.0; 368 | 369 | // data vectors 370 | VTYPE x, x2, y1, y2; 371 | 372 | x2 = x0 * x0; 373 | x = abs(x0); 374 | auto x_small = x <= 0.533; // use Pade approximation if abs(x) <= 0.5 375 | // Both methods give the highest error close to 0.5. 
376 | // This limit is adjusted for minimum error 377 | auto x_huge = x > 1.E20; // simple approximation, avoid overflow 378 | 379 | if (horizontal_or(x_small)) { 380 | // At least one element needs small method 381 | y1 = polynomial_4(x2, p0, p1, p2, p3, p4) / polynomial_4(x2, q0, q1, q2, q3, q4); 382 | y1 = mul_add(y1, x2*x, x); // y1 = x + (x2*x)*y1; 383 | } 384 | if (!horizontal_and(x_small)) { 385 | // At least one element needs big method 386 | y2 = log(x + sqrt(x2 + 1.0)); 387 | if (horizontal_or(x_huge)) { 388 | // At least one element needs huge method to avoid overflow 389 | y2 = select(x_huge, log(x) + VM_LN2, y2); 390 | } 391 | } 392 | y1 = select(x_small, y1, y2); // choose method 393 | y1 = sign_combine(y1, x0); // get original sign 394 | return y1; 395 | } 396 | 397 | // instances of asinh_d template 398 | static inline Vec2d asinh(Vec2d const x) { 399 | return asinh_d(x); 400 | } 401 | 402 | #if MAX_VECTOR_SIZE >= 256 403 | static inline Vec4d asinh(Vec4d const x) { 404 | return asinh_d(x); 405 | } 406 | #endif // MAX_VECTOR_SIZE >= 256 407 | 408 | #if MAX_VECTOR_SIZE >= 512 409 | static inline Vec8d asinh(Vec8d const x) { 410 | return asinh_d(x); 411 | } 412 | #endif // MAX_VECTOR_SIZE >= 512 413 | 414 | 415 | // Template for asinh function, single precision 416 | // This function does not produce denormals 417 | // Template parameters: 418 | // VTYPE: double vector type 419 | template 420 | static inline VTYPE asinh_f(VTYPE const x0) { 421 | 422 | // Coefficients 423 | const float r0 = -1.6666288134E-1f; 424 | const float r1 = 7.4847586088E-2f; 425 | const float r2 = -4.2699340972E-2f; 426 | const float r3 = 2.0122003309E-2f; 427 | 428 | // data vectors 429 | VTYPE x, x2, y1, y2; 430 | 431 | x2 = x0 * x0; 432 | x = abs(x0); 433 | auto x_small = x <= 0.51f; // use polynomial approximation if abs(x) <= 0.5 434 | auto x_huge = x > 1.E10f; // simple approximation, avoid overflow 435 | 436 | if (horizontal_or(x_small)) { 437 | // At least one element 
needs small method 438 | y1 = polynomial_3(x2, r0, r1, r2, r3); 439 | y1 = mul_add(y1, x2*x, x); // y1 = x + (x2*x)*y1; 440 | } 441 | if (!horizontal_and(x_small)) { 442 | // At least one element needs big method 443 | y2 = log(x + sqrt(x2 + 1.0f)); 444 | if (horizontal_or(x_huge)) { 445 | // At least one element needs huge method to avoid overflow 446 | y2 = select(x_huge, log(x) + (float)VM_LN2, y2); 447 | } 448 | } 449 | y1 = select(x_small, y1, y2); // choose method 450 | y1 = sign_combine(y1, x0); // get original sign 451 | return y1; 452 | } 453 | 454 | // instances of asinh_f template 455 | static inline Vec4f asinh(Vec4f const x) { 456 | return asinh_f(x); 457 | } 458 | 459 | #if MAX_VECTOR_SIZE >= 256 460 | static inline Vec8f asinh(Vec8f const x) { 461 | return asinh_f(x); 462 | } 463 | #endif // MAX_VECTOR_SIZE >= 256 464 | 465 | #if MAX_VECTOR_SIZE >= 512 466 | static inline Vec16f asinh(Vec16f const x) { 467 | return asinh_f(x); 468 | } 469 | #endif // MAX_VECTOR_SIZE >= 512 470 | 471 | 472 | // Template for acosh function, double precision 473 | // This function does not produce denormals 474 | // Template parameters: 475 | // VTYPE: double vector type 476 | template 477 | static inline VTYPE acosh_d(VTYPE const x0) { 478 | 479 | // Coefficients 480 | const double p0 = 1.10855947270161294369E5; 481 | const double p1 = 1.08102874834699867335E5; 482 | const double p2 = 3.43989375926195455866E4; 483 | const double p3 = 3.94726656571334401102E3; 484 | const double p4 = 1.18801130533544501356E2; 485 | 486 | const double q0 = 7.83869920495893927727E4; 487 | const double q1 = 8.29725251988426222434E4; 488 | const double q2 = 2.97683430363289370382E4; 489 | const double q3 = 4.15352677227719831579E3; 490 | const double q4 = 1.86145380837903397292E2; 491 | const double q5 = 1.0; 492 | 493 | // data vectors 494 | VTYPE x1, y1, y2; 495 | 496 | x1 = x0 - 1.0; 497 | auto undef = x0 < 1.0; // result is NAN 498 | auto x_small = x1 < 0.49; // use Pade approximation 
if abs(x-1) < 0.5 499 | auto x_huge = x1 > 1.E20; // simple approximation, avoid overflow 500 | 501 | if (horizontal_or(x_small)) { 502 | // At least one element needs small method 503 | y1 = sqrt(x1) * (polynomial_4(x1, p0, p1, p2, p3, p4) / polynomial_5(x1, q0, q1, q2, q3, q4, q5)); 504 | // x < 1 generates NAN 505 | y1 = select(undef, nan_vec(NAN_HYP), y1); 506 | } 507 | if (!horizontal_and(x_small)) { 508 | // At least one element needs big method 509 | y2 = log(x0 + sqrt(mul_sub(x0,x0,1.0))); 510 | if (horizontal_or(x_huge)) { 511 | // At least one element needs huge method to avoid overflow 512 | y2 = select(x_huge, log(x0) + VM_LN2, y2); 513 | } 514 | } 515 | y1 = select(x_small, y1, y2); // choose method 516 | return y1; 517 | } 518 | 519 | // instances of acosh_d template 520 | static inline Vec2d acosh(Vec2d const x) { 521 | return acosh_d(x); 522 | } 523 | 524 | #if MAX_VECTOR_SIZE >= 256 525 | static inline Vec4d acosh(Vec4d const x) { 526 | return acosh_d(x); 527 | } 528 | #endif // MAX_VECTOR_SIZE >= 256 529 | 530 | #if MAX_VECTOR_SIZE >= 512 531 | static inline Vec8d acosh(Vec8d const x) { 532 | return acosh_d(x); 533 | } 534 | #endif // MAX_VECTOR_SIZE >= 512 535 | 536 | 537 | // Template for acosh function, single precision 538 | // This function does not produce denormals 539 | // Template parameters: 540 | // VTYPE: double vector type 541 | template 542 | static inline VTYPE acosh_f(VTYPE const x0) { 543 | 544 | // Coefficients 545 | const float r0 = 1.4142135263E0f; 546 | const float r1 = -1.1784741703E-1f; 547 | const float r2 = 2.6454905019E-2f; 548 | const float r3 = -7.5272886713E-3f; 549 | const float r4 = 1.7596881071E-3f; 550 | 551 | // data vectors 552 | VTYPE x1, y1, y2; 553 | 554 | x1 = x0 - 1.0f; 555 | auto undef = x0 < 1.0f; // result is NAN 556 | auto x_small = x1 < 0.49f; // use Pade approximation if abs(x-1) < 0.5 557 | auto x_huge = x1 > 1.E10f; // simple approximation, avoid overflow 558 | 559 | if (horizontal_or(x_small)) { 560 
| // At least one element needs small method 561 | y1 = sqrt(x1) * polynomial_4(x1, r0, r1, r2, r3, r4); 562 | // x < 1 generates NAN 563 | y1 = select(undef, nan_vec(NAN_HYP), y1); 564 | } 565 | if (!horizontal_and(x_small)) { 566 | // At least one element needs big method 567 | y2 = log(x0 + sqrt(mul_sub(x0,x0,1.0))); 568 | if (horizontal_or(x_huge)) { 569 | // At least one element needs huge method to avoid overflow 570 | y2 = select(x_huge, log(x0) + (float)VM_LN2, y2); 571 | } 572 | } 573 | y1 = select(x_small, y1, y2); // choose method 574 | return y1; 575 | } 576 | 577 | // instances of acosh_f template 578 | static inline Vec4f acosh(Vec4f const x) { 579 | return acosh_f(x); 580 | } 581 | 582 | #if MAX_VECTOR_SIZE >= 256 583 | static inline Vec8f acosh(Vec8f const x) { 584 | return acosh_f(x); 585 | } 586 | #endif // MAX_VECTOR_SIZE >= 256 587 | 588 | #if MAX_VECTOR_SIZE >= 512 589 | static inline Vec16f acosh(Vec16f const x) { 590 | return acosh_f(x); 591 | } 592 | #endif // MAX_VECTOR_SIZE >= 512 593 | 594 | 595 | // Template for atanh function, double precision 596 | // This function does not produce denormals 597 | // Template parameters: 598 | // VTYPE: double vector type 599 | template 600 | static inline VTYPE atanh_d(VTYPE const x0) { 601 | 602 | // Coefficients 603 | const double p0 = -3.09092539379866942570E1; 604 | const double p1 = 6.54566728676544377376E1; 605 | const double p2 = -4.61252884198732692637E1; 606 | const double p3 = 1.20426861384072379242E1; 607 | const double p4 = -8.54074331929669305196E-1; 608 | 609 | const double q0 = -9.27277618139601130017E1; 610 | const double q1 = 2.52006675691344555838E2; 611 | const double q2 = -2.49839401325893582852E2; 612 | const double q3 = 1.08938092147140262656E2; 613 | const double q4 = -1.95638849376911654834E1; 614 | const double q5 = 1.0; 615 | 616 | // data vectors 617 | VTYPE x, x2, y1, y2, y3; 618 | 619 | x = abs(x0); 620 | auto x_small = x < 0.5; // use Pade approximation if abs(x) < 0.5 
621 | 622 | if (horizontal_or(x_small)) { 623 | // At least one element needs small method 624 | x2 = x * x; 625 | y1 = polynomial_4(x2, p0, p1, p2, p3, p4) / polynomial_5(x2, q0, q1, q2, q3, q4, q5); 626 | y1 = mul_add(y1, x2*x, x); 627 | } 628 | if (!horizontal_and(x_small)) { 629 | // At least one element needs big method 630 | y2 = log((1.0+x)/(1.0-x)) * 0.5; 631 | // check if out of range 632 | y3 = select(x == 1.0, infinite_vec(), nan_vec(NAN_HYP)); 633 | y2 = select(x >= 1.0, y3, y2); 634 | } 635 | y1 = select(x_small, y1, y2); // choose method 636 | y1 = sign_combine(y1, x0); // get original sign 637 | return y1; 638 | } 639 | 640 | // instances of atanh_d template 641 | static inline Vec2d atanh(Vec2d const x) { 642 | return atanh_d(x); 643 | } 644 | 645 | #if MAX_VECTOR_SIZE >= 256 646 | static inline Vec4d atanh(Vec4d const x) { 647 | return atanh_d(x); 648 | } 649 | #endif // MAX_VECTOR_SIZE >= 256 650 | 651 | #if MAX_VECTOR_SIZE >= 512 652 | static inline Vec8d atanh(Vec8d const x) { 653 | return atanh_d(x); 654 | } 655 | #endif // MAX_VECTOR_SIZE >= 512 656 | 657 | 658 | // Template for atanh function, single precision 659 | // This function does not produce denormals 660 | // Template parameters: 661 | // VTYPE: double vector type 662 | template 663 | static inline VTYPE atanh_f(VTYPE const x0) { 664 | 665 | // Coefficients 666 | const float r0 = 3.33337300303E-1f; 667 | const float r1 = 1.99782164500E-1f; 668 | const float r2 = 1.46691431730E-1f; 669 | const float r3 = 8.24370301058E-2f; 670 | const float r4 = 1.81740078349E-1f; 671 | 672 | // data vectors 673 | VTYPE x, x2, y1, y2, y3; 674 | 675 | x = abs(x0); 676 | auto x_small = x < 0.5f; // use polynomial approximation if abs(x) < 0.5 677 | 678 | if (horizontal_or(x_small)) { 679 | // At least one element needs small method 680 | x2 = x * x; 681 | y1 = polynomial_4(x2, r0, r1, r2, r3, r4); 682 | y1 = mul_add(y1, x2*x, x); 683 | } 684 | if (!horizontal_and(x_small)) { 685 | // At least one 
element needs big method 686 | y2 = log((1.0f+x)/(1.0f-x)) * 0.5f; 687 | // check if out of range 688 | y3 = select(x == 1.0f, infinite_vec(), nan_vec(NAN_HYP)); 689 | y2 = select(x >= 1.0f, y3, y2); 690 | } 691 | y1 = select(x_small, y1, y2); // choose method 692 | y1 = sign_combine(y1, x0); // get original sign 693 | return y1; 694 | } 695 | 696 | // instances of atanh_f template 697 | static inline Vec4f atanh(Vec4f const x) { 698 | return atanh_f(x); 699 | } 700 | 701 | #if MAX_VECTOR_SIZE >= 256 702 | static inline Vec8f atanh(Vec8f const x) { 703 | return atanh_f(x); 704 | } 705 | #endif // MAX_VECTOR_SIZE >= 256 706 | 707 | #if MAX_VECTOR_SIZE >= 512 708 | static inline Vec16f atanh(Vec16f const x) { 709 | return atanh_f(x); 710 | } 711 | #endif // MAX_VECTOR_SIZE >= 512 712 | 713 | #ifdef VCL_NAMESPACE 714 | } 715 | #endif 716 | 717 | #endif 718 | -------------------------------------------------------------------------------- /vectormath_trig.h: -------------------------------------------------------------------------------- 1 | /**************************** vectormath_trig.h ****************************** 2 | * Author: Agner Fog 3 | * Date created: 2014-04-18 4 | * Last modified: 2022-07-26 5 | * Version: 2.02.00 6 | * Project: vector class library 7 | * Description: 8 | * Header file containing inline version of trigonometric functions 9 | * and inverse trigonometric functions 10 | * sin, cos, sincos, tan 11 | * asin, acos, atan, atan2 12 | * 13 | * Theory, methods, and inspiration based partially on these sources: 14 | * > Moshier, Stephen Lloyd Baluk: Methods and programs for mathematical functions. 15 | * Ellis Horwood, 1989. 16 | * > VDT library developed on CERN by Danilo Piparo, Thomas Hauth and 17 | * Vincenzo Innocente, 2012, https://svnweb.cern.ch/trac/vdt 18 | * > Cephes math library by Stephen L. 
Moshier 1992, 19 | * http://www.netlib.org/cephes/ 20 | * 21 | * For detailed instructions, see vectormath_common.h and vcl_manual.pdf 22 | * 23 | * (c) Copyright 2014-2022 Agner Fog. 24 | * Apache License version 2.0 or later. 25 | ******************************************************************************/ 26 | 27 | #ifndef VECTORMATH_TRIG_H 28 | #define VECTORMATH_TRIG_H 202 29 | 30 | #include "vectormath_common.h" 31 | 32 | #ifdef VCL_NAMESPACE 33 | namespace VCL_NAMESPACE { 34 | #endif 35 | 36 | 37 | // ************************************************************* 38 | // sin/cos template, double precision 39 | // ************************************************************* 40 | // Template parameters: 41 | // VTYPE: f.p. vector type 42 | // SC: 1 = sin, 2 = cos, 3 = sincos, 4 = tan, 8 = multiply by pi 43 | // Parameters: 44 | // xx = input x (radians) 45 | // cosret = return pointer (only if SC = 3) 46 | template 47 | static inline VTYPE sincos_d(VTYPE * cosret, VTYPE const xx) { 48 | 49 | // define constants 50 | const double P0sin = -1.66666666666666307295E-1; 51 | const double P1sin = 8.33333333332211858878E-3; 52 | const double P2sin = -1.98412698295895385996E-4; 53 | const double P3sin = 2.75573136213857245213E-6; 54 | const double P4sin = -2.50507477628578072866E-8; 55 | const double P5sin = 1.58962301576546568060E-10; 56 | 57 | const double P0cos = 4.16666666666665929218E-2; 58 | const double P1cos = -1.38888888888730564116E-3; 59 | const double P2cos = 2.48015872888517045348E-5; 60 | const double P3cos = -2.75573141792967388112E-7; 61 | const double P4cos = 2.08757008419747316778E-9; 62 | const double P5cos = -1.13585365213876817300E-11; 63 | 64 | const double DP1 = 7.853981554508209228515625E-1 * 2.; 65 | const double DP2 = 7.94662735614792836714E-9 * 2.; 66 | const double DP3 = 3.06161699786838294307E-17 * 2.; 67 | 68 | typedef decltype(roundi(xx)) ITYPE; // integer vector type 69 | //typedef decltype(nan_code(xx)) UITYPE; // unsigned integer 
vector type 70 | typedef decltype(xx < xx) BVTYPE; // boolean vector type 71 | 72 | VTYPE xa, x, y, x2, s, c, sin1, cos1; // data vectors 73 | ITYPE q, signsin, signcos; // integer vectors, 64 bit 74 | 75 | BVTYPE swap; // boolean vector 76 | 77 | #if INSTRSET < 8 // no FMA 78 | const double input_limit = 1.E13; // lower overflow limit without FMA 79 | #else 80 | const double input_limit = 1.E15; 81 | #endif 82 | 83 | xa = abs(xx); 84 | 85 | // Find quadrant 86 | if constexpr ((SC & 8) != 0) { 87 | y = round(xa * 2.0); 88 | } 89 | else { 90 | xa = select(xa > VTYPE(input_limit), VTYPE(0.f), xa); // overflow limit 91 | y = round(xa * (double)(2. / VM_PI)); // quadrant, as float 92 | } 93 | q = roundi(y); // quadrant, as integer 94 | // Quadrant: 95 | // 0 - pi/4 => 0 96 | // pi/4 - 3*pi/4 => 1 97 | // 3*pi/4 - 5*pi/4 => 2 98 | // 5*pi/4 - 7*pi/4 => 3 99 | // 7*pi/4 - 8*pi/4 => 4 100 | 101 | if constexpr ((SC & 8) != 0) { 102 | x = nmul_add(y, 0.5, xa) * (VM_PI); 103 | } 104 | else { 105 | // Reduce by extended precision modular arithmetic 106 | #if INSTRSET < 8 // no FMA 107 | x = ((xa - y * DP1) - y * DP2) - y * DP3; 108 | #else 109 | x = nmul_add(y, DP3, nmul_add(y, DP2 + DP1, xa)); 110 | #endif 111 | } 112 | // Expansion of sin and cos, valid for -pi/4 <= x <= pi/4 113 | x2 = x * x; 114 | s = polynomial_5(x2, P0sin, P1sin, P2sin, P3sin, P4sin, P5sin); 115 | c = polynomial_5(x2, P0cos, P1cos, P2cos, P3cos, P4cos, P5cos); 116 | s = mul_add(x * x2, s, x); // s = x + (x * x2) * s; 117 | c = mul_add(x2 * x2, c, nmul_add(x2, 0.5, 1.0)); // c = 1.0 - x2 * 0.5 + (x2 * x2) * c; 118 | 119 | // swap sin and cos if odd quadrant 120 | swap = BVTYPE((q & 1) != 0); 121 | 122 | if constexpr ((SC & 1) != 0) { // calculate sin 123 | sin1 = select(swap, c, s); 124 | signsin = ((q << 62) ^ ITYPE(reinterpret_i(xx))); 125 | sin1 = sign_combine(sin1, reinterpret_d(signsin)); 126 | } 127 | if constexpr ((SC & 2) != 0) { // calculate cos 128 | cos1 = select(swap, s, c); 129 | signcos = 
((q + 1) & 2) << 62; 130 | cos1 ^= reinterpret_d(signcos); 131 | } 132 | if constexpr ((SC & 7) == 3) { // calculate both. cos returned through pointer 133 | *cosret = cos1; 134 | } 135 | if constexpr ((SC & 1) != 0) return sin1; else return cos1; 136 | } 137 | 138 | // instantiations of sincos_d template: 139 | 140 | static inline Vec2d sin(Vec2d const x) { 141 | return sincos_d(0, x); 142 | } 143 | 144 | static inline Vec2d cos(Vec2d const x) { 145 | return sincos_d(0, x); 146 | } 147 | 148 | static inline Vec2d sincos(Vec2d * cosret, Vec2d const x) { 149 | return sincos_d(cosret, x); 150 | } 151 | 152 | static inline Vec2d sinpi(Vec2d const x) { 153 | return sincos_d(0, x); 154 | } 155 | 156 | static inline Vec2d cospi(Vec2d const x) { 157 | return sincos_d(0, x); 158 | } 159 | 160 | static inline Vec2d sincospi(Vec2d * cosret, Vec2d const x) { 161 | return sincos_d(cosret, x); 162 | } 163 | 164 | 165 | #if MAX_VECTOR_SIZE >= 256 166 | static inline Vec4d sin(Vec4d const x) { 167 | return sincos_d(0, x); 168 | } 169 | 170 | static inline Vec4d cos(Vec4d const x) { 171 | return sincos_d(0, x); 172 | } 173 | 174 | static inline Vec4d sincos(Vec4d * cosret, Vec4d const x) { 175 | return sincos_d(cosret, x); 176 | } 177 | 178 | static inline Vec4d sinpi(Vec4d const x) { 179 | return sincos_d(0, x); 180 | } 181 | 182 | static inline Vec4d cospi(Vec4d const x) { 183 | return sincos_d(0, x); 184 | } 185 | 186 | static inline Vec4d sincospi(Vec4d * cosret, Vec4d const x) { 187 | return sincos_d(cosret, x); 188 | } 189 | 190 | #endif // MAX_VECTOR_SIZE >= 256 191 | 192 | #if MAX_VECTOR_SIZE >= 512 193 | static inline Vec8d sin(Vec8d const x) { 194 | return sincos_d(0, x); 195 | } 196 | 197 | static inline Vec8d cos(Vec8d const x) { 198 | return sincos_d(0, x); 199 | } 200 | 201 | static inline Vec8d sincos(Vec8d * cosret, Vec8d const x) { 202 | return sincos_d(cosret, x); 203 | } 204 | 205 | static inline Vec8d sinpi(Vec8d const x) { 206 | return sincos_d(0, x); 207 | } 
208 | 209 | static inline Vec8d cospi(Vec8d const x) { 210 | return sincos_d(0, x); 211 | } 212 | 213 | static inline Vec8d sincospi(Vec8d * cosret, Vec8d const x) { 214 | return sincos_d(cosret, x); 215 | } 216 | 217 | #endif // MAX_VECTOR_SIZE >= 512 218 | 219 | 220 | // ************************************************************* 221 | // sincos template, single precision 222 | // ************************************************************* 223 | // Template parameters: 224 | // VTYPE: f.p. vector type 225 | // SC: 1 = sin, 2 = cos, 3 = sincos, 4 = tan, 8 = multiply by pi 226 | // Parameters: 227 | // xx = input x (radians) 228 | // cosret = return pointer (only if SC = 3) 229 | template 230 | static inline VTYPE sincos_f(VTYPE* cosret, VTYPE const xx) { 231 | 232 | // define constants 233 | const float DP1F = 0.78515625f * 2.f; 234 | const float DP2F = 2.4187564849853515625E-4f * 2.f; 235 | const float DP3F = 3.77489497744594108E-8f * 2.f; 236 | 237 | const float P0sinf = -1.6666654611E-1f; 238 | const float P1sinf = 8.3321608736E-3f; 239 | const float P2sinf = -1.9515295891E-4f; 240 | 241 | const float P0cosf = 4.166664568298827E-2f; 242 | const float P1cosf = -1.388731625493765E-3f; 243 | const float P2cosf = 2.443315711809948E-5f; 244 | 245 | typedef decltype(roundi(xx)) ITYPE; // integer vector type 246 | //typedef decltype(nan_code(xx)) UITYPE; // unsigned integer vector type 247 | typedef decltype(xx < xx) BVTYPE; // boolean vector type 248 | 249 | #if INSTRSET < 8 // no FMA 250 | const float input_limit = 1.E5f; // lower overflow limit without FMA 251 | #else 252 | const float input_limit = 1.E7f; 253 | #endif 254 | 255 | VTYPE xa, x, y, x2, s, c, sin1, cos1; // data vectors 256 | ITYPE q, signsin, signcos; // integer vectors 257 | BVTYPE swap; // boolean vector 258 | 259 | xa = abs(xx); 260 | 261 | // Find quadrant 262 | if constexpr ((SC & 8) != 0) { 263 | y = round(xa * 2.0f); 264 | } 265 | else { 266 | xa = select(xa > VTYPE(input_limit), 
VTYPE(0.f), xa); // overflow limit 267 | y = round(xa * (float)(2. / VM_PI)); // quadrant, as float 268 | } 269 | q = roundi(y); // quadrant, as integer 270 | // Quadrant: 271 | // 0 - pi/4 => 0 272 | // pi/4 - 3*pi/4 => 1 273 | // 3*pi/4 - 5*pi/4 => 2 274 | // 5*pi/4 - 7*pi/4 => 3 275 | // 7*pi/4 - 8*pi/4 => 4 276 | 277 | if constexpr ((SC & 8) != 0) { 278 | x = nmul_add(y, 0.5f, xa)*float(VM_PI); 279 | } 280 | else { 281 | // Reduce by extended precision modular arithmetic 282 | #if INSTRSET < 8 // no FMA 283 | x = ((xa - y * DP1F) - y * DP2F) - y * DP3F; 284 | #else 285 | x = nmul_add(y, DP3F, nmul_add(y, DP2F + DP1F, xa)); 286 | #endif 287 | } 288 | // Taylor expansion of sin and cos, valid for -pi/4 <= x <= pi/4 289 | x2 = x * x; 290 | s = polynomial_2(x2, P0sinf, P1sinf, P2sinf) * (x*x2) + x; 291 | c = polynomial_2(x2, P0cosf, P1cosf, P2cosf) * (x2*x2) + nmul_add(0.5f, x2, 1.0f); 292 | 293 | // swap sin and cos if odd quadrant 294 | swap = BVTYPE((q & 1) != 0); 295 | 296 | if constexpr ((SC & 5) != 0) { // calculate sin 297 | sin1 = select(swap, c, s); 298 | signsin = ((q << 30) ^ ITYPE(reinterpret_i(xx))); 299 | sin1 = sign_combine(sin1, reinterpret_f(signsin)); 300 | } 301 | if constexpr ((SC & 6) != 0) { // calculate cos 302 | cos1 = select(swap, s, c); 303 | signcos = ((q + 1) & 2) << 30; 304 | cos1 ^= reinterpret_f(signcos); 305 | } 306 | if constexpr ((SC & 7) == 1) return sin1; 307 | else if constexpr ((SC & 7) == 2) return cos1; 308 | else if constexpr ((SC & 7) == 3) { // calculate both. cos returned through pointer 309 | *cosret = cos1; 310 | return sin1; 311 | } 312 | else { // SC == 4. tan 313 | if constexpr (SC == 12) { 314 | // tanpi can give INF result, tan cannot. Get the right sign of INF result according to IEEE 754-2019 315 | cos1 = select(cos1 == 0.f, 0.f, cos1); // remove sign of 0 316 | // the sign of zero output is arbitrary. 
fixing it would be a waste of code 317 | } 318 | return sin1 / cos1; 319 | } 320 | } 321 | 322 | // instantiations of sincos_f template: 323 | 324 | static inline Vec4f sin(Vec4f const x) { 325 | return sincos_f(0, x); 326 | } 327 | 328 | static inline Vec4f cos(Vec4f const x) { 329 | return sincos_f(0, x); 330 | } 331 | 332 | static inline Vec4f sincos(Vec4f * cosret, Vec4f const x) { 333 | return sincos_f(cosret, x); 334 | } 335 | 336 | static inline Vec4f tan(Vec4f const x) { 337 | return sincos_f(0, x); 338 | } 339 | 340 | static inline Vec4f sinpi(Vec4f const x) { 341 | return sincos_f(0, x); 342 | } 343 | 344 | static inline Vec4f cospi(Vec4f const x) { 345 | return sincos_f(0, x); 346 | } 347 | 348 | static inline Vec4f sincospi(Vec4f * cosret, Vec4f const x) { 349 | return sincos_f(cosret, x); 350 | } 351 | 352 | static inline Vec4f tanpi(Vec4f const x) { 353 | return sincos_f(0, x); 354 | } 355 | 356 | #if MAX_VECTOR_SIZE >= 256 357 | static inline Vec8f sin(Vec8f const x) { 358 | return sincos_f(0, x); 359 | } 360 | 361 | static inline Vec8f cos(Vec8f const x) { 362 | return sincos_f(0, x); 363 | } 364 | 365 | static inline Vec8f sincos(Vec8f * cosret, Vec8f const x) { 366 | return sincos_f(cosret, x); 367 | } 368 | 369 | static inline Vec8f tan(Vec8f const x) { 370 | return sincos_f(0, x); 371 | } 372 | 373 | static inline Vec8f sinpi(Vec8f const x) { 374 | return sincos_f(0, x); 375 | } 376 | 377 | static inline Vec8f cospi(Vec8f const x) { 378 | return sincos_f(0, x); 379 | } 380 | 381 | static inline Vec8f sincospi(Vec8f * cosret, Vec8f const x) { 382 | return sincos_f(cosret, x); 383 | } 384 | 385 | static inline Vec8f tanpi(Vec8f const x) { 386 | return sincos_f(0, x); 387 | } 388 | 389 | #endif // MAX_VECTOR_SIZE >= 256 390 | 391 | #if MAX_VECTOR_SIZE >= 512 392 | static inline Vec16f sin(Vec16f const x) { 393 | return sincos_f(0, x); 394 | } 395 | 396 | static inline Vec16f cos(Vec16f const x) { 397 | return sincos_f(0, x); 398 | } 399 | 400 | 
static inline Vec16f sincos(Vec16f * cosret, Vec16f const x) { 401 | return sincos_f(cosret, x); 402 | } 403 | 404 | static inline Vec16f tan(Vec16f const x) { 405 | return sincos_f(0, x); 406 | } 407 | 408 | static inline Vec16f sinpi(Vec16f const x) { 409 | return sincos_f(0, x); 410 | } 411 | 412 | static inline Vec16f cospi(Vec16f const x) { 413 | return sincos_f(0, x); 414 | } 415 | 416 | static inline Vec16f sincospi(Vec16f * cosret, Vec16f const x) { 417 | return sincos_f(cosret, x); 418 | } 419 | 420 | static inline Vec16f tanpi(Vec16f const x) { 421 | return sincos_f(0, x); 422 | } 423 | 424 | #endif // MAX_VECTOR_SIZE >= 512 425 | 426 | 427 | // ************************************************************* 428 | // tan template, double precision 429 | // ************************************************************* 430 | // Template parameters: 431 | // VTYPE: f.p. vector type 432 | // Template parameters: 433 | // SC: 0 = tan, 8 = multiply by pi 434 | // Parameters: 435 | // x = input x (radians) 436 | template 437 | static inline VTYPE tan_d(VTYPE const x) { 438 | 439 | // define constants 440 | const double DP1 = 7.853981554508209228515625E-1 * 2.; 441 | const double DP2 = 7.94662735614792836714E-9 * 2.; 442 | const double DP3 = 3.06161699786838294307E-17 * 2.; 443 | 444 | const double P2tan = -1.30936939181383777646E4; 445 | const double P1tan = 1.15351664838587416140E6; 446 | const double P0tan = -1.79565251976484877988E7; 447 | 448 | const double Q3tan = 1.36812963470692954678E4; 449 | const double Q2tan = -1.32089234440210967447E6; 450 | const double Q1tan = 2.50083801823357915839E7; 451 | const double Q0tan = -5.38695755929454629881E7; 452 | 453 | typedef decltype(x > x) BVTYPE; // boolean vector type 454 | VTYPE xa, y, z, zz, px, qx, tn, recip; // data vectors 455 | BVTYPE doinvert; // boolean vector 456 | typedef decltype(nan_code(x)) UITYPE; // unsigned integer vector type 457 | 458 | xa = abs(x); 459 | 460 | // Find quadrant 461 | if constexpr 
((SC & 8) != 0) { 462 | y = round(xa * 2.0); 463 | } 464 | else { 465 | xa = select(xa > VTYPE(1.E15), VTYPE(0.), xa); // overflow limit 466 | y = round(xa * (double)(2. / VM_PI)); // quadrant, as float 467 | } 468 | auto q = roundi(y); // quadrant, as integer 469 | // Quadrant: 470 | // 0 - pi/4 => 0 471 | // pi/4 - 3*pi/4 => 1 472 | // 3*pi/4 - 5*pi/4 => 2 473 | // 5*pi/4 - 7*pi/4 => 3 474 | // 7*pi/4 - 8*pi/4 => 4 475 | 476 | if constexpr ((SC & 8) != 0) { 477 | z = nmul_add(y, 0.5, xa) * (VM_PI); 478 | } 479 | else { 480 | // Reduce by extended precision modular arithmetic 481 | #if INSTRSET < 8 // no FMA 482 | z = ((xa - y * DP1) - y * DP2) - y * DP3; 483 | #else 484 | z = nmul_add(y, DP3, nmul_add(y, DP2 + DP1, xa)); 485 | #endif 486 | } 487 | // Pade approximation of tan, valid for -pi/4 <= x <= pi/4 488 | zz = z * z; 489 | px = polynomial_2(zz, P0tan, P1tan, P2tan); 490 | qx = polynomial_4n(zz, Q0tan, Q1tan, Q2tan, Q3tan); 491 | 492 | // qx cannot be 0 for x <= pi/4 493 | tn = mul_add(px / qx, z * zz, z); // tn = z + z * zz * px / qx; 494 | 495 | // if (q&2) tn = -1/tn 496 | doinvert = BVTYPE((q & 1) != 0); 497 | 498 | if constexpr ((SC & 8) != 0) { 499 | // tan cannot give infinity because the input cannot be exactly pi/2. 500 | // tanpi can generate infinity. Get the right sign of infinity: 501 | UITYPE infsign = UITYPE(q) << 62; // get bit 1 into the sign bit position 502 | VTYPE zsign = sign_combine(VTYPE(-0.), reinterpret_d(infsign)); 503 | tn = select(tn == 0., zsign, tn); // get INF with the right sign when tn == 0 504 | // the sign of zero output is arbitrary. fixing it would be a waste of code 505 | } 506 | recip = -1. 
/ tn; 507 | tn = select(doinvert, recip, tn); 508 | tn = sign_combine(tn, x); // combine with original sign of x 509 | return tn; 510 | } 511 | 512 | // instantiations of tan_d template: 513 | 514 | static inline Vec2d tan(Vec2d const x) { 515 | return tan_d(x); 516 | } 517 | 518 | static inline Vec2d tanpi(Vec2d const x) { 519 | return tan_d(x); 520 | } 521 | 522 | #if MAX_VECTOR_SIZE >= 256 523 | static inline Vec4d tan(Vec4d const x) { 524 | return tan_d(x); 525 | } 526 | 527 | static inline Vec4d tanpi(Vec4d const x) { 528 | return tan_d(x); 529 | } 530 | #endif // MAX_VECTOR_SIZE >= 256 531 | 532 | #if MAX_VECTOR_SIZE >= 512 533 | static inline Vec8d tan(Vec8d const x) { 534 | return tan_d(x); 535 | } 536 | 537 | static inline Vec8d tanpi(Vec8d const x) { 538 | return tan_d(x); 539 | } 540 | #endif // MAX_VECTOR_SIZE >= 512 541 | 542 | 543 | // ************************************************************* 544 | // tan template, single precision 545 | // ************************************************************* 546 | // This is removed for the single precision version. 547 | // It is faster to use tan(x) = sin(x)/cos(x) 548 | 549 | 550 | 551 | // ************************************************************* 552 | // asin/acos template, double precision 553 | // ************************************************************* 554 | // Template parameters: 555 | // VTYPE: f.p. 
vector type 556 | // AC: 0 = asin, 1 = acos 557 | // Parameters: 558 | // x = input x 559 | template 560 | static inline VTYPE asin_d(VTYPE const x) { 561 | 562 | // define constants 563 | const double R4asin = 2.967721961301243206100E-3; 564 | const double R3asin = -5.634242780008963776856E-1; 565 | const double R2asin = 6.968710824104713396794E0; 566 | const double R1asin = -2.556901049652824852289E1; 567 | const double R0asin = 2.853665548261061424989E1; 568 | 569 | const double S3asin = -2.194779531642920639778E1; 570 | const double S2asin = 1.470656354026814941758E2; 571 | const double S1asin = -3.838770957603691357202E2; 572 | const double S0asin = 3.424398657913078477438E2; 573 | 574 | const double P5asin = 4.253011369004428248960E-3; 575 | const double P4asin = -6.019598008014123785661E-1; 576 | const double P3asin = 5.444622390564711410273E0; 577 | const double P2asin = -1.626247967210700244449E1; 578 | const double P1asin = 1.956261983317594739197E1; 579 | const double P0asin = -8.198089802484824371615E0; 580 | 581 | const double Q4asin = -1.474091372988853791896E1; 582 | const double Q3asin = 7.049610280856842141659E1; 583 | const double Q2asin = -1.471791292232726029859E2; 584 | const double Q1asin = 1.395105614657485689735E2; 585 | const double Q0asin = -4.918853881490881290097E1; 586 | 587 | VTYPE xa, xb, x1, x2, x3, x4, x5, px, qx, rx, sx, vx, wx, y1, z, z1, z2; 588 | bool dobig, dosmall; 589 | 590 | xa = abs(x); 591 | auto big = xa >= 0.625; // boolean vector 592 | 593 | /* 594 | Small: xa < 0.625 595 | ------------------ 596 | x = xa * xa; 597 | px = PX(x); 598 | qx = QX(x); 599 | y1 = x*px/qx; 600 | y1 = xa * y1 + xa; 601 | 602 | Big: xa >= 0.625 603 | ------------------ 604 | x = 1.0 - xa; 605 | rx = RX(x); 606 | sx = SX(x); 607 | y1 = x * rx/sx; 608 | x3 = sqrt(x+x); 609 | y3 = x3 * y1 - MOREBITS; 610 | z = pi/2 - x3 - y3 611 | */ 612 | 613 | // select a common x for all polynomials 614 | // This allows sharing of powers of x through common 
subexpression elimination 615 | x1 = select(big, 1.0 - xa, xa * xa); 616 | 617 | // calculate powers of x1 outside branches to make sure they are only calculated once 618 | x2 = x1 * x1; 619 | x4 = x2 * x2; 620 | x5 = x4 * x1; 621 | x3 = x2 * x1; 622 | 623 | dosmall = !horizontal_and(big); // at least one element is small 624 | dobig = horizontal_or(big); // at least one element is big 625 | 626 | // calculate polynomials (reuse powers of x) 627 | if (dosmall) { 628 | // px = polynomial_5 (x1, P0asin, P1asin, P2asin, P3asin, P4asin, P5asin); 629 | // qx = polynomial_5n(x1, Q0asin, Q1asin, Q2asin, Q3asin, Q4asin); 630 | px = mul_add(x3, P3asin, P0asin) + mul_add(x4, P4asin, x1*P1asin) + mul_add(x5, P5asin, x2*P2asin); 631 | qx = mul_add(x4, Q4asin, x5) + mul_add(x3, Q3asin, x1*Q1asin) + mul_add(x2, Q2asin, Q0asin); 632 | } 633 | if (dobig) { 634 | // rx = polynomial_4 (x1, R0asin, R1asin, R2asin, R3asin, R4asin); 635 | // sx = polynomial_4n(x1, S0asin, S1asin, S2asin, S3asin); 636 | rx = mul_add(x3, R3asin, x2*R2asin) + mul_add(x4, R4asin, mul_add(x1, R1asin, R0asin)); 637 | sx = mul_add(x3, S3asin, x4) + mul_add(x2, S2asin, mul_add(x1, S1asin, S0asin)); 638 | } 639 | 640 | // select and divide outside branches to avoid dividing twice 641 | vx = select(big, rx, px); 642 | wx = select(big, sx, qx); 643 | y1 = vx / wx * x1; 644 | 645 | // results for big 646 | if (dobig) { // avoid square root if all are small 647 | xb = sqrt(x1 + x1); // this produces NAN if xa > 1 so we don't need a special case for xa > 1 648 | z1 = mul_add(xb, y1, xb); // yb = xb * y1; z1 = xb + yb; 649 | } 650 | 651 | // results for small 652 | z2 = mul_add(xa, y1, xa); // z2 = xa * y1 + xa; 653 | 654 | // correct for sign 655 | if constexpr (AC == 1) { // acos 656 | z1 = select(x < 0., VM_PI - z1, z1); 657 | z2 = VM_PI_2 - sign_combine(z2, x); 658 | z = select(big, z1, z2); 659 | } 660 | else { // asin 661 | z1 = VM_PI_2 - z1; 662 | z = select(big, z1, z2); 663 | z = sign_combine(z, x); 664 | } 
665 | return z; 666 | } 667 | 668 | // instantiations of asin_d template: 669 | 670 | static inline Vec2d asin(Vec2d const x) { 671 | return asin_d(x); 672 | } 673 | 674 | static inline Vec2d acos(Vec2d const x) { 675 | return asin_d(x); 676 | } 677 | 678 | #if MAX_VECTOR_SIZE >= 256 679 | static inline Vec4d asin(Vec4d const x) { 680 | return asin_d(x); 681 | } 682 | 683 | static inline Vec4d acos(Vec4d const x) { 684 | return asin_d(x); 685 | } 686 | #endif // MAX_VECTOR_SIZE >= 256 687 | 688 | #if MAX_VECTOR_SIZE >= 512 689 | static inline Vec8d asin(Vec8d const x) { 690 | return asin_d(x); 691 | } 692 | 693 | static inline Vec8d acos(Vec8d const x) { 694 | return asin_d(x); 695 | } 696 | #endif // MAX_VECTOR_SIZE >= 512 697 | 698 | 699 | // ************************************************************* 700 | // asin/acos template, single precision 701 | // ************************************************************* 702 | // Template parameters: 703 | // VTYPE: f.p. vector type 704 | // AC: 0 = asin, 1 = acos 705 | // Parameters: 706 | // x = input x 707 | template 708 | static inline VTYPE asin_f(VTYPE const x) { 709 | 710 | // define constants 711 | const float P4asinf = 4.2163199048E-2f; 712 | const float P3asinf = 2.4181311049E-2f; 713 | const float P2asinf = 4.5470025998E-2f; 714 | const float P1asinf = 7.4953002686E-2f; 715 | const float P0asinf = 1.6666752422E-1f; 716 | 717 | VTYPE xa, x1, x2, x3, x4, xb, z, z1, z2; 718 | 719 | xa = abs(x); 720 | auto big = xa > 0.5f; // boolean vector 721 | 722 | x1 = 0.5f * (1.0f - xa); 723 | x2 = xa * xa; 724 | x3 = select(big, x1, x2); 725 | 726 | //if (horizontal_or(big)) 727 | { 728 | xb = sqrt(x1); 729 | } 730 | x4 = select(big, xb, xa); 731 | 732 | z = polynomial_4(x3, P0asinf, P1asinf, P2asinf, P3asinf, P4asinf); 733 | z = mul_add(z, x3*x4, x4); // z = z * (x3*x4) + x4; 734 | z1 = z + z; 735 | 736 | // correct for sign 737 | if constexpr (AC == 1) { // acos 738 | z1 = select(x < 0., float(VM_PI) - z1, z1); 739 | 
z2 = float(VM_PI_2) - sign_combine(z, x); 740 | z = select(big, z1, z2); 741 | } 742 | else { // asin 743 | z1 = float(VM_PI_2) - z1; 744 | z = select(big, z1, z); 745 | z = sign_combine(z, x); 746 | } 747 | 748 | return z; 749 | } 750 | 751 | // instantiations of asin_f template: 752 | 753 | static inline Vec4f asin(Vec4f const x) { 754 | return asin_f(x); 755 | } 756 | 757 | static inline Vec4f acos(Vec4f const x) { 758 | return asin_f(x); 759 | } 760 | 761 | #if MAX_VECTOR_SIZE >= 256 762 | static inline Vec8f asin(Vec8f const x) { 763 | return asin_f(x); 764 | } 765 | static inline Vec8f acos(Vec8f const x) { 766 | return asin_f(x); 767 | } 768 | #endif // MAX_VECTOR_SIZE >= 256 769 | 770 | #if MAX_VECTOR_SIZE >= 512 771 | static inline Vec16f asin(Vec16f const x) { 772 | return asin_f(x); 773 | } 774 | static inline Vec16f acos(Vec16f const x) { 775 | return asin_f(x); 776 | } 777 | #endif // MAX_VECTOR_SIZE >= 512 778 | 779 | 780 | // ************************************************************* 781 | // atan template, double precision 782 | // ************************************************************* 783 | // Template parameters: 784 | // VTYPE: f.p. vector type 785 | // T2: 0 = atan, 1 = atan2 786 | // Parameters: 787 | // y, x. 
calculate tan(y/x) 788 | // result is between -pi/2 and +pi/2 when x > 0 789 | // result is between -pi and -pi/2 or between pi/2 and pi when x < 0 for atan2 790 | template 791 | static inline VTYPE atan_d(VTYPE const y, VTYPE const x) { 792 | 793 | // define constants 794 | //const double ONEOPIO4 = 4./VM_PI; 795 | const double MOREBITS = 6.123233995736765886130E-17; 796 | const double MOREBITSO2 = MOREBITS * 0.5; 797 | const double T3PO8 = VM_SQRT2 + 1.; // 2.41421356237309504880; 798 | 799 | const double P4atan = -8.750608600031904122785E-1; 800 | const double P3atan = -1.615753718733365076637E1; 801 | const double P2atan = -7.500855792314704667340E1; 802 | const double P1atan = -1.228866684490136173410E2; 803 | const double P0atan = -6.485021904942025371773E1; 804 | 805 | const double Q4atan = 2.485846490142306297962E1; 806 | const double Q3atan = 1.650270098316988542046E2; 807 | const double Q2atan = 4.328810604912902668951E2; 808 | const double Q1atan = 4.853903996359136964868E2; 809 | const double Q0atan = 1.945506571482613964425E2; 810 | 811 | typedef decltype (x > x) BVTYPE; // boolean vector type 812 | VTYPE t, x1, x2, y1, y2, s, fac, a, b, z, zz, px, qx, re; // data vectors 813 | BVTYPE swapxy, notbig, notsmal; // boolean vectors 814 | 815 | if constexpr (T2 == 1) { // atan2(y,x) 816 | // move in first octant 817 | x1 = abs(x); 818 | y1 = abs(y); 819 | swapxy = (y1 > x1); 820 | // swap x and y if y1 > x1 821 | x2 = select(swapxy, y1, x1); 822 | y2 = select(swapxy, x1, y1); 823 | 824 | // check for special case: x and y are both +/- INF 825 | BVTYPE both_infinite = is_inf(x) & is_inf(y); // x and Y are both infinite 826 | if (horizontal_or(both_infinite)) { // at least one element has both infinite 827 | VTYPE mone = VTYPE(-1.0); 828 | x2 = select(both_infinite, x2 & mone, x2); // get 1.0 with the sign of x 829 | y2 = select(both_infinite, y2 & mone, y2); // get 1.0 with the sign of y 830 | } 831 | 832 | t = y2 / x2; // x = y = 0 gives NAN here 833 | } 
834 | else { // atan(y) 835 | t = abs(y); 836 | } 837 | 838 | // small: t < 0.66 839 | // medium: 0.66 <= t <= 2.4142 (1+sqrt(2)) 840 | // big: t > 2.4142 841 | notbig = t <= T3PO8; // t <= 2.4142 842 | notsmal = t >= 0.66; // t >= 0.66 843 | 844 | s = select(notbig, VTYPE(VM_PI_4), VTYPE(VM_PI_2)); 845 | s = notsmal & s; // select(notsmal, s, 0.); 846 | fac = select(notbig, VTYPE(MOREBITSO2), VTYPE(MOREBITS)); 847 | fac = notsmal & fac; //select(notsmal, fac, 0.); 848 | 849 | // small: z = t / 1.0; 850 | // medium: z = (t-1.0) / (t+1.0); 851 | // big: z = -1.0 / t; 852 | a = notbig & t; // select(notbig, t, 0.); 853 | a = if_add(notsmal, a, -1.); 854 | b = notbig & VTYPE(1.); // select(notbig, 1., 0.); 855 | b = if_add(notsmal, b, t); 856 | z = a / b; // division by 0 will not occur unless x and y are both 0 857 | 858 | zz = z * z; 859 | 860 | px = polynomial_4(zz, P0atan, P1atan, P2atan, P3atan, P4atan); 861 | qx = polynomial_5n(zz, Q0atan, Q1atan, Q2atan, Q3atan, Q4atan); 862 | 863 | re = mul_add(px / qx, z * zz, z); // re = (px / qx) * (z * zz) + z; 864 | re += s + fac; 865 | 866 | if constexpr (T2 == 1) { // atan2(y,x) 867 | // move back in place 868 | re = select(swapxy, VM_PI_2 - re, re); 869 | re = select((x | y) == 0., 0., re); // atan2(0,0) = 0 by convention 870 | re = select(sign_bit(x), VM_PI - re, re);// also for x = -0. 
871 | } 872 | // get sign bit 873 | re = sign_combine(re, y); 874 | 875 | return re; 876 | } 877 | 878 | // instantiations of atan_d template: 879 | 880 | static inline Vec2d atan2(Vec2d const y, Vec2d const x) { 881 | return atan_d(y, x); 882 | } 883 | 884 | static inline Vec2d atan(Vec2d const y) { 885 | return atan_d(y, 0.); 886 | } 887 | 888 | #if MAX_VECTOR_SIZE >= 256 889 | static inline Vec4d atan2(Vec4d const y, Vec4d const x) { 890 | return atan_d(y, x); 891 | } 892 | 893 | static inline Vec4d atan(Vec4d const y) { 894 | return atan_d(y, 0.); 895 | } 896 | #endif // MAX_VECTOR_SIZE >= 256 897 | 898 | #if MAX_VECTOR_SIZE >= 512 899 | static inline Vec8d atan2(Vec8d const y, Vec8d const x) { 900 | return atan_d(y, x); 901 | } 902 | 903 | static inline Vec8d atan(Vec8d const y) { 904 | return atan_d(y, 0.); 905 | } 906 | #endif // MAX_VECTOR_SIZE >= 512 907 | 908 | 909 | 910 | // ************************************************************* 911 | // atan template, single precision 912 | // ************************************************************* 913 | // Template parameters: 914 | // VTYPE: f.p. vector type 915 | // T2: 0 = atan, 1 = atan2 916 | // Parameters: 917 | // y, x. 
calculate tan(y/x) 918 | // result is between -pi/2 and +pi/2 when x > 0 919 | // result is between -pi and -pi/2 or between pi/2 and pi when x < 0 for atan2 920 | template 921 | static inline VTYPE atan_f(VTYPE const y, VTYPE const x) { 922 | 923 | // define constants 924 | const float P3atanf = 8.05374449538E-2f; 925 | const float P2atanf = -1.38776856032E-1f; 926 | const float P1atanf = 1.99777106478E-1f; 927 | const float P0atanf = -3.33329491539E-1f; 928 | 929 | typedef decltype (x > x) BVTYPE; // boolean vector type 930 | VTYPE t, x1, x2, y1, y2, s, a, b, z, zz, re;// data vectors 931 | BVTYPE swapxy, notbig, notsmal; // boolean vectors 932 | 933 | if constexpr (T2 == 1) { // atan2(y,x) 934 | // move in first octant 935 | x1 = abs(x); 936 | y1 = abs(y); 937 | swapxy = (y1 > x1); 938 | // swap x and y if y1 > x1 939 | x2 = select(swapxy, y1, x1); 940 | y2 = select(swapxy, x1, y1); 941 | 942 | // check for special case: x and y are both +/- INF 943 | BVTYPE both_infinite = is_inf(x) & is_inf(y); // x and Y are both infinite 944 | if (horizontal_or(both_infinite)) { // at least one element has both infinite 945 | VTYPE mone = VTYPE(-1.0f); 946 | x2 = select(both_infinite, x2 & mone, x2); // get 1.0 with the sign of x 947 | y2 = select(both_infinite, y2 & mone, y2); // get 1.0 with the sign of y 948 | } 949 | 950 | // x = y = 0 will produce NAN. 
No problem, fixed below 951 | t = y2 / x2; 952 | } 953 | else { // atan(y) 954 | t = abs(y); 955 | } 956 | 957 | // small: t < 0.4142 958 | // medium: 0.4142 <= t <= 2.4142 959 | // big: t > 2.4142 (not for atan2) 960 | if constexpr (T2 == 0) { // atan(y) 961 | notsmal = t >= float(VM_SQRT2 - 1.); // t >= tan pi/8 962 | notbig = t <= float(VM_SQRT2 + 1.); // t <= tan 3pi/8 963 | 964 | s = select(notbig, VTYPE(float(VM_PI_4)), VTYPE(float(VM_PI_2))); 965 | s = notsmal & s; // select(notsmal, s, 0.); 966 | 967 | // small: z = t / 1.0; 968 | // medium: z = (t-1.0) / (t+1.0); 969 | // big: z = -1.0 / t; 970 | a = notbig & t; // select(notbig, t, 0.); 971 | a = if_add(notsmal, a, -1.f); 972 | b = notbig & VTYPE(1.f); // select(notbig, 1., 0.); 973 | b = if_add(notsmal, b, t); 974 | z = a / b; // division by 0 will not occur unless x and y are both 0 975 | } 976 | else { // atan2(y,x) 977 | // small: z = t / 1.0; 978 | // medium: z = (t-1.0) / (t+1.0); 979 | notsmal = t >= float(VM_SQRT2 - 1.); 980 | a = if_add(notsmal, t, -1.f); 981 | b = if_add(notsmal, 1.f, t); 982 | s = notsmal & VTYPE(float(VM_PI_4)); 983 | z = a / b; 984 | } 985 | 986 | zz = z * z; 987 | 988 | // Taylor expansion 989 | re = polynomial_3(zz, P0atanf, P1atanf, P2atanf, P3atanf); 990 | re = mul_add(re, zz * z, z) + s; 991 | 992 | if constexpr (T2 == 1) { // atan2(y,x) 993 | // move back in place 994 | re = select(swapxy, float(VM_PI_2) - re, re); 995 | re = select((x | y) == 0.f, 0.f, re); // atan2(0,+0) = 0 by convention 996 | re = select(sign_bit(x), float(VM_PI) - re, re); // also for x = -0. 
997 | } 998 | // get sign bit 999 | re = sign_combine(re, y); 1000 | 1001 | return re; 1002 | } 1003 | 1004 | // instantiations of atan_f template: 1005 | 1006 | static inline Vec4f atan2(Vec4f const y, Vec4f const x) { 1007 | return atan_f(y, x); 1008 | } 1009 | 1010 | static inline Vec4f atan(Vec4f const y) { 1011 | return atan_f(y, 0.); 1012 | } 1013 | 1014 | #if MAX_VECTOR_SIZE >= 256 1015 | static inline Vec8f atan2(Vec8f const y, Vec8f const x) { 1016 | return atan_f(y, x); 1017 | } 1018 | 1019 | static inline Vec8f atan(Vec8f const y) { 1020 | return atan_f(y, 0.); 1021 | } 1022 | 1023 | #endif // MAX_VECTOR_SIZE >= 256 1024 | 1025 | #if MAX_VECTOR_SIZE >= 512 1026 | static inline Vec16f atan2(Vec16f const y, Vec16f const x) { 1027 | return atan_f(y, x); 1028 | } 1029 | 1030 | static inline Vec16f atan(Vec16f const y) { 1031 | return atan_f(y, 0.); 1032 | } 1033 | 1034 | #endif // MAX_VECTOR_SIZE >= 512 1035 | 1036 | #ifdef VCL_NAMESPACE 1037 | } 1038 | #endif 1039 | 1040 | #endif 1041 | --------------------------------------------------------------------------------