├── .github └── workflows │ └── cmake-multi-platform.yml ├── .gitignore ├── LICENSE ├── README.md ├── math_intrinsics.h └── tests ├── CMakeLists.txt ├── benchmark.c ├── greatest.h ├── math_intrinsics.c ├── sokol_time.h └── test.c /.github/workflows/cmake-multi-platform.yml: -------------------------------------------------------------------------------- 1 | name: C/C++ CI 2 | 3 | on: 4 | push: 5 | branches: [ "main" ] 6 | pull_request: 7 | branches: [ "main" ] 8 | 9 | jobs: 10 | build-ubuntu-clang: 11 | name: ubuntu-clang 12 | runs-on: ubuntu-latest 13 | 14 | steps: 15 | - uses: actions/checkout@v3 16 | 17 | - name: Configure CMake 18 | run: cmake ${{github.workspace}}/tests/ -DCMAKE_C_COMPILER=clang 19 | 20 | - name: Build 21 | run: cmake --build ${{github.workspace}}/ 22 | 23 | - name: Test precision 24 | working-directory: ${{github.workspace}}/ 25 | run: ./test_precision 26 | 27 | - name: Test fast 28 | working-directory: ${{github.workspace}}/ 29 | run: ./test_fast 30 | 31 | - name: Benchmark precision 32 | working-directory: ${{github.workspace}}/ 33 | run: ./benchmark_precision 34 | 35 | - name: Benchmark fast 36 | working-directory: ${{github.workspace}}/ 37 | run: ./benchmark_fast 38 | 39 | build-macos: 40 | name: macos 41 | runs-on: macos-latest 42 | 43 | steps: 44 | - uses: actions/checkout@v3 45 | 46 | - name: Configure CMake 47 | run: cmake ${{github.workspace}}/tests/ 48 | 49 | - name: Build 50 | run: cmake --build ${{github.workspace}}/ 51 | 52 | - name: Test precision 53 | working-directory: ${{github.workspace}}/ 54 | run: ./test_precision 55 | 56 | - name: Test fast 57 | working-directory: ${{github.workspace}}/ 58 | run: ./test_fast 59 | 60 | - name: Benchmark precision 61 | working-directory: ${{github.workspace}}/ 62 | run: ./benchmark_precision 63 | 64 | - name: Benchmark fast 65 | working-directory: ${{github.workspace}}/ 66 | run: ./benchmark_fast 67 | 68 | build-windows: 69 | name: windows 70 | runs-on: windows-latest 71 | 72 | steps: 73 | 
- uses: actions/checkout@v3 74 | 75 | - name: Configure CMake 76 | working-directory: ${{github.workspace}}\tests 77 | run: cmake . 78 | 79 | - name: Build 80 | working-directory: ${{github.workspace}}\tests 81 | run: cmake --build . 82 | 83 | - name: Test precision 84 | working-directory: ${{github.workspace}}\tests\Debug 85 | run: ./test_precision 86 | 87 | - name: Test fast 88 | working-directory: ${{github.workspace}}\tests\Debug 89 | run: ./test_fast 90 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Prerequisites 2 | *.d 3 | 4 | # Object files 5 | *.o 6 | *.ko 7 | *.obj 8 | *.elf 9 | 10 | # Linker output 11 | *.ilk 12 | *.map 13 | *.exp 14 | 15 | # Precompiled Headers 16 | *.gch 17 | *.pch 18 | 19 | # Libraries 20 | *.lib 21 | *.a 22 | *.la 23 | *.lo 24 | 25 | # Shared objects (inc. Windows DLLs) 26 | *.dll 27 | *.so 28 | *.so.* 29 | *.dylib 30 | 31 | # Executables 32 | *.exe 33 | *.out 34 | *.app 35 | *.i*86 36 | *.x86_64 37 | *.hex 38 | 39 | # Debug files 40 | *.dSYM/ 41 | *.su 42 | *.idb 43 | *.pdb 44 | 45 | # Kernel Module Compile Results 46 | *.mod* 47 | *.cmd 48 | .tmp_versions/ 49 | modules.order 50 | Module.symvers 51 | Mkfile.old 52 | dkms.conf 53 | 54 | # MacOS 55 | *.DS_Store 56 | 57 | # XCode 58 | xcuserdata/ 59 | xcshareddata/ 60 | 61 | # visual studio code 62 | *.code-workspace 63 | *.vscode 64 | 65 | # cmake 66 | CMakeLists.txt.user 67 | CMakeCache.txt 68 | CMakeFiles 69 | CMakeScripts 70 | Testing 71 | Makefile 72 | cmake_install.cmake 73 | install_manifest.txt 74 | compile_commands.json 75 | CTestTestfile.cmake 76 | _deps 77 | 78 | # executables 79 | tests/test_fast 80 | tests/test_precision 81 | tests/benchmark_fast 82 | tests/benchmark_precision -------------------------------------------------------------------------------- /LICENSE: 
-------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 Geolm 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # math_intrinsics 2 | One header file library that implement missing transcendental math functions (cos, sin, acos, and more....) using 100% AVX/Neon instructions (no branching) 3 | 4 | ### unit tests build status 5 | [![Build Status](https://github.com/geolm/math_intrinsics/actions/workflows/cmake-multi-platform.yml/badge.svg)](https://github.com/geolm/math_intrinsics/actions) 6 | 7 | # why 8 | AVX and Neon intrinsics don't provide transcendental math functions. 
Of course, there are already some libraries with those functions, but they are usually not free, restricted to one specific hardware platform, or have low precision. This library is super easy to integrate, with a precision close to the C math library (see below) and with an MIT license. 9 | 10 | # how to 11 | 12 | It's a one-header lib: just define the macro once in your project and include the header. 13 | 14 | ```C 15 | #define __MATH__INTRINSICS__IMPLEMENTATION__ 16 | #include "math_intrinsics.h" 17 | ``` 18 | 19 | On Intel/AMD computers, you need to compile with **-mavx2**. You can also add -mfma. 20 | On ARM-based computers nothing is required, as the lib targets AArch64. 21 | 22 | 23 | You can define this macro to generate faster albeit less precise functions (see below for more details) : 24 | ```C 25 | #define __MATH_INTRINSINCS_FAST__ 26 | ``` 27 | 28 | # functions 29 | 30 | ```C 31 | // max error : 5.960464478e-08 32 | __m256 mm256_cos_ps(__m256 a); 33 | 34 | // max error : 5.960464478e-08 35 | __m256 mm256_sin_ps(__m256 a); 36 | 37 | // max error : 5.960464478e-08 38 | void mm256_sincos_ps(__m256 a, __m256 *s, __m256 *c); 39 | 40 | // max error : 2.384185791e-07 41 | __m256 mm256_acos_ps(__m256 a); 42 | 43 | // max error : 1.192092896e-07 44 | __m256 mm256_asin_ps(__m256 a); 45 | 46 | // max error : 1.192092896e-07 47 | __m256 mm256_atan_ps(__m256 a); 48 | 49 | // max error : 2.384185791e-07 50 | __m256 mm256_atan2_ps(__m256 x, __m256 y); 51 | 52 | // max error : 9.107976950e-08 53 | __m256 mm256_log_ps(__m256 a); 54 | 55 | // max error : 2.349663504e-07 56 | __m256 mm256_log2_ps(__m256 x); 57 | 58 | // max error : 1.108270880e-07 59 | __m256 mm256_exp_ps(__m256 a); 60 | 61 | // max error : 1.042427087e-07 62 | __m256 mm256_exp2_ps(__m256 x); 63 | 64 | // max error : 1.184910232e-07 65 | __m256 mm256_cbrt_ps(__m256 a); 66 | 67 | // max error : 9.768706377e-07 68 | __m256 mm256_pow_ps(__m256 x, __m256 y); 69 | ``` 70 | 71 | Note : the same functions are defined in NEON 
intrinsics style : 72 | 73 | ```C 74 | // max error : 5.960464478e-08 75 | float32x4_t vcosq_f32(float32x4_t a); 76 | 77 | // max error : 5.960464478e-08 78 | float32x4_t vsinq_f32(float32x4_t a); 79 | 80 | // max error : 5.960464478e-08 81 | void vsincosq_f32(float32x4_t a, float32x4_t *s, float32x4_t *c); 82 | 83 | // max error : 2.384185791e-07 84 | float32x4_t vacosq_f32(float32x4_t a); 85 | 86 | // max error : 1.192092896e-07 87 | float32x4_t vasinq_f32(float32x4_t a); 88 | 89 | // max error : 1.192092896e-07 90 | float32x4_t vatanq_f32(float32x4_t a); 91 | 92 | // max error : 2.384185791e-07 93 | float32x4_t vatan2q_f32(float32x4_t x, float32x4_t y); 94 | 95 | // max error : 9.107976950e-08 96 | float32x4_t vlogq_f32(float32x4_t a); 97 | 98 | // max error : 2.349663504e-07 99 | float32x4_t vlog2q_f32(float32x4_t x); 100 | 101 | // max error : 1.108270880e-07 102 | float32x4_t vexpq_f32(float32x4_t a); 103 | 104 | // max error : 1.042427087e-07 105 | float32x4_t vexp2q_f32(float32x4_t a); 106 | 107 | // max error : 1.184910232e-07 108 | float32x4_t vcbrtq_f32(float32x4_t a); 109 | 110 | // max error : 9.768706377e-07 111 | float32x4_t vpowq_f32(float32x4_t x, float32x4_t y); 112 | 113 | ``` 114 | 115 | # fast functions 116 | 117 | If you use the macro \_\_MATH_INTRINSINCS_FAST\_\_ some functions will have a bit less precision but better performance: 118 | 119 | * sin, max_error : 2.682209015e-07 perf : ~1.5x 120 | * cos, max_error : 5.811452866e-07 perf : ~1.5x 121 | * acos, max_error : 6.520748138e-05 perf : ~1.6x 122 | * asin, max_error : 6.520736497e-05 perf : ~1.4x 123 | * exp2, max_error : 2.674510370e-06 perf : ~1.9x 124 | * pow, max_error : 8.886078831e-06 perf : ~1.9x 125 | 126 | Check the benchmark actions in the build system for more details. As you can see, the precision is still good with a noticeable performance boost. IMO most programs could use the fast version. 127 | 128 | # FAQ 129 | 130 | ## is it fast? 
131 | The goal of this library is to provide math functions with good precision, with every computation done in AVX/NEON. Performance is not the focus. 132 | 133 | Here are the benchmark results on my old Intel Core i7 from 2018 for 1 billion operations, compared against the C standard library. 134 | 135 | ```C 136 | benchmark : mode precision 137 | 138 | .mm256_acos_ps: 723.730 ms c std func: 5408.153 ms ratio: 7.47x 139 | .mm256_asin_ps: 692.439 ms c std func: 5419.091 ms ratio: 7.83x 140 | .mm256_atan_ps: 733.843 ms c std func: 3762.987 ms ratio: 5.13x 141 | .mm256_cbrt_ps: 1522.731 ms c std func: 19559.201 ms ratio: 12.84x 142 | .mm256_cos_ps: 882.112 ms c std func: 15540.117 ms ratio: 17.62x 143 | .mm256_sin_ps: 838.590 ms c std func: 15214.896 ms ratio: 18.14x 144 | .mm256_exp_ps: 830.130 ms c std func: 4399.218 ms ratio: 5.30x 145 | .mm256_exp2_ps: 1007.015 ms c std func: 2076.871 ms ratio: 2.06x 146 | .mm256_log_ps: 1019.277 ms c std func: 16832.281 ms ratio: 16.51x 147 | .mm256_log2_ps: 479.116 ms c std func: 3594.876 ms ratio: 7.50x 148 | ``` 149 | 150 | Don't forget : the function mm256_sincos_ps computes sine and cosine for the cost of one. You can also use the macro \_\_MATH_INTRINSINCS_FAST\_\_ 151 | 152 | ## why AVX2 ? 153 | 154 | In multiple functions this library uses a float as an int to have access to the mantissa and the exponent part. While it's doable with AVX1 using SSE4.2, I don't see the point of not using AVX2, which has been available on CPUs since 2013. 155 | 156 | ## does it handle all float cases (+inf, -inf, NAN) as the C math lib? 157 | 158 | Yes, all functions (except atan2 and pow) are compliant with +inf, -inf, NAN and other special cases (for example log(-4) == NAN). All based on the doc found here https://en.cppreference.com/w/ 159 | 160 | ## what's tested? 161 | 162 | The unit tests cover precision and special cases (inf, nan, ...). 
At the moment, the Neon version is not run on GitHub but rather manually on my M1 Pro machine, as I didn't have time to set up the emulator properly. 163 | 164 | # references 165 | 166 | [cephes math library](https://github.com/jeremybarnes/cephes/blob/master/single/) 167 | 168 | [simple SSE sin/cos](http://gruntthepeon.free.fr/ssemath/) 169 | 170 | [speeding up atan2f by 50x](https://mazzo.li/posts/vectorized-atan2.html) 171 | -------------------------------------------------------------------------------- /math_intrinsics.h: -------------------------------------------------------------------------------- 1 | #ifndef __MATH__INTRINSICS__H__ 2 | #define __MATH__INTRINSICS__H__ 3 | 4 | /* 5 | 6 | NEON/AVX transcendental math functions 7 | 8 | Documentation can be found at https://github.com/Geolm/math_intrinsics/ 9 | 10 | */ 11 | 12 | #ifdef __cplusplus 13 | extern "C" { 14 | #endif 15 | 16 | #if defined(__ARM_NEON) && defined(__ARM_NEON__) 17 | #include <arm_neon.h> 18 | 19 | // max error : 5.960464478e-08 20 | float32x4_t vcosq_f32(float32x4_t a); 21 | 22 | // max error : 5.960464478e-08 23 | float32x4_t vsinq_f32(float32x4_t a); 24 | 25 | // max error : 5.960464478e-08 26 | void vsincosq_f32(float32x4_t a, float32x4_t *s, float32x4_t *c); 27 | 28 | // max error : 2.384185791e-07 29 | float32x4_t vacosq_f32(float32x4_t a); 30 | 31 | // max error : 1.192092896e-07 32 | float32x4_t vasinq_f32(float32x4_t a); 33 | 34 | // max error : 6.699562073e-05 35 | float32x4_t vatanq_f32(float32x4_t a); 36 | 37 | // max error : 2.384185791e-07 38 | float32x4_t vatan2q_f32(float32x4_t x, float32x4_t y); 39 | 40 | // max error : 4.768371582e-07 41 | float32x4_t vlogq_f32(float32x4_t a); 42 | 43 | // max error : 2.349663504e-07 44 | float32x4_t vlog2q_f32(float32x4_t x); 45 | 46 | // max error : 1.108270880e-07 47 | float32x4_t vexpq_f32(float32x4_t a); 48 | 49 | // max error : 1.042427087e-07 50 | float32x4_t vexp2q_f32(float32x4_t a); 51 | 52 | // max error : 4.768371582e-07 53 | float32x4_t 
vcbrtq_f32(float32x4_t a); 54 | 55 | // max error : 9.768706377e-07 56 | float32x4_t vpowq_f32(float32x4_t x, float32x4_t y); 57 | 58 | #define __MATH__INTRINSICS__NEON__ 59 | 60 | #else 61 | #include 62 | 63 | // max error : 5.960464478e-08 64 | __m256 mm256_cos_ps(__m256 a); 65 | 66 | // max error : 5.960464478e-08 67 | __m256 mm256_sin_ps(__m256 a); 68 | 69 | // max error : 5.960464478e-08 70 | void mm256_sincos_ps(__m256 a, __m256 *s, __m256 *c); 71 | 72 | // max error : 2.384185791e-07 73 | __m256 mm256_acos_ps(__m256 a); 74 | 75 | // max error : 1.192092896e-07 76 | __m256 mm256_asin_ps(__m256 a); 77 | 78 | // max error : 6.699562073e-05 79 | __m256 mm256_atan_ps(__m256 a); 80 | 81 | // max error : 2.384185791e-07 82 | __m256 mm256_atan2_ps(__m256 x, __m256 y); 83 | 84 | // max error : 4.768371582e-07 85 | __m256 mm256_log_ps(__m256 a); 86 | 87 | // max error : 2.349663504e-07 88 | __m256 mm256_log2_ps(__m256 x); 89 | 90 | // max error : 1.108270880e-07 91 | __m256 mm256_exp_ps(__m256 a); 92 | 93 | // max error : 1.042427087e-07 94 | __m256 mm256_exp2_ps(__m256 x); 95 | 96 | // max error : 4.768371582e-07 97 | __m256 mm256_cbrt_ps(__m256 a); 98 | 99 | // max error : 9.768706377e-07 100 | __m256 mm256_pow_ps(__m256 x, __m256 y); 101 | 102 | #define __MATH__INTRINSICS__AVX__ 103 | 104 | #endif 105 | 106 | #ifdef __cplusplus 107 | } 108 | #endif 109 | 110 | #endif 111 | 112 | 113 | #ifdef __MATH__INTRINSICS__IMPLEMENTATION__ 114 | 115 | #define SIMD_MATH_TAU (6.28318530f) 116 | #define SIMD_MATH_PI (3.14159265f) 117 | #define SIMD_MATH_PI2 (1.57079632f) 118 | #define SIMD_MATH_PI4 (0.78539816f) 119 | 120 | #if defined(__ARM_NEON) && defined(__ARM_NEON__) 121 | typedef float32x4_t simd_vector; 122 | 123 | static inline simd_vector simd_add(simd_vector a, simd_vector b) {return vaddq_f32(a, b);} 124 | static inline simd_vector simd_sub(simd_vector a, simd_vector b) {return vsubq_f32(a, b);} 125 | static inline simd_vector simd_mul(simd_vector a, simd_vector b) 
{return vmulq_f32(a, b);} 126 | static inline simd_vector simd_div(simd_vector a, simd_vector b) {return vdivq_f32(a, b);} 127 | static inline simd_vector simd_abs(simd_vector a) {return vabsq_f32(a);} 128 | static inline simd_vector simd_fmad(simd_vector a, simd_vector b, simd_vector c) {return vfmaq_f32(c, a, b);} 129 | static inline simd_vector simd_or(simd_vector a, simd_vector b) {return vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(a), vreinterpretq_u32_f32(b)));} 130 | static inline simd_vector simd_xor(simd_vector a, simd_vector b) {return vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(a), vreinterpretq_u32_f32(b)));} 131 | static inline simd_vector simd_and(simd_vector a, simd_vector b) {return vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(a), vreinterpretq_u32_f32(b)));} 132 | static inline simd_vector simd_andnot(simd_vector a, simd_vector b) {return vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(a), vreinterpretq_u32_f32(b)));} 133 | static inline simd_vector simd_min(simd_vector a, simd_vector b) {return vminq_f32(a, b);} 134 | static inline simd_vector simd_max(simd_vector a, simd_vector b) {return vmaxq_f32(a, b);} 135 | static inline simd_vector simd_cmp_gt(simd_vector a, simd_vector b) {return vreinterpretq_f32_u32(vcgtq_f32(a, b));} 136 | static inline simd_vector simd_cmp_ge(simd_vector a, simd_vector b) {return vreinterpretq_f32_u32(vcgeq_f32(a, b));} 137 | static inline simd_vector simd_cmp_lt(simd_vector a, simd_vector b) {return vreinterpretq_f32_u32(vcltq_f32(a, b));} 138 | static inline simd_vector simd_cmp_le(simd_vector a, simd_vector b) {return vreinterpretq_f32_u32(vcleq_f32(a, b));} 139 | static inline simd_vector simd_cmp_eq(simd_vector a, simd_vector b) {return vreinterpretq_f32_u32(vceqq_f32(a, b));} 140 | static inline simd_vector simd_cmp_neq(simd_vector a, simd_vector b) {return vreinterpretq_f32_u32(vmvnq_u32(vceqq_f32(a, b)));} 141 | static inline simd_vector simd_isnan(simd_vector a) {return 
simd_cmp_neq(a, a);} 142 | static inline simd_vector simd_select(simd_vector a, simd_vector b, simd_vector mask) {return vbslq_f32(vreinterpretq_u32_f32(mask), b, a);} 143 | static inline simd_vector simd_splat(float value) {return vdupq_n_f32(value);} 144 | static inline simd_vector simd_splat_zero(void) {return vdupq_n_f32(0);} 145 | static inline simd_vector simd_splat_positive_infinity(void) {return vreinterpretq_u32_f32(vdupq_n_u32(0x7f800000));} 146 | static inline simd_vector simd_splat_negative_infinity(void) {return vreinterpretq_u32_f32(vdupq_n_u32(0xff800000));} 147 | static inline simd_vector simd_sign_mask(void) {return vreinterpretq_u32_f32(vdupq_n_u32(0x80000000));} 148 | static inline simd_vector simd_inv_sign_mask(void) {return vreinterpretq_u32_f32(vdupq_n_u32(~0x80000000));} 149 | static inline simd_vector simd_abs_mask(void) {return vreinterpretq_u32_f32(vdupq_n_u32(0x7FFFFFFF));} 150 | static inline simd_vector simd_min_normalized(void) {return vreinterpretq_u32_f32(vdupq_n_u32(0x00800000));} // the smallest non denormalized float number 151 | static inline simd_vector simd_inv_mant_mask(void){return vreinterpretq_u32_f32(vdupq_n_u32(~0x7f800000));} 152 | static inline simd_vector simd_floor(simd_vector a) {return vrndmq_f32(a);} 153 | static inline simd_vector simd_round(simd_vector a) {return vrndnq_f32(a);} 154 | static inline simd_vector simd_neg(simd_vector a) {return vnegq_f32(a);} 155 | static inline simd_vector simd_sqrt(simd_vector a) {return vsqrtq_f32(a);} 156 | 157 | typedef int32x4_t simd_vectori; 158 | static inline simd_vectori simd_convert_from_float(simd_vector a) {return vcvtq_s32_f32(a);} 159 | static inline simd_vectori simd_cast_from_float(simd_vector a) {return vreinterpretq_s32_f32(a);} 160 | static inline simd_vector simd_convert_from_int(simd_vectori a) {return vcvtq_f32_s32(a);} 161 | static inline simd_vector simd_cast_from_int(simd_vectori a) {return vreinterpretq_f32_s32(a);} 162 | static inline simd_vectori 
simd_add_i(simd_vectori a, simd_vectori b) {return vaddq_s32(a, b);} 163 | static inline simd_vectori simd_sub_i(simd_vectori a, simd_vectori b) {return vsubq_s32(a, b);} 164 | static inline simd_vector simd_mul_i(simd_vector a, simd_vector b) {return vmulq_s32(a, b);} 165 | static inline simd_vectori simd_splat_i(int i) {return vdupq_n_s32(i);} 166 | static inline simd_vectori simd_splat_zero_i(void) {return vdupq_n_s32(0);} 167 | static inline simd_vectori simd_shift_left_i(simd_vectori a, int i) {return vshlq_s32(a, vdupq_n_s32(i));} 168 | static inline simd_vectori simd_shift_right_i(simd_vectori a, int i) {return vshlq_s32(a, vdupq_n_s32(-i));} 169 | static inline simd_vectori simd_and_i(simd_vectori a, simd_vectori b) {return vandq_s32(a, b);} 170 | static inline simd_vectori simd_or_i(simd_vectori a, simd_vectori b) {return vorrq_s32(a, b);} 171 | static inline simd_vectori simd_andnot_i(simd_vectori a, simd_vectori b) {return vbicq_s32(a, b);} 172 | static inline simd_vectori simd_cmp_eq_i(simd_vectori a, simd_vectori b) {return vceqq_s32(a, b);} 173 | static inline simd_vectori simd_cmp_gt_i(simd_vectori a, simd_vectori b) {return vcgtq_s32(a, b);} 174 | static inline simd_vectori simd_abs_i(simd_vectori a) {return vabsq_s32(a);} 175 | 176 | #define simd_asin vasinq_f32 177 | #define simd_atan vatanq_f32 178 | #define simd_sincos vsincosq_f32 179 | #define simd_sin vsinq_f32 180 | #define simd_log vlogq_f32 181 | #define simd_exp vexpq_f32 182 | #define simd_log2 vlog2q_f32 183 | #define simd_exp2 vexp2q_f32 184 | 185 | #else 186 | typedef __m256 simd_vector; 187 | 188 | static inline simd_vector simd_add(simd_vector a, simd_vector b) {return _mm256_add_ps(a, b);} 189 | static inline simd_vector simd_sub(simd_vector a, simd_vector b) {return _mm256_sub_ps(a, b);} 190 | static inline simd_vector simd_mul(simd_vector a, simd_vector b) {return _mm256_mul_ps(a, b);} 191 | static inline simd_vector simd_div(simd_vector a, simd_vector b) {return _mm256_div_ps(a, 
b);} 192 | static inline simd_vector simd_abs_mask(void) {return _mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF));} 193 | static inline simd_vector simd_abs(simd_vector a) {return _mm256_and_ps(a, simd_abs_mask());} 194 | static inline simd_vector simd_fmad(simd_vector a, simd_vector b, simd_vector c) 195 | { 196 | #ifdef __FMA__ 197 | return _mm256_fmadd_ps(a, b, c); 198 | #else 199 | return _mm256_add_ps(_mm256_mul_ps(a, b), c); 200 | #endif 201 | } 202 | static inline simd_vector simd_or(simd_vector a, simd_vector b) {return _mm256_or_ps(a, b);} 203 | static inline simd_vector simd_and(simd_vector a, simd_vector b) {return _mm256_and_ps(a, b);} 204 | static inline simd_vector simd_andnot(simd_vector a, simd_vector b) {return _mm256_andnot_ps(b, a);} 205 | static inline simd_vector simd_xor(simd_vector a, simd_vector b) {return _mm256_xor_ps(a, b);} 206 | static inline simd_vector simd_min(simd_vector a, simd_vector b) {return _mm256_min_ps(a, b);} 207 | static inline simd_vector simd_max(simd_vector a, simd_vector b) {return _mm256_max_ps(a, b);} 208 | static inline simd_vector simd_select(simd_vector a, simd_vector b, simd_vector mask) {return _mm256_blendv_ps(a, b, mask);} 209 | static inline simd_vector simd_splat(float value) {return _mm256_set1_ps(value);} 210 | static inline simd_vector simd_splat_zero(void) {return _mm256_setzero_ps();} 211 | static inline simd_vector simd_splat_positive_infinity(void) {return _mm256_castsi256_ps(_mm256_set1_epi32(0x7f800000));} 212 | static inline simd_vector simd_splat_negative_infinity(void) {return _mm256_castsi256_ps(_mm256_set1_epi32(0xff800000));} 213 | static inline simd_vector simd_sign_mask(void) {return _mm256_castsi256_ps(_mm256_set1_epi32(0x80000000));} 214 | static inline simd_vector simd_inv_sign_mask(void) {return _mm256_castsi256_ps(_mm256_set1_epi32(~0x80000000));} 215 | static inline simd_vector simd_min_normalized(void) {return _mm256_castsi256_ps(_mm256_set1_epi32(0x00800000));} // the smallest non 
denormalized float number 216 | static inline simd_vector simd_inv_mant_mask(void){return _mm256_castsi256_ps(_mm256_set1_epi32(~0x7f800000));} 217 | static inline simd_vector simd_floor(simd_vector a) {return _mm256_floor_ps(a);} 218 | static inline simd_vector simd_round(simd_vector a) {return _mm256_round_ps(a, _MM_FROUND_NINT);} 219 | static inline simd_vector simd_cmp_gt(simd_vector a, simd_vector b) {return _mm256_cmp_ps(a, b, _CMP_GT_OQ);} 220 | static inline simd_vector simd_cmp_ge(simd_vector a, simd_vector b) {return _mm256_cmp_ps(a, b, _CMP_GE_OQ);} 221 | static inline simd_vector simd_cmp_lt(simd_vector a, simd_vector b) {return _mm256_cmp_ps(a, b, _CMP_LT_OQ);} 222 | static inline simd_vector simd_cmp_le(simd_vector a, simd_vector b) {return _mm256_cmp_ps(a, b, _CMP_LE_OQ);} 223 | static inline simd_vector simd_cmp_eq(simd_vector a, simd_vector b) {return _mm256_cmp_ps(a, b, _CMP_EQ_OQ);} 224 | static inline simd_vector simd_cmp_neq(simd_vector a, simd_vector b) {return _mm256_cmp_ps(a, b, _CMP_NEQ_OQ);} 225 | static inline simd_vector simd_isnan(simd_vector a) {return _mm256_cmp_ps(a, a, _CMP_NEQ_UQ);} 226 | static inline simd_vector simd_sqrt(simd_vector a) {return _mm256_sqrt_ps(a);} 227 | static inline simd_vector simd_neg(simd_vector a) {return _mm256_xor_ps(a, simd_sign_mask());} 228 | 229 | typedef __m256i simd_vectori; 230 | static inline simd_vectori simd_convert_from_float(simd_vector a) {return _mm256_cvttps_epi32(a);} 231 | static inline simd_vectori simd_cast_from_float(simd_vector a) {return _mm256_castps_si256(a);} 232 | static inline simd_vector simd_convert_from_int(simd_vectori a) {return _mm256_cvtepi32_ps(a);} 233 | static inline simd_vector simd_cast_from_int(simd_vectori a) {return _mm256_castsi256_ps(a);} 234 | static inline simd_vectori simd_add_i(simd_vectori a, simd_vectori b) {return _mm256_add_epi32(a, b);} 235 | static inline simd_vectori simd_sub_i(simd_vectori a, simd_vectori b) {return _mm256_sub_epi32(a, b);} 236 | 
static inline simd_vectori simd_mul_i(simd_vectori a, simd_vectori b) {return _mm256_mullo_epi32(a, b);} 237 | static inline simd_vectori simd_splat_i(int i) {return _mm256_set1_epi32(i);} 238 | static inline simd_vectori simd_splat_zero_i(void) {return _mm256_setzero_si256();} 239 | static inline simd_vectori simd_shift_left_i(simd_vectori a, int i) {return _mm256_slli_epi32(a, i);} 240 | static inline simd_vectori simd_shift_right_i(simd_vectori a, int i) {return _mm256_srai_epi32(a, i);} 241 | static inline simd_vectori simd_and_i(simd_vectori a, simd_vectori b) {return _mm256_and_si256(a, b);} 242 | static inline simd_vectori simd_or_i(simd_vectori a, simd_vectori b) {return _mm256_or_si256(a, b);} 243 | static inline simd_vectori simd_abs_i(simd_vectori a) {return _mm256_abs_epi32(a);} 244 | static inline simd_vectori simd_andnot_i(simd_vectori a, simd_vectori b) {return _mm256_andnot_si256(b, a);} 245 | static inline simd_vectori simd_cmp_eq_i(simd_vectori a, simd_vectori b) {return _mm256_cmpeq_epi32(a, b);} 246 | static inline simd_vectori simd_cmp_gt_i(simd_vectori a, simd_vectori b) {return _mm256_cmpgt_epi32(a, b);} 247 | 248 | 249 | #define simd_asin mm256_asin_ps 250 | #define simd_atan mm256_atan_ps 251 | #define simd_sincos mm256_sincos_ps 252 | #define simd_sin mm256_sin_ps 253 | #define simd_exp mm256_exp_ps 254 | #define simd_log mm256_log_ps 255 | #define simd_exp2 mm256_exp2_ps 256 | #define simd_log2 mm256_log2_ps 257 | 258 | #endif 259 | 260 | //---------------------------------------------------------------------------------------------------------------------- 261 | static inline simd_vector simd_frexp(simd_vector x, simd_vectori* exponent) 262 | { 263 | simd_vectori cast_float = simd_cast_from_float(x); 264 | simd_vectori e = simd_and_i(simd_shift_right_i(cast_float, 23), simd_splat_i(0xff));; 265 | simd_vectori equal_to_zero = simd_and_i(simd_cmp_eq_i(e, simd_splat_zero_i()), simd_cast_from_float(simd_cmp_eq(x, simd_splat_zero()))); 266 | 
*exponent = simd_andnot_i(simd_sub_i(e, simd_splat_i(0x7e)), equal_to_zero); 267 | cast_float = simd_and_i(cast_float, simd_splat_i(0x807fffff)); 268 | cast_float = simd_or_i(cast_float, simd_splat_i(0x3f000000)); 269 | return simd_select(simd_cast_from_int(cast_float), x, simd_cast_from_int(equal_to_zero)); 270 | } 271 | 272 | //---------------------------------------------------------------------------------------------------------------------- 273 | static inline simd_vector simd_ldexp(simd_vector x, simd_vectori pw2) 274 | { 275 | simd_vectori fl = simd_cast_from_float(x); 276 | simd_vectori e = simd_and_i(simd_shift_right_i(fl, 23), simd_splat_i(0xff)); 277 | e = simd_and_i(simd_add_i(e, pw2), simd_splat_i(0xff)); 278 | simd_vectori is_infinity = simd_cmp_eq_i(e, simd_splat_i(0xff)); 279 | fl = simd_or_i(simd_andnot_i(fl, is_infinity), simd_and_i(fl, simd_splat_i(0xFF800000))); 280 | fl = simd_or_i(simd_shift_left_i(e, 23), simd_and_i(fl, simd_splat_i(0x807fffff))); 281 | simd_vector equal_to_zero = simd_cmp_eq(x, simd_splat_zero()); 282 | return simd_andnot(simd_cast_from_int(fl), equal_to_zero); 283 | } 284 | 285 | //---------------------------------------------------------------------------------------------------------------------- 286 | static inline simd_vector simd_polynomial4(simd_vector x, float* coefficients) 287 | { 288 | simd_vector result = simd_fmad(x, simd_splat(coefficients[0]), simd_splat(coefficients[1])); 289 | result = simd_fmad(x, result, simd_splat(coefficients[2])); 290 | result = simd_fmad(x, result, simd_splat(coefficients[3])); 291 | return result; 292 | } 293 | 294 | //---------------------------------------------------------------------------------------------------------------------- 295 | static inline simd_vector simd_polynomial5(simd_vector x, float* coefficients) 296 | { 297 | simd_vector result = simd_polynomial4(x, coefficients); 298 | result = simd_fmad(x, result, simd_splat(coefficients[4])); 299 | return result; 300 | } 
301 | 302 | //---------------------------------------------------------------------------------------------------------------------- 303 | static inline simd_vector simd_polynomial6(simd_vector x, float* coefficients) 304 | { 305 | simd_vector result = simd_polynomial5(x, coefficients); 306 | result = simd_fmad(x, result, simd_splat(coefficients[5])); 307 | return result; 308 | } 309 | 310 | //---------------------------------------------------------------------------------------------------------------------- 311 | static inline simd_vector simd_clamp(simd_vector a, simd_vector range_min, simd_vector range_max) 312 | { 313 | return simd_max(simd_min(a, range_max), range_min); 314 | } 315 | 316 | //---------------------------------------------------------------------------------------------------------------------- 317 | static inline simd_vector simd_sign(simd_vector a) 318 | { 319 | simd_vector result = simd_select(simd_splat_zero(), simd_splat(-1.f), simd_cmp_lt(a, simd_splat_zero())); 320 | return simd_select(result, simd_splat( 1.f), simd_cmp_gt(a, simd_splat_zero())); 321 | } 322 | 323 | static inline simd_vectori simd_select_i(simd_vectori a, simd_vectori b, simd_vectori mask) { return simd_or_i(simd_andnot_i(a, mask), simd_and_i(b, mask));} 324 | static inline simd_vectori simd_neg_i(simd_vectori a){return simd_sub_i(simd_splat_zero_i(), a);} 325 | 326 | 327 | //---------------------------------------------------------------------------------------------------------------------- 328 | // based on http://gruntthepeon.free.fr/ssemath/ 329 | #ifdef __MATH__INTRINSICS__NEON__ 330 | float32x4_t vlogq_f32(float32x4_t x) 331 | #else 332 | __m256 mm256_log_ps(__m256 x) 333 | #endif 334 | { 335 | simd_vector one = simd_splat(1.f); 336 | simd_vector invalid_mask = simd_cmp_le(x, simd_splat_zero()); 337 | invalid_mask = simd_or(invalid_mask, simd_isnan(x)); 338 | simd_vector input_is_zero = simd_cmp_eq(x, simd_splat_zero()); 339 | simd_vector input_is_infinity = 
simd_cmp_eq(x, simd_splat_positive_infinity()); 340 | 341 | x = simd_max(x, simd_min_normalized()); // cut off denormalized stuff 342 | 343 | simd_vectori emm0 = simd_shift_right_i(simd_cast_from_float(x), 23); 344 | emm0 = simd_sub_i(emm0, simd_splat_i(0x7f)); 345 | simd_vector e = simd_convert_from_int(emm0); 346 | 347 | // keep only the fractional part 348 | x = simd_and(x, simd_inv_mant_mask()); 349 | x = simd_or(x, simd_splat(0.5f)); 350 | 351 | e = simd_add(e, one); 352 | simd_vector mask = simd_cmp_lt(x, simd_splat(0.707106781186547524f)); 353 | simd_vector tmp = simd_and(x, mask); 354 | x = simd_sub(x, one); 355 | e = simd_sub(e, simd_and(one, mask)); 356 | x = simd_add(x, tmp); 357 | 358 | simd_vector z = simd_mul(x,x); 359 | simd_vector y = simd_splat(7.0376836292E-2f); 360 | y = simd_fmad(y, x, simd_splat(-1.1514610310E-1f)); 361 | y = simd_fmad(y, x, simd_splat(1.1676998740E-1f)); 362 | y = simd_fmad(y, x, simd_splat(-1.2420140846E-1f)); 363 | y = simd_fmad(y, x, simd_splat(+1.4249322787E-1f)); 364 | y = simd_fmad(y, x, simd_splat(-1.6668057665E-1f)); 365 | y = simd_fmad(y, x, simd_splat(+2.0000714765E-1f)); 366 | y = simd_fmad(y, x, simd_splat(-2.4999993993E-1f)); 367 | y = simd_fmad(y, x, simd_splat(+3.3333331174E-1f)); 368 | y = simd_mul(y, x); 369 | y = simd_mul(y, z); 370 | 371 | tmp = simd_mul(e, simd_splat(-2.12194440e-4f)); 372 | y = simd_add(y, tmp); 373 | 374 | tmp = simd_mul(z, simd_splat(0.5f)); 375 | y = simd_sub(y, tmp); 376 | 377 | tmp = simd_mul(e, simd_splat(0.693359375f)); 378 | x = simd_add(x, y); 379 | x = simd_add(x, tmp); 380 | x = simd_or(x, invalid_mask); // NAN/negative arg will be NAN 381 | x = simd_select(x, simd_splat_negative_infinity(), input_is_zero); // zero arg will be -inf 382 | x = simd_select(x, simd_splat_positive_infinity(), input_is_infinity); // +inf arg will be +inf 383 | 384 | return x; 385 | } 386 | 387 | 
//---------------------------------------------------------------------------------------------------------------------- 388 | // based on https://github.com/redorav/hlslpp/blob/master/include/hlsl%2B%2B_vector_float8.h 389 | #ifdef __MATH__INTRINSICS__NEON__ 390 | float32x4_t vlog2q_f32(float32x4_t x) 391 | #else 392 | __m256 mm256_log2_ps(__m256 x) 393 | #endif 394 | { 395 | simd_vector invalid_mask = simd_cmp_le(x, simd_splat_zero()); 396 | invalid_mask = simd_or(invalid_mask, simd_isnan(x)); 397 | simd_vector input_is_zero = simd_cmp_eq(x, simd_splat_zero()); 398 | simd_vector input_is_infinity = simd_cmp_eq(x, simd_splat_positive_infinity()); 399 | simd_vector one = simd_splat(1.f); 400 | simd_vectori exp = simd_splat_i(0x7f800000); 401 | simd_vectori mant = simd_splat_i(0x007fffff); 402 | simd_vectori i = simd_cast_from_float(x); 403 | simd_vector e = simd_convert_from_int(simd_sub_i(simd_shift_right_i(simd_and_i(i, exp), 23), simd_splat_i(127))); 404 | simd_vector m = simd_or(simd_cast_from_int(simd_and_i(i, mant)), one); 405 | 406 | // minimax polynomial fit of log2(x)/(x - 1), for x in range [1, 2[ 407 | simd_vector p = simd_polynomial6(m, (float[]){-3.4436006e-2f, 3.1821337e-1f, -1.2315303f, 2.5988452f, -3.3241990f, 3.1157899f}); 408 | 409 | // this effectively increases the polynomial degree by one, but ensures that log2(1) == 0 410 | p = simd_mul(p, simd_sub(m, one)); 411 | simd_vector result = simd_add(p, e); 412 | 413 | result = simd_or(result, invalid_mask); // NAN/negative arg will be NAN 414 | result = simd_select(result, simd_splat_negative_infinity(), input_is_zero); // zero arg will be -inf 415 | result = simd_select(result, simd_splat_positive_infinity(), input_is_infinity); // +inf arg will be +inf 416 | 417 | return result; 418 | } 419 | 420 | //---------------------------------------------------------------------------------------------------------------------- 421 | // based on http://gruntthepeon.free.fr/ssemath/ 422 | #ifdef 
__MATH__INTRINSICS__NEON__ 423 | float32x4_t vexpq_f32(float32x4_t x) 424 | #else 425 | __m256 mm256_exp_ps(__m256 x) 426 | #endif 427 | { 428 | simd_vector invalid_mask = simd_isnan(x); 429 | simd_vector input_is_infinity = simd_cmp_eq(x, simd_splat_positive_infinity()); 430 | simd_vector tmp = simd_splat_zero(); 431 | simd_vector fx; 432 | simd_vector one = simd_splat(1.f); 433 | 434 | x = simd_min(x, simd_splat(88.3762626647949f)); 435 | x = simd_max(x, simd_splat(-88.3762626647949f)); 436 | 437 | // express exp(x) as exp(g + n*log(2)) 438 | fx = simd_fmad(x, simd_splat(1.44269504088896341f), simd_splat(0.5f)); 439 | tmp = simd_floor(fx); 440 | 441 | // if greater, substract 1 442 | simd_vector mask = simd_cmp_gt(tmp, fx); 443 | mask = simd_and(mask, one); 444 | fx = simd_sub(tmp, mask); 445 | 446 | tmp = simd_mul(fx, simd_splat(0.693359375f)); 447 | simd_vector z = simd_mul(fx, simd_splat(-2.12194440e-4f)); 448 | x = simd_sub(x, tmp); 449 | x = simd_sub(x, z); 450 | z = simd_mul(x, x); 451 | simd_vector y = simd_polynomial6(x, (float[]) {1.9875691500E-4f, 1.3981999507E-3f, 8.3334519073E-3f, 452 | 4.1665795894E-2f, 1.6666665459E-1f, 5.0000001201E-1f}); 453 | y = simd_fmad(y, z, x); 454 | y = simd_add(y, one); 455 | 456 | simd_vectori emm0 = simd_convert_from_float(fx); 457 | emm0 = simd_add_i(emm0, simd_splat_i(0x7f)); 458 | emm0 = simd_shift_left_i(emm0, 23); 459 | simd_vector pow2n = simd_cast_from_int(emm0); 460 | 461 | simd_vector result = simd_mul(y, pow2n); 462 | result = simd_or(result, invalid_mask); 463 | result = simd_select(result, simd_splat_positive_infinity(), input_is_infinity); // +inf arg will be +inf 464 | 465 | return result; 466 | } 467 | 468 | //---------------------------------------------------------------------------------------------------------------------- 469 | // based on https://github.com/jeremybarnes/cephes/blob/master/single/exp2f.c 470 | #ifdef __MATH__INTRINSICS__NEON__ 471 | float32x4_t vexp2q_f32(float32x4_t x) 472 | #else 
473 | __m256 mm256_exp2_ps(__m256 x) 474 | #endif 475 | { 476 | simd_vector invalid_mask = simd_isnan(x); 477 | simd_vector input_is_infinity = simd_cmp_eq(x, simd_splat_positive_infinity()); 478 | simd_vector equal_to_zero = simd_cmp_eq(x, simd_splat_zero()); 479 | simd_vector one = simd_splat(1.f); 480 | 481 | // clamp values 482 | x = simd_clamp(x, simd_splat(-127.f), simd_splat(127.f)); 483 | 484 | #ifdef __MATH_INTRINSINCS_FAST__ 485 | simd_vector ipart = simd_floor(x); 486 | simd_vector fpart = simd_sub(x, ipart); 487 | 488 | simd_vectori i = simd_shift_left_i(simd_add_i(simd_convert_from_float(ipart), simd_splat_i(127)), 23); 489 | simd_vector expipart = simd_cast_from_int(i); 490 | 491 | // minimax polynomial fit of 2^x, in range [-0.5, 0.5[ 492 | simd_vector expfpart = simd_polynomial5(fpart, (float[]) {1.3534167e-2f, 5.2011464e-2f, 2.4144275e-1f, 6.9300383e-1f, 1.0000026f}); 493 | simd_vector result = simd_mul(expipart, expfpart); 494 | #else 495 | simd_vector px = simd_floor(x); 496 | simd_vectori i0 = simd_convert_from_float(px); 497 | x = simd_sub(x, px); 498 | 499 | simd_vector above_half = simd_cmp_gt(x, simd_splat(.5f)); 500 | i0 = simd_select_i(i0, simd_add_i(i0, simd_splat_i(1)), simd_cast_from_float(above_half)); 501 | x = simd_select(x, simd_sub(x, one), above_half); 502 | 503 | px = simd_polynomial6(x, (float[]) {1.535336188319500E-004f, 1.339887440266574E-003f, 9.618437357674640E-003f, 504 | 5.550332471162809E-002f, 2.402264791363012E-001f, 6.931472028550421E-001f}); 505 | px = simd_fmad(px, x, one); 506 | simd_vector result = simd_ldexp(px, i0); 507 | #endif 508 | 509 | result = simd_select(result, one, equal_to_zero); 510 | result = simd_or(result, invalid_mask); 511 | result = simd_select(result, simd_splat_positive_infinity(), input_is_infinity); // +inf arg will be +inf 512 | return result; 513 | } 514 | 515 | //---------------------------------------------------------------------------------------------------------------------- 516 | // 
based on http://gruntthepeon.free.fr/ssemath/ 517 | #ifdef __MATH__INTRINSICS__NEON__ 518 | void vsincosq_f32(float32x4_t x, float32x4_t* s, float32x4_t* c) 519 | #else 520 | void mm256_sincos_ps(__m256 x, __m256* s, __m256* c) 521 | #endif 522 | { 523 | simd_vector xmm1, xmm2, xmm3 = simd_splat_zero(), sign_bit_sin, y; 524 | 525 | sign_bit_sin = x; 526 | 527 | // take the absolute value 528 | x = simd_and(x, simd_inv_sign_mask()); 529 | // extract the sign bit (upper one) 530 | sign_bit_sin = simd_and(sign_bit_sin, simd_sign_mask()); 531 | 532 | // scale by 4/Pi 533 | y = simd_mul(x, simd_splat(1.27323954473516f)); 534 | 535 | // store the integer part of y in emm2 536 | simd_vectori emm2 = simd_convert_from_float(y); 537 | 538 | // j=(j+1) & (~1) (see the cephes sources) 539 | emm2 = simd_add_i(emm2, simd_splat_i(1)); 540 | emm2 = simd_and_i(emm2, simd_splat_i(~1)); 541 | y = simd_convert_from_int(emm2); 542 | 543 | simd_vectori emm4 = emm2; 544 | 545 | // get the swap sign flag for the sine 546 | simd_vectori emm0 = simd_and_i(emm2, simd_splat_i(4)); 547 | emm0 = simd_shift_left_i(emm0, 29); 548 | simd_vector swap_sign_bit_sin = simd_cast_from_int(emm0); 549 | 550 | // get the polynom selection mask for the sine 551 | emm2 = simd_and_i(emm2, simd_splat_i(2)); 552 | emm2 = simd_cmp_eq_i(emm2, simd_splat_zero_i()); 553 | simd_vector poly_mask = simd_cast_from_int(emm2); 554 | 555 | // The magic pass: "Extended precision modular arithmetic" 556 | // x = ((x - y * DP1) - y * DP2) - y * DP3; 557 | x = simd_fmad(y, simd_splat(-0.78515625f), x); 558 | x = simd_fmad(y, simd_splat(-2.4187564849853515625e-4f), x); 559 | x = simd_fmad(y, simd_splat(-3.77489497744594108e-8f), x); 560 | 561 | emm4 = simd_sub_i(emm4, simd_splat_i(2)); 562 | emm4 = simd_andnot_i(simd_splat_i(4), emm4); 563 | emm4 = simd_shift_left_i(emm4, 29); 564 | simd_vector sign_bit_cos = simd_cast_from_int(emm4); 565 | 566 | sign_bit_sin = simd_xor(sign_bit_sin, swap_sign_bit_sin); 567 | 568 | // Evaluate 
the first polynom (0 <= x <= Pi/4) 569 | simd_vector z = simd_mul(x,x); 570 | y = simd_splat(2.443315711809948E-005f); 571 | y = simd_fmad(y, z, simd_splat(-1.388731625493765E-003f)); 572 | y = simd_fmad(y, z, simd_splat(4.166664568298827E-002f)); 573 | y = simd_mul(y, z); 574 | y = simd_mul(y, z); 575 | simd_vector tmp = simd_mul(z, simd_splat(.5f)); 576 | y = simd_sub(y, tmp); 577 | y = simd_add(y, simd_splat(1.f)); 578 | 579 | // Evaluate the second polynom (Pi/4 <= x <= 0) 580 | simd_vector y2 = simd_splat(-1.9515295891E-4f); 581 | y2 = simd_fmad(y2, z, simd_splat(8.3321608736E-3f)); 582 | y2 = simd_fmad(y2, z, simd_splat(-1.6666654611E-1f)); 583 | y2 = simd_mul(y2, z); 584 | y2 = simd_fmad(y2, x, x); 585 | 586 | // select the correct result from the two polynoms 587 | xmm3 = poly_mask; 588 | simd_vector ysin2 = simd_and(y2, xmm3); 589 | simd_vector ysin1 = simd_andnot(y, xmm3); 590 | y2 = simd_sub(y2,ysin2); 591 | y = simd_sub(y, ysin1); 592 | 593 | xmm1 = simd_add(ysin1,ysin2); 594 | xmm2 = simd_add(y,y2); 595 | 596 | // update the sign 597 | *s = simd_xor(xmm1, sign_bit_sin); 598 | *c = simd_xor(xmm2, sign_bit_cos); 599 | } 600 | 601 | //---------------------------------------------------------------------------------------------------------------------- 602 | #ifdef __MATH__INTRINSICS__NEON__ 603 | float32x4_t vsinq_f32(float32x4_t x) 604 | #else 605 | __m256 mm256_sin_ps(__m256 x) 606 | #endif 607 | { 608 | #ifdef __MATH_INTRINSINCS_FAST__ 609 | // range reduction from hlslpp, polynomial computed by lolremez 610 | simd_vector invtau = simd_splat(1.f/SIMD_MATH_TAU); 611 | simd_vector tau = simd_splat(SIMD_MATH_TAU); 612 | simd_vector pi2 = simd_splat(SIMD_MATH_PI2); 613 | 614 | // Range reduction (into [-pi, pi] range) 615 | // Formula is x = x - round(x / 2pi) * 2pi 616 | x = simd_sub(x, simd_mul(simd_round(simd_mul(x, invtau)), tau)); 617 | 618 | simd_vector gt_pi2 = simd_cmp_gt(x, pi2); 619 | simd_vector lt_minus_pi2 = simd_cmp_lt(x, simd_neg(pi2)); 620 
| simd_vector ox = x; 621 | 622 | // Use identities/mirroring to remap into the range of the minimax polynomial 623 | simd_vector pi = simd_splat(SIMD_MATH_PI); 624 | x = simd_select(x, simd_sub(pi, ox), gt_pi2); 625 | x = simd_select(x, simd_sub(simd_neg(pi), ox), lt_minus_pi2); 626 | 627 | simd_vector x_squared = simd_mul(x, x); 628 | simd_vector result = simd_polynomial4(x_squared, (float[]){2.6000548e-6f, -1.9806615e-4f, 8.3330173e-3f, -1.6666657e-1f}); 629 | result = simd_mul(result, x_squared); 630 | result = simd_fmad(result, x, x); 631 | 632 | return result; 633 | #else 634 | simd_vector sinus, cosinus; 635 | simd_sincos(x, &sinus, &cosinus); 636 | return sinus; 637 | #endif 638 | } 639 | 640 | //---------------------------------------------------------------------------------------------------------------------- 641 | #ifdef __MATH__INTRINSICS__NEON__ 642 | float32x4_t vcosq_f32(float32x4_t x) 643 | #else 644 | __m256 mm256_cos_ps(__m256 x) 645 | #endif 646 | { 647 | #ifdef __MATH_INTRINSINCS_FAST__ 648 | return simd_sin(simd_sub(simd_splat(SIMD_MATH_PI2), x)); 649 | #else 650 | simd_vector sinus, cosinus; 651 | simd_sincos(x, &sinus, &cosinus); 652 | return cosinus; 653 | #endif 654 | } 655 | 656 | //---------------------------------------------------------------------------------------------------------------------- 657 | #ifdef __MATH__INTRINSICS__NEON__ 658 | float32x4_t vasinq_f32(float32x4_t xx) 659 | #else 660 | __m256 mm256_asin_ps(__m256 xx) 661 | #endif 662 | { 663 | simd_vector output_nan = simd_cmp_gt(simd_abs(xx), simd_splat(1.f)); 664 | simd_vector small_value = simd_cmp_lt(simd_abs(xx), simd_splat(1.0e-4f)); 665 | simd_vector a = simd_abs(xx); 666 | #ifdef __MATH_INTRINSINCS_FAST__ 667 | // based on https://developer.download.nvidia.com/cg/asin.html 668 | simd_vector negate = simd_select(simd_splat_zero(), simd_splat(1.f), simd_cmp_lt(xx, simd_splat_zero())); 669 | simd_vector x = a; 670 | simd_vector result = simd_polynomial4(x, 
(float[]){-0.0187293f, 0.0742610f, -0.2121144f, 1.5707288f}); 671 | result = simd_sub(simd_splat(SIMD_MATH_PI2), simd_mul(simd_sqrt(simd_sub(simd_splat(1.f), x)), result)); 672 | result = simd_sub(result, simd_mul(simd_mul(simd_splat(2.f), result), negate)); 673 | #else 674 | // based on https://github.com/jeremybarnes/cephes/blob/master/single/asinf.c 675 | simd_vector x = xx; 676 | simd_vector sign = simd_sign(xx); 677 | simd_vector z1 = simd_mul(simd_splat(.5f), simd_sub(simd_splat(1.f), a)); 678 | simd_vector z2 = simd_mul(a, a); 679 | simd_vector flag = simd_cmp_gt(a, simd_splat(.5f)); 680 | simd_vector z = simd_select(z2, z1, flag); 681 | 682 | x = simd_select(a, simd_sqrt(z), flag); 683 | 684 | simd_vector tmp = simd_polynomial5(z, (float[]) {4.2163199048E-2f, 2.4181311049E-2f, 4.5470025998E-2f, 685 | 7.4953002686E-2f, 1.6666752422E-1f}); 686 | tmp = simd_mul(tmp, z); 687 | z = simd_fmad(tmp, x, x); 688 | 689 | tmp = simd_add(z, z); 690 | tmp = simd_sub(simd_splat(SIMD_MATH_PI2), tmp); 691 | z = simd_select(z, tmp, flag); 692 | simd_vector result = simd_mul(z, sign); 693 | #endif 694 | result = simd_or(result, output_nan); 695 | result = simd_select(result, xx, small_value); 696 | return result; 697 | } 698 | 699 | //---------------------------------------------------------------------------------------------------------------------- 700 | // acos(x) = pi/2 - asin(x) 701 | #ifdef __MATH__INTRINSICS__NEON__ 702 | float32x4_t vacosq_f32(float32x4_t x) 703 | #else 704 | __m256 mm256_acos_ps(__m256 x) 705 | #endif 706 | { 707 | #ifdef __MATH_INTRINSINCS_FAST__ 708 | simd_vector negate = simd_select(simd_splat_zero(), simd_splat(1.f), simd_cmp_lt(x, simd_splat_zero())); 709 | x = simd_abs(x); 710 | simd_vector result = simd_polynomial4(x, (float[]){-0.0187293f, 0.0742610f, -0.2121144f, 1.5707288f}); 711 | result = simd_mul(result, simd_sqrt(simd_sub(simd_splat(1.f), x))); 712 | result = simd_sub(result, simd_mul(simd_mul(simd_splat(2.f), negate), result)); 713 | 
return simd_fmad(negate, simd_splat(SIMD_MATH_PI), result); 714 | #else 715 | simd_vector out_of_bound = simd_cmp_gt(simd_abs(x), simd_splat(1.f)); 716 | simd_vector result = simd_sub(simd_splat(SIMD_MATH_PI2), simd_asin(x)); 717 | result = simd_or(result, out_of_bound); // out of bound outputs NAN 718 | return result; 719 | #endif 720 | } 721 | 722 | //---------------------------------------------------------------------------------------------------------------------- 723 | // based on https://github.com/jeremybarnes/cephes/blob/master/single/atanf.c 724 | #ifdef __MATH__INTRINSICS__NEON__ 725 | float32x4_t vatanq_f32(float32x4_t xx) 726 | #else 727 | __m256 mm256_atan_ps(__m256 xx) 728 | #endif 729 | { 730 | simd_vector sign = simd_sign(xx); 731 | simd_vector x = simd_abs(xx); 732 | simd_vector one = simd_splat(1.f); 733 | 734 | // range reduction 735 | simd_vector above_3pi8 = simd_cmp_gt(x, simd_splat(2.414213562373095f)); 736 | simd_vector above_pi8 = simd_andnot(simd_cmp_gt(x, simd_splat(0.4142135623730950f)), above_3pi8); 737 | simd_vector y = simd_splat_zero(); 738 | 739 | x = simd_select(x, simd_neg(simd_div(one, x)), above_3pi8); 740 | x = simd_select(x, simd_div(simd_sub(x, one), simd_add(x, one)), above_pi8); 741 | y = simd_select(y, simd_splat(SIMD_MATH_PI2), above_3pi8); 742 | y = simd_select(y, simd_splat(SIMD_MATH_PI4), above_pi8); 743 | 744 | // minimax polynomial 745 | simd_vector z = simd_mul(x, x); 746 | simd_vector tmp = simd_polynomial4(z, (float[]) {8.05374449538e-2f, -1.38776856032E-1f, 1.99777106478E-1f, -3.33329491539E-1f}); 747 | tmp = simd_mul(tmp, z); 748 | tmp = simd_fmad(tmp, x, x); 749 | y = simd_add(tmp, y); 750 | y = simd_mul(y, sign); 751 | 752 | return y; 753 | } 754 | 755 | //---------------------------------------------------------------------------------------------------------------------- 756 | // based on https://mazzo.li/posts/vectorized-atan2.html 757 | #ifdef __MATH__INTRINSICS__NEON__ 758 | float32x4_t 
vatan2q_f32(float32x4_t x, float32x4_t y) 759 | #else 760 | __m256 mm256_atan2_ps(__m256 x, __m256 y) 761 | #endif 762 | { 763 | simd_vector swap = simd_cmp_lt(simd_abs(x), simd_abs(y)); 764 | simd_vector x_equals_zero = simd_cmp_eq(x, simd_splat_zero()); 765 | simd_vector y_equals_zero = simd_cmp_eq(y, simd_splat_zero()); 766 | simd_vector x_over_y = simd_div(x, y); 767 | simd_vector y_over_x = simd_div(y, x); 768 | simd_vector atan_input = simd_select(y_over_x, x_over_y, swap); 769 | simd_vector result = simd_atan(atan_input); 770 | 771 | simd_vector adjust = simd_select(simd_splat(-SIMD_MATH_PI2), simd_splat(SIMD_MATH_PI2), simd_cmp_ge(atan_input, simd_splat_zero())); 772 | result = simd_select(result, simd_sub(adjust, result), swap); 773 | 774 | simd_vector x_sign_mask = simd_cmp_lt(x, simd_splat_zero()); 775 | result = simd_add( simd_and(simd_xor(simd_splat(SIMD_MATH_PI), simd_and(simd_sign_mask(), y)), x_sign_mask), result); 776 | result = simd_select(result, simd_mul(simd_sign(x), simd_splat_zero()), y_equals_zero); 777 | result = simd_select(result, simd_mul(simd_sign(y), simd_splat(SIMD_MATH_PI2)), x_equals_zero); 778 | return result; 779 | } 780 | 781 | //---------------------------------------------------------------------------------------------------------------------- 782 | // based on https://github.com/jeremybarnes/cephes/blob/master/single/cbrtf.c 783 | #ifdef __MATH__INTRINSICS__NEON__ 784 | float32x4_t vcbrtq_f32(float32x4_t xx) 785 | #else 786 | __m256 mm256_cbrt_ps(__m256 xx) 787 | #endif 788 | { 789 | simd_vector one_over_three = simd_splat(0.333333333333f); 790 | simd_vector sign = simd_sign(xx); 791 | simd_vector x = simd_abs(xx); 792 | simd_vector z = x; 793 | 794 | // extract power of 2, leaving mantissa between 0.5 and 1 795 | simd_vectori exponent; 796 | x = simd_frexp(x, &exponent); 797 | 798 | // Approximate cube root of number between .5 and 1 799 | x = simd_polynomial5(x, (float[]) {-0.1346611047335f, 0.5466460136639f, 
-0.954382247715f, 1.13999833547f, 0.40238979564f}); 800 | 801 | // exponent divided by 3 802 | simd_vectori exponent_is_negative = simd_cmp_gt_i(simd_splat_zero_i(), exponent); 803 | 804 | exponent = simd_abs_i(exponent); 805 | simd_vectori rem = exponent; 806 | exponent = simd_convert_from_float((simd_mul(simd_convert_from_int(exponent), one_over_three))); 807 | rem = simd_sub_i(rem, simd_mul_i(exponent, simd_splat_i(3))); 808 | 809 | simd_vector cbrt2 = simd_splat(1.25992104989487316477f); 810 | simd_vector cbrt4 = simd_splat(1.58740105196819947475f); 811 | 812 | simd_vector rem_equals_1 = simd_cast_from_int(simd_cmp_eq_i(rem, simd_splat_i(1))); 813 | simd_vector rem_equals_2 = simd_cast_from_int(simd_cmp_eq_i(rem, simd_splat_i(2))); 814 | simd_vector x1 = simd_mul(x, simd_select(cbrt4, cbrt2, rem_equals_1)); 815 | simd_vector x2 = simd_div(x, simd_select(cbrt4, cbrt2, rem_equals_1)); 816 | x = simd_select(x, simd_select(x1, x2, simd_cast_from_int(exponent_is_negative)), simd_or(rem_equals_1, rem_equals_2)); 817 | exponent = simd_mul_i(exponent, simd_select_i(simd_splat_i(1), simd_splat_i(-1), exponent_is_negative)); 818 | 819 | // multiply by power of 2 820 | x = simd_ldexp(x, exponent); 821 | 822 | // Newton iteration, x -= ( x - (z/(x*x)) ) * 0.333333333333; 823 | x = simd_sub(x, simd_mul(simd_sub(x, simd_div(z, simd_mul(x, x))), one_over_three)); 824 | x = simd_mul(x, sign); // if input is zero, sign is also zero 825 | 826 | return x; 827 | } 828 | 829 | //---------------------------------------------------------------------------------------------------------------------- 830 | // the implementation based https://github.com/jeremybarnes/cephes/blob/master/single/powf.c is **too** slow 831 | // so we use the classic exp(y * log(x)) 832 | #ifdef __MATH__INTRINSICS__NEON__ 833 | float32x4_t vpowq_f32(float32x4_t x, float32x4_t y) 834 | #else 835 | __m256 mm256_pow_ps(__m256 x, __m256 y) 836 | #endif 837 | { 838 | simd_vector x_equals_zero = simd_cmp_eq(x, 
simd_splat_zero()); 839 | simd_vector y_equals_zero = simd_cmp_eq(y, simd_splat_zero()); 840 | simd_vector non_integer_power = simd_cmp_neq(y, simd_floor(y)); 841 | simd_vector return_zero = simd_andnot(x_equals_zero, y_equals_zero); 842 | simd_vector return_one = simd_and(x_equals_zero, y_equals_zero); 843 | simd_vector return_nan = simd_and(simd_cmp_lt(x, simd_splat_zero()), non_integer_power); 844 | 845 | #ifdef __MATH_INTRINSINCS_FAST__ 846 | simd_vector result = simd_exp2(simd_mul(y, simd_log2(x))); 847 | #else 848 | simd_vector result = simd_exp(simd_mul(y, simd_log(x))); 849 | #endif 850 | 851 | result = simd_andnot(result, return_zero); 852 | result = simd_select(result, simd_splat(1.f), return_one); 853 | result = simd_or(result, return_nan); 854 | 855 | return result; 856 | } 857 | 858 | #endif // __MATH__INTRINSICS__IMPLEMENTATION__ 859 | 860 | 861 | -------------------------------------------------------------------------------- /tests/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.25) 2 | 3 | set(CMAKE_OSX_ARCHITECTURES arm64;x86_64) 4 | 5 | project(math_intrinsics_unit_tests) 6 | 7 | add_executable(test_precision test.c math_intrinsics.c) 8 | add_executable(test_fast test.c math_intrinsics.c) 9 | add_executable(benchmark_precision benchmark.c math_intrinsics.c) 10 | add_executable(benchmark_fast benchmark.c math_intrinsics.c) 11 | 12 | if(LINUX) 13 | set(CMAKE_EXE_LINKER_FLAGS "-lm") 14 | endif() 15 | 16 | if(MSVC) 17 | target_compile_options(test_precision PRIVATE /W4 /WX /std:c17) 18 | target_compile_options(test_fast PRIVATE /W4 /WX /std:c17 /D__MATH_INTRINSINCS_FAST__) 19 | target_compile_options(benchmark_precision PRIVATE /std:c17) 20 | target_compile_options(benchmark_fast PRIVATE /std:c17 /D__MATH_INTRINSINCS_FAST__) 21 | else() 22 | target_compile_options(test_precision PRIVATE -Wall -Wextra -Wpedantic -Werror -mavx2 -mfma) 23 | 
target_compile_options(test_fast PRIVATE -Wall -Wextra -Wpedantic -Werror -mavx2 -mfma -D__MATH_INTRINSINCS_FAST__) 24 | target_compile_options(benchmark_precision PRIVATE -O3 -mavx2 -mfma) 25 | target_compile_options(benchmark_fast PRIVATE -O3 -mavx2 -mfma -D__MATH_INTRINSINCS_FAST__) 26 | endif() -------------------------------------------------------------------------------- /tests/benchmark.c: -------------------------------------------------------------------------------- 1 | #define SOKOL_TIME_IMPL 2 | #include "sokol_time.h" 3 | #include 4 | #include 5 | #include 6 | 7 | 8 | #include "../math_intrinsics.h" 9 | 10 | //---------------------------------------------------------------------------------------------------------------------- 11 | // functions pointer definition 12 | typedef float (*reference_function)(float); 13 | typedef float (*reference_function2)(float, float); 14 | #ifdef __MATH__INTRINSICS__AVX__ 15 | typedef __m256 (*approximation_function)(__m256); 16 | typedef __m256 (*approximation_function2)(__m256, __m256); 17 | #define simd_vector_width (8) 18 | #else 19 | typedef float32x4_t (*approximation_function)(float32x4_t); 20 | typedef float32x4_t (*approximation_function2)(float32x4_t, float32x4_t); 21 | #define simd_vector_width (4) 22 | #endif 23 | 24 | #define NUM_ITERATIONS (200000000) 25 | 26 | //---------------------------------------------------------------------------------------------------------------------- 27 | int benchmark(approximation_function function, reference_function reference, const char* name) 28 | { 29 | float init_array[simd_vector_width]; 30 | uint64_t start = 0; 31 | int output = 0; 32 | 33 | for(uint32_t i=0; i 3 | * 4 | * Permission to use, copy, modify, and/or distribute this software for any 5 | * purpose with or without fee is hereby granted, provided that the above 6 | * copyright notice and this permission notice appear in all copies. 
7 | * 8 | * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES 9 | * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 10 | * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR 11 | * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 12 | * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 13 | * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 14 | * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 15 | */ 16 | 17 | #ifndef GREATEST_H 18 | #define GREATEST_H 19 | 20 | #if defined(__cplusplus) && !defined(GREATEST_NO_EXTERN_CPLUSPLUS) 21 | extern "C" { 22 | #endif 23 | 24 | /* 1.5.0 */ 25 | #define GREATEST_VERSION_MAJOR 1 26 | #define GREATEST_VERSION_MINOR 5 27 | #define GREATEST_VERSION_PATCH 0 28 | 29 | /* A unit testing system for C, contained in 1 file. 30 | * It doesn't use dynamic allocation or depend on anything 31 | * beyond ANSI C89. 32 | * 33 | * An up-to-date version can be found at: 34 | * https://github.com/silentbicycle/greatest/ 35 | */ 36 | 37 | 38 | /********************************************************************* 39 | * Minimal test runner template 40 | *********************************************************************/ 41 | #if 0 42 | 43 | #include "greatest.h" 44 | 45 | TEST foo_should_foo(void) { 46 | PASS(); 47 | } 48 | 49 | static void setup_cb(void *data) { 50 | printf("setup callback for each test case\n"); 51 | } 52 | 53 | static void teardown_cb(void *data) { 54 | printf("teardown callback for each test case\n"); 55 | } 56 | 57 | SUITE(suite) { 58 | /* Optional setup/teardown callbacks which will be run before/after 59 | * every test case. If using a test suite, they will be cleared when 60 | * the suite finishes. 
*/ 61 | SET_SETUP(setup_cb, voidp_to_callback_data); 62 | SET_TEARDOWN(teardown_cb, voidp_to_callback_data); 63 | 64 | RUN_TEST(foo_should_foo); 65 | } 66 | 67 | /* Add definitions that need to be in the test runner's main file. */ 68 | GREATEST_MAIN_DEFS(); 69 | 70 | /* Set up, run suite(s) of tests, report pass/fail/skip stats. */ 71 | int run_tests(void) { 72 | GREATEST_INIT(); /* init. greatest internals */ 73 | /* List of suites to run (if any). */ 74 | RUN_SUITE(suite); 75 | 76 | /* Tests can also be run directly, without using test suites. */ 77 | RUN_TEST(foo_should_foo); 78 | 79 | GREATEST_PRINT_REPORT(); /* display results */ 80 | return greatest_all_passed(); 81 | } 82 | 83 | /* main(), for a standalone command-line test runner. 84 | * This replaces run_tests above, and adds command line option 85 | * handling and exiting with a pass/fail status. */ 86 | int main(int argc, char **argv) { 87 | GREATEST_MAIN_BEGIN(); /* init & parse command-line args */ 88 | RUN_SUITE(suite); 89 | GREATEST_MAIN_END(); /* display results */ 90 | } 91 | 92 | #endif 93 | /*********************************************************************/ 94 | 95 | 96 | #include 97 | #include 98 | #include 99 | #include 100 | 101 | /*********** 102 | * Options * 103 | ***********/ 104 | 105 | /* Default column width for non-verbose output. */ 106 | #ifndef GREATEST_DEFAULT_WIDTH 107 | #define GREATEST_DEFAULT_WIDTH 72 108 | #endif 109 | 110 | /* FILE *, for test logging. */ 111 | #ifndef GREATEST_STDOUT 112 | #define GREATEST_STDOUT stdout 113 | #endif 114 | 115 | /* Remove GREATEST_ prefix from most commonly used symbols? */ 116 | #ifndef GREATEST_USE_ABBREVS 117 | #define GREATEST_USE_ABBREVS 1 118 | #endif 119 | 120 | /* Set to 0 to disable all use of setjmp/longjmp. */ 121 | #ifndef GREATEST_USE_LONGJMP 122 | #define GREATEST_USE_LONGJMP 0 123 | #endif 124 | 125 | /* Make it possible to replace fprintf with another 126 | * function with the same interface. 
*/ 127 | #ifndef GREATEST_FPRINTF 128 | #define GREATEST_FPRINTF fprintf 129 | #endif 130 | 131 | #if GREATEST_USE_LONGJMP 132 | #include 133 | #endif 134 | 135 | /* Set to 0 to disable all use of time.h / clock(). */ 136 | #ifndef GREATEST_USE_TIME 137 | #define GREATEST_USE_TIME 1 138 | #endif 139 | 140 | #if GREATEST_USE_TIME 141 | #include 142 | #endif 143 | 144 | /* Floating point type, for ASSERT_IN_RANGE. */ 145 | #ifndef GREATEST_FLOAT 146 | #define GREATEST_FLOAT double 147 | #define GREATEST_FLOAT_FMT "%g" 148 | #endif 149 | 150 | /* Size of buffer for test name + optional '_' separator and suffix */ 151 | #ifndef GREATEST_TESTNAME_BUF_SIZE 152 | #define GREATEST_TESTNAME_BUF_SIZE 128 153 | #endif 154 | 155 | 156 | /********* 157 | * Types * 158 | *********/ 159 | 160 | /* Info for the current running suite. */ 161 | typedef struct greatest_suite_info { 162 | unsigned int tests_run; 163 | unsigned int passed; 164 | unsigned int failed; 165 | unsigned int skipped; 166 | 167 | #if GREATEST_USE_TIME 168 | /* timers, pre/post running suite and individual tests */ 169 | clock_t pre_suite; 170 | clock_t post_suite; 171 | clock_t pre_test; 172 | clock_t post_test; 173 | #endif 174 | } greatest_suite_info; 175 | 176 | /* Type for a suite function. */ 177 | typedef void greatest_suite_cb(void); 178 | 179 | /* Types for setup/teardown callbacks. If non-NULL, these will be run 180 | * and passed the pointer to their additional data. */ 181 | typedef void greatest_setup_cb(void *udata); 182 | typedef void greatest_teardown_cb(void *udata); 183 | 184 | /* Type for an equality comparison between two pointers of the same type. 185 | * Should return non-0 if equal, otherwise 0. 186 | * UDATA is a closure value, passed through from ASSERT_EQUAL_T[m]. */ 187 | typedef int greatest_equal_cb(const void *expd, const void *got, void *udata); 188 | 189 | /* Type for a callback that prints a value pointed to by T. 190 | * Return value has the same meaning as printf's. 
191 | * UDATA is a closure value, passed through from ASSERT_EQUAL_T[m]. */ 192 | typedef int greatest_printf_cb(const void *t, void *udata); 193 | 194 | /* Callbacks for an arbitrary type; needed for type-specific 195 | * comparisons via GREATEST_ASSERT_EQUAL_T[m].*/ 196 | typedef struct greatest_type_info { 197 | greatest_equal_cb *equal; 198 | greatest_printf_cb *print; 199 | } greatest_type_info; 200 | 201 | typedef struct greatest_memory_cmp_env { 202 | const unsigned char *exp; 203 | const unsigned char *got; 204 | size_t size; 205 | } greatest_memory_cmp_env; 206 | 207 | /* Callbacks for string and raw memory types. */ 208 | extern greatest_type_info greatest_type_info_string; 209 | extern greatest_type_info greatest_type_info_memory; 210 | 211 | typedef enum { 212 | GREATEST_FLAG_FIRST_FAIL = 0x01, 213 | GREATEST_FLAG_LIST_ONLY = 0x02, 214 | GREATEST_FLAG_ABORT_ON_FAIL = 0x04 215 | } greatest_flag_t; 216 | 217 | /* Internal state for a PRNG, used to shuffle test order. */ 218 | struct greatest_prng { 219 | unsigned char random_order; /* use random ordering? */ 220 | unsigned char initialized; /* is random ordering initialized? */ 221 | unsigned char pad_0[6]; 222 | unsigned long state; /* PRNG state */ 223 | unsigned long count; /* how many tests, this pass */ 224 | unsigned long count_ceil; /* total number of tests */ 225 | unsigned long count_run; /* total tests run */ 226 | unsigned long a; /* LCG multiplier */ 227 | unsigned long c; /* LCG increment */ 228 | unsigned long m; /* LCG modulus, based on count_ceil */ 229 | }; 230 | 231 | /* Struct containing all test runner state. 
*/ 232 | typedef struct greatest_run_info { 233 | unsigned char flags; 234 | unsigned char verbosity; 235 | unsigned char running_test; /* guard for nested RUN_TEST calls */ 236 | unsigned char exact_name_match; 237 | 238 | unsigned int tests_run; /* total test count */ 239 | 240 | /* currently running test suite */ 241 | greatest_suite_info suite; 242 | 243 | /* overall pass/fail/skip counts */ 244 | unsigned int passed; 245 | unsigned int failed; 246 | unsigned int skipped; 247 | unsigned int assertions; 248 | 249 | /* info to print about the most recent failure */ 250 | unsigned int fail_line; 251 | unsigned int pad_1; 252 | const char *fail_file; 253 | const char *msg; 254 | 255 | /* current setup/teardown hooks and userdata */ 256 | greatest_setup_cb *setup; 257 | void *setup_udata; 258 | greatest_teardown_cb *teardown; 259 | void *teardown_udata; 260 | 261 | /* formatting info for ".....s...F"-style output */ 262 | unsigned int col; 263 | unsigned int width; 264 | 265 | /* only run a specific suite or test */ 266 | const char *suite_filter; 267 | const char *test_filter; 268 | const char *test_exclude; 269 | const char *name_suffix; /* print suffix with test name */ 270 | char name_buf[GREATEST_TESTNAME_BUF_SIZE]; 271 | 272 | struct greatest_prng prng[2]; /* 0: suites, 1: tests */ 273 | 274 | #if GREATEST_USE_TIME 275 | /* overall timers */ 276 | clock_t begin; 277 | clock_t end; 278 | #endif 279 | 280 | #if GREATEST_USE_LONGJMP 281 | int pad_jmp_buf; 282 | unsigned char pad_2[4]; 283 | jmp_buf jump_dest; 284 | #endif 285 | } greatest_run_info; 286 | 287 | struct greatest_report_t { 288 | /* overall pass/fail/skip counts */ 289 | unsigned int passed; 290 | unsigned int failed; 291 | unsigned int skipped; 292 | unsigned int assertions; 293 | }; 294 | 295 | /* Global var for the current testing context. 296 | * Initialized by GREATEST_MAIN_DEFS(). */ 297 | extern greatest_run_info greatest_info; 298 | 299 | /* Type for ASSERT_ENUM_EQ's ENUM_STR argument. 
*/ 300 | typedef const char *greatest_enum_str_fun(int value); 301 | 302 | 303 | /********************** 304 | * Exported functions * 305 | **********************/ 306 | 307 | /* These are used internally by greatest macros. */ 308 | int greatest_test_pre(const char *name); 309 | void greatest_test_post(int res); 310 | int greatest_do_assert_equal_t(const void *expd, const void *got, 311 | greatest_type_info *type_info, void *udata); 312 | void greatest_prng_init_first_pass(int id); 313 | int greatest_prng_init_second_pass(int id, unsigned long seed); 314 | void greatest_prng_step(int id); 315 | 316 | /* These are part of the public greatest API. */ 317 | void GREATEST_SET_SETUP_CB(greatest_setup_cb *cb, void *udata); 318 | void GREATEST_SET_TEARDOWN_CB(greatest_teardown_cb *cb, void *udata); 319 | void GREATEST_INIT(void); 320 | void GREATEST_PRINT_REPORT(void); 321 | int greatest_all_passed(void); 322 | void greatest_set_suite_filter(const char *filter); 323 | void greatest_set_test_filter(const char *filter); 324 | void greatest_set_test_exclude(const char *filter); 325 | void greatest_set_exact_name_match(void); 326 | void greatest_stop_at_first_fail(void); 327 | void greatest_abort_on_fail(void); 328 | void greatest_list_only(void); 329 | void greatest_get_report(struct greatest_report_t *report); 330 | unsigned int greatest_get_verbosity(void); 331 | void greatest_set_verbosity(unsigned int verbosity); 332 | void greatest_set_flag(greatest_flag_t flag); 333 | void greatest_set_test_suffix(const char *suffix); 334 | 335 | 336 | /******************** 337 | * Language Support * 338 | ********************/ 339 | 340 | /* If __VA_ARGS__ (C99) is supported, allow parametric testing 341 | * without needing to manually manage the argument struct. 
*/ 342 | #if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 19901L) || \ 343 | (defined(_MSC_VER) && _MSC_VER >= 1800) 344 | #define GREATEST_VA_ARGS 345 | #endif 346 | 347 | 348 | /********** 349 | * Macros * 350 | **********/ 351 | 352 | /* Define a suite. (The duplication is intentional -- it eliminates 353 | * a warning from -Wmissing-declarations.) */ 354 | #define GREATEST_SUITE(NAME) void NAME(void); void NAME(void) 355 | 356 | /* Declare a suite, provided by another compilation unit. */ 357 | #define GREATEST_SUITE_EXTERN(NAME) void NAME(void) 358 | 359 | /* Start defining a test function. 360 | * The arguments are not included, to allow parametric testing. */ 361 | #define GREATEST_TEST static enum greatest_test_res 362 | 363 | /* PASS/FAIL/SKIP result from a test. Used internally. */ 364 | typedef enum greatest_test_res { 365 | GREATEST_TEST_RES_PASS = 0, 366 | GREATEST_TEST_RES_FAIL = -1, 367 | GREATEST_TEST_RES_SKIP = 1 368 | } greatest_test_res; 369 | 370 | /* Run a suite. */ 371 | #define GREATEST_RUN_SUITE(S_NAME) greatest_run_suite(S_NAME, #S_NAME) 372 | 373 | /* Run a test in the current suite. */ 374 | #define GREATEST_RUN_TEST(TEST) \ 375 | do { \ 376 | if (greatest_test_pre(#TEST) == 1) { \ 377 | enum greatest_test_res res = GREATEST_SAVE_CONTEXT(); \ 378 | if (res == GREATEST_TEST_RES_PASS) { \ 379 | res = TEST(); \ 380 | } \ 381 | greatest_test_post(res); \ 382 | } \ 383 | } while (0) 384 | 385 | /* Ignore a test, don't warn about it being unused. */ 386 | #define GREATEST_IGNORE_TEST(TEST) (void)TEST 387 | 388 | /* Run a test in the current suite with one void * argument, 389 | * which can be a pointer to a struct with multiple arguments. 
*/ 390 | #define GREATEST_RUN_TEST1(TEST, ENV) \ 391 | do { \ 392 | if (greatest_test_pre(#TEST) == 1) { \ 393 | enum greatest_test_res res = GREATEST_SAVE_CONTEXT(); \ 394 | if (res == GREATEST_TEST_RES_PASS) { \ 395 | res = TEST(ENV); \ 396 | } \ 397 | greatest_test_post(res); \ 398 | } \ 399 | } while (0) 400 | 401 | #ifdef GREATEST_VA_ARGS 402 | #define GREATEST_RUN_TESTp(TEST, ...) \ 403 | do { \ 404 | if (greatest_test_pre(#TEST) == 1) { \ 405 | enum greatest_test_res res = GREATEST_SAVE_CONTEXT(); \ 406 | if (res == GREATEST_TEST_RES_PASS) { \ 407 | res = TEST(__VA_ARGS__); \ 408 | } \ 409 | greatest_test_post(res); \ 410 | } \ 411 | } while (0) 412 | #endif 413 | 414 | 415 | /* Check if the test runner is in verbose mode. */ 416 | #define GREATEST_IS_VERBOSE() ((greatest_info.verbosity) > 0) 417 | #define GREATEST_LIST_ONLY() \ 418 | (greatest_info.flags & GREATEST_FLAG_LIST_ONLY) 419 | #define GREATEST_FIRST_FAIL() \ 420 | (greatest_info.flags & GREATEST_FLAG_FIRST_FAIL) 421 | #define GREATEST_ABORT_ON_FAIL() \ 422 | (greatest_info.flags & GREATEST_FLAG_ABORT_ON_FAIL) 423 | #define GREATEST_FAILURE_ABORT() \ 424 | (GREATEST_FIRST_FAIL() && \ 425 | (greatest_info.suite.failed > 0 || greatest_info.failed > 0)) 426 | 427 | /* Message-less forms of tests defined below. 
*/ 428 | #define GREATEST_PASS() GREATEST_PASSm(NULL) 429 | #define GREATEST_FAIL() GREATEST_FAILm(NULL) 430 | #define GREATEST_SKIP() GREATEST_SKIPm(NULL) 431 | #define GREATEST_ASSERT(COND) \ 432 | GREATEST_ASSERTm(#COND, COND) 433 | #define GREATEST_ASSERT_OR_LONGJMP(COND) \ 434 | GREATEST_ASSERT_OR_LONGJMPm(#COND, COND) 435 | #define GREATEST_ASSERT_FALSE(COND) \ 436 | GREATEST_ASSERT_FALSEm(#COND, COND) 437 | #define GREATEST_ASSERT_EQ(EXP, GOT) \ 438 | GREATEST_ASSERT_EQm(#EXP " != " #GOT, EXP, GOT) 439 | #define GREATEST_ASSERT_NEQ(EXP, GOT) \ 440 | GREATEST_ASSERT_NEQm(#EXP " == " #GOT, EXP, GOT) 441 | #define GREATEST_ASSERT_GT(EXP, GOT) \ 442 | GREATEST_ASSERT_GTm(#EXP " <= " #GOT, EXP, GOT) 443 | #define GREATEST_ASSERT_GTE(EXP, GOT) \ 444 | GREATEST_ASSERT_GTEm(#EXP " < " #GOT, EXP, GOT) 445 | #define GREATEST_ASSERT_LT(EXP, GOT) \ 446 | GREATEST_ASSERT_LTm(#EXP " >= " #GOT, EXP, GOT) 447 | #define GREATEST_ASSERT_LTE(EXP, GOT) \ 448 | GREATEST_ASSERT_LTEm(#EXP " > " #GOT, EXP, GOT) 449 | #define GREATEST_ASSERT_EQ_FMT(EXP, GOT, FMT) \ 450 | GREATEST_ASSERT_EQ_FMTm(#EXP " != " #GOT, EXP, GOT, FMT) 451 | #define GREATEST_ASSERT_IN_RANGE(EXP, GOT, TOL) \ 452 | GREATEST_ASSERT_IN_RANGEm(#EXP " != " #GOT " +/- " #TOL, EXP, GOT, TOL) 453 | #define GREATEST_ASSERT_EQUAL_T(EXP, GOT, TYPE_INFO, UDATA) \ 454 | GREATEST_ASSERT_EQUAL_Tm(#EXP " != " #GOT, EXP, GOT, TYPE_INFO, UDATA) 455 | #define GREATEST_ASSERT_STR_EQ(EXP, GOT) \ 456 | GREATEST_ASSERT_STR_EQm(#EXP " != " #GOT, EXP, GOT) 457 | #define GREATEST_ASSERT_STRN_EQ(EXP, GOT, SIZE) \ 458 | GREATEST_ASSERT_STRN_EQm(#EXP " != " #GOT, EXP, GOT, SIZE) 459 | #define GREATEST_ASSERT_MEM_EQ(EXP, GOT, SIZE) \ 460 | GREATEST_ASSERT_MEM_EQm(#EXP " != " #GOT, EXP, GOT, SIZE) 461 | #define GREATEST_ASSERT_ENUM_EQ(EXP, GOT, ENUM_STR) \ 462 | GREATEST_ASSERT_ENUM_EQm(#EXP " != " #GOT, EXP, GOT, ENUM_STR) 463 | 464 | /* The following forms take an additional message argument first, 465 | * to be displayed by the test 
runner. */ 466 | 467 | /* Fail if a condition is not true, with message. */ 468 | #define GREATEST_ASSERTm(MSG, COND) \ 469 | do { \ 470 | greatest_info.assertions++; \ 471 | if (!(COND)) { GREATEST_FAILm(MSG); } \ 472 | } while (0) 473 | 474 | /* Fail if a condition is not true, longjmping out of test. */ 475 | #define GREATEST_ASSERT_OR_LONGJMPm(MSG, COND) \ 476 | do { \ 477 | greatest_info.assertions++; \ 478 | if (!(COND)) { GREATEST_FAIL_WITH_LONGJMPm(MSG); } \ 479 | } while (0) 480 | 481 | /* Fail if a condition is not false, with message. */ 482 | #define GREATEST_ASSERT_FALSEm(MSG, COND) \ 483 | do { \ 484 | greatest_info.assertions++; \ 485 | if ((COND)) { GREATEST_FAILm(MSG); } \ 486 | } while (0) 487 | 488 | /* Internal macro for relational assertions */ 489 | #define GREATEST__REL(REL, MSG, EXP, GOT) \ 490 | do { \ 491 | greatest_info.assertions++; \ 492 | if (!((EXP) REL (GOT))) { GREATEST_FAILm(MSG); } \ 493 | } while (0) 494 | 495 | /* Fail if EXP is not ==, !=, >, <, >=, or <= to GOT. */ 496 | #define GREATEST_ASSERT_EQm(MSG,E,G) GREATEST__REL(==, MSG,E,G) 497 | #define GREATEST_ASSERT_NEQm(MSG,E,G) GREATEST__REL(!=, MSG,E,G) 498 | #define GREATEST_ASSERT_GTm(MSG,E,G) GREATEST__REL(>, MSG,E,G) 499 | #define GREATEST_ASSERT_GTEm(MSG,E,G) GREATEST__REL(>=, MSG,E,G) 500 | #define GREATEST_ASSERT_LTm(MSG,E,G) GREATEST__REL(<, MSG,E,G) 501 | #define GREATEST_ASSERT_LTEm(MSG,E,G) GREATEST__REL(<=, MSG,E,G) 502 | 503 | /* Fail if EXP != GOT (equality comparison by ==). 504 | * Warning: FMT, EXP, and GOT will be evaluated more 505 | * than once on failure. 
*/ 506 | #define GREATEST_ASSERT_EQ_FMTm(MSG, EXP, GOT, FMT) \ 507 | do { \ 508 | greatest_info.assertions++; \ 509 | if ((EXP) != (GOT)) { \ 510 | GREATEST_FPRINTF(GREATEST_STDOUT, "\nExpected: "); \ 511 | GREATEST_FPRINTF(GREATEST_STDOUT, FMT, EXP); \ 512 | GREATEST_FPRINTF(GREATEST_STDOUT, "\n Got: "); \ 513 | GREATEST_FPRINTF(GREATEST_STDOUT, FMT, GOT); \ 514 | GREATEST_FPRINTF(GREATEST_STDOUT, "\n"); \ 515 | GREATEST_FAILm(MSG); \ 516 | } \ 517 | } while (0) 518 | 519 | /* Fail if EXP is not equal to GOT, printing enum IDs. */ 520 | #define GREATEST_ASSERT_ENUM_EQm(MSG, EXP, GOT, ENUM_STR) \ 521 | do { \ 522 | int greatest_EXP = (int)(EXP); \ 523 | int greatest_GOT = (int)(GOT); \ 524 | greatest_enum_str_fun *greatest_ENUM_STR = ENUM_STR; \ 525 | if (greatest_EXP != greatest_GOT) { \ 526 | GREATEST_FPRINTF(GREATEST_STDOUT, "\nExpected: %s", \ 527 | greatest_ENUM_STR(greatest_EXP)); \ 528 | GREATEST_FPRINTF(GREATEST_STDOUT, "\n Got: %s\n", \ 529 | greatest_ENUM_STR(greatest_GOT)); \ 530 | GREATEST_FAILm(MSG); \ 531 | } \ 532 | } while (0) \ 533 | 534 | /* Fail if GOT not in range of EXP +|- TOL. */ 535 | #define GREATEST_ASSERT_IN_RANGEm(MSG, EXP, GOT, TOL) \ 536 | do { \ 537 | GREATEST_FLOAT greatest_EXP = (EXP); \ 538 | GREATEST_FLOAT greatest_GOT = (GOT); \ 539 | GREATEST_FLOAT greatest_TOL = (TOL); \ 540 | greatest_info.assertions++; \ 541 | if ((greatest_EXP > greatest_GOT && \ 542 | greatest_EXP - greatest_GOT > greatest_TOL) || \ 543 | (greatest_EXP < greatest_GOT && \ 544 | greatest_GOT - greatest_EXP > greatest_TOL)) { \ 545 | GREATEST_FPRINTF(GREATEST_STDOUT, \ 546 | "\nExpected: " GREATEST_FLOAT_FMT \ 547 | " +/- " GREATEST_FLOAT_FMT \ 548 | "\n Got: " GREATEST_FLOAT_FMT \ 549 | "\n", \ 550 | greatest_EXP, greatest_TOL, greatest_GOT); \ 551 | GREATEST_FAILm(MSG); \ 552 | } \ 553 | } while (0) 554 | 555 | /* Fail if EXP is not equal to GOT, according to strcmp. 
*/ 556 | #define GREATEST_ASSERT_STR_EQm(MSG, EXP, GOT) \ 557 | do { \ 558 | GREATEST_ASSERT_EQUAL_Tm(MSG, EXP, GOT, \ 559 | &greatest_type_info_string, NULL); \ 560 | } while (0) \ 561 | 562 | /* Fail if EXP is not equal to GOT, according to strncmp. */ 563 | #define GREATEST_ASSERT_STRN_EQm(MSG, EXP, GOT, SIZE) \ 564 | do { \ 565 | size_t size = SIZE; \ 566 | GREATEST_ASSERT_EQUAL_Tm(MSG, EXP, GOT, \ 567 | &greatest_type_info_string, &size); \ 568 | } while (0) \ 569 | 570 | /* Fail if EXP is not equal to GOT, according to memcmp. */ 571 | #define GREATEST_ASSERT_MEM_EQm(MSG, EXP, GOT, SIZE) \ 572 | do { \ 573 | greatest_memory_cmp_env env; \ 574 | env.exp = (const unsigned char *)EXP; \ 575 | env.got = (const unsigned char *)GOT; \ 576 | env.size = SIZE; \ 577 | GREATEST_ASSERT_EQUAL_Tm(MSG, env.exp, env.got, \ 578 | &greatest_type_info_memory, &env); \ 579 | } while (0) \ 580 | 581 | /* Fail if EXP is not equal to GOT, according to a comparison 582 | * callback in TYPE_INFO. If they are not equal, optionally use a 583 | * print callback in TYPE_INFO to print them. */ 584 | #define GREATEST_ASSERT_EQUAL_Tm(MSG, EXP, GOT, TYPE_INFO, UDATA) \ 585 | do { \ 586 | greatest_type_info *type_info = (TYPE_INFO); \ 587 | greatest_info.assertions++; \ 588 | if (!greatest_do_assert_equal_t(EXP, GOT, \ 589 | type_info, UDATA)) { \ 590 | if (type_info == NULL || type_info->equal == NULL) { \ 591 | GREATEST_FAILm("type_info->equal callback missing!"); \ 592 | } else { \ 593 | GREATEST_FAILm(MSG); \ 594 | } \ 595 | } \ 596 | } while (0) \ 597 | 598 | /* Pass. */ 599 | #define GREATEST_PASSm(MSG) \ 600 | do { \ 601 | greatest_info.msg = MSG; \ 602 | return GREATEST_TEST_RES_PASS; \ 603 | } while (0) 604 | 605 | /* Fail. 
*/ 606 | #define GREATEST_FAILm(MSG) \ 607 | do { \ 608 | greatest_info.fail_file = __FILE__; \ 609 | greatest_info.fail_line = __LINE__; \ 610 | greatest_info.msg = MSG; \ 611 | if (GREATEST_ABORT_ON_FAIL()) { abort(); } \ 612 | return GREATEST_TEST_RES_FAIL; \ 613 | } while (0) 614 | 615 | /* Optional GREATEST_FAILm variant that longjmps. */ 616 | #if GREATEST_USE_LONGJMP 617 | #define GREATEST_FAIL_WITH_LONGJMP() GREATEST_FAIL_WITH_LONGJMPm(NULL) 618 | #define GREATEST_FAIL_WITH_LONGJMPm(MSG) \ 619 | do { \ 620 | greatest_info.fail_file = __FILE__; \ 621 | greatest_info.fail_line = __LINE__; \ 622 | greatest_info.msg = MSG; \ 623 | longjmp(greatest_info.jump_dest, GREATEST_TEST_RES_FAIL); \ 624 | } while (0) 625 | #endif 626 | 627 | /* Skip the current test. */ 628 | #define GREATEST_SKIPm(MSG) \ 629 | do { \ 630 | greatest_info.msg = MSG; \ 631 | return GREATEST_TEST_RES_SKIP; \ 632 | } while (0) 633 | 634 | /* Check the result of a subfunction using ASSERT, etc. */ 635 | #define GREATEST_CHECK_CALL(RES) \ 636 | do { \ 637 | enum greatest_test_res greatest_RES = RES; \ 638 | if (greatest_RES != GREATEST_TEST_RES_PASS) { \ 639 | return greatest_RES; \ 640 | } \ 641 | } while (0) \ 642 | 643 | #if GREATEST_USE_TIME 644 | #define GREATEST_SET_TIME(NAME) \ 645 | NAME = clock(); \ 646 | if (NAME == (clock_t) -1) { \ 647 | GREATEST_FPRINTF(GREATEST_STDOUT, \ 648 | "clock error: %s\n", #NAME); \ 649 | exit(EXIT_FAILURE); \ 650 | } 651 | 652 | #define GREATEST_CLOCK_DIFF(C1, C2) \ 653 | GREATEST_FPRINTF(GREATEST_STDOUT, " (%lu ticks, %.3f sec)", \ 654 | (long unsigned int) (C2) - (long unsigned int)(C1), \ 655 | (double)((C2) - (C1)) / (1.0 * (double)CLOCKS_PER_SEC)) 656 | #else 657 | #define GREATEST_SET_TIME(UNUSED) 658 | #define GREATEST_CLOCK_DIFF(UNUSED1, UNUSED2) 659 | #endif 660 | 661 | #if GREATEST_USE_LONGJMP 662 | #define GREATEST_SAVE_CONTEXT() \ 663 | /* setjmp returns 0 (GREATEST_TEST_RES_PASS) on first call * \ 664 | * so the test runs, then RES_FAIL from 
FAIL_WITH_LONGJMP. */ \ 665 | ((enum greatest_test_res)(setjmp(greatest_info.jump_dest))) 666 | #else 667 | #define GREATEST_SAVE_CONTEXT() \ 668 | /*a no-op, since setjmp/longjmp aren't being used */ \ 669 | GREATEST_TEST_RES_PASS 670 | #endif 671 | 672 | /* Run every suite / test function run within BODY in pseudo-random 673 | * order, seeded by SEED. (The top 3 bits of the seed are ignored.) 674 | * 675 | * This should be called like: 676 | * GREATEST_SHUFFLE_TESTS(seed, { 677 | * GREATEST_RUN_TEST(some_test); 678 | * GREATEST_RUN_TEST(some_other_test); 679 | * GREATEST_RUN_TEST(yet_another_test); 680 | * }); 681 | * 682 | * Note that the body of the second argument will be evaluated 683 | * multiple times. */ 684 | #define GREATEST_SHUFFLE_SUITES(SD, BODY) GREATEST_SHUFFLE(0, SD, BODY) 685 | #define GREATEST_SHUFFLE_TESTS(SD, BODY) GREATEST_SHUFFLE(1, SD, BODY) 686 | #define GREATEST_SHUFFLE(ID, SD, BODY) \ 687 | do { \ 688 | struct greatest_prng *prng = &greatest_info.prng[ID]; \ 689 | greatest_prng_init_first_pass(ID); \ 690 | do { \ 691 | prng->count = 0; \ 692 | if (prng->initialized) { greatest_prng_step(ID); } \ 693 | BODY; \ 694 | if (!prng->initialized) { \ 695 | if (!greatest_prng_init_second_pass(ID, SD)) { break; } \ 696 | } else if (prng->count_run == prng->count_ceil) { \ 697 | break; \ 698 | } \ 699 | } while (!GREATEST_FAILURE_ABORT()); \ 700 | prng->count_run = prng->random_order = prng->initialized = 0; \ 701 | } while(0) 702 | 703 | /* Include several function definitions in the main test file. */ 704 | #define GREATEST_MAIN_DEFS() \ 705 | \ 706 | /* Is FILTER a subset of NAME? */ \ 707 | static int greatest_name_match(const char *name, const char *filter, \ 708 | int res_if_none) { \ 709 | size_t offset = 0; \ 710 | size_t filter_len = filter ? 
strlen(filter) : 0; \ 711 | if (filter_len == 0) { return res_if_none; } /* no filter */ \ 712 | if (greatest_info.exact_name_match && strlen(name) != filter_len) { \ 713 | return 0; /* ignore substring matches */ \ 714 | } \ 715 | while (name[offset] != '\0') { \ 716 | if (name[offset] == filter[0]) { \ 717 | if (0 == strncmp(&name[offset], filter, filter_len)) { \ 718 | return 1; \ 719 | } \ 720 | } \ 721 | offset++; \ 722 | } \ 723 | \ 724 | return 0; \ 725 | } \ 726 | \ 727 | static void greatest_buffer_test_name(const char *name) { \ 728 | struct greatest_run_info *g = &greatest_info; \ 729 | size_t len = strlen(name), size = sizeof(g->name_buf); \ 730 | memset(g->name_buf, 0x00, size); \ 731 | (void)strncat(g->name_buf, name, size - 1); \ 732 | if (g->name_suffix && (len + 1 < size)) { \ 733 | g->name_buf[len] = '_'; \ 734 | strncat(&g->name_buf[len+1], g->name_suffix, size-(len+2)); \ 735 | } \ 736 | } \ 737 | \ 738 | /* Before running a test, check the name filtering and \ 739 | * test shuffling state, if applicable, and then call setup hooks. 
*/ \ 740 | int greatest_test_pre(const char *name) { \ 741 | struct greatest_run_info *g = &greatest_info; \ 742 | int match; \ 743 | greatest_buffer_test_name(name); \ 744 | match = greatest_name_match(g->name_buf, g->test_filter, 1) && \ 745 | !greatest_name_match(g->name_buf, g->test_exclude, 0); \ 746 | if (GREATEST_LIST_ONLY()) { /* just listing test names */ \ 747 | if (match) { \ 748 | GREATEST_FPRINTF(GREATEST_STDOUT, " %s\n", g->name_buf); \ 749 | } \ 750 | goto clear; \ 751 | } \ 752 | if (match && (!GREATEST_FIRST_FAIL() || g->suite.failed == 0)) { \ 753 | struct greatest_prng *p = &g->prng[1]; \ 754 | if (p->random_order) { \ 755 | p->count++; \ 756 | if (!p->initialized || ((p->count - 1) != p->state)) { \ 757 | goto clear; /* don't run this test yet */ \ 758 | } \ 759 | } \ 760 | if (g->running_test) { \ 761 | fprintf(stderr, "Error: Test run inside another test.\n"); \ 762 | return 0; \ 763 | } \ 764 | GREATEST_SET_TIME(g->suite.pre_test); \ 765 | if (g->setup) { g->setup(g->setup_udata); } \ 766 | p->count_run++; \ 767 | g->running_test = 1; \ 768 | return 1; /* test should be run */ \ 769 | } else { \ 770 | goto clear; /* skipped */ \ 771 | } \ 772 | clear: \ 773 | g->name_suffix = NULL; \ 774 | return 0; \ 775 | } \ 776 | \ 777 | static void greatest_do_pass(void) { \ 778 | struct greatest_run_info *g = &greatest_info; \ 779 | if (GREATEST_IS_VERBOSE()) { \ 780 | GREATEST_FPRINTF(GREATEST_STDOUT, "PASS %s: %s", \ 781 | g->name_buf, g->msg ? g->msg : ""); \ 782 | } else { \ 783 | GREATEST_FPRINTF(GREATEST_STDOUT, "."); \ 784 | } \ 785 | g->suite.passed++; \ 786 | } \ 787 | \ 788 | static void greatest_do_fail(void) { \ 789 | struct greatest_run_info *g = &greatest_info; \ 790 | if (GREATEST_IS_VERBOSE()) { \ 791 | GREATEST_FPRINTF(GREATEST_STDOUT, \ 792 | "FAIL %s: %s (%s:%u)", g->name_buf, \ 793 | g->msg ? 
g->msg : "", g->fail_file, g->fail_line); \ 794 | } else { \ 795 | GREATEST_FPRINTF(GREATEST_STDOUT, "F"); \ 796 | g->col++; /* add linebreak if in line of '.'s */ \ 797 | if (g->col != 0) { \ 798 | GREATEST_FPRINTF(GREATEST_STDOUT, "\n"); \ 799 | g->col = 0; \ 800 | } \ 801 | GREATEST_FPRINTF(GREATEST_STDOUT, "FAIL %s: %s (%s:%u)\n", \ 802 | g->name_buf, g->msg ? g->msg : "", \ 803 | g->fail_file, g->fail_line); \ 804 | } \ 805 | g->suite.failed++; \ 806 | } \ 807 | \ 808 | static void greatest_do_skip(void) { \ 809 | struct greatest_run_info *g = &greatest_info; \ 810 | if (GREATEST_IS_VERBOSE()) { \ 811 | GREATEST_FPRINTF(GREATEST_STDOUT, "SKIP %s: %s", \ 812 | g->name_buf, g->msg ? g->msg : ""); \ 813 | } else { \ 814 | GREATEST_FPRINTF(GREATEST_STDOUT, "s"); \ 815 | } \ 816 | g->suite.skipped++; \ 817 | } \ 818 | \ 819 | void greatest_test_post(int res) { \ 820 | GREATEST_SET_TIME(greatest_info.suite.post_test); \ 821 | if (greatest_info.teardown) { \ 822 | void *udata = greatest_info.teardown_udata; \ 823 | greatest_info.teardown(udata); \ 824 | } \ 825 | \ 826 | greatest_info.running_test = 0; \ 827 | if (res <= GREATEST_TEST_RES_FAIL) { \ 828 | greatest_do_fail(); \ 829 | } else if (res >= GREATEST_TEST_RES_SKIP) { \ 830 | greatest_do_skip(); \ 831 | } else if (res == GREATEST_TEST_RES_PASS) { \ 832 | greatest_do_pass(); \ 833 | } \ 834 | greatest_info.name_suffix = NULL; \ 835 | greatest_info.suite.tests_run++; \ 836 | greatest_info.col++; \ 837 | if (GREATEST_IS_VERBOSE()) { \ 838 | GREATEST_CLOCK_DIFF(greatest_info.suite.pre_test, \ 839 | greatest_info.suite.post_test); \ 840 | GREATEST_FPRINTF(GREATEST_STDOUT, "\n"); \ 841 | } else if (greatest_info.col % greatest_info.width == 0) { \ 842 | GREATEST_FPRINTF(GREATEST_STDOUT, "\n"); \ 843 | greatest_info.col = 0; \ 844 | } \ 845 | fflush(GREATEST_STDOUT); \ 846 | } \ 847 | \ 848 | static void report_suite(void) { \ 849 | if (greatest_info.suite.tests_run > 0) { \ 850 | GREATEST_FPRINTF(GREATEST_STDOUT, \ 
851 | "\n%u test%s - %u passed, %u failed, %u skipped", \ 852 | greatest_info.suite.tests_run, \ 853 | greatest_info.suite.tests_run == 1 ? "" : "s", \ 854 | greatest_info.suite.passed, \ 855 | greatest_info.suite.failed, \ 856 | greatest_info.suite.skipped); \ 857 | GREATEST_CLOCK_DIFF(greatest_info.suite.pre_suite, \ 858 | greatest_info.suite.post_suite); \ 859 | GREATEST_FPRINTF(GREATEST_STDOUT, "\n"); \ 860 | } \ 861 | } \ 862 | \ 863 | static void update_counts_and_reset_suite(void) { \ 864 | greatest_info.setup = NULL; \ 865 | greatest_info.setup_udata = NULL; \ 866 | greatest_info.teardown = NULL; \ 867 | greatest_info.teardown_udata = NULL; \ 868 | greatest_info.passed += greatest_info.suite.passed; \ 869 | greatest_info.failed += greatest_info.suite.failed; \ 870 | greatest_info.skipped += greatest_info.suite.skipped; \ 871 | greatest_info.tests_run += greatest_info.suite.tests_run; \ 872 | memset(&greatest_info.suite, 0, sizeof(greatest_info.suite)); \ 873 | greatest_info.col = 0; \ 874 | } \ 875 | \ 876 | static int greatest_suite_pre(const char *suite_name) { \ 877 | struct greatest_prng *p = &greatest_info.prng[0]; \ 878 | if (!greatest_name_match(suite_name, greatest_info.suite_filter, 1) \ 879 | || (GREATEST_FAILURE_ABORT())) { return 0; } \ 880 | if (p->random_order) { \ 881 | p->count++; \ 882 | if (!p->initialized || ((p->count - 1) != p->state)) { \ 883 | return 0; /* don't run this suite yet */ \ 884 | } \ 885 | } \ 886 | p->count_run++; \ 887 | update_counts_and_reset_suite(); \ 888 | GREATEST_FPRINTF(GREATEST_STDOUT, "\n* Suite %s:\n", suite_name); \ 889 | GREATEST_SET_TIME(greatest_info.suite.pre_suite); \ 890 | return 1; \ 891 | } \ 892 | \ 893 | static void greatest_suite_post(void) { \ 894 | GREATEST_SET_TIME(greatest_info.suite.post_suite); \ 895 | report_suite(); \ 896 | } \ 897 | \ 898 | static void greatest_run_suite(greatest_suite_cb *suite_cb, \ 899 | const char *suite_name) { \ 900 | if (greatest_suite_pre(suite_name)) { \ 901 | 
suite_cb(); \ 902 | greatest_suite_post(); \ 903 | } \ 904 | } \ 905 | \ 906 | int greatest_do_assert_equal_t(const void *expd, const void *got, \ 907 | greatest_type_info *type_info, void *udata) { \ 908 | int eq = 0; \ 909 | if (type_info == NULL || type_info->equal == NULL) { return 0; } \ 910 | eq = type_info->equal(expd, got, udata); \ 911 | if (!eq) { \ 912 | if (type_info->print != NULL) { \ 913 | GREATEST_FPRINTF(GREATEST_STDOUT, "\nExpected: "); \ 914 | (void)type_info->print(expd, udata); \ 915 | GREATEST_FPRINTF(GREATEST_STDOUT, "\n Got: "); \ 916 | (void)type_info->print(got, udata); \ 917 | GREATEST_FPRINTF(GREATEST_STDOUT, "\n"); \ 918 | } \ 919 | } \ 920 | return eq; \ 921 | } \ 922 | \ 923 | static void greatest_usage(const char *name) { \ 924 | GREATEST_FPRINTF(GREATEST_STDOUT, \ 925 | "Usage: %s [-hlfavex] [-s SUITE] [-t TEST] [-x EXCLUDE]\n" \ 926 | " -h, --help print this Help\n" \ 927 | " -l List suites and tests, then exit (dry run)\n" \ 928 | " -f Stop runner after first failure\n" \ 929 | " -a Abort on first failure (implies -f)\n" \ 930 | " -v Verbose output\n" \ 931 | " -s SUITE only run suites containing substring SUITE\n" \ 932 | " -t TEST only run tests containing substring TEST\n" \ 933 | " -e only run exact name match for -s or -t\n" \ 934 | " -x EXCLUDE exclude tests containing substring EXCLUDE\n", \ 935 | name); \ 936 | } \ 937 | \ 938 | static void greatest_parse_options(int argc, char **argv) { \ 939 | int i = 0; \ 940 | for (i = 1; i < argc; i++) { \ 941 | if (argv[i][0] == '-') { \ 942 | char f = argv[i][1]; \ 943 | if ((f == 's' || f == 't' || f == 'x') && argc <= i + 1) { \ 944 | greatest_usage(argv[0]); exit(EXIT_FAILURE); \ 945 | } \ 946 | switch (f) { \ 947 | case 's': /* suite name filter */ \ 948 | greatest_set_suite_filter(argv[i + 1]); i++; break; \ 949 | case 't': /* test name filter */ \ 950 | greatest_set_test_filter(argv[i + 1]); i++; break; \ 951 | case 'x': /* test name exclusion */ \ 952 | 
greatest_set_test_exclude(argv[i + 1]); i++; break; \ 953 | case 'e': /* exact name match */ \ 954 | greatest_set_exact_name_match(); break; \ 955 | case 'f': /* first fail flag */ \ 956 | greatest_stop_at_first_fail(); break; \ 957 | case 'a': /* abort() on fail flag */ \ 958 | greatest_abort_on_fail(); break; \ 959 | case 'l': /* list only (dry run) */ \ 960 | greatest_list_only(); break; \ 961 | case 'v': /* first fail flag */ \ 962 | greatest_info.verbosity++; break; \ 963 | case 'h': /* help */ \ 964 | greatest_usage(argv[0]); exit(EXIT_SUCCESS); \ 965 | default: \ 966 | case '-': \ 967 | if (0 == strncmp("--help", argv[i], 6)) { \ 968 | greatest_usage(argv[0]); exit(EXIT_SUCCESS); \ 969 | } else if (0 == strcmp("--", argv[i])) { \ 970 | return; /* ignore following arguments */ \ 971 | } \ 972 | GREATEST_FPRINTF(GREATEST_STDOUT, \ 973 | "Unknown argument '%s'\n", argv[i]); \ 974 | greatest_usage(argv[0]); \ 975 | exit(EXIT_FAILURE); \ 976 | } \ 977 | } \ 978 | } \ 979 | } \ 980 | \ 981 | int greatest_all_passed(void) { return (greatest_info.failed == 0); } \ 982 | \ 983 | void greatest_set_test_filter(const char *filter) { \ 984 | greatest_info.test_filter = filter; \ 985 | } \ 986 | \ 987 | void greatest_set_test_exclude(const char *filter) { \ 988 | greatest_info.test_exclude = filter; \ 989 | } \ 990 | \ 991 | void greatest_set_suite_filter(const char *filter) { \ 992 | greatest_info.suite_filter = filter; \ 993 | } \ 994 | \ 995 | void greatest_set_exact_name_match(void) { \ 996 | greatest_info.exact_name_match = 1; \ 997 | } \ 998 | \ 999 | void greatest_stop_at_first_fail(void) { \ 1000 | greatest_set_flag(GREATEST_FLAG_FIRST_FAIL); \ 1001 | } \ 1002 | \ 1003 | void greatest_abort_on_fail(void) { \ 1004 | greatest_set_flag(GREATEST_FLAG_ABORT_ON_FAIL); \ 1005 | } \ 1006 | \ 1007 | void greatest_list_only(void) { \ 1008 | greatest_set_flag(GREATEST_FLAG_LIST_ONLY); \ 1009 | } \ 1010 | \ 1011 | void greatest_get_report(struct greatest_report_t *report) { \ 
1012 | if (report) { \ 1013 | report->passed = greatest_info.passed; \ 1014 | report->failed = greatest_info.failed; \ 1015 | report->skipped = greatest_info.skipped; \ 1016 | report->assertions = greatest_info.assertions; \ 1017 | } \ 1018 | } \ 1019 | \ 1020 | unsigned int greatest_get_verbosity(void) { \ 1021 | return greatest_info.verbosity; \ 1022 | } \ 1023 | \ 1024 | void greatest_set_verbosity(unsigned int verbosity) { \ 1025 | greatest_info.verbosity = (unsigned char)verbosity; \ 1026 | } \ 1027 | \ 1028 | void greatest_set_flag(greatest_flag_t flag) { \ 1029 | greatest_info.flags = (unsigned char)(greatest_info.flags | flag); \ 1030 | } \ 1031 | \ 1032 | void greatest_set_test_suffix(const char *suffix) { \ 1033 | greatest_info.name_suffix = suffix; \ 1034 | } \ 1035 | \ 1036 | void GREATEST_SET_SETUP_CB(greatest_setup_cb *cb, void *udata) { \ 1037 | greatest_info.setup = cb; \ 1038 | greatest_info.setup_udata = udata; \ 1039 | } \ 1040 | \ 1041 | void GREATEST_SET_TEARDOWN_CB(greatest_teardown_cb *cb, void *udata) { \ 1042 | greatest_info.teardown = cb; \ 1043 | greatest_info.teardown_udata = udata; \ 1044 | } \ 1045 | \ 1046 | static int greatest_string_equal_cb(const void *expd, const void *got, \ 1047 | void *udata) { \ 1048 | size_t *size = (size_t *)udata; \ 1049 | return (size != NULL \ 1050 | ? (0 == strncmp((const char *)expd, (const char *)got, *size)) \ 1051 | : (0 == strcmp((const char *)expd, (const char *)got))); \ 1052 | } \ 1053 | \ 1054 | static int greatest_string_printf_cb(const void *t, void *udata) { \ 1055 | (void)udata; /* note: does not check \0 termination. 
*/ \ 1056 | return GREATEST_FPRINTF(GREATEST_STDOUT, "%s", (const char *)t); \ 1057 | } \ 1058 | \ 1059 | greatest_type_info greatest_type_info_string = { \ 1060 | greatest_string_equal_cb, greatest_string_printf_cb, \ 1061 | }; \ 1062 | \ 1063 | static int greatest_memory_equal_cb(const void *expd, const void *got, \ 1064 | void *udata) { \ 1065 | greatest_memory_cmp_env *env = (greatest_memory_cmp_env *)udata; \ 1066 | return (0 == memcmp(expd, got, env->size)); \ 1067 | } \ 1068 | \ 1069 | /* Hexdump raw memory, with differences highlighted */ \ 1070 | static int greatest_memory_printf_cb(const void *t, void *udata) { \ 1071 | greatest_memory_cmp_env *env = (greatest_memory_cmp_env *)udata; \ 1072 | const unsigned char *buf = (const unsigned char *)t; \ 1073 | unsigned char diff_mark = ' '; \ 1074 | FILE *out = GREATEST_STDOUT; \ 1075 | size_t i, line_i, line_len = 0; \ 1076 | int len = 0; /* format hexdump with differences highlighted */ \ 1077 | for (i = 0; i < env->size; i+= line_len) { \ 1078 | diff_mark = ' '; \ 1079 | line_len = env->size - i; \ 1080 | if (line_len > 16) { line_len = 16; } \ 1081 | for (line_i = i; line_i < i + line_len; line_i++) { \ 1082 | if (env->exp[line_i] != env->got[line_i]) diff_mark = 'X'; \ 1083 | } \ 1084 | len += GREATEST_FPRINTF(out, "\n%04x %c ", \ 1085 | (unsigned int)i, diff_mark); \ 1086 | for (line_i = i; line_i < i + line_len; line_i++) { \ 1087 | int m = env->exp[line_i] == env->got[line_i]; /* match? */ \ 1088 | len += GREATEST_FPRINTF(out, "%02x%c", \ 1089 | buf[line_i], m ? ' ' : '<'); \ 1090 | } \ 1091 | for (line_i = 0; line_i < 16 - line_len; line_i++) { \ 1092 | len += GREATEST_FPRINTF(out, " "); \ 1093 | } \ 1094 | GREATEST_FPRINTF(out, " "); \ 1095 | for (line_i = i; line_i < i + line_len; line_i++) { \ 1096 | unsigned char c = buf[line_i]; \ 1097 | len += GREATEST_FPRINTF(out, "%c", isprint(c) ? 
c : '.'); \ 1098 | } \ 1099 | } \ 1100 | len += GREATEST_FPRINTF(out, "\n"); \ 1101 | return len; \ 1102 | } \ 1103 | \ 1104 | void greatest_prng_init_first_pass(int id) { \ 1105 | greatest_info.prng[id].random_order = 1; \ 1106 | greatest_info.prng[id].count_run = 0; \ 1107 | } \ 1108 | \ 1109 | int greatest_prng_init_second_pass(int id, unsigned long seed) { \ 1110 | struct greatest_prng *p = &greatest_info.prng[id]; \ 1111 | if (p->count == 0) { return 0; } \ 1112 | p->count_ceil = p->count; \ 1113 | for (p->m = 1; p->m < p->count; p->m <<= 1) {} \ 1114 | p->state = seed & 0x1fffffff; /* only use lower 29 bits */ \ 1115 | p->a = 4LU * p->state; /* to avoid overflow when */ \ 1116 | p->a = (p->a ? p->a : 4) | 1; /* multiplied by 4 */ \ 1117 | p->c = 2147483647; /* and so p->c ((2 ** 31) - 1) is */ \ 1118 | p->initialized = 1; /* always relatively prime to p->a. */ \ 1119 | fprintf(stderr, "init_second_pass: a %lu, c %lu, state %lu\n", \ 1120 | p->a, p->c, p->state); \ 1121 | return 1; \ 1122 | } \ 1123 | \ 1124 | /* Step the pseudorandom number generator until its state reaches \ 1125 | * another test ID between 0 and the test count. \ 1126 | * This use a linear congruential pseudorandom number generator, \ 1127 | * with the power-of-two ceiling of the test count as the modulus, the \ 1128 | * masked seed as the multiplier, and a prime as the increment. For \ 1129 | * each generated value < the test count, run the corresponding test. \ 1130 | * This will visit all IDs 0 <= X < mod once before repeating, \ 1131 | * with a starting position chosen based on the initial seed. \ 1132 | * For details, see: Knuth, The Art of Computer Programming \ 1133 | * Volume. 2, section 3.2.1. 
*/ \ 1134 | void greatest_prng_step(int id) { \ 1135 | struct greatest_prng *p = &greatest_info.prng[id]; \ 1136 | do { \ 1137 | p->state = ((p->a * p->state) + p->c) & (p->m - 1); \ 1138 | } while (p->state >= p->count_ceil); \ 1139 | } \ 1140 | \ 1141 | void GREATEST_INIT(void) { \ 1142 | /* Suppress unused function warning if features aren't used */ \ 1143 | (void)greatest_run_suite; \ 1144 | (void)greatest_parse_options; \ 1145 | (void)greatest_prng_step; \ 1146 | (void)greatest_prng_init_first_pass; \ 1147 | (void)greatest_prng_init_second_pass; \ 1148 | (void)greatest_set_test_suffix; \ 1149 | \ 1150 | memset(&greatest_info, 0, sizeof(greatest_info)); \ 1151 | greatest_info.width = GREATEST_DEFAULT_WIDTH; \ 1152 | GREATEST_SET_TIME(greatest_info.begin); \ 1153 | } \ 1154 | \ 1155 | /* Report passes, failures, skipped tests, the number of \ 1156 | * assertions, and the overall run time. */ \ 1157 | void GREATEST_PRINT_REPORT(void) { \ 1158 | if (!GREATEST_LIST_ONLY()) { \ 1159 | update_counts_and_reset_suite(); \ 1160 | GREATEST_SET_TIME(greatest_info.end); \ 1161 | GREATEST_FPRINTF(GREATEST_STDOUT, \ 1162 | "\nTotal: %u test%s", \ 1163 | greatest_info.tests_run, \ 1164 | greatest_info.tests_run == 1 ? "" : "s"); \ 1165 | GREATEST_CLOCK_DIFF(greatest_info.begin, \ 1166 | greatest_info.end); \ 1167 | GREATEST_FPRINTF(GREATEST_STDOUT, ", %u assertion%s\n", \ 1168 | greatest_info.assertions, \ 1169 | greatest_info.assertions == 1 ? "" : "s"); \ 1170 | GREATEST_FPRINTF(GREATEST_STDOUT, \ 1171 | "Pass: %u, fail: %u, skip: %u.\n", \ 1172 | greatest_info.passed, \ 1173 | greatest_info.failed, greatest_info.skipped); \ 1174 | } \ 1175 | } \ 1176 | \ 1177 | greatest_type_info greatest_type_info_memory = { \ 1178 | greatest_memory_equal_cb, greatest_memory_printf_cb, \ 1179 | }; \ 1180 | \ 1181 | greatest_run_info greatest_info 1182 | 1183 | /* Handle command-line arguments, etc. 
*/
/* Expands to one statement that calls GREATEST_INIT() and then
 * greatest_parse_options(argc, argv). The identifiers argc and argv are
 * used exactly as written, so both must be in scope where the macro is
 * expanded. */
#define GREATEST_MAIN_BEGIN() \
    do { \
        GREATEST_INIT(); \
        greatest_parse_options(argc, argv); \
    } while (0)

/* Report results, exit with exit status based on results.
 * The expansion contains a `return` statement: the enclosing function
 * returns EXIT_SUCCESS when greatest_all_passed() is true and
 * EXIT_FAILURE otherwise. */
#define GREATEST_MAIN_END() \
    do { \
        GREATEST_PRINT_REPORT(); \
        return (greatest_all_passed() ? EXIT_SUCCESS : EXIT_FAILURE); \
    } while (0)

/* Make abbreviations without the GREATEST_ prefix for the
 * most commonly used symbols. */
#if GREATEST_USE_ABBREVS
/* test / suite declaration and execution */
#define TEST GREATEST_TEST
#define SUITE GREATEST_SUITE
#define SUITE_EXTERN GREATEST_SUITE_EXTERN
#define RUN_TEST GREATEST_RUN_TEST
#define RUN_TEST1 GREATEST_RUN_TEST1
#define RUN_SUITE GREATEST_RUN_SUITE
#define IGNORE_TEST GREATEST_IGNORE_TEST
/* assertions */
#define ASSERT GREATEST_ASSERT
#define ASSERTm GREATEST_ASSERTm
#define ASSERT_FALSE GREATEST_ASSERT_FALSE
#define ASSERT_EQ GREATEST_ASSERT_EQ
#define ASSERT_NEQ GREATEST_ASSERT_NEQ
#define ASSERT_GT GREATEST_ASSERT_GT
#define ASSERT_GTE GREATEST_ASSERT_GTE
#define ASSERT_LT GREATEST_ASSERT_LT
#define ASSERT_LTE GREATEST_ASSERT_LTE
#define ASSERT_EQ_FMT GREATEST_ASSERT_EQ_FMT
#define ASSERT_IN_RANGE GREATEST_ASSERT_IN_RANGE
#define ASSERT_EQUAL_T GREATEST_ASSERT_EQUAL_T
#define ASSERT_STR_EQ GREATEST_ASSERT_STR_EQ
#define ASSERT_STRN_EQ GREATEST_ASSERT_STRN_EQ
#define ASSERT_MEM_EQ GREATEST_ASSERT_MEM_EQ
#define ASSERT_ENUM_EQ GREATEST_ASSERT_ENUM_EQ
/* the same assertions under their m-suffixed names */
#define ASSERT_FALSEm GREATEST_ASSERT_FALSEm
#define ASSERT_EQm GREATEST_ASSERT_EQm
#define ASSERT_NEQm GREATEST_ASSERT_NEQm
#define ASSERT_GTm GREATEST_ASSERT_GTm
#define ASSERT_GTEm GREATEST_ASSERT_GTEm
#define ASSERT_LTm GREATEST_ASSERT_LTm
#define ASSERT_LTEm GREATEST_ASSERT_LTEm
#define ASSERT_EQ_FMTm GREATEST_ASSERT_EQ_FMTm
1231 | #define ASSERT_IN_RANGEm GREATEST_ASSERT_IN_RANGEm 1232 | #define ASSERT_EQUAL_Tm GREATEST_ASSERT_EQUAL_Tm 1233 | #define ASSERT_STR_EQm GREATEST_ASSERT_STR_EQm 1234 | #define ASSERT_STRN_EQm GREATEST_ASSERT_STRN_EQm 1235 | #define ASSERT_MEM_EQm GREATEST_ASSERT_MEM_EQm 1236 | #define ASSERT_ENUM_EQm GREATEST_ASSERT_ENUM_EQm 1237 | #define PASS GREATEST_PASS 1238 | #define FAIL GREATEST_FAIL 1239 | #define SKIP GREATEST_SKIP 1240 | #define PASSm GREATEST_PASSm 1241 | #define FAILm GREATEST_FAILm 1242 | #define SKIPm GREATEST_SKIPm 1243 | #define SET_SETUP GREATEST_SET_SETUP_CB 1244 | #define SET_TEARDOWN GREATEST_SET_TEARDOWN_CB 1245 | #define CHECK_CALL GREATEST_CHECK_CALL 1246 | #define SHUFFLE_TESTS GREATEST_SHUFFLE_TESTS 1247 | #define SHUFFLE_SUITES GREATEST_SHUFFLE_SUITES 1248 | 1249 | #ifdef GREATEST_VA_ARGS 1250 | #define RUN_TESTp GREATEST_RUN_TESTp 1251 | #endif 1252 | 1253 | #if GREATEST_USE_LONGJMP 1254 | #define ASSERT_OR_LONGJMP GREATEST_ASSERT_OR_LONGJMP 1255 | #define ASSERT_OR_LONGJMPm GREATEST_ASSERT_OR_LONGJMPm 1256 | #define FAIL_WITH_LONGJMP GREATEST_FAIL_WITH_LONGJMP 1257 | #define FAIL_WITH_LONGJMPm GREATEST_FAIL_WITH_LONGJMPm 1258 | #endif 1259 | 1260 | #endif /* USE_ABBREVS */ 1261 | 1262 | #if defined(__cplusplus) && !defined(GREATEST_NO_EXTERN_CPLUSPLUS) 1263 | } 1264 | #endif 1265 | 1266 | #endif 1267 | -------------------------------------------------------------------------------- /tests/math_intrinsics.c: -------------------------------------------------------------------------------- 1 | #define __MATH__INTRINSICS__IMPLEMENTATION__ 2 | #include "../math_intrinsics.h" 3 | 4 | -------------------------------------------------------------------------------- /tests/sokol_time.h: -------------------------------------------------------------------------------- 1 | #if defined(SOKOL_IMPL) && !defined(SOKOL_TIME_IMPL) 2 | #define SOKOL_TIME_IMPL 3 | #endif 4 | #ifndef SOKOL_TIME_INCLUDED 5 | /* 6 | sokol_time.h -- simple 
cross-platform time measurement 7 | 8 | Project URL: https://github.com/floooh/sokol 9 | 10 | Do this: 11 | #define SOKOL_IMPL or 12 | #define SOKOL_TIME_IMPL 13 | before you include this file in *one* C or C++ file to create the 14 | implementation. 15 | 16 | Optionally provide the following defines with your own implementations: 17 | SOKOL_ASSERT(c) - your own assert macro (default: assert(c)) 18 | SOKOL_TIME_API_DECL - public function declaration prefix (default: extern) 19 | SOKOL_API_DECL - same as SOKOL_TIME_API_DECL 20 | SOKOL_API_IMPL - public function implementation prefix (default: -) 21 | 22 | If sokol_time.h is compiled as a DLL, define the following before 23 | including the declaration or implementation: 24 | 25 | SOKOL_DLL 26 | 27 | On Windows, SOKOL_DLL will define SOKOL_TIME_API_DECL as __declspec(dllexport) 28 | or __declspec(dllimport) as needed. 29 | 30 | void stm_setup(); 31 | Call once before any other functions to initialize sokol_time 32 | (this calls for instance QueryPerformanceFrequency on Windows) 33 | 34 | uint64_t stm_now(); 35 | Get current point in time in unspecified 'ticks'. The value that 36 | is returned has no relation to the 'wall-clock' time and is 37 | not in a specific time unit, it is only useful to compute 38 | time differences. 39 | 40 | uint64_t stm_diff(uint64_t new, uint64_t old); 41 | Computes the time difference between new and old. This will always 42 | return a positive, non-zero value. 43 | 44 | uint64_t stm_since(uint64_t start); 45 | Takes the current time, and returns the elapsed time since start 46 | (this is a shortcut for "stm_diff(stm_now(), start)") 47 | 48 | uint64_t stm_laptime(uint64_t* last_time); 49 | This is useful for measuring frame time and other recurring 50 | events. It takes the current time, returns the time difference 51 | to the value in last_time, and stores the current time in 52 | last_time for the next call. 
If the value in last_time is 0, 53 | the return value will be zero (this usually happens on the 54 | very first call). 55 | 56 | uint64_t stm_round_to_common_refresh_rate(uint64_t duration) 57 | This oddly named function takes a measured frame time and 58 | returns the closest "nearby" common display refresh rate frame duration 59 | in ticks. If the input duration isn't close to any common display 60 | refresh rate, the input duration will be returned unchanged as a fallback. 61 | The main purpose of this function is to remove jitter/inaccuracies from 62 | measured frame times, and instead use the display refresh rate as 63 | frame duration. 64 | NOTE: for more robust frame timing, consider using the 65 | sokol_app.h function sapp_frame_duration() 66 | 67 | Use the following functions to convert a duration in ticks into 68 | useful time units: 69 | 70 | double stm_sec(uint64_t ticks); 71 | double stm_ms(uint64_t ticks); 72 | double stm_us(uint64_t ticks); 73 | double stm_ns(uint64_t ticks); 74 | Converts a tick value into seconds, milliseconds, microseconds 75 | or nanoseconds. Note that not all platforms will have nanosecond 76 | or even microsecond precision. 77 | 78 | Uses the following time measurement functions under the hood: 79 | 80 | Windows: QueryPerformanceFrequency() / QueryPerformanceCounter() 81 | MacOS/iOS: mach_absolute_time() 82 | emscripten: emscripten_get_now() 83 | Linux+others: clock_gettime(CLOCK_MONOTONIC) 84 | 85 | zlib/libpng license 86 | 87 | Copyright (c) 2018 Andre Weissflog 88 | 89 | This software is provided 'as-is', without any express or implied warranty. 90 | In no event will the authors be held liable for any damages arising from the 91 | use of this software. 92 | 93 | Permission is granted to anyone to use this software for any purpose, 94 | including commercial applications, and to alter it and redistribute it 95 | freely, subject to the following restrictions: 96 | 97 | 1. 
The origin of this software must not be misrepresented; you must not 98 | claim that you wrote the original software. If you use this software in a 99 | product, an acknowledgment in the product documentation would be 100 | appreciated but is not required. 101 | 102 | 2. Altered source versions must be plainly marked as such, and must not 103 | be misrepresented as being the original software. 104 | 105 | 3. This notice may not be removed or altered from any source 106 | distribution. 107 | */ 108 | #define SOKOL_TIME_INCLUDED (1) 109 | #include 110 | 111 | #if defined(SOKOL_API_DECL) && !defined(SOKOL_TIME_API_DECL) 112 | #define SOKOL_TIME_API_DECL SOKOL_API_DECL 113 | #endif 114 | #ifndef SOKOL_TIME_API_DECL 115 | #if defined(_WIN32) && defined(SOKOL_DLL) && defined(SOKOL_TIME_IMPL) 116 | #define SOKOL_TIME_API_DECL __declspec(dllexport) 117 | #elif defined(_WIN32) && defined(SOKOL_DLL) 118 | #define SOKOL_TIME_API_DECL __declspec(dllimport) 119 | #else 120 | #define SOKOL_TIME_API_DECL extern 121 | #endif 122 | #endif 123 | 124 | #ifdef __cplusplus 125 | extern "C" { 126 | #endif 127 | 128 | SOKOL_TIME_API_DECL void stm_setup(void); 129 | SOKOL_TIME_API_DECL uint64_t stm_now(void); 130 | SOKOL_TIME_API_DECL uint64_t stm_diff(uint64_t new_ticks, uint64_t old_ticks); 131 | SOKOL_TIME_API_DECL uint64_t stm_since(uint64_t start_ticks); 132 | SOKOL_TIME_API_DECL uint64_t stm_laptime(uint64_t* last_time); 133 | SOKOL_TIME_API_DECL uint64_t stm_round_to_common_refresh_rate(uint64_t frame_ticks); 134 | SOKOL_TIME_API_DECL double stm_sec(uint64_t ticks); 135 | SOKOL_TIME_API_DECL double stm_ms(uint64_t ticks); 136 | SOKOL_TIME_API_DECL double stm_us(uint64_t ticks); 137 | SOKOL_TIME_API_DECL double stm_ns(uint64_t ticks); 138 | 139 | #ifdef __cplusplus 140 | } /* extern "C" */ 141 | #endif 142 | #endif // SOKOL_TIME_INCLUDED 143 | 144 | /*-- IMPLEMENTATION ----------------------------------------------------------*/ 145 | #ifdef SOKOL_TIME_IMPL 146 | #define 
SOKOL_TIME_IMPL_INCLUDED (1) 147 | #include /* memset */ 148 | 149 | #ifndef SOKOL_API_IMPL 150 | #define SOKOL_API_IMPL 151 | #endif 152 | #ifndef SOKOL_ASSERT 153 | #include 154 | #define SOKOL_ASSERT(c) assert(c) 155 | #endif 156 | #ifndef _SOKOL_PRIVATE 157 | #if defined(__GNUC__) || defined(__clang__) 158 | #define _SOKOL_PRIVATE __attribute__((unused)) static 159 | #else 160 | #define _SOKOL_PRIVATE static 161 | #endif 162 | #endif 163 | 164 | #if defined(_WIN32) 165 | #ifndef WIN32_LEAN_AND_MEAN 166 | #define WIN32_LEAN_AND_MEAN 167 | #endif 168 | #include 169 | typedef struct { 170 | uint32_t initialized; 171 | LARGE_INTEGER freq; 172 | LARGE_INTEGER start; 173 | } _stm_state_t; 174 | #elif defined(__APPLE__) && defined(__MACH__) 175 | #include 176 | typedef struct { 177 | uint32_t initialized; 178 | mach_timebase_info_data_t timebase; 179 | uint64_t start; 180 | } _stm_state_t; 181 | #elif defined(__EMSCRIPTEN__) 182 | #include 183 | typedef struct { 184 | uint32_t initialized; 185 | double start; 186 | } _stm_state_t; 187 | #else /* anything else, this will need more care for non-Linux platforms */ 188 | #ifdef ESP8266 189 | // On the ESP8266, clock_gettime ignores the first argument and CLOCK_MONOTONIC isn't defined 190 | #define CLOCK_MONOTONIC 0 191 | #endif 192 | #include 193 | typedef struct { 194 | uint32_t initialized; 195 | uint64_t start; 196 | } _stm_state_t; 197 | #endif 198 | static _stm_state_t _stm; 199 | 200 | /* prevent 64-bit overflow when computing relative timestamp 201 | see https://gist.github.com/jspohr/3dc4f00033d79ec5bdaf67bc46c813e3 202 | */ 203 | #if defined(_WIN32) || (defined(__APPLE__) && defined(__MACH__)) 204 | _SOKOL_PRIVATE int64_t _stm_int64_muldiv(int64_t value, int64_t numer, int64_t denom) { 205 | int64_t q = value / denom; 206 | int64_t r = value % denom; 207 | return q * numer + r * numer / denom; 208 | } 209 | #endif 210 | 211 | SOKOL_API_IMPL void stm_setup(void) { 212 | memset(&_stm, 0, sizeof(_stm)); 213 | 
_stm.initialized = 0xABCDABCD; 214 | #if defined(_WIN32) 215 | QueryPerformanceFrequency(&_stm.freq); 216 | QueryPerformanceCounter(&_stm.start); 217 | #elif defined(__APPLE__) && defined(__MACH__) 218 | mach_timebase_info(&_stm.timebase); 219 | _stm.start = mach_absolute_time(); 220 | #elif defined(__EMSCRIPTEN__) 221 | _stm.start = emscripten_get_now(); 222 | #else 223 | struct timespec ts; 224 | clock_gettime(CLOCK_MONOTONIC, &ts); 225 | _stm.start = (uint64_t)ts.tv_sec*1000000000 + (uint64_t)ts.tv_nsec; 226 | #endif 227 | } 228 | 229 | SOKOL_API_IMPL uint64_t stm_now(void) { 230 | SOKOL_ASSERT(_stm.initialized == 0xABCDABCD); 231 | uint64_t now; 232 | #if defined(_WIN32) 233 | LARGE_INTEGER qpc_t; 234 | QueryPerformanceCounter(&qpc_t); 235 | now = (uint64_t) _stm_int64_muldiv(qpc_t.QuadPart - _stm.start.QuadPart, 1000000000, _stm.freq.QuadPart); 236 | #elif defined(__APPLE__) && defined(__MACH__) 237 | const uint64_t mach_now = mach_absolute_time() - _stm.start; 238 | now = (uint64_t) _stm_int64_muldiv((int64_t)mach_now, (int64_t)_stm.timebase.numer, (int64_t)_stm.timebase.denom); 239 | #elif defined(__EMSCRIPTEN__) 240 | double js_now = emscripten_get_now() - _stm.start; 241 | now = (uint64_t) (js_now * 1000000.0); 242 | #else 243 | struct timespec ts; 244 | clock_gettime(CLOCK_MONOTONIC, &ts); 245 | now = ((uint64_t)ts.tv_sec*1000000000 + (uint64_t)ts.tv_nsec) - _stm.start; 246 | #endif 247 | return now; 248 | } 249 | 250 | SOKOL_API_IMPL uint64_t stm_diff(uint64_t new_ticks, uint64_t old_ticks) { 251 | if (new_ticks > old_ticks) { 252 | return new_ticks - old_ticks; 253 | } 254 | else { 255 | return 1; 256 | } 257 | } 258 | 259 | SOKOL_API_IMPL uint64_t stm_since(uint64_t start_ticks) { 260 | return stm_diff(stm_now(), start_ticks); 261 | } 262 | 263 | SOKOL_API_IMPL uint64_t stm_laptime(uint64_t* last_time) { 264 | SOKOL_ASSERT(last_time); 265 | uint64_t dt = 0; 266 | uint64_t now = stm_now(); 267 | if (0 != *last_time) { 268 | dt = stm_diff(now, 
*last_time); 269 | } 270 | *last_time = now; 271 | return dt; 272 | } 273 | 274 | // first number is frame duration in ns, second number is tolerance in ns, 275 | // the resulting min/max values must not overlap! 276 | static const uint64_t _stm_refresh_rates[][2] = { 277 | { 16666667, 1000000 }, // 60 Hz: 16.6667 +- 1ms 278 | { 13888889, 250000 }, // 72 Hz: 13.8889 +- 0.25ms 279 | { 13333333, 250000 }, // 75 Hz: 13.3333 +- 0.25ms 280 | { 11764706, 250000 }, // 85 Hz: 11.7647 +- 0.25 281 | { 11111111, 250000 }, // 90 Hz: 11.1111 +- 0.25ms 282 | { 10000000, 500000 }, // 100 Hz: 10.0000 +- 0.5ms 283 | { 8333333, 500000 }, // 120 Hz: 8.3333 +- 0.5ms 284 | { 6944445, 500000 }, // 144 Hz: 6.9445 +- 0.5ms 285 | { 4166667, 1000000 }, // 240 Hz: 4.1666 +- 1ms 286 | { 0, 0 }, // keep the last element always at zero 287 | }; 288 | 289 | SOKOL_API_IMPL uint64_t stm_round_to_common_refresh_rate(uint64_t ticks) { 290 | uint64_t ns; 291 | int i = 0; 292 | while (0 != (ns = _stm_refresh_rates[i][0])) { 293 | uint64_t tol = _stm_refresh_rates[i][1]; 294 | if ((ticks > (ns - tol)) && (ticks < (ns + tol))) { 295 | return ns; 296 | } 297 | i++; 298 | } 299 | // fallthough: didn't fit into any buckets 300 | return ticks; 301 | } 302 | 303 | SOKOL_API_IMPL double stm_sec(uint64_t ticks) { 304 | return (double)ticks / 1000000000.0; 305 | } 306 | 307 | SOKOL_API_IMPL double stm_ms(uint64_t ticks) { 308 | return (double)ticks / 1000000.0; 309 | } 310 | 311 | SOKOL_API_IMPL double stm_us(uint64_t ticks) { 312 | return (double)ticks / 1000.0; 313 | } 314 | 315 | SOKOL_API_IMPL double stm_ns(uint64_t ticks) { 316 | return (double)ticks; 317 | } 318 | #endif /* SOKOL_TIME_IMPL */ 319 | 320 | -------------------------------------------------------------------------------- /tests/test.c: -------------------------------------------------------------------------------- 1 | #ifdef _MSC_VER 2 | #define _CRT_SECURE_NO_WARNINGS 3 | #endif 4 | 5 | #define _USE_MATH_DEFINES 6 | 7 | #include 8 | 
#include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include "greatest.h" 14 | 15 | #include "../math_intrinsics.h" 16 | 17 | //---------------------------------------------------------------------------------------------------------------------- 18 | // functions pointer definition 19 | typedef float (*reference_function)(float); 20 | typedef float (*reference_function2)(float, float); 21 | #ifdef __MATH__INTRINSICS__AVX__ 22 | typedef __m256 (*approximation_function)(__m256); 23 | typedef __m256 (*approximation_function2)(__m256, __m256); 24 | #define simd_vector_width (8) 25 | #else 26 | typedef float32x4_t (*approximation_function)(float32x4_t); 27 | typedef float32x4_t (*approximation_function2)(float32x4_t, float32x4_t); 28 | #define simd_vector_width (4) 29 | #endif 30 | 31 | //---------------------------------------------------------------------------------------------------------------------- 32 | // generic unit test 33 | TEST generic_test(reference_function ref, approximation_function approx, float range_min, float range_max, float epsilon, uint32_t num_elements, bool relative_error, const char* name) 34 | { 35 | float* input = (float*) malloc(num_elements * sizeof(float)); 36 | float* result = (float*) malloc(num_elements * sizeof(float)); 37 | float step = ((range_max - range_min) / (float) (num_elements-1)); 38 | uint32_t num_vectors = num_elements / simd_vector_width; 39 | 40 | for(uint32_t i=0; i