├── .github
    └── workflows
    │   └── cmake-multi-platform.yml
├── .gitignore
├── LICENSE
├── README.md
├── math_intrinsics.h
└── tests
    ├── CMakeLists.txt
    ├── benchmark.c
    ├── greatest.h
    ├── math_intrinsics.c
    ├── sokol_time.h
    └── test.c


/.github/workflows/cmake-multi-platform.yml:
--------------------------------------------------------------------------------
 1 | name: C/C++ CI
 2 | 
 3 | on:
 4 |   push:
 5 |     branches: [ "main" ]
 6 |   pull_request:
 7 |     branches: [ "main" ]
 8 | 
 9 | jobs:
10 |   build-ubuntu-clang:
11 |     name: ubuntu-clang
12 |     runs-on: ubuntu-latest
13 | 
14 |     steps:
15 |     - uses: actions/checkout@v4  # v4 runs on Node 20; v3 depends on the deprecated Node 16 runtime
16 | 
17 |     - name: Configure CMake
18 |       run: cmake ${{github.workspace}}/tests/ -DCMAKE_C_COMPILER=clang
19 | 
20 |     - name: Build
21 |       run: cmake --build ${{github.workspace}}/
22 | 
23 |     - name: Test precision
24 |       working-directory: ${{github.workspace}}/
25 |       run: ./test_precision
26 | 
27 |     - name: Test fast
28 |       working-directory: ${{github.workspace}}/
29 |       run: ./test_fast
30 | 
31 |     - name: Benchmark precision
32 |       working-directory: ${{github.workspace}}/
33 |       run: ./benchmark_precision
34 | 
35 |     - name: Benchmark fast
36 |       working-directory: ${{github.workspace}}/
37 |       run: ./benchmark_fast
38 |       
39 |   build-macos:
40 |     name: macos
41 |     runs-on: macos-latest
42 | 
43 |     steps:
44 |     - uses: actions/checkout@v4  # v4 runs on Node 20; v3 depends on the deprecated Node 16 runtime
45 | 
46 |     - name: Configure CMake
47 |       run: cmake ${{github.workspace}}/tests/
48 | 
49 |     - name: Build
50 |       run: cmake --build ${{github.workspace}}/
51 | 
52 |     - name: Test precision
53 |       working-directory: ${{github.workspace}}/
54 |       run: ./test_precision
55 | 
56 |     - name: Test fast
57 |       working-directory: ${{github.workspace}}/
58 |       run: ./test_fast
59 | 
60 |     - name: Benchmark precision
61 |       working-directory: ${{github.workspace}}/
62 |       run: ./benchmark_precision
63 | 
64 |     - name: Benchmark fast
65 |       working-directory: ${{github.workspace}}/
66 |       run: ./benchmark_fast
67 |       
68 |   build-windows:
69 |     name: windows
70 |     runs-on: windows-latest
71 | 
72 |     steps:
73 |     - uses: actions/checkout@v4  # v4 runs on Node 20; v3 depends on the deprecated Node 16 runtime
74 | 
75 |     - name: Configure CMake
76 |       working-directory: ${{github.workspace}}\tests
77 |       run: cmake .
78 | 
79 |     - name: Build
80 |       working-directory: ${{github.workspace}}\tests
81 |       run: cmake --build .
82 | 
83 |     - name: Test precision
84 |       working-directory:  ${{github.workspace}}\tests\Debug
85 |       run: ./test_precision
86 | 
87 |     - name: Test fast
88 |       working-directory:  ${{github.workspace}}\tests\Debug
89 |       run: ./test_fast
90 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | # Prerequisites
 2 | *.d
 3 | 
 4 | # Object files
 5 | *.o
 6 | *.ko
 7 | *.obj
 8 | *.elf
 9 | 
10 | # Linker output
11 | *.ilk
12 | *.map
13 | *.exp
14 | 
15 | # Precompiled Headers
16 | *.gch
17 | *.pch
18 | 
19 | # Libraries
20 | *.lib
21 | *.a
22 | *.la
23 | *.lo
24 | 
25 | # Shared objects (inc. Windows DLLs)
26 | *.dll
27 | *.so
28 | *.so.*
29 | *.dylib
30 | 
31 | # Executables
32 | *.exe
33 | *.out
34 | *.app
35 | *.i*86
36 | *.x86_64
37 | *.hex
38 | 
39 | # Debug files
40 | *.dSYM/
41 | *.su
42 | *.idb
43 | *.pdb
44 | 
45 | # Kernel Module Compile Results
46 | *.mod*
47 | *.cmd
48 | .tmp_versions/
49 | modules.order
50 | Module.symvers
51 | Mkfile.old
52 | dkms.conf
53 | 
54 | # MacOS
55 | *.DS_Store
56 | 
57 | # XCode
58 | xcuserdata/
59 | xcshareddata/
60 | 
61 | # visual studio code
62 | *.code-workspace
63 | *.vscode
64 | 
65 | # cmake
66 | CMakeLists.txt.user
67 | CMakeCache.txt
68 | CMakeFiles
69 | CMakeScripts
70 | Testing
71 | Makefile
72 | cmake_install.cmake
73 | install_manifest.txt
74 | compile_commands.json
75 | CTestTestfile.cmake
76 | _deps
77 | 
78 | # executables
79 | tests/test_fast
80 | tests/test_precision
81 | tests/benchmark_fast
82 | tests/benchmark_precision


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2024 Geolm
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # math_intrinsics
  2 | One-header-file library that implements missing transcendental math functions (cos, sin, acos, and more...) using 100% AVX/Neon instructions (no branching).
  3 | 
  4 | ### unit tests build status
  5 | [![Build Status](https://github.com/geolm/math_intrinsics/actions/workflows/cmake-multi-platform.yml/badge.svg)](https://github.com/geolm/math_intrinsics/actions)
  6 | 
  7 | # why
  8 | AVX and Neon intrinsics don't provide transcendental math functions. Of course there are already some libraries with those functions, but they are usually not free, restricted to one specific hardware platform, or of low precision. This library is super easy to integrate, has a precision close to the C math library (see below), and is released under the MIT license.
  9 | 
 10 | # how to
 11 | 
 12 | It's a one-header lib, just define the macro once in your project and include the header.
 13 | 
 14 | ```C
 15 | #define __MATH__INTRINSICS__IMPLEMENTATION__
 16 | #include "math_intrinsics.h"
 17 | ```
 18 | 
 19 | On Intel/AMD computers, you need to compile with **-mavx2**. You can also add -mfma.
 20 | On ARM-based computers nothing is required, as the lib targets AArch64.
 21 | 
 22 | 
 23 | You can define this macro to generate faster albeit less precise functions (see below for more details) :
 24 | ```C
 25 | #define __MATH_INTRINSINCS_FAST__
 26 | ```
 27 | 
 28 | # functions
 29 | 
 30 | ```C
 31 | // max error : 5.960464478e-08
 32 | __m256 mm256_cos_ps(__m256 a);
 33 | 
 34 | // max error : 5.960464478e-08
 35 | __m256 mm256_sin_ps(__m256 a);
 36 | 
 37 | // max error : 5.960464478e-08
 38 | void mm256_sincos_ps(__m256 a, __m256 *s, __m256 *c);
 39 | 
 40 | // max error : 2.384185791e-07
 41 | __m256 mm256_acos_ps(__m256 a);
 42 | 
 43 | // max error : 1.192092896e-07
 44 | __m256 mm256_asin_ps(__m256 a);
 45 | 
 46 | // max error : 1.192092896e-07
 47 | __m256 mm256_atan_ps(__m256 a);
 48 | 
 49 | // max error : 2.384185791e-07
 50 | __m256 mm256_atan2_ps(__m256 x, __m256 y);
 51 | 
 52 | // max error : 9.107976950e-08
 53 | __m256 mm256_log_ps(__m256 a);
 54 | 
 55 | // max error : 2.349663504e-07
 56 | __m256 mm256_log2_ps(__m256 x);
 57 | 
 58 | // max error : 1.108270880e-07
 59 | __m256 mm256_exp_ps(__m256 a);
 60 | 
 61 | // max error : 1.042427087e-07
 62 | __m256 mm256_exp2_ps(__m256 x);
 63 | 
 64 | // max error : 1.184910232e-07
 65 | __m256 mm256_cbrt_ps(__m256 a);
 66 | 
 67 | // max error : 9.768706377e-07
 68 | __m256 mm256_pow_ps(__m256 x, __m256 y);
 69 | ```
 70 | 
 71 | Note : the same functions are defined in NEON intrinsics style :
 72 | 
 73 | ```C
 74 | // max error : 5.960464478e-08
 75 | float32x4_t vcosq_f32(float32x4_t a);
 76 | 
 77 | // max error : 5.960464478e-08
 78 | float32x4_t vsinq_f32(float32x4_t a);
 79 | 
 80 | // max error : 5.960464478e-08
 81 | void vsincosq_f32(float32x4_t a, float32x4_t *s, float32x4_t *c);
 82 | 
 83 | // max error : 2.384185791e-07
 84 | float32x4_t vacosq_f32(float32x4_t a);
 85 | 
 86 | // max error : 1.192092896e-07
 87 | float32x4_t vasinq_f32(float32x4_t a);
 88 | 
 89 | // max error : 1.192092896e-07
 90 | float32x4_t vatanq_f32(float32x4_t a);
 91 | 
 92 | // max error : 2.384185791e-07
 93 | float32x4_t vatan2q_f32(float32x4_t x, float32x4_t y);
 94 | 
 95 | // max error : 9.107976950e-08
 96 | float32x4_t vlogq_f32(float32x4_t a);
 97 | 
 98 | // max error : 2.349663504e-07
 99 | float32x4_t vlog2q_f32(float32x4_t x);
100 | 
101 | // max error : 1.108270880e-07
102 | float32x4_t vexpq_f32(float32x4_t a);
103 | 
104 | // max error : 1.042427087e-07
105 | float32x4_t vexp2q_f32(float32x4_t a);
106 | 
107 | // max error : 1.184910232e-07
108 | float32x4_t vcbrtq_f32(float32x4_t a);
109 | 
110 | // max error : 9.768706377e-07
111 | float32x4_t vpowq_f32(float32x4_t x, float32x4_t y);
112 | 
113 | ```
114 | 
115 | # fast functions 
116 | 
117 | If you use the macro \_\_MATH_INTRINSINCS_FAST\_\_, some functions will have a bit less precision but better performance:
118 | 
119 | * sin, max_error : 2.682209015e-07 perf : ~1.5x
120 | * cos, max_error : 5.811452866e-07 perf : ~1.5x
121 | * acos, max_error : 6.520748138e-05 perf : ~1.6x
122 | * asin, max_error : 6.520736497e-05 perf : ~1.4x
123 | * exp2, max_error : 2.674510370e-06 perf : ~1.9x
124 | * pow, max_error : 8.886078831e-06 perf : ~1.9x
125 | 
126 | Check the benchmark steps of the GitHub Actions build for more details. As you can see, the precision is still good with a noticeable performance boost. IMO most programs could use the fast version.
127 | 
128 | # FAQ
129 | 
130 | ## is it fast?
131 | The goal of this library is to provide math functions with good precision, with every computation done in AVX/NEON. Performance is not the focus.
132 | 
133 | Here are the benchmark results on my old Intel Core i7 from 2018 for 1 billion operations, compared against the C standard library.
134 | 
135 | ```C
136 | benchmark : mode precision
137 | 
138 | .mm256_acos_ps: 723.730 ms	 c std func: 5408.153 ms	  ratio: 7.47x
139 | .mm256_asin_ps: 692.439 ms	 c std func: 5419.091 ms	  ratio: 7.83x
140 | .mm256_atan_ps: 733.843 ms	 c std func: 3762.987 ms	  ratio: 5.13x
141 | .mm256_cbrt_ps: 1522.731 ms	 c std func: 19559.201 ms	  ratio: 12.84x
142 | .mm256_cos_ps: 882.112 ms        c std func: 15540.117 ms	  ratio: 17.62x
143 | .mm256_sin_ps: 838.590 ms	 c std func: 15214.896 ms	  ratio: 18.14x
144 | .mm256_exp_ps: 830.130 ms	 c std func: 4399.218 ms	  ratio: 5.30x
145 | .mm256_exp2_ps: 1007.015 ms	 c std func: 2076.871 ms	  ratio: 2.06x
146 | .mm256_log_ps: 1019.277 ms	 c std func: 16832.281 ms	  ratio: 16.51x
147 | .mm256_log2_ps: 479.116 ms	 c std func: 3594.876 ms	  ratio: 7.50x
148 | ```
149 | 
150 | Don't forget: the function mm256_sincos_ps computes sine and cosine for the cost of one. You can also use the macro \_\_MATH_INTRINSINCS_FAST\_\_ (see above).
151 | 
152 | ## why AVX2 ?
153 | 
154 | In several functions this library reinterprets a float as an int to access the mantissa and exponent bits. While that is doable with AVX1 using SSE4.2, I don't see the point of not using AVX2, which has been available on CPUs since 2013.
155 | 
156 | ## does it handle all float cases (+inf, -inf, NAN) as the C math lib?
157 | 
158 | Yes, all functions (except atan2 and pow) handle +inf, -inf, NAN and other special cases the same way as the C math library (for example log(-4) returns NAN). The expected behavior is based on the documentation found at https://en.cppreference.com/w/
159 | 
160 | ## what's tested?
161 | 
162 | The unit tests cover precision and special cases (inf, nan, ...). At the moment, the Neon version is not run on GitHub but rather manually on my M1 Pro machine, as I didn't have time to set up the emulator properly.
163 | 
164 | # references
165 | 
166 | [cephes math library](https://github.com/jeremybarnes/cephes/blob/master/single/)
167 | 
168 | [simple SSE sin/cos](http://gruntthepeon.free.fr/ssemath/)
169 | 
170 | [speeding up atan2f by 50x](https://mazzo.li/posts/vectorized-atan2.html)
171 | 


--------------------------------------------------------------------------------
/math_intrinsics.h:
--------------------------------------------------------------------------------
  1 | #ifndef __MATH__INTRINSICS__H__
  2 | #define __MATH__INTRINSICS__H__
  3 | 
  4 | /*
  5 | 
  6 |     NEON/AVX transcendental math functions
  7 | 
  8 |     Documentation can be found https://github.com/Geolm/math_intrinsics/
  9 | 
 10 | */
 11 | 
 12 | #ifdef __cplusplus
 13 | extern "C" {
 14 | #endif
 15 | 
 16 | #if defined(__ARM_NEON) && defined(__ARM_NEON__)
 17 | #include <arm_neon.h>
 18 | 
 19 |     // max error : 5.960464478e-08
 20 |     float32x4_t vcosq_f32(float32x4_t a);
 21 | 
 22 |     // max error : 5.960464478e-08
 23 |     float32x4_t vsinq_f32(float32x4_t a);
 24 | 
 25 |     // max error : 5.960464478e-08
 26 |     void vsincosq_f32(float32x4_t a, float32x4_t *s, float32x4_t *c);
 27 | 
 28 |     // max error : 2.384185791e-07
 29 |     float32x4_t vacosq_f32(float32x4_t a);
 30 | 
 31 |     // max error : 1.192092896e-07
 32 |     float32x4_t vasinq_f32(float32x4_t a);
 33 | 
 34 |     // max error : 6.699562073e-05
 35 |     float32x4_t vatanq_f32(float32x4_t a);
 36 | 
 37 |     // max error : 2.384185791e-07
 38 |     float32x4_t vatan2q_f32(float32x4_t x, float32x4_t y);
 39 | 
 40 |     // max error : 4.768371582e-07
 41 |     float32x4_t vlogq_f32(float32x4_t a);
 42 | 
 43 |     // max error : 2.349663504e-07
 44 |     float32x4_t vlog2q_f32(float32x4_t x);
 45 | 
 46 |     // max error : 1.108270880e-07
 47 |     float32x4_t vexpq_f32(float32x4_t a);
 48 | 
 49 |     // max error : 1.042427087e-07
 50 |     float32x4_t vexp2q_f32(float32x4_t a);
 51 | 
 52 |     // max error : 4.768371582e-07
 53 |     float32x4_t vcbrtq_f32(float32x4_t a);
 54 | 
 55 |     // max error : 9.768706377e-07
 56 |     float32x4_t vpowq_f32(float32x4_t x, float32x4_t y);
 57 | 
 58 |     #define __MATH__INTRINSICS__NEON__
 59 | 
 60 | #else
 61 | #include <immintrin.h>
 62 | 
 63 |     // max error : 5.960464478e-08
 64 |     __m256 mm256_cos_ps(__m256 a);
 65 | 
 66 |     // max error : 5.960464478e-08
 67 |     __m256 mm256_sin_ps(__m256 a);
 68 | 
 69 |     // max error : 5.960464478e-08
 70 |     void mm256_sincos_ps(__m256 a, __m256 *s, __m256 *c);
 71 | 
 72 |     // max error : 2.384185791e-07
 73 |     __m256 mm256_acos_ps(__m256 a);
 74 | 
 75 |     // max error : 1.192092896e-07
 76 |     __m256 mm256_asin_ps(__m256 a);
 77 | 
 78 |     // max error : 6.699562073e-05
 79 |     __m256 mm256_atan_ps(__m256 a);
 80 | 
 81 |     // max error : 2.384185791e-07
 82 |     __m256 mm256_atan2_ps(__m256 x, __m256 y);
 83 | 
 84 |     // max error : 4.768371582e-07
 85 |     __m256 mm256_log_ps(__m256 a);
 86 | 
 87 |     // max error : 2.349663504e-07
 88 |     __m256 mm256_log2_ps(__m256 x);
 89 | 
 90 |     // max error : 1.108270880e-07
 91 |     __m256 mm256_exp_ps(__m256 a);
 92 | 
 93 |     // max error : 1.042427087e-07
 94 |     __m256 mm256_exp2_ps(__m256 x);
 95 | 
 96 |     // max error : 4.768371582e-07
 97 |     __m256 mm256_cbrt_ps(__m256 a);
 98 | 
 99 |     // max error : 9.768706377e-07
100 |     __m256 mm256_pow_ps(__m256 x, __m256 y);
101 | 
102 |     #define __MATH__INTRINSICS__AVX__
103 | 
104 | #endif
105 | 
106 | #ifdef __cplusplus
107 | }
108 | #endif
109 | 
110 | #endif
111 | 
112 | 
113 | #ifdef __MATH__INTRINSICS__IMPLEMENTATION__
114 | 
115 | #define SIMD_MATH_TAU (6.28318530f) // 2*pi
116 | #define SIMD_MATH_PI  (3.14159265f) // pi
117 | #define SIMD_MATH_PI2 (1.57079632f) // pi/2 (half of pi, not 2*pi)
118 | #define SIMD_MATH_PI4 (0.78539816f) // pi/4
119 | 
120 | #if defined(__ARM_NEON) && defined(__ARM_NEON__)
121 |     typedef float32x4_t simd_vector;
122 | 
123 |     static inline simd_vector simd_add(simd_vector a, simd_vector b) {return vaddq_f32(a, b);}
124 |     static inline simd_vector simd_sub(simd_vector a, simd_vector b) {return vsubq_f32(a, b);}
125 |     static inline simd_vector simd_mul(simd_vector a, simd_vector b) {return vmulq_f32(a, b);}
126 |     static inline simd_vector simd_div(simd_vector a, simd_vector b) {return vdivq_f32(a, b);}
127 |     static inline simd_vector simd_abs(simd_vector a) {return vabsq_f32(a);}
128 |     static inline simd_vector simd_fmad(simd_vector a, simd_vector b, simd_vector c) {return vfmaq_f32(c, a, b);} // a*b + c; vfmaq_f32 takes the addend as its first argument
129 |     static inline simd_vector simd_or(simd_vector a, simd_vector b) {return vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(a), vreinterpretq_u32_f32(b)));} // bitwise a | b on the raw float bits
130 |     static inline simd_vector simd_xor(simd_vector a, simd_vector b) {return vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(a), vreinterpretq_u32_f32(b)));} // bitwise a ^ b on the raw float bits
131 |     static inline simd_vector simd_and(simd_vector a, simd_vector b) {return vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(a), vreinterpretq_u32_f32(b)));} // bitwise a & b on the raw float bits
132 |     static inline simd_vector simd_andnot(simd_vector a, simd_vector b) {return vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(a), vreinterpretq_u32_f32(b)));} // bitwise a & ~b (the second operand is the one inverted)
133 |     static inline simd_vector simd_min(simd_vector a, simd_vector b) {return vminq_f32(a, b);}
134 |     static inline simd_vector simd_max(simd_vector a, simd_vector b) {return vmaxq_f32(a, b);}
135 |     static inline simd_vector simd_cmp_gt(simd_vector a, simd_vector b) {return vreinterpretq_f32_u32(vcgtq_f32(a, b));}
136 |     static inline simd_vector simd_cmp_ge(simd_vector a, simd_vector b) {return vreinterpretq_f32_u32(vcgeq_f32(a, b));}
137 |     static inline simd_vector simd_cmp_lt(simd_vector a, simd_vector b) {return vreinterpretq_f32_u32(vcltq_f32(a, b));}
138 |     static inline simd_vector simd_cmp_le(simd_vector a, simd_vector b) {return vreinterpretq_f32_u32(vcleq_f32(a, b));}
139 |     static inline simd_vector simd_cmp_eq(simd_vector a, simd_vector b) {return vreinterpretq_f32_u32(vceqq_f32(a, b));}
140 |     static inline simd_vector simd_cmp_neq(simd_vector a, simd_vector b) {return vreinterpretq_f32_u32(vmvnq_u32(vceqq_f32(a, b)));} // inverted equality mask: all-ones wherever a == b does not hold, which includes lanes with a NaN operand
141 |     static inline simd_vector simd_isnan(simd_vector a) {return simd_cmp_neq(a, a);} // only NaN compares unequal to itself
142 |     static inline simd_vector simd_select(simd_vector a, simd_vector b, simd_vector mask) {return vbslq_f32(vreinterpretq_u32_f32(mask), b, a);} // per bit: b where the mask bit is set, a where it is clear
143 |     static inline simd_vector simd_splat(float value) {return vdupq_n_f32(value);}
144 |     static inline simd_vector simd_splat_zero(void) {return vdupq_n_f32(0);}
145 |     static inline simd_vector simd_splat_positive_infinity(void) {return vreinterpretq_f32_u32(vdupq_n_u32(0x7f800000));} // bit pattern of +inf in every lane
146 |     static inline simd_vector simd_splat_negative_infinity(void) {return vreinterpretq_f32_u32(vdupq_n_u32(0xff800000));} // bit pattern of -inf in every lane
147 |     static inline simd_vector simd_sign_mask(void) {return vreinterpretq_f32_u32(vdupq_n_u32(0x80000000));} // only the sign bit set
148 |     static inline simd_vector simd_inv_sign_mask(void) {return vreinterpretq_f32_u32(vdupq_n_u32(~0x80000000));} // every bit except the sign bit
149 |     static inline simd_vector simd_abs_mask(void) {return vreinterpretq_f32_u32(vdupq_n_u32(0x7FFFFFFF));} // and-ing with this clears the sign bit
150 |     static inline simd_vector simd_min_normalized(void) {return vreinterpretq_f32_u32(vdupq_n_u32(0x00800000));} // the smallest non denormalized float number
151 |     static inline simd_vector simd_inv_mant_mask(void){return vreinterpretq_f32_u32(vdupq_n_u32(~0x7f800000));} // every bit except the 8 exponent bits, i.e. sign + fraction
152 |     static inline simd_vector simd_floor(simd_vector a) {return vrndmq_f32(a);}
153 |     static inline simd_vector simd_round(simd_vector a) {return vrndnq_f32(a);}
154 |     static inline simd_vector simd_neg(simd_vector a) {return vnegq_f32(a);}
155 |     static inline simd_vector simd_sqrt(simd_vector a) {return vsqrtq_f32(a);}
156 | 
157 |     typedef int32x4_t simd_vectori;
158 |     static inline simd_vectori simd_convert_from_float(simd_vector a) {return vcvtq_s32_f32(a);}
159 |     static inline simd_vectori simd_cast_from_float(simd_vector a) {return vreinterpretq_s32_f32(a);}
160 |     static inline simd_vector simd_convert_from_int(simd_vectori a) {return vcvtq_f32_s32(a);}
161 |     static inline simd_vector simd_cast_from_int(simd_vectori a) {return vreinterpretq_f32_s32(a);}
162 |     static inline simd_vectori simd_add_i(simd_vectori a, simd_vectori b) {return vaddq_s32(a, b);}
163 |     static inline simd_vectori simd_sub_i(simd_vectori a, simd_vectori b) {return vsubq_s32(a, b);}
164 |     static inline simd_vectori simd_mul_i(simd_vectori a, simd_vectori b) {return vmulq_s32(a, b);} // lane-wise 32-bit integer multiply; operates on the integer vector type like the other *_i helpers
165 |     static inline simd_vectori simd_splat_i(int i) {return vdupq_n_s32(i);}
166 |     static inline simd_vectori simd_splat_zero_i(void) {return vdupq_n_s32(0);}
167 |     static inline simd_vectori simd_shift_left_i(simd_vectori a, int i) {return vshlq_s32(a, vdupq_n_s32(i));} // shifts every lane left by i bits
168 |     static inline simd_vectori simd_shift_right_i(simd_vectori a, int i) {return vshlq_s32(a, vdupq_n_s32(-i));} // vshlq_s32 with a negative count shifts right; on signed lanes the shift is arithmetic (sign-extending)
169 |     static inline simd_vectori simd_and_i(simd_vectori a, simd_vectori b) {return vandq_s32(a, b);}
170 |     static inline simd_vectori simd_or_i(simd_vectori a, simd_vectori b) {return vorrq_s32(a, b);}
171 |     static inline simd_vectori simd_andnot_i(simd_vectori a, simd_vectori b) {return vbicq_s32(a, b);}
172 |     static inline simd_vectori simd_cmp_eq_i(simd_vectori a, simd_vectori b) {return vceqq_s32(a, b);}
173 |     static inline simd_vectori simd_cmp_gt_i(simd_vectori a, simd_vectori b) {return vcgtq_s32(a, b);}
174 |     static inline simd_vectori simd_abs_i(simd_vectori a) {return vabsq_s32(a);}
175 | 
176 |     #define simd_asin vasinq_f32
177 |     #define simd_atan vatanq_f32
178 |     #define simd_sincos vsincosq_f32
179 |     #define simd_sin vsinq_f32
180 |     #define simd_log vlogq_f32
181 |     #define simd_exp vexpq_f32
182 |     #define simd_log2 vlog2q_f32
183 |     #define simd_exp2 vexp2q_f32
184 | 
185 | #else
186 |     typedef __m256 simd_vector;
187 | 
188 |     static inline simd_vector simd_add(simd_vector a, simd_vector b) {return _mm256_add_ps(a, b);}
189 |     static inline simd_vector simd_sub(simd_vector a, simd_vector b) {return _mm256_sub_ps(a, b);}
190 |     static inline simd_vector simd_mul(simd_vector a, simd_vector b) {return _mm256_mul_ps(a, b);}
191 |     static inline simd_vector simd_div(simd_vector a, simd_vector b) {return _mm256_div_ps(a, b);}
192 |     static inline simd_vector simd_abs_mask(void) {return _mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF));}
193 |     static inline simd_vector simd_abs(simd_vector a) {return _mm256_and_ps(a, simd_abs_mask());}
194 |     static inline simd_vector simd_fmad(simd_vector a, simd_vector b, simd_vector c) // returns a*b + c per lane
195 |     {
196 |     #ifdef __FMA__
197 |         return _mm256_fmadd_ps(a, b, c); // fused multiply-add, used when the compiler targets FMA (e.g. -mfma)
198 |     #else
199 |         return _mm256_add_ps(_mm256_mul_ps(a, b), c); // fallback: separate multiply and add
200 |     #endif
201 |     }
202 |     static inline simd_vector simd_or(simd_vector a, simd_vector b) {return _mm256_or_ps(a, b);}
203 |     static inline simd_vector simd_and(simd_vector a, simd_vector b) {return _mm256_and_ps(a, b);}
204 |     static inline simd_vector simd_andnot(simd_vector a, simd_vector b) {return _mm256_andnot_ps(b, a);} // a & ~b; _mm256_andnot_ps inverts its FIRST argument, hence the swapped operands
205 |     static inline simd_vector simd_xor(simd_vector a, simd_vector b) {return _mm256_xor_ps(a, b);}
206 |     static inline simd_vector simd_min(simd_vector a, simd_vector b) {return _mm256_min_ps(a, b);}
207 |     static inline simd_vector simd_max(simd_vector a, simd_vector b) {return _mm256_max_ps(a, b);}
208 |     static inline simd_vector simd_select(simd_vector a, simd_vector b, simd_vector mask) {return _mm256_blendv_ps(a, b, mask);} // per lane: b where the sign bit of mask is set, otherwise a
209 |     static inline simd_vector simd_splat(float value) {return _mm256_set1_ps(value);}
210 |     static inline simd_vector simd_splat_zero(void) {return _mm256_setzero_ps();}
211 |     static inline simd_vector simd_splat_positive_infinity(void) {return _mm256_castsi256_ps(_mm256_set1_epi32(0x7f800000));}
212 |     static inline simd_vector simd_splat_negative_infinity(void) {return _mm256_castsi256_ps(_mm256_set1_epi32(0xff800000));}
213 |     static inline simd_vector simd_sign_mask(void) {return _mm256_castsi256_ps(_mm256_set1_epi32(0x80000000));}
214 |     static inline simd_vector simd_inv_sign_mask(void) {return _mm256_castsi256_ps(_mm256_set1_epi32(~0x80000000));}
215 |     static inline simd_vector simd_min_normalized(void) {return _mm256_castsi256_ps(_mm256_set1_epi32(0x00800000));} // the smallest non denormalized float number
216 |     static inline simd_vector simd_inv_mant_mask(void){return _mm256_castsi256_ps(_mm256_set1_epi32(~0x7f800000));}
217 |     static inline simd_vector simd_floor(simd_vector a) {return _mm256_floor_ps(a);}
218 |     static inline simd_vector simd_round(simd_vector a) {return _mm256_round_ps(a, _MM_FROUND_NINT);}
219 |     static inline simd_vector simd_cmp_gt(simd_vector a, simd_vector b) {return _mm256_cmp_ps(a, b, _CMP_GT_OQ);}
220 |     static inline simd_vector simd_cmp_ge(simd_vector a, simd_vector b) {return _mm256_cmp_ps(a, b, _CMP_GE_OQ);}
221 |     static inline simd_vector simd_cmp_lt(simd_vector a, simd_vector b) {return _mm256_cmp_ps(a, b, _CMP_LT_OQ);}
222 |     static inline simd_vector simd_cmp_le(simd_vector a, simd_vector b) {return _mm256_cmp_ps(a, b, _CMP_LE_OQ);}
223 |     static inline simd_vector simd_cmp_eq(simd_vector a, simd_vector b) {return _mm256_cmp_ps(a, b, _CMP_EQ_OQ);}
224 |     static inline simd_vector simd_cmp_neq(simd_vector a, simd_vector b) {return _mm256_cmp_ps(a, b, _CMP_NEQ_OQ);} // ordered compare: yields false when either operand is NaN
225 |     static inline simd_vector simd_isnan(simd_vector a) {return _mm256_cmp_ps(a, a, _CMP_NEQ_UQ);} // unordered compare of a with itself: true only for NaN lanes
226 |     static inline simd_vector simd_sqrt(simd_vector a) {return _mm256_sqrt_ps(a);}
227 |     static inline simd_vector simd_neg(simd_vector a) {return _mm256_xor_ps(a, simd_sign_mask());}
228 | 
229 |     typedef __m256i simd_vectori;
230 |     static inline simd_vectori simd_convert_from_float(simd_vector a) {return _mm256_cvttps_epi32(a);} // float -> int32 with truncation toward zero (the "tt" variant)
231 |     static inline simd_vectori simd_cast_from_float(simd_vector a) {return _mm256_castps_si256(a);}
232 |     static inline simd_vector simd_convert_from_int(simd_vectori a) {return _mm256_cvtepi32_ps(a);}
233 |     static inline simd_vector simd_cast_from_int(simd_vectori a) {return _mm256_castsi256_ps(a);}
234 |     static inline simd_vectori simd_add_i(simd_vectori a, simd_vectori b) {return _mm256_add_epi32(a, b);}
235 |     static inline simd_vectori simd_sub_i(simd_vectori a, simd_vectori b) {return _mm256_sub_epi32(a, b);}
236 |     static inline simd_vectori simd_mul_i(simd_vectori a, simd_vectori b) {return _mm256_mullo_epi32(a, b);}
237 |     static inline simd_vectori simd_splat_i(int i) {return _mm256_set1_epi32(i);}
238 |     static inline simd_vectori simd_splat_zero_i(void) {return _mm256_setzero_si256();}
239 |     static inline simd_vectori simd_shift_left_i(simd_vectori a, int i) {return _mm256_slli_epi32(a, i);}
240 |     static inline simd_vectori simd_shift_right_i(simd_vectori a, int i) {return _mm256_srai_epi32(a, i);}
241 |     static inline simd_vectori simd_and_i(simd_vectori a, simd_vectori b) {return _mm256_and_si256(a, b);}
242 |     static inline simd_vectori simd_or_i(simd_vectori a, simd_vectori b) {return _mm256_or_si256(a, b);}
243 |     static inline simd_vectori simd_abs_i(simd_vectori a) {return _mm256_abs_epi32(a);}
244 |     static inline simd_vectori simd_andnot_i(simd_vectori a, simd_vectori b) {return _mm256_andnot_si256(b, a);}
245 |     static inline simd_vectori simd_cmp_eq_i(simd_vectori a, simd_vectori b) {return _mm256_cmpeq_epi32(a, b);}
246 |     static inline simd_vectori simd_cmp_gt_i(simd_vectori a, simd_vectori b) {return _mm256_cmpgt_epi32(a, b);}
247 | 
248 | 
249 |     #define simd_asin mm256_asin_ps
250 |     #define simd_atan mm256_atan_ps
251 |     #define simd_sincos mm256_sincos_ps
252 |     #define simd_sin mm256_sin_ps
253 |     #define simd_exp mm256_exp_ps
254 |     #define simd_log mm256_log_ps
255 |     #define simd_exp2 mm256_exp2_ps
256 |     #define simd_log2 mm256_log2_ps
257 | 
258 | #endif
259 | 
260 | //----------------------------------------------------------------------------------------------------------------------
261 | static inline simd_vector simd_frexp(simd_vector x, simd_vectori* exponent) // splits x into a fraction (returned, magnitude in [0.5, 1) for normal finite x) and a power of two written to *exponent
262 | {
263 |     simd_vectori cast_float = simd_cast_from_float(x);
264 |     simd_vectori e = simd_and_i(simd_shift_right_i(cast_float, 23), simd_splat_i(0xff)); // biased 8-bit exponent field (the mask drops the sign bit)
265 |     simd_vectori equal_to_zero = simd_and_i(simd_cmp_eq_i(e, simd_splat_zero_i()), simd_cast_from_float(simd_cmp_eq(x, simd_splat_zero()))); // all-ones in lanes where x is +/-0
266 |     *exponent = simd_andnot_i(simd_sub_i(e, simd_splat_i(0x7e)), equal_to_zero); // e - 126, forced to 0 in the zero lanes
267 |     cast_float = simd_and_i(cast_float, simd_splat_i(0x807fffff)); // keep only the sign and fraction bits
268 |     cast_float = simd_or_i(cast_float, simd_splat_i(0x3f000000)); // install the exponent field of 0.5f
269 |     return simd_select(simd_cast_from_int(cast_float), x, simd_cast_from_int(equal_to_zero)); // zero lanes return x unchanged
270 | }
271 | 
272 | //----------------------------------------------------------------------------------------------------------------------
273 | static inline simd_vector simd_ldexp(simd_vector x, simd_vectori pw2) // scales x by 2^pw2 by adding pw2 to the exponent field of x
274 | {
275 |     simd_vectori fl = simd_cast_from_float(x);
276 |     simd_vectori e = simd_and_i(simd_shift_right_i(fl, 23), simd_splat_i(0xff)); // biased exponent of x
277 |     e = simd_and_i(simd_add_i(e, pw2), simd_splat_i(0xff)); // new exponent, wrapped to 8 bits (no clamping on overflow/underflow)
278 |     simd_vectori is_infinity = simd_cmp_eq_i(e, simd_splat_i(0xff)); // lanes whose new exponent field is all ones
279 |     fl = simd_or_i(simd_andnot_i(fl, is_infinity), simd_and_i(fl, simd_splat_i(0xFF800000))); // drops the fraction bits in is_infinity lanes; other lanes are unchanged
280 |     fl = simd_or_i(simd_shift_left_i(e, 23), simd_and_i(fl, simd_splat_i(0x807fffff))); // new exponent combined with the sign and remaining fraction bits
281 |     simd_vector equal_to_zero = simd_cmp_eq(x, simd_splat_zero());
282 |     return simd_andnot(simd_cast_from_int(fl), equal_to_zero); // lanes where x == 0 return all-zero bits (+0.0f)
283 | }
284 | 
285 | //----------------------------------------------------------------------------------------------------------------------
286 | static inline simd_vector simd_polynomial4(simd_vector x, float* coefficients)
287 | {
288 |     // Horner evaluation of ((c0*x + c1)*x + c2)*x + c3, highest-order coefficient first
289 |     simd_vector horner = simd_fmad(x, simd_splat(coefficients[0]), simd_splat(coefficients[1]));
290 |     horner = simd_fmad(x, horner, simd_splat(coefficients[2]));
291 |     return simd_fmad(x, horner, simd_splat(coefficients[3]));
292 | }
293 | 
294 | //----------------------------------------------------------------------------------------------------------------------
295 | static inline simd_vector simd_polynomial5(simd_vector x, float* coefficients)
296 | {
297 |     // one more Horner step on top of the 4-coefficient polynomial: p4(x)*x + c4
298 |     simd_vector lower_order = simd_polynomial4(x, coefficients);
299 |     return simd_fmad(x, lower_order, simd_splat(coefficients[4]));
300 | }
301 | 
302 | //----------------------------------------------------------------------------------------------------------------------
303 | static inline simd_vector simd_polynomial6(simd_vector x, float* coefficients)
304 | {
305 |     // one more Horner step on top of the 5-coefficient polynomial: p5(x)*x + c5
306 |     simd_vector lower_order = simd_polynomial5(x, coefficients);
307 |     return simd_fmad(x, lower_order, simd_splat(coefficients[5]));
308 | }
309 | 
310 | //----------------------------------------------------------------------------------------------------------------------
311 | static inline simd_vector simd_clamp(simd_vector a, simd_vector range_min, simd_vector range_max) 
312 | {
313 |     return simd_max(simd_min(a, range_max), range_min); // upper bound is applied first, then the lower bound; if range_min > range_max the result is range_min
314 | }
315 | 
316 | //----------------------------------------------------------------------------------------------------------------------
317 | static inline simd_vector simd_sign(simd_vector a)
318 | {
319 |     simd_vector result = simd_select(simd_splat_zero(), simd_splat(-1.f), simd_cmp_lt(a, simd_splat_zero()));
320 |     return simd_select(result, simd_splat( 1.f), simd_cmp_gt(a, simd_splat_zero()));
321 | }
322 | 
323 | static inline simd_vectori simd_select_i(simd_vectori a, simd_vectori b, simd_vectori mask) { return simd_or_i(simd_andnot_i(a, mask), simd_and_i(b, mask));}
324 | static inline simd_vectori simd_neg_i(simd_vectori a){return simd_sub_i(simd_splat_zero_i(), a);}
325 | 
326 | 
327 | //----------------------------------------------------------------------------------------------------------------------
328 | // based on http://gruntthepeon.free.fr/ssemath/
329 | #ifdef __MATH__INTRINSICS__NEON__
330 |     float32x4_t vlogq_f32(float32x4_t x)
331 | #else
332 |     __m256 mm256_log_ps(__m256 x)
333 | #endif
334 | {
335 |     simd_vector one = simd_splat(1.f);
336 |     simd_vector invalid_mask = simd_cmp_le(x, simd_splat_zero());
337 |     invalid_mask = simd_or(invalid_mask, simd_isnan(x));
338 |     simd_vector input_is_zero = simd_cmp_eq(x, simd_splat_zero());
339 |     simd_vector input_is_infinity = simd_cmp_eq(x, simd_splat_positive_infinity());
340 | 
341 |     x = simd_max(x, simd_min_normalized());  // cut off denormalized stuff
342 | 
343 |     simd_vectori emm0 = simd_shift_right_i(simd_cast_from_float(x), 23);
344 |     emm0 = simd_sub_i(emm0, simd_splat_i(0x7f));
345 |     simd_vector e = simd_convert_from_int(emm0);
346 |     
347 |     // keep only the fractional part
348 |     x = simd_and(x, simd_inv_mant_mask());
349 |     x = simd_or(x, simd_splat(0.5f));
350 |     
351 |     e = simd_add(e, one);
352 |     simd_vector mask = simd_cmp_lt(x, simd_splat(0.707106781186547524f));
353 |     simd_vector tmp = simd_and(x, mask);
354 |     x = simd_sub(x, one);
355 |     e = simd_sub(e, simd_and(one, mask));
356 |     x = simd_add(x, tmp);
357 | 
358 |     simd_vector z = simd_mul(x,x);
359 |     simd_vector y = simd_splat(7.0376836292E-2f);
360 |     y = simd_fmad(y, x, simd_splat(-1.1514610310E-1f));
361 |     y = simd_fmad(y, x, simd_splat(1.1676998740E-1f));
362 |     y = simd_fmad(y, x, simd_splat(-1.2420140846E-1f));
363 |     y = simd_fmad(y, x, simd_splat(+1.4249322787E-1f));
364 |     y = simd_fmad(y, x, simd_splat(-1.6668057665E-1f));
365 |     y = simd_fmad(y, x, simd_splat(+2.0000714765E-1f));
366 |     y = simd_fmad(y, x, simd_splat(-2.4999993993E-1f));
367 |     y = simd_fmad(y, x, simd_splat(+3.3333331174E-1f));
368 |     y = simd_mul(y, x);
369 |     y = simd_mul(y, z);
370 | 
371 |     tmp = simd_mul(e, simd_splat(-2.12194440e-4f));
372 |     y = simd_add(y, tmp);
373 | 
374 |     tmp = simd_mul(z, simd_splat(0.5f));
375 |     y = simd_sub(y, tmp);
376 | 
377 |     tmp = simd_mul(e, simd_splat(0.693359375f));
378 |     x = simd_add(x, y);
379 |     x = simd_add(x, tmp);
380 |     x = simd_or(x, invalid_mask); // NAN/negative arg will be NAN
381 |     x = simd_select(x, simd_splat_negative_infinity(), input_is_zero); // zero arg will be -inf
382 |     x = simd_select(x, simd_splat_positive_infinity(), input_is_infinity); // +inf arg will be +inf
383 | 
384 |     return x;
385 | }
386 | 
387 | //----------------------------------------------------------------------------------------------------------------------
388 | // based on https://github.com/redorav/hlslpp/blob/master/include/hlsl%2B%2B_vector_float8.h
389 | #ifdef __MATH__INTRINSICS__NEON__
390 |     float32x4_t vlog2q_f32(float32x4_t x)
391 | #else
392 |     __m256 mm256_log2_ps(__m256 x)
393 | #endif
394 | {
395 |     simd_vector invalid_mask = simd_cmp_le(x, simd_splat_zero());
396 |     invalid_mask = simd_or(invalid_mask, simd_isnan(x));
397 |     simd_vector input_is_zero = simd_cmp_eq(x, simd_splat_zero());
398 |     simd_vector input_is_infinity = simd_cmp_eq(x, simd_splat_positive_infinity());
399 |     simd_vector one = simd_splat(1.f);
400 |     simd_vectori exp = simd_splat_i(0x7f800000);
401 |     simd_vectori mant = simd_splat_i(0x007fffff);
402 |     simd_vectori i = simd_cast_from_float(x);
403 |     simd_vector e = simd_convert_from_int(simd_sub_i(simd_shift_right_i(simd_and_i(i, exp), 23), simd_splat_i(127)));
404 |     simd_vector m = simd_or(simd_cast_from_int(simd_and_i(i, mant)), one);
405 | 
406 |     // minimax polynomial fit of log2(x)/(x - 1), for x in range [1, 2[
407 |     simd_vector p = simd_polynomial6(m, (float[]){-3.4436006e-2f, 3.1821337e-1f, -1.2315303f, 2.5988452f, -3.3241990f, 3.1157899f});
408 | 
409 |     // this effectively increases the polynomial degree by one, but ensures that log2(1) == 0
410 |     p = simd_mul(p, simd_sub(m, one));
411 |     simd_vector result = simd_add(p, e);
412 | 
413 |     result = simd_or(result, invalid_mask); // NAN/negative arg will be NAN
414 |     result = simd_select(result, simd_splat_negative_infinity(), input_is_zero); // zero arg will be -inf
415 |     result = simd_select(result, simd_splat_positive_infinity(), input_is_infinity); // +inf arg will be +inf
416 | 
417 |     return result;
418 | }
419 | 
420 | //----------------------------------------------------------------------------------------------------------------------
421 | // based on http://gruntthepeon.free.fr/ssemath/
422 | #ifdef __MATH__INTRINSICS__NEON__
423 |     float32x4_t vexpq_f32(float32x4_t x)
424 | #else
425 |     __m256 mm256_exp_ps(__m256 x)
426 | #endif
427 | {
428 |     simd_vector invalid_mask = simd_isnan(x);
429 |     simd_vector input_is_infinity = simd_cmp_eq(x, simd_splat_positive_infinity());
430 |     simd_vector tmp = simd_splat_zero();
431 |     simd_vector fx;
432 |     simd_vector one = simd_splat(1.f);
433 | 
434 |     x = simd_min(x, simd_splat(88.3762626647949f));
435 |     x = simd_max(x, simd_splat(-88.3762626647949f));
436 | 
437 |     // express exp(x) as exp(g + n*log(2))
438 |     fx = simd_fmad(x, simd_splat(1.44269504088896341f), simd_splat(0.5f));
439 |     tmp = simd_floor(fx);
440 | 
441 |     // if greater, substract 1
442 |     simd_vector mask = simd_cmp_gt(tmp, fx);
443 |     mask = simd_and(mask, one);
444 |     fx = simd_sub(tmp, mask);
445 | 
446 |     tmp = simd_mul(fx, simd_splat(0.693359375f));
447 |     simd_vector z = simd_mul(fx, simd_splat(-2.12194440e-4f));
448 |     x = simd_sub(x, tmp);
449 |     x = simd_sub(x, z);
450 |     z = simd_mul(x, x);
451 |     simd_vector y = simd_polynomial6(x, (float[]) {1.9875691500E-4f, 1.3981999507E-3f, 8.3334519073E-3f,
452 |                                                    4.1665795894E-2f, 1.6666665459E-1f, 5.0000001201E-1f});
453 |     y = simd_fmad(y, z, x);
454 |     y = simd_add(y, one);
455 | 
456 |     simd_vectori emm0 = simd_convert_from_float(fx);
457 |     emm0 = simd_add_i(emm0, simd_splat_i(0x7f));
458 |     emm0 = simd_shift_left_i(emm0, 23);
459 |     simd_vector pow2n = simd_cast_from_int(emm0);
460 | 
461 |     simd_vector result = simd_mul(y, pow2n);
462 |     result = simd_or(result, invalid_mask);
463 |     result = simd_select(result, simd_splat_positive_infinity(), input_is_infinity); // +inf arg will be +inf
464 | 
465 |     return result;
466 | }
467 | 
468 | //----------------------------------------------------------------------------------------------------------------------
469 | // based on https://github.com/jeremybarnes/cephes/blob/master/single/exp2f.c
470 | #ifdef __MATH__INTRINSICS__NEON__
471 |     float32x4_t vexp2q_f32(float32x4_t x)
472 | #else
473 |     __m256 mm256_exp2_ps(__m256 x)
474 | #endif
475 | {
476 |     simd_vector invalid_mask = simd_isnan(x);
477 |     simd_vector input_is_infinity = simd_cmp_eq(x, simd_splat_positive_infinity());
478 |     simd_vector equal_to_zero = simd_cmp_eq(x, simd_splat_zero());
479 |     simd_vector one = simd_splat(1.f);
480 | 
481 |     // clamp values
482 |     x = simd_clamp(x, simd_splat(-127.f), simd_splat(127.f));
483 | 
484 | #ifdef __MATH_INTRINSINCS_FAST__
485 |     simd_vector ipart = simd_floor(x);
486 |     simd_vector fpart = simd_sub(x, ipart);
487 | 
488 |     simd_vectori i = simd_shift_left_i(simd_add_i(simd_convert_from_float(ipart), simd_splat_i(127)), 23);
489 |     simd_vector expipart = simd_cast_from_int(i);
490 | 
491 |     // minimax polynomial fit of 2^x, in range [-0.5, 0.5[
492 |     simd_vector expfpart = simd_polynomial5(fpart, (float[]) {1.3534167e-2f, 5.2011464e-2f, 2.4144275e-1f, 6.9300383e-1f, 1.0000026f});
493 |     simd_vector result = simd_mul(expipart, expfpart);
494 | #else
495 |     simd_vector px = simd_floor(x);
496 |     simd_vectori i0 = simd_convert_from_float(px);
497 |     x = simd_sub(x, px);
498 | 
499 |     simd_vector above_half = simd_cmp_gt(x, simd_splat(.5f));
500 |     i0 = simd_select_i(i0, simd_add_i(i0, simd_splat_i(1)), simd_cast_from_float(above_half));
501 |     x = simd_select(x, simd_sub(x, one), above_half);
502 | 
503 |     px = simd_polynomial6(x, (float[]) {1.535336188319500E-004f, 1.339887440266574E-003f, 9.618437357674640E-003f,
504 |                                         5.550332471162809E-002f, 2.402264791363012E-001f, 6.931472028550421E-001f});
505 |     px = simd_fmad(px, x,  one);
506 |     simd_vector result = simd_ldexp(px, i0);
507 | #endif
508 | 
509 |     result = simd_select(result, one, equal_to_zero);
510 |     result = simd_or(result, invalid_mask);
511 |     result = simd_select(result, simd_splat_positive_infinity(), input_is_infinity); // +inf arg will be +inf
512 |     return result;
513 | }
514 | 
515 | //----------------------------------------------------------------------------------------------------------------------
516 | // based on http://gruntthepeon.free.fr/ssemath/
517 | #ifdef __MATH__INTRINSICS__NEON__
518 |     void vsincosq_f32(float32x4_t x, float32x4_t* s, float32x4_t* c)
519 | #else
520 |     void mm256_sincos_ps(__m256 x, __m256* s, __m256* c)
521 | #endif
522 | {
523 |     simd_vector xmm1, xmm2, xmm3 = simd_splat_zero(), sign_bit_sin, y;
524 | 
525 |     sign_bit_sin = x;
526 | 
527 |     // take the absolute value
528 |     x = simd_and(x, simd_inv_sign_mask());
529 |     // extract the sign bit (upper one)
530 |     sign_bit_sin = simd_and(sign_bit_sin, simd_sign_mask());
531 | 
532 |     // scale by 4/Pi
533 |     y = simd_mul(x, simd_splat(1.27323954473516f));
534 | 
535 |     // store the integer part of y in emm2 
536 |     simd_vectori emm2 = simd_convert_from_float(y);
537 | 
538 |     // j=(j+1) & (~1) (see the cephes sources)
539 |     emm2 = simd_add_i(emm2, simd_splat_i(1));
540 |     emm2 = simd_and_i(emm2, simd_splat_i(~1));
541 |     y = simd_convert_from_int(emm2);
542 | 
543 |     simd_vectori emm4 = emm2;
544 | 
545 |     // get the swap sign flag for the sine
546 |     simd_vectori emm0 = simd_and_i(emm2, simd_splat_i(4));
547 |     emm0 = simd_shift_left_i(emm0, 29);
548 |     simd_vector swap_sign_bit_sin = simd_cast_from_int(emm0);
549 | 
550 |     // get the polynom selection mask for the sine
551 |     emm2 = simd_and_i(emm2, simd_splat_i(2));
552 |     emm2 = simd_cmp_eq_i(emm2, simd_splat_zero_i());
553 |     simd_vector poly_mask = simd_cast_from_int(emm2); 
554 | 
555 |     // The magic pass: "Extended precision modular arithmetic" 
556 |     //  x = ((x - y * DP1) - y * DP2) - y * DP3; 
557 |     x = simd_fmad(y, simd_splat(-0.78515625f), x);
558 |     x = simd_fmad(y, simd_splat(-2.4187564849853515625e-4f), x);
559 |     x = simd_fmad(y, simd_splat(-3.77489497744594108e-8f), x);
560 | 
561 |     emm4 = simd_sub_i(emm4, simd_splat_i(2));
562 |     emm4 = simd_andnot_i(simd_splat_i(4), emm4);
563 |     emm4 = simd_shift_left_i(emm4, 29);
564 |     simd_vector sign_bit_cos = simd_cast_from_int(emm4); 
565 | 
566 |     sign_bit_sin = simd_xor(sign_bit_sin, swap_sign_bit_sin);
567 |     
568 |     // Evaluate the first polynom  (0 <= x <= Pi/4)
569 |     simd_vector z = simd_mul(x,x);
570 |     y = simd_splat(2.443315711809948E-005f);
571 |     y = simd_fmad(y, z, simd_splat(-1.388731625493765E-003f));
572 |     y = simd_fmad(y, z, simd_splat(4.166664568298827E-002f));
573 |     y = simd_mul(y, z);
574 |     y = simd_mul(y, z);
575 |     simd_vector tmp = simd_mul(z, simd_splat(.5f));
576 |     y = simd_sub(y, tmp);
577 |     y = simd_add(y, simd_splat(1.f));
578 | 
579 |     // Evaluate the second polynom  (Pi/4 <= x <= 0)
580 |     simd_vector y2 = simd_splat(-1.9515295891E-4f);
581 |     y2 = simd_fmad(y2, z, simd_splat(8.3321608736E-3f));
582 |     y2 = simd_fmad(y2, z, simd_splat(-1.6666654611E-1f));
583 |     y2 = simd_mul(y2, z);
584 |     y2 = simd_fmad(y2, x, x);
585 | 
586 |     // select the correct result from the two polynoms
587 |     xmm3 = poly_mask;
588 |     simd_vector ysin2 = simd_and(y2, xmm3);
589 |     simd_vector ysin1 = simd_andnot(y, xmm3);
590 |     y2 = simd_sub(y2,ysin2);
591 |     y = simd_sub(y, ysin1);
592 | 
593 |     xmm1 = simd_add(ysin1,ysin2);
594 |     xmm2 = simd_add(y,y2);
595 | 
596 |     // update the sign
597 |     *s = simd_xor(xmm1, sign_bit_sin);
598 |     *c = simd_xor(xmm2, sign_bit_cos);
599 | }
600 | 
601 | //----------------------------------------------------------------------------------------------------------------------
602 | #ifdef __MATH__INTRINSICS__NEON__
603 | float32x4_t vsinq_f32(float32x4_t x)
604 | #else
605 | __m256 mm256_sin_ps(__m256 x)
606 | #endif
607 | {
608 | #ifdef __MATH_INTRINSINCS_FAST__
609 |     // range reduction from hlslpp, polynomial computed by lolremez
610 |     simd_vector invtau = simd_splat(1.f/SIMD_MATH_TAU);
611 |     simd_vector tau = simd_splat(SIMD_MATH_TAU);
612 |     simd_vector pi2 = simd_splat(SIMD_MATH_PI2);
613 | 
614 |     // Range reduction (into [-pi, pi] range)
615 |     // Formula is x = x - round(x / 2pi) * 2pi
616 |     x = simd_sub(x, simd_mul(simd_round(simd_mul(x, invtau)), tau));
617 | 
618 |     simd_vector gt_pi2 = simd_cmp_gt(x, pi2);
619 |     simd_vector lt_minus_pi2 = simd_cmp_lt(x, simd_neg(pi2));
620 |     simd_vector ox = x;
621 | 
622 |     // Use identities/mirroring to remap into the range of the minimax polynomial
623 |     simd_vector pi = simd_splat(SIMD_MATH_PI);
624 |     x = simd_select(x, simd_sub(pi, ox), gt_pi2);
625 |     x = simd_select(x, simd_sub(simd_neg(pi), ox), lt_minus_pi2);
626 | 
627 |     simd_vector x_squared = simd_mul(x, x);
628 |     simd_vector result = simd_polynomial4(x_squared, (float[]){2.6000548e-6f, -1.9806615e-4f, 8.3330173e-3f, -1.6666657e-1f});
629 |     result = simd_mul(result, x_squared);
630 |     result = simd_fmad(result, x, x);
631 | 
632 |     return result;
633 | #else
634 |     simd_vector sinus, cosinus;
635 |     simd_sincos(x, &sinus, &cosinus);
636 |     return sinus;
637 | #endif
638 | }
639 | 
640 | //----------------------------------------------------------------------------------------------------------------------
641 | #ifdef __MATH__INTRINSICS__NEON__
642 | float32x4_t vcosq_f32(float32x4_t x)
643 | #else
644 | __m256 mm256_cos_ps(__m256 x)
645 | #endif
646 | {
647 | #ifdef __MATH_INTRINSINCS_FAST__
648 |     return simd_sin(simd_sub(simd_splat(SIMD_MATH_PI2), x));
649 | #else
650 |     simd_vector sinus, cosinus;
651 |     simd_sincos(x, &sinus, &cosinus);
652 |     return cosinus;
653 | #endif
654 | }
655 | 
656 | //----------------------------------------------------------------------------------------------------------------------
657 | #ifdef __MATH__INTRINSICS__NEON__
658 |     float32x4_t vasinq_f32(float32x4_t xx)
659 | #else
660 |     __m256 mm256_asin_ps(__m256 xx)
661 | #endif
662 | {
663 |     simd_vector output_nan = simd_cmp_gt(simd_abs(xx), simd_splat(1.f));
664 |     simd_vector small_value = simd_cmp_lt(simd_abs(xx), simd_splat(1.0e-4f));
665 |     simd_vector a  = simd_abs(xx);
666 | #ifdef __MATH_INTRINSINCS_FAST__
667 |     // based on https://developer.download.nvidia.com/cg/asin.html
668 |     simd_vector negate = simd_select(simd_splat_zero(), simd_splat(1.f), simd_cmp_lt(xx, simd_splat_zero()));
669 |     simd_vector x = a;
670 |     simd_vector result = simd_polynomial4(x, (float[]){-0.0187293f, 0.0742610f, -0.2121144f, 1.5707288f});
671 |     result = simd_sub(simd_splat(SIMD_MATH_PI2), simd_mul(simd_sqrt(simd_sub(simd_splat(1.f), x)), result));
672 |     result = simd_sub(result, simd_mul(simd_mul(simd_splat(2.f), result), negate));
673 | #else
674 |     // based on https://github.com/jeremybarnes/cephes/blob/master/single/asinf.c
675 |     simd_vector x = xx;
676 |     simd_vector sign = simd_sign(xx);
677 |     simd_vector z1 = simd_mul(simd_splat(.5f), simd_sub(simd_splat(1.f), a));
678 |     simd_vector z2 = simd_mul(a, a);
679 |     simd_vector flag = simd_cmp_gt(a, simd_splat(.5f));
680 |     simd_vector z = simd_select(z2, z1, flag);
681 | 
682 |     x = simd_select(a, simd_sqrt(z), flag);
683 | 
684 |     simd_vector tmp = simd_polynomial5(z, (float[]) {4.2163199048E-2f, 2.4181311049E-2f, 4.5470025998E-2f, 
685 |                                                     7.4953002686E-2f, 1.6666752422E-1f});
686 |     tmp = simd_mul(tmp, z);
687 |     z = simd_fmad(tmp, x, x);
688 | 
689 |     tmp = simd_add(z, z);
690 |     tmp = simd_sub(simd_splat(SIMD_MATH_PI2), tmp);
691 |     z = simd_select(z, tmp, flag);
692 |     simd_vector result = simd_mul(z, sign);
693 | #endif
694 |     result = simd_or(result, output_nan);
695 |     result = simd_select(result, xx, small_value);
696 |     return result;
697 | }
698 | 
699 | //----------------------------------------------------------------------------------------------------------------------
700 | // acos(x) = pi/2 - asin(x)
701 | #ifdef __MATH__INTRINSICS__NEON__
702 |     float32x4_t vacosq_f32(float32x4_t x)
703 | #else
704 |     __m256 mm256_acos_ps(__m256 x)
705 | #endif
706 | {
707 | #ifdef __MATH_INTRINSINCS_FAST__
708 |     simd_vector negate = simd_select(simd_splat_zero(), simd_splat(1.f), simd_cmp_lt(x, simd_splat_zero()));
709 |     x = simd_abs(x);
710 |     simd_vector result = simd_polynomial4(x, (float[]){-0.0187293f, 0.0742610f, -0.2121144f, 1.5707288f});
711 |     result = simd_mul(result, simd_sqrt(simd_sub(simd_splat(1.f), x)));
712 |     result = simd_sub(result, simd_mul(simd_mul(simd_splat(2.f), negate), result));
713 |     return simd_fmad(negate, simd_splat(SIMD_MATH_PI), result);
714 | #else
715 |     simd_vector out_of_bound = simd_cmp_gt(simd_abs(x), simd_splat(1.f));
716 |     simd_vector result = simd_sub(simd_splat(SIMD_MATH_PI2), simd_asin(x));
717 |     result = simd_or(result, out_of_bound); // out of bound outputs NAN
718 |     return result;
719 | #endif
720 | }
721 | 
722 | //----------------------------------------------------------------------------------------------------------------------
723 | // based on https://github.com/jeremybarnes/cephes/blob/master/single/atanf.c
724 | #ifdef __MATH__INTRINSICS__NEON__
725 |     float32x4_t vatanq_f32(float32x4_t xx)
726 | #else
727 |     __m256 mm256_atan_ps(__m256 xx)
728 | #endif
729 | {
730 |     simd_vector sign = simd_sign(xx);
731 |     simd_vector x = simd_abs(xx);
732 |     simd_vector one = simd_splat(1.f);
733 | 
734 |     // range reduction
735 |     simd_vector above_3pi8 = simd_cmp_gt(x, simd_splat(2.414213562373095f));
736 |     simd_vector above_pi8 = simd_andnot(simd_cmp_gt(x, simd_splat(0.4142135623730950f)), above_3pi8);
737 |     simd_vector y = simd_splat_zero();
738 | 
739 |     x = simd_select(x, simd_neg(simd_div(one, x)), above_3pi8);
740 |     x = simd_select(x, simd_div(simd_sub(x, one), simd_add(x, one)), above_pi8);
741 |     y = simd_select(y, simd_splat(SIMD_MATH_PI2), above_3pi8);
742 |     y = simd_select(y, simd_splat(SIMD_MATH_PI4), above_pi8);
743 | 
744 |     // minimax polynomial
745 |     simd_vector z = simd_mul(x, x);
746 |     simd_vector tmp = simd_polynomial4(z, (float[]) {8.05374449538e-2f, -1.38776856032E-1f, 1.99777106478E-1f, -3.33329491539E-1f});
747 |     tmp = simd_mul(tmp, z);
748 |     tmp = simd_fmad(tmp, x, x);
749 |     y = simd_add(tmp, y);
750 |     y = simd_mul(y, sign);
751 | 
752 |     return y;	
753 | }
754 | 
755 | //----------------------------------------------------------------------------------------------------------------------
756 | // based on https://mazzo.li/posts/vectorized-atan2.html
757 | #ifdef __MATH__INTRINSICS__NEON__
758 |     float32x4_t vatan2q_f32(float32x4_t x, float32x4_t y)
759 | #else
760 |     __m256 mm256_atan2_ps(__m256 x, __m256 y)
761 | #endif
762 | {
763 |     simd_vector swap = simd_cmp_lt(simd_abs(x), simd_abs(y));
764 |     simd_vector x_equals_zero = simd_cmp_eq(x, simd_splat_zero());
765 |     simd_vector y_equals_zero = simd_cmp_eq(y, simd_splat_zero());
766 |     simd_vector x_over_y = simd_div(x, y);
767 |     simd_vector y_over_x = simd_div(y, x);
768 |     simd_vector atan_input = simd_select(y_over_x, x_over_y, swap);
769 |     simd_vector result = simd_atan(atan_input);
770 | 
771 |     simd_vector adjust = simd_select(simd_splat(-SIMD_MATH_PI2), simd_splat(SIMD_MATH_PI2), simd_cmp_ge(atan_input, simd_splat_zero()));
772 |     result = simd_select(result, simd_sub(adjust, result), swap);
773 | 
774 |     simd_vector x_sign_mask = simd_cmp_lt(x, simd_splat_zero());
775 |     result = simd_add( simd_and(simd_xor(simd_splat(SIMD_MATH_PI), simd_and(simd_sign_mask(), y)), x_sign_mask), result);
776 |     result = simd_select(result, simd_mul(simd_sign(x), simd_splat_zero()), y_equals_zero);
777 |     result = simd_select(result, simd_mul(simd_sign(y), simd_splat(SIMD_MATH_PI2)), x_equals_zero);
778 |     return result;
779 | }
780 | 
781 | //----------------------------------------------------------------------------------------------------------------------
782 | // based on https://github.com/jeremybarnes/cephes/blob/master/single/cbrtf.c
783 | #ifdef __MATH__INTRINSICS__NEON__
784 |     float32x4_t vcbrtq_f32(float32x4_t xx)
785 | #else
786 |     __m256 mm256_cbrt_ps(__m256 xx)
787 | #endif
788 | {
789 |     simd_vector one_over_three = simd_splat(0.333333333333f);
790 |     simd_vector sign = simd_sign(xx);
791 |     simd_vector x = simd_abs(xx);
792 |     simd_vector z = x;
793 | 
794 |     // extract power of 2, leaving mantissa between 0.5 and 1
795 |     simd_vectori exponent;
796 |     x = simd_frexp(x, &exponent);
797 | 
798 |     // Approximate cube root of number between .5 and 1
799 |     x = simd_polynomial5(x, (float[]) {-0.1346611047335f, 0.5466460136639f, -0.954382247715f, 1.13999833547f, 0.40238979564f});
800 | 
801 |     // exponent divided by 3
802 |     simd_vectori exponent_is_negative = simd_cmp_gt_i(simd_splat_zero_i(), exponent);
803 |     
804 |     exponent = simd_abs_i(exponent);
805 |     simd_vectori rem = exponent;
806 |     exponent = simd_convert_from_float((simd_mul(simd_convert_from_int(exponent), one_over_three)));
807 |     rem = simd_sub_i(rem, simd_mul_i(exponent, simd_splat_i(3)));
808 | 
809 |     simd_vector cbrt2 = simd_splat(1.25992104989487316477f);
810 |     simd_vector cbrt4 = simd_splat(1.58740105196819947475f);
811 | 
812 |     simd_vector rem_equals_1 = simd_cast_from_int(simd_cmp_eq_i(rem, simd_splat_i(1)));
813 |     simd_vector rem_equals_2 = simd_cast_from_int(simd_cmp_eq_i(rem, simd_splat_i(2)));
814 |     simd_vector x1 = simd_mul(x, simd_select(cbrt4, cbrt2, rem_equals_1));
815 |     simd_vector x2 = simd_div(x, simd_select(cbrt4, cbrt2, rem_equals_1));
816 |     x = simd_select(x, simd_select(x1, x2, simd_cast_from_int(exponent_is_negative)), simd_or(rem_equals_1, rem_equals_2));
817 |     exponent = simd_mul_i(exponent, simd_select_i(simd_splat_i(1), simd_splat_i(-1), exponent_is_negative));
818 | 
819 |     // multiply by power of 2
820 |     x = simd_ldexp(x, exponent);
821 | 
822 |     // Newton iteration, x -= ( x - (z/(x*x)) ) * 0.333333333333;
823 |     x = simd_sub(x, simd_mul(simd_sub(x, simd_div(z, simd_mul(x, x))), one_over_three));
824 |     x = simd_mul(x, sign);  // if input is zero, sign is also zero
825 | 
826 |     return x;
827 | }
828 | 
829 | //----------------------------------------------------------------------------------------------------------------------
830 | // the implementation based https://github.com/jeremybarnes/cephes/blob/master/single/powf.c is **too** slow
831 | // so we use the classic exp(y * log(x))
832 | #ifdef __MATH__INTRINSICS__NEON__
833 |     float32x4_t vpowq_f32(float32x4_t x, float32x4_t y)
834 | #else
835 |     __m256 mm256_pow_ps(__m256 x, __m256 y)
836 | #endif
837 | {
838 |     simd_vector x_equals_zero = simd_cmp_eq(x, simd_splat_zero());
839 |     simd_vector y_equals_zero = simd_cmp_eq(y, simd_splat_zero());
840 |     simd_vector non_integer_power = simd_cmp_neq(y, simd_floor(y));
841 |     simd_vector return_zero = simd_andnot(x_equals_zero, y_equals_zero);
842 |     simd_vector return_one = simd_and(x_equals_zero, y_equals_zero);
843 |     simd_vector return_nan = simd_and(simd_cmp_lt(x, simd_splat_zero()), non_integer_power);
844 | 
845 | #ifdef __MATH_INTRINSINCS_FAST__
846 |     simd_vector result = simd_exp2(simd_mul(y, simd_log2(x)));
847 | #else
848 |     simd_vector result = simd_exp(simd_mul(y, simd_log(x)));
849 | #endif
850 | 
851 |     result = simd_andnot(result, return_zero);
852 |     result = simd_select(result, simd_splat(1.f), return_one);
853 |     result = simd_or(result, return_nan);
854 | 
855 |     return result;
856 | }
857 | 
858 | #endif  // __MATH__INTRINSICS__IMPLEMENTATION__
859 | 
860 | 
861 | 


--------------------------------------------------------------------------------
/tests/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | cmake_minimum_required(VERSION 3.25)
 2 | 
 3 | set(CMAKE_OSX_ARCHITECTURES arm64;x86_64)
 4 | 
 5 | project(math_intrinsics_unit_tests)
 6 | 
 7 | add_executable(test_precision test.c math_intrinsics.c)
 8 | add_executable(test_fast test.c math_intrinsics.c)
 9 | add_executable(benchmark_precision benchmark.c math_intrinsics.c)
10 | add_executable(benchmark_fast benchmark.c math_intrinsics.c)
11 | 
12 | if(LINUX)
13 |     set(CMAKE_EXE_LINKER_FLAGS "-lm")
14 | endif()
15 | 
16 | if(MSVC)
17 |     target_compile_options(test_precision PRIVATE /W4 /WX /std:c17)
18 |     target_compile_options(test_fast PRIVATE /W4 /WX /std:c17 /D__MATH_INTRINSINCS_FAST__)
19 |     target_compile_options(benchmark_precision PRIVATE /std:c17)
20 |     target_compile_options(benchmark_fast PRIVATE /std:c17 /D__MATH_INTRINSINCS_FAST__)
21 | else()
22 |     target_compile_options(test_precision PRIVATE -Wall -Wextra -Wpedantic -Werror -mavx2 -mfma)
23 |     target_compile_options(test_fast PRIVATE -Wall -Wextra -Wpedantic -Werror -mavx2 -mfma -D__MATH_INTRINSINCS_FAST__)
24 |     target_compile_options(benchmark_precision PRIVATE -O3 -mavx2 -mfma)
25 |     target_compile_options(benchmark_fast PRIVATE -O3 -mavx2 -mfma -D__MATH_INTRINSINCS_FAST__)
26 | endif()


--------------------------------------------------------------------------------
/tests/benchmark.c:
--------------------------------------------------------------------------------
  1 | #define SOKOL_TIME_IMPL
  2 | #include "sokol_time.h"
  3 | #include <stdio.h>
  4 | #include <float.h>
  5 | #include <math.h>
  6 | 
  7 | 
  8 | #include "../math_intrinsics.h"
  9 | 
 10 | //----------------------------------------------------------------------------------------------------------------------
 11 | // functions pointer definition
 12 | typedef float (*reference_function)(float);
 13 | typedef float (*reference_function2)(float, float);
 14 | #ifdef __MATH__INTRINSICS__AVX__
 15 |     typedef __m256 (*approximation_function)(__m256);
 16 |     typedef __m256 (*approximation_function2)(__m256, __m256);
 17 |     #define simd_vector_width (8)
 18 | #else
 19 |     typedef float32x4_t (*approximation_function)(float32x4_t);
 20 |     typedef float32x4_t (*approximation_function2)(float32x4_t, float32x4_t);
 21 |     #define simd_vector_width (4)
 22 | #endif
 23 | 
 24 | #define NUM_ITERATIONS (200000000)
 25 | 
 26 | //----------------------------------------------------------------------------------------------------------------------
 27 | int benchmark(approximation_function function, reference_function reference, const char* name)
 28 | {
 29 |     float init_array[simd_vector_width];
 30 |     uint64_t start = 0;
 31 |     int output = 0;
 32 | 
 33 |     for(uint32_t i=0; i<simd_vector_width; ++i)
 34 |         init_array[i] = (float) (i) / (float) (simd_vector_width);
 35 | 
 36 | #ifdef __MATH__INTRINSICS__AVX__
 37 |     __m256 step = _mm256_set1_ps(FLT_EPSILON);
 38 |     __m256 input = _mm256_loadu_ps(init_array);
 39 |     __m256 result = _mm256_setzero_ps();
 40 | 
 41 |      start = stm_now();
 42 | 
 43 |     for(uint32_t i=0; i<(NUM_ITERATIONS / simd_vector_width); ++i)
 44 |     {
 45 |         result = _mm256_add_ps(result, function(input));
 46 |         input = _mm256_add_ps(input, step);
 47 |     }
 48 | 
 49 |     output = _mm256_cvtss_f32(result);
 50 | #else
 51 |     float32x4_t step = vdupq_n_f32(FLT_EPSILON);
 52 |     float32x4_t input = vld1q_f32(init_array);
 53 |     float32x4_t result = vdupq_n_f32(0.f);
 54 | 
 55 |      start = stm_now();
 56 | 
 57 |     for(uint32_t i=0; i<(NUM_ITERATIONS / simd_vector_width); ++i)
 58 |     {
 59 |         result = vaddq_f32(result, function(input));
 60 |         input = vaddq_f32(input, step);
 61 |     }
 62 | 
 63 |     output = vgetq_lane_f32(result, 0);
 64 | #endif
 65 | 
 66 |     float simd_time = stm_ms(stm_since(start));
 67 | 
 68 |     printf(".%s:\t %05.2fms", name, simd_time);
 69 | 
 70 |     float total = simd_time / 1000.f;
 71 | 
 72 |     start = stm_now();
 73 | 
 74 |     for(uint32_t i=0; i<NUM_ITERATIONS; ++i)
 75 |         total += reference(total);
 76 | 
 77 |     output += total;
 78 | 
 79 |     float clib_time = stm_ms(stm_since(start));
 80 |     
 81 |     printf("\tc std func: %05.2fms\tratio: %2.2fx\n", clib_time, clib_time/simd_time);
 82 | 
 83 |     return output;
 84 | }
 85 | 
 86 | //----------------------------------------------------------------------------------------------------------------------
 87 | int benchmark2(approximation_function2 function, reference_function2 reference, const char* name)
 88 | {
 89 |     // Times the two-argument SIMD `function` against the scalar `reference`, prints both timings and
 90 |     // their ratio, and returns an int folded from both sums (presumably to keep the calls observable).
 91 |     float lanes_x[simd_vector_width], lanes_y[simd_vector_width];
 92 |     uint64_t t0 = 0;
 93 |     int checksum = 0;
 94 | 
 95 |     // lane k starts at x = k / simd_vector_width and y = k
 96 |     for (uint32_t lane = 0; lane < simd_vector_width; ++lane) {
 97 |         lanes_x[lane] = (float) (lane) / (float) (simd_vector_width);
 98 |         lanes_y[lane] = (float) (lane);
 99 |     }
100 | 
101 | #ifdef __MATH__INTRINSICS__AVX__
102 |     __m256 eps = _mm256_set1_ps(FLT_EPSILON);
103 |     __m256 vy = _mm256_loadu_ps(lanes_y);
104 |     __m256 vx = _mm256_loadu_ps(lanes_x);
105 |     __m256 acc = _mm256_setzero_ps();
106 | 
107 |     t0 = stm_now();
108 |     for (uint32_t iter = 0; iter < (NUM_ITERATIONS / simd_vector_width); ++iter) {
109 |         acc = _mm256_add_ps(acc, function(vx, vy));
110 |         vx = _mm256_add_ps(vx, eps);  // x moves by FLT_EPSILON per call, y is constant
111 |     }
112 | 
113 |     // only the lowest lane of the accumulated sum is kept (float -> int conversion)
114 |     checksum = _mm256_cvtss_f32(acc);
115 | #else
116 |     float32x4_t eps = vdupq_n_f32(FLT_EPSILON);
117 |     float32x4_t vy = vld1q_f32(lanes_y);
118 |     float32x4_t vx = vld1q_f32(lanes_x);
119 |     float32x4_t acc = vdupq_n_f32(0.f);
120 | 
121 |     t0 = stm_now();
122 |     for (uint32_t iter = 0; iter < (NUM_ITERATIONS / simd_vector_width); ++iter) {
123 |         acc = vaddq_f32(acc, function(vx, vy));
124 |         vx = vaddq_f32(vx, eps);      // x moves by FLT_EPSILON per call, y is constant
125 |     }
126 | 
127 |     // only lane 0 of the accumulated sum is kept (float -> int conversion)
128 |     checksum = vgetq_lane_f32(acc, 0);
129 | #endif
130 | 
131 |     // the SIMD timing window closes here, after the lane extraction above
132 |     float simd_time = stm_ms(stm_since(t0));
133 | 
134 |     printf(".%s:\t %05.2fms", name, simd_time);
135 | 
136 |     // scalar pass: NUM_ITERATIONS calls of reference(x, 2x) with x stepping by 0.001;
137 |     // the running sum is seeded with simd_time / 1000
138 |     float scalar_sum = simd_time / 1000.f;
139 |     t0 = stm_now();
140 |     float arg = 0.f;
141 | 
142 |     for (uint32_t iter = 0; iter < NUM_ITERATIONS; ++iter) {
143 |         scalar_sum += reference(arg, arg * 2.f);
144 |         arg += 0.001f;
145 |     }
146 | 
147 |     // int += float: the float sum is folded into the int checksum
148 |     checksum += scalar_sum;
149 | 
150 |     float clib_time = stm_ms(stm_since(t0));
151 | 
152 |     printf("\tc std func: %05.2fms\tratio: %2.2fx\n", clib_time, clib_time/simd_time);
153 | 
154 |     return checksum;
155 | }
156 | 
157 | //----------------------------------------------------------------------------------------------------------------------
158 | int main(int argc, char * argv[])  // benchmark driver; argc and argv are not read
159 | {
160 |     stm_setup();
161 |     // Banner. NOTE(review): the macro is spelled INTRINSINCS here -- confirm it matches the name the build defines.
162 | #ifdef __MATH_INTRINSINCS_FAST__
163 |     printf("benchmark, mode fast, %d iterations\n\n", NUM_ITERATIONS);
164 | #else
165 |     printf("benchmark, mode precision, %d iterations\n\n", NUM_ITERATIONS);
166 | #endif
167 |     // sum of every benchmark's return value, printed once at the end
168 |     int output = 0;
169 |     // each call below prints its own timing line; the AVX and NEON lists cover the same functions in the same order
170 | #ifdef __MATH__INTRINSICS__AVX__
171 |     output += benchmark(mm256_acos_ps, acosf, "mm256_acos_ps");
172 |     output += benchmark(mm256_asin_ps, asinf, "mm256_asin_ps");
173 |     output += benchmark(mm256_atan_ps, atanf, "mm256_atan_ps");
174 |     output += benchmark2(mm256_atan2_ps, atan2f, "mm256_atan2_ps");
175 |     output += benchmark(mm256_cbrt_ps, cbrtf, "mm256_cbrt_ps");
176 |     output += benchmark(mm256_cos_ps, cosf, "mm256_cos_ps");
177 |     output += benchmark(mm256_sin_ps, sinf, "mm256_sin_ps");
178 |     output += benchmark(mm256_exp_ps, expf, "mm256_exp_ps");
179 |     output += benchmark(mm256_exp2_ps, exp2f, "mm256_exp2_ps");
180 |     output += benchmark(mm256_log_ps, logf, "mm256_log_ps");
181 |     output += benchmark(mm256_log2_ps, log2f, "mm256_log2_ps");
182 |     output += benchmark2(mm256_pow_ps, powf, "mm256_pow_ps");
183 | #else
184 |     output += benchmark(vacosq_f32, acosf, "vacosq_f32");
185 |     output += benchmark(vasinq_f32, asinf, "vasinq_f32");
186 |     output += benchmark(vatanq_f32, atanf, "vatanq_f32");
187 |     output += benchmark2(vatan2q_f32, atan2f, "vatan2q_f32");
188 |     output += benchmark(vcbrtq_f32, cbrtf, "vcbrtq_f32");
189 |     output += benchmark(vcosq_f32, cosf, "vcosq_f32");
190 |     output += benchmark(vsinq_f32, sinf, "vsinq_f32");
191 |     output += benchmark(vexpq_f32, expf, "vexpq_f32");
192 |     output += benchmark(vexp2q_f32, exp2f, "vexp2q_f32");
193 |     output += benchmark(vlogq_f32, logf, "vlogq_f32");
194 |     output += benchmark(vlog2q_f32, log2f, "vlog2q_f32");
195 |     output += benchmark2(vpowq_f32, powf, "vpowq_f32");
196 | #endif
197 |     // printing the accumulated value makes every benchmark result reach the program's output
198 |     printf("\n%d\n", output);
199 | 
200 |     return 0;
201 | }


--------------------------------------------------------------------------------
/tests/greatest.h:
--------------------------------------------------------------------------------
   1 | /*
   2 |  * Copyright (c) 2011-2021 Scott Vokes <vokes.s@gmail.com>
   3 |  *
   4 |  * Permission to use, copy, modify, and/or distribute this software for any
   5 |  * purpose with or without fee is hereby granted, provided that the above
   6 |  * copyright notice and this permission notice appear in all copies.
   7 |  *
   8 |  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
   9 |  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
  10 |  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
  11 |  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
  12 |  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
  13 |  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
  14 |  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
  15 |  */
  16 | 
  17 | #ifndef GREATEST_H
  18 | #define GREATEST_H
  19 | 
  20 | #if defined(__cplusplus) && !defined(GREATEST_NO_EXTERN_CPLUSPLUS)
  21 | extern "C" {
  22 | #endif
  23 | 
  24 | /* 1.5.0 */
  25 | #define GREATEST_VERSION_MAJOR 1
  26 | #define GREATEST_VERSION_MINOR 5
  27 | #define GREATEST_VERSION_PATCH 0
  28 | 
  29 | /* A unit testing system for C, contained in 1 file.
  30 |  * It doesn't use dynamic allocation or depend on anything
  31 |  * beyond ANSI C89.
  32 |  *
  33 |  * An up-to-date version can be found at:
  34 |  *     https://github.com/silentbicycle/greatest/
  35 |  */
  36 | 
  37 | 
  38 | /*********************************************************************
  39 |  * Minimal test runner template
  40 |  *********************************************************************/
  41 | #if 0
  42 | 
  43 | #include "greatest.h"
  44 | 
  45 | TEST foo_should_foo(void) {
  46 |     PASS();
  47 | }
  48 | 
  49 | static void setup_cb(void *data) {
  50 |     printf("setup callback for each test case\n");
  51 | }
  52 | 
  53 | static void teardown_cb(void *data) {
  54 |     printf("teardown callback for each test case\n");
  55 | }
  56 | 
  57 | SUITE(suite) {
  58 |     /* Optional setup/teardown callbacks which will be run before/after
  59 |      * every test case. If using a test suite, they will be cleared when
  60 |      * the suite finishes. */
  61 |     SET_SETUP(setup_cb, voidp_to_callback_data);
  62 |     SET_TEARDOWN(teardown_cb, voidp_to_callback_data);
  63 | 
  64 |     RUN_TEST(foo_should_foo);
  65 | }
  66 | 
  67 | /* Add definitions that need to be in the test runner's main file. */
  68 | GREATEST_MAIN_DEFS();
  69 | 
  70 | /* Set up, run suite(s) of tests, report pass/fail/skip stats. */
  71 | int run_tests(void) {
  72 |     GREATEST_INIT();            /* init. greatest internals */
  73 |     /* List of suites to run (if any). */
  74 |     RUN_SUITE(suite);
  75 | 
  76 |     /* Tests can also be run directly, without using test suites. */
  77 |     RUN_TEST(foo_should_foo);
  78 | 
  79 |     GREATEST_PRINT_REPORT();          /* display results */
  80 |     return greatest_all_passed();
  81 | }
  82 | 
  83 | /* main(), for a standalone command-line test runner.
  84 |  * This replaces run_tests above, and adds command line option
  85 |  * handling and exiting with a pass/fail status. */
  86 | int main(int argc, char **argv) {
  87 |     GREATEST_MAIN_BEGIN();      /* init & parse command-line args */
  88 |     RUN_SUITE(suite);
  89 |     GREATEST_MAIN_END();        /* display results */
  90 | }
  91 | 
  92 | #endif
  93 | /*********************************************************************/
  94 | 
  95 | 
  96 | #include <stdlib.h>
  97 | #include <stdio.h>
  98 | #include <string.h>
  99 | #include <ctype.h>
 100 | 
 101 | /***********
 102 |  * Options *
 103 |  ***********/
 104 | 
 105 | /* Default column width for non-verbose output. Like every option below, predefine it before including this header to override. */
 106 | #ifndef GREATEST_DEFAULT_WIDTH
 107 | #define GREATEST_DEFAULT_WIDTH 72
 108 | #endif
 109 | 
 110 | /* FILE *, for test logging (stdout unless predefined). */
 111 | #ifndef GREATEST_STDOUT
 112 | #define GREATEST_STDOUT stdout
 113 | #endif
 114 | 
 115 | /* Remove GREATEST_ prefix from most commonly used symbols? (1 = yes, the default) */
 116 | #ifndef GREATEST_USE_ABBREVS
 117 | #define GREATEST_USE_ABBREVS 1
 118 | #endif
 119 | 
 120 | /* Set to 0 to disable all use of setjmp/longjmp. The default here is 0, so GREATEST_FAIL_WITH_LONGJMP[m] is not defined unless this is predefined non-zero. */
 121 | #ifndef GREATEST_USE_LONGJMP
 122 | #define GREATEST_USE_LONGJMP 0
 123 | #endif
 124 | 
 125 | /* Make it possible to replace fprintf with another
 126 |  * function with the same interface. */
 127 | #ifndef GREATEST_FPRINTF
 128 | #define GREATEST_FPRINTF fprintf
 129 | #endif
 130 | /* <setjmp.h> is only included when longjmp support is enabled. */
 131 | #if GREATEST_USE_LONGJMP
 132 | #include <setjmp.h>
 133 | #endif
 134 | 
 135 | /* Set to 0 to disable all use of time.h / clock(). Defaults to 1. */
 136 | #ifndef GREATEST_USE_TIME
 137 | #define GREATEST_USE_TIME 1
 138 | #endif
 139 | /* <time.h> is only included when timing is enabled; the clock_t members of the structs below depend on it. */
 140 | #if GREATEST_USE_TIME
 141 | #include <time.h>
 142 | #endif
 143 | 
 144 | /* Floating point type, for ASSERT_IN_RANGE. GREATEST_FLOAT_FMT is only defaulted together with GREATEST_FLOAT, so predefine both or neither. */
 145 | #ifndef GREATEST_FLOAT
 146 | #define GREATEST_FLOAT double
 147 | #define GREATEST_FLOAT_FMT "%g"
 148 | #endif
 149 | 
 150 | /* Size of buffer for test name + optional '_' separator and suffix (sizes greatest_run_info.name_buf). */
 151 | #ifndef GREATEST_TESTNAME_BUF_SIZE
 152 | #define GREATEST_TESTNAME_BUF_SIZE 128
 153 | #endif
 154 | 
 155 | 
 156 | /*********
 157 |  * Types *
 158 |  *********/
 159 | 
 160 | /* Info for the currently running suite; embedded in greatest_run_info as its `suite` member. */
 161 | typedef struct greatest_suite_info {
 162 |     unsigned int tests_run;     /* counters for this suite; the overall totals are kept in greatest_run_info */
 163 |     unsigned int passed;
 164 |     unsigned int failed;
 165 |     unsigned int skipped;
 166 |     /* the clock_t members below exist only when GREATEST_USE_TIME is non-zero */
 167 | #if GREATEST_USE_TIME
 168 |     /* timers, pre/post running suite and individual tests */
 169 |     clock_t pre_suite;
 170 |     clock_t post_suite;
 171 |     clock_t pre_test;
 172 |     clock_t post_test;
 173 | #endif
 174 | } greatest_suite_info;
 175 | 
 176 | /* Type for a suite function: no arguments, no return value. These typedefs name function types; they are used through pointers. */
 177 | typedef void greatest_suite_cb(void);
 178 | 
 179 | /* Types for setup/teardown callbacks. If non-NULL, these will be run and passed the pointer
 180 |  * to their additional data. Both are registered together with that pointer (see GREATEST_SET_SETUP_CB / GREATEST_SET_TEARDOWN_CB). */
 181 | typedef void greatest_setup_cb(void *udata);
 182 | typedef void greatest_teardown_cb(void *udata);
 183 | 
 184 | /* Type for an equality comparison between two pointers of the same type.
 185 |  * Should return non-0 if equal, otherwise 0.
 186 |  * UDATA is a closure value, passed through from ASSERT_EQUAL_T[m]. */
 187 | typedef int greatest_equal_cb(const void *expd, const void *got, void *udata);
 188 | 
 189 | /* Type for a callback that prints a value pointed to by T.
 190 |  * Return value has the same meaning as printf's.
 191 |  * UDATA is a closure value, passed through from ASSERT_EQUAL_T[m]. */
 192 | typedef int greatest_printf_cb(const void *t, void *udata);
 193 | 
 194 | /* Callbacks for an arbitrary type; needed for type-specific
 195 |  * comparisons via GREATEST_ASSERT_EQUAL_T[m]. */
 196 | typedef struct greatest_type_info {
 197 |     greatest_equal_cb *equal;   /* GREATEST_ASSERT_EQUAL_Tm reports a missing callback when this is NULL */
 198 |     greatest_printf_cb *print;  /* optional, per the GREATEST_ASSERT_EQUAL_Tm description */
 199 | } greatest_type_info;
 200 | /* Argument bundle that GREATEST_ASSERT_MEM_EQm fills in and passes as UDATA. */
 201 | typedef struct greatest_memory_cmp_env {
 202 |     const unsigned char *exp;
 203 |     const unsigned char *got;
 204 |     size_t size;
 205 | } greatest_memory_cmp_env;
 206 | 
 207 | /* Callbacks for string and raw memory types, used by the STR/STRN and MEM assertions; declared here, defined elsewhere. */
 208 | extern greatest_type_info greatest_type_info_string;
 209 | extern greatest_type_info greatest_type_info_memory;
 210 | /* Bit flags tested against greatest_info.flags; each is queried by the macro named beside it. */
 211 | typedef enum {
 212 |     GREATEST_FLAG_FIRST_FAIL = 0x01,    /* GREATEST_FIRST_FAIL() */
 213 |     GREATEST_FLAG_LIST_ONLY = 0x02,     /* GREATEST_LIST_ONLY() */
 214 |     GREATEST_FLAG_ABORT_ON_FAIL = 0x04  /* GREATEST_ABORT_ON_FAIL(); when set, GREATEST_FAILm calls abort() */
 215 | } greatest_flag_t;
 216 | 
 217 | /* Internal state for a PRNG, used to shuffle test order. greatest_run_info holds two of these (suites, tests). */
 218 | struct greatest_prng {
 219 |     unsigned char random_order; /* use random ordering? */
 220 |     unsigned char initialized;  /* is random ordering initialized? */
 221 |     unsigned char pad_0[6];     /* presumably explicit padding before the unsigned long members -- confirm */
 222 |     unsigned long state;        /* PRNG state */
 223 |     unsigned long count;        /* how many tests, this pass */
 224 |     unsigned long count_ceil;   /* total number of tests */
 225 |     unsigned long count_run;    /* total tests run */
 226 |     unsigned long a;            /* LCG multiplier */
 227 |     unsigned long c;            /* LCG increment */
 228 |     unsigned long m;            /* LCG modulus, based on count_ceil */
 229 | };
 230 | 
 231 | /* Struct containing all test runner state; the single instance is the global greatest_info. */
 232 | typedef struct greatest_run_info {
 233 |     unsigned char flags;        /* greatest_flag_t bits, tested with & by the flag macros */
 234 |     unsigned char verbosity;    /* > 0 means verbose, see GREATEST_IS_VERBOSE() */
 235 |     unsigned char running_test; /* guard for nested RUN_TEST calls */
 236 |     unsigned char exact_name_match;
 237 | 
 238 |     unsigned int tests_run;     /* total test count */
 239 | 
 240 |     /* currently running test suite */
 241 |     greatest_suite_info suite;
 242 | 
 243 |     /* overall pass/fail/skip counts */
 244 |     unsigned int passed;
 245 |     unsigned int failed;
 246 |     unsigned int skipped;
 247 |     unsigned int assertions;    /* incremented by the ASSERT macros */
 248 | 
 249 |     /* info to print about the most recent failure */
 250 |     unsigned int fail_line;     /* __LINE__ recorded by GREATEST_FAILm */
 251 |     unsigned int pad_1;
 252 |     const char *fail_file;      /* __FILE__ recorded by GREATEST_FAILm */
 253 |     const char *msg;            /* set by the PASSm / FAILm / SKIPm macros */
 254 | 
 255 |     /* current setup/teardown hooks and userdata */
 256 |     greatest_setup_cb *setup;
 257 |     void *setup_udata;
 258 |     greatest_teardown_cb *teardown;
 259 |     void *teardown_udata;
 260 | 
 261 |     /* formatting info for ".....s...F"-style output */
 262 |     unsigned int col;
 263 |     unsigned int width;
 264 | 
 265 |     /* only run a specific suite or test */
 266 |     const char *suite_filter;
 267 |     const char *test_filter;
 268 |     const char *test_exclude;
 269 |     const char *name_suffix;    /* print suffix with test name */
 270 |     char name_buf[GREATEST_TESTNAME_BUF_SIZE];
 271 | 
 272 |     struct greatest_prng prng[2]; /* 0: suites, 1: tests */
 273 | 
 274 | #if GREATEST_USE_TIME
 275 |     /* overall timers */
 276 |     clock_t begin;
 277 |     clock_t end;
 278 | #endif
 279 |     /* the members below exist only when GREATEST_USE_LONGJMP is non-zero */
 280 | #if GREATEST_USE_LONGJMP
 281 |     int pad_jmp_buf;
 282 |     unsigned char pad_2[4];
 283 |     jmp_buf jump_dest;          /* longjmp target used by GREATEST_FAIL_WITH_LONGJMPm */
 284 | #endif
 285 | } greatest_run_info;
 286 | /* Counter set handed to greatest_get_report(); the members mirror the overall counters in greatest_run_info. */
 287 | struct greatest_report_t {
 288 |     /* overall pass/fail/skip counts */
 289 |     unsigned int passed;
 290 |     unsigned int failed;
 291 |     unsigned int skipped;
 292 |     unsigned int assertions;
 293 | };
 294 | 
 295 | /* Global var for the current testing context; every macro in this header reads or writes it.
 296 |  * Initialized by GREATEST_MAIN_DEFS(). */
 297 | extern greatest_run_info greatest_info;
 298 | 
 299 | /* Type for ASSERT_ENUM_EQ's ENUM_STR argument: maps an enum value (passed as int) to the string printed on mismatch. */
 300 | typedef const char *greatest_enum_str_fun(int value);
 301 | 
 302 | 
 303 | /**********************
 304 |  * Exported functions *
 305 |  **********************/
 306 | 
 307 | /* These are used internally by greatest macros (declarations only; the definitions are not in this part of the file). */
 308 | int greatest_test_pre(const char *name);    /* GREATEST_RUN_TEST runs the test only when this returns 1 */
 309 | void greatest_test_post(int res);           /* receives the greatest_test_res produced by the test */
 310 | int greatest_do_assert_equal_t(const void *expd, const void *got,
 311 |     greatest_type_info *type_info, void *udata); /* zero makes GREATEST_ASSERT_EQUAL_Tm fail */
 312 | void greatest_prng_init_first_pass(int id);
 313 | int greatest_prng_init_second_pass(int id, unsigned long seed);
 314 | void greatest_prng_step(int id);
 315 | 
 316 | /* These are part of the public greatest API. */
 317 | void GREATEST_SET_SETUP_CB(greatest_setup_cb *cb, void *udata);
 318 | void GREATEST_SET_TEARDOWN_CB(greatest_teardown_cb *cb, void *udata);
 319 | void GREATEST_INIT(void);
 320 | void GREATEST_PRINT_REPORT(void);
 321 | int greatest_all_passed(void);
 322 | void greatest_set_suite_filter(const char *filter);
 323 | void greatest_set_test_filter(const char *filter);
 324 | void greatest_set_test_exclude(const char *filter);
 325 | void greatest_set_exact_name_match(void);
 326 | void greatest_stop_at_first_fail(void);
 327 | void greatest_abort_on_fail(void);
 328 | void greatest_list_only(void);
 329 | void greatest_get_report(struct greatest_report_t *report);
 330 | unsigned int greatest_get_verbosity(void);
 331 | void greatest_set_verbosity(unsigned int verbosity);
 332 | void greatest_set_flag(greatest_flag_t flag);
 333 | void greatest_set_test_suffix(const char *suffix);
 334 | 
 335 | 
 336 | /********************
 337 | * Language Support *
 338 | ********************/
 339 | 
 340 | /* If __VA_ARGS__ is supported (C99 or later, i.e. __STDC_VERSION__ >= 199901L, or MSVC >= 1800),
 341 |  * allow parametric testing without needing to manually manage the argument struct. */
 342 | #if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L) ||       \
 343 |     (defined(_MSC_VER) && _MSC_VER >= 1800)
 344 | #define GREATEST_VA_ARGS
 345 | #endif
 346 | 
 347 | 
 348 | /**********
 349 |  * Macros *
 350 |  **********/
 351 | 
 352 | /* Define a suite. (The duplication is intentional -- it eliminates
 353 |  * a warning from -Wmissing-declarations.) The macro ends in a function header, so a body must follow. */
 354 | #define GREATEST_SUITE(NAME) void NAME(void); void NAME(void)
 355 | 
 356 | /* Declare a suite, provided by another compilation unit. */
 357 | #define GREATEST_SUITE_EXTERN(NAME) void NAME(void)
 358 | 
 359 | /* Start defining a test function: expands to the storage class and return type only.
 360 |  * The arguments are not included, to allow parametric testing. */
 361 | #define GREATEST_TEST static enum greatest_test_res
 362 | 
 363 | /* PASS/FAIL/SKIP result from a test. Used internally. PASS is 0; the RUN_TEST macros compare against it. */
 364 | typedef enum greatest_test_res {
 365 |     GREATEST_TEST_RES_PASS = 0,
 366 |     GREATEST_TEST_RES_FAIL = -1,
 367 |     GREATEST_TEST_RES_SKIP = 1
 368 | } greatest_test_res;
 369 | 
 370 | /* Run a suite: passes the suite function and its stringified name to greatest_run_suite(). */
 371 | #define GREATEST_RUN_SUITE(S_NAME) greatest_run_suite(S_NAME, #S_NAME)
 372 | /* Run a test in the current suite. TEST() is called only when greatest_test_pre() returns 1 and
 373 |  * GREATEST_SAVE_CONTEXT() (defined elsewhere) yields PASS; the result then goes to greatest_test_post(). */
 374 | #define GREATEST_RUN_TEST(TEST)                                         \
 375 |     do {                                                                \
 376 |         if (greatest_test_pre(#TEST) == 1) {                            \
 377 |             enum greatest_test_res res = GREATEST_SAVE_CONTEXT();       \
 378 |             if (res == GREATEST_TEST_RES_PASS) {                        \
 379 |                 res = TEST();                                           \
 380 |             }                                                           \
 381 |             greatest_test_post(res);                                    \
 382 |         }                                                               \
 383 |     } while (0)
 384 | 
 385 | /* Ignore a test, don't warn about it being unused (evaluates the name as a discarded expression). */
 386 | #define GREATEST_IGNORE_TEST(TEST) (void)TEST
 387 | 
 388 | /* Run a test in the current suite with one void * argument,
 389 |  * which can be a pointer to a struct with multiple arguments. Same flow as GREATEST_RUN_TEST, calling TEST(ENV). */
 390 | #define GREATEST_RUN_TEST1(TEST, ENV)                                   \
 391 |     do {                                                                \
 392 |         if (greatest_test_pre(#TEST) == 1) {                            \
 393 |             enum greatest_test_res res = GREATEST_SAVE_CONTEXT();       \
 394 |             if (res == GREATEST_TEST_RES_PASS) {                        \
 395 |                 res = TEST(ENV);                                        \
 396 |             }                                                           \
 397 |             greatest_test_post(res);                                    \
 398 |         }                                                               \
 399 |     } while (0)
 400 | /* Variadic form: forwards any argument list to TEST. Only defined when GREATEST_VA_ARGS is. */
 401 | #ifdef GREATEST_VA_ARGS
 402 | #define GREATEST_RUN_TESTp(TEST, ...)                                   \
 403 |     do {                                                                \
 404 |         if (greatest_test_pre(#TEST) == 1) {                            \
 405 |             enum greatest_test_res res = GREATEST_SAVE_CONTEXT();       \
 406 |             if (res == GREATEST_TEST_RES_PASS) {                        \
 407 |                 res = TEST(__VA_ARGS__);                                \
 408 |             }                                                           \
 409 |             greatest_test_post(res);                                    \
 410 |         }                                                               \
 411 |     } while (0)
 412 | #endif
 413 | 
 414 | 
 415 | /* Check if the test runner is in verbose mode; the macros after it test individual greatest_flag_t bits (non-zero when set). */
 416 | #define GREATEST_IS_VERBOSE() ((greatest_info.verbosity) > 0)
 417 | #define GREATEST_LIST_ONLY()                                            \
 418 |     (greatest_info.flags & GREATEST_FLAG_LIST_ONLY)
 419 | #define GREATEST_FIRST_FAIL()                                           \
 420 |     (greatest_info.flags & GREATEST_FLAG_FIRST_FAIL)
 421 | #define GREATEST_ABORT_ON_FAIL()                                        \
 422 |     (greatest_info.flags & GREATEST_FLAG_ABORT_ON_FAIL)
 423 | #define GREATEST_FAILURE_ABORT()                                        \
 424 |     (GREATEST_FIRST_FAIL() &&                                           \
 425 |         (greatest_info.suite.failed > 0 || greatest_info.failed > 0))
 426 | 
 427 | /* Message-less forms of tests defined below: the stringified argument expressions become the failure message. */
 428 | #define GREATEST_PASS() GREATEST_PASSm(NULL)
 429 | #define GREATEST_FAIL() GREATEST_FAILm(NULL)
 430 | #define GREATEST_SKIP() GREATEST_SKIPm(NULL)
 431 | #define GREATEST_ASSERT(COND)                                           \
 432 |     GREATEST_ASSERTm(#COND, COND)
 433 | #define GREATEST_ASSERT_OR_LONGJMP(COND)                                \
 434 |     GREATEST_ASSERT_OR_LONGJMPm(#COND, COND)
 435 | #define GREATEST_ASSERT_FALSE(COND)                                     \
 436 |     GREATEST_ASSERT_FALSEm(#COND, COND)
 437 | #define GREATEST_ASSERT_EQ(EXP, GOT)                                    \
 438 |     GREATEST_ASSERT_EQm(#EXP " != " #GOT, EXP, GOT)
 439 | #define GREATEST_ASSERT_NEQ(EXP, GOT)                                   \
 440 |     GREATEST_ASSERT_NEQm(#EXP " == " #GOT, EXP, GOT)
 441 | #define GREATEST_ASSERT_GT(EXP, GOT)                                    \
 442 |     GREATEST_ASSERT_GTm(#EXP " <= " #GOT, EXP, GOT)
 443 | #define GREATEST_ASSERT_GTE(EXP, GOT)                                   \
 444 |     GREATEST_ASSERT_GTEm(#EXP " < " #GOT, EXP, GOT)
 445 | #define GREATEST_ASSERT_LT(EXP, GOT)                                    \
 446 |     GREATEST_ASSERT_LTm(#EXP " >= " #GOT, EXP, GOT)
 447 | #define GREATEST_ASSERT_LTE(EXP, GOT)                                   \
 448 |     GREATEST_ASSERT_LTEm(#EXP " > " #GOT, EXP, GOT)
 449 | #define GREATEST_ASSERT_EQ_FMT(EXP, GOT, FMT)                           \
 450 |     GREATEST_ASSERT_EQ_FMTm(#EXP " != " #GOT, EXP, GOT, FMT)
 451 | #define GREATEST_ASSERT_IN_RANGE(EXP, GOT, TOL)                         \
 452 |     GREATEST_ASSERT_IN_RANGEm(#EXP " != " #GOT " +/- " #TOL, EXP, GOT, TOL)
 453 | #define GREATEST_ASSERT_EQUAL_T(EXP, GOT, TYPE_INFO, UDATA)             \
 454 |     GREATEST_ASSERT_EQUAL_Tm(#EXP " != " #GOT, EXP, GOT, TYPE_INFO, UDATA)
 455 | #define GREATEST_ASSERT_STR_EQ(EXP, GOT)                                \
 456 |     GREATEST_ASSERT_STR_EQm(#EXP " != " #GOT, EXP, GOT)
 457 | #define GREATEST_ASSERT_STRN_EQ(EXP, GOT, SIZE)                         \
 458 |     GREATEST_ASSERT_STRN_EQm(#EXP " != " #GOT, EXP, GOT, SIZE)
 459 | #define GREATEST_ASSERT_MEM_EQ(EXP, GOT, SIZE)                          \
 460 |     GREATEST_ASSERT_MEM_EQm(#EXP " != " #GOT, EXP, GOT, SIZE)
 461 | #define GREATEST_ASSERT_ENUM_EQ(EXP, GOT, ENUM_STR)                     \
 462 |     GREATEST_ASSERT_ENUM_EQm(#EXP " != " #GOT, EXP, GOT, ENUM_STR)
 463 | 
 464 | /* The following forms take an additional message argument first, to be displayed by the test runner.
 465 |  * Each one bumps greatest_info.assertions and, on failure, returns from the enclosing test via GREATEST_FAILm. */
 466 | 
 467 | /* Fail if a condition is not true, with message. */
 468 | #define GREATEST_ASSERTm(MSG, COND)                                     \
 469 |     do {                                                                \
 470 |         greatest_info.assertions++;                                     \
 471 |         if (!(COND)) { GREATEST_FAILm(MSG); }                           \
 472 |     } while (0)
 473 | 
 474 | /* Fail if a condition is not true, longjmping out of test. Expands to GREATEST_FAIL_WITH_LONGJMPm, which only exists when GREATEST_USE_LONGJMP is non-zero. */
 475 | #define GREATEST_ASSERT_OR_LONGJMPm(MSG, COND)                          \
 476 |     do {                                                                \
 477 |         greatest_info.assertions++;                                     \
 478 |         if (!(COND)) { GREATEST_FAIL_WITH_LONGJMPm(MSG); }              \
 479 |     } while (0)
 480 | 
 481 | /* Fail if a condition is not false, with message. */
 482 | #define GREATEST_ASSERT_FALSEm(MSG, COND)                               \
 483 |     do {                                                                \
 484 |         greatest_info.assertions++;                                     \
 485 |         if ((COND)) { GREATEST_FAILm(MSG); }                            \
 486 |     } while (0)
 487 | 
 488 | /* Internal macro for relational assertions: fails unless (EXP) REL (GOT) holds; each operand is evaluated once. */
 489 | #define GREATEST__REL(REL, MSG, EXP, GOT)                               \
 490 |     do {                                                                \
 491 |         greatest_info.assertions++;                                     \
 492 |         if (!((EXP) REL (GOT))) { GREATEST_FAILm(MSG); }                \
 493 |     } while (0)
 494 | 
 495 | /* Fail if EXP is not ==, !=, >, <, >=, or <= to GOT. */
 496 | #define GREATEST_ASSERT_EQm(MSG,E,G) GREATEST__REL(==, MSG,E,G)
 497 | #define GREATEST_ASSERT_NEQm(MSG,E,G) GREATEST__REL(!=, MSG,E,G)
 498 | #define GREATEST_ASSERT_GTm(MSG,E,G) GREATEST__REL(>, MSG,E,G)
 499 | #define GREATEST_ASSERT_GTEm(MSG,E,G) GREATEST__REL(>=, MSG,E,G)
 500 | #define GREATEST_ASSERT_LTm(MSG,E,G) GREATEST__REL(<, MSG,E,G)
 501 | #define GREATEST_ASSERT_LTEm(MSG,E,G) GREATEST__REL(<=, MSG,E,G)
 502 | 
 503 | /* Fail if EXP != GOT (equality comparison by ==), printing both values with FMT to GREATEST_STDOUT.
 504 |  * Warning: FMT, EXP, and GOT will be evaluated more
 505 |  * than once on failure. */
 506 | #define GREATEST_ASSERT_EQ_FMTm(MSG, EXP, GOT, FMT)                     \
 507 |     do {                                                                \
 508 |         greatest_info.assertions++;                                     \
 509 |         if ((EXP) != (GOT)) {                                           \
 510 |             GREATEST_FPRINTF(GREATEST_STDOUT, "\nExpected: ");          \
 511 |             GREATEST_FPRINTF(GREATEST_STDOUT, FMT, EXP);                \
 512 |             GREATEST_FPRINTF(GREATEST_STDOUT, "\n     Got: ");          \
 513 |             GREATEST_FPRINTF(GREATEST_STDOUT, FMT, GOT);                \
 514 |             GREATEST_FPRINTF(GREATEST_STDOUT, "\n");                    \
 515 |             GREATEST_FAILm(MSG);                                        \
 516 |         }                                                               \
 517 |     } while (0)
 518 | 
 519 | /* Fail if EXP is not equal to GOT, printing enum IDs via ENUM_STR. Counts as one assertion, like the other ASSERT macros. */
 520 | #define GREATEST_ASSERT_ENUM_EQm(MSG, EXP, GOT, ENUM_STR)               \
 521 |     do {                                                                \
 522 |         int greatest_EXP = (int)(EXP), greatest_GOT = (int)(GOT);       \
 523 |         greatest_enum_str_fun *greatest_ENUM_STR = ENUM_STR;            \
 524 |         greatest_info.assertions++;                                     \
 525 |         if (greatest_EXP != greatest_GOT) {                             \
 526 |             GREATEST_FPRINTF(GREATEST_STDOUT, "\nExpected: %s",         \
 527 |                 greatest_ENUM_STR(greatest_EXP));                       \
 528 |             GREATEST_FPRINTF(GREATEST_STDOUT, "\n     Got: %s\n",       \
 529 |                 greatest_ENUM_STR(greatest_GOT));                       \
 530 |             GREATEST_FAILm(MSG);                                        \
 531 |         }                                                               \
 532 |     } while (0)
 533 | 
 534 | /* Fail if GOT not in range of EXP +|- TOL. Operands are converted to GREATEST_FLOAT once each. NOTE(review): a NaN operand makes both comparisons false, so no failure is reported -- confirm this is intended. */
 535 | #define GREATEST_ASSERT_IN_RANGEm(MSG, EXP, GOT, TOL)                   \
 536 |     do {                                                                \
 537 |         GREATEST_FLOAT greatest_EXP = (EXP);                            \
 538 |         GREATEST_FLOAT greatest_GOT = (GOT);                            \
 539 |         GREATEST_FLOAT greatest_TOL = (TOL);                            \
 540 |         greatest_info.assertions++;                                     \
 541 |         if ((greatest_EXP > greatest_GOT &&                             \
 542 |                 greatest_EXP - greatest_GOT > greatest_TOL) ||          \
 543 |             (greatest_EXP < greatest_GOT &&                             \
 544 |                 greatest_GOT - greatest_EXP > greatest_TOL)) {          \
 545 |             GREATEST_FPRINTF(GREATEST_STDOUT,                           \
 546 |                 "\nExpected: " GREATEST_FLOAT_FMT                       \
 547 |                 " +/- " GREATEST_FLOAT_FMT                              \
 548 |                 "\n     Got: " GREATEST_FLOAT_FMT                       \
 549 |                 "\n",                                                   \
 550 |                 greatest_EXP, greatest_TOL, greatest_GOT);              \
 551 |             GREATEST_FAILm(MSG);                                        \
 552 |         }                                                               \
 553 |     } while (0)
 554 | 
 555 | /* Fail if EXP is not equal to GOT, according to strcmp (delegates to greatest_type_info_string with a NULL UDATA). */
 556 | #define GREATEST_ASSERT_STR_EQm(MSG, EXP, GOT)                          \
 557 |     do {                                                                \
 558 |         GREATEST_ASSERT_EQUAL_Tm(MSG, EXP, GOT,                         \
 559 |             &greatest_type_info_string, NULL);                          \
 560 |     } while (0)
 561 | 
 562 | /* Fail if EXP is not equal to GOT, according to strncmp. SIZE is evaluated once into a greatest_-prefixed local, so caller variables such as `size` are not shadowed. */
 563 | #define GREATEST_ASSERT_STRN_EQm(MSG, EXP, GOT, SIZE)                   \
 564 |     do {                                                                \
 565 |         size_t greatest_SIZE = (SIZE);                                  \
 566 |         GREATEST_ASSERT_EQUAL_Tm(MSG, EXP, GOT,                         \
 567 |             &greatest_type_info_string, &greatest_SIZE);                \
 568 |     } while (0)
 569 | 
 570 | /* Fail if EXP is not equal to GOT, according to memcmp. Arguments are parenthesized before the byte-pointer cast, so pointer expressions like `p + 1` keep their own element size. */
 571 | #define GREATEST_ASSERT_MEM_EQm(MSG, EXP, GOT, SIZE)                    \
 572 |     do {                                                                \
 573 |         greatest_memory_cmp_env greatest_ENV;                           \
 574 |         greatest_ENV.exp = (const unsigned char *)(EXP);                \
 575 |         greatest_ENV.got = (const unsigned char *)(GOT);                \
 576 |         greatest_ENV.size = (SIZE);                                     \
 577 |         GREATEST_ASSERT_EQUAL_Tm(MSG, greatest_ENV.exp, greatest_ENV.got, \
 578 |             &greatest_type_info_memory, &greatest_ENV);                 \
 579 |     } while (0)
 580 | 
 581 | /* Fail if EXP is not equal to GOT, according to a comparison
 582 |  * callback in TYPE_INFO. If they are not equal, optionally use a
 583 |  * print callback in TYPE_INFO to print them. TYPE_INFO is evaluated once into a greatest_-prefixed local. */
 584 | #define GREATEST_ASSERT_EQUAL_Tm(MSG, EXP, GOT, TYPE_INFO, UDATA)       \
 585 |     do {                                                                \
 586 |         greatest_type_info *greatest_TI = (TYPE_INFO);                  \
 587 |         greatest_info.assertions++;                                     \
 588 |         if (!greatest_do_assert_equal_t(EXP, GOT,                       \
 589 |                 greatest_TI, UDATA)) {                                  \
 590 |             if (greatest_TI == NULL || greatest_TI->equal == NULL) {    \
 591 |                 GREATEST_FAILm("type_info->equal callback missing!");   \
 592 |             } else {                                                    \
 593 |                 GREATEST_FAILm(MSG);                                    \
 594 |             }                                                           \
 595 |         }                                                               \
 596 |     } while (0)
 597 | 
 598 | /* Pass. */
 599 | #define GREATEST_PASSm(MSG)                                             \
 600 |     do {                                                                \
 601 |         greatest_info.msg = MSG;                                        \
 602 |         return GREATEST_TEST_RES_PASS;                                  \
 603 |     } while (0)
 604 | 
 605 | /* Fail: record file/line and MSG, then abort() or return RES_FAIL. */
 606 | #define GREATEST_FAILm(MSG)                                             \
 607 |     do {                                                                \
 608 |         greatest_info.fail_file = __FILE__;  /* site of the macro use */\
 609 |         greatest_info.fail_line = __LINE__;                             \
 610 |         greatest_info.msg = MSG;                                        \
 611 |         if (GREATEST_ABORT_ON_FAIL()) { abort(); }                      \
 612 |         return GREATEST_TEST_RES_FAIL;                                  \
 613 |     } while (0)
 614 | 
 615 | /* Optional GREATEST_FAILm variant that longjmps to jump_dest (RES_FAIL). */
 616 | #if GREATEST_USE_LONGJMP
 617 | #define GREATEST_FAIL_WITH_LONGJMP() GREATEST_FAIL_WITH_LONGJMPm(NULL)
 618 | #define GREATEST_FAIL_WITH_LONGJMPm(MSG)                                \
 619 |     do {                                                                \
 620 |         greatest_info.fail_file = __FILE__;  /* site of the macro use */\
 621 |         greatest_info.fail_line = __LINE__;                             \
 622 |         greatest_info.msg = MSG;                                        \
 623 |         longjmp(greatest_info.jump_dest, GREATEST_TEST_RES_FAIL);       \
 624 |     } while (0)
 625 | #endif
 626 | 
 627 | /* Skip the current test: store MSG and return RES_SKIP to the caller. */
 628 | #define GREATEST_SKIPm(MSG)                                             \
 629 |     do {                                                                \
 630 |         greatest_info.msg = MSG;                                        \
 631 |         return GREATEST_TEST_RES_SKIP;                                  \
 632 |     } while (0)
 633 | 
 634 | /* Check a subfunction's result RES: if it is not RES_PASS, return it. */
 635 | #define GREATEST_CHECK_CALL(RES)                                        \
 636 |     do {                                                                \
 637 |         enum greatest_test_res greatest_RES = RES;  /* evaluate once */ \
 638 |         if (greatest_RES != GREATEST_TEST_RES_PASS) {                   \
 639 |             return greatest_RES;                                        \
 640 |         }                                                               \
 641 |     } while (0)
 642 | 
 643 | #if GREATEST_USE_TIME  /* timing on: SET_TIME samples clock() into NAME */
 644 | #define GREATEST_SET_TIME(NAME)     /* two statements, not do/while */  \
 645 |     NAME = clock();                                                     \
 646 |     if (NAME == (clock_t) -1) {     /* clock() reported failure */      \
 647 |         GREATEST_FPRINTF(GREATEST_STDOUT,                               \
 648 |             "clock error: %s\n", #NAME);                                \
 649 |         exit(EXIT_FAILURE);                                             \
 650 |     }
 651 | /* Print the tick difference and elapsed seconds between C1 and C2. */
 652 | #define GREATEST_CLOCK_DIFF(C1, C2)                                     \
 653 |     GREATEST_FPRINTF(GREATEST_STDOUT, " (%lu ticks, %.3f sec)",         \
 654 |         (long unsigned int) (C2) - (long unsigned int)(C1),             \
 655 |         (double)((C2) - (C1)) / (1.0 * (double)CLOCKS_PER_SEC))
 656 | #else  /* timing off: both macros expand to nothing */
 657 | #define GREATEST_SET_TIME(UNUSED)
 658 | #define GREATEST_CLOCK_DIFF(UNUSED1, UNUSED2)
 659 | #endif
 660 | 
 661 | #if GREATEST_USE_LONGJMP  /* SAVE_CONTEXT() is an expression in both branches */
 662 | #define GREATEST_SAVE_CONTEXT()                                         \
 663 |         /* setjmp returns 0 (GREATEST_TEST_RES_PASS) when first called, \
 664 |          * and RES_FAIL when FAIL_WITH_LONGJMP jumps back here. */      \
 665 |         ((enum greatest_test_res)(setjmp(greatest_info.jump_dest)))
 666 | #else
 667 | #define GREATEST_SAVE_CONTEXT()                                         \
 668 |     /* a no-op, since setjmp/longjmp aren't being used */               \
 669 |     GREATEST_TEST_RES_PASS
 670 | #endif
 671 | 
 672 | /* Run every suite / test function invoked within BODY in pseudo-random
 673 |  * order, seeded by SD. (The top 3 bits of the seed are ignored.)
 674 |  *
 675 |  * This should be called like:
 676 |  *     GREATEST_SHUFFLE_TESTS(seed, {
 677 |  *         GREATEST_RUN_TEST(some_test);
 678 |  *         GREATEST_RUN_TEST(some_other_test);
 679 |  *         GREATEST_RUN_TEST(yet_another_test);
 680 |  *     });
 681 |  *
 682 |  * Note that BODY is evaluated multiple times: the loop below repeats it
 683 |  * until count_run reaches count_ceil or GREATEST_FAILURE_ABORT(). */
 684 | #define GREATEST_SHUFFLE_SUITES(SD, BODY) GREATEST_SHUFFLE(0, SD, BODY)
 685 | #define GREATEST_SHUFFLE_TESTS(SD, BODY) GREATEST_SHUFFLE(1, SD, BODY)
 686 | #define GREATEST_SHUFFLE(ID, SD, BODY)                                  \
 687 |     do {                                                                \
 688 |         struct greatest_prng *prng = &greatest_info.prng[ID];           \
 689 |         greatest_prng_init_first_pass(ID);                              \
 690 |         do {                                                            \
 691 |             prng->count = 0;  /* recounted during BODY */               \
 692 |             if (prng->initialized) { greatest_prng_step(ID); }          \
 693 |             BODY;                                                       \
 694 |             if (!prng->initialized) {                                   \
 695 |                 if (!greatest_prng_init_second_pass(ID, SD)) { break; } \
 696 |             } else if (prng->count_run == prng->count_ceil) {           \
 697 |                 break;                                                  \
 698 |             }                                                           \
 699 |         } while (!GREATEST_FAILURE_ABORT());                            \
 700 |         prng->count_run = prng->random_order = prng->initialized = 0;   \
 701 |     } while(0)
 702 | 
 703 | /* Include several function definitions in the main test file. */
 704 | #define GREATEST_MAIN_DEFS()                                            \
 705 | /* Return 1 if FILTER occurs in NAME (exact mode: same length too),     \
 706 |  * else 0. A NULL or empty FILTER yields RES_IF_NONE instead. */        \
 707 | static int greatest_name_match(const char *name, const char *filter,    \
 708 |         int res_if_none) {                                              \
 709 |     size_t offset = 0;                                                  \
 710 |     size_t filter_len = filter ? strlen(filter) : 0;                    \
 711 |     if (filter_len == 0) { return res_if_none; } /* no filter */        \
 712 |     if (greatest_info.exact_name_match && strlen(name) != filter_len) { \
 713 |         return 0; /* ignore substring matches */                        \
 714 |     }                                                                   \
 715 |     while (name[offset] != '\0') {                                      \
 716 |         if (name[offset] == filter[0]) {  /* cheap first-char check */  \
 717 |             if (0 == strncmp(&name[offset], filter, filter_len)) {      \
 718 |                 return 1;                                               \
 719 |             }                                                           \
 720 |         }                                                               \
 721 |         offset++;                                                       \
 722 |     }                                                                   \
 723 |                                                                         \
 724 |     return 0;                                                           \
 725 | }                                                                       \
 726 | /* Fill name_buf with NAME, plus "_" and name_suffix when one is set. */\
 727 | static void greatest_buffer_test_name(const char *name) {               \
 728 |     struct greatest_run_info *g = &greatest_info;                       \
 729 |     size_t len = strlen(name), size = sizeof(g->name_buf);              \
 730 |     memset(g->name_buf, 0x00, size);                                    \
 731 |     (void)strncat(g->name_buf, name, size - 1);                         \
 732 |     if (g->name_suffix && (len + 1 < size)) {  /* room for '_' */       \
 733 |         g->name_buf[len] = '_';                                         \
 734 |         strncat(&g->name_buf[len+1], g->name_suffix, size-(len+2));     \
 735 |     }                                                                   \
 736 | }                                                                       \
 737 | /* Called before each test: buffer NAME, apply the test filter/exclude, \
 738 |  * list-only and shuffle checks, then call the setup hook. Returns 1 if \
 739 |  * the test should run, else 0 (name_suffix is cleared on skips). */    \
 740 | int greatest_test_pre(const char *name) {                               \
 741 |     struct greatest_run_info *g = &greatest_info;                       \
 742 |     int match;                                                          \
 743 |     greatest_buffer_test_name(name);                                    \
 744 |     match = greatest_name_match(g->name_buf, g->test_filter, 1) &&      \
 745 |       !greatest_name_match(g->name_buf, g->test_exclude, 0);            \
 746 |     if (GREATEST_LIST_ONLY()) {   /* just listing test names */         \
 747 |         if (match) {                                                    \
 748 |             GREATEST_FPRINTF(GREATEST_STDOUT, "  %s\n", g->name_buf);   \
 749 |         }                                                               \
 750 |         goto clear;                                                     \
 751 |     }                                                                   \
 752 |     if (match && (!GREATEST_FIRST_FAIL() || g->suite.failed == 0)) {    \
 753 |             struct greatest_prng *p = &g->prng[1];  /* test shuffle */  \
 754 |         if (p->random_order) {                                          \
 755 |             p->count++;                                                 \
 756 |             if (!p->initialized || ((p->count - 1) != p->state)) {      \
 757 |                 goto clear;       /* don't run this test yet */         \
 758 |             }                                                           \
 759 |         }                                                               \
 760 |         if (g->running_test) {                                          \
 761 |             fprintf(stderr, "Error: Test run inside another test.\n");  \
 762 |             return 0;                                                   \
 763 |         }                                                               \
 764 |         GREATEST_SET_TIME(g->suite.pre_test);                           \
 765 |         if (g->setup) { g->setup(g->setup_udata); }                     \
 766 |         p->count_run++;                                                 \
 767 |         g->running_test = 1;                                            \
 768 |         return 1;                 /* test should be run */              \
 769 |     } else {                                                            \
 770 |         goto clear;               /* skipped */                         \
 771 |     }                                                                   \
 772 | clear:                                                                  \
 773 |     g->name_suffix = NULL;                                              \
 774 |     return 0;                                                           \
 775 | }                                                                       \
 776 | /* Report a pass (verbose: "PASS name: msg", else ".") and count it. */ \
 777 | static void greatest_do_pass(void) {                                    \
 778 |     struct greatest_run_info *g = &greatest_info;                       \
 779 |     if (GREATEST_IS_VERBOSE()) {                                        \
 780 |         GREATEST_FPRINTF(GREATEST_STDOUT, "PASS %s: %s",                \
 781 |             g->name_buf, g->msg ? g->msg : "");                         \
 782 |     } else {                                                            \
 783 |         GREATEST_FPRINTF(GREATEST_STDOUT, ".");                         \
 784 |     }                                                                   \
 785 |     g->suite.passed++;                                                  \
 786 | }                                                                       \
 787 | /* Report a failure with its message and file:line, then count it. */   \
 788 | static void greatest_do_fail(void) {                                    \
 789 |     struct greatest_run_info *g = &greatest_info;                       \
 790 |     if (GREATEST_IS_VERBOSE()) {                                        \
 791 |         GREATEST_FPRINTF(GREATEST_STDOUT,                               \
 792 |             "FAIL %s: %s (%s:%u)", g->name_buf,                         \
 793 |             g->msg ? g->msg : "", g->fail_file, g->fail_line);          \
 794 |     } else {                                                            \
 795 |         GREATEST_FPRINTF(GREATEST_STDOUT, "F");                         \
 796 |         g->col++;  /* add linebreak if in line of '.'s */               \
 797 |         if (g->col != 0) {                                              \
 798 |             GREATEST_FPRINTF(GREATEST_STDOUT, "\n");                    \
 799 |             g->col = 0;                                                 \
 800 |         }                                                               \
 801 |         GREATEST_FPRINTF(GREATEST_STDOUT, "FAIL %s: %s (%s:%u)\n",      \
 802 |             g->name_buf, g->msg ? g->msg : "",                          \
 803 |             g->fail_file, g->fail_line);                                \
 804 |     }                                                                   \
 805 |     g->suite.failed++;                                                  \
 806 | }                                                                       \
 807 | /* Report a skip (verbose: "SKIP name: msg", else "s") and count it. */ \
 808 | static void greatest_do_skip(void) {                                    \
 809 |     struct greatest_run_info *g = &greatest_info;                       \
 810 |     if (GREATEST_IS_VERBOSE()) {                                        \
 811 |         GREATEST_FPRINTF(GREATEST_STDOUT, "SKIP %s: %s",                \
 812 |             g->name_buf, g->msg ? g->msg : "");                         \
 813 |     } else {                                                            \
 814 |         GREATEST_FPRINTF(GREATEST_STDOUT, "s");                         \
 815 |     }                                                                   \
 816 |     g->suite.skipped++;                                                 \
 817 | }                                                                       \
 818 | /* After a test: run teardown, report RES, update counters/output. */   \
 819 | void greatest_test_post(int res) {                                      \
 820 |     GREATEST_SET_TIME(greatest_info.suite.post_test);                   \
 821 |     if (greatest_info.teardown) {                                       \
 822 |         void *udata = greatest_info.teardown_udata;                     \
 823 |         greatest_info.teardown(udata);                                  \
 824 |     }                                                                   \
 825 |                                                                         \
 826 |     greatest_info.running_test = 0;                                     \
 827 |     if (res <= GREATEST_TEST_RES_FAIL) {                                \
 828 |         greatest_do_fail();                                             \
 829 |     } else if (res >= GREATEST_TEST_RES_SKIP) {                         \
 830 |         greatest_do_skip();                                             \
 831 |     } else if (res == GREATEST_TEST_RES_PASS) {                         \
 832 |         greatest_do_pass();                                             \
 833 |     }                                                                   \
 834 |     greatest_info.name_suffix = NULL;                                   \
 835 |     greatest_info.suite.tests_run++;                                    \
 836 |     greatest_info.col++;                                                \
 837 |     if (GREATEST_IS_VERBOSE()) {                                        \
 838 |         GREATEST_CLOCK_DIFF(greatest_info.suite.pre_test,               \
 839 |             greatest_info.suite.post_test);                             \
 840 |         GREATEST_FPRINTF(GREATEST_STDOUT, "\n");                        \
 841 |     } else if (greatest_info.col % greatest_info.width == 0) {          \
 842 |         GREATEST_FPRINTF(GREATEST_STDOUT, "\n");                        \
 843 |         greatest_info.col = 0;                                          \
 844 |     }                                                                   \
 845 |     fflush(GREATEST_STDOUT);                                            \
 846 | }                                                                       \
 847 | /* Print the current suite's totals (and timing) if any tests ran. */   \
 848 | static void report_suite(void) {                                        \
 849 |     if (greatest_info.suite.tests_run > 0) {                            \
 850 |         GREATEST_FPRINTF(GREATEST_STDOUT,                               \
 851 |             "\n%u test%s - %u passed, %u failed, %u skipped",           \
 852 |             greatest_info.suite.tests_run,                              \
 853 |             greatest_info.suite.tests_run == 1 ? "" : "s",              \
 854 |             greatest_info.suite.passed,                                 \
 855 |             greatest_info.suite.failed,                                 \
 856 |             greatest_info.suite.skipped);                               \
 857 |         GREATEST_CLOCK_DIFF(greatest_info.suite.pre_suite,              \
 858 |             greatest_info.suite.post_suite);                            \
 859 |         GREATEST_FPRINTF(GREATEST_STDOUT, "\n");                        \
 860 |     }                                                                   \
 861 | }                                                                       \
 862 | /* Clear hooks, add suite counts to the run totals, reset the suite. */ \
 863 | static void update_counts_and_reset_suite(void) {                       \
 864 |     greatest_info.setup = NULL;                                         \
 865 |     greatest_info.setup_udata = NULL;                                   \
 866 |     greatest_info.teardown = NULL;                                      \
 867 |     greatest_info.teardown_udata = NULL;                                \
 868 |     greatest_info.passed += greatest_info.suite.passed;                 \
 869 |     greatest_info.failed += greatest_info.suite.failed;                 \
 870 |     greatest_info.skipped += greatest_info.suite.skipped;               \
 871 |     greatest_info.tests_run += greatest_info.suite.tests_run;           \
 872 |     memset(&greatest_info.suite, 0, sizeof(greatest_info.suite));       \
 873 |     greatest_info.col = 0;                                              \
 874 | }                                                                       \
 875 | /* Return 1 (after resetting suite state) if SUITE_NAME should run. */  \
 876 | static int greatest_suite_pre(const char *suite_name) {                 \
 877 |     struct greatest_prng *p = &greatest_info.prng[0];  /* suites */     \
 878 |     if (!greatest_name_match(suite_name, greatest_info.suite_filter, 1) \
 879 |         || (GREATEST_FAILURE_ABORT())) { return 0; }                    \
 880 |     if (p->random_order) {                                              \
 881 |         p->count++;                                                     \
 882 |         if (!p->initialized || ((p->count - 1) != p->state)) {          \
 883 |             return 0; /* don't run this suite yet */                    \
 884 |         }                                                               \
 885 |     }                                                                   \
 886 |     p->count_run++;                                                     \
 887 |     update_counts_and_reset_suite();                                    \
 888 |     GREATEST_FPRINTF(GREATEST_STDOUT, "\n* Suite %s:\n", suite_name);   \
 889 |     GREATEST_SET_TIME(greatest_info.suite.pre_suite);                   \
 890 |     return 1;                                                           \
 891 | }                                                                       \
 892 | /* After a suite: record its end time and print its report. */          \
 893 | static void greatest_suite_post(void) {                                 \
 894 |     GREATEST_SET_TIME(greatest_info.suite.post_suite);                  \
 895 |     report_suite();                                                     \
 896 | }                                                                       \
 897 | /* Run SUITE_CB between the suite pre/post hooks, if it is selected. */ \
 898 | static void greatest_run_suite(greatest_suite_cb *suite_cb,             \
 899 |                                const char *suite_name) {                \
 900 |     if (greatest_suite_pre(suite_name)) {                               \
 901 |         suite_cb();                                                     \
 902 |         greatest_suite_post();                                          \
 903 |     }                                                                   \
 904 | }                                                                       \
 905 | /* Compare EXPD to GOT via type_info->equal; print both on mismatch. */ \
 906 | int greatest_do_assert_equal_t(const void *expd, const void *got,       \
 907 |         greatest_type_info *type_info, void *udata) {                   \
 908 |     int eq = 0;                                                         \
 909 |     if (type_info == NULL || type_info->equal == NULL) { return 0; }    \
 910 |     eq = type_info->equal(expd, got, udata);                            \
 911 |     if (!eq) {                                                          \
 912 |         if (type_info->print != NULL) {  /* printing is optional */     \
 913 |             GREATEST_FPRINTF(GREATEST_STDOUT, "\nExpected: ");          \
 914 |             (void)type_info->print(expd, udata);                        \
 915 |             GREATEST_FPRINTF(GREATEST_STDOUT, "\n     Got: ");          \
 916 |             (void)type_info->print(got, udata);                         \
 917 |             GREATEST_FPRINTF(GREATEST_STDOUT, "\n");                    \
 918 |         }                                                               \
 919 |     }                                                                   \
 920 |     return eq;                                                          \
 921 | }                                                                       \
 922 | /* Print command-line usage for program NAME to GREATEST_STDOUT. */     \
 923 | static void greatest_usage(const char *name) {                          \
 924 |     GREATEST_FPRINTF(GREATEST_STDOUT,                                   \
 925 |         "Usage: %s [-hlfavex] [-s SUITE] [-t TEST] [-x EXCLUDE]\n"      \
 926 |         "  -h, --help  print this Help\n"                               \
 927 |         "  -l          List suites and tests, then exit (dry run)\n"    \
 928 |         "  -f          Stop runner after first failure\n"               \
 929 |         "  -a          Abort on first failure (implies -f)\n"           \
 930 |         "  -v          Verbose output\n"                                \
 931 |         "  -s SUITE    only run suites containing substring SUITE\n"    \
 932 |         "  -t TEST     only run tests containing substring TEST\n"      \
 933 |         "  -e          only run exact name match for -s or -t\n"        \
 934 |         "  -x EXCLUDE  exclude tests containing substring EXCLUDE\n",   \
 935 |         name);                                                          \
 936 | }                                                                       \
 937 | /* Parse runner flags from argv; prints usage and exits on bad input. */\
 938 | static void greatest_parse_options(int argc, char **argv) {             \
 939 |     int i = 0;                                                          \
 940 |     for (i = 1; i < argc; i++) {                                        \
 941 |         if (argv[i][0] == '-') {                                        \
 942 |             char f = argv[i][1];  /* only the first flag char is used */\
 943 |             if ((f == 's' || f == 't' || f == 'x') && argc <= i + 1) {  \
 944 |                 greatest_usage(argv[0]); exit(EXIT_FAILURE);            \
 945 |             }                                                           \
 946 |             switch (f) {                                                \
 947 |             case 's': /* suite name filter */                           \
 948 |                 greatest_set_suite_filter(argv[i + 1]); i++; break;     \
 949 |             case 't': /* test name filter */                            \
 950 |                 greatest_set_test_filter(argv[i + 1]); i++; break;      \
 951 |             case 'x': /* test name exclusion */                         \
 952 |                 greatest_set_test_exclude(argv[i + 1]); i++; break;     \
 953 |             case 'e': /* exact name match */                            \
 954 |                 greatest_set_exact_name_match(); break;                 \
 955 |             case 'f': /* first fail flag */                             \
 956 |                 greatest_stop_at_first_fail(); break;                   \
 957 |             case 'a': /* abort() on fail flag */                        \
 958 |                 greatest_abort_on_fail(); break;                        \
 959 |             case 'l': /* list only (dry run) */                         \
 960 |                 greatest_list_only(); break;                            \
 961 |             case 'v': /* increase verbosity */                          \
 962 |                 greatest_info.verbosity++; break;                       \
 963 |             case 'h': /* help */                                        \
 964 |                 greatest_usage(argv[0]); exit(EXIT_SUCCESS);            \
 965 |             default:                                                    \
 966 |             case '-':                                                   \
 967 |                 if (0 == strncmp("--help", argv[i], 6)) {               \
 968 |                     greatest_usage(argv[0]); exit(EXIT_SUCCESS);        \
 969 |                 } else if (0 == strcmp("--", argv[i])) {                \
 970 |                     return; /* ignore following arguments */            \
 971 |                 }                                                       \
 972 |                 GREATEST_FPRINTF(GREATEST_STDOUT,                       \
 973 |                     "Unknown argument '%s'\n", argv[i]);                \
 974 |                 greatest_usage(argv[0]);                                \
 975 |                 exit(EXIT_FAILURE);                                     \
 976 |             }                                                           \
 977 |         }                                                               \
 978 |     }                                                                   \
 979 | }                                                                       \
 980 | /* Return 1 if the run-level failure count is zero, else 0. */          \
 981 | int greatest_all_passed(void) { return (greatest_info.failed == 0); }   \
 982 | /* Only run tests whose names match FILTER (pointer is not copied). */  \
 983 | void greatest_set_test_filter(const char *filter) {                     \
 984 |     greatest_info.test_filter = filter;                                 \
 985 | }                                                                       \
 986 | /* Skip tests whose names match FILTER (pointer is not copied). */      \
 987 | void greatest_set_test_exclude(const char *filter) {                    \
 988 |     greatest_info.test_exclude = filter;                                \
 989 | }                                                                       \
 990 | /* Only run suites whose names match FILTER (pointer is not copied). */ \
 991 | void greatest_set_suite_filter(const char *filter) {                    \
 992 |     greatest_info.suite_filter = filter;                                \
 993 | }                                                                       \
 994 | /* Require filters to match whole names, not just substrings. */        \
 995 | void greatest_set_exact_name_match(void) {                              \
 996 |     greatest_info.exact_name_match = 1;                                 \
 997 | }                                                                       \
 998 | /* Set the GREATEST_FLAG_FIRST_FAIL runner flag. */                     \
 999 | void greatest_stop_at_first_fail(void) {                                \
1000 |     greatest_set_flag(GREATEST_FLAG_FIRST_FAIL);                        \
1001 | }                                                                       \
1002 | /* Set the GREATEST_FLAG_ABORT_ON_FAIL runner flag. */                  \
1003 | void greatest_abort_on_fail(void) {                                     \
1004 |     greatest_set_flag(GREATEST_FLAG_ABORT_ON_FAIL);                     \
1005 | }                                                                       \
1006 | /* Set the GREATEST_FLAG_LIST_ONLY runner flag. */                      \
1007 | void greatest_list_only(void) {                                         \
1008 |     greatest_set_flag(GREATEST_FLAG_LIST_ONLY);                         \
1009 | }                                                                       \
1010 | /* Copy the run totals into *REPORT; a NULL REPORT is ignored. */       \
1011 | void greatest_get_report(struct greatest_report_t *report) {            \
1012 |     if (report) {                                                       \
1013 |         report->passed = greatest_info.passed;                          \
1014 |         report->failed = greatest_info.failed;                          \
1015 |         report->skipped = greatest_info.skipped;                        \
1016 |         report->assertions = greatest_info.assertions;                  \
1017 |     }                                                                   \
1018 | }                                                                       \
1019 | /* Return the current verbosity level. */                               \
1020 | unsigned int greatest_get_verbosity(void) {                             \
1021 |     return greatest_info.verbosity;                                     \
1022 | }                                                                       \
1023 | /* Set the verbosity level (truncated to unsigned char). */             \
1024 | void greatest_set_verbosity(unsigned int verbosity) {                   \
1025 |     greatest_info.verbosity = (unsigned char)verbosity;                 \
1026 | }                                                                       \
1027 | /* OR FLAG into the runner's flag bits. */                              \
1028 | void greatest_set_flag(greatest_flag_t flag) {                          \
1029 |     greatest_info.flags = (unsigned char)(greatest_info.flags | flag);  \
1030 | }                                                                       \
1031 |                                                                         \
1032 | void greatest_set_test_suffix(const char *suffix) {                     \
1033 |     greatest_info.name_suffix = suffix;                                 \
1034 | }                                                                       \
1035 |                                                                         \
1036 | void GREATEST_SET_SETUP_CB(greatest_setup_cb *cb, void *udata) {        \
1037 |     greatest_info.setup = cb;                                           \
1038 |     greatest_info.setup_udata = udata;                                  \
1039 | }                                                                       \
1040 |                                                                         \
1041 | void GREATEST_SET_TEARDOWN_CB(greatest_teardown_cb *cb, void *udata) {  \
1042 |     greatest_info.teardown = cb;                                        \
1043 |     greatest_info.teardown_udata = udata;                               \
1044 | }                                                                       \
1045 |                                                                         \
1046 | static int greatest_string_equal_cb(const void *expd, const void *got,  \
1047 |     void *udata) {                                                      \
1048 |     size_t *size = (size_t *)udata;                                     \
1049 |     return (size != NULL                                                \
1050 |         ? (0 == strncmp((const char *)expd, (const char *)got, *size))  \
1051 |         : (0 == strcmp((const char *)expd, (const char *)got)));        \
1052 | }                                                                       \
1053 |                                                                         \
1054 | static int greatest_string_printf_cb(const void *t, void *udata) {      \
1055 |     (void)udata; /* note: does not check \0 termination. */             \
1056 |     return GREATEST_FPRINTF(GREATEST_STDOUT, "%s", (const char *)t);    \
1057 | }                                                                       \
1058 |                                                                         \
1059 | greatest_type_info greatest_type_info_string = {                        \
1060 |     greatest_string_equal_cb, greatest_string_printf_cb,                \
1061 | };                                                                      \
1062 |                                                                         \
1063 | static int greatest_memory_equal_cb(const void *expd, const void *got,  \
1064 |     void *udata) {                                                      \
1065 |     greatest_memory_cmp_env *env = (greatest_memory_cmp_env *)udata;    \
1066 |     return (0 == memcmp(expd, got, env->size));                         \
1067 | }                                                                       \
1068 |                                                                         \
1069 | /* Hexdump raw memory, with differences highlighted */                  \
1070 | static int greatest_memory_printf_cb(const void *t, void *udata) {      \
1071 |     greatest_memory_cmp_env *env = (greatest_memory_cmp_env *)udata;    \
1072 |     const unsigned char *buf = (const unsigned char *)t;                \
1073 |     unsigned char diff_mark = ' ';                                      \
1074 |     FILE *out = GREATEST_STDOUT;                                        \
1075 |     size_t i, line_i, line_len = 0;                                     \
1076 |     int len = 0;   /* format hexdump with differences highlighted */    \
1077 |     for (i = 0; i < env->size; i+= line_len) {                          \
1078 |         diff_mark = ' ';                                                \
1079 |         line_len = env->size - i;                                       \
1080 |         if (line_len > 16) { line_len = 16; }                           \
1081 |         for (line_i = i; line_i < i + line_len; line_i++) {             \
1082 |             if (env->exp[line_i] != env->got[line_i]) diff_mark = 'X';  \
1083 |         }                                                               \
1084 |         len += GREATEST_FPRINTF(out, "\n%04x %c ",                      \
1085 |             (unsigned int)i, diff_mark);                                \
1086 |         for (line_i = i; line_i < i + line_len; line_i++) {             \
1087 |             int m = env->exp[line_i] == env->got[line_i]; /* match? */  \
1088 |             len += GREATEST_FPRINTF(out, "%02x%c",                      \
1089 |                 buf[line_i], m ? ' ' : '<');                            \
1090 |         }                                                               \
1091 |         for (line_i = 0; line_i < 16 - line_len; line_i++) {            \
1092 |             len += GREATEST_FPRINTF(out, "   ");                        \
1093 |         }                                                               \
1094 |         GREATEST_FPRINTF(out, " ");                                     \
1095 |         for (line_i = i; line_i < i + line_len; line_i++) {             \
1096 |             unsigned char c = buf[line_i];                              \
1097 |             len += GREATEST_FPRINTF(out, "%c", isprint(c) ? c : '.');   \
1098 |         }                                                               \
1099 |     }                                                                   \
1100 |     len += GREATEST_FPRINTF(out, "\n");                                 \
1101 |     return len;                                                         \
1102 | }                                                                       \
1103 |                                                                         \
1104 | void greatest_prng_init_first_pass(int id) {                            \
1105 |     greatest_info.prng[id].random_order = 1;                            \
1106 |     greatest_info.prng[id].count_run = 0;                               \
1107 | }                                                                       \
1108 |                                                                         \
1109 | int greatest_prng_init_second_pass(int id, unsigned long seed) {        \
1110 |     struct greatest_prng *p = &greatest_info.prng[id];                  \
1111 |     if (p->count == 0) { return 0; }                                    \
1112 |     p->count_ceil = p->count;                                           \
1113 |     for (p->m = 1; p->m < p->count; p->m <<= 1) {}                      \
1114 |     p->state = seed & 0x1fffffff;     /* only use lower 29 bits */      \
1115 |     p->a = 4LU * p->state;            /* to avoid overflow when */      \
1116 |     p->a = (p->a ? p->a : 4) | 1;            /* multiplied by 4 */      \
1117 |     p->c = 2147483647;        /* and so p->c ((2 ** 31) - 1) is */      \
1118 |     p->initialized = 1;     /* always relatively prime to p->a. */      \
1119 |     fprintf(stderr, "init_second_pass: a %lu, c %lu, state %lu\n",      \
1120 |         p->a, p->c, p->state);                                          \
1121 |     return 1;                                                           \
1122 | }                                                                       \
1123 |                                                                         \
1124 | /* Step the pseudorandom number generator until its state reaches       \
1125 |  * another test ID between 0 and the test count.                        \
1126 |  * This uses a linear congruential pseudorandom number generator,       \
1127 |  * with the power-of-two ceiling of the test count as the modulus, the  \
1128 |  * masked seed as the multiplier, and a prime as the increment. For     \
1129 |  * each generated value < the test count, run the corresponding test.   \
1130 |  * This will visit all IDs 0 <= X < mod once before repeating,          \
1131 |  * with a starting position chosen based on the initial seed.           \
1132 |  * For details, see: Knuth, The Art of Computer Programming             \
1133 |  * Volume 2, section 3.2.1. */                                          \
1134 | void greatest_prng_step(int id) {                                       \
1135 |     struct greatest_prng *p = &greatest_info.prng[id];                  \
1136 |     do {                                                                \
1137 |         p->state = ((p->a * p->state) + p->c) & (p->m - 1);             \
1138 |     } while (p->state >= p->count_ceil);                                \
1139 | }                                                                       \
1140 |                                                                         \
1141 | void GREATEST_INIT(void) {                                              \
1142 |     /* Suppress unused function warning if features aren't used */      \
1143 |     (void)greatest_run_suite;                                           \
1144 |     (void)greatest_parse_options;                                       \
1145 |     (void)greatest_prng_step;                                           \
1146 |     (void)greatest_prng_init_first_pass;                                \
1147 |     (void)greatest_prng_init_second_pass;                               \
1148 |     (void)greatest_set_test_suffix;                                     \
1149 |                                                                         \
1150 |     memset(&greatest_info, 0, sizeof(greatest_info));                   \
1151 |     greatest_info.width = GREATEST_DEFAULT_WIDTH;                       \
1152 |     GREATEST_SET_TIME(greatest_info.begin);                             \
1153 | }                                                                       \
1154 |                                                                         \
1155 | /* Report passes, failures, skipped tests, the number of                \
1156 |  * assertions, and the overall run time. */                             \
1157 | void GREATEST_PRINT_REPORT(void) {                                      \
1158 |     if (!GREATEST_LIST_ONLY()) {                                        \
1159 |         update_counts_and_reset_suite();                                \
1160 |         GREATEST_SET_TIME(greatest_info.end);                           \
1161 |         GREATEST_FPRINTF(GREATEST_STDOUT,                               \
1162 |             "\nTotal: %u test%s",                                       \
1163 |             greatest_info.tests_run,                                    \
1164 |             greatest_info.tests_run == 1 ? "" : "s");                   \
1165 |         GREATEST_CLOCK_DIFF(greatest_info.begin,                        \
1166 |             greatest_info.end);                                         \
1167 |         GREATEST_FPRINTF(GREATEST_STDOUT, ", %u assertion%s\n",         \
1168 |             greatest_info.assertions,                                   \
1169 |             greatest_info.assertions == 1 ? "" : "s");                  \
1170 |         GREATEST_FPRINTF(GREATEST_STDOUT,                               \
1171 |             "Pass: %u, fail: %u, skip: %u.\n",                          \
1172 |             greatest_info.passed,                                       \
1173 |             greatest_info.failed, greatest_info.skipped);               \
1174 |     }                                                                   \
1175 | }                                                                       \
1176 |                                                                         \
1177 | greatest_type_info greatest_type_info_memory = {                        \
1178 |     greatest_memory_equal_cb, greatest_memory_printf_cb,                \
1179 | };                                                                      \
1180 |                                                                         \
1181 | greatest_run_info greatest_info
1182 | 
1183 | /* Initialize the runner via GREATEST_INIT(), then parse the command line; uses argc and argv from the enclosing scope. */
1184 | #define GREATEST_MAIN_BEGIN()                                           \
1185 |     do {                                                                \
1186 |         GREATEST_INIT();                                                \
1187 |         greatest_parse_options(argc, argv);                             \
1188 |     } while (0)
1189 | 
1190 | /* Print the report, then return from the enclosing function with EXIT_SUCCESS if greatest_all_passed(), else EXIT_FAILURE. */
1191 | #define GREATEST_MAIN_END()                                             \
1192 |     do {                                                                \
1193 |         GREATEST_PRINT_REPORT();                                        \
1194 |         return (greatest_all_passed() ? EXIT_SUCCESS : EXIT_FAILURE);   \
1195 |     } while (0)
1196 | 
1197 | /* Make abbreviations without the GREATEST_ prefix for the
1198 |  * most commonly used symbols. */
1199 | #if GREATEST_USE_ABBREVS
1200 | #define TEST           GREATEST_TEST
1201 | #define SUITE          GREATEST_SUITE
1202 | #define SUITE_EXTERN   GREATEST_SUITE_EXTERN
1203 | #define RUN_TEST       GREATEST_RUN_TEST
1204 | #define RUN_TEST1      GREATEST_RUN_TEST1
1205 | #define RUN_SUITE      GREATEST_RUN_SUITE
1206 | #define IGNORE_TEST    GREATEST_IGNORE_TEST
1207 | #define ASSERT         GREATEST_ASSERT
1208 | #define ASSERTm        GREATEST_ASSERTm
1209 | #define ASSERT_FALSE   GREATEST_ASSERT_FALSE
1210 | #define ASSERT_EQ      GREATEST_ASSERT_EQ
1211 | #define ASSERT_NEQ     GREATEST_ASSERT_NEQ
1212 | #define ASSERT_GT      GREATEST_ASSERT_GT
1213 | #define ASSERT_GTE     GREATEST_ASSERT_GTE
1214 | #define ASSERT_LT      GREATEST_ASSERT_LT
1215 | #define ASSERT_LTE     GREATEST_ASSERT_LTE
1216 | #define ASSERT_EQ_FMT  GREATEST_ASSERT_EQ_FMT
1217 | #define ASSERT_IN_RANGE GREATEST_ASSERT_IN_RANGE
1218 | #define ASSERT_EQUAL_T GREATEST_ASSERT_EQUAL_T
1219 | #define ASSERT_STR_EQ  GREATEST_ASSERT_STR_EQ
1220 | #define ASSERT_STRN_EQ GREATEST_ASSERT_STRN_EQ
1221 | #define ASSERT_MEM_EQ  GREATEST_ASSERT_MEM_EQ
1222 | #define ASSERT_ENUM_EQ GREATEST_ASSERT_ENUM_EQ
1223 | #define ASSERT_FALSEm  GREATEST_ASSERT_FALSEm
1224 | #define ASSERT_EQm     GREATEST_ASSERT_EQm
1225 | #define ASSERT_NEQm    GREATEST_ASSERT_NEQm
1226 | #define ASSERT_GTm     GREATEST_ASSERT_GTm
1227 | #define ASSERT_GTEm    GREATEST_ASSERT_GTEm
1228 | #define ASSERT_LTm     GREATEST_ASSERT_LTm
1229 | #define ASSERT_LTEm    GREATEST_ASSERT_LTEm
1230 | #define ASSERT_EQ_FMTm GREATEST_ASSERT_EQ_FMTm
1231 | #define ASSERT_IN_RANGEm GREATEST_ASSERT_IN_RANGEm
1232 | #define ASSERT_EQUAL_Tm GREATEST_ASSERT_EQUAL_Tm
1233 | #define ASSERT_STR_EQm GREATEST_ASSERT_STR_EQm
1234 | #define ASSERT_STRN_EQm GREATEST_ASSERT_STRN_EQm
1235 | #define ASSERT_MEM_EQm GREATEST_ASSERT_MEM_EQm
1236 | #define ASSERT_ENUM_EQm GREATEST_ASSERT_ENUM_EQm
1237 | #define PASS           GREATEST_PASS
1238 | #define FAIL           GREATEST_FAIL
1239 | #define SKIP           GREATEST_SKIP
1240 | #define PASSm          GREATEST_PASSm
1241 | #define FAILm          GREATEST_FAILm
1242 | #define SKIPm          GREATEST_SKIPm
1243 | #define SET_SETUP      GREATEST_SET_SETUP_CB
1244 | #define SET_TEARDOWN   GREATEST_SET_TEARDOWN_CB
1245 | #define CHECK_CALL     GREATEST_CHECK_CALL
1246 | #define SHUFFLE_TESTS  GREATEST_SHUFFLE_TESTS
1247 | #define SHUFFLE_SUITES GREATEST_SHUFFLE_SUITES
1248 | 
1249 | #ifdef GREATEST_VA_ARGS
1250 | #define RUN_TESTp      GREATEST_RUN_TESTp
1251 | #endif
1252 | 
1253 | #if GREATEST_USE_LONGJMP
1254 | #define ASSERT_OR_LONGJMP  GREATEST_ASSERT_OR_LONGJMP
1255 | #define ASSERT_OR_LONGJMPm GREATEST_ASSERT_OR_LONGJMPm
1256 | #define FAIL_WITH_LONGJMP  GREATEST_FAIL_WITH_LONGJMP
1257 | #define FAIL_WITH_LONGJMPm GREATEST_FAIL_WITH_LONGJMPm
1258 | #endif
1259 | 
1260 | #endif /* USE_ABBREVS */
1261 | 
1262 | #if defined(__cplusplus) && !defined(GREATEST_NO_EXTERN_CPLUSPLUS)
1263 | }
1264 | #endif
1265 | 
1266 | #endif
1267 | 


--------------------------------------------------------------------------------
/tests/math_intrinsics.c:
--------------------------------------------------------------------------------
1 | #define __MATH__INTRINSICS__IMPLEMENTATION__
2 | #include "../math_intrinsics.h"
3 | 
4 | 


--------------------------------------------------------------------------------
/tests/sokol_time.h:
--------------------------------------------------------------------------------
  1 | #if defined(SOKOL_IMPL) && !defined(SOKOL_TIME_IMPL)
  2 | #define SOKOL_TIME_IMPL
  3 | #endif
  4 | #ifndef SOKOL_TIME_INCLUDED
  5 | /*
  6 |     sokol_time.h    -- simple cross-platform time measurement
  7 | 
  8 |     Project URL: https://github.com/floooh/sokol
  9 | 
 10 |     Do this:
 11 |         #define SOKOL_IMPL or
 12 |         #define SOKOL_TIME_IMPL
 13 |     before you include this file in *one* C or C++ file to create the
 14 |     implementation.
 15 | 
 16 |     Optionally provide the following defines with your own implementations:
 17 |     SOKOL_ASSERT(c)     - your own assert macro (default: assert(c))
 18 |     SOKOL_TIME_API_DECL - public function declaration prefix (default: extern)
 19 |     SOKOL_API_DECL      - same as SOKOL_TIME_API_DECL
 20 |     SOKOL_API_IMPL      - public function implementation prefix (default: -)
 21 | 
 22 |     If sokol_time.h is compiled as a DLL, define the following before
 23 |     including the declaration or implementation:
 24 | 
 25 |     SOKOL_DLL
 26 | 
 27 |     On Windows, SOKOL_DLL will define SOKOL_TIME_API_DECL as __declspec(dllexport)
 28 |     or __declspec(dllimport) as needed.
 29 | 
 30 |     void stm_setup();
 31 |         Call once before any other functions to initialize sokol_time
 32 |         (this calls for instance QueryPerformanceFrequency on Windows)
 33 | 
 34 |     uint64_t stm_now();
 35 |         Get current point in time in unspecified 'ticks'. The value that
 36 |         is returned has no relation to the 'wall-clock' time and is
 37 |         not in a specific time unit, it is only useful to compute
 38 |         time differences.
 39 | 
 40 |     uint64_t stm_diff(uint64_t new, uint64_t old);
 41 |         Computes the time difference between new and old. This will always
 42 |         return a positive, non-zero value.
 43 | 
 44 |     uint64_t stm_since(uint64_t start);
 45 |         Takes the current time, and returns the elapsed time since start
 46 |         (this is a shortcut for "stm_diff(stm_now(), start)")
 47 | 
 48 |     uint64_t stm_laptime(uint64_t* last_time);
 49 |         This is useful for measuring frame time and other recurring
 50 |         events. It takes the current time, returns the time difference
 51 |         to the value in last_time, and stores the current time in
 52 |         last_time for the next call. If the value in last_time is 0,
 53 |         the return value will be zero (this usually happens on the
 54 |         very first call).
 55 | 
 56 |     uint64_t stm_round_to_common_refresh_rate(uint64_t duration);
 57 |         This oddly named function takes a measured frame time and
 58 |         returns the closest "nearby" common display refresh rate frame duration
 59 |         in ticks. If the input duration isn't close to any common display
 60 |         refresh rate, the input duration will be returned unchanged as a fallback.
 61 |         The main purpose of this function is to remove jitter/inaccuracies from
 62 |         measured frame times, and instead use the display refresh rate as
 63 |         frame duration.
 64 |         NOTE: for more robust frame timing, consider using the
 65 |         sokol_app.h function sapp_frame_duration()
 66 | 
 67 |     Use the following functions to convert a duration in ticks into
 68 |     useful time units:
 69 | 
 70 |     double stm_sec(uint64_t ticks);
 71 |     double stm_ms(uint64_t ticks);
 72 |     double stm_us(uint64_t ticks);
 73 |     double stm_ns(uint64_t ticks);
 74 |         Converts a tick value into seconds, milliseconds, microseconds
 75 |         or nanoseconds. Note that not all platforms will have nanosecond
 76 |         or even microsecond precision.
 77 | 
 78 |     Uses the following time measurement functions under the hood:
 79 | 
 80 |     Windows:        QueryPerformanceFrequency() / QueryPerformanceCounter()
 81 |     MacOS/iOS:      mach_absolute_time()
 82 |     emscripten:     emscripten_get_now()
 83 |     Linux+others:   clock_gettime(CLOCK_MONOTONIC)
 84 | 
 85 |     zlib/libpng license
 86 | 
 87 |     Copyright (c) 2018 Andre Weissflog
 88 | 
 89 |     This software is provided 'as-is', without any express or implied warranty.
 90 |     In no event will the authors be held liable for any damages arising from the
 91 |     use of this software.
 92 | 
 93 |     Permission is granted to anyone to use this software for any purpose,
 94 |     including commercial applications, and to alter it and redistribute it
 95 |     freely, subject to the following restrictions:
 96 | 
 97 |         1. The origin of this software must not be misrepresented; you must not
 98 |         claim that you wrote the original software. If you use this software in a
 99 |         product, an acknowledgment in the product documentation would be
100 |         appreciated but is not required.
101 | 
102 |         2. Altered source versions must be plainly marked as such, and must not
103 |         be misrepresented as being the original software.
104 | 
105 |         3. This notice may not be removed or altered from any source
106 |         distribution.
107 | */
108 | #define SOKOL_TIME_INCLUDED (1)
109 | #include <stdint.h>
110 | 
111 | #if defined(SOKOL_API_DECL) && !defined(SOKOL_TIME_API_DECL)
112 | #define SOKOL_TIME_API_DECL SOKOL_API_DECL
113 | #endif
114 | #ifndef SOKOL_TIME_API_DECL
115 | #if defined(_WIN32) && defined(SOKOL_DLL) && defined(SOKOL_TIME_IMPL)
116 | #define SOKOL_TIME_API_DECL __declspec(dllexport)
117 | #elif defined(_WIN32) && defined(SOKOL_DLL)
118 | #define SOKOL_TIME_API_DECL __declspec(dllimport)
119 | #else
120 | #define SOKOL_TIME_API_DECL extern
121 | #endif
122 | #endif
123 | 
124 | #ifdef __cplusplus
125 | extern "C" {
126 | #endif
127 | 
128 | SOKOL_TIME_API_DECL void stm_setup(void);
129 | SOKOL_TIME_API_DECL uint64_t stm_now(void);
130 | SOKOL_TIME_API_DECL uint64_t stm_diff(uint64_t new_ticks, uint64_t old_ticks);
131 | SOKOL_TIME_API_DECL uint64_t stm_since(uint64_t start_ticks);
132 | SOKOL_TIME_API_DECL uint64_t stm_laptime(uint64_t* last_time);
133 | SOKOL_TIME_API_DECL uint64_t stm_round_to_common_refresh_rate(uint64_t frame_ticks);
134 | SOKOL_TIME_API_DECL double stm_sec(uint64_t ticks);
135 | SOKOL_TIME_API_DECL double stm_ms(uint64_t ticks);
136 | SOKOL_TIME_API_DECL double stm_us(uint64_t ticks);
137 | SOKOL_TIME_API_DECL double stm_ns(uint64_t ticks);
138 | 
139 | #ifdef __cplusplus
140 | } /* extern "C" */
141 | #endif
142 | #endif // SOKOL_TIME_INCLUDED
143 | 
144 | /*-- IMPLEMENTATION ----------------------------------------------------------*/
145 | #ifdef SOKOL_TIME_IMPL
146 | #define SOKOL_TIME_IMPL_INCLUDED (1)
147 | #include <string.h> /* memset */
148 | 
149 | #ifndef SOKOL_API_IMPL
150 |     #define SOKOL_API_IMPL
151 | #endif
152 | #ifndef SOKOL_ASSERT
153 |     #include <assert.h>
154 |     #define SOKOL_ASSERT(c) assert(c)
155 | #endif
156 | #ifndef _SOKOL_PRIVATE
157 |     #if defined(__GNUC__) || defined(__clang__)
158 |         #define _SOKOL_PRIVATE __attribute__((unused)) static
159 |     #else
160 |         #define _SOKOL_PRIVATE static
161 |     #endif
162 | #endif
163 | 
164 | #if defined(_WIN32)
165 | #ifndef WIN32_LEAN_AND_MEAN
166 | #define WIN32_LEAN_AND_MEAN
167 | #endif
168 | #include <windows.h>
169 | typedef struct {
170 |     uint32_t initialized;
171 |     LARGE_INTEGER freq;
172 |     LARGE_INTEGER start;
173 | } _stm_state_t;
174 | #elif defined(__APPLE__) && defined(__MACH__)
175 | #include <mach/mach_time.h>
176 | typedef struct {
177 |     uint32_t initialized;
178 |     mach_timebase_info_data_t timebase;
179 |     uint64_t start;
180 | } _stm_state_t;
181 | #elif defined(__EMSCRIPTEN__)
182 | #include <emscripten/emscripten.h>
183 | typedef struct {
184 |     uint32_t initialized;
185 |     double start;
186 | } _stm_state_t;
187 | #else /* anything else, this will need more care for non-Linux platforms */
188 | #ifdef ESP8266
189 | // On the ESP8266, clock_gettime ignores the first argument and CLOCK_MONOTONIC isn't defined
190 | #define CLOCK_MONOTONIC 0
191 | #endif
192 | #include <time.h>
193 | typedef struct {
194 |     uint32_t initialized;
195 |     uint64_t start;
196 | } _stm_state_t;
197 | #endif
198 | static _stm_state_t _stm;
199 | 
200 | /* scales value by numer/denom; the quotient and the remainder are scaled
201 |     separately to prevent 64-bit overflow when computing relative timestamps,
202 |     see https://gist.github.com/jspohr/3dc4f00033d79ec5bdaf67bc46c813e3 */
203 | #if defined(_WIN32) || (defined(__APPLE__) && defined(__MACH__))
204 | _SOKOL_PRIVATE int64_t _stm_int64_muldiv(int64_t value, int64_t numer, int64_t denom) {
205 |     const int64_t whole = value / denom;
206 |     const int64_t rest = value % denom;
207 |     return (whole * numer) + ((rest * numer) / denom);
208 | }
209 | #endif
210 | 
211 | SOKOL_API_IMPL void stm_setup(void) { /* resets the timer state and captures the start timestamp */
212 |     memset(&_stm, 0, sizeof(_stm));
213 |     _stm.initialized = 0xABCDABCD; /* marker value checked by the assert in stm_now() */
214 |     #if defined(_WIN32)
215 |         QueryPerformanceFrequency(&_stm.freq); /* used as the divisor when stm_now() scales counter deltas */
216 |         QueryPerformanceCounter(&_stm.start);
217 |     #elif defined(__APPLE__) && defined(__MACH__)
218 |         mach_timebase_info(&_stm.timebase); /* numer/denom pair applied to tick deltas in stm_now() */
219 |         _stm.start = mach_absolute_time();
220 |     #elif defined(__EMSCRIPTEN__)
221 |         _stm.start = emscripten_get_now();
222 |     #else
223 |         struct timespec ts;
224 |         clock_gettime(CLOCK_MONOTONIC, &ts);
225 |         _stm.start = (uint64_t)ts.tv_sec*1000000000 + (uint64_t)ts.tv_nsec; /* start time in nanoseconds */
226 |     #endif
227 | }
228 | 
229 | SOKOL_API_IMPL uint64_t stm_now(void) { /* returns the time elapsed since the start captured by stm_setup() */
230 |     SOKOL_ASSERT(_stm.initialized == 0xABCDABCD); /* trips if stm_setup() has not run */
231 |     uint64_t now;
232 |     #if defined(_WIN32)
233 |         LARGE_INTEGER qpc_t;
234 |         QueryPerformanceCounter(&qpc_t); /* the delta to the start counter is scaled below by 1000000000 / freq */
235 |         now = (uint64_t) _stm_int64_muldiv(qpc_t.QuadPart - _stm.start.QuadPart, 1000000000, _stm.freq.QuadPart);
236 |     #elif defined(__APPLE__) && defined(__MACH__)
237 |         const uint64_t mach_now = mach_absolute_time() - _stm.start; /* raw tick delta, scaled below by timebase numer / denom */
238 |         now = (uint64_t) _stm_int64_muldiv((int64_t)mach_now, (int64_t)_stm.timebase.numer, (int64_t)_stm.timebase.denom);
239 |     #elif defined(__EMSCRIPTEN__)
240 |         double js_now = emscripten_get_now() - _stm.start; /* NOTE(review): presumably milliseconds, so the scale below yields ns - confirm */
241 |         now = (uint64_t) (js_now * 1000000.0);
242 |     #else
243 |         struct timespec ts;
244 |         clock_gettime(CLOCK_MONOTONIC, &ts);
245 |         now = ((uint64_t)ts.tv_sec*1000000000 + (uint64_t)ts.tv_nsec) - _stm.start; /* nanoseconds since the start timestamp */
246 |     #endif
247 |     return now;
248 | }
249 | 
250 | SOKOL_API_IMPL uint64_t stm_diff(uint64_t new_ticks, uint64_t old_ticks) {
251 |     // the result is never zero: anything that is not a strictly positive
252 |     // difference is reported as a single tick
253 |     if (new_ticks <= old_ticks) {
254 |         return 1;
255 |     }
256 |     return new_ticks - old_ticks;
257 | }
258 | 
259 | SOKOL_API_IMPL uint64_t stm_since(uint64_t start_ticks) {
260 |     return stm_diff(stm_now(), start_ticks); // elapsed ticks from start_ticks up to the current stm_now() sample
261 | }
262 | 
263 | SOKOL_API_IMPL uint64_t stm_laptime(uint64_t* last_time) {
264 |     SOKOL_ASSERT(last_time);
265 |     const uint64_t current = stm_now();
266 |     const uint64_t previous = *last_time;
267 |     // the current sample always replaces the stored one, ready for the next call
268 |     *last_time = current;
269 |     // a stored value of zero means there is no earlier lap to measure
270 |     // against, in which case the reported lap time is zero as well
271 |     return (0 == previous) ? 0 : stm_diff(current, previous);
272 | }
273 | 
274 | // each row is { frame duration in ns, tolerance in ns }; the resulting
275 | // duration - tolerance .. duration + tolerance ranges must not overlap!
276 | static const uint64_t _stm_refresh_rates[][2] = {
277 |     { 16666667, 1000000 },  //  60 Hz: 16.6667 +- 1ms
278 |     { 13888889,  250000 },  //  72 Hz: 13.8889 +- 0.25ms
279 |     { 13333333,  250000 },  //  75 Hz: 13.3333 +- 0.25ms
280 |     { 11764706,  250000 },  //  85 Hz: 11.7647 +- 0.25ms
281 |     { 11111111,  250000 },  //  90 Hz: 11.1111 +- 0.25ms
282 |     { 10000000,  500000 },  // 100 Hz: 10.0000 +- 0.5ms
283 |     {  8333333,  500000 },  // 120 Hz:  8.3333 +- 0.5ms
284 |     {  6944445,  500000 },  // 144 Hz:  6.9445 +- 0.5ms
285 |     {  4166667, 1000000 },  // 240 Hz:  4.1667 +- 1ms
286 |     {        0,       0 },  // zero terminator, must stay the last element (ends the lookup loop)
287 | };
288 | 
289 | SOKOL_API_IMPL uint64_t stm_round_to_common_refresh_rate(uint64_t ticks) {
290 |     // scan the table up to its zero terminator for a bucket containing ticks
291 |     for (int idx = 0; 0 != _stm_refresh_rates[idx][0]; idx++) {
292 |         const uint64_t duration = _stm_refresh_rates[idx][0];
293 |         const uint64_t tolerance = _stm_refresh_rates[idx][1];
294 |         const uint64_t lower = duration - tolerance;
295 |         const uint64_t upper = duration + tolerance;
296 |         if ((lower < ticks) && (ticks < upper)) {
297 |             return duration;
298 |         }
299 |     }
300 |     return ticks;   // nothing matched: hand the input back unchanged
301 | }
302 | 
303 | SOKOL_API_IMPL double stm_sec(uint64_t ticks) {
304 |     return ((double) ticks) / 1e9;   // ticks are nanoseconds: 1e9 per second
305 | }
306 | 
307 | SOKOL_API_IMPL double stm_ms(uint64_t ticks) {
308 |     return ((double) ticks) / 1e6;   // ticks are nanoseconds: 1e6 per millisecond
309 | }
310 | 
311 | SOKOL_API_IMPL double stm_us(uint64_t ticks) {
312 |     return ((double) ticks) / 1e3;   // ticks are nanoseconds: 1e3 per microsecond
313 | }
314 | 
315 | SOKOL_API_IMPL double stm_ns(uint64_t ticks) {
316 |     return (double)ticks;   // no scaling: the tick value is returned as nanoseconds unchanged
317 | }
318 | #endif /* SOKOL_TIME_IMPL */
319 | 
320 | 


--------------------------------------------------------------------------------
/tests/test.c:
--------------------------------------------------------------------------------
  1 | #ifdef _MSC_VER
  2 | #define _CRT_SECURE_NO_WARNINGS
  3 | #endif
  4 | 
  5 | #define _USE_MATH_DEFINES
  6 | 
  7 | #include <stdio.h>
  8 | #include <stdlib.h>
  9 | #include <stdint.h>
 10 | #include <math.h>
 11 | #include <float.h>
 12 | #include <stdbool.h>
 13 | #include "greatest.h"
 14 | 
 15 | #include "../math_intrinsics.h"
 16 | 
 17 | //----------------------------------------------------------------------------------------------------------------------
 18 | // functions pointer definition
 19 | typedef float (*reference_function)(float);
 20 | typedef float (*reference_function2)(float, float);
 21 | #ifdef __MATH__INTRINSICS__AVX__
 22 |     typedef __m256 (*approximation_function)(__m256);
 23 |     typedef __m256 (*approximation_function2)(__m256, __m256);
 24 |     #define simd_vector_width (8)
 25 | #else
 26 |     typedef float32x4_t (*approximation_function)(float32x4_t);
 27 |     typedef float32x4_t (*approximation_function2)(float32x4_t, float32x4_t);
 28 |     #define simd_vector_width (4)
 29 | #endif
 30 | 
 31 | //----------------------------------------------------------------------------------------------------------------------
 32 | // generic unit test
 33 | TEST generic_test(reference_function ref, approximation_function approx, float range_min, float range_max, float epsilon, uint32_t num_elements, bool relative_error, const char* name)
 34 | {
 35 |     float* input = (float*) malloc(num_elements * sizeof(float));
 36 |     float* result = (float*) malloc(num_elements * sizeof(float));
 37 |     float step = ((range_max - range_min) / (float) (num_elements-1));
 38 |     uint32_t num_vectors = num_elements / simd_vector_width;
 39 | 
 40 |     for(uint32_t i=0; i<num_elements; ++i)
 41 |     {
 42 |         input[i] = (step * (float)(i)) + range_min;
 43 |         result[i] = ref(input[i]);
 44 |     }
 45 | 
 46 | #ifdef __MATH__INTRINSICS__AVX__
 47 |     __m256 v_epsilon = _mm256_set1_ps(epsilon);
 48 |     __m256 v_max_error = _mm256_setzero_ps();
 49 | 
 50 |     for(uint32_t i=0; i<num_vectors; ++i)
 51 |     {
 52 |         __m256 v_input = _mm256_loadu_ps(input + i * simd_vector_width);
 53 |         __m256 v_result = _mm256_loadu_ps(result+ i * simd_vector_width);
 54 |         __m256 v_approx = approx(v_input);
 55 |         __m256 v_error = _mm256_and_ps(_mm256_sub_ps(v_approx, v_result), _mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF)));
 56 | 
 57 |         if (relative_error)
 58 |             v_error = _mm256_div_ps(v_error, v_result);
 59 |         
 60 |         ASSERT(_mm256_movemask_ps(_mm256_cmp_ps(v_error, v_epsilon, _CMP_LE_OQ)) == 0xff);
 61 |         v_max_error = _mm256_max_ps(v_max_error, v_error);
 62 |     }
 63 | 
 64 |     v_max_error = _mm256_max_ps(v_max_error, _mm256_permute_ps(v_max_error, _MM_SHUFFLE(2, 1, 0, 3)));
 65 |     v_max_error = _mm256_max_ps(v_max_error, _mm256_permute_ps(v_max_error, _MM_SHUFFLE(1, 0, 3, 2)));
 66 |     v_max_error = _mm256_max_ps(v_max_error, _mm256_permute2f128_ps(v_max_error, v_max_error, 1));
 67 | 
 68 |     printf("%s\t max error : %.*e\n", name, FLT_DECIMAL_DIG, _mm256_cvtss_f32(v_max_error));
 69 | #else
 70 |     float32x4_t v_epsilon = vdupq_n_f32(epsilon);
 71 |     float32x4_t v_max_error = vdupq_n_f32(0.f);
 72 | 
 73 |     for(uint32_t i=0; i<num_vectors; ++i)
 74 |     {
 75 |         float32x4_t v_input = vld1q_f32(input + i * simd_vector_width);
 76 |         float32x4_t v_result = vld1q_f32(result+ i * simd_vector_width);
 77 |         float32x4_t v_approx = approx(v_input);
 78 |         float32x4_t v_error = vabsq_f32(vsubq_f32(v_approx, v_result));
 79 | 
 80 |         if (relative_error)
 81 |             v_error = vdivq_f32(v_error, v_result);
 82 |         
 83 |         ASSERT(vminvq_u32(vcleq_f32(v_error, v_epsilon)) == UINT32_MAX);
 84 |         v_max_error = vmaxq_f32(v_max_error, v_error);
 85 |     }
 86 | 
 87 |     printf("%s\t max error : %.*e\n", name, FLT_DECIMAL_DIG, vmaxvq_f32(v_max_error));
 88 | #endif
 89 | 
 90 |     free(input);
 91 |     free(result);
 92 |     
 93 |     PASS();
 94 | }
 95 | 
 96 | //----------------------------------------------------------------------------------------------------------------------
 97 | // generic unit test with 2 arguments
 98 | TEST generic_test2(reference_function2 ref, approximation_function2 approx, float range_min, float range_max, float epsilon, uint32_t num_elements, bool relative_error, const char* name)
 99 | {
100 |     float* x = (float*) malloc(num_elements * sizeof(float));
101 |     float* y = (float*) malloc(num_elements * sizeof(float));
102 |     float* result = (float*) malloc(num_elements * sizeof(float));
103 |     float step = ((range_max - range_min) / (float) (num_elements-1));
104 |     uint32_t num_vectors = num_elements / simd_vector_width;
105 | 
106 |     for(uint32_t i=0; i<num_elements; ++i)
107 |     {
108 |         x[i] = (step * (float)(i)) + range_min;
109 |         y[i] = sinf(x[i]);
110 |         result[i] = ref(x[i], y[i]);
111 |     }
112 | 
113 | #ifdef __MATH__INTRINSICS__AVX__
114 |     __m256 v_epsilon = _mm256_set1_ps(epsilon);
115 |     __m256 v_max_error = _mm256_setzero_ps();
116 | 
117 |     for(uint32_t i=0; i<num_vectors; ++i)
118 |     {
119 |         __m256 v_x = _mm256_loadu_ps(x + i * simd_vector_width);
120 |         __m256 v_y = _mm256_loadu_ps(y + i * simd_vector_width);
121 |         __m256 v_result = _mm256_loadu_ps(result+ i * simd_vector_width);
122 |         __m256 v_approx = approx(v_x, v_y);
123 |         __m256 v_error = _mm256_and_ps(_mm256_sub_ps(v_approx, v_result), _mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF)));
124 | 
125 |         if (relative_error)
126 |             v_error = _mm256_div_ps(v_error, v_result);
127 |         
128 |         ASSERT(_mm256_movemask_ps(_mm256_cmp_ps(v_error, v_epsilon, _CMP_LE_OQ)) == 0xff);
129 |         v_max_error = _mm256_max_ps(v_max_error, v_error);
130 |     }
131 | 
132 |     v_max_error = _mm256_max_ps(v_max_error, _mm256_permute_ps(v_max_error, _MM_SHUFFLE(2, 1, 0, 3)));
133 |     v_max_error = _mm256_max_ps(v_max_error, _mm256_permute_ps(v_max_error, _MM_SHUFFLE(1, 0, 3, 2)));
134 |     v_max_error = _mm256_max_ps(v_max_error, _mm256_permute2f128_ps(v_max_error, v_max_error, 1));
135 | 
136 |     printf("%s\t max error : %.*e\n", name, FLT_DECIMAL_DIG, _mm256_cvtss_f32(v_max_error));
137 | #else
138 |     float32x4_t v_epsilon = vdupq_n_f32(epsilon);
139 |     float32x4_t v_max_error = vdupq_n_f32(0.f);
140 | 
141 |     for(uint32_t i=0; i<num_vectors; ++i)
142 |     {
143 |         float32x4_t v_x = vld1q_f32(x + i * simd_vector_width);
144 |         float32x4_t v_y = vld1q_f32(y + i * simd_vector_width);
145 |         float32x4_t v_result = vld1q_f32(result+ i * simd_vector_width);
146 |         float32x4_t v_approx = approx(v_x, v_y);
147 |         float32x4_t v_error = vabsq_f32(vsubq_f32(v_approx, v_result));
148 | 
149 |         if (relative_error)
150 |             v_error = vdivq_f32(v_error, v_result);
151 |         
152 |         ASSERT(vminvq_u32(vcleq_f32(v_error, v_epsilon)) == UINT32_MAX);
153 |         v_max_error = vmaxq_f32(v_max_error, v_error);
154 |     }
155 | 
156 |     printf("%s\t max error : %.*e\n", name, FLT_DECIMAL_DIG, vmaxvq_f32(v_max_error));
157 | #endif
158 | 
159 |     free(x);
160 |     free(y);
161 |     free(result);
162 |     
163 |     PASS();
164 | }
165 | 
166 | //----------------------------------------------------------------------------------------------------------------------
167 | TEST value_expected(float input, float target, approximation_function function)
168 | {
169 | #ifdef __MATH__INTRINSICS__AVX__
170 |     __m256 v_input = _mm256_set1_ps(input);
171 |     float result = _mm256_cvtss_f32(function(v_input));
172 |     ASSERT_EQ_FMT(target, result, "%f");
173 | #else
174 |     float32x4_t v_input = vdupq_n_f32(input);
175 |     float result = vgetq_lane_f32(function(v_input), 0);
176 |     ASSERT_EQ_FMT(target, result, "%f");
177 | #endif
178 | 
179 |     PASS();
180 | }
181 | 
182 | //----------------------------------------------------------------------------------------------------------------------
183 | TEST nan_expected(float input, approximation_function function)
184 | {
185 | #ifdef __MATH__INTRINSICS__AVX__
186 |     __m256 v_input = _mm256_set1_ps(input);
187 |     float result = _mm256_cvtss_f32(function(v_input));
188 |     ASSERT(isnan(result));
189 | #else
190 |     float32x4_t v_input = vdupq_n_f32(input);
191 |     float result = vgetq_lane_f32(function(v_input), 0);
192 |     ASSERT(isnan(result));
193 | #endif
194 | 
195 |     PASS();
196 | }
197 | 
 198 | #define NUM_SAMPLES (1024) // sample count passed as num_elements by most accuracy sweeps below
 199 | 
 200 | #ifdef __MATH_INTRINSINCS_FAST__ // NOTE(review): the "INTRINSINCS" spelling presumably matches the macro set by the build/header - confirm before renaming
 201 | static const float trigo_threshold = 1.e-06f;   // epsilon for the sin / cos sweeps
 202 | static const float exp_threshold = 3.e-06f;     // epsilon for the exp / exp2 sweeps
 203 | static const float arc_threshold = 1.e-04f;     // epsilon for the acos / asin / atan sweeps
 204 | static const float pow_threshold = 1.e-05f;     // epsilon for the pow sweep
 205 | static const float cbrt_threshold = 2.e-07f;    // epsilon for the cbrt sweep
 206 | #else
 207 | static const float trigo_threshold = FLT_EPSILON;   // epsilon for the sin / cos sweeps
 208 | static const float exp_threshold = 2.e-07f;         // epsilon for the exp / exp2 sweeps
 209 | static const float arc_threshold = 1.e-06f;         // epsilon for the acos / asin / atan sweeps
 210 | static const float pow_threshold = 1.e-06f;         // epsilon for the pow sweep
 211 | static const float cbrt_threshold = 2.e-07f;        // epsilon for the cbrt sweep (same value as in the fast branch)
 212 | #endif
213 | 
 214 | float atan2_xy(float x, float y) {return atan2f(y, x);} // adapter : takes (x, y) in the order generic_test2 passes them and forwards to atan2f(y, x)
215 | 
 216 | SUITE(trigonometry)
 217 | {
 218 |     printf(".");
 219 |     // arguments after the test name : reference, approximation, range_min, range_max, epsilon, num_elements, relative_error, label
 220 | #ifdef __MATH__INTRINSICS__AVX__
 221 |     RUN_TESTp(generic_test, sinf, mm256_sin_ps, -10.f, 10.f, trigo_threshold, NUM_SAMPLES, false, "mm256_sin_ps");
 222 |     RUN_TESTp(generic_test, cosf, mm256_cos_ps, -10.f, 10.f, trigo_threshold, NUM_SAMPLES, false, "mm256_cos_ps");
 223 |     RUN_TESTp(generic_test, acosf, mm256_acos_ps, -1.f, 1.f, arc_threshold, NUM_SAMPLES, false, "mm256_acos_ps");
 224 |     RUN_TESTp(generic_test, asinf, mm256_asin_ps, -1.f, 1.f, arc_threshold, NUM_SAMPLES, false, "mm256_asin_ps");
 225 |     RUN_TESTp(generic_test, atanf, mm256_atan_ps, -10.f, 10.f, arc_threshold, NUM_SAMPLES, false, "mm256_atan_ps");
 226 | 
 227 |     // this test fails on linux and has not been debugged yet (no linux machine available), so it is skipped there
 228 |     #if !defined(__linux__)
 229 |         RUN_TESTp(generic_test2, atan2_xy, mm256_atan2_ps, 0.f, 1000.f, 3.e-07f, NUM_SAMPLES, false, "mm256_atan2_ps");
 230 |     #endif
 231 | #else
 232 |     RUN_TESTp(generic_test, sinf, vsinq_f32, -10.f, 10.f, trigo_threshold, NUM_SAMPLES, false, "vsinq_f32");
 233 |     RUN_TESTp(generic_test, cosf, vcosq_f32, -10.f, 10.f, trigo_threshold, NUM_SAMPLES, false, "vcosq_f32");
 234 |     RUN_TESTp(generic_test, acosf, vacosq_f32, -1.f, 1.f, arc_threshold, NUM_SAMPLES, false, "vacosq_f32");
 235 |     RUN_TESTp(generic_test, asinf, vasinq_f32, -1.f, 1.f, arc_threshold, NUM_SAMPLES, false, "vasinq_f32");
 236 |     RUN_TESTp(generic_test, atanf, vatanq_f32, -10.f, 10.f, arc_threshold, NUM_SAMPLES, false, "vatanq_f32");
 237 |     RUN_TESTp(generic_test2, atan2_xy, vatan2q_f32, 0.f, 1000.f, 3.e-07f, NUM_SAMPLES, false, "vatan2q_f32");   // atan2 uses a literal epsilon in both precision modes
 238 | #endif
 239 | }
240 | 
241 | SUITE(exponentiation)
242 | {
243 |     printf(".");
244 | #ifdef __MATH__INTRINSICS__AVX__
245 |     RUN_TESTp(generic_test, logf, mm256_log_ps, FLT_EPSILON, 1.e20f, 1.e-07f, 32768, true, "mm256_log_ps");
246 |     RUN_TESTp(generic_test, log2f, mm256_log2_ps, FLT_EPSILON, 1.e20f, 3.e-07f, 32768, true, "mm256_log2_ps");
247 |     RUN_TESTp(generic_test, expf, mm256_exp_ps, -87.f, 87.f, exp_threshold, NUM_SAMPLES, true, "mm256_exp_ps");
248 |     RUN_TESTp(generic_test, exp2f, mm256_exp2_ps, -126.f, 126.f, exp_threshold, NUM_SAMPLES, true, "mm256_exp2");
249 |     RUN_TESTp(generic_test, cbrtf, mm256_cbrt_ps, -1000.f, 1000.f, cbrt_threshold, 32768, true, "mm256_cbrt_ps");
250 |     RUN_TESTp(generic_test2, powf, mm256_pow_ps, 0.f, 100000.f, pow_threshold, 32768, true, "mm256_pow_ps");
251 | #else
252 |     RUN_TESTp(generic_test, logf, vlogq_f32, FLT_EPSILON, 1.e20f, 1.e-07f, 32768, true, "vlogq_f32");
253 |     RUN_TESTp(generic_test, log2f, vlog2q_f32, FLT_EPSILON, 1.e20f, 3.e-07f, 32768, true, "vlog2q_f32");
254 |     RUN_TESTp(generic_test, expf, vexpq_f32, -87.f, 87.f, exp_threshold, NUM_SAMPLES, true, "vexpq_f32");
255 |     RUN_TESTp(generic_test, exp2f, vexp2q_f32, -126.f, 126.f, exp_threshold, NUM_SAMPLES, true, "vexp2q_f32");
256 |     RUN_TESTp(generic_test, cbrtf, vcbrtq_f32, -1000.f, 1000.f, cbrt_threshold, 4096, true, "vcbrtq_f32");
257 |     RUN_TESTp(generic_test2, powf, vpowq_f32, 0.f, 100000.f, pow_threshold, 32768, true, "vpowq_f32");
258 | #endif
259 | }
260 | 
 261 | SUITE(infinity_nan_compliance)
 262 | {
 263 |     const float positive_inf = INFINITY;
 264 |     const float negative_inf = -INFINITY;
 265 |     const float not_a_number = nanf("");
 266 |     // each check feeds one special input to a function : nan_expected(input, f) / value_expected(input, target, f)
 267 | #ifdef __MATH__INTRINSICS__AVX__
 268 | 
 269 |     // log
 270 |     RUN_TESTp(nan_expected, -1.f, mm256_log_ps);
 271 |     RUN_TESTp(nan_expected, not_a_number, mm256_log_ps);
 272 |     RUN_TESTp(value_expected,  1.f, 0.f, mm256_log_ps);
 273 |     RUN_TESTp(value_expected,  0.f, negative_inf, mm256_log_ps);
 274 |     RUN_TESTp(value_expected,  positive_inf, positive_inf, mm256_log_ps);
 275 | 
 276 |     // log2
 277 |     RUN_TESTp(nan_expected, -1.f, mm256_log2_ps);
 278 |     RUN_TESTp(nan_expected, not_a_number, mm256_log2_ps);
 279 |     RUN_TESTp(value_expected,  1.f, 0.f, mm256_log2_ps);
 280 |     RUN_TESTp(value_expected,  0.f, negative_inf, mm256_log2_ps);
 281 |     RUN_TESTp(value_expected,  positive_inf, positive_inf, mm256_log2_ps);
 282 | 
 283 |     // exp
 284 |     RUN_TESTp(nan_expected, not_a_number, mm256_exp_ps);
 285 |     RUN_TESTp(value_expected, 0.f, 1.f, mm256_exp_ps);
 286 |     RUN_TESTp(value_expected,-0.f, 1.f, mm256_exp_ps);
 287 |     RUN_TESTp(value_expected, positive_inf, positive_inf, mm256_exp_ps);
 288 |     RUN_TESTp(value_expected, negative_inf, 0.f, mm256_exp_ps);
 289 | 
 290 |     // exp2
 291 |     RUN_TESTp(nan_expected, not_a_number, mm256_exp2_ps);
 292 |     RUN_TESTp(value_expected, 0.f, 1.f, mm256_exp2_ps);
 293 |     RUN_TESTp(value_expected,-0.f, 1.f, mm256_exp2_ps);
 294 |     RUN_TESTp(value_expected, positive_inf, positive_inf, mm256_exp2_ps);
 295 |     RUN_TESTp(value_expected, negative_inf, 0.f, mm256_exp2_ps);
 296 | 
 297 |     // sin
 298 |     RUN_TESTp(nan_expected, not_a_number, mm256_sin_ps);
 299 |     RUN_TESTp(nan_expected, positive_inf, mm256_sin_ps);
 300 |     RUN_TESTp(nan_expected, negative_inf, mm256_sin_ps);
 301 |     RUN_TESTp(value_expected, 0.f, 0.f, mm256_sin_ps);
 302 |     RUN_TESTp(value_expected, -0.f, -0.f, mm256_sin_ps);
 303 | 
 304 |     // cos
 305 |     RUN_TESTp(nan_expected, not_a_number, mm256_cos_ps);
 306 |     RUN_TESTp(nan_expected, positive_inf, mm256_cos_ps);
 307 |     RUN_TESTp(nan_expected, negative_inf, mm256_cos_ps);
 308 |     RUN_TESTp(value_expected, 0.f, 1.f, mm256_cos_ps);
 309 |     RUN_TESTp(value_expected, -0.f, 1.f, mm256_cos_ps);
 310 | 
 311 |     // asin
 312 |     RUN_TESTp(nan_expected, not_a_number, mm256_asin_ps);
 313 |     RUN_TESTp(nan_expected, 2.f, mm256_asin_ps);
 314 |     RUN_TESTp(nan_expected, -2.f, mm256_asin_ps);
 315 |     RUN_TESTp(value_expected, 0.f, 0.f, mm256_asin_ps);
 316 |     RUN_TESTp(value_expected, -0.f, -0.f, mm256_asin_ps);
 317 | 
 318 |     // acos
 319 |     RUN_TESTp(nan_expected, not_a_number, mm256_acos_ps);
 320 |     RUN_TESTp(nan_expected, 2.f, mm256_acos_ps);
 321 |     RUN_TESTp(nan_expected, -2.f, mm256_acos_ps);
 322 |     RUN_TESTp(value_expected, 1.f, 0.f, mm256_acos_ps);
 323 | 
 324 |     // atan
 325 |     RUN_TESTp(nan_expected, not_a_number, mm256_atan_ps);
 326 |     RUN_TESTp(value_expected, 0.f, 0.f, mm256_atan_ps);
 327 |     RUN_TESTp(value_expected, -0.f, -0.f, mm256_atan_ps);
 328 |     RUN_TESTp(value_expected, positive_inf, (float)M_PI_2, mm256_atan_ps);
 329 |     RUN_TESTp(value_expected, negative_inf, (float)-M_PI_2, mm256_atan_ps);
 330 | 
 331 |     // cbrt
 332 |     RUN_TESTp(nan_expected, not_a_number, mm256_cbrt_ps);
 333 |     RUN_TESTp(value_expected, 0.f, 0.f, mm256_cbrt_ps);
 334 |     RUN_TESTp(value_expected, -0.f, -0.f, mm256_cbrt_ps);
 335 | 
 336 | #else
 337 |     RUN_TESTp(nan_expected, -1.f, vlogq_f32);   // log
 338 |     RUN_TESTp(nan_expected, not_a_number, vlogq_f32);
 339 |     RUN_TESTp(value_expected,  1.f, 0.f, vlogq_f32);
 340 |     RUN_TESTp(value_expected,  0.f, negative_inf, vlogq_f32);
 341 |     RUN_TESTp(value_expected,  positive_inf, positive_inf, vlogq_f32);
 342 | 
 343 |     RUN_TESTp(nan_expected, -1.f, vlog2q_f32);   // log2
 344 |     RUN_TESTp(nan_expected, not_a_number, vlog2q_f32);
 345 |     RUN_TESTp(value_expected,  1.f, 0.f, vlog2q_f32);
 346 |     RUN_TESTp(value_expected,  0.f, negative_inf, vlog2q_f32);
 347 |     RUN_TESTp(value_expected,  positive_inf, positive_inf, vlog2q_f32);
 348 | 
 349 |     // exp
 350 |     RUN_TESTp(nan_expected, not_a_number, vexpq_f32);
 351 |     RUN_TESTp(value_expected, 0.f, 1.f, vexpq_f32);
 352 |     RUN_TESTp(value_expected,-0.f, 1.f, vexpq_f32);
 353 |     RUN_TESTp(value_expected, positive_inf, positive_inf, vexpq_f32);
 354 |     RUN_TESTp(value_expected, negative_inf, 0.f, vexpq_f32);
 355 | 
 356 |     // exp2
 357 |     RUN_TESTp(nan_expected, not_a_number, vexp2q_f32);
 358 |     RUN_TESTp(value_expected, 0.f, 1.f, vexp2q_f32);
 359 |     RUN_TESTp(value_expected,-0.f, 1.f, vexp2q_f32);
 360 |     RUN_TESTp(value_expected, positive_inf, positive_inf, vexp2q_f32);
 361 |     RUN_TESTp(value_expected, negative_inf, 0.f, vexp2q_f32);
 362 | 
 363 |     // sin
 364 |     RUN_TESTp(nan_expected, not_a_number, vsinq_f32);
 365 |     RUN_TESTp(nan_expected, positive_inf, vsinq_f32);
 366 |     RUN_TESTp(nan_expected, negative_inf, vsinq_f32);
 367 |     RUN_TESTp(value_expected, 0.f, 0.f, vsinq_f32);
 368 |     RUN_TESTp(value_expected, -0.f, -0.f, vsinq_f32);
 369 | 
 370 |     // cos
 371 |     RUN_TESTp(nan_expected, not_a_number, vcosq_f32);
 372 |     RUN_TESTp(nan_expected, positive_inf, vcosq_f32);
 373 |     RUN_TESTp(nan_expected, negative_inf, vcosq_f32);
 374 |     RUN_TESTp(value_expected, 0.f, 1.f, vcosq_f32);
 375 |     RUN_TESTp(value_expected, -0.f, 1.f, vcosq_f32);
 376 | 
 377 |     // asin
 378 |     RUN_TESTp(nan_expected, not_a_number, vasinq_f32);
 379 |     RUN_TESTp(nan_expected, 2.f, vasinq_f32);
 380 |     RUN_TESTp(nan_expected, -2.f, vasinq_f32);
 381 |     RUN_TESTp(value_expected, 0.f, 0.f, vasinq_f32);
 382 |     RUN_TESTp(value_expected, -0.f, -0.f, vasinq_f32);
 383 | 
 384 |     // acos
 385 |     RUN_TESTp(nan_expected, not_a_number, vacosq_f32);
 386 |     RUN_TESTp(nan_expected, 2.f, vacosq_f32);
 387 |     RUN_TESTp(nan_expected, -2.f, vacosq_f32);
 388 |     RUN_TESTp(value_expected, 1.f, 0.f, vacosq_f32);
 389 | 
 390 |     // atan
 391 |     RUN_TESTp(nan_expected, not_a_number, vatanq_f32);
 392 |     RUN_TESTp(value_expected, 0.f, 0.f, vatanq_f32);
 393 |     RUN_TESTp(value_expected, -0.f, -0.f, vatanq_f32);
 394 |     RUN_TESTp(value_expected, positive_inf, (float)M_PI_2, vatanq_f32);
 395 |     RUN_TESTp(value_expected, negative_inf, (float)-M_PI_2, vatanq_f32);
 396 | 
 397 |     // cbrt
 398 |     RUN_TESTp(nan_expected, not_a_number, vcbrtq_f32);
 399 |     RUN_TESTp(value_expected, 0.f, 0.f, vcbrtq_f32);
 400 |     RUN_TESTp(value_expected, -0.f, -0.f, vcbrtq_f32);
 401 | #endif
 402 | }
403 | 
 404 | GREATEST_MAIN_DEFS();
 405 | // entry point : prints which precision mode was compiled in, then runs the three suites in order
 406 | int main(int argc, char * argv[])
 407 | {
 408 |     GREATEST_MAIN_BEGIN();
 409 |     // banner : selected by the same macro that picks the thresholds
 410 | #ifdef __MATH_INTRINSINCS_FAST__
 411 |     printf("math intrinsics unit tests : mode fast\n\n");
 412 | #else
 413 |     printf("math intrinsics unit tests : mode precision\n\n");
 414 | #endif
 415 | 
 416 |     RUN_SUITE(trigonometry);
 417 |     RUN_SUITE(exponentiation);
 418 |     RUN_SUITE(infinity_nan_compliance);
 419 | 
 420 |     GREATEST_MAIN_END();
 421 |     // NOTE(review): presumably only here to mark both helpers as used; whether this is reachable depends on GREATEST_MAIN_END - confirm
 422 |     (void)nan_expected;
 423 |     (void)value_expected;
 424 | }
425 | 
426 | 


--------------------------------------------------------------------------------