├── .gitignore ├── LICENSE ├── README.md └── src ├── Fragmentarium └── LiftedDomainColoring.frag ├── Mathematica ├── EulerTaitBryan.nb ├── Hash2D_PSE.nb ├── Normal1.nb ├── PrideoutTwist.nb ├── PrnsLcgs.nb ├── QuatHalfAngleCayley.nb ├── README.md ├── RealQuat.wl ├── Sobol.nb ├── ToPlotly.nb ├── Utils.wl └── Weyl2dPoints.nb ├── Posts ├── .gitignore ├── README.md ├── ballcube.c ├── compute_weyl_1d.c ├── discsquare.c ├── hopf2q.c ├── involution_mix.c ├── normals_to_rot.c ├── ortho_basis.c ├── popcnt_norm_dist.c ├── posits │ ├── Posit.nb │ └── Post1.nb ├── q2mat.c ├── quat2tmt.c ├── quat_compose_error.c ├── quatquant0.c ├── slerp │ ├── atan.sollya │ ├── ref.sollya │ ├── sincos.sollya │ ├── slerp.c │ └── util │ │ └── util.sollya ├── tait2q.c ├── trisect │ ├── .gitignore │ ├── test_vector_4096.h │ ├── trisect.c │ └── trisect.sollya └── xorrot.c ├── Python └── Hash2D_PSE.py ├── README.md ├── SFH ├── .gitignore ├── GF2 │ ├── .gitignore │ ├── Makefile │ ├── README.md │ ├── bmat.h │ ├── bmat_avx2.h │ ├── bmat_basics.c │ ├── bmat_charpoly.c │ ├── bmat_everything.h │ ├── bmat_flint.c │ ├── bmat_flint.h │ ├── bmat_func.c │ ├── bmat_gauss.c │ ├── bmat_generic.h │ ├── bmat_i.h │ ├── bmat_m4ri.c │ ├── bmat_m4ri.h │ ├── bmat_mul.c │ ├── bmat_pow.c │ ├── bmat_print.c │ ├── bmat_random.c │ ├── bmat_ref.c │ ├── bmat_ref.h │ ├── bmat_set.c │ ├── bmat_toeplitz.c │ ├── bmat_transpose.c │ ├── examples │ │ └── jump_64.c │ └── tests │ │ ├── .gitignore │ │ ├── Makefile │ │ ├── README.md │ │ ├── bmat_test.c │ │ ├── bmat_test.h │ │ ├── bmat_test_flint.c │ │ ├── bmat_test_kernel.c │ │ ├── bmat_test_m4ri.c │ │ └── bmat_timing.c ├── README.md ├── Sobol.h ├── bitops.h ├── bitops_small.h ├── bitset.h ├── carryless.h ├── compiler_hints.h ├── examples │ ├── LcgsTest.c │ ├── PrnsTest.c │ └── SobolEx1.c ├── extern │ └── README.md ├── f32_horner.h ├── f32_horner2.h ├── f32_hornerx.h ├── f32_math.h ├── f32_math │ ├── .gitignore │ ├── f32_asincos.c │ ├── f32_asincospi.c │ ├── f32_cbrt.c │ ├── f32_rsqrt.c │ 
├── f32_sincospi.c │ ├── internal │ │ ├── f32_acospi_sb.h │ │ ├── f32_asincos.h │ │ ├── f32_asincospi.h │ │ ├── f32_asinpi_sb.h │ │ ├── f32_cbrt.h │ │ ├── f32_math_common.h │ │ ├── f32_rsqrt.h │ │ └── f32_sincospi.h │ ├── sollya │ │ ├── acos.sollya │ │ ├── acospi_sb.sollya │ │ ├── asin_classic.sollya │ │ ├── asinpi_classic.sollya │ │ ├── asinpi_sb.sollya │ │ ├── atan.sollya │ │ ├── cospi.sollya │ │ ├── cut.sollya │ │ ├── sinpi.sollya │ │ └── util.sollya │ └── tests │ │ ├── .gitignore │ │ ├── Makefile │ │ ├── README.md │ │ ├── acos.c │ │ ├── acosh.c │ │ ├── acospi.c │ │ ├── asin.c │ │ ├── asinpi.c │ │ ├── atan.c │ │ ├── atanpi.c │ │ ├── backup │ │ ├── acos.c │ │ ├── acospi.c │ │ ├── asin.c │ │ ├── asinpi.c │ │ ├── atan.c │ │ ├── atanpi.c │ │ ├── common.h │ │ ├── core_math_expand.h │ │ └── util.h │ │ ├── cbrt.c │ │ ├── common.h │ │ ├── core_math_expand.h │ │ ├── cospi.c │ │ ├── exp.c │ │ ├── reports │ │ ├── acos.md │ │ ├── acospi.md │ │ ├── asin.md │ │ ├── asinpi.md │ │ ├── cbrt.md │ │ ├── cospi.md │ │ └── sinpi.md │ │ ├── rsqrt.c │ │ ├── sinpi.c │ │ ├── timehack.h │ │ └── util.h ├── f32_quant.h ├── f32_util.h ├── f64_horner.h ├── f64_horner2.h ├── f64_math.h ├── f64_util.h ├── intops.h ├── lcgs.h ├── lprns.h ├── prng_small.h ├── prns.h ├── quat.h ├── sfibpoints.h ├── simd.h ├── simd_2d3d.h ├── swar.h ├── swar_avx2.h ├── swing_twist.h ├── tests │ ├── .gitignore │ ├── Makefile │ ├── carryless.c │ └── swar_avx2.c ├── vec2.h ├── vec3.h └── welford.h ├── Sollya ├── README.md ├── TODO ├── addk.sollya ├── approx.sollya ├── argreduce.sollya ├── common.sollya ├── examples │ ├── .gitignore │ ├── atan.sollya │ ├── atan_pi8.sollya │ ├── disc.sollya │ ├── f.sollya │ ├── foo.sollya │ ├── log2.sollya │ ├── sin.sollya │ └── sincospi.sollya ├── mulk.sollya ├── plotly.sollya ├── struct.sollya └── util.sollya └── TestAndSearch ├── LprnsTestU01.c ├── PrnsTestU01.c ├── WeylTestU01.c ├── jitahash.c └── results ├── lprns_base.txt └── lprns_stream.txt /.gitignore: 
-------------------------------------------------------------------------------- 1 | *.exe 2 | *~ 3 | *.png 4 | \#*\# 5 | src/Posts/rank1/foo 6 | src/Mathematica/DiscPoints.jpg 7 | src/Mathematica/PrideoutTwist1.jpg 8 | src/Posts/foo.ppm 9 | src/Posts/involution_mix_b1.c 10 | src/Posts/involution_mix_b2.c 11 | src/Posts/divisibility.c 12 | src/Posts/foo.c 13 | src/Posts/lambert.c 14 | src/Posts/lds_disc_1.c 15 | src/Posts/rank1.c 16 | src/Posts/rank1/basis.c 17 | src/Posts/slerp/reparam1.sollya 18 | src/TestAndSearch/check_bijection.c 19 | src/TestAndSearch/WeylBitByBit.c 20 | src/SFH/plotly.h 21 | src/SFH/rng_xform.h 22 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | This is free and unencumbered software released into the public domain. 2 | 3 | Anyone is free to copy, modify, publish, use, compile, sell, or 4 | distribute this software, either in source code form or as a compiled 5 | binary, for any purpose, commercial or non-commercial, and by any 6 | means. 7 | 8 | In jurisdictions that recognize copyright laws, the author or authors 9 | of this software dedicate any and all copyright interest in the 10 | software to the public domain. We make this dedication for the benefit 11 | of the public at large and to the detriment of our heirs and 12 | successors. We intend this dedication to be an overt act of 13 | relinquishment in perpetuity of all present and future rights to this 14 | software under copyright law. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 17 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 18 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
19 | IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR 20 | OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 21 | ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 22 | OTHER DEALINGS IN THE SOFTWARE. 23 | 24 | For more information, please refer to http://unlicense.org 25 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Stand-alone-junk 2 | 3 | A repo for stuff that doesn't deserve one of its own. 4 | 5 | 6 | 7 | -------------------------------------------------------------------------------- /src/Fragmentarium/LiftedDomainColoring.frag: -------------------------------------------------------------------------------- 1 | // public domain (unlicense) 2 | 3 | // Pole (magnitude) indication is borked. Haven't 4 | // found anything I'm super happy with. 5 | 6 | #include "2D.frag" 7 | //#include "Progressive2D.frag" 8 | #group DomainPlot 9 | 10 | // coloring tweakable constants 11 | 12 | uniform bool SMOOTH_HSV; checkbox[true] 13 | 14 | // enable/disable growth indicators 15 | uniform bool GROW_IND; checkbox[true] 16 | 17 | // growth indication constant (>=0). larger is less pronounced. 18 | uniform float GROW_K; slider[0.0,0.9,1.0] 19 | 20 | // grid line scale (radial & concentric) 21 | uniform float GRID_K; slider[0.0,0.01,0.2] 22 | 23 | // enable/disable radial grid lines 24 | uniform bool RADIAL_GRID; checkbox[true] 25 | 26 | // number of radial lines 27 | uniform int RADIAL_LINES; slider[1,32,64] 28 | 29 | // radial: post blended in RGB if defined, otherwise 30 | // via saturation. 
31 | uniform bool RADIAL_GRID_POST; checkbox[false] 32 | const vec3 rg_color = vec3(1.0,1.0,1.0); 33 | 34 | // limit ouput to magnitude range [0,1] 35 | uniform bool UNIT_ONLY; checkbox[false] 36 | 37 | // blend value for radial lines (0,1) 0=strong, 1=none 38 | uniform float RADIAL_LINE_STR; slider[0.0,0.001,1.0] 39 | 40 | // concentric grid lines: shown if defined 41 | uniform bool CONCENTRIC_GRID; checkbox[true] 42 | 43 | // concentric post blended in RGB if defined, otherwise 44 | // via brightness 45 | uniform bool CONCENTRIC_GRID_POST; checkbox[false] 46 | 47 | uniform bool RECT_GRID; checkbox[false] 48 | 49 | uniform float RECT_GRID_STR; slider[0.0,0.55,1.0] 50 | 51 | #define RECT_GRID_S 52 | 53 | // concentric grid color 54 | const vec3 cg_color = vec3(0.0,0.0,0.0); 55 | 56 | // rotate hue angle (choose what's zero) 57 | uniform float HUE_ANGLE; slider[0.0,0.55,1.0] 58 | 59 | // if defined: darken as point approach zero (south pole) 60 | // and brighten as approachs infinity (north pole) 61 | #define POLE_SHADING 62 | 63 | // if defined: show a rectangle grid. Number of per unit distance. 64 | //#define R_GRID 2.0 65 | 66 | 67 | #define E 2.71828182845 68 | vec2 epowz(vec2 c) 69 | { 70 | return vec2(cos(c.y), sin(c.y))*pow(E, c.x); 71 | } 72 | 73 | // the map to visualize 74 | vec2 map(vec2 z) 75 | { 76 | return z; 77 | } 78 | 79 | 80 | //---------------------- 81 | 82 | // derived constants 83 | #define GROW_M (1.0/(GROW_K+1.0)) 84 | #define GRID_IK (1.0/GRID_K) 85 | #define PI (52707180.0/16777216.0) 86 | #define RL (float(RADIAL_LINES)/(2.0*PI)) 87 | 88 | vec3 hsv2rgb(vec3 c) 89 | { 90 | const vec4 K = vec4(1.0, 2.0/3.0, 1.0/3.0, 3.0); 91 | vec3 p = abs(fract(c.xxx + K.xyz) * 6.0 - K.www); 92 | return c.z * mix(K.xxx, clamp(p - K.xxx, 0.0, 1.0), c.y); 93 | } 94 | 95 | // Fabrice Neyret's smoothed conversion 96 | vec3 fn_hsv2rgb(vec3 c) 97 | { 98 | return c.z*(1.-c.y*smoothstep(2.,1.,abs(mod(c.x*6.+vec3(0,4,2),6.) 
-3.))); 99 | } 100 | 101 | float grid_step(float t) 102 | { 103 | t = min(t, 1.0-t); 104 | return smoothstep(0.0, GRID_K, t); 105 | } 106 | 107 | // lifted domain plot color for point 'c' 108 | vec3 domainPlot(vec2 c) 109 | { 110 | float d = dot(c,c); 111 | float l2d = log2(d); 112 | float theta = atan(c.y,c.x); 113 | 114 | // growth indication 115 | float b1 = fract(0.5*l2d); // frac part of log2(||c||) 116 | float b0 = b1+b1; 117 | 118 | if (b0 >= 1.0) b0 -= 1.0; 119 | 120 | float b = GROW_M*(b0+GROW_K); 121 | 122 | // concentric grid covers branch-point 123 | if (b1 < 0.5) b = 1.0; 124 | 125 | if (!GROW_IND) b = 1.0; 126 | 127 | // saturation and value 128 | float sat = 1.0; 129 | float val = 1.0; 130 | 131 | // magnitude indication: needs work 132 | if (d < 1.0) { 133 | val = 2.0/(1.0-log2(d)); 134 | val = 1.0; 135 | // val = sqrt(2.0*atan(1.0/sqrt(d))); 136 | } 137 | else if (d > 2.0) { 138 | sat = 2.0/l2d; 139 | //sat = sqrt(2.0*atan(1.0/sqrt(d))); 140 | } 141 | 142 | // rectangular grid 143 | if (RECT_GRID ){ 144 | float x = floor(c.x); 145 | float y = floor(c.y); 146 | if (mod(x+y,2.0)==1.0) sat*=RECT_GRID_STR; 147 | } 148 | 149 | // radial grid (integrated) 150 | if (RADIAL_GRID && !RADIAL_GRID_POST) { 151 | sat = mix(RADIAL_LINE_STR, sat, grid_step(fract(theta*RL))); 152 | } 153 | 154 | // concentric grid (integrated) 155 | if (CONCENTRIC_GRID && !CONCENTRIC_GRID_POST) { 156 | b = mix(0.0, b, grid_step(b0)); 157 | } 158 | 159 | // convert to RGB 160 | float hue = theta*(1.0/(2.0*PI))-HUE_ANGLE; 161 | vec3 v; 162 | 163 | if (SMOOTH_HSV) 164 | v =fn_hsv2rgb(vec3(hue,sat,val)); 165 | else 166 | v = hsv2rgb(vec3(hue,sat,val)); 167 | 168 | v *= b; 169 | 170 | // radial grid (as post effect) 171 | if (RADIAL_GRID && RADIAL_GRID_POST) { 172 | v = mix(rg_color, v, grid_step(fract(theta*RL))); 173 | } 174 | 175 | // concentric grid (as post effect) 176 | if (CONCENTRIC_GRID && CONCENTRIC_GRID_POST) { 177 | v = mix(cg_color, v, grid_step(b0)); 178 | } 179 | 180 | 
return v; 181 | } 182 | 183 | vec3 color(vec2 z) 184 | { 185 | if (UNIT_ONLY && dot(z,z)>1.0) return vec3(0.0); 186 | 187 | return domainPlot(map(z)); 188 | } 189 | 190 | #preset Default 191 | Center = 0,0 192 | Zoom = 1 193 | AntiAliasScale = 2 194 | AntiAlias = 15 195 | SMOOTH_HSV = true 196 | GROW_IND = true 197 | GROW_K = 0.9 198 | GRID_K = 0.01 199 | RADIAL_GRID = true 200 | RADIAL_GRID_POST = false 201 | RADIAL_LINES = 32 202 | UNIT_ONLY = false 203 | RADIAL_LINE_STR = 0.001 204 | CONCENTRIC_GRID = true 205 | CONCENTRIC_GRID_POST = false 206 | RECT_GRID = false 207 | RECT_GRID_STR = 0.55 208 | HUE_ANGLE = 0 209 | #endpreset -------------------------------------------------------------------------------- /src/Mathematica/README.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /src/Mathematica/Utils.wl: -------------------------------------------------------------------------------- 1 | (* ::Package:: *) 2 | 3 | (* Marc B. Reynolds, 2017 *) 4 | (* Public Domain under http://unlicense.org, see link for details. *) 5 | 6 | BeginPackage["Utils`"] 7 | 8 | (* printf %e output to a list *) 9 | fromPrintfE[a_ b_ + c_] := a 10^c; 10 | SetAttributes[fromPrintfE, {Listable, HoldAll}]; 11 | 12 | (* printf %a output to a list *) 13 | fromPrintfA0[Hold[Times[a_, Dot[b_, c_]]]] := ToExpression[ 14 | "16^^" <> StringDrop[SymbolName[b], 1] <> "." 
<> 15 | StringDrop[SymbolName[c], -1]]; 16 | fromPrintfA[Plus[a__, b__]] := fromPrintfA0[Hold[a]] 2^b; 17 | SetAttributes[fromPrintfA, {Listable, HoldAll}]; 18 | 19 | 20 | EndPackage[] 21 | -------------------------------------------------------------------------------- /src/Posts/.gitignore: -------------------------------------------------------------------------------- 1 | gamma 2 | posits 3 | swing_twist -------------------------------------------------------------------------------- /src/Posts/README.md: -------------------------------------------------------------------------------- 1 | # Public domain toy code supplements for blog posts 2 | 3 | Toy examples from: http://marc-b-reynolds.github.io/ 4 | 5 | 6 | | file | brief description | post | 7 | | ---------------- | --------------------------------------------- | ------------------------------------------------------------ | 8 | | ballcube.c | Ball/cylinder/cube maps | [link](http://marc-b-reynolds.github.io/math/2017/01/27/CubeBall.html) | 9 | | discsquare.c | Square/Disc maps | [link](http://marc-b-reynolds.github.io/math/2017/01/08/SquareDisc.html) | 10 | | hopf2q.c | Quaternion/Hopf coordinate conversion | [link](http://marc-b-reynolds.github.io/quaternions/2017/05/12/HopfCoordConvert.html) | 11 | | normals_to_rot.c | Rotation from two normals | [link](http://marc-b-reynolds.github.io/quaternions/2016/08/09/TwoNormToRot.html) | 12 | | ortho_basis.c | Orthonormal basis from normal | [link](http://marc-b-reynolds.github.io/quaternions/2016/07/06/Orthonormal.html) | 13 | | q2mat.c | Quaternion/Rotation matrix conversion | [link](http://marc-b-reynolds.github.io/quaternions/) | 14 | | quat2tmt.c | Factorize quaternion into 2 rotations | [link](http://marc-b-reynolds.github.io/quaternions/2017/05/12/HopfCoordConvert.html) | 15 | | quatquant0.c | Context free quaternion quantization | [link](http://marc-b-reynolds.github.io/quaternions/2017/05/02/QuatQuantPart1.html) | 16 | | tait2q.c | Euler/Tait-Bryan 
quaternion conversion | [link](http://marc-b-reynolds.github.io/math/2017/04/18/TaitEuler.html) | 17 | | xorrot.c | Bijections of form: `x ^ rot(x,a) ^ rot(x,b)` | [link](http://marc-b-reynolds.github.io/math/2017/10/13/XorRotate.html) | 18 | 19 | -------------------------------------------------------------------------------- /src/Posts/compute_weyl_1d.c: -------------------------------------------------------------------------------- 1 | // Public Domain under http://unlicense.org, see link for details. 2 | 3 | // SEE: http://marc-b-reynolds.github.io/distribution/2020/01/24/Rank1Pre.html 4 | 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | 11 | // binary32 approximation of (sqrt(5)-1)/2. Choose 12 | // the rounding that insures the final bit is set. 13 | // RD( (sqrt(5)-1)/2 ) = 10368889*2^-24 14 | static const float alpha = 0x1.3c6ef2p-1f; 15 | static const uint32_t K24 = 10368889u << (32-24); 16 | 17 | 18 | // binary32: ensure rounded to even (23-bits) 19 | 20 | //static const uint32_t K23 = 5184445u << (32-23); // 1/phi 21 | //static const uint32_t K23 = 4913933u << (32-23); // 2-sqrt(2) 22 | static const uint32_t K23 = 5664675u << (32-23); // 2-plastic 23 | 24 | static const float alpha23 = K23*0x1.p-32f; 25 | 26 | 27 | // 32-bit version 28 | //static const uint32_t A = 2654435769; 29 | 30 | inline uint32_t uabs(uint32_t a) { return (int32_t)a >=0 ? 
a : -a; } 31 | 32 | // binary32: (i*alpha) when alpha is rounded to odd 33 | void test_closed_form() 34 | { 35 | float f = 0; 36 | uint32_t n = 0, m=1; 37 | uint32_t i,t,r; 38 | 39 | printf("closed form:\n"); 40 | do { 41 | f = fmodf((float)n*alpha, 1.f); 42 | i = K24*n; 43 | t = (uint32_t)(f*0x1p32f); 44 | r = uabs(i-t) >> 8; 45 | 46 | if (r >= m) { 47 | float e = (float)i * 0x1p-32f; 48 | printf("| %9u | %9f | %f | %f | %08x | %08x| %7d | \n", n, log2f((float)n), e, f, i, t, r); 49 | m <<= 1; 50 | } 51 | n++; 52 | } while(m < 0x01000000); 53 | } 54 | 55 | // binary32: (si+alpha) when alpha is rounded to odd 56 | void test_recurrent_form() 57 | { 58 | float f = 0.f; 59 | uint32_t i = 0; 60 | uint32_t n = 0, m=1; 61 | uint32_t t,r; 62 | 63 | printf("\nadditive method:\n"); 64 | do { 65 | f = f + alpha; if (f >= 1.f) f -= 1.f; 66 | i = i + K24; 67 | t = (uint32_t)(f*0x1p32f); 68 | r = uabs(i-t) >> 8; 69 | 70 | if (r >= m) { 71 | float e = (float)i * 0x1p-32f; 72 | printf("| %9u | %9f | %f | %f | %08x | %08x| %7d | \n", n, log2f((float)n), e, f, i, t, r); 73 | m <<= 1; 74 | } 75 | 76 | n++; 77 | } while(m < 0x01000000); 78 | } 79 | 80 | 81 | // binary32: ensure alpha is even 82 | void test_23() 83 | { 84 | float f = 0; 85 | uint32_t n = 0; 86 | uint32_t i = 0, t; 87 | 88 | t = (uint32_t)(alpha23*0x1p32f); 89 | 90 | printf("\nvalidate 23-bit method: %f %08x %08x\n", alpha23,t, K23); 91 | 92 | // check the constants 93 | if (alpha23 > 0.5f && alpha23 < 1.f) { 94 | if (t == K23) { 95 | if ((t & 0x1ff) == 0) { 96 | 97 | if ((t & 0x200) == 0) 98 | printf("warning: reduced period 23-bit constant should be odd\n"); 99 | 100 | // brute force check all 2^23 elements of the set 101 | do { 102 | f = f + alpha23; if (f >= 1.f) f -= 1.f; 103 | i = i + K23; 104 | t = (uint32_t)(f*0x1p32f); 105 | 106 | if (i-t != 0) { 107 | printf("%8u : %a %a %08x\n", n, (float)i * 0x1p-32f, f,i^t); return; 108 | } 109 | 110 | n++; 111 | } while(i != 0); 112 | 113 | // period is 2^23: n = 2^23 and 
f=0 at this point since we've walked 114 | // the full period. 115 | if (f == 0.f) 116 | printf(" success\n"); 117 | else 118 | printf(" error: got f=%a (expected 0)\n", f); 119 | 120 | 121 | } 122 | else printf("error: alpha23 has more than 23-bit\n"); 123 | } 124 | else printf("error: alpha23 and K23 aren't the same\n"); 125 | } 126 | else printf("error: alpha23 not on (0.5,1)\n"); 127 | } 128 | 129 | int main(void) 130 | { 131 | test_closed_form(); 132 | test_recurrent_form(); 133 | test_23(); 134 | } 135 | -------------------------------------------------------------------------------- /src/Posts/normals_to_rot.c: -------------------------------------------------------------------------------- 1 | 2 | // Toy code for: // http://marc-b-reynolds.github.io/quaternion/2016/08/09/TwoNormToRot.html 3 | // 4 | // Generate uniform points on the unit sphere, form the rotation, reconstruct the 5 | // target normal and roughly measure angular error. Ignores the degenerate 6 | // case in all implementations. 7 | // 8 | // to compile under VC you'll have to change the float hex-constants...couldn't 9 | // be bothered. 
10 | 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | 18 | #if 1 // LAZY HERE 19 | #include 20 | #endif 21 | 22 | // compile time configuration options 23 | 24 | // enable to test with both pseudo-random and Sobol sequences 25 | #define USE_SOBOL 26 | 27 | #define TRIALS 0xFFFFFFF 28 | 29 | #include "../SFH/quat.h" 30 | 31 | // xoroshiro128+ 32 | 33 | uint64_t rng_state[2]; 34 | 35 | static inline uint64_t rotl(const uint64_t v, int i) 36 | { 37 | return (v << i)|(v >> (64-i)); 38 | } 39 | 40 | static inline uint64_t rng_u64(void) 41 | { 42 | uint64_t s0 = rng_state[0]; 43 | uint64_t s1 = rng_state[1]; 44 | uint64_t r = s0 + s1; 45 | 46 | s1 ^= s0; 47 | rng_state[0] = rotl(s0,55) ^ s1 ^ (s1<<14); 48 | rng_state[1] = rotl(s1,36); 49 | 50 | return r; 51 | } 52 | 53 | static inline float rng_f32(void) 54 | { 55 | return (rng_u64() >> 40)*0x1p-24f; 56 | } 57 | 58 | // uniform on disk 59 | float uniform_disk(vec2_t* p) 60 | { 61 | float d,x,y; 62 | uint64_t v; 63 | 64 | do { 65 | v = rng_u64(); 66 | x = (v >> 40)*0x1p-24f; 67 | y = (v & 0xFFFFFF)*0x1p-24f; 68 | x = 2.f*x-1.f; d = x*x; 69 | y = 2.f*y-1.f; d += y*y; 70 | } while(d >= 1.f); 71 | 72 | p->x = x; 73 | p->y = y; 74 | 75 | return d; 76 | } 77 | 78 | // uniform on S2 79 | void uniform_s2(vec3_t* p) 80 | { 81 | float d,s; 82 | vec2_t v; 83 | 84 | d = uniform_disk(&v); 85 | s = 2.f*sqrtf(1.f-d); 86 | p->x = s*v.x; 87 | p->y = s*v.y; 88 | p->z = 1.f-2.f*d; 89 | } 90 | 91 | void ln(void) {printf("\n");} 92 | 93 | void vec3_print(vec3_t* v) 94 | { 95 | printf("(%+f,%+f,%+f) ",v->x,v->y,v->z); 96 | } 97 | 98 | void vec3_printa(vec3_t* v) 99 | { 100 | printf("(%+a,%+a,%+a) ",v->x,v->y,v->z); 101 | } 102 | 103 | void matrix_ver(float* m, vec3_t* a, vec3_t* b) 104 | { 105 | vec3_t v; 106 | vec3_t s; 107 | vec3_cross(&v,a,b); 108 | vec3_hmul(&s,&v,&v); 109 | 110 | float d = vec3_dot(a,b); 111 | float r = 1.f/(1.f+d); 112 | float rz = v.z*r; 113 | float xy = v.x*v.y*r; 114 | float xz 
= v.x*rz; 115 | float yz = v.y*rz; 116 | 117 | m[1]= xy-v.z; m[3]= xy+v.z; 118 | m[2]= xz+v.y; m[6]= xz-v.y; 119 | m[5]= yz-v.x; m[7]= yz+v.x; 120 | 121 | m[0]= d+r*s.x; 122 | m[4]= d+r*s.y; 123 | m[8]= d+r*s.z; 124 | } 125 | 126 | void m33_xform(vec3_t* r, float* m, vec3_t* v) 127 | { 128 | r->x = m[0]*v->x + m[1]*v->y + m[2]*v->z; 129 | r->y = m[3]*v->x + m[4]*v->y + m[5]*v->z; 130 | r->z = m[6]*v->x + m[7]*v->y + m[8]*v->z; 131 | } 132 | 133 | float spew(vec3_t* r, vec3_t* a, vec3_t* b, char c) 134 | { 135 | float d = vec3_dot(r,b); 136 | float e = vec3_dot(a,b); 137 | printf("%c: %f ", c, 57.2958f*acosf(d)); 138 | vec3_print(r); 139 | vec3_print(b); 140 | vec3_print(a); 141 | printf("%+f %10f\n", e, 57.2958f*acosf(e)); 142 | return d; 143 | } 144 | 145 | 146 | int main() 147 | { 148 | uint64_t t = __rdtsc(); 149 | 150 | rng_state[0] = t; 151 | rng_state[1] = t ^ _rdtsc(); 152 | float d0 = 1.f; 153 | float d1 = 1.f; 154 | 155 | for(uint32_t i=0; i> np.uint64(35) 39 | self.state ^= self.state << np.uint64(4) 40 | return np.uint64(self.state) 41 | 42 | xorshift = XorShiftRng(1) 43 | 44 | # uint types to binary32 45 | def u64tosp(v): return float(np.uint64(v) >> np.uint64(40))*2**(-24) 46 | def u32tosp(v): return float(np.uint32(v) >> np.uint32(8))*2**(-24) 47 | 48 | # I'm sure this is stupid. 
49 | def mask32(v): return np.uint32(v & 0xFFFFFFFF) 50 | def mul32(a,b): return mask32(a*b) 51 | def add32(a,b): return mask32(a+b) 52 | 53 | # xorshift 54 | def xs(v,s): 55 | return v ^ (v >> np.uint32(s)) 56 | 57 | # xorshift - multiply 58 | def xsm(v,s,m): 59 | v = xs(v,s) 60 | return mul32(m, v) 61 | 62 | # multiply - xorshift 63 | def mxs(v,m,s): 64 | v = mul32(m, v) 65 | return xs(v,s) 66 | 67 | def wang32(seed): 68 | seed = (seed ^ np.uint32(61)) ^ (seed >> np.uint32(16)) 69 | seed = mxs(seed, 9, 4) 70 | seed = mxs(seed, 0x27d4eb2d, 15) 71 | return seed 72 | 73 | # White noise on [0,1) 74 | def whiteNoise(x,y): return u64tosp(xorshift.next()) 75 | 76 | hscale = 1.0/np.sqrt(2.0) 77 | 78 | def haar(a): 79 | if len(a) == 1: 80 | return a.copy() 81 | 82 | mid = (a[0::2] + a[1::2]) * hscale 83 | side = (a[0::2] - a[1::2]) * hscale 84 | 85 | return np.hstack((haar(mid), side)) 86 | 87 | 88 | def haar_2d(img): 89 | h,w = img.shape 90 | rows = np.zeros(img.shape, dtype=float) 91 | 92 | for y in range(h): 93 | rows[y] = haar(img[y]) 94 | 95 | cols = np.zeros(img.shape, dtype=float) 96 | 97 | for x in range(w): 98 | cols[:,x] = haar(rows[:,x]) 99 | 100 | return cols 101 | 102 | # As per: 103 | # http://www.reedbeta.com/blog/2013/01/12/quick-and-easy-gpu-random-numbers-in-d3d11/ 104 | # except perform 2D->1D 105 | def wang(x,y): return u32tosp(wang32(add32(y,wang32(x)))) 106 | 107 | # https://github.com/ashima/webgl-noise 108 | def mod289(x): 109 | return x - math.floor(x * (1.0/289.0))*289.0 110 | 111 | def permute(x): 112 | return mod289(((x*34.0)+1.0)*x) 113 | 114 | def ppoly(x,y): 115 | return permute(y+permute(x)) 116 | 117 | # Murmurhash2 bit finalizer 118 | def mh2bf(x): 119 | x = xsm(x, 0x5bd1e995, 13) 120 | x = xs(x, 15) 121 | return x 122 | 123 | # Murmurhash2 step 124 | def mh2s(x): 125 | x = mxs(x, 0x5bd1e995, 24) 126 | return mul32(x, 0x5bd1e995) 127 | 128 | # Murmurhash2 add x & y to h 129 | def mh2(h,x,y): 130 | x = mh2s(x) 131 | y = mh2s(y) 132 | h ^= 
x 133 | h = mul32(h, 0x5bd1e995) 134 | h ^= y 135 | return h; 136 | 137 | # standard Murmurhash2 construction 138 | def mh2_std(x,y): 139 | h = mh2(0x9747b28c,x,y) 140 | return mh2bf(h) 141 | 142 | 143 | # http://marc-b-reynolds.github.io/math/2016/03/29/weyl_hash.html 144 | def weyl(x,y): 145 | x = mul32(x, 0x3504f333) 146 | y = mul32(y, 0xf1bbcdcb) 147 | x ^= y; 148 | x = mul32(x, 741103597) 149 | return u32tosp(x) 150 | 151 | useSubPlot = False 152 | 153 | def showSPE(name, func): 154 | 155 | # THIS IS COMPLETELY FUBARed: Can't figure out out to show 156 | # pixel exact figures. 157 | 158 | if useSubPlot: 159 | foo = 2.0*float(size)/96.0; 160 | else: 161 | foo = 2.0*float(size)/96.0; 162 | 163 | #py.figure(figsize=(foo,2*(foo+10)), dpi=100) 164 | 165 | samples = np.ones((size, size), dtype=np.float64) 166 | 167 | for y in range(size): 168 | for x in range(size): 169 | samples[y][x] = func(x-offX, y-offY) 170 | 171 | fft1 = fftpack.fft2(samples) # 172 | fft = fftpack.fftshift(fft1) 173 | pse = np.abs(fft)**2 174 | 175 | if useSubPlot: 176 | py.subplot(1,2,1) 177 | else: 178 | py.figure(1, figsize=(foo,foo), dpi=96) 179 | 180 | py.axis("off") 181 | py.title(name) 182 | py.xlabel("space") 183 | py.imshow(samples, cmap=py.cm.Greys, interpolation="nearest") 184 | 185 | if useSubPlot: 186 | py.subplot(1,2,2) 187 | else: 188 | py.figure(2, figsize=(foo,foo), dpi=96) 189 | 190 | py.axis("off") 191 | py.imshow(np.log10(pse), cmap=py.cm.Greys, interpolation="nearest") 192 | py.xlabel("PSE") 193 | 194 | #hist = np.histogram(samples, bins=np.arange(0, 256)) 195 | #py.plot(hist[1][:-1], hist[0], lw=2) 196 | #py.hist2d(np.log10(pse), bins=100) 197 | py.show() 198 | 199 | 200 | showSPE("Reference white noise", whiteNoise) 201 | showSPE("ppoly(y+ppoly(x))", ppoly) 202 | showSPE("wang(y+wang(x))", wang) 203 | showSPE("M(weyl(W0,x)^weyl(W1,y))", weyl) 204 | showSPE("MurmurHash2_BF(MurmurHash(x,y))", mh2_std) 205 | 206 | 207 | 
-------------------------------------------------------------------------------- /src/README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | | Directory | description | 4 | | --------------|-------------| 5 | | Mathematica | junk in * | 6 | | Python | junk in * | 7 | | Posts | support files for blog posts | 8 | | SFH | Public domain single file libraries | 9 | | Sollya | A helper library for creating numeric functions | 10 | | TestAndSearch | | 11 | -------------------------------------------------------------------------------- /src/SFH/.gitignore: -------------------------------------------------------------------------------- 1 | \#* 2 | .\#* 3 | *~ 4 | f64_pair.h 5 | -------------------------------------------------------------------------------- /src/SFH/GF2/.gitignore: -------------------------------------------------------------------------------- 1 | .makedep 2 | lib* 3 | bmat_goof 4 | bmat_goof.c 5 | bmat_goof2.c 6 | bmat_bm.c 7 | obj 8 | backup 9 | examples 10 | -------------------------------------------------------------------------------- /src/SFH/GF2/Makefile: -------------------------------------------------------------------------------- 1 | # Dumb mini makefile just for me: 2 | # 3 | 4 | # if CC still has make's built-in default (not set in the environment nor supplied on the make command line), then pick our own default 5 | ifeq ($(origin CC),default) 6 | CC = clang-15 7 | endif 8 | 9 | LDLIBS = -lm -lm4ri 10 | CFLAGS = -g3 -march=native -Wall -Wextra -Wconversion -Wpedantic -Wno-unused-function 11 | DFLAGS = -DBMAT_DEBUG 12 | 13 | #SRC := ${wildcard *.c} 14 | SRC := bmat_basics.c bmat_block.c bmat_set.c bmat_mul.c bmat_print.c bmat_gauss.c bmat_transpose.c bmat_random.c bmat_ref.c bmat_toeplitz.c bmat_func.c bmat_charpoly.c bmat_m4ri.c bmat_flint.c bmat_pow.c 15 | HEADERS := ${wildcard *.h} 16 | 17 | IDIRS := -I.. 
18 | ODIR := obj 19 | R_OBJ := ${addprefix ${ODIR}/, ${SRC:.c=.u}} 20 | D_OBJ := ${addprefix ${ODIR}/, ${SRC:.c=.o}} 21 | DEPS := ${addprefix ${ODIR}/, ${SRC:.c=.d}} 22 | 23 | all: ${ODIR} libbmat.a 24 | 25 | debug: ${ODIR} libbmat_d.a 26 | 27 | depend: ${DEPS} 28 | 29 | ${ODIR}: 30 | @mkdir ${ODIR} 31 | 32 | spew: 33 | @echo ${R_OBJ} 34 | 35 | libbmat.a: ${R_OBJ} 36 | ${AR} rcs $@ ${R_OBJ} 37 | 38 | libbmat_d.a: ${D_OBJ} 39 | ${AR} rcs $@ ${D_OBJ} 40 | 41 | clean: 42 | @-${RM} ${R_OBJ} ${D_OBJ} 43 | 44 | distclean: clean 45 | @-${RM} ${DEPS} 46 | @-${RM} *~ 47 | 48 | -include ${DEPS} 49 | # create ${ODIR} before the first redirect into it 50 | ${ODIR}/%.d:%.c 51 | @mkdir -p ${ODIR} 52 | @-echo "# autogenerated by Makefile" > $@ 53 | @$(CC) -MM -MQ${ODIR}/${<:.c=.u} ${IDIRS} ${CFLAGS} $< >> $@ 54 | @$(CC) -MM -MQ${ODIR}/${<:.c=.o} ${IDIRS} ${DFLAGS} ${CFLAGS} $< >> $@ 55 | 56 | ${ODIR}/%.u:%.c 57 | ${CC} -O3 -c ${IDIRS} ${CFLAGS} $< -o $@ 58 | 59 | ${ODIR}/%.o:%.c 60 | ${CC} -O1 -c ${IDIRS} ${DFLAGS} ${CFLAGS} $< -o $@ 61 | 62 | .DEFAULT help: 63 | @echo "help : there is no buildsystem" 64 | @echo " make : builds the release library" 65 | @echo " make debug : builds the debug library" 66 | @echo " make clean : deletes the libraries and object files" 67 | @echo " make distclean : clean + kill emacs tempfiles and .makedep file" 68 | # these targets name actions, not files 69 | .PHONY: all debug depend spew clean distclean help 70 | -------------------------------------------------------------------------------- /src/SFH/GF2/README.md: -------------------------------------------------------------------------------- 1 | # Stuff 2 | 3 | 4 | 5 | -------------------------------------------------------------------------------- /src/SFH/GF2/bmat_avx2.h: -------------------------------------------------------------------------------- 1 | // Marc B. Reynolds, 2022-2025 2 | // Public Domain under http://unlicense.org, see link for details. 
3 | 4 | // versions designed for AVX2 (but might be compiled for others via 5 | // SIMDe or similar) 6 | 7 | #pragma once 8 | 9 | extern void bmat_transpose_16_avx2(bmat_rparam_16(d), bmat_param_16(m)); 10 | extern void bmat_transpose_32_avx2(bmat_rparam_32(d), bmat_param_32(s)); 11 | 12 | // bad me. should be here. 13 | //extern void bmat_transpose_64(bmat_rparam_64(d), bmat_param_64(m)); 14 | -------------------------------------------------------------------------------- /src/SFH/GF2/bmat_everything.h: -------------------------------------------------------------------------------- 1 | // Marc B. Reynolds, 2022-2025 2 | // Public Domain under http://unlicense.org, see link for details. 3 | 4 | // just "include every source file here" + "include this" and VOILA! 5 | // SRLY: for no-build system (no configuration communication), multi-config + easy LTO. 6 | // sure it's evil but (shrug) not a library. 7 | 8 | // Done on purpose in some places..including everything will 9 | // make it pop-up. 10 | #if defined(__GNUC__) 11 | #if defined(__clang__) 12 | #else 13 | #pragma GCC diagnostic ignored "-Wrestrict" 14 | #endif 15 | #endif 16 | 17 | #include "bmat_i.h" 18 | #include "bmat_basics.c" 19 | #include "bmat_set.c" 20 | #include "bmat_toeplitz.c" 21 | #include "bmat_block.c" 22 | #include "bmat_gauss.c" 23 | #include "bmat_charpoly.c" 24 | #include "bmat_mul.c" 25 | #include "bmat_pow.c" 26 | #include "bmat_transpose.c" 27 | #include "bmat_random.c" 28 | #include "bmat_func.c" 29 | 30 | #include "bmat_ref.c" 31 | #include "bmat_print.c" 32 | 33 | #if defined(BMAT_M4RI_ENABLE) 34 | #include "bmat_m4ri.c" 35 | #endif 36 | 37 | #if defined(BMAT_FLINT_ENABLE) 38 | #include "bmat_flint.c" 39 | #endif 40 | -------------------------------------------------------------------------------- /src/SFH/GF2/bmat_flint.h: -------------------------------------------------------------------------------- 1 | // Marc B. 
Reynolds, 2022-2025 2 | // Public Domain under http://unlicense.org, see link for details. 3 | 4 | #pragma once 5 | 6 | #include 7 | #include 8 | #include 9 | -------------------------------------------------------------------------------- /src/SFH/GF2/bmat_generic.h: -------------------------------------------------------------------------------- 1 | // Marc B. Reynolds, 2022-2025 2 | // Public Domain under http://unlicense.org, see link for details. 3 | 4 | // internal file for generic 64-bit hardware versions. These 5 | // should use SWAR (except for 64x64..no extra space) 6 | 7 | #pragma once 8 | 9 | extern void bmat_transpose_8_gen (bmat_rparam_8 (d), bmat_param_8 (m)); 10 | extern void bmat_transpose_16_gen(bmat_rparam_16(d), bmat_param_16(m)); 11 | extern void bmat_transpose_32_gen(bmat_rparam_32(d), bmat_param_32(m)) 12 | extern void bmat_transpose_64_gen(bmat_rparam_64(d), bmat_param_64(m)) 13 | -------------------------------------------------------------------------------- /src/SFH/GF2/bmat_m4ri.h: -------------------------------------------------------------------------------- 1 | // Marc B. Reynolds, 2022-2025 2 | // Public Domain under http://unlicense.org, see link for details. 
3 | 4 | #pragma once 5 | 6 | #include 7 | #include 8 | 9 | extern mzd_t* m4ri_alloc_8 (void); 10 | extern mzd_t* m4ri_alloc_16(void); 11 | extern mzd_t* m4ri_alloc_32(void); 12 | extern mzd_t* m4ri_alloc_64(void); 13 | 14 | extern void m4ri_free(mzd_t*); 15 | 16 | extern void bmat_to_m4ri_8 (mzd_t*, bmat_param_8(m)); 17 | extern void bmat_to_m4ri_16(mzd_t*, bmat_param_16(m)); 18 | extern void bmat_to_m4ri_32(mzd_t*, bmat_param_32(m)); 19 | extern void bmat_to_m4ri_64(mzd_t*, bmat_param_64(m)); 20 | 21 | extern void bmat_from_m4ri_8 (bmat_param_8(m), mzd_t*); 22 | extern void bmat_from_m4ri_16(bmat_param_16(m), mzd_t*); 23 | extern void bmat_from_m4ri_32(bmat_param_32(m), mzd_t*); 24 | extern void bmat_from_m4ri_64(bmat_param_64(m), mzd_t*); 25 | 26 | extern mzd_t* m4ri_make_8 (bmat_param_8 (s)); 27 | extern mzd_t* m4ri_make_16(bmat_param_16(s)); 28 | extern mzd_t* m4ri_make_32(bmat_param_32(s)); 29 | extern mzd_t* m4ri_make_64(bmat_param_64(s)); 30 | 31 | extern void m4ri_set_rv_8 (mzd_t* v, uint8_t x); 32 | extern void m4ri_set_rv_16(mzd_t* v, uint16_t x); 33 | extern void m4ri_set_rv_32(mzd_t* v, uint32_t x); 34 | extern void m4ri_set_rv_64(mzd_t* v, uint64_t x); 35 | 36 | extern uint32_t m4ri_get_rv_8 (mzd_t* v); 37 | extern uint32_t m4ri_get_rv_16(mzd_t* v); 38 | extern uint32_t m4ri_get_rv_32(mzd_t* v); 39 | extern uint64_t m4ri_get_rv_64(mzd_t* v); 40 | 41 | // heavy weight wrappers 42 | extern void m4ri_wrap_mm_8 (bmat_param_8(C), bmat_param_8(A), bmat_param_8(B)); 43 | extern void m4ri_wrap_mm_16(bmat_param_16(C), bmat_param_16(A), bmat_param_16(B)); 44 | extern void m4ri_wrap_mm_32(bmat_param_32(C), bmat_param_32(A), bmat_param_32(B)); 45 | extern void m4ri_wrap_mm_64(bmat_param_64(C), bmat_param_64(A), bmat_param_64(B)); 46 | extern void m4ri_wrap_mt_8 (bmat_param_8(C), bmat_param_8(A), bmat_param_8(B)); 47 | extern void m4ri_wrap_mt_16(bmat_param_16(C), bmat_param_16(A), bmat_param_16(B)); 48 | extern void m4ri_wrap_mt_32(bmat_param_32(C), 
bmat_param_32(A), bmat_param_32(B)); 49 | extern void m4ri_wrap_mt_64(bmat_param_64(C), bmat_param_64(A), bmat_param_64(B)); 50 | 51 | extern uint32_t m4ri_wrap_vm_8 (uint32_t V, bmat_param_8 (M)); 52 | extern uint32_t m4ri_wrap_vm_16(uint32_t V, bmat_param_16(M)); 53 | extern uint32_t m4ri_wrap_vm_32(uint32_t V, bmat_param_32(M)); 54 | extern uint64_t m4ri_wrap_vm_64(uint64_t V, bmat_param_64(M)); 55 | extern uint32_t m4ri_wrap_mv_8 (bmat_param_8 (M), uint32_t V); 56 | extern uint32_t m4ri_wrap_mv_16(bmat_param_16(M), uint32_t V); 57 | extern uint32_t m4ri_wrap_mv_32(bmat_param_32(M), uint32_t V); 58 | extern uint64_t m4ri_wrap_mv_64(bmat_param_64(M), uint64_t V); 59 | 60 | extern uint32_t m4ri_wrap_rref_8 (bmat_param_8 (m)); 61 | extern uint32_t m4ri_wrap_rref_16(bmat_param_16(m)); 62 | extern uint32_t m4ri_wrap_rref_32(bmat_param_32(m)); 63 | extern uint32_t m4ri_wrap_rref_64(bmat_param_64(m)); 64 | 65 | extern uint32_t m4ri_wrap_rref2_8 (bmat_param_8 (a),bmat_param_8 (b)); 66 | extern uint32_t m4ri_wrap_rref2_16(bmat_param_16(a),bmat_param_16(b)); 67 | extern uint32_t m4ri_wrap_rref2_32(bmat_param_32(a),bmat_param_32(b)); 68 | extern uint32_t m4ri_wrap_rref2_64(bmat_param_64(a),bmat_param_64(b)); 69 | 70 | extern bool m4ri_wrap_inverse_8 (bmat_param_8 (m)); 71 | extern bool m4ri_wrap_inverse_16(bmat_param_16(m)); 72 | extern bool m4ri_wrap_inverse_32(bmat_param_32(m)); 73 | extern bool m4ri_wrap_inverse_64(bmat_param_64(m)); 74 | -------------------------------------------------------------------------------- /src/SFH/GF2/bmat_pow.c: -------------------------------------------------------------------------------- 1 | // Marc B. Reynolds, 2022-2025 2 | // Public Domain under http://unlicense.org, see link for details. 
3 | 4 | #include "bmat_i.h" 5 | 6 | /// Powers 7 | ///============================================================== 8 | /// 9 | 10 | // nothing done (as you can see) 11 | 12 | 13 | //******************************************************************* 14 | /// ---------- 15 | /// 16 | /// ## bmat_pow2_*n*(m) 17 | /// 18 | /// Computes $ M = M^2 $ 19 | /// 20 | ///
function list: 21 | /// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ c 22 | /// void bmat_pow2_8 (bmat_rparam_8 (m),bmat_param_8 (a)) 23 | /// void bmat_pow2_16(bmat_rparam_16(m),bmat_param_16(a)) 24 | /// void bmat_pow2_32(bmat_rparam_32(m),bmat_param_16(a)) 25 | /// void bmat_pow2_64(bmat_rparam_64(m),bmat_param_16(a)) 26 | /// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 27 | ///
28 | 29 | void bmat_pow2_8 (bmat_rparam_8 (m)) { bmat_defdup_8 (a,m); bmat_mul_8 (m,a,a); } 30 | void bmat_pow2_16(bmat_rparam_16(m)) { bmat_defdup_16(a,m); bmat_mul_16(m,a,a); } 31 | void bmat_pow2_32(bmat_rparam_32(m)) { bmat_defdup_32(a,m); bmat_mul_32(m,a,a); } 32 | void bmat_pow2_64(bmat_rparam_64(m)) { bmat_defdup_64(a,m); bmat_mul_64(m,a,a); } 33 | 34 | 35 | 36 | //******************************************************************* 37 | /// ---------- 38 | /// 39 | /// ## bmat_pow_*n*(m,a,n) 40 | /// 41 | /// Computes $ M = A^n $ 42 | /// 43 | ///
function list: 44 | /// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ c 45 | /// void bmat_pow_8 (bmat_rparam_8 (m), bmat_param_8 (a), uint64_t n) 46 | /// void bmat_pow_16(bmat_rparam_16(m), bmat_param_16(a), uint64_t n) 47 | /// void bmat_pow_32(bmat_rparam_32(m), bmat_param_32(a), uint64_t n) 48 | /// void bmat_pow_64(bmat_rparam_64(m), bmat_param_64(a), uint64_t n) 49 | /// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 50 | ///
51 | 52 | void bmat_pow_8(bmat_rparam_8(m), bmat_param_8(a), uint64_t n) 53 | { 54 | bmat_def_8(s); 55 | bmat_def_8(t); 56 | 57 | bmat_set_unit_8(m); // M=I 58 | bmat_dup_8(s,a); // S=A 59 | 60 | while (n != 0) { 61 | if (n & 1) { 62 | bmat_dup_8(t,m); 63 | bmat_mul_8(m,t,s); // M=M*S 64 | } 65 | 66 | bmat_pow2_8(s); // S=S^2 67 | 68 | n >>= 1; 69 | } 70 | } 71 | 72 | void bmat_pow_16(bmat_rparam_16(m), bmat_param_16(a), uint64_t n) 73 | { 74 | bmat_def_16(s); 75 | bmat_def_16(t); 76 | 77 | bmat_set_unit_16(m); // M=I 78 | bmat_dup_16(s,a); // S=A 79 | 80 | while (n != 0) { 81 | if (n & 1) { 82 | bmat_dup_16(t,m); 83 | bmat_mul_16(m,t,s); // M=M*S 84 | } 85 | 86 | bmat_pow2_16(s); // S=S^2 87 | 88 | n >>= 1; 89 | } 90 | } 91 | 92 | 93 | void bmat_pow_32(bmat_rparam_32(m), bmat_param_32(a), uint64_t n) 94 | { 95 | bmat_def_32(s); 96 | bmat_def_32(t); 97 | 98 | bmat_set_unit_32(m); // M=I 99 | bmat_dup_32(s,a); // S=A 100 | 101 | while (n != 0) { 102 | if (n & 1) { 103 | bmat_dup_32(t,m); 104 | bmat_mul_32(m,t,s); // M=M*S 105 | } 106 | 107 | bmat_pow2_32(s); // S=S^2 108 | 109 | n >>= 1; 110 | } 111 | } 112 | 113 | void bmat_pow_64(bmat_rparam_64(m), bmat_param_64(a), uint64_t n) 114 | { 115 | bmat_def_64(s); 116 | bmat_def_64(t); 117 | 118 | bmat_set_unit_64(m); // M=I 119 | bmat_dup_64(s,a); // S=A 120 | 121 | while (n != 0) { 122 | if (n & 1) { 123 | bmat_dup_64(t,m); 124 | bmat_mul_64(m,t,s); // M=M*S 125 | } 126 | 127 | bmat_pow2_64(s); // S=S^2 128 | 129 | n >>= 1; 130 | } 131 | } 132 | 133 | -------------------------------------------------------------------------------- /src/SFH/GF2/bmat_ref.h: -------------------------------------------------------------------------------- 1 | // Marc B. Reynolds, 2022-2025 2 | // Public Domain under http://unlicense.org, see link for details. 3 | 4 | // internal file for reference versions (self testing). unless I forget the 5 | // reference version should be good on big-endian. 
Specifically they should 6 | // all be non SWAR and be using on the primitive integer type that corresponds 7 | // to 'n' (e.g. bmat_func_8 works on uint8_t) 8 | 9 | #pragma once 10 | 11 | extern void bmat_set_unit_8_ref (bmat_param_8 (m)); 12 | extern void bmat_set_unit_16_ref(bmat_param_16(m)); 13 | extern void bmat_set_unit_32_ref(bmat_param_32(m)); 14 | extern void bmat_set_unit_64_ref(bmat_param_64(m)); 15 | 16 | extern void bmat_set_exchange_8_ref (bmat_param_8 (m)); 17 | extern void bmat_set_exchange_16_ref(bmat_param_16(m)); 18 | extern void bmat_set_exchange_32_ref(bmat_param_32(m)); 19 | extern void bmat_set_exchange_64_ref(bmat_param_64(m)); 20 | 21 | extern uint32_t bmat_rank_8_ref (bmat_param_8 (m)); 22 | extern uint32_t bmat_rank_16_ref(bmat_param_16(m)); 23 | extern uint32_t bmat_rank_32_ref(bmat_param_32(m)); 24 | extern uint32_t bmat_rank_64_ref(bmat_param_64(m)); 25 | 26 | extern void bmat_transpose_8_ref (bmat_param_8 (a), bmat_param_8 (s)); 27 | extern void bmat_transpose_16_ref(bmat_param_16(a), bmat_param_16(s)); 28 | extern void bmat_transpose_32_ref(bmat_param_32(a), bmat_param_32(s)); 29 | extern void bmat_transpose_64_ref(bmat_param_64(a), bmat_param_64(s)); 30 | 31 | extern void bmat_mul_8_ref (bmat_param_8 (r), bmat_param_8 (a), bmat_param_8 (b)); 32 | extern void bmat_mul_16_ref(bmat_param_16(r), bmat_param_16(a), bmat_param_16(b)); 33 | extern void bmat_mul_32_ref(bmat_param_32(r), bmat_param_32(a), bmat_param_32(b)); 34 | extern void bmat_mul_64_ref(bmat_param_64(r), bmat_param_64(a), bmat_param_64(b)); 35 | 36 | extern void bmat_mult_8_ref (bmat_param_8 (c), bmat_param_8 (a), bmat_param_8 (b)); 37 | extern void bmat_mult_16_ref(bmat_param_16(c), bmat_param_16(a), bmat_param_16(b)); 38 | extern void bmat_mult_32_ref(bmat_param_32(c), bmat_param_32(a), bmat_param_32(b)); 39 | extern void bmat_mult_64_ref(bmat_param_64(c), bmat_param_64(a), bmat_param_64(b)); 40 | 41 | extern uint8_t bmat_vmul_8_ref (uint8_t, bmat_param_8 (m)); 42 | 
extern uint16_t bmat_vmul_16_ref(uint16_t, bmat_param_16(m)); 43 | extern uint32_t bmat_vmul_32_ref(uint32_t, bmat_param_32(m)); 44 | extern uint64_t bmat_vmul_64_ref(uint64_t, bmat_param_64(m)); 45 | 46 | extern uint8_t bmat_mulv_8_ref (bmat_param_8 (m), uint8_t); 47 | extern uint16_t bmat_mulv_16_ref(bmat_param_16(m), uint16_t); 48 | extern uint32_t bmat_mulv_32_ref(bmat_param_32(m), uint32_t); 49 | extern uint64_t bmat_mulv_64_ref(bmat_param_64(m), uint64_t); 50 | 51 | extern void bmat_row_lshift_8_ref (bmat_param_8 (d), bmat_param_8 (m), uint32_t s); 52 | extern void bmat_row_lshift_16_ref(bmat_param_16(d), bmat_param_16(m), uint32_t s); 53 | extern void bmat_row_lshift_32_ref(bmat_param_32(d), bmat_param_32(m), uint32_t s); 54 | extern void bmat_row_lshift_64_ref(bmat_param_64(d), bmat_param_64(m), uint32_t s); 55 | 56 | extern void bmat_row_rshift_8_ref (bmat_param_8 (d), bmat_param_8 (m), uint32_t s); 57 | extern void bmat_row_rshift_16_ref(bmat_param_16(d), bmat_param_16(m), uint32_t s); 58 | extern void bmat_row_rshift_32_ref(bmat_param_32(d), bmat_param_32(m), uint32_t s); 59 | extern void bmat_row_rshift_64_ref(bmat_param_64(d), bmat_param_64(m), uint32_t s); 60 | -------------------------------------------------------------------------------- /src/SFH/GF2/tests/.gitignore: -------------------------------------------------------------------------------- 1 | # ignore everything! 
# Dumb mini makefile:
#  0) assumes clang/GCC like options
#  1) every .c file is to be built into an executable

# if CC is the built-in default (neither an environment variable nor supplied
# on the make command line) then pick our own default
ifeq ($(origin CC),default)
CC = clang-15
endif

IDIRS   = -I../.. -I..
CFLAGS  = -g3 -O3 -flto ${IDIRS} -march=native -Wall -Wextra -Wconversion -Wpedantic -Wno-unused-function
LDLIBS  =
ODIR    := obj

# add filtering into group: m4ri, flint and neither
SRC     := ${wildcard *.c}
HEADERS := ${wildcard *.h}
TARGETS := ${SRC:.c=}
SIMDE   := ${SRC:.c=_simde}
DEPS    := ${addprefix ${ODIR}/, ${SRC:.c=.d}}

# none of these name a real file
.PHONY: all SIMDe spew help clean distclean

all: ${TARGETS}

SIMDe: ${SIMDE}

spew:
	@echo ${SRC:.c=_simde}

.DEFAULT help:
	@echo "help : there is no buildsystem"
	@echo " make : builds all the test"
	@echo " make SIMDe : builds all the test (with SIMDe)"
	@echo " make clean : deletes all executables"
	@echo " make distclean : clean + kill emacs tempfiles and makedep file"

clean:
	-${RM} ${TARGETS} ${SIMDE} ${DEPS}

distclean: clean
	-${RM} *~

# The directory is created BEFORE the first redirect into it, otherwise the
# header line is silently dropped on a fresh tree.
${ODIR}/%.d:%.c
	@-echo "building dependencies: " $<
	@mkdir -p ${ODIR}
	@echo "# autogenerated by Makefile" > $@
	@$(CC) -MM -MQ${<:.c=} ${IDIRS} ${CFLAGS} $< >> $@

# anything with a flint suffix requires flint & m4ri. The SIMDe flavour needs
# its own rule: without it "foo_flint_simde" falls through to the generic
# %_simde rule and is linked without the libraries.
%_flint_simde: %_flint.c Makefile
	${CC} -DSFH_USE_SIMDE ${CFLAGS} $< -L.. ${LDLIBS} -lflint -lm4ri -o $@

%_flint: %_flint.c Makefile
	${CC} ${CFLAGS} $< -L.. ${LDLIBS} -lflint -lm4ri -o $@

# anything with a m4ri suffix requires m4ri
%_m4ri_simde: %_m4ri.c Makefile
	${CC} -DSFH_USE_SIMDE ${CFLAGS} $< -L.. ${LDLIBS} -lm4ri -o $@

%_m4ri: %_m4ri.c Makefile
	${CC} ${CFLAGS} $< -L.. ${LDLIBS} -lm4ri -o $@

%_simde:%.c Makefile
	${CC} -DSFH_USE_SIMDE ${CFLAGS} $< -L.. ${LDLIBS} -o $@

%:%.c Makefile
	${CC} ${CFLAGS} $< -L.. ${LDLIBS} -o $@

-include ${DEPS}
// massive temp hack: ANSI escape sequences used to colour the console report
#define HEADER    "\033[95m"
#define OKBLUE    "\033[94m"
#define OKCYAN    "\033[96m"
#define OKGREEN   "\033[92m"
#define WARNING   "\033[93m"
#define FAIL      "\033[91m"
#define ENDC      "\033[0m"
#define BOLD      "\033[1m"
#define UNDERLINE "\033[4m"

// Prints the bold "TESTING: <str>" banner line. `str` is only read,
// so it is taken as const (string literals and const buffers both work).
void test_banner(const char* str)
{
  printf(BOLD " TESTING: %s" ENDC "\n", str);
}


// note: pass returns zero and fail one (so results can be summed
// into an error count).
uint32_t test_fail(void) { printf(FAIL    "FAIL!"  ENDC "\n"); return 1; }
uint32_t test_pass(void) { printf(OKGREEN "passed" ENDC "\n"); return 0; }

// Reports a skipped test with the reason; counts as "not a failure" (returns 0).
uint32_t test_skipped(const char* reason)
{
  printf(WARNING "skipped" ENDC ": %s\n", reason);
  return 0;
}

typedef void (test_fn_vv_t)(void);

// Per-size row prefixes. No newline is printed, so stdout is flushed to make
// the prefix visible before the test itself runs.
void test_header_8 (void) { printf(" 8x8: ");   fflush(stdout); }
void test_header_16(void) { printf(" 16x16: "); fflush(stdout); }
void test_header_32(void) { printf(" 32x32: "); fflush(stdout); }
void test_header_64(void) { printf(" 64x64: "); fflush(stdout); }
58 | typedef struct { 59 | uint32_t n; 60 | test_fn_vv_t* rsize; 61 | 62 | test_fn_v2p_t* dup; 63 | test_fn_v1p_t* set_unit; 64 | 65 | test_fn_v3p_t* add; 66 | test_fn_v2p_t* sum; 67 | 68 | test_fn_b2p_t* equal; 69 | test_fn_b1p_t* is_zero; 70 | 71 | test_fn_d1p_t* rank; 72 | 73 | test_fn_rsample_t* random; 74 | } test_fn_set_t; 75 | 76 | #define TEST_FN_E(N,W) .N = bmat_##N##_##W 77 | 78 | #define TEST_FN_SET(W) \ 79 | { \ 80 | .n = W, \ 81 | .rsize = test_header_##W, \ 82 | TEST_FN_E(dup,W), \ 83 | TEST_FN_E(set_unit,W), \ 84 | TEST_FN_E(add,W), \ 85 | TEST_FN_E(sum,W), \ 86 | TEST_FN_E(equal,W), \ 87 | TEST_FN_E(is_zero,W), \ 88 | TEST_FN_E(rank,W), \ 89 | TEST_FN_E(random,W), \ 90 | } 91 | 92 | test_fn_set_t test_fn_set_8 = TEST_FN_SET( 8); 93 | test_fn_set_t test_fn_set_16 = TEST_FN_SET(16); 94 | test_fn_set_t test_fn_set_32 = TEST_FN_SET(32); 95 | test_fn_set_t test_fn_set_64 = TEST_FN_SET(64); 96 | 97 | test_fn_set_t* fn_set[] = 98 | { 99 | &test_fn_set_8, 100 | &test_fn_set_16, 101 | &test_fn_set_32, 102 | &test_fn_set_64, 103 | }; 104 | 105 | 106 | //****************************************************************************** 107 | // validate two functions are inverses: m = f1(f0(m)) = f0(f1(m)) 108 | static inline uint32_t test_rt_ufunc_n(prng_t* prng, 109 | test_fn_set_t* mset, 110 | test_fn_v2p_t* f0, 111 | test_fn_v2p_t* f1, 112 | uint32_t n) 113 | { 114 | uint64_t m0[64],r0[64],r1[64]; 115 | 116 | mset->rsize(); 117 | 118 | for(uint32_t i=0; irandom(m0,prng); 120 | 121 | f0(r0,m0); 122 | f1(r1,r0); 123 | if (mset->equal(m0,r1)) continue; 124 | 125 | return test_fail(); 126 | } 127 | return test_pass(); 128 | } 129 | 130 | 131 | 132 | 133 | //****************************************************************************** 134 | // validate two functions are the same: f0(m0,m1) = f1(m0,m1) 135 | static inline uint32_t test_eq_bfunc_n(prng_t* prng, 136 | test_fn_set_t* mset, 137 | test_fn_v3p_t* f0, 138 | test_fn_v3p_t* f1, 139 | uint32_t n) 140 | { 141 
| uint64_t m0[64],m1[64],r0[64],r1[64]; 142 | 143 | mset->rsize(); 144 | 145 | if (f0 == f1) 146 | return test_skipped("this configuration uses the reference"); 147 | 148 | for(uint32_t i=0; irandom(m0,prng); 150 | mset->random(m1,prng); 151 | 152 | f0(r0,m0,m1); 153 | f1(r1,m0,m1); 154 | 155 | if (mset->equal(r0,r1)) continue; 156 | 157 | return test_fail(); 158 | } 159 | return test_pass(); 160 | } 161 | 162 | 163 | 164 | -------------------------------------------------------------------------------- /src/SFH/README.md: -------------------------------------------------------------------------------- 1 | # Public domain single header (and not) file libraries 2 | 3 | 4 | http://unlicense.org/ 5 | 6 | High level overview: 7 | 8 | 9 | | file | description | 10 | | -------- |-------------| 11 | | bitops.h | | 12 | | carryless.h | carryless product | 13 | | f32_horner.h | FMA horner's method helpers | 14 | | f32_util.h | | 15 | | f32_horner.h | | 16 | | f64_util.h | | 17 | | intops.h | | 18 | | lcgs.h | random access pseudo random numbers (LCG based) | 19 | | prns.h | random access pseudo random numbers (Weyl based) | 20 | | welford.h | streaming computation of mean, variance, std dev | 21 | | Sobol.h | sobol sequences | 22 | | vec2.h | | 23 | | vec3.h | | 24 | | quat.h | | 25 | | rng_xform.h | | 26 | | swing_twist.h | | 27 | | f64_util.h | | 28 | | int_util.h | | 29 | -------------------------------------------------------------------------------- /src/SFH/bitops_small.h: -------------------------------------------------------------------------------- 1 | // Marc B. Reynolds, 2023-2025 2 | // Public Domain under http://unlicense.org, see link for details. 
// small collection of operations on (unpacked) 8/16 bit values

#ifndef BITOPS_SMALL_H
#define BITOPS_SMALL_H

// Exchanges the bit groups selected by mask M with the groups S bits above
// them and yields the result as type T.
//  * every argument is parenthesized so expressions can be passed safely
//  * the cast covers the WHOLE result (integer promotion would otherwise
//    leave an 'int' and trip -Wconversion when stored back)
//  * X is evaluated twice: do not pass expressions with side effects
#define BIT_PERMUTE_T(T,X,M,S)  ((T)((((X) & (M)) << (S)) | (((X) >> (S)) & (M))))
#define BIT_PERMUTE_8(X,M,S)    BIT_PERMUTE_T(uint8_t,  X,M,S)
#define BIT_PERMUTE_16(X,M,S)   BIT_PERMUTE_T(uint16_t, X,M,S)

// Rotates: the count is reduced modulo the width, so any 'n' is valid and
// n == 0 never produces a full-width shift. 'static inline' (as the
// bit_reverse helpers below) because a plain 'inline' definition in a header
// provides no external definition in C99/C11 and fails to link whenever the
// compiler chooses not to inline the call.
static inline uint8_t  rot_8 (uint8_t  x, uint32_t n) { n &= 0x07; return (uint8_t) ((x<<n) | (x>>(-n & 0x07))); }
static inline uint8_t  ror_8 (uint8_t  x, uint32_t n) { n &= 0x07; return (uint8_t) ((x>>n) | (x<<(-n & 0x07))); }
static inline uint16_t rot_16(uint16_t x, uint32_t n) { n &= 0x0f; return (uint16_t)((x<<n) | (x>>(-n & 0x0f))); }
static inline uint16_t ror_16(uint16_t x, uint32_t n) { n &= 0x0f; return (uint16_t)((x>>n) | (x<<(-n & 0x0f))); }

// rol_* : explicit "rotate left" spellings of rot_*
static inline uint8_t  rol_8 (uint8_t  x, uint32_t n) { return rot_8 (x,n); }
static inline uint16_t rol_16(uint16_t x, uint32_t n) { return rot_16(x,n); }


#if (BITS_HAS_BIT_REVERSE)

// reverse as a 32-bit value and keep the top 8/16 bits
static inline uint8_t  bit_reverse_8 (uint8_t  x) { return (uint8_t) (bit_reverse_32(x) >> 24); }
static inline uint16_t bit_reverse_16(uint16_t x) { return (uint16_t)(bit_reverse_32(x) >> 16); }

#else

// software fallback for no hardware bitreverse. actually
// clang will compile this to the above versions if there is.

static inline uint8_t bit_reverse_8(uint8_t x)
{
  x = BIT_PERMUTE_T(uint8_t, x, 0x0f, 4);   // swap nibbles
  x = BIT_PERMUTE_T(uint8_t, x, 0x33, 2);   // swap bit pairs
  x = BIT_PERMUTE_T(uint8_t, x, 0x55, 1);   // swap adjacent bits
  return x;
}

static inline uint16_t bit_reverse_16(uint16_t x)
{
  x = BIT_PERMUTE_T(uint16_t, x, 0x00ff, 8);  // swap bytes
  x = BIT_PERMUTE_T(uint16_t, x, 0x0f0f, 4);  // swap nibbles
  x = BIT_PERMUTE_T(uint16_t, x, 0x3333, 2);  // swap bit pairs
  x = BIT_PERMUTE_T(uint16_t, x, 0x5555, 1);  // swap adjacent bits
  return x;
}

#endif


#endif
// hint_result_barrier(X)
//   lightweight attempt at a code motion barrier on the value X:
//
//   v = some_computation();
//   hint_result_barrier(v)
//   do other work here { wanting to hide the latency of 'v' }
//   do something with 'v' here
//
// NOTE: the GNU expansion carries its own trailing ';' (kept for existing
// call sites), so do not use it as the sole body of an unbraced if/else.
// TODO: change to Alexander Monakov's chained suggestion
#if defined(__GNUC__)
#define hint_result_barrier(X) __asm__ __volatile__("" : "+r"(X) : "r"(X));
// __asm__ __volatile__("" : "+r"(X) : );
#else
#define hint_result_barrier(X)
#endif

// attempts to hint to the compiler not to promote a computation
// with 'v' into a constant load. use with care and only at point
// of the computation with 'v' to not potentially break constant
// propagation.
//   other_value = op(hint_no_const_fold_n(v))
//
// '__asm__' (not 'asm') so the header also builds with strict -std=c11,
// where the plain keyword is not available.

#if defined(__GNUC__)
#pragma GCC diagnostic push
#if defined(__clang__)
#pragma GCC diagnostic ignored "-Wlanguage-extension-token"
#endif
static inline uint32_t hint_no_const_fold_32(uint32_t v)
{
  __asm__ ("" : "+r" (v));
  return v;
}

static inline uint64_t hint_no_const_fold_64(uint64_t v)
{
  __asm__ ("" : "+r" (v));
  return v;
}
#pragma GCC diagnostic pop
#else
static inline uint32_t hint_no_const_fold_32(uint32_t v) { return v; }
static inline uint64_t hint_no_const_fold_64(uint64_t v) { return v; }
#endif



// hint_rw_barrier() : compiler read/write barrier

#if defined(__GNUC__)
#define hint_rw_barrier() __asm__ __volatile__("": : :"memory")
#elif defined(_MSC_VER)
#define hint_rw_barrier() _ReadWriteBarrier() // deprecated
#else
#include <stdatomic.h>
#define hint_rw_barrier() atomic_signal_fence(memory_order_acq_rel)
#endif

#define hint_pragma(X) _Pragma(#X)

// hint_unroll(N) : place directly before a loop
#if defined(__clang__)
#define hint_unroll(X) hint_pragma(clang loop unroll_count(X))
#elif defined(__GNUC__)
#define hint_unroll(X) hint_pragma(GCC unroll X)
#else
#define hint_unroll(X)
#endif

// because clang loses it's mind about loop unrolling
#if defined(__clang__)
#define hint_no_unroll _Pragma("clang loop unroll(disable)")
#else
#define hint_no_unroll
#endif

// hint_assume(expr) : promise that 'expr' holds. Compilers without a known
// builtin get a no-op (hints must be ignorable) instead of a reference to
// the GNU-only __builtin_unreachable.
#if defined(__clang__)
#define hint_assume(expression) __builtin_assume(expression)
#elif defined(__GNUC__)
#define hint_assume(expression) do { if (!(expression)) __builtin_unreachable(); } while (0)
#elif defined(_MSC_VER)
#define hint_assume(expression) __assume(expression)
#else
#define hint_assume(expression) ((void)0)
#endif

// hint_expect(expr)        : 'expr' is expected to be true
// hint_unpredictable(expr) : 'expr' has no usable bias
// hint_unreachable()       : control never gets here
//
// __builtin_expect takes (value, expected value); the one argument form does
// not compile. NOTE(review): "expected true" is assumed from the name.
#if defined(__GNUC__)
#define hint_expect(expression) __builtin_expect(!!(expression), 1)
#define hint_unreachable()      __builtin_unreachable()
#if defined(__clang__)
#define hint_unpredictable(X)   __builtin_unpredictable(X)
#else
#define hint_unpredictable(X)   __builtin_expect_with_probability(X,1,0.5)
#endif
#elif defined(_MSC_VER)
#define hint_expect(expression) (expression)
#define hint_unpredictable(X)   (X)
#define hint_unreachable()      __assume(0)
#else
#define hint_expect(expression) (expression)
#define hint_unpredictable(X)   (X)
#define hint_unreachable()
#endif


// function attributes: the same three names are defined in every branch
#if defined(__GNUC__)
#define hint_no_inline  __attribute__((__noinline__))
#define hint_pure_func  __attribute__((__pure__))
#define hint_const_func __attribute__((__const__))
#elif defined(_MSC_VER)
#define hint_no_inline  __declspec(noinline)
#define hint_pure_func
#define hint_const_func
#else
#define hint_no_inline
#define hint_pure_func
#define hint_const_func
#endif
// cut-n-paste of xoroshiro128+
// http://xoroshiro.di.unimi.it/xoroshiro128plus.c
uint64_t s[2];

// Rotate left by k bits. The count is reduced modulo 64 so that k == 0
// (or any multiple of 64) cannot produce an undefined 64-bit shift.
static inline uint64_t rotl(const uint64_t x, int k)
{
  const unsigned r = (unsigned)k & 63u;

  return (x << r) | (x >> ((64u - r) & 63u));
}

// Returns the sum of the two state words, then advances the state.
uint64_t next(void) {
  const uint64_t s0 = s[0];
  uint64_t s1 = s[1];
  const uint64_t result = s0 + s1;

  s1 ^= s0;
  s[0] = rotl(s0, 55) ^ s1 ^ (s1 << 14); // a, b
  s[1] = rotl(s1, 36);                   // c

  return result;
}

// end of cut-n-paste

// Prints an error line when expected 'e' and actual 'a' differ; silent
// otherwise. 'msg' is only read.
void spot_check(uint64_t e, uint64_t a, const char* msg)
{
  if (e==a) return;

  printf("error: 0x%0" PRIx64 " 0x%0" PRIx64 " %s\n", e, a, msg);
}
// Reports a mismatch on stderr, tagging it with the two positions supplied
// by the caller, and returns -1 so the caller can 'return error(...)'.
int error(int spos, int cpos)
{
  fprintf(stderr, "opps (%d,%d)\n",spos,cpos);
  return -1;
}
11461428| 34581| 0| 98.920920| 1.075834| 0.003246| 0.000000| 21 | // |k7 |x1| 1|1056653721| 8699496| 0| 0| 99.183417| 0.816583| 0.000000| 0.000000| 22 | // |k7 |x2| 1|1065206869| 146348| 0| 0| 99.986263| 0.013737| 0.000000| 0.000000| 23 | // |k8 |x0| 2|1053891035| 11427570| 34612| 0| 98.924096| 1.072656| 0.003249| 0.000000| 24 | // |k8 |x1| 1|1056690373| 8662844| 0| 0| 99.186857| 0.813143| 0.000000| 0.000000| 25 | // |k8 |x2| 1|1065251470| 101747| 0| 0| 99.990449| 0.009551| 0.000000| 0.000000| 26 | // 27 | // asinpi (abs error "spitballs") 28 | // | K | ~abs error | 29 | // |a0 |1.356564e-03| 30 | // |a1 |1.281016e-04| 31 | // |a2 |1.433492e-05| 32 | // |a3 |1.795590e-06| 33 | // |a4 |2.682209e-07| 34 | // |a5 |7.450581e-08| correct round abs error ~= 1.490116e-08 35 | // 36 | // cospi: (an = abs error "spitballs" kernels) 37 | // | K | ULP | CR | FR | 2 ULP | >2 ULP | CR% | FR% | 2 ULP% | >2 ULP% | ~abs error | 38 | // | a0| 1672415|1763516917| 32348103| 11562178| 323279238| 82.766771| 1.518187| 0.542645| 15.172397|2.108824e-02| 39 | // | a1| 223084|1788031508| 46773361| 12366869| 283534698| 83.917309| 2.195204| 0.580412| 13.307075|1.356602e-03| 40 | // | a2| 30588|1820812274| 55051337| 12702512| 242140313| 85.455802| 2.583713| 0.596164| 11.364321|1.281500e-04| 41 | // | a3| 4487|1852015093| 65150815| 12284275| 201256253| 86.920237| 3.057710| 0.576535| 9.445518|1.436472e-05| 42 | // | a4| 681|1884039738| 76053538| 13127479| 157485681| 88.423243| 3.569405| 0.616109| 7.391243|1.847744e-06| 43 | // | a5| 107|1919663150| 95488708| 25661795| 89892783| 90.095150| 4.481552| 1.204380| 4.218919|2.980232e-07| 44 | // | d7| 1|2063138893| 67567543| 0| 0| 96.828867| 3.171133| 0.000000| 0.000000|5.960464e-08| 45 | // | d8| 1|2121232268| 9474168| 0| 0| 99.555351| 0.444649| 0.000000| 0.000000|5.960464e-08| 46 | 47 | 48 | float f32_asinpi(float x) { f32_asinpi_x2(&f32_asinpi_k6, x); } 49 | -------------------------------------------------------------------------------- 
/src/SFH/f32_math/f32_cbrt.c: -------------------------------------------------------------------------------- 1 | // Public Domain under http://unlicense.org, see link for details. 2 | // Marc B. Reynolds, 2022-2025 3 | 4 | #include 5 | #include 6 | #include 7 | 8 | #include "f32_util.h" 9 | #include "internal/f32_math_common.h" 10 | 11 | // cube root and reciprocal cube root 12 | 13 | // SEE: 14 | // * https://gist.github.com/Marc-B-Reynolds/739a46f55c2a9ead54f4d0629ee5e417 15 | // * "Fast Calculation of Cube and Inverse Cube Roots Using a Magic Constant 16 | // and Its Implementation on Microcontrollers", Moroz, Samotyy, Walczyk, 17 | // Cieslinski, 2021 (algorithm 5 (A5) and algorithm 6) 18 | // https://www.mdpi.com/1996-1073/14/4/1058 19 | 20 | 21 | -------------------------------------------------------------------------------- /src/SFH/f32_math/f32_rsqrt.c: -------------------------------------------------------------------------------- 1 | // Public Domain under http://unlicense.org, see link for details. 2 | // Marc B. Reynolds, 2022-2025 3 | 4 | #include 5 | #include 6 | #include 7 | 8 | #include "f32_util.h" 9 | #include "internal/f32_math_common.h" 10 | #include "internal/f32_rsqrt.h" 11 | 12 | // currently dead file 13 | -------------------------------------------------------------------------------- /src/SFH/f32_math/f32_sincospi.c: -------------------------------------------------------------------------------- 1 | // Public Domain under http://unlicense.org, see link for details. 2 | // Marc B. Reynolds, 2020-2025 3 | 4 | #include "f32_utils.h" 5 | #include "internal/f32_math_common.h" 6 | 7 | -------------------------------------------------------------------------------- /src/SFH/f32_math/internal/f32_asinpi_sb.h: -------------------------------------------------------------------------------- 1 | // Public Domain under http://unlicense.org, see link for details. 2 | // Marc B. 
Reynolds, 2022-2025 3 | 4 | #ifndef F32_ASINPI_SB_H 5 | #define F32_ASINPI_SB_H 6 | 7 | // SEE: 8 | // * (put post link here) 9 | // * f32_acospi_sb.h 10 | // 11 | // implementation is simply: 12 | // asinpi(x) = 1/2 - acospi(x). 13 | // 14 | // which is awful numerically. A better solution would be a cheap 15 | // direct form but a good expression is alluding me. An example is 16 | // that RN(acospi(x)) = 1/2 on [-0x1.921fb4p-24, 0x1.921fb4p-25] 17 | // so the asinpi approximation on this range will be zero instead 18 | // of ~x/pi. But spitballing and abs error (shrug)! 19 | 20 | // given f = P(x) expand approximation restricted to positive input. 21 | // return NaN for negative inputs. 22 | // just hacked: now test blockhead 23 | static inline float f32_asinpi_sb_xp(float (*f)(float), float x) 24 | { 25 | float t = f32_sqrt(1.f-x); 26 | float p = f(x); 27 | 28 | return -fmaf(t,p,-0.5f); 29 | } 30 | 31 | // given f = P(x) expand full range approximation: 32 | // asinpi(x) = sign(x)(1/2 - sqrt(1-|x|)) P(|x|) 33 | static inline float f32_asinpi_sb_xf(float (*f)(float), float x) 34 | { 35 | // currently only hybrid reduction: (SEE: f32_odd_reduce) 36 | // as it generally works well with current Intel & ARM arches 37 | 38 | float a = fabsf(x); 39 | float sx = f32_xor(x, a); 40 | float c = f32_xor(0.5f, sx); // (x>=0) ? .5 : -.5 41 | float t = -f32_sqrt(1.f-a); 42 | float p = f(a); 43 | 44 | // combine the subresults: here the sign is being applied the polynomial 45 | // result 'p' but could instead be applied to sqrt result 't'. ideally 46 | // it would applied to the sub-result that's expected to complete first. 
47 | // roughly very low degree P apply to 'p', otherwise to 't' (as below) 48 | 49 | return fmaf(t, f32_xor(p,sx), c); 50 | } 51 | 52 | 53 | // longer polynomial version (see note above) 54 | // just hacked: now test blockhead 55 | static inline float f32_asinpi_sb_xf_l(float (*f)(float), float x) 56 | { 57 | float a = fabsf(x); 58 | float sx = f32_xor(x, a); 59 | float c = f32_xor(0.5f, sx); 60 | float t = -f32_sqrt(1.f-a); 61 | float p = f(a); 62 | 63 | return fmaf(f32_xor(t,sx), p, c); 64 | } 65 | 66 | 67 | 68 | #endif 69 | -------------------------------------------------------------------------------- /src/SFH/f32_math/internal/f32_math_common.h: -------------------------------------------------------------------------------- 1 | // Public Domain under http://unlicense.org, see link for details. 2 | // Marc B. Reynolds, 2015-2025 3 | 4 | // common approximation utility functions 5 | 6 | #ifndef F32_MATH_COMMON_H 7 | #define F32_MATH_COMMON_H 8 | 9 | #ifndef F32_HORNER2 10 | #include "f32_util.h" 11 | #include "f32_horner.h" 12 | #include "f32_horner2.h" 13 | #endif 14 | 15 | #ifndef F64_HORNER2 16 | #include "f64_util.h" 17 | #include "f64_horner.h" 18 | #include "f64_horner2.h" 19 | #endif 20 | 21 | // given a function 'f' defined for postive 'x': extend to it 22 | // negative numbers as an odd function: f(-x) = -f(x) 23 | // into an odd function. Multiple (compile time) variants. 24 | static inline float f32_odd_reduce(float x, float (*f)(float)) 25 | { 26 | #if defined(F32_ODD_REDUCE_BITOPS) 27 | // purely bit manipulation. 
28 | 29 | uint32_t ix = f32_to_bits(x); // bit pattern of x 30 | uint32_t sx = ix & 0x80000000; // sign(x) {bit } 31 | uint32_t ax = ix ^ sx; // |x| {bits} 32 | float a = f32_from_bits(ax); // |x| 33 | float r = f(a); // core approximation 34 | 35 | return f32_mulsign(r, sx); // restore sign 36 | 37 | #elif defined(F32_ODD_REDUCE_STDFUNCS) 38 | // standard functions only 39 | float a = fabsf(x); 40 | float r = f(a); // core approximation 41 | 42 | return copysignf(r, x); // restore sign 43 | 44 | #else 45 | // default: hybrid. 46 | float a = fabsf(x); 47 | float sx = f32_xor(x,a); // isolated sign bit of 'x' 48 | float r = f(a); // core approximation 49 | 50 | return f32_xor(r, sx); // restore sign 51 | #endif 52 | } 53 | 54 | 55 | #endif 56 | -------------------------------------------------------------------------------- /src/SFH/f32_math/internal/f32_rsqrt.h: -------------------------------------------------------------------------------- 1 | // Public Domain under http://unlicense.org, see link for details. 2 | // Marc B. Reynolds, 2022-2025 3 | 4 | #ifndef F32_RSQRT_H 5 | #define F32_RSQRT_H 6 | 7 | // The "best" simple way to compute 1/sqrt(x) is sqrt(1/x) which is 8 | // faithfully rounded (f32_rsqrt in f32_util.h). 9 | // 10 | // correctly: 86.991060% (14594678) 11 | // faithfully: 13.008940% (2182538) 12 | // 13 | // SEE: 14 | // * https://gist.github.com/Marc-B-Reynolds/9fb24a7a4ee915e6e973bf9f4d08c404 15 | // * "Fast Compensated Algorithms for the Reciprocal Square Root, 16 | // the Reciprocal Hypotenuse, and Givens Rotations", Carlos F. 
Borges, 2021
// https://arxiv.org/abs/2103.08694
// * "High-level algorithms for correctly-rounded reciprocal square roots",
// Borges, Jeannerod & Muller, 2022
// https://hal.inria.fr/hal-03728088

// Newton-Raphson refinement step for 1/sqrt(a) using FMAs.
// As called by f32_rsqrt_hq (with 'a' the value whose reciprocal square
// root is wanted):
//   x = current estimate, sqrt(r)
//   h = -a/2
//   r = RN(1/a)
// Algebraically the correction is (1 - a x^2)/2, built from the two
// FMA residuals below, and the result is x + x*correction.
static inline float f32_rsqrt_nr_step(float x, float h, float r)
{
  float res_r = fmaf(h,r,0.5f);     // 1/2 - a r/2
  float res_x = fmaf(x,x, -r);      // x^2 - r
  float corr  = fmaf(h,res_x, res_r);

  return fmaf(x,corr,x);
}

// Halley's method refinement step for 1/sqrt(a).
// Same parameter meaning as f32_rsqrt_nr_step:
//   x = current estimate, sqrt(r)
//   h = -a/2
//   r = RN(1/a)
// Extends the Newton correction v to v + (3/2) v^2.
static inline float f32_rsqrt_hm_step(float x, float h, float r)
{
  float res_r = fmaf(r,h,0.5f);     // 1/2 - a r/2
  float res_x = fmaf(x,x,-r);       // x^2 - r
  float corr  = fmaf(h,res_x, res_r);
  float corr2 = fmaf(1.5f*corr,corr,corr);

  return fmaf(x,corr2,x);
}

// correctly: 99.999994% (16777215)
// faithfully: 0.000006% (1)
// Borges Algorithm 2 (rsqrtNewton in Borges/Jeannerod/Muller)
// +4 fma & +1 product vs. standard
static inline float f32_rsqrt_hq(float x)
{
  float r   = 1.f/x;                // RN(1/x)
  float h   = -0.5f*x;              // -x/2
  float est = f32_sqrt(r);          // initial estimate

  return f32_rsqrt_nr_step(est,h,r);
}

// correctly rounded (in round to nearest, ties to even)
// Borges Algorithm 3 (rsqrtHalley in Borges/Jeannerod/Muller)
// +5 fma & +1 product vs. standard
// DOUBLE PROMOTION WORKS HERE! (correct this oversight)
// BUT FOR A NARROWER RANGE...find it.
67 | static inline float f32_rsqrt_cb(float x) 68 | { 69 | float r = 1.f/x; 70 | float h = -0.5f*x; 71 | 72 | x = f32_sqrt(r); 73 | x = f32_rsqrt_hm_step(x,h,r); 74 | 75 | return x; 76 | } 77 | 78 | #endif 79 | -------------------------------------------------------------------------------- /src/SFH/f32_math/internal/f32_sincospi.h: -------------------------------------------------------------------------------- 1 | // Public Domain under http://unlicense.org, see link for details. 2 | // Marc B. Reynolds, 2020-2025 3 | 4 | 5 | #ifndef F32_SINCOSPI_H 6 | #define F32_SINCOSPI_H 7 | 8 | // http://marc-b-reynolds.github.io/math/2020/03/11/SinCosPi.html 9 | // original post version with reworked expression. one less constant 10 | // load vs. f32_sinpi_k6 11 | static inline float f32_sinpi_o6(float a) 12 | { 13 | // blog post ordering of coefficients 14 | static const float C[] = 15 | { 0x1.921fb6p1f, -0x1.4abbecp2f, 0x1.466b2p1f, -0x1.2f5992p-1f }; 16 | 17 | float r,a2,a3; 18 | 19 | a2 = a*a; a3 = a2*a; 20 | r = fmaf(C[3], a2, C[2]); 21 | r = fmaf(r, a2, C[1]); 22 | r = a3 * r; 23 | r = fmaf(C[0], a, r); 24 | 25 | return r; 26 | } 27 | 28 | //************* sinpi relative error 29 | 30 | static inline float f32_sinpi_k5(float x) 31 | { 32 | static const float C[] = {0x1.3f3dep1f, -0x1.4aa60ap2f}; 33 | 34 | float s = x*x; 35 | float r; 36 | 37 | r = f32_horner_1(s,C); 38 | r = fmaf(r,s, -0x1.db31b8p-25f); 39 | r = fmaf(x, 0x1.921f8ep1f, x*r); 40 | 41 | return r; 42 | } 43 | 44 | static inline float f32_sinpi_k6(float x) 45 | { 46 | static const float C[] = {-0x1.2d9e5ap-1f, 0x1.465edcp1f, -0x1.4abbbap2f}; 47 | 48 | float s = x*x; 49 | float r; 50 | 51 | r = f32_horner_2(s,C); 52 | r = fmaf(r,s, -0x1.9e5ee4p-24f); 53 | r = fmaf(x, 0x1.921fb6p1f, x*r); 54 | 55 | return r; 56 | } 57 | 58 | static inline float f32_sinpi_k7(float x) 59 | { 60 | static const float C[] = {0x1.48208cp-4f, -0x1.32babap-1f, 0x1.466b8ep1f, -0x1.4abbcep2f}; 61 | 62 | float s = x*x; 63 | float r; 64 
| 65 | r = f32_horner_3(s,C); 66 | r = fmaf(r,s, -0x1.7883b8p-24f); 67 | r = fmaf(x, 0x1.921fb6p1f, x*r); 68 | 69 | return r; 70 | } 71 | 72 | // one more term version only as tiny effect 73 | static inline float f32_sinpi_k8(float x) 74 | { 75 | static const float C[] = {0x1.f095fp-6f, 0x1.36acd8p-4f, -0x1.32a156p-1f, 0x1.466b76p1f, -0x1.4abbcep2f}; 76 | 77 | float s = x*x; 78 | float r; 79 | 80 | r = f32_horner_4(s,C); 81 | r = fmaf(r,s, -0x1.783058p-24f); 82 | r = fmaf(x, 0x1.921fb6p1f, x*r); 83 | 84 | return r; 85 | } 86 | 87 | static inline float f32_sinpi_d6(float v) 88 | { 89 | static const double C[] = {0x1.4bc25574ce357p-4, -0x1.32ca854cad10ep-1, 0x1.466bba8bfcp1, -0x1.4abbce564cd85p2, 0x1.921fb5443af5fp1}; 90 | 91 | double x = (double)v; 92 | double s = x*x; 93 | 94 | return (float)(x*f64_horner_4(s,C)); 95 | } 96 | 97 | 98 | //************* sinpi absolute error 99 | 100 | static inline float f32_sinpi_a5(float x) 101 | { 102 | static const float C[] = {0x1.3e2042p1f, -0x1.4a9ac4p2f}; 103 | 104 | float s = x*x; 105 | float r; 106 | 107 | r = f32_horner_1(s,C); 108 | r = fmaf(r,s, -0x1.12767p-24f); 109 | r = fmaf(x, 0x1.921f32p1f, x*r); 110 | 111 | return r; 112 | } 113 | 114 | static inline float f32_sinpi_a6(float x) 115 | { 116 | static const float C[] = {-0x1.2cf9fep-1f, 0x1.465a5cp1f, -0x1.4abba8p2f}; 117 | 118 | float s = x*x; 119 | float r; 120 | 121 | r = f32_horner_2(s,C); 122 | r = fmaf(r,s, 0x1.d2c0fp-24f); 123 | r = fmaf(x, 0x1.921fb4p1f, x*r); 124 | 125 | return r; 126 | } 127 | 128 | //************* cospi 129 | 130 | // kernel codomain on [sqrt(2)/2, 1] and optimizing for abs outperforms wrt 131 | // relative error as well. 
132 | 133 | // not exact at f(0) 134 | static inline float f32_cospi_k3(float x) 135 | { 136 | static const float C[] = {0x1.f7b478p1f, -0x1.3ba49ep2f, 0x1.fffeb2p-1f}; 137 | 138 | return f32_horner_2(x*x,C); 139 | } 140 | 141 | static inline float f32_cospi_k4(float x) 142 | { 143 | static const float C[] = {-0x1.4ea1aep0f, 0x1.03b132p2f, -0x1.3bd3ap2f, 1.f}; 144 | 145 | return f32_horner_3(x*x,C); 146 | } 147 | 148 | static inline float f32_cospi_k5(float x) 149 | { 150 | static const float C[] = {0x1.d61e28p-3f, -0x1.55b684p0f, 0x1.03c1b6p2f, -0x1.3bd3ccp2f, 1.f}; 151 | 152 | return f32_horner_4(x*x,C); 153 | } 154 | 155 | // promote to double kernel (exact at f(0) due to rounding) 156 | static inline float f32_cospi_d5(float x) 157 | { 158 | static const double C[] = {0x1.d9c364eac5b26p-3, -0x1.55c57b06db5e8p0, 0x1.03c1dc1bafc62p2, -0x1.3bd3cc7323531p2, 0x1.ffffffff97c47p-1}; 159 | 160 | double d = (double)x; 161 | double s = d*d; 162 | 163 | // using second order doesn't change error 164 | return (float)f64_horner2_4(s,C); 165 | } 166 | 167 | #endif 168 | -------------------------------------------------------------------------------- /src/SFH/f32_math/sollya/acospi_sb.sollya: -------------------------------------------------------------------------------- 1 | // -*- mode: c; -*- 2 | 3 | // spitball approximations of acospi(x) 4 | 5 | // set up some globals 6 | E = absolute; 7 | B = [|24...|]; 8 | F = acos(x)/pi; 9 | tx = sqrt(1-x); 10 | 11 | // range set to avoid bad control points 12 | x0 = 0x1.9p-25; // RN(acospi(x)) = 1/2 on [-0x1.921fb4p-24, 0x1.921fb4p-25] 13 | x1 = 1 - 2^-25; // 1 - ulp(1)/2 14 | R = [x0 ; x1]; 15 | 16 | execute("util.sollya"); 17 | execute("cut.sollya"); 18 | 19 | // for acospi(x) : relaxed f(0) 20 | rbasis2 = [|tx,tx*x|]; 21 | rbasis3 = [|tx,tx*x,tx*x^2|]; 22 | rbasis4 = [|tx,tx*x,tx*x^2,tx*x^3|]; 23 | rbasis5 = [|tx,tx*x,tx*x^2,tx*x^3,tx*x^4|]; 24 | rbasis6 = [|tx,tx*x,tx*x^2,tx*x^3,tx*x^4,tx*x^5|]; 25 | rbasis7 = 
[|tx,tx*x,tx*x^2,tx*x^3,tx*x^4,tx*x^5,tx*x^6|]; 26 | rbasis8 = [|tx,tx*x,tx*x^2,tx*x^3,tx*x^4,tx*x^5,tx*x^6,tx*x^7|]; 27 | 28 | // for acospi(x) : exact f(0) 29 | ebasis2 = [|tx*x|]; 30 | ebasis3 = [|tx*x,tx*x^2|]; 31 | ebasis4 = [|tx*x,tx*x^2,tx*x^3|]; 32 | ebasis5 = [|tx*x,tx*x^2,tx*x^3,tx*x^4|]; 33 | ebasis6 = [|tx*x,tx*x^2,tx*x^3,tx*x^4,tx*x^5|]; 34 | ebasis7 = [|tx*x,tx*x^2,tx*x^3,tx*x^4,tx*x^5,tx*x^6|]; 35 | ebasis8 = [|tx*x,tx*x^2,tx*x^3,tx*x^4,tx*x^5,tx*x^6,tx*x^7|]; 36 | 37 | 38 | // f(0) relaxed approx 39 | procedure acospi_build_relaxed(x0, x1) 40 | { 41 | var r; 42 | 43 | r.arange = [x0;x1]; 44 | r.approx = fpminimax(acos(x)/pi, builder_basis, B, r.arange, floating, E); 45 | r.aerror = single(dirtyinfnorm(r.approx-acos(x)/pi, r.arange)); 46 | 47 | return r; 48 | }; 49 | 50 | // f(0) exact approx 51 | procedure acospi_build_exact(x0, x1) 52 | { 53 | var r; 54 | 55 | r.arange = [x0;x1]; 56 | r.approx = fpminimax(acos(x)/pi, builder_basis, B, r.arange, floating, E, tx*0.5); 57 | r.aerror = single(dirtyinfnorm(r.approx-acos(x)/pi, r.arange)); 58 | 59 | return r; 60 | }; 61 | 62 | 63 | // relative error (via f32_build...) 
64 | //display=hexadecimal!; 65 | //f32_build_constrained(F, ebasis5, x0, x1, 0.5*tx).approx; 66 | //f32_build_constrained(F, ebasis6, x0, x1, 0.5*tx).approx; 67 | //f32_build_constrained(F, ebasis7, x0, x1, 0.5*tx).approx; 68 | 69 | // the 2 promote to 64-bit kernels 70 | //f64_build(F, rbasis7, x0, x1).approx; 71 | //f64_build(F, rbasis8, x0, x1).approx; 72 | 73 | 74 | procedure acospi_find_cut_exact(basis) 75 | { 76 | builder_basis=basis; 77 | return find_cut(acospi_build_exact, x0, x1); 78 | }; 79 | 80 | procedure acospi_find_cut_relaxed(basis) 81 | { 82 | builder_basis=basis; 83 | return find_cut(acospi_build_relaxed, x0, x1); 84 | }; 85 | 86 | //**** cut - doesn't seem useful here 87 | 88 | // minimize abs error 89 | //acospi_find_cut_exact(ebasis2); // 0x1.1a6b24p-1 90 | //acospi_find_cut_exact(ebasis3); // 0x1.065e6cp-1 91 | //acospi_find_cut_exact(ebasis4); // 0x1.f65f3p-2 92 | //acospi_find_cut_exact(ebasis5); // 0x1.e838dp-2 93 | //acospi_find_cut_exact(ebasis6); // 0x1.de6d4cp-2 94 | //acospi_find_cut_exact(ebasis7); // 0x1.d6d5f4p-2 95 | 96 | 97 | //E = relative; 98 | // minimize rel error 99 | //acospi_find_cut_exact(ebasis3); // 0x1.22188p-1 100 | //acospi_find_cut_exact(ebasis4); // 0x1.0e778p-1 101 | //acospi_find_cut_exact(ebasis5); // 0x1.02e464p-1 102 | //acospi_find_cut_exact(ebasis6); // 0x1.f64f7p-2 103 | //acospi_find_cut_exact(ebasis7); // 0x1.ebf4b8p-2 104 | 105 | //display=hexadecimal!; 106 | //cut6 = 0x1.de6d4cp-2; 107 | //f32_build_constrained(F, ebasis7, x0, cut6, 0.5*tx).approx; 108 | //f32_build(F, rbasis7, cut6, x1).approx; 109 | -------------------------------------------------------------------------------- /src/SFH/f32_math/sollya/asin_classic.sollya: -------------------------------------------------------------------------------- 1 | // -*- mode: c; -*- 2 | 3 | // classic polynomial approximation of asin on [-1/2,1/2] 4 | // x + x^3 P(x^2) 5 | 6 | // set up some globals 7 | x0 = 0+2^-2048; 8 | x1 = 1/2; 9 | 10 | 
execute("util.sollya");

// Single precision build of: x + x^3 P(x^2)
// Prints the error estimates and the kernel for P, and returns the
// build result.
procedure asin_build(basis, x0,x1)
{
  var r,c;

  // build the approximation and dump P. need to drop
  // the coefficient of 'x' (which is constrained to 1)
  r = f32_build_constrained(asin(x), basis, x0, x1, x);
  c = tail(list_of_odd_coeff(r.approx));

  print("// ~abs error =", r.aerror);
  print("// ~rel error =", r.rerror);
  f32_print_kernel_list("f32_asin_k" @ (length(c)-1) , c);

  return r;
};

P4 = asin_build([|3,5,7,9|], x0,x1);
P5 = asin_build([|3,5,7,9,11|], x0,x1);
P6 = asin_build([|3,5,7,9,11,13|], x0,x1);
P7 = asin_build([|3,5,7,9,11,13,15|], x0,x1);


// Promote-to-double variants of P: same form as asin_build but the
// coefficients are built with f64_build_constrained.
procedure asin_build_dp(basis, x0,x1)
{
  var r,c;

  // build the approximation and dump P. need to drop
  // the coefficient of 'x' (which is constrained to 1)
  r = f64_build_constrained(asin(x), basis, x0, x1, x);
  c = tail(list_of_odd_coeff(r.approx));

  print("// ~abs error =", r.aerror);
  print("// ~rel error =", r.rerror);

  // double coefficients: use the f64 printer and a "_d" kernel name (as
  // atan_build_dp in atan.sollya does) so the output neither goes through
  // the f32 printer nor reuses the asin_build kernel name.
  f64_print_kernel_list("f32_asin_d" @ (length(c)-1) , c);

  return r;
};


// I haven't tried but there doesn't appear to be any "margin" in being fancy.
54 | if (true) then { 55 | P8 = asin_build_dp([|3,5,7,9,11,13,15|], x0,x1); 56 | P9 = fn_build_constrained(asin(x),[|3,5,7,9,11,13,15|], x0,x1,x, [|48,24...|], relative); 57 | PA = asin_build([|3,5,7,9,11,13,15,17|], x0,x1); 58 | P7.rerror; 59 | P8.rerror; 60 | P9.rerror; 61 | PA.rerror; 62 | }; 63 | -------------------------------------------------------------------------------- /src/SFH/f32_math/sollya/asinpi_classic.sollya: -------------------------------------------------------------------------------- 1 | // -*- mode: c; -*- 2 | 3 | // classic polynomial approximation of asinpi on [-1/2,1/2] 4 | // x/pi + x^3 P(x^2) 5 | 6 | execute("util.sollya"); 7 | execute("util.sollya"); 8 | 9 | x0 = 0+2^-2048; 10 | x1 = 1/2; 11 | 12 | // correctly rounded initial term approximations 13 | // asinpi(x) ~= x/pi + x^3 P(x^2) 14 | procedure asinpi_build_cr(basis, x0,x1) 15 | { 16 | var r,c; 17 | 18 | // build the approximation and dump P. need to drop 19 | // the coefficient of 'x' (which is constrained to 1/pi) 20 | r = f32_build_constrained(asin(x)/pi, basis, x0, x1, x/pi); 21 | c = tail(list_of_odd_coeff(r.approx)); 22 | 23 | print("// ~abs error =", r.aerror); 24 | print("// ~rel error =", r.rerror); 25 | f32_print_kernel_list("f32_asinpi_k" @ (length(c)+1) , c); 26 | 27 | return r; 28 | }; 29 | 30 | P4 = asinpi_build_cr([|3,5,7,9|], x0,x1); 31 | P5 = asinpi_build_cr([|3,5,7,9,11|], x0,x1); 32 | P6 = asinpi_build_cr([|3,5,7,9,11,13|], x0,x1); 33 | P7 = asinpi_build_cr([|3,5,7,9,11,13,15|], x0,x1); 34 | 35 | -------------------------------------------------------------------------------- /src/SFH/f32_math/sollya/asinpi_sb.sollya: -------------------------------------------------------------------------------- 1 | // -*- mode: c; -*- 2 | 3 | // attempts at spitball approximations of asinpi(x) 4 | // can't think of an expression form that works well. 5 | // the "magic" of making acospi(x) stems from transforming 6 | // it to asinpi...so not too suprising. 
This file is 7 | // complete junk of tinkering around. 8 | 9 | // set up some globals 10 | T = floating; 11 | E = absolute; 12 | B = [|24...|]; 13 | F = asin(x)/pi; 14 | tx = sqrt(1-x); 15 | x0 = 2^-128; 16 | x1 = 1-2^-25; 17 | 18 | execute("util.sollya"); 19 | 20 | procedure asinpi_make(basis, r) 21 | { 22 | var e,approx,p0,p1; 23 | 24 | print("asinpi(x) on: " @ r); 25 | 26 | approx = fpminimax(F,basis,B,r,T,E); 27 | 28 | display=powers!; write(" "); approx; // for perverted languages without hex float 29 | display=hexadecimal!; write(" "); approx; // for happy languages 30 | display=decimal!; write(" "); approx; // for happy humans 31 | 32 | p0; 33 | 34 | display=hexadecimal!; 35 | e = single(dirtyinfnorm(approx-F, r)); 36 | write(" approx abs error: ", e); 37 | display=decimal!; 38 | write(" (", e, ")\n\n"); 39 | }; 40 | 41 | 42 | // wip 43 | procedure asinpi_build(basis, x0,x1) 44 | { 45 | var r; 46 | 47 | r = f32_build_constrained(asin(x)/pi, basis, x0, x1, x/pi); 48 | 49 | print("// ~abs error =", r.aerror); 50 | print("// ~rel error =", r.rerror); 51 | //f32_print_kernel_list("f32_asin_sb_k" @ (length(c)-1) , c); 52 | 53 | return r; 54 | }; 55 | 56 | 57 | xbasis3 = [|tx,tx*x |]; 58 | xbasis4 = [|tx,tx*x,tx*x^2 |]; 59 | xbasis5 = [|tx,tx*x,tx*x^2,tx*x^3 |]; 60 | 61 | asinpi_build(xbasis3, 2^-128, 1-2^-25).approx; 62 | asinpi_build(xbasis4, 2^-128, 1-2^-25).approx; 63 | asinpi_build(xbasis5, 2^-128, 1-2^-25).approx; 64 | 65 | quit; 66 | 67 | 68 | // for asinpi(x) : at least it converges. sucks 69 | // performance wise. 
70 | basis3 = [|1,x, tx,tx*x |]; 71 | basis4 = [|1,x, tx,tx*x,tx*x^2|]; 72 | basis5 = [|1,x,x^2, tx,tx*x,tx*x^2|]; 73 | basis6 = [|1,x,x^2, tx,tx*x,tx*x^2,tx*x^3|]; 74 | basis7 = [|1,x,x^2,x^3,tx,tx*x,tx*x^2,tx*x^3|]; 75 | 76 | xe = 1-2^-25; 77 | R = [2^-128;xe]; // abs error: 78 | 79 | if (false) then { 80 | asinpi_make(basis3, R); // 0x1.a22000p-12 (3.987551e-4) 81 | asinpi_make(basis4, R); // 0x1.c563f0p-15 (5.404835e-5) 82 | asinpi_make(basis5, R); // 0x1.4b0000p-16 (1.972914e-5) 83 | asinpi_make(basis6, R); // 0x1.c9af1ep-19 (3.410012e-6) 84 | asinpi_make(basis7, R); // 0x1.30bd62p-20 (1.135244e-6) 85 | }; 86 | 87 | if (true) then { 88 | basis3 = [|1,x, tx,tx*x |]; 89 | basis4 = [|1,x, tx,tx*x,tx*x^2|]; 90 | basis5 = [|1,x,x^3, tx,tx*x,tx*x^2|]; 91 | basis6 = [|1,x,x^3, tx,tx*x,tx*x^2,tx*x^3|]; 92 | basis7 = [|1,x,x^3,x^5,tx,tx*x,tx*x^2,tx*x^3|]; 93 | 94 | asinpi_make(basis3, R); // 95 | asinpi_make(basis4, R); // 96 | asinpi_make(basis5, R); // 97 | asinpi_make(basis6, R); // 98 | //asinpi_make(basis7, R); // 99 | }; 100 | -------------------------------------------------------------------------------- /src/SFH/f32_math/sollya/atan.sollya: -------------------------------------------------------------------------------- 1 | // -*- mode: c; -*- 2 | 3 | // classic polynomial approximation of atan on [-tan(pi/8),tan(pi/8)] 4 | // x + x^3 P(x^2) 5 | 6 | // set up some globals 7 | x0 = 0+2^-2048; 8 | x1 = tan(pi/8); 9 | 10 | execute("util.sollya"); 11 | 12 | // x + x^3 P(x^2) 13 | procedure atan_build(basis, x0,x1) 14 | { 15 | var r,c; 16 | 17 | // build the approximation and dump P. 
need to drop 18 | // the coefficient of 'x' (which is constrained to 1) 19 | r = f32_build_constrained(atan(x), basis, x0, x1, x); 20 | c = tail(list_of_odd_coeff(r.approx)); 21 | 22 | print("// ~abs error =", r.aerror); 23 | print("// ~rel error =", r.rerror); 24 | f32_print_kernel_list("f32_atan_k" @ (length(c)-1) , c); 25 | 26 | return r; 27 | }; 28 | 29 | P4 = atan_build([|3,5,7,9|], x0,x1); 30 | P5 = atan_build([|3,5,7,9,11|], x0,x1); 31 | P6 = atan_build([|3,5,7,9,11,13|], x0,x1); 32 | P7 = atan_build([|3,5,7,9,11,13,15|], x0,x1); 33 | 34 | 35 | // for promote to double variants of P 36 | procedure atan_build_dp(basis, x0,x1) 37 | { 38 | var r,c; 39 | 40 | // build the approximation and dump P. need to drop 41 | // the coefficient of 'x' (which is constrained to 1) 42 | r = f64_build_constrained(atan(x), basis, x0, x1, x); 43 | c = tail(list_of_odd_coeff(r.approx)); 44 | 45 | print("// ~abs error =", r.aerror); 46 | print("// ~rel error =", r.rerror); 47 | f64_print_kernel_list("f32_atan_d" @ (length(c)-1) , c); 48 | 49 | return r; 50 | }; 51 | 52 | P4 = atan_build_dp([|3,5,7,9|], x0,x1); 53 | P5 = atan_build_dp([|3,5,7,9,11|], x0,x1); 54 | P6 = atan_build_dp([|3,5,7,9,11,13|], x0,x1); 55 | //P7 = atan_build_dp([|3,5,7,9,11,13,15|], x0,x1); 56 | 57 | -------------------------------------------------------------------------------- /src/SFH/f32_math/sollya/cospi.sollya: -------------------------------------------------------------------------------- 1 | // -*- mode: c; -*- 2 | 3 | // polynomial approximations of cospi on [0,1/4] 4 | 5 | execute("util.sollya"); 6 | 7 | x0 = 0; 8 | x1 = 1/4; 9 | 10 | procedure cospi_build(basis, x0,x1) 11 | { 12 | var r; 13 | 14 | r = f32_build_ae(cos(pi*x), basis, x0, x1); 15 | 16 | print("// ~abs error =", r.aerror); 17 | print("// ~rel error =", r.rerror); 18 | f32_print_kernel_even("f32_cospi_k", r.approx); 19 | 20 | return r; 21 | }; 22 | 23 | basis = [|0,2|]; 24 | 25 | for i from 4 to 10 by 2 do { 26 | var approx; 27 | 28 | 
basis = basis :.i;
  approx = cospi_build(basis, x0,x1);
  print("");
};


//f64_build_constrained_ae(cos(pi*x), [|2,4,6,8|], x0, x1, 1);
a = f64_build_ae(cos(pi*x), [|0,2,4,6,8|], x0, x1);
a;
display=hexadecimal!;
revert(list_of_even_coeff(a.approx));

--------------------------------------------------------------------------------
/src/SFH/f32_math/sollya/cut.sollya:
--------------------------------------------------------------------------------
// -*- mode: c; -*-

// hacky garbage WIP


// sigh. not even started

//
// f : function being approximated
// build : function to build approximation (returns error measure)
// range : range of the approximation
// c0,c1 : subrange to search of the 'cut'

/*
procedure find_cut_same_exp(f,build,range,basis,c0,c1)
{
  var cut,scale,r0,r1;

  cut = c0;
  scale = exponent(c0);

  if (scale == exponent(c1)) then {
    var i0,i1;
    i0 = mantissa(c0);
    i1 = mantissa(c1);
    scale = 2^(-scale);


  }
  else {
    print("find_cut_same_exp: search bounds must have the same exponent");
  };

  return cut;
};
*/

// WIP (cut-n-paste junk atm): assumes neither odd nor even
// Currently only switches the display mode and restores it; the code
// generation itself is still commented out.
procedure build_cut(capprox)
{
  var c0,c1;
  var d;

  // remember the caller's display mode so it can be restored on exit
  // (same save/restore pattern as find_cut_initial)
  d = display;

  display=decimal!;

  /*
  write("static inline f32_", name @ "_k" @ (length(c)+1) @ "(float x)\n{\n static ");
  f32_write_list("C", c);
  print("");
  print(" float s = x*x;");
  print(" float r;\n");
  print(" r = f32_horner_" @ (length(c)-1) @ "(s,C);");
  display=hexadecimal!;
  print(" r = fmaf(r,s,", e.l @ "f)");
  print(" r = fmaf(x,", e.h @ "f, x*r);");
  print("");
  print(" return r;");
  print("}\n");
  */

  display=d!;
};


// Bisection search for a 'cut' point splitting [x0,x1] into two
// sub-approximations with balanced absolute error.
//   build : procedure (lo,hi) -> structure with an .aerror field
//   x0,x1 : full range
//   cut   : initial cut position
// Runs a fixed 32 iterations, printing progress, and returns a structure
// with the last pair of builds in .lo (on [x0,cut]) and .hi (on [cut,x1]).
procedure find_cut_initial(build,x0,x1,cut)
{
  var d;
  var e_diff,t;
  var c_lo,c_hi;
  var cnt;
  var lo,hi,r;

  d = display;
  cnt = 1;
  c_lo = x0;
  c_hi = x1;

  lo = build(x0,cut);
  hi = build(cut,x1);
  display = hexadecimal!;
  e_diff = abs(lo.aerror-hi.aerror);

  print(" cut=", cut, " [", c_lo,",",c_hi,"]");

  for cnt from 1 to 32 do {
    display=decimal!;

    // move the bracket toward the side with the larger error
    if (lo.aerror < hi.aerror) then {
      c_lo = cut;
    } else {
      c_hi = cut;
    };

    t = abs(lo.aerror-hi.aerror);

    // '+' marks an iteration that improved the error balance
    if (t < e_diff) then { e_diff=t; write("+"); } else { write(" "); };

    cut = single((c_lo+c_hi)*0.5);
    lo = build(x0,cut);
    hi = build(cut,x1);

    if (cnt < 10) then { write(" "); };
    write(cnt, ": ");
    display=hexadecimal!;
    write(lo.aerror, " ", hi.aerror);
    print(" cut=", cut, " [", c_lo,",",c_hi,"]", single(e_diff));
  };

  display = d!;

  r.lo = lo;
  r.hi = hi;

  return r;
};



// helper: initial cut at the middle of the range.
// Returns the result of find_cut_initial so that wrappers doing
// 'return find_cut(...)' actually hand back the .lo/.hi structure.
procedure find_cut(build,x0,x1)
{
  return find_cut_initial(build,x0,x1,single(0.5*(x0+x1)));
};
--------------------------------------------------------------------------------
/src/SFH/f32_math/sollya/sinpi.sollya:
--------------------------------------------------------------------------------
// -*- mode: c; -*-

// polynomial approximations of sinpi on [0,1/4]

execute("util.sollya");

x0 = 0;
x1 = 1/4;

procedure sinpi_build(basis, x0,x1)
{
  var r;

  r = f32_build_hp_lead(sin(pi*x), basis, x0, x1);

  print("// ~abs error =", r.aerror);
  print("// ~rel error =", r.rerror);
  f32_print_kernel_odd_xp_lead("sinpi", r.approx);

  return r;
};

print("// sinpi relative error. 
extended precision lead coeff"); 24 | 25 | basis = [|1,3|]; 26 | 27 | for i from 5 to 9 by 2 do 28 | { 29 | var approx; 30 | 31 | basis = basis :.i; 32 | approx = sinpi_build(basis, x0,x1); 33 | print(""); 34 | }; 35 | 36 | 37 | procedure sinpi_build_ae(basis, x0,x1) 38 | { 39 | var r; 40 | 41 | r = f32_build_hp_lead_ae(sin(pi*x), basis, x0, x1); 42 | 43 | print("// ~abs error =", r.aerror); 44 | print("// ~rel error =", r.rerror); 45 | f32_print_kernel_odd_xp_lead("sinpia", r.approx); 46 | 47 | return r; 48 | }; 49 | 50 | 51 | print("// sinpi absolute error. extended precision lead coeff"); 52 | 53 | basis = [|1,3|]; 54 | x0 = 2^-1024; 55 | 56 | for i from 5 to 7 by 2 do { 57 | var approx; 58 | 59 | basis = basis :.i; 60 | approx = sinpi_build_ae(basis, x0,x1); 61 | print(""); 62 | }; 63 | 64 | 65 | d6 = f64_build(sin(pi*x), [|1,3,5,7,9|], x0, x1); 66 | d6.approx; 67 | f64_print_kernel_odd("foo", d6.approx); 68 | -------------------------------------------------------------------------------- /src/SFH/f32_math/tests/.gitignore: -------------------------------------------------------------------------------- 1 | .makedep 2 | *.txt 3 | cbrt 4 | asin 5 | acos 6 | atan 7 | asinpi 8 | acospi 9 | atanpi 10 | sin 11 | cos 12 | sinpi 13 | cospi 14 | cosh 15 | sinh 16 | acosh 17 | asinh 18 | exp 19 | expm1 20 | log2 21 | log2p1 22 | log 23 | log21 24 | rsqrt 25 | -------------------------------------------------------------------------------- /src/SFH/f32_math/tests/Makefile: -------------------------------------------------------------------------------- 1 | # Dumb mini makefile: 2 | # 0) assumes clang/GCC like options 3 | # 1) every .c file is to be built into an executable 4 | 5 | # if CC is the default (not environment varible nor supplied to make, then default 6 | ifeq ($(origin CC),default) 7 | CC = clang-15 8 | endif 9 | 10 | IDIRS = -I../.. -I.. 
11 | CFLAGS = -g -O3 ${IDIRS} -march=native -Wall -Wextra -Wconversion -Wpedantic -Wno-unused-function -fno-math-errno -ffp-contract=off 12 | LDLIBS = -lm 13 | 14 | SRC := ${wildcard *.c} 15 | HEADERS := ${wildcard *.h} 16 | TARGETS := ${SRC:.c=} 17 | 18 | all: ${TARGETS} 19 | 20 | clean: 21 | -${RM} ${TARGETS} 22 | 23 | distclean: clean 24 | -${RM} .makedep *~ 25 | 26 | .makedep: 27 | @-echo "building dependencies" 28 | @-echo "# autogenerated by Makefile" > .makedep 29 | @$(foreach file, $(SRC), ${CC} ${IDIRS} -MM -MQ${file:.c=} $(file) >> .makedep;) 30 | 31 | 32 | %:%.c Makefile 33 | ${CC} ${CFLAGS} $< ${LDLIBS} -o $@ 34 | 35 | -include .makedep 36 | -------------------------------------------------------------------------------- /src/SFH/f32_math/tests/README.md: -------------------------------------------------------------------------------- 1 | Super hacky correctness test 2 | -------------------------------------------------------------------------------- /src/SFH/f32_math/tests/backup/atan.c: -------------------------------------------------------------------------------- 1 | // Public Domain under http://unlicense.org, see link for details. 2 | // 3 | // *****EXCEPT:************************ 4 | // 1) reference (cr_atanf) version: 5 | // ************************************ 6 | // 7 | // The CORE-MATH routine fall under: 8 | // 9 | // Permission is hereby granted, free of charge, to any person obtaining a copy 10 | // of this software and associated documentation files (the "Software"), to deal 11 | // in the Software without restriction, including without limitation the rights 12 | // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 13 | //copies of the Software, and to permit persons to whom the Software is 14 | // furnished to do so, subject to the following conditions: 15 | // 16 | // The above copyright notice and this permission notice shall be included in all 17 | // copies or substantial portions of the Software. 
18 | // 19 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 20 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 21 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 22 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 23 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 24 | // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 25 | // SOFTWARE. 26 | 27 | 28 | // compile with (or equiv). 29 | // clang -O3 -march=native -Wall -Wextra -Wconversion -Wpedantic -Wno-unused-function -fno-math-errno -ffp-contract=off atanf.c -o atanf -lm 30 | 31 | #include 32 | #include 33 | #include 34 | #include 35 | #include 36 | #include 37 | 38 | #include "util.h" 39 | 40 | //********************************************************************** 41 | 42 | float libm(float x) { return atanf(x); } 43 | 44 | 45 | //********************************************************************** 46 | // SEE: https://core-math.gitlabpages.inria.fr 47 | // and license info at top of file. 48 | 49 | // oh my! 
50 | #pragma GCC diagnostic push 51 | #pragma GCC diagnostic ignored "-Wpragmas" 52 | #pragma GCC diagnostic ignored "-Wpedantic" 53 | #pragma GCC diagnostic ignored "-Wunknown-warning-option" 54 | #pragma GCC diagnostic ignored "-Wconversion" 55 | #pragma GCC diagnostic ignored "-Wsign-conversion" 56 | #pragma GCC diagnostic ignored "-Wshorten-64-to-32" 57 | #pragma GCC diagnostic ignored "-Wimplicit-float-conversion" 58 | #pragma GCC diagnostic ignored "-Wimplicit-int-conversion" 59 | #pragma GCC diagnostic ignored "-Wfloat-conversion" 60 | #pragma GCC diagnostic ignored "-Wmaybe-uninitialized" 61 | 62 | #include 63 | 64 | typedef union {float f; uint32_t u;} b32u32_u; 65 | typedef union {double f; uint64_t u;} b64u64_u; 66 | typedef uint64_t u64; 67 | 68 | float cr_atanf(float x){ 69 | const double pi2 = 0x1.921fb54442d18p+0; 70 | b32u32_u t = {.f = x}; 71 | int e = (t.u>>23)&0xff, gt = e>=127; 72 | if(__builtin_expect(e==0xff, 0)) { 73 | if(t.u<<9) return x; // nan 74 | return __builtin_copysign(pi2,(double)x); // inf 75 | } 76 | if (__builtin_expect(e<127-13, 0)){ 77 | if (__builtin_expect(e<127-25, 0)) 78 | return __builtin_fmaf(-x, __builtin_fabsf(x), x); 79 | return __builtin_fmaf(-0x1.5555555555555p-2f*x, x*x, x); 80 | } 81 | /* now |x| >= 0x1p-13 */ 82 | double z = x; 83 | if (gt) z = 1/z; /* gt is non-zero for |x| >= 1 */ 84 | double z2 = z*z, z4 = z2*z2, z8 = z4*z4; 85 | /* polynomials generated using rminimax 86 | (https://gitlab.inria.fr/sfilip/rminimax) with the following command: 87 | ./ratapprox --function="atan(x)" --dom=[0.000122070,1] --num=[x,x^3,x^5,x^7,x^9,x^11,x^13] --den=[1,x^2,x^4,x^6,x^8,x^10,x^12] --output=atanf.sollya --log 88 | (see output atanf.sollya) 89 | The coefficient cd[0] was slightly reduced from the original value 90 | 0x1.51eccde075d67p-2 to avoid an exceptional case for |x| = 0x1.1ad646p-4 91 | and rounding to nearest. 
92 | */ 93 | static const double cn[] = 94 | {0x1.51eccde075d67p-2, 0x1.a76bb5637f2f2p-1, 0x1.81e0eed20de88p-1, 95 | 0x1.376c8ca67d11dp-2, 0x1.aec7b69202ac6p-5, 0x1.9561899acc73ep-9, 96 | 0x1.bf9fa5b67e6p-16}; 97 | static const double cd[] = 98 | {0x1.51eccde075d66p-2, 0x1.dfbdd7b392d28p-1, 0x1p+0, 99 | 0x1.fd22bf0e89b54p-2, 0x1.d91ff8b576282p-4, 0x1.653ea99fc9bbp-7, 100 | 0x1.1e7fcc202340ap-12}; 101 | double cn0 = cn[0] + z2*cn[1]; 102 | double cn2 = cn[2] + z2*cn[3]; 103 | double cn4 = cn[4] + z2*cn[5]; 104 | double cn6 = cn[6]; 105 | cn0 += z4*cn2; 106 | cn4 += z4*cn6; 107 | cn0 += z8*cn4; 108 | cn0 *= z; 109 | double cd0 = cd[0] + z2*cd[1]; 110 | double cd2 = cd[2] + z2*cd[3]; 111 | double cd4 = cd[4] + z2*cd[5]; 112 | double cd6 = cd[6]; 113 | cd0 += z4*cd2; 114 | cd4 += z4*cd6; 115 | cd0 += z8*cd4; 116 | double r = cn0/cd0; 117 | if (!gt) return r; /* for |x| < 1, (float) r is correctly rounded */ 118 | 119 | /* now |x| >= 1 */ 120 | r = __builtin_copysign(0x1.0fdaa22168c23p-7, z) - r + __builtin_copysign(0x1.9p0, z); 121 | return r; 122 | } 123 | 124 | #pragma GCC diagnostic pop 125 | 126 | //******************************************************** 127 | 128 | 129 | func_entry_t func_table[] = 130 | { 131 | ENTRY(libm), 132 | }; 133 | 134 | const char* func_name = "atan"; 135 | 136 | float cr_func(float x) { return cr_atanf(x); } 137 | 138 | #include "common.h" 139 | 140 | //******************************************************** 141 | 142 | int main(void) 143 | { 144 | error_dump(); 145 | 146 | return 0; 147 | } 148 | -------------------------------------------------------------------------------- /src/SFH/f32_math/tests/backup/atanpi.c: -------------------------------------------------------------------------------- 1 | // Public Domain under http://unlicense.org, see link for details. 
2 | // 3 | // *****EXCEPT:************************ 4 | // 1) reference (cr_atanpif) version: 5 | // ************************************ 6 | // 7 | // The CORE-MATH routine falls under: 8 | // 9 | // Permission is hereby granted, free of charge, to any person obtaining a copy 10 | // of this software and associated documentation files (the "Software"), to deal 11 | // in the Software without restriction, including without limitation the rights 12 | // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 13 | //copies of the Software, and to permit persons to whom the Software is 14 | // furnished to do so, subject to the following conditions: 15 | // 16 | // The above copyright notice and this permission notice shall be included in all 17 | // copies or substantial portions of the Software. 18 | // 19 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 20 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 21 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 22 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 23 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 24 | // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 25 | // SOFTWARE. 26 | 27 | 28 | // compile with (or equiv). 29 | // clang -O3 -march=native -Wall -Wextra -Wconversion -Wpedantic -Wno-unused-function -fno-math-errno -ffp-contract=off atanpi.c -o atanpi -lm 30 | 31 | #include 32 | #include 33 | #include 34 | #include 35 | #include 36 | #include 37 | 38 | #include "util.h" 39 | 40 | //********************************************************************** 41 | 42 | float libm(float x) { return atanf(x)/((float)M_PI); } 43 | 44 | 45 | //********************************************************************** 46 | // SEE: https://core-math.gitlabpages.inria.fr 47 | // and license info at top of file. 
48 | 49 | // oh my! 50 | #pragma GCC diagnostic push 51 | #pragma GCC diagnostic ignored "-Wpragmas" 52 | #pragma GCC diagnostic ignored "-Wpedantic" 53 | #pragma GCC diagnostic ignored "-Wunknown-warning-option" 54 | #pragma GCC diagnostic ignored "-Wconversion" 55 | #pragma GCC diagnostic ignored "-Wsign-conversion" 56 | #pragma GCC diagnostic ignored "-Wshorten-64-to-32" 57 | #pragma GCC diagnostic ignored "-Wimplicit-float-conversion" 58 | #pragma GCC diagnostic ignored "-Wimplicit-int-conversion" 59 | #pragma GCC diagnostic ignored "-Wfloat-conversion" 60 | #pragma GCC diagnostic ignored "-Wmaybe-uninitialized" 61 | 62 | #include 63 | 64 | typedef union {float f; uint32_t u;} b32u32_u; 65 | 66 | float cr_atanpif(float x){ 67 | b32u32_u t = {.f = x}; 68 | int32_t e = (t.u>>23)&0xff, gt = e>=127; 69 | if(__builtin_expect(e>127+24, 0)) { 70 | float f = __builtin_copysignf(0.5f, x); 71 | if(__builtin_expect(e==0xff, 0)) { 72 | if(t.u<<9) return x; // nan 73 | return f; // inf 74 | } 75 | return f - 0x1.45f306p-2f/x; 76 | } 77 | double z = x; 78 | if (__builtin_expect(e<127-13, 0)){ 79 | double sx = z*0x1.45f306dc9c883p-2; 80 | if (__builtin_expect(e<127-25, 0)) return sx; 81 | return sx - (0x1.5555555555555p-2*sx)*(x*x); 82 | } 83 | unsigned ax = t.u&(~0u>>1); 84 | if(__builtin_expect(ax == 0x3fa267ddu, 0)) return __builtin_copysignf(0x1.267004p-2f,x) - __builtin_copysignf(0x1p-55f,x); 85 | if(__builtin_expect(ax == 0x3f693531u, 0)) return __builtin_copysignf(0x1.e1a662p-3f,x) + __builtin_copysignf(0x1p-28f,x); 86 | if(__builtin_expect(ax == 0x3f800000u, 0)) return __builtin_copysignf(0x1p-2f,x); 87 | if(gt) z = 1/z; 88 | double z2 = z*z, z4 = z2*z2, z8 = z4*z4; 89 | static const double cn[] = 90 | {0x1.45f306dc9c882p-2, 0x1.733b561bc23d5p-1, 0x1.28d9805bdfbf2p-1, 91 | 0x1.8c3ba966ae287p-3, 0x1.94a7f81ee634bp-6, 0x1.a6bbf6127a6dfp-11}; 92 | static const double cd[] = 93 | {0x1p+0, 0x1.4e3b3ecc2518fp+1, 0x1.3ef4a360ff063p+1, 0x1.0f1dc55bad551p+0, 94 | 
0x1.8da0fecc018a4p-3, 0x1.8fa87803776bfp-7, 0x1.dadf2ca0acb43p-14}; 95 | double cn0 = cn[0] + z2*cn[1]; 96 | double cn2 = cn[2] + z2*cn[3]; 97 | double cn4 = cn[4] + z2*cn[5]; 98 | cn0 += z4*cn2; 99 | cn0 += z8*cn4; 100 | cn0 *= z; 101 | double cd0 = cd[0] + z2*cd[1]; 102 | double cd2 = cd[2] + z2*cd[3]; 103 | double cd4 = cd[4] + z2*cd[5]; 104 | double cd6 = cd[6]; 105 | cd0 += z4*cd2; 106 | cd4 += z4*cd6; 107 | cd0 += z8*cd4; 108 | double r = cn0/cd0; 109 | if (gt) r = __builtin_copysign(0.5, z) - r; 110 | return r; 111 | } 112 | 113 | #pragma GCC diagnostic pop 114 | 115 | //******************************************************** 116 | 117 | 118 | func_entry_t func_table[] = 119 | { 120 | ENTRY(libm), 121 | }; 122 | 123 | const char* func_name = "atanpi"; 124 | 125 | float cr_func(float x) { return cr_atanpif(x); } 126 | 127 | #include "common.h" 128 | 129 | //******************************************************** 130 | 131 | int main(void) 132 | { 133 | 134 | error_dump(); 135 | 136 | return 0; 137 | } 138 | -------------------------------------------------------------------------------- /src/SFH/f32_math/tests/backup/common.h: -------------------------------------------------------------------------------- 1 | #ifndef __COMMON_H__ 2 | #define __COMMON_H__ 3 | 4 | typedef struct { 5 | float abs; 6 | uint32_t max; 7 | uint32_t ulp[4]; 8 | } func_error_t; 9 | 10 | // tracks total error data across multiple intervals 11 | func_error_t func_error[LENGTHOF(func_table)] = {{0}}; 12 | 13 | static const uint32_t f32_sign_bit_k = 0x80000000; 14 | 15 | static inline uint32_t f32_ulp_dist(float a, float b) 16 | { 17 | uint32_t ua = f32_to_bits(a); 18 | uint32_t ub = f32_to_bits(b); 19 | 20 | if ((int32_t)(ub^ua) >= 0) 21 | return u32_abs(ua-ub); 22 | 23 | return ua+ub+f32_sign_bit_k; 24 | } 25 | 26 | // add error data from current interval to the totals 27 | void error_to_totals(func_error_t* e) 28 | { 29 | for(uint32_t fi=0; fi < LENGTHOF(func_table); fi++, e++) { 30 
| func_error[fi].ulp[0] += e->ulp[0]; 31 | func_error[fi].ulp[1] += e->ulp[1]; 32 | func_error[fi].ulp[2] += e->ulp[2]; 33 | func_error[fi].ulp[3] += e->ulp[3]; 34 | 35 | if (e->max > func_error[fi].max) func_error[fi].max = e->max; 36 | if (e->abs > func_error[fi].abs) func_error[fi].abs = e->abs; 37 | } 38 | } 39 | 40 | void error_dump_i(func_error_t* e) 41 | { 42 | printf("|%15s|%10s|%10s|%10s|%10s|%10s|%10s|%10s|%10s|%10s|%12s|\n", 43 | "func", "max ULP", "CR", "FR", "2 ULP", "> 2 ULP", 44 | "CR%", "FR%", "2 ULP%","> 2 ULP%", "abs"); 45 | 46 | printf("|%15s|%10s|%10s|%10s|%10s|%10s|%10s|%10s|%10s|%10s|%12s|\n", 47 | "---:", "---:", "---:", "---:", "---:", "---:", 48 | "---:", "---:", "---:","---:", "---:"); 49 | 50 | for(uint32_t fi=0; fi < LENGTHOF(func_table); fi++, e++) { 51 | uint32_t u0 = e->ulp[0]; 52 | uint32_t u1 = e->ulp[1]; 53 | uint32_t u2 = e->ulp[2]; 54 | uint32_t u3 = e->ulp[3]; 55 | uint32_t t = (u0+u1+u2+u3); 56 | double s = 100.0/(double)t; 57 | 58 | printf("|%15s|%10u|%10u|%10u|%10u|%10u|%10f|%10f|%10f|%10f|%e|\n", 59 | func_table[fi].name, e->max, 60 | u0, u1, u2, u3, 61 | s*u0, s*u1, s*u2, s*u3, e->abs 62 | ); 63 | } 64 | } 65 | 66 | void error_dump() 67 | { 68 | printf("\nTOTAL: %s\n", func_name); 69 | error_dump_i(func_error); 70 | } 71 | 72 | 73 | static inline void brute_error_add(func_error_t* error, float e, float a) 74 | { 75 | float d = fabsf(e-a); 76 | 77 | if (d == 0.f) { error->ulp[0]++; return; }; 78 | 79 | uint32_t ulp = f32_ulp_dist_ss(e,a); 80 | 81 | if (ulp > error->max) { error->max = ulp; } 82 | if (d > error->abs) { error->abs = d; } 83 | 84 | if (ulp > 3) ulp = 3; 85 | 86 | error->ulp[ulp]++; 87 | } 88 | 89 | // for when signs of e and a can be different 90 | static inline void brute_error_add_ds(func_error_t* error, float e, float a) 91 | { 92 | float d = fabsf(e-a); 93 | 94 | if (d == 0.f) { error->ulp[0]++; return; }; 95 | 96 | uint32_t ulp = f32_ulp_dist(e,a); 97 | 98 | if (ulp > error->max) { error->max = ulp; } 99 | 
if (d > error->abs) { error->abs = d; } 100 | 101 | if (ulp > 3) ulp = 3; 102 | 103 | error->ulp[ulp]++; 104 | } 105 | 106 | void brute_force(uint32_t x0, uint32_t x1) 107 | { 108 | if (x1 < x0) { uint32_t t = x0; x0=x1; x1=t; } 109 | 110 | float f0 = f32_from_bits(x0); 111 | float f1 = f32_from_bits(x1); 112 | 113 | printf("\nchecking: %s on [%08x,%08x] [%e,%e]\n", func_name, x0,x1,f0,f1); 114 | 115 | func_error_t error[LENGTHOF(func_table)] = {{0}}; 116 | 117 | for(uint32_t ix=x0; ix<=x1; ix++) { 118 | float x = f32_from_bits(ix); 119 | float cr = cr_func(x); 120 | 121 | for(uint32_t fi=0; fi < LENGTHOF(func_table); fi++) { 122 | float r = func_table[fi].f(x); 123 | brute_error_add(error+fi, cr,r); 124 | } 125 | } 126 | 127 | error_to_totals(error); 128 | error_dump_i(error); 129 | } 130 | 131 | 132 | // correctly rounded f(x) = k on [x0,x1] 133 | void brute_const_range(uint32_t x0, uint32_t x1, float k) 134 | { 135 | func_error_t error[LENGTHOF(func_table)] = {{0}}; 136 | 137 | float f0 = f32_from_bits(x0); 138 | float f1 = f32_from_bits(x1); 139 | 140 | printf("\nchecking: %s on [%08x,%08x] [%e,%e] : {constant result range %a}\n", func_name, x0,x1,f0,f1,k); 141 | 142 | for(uint32_t fi=0; fi < LENGTHOF(func_table); fi++) { 143 | for(uint32_t xi=x0; xi<=x1; xi++) { 144 | float x = f32_from_bits(xi); 145 | float r = func_table[fi].f(x); 146 | brute_error_add(error+fi,k,r); 147 | } 148 | } 149 | error_to_totals(error); 150 | error_dump_i(error); 151 | } 152 | 153 | // correctly rounded f(x) = x on [x0,x1] 154 | void brute_identity_range(uint32_t x0, uint32_t x1) 155 | { 156 | func_error_t error[LENGTHOF(func_table)] = {{0}}; 157 | 158 | float f0 = f32_from_bits(x0); 159 | float f1 = f32_from_bits(x1); 160 | 161 | printf("\nchecking: %s on [%08x,%08x] [%e,%e] : {result = input range} \n", func_name, x0,x1,f0,f1); 162 | 163 | for(uint32_t fi=0; fi < LENGTHOF(func_table); fi++) { 164 | for(uint32_t xi=x0; xi<=x1; xi++) { 165 | float x = f32_from_bits(xi); 166 | float 
r = func_table[fi].f(x); 167 | brute_error_add(error+fi,x,r); 168 | } 169 | } 170 | 171 | error_to_totals(error); 172 | error_dump_i(error); 173 | } 174 | 175 | void brute_di() 176 | { 177 | uint32_t x0 = f32_to_bits(0x1.0p-149f); 178 | uint32_t x1 = f32_to_bits(0x1.0p-126f); 179 | brute_force(x0,x1); 180 | } 181 | 182 | void brute_1pot(float x) 183 | { 184 | uint32_t x0 = f32_to_bits(x); 185 | uint32_t x1 = f32_to_bits(2.f*x); 186 | brute_force(x0,x1); 187 | } 188 | 189 | void brute_1pot_pn(float x) 190 | { 191 | uint32_t x0 = f32_to_bits(x); 192 | uint32_t x1 = f32_to_bits(2.f*x); 193 | brute_force(x0,x1); 194 | x0 = f32_to_bits(-x); 195 | x1 = f32_to_bits(-2.f*x); 196 | brute_force(x0,x1); 197 | } 198 | 199 | 200 | 201 | #endif 202 | -------------------------------------------------------------------------------- /src/SFH/f32_math/tests/backup/core_math_expand.h: -------------------------------------------------------------------------------- 1 | // WIP 2 | 3 | // for including a core-math function and suppressing the warnings. 
only 4 | // handles GCC & clang like compilers ATM 5 | // SEE: https://core-math.gitlabpages.inria.fr 6 | 7 | #if defined(__GNUC__) 8 | 9 | #pragma GCC diagnostic push 10 | #pragma GCC diagnostic ignored "-Wpragmas" 11 | #pragma GCC diagnostic ignored "-Wpedantic" 12 | #pragma GCC diagnostic ignored "-Wunknown-warning-option" 13 | #pragma GCC diagnostic ignored "-Wconversion" 14 | #pragma GCC diagnostic ignored "-Wsign-conversion" 15 | #pragma GCC diagnostic ignored "-Wshorten-64-to-32" 16 | #pragma GCC diagnostic ignored "-Wimplicit-float-conversion" 17 | #pragma GCC diagnostic ignored "-Wimplicit-int-conversion" 18 | #pragma GCC diagnostic ignored "-Wfloat-conversion" 19 | #pragma GCC diagnostic ignored "-Wmaybe-uninitialized" 20 | #include STRINGIFY(CORE_MATH_FUNC) 21 | #pragma GCC diagnostic pop 22 | 23 | #else 24 | #include STRINGIFY(CORE_MATH_FUNC) 25 | #endif 26 | -------------------------------------------------------------------------------- /src/SFH/f32_math/tests/backup/util.h: -------------------------------------------------------------------------------- 1 | #ifndef __UTIL_H__ 2 | #define __UTIL_H__ 3 | 4 | static inline uint32_t f32_to_bits(float x) 5 | { 6 | uint32_t u; memcpy(&u,&x,4); return u; 7 | } 8 | 9 | static inline float f32_from_bits(uint32_t x) 10 | { 11 | float u; memcpy(&u,&x,4); return u; 12 | } 13 | 14 | // if 'v' is float and 's' is all clear (except sign bit) 15 | static inline float f32_mulsign(float v, uint32_t s) 16 | { 17 | return f32_from_bits(f32_to_bits(v)^s); 18 | } 19 | 20 | static inline uint32_t u32_abs(uint32_t x) 21 | { 22 | return (int32_t)x >= 0 ? 
x : -x; 23 | } 24 | 25 | // ulp distance provided a & b are finite 26 | // and have the same sign 27 | static inline uint32_t f32_ulp_dist_ss(float a, float b) 28 | { 29 | uint32_t ua = f32_to_bits(a); 30 | uint32_t ub = f32_to_bits(b); 31 | return u32_abs(ua-ub); 32 | } 33 | 34 | const double f64_pi = 0x1.921fb54442d18p1; 35 | const double f64_half_pi = 0.5*f64_pi; 36 | 37 | const uint32_t f32_bits_one = 0x3f800000; 38 | 39 | const float f32_pi = 0x1.921fb6p1f; 40 | const float f32_half_pi = 0.5f*f32_pi; 41 | const float PIO2F = 0.5f*0x1.921fb6p1f; 42 | 43 | typedef struct { float h,l; } f32_pair_t; 44 | 45 | // pi as unevaluated pair 46 | const f32_pair_t f32_up_pi = {.h=0x1.921fb6p1f, .l=-0x1.777a5cp-24f }; 47 | const f32_pair_t f32_up_pio2 = {.h=0x1.921fb6p0f, .l=-0x1.777a5cp-23f }; 48 | 49 | // pi as multiplicative pair (h*l) 50 | const f32_pair_t f32_mk_pi = {.h = (float)(61*256661), .l= (float)(13*73*14879)*0x1.0p-46f}; 51 | const float f32_mk_pi_lo = -0x1.1ee59ep-45f; 52 | 53 | 54 | static inline float f32_add_pi(float x) 55 | { 56 | const float pi_a = f32_mk_pi.h; 57 | const float pi_b = f32_mk_pi.l; 58 | 59 | return fmaf(pi_a, pi_b, x); 60 | } 61 | 62 | static inline float f32_add_pi_x(float x) 63 | { 64 | const float pi_a = f32_mk_pi.h; 65 | const float pi_b = f32_mk_pi.l; 66 | 67 | return fmaf(pi_a, pi_b, x + f32_mk_pi_lo); 68 | } 69 | 70 | // fma variant of : pi/2 + x 71 | static inline float f32_add_half_pi(float x) 72 | { 73 | const float pi_a = f32_mk_pi.h; 74 | const float pi_b = 0.5f*f32_mk_pi.l; 75 | 76 | return fmaf(pi_a, pi_b, x); 77 | } 78 | 79 | static inline float f32_add_half_pi_x(float x) 80 | { 81 | const float pi_a = f32_mk_pi.h; 82 | const float pi_b = 0.5f*f32_mk_pi.l; 83 | 84 | return fmaf(pi_a, pi_b, x + (0.5f*f32_mk_pi_lo)); 85 | } 86 | 87 | 88 | // high to low coefficient ordering 89 | inline float f32_horner_1(float x, const float C[static 2]) 90 | { 91 | return fmaf(x,C[0],C[1]); 92 | } 93 | 94 | inline float f32_horner_2(float 
x, const float C[static 3]) 95 | { 96 | return fmaf(x,fmaf(x,C[0],C[1]),C[2]); 97 | } 98 | 99 | inline float f32_horner_3(float x, const float C[static 4]) 100 | { 101 | return fmaf(x,fmaf(x,fmaf(x,C[0],C[1]),C[2]),C[3]); 102 | } 103 | 104 | inline float f32_horner_4(float x, const float C[static 5]) 105 | { 106 | return fmaf(x,fmaf(x,fmaf(x,fmaf(x,C[0],C[1]),C[2]),C[3]),C[4]); 107 | } 108 | 109 | inline float f32_horner_5(float x, const float C[static 6]) 110 | { 111 | return fmaf(x,fmaf(x,fmaf(x,fmaf(x,fmaf(x,C[0],C[1]),C[2]),C[3]),C[4]),C[5]); 112 | } 113 | 114 | inline float f32_horner_6(float x, const float C[static 7]) 115 | { 116 | return fmaf(x,fmaf(x,fmaf(x,fmaf(x,fmaf(x,fmaf(x,C[0],C[1]),C[2]),C[3]),C[4]),C[5]),C[6]); 117 | } 118 | 119 | //************************************************************************ 120 | 121 | inline double f64_horner_1(double x, const double C[static 2]) 122 | { 123 | return fma(x,C[0],C[1]); 124 | } 125 | 126 | inline double f64_horner_2(double x, const double C[static 3]) 127 | { 128 | return fma(x,fma(x,C[0],C[1]),C[2]); 129 | } 130 | 131 | inline double f64_horner_3(double x, const double C[static 4]) 132 | { 133 | return fma(x,fma(x,fma(x,C[0],C[1]),C[2]),C[3]); 134 | } 135 | 136 | inline double f64_horner_4(double x, const double C[static 5]) 137 | { 138 | return fma(x,fma(x,fma(x,fma(x,C[0],C[1]),C[2]),C[3]),C[4]); 139 | } 140 | 141 | inline double f64_horner_5(double x, const double C[static 6]) 142 | { 143 | return fma(x,fma(x,fma(x,fma(x,fma(x,C[0],C[1]),C[2]),C[3]),C[4]),C[5]); 144 | } 145 | 146 | //************************************************************************ 147 | // stuff for the drivers 148 | 149 | typedef struct { 150 | float (*f)(float); 151 | char* name; 152 | } func_entry_t; 153 | 154 | #define LENGTHOF(X) (sizeof(X)/sizeof(X[0])) 155 | #define STRINGIFY(S) STRINGIFY_(S) 156 | #define STRINGIFY_(S) #S 157 | #define ENTRY(X) { .f=&X, .name=STRINGIFY(X) } 158 | 159 | 160 | #endif 161 | 
-------------------------------------------------------------------------------- /src/SFH/f32_math/tests/core_math_expand.h: -------------------------------------------------------------------------------- 1 | // WIP 2 | 3 | // for including a core-math function and suppressing the warnings. only 4 | // handles GCC & clang like compilers ATM 5 | // SEE: https://core-math.gitlabpages.inria.fr 6 | // 7 | // #define CORE_MATH_FUNC sinpi 8 | // #include "core_math_expand.h" 9 | 10 | #if defined(__GNUC__) 11 | 12 | #pragma GCC diagnostic push 13 | #pragma GCC diagnostic ignored "-Wpragmas" 14 | #pragma GCC diagnostic ignored "-Wpedantic" 15 | #pragma GCC diagnostic ignored "-Wunknown-warning-option" 16 | #pragma GCC diagnostic ignored "-Wconversion" 17 | #pragma GCC diagnostic ignored "-Wsign-conversion" 18 | #pragma GCC diagnostic ignored "-Wshorten-64-to-32" 19 | #pragma GCC diagnostic ignored "-Wimplicit-float-conversion" 20 | #pragma GCC diagnostic ignored "-Wimplicit-int-conversion" 21 | #pragma GCC diagnostic ignored "-Wfloat-conversion" 22 | #pragma GCC diagnostic ignored "-Wmaybe-uninitialized" 23 | #include STRINGIFY(CORE_MATH_FUNC) 24 | #pragma GCC diagnostic pop 25 | 26 | #else 27 | #include STRINGIFY(CORE_MATH_FUNC) 28 | #endif 29 | -------------------------------------------------------------------------------- /src/SFH/f32_math/tests/reports/asin.md: -------------------------------------------------------------------------------- 1 |
click for range breakdown 2 | 3 | 4 | running: minimal sanity check 5 | 6 | checking: asin on [00000000,3f800000] [0.000000e+00,1.000000e+00] 7 | | func| max ULP| CR| FR| 2 ULP| > 2 ULP| CR%| FR%| 2 ULP%| > 2 ULP%| abs| 8 | | ---:| ---:| ---:| ---:| ---:| ---:| ---:| ---:| ---:| ---:| ---:| 9 | | libm| 1|1063062367| 2290850| 0| 0| 99.784968| 0.215032| 0.000000| 0.000000|1.192093e-07| 10 | | fdlibm_asinf| 1|1063254912| 2098305| 0| 0| 99.803041| 0.196959| 0.000000| 0.000000|1.192093e-07| 11 | | asin_x0_k3| 3|1040272132| 23110632| 1940988| 29465| 97.645749| 2.169293| 0.182192| 0.002766|1.788139e-07| 12 | | asin_x0_k4| 2|1061765494| 3569698| 18025| 0| 99.663236| 0.335072| 0.001692| 0.000000|1.192093e-07| 13 | | asin_x0_k5| 2|1062965052| 2374286| 13879| 0| 99.775834| 0.222864| 0.001303| 0.000000|1.192093e-07| 14 | | asin_x0_k6| 2|1063010542| 2329026| 13649| 0| 99.780103| 0.218615| 0.001281| 0.000000|1.192093e-07| 15 | | asin_x1_k3| 2|1040176617| 23593913| 1582687| 0| 97.636784| 2.214656| 0.148560| 0.000000|1.192093e-07| 16 | | asin_x1_k4| 1|1063732831| 1620386| 0| 0| 99.847902| 0.152098| 0.000000| 0.000000|1.192093e-07| 17 | | asin_x1_k5| 1|1065124477| 228740| 0| 0| 99.978529| 0.021471| 0.000000| 0.000000|1.192093e-07| 18 | | asin_x1_k6| 1|1065172411| 180806| 0| 0| 99.983029| 0.016971| 0.000000| 0.000000|1.192093e-07| 19 | 20 |
21 | 22 | 23 | TOTAL: asin 24 | | func| max ULP| CR| FR| 2 ULP| > 2 ULP| CR%| FR%| 2 ULP%| > 2 ULP%| abs| 25 | | ---:| ---:| ---:| ---:| ---:| ---:| ---:| ---:| ---:| ---:| ---:| 26 | | libm| 1|1063062367| 2290850| 0| 0| 99.784968| 0.215032| 0.000000| 0.000000|1.192093e-07| 27 | | fdlibm_asinf| 1|1063254912| 2098305| 0| 0| 99.803041| 0.196959| 0.000000| 0.000000|1.192093e-07| 28 | | asin_x0_k3| 3|1040272132| 23110632| 1940988| 29465| 97.645749| 2.169293| 0.182192| 0.002766|1.788139e-07| 29 | | asin_x0_k4| 2|1061765494| 3569698| 18025| 0| 99.663236| 0.335072| 0.001692| 0.000000|1.192093e-07| 30 | | asin_x0_k5| 2|1062965052| 2374286| 13879| 0| 99.775834| 0.222864| 0.001303| 0.000000|1.192093e-07| 31 | | asin_x0_k6| 2|1063010542| 2329026| 13649| 0| 99.780103| 0.218615| 0.001281| 0.000000|1.192093e-07| 32 | | asin_x1_k3| 2|1040176617| 23593913| 1582687| 0| 97.636784| 2.214656| 0.148560| 0.000000|1.192093e-07| 33 | | asin_x1_k4| 1|1063732831| 1620386| 0| 0| 99.847902| 0.152098| 0.000000| 0.000000|1.192093e-07| 34 | | asin_x1_k5| 1|1065124477| 228740| 0| 0| 99.978529| 0.021471| 0.000000| 0.000000|1.192093e-07| 35 | | asin_x1_k6| 1|1065172411| 180806| 0| 0| 99.983029| 0.016971| 0.000000| 0.000000|1.192093e-07| 36 | -------------------------------------------------------------------------------- /src/SFH/f32_math/tests/reports/cbrt.md: -------------------------------------------------------------------------------- 1 | ## REPORTED NUMBERS ARE LIMITED TO NORMAL INPUTS 2 | 3 |
click for range breakdown 4 | 5 | 6 | running: minimal sanity check 7 | 8 | checking: cbrt on [00800000,02000000] [1.175494e-38,9.403955e-38] 9 | | func| max ULP| CR| FR| 2 ULP| > 2 ULP| CR%| FR%| 2 ULP%| > 2 ULP%| abs| 10 | | ---:| ---:| ---:| ---:| ---:| ---:| ---:| ---:| ---:| ---:| ---:| 11 | | libm| 1| 22496783| 2669042| 0| 0| 89.394180| 10.605820| 0.000000| 0.000000|2.710505e-20| 12 | | f32_cbrt_1| 3| 13153315| 11473539| 538956| 15| 52.266576| 45.591746| 2.141619| 0.000060|8.131516e-20| 13 | | f32_cbrt_2| 2| 19516945| 5648678| 202| 0| 77.553369| 22.445829| 0.000803| 0.000000|5.421011e-20| 14 | | f32_cbrt_3| 1| 20354319| 4811506| 0| 0| 80.880794| 19.119206| 0.000000| 0.000000|2.710505e-20| 15 | | f32_cbrt_4| 1| 24722722| 443103| 0| 0| 98.239267| 1.760733| 0.000000| 0.000000|2.710505e-20| 16 | | f32_cbrt_5| 0| 25165825| 0| 0| 0|100.000000| 0.000000| 0.000000| 0.000000|0.000000e+00| 17 | 18 | checking: cbrt on [7c800000,7e000000] [5.316912e+36,4.253530e+37] 19 | | func| max ULP| CR| FR| 2 ULP| > 2 ULP| CR%| FR%| 2 ULP%| > 2 ULP%| abs| 20 | | ---:| ---:| ---:| ---:| ---:| ---:| ---:| ---:| ---:| ---:| ---:| 21 | | libm| 1| 22496783| 2669042| 0| 0| 89.394180| 10.605820| 0.000000| 0.000000|2.621440e+05| 22 | | f32_cbrt_1| 3| 13153314| 11473540| 538956| 15| 52.266572| 45.591750| 2.141619| 0.000060|5.242880e+05| 23 | | f32_cbrt_2| 2| 19516945| 5648678| 202| 0| 77.553369| 22.445829| 0.000803| 0.000000|5.242880e+05| 24 | | f32_cbrt_3| 1| 20354319| 4811506| 0| 0| 80.880794| 19.119206| 0.000000| 0.000000|2.621440e+05| 25 | | f32_cbrt_4| 1| 24722722| 443103| 0| 0| 98.239267| 1.760733| 0.000000| 0.000000|2.621440e+05| 26 | | f32_cbrt_5| 0| 25165825| 0| 0| 0|100.000000| 0.000000| 0.000000| 0.000000|0.000000e+00| 27 | 28 | checking: cbrt on [3e800000,40000000] [2.500000e-01,2.000000e+00] 29 | | func| max ULP| CR| FR| 2 ULP| > 2 ULP| CR%| FR%| 2 ULP%| > 2 ULP%| abs| 30 | | ---:| ---:| ---:| ---:| ---:| ---:| ---:| ---:| ---:| ---:| ---:| 31 | | libm| 1| 22496783| 2669042| 0| 
0| 89.394180| 10.605820| 0.000000| 0.000000|1.192093e-07| 32 | | f32_cbrt_1| 3| 13153314| 11473540| 538956| 15| 52.266572| 45.591750| 2.141619| 0.000060|2.384186e-07| 33 | | f32_cbrt_2| 2| 19516945| 5648678| 202| 0| 77.553369| 22.445829| 0.000803| 0.000000|1.192093e-07| 34 | | f32_cbrt_3| 1| 20354319| 4811506| 0| 0| 80.880794| 19.119206| 0.000000| 0.000000|1.192093e-07| 35 | | f32_cbrt_4| 1| 24722722| 443103| 0| 0| 98.239267| 1.760733| 0.000000| 0.000000|1.192093e-07| 36 | | f32_cbrt_5| 0| 25165825| 0| 0| 0|100.000000| 0.000000| 0.000000| 0.000000|0.000000e+00| 37 | 38 |
39 | 40 | 41 | TOTAL: cbrt 42 | | func| max ULP| CR| FR| 2 ULP| > 2 ULP| CR%| FR%| 2 ULP%| > 2 ULP%| abs| 43 | | ---:| ---:| ---:| ---:| ---:| ---:| ---:| ---:| ---:| ---:| ---:| 44 | | libm| 1| 67490349| 8007126| 0| 0| 89.394180| 10.605820| 0.000000| 0.000000|2.621440e+05| 45 | | f32_cbrt_1| 3| 39459943| 34420619| 1616868| 45| 52.266573| 45.591749| 2.141619| 0.000060|5.242880e+05| 46 | | f32_cbrt_2| 2| 58550835| 16946034| 606| 0| 77.553369| 22.445829| 0.000803| 0.000000|5.242880e+05| 47 | | f32_cbrt_3| 1| 61062957| 14434518| 0| 0| 80.880794| 19.119206| 0.000000| 0.000000|2.621440e+05| 48 | | f32_cbrt_4| 1| 74168166| 1329309| 0| 0| 98.239267| 1.760733| 0.000000| 0.000000|2.621440e+05| 49 | | f32_cbrt_5| 0| 75497475| 0| 0| 0|100.000000| 0.000000| 0.000000| 0.000000|0.000000e+00| 50 | -------------------------------------------------------------------------------- /src/SFH/f32_math/tests/timehack.h: -------------------------------------------------------------------------------- 1 | // Public Domain under http://unlicense.org, see link for details. 2 | 3 | // THIS HAS NO MEANINGFUL VALUE FOR MEASURING PERFORMANCE. Hacky dev pseudo-aid only. 4 | // copy-n-pasted here. 
no PRNG sampling, no selection of range and it's garbage anyway 5 | 6 | //********************************************************************************************* 7 | 8 | const uint32_t time_count = 1<<20; // number of func calls/trial 9 | const int32_t time_trials = 16; // number of trials 10 | const int32_t time_retry = 15; // max number of retries 11 | const int32_t time_rerun = 0; // max number of retries 12 | const double time_vlimit = 0.02; // stdd acceptable value (or retry) 13 | 14 | const double time_scale = 1.0/((double)time_count); 15 | 16 | static inline uint64_t time_get(void) 17 | { 18 | #if 1 19 | #define TIME_STR "cycles" 20 | return _rdtsc(); 21 | #else 22 | #define TIME_STR "ns " 23 | struct timespec ts; 24 | clock_gettime(CLOCK_REALTIME, &ts); 25 | return (1000*1000*1000*(uint64_t)ts.tv_sec + (uint64_t)ts.tv_nsec); 26 | #endif 27 | } 28 | 29 | static inline double doubletime(void) 30 | { 31 | return (double)time_get(); 32 | } 33 | 34 | // Welford's method for streaming mean/variance/stdev 35 | // --- just being lazy 36 | typedef struct { double n,m,s; } seq_stats_t; 37 | 38 | inline void seq_stats_init(seq_stats_t* d) 39 | { 40 | memset(d,0,sizeof(seq_stats_t)); 41 | } 42 | 43 | void seq_stats_add(seq_stats_t* d, double x) 44 | { 45 | d->n += 1.0; 46 | 47 | double m = d->m; 48 | double s = d->s; 49 | double dm = x-m; 50 | 51 | d->m = m + dm/d->n; 52 | d->s = s + dm*(x-d->m); 53 | } 54 | 55 | inline double seq_stats_mean(seq_stats_t* d) { return d->m; } 56 | inline double seq_stats_variance(seq_stats_t* d) { return d->s/(d->n-1.0); } 57 | inline double seq_stats_stddev(seq_stats_t* d) { return sqrt(seq_stats_variance(d)); } 58 | 59 | float time_func(float (*f)(float), int32_t n) 60 | { 61 | // use an additive recurrence to cover the range cheaply 62 | // currently: [0,1]...which is useless here...copy-n-paste...needs the fixing part 63 | // WAS WRITTEN FOR BRANCH FREE. 
needs different strategy 64 | // for branchy functions...cheap PRNG 65 | static const uint32_t A = 2654435769; 66 | 67 | float r = 0.f; 68 | float x = 0.f; 69 | uint32_t u = A; 70 | float v; 71 | 72 | while(n-- > 0) { 73 | v = f(x); 74 | 75 | if (v != v) { printf("%f ", x); } 76 | 77 | u += A; 78 | x = (float)(u) * (0x1.0p-33f); 79 | r += v; 80 | 81 | if (r != r) { printf("%f ", v); } 82 | } 83 | 84 | return r; 85 | } 86 | 87 | int cmp_u64(const void * a, const void * b) 88 | { 89 | return ( *(uint64_t*)a > *(uint64_t*)b ); 90 | } 91 | 92 | volatile float sink; 93 | 94 | void timing_run(float (*f)(float), uint64_t data[static time_trials]) 95 | { 96 | uint32_t s0 = 0; 97 | 98 | do { 99 | for(uint32_t n=s0; nm; 134 | double sum = 0.0; 135 | 136 | for(uint32_t n=0; nname); 160 | 161 | // temp hack (variable) 162 | timing_run(entry->f, data); 163 | gof = timing_gather(&stats, data); 164 | 165 | double std = sqrt(seq_stats_variance(&stats)); 166 | 167 | printf("%12f %10f %10f",stats.m, std, gof); 168 | 169 | //if (gof > time_vlimit) printf(" <-- garbage timing (retry)"); 170 | printf("\n"); 171 | 172 | entry++; 173 | } 174 | 175 | printf("sink = %f\n", sink); 176 | } 177 | 178 | -------------------------------------------------------------------------------- /src/SFH/f32_math/tests/util.h: -------------------------------------------------------------------------------- 1 | #ifndef __UTIL_H__ 2 | #define __UTIL_H__ 3 | 4 | #define FUNC_TYPE_ODD 1 5 | #define FUNC_TYPE_EVEN 2 6 | 7 | // bunch of wip temp hacks 8 | 9 | typedef struct { 10 | float (*f)(float); 11 | char* name; 12 | char* notes; 13 | uint32_t flags; 14 | uint32_t type; 15 | } func_entry_t; 16 | 17 | typedef struct { 18 | uint32_t x0; 19 | uint32_t x1; 20 | char* name; 21 | } func_range_entry_t; 22 | 23 | typedef struct { 24 | char* name; 25 | func_range_entry_t* table; 26 | uint32_t num; 27 | } func_range_table_t; 28 | 29 | #define LENGTHOF(X) (sizeof(X)/sizeof(X[0])) 30 | #define STRINGIFY(S) STRINGIFY_(S) 
31 | #define STRINGIFY_(S) #S 32 | #define ENTRY(X) { .f=&X, .name=STRINGIFY(X) } 33 | 34 | 35 | #endif 36 | -------------------------------------------------------------------------------- /src/SFH/f32_quant.h: -------------------------------------------------------------------------------- 1 | // Public Domain under http://unlicense.org, see link for details. 2 | // Marc B. Reynolds, 2022-2025 3 | 4 | #ifndef F32_QUANT_H 5 | #define F32_QUANT_H 6 | 7 | // scalar uniform quantization (suq) helpers 8 | 9 | //**** helper functions: 10 | 11 | typedef struct { float h,l; } suq_scale_t; 12 | 13 | // local version of generating a pair of constants 14 | // to compute 1/n for correctly rounded decodes. 15 | // SEE: https://marc-b-reynolds.github.io/math/2019/03/12/FpDiv.html 16 | static inline suq_scale_t suq_scale(uint32_t n) 17 | { 18 | const float y = (float)n; 19 | const float rh = 1.f/y; 20 | const float rl = -fmaf(rh, y, -1.f)/y; 21 | 22 | return (suq_scale_t){.h = rh, .l = rl}; 23 | } 24 | 25 | //**** scalar uniform encoding funcs 26 | 27 | static inline int32_t suq_scale_i32(float x, float s) 28 | { 29 | return (int32_t)(s*x); 30 | } 31 | 32 | static inline int32_t suq_scale_round_i32(float x, float s) 33 | { 34 | return (int32_t)fmaf(s,x,0.5f); 35 | } 36 | 37 | 38 | static inline uint32_t suq_scale_u32(float x, float s) 39 | { 40 | return (uint32_t)((int32_t)(s*x)); 41 | } 42 | 43 | static inline uint32_t suq_scale_translate_u32(float x, float s, float t) 44 | { 45 | return (uint32_t)((int32_t)fmaf(x,s,t)); 46 | } 47 | 48 | 49 | static inline uint32_t suq_scale_round_u32(float x, float s) 50 | { 51 | return (uint32_t)((int32_t)fmaf(x,s,0.5f)); 52 | } 53 | 54 | 55 | // floor quantization 56 | static inline uint32_t suq_encode_f(float x , uint32_t n) 57 | { 58 | return suq_scale_u32(x, (float)n); 59 | } 60 | 61 | // rounded quantization 62 | static inline uint32_t suq_encode_r(float x , uint32_t n) 63 | { 64 | return suq_scale_round_u32(x, (float)(n-1)); 65 | } 66 
// left reconstruction: u * 1/(n-1)
//   u : quantized value
//   n : number of levels (point distance is 1/(n-1))
static inline float suq_decode_l(uint32_t u, uint32_t n)
{
  const float step = 1.f/(float)(n-1);

  return (float)u * step;
}
124 | // domain of input is x mod 1 125 | // Zero is member of output set (1 is equivalent to 0) 126 | // point distance = 1/n 127 | // x on [0,1) -> [0,n-1] {well input is mod 1} 128 | // u on [0,n-1] -> [0,1) 129 | static inline uint32_t suq_encode_w(float x , uint32_t n) 130 | { 131 | // not doing anything clever here since n is assumed POT 132 | return suq_scale_translate_u32(x,(float)(n), 0.5f) % n; 133 | } 134 | 135 | // wrapped dequantization (assume n is power-of-two) 136 | static inline float suq_decode_w(uint32_t u, uint32_t n) 137 | { 138 | return (float)u * (1.f/(float)(n)); 139 | } 140 | 141 | // wrapped reconstruction: correctly rounded 142 | // for when n isn't a power of two. a bit useless ATM 143 | // since there's no special case encode for this 144 | static inline float suq_decode_w_cr(uint32_t u, uint32_t n) 145 | { 146 | // I'm assuming this will be optimized away into 147 | // precomputed constants. 148 | const suq_scale_t s = suq_scale(n); 149 | 150 | float f = (float)u; 151 | float r = fmaf(f, s.h, f*s.l); 152 | 153 | return r; 154 | } 155 | 156 | static inline uint32_t unorm8_encode(float x) 157 | { 158 | return suq_scale_round_u32(x,255.f); 159 | } 160 | 161 | // SEE: https://fgiesen.wordpress.com/2024/11/06/exact-unorm8-to-float/ 162 | static inline float unorm8_decode_cr(uint32_t u) 163 | { 164 | static const float k = 1.f/(255.f*3.f); 165 | 166 | float f = (float)(3*u); 167 | float r = k*f; 168 | 169 | return r; 170 | } 171 | 172 | static inline uint32_t snorm8_encode(float x) 173 | { 174 | return suq_scale_u32(x, (255.f/2.f) + 128.0f); 175 | } 176 | 177 | static inline float snorm8_decode_cr(uint32_t u) 178 | { 179 | static const float k = 2.f/(255.f*3.f); 180 | 181 | float f = (float)(3*u); 182 | float r = fmaf(k,f,-1.f); 183 | 184 | return r; 185 | } 186 | 187 | #endif 188 | 189 | -------------------------------------------------------------------------------- /src/SFH/f64_horner.h: 
-------------------------------------------------------------------------------- 1 | // Public Domain under http://unlicense.org, see link for details. 2 | // Marc B. Reynolds, 2015-2025 3 | 4 | // evalulate n^th degree polynomial in Horner's form. Coefficient arrays are from high to low. 5 | 6 | #ifndef F64_HORNER 7 | #define F64_HORNER 8 | 9 | static inline double f64_horner_1(double x, const double C[static 2]) 10 | { 11 | return fma(x,C[0],C[1]); 12 | } 13 | 14 | static inline double f64_horner_2 (double x, const double C[static 3]) { return fma(x,f64_horner_1 (x,C),C[ 2]); } 15 | static inline double f64_horner_3 (double x, const double C[static 4]) { return fma(x,f64_horner_2 (x,C),C[ 3]); } 16 | static inline double f64_horner_4 (double x, const double C[static 5]) { return fma(x,f64_horner_3 (x,C),C[ 4]); } 17 | static inline double f64_horner_5 (double x, const double C[static 6]) { return fma(x,f64_horner_4 (x,C),C[ 5]); } 18 | static inline double f64_horner_6 (double x, const double C[static 7]) { return fma(x,f64_horner_5 (x,C),C[ 6]); } 19 | static inline double f64_horner_7 (double x, const double C[static 8]) { return fma(x,f64_horner_6 (x,C),C[ 7]); } 20 | static inline double f64_horner_8 (double x, const double C[static 9]) { return fma(x,f64_horner_7 (x,C),C[ 8]); } 21 | static inline double f64_horner_9 (double x, const double C[static 10]) { return fma(x,f64_horner_8 (x,C),C[ 9]); } 22 | static inline double f64_horner_10(double x, const double C[static 11]) { return fma(x,f64_horner_9 (x,C),C[10]); } 23 | static inline double f64_horner_11(double x, const double C[static 12]) { return fma(x,f64_horner_10(x,C),C[11]); } 24 | static inline double f64_horner_12(double x, const double C[static 13]) { return fma(x,f64_horner_11(x,C),C[12]); } 25 | static inline double f64_horner_13(double x, const double C[static 14]) { return fma(x,f64_horner_12(x,C),C[13]); } 26 | static inline double f64_horner_14(double x, const double C[static 15]) { return 
fma(x,f64_horner_13(x,C),C[14]); } 27 | static inline double f64_horner_15(double x, const double C[static 16]) { return fma(x,f64_horner_14(x,C),C[15]); } 28 | 29 | #endif 30 | 31 | -------------------------------------------------------------------------------- /src/SFH/f64_horner2.h: -------------------------------------------------------------------------------- 1 | // Public Domain under http://unlicense.org, see link for details. 2 | // Marc B. Reynolds, 2015-2025 3 | 4 | // evalulate n^th degree polynomial in Horner's form. Coefficient arrays are from high to low (c_n,... c_0) 5 | 6 | // WARNING: hacked..revalidate. also clean-up (odd/even pairing?) 7 | 8 | #ifndef F64_HORNER2 9 | #define F64_HORNER2 10 | 11 | // second order versions (layout compat with standard) 12 | 13 | // odd 14 | static inline double f64_horner2_3(const double x, const double c[static 4]) 15 | { 16 | double x2 = x*x; 17 | double e,o; 18 | e = c[1]; o = c[0]; 19 | e = fma(e,x2,c[3]); o = fma(o,x2,c[2]); 20 | 21 | return fma(x,o,e); 22 | } 23 | 24 | static inline double f64_horner2_5(const double x, const double c[static 6]) 25 | { 26 | double x2 = x*x; 27 | double e,o; 28 | e = c[1]; o = c[0]; 29 | e = fma(e,x2,c[3]); o = fma(o,x2,c[2]); 30 | e = fma(e,x2,c[5]); o = fma(o,x2,c[4]); 31 | 32 | return fma(x,o,e); 33 | } 34 | 35 | static inline double f64_horner2_7(const double x, const double c[static 8]) 36 | { 37 | double x2 = x*x; 38 | double e,o; 39 | e = c[1]; o = c[0]; 40 | e = fma(e,x2,c[3]); o = fma(o,x2,c[2]); 41 | e = fma(e,x2,c[5]); o = fma(o,x2,c[4]); 42 | e = fma(e,x2,c[7]); o = fma(o,x2,c[6]); 43 | 44 | return fma(x,o,e); 45 | } 46 | 47 | static inline double f64_horner2_9(const double x, const double c[static 10]) 48 | { 49 | double x2 = x*x; 50 | double e,o; 51 | e = c[1]; o = c[0]; 52 | e = fma(e,x2,c[3]); o = fma(o,x2,c[2]); 53 | e = fma(e,x2,c[5]); o = fma(o,x2,c[4]); 54 | e = fma(e,x2,c[7]); o = fma(o,x2,c[6]); 55 | e = fma(e,x2,c[9]); o = fma(o,x2,c[8]); 56 | 57 | 
return fma(x,o,e); 58 | } 59 | 60 | static inline double f64_horner2_11(const double x, const double c[static 12]) 61 | { 62 | double x2 = x*x; 63 | double e,o; 64 | e = c[ 1]; o = c[ 0]; 65 | e = fma(e,x2,c[ 3]); o = fma(o,x2,c[ 2]); 66 | e = fma(e,x2,c[ 5]); o = fma(o,x2,c[ 4]); 67 | e = fma(e,x2,c[ 7]); o = fma(o,x2,c[ 6]); 68 | e = fma(e,x2,c[ 9]); o = fma(o,x2,c[ 8]); 69 | e = fma(e,x2,c[11]); o = fma(o,x2,c[10]); 70 | 71 | return fma(x,o,e); 72 | } 73 | 74 | // even 75 | static inline double f64_horner2_4(const double x, const double c[static 5]) 76 | { 77 | double x2 = x*x; 78 | double e,o; 79 | e = c[0]; o = c[1]; 80 | e = fma(e,x2,c[2]); o = fma(o,x2,c[3]); 81 | e = fma(e,x2,c[4]); 82 | 83 | return fma(x,o,e); 84 | } 85 | 86 | static inline double f64_horner2_6(const double x, const double c[static 7]) 87 | { 88 | double x2 = x*x; 89 | double e,o; 90 | e = c[0]; o = c[1]; 91 | e = fma(e,x2,c[2]); o = fma(o,x2,c[3]); 92 | e = fma(e,x2,c[4]); o = fma(o,x2,c[5]); 93 | e = fma(e,x2,c[6]); 94 | 95 | return fma(x,o,e); 96 | } 97 | 98 | static inline double f64_horner2_8(const double x, const double c[static 9]) 99 | { 100 | double x2 = x*x; 101 | double e,o; 102 | e = c[0]; o = c[1]; 103 | e = fma(e,x2,c[2]); o = fma(o,x2,c[3]); 104 | e = fma(e,x2,c[4]); o = fma(o,x2,c[5]); 105 | e = fma(e,x2,c[6]); o = fma(o,x2,c[7]); 106 | e = fma(e,x2,c[8]); 107 | 108 | return fma(x,o,e); 109 | } 110 | 111 | static inline double f64_horner2_10(const double x, const double c[static 11]) 112 | { 113 | double x2 = x*x; 114 | double e,o; 115 | e = c[0]; o = c[1]; 116 | e = fma(e,x2,c[2]); o = fma(o,x2,c[3]); 117 | e = fma(e,x2,c[4]); o = fma(o,x2,c[5]); 118 | e = fma(e,x2,c[6]); o = fma(o,x2,c[7]); 119 | e = fma(e,x2,c[8]); o = fma(o,x2,c[9]); 120 | e = fma(e,x2,c[10]); 121 | 122 | return fma(x,o,e); 123 | } 124 | 125 | 126 | #endif 127 | 128 | -------------------------------------------------------------------------------- /src/SFH/f64_math.h: 
-------------------------------------------------------------------------------- 1 | // Public Domain under http://unlicense.org, see link for details. 2 | // Marc B. Reynolds, 2015-2025 3 | 4 | #ifndef F64_LIBM 5 | #define F64_LIBM 6 | 7 | #ifndef BITOPS_H 8 | #include "bitops.h" 9 | #endif 10 | 11 | #ifndef F64_UTIL_H 12 | #include "f64_util.h" 13 | #endif 14 | 15 | 16 | //************************************************************************* 17 | // solve quadratic equation ax^2+bx+c for real roots 18 | // complex roots will yield a standard NaN 19 | 20 | // larger magnitude root 21 | static inline double f64_quadratic_max(double a, double b, double c) 22 | { 23 | double t0 = f64_sqrt(f64_mms(b,b,4.0*a,c)); 24 | double t1 = b+copysign(t0,b); 25 | return t1/(-2.0*a); 26 | } 27 | 28 | // smaller magnitude root 29 | static inline double f64_quadratic_min(double a, double b, double c) 30 | { 31 | double t0 = f64_sqrt(f64_mms(b,b,4.0*a,c)); 32 | double t1 = b+copysign(t0,b); 33 | return (-2.0*c)/t1; 34 | } 35 | 36 | static inline void f64_quadratic(f64_pair_t* r, double a, double b, double c) 37 | { 38 | double t0 = f64_sqrt(f64_mms(b,b,4.0*a,c)); 39 | double t1 = b+copysign(t0,b); 40 | 41 | r->h = t1/(-2.0*a); 42 | r->l = (-2.0*c)/t1; 43 | } 44 | 45 | #endif 46 | 47 | -------------------------------------------------------------------------------- /src/SFH/lcgs.h: -------------------------------------------------------------------------------- 1 | // Marc B. Reynolds, 2016-2025 2 | // Public Domain under http://unlicense.org, see link for details. 
3 | // 4 | // SEE: http://marc-b-reynolds.github.io/shf/2016/04/19/prns.html 5 | 6 | #ifndef LCGS_H 7 | #define LCGS_H 8 | 9 | #if defined(LCGS_64) 10 | #define LCGS_UINT uint64_t 11 | #define LCGS_SINT int64_t 12 | #if !defined(LCGS_MF) 13 | #if defined(LCGS_MLCG) 14 | #define LCGS_MF UINT64_C(0x106689d45497fdb5) 15 | #define LCGS_MR UINT64_C(0xbb91f78bdac4c89d) 16 | #else 17 | #define LCGS_MF UINT64_C(0x27bb2ee687b0b0fd) 18 | #define LCGS_MR UINT64_C(0xdfe66807999cec55) 19 | #define LCGS_AF UINT64_C(0x1234567) 20 | #endif 21 | #endif 22 | #else 23 | #define LCGS_UINT uint32_t 24 | #define LCGS_SINT int32_t 25 | #if !defined(LCGS_MF) 26 | #if defined(LCGS_MLCG) 27 | #define LCGS_MF 0x2c2c57ed 28 | #define LCGS_MR 0xf4ed9de5 29 | #else 30 | #define LCGS_MF 0xac564b05 31 | #define LCGS_MR 0xdc33c9cd 32 | #define LCGS_AF 0x1233 33 | #endif 34 | #endif 35 | #endif 36 | 37 | #ifndef LCGS_AF 38 | #define LCGS_AF 0 39 | #ifndef LCGS_MLCG 40 | #define LCGS_MLCG 41 | #endif 42 | #endif 43 | 44 | #define LCGS_AR ((LCGS_UINT)0-(LCGS_MR*LCGS_AF)) 45 | 46 | #ifndef LCGS_MIX 47 | #define LCGS_MIX(X) lcgs_mix(X) 48 | #endif 49 | 50 | typedef struct { LCGS_UINT i; } lcgs_t; 51 | 52 | 53 | #if defined(LCGS_IMPLEMENTATION) 54 | LCGS_UINT lcgs_state(LCGS_UINT state, LCGS_SINT dp) 55 | { 56 | LCGS_UINT m = 1; 57 | LCGS_UINT a = 0; 58 | LCGS_UINT dm,da; 59 | 60 | if (dp >= 0) { 61 | dm = LCGS_MF; 62 | da = LCGS_AF; 63 | } 64 | else { 65 | dm = LCGS_MR; 66 | da = LCGS_AR; 67 | dp = -dp; 68 | } 69 | 70 | while (dp) { 71 | if (dp & 1) { 72 | m = m*dm; 73 | a = a*dm + da; 74 | } 75 | da *= dm+1; 76 | dm *= dm; 77 | dp >>= 1; 78 | } 79 | 80 | return m*state + a; 81 | } 82 | 83 | LCGS_UINT lcgs_state_0(LCGS_UINT state, LCGS_SINT dp) 84 | { 85 | LCGS_UINT m = 1; 86 | LCGS_UINT dm; 87 | 88 | if (dp >= 0) { 89 | dm = LCGS_MF; 90 | } 91 | else { 92 | dm = LCGS_MR; 93 | dp = -dp; 94 | } 95 | 96 | while (dp) { 97 | if (dp & 1) { 98 | m = m*dm; 99 | } 100 | dm *= dm; 101 | dp >>= 1; 102 | } 103 | 
// PCG RXS-M-XS 32/32 output permutation: a state dependent xorshift,
// a multiply and a final fixed xorshift.
// 'static' is required: a plain 'inline' definition in a header provides
// no external definition in C99/C11, so any call that is not inlined
// fails to link.
static inline uint32_t pcg_output_rxs_m_xs_32_32(uint32_t state)
{
  uint32_t word = ((state >> ((state >> 28u) + 4u)) ^ state) * 277803737u;
  return (word >> 22u) ^ word;
}
31u); 170 | return (((uint64_t)newhigh) << 32u) | newlow; 171 | } 172 | 173 | #endif 174 | 175 | // ***** end of PCG copy-and-paste 176 | 177 | #else 178 | 179 | #if defined(_MSC_VER) 180 | #include 181 | #define __builtin_bswap32(X) _byteswap_ulong(X) 182 | #define __builtin_bswap64(X) _byteswap_uint64(X) 183 | #endif 184 | 185 | #endif 186 | 187 | static inline LCGS_UINT lcgs_mix(LCGS_UINT x) 188 | { 189 | #if defined(LCGS_64) 190 | uint64_t s = __builtin_bswap64(x); 191 | #else 192 | uint64_t s = __builtin_bswap32(x); 193 | #endif 194 | 195 | return s^x; 196 | } 197 | 198 | static inline LCGS_UINT lcgs_peek(lcgs_t* gen) 199 | { 200 | return LCGS_MIX(gen->i); 201 | } 202 | 203 | static inline LCGS_UINT lcgs_next(lcgs_t* gen) 204 | { 205 | LCGS_UINT i = gen->i; 206 | LCGS_UINT r = LCGS_MIX(i); 207 | gen->i = LCGS_MF*i + LCGS_AF; 208 | return r; 209 | } 210 | 211 | static inline LCGS_UINT lcgs_prev(lcgs_t* gen) 212 | { 213 | LCGS_UINT i = gen->i; 214 | LCGS_UINT r = LCGS_MIX(i); 215 | gen->i = LCGS_MR*i + LCGS_AR; 216 | return r; 217 | } 218 | 219 | #undef LCGS_UINT 220 | #undef LCGS_SINT 221 | 222 | #endif 223 | -------------------------------------------------------------------------------- /src/SFH/prng_small.h: -------------------------------------------------------------------------------- 1 | // Marc B. Reynolds, 2021-2025 2 | // Public Domain under http://unlicense.org, see link for details. 3 | 4 | #pragma once 5 | #define PRNG_SMALL_H 6 | 7 | // small feature pseudorandom number generator 8 | // * 9 | // * single sequence combined generator formed from a LCG 10 | // and a XGB (a state update from xorshiro family) with 11 | // a period of 2^64(2^64-1) 12 | // * no parameterization support for multiple generators. 
// rotate x left by n bits (n is reduced modulo 64), matching the
// _rotl64 version used for MSVC.
// seems like "pretty much" everybody (stern eyes) can match this pattern
static inline uint64_t prng_rot(uint64_t x, uint32_t n)
{
  n &= 0x3f;

  // the masked right shift count keeps n==0 well defined (shift by 0, not 64)
  return (x << n) | (x >> (-n & 0x3f));
}
| -------------------------------------------------------------------------------- /src/SFH/sfibpoints.h: -------------------------------------------------------------------------------- 1 | // Marc B. Reynolds, 2018-2025 2 | // Public Domain under http://unlicense.org, see link for details. 3 | 4 | #ifndef SF_WALK_H 5 | #define SF_WALK_H 6 | 7 | // Spherical Fibonacci point set for testing 8 | // http://marc-b-reynolds.github.io/math/2018/06/21/SFPoints4ET.html 9 | 10 | // requires: stdint.h, math.h (for fma & sqrt) and vec3_t defined 11 | 12 | // constant turning rate: 13 | // TX = cos(2pi K) 14 | // TY = sin(2pi K) 15 | // K = frac(phi) = 1/phi = (sqrt(5)-1)/2 16 | 17 | #if defined(_MSC_VER) 18 | static const double SF_WALK_TX = -0.73736887807831985597317725478205829858779907226562; 19 | static const double SF_WALK_TY = -0.67549029426152362720614519275841303169727325439453; 20 | #else 21 | static const double SF_WALK_TX = -0x1.798869e0de834p-1; 22 | static const double SF_WALK_TY = -0x1.59d9dd253cc11p-1; 23 | #endif 24 | 25 | typedef struct { 26 | double z,dz; // incremental height on cap 27 | double x0,y0; // incremental point on unit circle p(n) 28 | double x1,y1; // and p(n-1) 29 | } sf_walk_t; 30 | 31 | // internal worker init routine 32 | static inline void sf_walk_zone_init_i(sf_walk_t* w, uint64_t n, double z0, double z1) 33 | { 34 | w->x0 = 1.0; 35 | w->y0 = 0.0; 36 | w->x1 = SF_WALK_TX; 37 | w->y1 = -SF_WALK_TY; 38 | w->z = z0; 39 | w->dz = (z0-z1)/(double)n; 40 | } 41 | 42 | // spherical zone walk 43 | // n = number of points to generate 44 | // z0 = first z coordinate of zone 45 | // z1 = last z coordinate of zone 46 | static inline void sf_walk_zone_init(sf_walk_t* w, uint64_t n, float z0, float z1) 47 | { 48 | sf_walk_zone_init_i(w,n,(double)z0,(double)z1); 49 | } 50 | 51 | // spherical cap walk (center of cap at +Z) 52 | // n = number of points to generate 53 | // h = height of cap (ex: half-sphere=1, full-sphere=2) 54 | static inline void 
sf_walk_cap_init(sf_walk_t* w, uint64_t n, float h) 55 | { 56 | sf_walk_zone_init_i(w, n, 1.0, 1.0-(double)h); 57 | } 58 | 59 | // full sphere walk 60 | // n = number of points to generate 61 | static inline void sf_walk_init(sf_walk_t* w, uint32_t n) 62 | { 63 | sf_walk_cap_init(w, n, 2.f); 64 | } 65 | 66 | // return the next point in the set 67 | static inline vec3_t sf_walk_next(sf_walk_t* w) 68 | { 69 | static const double M = 2.0*SF_WALK_TX; 70 | 71 | double x0=w->x0, y0=w->y0; 72 | double x1=w->x1, y1=w->y1; 73 | 74 | double ct,st; 75 | 76 | // current cylinder to cap mapping values 77 | ct = w->z; 78 | st = sqrt((1.0-ct)*(1.0+ct)); 79 | 80 | // output current point in set 81 | vec3_t v; 82 | 83 | v.x = (float)(st*x0); 84 | v.y = (float)(st*y0); 85 | v.z = (float)(ct); 86 | 87 | // update point on circle: turn by 2pi*K 88 | // via reverse Goertzel Algorithm 89 | w->x0 = fma(M, x0, -x1); 90 | w->y0 = fma(M, y0, -y1); 91 | w->x1 = x0; 92 | w->y1 = y0; 93 | 94 | // update height in cap position 95 | w->z -= w->dz; 96 | 97 | return v; 98 | } 99 | 100 | #endif 101 | -------------------------------------------------------------------------------- /src/SFH/tests/.gitignore: -------------------------------------------------------------------------------- 1 | * 2 | !*.c 3 | !*.h 4 | !Makefile -------------------------------------------------------------------------------- /src/SFH/tests/Makefile: -------------------------------------------------------------------------------- 1 | # Dumb mini makefile: 2 | # 0) assumes clang/GCC like options 3 | # 1) every .c file is to be built into an executable 4 | 5 | # if CC is the default (not environment varible nor supplied to make, then default 6 | ifeq ($(origin CC),default) 7 | CC = clang-15 8 | endif 9 | 10 | IDIRS = -I../.. -I.. 
11 | CFLAGS = -g3 -O3 ${IDIRS} -march=native -Wall -Wextra -Wconversion -Wpedantic -Wno-unused-function -fno-math-errno -ffp-contract=off 12 | LDLIBS = -lm 13 | 14 | SRC := ${wildcard *.c} 15 | HEADERS := ${wildcard *.h} 16 | TARGETS := ${SRC:.c=} 17 | 18 | all: ${TARGETS} 19 | 20 | clean: 21 | -${RM} ${TARGETS} 22 | 23 | distclean: clean 24 | -${RM} .makedep *~ 25 | 26 | .makedep: 27 | @-echo "building dependencies" 28 | @-echo "# autogenerated by Makefile" > .makedep 29 | @$(foreach file, $(SRC), ${CC} ${IDIRS} -MM -MQ${file:.c=} $(file) >> .makedep;) 30 | 31 | 32 | %:%.c Makefile 33 | ${CC} ${CFLAGS} $< ${LDLIBS} -o $@ 34 | 35 | -include .makedep 36 | -------------------------------------------------------------------------------- /src/SFH/tests/carryless.c: -------------------------------------------------------------------------------- 1 | 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | #define CARRYLESS_IMPLEMENTATION 8 | 9 | #include "carryless.h" 10 | #include "prng_small.h" 11 | 12 | #if 0 13 | 14 | static const uint32_t test_runs = 0x10000000; 15 | 16 | uint32_t cl_unit_rng_32(rng_t* rng) { return rng_next_u32(rng)|1u; } 17 | uint32_t cr_unit_rng_32(rng_t* rng) { return rng_next_u32(rng)|(1u<<31); } 18 | uint32_t cc_unit_rng_32(rng_t* rng) 19 | { 20 | uint32_t r = cl_unit_rng_32(rng); return r ^= (r>>1); 21 | } 22 | 23 | uint64_t cl_unit_rng_64(rng_t* rng) { return rng_next_u64(rng)|1u; } 24 | uint64_t cr_unit_rng_64(rng_t* rng) { return rng_next_u64(rng)|(1u<<63); } 25 | uint64_t cc_unit_rng_64(rng_t* rng) 26 | { 27 | uint64_t r = cl_unit_rng_64(rng); return r ^= (r>>1); 28 | } 29 | 30 | typedef struct { 31 | char* name; 32 | uint32_t identity; 33 | uint32_t (*draw)(rng_t*); 34 | uint32_t (*inv)(uint32_t); 35 | uint32_t (*mul)(uint32_t,uint32_t); 36 | } def_32_t; 37 | 38 | typedef struct { 39 | char* name; 40 | uint64_t identity; 41 | uint64_t (*draw)(rng_t*); 42 | uint64_t (*inv)(uint64_t); 43 | uint64_t (*mul)(uint64_t,uint64_t); 44 | } 
def_64_t; 45 | 46 | 47 | uint32_t test_32(def_32_t* def, rng_t* rng) 48 | { 49 | uint32_t errors = 0; 50 | 51 | for(uint32_t i=0; idraw(rng); 53 | uint32_t b = def->mul(a); 54 | uint32_t r = def->mul(a,b); 55 | 56 | if (r != def->identity) { 57 | printf("%08x : %08x %08x\n",a,b,r); 58 | } 59 | } 60 | 61 | return errors; 62 | } 63 | 64 | uint64_t test_64(def_64_t* def, rng_t* rng) 65 | { 66 | uint64_t errors = 0; 67 | 68 | for(uint64_t i=0; idraw(rng); 70 | uint64_t b = def->mul(a); 71 | uint64_t r = def->mul(a,b); 72 | 73 | if (r != def->identity) { 74 | printf("%016lx : %016lx %016lx\n",a,b,r); 75 | } 76 | } 77 | 78 | return errors; 79 | } 80 | 81 | #endif 82 | 83 | 84 | #if 0 85 | void test_foo(prng_t* prng) 86 | { 87 | for(uint32_t i=0; i<0xffffff; i++) { 88 | uint64_t a = prng_u64(prng); 89 | uint64_t b = prng_u64(prng); 90 | uint64_t r0 = cl_gcd_64(a,b); 91 | 92 | cl_gcd_u64_t p = cl_extended_gcd_64(a,b); 93 | 94 | uint64_t r1 = cl_mul_64(a,p.x) ^ cl_mul_64(b,p.y); 95 | 96 | if (r0 != r1 || r0 != p.r) { 97 | printf("%016lx,%016lx = %016lx %016lx\n",a,b,r0,r1); 98 | } 99 | 100 | } 101 | } 102 | #endif 103 | 104 | 105 | int main(void) 106 | { 107 | return 0; 108 | } 109 | -------------------------------------------------------------------------------- /src/SFH/vec2.h: -------------------------------------------------------------------------------- 1 | // Marc B. Reynolds, 2016-2025 2 | // Public Domain under http://unlicense.org, see link for details. 3 | // 4 | // This is utter junk example code for some posts. 
5 | // quick cut/paste/mod so errors are likely 6 | 7 | #ifndef VEC2_H 8 | #define VEC2_H 9 | 10 | #ifndef F32_UTIL_H 11 | #include "f32_util.h" 12 | #endif 13 | 14 | typedef union { 15 | struct { float x, y; }; 16 | float f[2]; 17 | } vec2_t; 18 | 19 | #define AX a->x 20 | #define AY a->y 21 | #define BX b->x 22 | #define BY b->y 23 | 24 | static inline void vec2_set(vec2_t* d, float x, float y) 25 | { 26 | d->x = x; d->y = y; 27 | } 28 | 29 | 30 | static inline void vec2_zero(vec2_t* v) { vec2_set(v,0.f,0.f); } 31 | static inline void vec2_dup(vec2_t* d, vec2_t* s) { vec2_set(d,s->x,s->y); } 32 | 33 | static inline void vec2_add(vec2_t* d, vec2_t* a, vec2_t* b) 34 | { 35 | d->x = AX+BX; 36 | d->y = AY+BY; 37 | } 38 | 39 | static inline void vec2_sub(vec2_t* d, vec2_t* a, vec2_t* b) 40 | { 41 | d->x = AX-BX; 42 | d->y = AY-BY; 43 | } 44 | 45 | static inline float vec2_dot(vec2_t* a, vec2_t* b) 46 | { 47 | return AX*BX + AY*BY; 48 | } 49 | 50 | static inline float vec2_norm(vec2_t* a) 51 | { 52 | return vec2_dot(a,a); 53 | } 54 | 55 | static inline void vec2_neg(vec2_t* a) 56 | { 57 | vec2_set(a,-AX,-AY); 58 | } 59 | 60 | // complex: conjugate (a^*) 61 | static inline void vec2_conj(vec2_t* a) 62 | { 63 | a->y = -a->y; 64 | } 65 | 66 | static inline void vec2_ortho(vec2_t* d, vec2_t* a) 67 | { 68 | vec2_set(d,-AY,AX); 69 | } 70 | 71 | // complex: d=ab 72 | static inline void vec2_mul(vec2_t* d, vec2_t* a, vec2_t* b) 73 | { 74 | vec2_set(d,AX*BX-AY*BY, AX*BY+AY*BX); 75 | } 76 | 77 | // complex: d=ab^* 78 | static inline void vec2_mulc(vec2_t* d, vec2_t* a, vec2_t* b) 79 | { 80 | vec2_set(d,AX*BX+AY*BY, AY*BX-AX*BY); 81 | } 82 | 83 | // complex: d=a^2 84 | static inline void vec2_sq(vec2_t* d, vec2_t* a) 85 | { 86 | float x=a->x, y=a->y, xy=x*y; 87 | vec2_set(d,(x+y)*(x-y),xy+xy); 88 | } 89 | 90 | // complex: d=sqrt(a) where |a|=1 91 | static inline void vec2_usqrt(vec2_t* d, vec2_t* a) 92 | { 93 | float x = a->x, y = a->y; 94 | float m = x+1.f; m = f32_sqrtf(m+m); 
// 3D float vector.  Components can be accessed by name (x, y, z) or by
// index (f[0..2]).
typedef union {
  struct{ float x, y, z; };
  float f[3];
} vec3_t;

// shorthand for the components of the operands named 'a' and 'b'
#define AX a->x
#define AY a->y
#define AZ a->z
#define BX b->x
#define BY b->y
#define BZ b->z

// v = (x,y,z)
static inline void vec3_set(vec3_t* v, float x, float y, float z)
{
  v->x=x; v->y=y; v->z=z;
}

// a = b
static inline void vec3_dup(vec3_t* a, const vec3_t* b) { vec3_set(a, b->x,b->y,b->z); }

// v = (0,0,0)
static inline void vec3_zero(vec3_t* v) { vec3_set(v, 0.f,0.f,0.f); }

// d = a+b  (component-wise; 'd' may alias 'a' and/or 'b')
static inline void vec3_add(vec3_t* d, const vec3_t* a, const vec3_t* b)
{
  d->x = AX + BX;  d->y = AY + BY;  d->z = AZ + BZ;
}

// d = a-b  (component-wise; 'd' may alias 'a' and/or 'b')
static inline void vec3_sub(vec3_t* d, const vec3_t* a, const vec3_t* b)
{
  d->x = AX - BX;  d->y = AY - BY;  d->z = AZ - BZ;
}

// returns the dot product a.b
static inline float vec3_dot(const vec3_t* a, const vec3_t* b)
{
  return AX*BX + AY*BY + AZ*BZ;
}

// returns a.a, i.e. the SQUARED length of 'a' (no square root is taken)
static inline float vec3_norm(const vec3_t* a) { return vec3_dot(a,a); }

// in-place: v = s*v
static inline void vec3_scale(vec3_t* v, float s)
{
  v->x *= s;
  v->y *= s;
  v->z *= s;
}

// in-place: v = -v
static inline void vec3_neg(vec3_t* v) { v->x = -v->x; v->y = -v->y; v->z = -v->z; }

// r = a x b.  The result is built in locals first so 'r' may alias an input.
static inline void vec3_cross(vec3_t* r, const vec3_t* a, const vec3_t* b)
{
  float x = a->y*b->z - a->z*b->y;
  float y = a->z*b->x - a->x*b->z;
  float z = a->x*b->y - a->y*b->x;
  vec3_set(r, x, y, z);
}

// r = component-wise (Hadamard) product of 'a' and 'b'
static inline void vec3_hmul(vec3_t* r, const vec3_t* a, const vec3_t* b)
{
  vec3_set(r, AX*BX, AY*BY, AZ*BZ);
}

// a = s*b
static inline void vec3_set_scale(vec3_t* a, const vec3_t* b, float s)
{
  vec3_set(a, s*BX, s*BY, s*BZ);
}

// Given 'v', fill 'xp' and 'yp' with two vectors that complete a basis with it.
// SEE: marc-b-reynolds.github.io/quaternions/2016/07/06/Orthonormal.html
// for math and other versions
//
// NOTE(review): presumably 'v' must be unit length for {xp,yp,v} to be
// orthonormal (as in the linked post) - confirm against callers.
static inline void vec3_ortho_basis(const vec3_t* v, vec3_t* xp, vec3_t* yp)
{
  // this assumes v->z not approaching -1: the divide below is by (1+z)
  float x  = -v->x;
  float y  = v->y;
  float z  = v->z;

  float a  = y/(1.f + z);
  float b  = y*a;
  float c  = x*a;

  vec3_set(xp, z+b, c,      x);
  vec3_set(yp, c,   1.f-b, -y);
}

#undef AX
#undef AY
#undef AZ
#undef BX
#undef BY
#undef BZ
#ifndef WELFORD_H
#define WELFORD_H

// The header is self-contained: everything it calls is declared here rather
// than relying on what the includer happened to include first.
#include <math.h>   // fma, sqrt
#include <stdio.h>  // printf

// Welford's method for streaming mean/variance/stdev
//   n = number of samples added so far (kept as a double)
//   m = running mean
//   s = running sum of squared deviations from the mean
typedef struct { double n,m,s; } seq_stats_t;

// reset to the empty state (no samples)
static inline void seq_stats_init(seq_stats_t* d)
{
  d->n = 0.0;
  d->m = 0.0;
  d->s = 0.0;
}

// add one sample 'v' and update the running mean and squared-deviation sum
static inline void seq_stats_add(seq_stats_t* d, float v)
{
  double x = (double)v;

  d->n += 1.0;

  double m  = d->m;
  double s  = d->s;
  double dm = x-m;      // deviation from the previous mean

  d->m = m + dm/d->n;

  // s += (x - old mean)*(x - new mean)
  d->s = fma(dm, x-d->m, s);
}

// running mean of the samples added so far
static inline double seq_stats_mean(const seq_stats_t* d) { return d->m; }

// sample variance s/(n-1).  There is no guard for fewer than two samples:
// with n == 1 this evaluates 0/0.
static inline double seq_stats_variance(const seq_stats_t* d) { return d->s/(d->n-1.0); }

// square root of seq_stats_variance
static inline double seq_stats_stddev(const seq_stats_t* d) { return sqrt(seq_stats_variance(d)); }

// print a one line summary to stdout
static inline void seq_stats_print(const seq_stats_t* d)
{
  // the count is printed through a 64-bit integer: converting a double that
  // does not fit in 32 bits to uint32_t is undefined behavior.
  printf("mean=%f,variance=%f,std-dev=%f, count=%llu\n",
         d->m,
         seq_stats_variance(d),
         seq_stats_stddev(d),
         (unsigned long long)d->n
         );
}

#endif
// for first step of splitting a constant K into a multiplicative pair {H,L}
// so K ~= H*L.  For adding K to 'x':  x+K ~= fma(H,L,x)
// SEE: https://marc-b-reynolds.github.io/math/2020/01/09/ConstAddMul.html

// k     constant to split
// bits  number of bits 'k' is rounded to
//
// returns [|f,m,e,d|] where:
//   f = 'k' rounded to nearest at 'bits' bits
//   m = integer mantissa of 'f' scaled up to a full 'bits' wide integer
//   e = exponent matching 'm' (f = m*2^e)
//   d = mantissa of the gap between the round-up of 'k' and 'f'
procedure add_k(k,bits)
{
  var f,d,p,e,m;
  f = round(k,bits,RN);
  d = mantissa(round(k,bits,RU)-f);
  m = mantissa(f);
  e = exponent(f);
  p = precision(m);

  // widen the mantissa to exactly 'bits' bits and compensate in the exponent
  m = m*2^(bits-p);
  e = e-bits+p;

  return [|f,m,e,d|];
};

// returns [|mantissa,exponent|] of 'k' rounded to nearest at 'w' bits
procedure add_k_q(k,w) { var f; f=round(k,w,RN); return [|mantissa(f),exponent(f)|]; };

// add_k at 48 and 106 bits (twice the binary32/binary64 significand widths).
// The result is returned so it can be used in expressions as well as
// displayed by a top-level call.
procedure f32_add_k(x) { return add_k(x, 48); };
procedure f64_add_k(x) { return add_k(x,106); };

f32_add_k(Pi);
f64_add_k(Pi);
f32_add_k(2/(sqrt(5)+1));
add_k_q(2/(sqrt(5)+1),48);
// Formally Verified Argument Reduction with a Fused Multiply-Add
// https://hal.archives-ouvertes.fr/hal-00168401

// log2 of the ulp of 'v' for a 'p' bit format, computed as exponent(v)-p+1.
// NOTE(review): f32_ulp/f64_ulp in util.sollya use exponent(v)+precision(v)-p
// instead; the two only agree when precision(v) is 1.  Confirm which is meant.
procedure log2_ulp_p(v,p) { return exponent(v)-p+1; };

// Builds the constants for reducing by 'c' in a 'p' bit format.
// returns [|r,c1,c2,t|] where:
//   r  = 1/c rounded to 'p' bits
//   c1 = 1/r rounded to 'p-2' bits
//   c2 = (c-c1) snapped to a multiple of 2^t, then rounded to 'p' bits
//   t  = log2_ulp_p(c1,p)-p+4
// No validity checks are performed on the result (still a TODO).
procedure arg_reduce_p(c,p)
{
  var r,c1,c2,t;

  r  = round(1/c,p,  RN);
  c1 = round(1/r,p-2,RN);
  t  = log2_ulp_p(c1,p)-p+4;
  c2 = nearestint((c-c1)*2^-t)*2^t;
  c2 = round(c2,p,RN);

  // add checks

  //if (abs()) then { };

  return [|r,c1,c2,t|];
};

// Same computation as arg_reduce_p.  This was a verbatim copy of its body;
// it now delegates so the two cannot drift apart.
procedure arg_reduce_p_(c,p)
{
  return arg_reduce_p(c,p);
};

// two constant Cody-Waite
// returns [|c1,c2|]: c1 = 'c' rounded to 'b' bits, c2 = (c-c1) rounded to 'p' bits
procedure arg_reduce_cw2(c,p,b)
{
  var c1,c2;
  c1 = round(c,b,RN);
  c2 = round(c-c1,p,RN);

  return [|c1,c2|];
};
// make a copy of list with f applied
procedure list_apply(l,f)
{
  var r,v,i;

  r = [||];

  for i from 0 to length(l)-1 do {
    v = f(l[i]);
    r = r :. v;
  };

  return r;
};

u_round_f32 = proc(n) { return single(n); };

// make a copy of list with elements rounded to binary32
procedure list_to_f32(l) { return list_apply(l, u_round_f32); };

// Writes two C arrays and a peak-error comment describing the error of
// approximation 'a':
//   <name>_emax[] : zeros of the derivative of (a-f), i.e. where the error peaks
//   <name>_emin[] : zeros of (a-f), i.e. where the error vanishes
//
// name  prefix of name to use
// a     approximation of 'f'
// f     function
// r     range
procedure a_error(name,a,f,r)
{
  var e,z,l,i;

  // create a debug list of values of 'x' where the approximation
  // error is max.
  z = list_to_f32(dirtyfindzeros(diff(a-f), r));
  l = length(z);

  write("const float " @ name @ "_emax[] = {");

  // the list can be empty (error monotonic on 'r'): guard the first element
  // the same way the _emin list below does.
  if (l > 0) then {
    write(z[0] @ "f");
    for i from 1 to l-1 do { write("," @ z[i] @ "f"); };
  };

  z = list_to_f32(dirtyfindzeros(a-f, r));
  l = length(z);

  write("};\nconst float " @ name @ "_emin[] = {");

  if (l > 0) then {
    write(z[0] @ "f");
    for i from 1 to l-1 do { write("," @ z[i] @ "f"); };
  };

  e = single(dirtyinfnorm(a-f, r));
  print("};\n\n// peak-error: " @ e);
};

// dump-out copy-n-pastable list of coefficients, peak-error
// and locations of peak-error.
// name  name of the emitted C function
// p     polynomial whose coefficients are listed (constant term first)
procedure print_source(name,p)
{
  var e,i;

  e = degree(p);

  // the polynomial itself is shown in 'powers' form, the table in decimal
  display=powers!;
  print("// " @ name @ "[x_] = " @ p @ ";");
  display=decimal!;

  write("inline float " @ name @ "(float x)\n{\n const float c[] = {\n ");

  write(coeff(p,0) @ "f");

  for i from 1 to e do {
    write(",\n " @ coeff(p,i) @ "f");
  };

  print(" };\n\n return f32_poly_p" @ e @ "(x,c);\n}\n");
};

// returns value of field named 'f' from struct 's' if it matches, otherwise
// returns 'd'.  'd' is spliced into source text, so it cannot be a string
// (use s_get_string_field for that).
procedure s_get_field(s,f,d)
{
  var t;
  t = "match s with { ." @ f @ " = default } : (s."@ f @") default: ("@ d @ ") ;";
  return parse(t);
};

// as s_get_field, but the default 'd' is emitted as a quoted string
procedure s_get_string_field(s,f,d)
{
  var t;
  t = "match s with { ." @ f @ " = default } : (s."@ f @") default: (\""@ d @ "\") ;";
  return parse(t);
};
//verbosity=10;

// Example: polynomial approximations of arctangent on [0, tan(pi/8)],
// with an html error report.
// load the library
execute("../util.sollya");
execute("../struct.sollya");
execute("../approx.sollya");
execute("../plotly.sollya");

// temp hack..html is too big for a gist
_plotly._points = 200;

// ***********************************************************************
// Use the library to build some approximations

print("creating approximations");

// set up a template structure for the full range.  Only relative error
// approximations are built from it below.
atan_r = a_setup(atan(x), [0;tan(pi/8)]);

// build relative error: start at 2 terms, then grow one term at a time
print(" rel error");
atan_r2 = s_copy(atan_r); // make a copy so the template stays untouched
atan_r2._nterms = 2;
atan_r2._min_type = relative;
atan_r2 = a_build_approx(atan_r2);
atan_r3 = a_add_term(atan_r2);
atan_r4 = a_add_term(atan_r3);
atan_r5 = a_add_term(atan_r4);
atan_r6 = a_add_term(atan_r5);

// temp hack version of creating a listing: wrap the generated body of the
// atan_r5 approximation in a C function, then show its error and polynomial
atan_r5 = a_make_listing(atan_r5);
write("atan_" @ atan_r5._nterms @ "(float x) {\n float r;\n");
atan_r5._listing;
print(" return r;\n}\n");
atan_r5._infnorm;
atan_r5._poly;

// ***********************************************************************
// start building an html report
print("building report");

page.new("atan_pi8.html");

// reports are associated with figures. start a new figure and
// all "text" added appears above it.
plotly_new_error_figure("figure 1");

page.section("arctangent");

// the title's range sqrt(2)-1 is tan(pi/8), the upper bound used in a_setup
page.subsection("polynomial relative error \\(x \\in \\pm \\left( \\sqrt{2} - 1 \\right) \\)");
page.add_trace(atan_r2);
page.add_trace(atan_r3);
page.add_trace(atan_r4);
page.add_trace(atan_r5);
page.add_trace(atan_r6);

// NOTE(review): no traces are added to this second figure - confirm it is intended
plotly_new_error_figure("figure 2");

plotly_build_pages();
plotly_show_html("atan_pi8.html"); // automatically open the result
// make html plots of approximations of sin(pi/4 x) and cos(pi/4 x) on [-1,1]

//verbosity=10;

// load the library
execute("../util.sollya");
execute("../struct.sollya");
execute("../approx.sollya");
execute("../plotly.sollya");

// temp hack..html is too big for a gist
_plotly._points = 200;

// setup functions to approximate: 2 terms to start, minimizing absolute error
sin_2 = a_setup(sin((pi/4)*x), [-1;1]);
sin_2._nterms = 2;
sin_2._min_type = absolute;

cos_2 = a_setup(cos((pi/4)*x), [-1;1]);
cos_2._nterms = 2;
cos_2._min_type = absolute;

// build the 2 term version, then grow it one term at a time
print("creating sin");
sin_2 = a_build_approx(sin_2);
sin_3 = a_add_term(sin_2);
sin_4 = a_add_term(sin_3);

// constrained variants ('c' suffix).
// NOTE(review): presumably pins the value at x=1 to sqrt(2)/2 (= sin(pi/4));
// a_constrain is defined in approx.sollya - confirm the argument meaning.
sin_2c = a_constrain(sin_2, 1, sqrt(2)/2);
sin_3c = a_constrain(sin_3, 1, sqrt(2)/2);
sin_4c = a_constrain(sin_4, 1, sqrt(2)/2);

print("creating cos");
cos_2 = a_build_approx(cos_2);
cos_3 = a_add_term(cos_2);
cos_4 = a_add_term(cos_3);

cos_2c = a_constrain(cos_2, 1, sqrt(2)/2);
cos_3c = a_constrain(cos_3, 1, sqrt(2)/2);
cos_4c = a_constrain(cos_4, 1, sqrt(2)/2);

print("building report");

page.new("disc.html");

// one figure per function; constrained and unconstrained traces are paired
plotly_new_error_figure("~sin(pi x/4)");
page.section("Approximation of sin");

page.add_trace(sin_2c);
page.add_trace(sin_2);
page.add_trace(sin_3c);
page.add_trace(sin_3);
page.add_trace(sin_4c);
page.add_trace(sin_4);

plotly_new_error_figure("~cos(pi x/4)");
page.section("Approximation of cos");
page.add_trace(cos_2c);
page.add_trace(cos_2);
page.add_trace(cos_3c);
page.add_trace(cos_3);
page.add_trace(cos_4c);
page.add_trace(cos_4);

plotly_build_pages();
plotly_show_html("disc.html"); // automatically open the result

// emit a listing for each approximation, named by the second argument
print("making listings");

a_to_inline_f32(sin_2, "sin2");
a_to_inline_f32(sin_2c, "sin2c");
a_to_inline_f32(sin_3, "sin3");
a_to_inline_f32(sin_3c, "sin3c");
a_to_inline_f32(sin_4, "sin4");
a_to_inline_f32(sin_4c, "sin4c");

a_to_inline_f32(cos_2, "cos2");
a_to_inline_f32(cos_2c, "cos2c");
a_to_inline_f32(cos_3, "cos3");
a_to_inline_f32(cos_3c, "cos3c");
a_to_inline_f32(cos_4, "cos4");
a_to_inline_f32(cos_4c, "cos4c");
//verbosity=10;

execute("../util.sollya");
execute("../struct.sollya");
execute("../approx.sollya");
execute("../plotly.sollya");

// In this directory run: sollya foo.sollya

// Approximate sin(asin(x)/3) on [0, 1]
// * single precision (the default)
// * minimize relative error (the default)
// * use 6 non-zero terms
// * allow the library to detect the symmetry

// set-up
f_a = a_setup(sin(asin(x)/3), [0;1]);
f_a._nterms = 6;
// (the commented lines below are left over from sin.sollya and refer to 'sin_a')
//sin_a._min_type = absolute; // default is rel
// some explicit set-up
//sin_a._symmetry = A_SYMMETRY_EVEN; // symmetry
//sin_a._degree = [|1, 3, 5, 7, 9|]; // monomials to use (ignore _nterms)

// make the approximation and display its error
f_a = a_build_approx(f_a);
f_a._infnorm;

// display the polynomial
f_a._poly;

// temp hack version of creating a listing: wrap the generated body in a
// C function
f_a = a_make_listing(f_a);
write("f_" @ f_a._nterms @ "(float x) {\n float r;\n");
f_a._listing;
print(" return r;\n}\n");
//verbosity=10;

execute("../util.sollya");
execute("../struct.sollya");
execute("../approx.sollya");
execute("../plotly.sollya");

//ln2 = a_setup(log2(x), [1/2;1]);
//ln2._nterms = 5;
//ln2 = a_build_approx(ln2);

// my scripts are choking on this..go manual.
// not transforming log2 is a poke in the eye

// direct fit: degree 5, binary32 coefficients, absolute error, on [1/2, 1]
R = [1/2;1];
P = fpminimax(log2(x), 5, [|24...|],R, floating, absolute);
E = log2(x)-P;                         // error function
INFN = single(dirtyinfnorm(E,R));      // peak error
Z = list_to_f32(dirtyfindzeros(E,R));  // where the error is zero

print("no transform");
print("// error =", INFN);
print("// zero =", Z);
P;

print("");
print("transform");

// transformed fit: approximate 2*atanh(x)/log(2) with 2 odd terms.
// NOTE(review): the bound looks like 3-2*sqrt(2) (about 0.17157) - confirm.
//l2._infnorm;
R = [-.1715728752538099023966;.1715728752538099023966]; // range of the approximation
ch = a_setup(2*atanh(x)/log(2), R);
ch._nterms = 2;
ch._symmetry = A_SYMMETRY_ODD; // too dumb (symmetry is set by hand here)
ch = a_build_approx(ch);
ch._infnorm;
ch._poly;

// NOTE(review): this prints "transform" a second time with nothing live after
// it (the rest of the script is commented out) - probably a leftover.
print("");
print("transform");

//F = 2*atanh(x)/log(2); // function to approximate

//T = fixed;
//T = floating;

//P = fpminimax(2*atanh(x)/log(2), {1,3,5,7}, [|24...|],R, T, absolute);
//display=decimal!;
//P;
//display=hexadecimal!;
//P;
//display=decimal!;
//print("norm = ", accurateinfnorm(F-P, R, 23));
//verbosity=10;

execute("../util.sollya");
execute("../struct.sollya");
execute("../approx.sollya");
execute("../plotly.sollya");

// In this directory run: sollya sin.sollya

// Approximate sin on [-pi/2, pi/2]
// * single precision (the default)
// * minimize relative error (the default)
// * use 5 non-zero terms
// * allow the library to detect the symmetry

// set-up
sin_a = a_setup(sin(x), [-pi/2;pi/2]);
sin_a._nterms = 5;
//sin_a._min_type = absolute; // default is rel
// some explicit set-up
// NOTE(review): sin is an odd function and the degree list below is odd, so
// A_SYMMETRY_EVEN in this disabled line looks wrong - check before enabling.
//sin_a._symmetry = A_SYMMETRY_EVEN; // symmetry
//sin_a._degree = [|1, 3, 5, 7, 9|]; // monomials to use (ignore _nterms)

// make the approximation and display its error
sin_a = a_build_approx(sin_a);
sin_a._infnorm;

// temp hack version of creating a listing: wrap the generated body in a
// C function
sin_a = a_make_listing(sin_a);
write("sin_core_" @ sin_a._nterms @ "(float x) {\n float r;\n");
sin_a._listing;
print(" return r;\n}\n");


//page.new("sin.html");
//page.add_trace(sin_a);
//plotly_build_pages();
//plotly_show_html("sin.html"); // automatically open the result
//verbosity=10;

execute("../util.sollya");
execute("../struct.sollya");
execute("../approx.sollya");
execute("../plotly.sollya");

// In this directory run: sollya sincospi.sollya (BROKEN HACK)

// Approximate sin(pi/4 x) and cos(pi/4 x) on [0, 1]
// * single precision (the default)
// * minimize relative error (the default)
// * use 5 non-zero terms
// * allow the library to detect the symmetry

// only used by the cos set-up below; the sin set-up spells pi/4 out
scale = pi/4;

// set-up
sin_a = a_setup(sin(pi/4*x), [0;1]);
sin_a._nterms = 5;
// some explicit set-up
//sin_a._symmetry = A_SYMMETRY_EVEN; // symmetry
//sin_a._degree = [|1, 3, 5, 7, 9|]; // monomials to use (ignore _nterms)
print("here");
// NOTE(review): unlike the cos section below (and sin.sollya) there is no
// a_build_approx call before the listing is made - likely part of why this
// script is marked broken.
sin_a = a_make_listing(sin_a);
write("sin_core_" @ sin_a._nterms @ "(float x) {\n float r;\n");
sin_a._listing;
print(" return r;\n}\n");


// the script stops here: everything below is never executed
quit;
cos_a = a_setup(cos(scale*x), [0;1]);

// make the approximation
cos_a = a_build_approx(cos_a);
cos_a._infnorm;

// temp hack version of creating a listing
cos_a = a_make_listing(cos_a);
write("cos_core_" @ cos_a._nterms @ "(float x) {\n float r;\n");
cos_a._listing;
print(" return r;\n}\n");


//page.new("sin.html");
//page.add_trace(sin_a);
//plotly_build_pages();
//plotly_show_html("sin.html"); // automatically open the result
// given a constant 'x', split it into an unevaluated pair: x ~= (h+l)

// Prints a C definition of a binary32 pair for 'x':
//   h = x rounded to binary32, l = (x-h) rounded to binary32
// 'name' is appended to "f32_mul_k_" to form the emitted identifier.
procedure f32_mul_k(name,x)
{
  var h,l,e;
  h = single(x);
  l = single(x-h);
  print("const f32_pair_t f32_mul_k_" @ name @
        " = {.h = " @ h @ "f, .l=" @ l @ "f};");
};

// binary64 version of f32_mul_k (identifier prefix "f64_mul_k_")
procedure f64_mul_k(name,x)
{
  var h,l;
  h = double(x);
  l = double(x-h);
  print("const f64_pair_t f64_mul_k_" @ name @
        " = {.h = " @ h @ ", .l=" @ l @ "};");
};



// disabled table generator: change 'false' to 'true' to print the constants
if (false) then {
  display = hexadecimal!;

  print("// extended precision multiplicative constants as unevaluate pairs: {RN(K) + RN(K-RN(K))}");
  f32_mul_k("pi ", Pi);
  f32_mul_k("pi_i ", 1/Pi);
  f32_mul_k("log2 ", log(2));
  f32_mul_k("log2_i ", 1/log(2));
  f32_mul_k("log10 ", log(10));
  f32_mul_k("log10_i", 1/log(10));
  f32_mul_k("e ", exp(1));
  f32_mul_k("e_i ", 1/exp(1));

  print("\n// extended precision multiplicative constants as unevaluate pairs: {RN(K) + RN(K-RN(K))}");
  f64_mul_k("pi ", Pi);
  f64_mul_k("pi_i ", 1/Pi);
  f64_mul_k("log2 ", log(2));
  f64_mul_k("log2_i ", 1/log(2));
  f64_mul_k("log10 ", log(10));
  f64_mul_k("log10_i", 1/log(10));
  f64_mul_k("e ", exp(1));
  f64_mul_k("e_i ", 1/exp(1));
};
  // Builds a 'match' expression as text and parses it: yields s.<f> when 's'
  // matches a structure with field 'f', otherwise the default 'd'.
  // 'd' is spliced into the text unquoted, hence the no-strings restriction.
  procedure s_get_field(s,f,d)
  {
    var t;
    t = "match s with { ." @ f @ " = default } : (s."@ f @") default: ("@ d @ ") ;";
    return parse(t);
  };

  // as s_get_field, but the default 'd' is wrapped in quotes so it can be a string
  procedure s_get_string_field(s,f,d)
  {
    var t;
    t = "match s with { ." @ f @ " = default } : (s."@ f @") default: (\""@ d @ "\") ;";
    return parse(t);
  };


  // returns true if 's' is a structure with field named 'f'
  procedure s_has_field(s,f)
  {
    var t;
    t = "match s with { ." @ f @ " = default } : (true) default: (false) ;";
    return parse(t);
  };

  // not done
  //procedure s_has_fields(l=...)
  //{
  //  var s,t,e;
  //  e = length(l);
  //};


  // returns a copy of 's': the structure is converted to its textual form
  // and parsed back ('S' is declared but unused)
  procedure s_copy(s)
  {
    var t,S;
    t = "" @ s;
    return parse(t);
  };
};
// Small utility library.  Guarded so that executing the file more than once
// does not redefine everything.
if (!isbound(_util_library_loaded)) then {

_util_library_loaded = true;

// round the coefficients of polynomial 'p' to 24 bits (binary32)
procedure coeff_f32(p)
{
  return roundcoefficients(p,[|24...|]);
};

// round the coefficients of polynomial 'p' to 53 bits (binary64)
procedure coeff_f64(p)
{
  return roundcoefficients(p,[|53...|]);
};

// true if 'a' and 'b' differ by less than 'd'
procedure within_dx(a,b,d)
{
  return abs(a-b) < d;
};

// first 'n' odd values
procedure odd_list(n)
{
  var c,l,i;
  c = 2*n;
  l = [||];

  for i from 1 to c by 2 do l = l:.i;

  return l;
};

// first 'n' even values
procedure even_list(n)
{
  var c,l,i;
  c = 2*n-1;
  l = [||];

  for i from 0 to c by 2 do l = l:.i;

  return l;
};

// the values 0 to n-1
procedure full_list(n)
{
  var l,i;
  l = [||];

  for i from 0 to n-1 do l = l:.i;

  return l;
};

// joins the arguments into one comma separated string.
// No arguments yields the empty string (this used to index element -1).
procedure to_csv_string(l=...)
{
  var s,n,i;

  s = "";
  n = length(l);

  // every element but the last is followed by a comma
  for i from 0 to n-2 do {
    s = s @ l[i] @ ",";
  };

  if (n > 0) then { s = s @ l[n-1]; };

  return s;
};

u_round_f32 = proc(n) { return single(n); };
u_round_f64 = proc(n) { return double(n); };

// make a copy of list with f applied
procedure list_apply(l,f)
{
  var r,v,i;

  r = [||];

  for i from 0 to length(l)-1 do {
    v = f(l[i]);
    r = r :. v;
  };

  return r;
};

// make a copy of list as binary32
procedure list_to_f32(l) { return list_apply(l, u_round_f32); };

// make a copy of list as binary64
procedure list_to_f64(l) { return list_apply(l, u_round_f64); };

// return true if 'v' is in list 'l'
procedure is_in_list(l, v)
{
  var i,r,n;

  i = 0;
  r = false;
  n = length(l);

  // stop at the first match
  while ((i < n) && (!r)) do {
    if (l[i] == v) then {r = true;};
    i := i + 1;
  };

  return r;
};

// scale elem of 'l' by 's' and convert to binary32
// NOTE(review): each element is rounded to binary32 BEFORE it is scaled and
// the product is not rounded again - confirm that order is intended.
procedure scale_list_to_f32(l,s)
{
  var r,v,i;

  r = [||];

  for i from 0 to length(l)-1 do {
    v := s*single(l[i]);
    r = r :. v;
  };

  return r;
};

// for error testing: a*b+c with a single rounding to binary64 / binary32.
// The value is returned so the procedures can be used inside expressions.
procedure fma(a,b,c)  { return double(a*b+c); };
procedure fmaf(a,b,c) { return single(a*b+c); };

// binary64: unit in the first place, unit in the last place, successor and
// predecessor of 'v' (after rounding 'v' to binary64)
procedure f64_ufp(v)  { v = double(v); return 2^(exponent(v)+precision(v)- 1); };
procedure f64_ulp(v)  { v = double(v); return 2^(exponent(v)+precision(v)-53); };
// the neighbours are rounded to binary64; rounding them to binary32 (as was
// done before) collapses v +/- ulp back onto a binary32 value
procedure f64_succ(v) { v = double(v); return double(v+f64_ulp(v)); };
procedure f64_pred(v) { v = double(v); return double(v-f64_ulp(v)); };

// binary32 versions of the above
procedure f32_ufp(v)  { v = single(v); return 2^(exponent(v)+precision(v)- 1); };
procedure f32_ulp(v)  { v = single(v); return 2^(exponent(v)+precision(v)-24); };
procedure f32_succ(v) { v = single(v); return single(v+f32_ulp(v)); };
procedure f32_pred(v) { v = single(v); return single(v-f32_ulp(v)); };

};
requires either of these libraries: 10 | // http://github.com/Marc-B-Reynolds/TestU01x 11 | // http://simul.iro.umontreal.ca/testu01/tu01.html 12 | 13 | #define PRNS_SMALLCRUSH 14 | 15 | // inject this note at the top of the output...for custom 16 | // mixing or whatever other note desired. 17 | #ifndef PRNS_SMALLCRUSH 18 | #define NOTATION "standard" 19 | #else 20 | #define NOTATION "smallcrush" 21 | #endif 22 | 23 | // if defined the lower 32-bit results are used for 24 | // integer test, otherwise the upper. See the docs 25 | // on dropping final right-xorshift. 26 | //#define USE_LOWER_BITS 27 | 28 | // if defined run Smallcrush, otherwise Crush 29 | #define SMALLCRUSH 30 | 31 | // Run with specified inital state. Undefined uses __rdtsc() 32 | //#define INITAL_STATE 0x1L 33 | 34 | // Setup whatever configuration to be tested 35 | 36 | //#define PRNS_MIX_13 37 | 38 | #define PRNS_NO_FINAL_XORSHIFT 39 | 40 | // if defined then the value is the number of time to run the 41 | // battery...otherwise will continue until stopped. 42 | #define NUMBER_OF_RUNS 100 43 | 44 | #ifdef PRNS_WEYL 45 | // just to prevent compile time errors. The battery only test standard 46 | // member access (walking forward). Backward is identical statistical 47 | // properties. 
48 | #define PRNS_WEYL_I 0x1L 49 | #endif 50 | 51 | #define NAME "prns" 52 | 53 | #include 54 | #include 55 | #include 56 | 57 | 58 | #if defined(_MSC_VER) 59 | #define inline _inline 60 | #define I2F (1.0/((1.0*(1<<22))*(1.0*(1<<30)))) 61 | _inline uint32_t __builtin_ctz(uint32_t x) { unsigned long r; _BitScanForward(&r, (unsigned long)x); return (uint32_t)r; } 62 | _inline uint64_t __builtin_clzl(uint64_t x) { unsigned long r; _BitScanReverse64(&r, x); return (uint64_t)r; } 63 | _inline uint64_t __builtin_bswap64(uint64_t x) { return _byteswap_uint64(x); } 64 | #else 65 | #include 66 | #define I2F 0x1p-52f 67 | #endif 68 | 69 | #include "util.h" 70 | #include "unif01.h" 71 | #include "swrite.h" 72 | 73 | #include "../SFH/prns.h" 74 | 75 | prns_t state; 76 | 77 | // for 32-bit integer tests 78 | static uint32_t next_u32(void* p, void* s) 79 | { 80 | uint64_t r = prns_next(&state); 81 | 82 | #if defined(USE_LOWER_BITS) 83 | return (uint32_t)r; 84 | #else 85 | return (uint32_t)(r >> 32); 86 | #endif 87 | } 88 | 89 | // for double tests 90 | static double next_f64(void* p, void* s) 91 | { 92 | uint64_t r = prns_next(&state); 93 | // map to [0,1) 94 | return (r >> 12) * I2F; 95 | } 96 | 97 | // dump start of test state: useful if one wanted to 98 | // check questionable p-values (or to repeat failures 99 | // sanity checks) 100 | static void print_state(void* s) 101 | { 102 | printf(" S = 0x%0" PRIx64 "\n", state.i); 103 | } 104 | 105 | // 106 | unif01_Gen* createGenerator() 107 | { 108 | unif01_Gen* gen = util_Malloc(sizeof(unif01_Gen)); 109 | 110 | gen->state = 0; 111 | gen->param = 0; 112 | gen->name = NAME; 113 | gen->GetU01 = (void*)&next_f64; 114 | gen->GetBits = (void*)&next_u32; 115 | gen->Write = &print_state; 116 | 117 | return gen; 118 | } 119 | 120 | void deleteGenerator(unif01_Gen* gen) 121 | { 122 | if (gen != NULL) { 123 | //util_Free(gen->param); 124 | //util_Free(gen->state); 125 | util_Free(gen); 126 | } 127 | } 128 | 129 | 130 | #include 
"bbattery.h" 131 | 132 | int main(void) 133 | { 134 | unif01_Gen* gen = createGenerator(); 135 | uint64_t s = __rdtsc(); 136 | uint32_t c = -1; 137 | uint32_t t; 138 | 139 | swrite_Basic = FALSE; // only print summary 140 | 141 | uint64_t z = s; 142 | 143 | // dump out the configuration 144 | printf("PRNS with configuation options (" NOTATION ") \n"); 145 | printf(" WEYL: 0x%0" PRIx64 "\n", PRNS_WEYL); 146 | printf(" S0: %d\n", PRNS_MIX_S0); 147 | printf(" S1: %d\n", PRNS_MIX_S1); 148 | #ifndef PRNS_NO_FINAL_XORSHIFT 149 | printf(" S2: %d\n", PRNS_MIX_S2); 150 | #else 151 | printf(" drops last right-xorshift\n"); 152 | #endif 153 | printf(" M0: 0x%0" PRIx64 "\n", PRNS_MIX_M0); 154 | printf(" M1: 0x%0" PRIx64 "\n", PRNS_MIX_M1); 155 | 156 | #ifdef USE_LOWER_BITS 157 | printf(" low bits\n"); 158 | #else 159 | printf(" high bits\n"); 160 | #endif 161 | 162 | // the state is initialized to a sobol sequence in 163 | // an attempt to get as good a coverage of the state 164 | // space as possible. 165 | 166 | do { 167 | // setting the raw state of the generator 168 | state.i = s; 169 | 170 | printf("run %d -- state = 0x%0" PRIx64 "\n", -(int32_t)c, state.i); 171 | 172 | #if defined(SMALLCRUSH) 173 | bbattery_SmallCrush(gen); 174 | #else 175 | bbattery_Crush(gen); 176 | #endif 177 | 178 | #if defined(NUMBER_OF_RUNS) 179 | if (0 - c >= NUMBER_OF_RUNS) break; 180 | #endif 181 | 182 | // could improve the domain of the search 183 | t = __builtin_ctz(c) * 2; 184 | s ^= 0x8000000000000000UL >> t; 185 | c -= 1; 186 | } while (1); 187 | 188 | deleteGenerator(gen); 189 | 190 | return 0; 191 | } 192 | --------------------------------------------------------------------------------