├── README ├── math_acosf.c ├── math_asinf.c ├── math_atan2f.c ├── math_atanf.c ├── math_ceilf.c ├── math_cosf.c ├── math_coshf.c ├── math_debug.c ├── math_expf.c ├── math_fabsf.c ├── math_floorf.c ├── math_fmodf.c ├── math_invsqrtf.c ├── math_ldexpf.c ├── math_log10f.c ├── math_logf.c ├── math_mat2.c ├── math_mat3.c ├── math_mat4.c ├── math_modf.c ├── math_neon.h ├── math_powf.c ├── math_runfast.c ├── math_sincosf.c ├── math_sinf.c ├── math_sinfv.c ├── math_sinhf.c ├── math_sqrtf.c ├── math_sqrtfv.c ├── math_tanf.c ├── math_tanhf.c ├── math_vec2.c ├── math_vec3.c └── math_vec4.c /README: -------------------------------------------------------------------------------- 1 | 2 | Library: MATH-NEON 3 | By: Lachlan Tychsen-Smith 4 | Licence: MIT (expat) 5 | ======================================================================================= 6 | This project implements the cmath functions and some optimised matrix functions 7 | with the aim of increasing the floating point performance of ARM Cortex A-8 8 | based platforms. As well as implementing the functions in ARM NEON assembly, 9 | they sacrifice error checking and some accuracy to achieve better performance. 10 | 11 | Function Errors: 12 | ======================================================================================= 13 | The measurement and characterisation of the inaccuracies present within these 14 | functions is really a field in itself. For the benchmark I provide the 15 | maximum absolute, maximum relative and root mean squared error compared to the 16 | cmath implementations over the specified range. However, these values can be 17 | misleading, especially for functions which quickly go to infinity. So it's always a 18 | good idea to test it within your actual program. In general, this library will not 19 | be as accurate as cmath; however, for many functions it is close enough for the 20 | difference to be negligible. 
21 | 22 | Notes: 23 | ======================================================================================= 24 | - The *_c functions are C implementations of the *_neon code. 25 | - Like cmath, the errors present in the functions are very dependent on the 26 | range in which you are operating. So you should test them first. 27 | - Look in the "math_neon.h" file for descriptions of the functions. In some 28 | function files there are also notes on the specific implementation. 29 | - The *_neon functions make certain assumptions about the location of arguments 30 | that are incompatible with inlining. 31 | 32 | Contact: 33 | ======================================================================================= 34 | Name: Lachlan Tychsen-Smith 35 | Email: lachlan.ts@gmail.com 36 | -------------------------------------------------------------------------------- /math_acosf.c: -------------------------------------------------------------------------------- 1 | /* 2 | The MIT License (MIT) 3 | 4 | Copyright (c) 2015 Lachlan Tychsen-Smith (lachlan.ts@gmail.com) 5 | 6 | Permission is hereby granted, free of charge, to any person obtaining a copy 7 | of this software and associated documentation files (the "Software"), to deal 8 | in the Software without restriction, including without limitation the rights 9 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 | copies of the Software, and to permit persons to whom the Software is 11 | furnished to do so, subject to the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be included in 14 | all copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
/*
Test func : acosf(x)
Test Range: -1.0 < x < 1.0
Peak Error: ~0.005%
RMS  Error: ~0.001%
*/

/* M_PI_2 is a POSIX extension to <math.h>; strict ISO C builds may hide it. */
#ifndef M_PI_2
#define M_PI_2 1.57079632679489661923
#endif

const float __acosf_pi_2 = M_PI_2;

/*
 * Portable C implementation of acosf.
 *
 * Uses the identity acos(x) = pi/2 - asin(x), so the accuracy and the
 * usable input range are whatever asinf_c() provides.
 */
float acosf_c(float x)
{
    return __acosf_pi_2 - asinf_c(x);
}

/*
 * NEON implementation, hard-float variant.
 *
 * NEON path: calls asinf_neon_hfp() and then subtracts the value it left in
 * d0 from pi/2 inside the asm block.  The result stays in d0/s0, which is why
 * this path has no C-level return statement (acosf_neon_sfp() below moves s0
 * to r0 by hand).
 *
 * Non-NEON path: falls back to acosf_c() so the function always returns a
 * defined value instead of running off the end of a non-void function.
 */
float acosf_neon_hfp(float x)
{
#ifdef __MATH_NEON
    asinf_neon_hfp(x);
    asm volatile (
    "vdup.f32 d1, %0 \n\t"              //d1 = {pi/2, pi/2};
    "vsub.f32 d0, d1, d0 \n\t"          //d0 = d1 - d0;
    :: "r"(__acosf_pi_2)
    : "d0", "d1"                        //both registers are written by the asm
    );
#else
    return acosf_c(x);
#endif
}

/*
 * NEON implementation, soft-float variant.
 *
 * NEON path: moves the argument from r0 to s0, runs the hard-float variant
 * and moves the result from s0 back to r0.
 * Non-NEON path: returns acosf_c(x).
 */
float acosf_neon_sfp(float x)
{
#ifdef __MATH_NEON
    asm volatile ("vmov.f32 s0, r0 \n\t");
    acosf_neon_hfp(x);
    asm volatile ("vmov.f32 r0, s0 \n\t");
#else
    return acosf_c(x);
#endif
}
/*
Test func : asinf(x)
Test Range: -1.0 < x < 1.0
Peak Error: ~0.005%
RMS  Error: ~0.001%
*/

/* M_PI_2 is a POSIX extension to <math.h>; strict ISO C builds may hide it. */
#ifndef M_PI_2
#define M_PI_2 1.57079632679489661923
#endif

/* Odd polynomial coefficients, in the order the NEON code loads them. */
const float __asinf_lut[4] = {
    0.105312459675071,  //p7
    0.169303418571894,  //p3
    0.051599985887214,  //p5
    0.999954835104825   //p1
};

const float __asinf_pi_2 = M_PI_2;

/*
 * Portable, branch-free C implementation of asinf.
 *
 * For |x| <= 0.5 the odd polynomial p1*x + p3*x^3 + p5*x^5 + p7*x^7 is
 * evaluated directly.  For |x| > 0.5 the argument is replaced by
 * sqrt((1 - |x|) / 2) and the result becomes pi/2 - 2 * poly(arg).
 * The square root is obtained as the reciprocal of a bit-trick inverse
 * square root, each refined with two Newton steps.  The sign of x is
 * re-applied at the end.
 *
 * x: input; the file header states the tested range as -1.0 < x < 1.0.
 * Returns an approximation of asin(x) in radians.
 */
float asinf_c(float x)
{
    float a, b, c, d, r, ax;
    int m;

    union {
        float   f;
        int     i;
    } xx;

    ax = fabsf(x);          //float-precision abs; was the double-precision fabs()
    d = 0.5;
    d = d - ax*0.5;         //d = (1 - |x|) / 2

    //fast invsqrt approx
    xx.f = d;
    xx.i = 0x5F3759DF - (xx.i >> 1);        //VRSQRTE
    c = d * xx.f;
    b = (3.0f - c * xx.f) * 0.5;            //VRSQRTS
    xx.f = xx.f * b;
    c = d * xx.f;
    b = (3.0f - c * xx.f) * 0.5;
    xx.f = xx.f * b;

    //fast inverse approx: shift the exponent so the value lies in [1, 2),
    //apply a linear estimate, shift back, then two Newton steps
    d = xx.f;
    m = 0x3F800000 - (xx.i & 0x7F800000);
    xx.i = xx.i + m;
    xx.f = 1.41176471f - 0.47058824f * xx.f;
    xx.i = xx.i + m;
    b = 2.0 - xx.f * d;
    xx.f = xx.f * b;
    b = 2.0 - xx.f * d;
    xx.f = xx.f * b;

    //if |x|>0.5 -> x = sqrt((1-x)/2)
    xx.f = xx.f - ax;
    a = (ax > 0.5f);            //1.0f when the substitution applies, else 0.0f
    d = __asinf_pi_2 * a;       //offset: pi/2 or 0
    c = 1.0f - 3.0f * a;        //scale: -2 or 1
    ax = ax + xx.f * a;

    //polynomial evaluation
    xx.f = ax * ax;
    a = (__asinf_lut[0] * ax) * xx.f + (__asinf_lut[2] * ax);
    b = (__asinf_lut[1] * ax) * xx.f + (__asinf_lut[3] * ax);
    xx.f = xx.f * xx.f;
    r = b + a * xx.f;
    r = d + c * r;

    //if x < 0 -> r = -r
    a = r + r;
    b = (x < 0.0f);
    r = r - a * b;
    return r;
}

/*
 * NEON implementation, hard-float variant.
 *
 * NEON path: the asm reads x from d0[0] and leaves the result in s0, so there
 * is no C-level return statement on that path.
 * Non-NEON path: returns asinf_c(x) so the function never runs off the end of
 * a non-void function.
 */
float asinf_neon_hfp(float x)
{
#ifdef __MATH_NEON
    asm volatile (

    "vdup.f32 d0, d0[0] \n\t"               //d0 = {x, x};
    "vdup.f32 d4, %1 \n\t"                  //d4 = {pi/2, pi/2};
    "vmov.f32 d6, d0 \n\t"                  //d6 = d0;
    "vabs.f32 d0, d0 \n\t"                  //d0 = fabs(d0) ;

    "vmov.f32 d5, #0.5 \n\t"                //d5 = 0.5;
    "vmls.f32 d5, d0, d5 \n\t"              //d5 = d5 - d0*d5;

    //fast invsqrt approx
    "vmov.f32 d1, d5 \n\t"                  //d1 = d5
    "vrsqrte.f32 d5, d5 \n\t"               //d5 = ~ 1.0 / sqrt(d5)
    "vmul.f32 d2, d5, d1 \n\t"              //d2 = d5 * d1
    "vrsqrts.f32 d3, d2, d5 \n\t"           //d3 = (3 - d5 * d2) / 2
    "vmul.f32 d5, d5, d3 \n\t"              //d5 = d5 * d3
    "vmul.f32 d2, d5, d1 \n\t"              //d2 = d5 * d1
    "vrsqrts.f32 d3, d2, d5 \n\t"           //d3 = (3 - d5 * d2) / 2
    "vmul.f32 d5, d5, d3 \n\t"              //d5 = d5 * d3

    //fast reciprocal approximation
    "vrecpe.f32 d1, d5 \n\t"                //d1 = ~ 1 / d5;
    "vrecps.f32 d2, d1, d5 \n\t"            //d2 = 2.0 - d1 * d5;
    "vmul.f32 d1, d1, d2 \n\t"              //d1 = d1 * d2;
    "vrecps.f32 d2, d1, d5 \n\t"            //d2 = 2.0 - d1 * d5;
    "vmul.f32 d5, d1, d2 \n\t"              //d5 = d1 * d2;

    //if |x| > 0.5 -> ax = sqrt((1-ax)/2), r = pi/2
    "vsub.f32 d5, d0, d5 \n\t"              //d5 = d0 - d5;
    "vmov.f32 d2, #0.5 \n\t"                //d2 = 0.5;
    "vcgt.f32 d3, d0, d2 \n\t"              //d3 = (d0 > d2);
    "vmov.f32 d1, #3.0 \n\t"                //d1 = 3.0;
    "vshr.u32 d3, #31 \n\t"                 //d3 = d3 >> 31;
    "vmov.f32 d16, #1.0 \n\t"               //d16 = 1.0;
    "vcvt.f32.u32 d3, d3 \n\t"              //d3 = (float) d3;
    "vmls.f32 d0, d5, d3[0] \n\t"           //d0 = d0 - d5 * d3[0];
    "vmul.f32 d7, d4, d3[0] \n\t"           //d7 = d4 * d3[0];
    "vmls.f32 d16, d1, d3[0] \n\t"          //d16 = d16 - d1 * d3[0];

    //polynomial:
    "vmul.f32 d2, d0, d0 \n\t"              //d2 = d0*d0 = {ax^2, ax^2}
    "vld1.32 {d4, d5}, [%0] \n\t"           //d4 = {p7, p3}, d5 = {p5, p1}
    "vmul.f32 d3, d2, d2 \n\t"              //d3 = d2*d2 = {x^4, x^4}
    "vmul.f32 q0, q2, d0[0] \n\t"           //q0 = q2 * d0[0] = {p7x, p3x, p5x, p1x}
    "vmla.f32 d1, d0, d2[0] \n\t"           //d1 = d1 + d0*d2[0] = {p5x + p7x^3, p1x + p3x^3}
    "vmla.f32 d1, d3, d1[0] \n\t"           //d1 = d1 + d3*d1[0] = {..., p1x + p3x^3 + p5x^5 + p7x^7}

    "vmla.f32 d7, d1, d16 \n\t"             //d7 = d7 + d1*d16

    "vadd.f32 d2, d7, d7 \n\t"              //d2 = d7 + d7
    "vclt.f32 d3, d6, #0 \n\t"              //d3 = (d6 < 0)
    "vshr.u32 d3, #31 \n\t"                 //d3 = d3 >> 31;
    "vcvt.f32.u32 d3, d3 \n\t"              //d3 = (float) d3
    "vmls.f32 d7, d2, d3[0] \n\t"           //d7 = d7 - d2 * d3[0];

    "vmov.f32 s0, s15 \n\t"                 //s0 = s15

    :: "r"(__asinf_lut), "r"(__asinf_pi_2)
    : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7",
      "d16"                                 //d16 is written above as well
    );
#else
    return asinf_c(x);
#endif
}

/*
 * NEON implementation, soft-float variant.
 *
 * NEON path: moves the argument from r0 to s0, runs the hard-float variant
 * and moves the result from s0 back to r0.
 * Non-NEON path: returns asinf_c(x).
 */
float asinf_neon_sfp(float x)
{
#ifdef __MATH_NEON
    asm volatile ("vmov.f32 s0, r0 \n\t");
    asinf_neon_hfp(x);
    asm volatile ("vmov.f32 r0, s0 \n\t");
#else
    return asinf_c(x);
#endif
}
/* M_PI and M_PI_2 are POSIX extensions to <math.h>; strict ISO C builds may hide them. */
#ifndef M_PI
#define M_PI 3.14159265358979323846
#endif
#ifndef M_PI_2
#define M_PI_2 1.57079632679489661923
#endif

/* Odd polynomial coefficients, in the order the NEON code loads them. */
const float __atan2f_lut[4] = {
    -0.0443265554792128,    //p7
    -0.3258083974640975,    //p3
    +0.1555786518463281,    //p5
    +0.9997878412794807     //p1
};

const float __atan2f_pi_2 = M_PI_2;

/*
 * Portable, branch-free C implementation of atan2f.
 *
 * Computes c = |y / x| with a fast reciprocal (linear estimate plus two
 * Newton steps).  When c > 1 it is replaced by -1/c with an offset of pi/2.
 * The odd polynomial p1*c + p3*c^3 + p5*c^5 + p7*c^7 is then evaluated, and
 * the quadrant is fixed up from the signs of x and y.  When |x| < 1e-6 the
 * magnitude is forced to pi/2.
 *
 * y: ordinate, x: abscissa (same argument order as the standard atan2f).
 * Returns an approximation of atan2(y, x) in radians.
 */
float atan2f_c(float y, float x)
{
    float a, b, c, r, xx;
    int m;
    union {
        float   f;
        int     i;
    } xinv;

    //fast inverse approximation (2x newton)
    xx = fabsf(x);          //float-precision abs; was the double-precision fabs()
    xinv.f = xx;
    m = 0x3F800000 - (xinv.i & 0x7F800000);
    xinv.i = xinv.i + m;
    xinv.f = 1.41176471f - 0.47058824f * xinv.f;
    xinv.i = xinv.i + m;
    b = 2.0 - xinv.f * xx;
    xinv.f = xinv.f * b;
    b = 2.0 - xinv.f * xx;
    xinv.f = xinv.f * b;

    c = fabsf(y * xinv.f);

    //fast inverse approximation (2x newton)
    xinv.f = c;
    m = 0x3F800000 - (xinv.i & 0x7F800000);
    xinv.i = xinv.i + m;
    xinv.f = 1.41176471f - 0.47058824f * xinv.f;
    xinv.i = xinv.i + m;
    b = 2.0 - xinv.f * c;
    xinv.f = xinv.f * b;
    b = 2.0 - xinv.f * c;
    xinv.f = xinv.f * b;

    //if |x| > 1.0 -> ax = -1/ax, r = pi/2
    xinv.f = xinv.f + c;
    a = (c > 1.0f);
    c = c - a * xinv.f;
    r = a * __atan2f_pi_2;

    //polynomial evaluation
    xx = c * c;
    a = (__atan2f_lut[0] * c) * xx + (__atan2f_lut[2] * c);
    b = (__atan2f_lut[1] * c) * xx + (__atan2f_lut[3] * c);
    xx = xx * xx;
    r = r + a * xx;
    r = r + b;

    //determine quadrant and test for small x.
    b = M_PI;
    b = b - 2.0f * r;
    r = r + (x < 0.0f) * b;         //x < 0 -> r = pi - r
    b = (fabsf(x) < 0.000001f);
    c = !b;
    r = c * r;
    r = r + __atan2f_pi_2 * b;      //tiny x -> magnitude is pi/2
    b = r + r;
    r = r - (y < 0.0f) * b;         //y < 0 -> r = -r

    return r;
}

/*
 * NEON implementation, hard-float variant.
 *
 * NEON path: the asm reads y from d0[0] and x from d0[1] and leaves the
 * result in s0, so there is no C-level return statement on that path.
 * Non-NEON path: returns atan2f_c(y, x).
 *
 * Changes relative to the previous asm:
 *  - vcgt/vclt produce an all-ones lane mask; it is now shifted down to 0/1
 *    before the integer-to-float conversion, as asinf/atanf already do.
 *  - d16-d19 are written by the asm and are now listed as clobbered.
 *
 * NOTE(review): the sign fix-up at the end tests the sign of y/x (d6) only;
 * unlike atan2f_c there is no separate x < 0 quadrant correction, so this
 * path appears to compute atan(y/x).  Confirm whether that is intended.
 */
float atan2f_neon_hfp(float y, float x)
{
#ifdef __MATH_NEON
    asm volatile (

    "vdup.f32 d17, d0[1] \n\t"              //d17 = {x, x};
    "vdup.f32 d16, d0[0] \n\t"              //d16 = {y, y};

    //1.0 / x
    "vrecpe.f32 d18, d17 \n\t"              //d18 = ~ 1 / d17;
    "vrecps.f32 d19, d18, d17 \n\t"         //d19 = 2.0 - d18 * d17;
    "vmul.f32 d18, d18, d19 \n\t"           //d18 = d18 * d19;
    "vrecps.f32 d19, d18, d17 \n\t"         //d19 = 2.0 - d18 * d17;
    "vmul.f32 d18, d18, d19 \n\t"           //d18 = d18 * d19;

    //y * (1.0 /x)
    "vmul.f32 d0, d16, d18 \n\t"            //d0 = d16 * d18;

    "vdup.f32 d4, %1 \n\t"                  //d4 = {pi/2, pi/2};
    "vmov.f32 d6, d0 \n\t"                  //d6 = d0;
    "vabs.f32 d0, d0 \n\t"                  //d0 = fabs(d0) ;

    //fast reciprocal approximation
    "vrecpe.f32 d1, d0 \n\t"                //d1 = ~ 1 / d0;
    "vrecps.f32 d2, d1, d0 \n\t"            //d2 = 2.0 - d1 * d0;
    "vmul.f32 d1, d1, d2 \n\t"              //d1 = d1 * d2;
    "vrecps.f32 d2, d1, d0 \n\t"            //d2 = 2.0 - d1 * d0;
    "vmul.f32 d1, d1, d2 \n\t"              //d1 = d1 * d2;

    //if |x| > 1.0 -> ax = -1/ax, r = pi/2
    "vadd.f32 d1, d1, d0 \n\t"              //d1 = d1 + d0;
    "vmov.f32 d2, #1.0 \n\t"                //d2 = 1.0;
    "vcgt.f32 d3, d0, d2 \n\t"              //d3 = (d0 > d2);
    "vshr.u32 d3, #31 \n\t"                 //d3 = d3 >> 31;
    "vcvt.f32.u32 d3, d3 \n\t"              //d3 = (float) d3;
    "vmls.f32 d0, d1, d3 \n\t"              //d0 = d0 - d1 * d3;
    "vmul.f32 d7, d3, d4 \n\t"              //d7 = d3 * d4;

    //polynomial:
    "vmul.f32 d2, d0, d0 \n\t"              //d2 = d0*d0 = {ax^2, ax^2}
    "vld1.32 {d4, d5}, [%0] \n\t"           //d4 = {p7, p3}, d5 = {p5, p1}
    "vmul.f32 d3, d2, d2 \n\t"              //d3 = d2*d2 = {x^4, x^4}
    "vmul.f32 q0, q2, d0[0] \n\t"           //q0 = q2 * d0[0] = {p7x, p3x, p5x, p1x}
    "vmla.f32 d1, d0, d2[0] \n\t"           //d1 = d1 + d0*d2[0] = {p5x + p7x^3, p1x + p3x^3}
    "vmla.f32 d1, d3, d1[0] \n\t"           //d1 = d1 + d3*d1[0] = {..., p1x + p3x^3 + p5x^5 + p7x^7}
    "vadd.f32 d1, d1, d7 \n\t"              //d1 = d1 + d7

    "vadd.f32 d2, d1, d1 \n\t"              //d2 = d1 + d1
    "vclt.f32 d3, d6, #0 \n\t"              //d3 = (d6 < 0)
    "vshr.u32 d3, #31 \n\t"                 //d3 = d3 >> 31;
    "vcvt.f32.u32 d3, d3 \n\t"              //d3 = (float) d3
    "vmls.f32 d1, d3, d2 \n\t"              //d1 = d1 - d2 * d3;

    "vmov.f32 s0, s3 \n\t"                  //s0 = s3

    :: "r"(__atan2f_lut), "r"(__atan2f_pi_2)
    : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7",
      "d16", "d17", "d18", "d19"
    );
#else
    return atan2f_c(y, x);
#endif
}

/*
 * NEON implementation, soft-float variant.
 *
 * The first argument is y and the second is x, matching atan2f_c() and
 * atan2f_neon_hfp().  The NEON path already relied on that order (r0 -> s0 is
 * read as y, r1 -> s1 as x).  The C fallback previously passed the two
 * arguments to atan2f_c() in swapped order; it now forwards them unchanged.
 */
float atan2f_neon_sfp(float y, float x)
{
#ifdef __MATH_NEON
    asm volatile ("vmov.f32 s0, r0 \n\t");
    asm volatile ("vmov.f32 s1, r1 \n\t");
    atan2f_neon_hfp(y, x);
    asm volatile ("vmov.f32 r0, s0 \n\t");
#else
    return atan2f_c(y, x);
#endif
}
15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 19 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 22 | THE SOFTWARE. 23 | */ 24 | 25 | #include "math.h" 26 | #include "math_neon.h" 27 | 28 | const float __atanf_lut[4] = { 29 | -0.0443265554792128, //p7 30 | -0.3258083974640975, //p3 31 | +0.1555786518463281, //p5 32 | +0.9997878412794807 //p1 33 | }; 34 | 35 | const float __atanf_pi_2 = M_PI_2; 36 | 37 | float atanf_c(float x) 38 | { 39 | 40 | float a, b, r, xx; 41 | int m; 42 | 43 | union { 44 | float f; 45 | int i; 46 | } xinv, ax; 47 | 48 | ax.f = fabs(x); 49 | 50 | //fast inverse approximation (2x newton) 51 | xinv.f = ax.f; 52 | m = 0x3F800000 - (xinv.i & 0x7F800000); 53 | xinv.i = xinv.i + m; 54 | xinv.f = 1.41176471f - 0.47058824f * xinv.f; 55 | xinv.i = xinv.i + m; 56 | b = 2.0 - xinv.f * ax.f; 57 | xinv.f = xinv.f * b; 58 | b = 2.0 - xinv.f * ax.f; 59 | xinv.f = xinv.f * b; 60 | 61 | //if |x| > 1.0 -> ax = -1/ax, r = pi/2 62 | xinv.f = xinv.f + ax.f; 63 | a = (ax.f > 1.0f); 64 | ax.f = ax.f - a * xinv.f; 65 | r = a * __atanf_pi_2; 66 | 67 | //polynomial evaluation 68 | xx = ax.f * ax.f; 69 | a = (__atanf_lut[0] * ax.f) * xx + (__atanf_lut[2] * ax.f); 70 | b = (__atanf_lut[1] * ax.f) * xx + (__atanf_lut[3] * ax.f); 71 | xx = xx * xx; 72 | b = b + a * xx; 73 | r = r + b; 74 | 75 | //if x < 0 -> r = -r 76 | a = 2 * r; 77 | b = (x < 0.0f); 78 | r = r - a * b; 79 | 80 | return r; 81 | } 82 | 83 | 84 | float atanf_neon_hfp(float x) 85 | { 86 | #ifdef __MATH_NEON 87 | asm volatile ( 88 | 89 | "vdup.f32 d0, d0[0] \n\t" //d0 = {x, x}; 90 | 91 | "vdup.f32 d4, %1 \n\t" //d4 = {pi/2, pi/2}; 
92 | "vmov.f32 d6, d0 \n\t" //d6 = d0; 93 | "vabs.f32 d0, d0 \n\t" //d0 = fabs(d0) ; 94 | 95 | //fast reciporical approximation 96 | "vrecpe.f32 d1, d0 \n\t" //d1 = ~ 1 / d0; 97 | "vrecps.f32 d2, d1, d0 \n\t" //d2 = 2.0 - d1 * d0; 98 | "vmul.f32 d1, d1, d2 \n\t" //d1 = d1 * d2; 99 | "vrecps.f32 d2, d1, d0 \n\t" //d2 = 2.0 - d1 * d0; 100 | "vmul.f32 d1, d1, d2 \n\t" //d1 = d1 * d2; 101 | 102 | 103 | //if |x| > 1.0 -> ax = -1/ax, r = pi/2 104 | "vadd.f32 d1, d1, d0 \n\t" //d1 = d1 + d0; 105 | "vmov.f32 d2, #1.0 \n\t" //d2 = 1.0; 106 | "vcgt.f32 d3, d0, d2 \n\t" //d3 = (d0 > d2); 107 | "vshr.u32 d3, #31 \n\t" //d3 = (d0 > d2); 108 | "vcvt.f32.u32 d3, d3 \n\t" //d5 = (float) d3; 109 | "vmls.f32 d0, d1, d3[0] \n\t" //d0 = d0 - d1 * d3[0]; 110 | "vmul.f32 d7, d4, d3[0] \n\t" //d7 = d5 * d4; 111 | 112 | //polynomial: 113 | "vmul.f32 d2, d0, d0 \n\t" //d2 = d0*d0 = {ax^2, ax^2} 114 | "vld1.32 {d4, d5}, [%0] \n\t" //d4 = {p7, p3}, d5 = {p5, p1} 115 | "vmul.f32 d3, d2, d2 \n\t" //d3 = d2*d2 = {x^4, x^4} 116 | "vmul.f32 q0, q2, d0[0] \n\t" //q0 = q2 * d0[0] = {p7x, p3x, p5x, p1x} 117 | "vmla.f32 d1, d0, d2[0] \n\t" //d1 = d1 + d0*d2[0] = {p5x + p7x^3, p1x + p3x^3} 118 | "vmla.f32 d1, d3, d1[0] \n\t" //d1 = d1 + d3*d1[0] = {..., p1x + p3x^3 + p5x^5 + p7x^7} 119 | "vadd.f32 d1, d1, d7 \n\t" //d1 = d1 + d7 120 | 121 | "vadd.f32 d2, d1, d1 \n\t" //d2 = d1 + d1 122 | "vclt.f32 d3, d6, #0 \n\t" //d3 = (d6 < 0) 123 | "vshr.u32 d3, #31 \n\t" //d3 = (d0 > d2); 124 | "vcvt.f32.u32 d3, d3 \n\t" //d3 = (float) d3 125 | "vmls.f32 d1, d3, d2 \n\t" //d1 = d1 - d2 * d3; 126 | 127 | "vmov.f32 s0, s3 \n\t" //s0 = s3 128 | 129 | :: "r"(__atanf_lut), "r"(__atanf_pi_2) 130 | : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7" 131 | ); 132 | 133 | #endif 134 | } 135 | 136 | 137 | float atanf_neon_sfp(float x) 138 | { 139 | #ifdef __MATH_NEON 140 | asm volatile ("vdup.f32 d0, r0 \n\t"); 141 | atanf_neon_hfp(x); 142 | asm volatile ("vmov.f32 r0, s0 \n\t"); 143 | #else 144 | return atanf_c(x); 145 | 
#endif 146 | }; 147 | 148 | 149 | 150 | -------------------------------------------------------------------------------- /math_ceilf.c: -------------------------------------------------------------------------------- 1 | /* 2 | The MIT License (MIT) 3 | 4 | Copyright (c) 2015 Lachlan Tychsen-Smith (lachlan.ts@gmail.com) 5 | 6 | Permission is hereby granted, free of charge, to any person obtaining a copy 7 | of this software and associated documentation files (the "Software"), to deal 8 | in the Software without restriction, including without limitation the rights 9 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 | copies of the Software, and to permit persons to whom the Software is 11 | furnished to do so, subject to the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be included in 14 | all copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 19 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 22 | THE SOFTWARE. 
/*
Assumes the floating point value |x| < 2147483648
*/

/*
 * Portable C implementation of ceilf.
 *
 * Truncates x towards zero through an int conversion, then adds 1 when the
 * truncated value is below x (positive inputs with a fractional part).
 * Only valid while x fits in an int, as stated above.
 */
float ceilf_c(float x)
{
    int n;
    float r;
    n = (int) x;
    r = (float) n;
    r = r + (x > r);
    return r;
}

/*
 * NEON implementation, hard-float variant.
 *
 * NEON path: the asm reads x from d0 and leaves the result in d0, so there
 * is no C-level return statement on that path.
 * Non-NEON path: returns ceilf_c(x) so the function never runs off the end of
 * a non-void function.
 */
float ceilf_neon_hfp(float x)
{
#ifdef __MATH_NEON
    asm volatile (

    "vcvt.s32.f32 d1, d0 \n\t"              //d1 = (int) d0;
    "vcvt.f32.s32 d1, d1 \n\t"              //d1 = (float) d1;
    "vcgt.f32 d0, d0, d1 \n\t"              //d0 = (d0 > d1);
    "vshr.u32 d0, #31 \n\t"                 //d0 = d0 >> 31;
    "vcvt.f32.u32 d0, d0 \n\t"              //d0 = (float) d0;
    "vadd.f32 d0, d1, d0 \n\t"              //d0 = d1 + d0;

    ::: "d0", "d1"
    );
#else
    return ceilf_c(x);
#endif
}

/*
 * NEON implementation, soft-float variant.
 *
 * NEON path: moves the argument from r0 to s0, runs the hard-float variant
 * and moves the result from s0 back to r0.
 * Non-NEON path: returns ceilf_c(x).
 */
float ceilf_neon_sfp(float x)
{
#ifdef __MATH_NEON
    asm volatile ("vmov.f32 s0, r0 \n\t");
    ceilf_neon_hfp(x);
    asm volatile ("vmov.f32 r0, s0 \n\t");
#else
    return ceilf_c(x);
#endif
}
/* M_PI_2 is a POSIX extension to <math.h>; strict ISO C builds may hide it. */
#ifndef M_PI_2
#define M_PI_2 1.57079632679489661923
#endif

/*
 * Portable C implementation of cosf, using cos(x) = sin(x + pi/2).
 * Accuracy and usable range are whatever sinf_c() provides for the shifted
 * argument.
 */
float cosf_c(float x)
{
    return sinf_c(x + M_PI_2);
}

/*
 * NEON implementation, hard-float variant.
 *
 * NEON path: shifts the argument by pi/2 and defers to sinf_neon_hfp().
 * Non-NEON path: returns cosf_c(x) so the function never runs off the end of
 * a non-void function.
 */
float cosf_neon_hfp(float x)
{
#ifdef __MATH_NEON
    float xx = x + M_PI_2;
    return sinf_neon_hfp(xx);
#else
    return cosf_c(x);
#endif
}

/*
 * NEON implementation, soft-float variant.
 *
 * NEON path: duplicates r0 into d0, runs the hard-float variant and moves the
 * result from s0 back to r0.
 * Non-NEON path: returns cosf_c(x).
 */
float cosf_neon_sfp(float x)
{
#ifdef __MATH_NEON
    asm volatile ("vdup.f32 d0, r0 \n\t");
    cosf_neon_hfp(x);
    asm volatile ("vmov.f32 r0, s0 \n\t");
#else
    return cosf_c(x);
#endif
}
/* Range-reduction constants used by the NEON code: {invrange, range}. */
const float __coshf_rng[2] = {
    1.442695041f,
    0.693147180f
};

/*
 * Polynomial coefficients p7..p0.  Every coefficient is stored twice so that
 * one 64-bit load fills both lanes of a d register.
 */
const float __coshf_lut[16] = {
    0.00019578093328483123, //p7
    0.00019578093328483123, //p7
    0.0014122663401803872,  //p6
    0.0014122663401803872,  //p6
    0.008336936973260111,   //p5
    0.008336936973260111,   //p5
    0.04165989275009526,    //p4
    0.04165989275009526,    //p4
    0.16666570253074878,    //p3
    0.16666570253074878,    //p3
    0.5000006143673624,     //p2
    0.5000006143673624,     //p2
    1.000000059694879,      //p1
    1.000000059694879,      //p1
    0.9999999916728642,     //p0
    0.9999999916728642      //p0
};

/*
 * Portable C implementation of coshf: 0.5 * exp(x) + 0.5 * exp(-x),
 * with both exponentials computed by expf_c().
 */
float coshf_c(float x)
{
    float a, b, xx;
    xx = -x;
    a = expf_c(x);
    b = expf_c(xx);
    a = a * 0.5f;
    a = a + 0.5f * b;
    return a;
}

/*
 * NEON implementation, hard-float variant.
 *
 * NEON path: builds {x, -x} in d0, evaluates the exponential approximation on
 * both lanes at once, then adds the two lanes and halves the sum.  The result
 * is left in s0, so there is no C-level return statement on that path.
 * Non-NEON path: returns coshf_c(x).
 *
 * Changes relative to the previous asm:
 *  - The table pointer is post-incremented by the loads, so it is now passed
 *    as a read-write operand on a local copy; an input-only operand must not
 *    be modified by the asm.
 *  - d16-d23 are written by the asm and are now listed as clobbered.
 */
float coshf_neon_hfp(float x)
{
#ifdef __MATH_NEON
    const float *lut = __coshf_lut;     //advanced by the asm below

    asm volatile (
    "vdup.f32 d0, d0[0] \n\t"               //d0 = {x, x}
    "fnegs s1, s1 \n\t"                     //s1 = -s1

    //Range Reduction:
    "vld1.32 d2, [%[rng]] \n\t"             //d2 = {invrange, range}
    "vld1.32 {d16, d17}, [%[lut]]! \n\t"
    "vmul.f32 d6, d0, d2[0] \n\t"           //d6 = d0 * d2[0]
    "vcvt.s32.f32 d6, d6 \n\t"              //d6 = (int) d6
    "vld1.32 {d18}, [%[lut]]! \n\t"
    "vcvt.f32.s32 d1, d6 \n\t"              //d1 = (float) d6
    "vld1.32 {d19}, [%[lut]]! \n\t"
    "vmls.f32 d0, d1, d2[1] \n\t"           //d0 = d0 - d1 * d2[1]
    "vld1.32 {d20}, [%[lut]]! \n\t"

    //polynomial:
    "vmla.f32 d17, d16, d0 \n\t"            //d17 = d17 + d16 * d0;
    "vld1.32 {d21}, [%[lut]]! \n\t"
    "vmla.f32 d18, d17, d0 \n\t"            //d18 = d18 + d17 * d0;
    "vld1.32 {d22}, [%[lut]]! \n\t"
    "vmla.f32 d19, d18, d0 \n\t"            //d19 = d19 + d18 * d0;
    "vld1.32 {d23}, [%[lut]]! \n\t"
    "vmla.f32 d20, d19, d0 \n\t"            //d20 = d20 + d19 * d0;
    "vmla.f32 d21, d20, d0 \n\t"            //d21 = d21 + d20 * d0;
    "vmla.f32 d22, d21, d0 \n\t"            //d22 = d22 + d21 * d0;
    "vmla.f32 d23, d22, d0 \n\t"            //d23 = d23 + d22 * d0;

    //multiply by 2 ^ m
    "vshl.i32 d6, d6, #23 \n\t"             //d6 = d6 << 23
    "vadd.i32 d0, d23, d6 \n\t"             //d0 = d23 + d6

    "vdup.f32 d2, d0[1] \n\t"               //d2 = s1
    "vmov.f32 d1, #0.5 \n\t"                //d1 = 0.5
    "vadd.f32 d0, d0, d2 \n\t"              //d0 = d0 + d2
    "vmul.f32 d0, d1 \n\t"                  //d0 = d0 * d1

    : [lut] "+r"(lut)
    : [rng] "r"(__coshf_rng)
    : "d0", "d1", "q1", "q2", "d6",
      "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23"
    );
#else
    return coshf_c(x);
#endif
}

/*
 * NEON implementation, soft-float variant.
 *
 * NEON path: moves the argument from r0 to s0, runs the hard-float variant
 * and moves the result from s0 back to r0.
 * Non-NEON path: returns coshf_c(x).
 */
float coshf_neon_sfp(float x)
{
#ifdef __MATH_NEON
    asm volatile ("vmov.f32 s0, r0 \n\t");
    coshf_neon_hfp(x);
    asm volatile ("vmov.f32 r0, s0 \n\t");
#else
    return coshf_c(x);
#endif
}
notice and this permission notice shall be included in 14 | all copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 19 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 22 | THE SOFTWARE. 23 | */ 24 | 25 | #include "math_neon.h" 26 | #include 27 | #include 28 | #include 29 | #include 30 | #include 31 | #ifdef WIN32 32 | #include 33 | #else 34 | #include 35 | #include 36 | #endif 37 | 38 | #define randf() (rand() / (RAND_MAX + 1.0f)) 39 | 40 | 41 | 42 | struct test1_s { 43 | const char* name; 44 | float (*func)(float); //the function 45 | float (*bench)(float); //the function to benchmark against. 46 | float rng0, rng1; 47 | int num; 48 | float emaxabs; 49 | float xmaxabs; 50 | float emaxrel; 51 | float xmaxrel; 52 | float erms; 53 | int time; //time to execute num functions; 54 | }; 55 | 56 | struct test2_s { 57 | const char* name; 58 | float (*func)(float, float); //the function 59 | float (*bench)(float, float); //the function to benchmark against. 
60 | float rng0, rng1; 61 | int num; 62 | float emaxabs; 63 | float xmaxabs; 64 | float emaxrel; 65 | float xmaxrel; 66 | float erms; 67 | int time; //time to execute num functions; 68 | }; 69 | 70 | 71 | float invsqrtf(float x){ 72 | return (1.0f / sqrtf(x)); 73 | } 74 | 75 | typedef struct test1_s test1_t; 76 | typedef struct test2_s test2_t; 77 | 78 | test1_t test1[51] = 79 | { 80 | {"sinf ", sinf, sinf, -M_PI, M_PI, 500000}, 81 | {"sinf_c ", sinf_c, sinf, -M_PI, M_PI, 500000}, 82 | {"sinf_neon ", sinf_neon, sinf, -M_PI, M_PI, 500000}, 83 | 84 | {"cosf ", cosf, cosf, -M_PI, M_PI, 500000}, 85 | {"cosf_c ", cosf_c, cosf, -M_PI, M_PI, 500000}, 86 | {"cosf_neon ", cosf_neon, cosf, -M_PI, M_PI, 500000}, 87 | 88 | {"tanf ", tanf, tanf, -M_PI_4, M_PI_4, 500000, 0, 0, 0}, 89 | {"tanf_c ", tanf_c, tanf, -M_PI_4, M_PI_4, 500000, 0, 0, 0}, 90 | {"tanf_neon ", tanf_neon, tanf, -M_PI_4, M_PI_4, 500000, 0, 0, 0}, 91 | 92 | {"asinf ", asinf, asinf, -1, 1, 500000, 0, 0, 0}, 93 | {"asinf_c ", asinf_c, asinf, -1, 1, 500000, 0, 0, 0}, 94 | {"asinf_neon ", asinf_neon, asinf, -1, 1, 500000, 0, 0, 0}, 95 | 96 | {"acosf ", acosf, acosf, -1, 1, 500000, 0, 0, 0}, 97 | {"acosf_c ", acosf_c, acosf, -1, 1, 500000, 0, 0, 0}, 98 | {"acosf_neon ", acosf_neon, acosf, -1, 1, 500000, 0, 0, 0}, 99 | 100 | {"atanf ", atanf, atanf, -1, 1, 500000, 0, 0, 0}, 101 | {"atanf_c ", atanf_c, atanf, -1, 1, 500000, 0, 0, 0}, 102 | {"atanf_neon ", atanf_neon, atanf, -1, 1, 500000, 0, 0, 0}, 103 | 104 | {"sinhf ", sinhf, sinhf, -M_PI, M_PI, 500000, 0, 0, 0}, 105 | {"sinhf_c ", sinhf_c, sinhf, -M_PI, M_PI, 500000, 0, 0, 0}, 106 | {"sinhf_neon ", sinhf_neon, sinhf, -M_PI, M_PI, 500000, 0, 0, 0}, 107 | 108 | {"coshf ", coshf, coshf, -M_PI, M_PI, 500000, 0, 0, 0}, 109 | {"coshf_c ", coshf_c, coshf, -M_PI, M_PI, 500000, 0, 0, 0}, 110 | {"coshf_neon ", coshf_neon, coshf, -M_PI, M_PI, 500000, 0, 0, 0}, 111 | 112 | {"tanhf ", tanhf, tanhf, -M_PI, M_PI, 500000, 0, 0, 0}, 113 | {"tanhf_c ", tanhf_c, tanhf, -M_PI, M_PI, 
500000, 0, 0, 0}, 114 | {"tanhf_neon ", tanhf_neon, tanhf, -M_PI, M_PI, 500000, 0, 0, 0}, 115 | 116 | {"expf ", expf, expf, 0, 10, 500000, 0, 0, 0}, 117 | {"expf_c ", expf_c, expf, 0, 10, 500000, 0, 0, 0}, 118 | {"expf_neon ", expf_neon, expf, 0, 10, 500000, 0, 0, 0}, 119 | 120 | {"logf ", logf, logf, 1, 1000, 500000, 0, 0, 0}, 121 | {"logf_c ", logf_c, logf, 1, 1000, 500000, 0, 0, 0}, 122 | {"logf_neon ", logf_neon, logf, 1, 1000, 500000, 0, 0, 0}, 123 | 124 | {"log10f ", log10f, log10f, 1, 1000, 500000, 0, 0, 0}, 125 | {"log10f_c ", log10f_c, log10f, 1, 1000, 500000, 0, 0, 0}, 126 | {"log10f_neon ", log10f_neon,log10f, 1, 1000, 500000, 0, 0, 0}, 127 | 128 | {"floorf ", floorf, floorf, 1, 1000, 5000000, 0, 0, 0}, 129 | {"floorf_c ", floorf_c, floorf, 1, 1000, 5000000, 0, 0, 0}, 130 | {"floorf_neon", floorf_neon,floorf, 1, 1000, 5000000, 0, 0, 0}, 131 | 132 | {"ceilf ", ceilf, ceilf, 1, 1000, 5000000, 0, 0, 0}, 133 | {"ceilf_c ", ceilf_c, ceilf, 1, 1000, 5000000, 0, 0, 0}, 134 | {"ceilf_neon", ceilf_neon, ceilf, 1, 1000, 5000000, 0, 0, 0}, 135 | 136 | {"fabsf ", fabsf, fabsf, 1, 1000, 5000000, 0, 0, 0}, 137 | {"fabsf_c ", fabsf_c, fabsf, 1, 1000, 5000000, 0, 0, 0}, 138 | {"fabsf_neon", fabsf_neon, fabsf, 1, 1000, 5000000, 0, 0, 0}, 139 | 140 | {"sqrtf ", sqrtf, sqrtf, 1, 1000, 500000, 0, 0, 0}, 141 | {"sqrtf_c ", sqrtf_c, sqrtf, 1, 1000, 500000, 0, 0, 0}, 142 | {"sqrtf_neon ", sqrtf_neon, sqrtf, 1, 1000, 500000, 0, 0, 0}, 143 | 144 | {"invsqrtf ", invsqrtf, invsqrtf, 1, 1000, 500000, 0, 0, 0}, 145 | {"invsqrtf_c ", invsqrtf_c, invsqrtf, 1, 1000, 500000, 0, 0, 0}, 146 | {"invsqrtf_neon ", invsqrtf_neon, invsqrtf, 1, 1000, 500000, 0, 0, 0}, 147 | }; 148 | 149 | test2_t test2[9] = 150 | { 151 | {"atan2f ", atan2f, atan2f, 0.1, 10, 10000, 0, 0, 0}, 152 | {"atan2f_c ", atan2f_c, atan2f, 0.1, 10, 10000, 0, 0, 0}, 153 | {"atan2f_neon ", atan2f_neon,atan2f, 0.1, 10, 10000, 0, 0, 0}, 154 | 155 | {"powf ", powf, powf, 1, 10, 10000, 0, 0, 0}, 156 | {"powf_c ", powf_c, powf, 
1, 10, 10000, 0, 0, 0}, 157 | {"powf_neon ", powf_neon, powf, 1, 10, 10000, 0, 0, 0}, 158 | 159 | {"fmodf ", fmodf, fmodf, 1, 10, 10000, 0, 0, 0}, 160 | {"fmodf_c ", fmodf_c, fmodf, 1, 10, 10000, 0, 0, 0}, 161 | {"fmodf_neon ", fmodf_neon, fmodf, 1, 10, 10000, 0, 0, 0}, 162 | 163 | }; 164 | 165 | 166 | void 167 | test_mathfunc1(test1_t *tst) 168 | { 169 | 170 | float x; 171 | float dx = (tst->rng1 - tst->rng0) / ((float)tst->num); 172 | #ifndef WIN32 173 | struct rusage ru; 174 | #endif 175 | 176 | tst->emaxabs = tst->xmaxabs = 0; 177 | tst->emaxrel = tst->xmaxrel = 0; 178 | tst->erms = 0; 179 | for(x = tst->rng0; x < tst->rng1 ; x += dx){ 180 | float r = (tst->func)((float)x); 181 | float rr = (tst->bench)((float)x); 182 | float dr = fabs(r - rr); 183 | float drr = dr * (100.0f / rr); 184 | tst->erms += dr*dr; 185 | if (dr > tst->emaxabs){ 186 | tst->emaxabs = dr; 187 | tst->xmaxabs = x; 188 | } 189 | if (drr > tst->emaxrel){ 190 | tst->emaxrel = drr; 191 | tst->xmaxrel = x; 192 | } 193 | } 194 | tst->erms = sqrt(tst->erms / ((float) tst->num)); 195 | 196 | #ifdef WIN32 197 | tst->time = (1000 * clock()) / (CLOCKS_PER_SEC / 1000); 198 | #else 199 | getrusage(RUSAGE_SELF, &ru); 200 | tst->time = ru.ru_utime.tv_sec * 1000000 + ru.ru_utime.tv_usec; 201 | #endif 202 | 203 | for(x = tst->rng0; x < tst->rng1 ; x += dx){ 204 | (tst->func)((float)x); 205 | } 206 | 207 | #ifdef WIN32 208 | tst->time = (1000 * clock()) / (CLOCKS_PER_SEC / 1000) - tst->time; 209 | #else 210 | getrusage(RUSAGE_SELF, &ru); 211 | tst->time = ru.ru_utime.tv_sec * 1000000 + ru.ru_utime.tv_usec - tst->time; 212 | #endif 213 | 214 | } 215 | 216 | void 217 | test_mathfunc2(test2_t *tst) 218 | { 219 | float x, y; 220 | float rng = tst->rng1 - tst->rng0; 221 | float d = (rng * rng) / ((float) tst->num); 222 | #ifndef WIN32 223 | struct rusage ru; 224 | #endif 225 | 226 | tst->emaxabs = tst->xmaxabs = 0; 227 | tst->emaxrel = tst->xmaxrel = 0; 228 | for(y = (tst->rng0); y < (tst->rng1) ; y += d){ 229 | 
for(x = (tst->rng0); x < (tst->rng1); x += d){ 230 | float r = (tst->func)((float)x, y); 231 | float rr = (tst->bench)((float)x, y); 232 | float dr = fabs(r - rr); 233 | float drr = dr * (100.0f / rr); 234 | if (dr > tst->emaxabs){ 235 | tst->emaxabs = dr; 236 | tst->xmaxabs = x; 237 | } 238 | if (drr > tst->emaxrel && fabsf(rr) > 0.0001){ 239 | tst->emaxrel = drr; 240 | tst->xmaxrel = x; 241 | } 242 | } 243 | } 244 | 245 | #ifdef WIN32 246 | tst->time = (1000 * clock()) / (CLOCKS_PER_SEC / 1000) ; 247 | #else 248 | getrusage(RUSAGE_SELF, &ru); 249 | tst->time = ru.ru_utime.tv_sec * 1000000 + ru.ru_utime.tv_usec; 250 | #endif 251 | 252 | for(y = tst->rng0; y < tst->rng1 ; y += d){ 253 | for(x = tst->rng0; x < tst->rng1 ; x += d){ 254 | (tst->func)((float)x, (float)y); 255 | } 256 | } 257 | 258 | #ifdef WIN32 259 | tst->time = (1000 * clock()) / (CLOCKS_PER_SEC / 1000) - tst->time; 260 | #else 261 | getrusage(RUSAGE_SELF, &ru); 262 | tst->time = ru.ru_utime.tv_sec * 1000000 + ru.ru_utime.tv_usec - tst->time; 263 | #endif 264 | 265 | } 266 | 267 | void test_vectorfunc() 268 | { 269 | float v0[4], v1[4], d[4]; 270 | 271 | for(int i=0;i<4;i++) 272 | { 273 | v0[i] = 10*randf() - 5; 274 | v1[i] = 10*randf() - 5; 275 | d[i] = 10*randf() - 5; 276 | } 277 | 278 | int testnum = 5000000; 279 | struct rusage ru; 280 | int v2t[3], v3t[3], v4t[3]; 281 | float r; 282 | 283 | printf("\n"); 284 | 285 | //dot 2 286 | getrusage(RUSAGE_SELF, &ru); 287 | v2t[0] = ru.ru_utime.tv_sec * 1000000 + ru.ru_utime.tv_usec; 288 | for(int i=0;i < testnum; i++) 289 | { 290 | r = dot2_c(v0, v1); 291 | }; 292 | getrusage(RUSAGE_SELF, &ru); 293 | v2t[1] = ru.ru_utime.tv_sec * 1000000 + ru.ru_utime.tv_usec; 294 | for(int i=0;i < testnum; i++) 295 | { 296 | r = dot2_neon(v0, v1); 297 | }; 298 | getrusage(RUSAGE_SELF, &ru); 299 | v2t[2] = ru.ru_utime.tv_sec * 1000000 + ru.ru_utime.tv_usec; 300 | 301 | r = dot2_c(v0, v1); 302 | printf("dot2_c = %f\n", r); 303 | r = dot2_neon(v0, v1); 304 | 
printf("dot2_neon = %f\n", r); 305 | 306 | printf("dot2: c=%i \t neon=%i \t rate=%.2f \n", v2t[1] - v2t[0], v2t[2] - v2t[1], 307 | (float)(v2t[1] - v2t[0]) / (float)(v2t[2] - v2t[1])); 308 | 309 | //normalize 2 310 | getrusage(RUSAGE_SELF, &ru); 311 | v2t[0] = ru.ru_utime.tv_sec * 1000000 + ru.ru_utime.tv_usec; 312 | for(int i=0;i < testnum; i++) 313 | { 314 | normalize2_c(v0, d); 315 | }; 316 | getrusage(RUSAGE_SELF, &ru); 317 | v2t[1] = ru.ru_utime.tv_sec * 1000000 + ru.ru_utime.tv_usec; 318 | for(int i=0;i < testnum; i++) 319 | { 320 | normalize2_neon(v0, d); 321 | }; 322 | getrusage(RUSAGE_SELF, &ru); 323 | v2t[2] = ru.ru_utime.tv_sec * 1000000 + ru.ru_utime.tv_usec; 324 | 325 | 326 | normalize2_c(v0, d); 327 | printf("normalize2_c = [%.2f, %.2f]\n", d[0], d[1]); 328 | normalize2_neon(v0, d); 329 | printf("normalize2_neon = [%.2f, %.2f]\n", d[0], d[1]); 330 | 331 | printf("normalize2: c=%i \t neon=%i \t rate=%.2f \n", v2t[1] - v2t[0], v2t[2] - v2t[1], 332 | (float)(v2t[1] - v2t[0]) / (float)(v2t[2] - v2t[1])); 333 | printf("\n"); 334 | 335 | 336 | //dot 3 337 | getrusage(RUSAGE_SELF, &ru); 338 | v3t[0] = ru.ru_utime.tv_sec * 1000000 + ru.ru_utime.tv_usec; 339 | for(int i=0;i < testnum; i++) 340 | { 341 | r = dot3_c(v0, v1); 342 | }; 343 | getrusage(RUSAGE_SELF, &ru); 344 | v3t[1] = ru.ru_utime.tv_sec * 1000000 + ru.ru_utime.tv_usec; 345 | for(int i=0;i < testnum; i++) 346 | { 347 | r = dot3_neon(v0, v1); 348 | }; 349 | getrusage(RUSAGE_SELF, &ru); 350 | v3t[2] = ru.ru_utime.tv_sec * 1000000 + ru.ru_utime.tv_usec; 351 | 352 | r = dot3_c(v0, v1); 353 | printf("dot3_c = %f\n", r); 354 | r = dot3_neon(v0, v1); 355 | printf("dot3_neon = %f\n", r); 356 | 357 | printf("dot3: c=%i \t neon=%i \t rate=%.2f \n", v3t[1] - v3t[0], v3t[2] - v3t[1], 358 | (float)(v3t[1] - v3t[0]) / (float)(v3t[2] - v3t[1])); 359 | 360 | //normalize 3 361 | getrusage(RUSAGE_SELF, &ru); 362 | v3t[0] = ru.ru_utime.tv_sec * 1000000 + ru.ru_utime.tv_usec; 363 | for(int i=0;i < testnum; i++) 364 | 
{ 365 | normalize3_c(v0, d); 366 | }; 367 | getrusage(RUSAGE_SELF, &ru); 368 | v3t[1] = ru.ru_utime.tv_sec * 1000000 + ru.ru_utime.tv_usec; 369 | for(int i=0;i < testnum; i++) 370 | { 371 | normalize3_neon(v0, d); 372 | }; 373 | getrusage(RUSAGE_SELF, &ru); 374 | v3t[2] = ru.ru_utime.tv_sec * 1000000 + ru.ru_utime.tv_usec; 375 | 376 | 377 | normalize3_c(v0, d); 378 | printf("normalize3_c = [%.2f, %.2f, %.2f]\n", d[0], d[1], d[2]); 379 | normalize3_neon(v0, d); 380 | printf("normalize3_neon = [%.2f, %.2f, %.2f]\n", d[0], d[1], d[2]); 381 | 382 | printf("normalize3: c=%i \t neon=%i \t rate=%.2f \n", v3t[1] - v3t[0], v3t[2] - v3t[1], 383 | (float)(v3t[1] - v3t[0]) / (float)(v3t[2] - v3t[1])); 384 | 385 | //cross 3 386 | getrusage(RUSAGE_SELF, &ru); 387 | v3t[0] = ru.ru_utime.tv_sec * 1000000 + ru.ru_utime.tv_usec; 388 | for(int i=0;i < testnum; i++) 389 | { 390 | cross3_c(v0, v1, d); 391 | }; 392 | getrusage(RUSAGE_SELF, &ru); 393 | v3t[1] = ru.ru_utime.tv_sec * 1000000 + ru.ru_utime.tv_usec; 394 | for(int i=0;i < testnum; i++) 395 | { 396 | cross3_neon(v0, v1, d); 397 | }; 398 | getrusage(RUSAGE_SELF, &ru); 399 | v3t[2] = ru.ru_utime.tv_sec * 1000000 + ru.ru_utime.tv_usec; 400 | 401 | 402 | cross3_c(v0, v1, d); 403 | printf("cross3_c = [%.2f, %.2f, %.2f]\n", d[0], d[1], d[2]); 404 | cross3_neon(v0, v1, d); 405 | printf("cross3_neon = [%.2f, %.2f, %.2f]\n", d[0], d[1], d[2]); 406 | 407 | printf("cross3: c=%i \t neon=%i \t rate=%.2f \n", v3t[1] - v3t[0], v3t[2] - v3t[1], 408 | (float)(v3t[1] - v3t[0]) / (float)(v3t[2] - v3t[1])); 409 | printf("\n"); 410 | 411 | 412 | //dot 4 413 | getrusage(RUSAGE_SELF, &ru); 414 | v4t[0] = ru.ru_utime.tv_sec * 1000000 + ru.ru_utime.tv_usec; 415 | for(int i=0;i < testnum; i++) 416 | { 417 | r = dot4_c(v0, v1); 418 | }; 419 | getrusage(RUSAGE_SELF, &ru); 420 | v4t[1] = ru.ru_utime.tv_sec * 1000000 + ru.ru_utime.tv_usec; 421 | for(int i=0;i < testnum; i++) 422 | { 423 | r = dot4_neon(v0, v1); 424 | }; 425 | getrusage(RUSAGE_SELF, &ru); 
426 | v4t[2] = ru.ru_utime.tv_sec * 1000000 + ru.ru_utime.tv_usec; 427 | 428 | r = dot4_c(v0, v1); 429 | printf("dot4_c = %f\n", r); 430 | r = dot4_neon(v0, v1); 431 | printf("dot4_neon = %f\n", r); 432 | 433 | printf("dot4: c=%i \t neon=%i \t rate=%.2f \n", v4t[1] - v4t[0], v4t[2] - v4t[1], 434 | (float)(v4t[1] - v4t[0]) / (float)(v4t[2] - v4t[1])); 435 | 436 | //normalize 4 437 | getrusage(RUSAGE_SELF, &ru); 438 | v4t[0] = ru.ru_utime.tv_sec * 1000000 + ru.ru_utime.tv_usec; 439 | for(int i=0;i < testnum; i++) 440 | { 441 | normalize4_c(v0, d); 442 | }; 443 | getrusage(RUSAGE_SELF, &ru); 444 | v4t[1] = ru.ru_utime.tv_sec * 1000000 + ru.ru_utime.tv_usec; 445 | for(int i=0;i < testnum; i++) 446 | { 447 | normalize4_neon(v0, d); 448 | }; 449 | getrusage(RUSAGE_SELF, &ru); 450 | v4t[2] = ru.ru_utime.tv_sec * 1000000 + ru.ru_utime.tv_usec; 451 | 452 | 453 | normalize4_c(v0, d); 454 | printf("normalize4_c = [%.2f, %.2f, %.2f, %.2f]\n", d[0], d[1], d[2], d[3]); 455 | normalize4_neon(v0, d); 456 | printf("normalize4_neon = [%.2f, %.2f, %.2f, %.2f]\n", d[0], d[1], d[2], d[3]); 457 | 458 | printf("normalize4: c=%i \t neon=%i \t rate=%.2f \n", v4t[1] - v4t[0], v4t[2] - v4t[1], 459 | (float)(v4t[1] - v4t[0]) / (float)(v4t[2] - v4t[1])); 460 | printf("\n"); 461 | 462 | 463 | } 464 | 465 | 466 | 467 | void test_matrixfunc() 468 | { 469 | float m0[16], m1[16], m2[16]; 470 | int m2t[3], m3t[3], m4t[3]; 471 | 472 | int i; 473 | int testnum = 1000000; 474 | struct rusage ru; 475 | 476 | for(int i=0;i<16;i++) 477 | { 478 | m0[i] = 10.0f * randf() - 5.0f; 479 | m1[i] = 10.0f * randf() - 5.0f; 480 | m2[i] = 10.0f * randf() - 5.0f; 481 | } 482 | 483 | 484 | //matmul2 485 | getrusage(RUSAGE_SELF, &ru); 486 | m2t[0] = ru.ru_utime.tv_sec * 1000000 + ru.ru_utime.tv_usec; 487 | for(i = 0; i < testnum; i++){ 488 | matmul2_c(m0, m1, m2); 489 | } 490 | getrusage(RUSAGE_SELF, &ru); 491 | m2t[1] = ru.ru_utime.tv_sec * 1000000 + ru.ru_utime.tv_usec; 492 | for(i = 0; i < testnum; i++){ 493 | 
matmul2_neon(m0, m1, m2); 494 | } 495 | getrusage(RUSAGE_SELF, &ru); 496 | m2t[2] = ru.ru_utime.tv_sec * 1000000 + ru.ru_utime.tv_usec; 497 | 498 | matmul2_c(m0, m1, m2); 499 | printf("matmul2_c = \n"); 500 | printf("\t\t\t|%.2f, %.2f|\n", m2[0], m2[2]); 501 | printf("\t\t\t|%.2f, %.2f|\n", m2[1], m2[3]); 502 | 503 | matmul2_neon(m0, m1, m2); 504 | printf("matmul2_neon = \n"); 505 | printf("\t\t\t|%.2f, %.2f|\n", m2[0], m2[2]); 506 | printf("\t\t\t|%.2f, %.2f|\n", m2[1], m2[3]); 507 | 508 | printf("matmul2: c=%i \t neon=%i \t rate=%.2f \n", m2t[1] - m2t[0], m2t[2] - m2t[1], 509 | (float)(m2t[1] - m2t[0]) / (float)(m2t[2] - m2t[1])); 510 | 511 | 512 | //matvec2 513 | getrusage(RUSAGE_SELF, &ru); 514 | m2t[0] = ru.ru_utime.tv_sec * 1000000 + ru.ru_utime.tv_usec; 515 | for(i = 0; i < testnum; i++){ 516 | matvec2_c(m0, m1, m2); 517 | } 518 | getrusage(RUSAGE_SELF, &ru); 519 | m2t[1] = ru.ru_utime.tv_sec * 1000000 + ru.ru_utime.tv_usec; 520 | for(i = 0; i < testnum; i++){ 521 | matvec2_neon(m0, m1, m2); 522 | } 523 | getrusage(RUSAGE_SELF, &ru); 524 | m2t[2] = ru.ru_utime.tv_sec * 1000000 + ru.ru_utime.tv_usec; 525 | 526 | memset(m2, 0, 4*sizeof(float)); 527 | matvec2_c(m0, m1, m2); 528 | printf("matvec2_c = |%.2f, %.2f|\n", m2[0], m2[1]); 529 | 530 | memset(m2, 0, 4*sizeof(float)); 531 | matvec2_neon(m0, m1, m2); 532 | printf("matvec2_neon = |%.2f, %.2f|\n", m2[0], m2[1]); 533 | 534 | printf("matvec2: c=%i \t neon=%i \t rate=%.2f \n", m2t[1] - m2t[0], m2t[2] - m2t[1], 535 | (float)(m2t[1] - m2t[0]) / (float)(m2t[2] - m2t[1])); 536 | 537 | //MAT3 538 | getrusage(RUSAGE_SELF, &ru); 539 | m3t[0] = ru.ru_utime.tv_sec * 1000000 + ru.ru_utime.tv_usec; 540 | for(i = 0; i < testnum; i++){ 541 | matmul3_c(m0, m1, m2); 542 | } 543 | getrusage(RUSAGE_SELF, &ru); 544 | m3t[1] = ru.ru_utime.tv_sec * 1000000 + ru.ru_utime.tv_usec; 545 | for(i = 0; i < testnum; i++){ 546 | matmul3_neon(m0, m1, m2); 547 | } 548 | getrusage(RUSAGE_SELF, &ru); 549 | m3t[2] = ru.ru_utime.tv_sec * 1000000 
+ ru.ru_utime.tv_usec; 550 | 551 | memset(m2, 0, 9*sizeof(float)); 552 | matmul3_c(m0, m1, m2); 553 | printf("matmul3_c =\n"); 554 | printf("\t\t\t|%.2f, %.2f, %.2f|\n", m2[0], m2[3], m2[6]); 555 | printf("\t\t\t|%.2f, %.2f, %.2f|\n", m2[1], m2[4], m2[7]); 556 | printf("\t\t\t|%.2f, %.2f, %.2f|\n", m2[2], m2[5], m2[8]); 557 | 558 | memset(m2, 0, 9*sizeof(float)); 559 | matmul3_neon(m0, m1, m2); 560 | printf("matmul3_neon =\n"); 561 | printf("\t\t\t|%.2f, %.2f, %.2f|\n", m2[0], m2[3], m2[6]); 562 | printf("\t\t\t|%.2f, %.2f, %.2f|\n", m2[1], m2[4], m2[7]); 563 | printf("\t\t\t|%.2f, %.2f, %.2f|\n", m2[2], m2[5], m2[8]); 564 | 565 | printf("matmul3: c=%i \t neon=%i \t rate=%.2f \n", m3t[1] - m3t[0], m3t[2] - m3t[1], 566 | (float)(m3t[1] - m3t[0]) / (float)(m3t[2] - m3t[1])); 567 | 568 | //matvec3 569 | getrusage(RUSAGE_SELF, &ru); 570 | m3t[0] = ru.ru_utime.tv_sec * 1000000 + ru.ru_utime.tv_usec; 571 | for(i = 0; i < testnum; i++){ 572 | matvec3_c(m0, m1, m2); 573 | } 574 | getrusage(RUSAGE_SELF, &ru); 575 | m3t[1] = ru.ru_utime.tv_sec * 1000000 + ru.ru_utime.tv_usec; 576 | for(i = 0; i < testnum; i++){ 577 | matvec3_neon(m0, m1, m2); 578 | } 579 | getrusage(RUSAGE_SELF, &ru); 580 | m3t[2] = ru.ru_utime.tv_sec * 1000000 + ru.ru_utime.tv_usec; 581 | 582 | memset(m2, 0, 4*sizeof(float)); 583 | matvec3_c(m0, m1, m2); 584 | printf("matvec3_c = |%.2f, %.2f, %.2f|\n", m2[0], m2[1], m2[2]); 585 | 586 | memset(m2, 0, 4*sizeof(float)); 587 | matvec3_neon(m0, m1, m2); 588 | printf("matvec3_neon = |%.2f, %.2f, %.2f|\n", m2[0], m2[1], m2[2]); 589 | 590 | printf("matvec3: c=%i \t neon=%i \t rate=%.2f \n", m2t[1] - m2t[0], m2t[2] - m2t[1], 591 | (float)(m2t[1] - m2t[0]) / (float)(m2t[2] - m2t[1])); 592 | 593 | //MAT4 594 | getrusage(RUSAGE_SELF, &ru); 595 | m4t[0] = ru.ru_utime.tv_sec * 1000000 + ru.ru_utime.tv_usec; 596 | for(i = 0; i < testnum; i++){ 597 | matmul4_c(m0, m1, m2); 598 | } 599 | getrusage(RUSAGE_SELF, &ru); 600 | m4t[1] = ru.ru_utime.tv_sec * 1000000 + 
ru.ru_utime.tv_usec; 601 | for(i = 0; i < testnum; i++){ 602 | matmul4_neon(m0, m1, m2); 603 | } 604 | getrusage(RUSAGE_SELF, &ru); 605 | m4t[2] = ru.ru_utime.tv_sec * 1000000 + ru.ru_utime.tv_usec; 606 | 607 | memset(m2, 0, 16*sizeof(float)); 608 | matmul4_c(m0, m1, m2); 609 | printf("matmul4_c =\n"); 610 | printf("\t\t\t|%.2f, %.2f, %.2f, %.2f|\n", m2[0], m2[4], m2[8], m2[12]); 611 | printf("\t\t\t|%.2f, %.2f, %.2f, %.2f|\n", m2[1], m2[5], m2[9], m2[13]); 612 | printf("\t\t\t|%.2f, %.2f, %.2f, %.2f|\n", m2[2], m2[6], m2[10], m2[14]); 613 | printf("\t\t\t|%.2f, %.2f, %.2f, %.2f|\n", m2[3], m2[7], m2[11], m2[15]); 614 | 615 | memset(m2, 0, 16*sizeof(float)); 616 | matmul4_neon(m0, m1, m2); 617 | printf("matmul4_neon =\n"); 618 | printf("\t\t\t|%.2f, %.2f, %.2f, %.2f|\n", m2[0], m2[4], m2[8], m2[12]); 619 | printf("\t\t\t|%.2f, %.2f, %.2f, %.2f|\n", m2[1], m2[5], m2[9], m2[13]); 620 | printf("\t\t\t|%.2f, %.2f, %.2f, %.2f|\n", m2[2], m2[6], m2[10], m2[14]); 621 | printf("\t\t\t|%.2f, %.2f, %.2f, %.2f|\n", m2[3], m2[7], m2[11], m2[15]); 622 | 623 | printf("matmul4: c=%i \t neon=%i \t rate=%.2f \n", m4t[1] - m4t[0], m4t[2] - m4t[1], 624 | (float)(m4t[1] - m4t[0]) / (float)(m4t[2] - m4t[1])); 625 | 626 | //matvec4 627 | getrusage(RUSAGE_SELF, &ru); 628 | m4t[0] = ru.ru_utime.tv_sec * 1000000 + ru.ru_utime.tv_usec; 629 | for(i = 0; i < testnum; i++){ 630 | matvec4_c(m0, m1, m2); 631 | } 632 | getrusage(RUSAGE_SELF, &ru); 633 | m4t[1] = ru.ru_utime.tv_sec * 1000000 + ru.ru_utime.tv_usec; 634 | for(i = 0; i < testnum; i++){ 635 | matvec4_neon(m0, m1, m2); 636 | } 637 | getrusage(RUSAGE_SELF, &ru); 638 | m4t[2] = ru.ru_utime.tv_sec * 1000000 + ru.ru_utime.tv_usec; 639 | 640 | memset(m2, 0, 4*sizeof(float)); 641 | matvec4_c(m0, m1, m2); 642 | printf("matvec4_c = |%.2f, %.2f, %.2f, %f|\n", m2[0], m2[1], m2[2], m2[3]); 643 | 644 | memset(m2, 0, 4*sizeof(float)); 645 | matvec4_neon(m0, m1, m2); 646 | printf("matvec4_neon = |%.2f, %.2f, %.2f, %f|\n", m2[0], m2[1], m2[2], 
m2[3]); 647 | 648 | printf("matvec4: c=%i \t neon=%i \t rate=%.2f \n", m2t[1] - m2t[0], m2t[2] - m2t[1], 649 | (float)(m2t[1] - m2t[0]) / (float)(m2t[2] - m2t[1])); 650 | 651 | 652 | } 653 | 654 | int main(int argc, char** argv) 655 | { 656 | 657 | int i, ii; 658 | if (argc > 1 && strcmp(argv[1], "-norunfast") == 0){ 659 | printf("RUNFAST: Disabled \n"); 660 | }else { 661 | printf("RUNFAST: Enabled \n"); 662 | enable_runfast(); 663 | } 664 | 665 | srand(time(NULL)); 666 | 667 | #if 1 668 | //test single argument functions: 669 | printf("------------------------------------------------------------------------------------------------------\n"); 670 | printf("MATRIX FUNCTION TESTS \n"); 671 | printf("------------------------------------------------------------------------------------------------------\n"); 672 | 673 | test_matrixfunc(); 674 | test_vectorfunc(); 675 | 676 | printf("------------------------------------------------------------------------------------------------------\n"); 677 | printf("CMATH FUNCTION TESTS \n"); 678 | printf("------------------------------------------------------------------------------------------------------\n"); 679 | printf("Function\tRange\t\tNumber\tABS Max Error\tREL Max Error\tRMS Error\tTime\tRate\n"); 680 | printf("------------------------------------------------------------------------------------------------------\n"); 681 | for(i = 0; i < 51; i++){ 682 | test_mathfunc1(&test1[i]); 683 | 684 | ii = i - (i % 3); 685 | printf("%s\t", test1[i].name); 686 | printf("[%.2f, %.2f]\t", test1[i].rng0, test1[i].rng1); 687 | printf("%i\t", test1[i].num); 688 | printf("%.2e\t", test1[i].emaxabs); 689 | printf("%.2e%%\t", test1[i].emaxrel); 690 | printf("%.2e\t", test1[i].erms); 691 | printf("%i\t", test1[i].time); 692 | printf("x%.2f\t", (float)test1[ii].time / test1[i].time); 693 | printf("\n"); 694 | } 695 | for(i = 0; i < 9; i++){ 696 | test_mathfunc2(&test2[i]); 697 | 698 | ii = i - (i % 3); 699 | 700 | printf("%s\t", 
test2[i].name); 701 | printf("[%.2f, %.2f]\t", test2[i].rng0, test2[i].rng1); 702 | printf("%i\t", test2[i].num); 703 | printf("%.2e\t", test2[i].emaxabs); 704 | printf("%.2e%%\t", test2[i].emaxrel); 705 | printf("%.2e\t", test2[i].erms); 706 | printf("%i\t", test2[i].time); 707 | printf("x%.2f\t", (float)test2[ii].time / test2[i].time); 708 | printf("\n"); 709 | } 710 | 711 | #else 712 | 713 | 714 | float x = 0; 715 | for(x = -M_PI_2; x < M_PI_2; x+= 0.01) 716 | { 717 | printf("x=%.2f\t in=%.2f\t c=%.2f\t neon=%.2f \n", x, sinhf(x), sinhf_c(x), sinhf_neon(x)); 718 | } 719 | 720 | #endif 721 | 722 | return 0; 723 | } 724 | -------------------------------------------------------------------------------- /math_expf.c: -------------------------------------------------------------------------------- 1 | /* 2 | The MIT License (MIT) 3 | 4 | Copyright (c) 2015 Lachlan Tychsen-Smith (lachlan.ts@gmail.com) 5 | 6 | Permission is hereby granted, free of charge, to any person obtaining a copy 7 | of this software and associated documentation files (the "Software"), to deal 8 | in the Software without restriction, including without limitation the rights 9 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 | copies of the Software, and to permit persons to whom the Software is 11 | furnished to do so, subject to the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be included in 14 | all copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
/*
Based on:

		e ^ x = (1+m) * (2^n)
		x = log(1+m) + n * log(2)
		n = (int) (x * 1.0 / log(2))
		(1+m) = e ^ (x - n * log(2))
		(1+m) = Poly(x - n * log(2))

		where Poly(x) is the Minimax approximation of e ^ x over the
		range [-Log(2), Log(2)]

Test func : expf(x)
Test Range: 0 < x < 50
Peak Error:	~0.00024%
RMS  Error:	~0.00007%
*/

/* Range reduction constants: {1 / log(2), log(2)}. */
const float __expf_rng[2] = {
	1.442695041f,
	0.693147180f
};

/* Polynomial coefficients p0..p7, stored interleaved so that the NEON code
   can load {p0, p4, p2, p6} and {p1, p5, p3, p7} as two quad registers. */
const float __expf_lut[8] = {
	0.9999999916728642,		//p0
	0.04165989275009526, 	//p4
	0.5000006143673624, 	//p2
	0.0014122663401803872, 	//p6
	1.000000059694879, 		//p1
	0.008336936973260111, 	//p5
	0.16666570253074878, 	//p3
	0.00019578093328483123	//p7
};

/*
 * Portable C implementation of the fast expf approximation.
 *
 * x is split into m * log(2) + r (m truncated toward zero), e^r is
 * evaluated with the degree-7 polynomial and the result is scaled by 2^m by
 * adding m to the exponent field of the float. No range or NaN checking is
 * done.
 */
float expf_c(float x)
{
	float a, b, c, d, xx;
	int m;

	union {
		float   f;
		int 	i;
	} r;

	//Range Reduction:
	m = (int) (x * __expf_rng[0]);
	x = x - ((float) m) * __expf_rng[1];

	//Minimax polynomial, evaluated with Estrin's scheme
	a = (__expf_lut[4] * x) + (__expf_lut[0]);
	b = (__expf_lut[6] * x) + (__expf_lut[2]);
	c = (__expf_lut[5] * x) + (__expf_lut[1]);
	d = (__expf_lut[7] * x) + (__expf_lut[3]);
	xx = x * x;
	a = a + b * xx;
	c = c + d * xx;
	xx = xx * xx;
	r.f = a + c * xx;

	//multiply by 2 ^ m: add m to the exponent bits. Done in unsigned
	//arithmetic because left-shifting a negative int (any x < -log(2)) is
	//undefined behaviour.
	r.i = (int) ((unsigned int) r.i + ((unsigned int) m << 23));

	return r.f;
}

/*
 * NEON implementation. In NEON builds the asm reads x from d0[0] and leaves
 * the result in d0; there is deliberately no C-level return in that branch
 * (the README notes that the *_neon functions assume fixed argument
 * locations). Builds without __MATH_NEON fall back to expf_c() instead of
 * returning an indeterminate value.
 */
float expf_neon_hfp(float x)
{
#ifdef __MATH_NEON
	asm volatile (
	"vdup.f32 		d0, d0[0]				\n\t"	//d0 = {x, x}

	//Range Reduction:
	"vld1.32 		d2, [%0]				\n\t"	//d2 = {invrange, range}
	"vmul.f32 		d6, d0, d2[0]			\n\t"	//d6 = d0 * d2[0]
	"vcvt.s32.f32 	d6, d6					\n\t"	//d6 = (int) d6
	"vcvt.f32.s32 	d1, d6					\n\t"	//d1 = (float) d6
	"vmls.f32 		d0, d1, d2[1]			\n\t"	//d0 = d0 - d1 * d2[1]

	//polynomial:
	"vmul.f32 		d1, d0, d0				\n\t"	//d1 = d0*d0 = {x^2, x^2}
	"vld1.32 		{d2, d3, d4, d5}, [%1]	\n\t"	//q1 = {p0, p4, p2, p6}, q2 = {p1, p5, p3, p7} ;
	"vmla.f32 		q1, q2, d0[0]			\n\t"	//q1 = q1 + q2 * d0[0]
	"vmla.f32 		d2, d3, d1[0]			\n\t"	//d2 = d2 + d3 * d1[0]
	"vmul.f32 		d1, d1, d1				\n\t"	//d1 = d1 * d1 = {x^4, x^4}
	"vmla.f32 		d2, d1, d2[1]			\n\t"	//d2 = d2 + d1 * d2[1]

	//multiply by 2 ^ m
	"vshl.i32 		d6, d6, #23				\n\t"	//d6 = d6 << 23
	"vadd.i32 		d0, d2, d6				\n\t"	//d0 = d2 + d6

	:: "r"(__expf_rng), "r"(__expf_lut)
    : "d0", "d1", "q1", "q2", "d6"
	);
#else
	return expf_c(x);
#endif
}

/*
 * Variant that moves x from r0 into s0 before calling expf_neon_hfp() and
 * moves the result back from s0 into r0 afterwards. Falls back to expf_c()
 * when __MATH_NEON is not defined.
 */
float expf_neon_sfp(float x)
{
#ifdef __MATH_NEON
	asm volatile ("vmov.f32 s0, r0 		\n\t");
	expf_neon_hfp(x);
	asm volatile ("vmov.f32 r0, s0 		\n\t");
#else
	return expf_c(x);
#endif
}
/*
 * Absolute value of x, computed by clearing the sign bit of its bit
 * pattern.
 */
float fabsf_c(float x)
{
	union {
		int i;
		float f;
	} xx;

	xx.f = x;
	xx.i = xx.i & 0x7FFFFFFF;	//drop bit 31, the sign bit
	return xx.f;
}

/*
 * NEON/VFP implementation: takes the absolute value of s0 in place. There
 * is deliberately no C-level return in that branch (the README notes that
 * the *_neon functions assume fixed argument locations). Builds without
 * __MATH_NEON fall back to fabsf_c() instead of returning an indeterminate
 * value.
 */
float fabsf_neon_hfp(float x)
{
#ifdef __MATH_NEON
	asm volatile (
	"fabss	 		s0, s0					\n\t"	//s0 = fabs(s0)
	);
#else
	return fabsf_c(x);
#endif
}

/*
 * Variant that clears the sign bit directly in r0. Falls back to fabsf_c()
 * when __MATH_NEON is not defined.
 */
float fabsf_neon_sfp(float x)
{
#ifdef __MATH_NEON
	asm volatile (
	"bic	 		r0, r0, #0x80000000		\n\t"	//r0 = r0 & ~(1 << 31)
	);
#else
	return fabsf_c(x);
#endif
}
/*
Assumes the floating point value |x| < 2147483648
*/

/*
 * Largest integral value not greater than x.
 *
 * x is truncated toward zero through an int conversion; when that rounded
 * up (negative non-integral x) one is subtracted again.
 */
float floorf_c(float x)
{
	int n;
	float r;
	n = (int) x;
	r = (float) n;
	r = r - (r > x);	//(r > x) is 1 exactly when truncation rounded up
	return r;
}

/*
 * NEON implementation. In NEON builds the asm reads x from d0 and leaves
 * the result in d0; there is deliberately no C-level return in that branch
 * (the README notes that the *_neon functions assume fixed argument
 * locations). Builds without __MATH_NEON fall back to floorf_c() instead of
 * returning an indeterminate value.
 */
float floorf_neon_hfp(float x)
{
#ifdef __MATH_NEON
	asm volatile (
	"vcvt.s32.f32 	d1, d0					\n\t"	//d1 = (int) d0;
	"vcvt.f32.s32 	d1, d1					\n\t"	//d1 = (float) d1;
	"vcgt.f32 		d0, d1, d0				\n\t"	//d0 = (d1 > d0);
	"vshr.u32 		d0, #31					\n\t"	//d0 = d0 >> 31;
	"vcvt.f32.u32 	d0, d0					\n\t"	//d0 = (float) d0;
	"vsub.f32 		d0, d1, d0				\n\t"	//d0 = d1 - d0;
	::: "d0", "d1"
	);
#else
	return floorf_c(x);
#endif
}

/*
 * Variant that moves x from r0 into s0 before calling floorf_neon_hfp() and
 * moves the result back from s0 into r0 afterwards. Falls back to
 * floorf_c() when __MATH_NEON is not defined.
 */
float floorf_neon_sfp(float x)
{
#ifdef __MATH_NEON
	asm volatile ("vmov.f32 s0, r0 		\n\t");
	floorf_neon_hfp(x);
	asm volatile ("vmov.f32 r0, s0 		\n\t");
#else
	return floorf_c(x);
#endif
}
/*
Assumes the floating point value |x / y| < 2,147,483,648
*/

/*
 * Remainder of x / y with the sign of x: x - trunc(x / y) * y.
 *
 * The quotient is formed with an approximate reciprocal of y (linear seed
 * on the rescaled mantissa, refined by four Newton-Raphson steps). Since
 * the result does not depend on the sign of y, the computation is done on
 * |y|; the reciprocal seed is only valid for a positive input and the
 * Newton iteration diverges when it is fed a negative y.
 */
float fmodf_c(float x, float y)
{
	int n;
	union {
		float f;
		int   i;
	} yinv;
	float a;

	//work on |y|: fmod(x, y) == fmod(x, |y|)
	yinv.f = y;
	yinv.i = yinv.i & 0x7FFFFFFF;
	y = yinv.f;

	//fast reciprocal approximation (4x Newton)
	n = 0x3F800000 - (yinv.i & 0x7F800000);	//exponent shift taking y to [1, 2)
	yinv.i = yinv.i + n;
	yinv.f = 1.41176471f - 0.47058824f * yinv.f;	//linear estimate of 1 / mantissa
	yinv.i = yinv.i + n;						//apply the exponent shift to the estimate
	a = 2.0 - yinv.f * y;
	yinv.f = yinv.f * a;
	a = 2.0 - yinv.f * y;
	yinv.f = yinv.f * a;
	a = 2.0 - yinv.f * y;
	yinv.f = yinv.f * a;
	a = 2.0 - yinv.f * y;
	yinv.f = yinv.f * a;

	n = (int)(x * yinv.f);
	x = x - ((float)n) * y;
	return x;
}

/*
 * NEON implementation. In NEON builds the asm reads x from d0[0] and y from
 * d0[1] and leaves the result in d0; there is deliberately no C-level
 * return in that branch (the README notes that the *_neon functions assume
 * fixed argument locations). Builds without __MATH_NEON fall back to
 * fmodf_c() instead of returning an indeterminate value.
 */
float fmodf_neon_hfp(float x, float y)
{
#ifdef __MATH_NEON
	asm volatile (
	"vdup.f32 		d1, d0[1]					\n\t"	//d1 = {y, y}
	"vdup.f32 		d0, d0[0]					\n\t"	//d0 = {x, x}

	//fast reciprocal approximation
	"vrecpe.f32 	d2, d1					\n\t"	//d2 = ~1.0 / d1
	"vrecps.f32		d3, d2, d1				\n\t"	//d3 = 2.0 - d2 * d1;
	"vmul.f32		d2, d2, d3				\n\t"	//d2 = d2 * d3;
	"vrecps.f32		d3, d2, d1				\n\t"	//d3 = 2.0 - d2 * d1;
	"vmul.f32		d2, d2, d3				\n\t"	//d2 = d2 * d3;
	"vrecps.f32		d3, d2, d1				\n\t"	//d3 = 2.0 - d2 * d1;
	"vmul.f32		d2, d2, d3				\n\t"	//d2 = d2 * d3;
	"vrecps.f32		d3, d2, d1				\n\t"	//d3 = 2.0 - d2 * d1;
	"vmul.f32		d2, d2, d3				\n\t"	//d2 = d2 * d3;

	"vmul.f32		d2, d2, d0				\n\t"	//d2 = d2 * d0;
	"vcvt.s32.f32	d2, d2					\n\t"	//d2 = (int) d2;
	"vcvt.f32.s32	d2, d2					\n\t"	//d2 = (float) d2;
	"vmls.f32		d0, d1, d2				\n\t"	//d0 = d0 - d1 * d2;

	::: "d0", "d1", "d2", "d3"
	);
#else
	return fmodf_c(x, y);
#endif
}

/*
 * Variant that moves x and y from r0/r1 into s0/s1 before calling
 * fmodf_neon_hfp() and moves the result back from s0 into r0 afterwards.
 * Falls back to fmodf_c() when __MATH_NEON is not defined.
 */
float fmodf_neon_sfp(float x, float y)
{
#ifdef __MATH_NEON
	asm volatile ("vmov.f32 s0, r0 		\n\t");
	asm volatile ("vmov.f32 s1, r1 		\n\t");
	fmodf_neon_hfp(x, y);
	asm volatile ("vmov.f32 r0, s0 		\n\t");
#else
	return fmodf_c(x,y);
#endif
}
/*
 * Portable reference implementation of 1 / sqrt(x).
 *
 * Starts from the bit-level estimate 0x5F3759DF - (bits >> 1) and refines it
 * with two Newton-Raphson steps of the form y = y * (3 - x * y * y) / 2.
 *
 * x: input value; the bit trick is only meaningful for positive x.
 * Returns an approximation of 1 / sqrt(x).
 */
float invsqrtf_c(float x)
{
    float b, c;
    int k;
    union {
        float f;
        int i;
    } a;

    //fast invsqrt approx
    a.f = x;
    a.i = 0x5F3759DF - (a.i >> 1);      //VRSQRTE
    for (k = 0; k < 2; k++) {
        c = x * a.f;
        b = (3.0f - c * a.f) * 0.5f;    //VRSQRTS
        a.f = a.f * b;
    }

    return a.f;
}

/*
 * Hard-float variant. The NEON path takes its argument from d0 and leaves
 * the result in d0; there is no C-level return statement on that path.
 * Falls back to invsqrtf_c() when __MATH_NEON is not defined.
 */
float invsqrtf_neon_hfp(float x)
{
#ifdef __MATH_NEON
    asm volatile (

    "vmov.f32 d1, d0 \n\t" //d1 = d0
    "vrsqrte.f32 d0, d0 \n\t" //d0 = ~ 1.0 / sqrt(d0)
    "vmul.f32 d2, d0, d1 \n\t" //d2 = d0 * d1
    "vrsqrts.f32 d3, d2, d0 \n\t" //d3 = (3 - d0 * d2) / 2
    "vmul.f32 d0, d0, d3 \n\t" //d0 = d0 * d3
    "vmul.f32 d2, d0, d1 \n\t" //d2 = d0 * d1
    "vrsqrts.f32 d3, d2, d0 \n\t" //d3 = (3 - d0 * d2) / 2
    "vmul.f32 d0, d0, d3 \n\t" //d0 = d0 * d3

    ::: "d0", "d1", "d2", "d3"
    );
#else
    return invsqrtf_c(x);
#endif
}

/*
 * Soft-float variant: moves the argument from r0 into s0, runs the
 * hard-float routine and moves the result back into r0.
 * Falls back to invsqrtf_c() when __MATH_NEON is not defined.
 */
float invsqrtf_neon_sfp(float x)
{
#ifdef __MATH_NEON
    asm volatile ("vmov.f32 s0, r0 \n\t");
    invsqrtf_neon_hfp(x);
    asm volatile ("vmov.f32 r0, s0 \n\t");
#else
    return invsqrtf_c(x);
#endif
}
Copyright (c) 2015 Lachlan Tychsen-Smith (lachlan.ts@gmail.com) 5 | 6 | Permission is hereby granted, free of charge, to any person obtaining a copy 7 | of this software and associated documentation files (the "Software"), to deal 8 | in the Software without restriction, including without limitation the rights 9 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 | copies of the Software, and to permit persons to whom the Software is 11 | furnished to do so, subject to the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be included in 14 | all copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 19 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 22 | THE SOFTWARE. 
/*
 * Portable reference implementation of ldexp(): m * 2^e.
 *
 * Adds e directly to the exponent field of m. The addition is done on an
 * unsigned view of the bits so that negative exponents do not rely on
 * left-shifting a negative int, which is undefined in C.
 *
 * m: mantissa value. No special cases are handled: zero, denormal, infinite
 *    and NaN inputs, and results whose exponent leaves the float range,
 *    produce meaningless values.
 * e: power of two to scale by.
 * Returns m * 2^e for normal inputs and in-range results.
 */
float ldexpf_c(float m, int e)
{
    union {
        float f;
        unsigned int u;
    } r;

    r.f = m;
    r.u += ((unsigned int) e) << 23;    //exponent field starts at bit 23
    return r.f;
}

/*
 * Hard-float variant. The NEON path takes m from d0 and e from r0 and leaves
 * the result in d0; there is no C-level return statement on that path.
 * r0 is listed as clobbered because the shift rewrites it.
 * Falls back to ldexpf_c() when __MATH_NEON is not defined.
 */
float ldexpf_neon_hfp(float m, int e)
{
#ifdef __MATH_NEON
    asm volatile (
    "lsl r0, r0, #23 \n\t" //r0 = r0 << 23
    "vdup.i32 d1, r0 \n\t" //d1 = {r0, r0}
    "vadd.i32 d0, d0, d1 \n\t" //d0 = d0 + d1
    ::: "d0", "d1", "r0"
    );
#else
    return ldexpf_c(m, e);
#endif
}

/*
 * Soft-float variant. The NEON path takes m from r0 and e from r1 and leaves
 * the result in r0. r1 is listed as clobbered because the shift rewrites it.
 * Falls back to ldexpf_c() when __MATH_NEON is not defined.
 */
float ldexpf_neon_sfp(float m, int e)
{
#ifdef __MATH_NEON
    asm volatile (
    "lsl r1, r1, #23 \n\t" //r1 = r1 << 23
    "vdup.f32 d0, r0 \n\t" //d0 = {r0, r0}
    "vdup.i32 d1, r1 \n\t" //d1 = {r1, r1}
    "vadd.i32 d0, d0, d1 \n\t" //d0 = d0 + d1
    "vmov.f32 r0, s0 \n\t" //r0 = s0
    ::: "d0", "d1", "r1"
    );
#else
    return ldexpf_c(m,e);
#endif
}
/*
Based on:

        log10(x) = log10((1+m) * (2^n))
        log10(x) = n * log10(2) + log10(1 + m)
        log10(1+m) = Poly(1+m)

        where Poly(x) is the Minimax approximation of log10(x) over the
        range [1, 2]

Test func : log10f(x)
Test Range: 1 < x < 10000
Peak Error: ~0.000040%
RMS Error: ~0.000008%
*/

/* log10(2): weight of one unit of binary exponent. */
const float __log10f_rng = 0.3010299957f;

/*
 * Polynomial coefficients p0..p7 (p_k multiplies x^k), stored in the order
 * the NEON routine loads them: {p0, p4, p2, p6} followed by {p1, p5, p3, p7}.
 */
const float __log10f_lut[8] = {
    -0.99697286229624,  //p0
    -1.07301643912502,  //p4
    -2.46980061535534,  //p2
    -0.07176870463131,  //p6
    2.247870219989470,  //p1
    0.366547581117400,  //p5
    1.991005185100089,  //p3
    0.006135635201050,  //p7
};

/*
 * Portable reference implementation of log10() for single precision.
 *
 * Splits x into a binary exponent and a mantissa in [1, 2), evaluates the
 * degree-7 polynomial on the mantissa and adds exponent * log10(2).
 *
 * x: input value; only positive normal values give meaningful results
 *    (no handling of zero, negative, denormal, infinite or NaN inputs).
 * Returns an approximation of log10(x).
 */
float log10f_c(float x)
{
    float a, b, c, d, xx;
    int m;

    union {
        float f;
        int i;
        unsigned int u;
    } r;

    //extract exponent
    r.f = x;
    m = (r.i >> 23);
    m = m - 127;
    //rebias to exponent 0; unsigned arithmetic because m is negative for x < 1
    r.u = r.u - (((unsigned int) m) << 23);

    //polynomial, evaluated with Estrin's scheme
    xx = r.f * r.f;
    a = (__log10f_lut[4] * r.f) + (__log10f_lut[0]);    //p1*x + p0
    b = (__log10f_lut[6] * r.f) + (__log10f_lut[2]);    //p3*x + p2
    c = (__log10f_lut[5] * r.f) + (__log10f_lut[1]);    //p5*x + p4
    d = (__log10f_lut[7] * r.f) + (__log10f_lut[3]);    //p7*x + p6
    a = a + b * xx;
    c = c + d * xx;
    xx = xx * xx;
    r.f = a + c * xx;

    //add exponent
    r.f = r.f + ((float) m) * __log10f_rng;

    return r.f;
}

/*
 * Hard-float variant. The NEON path takes its argument from d0[0] and leaves
 * the result in s0; there is no C-level return statement on that path.
 * Falls back to log10f_c() when __MATH_NEON is not defined.
 */
float log10f_neon_hfp(float x)
{
#ifdef __MATH_NEON
    asm volatile (

    "vdup.f32 d0, d0[0] \n\t" //d0 = {x,x};

    //extract exponent
    "vmov.i32 d2, #127 \n\t" //d2 = 127;
    "vshr.u32 d6, d0, #23 \n\t" //d6 = d0 >> 23;
    "vsub.i32 d6, d6, d2 \n\t" //d6 = d6 - d2;
    "vshl.u32 d1, d6, #23 \n\t" //d1 = d6 << 23;
    "vsub.i32 d0, d0, d1 \n\t" //d0 = d0 - d1;

    //polynomial:
    "vmul.f32 d1, d0, d0 \n\t" //d1 = d0*d0 = {x^2, x^2}
    "vld1.32 {d2, d3, d4, d5}, [%1] \n\t" //q1 = {p0, p4, p2, p6}, q2 = {p1, p5, p3, p7} ;
    "vmla.f32 q1, q2, d0[0] \n\t" //q1 = q1 + q2 * d0[0]
    "vmla.f32 d2, d3, d1[0] \n\t" //d2 = d2 + d3 * d1[0]
    "vmul.f32 d1, d1, d1 \n\t" //d1 = d1 * d1 = {x^4, x^4}
    "vmla.f32 d2, d1, d2[1] \n\t" //d2 = d2 + d1 * d2[1]

    //add exponent
    "vdup.32 d7, %0 \n\t" //d7 = {rng, rng}
    "vcvt.f32.s32 d6, d6 \n\t" //d6 = (float) d6
    "vmla.f32 d2, d6, d7 \n\t" //d2 = d2 + d6 * d7

    "vmov.f32 s0, s4 \n\t" //s0 = s4

    :: "r"(__log10f_rng), "r"(__log10f_lut)
    : "d0", "d1", "q1", "q2", "d6", "d7"
    );
#else
    return log10f_c(x);
#endif
}


/*
 * Soft-float variant: moves the argument from r0 into s0, runs the
 * hard-float routine and moves the result back into r0.
 * Falls back to log10f_c() when __MATH_NEON is not defined.
 */
float log10f_neon_sfp(float x)
{
#ifdef __MATH_NEON
    asm volatile ("vmov.f32 s0, r0 \n\t");
    log10f_neon_hfp(x);
    asm volatile ("vmov.f32 r0, s0 \n\t");
#else
    return log10f_c(x);
#endif
}
/*
Based on:

        log(x) = log((1+m) * (2^n))
        log(x) = n * log(2) + log(1 + m)
        log(1+m) = Poly(1+m)

        where Poly(x) is the Minimax approximation of log(x) over the
        range [1, 2]

Test func : logf(x)
Test Range: 1 < x < 10000
Peak Error: ~0.000601%
RMS Error: ~0.000005%
*/

/* log(2): weight of one unit of binary exponent. */
const float __logf_rng = 0.693147180f;

/*
 * Polynomial coefficients p0..p7 (p_k multiplies x^k), stored in the order
 * the NEON routine loads them: {p0, p4, p2, p6} followed by {p1, p5, p3, p7}.
 */
const float __logf_lut[8] = {
    -2.295614848256274,     //p0
    -2.470711633419806,     //p4
    -5.686926051100417,     //p2
    -0.165253547131978,     //p6
    +5.175912446351073,     //p1
    +0.844006986174912,     //p5
    +4.584458825456749,     //p3
    +0.014127821926000      //p7
};

/*
 * Portable reference implementation of the natural logarithm.
 *
 * Splits x into a binary exponent and a mantissa in [1, 2), evaluates the
 * degree-7 polynomial on the mantissa and adds exponent * log(2).
 *
 * x: input value; only positive normal values give meaningful results
 *    (no handling of zero, negative, denormal, infinite or NaN inputs).
 * Returns an approximation of log(x).
 */
float logf_c(float x)
{
    float a, b, c, d, xx;
    int m;

    union {
        float f;
        int i;
        unsigned int u;
    } r;

    //extract exponent
    r.f = x;
    m = (r.i >> 23);
    m = m - 127;
    //rebias to exponent 0; unsigned arithmetic because m is negative for x < 1
    r.u = r.u - (((unsigned int) m) << 23);

    //polynomial, evaluated with Estrin's scheme
    xx = r.f * r.f;
    a = (__logf_lut[4] * r.f) + (__logf_lut[0]);    //p1*x + p0
    b = (__logf_lut[6] * r.f) + (__logf_lut[2]);    //p3*x + p2
    c = (__logf_lut[5] * r.f) + (__logf_lut[1]);    //p5*x + p4
    d = (__logf_lut[7] * r.f) + (__logf_lut[3]);    //p7*x + p6
    a = a + b * xx;
    c = c + d * xx;
    xx = xx * xx;
    r.f = a + c * xx;

    //add exponent
    r.f = r.f + ((float) m) * __logf_rng;

    return r.f;
}

/*
 * Hard-float variant. The NEON path takes its argument from d0[0] and leaves
 * the result in s0; there is no C-level return statement on that path.
 * Falls back to logf_c() when __MATH_NEON is not defined.
 */
float logf_neon_hfp(float x)
{
#ifdef __MATH_NEON
    asm volatile (

    "vdup.f32 d0, d0[0] \n\t" //d0 = {x,x};

    //extract exponent
    "vmov.i32 d2, #127 \n\t" //d2 = 127;
    "vshr.u32 d6, d0, #23 \n\t" //d6 = d0 >> 23;
    "vsub.i32 d6, d6, d2 \n\t" //d6 = d6 - d2;
    "vshl.u32 d1, d6, #23 \n\t" //d1 = d6 << 23;
    "vsub.i32 d0, d0, d1 \n\t" //d0 = d0 - d1;

    //polynomial:
    "vmul.f32 d1, d0, d0 \n\t" //d1 = d0*d0 = {x^2, x^2}
    "vld1.32 {d2, d3, d4, d5}, [%1] \n\t" //q1 = {p0, p4, p2, p6}, q2 = {p1, p5, p3, p7} ;
    "vmla.f32 q1, q2, d0[0] \n\t" //q1 = q1 + q2 * d0[0]
    "vmla.f32 d2, d3, d1[0] \n\t" //d2 = d2 + d3 * d1[0]
    "vmul.f32 d1, d1, d1 \n\t" //d1 = d1 * d1 = {x^4, x^4}
    "vmla.f32 d2, d1, d2[1] \n\t" //d2 = d2 + d1 * d2[1]

    //add exponent
    "vdup.32 d7, %0 \n\t" //d7 = {rng, rng}
    "vcvt.f32.s32 d6, d6 \n\t" //d6 = (float) d6
    "vmla.f32 d2, d6, d7 \n\t" //d2 = d2 + d6 * d7

    "vmov.f32 s0, s4 \n\t" //s0 = s4

    :: "r"(__logf_rng), "r"(__logf_lut)
    : "d0", "d1", "q1", "q2", "d6", "d7"
    );
#else
    return logf_c(x);
#endif
}

/*
 * Soft-float variant: moves the argument from r0 into s0, runs the
 * hard-float routine and moves the result back into r0.
 * Falls back to logf_c() when __MATH_NEON is not defined.
 */
float logf_neon_sfp(float x)
{
#ifdef __MATH_NEON
    asm volatile ("vmov.f32 s0, r0 \n\t");
    logf_neon_hfp(x);
    asm volatile ("vmov.f32 r0, s0 \n\t");
#else
    return logf_c(x);
#endif
}
obtaining a copy 7 | of this software and associated documentation files (the "Software"), to deal 8 | in the Software without restriction, including without limitation the rights 9 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 | copies of the Software, and to permit persons to whom the Software is 11 | furnished to do so, subject to the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be included in 14 | all copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 19 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 22 | THE SOFTWARE. 23 | */ 24 | 25 | /* 26 | Matrices are specified in column major format: 27 | 28 | | a c | 29 | | b d | 30 | 31 | therefore m[2] = c 32 | */ 33 | 34 | #include "math_neon.h" 35 | 36 | //matrix matrix multiplication.
/*
 * 2x2 matrix product d = m0 * m1 (column-major storage).
 *
 * All four results are computed before anything is stored, so d may be the
 * same array as m0 or m1, matching the NEON routine, which loads both
 * operands before it stores.
 *
 * m0, m1: input matrices, 4 floats each.
 * d:      receives the product, 4 floats.
 */
void
matmul2_c(float m0[4], float m1[4], float d[4])
{
    const float r0 = m0[0]*m1[0] + m0[2]*m1[1];
    const float r1 = m0[1]*m1[0] + m0[3]*m1[1];
    const float r2 = m0[0]*m1[2] + m0[2]*m1[3];
    const float r3 = m0[1]*m1[2] + m0[3]*m1[3];

    d[0] = r0;
    d[1] = r1;
    d[2] = r2;
    d[3] = r3;
}

/*
 * NEON version of matmul2_c(); forwards to matmul2_c() when __MATH_NEON is
 * not defined.
 */
void
matmul2_neon(float m0[4], float m1[4], float d[4])
{
#ifdef __MATH_NEON
    asm volatile (
    "vld1.32 {d0, d1}, [%0] \n\t" //Q0 = m0
    "vld1.32 {d2, d3}, [%1] \n\t" //Q1 = m1

    "vmul.f32 d4, d0, d2[0] \n\t" //D4 = D0*D2[0]
    "vmul.f32 d5, d0, d3[0] \n\t" //D5 = D0*D3[0]
    "vmla.f32 d4, d1, d2[1] \n\t" //D4 += D1*D2[1]
    "vmla.f32 d5, d1, d3[1] \n\t" //D5 += D1*D3[1]

    "vst1.32 {d4, d5}, [%2] \n\t" //d = Q2
    :: "r"(m0), "r"(m1), "r"(d)
    : "q0", "q1", "q2", "memory"
    );
#else
    matmul2_c(m0, m1, d);
#endif
}


/*
 * 2x2 matrix times 2-vector, d = m * v (column-major matrix).
 *
 * Both results are computed before anything is stored, so d may be the same
 * array as v.
 *
 * m: input matrix, 4 floats.
 * v: input vector, 2 floats.
 * d: receives the product, 2 floats.
 */
void
matvec2_c(float m[4], float v[2], float d[2])
{
    const float r0 = m[0]*v[0] + m[2]*v[1];
    const float r1 = m[1]*v[0] + m[3]*v[1];

    d[0] = r0;
    d[1] = r1;
}

/*
 * NEON version of matvec2_c(); forwards to matvec2_c() when __MATH_NEON is
 * not defined.
 */
void
matvec2_neon(float m[4], float v[2], float d[2])
{
#ifdef __MATH_NEON
    asm volatile (
    "vld1.32 d0, [%1] \n\t" //d0 = v
    "vld1.32 {d1, d2}, [%0] \n\t" //d1, d2 = columns of m

    "vmul.f32 d3, d1, d0[0] \n\t" //d3 = d1*d0[0]
    "vmla.f32 d3, d2, d0[1] \n\t" //d3 += d2*d0[1]

    "vst1.32 d3, [%2] \n\t" //d = d3
    :: "r"(m), "r"(v), "r"(d)
    : "d0", "d1", "d2","d3", "memory"
    );
#else
    matvec2_c(m, v, d);
#endif
}
/*
Matrices are specified in column major format:

| x0 x3 x6 |
| x1 x4 x7 |
| x2 x5 x8 |

therefore m[3] = x3 is the first element of the second column

*/

/*
 * 3x3 matrix product d = m0 * m1 (column-major storage).
 *
 * The product is built in a local buffer and copied out at the end, so d may
 * be the same array as m0 or m1, matching the NEON routine, which loads both
 * operands before it stores.
 *
 * m0, m1: input matrices, 9 floats each.
 * d:      receives the product, 9 floats.
 */
void
matmul3_c(float m0[9], float m1[9], float d[9])
{
    float r[9];
    int col, row, k;

    for (col = 0; col < 3; col++) {
        for (row = 0; row < 3; row++) {
            r[col * 3 + row] = m0[row] * m1[col * 3]
                             + m0[row + 3] * m1[col * 3 + 1]
                             + m0[row + 6] * m1[col * 3 + 2];
        }
    }
    for (k = 0; k < 9; k++) {
        d[k] = r[k];
    }
}

/*
 * NEON version of matmul3_c(); forwards to matmul3_c() when __MATH_NEON is
 * not defined. The clobber list names every NEON register the code writes
 * (q0-q8).
 */
void
matmul3_neon(float m0[9], float m1[9], float d[9])
{
#ifdef __MATH_NEON
    asm volatile (
    "vld1.32 {d0, d1}, [%1]! \n\t" //q0 = m1[0..3]
    "vld1.32 {d2, d3}, [%1]! \n\t" //q1 = m1[4..7]
    "flds s8, [%1] \n\t" //d4[0] = m1[8]

    "vld1.32 {d6, d7}, [%0] \n\t" //q3 = m0[0..3]
    "add %0, %0, #12 \n\t" //m0 += 3 floats
    "vld1.32 {d8, d9}, [%0] \n\t" //q4 = m0[3..6]
    "add %0, %0, #12 \n\t" //m0 += 3 floats
    "vld1.32 {d10}, [%0] \n\t" //d10 = m0[6..7]
    "add %0, %0, #8 \n\t" //m0 += 2 floats
    "flds s22, [%0] \n\t" //d11[0] = m0[8]

    "vmul.f32 q6, q3, d0[0] \n\t" //q6 = q3 * m1[0]
    "vmul.f32 q7, q3, d1[1] \n\t" //q7 = q3 * m1[3]
    "vmul.f32 q8, q3, d3[0] \n\t" //q8 = q3 * m1[6]
    "vmla.f32 q6, q4, d0[1] \n\t" //q6 += q4 * m1[1]
    "vmla.f32 q7, q4, d2[0] \n\t" //q7 += q4 * m1[4]
    "vmla.f32 q8, q4, d3[1] \n\t" //q8 += q4 * m1[7]
    "vmla.f32 q6, q5, d1[0] \n\t" //q6 += q5 * m1[2]
    "vmla.f32 q7, q5, d2[1] \n\t" //q7 += q5 * m1[5]
    "vmla.f32 q8, q5, d4[0] \n\t" //q8 += q5 * m1[8]

    "vmov.f32 q0, q8 \n\t" //q0 = q8
    "vst1.32 {d12, d13}, [%2] \n\t" //d[0..3] = q6
    "add %2, %2, #12 \n\t" //d += 3 floats
    "vst1.32 {d14, d15}, [%2] \n\t" //d[3..6] = q7
    "add %2, %2, #12 \n\t" //d += 3 floats
    "vst1.32 {d0}, [%2] \n\t" //d[6..7] = q0 low half
    "add %2, %2, #8 \n\t" //d += 2 floats
    "fsts s2, [%2] \n\t" //d[8] = q0[2]

    : "+r"(m0), "+r"(m1), "+r"(d):
    : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "memory"
    );
#else
    matmul3_c(m0, m1, d);
#endif
}

/*
 * 3x3 matrix times 3-vector, d = m * v (column-major matrix).
 *
 * All three results are computed before anything is stored, so d may be the
 * same array as v.
 *
 * m: input matrix, 9 floats.
 * v: input vector, 3 floats.
 * d: receives the product, 3 floats.
 */
void
matvec3_c(float m[9], float v[3], float d[3])
{
    const float r0 = m[0]*v[0] + m[3]*v[1] + m[6]*v[2];
    const float r1 = m[1]*v[0] + m[4]*v[1] + m[7]*v[2];
    const float r2 = m[2]*v[0] + m[5]*v[1] + m[8]*v[2];

    d[0] = r0;
    d[1] = r1;
    d[2] = r2;
}

/*
 * NEON version of matvec3_c(); forwards to matvec3_c() when __MATH_NEON is
 * not defined.
 *
 * v and the last column of m are loaded as two floats plus one float so that
 * nothing is read beyond v[2] or m[8]. tmp is a write-only scratch register
 * holding the column stride. The clobber list names the NEON registers the
 * code writes (q0-q3, q9).
 */
void
matvec3_neon(float m[9], float v[3], float d[3])
{
#ifdef __MATH_NEON
    int tmp;
    asm volatile (
    "mov %3, #12 \n\t" //tmp = 12 (bytes per column)
    "vld1.32 {d0}, [%1]! \n\t" //d0 = v[0..1]
    "flds s2, [%1] \n\t" //d1[0] = v[2]
    "vld1.32 {d2, d3}, [%0], %3 \n\t" //Q1 = m[0..3]
    "vld1.32 {d4, d5}, [%0], %3 \n\t" //Q2 = m[3..6]
    "vld1.32 {d6}, [%0]! \n\t" //d6 = m[6..7]
    "flds s14, [%0] \n\t" //d7[0] = m[8]

    "vmul.f32 q9, q1, d0[0] \n\t" //Q9 = Q1*v[0]
    "vmla.f32 q9, q2, d0[1] \n\t" //Q9 += Q2*v[1]
    "vmla.f32 q9, q3, d1[0] \n\t" //Q9 += Q3*v[2]
    "vmov.f32 q0, q9 \n\t" //Q0 = Q9

    "vst1.32 d0, [%2]! \n\t" //d[0..1] = Q0 low half
    "fsts s2, [%2] \n\t" //d[2] = Q0[2]

    : "+r"(m), "+r"(v), "+r"(d), "=&r"(tmp):
    : "q0", "q1", "q2", "q3", "q9", "memory"
    );
#else
    matvec3_c(m, v, d);
#endif
}
/*
Matrices are specified in column major format:

| x0 x4 x8  x12 |
| x1 x5 x9  x13 |
| x2 x6 x10 x14 |
| x3 x7 x11 x15 |

therefore m[4] = x4 is the first element of the second column

*/

/*
 * 4x4 matrix product d = m0 * m1 (column-major storage).
 *
 * The product is built in a local buffer and copied out at the end, so d may
 * be the same array as m0 or m1, matching the NEON routine, which loads both
 * operands before it stores.
 *
 * m0, m1: input matrices, 16 floats each.
 * d:      receives the product, 16 floats.
 */
void
matmul4_c(float m0[16], float m1[16], float d[16])
{
    float r[16];
    int col, row, k;

    for (col = 0; col < 4; col++) {
        for (row = 0; row < 4; row++) {
            r[col * 4 + row] = m0[row] * m1[col * 4]
                             + m0[row + 4] * m1[col * 4 + 1]
                             + m0[row + 8] * m1[col * 4 + 2]
                             + m0[row + 12] * m1[col * 4 + 3];
        }
    }
    for (k = 0; k < 16; k++) {
        d[k] = r[k];
    }
}

/*
 * NEON version of matmul4_c(); forwards to matmul4_c() when __MATH_NEON is
 * not defined.
 */
void
matmul4_neon(float m0[16], float m1[16], float d[16])
{
#ifdef __MATH_NEON
    asm volatile (
    "vld1.32 {d0, d1}, [%1]! \n\t" //q0 = m1
    "vld1.32 {d2, d3}, [%1]! \n\t" //q1 = m1+4
    "vld1.32 {d4, d5}, [%1]! \n\t" //q2 = m1+8
    "vld1.32 {d6, d7}, [%1] \n\t" //q3 = m1+12
    "vld1.32 {d16, d17}, [%0]! \n\t" //q8 = m0
    "vld1.32 {d18, d19}, [%0]! \n\t" //q9 = m0+4
    "vld1.32 {d20, d21}, [%0]! \n\t" //q10 = m0+8
    "vld1.32 {d22, d23}, [%0] \n\t" //q11 = m0+12

    "vmul.f32 q12, q8, d0[0] \n\t" //q12 = q8 * d0[0]
    "vmul.f32 q13, q8, d2[0] \n\t" //q13 = q8 * d2[0]
    "vmul.f32 q14, q8, d4[0] \n\t" //q14 = q8 * d4[0]
    "vmul.f32 q15, q8, d6[0] \n\t" //q15 = q8 * d6[0]
    "vmla.f32 q12, q9, d0[1] \n\t" //q12 += q9 * d0[1]
    "vmla.f32 q13, q9, d2[1] \n\t" //q13 += q9 * d2[1]
    "vmla.f32 q14, q9, d4[1] \n\t" //q14 += q9 * d4[1]
    "vmla.f32 q15, q9, d6[1] \n\t" //q15 += q9 * d6[1]
    "vmla.f32 q12, q10, d1[0] \n\t" //q12 += q10 * d1[0]
    "vmla.f32 q13, q10, d3[0] \n\t" //q13 += q10 * d3[0]
    "vmla.f32 q14, q10, d5[0] \n\t" //q14 += q10 * d5[0]
    "vmla.f32 q15, q10, d7[0] \n\t" //q15 += q10 * d7[0]
    "vmla.f32 q12, q11, d1[1] \n\t" //q12 += q11 * d1[1]
    "vmla.f32 q13, q11, d3[1] \n\t" //q13 += q11 * d3[1]
    "vmla.f32 q14, q11, d5[1] \n\t" //q14 += q11 * d5[1]
    "vmla.f32 q15, q11, d7[1] \n\t" //q15 += q11 * d7[1]

    "vst1.32 {d24, d25}, [%2]! \n\t" //d = q12
    "vst1.32 {d26, d27}, [%2]! \n\t" //d+4 = q13
    "vst1.32 {d28, d29}, [%2]! \n\t" //d+8 = q14
    "vst1.32 {d30, d31}, [%2] \n\t" //d+12 = q15

    : "+r"(m0), "+r"(m1), "+r"(d) :
    : "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15",
    "memory"
    );
#else
    matmul4_c(m0, m1, d);
#endif
}


/*
 * 4x4 matrix times 4-vector, d = m * v (column-major matrix).
 *
 * All four results are computed before anything is stored, so d may be the
 * same array as v.
 *
 * m: input matrix, 16 floats.
 * v: input vector, 4 floats.
 * d: receives the product, 4 floats.
 */
void
matvec4_c(float m[16], float v[4], float d[4])
{
    const float r0 = m[0]*v[0] + m[4]*v[1] + m[8]*v[2] + m[12]*v[3];
    const float r1 = m[1]*v[0] + m[5]*v[1] + m[9]*v[2] + m[13]*v[3];
    const float r2 = m[2]*v[0] + m[6]*v[1] + m[10]*v[2] + m[14]*v[3];
    const float r3 = m[3]*v[0] + m[7]*v[1] + m[11]*v[2] + m[15]*v[3];

    d[0] = r0;
    d[1] = r1;
    d[2] = r2;
    d[3] = r3;
}

/*
 * NEON version of matvec4_c(); forwards to matvec4_c() when __MATH_NEON is
 * not defined. m is a read-write operand because the loads post-increment
 * its register.
 */
void
matvec4_neon(float m[16], float v[4], float d[4])
{
#ifdef __MATH_NEON
    asm volatile (
    "vld1.32 {d0, d1}, [%1] \n\t" //Q0 = v
    "vld1.32 {d18, d19}, [%0]! \n\t" //Q9 = m
    "vld1.32 {d20, d21}, [%0]! \n\t" //Q10 = m+4
    "vld1.32 {d22, d23}, [%0]! \n\t" //Q11 = m+8
    "vld1.32 {d24, d25}, [%0]! \n\t" //Q12 = m+12

    "vmul.f32 q13, q9, d0[0] \n\t" //Q13 = Q9*Q0[0]
    "vmla.f32 q13, q10, d0[1] \n\t" //Q13 += Q10*Q0[1]
    "vmla.f32 q13, q11, d1[0] \n\t" //Q13 += Q11*Q0[2]
    "vmla.f32 q13, q12, d1[1] \n\t" //Q13 += Q12*Q0[3]

    "vst1.32 {d26, d27}, [%2] \n\t" //d = Q13
    : "+r"(m)
    : "r"(v), "r"(d)
    : "q0", "q9", "q10","q11", "q12", "q13", "memory"
    );
#else
    matvec4_c(m, v, d);
#endif
}
/*
Assumes the floating point value |x| < 2,147,483,648
*/

/*
 * Portable reference implementation: splits x into integer and fractional
 * parts. Unlike the standard modf(), the integer part is stored as an int.
 *
 * x: input value; must satisfy |x| < 2^31 because it is converted to int.
 * i: receives x truncated towards zero; must not be NULL.
 * Returns the fractional part x - *i, which carries the sign of x.
 */
float modf_c(float x, int *i)
{
    const int whole = (int)x;   /* truncates towards zero */

    *i = whole;
    return x - (float)whole;
}


/*
 * Hard-float variant. The NEON path takes x from d0 and the pointer from r0,
 * stores the integer part through r0 and leaves the fraction in d0; there is
 * no C-level return statement on that path. "memory" is clobbered because
 * the asm writes through the pointer.
 * Falls back to modf_c() when __MATH_NEON is not defined.
 */
float modf_neon_hfp(float x, int *i)
{
#ifdef __MATH_NEON
    asm volatile (
    "vcvt.s32.f32 d1, d0 \n\t" //d1 = (int) d0;
    "vcvt.f32.s32 d2, d1 \n\t" //d2 = (float) d1;
    "vsub.f32 d0, d0, d2 \n\t" //d0 = d0 - d2;
    "vstr.i32 s2, [r0] \n\t" //[r0] = d1[0]
    ::: "d0", "d1", "d2", "memory"
    );
#else
    return modf_c(x, i);
#endif
}


/*
 * Soft-float variant. The NEON path takes x from r0 and the pointer from r1,
 * stores the integer part through r1 and leaves the fraction in r0.
 * "memory" is clobbered because the asm writes through the pointer.
 * Falls back to modf_c() when __MATH_NEON is not defined.
 */
float modf_neon_sfp(float x, int *i)
{
#ifdef __MATH_NEON
    asm volatile (
    "vdup.f32 d0, r0 \n\t" //d0 = {x, x}
    "vcvt.s32.f32 d1, d0 \n\t" //d1 = (int) d0;
    "vcvt.f32.s32 d2, d1 \n\t" //d2 = (float) d1;
    "vsub.f32 d0, d0, d2 \n\t" //d0 = d0 - d2;
    "vstr.i32 s2, [r1] \n\t" //[r1] = d1[0]
    "vmov.f32 r0, s0 \n\t" //r0 = d0[0];
    ::: "d0", "d1", "d2", "memory"
    );

#else
    return modf_c(x, i);
#endif
}
/* person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/

#ifndef __MATH_NEON_H__
#define __MATH_NEON_H__

#if !defined(__i386__) && defined(__arm__)
//When __MATH_NEON is defined the NEON ASM routines are used; otherwise the
//*_neon functions fall back to their equivalent *_c function.
#define __MATH_NEON

//Default floating point ABI: 0=softfp, 1=hardfp. Only affects the *_neon names.
//You can access the hardfp versions directly via the *_hfp suffix.
//You can access the softfp versions directly via the *_sfp suffix.
#define __MATH_FPABI	0

#endif

//ALIGN(A) requests A-byte alignment for a variable. It is only active when the
//build itself defines GCC (for example with -DGCC); no compiler predefines it.
#ifdef GCC
#define ALIGN(A) __attribute__ ((aligned (A)))
#else
#define ALIGN(A)
#endif

//Fallback definitions of the usual <math.h> constants. Each one is guarded on
//its own name so that it is supplied whenever the C library did not define it
//(e.g. strict ISO C modes), whichever header happens to be included first.
#ifndef M_PI
#define M_PI		3.14159265358979323846	/* pi */
#endif
#ifndef M_PI_2
#define M_PI_2		1.57079632679489661923	/* pi/2 */
#endif
#ifndef M_PI_4
#define M_PI_4		0.78539816339744830962	/* pi/4 */
#endif
#ifndef M_E
#define M_E		2.7182818284590452354	/* e */
#endif
#ifndef M_LOG2E
#define M_LOG2E		1.4426950408889634074	/* log_2 e */
#endif
#ifndef M_LOG10E
#define M_LOG10E	0.43429448190325182765	/* log_10 e */
#endif
#ifndef M_LN2
#define M_LN2		0.69314718055994530942	/* log_e 2 */
#endif
#ifndef M_LN10
#define M_LN10		2.30258509299404568402	/* log_e 10 */
#endif
#ifndef M_1_PI
#define M_1_PI		0.31830988618379067154	/* 1/pi */
#endif
#ifndef M_2_PI
#define M_2_PI		0.63661977236758134308	/* 2/pi */
#endif
#ifndef M_2_SQRTPI
#define M_2_SQRTPI	1.12837916709551257390	/* 2/sqrt(pi) */
#endif
#ifndef M_SQRT2
#define M_SQRT2		1.41421356237309504880	/* sqrt(2) */
#endif
#ifndef M_SQRT1_2
#define M_SQRT1_2	0.70710678118654752440	/* 1/sqrt(2) */
#endif

#if __MATH_FPABI == 1
#define sinf_neon		sinf_neon_hfp
#define cosf_neon		cosf_neon_hfp
#define sincosf_neon	sincosf_neon_hfp
#define tanf_neon		tanf_neon_hfp
#define atanf_neon		atanf_neon_hfp
#define atan2f_neon		atan2f_neon_hfp
#define asinf_neon		asinf_neon_hfp
#define acosf_neon		acosf_neon_hfp
#define sinhf_neon		sinhf_neon_hfp
#define coshf_neon		coshf_neon_hfp
#define tanhf_neon		tanhf_neon_hfp
#define expf_neon		expf_neon_hfp
#define logf_neon		logf_neon_hfp
#define log10f_neon		log10f_neon_hfp
#define powf_neon		powf_neon_hfp
#define floorf_neon		floorf_neon_hfp
#define ceilf_neon		ceilf_neon_hfp
#define fabsf_neon		fabsf_neon_hfp
#define ldexpf_neon		ldexpf_neon_hfp
#define frexpf_neon		frexpf_neon_hfp
#define fmodf_neon		fmodf_neon_hfp
#define modf_neon		modf_neon_hfp
#define sqrtf_neon		sqrtf_neon_hfp
#define invsqrtf_neon	invsqrtf_neon_hfp
#else
#define sinf_neon		sinf_neon_sfp
#define cosf_neon		cosf_neon_sfp
#define sincosf_neon	sincosf_neon_sfp
#define tanf_neon		tanf_neon_sfp
#define atanf_neon		atanf_neon_sfp
#define atan2f_neon		atan2f_neon_sfp
#define asinf_neon		asinf_neon_sfp
#define acosf_neon		acosf_neon_sfp
#define sinhf_neon		sinhf_neon_sfp
#define coshf_neon		coshf_neon_sfp
#define tanhf_neon		tanhf_neon_sfp
#define expf_neon		expf_neon_sfp
#define logf_neon		logf_neon_sfp
#define log10f_neon		log10f_neon_sfp
#define powf_neon		powf_neon_sfp
#define floorf_neon		floorf_neon_sfp
#define ceilf_neon		ceilf_neon_sfp
#define fabsf_neon		fabsf_neon_sfp
#define ldexpf_neon		ldexpf_neon_sfp
#define frexpf_neon		frexpf_neon_sfp
#define fmodf_neon		fmodf_neon_sfp
#define modf_neon		modf_neon_sfp
#define sqrtf_neon		sqrtf_neon_sfp
#define invsqrtf_neon	invsqrtf_neon_sfp

#define dot2_neon		dot2_neon_sfp
#define dot3_neon		dot3_neon_sfp
#define dot4_neon		dot4_neon_sfp
#endif

/*
function:	enable_runfast
			this function enables the floating point runfast mode on the
			ARM Cortex A8.
*/
void		enable_runfast(void);


float dot2_c(float v0[2], float v1[2]);
float dot2_neon(float v0[2], float v1[2]);
float dot3_c(float v0[3], float v1[3]);
float dot3_neon(float v0[3], float v1[3]);
float dot4_c(float v0[4], float v1[4]);
float dot4_neon(float v0[4], float v1[4]);

void cross3_c(float v0[3], float v1[3], float d[3]);
void cross3_neon(float v0[3], float v1[3], float d[3]);

void normalize2_c(float v[2], float d[2]);
void normalize2_neon(float v[2], float d[2]);
void normalize3_c(float v[3], float d[3]);
void normalize3_neon(float v[3], float d[3]);
void normalize4_c(float v[4], float d[4]);
void normalize4_neon(float v[4], float d[4]);

/*
function:	matmul2
arguments:	m0 2x2 matrix, m1 2x2 matrix
return:		d 2x2 matrix
expression:	d = m0 * m1
*/
void matmul2_c(float m0[4], float m1[4], float d[4]);
void matmul2_neon(float m0[4], float m1[4], float d[4]);

/*
function:	matmul3
arguments:	m0 3x3 matrix, m1 3x3 matrix
return:		d 3x3 matrix
expression:	d = m0 * m1
*/
void matmul3_c(float m0[9], float m1[9], float d[9]);
void matmul3_neon(float m0[9], float m1[9], float d[9]);

/*
function:	matmul4
arguments:	m0 4x4 matrix, m1 4x4 matrix
return:		d 4x4 matrix
expression:	d = m0 * m1
*/
void matmul4_c(float m0[16], float m1[16], float d[16]);
void matmul4_neon(float m0[16], float m1[16], float d[16]);

/*
function:	matvec2
arguments:	m 2x2 matrix, v 2 element vector
return:		d 2 element vector
expression:	d = m * v
*/
void matvec2_c(float m[4], float v[2], float d[2]);
void matvec2_neon(float m[4], float v[2], float d[2]);

/*
function:	matvec3
arguments:	m 3x3 matrix, v 3 element vector
return:		d 3 element vector
expression:	d = m * v
*/
void matvec3_c(float m[9], float v[3], float d[3]);
void matvec3_neon(float m[9], float v[3], float d[3]);

/*
function:	matvec4
arguments:	m 4x4 matrix, v 4 element vector
return:		d 4 element vector
expression:	d = m * v
*/
void matvec4_c(float m[16], float v[4], float d[4]);
void matvec4_neon(float m[16], float v[4], float d[4]);

/*
function:	sinf
arguments:	x radians
return:		the sine function evaluated at x radians.
expression:	r = sin(x)
*/
float sinf_c(float x);
float sinf_neon_hfp(float x);
float sinf_neon_sfp(float x);

/*
function:	cosf
arguments:	x radians
return:		the cosine function evaluated at x radians.
expression:	r = cos(x)
notes:		computed using cos(x) = sin(x + pi/2)
*/
float cosf_c(float x);
float cosf_neon_hfp(float x);
float cosf_neon_sfp(float x);

/*
function:	sincosf
arguments:	x radians, r[2] result array.
return:		both the sine and the cosine evaluated at x radians.
expression:	r = {sin(x), cos(x)}
notes:		faster than evaluating separately.
*/
void sincosf_c(float x, float r[2]);
void sincosf_neon_hfp(float x, float r[2]);
void sincosf_neon_sfp(float x, float r[2]);

/*
function:	sinfv
arguments:	x array of n values in radians, r array receiving n results.
return:		the sine function evaluated at x[i] radians
expression:	r[i] = sin(x[i])
notes:		faster than evaluating individually.
			r and x can be the same memory location.
*/
void sinfv_c(float *x, int n, float *r);
void sinfv_neon(float *x, int n, float *r);

/*
function:	tanf
return:		the tangent evaluated at x radians.
expression:	r = tan(x)
notes:		computed using tan(x) = sin(x) / cos(x)
*/
float tanf_c(float x);
float tanf_neon_hfp(float x);
float tanf_neon_sfp(float x);

/*
function:	atanf
return:		the arctangent evaluated at x.
expression:	r = atan(x)
*/
float atanf_c(float x);
float atanf_neon_hfp(float x);
float atanf_neon_sfp(float x);

/*
function:	atan2f
return:		the arctangent computed from the two arguments y and x.
expression:	r = atan2(y, x)
*/
float atan2f_c(float y, float x);
float atan2f_neon_hfp(float y, float x);
float atan2f_neon_sfp(float y, float x);

/*
function:	asinf
return:		the arcsine evaluated at x.
expression:	r = asin(x)
*/
float asinf_c(float x);
float asinf_neon_hfp(float x);
float asinf_neon_sfp(float x);

/*
function:	acosf
return:		the arccosine evaluated at x.
expression:	r = acos(x)
*/
float acosf_c(float x);
float acosf_neon_hfp(float x);
float acosf_neon_sfp(float x);

/*
function:	sinhf
return:		the hyperbolic sine evaluated at x.
expression:	r = sinh(x)
*/
float sinhf_c(float x);
float sinhf_neon_hfp(float x);
float sinhf_neon_sfp(float x);

/*
function:	coshf
return:		the hyperbolic cosine evaluated at x.
expression:	r = cosh(x)
*/
float coshf_c(float x);
float coshf_neon_hfp(float x);
float coshf_neon_sfp(float x);

/*
function:	tanhf
return:		the hyperbolic tangent evaluated at x.
expression:	r = tanh(x)
*/
float tanhf_c(float x);
float tanhf_neon_hfp(float x);
float tanhf_neon_sfp(float x);

/*
function:	expf
return:		the natural exponential evaluated at x.
expression:	r = e ** x
*/
float expf_c(float x);
float expf_neon_hfp(float x);
float expf_neon_sfp(float x);

/*
function:	logf
return:		the value of the natural logarithm of x.
expression:	r = ln(x)
notes:		assumes x > 0
*/
float logf_c(float x);
float logf_neon_hfp(float x);
float logf_neon_sfp(float x);

/*
function:	log10f
return:		the value of the base 10 logarithm of x.
expression:	r = log10(x)
notes:		assumes x > 0
*/
float log10f_c(float x);
float log10f_neon_hfp(float x);
float log10f_neon_sfp(float x);

/*
function:	powf
return:		x raised to the power of n, x ** n.
expression:	r = x ** n
notes:		computed using e ** (n * ln(x))
*/
float powf_c(float x, float n);
float powf_neon_sfp(float x, float n);
float powf_neon_hfp(float x, float n);

/*
function:	floorf
return:		x rounded down (towards negative infinity) to its nearest
			integer value.
notes:		assumes |x| < 2 ** 31
*/
float floorf_c(float x);
float floorf_neon_sfp(float x);
float floorf_neon_hfp(float x);

/*
function:	ceilf
return:		x rounded up (towards positive infinity) to its nearest
			integer value.
notes:		assumes |x| < 2 ** 31
*/
float ceilf_c(float x);
float ceilf_neon_hfp(float x);
float ceilf_neon_sfp(float x);

/*
function:	fabsf
return:		absolute value of x
notes:		assumes |x| < 2 ** 31
*/
float fabsf_c(float x);
float fabsf_neon_hfp(float x);
float fabsf_neon_sfp(float x);

/*
function:	ldexpf
return:		the value of m multiplied by 2 to the power of e.
expression:	r = m * (2 ** e)
*/
float ldexpf_c(float m, int e);
float ldexpf_neon_hfp(float m, int e);
float ldexpf_neon_sfp(float m, int e);

/*
function:	frexpf
return:		the exponent and mantissa of x
*/
float frexpf_c(float x, int *e);
float frexpf_neon_hfp(float x, int *e);
float frexpf_neon_sfp(float x, int *e);

/*
function:	fmodf
return:		the remainder of x divided by y, x % y
expression:	r = x - floor(x / y) * y;
notes:		assumes that |x / y| < 2 ** 31
*/
float fmodf_c(float x, float y);
float fmodf_neon_hfp(float x, float y);
float fmodf_neon_sfp(float x, float y);

/*
function:	modf
return:		breaks x into the integer (i) and fractional part (return)
notes:		assumes that |x| < 2 ** 31
*/
float modf_c(float x, int *i);
float modf_neon_hfp(float x, int *i);
float modf_neon_sfp(float x, int *i);

/*
function:	sqrtf
return:		(x^0.5)
notes:
*/
float sqrtf_c(float x);
float sqrtf_neon_hfp(float x);
float sqrtf_neon_sfp(float x);


/*
function:	invsqrtf
return:		1.0f / (x^0.5)
notes:
*/
float invsqrtf_c(float x);
float invsqrtf_neon_hfp(float x);
float invsqrtf_neon_sfp(float x);

#endif

/* -------------------------------------------------------------------------------- /math_powf.c: -------------------------------------------------------------------------------- */
/*
The MIT License (MIT)

Copyright (c) 2015 Lachlan Tychsen-Smith (lachlan.ts@gmail.com)

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute,
*/
sublicense, and/or sell 10 | copies of the Software, and to permit persons to whom the Software is 11 | furnished to do so, subject to the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be included in 14 | all copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 19 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 22 | THE SOFTWARE. 23 | */ 24 | 25 | /* 26 | Based on x ^ n = exp(n * log(x)) 27 | 28 | Test func : powf(x, n) 29 | Test Range: (1,1) < (x, n) < (10, 10) 30 | Peak Error: ~0.0010% 31 | RMS Error: ~0.0002% 32 | */ 33 | 34 | #include "math.h" 35 | #include "math_neon.h" 36 | 37 | const float __powf_rng[2] = { 38 | 1.442695041f, 39 | 0.693147180f 40 | }; 41 | 42 | const float __powf_lut[16] = { 43 | -2.295614848256274, //p0 log 44 | -2.470711633419806, //p4 45 | -5.686926051100417, //p2 46 | -0.165253547131978, //p6 47 | +5.175912446351073, //p1 48 | +0.844006986174912, //p5 49 | +4.584458825456749, //p3 50 | +0.014127821926000, //p7 51 | 0.9999999916728642, //p0 exp 52 | 0.04165989275009526, //p4 53 | 0.5000006143673624, //p2 54 | 0.0014122663401803872, //p6 55 | 1.000000059694879, //p1 56 | 0.008336936973260111, //p5 57 | 0.16666570253074878, //p3 58 | 0.00019578093328483123 //p7 59 | }; 60 | 61 | float powf_c(float x, float n) 62 | { 63 | float a, b, c, d, xx; 64 | int m; 65 | 66 | union { 67 | float f; 68 | int i; 69 | } r; 70 | 71 | //extract exponent 72 | r.f = x; 73 | m = (r.i >> 23); 74 | m = m - 127; 75 | r.i = r.i - (m << 23); 76 | 77 | //Taylor Polynomial (Estrins) 78 | xx = r.f * r.f; 
79 | a = (__powf_lut[4] * r.f) + (__powf_lut[0]); 80 | b = (__powf_lut[6] * r.f) + (__powf_lut[2]); 81 | c = (__powf_lut[5] * r.f) + (__powf_lut[1]); 82 | d = (__powf_lut[7] * r.f) + (__powf_lut[3]); 83 | a = a + b * xx; 84 | c = c + d * xx; 85 | xx = xx * xx; 86 | r.f = a + c * xx; 87 | 88 | //add exponent 89 | r.f = r.f + ((float) m) * __powf_rng[1]; 90 | 91 | r.f = r.f * n; 92 | 93 | 94 | //Range Reduction: 95 | m = (int) (r.f * __powf_rng[0]); 96 | r.f = r.f - ((float) m) * __powf_rng[1]; 97 | 98 | //Taylor Polynomial (Estrins) 99 | a = (__powf_lut[12] * r.f) + (__powf_lut[8]); 100 | b = (__powf_lut[14] * r.f) + (__powf_lut[10]); 101 | c = (__powf_lut[13] * r.f) + (__powf_lut[9]); 102 | d = (__powf_lut[15] * r.f) + (__powf_lut[11]); 103 | xx = r.f * r.f; 104 | a = a + b * xx; 105 | c = c + d * xx; 106 | xx = xx* xx; 107 | r.f = a + c * xx; 108 | 109 | //multiply by 2 ^ m 110 | m = m << 23; 111 | r.i = r.i + m; 112 | 113 | return r.f; 114 | } 115 | 116 | float powf_neon_hfp(float x, float n) 117 | { 118 | #ifdef __MATH_NEON 119 | asm volatile ( 120 | 121 | "vdup.f32 d16, d0[1] \n\t" //d16 = {y,y}; 122 | "vdup.f32 d0, d0[0] \n\t" //d0 = {x,x}; 123 | 124 | //extract exponent 125 | "vmov.i32 d2, #127 \n\t" //d2 = 127; 126 | "vshr.u32 d6, d0, #23 \n\t" //d6 = d0 >> 23; 127 | "vsub.i32 d6, d6, d2 \n\t" //d6 = d6 - d2; 128 | "vshl.u32 d1, d6, #23 \n\t" //d1 = d6 << 23; 129 | "vsub.i32 d0, d0, d1 \n\t" //d0 = d0 + d1; 130 | 131 | //polynomial: 132 | "vmul.f32 d1, d0, d0 \n\t" //d1 = d0*d0 = {x^2, x^2} 133 | "vld1.32 {d2, d3, d4, d5}, [%1]! 
\n\t" //q1 = {p0, p4, p2, p6}, q2 = {p1, p5, p3, p7} ; 134 | "vmla.f32 q1, q2, d0[0] \n\t" //q1 = q1 + q2 * d0[0] 135 | "vmla.f32 d2, d3, d1[0] \n\t" //d2 = d2 + d3 * d1[0] 136 | "vmul.f32 d1, d1, d1 \n\t" //d1 = d1 * d1 = {x^4, x^4} 137 | "vmla.f32 d2, d1, d2[1] \n\t" //d2 = d2 + d1 * d2[1] 138 | 139 | //add exponent 140 | "vld1.32 d7, [%0] \n\t" //d7 = {invrange, range} 141 | "vcvt.f32.s32 d6, d6 \n\t" //d6 = (float) d6 142 | "vmla.f32 d2, d6, d7[1] \n\t" //d2 = d2 + d6 * d7[1] 143 | 144 | "vdup.f32 d0, d2[0] \n\t" //d0 = d2[0] 145 | "vmul.f32 d0, d0, d16 \n\t" //d0 = d0 * d16 146 | 147 | //Range Reduction: 148 | "vmul.f32 d6, d0, d7[0] \n\t" //d6 = d0 * d7[0] 149 | "vcvt.u32.f32 d6, d6 \n\t" //d6 = (int) d6 150 | "vcvt.f32.u32 d1, d6 \n\t" //d1 = (float) d6 151 | "vmls.f32 d0, d1, d7[1] \n\t" //d0 = d0 - d1 * d7[1] 152 | 153 | //polynomial: 154 | "vmul.f32 d1, d0, d0 \n\t" //d1 = d0*d0 = {x^2, x^2} 155 | "vld1.32 {d2, d3, d4, d5}, [%1] \n\t" //q1 = {p0, p4, p2, p6}, q2 = {p1, p5, p3, p7} ; 156 | "vmla.f32 q1, q2, d0[0] \n\t" //q1 = q1 + q2 * d0[0] 157 | "vmla.f32 d2, d3, d1[0] \n\t" //d2 = d2 + d3 * d1[0] 158 | "vmul.f32 d1, d1, d1 \n\t" //d1 = d1 * d1 = {x^4, x^4} 159 | "vmla.f32 d2, d1, d2[1] \n\t" //d2 = d2 + d1 * d2[1] 160 | 161 | //multiply by 2 ^ m 162 | "vshl.i32 d6, d6, #23 \n\t" //d6 = d6 << 23 163 | "vadd.i32 d0, d2, d6 \n\t" //d0 = d2 + d6 164 | 165 | 166 | :: "r"(__powf_rng), "r"(__powf_lut) 167 | : "d0", "d1", "d2","d3", "d4", "d5", "d6", "d7" 168 | ); 169 | #endif 170 | } 171 | 172 | float powf_neon_sfp(float x, float n) 173 | { 174 | #ifdef __MATH_NEON 175 | asm volatile ("vmov.f32 s0, r0 \n\t"); 176 | asm volatile ("vmov.f32 s1, r1 \n\t"); 177 | powf_neon_hfp(x, n); 178 | asm volatile ("vmov.f32 r0, s0 \n\t"); 179 | #else 180 | return powf_c(x, n); 181 | #endif 182 | }; 183 | -------------------------------------------------------------------------------- /math_runfast.c: 
/* -------------------------------------------------------------------------------- */
/*
The MIT License (MIT)

Copyright (c) 2015 Lachlan Tychsen-Smith (lachlan.ts@gmail.com)

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/


/*
function:	enable_runfast
notes:		read-modify-write of the FPSCR: the current value is ANDed with
			0x04086060 and then ORed with 0x03000000 (bits 24 and 25, which
			the ARM FPSCR layout names FZ and DN) before being written back.
			Does nothing when __arm__ is not defined.
*/
void
enable_runfast(void)
{
#ifdef __arm__
	static const unsigned int x = 0x04086060;
	static const unsigned int y = 0x03000000;
	int r;
	asm volatile (
		"fmrx %0, fpscr \n\t" //r0 = FPSCR
		"and %0, %0, %1 \n\t" //r0 = r0 & 0x04086060
		"orr %0, %0, %2 \n\t" //r0 = r0 | 0x03000000
		"fmxr fpscr, %0 \n\t" //FPSCR = r0
		//"=&r": %0 is written by the first instruction, before %1 and %2 are
		//read, so it must not share a register with either input.
		: "=&r"(r)
		: "r"(x), "r"(y)
	);
#endif
}

/* -------------------------------------------------------------------------------- /math_sincosf.c: -------------------------------------------------------------------------------- */
/*
The MIT License (MIT)

Copyright (c) 2015 Lachlan Tychsen-Smith (lachlan.ts@gmail.com)

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
23 | */ 24 | 25 | #include "math.h" 26 | #include "math_neon.h" 27 | 28 | const float __sincosf_rng[2] = { 29 | 2.0 / M_PI, 30 | M_PI / 2.0 31 | }; 32 | 33 | const float __sincosf_lut[8] = { 34 | -0.00018365f, //p7 35 | -0.00018365f, //p7 36 | +0.00830636f, //p5 37 | +0.00830636f, //p5 38 | -0.16664831f, //p3 39 | -0.16664831f, //p3 40 | +0.99999661f, //p1 41 | +0.99999661f, //p1 42 | }; 43 | 44 | void sincosf_c( float x, float r[2]) 45 | { 46 | union { 47 | float f; 48 | int i; 49 | } ax, bx; 50 | 51 | float y; 52 | float a, b, c, d, xx, yy; 53 | int m, n, o, p; 54 | 55 | y = x + __sincosf_rng[1]; 56 | ax.f = fabsf(x); 57 | bx.f = fabsf(y); 58 | 59 | //Range Reduction: 60 | m = (int) (ax.f * __sincosf_rng[0]); 61 | o = (int) (bx.f * __sincosf_rng[0]); 62 | ax.f = ax.f - (((float)m) * __sincosf_rng[1]); 63 | bx.f = bx.f - (((float)o) * __sincosf_rng[1]); 64 | 65 | //Test Quadrant 66 | n = m & 1; 67 | p = o & 1; 68 | ax.f = ax.f - n * __sincosf_rng[1]; 69 | bx.f = bx.f - p * __sincosf_rng[1]; 70 | m = m >> 1; 71 | o = o >> 1; 72 | n = n ^ m; 73 | p = p ^ o; 74 | m = (x < 0.0); 75 | o = (y < 0.0); 76 | n = n ^ m; 77 | p = p ^ o; 78 | n = n << 31; 79 | p = p << 31; 80 | ax.i = ax.i ^ n; 81 | bx.i = bx.i ^ p; 82 | 83 | //Taylor Polynomial 84 | xx = ax.f * ax.f; 85 | yy = bx.f * bx.f; 86 | r[0] = __sincosf_lut[0]; 87 | r[1] = __sincosf_lut[1]; 88 | r[0] = r[0] * xx + __sincosf_lut[2]; 89 | r[1] = r[1] * yy + __sincosf_lut[3]; 90 | r[0] = r[0] * xx + __sincosf_lut[4]; 91 | r[1] = r[1] * yy + __sincosf_lut[5]; 92 | r[0] = r[0] * xx + __sincosf_lut[6]; 93 | r[1] = r[1] * yy + __sincosf_lut[7]; 94 | r[0] = r[0] * ax.f; 95 | r[1] = r[1] * bx.f; 96 | 97 | } 98 | 99 | void sincosf_neon_hfp(float x, float r[2]) 100 | { 101 | //HACK: Assumes for softfp that r1 = x, and for hardfp that s0 = x. 
102 | #ifdef __MATH_NEON 103 | asm volatile ( 104 | //{x, y} = {x, x + pi/2} 105 | "vdup.f32 d1, d0[0] \n\t" //d1 = {x, x} 106 | "vld1.32 d3, [%1] \n\t" //d3 = {invrange, range} 107 | "vadd.f32 d0, d1, d3 \n\t" //d0 = d1 + d3 108 | "vmov.f32 s0, s2 \n\t" //d0[0] = d1[0] 109 | "vabs.f32 d1, d0 \n\t" //d1 = {abs(x), abs(y)} 110 | 111 | //Range Reduction: 112 | "vmul.f32 d2, d1, d3[0] \n\t" //d2 = d1 * d3[0] 113 | "vcvt.u32.f32 d2, d2 \n\t" //d2 = (int) d2 114 | "vcvt.f32.u32 d4, d2 \n\t" //d4 = (float) d2 115 | "vmls.f32 d1, d4, d3[1] \n\t" //d1 = d1 - d4 * d3[1] 116 | 117 | //Checking Quadrant: 118 | //ax = ax - (k&1) * M_PI_2 119 | "vmov.i32 d4, #1 \n\t" //d4 = 1 120 | "vand.i32 d4, d4, d2 \n\t" //d4 = d4 & d2 121 | "vcvt.f32.u32 d5, d4 \n\t" //d5 = (float) d4 122 | "vmls.f32 d1, d5, d3[1] \n\t" //d1 = d1 - d5 * d3[1] 123 | 124 | //ax = ax ^ ((k & 1) ^ (k >> 1) ^ (x < 0) << 31) 125 | "vshr.u32 d3, d2, #1 \n\t" //d3 = d2 >> 1 126 | "veor.i32 d4, d4, d3 \n\t" //d4 = d4 ^ d3 127 | "vclt.f32 d3, d0, #0 \n\t" //d3 = (d0 < 0.0) 128 | "veor.i32 d4, d4, d3 \n\t" //d4 = d4 ^ d3 129 | "vshl.i32 d4, d4, #31 \n\t" //d4 = d4 << 31 130 | "veor.i32 d0, d1, d4 \n\t" //d0 = d1 ^ d4 131 | 132 | //polynomial: 133 | "vldm %2!, {d2, d3} \n\t" //d2 = {p7, p7}, d3 = {p5, p5}, r3 += 4; 134 | "vmul.f32 d1, d0, d0 \n\t" //d1 = d0 * d0 = {x^2, y^2} 135 | "vldm %2!, {d4} \n\t" //d4 = {p3, p3}, r3 += 2; 136 | "vmla.f32 d3, d2, d1 \n\t" //d3 = d3 + d2 * d1; 137 | "vldm %2!, {d5} \n\t" //d5 = {p1, p1}, r3 += 2; 138 | "vmla.f32 d4, d3, d1 \n\t" //d4 = d4 + d3 * d1; 139 | "vmla.f32 d5, d4, d1 \n\t" //d5 = d5 + d4 * d1; 140 | "vmul.f32 d5, d5, d0 \n\t" //d5 = d5 * d0; 141 | 142 | "vstm.f32 %0, {d5} \n\t" //r[0] = d5[0], r[1]=d5[1]; 143 | 144 | : "+r"(r) 145 | : "r"(__sincosf_rng), "r"(__sincosf_lut) 146 | : "d0", "d1", "d2", "d3", "d4", "d5" 147 | ); 148 | #else 149 | sincosf_c(x, r); 150 | #endif 151 | } 152 | 153 | void sincosf_neon_sfp(float x, float r[2]) 154 | { 155 | #ifdef __MATH_NEON 156 | 
asm volatile ("vdup.f32 d0, r0 \n\t"); 157 | sincosf_neon_hfp(x, r); 158 | asm volatile ("vmov.f32 r0, s0 \n\t"); 159 | #else 160 | sincosf_c(x, r); 161 | #endif 162 | }; 163 | 164 | -------------------------------------------------------------------------------- /math_sinf.c: -------------------------------------------------------------------------------- 1 | /* 2 | The MIT License (MIT) 3 | 4 | Copyright (c) 2015 Lachlan Tychsen-Smith (lachlan.ts@gmail.com) 5 | 6 | Permission is hereby granted, free of charge, to any person obtaining a copy 7 | of this software and associated documentation files (the "Software"), to deal 8 | in the Software without restriction, including without limitation the rights 9 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 | copies of the Software, and to permit persons to whom the Software is 11 | furnished to do so, subject to the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be included in 14 | all copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 19 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 22 | THE SOFTWARE. 
23 | */ 24 | 25 | #include 26 | #include "math_neon.h" 27 | 28 | static const float __sinf_rng[2] = { 29 | 2.0 / M_PI, 30 | M_PI / 2.0 31 | } ALIGN(16); 32 | 33 | static const float __sinf_lut[4] = { 34 | -0.00018365f, //p7 35 | -0.16664831f, //p3 36 | +0.00830636f, //p5 37 | +0.99999661f, //p1 38 | } ALIGN(16); 39 | 40 | float sinf_c(float x) 41 | { 42 | union { 43 | float f; 44 | int i; 45 | } ax; 46 | 47 | float r, a, b, xx; 48 | int m, n; 49 | 50 | ax.f = fabsf(x); 51 | 52 | //Range Reduction: 53 | m = (int) (ax.f * __sinf_rng[0]); 54 | ax.f = ax.f - (((float)m) * __sinf_rng[1]); 55 | 56 | //Test Quadrant 57 | n = m & 1; 58 | ax.f = ax.f - n * __sinf_rng[1]; 59 | m = m >> 1; 60 | n = n ^ m; 61 | m = (x < 0.0); 62 | n = n ^ m; 63 | n = n << 31; 64 | ax.i = ax.i ^ n; 65 | 66 | //Taylor Polynomial (Estrins) 67 | xx = ax.f * ax.f; 68 | a = (__sinf_lut[0] * ax.f) * xx + (__sinf_lut[2] * ax.f); 69 | b = (__sinf_lut[1] * ax.f) * xx + (__sinf_lut[3] * ax.f); 70 | xx = xx * xx; 71 | r = b + a * xx; 72 | 73 | return r; 74 | } 75 | 76 | float sinf_neon_hfp(float x) 77 | { 78 | #ifdef __MATH_NEON 79 | asm volatile ( 80 | 81 | "vld1.32 d3, [%0] \n\t" //d3 = {invrange, range} 82 | "vdup.f32 d0, d0[0] \n\t" //d0 = {x, x} 83 | "vabs.f32 d1, d0 \n\t" //d1 = {ax, ax} 84 | 85 | "vmul.f32 d2, d1, d3[0] \n\t" //d2 = d1 * d3[0] 86 | "vcvt.u32.f32 d2, d2 \n\t" //d2 = (int) d2 87 | "vmov.i32 d5, #1 \n\t" //d5 = 1 88 | "vcvt.f32.u32 d4, d2 \n\t" //d4 = (float) d2 89 | "vshr.u32 d7, d2, #1 \n\t" //d7 = d2 >> 1 90 | "vmls.f32 d1, d4, d3[1] \n\t" //d1 = d1 - d4 * d3[1] 91 | 92 | "vand.i32 d5, d2, d5 \n\t" //d5 = d2 & d5 93 | "vclt.f32 d18, d0, #0 \n\t" //d18 = (d0 < 0.0) 94 | "vcvt.f32.u32 d6, d5 \n\t" //d6 = (float) d5 95 | "vmls.f32 d1, d6, d3[1] \n\t" //d1 = d1 - d6 * d3[1] 96 | "veor.i32 d5, d5, d7 \n\t" //d5 = d5 ^ d7 97 | "vmul.f32 d2, d1, d1 \n\t" //d2 = d1*d1 = {x^2, x^2} 98 | 99 | "vld1.32 {d16, d17}, [%1] \n\t" //q8 = {p7, p3, p5, p1} 100 | "veor.i32 d5, d5, d18 \n\t" //d5 = d5 
^ d18 101 | "vshl.i32 d5, d5, #31 \n\t" //d5 = d5 << 31 102 | "veor.i32 d1, d1, d5 \n\t" //d1 = d1 ^ d5 103 | 104 | "vmul.f32 d3, d2, d2 \n\t" //d3 = d2*d2 = {x^4, x^4} 105 | "vmul.f32 q0, q8, d1[0] \n\t" //q0 = q8 * d1[0] = {p7x, p3x, p5x, p1x} 106 | "vmla.f32 d1, d0, d2[0] \n\t" //d1 = d1 + d0*d2 = {p5x + p7x^3, p1x + p3x^3} 107 | "vmla.f32 d1, d3, d1[0] \n\t" //d1 = d1 + d3*d0 = {...., p1x + p3x^3 + p5x^5 + p7x^7} 108 | 109 | "vmov.f32 s0, s3 \n\t" //s0 = s3 110 | : 111 | : "r"(__sinf_rng), "r"(__sinf_lut) 112 | : "q0", "q1", "q2", "q3", "q8", "q9" 113 | ); 114 | #endif 115 | } 116 | 117 | float sinf_neon_sfp(float x) 118 | { 119 | #ifdef __MATH_NEON 120 | asm volatile ("vdup.f32 d0, r0 \n\t"); 121 | sinf_neon_hfp(x); 122 | asm volatile ("vmov.f32 r0, s0 \n\t"); 123 | #else 124 | return sinf_c(x); 125 | #endif 126 | 127 | }; 128 | 129 | -------------------------------------------------------------------------------- /math_sinfv.c: -------------------------------------------------------------------------------- 1 | /* 2 | The MIT License (MIT) 3 | 4 | Copyright (c) 2015 Lachlan Tychsen-Smith (lachlan.ts@gmail.com) 5 | 6 | Permission is hereby granted, free of charge, to any person obtaining a copy 7 | of this software and associated documentation files (the "Software"), to deal 8 | in the Software without restriction, including without limitation the rights 9 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 | copies of the Software, and to permit persons to whom the Software is 11 | furnished to do so, subject to the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be included in 14 | all copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
/*
 * Vector sine approximation: fills r[0..n-1] from x[0..n-1].
 *
 * The __MATH_NEON branch of this function was an empty asm statement that
 * only named x and n as inputs and never wrote anything to r, so NEON builds
 * returned whatever r happened to contain. No NEON kernel exists for this
 * routine, so both configurations now use the C implementation.
 *
 * x: input values
 * n: number of elements in x and r
 * r: output buffer
 */
void sinfv_neon(float *x, int n, float *r)
{
	sinfv_c(x, n, r);
}
/*
 * Hyperbolic sine built from two calls to expf_c():
 *   sinh(x) = (e^x - e^-x) / 2
 */
float sinhf_c(float x)
{
	float e_pos, e_neg, diff;

	e_pos = expf_c(x);
	e_neg = expf_c(-x);
	diff = e_pos - e_neg;
	return diff * 0.5f;
}
/*
 * Soft-float-ABI entry point for the hyperbolic sine approximation.
 *
 * Without __MATH_NEON this forwards to sinhf_c().
 * With __MATH_NEON the argument is moved from r0 (presumably where the
 * soft-float calling convention places x -- confirm against the target ABI)
 * into s0, sinhf_neon_hfp() is called, and the result in s0 is moved back
 * into r0. That path has no C-level return statement; the value travels
 * in r0.
 */
float sinhf_neon_sfp(float x)
{
#ifndef __MATH_NEON
	return sinhf_c(x);
#else
	//s0 = r0 (argument)
	asm volatile ("vmov.f32 s0, r0 \n\t");
	sinhf_neon_hfp(x);
	//r0 = s0 (result)
	asm volatile ("vmov.f32 r0, s0 \n\t");
#endif
}
/*
 * Approximate square root, computed as 1 / (1 / sqrt(x)).
 *
 * Stage 1 estimates 1/sqrt(x) from the bit pattern of x and refines it with
 * two Newton steps (the C counterpart of VRSQRTE + VRSQRTS).
 * Stage 2 takes the reciprocal of that value: the exponent is rebased so the
 * mantissa can be fed to a linear first guess, the exponent adjustment is
 * applied again, and two Newton steps refine the result.
 */
float sqrtf_c(float x)
{
	union {
		float f;
		int i;
	} est;
	float inv_root, corr, t;
	int exp_adj, step;

	//stage 1: fast invsqrt approx
	est.f = x;
	est.i = 0x5F3759DF - (est.i >> 1);		//VRSQRTE
	for (step = 0; step < 2; step++) {
		t = x * est.f;
		corr = (3.0f - t * est.f) * 0.5;	//VRSQRTS
		est.f = est.f * corr;
	}

	//stage 2: fast inverse approx of the value from stage 1
	inv_root = est.f;
	exp_adj = 0x3F800000 - (est.i & 0x7F800000);
	est.i = est.i + exp_adj;
	est.f = 1.41176471f - 0.47058824f * est.f;
	est.i = est.i + exp_adj;
	for (step = 0; step < 2; step++) {
		corr = 2.0 - est.f * inv_root;
		est.f = est.f * corr;
	}

	return est.f;
}
/*
 * Soft-float-ABI entry point for the square root approximation.
 *
 * Without __MATH_NEON this forwards to sqrtf_c().
 * With __MATH_NEON the argument is moved from r0 (presumably where the
 * soft-float calling convention places x -- confirm against the target ABI)
 * into s0, sqrtf_neon_hfp() is called, and the result in s0 is moved back
 * into r0. That path has no C-level return statement; the value travels
 * in r0.
 */
float sqrtf_neon_sfp(float x)
{
#ifndef __MATH_NEON
	return sqrtf_c(x);
#else
	//s0 = r0 (argument)
	asm volatile ("vmov.f32 s0, r0 \n\t");
	sqrtf_neon_hfp(x);
	//r0 = s0 (result)
	asm volatile ("vmov.f32 r0, s0 \n\t");
#endif
}
/*
 * Approximate square root of one value, computed as 1 / (1 / sqrt(x)).
 * Same sequence of operations as sqrtf_c(): bit-trick inverse square root
 * with two Newton steps, followed by a reciprocal estimate with two Newton
 * steps.
 */
static float sqrtfv_approx(float x)
{
	union {
		float f;
		int i;
	} a;
	float b, c;
	int m;

	//fast invsqrt approx
	a.f = x;
	a.i = 0x5F3759DF - (a.i >> 1);		//VRSQRTE
	c = x * a.f;
	b = (3.0f - c * a.f) * 0.5;		//VRSQRTS
	a.f = a.f * b;
	c = x * a.f;
	b = (3.0f - c * a.f) * 0.5;		//VRSQRTS
	a.f = a.f * b;

	//fast inverse approx: c keeps the value being inverted
	c = a.f;
	m = 0x3F800000 - (a.i & 0x7F800000);
	a.i = a.i + m;
	a.f = 1.41176471f - 0.47058824f * a.f;
	a.i = a.i + m;
	b = 2.0 - a.f * c;
	a.f = a.f * b;
	b = 2.0 - a.f * c;
	a.f = a.f * b;

	return a.f;
}

/*
 * Vector square root approximation: r[i] ~= sqrt(x[i]) for i in [0, n).
 *
 * The previous hand-unrolled two-lane loop saved the inverse-sqrt values with
 * "c0 = a0.f; c0 = a1.f;", so lane 0 refined against lane 1's value and
 * lane 1 refined against a stale intermediate; every paired result was wrong.
 * Each element now goes through the same scalar helper, so odd and even
 * positions produce identical results for identical inputs.
 *
 * x: input values
 * n: number of elements; n <= 0 writes nothing
 * r: output buffer (may be the same array as x)
 */
void sqrtfv_c(float *x, int n, float *r)
{
	int i;

	for (i = 0; i < n; i++) {
		r[i] = sqrtfv_approx(x[i]);
	}
}
\n\t" //s0 = *x++ 110 | "mov ip, lr \n\t" //ip = lr 111 | //"bl sqrtf_neon_hfp \n\t" //sqrtf_neon 112 | "mov lr, ip \n\t" //lr = ip 113 | "vst1.32 d0[0], [r2]! \n\t" //*r++ = r0 114 | "subs r1, r1, #1 \n\t" //r1 = r1 - 1; 115 | "bxeq lr \n\t" // 116 | 117 | "1: \n\t" // 118 | 119 | "vld1.32 d0, [r0]! \n\t" //d0 = (*x[0], *x[1]), x+=2; 120 | 121 | //fast invsqrt approx 122 | "vmov.f32 d1, d0 \n\t" //d1 = d0 123 | "vrsqrte.f32 d0, d0 \n\t" //d0 = ~ 1.0 / sqrt(d0) 124 | "vmul.f32 d2, d0, d1 \n\t" //d3 = d0 * d2 125 | "vrsqrts.f32 d3, d2, d0 \n\t" //d4 = (3 - d0 * d3) / 2 126 | "vmul.f32 d0, d0, d3 \n\t" //d0 = d0 * d4 127 | "vmul.f32 d2, d0, d1 \n\t" //d3 = d0 * d2 128 | "vrsqrts.f32 d3, d2, d0 \n\t" //d4 = (3 - d0 * d3) / 2 129 | "vmul.f32 d0, d0, d3 \n\t" //d0 = d0 * d4 130 | 131 | //fast reciporical approximation 132 | "vrecpe.f32 d1, d0 \n\t" //d1 = ~ 1 / d0; 133 | "vrecps.f32 d2, d1, d0 \n\t" //d2 = 2.0 - d1 * d0; 134 | "vmul.f32 d1, d1, d2 \n\t" //d1 = d1 * d2; 135 | "vrecps.f32 d2, d1, d0 \n\t" //d2 = 2.0 - d1 * d0; 136 | "vmul.f32 d0, d1, d2 \n\t" //d0 = d1 * d2; 137 | 138 | "vst1.64 d0, [r2]! 
\n\t" //*r++ = d0; 139 | "subs r1, r1, #2 \n\t" //n = n - 2; update flags 140 | "bgt 1b \n\t" // 141 | 142 | ::: "d0", "d1", "d2", "d3" 143 | ); 144 | #else 145 | sqrtfv_c(x, n, r); 146 | #endif 147 | } 148 | -------------------------------------------------------------------------------- /math_tanf.c: -------------------------------------------------------------------------------- 1 | /* 2 | The MIT License (MIT) 3 | 4 | Copyright (c) 2015 Lachlan Tychsen-Smith (lachlan.ts@gmail.com) 5 | 6 | Permission is hereby granted, free of charge, to any person obtaining a copy 7 | of this software and associated documentation files (the "Software"), to deal 8 | in the Software without restriction, including without limitation the rights 9 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 | copies of the Software, and to permit persons to whom the Software is 11 | furnished to do so, subject to the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be included in 14 | all copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 19 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 22 | THE SOFTWARE. 
23 | */ 24 | 25 | #include "math.h" 26 | #include "math_neon.h" 27 | 28 | const float __tanf_rng[2] = { 29 | 2.0 / M_PI, 30 | M_PI / 2.0 31 | }; 32 | 33 | const float __tanf_lut[4] = { 34 | -0.00018365f, //p7 35 | -0.16664831f, //p3 36 | +0.00830636f, //p5 37 | +0.99999661f, //p1 38 | }; 39 | 40 | float tanf_c(float x){ 41 | 42 | union { 43 | float f; 44 | int i; 45 | } ax, c; 46 | 47 | float r, a, b, xx, cc, cx; 48 | int m; 49 | 50 | ax.f = fabsf(x); 51 | 52 | //Range Reduction: 53 | m = (int) (ax.f * __tanf_rng[0]); 54 | ax.f = ax.f - (((float)m) * __tanf_rng[1]); 55 | 56 | //Test Quadrant 57 | ax.f = ax.f - (m & 1) * __tanf_rng[1]; 58 | ax.i = ax.i ^ ((*(int*)&x) & 0x80000000); 59 | 60 | //Taylor Polynomial (Estrins) 61 | xx = ax.f * ax.f; 62 | a = (__tanf_lut[0] * ax.f) * xx + (__tanf_lut[2] * ax.f); 63 | b = (__tanf_lut[1] * ax.f) * xx + (__tanf_lut[3] * ax.f); 64 | xx = xx * xx; 65 | r = b + a * xx; 66 | 67 | //cosine 68 | c.f = 1.0 - r * r; 69 | 70 | //fast invsqrt approximation (2x newton iterations) 71 | cc = c.f; 72 | c.i = 0x5F3759DF - (c.i >> 1); //VRSQRTE 73 | cx = cc * c.f; 74 | a = (3.0f - cx * c.f) / 2; //VRSQRTS 75 | c.f = c.f * a; 76 | cx = cc * c.f; 77 | a = (3.0f - cx * c.f) / 2; 78 | c.f = c.f * a; 79 | 80 | r = r * c.f; 81 | 82 | return r; 83 | } 84 | 85 | 86 | float tanf_neon_hfp(float x) 87 | { 88 | #ifdef __MATH_NEON 89 | asm volatile ( 90 | 91 | "vdup.f32 d0, d0[0] \n\t" //d0 = {x, x} 92 | "vabs.f32 d1, d0 \n\t" //d1 = {ax, ax} 93 | 94 | //Range Reduction: 95 | "vld1.32 d3, [%0] \n\t" //d3 = {invrange, range} 96 | "vmul.f32 d2, d1, d3[0] \n\t" //d2 = d1 * d3[0] 97 | "vcvt.u32.f32 d2, d2 \n\t" //d2 = (int) d2 98 | "vcvt.f32.u32 d4, d2 \n\t" //d4 = (float) d2 99 | "vmls.f32 d1, d4, d3[1] \n\t" //d1 = d1 - d4 * d3[1] 100 | 101 | //Checking Quadrant: 102 | //ax = ax - (k&1) * M_PI_2 103 | "vmov.i32 d4, #1 \n\t" //d4 = 1 104 | "vand.i32 d2, d2, d4 \n\t" //d2 = d2 & d4 105 | "vcvt.f32.u32 d2, d2 \n\t" //d2 = (float) d2 106 | "vmls.f32 d1, d2, 
/*
 * Soft-float-ABI entry point for the tangent approximation.
 *
 * Without __MATH_NEON this forwards to tanf_c().
 * With __MATH_NEON the argument is read from r0 (presumably where the
 * soft-float calling convention places x -- confirm against the target ABI),
 * broadcast into both lanes of d0, handed to tanf_neon_hfp(), and the result
 * left in s0 is copied back into r0. That path has no C-level return
 * statement; the value travels in r0.
 */
float tanf_neon_sfp(float x)
{
#ifndef __MATH_NEON
	return tanf_c(x);
#else
	//d0 = {r0, r0}
	asm volatile ("vdup.f32 d0, r0 \n\t");
	tanf_neon_hfp(x);
	//r0 = s0 (result)
	asm volatile ("vmov.f32 r0, s0 \n\t");
#endif
}
/*
 * Hyperbolic tangent via tanh(x) = (e^2x - 1) / (e^2x + 1).
 *
 * e^2x comes from expf_c(). The division is replaced by a reciprocal
 * approximation of the denominator: the exponent is rebased so the mantissa
 * can be fed to a linear first guess, the exponent adjustment is applied
 * again, and two Newton steps refine the estimate before it is multiplied by
 * the numerator.
 */
float tanhf_c(float x)
{
	union {
		float f;
		int i;
	} rcp;
	float e2x, denom, numer, corr;
	int exp_adj, pass;

	e2x = expf_c(2.0f * x);
	denom = e2x + 1.0f;

	//reciprocal approx. of denom
	rcp.f = denom;
	exp_adj = 0x3F800000 - (rcp.i & 0x7F800000);
	rcp.i = rcp.i + exp_adj;
	rcp.f = 1.41176471f - 0.47058824f * rcp.f;
	rcp.i = rcp.i + exp_adj;
	for (pass = 0; pass < 2; pass++) {
		corr = 2.0 - rcp.f * denom;
		rcp.f = rcp.f * corr;
	}

	numer = e2x - 1.0;
	rcp.f = rcp.f * numer;
	return rcp.f;
}
/*
 * Scales the 2-component vector v by an approximation of 1/|v| and stores the
 * result in d. The inverse length comes from a bit-trick estimate of
 * 1/sqrt(|v|^2) refined by two Newton steps.
 */
void
normalize2_c(float v[2], float d[2])
{
	union {
		float f;
		int i;
	} inv;
	float len_sq, t, corr;
	int step;

	//squared length
	len_sq = v[0]*v[0];
	len_sq += v[1]*v[1];

	//fast invsqrt approx
	inv.f = len_sq;
	inv.i = 0x5F3759DF - (inv.i >> 1);		//VRSQRTE
	for (step = 0; step < 2; step++) {
		t = len_sq * inv.f;
		corr = (3.0f - t * inv.f) * 0.5;	//VRSQRTS
		inv.f = inv.f * corr;
	}

	d[0] = v[0]*inv.f;
	d[1] = v[1]*inv.f;
}
96 | "vmul.f32 d0, d4, d4 \n\t" //d0 = d2*d2 97 | "vpadd.f32 d0, d0 \n\t" //d0 = d[0] + d[1] 98 | 99 | "vmov.f32 d1, d0 \n\t" //d1 = d0 100 | "vrsqrte.f32 d0, d0 \n\t" //d0 = ~ 1.0 / sqrt(d0) 101 | "vmul.f32 d2, d0, d1 \n\t" //d2 = d0 * d1 102 | "vrsqrts.f32 d3, d2, d0 \n\t" //d3 = (3 - d0 * d2) / 2 103 | "vmul.f32 d0, d0, d3 \n\t" //d0 = d0 * d3 104 | "vmul.f32 d2, d0, d1 \n\t" //d2 = d0 * d1 105 | "vrsqrts.f32 d3, d2, d0 \n\t" //d3 = (3 - d0 * d2) / 2 106 | "vmul.f32 d0, d0, d3 \n\t" //d0 = d0 * d3 107 | 108 | "vmul.f32 d4, d4, d0[0] \n\t" //d4 = d4*d0[0] 109 | "vst1.32 d4, [%1] \n\t" // 110 | 111 | :: "r"(v), "r"(d) 112 | : "d0", "d1", "d2", "d3", "d4", "memory" 113 | ); 114 | #else 115 | normalize2_c(v, d); 116 | #endif 117 | } 118 | 119 | -------------------------------------------------------------------------------- /math_vec3.c: -------------------------------------------------------------------------------- 1 | /* 2 | The MIT License (MIT) 3 | 4 | Copyright (c) 2015 Lachlan Tychsen-Smith (lachlan.ts@gmail.com) 5 | 6 | Permission is hereby granted, free of charge, to any person obtaining a copy 7 | of this software and associated documentation files (the "Software"), to deal 8 | in the Software without restriction, including without limitation the rights 9 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 | copies of the Software, and to permit persons to whom the Software is 11 | furnished to do so, subject to the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be included in 14 | all copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
/*
 * vec3 cross product: d = v0 x v1.
 *
 * All three components are computed before anything is stored, so d may be
 * the same array as v0 or v1. Writing d[0] first, as the earlier version did,
 * overwrote an input component that d[1] and d[2] still needed whenever the
 * output aliased an input.
 */
void
cross3_c(float v0[3], float v1[3], float d[3])
{
	float cx, cy, cz;

	cx = v0[1]*v1[2] - v0[2]*v1[1];
	cy = v0[2]*v1[0] - v0[0]*v1[2];
	cz = v0[0]*v1[1] - v0[1]*v1[0];

	d[0] = cx;
	d[1] = cy;
	d[2] = cz;
}
/*
 * vec3 cross product, d = v0 x v1.
 *
 * Without __MATH_NEON this forwards to cross3_c().
 * With __MATH_NEON the inline asm below loads both vectors in rotated order,
 * forms {d[0], d[1]} in d4 and d[2] in the low lane of d5 (s10), and stores
 * them through the d pointer. All three pointers are read-write operands
 * because the asm advances them while loading and storing.
 */
void cross3_neon(float v0[3], float v1[3], float d[3])
{
#ifdef __MATH_NEON
	asm volatile (
	"flds s3, [%0] \n\t" //d1[1]={x0}
	"add %0, %0, #4 \n\t" //v0 now points at y0
	"vld1.32 {d0}, [%0] \n\t" //d0={y0,z0}
	"vmov.f32 s2, s1 \n\t" //d1[0]={z0}, so d1={z0,x0}

	"flds s5, [%1] \n\t" //d2[1]={x1}
	"add %1, %1, #4 \n\t" //v1 now points at y1
	"vld1.32 {d3}, [%1] \n\t" //d3={y1,z1}
	"vmov.f32 s4, s7 \n\t" //d2[0]=d3[1], so d2={z1,x1}

	"vmul.f32 d4, d0, d2 \n\t" //d4=d0*d2 = {y0*z1, z0*x1}
	"vmls.f32 d4, d1, d3 \n\t" //d4-=d1*d3 = {y0*z1 - z0*y1, z0*x1 - x0*z1}

	"vmul.f32 d5, d3, d1[1] \n\t" //d5=d3*d1[1] = {y1*x0, z1*x0}
	"vmls.f32 d5, d0, d2[1] \n\t" //d5-=d0*d2[1]; low lane = x0*y1 - y0*x1

	"vst1.32 d4, [%2] \n\t" //d[0], d[1] = d4
	"add %2, %2, #8 \n\t" //d now points at d[2]
	"fsts s10, [%2] \n\t" //d[2] = low lane of d5

	: "+r"(v0), "+r"(v1), "+r"(d):
	: "d0", "d1", "d2", "d3", "d4", "d5", "memory"
	);
#else
	cross3_c(v0,v1,d);
#endif
}
-------------------------------------------------------------------------------- /math_vec4.c: -------------------------------------------------------------------------------- 1 | /* 2 | The MIT License (MIT) 3 | 4 | Copyright (c) 2015 Lachlan Tychsen-Smith (lachlan.ts@gmail.com) 5 | 6 | Permission is hereby granted, free of charge, to any person obtaining a copy 7 | of this software and associated documentation files (the "Software"), to deal 8 | in the Software without restriction, including without limitation the rights 9 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 | copies of the Software, and to permit persons to whom the Software is 11 | furnished to do so, subject to the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be included in 14 | all copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 19 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 22 | THE SOFTWARE. 
/*
 * vec4 scalar product: v0[0]*v1[0] + v0[1]*v1[1] + v0[2]*v1[2] + v0[3]*v1[3],
 * accumulated in index order.
 */
float dot4_c(float v0[4], float v1[4])
{
	float acc;
	int k;

	acc = v0[0]*v1[0];
	for (k = 1; k < 4; k++) {
		acc += v0[k]*v1[k];
	}
	return acc;
}
d3}, [%0] \n\t" //d2={x0,y0}, d3={z0, w0} 108 | "vld1.32 {d4, d5}, [%1] \n\t" //d4={x1,y1}, d5={z1, w1} 109 | "vmul.f32 d0, d2, d4 \n\t" //d0= d2*d4 110 | "vmla.f32 d0, d3, d5 \n\t" //d0 = d0 + d3*d5 111 | "vpadd.f32 d0, d0 \n\t" //d0 = d[0] + d[1] 112 | :: "r"(v0), "r"(v1) : 113 | ); 114 | #endif 115 | } 116 | 117 | 118 | #ifdef __MATH_NEON 119 | float32_t dot4_neon(float32x4_t v0, float32x4_t v1) 120 | { 121 | float32x2_t a, b, c, d, r; 122 | a = vget_high_f32(v0); 123 | b = vget_low_f32(v0); 124 | c = vget_high_f32(v1); 125 | d = vget_low_f32(v1); 126 | 127 | r = vmul_f32(a, c); 128 | r = vmla_f32(r, b, d); 129 | r = vpadd_f32(r, r); 130 | return vget_lane_f32(r, 0); 131 | } 132 | #endif 133 | 134 | float dot4_neon_sfp(float v0[4], float v1[4]) 135 | { 136 | #ifdef __MATH_NEON 137 | dot4_neon_hfp(v0, v1); 138 | asm volatile ("vmov.f32 r0, s0 \n\t"); 139 | #else 140 | return dot4_c(v0, v1); 141 | #endif 142 | }; 143 | 144 | --------------------------------------------------------------------------------