├── README
├── math_acosf.c
├── math_asinf.c
├── math_atan2f.c
├── math_atanf.c
├── math_ceilf.c
├── math_cosf.c
├── math_coshf.c
├── math_debug.c
├── math_expf.c
├── math_fabsf.c
├── math_floorf.c
├── math_fmodf.c
├── math_invsqrtf.c
├── math_ldexpf.c
├── math_log10f.c
├── math_logf.c
├── math_mat2.c
├── math_mat3.c
├── math_mat4.c
├── math_modf.c
├── math_neon.h
├── math_powf.c
├── math_runfast.c
├── math_sincosf.c
├── math_sinf.c
├── math_sinfv.c
├── math_sinhf.c
├── math_sqrtf.c
├── math_sqrtfv.c
├── math_tanf.c
├── math_tanhf.c
├── math_vec2.c
├── math_vec3.c
└── math_vec4.c


/README:
--------------------------------------------------------------------------------
 1 | 
 2 | Library: 	MATH-NEON
 3 | By:			Lachlan Tychsen-Smith
 4 | Licence:	MIT (expat)
 5 | =======================================================================================
 6 | This project implements the cmath functions and some optimised matrix functions 
 7 | with the aim of increasing the floating point performance of ARM Cortex A-8
 8 | based platforms. As well as implementing the functions in ARM NEON assembly, 
 9 | they sacrifice error checking and some accuracy to achieve better performance.
10 | 
11 | Function Errors:
12 | =======================================================================================
13 | The measurement and characterisation of the inaccuracies present within these 
14 | functions is really a field in itself. For the benchmark I provide the 
15 | maximum absolute, maximum relative and root mean squared error compared to the
16 | cmath implementations over the specified range. However, these values can be 
17 | misleading, especially for functions which quickly go to infinity, so it's always a 
18 | good idea to test them within your actual program. In general, this library will not 
19 | be as accurate as cmath; however, for many functions the difference is small enough 
20 | to be negligible. 
21 | 	
22 | Notes:
23 | =======================================================================================
24 | - The *_c functions are c implementations of the *_neon code.
25 | - As with cmath, the errors present in the functions are very dependent on the 
26 |   range in which you're operating, so you should test them first.
27 | - Look in the "math_neon.h" file for descriptions of the functions. In some 
28 |   function files there are also notes on the specific implementation.
29 | - The *_neon functions make certain assumptions about the location of arguments 
30 |   that are incompatible with inlining. 
31 | 	  
32 | Contact:
33 | =======================================================================================
34 | Name: 	Lachlan Tychsen-Smith 
35 | Email: 	lachlan.ts@gmail.com
36 | 


--------------------------------------------------------------------------------
/math_acosf.c:
--------------------------------------------------------------------------------
 1 | /*
 2 | The MIT License (MIT)
 3 | 
 4 | Copyright (c) 2015 Lachlan Tychsen-Smith (lachlan.ts@gmail.com)
 5 | 
 6 | Permission is hereby granted, free of charge, to any person obtaining a copy
 7 | of this software and associated documentation files (the "Software"), to deal
 8 | in the Software without restriction, including without limitation the rights
 9 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 | copies of the Software, and to permit persons to whom the Software is
11 | furnished to do so, subject to the following conditions:
12 | 
13 | The above copyright notice and this permission notice shall be included in
14 | all copies or substantial portions of the Software.
15 | 
16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22 | THE SOFTWARE.
23 | */
24 | 
25 | #include "math.h"
26 | #include "math_neon.h"
27 | 
28 | /*
29 | Test func : acosf(x)
30 | Test Range: -1.0 < x < 1.0
31 | Peak Error:	~0.005%
32 | RMS  Error: ~0.001%
33 | */
34 | 
35 | const float __acosf_pi_2 = M_PI_2;	// pi/2 stored as a float; the constant term in acos(x) = pi/2 - asin(x)
36 | 
37 | float acosf_c(float x)	// C reference path for acos(x), using the identity acos(x) = pi/2 - asin(x)
38 | {
39 | 	return __acosf_pi_2 - asinf_c(x);	// accuracy and usable input range are whatever asinf_c provides
40 | }
41 | 
42 | 
43 | float acosf_neon_hfp(float x)	// NEON acos: runs asinf_neon_hfp, then replaces d0 with pi/2 - d0
44 | {	// NOTE(review): no return statement on any path; the result is presumably picked up from s0/d0 by a hard-float ABI -- confirm
45 | #ifdef __MATH_NEON
46 | 	asinf_neon_hfp(x);	// the C return value is discarded; the asm below reads d0 directly
47 | 	asm volatile (
48 | 	"vdup.f32	 	d1, %0					\n\t"	//d1 = {pi/2, pi/2};
49 | 	"vsub.f32	 	d0, d1, d0				\n\t"	//d0 = d1 - d0;
50 | 	::"r"(__acosf_pi_2):	// input %0 = pi/2 in a core register; NOTE(review): d0/d1 are written but no clobbers are declared
51 | 	);
52 | #endif	// without __MATH_NEON the function body is empty
53 | }
54 | 
55 | float acosf_neon_sfp(float x)	// acos entry point that moves the argument/result between core and VFP registers around the _hfp routine
56 | {
57 | #ifdef __MATH_NEON
58 | 	asm volatile ("vmov.f32 s0, r0 		\n\t");	// s0 = r0 (assumes x arrived in r0, i.e. a soft-float calling convention -- TODO confirm)
59 | 	acosf_neon_hfp(x);
60 | 	asm volatile ("vmov.f32 r0, s0 		\n\t");	// r0 = s0; there is no C return statement on this path
61 | #else
62 | 	return acosf_c(x);	// non-NEON build: use the C implementation
63 | #endif
64 | }
65 | 
66 | 
67 | 
68 | 


--------------------------------------------------------------------------------
/math_asinf.c:
--------------------------------------------------------------------------------
  1 | /*
  2 | The MIT License (MIT)
  3 | 
  4 | Copyright (c) 2015 Lachlan Tychsen-Smith (lachlan.ts@gmail.com)
  5 | 
  6 | Permission is hereby granted, free of charge, to any person obtaining a copy
  7 | of this software and associated documentation files (the "Software"), to deal
  8 | in the Software without restriction, including without limitation the rights
  9 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 10 | copies of the Software, and to permit persons to whom the Software is
 11 | furnished to do so, subject to the following conditions:
 12 | 
 13 | The above copyright notice and this permission notice shall be included in
 14 | all copies or substantial portions of the Software.
 15 | 
 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 19 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 20 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 21 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 22 | THE SOFTWARE.
 23 | */
 24 | 
 25 | #include "math.h"
 26 | #include "math_neon.h"
 27 | 
 28 | /*
 29 | Test func : asinf(x)
 30 | Test Range: -1.0 < x < 1.0
 31 | Peak Error:	~0.005%
 32 | RMS  Error: ~0.001%
 33 | */
 34 | 
 35 | 
 36 | const float __asinf_lut[4] = {	// odd-power polynomial coefficients for asin; stored as {p7, p3, p5, p1}, the order the NEON path loads into d4/d5
 37 | 	0.105312459675071, 	//p7
 38 | 	0.169303418571894,	//p3
 39 | 	0.051599985887214, 	//p5
 40 | 	0.999954835104825	//p1
 41 | }; 
 42 | 
 43 | const float __asinf_pi_2 = M_PI_2;	// pi/2 stored as a float
 44 | 
 45 | float asinf_c(float x)	// C reference for asin(x) in radians; branch-free, performs no domain checking
 46 | {
 47 | 	//All arithmetic is kept in single precision (fabsf and float literals, no promotion to double).
 48 | 	float a, b, c, d, r, ax;
 49 | 	int m;
 50 | 	
 51 | 	union {
 52 | 		float f;
 53 | 		int i;
 54 | 	} xx;	//lets the float bit pattern be adjusted with integer arithmetic
 55 | 
 56 | 	ax = fabsf(x);
 57 | 	d = 0.5f;
 58 | 	d = d - ax*0.5f;		//d = (1 - |x|) / 2
 59 | 		
 60 | 	//fast invsqrt approx: integer bit-trick seed followed by two Newton steps, xx.f ~ 1/sqrt(d)
 61 | 	xx.f = d;
 62 | 	xx.i = 0x5F3759DF - (xx.i >> 1);		//VRSQRTE
 63 | 	c = d * xx.f;
 64 | 	b = (3.0f - c * xx.f) * 0.5f;		//VRSQRTS
 65 | 	xx.f = xx.f * b;		
 66 | 	c = d * xx.f;
 67 | 	b = (3.0f - c * xx.f) * 0.5f;
 68 |     xx.f = xx.f * b;	
 69 | 
 70 | 	//fast inverse approx: xx.f ~ 1/d, i.e. ~ sqrt((1 - |x|) / 2); m shifts the exponent field to that of 1.0 around the linear seed
 71 | 	d = xx.f;
 72 | 	m = 0x3F800000 - (xx.i & 0x7F800000);
 73 | 	xx.i = xx.i + m;
 74 | 	xx.f = 1.41176471f - 0.47058824f * xx.f;
 75 | 	xx.i = xx.i + m;
 76 | 	b = 2.0f - xx.f * d;		//Newton step 1
 77 | 	xx.f = xx.f * b;	
 78 | 	b = 2.0f - xx.f * d;		//Newton step 2
 79 | 	xx.f = xx.f * b;
 80 | 	
 81 | 	//if |x| > 0.5 -> ax = sqrt((1-ax)/2), result = pi/2 - 2 * poly(ax); a is 1.0f or 0.0f and selects without branching
 82 | 	xx.f = xx.f - ax;	
 83 | 	a = (ax > 0.5f);
 84 | 	d = __asinf_pi_2 * a;		//offset: pi/2 or 0
 85 | 	c = 1.0f - 3.0f * a;		//scale: -2 or 1
 86 | 	ax = ax + xx.f * a;
 87 | 		
 88 | 	//polynomial evaluation: p1*ax + p3*ax^3 + p5*ax^5 + p7*ax^7
 89 | 	xx.f = ax * ax;	
 90 | 	a = (__asinf_lut[0] * ax) * xx.f + (__asinf_lut[2] * ax);
 91 | 	b = (__asinf_lut[1] * ax) * xx.f + (__asinf_lut[3] * ax);
 92 | 	xx.f = xx.f * xx.f;
 93 | 	r = b + a * xx.f; 
 94 | 	r = d + c * r;
 95 | 	//if x < 0 -> r = -r, done by subtracting 2r when the comparison yields 1
 96 | 	a = r + r;
 97 | 	b = (x < 0.0f);
 98 | 	r = r - a * b;
 99 | 	return r;
100 | }
101 | 
102 | 
103 | float asinf_neon_hfp(float x)	// NEON asin following the same steps as asinf_c; reads x from d0[0] and leaves the result in s0
104 | {	// NOTE(review): no C return statement; the value is presumably picked up from s0 by a hard-float ABI -- confirm. Body is empty without __MATH_NEON.
105 | #ifdef __MATH_NEON
106 | 	asm volatile (
107 | 
108 | 	"vdup.f32	 	d0, d0[0]				\n\t"	//d0 = {x, x};
109 | 	"vdup.f32	 	d4, %1					\n\t"	//d4 = {pi/2, pi/2};
110 | 	"vmov.f32	 	d6, d0					\n\t"	//d6 = d0;
111 | 	"vabs.f32	 	d0, d0					\n\t"	//d0 = fabs(d0) ;
112 | 
113 | 	"vmov.f32	 	d5, #0.5				\n\t"	//d5 = 0.5;
114 | 	"vmls.f32	 	d5, d0, d5				\n\t"	//d5 = d5 - d0*d5;
115 | 
116 | 	//fast invsqrt approx
117 | 	"vmov.f32 		d1, d5					\n\t"	//d1 = d5
118 | 	"vrsqrte.f32 	d5, d5					\n\t"	//d5 = ~ 1.0 / sqrt(d5)
119 | 	"vmul.f32 		d2, d5, d1				\n\t"	//d2 = d5 * d1
120 | 	"vrsqrts.f32 	d3, d2, d5				\n\t"	//d3 = (3 - d5 * d2) / 2 	
121 | 	"vmul.f32 		d5, d5, d3				\n\t"	//d5 = d5 * d3
122 | 	"vmul.f32 		d2, d5, d1				\n\t"	//d2 = d5 * d1	
123 | 	"vrsqrts.f32 	d3, d2, d5				\n\t"	//d3 = (3 - d5 * d2) / 2	
124 | 	"vmul.f32 		d5, d5, d3				\n\t"	//d5 = d5 * d3	
125 | 		
126 | 	//fast reciprocal approximation
127 | 	"vrecpe.f32		d1, d5					\n\t"	//d1 = ~ 1 / d5; 
128 | 	"vrecps.f32		d2, d1, d5				\n\t"	//d2 = 2.0 - d1 * d5; 
129 | 	"vmul.f32		d1, d1, d2				\n\t"	//d1 = d1 * d2; 
130 | 	"vrecps.f32		d2, d1, d5				\n\t"	//d2 = 2.0 - d1 * d5; 
131 | 	"vmul.f32		d5, d1, d2				\n\t"	//d5 = d1 * d2; 
132 | 	
133 | 	//if |x| > 0.5 -> ax = sqrt((1-ax)/2), r = pi/2
134 | 	"vsub.f32		d5, d0, d5				\n\t"	//d5 = d0 - d5; 
135 | 	"vmov.f32	 	d2, #0.5				\n\t"	//d2 = 0.5;
136 | 	"vcgt.f32	 	d3, d0, d2				\n\t"	//d3 = (d0 > d2);
137 | 	"vmov.f32		d1, #3.0 				\n\t"	//d1 = 3.0; 	
138 | 	"vshr.u32	 	d3, #31					\n\t"	//d3 = d3 >> 31;
139 | 	"vmov.f32		d16, #1.0 				\n\t"	//d16 = 1.0; 	
140 | 	"vcvt.f32.u32	d3, d3					\n\t"	//d3 = (float) d3;	
141 | 	"vmls.f32		d0, d5, d3[0]			\n\t"	//d0 = d0 - d5 * d3[0]; 	
142 | 	"vmul.f32		d7, d4, d3[0] 			\n\t"	//d7 = d4 * d3[0]; 		
143 | 	"vmls.f32		d16, d1, d3[0] 			\n\t"	//d16 = d16 - d1 * d3[0]; 	
144 | 		
145 | 	//polynomial:
146 | 	"vmul.f32 		d2, d0, d0				\n\t"	//d2 = d0*d0 = {ax^2, ax^2}	
147 | 	"vld1.32 		{d4, d5}, [%0]			\n\t"	//d4 = {p7, p3}, d5 = {p5, p1}
148 | 	"vmul.f32 		d3, d2, d2				\n\t"	//d3 = d2*d2 = {x^4, x^4}		
149 | 	"vmul.f32 		q0, q2, d0[0]			\n\t"	//q0 = q2 * d0[0] = {p7x, p3x, p5x, p1x}
150 | 	"vmla.f32 		d1, d0, d2[0]			\n\t"	//d1 = d1 + d0*d2[0] = {p5x + p7x^3, p1x + p3x^3}		
151 | 	"vmla.f32 		d1, d3, d1[0]			\n\t"	//d1 = d1 + d3*d1[0] = {..., p1x + p3x^3 + p5x^5 + p7x^7}		
152 | 
153 | 	"vmla.f32 		d7, d1, d16				\n\t"	//d7 = d7 + d1*d16		
154 | 
155 | 	"vadd.f32 		d2, d7, d7				\n\t"	//d2 = d7 + d7		
156 | 	"vclt.f32	 	d3, d6, #0				\n\t"	//d3 = (d6 < 0)	
157 | 	"vshr.u32	 	d3, #31					\n\t"	//d3 = d3 >> 31;
158 | 	"vcvt.f32.u32	d3, d3					\n\t"	//d3 = (float) d3	
159 | 	"vmls.f32 		d7, d2, d3[0]			\n\t"	//d7 = d7 - d2 * d3[0];
160 | 
161 | 	"vmov.f32 		s0, s15					\n\t"	//s0 = s15 (upper lane of d7)
162 | 
163 | 	:: "r"(__asinf_lut),  "r"(__asinf_pi_2) 	//%0 = coefficient table address, %1 = pi/2
164 |     : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7"	//NOTE(review): d16 is also written above but is not listed as a clobber
165 | 	);
166 | #endif
167 | }
168 | 
169 | 
170 | float asinf_neon_sfp(float x)	// asin entry point that moves the argument/result between core and VFP registers around the _hfp routine
171 | {
172 | #ifdef __MATH_NEON
173 | 	asm volatile ("vmov.f32 s0, r0 		\n\t");	// s0 = r0 (assumes x arrived in r0, i.e. a soft-float calling convention -- TODO confirm)
174 | 	asinf_neon_hfp(x);
175 | 	asm volatile ("vmov.f32 r0, s0 		\n\t");	// r0 = s0; there is no C return statement on this path
176 | #else
177 | 	return asinf_c(x);	// non-NEON build: use the C implementation
178 | #endif
179 | }
180 | 
181 | 
182 | 
183 | 
184 | 


--------------------------------------------------------------------------------
/math_atan2f.c:
--------------------------------------------------------------------------------
  1 | /*
  2 | The MIT License (MIT)
  3 | 
  4 | Copyright (c) 2015 Lachlan Tychsen-Smith (lachlan.ts@gmail.com)
  5 | 
  6 | Permission is hereby granted, free of charge, to any person obtaining a copy
  7 | of this software and associated documentation files (the "Software"), to deal
  8 | in the Software without restriction, including without limitation the rights
  9 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 10 | copies of the Software, and to permit persons to whom the Software is
 11 | furnished to do so, subject to the following conditions:
 12 | 
 13 | The above copyright notice and this permission notice shall be included in
 14 | all copies or substantial portions of the Software.
 15 | 
 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 19 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 20 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 21 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 22 | THE SOFTWARE.
 23 | */
 24 | 
 25 | #include "math.h"
 26 | #include "math_neon.h"
 27 | 
 28 | const float __atan2f_lut[4] = {	// odd-power polynomial coefficients for atan; stored as {p7, p3, p5, p1}, the order the NEON path loads into d4/d5
 29 | 	-0.0443265554792128,	//p7
 30 | 	-0.3258083974640975,	//p3
 31 | 	+0.1555786518463281,	//p5
 32 | 	+0.9997878412794807  	//p1
 33 | }; 
 34 |  
 35 | const float __atan2f_pi_2 = M_PI_2;	// pi/2 stored as a float
 36 | 
 37 | float atan2f_c(float y, float x)	// C reference for atan2(y, x) in radians; branch-free, single-precision arithmetic throughout
 38 | {	// y: ordinate, x: abscissa. |x| < 0.000001 is treated as x == 0 and yields +/- pi/2.
 39 | 	float a, b, c, r, xx;
 40 | 	int m;
 41 | 	union {
 42 | 		float f;
 43 | 		int i;
 44 | 	} xinv;	//lets the float bit pattern be adjusted with integer arithmetic
 45 | 
 46 | 	//fast inverse approximation (2x newton): xinv.f ~ 1/|x|
 47 | 	xx = fabsf(x);
 48 | 	xinv.f = xx;
 49 | 	m = 0x3F800000 - (xinv.i & 0x7F800000);
 50 | 	xinv.i = xinv.i + m;
 51 | 	xinv.f = 1.41176471f - 0.47058824f * xinv.f;
 52 | 	xinv.i = xinv.i + m;
 53 | 	b = 2.0f - xinv.f * xx;
 54 | 	xinv.f = xinv.f * b;	
 55 | 	b = 2.0f - xinv.f * xx;
 56 | 	xinv.f = xinv.f * b;
 57 | 	
 58 | 	c = fabsf(y * xinv.f);		//c = |y / x|
 59 | 
 60 | 	//fast inverse approximation (2x newton): xinv.f ~ 1/c
 61 | 	xinv.f = c;
 62 | 	m = 0x3F800000 - (xinv.i & 0x7F800000);
 63 | 	xinv.i = xinv.i + m;
 64 | 	xinv.f = 1.41176471f - 0.47058824f * xinv.f;
 65 | 	xinv.i = xinv.i + m;
 66 | 	b = 2.0f - xinv.f * c;
 67 | 	xinv.f = xinv.f * b;	
 68 | 	b = 2.0f - xinv.f * c;
 69 | 	xinv.f = xinv.f * b;
 70 | 	
 71 | 	//if c > 1.0 -> c = -1/c, r = pi/2; a is 1.0f or 0.0f and selects without branching
 72 | 	xinv.f = xinv.f + c;
 73 | 	a = (c > 1.0f);
 74 | 	c = c - a * xinv.f;
 75 | 	r = a * __atan2f_pi_2;
 76 | 	
 77 | 	//polynomial evaluation: r += p1*c + p3*c^3 + p5*c^5 + p7*c^7
 78 | 	xx = c * c;	
 79 | 	a = (__atan2f_lut[0] * c) * xx + (__atan2f_lut[2] * c);
 80 | 	b = (__atan2f_lut[1] * c) * xx + (__atan2f_lut[3] * c);
 81 | 	xx = xx * xx;
 82 | 	r = r + a * xx; 
 83 | 	r = r + b;
 84 | 
 85 | 	//determine quadrant and test for small x.
 86 | 	b = M_PI;
 87 | 	b = b - 2.0f * r;
 88 | 	r = r + (x < 0.0f) * b;		//x < 0 -> r = pi - r
 89 | 	b = (fabsf(x) < 0.000001f);
 90 | 	c = !b;
 91 | 	r = c * r;
 92 | 	r = r + __atan2f_pi_2 * b;		//tiny |x| -> r = pi/2
 93 | 	b = r + r;
 94 | 	r = r - (y < 0.0f) * b;		//y < 0 -> r = -r
 95 | 	
 96 | 	return r;
 97 | }
 98 | 
 99 | float atan2f_neon_hfp(float y, float x)	// NEON atan2 core; reads y from d0[0] and x from d0[1], writes the result to s0
100 | {	// NOTE(review): no C return statement (body is empty without __MATH_NEON); no instruction below tests the sign or magnitude of x, unlike the quadrant/small-x step in atan2f_c -- confirm intended
101 | #ifdef __MATH_NEON
102 | 	asm volatile (
103 | 
104 | 	"vdup.f32	 	d17, d0[1]				\n\t"	//d17 = {x, x};
105 | 	"vdup.f32	 	d16, d0[0]				\n\t"	//d16 = {y, y};
106 | 	
107 | 	//1.0 / x
108 | 	"vrecpe.f32		d18, d17				\n\t"	//d18 = ~ 1 / d17; 
109 | 	"vrecps.f32		d19, d18, d17			\n\t"	//d19 = 2.0 - d18 * d17; 
110 | 	"vmul.f32		d18, d18, d19			\n\t"	//d18 = d18 * d19; 
111 | 	"vrecps.f32		d19, d18, d17			\n\t"	//d19 = 2.0 - d18 * d17; 
112 | 	"vmul.f32		d18, d18, d19			\n\t"	//d18 = d18 * d19; 
113 | 
114 | 	//y * (1.0 /x)
115 | 	"vmul.f32		d0, d16, d18			\n\t"	//d0 = d16 * d18; 
116 | 
117 | 
118 | 	"vdup.f32	 	d4, %1					\n\t"	//d4 = {pi/2, pi/2};
119 | 	"vmov.f32	 	d6, d0					\n\t"	//d6 = d0;
120 | 	"vabs.f32	 	d0, d0					\n\t"	//d0 = fabs(d0) ;
121 | 
122 | 	//fast reciprocal approximation
123 | 	"vrecpe.f32		d1, d0					\n\t"	//d1 = ~ 1 / d0; 
124 | 	"vrecps.f32		d2, d1, d0				\n\t"	//d2 = 2.0 - d1 * d0; 
125 | 	"vmul.f32		d1, d1, d2				\n\t"	//d1 = d1 * d2; 
126 | 	"vrecps.f32		d2, d1, d0				\n\t"	//d2 = 2.0 - d1 * d0; 
127 | 	"vmul.f32		d1, d1, d2				\n\t"	//d1 = d1 * d2; 
128 | 
129 | 	//if |y/x| > 1.0 -> ax = -1/ax, r = pi/2
130 | 	"vadd.f32		d1, d1, d0				\n\t"	//d1 = d1 + d0; 
131 | 	"vmov.f32	 	d2, #1.0				\n\t"	//d2 = 1.0;
132 | 	"vcgt.f32	 	d3, d0, d2				\n\t"	//d3 = (d0 > d2);
133 | 	"vcvt.f32.u32	d3, d3					\n\t"	//d3 = (float) d3; NOTE(review): a true compare lane is all ones, which converts to ~4.29e9 rather than 1.0; no "vshr.u32 #31" precedes this convert -- verify
134 | 	"vmls.f32		d0, d1, d3				\n\t"	//d0 = d0 - d1 * d3; 	
135 | 	"vmul.f32		d7, d3, d4				\n\t"	//d7 = d3 * d4; 	
136 | 		
137 | 	//polynomial:
138 | 	"vmul.f32 		d2, d0, d0				\n\t"	//d2 = d0*d0 = {ax^2, ax^2}	
139 | 	"vld1.32 		{d4, d5}, [%0]			\n\t"	//d4 = {p7, p3}, d5 = {p5, p1}
140 | 	"vmul.f32 		d3, d2, d2				\n\t"	//d3 = d2*d2 = {x^4, x^4}		
141 | 	"vmul.f32 		q0, q2, d0[0]			\n\t"	//q0 = q2 * d0[0] = {p7x, p3x, p5x, p1x}
142 | 	"vmla.f32 		d1, d0, d2[0]			\n\t"	//d1 = d1 + d0*d2[0] = {p5x + p7x^3, p1x + p3x^3}		
143 | 	"vmla.f32 		d1, d3, d1[0]			\n\t"	//d1 = d1 + d3*d1[0] = {..., p1x + p3x^3 + p5x^5 + p7x^7}		
144 | 	"vadd.f32 		d1, d1, d7				\n\t"	//d1 = d1 + d7		
145 | 	
146 | 	"vadd.f32 		d2, d1, d1				\n\t"	//d2 = d1 + d1		
147 | 	"vclt.f32	 	d3, d6, #0				\n\t"	//d3 = (d6 < 0)	
148 | 	"vcvt.f32.u32	d3, d3					\n\t"	//d3 = (float) d3; NOTE(review): same missing ">> 31" as on the earlier convert -- verify	
149 | 	"vmls.f32 		d1, d3, d2				\n\t"	//d1 = d1 - d2 * d3;
150 | 
151 | 	"vmov.f32 		s0, s3					\n\t"	//s0 = s3
152 | 
153 | 	:: "r"(__atan2f_lut),  "r"(__atan2f_pi_2) 	//%0 = coefficient table address, %1 = pi/2
154 |     : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7"	//NOTE(review): d16-d19 are also written above but are not listed as clobbers
155 | 	);
156 | #endif
157 | }
158 | 
159 | 
160 | float atan2f_neon_sfp(float x, float y)	// atan2 entry point; NOTE: despite the parameter names, the FIRST argument is the ordinate (y) and the SECOND the abscissa (x)
161 | {	// Both build paths forward the arguments positionally, so the NEON and C results refer to the same angle.
162 | #ifdef __MATH_NEON
163 | 	asm volatile ("vmov.f32 s0, r0 		\n\t");	// s0 = r0 (first argument; assumes a soft-float calling convention -- TODO confirm)
164 | 	asm volatile ("vmov.f32 s1, r1 		\n\t");	// s1 = r1 (second argument)
165 | 	atan2f_neon_hfp(x, y);	// first argument lands in atan2f_neon_hfp's y slot
166 | 	asm volatile ("vmov.f32 r0, s0 		\n\t");	// r0 = s0; there is no C return statement on this path
167 | #else
168 | 	return atan2f_c(x, y);	// was atan2f_c(y, x), which swapped the arguments relative to the NEON path
169 | #endif
170 | }
171 | 


--------------------------------------------------------------------------------
/math_atanf.c:
--------------------------------------------------------------------------------
  1 | /*
  2 | The MIT License (MIT)
  3 | 
  4 | Copyright (c) 2015 Lachlan Tychsen-Smith (lachlan.ts@gmail.com)
  5 | 
  6 | Permission is hereby granted, free of charge, to any person obtaining a copy
  7 | of this software and associated documentation files (the "Software"), to deal
  8 | in the Software without restriction, including without limitation the rights
  9 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 10 | copies of the Software, and to permit persons to whom the Software is
 11 | furnished to do so, subject to the following conditions:
 12 | 
 13 | The above copyright notice and this permission notice shall be included in
 14 | all copies or substantial portions of the Software.
 15 | 
 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 19 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 20 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 21 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 22 | THE SOFTWARE.
 23 | */
 24 | 
 25 | #include "math.h"
 26 | #include "math_neon.h"
 27 | 
 28 | const float __atanf_lut[4] = {	// odd-power polynomial coefficients for atan; stored as {p7, p3, p5, p1}, the order the NEON path loads into d4/d5
 29 | 	-0.0443265554792128,	//p7
 30 | 	-0.3258083974640975,	//p3
 31 | 	+0.1555786518463281,	//p5
 32 | 	+0.9997878412794807  	//p1
 33 | }; 
 34 |  
 35 | const float __atanf_pi_2 = M_PI_2;	// pi/2 stored as a float
 36 |     
 37 | float atanf_c(float x)	// C reference for atan(x) in radians; branch-free, single-precision arithmetic throughout
 38 | {
 39 | 	//The union gives integer access to the float bit pattern for the reciprocal seed.
 40 | 	float a, b, r, xx;
 41 | 	int m;
 42 | 	
 43 | 	union {
 44 | 		float f;
 45 | 		int i;
 46 | 	} xinv, ax;
 47 | 
 48 | 	ax.f = fabsf(x);
 49 | 	
 50 | 	//fast inverse approximation (2x newton): xinv.f ~ 1/|x|
 51 | 	xinv.f = ax.f;
 52 | 	m = 0x3F800000 - (xinv.i & 0x7F800000);
 53 | 	xinv.i = xinv.i + m;
 54 | 	xinv.f = 1.41176471f - 0.47058824f * xinv.f;
 55 | 	xinv.i = xinv.i + m;
 56 | 	b = 2.0f - xinv.f * ax.f;
 57 | 	xinv.f = xinv.f * b;	
 58 | 	b = 2.0f - xinv.f * ax.f;
 59 | 	xinv.f = xinv.f * b;
 60 | 	
 61 | 	//if |x| > 1.0 -> ax = -1/ax, r = pi/2; a is 1.0f or 0.0f and selects without branching
 62 | 	xinv.f = xinv.f + ax.f;
 63 | 	a = (ax.f > 1.0f);
 64 | 	ax.f = ax.f - a * xinv.f;
 65 | 	r = a * __atanf_pi_2;
 66 | 	
 67 | 	//polynomial evaluation: r += p1*ax + p3*ax^3 + p5*ax^5 + p7*ax^7
 68 | 	xx = ax.f * ax.f;	
 69 | 	a = (__atanf_lut[0] * ax.f) * xx + (__atanf_lut[2] * ax.f);
 70 | 	b = (__atanf_lut[1] * ax.f) * xx + (__atanf_lut[3] * ax.f);
 71 | 	xx = xx * xx;
 72 | 	b = b + a * xx; 
 73 | 	r = r + b;
 74 | 
 75 | 	//if x < 0 -> r = -r, done by subtracting 2r when the comparison yields 1
 76 | 	a = 2 * r;
 77 | 	b = (x < 0.0f);
 78 | 	r = r - a * b;
 79 | 
 80 | 	return r;
 81 | }
 82 | 
 83 | 
 84 | float atanf_neon_hfp(float x)	// NEON atan following the same steps as atanf_c; reads x from d0[0] and writes the result to s0
 85 | {	// NOTE(review): no C return statement; the value is presumably picked up from s0 by a hard-float ABI -- confirm. Body is empty without __MATH_NEON.
 86 | #ifdef __MATH_NEON
 87 | 	asm volatile (
 88 | 
 89 | 	"vdup.f32	 	d0, d0[0]				\n\t"	//d0 = {x, x};
 90 | 
 91 | 	"vdup.f32	 	d4, %1					\n\t"	//d4 = {pi/2, pi/2};
 92 | 	"vmov.f32	 	d6, d0					\n\t"	//d6 = d0;
 93 | 	"vabs.f32	 	d0, d0					\n\t"	//d0 = fabs(d0) ;
 94 | 
 95 | 	//fast reciprocal approximation
 96 | 	"vrecpe.f32		d1, d0					\n\t"	//d1 = ~ 1 / d0; 
 97 | 	"vrecps.f32		d2, d1, d0				\n\t"	//d2 = 2.0 - d1 * d0; 
 98 | 	"vmul.f32		d1, d1, d2				\n\t"	//d1 = d1 * d2; 
 99 | 	"vrecps.f32		d2, d1, d0				\n\t"	//d2 = 2.0 - d1 * d0; 
100 | 	"vmul.f32		d1, d1, d2				\n\t"	//d1 = d1 * d2; 
101 | 
102 | 		
103 | 	//if |x| > 1.0 -> ax = -1/ax, r = pi/2
104 | 	"vadd.f32		d1, d1, d0				\n\t"	//d1 = d1 + d0; 
105 | 	"vmov.f32	 	d2, #1.0				\n\t"	//d2 = 1.0;
106 | 	"vcgt.f32	 	d3, d0, d2				\n\t"	//d3 = (d0 > d2);
107 | 	"vshr.u32	 	d3, #31					\n\t"	//d3 = d3 >> 31;
108 | 	"vcvt.f32.u32	d3, d3					\n\t"	//d3 = (float) d3;	
109 | 	"vmls.f32		d0, d1, d3[0]			\n\t"	//d0 = d0 - d1 * d3[0]; 	
110 | 	"vmul.f32		d7, d4, d3[0] 			\n\t"	//d7 = d4 * d3[0]; 	
111 | 	
112 | 	//polynomial:
113 | 	"vmul.f32 		d2, d0, d0				\n\t"	//d2 = d0*d0 = {ax^2, ax^2}	
114 | 	"vld1.32 		{d4, d5}, [%0]			\n\t"	//d4 = {p7, p3}, d5 = {p5, p1}
115 | 	"vmul.f32 		d3, d2, d2				\n\t"	//d3 = d2*d2 = {x^4, x^4}		
116 | 	"vmul.f32 		q0, q2, d0[0]			\n\t"	//q0 = q2 * d0[0] = {p7x, p3x, p5x, p1x}
117 | 	"vmla.f32 		d1, d0, d2[0]			\n\t"	//d1 = d1 + d0*d2[0] = {p5x + p7x^3, p1x + p3x^3}		
118 | 	"vmla.f32 		d1, d3, d1[0]			\n\t"	//d1 = d1 + d3*d1[0] = {..., p1x + p3x^3 + p5x^5 + p7x^7}		
119 | 	"vadd.f32 		d1, d1, d7				\n\t"	//d1 = d1 + d7		
120 | 
121 | 	"vadd.f32 		d2, d1, d1				\n\t"	//d2 = d1 + d1		
122 | 	"vclt.f32	 	d3, d6, #0				\n\t"	//d3 = (d6 < 0)	
123 | 	"vshr.u32	 	d3, #31					\n\t"	//d3 = d3 >> 31;
124 | 	"vcvt.f32.u32	d3, d3					\n\t"	//d3 = (float) d3	
125 | 	"vmls.f32 		d1, d3, d2				\n\t"	//d1 = d1 - d2 * d3;
126 | 
127 | 	"vmov.f32 		s0, s3					\n\t"	//s0 = s3
128 | 
129 | 	:: "r"(__atanf_lut),  "r"(__atanf_pi_2) 	//%0 = coefficient table address, %1 = pi/2
130 |     : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7"
131 | 	);
132 | 
133 | #endif
134 | }
135 | 
136 | 
137 | float atanf_neon_sfp(float x)	// atan entry point that moves the argument/result between core and VFP registers around the _hfp routine
138 | {
139 | #ifdef __MATH_NEON
140 | 	asm volatile ("vdup.f32 d0, r0 		\n\t");	// d0 = {r0, r0} (assumes x arrived in r0, i.e. a soft-float calling convention -- TODO confirm)
141 | 	atanf_neon_hfp(x);
142 | 	asm volatile ("vmov.f32 r0, s0 		\n\t");	// r0 = s0; there is no C return statement on this path
143 | #else
144 | 	return atanf_c(x);	// non-NEON build: use the C implementation
145 | #endif
146 | };
147 | 
148 | 
149 | 
150 | 


--------------------------------------------------------------------------------
/math_ceilf.c:
--------------------------------------------------------------------------------
 1 | /*
 2 | The MIT License (MIT)
 3 | 
 4 | Copyright (c) 2015 Lachlan Tychsen-Smith (lachlan.ts@gmail.com)
 5 | 
 6 | Permission is hereby granted, free of charge, to any person obtaining a copy
 7 | of this software and associated documentation files (the "Software"), to deal
 8 | in the Software without restriction, including without limitation the rights
 9 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 | copies of the Software, and to permit persons to whom the Software is
11 | furnished to do so, subject to the following conditions:
12 | 
13 | The above copyright notice and this permission notice shall be included in
14 | all copies or substantial portions of the Software.
15 | 
16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22 | THE SOFTWARE.
23 | */
24 | 
25 | /*
26 | Assumes the floating point value |x| < 2147483648
27 | */
28 | 
29 | #include "math.h"
30 | #include "math_neon.h"
31 | 
32 | float ceilf_c(float x)
33 | {
34 | 	// Drop the fraction with an int round trip (needs |x| < 2147483648).
35 | 	const float truncated = (float) ((int) x);
36 | 
37 | 	// The cast rounds toward zero, so a positive fraction was rounded down:
38 | 	// the comparison is 1 exactly in that case and bumps the result up by one.
39 | 	return truncated + (x > truncated);
40 | }
41 | 
42 | float ceilf_neon_hfp(float x)	// NEON ceil using the same truncate-then-adjust scheme as ceilf_c; operates in place on d0
43 | {	// NOTE(review): no C return statement; the value is presumably picked up from s0/d0 by a hard-float ABI -- confirm. Body is empty without __MATH_NEON.
44 | #ifdef __MATH_NEON
45 | 	asm volatile (
46 | 
47 | 	"vcvt.s32.f32 	d1, d0					\n\t"	//d1 = (int) d0;
48 | 	"vcvt.f32.s32 	d1, d1					\n\t"	//d1 = (float) d1;
49 | 	"vcgt.f32 		d0, d0, d1				\n\t"	//d0 = (d0 > d1);
50 | 	"vshr.u32 		d0, #31					\n\t"	//d0 = d0 >> 31; (all-ones compare mask becomes integer 1)
51 | 	"vcvt.f32.u32 	d0, d0					\n\t"	//d0 = (float) d0;
52 | 	"vadd.f32 		d0, d1, d0				\n\t"	//d0 = d1 + d0;
53 | 
54 | 	::: "d0", "d1"	//no operands; d0 and d1 are declared as clobbered
55 | 	);
56 | 		
57 | #endif
58 | }
59 | 
60 | float ceilf_neon_sfp(float x)	// ceil entry point that moves the argument/result between core and VFP registers around the _hfp routine
61 | {
62 | #ifdef __MATH_NEON
63 | 	asm volatile ("vmov.f32 s0, r0 		\n\t");	// s0 = r0 (assumes x arrived in r0, i.e. a soft-float calling convention -- TODO confirm)
64 | 	ceilf_neon_hfp(x);
65 | 	asm volatile ("vmov.f32 r0, s0 		\n\t");	// r0 = s0; there is no C return statement on this path
66 | #else
67 | 	return ceilf_c(x);	// non-NEON build: use the C implementation
68 | #endif
69 | };
70 | 
71 | 
72 | 


--------------------------------------------------------------------------------
/math_cosf.c:
--------------------------------------------------------------------------------
 1 | /*
 2 | The MIT License (MIT)
 3 | 
 4 | Copyright (c) 2015 Lachlan Tychsen-Smith (lachlan.ts@gmail.com)
 5 | 
 6 | Permission is hereby granted, free of charge, to any person obtaining a copy
 7 | of this software and associated documentation files (the "Software"), to deal
 8 | in the Software without restriction, including without limitation the rights
 9 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 | copies of the Software, and to permit persons to whom the Software is
11 | furnished to do so, subject to the following conditions:
12 | 
13 | The above copyright notice and this permission notice shall be included in
14 | all copies or substantial portions of the Software.
15 | 
16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22 | THE SOFTWARE.
23 | */
24 | 
25 | #include "math_neon.h"
26 | 
27 | float cosf_c(float x)	// C reference path for cos(x), using the identity cos(x) = sin(x + pi/2)
28 | {
29 | 	return sinf_c(x + M_PI_2);	// accuracy is whatever sinf_c provides; M_PI_2 is presumably the double macro from math.h, so the sum is formed in double -- confirm
30 | }
31 | 
32 | float cosf_neon_hfp(float x)	// NEON cos: shifts the argument by pi/2 in C and delegates to sinf_neon_hfp
33 | {
34 | #ifdef __MATH_NEON
35 | 	float xx = x + M_PI_2;	// cos(x) = sin(x + pi/2)
36 | 	return sinf_neon_hfp(xx);
37 | #endif	// NOTE(review): without __MATH_NEON the body is empty and nothing is returned
38 | }
39 | 
40 | float cosf_neon_sfp(float x)	// cos entry point that moves the argument/result between core and VFP registers around the _hfp routine
41 | {
42 | #ifdef __MATH_NEON
43 | 	asm volatile ("vdup.f32 d0, r0 		\n\t");	// d0 = {r0, r0} (assumes x arrived in r0, i.e. a soft-float calling convention -- TODO confirm)
44 | 	cosf_neon_hfp(x);
45 | 	asm volatile ("vmov.f32 r0, s0 		\n\t");	// r0 = s0; there is no C return statement on this path
46 | #else
47 | 	return cosf_c(x);	// non-NEON build: use the C implementation
48 | #endif
49 | };
50 | 
51 | 


--------------------------------------------------------------------------------
/math_coshf.c:
--------------------------------------------------------------------------------
  1 | /*
  2 | The MIT License (MIT)
  3 | 
  4 | Copyright (c) 2015 Lachlan Tychsen-Smith (lachlan.ts@gmail.com)
  5 | 
  6 | Permission is hereby granted, free of charge, to any person obtaining a copy
  7 | of this software and associated documentation files (the "Software"), to deal
  8 | in the Software without restriction, including without limitation the rights
  9 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 10 | copies of the Software, and to permit persons to whom the Software is
 11 | furnished to do so, subject to the following conditions:
 12 | 
 13 | The above copyright notice and this permission notice shall be included in
 14 | all copies or substantial portions of the Software.
 15 | 
 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 19 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 20 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 21 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 22 | THE SOFTWARE.
 23 | */
 24 | 
 25 | #include "math.h"
 26 | #include "math_neon.h"
 27 | 
 28 | const float __coshf_rng[2] = {	// range-reduction constants, loaded by the NEON path as {invrange, range}
 29 | 	1.442695041f,	// 1/ln(2)
 30 | 	0.693147180f	// ln(2)
 31 | };
 32 | 
 33 | const float __coshf_lut[16] = {	// exp polynomial coefficients p7..p0; each value appears twice so one 64-bit load fills both lanes of a d register
 34 | 	0.00019578093328483123,	//p7
 35 | 	0.00019578093328483123,	//p7
 36 | 	0.0014122663401803872, 	//p6
 37 | 	0.0014122663401803872, 	//p6
 38 | 	0.008336936973260111, 	//p5
 39 | 	0.008336936973260111, 	//p5
 40 | 	0.04165989275009526, 	//p4
 41 | 	0.04165989275009526, 	//p4
 42 | 	0.16666570253074878, 	//p3
 43 | 	0.16666570253074878, 	//p3
 44 | 	0.5000006143673624, 	//p2
 45 | 	0.5000006143673624, 	//p2
 46 | 	1.000000059694879, 		//p1
 47 | 	1.000000059694879, 		//p1
 48 | 	0.9999999916728642,		//p0
 49 | 	0.9999999916728642		//p0
 50 | };
 51 | 
 52 |   
 53 | float coshf_c(float x)
 54 | {
 55 | 	// cosh(x) = (e^x + e^-x) / 2, built on this library's expf_c
 56 | 	// approximation; no overflow or range checks are made.
 57 | 	const float exp_pos = expf_c(x);
 58 | 	const float exp_neg = expf_c(-x);
 59 | 
 60 | 	// Halve each exponential first, then add the two halves.
 61 | 	return exp_pos * 0.5f + 0.5f * exp_neg;
 62 | }
 63 | 
 64 | 
 65 | float coshf_neon_hfp(float x)	// NEON cosh: evaluates exp on the lane pair {x, -x} at once, then averages the two lanes
 66 | {	// NOTE(review): no C return statement; the value is presumably picked up from s0/d0 by a hard-float ABI -- confirm. Body is empty without __MATH_NEON.
 67 | #ifdef __MATH_NEON
 68 | 	asm volatile (
 69 | 	"vdup.f32 		d0, d0[0]				\n\t"	//d0 = {x, x}	
 70 | 	"fnegs 			s1, s1					\n\t"	//s1 = -s1, so d0 = {x, -x}
 71 | 	
 72 | 	//Range Reduction:
 73 | 	"vld1.32 		d2, [%0]				\n\t"	//d2 = {invrange, range}
 74 | 	"vld1.32 		{d16, d17}, [%1]!		\n\t"	//d16 = {p7, p7}, d17 = {p6, p6}; %1 advances past them
 75 | 	"vmul.f32 		d6, d0, d2[0]			\n\t"	//d6 = d0 * d2[0] 
 76 | 	"vcvt.s32.f32 	d6, d6					\n\t"	//d6 = (int) d6
 77 | 	"vld1.32 		{d18}, [%1]!			\n\t"	//d18 = {p5, p5}
 78 | 	"vcvt.f32.s32 	d1, d6					\n\t"	//d1 = (float) d6
 79 | 	"vld1.32 		{d19}, [%1]!			\n\t"	//d19 = {p4, p4}
 80 | 	"vmls.f32 		d0, d1, d2[1]			\n\t"	//d0 = d0 - d1 * d2[1]
 81 | 	"vld1.32 		{d20}, [%1]!			\n\t"	//d20 = {p3, p3}
 82 | 		
 83 | 	//polynomial:
 84 | 	"vmla.f32 		d17, d16, d0			\n\t"	//d17 = d17 + d16 * d0;	
 85 | 	"vld1.32 		{d21}, [%1]!			\n\t"	//d21 = {p2, p2}
 86 | 	"vmla.f32 		d18, d17, d0			\n\t"	//d18 = d18 + d17 * d0;	
 87 | 	"vld1.32 		{d22}, [%1]!			\n\t"	//d22 = {p1, p1}
 88 | 	"vmla.f32 		d19, d18, d0			\n\t"	//d19 = d19 + d18 * d0;	
 89 | 	"vld1.32 		{d23}, [%1]!			\n\t"	//d23 = {p0, p0}
 90 | 	"vmla.f32 		d20, d19, d0			\n\t"	//d20 = d20 + d19 * d0;	
 91 | 	"vmla.f32 		d21, d20, d0			\n\t"	//d21 = d21 + d20 * d0;	
 92 | 	"vmla.f32 		d22, d21, d0			\n\t"	//d22 = d22 + d21 * d0;	
 93 | 	"vmla.f32 		d23, d22, d0			\n\t"	//d23 = d23 + d22 * d0;	
 94 | 	
 95 | 	//multiply by 2 ^ m 	
 96 | 	"vshl.i32 		d6, d6, #23				\n\t"	//d6 = d6 << 23		
 97 | 	"vadd.i32 		d0, d23, d6				\n\t"	//d0 = d23 + d6 (integer add into the float exponent field)		
 98 | 
 99 | 	"vdup.f32 		d2, d0[1]				\n\t"	//d2 = s1		
100 | 	"vmov.f32 		d1, #0.5				\n\t"	//d1 = 0.5		
101 | 	"vadd.f32 		d0, d0, d2				\n\t"	//d0 = d0 + d2		
102 | 	"vmul.f32 		d0, d1					\n\t"	//d0 = d0 * d1		
103 | 
104 | 	:: "r"(__coshf_rng), "r"(__coshf_lut) 	//%0 = range constants, %1 = coefficient table; NOTE(review): %1 is post-incremented above but is declared input-only
105 |     : "d0", "d1", "q1", "q2", "d6"	//NOTE(review): d16-d23 are also written above but are not listed as clobbers
106 | 	);
107 | 		
108 | #endif
109 | }
110 | 
/*
Wrapper around coshf_neon_hfp() that shuttles the value between r0 and s0.

With __MATH_NEON: copies r0 into s0, calls coshf_neon_hfp(), then copies s0
back into r0. There is no C-level return statement on this path.
NOTE(review): presumably written for the softfp convention, where the float
argument arrives in r0 and the result is returned in r0 - confirm.
Without __MATH_NEON: returns coshf_c(x).
*/
float coshf_neon_sfp(float x)
{
#ifdef __MATH_NEON
	asm volatile ("vmov.f32 s0, r0 		\n\t");
	coshf_neon_hfp(x);
	asm volatile ("vmov.f32 r0, s0 		\n\t");
#else
	return coshf_c(x);
#endif
}
121 | 


--------------------------------------------------------------------------------
/math_debug.c:
--------------------------------------------------------------------------------
  1 | /*
  2 | The MIT License (MIT)
  3 | 
  4 | Copyright (c) 2015 Lachlan Tychsen-Smith (lachlan.ts@gmail.com)
  5 | 
  6 | Permission is hereby granted, free of charge, to any person obtaining a copy
  7 | of this software and associated documentation files (the "Software"), to deal
  8 | in the Software without restriction, including without limitation the rights
  9 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 10 | copies of the Software, and to permit persons to whom the Software is
 11 | furnished to do so, subject to the following conditions:
 12 | 
 13 | The above copyright notice and this permission notice shall be included in
 14 | all copies or substantial portions of the Software.
 15 | 
 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 19 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 20 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 21 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 22 | THE SOFTWARE.
 23 | */
 24 | 
 25 | #include "math_neon.h"
 26 | #include <stdlib.h>
 27 | #include <stdio.h>
 28 | #include <string.h>
 29 | #include <math.h>
 30 | #include <time.h>
 31 | #ifdef WIN32
 32 | #include <time.h>
 33 | #else
 34 | #include <sys/time.h>
 35 | #include <sys/resource.h>
 36 | #endif
 37 | 
 38 | #define randf()	(rand() / (RAND_MAX + 1.0f))
 39 | 
 40 | 
 41 | 
 42 | struct	test1_s {
 43 | 	const char*	name;
 44 | 	float 		(*func)(float);	//the function
 45 | 	float 		(*bench)(float);	//the function to benchmark against.
 46 | 	float 		rng0, rng1;
 47 | 	int			num;
 48 | 	float 		emaxabs;
 49 | 	float 		xmaxabs;
 50 | 	float 		emaxrel;
 51 | 	float 		xmaxrel;
 52 | 	float 		erms;
 53 | 	int			time;				//time to execute num functions;
 54 | };
 55 | 
 56 | struct	test2_s {
 57 | 	const char*	name;
 58 | 	float 		(*func)(float, float);	//the function
 59 | 	float 		(*bench)(float, float);	//the function to benchmark against.
 60 | 	float 		rng0, rng1;
 61 | 	int			num;
 62 | 	float 		emaxabs;
 63 | 	float 		xmaxabs;
 64 | 	float 		emaxrel;
 65 | 	float 		xmaxrel;
 66 | 	float 		erms;
 67 | 	int			time;				//time to execute num functions;
 68 | };
 69 | 
 70 | 
 71 | float invsqrtf(float x){
 72 | 	return (1.0f / sqrtf(x));
 73 | }
 74 | 
//Short aliases for the benchmark table row types.
typedef struct test1_s test1_t;
typedef struct test2_s test2_t;
 77 | 
 78 | test1_t test1[51] = 
 79 | {
 80 | 	{"sinf       ", 	sinf, 		sinf, 	-M_PI, 		M_PI, 	500000},
 81 | 	{"sinf_c     ", 	sinf_c, 	sinf, 	-M_PI, 		M_PI, 	500000},
 82 | 	{"sinf_neon  ", 	sinf_neon, 	sinf, 	-M_PI, 		M_PI, 	500000},
 83 | 	
 84 | 	{"cosf       ", 	cosf, 		cosf, 	-M_PI, 		M_PI, 	500000},
 85 | 	{"cosf_c     ", 	cosf_c, 	cosf, 	-M_PI, 		M_PI, 	500000},
 86 | 	{"cosf_neon  ", 	cosf_neon, 	cosf, 	-M_PI, 		M_PI, 	500000},
 87 | 
 88 | 	{"tanf       ", 	tanf, 		tanf, 	-M_PI_4, 	M_PI_4, 500000, 0, 0, 0},
 89 | 	{"tanf_c     ", 	tanf_c, 	tanf, 	-M_PI_4, 	M_PI_4, 500000, 0, 0, 0},
 90 | 	{"tanf_neon  ", 	tanf_neon, 	tanf, 	-M_PI_4, 	M_PI_4, 500000, 0, 0, 0},
 91 | 
 92 | 	{"asinf      ", 	asinf, 		asinf, 	-1, 		1, 		500000, 0, 0, 0},
 93 | 	{"asinf_c    ", 	asinf_c, 	asinf, 	-1, 		1,	 	500000, 0, 0, 0},
 94 | 	{"asinf_neon ",		asinf_neon,	asinf, 	-1, 		1, 		500000, 0, 0, 0},
 95 | 	
 96 | 	{"acosf      ", 	acosf, 		acosf, 	-1, 		1, 		500000, 0, 0, 0},
 97 | 	{"acosf_c    ", 	acosf_c, 	acosf, 	-1, 		1,	 	500000, 0, 0, 0},
 98 | 	{"acosf_neon ",		acosf_neon,	acosf, 	-1, 		1, 		500000, 0, 0, 0},
 99 | 	
100 | 	{"atanf      ", 	atanf, 		atanf, 	-1, 		1, 		500000, 0, 0, 0},
101 | 	{"atanf_c    ", 	atanf_c, 	atanf, 	-1, 		1,	 	500000, 0, 0, 0},
102 | 	{"atanf_neon ",		atanf_neon,	atanf, 	-1, 		1, 		500000, 0, 0, 0},
103 | 
104 | 	{"sinhf       ", 	sinhf, 		sinhf, 	-M_PI, 		M_PI, 	500000, 0, 0, 0},
105 | 	{"sinhf_c     ", 	sinhf_c, 	sinhf, 	-M_PI, 		M_PI, 	500000, 0, 0, 0},
106 | 	{"sinhf_neon  ", 	sinhf_neon, sinhf, 	-M_PI, 		M_PI, 	500000, 0, 0, 0},
107 | 	
108 | 	{"coshf       ", 	coshf, 		coshf, 	-M_PI, 		M_PI, 	500000, 0, 0, 0},
109 | 	{"coshf_c     ", 	coshf_c, 	coshf, 	-M_PI, 		M_PI, 	500000, 0, 0, 0},
110 | 	{"coshf_neon  ", 	coshf_neon, coshf, 	-M_PI, 		M_PI, 	500000, 0, 0, 0},
111 | 
112 | 	{"tanhf       ", 	tanhf, 		tanhf, 	-M_PI, 		M_PI, 	500000, 0, 0, 0},
113 | 	{"tanhf_c     ", 	tanhf_c, 	tanhf, 	-M_PI, 		M_PI, 	500000, 0, 0, 0},
114 | 	{"tanhf_neon  ", 	tanhf_neon, tanhf, 	-M_PI, 		M_PI, 	500000, 0, 0, 0},
115 | 
116 | 	{"expf       ", 	expf, 		expf, 	0, 			10, 	500000, 0, 0, 0},
117 | 	{"expf_c     ", 	expf_c, 	expf, 	0, 			10, 	500000, 0, 0, 0},
118 | 	{"expf_neon  ",		expf_neon, 	expf, 	0, 			10, 	500000, 0, 0, 0},
119 | 	
120 | 	{"logf       ", 	logf, 		logf, 	1, 			1000, 	500000, 0, 0, 0},
121 | 	{"logf_c     ", 	logf_c, 	logf, 	1, 			1000, 	500000, 0, 0, 0},
122 | 	{"logf_neon  ",		logf_neon, 	logf, 	1, 			1000, 	500000, 0, 0, 0},
123 | 
124 | 	{"log10f       ", 	log10f, 	log10f, 1, 			1000, 	500000, 0, 0, 0},
125 | 	{"log10f_c     ", 	log10f_c, 	log10f, 1, 			1000, 	500000, 0, 0, 0},
126 | 	{"log10f_neon  ",	log10f_neon,log10f, 1, 			1000, 	500000, 0, 0, 0},
127 | 
128 | 	{"floorf     ", 	floorf, 	floorf, 1, 			1000, 	5000000, 0, 0, 0},
129 | 	{"floorf_c   ", 	floorf_c, 	floorf, 1, 			1000, 	5000000, 0, 0, 0},
130 | 	{"floorf_neon",		floorf_neon,floorf, 1, 			1000, 	5000000, 0, 0, 0},
131 | 
132 | 	{"ceilf     ", 		ceilf, 		ceilf, 	1, 			1000, 	5000000, 0, 0, 0},
133 | 	{"ceilf_c   ", 		ceilf_c, 	ceilf, 	1, 			1000, 	5000000, 0, 0, 0},
134 | 	{"ceilf_neon",		ceilf_neon,	ceilf, 	1, 			1000, 	5000000, 0, 0, 0},
135 | 
136 | 	{"fabsf     ", 		fabsf, 		fabsf, 	1, 			1000, 	5000000, 0, 0, 0},
137 | 	{"fabsf_c   ", 		fabsf_c, 	fabsf, 	1, 			1000, 	5000000, 0, 0, 0},
138 | 	{"fabsf_neon",		fabsf_neon,	fabsf, 	1, 			1000, 	5000000, 0, 0, 0},
139 | 
140 | 	{"sqrtf      ", 	sqrtf, 		sqrtf, 	1, 			1000, 	500000, 0, 0, 0},
141 | 	{"sqrtf_c    ", 	sqrtf_c, 	sqrtf, 	1, 			1000, 	500000, 0, 0, 0},
142 | 	{"sqrtf_neon ",		sqrtf_neon,	sqrtf, 	1, 			1000, 	500000, 0, 0, 0},
143 | 
144 | 	{"invsqrtf      ", 	invsqrtf, 		invsqrtf, 	1, 	1000, 	500000, 0, 0, 0},
145 | 	{"invsqrtf_c    ", 	invsqrtf_c, 	invsqrtf, 	1, 	1000, 	500000, 0, 0, 0},
146 | 	{"invsqrtf_neon ",	invsqrtf_neon,	invsqrtf, 	1, 	1000, 	500000, 0, 0, 0},
147 | };
148 | 
149 | test2_t test2[9] = 
150 | {
151 | 	{"atan2f       ", 	atan2f, 	atan2f, 0.1, 		10, 	10000, 0, 0, 0},
152 | 	{"atan2f_c     ", 	atan2f_c, 	atan2f, 0.1, 		10, 	10000, 0, 0, 0},
153 | 	{"atan2f_neon  ", 	atan2f_neon,atan2f, 0.1, 		10, 	10000, 0, 0, 0},
154 | 	
155 | 	{"powf       ", 	powf, 		powf, 	1, 			10, 	10000, 0, 0, 0},
156 | 	{"powf_c     ", 	powf_c, 	powf, 	1, 			10, 	10000, 0, 0, 0},
157 | 	{"powf_neon  ", 	powf_neon, 	powf, 	1, 			10, 	10000, 0, 0, 0},
158 | 
159 | 	{"fmodf       ", 	fmodf, 		fmodf, 	1, 			10, 	10000, 0, 0, 0},
160 | 	{"fmodf_c     ", 	fmodf_c, 	fmodf, 	1, 			10, 	10000, 0, 0, 0},
161 | 	{"fmodf_neon  ", 	fmodf_neon, fmodf, 	1, 			10, 	10000, 0, 0, 0},
162 | 
163 | };
164 | 
165 | 
166 | void 
167 | test_mathfunc1(test1_t *tst)
168 | {
169 | 
170 | 	float x;
171 | 	float dx = (tst->rng1 - tst->rng0) / ((float)tst->num);
172 | #ifndef WIN32
173 | 	struct rusage ru;
174 | #endif
175 | 
176 | 	tst->emaxabs = tst->xmaxabs = 0;
177 | 	tst->emaxrel = tst->xmaxrel = 0;
178 | 	tst->erms = 0;
179 | 	for(x = tst->rng0; x < tst->rng1 ; x += dx){	
180 | 		float r = (tst->func)((float)x);
181 | 		float rr = (tst->bench)((float)x);
182 | 		float dr = fabs(r - rr);
183 | 		float drr = dr * (100.0f / rr);
184 | 		tst->erms += dr*dr;
185 | 		if (dr > tst->emaxabs){
186 | 			tst->emaxabs = dr;
187 | 			tst->xmaxabs = x;
188 | 		}
189 | 		if (drr > tst->emaxrel){
190 | 			tst->emaxrel = drr;
191 | 			tst->xmaxrel = x;
192 | 		}
193 | 	}
194 | 	tst->erms = sqrt(tst->erms / ((float) tst->num));
195 | 	
196 | #ifdef WIN32
197 | 	tst->time = (1000 * clock()) / (CLOCKS_PER_SEC / 1000);
198 | #else
199 | 	getrusage(RUSAGE_SELF, &ru);	
200 | 	tst->time = ru.ru_utime.tv_sec * 1000000 + ru.ru_utime.tv_usec;
201 | #endif
202 | 
203 | 	for(x = tst->rng0; x < tst->rng1 ; x += dx){	
204 | 		(tst->func)((float)x);
205 | 	}
206 | 
207 | #ifdef WIN32
208 | 	tst->time = (1000 * clock()) / (CLOCKS_PER_SEC / 1000) - tst->time;
209 | #else
210 | 	getrusage(RUSAGE_SELF, &ru);	
211 | 	tst->time = ru.ru_utime.tv_sec * 1000000 + ru.ru_utime.tv_usec - tst->time;
212 | #endif
213 | 
214 | }
215 | 
216 | void
217 | test_mathfunc2(test2_t *tst)
218 | {
219 | 	float x, y;
220 | 	float rng = tst->rng1 - tst->rng0;
221 | 	float d = (rng * rng) / ((float) tst->num);
222 | #ifndef WIN32
223 | 	struct rusage ru;
224 | #endif
225 | 
226 | 	tst->emaxabs = tst->xmaxabs = 0;
227 | 	tst->emaxrel = tst->xmaxrel = 0;
228 | 	for(y = (tst->rng0); y < (tst->rng1) ; y += d){	
229 | 		for(x = (tst->rng0); x < (tst->rng1); x += d){	
230 | 			float r = (tst->func)((float)x, y);
231 | 			float rr = (tst->bench)((float)x, y);
232 | 			float dr = fabs(r - rr);
233 | 			float drr = dr * (100.0f / rr);
234 | 			if (dr > tst->emaxabs){
235 | 				tst->emaxabs = dr;
236 | 				tst->xmaxabs = x;
237 | 			}
238 | 			if (drr > tst->emaxrel && fabsf(rr) > 0.0001){
239 | 				tst->emaxrel = drr;
240 | 				tst->xmaxrel = x;
241 | 			}
242 | 		}
243 | 	}
244 | 	
245 | #ifdef WIN32
246 | 	tst->time = (1000 * clock()) / (CLOCKS_PER_SEC / 1000) ;
247 | #else
248 | 	getrusage(RUSAGE_SELF, &ru);	
249 | 	tst->time = ru.ru_utime.tv_sec * 1000000 + ru.ru_utime.tv_usec;
250 | #endif
251 | 
252 | 	for(y = tst->rng0; y < tst->rng1 ; y += d){	
253 | 		for(x = tst->rng0; x < tst->rng1 ; x += d){	
254 | 			(tst->func)((float)x, (float)y);
255 | 		}
256 | 	}
257 | 
258 | #ifdef WIN32
259 | 	tst->time = (1000 * clock()) / (CLOCKS_PER_SEC / 1000) - tst->time;
260 | #else
261 | 	getrusage(RUSAGE_SELF, &ru);	
262 | 	tst->time = ru.ru_utime.tv_sec * 1000000 + ru.ru_utime.tv_usec - tst->time;
263 | #endif
264 | 
265 | }
266 | 
//Time stamp for the vector benchmarks: user CPU time from getrusage() in
//microseconds, or the clock()-based value used elsewhere in this file on
//WIN32 (where <sys/resource.h> is not included).
static int vecbench_cpu_time_us(void)
{
#ifdef WIN32
	return (1000 * clock()) / (CLOCKS_PER_SEC / 1000);
#else
	struct rusage ru;
	getrusage(RUSAGE_SELF, &ru);	
	return ru.ru_utime.tv_sec * 1000000 + ru.ru_utime.tv_usec;
#endif
}

//Prints the summary line for one benchmark. t[0] is the stamp before the C
//loop, t[1] the stamp between the loops, t[2] the stamp after the NEON loop.
static void vecbench_report(const char *label, const int t[3])
{
	printf("%s: c=%i \t neon=%i \t rate=%.2f \n", label, t[1] - t[0], t[2] - t[1], 
	(float)(t[1] - t[0]) / (float)(t[2] - t[1]));
}

/*
Benchmarks the 2-, 3- and 4-component vector routines.

For each routine the C version and the NEON version are each called testnum
times on the same random inputs, one result of each is printed so they can be
compared by eye, and a summary line with both timings and their ratio follows.
*/
void test_vectorfunc()
{
	float v0[4], v1[4], d[4];
	int t[3];
	int i;
	const int testnum = 5000000;
	float r;
	
	//random inputs in the range produced by 10*randf() - 5
	for(i = 0; i < 4; i++)
	{
		v0[i] = 10*randf() - 5;
		v1[i] = 10*randf() - 5;
		d[i] = 10*randf() - 5;		
	}
	
	printf("\n");
	
	//dot 2
	t[0] = vecbench_cpu_time_us();
	for(i = 0; i < testnum; i++)
	{
		r = dot2_c(v0, v1);
	}
	t[1] = vecbench_cpu_time_us();
	for(i = 0; i < testnum; i++)
	{
		r = dot2_neon(v0, v1);
	}
	t[2] = vecbench_cpu_time_us();

	r = dot2_c(v0, v1);
	printf("dot2_c = %f\n", r);
	r = dot2_neon(v0, v1);
	printf("dot2_neon = %f\n", r);
	vecbench_report("dot2", t);

	//normalize 2
	t[0] = vecbench_cpu_time_us();
	for(i = 0; i < testnum; i++)
	{
		normalize2_c(v0, d);
	}
	t[1] = vecbench_cpu_time_us();
	for(i = 0; i < testnum; i++)
	{
		normalize2_neon(v0, d);
	}
	t[2] = vecbench_cpu_time_us();

	normalize2_c(v0, d);
	printf("normalize2_c = [%.2f, %.2f]\n", d[0], d[1]);
	normalize2_neon(v0, d);
	printf("normalize2_neon = [%.2f, %.2f]\n", d[0], d[1]);
	vecbench_report("normalize2", t);
	printf("\n");

	//dot 3
	t[0] = vecbench_cpu_time_us();
	for(i = 0; i < testnum; i++)
	{
		r = dot3_c(v0, v1);
	}
	t[1] = vecbench_cpu_time_us();
	for(i = 0; i < testnum; i++)
	{
		r = dot3_neon(v0, v1);
	}
	t[2] = vecbench_cpu_time_us();

	r = dot3_c(v0, v1);
	printf("dot3_c = %f\n", r);
	r = dot3_neon(v0, v1);
	printf("dot3_neon = %f\n", r);
	vecbench_report("dot3", t);

	//normalize 3
	t[0] = vecbench_cpu_time_us();
	for(i = 0; i < testnum; i++)
	{
		normalize3_c(v0, d);
	}
	t[1] = vecbench_cpu_time_us();
	for(i = 0; i < testnum; i++)
	{
		normalize3_neon(v0, d);
	}
	t[2] = vecbench_cpu_time_us();

	normalize3_c(v0, d);
	printf("normalize3_c = [%.2f, %.2f, %.2f]\n", d[0], d[1], d[2]);
	normalize3_neon(v0, d);
	printf("normalize3_neon = [%.2f, %.2f, %.2f]\n", d[0], d[1], d[2]);
	vecbench_report("normalize3", t);

	//cross 3
	t[0] = vecbench_cpu_time_us();
	for(i = 0; i < testnum; i++)
	{
		cross3_c(v0, v1, d);
	}
	t[1] = vecbench_cpu_time_us();
	for(i = 0; i < testnum; i++)
	{
		cross3_neon(v0, v1, d);
	}
	t[2] = vecbench_cpu_time_us();

	cross3_c(v0, v1, d);
	printf("cross3_c = [%.2f, %.2f, %.2f]\n", d[0], d[1], d[2]);
	cross3_neon(v0, v1, d);
	printf("cross3_neon = [%.2f, %.2f, %.2f]\n", d[0], d[1], d[2]);
	vecbench_report("cross3", t);
	printf("\n");

	//dot 4
	t[0] = vecbench_cpu_time_us();
	for(i = 0; i < testnum; i++)
	{
		r = dot4_c(v0, v1);
	}
	t[1] = vecbench_cpu_time_us();
	for(i = 0; i < testnum; i++)
	{
		r = dot4_neon(v0, v1);
	}
	t[2] = vecbench_cpu_time_us();

	r = dot4_c(v0, v1);
	printf("dot4_c = %f\n", r);
	r = dot4_neon(v0, v1);
	printf("dot4_neon = %f\n", r);
	vecbench_report("dot4", t);
	
	//normalize 4
	t[0] = vecbench_cpu_time_us();
	for(i = 0; i < testnum; i++)
	{
		normalize4_c(v0, d);
	}
	t[1] = vecbench_cpu_time_us();
	for(i = 0; i < testnum; i++)
	{
		normalize4_neon(v0, d);
	}
	t[2] = vecbench_cpu_time_us();

	normalize4_c(v0, d);
	printf("normalize4_c = [%.2f, %.2f, %.2f, %.2f]\n", d[0], d[1], d[2], d[3]);
	normalize4_neon(v0, d);
	printf("normalize4_neon = [%.2f, %.2f, %.2f, %.2f]\n", d[0], d[1], d[2], d[3]);
	vecbench_report("normalize4", t);
	printf("\n");

}
464 | 
465 | 
466 | 
//Time stamp for the matrix benchmarks: user CPU time from getrusage() in
//microseconds, or the clock()-based value used elsewhere in this file on
//WIN32 (where <sys/resource.h> is not included).
static int matbench_cpu_time_us(void)
{
#ifdef WIN32
	return (1000 * clock()) / (CLOCKS_PER_SEC / 1000);
#else
	struct rusage ru;
	getrusage(RUSAGE_SELF, &ru);	
	return ru.ru_utime.tv_sec * 1000000 + ru.ru_utime.tv_usec;
#endif
}

//Prints the summary line for one benchmark. t[0] is the stamp before the C
//loop, t[1] the stamp between the loops, t[2] the stamp after the NEON loop.
static void matbench_report(const char *label, const int t[3])
{
	printf("%s: c=%i \t neon=%i \t rate=%.2f \n", label, t[1] - t[0], t[2] - t[1], 
		(float)(t[1] - t[0]) / (float)(t[2] - t[1]));
}

//Prints an n x n matrix one row per line; the element shown at (row, col) is
//read from m[col * n + row].
static void matbench_print_rows(const float *m, int n)
{
	int row, col;
	for(row = 0; row < n; row++){
		printf("\t\t\t|");
		for(col = 0; col < n; col++){
			if (col > 0){
				printf(", ");
			}
			printf("%.2f", m[col * n + row]);
		}
		printf("|\n");
	}
}

/*
Benchmarks the 2x2, 3x3 and 4x4 matrix-matrix and matrix-vector routines.

For each routine the C version and the NEON version are each called testnum
times on the same random inputs, one result of each is printed so they can be
compared by eye, and a summary line with both timings and their ratio follows.
Every summary line is computed from the time stamps taken for that routine.
*/
void test_matrixfunc()
{
	float m0[16], m1[16], m2[16];
	int t[3];
	
	int i;
	const int testnum = 1000000;
	
	//random inputs in the range produced by 10*randf() - 5
	for(i = 0; i < 16; i++)
	{
		m0[i] = 10.0f * randf() - 5.0f; 
		m1[i] = 10.0f * randf() - 5.0f; 
		m2[i] = 10.0f * randf() - 5.0f; 
	}

	//matmul2 
	t[0] = matbench_cpu_time_us();
	for(i = 0; i < testnum; i++){
		matmul2_c(m0, m1, m2);	
	}
	t[1] = matbench_cpu_time_us();
	for(i = 0; i < testnum; i++){
		matmul2_neon(m0, m1, m2);
	}
	t[2] = matbench_cpu_time_us();

	matmul2_c(m0, m1, m2);	
	printf("matmul2_c = \n");
	matbench_print_rows(m2, 2);

	matmul2_neon(m0, m1, m2);	
	printf("matmul2_neon = \n");
	matbench_print_rows(m2, 2);
	
	matbench_report("matmul2", t);

	//matvec2 
	t[0] = matbench_cpu_time_us();
	for(i = 0; i < testnum; i++){
		matvec2_c(m0, m1, m2);	
	}
	t[1] = matbench_cpu_time_us();
	for(i = 0; i < testnum; i++){
		matvec2_neon(m0, m1, m2);
	}
	t[2] = matbench_cpu_time_us();

	memset(m2, 0, 4*sizeof(float));
	matvec2_c(m0, m1, m2);	
	printf("matvec2_c = |%.2f, %.2f|\n", m2[0], m2[1]);
	
	memset(m2, 0, 4*sizeof(float));
	matvec2_neon(m0, m1, m2);	
	printf("matvec2_neon = |%.2f, %.2f|\n", m2[0], m2[1]);

	matbench_report("matvec2", t);

	//MAT3
	t[0] = matbench_cpu_time_us();
	for(i = 0; i < testnum; i++){
		matmul3_c(m0, m1, m2);	
	}
	t[1] = matbench_cpu_time_us();
	for(i = 0; i < testnum; i++){
		matmul3_neon(m0, m1, m2);
	}
	t[2] = matbench_cpu_time_us();

	memset(m2, 0, 9*sizeof(float));
	matmul3_c(m0, m1, m2);	
	printf("matmul3_c =\n");
	matbench_print_rows(m2, 3);
	
	memset(m2, 0, 9*sizeof(float));
	matmul3_neon(m0, m1, m2);	
	printf("matmul3_neon =\n");
	matbench_print_rows(m2, 3);
	
	matbench_report("matmul3", t);

	//matvec3
	t[0] = matbench_cpu_time_us();
	for(i = 0; i < testnum; i++){
		matvec3_c(m0, m1, m2);	
	}
	t[1] = matbench_cpu_time_us();
	for(i = 0; i < testnum; i++){
		matvec3_neon(m0, m1, m2);
	}
	t[2] = matbench_cpu_time_us();

	memset(m2, 0, 4*sizeof(float));
	matvec3_c(m0, m1, m2);	
	printf("matvec3_c = |%.2f, %.2f, %.2f|\n", m2[0], m2[1], m2[2]);

	memset(m2, 0, 4*sizeof(float));
	matvec3_neon(m0, m1, m2);	
	printf("matvec3_neon = |%.2f, %.2f, %.2f|\n", m2[0], m2[1], m2[2]);
	
	//reports the matvec3 stamps (the 2x2 timers were printed here before)
	matbench_report("matvec3", t);

	//MAT4
	t[0] = matbench_cpu_time_us();
	for(i = 0; i < testnum; i++){
		matmul4_c(m0, m1, m2);	
	}
	t[1] = matbench_cpu_time_us();
	for(i = 0; i < testnum; i++){
		matmul4_neon(m0, m1, m2);
	}
	t[2] = matbench_cpu_time_us();

	memset(m2, 0, 16*sizeof(float));
	matmul4_c(m0, m1, m2);	
	printf("matmul4_c =\n");
	matbench_print_rows(m2, 4);
	
	memset(m2, 0, 16*sizeof(float));
	matmul4_neon(m0, m1, m2);	
	printf("matmul4_neon =\n");
	matbench_print_rows(m2, 4);
	
	matbench_report("matmul4", t);

	//matvec4
	t[0] = matbench_cpu_time_us();
	for(i = 0; i < testnum; i++){
		matvec4_c(m0, m1, m2);	
	}
	t[1] = matbench_cpu_time_us();
	for(i = 0; i < testnum; i++){
		matvec4_neon(m0, m1, m2);
	}
	t[2] = matbench_cpu_time_us();

	memset(m2, 0, 4*sizeof(float));
	matvec4_c(m0, m1, m2);	
	printf("matvec4_c = |%.2f, %.2f, %.2f, %f|\n", m2[0], m2[1], m2[2], m2[3]);

	memset(m2, 0, 4*sizeof(float));
	matvec4_neon(m0, m1, m2);	
	printf("matvec4_neon = |%.2f, %.2f, %.2f, %f|\n", m2[0], m2[1], m2[2], m2[3]);
	
	//reports the matvec4 stamps (the 2x2 timers were printed here before)
	matbench_report("matvec4", t);

}
653 | 
654 | int main(int argc, char** argv)
655 | {	
656 | 
657 | 	int i, ii;
658 | 	if (argc > 1 && strcmp(argv[1], "-norunfast") == 0){
659 | 		printf("RUNFAST: Disabled \n");
660 | 	}else {
661 | 		printf("RUNFAST: Enabled \n");
662 | 		enable_runfast();
663 | 	}
664 | 
665 | 	srand(time(NULL));
666 | 
667 | #if 1
668 | 	//test single argument functions:
669 | 	printf("------------------------------------------------------------------------------------------------------\n");	
670 | 	printf("MATRIX FUNCTION TESTS \n");	
671 | 	printf("------------------------------------------------------------------------------------------------------\n");	
672 | 	
673 | 	test_matrixfunc();
674 | 	test_vectorfunc();
675 | 
676 | 	printf("------------------------------------------------------------------------------------------------------\n");	
677 | 	printf("CMATH FUNCTION TESTS \n");	
678 | 	printf("------------------------------------------------------------------------------------------------------\n");	
679 | 	printf("Function\tRange\t\tNumber\tABS Max Error\tREL Max Error\tRMS Error\tTime\tRate\n");	
680 | 	printf("------------------------------------------------------------------------------------------------------\n");	
681 | 	for(i = 0; i < 51; i++){
682 | 		test_mathfunc1(&test1[i]);	
683 | 		
684 | 		ii = i - (i % 3);
685 | 		printf("%s\t", test1[i].name);
686 | 		printf("[%.2f, %.2f]\t", test1[i].rng0, test1[i].rng1);
687 | 		printf("%i\t", test1[i].num);
688 | 		printf("%.2e\t", test1[i].emaxabs);
689 | 		printf("%.2e%%\t", test1[i].emaxrel);
690 | 		printf("%.2e\t", test1[i].erms);
691 | 		printf("%i\t", test1[i].time);
692 | 		printf("x%.2f\t", (float)test1[ii].time / test1[i].time);
693 | 		printf("\n");
694 | 	}
695 | 	for(i = 0; i < 9; i++){
696 | 		test_mathfunc2(&test2[i]);
697 | 	
698 | 		ii = i - (i % 3);
699 | 		
700 | 		printf("%s\t", test2[i].name);
701 | 		printf("[%.2f, %.2f]\t", test2[i].rng0, test2[i].rng1);
702 | 		printf("%i\t", test2[i].num);
703 | 		printf("%.2e\t", test2[i].emaxabs);
704 | 		printf("%.2e%%\t", test2[i].emaxrel);
705 | 		printf("%.2e\t", test2[i].erms);
706 | 		printf("%i\t", test2[i].time);
707 | 		printf("x%.2f\t", (float)test2[ii].time / test2[i].time);
708 | 		printf("\n");
709 | 	}
710 | 	
711 | #else
712 | 
713 | 
714 | 	float x = 0;
715 | 	for(x = -M_PI_2; x < M_PI_2; x+= 0.01)
716 | 	{
717 | 		printf("x=%.2f\t in=%.2f\t c=%.2f\t neon=%.2f \n", x, sinhf(x), sinhf_c(x), sinhf_neon(x));
718 | 	}
719 | 
720 | #endif
721 | 	
722 | 	return 0;
723 | } 
724 | 


--------------------------------------------------------------------------------
/math_expf.c:
--------------------------------------------------------------------------------
  1 | /*
  2 | The MIT License (MIT)
  3 | 
  4 | Copyright (c) 2015 Lachlan Tychsen-Smith (lachlan.ts@gmail.com)
  5 | 
  6 | Permission is hereby granted, free of charge, to any person obtaining a copy
  7 | of this software and associated documentation files (the "Software"), to deal
  8 | in the Software without restriction, including without limitation the rights
  9 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 10 | copies of the Software, and to permit persons to whom the Software is
 11 | furnished to do so, subject to the following conditions:
 12 | 
 13 | The above copyright notice and this permission notice shall be included in
 14 | all copies or substantial portions of the Software.
 15 | 
 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 19 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 20 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 21 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 22 | THE SOFTWARE.
 23 | */
 24 | 
 25 | /*
 26 | Based on: 
 27 | 
 28 | 		e ^ x = (1+m) * (2^n)
 29 | 		x = log(1+m) + n * log(2)
 30 | 		n = (int) (x * 1.0 / log(2))
 31 | 		(1+m) = e ^ (x - n * log(2))
 32 | 		(1+m) = Poly(x - n * log(2))
 33 | 		
 34 | 		where Poly(x) is the Minimax approximation of e ^ x over the 
 35 | 		range [-Log(2), Log(2)]
 36 | 
 37 | Test func : expf(x)
 38 | Test Range: 0 < x < 50
 39 | Peak Error:	~0.00024%
 40 | RMS  Error: ~0.00007%
 41 | */
 42 | 
 43 | #include "math.h"
 44 | #include "math_neon.h"
 45 | 
 46 | const float __expf_rng[2] = {
 47 | 	1.442695041f,
 48 | 	0.693147180f
 49 | };
 50 | 
 51 | const float __expf_lut[8] = {
 52 | 	0.9999999916728642,		//p0
 53 | 	0.04165989275009526, 	//p4
 54 | 	0.5000006143673624, 	//p2
 55 | 	0.0014122663401803872, 	//p6
 56 | 	1.000000059694879, 		//p1
 57 | 	0.008336936973260111, 	//p5
 58 | 	0.16666570253074878, 	//p3
 59 | 	0.00019578093328483123	//p7
 60 | };
 61 | 
 62 | float expf_c(float x)
 63 | {
 64 | 	float a, b, c, d, xx;
 65 | 	int m;
 66 | 	
 67 | 	union {
 68 | 		float   f;
 69 | 		int 	i;
 70 | 	} r;
 71 | 		
 72 | 	//Range Reduction:
 73 | 	m = (int) (x * __expf_rng[0]);
 74 | 	x = x - ((float) m) * __expf_rng[1];	
 75 | 	
 76 | 	//Taylor Polynomial (Estrins)
 77 | 	a = (__expf_lut[4] * x) + (__expf_lut[0]);
 78 | 	b = (__expf_lut[6] * x) + (__expf_lut[2]);
 79 | 	c = (__expf_lut[5] * x) + (__expf_lut[1]);
 80 | 	d = (__expf_lut[7] * x) + (__expf_lut[3]);
 81 | 	xx = x * x;
 82 | 	a = a + b * xx; 
 83 | 	c = c + d * xx;
 84 | 	xx = xx* xx;
 85 | 	r.f = a + c * xx; 
 86 | 	
 87 | 	//multiply by 2 ^ m 
 88 | 	m = m << 23;
 89 | 	r.i = r.i + m;
 90 | 
 91 | 	return r.f;
 92 | }
 93 | 
/*
NEON version of expf.

The asm reads its argument from lane 0 of d0, performs the same steps as
expf_c() (range reduction with __expf_rng, polynomial with __expf_lut, then
adding m << 23 to the bit pattern) and leaves the result in d0. There is no
C-level return statement.
NOTE(review): this presumably relies on the hard-float calling convention
(argument and result both in s0) - confirm against the build flags.
Without __MATH_NEON the body is empty and no value is returned.
*/
float expf_neon_hfp(float x)
{
#ifdef __MATH_NEON
	asm volatile (
	"vdup.f32 		d0, d0[0]				\n\t"	//d0 = {x, x}
	
	//Range Reduction:
	"vld1.32 		d2, [%0]				\n\t"	//d2 = {invrange, range}
	"vmul.f32 		d6, d0, d2[0]			\n\t"	//d6 = d0 * d2[0] 
	"vcvt.s32.f32 	d6, d6					\n\t"	//d6 = (int) d6
	"vcvt.f32.s32 	d1, d6					\n\t"	//d1 = (float) d6
	"vmls.f32 		d0, d1, d2[1]			\n\t"	//d0 = d0 - d1 * d2[1]
		
	//polynomial:
	//the four (p_even + p_odd * x) pairs are formed in one q-register
	//multiply-accumulate, then folded together with x^2 and x^4
	"vmul.f32 		d1, d0, d0				\n\t"	//d1 = d0*d0 = {x^2, x^2}	
	"vld1.32 		{d2, d3, d4, d5}, [%1]	\n\t"	//q1 = {p0, p4, p2, p6}, q2 = {p1, p5, p3, p7} ;
	"vmla.f32 		q1, q2, d0[0]			\n\t"	//q1 = q1 + q2 * d0[0]		
	"vmla.f32 		d2, d3, d1[0]			\n\t"	//d2 = d2 + d3 * d1[0]		
	"vmul.f32 		d1, d1, d1				\n\t"	//d1 = d1 * d1 = {x^4, x^4}	
	"vmla.f32 		d2, d1, d2[1]			\n\t"	//d2 = d2 + d1 * d2[1]		

	//multiply by 2 ^ m 	
	"vshl.i32 		d6, d6, #23				\n\t"	//d6 = d6 << 23		
	"vadd.i32 		d0, d2, d6				\n\t"	//d0 = d2 + d6		

	:: "r"(__expf_rng), "r"(__expf_lut) 
    : "d0", "d1", "q1", "q2", "d6"
	);
#endif
}
124 | 
/*
 * expf_neon_sfp - exp(x) entry point that shuttles the value between a core
 * register and a VFP register around expf_neon_hfp().
 *
 * NEON path: copies r0 into s0, calls expf_neon_hfp(), then copies s0 back
 * into r0. It assumes x is in r0 on entry and that the caller reads the
 * result from r0 (presumably the soft-float calling convention, "sfp" --
 * confirm). None of the asm statements declares operands or clobbers, so
 * this depends on the compiler not touching r0/s0 in between.
 * Non-NEON path: returns expf_c(x).
 */
float expf_neon_sfp(float x)
{
#ifdef __MATH_NEON
	asm volatile ("vmov.f32 s0, r0 		\n\t");	//s0 = r0 (argument)
	expf_neon_hfp(x);
	asm volatile ("vmov.f32 r0, s0 		\n\t");	//r0 = s0 (result); no return statement on this path
#else
	return expf_c(x);
#endif
};
135 | 
136 | 


--------------------------------------------------------------------------------
/math_fabsf.c:
--------------------------------------------------------------------------------
 1 | /*
 2 | The MIT License (MIT)
 3 | 
 4 | Copyright (c) 2015 Lachlan Tychsen-Smith (lachlan.ts@gmail.com)
 5 | 
 6 | Permission is hereby granted, free of charge, to any person obtaining a copy
 7 | of this software and associated documentation files (the "Software"), to deal
 8 | in the Software without restriction, including without limitation the rights
 9 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 | copies of the Software, and to permit persons to whom the Software is
11 | furnished to do so, subject to the following conditions:
12 | 
13 | The above copyright notice and this permission notice shall be included in
14 | all copies or substantial portions of the Software.
15 | 
16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22 | THE SOFTWARE.
23 | */
24 | 
25 | #include "math_neon.h"
26 | 
27 | 	
/*
 * fabsf_c - absolute value of x, computed by clearing bit 31 (the sign bit)
 * of its 32-bit representation.
 */
float fabsf_c(float x)
{
	union {
		float value;
		int   bits;
	} pun;

	pun.value = x;
	pun.bits &= 0x7FFFFFFF;	//keep exponent and mantissa, drop the sign

	return pun.value;
}
39 | 
/*
 * fabsf_neon_hfp - absolute value using the VFP "fabss" instruction.
 *
 * The assembly operates on s0 in place and has no C-level operands, so it
 * relies on x arriving in s0 and the result being returned from s0
 * (presumably the hard-float calling convention). The NEON path therefore
 * has no return statement.
 *
 * Without __MATH_NEON the function previously returned an indeterminate
 * value; it now falls back to fabsf_c().
 */
float fabsf_neon_hfp(float x)
{
#ifdef __MATH_NEON
	asm volatile (
	"fabss	 		s0, s0					\n\t"	//s0 = fabs(s0)
	);
#else
	return fabsf_c(x);
#endif
}
48 | 
/*
 * fabsf_neon_sfp - absolute value computed directly on a core register.
 *
 * NEON path: clears bit 31 of r0 in place. It assumes x is in r0 on entry
 * and that the caller reads the result from r0 (presumably the soft-float
 * calling convention -- confirm). The asm declares no operands or clobbers
 * and the path has no return statement.
 * Non-NEON path: returns fabsf_c(x).
 */
float fabsf_neon_sfp(float x)
{
#ifdef __MATH_NEON
	asm volatile (
	"bic	 		r0, r0, #0x80000000		\n\t"	//r0 = r0 & ~(1 << 31)
	);
#else
	return fabsf_c(x);
#endif
}
59 | 


--------------------------------------------------------------------------------
/math_floorf.c:
--------------------------------------------------------------------------------
 1 | /*
 2 | The MIT License (MIT)
 3 | 
 4 | Copyright (c) 2015 Lachlan Tychsen-Smith (lachlan.ts@gmail.com)
 5 | 
 6 | Permission is hereby granted, free of charge, to any person obtaining a copy
 7 | of this software and associated documentation files (the "Software"), to deal
 8 | in the Software without restriction, including without limitation the rights
 9 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 | copies of the Software, and to permit persons to whom the Software is
11 | furnished to do so, subject to the following conditions:
12 | 
13 | The above copyright notice and this permission notice shall be included in
14 | all copies or substantial portions of the Software.
15 | 
16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22 | THE SOFTWARE.
23 | */
24 | 
25 | /*
26 | Assumes the floating point value |x| < 2147483648
27 | */
28 | 
29 | #include "math.h"
30 | #include "math_neon.h"
31 | 
/*
 * floorf_c - largest integral value not greater than x.
 *
 * x is truncated toward zero through an int conversion; when that rounded
 * up (negative non-integral x) the result is lowered by one. Only valid
 * while x fits in an int (see the range note at the top of the file).
 */
float floorf_c(float x)
{
	const int   truncated = (int) x;
	const float whole = (float) truncated;

	if (whole > x)
		return whole - 1;	//truncation moved toward zero, step back down

	return whole;
}
41 | 
/*
 * floorf_neon_hfp - NEON version of floorf_c().
 *
 * The assembly has no C-level operands: it reads x from d0 and leaves the
 * result in d0, i.e. it relies on x being passed and the result being
 * returned in s0 (presumably the hard-float calling convention). The NEON
 * path therefore has no return statement.
 *
 * Without __MATH_NEON the function previously returned an indeterminate
 * value; it now falls back to floorf_c().
 */
float floorf_neon_hfp(float x)
{
#ifdef __MATH_NEON
	asm volatile (
	"vcvt.s32.f32 	d1, d0					\n\t"	//d1 = (int) d0;
	"vcvt.f32.s32 	d1, d1					\n\t"	//d1 = (float) d1;
	"vcgt.f32 		d0, d1, d0				\n\t"	//d0 = (d1 > d0) ? all ones : 0;
	"vshr.u32 		d0, #31					\n\t"	//d0 = d0 >> 31 (mask -> 1 or 0);
	"vcvt.f32.u32 	d0, d0					\n\t"	//d0 = (float) d0;
	"vsub.f32 		d0, d1, d0				\n\t"	//d0 = d1 - d0;
	::: "d0", "d1"
	);
#else
	return floorf_c(x);
#endif
}
56 | 
/*
 * floorf_neon_sfp - floor(x) entry point that shuttles the value between a
 * core register and a VFP register around floorf_neon_hfp().
 *
 * NEON path: copies r0 into s0, calls floorf_neon_hfp(), then copies s0
 * back into r0. It assumes x is in r0 on entry and that the caller reads
 * the result from r0 (presumably the soft-float calling convention --
 * confirm). The asm statements declare no operands or clobbers.
 * Non-NEON path: returns floorf_c(x).
 */
float floorf_neon_sfp(float x)
{
#ifdef __MATH_NEON
	asm volatile ("vmov.f32 s0, r0 		\n\t");	//s0 = r0 (argument)
	floorf_neon_hfp(x);
	asm volatile ("vmov.f32 r0, s0 		\n\t");	//r0 = s0 (result); no return statement on this path
#else
	return floorf_c(x);
#endif
};
67 | 


--------------------------------------------------------------------------------
/math_fmodf.c:
--------------------------------------------------------------------------------
  1 | /*
  2 | The MIT License (MIT)
  3 | 
  4 | Copyright (c) 2015 Lachlan Tychsen-Smith (lachlan.ts@gmail.com)
  5 | 
  6 | Permission is hereby granted, free of charge, to any person obtaining a copy
  7 | of this software and associated documentation files (the "Software"), to deal
  8 | in the Software without restriction, including without limitation the rights
  9 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 10 | copies of the Software, and to permit persons to whom the Software is
 11 | furnished to do so, subject to the following conditions:
 12 | 
 13 | The above copyright notice and this permission notice shall be included in
 14 | all copies or substantial portions of the Software.
 15 | 
 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 19 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 20 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 21 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 22 | THE SOFTWARE.
 23 | */
 24 | 
 25 | /*
 26 | Assumes the floating point value |x / y| < 2,147,483,648
 27 | */
 28 | 
 29 | #include "math_neon.h"
 30 | 
 31 | float fmodf_c(float x, float y)
 32 | {
 33 | 	int n;
 34 | 	union {
 35 | 		float f;
 36 | 		int   i;
 37 | 	} yinv;
 38 | 	float a;
 39 | 	
 40 | 	//fast reciporical approximation (4x Newton)
 41 | 	yinv.f = y;
 42 | 	n = 0x3F800000 - (yinv.i & 0x7F800000);
 43 | 	yinv.i = yinv.i + n;
 44 | 	yinv.f = 1.41176471f - 0.47058824f * yinv.f;
 45 | 	yinv.i = yinv.i + n;
 46 | 	a = 2.0 - yinv.f * y;
 47 | 	yinv.f = yinv.f * a;	
 48 | 	a = 2.0 - yinv.f * y;
 49 | 	yinv.f = yinv.f * a;
 50 | 	a = 2.0 - yinv.f * y;
 51 | 	yinv.f = yinv.f * a;
 52 | 	a = 2.0 - yinv.f * y;
 53 | 	yinv.f = yinv.f * a;
 54 | 	
 55 | 	n = (int)(x * yinv.f);
 56 | 	x = x - ((float)n) * y;
 57 | 	return x;
 58 | }
 59 | 
 60 | 
 61 | float fmodf_neon_hfp(float x, float y)
 62 | {
 63 | #ifdef __MATH_NEON
 64 | 	asm volatile (
 65 | 	"vdup.f32 		d1, d0[1]					\n\t"	//d1[0] = y
 66 | 	"vdup.f32 		d0, d0[0]					\n\t"	//d1[0] = y
 67 | 	
 68 | 	//fast reciporical approximation
 69 | 	"vrecpe.f32 	d2, d1					\n\t"	//d2 = ~1.0 / d1
 70 | 	"vrecps.f32		d3, d2, d1				\n\t"	//d3 = 2.0 - d2 * d1; 
 71 | 	"vmul.f32		d2, d2, d3				\n\t"	//d2 = d2 * d3; 
 72 | 	"vrecps.f32		d3, d2, d1				\n\t"	//d3 = 2.0 - d2 * d1; 
 73 | 	"vmul.f32		d2, d2, d3				\n\t"	//d2 = d2 * d3; 
 74 | 	"vrecps.f32		d3, d2, d1				\n\t"	//d3 = 2.0 - d2 * d1; 
 75 | 	"vmul.f32		d2, d2, d3				\n\t"	//d2 = d2 * d3; 
 76 | 	"vrecps.f32		d3, d2, d1				\n\t"	//d3 = 2.0 - d2 * d1; 
 77 | 	"vmul.f32		d2, d2, d3				\n\t"	//d2 = d2 * d3; 
 78 | 
 79 | 	"vmul.f32		d2, d2, d0				\n\t"	//d2 = d2 * d0; 
 80 | 	"vcvt.s32.f32	d2, d2					\n\t"	//d2 = (int) d2; 
 81 | 	"vcvt.f32.s32	d2, d2					\n\t"	//d2 = (float) d2; 
 82 | 	"vmls.f32		d0, d1, d2				\n\t"	//d0 = d0 - d1 * d2; 
 83 | 
 84 | 	::: "d0", "d1", "d2", "d3"
 85 | 	);
 86 | #endif
 87 | }
 88 | 
 89 | 
/*
 * fmodf_neon_sfp - fmod(x, y) entry point that shuttles the values between
 * core registers and VFP registers around fmodf_neon_hfp().
 *
 * NEON path: copies r0 into s0 and r1 into s1, calls fmodf_neon_hfp(), then
 * copies s0 back into r0. It assumes x and y are in r0 and r1 on entry and
 * that the caller reads the result from r0 (presumably the soft-float
 * calling convention -- confirm). The asm statements declare no operands
 * or clobbers.
 * Non-NEON path: returns fmodf_c(x, y).
 */
float fmodf_neon_sfp(float x, float y)
{
#ifdef __MATH_NEON
	asm volatile ("vmov.f32 s0, r0 		\n\t");	//s0 = r0 (x)
	asm volatile ("vmov.f32 s1, r1 		\n\t");	//s1 = r1 (y)
	fmodf_neon_hfp(x, y);
	asm volatile ("vmov.f32 r0, s0 		\n\t");	//r0 = s0 (result); no return statement on this path
#else
	return fmodf_c(x,y);
#endif
};
101 | 


--------------------------------------------------------------------------------
/math_invsqrtf.c:
--------------------------------------------------------------------------------
 1 | /*
 2 | The MIT License (MIT)
 3 | 
 4 | Copyright (c) 2015 Lachlan Tychsen-Smith (lachlan.ts@gmail.com)
 5 | 
 6 | Permission is hereby granted, free of charge, to any person obtaining a copy
 7 | of this software and associated documentation files (the "Software"), to deal
 8 | in the Software without restriction, including without limitation the rights
 9 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 | copies of the Software, and to permit persons to whom the Software is
11 | furnished to do so, subject to the following conditions:
12 | 
13 | The above copyright notice and this permission notice shall be included in
14 | all copies or substantial portions of the Software.
15 | 
16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22 | THE SOFTWARE.
23 | */
24 | 
25 | #include "math.h"
26 | #include "math_neon.h"
27 | 
/*
 * invsqrtf_c - approximation of 1 / sqrt(x).
 *
 * An initial estimate is built from the bit pattern of x (magic constant
 * minus half the bits) and refined with two Newton-Raphson steps of the
 * form y = y * (3 - x * y * y) / 2. No checks are made on x.
 */
float invsqrtf_c(float x)
{
	union {
		float f;
		int   i;
	} est;
	int pass;

	//fast invsqrt approx (the NEON version uses VRSQRTE here)
	est.f = x;
	est.i = 0x5F3759DF - (est.i >> 1);

	//two refinement steps (the NEON version uses VRSQRTS here)
	for (pass = 0; pass < 2; ++pass) {
		const float xy = x * est.f;
		const float scale = (3.0f - xy * est.f) * 0.5;
		est.f = est.f * scale;
	}

	return est.f;
}
49 | 
/*
 * invsqrtf_neon_hfp - NEON version of invsqrtf_c(): VRSQRTE estimate
 * followed by two VRSQRTS refinement steps.
 *
 * The assembly has no C-level operands: it reads x from d0 and leaves the
 * result in d0, i.e. it relies on x being passed and the result being
 * returned in s0 (presumably the hard-float calling convention). The NEON
 * path therefore has no return statement.
 *
 * Without __MATH_NEON the function previously returned an indeterminate
 * value; it now falls back to invsqrtf_c().
 */
float invsqrtf_neon_hfp(float x)
{
#ifdef __MATH_NEON
	asm volatile (

	"vmov.f32 		d1, d0					\n\t"	//d1 = d0 (keep x)
	"vrsqrte.f32 	d0, d0					\n\t"	//d0 = ~ 1.0 / sqrt(d0)
	"vmul.f32 		d2, d0, d1				\n\t"	//d2 = d0 * d1
	"vrsqrts.f32 	d3, d2, d0				\n\t"	//d3 = (3 - d2 * d0) / 2
	"vmul.f32 		d0, d0, d3				\n\t"	//d0 = d0 * d3
	"vmul.f32 		d2, d0, d1				\n\t"	//d2 = d0 * d1
	"vrsqrts.f32 	d3, d2, d0				\n\t"	//d3 = (3 - d2 * d0) / 2
	"vmul.f32 		d0, d0, d3				\n\t"	//d0 = d0 * d3

	::: "d0", "d1", "d2", "d3"
	);
#else
	return invsqrtf_c(x);
#endif
}
68 | 
/*
 * invsqrtf_neon_sfp - 1/sqrt(x) entry point that shuttles the value between
 * a core register and a VFP register around invsqrtf_neon_hfp().
 *
 * NEON path: copies r0 into s0, calls invsqrtf_neon_hfp(), then copies s0
 * back into r0. It assumes x is in r0 on entry and that the caller reads
 * the result from r0 (presumably the soft-float calling convention --
 * confirm). The asm statements declare no operands or clobbers.
 * Non-NEON path: returns invsqrtf_c(x).
 */
float invsqrtf_neon_sfp(float x)
{
#ifdef __MATH_NEON
	asm volatile ("vmov.f32 s0, r0 		\n\t");	//s0 = r0 (argument)
	invsqrtf_neon_hfp(x);
	asm volatile ("vmov.f32 r0, s0 		\n\t");	//r0 = s0 (result); no return statement on this path
#else
	return invsqrtf_c(x);
#endif
};
79 | 
80 | 


--------------------------------------------------------------------------------
/math_ldexpf.c:
--------------------------------------------------------------------------------
 1 | /*
 2 | The MIT License (MIT)
 3 | 
 4 | Copyright (c) 2015 Lachlan Tychsen-Smith (lachlan.ts@gmail.com)
 5 | 
 6 | Permission is hereby granted, free of charge, to any person obtaining a copy
 7 | of this software and associated documentation files (the "Software"), to deal
 8 | in the Software without restriction, including without limitation the rights
 9 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 | copies of the Software, and to permit persons to whom the Software is
11 | furnished to do so, subject to the following conditions:
12 | 
13 | The above copyright notice and this permission notice shall be included in
14 | all copies or substantial portions of the Software.
15 | 
16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22 | THE SOFTWARE.
23 | */
24 | 
25 | #include "math.h"
26 | #include "math_neon.h"
27 | 
/*
 * ldexpf_c - computes m * 2^e by adding e to the IEEE-754 exponent field
 * of m.
 *
 * There are no checks: zero, denormal, NaN/Inf inputs and exponent
 * overflow/underflow are not handled. The arithmetic is done on an
 * unsigned view of the bits because left-shifting a negative int (any
 * negative e) is undefined behaviour in C; the resulting bit pattern is the
 * same two's-complement sum.
 */
float ldexpf_c(float m, int e)
{
	union {
		float        f;
		unsigned int u;
	} r;

	r.f = m;
	r.u += (unsigned int) e << 23;	//bit 23 is the lowest exponent bit
	return r.f;
}
38 | 
/*
 * ldexpf_neon_hfp - NEON version of ldexpf_c().
 *
 * The assembly has no C-level operands: it shifts r0 left by 23 and adds it
 * to d0 as an integer, i.e. it relies on m arriving in s0, e arriving in r0
 * and the result being returned in s0 (presumably the hard-float calling
 * convention). The NEON path therefore has no return statement.
 *
 * r0 is overwritten by the asm and is now listed as clobbered. The unused
 * local was dropped. Without __MATH_NEON the function previously returned
 * an indeterminate value; it now falls back to ldexpf_c().
 */
float ldexpf_neon_hfp(float m, int e)
{
#ifdef __MATH_NEON
	asm volatile (
	"lsl 			r0, r0, #23				\n\t"	//r0 = r0 << 23
	"vdup.i32 		d1, r0					\n\t"	//d1 = {r0, r0}
	"vadd.i32 		d0, d0, d1				\n\t"	//d0 = d0 + d1
	::: "d0", "d1", "r0"
	);
#else
	return ldexpf_c(m, e);
#endif
}
51 | 
/*
 * ldexpf_neon_sfp - m * 2^e computed on values taken from core registers.
 *
 * NEON path: shifts r1 left by 23, adds it as an integer to the bits taken
 * from r0, and moves the result back to r0. It assumes m is in r0 and e is
 * in r1 on entry and that the caller reads the result from r0 (presumably
 * the soft-float calling convention -- confirm). The path has no return
 * statement.
 * NOTE(review): r0 and r1 are modified but not listed as clobbers.
 * Non-NEON path: returns ldexpf_c(m, e).
 */
float ldexpf_neon_sfp(float m, int e)
{
#ifdef __MATH_NEON
	float r;	//NOTE(review): never used
	asm volatile (
	"lsl 			r1, r1, #23				\n\t"	//r1 = r1 << 23	
	"vdup.f32 		d0, r0					\n\t"	//d0 = {r0, r0}	
	"vdup.i32 		d1, r1					\n\t"	//d1 = {r1, r1}
	"vadd.i32 		d0, d0, d1				\n\t"	//d0 = d0 + d1
	"vmov.f32 		r0, s0					\n\t"	//r0 = s0
	::: "d0", "d1"
	);
#else
	return ldexpf_c(m,e);
#endif
}
68 | 


--------------------------------------------------------------------------------
/math_log10f.c:
--------------------------------------------------------------------------------
  1 | /*
  2 | The MIT License (MIT)
  3 | 
  4 | Copyright (c) 2015 Lachlan Tychsen-Smith (lachlan.ts@gmail.com)
  5 | 
  6 | Permission is hereby granted, free of charge, to any person obtaining a copy
  7 | of this software and associated documentation files (the "Software"), to deal
  8 | in the Software without restriction, including without limitation the rights
  9 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 10 | copies of the Software, and to permit persons to whom the Software is
 11 | furnished to do so, subject to the following conditions:
 12 | 
 13 | The above copyright notice and this permission notice shall be included in
 14 | all copies or substantial portions of the Software.
 15 | 
 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 19 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 20 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 21 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 22 | THE SOFTWARE.
 23 | */
 24 | 
 25 | /*
 26 | Based on: 
 27 | 
 28 | 		log10(x) = log10((1+m) * (2^n))
		log10(x) = n * log10(2) + log10(1 + m)
		log10(1+m) = Poly(1+m)
 31 | 		
 32 | 		where Poly(x) is the Minimax approximation of log10(x) over the 
 33 | 		range [1, 2]
 34 | 
 35 | Test func : log10f(x)
 36 | Test Range: 1 < x < 10000
 37 | Peak Error:	~0.000040%
 38 | RMS  Error: ~0.000008%
 39 | */
 40 | 
 41 | #include "math.h"
 42 | #include "math_neon.h"
 43 | 
 44 | const float __log10f_rng =  0.3010299957f;
 45 | 
 46 | const float __log10f_lut[8] = {
 47 | 	-0.99697286229624, 		//p0
 48 | 	-1.07301643912502, 		//p4
 49 | 	-2.46980061535534, 		//p2
 50 | 	-0.07176870463131, 		//p6
 51 | 	2.247870219989470, 		//p1
 52 | 	0.366547581117400, 		//p5
 53 | 	1.991005185100089, 		//p3
 54 | 	0.006135635201050,		//p7
 55 | };
 56 | 
 57 | float log10f_c(float x)
 58 | {
 59 | 	float a, b, c, d, xx;
 60 | 	int m;
 61 | 	
 62 | 	union {
 63 | 		float   f;
 64 | 		int 	i;
 65 | 	} r;
 66 | 	
 67 | 	//extract exponent
 68 | 	r.f = x;
 69 | 	m = (r.i >> 23);
 70 | 	m = m - 127;
 71 | 	r.i = r.i - (m << 23);
 72 | 		
 73 | 	//Taylor Polynomial (Estrins)
 74 | 	xx = r.f * r.f;
 75 | 	a = (__log10f_lut[4] * r.f) + (__log10f_lut[0]);
 76 | 	b = (__log10f_lut[6] * r.f) + (__log10f_lut[2]);
 77 | 	c = (__log10f_lut[5] * r.f) + (__log10f_lut[1]);
 78 | 	d = (__log10f_lut[7] * r.f) + (__log10f_lut[3]);
 79 | 	a = a + b * xx;
 80 | 	c = c + d * xx;
 81 | 	xx = xx * xx;
 82 | 	r.f = a + c * xx;
 83 | 
 84 | 	//add exponent
 85 | 	r.f = r.f + ((float) m) * __log10f_rng;
 86 | 
 87 | 	return r.f;
 88 | }
 89 | 
 90 | float log10f_neon_hfp(float x)
 91 | {
 92 | #ifdef __MATH_NEON
 93 | 	asm volatile (
 94 | 	
 95 | 	"vdup.f32		d0, d0[0]				\n\t"	//d0 = {x,x};
 96 | 	
 97 | 	//extract exponent
 98 | 	"vmov.i32		d2, #127				\n\t"	//d2 = 127;
 99 | 	"vshr.u32		d6, d0, #23				\n\t"	//d6 = d0 >> 23;
100 | 	"vsub.i32		d6, d6, d2				\n\t"	//d6 = d6 - d2;
101 | 	"vshl.u32		d1, d6, #23				\n\t"	//d1 = d6 << 23;
102 | 	"vsub.i32		d0, d0, d1				\n\t"	//d0 = d0 + d1;
103 | 
104 | 	//polynomial:
105 | 	"vmul.f32 		d1, d0, d0				\n\t"	//d1 = d0*d0 = {x^2, x^2}	
106 | 	"vld1.32 		{d2, d3, d4, d5}, [%1]	\n\t"	//q1 = {p0, p4, p2, p6}, q2 = {p1, p5, p3, p7} ;
107 | 	"vmla.f32 		q1, q2, d0[0]			\n\t"	//q1 = q1 + q2 * d0[0]		
108 | 	"vmla.f32 		d2, d3, d1[0]			\n\t"	//d2 = d2 + d3 * d1[0]		
109 | 	"vmul.f32 		d1, d1, d1				\n\t"	//d1 = d1 * d1 = {x^4, x^4}	
110 | 	"vmla.f32 		d2, d1, d2[1]			\n\t"	//d2 = d2 + d1 * d2[1]		
111 | 
112 | 	//add exponent 	
113 | 	"vdup.32 		d7, %0					\n\t"	//d7 = {rng, rng}
114 | 	"vcvt.f32.s32 	d6, d6					\n\t"	//d6 = (float) d6
115 | 	"vmla.f32 		d2, d6, d7				\n\t"	//d2 = d2 + d6 * d7		
116 | 
117 | 	"vmov.f32 		s0, s4					\n\t"	//s0 = s4
118 | 
119 | 	:: "r"(__log10f_rng), "r"(__log10f_lut) 
120 |     : "d0", "d1", "q1", "q2", "d6", "d7"
121 | 	);
122 | #endif
123 | }
124 | 
125 | 
/*
 * log10f_neon_sfp - log10(x) entry point that shuttles the value between a
 * core register and a VFP register around log10f_neon_hfp().
 *
 * NEON path: copies r0 into s0, calls log10f_neon_hfp(), then copies s0
 * back into r0. It assumes x is in r0 on entry and that the caller reads
 * the result from r0 (presumably the soft-float calling convention --
 * confirm). The asm statements declare no operands or clobbers.
 * Non-NEON path: returns log10f_c(x).
 */
float log10f_neon_sfp(float x)
{
#ifdef __MATH_NEON
	asm volatile ("vmov.f32 s0, r0 		\n\t");	//s0 = r0 (argument)
	log10f_neon_hfp(x);
	asm volatile ("vmov.f32 r0, s0 		\n\t");	//r0 = s0 (result); no return statement on this path
#else
	return log10f_c(x);
#endif
};
136 | 


--------------------------------------------------------------------------------
/math_logf.c:
--------------------------------------------------------------------------------
  1 | /*
  2 | The MIT License (MIT)
  3 | 
  4 | Copyright (c) 2015 Lachlan Tychsen-Smith (lachlan.ts@gmail.com)
  5 | 
  6 | Permission is hereby granted, free of charge, to any person obtaining a copy
  7 | of this software and associated documentation files (the "Software"), to deal
  8 | in the Software without restriction, including without limitation the rights
  9 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 10 | copies of the Software, and to permit persons to whom the Software is
 11 | furnished to do so, subject to the following conditions:
 12 | 
 13 | The above copyright notice and this permission notice shall be included in
 14 | all copies or substantial portions of the Software.
 15 | 
 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 19 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 20 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 21 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 22 | THE SOFTWARE.
 23 | */
 24 | 
 25 | /*
 26 | Based on: 
 27 | 
 28 | 		log(x) = log((1+m) * (2^n))
 29 | 		log(x) = n * log(2) + log(1 + m)
 30 | 		log(1+m) = Poly(1+m)
 31 | 		
 32 | 		where Poly(x) is the Minimax approximation of log(x) over the 
 33 | 		range [1, 2]
 34 | 
 35 | Test func : logf(x)
 36 | Test Range: 1 < x < 10000
 37 | Peak Error:	~0.000601%
 38 | RMS  Error: ~0.000005%
 39 | */
 40 | 
 41 | #include "math.h"
 42 | #include "math_neon.h"
 43 | 
 44 | const float __logf_rng =  0.693147180f;
 45 | 
 46 | const float __logf_lut[8] = {
 47 | 	-2.295614848256274, 	//p0
 48 | 	-2.470711633419806, 	//p4
 49 | 	-5.686926051100417, 	//p2
 50 | 	-0.165253547131978, 	//p6
 51 | 	+5.175912446351073, 	//p1
 52 | 	+0.844006986174912, 	//p5
 53 | 	+4.584458825456749, 	//p3
 54 | 	+0.014127821926000		//p7
 55 | };
 56 | 
 57 | float logf_c(float x)
 58 | {
 59 | 	float a, b, c, d, xx;
 60 | 	int m;
 61 | 	
 62 | 	union {
 63 | 		float   f;
 64 | 		int 	i;
 65 | 	} r;
 66 | 	
 67 | 	//extract exponent
 68 | 	r.f = x;
 69 | 	m = (r.i >> 23);
 70 | 	m = m - 127;
 71 | 	r.i = r.i - (m << 23);
 72 | 		
 73 | 	//Taylor Polynomial (Estrins)
 74 | 	xx = r.f * r.f;
 75 | 	a = (__logf_lut[4] * r.f) + (__logf_lut[0]);
 76 | 	b = (__logf_lut[6] * r.f) + (__logf_lut[2]);
 77 | 	c = (__logf_lut[5] * r.f) + (__logf_lut[1]);
 78 | 	d = (__logf_lut[7] * r.f) + (__logf_lut[3]);
 79 | 	a = a + b * xx;
 80 | 	c = c + d * xx;
 81 | 	xx = xx * xx;
 82 | 	r.f = a + c * xx;
 83 | 
 84 | 	//add exponent
 85 | 	r.f = r.f + ((float) m) * __logf_rng;
 86 | 
 87 | 	return r.f;
 88 | }
 89 | 
 90 | float logf_neon_hfp(float x)
 91 | {
 92 | #ifdef __MATH_NEON
 93 | 	asm volatile (
 94 | 	
 95 | 	"vdup.f32		d0, d0[0]				\n\t"	//d0 = {x,x};
 96 | 	
 97 | 	//extract exponent
 98 | 	"vmov.i32		d2, #127				\n\t"	//d2 = 127;
 99 | 	"vshr.u32		d6, d0, #23				\n\t"	//d6 = d0 >> 23;
100 | 	"vsub.i32		d6, d6, d2				\n\t"	//d6 = d6 - d2;
101 | 	"vshl.u32		d1, d6, #23				\n\t"	//d1 = d6 << 23;
102 | 	"vsub.i32		d0, d0, d1				\n\t"	//d0 = d0 + d1;
103 | 
104 | 	//polynomial:
105 | 	"vmul.f32 		d1, d0, d0				\n\t"	//d1 = d0*d0 = {x^2, x^2}	
106 | 	"vld1.32 		{d2, d3, d4, d5}, [%1]	\n\t"	//q1 = {p0, p4, p2, p6}, q2 = {p1, p5, p3, p7} ;
107 | 	"vmla.f32 		q1, q2, d0[0]			\n\t"	//q1 = q1 + q2 * d0[0]		
108 | 	"vmla.f32 		d2, d3, d1[0]			\n\t"	//d2 = d2 + d3 * d1[0]		
109 | 	"vmul.f32 		d1, d1, d1				\n\t"	//d1 = d1 * d1 = {x^4, x^4}	
110 | 	"vmla.f32 		d2, d1, d2[1]			\n\t"	//d2 = d2 + d1 * d2[1]		
111 | 
112 | 	//add exponent 	
113 | 	"vdup.32 		d7, %0					\n\t"	//d7 = {rng, rng}
114 | 	"vcvt.f32.s32 	d6, d6					\n\t"	//d6 = (float) d6
115 | 	"vmla.f32 		d2, d6, d7				\n\t"	//d2 = d2 + d6 * d7		
116 | 
117 | 	"vmov.f32 		s0, s4					\n\t"	//s0 = s4
118 | 
119 | 	:: "r"(__logf_rng), "r"(__logf_lut) 
120 |     : "d0", "d1", "q1", "q2", "d6", "d7"
121 | 	);
122 | #endif
123 | }
124 | 
/*
 * logf_neon_sfp - log(x) entry point that shuttles the value between a core
 * register and a VFP register around logf_neon_hfp().
 *
 * NEON path: copies r0 into s0, calls logf_neon_hfp(), then copies s0 back
 * into r0. It assumes x is in r0 on entry and that the caller reads the
 * result from r0 (presumably the soft-float calling convention -- confirm).
 * The asm statements declare no operands or clobbers.
 * Non-NEON path: returns logf_c(x).
 */
float logf_neon_sfp(float x)
{
#ifdef __MATH_NEON
	asm volatile ("vmov.f32 s0, r0 		\n\t");	//s0 = r0 (argument)
	logf_neon_hfp(x);
	asm volatile ("vmov.f32 r0, s0 		\n\t");	//r0 = s0 (result); no return statement on this path
#else
	return logf_c(x);
#endif
};
135 | 
136 | 


--------------------------------------------------------------------------------
/math_mat2.c:
--------------------------------------------------------------------------------
 1 | /*
 2 | The MIT License (MIT)
 3 | 
 4 | Copyright (c) 2015 Lachlan Tychsen-Smith (lachlan.ts@gmail.com)
 5 | 
 6 | Permission is hereby granted, free of charge, to any person obtaining a copy
 7 | of this software and associated documentation files (the "Software"), to deal
 8 | in the Software without restriction, including without limitation the rights
 9 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 | copies of the Software, and to permit persons to whom the Software is
11 | furnished to do so, subject to the following conditions:
12 | 
13 | The above copyright notice and this permission notice shall be included in
14 | all copies or substantial portions of the Software.
15 | 
16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22 | THE SOFTWARE.
23 | */
24 | 
25 | /*
26 | Matrices are specified in column major format:
27 | 
28 | | a c |
29 | | b d |
30 | 
31 | therefore m[2] = c
32 | */
33 | 
34 | #include "math_neon.h"
35 | 
//matrix matrix multiplication. d = m0 * m1;
//All matrices are 2x2, column major. The products are computed into locals
//before anything is stored, so d may be the same array as m0 or m1 (the
//NEON version also loads both inputs before storing).
void
matmul2_c(float m0[4], float m1[4], float d[4])
{
	const float r0 = m0[0]*m1[0] + m0[2]*m1[1];
	const float r1 = m0[1]*m1[0] + m0[3]*m1[1];
	const float r2 = m0[0]*m1[2] + m0[2]*m1[3];
	const float r3 = m0[1]*m1[2] + m0[3]*m1[3];

	d[0] = r0;
	d[1] = r1;
	d[2] = r2;
	d[3] = r3;
}
45 | 
/*
 * matmul2_neon - 2x2 column-major matrix product, d = m0 * m1.
 *
 * NEON path: both inputs are loaded in full, each result column is formed
 * as column0(m0) * m1[k] + column1(m0) * m1[k+1], and the four results are
 * written to d with a single store.
 * Non-NEON path: forwards to matmul2_c().
 */
void
matmul2_neon(float m0[4], float m1[4], float d[4])
{	
#ifdef __MATH_NEON
	asm volatile (
	"vld1.32 		{d0, d1}, [%0]			\n\t"	//q0 = m0: d0 = column 0, d1 = column 1
	"vld1.32 		{d2, d3}, [%1]			\n\t"	//q1 = m1: d2 = column 0, d3 = column 1
	
	"vmul.f32 		d4, d0, d2[0]			\n\t"	//d4 = d0*d2[0]
	"vmul.f32 		d5, d0, d3[0]			\n\t"	//d5 = d0*d3[0]
	"vmla.f32 		d4, d1, d2[1]			\n\t"	//d4 += d1*d2[1]
	"vmla.f32 		d5, d1, d3[1]			\n\t"	//d5 += d1*d3[1]
	
	"vst1.32 		{d4, d5}, [%2] 			\n\t"	//d[0..3] = q2
	:: "r"(m0), "r"(m1), "r"(d) 
    : "q0", "q1", "q2", "memory"
	);	
#else
	matmul2_c(m0, m1, d);
#endif
}
67 | 
68 | 
//matrix vector multiplication. d = m * v
//m is 2x2, column major. Both components are computed before anything is
//stored, so d may be the same array as v (the NEON version also loads its
//inputs before storing).
void
matvec2_c(float m[4], float v[2], float d[2])
{
	const float r0 = m[0]*v[0] + m[2]*v[1];
	const float r1 = m[1]*v[0] + m[3]*v[1];

	d[0] = r0;
	d[1] = r1;
}
76 | 
/*
 * matvec2_neon - 2x2 column-major matrix times 2-vector, d = m * v.
 *
 * NEON path: v and m are loaded in full, the result is formed as
 * column0(m) * v[0] + column1(m) * v[1], and both components are written
 * to d with a single store.
 * Non-NEON path: forwards to matvec2_c().
 */
void
matvec2_neon(float m[4], float v[2], float d[2])
{
#ifdef __MATH_NEON
	asm volatile (
	"vld1.32        d0, [%1]				\n\t"	//d0 = v
	"vld1.32 		{d1, d2}, [%0]			\n\t"	//d1 = column 0 of m, d2 = column 1 of m
	
	"vmul.f32 		d3, d1, d0[0]			\n\t"	//d3 = d1*d0[0]
	"vmla.f32 		d3, d2, d0[1]			\n\t"	//d3 += d2*d0[1]
	
	"vst1.32 		d3, [%2] 				\n\t"	//d[0..1] = d3
	:: "r"(m), "r"(v), "r"(d) 
    : "d0", "d1", "d2","d3", "memory"
	);	
#else
	matvec2_c(m, v, d);
#endif
}
96 | 


--------------------------------------------------------------------------------
/math_mat3.c:
--------------------------------------------------------------------------------
  1 | /*
  2 | The MIT License (MIT)
  3 | 
  4 | Copyright (c) 2015 Lachlan Tychsen-Smith (lachlan.ts@gmail.com)
  5 | 
  6 | Permission is hereby granted, free of charge, to any person obtaining a copy
  7 | of this software and associated documentation files (the "Software"), to deal
  8 | in the Software without restriction, including without limitation the rights
  9 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 10 | copies of the Software, and to permit persons to whom the Software is
 11 | furnished to do so, subject to the following conditions:
 12 | 
 13 | The above copyright notice and this permission notice shall be included in
 14 | all copies or substantial portions of the Software.
 15 | 
 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 19 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 20 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 21 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 22 | THE SOFTWARE.
 23 | */
 24 | 
 25 | /*
Matrices are specified in column major format:

| x0 x3 x6 |
| x1 x4 x7 |
| x2 x5 x8 |

therefore m[3] = x3
 32 | 
 33 | */
 34 | 
 35 | #include "math_neon.h"
 36 | 
 37 | //matrix matrix multipication. d = m0 * m1;
 38 | void
 39 | matmul3_c(float m0[9], float m1[9], float d[9])
 40 | {
 41 | 	d[0] = m0[0]*m1[0] + m0[3]*m1[1] + m0[6]*m1[2];
 42 | 	d[1] = m0[1]*m1[0] + m0[4]*m1[1] + m0[7]*m1[2];
 43 | 	d[2] = m0[2]*m1[0] + m0[5]*m1[1] + m0[8]*m1[2];
 44 | 	d[3] = m0[0]*m1[3] + m0[3]*m1[4] + m0[6]*m1[5];
 45 | 	d[4] = m0[1]*m1[3] + m0[4]*m1[4] + m0[7]*m1[5];
 46 | 	d[5] = m0[2]*m1[3] + m0[5]*m1[4] + m0[8]*m1[5];
 47 | 	d[6] = m0[0]*m1[6] + m0[3]*m1[7] + m0[6]*m1[8];
 48 | 	d[7] = m0[1]*m1[6] + m0[4]*m1[7] + m0[7]*m1[8];
 49 | 	d[8] = m0[2]*m1[6] + m0[5]*m1[7] + m0[8]*m1[8];
 50 | }
 51 | 
 52 | void 
 53 | matmul3_neon(float m0[9], float m1[9], float d[9])
 54 | {
 55 | #ifdef __MATH_NEON
 56 | 	asm volatile (
 57 | 	"vld1.32 		{d0, d1}, [%1]!			\n\t"	//q0 = m1
 58 | 	"vld1.32 		{d2, d3}, [%1]!			\n\t"	//q1 = m1+4
 59 | 	"flds 			s8, [%1]				\n\t"	//q2 = m1+8
 60 | 	
 61 | 	"vld1.32 		{d6, d7}, [%0]			\n\t"	//q3[0] = m0
 62 | 	"add 			%0, %0, #12				\n\t"	//q3[0] = m0
 63 | 	"vld1.32 		{d8, d9}, [%0]			\n\t"	//q4[0] = m0+12
 64 | 	"add 			%0, %0, #12				\n\t"	//q3[0] = m0
 65 | 	"vld1.32 		{d10}, [%0]				\n\t"	//q5[0] = m0+24
 66 | 	"add 			%0, %0, #8				\n\t"	//q3[0] = m0
 67 | 	"flds 			s22, [%0]				\n\t"	//q2 = m1+8
 68 | 	
 69 | 	"vmul.f32 		q6, q3, d0[0] 			\n\t"	//q12 = q3 * d0[0]
 70 | 	"vmul.f32 		q7, q3, d1[1] 			\n\t"	//q13 = q3 * d2[0]
 71 | 	"vmul.f32 		q8, q3, d3[0] 			\n\t"	//q14 = q3 * d4[0]
 72 | 	"vmla.f32 		q6, q4, d0[1] 			\n\t"	//q12 = q9 * d0[1]
 73 | 	"vmla.f32 		q7, q4, d2[0] 			\n\t"	//q13 = q9 * d2[1]
 74 | 	"vmla.f32 		q8, q4, d3[1] 			\n\t"	//q14 = q9 * d4[1]
 75 | 	"vmla.f32 		q6, q5, d1[0] 			\n\t"	//q12 = q10 * d0[0]
 76 | 	"vmla.f32 		q7, q5, d2[1] 			\n\t"	//q13 = q10 * d2[0]
 77 | 	"vmla.f32 		q8, q5, d4[0] 			\n\t"	//q14 = q10 * d4[0]
 78 | 
 79 | 	"vmov.f32 		q0, q8 					\n\t"	//q14 = q10 * d4[0]
 80 | 	"vst1.32 		{d12, d13}, [%2] 		\n\t"	//d = q12
 81 | 	"add 			%2, %2, #12				\n\t"	//q3[0] = m0
 82 | 	"vst1.32 		{d14, d15}, [%2] 		\n\t"	//d+4 = q13	
 83 | 	"add 			%2, %2, #12				\n\t"	//q3[0] = m0
 84 | 	"vst1.32 		{d0}, [%2] 				\n\t"	//d+8 = q14	
 85 | 	"add 			%2, %2, #8				\n\t"	//q3[0] = m0
 86 | 	"fsts 			s2, [%2] 				\n\t"	//d = q12	
 87 | 	
 88 | 	: "+r"(m0), "+r"(m1), "+r"(d): 
 89 |     : "d8", "d9", "d10", "d11", "d12", "d13", "d14", "d15", "memory"
 90 | 	);	
 91 | #else
 92 | 	matmul3_c(m0, m1, d);
 93 | #endif
 94 | };
 95 | 
 96 | //matrix vector multiplication. d = m * v
 97 | void
 98 | matvec3_c(float m[9], float v[3], float d[3])
 99 | {
100 | 	d[0] = m[0]*v[0] + m[3]*v[1] + m[6]*v[2];
101 | 	d[1] = m[1]*v[0] + m[4]*v[1] + m[7]*v[2];
102 | 	d[2] = m[2]*v[0] + m[5]*v[1] + m[8]*v[2];
103 | }
104 | 
/*
 * matvec3_neon - 3x3 column-major matrix times 3-vector, d = m * v.
 *
 * NEON path: the result is column0 * v[0] + column1 * v[1] + column2 * v[2],
 * computed on 4-lane registers of which only lanes 0-2 are stored.
 *
 * Changes from the previous version:
 *  - v and the last column of m were loaded with 4-float loads, reading
 *    v[3] and m[9] past the end of the arrays. They are now loaded as a
 *    2-float plus a 1-float access, the same pattern matmul3_neon() uses.
 *    The unused lane 3 of those registers is left unspecified.
 *  - tmp is only written by the asm, so it is an early-clobber output
 *    instead of a read-write operand fed from an uninitialised variable.
 *  - the clobber list names the registers actually written (q0-q3, q9).
 * NOTE(review): the asm changes have not been run on hardware -- verify on
 * a NEON target.
 * Non-NEON path: forwards to matvec3_c().
 */
void
matvec3_neon(float m[9], float v[3], float d[3])
{
#ifdef __MATH_NEON
	int tmp;
	asm volatile (
	"mov 			%3, #12					\n\t"	//tmp = 12 (bytes per column)
	"vld1.32 		{d0}, [%1]!				\n\t"	//d0 = {v[0], v[1]}
	"flds 			s2, [%1]				\n\t"	//d1[0] = v[2]
	"vld1.32 		{d2, d3}, [%0], %3		\n\t"	//q1 = m[0..3] (column 0)
	"vld1.32 		{d4, d5}, [%0], %3		\n\t"	//q2 = m[3..6] (column 1)
	"vld1.32 		{d6}, [%0]!				\n\t"	//d6 = {m[6], m[7]}
	"flds 			s14, [%0]				\n\t"	//d7[0] = m[8] (column 2 complete in q3)

	"vmul.f32 		q9, q1, d0[0]			\n\t"	//q9  = q1*v[0]
	"vmla.f32 		q9, q2, d0[1]			\n\t"	//q9 += q2*v[1]
	"vmla.f32 		q9, q3, d1[0]			\n\t"	//q9 += q3*v[2]
	"vmov.f32 		q0, q9					\n\t"	//q0 = q9, so s2 can address lane 2

	"vst1.32 		d0, [%2]! 				\n\t"	//d[0..1] = q0 lanes 0-1
	"fsts 			s2, [%2] 				\n\t"	//d[2] = q0 lane 2

	: "+r"(m), "+r"(v), "+r"(d), "=&r"(tmp):
    : "q0", "q1", "q2", "q3", "q9", "memory"
	);
#else
	matvec3_c(m, v, d);
#endif
}
132 | 


--------------------------------------------------------------------------------
/math_mat4.c:
--------------------------------------------------------------------------------
  1 | /*
  2 | The MIT License (MIT)
  3 | 
  4 | Copyright (c) 2015 Lachlan Tychsen-Smith (lachlan.ts@gmail.com)
  5 | 
  6 | Permission is hereby granted, free of charge, to any person obtaining a copy
  7 | of this software and associated documentation files (the "Software"), to deal
  8 | in the Software without restriction, including without limitation the rights
  9 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 10 | copies of the Software, and to permit persons to whom the Software is
 11 | furnished to do so, subject to the following conditions:
 12 | 
 13 | The above copyright notice and this permission notice shall be included in
 14 | all copies or substantial portions of the Software.
 15 | 
 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 19 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 20 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 21 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 22 | THE SOFTWARE.
 23 | */
 24 | 
 25 | /*
Matrices are specified in column major format:
 27 | 
 28 | | x0 x2 |
 29 | | x1 x3 |
 30 | 
 31 | therefore m[2] = x2
 32 | 
 33 | */
 34 | 
 35 | #include "math_neon.h"
 36 | 
 37 | //matrix matrix multipication. d = m0 * m1;
 38 | void
 39 | matmul4_c(float m0[16], float m1[16], float d[16])
 40 | {
 41 | 	d[0] = m0[0]*m1[0] + m0[4]*m1[1] + m0[8]*m1[2] + m0[12]*m1[3];
 42 | 	d[1] = m0[1]*m1[0] + m0[5]*m1[1] + m0[9]*m1[2] + m0[13]*m1[3];
 43 | 	d[2] = m0[2]*m1[0] + m0[6]*m1[1] + m0[10]*m1[2] + m0[14]*m1[3];
 44 | 	d[3] = m0[3]*m1[0] + m0[7]*m1[1] + m0[11]*m1[2] + m0[15]*m1[3];
 45 | 	d[4] = m0[0]*m1[4] + m0[4]*m1[5] + m0[8]*m1[6] + m0[12]*m1[7];
 46 | 	d[5] = m0[1]*m1[4] + m0[5]*m1[5] + m0[9]*m1[6] + m0[13]*m1[7];
 47 | 	d[6] = m0[2]*m1[4] + m0[6]*m1[5] + m0[10]*m1[6] + m0[14]*m1[7];
 48 | 	d[7] = m0[3]*m1[4] + m0[7]*m1[5] + m0[11]*m1[6] + m0[15]*m1[7];
 49 | 	d[8] = m0[0]*m1[8] + m0[4]*m1[9] + m0[8]*m1[10] + m0[12]*m1[11];
 50 | 	d[9] = m0[1]*m1[8] + m0[5]*m1[9] + m0[9]*m1[10] + m0[13]*m1[11];
 51 | 	d[10] = m0[2]*m1[8] + m0[6]*m1[9] + m0[10]*m1[10] + m0[14]*m1[11];
 52 | 	d[11] = m0[3]*m1[8] + m0[7]*m1[9] + m0[11]*m1[10] + m0[15]*m1[11];
 53 | 	d[12] = m0[0]*m1[12] + m0[4]*m1[13] + m0[8]*m1[14] + m0[12]*m1[15];
 54 | 	d[13] = m0[1]*m1[12] + m0[5]*m1[13] + m0[9]*m1[14] + m0[13]*m1[15];
 55 | 	d[14] = m0[2]*m1[12] + m0[6]*m1[13] + m0[10]*m1[14] + m0[14]*m1[15];
 56 | 	d[15] = m0[3]*m1[12] + m0[7]*m1[13] + m0[11]*m1[14] + m0[15]*m1[15];
 57 | }
 58 | 
 59 | void 
 60 | matmul4_neon(float m0[16], float m1[16], float d[16])
 61 | {
 62 | #ifdef __MATH_NEON
 63 | 	asm volatile (
 64 | 	"vld1.32 		{d0, d1}, [%1]!			\n\t"	//q0 = m1
 65 | 	"vld1.32 		{d2, d3}, [%1]!			\n\t"	//q1 = m1+4
 66 | 	"vld1.32 		{d4, d5}, [%1]!			\n\t"	//q2 = m1+8
 67 | 	"vld1.32 		{d6, d7}, [%1]			\n\t"	//q3 = m1+12
 68 | 	"vld1.32 		{d16, d17}, [%0]!		\n\t"	//q8 = m0
 69 | 	"vld1.32 		{d18, d19}, [%0]!		\n\t"	//q9 = m0+4
 70 | 	"vld1.32 		{d20, d21}, [%0]!		\n\t"	//q10 = m0+8
 71 | 	"vld1.32 		{d22, d23}, [%0]		\n\t"	//q11 = m0+12
 72 | 
 73 | 	"vmul.f32 		q12, q8, d0[0] 			\n\t"	//q12 = q8 * d0[0]
 74 | 	"vmul.f32 		q13, q8, d2[0] 			\n\t"	//q13 = q8 * d2[0]
 75 | 	"vmul.f32 		q14, q8, d4[0] 			\n\t"	//q14 = q8 * d4[0]
 76 | 	"vmul.f32 		q15, q8, d6[0]	 		\n\t"	//q15 = q8 * d6[0]
 77 | 	"vmla.f32 		q12, q9, d0[1] 			\n\t"	//q12 = q9 * d0[1]
 78 | 	"vmla.f32 		q13, q9, d2[1] 			\n\t"	//q13 = q9 * d2[1]
 79 | 	"vmla.f32 		q14, q9, d4[1] 			\n\t"	//q14 = q9 * d4[1]
 80 | 	"vmla.f32 		q15, q9, d6[1] 			\n\t"	//q15 = q9 * d6[1]
 81 | 	"vmla.f32 		q12, q10, d1[0] 		\n\t"	//q12 = q10 * d0[0]
 82 | 	"vmla.f32 		q13, q10, d3[0] 		\n\t"	//q13 = q10 * d2[0]
 83 | 	"vmla.f32 		q14, q10, d5[0] 		\n\t"	//q14 = q10 * d4[0]
 84 | 	"vmla.f32 		q15, q10, d7[0] 		\n\t"	//q15 = q10 * d6[0]
 85 | 	"vmla.f32 		q12, q11, d1[1] 		\n\t"	//q12 = q11 * d0[1]
 86 | 	"vmla.f32 		q13, q11, d3[1] 		\n\t"	//q13 = q11 * d2[1]
 87 | 	"vmla.f32 		q14, q11, d5[1] 		\n\t"	//q14 = q11 * d4[1]
 88 | 	"vmla.f32 		q15, q11, d7[1]	 		\n\t"	//q15 = q11 * d6[1]
 89 | 
 90 | 	"vst1.32 		{d24, d25}, [%2]! 		\n\t"	//d = q12	
 91 | 	"vst1.32 		{d26, d27}, [%2]!		\n\t"	//d+4 = q13	
 92 | 	"vst1.32 		{d28, d29}, [%2]! 		\n\t"	//d+8 = q14	
 93 | 	"vst1.32 		{d30, d31}, [%2]	 	\n\t"	//d+12 = q15	
 94 | 
 95 | 	: "+r"(m0), "+r"(m1), "+r"(d) : 
 96 |     : "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15",
 97 | 	"memory"
 98 | 	);	
 99 | #else
100 | 	matmul4_c(m0, m1, d);
101 | #endif
102 | }
103 | 
104 | 
//4x4 matrix times 4 element vector, d = m * v. m holds its columns back to back.
void
matvec4_c(float m[16], float v[4], float d[4])
{
	int row;

	/* Each output element is stored before the next one is computed. */
	for (row = 0; row < 4; row++) {
		d[row] = m[row]*v[0] + m[row + 4]*v[1] + m[row + 8]*v[2] + m[row + 12]*v[3];
	}
}
114 | 
/*
NEON version of matvec4_c: d = m * v. Without __MATH_NEON the call is
forwarded to matvec4_c.

q9..q12 hold the four columns of m. The result is accumulated in q13 as the
columns scaled by v[0]..v[3].
*/
void
matvec4_neon(float m[16], float v[4], float d[4])
{
#ifdef __MATH_NEON
	asm volatile (
	"vld1.32 		{d0, d1}, [%1]			\n\t"	//q0 = v
	"vld1.32 		{d18, d19}, [%0]!		\n\t"	//q9 = m[0..3]
	"vld1.32 		{d20, d21}, [%0]!		\n\t"	//q10 = m[4..7]
	"vld1.32 		{d22, d23}, [%0]!		\n\t"	//q11 = m[8..11]
	"vld1.32 		{d24, d25}, [%0]!		\n\t"	//q12 = m[12..15]
	
	"vmul.f32 		q13, q9, d0[0]			\n\t"	//q13  = q9 * v[0]
	"vmla.f32 		q13, q10, d0[1]			\n\t"	//q13 += q10 * v[1]
	"vmla.f32 		q13, q11, d1[0]			\n\t"	//q13 += q11 * v[2]
	"vmla.f32 		q13, q12, d1[1]			\n\t"	//q13 += q12 * v[3]
	
	"vst1.32 		{d26, d27}, [%2] 		\n\t"	//d[0..3] = q13
	//m is advanced by the post-increment loads, so it must be an in/out
	//operand. v and d are only read as addresses and stay plain inputs.
	: "+r"(m)
	: "r"(v), "r"(d) 
    : "q0", "q9", "q10","q11", "q12", "q13", "memory"
	);	
#else
	matvec4_c(m, v, d);
#endif
}
140 | 
141 | 
142 | 
143 | 
144 | 
145 | 


--------------------------------------------------------------------------------
/math_modf.c:
--------------------------------------------------------------------------------
 1 | /*
 2 | The MIT License (MIT)
 3 | 
 4 | Copyright (c) 2015 Lachlan Tychsen-Smith (lachlan.ts@gmail.com)
 5 | 
 6 | Permission is hereby granted, free of charge, to any person obtaining a copy
 7 | of this software and associated documentation files (the "Software"), to deal
 8 | in the Software without restriction, including without limitation the rights
 9 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 | copies of the Software, and to permit persons to whom the Software is
11 | furnished to do so, subject to the following conditions:
12 | 
13 | The above copyright notice and this permission notice shall be included in
14 | all copies or substantial portions of the Software.
15 | 
16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22 | THE SOFTWARE.
23 | */
24 | 
25 | /*
26 | Assumes the floating point value |x| < 2,147,483,648
27 | */
28 | 
29 | #include "math_neon.h"
30 | 
/*
Splits x into an integer part, written to *i, and the remaining fraction,
which is returned. The integer part comes from a float -> int cast, so it is
truncated toward zero and the fraction keeps the sign of x.
*/
float modf_c(float x, int *i)
{
	const int whole = (int)x;

	*i = whole;
	return x - (float)whole;
}
39 | 
40 | 
/*
Splits x into an integer part (*i) and a fractional part (returned).

NEON path: the asm reads x from d0 and stores the integer part through r0. It
leaves the fraction in d0 and has no C return statement.
NOTE(review): this presumably relies on the hard-float calling convention
putting x in s0, i in r0 and the return value in s0 - confirm for the target
ABI and compiler.

Without __MATH_NEON the function does the same computation as modf_c in
plain C, so it still returns a defined value.
*/
float modf_neon_hfp(float x, int *i)
{
#ifdef __MATH_NEON
	asm volatile (	
	"vcvt.s32.f32	d1, d0					\n\t"	//d1 = (int) d0; 
	"vcvt.f32.s32	d2, d1					\n\t"	//d2 = (float) d1;
	"vsub.f32		d0, d0, d2				\n\t"	//d0 = d0 - d2; 
	"vstr.i32		s2, [r0]				\n\t"	//[r0] = d1[0] 
	//"memory": the vstr writes *i behind the compiler's back
	::: "d0", "d1", "d2", "memory"
	);		
#else
	const int whole = (int)x;	//truncates toward zero, same as modf_c

	*i = whole;
	return x - (float)whole;
#endif
}
53 | 
54 | 
/*
Splits x into an integer part (*i) and a fractional part (returned).

NEON path: the asm reads the bits of x from r0 and stores the integer part
through r1. It moves the fraction back into r0 and has no C return statement.
NOTE(review): this presumably relies on the soft-float calling convention
putting x in r0, i in r1 and the return value in r0 - confirm for the target
ABI and compiler.

Without __MATH_NEON the call is forwarded to modf_c.
*/
float modf_neon_sfp(float x, int *i)
{
#ifdef __MATH_NEON
	asm volatile (
	"vdup.f32 		d0, r0					\n\t"	//d0 = {x, x}	
	"vcvt.s32.f32	d1, d0					\n\t"	//d1 = (int) d0; 
	"vcvt.f32.s32	d2, d1					\n\t"	//d2 = (float) d1;
	"vsub.f32		d0, d0, d2				\n\t"	//d0 = d0 - d2; 
	"vstr.i32		s2, [r1]				\n\t"	//[r1] = d1[0] 
	"vmov.f32 		r0, s0					\n\t"	//r0 = d0[0];
	//"memory": the vstr writes *i behind the compiler's back
	::: "d0", "d1", "d2", "memory"
	);
		
#else
	return modf_c(x, i);
#endif
}
72 | 


--------------------------------------------------------------------------------
/math_neon.h:
--------------------------------------------------------------------------------
  1 | /*
  2 | The MIT License (MIT)
  3 | 
  4 | Copyright (c) 2015 Lachlan Tychsen-Smith (lachlan.ts@gmail.com)
  5 | 
  6 | Permission is hereby granted, free of charge, to any person obtaining a copy
  7 | of this software and associated documentation files (the "Software"), to deal
  8 | in the Software without restriction, including without limitation the rights
  9 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 10 | copies of the Software, and to permit persons to whom the Software is
 11 | furnished to do so, subject to the following conditions:
 12 | 
 13 | The above copyright notice and this permission notice shall be included in
 14 | all copies or substantial portions of the Software.
 15 | 
 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 19 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 20 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 21 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 22 | THE SOFTWARE.
 23 | */
 24 | 
 25 | #ifndef __MATH_NEON_H__ 
 26 | #define __MATH_NEON_H__ 
 27 | 
 28 | #if !defined(__i386__) && defined(__arm__)
 29 | //if defined neon ASM routines are used, otherwise all calls to *_neon 
 30 | //functions are rerouted to their equivalent *_c function.
 31 | #define __MATH_NEON			
 32 | 
 33 | //Default Floating Point value ABI: 0=softfp, 1=hardfp. Only effects *_neon routines.
 34 | //You can access the hardfp versions directly via the *_hard suffix. 
 35 | //You can access the softfp versions directly via the *_soft suffix. 
 36 | #define __MATH_FPABI 	0	
 37 | 
 38 | #endif
 39 | 
 40 | #ifdef GCC
 41 | #define ALIGN(A) __attribute__ ((aligned (A))
 42 | #else
 43 | #define ALIGN(A)
 44 | #endif
 45 | 
 46 | #ifndef _MATH_H
 47 | #define M_PI		3.14159265358979323846	/* pi */
 48 | #define M_PI_2		1.57079632679489661923	/* pi/2 */
 49 | #define M_PI_4		0.78539816339744830962	/* pi/4 */
 50 | #define M_E			2.7182818284590452354	/* e */
 51 | #define M_LOG2E		1.4426950408889634074	/* log_2 e */
 52 | #define M_LOG10E	0.43429448190325182765	/* log_10 e */
 53 | #define M_LN2		0.69314718055994530942	/* log_e 2 */
 54 | #define M_LN10		2.30258509299404568402	/* log_e 10 */
 55 | #define M_1_PI		0.31830988618379067154	/* 1/pi */
 56 | #define M_2_PI		0.63661977236758134308	/* 2/pi */
 57 | #define M_2_SQRTPI	1.12837916709551257390	/* 2/sqrt(pi) */
 58 | #define M_SQRT2		1.41421356237309504880	/* sqrt(2) */
 59 | #define M_SQRT1_2	0.70710678118654752440	/* 1/sqrt(2) */
 60 | #endif 
 61 | 
 62 | #if __MATH_FPABI == 1
 63 | #define sinf_neon		sinf_neon_hfp
 64 | #define cosf_neon		cosf_neon_hfp
 65 | #define	sincosf_neon	sincosf_neon_hfp
 66 | #define tanf_neon		tanf_neon_hfp
 67 | #define atanf_neon		atanf_neon_hfp
 68 | #define atan2f_neon		atan2f_neon_hfp
 69 | #define asinf_neon		asinf_neon_hfp
 70 | #define acosf_neon		acosf_neon_hfp
 71 | #define sinhf_neon		sinhf_neon_hfp
 72 | #define coshf_neon		coshf_neon_hfp
 73 | #define tanhf_neon		tanhf_neon_hfp
 74 | #define expf_neon		expf_neon_hfp
 75 | #define logf_neon		logf_neon_hfp
 76 | #define log10f_neon		log10f_neon_hfp
 77 | #define powf_neon		powf_neon_hfp
 78 | #define floorf_neon		floorf_neon_hfp
 79 | #define ceilf_neon		ceilf_neon_hfp
 80 | #define fabsf_neon		fabsf_neon_hfp
 81 | #define ldexpf_neon		ldexpf_neon_hfp
 82 | #define frexpf_neon		frexpf_neon_hfp
 83 | #define fmodf_neon		fmodf_neon_hfp
 84 | #define modf_neon		modf_neon_hfp
 85 | #define sqrtf_neon		sqrtf_neon_hfp
 86 | #define invsqrtf_neon	invsqrtf_neon_hfp
 87 | #else
 88 | #define sinf_neon		sinf_neon_sfp
 89 | #define cosf_neon		cosf_neon_sfp
 90 | #define	sincosf_neon	sincosf_neon_sfp
 91 | #define tanf_neon		tanf_neon_sfp
 92 | #define atanf_neon		atanf_neon_sfp
 93 | #define atan2f_neon		atan2f_neon_sfp
 94 | #define asinf_neon		asinf_neon_sfp
 95 | #define acosf_neon		acosf_neon_sfp
 96 | #define sinhf_neon		sinhf_neon_sfp
 97 | #define coshf_neon		coshf_neon_sfp
 98 | #define tanhf_neon		tanhf_neon_sfp
 99 | #define expf_neon		expf_neon_sfp
100 | #define logf_neon		logf_neon_sfp
101 | #define log10f_neon		log10f_neon_sfp
102 | #define powf_neon		powf_neon_sfp
103 | #define floorf_neon		floorf_neon_sfp
104 | #define ceilf_neon		ceilf_neon_sfp
105 | #define fabsf_neon		fabsf_neon_sfp
106 | #define ldexpf_neon		ldexpf_neon_sfp
107 | #define frexpf_neon		frexpf_neon_sfp
108 | #define fmodf_neon		fmodf_neon_sfp
109 | #define modf_neon		modf_neon_sfp
110 | #define sqrtf_neon		sqrtf_neon_sfp
111 | #define invsqrtf_neon	invsqrtf_neon_sfp
112 | 
113 | #define dot2_neon		dot2_neon_sfp
114 | #define dot3_neon		dot3_neon_sfp
115 | #define dot4_neon		dot4_neon_sfp
116 | #endif
117 | 
118 | /* 
119 | function:	enable_runfast
120 | 			this function enables the floating point runfast mode on the 
121 | 			ARM Cortex A8.  	
122 | */
123 | void		enable_runfast();
124 | 
125 | 
126 | float dot2_c(float v0[2], float v1[2]);
127 | float dot2_neon(float v0[2], float v1[2]);
128 | float dot3_c(float v0[3], float v1[3]);
129 | float dot3_neon(float v0[3], float v1[3]);
130 | float dot4_c(float v0[4], float v1[4]);
131 | float dot4_neon(float v0[4], float v1[4]);
132 | 
133 | void cross3_c(float v0[3], float v1[3], float d[3]);
134 | void cross3_neon(float v0[3], float v1[3], float d[3]);
135 | 
136 | void normalize2_c(float v[2], float d[2]);
137 | void normalize2_neon(float v[2], float d[2]);
138 | void normalize3_c(float v[3], float d[3]);
139 | void normalize3_neon(float v[3], float d[3]);
140 | void normalize4_c(float v[4], float d[4]);
141 | void normalize4_neon(float v[4], float d[4]);
142 | 
143 | /* 
144 | function:	matmul2
145 | arguments:  m0 2x2 matrix, m1 2x2 matrix
146 | return: 	d 2x2 matrix
147 | expression: d = m0 * m1
148 | */
149 | void		matmul2_c(float m0[4], float m1[4], float d[4]);
150 | void		matmul2_neon(float m0[4], float m1[4], float d[4]);
151 | 
152 | /* 
153 | function:	matmul3
154 | arguments:  m0 3x3 matrix, m1 3x3 matrix
155 | return: 	d 3x3 matrix
156 | expression: d = m0 * m1
157 | */
158 | void		matmul3_c(float m0[9], float m1[9], float d[9]);
159 | void		matmul3_neon(float m0[9], float m1[9], float d[9]);
160 | 
161 | /* 
162 | function:	matmul4
163 | arguments:  m0 4x4 matrix, m1 4x4 matrix
164 | return: 	d 4x4 matrix
165 | expression: d = m0 * m1
166 | */
167 | void		matmul4_c(float m0[16], float m1[16], float d[16]);
168 | void		matmul4_neon(float m0[16], float m1[16], float d[16]);
169 | \
170 | /* 
171 | function:	matvec2
172 | arguments:  m 2x2 matrix, v 2 element vector
173 | return: 	d 2x2 matrix
174 | expression: d = m * v
175 | */
176 | void		matvec2_c(float m[4], float v[2], float d[2]);
177 | void		matvec2_neon(float m[4], float v[2], float d[2]);
178 | 
179 | /* 
180 | function:	matvec3
181 | arguments:  m 3x3 matrix, v 3 element vector
182 | return: 	d 3x3 matrix
183 | expression: d = m * v
184 | */
185 | void		matvec3_c(float m[9], float v[3], float d[3]);
186 | void		matvec3_neon(float m[9], float v[3], float d[3]);
187 | 
188 | /* 
189 | function:	matvec4
190 | arguments:  m 4x4 matrix, v 4 element vector
191 | return: 	d 4x4 matrix
192 | expression: d = m * v
193 | */
194 | void		matvec4_c(float m[16], float v[4], float d[4]);
195 | void		matvec4_neon(float m[16], float v[4], float d[4]);
196 | 
197 | /* 
198 | function:	sinf
199 | arguments:  x radians
200 | return: 	the sine function evaluated at x radians.	
201 | expression: r = sin(x) 	
202 | */
203 | float 		sinf_c(float x);
204 | float 		sinf_neon_hfp(float x);
205 | float 		sinf_neon_sfp(float x);
206 | 
207 | /* 
208 | function:	cosf
209 | arguments:  x radians
210 | return: 	the cosine function evaluated at x radians.	
211 | expression: r = cos(x) 	
212 | notes:		computed using cos(x) = sin(x + pi/2)
213 | */
214 | float 		cosf_c(float x);
215 | float 		cosf_neon_hfp(float x);
216 | float 		cosf_neon_sfp(float x);
217 | 
218 | /* 
219 | function:	sincosf
220 | arguments:  x radians, r[2] result array.
221 | return: 	both the sine and the cosine evaluated at x radians.	
222 | expression: r = {sin(x), cos(x)} 	
223 | notes:		faster than evaluating seperately.
224 | */
225 | void		sincosf_c(float x, float r[2]);
226 | void		sincosf_neon_hfp(float x, float r[2]);
227 | void		sincosf_neon_sfp(float x, float r[2]);
228 | 
229 | /* 
230 | function:	sinfv
231 | return: 	the sine function evaluated at x[i] radians 	
232 | expression: r[i] = sin(x[i])	
233 | notes:		faster than evaluating individually.
234 | 			r and x can be the same memory location.
235 | */
236 | void		sinfv_c(float *x, int n, float *r);
237 | void  		sinfv_neon(float *x, int n, float *r);
238 | 
239 | /* 
240 | function:	tanf
241 | return: 	the tangent evaluated at x radians.	
242 | expression: r = tan(x) 	
243 | notes:		computed using tan(x) = sin(x) / cos(x)
244 | */
245 | float 		tanf_c(float x);
246 | float 		tanf_neon_hfp(float x);
247 | float 		tanf_neon_sfp(float x);
248 | 
249 | /* 
250 | function:	atanf
251 | return: 	the arctangent evaluated at x.	
252 | expression: r = atan(x) 	
253 | */
254 | float 		atanf_c(float x);
255 | float 		atanf_neon_hfp(float x);
256 | float 		atanf_neon_sfp(float x);
257 | 
258 | /* 
259 | function:	atanf
260 | return: 	the arctangent evaluated at x.	
261 | expression: r = atan(x) 	
262 | */
263 | float 		atan2f_c(float y, float x);
264 | float 		atan2f_neon_hfp(float y, float x);
265 | float 		atan2f_neon_sfp(float y, float x);
266 | 
267 | /* 
268 | function:	asinf
269 | return: 	the arcsine evaluated at x.	
270 | expression: r = asin(x) 	
271 | */
272 | float 		asinf_c(float x);
273 | float 		asinf_neon_hfp(float x);
274 | float 		asinf_neon_sfp(float x);
275 | 
276 | /* 
277 | function:	acosf
278 | return: 	the arcsine evaluated at x.	
279 | expression: r = asin(x) 	
280 | */
281 | float 		acosf_c(float x);
282 | float 		acosf_neon_hfp(float x);
283 | float 		acosf_neon_sfp(float x);
284 | 
285 | /* 
286 | function:	sinhf
287 | return: 	the arcsine evaluated at x.	
288 | expression: r = asin(x) 	
289 | */
290 | float 		sinhf_c(float x);
291 | float 		sinhf_neon_hfp(float x);
292 | float 		sinhf_neon_sfp(float x);
293 | 
294 | /* 
295 | function:	coshf
296 | return: 	the arcsine evaluated at x.	
297 | expression: r = asin(x) 	
298 | */
299 | float 		coshf_c(float x);
300 | float 		coshf_neon_hfp(float x);
301 | float 		coshf_neon_sfp(float x);
302 | 
303 | /* 
304 | function:	tanhf
305 | return: 	the arcsine evaluated at x.	
306 | expression: r = asin(x) 	
307 | */
308 | float 		tanhf_c(float x);
309 | float 		tanhf_neon_hfp(float x);
310 | float 		tanhf_neon_sfp(float x);
311 | 
312 | /* 
313 | function:	expf
314 | return: 	the natural exponential evaluated at x.	
315 | expression: r = e ** x	
316 | */
317 | float 		expf_c(float x);
318 | float 		expf_neon_hfp(float x);
319 | float 		expf_neon_sfp(float x);
320 | 
321 | /* 
322 | function:	logf
323 | return: 	the value of the natural logarithm of x.	
324 | expression: r = ln(x)	
325 | notes:		assumes x > 0
326 | */
327 | float 		logf_c(float x);
328 | float 		logf_neon_hfp(float x);
329 | float 		logf_neon_sfp(float x);
330 | 
331 | /* 
332 | function:	log10f
333 | return: 	the value of the power 10 logarithm of x.	
334 | expression: r = log10(x)	
335 | notes:		assumes x > 0
336 | */
337 | float 		log10f_c(float x);
338 | float 		log10f_neon_hfp(float x);
339 | float 		log10f_neon_sfp(float x);
340 | 
341 | /* 
342 | function:	powf
343 | return: 	x raised to the power of n, x ** n.
344 | expression: r = x ** y	
345 | notes:		computed using e ** (y * ln(x))
346 | */
347 | float 		powf_c(float x, float n);
348 | float 		powf_neon_sfp(float x, float n);
349 | float 		powf_neon_hfp(float x, float n);
350 | 
351 | /* 
352 | function:	floorf
353 | return: 	x rounded down (towards negative infinity) to its nearest 
354 | 			integer value.	
355 | notes:		assumes |x| < 2 ** 31
356 | */
357 | float 		floorf_c(float x);
358 | float 		floorf_neon_sfp(float x);
359 | float 		floorf_neon_hfp(float x);
360 | 
361 | /* 
362 | function:	ceilf
363 | return: 	x rounded up (towards positive infinity) to its nearest 
364 | 			integer value.	
365 | notes:		assumes |x| < 2 ** 31
366 | */
367 | float 		ceilf_c(float x);
368 | float 		ceilf_neon_hfp(float x);
369 | float 		ceilf_neon_sfp(float x);
370 | 
371 | /* 
372 | function:	fabsf
373 | return: 	absolute vvalue of x	
374 | notes:		assumes |x| < 2 ** 31
375 | */
376 | float 		fabsf_c(float x);
377 | float 		fabsf_neon_hfp(float x);
378 | float 		fabsf_neon_sfp(float x);
379 | 
380 | /* 
381 | function:	ldexpf
382 | return: 	the value of m multiplied by 2 to the power of e. 
383 | expression: r = m * (2 ** e)
384 | */
385 | float 		ldexpf_c(float m, int e);
386 | float 		ldexpf_neon_hfp(float m, int e);
387 | float 		ldexpf_neon_sfp(float m, int e);
388 | 
389 | /* 
390 | function:	frexpf
391 | return: 	the exponent and mantissa of x 
392 | */
393 | float 		frexpf_c(float x, int *e);
394 | float 		frexpf_neon_hfp(float x, int *e);
395 | float 		frexpf_neon_sfp(float x, int *e);
396 | 
397 | /* 
398 | function:	fmodf
399 | return: 	the remainder of x divided by y, x % y	
400 | expression: r = x - floor(x / y) * y;
401 | notes:		assumes that |x / y| < 2 ** 31 
402 | */
403 | float 		fmodf_c(float x, float y);
404 | float 		fmodf_neon_hfp(float x, float y);
405 | float 		fmodf_neon_sfp(float x, float y);
406 | 
407 | /* 
408 | function:	modf
409 | return: 	breaks x into the integer (i) and fractional part (return)
410 | notes:		assumes that |x| < 2 ** 31 
411 | */
412 | float 		modf_c(float x, int *i);
413 | float 		modf_neon_hfp(float x, int *i);
414 | float 		modf_neon_sfp(float x, int *i);
415 | 
416 | /* 
417 | function:	sqrtf
418 | return: 	(x^0.5)
419 | notes:		 
420 | */
421 | float 		sqrtf_c(float x);
422 | float 		sqrtf_neon_hfp(float x);
423 | float 		sqrtf_neon_sfp(float x);
424 | 
425 | 
426 | /* 
427 | function:	invsqrtf
428 | return: 	1.0f / (x^0.5)
429 | notes:		 
430 | */
431 | float 		invsqrtf_c(float x);
432 | float 		invsqrtf_neon_hfp(float x);
433 | float 		invsqrtf_neon_sfp(float x);
434 | 
435 | #endif
436 | 


--------------------------------------------------------------------------------
/math_powf.c:
--------------------------------------------------------------------------------
  1 | /*
  2 | The MIT License (MIT)
  3 | 
  4 | Copyright (c) 2015 Lachlan Tychsen-Smith (lachlan.ts@gmail.com)
  5 | 
  6 | Permission is hereby granted, free of charge, to any person obtaining a copy
  7 | of this software and associated documentation files (the "Software"), to deal
  8 | in the Software without restriction, including without limitation the rights
  9 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 10 | copies of the Software, and to permit persons to whom the Software is
 11 | furnished to do so, subject to the following conditions:
 12 | 
 13 | The above copyright notice and this permission notice shall be included in
 14 | all copies or substantial portions of the Software.
 15 | 
 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 19 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 20 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 21 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 22 | THE SOFTWARE.
 23 | */
 24 | 
 25 | /*
 26 | Based on x ^ n = exp(n * log(x))
 27 | 
 28 | Test func : powf(x, n)
 29 | Test Range: (1,1) < (x, n) < (10, 10)
 30 | Peak Error:	~0.0010%
 31 | RMS  Error: ~0.0002%
 32 | */
 33 | 
 34 | #include "math.h"
 35 | #include "math_neon.h"
 36 | 
 37 | const float __powf_rng[2] = {
 38 | 	1.442695041f,
 39 | 	0.693147180f
 40 | };
 41 | 
 42 | const float __powf_lut[16] = {
 43 | 	-2.295614848256274, 	//p0	log
 44 | 	-2.470711633419806, 	//p4
 45 | 	-5.686926051100417, 	//p2
 46 | 	-0.165253547131978, 	//p6
 47 | 	+5.175912446351073, 	//p1
 48 | 	+0.844006986174912, 	//p5
 49 | 	+4.584458825456749, 	//p3
 50 | 	+0.014127821926000,		//p7
 51 | 	0.9999999916728642,		//p0	exp
 52 | 	0.04165989275009526, 	//p4
 53 | 	0.5000006143673624, 	//p2
 54 | 	0.0014122663401803872, 	//p6
 55 | 	1.000000059694879, 		//p1
 56 | 	0.008336936973260111, 	//p5
 57 | 	0.16666570253074878, 	//p3
 58 | 	0.00019578093328483123	//p7
 59 | };
 60 | 
 61 | float powf_c(float x, float n)
 62 | {
 63 | 	float a, b, c, d, xx;
 64 | 	int m;
 65 | 	
 66 | 	union {
 67 | 		float   f;
 68 | 		int 	i;
 69 | 	} r;
 70 | 	
 71 | 	//extract exponent
 72 | 	r.f = x;
 73 | 	m = (r.i >> 23);
 74 | 	m = m - 127;
 75 | 	r.i = r.i - (m << 23);
 76 | 	
 77 | 	//Taylor Polynomial (Estrins)
 78 | 	xx = r.f * r.f;
 79 | 	a = (__powf_lut[4] * r.f) + (__powf_lut[0]);
 80 | 	b = (__powf_lut[6] * r.f) + (__powf_lut[2]);
 81 | 	c = (__powf_lut[5] * r.f) + (__powf_lut[1]);
 82 | 	d = (__powf_lut[7] * r.f) + (__powf_lut[3]);
 83 | 	a = a + b * xx;
 84 | 	c = c + d * xx;
 85 | 	xx = xx * xx;
 86 | 	r.f = a + c * xx;
 87 | 
 88 | 	//add exponent
 89 | 	r.f = r.f + ((float) m) * __powf_rng[1];
 90 | 
 91 | 	r.f = r.f * n;
 92 | 
 93 | 
 94 | 	//Range Reduction:
 95 | 	m = (int) (r.f * __powf_rng[0]);
 96 | 	r.f = r.f - ((float) m) * __powf_rng[1];	
 97 | 	
 98 | 	//Taylor Polynomial (Estrins)
 99 | 	a = (__powf_lut[12] * r.f) + (__powf_lut[8]);
100 | 	b = (__powf_lut[14] * r.f) + (__powf_lut[10]);
101 | 	c = (__powf_lut[13] * r.f) + (__powf_lut[9]);
102 | 	d = (__powf_lut[15] * r.f) + (__powf_lut[11]);
103 | 	xx = r.f * r.f;
104 | 	a = a + b * xx; 
105 | 	c = c + d * xx;
106 | 	xx = xx* xx;
107 | 	r.f = a + c * xx; 
108 | 	
109 | 	//multiply by 2 ^ m 
110 | 	m = m << 23;
111 | 	r.i = r.i + m;
112 | 
113 | 	return r.f;
114 | }
115 | 
/*
NEON version of powf_c: x ** n as exp(n * log(x)).

The asm reads x from d0[0] and n from d0[1]. It leaves the result in d0 and
has no C return statement.
NOTE(review): this presumably relies on the hard-float calling convention
putting x in s0, n in s1 and the return value in s0 - confirm for the target
ABI and compiler.
NOTE(review): without __MATH_NEON this function still has an empty body and
returns no defined value. Callers on such builds should use powf_c.

Changes from the earlier version of this routine:
 - the coefficient pointer is advanced by a post-increment load, so it is now
   a local in/out operand rather than an input-only operand;
 - d16 is written by the first instruction and is now in the clobber list;
 - range reduction converts with the signed forms so that a negative
   n * log(x) is reduced the same way as the (int) cast in powf_c. The
   unsigned forms saturate negative values to 0.
*/
float powf_neon_hfp(float x, float n)
{
#ifdef __MATH_NEON
	const float *lut = __powf_lut;	//walked by the asm: log coefficients, then exp coefficients
	asm volatile (
		
	"vdup.f32		d16, d0[1]				\n\t"	//d16 = {n, n};	
	"vdup.f32		d0, d0[0]				\n\t"	//d0 = {x, x};
	
	//extract exponent
	"vmov.i32		d2, #127				\n\t"	//d2 = 127;
	"vshr.u32		d6, d0, #23				\n\t"	//d6 = d0 >> 23;
	"vsub.i32		d6, d6, d2				\n\t"	//d6 = d6 - d2;
	"vshl.u32		d1, d6, #23				\n\t"	//d1 = d6 << 23;
	"vsub.i32		d0, d0, d1				\n\t"	//d0 = d0 - d1;

	//log polynomial:
	"vmul.f32 		d1, d0, d0				\n\t"	//d1 = d0*d0 = {x^2, x^2}	
	"vld1.32 		{d2, d3, d4, d5}, [%[lut]]!	\n\t"	//q1 = {p0, p4, p2, p6}, q2 = {p1, p5, p3, p7} ;
	"vmla.f32 		q1, q2, d0[0]			\n\t"	//q1 = q1 + q2 * d0[0]		
	"vmla.f32 		d2, d3, d1[0]			\n\t"	//d2 = d2 + d3 * d1[0]		
	"vmul.f32 		d1, d1, d1				\n\t"	//d1 = d1 * d1 = {x^4, x^4}	
	"vmla.f32 		d2, d1, d2[1]			\n\t"	//d2 = d2 + d1 * d2[1]		

	//add exponent 	
	"vld1.32 		d7, [%[rng]]			\n\t"	//d7 = {invrange, range}
	"vcvt.f32.s32 	d6, d6					\n\t"	//d6 = (float) d6
	"vmla.f32 		d2, d6, d7[1]			\n\t"	//d2 = d2 + d6 * d7[1]		

	"vdup.f32 		d0, d2[0]				\n\t"	//d0 = d2[0]		
	"vmul.f32 		d0, d0, d16				\n\t"	//d0 = d0 * d16	

	//Range Reduction:
	"vmul.f32 		d6, d0, d7[0]			\n\t"	//d6 = d0 * d7[0] 
	"vcvt.s32.f32 	d6, d6					\n\t"	//d6 = (int) d6, truncating toward zero
	"vcvt.f32.s32 	d1, d6					\n\t"	//d1 = (float) d6
	"vmls.f32 		d0, d1, d7[1]			\n\t"	//d0 = d0 - d1 * d7[1]
		
	//exp polynomial:
	"vmul.f32 		d1, d0, d0				\n\t"	//d1 = d0*d0 = {x^2, x^2}	
	"vld1.32 		{d2, d3, d4, d5}, [%[lut]]	\n\t"	//q1 = {p0, p4, p2, p6}, q2 = {p1, p5, p3, p7} ;
	"vmla.f32 		q1, q2, d0[0]			\n\t"	//q1 = q1 + q2 * d0[0]		
	"vmla.f32 		d2, d3, d1[0]			\n\t"	//d2 = d2 + d3 * d1[0]		
	"vmul.f32 		d1, d1, d1				\n\t"	//d1 = d1 * d1 = {x^4, x^4}	
	"vmla.f32 		d2, d1, d2[1]			\n\t"	//d2 = d2 + d1 * d2[1]		

	//multiply by 2 ^ m 	
	"vshl.i32 		d6, d6, #23				\n\t"	//d6 = d6 << 23		
	"vadd.i32 		d0, d2, d6				\n\t"	//d0 = d2 + d6		


	: [lut] "+r"(lut)
	: [rng] "r"(__powf_rng)
    : "d0", "d1", "d2","d3", "d4", "d5", "d6", "d7", "d16"
	);
#endif
}
171 | 
/*
Soft-float entry point for powf: copies r0 and r1 into s0 and s1, runs
powf_neon_hfp, then copies s0 back into r0. Without __MATH_NEON the call is
forwarded to powf_c.

NOTE(review): the NEON path has no C return statement, and its three asm
statements declare no operands or clobbers. It presumably relies on the
soft-float calling convention (x in r0, n in r1, result in r0) and on the
compiler leaving r0, r1, s0 and s1 untouched around the call - confirm for the
target ABI and compiler.
*/
float powf_neon_sfp(float x, float n)
{
#ifdef __MATH_NEON
	asm volatile ("vmov.f32 s0, r0 		\n\t");	//s0 = x
	asm volatile ("vmov.f32 s1, r1 		\n\t");	//s1 = n
	powf_neon_hfp(x, n);
	asm volatile ("vmov.f32 r0, s0 		\n\t");	//r0 = result
#else
	return powf_c(x, n);
#endif
}
183 | 


--------------------------------------------------------------------------------
/math_runfast.c:
--------------------------------------------------------------------------------
 1 | /*
 2 | The MIT License (MIT)
 3 | 
 4 | Copyright (c) 2015 Lachlan Tychsen-Smith (lachlan.ts@gmail.com)
 5 | 
 6 | Permission is hereby granted, free of charge, to any person obtaining a copy
 7 | of this software and associated documentation files (the "Software"), to deal
 8 | in the Software without restriction, including without limitation the rights
 9 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 | copies of the Software, and to permit persons to whom the Software is
11 | furnished to do so, subject to the following conditions:
12 | 
13 | The above copyright notice and this permission notice shall be included in
14 | all copies or substantial portions of the Software.
15 | 
16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22 | THE SOFTWARE.
23 | */
24 | 
25 | 
/*
Rewrites the FPSCR as (FPSCR & 0x04086060) | 0x03000000. On builds where
__arm__ is not defined the function does nothing.
NOTE(review): 0x03000000 is presumably the flush-to-zero and default-NaN
bits - confirm against the FPSCR layout for the target core.
*/
void 
enable_runfast()
{
#ifdef __arm__
	static const unsigned int x = 0x04086060;
	static const unsigned int y = 0x03000000;
	int r;
	asm volatile (
		"fmrx	%0, fpscr			\n\t"	//r = FPSCR
		"and	%0, %0, %1			\n\t"	//r = r & 0x04086060
		"orr	%0, %0, %2			\n\t"	//r = r | 0x03000000
		"fmxr	fpscr, %0			\n\t"	//FPSCR = r
		//r is written by the first instruction, before x and y are read.
		//The earlyclobber keeps it out of the registers holding x and y.
		: "=&r"(r)
		: "r"(x), "r"(y)
	);
#endif
}
43 | 


--------------------------------------------------------------------------------
/math_sincosf.c:
--------------------------------------------------------------------------------
  1 | /*
  2 | The MIT License (MIT)
  3 | 
  4 | Copyright (c) 2015 Lachlan Tychsen-Smith (lachlan.ts@gmail.com)
  5 | 
  6 | Permission is hereby granted, free of charge, to any person obtaining a copy
  7 | of this software and associated documentation files (the "Software"), to deal
  8 | in the Software without restriction, including without limitation the rights
  9 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 10 | copies of the Software, and to permit persons to whom the Software is
 11 | furnished to do so, subject to the following conditions:
 12 | 
 13 | The above copyright notice and this permission notice shall be included in
 14 | all copies or substantial portions of the Software.
 15 | 
 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 19 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 20 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 21 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 22 | THE SOFTWARE.
 23 | */
 24 | 
 25 | #include "math.h"
 26 | #include "math_neon.h"
 27 | 
 28 | const float __sincosf_rng[2] = {
 29 | 	2.0 / M_PI,
 30 | 	M_PI / 2.0
 31 | };
 32 | 
 33 | const float __sincosf_lut[8] = {
 34 | 	-0.00018365f,	//p7
 35 | 	-0.00018365f,	//p7
 36 | 	+0.00830636f,	//p5
 37 | 	+0.00830636f,	//p5
 38 | 	-0.16664831f,	//p3
 39 | 	-0.16664831f,	//p3
 40 | 	+0.99999661f,	//p1
 41 | 	+0.99999661f,	//p1
 42 | };
 43 | 
 44 | void sincosf_c( float x, float r[2])
 45 | {
 46 | 	union {
 47 | 		float 	f;
 48 | 		int 	i;
 49 | 	} ax, bx;
 50 | 	
 51 | 	float y;
 52 | 	float a, b, c, d, xx, yy;
 53 | 	int m, n, o, p;
 54 | 	
 55 | 	y = x + __sincosf_rng[1];
 56 | 	ax.f = fabsf(x);
 57 | 	bx.f = fabsf(y);
 58 | 	
 59 | 	//Range Reduction:
 60 | 	m = (int) (ax.f * __sincosf_rng[0]);	
 61 | 	o = (int) (bx.f * __sincosf_rng[0]);	
 62 | 	ax.f = ax.f - (((float)m) * __sincosf_rng[1]);
 63 | 	bx.f = bx.f - (((float)o) * __sincosf_rng[1]);
 64 | 	
 65 | 	//Test Quadrant
 66 | 	n = m & 1;
 67 | 	p = o & 1;
 68 | 	ax.f = ax.f - n * __sincosf_rng[1];	
 69 | 	bx.f = bx.f - p * __sincosf_rng[1];	
 70 | 	m = m >> 1;
 71 | 	o = o >> 1;
 72 | 	n = n ^ m;
 73 | 	p = p ^ o;
 74 | 	m = (x < 0.0);
 75 | 	o = (y < 0.0);
 76 | 	n = n ^ m;	
 77 | 	p = p ^ o;	
 78 | 	n = n << 31;
 79 | 	p = p << 31;
 80 | 	ax.i = ax.i ^ n; 
 81 | 	bx.i = bx.i ^ p; 
 82 | 
 83 | 	//Taylor Polynomial
 84 | 	xx = ax.f * ax.f;	
 85 | 	yy = bx.f * bx.f;
 86 | 	r[0] = __sincosf_lut[0];
 87 | 	r[1] = __sincosf_lut[1];
 88 | 	r[0] = r[0] * xx + __sincosf_lut[2];
 89 | 	r[1] = r[1] * yy + __sincosf_lut[3];
 90 | 	r[0] = r[0] * xx + __sincosf_lut[4];
 91 | 	r[1] = r[1] * yy + __sincosf_lut[5];
 92 | 	r[0] = r[0] * xx + __sincosf_lut[6];
 93 | 	r[1] = r[1] * yy + __sincosf_lut[7];
 94 | 	r[0] = r[0] * ax.f;
 95 | 	r[1] = r[1] * bx.f;
 96 | 
 97 | }
 98 | 
 99 | void sincosf_neon_hfp(float x, float r[2])
100 | {
101 | //HACK: Assumes for softfp that r1 = x, and for hardfp that s0 = x.
102 | #ifdef __MATH_NEON
103 | 	asm volatile (
104 | 	//{x, y} = {x, x + pi/2}
105 | 	"vdup.f32 		d1, d0[0]				\n\t"	//d1 = {x, x}
106 | 	"vld1.32 		d3, [%1]				\n\t"	//d3 = {invrange, range}
107 | 	"vadd.f32 		d0, d1, d3				\n\t"	//d0 = d1 + d3
108 | 	"vmov.f32 		s0, s2					\n\t"	//d0[0] = d1[0]	
109 | 	"vabs.f32 		d1, d0					\n\t"	//d1 = {abs(x), abs(y)}
110 | 	
111 | 	//Range Reduction:
112 | 	"vmul.f32 		d2, d1, d3[0]			\n\t"	//d2 = d1 * d3[0] 
113 | 	"vcvt.u32.f32 	d2, d2					\n\t"	//d2 = (int) d2
114 | 	"vcvt.f32.u32 	d4, d2					\n\t"	//d4 = (float) d2
115 | 	"vmls.f32 		d1, d4, d3[1]			\n\t"	//d1 = d1 - d4 * d3[1]
116 | 	
117 | 	//Checking Quadrant:
118 | 	//ax = ax - (k&1) * M_PI_2
119 | 	"vmov.i32	 	d4, #1					\n\t"	//d4 = 1
120 | 	"vand.i32	 	d4, d4, d2				\n\t"	//d4 = d4 & d2
121 | 	"vcvt.f32.u32 	d5, d4					\n\t"	//d5 = (float) d4
122 | 	"vmls.f32 		d1, d5, d3[1]			\n\t"	//d1 = d1 - d5 * d3[1]
123 | 
124 | 	//ax = ax ^ ((k & 1) ^ (k >> 1) ^ (x < 0) << 31)
125 | 	"vshr.u32 		d3, d2, #1				\n\t"	//d3 = d2 >> 1
126 | 	"veor.i32 		d4, d4, d3				\n\t"	//d4 = d4 ^ d3	
127 | 	"vclt.f32 		d3, d0, #0				\n\t"	//d3 = (d0 < 0.0)
128 | 	"veor.i32 		d4, d4, d3				\n\t"	//d4 = d4 ^ d3	
129 | 	"vshl.i32 		d4, d4, #31				\n\t"	//d4 = d4 << 31
130 | 	"veor.i32 		d0, d1, d4				\n\t"	//d0 = d1 ^ d4
131 | 	
132 | 	//polynomial:
133 | 	"vldm 			%2!, {d2, d3}	 		\n\t"	//d2 = {p7, p7}, d3 = {p5, p5}, r3 += 4;
134 | 	"vmul.f32 		d1, d0, d0				\n\t"	//d1 = d0 * d0 = {x^2, y^2}
135 | 	"vldm 			%2!, {d4}				\n\t"	//d4 = {p3, p3}, r3 += 2;
136 | 	"vmla.f32 		d3, d2, d1				\n\t"	//d3 = d3 + d2 * d1;	
137 | 	"vldm	 		%2!, {d5}				\n\t"	//d5 = {p1, p1}, r3 += 2;
138 | 	"vmla.f32 		d4, d3, d1				\n\t"	//d4 = d4 + d3 * d1;	
139 | 	"vmla.f32 		d5, d4, d1				\n\t"	//d5 = d5 + d4 * d1;	
140 | 	"vmul.f32 		d5, d5, d0				\n\t"	//d5 = d5 * d0;	
141 | 	
142 | 	"vstm.f32 		%0, {d5}				\n\t"	//r[0] = d5[0], r[1]=d5[1];	
143 | 	
144 | 	: "+r"(r)
145 | 	: "r"(__sincosf_rng), "r"(__sincosf_lut) 
146 |     : "d0", "d1", "d2", "d3", "d4", "d5"
147 | 	);
148 | #else
149 | 	sincosf_c(x, r);
150 | #endif
151 | }
152 | 
/*
Wrapper around sincosf_neon_hfp() that first copies r0 into both lanes of d0.

NOTE(review): presumably the soft-float-ABI entry point, relying on x still
being in r0 when the first asm statement runs -- confirm against math_neon.h.
NOTE(review): the trailing "vmov r0, s0" writes r0 although this function
returns void; it looks like a leftover from the float-returning wrappers.

Without __MATH_NEON the call is forwarded to sincosf_c().
*/
void sincosf_neon_sfp(float x, float r[2])
{
#ifdef __MATH_NEON
	asm volatile ("vdup.f32 d0, r0 		\n\t");	//d0 = {r0, r0}
	sincosf_neon_hfp(x, r);
	asm volatile ("vmov.f32 r0, s0 		\n\t");	//r0 = s0
#else 
    sincosf_c(x, r);
#endif
}
163 | 
164 | 


--------------------------------------------------------------------------------
/math_sinf.c:
--------------------------------------------------------------------------------
  1 | /*
  2 | The MIT License (MIT)
  3 | 
  4 | Copyright (c) 2015 Lachlan Tychsen-Smith (lachlan.ts@gmail.com)
  5 | 
  6 | Permission is hereby granted, free of charge, to any person obtaining a copy
  7 | of this software and associated documentation files (the "Software"), to deal
  8 | in the Software without restriction, including without limitation the rights
  9 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 10 | copies of the Software, and to permit persons to whom the Software is
 11 | furnished to do so, subject to the following conditions:
 12 | 
 13 | The above copyright notice and this permission notice shall be included in
 14 | all copies or substantial portions of the Software.
 15 | 
 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 19 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 20 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 21 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 22 | THE SOFTWARE.
 23 | */
 24 | 
 25 | #include <math.h>
 26 | #include "math_neon.h"
 27 | 
 28 | static const float __sinf_rng[2] = {
 29 | 	2.0 / M_PI,
 30 | 	M_PI / 2.0
 31 | } ALIGN(16);
 32 | 
 33 | static const float __sinf_lut[4] = {
 34 | 	-0.00018365f,	//p7
 35 | 	-0.16664831f,	//p3
 36 | 	+0.00830636f,	//p5
 37 | 	+0.99999661f,	//p1
 38 | } ALIGN(16);
 39 | 
 40 | float sinf_c(float x)
 41 | {
 42 | 	union {
 43 | 		float 	f;
 44 | 		int 	i;
 45 | 	} ax;
 46 | 	
 47 | 	float r, a, b, xx;
 48 | 	int m, n;
 49 | 	
 50 | 	ax.f = fabsf(x);
 51 | 
 52 | 	//Range Reduction:
 53 | 	m = (int) (ax.f * __sinf_rng[0]);	
 54 | 	ax.f = ax.f - (((float)m) * __sinf_rng[1]);
 55 | 
 56 | 	//Test Quadrant
 57 | 	n = m & 1;
 58 | 	ax.f = ax.f - n * __sinf_rng[1];	
 59 | 	m = m >> 1;
 60 | 	n = n ^ m;
 61 | 	m = (x < 0.0);
 62 | 	n = n ^ m;	
 63 | 	n = n << 31;
 64 | 	ax.i = ax.i ^ n; 
 65 | 
 66 | 	//Taylor Polynomial (Estrins)
 67 | 	xx = ax.f * ax.f;	
 68 | 	a = (__sinf_lut[0] * ax.f) * xx + (__sinf_lut[2] * ax.f);
 69 | 	b = (__sinf_lut[1] * ax.f) * xx + (__sinf_lut[3] * ax.f);
 70 | 	xx = xx * xx;
 71 | 	r = b + a * xx;
 72 | 
 73 | 	return r;
 74 | }
 75 | 
 76 | float sinf_neon_hfp(float x)
 77 | {
 78 | #ifdef __MATH_NEON
 79 | 	asm volatile (
 80 | 	
 81 | 	"vld1.32 		d3, [%0]				\n\t"	//d3 = {invrange, range}
 82 | 	"vdup.f32 		d0, d0[0]				\n\t"	//d0 = {x, x}
 83 | 	"vabs.f32 		d1, d0					\n\t"	//d1 = {ax, ax}
 84 | 	
 85 | 	"vmul.f32 		d2, d1, d3[0]			\n\t"	//d2 = d1 * d3[0] 
 86 | 	"vcvt.u32.f32 	d2, d2					\n\t"	//d2 = (int) d2
 87 | 	"vmov.i32	 	d5, #1					\n\t"	//d5 = 1	
 88 | 	"vcvt.f32.u32 	d4, d2					\n\t"	//d4 = (float) d2	
 89 | 	"vshr.u32 		d7, d2, #1				\n\t"	//d7 = d2 >> 1
 90 | 	"vmls.f32 		d1, d4, d3[1]			\n\t"	//d1 = d1 - d4 * d3[1]
 91 | 	
 92 | 	"vand.i32 		d5, d2, d5				\n\t"	//d5 = d2 & d5
 93 | 	"vclt.f32 		d18, d0, #0				\n\t"	//d18 = (d0 < 0.0)
 94 | 	"vcvt.f32.u32 	d6, d5					\n\t"	//d6 = (float) d5
 95 | 	"vmls.f32 		d1, d6, d3[1]			\n\t"	//d1 = d1 - d6 * d3[1]
 96 | 	"veor.i32 		d5, d5, d7				\n\t"	//d5 = d5 ^ d7	
 97 | 	"vmul.f32 		d2, d1, d1				\n\t"	//d2 = d1*d1 = {x^2, x^2}	
 98 | 	
 99 | 	"vld1.32 		{d16, d17}, [%1]		\n\t"	//q8 = {p7, p3, p5, p1}
100 | 	"veor.i32 		d5, d5, d18				\n\t"	//d5 = d5 ^ d18	
101 | 	"vshl.i32 		d5, d5, #31				\n\t"	//d5 = d5 << 31
102 | 	"veor.i32 		d1, d1, d5				\n\t"	//d1 = d1 ^ d5
103 | 	
104 | 	"vmul.f32 		d3, d2, d2				\n\t"	//d3 = d2*d2 = {x^4, x^4}		
105 | 	"vmul.f32 		q0, q8, d1[0]			\n\t"	//q0 = q8 * d1[0] = {p7x, p3x, p5x, p1x}
106 | 	"vmla.f32 		d1, d0, d2[0]			\n\t"	//d1 = d1 + d0*d2 = {p5x + p7x^3, p1x + p3x^3}		
107 | 	"vmla.f32 		d1, d3, d1[0]			\n\t"	//d1 = d1 + d3*d0 = {...., p1x + p3x^3 + p5x^5 + p7x^7}		
108 | 
109 | 	"vmov.f32 		s0, s3					\n\t"	//s0 = s3
110 | 	: 
111 | 	: "r"(__sinf_rng), "r"(__sinf_lut) 
112 |     : "q0", "q1", "q2", "q3", "q8", "q9"
113 | 	);
114 | #endif
115 | }
116 | 
/*
Wrapper around sinf_neon_hfp(): copies r0 into both lanes of d0, calls the
NEON routine, then copies s0 back into r0. That path has no C-level return
statement; the value is handed back through r0 by the last asm statement.
NOTE(review): presumably the soft-float-ABI entry point (x in r0, result
expected in r0) -- confirm against math_neon.h.

Without __MATH_NEON the portable sinf_c() result is returned.
*/
float sinf_neon_sfp(float x)
{
#ifdef __MATH_NEON
	asm volatile ("vdup.f32 d0, r0 		\n\t");	//d0 = {r0, r0}
	sinf_neon_hfp(x);
	asm volatile ("vmov.f32 r0, s0 		\n\t");	//r0 = s0
#else
	return sinf_c(x);
#endif

}
128 | 
129 | 


--------------------------------------------------------------------------------
/math_sinfv.c:
--------------------------------------------------------------------------------
  1 | /*
  2 | The MIT License (MIT)
  3 | 
  4 | Copyright (c) 2015 Lachlan Tychsen-Smith (lachlan.ts@gmail.com)
  5 | 
  6 | Permission is hereby granted, free of charge, to any person obtaining a copy
  7 | of this software and associated documentation files (the "Software"), to deal
  8 | in the Software without restriction, including without limitation the rights
  9 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 10 | copies of the Software, and to permit persons to whom the Software is
 11 | furnished to do so, subject to the following conditions:
 12 | 
 13 | The above copyright notice and this permission notice shall be included in
 14 | all copies or substantial portions of the Software.
 15 | 
 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 19 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 20 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 21 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 22 | THE SOFTWARE.
 23 | */
 24 | 
 25 | #include "math.h"
 26 | #include "math_neon.h"
 27 | 
 28 | const float __sinfv_rng[2] = {
 29 | 	2.0 / M_PI,
 30 | 	M_PI / 2.0, 
 31 | };
 32 | 
 33 | const float __sinfv_lut[4] = {
 34 | 	-0.00018365f,	//p7
 35 | 	-0.16664831f,	//p3
 36 | 	+0.00830636f,	//p5
 37 | 	+0.99999661f,	//p1
 38 | };
 39 | 
 40 | void sinfv_c(float *x, int n, float *r)
 41 | {
 42 | 	union {
 43 | 		float 	f;
 44 | 		int 	i;
 45 | 	} ax, bx;
 46 | 	
 47 | 	float aa, ab, ba, bb, axx, bxx;
 48 | 	int am, bm, an, bn;
 49 | 
 50 | 	if (n & 0x1) {
 51 | 		*r++ = sinf_c(*x++);
 52 | 		n--;
 53 | 	}
 54 | 
 55 | 	float rng0 = __sinfv_rng[0];
 56 | 	float rng1 = __sinfv_rng[1];
 57 | 
 58 | 	while(n > 0){
 59 | 		
 60 | 		float x0 = *x++;
 61 | 		float x1 = *x++;
 62 | 		
 63 | 		ax.f = fabsf(x0);
 64 | 		bx.f = fabsf(x1);
 65 | 
 66 | 		//Range Reduction:
 67 | 		am = (int) (ax.f * rng0);	
 68 | 		bm = (int) (bx.f * rng0);	
 69 | 		
 70 | 		ax.f = ax.f - (((float)am) * rng1);
 71 | 		bx.f = bx.f - (((float)bm) * rng1);
 72 | 
 73 | 		//Test Quadrant
 74 | 		an = am & 1;
 75 | 		bn = bm & 1;
 76 | 		ax.f = ax.f - an * rng1;
 77 | 		bx.f = bx.f - bn * rng1;
 78 | 		am = (am & 2) >> 1;
 79 | 		bm = (bm & 2) >> 1;
 80 | 		ax.i = ax.i ^ ((an ^ am ^ (x0 < 0)) << 31);
 81 | 		bx.i = bx.i ^ ((bn ^ bm ^ (x1 < 0)) << 31);
 82 | 			
 83 | 		//Taylor Polynomial (Estrins)
 84 | 		axx = ax.f * ax.f;	
 85 | 		bxx = bx.f * bx.f;	
 86 | 		aa = (__sinfv_lut[0] * ax.f) * axx + (__sinfv_lut[2] * ax.f);
 87 | 		ba = (__sinfv_lut[0] * bx.f) * bxx + (__sinfv_lut[2] * bx.f);
 88 | 		ab = (__sinfv_lut[1] * ax.f) * axx + (__sinfv_lut[3] * ax.f);
 89 | 		bb = (__sinfv_lut[1] * bx.f) * bxx + (__sinfv_lut[3] * bx.f);
 90 | 		axx = axx * axx;
 91 | 		bxx = bxx * bxx;
 92 | 		*r++ = ab + aa * axx;
 93 | 		*r++ = bb + ba * bxx;
 94 | 		n -= 2;
 95 | 	}
 96 | 	
 97 | 	
 98 | }
 99 | 
/*
Array sine entry point: r[i] = approximation of sin(x[i]) for i in [0, n).

No NEON kernel exists for this routine. The __MATH_NEON branch used to be an
empty asm statement that left r[] unwritten, so every build now goes through
the portable sinfv_c() implementation.
*/
void sinfv_neon(float *x, int n, float *r)
{
	sinfv_c(x, n, r);
}
111 | 


--------------------------------------------------------------------------------
/math_sinhf.c:
--------------------------------------------------------------------------------
  1 | /*
  2 | The MIT License (MIT)
  3 | 
  4 | Copyright (c) 2015 Lachlan Tychsen-Smith (lachlan.ts@gmail.com)
  5 | 
  6 | Permission is hereby granted, free of charge, to any person obtaining a copy
  7 | of this software and associated documentation files (the "Software"), to deal
  8 | in the Software without restriction, including without limitation the rights
  9 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 10 | copies of the Software, and to permit persons to whom the Software is
 11 | furnished to do so, subject to the following conditions:
 12 | 
 13 | The above copyright notice and this permission notice shall be included in
 14 | all copies or substantial portions of the Software.
 15 | 
 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 19 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 20 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 21 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 22 | THE SOFTWARE.
 23 | */
 24 | 
 25 | #include "math.h"
 26 | #include "math_neon.h"
 27 | 
 28 | const float __sinhf_rng[2] = {
 29 | 	1.442695041f,
 30 | 	0.693147180f
 31 | };
 32 | 
 33 | const float __sinhf_lut[16] = {
 34 | 	0.00019578093328483123,	//p7
 35 | 	0.00019578093328483123,	//p7
 36 | 	0.0014122663401803872, 	//p6
 37 | 	0.0014122663401803872, 	//p6
 38 | 	0.008336936973260111, 	//p5
 39 | 	0.008336936973260111, 	//p5
 40 | 	0.04165989275009526, 	//p4
 41 | 	0.04165989275009526, 	//p4
 42 | 	0.16666570253074878, 	//p3
 43 | 	0.16666570253074878, 	//p3
 44 | 	0.5000006143673624, 	//p2
 45 | 	0.5000006143673624, 	//p2
 46 | 	1.000000059694879, 		//p1
 47 | 	1.000000059694879, 		//p1
 48 | 	0.9999999916728642,		//p0
 49 | 	0.9999999916728642		//p0
 50 | };
 51 | 
 52 | 
 53 | float sinhf_c(float x)
 54 | {
 55 | 	float a, b, xx;
 56 | 	xx = -x;
 57 | 	a = expf_c(x);
 58 | 	b = expf_c(xx);
 59 | 	a = a - b;
 60 | 	a = a * 0.5f;
 61 | 	return a;
 62 | }
 63 | 
 64 | 
 65 | float sinhf_neon_hfp(float x)
 66 | {
 67 | #ifdef __MATH_NEON
 68 | 	asm volatile (
 69 | 	"vdup.f32 		d0, d0[0]				\n\t"	//d0 = {x, x}	
 70 | 	"fnegs 			s1, s1					\n\t"	//s1 = -s1
 71 | 	
 72 | 	//Range Reduction:
 73 | 	"vld1.32 		d2, [%0]				\n\t"	//d2 = {invrange, range}
 74 | 	"vld1.32 		{d16, d17}, [%1]!		\n\t"	
 75 | 	"vmul.f32 		d6, d0, d2[0]			\n\t"	//d6 = d0 * d2[0] 
 76 | 	"vcvt.s32.f32 	d6, d6					\n\t"	//d6 = (int) d6
 77 | 	"vld1.32 		{d18}, [%1]!			\n\t"	
 78 | 	"vcvt.f32.s32 	d1, d6					\n\t"	//d1 = (float) d6
 79 | 	"vld1.32 		{d19}, [%1]!			\n\t"	
 80 | 	"vmls.f32 		d0, d1, d2[1]			\n\t"	//d0 = d0 - d1 * d2[1]
 81 | 	"vld1.32 		{d20}, [%1]!			\n\t"	
 82 | 		
 83 | 	//polynomial:
 84 | 	"vmla.f32 		d17, d16, d0			\n\t"	//d17 = d17 + d16 * d0;	
 85 | 	"vld1.32 		{d21}, [%1]!			\n\t"	
 86 | 	"vmla.f32 		d18, d17, d0			\n\t"	//d18 = d18 + d17 * d0;	
 87 | 	"vld1.32 		{d22}, [%1]!			\n\t"	
 88 | 	"vmla.f32 		d19, d18, d0			\n\t"	//d19 = d19 + d18 * d0;	
 89 | 	"vld1.32 		{d23}, [%1]!			\n\t"	
 90 | 	"vmla.f32 		d20, d19, d0			\n\t"	//d20 = d20 + d19 * d0;	
 91 | 	"vmla.f32 		d21, d20, d0			\n\t"	//d21 = d21 + d20 * d0;	
 92 | 	"vmla.f32 		d22, d21, d0			\n\t"	//d22 = d22 + d21 * d0;	
 93 | 	"vmla.f32 		d23, d22, d0			\n\t"	//d23 = d23 + d22 * d0;	
 94 | 	
 95 | 	//multiply by 2 ^ m 	
 96 | 	"vshl.i32 		d6, d6, #23				\n\t"	//d6 = d6 << 23		
 97 | 	"vadd.i32 		d0, d23, d6				\n\t"	//d0 = d22 + d6		
 98 | 
 99 | 	"vdup.f32 		d2, d0[1]				\n\t"	//d2 = s1		
100 | 	"vmov.f32 		d1, #0.5				\n\t"	//d1 = 0.5		
101 | 	"vsub.f32 		d0, d0, d2				\n\t"	//d0 = d0 - d2		
102 | 	"vmul.f32 		d0, d1					\n\t"	//d0 = d0 * d1		
103 | 
104 | 	:: "r"(__sinhf_rng), "r"(__sinhf_lut) 
105 |     : "d0", "d1", "q1", "q2", "d6"
106 | 	);
107 | 	
108 | #endif
109 | }
110 | 
/*
Wrapper around sinhf_neon_hfp(): copies r0 into s0, calls the NEON routine,
then copies s0 back into r0. That path has no C-level return statement; the
value is handed back through r0 by the last asm statement.
NOTE(review): presumably the soft-float-ABI entry point (x in r0, result
expected in r0) -- confirm against math_neon.h.

Without __MATH_NEON the portable sinhf_c() result is returned.
*/
float sinhf_neon_sfp(float x)
{
#ifdef __MATH_NEON
	asm volatile ("vmov.f32 s0, r0 		\n\t");	//s0 = r0
	sinhf_neon_hfp(x);
	asm volatile ("vmov.f32 r0, s0 		\n\t");	//r0 = s0
#else
	return sinhf_c(x);
#endif
}
121 | 


--------------------------------------------------------------------------------
/math_sqrtf.c:
--------------------------------------------------------------------------------
  1 | /*
  2 | The MIT License (MIT)
  3 | 
  4 | Copyright (c) 2015 Lachlan Tychsen-Smith (lachlan.ts@gmail.com)
  5 | 
  6 | Permission is hereby granted, free of charge, to any person obtaining a copy
  7 | of this software and associated documentation files (the "Software"), to deal
  8 | in the Software without restriction, including without limitation the rights
  9 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 10 | copies of the Software, and to permit persons to whom the Software is
 11 | furnished to do so, subject to the following conditions:
 12 | 
 13 | The above copyright notice and this permission notice shall be included in
 14 | all copies or substantial portions of the Software.
 15 | 
 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 19 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 20 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 21 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 22 | THE SOFTWARE.
 23 | */
 24 | 
 25 | /*
 26 | Test func : sqrtf(x)
 27 | Test Range: 0 < x < 1,000,000,000
 28 | Peak Error:	~0.0010%
 29 | RMS  Error: ~0.0005%
 30 | */
 31 | 
 32 | #include "math.h"
 33 | #include "math_neon.h"
 34 | 
 35 | float sqrtf_c(float x)
 36 | {
 37 | 
 38 | 	float b, c;
 39 | 	int m;
 40 | 	union {
 41 | 		float 	f;
 42 | 		int 	i;
 43 | 	} a;
 44 | 	
 45 | 	//fast invsqrt approx
 46 | 	a.f = x;
 47 | 	a.i = 0x5F3759DF - (a.i >> 1);		//VRSQRTE
 48 | 	c = x * a.f;
 49 | 	b = (3.0f - c * a.f) * 0.5;		//VRSQRTS
 50 | 	a.f = a.f * b;		
 51 | 	c = x * a.f;
 52 | 	b = (3.0f - c * a.f) * 0.5;
 53 |     a.f = a.f * b;	
 54 | 
 55 | 	//fast inverse approx
 56 | 	x = a.f;
 57 | 	m = 0x3F800000 - (a.i & 0x7F800000);
 58 | 	a.i = a.i + m;
 59 | 	a.f = 1.41176471f - 0.47058824f * a.f;
 60 | 	a.i = a.i + m;
 61 | 	b = 2.0 - a.f * x;
 62 | 	a.f = a.f * b;	
 63 | 	b = 2.0 - a.f * x;
 64 | 	a.f = a.f * b;
 65 | 
 66 | 	return a.f;
 67 | }
 68 | 
 69 | float sqrtf_neon_hfp(float x)
 70 | {
 71 | #ifdef __MATH_NEON
 72 | 	asm volatile (
 73 | 		
 74 | 	//fast invsqrt approx
 75 | 	"vmov.f32 		d1, d0					\n\t"	//d1 = d0
 76 | 	"vrsqrte.f32 	d0, d0					\n\t"	//d0 = ~ 1.0 / sqrt(d0)
 77 | 	"vmul.f32 		d2, d0, d1				\n\t"	//d2 = d0 * d1
 78 | 	"vrsqrts.f32 	d3, d2, d0				\n\t"	//d3 = (3 - d0 * d2) / 2 	
 79 | 	"vmul.f32 		d0, d0, d3				\n\t"	//d0 = d0 * d3
 80 | 	"vmul.f32 		d2, d0, d1				\n\t"	//d2 = d0 * d1	
 81 | 	"vrsqrts.f32 	d3, d2, d0				\n\t"	//d4 = (3 - d0 * d3) / 2	
 82 | 	"vmul.f32 		d0, d0, d3				\n\t"	//d0 = d0 * d3	
 83 | 		
 84 | 	//fast reciporical approximation
 85 | 	"vrecpe.f32		d1, d0					\n\t"	//d1 = ~ 1 / d0; 
 86 | 	"vrecps.f32		d2, d1, d0				\n\t"	//d2 = 2.0 - d1 * d0; 
 87 | 	"vmul.f32		d1, d1, d2				\n\t"	//d1 = d1 * d2; 
 88 | 	"vrecps.f32		d2, d1, d0				\n\t"	//d2 = 2.0 - d1 * d0; 
 89 | 	"vmul.f32		d0, d1, d2				\n\t"	//d0 = d1 * d2; 
 90 | 
 91 | 	::: "d0", "d1", "d2", "d3"
 92 | 	);
 93 | #endif
 94 | }
 95 | 
 96 | float sqrtf_neon_sfp(float x)
 97 | {
 98 | #ifdef __MATH_NEON
 99 | 	asm volatile ("vmov.f32 s0, r0 		\n\t");
100 | 	sqrtf_neon_hfp(x);
101 | 	asm volatile ("vmov.f32 r0, s0 		\n\t");
102 | #else
103 | 	return sqrtf_c(x);
104 | #endif
105 | };
106 | 


--------------------------------------------------------------------------------
/math_sqrtfv.c:
--------------------------------------------------------------------------------
  1 | /*
  2 | The MIT License (MIT)
  3 | 
  4 | Copyright (c) 2015 Lachlan Tychsen-Smith (lachlan.ts@gmail.com)
  5 | 
  6 | Permission is hereby granted, free of charge, to any person obtaining a copy
  7 | of this software and associated documentation files (the "Software"), to deal
  8 | in the Software without restriction, including without limitation the rights
  9 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 10 | copies of the Software, and to permit persons to whom the Software is
 11 | furnished to do so, subject to the following conditions:
 12 | 
 13 | The above copyright notice and this permission notice shall be included in
 14 | all copies or substantial portions of the Software.
 15 | 
 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 19 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 20 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 21 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 22 | THE SOFTWARE.
 23 | */
 24 | /*
 25 | Test func : sqrtf(x)
 26 | Test Range: 0 < x < 1,000,000,000
 27 | Peak Error:	~0.0010%
 28 | RMS  Error: ~0.0005%
 29 | */
 30 | 
 31 | #include "math.h"
 32 | #include "math_neon.h"
 33 | 
 34 | void sqrtfv_c(float *x, int n, float *r)
 35 | {
 36 | 
 37 | 	float x0, x1;
 38 | 	float b0, b1, c0, c1;
 39 | 	int m0, m1;
 40 | 	union {
 41 | 		float 	f;
 42 | 		int 	i;
 43 | 	} a0, a1;
 44 | 
 45 | 
 46 | 	if (n & 0x1){
 47 | 		*r++ = sqrtf_c(*x++);
 48 | 		n--;
 49 | 	}
 50 | 
 51 | 	while(n > 0){
 52 | 	
 53 | 		x0 = *x++;
 54 | 		x1 = *x++;
 55 | 	
 56 | 		//fast invsqrt approx
 57 | 		a0.f = x0;
 58 | 		a1.f = x1;
 59 | 		a0.i = 0x5F3759DF - (a0.i >> 1);		//VRSQRTE
 60 | 		a1.i = 0x5F3759DF - (a1.i >> 1);		//VRSQRTE
 61 | 		c0 = x0 * a0.f;
 62 | 		c1 = x1 * a1.f;
 63 | 		b0 = (3.0f - c0 * a0.f) * 0.5;		//VRSQRTS
 64 | 		b1 = (3.0f - c1 * a1.f) * 0.5;		//VRSQRTS
 65 | 		a0.f = a0.f * b0;		
 66 | 		a1.f = a1.f * b1;		
 67 | 		c0 = x0 * a0.f;
 68 | 		c1 = x1 * a1.f;
 69 | 		b0 = (3.0f - c0 * a0.f) * 0.5;		//VRSQRTS
 70 | 		b1 = (3.0f - c1 * a1.f) * 0.5;		//VRSQRTS
 71 | 		a0.f = a0.f * b0;		
 72 | 		a1.f = a1.f * b1;		
 73 | 
 74 | 		//fast inverse approx
 75 | 		c0 = a0.f;
 76 | 		c0 = a1.f;
 77 | 		m0 = 0x3F800000 - (a0.i & 0x7F800000);
 78 | 		m1 = 0x3F800000 - (a1.i & 0x7F800000);
 79 | 		a0.i = a0.i + m0;
 80 | 		a1.i = a1.i + m1;
 81 | 		a0.f = 1.41176471f - 0.47058824f * a0.f;
 82 | 		a1.f = 1.41176471f - 0.47058824f * a1.f;
 83 | 		a0.i = a0.i + m0;
 84 | 		a1.i = a1.i + m1;
 85 | 		b0 = 2.0 - a0.f * c0;
 86 | 		b1 = 2.0 - a1.f * c1;
 87 | 		a0.f = a0.f * b0;	
 88 | 		a1.f = a1.f * b1;	
 89 | 		b0 = 2.0 - a0.f * c0;
 90 | 		b1 = 2.0 - a1.f * c1;
 91 | 		a0.f = a0.f * b0;
 92 | 		a1.f = a1.f * b1;
 93 | 		
 94 | 		*r++ = a0.f;
 95 | 		*r++ = a1.f;
 96 | 		n -= 2;
 97 | 
 98 | 	}
 99 | }
100 | 
/*
Per-lane kernel used by sqrtfv_neon(): d0 = ~sqrt(d0), with d1-d3 as scratch.
Reciprocal square root estimate with two refinement steps, then a reciprocal
estimate with two refinement steps.
*/
#define SQRTFV_NEON_KERNEL \
	"vmov.f32 		d1, d0					\n\t"	/* d1 = d0 */ \
	"vrsqrte.f32 	d0, d0					\n\t"	/* d0 = ~ 1.0 / sqrt(d0) */ \
	"vmul.f32 		d2, d0, d1				\n\t"	/* d2 = d0 * d1 */ \
	"vrsqrts.f32 	d3, d2, d0				\n\t"	/* d3 = (3 - d0 * d2) / 2 */ \
	"vmul.f32 		d0, d0, d3				\n\t"	/* d0 = d0 * d3 */ \
	"vmul.f32 		d2, d0, d1				\n\t"	/* d2 = d0 * d1 */ \
	"vrsqrts.f32 	d3, d2, d0				\n\t"	/* d3 = (3 - d0 * d2) / 2 */ \
	"vmul.f32 		d0, d0, d3				\n\t"	/* d0 = d0 * d3 */ \
	"vrecpe.f32		d1, d0					\n\t"	/* d1 = ~ 1 / d0 */ \
	"vrecps.f32		d2, d1, d0				\n\t"	/* d2 = 2.0 - d1 * d0 */ \
	"vmul.f32		d1, d1, d2				\n\t"	/* d1 = d1 * d2 */ \
	"vrecps.f32		d2, d1, d0				\n\t"	/* d2 = 2.0 - d1 * d0 */ \
	"vmul.f32		d0, d1, d2				\n\t"	/* d0 = d1 * d2 */

/*
Array square root: r[i] = approximation of sqrt(x[i]) for i in [0, n).

x : input values (n values)
n : element count; nothing is read or written when n <= 0
r : output array (n values)

The pointers and the count are passed to the asm as read-write operands, so
the code no longer depends on them sitting in r0/r1/r2 and no longer returns
from inside the asm. An odd count is handled by running the kernel once on a
single broadcast element (previously that element was copied through
unprocessed); the remainder is processed two elements per iteration.
Without __MATH_NEON the call is forwarded to sqrtfv_c().
*/
void sqrtfv_neon(float *x, int n, float *r)
{
#ifdef __MATH_NEON
	//The pair loop below tests its counter after the body, so an empty
	//request has to be rejected up front.
	if (n <= 0)
		return;

	asm volatile (

	"tst 			%[n], #1 				\n\t"	//n & 1
	"beq 			1f 						\n\t"	//even count: go to the pair loop

	//odd count: peel one element
	"vld1.32		{d0[]}, [%[x]]! 		\n\t"	//d0 = {*x, *x}, x++
	SQRTFV_NEON_KERNEL
	"vst1.32		{d0[0]}, [%[r]]! 		\n\t"	//*r++ = d0[0]
	"subs 			%[n], %[n], #1			\n\t"	//n = n - 1; update flags
	"beq 			2f						\n\t"	//nothing left

	"1:				 						\n\t"	//
	"vld1.32 		{d0}, [%[x]]! 			\n\t"	//d0 = {x[0], x[1]}, x += 2
	SQRTFV_NEON_KERNEL
	"vst1.32 		{d0}, [%[r]]!			\n\t"	//r[0] = d0[0], r[1] = d0[1], r += 2
	"subs 			%[n], %[n], #2			\n\t"	//n = n - 2; update flags
	"bgt 			1b 						\n\t"	//

	"2:				 						\n\t"	//

	: [x] "+r"(x), [n] "+r"(n), [r] "+r"(r)
	:
	//"cc": tst/subs change the flags; "memory": the stores write r[]
	: "d0", "d1", "d2", "d3", "cc", "memory"
	);
#else
	sqrtfv_c(x, n, r);
#endif
}

#undef SQRTFV_NEON_KERNEL
148 | 


--------------------------------------------------------------------------------
/math_tanf.c:
--------------------------------------------------------------------------------
  1 | /*
  2 | The MIT License (MIT)
  3 | 
  4 | Copyright (c) 2015 Lachlan Tychsen-Smith (lachlan.ts@gmail.com)
  5 | 
  6 | Permission is hereby granted, free of charge, to any person obtaining a copy
  7 | of this software and associated documentation files (the "Software"), to deal
  8 | in the Software without restriction, including without limitation the rights
  9 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 10 | copies of the Software, and to permit persons to whom the Software is
 11 | furnished to do so, subject to the following conditions:
 12 | 
 13 | The above copyright notice and this permission notice shall be included in
 14 | all copies or substantial portions of the Software.
 15 | 
 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 19 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 20 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 21 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 22 | THE SOFTWARE.
 23 | */
 24 | 
 25 | #include "math.h"
 26 | #include "math_neon.h"
 27 | 
 28 | const float __tanf_rng[2] = {
 29 | 	2.0 / M_PI,
 30 | 	M_PI / 2.0
 31 | };
 32 | 
 33 | const float __tanf_lut[4] = {
 34 | 	-0.00018365f,	//p7
 35 | 	-0.16664831f,	//p3
 36 | 	+0.00830636f,	//p5
 37 | 	+0.99999661f,	//p1
 38 | };
 39 |  
 40 | float tanf_c(float x){
 41 | 
 42 | 	union {
 43 | 		float f;
 44 | 		int i;
 45 | 	} ax, c;
 46 | 
 47 | 	float r, a, b, xx, cc, cx;
 48 | 	int m;
 49 | 	
 50 | 	ax.f = fabsf(x);
 51 | 
 52 | 	//Range Reduction:
 53 | 	m = (int) (ax.f * __tanf_rng[0]);	
 54 | 	ax.f = ax.f - (((float)m) * __tanf_rng[1]);
 55 | 
 56 | 	//Test Quadrant
 57 | 	ax.f = ax.f - (m & 1) * __tanf_rng[1];
 58 | 	ax.i = ax.i ^ ((*(int*)&x) & 0x80000000);
 59 | 		
 60 | 	//Taylor Polynomial (Estrins)
 61 | 	xx = ax.f * ax.f;	
 62 | 	a = (__tanf_lut[0] * ax.f) * xx + (__tanf_lut[2] * ax.f);
 63 | 	b = (__tanf_lut[1] * ax.f) * xx + (__tanf_lut[3] * ax.f);
 64 | 	xx = xx * xx;
 65 | 	r = b + a * xx;
 66 | 
 67 | 	//cosine
 68 | 	c.f = 1.0 - r * r;
 69 | 	
 70 | 	//fast invsqrt approximation (2x newton iterations)
 71 |     cc = c.f;
 72 | 	c.i = 0x5F3759DF - (c.i >> 1);		//VRSQRTE
 73 | 	cx = cc * c.f;
 74 | 	a = (3.0f - cx * c.f) / 2;			//VRSQRTS
 75 | 	c.f = c.f * a;		
 76 | 	cx = cc * c.f;
 77 | 	a = (3.0f - cx * c.f) / 2;
 78 |     c.f = c.f * a;	
 79 | 
 80 | 	r = r * c.f;
 81 | 	
 82 | 	return r;
 83 | }
 84 | 
 85 | /* tan(x) in NEON assembly. Reads x from d0[0] and leaves the result in s0; there is no C return statement (presumably relies on a hard-float calling convention - confirm). Body is empty without __MATH_NEON. */
 86 | float tanf_neon_hfp(float x)
 87 | {
 88 | #ifdef __MATH_NEON
 89 | 	asm volatile (
 90 | 
 91 | 	"vdup.f32 		d0, d0[0]				\n\t"	//d0 = {x, x}
 92 | 	"vabs.f32 		d1, d0					\n\t"	//d1 = {ax, ax}
 93 | 	
 94 | 	//Range Reduction:
 95 | 	"vld1.32 		d3, [%0]				\n\t"	//d3 = {invrange, range}
 96 | 	"vmul.f32 		d2, d1, d3[0]			\n\t"	//d2 = d1 * d3[0] 
 97 | 	"vcvt.u32.f32 	d2, d2					\n\t"	//d2 = (int) d2
 98 | 	"vcvt.f32.u32 	d4, d2					\n\t"	//d4 = (float) d2
 99 | 	"vmls.f32 		d1, d4, d3[1]			\n\t"	//d1 = d1 - d4 * d3[1]
100 | 	
101 | 	//Checking Quadrant:
102 | 	//ax = ax - (k&1) * M_PI_2
103 | 	"vmov.i32 		d4, #1					\n\t"	//d4 = 1
104 | 	"vand.i32 		d2, d2, d4				\n\t"	//d2 = d2 & d4
105 | 	"vcvt.f32.u32 	d2, d2					\n\t"	//d2 = (float) d2
106 | 	"vmls.f32 		d1, d2, d3[1]			\n\t"	//d1 = d1 - d2 * d3[1]
107 | 	
108 | 	//ax = ax ^ ( x.i & 0x80000000), i.e. flip the sign of ax when x is negative
109 | 	"vmov.i32 		d4, #0x80000000			\n\t"	//d4 = 0x80000000
110 | 	"vand.i32 		d0, d0, d4				\n\t"	//d0 = d0 & d4
111 | 	"veor.i32 		d1, d1, d0				\n\t"	//d1 = d1 ^ d0
112 | 	
113 | 	//polynomial:
114 | 	"vmul.f32 		d2, d1, d1				\n\t"	//d2 = d1*d1 = {x^2, x^2}	
115 | 	"vld1.32 		{d4, d5}, [%1]			\n\t"	//d4 = {p7, p3}, d5 = {p5, p1}
116 | 	"vmul.f32 		d3, d2, d2				\n\t"	//d3 = d2*d2 = {x^4, x^4}		
117 | 	"vmul.f32 		q0, q2, d1[0]			\n\t"	//q0 = q2 * d1[0] = {p7x, p3x, p5x, p1x}; overwrites d0 and d1
118 | 	"vmla.f32 		d1, d0, d2[0]			\n\t"	//d1 = d1 + d0*d2[0] = {p5x + p7x^3, p1x + p3x^3}		
119 | 	"vmla.f32 		d1, d3, d1[0]			\n\t"	//d1 = d1 + d3*d1[0] = {..., p1x + p3x^3 + p5x^5 + p7x^7}; result (sx) is in lane 1		
120 | 	
121 | 	//cosine
122 | 	"vmov.f32 		s1, #1.0				\n\t"	//d0[1] = 1.0
123 | 	"vmls.f32 		d0, d1, d1				\n\t"	//d0 = {..., 1.0 - sx*sx}
124 | 	
125 | 	//invsqrt approx
126 | 	"vmov.f32 		d2, d0					\n\t"	//d2 = d0
127 | 	"vrsqrte.f32 	d0, d0					\n\t"	//d0 = ~ 1.0 / sqrt(d0)
128 | 	"vmul.f32 		d3, d0, d2				\n\t"	//d3 = d0 * d2
129 | 	"vrsqrts.f32 	d4, d3, d0				\n\t"	//d4 = (3 - d0 * d3) / 2 	
130 | 	"vmul.f32 		d0, d0, d4				\n\t"	//d0 = d0 * d4	
131 | 	"vmul.f32 		d3, d0, d2				\n\t"	//d3 = d0 * d2	
132 | 	"vrsqrts.f32 	d4, d3, d0				\n\t"	//d4 = (3 - d0 * d3) / 2	
133 | 	"vmul.f32 		d0, d0, d4				\n\t"	//d0 = d0 * d4	
134 | 	
135 | 	"vmul.f32 		d0, d0, d1				\n\t"	//d0 = d0 * d1 = sx / sqrt(1.0 - sx*sx) in lane 1
136 | 	
137 | 	"vmov.f32 		s0, s1					\n\t"	//s0 = s1: copy the lane-1 result into s0
138 | 	
139 | 	:: "r"(__tanf_rng), "r"(__tanf_lut) 
140 |     : "d0", "d1", "d2", "d3", "d4", "d5"
141 | 	);
142 | #endif
143 | }
144 | 
145 | /* tanf wrapper that passes the value through core register r0: copies r0 into d0, runs tanf_neon_hfp, then copies s0 back to r0. Uses tanf_c() when __MATH_NEON is not defined. */
146 | float tanf_neon_sfp(float x)
147 | {
148 | #ifdef __MATH_NEON
149 | 	asm volatile ("vdup.f32 d0, r0 		\n\t");	//d0 = {r0, r0}; assumes x is still held in r0 here
150 | 	tanf_neon_hfp(x);
151 | 	asm volatile ("vmov.f32 r0, s0 		\n\t");	//r0 = s0; this path has no C return statement
152 | #else
153 | 	return tanf_c(x);
154 | #endif
155 | }
156 | 
157 | 


--------------------------------------------------------------------------------
/math_tanhf.c:
--------------------------------------------------------------------------------
 1 | /*
 2 | The MIT License (MIT)
 3 | 
 4 | Copyright (c) 2015 Lachlan Tychsen-Smith (lachlan.ts@gmail.com)
 5 | 
 6 | Permission is hereby granted, free of charge, to any person obtaining a copy
 7 | of this software and associated documentation files (the "Software"), to deal
 8 | in the Software without restriction, including without limitation the rights
 9 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 | copies of the Software, and to permit persons to whom the Software is
11 | furnished to do so, subject to the following conditions:
12 | 
13 | The above copyright notice and this permission notice shall be included in
14 | all copies or substantial portions of the Software.
15 | 
16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22 | THE SOFTWARE.
23 | */
24 | 
25 | #include "math.h"
26 | #include "math_neon.h"
27 |  
28 | /* 
29 | TanH = (e^x - e^-x) / (e^x + e^-x)
30 | TanH = (e^x - e^-x)(e^x) / (e^x + e^-x)(e^x)
31 | TanH = (e^2x - 1) / (e^2x + 1)
32 | 
33 | */
34 | /* tanh(x) = (e^2x - 1) / (e^2x + 1). The division is replaced by an approximate reciprocal of (e^2x + 1) refined with two Newton-Raphson steps. No range or overflow checks. */
35 | float tanhf_c(float x)
36 | {
37 | 	float a, b, c;
38 | 	int m;
39 | 	union{
40 | 		float 	f;
41 | 		int 	i;
42 | 	} xx;
43 | 	
44 | 	x = 2.0f * x;
45 | 	a = expf_c(x);		//a = e^2x
46 | 	c = a + 1.0f;		//denominator
47 | 		
48 | 	//reciprocal approx.
49 | 	xx.f = c;
50 | 	m = 0x3F800000 - (xx.i & 0x7F800000);		//offset that moves c's exponent field to the one used by 1.0
51 | 	xx.i = xx.i + m;							//c rescaled into [1, 2)
52 | 	xx.f = 1.41176471f - 0.47058824f * xx.f;	//linear first guess for the reciprocal
53 | 	xx.i = xx.i + m;							//apply the same exponent offset again => guess for 1 / c
54 | 	b = 2.0f - xx.f * c;		//Newton-Raphson step 1 (float literal: stays in single precision)
55 | 	xx.f = xx.f * b;	
56 | 	b = 2.0f - xx.f * c;		//Newton-Raphson step 2
57 | 	xx.f = xx.f * b;
58 | 	c = a - 1.0f;				//numerator
59 | 	xx.f *= c;
60 | 	return xx.f;
61 | }
62 | 
63 | /* tanh(x) in NEON assembly: doubles d0, calls expf_neon_hfp, then forms (e - 1) * 1/(e + 1) from the value found in d0. Assumes expf_neon_hfp takes its input from and leaves its result in d0 - TODO confirm. No C return statement. */
64 | float tanhf_neon_hfp(float x)
65 | {
66 | #ifdef __MATH_NEON
67 | 	asm volatile ("vadd.f32 d0, d0, d0 		\n\t");	//d0 = 2 * d0
68 | 	expf_neon_hfp(x);
69 | 	asm volatile (
70 | 	"vmov.f32 		d2, #1.0 				\n\t"	//d2 = 1.0
71 | 	"vsub.f32 		d3, d0, d2 				\n\t"	//d3 = d0 - 1.0 (numerator)
72 | 	"vadd.f32 		d0, d0, d2 				\n\t"	//d0 = d0 + 1.0 (denominator)
73 | 
74 | 	"vrecpe.f32		d1, d0					\n\t"	//d1 = ~ 1 / d0; 
75 | 	"vrecps.f32		d2, d1, d0				\n\t"	//d2 = 2.0 - d1 * d0; 
76 | 	"vmul.f32		d1, d1, d2				\n\t"	//d1 = d1 * d2; 
77 | 	"vrecps.f32		d2, d1, d0				\n\t"	//d2 = 2.0 - d1 * d0; 
78 | 	"vmul.f32		d0, d1, d2				\n\t"	//d0 = d1 * d2; refined reciprocal of the denominator
79 | 	"vmul.f32		d0, d0, d3				\n\t"	//d0 = d0 * d3; numerator * reciprocal
80 | 	::: "d0", "d1", "d2", "d3"
81 | 	);	
82 | #endif
83 | }
84 | /* tanhf wrapper that passes the value through core register r0: copies r0 into s0, runs tanhf_neon_hfp, then copies s0 back to r0. Uses tanhf_c() when __MATH_NEON is not defined. */
85 | float tanhf_neon_sfp(float x)
86 | {
87 | #ifdef __MATH_NEON
88 | 	asm volatile ("vmov.f32 s0, r0 		\n\t");	//s0 = r0; assumes x is still held in r0 here
89 | 	tanhf_neon_hfp(x);
90 | 	asm volatile ("vmov.f32 r0, s0 		\n\t");	//r0 = s0; this path has no C return statement
91 | #else
92 | 	return tanhf_c(x);
93 | #endif
94 | }
95 | 
96 | 


--------------------------------------------------------------------------------
/math_vec2.c:
--------------------------------------------------------------------------------
  1 | /*
  2 | The MIT License (MIT)
  3 | 
  4 | Copyright (c) 2015 Lachlan Tychsen-Smith (lachlan.ts@gmail.com)
  5 | 
  6 | Permission is hereby granted, free of charge, to any person obtaining a copy
  7 | of this software and associated documentation files (the "Software"), to deal
  8 | in the Software without restriction, including without limitation the rights
  9 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 10 | copies of the Software, and to permit persons to whom the Software is
 11 | furnished to do so, subject to the following conditions:
 12 | 
 13 | The above copyright notice and this permission notice shall be included in
 14 | all copies or substantial portions of the Software.
 15 | 
 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 19 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 20 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 21 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 22 | THE SOFTWARE.
 23 | */
 24 | 
 25 | 
 26 | #include "math_neon.h"
 27 | 
 28 | //Dot product of two 2-component vectors: v0[0]*v1[0] + v0[1]*v1[1]
 29 | float 
 30 | dot2_c(float v0[2], float v1[2])
 31 | {
 32 | 	const float first = v0[0]*v1[0];	//product of the first components
 33 | 	const float second = v0[1]*v1[1];	//product of the second components
 34 | 
 35 | 	return first + second;
 36 | }
 37 | //Writes v scaled by an approximate 1/|v| into d (bit-trick inverse square root plus two Newton-Raphson steps)
 38 | void 
 39 | normalize2_c(float v[2], float d[2])
 40 | {
 41 | 	float b, c, x;
 42 | 	union {
 43 | 		float 	f;
 44 | 		int 	i;
 45 | 	} a;
 46 | 	
 47 | 	x = v[0]*v[0];
 48 | 	x += v[1]*v[1];		//x = squared length of v
 49 | 
 50 | 	//fast invsqrt approx
 51 | 	a.f = x;
 52 | 	a.i = 0x5F3759DF - (a.i >> 1);		//VRSQRTE
 53 | 	c = x * a.f;
 54 | 	b = (3.0f - c * a.f) * 0.5f;		//VRSQRTS (0.5f keeps the step in single precision)
 55 | 	a.f = a.f * b;		
 56 | 	c = x * a.f;
 57 | 	b = (3.0f - c * a.f) * 0.5f;		//second refinement step
 58 | 	a.f = a.f * b;	
 59 | 
 60 | 	d[0] = v[0]*a.f;
 61 | 	d[1] = v[1]*a.f;
 62 | }
 63 | /* vec2 dot product in NEON assembly. The sum is left in d0 (both lanes, so s0 holds it); there is no C return statement and the body is empty without __MATH_NEON. */
 64 | float 
 65 | dot2_neon_hfp(float v0[2], float v1[2])
 66 | {
 67 | #ifdef __MATH_NEON
 68 | 	asm volatile (
 69 | 	"vld1.32 		{d2}, [%0]			\n\t"	//d2={x0,y0}
 70 | 	"vld1.32 		{d4}, [%1]			\n\t"	//d4={x1,y1}
 71 | 	"vmul.f32 		d0, d2, d4			\n\t"	//d0 = d2*d4
 72 | 	"vpadd.f32 		d0, d0, d0			\n\t"	//d0 = d[0] + d[1]
 73 | 	:: "r"(v0), "r"(v1) 
 74 |     : "d0", "d2", "d4", "memory"		//registers written above; "memory" because v0/v1 are read through pointer operands
 75 | 	);	
 76 | #endif
 77 | }
 78 | /* vec2 dot product wrapper: runs dot2_neon_hfp and copies s0 into r0. Uses dot2_c() when __MATH_NEON is not defined. */
 79 | float 
 80 | dot2_neon_sfp(float v0[2], float v1[2])
 81 | {
 82 | #ifdef __MATH_NEON
 83 | 	dot2_neon_hfp(v0, v1);
 84 | 	asm volatile ("vmov.f32 r0, s0 		\n\t");	//r0 = s0; this path has no C return statement
 85 | #else
 86 | 	return dot2_c(v0, v1);
 87 | #endif
 88 | }
 89 | /* Writes v scaled by an approximate 1/|v| into d. NEON path: VRSQRTE estimate refined by two VRSQRTS steps; otherwise calls normalize2_c(). */
 90 | void 
 91 | normalize2_neon(float v[2], float d[2])
 92 | {
 93 | #ifdef __MATH_NEON
 94 | 	asm volatile (
 95 | 	"vld1.32 		d4, [%0]				\n\t"	//d4 = {x0,y0}
 96 | 	"vmul.f32 		d0, d4, d4				\n\t"	//d0 = d4*d4
 97 | 	"vpadd.f32 		d0, d0					\n\t"	//d0 = d[0] + d[1] (squared length in both lanes)
 98 | 	
 99 | 	"vmov.f32 		d1, d0					\n\t"	//d1 = d0
100 | 	"vrsqrte.f32 	d0, d0					\n\t"	//d0 = ~ 1.0 / sqrt(d0)
101 | 	"vmul.f32 		d2, d0, d1				\n\t"	//d2 = d0 * d1
102 | 	"vrsqrts.f32 	d3, d2, d0				\n\t"	//d3 = (3 - d0 * d2) / 2 	
103 | 	"vmul.f32 		d0, d0, d3				\n\t"	//d0 = d0 * d3
104 | 	"vmul.f32 		d2, d0, d1				\n\t"	//d2 = d0 * d1	
105 | 	"vrsqrts.f32 	d3, d2, d0				\n\t"	//d3 = (3 - d0 * d2) / 2	
106 | 	"vmul.f32 		d0, d0, d3				\n\t"	//d0 = d0 * d3	
107 | 
108 | 	"vmul.f32 		d4, d4, d0[0]			\n\t"	//d4 = d4*d0[0]
109 | 	"vst1.32 		d4, [%1]				\n\t"	//store {x, y} to d
110 | 	
111 | 	:: "r"(v), "r"(d) 
112 |     : "d0", "d1", "d2", "d3", "d4", "memory"
113 | 	);	
114 | #else
115 | 	normalize2_c(v, d);
116 | #endif
117 | }
118 | 
119 | 


--------------------------------------------------------------------------------
/math_vec3.c:
--------------------------------------------------------------------------------
  1 | /*
  2 | The MIT License (MIT)
  3 | 
  4 | Copyright (c) 2015 Lachlan Tychsen-Smith (lachlan.ts@gmail.com)
  5 | 
  6 | Permission is hereby granted, free of charge, to any person obtaining a copy
  7 | of this software and associated documentation files (the "Software"), to deal
  8 | in the Software without restriction, including without limitation the rights
  9 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 10 | copies of the Software, and to permit persons to whom the Software is
 11 | furnished to do so, subject to the following conditions:
 12 | 
 13 | The above copyright notice and this permission notice shall be included in
 14 | all copies or substantial portions of the Software.
 15 | 
 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 19 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 20 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 21 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 22 | THE SOFTWARE.
 23 | */
 24 | 
 25 | #include "math_neon.h"
 26 | 
 27 | //vec3 scalar product
 28 | float 
 29 | dot3_c(float v0[3], float v1[3])
 30 | {
 31 | 	int i;
 32 | 	float acc = v0[0]*v1[0];		//first component seeds the sum
 33 | 	for (i = 1; i < 3; i++)
 34 | 		acc += v0[i]*v1[i];			//remaining components, added in index order
 35 | 	return acc;
 36 | }
 37 | //vec3 cross product: d = v0 x v1 (components are written in index order, one at a time)
 38 | void
 39 | cross3_c(float v0[3], float v1[3], float d[3])
 40 | {
 41 | 	int i;
 42 | 	for (i = 0; i < 3; i++)		//component i combines the other two axes taken in cyclic order
 43 | 		d[i] = v0[(i+1)%3]*v1[(i+2)%3] - v0[(i+2)%3]*v1[(i+1)%3];
 44 | }
 45 | //Writes v scaled by an approximate 1/|v| into d (bit-trick inverse square root plus two Newton-Raphson steps)
 46 | void 
 47 | normalize3_c(float v[3], float d[3])
 48 | {
 49 | 	float b, c, x;
 50 | 	union {
 51 | 		float 	f;
 52 | 		int 	i;
 53 | 	} a;
 54 | 	
 55 | 	x = v[0]*v[0];
 56 | 	x += v[1]*v[1];
 57 | 	x += v[2]*v[2];		//x = squared length of v
 58 | 
 59 | 	//fast invsqrt approx
 60 | 	a.f = x;
 61 | 	a.i = 0x5F3759DF - (a.i >> 1);		//VRSQRTE
 62 | 	c = x * a.f;
 63 | 	b = (3.0f - c * a.f) * 0.5f;		//VRSQRTS (0.5f keeps the step in single precision)
 64 | 	a.f = a.f * b;		
 65 | 	c = x * a.f;
 66 | 	b = (3.0f - c * a.f) * 0.5f;		//second refinement step
 67 | 	a.f = a.f * b;	
 68 | 
 69 | 	d[0] = v[0]*a.f;
 70 | 	d[1] = v[1]*a.f;
 71 | 	d[2] = v[2]*a.f;
 72 | }
 73 | 
 74 | /* vec3 dot product in NEON assembly. The sum is left in lane 0 of d0 (s0); there is no C return statement and the body is empty without __MATH_NEON. */
 75 | float 
 76 | dot3_neon_hfp(float v0[3], float v1[3])
 77 | {
 78 | #ifdef __MATH_NEON
 79 | 	asm volatile (
 80 | 	"vld1.32 		{d2}, [%0]			\n\t"	//d2={x0,y0}
 81 | 	"flds 			s6, [%0, #8]		\n\t"	//d3[0]={z0}
 82 | 	"vld1.32 		{d4}, [%1]			\n\t"	//d4={x1,y1}
 83 | 	"flds 			s10, [%1, #8]	\n\t"	//d5[0]={z1}
 84 | 
 85 | 	"vmul.f32 		d0, d2, d4			\n\t"	//d0= d2*d4
 86 | 	"vpadd.f32 		d0, d0, d0			\n\t"	//d0 = d[0] + d[1]
 87 | 	"vmla.f32 		d0, d3, d5			\n\t"	//d0 = d0 + d3*d5; only lane 0 is meaningful (d3[1], d5[1] were never loaded)
 88 | 	:: "r"(v0), "r"(v1) 
 89 |     : "d0","d1","d2","d3","d4","d5","memory"	//"memory" because v0/v1 are read through pointer operands
 90 | 	);	
 91 | #endif
 92 | }
 93 | /* vec3 dot product wrapper: runs dot3_neon_hfp and copies s0 into r0. Uses dot3_c() when __MATH_NEON is not defined. */
 94 | float 
 95 | dot3_neon_sfp(float v0[3], float v1[3])
 96 | {
 97 | #ifdef __MATH_NEON
 98 | 	dot3_neon_hfp(v0, v1);
 99 | 	asm volatile ("vmov.f32 r0, s0 		\n\t");	//r0 = s0; this path has no C return statement
100 | #else
101 | 	return dot3_c(v0, v1);
102 | #endif
103 | }
104 | 
105 | /* vec3 cross product d = v0 x v1. NEON path advances all three pointers inside the asm (hence the "+r" operands); otherwise calls cross3_c(). */
106 | void cross3_neon(float v0[3], float v1[3], float d[3])
107 | {
108 | #ifdef __MATH_NEON
109 | 	asm volatile (
110 | 	"flds 			s3, [%0]			\n\t"	//d1[1]={x0}
111 | 	"add 			%0, %0, #4			\n\t"	//v0 now points at y0
112 | 	"vld1.32 		{d0}, [%0]			\n\t"	//d0={y0,z0}
113 | 	"vmov.f32 		s2, s1		 		\n\t"	//d1[0]={z0}, so d1={z0,x0}
114 | 
115 | 	"flds 			s5, [%1]			\n\t"	//d2[1]={x1}
116 | 	"add 			%1, %1, #4			\n\t"	//v1 now points at y1
117 | 	"vld1.32 		{d3}, [%1]			\n\t"	//d3={y1,z1}
118 | 	"vmov.f32 		s4, s7				\n\t"	//d2[0]=d3[1], so d2={z1,x1}
119 | 	
120 | 	"vmul.f32 		d4, d0, d2			\n\t"	//d4=d0*d2 = {y0*z1, z0*x1}
121 | 	"vmls.f32 		d4, d1, d3			\n\t"	//d4-=d1*d3 => {y0*z1 - z0*y1, z0*x1 - x0*z1}
122 | 	
123 | 	"vmul.f32 		d5, d3, d1[1]		\n\t"	//d5=d3*d1[1] = {y1*x0, z1*x0}
124 | 	"vmls.f32 		d5, d0, d2[1]		\n\t"	//d5-=d0*d2[1] => d5[0] = x0*y1 - y0*x1
125 | 	
126 | 	"vst1.32 		d4, [%2]			\n\t"	//store d[0], d[1]
127 | 	"add 			%2, %2, #8			\n\t"	//d now points at d[2]
128 | 	"fsts 			s10, [%2]			\n\t"	//store d[2] = d5[0]
129 | 	
130 | 	: "+r"(v0), "+r"(v1), "+r"(d):
131 |     : "d0", "d1", "d2", "d3", "d4", "d5", "memory"
132 | 	);	
133 | #else
134 | 	cross3_c(v0,v1,d);
135 | #endif
136 | }
137 | /* Writes v scaled by an approximate 1/|v| into d. NEON path: VRSQRTE estimate refined by two VRSQRTS steps; otherwise calls normalize3_c(). */
138 | void 
139 | normalize3_neon(float v[3], float d[3])
140 | {
141 | #ifdef __MATH_NEON
142 | 	asm volatile (
143 | 	"vld1.32 		{d4}, [%0]				\n\t"	//d4={x0,y0}
144 | 	"flds 			s10, [%0, #8]			\n\t"	//d5[0]={z0}
145 | 
146 | 	"vmul.f32 		d0, d4, d4				\n\t"	//d0= d4*d4
147 | 	"vpadd.f32 		d0, d0					\n\t"	//d0 = d[0] + d[1]
148 | 	"vmla.f32 		d0, d5, d5				\n\t"	//d0 = d0 + d5*d5; lane 0 holds the squared length
149 | 	
150 | 	"vmov.f32 		d1, d0					\n\t"	//d1 = d0
151 | 	"vrsqrte.f32 	d0, d0					\n\t"	//d0 = ~ 1.0 / sqrt(d0)
152 | 	"vmul.f32 		d2, d0, d1				\n\t"	//d2 = d0 * d1
153 | 	"vrsqrts.f32 	d3, d2, d0				\n\t"	//d3 = (3 - d0 * d2) / 2 	
154 | 	"vmul.f32 		d0, d0, d3				\n\t"	//d0 = d0 * d3
155 | 	"vmul.f32 		d2, d0, d1				\n\t"	//d2 = d0 * d1	
156 | 	"vrsqrts.f32 	d3, d2, d0				\n\t"	//d3 = (3 - d0 * d2) / 2	
157 | 	"vmul.f32 		d0, d0, d3				\n\t"	//d0 = d0 * d3	
158 | 
159 | 	"vmul.f32 		q2, q2, d0[0]			\n\t"	//q2 = {d4, d5} * d0[0]
160 | 	"vst1.32 		{d4}, [%1]				\n\t"	//store {x, y} to d
161 | 	"fsts 			s10, [%1, #8]			\n\t"	//store z to d[2]
162 | 	
163 | 	:: "r"(v), "r"(d) 
164 |     : "d0", "d1", "d2", "d3", "d4", "d5", "memory"
165 | 	);	
166 | #else
167 | 	normalize3_c(v, d);
168 | #endif
169 | 
170 | }
171 | 
172 | 
173 | 


--------------------------------------------------------------------------------
/math_vec4.c:
--------------------------------------------------------------------------------
  1 | /*
  2 | The MIT License (MIT)
  3 | 
  4 | Copyright (c) 2015 Lachlan Tychsen-Smith (lachlan.ts@gmail.com)
  5 | 
  6 | Permission is hereby granted, free of charge, to any person obtaining a copy
  7 | of this software and associated documentation files (the "Software"), to deal
  8 | in the Software without restriction, including without limitation the rights
  9 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 10 | copies of the Software, and to permit persons to whom the Software is
 11 | furnished to do so, subject to the following conditions:
 12 | 
 13 | The above copyright notice and this permission notice shall be included in
 14 | all copies or substantial portions of the Software.
 15 | 
 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 19 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 20 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 21 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 22 | THE SOFTWARE.
 23 | */
 24 | 
 25 | #include "math_neon.h"
 26 | 
 27 | 
 28 | #ifdef __MATH_NEON
 29 | #include "arm_neon.h" 
 30 | #endif
 31 | 
 32 | //Dot product of two 4-component vectors
 33 | float dot4_c(float v0[4], float v1[4])
 34 | {
 35 | 	int i;
 36 | 	float acc = v0[0]*v1[0];		//first component seeds the sum
 37 | 
 38 | 	for (i = 1; i < 4; i++)
 39 | 		acc += v0[i]*v1[i];			//remaining components, added in index order
 40 | 	return acc;
 41 | }
 42 | //Writes v scaled by an approximate 1/|v| into d (bit-trick inverse square root plus two Newton-Raphson steps)
 43 | void normalize4_c(float v[4], float d[4])
 44 | {
 45 | 	float b, c, x;
 46 | 	union {
 47 | 		float 	f;
 48 | 		int 	i;
 49 | 	} a;
 50 | 	
 51 | 	x = v[0]*v[0];
 52 | 	x += v[1]*v[1];
 53 | 	x += v[2]*v[2];
 54 | 	x += v[3]*v[3];		//x = squared length of v
 55 | 
 56 | 	//fast invsqrt approx
 57 | 	a.f = x;
 58 | 	a.i = 0x5F3759DF - (a.i >> 1);		//VRSQRTE
 59 | 	c = x * a.f;
 60 | 	b = (3.0f - c * a.f) * 0.5f;		//VRSQRTS (0.5f keeps the step in single precision)
 61 | 	a.f = a.f * b;		
 62 | 	c = x * a.f;
 63 | 	b = (3.0f - c * a.f) * 0.5f;		//second refinement step
 64 | 	a.f = a.f * b;	
 65 | 
 66 | 	d[0] = v[0]*a.f;
 67 | 	d[1] = v[1]*a.f;
 68 | 	d[2] = v[2]*a.f;
 69 | 	d[3] = v[3]*a.f;
 70 | }
 71 | /* Writes v scaled by an approximate 1/|v| into d. NEON path: VRSQRTE estimate refined by two VRSQRTS steps; otherwise calls normalize4_c(). */
 72 | void normalize4_neon(float v[4], float d[4])
 73 | {
 74 | #ifdef __MATH_NEON
 75 | 	asm volatile (
 76 | 	"vld1.32 		{d4, d5}, [%0]			\n\t"	//d4={x0,y0}, d5={z0, w0}
 77 | 	"vmul.f32 		d0, d4, d4				\n\t"	//d0= d4*d4
 78 | 	"vmla.f32 		d0, d5, d5				\n\t"	//d0 = d0 + d5*d5 
 79 | 	"vpadd.f32 		d0, d0					\n\t"	//d0 = d[0] + d[1] (squared length in both lanes)
 80 | 	
 81 | 	"vmov.f32 		d1, d0					\n\t"	//d1 = d0
 82 | 	"vrsqrte.f32 	d0, d0					\n\t"	//d0 = ~ 1.0 / sqrt(d0)
 83 | 	"vmul.f32 		d2, d0, d1				\n\t"	//d2 = d0 * d1
 84 | 	"vrsqrts.f32 	d3, d2, d0				\n\t"	//d3 = (3 - d0 * d2) / 2 	
 85 | 	"vmul.f32 		d0, d0, d3				\n\t"	//d0 = d0 * d3
 86 | 	"vmul.f32 		d2, d0, d1				\n\t"	//d2 = d0 * d1	
 87 | 	"vrsqrts.f32 	d3, d2, d0				\n\t"	//d3 = (3 - d0 * d2) / 2	
 88 | 	"vmul.f32 		d0, d0, d3				\n\t"	//d0 = d0 * d3	
 89 | 
 90 | 	"vmul.f32 		q2, q2, d0[0]			\n\t"	//q2 = {d4, d5} * d0[0]
 91 | 	"vst1.32 		{d4, d5}, [%1]			\n\t"	//store {x, y, z, w} to d
 92 | 	
 93 | 	:: "r"(v), "r"(d) 
 94 |     : "d0", "d1", "d2", "d3", "d4", "d5", "memory"
 95 | 	);	
 96 | #else
 97 | 	normalize4_c(v, d);
 98 | #endif
 99 | 
100 | }
101 | 
102 | /* vec4 dot product in NEON assembly. The sum is left in d0 (both lanes, so s0 holds it); there is no C return statement and the body is empty without __MATH_NEON. */
103 | float dot4_neon_hfp(float v0[4], float v1[4])
104 | {
105 | #ifdef __MATH_NEON
106 | 	asm volatile (
107 | 	"vld1.32 		{d2, d3}, [%0]			\n\t"	//d2={x0,y0}, d3={z0, w0}
108 | 	"vld1.32 		{d4, d5}, [%1]			\n\t"	//d4={x1,y1}, d5={z1, w1}
109 | 	"vmul.f32 		d0, d2, d4				\n\t"	//d0= d2*d4
110 | 	"vmla.f32 		d0, d3, d5				\n\t"	//d0 = d0 + d3*d5 
111 | 	"vpadd.f32 		d0, d0					\n\t"	//d0 = d[0] + d[1]
112 | 	:: "r"(v0), "r"(v1) : "d0", "d2", "d3", "d4", "d5", "memory"	//registers written above; "memory" because v0/v1 are read through pointer operands
113 | 	);	
114 | #endif
115 | }
116 | 
117 | /* vec4 dot product written with arm_neon.h intrinsics; only compiled when __MATH_NEON is defined. */
118 | #ifdef __MATH_NEON
119 | float32_t dot4_neon(float32x4_t v0, float32x4_t v1)
120 | {	
121 | 	float32x2_t acc;
122 | 
123 | 	//products of the two upper halves (lanes 2 and 3)
124 | 	acc = vmul_f32(vget_high_f32(v0), vget_high_f32(v1));
125 | 	//multiply-accumulate the two lower halves (lanes 0 and 1)
126 | 	acc = vmla_f32(acc, vget_low_f32(v0), vget_low_f32(v1));
127 | 	//pairwise add folds both lanes into a single total
128 | 	acc = vpadd_f32(acc, acc);
129 | 
130 | 	return vget_lane_f32(acc, 0);
131 | }
132 | #endif
133 | /* vec4 dot product wrapper: runs dot4_neon_hfp and copies s0 into r0. Uses dot4_c() when __MATH_NEON is not defined. */
134 | float dot4_neon_sfp(float v0[4], float v1[4])
135 | {
136 | #ifdef __MATH_NEON
137 | 	dot4_neon_hfp(v0, v1);
138 | 	asm volatile ("vmov.f32 r0, s0 		\n\t");	//r0 = s0; this path has no C return statement
139 | #else
140 | 	return dot4_c(v0, v1);
141 | #endif
142 | }
143 | 
144 | 


--------------------------------------------------------------------------------