├── .attic
    └── memcpy_memset.txt
├── README.md
├── common.h
├── config-defaults.h
├── config.h
├── everything.h
└── vectormath
    ├── mat44_multiply.h
    ├── vec3_dot.h
    ├── vec4_dot.h
    ├── vec4_homogenize.h
    ├── vec4_reciprocal.h
    ├── vec4n_dot.h
    └── vector3.h


/.attic/memcpy_memset.txt:
--------------------------------------------------------------------------------
 1 | 
 2 | (Need to enable PLD, see
 3 | http://infocenter.arm.com/help/topic/com.arm.doc.ddi0344b/Babjbfdb.html
 4 | http://infocenter.arm.com/help/topic/com.arm.doc.ddi0344k/Cbbbdaed.html
 5 | http://infocenter.arm.com/help/topic/com.arm.doc.ddi0344k/Bgbciiaf.html
 6 | )
 7 | 
 8 | memcpy_neon:                                    @ Copies 128 bytes per pass from [r1] to [r0]. Presumably r0 = dst, r1 = src, r2 = byte count - TODO confirm.
 9 |         push            {r4-r11}                @ r4-r11 serve as a 32-byte staging buffer below, so save the caller's values
10 |         mov             r3, r0                  @ r3 = moving store pointer; r0 itself is left untouched
11 | 1:      subs            r2, r2, #128            @ count down one chunk; these flags feed the bgt at the bottom of the loop
12 |         pld             [r1, #64]               @ prefetch hints ahead of the load pointer
13 |         pld             [r1, #256]
14 |         pld             [r1, #320]
15 |         ldm             r1!, {r4-r11}           @ 8 words = 32 bytes through the ARM registers
16 |         vld1.64         {d0-d3},   [r1,:128]!   @ 3 x 32 bytes through NEON; :128 asserts a 16-byte aligned address
17 |         vld1.64         {d4-d7},   [r1,:128]!
18 |         vld1.64         {d16-d19}, [r1,:128]!
19 |         stm             r3!, {r4-r11}           @ write the chunk back in the same order: 32 bytes, then 3 x 32 bytes
20 |         vst1.64         {d0-d3},   [r3,:128]!
21 |         vst1.64         {d4-d7},   [r3,:128]!
22 |         vst1.64         {d16-d19}, [r3,:128]!
23 |         bgt             1b                      @ loop while r2 > 0: one chunk is always copied, and a count that is not a multiple of 128 is rounded up
24 |         pop             {r4-r11}
25 |         bx              lr
26 | 
27 | memset_neon_1: http://gitorious.org/0xdroid/bionic/commit/780898e723d883e0ed13387f11066275121048b9
28 | memset_neon_2:                                  @ Fills memory in 128-byte chunks. Presumably r0 = dst, r1 = fill byte, r2 = byte count - TODO confirm.
29 |         push            {r4-r11}
30 |         mov             r3,  r0                 @ r3 = first store pointer; r0 itself is left untouched
31 |         vdup.8          q0,  r1                 @ q0 = low byte of r1 replicated 16 times
32 |         vmov            q1,  q0                 @ q0-q1 (d0-d3) = 32 bytes of pattern
33 |         orr             r4,  r1, r1, lsl #8     @ r4 = pattern as a word; only correct when r1 <= 0xff, since r1 is not masked here
34 |         orr             r4,  r4, r4, lsl #16
35 |         mov             r5,  r4                 @ r5-r11 = copies of the pattern word
36 |         mov             r6,  r4
37 |         mov             r7,  r4
38 |         mov             r8,  r4
39 |         mov             r9,  r4
40 |         mov             r10, r4
41 |         mov             r11, r4
42 |         add             r12, r3,  r2, lsr #2    @ r12 = second store pointer, one quarter of the count past r3
43 | 1:      subs            r2,  r2, #128           @ count down one chunk; these flags feed the bgt below
44 |         pld             [r3, #64]
45 |         stm             r3!, {r4-r11}           @ 32 bytes per pass at r3: this stream fills the first quarter
46 |         vst1.64         {d0-d3},   [r12,:128]!  @ 96 bytes per pass at r12: this stream fills the remaining three quarters
47 |         vst1.64         {d0-d3},   [r12,:128]!
48 |         vst1.64         {d0-d3},   [r12,:128]!
49 |         bgt             1b                      @ the two regions tile the buffer exactly only when the count is a multiple of 128
50 |         pop             {r4-r11}
51 |         bx              lr
52 | 
53 | 
54 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | ARMv7 Functions
 2 | ===============
 3 | 
 4 | This is a collection of various functions optimized for __armv7__ and __neon__.
 5 | 
 6 | The five holy laws
 7 | ------------------
 8 | 
 9 | 1. __Never return floating point values by value__. It would work fine if <code>-mfloat-abi=hard</code> was supported everywhere, but sadly it's not. With the more common <code>-mfloat-abi=softfp</code>, every time you do a <code>return my_float_value</code>, the compiler emits either a <code>fmrs</code> or a <code>vstr</code>, followed by a load operation in order to read the result back! __Instead, use a non-const reference as first parameter__. It allows super smooth inlining of your intermediate results without unnecessary loads and stores, just like it would do if hard floats were available (this works for vector types too)!
10 | 2. __Try to minimize loads and stores__. Note that GCC doesn't support evolved <code>vldmia</code>/<code>vstmia</code> and will generate poor code for operations on <code>float32x4x4_t</code>, so hand-coding those operations makes sense in that case.
11 | 3. __Use vector types everywhere it makes sense__. Functions prefixed with <code>vec3_</code> and <code>vec4_</code> directly work on <code>float32x4_t</code>. Those prefixed with <code>mat44_</code> directly work with <code>float32x4x4_t</code>. Parameters are passed as references, so the compiler doesn't perform unnecessary ARM register transfers.
12 | 4. __Don't hard-code registers__, but use dummy values instead for clobber, and let the compiler allocate registers as needed.
13 | 5. __A good clobber list is an empty clobber list__. If you let the compiler handle loads for you, "_memory_" shouldn't even show up in your clobber list. The only item that might is "_cc_".
14 | 
15 | Compilation flags
16 | -----------------
17 | 
18 | For best performance I usually use the following CFLAGS: <code>-mthumb -mcpu=cortex-a8 -mfpu=neon -mfloat-abi=softfp -mvectorize-with-neon-quad -O3 -ffast-math -fomit-frame-pointer -fstrict-aliasing -fgcse-las -funsafe-loop-optimizations -fsee -ftree-vectorize</code>, with <code>-arch armv7</code> if it's _gcc for iOS_ or <code>-march=armv7-a</code> if it's _eabi-none-gcc_.
19 | 
20 | Preprocessor macros
21 | -------------------
22 | 
23 | Several preprocessor macros, when defined, change the behaviour of the code. See <code>config.h</code> and <code>config-defaults.h</code> for details…
24 | 
25 | 


--------------------------------------------------------------------------------
/common.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | #include <armv7-functions/config.h>
 4 | #include <armv7-functions/config-defaults.h>
 5 | 
 6 | // A few sanity checks: stop the build with an explicit message when a required compiler flag is missing.
 7 | #ifndef __FAST_MATH__ // not defined means -ffast-math was not given
 8 | 	#error "Please add -ffast-math to your compile flags"
 9 | #endif
10 | 
11 | #ifndef __ARM_NEON__ // not defined means NEON code generation is off
12 | 	#error "Please add -mfpu=neon to your compile flags"
13 | #endif
14 | 
15 | // Most needed header
16 | #include <arm_neon.h>
17 | 
18 | // Standard defines, prefixed with ARMV7_FUNC_*. NOTE(review): the configuration macros are spelled ARM7_FUNC_* instead.
19 | #define ARMV7_FUNC_API static inline __attribute__((always_inline)) // declaration prefix used by the vectormath functions: internal linkage, forced inlining
20 | #define ARMV7_FUNC_PI  0x1.921FB54442D18p1f // pi as a hexadecimal float literal: mantissa * 2^1, float suffix
21 | #define ARMV7_FUNC_2PI 0x1.921FB54442D18p2f // 2*pi: same mantissa, exponent 2^2
22 | 
23 | // Standard types: thin aliases over the NEON register types from <arm_neon.h>.
24 | namespace ARM7_FUNC_NAMESPACE {
25 | 	typedef float32x4_t   vector3_t;  // same four-lane type as vector4_t; vec3_dot leaves the 4th lane out of its sum
26 | 	typedef float32x4_t   vector4_t;  // four float lanes
27 | 	typedef float32x4x4_t matrix44_t; // four float32x4_t, reached through .val[0] .. .val[3]
28 | }
29 | 
30 | 


--------------------------------------------------------------------------------
/config-defaults.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | #ifndef ARM7_FUNC_NORMALIZATION_RECIPROCAL_STEPS // only applies when config.h or the build did not set it
 4 | 	#define ARM7_FUNC_NORMALIZATION_RECIPROCAL_STEPS 2 // Newton-Raphson refinements run by vec4_reciprocal
 5 | #endif
 6 | 
 7 | #ifndef ARM7_FUNC_NAMESPACE // only applies when config.h or the build did not set it
 8 | 	#define ARM7_FUNC_NAMESPACE armv7func // namespace that wraps the library's types and functions
 9 | #endif
10 | 
11 | 


--------------------------------------------------------------------------------
/config.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | // Number of Newton-Raphson iterations used to refine
 4 | // the reciprocal estimate (e.g. 1/w when homogenizing a vector).
 5 | // Defaults to 2.
 6 | //#define ARM7_FUNC_NORMALIZATION_RECIPROCAL_STEPS 2
 7 | 
 8 | // Name of our namespace
 9 | // Default is "armv7func"
10 | //#define ARM7_FUNC_NAMESPACE armv7_is_cool
11 | 
12 | 


--------------------------------------------------------------------------------
/everything.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | #include <armv7-functions/common.h>
 4 | 
 5 | #include <armv7-functions/vectormath/mat44_multiply.h>
 6 | #include <armv7-functions/vectormath/vec3_dot.h>
 7 | #include <armv7-functions/vectormath/vec4_dot.h>
 8 | #include <armv7-functions/vectormath/vec4n_dot.h>
 9 | #include <armv7-functions/vectormath/vec4_homogenize.h>
10 | #include <armv7-functions/vectormath/vec4_reciprocal.h>
11 | 
12 | //#include <armv7-functions/vectormath/vector3.h>
13 | 
14 | 


--------------------------------------------------------------------------------
/vectormath/mat44_multiply.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | #include <armv7-functions/common.h>
 3 | 
 4 | namespace ARM7_FUNC_NAMESPACE {
 5 | 
 6 | // Multiplies two 4x4 matrices:
 7 | //     result.val[i] = sum over k of b.val[k] * a.val[i][k]     (i, k in 0..3)
 8 | // The sums are accumulated in locals and result is only written after the
 9 | // last read of a and b, so result may alias either operand, as in
10 | // mat44_multiply(m, m, n). Writing result.val[] between the steps instead
11 | // would overwrite lanes of the operand that later steps still have to read.
12 | ARMV7_FUNC_API void mat44_multiply(matrix44_t& result, const matrix44_t& a, const matrix44_t& b) {
13 | 	// acc = first column of B x first row of A
14 | 	float32x4_t acc0 = vmulq_lane_f32(b.val[0], vget_low_f32(a.val[0]), 0);
15 | 	float32x4_t acc1 = vmulq_lane_f32(b.val[0], vget_low_f32(a.val[1]), 0);
16 | 	float32x4_t acc2 = vmulq_lane_f32(b.val[0], vget_low_f32(a.val[2]), 0);
17 | 	float32x4_t acc3 = vmulq_lane_f32(b.val[0], vget_low_f32(a.val[3]), 0);
18 | 	// acc += second column of B x second row of A
19 | 	acc0 = vmlaq_lane_f32(acc0, b.val[1], vget_low_f32(a.val[0]), 1);
20 | 	acc1 = vmlaq_lane_f32(acc1, b.val[1], vget_low_f32(a.val[1]), 1);
21 | 	acc2 = vmlaq_lane_f32(acc2, b.val[1], vget_low_f32(a.val[2]), 1);
22 | 	acc3 = vmlaq_lane_f32(acc3, b.val[1], vget_low_f32(a.val[3]), 1);
23 | 	// acc += third column of B x third row of A
24 | 	acc0 = vmlaq_lane_f32(acc0, b.val[2], vget_high_f32(a.val[0]), 0);
25 | 	acc1 = vmlaq_lane_f32(acc1, b.val[2], vget_high_f32(a.val[1]), 0);
26 | 	acc2 = vmlaq_lane_f32(acc2, b.val[2], vget_high_f32(a.val[2]), 0);
27 | 	acc3 = vmlaq_lane_f32(acc3, b.val[2], vget_high_f32(a.val[3]), 0);
28 | 	// acc += last column of B x last row of A
29 | 	acc0 = vmlaq_lane_f32(acc0, b.val[3], vget_high_f32(a.val[0]), 1);
30 | 	acc1 = vmlaq_lane_f32(acc1, b.val[3], vget_high_f32(a.val[1]), 1);
31 | 	acc2 = vmlaq_lane_f32(acc2, b.val[3], vget_high_f32(a.val[2]), 1);
32 | 	acc3 = vmlaq_lane_f32(acc3, b.val[3], vget_high_f32(a.val[3]), 1);
33 | 	// Every read of a and b is done: publish the product
34 | 	result.val[0] = acc0;
35 | 	result.val[1] = acc1;
36 | 	result.val[2] = acc2;
37 | 	result.val[3] = acc3;
38 | 
39 | #if 0
40 | 	// Original, hand-written assembly:
41 | 	// Pros:
42 | 	//    * used vldmia/vstmia, which gcc can't at this point
43 | 	//    * used Um constraints and %m operand, allowing gcc
44 | 	//      to use "sp" where it made sense
45 | 	// Cons:
46 | 	//    * performed the full matrix multiplication, even
47 | 	//      when only a sub-expression was really used. The
48 | 	//      intrinsics code above, OTOH, has parts optimized
49 | 	//      away by the compiler.
50 | 	asm volatile(
51 | 	"vldmia   %m[a], {q4-q7}\n\t"
52 | 	"vldmia   %m[b], {q8-q11}\n\t"
53 | 	"vmul.f32 q0,  q8,  d8[0]\n\t"
54 | 	"vmul.f32 q1,  q8, d10[0]\n\t"
55 | 	"vmul.f32 q2,  q8, d12[0]\n\t"
56 | 	"vmul.f32 q3,  q8, d14[0]\n\t"
57 | 	"vmla.f32 q0,  q9,  d8[1]\n\t"
58 | 	"vmla.f32 q1,  q9, d10[1]\n\t"
59 | 	"vmla.f32 q2,  q9, d12[1]\n\t"
60 | 	"vmla.f32 q3,  q9, d14[1]\n\t"
61 | 	"vmla.f32 q0, q10,  d9[0]\n\t"
62 | 	"vmla.f32 q1, q10, d11[0]\n\t"
63 | 	"vmla.f32 q2, q10, d13[0]\n\t"
64 | 	"vmla.f32 q3, q10, d15[0]\n\t"
65 | 	"vmla.f32 q0, q11,  d9[1]\n\t"
66 | 	"vmla.f32 q1, q11, d11[1]\n\t"
67 | 	"vmla.f32 q2, q11, d13[1]\n\t"
68 | 	"vmla.f32 q3, q11, d15[1]\n\t"
69 | 	"vstmia   %m[result], {q0-q3}"
70 | 	:
71 | 	: [result] "Um" (result), [a] "Um" (a), [b] "Um" (b)
72 | 	: "memory",
73 | 	  "q0", "q1", "q2", "q3",
74 | 	  "q4", "q5", "q6", "q7",
75 | 	  "q8", "q9","q10", "q11"
76 | 	);
77 | #endif
78 | }
79 | 
80 | } // ARM7_FUNC_NAMESPACE
81 | 
82 | 


--------------------------------------------------------------------------------
/vectormath/vec3_dot.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | #include <armv7-functions/common.h>
 3 | 
 4 | namespace ARM7_FUNC_NAMESPACE {
 5 | 
 6 | // Stores the 3-component dot product a.x*b.x + a.y*b.y + a.z*b.z in result.
 7 | // The 4th lane of a and b does not take part in the sum.
 8 | ARMV7_FUNC_API void vec3_dot(float& result, const vector3_t& a, const vector3_t& b) {
 9 | 	// Plain locals: the `register` storage class was only ever a hint, is
10 | 	// deprecated since C++11 and ill-formed since C++17.
11 | 	vector3_t tmp;
12 | 	const int zero = 0;
13 | 	// vmul  : tmp = a * b, lane by lane
14 | 	// vmov  : overwrite lane 3 of the product with 0 so that w is ignored
15 | 	// vadd  : low half += high half         -> (x+z, y+0)
16 | 	// vpadd : pairwise add of the low half  -> lane 0 = x+y+z
17 | 	asm volatile (
18 | 	"# %q[tmp].x = dot(%q[a].xyz, %q[b].xyz);\n\t"
19 | 	"vmul.f32   %q[tmp],    %q[a],   %q[b]\n\t"
20 | 	"vmov.32    %f[tmp][1],	%[zero]\n\t"
21 | 	"vadd.f32   %e[tmp],    %e[tmp], %f[tmp]\n\t"
22 | 	"vpadd.f32  %e[tmp],    %e[tmp]"
23 | 	: [tmp] "=&w" (tmp) : [a] "w" (a), [b] "w" (b), [zero] "r" (zero): );
24 | 	result = vgetq_lane_f32(tmp, 0);
25 | }
26 | 
27 | } // ARM7_FUNC_NAMESPACE
28 | 
29 | 


--------------------------------------------------------------------------------
/vectormath/vec4_dot.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | #include <armv7-functions/common.h>
 3 | 
 4 | namespace ARM7_FUNC_NAMESPACE {
 5 | 
 6 | // Stores the 4-component dot product a.x*b.x + a.y*b.y + a.z*b.z + a.w*b.w in result.
 7 | ARMV7_FUNC_API void vec4_dot(float& result, const vector4_t& a, const vector4_t& b) {
 8 | 	// Plain local: the `register` storage class was only ever a hint, is
 9 | 	// deprecated since C++11 and ill-formed since C++17.
10 | 	vector4_t tmp;
11 | 	// vmul  : tmp = a * b, lane by lane
12 | 	// vadd  : low half += high half         -> (x+z, y+w)
13 | 	// vpadd : pairwise add of the low half  -> lane 0 = x+y+z+w
14 | 	asm volatile (
15 | 	"# %q[tmp].x = dot(%q[a].xyzw, %q[b].xyzw);\n\t"
16 | 	"vmul.f32   %q[tmp],    %q[a],   %q[b]\n\t"
17 | 	"vadd.f32   %e[tmp],    %e[tmp], %f[tmp]\n\t"
18 | 	"vpadd.f32  %e[tmp],    %e[tmp]"
19 | 	: [tmp] "=&w" (tmp) : [a] "w" (a), [b] "w" (b) : );
20 | 	result = vgetq_lane_f32(tmp, 0);
21 | }
22 | 
23 | } // ARM7_FUNC_NAMESPACE
24 | 
25 | 


--------------------------------------------------------------------------------
/vectormath/vec4_homogenize.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | #include <armv7-functions/common.h>
 3 | #include <armv7-functions/vectormath/vec4_reciprocal.h>
 4 | 
 5 | namespace ARM7_FUNC_NAMESPACE {
 6 | 
 7 | // Scales vec in place by the approximate reciprocal of its own 4th lane (w),
 8 | // as computed by vec4_reciprocal: every lane becomes lane * (1/w).
 9 | ARMV7_FUNC_API void vec4_homogenize(vector4_t& vec) {
10 | 	asm volatile("#begin vec4_homogenize" :::);
11 | 	// Plain locals: `register` is deprecated since C++11 and ill-formed since C++17
12 | 	vector4_t wwww, wwww_recp;
13 | 
14 | 	// Broadcast w to all four lanes. This is done in assembly because the
15 | 	// intrinsic version below goes through a NEON > ARM register transfer:
16 | 	// tmp1 = vdupq_n_f32(vgetq_lane_f32(vec, 3));
17 | 	asm volatile("vdup.f32 %q[wwww], %f[xyzw][1]" : [wwww] "=&w" (wwww) : [xyzw] "w" (vec) : );
18 | 
19 | 	vec4_reciprocal(wwww_recp, wwww);
20 | 	vec *= wwww_recp;
21 | 	asm volatile("#end vec4_homogenize" :::);
22 | }
23 | 
24 | } // ARM7_FUNC_NAMESPACE
25 | 
26 | 


--------------------------------------------------------------------------------
/vectormath/vec4_reciprocal.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | #include <armv7-functions/common.h>
 3 | 
 4 | namespace ARM7_FUNC_NAMESPACE {
 5 | 
 6 | namespace details {
 7 | 
 8 | 	// Applies `steps` Newton-Raphson refinements to x_recp, an estimate of
 9 | 	// 1/x. The iterations are unrolled at compile time through template
10 | 	// recursion, which ends on the <0> specialization; steps must therefore
11 | 	// not be negative.
12 | 	template<int steps>
13 | 	struct NewtonRaphsonStepper {
14 | 		static inline __attribute__((always_inline)) void exec(float32x4_t& x_recp, const float32x4_t& x) {
15 | 			// vrecpsq_f32(r, x) yields (2 - r*x), so this is r' = r * (2 - r*x)
16 | 			x_recp*= vrecpsq_f32(x_recp, x);
17 | 			NewtonRaphsonStepper<steps-1>::exec(x_recp, x);
18 | 		}
19 | 	};
20 | 
21 | 	// End of the recursion: nothing left to refine.
22 | 	template<>
23 | 	struct NewtonRaphsonStepper<0> {
24 | 		static inline __attribute__((always_inline)) void exec(float32x4_t&, const float32x4_t&) { }
25 | 	};
26 | }
27 | 
28 | // Stores an approximation of 1/vec, lane by lane, in result: the hardware
29 | // estimate refined by ARM7_FUNC_NORMALIZATION_RECIPROCAL_STEPS Newton-Raphson
30 | // iterations. result may be the same object as vec.
31 | ARMV7_FUNC_API void vec4_reciprocal(vector4_t& result, const vector4_t& vec) {
32 | 	asm volatile("#begin vec4_reciprocal" :::);
33 | 	// Snapshot the input: writing the first estimate straight into result
34 | 	// would destroy vec when both refer to the same object, and the
35 | 	// refinement steps still need the original value.
36 | 	const vector4_t x = vec;
37 | 	vector4_t estimate = vrecpeq_f32(x);
38 | 	details::NewtonRaphsonStepper<ARM7_FUNC_NORMALIZATION_RECIPROCAL_STEPS>::exec(estimate, x);
39 | 	result = estimate;
40 | 	asm volatile("#end vec4_reciprocal" :::);
41 | }
42 | 
43 | } // ARM7_FUNC_NAMESPACE
44 | 
45 | 


--------------------------------------------------------------------------------
/vectormath/vec4n_dot.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | #include <armv7-functions/common.h>
 3 | 
 4 | namespace ARM7_FUNC_NAMESPACE {
 5 | 
 6 | // Compute the dot product of two vectors of 4n floats, i.e. of the n
 7 | // vector4_t elements starting at a and the n starting at b, and store it in
 8 | // result. An empty input (n == 0) yields 0.
 9 | ARMV7_FUNC_API void vec4n_dot(float& result, const vector4_t* a, const vector4_t* b, unsigned int n) {
10 | 	// The loop below tests n only after its first pass, so it must never be
11 | 	// entered with n == 0: n would wrap around and run far past both arrays.
12 | 	if (n == 0) {
13 | 		result = 0.0f;
14 | 		return;
15 | 	}
16 | 	// Plain locals: `register` is deprecated since C++11 and ill-formed since C++17
17 | 	vector4_t tmp, va, vb;
18 | 	asm volatile (
19 | 	"# %q[tmp].x = dot(%q[va][i], %q[vb][i]) for (i=0; i<n; i++);\n\t"
20 | 	"vmov.f32 %q[tmp], #0.0\n\t"
21 | 	"1:\n\t"
22 | 	"vldmia    %[a]!, {%q[va]}\n\t"
23 | 	"vldmia    %[b]!, {%q[vb]}\n\t"
24 | 	"vmla.f32  %q[tmp], %q[va], %q[vb]\n\t"
25 | 	// n counts vector4_t elements and one is consumed per pass, so count
26 | 	// down by 1. Counting down by 16 (the element size in bytes) only ever
27 | 	// reached zero when n was a multiple of 16.
28 | 	"subs      %[n], %[n], #1\n\t"
29 | 	"bne 1b\n\t"
30 | 	// Horizontal sum of the four per-lane accumulators into lane 0
31 | 	"vadd.f32  %e[tmp], %e[tmp], %f[tmp]\n\t"
32 | 	"vpadd.f32 %e[tmp], %e[tmp]"
33 | 	: [tmp] "=&w" (tmp), [a] "=r" (a), [b] "=r" (b), [n] "=r" (n), [va] "=w" (va), [vb] "=w" (vb)
34 | 	: "1" (a), "2" (b), "3" (n)
35 | 	: "memory", "cc"
36 | 	);
37 | 	result = vgetq_lane_f32(tmp, 0);
38 | }
39 | 
40 | } // ARM7_FUNC_NAMESPACE
41 | 
42 | 


--------------------------------------------------------------------------------
/vectormath/vector3.h:
--------------------------------------------------------------------------------
  1 | #pragma once
  2 | #include <armv7-functions/common.h>
  3 | #include <armv7-functions/vectormath/vec4_reciprocal.h>
  4 | // TODO:
  5 | // #include <armv7-functions/vectormath/vec4_reciprocal_squareroot.h>
  6 | #error "Please don't use this, it's not ready"
  7 | 
  8 | namespace ARM7_FUNC_NAMESPACE {
  9 | 
 10 | class Vector3Static;
 11 | 
 12 | // Three-component float vector kept in a four-lane NEON register.
 13 | // Only lanes 0..2 (x, y, z) are exposed; lane 3 is padding.
 14 | class Vector3 {
 15 | private:
 16 | 	float32x4_t val;
 17 | 	friend class Vector3Static;
 18 | 
 19 | public:
 20 | 	// Leaves all lanes uninitialized.
 21 | 	Vector3()
 22 | 	{ }
 23 | 
 24 | 	Vector3(const Vector3 & vec)
 25 | 	: val(vec.val)
 26 | 	{ }
 27 | 
 28 | 	// Builds (x, y, z). The register is zeroed first so that the padding
 29 | 	// lane never carries indeterminate data into lane-wise arithmetic.
 30 | 	Vector3(const float32_t& x, const float32_t& y, const float32_t& z)
 31 | 	: val(vdupq_n_f32(0.f))
 32 | 	{
 33 | 		setX(x);
 34 | 		setY(y);
 35 | 		setZ(z);
 36 | 	}
 37 | 
 38 | 	// Copies scalar into every lane.
 39 | 	explicit
 40 | 	Vector3(const float32_t& scalar)
 41 | 	: val(vdupq_n_f32(scalar))
 42 | 	{ }
 43 | 
 44 | 	// Wraps an existing NEON register as is.
 45 | 	explicit
 46 | 	Vector3(const float32x4_t& v)
 47 | 	: val(v)
 48 | 	{ }
 49 | 
 50 | 	Vector3& operator=(const Vector3& vec) {
 51 | 		val=vec.val;
 52 | 		return *this;
 53 | 	}
 54 | 
 55 | 	// The accessors below name their lane with a literal, because the NEON
 56 | 	// lane intrinsics only accept a compile-time constant lane number.
 57 | 	Vector3& setX(const float32_t& x) {
 58 | 		val = vsetq_lane_f32(x, val, 0);
 59 | 		return *this;
 60 | 	}
 61 | 
 62 | 	Vector3& setY(const float32_t& y) {
 63 | 		val = vsetq_lane_f32(y, val, 1);
 64 | 		return *this;
 65 | 	}
 66 | 
 67 | 	Vector3& setZ(const float32_t& z) {
 68 | 		val = vsetq_lane_f32(z, val, 2);
 69 | 		return *this;
 70 | 	}
 71 | 
 72 | 	float32_t getX() const {
 73 | 		return vgetq_lane_f32(val, 0);
 74 | 	}
 75 | 
 76 | 	float32_t getY() const {
 77 | 		return vgetq_lane_f32(val, 1);
 78 | 	}
 79 | 
 80 | 	float32_t getZ() const {
 81 | 		return vgetq_lane_f32(val, 2);
 82 | 	}
 83 | 
 84 | 	// Run-time indexed write: idx 0, 1, 2 selects x, y, z. Any other idx
 85 | 	// leaves the vector unchanged.
 86 | 	Vector3& setElem(int idx, const float32_t& value) {
 87 | 		switch (idx) {
 88 | 			case 0: setX(value); break;
 89 | 			case 1: setY(value); break;
 90 | 			case 2: setZ(value); break;
 91 | 			default: break;
 92 | 		}
 93 | 		return *this;
 94 | 	}
 95 | 
 96 | 	// Run-time indexed read: idx 0, 1, 2 selects x, y, z. Any other idx
 97 | 	// yields 0.
 98 | 	float32_t getElem(int idx) const {
 99 | 		switch (idx) {
100 | 			case 0: return getX();
101 | 			case 1: return getY();
102 | 			case 2: return getZ();
103 | 			default: return 0.f;
104 | 		}
105 | 	}
106 | 
107 | 	float32_t operator[](int idx) const {
108 | 		return getElem(idx);
109 | 	}
110 | 
111 | 	const Vector3 operator+(const Vector3& vec) const {
112 | 		return Vector3(val + vec.val);
113 | 	}
114 | 
115 | 	const Vector3 operator-(const Vector3& vec) const {
116 | 		return Vector3(val - vec.val);
117 | 	}
118 | 
119 | 	const Vector3 operator*(const float32_t& scalar) const {
120 | 		return Vector3(val * vdupq_n_f32(scalar));
121 | 	}
122 | 
123 | 	// Division multiplies by the approximate reciprocal from vec4_reciprocal.
124 | 	const Vector3 operator/(const float32_t& scalar) const {
125 | 		float32x4_t tmp;
126 | 		vec4_reciprocal(tmp, vdupq_n_f32(scalar));
127 | 		return Vector3(val * tmp);
128 | 	}
129 | 
130 | 	Vector3& operator+=(const Vector3& vec) {
131 | 		val += vec.val;
132 | 		return *this;
133 | 	}
134 | 
135 | 	Vector3& operator-=(const Vector3& vec) {
136 | 		val -= vec.val;
137 | 		return *this;
138 | 	}
139 | 
140 | 	Vector3& operator*=(const float32_t& scalar) {
141 | 		val *= vdupq_n_f32(scalar);
142 | 		return *this;
143 | 	}
144 | 
145 | 	Vector3& operator/=(const float32_t& scalar) {
146 | 		float32x4_t tmp;
147 | 		vec4_reciprocal(tmp, vdupq_n_f32(scalar));
148 | 		val *= tmp;
149 | 		return *this;
150 | 	}
151 | 
152 | 	const Vector3 operator-( ) const {
153 | 		return Vector3(-val);
154 | 	}
155 | 
156 | 	static const Vector3& xAxis() {
157 | 		static const Vector3 instance((float32x4_t) { 1.f, 0.f, 0.f, 0.f });
158 | 		return instance;
159 | 	}
160 | 
161 | 	static const Vector3& yAxis() {
162 | 		static const Vector3 instance((float32x4_t) { 0.f, 1.f, 0.f, 0.f });
163 | 		return instance;
164 | 	}
165 | 
166 | 	static const Vector3& zAxis() {
167 | 		static const Vector3 instance((float32x4_t) { 0.f, 0.f, 1.f, 0.f });
168 | 		return instance;
169 | 	}
170 | 
171 | } __attribute__((aligned(16)));
172 | 
173 | // Lane-wise helpers that need access to Vector3::val.
174 | class Vector3Static {
175 | private:
176 | 	// Approximates 1/sqrt(x) in every lane: hardware estimate refined by
177 | 	// ARM7_FUNC_NORMALIZATION_RECIPROCAL_STEPS Newton-Raphson steps.
178 | 	static float32x4_t rsqrtLanes(const float32x4_t& x) {
179 | 		float32x4_t est = vrsqrteq_f32(x);
180 | 		for (int step = 0; step < ARM7_FUNC_NORMALIZATION_RECIPROCAL_STEPS; ++step) {
181 | 			// vrsqrtsq_f32(p, q) yields (3 - p*q) / 2
182 | 			est *= vrsqrtsq_f32(x, est * est);
183 | 		}
184 | 		return est;
185 | 	}
186 | 
187 | public:
188 | 	static const Vector3 mulPerElem(const Vector3& vec0, const Vector3& vec1) {
189 | 		return Vector3(vec0.val * vec1.val);
190 | 	}
191 | 	static const Vector3 divPerElem(const Vector3& vec0, const Vector3& vec1 ) {
192 | 		return mulPerElem(vec0, recipPerElem(vec1));
193 | 	}
194 | 	static const Vector3 recipPerElem(const Vector3& vec) {
195 | 		float32x4_t tmp;
196 | 		vec4_reciprocal(tmp, vec.val);
197 | 		return Vector3(tmp);
198 | 	}
199 | 
200 | 	// sqrt(x) obtained as 1 / (1/sqrt(x)); both steps are approximations.
201 | 	static const Vector3 sqrtPerElem(const Vector3& vec) {
202 | 		float32x4_t tmp;
203 | 		vec4_reciprocal(tmp, rsqrtLanes(vec.val));
204 | 		return Vector3(tmp);
205 | 	}
206 | 
207 | 	static const Vector3 rsqrtPerElem(const Vector3& vec) {
208 | 		return Vector3(rsqrtLanes(vec.val));
209 | 	}
210 | 
211 | 	// Declared only: no definition exists yet.
212 | 	static const Vector3 absPerElem(const Vector3& vec);
213 | 	static const Vector3 copySignPerElem(const Vector3& vec0, const Vector3& vec1);
214 | 	static const Vector3 maxPerElem(const Vector3& vec0, const Vector3& vec1);
215 | 	static const Vector3 minPerElem(const Vector3& vec0, const Vector3& vec1);
216 | 	static float maxElem(const Vector3& vec);
217 | 	static float minElem(const Vector3& vec);
218 | 	static float sum(const Vector3& vec);
219 | 	static float dot(const Vector3& vec0, const Vector3 & vec1);
220 | 	static float lengthSqr(const Vector3& vec);
221 | 	static float length(const Vector3& vec);
222 | 	static const Vector3 normalize(const Vector3& vec);
223 | 	static const Vector3 cross(const Vector3& vec0, const Vector3& vec1);
224 | };
225 | 
226 | inline const Vector3 operator*(const float32_t& scalar, const Vector3& vec) {
227 | 	return vec*scalar;
228 | }
229 | 
230 | inline const Vector3 mulPerElem(const Vector3& vec0, const Vector3& vec1) {
231 | 	return Vector3Static::mulPerElem(vec0, vec1);
232 | }
233 | 
234 | inline const Vector3 divPerElem(const Vector3& vec0, const Vector3& vec1) {
235 | 	return Vector3Static::divPerElem(vec0, vec1);
236 | }
237 | 
238 | inline const Vector3 recipPerElem(const Vector3& vec) {
239 | 	return Vector3Static::recipPerElem(vec);
240 | }
241 | 
242 | inline const Vector3 sqrtPerElem( const Vector3 & vec ) {
243 | 	return Vector3Static::sqrtPerElem(vec);
244 | }
245 | 
246 | inline const Vector3 rsqrtPerElem( const Vector3 & vec ) {
247 | 	return Vector3Static::rsqrtPerElem(vec);
248 | }
249 | 
250 | // Declared only: no definition exists yet.
251 | inline const Vector3 absPerElem( const Vector3 & vec );
252 | inline const Vector3 copySignPerElem( const Vector3 & vec0, const Vector3 & vec1 );
253 | inline const Vector3 maxPerElem( const Vector3 & vec0, const Vector3 & vec1 );
254 | inline const Vector3 minPerElem( const Vector3 & vec0, const Vector3 & vec1 );
255 | inline float maxElem( const Vector3 & vec );
256 | inline float minElem( const Vector3 & vec );
257 | inline float sum( const Vector3 & vec );
258 | inline float dot( const Vector3 & vec0, const Vector3 & vec1 );
259 | inline float lengthSqr( const Vector3 & vec );
260 | inline float length( const Vector3 & vec );
261 | inline const Vector3 normalize( const Vector3 & vec );
262 | inline const Vector3 cross( const Vector3 & vec0, const Vector3 & vec1 );
263 | 
264 | //inline const Matrix3 outer( const Vector3 & vec0, const Vector3 & vec1 );
265 | //inline const Vector3 rowMul( const Vector3 & vec, const Matrix3 & mat );
266 | //inline const Matrix3 crossMatrix( const Vector3 & vec );
267 | //inline const Matrix3 crossMatrixMul( const Vector3 & vec, const Matrix3 & mat );
268 | 
269 | inline const Vector3 lerp( float t, const Vector3 & vec0, const Vector3 & vec1 );
270 | inline const Vector3 slerp( float t, const Vector3 & unitVec0, const Vector3 & unitVec1 );
271 | inline const Vector3 select( const Vector3 & vec0, const Vector3 & vec1, bool select1 );
272 | 
273 | } // ARM7_FUNC_NAMESPACE
274 | 
275 | 


--------------------------------------------------------------------------------