├── .gitignore
├── .travis.yml
├── include
    └── vectorial
    │   ├── vectorial.h
    │   ├── simd4x4f_sse.h
    │   ├── simd2f_common.h
    │   ├── simd2f.h
    │   ├── simd4x4f_scalar.h
    │   ├── simd4x4f_gnu.h
    │   ├── simd4x4f_neon.h
    │   ├── simd4f.h
    │   ├── vec_convert.h
    │   ├── simd4f_common.h
    │   ├── config.h
    │   ├── simd2f_neon.h
    │   ├── simd4f_scalar.h
    │   ├── vec2f.h
    │   ├── mat4f.h
    │   ├── vec4f.h
    │   ├── vec3f.h
    │   ├── simd4f_gnu.h
    │   ├── simd4f_sse.h
    │   ├── simd4f_neon.h
    │   └── simd4x4f.h
├── tools
    ├── update_spec.rb
    └── spechelper.m
├── bench
    ├── add_bench.cpp
    ├── dot_bench.cpp
    ├── matrix_bench.cpp
    ├── bench.h
    ├── quad_bench.cpp
    └── bench.cpp
├── LICENSE
├── spec
    ├── spec_mat4f.cpp
    ├── spec_main.cpp
    ├── spec.cpp
    ├── spec.h
    ├── spec_vec2f.cpp
    ├── spec_simd2f.cpp
    ├── spec_vec3f.cpp
    ├── spec_vec4f.cpp
    ├── spec_helper.h
    └── spec_simd4f.cpp
├── README
├── vectorial.sln
├── vectorialbenchmark.vcproj
├── vectorial.vcproj
└── Makefile


/.gitignore:
--------------------------------------------------------------------------------
1 | *.o
2 | *.orig
3 | specsuite-*
4 | 


--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | language: cpp
2 | compiler:
3 |   - gcc
4 |   - clang
5 | 
6 | script: make
7 | 


--------------------------------------------------------------------------------
/include/vectorial/vectorial.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |   Vectorial
 3 |   Copyright (c) 2010 Mikko Lehtonen
 4 |   Licensed under the terms of the two-clause BSD License (see LICENSE)
 5 | */
 6 | #ifndef VECTORIAL_VECTORIAL_H
 7 | #define VECTORIAL_VECTORIAL_H
 8 | 
 9 | 
10 | #include "vectorial/vec2f.h"
11 | #include "vectorial/vec3f.h"
12 | #include "vectorial/vec4f.h"
13 | 
14 | #include "vectorial/vec_convert.h"
15 | 
16 | #include "vectorial/mat4f.h"
17 | 
18 | 
19 | #endif
20 | 


--------------------------------------------------------------------------------
/include/vectorial/simd4x4f_sse.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |   Vectorial
 3 |   Copyright (c) 2010 Mikko Lehtonen
 4 |   Licensed under the terms of the two-clause BSD License (see LICENSE)
 5 | */
 6 | #ifndef VECTORIAL_SIMD4X4F_SSE_H
 7 | #define VECTORIAL_SIMD4X4F_SSE_H
 8 | 
 9 | 
10 | /* Transposes the matrix stored in *s in place using the _MM_TRANSPOSE4_PS macro. */
11 | vectorial_inline void simd4x4f_transpose_inplace(simd4x4f *s) {
12 |     _MM_TRANSPOSE4_PS(s->x, s->y, s->z, s->w);
13 | }
14 | /* Copies *s into *out, then transposes *out in place; *s is only read. */
15 | vectorial_inline void simd4x4f_transpose(const simd4x4f *s, simd4x4f *out) {
16 |     *out=*s;
17 |     simd4x4f_transpose_inplace(out);
18 | }
19 | 
20 | 
21 | 
22 | 
23 | #endif
24 | 


--------------------------------------------------------------------------------
/tools/update_spec.rb:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env ruby
 2 | 
 3 | SPECHELPER = File.join(File.dirname(__FILE__), "spechelper.m")
 4 | def octave_eval(str, type)
 5 |   puts "evalling (#{type}): #{str}"
 6 |   ret = `octave --quiet --eval 'source("#{SPECHELPER}"); spec_formatter(#{str}, "#{type}")'`
 7 |   puts "    = #{ret.strip}"
 8 |   ret
 9 | end
10 | 
11 | # For every file named on the command line, refresh the line that follows each "// octave <type>:<expr>" marker.
12 | ARGV.each do |fn|
13 |   str = File.read(fn)
14 |   str.gsub!(%r{(// octave (\w+):)(.*?)\n(.*?\n)}) do |match|
15 |     e = octave_eval($3, $2)
16 |     # Keep the marker ($1) and its expression ($3); the old following line ($4) is replaced by Octave's output.
17 |     [$1, $3, "\n", e, "\n"].join
18 |   end
19 |   File.open(fn, "w") do |f|
20 |     f.write str
21 |   end
22 |   # The file is rewritten in place even when no marker matched.
23 | end
24 | 
25 | 


--------------------------------------------------------------------------------
/include/vectorial/simd2f_common.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |   Vectorial
 3 |   Copyright (c) 2014 Google
 4 |   Licensed under the terms of the two-clause BSD License (see LICENSE)
 5 | */
 6 | #ifndef VECTORIAL_SIMD2F_COMMON_H
 7 | #define VECTORIAL_SIMD2F_COMMON_H
 8 | /* Returns simd2f_sqrt applied to simd2f_dot2(v,v). */
 9 | vectorial_inline simd2f simd2f_length2(simd2f v) {
10 |     return simd2f_sqrt( simd2f_dot2(v,v) );
11 | }
12 | /* Returns simd2f_dot2(v,v), i.e. the length computation without the square root. */
13 | vectorial_inline simd2f simd2f_length2_squared(simd2f v) {
14 |     return simd2f_dot2(v,v);
15 | }
16 | /* Multiplies a by simd2f_rsqrt of simd2f_dot2(a,a). */
17 | vectorial_inline simd2f simd2f_normalize2(simd2f a) {
18 |     const simd2f lenSq = simd2f_dot2(a,a);
19 |     return simd2f_mul(a, simd2f_rsqrt(lenSq));
20 | }
21 | 
22 | #endif
23 | 


--------------------------------------------------------------------------------
/include/vectorial/simd2f.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |   Vectorial
 3 |   Copyright (c) 2014 Google, Inc.
 4 |   Licensed under the terms of the two-clause BSD License (see LICENSE)
 5 | */
 6 | 
 7 | #ifndef VECTORIAL_SIMD2F_H
 8 | #define VECTORIAL_SIMD2F_H
 9 | 
10 | #include "vectorial/config.h"
11 | 
12 | #if defined(VECTORIAL_NEON)
13 |     #include "simd2f_neon.h"
14 | #else
15 |     #error No implementation defined
16 | #endif
17 | 
18 | #include "simd2f_common.h"
19 | 
20 | #ifdef __cplusplus
21 | 
22 |     #ifdef VECTORIAL_OSTREAM
23 |         #include <ostream>
24 |         // Writes v to os formatted as "simd2f(x, y)" and returns os.
25 |         vectorial_inline std::ostream& operator<<(std::ostream& os, const simd2f& v) {
26 |             os << "simd2f(";
27 |             os << simd2f_get_x(v) << ", " << simd2f_get_y(v);
28 |             return os << ")";
29 |         }
30 |     #endif
31 | 
32 | #endif
33 | 
34 | 
35 | 
36 | 
37 | #endif
38 | 
39 | 


--------------------------------------------------------------------------------
/include/vectorial/simd4x4f_scalar.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |   Vectorial
 3 |   Copyright (c) 2010 Mikko Lehtonen
 4 |   Licensed under the terms of the two-clause BSD License (see LICENSE)
 5 | */
 6 | #ifndef VECTORIAL_SIMD4X4F_SCALAR_H
 7 | #define VECTORIAL_SIMD4X4F_SCALAR_H
 8 | 
 9 | 
10 | vectorial_inline void simd4x4f_transpose_inplace(simd4x4f *s) {
11 |     /* Work from a snapshot so every write below reads an original value.
12 |        Diagonal elements stay where they are, so only the twelve
13 |        off-diagonal elements are rewritten, one mirrored pair at a time. */
14 |     const simd4x4f src = *s;
15 |     s->x.y = src.y.x;
16 |     s->y.x = src.x.y;
17 | 
18 |     s->x.z = src.z.x;
19 |     s->z.x = src.x.z;
20 | 
21 |     s->x.w = src.w.x;
22 |     s->w.x = src.x.w;
23 | 
24 |     s->y.z = src.z.y;
25 |     s->z.y = src.y.z;
26 | 
27 |     s->y.w = src.w.y;
28 |     s->w.y = src.y.w;
29 | 
30 |     s->z.w = src.w.z;
31 |     s->w.z = src.z.w;
32 | }
33 | /* Copies *s into *out, then transposes *out in place; *s is only read. */
34 | vectorial_inline void simd4x4f_transpose(const simd4x4f *s, simd4x4f *out) {
35 |     *out=*s;
36 |     simd4x4f_transpose_inplace(out);
37 | }
38 | 
39 | 
40 | 
41 | #endif
42 | 


--------------------------------------------------------------------------------
/include/vectorial/simd4x4f_gnu.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |   Vectorial
 3 |   Copyright (c) 2010 Mikko Lehtonen
 4 |   Licensed under the terms of the two-clause BSD License (see LICENSE)
 5 | */
 6 | #ifndef VECTORIAL_SIMD4X4F_GNU_H
 7 | #define VECTORIAL_SIMD4X4F_GNU_H
 8 | 
 9 | 
10 | /* Transposes *s in place by regrouping the elements of its four vectors. */
11 | vectorial_inline void simd4x4f_transpose_inplace(simd4x4f* s) {
12 |     const _simd4f_union sx = { s->x };
13 |     const _simd4f_union sy = { s->y };
14 |     const _simd4f_union sz = { s->z };
15 |     const _simd4f_union sw = { s->w };
16 |     /* The unions give indexed access (f[0]..f[3]); d<n> gathers element n of x, y, z and w. */
17 |     const simd4f dx = { sx.f[0], sy.f[0], sz.f[0], sw.f[0] };
18 |     const simd4f dy = { sx.f[1], sy.f[1], sz.f[1], sw.f[1] };
19 |     const simd4f dz = { sx.f[2], sy.f[2], sz.f[2], sw.f[2] };
20 |     const simd4f dw = { sx.f[3], sy.f[3], sz.f[3], sw.f[3] };
21 |     /* All sources were copied out above, so storing back into *s is safe. */
22 |     s->x = dx;
23 |     s->y = dy;
24 |     s->z = dz;
25 |     s->w = dw;
26 | 
27 | }
28 | /* Copies *s into *out, then transposes *out in place; *s is only read. */
29 | vectorial_inline void simd4x4f_transpose(const simd4x4f *s, simd4x4f *out) {
30 |     *out=*s;
31 |     simd4x4f_transpose_inplace(out);
32 | }
33 | 
34 | 
35 | 
36 | #endif
37 | 


--------------------------------------------------------------------------------
/include/vectorial/simd4x4f_neon.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |   Vectorial
 3 |   Copyright (c) 2010 Mikko Lehtonen
 4 |   Licensed under the terms of the two-clause BSD License (see LICENSE)
 5 | */
 6 | #ifndef VECTORIAL_SIMD4X4F_NEON_H
 7 | #define VECTORIAL_SIMD4X4F_NEON_H
 8 | 
 9 | /* Transposes *s in place by regrouping the elements of its four vectors. */
10 | vectorial_inline void simd4x4f_transpose_inplace(simd4x4f* s) {
11 |     const _simd4f_union sx = { s->x };
12 |     const _simd4f_union sy = { s->y };
13 |     const _simd4f_union sz = { s->z };
14 |     const _simd4f_union sw = { s->w };
15 |     /* The unions give indexed access (f[0]..f[3]); d<n> gathers element n of x, y, z and w. */
16 |     const simd4f dx = simd4f_create( sx.f[0], sy.f[0], sz.f[0], sw.f[0] );
17 |     const simd4f dy = simd4f_create( sx.f[1], sy.f[1], sz.f[1], sw.f[1] );
18 |     const simd4f dz = simd4f_create( sx.f[2], sy.f[2], sz.f[2], sw.f[2] );
19 |     const simd4f dw = simd4f_create( sx.f[3], sy.f[3], sz.f[3], sw.f[3] );
20 |     /* All sources were copied out above, so storing back into *s is safe. */
21 |     s->x = dx;
22 |     s->y = dy;
23 |     s->z = dz;
24 |     s->w = dw;
25 | 
26 | }
27 | /* Copies *s into *out, then transposes *out in place; *s is only read. */
28 | vectorial_inline void simd4x4f_transpose(const simd4x4f *s, simd4x4f *out) {
29 |     *out=*s;
30 |     simd4x4f_transpose_inplace(out);
31 | }
32 | 
33 | 
34 | 
35 | #endif
36 | 


--------------------------------------------------------------------------------
/bench/add_bench.cpp:
--------------------------------------------------------------------------------
 1 | 
 2 | #include "bench.h"
 3 | #include <stdlib.h>
 4 | 
 5 | #include <iostream>
 6 | #include "vectorial/vec4f.h"
 7 | 
 8 | #define NUM (81920)
 9 | #define ITER 100
10 | using namespace vectorial;
11 | 
12 | namespace {
13 |     // Returns raw storage for n vec4f values, requested from memalign() with 16-byte alignment.
14 |     vec4f* alloc_vec4f(size_t n) {
15 |         return static_cast<vec4f*>(memalign(n*sizeof(vec4f), 16));
16 |     }
17 | }
18 | 
19 | 
20 | 
21 | static vec4f * a;
22 | static vec4f * b;
23 | static vec4f * c;
24 | 
25 | 
26 | 
27 | 
28 | void add_func() {
29 |     // Benchmark kernel: stores a[i] + b[i] into c[i] for all NUM elements.
30 |     vec4f* vectorial_restrict lhs = a;
31 |     vec4f* vectorial_restrict rhs = b;
32 |     vec4f* vectorial_restrict sum = c;
33 | 
34 |     for(size_t idx = 0; idx != NUM; ++idx)
35 |     {
36 |         sum[idx] = lhs[idx] + rhs[idx];
37 |     }
38 | }
39 | 
40 | void add_bench() {
41 |     // Allocates a, b and c, fills the two inputs, hands add_func to profile(), then frees all three.
42 |     a = alloc_vec4f(NUM);
43 |     b = alloc_vec4f(NUM);
44 |     c = alloc_vec4f(NUM);
45 | 
46 |     for(size_t idx = 0; idx != NUM; ++idx)
47 |     {
48 |         // a counts up from 0 while b counts down from NUM.
49 |         a[idx]=vec4f(idx,idx,idx,idx);
50 |         b[idx]=vec4f(NUM-idx, NUM-idx, NUM-idx, NUM-idx);
51 |     }
52 | 
53 |     profile("add", add_func, ITER, NUM);
54 | 
55 |     memfree(a);
56 |     memfree(b);
57 |     memfree(c);
58 | 
59 | 
60 | }
61 | 


--------------------------------------------------------------------------------
/include/vectorial/simd4f.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |   Vectorial
 3 |   Copyright (c) 2010 Mikko Lehtonen
 4 |   Licensed under the terms of the two-clause BSD License (see LICENSE)
 5 | */
 6 | 
 7 | #ifndef VECTORIAL_SIMD4F_H
 8 | #define VECTORIAL_SIMD4F_H
 9 | 
10 | #ifndef VECTORIAL_CONFIG_H
11 |   #include "vectorial/config.h"
12 | #endif
13 | 
14 | 
15 | #ifdef VECTORIAL_SCALAR
16 |     #include "simd4f_scalar.h"
17 | #elif defined(VECTORIAL_SSE)
18 |     #include "simd4f_sse.h"
19 | #elif defined(VECTORIAL_GNU)
20 |     #include "simd4f_gnu.h"
21 | #elif defined(VECTORIAL_NEON)
22 |     #include "simd4f_neon.h"
23 | #else
24 |     #error No implementation defined
25 | #endif
26 | 
27 | #include "simd4f_common.h"
28 | 
29 | 
30 | 
31 | #ifdef __cplusplus
32 | 
33 |     #ifdef VECTORIAL_OSTREAM
34 |         #include <ostream>
35 |         // Writes v to os formatted as "simd4f(x, y, z, w)" and returns os.
36 |         vectorial_inline std::ostream& operator<<(std::ostream& os, const simd4f& v) {
37 |             os << "simd4f(";
38 |             os << simd4f_get_x(v) << ", " << simd4f_get_y(v) << ", ";
39 |             os << simd4f_get_z(v) << ", " << simd4f_get_w(v);
40 |             os << ")";
41 |             return os;
42 |         }
43 |     #endif
44 | 
45 | #endif
46 | 
47 | 
48 | 
49 | 
50 | #endif
51 | 
52 | 


--------------------------------------------------------------------------------
/bench/dot_bench.cpp:
--------------------------------------------------------------------------------
 1 | 
 2 | #include "bench.h"
 3 | #include <stdlib.h>
 4 | 
 5 | #include <iostream>
 6 | #include "vectorial/vec4f.h"
 7 | 
 8 | #define NUM (81920)
 9 | #define ITER 100
10 | using namespace vectorial;
11 | 
12 | namespace {
13 |     // Returns raw storage for n vec4f values, requested from memalign() with 16-byte alignment.
14 |     vec4f* alloc_vec4f(size_t n) {
15 |         return static_cast<vec4f*>(memalign(n*sizeof(vec4f), 16));
16 |     }
17 | }
18 | 
19 | 
20 | 
21 | static vec4f * a;
22 | static vec4f * b;
23 | static float * c;
24 | 
25 | 
26 | 
27 | 
28 | void dot_func() {
29 |     // Benchmark kernel: stores dot(a[i], b[i]) into c[i] for all NUM elements.
30 |     vec4f* vectorial_restrict lhs = a;
31 |     vec4f* vectorial_restrict rhs = b;
32 |     float* vectorial_restrict out = c;
33 | 
34 |     for(size_t idx = 0; idx != NUM; ++idx)
35 |     {
36 |         out[idx] = dot(lhs[idx], rhs[idx]);
37 |     }
38 | }
39 | 
40 | void dot_bench() {
41 |     // Allocates the inputs and the result array, fills the inputs, hands dot_func to profile(), then frees everything.
42 |     a = alloc_vec4f(NUM);
43 |     b = alloc_vec4f(NUM);
44 |     c = static_cast<float*>(malloc(NUM * sizeof(float)));
45 | 
46 | 
47 |     for(size_t i = 0; i < NUM; ++i)
48 |     {
49 |         a[i]=vec4f(i,i,i,i);
50 |         b[i]=vec4f(NUM-i, NUM-i, NUM-i, NUM-i);
51 |     }
52 |         
53 |     profile("dot", dot_func, ITER, NUM);
54 | 
55 |     memfree(a);
56 |     memfree(b);
57 |     // c came from plain malloc, so it must go to free(); memfree() is _aligned_free on Windows.
58 |     free(c);
59 | 
60 | }
61 | 


--------------------------------------------------------------------------------
/include/vectorial/vec_convert.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |   Vectorial
 3 |   Copyright (c) 2010 Mikko Lehtonen
 4 |   Licensed under the terms of the two-clause BSD License (see LICENSE)
 5 | */
 6 | #ifndef VECTORIAL_VEC_CONVERT_H
 7 | #define VECTORIAL_VEC_CONVERT_H
 8 | 
 9 | 
10 | namespace vectorial {
11 |     // Out-of-line definitions of the conversion members of vec2f, vec3f and vec4f.
12 |     inline vec3f vec4f::xyz() const { return vec3f(value); }
13 |     inline vec2f vec4f::xy() const { return vec2f(value); }
14 |     // vec3f: xyz0() passes value through simd4f_zero_w; xyz1() and xyzw() add their w onto xyz0().
15 |     inline vec4f vec3f::xyz0() const { return vec4f(simd4f_zero_w(value)); }
16 |     inline vec4f vec3f::xyz1() const { return xyz0() + vec4f(0.0f, 0.0f, 0.0f, 1.0f); }
17 |     inline vec4f vec3f::xyzw(float w) const { return xyz0() + vec4f(0.0f, 0.0f, 0.0f, w); }
18 |     inline vec3f vec3f::xyz() const { return vec3f(value); }
19 |     inline vec3f vec3f::xy0() const { return vec3f(value) * vec3f(1.0f, 1.0f, 0.0f); }
20 |     inline vec2f vec3f::xy() const { return vec2f(value); }
21 |     // vec2f: xy00() passes value through simd4f_zero_zw; xy01() and xyzw() add their z/w onto xy00().
22 |     inline vec4f vec2f::xy00() const { return vec4f(simd4f_zero_zw(value)); }
23 |     inline vec4f vec2f::xy01() const { return xy00() + vec4f(0.0f, 0.0f, 0.0f, 1.0f); }
24 |     inline vec4f vec2f::xyzw(float z, float w) const { return xy00() + vec4f(0.0f, 0.0f, z, w); }
25 |     inline vec3f vec2f::xy0() const { return vec3f(simd4f_zero_zw(value)); }
26 |     inline vec2f vec2f::xy() const { return vec2f(value); }
27 | 
28 | }
29 | 
30 | 
31 | #endif
32 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | Copyright 2010 Mikko Lehtonen. All rights reserved.
 2 | 
 3 | Redistribution and use in source and binary forms, with or without modification, are
 4 | permitted provided that the following conditions are met:
 5 | 
 6 |    1. Redistributions of source code must retain the above copyright notice, this list of
 7 |       conditions and the following disclaimer.
 8 | 
 9 |    2. Redistributions in binary form must reproduce the above copyright notice, this list
10 |       of conditions and the following disclaimer in the documentation and/or other materials
11 |       provided with the distribution.
12 | 
13 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
14 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
15 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
16 | IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
17 | INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
18 | NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
19 | PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
20 | WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
21 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
22 | POSSIBILITY OF SUCH DAMAGE.
23 | 


--------------------------------------------------------------------------------
/bench/matrix_bench.cpp:
--------------------------------------------------------------------------------
 1 | 
 2 | #include "bench.h"
 3 | #include <stdlib.h>
 4 | 
 5 | #include <iostream>
 6 | #include "vectorial/simd4x4f.h"
 7 | 
 8 | #define NUM (819200)
 9 | #define ITER 100
10 | //using namespace vectorial;
11 | 
12 | namespace {
13 |     // Returns raw storage for n simd4x4f values, requested from memalign() with 16-byte alignment.
14 |     simd4x4f* alloc_vec4x4f(size_t n) {
15 |         return static_cast<simd4x4f*>(memalign(n*sizeof(simd4x4f), 16));
16 |     }
17 | }
18 | 
19 | 
20 | 
21 | static simd4x4f * a;
22 | static simd4x4f * b;
23 | static simd4x4f * c;
24 | 
25 | 
26 | 
27 | 
28 | void matrix_func() {
29 |     // Benchmark kernel: for every index, multiplies a[i] by b[i] and stores the product in c[i].
30 |     simd4x4f* vectorial_restrict aa = a;
31 |     simd4x4f* vectorial_restrict bb = b;
32 |     simd4x4f* vectorial_restrict cc = c;
33 |     // Output goes to cc: writing it to bb overwrote an input on each call and left cc unused.
34 |     for(size_t i = 0; i < NUM; ++i)
35 |     {
36 |         simd4x4f_matrix_mul(&aa[i], &bb[i], &cc[i]);
37 |     }    
38 | }
39 | 
40 | void matrix_bench() {
41 |     // Allocates three matrix arrays, fills a and b, hands matrix_func to profile(), then frees all three.
42 |     a = alloc_vec4x4f(NUM);
43 |     b = alloc_vec4x4f(NUM);
44 |     c = alloc_vec4x4f(NUM);
45 | 
46 |     for(size_t idx = 0; idx != NUM; ++idx)
47 |     {
48 |         // All four vectors of a[idx] hold idx in every element; b[idx] holds NUM-idx likewise.
49 |         simd4f rising = simd4f_create(idx,idx,idx,idx);
50 |         simd4f falling = simd4f_create(NUM-idx,NUM-idx,NUM-idx,NUM-idx);
51 |         a[idx]=simd4x4f_create(rising,rising,rising,rising);
52 |         b[idx]=simd4x4f_create(falling,falling,falling,falling);
53 |     }
54 | 
55 |     profile("matrix mul", matrix_func, ITER, NUM);
56 | 
57 |     memfree(a);
58 |     memfree(b);
59 |     memfree(c);
60 | 
61 | 
62 | }
63 | 


--------------------------------------------------------------------------------
/bench/bench.h:
--------------------------------------------------------------------------------
 1 | #ifndef BENCH_H
 2 | #define BENCH_H
 3 | 
 4 | #include <string>
 5 | #include <stdlib.h>
 6 | 
 7 | #ifdef __APPLE__
 8 |     #define BENCH_MACH
 9 |     #include <mach/mach_time.h>
10 |     #include <stdint.h>
11 | #elif defined(_WIN32)
12 |     #define BENCH_QPC
13 |     #define WIN32_LEAN_AND_MEAN
14 |     #include <windows.h>
15 |     #include <malloc.h>
16 | #else
17 |     #define BENCH_GTOD
18 |     #include <sys/time.h>
19 | #endif
20 | // Allocates count bytes aligned to align bytes; release the result with memfree().
21 | // Returns NULL when posix_memalign reports an error, never an indeterminate pointer.
22 | static void* memalign(size_t count, size_t align) {
23 |     #ifdef _WIN32
24 |     return _aligned_malloc(count,align);
25 |     #else
26 |     void *ptr = NULL;
27 |     // posix_memalign signals failure (EINVAL or ENOMEM) only through its
28 |     // return code, so check that instead of trusting ptr.
29 |     if( posix_memalign(&ptr, align, count) != 0 ) return NULL;
30 |     return ptr;
31 |     #endif
32 | }
33 | // Releases memory obtained from memalign(): _aligned_free on Windows, free() elsewhere.
34 | static void memfree(void* ptr) {
35 |     #ifdef _WIN32
36 |     _aligned_free(ptr);
37 |     #else
38 |     free(ptr);
39 |     #endif
40 | }
41 | 
42 | namespace profiler {
43 |     // time_t is the timestamp type of whichever BENCH_* clock backend was selected at the top of this header.
44 |     #ifdef BENCH_GTOD
45 |         typedef struct timeval time_t;
46 |     #endif
47 |     #ifdef BENCH_MACH
48 |         typedef const uint64_t time_t;
49 |     #endif
50 |     #ifdef BENCH_QPC
51 |         typedef LARGE_INTEGER time_t;
52 |     #endif
53 |     // Declarations only; the definitions are not part of this header.
54 |     void init();
55 |     time_t now();
56 |     // NOTE(review): presumably the elapsed time between start and end; units not visible here -- confirm.
57 |     double diffTime(time_t start, time_t end);
58 | 
59 | }
60 | 
61 | std::string formatTime(double d, double relative=-1);
62 | void profile(const char* name, void (*func)(), int iterations, int elements);
63 | 
64 | 
65 | #endif
66 | 


--------------------------------------------------------------------------------
/spec/spec_mat4f.cpp:
--------------------------------------------------------------------------------
 1 | #include "spec_helper.h"
 2 | #include <iostream>
 3 | using vectorial::vec4f;
 4 | using vectorial::mat4f;
 5 | 
 6 | const int epsilon = 1;
 7 | 
 8 | describe(mat4f, "constructing") {
 9 |     it("should have default constructor that does nothing..") {
10 |         mat4f x;
11 |     }
12 | 
13 |     it("should have constructor that constructs from four vec4") {
14 |         mat4f x( vec4f(1,2,3,4), vec4f(5,6,7,8), vec4f(9,10,11,12), vec4f(13,14,15,16) );
15 | 
16 |         // octave mat4f: [1,5,9,13 ; 2,6,10,14 ; 3,7,11,15 ; 4,8,12,16 ]
17 |         should_be_equal_mat4f(x, simd4x4f_create(simd4f_create(1.000000000000000f, 2.000000000000000f, 3.000000000000000f, 4.000000000000000f), simd4f_create(5.000000000000000f, 6.000000000000000f, 7.000000000000000f, 8.000000000000000f), simd4f_create(9.000000000000000f, 10.000000000000000f, 11.000000000000000f, 12.000000000000000f), simd4f_create(13.000000000000000f, 14.000000000000000f, 15.000000000000000f, 16.000000000000000f)), epsilon );
18 |     }
19 |     
20 |     it("should have static function to create identity matrix") {
21 |         
22 |         mat4f x = mat4f::identity();
23 |         
24 |         // octave mat4f: [1,0,0,0;0,1,0,0;0,0,1,0;0,0,0,1]
25 |         should_be_equal_mat4f(x, simd4x4f_create(simd4f_create(1.000000000000000f, 0.000000000000000f, 0.000000000000000f, 0.000000000000000f), simd4f_create(0.000000000000000f, 1.000000000000000f, 0.000000000000000f, 0.000000000000000f), simd4f_create(0.000000000000000f, 0.000000000000000f, 1.000000000000000f, 0.000000000000000f), simd4f_create(0.000000000000000f, 0.000000000000000f, 0.000000000000000f, 1.000000000000000f)), epsilon );
26 |     }
27 |     
28 | }
29 | 
30 | 


--------------------------------------------------------------------------------
/spec/spec_main.cpp:
--------------------------------------------------------------------------------
 1 | /* Specific - Minimal C++ spec framework.
 2 |  
 3 | 
 4 | The zlib/libpng License
 5 | 
 6 | 
 7 | Copyright (c) 2008 Mikko Lehtonen
 8 | 
 9 | This software is provided 'as-is', without any express or implied
10 | warranty. In no event will the authors be held liable for any damages
11 | arising from the use of this software.
12 | 
13 | Permission is granted to anyone to use this software for any purpose,
14 | including commercial applications, and to alter it and redistribute it
15 | freely, subject to the following restrictions:
16 | 
17 |     1. The origin of this software must not be misrepresented; you must not
18 |     claim that you wrote the original software. If you use this software
19 |     in a product, an acknowledgment in the product documentation would be
20 |     appreciated but is not required.
21 | 
22 |     2. Altered source versions must be plainly marked as such, and must not be
23 |     misrepresented as being the original software.
24 | 
25 |     3. This notice may not be removed or altered from any source
26 |     distribution.
27 | */
28 | 
29 | 
30 | #include "spec.h"
31 | #include <cstdlib>
32 | 
33 | int main(int argc, char *argv[])
34 | {
35 |     // Usage: [-s] [subset]. "-s" selects the specdoc writer; any other
36 |     // argument becomes the subset string handed to the runner (last one wins).
37 |     std::string subset;
38 |     specific::ProgressWriter progressWriter;
39 |     specific::SpecdocWriter specdocWriter;
40 |     specific::SpecWriter* writer = &progressWriter;
41 | 
42 |     for(int argIndex = 1; argIndex < argc; ++argIndex) {
43 |         const std::string arg(argv[argIndex]);
44 |         if( arg != "-s" ) {
45 |             subset = arg;
46 |             continue;
47 |         }
48 |         writer = &specdocWriter;
49 |     }
50 | 
51 |     // Exit status mirrors the boolean returned by SpecRunner::run.
52 |     const bool success = specific::SpecRunner::getInstance().run(*writer, subset);
53 |     return success ? EXIT_SUCCESS : EXIT_FAILURE;
54 | }
55 | 
56 | 


--------------------------------------------------------------------------------
/README:
--------------------------------------------------------------------------------
 1 | 
 2 |     Vectorial - vector math library
 3 | 
 4 | 
 5 | 
 6 |   Motivation
 7 | 
 8 |     I couldn't find an open source math library that was usable and
 9 |     supported simd - especially the ARM NEON variant.
10 | 
11 | 
12 |   Features
13 | 
14 |     Supports NEON, SSE, scalar and generic gcc vector extension.
15 |     Most basic vector and matrix math is available, but not quite
16 |     yet full featured.
17 | 
18 | 
19 |   Design
20 | 
21 |     Vectorial consists of two main parts, pure-C wrapper around
22 |     platform-specific vector instructions in the simd*.h files
23 |     and C++ classes for common uses, the vec*.h and mat*.h
24 | 
25 |     The config.h autodetects appropriate vector instructions to use.
26 | 
27 |     The platform-specific support is done with intrinsics only,
28 |     allowing the compiler to have a full view of the code, hopefully
29 |     resulting in better optimizations especially with reordering etc.
30 | 
31 | 
32 |   Installation / Usage
33 | 
34 |     Add vectorial/include to your include path
35 | 
36 |     #include "vectorial/simd4f.h"  
37 |     for C-only simd wrapper, using it looks like this:
38 |       simd4f v = simd4f_normalize( simd4f_add( simd4f_create(1,2,3,4), y) );
39 |       float z = simd4f_get_z(v);
40 | 
41 |     #include "vectorial/vectorial.h"
42 |     for C++ classes. They reside in vectorial namespace, you might
43 |     want to alias them to your own namespace
44 |       namespace myproject {
45 |         using namespace ::vectorial;
46 |         // if you like different name: typedef vec3f Vector3;
47 |       }
48 |       using myproject::vec4f;
49 |       
50 |       vec4f v = normalize( vec4f(1,2,3,4) + y );
51 |       float z = v.z();
52 | 
53 | 
54 |   License
55 | 
56 |     2-clause BSD. See LICENSE
57 | 
58 | 
59 | 
60 | 
61 | 


--------------------------------------------------------------------------------
/tools/spechelper.m:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env octave
 2 | 
 3 | 1;
 4 | 
 5 | function spec_formatter (val,type)
 6 |     # Prints one C++ spec assertion for val, chosen by its size; a scalar ignores type.
 7 |     if( isscalar(val) == 1 ) 
 8 |         printf("        should_be_close_to(x, %15.15ff, epsilon );", val);
 9 |         return;
10 |     endif
11 |     # 1x2: simd2f_create when type is "simd2f", otherwise simd4f_create padded with two zeros.
12 |     if( size(val) == [1,2] ) 
13 |         if( strcmp(type,"simd2f") == 1 )
14 |         printf("        should_be_equal_%s(x, simd2f_create(%15.15ff, %15.15ff), epsilon );",type, val(1), val(2));
15 |         else
16 |         printf("        should_be_equal_%s(x, simd4f_create(%15.15ff, %15.15ff, 0.0f, 0.0f), epsilon );",type, val(1), val(2));
17 |         endif
18 |         return;
19 |     endif
20 |     # 1x3: simd4f_create padded with a zero fourth element.
21 |     if( size(val) == [1,3] ) 
22 |         printf("        should_be_equal_%s(x, simd4f_create(%15.15ff, %15.15ff, %15.15ff, 0.0f), epsilon );",type, val(1), val(2), val(3));
23 |         return;
24 |     endif
25 |     # 1x4 row vector.
26 |     if( size(val) == [1,4] ) 
27 |         printf("        should_be_equal_%s(x, simd4f_create(%15.15ff, %15.15ff, %15.15ff, %15.15ff), epsilon );",type, val(1), val(2), val(3), val(4));
28 |         return;
29 |     endif
30 |     # 4x1 column vector: same output as the 1x4 case.
31 |     if( size(val) == [4,1] ) 
32 |         printf("        should_be_equal_%s(x, simd4f_create(%15.15ff, %15.15ff, %15.15ff, %15.15ff), epsilon );",type, val(1), val(2), val(3), val(4));
33 |         return;
34 |     endif
35 |     # 4x4: the sixteen values val(1)..val(16) in linear-index order, four per simd4f_create.
36 |     if( size(val) == [4,4] ) 
37 |         printf("        should_be_equal_%s(x, simd4x4f_create(simd4f_create(%15.15ff, %15.15ff, %15.15ff, %15.15ff), simd4f_create(%15.15ff, %15.15ff, %15.15ff, %15.15ff), simd4f_create(%15.15ff, %15.15ff, %15.15ff, %15.15ff), simd4f_create(%15.15ff, %15.15ff, %15.15ff, %15.15ff)), epsilon );",type, 
38 |         val(1), val(2), val(3), val(4), val(5), val(6), val(7), val(8), val(9), val(10), val(11), val(12), val(13), val(14), val(15), val(16)
39 |         );
40 |         return;
41 |     endif
42 |     # Any other size falls through and prints nothing.
43 | 
44 | endfunction
45 | 
46 | 


--------------------------------------------------------------------------------
/vectorial.sln:
--------------------------------------------------------------------------------
 1 | 
 2 | Microsoft Visual Studio Solution File, Format Version 10.00
 3 | # Visual C++ Express 2008
 4 | Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "vectorial specsuite", "vectorial.vcproj", "{9450BCE8-02CB-4169-8471-2DFF764817F4}"
 5 | EndProject
 6 | Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "vectorial benchmark", "vectorialbenchmark.vcproj", "{1E78F64D-C404-4048-8AE6-217089480E8A}"
 7 | EndProject
 8 | Global
 9 | 	GlobalSection(SolutionConfigurationPlatforms) = preSolution
10 | 		Debug|Win32 = Debug|Win32
11 | 		Release Scalar|Win32 = Release Scalar|Win32
12 | 		Release SSE|Win32 = Release SSE|Win32
13 | 	EndGlobalSection
14 | 	GlobalSection(ProjectConfigurationPlatforms) = postSolution
15 | 		{9450BCE8-02CB-4169-8471-2DFF764817F4}.Debug|Win32.ActiveCfg = Debug|Win32
16 | 		{9450BCE8-02CB-4169-8471-2DFF764817F4}.Debug|Win32.Build.0 = Debug|Win32
17 | 		{9450BCE8-02CB-4169-8471-2DFF764817F4}.Release Scalar|Win32.ActiveCfg = Release Scalar|Win32
18 | 		{9450BCE8-02CB-4169-8471-2DFF764817F4}.Release Scalar|Win32.Build.0 = Release Scalar|Win32
19 | 		{9450BCE8-02CB-4169-8471-2DFF764817F4}.Release SSE|Win32.ActiveCfg = Release|Win32
20 | 		{9450BCE8-02CB-4169-8471-2DFF764817F4}.Release SSE|Win32.Build.0 = Release|Win32
21 | 		{1E78F64D-C404-4048-8AE6-217089480E8A}.Debug|Win32.ActiveCfg = Debug|Win32
22 | 		{1E78F64D-C404-4048-8AE6-217089480E8A}.Debug|Win32.Build.0 = Debug|Win32
23 | 		{1E78F64D-C404-4048-8AE6-217089480E8A}.Release Scalar|Win32.ActiveCfg = Release Scalar|Win32
24 | 		{1E78F64D-C404-4048-8AE6-217089480E8A}.Release Scalar|Win32.Build.0 = Release Scalar|Win32
25 | 		{1E78F64D-C404-4048-8AE6-217089480E8A}.Release SSE|Win32.ActiveCfg = Release|Win32
26 | 		{1E78F64D-C404-4048-8AE6-217089480E8A}.Release SSE|Win32.Build.0 = Release|Win32
27 | 	EndGlobalSection
28 | 	GlobalSection(SolutionProperties) = preSolution
29 | 		HideSolutionNode = FALSE
30 | 	EndGlobalSection
31 | EndGlobal
32 | 


--------------------------------------------------------------------------------
/include/vectorial/simd4f_common.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |   Vectorial
 3 |   Copyright (c) 2010 Mikko Lehtonen
 4 |   Copyright (c) 2014 Google, Inc.
 5 |   Licensed under the terms of the two-clause BSD License (see LICENSE)
 6 | */
 7 | #ifndef VECTORIAL_SIMD4F_COMMON_H
 8 | #define VECTORIAL_SIMD4F_COMMON_H
 9 | 
10 | 
11 | /* Adds the x, y, z and w splats of v together, in that order. */
12 | vectorial_inline simd4f simd4f_sum(simd4f v) {
13 |     simd4f total = simd4f_add(simd4f_splat_x(v), simd4f_splat_y(v));
14 |     total = simd4f_add(total, simd4f_splat_z(v));
15 |     return simd4f_add(total, simd4f_splat_w(v));
16 | }
17 | /* Returns simd4f_sum of the simd4f_mul product of lhs and rhs. */
18 | vectorial_inline simd4f simd4f_dot4(simd4f lhs, simd4f rhs) {
19 |     return simd4f_sum( simd4f_mul(lhs, rhs) );
20 | }
21 | /* Multiplies lhs and rhs with simd4f_mul, then adds the x and y splats of
22 |    the product; the z and w elements do not contribute. */
23 | vectorial_inline simd4f simd4f_dot2(simd4f lhs, simd4f rhs) {
24 |     const simd4f prod = simd4f_mul(lhs, rhs);
25 |     return simd4f_add(simd4f_splat_x(prod), simd4f_splat_y(prod));
26 | }
27 | 
28 | /* Returns simd4f_sqrt applied to simd4f_dot4(v,v). */
29 | vectorial_inline simd4f simd4f_length4(simd4f v) {
30 |     return simd4f_sqrt( simd4f_dot4(v,v) );
31 | }
32 | /* Returns simd4f_sqrt applied to simd4f_dot3(v,v); simd4f_dot3 is not defined in this header. */
33 | vectorial_inline simd4f simd4f_length3(simd4f v) {
34 |     return simd4f_sqrt( simd4f_dot3(v,v) );
35 | }
36 | /* Returns simd4f_sqrt applied to simd4f_dot2(v,v). */
37 | vectorial_inline simd4f simd4f_length2(simd4f v) {
38 |     return simd4f_sqrt( simd4f_dot2(v,v) );
39 | }
40 | /* Returns simd4f_dot4(v,v), i.e. simd4f_length4 without the square root. */
41 | vectorial_inline simd4f simd4f_length4_squared(simd4f v) {
42 |     return simd4f_dot4(v,v);
43 | }
44 | /* Returns simd4f_dot3(v,v), i.e. simd4f_length3 without the square root. */
45 | vectorial_inline simd4f simd4f_length3_squared(simd4f v) {
46 |     return simd4f_dot3(v,v);
47 | }
48 | 
49 | vectorial_inline float simd4f_length3_squared_scalar(simd4f v) {
50 |     return simd4f_dot3_scalar(v,v);
51 | }
52 | 
53 | vectorial_inline simd4f simd4f_length2_squared(simd4f v) {
54 |     return simd4f_dot2(v,v);
55 | }
56 | 
57 | // Scales a by the reciprocal square root of its 4-lane squared length.
58 | // A zero-length input is not special-cased.
59 | vectorial_inline simd4f simd4f_normalize4(simd4f a) {
60 |     return simd4f_mul(a, simd4f_rsqrt(simd4f_dot4(a, a)));
61 | }
62 | 
63 | // Same scaling, with the length measured over the x, y, z lanes only.
64 | vectorial_inline simd4f simd4f_normalize3(simd4f a) {
65 |     return simd4f_mul(a, simd4f_rsqrt(simd4f_dot3(a, a)));
66 | }
67 | 
68 | // Same scaling, with the length measured over the x, y lanes only.
69 | vectorial_inline simd4f simd4f_normalize2(simd4f a) {
70 |     return simd4f_mul(a, simd4f_rsqrt(simd4f_dot2(a, a)));
71 | }
72 | 
73 | 
74 | #endif
75 | 


--------------------------------------------------------------------------------
/include/vectorial/config.h:
--------------------------------------------------------------------------------
  1 | /*
  2 |   Vectorial
  3 |   Copyright (c) 2010 Mikko Lehtonen
  4 |   Licensed under the terms of the two-clause BSD License (see LICENSE)
  5 | */
  6 | #ifndef VECTORIAL_CONFIG_H
  7 | #define VECTORIAL_CONFIG_H
  8 | 
  9 | 
 10 | #ifndef VECTORIAL_FORCED
 11 |     #if defined(__SSE__) || (_M_IX86_FP > 0) || (_M_X64 > 0)
 12 | 
 13 |         #define VECTORIAL_SSE
 14 | 
 15 |     // __ARM_NEON is used instead of __ARM_NEON__ on armv8.
 16 |     #elif defined(__ARM_NEON__) || defined(__ARM_NEON)
 17 | 
 18 |         #define VECTORIAL_NEON
 19 | 
 20 |     // Don't use the GNU vector extension on ARM: it is buggy with some GCC versions on armv6 with -Os,
 21 |     // and it also doesn't seem to perform as well.
 22 |     #elif defined(__GNUC__) && !defined(__arm__)
 23 | 
 24 |         #define VECTORIAL_GNU
 25 | 
 26 |     #else
 27 | 
 28 |         #define VECTORIAL_SCALAR
 29 | 
 30 |     #endif
 31 | #endif
 32 | 
 33 | 
 34 | 
 35 | #ifdef VECTORIAL_SCALAR
 36 |     #define VECTORIAL_SIMD_TYPE "scalar"
 37 | #endif
 38 | 
 39 | #ifdef VECTORIAL_SSE
 40 |     #define VECTORIAL_SIMD_TYPE "sse"
 41 | #endif
 42 | 
 43 | #ifdef VECTORIAL_NEON
 44 |     #define VECTORIAL_SIMD_TYPE "neon"
 45 |     #define VECTORIAL_HAVE_SIMD2F
 46 | #endif
 47 | 
 48 | #ifdef VECTORIAL_GNU
 49 |     #define VECTORIAL_SIMD_TYPE "gnu"
 50 | #endif
 51 | 
 52 | 
 53 | 
 54 | #if defined(VECTORIAL_FORCED) && !defined(VECTORIAL_SIMD_TYPE)
 55 |     #error VECTORIAL_FORCED set but no simd-type found, try f.ex. VECTORIAL_SCALAR
 56 | #endif
 57 | 
 58 | 
 59 | #define vectorial_inline    static inline
 60 | 
 61 | #if defined(__GNUC__)
 62 |   // GNU-compatible compilers accept __restrict in both C and C++, so define it
 63 |   // unconditionally; it was previously left undefined when compiling as C.
 64 |   #define vectorial_restrict  __restrict
 65 |   #define simd4f_aligned16  __attribute__ ((aligned (16)))
 66 | #elif defined(_WIN32)
 67 |   #define vectorial_restrict
 68 |   #define simd4f_aligned16   __declspec(align(16))
 69 | #else
 70 |   #define vectorial_restrict  restrict
 71 |   #define simd4f_aligned16
 72 | #endif
 73 | // #define vectorial_restrict
 74 | 
 75 | #ifdef __GNUC__
 76 |     #define vectorial_pure __attribute__((pure))
 77 | #else
 78 |     #define vectorial_pure
 79 | #endif
 80 | 
 81 | #ifdef _WIN32
 82 |   #if defined(min) || defined(max)
 83 | #pragma message ( "set NOMINMAX as preprocessor macro, undefining min/max " )
 84 | #undef min
 85 | #undef max
 86 |   #endif
 87 | #endif
 88 | 
 89 | #ifdef __cplusplus
 90 |     // Hack around msvc badness
 91 |     #define SIMD_PARAM(t, p) const t& p
 92 | #else
 93 |     #define SIMD_PARAM(t, p) t p
 94 | #endif
 95 |                     
 96 | #define VECTORIAL_PI      3.14159265f
 97 | #define VECTORIAL_HALFPI  1.57079633f
 98 | 
 99 | 
100 | 
101 | #endif
102 | 


--------------------------------------------------------------------------------
/bench/quad_bench.cpp:
--------------------------------------------------------------------------------
  1 | 
  2 | #include "bench.h"
  3 | #include <stdlib.h>
  4 | 
  5 | #include <iostream>
  6 | #include "vectorial/simd4x4f.h"
  7 | 
  8 | #define NUM (81920)
  9 | #define ITER 100
 10 | //using namespace vectorial;
 11 | // File-local helper: obtains storage for n simd4x4f values through memalign().
 12 | namespace {
 13 |     // NOTE(review): memalign/memfree presumably come from bench.h - confirm the (size, 16) argument order there.
 14 |     simd4x4f* alloc_simd4x4f(size_t n) {
 15 |         return static_cast<simd4x4f*>(memalign(n*sizeof(simd4x4f), 16));
 16 |     }
 17 | }
 18 | 
 19 | 
 20 | 
 21 | static simd4x4f * a;
 22 | static simd4x4f * b;
 23 | static simd4x4f * c;
 24 | 
 25 | 
 26 | // Member-wise sum of two matrices; operands arrive via SIMD_PARAM, result by value.
 27 | static simd4x4f add_4x4(SIMD_PARAM(simd4x4f, a), SIMD_PARAM(simd4x4f, b)) {
 28 |     const simd4f sum_x = simd4f_add(a.x, b.x);
 29 |     const simd4f sum_y = simd4f_add(a.y, b.y);
 30 |     const simd4f sum_z = simd4f_add(a.z, b.z);
 31 |     const simd4f sum_w = simd4f_add(a.w, b.w);
 32 |     return simd4x4f_create(sum_x, sum_y, sum_z, sum_w);
 33 | }
 34 | 
 35 | // Same sum with the operands passed by pointer and the result returned by value.
 36 | static simd4x4f add_4x4_rp(simd4x4f *a, simd4x4f *b) {
 37 |     const simd4f sum_x = simd4f_add(a->x, b->x);
 38 |     const simd4f sum_y = simd4f_add(a->y, b->y);
 39 |     const simd4f sum_z = simd4f_add(a->z, b->z);
 40 |     const simd4f sum_w = simd4f_add(a->w, b->w);
 41 |     return simd4x4f_create(sum_x, sum_y, sum_z, sum_w);
 42 | }
 43 | 
 44 | // Same sum written through out, one member at a time, in x, y, z, w order.
 45 | static void add_4x4_p(simd4x4f *a, simd4x4f *b, simd4x4f *out) {
 46 |     simd4x4f &dst = *out;
 47 |     dst.x = simd4f_add(a->x, b->x);
 48 |     dst.y = simd4f_add(a->y, b->y);
 49 |     dst.z = simd4f_add(a->z, b->z);
 50 |     dst.w = simd4f_add(a->w, b->w);
 51 | }
 52 | 
 53 | 
 54 | 
 55 | // Benchmark kernel: b[i] = a[i] + b[i] for all NUM matrices, using
 56 | // add_4x4(), which takes its operands via SIMD_PARAM and returns the sum.
 57 | void quad_return_func() {
 58 |     // Local copies of the file-scope buffer pointers (the unused copy of c was dropped).
 59 |     simd4x4f* aa = a;
 60 |     simd4x4f* bb = b;
 61 | 
 62 |     for(size_t i = 0; i < NUM; ++i)
 63 |     {
 64 |         bb[i] = add_4x4(aa[i], bb[i]);
 65 |     }
 66 | 
 67 | }
 68 | 
 69 | // Benchmark kernel: b[i] = a[i] + b[i] for all NUM matrices, using
 70 | // add_4x4_p(), which reads both operands and writes the result via pointers.
 71 | void quad_pointer_func() {
 72 |     // Local copies of the file-scope buffer pointers (the unused copy of c was dropped).
 73 |     simd4x4f* aa = a;
 74 |     simd4x4f* bb = b;
 75 | 
 76 |     for(size_t i = 0; i < NUM; ++i)
 77 |     {
 78 |         // The output pointer is the same element as the second operand.
 79 |         add_4x4_p(&aa[i], &bb[i], &bb[i]);
 80 |     }
 81 | 
 82 | }
 83 | // Benchmark kernel: b[i] = a[i] + b[i] for all NUM matrices, using
 84 | // add_4x4_rp(), which takes its operands by pointer and returns the sum.
 85 | void quad_pointer_return_func() {
 86 |     // Local copies of the file-scope buffer pointers (the unused copy of c was dropped).
 87 |     simd4x4f* aa = a;
 88 |     simd4x4f* bb = b;
 89 | 
 90 |     for(size_t i = 0; i < NUM; ++i)
 91 |     {
 92 |         bb[i] = add_4x4_rp(&aa[i], &bb[i]);
 93 |     }
 94 | 
 95 | 
 96 | }
 97 | 
 98 | // Fills a[] and b[] with test data, times the three add kernels through
 99 | // profile(), then frees the buffers. c is allocated and freed but never used.
100 | void quad_bench() {
101 |     a = alloc_simd4x4f(NUM);
102 |     b = alloc_simd4x4f(NUM);
103 |     c = alloc_simd4x4f(NUM);
104 | 
105 |     // a[i] gets i in every lane, b[i] gets NUM-i in every lane.
106 |     for(size_t i = 0; i < NUM; ++i)
107 |     {
108 |         const float up = static_cast<float>(i);
109 |         const float down = static_cast<float>(NUM-i);
110 |         const simd4f row_a = simd4f_create(up, up, up, up);
111 |         const simd4f row_b = simd4f_create(down, down, down, down);
112 |         a[i]=simd4x4f_create(row_a, row_a, row_a, row_a);
113 |         b[i]=simd4x4f_create(row_b, row_b, row_b, row_b);
114 |     }
115 | 
116 |     profile("quad return-value", quad_return_func, ITER, NUM);
117 |     profile("quad pass-by-pointer", quad_pointer_func, ITER, NUM);
118 |     profile("quad pass-by-pointer return-value", quad_pointer_return_func, ITER, NUM);
119 | 
120 |     memfree(a);
121 |     memfree(b);
122 |     memfree(c);
123 | }
124 | 


--------------------------------------------------------------------------------
/bench/bench.cpp:
--------------------------------------------------------------------------------
  1 | #include "bench.h"
  2 | #include <sstream>
  3 | #include <iostream>
  4 | #include "vectorial/config.h"
  5 | 
  6 | 
  7 | namespace profiler {
  8 |     // Timer backend chosen by BENCH_MACH / BENCH_GTOD / BENCH_QPC (presumably set in bench.h - confirm).
  9 |     #ifdef BENCH_MACH
 10 |     mach_timebase_info_data_t info;
 11 |     void init() {
 12 |         mach_timebase_info(&info);
 13 |     }
 14 |     #endif
 15 |     // gettimeofday backend: nothing to initialise.
 16 |     #ifdef BENCH_GTOD
 17 |     void init() {
 18 |     }
 19 |     #endif
 20 |     // QueryPerformanceCounter backend: cache the counter frequency as a double.
 21 |     #ifdef BENCH_QPC
 22 |     double frequency;
 23 |     void init() {
 24 |         LARGE_INTEGER freq;
 25 |         QueryPerformanceFrequency(&freq);
 26 |         frequency = (double)freq.QuadPart;
 27 |     }
 28 |     #endif
 29 | 
 30 |     // Returns the current timestamp in the backend's native representation.
 31 |     time_t now() {
 32 |         // NOTE(review): time_t here is presumably a per-backend typedef from bench.h - confirm.
 33 |         #ifdef BENCH_MACH
 34 |         return mach_absolute_time();
 35 |         #endif
 36 | 
 37 |         #ifdef BENCH_GTOD
 38 |         time_t v;
 39 |         gettimeofday(&v, NULL);
 40 |         return v;
 41 |         #endif
 42 |         
 43 |         #ifdef BENCH_QPC
 44 |         LARGE_INTEGER v;
 45 |         QueryPerformanceCounter(&v);
 46 |         return v;
 47 |         #endif
 48 |         // If none of the BENCH_* macros is defined, no return statement is compiled in.
 49 |     }
 50 |     
 51 |     // Converts the interval between two now() timestamps to seconds as a double.
 52 |     double diffTime(time_t start, time_t end) {
 53 |         // Whole seconds plus the microsecond remainder scaled to seconds.
 54 |         #ifdef BENCH_GTOD
 55 |         return (end.tv_sec - start.tv_sec) + (end.tv_usec - start.tv_usec) / 1000000.0;
 56 |         #endif
 57 |         // Ticks scaled by the numer/denom read in init(), then divided by 1e9.
 58 |         #ifdef BENCH_MACH        
 59 |         return ((end-start) * info.numer / info.denom) / 1000000000.0;
 60 |         #endif
 61 |         // Counter ticks divided by the frequency cached in init().
 62 |         #ifdef BENCH_QPC
 63 |         return (end.QuadPart - start.QuadPart) / frequency;
 64 |         #endif
 65 |     }
 66 |     
 67 | }
 68 | 
 69 | // Formats d (seconds) with a unit of s/ms/us/ns chosen from relative; a negative relative means "choose from d".
 70 | std::string formatTime(double d, double relative ) {
 71 |     const double scale_ref = (relative < 0.0) ? d : relative;
 72 |     double unit = 0.000000001;
 73 |     const char* suffix = "ns";
 74 |     if( scale_ref >= 1.0 )           { unit = 1.0;      suffix = "s";  }
 75 |     else if( scale_ref >= 0.001 )    { unit = 0.001;    suffix = "ms"; }
 76 |     else if( scale_ref >= 0.000001 ) { unit = 0.000001; suffix = "us"; }
 77 | 
 78 |     // Dividing by 1.0 is exact, so the seconds case prints d unchanged.
 79 |     std::stringstream ss;
 80 |     ss << d / unit << suffix;
 81 |     return ss.str();
 82 | }
 83 | // Calls func iterations times and prints the total, per-iteration and per-element elapsed time.
 84 | void profile(const char* name, void (*func)(), int iterations, int elements) {
 85 | 
 86 |     profiler::init();
 87 |     const profiler::time_t begin = profiler::now();
 88 |     for(int pass = 0; pass < iterations; ++pass) func();
 89 |     const profiler::time_t finish = profiler::now();
 90 | 
 91 |     // One elapsed value feeds all three report lines.
 92 |     const double total = profiler::diffTime(begin, finish);
 93 |     const double per_iter = total / iterations;
 94 | 
 95 |     std::cout << "Using simd: " << VECTORIAL_SIMD_TYPE << std::endl;
 96 |     std::cout << "Testing: " << name << std::endl;
 97 |     std::cout << "Duration " << formatTime(total) << std::endl;
 98 |     std::cout << "Per iter " << formatTime(per_iter) << std::endl;
 99 |     std::cout << "Per item " << formatTime(per_iter / elements) << std::endl;
100 | 
101 | }
102 | 
103 | void add_bench();
104 | void dot_bench();
105 | void quad_bench();
106 | void matrix_bench();
107 | // Entry point: runs matrix_bench() only; the other benchmark calls are commented out.
108 | int main() {
109 |     
110 | //    add_bench();
111 | //    dot_bench();
112 | //    quad_bench();
113 |     matrix_bench();
114 | 
115 |     return 0;
116 | }
117 | 
118 | 


--------------------------------------------------------------------------------
/include/vectorial/simd2f_neon.h:
--------------------------------------------------------------------------------
  1 | /*
  2 |   Vectorial
  3 |   Copyright (c) 2010 Mikko Lehtonen
  4 |   Copyright (c) 2014 Google, Inc.
  5 |   Licensed under the terms of the two-clause BSD License (see LICENSE)
  6 | */
  7 | #ifndef VECTORIAL_SIMD2F_NEON_H
  8 | #define VECTORIAL_SIMD2F_NEON_H
  9 | 
 10 | #include <arm_neon.h>
 11 | 
 12 | #ifdef __cplusplus
 13 | extern "C" {
 14 | #endif
 15 | 
 16 | 
 17 | typedef float32x2_t simd2f;
 18 | 
 19 | typedef union {
 20 |     simd2f s ;
 21 |     float f[2];
 22 | } _simd2f_union;
 23 | 
 24 | 
 25 | // Builds a vector with lanes (x, y) by loading them from a temporary array.
 26 | vectorial_inline simd2f simd2f_create(float x, float y) {
 27 |     const float32_t lanes[2] = { x, y };
 28 |     return vld1_f32(lanes);
 29 | }
 30 | 
 31 | // Vector with both lanes set to 0.0f.
 32 | vectorial_inline simd2f simd2f_zero() { return vdup_n_f32(0.0f); }
 33 | 
 34 | // Loads two consecutive floats starting at ary.
 35 | vectorial_inline simd2f simd2f_uload2(const float *ary) {
 36 |     return vld1_f32((const float32_t*)ary);
 37 | }
 38 | 
 39 | // Stores both lanes of val to ary[0] and ary[1].
 40 | vectorial_inline void simd2f_ustore2(const simd2f val, float *ary) {
 41 |     vst1_f32((float32_t*)ary, val);
 42 | }
 43 | 
 44 | // Broadcasts the scalar v into both lanes.
 45 | vectorial_inline simd2f simd2f_splat(float v) {
 46 |     return vdup_n_f32(v);
 47 | }
 48 | // Broadcasts lane 0 (x) of v into both lanes.
 49 | vectorial_inline simd2f simd2f_splat_x(simd2f v) {
 50 |     return vdup_lane_f32(v, 0);
 51 | }
 52 | 
 53 | // Broadcasts lane 1 (y) of v into both lanes.
 54 | vectorial_inline simd2f simd2f_splat_y(simd2f v) {
 55 |     return vdup_lane_f32(v, 1);
 56 | }
 57 | 
 58 | // Approximate 1/v: the vrecpe_f32 estimate refined by two vrecps_f32 steps.
 59 | vectorial_inline simd2f simd2f_reciprocal(simd2f v) {
 60 |     const simd2f e0 = vrecpe_f32(v);
 61 |     const simd2f e1 = vmul_f32(vrecps_f32(e0, v), e0);
 62 |     const simd2f e2 = vmul_f32(vrecps_f32(e1, v), e1);
 63 |     return e2;
 64 | }
 65 | 
 66 | vectorial_inline void simd2f_rsqrt_1iteration(const simd2f& v, simd2f& estimate) {
 67 |     simd2f estimate2 = vmul_f32(estimate, v);
 68 |     estimate = vmul_f32(estimate, vrsqrts_f32(estimate2, estimate));
 69 | }
 70 | 
 71 | vectorial_inline simd2f simd2f_rsqrt1(simd2f v) {
 72 |     simd2f estimate = vrsqrte_f32(v);
 73 |     simd2f_rsqrt_1iteration(v, estimate);
 74 |     return estimate;
 75 | }
 76 | 
 77 | vectorial_inline simd2f simd2f_rsqrt2(simd2f v) {
 78 |     simd2f estimate = vrsqrte_f32(v);
 79 |     simd2f_rsqrt_1iteration(v, estimate);
 80 |     simd2f_rsqrt_1iteration(v, estimate);
 81 |     return estimate;
 82 | }
 83 | 
 84 | vectorial_inline simd2f simd2f_rsqrt3(simd2f v) {
 85 |     simd2f estimate = vrsqrte_f32(v);
 86 |     simd2f_rsqrt_1iteration(v, estimate);
 87 |     simd2f_rsqrt_1iteration(v, estimate);
 88 |     simd2f_rsqrt_1iteration(v, estimate);
 89 |     return estimate;
 90 | }
 91 | 
 92 | // http://en.wikipedia.org/wiki/Fast_inverse_square_root makes the argument for
 93 | // one iteration, but two gives a significant accuracy improvement.
 94 | vectorial_inline simd2f simd2f_rsqrt(simd2f v) {
 95 |     return simd2f_rsqrt2(v);
 96 | }
 97 | 
 98 | // sqrt(v) computed as 1 / rsqrt(v). A lane whose bit pattern is all zero
 99 | // (+0.0f) is forced to zero instead of taking the reciprocal's result.
100 | vectorial_inline simd2f simd2f_sqrt(simd2f v) {
101 |     const uint32x2_t bits = vreinterpret_u32_f32(v);
102 |     const uint32x2_t nonzero = vtst_u32(bits, bits);
103 |     const simd2f root = simd2f_reciprocal(simd2f_rsqrt(v));
104 | 
105 |     // Keep root only in the lanes where the input had any bit set.
106 |     return vreinterpret_f32_u32(vand_u32(nonzero, vreinterpret_u32_f32(root)));
107 | }
108 | 
109 | // arithmetics
110 | 
111 | // Lane-wise lhs + rhs.
112 | vectorial_inline simd2f simd2f_add(simd2f lhs, simd2f rhs) {
113 |     return vadd_f32(lhs, rhs);
114 | }
115 | 
116 | // Lane-wise lhs - rhs.
117 | vectorial_inline simd2f simd2f_sub(simd2f lhs, simd2f rhs) {
118 |     return vsub_f32(lhs, rhs);
119 | }
120 | 
121 | // Lane-wise lhs * rhs.
122 | vectorial_inline simd2f simd2f_mul(simd2f lhs, simd2f rhs) {
123 |     return vmul_f32(lhs, rhs);
124 | }
125 | 
126 | // Lane-wise lhs / rhs, computed as lhs * simd2f_reciprocal(rhs).
127 | vectorial_inline simd2f simd2f_div(simd2f lhs, simd2f rhs) {
128 |     return vmul_f32(lhs, simd2f_reciprocal(rhs));
129 | }
130 | 
131 | // Multiply-add: a + m1 * m2 per lane (vmla_f32 takes the addend first).
132 | vectorial_inline simd2f simd2f_madd(simd2f m1, simd2f m2, simd2f a) {
133 |     return vmla_f32(a, m1, m2);
134 | }
135 | // Single-lane accessors: lane 0 is x, lane 1 is y.
136 | vectorial_inline float simd2f_get_x(simd2f s) { return vget_lane_f32(s, 0); }
137 | vectorial_inline float simd2f_get_y(simd2f s) { return vget_lane_f32(s, 1); }
138 | // Dot product: lane-wise multiply, then a pairwise add of the product with itself.
139 | vectorial_inline simd2f simd2f_dot2(simd2f lhs, simd2f rhs) {
140 |     const simd2f m = simd2f_mul(lhs, rhs);
141 |     return vpadd_f32(m, m);
142 | }
143 | // Lane-wise minimum of a and b (vmin_f32).
144 | vectorial_inline simd2f simd2f_min(simd2f a, simd2f b) {
145 |     return vmin_f32( a, b );
146 | }
147 | // Lane-wise maximum of a and b (vmax_f32).
148 | vectorial_inline simd2f simd2f_max(simd2f a, simd2f b) {
149 |     return vmax_f32( a, b );
150 | }
151 | 
152 | 
153 | #ifdef __cplusplus
154 | }
155 | #endif
156 | 
157 | 
158 | #endif
159 | 
160 | 


--------------------------------------------------------------------------------
/include/vectorial/simd4f_scalar.h:
--------------------------------------------------------------------------------
  1 | /*
  2 |   Vectorial
  3 |   Copyright (c) 2010 Mikko Lehtonen
  4 |   Licensed under the terms of the two-clause BSD License (see LICENSE)
  5 | */
  6 | #ifndef VECTORIAL_SIMD4F_SCALAR_H
  7 | #define VECTORIAL_SIMD4F_SCALAR_H
  8 | 
  9 | #include <math.h>
 10 | #include <string.h>  // memcpy
 11 | 
 12 | #ifdef __cplusplus
 13 | extern "C" {
 14 | #endif
 15 | 
 16 | 
 17 | typedef struct { 
 18 |     float x;
 19 |     float y; 
 20 |     float z; 
 21 |     float w;
 22 | } simd4f;
 23 | 
 24 | 
 25 | // Builds a vector from four explicit lane values.
 26 | vectorial_inline simd4f simd4f_create(float x, float y, float z, float w) {
 27 |     simd4f out;
 28 |     out.x = x; out.y = y; out.z = z; out.w = w;
 29 |     return out;
 30 | }
 31 | 
 32 | // Vector with all four lanes set to 0.0f.
 33 | vectorial_inline simd4f simd4f_zero() { return simd4f_create(0.0f, 0.0f, 0.0f, 0.0f); }
 34 | 
 35 | // Loads ary[0..3] into x, y, z, w.
 36 | vectorial_inline simd4f simd4f_uload4(const float *ary) {
 37 |     return simd4f_create(ary[0], ary[1], ary[2], ary[3]);
 38 | }
 39 | // Loads ary[0..2] into x, y, z and sets w to 0.
 40 | vectorial_inline simd4f simd4f_uload3(const float *ary) {
 41 |     return simd4f_create(ary[0], ary[1], ary[2], 0);
 42 | }
 43 | // Loads ary[0..1] into x, y and sets z and w to 0.
 44 | vectorial_inline simd4f simd4f_uload2(const float *ary) {
 45 |     return simd4f_create(ary[0], ary[1], 0, 0);
 46 | }
 47 | 
 48 | // Copies the first four floats of val (x, y, z, w) to ary[0..3].
 49 | vectorial_inline void simd4f_ustore4(const simd4f val, float *ary) {
 50 |     memcpy(ary, &val, sizeof(float) * 4);
 51 | }
 52 | // Copies the first three floats of val (x, y, z) to ary[0..2].
 53 | vectorial_inline void simd4f_ustore3(const simd4f val, float *ary) {
 54 |     memcpy(ary, &val, sizeof(float) * 3);
 55 | }
 56 | // Copies the first two floats of val (x, y) to ary[0..1].
 57 | vectorial_inline void simd4f_ustore2(const simd4f val, float *ary) {
 58 |     memcpy(ary, &val, sizeof(float) * 2);
 59 | }
 60 | 
 61 | 
 62 | 
 63 | // utilities
 64 | // Broadcasts the scalar v into all four lanes.
 65 | vectorial_inline simd4f simd4f_splat(float v) {
 66 |     return simd4f_create(v, v, v, v);
 67 | }
 68 | 
 69 | // Broadcasts lane x of v into all four lanes.
 70 | vectorial_inline simd4f simd4f_splat_x(simd4f v) {
 71 |     return simd4f_create(v.x, v.x, v.x, v.x);
 72 | }
 73 | 
 74 | // Broadcasts lane y of v into all four lanes.
 75 | vectorial_inline simd4f simd4f_splat_y(simd4f v) {
 76 |     return simd4f_create(v.y, v.y, v.y, v.y);
 77 | }
 78 | 
 79 | // Broadcasts lane z of v into all four lanes.
 80 | vectorial_inline simd4f simd4f_splat_z(simd4f v) {
 81 |     return simd4f_create(v.z, v.z, v.z, v.z);
 82 | }
 83 | 
 84 | // Broadcasts lane w of v into all four lanes.
 85 | vectorial_inline simd4f simd4f_splat_w(simd4f v) {
 86 |     return simd4f_create(v.w, v.w, v.w, v.w);
 87 | }
 88 | 
 89 | // Lane-wise 1/v using float division.
 90 | vectorial_inline simd4f simd4f_reciprocal(simd4f v) {
 91 |     return simd4f_create(1.0f/v.x, 1.0f/v.y, 1.0f/v.z, 1.0f/v.w);
 92 | }
 93 | 
 94 | // Lane-wise square root via sqrtf.
 95 | vectorial_inline simd4f simd4f_sqrt(simd4f v) {
 96 |     return simd4f_create(sqrtf(v.x), sqrtf(v.y), sqrtf(v.z), sqrtf(v.w));
 97 | }
 98 | 
 99 | // Lane-wise reciprocal square root, computed as 1.0f / sqrtf(lane).
100 | vectorial_inline simd4f simd4f_rsqrt(simd4f v) {
101 |     return simd4f_create(1.0f/sqrtf(v.x), 1.0f/sqrtf(v.y), 1.0f/sqrtf(v.z), 1.0f/sqrtf(v.w));
102 | }
103 | 
104 | 
105 | // arithmetic
106 | 
107 | // Lane-wise lhs + rhs.
108 | vectorial_inline simd4f simd4f_add(simd4f lhs, simd4f rhs) {
109 |     return simd4f_create(lhs.x + rhs.x, lhs.y + rhs.y, lhs.z + rhs.z, lhs.w + rhs.w);
110 | }
111 | 
112 | // Lane-wise lhs - rhs.
113 | vectorial_inline simd4f simd4f_sub(simd4f lhs, simd4f rhs) {
114 |     return simd4f_create(lhs.x - rhs.x, lhs.y - rhs.y, lhs.z - rhs.z, lhs.w - rhs.w);
115 | }
116 | 
117 | // Lane-wise lhs * rhs.
118 | vectorial_inline simd4f simd4f_mul(simd4f lhs, simd4f rhs) {
119 |     return simd4f_create(lhs.x * rhs.x, lhs.y * rhs.y, lhs.z * rhs.z, lhs.w * rhs.w);
120 | }
121 | 
122 | // Lane-wise lhs / rhs.
123 | vectorial_inline simd4f simd4f_div(simd4f lhs, simd4f rhs) {
124 |     return simd4f_create(lhs.x / rhs.x, lhs.y / rhs.y, lhs.z / rhs.z, lhs.w / rhs.w);
125 | }
126 | // Multiply-add: m1 * m2 + a per lane, as a separate multiply and add.
127 | vectorial_inline simd4f simd4f_madd(simd4f m1, simd4f m2, simd4f a) {
128 |     const simd4f product = simd4f_mul(m1, m2);
129 |     return simd4f_add(product, a); }
130 | // Dot product of the x, y, z lanes as a plain float; w does not contribute.
131 | vectorial_inline float simd4f_dot3_scalar(simd4f lhs, simd4f rhs) {
132 |     return lhs.x * rhs.x + lhs.y * rhs.y + lhs.z * rhs.z;
133 | }
134 | // Same dot product broadcast to all four lanes via simd4f_splat.
135 | vectorial_inline simd4f simd4f_dot3(simd4f lhs, simd4f rhs) {
136 |     return simd4f_splat( simd4f_dot3_scalar(lhs, rhs) );
137 | }
138 | // Cross product of the x, y, z lanes (lhs x rhs); the w lane of the result is 0.
139 | vectorial_inline simd4f simd4f_cross3(simd4f lhs, simd4f rhs) {
140 |     return simd4f_create( lhs.y * rhs.z - lhs.z * rhs.y,
141 |                           lhs.z * rhs.x - lhs.x * rhs.z,
142 |                           lhs.x * rhs.y - lhs.y * rhs.x, 0);
143 | }
144 | 
145 | // Single-lane accessors.
146 | vectorial_inline float simd4f_get_x(simd4f s) { return s.x; }
147 | vectorial_inline float simd4f_get_y(simd4f s) { return s.y; }
148 | vectorial_inline float simd4f_get_z(simd4f s) { return s.z; }
149 | vectorial_inline float simd4f_get_w(simd4f s) { return s.w; }
150 | 
151 | // Lane rotations: the suffix lists the source lane of each result lane, e.g. wxyz -> (w, x, y, z).
152 | vectorial_inline simd4f simd4f_shuffle_wxyz(simd4f s) { return simd4f_create(s.w, s.x, s.y, s.z); }
153 | vectorial_inline simd4f simd4f_shuffle_zwxy(simd4f s) { return simd4f_create(s.z, s.w, s.x, s.y); }
154 | vectorial_inline simd4f simd4f_shuffle_yzwx(simd4f s) { return simd4f_create(s.y, s.z, s.w, s.x); }
155 | 
156 | // Copy of s with the w lane set to 0.
157 | vectorial_inline simd4f simd4f_zero_w(simd4f s) {
158 |     return simd4f_create(s.x, s.y, s.z, 0.0f);
159 | }
160 | // Copy of s with the z and w lanes set to 0.
161 | vectorial_inline simd4f simd4f_zero_zw(simd4f s) {
162 |     return simd4f_create(s.x, s.y, 0.0f, 0.0f);
163 | }
164 | 
165 | // Returns (abcd.z, abcd.w, xyzw.z, xyzw.w): the last two lanes of each input.
166 | vectorial_inline simd4f simd4f_merge_high(simd4f abcd, simd4f xyzw) { 
167 |     return simd4f_create(abcd.z, abcd.w, xyzw.z, xyzw.w);
168 | }
169 | // Negates the y and w lanes; x and z are passed through.
170 | vectorial_inline simd4f simd4f_flip_sign_0101(simd4f s) {
171 |     return simd4f_create(s.x, -s.y, s.z, -s.w);
172 | }
173 | // Negates the x and z lanes; y and w are passed through.
174 | vectorial_inline simd4f simd4f_flip_sign_1010(simd4f s) {
175 |     return simd4f_create(-s.x, s.y, -s.z, s.w);
176 | }
177 | // Lane-wise minimum; a lane takes b's value whenever a < b is false (equal or NaN operands included).
178 | vectorial_inline simd4f simd4f_min(simd4f a, simd4f b) {
179 |     return simd4f_create( a.x < b.x ? a.x : b.x, 
180 |                           a.y < b.y ? a.y : b.y, 
181 |                           a.z < b.z ? a.z : b.z, 
182 |                           a.w < b.w ? a.w : b.w );
183 | }
184 | // Lane-wise maximum; a lane takes b's value whenever a > b is false (equal or NaN operands included).
185 | vectorial_inline simd4f simd4f_max(simd4f a, simd4f b) {
186 |     return simd4f_create( a.x > b.x ? a.x : b.x, 
187 |                           a.y > b.y ? a.y : b.y, 
188 |                           a.z > b.z ? a.z : b.z, 
189 |                           a.w > b.w ? a.w : b.w );
190 | }
191 | 
192 | 
193 | #ifdef __cplusplus
194 | }
195 | #endif
196 | 
197 | 
198 | #endif
199 | 
200 | 


--------------------------------------------------------------------------------
/include/vectorial/vec2f.h:
--------------------------------------------------------------------------------
  1 | /*
  2 |   Vectorial
  3 |   Copyright (c) 2010 Mikko Lehtonen
  4 |   Licensed under the terms of the two-clause BSD License (see LICENSE)
  5 | */
  6 | #ifndef VECTORIAL_VEC2F_H
  7 | #define VECTORIAL_VEC2F_H  /* completes the include guard so repeated inclusion is a no-op */
  8 | #ifndef VECTORIAL_SIMD4F_H
  9 |   #include "vectorial/simd4f.h"
 10 | #endif
 11 | 
 12 | 
 13 | 
 14 | namespace vectorial {
 15 |     
 16 |     class vec4f;
 17 |     class vec3f;
 18 | 
 19 |     class vec2f {
 20 |     public:
 21 |         // Two-component vector stored in a four-lane simd4f; x and y are lanes 0 and 1.
 22 |         simd4f value;
 23 |         // The default constructor leaves value uninitialised.
 24 |         inline vec2f() {}
 25 |         inline vec2f(const vec2f& v) : value(v.value) {}
 26 |         inline vec2f(const simd4f& v) : value(v) {}
 27 |         explicit inline vec2f(float xy) : value( simd4f_splat(xy) ) {}
 28 |         inline vec2f(float x, float y) : value( simd4f_create(x,y,0,0) ) {}
 29 |         explicit inline vec2f(const float *ary) : value( simd4f_uload2(ary) ) { }
 30 |         // Component accessors.
 31 |         inline float x() const { return simd4f_get_x(value); }
 32 |         inline float y() const { return simd4f_get_y(value); }
 33 |         // load/store go through simd4f_uload2 / simd4f_ustore2 on ary.
 34 |         inline void load(const float *ary) { value = simd4f_uload2(ary); }
 35 |         inline void store(float *ary) const { simd4f_ustore2(value, ary); }
 36 |         // Number of logical components.
 37 |         enum { elements = 2 };
 38 |         // Named constants; one() uses the splat constructor, so every lane of value is 1.
 39 |         static vec2f zero() { return vec2f(simd4f_zero()); }
 40 |         static vec2f one() { return vec2f(1.0f); }
 41 |         static vec2f xAxis() { return vec2f(1.0f, 0.0f); }
 42 |         static vec2f yAxis() { return vec2f(0.0f, 1.0f); }
 43 |         // Conversions to other vector types; declared here only, defined outside this class.
 44 |         inline vec4f xyzw(float z, float w) const;
 45 |         inline vec4f xy00() const;
 46 |         inline vec4f xy01() const;
 47 |         inline vec3f xyz(float z) const;
 48 |         inline vec3f xy0() const;
 49 |         inline vec2f xy() const;
 50 | 
 51 |     };
 52 |     // Unary minus, computed as 0 - lhs on the underlying simd4f.
 53 |     vectorial_inline vec2f operator-(const vec2f& lhs) {
 54 |         return vec2f( simd4f_sub(simd4f_zero(), lhs.value) );
 55 |     }
 56 | 
 57 |     // Lane-wise arithmetic between two vectors; each operator returns a new vec2f.
 58 |     vectorial_inline vec2f operator+(const vec2f& lhs, const vec2f& rhs) {
 59 |         return vec2f( simd4f_add(lhs.value, rhs.value) );
 60 |     }
 61 | 
 62 |     vectorial_inline vec2f operator-(const vec2f& lhs, const vec2f& rhs) {
 63 |         return vec2f( simd4f_sub(lhs.value, rhs.value) );
 64 |     }
 65 | 
 66 |     vectorial_inline vec2f operator*(const vec2f& lhs, const vec2f& rhs) {
 67 |         return vec2f( simd4f_mul(lhs.value, rhs.value) );
 68 |     }
 69 | 
 70 |     vectorial_inline vec2f operator/(const vec2f& lhs, const vec2f& rhs) {
 71 |         return vec2f( simd4f_div(lhs.value, rhs.value) );
 72 |     }
 73 | 
 74 |     // vec2f op= vec2f: updates lhs in place and returns a reference to lhs rather than a copy.
 75 |     vectorial_inline vec2f& operator+=(vec2f& lhs, const vec2f& rhs) {
 76 |         return lhs = vec2f( simd4f_add(lhs.value, rhs.value) );
 77 |     }
 78 | 
 79 |     vectorial_inline vec2f& operator-=(vec2f& lhs, const vec2f& rhs) {
 80 |         return lhs = vec2f( simd4f_sub(lhs.value, rhs.value) );
 81 |     }
 82 | 
 83 |     vectorial_inline vec2f& operator*=(vec2f& lhs, const vec2f& rhs) {
 84 |         return lhs = vec2f( simd4f_mul(lhs.value, rhs.value) );
 85 |     }
 86 | 
 87 |     vectorial_inline vec2f& operator/=(vec2f& lhs, const vec2f& rhs) {
 88 |         return lhs = vec2f( simd4f_div(lhs.value, rhs.value) );
 89 |     }
 90 | 
 91 | 
 92 |     // vec2f op float: rhs is broadcast to every lane with simd4f_splat before the lane-wise op.
 93 |     vectorial_inline vec2f operator+(const vec2f& lhs, float rhs) {
 94 |         return vec2f( simd4f_add(lhs.value, simd4f_splat(rhs)) );
 95 |     }
 96 | 
 97 |     vectorial_inline vec2f operator-(const vec2f& lhs, float rhs) {
 98 |         return vec2f( simd4f_sub(lhs.value, simd4f_splat(rhs)) );
 99 |     }
100 | 
101 |     vectorial_inline vec2f operator*(const vec2f& lhs, float rhs) {
102 |         return vec2f( simd4f_mul(lhs.value, simd4f_splat(rhs)) );
103 |     }
104 | 
105 |     vectorial_inline vec2f operator/(const vec2f& lhs, float rhs) {
106 |         return vec2f( simd4f_div(lhs.value, simd4f_splat(rhs)) );
107 |     }
108 |     // float op vec2f: the scalar is broadcast and used as the left operand.
109 |     vectorial_inline vec2f operator+(float lhs, const vec2f& rhs) {
110 |         return vec2f( simd4f_add(simd4f_splat(lhs), rhs.value) );
111 |     }
112 | 
113 |     vectorial_inline vec2f operator-(float lhs, const vec2f& rhs) {
114 |         return vec2f( simd4f_sub(simd4f_splat(lhs), rhs.value) );
115 |     }
116 | 
117 |     vectorial_inline vec2f operator*(float lhs, const vec2f& rhs) {
118 |         return vec2f( simd4f_mul(simd4f_splat(lhs), rhs.value) );
119 |     }
120 | 
121 |     vectorial_inline vec2f operator/(float lhs, const vec2f& rhs) {
122 |         return vec2f( simd4f_div(simd4f_splat(lhs), rhs.value) );
123 |     }
124 | 
125 |     // vec2f op= float: rhs is broadcast with simd4f_splat; lhs is updated in place and returned by reference.
126 |     vectorial_inline vec2f& operator+=(vec2f& lhs, float rhs) {
127 |         return lhs = vec2f( simd4f_add(lhs.value, simd4f_splat(rhs)) );
128 |     }
129 | 
130 |     vectorial_inline vec2f& operator-=(vec2f& lhs, float rhs) {
131 |         return lhs = vec2f( simd4f_sub(lhs.value, simd4f_splat(rhs)) );
132 |     }
133 | 
134 |     vectorial_inline vec2f& operator*=(vec2f& lhs, float rhs) {
135 |         return lhs = vec2f( simd4f_mul(lhs.value, simd4f_splat(rhs)) );
136 |     }
137 | 
138 |     vectorial_inline vec2f& operator/=(vec2f& lhs, float rhs) {
139 |         return lhs = vec2f( simd4f_div(lhs.value, simd4f_splat(rhs)) );
140 |     }
141 | 
142 |     // Dot product of the x and y components, returned as a float.
143 |     vectorial_inline float dot(const vec2f& lhs, const vec2f& rhs) {
144 |         return simd4f_get_x( simd4f_dot2(lhs.value, rhs.value) );
145 |     }
146 | 
147 |     // Length of v over its x and y components.
148 |     vectorial_inline float length(const vec2f& v) {
149 |         return simd4f_get_x( simd4f_length2(v.value) );
150 |     }
151 |     // Squared length of v over its x and y components.
152 |     vectorial_inline float length_squared(const vec2f& v) {
153 |         return simd4f_get_x( simd4f_length2_squared(v.value) );
154 |     }
155 |     // v normalized via simd4f_normalize2.
156 |     vectorial_inline vec2f normalize(const vec2f& v) {
157 |         return vec2f( simd4f_normalize2(v.value) );
158 |     }
159 |     // Lane-wise minimum of a and b.
160 |     vectorial_inline vec2f min(const vec2f& a, const vec2f& b) {
161 |         return vec2f( simd4f_min(a.value, b.value) );
162 |     }
163 |     // Lane-wise maximum of a and b.
164 |     vectorial_inline vec2f max(const vec2f& a, const vec2f& b) {
165 |         return vec2f( simd4f_max(a.value, b.value) );
166 |     }
167 | 
168 | 
169 | }
170 | 
171 | // Forwards std::min/std::max on vec2f to the vectorial versions. NOTE(review): adding overloads to namespace std is not permitted by the C++ standard - confirm before relying on it.
172 | namespace std {
173 |     inline ::vectorial::vec2f min(const ::vectorial::vec2f& a, const ::vectorial::vec2f& b) { return ::vectorial::min(a,b); }
174 |     inline ::vectorial::vec2f max(const ::vectorial::vec2f& a, const ::vectorial::vec2f& b) { return ::vectorial::max(a,b); }
175 | }
176 | 
177 | // Optional stream output, compiled only when VECTORIAL_OSTREAM is defined.
178 | #ifdef VECTORIAL_OSTREAM
179 | #include <ostream>
180 | 
181 | // Writes v as "[ x, y ]" and returns the stream for chaining.
182 | vectorial_inline std::ostream& operator<<(std::ostream& os, const vectorial::vec2f& v) {
183 |     const float vx = v.x(), vy = v.y();
184 |     return os << "[ " << vx << ", " << vy << " ]";
185 | }
186 | #endif
187 | 
188 | 
189 | 
190 | 
191 | #endif
192 | 


--------------------------------------------------------------------------------
/include/vectorial/mat4f.h:
--------------------------------------------------------------------------------
  1 | /*
  2 |   Vectorial
  3 |   Copyright (c) 2010 Mikko Lehtonen
  4 |   Licensed under the terms of the two-clause BSD License (see LICENSE)
  5 | */
  6 | #ifndef VECTORIAL_MAT4F_H
  7 | #define VECTORIAL_MAT4F_H
  8 | 
  9 | #ifndef VECTORIAL_SIMD4X4F_H
 10 |   #include "vectorial/simd4x4f.h"
 11 | #endif
 12 | 
 13 | #ifndef VECTORIAL_VEC4F_H
 14 |   #include "vectorial/vec4f.h"
 15 | #endif
 16 | 
 17 | 
 18 | namespace vectorial {
 19 |     
 20 | 
 21 |     class mat4f {
 22 |     public:
 23 |         // Wrapped SIMD matrix; its members x, y, z, w are simd4f values.
 24 |         simd4x4f value;
 25 |         // The default constructor leaves value uninitialised.
 26 |         inline mat4f() {}
 27 |         inline mat4f(const mat4f& m) : value(m.value) {}
 28 |         inline mat4f(const simd4x4f& v) : value(v) {}
 29 |         inline mat4f(const vec4f& v0, const vec4f& v1, const vec4f& v2, const vec4f& v3) : value(simd4x4f_create(v0.value, v1.value, v2.value, v3.value)) {}
 30 |         explicit inline mat4f(const float *ary) { simd4x4f_uload(&value, ary); }
 31 |         // Reads 16 consecutive floats: ary[0..3] -> x, [4..7] -> y, [8..11] -> z, [12..15] -> w.
 32 |         inline void load(const float *ary) { 
 33 |             value.x = simd4f_uload4(ary);
 34 |             value.y = simd4f_uload4(ary+4); 
 35 |             value.z = simd4f_uload4(ary+8); 
 36 |             value.w = simd4f_uload4(ary+12); 
 37 |         }
 38 |         // Writes 16 floats to ary in the same x, y, z, w order.
 39 |         inline void store(float *ary) const { 
 40 |             simd4f_ustore4(value.x, ary);
 41 |             simd4f_ustore4(value.y, ary+4);
 42 |             simd4f_ustore4(value.z, ary+8);
 43 |             simd4f_ustore4(value.w, ary+12);
 44 |         }
 45 |         // Factory helpers: each one fills a matrix through the matching simd4x4f_* routine.
 46 |         static mat4f identity() { mat4f m; simd4x4f_identity(&m.value); return m; }
 47 | 
 48 |         static mat4f perspective(float fovy, float aspect, float znear, float zfar) {
 49 |             simd4x4f m;
 50 |             simd4x4f_perspective(&m, fovy, aspect, znear, zfar);
 51 |             return m;
 52 |         }
 53 |         
 54 |         static mat4f ortho(float left, float right, float bottom, float top, float znear, float zfar) {
 55 |             simd4x4f m;
 56 |             simd4x4f_ortho(&m, left, right, bottom, top, znear, zfar);
 57 |             return m;
 58 |         }
 59 |         
 60 |         static mat4f lookAt(const vec3f& eye, const vec3f& center, const vec3f& up) {
 61 |             simd4x4f m;
 62 |             simd4x4f_lookat(&m, eye.value, center.value, up.value);
 63 |             return m;            
 64 |         }
 65 | 
 66 |         static mat4f translation(const vec3f& pos) {
 67 |             simd4x4f m;
 68 |             simd4x4f_translation(&m, pos.x(), pos.y(), pos.z());
 69 |             return m;            
 70 |         }
 71 | 
 72 |         static mat4f axisRotation(float angle, const vec3f& axis) {
 73 |             simd4x4f m;
 74 |             simd4x4f_axis_rotation(&m, angle, axis.value);
 75 |             return m;            
 76 |         }
 77 |         // Uniform scale: scale on the first three diagonal entries, 1 on the fourth.
 78 |         static mat4f scale(float scale) {
 79 |             return simd4x4f_create( simd4f_create(scale,0,0,0),
 80 |                                     simd4f_create(0,scale,0,0),
 81 |                                     simd4f_create(0,0,scale,0),
 82 |                                     simd4f_create(0,0,0,1) );
 83 |         }
 84 |         // Per-axis scale: scale.x(), scale.y(), scale.z() on the diagonal, 1 on the fourth entry.
 85 |         static mat4f scale(const vec3f& scale) {
 86 |             return simd4x4f_create( simd4f_create(scale.x(),0,0,0),
 87 |                                    simd4f_create(0,scale.y(),0,0),
 88 |                                    simd4f_create(0,0,scale.z(),0),
 89 |                                    simd4f_create(0,0,0,1) );
 90 |         }
 91 | 
 92 |     };
 93 |     
 94 |     
 95 |     // Matrix product: passes lhs and rhs to simd4x4f_matrix_mul and returns its output.
 96 |     vectorial_inline mat4f operator*(const mat4f& lhs, const mat4f& rhs) {
 97 |         simd4x4f product;
 98 |         simd4x4f_matrix_mul(&lhs.value, &rhs.value, &product);
 99 |         return mat4f(product);
100 |     }
100 | 
101 |     // In-place matrix product: lhs = lhs * rhs.
102 |     // Returns a reference to lhs, the conventional result of a compound
103 |     // assignment, instead of a copy of the whole matrix.
104 |     vectorial_inline mat4f& operator*=(mat4f& lhs, const mat4f& rhs) {
105 |         // The left operand is copied first because the output is written into lhs.value.
106 |         const simd4x4f tmp = lhs.value;
107 |         simd4x4f_matrix_mul(&tmp, &rhs.value, &lhs.value);
108 |         return lhs;
109 |     }
106 | 
107 | 
108 |     // Matrix * 4-vector, computed by simd4x4f_matrix_vector_mul.
109 |     vectorial_inline vec4f operator*(const mat4f& lhs, const vec4f& rhs) {
110 |         simd4f out;
111 |         simd4x4f_matrix_vector_mul(&lhs.value, &rhs.value, &out);
112 |         return vec4f(out);
113 |     }
113 | 
114 |     // Transforms the 3-vector rhs by lhs using simd4x4f_matrix_vector3_mul.
115 |     // NOTE(review): presumably a direction transform that ignores translation - confirm in simd4x4f.h.
116 |     vectorial_inline vec3f transformVector(const mat4f& lhs, const vec3f& rhs) {
117 |         simd4f out;
118 |         simd4x4f_matrix_vector3_mul(&lhs.value, &rhs.value, &out);
119 |         return vec3f(out);
120 |     }
119 | 
120 |     // Transforms the 4-vector rhs by lhs; same simd4x4f_matrix_vector_mul call as operator*(mat4f, vec4f).
121 |     vectorial_inline vec4f transformVector(const mat4f& lhs, const vec4f& rhs) {
122 |         simd4f out;
123 |         simd4x4f_matrix_vector_mul(&lhs.value, &rhs.value, &out);
124 |         return vec4f(out);
125 |     }
125 |     
126 |     // Transforms the point rhs by lhs using simd4x4f_matrix_point3_mul.
127 |     // NOTE(review): presumably treats rhs as a position (w = 1) - confirm in simd4x4f.h.
128 |     vectorial_inline vec3f transformPoint(const mat4f& lhs, const vec3f& rhs) {
129 |         simd4f out;
130 |         simd4x4f_matrix_point3_mul(&lhs.value, &rhs.value, &out);
131 |         return vec3f(out);
132 |     }
131 | 
132 |     // Applies simd4x4f_inv_ortho_matrix_point3_mul to the point rhs.
133 |     // NOTE(review): presumably the inverse transform for an orthonormal lhs - confirm in simd4x4f.h.
134 |     vectorial_inline vec3f orthoInverseTransformPoint(const mat4f& lhs, const vec3f& rhs) {
135 |         simd4f out;
136 |         simd4x4f_inv_ortho_matrix_point3_mul(&lhs.value, &rhs.value, &out);
137 |         return vec3f(out);
138 |     }
137 | 
138 |     // Applies simd4x4f_inv_ortho_matrix_vector3_mul to the vector rhs.
139 |     // NOTE(review): presumably the inverse transform for an orthonormal lhs - confirm in simd4x4f.h.
140 |     vectorial_inline vec3f orthoInverseTransformVector(const mat4f& lhs, const vec3f& rhs) {
141 |         simd4f out;
142 |         simd4x4f_inv_ortho_matrix_vector3_mul(&lhs.value, &rhs.value, &out);
143 |         return vec3f(out);
144 |     }
143 | 
144 |     
145 |     // Returns the result of simd4x4f_transpose applied to m; m itself is not modified.
146 |     vectorial_inline mat4f transpose(const mat4f& m) {
147 |         simd4x4f flipped;
148 |         simd4x4f_transpose(&m.value, &flipped);
149 |         return mat4f(flipped);
150 |     }
150 | 
151 | 
152 |     // Returns the result of simd4x4f_inverse applied to m; m itself is not modified.
153 |     // NOTE(review): behaviour for a singular m is decided by simd4x4f_inverse - confirm there.
154 |     vectorial_inline mat4f inverse(const mat4f& m) {
155 |         simd4x4f inverted;
156 |         simd4x4f_inverse(&m.value, &inverted);
157 |         return mat4f(inverted);
158 |     }
157 | 
158 | 
159 | 
160 | }
161 | 
162 | 
163 | 
164 | #ifdef VECTORIAL_OSTREAM
165 | // Included here so this header does not depend on another header having
166 | // pulled in <ostream> first.
167 | #include <ostream>
168 | 
169 | // Streams the matrix as "[ a, b, c, d ; e, f, g, h ; ... ]".
170 | // Each ';'-separated group holds one lane (x first, then y, z, w) taken from
171 | // value.x, value.y, value.z and value.w in that order.
172 | vectorial_inline std::ostream& operator<<(std::ostream& os, const vectorial::mat4f& v) {
173 |     const simd4x4f& m = v.value;
174 | 
175 |     os << "[ ";
176 |     os << simd4f_get_x(m.x) << ", " << simd4f_get_x(m.y) << ", "
177 |        << simd4f_get_x(m.z) << ", " << simd4f_get_x(m.w) << " ; ";
178 | 
179 |     os << simd4f_get_y(m.x) << ", " << simd4f_get_y(m.y) << ", "
180 |        << simd4f_get_y(m.z) << ", " << simd4f_get_y(m.w) << " ; ";
181 | 
182 |     os << simd4f_get_z(m.x) << ", " << simd4f_get_z(m.y) << ", "
183 |        << simd4f_get_z(m.z) << ", " << simd4f_get_z(m.w) << " ; ";
184 | 
185 |     os << simd4f_get_w(m.x) << ", " << simd4f_get_w(m.y) << ", "
186 |        << simd4f_get_w(m.z) << ", " << simd4f_get_w(m.w) << " ]";
187 | 
188 |     return os;
189 | }
190 | #endif
193 | 
194 | 
195 | 
196 | 
197 | #endif
198 | 


--------------------------------------------------------------------------------
/include/vectorial/vec4f.h:
--------------------------------------------------------------------------------
  1 | /*
  2 |   Vectorial
  3 |   Copyright (c) 2010 Mikko Lehtonen
  4 |   Licensed under the terms of the two-clause BSD License (see LICENSE)
  5 | */
  6 | #ifndef VECTORIAL_VEC4F_H
  7 | #define VECTORIAL_VEC4F_H
  8 | 
  9 | #ifndef VECTORIAL_SIMD4F_H
 10 |   #include "vectorial/simd4f.h"
 11 | #endif
 12 | 
 13 | 
 14 | 
 15 | namespace vectorial {
 16 |     
 17 |     class vec3f;
 18 |     class vec2f;
 19 | 
 20 |     // Four-component float vector: a thin C++ wrapper around a simd4f.
 21 |     class vec4f {
 22 |     public:
 23 | 
 24 |         // Underlying SIMD storage, public so the free functions can operate on it.
 25 |         simd4f value;
 26 | 
 27 |         // The default constructor performs no explicit initialisation of value.
 28 |         inline vec4f() {}
 29 |         inline vec4f(const vec4f& v) : value(v.value) {}
 30 |         // Implicit conversion from the raw SIMD type.
 31 |         inline vec4f(const simd4f& v) : value(v) {}
 32 |         // Fills the vector through simd4f_splat(xyzw).
 33 |         explicit inline vec4f(float xyzw) : value(simd4f_splat(xyzw)) {}
 34 |         inline vec4f(float x, float y, float z, float w) : value(simd4f_create(x, y, z, w)) {}
 35 |         // Reads the vector from ary through simd4f_uload4.
 36 |         explicit inline vec4f(const float *ary) : value(simd4f_uload4(ary)) {}
 37 | 
 38 |         // Component accessors; each returns a copy of one lane.
 39 |         inline float x() const { return simd4f_get_x(value); }
 40 |         inline float y() const { return simd4f_get_y(value); }
 41 |         inline float z() const { return simd4f_get_z(value); }
 42 |         inline float w() const { return simd4f_get_w(value); }
 43 | 
 44 |         // Array round trip through simd4f_uload4 / simd4f_ustore4.
 45 |         inline void load(const float *ary) { value = simd4f_uload4(ary); }
 46 |         inline void store(float *ary) const { simd4f_ustore4(value, ary); }
 47 | 
 48 |         // Number of components exposed by this type.
 49 |         enum { elements = 4 };
 50 | 
 51 |         // Constant vectors.
 52 |         static vec4f zero()  { return vec4f(simd4f_zero()); }
 53 |         static vec4f one()   { return vec4f(simd4f_splat(1.0f)); }
 54 |         static vec4f xAxis() { return vec4f(simd4f_create(1.0f, 0.0f, 0.0f, 0.0f)); }
 55 |         static vec4f yAxis() { return vec4f(simd4f_create(0.0f, 1.0f, 0.0f, 0.0f)); }
 56 |         static vec4f zAxis() { return vec4f(simd4f_create(0.0f, 0.0f, 1.0f, 0.0f)); }
 57 |         static vec4f wAxis() { return vec4f(simd4f_create(0.0f, 0.0f, 0.0f, 1.0f)); }
 58 | 
 59 |         // Conversions to the smaller vector types; only declared here.
 60 |         inline vec3f xyz() const;
 61 |         inline vec2f xy() const;
 62 | 
 63 |     };
 55 | 
 56 | 
 57 |     // Negation, computed as (zero vector - lhs).
 58 |     vectorial_inline vec4f operator-(const vec4f& lhs) {
 59 |         const simd4f negated = simd4f_sub(simd4f_zero(), lhs.value);
 60 |         return vec4f(negated);
 61 |     }
 60 | 
 61 | 
 62 |     // Component-wise sum.
 63 |     vectorial_inline vec4f operator+(const vec4f& lhs, const vec4f& rhs) {
 64 |         const simd4f sum = simd4f_add(lhs.value, rhs.value);
 65 |         return vec4f(sum);
 66 |     }
 67 | 
 68 |     // Component-wise difference.
 69 |     vectorial_inline vec4f operator-(const vec4f& lhs, const vec4f& rhs) {
 70 |         const simd4f difference = simd4f_sub(lhs.value, rhs.value);
 71 |         return vec4f(difference);
 72 |     }
 73 | 
 74 |     // Component-wise product (not a dot product).
 75 |     vectorial_inline vec4f operator*(const vec4f& lhs, const vec4f& rhs) {
 76 |         const simd4f product = simd4f_mul(lhs.value, rhs.value);
 77 |         return vec4f(product);
 78 |     }
 79 | 
 80 |     // Component-wise quotient.
 81 |     vectorial_inline vec4f operator/(const vec4f& lhs, const vec4f& rhs) {
 82 |         const simd4f quotient = simd4f_div(lhs.value, rhs.value);
 83 |         return vec4f(quotient);
 84 |     }
 76 |     }
 77 | 
 78 | 
 79 |     // Compound assignment with a vector operand. Each operator updates lhs in
 80 |     // place, component by component, and returns a reference to lhs (the
 81 |     // conventional result of a compound assignment) rather than a copy.
 82 | 
 83 |     // lhs = lhs + rhs
 84 |     vectorial_inline vec4f& operator+=(vec4f& lhs, const vec4f& rhs) {
 85 |         lhs.value = simd4f_add(lhs.value, rhs.value);
 86 |         return lhs;
 87 |     }
 88 | 
 89 |     // lhs = lhs - rhs
 90 |     vectorial_inline vec4f& operator-=(vec4f& lhs, const vec4f& rhs) {
 91 |         lhs.value = simd4f_sub(lhs.value, rhs.value);
 92 |         return lhs;
 93 |     }
 94 | 
 95 |     // lhs = lhs * rhs
 96 |     vectorial_inline vec4f& operator*=(vec4f& lhs, const vec4f& rhs) {
 97 |         lhs.value = simd4f_mul(lhs.value, rhs.value);
 98 |         return lhs;
 99 |     }
100 | 
101 |     // lhs = lhs / rhs
102 |     vectorial_inline vec4f& operator/=(vec4f& lhs, const vec4f& rhs) {
103 |         lhs.value = simd4f_div(lhs.value, rhs.value);
104 |         return lhs;
105 |     }
 94 | 
 95 | 
 96 | 
 97 |     // Vector (op) scalar: the scalar is broadcast with simd4f_splat and then
 98 |     // combined with lhs component by component.
 99 | 
100 |     // Adds rhs to every component.
101 |     vectorial_inline vec4f operator+(const vec4f& lhs, float rhs) {
102 |         const simd4f broadcast = simd4f_splat(rhs);
103 |         return vec4f(simd4f_add(lhs.value, broadcast));
104 |     }
105 | 
106 |     // Subtracts rhs from every component.
107 |     vectorial_inline vec4f operator-(const vec4f& lhs, float rhs) {
108 |         const simd4f broadcast = simd4f_splat(rhs);
109 |         return vec4f(simd4f_sub(lhs.value, broadcast));
110 |     }
111 | 
112 |     // Multiplies every component by rhs.
113 |     vectorial_inline vec4f operator*(const vec4f& lhs, float rhs) {
114 |         const simd4f broadcast = simd4f_splat(rhs);
115 |         return vec4f(simd4f_mul(lhs.value, broadcast));
116 |     }
117 | 
118 |     // Divides every component by rhs.
119 |     vectorial_inline vec4f operator/(const vec4f& lhs, float rhs) {
120 |         const simd4f broadcast = simd4f_splat(rhs);
121 |         return vec4f(simd4f_div(lhs.value, broadcast));
122 |     }
112 | 
113 |     // Scalar (op) vector: the scalar is broadcast with simd4f_splat and used
114 |     // as the left operand of the component-wise operation.
115 | 
116 |     // lhs + each component of rhs.
117 |     vectorial_inline vec4f operator+(float lhs, const vec4f& rhs) {
118 |         const simd4f broadcast = simd4f_splat(lhs);
119 |         return vec4f(simd4f_add(broadcast, rhs.value));
120 |     }
121 | 
122 |     // lhs - each component of rhs.
123 |     vectorial_inline vec4f operator-(float lhs, const vec4f& rhs) {
124 |         const simd4f broadcast = simd4f_splat(lhs);
125 |         return vec4f(simd4f_sub(broadcast, rhs.value));
126 |     }
127 | 
128 |     // lhs * each component of rhs.
129 |     vectorial_inline vec4f operator*(float lhs, const vec4f& rhs) {
130 |         const simd4f broadcast = simd4f_splat(lhs);
131 |         return vec4f(simd4f_mul(broadcast, rhs.value));
132 |     }
133 | 
134 |     // lhs / each component of rhs.
135 |     vectorial_inline vec4f operator/(float lhs, const vec4f& rhs) {
136 |         const simd4f broadcast = simd4f_splat(lhs);
137 |         return vec4f(simd4f_div(broadcast, rhs.value));
138 |     }
128 | 
129 | 
130 |     // Compound assignment with a scalar operand. The scalar is broadcast with
131 |     // simd4f_splat, lhs is updated in place, and a reference to lhs is returned
132 |     // (the conventional result of a compound assignment) rather than a copy.
133 | 
134 |     // lhs = lhs + rhs
135 |     vectorial_inline vec4f& operator+=(vec4f& lhs, float rhs) {
136 |         lhs.value = simd4f_add(lhs.value, simd4f_splat(rhs));
137 |         return lhs;
138 |     }
139 | 
140 |     // lhs = lhs - rhs
141 |     vectorial_inline vec4f& operator-=(vec4f& lhs, float rhs) {
142 |         lhs.value = simd4f_sub(lhs.value, simd4f_splat(rhs));
143 |         return lhs;
144 |     }
145 | 
146 |     // lhs = lhs * rhs
147 |     vectorial_inline vec4f& operator*=(vec4f& lhs, float rhs) {
148 |         lhs.value = simd4f_mul(lhs.value, simd4f_splat(rhs));
149 |         return lhs;
150 |     }
151 | 
152 |     // lhs = lhs / rhs
153 |     vectorial_inline vec4f& operator/=(vec4f& lhs, float rhs) {
154 |         lhs.value = simd4f_div(lhs.value, simd4f_splat(rhs));
155 |         return lhs;
156 |     }
145 | 
146 | 
147 |     // Dot product over four components; the scalar is read from the x lane of simd4f_dot4's result.
148 |     vectorial_inline float dot(const vec4f& lhs, const vec4f& rhs) {
149 |         const simd4f d = simd4f_dot4(lhs.value, rhs.value);
150 |         return simd4f_get_x(d);
151 |     }
152 | 
153 | 
154 |     // Length of v, read from the x lane of simd4f_length4's result.
155 |     vectorial_inline float length(const vec4f& v) {
156 |         const simd4f len = simd4f_length4(v.value);
157 |         return simd4f_get_x(len);
158 |     }
159 | 
160 |     // Squared length of v, read from the x lane of simd4f_length4_squared's result.
161 |     vectorial_inline float length_squared(const vec4f& v) {
162 |         const simd4f lenSq = simd4f_length4_squared(v.value);
163 |         return simd4f_get_x(lenSq);
164 |     }
165 | 
166 |     // Result of simd4f_normalize4 applied to v.
167 |     // NOTE(review): behaviour for a zero-length v is decided by simd4f_normalize4 - confirm there.
168 |     vectorial_inline vec4f normalize(const vec4f& v) {
169 |         const simd4f unit = simd4f_normalize4(v.value);
170 |         return vec4f(unit);
171 |     }
163 | 
164 |     // Component-wise minimum of a and b (simd4f_min).
165 |     vectorial_inline vec4f min(const vec4f& a, const vec4f& b) {
166 |         const simd4f lowest = simd4f_min(a.value, b.value);
167 |         return vec4f(lowest);
168 |     }
169 | 
170 |     // Component-wise maximum of a and b (simd4f_max).
171 |     vectorial_inline vec4f max(const vec4f& a, const vec4f& b) {
172 |         const simd4f highest = simd4f_max(a.value, b.value);
173 |         return vec4f(highest);
174 |     }
171 | 
172 | 
173 | }
174 | 
175 | 
176 | // Overloads that make std::min / std::max, when called with two vec4f,
177 | // forward to the component-wise vectorial::min / vectorial::max.
178 | // NOTE(review): the C++ standard does not allow programs to add function
179 | // overloads to namespace std; kept because callers may spell std::min(a, b).
180 | namespace std {
181 |     inline ::vectorial::vec4f min(const ::vectorial::vec4f& a, const ::vectorial::vec4f& b)
182 |     {
183 |         return ::vectorial::min(a, b);
184 |     }
185 |     inline ::vectorial::vec4f max(const ::vectorial::vec4f& a, const ::vectorial::vec4f& b)
186 |     {
187 |         return ::vectorial::max(a, b);
188 |     }
189 | }
180 | 
181 | 
182 | #ifdef VECTORIAL_OSTREAM
183 | #include <ostream>
184 | 
185 | // Streams v as "[ x, y, z, w ]"; only compiled when VECTORIAL_OSTREAM is defined.
186 | vectorial_inline std::ostream& operator<<(std::ostream& os, const vectorial::vec4f& v) {
187 |     os << "[ ";
188 |     os << v.x() << ", ";
189 |     os << v.y() << ", ";
190 |     os << v.z() << ", ";
191 |     os << v.w() << " ]";
192 |     return os;
193 | }
194 | #endif
193 | 
194 | 
195 | #endif
196 | 


--------------------------------------------------------------------------------
/include/vectorial/vec3f.h:
--------------------------------------------------------------------------------
  1 | /*
  2 |   Vectorial
  3 |   Copyright (c) 2010 Mikko Lehtonen
  4 |   Copyright (c) 2014 Google, Inc.
  5 |   Licensed under the terms of the two-clause BSD License (see LICENSE)
  6 | */
  7 | #ifndef VECTORIAL_VEC3F_H
  8 | #define VECTORIAL_VEC3F_H  // completes the include guard so a repeated #include is skipped
  9 | #ifndef VECTORIAL_SIMD4F_H
 10 |   #include "vectorial/simd4f.h"
 11 | #endif
 12 | 
 13 | 
 14 | 
 15 | namespace vectorial {
 16 |     
 17 |     class vec4f;
 18 |     class vec2f;
 19 | 
 20 |     // Three-component float vector stored in a four-lane simd4f.
 21 |     // Only the x, y and z lanes are exposed through this interface.
 22 |     class vec3f {
 23 |     public:
 24 | 
 25 |         // Underlying SIMD storage, public so the free functions can operate on it.
 26 |         simd4f value;
 27 | 
 28 |         // The default constructor performs no explicit initialisation of value.
 29 |         inline vec3f() {}
 30 |         inline vec3f(const vec3f& v) : value(v.value) {}
 31 |         // Implicit conversion from the raw SIMD type.
 32 |         inline vec3f(const simd4f& v) : value(v) {}
 33 |         // Fills the vector through simd4f_splat(xyz).
 34 |         explicit inline vec3f(float xyz) : value(simd4f_splat(xyz)) {}
 35 |         // The fourth lane is set to 0.
 36 |         inline vec3f(float x, float y, float z) : value(simd4f_create(x, y, z, 0)) {}
 37 |         // Reads the vector from ary through simd4f_uload3.
 38 |         explicit inline vec3f(const float *ary) : value(simd4f_uload3(ary)) {}
 39 | 
 40 |         // Component accessors; each returns a copy of one lane.
 41 |         inline float x() const { return simd4f_get_x(value); }
 42 |         inline float y() const { return simd4f_get_y(value); }
 43 |         inline float z() const { return simd4f_get_z(value); }
 44 | 
 45 |         // Array round trip through simd4f_uload3 / simd4f_ustore3.
 46 |         inline void load(const float *ary) { value = simd4f_uload3(ary); }
 47 |         inline void store(float *ary) const { simd4f_ustore3(value, ary); }
 48 | 
 49 |         // Number of components exposed by this type.
 50 |         enum { elements = 3 };
 51 | 
 52 |         // Constant vectors.
 53 |         static vec3f zero()  { return vec3f(simd4f_zero()); }
 54 |         static vec3f one()   { return vec3f(simd4f_splat(1.0f)); }
 55 |         static vec3f xAxis() { return vec3f(simd4f_create(1.0f, 0.0f, 0.0f, 0)); }
 56 |         static vec3f yAxis() { return vec3f(simd4f_create(0.0f, 1.0f, 0.0f, 0)); }
 57 |         static vec3f zAxis() { return vec3f(simd4f_create(0.0f, 0.0f, 1.0f, 0)); }
 58 | 
 59 |         // Conversions to other vector types and swizzles; only declared here.
 60 |         inline vec4f xyz0() const;
 61 |         inline vec4f xyz1() const;
 62 |         inline vec4f xyzw(float w) const;
 63 |         inline vec3f xyz() const;
 64 |         inline vec3f xy0() const;
 65 |         inline vec2f xy() const;
 66 |     };
 54 | 
 55 |     // Negation, computed as (zero vector - lhs).
 56 |     vectorial_inline vec3f operator-(const vec3f& lhs) {
 57 |         const simd4f negated = simd4f_sub(simd4f_zero(), lhs.value);
 58 |         return vec3f(negated);
 59 |     }
 58 |     
 59 | 
 60 |     // Vector (op) vector. The operation runs on the whole simd4f, so the
 61 |     // fourth lane is processed too even though vec3f does not expose it.
 62 | 
 63 |     // Component-wise sum.
 64 |     vectorial_inline vec3f operator+(const vec3f& lhs, const vec3f& rhs) {
 65 |         const simd4f sum = simd4f_add(lhs.value, rhs.value);
 66 |         return vec3f(sum);
 67 |     }
 68 | 
 69 |     // Component-wise difference.
 70 |     vectorial_inline vec3f operator-(const vec3f& lhs, const vec3f& rhs) {
 71 |         const simd4f difference = simd4f_sub(lhs.value, rhs.value);
 72 |         return vec3f(difference);
 73 |     }
 74 | 
 75 |     // Component-wise product (not a dot or cross product).
 76 |     vectorial_inline vec3f operator*(const vec3f& lhs, const vec3f& rhs) {
 77 |         const simd4f product = simd4f_mul(lhs.value, rhs.value);
 78 |         return vec3f(product);
 79 |     }
 80 | 
 81 |     // Component-wise quotient.
 82 |     vectorial_inline vec3f operator/(const vec3f& lhs, const vec3f& rhs) {
 83 |         const simd4f quotient = simd4f_div(lhs.value, rhs.value);
 84 |         return vec3f(quotient);
 85 |     }
 75 | 
 76 | 
 77 |     // Compound assignment with a vector operand. Each operator updates lhs in
 78 |     // place, component by component, and returns a reference to lhs (the
 79 |     // conventional result of a compound assignment) rather than a copy.
 80 | 
 81 |     // lhs = lhs + rhs
 82 |     vectorial_inline vec3f& operator+=(vec3f& lhs, const vec3f& rhs) {
 83 |         lhs.value = simd4f_add(lhs.value, rhs.value);
 84 |         return lhs;
 85 |     }
 86 | 
 87 |     // lhs = lhs - rhs
 88 |     vectorial_inline vec3f& operator-=(vec3f& lhs, const vec3f& rhs) {
 89 |         lhs.value = simd4f_sub(lhs.value, rhs.value);
 90 |         return lhs;
 91 |     }
 92 | 
 93 |     // lhs = lhs * rhs
 94 |     vectorial_inline vec3f& operator*=(vec3f& lhs, const vec3f& rhs) {
 95 |         lhs.value = simd4f_mul(lhs.value, rhs.value);
 96 |         return lhs;
 97 |     }
 98 | 
 99 |     // lhs = lhs / rhs
100 |     vectorial_inline vec3f& operator/=(vec3f& lhs, const vec3f& rhs) {
101 |         lhs.value = simd4f_div(lhs.value, rhs.value);
102 |         return lhs;
103 |     }
 92 | 
 93 | 
 94 | 
 95 |     // Vector (op) scalar: the scalar is broadcast with simd4f_splat and then
 96 |     // combined with lhs component by component.
 97 | 
 98 |     // Adds rhs to every component.
 99 |     vectorial_inline vec3f operator+(const vec3f& lhs, float rhs) {
100 |         const simd4f broadcast = simd4f_splat(rhs);
101 |         return vec3f(simd4f_add(lhs.value, broadcast));
102 |     }
103 | 
104 |     // Subtracts rhs from every component.
105 |     vectorial_inline vec3f operator-(const vec3f& lhs, float rhs) {
106 |         const simd4f broadcast = simd4f_splat(rhs);
107 |         return vec3f(simd4f_sub(lhs.value, broadcast));
108 |     }
109 | 
110 |     // Multiplies every component by rhs.
111 |     vectorial_inline vec3f operator*(const vec3f& lhs, float rhs) {
112 |         const simd4f broadcast = simd4f_splat(rhs);
113 |         return vec3f(simd4f_mul(lhs.value, broadcast));
114 |     }
115 | 
116 |     // Divides every component by rhs.
117 |     vectorial_inline vec3f operator/(const vec3f& lhs, float rhs) {
118 |         const simd4f broadcast = simd4f_splat(rhs);
119 |         return vec3f(simd4f_div(lhs.value, broadcast));
120 |     }
110 | 
111 |     // Scalar (op) vector: the scalar is broadcast with simd4f_splat and used
112 |     // as the left operand of the component-wise operation.
113 | 
114 |     // lhs + each component of rhs.
115 |     vectorial_inline vec3f operator+(float lhs, const vec3f& rhs) {
116 |         const simd4f broadcast = simd4f_splat(lhs);
117 |         return vec3f(simd4f_add(broadcast, rhs.value));
118 |     }
119 | 
120 |     // lhs - each component of rhs.
121 |     vectorial_inline vec3f operator-(float lhs, const vec3f& rhs) {
122 |         const simd4f broadcast = simd4f_splat(lhs);
123 |         return vec3f(simd4f_sub(broadcast, rhs.value));
124 |     }
125 | 
126 |     // lhs * each component of rhs.
127 |     vectorial_inline vec3f operator*(float lhs, const vec3f& rhs) {
128 |         const simd4f broadcast = simd4f_splat(lhs);
129 |         return vec3f(simd4f_mul(broadcast, rhs.value));
130 |     }
131 | 
132 |     // lhs / each component of rhs.
133 |     vectorial_inline vec3f operator/(float lhs, const vec3f& rhs) {
134 |         const simd4f broadcast = simd4f_splat(lhs);
135 |         return vec3f(simd4f_div(broadcast, rhs.value));
136 |     }
126 | 
127 | 
128 |     // Compound assignment with a scalar operand. The scalar is broadcast with
129 |     // simd4f_splat, lhs is updated in place, and a reference to lhs is returned
130 |     // (the conventional result of a compound assignment) rather than a copy.
131 | 
132 |     // lhs = lhs + rhs
133 |     vectorial_inline vec3f& operator+=(vec3f& lhs, float rhs) {
134 |         lhs.value = simd4f_add(lhs.value, simd4f_splat(rhs));
135 |         return lhs;
136 |     }
137 | 
138 |     // lhs = lhs - rhs
139 |     vectorial_inline vec3f& operator-=(vec3f& lhs, float rhs) {
140 |         lhs.value = simd4f_sub(lhs.value, simd4f_splat(rhs));
141 |         return lhs;
142 |     }
143 | 
144 |     // lhs = lhs * rhs
145 |     vectorial_inline vec3f& operator*=(vec3f& lhs, float rhs) {
146 |         lhs.value = simd4f_mul(lhs.value, simd4f_splat(rhs));
147 |         return lhs;
148 |     }
149 | 
150 |     // lhs = lhs / rhs
151 |     vectorial_inline vec3f& operator/=(vec3f& lhs, float rhs) {
152 |         lhs.value = simd4f_div(lhs.value, simd4f_splat(rhs));
153 |         return lhs;
154 |     }
143 | 
144 | 
145 |     // Three-component dot product, computed by simd4f_dot3_scalar.
146 |     vectorial_inline float dot(const vec3f& lhs, const vec3f& rhs) {
147 |         const float result = simd4f_dot3_scalar(lhs.value, rhs.value);
148 |         return result;
149 |     }
150 | 
151 |     // Cross product lhs x rhs, computed by simd4f_cross3.
152 |     vectorial_inline vec3f cross(const vec3f& lhs, const vec3f& rhs) {
153 |         const simd4f product = simd4f_cross3(lhs.value, rhs.value);
154 |         return vec3f(product);
155 |     }
152 |     
153 |     
154 |     // Length of v, read from the x lane of simd4f_length3's result.
155 |     vectorial_inline float length(const vec3f& v) {
156 |         const simd4f len = simd4f_length3(v.value);
157 |         return simd4f_get_x(len);
158 |     }
159 | 
160 |     // Squared length of v, read from the x lane of simd4f_length3_squared's result.
161 |     vectorial_inline float length_squared(const vec3f& v) {
162 |         const simd4f lenSq = simd4f_length3_squared(v.value);
163 |         return simd4f_get_x(lenSq);
164 |     }
165 | 
166 |     // Result of simd4f_normalize3 applied to v.
167 |     // NOTE(review): behaviour for a zero-length v is decided by simd4f_normalize3 - confirm there.
168 |     vectorial_inline vec3f normalize(const vec3f& v) {
169 |         const simd4f unit = simd4f_normalize3(v.value);
170 |         return vec3f(unit);
171 |     }
165 | 
166 |     // Component-wise minimum of a and b (simd4f_min).
167 |     vectorial_inline vec3f min(const vec3f& a, const vec3f& b) {
168 |         const simd4f lowest = simd4f_min(a.value, b.value);
169 |         return vec3f(lowest);
170 |     }
171 | 
172 |     // Component-wise maximum of a and b (simd4f_max).
173 |     vectorial_inline vec3f max(const vec3f& a, const vec3f& b) {
174 |         const simd4f highest = simd4f_max(a.value, b.value);
175 |         return vec3f(highest);
176 |     }
173 | 
174 | }
175 | 
176 | 
177 | // Overloads that make std::min / std::max, when called with two vec3f,
178 | // forward to the component-wise vectorial::min / vectorial::max.
179 | // NOTE(review): the C++ standard does not allow programs to add function
180 | // overloads to namespace std; kept because callers may spell std::min(a, b).
181 | namespace std {
182 |     inline ::vectorial::vec3f min(const ::vectorial::vec3f& a, const ::vectorial::vec3f& b)
183 |     {
184 |         return ::vectorial::min(a, b);
185 |     }
186 |     inline ::vectorial::vec3f max(const ::vectorial::vec3f& a, const ::vectorial::vec3f& b)
187 |     {
188 |         return ::vectorial::max(a, b);
189 |     }
190 | }
181 | 
182 | 
183 | #ifdef VECTORIAL_OSTREAM
184 | #include <ostream>
185 | 
186 | // Streams v as "[ x, y, z ]"; only compiled when VECTORIAL_OSTREAM is defined.
187 | vectorial_inline std::ostream& operator<<(std::ostream& os, const vectorial::vec3f& v) {
188 |     os << "[ ";
189 |     os << v.x() << ", ";
190 |     os << v.y() << ", ";
191 |     os << v.z() << " ]";
192 |     return os;
193 | }
194 | #endif
193 | 
194 | 
195 | 
196 | 
197 | #endif
198 | 


--------------------------------------------------------------------------------
/include/vectorial/simd4f_gnu.h:
--------------------------------------------------------------------------------
  1 | /*
  2 |   Vectorial
  3 |   Copyright (c) 2010 Mikko Lehtonen
  4 |   Licensed under the terms of the two-clause BSD License (see LICENSE)
  5 | */
  6 | #ifndef VECTORIAL_SIMD4F_GNU_H
  7 | #define VECTORIAL_SIMD4F_GNU_H
  8 | 
  9 | #include <math.h>
 10 | #include <string.h>  // memcpy
 11 | 
 12 | 
 13 | #ifdef __cplusplus
 14 | extern "C" {
 15 | #endif
 16 | 
 17 | 
 18 | typedef float simd4f __attribute__ ((vector_size (16)));  // GCC vector extension: a 16-byte vector of float
 19 | 
 20 | typedef union {  // overlays a simd4f with a float array so that single lanes can be read
 21 |     simd4f s ;
 22 |     float f[4];
 23 | } _simd4f_union;
 24 | 
 25 | // Lane accessors: copy the vector into the union and read one float back (x = lane 0 ... w = lane 3).
 26 | vectorial_inline float simd4f_get_x(simd4f s) { _simd4f_union lanes; lanes.s = s; return lanes.f[0]; }
 27 | vectorial_inline float simd4f_get_y(simd4f s) { _simd4f_union lanes; lanes.s = s; return lanes.f[1]; }
 28 | vectorial_inline float simd4f_get_z(simd4f s) { _simd4f_union lanes; lanes.s = s; return lanes.f[2]; }
 29 | vectorial_inline float simd4f_get_w(simd4f s) { _simd4f_union lanes; lanes.s = s; return lanes.f[3]; }
 29 | 
 30 | 
 31 | // Packs four floats into a vector, in the order x, y, z, w.
 32 | vectorial_inline simd4f simd4f_create(float x, float y, float z, float w) {
 33 |     const simd4f packed = { x, y, z, w };
 34 |     return packed;
 35 | }
 35 | 
 36 | // Returns a vector with all four lanes set to 0.0f.
 37 | // Declared with (void) so that C translation units see a real prototype; in C
 38 | // an empty parameter list leaves the parameters unspecified.
 39 | vectorial_inline simd4f simd4f_zero(void) { return simd4f_create(0.0f, 0.0f, 0.0f, 0.0f); }
 37 | 
 38 | // Loads from a float array, element by element. The 4/3/2 variants read that
 39 | // many floats from ary and set the remaining lanes to 0.
 40 | 
 41 | vectorial_inline simd4f simd4f_uload4(const float *ary) {
 42 |     return simd4f_create(ary[0], ary[1], ary[2], ary[3]);
 43 | }
 44 | 
 45 | vectorial_inline simd4f simd4f_uload3(const float *ary) {
 46 |     return simd4f_create(ary[0], ary[1], ary[2], 0);
 47 | }
 48 | 
 49 | vectorial_inline simd4f simd4f_uload2(const float *ary) {
 50 |     return simd4f_create(ary[0], ary[1], 0, 0);
 51 | }
 52 | 
 53 | 
 54 | // Stores to a float array with memcpy. The 4/3/2 variants write only that
 55 | // many leading lanes of val and leave the rest of ary untouched.
 56 | 
 57 | vectorial_inline void simd4f_ustore4(const simd4f val, float *ary) {
 58 |     memcpy(ary, &val, 4 * sizeof(float));
 59 | }
 60 | 
 61 | vectorial_inline void simd4f_ustore3(const simd4f val, float *ary) {
 62 |     memcpy(ary, &val, 3 * sizeof(float));
 63 | }
 64 | 
 65 | vectorial_inline void simd4f_ustore2(const simd4f val, float *ary) {
 66 |     memcpy(ary, &val, 2 * sizeof(float));
 67 | }
 65 | 
 66 | 
 67 | // Returns a vector whose four lanes all hold v.
 68 | vectorial_inline simd4f simd4f_splat(float v) {
 69 |     return simd4f_create(v, v, v, v);
 70 | }
 71 | 
 72 | // Broadcast one lane of v (x, y, z or w) into all four lanes of the result.
 73 | 
 74 | vectorial_inline simd4f simd4f_splat_x(simd4f v) {
 75 |     return simd4f_splat(simd4f_get_x(v));
 76 | }
 77 | 
 78 | vectorial_inline simd4f simd4f_splat_y(simd4f v) {
 79 |     return simd4f_splat(simd4f_get_y(v));
 80 | }
 81 | 
 82 | vectorial_inline simd4f simd4f_splat_z(simd4f v) {
 83 |     return simd4f_splat(simd4f_get_z(v));
 84 | }
 85 | 
 86 | vectorial_inline simd4f simd4f_splat_w(simd4f v) {
 87 |     return simd4f_splat(simd4f_get_w(v));
 88 | }
 95 | 
 96 | // Per-lane 1.0f / v, computed with a full-precision vector division.
 97 | vectorial_inline simd4f simd4f_reciprocal(simd4f v) {
 98 |     const simd4f ones = simd4f_splat(1.0f);
 99 |     return ones / v;
100 | }
 99 | 
100 | // Per-lane square root, calling sqrtf on each lane.
101 | vectorial_inline simd4f simd4f_sqrt(simd4f v) {
102 |     return simd4f_create( sqrtf(simd4f_get_x(v)),
103 |                           sqrtf(simd4f_get_y(v)),
104 |                           sqrtf(simd4f_get_z(v)),
105 |                           sqrtf(simd4f_get_w(v)) );
106 | }
104 | 
105 | // Per-lane 1.0f / sqrt(v).
106 | vectorial_inline simd4f simd4f_rsqrt(simd4f v) {
107 |     const simd4f ones = simd4f_splat(1.0f);
108 |     const simd4f roots = simd4f_sqrt(v);
109 |     return ones / roots;
110 | }
108 | 
109 | 
110 | 
111 | // Lane-wise arithmetic, using the operators GCC provides for vector types.
112 | 
113 | vectorial_inline simd4f simd4f_add(simd4f lhs, simd4f rhs) {
114 |     return lhs + rhs;
115 | }
116 | 
117 | vectorial_inline simd4f simd4f_sub(simd4f lhs, simd4f rhs) {
118 |     return lhs - rhs;
119 | }
120 | 
121 | vectorial_inline simd4f simd4f_mul(simd4f lhs, simd4f rhs) {
122 |     return lhs * rhs;
123 | }
124 | 
125 | vectorial_inline simd4f simd4f_div(simd4f lhs, simd4f rhs) {
126 |     return lhs / rhs;
127 | }
130 | 
131 | // Lane-wise multiply-add: (m1 * m2) + a.
132 | vectorial_inline simd4f simd4f_madd(simd4f m1, simd4f m2, simd4f a) {
133 |     const simd4f product = simd4f_mul(m1, m2);
134 |     return simd4f_add(product, a);
135 | }
134 | 
135 | // Dot product of the x, y and z lanes, returned as a plain float; the w lanes are ignored.
136 | vectorial_inline float simd4f_dot3_scalar(simd4f lhs, simd4f rhs) {
137 |     const float xx = simd4f_get_x(lhs) * simd4f_get_x(rhs);
138 |     const float yy = simd4f_get_y(lhs) * simd4f_get_y(rhs);
139 |     const float zz = simd4f_get_z(lhs) * simd4f_get_z(rhs);
140 |     return xx + yy + zz;
141 | }
140 | 
141 | // Three-component dot product, broadcast into all four lanes of the result.
142 | vectorial_inline simd4f simd4f_dot3(simd4f lhs, simd4f rhs) {
143 |     const float d = simd4f_dot3_scalar(lhs, rhs);
144 |     return simd4f_splat(d);
145 | }
144 | 
145 | // Cross product of the x, y, z lanes of l and r; the w lane of the result is 0.
146 | vectorial_inline simd4f simd4f_cross3(simd4f l, simd4f r) {
147 |     const float lx = simd4f_get_x(l), ly = simd4f_get_y(l), lz = simd4f_get_z(l);
148 |     const float rx = simd4f_get_x(r), ry = simd4f_get_y(r), rz = simd4f_get_z(r);
149 | 
150 |     return simd4f_create( ly * rz - lz * ry,
151 |                           lz * rx - lx * rz,
152 |                           lx * ry - ly * rx, 0);
153 | }
153 | 
154 | 
155 | // Lane rotations: the suffix lists which source lane ends up in result lanes x, y, z, w.
156 | 
157 | vectorial_inline simd4f simd4f_shuffle_wxyz(simd4f s) {
158 |     return simd4f_create(simd4f_get_w(s), simd4f_get_x(s), simd4f_get_y(s), simd4f_get_z(s));
159 | }
160 | 
161 | vectorial_inline simd4f simd4f_shuffle_zwxy(simd4f s) {
162 |     return simd4f_create(simd4f_get_z(s), simd4f_get_w(s), simd4f_get_x(s), simd4f_get_y(s));
163 | }
164 | 
165 | vectorial_inline simd4f simd4f_shuffle_yzwx(simd4f s) {
166 |     return simd4f_create(simd4f_get_y(s), simd4f_get_z(s), simd4f_get_w(s), simd4f_get_x(s));
167 | }
169 | 
170 | 
171 | // Copy of s with the w lane replaced by 0.0f.
172 | vectorial_inline simd4f simd4f_zero_w(simd4f s) {
173 |     return simd4f_create(simd4f_get_x(s), simd4f_get_y(s), simd4f_get_z(s), 0.0f);
174 | }
175 | 
176 | // Copy of s with the z and w lanes replaced by 0.0f.
177 | vectorial_inline simd4f simd4f_zero_zw(simd4f s) {
178 |     return simd4f_create(simd4f_get_x(s), simd4f_get_y(s), 0.0f, 0.0f);
179 | }
180 | 
181 | 
182 | // Combines the upper halves of both inputs: for (a,b,c,d) and (x,y,z,w) the result is (c,d,z,w).
183 | vectorial_inline simd4f simd4f_merge_high(simd4f abcd, simd4f xyzw) {
184 |     const float c = simd4f_get_z(abcd);
185 |     const float d = simd4f_get_w(abcd);
186 |     return simd4f_create(c, d, simd4f_get_z(xyzw), simd4f_get_w(xyzw));
187 | }
187 | 
188 | // Negates the y and w lanes (a 1 in the suffix marks a negated lane, in x,y,z,w order).
189 | vectorial_inline simd4f simd4f_flip_sign_0101(simd4f s) {
190 |     return simd4f_create(simd4f_get_x(s), -simd4f_get_y(s), simd4f_get_z(s), -simd4f_get_w(s));
191 | }
192 | 
193 | // Negates the x and z lanes.
194 | vectorial_inline simd4f simd4f_flip_sign_1010(simd4f s) {
195 |     return simd4f_create(-simd4f_get_x(s), simd4f_get_y(s), -simd4f_get_z(s), simd4f_get_w(s));
196 | }
197 | 
198 | 
199 | // Lane-wise minimum. A lane takes a's value only when a < b holds for that lane, otherwise b's.
200 | vectorial_inline simd4f simd4f_min(simd4f a, simd4f b) {
201 |     _simd4f_union ua = {a};
202 |     _simd4f_union ub = {b};
203 |     _simd4f_union out;
204 |     int lane;
205 |     for (lane = 0; lane < 4; ++lane) {
206 |         out.f[lane] = ua.f[lane] < ub.f[lane] ? ua.f[lane] : ub.f[lane];
207 |     }
208 |     return out.s;
209 | }
207 | 
208 | // Lane-wise maximum. A lane takes a's value only when a > b holds for that lane, otherwise b's.
209 | vectorial_inline simd4f simd4f_max(simd4f a, simd4f b) {
210 |     _simd4f_union ua = {a};
211 |     _simd4f_union ub = {b};
212 |     _simd4f_union out;
213 |     int lane;
214 |     for (lane = 0; lane < 4; ++lane) {
215 |         out.f[lane] = ua.f[lane] > ub.f[lane] ? ua.f[lane] : ub.f[lane];
216 |     }
217 |     return out.s;
218 | }
216 | 
217 | 
218 | 
219 | #ifdef __cplusplus
220 | }
221 | #endif
222 | 
223 | 
224 | #endif
225 | 
226 | 


--------------------------------------------------------------------------------
/spec/spec.cpp:
--------------------------------------------------------------------------------
  1 | /* Specific - Minimal C++ spec framework.
  2 |  
  3 | 
  4 | The zlib/libpng License
  5 | 
  6 | 
  7 | Copyright (c) 2008 Mikko Lehtonen
  8 | 
  9 | This software is provided 'as-is', without any express or implied
 10 | warranty. In no event will the authors be held liable for any damages
 11 | arising from the use of this software.
 12 | 
 13 | Permission is granted to anyone to use this software for any purpose,
 14 | including commercial applications, and to alter it and redistribute it
 15 | freely, subject to the following restrictions:
 16 | 
 17 |     1. The origin of this software must not be misrepresented; you must not
 18 |     claim that you wrote the original software. If you use this software
 19 |     in a product, an acknowledgment in the product documentation would be
 20 |     appreciated but is not required.
 21 | 
 22 |     2. Altered source versions must be plainly marked as such, and must not be
 23 |     misrepresented as being the original software.
 24 | 
 25 |     3. This notice may not be removed or altered from any source
 26 |     distribution.
 27 | */
 28 | 
 29 | 
 30 | #include "spec.h"
 31 | 
 32 | #include <iostream>
 33 | 
 34 | namespace specific {
 35 | 
 36 | 
 37 | 
 38 |     // Hook invoked when a spec group begins; the base implementation does nothing.
 39 |     void SpecWriter::startGroup(std::string /*group*/, std::string /*description*/) {}
 40 | 
 41 |     // Records one failed assertion together with the file and line it came from.
 42 |     void SpecWriter::addFailedAssertation(std::string msg, const char *file, int line) {
 43 |         const SpecFailure failure(msg, file, line);
 44 |         mFailures.push_back(failure);
 45 |     }
 46 | 
 47 |     // Records the outcome of one spec.
 48 |     void SpecWriter::addSpecResult(SpecResult r) {
 49 |         mResults.push_back(r);
 50 |     }
 51 | 
 52 |     // Hook invoked before any spec runs; the base implementation does nothing.
 53 |     void SpecWriter::start() {}
 47 |     // Prints the final report to std::cout: every recorded failure, numbered
 48 |     // from 1 with its file, line and message, then the example/failure totals.
 49 |     void SpecWriter::stop() {
 50 |         std::cout << std::endl;
 51 |         for(size_t idx = 0; idx < mFailures.size(); ++idx)
 52 |         {
 53 |             const SpecFailure& failure = mFailures[idx];
 54 |             std::cout << std::endl;
 55 |             std::cout << (idx+1) << ") Failed assertation at " << failure.file << ":"
 56 |                       << failure.line << ":" << std::endl << "  " << failure.msg << std::endl;
 57 |         }
 58 |         std::cout << std::endl
 59 |                   << mResults.size() << " examples, "
 60 |                   << mFailures.size() << " failures" << std::endl;
 61 |     }
 59 |  
 60 | 
 61 | 
 62 |     // Records the result, then prints a one-character progress marker:
 63 |     // "." for passed, "F" for failed, "E" for errored. Output is flushed at once.
 64 |     void ProgressWriter::addSpecResult(SpecResult r) {
 65 |         SpecWriter::addSpecResult(r);
 66 |         if(r.type == SpecResult::PASSED) {
 67 |             std::cout << ".";
 68 |         } else if(r.type == SpecResult::FAILED) {
 69 |             std::cout << "F";
 70 |         } else if(r.type == SpecResult::ERRORED) {
 71 |             std::cout << "E";
 72 |         }
 73 |         std::cout << std::flush;
 74 |     }
 77 | 
 78 | 
 79 | 
 80 |     // Prints the group header line as "<group>: <description>".
 81 |     void SpecdocWriter::startGroup(std::string group, std::string description) {
 82 |         std::cout << group << ": ";
 83 |         std::cout << description << std::endl;
 84 |     }
 83 | 
 84 | 
 85 |     // Records the result and prints one line per spec: "- <test>" followed by
 86 |     // a status tag. For failures and errors the tag carries the current number
 87 |     // of recorded failures, which matches the numbering printed by stop().
 88 |     void SpecdocWriter::addSpecResult(SpecResult r) {
 89 |         SpecWriter::addSpecResult(r);
 90 |         const size_t failureCount = mFailures.size();
 91 |         std::cout << "- " << r.test;
 92 |         if(r.type == SpecResult::PASSED) {
 93 |             std::cout << " [OK]";
 94 |         } else if(r.type == SpecResult::FAILED) {
 95 |             std::cout << " [FAILED - " << failureCount << "]";
 96 |         } else if(r.type == SpecResult::ERRORED) {
 97 |             std::cout << " [ERROR - "<< failureCount <<"]";
 98 |         }
 99 |         std::cout << std::endl;
100 |     }
102 | 
103 | 
104 | 
105 | 
106 |     class spec_failure {};  // empty tag type thrown by SpecBase::should_test when an assertion fails
107 | 
108 | 
109 | 
110 |     // Starts with no writer, no current spec name, all status flags cleared and
111 |     // both progress counters at zero, then registers this object with the
112 |     // SpecRunner singleton.
113 |     SpecBase::SpecBase()
114 |         : mWriter(NULL), mName(NULL),
115 |           mFailed(false), mLastFailed(false), mError(false),
116 |           mExecutionPoint(0), mContinuePoint(0)
117 |     {
118 |         SpecRunner& runner = SpecRunner::getInstance();
119 |         runner.add(this);
120 |     }
115 | 
116 | 
117 |     // Nothing to release explicitly.
118 |     SpecBase::~SpecBase() {}
120 | 
121 | 
122 |     // Marks the start of the spec called `name`.
123 |     // First reports the previous spec (if any) through endSpec(). It then
124 |     // advances mExecutionPoint and returns false while that counter has not
125 |     // moved past mContinuePoint, so the caller skips this spec. Otherwise it
126 |     // advances mContinuePoint, remembers the name and returns true.
127 |     bool SpecBase::startSpec(const char* name)
128 |     {
129 |         endSpec();
130 | 
131 |         ++mExecutionPoint;
132 |         const bool alreadyHandled = (mExecutionPoint <= mContinuePoint);
133 |         if(alreadyHandled) {
134 |             return false;
135 |         }
136 | 
137 |         ++mContinuePoint;
138 |         mName = name;
139 |         return true;
140 |     }
133 | 
134 | 
135 |     // Reports the spec that is currently open (if any) to the writer and clears
136 |     // its name. The type is ERRORED when mError is set, otherwise FAILED when
137 |     // the last assertion failed, otherwise PASSED.
138 |     void SpecBase::endSpec()
139 |     {
140 |         if(mName == NULL) return;
141 | 
142 |         SpecResult result;
143 |         result.group = getGroup();
144 |         result.description = getDescription();
145 |         if(mError) {
146 |             result.type = SpecResult::ERRORED;
147 |         } else if(mLastFailed) {
148 |             result.type = SpecResult::FAILED;
149 |         } else {
150 |             result.type = SpecResult::PASSED;
151 |         }
152 |         result.test = mName;
153 |         mWriter->addSpecResult( result );
154 | 
155 |         mName = NULL;
156 |     }
150 | 
151 | 
152 |     // Checks one assertion. On success only mLastFailed is cleared. On failure
153 |     // the message and location are recorded with the writer, the failure flags
154 |     // are set and spec_failure is thrown.
155 |     void SpecBase::should_test(bool value, const char* message, const char* file, int line) {
156 |         mLastFailed = false;
157 |         if(value) return;
158 | 
159 |         mWriter->addFailedAssertation(message, file, line);
160 |         mFailed = true;
161 |         mLastFailed = true;
162 |         throw spec_failure();
163 |     }
160 | 
161 |     // Reports `msg` to the writer with file "exception" and line 0, and sets
162 |     // the failed, last-failed and error flags.
163 |     void SpecBase::error(std::string msg) {
164 |         mWriter->addFailedAssertation(msg, "exception", 0);
165 |         mError = mFailed = mLastFailed = true;
166 |     }
167 | 
168 |     // Returns true when mError is clear. Otherwise clears mError and returns
169 |     // false, which makes SpecRunner::run() start another pass of specify().
170 |     bool SpecBase::done() {
171 |         const bool finished = !mError;
172 |         mError = false;
173 |         return finished;
174 |     }
175 | 
176 | 
177 |     // Both are empty; mSpecs is default-constructed and its pointers are not deleted.
178 |     SpecRunner::SpecRunner() {}
179 |     SpecRunner::~SpecRunner() { }
180 | 
181 |     // Returns the shared runner, heap-allocating it on the first call. This
182 |     // function never deletes the object.
183 |     SpecRunner& SpecRunner::getInstance() {
184 |         static SpecRunner* theRunner = NULL;
185 |         if( !theRunner ) theRunner = new SpecRunner;
186 |         return *theRunner;
187 |     }
188 | 
189 | 
190 |     // Runs every registered spec whose group name contains `subset` (an empty
191 |     // subset matches all). Returns true only if no selected spec has failed.
192 |     bool SpecRunner::run(SpecWriter& writer, const std::string subset) {
193 |         bool success = true;
194 |         writer.start();
195 |         std::vector<SpecBase*>::iterator i = mSpecs.begin();
196 |         for(; i != mSpecs.end(); ++i) {
197 |             SpecBase *b = *i;
198 |             if( b->getGroup().find(subset, 0) == std::string::npos ) continue;
199 |             b->mContinuePoint = 0;
200 |             b->setWriter(&writer);
201 |             writer.startGroup( b->getGroup(), b->getDescription() );
202 |             do {
203 |                 b->mExecutionPoint = 0;
204 |                 b->mLastFailed = false; // an earlier failure must not mark the next spec
205 |                 try {
206 |                     b->specify();
207 |                 } catch(spec_failure&) {
208 |                     // Report first: with mError set, endSpec() would say ERRORED, not FAILED.
209 |                     b->endSpec();
210 |                     b->mError = true;   // still request another pass via done()
211 |                 } catch( std::exception& e) {
212 |                     b->error(e.what());
213 |                 } catch( ... ) {
214 |                     b->error("unknown exception");
215 |                 }
216 |                 b->endSpec();           // no-op if the spec was already reported
217 |             } while( !b->done() );
218 |             success = success && b->isSuccessful();
219 |         }
220 |         writer.stop();
221 |         return success;
222 |     }
223 | 
224 | 
225 | }
226 | 
227 | 
228 | 
229 | 
230 | 


--------------------------------------------------------------------------------
/spec/spec.h:
--------------------------------------------------------------------------------
  1 | /* Specific - Minimal C++ spec framework.
  2 |  
  3 | 
  4 | The zlib/libpng License
  5 | 
  6 | 
  7 | Copyright (c) 2008 Mikko Lehtonen
  8 | 
  9 | This software is provided 'as-is', without any express or implied
 10 | warranty. In no event will the authors be held liable for any damages
 11 | arising from the use of this software.
 12 | 
 13 | Permission is granted to anyone to use this software for any purpose,
 14 | including commercial applications, and to alter it and redistribute it
 15 | freely, subject to the following restrictions:
 16 | 
 17 |     1. The origin of this software must not be misrepresented; you must not
 18 |     claim that you wrote the original software. If you use this software
 19 |     in a product, an acknowledgment in the product documentation would be
 20 |     appreciated but is not required.
 21 | 
 22 |     2. Altered source versions must be plainly marked as such, and must not be
 23 |     misrepresented as being the original software.
 24 | 
 25 |     3. This notice may not be removed or altered from any source
 26 |     distribution.
 27 | */
 28 | 
 29 | 
 30 | #ifndef SPECIFIC_SPEC_H
 31 | #define SPECIFIC_SPEC_H
 32 | 
 33 | #include <string>
 34 | #include <vector>
 35 | #include <stdexcept>
 36 | #include <sstream>
 37 | 
 38 | namespace specific {
 39 | 
 40 |     // Outcome of a single spec (it() block), as handed to SpecWriter::addSpecResult().
 41 |     class SpecResult {
 42 |     public:
 43 |         // Status values; SpecBase::endSpec() picks one of them.
 44 |         enum Type {
 45 |             PASSED,
 46 |             FAILED,
 47 |             ERRORED
 48 |         };
 49 | 
 50 |         Type type;
 51 |         std::string group;         // getGroup() of the owning spec class
 52 |         std::string description;   // getDescription() of the owning spec class
 53 |         std::string test;          // name passed to it()
 54 |     };
 55 | 
 56 |     // One failed assertion: its message and the file/line it was raised from.
 57 |     class SpecFailure {
 58 |     public:
 59 |         std::string msg;
 60 |         const char* file;   // pointer is stored as-is, the text is not copied
 61 |         int line;
 62 |         SpecFailure(std::string amsg, const char* afile, int aline)
 63 |             : msg(amsg), file(afile), line(aline) { }
 64 |     };
 65 | 
 66 |     // Base class of the result reporters. NOTE(review): the member functions are defined outside this header.
 67 |     class SpecWriter {
 68 |     public:
 69 |         std::vector<SpecResult> mResults;    // presumably filled by addSpecResult() - TODO confirm
 70 |         std::vector<SpecFailure> mFailures;  // presumably filled by addFailedAssertation() - TODO confirm
 71 |         SpecWriter() {}
 72 |         virtual ~SpecWriter() {}
 73 |         virtual void startGroup(std::string group, std::string description); // called by SpecRunner::run() per selected group
 74 |         virtual void addFailedAssertation(std::string msg, const char *file, int line); // called for each failed assertion / error
 75 |         virtual void addSpecResult(SpecResult r); // called by SpecBase::endSpec()
 76 |         virtual void start(); // called once at the beginning of SpecRunner::run()
 77 |         virtual void stop();  // called once at the end of SpecRunner::run()
 78 |     };
 79 | 
 80 |     // Writer that overrides only addSpecResult(); the override is defined outside this header.
 81 |     class ProgressWriter : public SpecWriter {
 82 |     public:
 83 |         void addSpecResult(SpecResult r);
 84 |     };
 85 | 
 86 | 
 87 |     // Writer that overrides startGroup() and addSpecResult(); the overrides are defined outside this header.
 88 |     class SpecdocWriter : public SpecWriter {
 89 |     public:
 90 |         void startGroup(std::string group, std::string description);
 91 |         void addSpecResult(SpecResult r);
 92 |     };
 93 | 
 94 | 
 95 |     // Renders `value` as text through its operator<< overload.
 96 |     template<class T> std::string inspect(const T& value) {
 97 |         std::ostringstream out;
 98 |         out << value;
 99 |         return out.str();
100 |     }
101 | 
102 |     // Base class of the classes generated by describe(); holds the run state of one spec group.
103 |     class SpecBase {
104 |     public:
105 |         SpecBase();
106 |         virtual ~SpecBase();
107 |         // Body of the describe() block; supplied by the generated subclass.
108 |         virtual void specify() = 0;
109 | 
110 |         void setWriter(SpecWriter* w) { mWriter = w; }
111 |         // startSpec() is what it() expands to; endSpec() reports the open spec to the writer.
112 |         bool startSpec(const char* name);
113 |         void endSpec();
114 |         // Core assertion; `message`, `file` and `line` are passed to the writer when `value` is false.
115 |         void should_test(bool value, const char* message, const char* file, int line);
116 |         // Asserts a == b; the message shows both operands rendered with inspect().
117 |         template<typename T1, typename T2> void should_equal_template(const T1& a, const T2& b, const char* file, int line) {
118 |             std::stringstream ss;
119 |             ss << "`" << ::specific::inspect(a) << "'" << " == " << "`" << ::specific::inspect(b) << "'";
120 |             should_test( a == b, ss.str().c_str(), file, line);
121 |         }
122 |         // Asserts a != b; the message shows both operands rendered with inspect().
123 |         template<typename T1, typename T2> void should_not_equal_template(const T1& a, const T2& b, const char* file, int line) {
124 |             std::stringstream ss;
125 |             ss << "`" << ::specific::inspect(a) << "'" << " != " << "`" << ::specific::inspect(b) << "'";
126 |             should_test( a != b, ss.str().c_str(), file, line);
127 |         }
128 | 
129 | 
130 |         // Group name and description; the describe() macro generates both overrides.
131 |         virtual std::string getGroup() = 0;
132 |         virtual std::string getDescription() = 0;
133 |         // True as long as mFailed has not been set.
134 |         bool isSuccessful() { return !mFailed; }
135 |         
136 |         bool done();
137 | 
138 |         void error(std::string msg);
139 | 
140 |         SpecWriter* mWriter;      // reporter installed through setWriter()
141 |         const char* mName;        // name of the open spec, NULL when none is open
142 |         bool mFailed;             // set by should_test() on failure and by error()
143 |         bool mLastFailed;         // outcome of the most recent should_test() call
144 |         bool mError;              // set when a pass of specify() was aborted; cleared by done()
145 |         int mExecutionPoint;      // it() blocks reached so far in the current pass
146 |         int mContinuePoint;       // it() blocks that have been started so far
147 |         char *mFile;              // NOTE(review): not referenced by the code shown here
148 |         std::string mErrorMessage; // NOTE(review): not referenced by the code shown here
149 |         int mLine;                // NOTE(review): not referenced by the code shown here
150 |     };
151 | 
152 |     // Registry of all SpecBase instances, reachable only through getInstance().
153 |     class SpecRunner {
154 |     public:
155 |         static SpecRunner& getInstance();
156 |         void add(SpecBase* spec) { mSpecs.push_back( spec ); }
157 |         bool run(SpecWriter& writer, const std::string subset = "");
158 |     private:
159 |         // Registered specs in registration order.
160 |         std::vector<SpecBase*> mSpecs;
161 |         // Private so that only members such as getInstance() can create or destroy a runner.
162 |         SpecRunner();
163 |         ~SpecRunner();
164 |     };
165 |     // Two-level paste so that arguments such as __LINE__ are expanded before ## is applied.
166 |     #define SPEC_UNIQUE_NAME3(x,y) x##y
167 |     #define SPEC_UNIQUE_NAME2(x,y) SPEC_UNIQUE_NAME3(x,y)
168 |     // SPEC_NAME(x) yields the identifier SPEC_<x>_startingOnLine<line>.
169 |     #define SPEC_NAME(x) SPEC_UNIQUE_NAME2(SPEC_##x, SPEC_UNIQUE_NAME2(_startingOnLine, __LINE__) )
170 |     // describe(group, description) { ... }: declares a SpecBase subclass, creates one static
171 |     // instance of it (the SpecBase constructor registers it) and opens the body of specify().
172 |     #define describe(group, description)                                    \
173 |     class SPEC_NAME(group) : public specific::SpecBase                         \
174 |     {                                                                       \
175 |     public:                                                                 \
176 |         void specify();                                                     \
177 |         std::string getGroup() { return #group; }                           \
178 |         std::string getDescription() { return description; }                \
179 |     };                                                                      \
180 |     static SPEC_NAME(group) SPEC_UNIQUE_NAME2(SPEC_NAME(group), _instance); \
181 |     void SPEC_NAME(group)::specify()
182 |     
183 |     // it("name") { ... }: the block runs only when startSpec("name") returns true.
184 |     #define it(description) if(startSpec(description))
185 | 
186 |     // Matchers. The negated argument is parenthesised so that
187 |     // should_be_false(a == b) tests !(a == b) instead of (!a) == b.
188 |     #define should_be_true(a) should_test(a, #a, __FILE__, __LINE__)
189 |     #define should_be_false(a) should_be_true( !(a) )
190 |     // Equality matchers; with SPECIFIC_NO_OSTREAM defined they use should_be_true and do not stream the operands.
191 |     #ifndef SPECIFIC_NO_OSTREAM
192 |         #define should_equal(a, b) should_equal_template( a,b, __FILE__, __LINE__ )
193 |         #define should_not_equal(a, b) should_not_equal_template( a,b, __FILE__, __LINE__ )
194 |     #else
195 |         #define should_equal(a, b) should_be_true( (a) == (b) )
196 |         #define should_not_equal(a, b) should_be_true( (a) != (b) )
197 |     #endif
198 |     // should_throw(code, what): passes when running `code` throws something catchable as `what&`.
199 |     #define should_throw(code, what) \
200 |     do {                             \
201 |         bool _thrown = false;        \
202 |         try {                        \
203 |           code ;                     \
204 |         } catch(what& e) {           \
205 |             _thrown = true;          \
206 |         }                            \
207 |         should_test(_thrown, "should throw exception " #what, __FILE__, __LINE__); \
208 |     } while(0)
209 | 
210 | 
211 | 
212 | }
213 | 
214 | 
215 | 
216 | #endif /* Include guard */
217 | 
218 | 


--------------------------------------------------------------------------------
/include/vectorial/simd4f_sse.h:
--------------------------------------------------------------------------------
  1 | /*
  2 |   Vectorial
  3 |   Copyright (c) 2010 Mikko Lehtonen
  4 |   Copyright (c) 2014 Google, Inc.
  5 |   Licensed under the terms of the two-clause BSD License (see LICENSE)
  6 | */
  7 | #ifndef VECTORIAL_SIMD4F_SSE_H
  8 | #define VECTORIAL_SIMD4F_SSE_H
  9 | 
 10 | // Conditionally enable SSE4.1 otherwise fallback to SSE.
 11 | // MSVC's _M_IX86_FP >= 2 only means /arch:SSE2 or later, so it must not enable
 12 | // the SSE4.1-only _mm_dp_ps path. GCC/Clang define __SSE4_1__; MSVC has no
 13 | // SSE4.1 macro, but /arch:AVX (which defines __AVX__) implies SSE4.1 support.
 14 | #if defined(__SSE4_1__) || defined(__AVX__)
 15 |     #define VECTORIAL_USE_SSE4_1
 16 | #endif
 17 | 
 18 | 
 19 | #include <xmmintrin.h>
 20 | #if defined(VECTORIAL_USE_SSE4_1)
 21 |     #include <smmintrin.h>
 22 | #endif
 23 | #include <string.h>  // memcpy
 24 | 
 25 | #ifdef __cplusplus
 26 | extern "C" {
 27 | #endif
 28 | 
 29 | 
 30 | typedef __m128 simd4f; 
 31 | // Overlay used to read a simd4f as four floats or four unsigned ints.
 32 | typedef union {
 33 |     simd4f s ;
 34 |     float f[4];
 35 |     unsigned int ui[4];
 36 | } _simd4f_union;
 37 | 
 38 | // creating
 39 | 
 40 | // Builds the vector (x, y, z, w); x goes into the lowest lane.
 41 | vectorial_inline simd4f simd4f_create(float x, float y, float z, float w) {
 42 |     return _mm_setr_ps(x, y, z, w);
 43 | }
 44 | // Returns a vector with all four lanes set to zero.
 45 | vectorial_inline simd4f simd4f_zero() { return _mm_setzero_ps(); }
 46 | 
 47 | // Loads four floats from `ary` with an unaligned load.
 48 | vectorial_inline simd4f simd4f_uload4(const float *ary) {
 49 |     return _mm_loadu_ps(ary);
 50 | }
 51 | 
 52 | // Loads ary[0..2] into x, y, z and sets w to zero.
 53 | vectorial_inline simd4f simd4f_uload3(const float *ary) {
 54 |     return simd4f_create(ary[0], ary[1], ary[2], 0);
 55 | }
 56 | 
 57 | // Loads ary[0..1] into x, y and sets z and w to zero.
 58 | vectorial_inline simd4f simd4f_uload2(const float *ary) {
 59 |     return simd4f_create(ary[0], ary[1], 0, 0);
 60 | }
 61 | 
 62 | // Stores all four lanes of `val` to `ary` with an unaligned store.
 63 | vectorial_inline void simd4f_ustore4(const simd4f val, float *ary) {
 64 |     _mm_storeu_ps(ary, val);
 65 | }
 66 | // Copies the first three floats (x, y, z) of `val` to `ary`.
 67 | vectorial_inline void simd4f_ustore3(const simd4f val, float *ary) {
 68 |     memcpy(ary, &val, 3 * sizeof(float));
 69 | }
 70 | // Copies the first two floats (x, y) of `val` to `ary`.
 71 | vectorial_inline void simd4f_ustore2(const simd4f val, float *ary) {
 72 |     memcpy(ary, &val, 2 * sizeof(float));
 73 | }
 74 | 
 75 | 
 76 | // utilities
 77 | 
 78 | // Broadcasts the scalar `v` to all four lanes.
 79 | vectorial_inline simd4f simd4f_splat(float v) {
 80 |     return _mm_set1_ps(v);
 81 | }
 82 | 
 83 | // Broadcasts lane x of `v` to all four lanes.
 84 | vectorial_inline simd4f simd4f_splat_x(simd4f v) {
 85 |     return _mm_shuffle_ps(v, v, _MM_SHUFFLE(0,0,0,0));
 86 | }
 87 | 
 88 | // Broadcasts lane y of `v` to all four lanes.
 89 | vectorial_inline simd4f simd4f_splat_y(simd4f v) {
 90 |     return _mm_shuffle_ps(v, v, _MM_SHUFFLE(1,1,1,1));
 91 | }
 92 | 
 93 | // Broadcasts lane z of `v` to all four lanes.
 94 | vectorial_inline simd4f simd4f_splat_z(simd4f v) {
 95 |     return _mm_shuffle_ps(v, v, _MM_SHUFFLE(2,2,2,2));
 96 | }
 97 | 
 98 | // Broadcasts lane w of `v` to all four lanes.
 99 | vectorial_inline simd4f simd4f_splat_w(simd4f v) {
100 |     return _mm_shuffle_ps(v, v, _MM_SHUFFLE(3,3,3,3));
101 | }
102 | 
103 | 
104 | // arithmetic
105 | 
106 | // Lane-wise lhs + rhs.
107 | vectorial_inline simd4f simd4f_add(simd4f lhs, simd4f rhs) {
108 |     return _mm_add_ps(lhs, rhs);
109 | }
110 | 
111 | // Lane-wise lhs - rhs.
112 | vectorial_inline simd4f simd4f_sub(simd4f lhs, simd4f rhs) {
113 |     return _mm_sub_ps(lhs, rhs);
114 | }
115 | 
116 | // Lane-wise lhs * rhs.
117 | vectorial_inline simd4f simd4f_mul(simd4f lhs, simd4f rhs) {
118 |     return _mm_mul_ps(lhs, rhs);
119 | }
120 | 
121 | // Lane-wise lhs / rhs.
122 | vectorial_inline simd4f simd4f_div(simd4f lhs, simd4f rhs) {
123 |     return _mm_div_ps(lhs, rhs);
124 | }
125 | // Lane-wise m1 * m2 + a, done as a separate multiply followed by an add.
126 | vectorial_inline simd4f simd4f_madd(simd4f m1, simd4f m2, simd4f a) {
127 |     return _mm_add_ps( _mm_mul_ps(m1, m2), a );
128 | }
129 | 
130 | 
131 | // Lane-wise reciprocal: the _mm_rcp_ps estimate r refined by one
132 | // Newton-Raphson step, r * (2 - v * r).
133 | vectorial_inline simd4f simd4f_reciprocal(simd4f v) {
134 |     const simd4f estimate = _mm_rcp_ps(v);
135 |     const simd4f two = simd4f_create(2.0f, 2.0f, 2.0f, 2.0f);
136 |     const simd4f correction = simd4f_sub(two, simd4f_mul(v, estimate));
137 |     return simd4f_mul(estimate, correction);
138 | }
139 | 
140 | // Lane-wise square root.
141 | vectorial_inline simd4f simd4f_sqrt(simd4f v) {
142 |     return _mm_sqrt_ps(v);
143 | }
144 | // Lane-wise 1/sqrt(v): the _mm_rsqrt_ps estimate r refined as (r * 0.5) * (3 - r * (v * r)).
145 | vectorial_inline simd4f simd4f_rsqrt(simd4f v) {
146 |     const simd4f estimate = _mm_rsqrt_ps(v);
147 |     const simd4f half = simd4f_create(0.5f, 0.5f, 0.5f, 0.5f);
148 |     const simd4f three = simd4f_create(3.0f, 3.0f, 3.0f, 3.0f);
149 |     const simd4f rvr = simd4f_mul(estimate, simd4f_mul(v, estimate));
150 |     return simd4f_mul(simd4f_mul(estimate, half), simd4f_sub(three, rvr));
151 | }
152 | // Lane accessors: copy the vector into a _simd4f_union and read one float (x = f[0] ... w = f[3]).
153 | vectorial_inline float simd4f_get_x(simd4f s) { _simd4f_union u={s}; return u.f[0]; }
154 | vectorial_inline float simd4f_get_y(simd4f s) { _simd4f_union u={s}; return u.f[1]; }
155 | vectorial_inline float simd4f_get_z(simd4f s) { _simd4f_union u={s}; return u.f[2]; }
156 | vectorial_inline float simd4f_get_w(simd4f s) { _simd4f_union u={s}; return u.f[3]; }
157 | // Dot product of the x, y, z lanes (w is ignored); the sum is written to all four lanes.
158 | vectorial_inline simd4f simd4f_dot3(simd4f lhs,simd4f rhs) {
159 | #if defined(VECTORIAL_USE_SSE4_1)
160 |     return _mm_dp_ps(lhs, rhs, 0x7f); // 0x7: multiply x, y, z only; 0xf: store the sum in every lane
161 | #else
162 |     simd4f_aligned16 const unsigned int mask_array[] = { 0xffffffff, 0xffffffff, 0xffffffff, 0 };
163 |     const simd4f mask = _mm_load_ps((const float*)mask_array);
164 |     const simd4f m = _mm_mul_ps(lhs, rhs);
165 |     const simd4f s0 = _mm_and_ps(m, mask); // zero the w product
166 |     const simd4f s1 = _mm_add_ps(s0, _mm_movehl_ps(s0, s0)); // lane 0 = x+z, lane 1 = y+0
167 |     const simd4f s2 = _mm_add_ss(s1, _mm_shuffle_ps(s1, s1, 1)); // lane 0 = (x+z)+y
168 |     return _mm_shuffle_ps(s2,s2, 0); // broadcast lane 0
169 | #endif
170 | }
171 | // Same as simd4f_dot3(), returned as a plain float (the x lane of the result).
172 | vectorial_inline float simd4f_dot3_scalar(simd4f lhs,simd4f rhs) {
173 |     return simd4f_get_x(simd4f_dot3(lhs, rhs));
174 | }
175 | 
176 | // Cross product of the x, y, z lanes: l.yzx * r.zxy - l.zxy * r.yzx.
177 | vectorial_inline simd4f simd4f_cross3(simd4f lhs, simd4f rhs) {
178 |     // Rotate the first three lanes of each operand; w stays in lane 3.
179 |     const simd4f l_yzx = _mm_shuffle_ps(lhs, lhs, _MM_SHUFFLE(3,0,2,1));
180 |     const simd4f l_zxy = _mm_shuffle_ps(lhs, lhs, _MM_SHUFFLE(3,1,0,2));
181 |     const simd4f r_yzx = _mm_shuffle_ps(rhs, rhs, _MM_SHUFFLE(3,0,2,1));
182 |     const simd4f r_zxy = _mm_shuffle_ps(rhs, rhs, _MM_SHUFFLE(3,1,0,2));
183 |     const simd4f first = _mm_mul_ps(l_yzx, r_zxy);
184 |     const simd4f second = _mm_mul_ps(l_zxy, r_yzx);
185 |     return _mm_sub_ps(first, second);
186 | }
187 | // Lane rotations; the suffix names the source lanes in result order, e.g. wxyz returns (w, x, y, z).
188 | vectorial_inline simd4f simd4f_shuffle_wxyz(simd4f s) { return _mm_shuffle_ps(s,s, _MM_SHUFFLE(2,1,0,3) ); }
189 | vectorial_inline simd4f simd4f_shuffle_zwxy(simd4f s) { return _mm_shuffle_ps(s,s, _MM_SHUFFLE(1,0,3,2) ); }
190 | vectorial_inline simd4f simd4f_shuffle_yzwx(simd4f s) { return _mm_shuffle_ps(s,s, _MM_SHUFFLE(0,3,2,1) ); }
191 | // Returns (x, y, z, 0).
192 | vectorial_inline simd4f simd4f_zero_w(simd4f s) {
193 |     const simd4f z0w0 = _mm_unpackhi_ps(s, _mm_setzero_ps()); // (z, 0, w, 0)
194 |     return _mm_movelh_ps(s, z0w0);                            // (x, y, z, 0)
195 | }
196 | // Returns (x, y, 0, 0).
197 | vectorial_inline simd4f simd4f_zero_zw(simd4f s) {
198 |     return _mm_movelh_ps(s, _mm_setzero_ps());
199 | }
200 | // Returns (xyzw.z, xyzw.w, abcd.z, abcd.w): the upper halves of both inputs.
201 | vectorial_inline simd4f simd4f_merge_high(simd4f xyzw, simd4f abcd) { 
202 |     return _mm_movehl_ps(abcd, xyzw);
203 | }
204 | 
205 | // 16-byte aligned storage for four 32-bit patterns that can be loaded as floats.
206 | typedef simd4f_aligned16 union {
207 |     unsigned int ui[4];
208 |     float f[4];
209 | } _simd4f_uif;
210 | // Negates the y and w lanes by XOR-ing their sign bits.
211 | vectorial_inline simd4f simd4f_flip_sign_0101(simd4f s) {
212 |     const _simd4f_uif signBits = { { 0x00000000, 0x80000000, 0x00000000, 0x80000000 } };
213 |     return _mm_xor_ps( s, _mm_load_ps(signBits.f) );
214 | }
215 | // Negates the x and z lanes by XOR-ing their sign bits.
216 | vectorial_inline simd4f simd4f_flip_sign_1010(simd4f s) {
217 |     const _simd4f_uif signBits = { { 0x80000000, 0x00000000, 0x80000000, 0x00000000 } };
218 |     return _mm_xor_ps( s, _mm_load_ps(signBits.f) );
219 | }
220 | // Lane-wise minimum of a and b.
221 | vectorial_inline simd4f simd4f_min(simd4f a, simd4f b) {
222 |     return _mm_min_ps( a, b ); 
223 | }
224 | // Lane-wise maximum of a and b.
225 | vectorial_inline simd4f simd4f_max(simd4f a, simd4f b) {
226 |     return _mm_max_ps( a, b ); 
227 | }
228 | 
229 | 
230 | 
231 | #ifdef __cplusplus
232 | }
233 | #endif
234 | 
235 | 
236 | #endif
237 | 


--------------------------------------------------------------------------------
/vectorialbenchmark.vcproj:
--------------------------------------------------------------------------------
  1 | <?xml version="1.0" encoding="Windows-1252"?>
  2 | <VisualStudioProject
  3 | 	ProjectType="Visual C++"
  4 | 	Version="9,00"
  5 | 	Name="vectorial benchmark"
  6 | 	ProjectGUID="{1E78F64D-C404-4048-8AE6-217089480E8A}"
  7 | 	RootNamespace="vectorialbenchmark"
  8 | 	Keyword="Win32Proj"
  9 | 	TargetFrameworkVersion="196613"
 10 | 	>
 11 | 	<Platforms>
 12 | 		<Platform
 13 | 			Name="Win32"
 14 | 		/>
 15 | 	</Platforms>
 16 | 	<ToolFiles>
 17 | 	</ToolFiles>
 18 | 	<Configurations>
 19 | 		<Configuration
 20 | 			Name="Debug|Win32"
 21 | 			OutputDirectory="$(SolutionDir)$(ConfigurationName)"
 22 | 			IntermediateDirectory="$(ConfigurationName)"
 23 | 			ConfigurationType="1"
 24 | 			CharacterSet="1"
 25 | 			>
 26 | 			<Tool
 27 | 				Name="VCPreBuildEventTool"
 28 | 			/>
 29 | 			<Tool
 30 | 				Name="VCCustomBuildTool"
 31 | 			/>
 32 | 			<Tool
 33 | 				Name="VCXMLDataGeneratorTool"
 34 | 			/>
 35 | 			<Tool
 36 | 				Name="VCWebServiceProxyGeneratorTool"
 37 | 			/>
 38 | 			<Tool
 39 | 				Name="VCMIDLTool"
 40 | 			/>
 41 | 			<Tool
 42 | 				Name="VCCLCompilerTool"
 43 | 				Optimization="0"
 44 | 				AdditionalIncludeDirectories="include"
 45 | 				PreprocessorDefinitions="WIN32;_DEBUG;_CONSOLE"
 46 | 				MinimalRebuild="true"
 47 | 				BasicRuntimeChecks="3"
 48 | 				RuntimeLibrary="3"
 49 | 				UsePrecompiledHeader="0"
 50 | 				WarningLevel="3"
 51 | 				DebugInformationFormat="4"
 52 | 			/>
 53 | 			<Tool
 54 | 				Name="VCManagedResourceCompilerTool"
 55 | 			/>
 56 | 			<Tool
 57 | 				Name="VCResourceCompilerTool"
 58 | 			/>
 59 | 			<Tool
 60 | 				Name="VCPreLinkEventTool"
 61 | 			/>
 62 | 			<Tool
 63 | 				Name="VCLinkerTool"
 64 | 				LinkIncremental="2"
 65 | 				GenerateDebugInformation="true"
 66 | 				SubSystem="1"
 67 | 				TargetMachine="1"
 68 | 			/>
 69 | 			<Tool
 70 | 				Name="VCALinkTool"
 71 | 			/>
 72 | 			<Tool
 73 | 				Name="VCManifestTool"
 74 | 			/>
 75 | 			<Tool
 76 | 				Name="VCXDCMakeTool"
 77 | 			/>
 78 | 			<Tool
 79 | 				Name="VCBscMakeTool"
 80 | 			/>
 81 | 			<Tool
 82 | 				Name="VCFxCopTool"
 83 | 			/>
 84 | 			<Tool
 85 | 				Name="VCAppVerifierTool"
 86 | 			/>
 87 | 			<Tool
 88 | 				Name="VCPostBuildEventTool"
 89 | 			/>
 90 | 		</Configuration>
 91 | 		<Configuration
 92 | 			Name="Release|Win32"
 93 | 			OutputDirectory="$(SolutionDir)$(ConfigurationName)"
 94 | 			IntermediateDirectory="$(ConfigurationName)"
 95 | 			ConfigurationType="1"
 96 | 			CharacterSet="1"
 97 | 			WholeProgramOptimization="1"
 98 | 			>
 99 | 			<Tool
100 | 				Name="VCPreBuildEventTool"
101 | 			/>
102 | 			<Tool
103 | 				Name="VCCustomBuildTool"
104 | 			/>
105 | 			<Tool
106 | 				Name="VCXMLDataGeneratorTool"
107 | 			/>
108 | 			<Tool
109 | 				Name="VCWebServiceProxyGeneratorTool"
110 | 			/>
111 | 			<Tool
112 | 				Name="VCMIDLTool"
113 | 			/>
114 | 			<Tool
115 | 				Name="VCCLCompilerTool"
116 | 				Optimization="2"
117 | 				EnableIntrinsicFunctions="true"
118 | 				AdditionalIncludeDirectories="include"
119 | 				PreprocessorDefinitions="WIN32;NDEBUG;_CONSOLE;NOMINMAX"
120 | 				RuntimeLibrary="2"
121 | 				EnableFunctionLevelLinking="false"
122 | 				EnableEnhancedInstructionSet="2"
123 | 				FloatingPointModel="2"
124 | 				UsePrecompiledHeader="0"
125 | 				WarningLevel="3"
126 | 				DebugInformationFormat="3"
127 | 			/>
128 | 			<Tool
129 | 				Name="VCManagedResourceCompilerTool"
130 | 			/>
131 | 			<Tool
132 | 				Name="VCResourceCompilerTool"
133 | 			/>
134 | 			<Tool
135 | 				Name="VCPreLinkEventTool"
136 | 			/>
137 | 			<Tool
138 | 				Name="VCLinkerTool"
139 | 				LinkIncremental="1"
140 | 				GenerateDebugInformation="true"
141 | 				SubSystem="1"
142 | 				OptimizeReferences="2"
143 | 				EnableCOMDATFolding="2"
144 | 				TargetMachine="1"
145 | 			/>
146 | 			<Tool
147 | 				Name="VCALinkTool"
148 | 			/>
149 | 			<Tool
150 | 				Name="VCManifestTool"
151 | 			/>
152 | 			<Tool
153 | 				Name="VCXDCMakeTool"
154 | 			/>
155 | 			<Tool
156 | 				Name="VCBscMakeTool"
157 | 			/>
158 | 			<Tool
159 | 				Name="VCFxCopTool"
160 | 			/>
161 | 			<Tool
162 | 				Name="VCAppVerifierTool"
163 | 			/>
164 | 			<Tool
165 | 				Name="VCPostBuildEventTool"
166 | 			/>
167 | 		</Configuration>
168 | 		<Configuration
169 | 			Name="Release Scalar|Win32"
170 | 			OutputDirectory="$(SolutionDir)$(ConfigurationName)"
171 | 			IntermediateDirectory="$(ConfigurationName)"
172 | 			ConfigurationType="1"
173 | 			CharacterSet="1"
174 | 			WholeProgramOptimization="1"
175 | 			>
176 | 			<Tool
177 | 				Name="VCPreBuildEventTool"
178 | 			/>
179 | 			<Tool
180 | 				Name="VCCustomBuildTool"
181 | 			/>
182 | 			<Tool
183 | 				Name="VCXMLDataGeneratorTool"
184 | 			/>
185 | 			<Tool
186 | 				Name="VCWebServiceProxyGeneratorTool"
187 | 			/>
188 | 			<Tool
189 | 				Name="VCMIDLTool"
190 | 			/>
191 | 			<Tool
192 | 				Name="VCCLCompilerTool"
193 | 				Optimization="2"
194 | 				EnableIntrinsicFunctions="true"
195 | 				AdditionalIncludeDirectories="include"
196 | 				PreprocessorDefinitions="WIN32;NDEBUG;_CONSOLE"
197 | 				RuntimeLibrary="2"
198 | 				EnableFunctionLevelLinking="false"
199 | 				EnableEnhancedInstructionSet="0"
200 | 				FloatingPointModel="2"
201 | 				UsePrecompiledHeader="0"
202 | 				WarningLevel="3"
203 | 				DebugInformationFormat="3"
204 | 			/>
205 | 			<Tool
206 | 				Name="VCManagedResourceCompilerTool"
207 | 			/>
208 | 			<Tool
209 | 				Name="VCResourceCompilerTool"
210 | 			/>
211 | 			<Tool
212 | 				Name="VCPreLinkEventTool"
213 | 			/>
214 | 			<Tool
215 | 				Name="VCLinkerTool"
216 | 				LinkIncremental="1"
217 | 				GenerateDebugInformation="true"
218 | 				SubSystem="1"
219 | 				OptimizeReferences="2"
220 | 				EnableCOMDATFolding="2"
221 | 				TargetMachine="1"
222 | 			/>
223 | 			<Tool
224 | 				Name="VCALinkTool"
225 | 			/>
226 | 			<Tool
227 | 				Name="VCManifestTool"
228 | 			/>
229 | 			<Tool
230 | 				Name="VCXDCMakeTool"
231 | 			/>
232 | 			<Tool
233 | 				Name="VCBscMakeTool"
234 | 			/>
235 | 			<Tool
236 | 				Name="VCFxCopTool"
237 | 			/>
238 | 			<Tool
239 | 				Name="VCAppVerifierTool"
240 | 			/>
241 | 			<Tool
242 | 				Name="VCPostBuildEventTool"
243 | 			/>
244 | 		</Configuration>
245 | 	</Configurations>
246 | 	<References>
247 | 	</References>
248 | 	<Files>
249 | 		<Filter
250 | 			Name="vectorial"
251 | 			>
252 | 			<File
253 | 				RelativePath=".\include\vectorial\config.h"
254 | 				>
255 | 			</File>
256 | 			<File
257 | 				RelativePath=".\include\vectorial\simd4f.h"
258 | 				>
259 | 			</File>
260 | 			<File
261 | 				RelativePath=".\include\vectorial\simd4f_common.h"
262 | 				>
263 | 			</File>
264 | 			<File
265 | 				RelativePath=".\include\vectorial\simd4f_gnu.h"
266 | 				>
267 | 			</File>
268 | 			<File
269 | 				RelativePath=".\include\vectorial\simd4f_neon.h"
270 | 				>
271 | 			</File>
272 | 			<File
273 | 				RelativePath=".\include\vectorial\simd4f_scalar.h"
274 | 				>
275 | 			</File>
276 | 			<File
277 | 				RelativePath=".\include\vectorial\simd4f_sse.h"
278 | 				>
279 | 			</File>
280 | 			<File
281 | 				RelativePath=".\include\vectorial\simd4x4f.h"
282 | 				>
283 | 			</File>
284 | 			<File
285 | 				RelativePath=".\include\vectorial\simd4x4f_gnu.h"
286 | 				>
287 | 			</File>
288 | 			<File
289 | 				RelativePath=".\include\vectorial\simd4x4f_neon.h"
290 | 				>
291 | 			</File>
292 | 			<File
293 | 				RelativePath=".\include\vectorial\simd4x4f_scalar.h"
294 | 				>
295 | 			</File>
296 | 			<File
297 | 				RelativePath=".\include\vectorial\simd4x4f_sse.h"
298 | 				>
299 | 			</File>
300 | 			<File
301 | 				RelativePath=".\include\vectorial\vec2f.h"
302 | 				>
303 | 			</File>
304 | 			<File
305 | 				RelativePath=".\include\vectorial\vec3f.h"
306 | 				>
307 | 			</File>
308 | 			<File
309 | 				RelativePath=".\include\vectorial\vec4f.h"
310 | 				>
311 | 			</File>
312 | 		</Filter>
313 | 		<Filter
314 | 			Name="bench"
315 | 			>
316 | 			<File
317 | 				RelativePath=".\bench\add_bench.cpp"
318 | 				>
319 | 			</File>
320 | 			<File
321 | 				RelativePath=".\bench\bench.cpp"
322 | 				>
323 | 			</File>
324 | 			<File
325 | 				RelativePath=".\bench\bench.h"
326 | 				>
327 | 			</File>
328 | 			<File
329 | 				RelativePath=".\bench\dot_bench.cpp"
330 | 				>
331 | 			</File>
332 | 			<File
333 | 				RelativePath=".\bench\quad_bench.cpp"
334 | 				>
335 | 			</File>
336 | 		</Filter>
337 | 	</Files>
338 | 	<Globals>
339 | 	</Globals>
340 | </VisualStudioProject>
341 | 


--------------------------------------------------------------------------------
/vectorial.vcproj:
--------------------------------------------------------------------------------
  1 | <?xml version="1.0" encoding="UTF-8"?>
  2 | <VisualStudioProject
  3 | 	ProjectType="Visual C++"
  4 | 	Version="9,00"
  5 | 	Name="vectorial specsuite"
  6 | 	ProjectGUID="{9450BCE8-02CB-4169-8471-2DFF764817F4}"
  7 | 	RootNamespace="vectorial specsuite"
  8 | 	Keyword="Win32Proj"
  9 | 	TargetFrameworkVersion="0"
 10 | 	>
 11 | 	<Platforms>
 12 | 		<Platform
 13 | 			Name="Win32"
 14 | 		/>
 15 | 	</Platforms>
 16 | 	<ToolFiles>
 17 | 	</ToolFiles>
 18 | 	<Configurations>
 19 | 		<Configuration
 20 | 			Name="Debug|Win32"
 21 | 			OutputDirectory="Debug"
 22 | 			IntermediateDirectory="Debug"
 23 | 			ConfigurationType="1"
 24 | 			>
 25 | 			<Tool
 26 | 				Name="VCPreBuildEventTool"
 27 | 			/>
 28 | 			<Tool
 29 | 				Name="VCCustomBuildTool"
 30 | 			/>
 31 | 			<Tool
 32 | 				Name="VCXMLDataGeneratorTool"
 33 | 			/>
 34 | 			<Tool
 35 | 				Name="VCWebServiceProxyGeneratorTool"
 36 | 			/>
 37 | 			<Tool
 38 | 				Name="VCMIDLTool"
 39 | 			/>
 40 | 			<Tool
 41 | 				Name="VCCLCompilerTool"
 42 | 				Optimization="0"
 43 | 				AdditionalIncludeDirectories="include"
 44 | 				PreprocessorDefinitions="WIN32;_DEBUG;_CONSOLE;"
 45 | 				MinimalRebuild="true"
 46 | 				BasicRuntimeChecks="3"
 47 | 				RuntimeLibrary="3"
 48 | 				FloatingPointModel="0"
 49 | 				UsePrecompiledHeader="0"
 50 | 				WarningLevel="3"
 51 | 				Detect64BitPortabilityProblems="false"
 52 | 				DebugInformationFormat="4"
 53 | 			/>
 54 | 			<Tool
 55 | 				Name="VCManagedResourceCompilerTool"
 56 | 			/>
 57 | 			<Tool
 58 | 				Name="VCResourceCompilerTool"
 59 | 			/>
 60 | 			<Tool
 61 | 				Name="VCPreLinkEventTool"
 62 | 			/>
 63 | 			<Tool
 64 | 				Name="VCLinkerTool"
 65 | 				LinkIncremental="2"
 66 | 				GenerateDebugInformation="true"
 67 | 				SubSystem="1"
 68 | 				TargetMachine="1"
 69 | 			/>
 70 | 			<Tool
 71 | 				Name="VCALinkTool"
 72 | 			/>
 73 | 			<Tool
 74 | 				Name="VCManifestTool"
 75 | 			/>
 76 | 			<Tool
 77 | 				Name="VCXDCMakeTool"
 78 | 			/>
 79 | 			<Tool
 80 | 				Name="VCBscMakeTool"
 81 | 			/>
 82 | 			<Tool
 83 | 				Name="VCFxCopTool"
 84 | 			/>
 85 | 			<Tool
 86 | 				Name="VCAppVerifierTool"
 87 | 			/>
 88 | 			<Tool
 89 | 				Name="VCPostBuildEventTool"
 90 | 			/>
 91 | 		</Configuration>
 92 | 		<Configuration
 93 | 			Name="Release|Win32"
 94 | 			OutputDirectory="Release"
 95 | 			IntermediateDirectory="Release"
 96 | 			ConfigurationType="1"
 97 | 			>
 98 | 			<Tool
 99 | 				Name="VCPreBuildEventTool"
100 | 			/>
101 | 			<Tool
102 | 				Name="VCCustomBuildTool"
103 | 			/>
104 | 			<Tool
105 | 				Name="VCXMLDataGeneratorTool"
106 | 			/>
107 | 			<Tool
108 | 				Name="VCWebServiceProxyGeneratorTool"
109 | 			/>
110 | 			<Tool
111 | 				Name="VCMIDLTool"
112 | 			/>
113 | 			<Tool
114 | 				Name="VCCLCompilerTool"
115 | 				AdditionalIncludeDirectories="include"
116 | 				PreprocessorDefinitions="WIN32;NDEBUG;_CONSOLE;NOMINMAX"
117 | 				RuntimeLibrary="2"
118 | 				EnableEnhancedInstructionSet="2"
119 | 				UsePrecompiledHeader="0"
120 | 				WarningLevel="3"
121 | 				Detect64BitPortabilityProblems="false"
122 | 				DebugInformationFormat="3"
123 | 			/>
124 | 			<Tool
125 | 				Name="VCManagedResourceCompilerTool"
126 | 			/>
127 | 			<Tool
128 | 				Name="VCResourceCompilerTool"
129 | 			/>
130 | 			<Tool
131 | 				Name="VCPreLinkEventTool"
132 | 			/>
133 | 			<Tool
134 | 				Name="VCLinkerTool"
135 | 				LinkIncremental="0"
136 | 				GenerateDebugInformation="true"
137 | 				SubSystem="1"
138 | 				OptimizeReferences="2"
139 | 				EnableCOMDATFolding="2"
140 | 				TargetMachine="1"
141 | 			/>
142 | 			<Tool
143 | 				Name="VCALinkTool"
144 | 			/>
145 | 			<Tool
146 | 				Name="VCManifestTool"
147 | 			/>
148 | 			<Tool
149 | 				Name="VCXDCMakeTool"
150 | 			/>
151 | 			<Tool
152 | 				Name="VCBscMakeTool"
153 | 			/>
154 | 			<Tool
155 | 				Name="VCFxCopTool"
156 | 			/>
157 | 			<Tool
158 | 				Name="VCAppVerifierTool"
159 | 			/>
160 | 			<Tool
161 | 				Name="VCPostBuildEventTool"
162 | 			/>
163 | 		</Configuration>
164 | 		<Configuration
165 | 			Name="Release Scalar|Win32"
166 | 			OutputDirectory="$(ConfigurationName)"
167 | 			IntermediateDirectory="$(ConfigurationName)"
168 | 			ConfigurationType="1"
169 | 			>
170 | 			<Tool
171 | 				Name="VCPreBuildEventTool"
172 | 			/>
173 | 			<Tool
174 | 				Name="VCCustomBuildTool"
175 | 			/>
176 | 			<Tool
177 | 				Name="VCXMLDataGeneratorTool"
178 | 			/>
179 | 			<Tool
180 | 				Name="VCWebServiceProxyGeneratorTool"
181 | 			/>
182 | 			<Tool
183 | 				Name="VCMIDLTool"
184 | 			/>
185 | 			<Tool
186 | 				Name="VCCLCompilerTool"
187 | 				AdditionalIncludeDirectories="include"
188 | 				PreprocessorDefinitions="WIN32;NDEBUG;_CONSOLE;"
189 | 				RuntimeLibrary="2"
190 | 				UsePrecompiledHeader="0"
191 | 				WarningLevel="3"
192 | 				Detect64BitPortabilityProblems="false"
193 | 				DebugInformationFormat="3"
194 | 			/>
195 | 			<Tool
196 | 				Name="VCManagedResourceCompilerTool"
197 | 			/>
198 | 			<Tool
199 | 				Name="VCResourceCompilerTool"
200 | 			/>
201 | 			<Tool
202 | 				Name="VCPreLinkEventTool"
203 | 			/>
204 | 			<Tool
205 | 				Name="VCLinkerTool"
206 | 				LinkIncremental="0"
207 | 				GenerateDebugInformation="true"
208 | 				SubSystem="1"
209 | 				OptimizeReferences="2"
210 | 				EnableCOMDATFolding="2"
211 | 				TargetMachine="1"
212 | 			/>
213 | 			<Tool
214 | 				Name="VCALinkTool"
215 | 			/>
216 | 			<Tool
217 | 				Name="VCManifestTool"
218 | 			/>
219 | 			<Tool
220 | 				Name="VCXDCMakeTool"
221 | 			/>
222 | 			<Tool
223 | 				Name="VCBscMakeTool"
224 | 			/>
225 | 			<Tool
226 | 				Name="VCFxCopTool"
227 | 			/>
228 | 			<Tool
229 | 				Name="VCAppVerifierTool"
230 | 			/>
231 | 			<Tool
232 | 				Name="VCPostBuildEventTool"
233 | 			/>
234 | 		</Configuration>
235 | 	</Configurations>
236 | 	<References>
237 | 	</References>
238 | 	<Files>
239 | 		<Filter
240 | 			Name="vectorial"
241 | 			>
242 | 			<File
243 | 				RelativePath=".\include\vectorial\config.h"
244 | 				>
245 | 			</File>
246 | 			<File
247 | 				RelativePath=".\include\vectorial\simd4f.h"
248 | 				>
249 | 			</File>
250 | 			<File
251 | 				RelativePath=".\include\vectorial\simd4f_common.h"
252 | 				>
253 | 			</File>
254 | 			<File
255 | 				RelativePath=".\include\vectorial\simd4f_gnu.h"
256 | 				>
257 | 			</File>
258 | 			<File
259 | 				RelativePath=".\include\vectorial\simd4f_neon.h"
260 | 				>
261 | 			</File>
262 | 			<File
263 | 				RelativePath=".\include\vectorial\simd4f_scalar.h"
264 | 				>
265 | 			</File>
266 | 			<File
267 | 				RelativePath=".\include\vectorial\simd4f_sse.h"
268 | 				>
269 | 			</File>
270 | 			<File
271 | 				RelativePath=".\include\vectorial\simd4x4f.h"
272 | 				>
273 | 			</File>
274 | 			<File
275 | 				RelativePath=".\include\vectorial\simd4x4f_gnu.h"
276 | 				>
277 | 			</File>
278 | 			<File
279 | 				RelativePath=".\include\vectorial\simd4x4f_neon.h"
280 | 				>
281 | 			</File>
282 | 			<File
283 | 				RelativePath=".\include\vectorial\simd4x4f_scalar.h"
284 | 				>
285 | 			</File>
286 | 			<File
287 | 				RelativePath=".\include\vectorial\simd4x4f_sse.h"
288 | 				>
289 | 			</File>
290 | 			<File
291 | 				RelativePath=".\include\vectorial\vec2f.h"
292 | 				>
293 | 			</File>
294 | 			<File
295 | 				RelativePath=".\include\vectorial\vec3f.h"
296 | 				>
297 | 			</File>
298 | 			<File
299 | 				RelativePath=".\include\vectorial\vec4f.h"
300 | 				>
301 | 			</File>
302 | 		</Filter>
303 | 		<Filter
304 | 			Name="spec"
305 | 			>
306 | 			<File
307 | 				RelativePath=".\spec\spec.cpp"
308 | 				>
309 | 			</File>
310 | 			<File
311 | 				RelativePath=".\spec\spec.h"
312 | 				>
313 | 			</File>
314 | 			<File
315 | 				RelativePath=".\spec\spec_helper.h"
316 | 				>
317 | 			</File>
318 | 			<File
319 | 				RelativePath=".\spec\spec_main.cpp"
320 | 				>
321 | 			</File>
322 | 			<File
323 | 				RelativePath=".\spec\spec_mat4f.cpp"
324 | 				>
325 | 			</File>
326 | 			<File
327 | 				RelativePath=".\spec\spec_simd4f.cpp"
328 | 				>
329 | 			</File>
330 | 			<File
331 | 				RelativePath=".\spec\spec_simd4x4f.cpp"
332 | 				>
333 | 			</File>
334 | 			<File
335 | 				RelativePath=".\spec\spec_vec2f.cpp"
336 | 				>
337 | 			</File>
338 | 			<File
339 | 				RelativePath=".\spec\spec_vec3f.cpp"
340 | 				>
341 | 			</File>
342 | 			<File
343 | 				RelativePath=".\spec\spec_vec4f.cpp"
344 | 				>
345 | 			</File>
346 | 		</Filter>
347 | 	</Files>
348 | 	<Globals>
349 | 	</Globals>
350 | </VisualStudioProject>
351 | 


--------------------------------------------------------------------------------
/spec/spec_vec2f.cpp:
--------------------------------------------------------------------------------
  1 | #include "spec_helper.h"
  2 | #include <iostream>
  3 | using vectorial::vec2f;
  4 | 
  5 | const int epsilon = 1;
  6 | 
  7 | describe(vec2f, "constructing") {
  8 |     it("should have default constructor that does nothing..") {
  9 |         vec2f x;
 10 |     }
 11 | 
 12 |     it("should have constructor with element values") {
 13 |         vec2f x(10,20);
 14 |         // octave vec2f: [10,20]
 15 |         should_be_equal_vec2f(x, simd4f_create(10.000000000000000f, 20.000000000000000f, 0.0f, 0.0f), epsilon );
 16 |         
 17 |     }
 18 | 
 19 |     it("should have constructor that loads from a float array") {
 20 |         float ary[2] = { 1,2 };
 21 |         vec2f x(ary);
 22 |         // octave vec2f: [1,2]
 23 |         should_be_equal_vec2f(x, simd4f_create(1.000000000000000f, 2.000000000000000f, 0.0f, 0.0f), epsilon );
 24 |     }
 25 | 
 26 | }
 27 | 
 28 | describe(vec2f, "loads and stores") {
 29 | 
 30 |     it("should have method for loading from a float array") {
 31 |         float ary[2] = { 1, 2 };
 32 |         vec2f x(-1, -1 );
 33 |         x.load(ary);
 34 |         // octave vec2f: [1,2]
 35 |         should_be_equal_vec2f(x, simd4f_create(1.000000000000000f, 2.000000000000000f, 0.0f, 0.0f), epsilon );
 36 |     }
 37 | 
 38 |     it("should have method for storing to a float array") {
 39 |         float ary[2] = { -1, -1 };
 40 |         vec2f x(1, 2);
 41 |         x.store(ary);
 42 |         should_be_close_to(ary[0], 1, epsilon);
 43 |         should_be_close_to(ary[1], 2, epsilon);
 44 |     }
 45 | 
 46 | }
 47 | 
 48 | 
 49 | describe(vec2f, "arithmetic with another vec2f") {
 50 |     
 51 |     it("should have operator+ for component-wise addition") {
 52 |         vec2f a(1,2);
 53 |         vec2f b(10,20);
 54 |         vec2f x = a + b;
 55 |         // octave vec2f: [1,2] + [10,20]
 56 |         should_be_equal_vec2f(x, simd4f_create(11.000000000000000f, 22.000000000000000f, 0.0f, 0.0f), epsilon );
 57 | 
 58 |     }
 59 | 
 60 |     it("should have operator- for component-wise subtraction") {
 61 |         vec2f a(1,2);
 62 |         vec2f b(10,20);
 63 |         vec2f x = b - a;
 64 |         // octave vec2f:  [10,20] - [1,2]
 65 |         should_be_equal_vec2f(x, simd4f_create(9.000000000000000f, 18.000000000000000f, 0.0f, 0.0f), epsilon );
 66 | 
 67 |     }
 68 | 
 69 |     it("should have operator* for component-wise multiplication") {
 70 |         vec2f a(1,2);
 71 |         vec2f b(10,20);
 72 |         vec2f x = a * b;
 73 |         // octave vec2f: [1,2] .* [10,20]
 74 |         should_be_equal_vec2f(x, simd4f_create(10.000000000000000f, 40.000000000000000f, 0.0f, 0.0f), epsilon );
 75 | 
 76 |     }
 77 | 
 78 |     it("should have operator/ for component-wise division") {
 79 |         vec2f a(1,2);
 80 |         vec2f b(10,20);
 81 |         vec2f x = b / a;
 82 |         // octave vec2f:  [10,20] ./ [1,2]
 83 |         should_be_equal_vec2f(x, simd4f_create(10.000000000000000f, 10.000000000000000f, 0.0f, 0.0f), epsilon );
 84 | 
 85 |     }
 86 |    
 87 | 
 88 | 
 89 |     it("should have operator+= for component-wise addition") {
 90 |         vec2f x(1,2);
 91 |         vec2f b(10,20);
 92 |         x += b;
 93 |         // octave vec2f: [1,2] + [10,20]
 94 |         should_be_equal_vec2f(x, simd4f_create(11.000000000000000f, 22.000000000000000f, 0.0f, 0.0f), epsilon );
 95 | 
 96 |     }
 97 | 
 98 |     it("should have operator-= for component-wise subtraction") {
 99 |         vec2f a(1,2);
100 |         vec2f x(10,20);
101 |         x -= a;
102 |         // octave vec2f:  [10,20] - [1,2]
103 |         should_be_equal_vec2f(x, simd4f_create(9.000000000000000f, 18.000000000000000f, 0.0f, 0.0f), epsilon );
104 | 
105 |     }
106 | 
107 |     it("should have operator*= for component-wise multiplication") {
108 |         vec2f x(1,2);
109 |         vec2f b(10,20);
110 |         x *= b;
111 |         // octave vec2f: [1,2] .* [10,20]
112 |         should_be_equal_vec2f(x, simd4f_create(10.000000000000000f, 40.000000000000000f, 0.0f, 0.0f), epsilon );
113 | 
114 |     }
115 | 
116 |     it("should have operator/= for component-wise division") {
117 |         vec2f a(1,2);
118 |         vec2f x(10,20);
119 |         x /= a;
120 |         // octave vec2f:  [10,20] ./ [1,2]
121 |         should_be_equal_vec2f(x, simd4f_create(10.000000000000000f, 10.000000000000000f, 0.0f, 0.0f), epsilon );
122 | 
123 |     }
124 | 
125 | 
126 | }
127 | 
128 | 
129 | describe(vec2f, "arithmetic with scalar") {
130 |     
131 |     it("should have operator+ for component-wise addition") {
132 |         vec2f a(1,2);
133 |         float b=10;
134 |         vec2f x = a + b;
135 |         // octave vec2f: [1,2] + 10
136 |         should_be_equal_vec2f(x, simd4f_create(11.000000000000000f, 12.000000000000000f, 0.0f, 0.0f), epsilon );
137 | 
138 |     }
139 | 
140 |     it("should have operator- for component-wise subtraction") {
141 |         float a=10;
142 |         vec2f b(10,20);
143 |         vec2f x = b - a;
144 |         // octave vec2f:  [10,20] - 10
145 |         should_be_equal_vec2f(x, simd4f_create(0.000000000000000f, 10.000000000000000f, 0.0f, 0.0f), epsilon );
146 | 
147 |     }
148 | 
149 |     it("should have operator* for component-wise multiplication") {
150 |         vec2f a(1,2);
151 |         float b=10;
152 |         vec2f x = a * b;
153 |         // octave vec2f: [1,2] .* 10
154 |         should_be_equal_vec2f(x, simd4f_create(10.000000000000000f, 20.000000000000000f, 0.0f, 0.0f), epsilon );
155 | 
156 |     }
157 | 
158 |     it("should have operator/ for component-wise division") {
159 |         vec2f a(10,20);
160 |         float b=10;
161 |         vec2f x = a / b;
162 |         // octave vec2f: [10,20] ./ 10
163 |         should_be_equal_vec2f(x, simd4f_create(1.000000000000000f, 2.000000000000000f, 0.0f, 0.0f), epsilon );
164 | 
165 |     }
166 | 
167 | 
168 | 
169 |     it("should have operator+ for component-wise addition (float as lhs)") {
170 |         vec2f b(1,2);
171 |         float a=10;
172 |         vec2f x = a + b;
173 |         // octave vec2f: 10 + [1,2]
174 |         should_be_equal_vec2f(x, simd4f_create(11.000000000000000f, 12.000000000000000f, 0.0f, 0.0f), epsilon );
175 | 
176 |     }
177 | 
178 |     it("should have operator- for component-wise subtraction (float as lhs)") {
179 |         float b=50;
180 |         vec2f a(10,20);
181 |         vec2f x = b - a;
182 |         // octave vec2f:  50 - [10,20]
183 |         should_be_equal_vec2f(x, simd4f_create(40.000000000000000f, 30.000000000000000f, 0.0f, 0.0f), epsilon );
184 | 
185 |     }
186 | 
187 |     it("should have operator* for component-wise multiplication (float as lhs)") {
188 |         vec2f b(1,2);
189 |         float a=10;
190 |         vec2f x = a * b;
191 |         // octave vec2f: 10 .* [1,2] 
192 |         should_be_equal_vec2f(x, simd4f_create(10.000000000000000f, 20.000000000000000f, 0.0f, 0.0f), epsilon );
193 | 
194 |     }
195 | 
196 |     it("should have operator* for component-wise multiplication (float as lhs)") {
197 |         vec2f b(10,20);
198 |         float a=40;
199 |         vec2f x = a / b;
200 |         // octave vec2f: 40 ./ [10,20] 
201 |         should_be_equal_vec2f(x, simd4f_create(4.000000000000000f, 2.000000000000000f, 0.0f, 0.0f), epsilon );
202 | 
203 |     }
204 | 
205 |     
206 | }
207 | 
208 | 
209 | 
210 | describe(vec2f, "vector math") {
211 | 
212 |     it("should have unary minus operator") {
213 |         vec2f a(1,2);
214 |         vec2f x = -a;
215 |         // octave vec2f: -[1,2]
216 |         should_be_equal_vec2f(x, simd4f_create(-1.000000000000000f, -2.000000000000000f, 0.0f, 0.0f), epsilon );
217 |     }
218 | 
219 | 
220 |     it("should have dot function") {
221 |         vec2f a(1,2);
222 |         vec2f b(6,7);
223 |         float x = vectorial::dot(a,b);
224 |         
225 |         // octave vec2f: dot([1,2],[6,7])
226 |         should_be_close_to(x, 20.000000000000000f, epsilon );
227 |     }
228 | 
229 |     it("should have length_squared function") {
230 |         vec2f a(1,2);
231 |         float x = vectorial::length_squared(a);
232 |         
233 |         // octave vec2f: dot([1,2],[1,2])
234 |         should_be_close_to(x, 5.000000000000000f, epsilon );
235 |     }
236 | 
237 |     it("should have length function") {
238 |         vec2f a(1,2);
239 |         float x = vectorial::length(a);
240 |         
241 |         // octave vec2f: norm([1,2])
242 |         should_be_close_to(x, 2.236067977499790f, epsilon );
243 |     }
244 |     
245 |     
246 |     it("should have normalize function") {
247 |         vec2f a(1,2);
248 |         vec2f x = vectorial::normalize(a);
249 |         // octave vec2f: [1,2] / norm([1,2])
250 |         should_be_equal_vec2f(x, simd4f_create(0.447213595499958f, 0.894427190999916f, 0.0f, 0.0f), epsilon );
251 |     }
252 | 
253 | }
254 | 
255 | 
256 | 


--------------------------------------------------------------------------------
/spec/spec_simd2f.cpp:
--------------------------------------------------------------------------------
  1 | 
  2 | #include "spec_helper.h"
  3 | 
  4 | const int epsilon = 1;
  5 | 
  6 | #ifdef VECTORIAL_HAVE_SIMD2F
  7 | 
  8 | describe(simd2f, "sanity") {
  9 |     it("VECTORIAL_SIMD_TYPE should be defined to a string") {
 10 |         std::cout << "Simd type: " << VECTORIAL_SIMD_TYPE << std::endl;
 11 |     }
 12 | }
 13 | 
 14 | describe(simd2f, "creating") {
 15 |     
 16 |     it("should be possible to create with simd2f_create") {
 17 |         
 18 |         simd2f x = simd2f_create(1, 2);
 19 | 
 20 |         should_be_close_to( simd2f_get_x(x), 1, epsilon);
 21 |         should_be_close_to( simd2f_get_y(x), 2, epsilon);
 22 | 
 23 |         // octave simd2f: [1,2]
 24 |         should_be_equal_simd2f(x, simd2f_create(1.000000000000000f, 2.000000000000000f), epsilon );
 25 |         
 26 |     }
 27 | 
 28 |     it("should have simd2f_zero for zero vector") {
 29 | 
 30 |         simd2f x = simd2f_zero();
 31 | 
 32 |         // octave simd2f: [0,0]
 33 |         should_be_equal_simd2f(x, simd2f_create(0.000000000000000f, 0.000000000000000f), epsilon );
 34 |     }
 35 |     
 36 |     
 37 | }
 38 | #ifdef _MSC_VER
 39 | #include <malloc.h>
 40 | #else
 41 | #include <alloca.h>
 42 | #endif
 43 | 
 44 | #define unaligned_mem(n) ((float*)((unsigned char*)alloca(sizeof(float)*n+4)+4))
 45 | 
 46 | describe(simd2f, "utilities") {
 47 | 
 48 |     it("should have simd2f_uload2 for loading two float values from float an unaligned array into simd2f") {
 49 |         float *f = unaligned_mem(2);
 50 |         f[0] = 1;
 51 |         f[1] = 2;
 52 |         simd2f x = simd2f_uload2(f);
 53 |         // octave simd2f: [1,2]
 54 |         should_be_equal_simd2f(x, simd2f_create(1.000000000000000f, 2.000000000000000f), epsilon );
 55 |     }
 56 | 
 57 |     it("should have simd2f_ustore2 for storing two float values from simd2f to an unaligned array") {
 58 |         float *f = unaligned_mem(2);
 59 |         f[0] = -1;
 60 |         f[1] = -1;
 61 |         simd2f a = simd2f_create(1,2);
 62 |         simd2f_ustore2(a, f);
 63 |         should_be_close_to(f[0], 1, epsilon);
 64 |         should_be_close_to(f[1], 2, epsilon);
 65 |     }
 66 | 
 67 | 
 68 |     it("should have simd2f_splat that expands a single scalar to all elements") {
 69 |         simd2f x = simd2f_splat(42);
 70 |         // octave simd2f: [42,42]
 71 |         should_be_equal_simd2f(x, simd2f_create(42.000000000000000f, 42.000000000000000f), epsilon );
 72 |     }
 73 | 
 74 |     it("should have simd2f_splat_x,y splatting of an element") {
 75 |         simd2f a = simd2f_create(1,2);
 76 | 
 77 |         simd2f x;
 78 |         
 79 |         x = simd2f_splat_x(a);
 80 |         // octave simd2f: [1,1]
 81 |         should_be_equal_simd2f(x, simd2f_create(1.000000000000000f, 1.000000000000000f), epsilon );
 82 | 
 83 |         x = simd2f_splat_y(a);
 84 |         // octave simd2f: [2,2]
 85 |         should_be_equal_simd2f(x, simd2f_create(2.000000000000000f, 2.000000000000000f), epsilon );
 86 | 
 87 |     }
 88 | 
 89 | #if 0
 90 |     it("should have simd2f_sum that adds elements") {
 91 |         simd2f a = simd2f_create(1,2);
 92 |         simd2f x = simd2f_sum(a);
 93 |         // octave simd2f: [sum([1,2]), sum([1,2,3,4])]
 94 |         should_be_equal_simd2f(x, simd2f_create(3.000000000000000f, 10.000000000000000f), epsilon );
 95 |         
 96 |     }
 97 | #endif
 98 | 
 99 |     it("should have simd2f_reciprocal") {
100 |         simd2f a = simd2f_create(0.00001f, 2.00001f);
101 |         simd2f x = simd2f_reciprocal(a);
102 |         // octave simd2f: 1 ./ [0.00001, 2.00001]
103 |         should_be_equal_simd2f(x, simd2f_create(99999.999999999985448f, 0.499997500012500f), epsilon );
104 |     }
105 | 
106 |     it("should have simd2f_sqrt") {
107 |         simd2f a = simd2f_create(0.00001f, 2.00001f);
108 |         simd2f x = simd2f_sqrt(a);
109 |         // octave simd2f:  sqrt([0.00001, 2.00001])
110 |         should_be_equal_simd2f(x, simd2f_create(0.003162277660168f, 1.414217097902582f), epsilon );
111 | 
112 |         x = simd2f_sqrt( simd2f_create(0.0f, 0.0f) );
113 |         // octave simd2f:  sqrt([0, 0])
114 |         should_be_equal_simd2f(x, simd2f_create(0.000000000000000f, 0.000000000000000f), epsilon );
115 |     }
116 | 
117 |     it("should have simd2f_rsqrt for reciprocal of square-root") {
118 |         simd2f a = simd2f_create(0.00001f, 2.00001f);
119 |         simd2f x = simd2f_rsqrt(a);
120 |         const int epsilon = 4; // Grant larger error
121 |         // octave simd2f:  1 ./ sqrt([0.00001, 2.00001])
122 |         should_be_equal_simd2f(x, simd2f_create(316.227766016837904f, 0.707105013426224f), epsilon );
123 |     }
124 | 
125 | }
126 | 
127 | describe(simd2f, "arithmetic with another simd2f") {
128 | 
129 |     it("should have simd2f_add for component-wise addition") {
130 |         simd2f a = simd2f_create(1,2);
131 |         simd2f b = simd2f_create(10,20);
132 |         
133 |         simd2f x = simd2f_add(a,b);
134 |         // octave simd2f: [1,2] + [10,20]
135 |         should_be_equal_simd2f(x, simd2f_create(11.000000000000000f, 22.000000000000000f), epsilon );
136 |     }
137 | 
138 |     it("should have simd2f_sub for component-wise subtraction") {
139 |         simd2f a = simd2f_create(1,2);
140 |         simd2f b = simd2f_create(10,20);
141 |         
142 |         simd2f x = simd2f_sub(b,a);
143 |         // octave simd2f: [10,20] - [1,2] 
144 |         should_be_equal_simd2f(x, simd2f_create(9.000000000000000f, 18.000000000000000f), epsilon );
145 |     }
146 | 
147 |     it("should have simd2f_mul for component-wise multiply") {
148 |         simd2f a = simd2f_create(1,2);
149 |         simd2f b = simd2f_create(10,20);
150 |         
151 |         simd2f x = simd2f_mul(a,b);
152 |         // octave simd2f: [1,2] .* [10,20]
153 |         should_be_equal_simd2f(x, simd2f_create(10.000000000000000f, 40.000000000000000f), epsilon );
154 |     }
155 | 
156 |     it("should have simd2f_div for component-wise division") {
157 |         simd2f a = simd2f_create(1,2);
158 |         simd2f b = simd2f_create(10,20);
159 |         
160 |         simd2f x = simd2f_div(b,a);
161 |         // octave simd2f: [10,20] ./ [1,2] 
162 |         should_be_equal_simd2f(x, simd2f_create(10.000000000000000f, 10.000000000000000f), epsilon );
163 |     }
164 | 
165 |     it("should have simd2f_madd for multiply-add") {
166 |         simd2f a = simd2f_create(1,2);
167 |         simd2f b = simd2f_create(100,100);
168 |         simd2f c = simd2f_create(6,7);
169 | 
170 |         simd2f x = simd2f_madd(a,b,c);
171 |         // octave simd2f: [1,2] .* [100,100] .+ [6,7]
172 |         should_be_equal_simd2f(x, simd2f_create(106.000000000000000f, 207.000000000000000f), epsilon );
173 | 
174 |     }
175 | 
176 | }
177 | 
178 | 
179 | describe(simd2f, "vector math") {
180 | 
181 |     it("should have simd2f_dot2 for two component dot product") {
182 |         simd2f a = simd2f_create(1,2);
183 |         simd2f b = simd2f_create(10,20);
184 |         
185 |         simd2f x = simd2f_dot2(a,b);
186 |         // octave simd2f: [dot([1, 2], [10, 20]),dot([1, 2], [10, 20])]
187 |         should_be_equal_simd2f(x, simd2f_create(50.000000000000000f, 50.000000000000000f), epsilon );
188 |     }
189 | 
190 |     it("should have simd2f_length2 for two component vector length") {
191 |         simd2f a = simd2f_create(1,2);
192 |         simd2f x = simd2f_length2(a);
193 |         // octave simd2f: [norm([1,2]),norm([1,2])]
194 |         should_be_equal_simd2f(x, simd2f_create(2.236067977499790f, 2.236067977499790f), epsilon );
195 | 
196 |     }
197 | 
198 | 
199 |     it("should have simd2f_length2_squared for two component squared vector length") {
200 |         simd2f a = simd2f_create(1,2);
201 |         simd2f x = simd2f_length2_squared(a);
202 |         // octave simd2f: ([dot([1,2], [1,2]), dot([1,2], [1,2])])
203 |         should_be_equal_simd2f(x, simd2f_create(5.000000000000000f, 5.000000000000000f), epsilon );
204 | 
205 |     }
206 | 
207 |     it("should have simd2f_normalize2 for normalizing two component vector to unit length") {
208 |         simd2f a = simd2f_create(1,2);
209 |         simd2f x = simd2f_normalize2(a);
210 |         // octave simd2f: [1,2] / norm([1,2])
211 |         should_be_equal_simd2f(x, simd2f_create(0.447213595499958f, 0.894427190999916f), epsilon );
212 |     }
213 | 
214 | }
215 | 
216 | 
217 | describe(simd2f, "min-max") {
218 | 
219 |     it("should have simd2f_min for choosing minimum elements") {
220 |         simd2f a = simd2f_create(1.0f,  2.0f);
221 |         simd2f b = simd2f_create(2.0f, -2.0f);
222 | 
223 |         simd2f x = simd2f_min(a,b);
224 |         should_be_equal_simd2f(x, simd2f_create(1.0f, -2.0f), epsilon);
225 | 
226 |     }
227 | 
228 |     it("should have simd2f_max for choosing maximum elements") {
229 |         simd2f a = simd2f_create(1.0f,  2.0f);
230 |         simd2f b = simd2f_create(2.0f, -2.0f);
231 | 
232 |         simd2f x = simd2f_max(a,b);
233 |         should_be_equal_simd2f(x, simd2f_create(2.0f, 2.0f), epsilon);
234 | 
235 |     }
236 | 
237 | }
238 | 
239 | 
240 | 
241 | #endif
242 | 
243 | 


--------------------------------------------------------------------------------
/include/vectorial/simd4f_neon.h:
--------------------------------------------------------------------------------
  1 | /*
  2 |   Vectorial
  3 |   Copyright (c) 2010 Mikko Lehtonen
  4 |   Copyright (c) 2014 Google, Inc.
  5 |   Licensed under the terms of the two-clause BSD License (see LICENSE)
  6 | */
  7 | #ifndef VECTORIAL_SIMD4F_NEON_H
  8 | #define VECTORIAL_SIMD4F_NEON_H
  9 | 
 10 | #include <arm_neon.h>
 11 | 
 12 | #ifdef __cplusplus
 13 | extern "C" {
 14 | #endif
 15 | 
 16 | 
 17 | typedef float32x4_t simd4f;
 18 | typedef float32x2_t simd2f;
 19 | 
 20 | typedef union {
 21 |     simd4f s ;
 22 |     float f[4];
 23 | } _simd4f_union;
 24 | 
 25 | 
 26 | 
 27 | vectorial_inline simd4f simd4f_create(float x, float y, float z, float w) {
 28 |     const float32_t d[4] = { x,y,z,w };
 29 |     simd4f s = vld1q_f32(d);
 30 |     return s;
 31 | }
 32 | 
 33 | vectorial_inline simd4f simd4f_zero() { return vdupq_n_f32(0.0f); }
 34 | 
 35 | vectorial_inline simd4f simd4f_uload4(const float *ary) {
 36 |     const float32_t* ary32 = (const float32_t*)ary;
 37 |     simd4f s = vld1q_f32(ary32);    
 38 |     return s;
 39 | }
 40 | 
 41 | vectorial_inline simd4f simd4f_uload3(const float *ary) {
 42 |     simd4f s = simd4f_create(ary[0], ary[1], ary[2], 0);
 43 |     return s;
 44 | }
 45 | 
 46 | vectorial_inline simd4f simd4f_uload2(const float *ary) {
 47 |     const float32_t* ary32 = (const float32_t*)ary;
 48 |     float32x2_t low = vld1_f32(ary32);
 49 |     const float32_t zero = 0;
 50 |     float32x2_t high = vld1_dup_f32(&zero); // { 0,0 } but stupid warnings from llvm-gcc
 51 |     return vcombine_f32(low, high);
 52 | }
 53 | 
 54 | 
 55 | vectorial_inline void simd4f_ustore4(const simd4f val, float *ary) {
 56 |     vst1q_f32( (float32_t*)ary, val);
 57 | }
 58 | 
 59 | vectorial_inline void simd4f_ustore3(const simd4f val, float *ary) {
 60 |     float* local_data = ary;
 61 |     vst1q_lane_f32(local_data++, val, 0);
 62 |     vst1q_lane_f32(local_data++, val, 1);
 63 |     vst1q_lane_f32(local_data, val, 2);
 64 | }
 65 | 
 66 | vectorial_inline void simd4f_ustore2(const simd4f val, float *ary) {
 67 |     const float32x2_t low = vget_low_f32(val);
 68 |     vst1_f32( (float32_t*)ary, low);
 69 | }
 70 | 
 71 | 
 72 | 
 73 | 
 74 | vectorial_inline simd4f simd4f_splat(float v) { 
 75 |     simd4f s = vdupq_n_f32(v);
 76 |     return s;
 77 | }
 78 | 
 79 | // todo: or is simd4f_splat(simd4f_get_x(v))  better?
 80 | 
 81 | vectorial_inline simd4f simd4f_splat_x(simd4f v) {
 82 |     float32x2_t o = vget_low_f32(v);
 83 |     simd4f ret = vdupq_lane_f32(o, 0);
 84 |     return ret;
 85 | }
 86 | 
 87 | vectorial_inline simd4f simd4f_splat_y(simd4f v) { 
 88 |     float32x2_t o = vget_low_f32(v);
 89 |     simd4f ret = vdupq_lane_f32(o, 1);
 90 |     return ret;
 91 | }
 92 | 
 93 | vectorial_inline simd4f simd4f_splat_z(simd4f v) { 
 94 |     float32x2_t o = vget_high_f32(v);
 95 |     simd4f ret = vdupq_lane_f32(o, 0);
 96 |     return ret;
 97 | }
 98 | 
 99 | vectorial_inline simd4f simd4f_splat_w(simd4f v) { 
100 |     float32x2_t o = vget_high_f32(v);
101 |     simd4f ret = vdupq_lane_f32(o, 1);
102 |     return ret;
103 | }
104 | 
/*
 * Approximates 1/v per lane: takes the hardware reciprocal estimate and
 * refines it with two vrecpsq_f32-based correction steps.
 */
vectorial_inline simd4f simd4f_reciprocal(simd4f v) {
    const simd4f initial = vrecpeq_f32(v);
    /* Each step multiplies the current guess by the correction factor
       returned by vrecpsq_f32(guess, v). */
    const simd4f refined_once = vmulq_f32(vrecpsq_f32(initial, v), initial);
    const simd4f refined_twice = vmulq_f32(vrecpsq_f32(refined_once, v), refined_once);
    return refined_twice;
}
111 | 
/*
 * One refinement step for a reciprocal-square-root estimate of v.
 * Returns estimate * vrsqrtsq_f32(estimate * v, estimate).
 * Takes and returns plain values so that it is usable from C as well as
 * C++ (this header is wrapped in extern "C").
 */
vectorial_inline simd4f simd4f_rsqrt_refine(simd4f v, simd4f estimate) {
    const simd4f estimate_times_v = vmulq_f32(estimate, v);
    return vmulq_f32(estimate, vrsqrtsq_f32(estimate_times_v, estimate));
}

#ifdef __cplusplus
/*
 * In-place variant of simd4f_rsqrt_refine, kept for existing C++ callers.
 * It uses reference parameters, which do not exist in C, so it is only
 * declared when compiling as C++.
 */
vectorial_inline void simd4f_rsqrt_1iteration(const simd4f& v, simd4f& estimate) {
    estimate = simd4f_rsqrt_refine(v, estimate);
}
#endif

/* Approximate 1/sqrt(v) per lane: hardware estimate plus one refinement step. */
vectorial_inline simd4f simd4f_rsqrt1(simd4f v) {
    simd4f estimate = vrsqrteq_f32(v);
    estimate = simd4f_rsqrt_refine(v, estimate);
    return estimate;
}

/* Approximate 1/sqrt(v) per lane: hardware estimate plus two refinement steps. */
vectorial_inline simd4f simd4f_rsqrt2(simd4f v) {
    simd4f estimate = vrsqrteq_f32(v);
    estimate = simd4f_rsqrt_refine(v, estimate);
    estimate = simd4f_rsqrt_refine(v, estimate);
    return estimate;
}

/* Approximate 1/sqrt(v) per lane: hardware estimate plus three refinement steps. */
vectorial_inline simd4f simd4f_rsqrt3(simd4f v) {
    simd4f estimate = vrsqrteq_f32(v);
    estimate = simd4f_rsqrt_refine(v, estimate);
    estimate = simd4f_rsqrt_refine(v, estimate);
    estimate = simd4f_rsqrt_refine(v, estimate);
    return estimate;
}
137 | 
// Default reciprocal square root, per lane.
// http://en.wikipedia.org/wiki/Fast_inverse_square_root makes the argument for
// one iteration, but two give a significant accuracy improvement, so this
// forwards to the two-iteration variant.
vectorial_inline simd4f simd4f_rsqrt(simd4f v) {
    const simd4f two_step_estimate = simd4f_rsqrt2(v);
    return two_step_estimate;
}
143 | 
/*
 * Approximates sqrt(v) per lane as 1 / rsqrt(v).
 * Lanes whose bit pattern is all zero (+0.0f) are forced to a zero result
 * by masking, instead of using the reciprocal of the rsqrt estimate.
 */
vectorial_inline simd4f simd4f_sqrt(simd4f v) {
    const uint32x4_t input_bits = vreinterpretq_u32_f32(v);
    /* All-ones in every lane that has at least one bit set, zero otherwise. */
    const uint32x4_t nonzero_mask = vtstq_u32(input_bits, input_bits);
    const simd4f root = simd4f_reciprocal(simd4f_rsqrt(v));
    const uint32x4_t masked_bits = vandq_u32(nonzero_mask, vreinterpretq_u32_f32(root));
    return vreinterpretq_f32_u32(masked_bits);
}
154 | 
155 | 
156 | 
157 | // arithmetics
158 | 
/* Lane-wise lhs + rhs. */
vectorial_inline simd4f simd4f_add(simd4f lhs, simd4f rhs) {
    return vaddq_f32(lhs, rhs);
}

/* Lane-wise lhs - rhs. */
vectorial_inline simd4f simd4f_sub(simd4f lhs, simd4f rhs) {
    return vsubq_f32(lhs, rhs);
}

/* Lane-wise lhs * rhs. */
vectorial_inline simd4f simd4f_mul(simd4f lhs, simd4f rhs) {
    return vmulq_f32(lhs, rhs);
}
173 | 
/*
 * Lane-wise lhs / rhs, computed as lhs * simd4f_reciprocal(rhs).
 * The quotient therefore carries the accuracy of the refined reciprocal
 * estimate rather than that of a true division.
 */
vectorial_inline simd4f simd4f_div(simd4f lhs, simd4f rhs) {
    return vmulq_f32(lhs, simd4f_reciprocal(rhs));
}
179 | 
/* Lane-wise multiply-add: returns a + m1 * m2. */
vectorial_inline simd4f simd4f_madd(simd4f m1, simd4f m2, simd4f a) {
    /* vmlaq_f32 takes the accumulator first, then the two factors. */
    const simd4f accumulated = vmlaq_f32(a, m1, m2);
    return accumulated;
}
183 | 
184 | 
185 | 
/* Extracts lane 0 (x) as a scalar. */
vectorial_inline float simd4f_get_x(simd4f s) {
    return vgetq_lane_f32(s, 0);
}

/* Extracts lane 1 (y) as a scalar. */
vectorial_inline float simd4f_get_y(simd4f s) {
    return vgetq_lane_f32(s, 1);
}

/* Extracts lane 2 (z) as a scalar. */
vectorial_inline float simd4f_get_z(simd4f s) {
    return vgetq_lane_f32(s, 2);
}

/* Extracts lane 3 (w) as a scalar. */
vectorial_inline float simd4f_get_w(simd4f s) {
    return vgetq_lane_f32(s, 3);
}
190 | 
// Three-component dot product as a scalar:
// lhs.x*rhs.x + lhs.y*rhs.y + lhs.z*rhs.z. The w components are ignored.
vectorial_inline float simd4f_dot3_scalar(simd4f lhs, simd4f rhs) {
    const simd4f products = simd4f_mul(lhs, rhs);
    const simd2f xy = vget_low_f32(products);
    const simd2f zw = vget_high_f32(products);
    // Pairwise add gives (x+y, x+y); adding the high half gives
    // (x+y+z, x+y+w), of which only lane 0 is returned.
    const simd2f xy_sum = vpadd_f32(xy, xy);
    const simd2f totals = vadd_f32(xy_sum, zw);
    return vget_lane_f32(totals, 0);
}
198 | 
/* Three-component dot product of lhs and rhs, replicated into all four lanes. */
vectorial_inline simd4f simd4f_dot3(simd4f lhs, simd4f rhs) {
    const float dot = simd4f_dot3_scalar(lhs, rhs);
    return simd4f_splat(dot);
}
202 | 
/*
 * Three-component cross product lhs x rhs. Lane 3 (w) of the result is zero.
 *
 * The lane mask is loaded from a constant array on each call instead of
 * being held in a function-local static vector: a static initialised by
 * vld1q_* is not valid C and needs a guarded initialisation in C++.
 * Bit-casts go through vreinterpretq_* rather than compiler-specific
 * C-style vector casts.
 */
vectorial_inline simd4f simd4f_cross3(simd4f lhs, simd4f rhs) {
    /* Keeps lanes 0..2, clears lane 3. */
    static const uint32_t mask_array[4] = {
      0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0};

    /* Rotate both operands into (y, z, x, y) order. */
    const simd2f lhs_low = vget_low_f32(lhs);
    const simd2f rhs_low = vget_low_f32(rhs);
    const simd4f lhs_yzx = vcombine_f32(vext_f32(lhs_low, vget_high_f32(lhs), 1), lhs_low);
    const simd4f rhs_yzx = vcombine_f32(vext_f32(rhs_low, vget_high_f32(rhs), 1), rhs_low);

    /* This difference holds the cross product in (z, x, y) lane order. */
    const simd4f cross_zxy = simd4f_sub(simd4f_mul(rhs_yzx, lhs), simd4f_mul(lhs_yzx, rhs));

    /* Rotate once more to get (x, y, z, x), then clear the fourth lane. */
    const simd2f cross_low = vget_low_f32(cross_zxy);
    const simd4f cross_xyz = vcombine_f32(vext_f32(cross_low, vget_high_f32(cross_zxy), 1), cross_low);
    const uint32x4_t mask = vld1q_u32(mask_array);
    return vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(cross_xyz), mask));
}
219 | 
/*
 * Lane rotations. vextq_f32(s, s, n) returns the four lanes starting at
 * lane n of the concatenation (s, s), which rotates the vector in one
 * instruction and avoids moving the lanes through scalar floats.
 */

/* (x, y, z, w) -> (w, x, y, z) */
vectorial_inline simd4f simd4f_shuffle_wxyz(simd4f s) {
    return vextq_f32(s, s, 3);
}

/* (x, y, z, w) -> (z, w, x, y) */
vectorial_inline simd4f simd4f_shuffle_zwxy(simd4f s) {
    return vextq_f32(s, s, 2);
}

/* (x, y, z, w) -> (y, z, w, x) */
vectorial_inline simd4f simd4f_shuffle_yzwx(simd4f s) {
    return vextq_f32(s, s, 1);
}
234 | 
235 | 
/* Returns s with lane 3 (w) replaced by 0.0f; lanes 0..2 are unchanged. */
vectorial_inline simd4f simd4f_zero_w(simd4f s) {
    /* Single lane insert instead of rebuilding the vector from scalars. */
    return vsetq_lane_f32(0.0f, s, 3);
}

/* Returns s with lanes 2..3 (z, w) replaced by 0.0f; lanes 0..1 are unchanged. */
vectorial_inline simd4f simd4f_zero_zw(simd4f s) {
    /* Keep the low half and pair it with an all-zero high half. */
    return vcombine_f32(vget_low_f32(s), vdup_n_f32(0.0f));
}
245 | 
246 | 
/*
 * Returns (xyzw.z, xyzw.w, abcd.z, abcd.w): the high halves of the two
 * inputs placed side by side, first argument in the low half of the result.
 */
vectorial_inline simd4f simd4f_merge_high(simd4f xyzw, simd4f abcd) {
    /* Combine the two 64-bit high halves directly instead of going through scalars. */
    return vcombine_f32(vget_high_f32(xyzw), vget_high_f32(abcd));
}
252 | 
/*
 * Flips the sign bit of lanes 1 and 3 (y and w); lanes 0 and 2 are unchanged.
 * The mask array is declared as uint32_t to match the pointer type expected
 * by vld1q_u32, which differs from unsigned int on targets where uint32_t
 * is unsigned long.
 */
vectorial_inline simd4f simd4f_flip_sign_0101(simd4f s) {
    const uint32_t upnpn[4] = { 0x00000000, 0x80000000, 0x00000000, 0x80000000 };
    const uint32x4_t pnpn = vld1q_u32( upnpn );
    /* XOR with 0x80000000 toggles only the IEEE-754 sign bit. */
    return vreinterpretq_f32_u32( veorq_u32( vreinterpretq_u32_f32(s), pnpn ) );
}

/*
 * Flips the sign bit of lanes 0 and 2 (x and z); lanes 1 and 3 are unchanged.
 * See simd4f_flip_sign_0101 for the choice of uint32_t.
 */
vectorial_inline simd4f simd4f_flip_sign_1010(simd4f s) {
    const uint32_t unpnp[4] = { 0x80000000, 0x00000000, 0x80000000, 0x00000000 };
    const uint32x4_t npnp = vld1q_u32( unpnp );
    return vreinterpretq_f32_u32( veorq_u32( vreinterpretq_u32_f32(s), npnp ) );
}
264 | 
265 | 
/* Lane-wise minimum of a and b. */
vectorial_inline simd4f simd4f_min(simd4f a, simd4f b) {
    const simd4f smaller = vminq_f32(a, b);
    return smaller;
}

/* Lane-wise maximum of a and b. */
vectorial_inline simd4f simd4f_max(simd4f a, simd4f b) {
    const simd4f larger = vmaxq_f32(a, b);
    return larger;
}
273 | 
274 | 
275 | #ifdef __cplusplus
276 | }
277 | #endif
278 | 
279 | 
280 | #endif
281 | 


--------------------------------------------------------------------------------
/spec/spec_vec3f.cpp:
--------------------------------------------------------------------------------
  1 | #include "spec_helper.h"
  2 | #include <iostream>
  3 | using vectorial::vec3f;
  4 | 
  5 | const int epsilon = 1;
  6 | 
  7 | describe(vec3f, "constructing") {
  8 |     it("should have default constructor that does nothing..") {
  9 |         vec3f x;
 10 |     }
 11 | 
 12 |     it("should have constructor with element values") {
 13 |         vec3f x(10,20,30);
 14 |         // octave vec3f: [10,20,30]
 15 |         should_be_equal_vec3f(x, simd4f_create(10.000000000000000f, 20.000000000000000f, 30.000000000000000f, 0.0f), epsilon );
 16 |         
 17 |     }
 18 | 
 19 |     it("should have constructor that loads from a float array") {
 20 |         float ary[3] = { 1,2,3 };
 21 |         vec3f x(ary);
 22 |         // octave vec3f: [1,2,3]
 23 |         should_be_equal_vec3f(x, simd4f_create(1.000000000000000f, 2.000000000000000f, 3.000000000000000f, 0.0f), epsilon );
 24 |     }
 25 | 
 26 | }
 27 | 
 28 | describe(vec3f, "loads and stores") {
 29 | 
 30 |     it("should have method for loading from a float array") {
 31 |         float ary[3] = { 1,2,3 };
 32 |         vec3f x(-1, -1, -1 );
 33 |         x.load(ary);
 34 |         // octave vec3f: [1,2,3]
 35 |         should_be_equal_vec3f(x, simd4f_create(1.000000000000000f, 2.000000000000000f, 3.000000000000000f, 0.0f), epsilon );
 36 |     }
 37 | 
 38 |     it("should have method for storing to a float array") {
 39 |         float ary[3] = { -1, -1, -1 };
 40 |         vec3f x(1, 2, 3);
 41 |         x.store(ary);
 42 |         should_be_close_to(ary[0], 1, epsilon);
 43 |         should_be_close_to(ary[1], 2, epsilon);
 44 |         should_be_close_to(ary[2], 3, epsilon);
 45 |     }
 46 | 
 47 | }
 48 | 
 49 | describe(vec3f, "arithmetic with another vec3f") {
 50 |     
 51 |     it("should have operator+ for component-wise addition") {
 52 |         vec3f a(1,2,3);
 53 |         vec3f b(10,20,30);
 54 |         vec3f x = a + b;
 55 |         // octave vec3f: [1,2,3] + [10,20,30]
 56 |         should_be_equal_vec3f(x, simd4f_create(11.000000000000000f, 22.000000000000000f, 33.000000000000000f, 0.0f), epsilon );
 57 | 
 58 |     }
 59 | 
 60 |     it("should have operator- for component-wise subtraction") {
 61 |         vec3f a(1,2,3);
 62 |         vec3f b(10,20,30);
 63 |         vec3f x = b - a;
 64 |         // octave vec3f:  [10,20,30] - [1,2,3]
 65 |         should_be_equal_vec3f(x, simd4f_create(9.000000000000000f, 18.000000000000000f, 27.000000000000000f, 0.0f), epsilon );
 66 | 
 67 |     }
 68 | 
 69 |     it("should have operator* for component-wise multiplication") {
 70 |         vec3f a(1,2,3);
 71 |         vec3f b(10,20,30);
 72 |         vec3f x = a * b;
 73 |         // octave vec3f: [1,2,3] .* [10,20,30]
 74 |         should_be_equal_vec3f(x, simd4f_create(10.000000000000000f, 40.000000000000000f, 90.000000000000000f, 0.0f), epsilon );
 75 | 
 76 |     }
 77 | 
 78 |     it("should have operator/ for component-wise division") {
 79 |         vec3f a(1,2,3);
 80 |         vec3f b(10,20,30);
 81 |         vec3f x = b / a;
 82 |         // octave vec3f:  [10,20,30] ./ [1,2,3]
 83 |         should_be_equal_vec3f(x, simd4f_create(10.000000000000000f, 10.000000000000000f, 10.000000000000000f, 0.0f), epsilon );
 84 | 
 85 |     }
 86 | 
 87 | 
 88 | 
 89 |     it("should have operator+= for component-wise addition") {
 90 |         vec3f x(1,2,3);
 91 |         vec3f b(10,20,30);
 92 |         x += b;
 93 |         // octave vec3f: [1,2,3] + [10,20,30]
 94 |         should_be_equal_vec3f(x, simd4f_create(11.000000000000000f, 22.000000000000000f, 33.000000000000000f, 0.0f), epsilon );
 95 | 
 96 |     }
 97 | 
 98 |     it("should have operator-= for component-wise subtraction") {
 99 |         vec3f a(1,2,3);
100 |         vec3f x(10,20,30);
101 |         x -= a;
102 |         // octave vec3f:  [10,20,30] - [1,2,3]
103 |         should_be_equal_vec3f(x, simd4f_create(9.000000000000000f, 18.000000000000000f, 27.000000000000000f, 0.0f), epsilon );
104 | 
105 |     }
106 | 
107 |     it("should have operator*= for component-wise multiplication") {
108 |         vec3f x(1,2,3);
109 |         vec3f b(10,20,30);
110 |         x *= b;
111 |         // octave vec3f: [1,2,3] .* [10,20,30]
112 |         should_be_equal_vec3f(x, simd4f_create(10.000000000000000f, 40.000000000000000f, 90.000000000000000f, 0.0f), epsilon );
113 | 
114 |     }
115 | 
116 |     it("should have operator/= for component-wise division") {
117 |         vec3f a(1,2,3);
118 |         vec3f x(10,20,30);
119 |         x /= a;
120 |         // octave vec3f:  [10,20,30] ./ [1,2,3]
121 |         should_be_equal_vec3f(x, simd4f_create(10.000000000000000f, 10.000000000000000f, 10.000000000000000f, 0.0f), epsilon );
122 | 
123 |     }
124 |     
125 | }
126 | 
127 | 
128 | describe(vec3f, "arithmetic with scalar") {
129 |     
130 |     it("should have operator+ for component-wise addition") {
131 |         vec3f a(1,2,3);
132 |         float b=10;
133 |         vec3f x = a + b;
134 |         // octave vec3f: [1,2,3] + 10
135 |         should_be_equal_vec3f(x, simd4f_create(11.000000000000000f, 12.000000000000000f, 13.000000000000000f, 0.0f), epsilon );
136 | 
137 |     }
138 | 
139 |     it("should have operator- for component-wise subtraction") {
140 |         float a=10;
141 |         vec3f b(10,20,30);
142 |         vec3f x = b - a;
143 |         // octave vec3f:  [10,20,30] - 10
144 |         should_be_equal_vec3f(x, simd4f_create(0.000000000000000f, 10.000000000000000f, 20.000000000000000f, 0.0f), epsilon );
145 | 
146 |     }
147 | 
148 |     it("should have operator* for component-wise multiplication") {
149 |         vec3f a(1,2,3);
150 |         float b=10;
151 |         vec3f x = a * b;
152 |         // octave vec3f: [1,2,3] .* 10
153 |         should_be_equal_vec3f(x, simd4f_create(10.000000000000000f, 20.000000000000000f, 30.000000000000000f, 0.0f), epsilon );
154 | 
155 |     }
156 | 
157 |     it("should have operator/ for component-wise division") {
158 |         vec3f a(10,20,30);
159 |         float b=10;
160 |         vec3f x = a / b;
161 |         // octave vec3f: [10,20,30] ./ 10
162 |         should_be_equal_vec3f(x, simd4f_create(1.000000000000000f, 2.000000000000000f, 3.000000000000000f, 0.0f), epsilon );
163 | 
164 |     }
165 | 
166 | 
167 | 
168 |     it("should have operator+ for component-wise addition (float as lhs)") {
169 |         vec3f b(1,2,3);
170 |         float a=10;
171 |         vec3f x = a + b;
172 |         // octave vec3f: 10 + [1,2,3]
173 |         should_be_equal_vec3f(x, simd4f_create(11.000000000000000f, 12.000000000000000f, 13.000000000000000f, 0.0f), epsilon );
174 | 
175 |     }
176 | 
177 |     it("should have operator- for component-wise subtraction (float as lhs)") {
178 |         float b=50;
179 |         vec3f a(10,20,30);
180 |         vec3f x = b - a;
181 |         // octave vec3f:  50 - [10,20,30]
182 |         should_be_equal_vec3f(x, simd4f_create(40.000000000000000f, 30.000000000000000f, 20.000000000000000f, 0.0f), epsilon );
183 | 
184 |     }
185 | 
186 |     it("should have operator* for component-wise multiplication (float as lhs)") {
187 |         vec3f b(1,2,3);
188 |         float a=10;
189 |         vec3f x = a * b;
190 |         // octave vec3f: 10 .* [1,2,3] 
191 |         should_be_equal_vec3f(x, simd4f_create(10.000000000000000f, 20.000000000000000f, 30.000000000000000f, 0.0f), epsilon );
192 | 
193 |     }
194 | 
195 |     it("should have operator/ for component-wise division (float as lhs)") {
196 |         // float / vec3f: the scalar is divided by each component of b.
197 |         vec3f b(10,20,30);
198 |         float a=40;
199 |         vec3f x = a / b;
200 |         // octave vec3f: 40 ./ [10,20,30] 
201 |         should_be_equal_vec3f(x, simd4f_create(4.000000000000000f, 2.000000000000000f, 1.333333333333333f, 0.0f), epsilon );
202 |     }
203 | 
204 |     
205 | }
206 | 
207 | 
208 | 
209 | describe(vec3f, "vector math") {
210 | 
211 |     it("should have unary minus operator") {
212 |         vec3f a(1,2,3);
213 |         vec3f x = -a;
214 |         // octave vec3f: -[1,2,3]
215 |         should_be_equal_vec3f(x, simd4f_create(-1.000000000000000f, -2.000000000000000f, -3.000000000000000f, 0.0f), epsilon );
216 |     }
217 | 
218 | 
219 |     it("should have dot function") {
220 |         vec3f a(1,2,3);
221 |         vec3f b(6,7,8);
222 |         float x = vectorial::dot(a,b);
223 |         
224 |         // octave vec3f: dot([1,2,3],[6,7,8])
225 |         should_be_close_to(x, 44.000000000000000f, epsilon );
226 |     }
227 | 
228 |     it("should have cross function") {
229 |         vec3f a(1,2,3);
230 |         vec3f b(6,7,8);
231 |         vec3f x = vectorial::cross(a,b);
232 |         
233 |         // octave vec3f: cross([1,2,3],[6,7,8])
234 |         should_be_equal_vec3f(x, simd4f_create(-5.000000000000000f, 10.000000000000000f, -5.000000000000000f, 0.0f), epsilon );
235 |     }
236 | 
237 |     it("should have length_squared function") {
238 |         vec3f a(1,2,3);
239 |         float x = vectorial::length_squared(a);
240 |         
241 |         // octave vec3f: dot([1,2,3],[1,2,3])
242 |         should_be_close_to(x, 14.000000000000000f, epsilon );
243 |     }
244 | 
245 |     it("should have length function") {
246 |         vec3f a(1,2,3);
247 |         float x = vectorial::length(a);
248 |         
249 |         // octave vec3f: norm([1,2,3])
250 |         should_be_close_to(x, 3.741657386773941f, epsilon );
251 |     }
252 |     
253 |     
254 |     it("should have normalize function") {
255 |         vec3f a(1,2,3);
256 |         vec3f x = vectorial::normalize(a);
257 |         // octave vec3f: [1,2,3] / norm([1,2,3])
258 |         should_be_equal_vec3f(x, simd4f_create(0.267261241912424f, 0.534522483824849f, 0.801783725737273f, 0.0f), epsilon );
259 |     }
260 | 
261 | }
262 | 
263 | 
264 | 


--------------------------------------------------------------------------------
/spec/spec_vec4f.cpp:
--------------------------------------------------------------------------------
  1 | #include "spec_helper.h"
  2 | #include <iostream>
  3 | using vectorial::vec4f;
  4 | 
  5 | const int epsilon = 1;
  6 | 
  7 | describe(vec4f, "constructing") {
  8 |     it("should have default constructor that does nothing..") {
  9 |         vec4f x;
 10 |     }
 11 | 
 12 |     it("should have constructor with element values") {
 13 |         vec4f x(10,20,30,40);
 14 |         // octave vec4f: [10,20,30,40]
 15 |         should_be_equal_vec4f(x, simd4f_create(10.000000000000000f, 20.000000000000000f, 30.000000000000000f, 40.000000000000000f), epsilon );
 16 |         
 17 |     }
 18 | 
 19 |     it("should have constructor that loads from a float array") {
 20 |         float ary[4] = { 1,2,3,4 };
 21 |         vec4f x(ary);
 22 |         // octave vec4f: [1,2,3,4]
 23 |         should_be_equal_vec4f(x, simd4f_create(1.000000000000000f, 2.000000000000000f, 3.000000000000000f, 4.000000000000000f), epsilon );
 24 |     }
 25 | 
 26 | }
 27 | 
 28 | describe(vec4f, "loads and stores") {
 29 | 
 30 | 
 31 |     it("should have method for loading from a float array") {
 32 |         float ary[4] = { 1,2,3,4 };
 33 |         vec4f x(-1, -1, -1, -1);
 34 |         x.load(ary);
 35 |         // octave vec4f: [1,2,3,4]
 36 |         should_be_equal_vec4f(x, simd4f_create(1.000000000000000f, 2.000000000000000f, 3.000000000000000f, 4.000000000000000f), epsilon );
 37 |     }
 38 | 
 39 |     it("should have method for storing to a float array") {
 40 |         float ary[4] = { -1, -1, -1, -1 };
 41 |         vec4f x(1, 2, 3, 4);
 42 |         x.store(ary);
 43 |         should_be_close_to(ary[0], 1, epsilon);
 44 |         should_be_close_to(ary[1], 2, epsilon);
 45 |         should_be_close_to(ary[2], 3, epsilon);
 46 |         should_be_close_to(ary[3], 4, epsilon);
 47 |     }
 48 | 
 49 | }
 50 | 
 51 | describe(vec4f, "arithmetic with another vec4f") {
 52 |     
 53 |     it("should have operator+ for component-wise addition") {
 54 |         vec4f a(1,2,3,4);
 55 |         vec4f b(10,20,30,40);
 56 |         vec4f x = a + b;
 57 |         // octave vec4f: [1,2,3,4] + [10,20,30,40]
 58 |         should_be_equal_vec4f(x, simd4f_create(11.000000000000000f, 22.000000000000000f, 33.000000000000000f, 44.000000000000000f), epsilon );
 59 | 
 60 |     }
 61 | 
 62 |     it("should have operator- for component-wise subtraction") {
 63 |         vec4f a(1,2,3,4);
 64 |         vec4f b(10,20,30,40);
 65 |         vec4f x = b - a;
 66 |         // octave vec4f:  [10,20,30,40] - [1,2,3,4]
 67 |         should_be_equal_vec4f(x, simd4f_create(9.000000000000000f, 18.000000000000000f, 27.000000000000000f, 36.000000000000000f), epsilon );
 68 | 
 69 |     }
 70 | 
 71 |     it("should have operator* for component-wise multiplication") {
 72 |         vec4f a(1,2,3,4);
 73 |         vec4f b(10,20,30,40);
 74 |         vec4f x = a * b;
 75 |         // octave vec4f: [1,2,3,4] .* [10,20,30,40]
 76 |         should_be_equal_vec4f(x, simd4f_create(10.000000000000000f, 40.000000000000000f, 90.000000000000000f, 160.000000000000000f), epsilon );
 77 | 
 78 |     }
 79 | 
 80 |     it("should have operator/ for component-wise division") {
 81 |         vec4f a(1,2,3,4);
 82 |         vec4f b(10,20,30,40);
 83 |         vec4f x = b / a;
 84 |         // octave vec4f:  [10,20,30,40] ./ [1,2,3,4]
 85 |         should_be_equal_vec4f(x, simd4f_create(10.000000000000000f, 10.000000000000000f, 10.000000000000000f, 10.000000000000000f), epsilon );
 86 | 
 87 |     }
 88 | 
 89 | 
 90 | 
 91 | 	
 92 |     it("should have operator+= for component-wise addition") {
 93 |         vec4f x(1,2,3,4);
 94 |         vec4f b(10,20,30,40);
 95 |         x += b;
 96 |         // octave vec4f: [1,2,3,4] + [10,20,30,40]
 97 |         should_be_equal_vec4f(x, simd4f_create(11.000000000000000f, 22.000000000000000f, 33.000000000000000f, 44.000000000000000f), epsilon );
 98 | 
 99 |     }
100 | 
101 |     it("should have operator-= for component-wise subtraction") {
102 |         vec4f a(1,2,3,4);
103 |         vec4f x(10,20,30,40);
104 |         x -= a;
105 |         // octave vec4f:  [10,20,30,40] - [1,2,3,4]
106 |         should_be_equal_vec4f(x, simd4f_create(9.000000000000000f, 18.000000000000000f, 27.000000000000000f, 36.000000000000000f), epsilon );
107 | 
108 |     }
109 | 
110 |     it("should have operator*= for component-wise multiplication") {
111 |         vec4f x(1,2,3,4);
112 |         vec4f b(10,20,30,40);
113 |         x *= b;
114 |         // octave vec4f: [1,2,3,4] .* [10,20,30,40]
115 |         should_be_equal_vec4f(x, simd4f_create(10.000000000000000f, 40.000000000000000f, 90.000000000000000f, 160.000000000000000f), epsilon );
116 | 
117 |     }
118 | 
119 |     it("should have operator/= for component-wise division") {
120 |         vec4f a(1,2,3,4);
121 |         vec4f x(10,20,30,40);
122 |         x /= a;
123 |         // octave vec4f:  [10,20,30,40] ./ [1,2,3,4]
124 |         should_be_equal_vec4f(x, simd4f_create(10.000000000000000f, 10.000000000000000f, 10.000000000000000f, 10.000000000000000f), epsilon );
125 | 
126 |     }
127 | 
128 | 
129 |     
130 | }
131 | 
132 | 
133 | describe(vec4f, "arithmetic with scalar") {
134 |     
135 |     it("should have operator+ for component-wise addition") {
136 |         vec4f a(1,2,3,4);
137 |         float b=10;
138 |         vec4f x = a + b;
139 |         // octave vec4f: [1,2,3,4] + 10
140 |         should_be_equal_vec4f(x, simd4f_create(11.000000000000000f, 12.000000000000000f, 13.000000000000000f, 14.000000000000000f), epsilon );
141 | 
142 |     }
143 | 
144 |     it("should have operator- for component-wise subtraction") {
145 |         float a=10;
146 |         vec4f b(10,20,30,40);
147 |         vec4f x = b - a;
148 |         // octave vec4f:  [10,20,30,40] - 10
149 |         should_be_equal_vec4f(x, simd4f_create(0.000000000000000f, 10.000000000000000f, 20.000000000000000f, 30.000000000000000f), epsilon );
150 | 
151 |     }
152 | 
153 |     it("should have operator* for component-wise multiplication") {
154 |         vec4f a(1,2,3,4);
155 |         float b=10;
156 |         vec4f x = a * b;
157 |         // octave vec4f: [1,2,3,4] .* 10
158 |         should_be_equal_vec4f(x, simd4f_create(10.000000000000000f, 20.000000000000000f, 30.000000000000000f, 40.000000000000000f), epsilon );
159 | 
160 |     }
161 | 
162 |     it("should have operator/ for component-wise division") {
163 |         vec4f a(10,20,30,40);
164 |         float b=10;
165 |         vec4f x = a / b;
166 |         // octave vec4f: [10,20,30,40] ./ 10
167 |         should_be_equal_vec4f(x, simd4f_create(1.000000000000000f, 2.000000000000000f, 3.000000000000000f, 4.000000000000000f), epsilon );
168 | 
169 |     }
170 | 
171 | 
172 | 
173 |     it("should have operator+ for component-wise addition (float as lhs)") {
174 |         vec4f b(1,2,3,4);
175 |         float a=10;
176 |         vec4f x = a + b;
177 |         // octave vec4f: 10 + [1,2,3,4]
178 |         should_be_equal_vec4f(x, simd4f_create(11.000000000000000f, 12.000000000000000f, 13.000000000000000f, 14.000000000000000f), epsilon );
179 | 
180 |     }
181 | 
182 |     it("should have operator- for component-wise subtraction (float as lhs)") {
183 |         float b=50;
184 |         vec4f a(10,20,30,40);
185 |         vec4f x = b - a;
186 |         // octave vec4f:  50 - [10,20,30,40]
187 |         should_be_equal_vec4f(x, simd4f_create(40.000000000000000f, 30.000000000000000f, 20.000000000000000f, 10.000000000000000f), epsilon );
188 | 
189 |     }
190 | 
191 |     it("should have operator* for component-wise multiplication (float as lhs)") {
192 |         vec4f b(1,2,3,4);
193 |         float a=10;
194 |         vec4f x = a * b;
195 |         // octave vec4f: 10 .* [1,2,3,4] 
196 |         should_be_equal_vec4f(x, simd4f_create(10.000000000000000f, 20.000000000000000f, 30.000000000000000f, 40.000000000000000f), epsilon );
197 | 
198 |     }
199 | 
200 |     it("should have operator/ for component-wise division (float as lhs)") {
201 |         // float / vec4f: the scalar is divided by each component of b.
202 |         vec4f b(10,20,30,40);
203 |         float a=40;
204 |         vec4f x = a / b;
205 |         // octave vec4f: 40 ./ [10,20,30,40] 
206 |         should_be_equal_vec4f(x, simd4f_create(4.000000000000000f, 2.000000000000000f, 1.333333333333333f, 1.000000000000000f), epsilon );
207 |     }
208 | 
209 |     
210 | }
211 | 
212 | 
213 | 
214 | describe(vec4f, "vector math") {
215 | 
216 |     it("should have unary minus operator") {
217 |         vec4f a(1,2,3,4);
218 |         vec4f x = -a;
219 |         // octave vec4f: -[1,2,3,4]
220 |         should_be_equal_vec4f(x, simd4f_create(-1.000000000000000f, -2.000000000000000f, -3.000000000000000f, -4.000000000000000f), epsilon );
221 |     }
222 | 
223 |     it("should have dot function") {
224 |         vec4f a(1,2,3,4);
225 |         vec4f b(6,7,8,9);
226 |         float x = vectorial::dot(a,b);
227 |         
228 |         // octave vec4f: dot([1,2,3,4],[6,7,8,9])
229 |         should_be_close_to(x, 80.000000000000000f, epsilon );
230 |     }
231 | 
232 |     it("should have length_squared function") {
233 |         vec4f a(1,2,3,4);
234 |         float x = vectorial::length_squared(a);
235 |         
236 |         // octave vec4f: dot([1,2,3,4],[1,2,3,4])
237 |         should_be_close_to(x, 30.000000000000000f, epsilon );
238 |     }
239 | 
240 |     it("should have length function") {
241 |         vec4f a(1,2,3,4);
242 |         float x = vectorial::length(a);
243 |         
244 |         // octave vec4f: norm([1,2,3,4])
245 |         should_be_close_to(x, 5.477225575051661f, epsilon );
246 |     }
247 |     
248 |     
249 |     it("should have normalize function") {
250 |         vec4f a(1,2,3,4);
251 |         vec4f x = vectorial::normalize(a);
252 |         // octave vec4f: [1,2,3,4] / norm([1,2,3,4])
253 |         should_be_equal_vec4f(x, simd4f_create(0.182574185835055f, 0.365148371670111f, 0.547722557505166f, 0.730296743340221f), epsilon );
254 |     }
255 | 
256 | }
257 | 
258 | 
259 | 


--------------------------------------------------------------------------------
/spec/spec_helper.h:
--------------------------------------------------------------------------------
  1 | #ifndef VECTORIAL_SPEC_HELPER_H
  2 | #define VECTORIAL_SPEC_HELPER_H
  3 | 
  4 | #define VECTORIAL_OSTREAM
  5 | 
  6 | #include "spec.h"
  7 | 
  8 | #include "vectorial/vectorial.h"
  9 | 
 10 | #ifdef VECTORIAL_HAVE_SIMD2F
 11 | #include "vectorial/simd2f.h"
 12 | #endif
 13 | 
 14 | #include <cmath>
 15 | #include <cstdlib>
 16 | #include <iostream>
 17 | 
 18 | #define should_be_close_to(a,b,tolerance) should_be_close_to_(this, a,b,tolerance,__FILE__,__LINE__)
 19 | #define should_be_equal_simd4f( a, b, tolerance) should_be_equal_simd4f_(this, a,b,tolerance,__FILE__,__LINE__)
 20 | #define should_be_equal_simd2f( a, b, tolerance) should_be_equal_simd2f_(this, a,b,tolerance,__FILE__,__LINE__)
 21 | #define should_be_equal_vec4f( a, b, tolerance) should_be_equal_vec4f_(this, a,b,tolerance,__FILE__,__LINE__)
 22 | #define should_be_equal_vec3f( a, b, tolerance) should_be_equal_vec3f_(this, a,b,tolerance,__FILE__,__LINE__)
 23 | #define should_be_equal_vec2f( a, b, tolerance) should_be_equal_vec2f_(this, a,b,tolerance,__FILE__,__LINE__)
 24 | 
 25 | #define should_be_equal_simd4x4f( a, b, tolerance) should_be_equal_simd4x4f_(this, a,b,tolerance,__FILE__,__LINE__)
 26 | 
 27 | #define should_be_equal_mat4f( a, b, tolerance) should_be_equal_mat4f_(this, a,b,tolerance,__FILE__,__LINE__)
 28 | 
 29 | // Based on:
 30 | // http://www.cygnus-software.com/papers/comparingfloats/comparingfloats.htm
 31 | // 
 32 | static inline bool compare_floats(float A, float B, int maxUlps)
 33 | {
 34 |     // Returns true when A and B are at most maxUlps representable floats
 35 |     // apart. maxUlps should be non-negative and small enough (below
 36 |     // 4 * 1024 * 1024) that the default NAN won't compare as equal.
 37 |     union {
 38 |         float f;
 39 |         unsigned int i;
 40 |     } f2iA, f2iB;
 41 |     f2iA.f = A;
 42 |     f2iB.f = B;
 43 | 
 44 |     // Map each sign-magnitude bit pattern onto an unsigned key that grows
 45 |     // monotonically with the float value; -0.0f and +0.0f share one key.
 46 |     const unsigned int signBit = 0x80000000u;
 47 |     const unsigned int aKey = (f2iA.i & signBit) ? signBit - (f2iA.i & ~signBit)
 48 |                                                  : signBit + f2iA.i;
 49 |     const unsigned int bKey = (f2iB.i & signBit) ? signBit - (f2iB.i & ~signBit)
 50 |                                                  : signBit + f2iB.i;
 51 | 
 52 |     // Take larger minus smaller in unsigned arithmetic. A signed
 53 |     // abs(aInt - bInt) overflows (undefined behaviour) when the operands
 54 |     // have opposite signs and large magnitudes.
 55 |     const unsigned int ulps = (aKey > bKey) ? aKey - bKey : bKey - aKey;
 56 |     // A negative tolerance can never be met.
 57 |     return maxUlps >= 0 && ulps <= static_cast<unsigned int>(maxUlps);
 58 | }
 59 | 
 60 | 
 61 | 
 62 | 
 63 | 
 64 | 
 65 | 
 66 | 
 67 | 
 68 | static inline void should_be_close_to_(specific::SpecBase *spec, float a, float b, int tolerance, const char *file, int line) {
 69 |     // Scalar expectation: passes when a and b differ by at most
 70 |     // `tolerance` ULPs according to compare_floats().
 71 |     const bool within_tolerance = compare_floats(a, b, tolerance);
 72 | 
 73 |     // Build the human-readable description handed to the spec runner
 74 |     // together with the file/line of the expectation.
 75 |     std::stringstream description;
 76 |     description << a << " == " << b << " (with tolerance of " << tolerance << ")";
 77 |     spec->should_test(within_tolerance, description.str().c_str(), file, line);
 78 | }
 79 | 
 80 | #ifdef VECTORIAL_HAVE_SIMD2F
 81 | static inline void should_be_equal_simd2f_(specific::SpecBase *spec, const simd2f& a, const simd2f& b, int tolerance, const char *file, int line) {
 82 |     // Both lanes (x and y) must match within `tolerance` ULPs.
 83 |     const bool lanes_match =
 84 |         compare_floats( simd2f_get_x(a), simd2f_get_x(b), tolerance ) &&
 85 |         compare_floats( simd2f_get_y(a), simd2f_get_y(b), tolerance );
 86 | 
 87 |     // Report the outcome together with a printable form of both operands.
 88 |     std::stringstream description;
 89 |     description << a << " == " << b << " (with tolerance of " << tolerance << ")";
 90 |     spec->should_test(lanes_match, description.str().c_str(), file, line);
 91 | }
 92 | #endif
 93 | 
 94 | static inline void should_be_equal_simd4f_(specific::SpecBase *spec, const simd4f& a, const simd4f& b, int tolerance, const char *file, int line) {
 95 |     // All four lanes (x, y, z, w) must match within `tolerance` ULPs.
 96 |     const bool lanes_match =
 97 |         compare_floats( simd4f_get_x(a), simd4f_get_x(b), tolerance ) &&
 98 |         compare_floats( simd4f_get_y(a), simd4f_get_y(b), tolerance ) &&
 99 |         compare_floats( simd4f_get_z(a), simd4f_get_z(b), tolerance ) &&
100 |         compare_floats( simd4f_get_w(a), simd4f_get_w(b), tolerance );
101 | 
102 |     // Report the outcome together with a printable form of both
103 |     // operands and the file/line of the expectation.
104 |     std::stringstream description;
105 |     description << a << " == " << b << " (with tolerance of " << tolerance << ")";
106 |     spec->should_test(lanes_match, description.str().c_str(), file, line);
107 | }
108 | 
109 | static inline void should_be_equal_vec4f_(specific::SpecBase *spec, const vectorial::vec4f& a, const vectorial::vec4f& b, int tolerance, const char *file, int line) {
110 |     // All four components must match within `tolerance` ULPs.
111 |     const bool components_match =
112 |         compare_floats( a.x(), b.x(), tolerance ) &&
113 |         compare_floats( a.y(), b.y(), tolerance ) &&
114 |         compare_floats( a.z(), b.z(), tolerance ) &&
115 |         compare_floats( a.w(), b.w(), tolerance );
116 | 
117 |     // Report the outcome together with a printable form of both
118 |     // operands and the file/line of the expectation.
119 |     std::stringstream description;
120 |     description << a << " == " << b << " (with tolerance of " << tolerance << ")";
121 |     spec->should_test(components_match, description.str().c_str(), file, line);
122 | }
123 | 
124 | static inline void should_be_equal_vec3f_(specific::SpecBase *spec, const vectorial::vec3f& a, const vectorial::vec3f& b, int tolerance, const char *file, int line) {
125 |     // Only x, y and z are compared; each must match within
126 |     // `tolerance` ULPs.
127 |     const bool components_match =
128 |         compare_floats( a.x(), b.x(), tolerance ) &&
129 |         compare_floats( a.y(), b.y(), tolerance ) &&
130 |         compare_floats( a.z(), b.z(), tolerance );
131 | 
132 |     // Report the outcome together with a printable form of both operands.
133 |     std::stringstream description;
134 |     description << a << " == " << b << " (with tolerance of " << tolerance << ")";
135 |     spec->should_test(components_match, description.str().c_str(), file, line);
136 | }
137 | 
138 | static inline void should_be_equal_vec2f_(specific::SpecBase *spec, const vectorial::vec2f& a, const vectorial::vec2f& b, int tolerance, const char *file, int line) {
139 |     // Both components (x and y) must match within `tolerance` ULPs.
140 |     const bool components_match =
141 |         compare_floats( a.x(), b.x(), tolerance ) &&
142 |         compare_floats( a.y(), b.y(), tolerance );
143 | 
144 |     // Report the outcome together with a printable form of both
145 |     // operands and the file/line of the expectation.
146 |     std::stringstream description;
147 |     description << a << " == " << b << " (with tolerance of " << tolerance << ")";
148 |     spec->should_test(components_match, description.str().c_str(), file, line);
149 | }
150 | 
151 | 
152 | 
153 | static inline void should_be_equal_simd4x4f_(specific::SpecBase *spec, const simd4x4f& a, const simd4x4f& b, int tolerance, const char *file, int line) {
154 |     // Compares the x, y, z and w members of both operands lane by lane;
155 |     // all sixteen floats must match within `tolerance` ULPs.
156 |     const bool equal =
157 |         compare_floats( simd4f_get_x(a.x), simd4f_get_x(b.x), tolerance ) &&
158 |         compare_floats( simd4f_get_y(a.x), simd4f_get_y(b.x), tolerance ) &&
159 |         compare_floats( simd4f_get_z(a.x), simd4f_get_z(b.x), tolerance ) &&
160 |         compare_floats( simd4f_get_w(a.x), simd4f_get_w(b.x), tolerance ) &&
161 | 
162 |         compare_floats( simd4f_get_x(a.y), simd4f_get_x(b.y), tolerance ) &&
163 |         compare_floats( simd4f_get_y(a.y), simd4f_get_y(b.y), tolerance ) &&
164 |         compare_floats( simd4f_get_z(a.y), simd4f_get_z(b.y), tolerance ) &&
165 |         compare_floats( simd4f_get_w(a.y), simd4f_get_w(b.y), tolerance ) &&
166 | 
167 |         compare_floats( simd4f_get_x(a.z), simd4f_get_x(b.z), tolerance ) &&
168 |         compare_floats( simd4f_get_y(a.z), simd4f_get_y(b.z), tolerance ) &&
169 |         compare_floats( simd4f_get_z(a.z), simd4f_get_z(b.z), tolerance ) &&
170 |         compare_floats( simd4f_get_w(a.z), simd4f_get_w(b.z), tolerance ) &&
171 | 
172 |         compare_floats( simd4f_get_x(a.w), simd4f_get_x(b.w), tolerance ) &&
173 |         compare_floats( simd4f_get_y(a.w), simd4f_get_y(b.w), tolerance ) &&
174 |         compare_floats( simd4f_get_z(a.w), simd4f_get_z(b.w), tolerance ) &&
175 |         compare_floats( simd4f_get_w(a.w), simd4f_get_w(b.w), tolerance );
176 | 
177 |     // Report the outcome together with a printable form of both operands.
178 |     std::stringstream description;
179 |     description << a << " == " << b << " (with tolerance of " << tolerance << ")";
180 |     spec->should_test(equal, description.str().c_str(), file, line);
181 | }
182 | 
183 | static inline void should_be_equal_mat4f_(specific::SpecBase *spec, const vectorial::mat4f& a, const vectorial::mat4f& b, int tolerance, const char *file, int line) {
184 |     // Compares value.x, value.y, value.z and value.w of both matrices lane
185 |     // by lane; all sixteen floats must match within `tolerance` ULPs.
186 |     const bool equal =
187 |         compare_floats( simd4f_get_x(a.value.x), simd4f_get_x(b.value.x), tolerance ) &&
188 |         compare_floats( simd4f_get_y(a.value.x), simd4f_get_y(b.value.x), tolerance ) &&
189 |         compare_floats( simd4f_get_z(a.value.x), simd4f_get_z(b.value.x), tolerance ) &&
190 |         compare_floats( simd4f_get_w(a.value.x), simd4f_get_w(b.value.x), tolerance ) &&
191 | 
192 |         compare_floats( simd4f_get_x(a.value.y), simd4f_get_x(b.value.y), tolerance ) &&
193 |         compare_floats( simd4f_get_y(a.value.y), simd4f_get_y(b.value.y), tolerance ) &&
194 |         compare_floats( simd4f_get_z(a.value.y), simd4f_get_z(b.value.y), tolerance ) &&
195 |         compare_floats( simd4f_get_w(a.value.y), simd4f_get_w(b.value.y), tolerance ) &&
196 | 
197 |         compare_floats( simd4f_get_x(a.value.z), simd4f_get_x(b.value.z), tolerance ) &&
198 |         compare_floats( simd4f_get_y(a.value.z), simd4f_get_y(b.value.z), tolerance ) &&
199 |         compare_floats( simd4f_get_z(a.value.z), simd4f_get_z(b.value.z), tolerance ) &&
200 |         compare_floats( simd4f_get_w(a.value.z), simd4f_get_w(b.value.z), tolerance ) &&
201 | 
202 |         compare_floats( simd4f_get_x(a.value.w), simd4f_get_x(b.value.w), tolerance ) &&
203 |         compare_floats( simd4f_get_y(a.value.w), simd4f_get_y(b.value.w), tolerance ) &&
204 |         compare_floats( simd4f_get_z(a.value.w), simd4f_get_z(b.value.w), tolerance ) &&
205 |         compare_floats( simd4f_get_w(a.value.w), simd4f_get_w(b.value.w), tolerance );
206 | 
207 |     // Report the outcome together with a printable form of both matrices.
208 |     std::stringstream description;
209 |     description << a << " == " << b << " (with tolerance of " << tolerance << " ulps)";
210 |     spec->should_test(equal, description.str().c_str(), file, line);
211 | }
212 | 
213 | 
214 | 
215 | #endif
216 | 


--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
  1 | # Toolchain selection: CXX can be overridden from the environment; the CLANG_* names are used when CLANG=1.
  2 | CXX?=g++
  3 | CLANG_CC=clang
  4 | CLANG_CXX=clang++
  5 | # Hard-coded iPhoneOS 4.2 SDK toolchain; used for ARM builds when the host is Darwin.
  6 | IPHONE_PLATFORM_PATH = /Developer/Platforms/iPhoneOS.platform/Developer
  7 | IPHONE_ISYSROOT_PATH = $(IPHONE_PLATFORM_PATH)/SDKs/iPhoneOS4.2.sdk/
  8 | IPHONE_CC = $(IPHONE_PLATFORM_PATH)/usr/bin/g++ -isysroot $(IPHONE_ISYSROOT_PATH)   -arch armv7
  9 | # -mfloat-abi=softfp -mfpu=neon  
 10 | # Compiler flags shared by every configuration; the two commented lines are disabled alternatives.
 11 | #CXXFLAGS += -Iinclude -O0
 12 | #CXXFLAGS += -g -Iinclude -Wall -Wextra -pedantic -Wno-unused -O3 -fstrict-aliasing -Wstrict-aliasing=2 -ffast-math 
 13 | CXXFLAGS += -Iinclude -Wall -Wextra -pedantic -Wno-unused -O3 -fstrict-aliasing -Wstrict-aliasing=2 -ffast-math  -D__extern_always_inline=inline
 14 | # Sources are discovered by wildcard; the object lists are re-rooted under $(BUILDDIR) further down.
 15 | SPEC_SRC = $(wildcard spec/*.cpp)
 16 | SPEC_OBJ = $(SPEC_SRC:.cpp=.o)
 17 | 
 18 | BENCH_SRC = $(wildcard bench/*.cpp)
 19 | BENCH_OBJ = $(BENCH_SRC:.cpp=.o)
 20 | BENCH_ASM = $(patsubst %.cpp,asm$(SUFFIX)/%.S,$(BENCH_SRC))
 21 | # SUFFIX tags outputs with the forced backend; BENCH_ASM above uses '=' so it picks up the final value.
 22 | SUFFIX=
 23 | # DEFAULT_CC is cleared below when an ARM or clang toolchain is chosen.
 24 | DEFAULT_CC=1
 25 | 
# Backend selection. Each FORCE_<backend>=1 adds -DVECTORIAL_FORCED plus the
# backend macro to CXXFLAGS and sets the SUFFIX used for the build directory
# and binary names. FORCE_NEON additionally turns on the ARM settings.
ifeq ($(FORCE_SCALAR),1)
	CXXFLAGS+= -DVECTORIAL_FORCED -DVECTORIAL_SCALAR
	SUFFIX=-scalar
endif

ifeq ($(FORCE_SSE),1)
	CXXFLAGS+= -DVECTORIAL_FORCED -DVECTORIAL_SSE -msse -msse2 -mfpmath=sse
	SUFFIX=-sse
endif

ifeq ($(FORCE_GNU),1)
	CXXFLAGS+= -DVECTORIAL_FORCED -DVECTORIAL_GNU 
	#-msse -msse2 -mfpmath=sse
	SUFFIX=-gnu
endif

ifeq ($(FORCE_NEON),1)
	CXXFLAGS+= -DVECTORIAL_FORCED -DVECTORIAL_NEON
	SUFFIX=-neon
	ARM=1
endif
 47 | 
 48 | 
# ARM=1: on Darwin switch CC/CXX to the iPhone cross compiler; on every host
# add the ARM-mode / softfp / NEON code generation flags.
ifeq ($(ARM),1)
ifeq ($(shell uname -s),Darwin)
	CC=$(IPHONE_CC)
	CXX=$(IPHONE_CC)
endif
#	CXXFLAGS+= -mcpu=cortex-a8 
	CXXFLAGS+= -mno-thumb -mfloat-abi=softfp -mfpu=neon
	DEFAULT_CC=0
endif

# CLANG=1: build with clang/clang++. Evaluated after the ARM block, so it
# overrides the compiler chosen there when both are set.
ifeq ($(CLANG),1)
	CC=$(CLANG_CC)
	CXX=$(CLANG_CXX)
	DEFAULT_CC=0
endif

# Hook for flags specific to the default compiler; currently adds nothing.
ifeq ($(DEFAULT_CC),1)
#	CXXFLAGS += -msse -msse2 -mfpmath=sse
endif

# ASM=1: append -S to the compiler commands so they emit assembly.
ifeq ($(ASM),1)
	CC+= -S
	CXX+= -S
endif
 73 | 
# Objects are written under build$(SUFFIX)/, mirroring the source tree layout.
BUILDDIR=build$(SUFFIX)
SPEC_OBJ := $(addprefix $(BUILDDIR)/,$(SPEC_OBJ))
BENCH_OBJ := $(addprefix $(BUILDDIR)/,$(BENCH_OBJ))
# Recipe prefix used to hide commands; set SILENT= on the command line to see them.
SILENT=@
# Directory creation command and the separator substituted into its path argument.
MKDIR=mkdir -p
PATH_SEPARATOR=/
 80 | 
# Compile one .cpp into its object under $(BUILDDIR), creating the object's
# directory first. COMPILE.cc is GNU make's built-in C++ compile command.
$(BUILDDIR)/%.o: %.cpp
	@echo CXX $<
	$(SILENT) $(MKDIR) $(subst /,$(PATH_SEPARATOR),$(dir $@))
	$(SILENT) $(COMPILE.cc) -o $@ $<
 85 | 
 86 | 
 87 | 
# Default goal: build the spec suite for the current SUFFIX and run it.
.PHONY: all
all: specsuite$(SUFFIX)
	./specsuite$(SUFFIX)
 91 | 
 92 | 
# Build the scalar, gnu and sse spec suites through sub-makes, then run all
# three. The NEON variant and the per-variant clean steps are commented out.
.PHONY: full
full:
	@clear
	@echo FULL COMPILE at `date +%H:%M:%S`
#	FORCE_SCALAR=1 $(MAKE) clean 
	@FORCE_SCALAR=1 $(MAKE)  specsuite-scalar
#	FORCE_GNU=1 $(MAKE) clean 
	@FORCE_GNU=1 $(MAKE)  specsuite-gnu
#	FORCE_SSE=1 $(MAKE) clean 
	@FORCE_SSE=1 $(MAKE)  specsuite-sse
#	FORCE_NEON=1 $(MAKE) clean 
#	FORCE_NEON=1 $(MAKE) specsuite-neon
	@./specsuite-scalar
	@./specsuite-sse
	@./specsuite-gnu
108 | 
# Link all spec objects into the spec runner for the current SUFFIX.
specsuite$(SUFFIX): $(SPEC_OBJ)
	@echo LINK $@
	@$(CXX) $(LDFLAGS) $^ -o $@
112 | 
# Regenerate the dependency list at the bottom of this Makefile with
# makedepend, prefixing object names with $(BUILDDIR)/. makedepend's output is
# discarded and its Makefile.bak backup is removed afterwards.
.PHONY: depend
depend:
	@echo DEP
	@makedepend -Y -- $(CXXFLAGS) -- $(SPEC_SRC) $(BENCH_SRC) -p$(BUILDDIR)/ > /dev/null 2>&1 
	@$(RM) Makefile.bak
118 | 
# $(call asm-command,<source>): recipe fragment that compiles <source> to an
# assembly listing at asm$(SUFFIX)/<source>.S. The blank line before 'endef'
# keeps successive expansions on separate recipe lines when used in $(foreach).
define asm-command
@mkdir -p $(dir asm$(SUFFIX)/$(1))
$(CXX) $(CXXFLAGS) -S $(1) -o asm$(SUFFIX)/$(1).S

endef
124 | 
# Emit an assembly listing for every benchmark source via asm-command.
# Declared phony because no file named 'bench-asm' is ever produced; a stray
# file of that name would otherwise stop the listings from being regenerated.
.PHONY: bench-asm
bench-asm: $(BENCH_SRC)
	$(foreach p,$(BENCH_SRC),$(call asm-command,$(p)))
127 | 
# Link the benchmark binary for the current SUFFIX (also refreshes the
# assembly listings). $(LDFLAGS) is honoured, matching the specsuite link rule.
benchmark$(SUFFIX): $(BENCH_OBJ) bench-asm
	$(CXX) $(LDFLAGS) $(BENCH_OBJ) -o $@
130 | 
# Build the scalar, gnu and sse benchmark binaries through sub-makes, then run
# each of them. The NEON variant is commented out.
.PHONY: bench-full
bench-full:
	FORCE_SCALAR=1 $(MAKE) benchmark-scalar
	FORCE_GNU=1 $(MAKE) benchmark-gnu
	FORCE_SSE=1 $(MAKE) benchmark-sse
#	FORCE_NEON=1 $(MAKE) clean 
#	FORCE_NEON=1 $(MAKE) benchmark-neon
	./benchmark-scalar
	./benchmark-sse
	./benchmark-gnu
141 | 
# Remove objects, binaries and assembly listings for the current SUFFIX only.
.PHONY: clean
clean:
	rm -f $(SPEC_OBJ) $(BENCH_OBJ) benchmark$(SUFFIX) specsuite$(SUFFIX) 
	rm -rf asm$(SUFFIX)
146 | 
# Remove build products for every backend variant, not just the current
# SUFFIX: spec runners, benchmark binaries, assembly listings and build dirs.
.PHONY: realclean
realclean: clean
	rm -f specsuite*
	rm -f benchmark benchmark-*
	rm -rf asm asm-*
	rm -rf build*
151 | 
152 | 
# Run tools/update_spec.rb over the spec sources.
# NOTE(review): presumably refreshes the expected values following the
# "// octave" marker comments in the specs -- confirm against the script.
.PHONY: update_spec
update_spec:
	./tools/update_spec.rb spec/spec_*.cpp
156 | 
# 'make export to=<dir>' unpacks a git archive of the master branch into <dir>.
# The guard aborts at parse time when the goal is exactly 'export' and 'to'
# was not supplied.
ifeq ($(MAKECMDGOALS),export)
ifeq ($(origin to),undefined)
$(error to not set, like  make export to=/foo/bar)
endif
endif

.PHONY: export
export:
	$(SILENT) git archive --format tar master | tar x -C $(to)
166 | 
167 | 
168 | include/vectorial/vec2f.h include/vectorial/vec3f.h include/vectorial/vec4f.h: include/vectorial/simd4f.h
169 | include/vectorial/simd4f.h: include/vectorial/simd4f_scalar.h
170 | include/vectorial/simd4f.h: include/vectorial/simd4f_neon.h
171 | include/vectorial/simd4f.h: include/vectorial/simd4f_gnu.h
172 | include/vectorial/simd4f.h: include/vectorial/simd4f_sse.h
173 | include/vectorial/simd4f.h: include/vectorial/simd4f_scalar.h
174 | include/vectorial/simd4f.h: include/vectorial/config.h
175 | include/vectorial/simd4x4f.h: include/vectorial/simd4f.h
176 | include/vectorial/simd4x4f.h: include/vectorial/simd4x4f_scalar.h
177 | include/vectorial/simd4x4f.h: include/vectorial/simd4x4f_neon.h
178 | include/vectorial/simd4x4f.h: include/vectorial/simd4x4f_gnu.h
179 | include/vectorial/simd4x4f.h: include/vectorial/simd4x4f_sse.h
180 | include/vectorial/simd4x4f.h: include/vectorial/config.h
181 | spec/spec_helper.h: include/vectorial/simd4x4f.h include/vectorial/simd4f.h include/vectorial/vec4f.h include/vectorial/vec3f.h include/vectorial/vec2f.h
182 | spec/spec.cpp: spec/spec.h
183 | spec/spec_main.cpp: spec/spec.h
184 | spec/spec_simd4f.cpp: spec/spec_helper.h
185 | spec/spec_simd4x4f.cpp: spec/spec_helper.h
186 | spec/spec_vec2f.cpp: spec/spec_helper.h
187 | spec/spec_vec3f.cpp: spec/spec_helper.h
188 | spec/spec_vec4f.cpp: spec/spec_helper.h
189 | 
190 | $(BUILDDIR)/spec/spec_simd4f.o: \
191 |   include/vectorial/simd4x4f.h include/vectorial/simd4f.h \
192 |   include/vectorial/simd4f_scalar.h include/vectorial/simd4f_neon.h \
193 |   include/vectorial/simd4f_gnu.h include/vectorial/simd4f_sse.h \
194 |   include/vectorial/config.h
195 | 
196 | $(BUILDDIR)/spec/spec_simd4x4f.o: \
197 |   include/vectorial/simd4x4f.h include/vectorial/simd4f.h \
198 |   include/vectorial/simd4f_scalar.h include/vectorial/simd4f_neon.h \
199 |   include/vectorial/simd4f_gnu.h include/vectorial/simd4f_sse.h \
200 |   include/vectorial/simd4x4f_scalar.h include/vectorial/simd4x4f_neon.h \
201 |   include/vectorial/simd4x4f_gnu.h include/vectorial/simd4x4f_sse.h include/vectorial/config.h
202 |   
203 | $(BUILDDIR)/spec/spec_vec2f.o $(BUILDDIR)/spec/spec_vec3f.o $(BUILDDIR)/spec/spec_vec4f.o: \
204 |   include/vectorial/simd4x4f.h include/vectorial/simd4f.h \
205 |   include/vectorial/vec4f.h include/vectorial/vec3f.h include/vectorial/vec2f.h \
206 |   include/vectorial/simd4f_scalar.h include/vectorial/simd4f_neon.h \
207 |   include/vectorial/simd4f_gnu.h include/vectorial/simd4f_sse.h \
208 |   include/vectorial/simd4x4f_scalar.h include/vectorial/simd4x4f_neon.h \
209 |   include/vectorial/simd4x4f_gnu.h include/vectorial/simd4x4f_sse.h include/vectorial/config.h
210 | 
211 | 
212 | 
213 | 
214 | 
215 | # DO NOT DELETE
216 | 
217 | $(BUILDDIR)/spec/spec.o: spec/spec.h
218 | $(BUILDDIR)/spec/spec_main.o: spec/spec.h
219 | $(BUILDDIR)/spec/spec_mat4f.o: spec/spec_helper.h spec/spec.h
220 | $(BUILDDIR)/spec/spec_mat4f.o: include/vectorial/simd4f.h
221 | $(BUILDDIR)/spec/spec_mat4f.o: include/vectorial/config.h
222 | $(BUILDDIR)/spec/spec_mat4f.o: include/vectorial/simd4f_gnu.h
223 | $(BUILDDIR)/spec/spec_mat4f.o: include/vectorial/simd4f_common.h
224 | $(BUILDDIR)/spec/spec_mat4f.o: include/vectorial/vec4f.h include/vectorial/vec3f.h
225 | $(BUILDDIR)/spec/spec_mat4f.o: include/vectorial/vec2f.h
226 | $(BUILDDIR)/spec/spec_mat4f.o: include/vectorial/simd4x4f.h
227 | $(BUILDDIR)/spec/spec_mat4f.o: include/vectorial/simd4f.h
228 | $(BUILDDIR)/spec/spec_mat4f.o: include/vectorial/simd4x4f_gnu.h
229 | $(BUILDDIR)/spec/spec_mat4f.o: include/vectorial/mat4f.h
230 | $(BUILDDIR)/spec/spec_simd4f.o: spec/spec_helper.h spec/spec.h
231 | $(BUILDDIR)/spec/spec_simd4f.o: include/vectorial/simd4f.h
232 | $(BUILDDIR)/spec/spec_simd4f.o: include/vectorial/config.h
233 | $(BUILDDIR)/spec/spec_simd4f.o: include/vectorial/simd4f_gnu.h
234 | $(BUILDDIR)/spec/spec_simd4f.o: include/vectorial/simd4f_common.h
235 | $(BUILDDIR)/spec/spec_simd4f.o: include/vectorial/vec4f.h include/vectorial/vec3f.h
236 | $(BUILDDIR)/spec/spec_simd4f.o: include/vectorial/vec2f.h
237 | $(BUILDDIR)/spec/spec_simd4f.o: include/vectorial/simd4x4f.h
238 | $(BUILDDIR)/spec/spec_simd4f.o: include/vectorial/simd4f.h
239 | $(BUILDDIR)/spec/spec_simd4f.o: include/vectorial/simd4x4f_gnu.h
240 | $(BUILDDIR)/spec/spec_simd4f.o: include/vectorial/mat4f.h
241 | $(BUILDDIR)/spec/spec_simd4x4f.o: spec/spec_helper.h spec/spec.h
242 | $(BUILDDIR)/spec/spec_simd4x4f.o: include/vectorial/simd4f.h
243 | $(BUILDDIR)/spec/spec_simd4x4f.o: include/vectorial/config.h
244 | $(BUILDDIR)/spec/spec_simd4x4f.o: include/vectorial/simd4f_gnu.h
245 | $(BUILDDIR)/spec/spec_simd4x4f.o: include/vectorial/simd4f_common.h
246 | $(BUILDDIR)/spec/spec_simd4x4f.o: include/vectorial/vec4f.h
247 | $(BUILDDIR)/spec/spec_simd4x4f.o: include/vectorial/vec3f.h
248 | $(BUILDDIR)/spec/spec_simd4x4f.o: include/vectorial/vec2f.h
249 | $(BUILDDIR)/spec/spec_simd4x4f.o: include/vectorial/simd4x4f.h
250 | $(BUILDDIR)/spec/spec_simd4x4f.o: include/vectorial/simd4f.h
251 | $(BUILDDIR)/spec/spec_simd4x4f.o: include/vectorial/simd4x4f_gnu.h
252 | $(BUILDDIR)/spec/spec_simd4x4f.o: include/vectorial/mat4f.h
253 | $(BUILDDIR)/spec/spec_vec2f.o: spec/spec_helper.h spec/spec.h
254 | $(BUILDDIR)/spec/spec_vec2f.o: include/vectorial/simd4f.h
255 | $(BUILDDIR)/spec/spec_vec2f.o: include/vectorial/config.h
256 | $(BUILDDIR)/spec/spec_vec2f.o: include/vectorial/simd4f_gnu.h
257 | $(BUILDDIR)/spec/spec_vec2f.o: include/vectorial/simd4f_common.h
258 | $(BUILDDIR)/spec/spec_vec2f.o: include/vectorial/vec4f.h include/vectorial/vec3f.h
259 | $(BUILDDIR)/spec/spec_vec2f.o: include/vectorial/vec2f.h
260 | $(BUILDDIR)/spec/spec_vec2f.o: include/vectorial/simd4x4f.h
261 | $(BUILDDIR)/spec/spec_vec2f.o: include/vectorial/simd4f.h
262 | $(BUILDDIR)/spec/spec_vec2f.o: include/vectorial/simd4x4f_gnu.h
263 | $(BUILDDIR)/spec/spec_vec2f.o: include/vectorial/mat4f.h
264 | $(BUILDDIR)/spec/spec_vec3f.o: spec/spec_helper.h spec/spec.h
265 | $(BUILDDIR)/spec/spec_vec3f.o: include/vectorial/simd4f.h
266 | $(BUILDDIR)/spec/spec_vec3f.o: include/vectorial/config.h
267 | $(BUILDDIR)/spec/spec_vec3f.o: include/vectorial/simd4f_gnu.h
268 | $(BUILDDIR)/spec/spec_vec3f.o: include/vectorial/simd4f_common.h
269 | $(BUILDDIR)/spec/spec_vec3f.o: include/vectorial/vec4f.h include/vectorial/vec3f.h
270 | $(BUILDDIR)/spec/spec_vec3f.o: include/vectorial/vec2f.h
271 | $(BUILDDIR)/spec/spec_vec3f.o: include/vectorial/simd4x4f.h
272 | $(BUILDDIR)/spec/spec_vec3f.o: include/vectorial/simd4f.h
273 | $(BUILDDIR)/spec/spec_vec3f.o: include/vectorial/simd4x4f_gnu.h
274 | $(BUILDDIR)/spec/spec_vec3f.o: include/vectorial/mat4f.h
275 | $(BUILDDIR)/spec/spec_vec4f.o: spec/spec_helper.h spec/spec.h
276 | $(BUILDDIR)/spec/spec_vec4f.o: include/vectorial/simd4f.h
277 | $(BUILDDIR)/spec/spec_vec4f.o: include/vectorial/config.h
278 | $(BUILDDIR)/spec/spec_vec4f.o: include/vectorial/simd4f_gnu.h
279 | $(BUILDDIR)/spec/spec_vec4f.o: include/vectorial/simd4f_common.h
280 | $(BUILDDIR)/spec/spec_vec4f.o: include/vectorial/vec4f.h include/vectorial/vec3f.h
281 | $(BUILDDIR)/spec/spec_vec4f.o: include/vectorial/vec2f.h
282 | $(BUILDDIR)/spec/spec_vec4f.o: include/vectorial/simd4x4f.h
283 | $(BUILDDIR)/spec/spec_vec4f.o: include/vectorial/simd4f.h
284 | $(BUILDDIR)/spec/spec_vec4f.o: include/vectorial/simd4x4f_gnu.h
285 | $(BUILDDIR)/spec/spec_vec4f.o: include/vectorial/mat4f.h
286 | $(BUILDDIR)/bench/add_bench.o: bench/bench.h include/vectorial/vec4f.h
287 | $(BUILDDIR)/bench/bench.o: bench/bench.h include/vectorial/config.h
288 | $(BUILDDIR)/bench/dot_bench.o: bench/bench.h include/vectorial/vec4f.h
289 | $(BUILDDIR)/bench/matrix_bench.o: bench/bench.h include/vectorial/simd4x4f.h
290 | $(BUILDDIR)/bench/matrix_bench.o: include/vectorial/simd4f.h
291 | $(BUILDDIR)/bench/matrix_bench.o: include/vectorial/simd4x4f_gnu.h
292 | $(BUILDDIR)/bench/quad_bench.o: bench/bench.h include/vectorial/simd4x4f.h
293 | $(BUILDDIR)/bench/quad_bench.o: include/vectorial/simd4f.h
294 | $(BUILDDIR)/bench/quad_bench.o: include/vectorial/simd4x4f_gnu.h
295 | 


--------------------------------------------------------------------------------
/include/vectorial/simd4x4f.h:
--------------------------------------------------------------------------------
  1 | /*
  2 |   Vectorial
  3 |   Copyright (c) 2010 Mikko Lehtonen
  4 |   Copyright (c) 2014 Google, Inc.
  5 |   Licensed under the terms of the two-clause BSD License (see LICENSE)
  6 | */
  7 | #ifndef VECTORIAL_SIMD4X4F_H
  8 | #define VECTORIAL_SIMD4X4F_H
  9 | 
 10 | 
 11 | #include "simd4f.h"
 12 | 
 13 | #include <math.h>
 14 | 
 15 | /*
 16 |   Note, x,y,z,w are conceptually columns with matrix math.
 17 | */
 18 | 
/* 4x4 float matrix held as four simd4f members named x, y, z and w. */
typedef struct {
    simd4f x,y,z,w;
} simd4x4f;
 22 | 
 23 | 
 24 | 
 25 | vectorial_inline simd4x4f simd4x4f_create(simd4f x, simd4f y, simd4f z, SIMD_PARAM(simd4f, w)) {
 26 |     simd4x4f s = { x, y, z, w };
 27 |     return s;
 28 | }
 29 | 
 30 | 
 31 | vectorial_inline void simd4x4f_identity(simd4x4f* m) {
 32 |     *m = simd4x4f_create( simd4f_create(1.0f, 0.0f, 0.0f, 0.0f),
 33 |                           simd4f_create(0.0f, 1.0f, 0.0f, 0.0f),
 34 |                           simd4f_create(0.0f, 0.0f, 1.0f, 0.0f),
 35 |                           simd4f_create(0.0f, 0.0f, 0.0f, 1.0f));
 36 | }
 37 | 
 38 | 
 39 | 
 40 | vectorial_inline void simd4x4f_uload(simd4x4f* m, const float *f) {
 41 | 
 42 |     m->x = simd4f_uload4(f + 0);
 43 |     m->y = simd4f_uload4(f + 4);
 44 |     m->z = simd4f_uload4(f + 8);
 45 |     m->w = simd4f_uload4(f + 12);
 46 | 
 47 | }
 48 | 
 49 | 
 50 | 
 51 | 
 52 | 
 53 | #ifdef VECTORIAL_SCALAR
 54 |     #include "simd4x4f_scalar.h"
 55 | #elif defined(VECTORIAL_SSE)
 56 |     #include "simd4x4f_sse.h"
 57 | #elif defined(VECTORIAL_GNU)
 58 |     #include "simd4x4f_gnu.h"
 59 | #elif defined(VECTORIAL_NEON)
 60 |     #include "simd4x4f_neon.h"
 61 | #else
 62 |     #error No implementation defined
 63 | #endif
 64 | 
 65 | vectorial_inline void simd4x4f_sum(const simd4x4f* a, simd4f* out) {
 66 |     simd4f t;
 67 |     t = simd4f_add(a->x, a->y);
 68 |     t = simd4f_add(t, a->z);
 69 |     t = simd4f_add(t, a->w);
 70 |     *out = t;
 71 | }
 72 | 
 73 | vectorial_inline void simd4x4f_matrix_vector_mul(const simd4x4f* a, const simd4f * b, simd4f* out) {
 74 | 
 75 |     const simd4f x = a->x;
 76 |     const simd4f y = a->y;
 77 |     const simd4f z = a->z;
 78 |     const simd4f w = a->w;
 79 |     const simd4f v = *b;
 80 |     const simd4f vx = simd4f_splat_x(v);
 81 |     const simd4f vy = simd4f_splat_y(v);
 82 |     const simd4f vz = simd4f_splat_z(v);
 83 |     const simd4f vw = simd4f_splat_w(v);
 84 | 
 85 |     #if 0
 86 |     // In a hasty benchmark, this actually performed worse on neon
 87 |     // TODO: revisit and conditionalize accordingly
 88 | 
 89 |     *out = simd4f_madd(x, vx, 
 90 |              simd4f_madd(y, vy, 
 91 |                simd4f_madd(z, vz, 
 92 |                  simd4f_mul(w, vw) ) ) );
 93 | 
 94 |     #else    
 95 | 
 96 |      *out = simd4f_add(simd4f_mul(x, vx), 
 97 |               simd4f_add(simd4f_mul(y, vy), 
 98 |                 simd4f_add(simd4f_mul(z, vz), 
 99 |                   simd4f_mul(w, vw) ) ) );
100 | 
101 |     #endif
102 | }
103 | 
104 | vectorial_inline void simd4x4f_matrix_vector3_mul(const simd4x4f* a, const simd4f * b, simd4f* out) {
105 | 
106 |     #if 0
107 |     *out = simd4f_madd( a->x, simd4f_splat_x(*b), 
108 |              simd4f_madd( a->y, simd4f_splat_y(*b), 
109 |                simd4f_mul(a->z, simd4f_splat_z(*b)) ) );
110 |     #else
111 |     *out = simd4f_add( simd4f_mul(a->x, simd4f_splat_x(*b)), 
112 |              simd4f_add( simd4f_mul(a->y, simd4f_splat_y(*b)), 
113 |                simd4f_mul(a->z, simd4f_splat_z(*b)) ) );
114 |     #endif
115 | 
116 | }
117 | 
118 | vectorial_inline void simd4x4f_matrix_point3_mul(const simd4x4f* a, const simd4f * b, simd4f* out) {
119 | 
120 |     #if 0
121 |     *out = simd4f_madd( a->x, simd4f_splat_x(*b),
122 |              simd4f_madd( a->y, simd4f_splat_y(*b),
123 |                simd4f_madd( a->z, simd4f_splat_z(*b),
124 |                  a->w ) ) );
125 |     #else
126 |     *out = simd4f_add( simd4f_mul(a->x, simd4f_splat_x(*b)),
127 |              simd4f_add( simd4f_mul(a->y, simd4f_splat_y(*b)),
128 |                simd4f_add( simd4f_mul(a->z, simd4f_splat_z(*b)),
129 |                  a->w ) ) );
130 |     #endif
131 | 
132 | }
133 | 
134 | vectorial_inline void simd4x4f_inv_ortho_matrix_point3_mul(const simd4x4f* a, const simd4f * b, simd4f* out) {
135 |     simd4f translation = simd4f_sub(*b, a->w);
136 | 
137 |     simd4x4f transpose = *a;
138 | 
139 |     transpose.w = simd4f_create(0,0,0,0);
140 |     simd4x4f_transpose_inplace(&transpose);
141 | 
142 |     simd4x4f_matrix_point3_mul(&transpose, &translation, out);
143 | }
144 | 
145 | vectorial_inline void simd4x4f_inv_ortho_matrix_vector3_mul(const simd4x4f* a, const simd4f * b, simd4f* out) {
146 |     simd4f translation = *b;
147 | 
148 |     simd4x4f transpose = *a;
149 | 
150 |     transpose.w = simd4f_create(0,0,0,0);
151 |     simd4x4f_transpose_inplace(&transpose);
152 | 
153 |     simd4x4f_matrix_vector3_mul(&transpose, &translation, out);
154 | }
155 | 
156 | 
157 | vectorial_inline void simd4x4f_matrix_mul(const simd4x4f* a, const simd4x4f* b, simd4x4f* out) {
158 | 
159 |     simd4x4f_matrix_vector_mul(a, &b->x, &out->x);
160 |     simd4x4f_matrix_vector_mul(a, &b->y, &out->y);
161 |     simd4x4f_matrix_vector_mul(a, &b->z, &out->z);
162 |     simd4x4f_matrix_vector_mul(a, &b->w, &out->w);
163 | 
164 | }
165 | 
166 | 
167 | 
168 | 
169 | vectorial_inline void simd4x4f_perspective(simd4x4f *m, float fovy_radians, float aspect, float znear, float zfar) {
170 |     
171 |     float deltaz = zfar - znear;
172 |     float cotangent = tanf( VECTORIAL_HALFPI - fovy_radians * 0.5f );
173 |     
174 |     float a = cotangent / aspect;
175 |     float b = cotangent;
176 |     float c = -(zfar + znear) / deltaz;
177 |     float d = -2 * znear * zfar / deltaz;
178 |     
179 |     m->x = simd4f_create( a, 0, 0,  0);
180 |     m->y = simd4f_create( 0, b, 0,  0);
181 |     m->z = simd4f_create( 0, 0, c, -1);
182 |     m->w = simd4f_create( 0, 0, d,  0);
183 | 
184 | }
185 | 
186 | vectorial_inline void simd4x4f_ortho(simd4x4f *m, float left, float right, float bottom, float top, float znear, float zfar) {
187 |     
188 |     float deltax = right - left;
189 |     float deltay = top - bottom;
190 |     float deltaz = zfar - znear;
191 | 
192 |     float a = 2.0f / deltax;
193 |     float b = -(right + left) / deltax;
194 |     float c = 2.0f / deltay;
195 |     float d = -(top + bottom) / deltay;
196 |     float e =  -2.0f / deltaz;
197 |     float f = -(zfar + znear) / deltaz;
198 |     
199 |     m->x = simd4f_create( a, 0, 0, 0);
200 |     m->y = simd4f_create( 0, c, 0, 0);
201 |     m->z = simd4f_create( 0, 0, e, 0);
202 |     m->w = simd4f_create( b, d, f, 1);
203 |     
204 | }
205 | 
206 | 
207 | vectorial_inline void simd4x4f_lookat(simd4x4f *m, simd4f eye, simd4f center, simd4f up) {
208 |     
209 |     simd4f zaxis = simd4f_normalize3( simd4f_sub(center, eye) );
210 |     simd4f xaxis = simd4f_normalize3( simd4f_cross3( zaxis, up ) );
211 |     simd4f yaxis = simd4f_cross3(xaxis, zaxis);
212 | 
213 |     zaxis = simd4f_sub( simd4f_zero(), zaxis);
214 | 
215 |     float x = -simd4f_dot3_scalar(xaxis, eye);
216 |     float y = -simd4f_dot3_scalar(yaxis, eye);
217 |     float z = -simd4f_dot3_scalar(zaxis, eye);
218 | 
219 |     m->x = xaxis;
220 |     m->y = yaxis;
221 |     m->z = zaxis;
222 | 
223 |     m->w = simd4f_create( 0,0,0, 1);
224 |     simd4x4f_transpose_inplace(m);
225 |     m->w = simd4f_create( x,y,z,1);
226 | 
227 | }
228 | 
229 | 
230 | vectorial_inline void simd4x4f_translation(simd4x4f* m, float x, float y, float z) {
231 |     *m = simd4x4f_create( simd4f_create(1.0f, 0.0f, 0.0f, 0.0f),
232 |                           simd4f_create(0.0f, 1.0f, 0.0f, 0.0f),
233 |                           simd4f_create(0.0f, 0.0f, 1.0f, 0.0f),
234 |                           simd4f_create(   x,    y,    z, 1.0f));
235 | }
236 | 
237 | 
238 | vectorial_inline void simd4x4f_axis_rotation(simd4x4f* m, float radians, simd4f axis) {
239 | 
240 |     radians = -radians;
241 | 
242 |     axis = simd4f_normalize3(axis);
243 | 
244 |     const float sine = sinf(radians);
245 |     const float cosine = cosf(radians);
246 | 
247 |     const float x = simd4f_get_x(axis);
248 |     const float y = simd4f_get_y(axis);
249 |     const float z = simd4f_get_z(axis);
250 | 
251 |     const float ab = x * y * (1 - cosine);
252 |     const float bc = y * z * (1 - cosine);
253 |     const float ca = z * x * (1 - cosine);
254 | 
255 |     const float tx = x * x;
256 |     const float ty = y * y;
257 |     const float tz = z * z;
258 | 
259 |     const simd4f i = simd4f_create( tx + cosine * (1 - tx), ab - z * sine,          ca + y * sine,          0);
260 |     const simd4f j = simd4f_create( ab + z * sine,          ty + cosine * (1 - ty), bc - x * sine,          0);
261 |     const simd4f k = simd4f_create( ca - y * sine,          bc + x * sine,          tz + cosine * (1 - tz), 0);
262 |     
263 |     *m = simd4x4f_create( i,j,k, simd4f_create(0, 0, 0, 1) );
264 |         
265 | }
266 | 
267 | 
268 | 
269 | vectorial_inline void simd4x4f_add(const simd4x4f* a, const simd4x4f* b, simd4x4f* out) {
270 |     
271 |     out->x = simd4f_add(a->x, b->x);
272 |     out->y = simd4f_add(a->y, b->y);
273 |     out->z = simd4f_add(a->z, b->z);
274 |     out->w = simd4f_add(a->w, b->w);
275 |     
276 | }
277 | 
278 | vectorial_inline void simd4x4f_sub(const simd4x4f* a, const simd4x4f* b, simd4x4f* out) {
279 |     
280 |     out->x = simd4f_sub(a->x, b->x);
281 |     out->y = simd4f_sub(a->y, b->y);
282 |     out->z = simd4f_sub(a->z, b->z);
283 |     out->w = simd4f_sub(a->w, b->w);
284 |     
285 | }
286 | 
287 | vectorial_inline void simd4x4f_mul(const simd4x4f* a, const simd4x4f* b, simd4x4f* out) {
288 |     
289 |     out->x = simd4f_mul(a->x, b->x);
290 |     out->y = simd4f_mul(a->y, b->y);
291 |     out->z = simd4f_mul(a->z, b->z);
292 |     out->w = simd4f_mul(a->w, b->w);
293 |     
294 | }
295 | 
296 | vectorial_inline void simd4x4f_div(simd4x4f* a, simd4x4f* b, simd4x4f* out) {
297 |     
298 |     out->x = simd4f_div(a->x, b->x);
299 |     out->y = simd4f_div(a->y, b->y);
300 |     out->z = simd4f_div(a->z, b->z);
301 |     out->w = simd4f_div(a->w, b->w);
302 |     
303 | }
304 | 
/*
  Writes the inverse of *a to *out and returns the simd4f `det`.

  The result is assembled from sign-flipped sums of lane products, scaled by
  1/det and finally transposed into *out. Only the x lane of 1/det is used for
  the scaling (it is splatted into `invdet`), so the x lane of the returned
  value is the one that matters.
  No check is made for a zero determinant before dividing.
  NOTE(review): the shuffle bookkeeping follows the naming below; the exact
  cofactor layout was not re-derived here.
*/
vectorial_inline simd4f simd4x4f_inverse(const simd4x4f* a, simd4x4f* out) {

    /* The four members of the input matrix. */
    const simd4f c0 = a->x;
    const simd4f c1 = a->y;
    const simd4f c2 = a->z;
    const simd4f c3 = a->w;

    /* Lane-rotated copies of every member (wxyz, zwxy, yzwx orderings). */
    const simd4f c0_wxyz = simd4f_shuffle_wxyz(c0);
    const simd4f c0_zwxy = simd4f_shuffle_zwxy(c0);
    const simd4f c0_yzwx = simd4f_shuffle_yzwx(c0);

    const simd4f c1_wxyz = simd4f_shuffle_wxyz(c1);
    const simd4f c1_zwxy = simd4f_shuffle_zwxy(c1);
    const simd4f c1_yzwx = simd4f_shuffle_yzwx(c1);

    const simd4f c2_wxyz = simd4f_shuffle_wxyz(c2);
    const simd4f c2_zwxy = simd4f_shuffle_zwxy(c2);
    const simd4f c2_yzwx = simd4f_shuffle_yzwx(c2);

    const simd4f c3_wxyz = simd4f_shuffle_wxyz(c3);
    const simd4f c3_zwxy = simd4f_shuffle_zwxy(c3);
    const simd4f c3_yzwx = simd4f_shuffle_yzwx(c3);

    /* Lane products between c0 and c1, and between c2 and c3. */
    const simd4f c0_wxyz_x_c1 = simd4f_mul(c0_wxyz, c1);
    const simd4f c0_wxyz_x_c1_yzwx = simd4f_mul(c0_wxyz, c1_yzwx);
    const simd4f c0_wxyz_x_c1_zwxy = simd4f_mul(c0_wxyz, c1_zwxy);

    const simd4f c2_wxyz_x_c3 = simd4f_mul(c2_wxyz, c3);
    const simd4f c2_wxyz_x_c3_yzwx = simd4f_mul(c2_wxyz, c3_yzwx);
    const simd4f c2_wxyz_x_c3_zwxy = simd4f_mul(c2_wxyz, c3_zwxy);

    /* Differences of the c2/c3 products (ar*) and of the c0/c1 products (br*). */
    const simd4f ar1 = simd4f_sub( simd4f_shuffle_wxyz(c2_wxyz_x_c3_zwxy), simd4f_shuffle_zwxy(c2_wxyz_x_c3) );
    const simd4f ar2 = simd4f_sub( simd4f_shuffle_zwxy(c2_wxyz_x_c3_yzwx), c2_wxyz_x_c3_yzwx );
    const simd4f ar3 = simd4f_sub( c2_wxyz_x_c3_zwxy, simd4f_shuffle_wxyz(c2_wxyz_x_c3) );

    const simd4f br1 = simd4f_sub( simd4f_shuffle_wxyz(c0_wxyz_x_c1_zwxy), simd4f_shuffle_zwxy(c0_wxyz_x_c1) );
    const simd4f br2 = simd4f_sub( simd4f_shuffle_zwxy(c0_wxyz_x_c1_yzwx), c0_wxyz_x_c1_yzwx );
    const simd4f br3 = simd4f_sub( c0_wxyz_x_c1_zwxy, simd4f_shuffle_wxyz(c0_wxyz_x_c1) );


    /* c0/c1 sums combine with the ar* terms, c2/c3 sums with the br* terms. */
    const simd4f c0_sum = simd4f_madd(c0_yzwx, ar3,
                            simd4f_madd(c0_zwxy, ar2,
                              simd4f_mul(c0_wxyz, ar1)));

    const simd4f c1_sum = simd4f_madd(c1_wxyz,  ar1, 
                            simd4f_madd(c1_zwxy,  ar2, 
                              simd4f_mul(c1_yzwx, ar3)));

    const simd4f c2_sum = simd4f_madd(c2_yzwx, br3,
                            simd4f_madd(c2_zwxy, br2,
                              simd4f_mul(c2_wxyz, br1)));

    const simd4f c3_sum = simd4f_madd(c3_yzwx, br3,
                            simd4f_madd(c3_zwxy, br2,
                              simd4f_mul(c3_wxyz, br1)));


    /* Determinant: built from c1_sum * c0, folded with merge_high, then d1 minus its splatted y lane. */
    const simd4f d0 = simd4f_mul(c1_sum, c0);
    const simd4f d1 = simd4f_add(d0, simd4f_merge_high(d0, d0));
    const simd4f det = simd4f_sub(d1, simd4f_splat_y(d1));

    /* Reciprocal of the x lane of det, broadcast to all lanes. */
    const simd4f invdet = simd4f_splat_x( simd4f_div(simd4f_splat(1.0f), det) );

    /* Apply the alternating sign patterns and scale by 1/det. */
    const simd4f o0 = simd4f_mul( simd4f_flip_sign_0101(c1_sum), invdet );
    const simd4f o1 = simd4f_mul( simd4f_flip_sign_1010(c0_sum), invdet );
    const simd4f o2 = simd4f_mul( simd4f_flip_sign_0101(c3_sum), invdet );
    const simd4f o3 = simd4f_mul( simd4f_flip_sign_1010(c2_sum), invdet );

    /* o0..o3 are assembled into a matrix that is transposed into *out. */
    const simd4x4f mt = simd4x4f_create(o0, o1, o2, o3);
    
    simd4x4f_transpose( &mt, out);

    return det;
}
379 | 
380 | #ifdef __cplusplus
381 | 
382 |     #ifdef VECTORIAL_OSTREAM
383 |         #include <ostream>
384 | 
385 |         vectorial_inline std::ostream& operator<<(std::ostream& os, const simd4x4f& v) {
386 |             os << "simd4x4f(simd4f(" << simd4f_get_x(v.x) << ", "
387 |                        << simd4f_get_y(v.x) << ", "
388 |                        << simd4f_get_z(v.x) << ", "
389 |                        << simd4f_get_w(v.x) << "),\n"
390 |                        << "         simd4f(" << simd4f_get_x(v.y) << ", "
391 |                        << simd4f_get_y(v.y) << ", "
392 |                        << simd4f_get_z(v.y) << ", "
393 |                        << simd4f_get_w(v.y) << "),\n"
394 |                        << "         simd4f(" << simd4f_get_x(v.z) << ", "
395 |                        << simd4f_get_y(v.z) << ", "
396 |                        << simd4f_get_z(v.z) << ", "
397 |                        << simd4f_get_w(v.z) << "),\n"
398 |                        << "         simd4f(" << simd4f_get_x(v.w) << ", "
399 |                        << simd4f_get_y(v.w) << ", "
400 |                        << simd4f_get_z(v.w) << ", "
401 |                        << simd4f_get_w(v.w) << "))";
402 |             return os;
403 |         }
404 |     #endif
405 | 
406 | #endif
407 | 
408 | 
409 | 
410 | 
411 | 
412 | #endif 
413 | 


--------------------------------------------------------------------------------
/spec/spec_simd4f.cpp:
--------------------------------------------------------------------------------
  1 | 
  2 | #include "spec_helper.h"
  3 | 
  4 | const int epsilon = 1;
  5 | 
// Sanity group: prints the active backend name from VECTORIAL_SIMD_TYPE.
// The case contains no assertions; it only requires the macro to be
// streamable to std::cout.
describe(simd4f, "sanity") {
    it("VECTORIAL_SIMD_TYPE should be defined to a string") {
        std::cout << "Simd type: " << VECTORIAL_SIMD_TYPE << std::endl;
    }
}
 11 | 
// Construction specs: simd4f_create must round-trip through the lane getters
// and simd4f_zero must produce an all-zero vector. All comparisons use the
// file-level `epsilon` tolerance.
// NOTE(review): the "// octave simd4f:" comments look like markers for
// tools/update_spec.rb, which presumably regenerates the line after each one
// -- keep each marker and the line that follows it together.
describe(simd4f, "creating") {
    
    it("should be possible to create with simd4f_create") {
        
        simd4f x = simd4f_create(1, 2, 3, 4);

        // Check each lane individually through the getters.
        should_be_close_to( simd4f_get_x(x), 1, epsilon);
        should_be_close_to( simd4f_get_y(x), 2, epsilon);
        should_be_close_to( simd4f_get_z(x), 3, epsilon);
        should_be_close_to( simd4f_get_w(x), 4, epsilon);

        // octave simd4f: [1,2,3,4]
        should_be_equal_simd4f(x, simd4f_create(1.000000000000000f, 2.000000000000000f, 3.000000000000000f, 4.000000000000000f), epsilon );
        
    }

    it("should have simd4f_zero for zero vector") {

        simd4f x = simd4f_zero();

        // octave simd4f: [0,0,0,0]
        should_be_equal_simd4f(x, simd4f_create(0.000000000000000f, 0.000000000000000f, 0.000000000000000f, 0.000000000000000f), epsilon );
    }
    
    
}
 38 | #ifdef _MSC_VER
 39 | #include <malloc.h>
 40 | #else
 41 | #include <alloca.h>
 42 | #endif
 43 | 
 44 | #define unaligned_mem(n) ((float*)((unsigned char*)alloca(sizeof(float)*n+4)+4))
 45 | 
 46 | describe(simd4f, "utilities") {
 47 | 
 48 |     it("should have simd4f_uload4 for loading four float values from an unaligned float array into simd4f") {
 49 |         float *f = unaligned_mem(4);
 50 |         f[0] = 1;
 51 |         f[1] = 2;
 52 |         f[2] = 3;
 53 |         f[3] = 4;
 54 |         simd4f x = simd4f_uload4(f);
 55 |         // octave simd4f: [1,2,3,4]
 56 |         should_be_equal_simd4f(x, simd4f_create(1.000000000000000f, 2.000000000000000f, 3.000000000000000f, 4.000000000000000f), epsilon );
 57 |     }
 58 | 
 59 |     it("should have simd4f_uload3 for loading three float values from an unaligned float array into simd4f") {
 60 |         float *f = unaligned_mem(3);
 61 |         f[0] = 1;
 62 |         f[1] = 2;
 63 |         f[2] = 3;
 64 |         simd4f x = simd4f_uload3(f);
 65 |         // octave simd4f: [1,2,3]
 66 |         should_be_equal_simd4f(x, simd4f_create(1.000000000000000f, 2.000000000000000f, 3.000000000000000f, 0.0f), epsilon );
 67 |     }
 68 | 
 69 |     it("should have simd4f_uload2 for loading two float values from float an unaligned array into simd4f") {
 70 |         float *f = unaligned_mem(2);
 71 |         f[0] = 1;
 72 |         f[1] = 2;
 73 |         simd4f x = simd4f_uload2(f);
 74 |         // octave simd4f: [1,2]
 75 |         should_be_equal_simd4f(x, simd4f_create(1.000000000000000f, 2.000000000000000f, 0.0f, 0.0f), epsilon );
 76 |     }
 77 | 
 78 | 
 79 |     it("should have simd4f_ustore4 for storing four float values from simd4f to an unaligned array") {
 80 |         float *f = unaligned_mem(4);
 81 |         f[0] = -1;
 82 |         f[1] = -1;
 83 |         f[2] = -1;
 84 |         f[3] = -1;
 85 |         simd4f a = simd4f_create(1,2,3,4);
 86 |         simd4f_ustore4(a, f);
 87 |         should_be_close_to(f[0], 1, epsilon);
 88 |         should_be_close_to(f[1], 2, epsilon);
 89 |         should_be_close_to(f[2], 3, epsilon);
 90 |         should_be_close_to(f[3], 4, epsilon);
 91 |     }
 92 | 
 93 |     it("should have simd4f_ustore3 for storing three float values from simd4f to an unaligned array") {
 94 |         float *f = unaligned_mem(3);
 95 |         f[0] = -1;
 96 |         f[1] = -1;
 97 |         f[2] = -1;
 98 |         simd4f a = simd4f_create(1,2,3,4);
 99 |         simd4f_ustore3(a, f);
100 |         should_be_close_to(f[0], 1, epsilon);
101 |         should_be_close_to(f[1], 2, epsilon);
102 |         should_be_close_to(f[2], 3, epsilon);
103 |     }
104 | 
105 |     it("should have simd4f_ustore2 for storing two float values from simd4f to an unaligned array") {
106 |         float *f = unaligned_mem(2); // unaligned_mem is defined outside this chunk; presumably yields a misaligned buffer - TODO confirm
107 |         f[0] = -1; // pre-fill both slots with -1 so a store that leaves a slot untouched fails the checks below
108 |         f[1] = -1;
109 |         simd4f a = simd4f_create(1,2,3,4); // lanes 3 and 4 have no destination slot in this test
110 |         simd4f_ustore2(a, f);
111 |         should_be_close_to(f[0], 1, epsilon); // only the first two lanes are verified
112 |         should_be_close_to(f[1], 2, epsilon);
113 |     }
114 | 
115 | 
116 | 
117 | 
118 |     it("should have simd4f_splat that expands a single scalar to all elements") {
119 |         simd4f x = simd4f_splat(42); // the check below expects the scalar in all four lanes
120 |         // octave simd4f: [42,42,42,42]
121 |         should_be_equal_simd4f(x, simd4f_create(42.000000000000000f, 42.000000000000000f, 42.000000000000000f, 42.000000000000000f), epsilon );
122 |     }
123 | 
124 |     it("should have simd4f_splat_x,y,z,w splatting of an element") {
125 |         simd4f a = simd4f_create(1,2,3,4);
126 |         // Distinct lane values (1..4) make it visible which lane each splat variant broadcasts.
127 |         simd4f x;
128 |         
129 |         x = simd4f_splat_x(a); // first lane (1) expected everywhere
130 |         // octave simd4f: [1,1,1,1]
131 |         should_be_equal_simd4f(x, simd4f_create(1.000000000000000f, 1.000000000000000f, 1.000000000000000f, 1.000000000000000f), epsilon );
132 | 
133 |         x = simd4f_splat_y(a); // second lane (2) expected everywhere
134 |         // octave simd4f: [2,2,2,2]
135 |         should_be_equal_simd4f(x, simd4f_create(2.000000000000000f, 2.000000000000000f, 2.000000000000000f, 2.000000000000000f), epsilon );
136 | 
137 |         x = simd4f_splat_z(a); // third lane (3) expected everywhere
138 |         // octave simd4f: [3,3,3,3]
139 |         should_be_equal_simd4f(x, simd4f_create(3.000000000000000f, 3.000000000000000f, 3.000000000000000f, 3.000000000000000f), epsilon );
140 | 
141 |         x = simd4f_splat_w(a); // fourth lane (4) expected everywhere
142 |         // octave simd4f: [4,4,4,4]
143 |         should_be_equal_simd4f(x, simd4f_create(4.000000000000000f, 4.000000000000000f, 4.000000000000000f, 4.000000000000000f), epsilon );
144 |     }
145 |     
146 |     it("should have simd4f_sum that adds elements") {
147 |         simd4f a = simd4f_create(1,2,3,4);
148 |         simd4f x = simd4f_sum(a); // the check below expects the total (10) replicated in every lane
149 |         // octave simd4f: [sum([1,2,3,4]), sum([1,2,3,4]), sum([1,2,3,4]), sum([1,2,3,4])]
150 |         should_be_equal_simd4f(x, simd4f_create(10.000000000000000f, 10.000000000000000f, 10.000000000000000f, 10.000000000000000f), epsilon );
151 |         
152 |     }
153 |         
154 |     it("should have simd4f_reciprocal") {
155 |         simd4f a = simd4f_create(0.00001f, 2.00001f, 3.0f, 99999999.0f); // inputs range from very small to very large magnitudes
156 |         simd4f x = simd4f_reciprocal(a); // expected: 1/a per lane, compared within epsilon
157 |         // octave simd4f: 1 ./ [0.00001, 2.00001, 3.0, 99999999.0]
158 |         should_be_equal_simd4f(x, simd4f_create(99999.999999999985448f, 0.499997500012500f, 0.333333333333333f, 0.000000010000000f), epsilon );
159 |     }
160 | 
161 |     it("should have simd4f_sqrt") {
162 |         simd4f a = simd4f_create(0.00001f, 2.00001f, 3.0f, 99999999.0f); // inputs range from very small to very large magnitudes
163 |         simd4f x = simd4f_sqrt(a);
164 |         // octave simd4f:  sqrt([0.00001, 2.00001, 3.0, 99999999.0])
165 |         should_be_equal_simd4f(x, simd4f_create(0.003162277660168f, 1.414217097902582f, 1.732050807568877f, 9999.999949999999444f), epsilon );
166 |         // Second case: an all-zero input must produce an all-zero result.
167 |         x = simd4f_sqrt( simd4f_create(0.0f, 0.0f, 0.0f, 0.0f) );
168 |         // octave simd4f:  sqrt([0, 0, 0, 0])
169 |         should_be_equal_simd4f(x, simd4f_create(0.000000000000000f, 0.000000000000000f, 0.000000000000000f, 0.000000000000000f), epsilon );
170 |     }
171 | 
172 |     it("should have simd4f_rsqrt for reciprocal of square-root") {
173 |         simd4f a = simd4f_create(0.00001f, 2.00001f, 3.0f, 99999999.0f); // inputs range from very small to very large magnitudes
174 |         simd4f x = simd4f_rsqrt(a);
175 |         const int epsilon = 4; // Grant larger error: this local shadows the epsilon the other tests use, for this test only
176 |         // octave simd4f:  1 ./ sqrt([0.00001, 2.00001, 3.0, 99999999.0])
177 |         should_be_equal_simd4f(x, simd4f_create(316.227766016837904f, 0.707105013426224f, 0.577350269189626f, 0.000100000000500f), epsilon );
178 |     }
179 | 
180 | }
181 | 
182 | describe(simd4f, "arithmetic with another simd4f") {
183 |     // Component-wise arithmetic specs. NOTE(review): the "// octave" comments look machine-read (see tools/update_spec.rb) - confirm before editing them.
184 |     it("should have simd4f_add for component-wise addition") {
185 |         simd4f a = simd4f_create(1,2,3,4);
186 |         simd4f b = simd4f_create(10,20,30,40);
187 |         
188 |         simd4f x = simd4f_add(a,b);
189 |         // octave simd4f: [1,2,3,4] + [10,20,30,40]
190 |         should_be_equal_simd4f(x, simd4f_create(11.000000000000000f, 22.000000000000000f, 33.000000000000000f, 44.000000000000000f), epsilon );
191 |     }
192 | 
193 |     it("should have simd4f_sub for component-wise subtraction") {
194 |         simd4f a = simd4f_create(1,2,3,4);
195 |         simd4f b = simd4f_create(10,20,30,40);
196 |         
197 |         simd4f x = simd4f_sub(b,a); // operand order matters: the expected values are b - a
198 |         // octave simd4f: [10,20,30,40] - [1,2,3,4] 
199 |         should_be_equal_simd4f(x, simd4f_create(9.000000000000000f, 18.000000000000000f, 27.000000000000000f, 36.000000000000000f), epsilon );
200 |     }
201 | 
202 |     it("should have simd4f_mul for component-wise multiply") {
203 |         simd4f a = simd4f_create(1,2,3,4);
204 |         simd4f b = simd4f_create(10,20,30,40);
205 |         
206 |         simd4f x = simd4f_mul(a,b);
207 |         // octave simd4f: [1,2,3,4] .* [10,20,30,40]
208 |         should_be_equal_simd4f(x, simd4f_create(10.000000000000000f, 40.000000000000000f, 90.000000000000000f, 160.000000000000000f), epsilon );
209 |     }
210 | 
211 |     it("should have simd4f_div for component-wise division") {
212 |         simd4f a = simd4f_create(1,2,3,4);
213 |         simd4f b = simd4f_create(10,20,30,40);
214 |         
215 |         simd4f x = simd4f_div(b,a); // operand order matters: the expected values are b / a
216 |         // octave simd4f: [10,20,30,40] ./ [1,2,3,4] 
217 |         should_be_equal_simd4f(x, simd4f_create(10.000000000000000f, 10.000000000000000f, 10.000000000000000f, 10.000000000000000f), epsilon );
218 |     }
219 | 
220 |     it("should have simd4f_madd for multiply-add") {
221 |         simd4f a = simd4f_create(1,2,3,4);
222 |         simd4f b = simd4f_create(100,100,100,100);
223 |         simd4f c = simd4f_create(6,7,8,9);
224 | 
225 |         simd4f x = simd4f_madd(a,b,c); // the expected values correspond to a*b + c per lane
226 |         // octave simd4f: [1,2,3,4] .* [100,100,100,100] .+ [6,7,8,9]
227 |         should_be_equal_simd4f(x, simd4f_create(106.000000000000000f, 207.000000000000000f, 308.000000000000000f, 409.000000000000000f), epsilon );
228 | 
229 |     }
230 | 
231 | }
232 | 
233 | 
234 | describe(simd4f, "vector math") {
235 |     // Specs for dot products, lengths, cross product and normalization; 2- and 3-component variants must ignore the unused lanes.
236 |     it("should have simd4f_dot4 for four component dot product") {
237 |         simd4f a = simd4f_create(1,2,3,4);
238 |         simd4f b = simd4f_create(10,20,30,40);
239 |         
240 |         simd4f x = simd4f_dot4(a,b); // the check below expects the dot product replicated in every lane
241 |         // octave simd4f: [dot([1, 2, 3, 4], [10, 20, 30, 40]),dot([1, 2, 3, 4], [10, 20, 30, 40]),dot([1, 2, 3, 4], [10, 20, 30, 40]),dot([1, 2, 3, 4], [10, 20, 30, 40])]
242 |         should_be_equal_simd4f(x, simd4f_create(300.000000000000000f, 300.000000000000000f, 300.000000000000000f, 300.000000000000000f), epsilon );
243 |     }
244 | 
245 |     it("should have simd4f_dot3_scalar for three component dot product returning float") {
246 |         simd4f a = simd4f_create(1,2,3,9999); // w lanes hold large values that the expected result (140) excludes
247 |         simd4f b = simd4f_create(10,20,30,-9990);
248 |         
249 |         float x = simd4f_dot3_scalar(a,b);
250 |         // octave float: dot([1, 2, 3], [10, 20, 30])
251 |         should_be_close_to(x, 140.000000000000000f, epsilon );
252 |     }
253 | 
254 |     it("should have simd4f_dot3 for three component dot product returning simd4f") {
255 |         simd4f a = simd4f_create(1,2,3,9999); // w lanes hold large values that the expected result (140) excludes
256 |         simd4f b = simd4f_create(10,20,30,-9990);
257 |         
258 |         simd4f x = simd4f_dot3(a,b);
259 |         // octave simd4f: [dot([1, 2, 3], [10, 20, 30]),dot([1, 2, 3], [10, 20, 30]),dot([1, 2, 3], [10, 20, 30]),dot([1, 2, 3], [10, 20, 30])]
260 |         should_be_equal_simd4f(x, simd4f_create(140.000000000000000f, 140.000000000000000f, 140.000000000000000f, 140.000000000000000f), epsilon );
261 |     }
262 | 
263 |     it("should have simd4f_dot2 for two component dot product") {
264 |         simd4f a = simd4f_create(1,2,3,9999); // z and w lanes must not contribute to the expected result (50)
265 |         simd4f b = simd4f_create(10,20,30,-9990);
266 |         
267 |         simd4f x = simd4f_dot2(a,b);
268 |         // octave simd4f: [dot([1, 2], [10, 20]),dot([1, 2], [10, 20]),dot([1, 2], [10, 20]),dot([1, 2], [10, 20])]
269 |         should_be_equal_simd4f(x, simd4f_create(50.000000000000000f, 50.000000000000000f, 50.000000000000000f, 50.000000000000000f), epsilon );
270 |     }
271 |     
272 |     it("should have simd4f_length4 for four component vector length") {
273 |         simd4f a = simd4f_create(1,2,-3,9999);
274 |         simd4f x = simd4f_length4(a); // the check below expects the length replicated in every lane
275 |         // octave simd4f: [norm([1,2,-3,9999]), norm([1,2,-3,9999]), norm([1,2,-3,9999]), norm([1,2,-3,9999])]
276 |         should_be_equal_simd4f(x, simd4f_create(9999.000700069982486f, 9999.000700069982486f, 9999.000700069982486f, 9999.000700069982486f), epsilon );
277 | 
278 |     }
279 | 
280 |     it("should have simd4f_length3 for three component vector length") {
281 |         simd4f a = simd4f_create(1,2,-3,9999); // the large w lane must not influence the result
282 |         simd4f x = simd4f_length3(a);
283 |         // octave simd4f: [norm([1,2,-3]), norm([1,2,-3]), norm([1,2,-3]), norm([1,2,-3])]
284 |         should_be_equal_simd4f(x, simd4f_create(3.741657386773941f, 3.741657386773941f, 3.741657386773941f, 3.741657386773941f), epsilon );
285 | 
286 |     }
287 | 
288 |     it("should have simd4f_length2 for two component vector length") {
289 |         simd4f a = simd4f_create(1,2,-3,9999); // the z and w lanes must not influence the result
290 |         simd4f x = simd4f_length2(a);
291 |         // octave simd4f: [norm([1,2]),norm([1,2]),norm([1,2]),norm([1,2])]
292 |         should_be_equal_simd4f(x, simd4f_create(2.236067977499790f, 2.236067977499790f, 2.236067977499790f, 2.236067977499790f), epsilon );
293 | 
294 |     }
295 | 
296 | 
297 |     it("should have simd4f_length4_squared for four component squared vector length") {
298 |         simd4f a = simd4f_create(1,2,-3,9999);
299 |         simd4f x = simd4f_length4_squared(a); // expected: dot(a, a) over all four lanes
300 |         // octave simd4f: ([(dot([1,2,-3,9999], [1,2,-3,9999])), (dot([1,2,-3,9999], [1,2,-3,9999])), (dot([1,2,-3,9999], [1,2,-3,9999])), (dot([1,2,-3,9999], [1,2,-3,9999]))])
301 |         should_be_equal_simd4f(x, simd4f_create(99980015.000000000000000f, 99980015.000000000000000f, 99980015.000000000000000f, 99980015.000000000000000f), epsilon );
302 | 
303 |     }
304 | 
305 |     it("should have simd4f_length3_squared for three component squared vector length") {
306 |         simd4f a = simd4f_create(1,2,-3,9999); // the large w lane must not influence the result
307 |         simd4f x = simd4f_length3_squared(a);
308 |         // octave simd4f: ([dot([1,2,-3], [1,2,-3]), dot([1,2,-3], [1,2,-3]), dot([1,2,-3], [1,2,-3]), dot([1,2,-3], [1,2,-3])])
309 |         should_be_equal_simd4f(x, simd4f_create(14.000000000000000f, 14.000000000000000f, 14.000000000000000f, 14.000000000000000f), epsilon );
310 | 
311 |     }
312 | 
313 |     it("should have simd4f_length2_squared for two component squared vector length") {
314 |         simd4f a = simd4f_create(1,2,-3,9999); // the z and w lanes must not influence the result
315 |         simd4f x = simd4f_length2_squared(a);
316 |         // octave simd4f: ([dot([1,2], [1,2]), dot([1,2], [1,2]), dot([1,2], [1,2]), dot([1,2], [1,2])])
317 |         should_be_equal_simd4f(x, simd4f_create(5.000000000000000f, 5.000000000000000f, 5.000000000000000f, 5.000000000000000f), epsilon );
318 | 
319 |     }
320 |     
321 |     
322 |     
323 |     it("should have simd4f_cross3 for cross product") {
324 |         simd4f a = simd4f_create(1,12,3,-9999); // non-zero w lanes: the check below still expects w == 0 in the result
325 |         simd4f b = simd4f_create(5,6,-17, 9999);
326 | 
327 |         simd4f x = simd4f_cross3(a,b);
328 |         // octave simd4f: horzcat(  cross( [1,12,3], [5,6,-17] )  , [0] )
329 |         should_be_equal_simd4f(x, simd4f_create(-222.000000000000000f, 32.000000000000000f, -54.000000000000000f, 0.000000000000000f), epsilon );
330 | 
331 |     }
332 |     
333 |     it("should have simd4f_normalize4 for normalizing four component vector to unit length") {
334 |         simd4f a = simd4f_create(1,2,3,4);
335 |         simd4f x = simd4f_normalize4(a);
336 |         // octave simd4f: [1,2,3,4] / norm([1,2,3,4])
337 |         should_be_equal_simd4f(x, simd4f_create(0.182574185835055f, 0.365148371670111f, 0.547722557505166f, 0.730296743340221f), epsilon );
338 |     }
339 | 
340 |     it("should have simd4f_normalize3 for normalizing three component vector to unit length") {
341 |         simd4f a = simd4f_create(1,2,3,0); // w is 0 on input and expected to be 0 on output
342 |         simd4f x = simd4f_normalize3(a);
343 |         // octave simd4f: [1,2,3,0] / norm([1,2,3])
344 |         should_be_equal_simd4f(x, simd4f_create(0.267261241912424f, 0.534522483824849f, 0.801783725737273f, 0.000000000000000f), epsilon );
345 |     }
346 | 
347 |     it("should have simd4f_normalize2 for normalizing two component vector to unit length") {
348 |         simd4f a = simd4f_create(1,2,0,0); // z and w are 0 on input and expected to be 0 on output
349 |         simd4f x = simd4f_normalize2(a);
350 |         // octave simd4f: [1,2,0,0] / norm([1,2])
351 |         should_be_equal_simd4f(x, simd4f_create(0.447213595499958f, 0.894427190999916f, 0.000000000000000f, 0.000000000000000f), epsilon );
352 |     }
353 | 
354 |     
355 | }
356 | 
357 | describe(simd4f, "shuffles and merges") {
358 |     // Lane-reordering specs: inputs use distinct lane values so the expected vector shows exactly where each lane ends up.
359 |     it("should have simd4f_shuffle_wxyz") {
360 |         simd4f a = simd4f_create(1,2,3,4);
361 |         simd4f x = simd4f_shuffle_wxyz(a); // expected lane order: w, x, y, z
362 |         should_be_equal_simd4f(x, simd4f_create(4,1,2,3), epsilon );
363 |     }
364 | 
365 |     it("should have simd4f_shuffle_zwxy") {
366 |         simd4f a = simd4f_create(1,2,3,4);
367 |         simd4f x = simd4f_shuffle_zwxy(a); // expected lane order: z, w, x, y
368 |         should_be_equal_simd4f(x, simd4f_create(3,4,1,2), epsilon );
369 |     }
370 | 
371 |     it("should have simd4f_shuffle_yzwx") {
372 |         simd4f a = simd4f_create(1,2,3,4);
373 |         simd4f x = simd4f_shuffle_yzwx(a); // expected lane order: y, z, w, x
374 |         should_be_equal_simd4f(x, simd4f_create(2,3,4,1), epsilon );
375 |     }
376 | 
377 |     it("should have simd4f_merge_high") {
378 |         simd4f a = simd4f_create(1,2,3,4);
379 |         simd4f b = simd4f_create(5,6,7,8);
380 |         simd4f x = simd4f_merge_high(a,b); // expected: (a.z, a.w, b.z, b.w)
381 |         should_be_equal_simd4f(x, simd4f_create(3,4,7,8), epsilon );
382 |     }
383 |     
384 | }
385 | 
386 | describe(simd4f, "signs") {
387 |     // Sign-flip specs: the two variants negate complementary lanes, so each title names the lanes it flips.
388 |     it("should have simd4f_flip_sign_0101 for flipping the sign of the y and w elements") {
389 |         simd4f a = simd4f_create(1,2,3,4);
390 |         simd4f x = simd4f_flip_sign_0101(a); // x and z stay unchanged, y and w are negated
391 |         should_be_equal_simd4f(x, simd4f_create(1,-2,3,-4), epsilon );
392 |     }
393 | 
394 |     it("should have simd4f_flip_sign_1010 for flipping the sign of the x and z elements") {
395 |         simd4f a = simd4f_create(1,2,3,4);
396 |         simd4f x = simd4f_flip_sign_1010(a); // x and z are negated, y and w stay unchanged
397 |         should_be_equal_simd4f(x, simd4f_create(-1,2,-3,4), epsilon );
398 |     }
399 | 
400 | }
401 | 
402 | describe(simd4f, "min-max") {
403 |     // Per-lane min/max specs: operands mix signs and magnitudes, and the larger value alternates between a and b.
404 |     it("should have simd4f_min for choosing minimum elements") {
405 |         simd4f a = simd4f_create(1.0f,  2.0f, -300000000.0f, -0.000002f);
406 |         simd4f b = simd4f_create(2.0f, -2.0f,  300000000.0f,  0.000001f);
407 | 
408 |         simd4f x = simd4f_min(a,b); // expected: the smaller value of each lane pair
409 |         should_be_equal_simd4f(x, simd4f_create(1.0f, -2.0f, -300000000.0f, -0.000002f), epsilon);
410 |         
411 |     }
412 | 
413 |     it("should have simd4f_max for choosing maximum elements") {
414 |         simd4f a = simd4f_create(1.0f,  2.0f, -300000000.0f, -0.000002f);
415 |         simd4f b = simd4f_create(2.0f, -2.0f,  300000000.0f,  0.000001f);
416 | 
417 |         simd4f x = simd4f_max(a,b); // expected: the larger value of each lane pair
418 |         should_be_equal_simd4f(x, simd4f_create(2.0f, 2.0f, 300000000.0f, 0.000001f), epsilon);
419 |         
420 |     }
421 |     
422 |     
423 |     
424 | }
425 | 
426 | 
427 | describe(simd4f, "zeroing")
428 | {
429 |     // Lane-zeroing specs: each case runs with finite lanes and again with NaN in the lanes to be cleared; both must yield 0 there.
430 |     it("should have simd4f_zero_w that zeros the last element")
431 |     {
432 |         const float nan = sqrtf(-1.0f); // square root of a negative number is used to obtain a NaN
433 |         simd4f a = simd4f_create(1.0f, 2.0f, 3.0f, 4.0f);
434 |         simd4f b = simd4f_create(1.0f, 2.0f, 3.0f, nan);
435 |         simd4f x = simd4f_zero_w(a);
436 |         should_be_equal_simd4f(x, simd4f_create(1.0f, 2.0f, 3.0f, 0.0f), epsilon);
437 |         x = simd4f_zero_w(b); // a NaN in w must also be replaced by 0
438 |         should_be_equal_simd4f(x, simd4f_create(1.0f, 2.0f, 3.0f, 0.0f), epsilon);
439 |     }
440 | 
441 |     it("should have simd4f_zero_zw that zeros the last two elements")
442 |     {
443 |         const float nan = sqrtf(-1.0f); // square root of a negative number is used to obtain a NaN
444 |         simd4f a = simd4f_create(1.0f, 2.0f, 3.0f, 4.0f);
445 |         simd4f b = simd4f_create(1.0f, 2.0f, nan, nan);
446 |         simd4f x = simd4f_zero_zw(a);
447 |         should_be_equal_simd4f(x, simd4f_create(1.0f, 2.0f, 0.0f, 0.0f), epsilon);
448 |         x = simd4f_zero_zw(b); // NaNs in z and w must also be replaced by 0
449 |         should_be_equal_simd4f(x, simd4f_create(1.0f, 2.0f, 0.0f, 0.0f), epsilon);
450 |     }
451 | 
452 | }
453 | 
454 | 
455 | 
456 | 
457 | 
458 | 


--------------------------------------------------------------------------------