├── .gitignore ├── .travis.yml ├── include └── vectorial │ ├── vectorial.h │ ├── simd4x4f_sse.h │ ├── simd2f_common.h │ ├── simd2f.h │ ├── simd4x4f_scalar.h │ ├── simd4x4f_gnu.h │ ├── simd4x4f_neon.h │ ├── simd4f.h │ ├── vec_convert.h │ ├── simd4f_common.h │ ├── config.h │ ├── simd2f_neon.h │ ├── simd4f_scalar.h │ ├── vec2f.h │ ├── mat4f.h │ ├── vec4f.h │ ├── vec3f.h │ ├── simd4f_gnu.h │ ├── simd4f_sse.h │ ├── simd4f_neon.h │ └── simd4x4f.h ├── tools ├── update_spec.rb └── spechelper.m ├── bench ├── add_bench.cpp ├── dot_bench.cpp ├── matrix_bench.cpp ├── bench.h ├── quad_bench.cpp └── bench.cpp ├── LICENSE ├── spec ├── spec_mat4f.cpp ├── spec_main.cpp ├── spec.cpp ├── spec.h ├── spec_vec2f.cpp ├── spec_simd2f.cpp ├── spec_vec3f.cpp ├── spec_vec4f.cpp ├── spec_helper.h └── spec_simd4f.cpp ├── README ├── vectorial.sln ├── vectorialbenchmark.vcproj ├── vectorial.vcproj └── Makefile /.gitignore: -------------------------------------------------------------------------------- 1 | *.o 2 | *.orig 3 | specsuite-* 4 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: cpp 2 | compiler: 3 | - gcc 4 | - clang 5 | 6 | script: make 7 | -------------------------------------------------------------------------------- /include/vectorial/vectorial.h: -------------------------------------------------------------------------------- 1 | /* 2 | Vectorial 3 | Copyright (c) 2010 Mikko Lehtonen 4 | Licensed under the terms of the two-clause BSD License (see LICENSE) 5 | */ 6 | #ifndef VECTORIAL_VECTORIAL_H 7 | #define VECTORIAL_VECTORIAL_H 8 | 9 | 10 | #include "vectorial/vec2f.h" 11 | #include "vectorial/vec3f.h" 12 | #include "vectorial/vec4f.h" 13 | 14 | #include "vectorial/vec_convert.h" 15 | 16 | #include "vectorial/mat4f.h" 17 | 18 | 19 | #endif 20 | -------------------------------------------------------------------------------- 
/include/vectorial/simd4x4f_sse.h: -------------------------------------------------------------------------------- 1 | /* 2 | Vectorial 3 | Copyright (c) 2010 Mikko Lehtonen 4 | Licensed under the terms of the two-clause BSD License (see LICENSE) 5 | */ 6 | #ifndef VECTORIAL_SIMD4X4F_SSE_H 7 | #define VECTORIAL_SIMD4X4F_SSE_H 8 | 9 | 10 | 11 | vectorial_inline void simd4x4f_transpose_inplace(simd4x4f *s) { 12 | _MM_TRANSPOSE4_PS(s->x, s->y, s->z, s->w); 13 | } 14 | 15 | vectorial_inline void simd4x4f_transpose(const simd4x4f *s, simd4x4f *out) { 16 | *out=*s; 17 | simd4x4f_transpose_inplace(out); 18 | } 19 | 20 | 21 | 22 | 23 | #endif 24 | -------------------------------------------------------------------------------- /tools/update_spec.rb: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby 2 | 3 | SPECHELPER = File.join(File.dirname(__FILE__), "spechelper.m") 4 | def octave_eval(str, type) 5 | puts "evalling (#{type}): #{str}" 6 | ret = `octave --quiet --eval 'source("#{SPECHELPER}"); spec_formatter(#{str}, "#{type}")'` 7 | puts " = #{ret.strip}" 8 | ret 9 | end 10 | 11 | 12 | ARGV.each do |fn| 13 | str = File.read(fn) 14 | str.gsub!(%r{(// octave (\w+):)(.*?)\n(.*?\n)}) do |match| 15 | e = octave_eval($3, $2) 16 | 17 | [$1, $3, "\n", e, "\n"].join 18 | end 19 | File.open(fn, "w") do |f| 20 | f.write str 21 | end 22 | 23 | end 24 | 25 | -------------------------------------------------------------------------------- /include/vectorial/simd2f_common.h: -------------------------------------------------------------------------------- 1 | /* 2 | Vectorial 3 | Copyright (c) 2014 Google 4 | Licensed under the terms of the two-clause BSD License (see LICENSE) 5 | */ 6 | #ifndef VECTORIAL_SIMD2F_COMMON_H 7 | #define VECTORIAL_SIMD2F_COMMON_H 8 | 9 | vectorial_inline simd2f simd2f_length2(simd2f v) { 10 | return simd2f_sqrt( simd2f_dot2(v,v) ); 11 | } 12 | 13 | vectorial_inline simd2f simd2f_length2_squared(simd2f v) 
{ 14 | return simd2f_dot2(v,v); 15 | } 16 | 17 | vectorial_inline simd2f simd2f_normalize2(simd2f a) { 18 | simd2f invlen = simd2f_rsqrt( simd2f_dot2(a,a) ); 19 | return simd2f_mul(a, invlen); 20 | } 21 | 22 | #endif 23 | -------------------------------------------------------------------------------- /include/vectorial/simd2f.h: -------------------------------------------------------------------------------- 1 | /* 2 | Vectorial 3 | Copyright (c) 2014 Google, Inc. 4 | Licensed under the terms of the two-clause BSD License (see LICENSE) 5 | */ 6 | 7 | #ifndef VECTORIAL_SIMD2F_H 8 | #define VECTORIAL_SIMD2F_H 9 | 10 | #include "vectorial/config.h" 11 | 12 | #if defined(VECTORIAL_NEON) 13 | #include "simd2f_neon.h" 14 | #else 15 | #error No implementation defined 16 | #endif 17 | 18 | #include "simd2f_common.h" 19 | 20 | #ifdef __cplusplus 21 | 22 | #ifdef VECTORIAL_OSTREAM 23 | #include 24 | 25 | vectorial_inline std::ostream& operator<<(std::ostream& os, const simd2f& v) { 26 | os << "simd2f(" << simd2f_get_x(v) << ", " 27 | << simd2f_get_y(v) << ")"; 28 | return os; 29 | } 30 | #endif 31 | 32 | #endif 33 | 34 | 35 | 36 | 37 | #endif 38 | 39 | -------------------------------------------------------------------------------- /include/vectorial/simd4x4f_scalar.h: -------------------------------------------------------------------------------- 1 | /* 2 | Vectorial 3 | Copyright (c) 2010 Mikko Lehtonen 4 | Licensed under the terms of the two-clause BSD License (see LICENSE) 5 | */ 6 | #ifndef VECTORIAL_SIMD4X4F_SCALAR_H 7 | #define VECTORIAL_SIMD4X4F_SCALAR_H 8 | 9 | 10 | vectorial_inline void simd4x4f_transpose_inplace(simd4x4f *s) { 11 | simd4x4f d=*s; 12 | s->x.x = d.x.x; 13 | s->x.y = d.y.x; 14 | s->x.z = d.z.x; 15 | s->x.w = d.w.x; 16 | 17 | s->y.x = d.x.y; 18 | s->y.y = d.y.y; 19 | s->y.z = d.z.y; 20 | s->y.w = d.w.y; 21 | 22 | s->z.x = d.x.z; 23 | s->z.y = d.y.z; 24 | s->z.z = d.z.z; 25 | s->z.w = d.w.z; 26 | 27 | s->w.x = d.x.w; 28 | s->w.y = d.y.w; 29 | s->w.z 
= d.z.w; 30 | s->w.w = d.w.w; 31 | 32 | } 33 | 34 | vectorial_inline void simd4x4f_transpose(const simd4x4f *s, simd4x4f *out) { 35 | *out=*s; 36 | simd4x4f_transpose_inplace(out); 37 | } 38 | 39 | 40 | 41 | #endif 42 | -------------------------------------------------------------------------------- /include/vectorial/simd4x4f_gnu.h: -------------------------------------------------------------------------------- 1 | /* 2 | Vectorial 3 | Copyright (c) 2010 Mikko Lehtonen 4 | Licensed under the terms of the two-clause BSD License (see LICENSE) 5 | */ 6 | #ifndef VECTORIAL_SIMD4X4F_GNU_H 7 | #define VECTORIAL_SIMD4X4F_GNU_H 8 | 9 | 10 | 11 | vectorial_inline void simd4x4f_transpose_inplace(simd4x4f* s) { 12 | const _simd4f_union sx = { s->x }; 13 | const _simd4f_union sy = { s->y }; 14 | const _simd4f_union sz = { s->z }; 15 | const _simd4f_union sw = { s->w }; 16 | 17 | const simd4f dx = { sx.f[0], sy.f[0], sz.f[0], sw.f[0] }; 18 | const simd4f dy = { sx.f[1], sy.f[1], sz.f[1], sw.f[1] }; 19 | const simd4f dz = { sx.f[2], sy.f[2], sz.f[2], sw.f[2] }; 20 | const simd4f dw = { sx.f[3], sy.f[3], sz.f[3], sw.f[3] }; 21 | 22 | s->x = dx; 23 | s->y = dy; 24 | s->z = dz; 25 | s->w = dw; 26 | 27 | } 28 | 29 | vectorial_inline void simd4x4f_transpose(const simd4x4f *s, simd4x4f *out) { 30 | *out=*s; 31 | simd4x4f_transpose_inplace(out); 32 | } 33 | 34 | 35 | 36 | #endif 37 | -------------------------------------------------------------------------------- /include/vectorial/simd4x4f_neon.h: -------------------------------------------------------------------------------- 1 | /* 2 | Vectorial 3 | Copyright (c) 2010 Mikko Lehtonen 4 | Licensed under the terms of the two-clause BSD License (see LICENSE) 5 | */ 6 | #ifndef VECTORIAL_SIMD4X4F_NEON_H 7 | #define VECTORIAL_SIMD4X4F_NEON_H 8 | 9 | 10 | vectorial_inline void simd4x4f_transpose_inplace(simd4x4f* s) { 11 | const _simd4f_union sx = { s->x }; 12 | const _simd4f_union sy = { s->y }; 13 | const _simd4f_union sz = { s->z }; 
14 | const _simd4f_union sw = { s->w }; 15 | 16 | const simd4f dx = simd4f_create( sx.f[0], sy.f[0], sz.f[0], sw.f[0] ); 17 | const simd4f dy = simd4f_create( sx.f[1], sy.f[1], sz.f[1], sw.f[1] ); 18 | const simd4f dz = simd4f_create( sx.f[2], sy.f[2], sz.f[2], sw.f[2] ); 19 | const simd4f dw = simd4f_create( sx.f[3], sy.f[3], sz.f[3], sw.f[3] ); 20 | 21 | s->x = dx; 22 | s->y = dy; 23 | s->z = dz; 24 | s->w = dw; 25 | 26 | } 27 | 28 | vectorial_inline void simd4x4f_transpose(const simd4x4f *s, simd4x4f *out) { 29 | *out=*s; 30 | simd4x4f_transpose_inplace(out); 31 | } 32 | 33 | 34 | 35 | #endif 36 | -------------------------------------------------------------------------------- /bench/add_bench.cpp: -------------------------------------------------------------------------------- 1 | 2 | #include "bench.h" 3 | #include 4 | 5 | #include 6 | #include "vectorial/vec4f.h" 7 | 8 | #define NUM (81920) 9 | #define ITER 100 10 | using namespace vectorial; 11 | 12 | namespace { 13 | vec4f* alloc_vec4f(size_t n) { 14 | void *ptr = memalign(n*sizeof(vec4f), 16); 15 | return static_cast<vec4f*>(ptr); 16 | } 17 | } 18 | 19 | 20 | 21 | static vec4f * a; 22 | static vec4f * b; 23 | static vec4f * c; 24 | 25 | 26 | 27 | 28 | void add_func() { 29 | 30 | vec4f* vectorial_restrict aa = a; 31 | vec4f* vectorial_restrict bb = b; 32 | vec4f* vectorial_restrict cc = c; 33 | 34 | for(size_t i = 0; i < NUM; ++i) 35 | { 36 | cc[i] = aa[i] + bb[i]; 37 | } 38 | } 39 | 40 | void add_bench() { 41 | 42 | a = alloc_vec4f(NUM); 43 | b = alloc_vec4f(NUM); 44 | c = alloc_vec4f(NUM); 45 | 46 | 47 | for(size_t i = 0; i < NUM; ++i) 48 | { 49 | a[i]=vec4f(i,i,i,i); 50 | b[i]=vec4f(NUM-i, NUM-i, NUM-i, NUM-i); 51 | } 52 | 53 | profile("add", add_func, ITER, NUM); 54 | 55 | memfree(a); 56 | memfree(b); 57 | memfree(c); 58 | 59 | 60 | } 61 | -------------------------------------------------------------------------------- /include/vectorial/simd4f.h: 
-------------------------------------------------------------------------------- 1 | /* 2 | Vectorial 3 | Copyright (c) 2010 Mikko Lehtonen 4 | Licensed under the terms of the two-clause BSD License (see LICENSE) 5 | */ 6 | 7 | #ifndef VECTORIAL_SIMD4F_H 8 | #define VECTORIAL_SIMD4F_H 9 | 10 | #ifndef VECTORIAL_CONFIG_H 11 | #include "vectorial/config.h" 12 | #endif 13 | 14 | 15 | #ifdef VECTORIAL_SCALAR 16 | #include "simd4f_scalar.h" 17 | #elif defined(VECTORIAL_SSE) 18 | #include "simd4f_sse.h" 19 | #elif defined(VECTORIAL_GNU) 20 | #include "simd4f_gnu.h" 21 | #elif defined(VECTORIAL_NEON) 22 | #include "simd4f_neon.h" 23 | #else 24 | #error No implementation defined 25 | #endif 26 | 27 | #include "simd4f_common.h" 28 | 29 | 30 | 31 | #ifdef __cplusplus 32 | 33 | #ifdef VECTORIAL_OSTREAM 34 | #include 35 | 36 | vectorial_inline std::ostream& operator<<(std::ostream& os, const simd4f& v) { 37 | os << "simd4f(" << simd4f_get_x(v) << ", " 38 | << simd4f_get_y(v) << ", " 39 | << simd4f_get_z(v) << ", " 40 | << simd4f_get_w(v) << ")"; 41 | return os; 42 | } 43 | #endif 44 | 45 | #endif 46 | 47 | 48 | 49 | 50 | #endif 51 | 52 | -------------------------------------------------------------------------------- /bench/dot_bench.cpp: -------------------------------------------------------------------------------- 1 | 2 | #include "bench.h" 3 | #include 4 | 5 | #include 6 | #include "vectorial/vec4f.h" 7 | 8 | #define NUM (81920) 9 | #define ITER 100 10 | using namespace vectorial; 11 | 12 | namespace { 13 | vec4f* alloc_vec4f(size_t n) { 14 | void *ptr = memalign(n*sizeof(vec4f), 16); 15 | return static_cast(ptr); 16 | } 17 | } 18 | 19 | 20 | 21 | static vec4f * a; 22 | static vec4f * b; 23 | static float * c; 24 | 25 | 26 | 27 | 28 | void dot_func() { 29 | 30 | vec4f* vectorial_restrict aa = a; 31 | vec4f* vectorial_restrict bb = b; 32 | float* vectorial_restrict cc = c; 33 | 34 | for(size_t i = 0; i < NUM; ++i) 35 | { 36 | cc[i] = dot(aa[i], bb[i]); 37 | } 38 | } 39 | 
40 | void dot_bench() { 41 | 42 | a = alloc_vec4f(NUM); 43 | b = alloc_vec4f(NUM); 44 | c = static_cast<float*>(malloc(NUM * sizeof(float))); 45 | 46 | 47 | for(size_t i = 0; i < NUM; ++i) 48 | { 49 | a[i]=vec4f(i,i,i,i); 50 | b[i]=vec4f(NUM-i, NUM-i, NUM-i, NUM-i); 51 | } 52 | 53 | profile("dot", dot_func, ITER, NUM); 54 | 55 | memfree(a); 56 | memfree(b); 57 | free(c); /* c came from malloc(); memfree() is _aligned_free() on Windows and must not receive it */ 58 | 59 | 60 | } 61 | -------------------------------------------------------------------------------- /include/vectorial/vec_convert.h: -------------------------------------------------------------------------------- 1 | /* 2 | Vectorial 3 | Copyright (c) 2010 Mikko Lehtonen 4 | Licensed under the terms of the two-clause BSD License (see LICENSE) 5 | */ 6 | #ifndef VECTORIAL_VEC_CONVERT_H 7 | #define VECTORIAL_VEC_CONVERT_H 8 | 9 | 10 | namespace vectorial { 11 | 12 | inline vec3f vec4f::xyz() const { return vec3f(value); } 13 | inline vec2f vec4f::xy() const { return vec2f(value); } 14 | 15 | inline vec4f vec3f::xyz0() const { return vec4f(simd4f_zero_w(value)); } 16 | inline vec4f vec3f::xyz1() const { return xyz0() + vec4f(0.0f, 0.0f, 0.0f, 1.0f); } 17 | inline vec4f vec3f::xyzw(float w) const { return xyz0() + vec4f(0.0f, 0.0f, 0.0f, w); } 18 | inline vec3f vec3f::xyz() const { return vec3f(value); } 19 | inline vec3f vec3f::xy0() const { return vec3f(value) * vec3f(1.0f, 1.0f, 0.0f); } 20 | inline vec2f vec3f::xy() const { return vec2f(value); } 21 | 22 | inline vec4f vec2f::xy00() const { return vec4f(simd4f_zero_zw(value)); } 23 | inline vec4f vec2f::xy01() const { return xy00() + vec4f(0.0f, 0.0f, 0.0f, 1.0f); } 24 | inline vec4f vec2f::xyzw(float z, float w) const { return xy00() + vec4f(0.0f, 0.0f, z, w); } 25 | inline vec3f vec2f::xy0() const { return vec3f(simd4f_zero_zw(value)); } 26 | inline vec2f vec2f::xy() const { return vec2f(value); } 27 | 28 | } 29 | 30 | 31 | #endif 32 | -------------------------------------------------------------------------------- /LICENSE: 
-------------------------------------------------------------------------------- 1 | Copyright 2010 Mikko Lehtonen. All rights reserved. 2 | 3 | Redistribution and use in source and binary forms, with or without modification, are 4 | permitted provided that the following conditions are met: 5 | 6 | 1. Redistributions of source code must retain the above copyright notice, this list of 7 | conditions and the following disclaimer. 8 | 9 | 2. Redistributions in binary form must reproduce the above copyright notice, this list 10 | of conditions and the following disclaimer in the documentation and/or other materials 11 | provided with the distribution. 12 | 13 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 14 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 15 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 16 | IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, 17 | INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 18 | NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 19 | PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, 20 | WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 21 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 22 | POSSIBILITY OF SUCH DAMAGE. 
23 | -------------------------------------------------------------------------------- /bench/matrix_bench.cpp: -------------------------------------------------------------------------------- 1 | 2 | #include "bench.h" 3 | #include 4 | 5 | #include 6 | #include "vectorial/simd4x4f.h" 7 | 8 | #define NUM (819200) 9 | #define ITER 100 10 | //using namespace vectorial; 11 | 12 | namespace { 13 | simd4x4f* alloc_vec4x4f(size_t n) { 14 | void *ptr = memalign(n*sizeof(simd4x4f), 16); 15 | return static_cast<simd4x4f*>(ptr); 16 | } 17 | } 18 | 19 | 20 | 21 | static simd4x4f * a; 22 | static simd4x4f * b; 23 | static simd4x4f * c; 24 | 25 | 26 | 27 | 28 | void matrix_func() { 29 | 30 | simd4x4f* vectorial_restrict aa = a; 31 | simd4x4f* vectorial_restrict bb = b; 32 | simd4x4f* vectorial_restrict cc = c; 33 | 34 | for(size_t i = 0; i < NUM; ++i) 35 | { 36 | simd4x4f_matrix_mul(&aa[i], &bb[i], &bb[i]); 37 | } 38 | } 39 | 40 | void matrix_bench() { 41 | 42 | a = alloc_vec4x4f(NUM); 43 | b = alloc_vec4x4f(NUM); 44 | c = alloc_vec4x4f(NUM); 45 | 46 | 47 | for(size_t i = 0; i < NUM; ++i) 48 | { 49 | simd4f v = simd4f_create(i,i,i,i); 50 | simd4f vi = simd4f_create(NUM-i,NUM-i,NUM-i,NUM-i); 51 | a[i]=simd4x4f_create(v,v,v,v); 52 | b[i]=simd4x4f_create(vi,vi,vi,vi); 53 | } 54 | 55 | profile("matrix mul", matrix_func, ITER, NUM); 56 | 57 | memfree(a); 58 | memfree(b); 59 | memfree(c); 60 | 61 | 62 | } 63 | -------------------------------------------------------------------------------- /bench/bench.h: -------------------------------------------------------------------------------- 1 | #ifndef BENCH_H 2 | #define BENCH_H 3 | 4 | #include 5 | #include 6 | 7 | #ifdef __APPLE__ 8 | #define BENCH_MACH 9 | #include 10 | #include 11 | #elif defined(_WIN32) 12 | #define BENCH_QPC 13 | #define WIN32_LEAN_AND_MEAN 14 | #include 15 | #include 16 | #else 17 | #define BENCH_GTOD 18 | #include 19 | #endif 20 | 21 | 22 | static void* memalign(size_t count, size_t align) { 23 | #ifdef _WIN32 24 | return 
_aligned_malloc(count,align); 25 | #else 26 | void *ptr = NULL; 27 | int e = posix_memalign(&ptr, align, count); 28 | // if( e == EINVAL ) printf("EINVAL posix_memalign\n"); 29 | // if( e == ENOMEM ) printf("ENOMEM posix_memalign\n"); 30 | return e == 0 ? ptr : NULL; /* ptr is not guaranteed to be set when posix_memalign fails */ 31 | #endif 32 | } 33 | 34 | static void memfree(void* ptr) { 35 | #ifdef _WIN32 36 | _aligned_free(ptr); 37 | #else 38 | free(ptr); 39 | #endif 40 | } 41 | 42 | namespace profiler { 43 | 44 | #ifdef BENCH_GTOD 45 | typedef struct timeval time_t; 46 | #endif 47 | #ifdef BENCH_MACH 48 | typedef const uint64_t time_t; 49 | #endif 50 | #ifdef BENCH_QPC 51 | typedef LARGE_INTEGER time_t; 52 | #endif 53 | 54 | void init(); 55 | time_t now(); 56 | 57 | double diffTime(time_t start, time_t end); 58 | 59 | } 60 | 61 | std::string formatTime(double d, double relative=-1); 62 | void profile(const char* name, void (*func)(), int iterations, int elements); 63 | 64 | 65 | #endif 66 | -------------------------------------------------------------------------------- /spec/spec_mat4f.cpp: -------------------------------------------------------------------------------- 1 | #include "spec_helper.h" 2 | #include 3 | using vectorial::vec4f; 4 | using vectorial::mat4f; 5 | 6 | const int epsilon = 1; 7 | 8 | describe(mat4f, "constructing") { 9 | it("should have default constructor that does nothing..") { 10 | mat4f x; 11 | } 12 | 13 | it("should have constructor that constructs from four vec4") { 14 | mat4f x( vec4f(1,2,3,4), vec4f(5,6,7,8), vec4f(9,10,11,12), vec4f(13,14,15,16) ); 15 | 16 | // octave mat4f: [1,5,9,13 ; 2,6,10,14 ; 3,7,11,15 ; 4,8,12,16 ] 17 | should_be_equal_mat4f(x, simd4x4f_create(simd4f_create(1.000000000000000f, 2.000000000000000f, 3.000000000000000f, 4.000000000000000f), simd4f_create(5.000000000000000f, 6.000000000000000f, 7.000000000000000f, 8.000000000000000f), simd4f_create(9.000000000000000f, 10.000000000000000f, 11.000000000000000f, 12.000000000000000f), simd4f_create(13.000000000000000f, 14.000000000000000f, 
15.000000000000000f, 16.000000000000000f)), epsilon ); 18 | } 19 | 20 | it("should have static function to create identity matrix") { 21 | 22 | mat4f x = mat4f::identity(); 23 | 24 | // octave mat4f: [1,0,0,0;0,1,0,0;0,0,1,0;0,0,0,1] 25 | should_be_equal_mat4f(x, simd4x4f_create(simd4f_create(1.000000000000000f, 0.000000000000000f, 0.000000000000000f, 0.000000000000000f), simd4f_create(0.000000000000000f, 1.000000000000000f, 0.000000000000000f, 0.000000000000000f), simd4f_create(0.000000000000000f, 0.000000000000000f, 1.000000000000000f, 0.000000000000000f), simd4f_create(0.000000000000000f, 0.000000000000000f, 0.000000000000000f, 1.000000000000000f)), epsilon ); 26 | } 27 | 28 | } 29 | 30 | -------------------------------------------------------------------------------- /spec/spec_main.cpp: -------------------------------------------------------------------------------- 1 | /* Specific - Minimal C++ spec framework. 2 | 3 | 4 | The zlib/libpng License 5 | 6 | 7 | Copyright (c) 2008 Mikko Lehtonen 8 | 9 | This software is provided 'as-is', without any express or implied 10 | warranty. In no event will the authors be held liable for any damages 11 | arising from the use of this software. 12 | 13 | Permission is granted to anyone to use this software for any purpose, 14 | including commercial applications, and to alter it and redistribute it 15 | freely, subject to the following restrictions: 16 | 17 | 1. The origin of this software must not be misrepresented; you must not 18 | claim that you wrote the original software. If you use this software 19 | in a product, an acknowledgment in the product documentation would be 20 | appreciated but is not required. 21 | 22 | 2. Altered source versions must be plainly marked as such, and must not be 23 | misrepresented as being the original software. 24 | 25 | 3. This notice may not be removed or altered from any source 26 | distribution. 
27 | */ 28 | 29 | 30 | #include "spec.h" 31 | #include 32 | 33 | int main(int argc, char *argv[]) 34 | { 35 | 36 | std::string subset(""); 37 | 38 | specific::ProgressWriter progressWriter; 39 | specific::SpecdocWriter specdocWriter; 40 | specific::SpecWriter* writer = &progressWriter; 41 | 42 | for(size_t i = 1; i < size_t(argc); ++i) { 43 | if( std::string("-s") == argv[i] ) { 44 | writer = &specdocWriter; 45 | } else { 46 | subset = argv[i]; 47 | } 48 | } 49 | 50 | 51 | bool success = specific::SpecRunner::getInstance().run(*writer, subset); 52 | 53 | return success ? EXIT_SUCCESS : EXIT_FAILURE; 54 | } 55 | 56 | -------------------------------------------------------------------------------- /README: -------------------------------------------------------------------------------- 1 | 2 | Vectorial - vector math library 3 | 4 | 5 | 6 | Motivation 7 | 8 | I couldn't find an open source math library that was usable and 9 | supported simd - especially the ARM NEON variant. 10 | 11 | 12 | Features 13 | 14 | Supports NEON, SSE, scalar and generic gcc vector extension. 15 | Most basic vector and matrix math is available, but not quite 16 | yet full featured. 17 | 18 | 19 | Design 20 | 21 | Vectorial consists of two main parts, pure-C wrapper around 22 | platform-specific vector instructions in the simd*.h files 23 | and C++ classes for common uses, the vec*.h and mat*.h 24 | 25 | The config.h autodetects appropriate vector instructions to use. 26 | 27 | The platform-specific support is done with intrinsics only, 28 | allowing the compiler to have a full view of the code, hopefully 29 | resulting in better optimizations especially with reordering etc. 
30 | 31 | 32 | Installation / Usage 33 | 34 | Add vectorial/include to your include path 35 | 36 | #include "vectorial/simd4f.h" 37 | for C-only simd wrapper, using it looks like this: 38 | simd4f v = simd4f_normalize( simd4f_add( simd4f_create(1,2,3,4), y) ); 39 | float z = simd4f_get_z(v); 40 | 41 | #include "vectorial/vectorial.h" 42 | for C++ classes. They reside in vectorial namespace, you might 43 | want to alias them to your own namespace 44 | namespace myproject { 45 | using namespace ::vectorial; 46 | // if you like different name: typedef vec3f Vector3; 47 | } 48 | using myproject::vec4f; 49 | 50 | vec4f v = normalize( vec4f(1,2,3,4) + y ); 51 | float z = v.z(); 52 | 53 | 54 | License 55 | 56 | 2-clause BSD. See LICENSE 57 | 58 | 59 | 60 | 61 | -------------------------------------------------------------------------------- /tools/spechelper.m: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env octave 2 | 3 | 1; 4 | 5 | function spec_formatter (val,type) 6 | 7 | if( isscalar(val) == 1 ) 8 | printf(" should_be_close_to(x, %15.15ff, epsilon );", val); 9 | return; 10 | endif 11 | 12 | if( size(val) == [1,2] ) 13 | if( strcmp(type,"simd2f") == 1 ) 14 | printf(" should_be_equal_%s(x, simd2f_create(%15.15ff, %15.15ff), epsilon );",type, val(1), val(2)); 15 | else 16 | printf(" should_be_equal_%s(x, simd4f_create(%15.15ff, %15.15ff, 0.0f, 0.0f), epsilon );",type, val(1), val(2)); 17 | endif 18 | return; 19 | endif 20 | 21 | if( size(val) == [1,3] ) 22 | printf(" should_be_equal_%s(x, simd4f_create(%15.15ff, %15.15ff, %15.15ff, 0.0f), epsilon );",type, val(1), val(2), val(3)); 23 | return; 24 | endif 25 | 26 | if( size(val) == [1,4] ) 27 | printf(" should_be_equal_%s(x, simd4f_create(%15.15ff, %15.15ff, %15.15ff, %15.15ff), epsilon );",type, val(1), val(2), val(3), val(4)); 28 | return; 29 | endif 30 | 31 | if( size(val) == [4,1] ) 32 | printf(" should_be_equal_%s(x, simd4f_create(%15.15ff, %15.15ff, %15.15ff, 
%15.15ff), epsilon );",type, val(1), val(2), val(3), val(4)); 33 | return; 34 | endif 35 | 36 | if( size(val) == [4,4] ) 37 | printf(" should_be_equal_%s(x, simd4x4f_create(simd4f_create(%15.15ff, %15.15ff, %15.15ff, %15.15ff), simd4f_create(%15.15ff, %15.15ff, %15.15ff, %15.15ff), simd4f_create(%15.15ff, %15.15ff, %15.15ff, %15.15ff), simd4f_create(%15.15ff, %15.15ff, %15.15ff, %15.15ff)), epsilon );",type, 38 | val(1), val(2), val(3), val(4), val(5), val(6), val(7), val(8), val(9), val(10), val(11), val(12), val(13), val(14), val(15), val(16) 39 | ); 40 | return; 41 | endif 42 | 43 | 44 | endfunction 45 | 46 | -------------------------------------------------------------------------------- /vectorial.sln: -------------------------------------------------------------------------------- 1 | 2 | Microsoft Visual Studio Solution File, Format Version 10.00 3 | # Visual C++ Express 2008 4 | Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "vectorial specsuite", "vectorial.vcproj", "{9450BCE8-02CB-4169-8471-2DFF764817F4}" 5 | EndProject 6 | Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "vectorial benchmark", "vectorialbenchmark.vcproj", "{1E78F64D-C404-4048-8AE6-217089480E8A}" 7 | EndProject 8 | Global 9 | GlobalSection(SolutionConfigurationPlatforms) = preSolution 10 | Debug|Win32 = Debug|Win32 11 | Release Scalar|Win32 = Release Scalar|Win32 12 | Release SSE|Win32 = Release SSE|Win32 13 | EndGlobalSection 14 | GlobalSection(ProjectConfigurationPlatforms) = postSolution 15 | {9450BCE8-02CB-4169-8471-2DFF764817F4}.Debug|Win32.ActiveCfg = Debug|Win32 16 | {9450BCE8-02CB-4169-8471-2DFF764817F4}.Debug|Win32.Build.0 = Debug|Win32 17 | {9450BCE8-02CB-4169-8471-2DFF764817F4}.Release Scalar|Win32.ActiveCfg = Release Scalar|Win32 18 | {9450BCE8-02CB-4169-8471-2DFF764817F4}.Release Scalar|Win32.Build.0 = Release Scalar|Win32 19 | {9450BCE8-02CB-4169-8471-2DFF764817F4}.Release SSE|Win32.ActiveCfg = Release|Win32 20 | {9450BCE8-02CB-4169-8471-2DFF764817F4}.Release 
SSE|Win32.Build.0 = Release|Win32 21 | {1E78F64D-C404-4048-8AE6-217089480E8A}.Debug|Win32.ActiveCfg = Debug|Win32 22 | {1E78F64D-C404-4048-8AE6-217089480E8A}.Debug|Win32.Build.0 = Debug|Win32 23 | {1E78F64D-C404-4048-8AE6-217089480E8A}.Release Scalar|Win32.ActiveCfg = Release Scalar|Win32 24 | {1E78F64D-C404-4048-8AE6-217089480E8A}.Release Scalar|Win32.Build.0 = Release Scalar|Win32 25 | {1E78F64D-C404-4048-8AE6-217089480E8A}.Release SSE|Win32.ActiveCfg = Release|Win32 26 | {1E78F64D-C404-4048-8AE6-217089480E8A}.Release SSE|Win32.Build.0 = Release|Win32 27 | EndGlobalSection 28 | GlobalSection(SolutionProperties) = preSolution 29 | HideSolutionNode = FALSE 30 | EndGlobalSection 31 | EndGlobal 32 | -------------------------------------------------------------------------------- /include/vectorial/simd4f_common.h: -------------------------------------------------------------------------------- 1 | /* 2 | Vectorial 3 | Copyright (c) 2010 Mikko Lehtonen 4 | Copyright (c) 2014 Google, Inc. 5 | Licensed under the terms of the two-clause BSD License (see LICENSE) 6 | */ 7 | #ifndef VECTORIAL_SIMD4F_COMMON_H 8 | #define VECTORIAL_SIMD4F_COMMON_H 9 | 10 | 11 | vectorial_inline simd4f simd4f_sum(simd4f v) { 12 | const simd4f s1 = simd4f_add(simd4f_splat_x(v), simd4f_splat_y(v)); 13 | const simd4f s2 = simd4f_add(s1, simd4f_splat_z(v)); 14 | const simd4f s3 = simd4f_add(s2, simd4f_splat_w(v)); 15 | return s3; 16 | } 17 | 18 | vectorial_inline simd4f simd4f_dot4(simd4f lhs, simd4f rhs) { 19 | return simd4f_sum( simd4f_mul(lhs, rhs) ); 20 | } 21 | 22 | vectorial_inline simd4f simd4f_dot2(simd4f lhs, simd4f rhs) { 23 | const simd4f m = simd4f_mul(lhs, rhs); 24 | const simd4f s1 = simd4f_add(simd4f_splat_x(m), simd4f_splat_y(m)); 25 | return s1; 26 | } 27 | 28 | 29 | vectorial_inline simd4f simd4f_length4(simd4f v) { 30 | return simd4f_sqrt( simd4f_dot4(v,v) ); 31 | } 32 | 33 | vectorial_inline simd4f simd4f_length3(simd4f v) { 34 | return simd4f_sqrt( simd4f_dot3(v,v) ); 35 | } 
36 | 37 | vectorial_inline simd4f simd4f_length2(simd4f v) { 38 | return simd4f_sqrt( simd4f_dot2(v,v) ); 39 | } 40 | 41 | vectorial_inline simd4f simd4f_length4_squared(simd4f v) { 42 | return simd4f_dot4(v,v); 43 | } 44 | 45 | vectorial_inline simd4f simd4f_length3_squared(simd4f v) { 46 | return simd4f_dot3(v,v); 47 | } 48 | 49 | vectorial_inline float simd4f_length3_squared_scalar(simd4f v) { 50 | return simd4f_dot3_scalar(v,v); 51 | } 52 | 53 | vectorial_inline simd4f simd4f_length2_squared(simd4f v) { 54 | return simd4f_dot2(v,v); 55 | } 56 | 57 | 58 | vectorial_inline simd4f simd4f_normalize4(simd4f a) { 59 | simd4f invlen = simd4f_rsqrt( simd4f_dot4(a,a) ); 60 | return simd4f_mul(a, invlen); 61 | } 62 | 63 | vectorial_inline simd4f simd4f_normalize3(simd4f a) { 64 | simd4f invlen = simd4f_rsqrt( simd4f_dot3(a,a) ); 65 | return simd4f_mul(a, invlen); 66 | } 67 | 68 | vectorial_inline simd4f simd4f_normalize2(simd4f a) { 69 | simd4f invlen = simd4f_rsqrt( simd4f_dot2(a,a) ); 70 | return simd4f_mul(a, invlen); 71 | } 72 | 73 | 74 | #endif 75 | -------------------------------------------------------------------------------- /include/vectorial/config.h: -------------------------------------------------------------------------------- 1 | /* 2 | Vectorial 3 | Copyright (c) 2010 Mikko Lehtonen 4 | Licensed under the terms of the two-clause BSD License (see LICENSE) 5 | */ 6 | #ifndef VECTORIAL_CONFIG_H 7 | #define VECTORIAL_CONFIG_H 8 | 9 | 10 | #ifndef VECTORIAL_FORCED 11 | #if defined(__SSE__) || (_M_IX86_FP > 0) || (_M_X64 > 0) 12 | 13 | #define VECTORIAL_SSE 14 | 15 | // __ARM_NEON is used instead of __ARM_NEON__ on armv8. 
16 | #elif defined(__ARM_NEON__) || defined(__ARM_NEON) 17 | 18 | #define VECTORIAL_NEON 19 | 20 | // Don't use gnu extension for arm, buggy with some gccs with armv6 and -Os, 21 | // Also doesn't seem perform as well 22 | #elif defined(__GNUC__) && !defined(__arm__) 23 | 24 | #define VECTORIAL_GNU 25 | 26 | #else 27 | 28 | #define VECTORIAL_SCALAR 29 | 30 | #endif 31 | #endif 32 | 33 | 34 | 35 | #ifdef VECTORIAL_SCALAR 36 | #define VECTORIAL_SIMD_TYPE "scalar" 37 | #endif 38 | 39 | #ifdef VECTORIAL_SSE 40 | #define VECTORIAL_SIMD_TYPE "sse" 41 | #endif 42 | 43 | #ifdef VECTORIAL_NEON 44 | #define VECTORIAL_SIMD_TYPE "neon" 45 | #define VECTORIAL_HAVE_SIMD2F 46 | #endif 47 | 48 | #ifdef VECTORIAL_GNU 49 | #define VECTORIAL_SIMD_TYPE "gnu" 50 | #endif 51 | 52 | 53 | 54 | #if defined(VECTORIAL_FORCED) && !defined(VECTORIAL_SIMD_TYPE) 55 | #error VECTORIAL_FORCED set but no simd-type found, try f.ex. VECTORIAL_SCALAR 56 | #endif 57 | 58 | 59 | #define vectorial_inline static inline 60 | 61 | #if defined(__GNUC__) 62 | #if defined(__cplusplus) 63 | #define vectorial_restrict __restrict 64 | #endif 65 | #define simd4f_aligned16 __attribute__ ((aligned (16))) 66 | #elif defined(_WIN32) 67 | #define vectorial_restrict 68 | #define simd4f_aligned16 __declspec(align(16)) 69 | #else 70 | #define vectorial_restrict restrict 71 | #define simd4f_aligned16 72 | #endif 73 | // #define vectorial_restrict 74 | 75 | #ifdef __GNUC__ 76 | #define vectorial_pure __attribute__((pure)) 77 | #else 78 | #define vectorial_pure 79 | #endif 80 | 81 | #ifdef _WIN32 82 | #if defined(min) || defined(max) 83 | #pragma message ( "set NOMINMAX as preprocessor macro, undefining min/max " ) 84 | #undef min 85 | #undef max 86 | #endif 87 | #endif 88 | 89 | #ifdef __cplusplus 90 | // Hack around msvc badness 91 | #define SIMD_PARAM(t, p) const t& p 92 | #else 93 | #define SIMD_PARAM(t, p) t p 94 | #endif 95 | 96 | #define VECTORIAL_PI 3.14159265f 97 | #define VECTORIAL_HALFPI 1.57079633f 98 | 99 | 100 
| 101 | #endif 102 | -------------------------------------------------------------------------------- /bench/quad_bench.cpp: -------------------------------------------------------------------------------- 1 | 2 | #include "bench.h" 3 | #include 4 | 5 | #include 6 | #include "vectorial/simd4x4f.h" 7 | 8 | #define NUM (81920) 9 | #define ITER 100 10 | //using namespace vectorial; 11 | 12 | namespace { 13 | simd4x4f* alloc_simd4x4f(size_t n) { 14 | void *ptr = memalign(n*sizeof(simd4x4f), 16); 15 | return static_cast(ptr); 16 | } 17 | } 18 | 19 | 20 | 21 | static simd4x4f * a; 22 | static simd4x4f * b; 23 | static simd4x4f * c; 24 | 25 | 26 | 27 | static simd4x4f add_4x4(SIMD_PARAM(simd4x4f, a), SIMD_PARAM(simd4x4f, b)) { 28 | return simd4x4f_create( 29 | simd4f_add(a.x, b.x), 30 | simd4f_add(a.y, b.y), 31 | simd4f_add(a.z, b.z), 32 | simd4f_add(a.w, b.w) 33 | ); 34 | } 35 | 36 | static simd4x4f add_4x4_rp(simd4x4f *a, simd4x4f *b) { 37 | return simd4x4f_create( 38 | simd4f_add(a->x, b->x), 39 | simd4f_add(a->y, b->y), 40 | simd4f_add(a->z, b->z), 41 | simd4f_add(a->w, b->w) 42 | ); 43 | } 44 | 45 | 46 | static void add_4x4_p(simd4x4f *a, simd4x4f *b, simd4x4f *out) { 47 | out->x = simd4f_add(a->x, b->x); 48 | out->y = simd4f_add(a->y, b->y); 49 | out->z = simd4f_add(a->z, b->z); 50 | out->w = simd4f_add(a->w, b->w); 51 | } 52 | 53 | 54 | 55 | 56 | void quad_return_func() { 57 | 58 | 59 | simd4x4f* aa = a; 60 | simd4x4f* bb = b; 61 | simd4x4f* cc = c; 62 | 63 | for(size_t i = 0; i < NUM; ++i) 64 | { 65 | bb[i] = add_4x4(aa[i], bb[i]); 66 | } 67 | } 68 | 69 | 70 | void quad_pointer_func() { 71 | 72 | simd4x4f* aa = a; 73 | simd4x4f* bb = b; 74 | simd4x4f* cc = c; 75 | 76 | for(size_t i = 0; i < NUM; ++i) 77 | { 78 | add_4x4_p(&aa[i], &bb[i], &bb[i]); 79 | } 80 | 81 | 82 | } 83 | 84 | void quad_pointer_return_func() { 85 | 86 | simd4x4f* aa = a; 87 | simd4x4f* bb = b; 88 | simd4x4f* cc = c; 89 | 90 | for(size_t i = 0; i < NUM; ++i) 91 | { 92 | bb[i] = 
add_4x4_rp(&aa[i], &bb[i]); 93 | } 94 | 95 | 96 | } 97 | 98 | 99 | void quad_bench() { 100 | 101 | a = alloc_simd4x4f(NUM); 102 | b = alloc_simd4x4f(NUM); 103 | c = alloc_simd4x4f(NUM); 104 | 105 | 106 | for(size_t i = 0; i < NUM; ++i) 107 | { 108 | simd4f t = simd4f_create(i,i,i,i); 109 | simd4f t2 = simd4f_create(NUM-i,NUM-i,NUM-i,NUM-i); 110 | a[i]=simd4x4f_create(t,t,t,t); 111 | b[i]=simd4x4f_create(t2,t2,t2,t2); 112 | } 113 | 114 | profile("quad return-value", quad_return_func, ITER, NUM); 115 | profile("quad pass-by-pointer", quad_pointer_func, ITER, NUM); 116 | profile("quad pass-by-pointer return-value", quad_pointer_return_func, ITER, NUM); 117 | 118 | memfree(a); 119 | memfree(b); 120 | memfree(c); 121 | 122 | 123 | } 124 | -------------------------------------------------------------------------------- /bench/bench.cpp: -------------------------------------------------------------------------------- 1 | #include "bench.h" 2 | #include 3 | #include 4 | #include "vectorial/config.h" 5 | 6 | 7 | namespace profiler { 8 | 9 | #ifdef BENCH_MACH 10 | mach_timebase_info_data_t info; 11 | void init() { 12 | mach_timebase_info(&info); 13 | } 14 | #endif 15 | 16 | #ifdef BENCH_GTOD 17 | void init() { 18 | } 19 | #endif 20 | 21 | #ifdef BENCH_QPC 22 | double frequency; 23 | void init() { 24 | LARGE_INTEGER freq; 25 | QueryPerformanceFrequency(&freq); 26 | frequency = (double)freq.QuadPart; 27 | } 28 | #endif 29 | 30 | 31 | time_t now() { 32 | 33 | #ifdef BENCH_MACH 34 | return mach_absolute_time(); 35 | #endif 36 | 37 | #ifdef BENCH_GTOD 38 | time_t v; 39 | gettimeofday(&v, NULL); 40 | return v; 41 | #endif 42 | 43 | #ifdef BENCH_QPC 44 | LARGE_INTEGER v; 45 | QueryPerformanceCounter(&v); 46 | return v; 47 | #endif 48 | 49 | } 50 | 51 | 52 | double diffTime(time_t start, time_t end) { 53 | 54 | #ifdef BENCH_GTOD 55 | return (end.tv_sec - start.tv_sec) + (end.tv_usec - start.tv_usec) / 1000000.0; 56 | #endif 57 | 58 | #ifdef BENCH_MACH 59 | return ((end-start) * 
info.numer / info.denom) / 1000000000.0; 60 | #endif 61 | 62 | #ifdef BENCH_QPC 63 | return (end.QuadPart - start.QuadPart) / frequency; 64 | #endif 65 | } 66 | 67 | } 68 | 69 | 70 | std::string formatTime(double d, double relative ) { 71 | const double sec = 1.0; 72 | const double milli = 0.001; 73 | const double micro = 0.000001; 74 | const double nano = 0.000000001; 75 | std::stringstream ss; 76 | if( relative < 0.0) relative=d; 77 | if( relative >= sec ) ss << d << "s"; 78 | else if( relative >= milli ) ss << d/milli << "ms"; 79 | else if( relative >= micro ) ss << d/micro <<"us"; 80 | else ss << d/nano << "ns"; 81 | return ss.str(); 82 | } 83 | 84 | void profile(const char* name, void (*func)(), int iterations, int elements) { 85 | 86 | profiler::init(); 87 | profiler::time_t start = profiler::now(); 88 | for(int i = 0; i < iterations; ++i) 89 | { 90 | func(); 91 | } 92 | profiler::time_t end = profiler::now(); 93 | 94 | std::cout << "Using simd: " << VECTORIAL_SIMD_TYPE << std::endl; 95 | std::cout << "Testing: " << name << std::endl; 96 | std::cout << "Duration " << formatTime(profiler::diffTime(start,end)) << std::endl; 97 | std::cout << "Per iter " << formatTime(profiler::diffTime(start,end) / iterations) << std::endl; 98 | std::cout << "Per item " << formatTime(profiler::diffTime(start,end) / iterations / elements) << std::endl; 99 | 100 | 101 | } 102 | 103 | void add_bench(); 104 | void dot_bench(); 105 | void quad_bench(); 106 | void matrix_bench(); 107 | 108 | int main() { 109 | 110 | // add_bench(); 111 | // dot_bench(); 112 | // quad_bench(); 113 | matrix_bench(); 114 | 115 | return 0; 116 | } 117 | 118 | -------------------------------------------------------------------------------- /include/vectorial/simd2f_neon.h: -------------------------------------------------------------------------------- 1 | /* 2 | Vectorial 3 | Copyright (c) 2010 Mikko Lehtonen 4 | Copyright (c) 2014 Google, Inc. 
5 | Licensed under the terms of the two-clause BSD License (see LICENSE) 6 | */ 7 | #ifndef VECTORIAL_SIMD2F_NEON_H 8 | #define VECTORIAL_SIMD2F_NEON_H 9 | 10 | #include 11 | 12 | #ifdef __cplusplus 13 | extern "C" { 14 | #endif 15 | 16 | 17 | typedef float32x2_t simd2f; 18 | 19 | typedef union { 20 | simd2f s ; 21 | float f[2]; 22 | } _simd2f_union; 23 | 24 | 25 | 26 | vectorial_inline simd2f simd2f_create(float x, float y) { 27 | const float32_t d[2] = { x,y }; 28 | simd2f s = vld1_f32(d); 29 | return s; 30 | } 31 | 32 | vectorial_inline simd2f simd2f_zero() { return vdup_n_f32(0.0f); } 33 | 34 | vectorial_inline simd2f simd2f_uload2(const float *ary) { 35 | const float32_t* ary32 = (const float32_t*)ary; 36 | simd2f s = vld1_f32(ary32); 37 | return s; 38 | } 39 | 40 | vectorial_inline void simd2f_ustore2(const simd2f val, float *ary) { 41 | vst1_f32( (float32_t*)ary, val); 42 | } 43 | 44 | vectorial_inline simd2f simd2f_splat(float v) { 45 | simd2f s = vdup_n_f32(v); 46 | return s; 47 | } 48 | 49 | vectorial_inline simd2f simd2f_splat_x(simd2f v) { 50 | simd2f ret = vdup_lane_f32(v, 0); 51 | return ret; 52 | } 53 | 54 | vectorial_inline simd2f simd2f_splat_y(simd2f v) { 55 | simd2f ret = vdup_lane_f32(v, 1); 56 | return ret; 57 | } 58 | 59 | vectorial_inline simd2f simd2f_reciprocal(simd2f v) { 60 | simd2f estimate = vrecpe_f32(v); 61 | estimate = vmul_f32(vrecps_f32(estimate, v), estimate); 62 | estimate = vmul_f32(vrecps_f32(estimate, v), estimate); 63 | return estimate; 64 | } 65 | 66 | vectorial_inline void simd2f_rsqrt_1iteration(const simd2f& v, simd2f& estimate) { 67 | simd2f estimate2 = vmul_f32(estimate, v); 68 | estimate = vmul_f32(estimate, vrsqrts_f32(estimate2, estimate)); 69 | } 70 | 71 | vectorial_inline simd2f simd2f_rsqrt1(simd2f v) { 72 | simd2f estimate = vrsqrte_f32(v); 73 | simd2f_rsqrt_1iteration(v, estimate); 74 | return estimate; 75 | } 76 | 77 | vectorial_inline simd2f simd2f_rsqrt2(simd2f v) { 78 | simd2f estimate = vrsqrte_f32(v); 79 | 
simd2f_rsqrt_1iteration(v, estimate); 80 | simd2f_rsqrt_1iteration(v, estimate); 81 | return estimate; 82 | } 83 | 84 | vectorial_inline simd2f simd2f_rsqrt3(simd2f v) { 85 | simd2f estimate = vrsqrte_f32(v); 86 | simd2f_rsqrt_1iteration(v, estimate); 87 | simd2f_rsqrt_1iteration(v, estimate); 88 | simd2f_rsqrt_1iteration(v, estimate); 89 | return estimate; 90 | } 91 | 92 | // http://en.wikipedia.org/wiki/Fast_inverse_square_root makes the argument for 93 | // one iteration but two gives a significant accuracy improvement. 94 | vectorial_inline simd2f simd2f_rsqrt(simd2f v) { 95 | return simd2f_rsqrt2(v); 96 | } 97 | 98 | vectorial_inline simd2f simd2f_sqrt(simd2f v) { 99 | 100 | return vreinterpret_f32_u32(vand_u32( vtst_u32(vreinterpret_u32_f32(v), 101 | vreinterpret_u32_f32(v)), 102 | vreinterpret_u32_f32( 103 | simd2f_reciprocal(simd2f_rsqrt(v))) 104 | ) 105 | ); 106 | 107 | } 108 | 109 | // arithmetics 110 | 111 | vectorial_inline simd2f simd2f_add(simd2f lhs, simd2f rhs) { 112 | simd2f ret = vadd_f32(lhs, rhs); 113 | return ret; 114 | } 115 | 116 | vectorial_inline simd2f simd2f_sub(simd2f lhs, simd2f rhs) { 117 | simd2f ret = vsub_f32(lhs, rhs); 118 | return ret; 119 | } 120 | 121 | vectorial_inline simd2f simd2f_mul(simd2f lhs, simd2f rhs) { 122 | simd2f ret = vmul_f32(lhs, rhs); 123 | return ret; 124 | } 125 | 126 | vectorial_inline simd2f simd2f_div(simd2f lhs, simd2f rhs) { 127 | simd2f recip = simd2f_reciprocal( rhs ); 128 | simd2f ret = vmul_f32(lhs, recip); 129 | return ret; 130 | } 131 | 132 | vectorial_inline simd2f simd2f_madd(simd2f m1, simd2f m2, simd2f a) { 133 | return vmla_f32( a, m1, m2 ); 134 | } 135 | 136 | vectorial_inline float simd2f_get_x(simd2f s) { return vget_lane_f32(s, 0); } 137 | vectorial_inline float simd2f_get_y(simd2f s) { return vget_lane_f32(s, 1); } 138 | 139 | vectorial_inline simd2f simd2f_dot2(simd2f lhs, simd2f rhs) { 140 | const simd2f m = simd2f_mul(lhs, rhs); 141 | return vpadd_f32(m, m); 142 | } 143 | 144 | 
vectorial_inline simd2f simd2f_min(simd2f a, simd2f b) { 145 | return vmin_f32( a, b ); 146 | } 147 | 148 | vectorial_inline simd2f simd2f_max(simd2f a, simd2f b) { 149 | return vmax_f32( a, b ); 150 | } 151 | 152 | 153 | #ifdef __cplusplus 154 | } 155 | #endif 156 | 157 | 158 | #endif 159 | 160 | -------------------------------------------------------------------------------- /include/vectorial/simd4f_scalar.h: -------------------------------------------------------------------------------- 1 | /* 2 | Vectorial 3 | Copyright (c) 2010 Mikko Lehtonen 4 | Licensed under the terms of the two-clause BSD License (see LICENSE) 5 | */ 6 | #ifndef VECTORIAL_SIMD4F_SCALAR_H 7 | #define VECTORIAL_SIMD4F_SCALAR_H 8 | 9 | #include 10 | #include // memcpy 11 | 12 | #ifdef __cplusplus 13 | extern "C" { 14 | #endif 15 | 16 | 17 | typedef struct { 18 | float x; 19 | float y; 20 | float z; 21 | float w; 22 | } simd4f; 23 | 24 | 25 | 26 | vectorial_inline simd4f simd4f_create(float x, float y, float z, float w) { 27 | simd4f s = { x, y, z, w }; 28 | return s; 29 | } 30 | 31 | vectorial_inline simd4f simd4f_zero() { return simd4f_create(0.0f, 0.0f, 0.0f, 0.0f); } 32 | 33 | vectorial_inline simd4f simd4f_uload4(const float *ary) { 34 | simd4f s = { ary[0], ary[1], ary[2], ary[3] }; 35 | return s; 36 | } 37 | 38 | vectorial_inline simd4f simd4f_uload3(const float *ary) { 39 | simd4f s = { ary[0], ary[1], ary[2], 0 }; 40 | return s; 41 | } 42 | 43 | vectorial_inline simd4f simd4f_uload2(const float *ary) { 44 | simd4f s = { ary[0], ary[1], 0, 0 }; 45 | return s; 46 | } 47 | 48 | 49 | vectorial_inline void simd4f_ustore4(const simd4f val, float *ary) { 50 | memcpy(ary, &val, sizeof(float) * 4); 51 | } 52 | 53 | vectorial_inline void simd4f_ustore3(const simd4f val, float *ary) { 54 | memcpy(ary, &val, sizeof(float) * 3); 55 | } 56 | 57 | vectorial_inline void simd4f_ustore2(const simd4f val, float *ary) { 58 | memcpy(ary, &val, sizeof(float) * 2); 59 | } 60 | 61 | 62 | 63 | // utilities 
64 | vectorial_inline simd4f simd4f_splat(float v) { 65 | simd4f s = { v, v, v, v }; 66 | return s; 67 | } 68 | 69 | vectorial_inline simd4f simd4f_splat_x(simd4f v) { 70 | simd4f s = { v.x, v.x, v.x, v.x }; 71 | return s; 72 | } 73 | 74 | vectorial_inline simd4f simd4f_splat_y(simd4f v) { 75 | simd4f s = { v.y, v.y, v.y, v.y }; 76 | return s; 77 | } 78 | 79 | vectorial_inline simd4f simd4f_splat_z(simd4f v) { 80 | simd4f s = { v.z, v.z, v.z, v.z }; 81 | return s; 82 | } 83 | 84 | vectorial_inline simd4f simd4f_splat_w(simd4f v) { 85 | simd4f s = { v.w, v.w, v.w, v.w }; 86 | return s; 87 | } 88 | 89 | vectorial_inline simd4f simd4f_reciprocal(simd4f v) { 90 | simd4f s = { 1.0f/v.x, 1.0f/v.y, 1.0f/v.z, 1.0f/v.w }; 91 | return s; 92 | } 93 | 94 | vectorial_inline simd4f simd4f_sqrt(simd4f v) { 95 | simd4f s = { sqrtf(v.x), sqrtf(v.y), sqrtf(v.z), sqrtf(v.w) }; 96 | return s; 97 | } 98 | 99 | vectorial_inline simd4f simd4f_rsqrt(simd4f v) { 100 | simd4f s = { 1.0f/sqrtf(v.x), 1.0f/sqrtf(v.y), 1.0f/sqrtf(v.z), 1.0f/sqrtf(v.w) }; 101 | return s; 102 | } 103 | 104 | 105 | // arithmetic 106 | 107 | vectorial_inline simd4f simd4f_add(simd4f lhs, simd4f rhs) { 108 | simd4f ret = { lhs.x + rhs.x, lhs.y + rhs.y, lhs.z + rhs.z, lhs.w + rhs.w }; 109 | return ret; 110 | } 111 | 112 | vectorial_inline simd4f simd4f_sub(simd4f lhs, simd4f rhs) { 113 | simd4f ret = { lhs.x - rhs.x, lhs.y - rhs.y, lhs.z - rhs.z, lhs.w - rhs.w }; 114 | return ret; 115 | } 116 | 117 | vectorial_inline simd4f simd4f_mul(simd4f lhs, simd4f rhs) { 118 | simd4f ret = { lhs.x * rhs.x, lhs.y * rhs.y, lhs.z * rhs.z, lhs.w * rhs.w }; 119 | return ret; 120 | } 121 | 122 | vectorial_inline simd4f simd4f_div(simd4f lhs, simd4f rhs) { 123 | simd4f ret = { lhs.x / rhs.x, lhs.y / rhs.y, lhs.z / rhs.z, lhs.w / rhs.w }; 124 | return ret; 125 | } 126 | 127 | vectorial_inline simd4f simd4f_madd(simd4f m1, simd4f m2, simd4f a) { 128 | return simd4f_add( simd4f_mul(m1, m2), a ); 129 | } 130 | 131 | vectorial_inline float 
simd4f_dot3_scalar(simd4f lhs, simd4f rhs) { 132 | return lhs.x * rhs.x + lhs.y * rhs.y + lhs.z * rhs.z; 133 | } 134 | 135 | vectorial_inline simd4f simd4f_dot3(simd4f lhs, simd4f rhs) { 136 | return simd4f_splat( simd4f_dot3_scalar(lhs, rhs) ); 137 | } 138 | 139 | vectorial_inline simd4f simd4f_cross3(simd4f lhs, simd4f rhs) { 140 | return simd4f_create( lhs.y * rhs.z - lhs.z * rhs.y, 141 | lhs.z * rhs.x - lhs.x * rhs.z, 142 | lhs.x * rhs.y - lhs.y * rhs.x, 0); 143 | } 144 | 145 | 146 | vectorial_inline float simd4f_get_x(simd4f s) { return s.x; } 147 | vectorial_inline float simd4f_get_y(simd4f s) { return s.y; } 148 | vectorial_inline float simd4f_get_z(simd4f s) { return s.z; } 149 | vectorial_inline float simd4f_get_w(simd4f s) { return s.w; } 150 | 151 | 152 | vectorial_inline simd4f simd4f_shuffle_wxyz(simd4f s) { return simd4f_create(s.w, s.x, s.y, s.z); } 153 | vectorial_inline simd4f simd4f_shuffle_zwxy(simd4f s) { return simd4f_create(s.z, s.w, s.x, s.y); } 154 | vectorial_inline simd4f simd4f_shuffle_yzwx(simd4f s) { return simd4f_create(s.y, s.z, s.w, s.x); } 155 | 156 | 157 | vectorial_inline simd4f simd4f_zero_w(simd4f s) { 158 | return simd4f_create(s.x, s.y, s.z, 0.0f); 159 | } 160 | 161 | vectorial_inline simd4f simd4f_zero_zw(simd4f s) { 162 | return simd4f_create(s.x, s.y, 0.0f, 0.0f); 163 | } 164 | 165 | 166 | vectorial_inline simd4f simd4f_merge_high(simd4f abcd, simd4f xyzw) { 167 | return simd4f_create(abcd.z, abcd.w, xyzw.z, xyzw.w); 168 | } 169 | 170 | vectorial_inline simd4f simd4f_flip_sign_0101(simd4f s) { 171 | return simd4f_create(s.x, -s.y, s.z, -s.w); 172 | } 173 | 174 | vectorial_inline simd4f simd4f_flip_sign_1010(simd4f s) { 175 | return simd4f_create(-s.x, s.y, -s.z, s.w); 176 | } 177 | 178 | vectorial_inline simd4f simd4f_min(simd4f a, simd4f b) { 179 | return simd4f_create( a.x < b.x ? a.x : b.x, 180 | a.y < b.y ? a.y : b.y, 181 | a.z < b.z ? a.z : b.z, 182 | a.w < b.w ? 
a.w : b.w ); 183 | } 184 | 185 | vectorial_inline simd4f simd4f_max(simd4f a, simd4f b) { 186 | return simd4f_create( a.x > b.x ? a.x : b.x, 187 | a.y > b.y ? a.y : b.y, 188 | a.z > b.z ? a.z : b.z, 189 | a.w > b.w ? a.w : b.w ); 190 | } 191 | 192 | 193 | #ifdef __cplusplus 194 | } 195 | #endif 196 | 197 | 198 | #endif 199 | 200 | -------------------------------------------------------------------------------- /include/vectorial/vec2f.h: -------------------------------------------------------------------------------- 1 | /* 2 | Vectorial 3 | Copyright (c) 2010 Mikko Lehtonen 4 | Licensed under the terms of the two-clause BSD License (see LICENSE) 5 | */ 6 | #ifndef VECTORIAL_VEC2F_H 7 | #define VECTORIAL_VEC2F_H 8 | #ifndef VECTORIAL_SIMD4F_H 9 | #include "vectorial/simd4f.h" 10 | #endif 11 | 12 | 13 | 14 | namespace vectorial { 15 | 16 | class vec4f; 17 | class vec3f; 18 | 19 | class vec2f { 20 | public: 21 | 22 | simd4f value; 23 | 24 | inline vec2f() {} 25 | inline vec2f(const vec2f& v) : value(v.value) {} 26 | inline vec2f(const simd4f& v) : value(v) {} 27 | explicit inline vec2f(float xy) : value( simd4f_splat(xy) ) {} 28 | inline vec2f(float x, float y) : value( simd4f_create(x,y,0,0) ) {} 29 | explicit inline vec2f(const float *ary) : value( simd4f_uload2(ary) ) { } 30 | 31 | inline float x() const { return simd4f_get_x(value); } 32 | inline float y() const { return simd4f_get_y(value); } 33 | 34 | inline void load(const float *ary) { value = simd4f_uload2(ary); } 35 | inline void store(float *ary) const { simd4f_ustore2(value, ary); } 36 | 37 | enum { elements = 2 }; 38 | 39 | static vec2f zero() { return vec2f(simd4f_zero()); } 40 | static vec2f one() { return vec2f(1.0f); } 41 | static vec2f xAxis() { return vec2f(1.0f, 0.0f); } 42 | static vec2f yAxis() { return vec2f(0.0f, 1.0f); } 43 | 44 | inline vec4f xyzw(float z, float w) const; 45 | inline vec4f xy00() const; 46 | inline vec4f xy01() const; 47 | inline vec3f xyz(float z) const; 48 | inline vec3f xy0() const; 49 | inline 
vec2f xy() const; 50 | 51 | }; 52 | 53 | vectorial_inline vec2f operator-(const vec2f& lhs) { 54 | return vec2f( simd4f_sub(simd4f_zero(), lhs.value) ); 55 | } 56 | 57 | 58 | vectorial_inline vec2f operator+(const vec2f& lhs, const vec2f& rhs) { 59 | return vec2f( simd4f_add(lhs.value, rhs.value) ); 60 | } 61 | 62 | vectorial_inline vec2f operator-(const vec2f& lhs, const vec2f& rhs) { 63 | return vec2f( simd4f_sub(lhs.value, rhs.value) ); 64 | } 65 | 66 | vectorial_inline vec2f operator*(const vec2f& lhs, const vec2f& rhs) { 67 | return vec2f( simd4f_mul(lhs.value, rhs.value) ); 68 | } 69 | 70 | vectorial_inline vec2f operator/(const vec2f& lhs, const vec2f& rhs) { 71 | return vec2f( simd4f_div(lhs.value, rhs.value) ); 72 | } 73 | 74 | 75 | vectorial_inline vec2f operator+=(vec2f& lhs, const vec2f& rhs) { 76 | return lhs = vec2f( simd4f_add(lhs.value, rhs.value) ); 77 | } 78 | 79 | vectorial_inline vec2f operator-=(vec2f& lhs, const vec2f& rhs) { 80 | return lhs = vec2f( simd4f_sub(lhs.value, rhs.value) ); 81 | } 82 | 83 | vectorial_inline vec2f operator*=(vec2f& lhs, const vec2f& rhs) { 84 | return lhs = vec2f( simd4f_mul(lhs.value, rhs.value) ); 85 | } 86 | 87 | vectorial_inline vec2f operator/=(vec2f& lhs, const vec2f& rhs) { 88 | return lhs = vec2f( simd4f_div(lhs.value, rhs.value) ); 89 | } 90 | 91 | 92 | 93 | vectorial_inline vec2f operator+(const vec2f& lhs, float rhs) { 94 | return vec2f( simd4f_add(lhs.value, simd4f_splat(rhs)) ); 95 | } 96 | 97 | vectorial_inline vec2f operator-(const vec2f& lhs, float rhs) { 98 | return vec2f( simd4f_sub(lhs.value, simd4f_splat(rhs)) ); 99 | } 100 | 101 | vectorial_inline vec2f operator*(const vec2f& lhs, float rhs) { 102 | return vec2f( simd4f_mul(lhs.value, simd4f_splat(rhs)) ); 103 | } 104 | 105 | vectorial_inline vec2f operator/(const vec2f& lhs, float rhs) { 106 | return vec2f( simd4f_div(lhs.value, simd4f_splat(rhs)) ); 107 | } 108 | 109 | vectorial_inline vec2f operator+(float lhs, const vec2f& rhs) { 110 | 
return vec2f( simd4f_add(simd4f_splat(lhs), rhs.value) ); 111 | } 112 | 113 | vectorial_inline vec2f operator-(float lhs, const vec2f& rhs) { 114 | return vec2f( simd4f_sub(simd4f_splat(lhs), rhs.value) ); 115 | } 116 | 117 | vectorial_inline vec2f operator*(float lhs, const vec2f& rhs) { 118 | return vec2f( simd4f_mul(simd4f_splat(lhs), rhs.value) ); 119 | } 120 | 121 | vectorial_inline vec2f operator/(float lhs, const vec2f& rhs) { 122 | return vec2f( simd4f_div(simd4f_splat(lhs), rhs.value) ); 123 | } 124 | 125 | 126 | vectorial_inline vec2f operator+=(vec2f& lhs, float rhs) { 127 | return lhs = vec2f( simd4f_add(lhs.value, simd4f_splat(rhs)) ); 128 | } 129 | 130 | vectorial_inline vec2f operator-=(vec2f& lhs, float rhs) { 131 | return lhs = vec2f( simd4f_sub(lhs.value, simd4f_splat(rhs)) ); 132 | } 133 | 134 | vectorial_inline vec2f operator*=(vec2f& lhs, float rhs) { 135 | return lhs = vec2f( simd4f_mul(lhs.value, simd4f_splat(rhs)) ); 136 | } 137 | 138 | vectorial_inline vec2f operator/=(vec2f& lhs, float rhs) { 139 | return lhs = vec2f( simd4f_div(lhs.value, simd4f_splat(rhs)) ); 140 | } 141 | 142 | 143 | vectorial_inline float dot(const vec2f& lhs, const vec2f& rhs) { 144 | return simd4f_get_x( simd4f_dot2(lhs.value, rhs.value) ); 145 | } 146 | 147 | 148 | vectorial_inline float length(const vec2f& v) { 149 | return simd4f_get_x( simd4f_length2(v.value) ); 150 | } 151 | 152 | vectorial_inline float length_squared(const vec2f& v) { 153 | return simd4f_get_x( simd4f_length2_squared(v.value) ); 154 | } 155 | 156 | vectorial_inline vec2f normalize(const vec2f& v) { 157 | return vec2f( simd4f_normalize2(v.value) ); 158 | } 159 | 160 | vectorial_inline vec2f min(const vec2f& a, const vec2f& b) { 161 | return vec2f( simd4f_min(a.value, b.value) ); 162 | } 163 | 164 | vectorial_inline vec2f max(const vec2f& a, const vec2f& b) { 165 | return vec2f( simd4f_max(a.value, b.value) ); 166 | } 167 | 168 | 169 | } 170 | 171 | 172 | namespace std { 173 | inline 
::vectorial::vec2f min(const ::vectorial::vec2f& a, const ::vectorial::vec2f& b) { return ::vectorial::min(a,b); } 174 | inline ::vectorial::vec2f max(const ::vectorial::vec2f& a, const ::vectorial::vec2f& b) { return ::vectorial::max(a,b); } 175 | } 176 | 177 | 178 | #ifdef VECTORIAL_OSTREAM 179 | #include 180 | 181 | vectorial_inline std::ostream& operator<<(std::ostream& os, const vectorial::vec2f& v) { 182 | os << "[ " << v.x() << ", " 183 | << v.y() << " ]"; 184 | return os; 185 | } 186 | #endif 187 | 188 | 189 | 190 | 191 | #endif 192 | -------------------------------------------------------------------------------- /include/vectorial/mat4f.h: -------------------------------------------------------------------------------- 1 | /* 2 | Vectorial 3 | Copyright (c) 2010 Mikko Lehtonen 4 | Licensed under the terms of the two-clause BSD License (see LICENSE) 5 | */ 6 | #ifndef VECTORIAL_MAT4F_H 7 | #define VECTORIAL_MAT4F_H 8 | 9 | #ifndef VECTORIAL_SIMD4X4F_H 10 | #include "vectorial/simd4x4f.h" 11 | #endif 12 | 13 | #ifndef VECTORIAL_VEC4F_H 14 | #include "vectorial/vec4f.h" 15 | #endif 16 | 17 | 18 | namespace vectorial { 19 | 20 | 21 | class mat4f { 22 | public: 23 | 24 | simd4x4f value; 25 | 26 | inline mat4f() {} 27 | inline mat4f(const mat4f& m) : value(m.value) {} 28 | inline mat4f(const simd4x4f& v) : value(v) {} 29 | inline mat4f(const vec4f& v0, const vec4f& v1, const vec4f& v2, const vec4f& v3) : value(simd4x4f_create(v0.value, v1.value, v2.value, v3.value)) {} 30 | explicit inline mat4f(const float *ary) { simd4x4f_uload(&value, ary); } 31 | 32 | inline void load(const float *ary) { 33 | value.x = simd4f_uload4(ary); 34 | value.y = simd4f_uload4(ary+4); 35 | value.z = simd4f_uload4(ary+8); 36 | value.w = simd4f_uload4(ary+12); 37 | } 38 | 39 | inline void store(float *ary) const { 40 | simd4f_ustore4(value.x, ary); 41 | simd4f_ustore4(value.y, ary+4); 42 | simd4f_ustore4(value.z, ary+8); 43 | simd4f_ustore4(value.w, ary+12); 44 | } 45 | 46 | static 
mat4f identity() { mat4f m; simd4x4f_identity(&m.value); return m; } 47 | 48 | static mat4f perspective(float fovy, float aspect, float znear, float zfar) { 49 | simd4x4f m; 50 | simd4x4f_perspective(&m, fovy, aspect, znear, zfar); 51 | return m; 52 | } 53 | 54 | static mat4f ortho(float left, float right, float bottom, float top, float znear, float zfar) { 55 | simd4x4f m; 56 | simd4x4f_ortho(&m, left, right, bottom, top, znear, zfar); 57 | return m; 58 | } 59 | 60 | static mat4f lookAt(const vec3f& eye, const vec3f& center, const vec3f& up) { 61 | simd4x4f m; 62 | simd4x4f_lookat(&m, eye.value, center.value, up.value); 63 | return m; 64 | } 65 | 66 | static mat4f translation(const vec3f& pos) { 67 | simd4x4f m; 68 | simd4x4f_translation(&m, pos.x(), pos.y(), pos.z()); 69 | return m; 70 | } 71 | 72 | static mat4f axisRotation(float angle, const vec3f& axis) { 73 | simd4x4f m; 74 | simd4x4f_axis_rotation(&m, angle, axis.value); 75 | return m; 76 | } 77 | 78 | static mat4f scale(float scale) { 79 | return simd4x4f_create( simd4f_create(scale,0,0,0), 80 | simd4f_create(0,scale,0,0), 81 | simd4f_create(0,0,scale,0), 82 | simd4f_create(0,0,0,1) ); 83 | } 84 | 85 | static mat4f scale(const vec3f& scale) { 86 | return simd4x4f_create( simd4f_create(scale.x(),0,0,0), 87 | simd4f_create(0,scale.y(),0,0), 88 | simd4f_create(0,0,scale.z(),0), 89 | simd4f_create(0,0,0,1) ); 90 | } 91 | 92 | }; 93 | 94 | 95 | vectorial_inline mat4f operator*(const mat4f& lhs, const mat4f& rhs) { 96 | mat4f ret; 97 | simd4x4f_matrix_mul(&lhs.value, &rhs.value, &ret.value); 98 | return ret; 99 | } 100 | 101 | vectorial_inline mat4f operator*=(mat4f& lhs, const mat4f& rhs) { 102 | const simd4x4f tmp = lhs.value; 103 | simd4x4f_matrix_mul(&tmp, &rhs.value, &lhs.value); 104 | return lhs; 105 | } 106 | 107 | 108 | vectorial_inline vec4f operator*(const mat4f& lhs, const vec4f& rhs) { 109 | vec4f ret; 110 | simd4x4f_matrix_vector_mul(&lhs.value, &rhs.value, &ret.value); 111 | return ret; 112 | } 113 
| 114 | vectorial_inline vec3f transformVector(const mat4f& lhs, const vec3f& rhs) { 115 | vec3f ret; 116 | simd4x4f_matrix_vector3_mul(&lhs.value, &rhs.value, &ret.value); 117 | return ret; 118 | } 119 | 120 | vectorial_inline vec4f transformVector(const mat4f& lhs, const vec4f& rhs) { 121 | vec4f ret; 122 | simd4x4f_matrix_vector_mul(&lhs.value, &rhs.value, &ret.value); 123 | return ret; 124 | } 125 | 126 | vectorial_inline vec3f transformPoint(const mat4f& lhs, const vec3f& rhs) { 127 | vec3f ret; 128 | simd4x4f_matrix_point3_mul(&lhs.value, &rhs.value, &ret.value); 129 | return ret; 130 | } 131 | 132 | vectorial_inline vec3f orthoInverseTransformPoint(const mat4f& lhs, const vec3f& rhs) { 133 | vec3f ret; 134 | simd4x4f_inv_ortho_matrix_point3_mul(&lhs.value, &rhs.value, &ret.value); 135 | return ret; 136 | } 137 | 138 | vectorial_inline vec3f orthoInverseTransformVector(const mat4f& lhs, const vec3f& rhs) { 139 | vec3f ret; 140 | simd4x4f_inv_ortho_matrix_vector3_mul(&lhs.value, &rhs.value, &ret.value); 141 | return ret; 142 | } 143 | 144 | 145 | vectorial_inline mat4f transpose(const mat4f& m) { 146 | mat4f ret; 147 | simd4x4f_transpose(&m.value, &ret.value); 148 | return ret; 149 | } 150 | 151 | 152 | vectorial_inline mat4f inverse(const mat4f& m) { 153 | mat4f ret; 154 | simd4x4f_inverse(&m.value, &ret.value); 155 | return ret; 156 | } 157 | 158 | 159 | 160 | } 161 | 162 | 163 | 164 | #ifdef VECTORIAL_OSTREAM 165 | //#include 166 | 167 | vectorial_inline std::ostream& operator<<(std::ostream& os, const vectorial::mat4f& v) { 168 | 169 | os << "[ "; 170 | os << simd4f_get_x(v.value.x) << ", "; 171 | os << simd4f_get_x(v.value.y) << ", "; 172 | os << simd4f_get_x(v.value.z) << ", "; 173 | os << simd4f_get_x(v.value.w) << " ; "; 174 | 175 | os << simd4f_get_y(v.value.x) << ", "; 176 | os << simd4f_get_y(v.value.y) << ", "; 177 | os << simd4f_get_y(v.value.z) << ", "; 178 | os << simd4f_get_y(v.value.w) << " ; "; 179 | 180 | os << simd4f_get_z(v.value.x) << ", 
"; 181 | os << simd4f_get_z(v.value.y) << ", "; 182 | os << simd4f_get_z(v.value.z) << ", "; 183 | os << simd4f_get_z(v.value.w) << " ; "; 184 | 185 | os << simd4f_get_w(v.value.x) << ", "; 186 | os << simd4f_get_w(v.value.y) << ", "; 187 | os << simd4f_get_w(v.value.z) << ", "; 188 | os << simd4f_get_w(v.value.w) << " ]"; 189 | 190 | return os; 191 | } 192 | #endif 193 | 194 | 195 | 196 | 197 | #endif 198 | -------------------------------------------------------------------------------- /include/vectorial/vec4f.h: -------------------------------------------------------------------------------- 1 | /* 2 | Vectorial 3 | Copyright (c) 2010 Mikko Lehtonen 4 | Licensed under the terms of the two-clause BSD License (see LICENSE) 5 | */ 6 | #ifndef VECTORIAL_VEC4F_H 7 | #define VECTORIAL_VEC4F_H 8 | 9 | #ifndef VECTORIAL_SIMD4F_H 10 | #include "vectorial/simd4f.h" 11 | #endif 12 | 13 | 14 | 15 | namespace vectorial { 16 | 17 | class vec3f; 18 | class vec2f; 19 | 20 | class vec4f { 21 | public: 22 | 23 | simd4f value; 24 | 25 | inline vec4f() {} 26 | inline vec4f(const vec4f& v) : value(v.value) {} 27 | inline vec4f(const simd4f& v) : value(v) {} 28 | explicit inline vec4f(float xyzw) : value( simd4f_splat(xyzw) ) {} 29 | inline vec4f(float x, float y, float z, float w) : value( simd4f_create(x,y,z,w) ) {} 30 | explicit inline vec4f(const float *ary) : value( simd4f_uload4(ary) ) { } 31 | 32 | inline float x() const { return simd4f_get_x(value); } 33 | inline float y() const { return simd4f_get_y(value); } 34 | inline float z() const { return simd4f_get_z(value); } 35 | inline float w() const { return simd4f_get_w(value); } 36 | 37 | inline void load(const float *ary) { value = simd4f_uload4(ary); } 38 | inline void store(float *ary) const { simd4f_ustore4(value, ary); } 39 | 40 | enum { elements = 4 }; 41 | 42 | 43 | static vec4f zero() { return vec4f(simd4f_zero()); } 44 | static vec4f one() { return vec4f(1.0f); } 45 | static vec4f xAxis() { return vec4f(1.0f, 0.0f, 
0.0f, 0.0f); } 46 | static vec4f yAxis() { return vec4f(0.0f, 1.0f, 0.0f, 0.0f); } 47 | static vec4f zAxis() { return vec4f(0.0f, 0.0f, 1.0f, 0.0f); } 48 | static vec4f wAxis() { return vec4f(0.0f, 0.0f, 0.0f, 1.0f); } 49 | 50 | 51 | inline vec3f xyz() const; 52 | inline vec2f xy() const; 53 | 54 | }; 55 | 56 | 57 | vectorial_inline vec4f operator-(const vec4f& lhs) { 58 | return vec4f( simd4f_sub(simd4f_zero(), lhs.value) ); 59 | } 60 | 61 | 62 | vectorial_inline vec4f operator+(const vec4f& lhs, const vec4f& rhs) { 63 | return vec4f( simd4f_add(lhs.value, rhs.value) ); 64 | } 65 | 66 | vectorial_inline vec4f operator-(const vec4f& lhs, const vec4f& rhs) { 67 | return vec4f( simd4f_sub(lhs.value, rhs.value) ); 68 | } 69 | 70 | vectorial_inline vec4f operator*(const vec4f& lhs, const vec4f& rhs) { 71 | return vec4f( simd4f_mul(lhs.value, rhs.value) ); 72 | } 73 | 74 | vectorial_inline vec4f operator/(const vec4f& lhs, const vec4f& rhs) { 75 | return vec4f( simd4f_div(lhs.value, rhs.value) ); 76 | } 77 | 78 | 79 | vectorial_inline vec4f operator+=(vec4f& lhs, const vec4f& rhs) { 80 | return lhs = vec4f( simd4f_add(lhs.value, rhs.value) ); 81 | } 82 | 83 | vectorial_inline vec4f operator-=(vec4f& lhs, const vec4f& rhs) { 84 | return lhs = vec4f( simd4f_sub(lhs.value, rhs.value) ); 85 | } 86 | 87 | vectorial_inline vec4f operator*=(vec4f& lhs, const vec4f& rhs) { 88 | return lhs = vec4f( simd4f_mul(lhs.value, rhs.value) ); 89 | } 90 | 91 | vectorial_inline vec4f operator/=(vec4f& lhs, const vec4f& rhs) { 92 | return lhs = vec4f( simd4f_div(lhs.value, rhs.value) ); 93 | } 94 | 95 | 96 | 97 | vectorial_inline vec4f operator+(const vec4f& lhs, float rhs) { 98 | return vec4f( simd4f_add(lhs.value, simd4f_splat(rhs)) ); 99 | } 100 | 101 | vectorial_inline vec4f operator-(const vec4f& lhs, float rhs) { 102 | return vec4f( simd4f_sub(lhs.value, simd4f_splat(rhs)) ); 103 | } 104 | 105 | vectorial_inline vec4f operator*(const vec4f& lhs, float rhs) { 106 | return vec4f( 
simd4f_mul(lhs.value, simd4f_splat(rhs)) ); 107 | } 108 | 109 | vectorial_inline vec4f operator/(const vec4f& lhs, float rhs) { 110 | return vec4f( simd4f_div(lhs.value, simd4f_splat(rhs)) ); 111 | } 112 | 113 | vectorial_inline vec4f operator+(float lhs, const vec4f& rhs) { 114 | return vec4f( simd4f_add(simd4f_splat(lhs), rhs.value) ); 115 | } 116 | 117 | vectorial_inline vec4f operator-(float lhs, const vec4f& rhs) { 118 | return vec4f( simd4f_sub(simd4f_splat(lhs), rhs.value) ); 119 | } 120 | 121 | vectorial_inline vec4f operator*(float lhs, const vec4f& rhs) { 122 | return vec4f( simd4f_mul(simd4f_splat(lhs), rhs.value) ); 123 | } 124 | 125 | vectorial_inline vec4f operator/(float lhs, const vec4f& rhs) { 126 | return vec4f( simd4f_div(simd4f_splat(lhs), rhs.value) ); 127 | } 128 | 129 | 130 | vectorial_inline vec4f operator+=(vec4f& lhs, float rhs) { 131 | return lhs = vec4f( simd4f_add(lhs.value, simd4f_splat(rhs)) ); 132 | } 133 | 134 | vectorial_inline vec4f operator-=(vec4f& lhs, float rhs) { 135 | return lhs = vec4f( simd4f_sub(lhs.value, simd4f_splat(rhs)) ); 136 | } 137 | 138 | vectorial_inline vec4f operator*=(vec4f& lhs, float rhs) { 139 | return lhs = vec4f( simd4f_mul(lhs.value, simd4f_splat(rhs)) ); 140 | } 141 | 142 | vectorial_inline vec4f operator/=(vec4f& lhs, float rhs) { 143 | return lhs = vec4f( simd4f_div(lhs.value, simd4f_splat(rhs)) ); 144 | } 145 | 146 | 147 | vectorial_inline float dot(const vec4f& lhs, const vec4f& rhs) { 148 | return simd4f_get_x( simd4f_dot4(lhs.value, rhs.value) ); 149 | } 150 | 151 | 152 | vectorial_inline float length(const vec4f& v) { 153 | return simd4f_get_x( simd4f_length4(v.value) ); 154 | } 155 | 156 | vectorial_inline float length_squared(const vec4f& v) { 157 | return simd4f_get_x( simd4f_length4_squared(v.value) ); 158 | } 159 | 160 | vectorial_inline vec4f normalize(const vec4f& v) { 161 | return vec4f( simd4f_normalize4(v.value) ); 162 | } 163 | 164 | vectorial_inline vec4f min(const vec4f& a, const 
vec4f& b) { 165 | return vec4f( simd4f_min(a.value, b.value) ); 166 | } 167 | 168 | vectorial_inline vec4f max(const vec4f& a, const vec4f& b) { 169 | return vec4f( simd4f_max(a.value, b.value) ); 170 | } 171 | 172 | 173 | } 174 | 175 | 176 | namespace std { 177 | inline ::vectorial::vec4f min(const ::vectorial::vec4f& a, const ::vectorial::vec4f& b) { return ::vectorial::min(a,b); } 178 | inline ::vectorial::vec4f max(const ::vectorial::vec4f& a, const ::vectorial::vec4f& b) { return ::vectorial::max(a,b); } 179 | } 180 | 181 | 182 | #ifdef VECTORIAL_OSTREAM 183 | #include 184 | 185 | vectorial_inline std::ostream& operator<<(std::ostream& os, const vectorial::vec4f& v) { 186 | os << "[ " << v.x() << ", " 187 | << v.y() << ", " 188 | << v.z() << ", " 189 | << v.w() << " ]"; 190 | return os; 191 | } 192 | #endif 193 | 194 | 195 | #endif 196 | -------------------------------------------------------------------------------- /include/vectorial/vec3f.h: -------------------------------------------------------------------------------- 1 | /* 2 | Vectorial 3 | Copyright (c) 2010 Mikko Lehtonen 4 | Copyright (c) 2014 Google, Inc. 
5 | Licensed under the terms of the two-clause BSD License (see LICENSE) 6 | */ 7 | #ifndef VECTORIAL_VEC3F_H 8 | 9 | #ifndef VECTORIAL_SIMD4F_H 10 | #include "vectorial/simd4f.h" 11 | #endif 12 | 13 | 14 | 15 | namespace vectorial { 16 | 17 | class vec4f; 18 | class vec2f; 19 | 20 | class vec3f { 21 | public: 22 | 23 | simd4f value; 24 | 25 | inline vec3f() {} 26 | inline vec3f(const vec3f& v) : value(v.value) {} 27 | inline vec3f(const simd4f& v) : value(v) {} 28 | explicit inline vec3f(float xyz) : value( simd4f_splat(xyz) ) {} 29 | inline vec3f(float x, float y, float z) : value( simd4f_create(x,y,z,0) ) {} 30 | explicit inline vec3f(const float *ary) : value( simd4f_uload3(ary) ) { } 31 | 32 | inline float x() const { return simd4f_get_x(value); } 33 | inline float y() const { return simd4f_get_y(value); } 34 | inline float z() const { return simd4f_get_z(value); } 35 | 36 | inline void load(const float *ary) { value = simd4f_uload3(ary); } 37 | inline void store(float *ary) const { simd4f_ustore3(value, ary); } 38 | 39 | enum { elements = 3 }; 40 | 41 | static vec3f zero() { return vec3f(simd4f_zero()); } 42 | static vec3f one() { return vec3f(1.0f); } 43 | static vec3f xAxis() { return vec3f(1.0f, 0.0f, 0.0f); } 44 | static vec3f yAxis() { return vec3f(0.0f, 1.0f, 0.0f); } 45 | static vec3f zAxis() { return vec3f(0.0f, 0.0f, 1.0f); } 46 | 47 | inline vec4f xyz0() const; 48 | inline vec4f xyz1() const; 49 | inline vec4f xyzw(float w) const; 50 | inline vec3f xyz() const; 51 | inline vec3f xy0() const; 52 | inline vec2f xy() const; 53 | }; 54 | 55 | vectorial_inline vec3f operator-(const vec3f& lhs) { 56 | return vec3f( simd4f_sub(simd4f_zero(), lhs.value) ); 57 | } 58 | 59 | 60 | vectorial_inline vec3f operator+(const vec3f& lhs, const vec3f& rhs) { 61 | return vec3f( simd4f_add(lhs.value, rhs.value) ); 62 | } 63 | 64 | vectorial_inline vec3f operator-(const vec3f& lhs, const vec3f& rhs) { 65 | return vec3f( simd4f_sub(lhs.value, rhs.value) ); 66 | } 67 | 68 
| vectorial_inline vec3f operator*(const vec3f& lhs, const vec3f& rhs) { 69 | return vec3f( simd4f_mul(lhs.value, rhs.value) ); 70 | } 71 | 72 | vectorial_inline vec3f operator/(const vec3f& lhs, const vec3f& rhs) { 73 | return vec3f( simd4f_div(lhs.value, rhs.value) ); 74 | } 75 | 76 | 77 | vectorial_inline vec3f operator+=(vec3f& lhs, const vec3f& rhs) { 78 | return lhs = vec3f( simd4f_add(lhs.value, rhs.value) ); 79 | } 80 | 81 | vectorial_inline vec3f operator-=(vec3f& lhs, const vec3f& rhs) { 82 | return lhs = vec3f( simd4f_sub(lhs.value, rhs.value) ); 83 | } 84 | 85 | vectorial_inline vec3f operator*=(vec3f& lhs, const vec3f& rhs) { 86 | return lhs = vec3f( simd4f_mul(lhs.value, rhs.value) ); 87 | } 88 | 89 | vectorial_inline vec3f operator/=(vec3f& lhs, const vec3f& rhs) { 90 | return lhs = vec3f( simd4f_div(lhs.value, rhs.value) ); 91 | } 92 | 93 | 94 | 95 | vectorial_inline vec3f operator+(const vec3f& lhs, float rhs) { 96 | return vec3f( simd4f_add(lhs.value, simd4f_splat(rhs)) ); 97 | } 98 | 99 | vectorial_inline vec3f operator-(const vec3f& lhs, float rhs) { 100 | return vec3f( simd4f_sub(lhs.value, simd4f_splat(rhs)) ); 101 | } 102 | 103 | vectorial_inline vec3f operator*(const vec3f& lhs, float rhs) { 104 | return vec3f( simd4f_mul(lhs.value, simd4f_splat(rhs)) ); 105 | } 106 | 107 | vectorial_inline vec3f operator/(const vec3f& lhs, float rhs) { 108 | return vec3f( simd4f_div(lhs.value, simd4f_splat(rhs)) ); 109 | } 110 | 111 | vectorial_inline vec3f operator+(float lhs, const vec3f& rhs) { 112 | return vec3f( simd4f_add(simd4f_splat(lhs), rhs.value) ); 113 | } 114 | 115 | vectorial_inline vec3f operator-(float lhs, const vec3f& rhs) { 116 | return vec3f( simd4f_sub(simd4f_splat(lhs), rhs.value) ); 117 | } 118 | 119 | vectorial_inline vec3f operator*(float lhs, const vec3f& rhs) { 120 | return vec3f( simd4f_mul(simd4f_splat(lhs), rhs.value) ); 121 | } 122 | 123 | vectorial_inline vec3f operator/(float lhs, const vec3f& rhs) { 124 | return vec3f( 
simd4f_div(simd4f_splat(lhs), rhs.value) ); 125 | } 126 | 127 | 128 | vectorial_inline vec3f operator+=(vec3f& lhs, float rhs) { 129 | return lhs = vec3f( simd4f_add(lhs.value, simd4f_splat(rhs)) ); 130 | } 131 | 132 | vectorial_inline vec3f operator-=(vec3f& lhs, float rhs) { 133 | return lhs = vec3f( simd4f_sub(lhs.value, simd4f_splat(rhs)) ); 134 | } 135 | 136 | vectorial_inline vec3f operator*=(vec3f& lhs, float rhs) { 137 | return lhs = vec3f( simd4f_mul(lhs.value, simd4f_splat(rhs)) ); 138 | } 139 | 140 | vectorial_inline vec3f operator/=(vec3f& lhs, float rhs) { 141 | return lhs = vec3f( simd4f_div(lhs.value, simd4f_splat(rhs)) ); 142 | } 143 | 144 | 145 | vectorial_inline float dot(const vec3f& lhs, const vec3f& rhs) { 146 | return simd4f_dot3_scalar(lhs.value, rhs.value); 147 | } 148 | 149 | vectorial_inline vec3f cross(const vec3f& lhs, const vec3f& rhs) { 150 | return simd4f_cross3(lhs.value, rhs.value); 151 | } 152 | 153 | 154 | vectorial_inline float length(const vec3f& v) { 155 | return simd4f_get_x( simd4f_length3(v.value) ); 156 | } 157 | 158 | vectorial_inline float length_squared(const vec3f& v) { 159 | return simd4f_get_x( simd4f_length3_squared(v.value) ); 160 | } 161 | 162 | vectorial_inline vec3f normalize(const vec3f& v) { 163 | return vec3f( simd4f_normalize3(v.value) ); 164 | } 165 | 166 | vectorial_inline vec3f min(const vec3f& a, const vec3f& b) { 167 | return vec3f( simd4f_min(a.value, b.value) ); 168 | } 169 | 170 | vectorial_inline vec3f max(const vec3f& a, const vec3f& b) { 171 | return vec3f( simd4f_max(a.value, b.value) ); 172 | } 173 | 174 | } 175 | 176 | 177 | namespace std { 178 | inline ::vectorial::vec3f min(const ::vectorial::vec3f& a, const ::vectorial::vec3f& b) { return ::vectorial::min(a,b); } 179 | inline ::vectorial::vec3f max(const ::vectorial::vec3f& a, const ::vectorial::vec3f& b) { return ::vectorial::max(a,b); } 180 | } 181 | 182 | 183 | #ifdef VECTORIAL_OSTREAM 184 | #include 185 | 186 | vectorial_inline 
std::ostream& operator<<(std::ostream& os, const vectorial::vec3f& v) { 187 | os << "[ " << v.x() << ", " 188 | << v.y() << ", " 189 | << v.z() << " ]"; 190 | return os; 191 | } 192 | #endif 193 | 194 | 195 | 196 | 197 | #endif 198 | -------------------------------------------------------------------------------- /include/vectorial/simd4f_gnu.h: -------------------------------------------------------------------------------- 1 | /* 2 | Vectorial 3 | Copyright (c) 2010 Mikko Lehtonen 4 | Licensed under the terms of the two-clause BSD License (see LICENSE) 5 | */ 6 | #ifndef VECTORIAL_SIMD4F_GNU_H 7 | #define VECTORIAL_SIMD4F_GNU_H 8 | 9 | #include 10 | #include // memcpy 11 | 12 | 13 | #ifdef __cplusplus 14 | extern "C" { 15 | #endif 16 | 17 | 18 | typedef float simd4f __attribute__ ((vector_size (16))); 19 | 20 | typedef union { 21 | simd4f s ; 22 | float f[4]; 23 | } _simd4f_union; 24 | 25 | vectorial_inline float simd4f_get_x(simd4f s) { _simd4f_union u={s}; return u.f[0]; } 26 | vectorial_inline float simd4f_get_y(simd4f s) { _simd4f_union u={s}; return u.f[1]; } 27 | vectorial_inline float simd4f_get_z(simd4f s) { _simd4f_union u={s}; return u.f[2]; } 28 | vectorial_inline float simd4f_get_w(simd4f s) { _simd4f_union u={s}; return u.f[3]; } 29 | 30 | 31 | vectorial_inline simd4f simd4f_create(float x, float y, float z, float w) { 32 | simd4f s = { x, y, z, w }; 33 | return s; 34 | } 35 | 36 | vectorial_inline simd4f simd4f_zero() { return simd4f_create(0.0f, 0.0f, 0.0f, 0.0f); } 37 | 38 | vectorial_inline simd4f simd4f_uload4(const float *ary) { 39 | simd4f s = { ary[0], ary[1], ary[2], ary[3] }; 40 | return s; 41 | } 42 | 43 | vectorial_inline simd4f simd4f_uload3(const float *ary) { 44 | simd4f s = { ary[0], ary[1], ary[2], 0 }; 45 | return s; 46 | } 47 | 48 | vectorial_inline simd4f simd4f_uload2(const float *ary) { 49 | simd4f s = { ary[0], ary[1], 0, 0 }; 50 | return s; 51 | } 52 | 53 | 54 | vectorial_inline void simd4f_ustore4(const simd4f val, float 
*ary) { 55 | memcpy(ary, &val, sizeof(float) * 4); 56 | } 57 | 58 | vectorial_inline void simd4f_ustore3(const simd4f val, float *ary) { 59 | memcpy(ary, &val, sizeof(float) * 3); 60 | } 61 | 62 | vectorial_inline void simd4f_ustore2(const simd4f val, float *ary) { 63 | memcpy(ary, &val, sizeof(float) * 2); 64 | } 65 | 66 | 67 | vectorial_inline simd4f simd4f_splat(float v) { 68 | simd4f s = { v, v, v, v }; 69 | return s; 70 | } 71 | 72 | vectorial_inline simd4f simd4f_splat_x(simd4f v) { 73 | float s = simd4f_get_x(v); 74 | simd4f ret = { s, s, s, s }; 75 | return ret; 76 | } 77 | 78 | vectorial_inline simd4f simd4f_splat_y(simd4f v) { 79 | float s = simd4f_get_y(v); 80 | simd4f ret = { s, s, s, s }; 81 | return ret; 82 | } 83 | 84 | vectorial_inline simd4f simd4f_splat_z(simd4f v) { 85 | float s = simd4f_get_z(v); 86 | simd4f ret = { s, s, s, s }; 87 | return ret; 88 | } 89 | 90 | vectorial_inline simd4f simd4f_splat_w(simd4f v) { 91 | float s = simd4f_get_w(v); 92 | simd4f ret = { s, s, s, s }; 93 | return ret; 94 | } 95 | 96 | vectorial_inline simd4f simd4f_reciprocal(simd4f v) { 97 | return simd4f_splat(1.0f) / v; 98 | } 99 | 100 | vectorial_inline simd4f simd4f_sqrt(simd4f v) { 101 | simd4f ret = { sqrtf(simd4f_get_x(v)), sqrtf(simd4f_get_y(v)), sqrtf(simd4f_get_z(v)), sqrtf(simd4f_get_w(v)) }; 102 | return ret; 103 | } 104 | 105 | vectorial_inline simd4f simd4f_rsqrt(simd4f v) { 106 | return simd4f_splat(1.0f) / simd4f_sqrt(v); 107 | } 108 | 109 | 110 | 111 | vectorial_inline simd4f simd4f_add(simd4f lhs, simd4f rhs) { 112 | simd4f ret = lhs + rhs; 113 | return ret; 114 | } 115 | 116 | vectorial_inline simd4f simd4f_sub(simd4f lhs, simd4f rhs) { 117 | simd4f ret = lhs - rhs; 118 | return ret; 119 | } 120 | 121 | vectorial_inline simd4f simd4f_mul(simd4f lhs, simd4f rhs) { 122 | simd4f ret = lhs * rhs; 123 | return ret; 124 | } 125 | 126 | vectorial_inline simd4f simd4f_div(simd4f lhs, simd4f rhs) { 127 | simd4f ret = lhs / rhs; 128 | return ret; 129 | } 130 
| 131 | vectorial_inline simd4f simd4f_madd(simd4f m1, simd4f m2, simd4f a) { 132 | return simd4f_add( simd4f_mul(m1, m2), a ); 133 | } 134 | 135 | vectorial_inline float simd4f_dot3_scalar(simd4f lhs, simd4f rhs) { 136 | _simd4f_union l = {lhs}; 137 | _simd4f_union r = {rhs}; 138 | return l.f[0] * r.f[0] + l.f[1] * r.f[1] + l.f[2] * r.f[2]; 139 | } 140 | 141 | vectorial_inline simd4f simd4f_dot3(simd4f lhs, simd4f rhs) { 142 | return simd4f_splat( simd4f_dot3_scalar(lhs, rhs) ); 143 | } 144 | 145 | vectorial_inline simd4f simd4f_cross3(simd4f l, simd4f r) { 146 | _simd4f_union lhs = {l}; 147 | _simd4f_union rhs = {r}; 148 | 149 | return simd4f_create( lhs.f[1] * rhs.f[2] - lhs.f[2] * rhs.f[1], 150 | lhs.f[2] * rhs.f[0] - lhs.f[0] * rhs.f[2], 151 | lhs.f[0] * rhs.f[1] - lhs.f[1] * rhs.f[0], 0); 152 | } 153 | 154 | 155 | vectorial_inline simd4f simd4f_shuffle_wxyz(simd4f s) { 156 | _simd4f_union u = {s}; 157 | return simd4f_create(u.f[3], u.f[0], u.f[1], u.f[2]); 158 | } 159 | 160 | vectorial_inline simd4f simd4f_shuffle_zwxy(simd4f s) { 161 | _simd4f_union u = {s}; 162 | return simd4f_create(u.f[2], u.f[3], u.f[0], u.f[1]); 163 | } 164 | 165 | vectorial_inline simd4f simd4f_shuffle_yzwx(simd4f s) { 166 | _simd4f_union u = {s}; 167 | return simd4f_create(u.f[1], u.f[2], u.f[3], u.f[0]); 168 | } 169 | 170 | 171 | vectorial_inline simd4f simd4f_zero_w(simd4f s) { 172 | _simd4f_union u = {s}; 173 | return simd4f_create(u.f[0], u.f[1], u.f[2], 0.0f); 174 | } 175 | 176 | vectorial_inline simd4f simd4f_zero_zw(simd4f s) { 177 | _simd4f_union u = {s}; 178 | return simd4f_create(u.f[0], u.f[1], 0.0f, 0.0f); 179 | } 180 | 181 | 182 | vectorial_inline simd4f simd4f_merge_high(simd4f abcd, simd4f xyzw) { 183 | _simd4f_union u1 = {abcd}; 184 | _simd4f_union u2 = {xyzw}; 185 | return simd4f_create(u1.f[2], u1.f[3], u2.f[2], u2.f[3]); 186 | } 187 | 188 | vectorial_inline simd4f simd4f_flip_sign_0101(simd4f s) { 189 | _simd4f_union u = {s}; 190 | return simd4f_create(u.f[0], 
-u.f[1], u.f[2], -u.f[3]); 191 | } 192 | 193 | vectorial_inline simd4f simd4f_flip_sign_1010(simd4f s) { 194 | _simd4f_union u = {s}; 195 | return simd4f_create(-u.f[0], u.f[1], -u.f[2], u.f[3]); 196 | } 197 | 198 | 199 | vectorial_inline simd4f simd4f_min(simd4f a, simd4f b) { 200 | _simd4f_union ua = {a}; 201 | _simd4f_union ub = {b}; 202 | return simd4f_create( ua.f[0] < ub.f[0] ? ua.f[0] : ub.f[0], 203 | ua.f[1] < ub.f[1] ? ua.f[1] : ub.f[1], 204 | ua.f[2] < ub.f[2] ? ua.f[2] : ub.f[2], 205 | ua.f[3] < ub.f[3] ? ua.f[3] : ub.f[3] ); 206 | } 207 | 208 | vectorial_inline simd4f simd4f_max(simd4f a, simd4f b) { 209 | _simd4f_union ua = {a}; 210 | _simd4f_union ub = {b}; 211 | return simd4f_create( ua.f[0] > ub.f[0] ? ua.f[0] : ub.f[0], 212 | ua.f[1] > ub.f[1] ? ua.f[1] : ub.f[1], 213 | ua.f[2] > ub.f[2] ? ua.f[2] : ub.f[2], 214 | ua.f[3] > ub.f[3] ? ua.f[3] : ub.f[3] ); 215 | } 216 | 217 | 218 | 219 | #ifdef __cplusplus 220 | } 221 | #endif 222 | 223 | 224 | #endif 225 | 226 | -------------------------------------------------------------------------------- /spec/spec.cpp: -------------------------------------------------------------------------------- 1 | /* Specific - Minimal C++ spec framework. 2 | 3 | 4 | The zlib/libpng License 5 | 6 | 7 | Copyright (c) 2008 Mikko Lehtonen 8 | 9 | This software is provided 'as-is', without any express or implied 10 | warranty. In no event will the authors be held liable for any damages 11 | arising from the use of this software. 12 | 13 | Permission is granted to anyone to use this software for any purpose, 14 | including commercial applications, and to alter it and redistribute it 15 | freely, subject to the following restrictions: 16 | 17 | 1. The origin of this software must not be misrepresented; you must not 18 | claim that you wrote the original software. If you use this software 19 | in a product, an acknowledgment in the product documentation would be 20 | appreciated but is not required. 21 | 22 | 2. 
Altered source versions must be plainly marked as such, and must not be 23 | misrepresented as being the original software. 24 | 25 | 3. This notice may not be removed or altered from any source 26 | distribution. 27 | */ 28 | 29 | 30 | #include "spec.h" 31 | 32 | #include 33 | 34 | namespace specific { 35 | 36 | 37 | 38 | void SpecWriter::startGroup(std::string /*group*/, std::string /*description*/) {} 39 | 40 | void SpecWriter::addFailedAssertation(std::string msg, const char *file, int line) { 41 | mFailures.push_back( SpecFailure(msg,file,line) ); 42 | } 43 | void SpecWriter::addSpecResult(SpecResult r) { 44 | mResults.push_back( r ); 45 | } 46 | void SpecWriter::start() {} 47 | void SpecWriter::stop() { 48 | std::cout << std::endl; 49 | size_t nth = 0; 50 | for(std::vector::iterator i=mFailures.begin(); i != mFailures.end(); ++i, ++nth) 51 | { 52 | std::cout << std::endl; 53 | std::cout << (nth+1) << ") Failed assertation at " << i->file << ":" 54 | << i->line << ":" << std::endl << " " << i->msg << std::endl; 55 | } 56 | std::cout << std::endl << mResults.size() << " examples, " << mFailures.size() << " failures" << std::endl; 57 | 58 | } 59 | 60 | 61 | 62 | void ProgressWriter::addSpecResult(SpecResult r) { 63 | SpecWriter::addSpecResult(r); 64 | switch(r.type) { 65 | case SpecResult::PASSED: 66 | std::cout << "."; 67 | break; 68 | case SpecResult::FAILED: 69 | std::cout << "F"; 70 | break; 71 | case SpecResult::ERRORED: 72 | std::cout << "E"; 73 | break; 74 | } 75 | std::cout << std::flush; 76 | } 77 | 78 | 79 | 80 | void SpecdocWriter::startGroup(std::string group, std::string description) { 81 | std::cout << group << ": " << description << std::endl; 82 | } 83 | 84 | 85 | void SpecdocWriter::addSpecResult(SpecResult r) { 86 | SpecWriter::addSpecResult(r); 87 | size_t nth = mFailures.size(); 88 | std::cout << "- " << r.test; 89 | switch(r.type) { 90 | case SpecResult::PASSED: 91 | std::cout << " [OK]"; 92 | break; 93 | case SpecResult::FAILED: 94 | 
std::cout << " [FAILED - " << nth << "]"; 95 | break; 96 | case SpecResult::ERRORED: 97 | std::cout << " [ERROR - "<< nth <<"]"; 98 | break; 99 | } 100 | std::cout << std::endl; 101 | } 102 | 103 | 104 | 105 | 106 | class spec_failure {}; 107 | 108 | 109 | 110 | SpecBase::SpecBase() : mWriter(NULL), mName(NULL), 111 | mFailed(false), mLastFailed(false), mError(false), mExecutionPoint(0), mContinuePoint(0) 112 | { 113 | SpecRunner::getInstance().add(this); 114 | } 115 | 116 | 117 | SpecBase::~SpecBase() { 118 | 119 | } 120 | 121 | 122 | bool SpecBase::startSpec(const char* name) 123 | { 124 | endSpec(); 125 | 126 | mExecutionPoint++; 127 | if(mExecutionPoint <= mContinuePoint) return false; 128 | mContinuePoint++; 129 | 130 | mName = name; 131 | return true; 132 | } 133 | 134 | 135 | void SpecBase::endSpec() 136 | { 137 | if(!mName) return; 138 | 139 | SpecResult r; 140 | r.group = getGroup(); 141 | r.description = getDescription(); 142 | r.type = SpecResult::PASSED; 143 | if(mLastFailed) r.type = SpecResult::FAILED; 144 | if(mError) r.type = SpecResult::ERRORED; 145 | r.test = mName; 146 | mWriter->addSpecResult( r ); 147 | 148 | mName = NULL; 149 | } 150 | 151 | 152 | void SpecBase::should_test(bool value, const char* message, const char* file, int line) { 153 | mLastFailed=false; 154 | if(!value) { 155 | mWriter->addFailedAssertation(message, file, line); 156 | mLastFailed = mFailed = true; 157 | throw spec_failure(); 158 | } 159 | } 160 | 161 | 162 | void SpecBase::error(std::string msg) { 163 | mWriter->addFailedAssertation(msg, "exception", 0); 164 | mLastFailed = true; 165 | mFailed = true; 166 | mError = true; 167 | } 168 | 169 | bool SpecBase::done() { 170 | if( mError ) { 171 | mError = false; 172 | return false; 173 | } 174 | return true; 175 | } 176 | 177 | 178 | SpecRunner::SpecRunner() {} 179 | SpecRunner::~SpecRunner() { } 180 | 181 | SpecRunner& SpecRunner::getInstance() { 182 | static SpecRunner* instance = NULL; 183 | if( instance == NULL ) { 184 | 
instance = new SpecRunner; 185 | } 186 | return *instance; 187 | } 188 | 189 | 190 | bool SpecRunner::run(SpecWriter& writer, const std::string subset) { 191 | bool success = true; 192 | 193 | writer.start(); 194 | std::vector::iterator i = mSpecs.begin(); 195 | for(; i != mSpecs.end(); ++i) { 196 | SpecBase *b = *i; 197 | if( b->getGroup().find(subset, 0) == std::string::npos ) continue; 198 | b->mContinuePoint = 0; 199 | b->setWriter(&writer); 200 | writer.startGroup( b->getGroup(), b->getDescription() ); 201 | do { 202 | b->mExecutionPoint = 0; 203 | try { 204 | b->specify(); 205 | } catch(spec_failure& e) { 206 | b->mError=true; 207 | } catch( std::exception& e) { 208 | b->error(e.what()); 209 | } catch( ... ) { 210 | b->error("unknown exception"); 211 | } 212 | b->endSpec(); 213 | 214 | } while( !b->done() ); 215 | 216 | success = success && b->isSuccessful(); 217 | 218 | } 219 | writer.stop(); 220 | 221 | return success; 222 | } 223 | 224 | 225 | } 226 | 227 | 228 | 229 | 230 | -------------------------------------------------------------------------------- /spec/spec.h: -------------------------------------------------------------------------------- 1 | /* Specific - Minimal C++ spec framework. 2 | 3 | 4 | The zlib/libpng License 5 | 6 | 7 | Copyright (c) 2008 Mikko Lehtonen 8 | 9 | This software is provided 'as-is', without any express or implied 10 | warranty. In no event will the authors be held liable for any damages 11 | arising from the use of this software. 12 | 13 | Permission is granted to anyone to use this software for any purpose, 14 | including commercial applications, and to alter it and redistribute it 15 | freely, subject to the following restrictions: 16 | 17 | 1. The origin of this software must not be misrepresented; you must not 18 | claim that you wrote the original software. If you use this software 19 | in a product, an acknowledgment in the product documentation would be 20 | appreciated but is not required. 21 | 22 | 2. 
Altered source versions must be plainly marked as such, and must not be 23 | misrepresented as being the original software. 24 | 25 | 3. This notice may not be removed or altered from any source 26 | distribution. 27 | */ 28 | 29 | 30 | #ifndef SPECIFIC_SPEC_H 31 | #define SPECIFIC_SPEC_H 32 | 33 | #include 34 | #include 35 | #include 36 | #include 37 | 38 | namespace specific { 39 | 40 | 41 | class SpecResult { 42 | public: 43 | typedef enum { 44 | PASSED, 45 | FAILED, 46 | ERRORED 47 | } Type; 48 | 49 | Type type; 50 | 51 | std::string group; 52 | std::string description; 53 | std::string test; 54 | }; 55 | 56 | 57 | class SpecFailure { 58 | public: 59 | SpecFailure(std::string amsg, const char* afile, int aline) 60 | : msg(amsg), file(afile), line(aline) { } 61 | std::string msg; 62 | const char* file; 63 | int line; 64 | }; 65 | 66 | 67 | class SpecWriter { 68 | public: 69 | std::vector mResults; 70 | std::vector mFailures; 71 | SpecWriter() {} 72 | virtual ~SpecWriter() {} 73 | virtual void startGroup(std::string group, std::string description); 74 | virtual void addFailedAssertation(std::string msg, const char *file, int line); 75 | virtual void addSpecResult(SpecResult r); 76 | virtual void start(); 77 | virtual void stop(); 78 | }; 79 | 80 | 81 | class ProgressWriter : public SpecWriter { 82 | public: 83 | void addSpecResult(SpecResult r); 84 | }; 85 | 86 | 87 | 88 | class SpecdocWriter : public SpecWriter { 89 | public: 90 | void startGroup(std::string group, std::string description); 91 | void addSpecResult(SpecResult r); 92 | }; 93 | 94 | 95 | 96 | template std::string inspect(const T& value) { 97 | std::stringstream ss; 98 | ss << value; 99 | return ss.str(); 100 | } 101 | 102 | 103 | class SpecBase { 104 | public: 105 | SpecBase(); 106 | virtual ~SpecBase(); 107 | 108 | virtual void specify() = 0; 109 | 110 | void setWriter(SpecWriter* w) { mWriter = w; } 111 | 112 | bool startSpec(const char* name); 113 | void endSpec(); 114 | 115 | void 
should_test(bool value, const char* message, const char* file, int line); 116 | 117 | template void should_equal_template(const T1& a, const T2& b, const char* file, int line) { 118 | std::stringstream ss; 119 | ss << "`" << ::specific::inspect(a) << "'" << " == " << "`" << ::specific::inspect(b) << "'"; 120 | should_test( a == b, ss.str().c_str(), file, line); 121 | } 122 | 123 | template void should_not_equal_template(const T1& a, const T2& b, const char* file, int line) { 124 | std::stringstream ss; 125 | ss << "`" << ::specific::inspect(a) << "'" << " != " << "`" << ::specific::inspect(b) << "'"; 126 | should_test( a != b, ss.str().c_str(), file, line); 127 | } 128 | 129 | 130 | 131 | virtual std::string getGroup() = 0; 132 | virtual std::string getDescription() = 0; 133 | 134 | bool isSuccessful() { return !mFailed; } 135 | 136 | bool done(); 137 | 138 | void error(std::string msg); 139 | 140 | SpecWriter* mWriter; 141 | const char* mName; 142 | bool mFailed; 143 | bool mLastFailed; 144 | bool mError; 145 | int mExecutionPoint; 146 | int mContinuePoint; 147 | char *mFile; 148 | std::string mErrorMessage; 149 | int mLine; 150 | }; 151 | 152 | 153 | class SpecRunner { 154 | public: 155 | static SpecRunner& getInstance(); 156 | void add(SpecBase* spec) { mSpecs.push_back( spec ); } 157 | bool run(SpecWriter& writer, const std::string subset = ""); 158 | private: 159 | 160 | std::vector mSpecs; 161 | 162 | SpecRunner(); 163 | ~SpecRunner(); 164 | }; 165 | 166 | #define SPEC_UNIQUE_NAME3(x,y) x##y 167 | #define SPEC_UNIQUE_NAME2(x,y) SPEC_UNIQUE_NAME3(x,y) 168 | 169 | #define SPEC_NAME(x) SPEC_UNIQUE_NAME2(SPEC_##x, SPEC_UNIQUE_NAME2(_startingOnLine, __LINE__) ) 170 | 171 | 172 | #define describe(group, description) \ 173 | class SPEC_NAME(group) : public specific::SpecBase \ 174 | { \ 175 | public: \ 176 | void specify(); \ 177 | std::string getGroup() { return #group; } \ 178 | std::string getDescription() { return description; } \ 179 | }; \ 180 | static 
SPEC_NAME(group) SPEC_UNIQUE_NAME2(SPEC_NAME(group), _instance); \ 181 | void SPEC_NAME(group)::specify() 182 | 183 | 184 | #define it(description) if(startSpec(description)) 185 | 186 | 187 | // Matchers 188 | #define should_be_true(a) should_test(a, #a, __FILE__, __LINE__) 189 | #define should_be_false(a) should_be_true( !a ) 190 | 191 | #ifndef SPECIFIC_NO_OSTREAM 192 | #define should_equal(a, b) should_equal_template( a,b, __FILE__, __LINE__ ) 193 | #define should_not_equal(a, b) should_not_equal_template( a,b, __FILE__, __LINE__ ) 194 | #else 195 | #define should_equal(a, b) should_be_true( (a) == (b) ) 196 | #define should_not_equal(a, b) should_be_true( (a) != (b) ) 197 | #endif 198 | 199 | #define should_throw(code, what) \ 200 | do { \ 201 | bool _thrown = false; \ 202 | try { \ 203 | code ; \ 204 | } catch(what& e) { \ 205 | _thrown = true; \ 206 | } \ 207 | should_test(_thrown, "should throw exception " #what, __FILE__, __LINE__); \ 208 | } while(0) 209 | 210 | 211 | 212 | } 213 | 214 | 215 | 216 | #endif /* Include guard */ 217 | 218 | -------------------------------------------------------------------------------- /include/vectorial/simd4f_sse.h: -------------------------------------------------------------------------------- 1 | /* 2 | Vectorial 3 | Copyright (c) 2010 Mikko Lehtonen 4 | Copyright (c) 2014 Google, Inc. 5 | Licensed under the terms of the two-clause BSD License (see LICENSE) 6 | */ 7 | #ifndef VECTORIAL_SIMD4F_SSE_H 8 | #define VECTORIAL_SIMD4F_SSE_H 9 | 10 | // Conditionally enable SSE4.1 otherwise fallback to SSE. 
11 | #if defined(_M_IX86_FP) 12 | #if _M_IX86_FP >=2 13 | #define VECTORIAL_USE_SSE4_1 14 | #endif 15 | #elif defined(__SSE4_1__) 16 | #define VECTORIAL_USE_SSE4_1 17 | #endif 18 | 19 | #include 20 | #if defined(VECTORIAL_USE_SSE4_1) 21 | #include 22 | #endif 23 | #include // memcpy 24 | 25 | #ifdef __cplusplus 26 | extern "C" { 27 | #endif 28 | 29 | 30 | typedef __m128 simd4f; 31 | 32 | typedef union { 33 | simd4f s ; 34 | float f[4]; 35 | unsigned int ui[4]; 36 | } _simd4f_union; 37 | 38 | // creating 39 | 40 | vectorial_inline simd4f simd4f_create(float x, float y, float z, float w) { 41 | simd4f s = { x, y, z, w }; 42 | return s; 43 | } 44 | 45 | vectorial_inline simd4f simd4f_zero() { return _mm_setzero_ps(); } 46 | 47 | vectorial_inline simd4f simd4f_uload4(const float *ary) { 48 | simd4f s = _mm_loadu_ps(ary); 49 | return s; 50 | } 51 | 52 | vectorial_inline simd4f simd4f_uload3(const float *ary) { 53 | simd4f s = simd4f_create(ary[0], ary[1], ary[2], 0); 54 | return s; 55 | } 56 | 57 | vectorial_inline simd4f simd4f_uload2(const float *ary) { 58 | simd4f s = simd4f_create(ary[0], ary[1], 0, 0); 59 | return s; 60 | } 61 | 62 | 63 | vectorial_inline void simd4f_ustore4(const simd4f val, float *ary) { 64 | _mm_storeu_ps(ary, val); 65 | } 66 | 67 | vectorial_inline void simd4f_ustore3(const simd4f val, float *ary) { 68 | memcpy(ary, &val, sizeof(float) * 3); 69 | } 70 | 71 | vectorial_inline void simd4f_ustore2(const simd4f val, float *ary) { 72 | memcpy(ary, &val, sizeof(float) * 2); 73 | } 74 | 75 | 76 | // utilites 77 | 78 | vectorial_inline simd4f simd4f_splat(float v) { 79 | simd4f s = _mm_set1_ps(v); 80 | return s; 81 | } 82 | 83 | vectorial_inline simd4f simd4f_splat_x(simd4f v) { 84 | simd4f s = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0,0,0,0)); 85 | return s; 86 | } 87 | 88 | vectorial_inline simd4f simd4f_splat_y(simd4f v) { 89 | simd4f s = _mm_shuffle_ps(v, v, _MM_SHUFFLE(1,1,1,1)); 90 | return s; 91 | } 92 | 93 | vectorial_inline simd4f 
simd4f_splat_z(simd4f v) { 94 | simd4f s = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2,2,2,2)); 95 | return s; 96 | } 97 | 98 | vectorial_inline simd4f simd4f_splat_w(simd4f v) { 99 | simd4f s = _mm_shuffle_ps(v, v, _MM_SHUFFLE(3,3,3,3)); 100 | return s; 101 | } 102 | 103 | 104 | // arithmetic 105 | 106 | vectorial_inline simd4f simd4f_add(simd4f lhs, simd4f rhs) { 107 | simd4f ret = _mm_add_ps(lhs, rhs); 108 | return ret; 109 | } 110 | 111 | vectorial_inline simd4f simd4f_sub(simd4f lhs, simd4f rhs) { 112 | simd4f ret = _mm_sub_ps(lhs, rhs); 113 | return ret; 114 | } 115 | 116 | vectorial_inline simd4f simd4f_mul(simd4f lhs, simd4f rhs) { 117 | simd4f ret = _mm_mul_ps(lhs, rhs); 118 | return ret; 119 | } 120 | 121 | vectorial_inline simd4f simd4f_div(simd4f lhs, simd4f rhs) { 122 | simd4f ret = _mm_div_ps(lhs, rhs); 123 | return ret; 124 | } 125 | 126 | vectorial_inline simd4f simd4f_madd(simd4f m1, simd4f m2, simd4f a) { 127 | return simd4f_add( simd4f_mul(m1, m2), a ); 128 | } 129 | 130 | 131 | 132 | 133 | vectorial_inline simd4f simd4f_reciprocal(simd4f v) { 134 | simd4f s = _mm_rcp_ps(v); 135 | const simd4f two = simd4f_create(2.0f, 2.0f, 2.0f, 2.0f); 136 | s = simd4f_mul(s, simd4f_sub(two, simd4f_mul(v, s))); 137 | return s; 138 | } 139 | 140 | vectorial_inline simd4f simd4f_sqrt(simd4f v) { 141 | simd4f s = _mm_sqrt_ps(v); 142 | return s; 143 | } 144 | 145 | vectorial_inline simd4f simd4f_rsqrt(simd4f v) { 146 | simd4f s = _mm_rsqrt_ps(v); 147 | const simd4f half = simd4f_create(0.5f, 0.5f, 0.5f, 0.5f); 148 | const simd4f three = simd4f_create(3.0f, 3.0f, 3.0f, 3.0f); 149 | s = simd4f_mul(simd4f_mul(s, half), simd4f_sub(three, simd4f_mul(s, simd4f_mul(v,s)))); 150 | return s; 151 | } 152 | 153 | vectorial_inline float simd4f_get_x(simd4f s) { _simd4f_union u={s}; return u.f[0]; } 154 | vectorial_inline float simd4f_get_y(simd4f s) { _simd4f_union u={s}; return u.f[1]; } 155 | vectorial_inline float simd4f_get_z(simd4f s) { _simd4f_union u={s}; return u.f[2]; } 156 | 
vectorial_inline float simd4f_get_w(simd4f s) { _simd4f_union u={s}; return u.f[3]; } 157 | 158 | vectorial_inline simd4f simd4f_dot3(simd4f lhs,simd4f rhs) { 159 | #if defined(VECTORIAL_USE_SSE4_1) 160 | return _mm_dp_ps(lhs, rhs, 0x7f); 161 | #else 162 | simd4f_aligned16 const unsigned int mask_array[] = { 0xffffffff, 0xffffffff, 0xffffffff, 0 }; 163 | const simd4f mask = _mm_load_ps((const float*)mask_array); 164 | const simd4f m = _mm_mul_ps(lhs, rhs); 165 | const simd4f s0 = _mm_and_ps(m, mask); 166 | const simd4f s1 = _mm_add_ps(s0, _mm_movehl_ps(s0, s0)); 167 | const simd4f s2 = _mm_add_ss(s1, _mm_shuffle_ps(s1, s1, 1)); 168 | return _mm_shuffle_ps(s2,s2, 0); 169 | #endif 170 | } 171 | 172 | vectorial_inline float simd4f_dot3_scalar(simd4f lhs,simd4f rhs) { 173 | return simd4f_get_x(simd4f_dot3(lhs, rhs)); 174 | } 175 | 176 | vectorial_inline simd4f simd4f_cross3(simd4f lhs, simd4f rhs) { 177 | 178 | const simd4f lyzx = _mm_shuffle_ps(lhs, lhs, _MM_SHUFFLE(3,0,2,1)); 179 | const simd4f lzxy = _mm_shuffle_ps(lhs, lhs, _MM_SHUFFLE(3,1,0,2)); 180 | 181 | const simd4f ryzx = _mm_shuffle_ps(rhs, rhs, _MM_SHUFFLE(3,0,2,1)); 182 | const simd4f rzxy = _mm_shuffle_ps(rhs, rhs, _MM_SHUFFLE(3,1,0,2)); 183 | 184 | return _mm_sub_ps(_mm_mul_ps(lyzx, rzxy), _mm_mul_ps(lzxy, ryzx)); 185 | 186 | } 187 | 188 | vectorial_inline simd4f simd4f_shuffle_wxyz(simd4f s) { return _mm_shuffle_ps(s,s, _MM_SHUFFLE(2,1,0,3) ); } 189 | vectorial_inline simd4f simd4f_shuffle_zwxy(simd4f s) { return _mm_shuffle_ps(s,s, _MM_SHUFFLE(1,0,3,2) ); } 190 | vectorial_inline simd4f simd4f_shuffle_yzwx(simd4f s) { return _mm_shuffle_ps(s,s, _MM_SHUFFLE(0,3,2,1) ); } 191 | 192 | vectorial_inline simd4f simd4f_zero_w(simd4f s) { 193 | simd4f r = _mm_unpackhi_ps(s, _mm_setzero_ps()); 194 | return _mm_movelh_ps(s, r); 195 | } 196 | 197 | vectorial_inline simd4f simd4f_zero_zw(simd4f s) { 198 | return _mm_movelh_ps(s, _mm_setzero_ps()); 199 | } 200 | 201 | vectorial_inline simd4f 
simd4f_merge_high(simd4f xyzw, simd4f abcd) { 202 | return _mm_movehl_ps(abcd, xyzw); 203 | } 204 | 205 | 206 | typedef simd4f_aligned16 union { 207 | unsigned int ui[4]; 208 | float f[4]; 209 | } _simd4f_uif; 210 | 211 | vectorial_inline simd4f simd4f_flip_sign_0101(simd4f s) { 212 | const _simd4f_uif upnpn = { { 0x00000000, 0x80000000, 0x00000000, 0x80000000 } }; 213 | return _mm_xor_ps( s, _mm_load_ps(upnpn.f) ); 214 | } 215 | 216 | vectorial_inline simd4f simd4f_flip_sign_1010(simd4f s) { 217 | const _simd4f_uif unpnp = { { 0x80000000, 0x00000000, 0x80000000, 0x00000000 } }; 218 | return _mm_xor_ps( s, _mm_load_ps(unpnp.f) ); 219 | } 220 | 221 | vectorial_inline simd4f simd4f_min(simd4f a, simd4f b) { 222 | return _mm_min_ps( a, b ); 223 | } 224 | 225 | vectorial_inline simd4f simd4f_max(simd4f a, simd4f b) { 226 | return _mm_max_ps( a, b ); 227 | } 228 | 229 | 230 | 231 | #ifdef __cplusplus 232 | } 233 | #endif 234 | 235 | 236 | #endif 237 | -------------------------------------------------------------------------------- /vectorialbenchmark.vcproj: -------------------------------------------------------------------------------- 1 | 2 | 11 | 12 | 15 | 16 | 17 | 18 | 19 | 26 | 29 | 32 | 35 | 38 | 41 | 53 | 56 | 59 | 62 | 69 | 72 | 75 | 78 | 81 | 84 | 87 | 90 | 91 | 99 | 102 | 105 | 108 | 111 | 114 | 128 | 131 | 134 | 137 | 146 | 149 | 152 | 155 | 158 | 161 | 164 | 167 | 168 | 176 | 179 | 182 | 185 | 188 | 191 | 205 | 208 | 211 | 214 | 223 | 226 | 229 | 232 | 235 | 238 | 241 | 244 | 245 | 246 | 247 | 248 | 249 | 252 | 255 | 256 | 259 | 260 | 263 | 264 | 267 | 268 | 271 | 272 | 275 | 276 | 279 | 280 | 283 | 284 | 287 | 288 | 291 | 292 | 295 | 296 | 299 | 300 | 303 | 304 | 307 | 308 | 311 | 312 | 313 | 316 | 319 | 320 | 323 | 324 | 327 | 328 | 331 | 332 | 335 | 336 | 337 | 338 | 339 | 340 | 341 | -------------------------------------------------------------------------------- /vectorial.vcproj: 
-------------------------------------------------------------------------------- 1 | 2 | 11 | 12 | 15 | 16 | 17 | 18 | 19 | 25 | 28 | 31 | 34 | 37 | 40 | 54 | 57 | 60 | 63 | 70 | 73 | 76 | 79 | 82 | 85 | 88 | 91 | 92 | 98 | 101 | 104 | 107 | 110 | 113 | 124 | 127 | 130 | 133 | 142 | 145 | 148 | 151 | 154 | 157 | 160 | 163 | 164 | 170 | 173 | 176 | 179 | 182 | 185 | 195 | 198 | 201 | 204 | 213 | 216 | 219 | 222 | 225 | 228 | 231 | 234 | 235 | 236 | 237 | 238 | 239 | 242 | 245 | 246 | 249 | 250 | 253 | 254 | 257 | 258 | 261 | 262 | 265 | 266 | 269 | 270 | 273 | 274 | 277 | 278 | 281 | 282 | 285 | 286 | 289 | 290 | 293 | 294 | 297 | 298 | 301 | 302 | 303 | 306 | 309 | 310 | 313 | 314 | 317 | 318 | 321 | 322 | 325 | 326 | 329 | 330 | 333 | 334 | 337 | 338 | 341 | 342 | 345 | 346 | 347 | 348 | 349 | 350 | 351 | -------------------------------------------------------------------------------- /spec/spec_vec2f.cpp: -------------------------------------------------------------------------------- 1 | #include "spec_helper.h" 2 | #include 3 | using vectorial::vec2f; 4 | 5 | const int epsilon = 1; 6 | 7 | describe(vec2f, "constructing") { 8 | it("should have default constructor that does nothing..") { 9 | vec2f x; 10 | } 11 | 12 | it("should have constructor with element values") { 13 | vec2f x(10,20); 14 | // octave vec2f: [10,20] 15 | should_be_equal_vec2f(x, simd4f_create(10.000000000000000f, 20.000000000000000f, 0.0f, 0.0f), epsilon ); 16 | 17 | } 18 | 19 | it("should have constructor that loads from a float array") { 20 | float ary[2] = { 1,2 }; 21 | vec2f x(ary); 22 | // octave vec2f: [1,2] 23 | should_be_equal_vec2f(x, simd4f_create(1.000000000000000f, 2.000000000000000f, 0.0f, 0.0f), epsilon ); 24 | } 25 | 26 | } 27 | 28 | describe(vec2f, "loads and stores") { 29 | 30 | it("should have method for loading from a float array") { 31 | float ary[2] = { 1, 2 }; 32 | vec2f x(-1, -1 ); 33 | x.load(ary); 34 | // octave vec2f: [1,2] 35 | should_be_equal_vec2f(x, 
simd4f_create(1.000000000000000f, 2.000000000000000f, 0.0f, 0.0f), epsilon ); 36 | } 37 | 38 | it("should have method for storing to a float array") { 39 | float ary[2] = { -1, -1 }; 40 | vec2f x(1, 2); 41 | x.store(ary); 42 | should_be_close_to(ary[0], 1, epsilon); 43 | should_be_close_to(ary[1], 2, epsilon); 44 | } 45 | 46 | } 47 | 48 | 49 | describe(vec2f, "arithmetic with another vec2f") { 50 | 51 | it("should have operator+ for component-wise addition") { 52 | vec2f a(1,2); 53 | vec2f b(10,20); 54 | vec2f x = a + b; 55 | // octave vec2f: [1,2] + [10,20] 56 | should_be_equal_vec2f(x, simd4f_create(11.000000000000000f, 22.000000000000000f, 0.0f, 0.0f), epsilon ); 57 | 58 | } 59 | 60 | it("should have operator- for component-wise subtraction") { 61 | vec2f a(1,2); 62 | vec2f b(10,20); 63 | vec2f x = b - a; 64 | // octave vec2f: [10,20] - [1,2] 65 | should_be_equal_vec2f(x, simd4f_create(9.000000000000000f, 18.000000000000000f, 0.0f, 0.0f), epsilon ); 66 | 67 | } 68 | 69 | it("should have operator* for component-wise multiplication") { 70 | vec2f a(1,2); 71 | vec2f b(10,20); 72 | vec2f x = a * b; 73 | // octave vec2f: [1,2] .* [10,20] 74 | should_be_equal_vec2f(x, simd4f_create(10.000000000000000f, 40.000000000000000f, 0.0f, 0.0f), epsilon ); 75 | 76 | } 77 | 78 | it("should have operator/ for component-wise division") { 79 | vec2f a(1,2); 80 | vec2f b(10,20); 81 | vec2f x = b / a; 82 | // octave vec2f: [10,20] ./ [1,2] 83 | should_be_equal_vec2f(x, simd4f_create(10.000000000000000f, 10.000000000000000f, 0.0f, 0.0f), epsilon ); 84 | 85 | } 86 | 87 | 88 | 89 | it("should have operator+= for component-wise addition") { 90 | vec2f x(1,2); 91 | vec2f b(10,20); 92 | x += b; 93 | // octave vec2f: [1,2] + [10,20] 94 | should_be_equal_vec2f(x, simd4f_create(11.000000000000000f, 22.000000000000000f, 0.0f, 0.0f), epsilon ); 95 | 96 | } 97 | 98 | it("should have operator-= for component-wise subtraction") { 99 | vec2f a(1,2); 100 | vec2f x(10,20); 101 | x -= a; 102 | // 
octave vec2f: [10,20] - [1,2] 103 | should_be_equal_vec2f(x, simd4f_create(9.000000000000000f, 18.000000000000000f, 0.0f, 0.0f), epsilon ); 104 | 105 | } 106 | 107 | it("should have operator*= for component-wise multiplication") { 108 | vec2f x(1,2); 109 | vec2f b(10,20); 110 | x *= b; 111 | // octave vec2f: [1,2] .* [10,20] 112 | should_be_equal_vec2f(x, simd4f_create(10.000000000000000f, 40.000000000000000f, 0.0f, 0.0f), epsilon ); 113 | 114 | } 115 | 116 | it("should have operator/= for component-wise division") { 117 | vec2f a(1,2); 118 | vec2f x(10,20); 119 | x /= a; 120 | // octave vec2f: [10,20] ./ [1,2] 121 | should_be_equal_vec2f(x, simd4f_create(10.000000000000000f, 10.000000000000000f, 0.0f, 0.0f), epsilon ); 122 | 123 | } 124 | 125 | 126 | } 127 | 128 | 129 | describe(vec2f, "arithmetic with scalar") { 130 | 131 | it("should have operator+ for component-wise addition") { 132 | vec2f a(1,2); 133 | float b=10; 134 | vec2f x = a + b; 135 | // octave vec2f: [1,2] + 10 136 | should_be_equal_vec2f(x, simd4f_create(11.000000000000000f, 12.000000000000000f, 0.0f, 0.0f), epsilon ); 137 | 138 | } 139 | 140 | it("should have operator- for component-wise subtraction") { 141 | float a=10; 142 | vec2f b(10,20); 143 | vec2f x = b - a; 144 | // octave vec2f: [10,20] - 10 145 | should_be_equal_vec2f(x, simd4f_create(0.000000000000000f, 10.000000000000000f, 0.0f, 0.0f), epsilon ); 146 | 147 | } 148 | 149 | it("should have operator* for component-wise multiplication") { 150 | vec2f a(1,2); 151 | float b=10; 152 | vec2f x = a * b; 153 | // octave vec2f: [1,2] .* 10 154 | should_be_equal_vec2f(x, simd4f_create(10.000000000000000f, 20.000000000000000f, 0.0f, 0.0f), epsilon ); 155 | 156 | } 157 | 158 | it("should have operator/ for component-wise division") { 159 | vec2f a(10,20); 160 | float b=10; 161 | vec2f x = a / b; 162 | // octave vec2f: [10,20] ./ 10 163 | should_be_equal_vec2f(x, simd4f_create(1.000000000000000f, 2.000000000000000f, 0.0f, 0.0f), epsilon ); 164 | 165 | 
} 166 | 167 | 168 | 169 | it("should have operator+ for component-wise addition (float as lhs)") { 170 | vec2f b(1,2); 171 | float a=10; 172 | vec2f x = a + b; 173 | // octave vec2f: 10 + [1,2] 174 | should_be_equal_vec2f(x, simd4f_create(11.000000000000000f, 12.000000000000000f, 0.0f, 0.0f), epsilon ); 175 | 176 | } 177 | 178 | it("should have operator- for component-wise subtraction (float as lhs)") { 179 | float b=50; 180 | vec2f a(10,20); 181 | vec2f x = b - a; 182 | // octave vec2f: 50 - [10,20] 183 | should_be_equal_vec2f(x, simd4f_create(40.000000000000000f, 30.000000000000000f, 0.0f, 0.0f), epsilon ); 184 | 185 | } 186 | 187 | it("should have operator* for component-wise multiplication (float as lhs)") { 188 | vec2f b(1,2); 189 | float a=10; 190 | vec2f x = a * b; 191 | // octave vec2f: 10 .* [1,2] 192 | should_be_equal_vec2f(x, simd4f_create(10.000000000000000f, 20.000000000000000f, 0.0f, 0.0f), epsilon ); 193 | 194 | } 195 | 196 | it("should have operator* for component-wise multiplication (float as lhs)") { 197 | vec2f b(10,20); 198 | float a=40; 199 | vec2f x = a / b; 200 | // octave vec2f: 40 ./ [10,20] 201 | should_be_equal_vec2f(x, simd4f_create(4.000000000000000f, 2.000000000000000f, 0.0f, 0.0f), epsilon ); 202 | 203 | } 204 | 205 | 206 | } 207 | 208 | 209 | 210 | describe(vec2f, "vector math") { 211 | 212 | it("should have unary minus operator") { 213 | vec2f a(1,2); 214 | vec2f x = -a; 215 | // octave vec2f: -[1,2] 216 | should_be_equal_vec2f(x, simd4f_create(-1.000000000000000f, -2.000000000000000f, 0.0f, 0.0f), epsilon ); 217 | } 218 | 219 | 220 | it("should have dot function") { 221 | vec2f a(1,2); 222 | vec2f b(6,7); 223 | float x = vectorial::dot(a,b); 224 | 225 | // octave vec2f: dot([1,2],[6,7]) 226 | should_be_close_to(x, 20.000000000000000f, epsilon ); 227 | } 228 | 229 | it("should have length_squared function") { 230 | vec2f a(1,2); 231 | float x = vectorial::length_squared(a); 232 | 233 | // octave vec2f: dot([1,2],[1,2]) 234 | 
should_be_close_to(x, 5.000000000000000f, epsilon ); 235 | } 236 | 237 | it("should have length function") { 238 | vec2f a(1,2); 239 | float x = vectorial::length(a); 240 | 241 | // octave vec2f: norm([1,2]) 242 | should_be_close_to(x, 2.236067977499790f, epsilon ); 243 | } 244 | 245 | 246 | it("should have normalize function") { 247 | vec2f a(1,2); 248 | vec2f x = vectorial::normalize(a); 249 | // octave vec2f: [1,2] / norm([1,2]) 250 | should_be_equal_vec2f(x, simd4f_create(0.447213595499958f, 0.894427190999916f, 0.0f, 0.0f), epsilon ); 251 | } 252 | 253 | } 254 | 255 | 256 | -------------------------------------------------------------------------------- /spec/spec_simd2f.cpp: -------------------------------------------------------------------------------- 1 | 2 | #include "spec_helper.h" 3 | 4 | const int epsilon = 1; 5 | 6 | #ifdef VECTORIAL_HAVE_SIMD2F 7 | 8 | describe(simd2f, "sanity") { 9 | it("VECTORIAL_SIMD_TYPE should be defined to a string") { 10 | std::cout << "Simd type: " << VECTORIAL_SIMD_TYPE << std::endl; 11 | } 12 | } 13 | 14 | describe(simd2f, "creating") { 15 | 16 | it("should be possible to create with simd2f_create") { 17 | 18 | simd2f x = simd2f_create(1, 2); 19 | 20 | should_be_close_to( simd2f_get_x(x), 1, epsilon); 21 | should_be_close_to( simd2f_get_y(x), 2, epsilon); 22 | 23 | // octave simd2f: [1,2] 24 | should_be_equal_simd2f(x, simd2f_create(1.000000000000000f, 2.000000000000000f), epsilon ); 25 | 26 | } 27 | 28 | it("should have simd2f_zero for zero vector") { 29 | 30 | simd2f x = simd2f_zero(); 31 | 32 | // octave simd2f: [0,0] 33 | should_be_equal_simd2f(x, simd2f_create(0.000000000000000f, 0.000000000000000f), epsilon ); 34 | } 35 | 36 | 37 | } 38 | #ifdef _MSC_VER 39 | #include 40 | #else 41 | #include 42 | #endif 43 | 44 | #define unaligned_mem(n) ((float*)((unsigned char*)alloca(sizeof(float)*n+4)+4)) 45 | 46 | describe(simd2f, "utilities") { 47 | 48 | it("should have simd2f_uload2 for loading two float values from float an 
unaligned array into simd2f") { 49 | float *f = unaligned_mem(2); 50 | f[0] = 1; 51 | f[1] = 2; 52 | simd2f x = simd2f_uload2(f); 53 | // octave simd2f: [1,2] 54 | should_be_equal_simd2f(x, simd2f_create(1.000000000000000f, 2.000000000000000f), epsilon ); 55 | } 56 | 57 | it("should have simd2f_ustore2 for storing two float values from simd2f to an unaligned array") { 58 | float *f = unaligned_mem(2); 59 | f[0] = -1; 60 | f[1] = -1; 61 | simd2f a = simd2f_create(1,2); 62 | simd2f_ustore2(a, f); 63 | should_be_close_to(f[0], 1, epsilon); 64 | should_be_close_to(f[1], 2, epsilon); 65 | } 66 | 67 | 68 | it("should have simd2f_splat that expands a single scalar to all elements") { 69 | simd2f x = simd2f_splat(42); 70 | // octave simd2f: [42,42] 71 | should_be_equal_simd2f(x, simd2f_create(42.000000000000000f, 42.000000000000000f), epsilon ); 72 | } 73 | 74 | it("should have simd2f_splat_x,y splatting of an element") { 75 | simd2f a = simd2f_create(1,2); 76 | 77 | simd2f x; 78 | 79 | x = simd2f_splat_x(a); 80 | // octave simd2f: [1,1] 81 | should_be_equal_simd2f(x, simd2f_create(1.000000000000000f, 1.000000000000000f), epsilon ); 82 | 83 | x = simd2f_splat_y(a); 84 | // octave simd2f: [2,2] 85 | should_be_equal_simd2f(x, simd2f_create(2.000000000000000f, 2.000000000000000f), epsilon ); 86 | 87 | } 88 | 89 | #if 0 90 | it("should have simd2f_sum that adds elements") { 91 | simd2f a = simd2f_create(1,2); 92 | simd2f x = simd2f_sum(a); 93 | // octave simd2f: [sum([1,2]), sum([1,2,3,4])] 94 | should_be_equal_simd2f(x, simd2f_create(3.000000000000000f, 10.000000000000000f), epsilon ); 95 | 96 | } 97 | #endif 98 | 99 | it("should have simd2f_reciprocal") { 100 | simd2f a = simd2f_create(0.00001f, 2.00001f); 101 | simd2f x = simd2f_reciprocal(a); 102 | // octave simd2f: 1 ./ [0.00001, 2.00001] 103 | should_be_equal_simd2f(x, simd2f_create(99999.999999999985448f, 0.499997500012500f), epsilon ); 104 | } 105 | 106 | it("should have simd2f_sqrt") { 107 | simd2f a = 
simd2f_create(0.00001f, 2.00001f); 108 | simd2f x = simd2f_sqrt(a); 109 | // octave simd2f: sqrt([0.00001, 2.00001]) 110 | should_be_equal_simd2f(x, simd2f_create(0.003162277660168f, 1.414217097902582f), epsilon ); 111 | 112 | x = simd2f_sqrt( simd2f_create(0.0f, 0.0f) ); 113 | // octave simd2f: sqrt([0, 0]) 114 | should_be_equal_simd2f(x, simd2f_create(0.000000000000000f, 0.000000000000000f), epsilon ); 115 | } 116 | 117 | it("should have simd2f_rsqrt for reciprocal of square-root") { 118 | simd2f a = simd2f_create(0.00001f, 2.00001f); 119 | simd2f x = simd2f_rsqrt(a); 120 | const int epsilon = 4; // Grant larger error 121 | // octave simd2f: 1 ./ sqrt([0.00001, 2.00001]) 122 | should_be_equal_simd2f(x, simd2f_create(316.227766016837904f, 0.707105013426224f), epsilon ); 123 | } 124 | 125 | } 126 | 127 | describe(simd2f, "arithmetic with another simd2f") { 128 | 129 | it("should have simd2f_add for component-wise addition") { 130 | simd2f a = simd2f_create(1,2); 131 | simd2f b = simd2f_create(10,20); 132 | 133 | simd2f x = simd2f_add(a,b); 134 | // octave simd2f: [1,2] + [10,20] 135 | should_be_equal_simd2f(x, simd2f_create(11.000000000000000f, 22.000000000000000f), epsilon ); 136 | } 137 | 138 | it("should have simd2f_sub for component-wise subtraction") { 139 | simd2f a = simd2f_create(1,2); 140 | simd2f b = simd2f_create(10,20); 141 | 142 | simd2f x = simd2f_sub(b,a); 143 | // octave simd2f: [10,20] - [1,2] 144 | should_be_equal_simd2f(x, simd2f_create(9.000000000000000f, 18.000000000000000f), epsilon ); 145 | } 146 | 147 | it("should have simd2f_mul for component-wise multiply") { 148 | simd2f a = simd2f_create(1,2); 149 | simd2f b = simd2f_create(10,20); 150 | 151 | simd2f x = simd2f_mul(a,b); 152 | // octave simd2f: [1,2] .* [10,20] 153 | should_be_equal_simd2f(x, simd2f_create(10.000000000000000f, 40.000000000000000f), epsilon ); 154 | } 155 | 156 | it("should have simd2f_div for component-wise division") { 157 | simd2f a = simd2f_create(1,2); 158 | simd2f b 
= simd2f_create(10,20); 159 | 160 | simd2f x = simd2f_div(b,a); 161 | // octave simd2f: [10,20] ./ [1,2] 162 | should_be_equal_simd2f(x, simd2f_create(10.000000000000000f, 10.000000000000000f), epsilon ); 163 | } 164 | 165 | it("should have simd2f_madd for multiply-add") { 166 | simd2f a = simd2f_create(1,2); 167 | simd2f b = simd2f_create(100,100); 168 | simd2f c = simd2f_create(6,7); 169 | 170 | simd2f x = simd2f_madd(a,b,c); 171 | // octave simd2f: [1,2] .* [100,100] .+ [6,7] 172 | should_be_equal_simd2f(x, simd2f_create(106.000000000000000f, 207.000000000000000f), epsilon ); 173 | 174 | } 175 | 176 | } 177 | 178 | 179 | describe(simd2f, "vector math") { 180 | 181 | it("should have simd2f_dot2 for two component dot product") { 182 | simd2f a = simd2f_create(1,2); 183 | simd2f b = simd2f_create(10,20); 184 | 185 | simd2f x = simd2f_dot2(a,b); 186 | // octave simd2f: [dot([1, 2], [10, 20]),dot([1, 2], [10, 20])] 187 | should_be_equal_simd2f(x, simd2f_create(50.000000000000000f, 50.000000000000000f), epsilon ); 188 | } 189 | 190 | it("should have simd2f_length2 for two component vector length") { 191 | simd2f a = simd2f_create(1,2); 192 | simd2f x = simd2f_length2(a); 193 | // octave simd2f: [norm([1,2]),norm([1,2])] 194 | should_be_equal_simd2f(x, simd2f_create(2.236067977499790f, 2.236067977499790f), epsilon ); 195 | 196 | } 197 | 198 | 199 | it("should have simd2f_length2_squared for two component squared vector length") { 200 | simd2f a = simd2f_create(1,2); 201 | simd2f x = simd2f_length2_squared(a); 202 | // octave simd2f: ([dot([1,2], [1,2]), dot([1,2], [1,2])]) 203 | should_be_equal_simd2f(x, simd2f_create(5.000000000000000f, 5.000000000000000f), epsilon ); 204 | 205 | } 206 | 207 | it("should have simd2f_normalize2 for normalizing two component vector to unit length") { 208 | simd2f a = simd2f_create(1,2); 209 | simd2f x = simd2f_normalize2(a); 210 | // octave simd2f: [1,2] / norm([1,2]) 211 | should_be_equal_simd2f(x, simd2f_create(0.447213595499958f, 
0.894427190999916f), epsilon ); 212 | } 213 | 214 | } 215 | 216 | 217 | describe(simd2f, "min-max") { 218 | 219 | it("should have simd2f_min for choosing minimum elements") { 220 | simd2f a = simd2f_create(1.0f, 2.0f); 221 | simd2f b = simd2f_create(2.0f, -2.0f); 222 | 223 | simd2f x = simd2f_min(a,b); 224 | should_be_equal_simd2f(x, simd2f_create(1.0f, -2.0f), epsilon); 225 | 226 | } 227 | 228 | it("should have simd2f_max for choosing maximum elements") { 229 | simd2f a = simd2f_create(1.0f, 2.0f); 230 | simd2f b = simd2f_create(2.0f, -2.0f); 231 | 232 | simd2f x = simd2f_max(a,b); 233 | should_be_equal_simd2f(x, simd2f_create(2.0f, 2.0f), epsilon); 234 | 235 | } 236 | 237 | } 238 | 239 | 240 | 241 | #endif 242 | 243 | -------------------------------------------------------------------------------- /include/vectorial/simd4f_neon.h: -------------------------------------------------------------------------------- 1 | /* 2 | Vectorial 3 | Copyright (c) 2010 Mikko Lehtonen 4 | Copyright (c) 2014 Google, Inc. 
5 | Licensed under the terms of the two-clause BSD License (see LICENSE) 6 | */ 7 | #ifndef VECTORIAL_SIMD4F_NEON_H 8 | #define VECTORIAL_SIMD4F_NEON_H 9 | 10 | #include 11 | 12 | #ifdef __cplusplus 13 | extern "C" { 14 | #endif 15 | 16 | 17 | typedef float32x4_t simd4f; 18 | typedef float32x2_t simd2f; 19 | 20 | typedef union { 21 | simd4f s ; 22 | float f[4]; 23 | } _simd4f_union; 24 | 25 | 26 | 27 | vectorial_inline simd4f simd4f_create(float x, float y, float z, float w) { 28 | const float32_t d[4] = { x,y,z,w }; 29 | simd4f s = vld1q_f32(d); 30 | return s; 31 | } 32 | 33 | vectorial_inline simd4f simd4f_zero() { return vdupq_n_f32(0.0f); } 34 | 35 | vectorial_inline simd4f simd4f_uload4(const float *ary) { 36 | const float32_t* ary32 = (const float32_t*)ary; 37 | simd4f s = vld1q_f32(ary32); 38 | return s; 39 | } 40 | 41 | vectorial_inline simd4f simd4f_uload3(const float *ary) { 42 | simd4f s = simd4f_create(ary[0], ary[1], ary[2], 0); 43 | return s; 44 | } 45 | 46 | vectorial_inline simd4f simd4f_uload2(const float *ary) { 47 | const float32_t* ary32 = (const float32_t*)ary; 48 | float32x2_t low = vld1_f32(ary32); 49 | const float32_t zero = 0; 50 | float32x2_t high = vld1_dup_f32(&zero); // { 0,0 } but stupid warnings from llvm-gcc 51 | return vcombine_f32(low, high); 52 | } 53 | 54 | 55 | vectorial_inline void simd4f_ustore4(const simd4f val, float *ary) { 56 | vst1q_f32( (float32_t*)ary, val); 57 | } 58 | 59 | vectorial_inline void simd4f_ustore3(const simd4f val, float *ary) { 60 | float* local_data = ary; 61 | vst1q_lane_f32(local_data++, val, 0); 62 | vst1q_lane_f32(local_data++, val, 1); 63 | vst1q_lane_f32(local_data, val, 2); 64 | } 65 | 66 | vectorial_inline void simd4f_ustore2(const simd4f val, float *ary) { 67 | const float32x2_t low = vget_low_f32(val); 68 | vst1_f32( (float32_t*)ary, low); 69 | } 70 | 71 | 72 | 73 | 74 | vectorial_inline simd4f simd4f_splat(float v) { 75 | simd4f s = vdupq_n_f32(v); 76 | return s; 77 | } 78 | 79 | // todo: or 
is simd4f_splat(simd4f_get_x(v)) better? 80 | 81 | vectorial_inline simd4f simd4f_splat_x(simd4f v) { 82 | float32x2_t o = vget_low_f32(v); 83 | simd4f ret = vdupq_lane_f32(o, 0); 84 | return ret; 85 | } 86 | 87 | vectorial_inline simd4f simd4f_splat_y(simd4f v) { 88 | float32x2_t o = vget_low_f32(v); 89 | simd4f ret = vdupq_lane_f32(o, 1); 90 | return ret; 91 | } 92 | 93 | vectorial_inline simd4f simd4f_splat_z(simd4f v) { 94 | float32x2_t o = vget_high_f32(v); 95 | simd4f ret = vdupq_lane_f32(o, 0); 96 | return ret; 97 | } 98 | 99 | vectorial_inline simd4f simd4f_splat_w(simd4f v) { 100 | float32x2_t o = vget_high_f32(v); 101 | simd4f ret = vdupq_lane_f32(o, 1); 102 | return ret; 103 | } 104 | 105 | vectorial_inline simd4f simd4f_reciprocal(simd4f v) { 106 | simd4f estimate = vrecpeq_f32(v); 107 | estimate = vmulq_f32(vrecpsq_f32(estimate, v), estimate); 108 | estimate = vmulq_f32(vrecpsq_f32(estimate, v), estimate); 109 | return estimate; 110 | } 111 | 112 | vectorial_inline void simd4f_rsqrt_1iteration(const simd4f& v, simd4f& estimate) { 113 | simd4f estimate2 = vmulq_f32(estimate, v); 114 | estimate = vmulq_f32(estimate, vrsqrtsq_f32(estimate2, estimate)); 115 | } 116 | 117 | vectorial_inline simd4f simd4f_rsqrt1(simd4f v) { 118 | simd4f estimate = vrsqrteq_f32(v); 119 | simd4f_rsqrt_1iteration(v, estimate); 120 | return estimate; 121 | } 122 | 123 | vectorial_inline simd4f simd4f_rsqrt2(simd4f v) { 124 | simd4f estimate = vrsqrteq_f32(v); 125 | simd4f_rsqrt_1iteration(v, estimate); 126 | simd4f_rsqrt_1iteration(v, estimate); 127 | return estimate; 128 | } 129 | 130 | vectorial_inline simd4f simd4f_rsqrt3(simd4f v) { 131 | simd4f estimate = vrsqrteq_f32(v); 132 | simd4f_rsqrt_1iteration(v, estimate); 133 | simd4f_rsqrt_1iteration(v, estimate); 134 | simd4f_rsqrt_1iteration(v, estimate); 135 | return estimate; 136 | } 137 | 138 | // http://en.wikipedia.org/wiki/Fast_inverse_square_root makes the argument for 139 | // one iteration but two gives a signficant 
accuracy improvment. 140 | vectorial_inline simd4f simd4f_rsqrt(simd4f v) { 141 | return simd4f_rsqrt2(v); 142 | } 143 | 144 | vectorial_inline simd4f simd4f_sqrt(simd4f v) { 145 | 146 | return vreinterpretq_f32_u32(vandq_u32( vtstq_u32(vreinterpretq_u32_f32(v), 147 | vreinterpretq_u32_f32(v)), 148 | vreinterpretq_u32_f32( 149 | simd4f_reciprocal(simd4f_rsqrt(v))) 150 | ) 151 | ); 152 | 153 | } 154 | 155 | 156 | 157 | // arithmetics 158 | 159 | vectorial_inline simd4f simd4f_add(simd4f lhs, simd4f rhs) { 160 | simd4f ret = vaddq_f32(lhs, rhs); 161 | return ret; 162 | } 163 | 164 | vectorial_inline simd4f simd4f_sub(simd4f lhs, simd4f rhs) { 165 | simd4f ret = vsubq_f32(lhs, rhs); 166 | return ret; 167 | } 168 | 169 | vectorial_inline simd4f simd4f_mul(simd4f lhs, simd4f rhs) { 170 | simd4f ret = vmulq_f32(lhs, rhs); 171 | return ret; 172 | } 173 | 174 | vectorial_inline simd4f simd4f_div(simd4f lhs, simd4f rhs) { 175 | simd4f recip = simd4f_reciprocal( rhs ); 176 | simd4f ret = vmulq_f32(lhs, recip); 177 | return ret; 178 | } 179 | 180 | vectorial_inline simd4f simd4f_madd(simd4f m1, simd4f m2, simd4f a) { 181 | return vmlaq_f32( a, m1, m2 ); 182 | } 183 | 184 | 185 | 186 | vectorial_inline float simd4f_get_x(simd4f s) { return vgetq_lane_f32(s, 0); } 187 | vectorial_inline float simd4f_get_y(simd4f s) { return vgetq_lane_f32(s, 1); } 188 | vectorial_inline float simd4f_get_z(simd4f s) { return vgetq_lane_f32(s, 2); } 189 | vectorial_inline float simd4f_get_w(simd4f s) { return vgetq_lane_f32(s, 3); } 190 | 191 | // This function returns x*x+y*y+z*z and ignores the w component. 
192 | vectorial_inline float simd4f_dot3_scalar(simd4f lhs, simd4f rhs) { 193 | const simd4f m = simd4f_mul(lhs, rhs); 194 | simd2f s1 = vpadd_f32(vget_low_f32(m), vget_low_f32(m)); 195 | s1 = vadd_f32(s1, vget_high_f32(m)); 196 | return vget_lane_f32(s1, 0); 197 | } 198 | 199 | vectorial_inline simd4f simd4f_dot3(simd4f lhs, simd4f rhs) { 200 | return simd4f_splat(simd4f_dot3_scalar(lhs, rhs)); 201 | } 202 | 203 | vectorial_inline simd4f simd4f_cross3(simd4f lhs, simd4f rhs) { 204 | // Compute lhs and rhs in order yzx 205 | simd2f lhs_low = vget_low_f32(lhs); 206 | simd2f rhs_low = vget_low_f32(rhs); 207 | simd4f lhs_yzx = vcombine_f32(vext_f32(lhs_low, vget_high_f32(lhs),1), lhs_low); 208 | simd4f rhs_yzx = vcombine_f32(vext_f32(rhs_low, vget_high_f32(rhs),1), rhs_low); 209 | // Compute cross in order zxy 210 | simd4f s3 = simd4f_sub(simd4f_mul(rhs_yzx, lhs), simd4f_mul(lhs_yzx, rhs)); 211 | // Permute cross to order xyz and zero out the fourth value 212 | simd2f low = vget_low_f32(s3); 213 | static const uint32_t mask_array[] = { 214 | 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0}; 215 | static const int32x4_t mask = vld1q_s32((const int32_t*)mask_array); 216 | s3 = vcombine_f32(vext_f32(low, vget_high_f32(s3), 1), low); 217 | return (simd4f)vandq_s32((int32x4_t)s3,mask); 218 | } 219 | 220 | vectorial_inline simd4f simd4f_shuffle_wxyz(simd4f s) { 221 | _simd4f_union u = {s}; 222 | return simd4f_create( u.f[3], u.f[0], u.f[1], u.f[2]); 223 | } 224 | 225 | vectorial_inline simd4f simd4f_shuffle_zwxy(simd4f s) { 226 | _simd4f_union u = {s}; 227 | return simd4f_create(u.f[2], u.f[3], u.f[0], u.f[1]); 228 | } 229 | 230 | vectorial_inline simd4f simd4f_shuffle_yzwx(simd4f s) { 231 | _simd4f_union u = {s}; 232 | return simd4f_create(u.f[1], u.f[2], u.f[3], u.f[0]); 233 | } 234 | 235 | 236 | vectorial_inline simd4f simd4f_zero_w(simd4f s) { 237 | _simd4f_union u = {s}; 238 | return simd4f_create(u.f[0], u.f[1], u.f[2], 0.0f); 239 | } 240 | 241 | vectorial_inline simd4f 
simd4f_zero_zw(simd4f s) { 242 | _simd4f_union u = {s}; 243 | return simd4f_create(u.f[0], u.f[1], 0.0f, 0.0f); 244 | } 245 | 246 | 247 | vectorial_inline simd4f simd4f_merge_high(simd4f xyzw, simd4f abcd) { 248 | _simd4f_union u1 = {xyzw}; 249 | _simd4f_union u2 = {abcd}; 250 | return simd4f_create(u1.f[2], u1.f[3], u2.f[2], u2.f[3]); 251 | } 252 | 253 | vectorial_inline simd4f simd4f_flip_sign_0101(simd4f s) { 254 | const unsigned int upnpn[4] = { 0x00000000, 0x80000000, 0x00000000, 0x80000000 }; 255 | const uint32x4_t pnpn = vld1q_u32( upnpn ); 256 | return vreinterpretq_f32_u32( veorq_u32( vreinterpretq_u32_f32(s), pnpn ) ); 257 | } 258 | 259 | vectorial_inline simd4f simd4f_flip_sign_1010(simd4f s) { 260 | const unsigned int unpnp[4] = { 0x80000000, 0x00000000, 0x80000000, 0x00000000 }; 261 | const uint32x4_t npnp = vld1q_u32( unpnp ); 262 | return vreinterpretq_f32_u32( veorq_u32( vreinterpretq_u32_f32(s), npnp ) ); 263 | } 264 | 265 | 266 | vectorial_inline simd4f simd4f_min(simd4f a, simd4f b) { 267 | return vminq_f32( a, b ); 268 | } 269 | 270 | vectorial_inline simd4f simd4f_max(simd4f a, simd4f b) { 271 | return vmaxq_f32( a, b ); 272 | } 273 | 274 | 275 | #ifdef __cplusplus 276 | } 277 | #endif 278 | 279 | 280 | #endif 281 | -------------------------------------------------------------------------------- /spec/spec_vec3f.cpp: -------------------------------------------------------------------------------- 1 | #include "spec_helper.h" 2 | #include 3 | using vectorial::vec3f; 4 | 5 | const int epsilon = 1; 6 | 7 | describe(vec3f, "constructing") { 8 | it("should have default constructor that does nothing..") { 9 | vec3f x; 10 | } 11 | 12 | it("should have constructor with element values") { 13 | vec3f x(10,20,30); 14 | // octave vec3f: [10,20,30] 15 | should_be_equal_vec3f(x, simd4f_create(10.000000000000000f, 20.000000000000000f, 30.000000000000000f, 0.0f), epsilon ); 16 | 17 | } 18 | 19 | it("should have constructor that loads from a float array") { 20 
| float ary[3] = { 1,2,3 }; 21 | vec3f x(ary); 22 | // octave vec3f: [1,2,3] 23 | should_be_equal_vec3f(x, simd4f_create(1.000000000000000f, 2.000000000000000f, 3.000000000000000f, 0.0f), epsilon ); 24 | } 25 | 26 | } 27 | 28 | describe(vec3f, "loads and stores") { 29 | 30 | it("should have method for loading from a float array") { 31 | float ary[3] = { 1,2,3 }; 32 | vec3f x(-1, -1, -1 ); 33 | x.load(ary); 34 | // octave vec3f: [1,2,3] 35 | should_be_equal_vec3f(x, simd4f_create(1.000000000000000f, 2.000000000000000f, 3.000000000000000f, 0.0f), epsilon ); 36 | } 37 | 38 | it("should have method for storing to a float array") { 39 | float ary[3] = { -1, -1, -1 }; 40 | vec3f x(1, 2, 3); 41 | x.store(ary); 42 | should_be_close_to(ary[0], 1, epsilon); 43 | should_be_close_to(ary[1], 2, epsilon); 44 | should_be_close_to(ary[2], 3, epsilon); 45 | } 46 | 47 | } 48 | 49 | describe(vec3f, "arithmetic with another vec3f") { 50 | 51 | it("should have operator+ for component-wise addition") { 52 | vec3f a(1,2,3); 53 | vec3f b(10,20,30); 54 | vec3f x = a + b; 55 | // octave vec3f: [1,2,3] + [10,20,30] 56 | should_be_equal_vec3f(x, simd4f_create(11.000000000000000f, 22.000000000000000f, 33.000000000000000f, 0.0f), epsilon ); 57 | 58 | } 59 | 60 | it("should have operator- for component-wise subtraction") { 61 | vec3f a(1,2,3); 62 | vec3f b(10,20,30); 63 | vec3f x = b - a; 64 | // octave vec3f: [10,20,30] - [1,2,3] 65 | should_be_equal_vec3f(x, simd4f_create(9.000000000000000f, 18.000000000000000f, 27.000000000000000f, 0.0f), epsilon ); 66 | 67 | } 68 | 69 | it("should have operator* for component-wise multiplication") { 70 | vec3f a(1,2,3); 71 | vec3f b(10,20,30); 72 | vec3f x = a * b; 73 | // octave vec3f: [1,2,3] .* [10,20,30] 74 | should_be_equal_vec3f(x, simd4f_create(10.000000000000000f, 40.000000000000000f, 90.000000000000000f, 0.0f), epsilon ); 75 | 76 | } 77 | 78 | it("should have operator/ for component-wise division") { 79 | vec3f a(1,2,3); 80 | vec3f b(10,20,30); 81 | 
vec3f x = b / a; 82 | // octave vec3f: [10,20,30] ./ [1,2,3] 83 | should_be_equal_vec3f(x, simd4f_create(10.000000000000000f, 10.000000000000000f, 10.000000000000000f, 0.0f), epsilon ); 84 | 85 | } 86 | 87 | 88 | 89 | it("should have operator+= for component-wise addition") { 90 | vec3f x(1,2,3); 91 | vec3f b(10,20,30); 92 | x += b; 93 | // octave vec3f: [1,2,3] + [10,20,30] 94 | should_be_equal_vec3f(x, simd4f_create(11.000000000000000f, 22.000000000000000f, 33.000000000000000f, 0.0f), epsilon ); 95 | 96 | } 97 | 98 | it("should have operator-= for component-wise subtraction") { 99 | vec3f a(1,2,3); 100 | vec3f x(10,20,30); 101 | x -= a; 102 | // octave vec3f: [10,20,30] - [1,2,3] 103 | should_be_equal_vec3f(x, simd4f_create(9.000000000000000f, 18.000000000000000f, 27.000000000000000f, 0.0f), epsilon ); 104 | 105 | } 106 | 107 | it("should have operator*= for component-wise multiplication") { 108 | vec3f x(1,2,3); 109 | vec3f b(10,20,30); 110 | x *= b; 111 | // octave vec3f: [1,2,3] .* [10,20,30] 112 | should_be_equal_vec3f(x, simd4f_create(10.000000000000000f, 40.000000000000000f, 90.000000000000000f, 0.0f), epsilon ); 113 | 114 | } 115 | 116 | it("should have operator/= for component-wise division") { 117 | vec3f a(1,2,3); 118 | vec3f x(10,20,30); 119 | x /= a; 120 | // octave vec3f: [10,20,30] ./ [1,2,3] 121 | should_be_equal_vec3f(x, simd4f_create(10.000000000000000f, 10.000000000000000f, 10.000000000000000f, 0.0f), epsilon ); 122 | 123 | } 124 | 125 | } 126 | 127 | 128 | describe(vec3f, "arithmetic with scalar") { 129 | 130 | it("should have operator+ for component-wise addition") { 131 | vec3f a(1,2,3); 132 | float b=10; 133 | vec3f x = a + b; 134 | // octave vec3f: [1,2,3] + 10 135 | should_be_equal_vec3f(x, simd4f_create(11.000000000000000f, 12.000000000000000f, 13.000000000000000f, 0.0f), epsilon ); 136 | 137 | } 138 | 139 | it("should have operator- for component-wise subtraction") { 140 | float a=10; 141 | vec3f b(10,20,30); 142 | vec3f x = b - a; 143 | 
// octave vec3f: [10,20,30] - 10 144 | should_be_equal_vec3f(x, simd4f_create(0.000000000000000f, 10.000000000000000f, 20.000000000000000f, 0.0f), epsilon ); 145 | 146 | } 147 | 148 | it("should have operator* for component-wise multiplication") { 149 | vec3f a(1,2,3); 150 | float b=10; 151 | vec3f x = a * b; 152 | // octave vec3f: [1,2,3] .* 10 153 | should_be_equal_vec3f(x, simd4f_create(10.000000000000000f, 20.000000000000000f, 30.000000000000000f, 0.0f), epsilon ); 154 | 155 | } 156 | 157 | it("should have operator/ for component-wise division") { 158 | vec3f a(10,20,30); 159 | float b=10; 160 | vec3f x = a / b; 161 | // octave vec3f: [10,20,30] ./ 10 162 | should_be_equal_vec3f(x, simd4f_create(1.000000000000000f, 2.000000000000000f, 3.000000000000000f, 0.0f), epsilon ); 163 | 164 | } 165 | 166 | 167 | 168 | it("should have operator+ for component-wise addition (float as lhs)") { 169 | vec3f b(1,2,3); 170 | float a=10; 171 | vec3f x = a + b; 172 | // octave vec3f: 10 + [1,2,3] 173 | should_be_equal_vec3f(x, simd4f_create(11.000000000000000f, 12.000000000000000f, 13.000000000000000f, 0.0f), epsilon ); 174 | 175 | } 176 | 177 | it("should have operator- for component-wise subtraction (float as lhs)") { 178 | float b=50; 179 | vec3f a(10,20,30); 180 | vec3f x = b - a; 181 | // octave vec3f: 50 - [10,20,30] 182 | should_be_equal_vec3f(x, simd4f_create(40.000000000000000f, 30.000000000000000f, 20.000000000000000f, 0.0f), epsilon ); 183 | 184 | } 185 | 186 | it("should have operator* for component-wise multiplication (float as lhs)") { 187 | vec3f b(1,2,3); 188 | float a=10; 189 | vec3f x = a * b; 190 | // octave vec3f: 10 .* [1,2,3] 191 | should_be_equal_vec3f(x, simd4f_create(10.000000000000000f, 20.000000000000000f, 30.000000000000000f, 0.0f), epsilon ); 192 | 193 | } 194 | 195 | it("should have operator* for component-wise multiplication (float as lhs)") { 196 | vec3f b(10,20,30); 197 | float a=40; 198 | vec3f x = a / b; 199 | // octave vec3f: 40 ./ [10,20,30] 
200 | should_be_equal_vec3f(x, simd4f_create(4.000000000000000f, 2.000000000000000f, 1.333333333333333f, 0.0f), epsilon ); 201 | 202 | } 203 | 204 | 205 | } 206 | 207 | 208 | 209 | describe(vec3f, "vector math") { 210 | 211 | it("should have unary minus operator") { 212 | vec3f a(1,2,3); 213 | vec3f x = -a; 214 | // octave vec3f: -[1,2,3] 215 | should_be_equal_vec3f(x, simd4f_create(-1.000000000000000f, -2.000000000000000f, -3.000000000000000f, 0.0f), epsilon ); 216 | } 217 | 218 | 219 | it("should have dot function") { 220 | vec3f a(1,2,3); 221 | vec3f b(6,7,8); 222 | float x = vectorial::dot(a,b); 223 | 224 | // octave vec3f: dot([1,2,3],[6,7,8]) 225 | should_be_close_to(x, 44.000000000000000f, epsilon ); 226 | } 227 | 228 | it("should have cross function") { 229 | vec3f a(1,2,3); 230 | vec3f b(6,7,8); 231 | vec3f x = vectorial::cross(a,b); 232 | 233 | // octave vec3f: cross([1,2,3],[6,7,8]) 234 | should_be_equal_vec3f(x, simd4f_create(-5.000000000000000f, 10.000000000000000f, -5.000000000000000f, 0.0f), epsilon ); 235 | } 236 | 237 | it("should have length_squared function") { 238 | vec3f a(1,2,3); 239 | float x = vectorial::length_squared(a); 240 | 241 | // octave vec3f: dot([1,2,3],[1,2,3]) 242 | should_be_close_to(x, 14.000000000000000f, epsilon ); 243 | } 244 | 245 | it("should have length function") { 246 | vec3f a(1,2,3); 247 | float x = vectorial::length(a); 248 | 249 | // octave vec3f: norm([1,2,3]) 250 | should_be_close_to(x, 3.741657386773941f, epsilon ); 251 | } 252 | 253 | 254 | it("should have normalize function") { 255 | vec3f a(1,2,3); 256 | vec3f x = vectorial::normalize(a); 257 | // octave vec3f: [1,2,3] / norm([1,2,3]) 258 | should_be_equal_vec3f(x, simd4f_create(0.267261241912424f, 0.534522483824849f, 0.801783725737273f, 0.0f), epsilon ); 259 | } 260 | 261 | } 262 | 263 | 264 | -------------------------------------------------------------------------------- /spec/spec_vec4f.cpp: 
-------------------------------------------------------------------------------- 1 | #include "spec_helper.h" 2 | #include 3 | using vectorial::vec4f; 4 | 5 | const int epsilon = 1; 6 | 7 | describe(vec4f, "constructing") { 8 | it("should have default constructor that does nothing..") { 9 | vec4f x; 10 | } 11 | 12 | it("should have constructor with element values") { 13 | vec4f x(10,20,30,40); 14 | // octave vec4f: [10,20,30,40] 15 | should_be_equal_vec4f(x, simd4f_create(10.000000000000000f, 20.000000000000000f, 30.000000000000000f, 40.000000000000000f), epsilon ); 16 | 17 | } 18 | 19 | it("should have constructor that loads from a float array") { 20 | float ary[4] = { 1,2,3,4 }; 21 | vec4f x(ary); 22 | // octave vec4f: [1,2,3,4] 23 | should_be_equal_vec4f(x, simd4f_create(1.000000000000000f, 2.000000000000000f, 3.000000000000000f, 4.000000000000000f), epsilon ); 24 | } 25 | 26 | } 27 | 28 | describe(vec4f, "loads and stores") { 29 | 30 | 31 | it("should have method for loading from a float array") { 32 | float ary[4] = { 1,2,3,4 }; 33 | vec4f x(-1, -1, -1, -1); 34 | x.load(ary); 35 | // octave vec4f: [1,2,3,4] 36 | should_be_equal_vec4f(x, simd4f_create(1.000000000000000f, 2.000000000000000f, 3.000000000000000f, 4.000000000000000f), epsilon ); 37 | } 38 | 39 | it("should have method for storing to a float array") { 40 | float ary[4] = { -1, -1, -1, -1 }; 41 | vec4f x(1, 2, 3, 4); 42 | x.store(ary); 43 | should_be_close_to(ary[0], 1, epsilon); 44 | should_be_close_to(ary[1], 2, epsilon); 45 | should_be_close_to(ary[2], 3, epsilon); 46 | should_be_close_to(ary[3], 4, epsilon); 47 | } 48 | 49 | } 50 | 51 | describe(vec4f, "arithmetic with another vec4f") { 52 | 53 | it("should have operator+ for component-wise addition") { 54 | vec4f a(1,2,3,4); 55 | vec4f b(10,20,30,40); 56 | vec4f x = a + b; 57 | // octave vec4f: [1,2,3,4] + [10,20,30,40] 58 | should_be_equal_vec4f(x, simd4f_create(11.000000000000000f, 22.000000000000000f, 33.000000000000000f, 
44.000000000000000f), epsilon ); 59 | 60 | } 61 | 62 | it("should have operator- for component-wise subtraction") { 63 | vec4f a(1,2,3,4); 64 | vec4f b(10,20,30,40); 65 | vec4f x = b - a; 66 | // octave vec4f: [10,20,30,40] - [1,2,3,4] 67 | should_be_equal_vec4f(x, simd4f_create(9.000000000000000f, 18.000000000000000f, 27.000000000000000f, 36.000000000000000f), epsilon ); 68 | 69 | } 70 | 71 | it("should have operator* for component-wise multiplication") { 72 | vec4f a(1,2,3,4); 73 | vec4f b(10,20,30,40); 74 | vec4f x = a * b; 75 | // octave vec4f: [1,2,3,4] .* [10,20,30,40] 76 | should_be_equal_vec4f(x, simd4f_create(10.000000000000000f, 40.000000000000000f, 90.000000000000000f, 160.000000000000000f), epsilon ); 77 | 78 | } 79 | 80 | it("should have operator/ for component-wise division") { 81 | vec4f a(1,2,3,4); 82 | vec4f b(10,20,30,40); 83 | vec4f x = b / a; 84 | // octave vec4f: [10,20,30,40] ./ [1,2,3,4] 85 | should_be_equal_vec4f(x, simd4f_create(10.000000000000000f, 10.000000000000000f, 10.000000000000000f, 10.000000000000000f), epsilon ); 86 | 87 | } 88 | 89 | 90 | 91 | 92 | it("should have operator+= for component-wise addition") { 93 | vec4f x(1,2,3,4); 94 | vec4f b(10,20,30,40); 95 | x += b; 96 | // octave vec4f: [1,2,3,4] + [10,20,30,40] 97 | should_be_equal_vec4f(x, simd4f_create(11.000000000000000f, 22.000000000000000f, 33.000000000000000f, 44.000000000000000f), epsilon ); 98 | 99 | } 100 | 101 | it("should have operator-= for component-wise subtraction") { 102 | vec4f a(1,2,3,4); 103 | vec4f x(10,20,30,40); 104 | x -= a; 105 | // octave vec4f: [10,20,30,40] - [1,2,3,4] 106 | should_be_equal_vec4f(x, simd4f_create(9.000000000000000f, 18.000000000000000f, 27.000000000000000f, 36.000000000000000f), epsilon ); 107 | 108 | } 109 | 110 | it("should have operator*= for component-wise multiplication") { 111 | vec4f x(1,2,3,4); 112 | vec4f b(10,20,30,40); 113 | x *= b; 114 | // octave vec4f: [1,2,3,4] .* [10,20,30,40] 115 | should_be_equal_vec4f(x, 
simd4f_create(10.000000000000000f, 40.000000000000000f, 90.000000000000000f, 160.000000000000000f), epsilon );
    }

    // Keep each "// octave vec4f:" marker directly above its one-line assertion;
    // tools/update_spec.rb rewrites whatever line follows a marker.

    it("should have operator/= for component-wise division") {
        vec4f x(10,20,30,40);
        vec4f divisor(1,2,3,4);
        x /= divisor;
        // octave vec4f: [10,20,30,40] ./ [1,2,3,4]
        should_be_equal_vec4f(x, simd4f_create(10.000000000000000f, 10.000000000000000f, 10.000000000000000f, 10.000000000000000f), epsilon );
    }



}


// Mixed vec4f / float arithmetic.
describe(vec4f, "arithmetic with scalar") {

    it("should have operator+ for component-wise addition") {
        vec4f vec(1,2,3,4);
        float scalar=10;
        vec4f x = vec + scalar;
        // octave vec4f: [1,2,3,4] + 10
        should_be_equal_vec4f(x, simd4f_create(11.000000000000000f, 12.000000000000000f, 13.000000000000000f, 14.000000000000000f), epsilon );
    }

    it("should have operator- for component-wise subtraction") {
        vec4f vec(10,20,30,40);
        float scalar=10;
        vec4f x = vec - scalar;
        // octave vec4f: [10,20,30,40] - 10
        should_be_equal_vec4f(x, simd4f_create(0.000000000000000f, 10.000000000000000f, 20.000000000000000f, 30.000000000000000f), epsilon );
    }

    it("should have operator* for component-wise multiplication") {
        vec4f vec(1,2,3,4);
        float scalar=10;
        vec4f x = vec * scalar;
        // octave vec4f: [1,2,3,4] .* 10
        should_be_equal_vec4f(x, simd4f_create(10.000000000000000f, 20.000000000000000f, 30.000000000000000f, 40.000000000000000f), epsilon );
    }

    it("should have operator/ for component-wise division") {
        vec4f vec(10,20,30,40);
        float scalar=10;
        vec4f x = vec / scalar;
        // octave vec4f: [10,20,30,40] ./ 10
        should_be_equal_vec4f(x, simd4f_create(1.000000000000000f, 2.000000000000000f, 3.000000000000000f, 4.000000000000000f), epsilon );
    }



    it("should have operator+ for component-wise addition (float as lhs)") {
        vec4f
b(1,2,3,4); 175 | float a=10; 176 | vec4f x = a + b; 177 | // octave vec4f: 10 + [1,2,3,4] 178 | should_be_equal_vec4f(x, simd4f_create(11.000000000000000f, 12.000000000000000f, 13.000000000000000f, 14.000000000000000f), epsilon ); 179 | 180 | } 181 | 182 | it("should have operator- for component-wise subtraction (float as lhs)") { 183 | float b=50; 184 | vec4f a(10,20,30,40); 185 | vec4f x = b - a; 186 | // octave vec4f: 50 - [10,20,30,40] 187 | should_be_equal_vec4f(x, simd4f_create(40.000000000000000f, 30.000000000000000f, 20.000000000000000f, 10.000000000000000f), epsilon ); 188 | 189 | } 190 | 191 | it("should have operator* for component-wise multiplication (float as lhs)") { 192 | vec4f b(1,2,3,4); 193 | float a=10; 194 | vec4f x = a * b; 195 | // octave vec4f: 10 .* [1,2,3,4] 196 | should_be_equal_vec4f(x, simd4f_create(10.000000000000000f, 20.000000000000000f, 30.000000000000000f, 40.000000000000000f), epsilon ); 197 | 198 | } 199 | 200 | it("should have operator* for component-wise multiplication (float as lhs)") { 201 | vec4f b(10,20,30,40); 202 | float a=40; 203 | vec4f x = a / b; 204 | // octave vec4f: 40 ./ [10,20,30,40] 205 | should_be_equal_vec4f(x, simd4f_create(4.000000000000000f, 2.000000000000000f, 1.333333333333333f, 1.000000000000000f), epsilon ); 206 | 207 | } 208 | 209 | 210 | } 211 | 212 | 213 | 214 | describe(vec4f, "vector math") { 215 | 216 | it("should have unary minus operator") { 217 | vec4f a(1,2,3,4); 218 | vec4f x = -a; 219 | // octave vec4f: -[1,2,3,4] 220 | should_be_equal_vec4f(x, simd4f_create(-1.000000000000000f, -2.000000000000000f, -3.000000000000000f, -4.000000000000000f), epsilon ); 221 | } 222 | 223 | it("should have dot function") { 224 | vec4f a(1,2,3,4); 225 | vec4f b(6,7,8,9); 226 | float x = vectorial::dot(a,b); 227 | 228 | // octave vec4f: dot([1,2,3,4],[6,7,8,9]) 229 | should_be_close_to(x, 80.000000000000000f, epsilon ); 230 | } 231 | 232 | it("should have length_squared function") { 233 | vec4f a(1,2,3,4); 234 | 
float x = vectorial::length_squared(a); 235 | 236 | // octave vec4f: dot([1,2,3,4],[1,2,3,4]) 237 | should_be_close_to(x, 30.000000000000000f, epsilon ); 238 | } 239 | 240 | it("should have length function") { 241 | vec4f a(1,2,3,4); 242 | float x = vectorial::length(a); 243 | 244 | // octave vec4f: norm([1,2,3,4]) 245 | should_be_close_to(x, 5.477225575051661f, epsilon ); 246 | } 247 | 248 | 249 | it("should have normalize function") { 250 | vec4f a(1,2,3,4); 251 | vec4f x = vectorial::normalize(a); 252 | // octave vec4f: [1,2,3,4] / norm([1,2,3,4]) 253 | should_be_equal_vec4f(x, simd4f_create(0.182574185835055f, 0.365148371670111f, 0.547722557505166f, 0.730296743340221f), epsilon ); 254 | } 255 | 256 | } 257 | 258 | 259 | -------------------------------------------------------------------------------- /spec/spec_helper.h: -------------------------------------------------------------------------------- 1 | #ifndef VECTORIAL_SPEC_HELPER_H 2 | #define VECTORIAL_SPEC_HELPER_H 3 | 4 | #define VECTORIAL_OSTREAM 5 | 6 | #include "spec.h" 7 | 8 | #include "vectorial/vectorial.h" 9 | 10 | #ifdef VECTORIAL_HAVE_SIMD2F 11 | #include "vectorial/simd2f.h" 12 | #endif 13 | 14 | #include 15 | #include 16 | #include 17 | 18 | #define should_be_close_to(a,b,tolerance) should_be_close_to_(this, a,b,tolerance,__FILE__,__LINE__) 19 | #define should_be_equal_simd4f( a, b, tolerance) should_be_equal_simd4f_(this, a,b,tolerance,__FILE__,__LINE__) 20 | #define should_be_equal_simd2f( a, b, tolerance) should_be_equal_simd2f_(this, a,b,tolerance,__FILE__,__LINE__) 21 | #define should_be_equal_vec4f( a, b, tolerance) should_be_equal_vec4f_(this, a,b,tolerance,__FILE__,__LINE__) 22 | #define should_be_equal_vec3f( a, b, tolerance) should_be_equal_vec3f_(this, a,b,tolerance,__FILE__,__LINE__) 23 | #define should_be_equal_vec2f( a, b, tolerance) should_be_equal_vec2f_(this, a,b,tolerance,__FILE__,__LINE__) 24 | 25 | #define should_be_equal_simd4x4f( a, b, tolerance) 
should_be_equal_simd4x4f_(this, a,b,tolerance,__FILE__,__LINE__) 26 | 27 | #define should_be_equal_mat4f( a, b, tolerance) should_be_equal_mat4f_(this, a,b,tolerance,__FILE__,__LINE__) 28 | 29 | // Based on: 30 | // http://www.cygnus-software.com/papers/comparingfloats/comparingfloats.htm 31 | // 32 | static inline bool compare_floats(float A, float B, int maxUlps) 33 | { 34 | // Make sure maxUlps is non-negative and small enough that the 35 | // default NAN won't compare as equal to anything. 36 | // assert(maxUlps > 0 && maxUlps < 4 * 1024 * 1024); 37 | union { 38 | float f; 39 | int i; 40 | } f2iA, f2iB; 41 | f2iA.f = A; 42 | f2iB.f = B; 43 | 44 | int aInt = f2iA.i; 45 | // int aInt = *(int*)&A; 46 | // Make aInt lexicographically ordered as a twos-complement int 47 | if (aInt < 0) 48 | aInt = 0x80000000 - aInt; 49 | // Make bInt lexicographically ordered as a twos-complement int 50 | int bInt = f2iB.i; 51 | // int bInt = *(int*)&B; 52 | if (bInt < 0) 53 | bInt = 0x80000000 - bInt; 54 | int intDiff = abs(aInt - bInt); 55 | if (intDiff <= maxUlps) 56 | return true; 57 | return false; 58 | } 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | static inline void should_be_close_to_(specific::SpecBase *spec, float a, float b, int tolerance, const char *file, int line) { 69 | 70 | bool equal=true; 71 | if( !compare_floats(a,b,tolerance) ) equal = false; 72 | 73 | std::stringstream ss; 74 | ss << a << " == " << b << " (with tolerance of " << tolerance << ")"; 75 | spec->should_test(equal, ss.str().c_str(), file, line); 76 | 77 | 78 | } 79 | 80 | #ifdef VECTORIAL_HAVE_SIMD2F 81 | static inline void should_be_equal_simd2f_(specific::SpecBase *spec, const simd2f& a, const simd2f& b, int tolerance, const char *file, int line) { 82 | 83 | bool equal=true; 84 | if( !compare_floats( simd2f_get_x(a), simd2f_get_x(b), tolerance) ) equal = false; 85 | if( !compare_floats( simd2f_get_y(a), simd2f_get_y(b), tolerance) ) equal = false; 86 | 87 | std::stringstream ss; 88 | ss << a << 
" == " << b << " (with tolerance of " << tolerance << ")"; 89 | spec->should_test(equal, ss.str().c_str(), file, line); 90 | 91 | } 92 | #endif 93 | 94 | static inline void should_be_equal_simd4f_(specific::SpecBase *spec, const simd4f& a, const simd4f& b, int tolerance, const char *file, int line) { 95 | 96 | bool equal=true; 97 | if( !compare_floats( simd4f_get_x(a), simd4f_get_x(b), tolerance) ) equal = false; 98 | if( !compare_floats( simd4f_get_y(a), simd4f_get_y(b), tolerance) ) equal = false; 99 | if( !compare_floats( simd4f_get_z(a), simd4f_get_z(b), tolerance) ) equal = false; 100 | if( !compare_floats( simd4f_get_w(a), simd4f_get_w(b), tolerance) ) equal = false; 101 | 102 | std::stringstream ss; 103 | ss << a << " == " << b << " (with tolerance of " << tolerance << ")"; 104 | spec->should_test(equal, ss.str().c_str(), file, line); 105 | 106 | 107 | } 108 | 109 | static inline void should_be_equal_vec4f_(specific::SpecBase *spec, const vectorial::vec4f& a, const vectorial::vec4f& b, int tolerance, const char *file, int line) { 110 | 111 | bool equal=true; 112 | if( !compare_floats( a.x(), b.x(), tolerance) ) equal = false; 113 | if( !compare_floats( a.y(), b.y(), tolerance) ) equal = false; 114 | if( !compare_floats( a.z(), b.z(), tolerance) ) equal = false; 115 | if( !compare_floats( a.w(), b.w(), tolerance) ) equal = false; 116 | 117 | std::stringstream ss; 118 | ss << a << " == " << b << " (with tolerance of " << tolerance << ")"; 119 | spec->should_test(equal, ss.str().c_str(), file, line); 120 | 121 | 122 | } 123 | 124 | static inline void should_be_equal_vec3f_(specific::SpecBase *spec, const vectorial::vec3f& a, const vectorial::vec3f& b, int tolerance, const char *file, int line) { 125 | 126 | bool equal=true; 127 | if( !compare_floats( a.x(), b.x(), tolerance) ) equal = false; 128 | if( !compare_floats( a.y(), b.y(), tolerance) ) equal = false; 129 | if( !compare_floats( a.z(), b.z(), tolerance) ) equal = false; 130 | 131 | std::stringstream ss; 
132 | ss << a << " == " << b << " (with tolerance of " << tolerance << ")"; 133 | spec->should_test(equal, ss.str().c_str(), file, line); 134 | 135 | 136 | } 137 | 138 | static inline void should_be_equal_vec2f_(specific::SpecBase *spec, const vectorial::vec2f& a, const vectorial::vec2f& b, int tolerance, const char *file, int line) { 139 | 140 | bool equal=true; 141 | if( !compare_floats( a.x(), b.x(), tolerance) ) equal = false; 142 | if( !compare_floats( a.y(), b.y(), tolerance) ) equal = false; 143 | 144 | std::stringstream ss; 145 | ss << a << " == " << b << " (with tolerance of " << tolerance << ")"; 146 | spec->should_test(equal, ss.str().c_str(), file, line); 147 | 148 | 149 | } 150 | 151 | 152 | 153 | static inline void should_be_equal_simd4x4f_(specific::SpecBase *spec, const simd4x4f& a, const simd4x4f& b, int tolerance, const char *file, int line) { 154 | 155 | bool equal=true; 156 | if( !compare_floats( simd4f_get_x(a.x), simd4f_get_x(b.x), tolerance) ) equal = false; 157 | if( !compare_floats( simd4f_get_y(a.x), simd4f_get_y(b.x), tolerance) ) equal = false; 158 | if( !compare_floats( simd4f_get_z(a.x), simd4f_get_z(b.x), tolerance) ) equal = false; 159 | if( !compare_floats( simd4f_get_w(a.x), simd4f_get_w(b.x), tolerance) ) equal = false; 160 | 161 | if( !compare_floats( simd4f_get_x(a.y), simd4f_get_x(b.y), tolerance) ) equal = false; 162 | if( !compare_floats( simd4f_get_y(a.y), simd4f_get_y(b.y), tolerance) ) equal = false; 163 | if( !compare_floats( simd4f_get_z(a.y), simd4f_get_z(b.y), tolerance) ) equal = false; 164 | if( !compare_floats( simd4f_get_w(a.y), simd4f_get_w(b.y), tolerance) ) equal = false; 165 | 166 | if( !compare_floats( simd4f_get_x(a.z), simd4f_get_x(b.z), tolerance) ) equal = false; 167 | if( !compare_floats( simd4f_get_y(a.z), simd4f_get_y(b.z), tolerance) ) equal = false; 168 | if( !compare_floats( simd4f_get_z(a.z), simd4f_get_z(b.z), tolerance) ) equal = false; 169 | if( !compare_floats( simd4f_get_w(a.z), 
simd4f_get_w(b.z), tolerance) ) equal = false; 170 | 171 | if( !compare_floats( simd4f_get_x(a.w), simd4f_get_x(b.w), tolerance) ) equal = false; 172 | if( !compare_floats( simd4f_get_y(a.w), simd4f_get_y(b.w), tolerance) ) equal = false; 173 | if( !compare_floats( simd4f_get_z(a.w), simd4f_get_z(b.w), tolerance) ) equal = false; 174 | if( !compare_floats( simd4f_get_w(a.w), simd4f_get_w(b.w), tolerance) ) equal = false; 175 | 176 | std::stringstream ss; 177 | ss << a << " == " << b << " (with tolerance of " << tolerance << ")"; 178 | spec->should_test(equal, ss.str().c_str(), file, line); 179 | 180 | 181 | } 182 | 183 | static inline void should_be_equal_mat4f_(specific::SpecBase *spec, const vectorial::mat4f& a, const vectorial::mat4f& b, int tolerance, const char *file, int line) { 184 | 185 | bool equal=true; 186 | if( !compare_floats( simd4f_get_x(a.value.x), simd4f_get_x(b.value.x), tolerance) ) equal = false; 187 | if( !compare_floats( simd4f_get_y(a.value.x), simd4f_get_y(b.value.x), tolerance) ) equal = false; 188 | if( !compare_floats( simd4f_get_z(a.value.x), simd4f_get_z(b.value.x), tolerance) ) equal = false; 189 | if( !compare_floats( simd4f_get_w(a.value.x), simd4f_get_w(b.value.x), tolerance) ) equal = false; 190 | 191 | if( !compare_floats( simd4f_get_x(a.value.y), simd4f_get_x(b.value.y), tolerance) ) equal = false; 192 | if( !compare_floats( simd4f_get_y(a.value.y), simd4f_get_y(b.value.y), tolerance) ) equal = false; 193 | if( !compare_floats( simd4f_get_z(a.value.y), simd4f_get_z(b.value.y), tolerance) ) equal = false; 194 | if( !compare_floats( simd4f_get_w(a.value.y), simd4f_get_w(b.value.y), tolerance) ) equal = false; 195 | 196 | if( !compare_floats( simd4f_get_x(a.value.z), simd4f_get_x(b.value.z), tolerance) ) equal = false; 197 | if( !compare_floats( simd4f_get_y(a.value.z), simd4f_get_y(b.value.z), tolerance) ) equal = false; 198 | if( !compare_floats( simd4f_get_z(a.value.z), simd4f_get_z(b.value.z), tolerance) ) equal = false; 199 | 
equal &= compare_floats( simd4f_get_w(a.value.z), simd4f_get_w(b.value.z), tolerance);

    // Member w.
    equal &= compare_floats( simd4f_get_x(a.value.w), simd4f_get_x(b.value.w), tolerance);
    equal &= compare_floats( simd4f_get_y(a.value.w), simd4f_get_y(b.value.w), tolerance);
    equal &= compare_floats( simd4f_get_z(a.value.w), simd4f_get_z(b.value.w), tolerance);
    equal &= compare_floats( simd4f_get_w(a.value.w), simd4f_get_w(b.value.w), tolerance);

    std::stringstream ss;
    ss << a << " == " << b << " (with tolerance of " << tolerance << " ulps)";
    spec->should_test(equal, ss.str().c_str(), file, line);
}



#endif

--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------

# ---- Compilers -------------------------------------------------------------
# CXX may be overridden from the environment or the command line.
CXX ?= g++
CLANG_CC = clang
CLANG_CXX = clang++

# iPhone cross toolchain; selected further down when ARM=1 on a Darwin host.
IPHONE_PLATFORM_PATH = /Developer/Platforms/iPhoneOS.platform/Developer
IPHONE_ISYSROOT_PATH = $(IPHONE_PLATFORM_PATH)/SDKs/iPhoneOS4.2.sdk/
IPHONE_CC = $(IPHONE_PLATFORM_PATH)/usr/bin/g++ -isysroot $(IPHONE_ISYSROOT_PATH) -arch armv7
# -mfloat-abi=softfp -mfpu=neon

# ---- Compiler flags (alternatives kept commented out) ----------------------
#CXXFLAGS += -Iinclude -O0
#CXXFLAGS += -g -Iinclude -Wall -Wextra -pedantic -Wno-unused -O3 -fstrict-aliasing -Wstrict-aliasing=2 -ffast-math
CXXFLAGS += -Iinclude -Wall -Wextra -pedantic -Wno-unused -O3 -fstrict-aliasing -Wstrict-aliasing=2 -ffast-math -D__extern_always_inline=inline

# ---- Sources and objects ---------------------------------------------------
SPEC_SRC = $(wildcard spec/*.cpp)
SPEC_OBJ = $(SPEC_SRC:.cpp=.o)

BENCH_SRC = $(wildcard bench/*.cpp)
BENCH_OBJ = $(BENCH_SRC:.cpp=.o)
# Recursively expanded, so $(SUFFIX) is read when BENCH_ASM is used, not here.
BENCH_ASM = $(patsubst %.cpp,asm$(SUFFIX)/%.S,$(BENCH_SRC))

# Appended to build directory and binary names; set by the FORCE_* switches.
SUFFIX =

# Cleared to 0 when the ARM or CLANG toolchain is selected.
DEFAULT_CC = 1

# ---- Implementation switches (FORCE_<impl>=1) ------------------------------
ifeq ($(FORCE_SCALAR),1)
CXXFLAGS += -DVECTORIAL_FORCED -DVECTORIAL_SCALAR
SUFFIX = -scalar
endif

# The condition of this directive continues on the next line.
ifeq \
($(FORCE_SSE),1)
CXXFLAGS += -DVECTORIAL_FORCED -DVECTORIAL_SSE -msse -msse2 -mfpmath=sse
SUFFIX = -sse
endif

ifeq ($(FORCE_GNU),1)
CXXFLAGS += -DVECTORIAL_FORCED -DVECTORIAL_GNU
#-msse -msse2 -mfpmath=sse
SUFFIX = -gnu
endif

# Forcing NEON also turns on the ARM toolchain section below.
ifeq ($(FORCE_NEON),1)
CXXFLAGS += -DVECTORIAL_FORCED -DVECTORIAL_NEON
SUFFIX = -neon
ARM = 1
endif


# ---- Toolchain selection ---------------------------------------------------
ifeq ($(ARM),1)
ifeq ($(shell uname -s),Darwin)
CC = $(IPHONE_CC)
CXX = $(IPHONE_CC)
endif
# CXXFLAGS+= -mcpu=cortex-a8
CXXFLAGS += -mno-thumb -mfloat-abi=softfp -mfpu=neon
DEFAULT_CC = 0
endif

ifeq ($(CLANG),1)
CC = $(CLANG_CC)
CXX = $(CLANG_CXX)
DEFAULT_CC = 0
endif

ifeq ($(DEFAULT_CC),1)
# CXXFLAGS += -msse -msse2 -mfpmath=sse
endif

# ASM=1 makes the compilers emit assembly instead of object code.
ifeq ($(ASM),1)
CC += -S
CXX += -S
endif

# ---- Build layout ----------------------------------------------------------
# Objects go under build$(SUFFIX)/, mirroring the source tree.
BUILDDIR = build$(SUFFIX)
SPEC_OBJ := $(addprefix $(BUILDDIR)/,$(SPEC_OBJ))
BENCH_OBJ := $(addprefix $(BUILDDIR)/,$(BENCH_OBJ))
SILENT = @
MKDIR = mkdir -p
PATH_SEPARATOR = /

# Compile one source file, creating the object's directory first.
$(BUILDDIR)/%.o: %.cpp
	@echo CXX $<
	$(SILENT) $(MKDIR) $(subst /,$(PATH_SEPARATOR),$(dir $@))
	$(SILENT) $(COMPILE.cc) -o $@ $<



# Default goal: build the spec suite for the current configuration and run it.
.PHONY: all
all: specsuite$(SUFFIX)
	./specsuite$(SUFFIX)


# Build the scalar, gnu and sse spec suites through sub-makes, then run them.
.PHONY: full
full:
	@clear
	@echo FULL COMPILE at `date +%H:%M:%S`
#	FORCE_SCALAR=1 $(MAKE) clean
	@FORCE_SCALAR=1 $(MAKE) specsuite-scalar
#	FORCE_GNU=1 $(MAKE) clean
	@FORCE_GNU=1 $(MAKE) specsuite-gnu
#	FORCE_SSE=1 $(MAKE) clean
	@FORCE_SSE=1 $(MAKE) specsuite-sse
#	FORCE_NEON=1 $(MAKE) clean
#	FORCE_NEON=1 $(MAKE) specsuite-neon
	@./specsuite-scalar
	@./specsuite-sse
	@./specsuite-gnu

specsuite$(SUFFIX): $(SPEC_OBJ)
	@echo LINK $@
	@$(CXX) $(LDFLAGS) $^ -o $@

# Regenerate the dependency lists at the end of this file with makedepend.
# The command continues on the next line.
.PHONY: depend
depend:
	@echo DEP
	@makedepend -Y -- $(CXXFLAGS) -- \
$(SPEC_SRC) $(BENCH_SRC) -p$(BUILDDIR)/ > /dev/null 2>&1
	@$(RM) Makefile.bak

# Canned recipe: write the assembly listing for source file $(1) to
# asm$(SUFFIX)/$(1).S, creating the directory first.
define asm-command
@mkdir -p $(dir asm$(SUFFIX)/$(1))
$(CXX) $(CXXFLAGS) -S $(1) -o asm$(SUFFIX)/$(1).S

endef

# Produces listings, not a file called "bench-asm", so it is declared phony
# like the other non-file targets in this Makefile.
.PHONY: bench-asm
bench-asm: $(BENCH_SRC)
	$(foreach p,$(BENCH_SRC),$(call asm-command,$(p)))

# Links with $(LDFLAGS), the same as the specsuite link step.
benchmark$(SUFFIX): $(BENCH_OBJ) bench-asm
	$(CXX) $(LDFLAGS) $(BENCH_OBJ) -o $@

# Build the scalar, gnu and sse benchmarks through sub-makes, then run them.
.PHONY: bench-full
bench-full:
	FORCE_SCALAR=1 $(MAKE) benchmark-scalar
	FORCE_GNU=1 $(MAKE) benchmark-gnu
	FORCE_SSE=1 $(MAKE) benchmark-sse
#	FORCE_NEON=1 $(MAKE) clean
#	FORCE_NEON=1 $(MAKE) benchmark-neon
	./benchmark-scalar
	./benchmark-sse
	./benchmark-gnu

# Remove the outputs of the current configuration only.
.PHONY: clean
clean:
	rm -f $(SPEC_OBJ) $(BENCH_OBJ) benchmark$(SUFFIX) specsuite$(SUFFIX)
	rm -rf asm$(SUFFIX)

# Additionally remove spec suites and build directories of every configuration.
.PHONY: realclean
realclean: clean
	rm -f specsuite*
	rm -rf build*


# Refresh the expected values that follow "// octave" markers in the specs.
.PHONY: update_spec
update_spec:
	./tools/update_spec.rb spec/spec_*.cpp

# "make export" needs a destination: make export to=/some/dir
ifeq ($(MAKECMDGOALS),export)
ifeq ($(origin to),undefined)
$(error to not set, like make export to=/foo/bar)
endif
endif

.PHONY: export
export:
	$(SILENT) git archive --format tar master | tar x -C $(to)


# Header-to-header dependencies. These rules have no recipe; they only record
# which headers pull in which others.
include/vectorial/vec2f.h include/vectorial/vec3f.h include/vectorial/vec4f.h: include/vectorial/simd4f.h
include/vectorial/simd4f.h: include/vectorial/simd4f_scalar.h
include/vectorial/simd4f.h: include/vectorial/simd4f_neon.h
include/vectorial/simd4f.h: include/vectorial/simd4f_gnu.h
include/vectorial/simd4f.h: include/vectorial/simd4f_sse.h
include/vectorial/simd4f.h: include/vectorial/simd4f_scalar.h
include/vectorial/simd4f.h: include/vectorial/config.h
include/vectorial/simd4x4f.h: include/vectorial/simd4f.h
include/vectorial/simd4x4f.h: include/vectorial/simd4x4f_scalar.h
include/vectorial/simd4x4f.h: \
  include/vectorial/simd4x4f_neon.h include/vectorial/simd4x4f_gnu.h \
  include/vectorial/simd4x4f_sse.h include/vectorial/config.h

# Recipe-less rules: they only record what each spec source or header includes.
spec/spec_helper.h: \
  include/vectorial/simd4x4f.h include/vectorial/simd4f.h \
  include/vectorial/vec4f.h include/vectorial/vec3f.h include/vectorial/vec2f.h

spec/spec.cpp spec/spec_main.cpp: spec/spec.h

spec/spec_simd4f.cpp spec/spec_simd4x4f.cpp spec/spec_vec2f.cpp \
spec/spec_vec3f.cpp spec/spec_vec4f.cpp: spec/spec_helper.h

# Hand-written object dependencies, one prerequisite group per line.
$(BUILDDIR)/spec/spec_simd4f.o: \
  include/vectorial/simd4x4f.h include/vectorial/simd4f.h \
  include/vectorial/simd4f_scalar.h include/vectorial/simd4f_neon.h \
  include/vectorial/simd4f_gnu.h include/vectorial/simd4f_sse.h \
  include/vectorial/config.h

$(BUILDDIR)/spec/spec_simd4x4f.o: \
  include/vectorial/simd4x4f.h include/vectorial/simd4f.h \
  include/vectorial/simd4f_scalar.h include/vectorial/simd4f_neon.h \
  include/vectorial/simd4f_gnu.h include/vectorial/simd4f_sse.h \
  include/vectorial/simd4x4f_scalar.h include/vectorial/simd4x4f_neon.h \
  include/vectorial/simd4x4f_gnu.h include/vectorial/simd4x4f_sse.h \
  include/vectorial/config.h

# The prerequisite list of this rule continues on the next line.
$(BUILDDIR)/spec/spec_vec2f.o $(BUILDDIR)/spec/spec_vec3f.o $(BUILDDIR)/spec/spec_vec4f.o: \
  include/vectorial/simd4x4f.h include/vectorial/simd4f.h \
  include/vectorial/vec4f.h include/vectorial/vec3f.h include/vectorial/vec2f.h \
  include/vectorial/simd4f_scalar.h include/vectorial/simd4f_neon.h \
  include/vectorial/simd4f_gnu.h include/vectorial/simd4f_sse.h \
  include/vectorial/simd4x4f_scalar.h include/vectorial/simd4x4f_neon.h \
  include/vectorial/simd4x4f_gnu.h include/vectorial/simd4x4f_sse.h \
include/vectorial/config.h





# NOTE(review): the lists below the marker look like makedepend output (see the
# "depend" target, which passes -p$(BUILDDIR)/) - presumably they are replaced
# whenever "make depend" runs; confirm before editing them by hand.
# DO NOT DELETE

$(BUILDDIR)/spec/spec.o $(BUILDDIR)/spec/spec_main.o: spec/spec.h

$(BUILDDIR)/spec/spec_mat4f.o $(BUILDDIR)/spec/spec_simd4f.o: \
  spec/spec_helper.h spec/spec.h \
  include/vectorial/simd4f.h include/vectorial/config.h \
  include/vectorial/simd4f_gnu.h include/vectorial/simd4f_common.h \
  include/vectorial/vec4f.h include/vectorial/vec3f.h include/vectorial/vec2f.h \
  include/vectorial/simd4x4f.h include/vectorial/simd4x4f_gnu.h \
  include/vectorial/mat4f.h

$(BUILDDIR)/spec/spec_simd4x4f.o: \
  spec/spec_helper.h spec/spec.h \
  include/vectorial/simd4f.h include/vectorial/config.h \
  include/vectorial/simd4f_gnu.h
$(BUILDDIR)/spec/spec_simd4x4f.o: \
  include/vectorial/simd4f_common.h \
  include/vectorial/vec4f.h include/vectorial/vec3f.h include/vectorial/vec2f.h \
  include/vectorial/simd4x4f.h include/vectorial/simd4f.h \
  include/vectorial/simd4x4f_gnu.h include/vectorial/mat4f.h

$(BUILDDIR)/spec/spec_vec2f.o: \
  spec/spec_helper.h spec/spec.h \
  include/vectorial/simd4f.h include/vectorial/config.h \
  include/vectorial/simd4f_gnu.h include/vectorial/simd4f_common.h \
  include/vectorial/vec4f.h include/vectorial/vec3f.h include/vectorial/vec2f.h \
  include/vectorial/simd4x4f.h include/vectorial/simd4x4f_gnu.h \
  include/vectorial/mat4f.h

$(BUILDDIR)/spec/spec_vec3f.o: \
  spec/spec_helper.h spec/spec.h \
  include/vectorial/simd4f.h include/vectorial/config.h \
  include/vectorial/simd4f_gnu.h include/vectorial/simd4f_common.h \
  include/vectorial/vec4f.h include/vectorial/vec3f.h include/vectorial/vec2f.h \
  include/vectorial/simd4x4f.h include/vectorial/simd4x4f_gnu.h
$(BUILDDIR)/spec/spec_vec3f.o: include/vectorial/mat4f.h 275 | $(BUILDDIR)/spec/spec_vec4f.o: spec/spec_helper.h spec/spec.h 276 | $(BUILDDIR)/spec/spec_vec4f.o: include/vectorial/simd4f.h 277 | $(BUILDDIR)/spec/spec_vec4f.o: include/vectorial/config.h 278 | $(BUILDDIR)/spec/spec_vec4f.o: include/vectorial/simd4f_gnu.h 279 | $(BUILDDIR)/spec/spec_vec4f.o: include/vectorial/simd4f_common.h 280 | $(BUILDDIR)/spec/spec_vec4f.o: include/vectorial/vec4f.h include/vectorial/vec3f.h 281 | $(BUILDDIR)/spec/spec_vec4f.o: include/vectorial/vec2f.h 282 | $(BUILDDIR)/spec/spec_vec4f.o: include/vectorial/simd4x4f.h 283 | $(BUILDDIR)/spec/spec_vec4f.o: include/vectorial/simd4f.h 284 | $(BUILDDIR)/spec/spec_vec4f.o: include/vectorial/simd4x4f_gnu.h 285 | $(BUILDDIR)/spec/spec_vec4f.o: include/vectorial/mat4f.h 286 | $(BUILDDIR)/bench/add_bench.o: bench/bench.h include/vectorial/vec4f.h 287 | $(BUILDDIR)/bench/bench.o: bench/bench.h include/vectorial/config.h 288 | $(BUILDDIR)/bench/dot_bench.o: bench/bench.h include/vectorial/vec4f.h 289 | $(BUILDDIR)/bench/matrix_bench.o: bench/bench.h include/vectorial/simd4x4f.h 290 | $(BUILDDIR)/bench/matrix_bench.o: include/vectorial/simd4f.h 291 | $(BUILDDIR)/bench/matrix_bench.o: include/vectorial/simd4x4f_gnu.h 292 | $(BUILDDIR)/bench/quad_bench.o: bench/bench.h include/vectorial/simd4x4f.h 293 | $(BUILDDIR)/bench/quad_bench.o: include/vectorial/simd4f.h 294 | $(BUILDDIR)/bench/quad_bench.o: include/vectorial/simd4x4f_gnu.h 295 | -------------------------------------------------------------------------------- /include/vectorial/simd4x4f.h: -------------------------------------------------------------------------------- 1 | /* 2 | Vectorial 3 | Copyright (c) 2010 Mikko Lehtonen 4 | Copyright (c) 2014 Google, Inc. 
5 | Licensed under the terms of the two-clause BSD License (see LICENSE) 6 | */ 7 | #ifndef VECTORIAL_SIMD4X4F_H 8 | #define VECTORIAL_SIMD4X4F_H 9 | 10 | 11 | #include "simd4f.h" 12 | 13 | #include 14 | 15 | /* 16 | Note, x,y,z,w are conceptually columns with matrix math. 17 | */ 18 | 19 | typedef struct { 20 | simd4f x,y,z,w; 21 | } simd4x4f; 22 | 23 | 24 | 25 | vectorial_inline simd4x4f simd4x4f_create(simd4f x, simd4f y, simd4f z, SIMD_PARAM(simd4f, w)) { 26 | simd4x4f s = { x, y, z, w }; 27 | return s; 28 | } 29 | 30 | 31 | vectorial_inline void simd4x4f_identity(simd4x4f* m) { 32 | *m = simd4x4f_create( simd4f_create(1.0f, 0.0f, 0.0f, 0.0f), 33 | simd4f_create(0.0f, 1.0f, 0.0f, 0.0f), 34 | simd4f_create(0.0f, 0.0f, 1.0f, 0.0f), 35 | simd4f_create(0.0f, 0.0f, 0.0f, 1.0f)); 36 | } 37 | 38 | 39 | 40 | vectorial_inline void simd4x4f_uload(simd4x4f* m, const float *f) { 41 | 42 | m->x = simd4f_uload4(f + 0); 43 | m->y = simd4f_uload4(f + 4); 44 | m->z = simd4f_uload4(f + 8); 45 | m->w = simd4f_uload4(f + 12); 46 | 47 | } 48 | 49 | 50 | 51 | 52 | 53 | #ifdef VECTORIAL_SCALAR 54 | #include "simd4x4f_scalar.h" 55 | #elif defined(VECTORIAL_SSE) 56 | #include "simd4x4f_sse.h" 57 | #elif defined(VECTORIAL_GNU) 58 | #include "simd4x4f_gnu.h" 59 | #elif defined(VECTORIAL_NEON) 60 | #include "simd4x4f_neon.h" 61 | #else 62 | #error No implementation defined 63 | #endif 64 | 65 | vectorial_inline void simd4x4f_sum(const simd4x4f* a, simd4f* out) { 66 | simd4f t; 67 | t = simd4f_add(a->x, a->y); 68 | t = simd4f_add(t, a->z); 69 | t = simd4f_add(t, a->w); 70 | *out = t; 71 | } 72 | 73 | vectorial_inline void simd4x4f_matrix_vector_mul(const simd4x4f* a, const simd4f * b, simd4f* out) { 74 | 75 | const simd4f x = a->x; 76 | const simd4f y = a->y; 77 | const simd4f z = a->z; 78 | const simd4f w = a->w; 79 | const simd4f v = *b; 80 | const simd4f vx = simd4f_splat_x(v); 81 | const simd4f vy = simd4f_splat_y(v); 82 | const simd4f vz = simd4f_splat_z(v); 83 | const simd4f vw = 
simd4f_splat_w(v); 84 | 85 | #if 0 86 | // In a hasty benchmark, this actually performed worse on neon 87 | // TODO: revisit and conditionalize accordingly 88 | 89 | *out = simd4f_madd(x, vx, 90 | simd4f_madd(y, vy, 91 | simd4f_madd(z, vz, 92 | simd4f_mul(w, vw) ) ) ); 93 | 94 | #else 95 | 96 | *out = simd4f_add(simd4f_mul(x, vx), 97 | simd4f_add(simd4f_mul(y, vy), 98 | simd4f_add(simd4f_mul(z, vz), 99 | simd4f_mul(w, vw) ) ) ); 100 | 101 | #endif 102 | } 103 | 104 | vectorial_inline void simd4x4f_matrix_vector3_mul(const simd4x4f* a, const simd4f * b, simd4f* out) { 105 | 106 | #if 0 107 | *out = simd4f_madd( a->x, simd4f_splat_x(*b), 108 | simd4f_madd( a->y, simd4f_splat_y(*b), 109 | simd4f_mul(a->z, simd4f_splat_z(*b)) ) ); 110 | #else 111 | *out = simd4f_add( simd4f_mul(a->x, simd4f_splat_x(*b)), 112 | simd4f_add( simd4f_mul(a->y, simd4f_splat_y(*b)), 113 | simd4f_mul(a->z, simd4f_splat_z(*b)) ) ); 114 | #endif 115 | 116 | } 117 | 118 | vectorial_inline void simd4x4f_matrix_point3_mul(const simd4x4f* a, const simd4f * b, simd4f* out) { 119 | 120 | #if 0 121 | *out = simd4f_madd( a->x, simd4f_splat_x(*b), 122 | simd4f_madd( a->y, simd4f_splat_y(*b), 123 | simd4f_madd( a->z, simd4f_splat_z(*b), 124 | a->w ) ) ); 125 | #else 126 | *out = simd4f_add( simd4f_mul(a->x, simd4f_splat_x(*b)), 127 | simd4f_add( simd4f_mul(a->y, simd4f_splat_y(*b)), 128 | simd4f_add( simd4f_mul(a->z, simd4f_splat_z(*b)), 129 | a->w ) ) ); 130 | #endif 131 | 132 | } 133 | 134 | vectorial_inline void simd4x4f_inv_ortho_matrix_point3_mul(const simd4x4f* a, const simd4f * b, simd4f* out) { 135 | simd4f translation = simd4f_sub(*b, a->w); 136 | 137 | simd4x4f transpose = *a; 138 | 139 | transpose.w = simd4f_create(0,0,0,0); 140 | simd4x4f_transpose_inplace(&transpose); 141 | 142 | simd4x4f_matrix_point3_mul(&transpose, &translation, out); 143 | } 144 | 145 | vectorial_inline void simd4x4f_inv_ortho_matrix_vector3_mul(const simd4x4f* a, const simd4f * b, simd4f* out) { 146 | simd4f translation 
= *b; 147 | 148 | simd4x4f transpose = *a; 149 | 150 | transpose.w = simd4f_create(0,0,0,0); 151 | simd4x4f_transpose_inplace(&transpose); 152 | 153 | simd4x4f_matrix_vector3_mul(&transpose, &translation, out); 154 | } 155 | 156 | 157 | vectorial_inline void simd4x4f_matrix_mul(const simd4x4f* a, const simd4x4f* b, simd4x4f* out) { 158 | 159 | simd4x4f_matrix_vector_mul(a, &b->x, &out->x); 160 | simd4x4f_matrix_vector_mul(a, &b->y, &out->y); 161 | simd4x4f_matrix_vector_mul(a, &b->z, &out->z); 162 | simd4x4f_matrix_vector_mul(a, &b->w, &out->w); 163 | 164 | } 165 | 166 | 167 | 168 | 169 | vectorial_inline void simd4x4f_perspective(simd4x4f *m, float fovy_radians, float aspect, float znear, float zfar) { 170 | 171 | float deltaz = zfar - znear; 172 | float cotangent = tanf( VECTORIAL_HALFPI - fovy_radians * 0.5f ); 173 | 174 | float a = cotangent / aspect; 175 | float b = cotangent; 176 | float c = -(zfar + znear) / deltaz; 177 | float d = -2 * znear * zfar / deltaz; 178 | 179 | m->x = simd4f_create( a, 0, 0, 0); 180 | m->y = simd4f_create( 0, b, 0, 0); 181 | m->z = simd4f_create( 0, 0, c, -1); 182 | m->w = simd4f_create( 0, 0, d, 0); 183 | 184 | } 185 | 186 | vectorial_inline void simd4x4f_ortho(simd4x4f *m, float left, float right, float bottom, float top, float znear, float zfar) { 187 | 188 | float deltax = right - left; 189 | float deltay = top - bottom; 190 | float deltaz = zfar - znear; 191 | 192 | float a = 2.0f / deltax; 193 | float b = -(right + left) / deltax; 194 | float c = 2.0f / deltay; 195 | float d = -(top + bottom) / deltay; 196 | float e = -2.0f / deltaz; 197 | float f = -(zfar + znear) / deltaz; 198 | 199 | m->x = simd4f_create( a, 0, 0, 0); 200 | m->y = simd4f_create( 0, c, 0, 0); 201 | m->z = simd4f_create( 0, 0, e, 0); 202 | m->w = simd4f_create( b, d, f, 1); 203 | 204 | } 205 | 206 | 207 | vectorial_inline void simd4x4f_lookat(simd4x4f *m, simd4f eye, simd4f center, simd4f up) { 208 | 209 | simd4f zaxis = simd4f_normalize3( simd4f_sub(center, 
eye) ); 210 | simd4f xaxis = simd4f_normalize3( simd4f_cross3( zaxis, up ) ); 211 | simd4f yaxis = simd4f_cross3(xaxis, zaxis); 212 | 213 | zaxis = simd4f_sub( simd4f_zero(), zaxis); 214 | 215 | float x = -simd4f_dot3_scalar(xaxis, eye); 216 | float y = -simd4f_dot3_scalar(yaxis, eye); 217 | float z = -simd4f_dot3_scalar(zaxis, eye); 218 | 219 | m->x = xaxis; 220 | m->y = yaxis; 221 | m->z = zaxis; 222 | 223 | m->w = simd4f_create( 0,0,0, 1); 224 | simd4x4f_transpose_inplace(m); 225 | m->w = simd4f_create( x,y,z,1); 226 | 227 | } 228 | 229 | 230 | vectorial_inline void simd4x4f_translation(simd4x4f* m, float x, float y, float z) { 231 | *m = simd4x4f_create( simd4f_create(1.0f, 0.0f, 0.0f, 0.0f), 232 | simd4f_create(0.0f, 1.0f, 0.0f, 0.0f), 233 | simd4f_create(0.0f, 0.0f, 1.0f, 0.0f), 234 | simd4f_create( x, y, z, 1.0f)); 235 | } 236 | 237 | 238 | vectorial_inline void simd4x4f_axis_rotation(simd4x4f* m, float radians, simd4f axis) { 239 | 240 | radians = -radians; 241 | 242 | axis = simd4f_normalize3(axis); 243 | 244 | const float sine = sinf(radians); 245 | const float cosine = cosf(radians); 246 | 247 | const float x = simd4f_get_x(axis); 248 | const float y = simd4f_get_y(axis); 249 | const float z = simd4f_get_z(axis); 250 | 251 | const float ab = x * y * (1 - cosine); 252 | const float bc = y * z * (1 - cosine); 253 | const float ca = z * x * (1 - cosine); 254 | 255 | const float tx = x * x; 256 | const float ty = y * y; 257 | const float tz = z * z; 258 | 259 | const simd4f i = simd4f_create( tx + cosine * (1 - tx), ab - z * sine, ca + y * sine, 0); 260 | const simd4f j = simd4f_create( ab + z * sine, ty + cosine * (1 - ty), bc - x * sine, 0); 261 | const simd4f k = simd4f_create( ca - y * sine, bc + x * sine, tz + cosine * (1 - tz), 0); 262 | 263 | *m = simd4x4f_create( i,j,k, simd4f_create(0, 0, 0, 1) ); 264 | 265 | } 266 | 267 | 268 | 269 | vectorial_inline void simd4x4f_add(const simd4x4f* a, const simd4x4f* b, simd4x4f* out) { 270 | 271 | out->x = 
simd4f_add(a->x, b->x); 272 | out->y = simd4f_add(a->y, b->y); 273 | out->z = simd4f_add(a->z, b->z); 274 | out->w = simd4f_add(a->w, b->w); 275 | 276 | } 277 | 278 | vectorial_inline void simd4x4f_sub(const simd4x4f* a, const simd4x4f* b, simd4x4f* out) { 279 | 280 | out->x = simd4f_sub(a->x, b->x); 281 | out->y = simd4f_sub(a->y, b->y); 282 | out->z = simd4f_sub(a->z, b->z); 283 | out->w = simd4f_sub(a->w, b->w); 284 | 285 | } 286 | 287 | vectorial_inline void simd4x4f_mul(const simd4x4f* a, const simd4x4f* b, simd4x4f* out) { 288 | 289 | out->x = simd4f_mul(a->x, b->x); 290 | out->y = simd4f_mul(a->y, b->y); 291 | out->z = simd4f_mul(a->z, b->z); 292 | out->w = simd4f_mul(a->w, b->w); 293 | 294 | } 295 | 296 | vectorial_inline void simd4x4f_div(simd4x4f* a, simd4x4f* b, simd4x4f* out) { 297 | 298 | out->x = simd4f_div(a->x, b->x); 299 | out->y = simd4f_div(a->y, b->y); 300 | out->z = simd4f_div(a->z, b->z); 301 | out->w = simd4f_div(a->w, b->w); 302 | 303 | } 304 | 305 | vectorial_inline simd4f simd4x4f_inverse(const simd4x4f* a, simd4x4f* out) { 306 | 307 | const simd4f c0 = a->x; 308 | const simd4f c1 = a->y; 309 | const simd4f c2 = a->z; 310 | const simd4f c3 = a->w; 311 | 312 | const simd4f c0_wxyz = simd4f_shuffle_wxyz(c0); 313 | const simd4f c0_zwxy = simd4f_shuffle_zwxy(c0); 314 | const simd4f c0_yzwx = simd4f_shuffle_yzwx(c0); 315 | 316 | const simd4f c1_wxyz = simd4f_shuffle_wxyz(c1); 317 | const simd4f c1_zwxy = simd4f_shuffle_zwxy(c1); 318 | const simd4f c1_yzwx = simd4f_shuffle_yzwx(c1); 319 | 320 | const simd4f c2_wxyz = simd4f_shuffle_wxyz(c2); 321 | const simd4f c2_zwxy = simd4f_shuffle_zwxy(c2); 322 | const simd4f c2_yzwx = simd4f_shuffle_yzwx(c2); 323 | 324 | const simd4f c3_wxyz = simd4f_shuffle_wxyz(c3); 325 | const simd4f c3_zwxy = simd4f_shuffle_zwxy(c3); 326 | const simd4f c3_yzwx = simd4f_shuffle_yzwx(c3); 327 | 328 | const simd4f c0_wxyz_x_c1 = simd4f_mul(c0_wxyz, c1); 329 | const simd4f c0_wxyz_x_c1_yzwx = simd4f_mul(c0_wxyz, c1_yzwx); 
330 | const simd4f c0_wxyz_x_c1_zwxy = simd4f_mul(c0_wxyz, c1_zwxy); 331 | 332 | const simd4f c2_wxyz_x_c3 = simd4f_mul(c2_wxyz, c3); 333 | const simd4f c2_wxyz_x_c3_yzwx = simd4f_mul(c2_wxyz, c3_yzwx); 334 | const simd4f c2_wxyz_x_c3_zwxy = simd4f_mul(c2_wxyz, c3_zwxy); 335 | 336 | const simd4f ar1 = simd4f_sub( simd4f_shuffle_wxyz(c2_wxyz_x_c3_zwxy), simd4f_shuffle_zwxy(c2_wxyz_x_c3) ); 337 | const simd4f ar2 = simd4f_sub( simd4f_shuffle_zwxy(c2_wxyz_x_c3_yzwx), c2_wxyz_x_c3_yzwx ); 338 | const simd4f ar3 = simd4f_sub( c2_wxyz_x_c3_zwxy, simd4f_shuffle_wxyz(c2_wxyz_x_c3) ); 339 | 340 | const simd4f br1 = simd4f_sub( simd4f_shuffle_wxyz(c0_wxyz_x_c1_zwxy), simd4f_shuffle_zwxy(c0_wxyz_x_c1) ); 341 | const simd4f br2 = simd4f_sub( simd4f_shuffle_zwxy(c0_wxyz_x_c1_yzwx), c0_wxyz_x_c1_yzwx ); 342 | const simd4f br3 = simd4f_sub( c0_wxyz_x_c1_zwxy, simd4f_shuffle_wxyz(c0_wxyz_x_c1) ); 343 | 344 | 345 | const simd4f c0_sum = simd4f_madd(c0_yzwx, ar3, 346 | simd4f_madd(c0_zwxy, ar2, 347 | simd4f_mul(c0_wxyz, ar1))); 348 | 349 | const simd4f c1_sum = simd4f_madd(c1_wxyz, ar1, 350 | simd4f_madd(c1_zwxy, ar2, 351 | simd4f_mul(c1_yzwx, ar3))); 352 | 353 | const simd4f c2_sum = simd4f_madd(c2_yzwx, br3, 354 | simd4f_madd(c2_zwxy, br2, 355 | simd4f_mul(c2_wxyz, br1))); 356 | 357 | const simd4f c3_sum = simd4f_madd(c3_yzwx, br3, 358 | simd4f_madd(c3_zwxy, br2, 359 | simd4f_mul(c3_wxyz, br1))); 360 | 361 | 362 | const simd4f d0 = simd4f_mul(c1_sum, c0); 363 | const simd4f d1 = simd4f_add(d0, simd4f_merge_high(d0, d0)); 364 | const simd4f det = simd4f_sub(d1, simd4f_splat_y(d1)); 365 | 366 | const simd4f invdet = simd4f_splat_x( simd4f_div(simd4f_splat(1.0f), det) ); 367 | 368 | const simd4f o0 = simd4f_mul( simd4f_flip_sign_0101(c1_sum), invdet ); 369 | const simd4f o1 = simd4f_mul( simd4f_flip_sign_1010(c0_sum), invdet ); 370 | const simd4f o2 = simd4f_mul( simd4f_flip_sign_0101(c3_sum), invdet ); 371 | const simd4f o3 = simd4f_mul( simd4f_flip_sign_1010(c2_sum), invdet ); 372 
| 373 | const simd4x4f mt = simd4x4f_create(o0, o1, o2, o3); 374 | 375 | simd4x4f_transpose( &mt, out); 376 | 377 | return det; 378 | } 379 | 380 | #ifdef __cplusplus 381 | 382 | #ifdef VECTORIAL_OSTREAM 383 | #include 384 | 385 | vectorial_inline std::ostream& operator<<(std::ostream& os, const simd4x4f& v) { 386 | os << "simd4x4f(simd4f(" << simd4f_get_x(v.x) << ", " 387 | << simd4f_get_y(v.x) << ", " 388 | << simd4f_get_z(v.x) << ", " 389 | << simd4f_get_w(v.x) << "),\n" 390 | << " simd4f(" << simd4f_get_x(v.y) << ", " 391 | << simd4f_get_y(v.y) << ", " 392 | << simd4f_get_z(v.y) << ", " 393 | << simd4f_get_w(v.y) << "),\n" 394 | << " simd4f(" << simd4f_get_x(v.z) << ", " 395 | << simd4f_get_y(v.z) << ", " 396 | << simd4f_get_z(v.z) << ", " 397 | << simd4f_get_w(v.z) << "),\n" 398 | << " simd4f(" << simd4f_get_x(v.w) << ", " 399 | << simd4f_get_y(v.w) << ", " 400 | << simd4f_get_z(v.w) << ", " 401 | << simd4f_get_w(v.w) << "))"; 402 | return os; 403 | } 404 | #endif 405 | 406 | #endif 407 | 408 | 409 | 410 | 411 | 412 | #endif 413 | -------------------------------------------------------------------------------- /spec/spec_simd4f.cpp: -------------------------------------------------------------------------------- 1 | 2 | #include "spec_helper.h" 3 | 4 | const int epsilon = 1; 5 | 6 | describe(simd4f, "sanity") { 7 | it("VECTORIAL_SIMD_TYPE should be defined to a string") { 8 | std::cout << "Simd type: " << VECTORIAL_SIMD_TYPE << std::endl; 9 | } 10 | } 11 | 12 | describe(simd4f, "creating") { 13 | 14 | it("should be possible to create with simd4f_create") { 15 | 16 | simd4f x = simd4f_create(1, 2, 3, 4); 17 | 18 | should_be_close_to( simd4f_get_x(x), 1, epsilon); 19 | should_be_close_to( simd4f_get_y(x), 2, epsilon); 20 | should_be_close_to( simd4f_get_z(x), 3, epsilon); 21 | should_be_close_to( simd4f_get_w(x), 4, epsilon); 22 | 23 | // octave simd4f: [1,2,3,4] 24 | should_be_equal_simd4f(x, simd4f_create(1.000000000000000f, 2.000000000000000f, 
3.000000000000000f, 4.000000000000000f), epsilon ); 25 | 26 | } 27 | 28 | it("should have simd4f_zero for zero vector") { 29 | 30 | simd4f x = simd4f_zero(); 31 | 32 | // octave simd4f: [0,0,0,0] 33 | should_be_equal_simd4f(x, simd4f_create(0.000000000000000f, 0.000000000000000f, 0.000000000000000f, 0.000000000000000f), epsilon ); 34 | } 35 | 36 | 37 | } 38 | #ifdef _MSC_VER 39 | #include 40 | #else 41 | #include 42 | #endif 43 | 44 | #define unaligned_mem(n) ((float*)((unsigned char*)alloca(sizeof(float)*n+4)+4)) 45 | 46 | describe(simd4f, "utilities") { 47 | 48 | it("should have simd4f_uload4 for loading four float values from an unaligned float array into simd4f") { 49 | float *f = unaligned_mem(4); 50 | f[0] = 1; 51 | f[1] = 2; 52 | f[2] = 3; 53 | f[3] = 4; 54 | simd4f x = simd4f_uload4(f); 55 | // octave simd4f: [1,2,3,4] 56 | should_be_equal_simd4f(x, simd4f_create(1.000000000000000f, 2.000000000000000f, 3.000000000000000f, 4.000000000000000f), epsilon ); 57 | } 58 | 59 | it("should have simd4f_uload3 for loading three float values from an unaligned float array into simd4f") { 60 | float *f = unaligned_mem(3); 61 | f[0] = 1; 62 | f[1] = 2; 63 | f[2] = 3; 64 | simd4f x = simd4f_uload3(f); 65 | // octave simd4f: [1,2,3] 66 | should_be_equal_simd4f(x, simd4f_create(1.000000000000000f, 2.000000000000000f, 3.000000000000000f, 0.0f), epsilon ); 67 | } 68 | 69 | it("should have simd4f_uload2 for loading two float values from float an unaligned array into simd4f") { 70 | float *f = unaligned_mem(2); 71 | f[0] = 1; 72 | f[1] = 2; 73 | simd4f x = simd4f_uload2(f); 74 | // octave simd4f: [1,2] 75 | should_be_equal_simd4f(x, simd4f_create(1.000000000000000f, 2.000000000000000f, 0.0f, 0.0f), epsilon ); 76 | } 77 | 78 | 79 | it("should have simd4f_ustore4 for storing four float values from simd4f to an unaligned array") { 80 | float *f = unaligned_mem(4); 81 | f[0] = -1; 82 | f[1] = -1; 83 | f[2] = -1; 84 | f[3] = -1; 85 | simd4f a = simd4f_create(1,2,3,4); 86 | 
simd4f_ustore4(a, f); 87 | should_be_close_to(f[0], 1, epsilon); 88 | should_be_close_to(f[1], 2, epsilon); 89 | should_be_close_to(f[2], 3, epsilon); 90 | should_be_close_to(f[3], 4, epsilon); 91 | } 92 | 93 | it("should have simd4f_ustore3 for storing three float values from simd4f to an unaligned array") { 94 | float *f = unaligned_mem(3); 95 | f[0] = -1; 96 | f[1] = -1; 97 | f[2] = -1; 98 | simd4f a = simd4f_create(1,2,3,4); 99 | simd4f_ustore3(a, f); 100 | should_be_close_to(f[0], 1, epsilon); 101 | should_be_close_to(f[1], 2, epsilon); 102 | should_be_close_to(f[2], 3, epsilon); 103 | } 104 | 105 | it("should have simd4f_ustore2 for storing two float values from simd4f to an unaligned array") { 106 | float *f = unaligned_mem(2); 107 | f[0] = -1; 108 | f[1] = -1; 109 | simd4f a = simd4f_create(1,2,3,4); 110 | simd4f_ustore2(a, f); 111 | should_be_close_to(f[0], 1, epsilon); 112 | should_be_close_to(f[1], 2, epsilon); 113 | } 114 | 115 | 116 | 117 | 118 | it("should have simd4f_splat that expands a single scalar to all elements") { 119 | simd4f x = simd4f_splat(42); 120 | // octave simd4f: [42,42,42,42] 121 | should_be_equal_simd4f(x, simd4f_create(42.000000000000000f, 42.000000000000000f, 42.000000000000000f, 42.000000000000000f), epsilon ); 122 | } 123 | 124 | it("should have simd4f_splat_x,y,z,w splatting of an element") { 125 | simd4f a = simd4f_create(1,2,3,4); 126 | 127 | simd4f x; 128 | 129 | x = simd4f_splat_x(a); 130 | // octave simd4f: [1,1,1,1] 131 | should_be_equal_simd4f(x, simd4f_create(1.000000000000000f, 1.000000000000000f, 1.000000000000000f, 1.000000000000000f), epsilon ); 132 | 133 | x = simd4f_splat_y(a); 134 | // octave simd4f: [2,2,2,2] 135 | should_be_equal_simd4f(x, simd4f_create(2.000000000000000f, 2.000000000000000f, 2.000000000000000f, 2.000000000000000f), epsilon ); 136 | 137 | x = simd4f_splat_z(a); 138 | // octave simd4f: [3,3,3,3] 139 | should_be_equal_simd4f(x, simd4f_create(3.000000000000000f, 3.000000000000000f, 
3.000000000000000f, 3.000000000000000f), epsilon ); 140 | 141 | x = simd4f_splat_w(a); 142 | // octave simd4f: [4,4,4,4] 143 | should_be_equal_simd4f(x, simd4f_create(4.000000000000000f, 4.000000000000000f, 4.000000000000000f, 4.000000000000000f), epsilon ); 144 | } 145 | 146 | it("should have simd4f_sum that adds elements") { 147 | simd4f a = simd4f_create(1,2,3,4); 148 | simd4f x = simd4f_sum(a); 149 | // octave simd4f: [sum([1,2,3,4]), sum([1,2,3,4]), sum([1,2,3,4]), sum([1,2,3,4])] 150 | should_be_equal_simd4f(x, simd4f_create(10.000000000000000f, 10.000000000000000f, 10.000000000000000f, 10.000000000000000f), epsilon ); 151 | 152 | } 153 | 154 | it("should have simd4f_reciprocal") { 155 | simd4f a = simd4f_create(0.00001f, 2.00001f, 3.0f, 99999999.0f); 156 | simd4f x = simd4f_reciprocal(a); 157 | // octave simd4f: 1 ./ [0.00001, 2.00001, 3.0, 99999999.0] 158 | should_be_equal_simd4f(x, simd4f_create(99999.999999999985448f, 0.499997500012500f, 0.333333333333333f, 0.000000010000000f), epsilon ); 159 | } 160 | 161 | it("should have simd4f_sqrt") { 162 | simd4f a = simd4f_create(0.00001f, 2.00001f, 3.0f, 99999999.0f); 163 | simd4f x = simd4f_sqrt(a); 164 | // octave simd4f: sqrt([0.00001, 2.00001, 3.0, 99999999.0]) 165 | should_be_equal_simd4f(x, simd4f_create(0.003162277660168f, 1.414217097902582f, 1.732050807568877f, 9999.999949999999444f), epsilon ); 166 | 167 | x = simd4f_sqrt( simd4f_create(0.0f, 0.0f, 0.0f, 0.0f) ); 168 | // octave simd4f: sqrt([0, 0, 0, 0]) 169 | should_be_equal_simd4f(x, simd4f_create(0.000000000000000f, 0.000000000000000f, 0.000000000000000f, 0.000000000000000f), epsilon ); 170 | } 171 | 172 | it("should have simd4f_rsqrt for reciprocal of square-root") { 173 | simd4f a = simd4f_create(0.00001f, 2.00001f, 3.0f, 99999999.0f); 174 | simd4f x = simd4f_rsqrt(a); 175 | const int epsilon = 4; // Grant larger error 176 | // octave simd4f: 1 ./ sqrt([0.00001, 2.00001, 3.0, 99999999.0]) 177 | should_be_equal_simd4f(x, 
simd4f_create(316.227766016837904f, 0.707105013426224f, 0.577350269189626f, 0.000100000000500f), epsilon ); 178 | } 179 | 180 | } 181 | 182 | describe(simd4f, "arithmetic with another simd4f") { 183 | 184 | it("should have simd4f_add for component-wise addition") { 185 | simd4f a = simd4f_create(1,2,3,4); 186 | simd4f b = simd4f_create(10,20,30,40); 187 | 188 | simd4f x = simd4f_add(a,b); 189 | // octave simd4f: [1,2,3,4] + [10,20,30,40] 190 | should_be_equal_simd4f(x, simd4f_create(11.000000000000000f, 22.000000000000000f, 33.000000000000000f, 44.000000000000000f), epsilon ); 191 | } 192 | 193 | it("should have simd4f_sub for component-wise subtraction") { 194 | simd4f a = simd4f_create(1,2,3,4); 195 | simd4f b = simd4f_create(10,20,30,40); 196 | 197 | simd4f x = simd4f_sub(b,a); 198 | // octave simd4f: [10,20,30,40] - [1,2,3,4] 199 | should_be_equal_simd4f(x, simd4f_create(9.000000000000000f, 18.000000000000000f, 27.000000000000000f, 36.000000000000000f), epsilon ); 200 | } 201 | 202 | it("should have simd4f_mul for component-wise multiply") { 203 | simd4f a = simd4f_create(1,2,3,4); 204 | simd4f b = simd4f_create(10,20,30,40); 205 | 206 | simd4f x = simd4f_mul(a,b); 207 | // octave simd4f: [1,2,3,4] .* [10,20,30,40] 208 | should_be_equal_simd4f(x, simd4f_create(10.000000000000000f, 40.000000000000000f, 90.000000000000000f, 160.000000000000000f), epsilon ); 209 | } 210 | 211 | it("should have simd4f_div for component-wise division") { 212 | simd4f a = simd4f_create(1,2,3,4); 213 | simd4f b = simd4f_create(10,20,30,40); 214 | 215 | simd4f x = simd4f_div(b,a); 216 | // octave simd4f: [10,20,30,40] ./ [1,2,3,4] 217 | should_be_equal_simd4f(x, simd4f_create(10.000000000000000f, 10.000000000000000f, 10.000000000000000f, 10.000000000000000f), epsilon ); 218 | } 219 | 220 | it("should have simd4f_madd for multiply-add") { 221 | simd4f a = simd4f_create(1,2,3,4); 222 | simd4f b = simd4f_create(100,100,100,100); 223 | simd4f c = simd4f_create(6,7,8,9); 224 | 225 | simd4f 
x = simd4f_madd(a,b,c); 226 | // octave simd4f: [1,2,3,4] .* [100,100,100,100] .+ [6,7,8,9] 227 | should_be_equal_simd4f(x, simd4f_create(106.000000000000000f, 207.000000000000000f, 308.000000000000000f, 409.000000000000000f), epsilon ); 228 | 229 | } 230 | 231 | } 232 | 233 | 234 | describe(simd4f, "vector math") { 235 | 236 | it("should have simd4f_dot4 for four component dot product") { 237 | simd4f a = simd4f_create(1,2,3,4); 238 | simd4f b = simd4f_create(10,20,30,40); 239 | 240 | simd4f x = simd4f_dot4(a,b); 241 | // octave simd4f: [dot([1, 2, 3, 4], [10, 20, 30, 40]),dot([1, 2, 3, 4], [10, 20, 30, 40]),dot([1, 2, 3, 4], [10, 20, 30, 40]),dot([1, 2, 3, 4], [10, 20, 30, 40])] 242 | should_be_equal_simd4f(x, simd4f_create(300.000000000000000f, 300.000000000000000f, 300.000000000000000f, 300.000000000000000f), epsilon ); 243 | } 244 | 245 | it("should have simd4f_dot3_scalar for three component dot product returning float") { 246 | simd4f a = simd4f_create(1,2,3,9999); 247 | simd4f b = simd4f_create(10,20,30,-9990); 248 | 249 | float x = simd4f_dot3_scalar(a,b); 250 | // octave float: dot([1, 2, 3], [10, 20, 30]) 251 | should_be_close_to(x, 140.000000000000000f, epsilon ); 252 | } 253 | 254 | it("should have simd4f_dot3 for three component dot product returning simd4f") { 255 | simd4f a = simd4f_create(1,2,3,9999); 256 | simd4f b = simd4f_create(10,20,30,-9990); 257 | 258 | simd4f x = simd4f_dot3(a,b); 259 | // octave simd4f: [dot([1, 2, 3], [10, 20, 30]),dot([1, 2, 3], [10, 20, 30]),dot([1, 2, 3], [10, 20, 30]),dot([1, 2, 3], [10, 20, 30])] 260 | should_be_equal_simd4f(x, simd4f_create(140.000000000000000f, 140.000000000000000f, 140.000000000000000f, 140.000000000000000f), epsilon ); 261 | } 262 | 263 | it("should have simd4f_dot2 for two component dot product") { 264 | simd4f a = simd4f_create(1,2,3,9999); 265 | simd4f b = simd4f_create(10,20,30,-9990); 266 | 267 | simd4f x = simd4f_dot2(a,b); 268 | // octave simd4f: [dot([1, 2], [10, 20]),dot([1, 2], [10, 
20]),dot([1, 2], [10, 20]),dot([1, 2], [10, 20])] 269 | should_be_equal_simd4f(x, simd4f_create(50.000000000000000f, 50.000000000000000f, 50.000000000000000f, 50.000000000000000f), epsilon ); 270 | } 271 | 272 | it("should have simd4f_length4 for four component vector length") { 273 | simd4f a = simd4f_create(1,2,-3,9999); 274 | simd4f x = simd4f_length4(a); 275 | // octave simd4f: [norm([1,2,-3,9999]), norm([1,2,-3,9999]), norm([1,2,-3,9999]), norm([1,2,-3,9999])] 276 | should_be_equal_simd4f(x, simd4f_create(9999.000700069982486f, 9999.000700069982486f, 9999.000700069982486f, 9999.000700069982486f), epsilon ); 277 | 278 | } 279 | 280 | it("should have simd4f_length3 for three component vector length") { 281 | simd4f a = simd4f_create(1,2,-3,9999); 282 | simd4f x = simd4f_length3(a); 283 | // octave simd4f: [norm([1,2,-3]), norm([1,2,-3]), norm([1,2,-3]), norm([1,2,-3])] 284 | should_be_equal_simd4f(x, simd4f_create(3.741657386773941f, 3.741657386773941f, 3.741657386773941f, 3.741657386773941f), epsilon ); 285 | 286 | } 287 | 288 | it("should have simd4f_length2 for two component vector length") { 289 | simd4f a = simd4f_create(1,2,-3,9999); 290 | simd4f x = simd4f_length2(a); 291 | // octave simd4f: [norm([1,2]),norm([1,2]),norm([1,2]),norm([1,2])] 292 | should_be_equal_simd4f(x, simd4f_create(2.236067977499790f, 2.236067977499790f, 2.236067977499790f, 2.236067977499790f), epsilon ); 293 | 294 | } 295 | 296 | 297 | it("should have simd4f_length4_squared for four component squared vector length") { 298 | simd4f a = simd4f_create(1,2,-3,9999); 299 | simd4f x = simd4f_length4_squared(a); 300 | // octave simd4f: ([(dot([1,2,-3,9999], [1,2,-3,9999])), (dot([1,2,-3,9999], [1,2,-3,9999])), (dot([1,2,-3,9999], [1,2,-3,9999])), (dot([1,2,-3,9999], [1,2,-3,9999]))]) 301 | should_be_equal_simd4f(x, simd4f_create(99980015.000000000000000f, 99980015.000000000000000f, 99980015.000000000000000f, 99980015.000000000000000f), epsilon ); 302 | 303 | } 304 | 305 | it("should have 
simd4f_length3_squared for three component squared vector length") { 306 | simd4f a = simd4f_create(1,2,-3,9999); 307 | simd4f x = simd4f_length3_squared(a); 308 | // octave simd4f: ([dot([1,2,-3], [1,2,-3]), dot([1,2,-3], [1,2,-3]), dot([1,2,-3], [1,2,-3]), dot([1,2,-3], [1,2,-3])]) 309 | should_be_equal_simd4f(x, simd4f_create(14.000000000000000f, 14.000000000000000f, 14.000000000000000f, 14.000000000000000f), epsilon ); 310 | 311 | } 312 | 313 | it("should have simd4f_length2_squared for two component squared vector length") { 314 | simd4f a = simd4f_create(1,2,-3,9999); 315 | simd4f x = simd4f_length2_squared(a); 316 | // octave simd4f: ([dot([1,2], [1,2]), dot([1,2], [1,2]), dot([1,2], [1,2]), dot([1,2], [1,2])]) 317 | should_be_equal_simd4f(x, simd4f_create(5.000000000000000f, 5.000000000000000f, 5.000000000000000f, 5.000000000000000f), epsilon ); 318 | 319 | } 320 | 321 | 322 | 323 | it("should have simd4f_cross3 for cross product") { 324 | simd4f a = simd4f_create(1,12,3,-9999); 325 | simd4f b = simd4f_create(5,6,-17, 9999); 326 | 327 | simd4f x = simd4f_cross3(a,b); 328 | // octave simd4f: horzcat( cross( [1,12,3], [5,6,-17] ) , [0] ) 329 | should_be_equal_simd4f(x, simd4f_create(-222.000000000000000f, 32.000000000000000f, -54.000000000000000f, 0.000000000000000f), epsilon ); 330 | 331 | } 332 | 333 | it("should have simd4f_normalize4 for normalizing four const vector to unit length") { 334 | simd4f a = simd4f_create(1,2,3,4); 335 | simd4f x = simd4f_normalize4(a); 336 | // octave simd4f: [1,2,3,4] / norm([1,2,3,4]) 337 | should_be_equal_simd4f(x, simd4f_create(0.182574185835055f, 0.365148371670111f, 0.547722557505166f, 0.730296743340221f), epsilon ); 338 | } 339 | 340 | it("should have simd4f_normalize3 for normalizing three component vector to unit length") { 341 | simd4f a = simd4f_create(1,2,3,0); 342 | simd4f x = simd4f_normalize3(a); 343 | // octave simd4f: [1,2,3,0] / norm([1,2,3]) 344 | should_be_equal_simd4f(x, simd4f_create(0.267261241912424f, 
0.534522483824849f, 0.801783725737273f, 0.000000000000000f), epsilon ); 345 | } 346 | 347 | it("should have simd4f_normalize2 for normalizing two component vector to unit length") { 348 | simd4f a = simd4f_create(1,2,0,0); 349 | simd4f x = simd4f_normalize2(a); 350 | // octave simd4f: [1,2,0,0] / norm([1,2]) 351 | should_be_equal_simd4f(x, simd4f_create(0.447213595499958f, 0.894427190999916f, 0.000000000000000f, 0.000000000000000f), epsilon ); 352 | } 353 | 354 | 355 | } 356 | 357 | describe(simd4f, "shuffles and merges") { 358 | 359 | it("should have simd4f_shuffle_wxyz") { 360 | simd4f a = simd4f_create(1,2,3,4); 361 | simd4f x = simd4f_shuffle_wxyz(a); 362 | should_be_equal_simd4f(x, simd4f_create(4,1,2,3), epsilon ); 363 | } 364 | 365 | it("should have simd4f_shuffle_zwxy") { 366 | simd4f a = simd4f_create(1,2,3,4); 367 | simd4f x = simd4f_shuffle_zwxy(a); 368 | should_be_equal_simd4f(x, simd4f_create(3,4,1,2), epsilon ); 369 | } 370 | 371 | it("should have simd4f_shuffle_yzwx") { 372 | simd4f a = simd4f_create(1,2,3,4); 373 | simd4f x = simd4f_shuffle_yzwx(a); 374 | should_be_equal_simd4f(x, simd4f_create(2,3,4,1), epsilon ); 375 | } 376 | 377 | it("should have simd4f_merge_high") { 378 | simd4f a = simd4f_create(1,2,3,4); 379 | simd4f b = simd4f_create(5,6,7,8); 380 | simd4f x = simd4f_merge_high(a,b); 381 | should_be_equal_simd4f(x, simd4f_create(3,4,7,8), epsilon ); 382 | } 383 | 384 | } 385 | 386 | describe(simd4f, "signs") { 387 | 388 | it("should have simd4f_flip_sign_0101 for flipping even elements sign") { 389 | simd4f a = simd4f_create(1,2,3,4); 390 | simd4f x = simd4f_flip_sign_0101(a); 391 | should_be_equal_simd4f(x, simd4f_create(1,-2,3,-4), epsilon ); 392 | } 393 | 394 | it("should have simd4f_flip_sign_1010 for flipping even elements sign") { 395 | simd4f a = simd4f_create(1,2,3,4); 396 | simd4f x = simd4f_flip_sign_1010(a); 397 | should_be_equal_simd4f(x, simd4f_create(-1,2,-3,4), epsilon ); 398 | } 399 | 400 | } 401 | 402 | describe(simd4f, 
"min-max") { 403 | 404 | it("should have simd4f_min for choosing minimum elements") { 405 | simd4f a = simd4f_create(1.0f, 2.0f, -300000000.0f, -0.000002f); 406 | simd4f b = simd4f_create(2.0f, -2.0f, 300000000.0f, 0.000001f); 407 | 408 | simd4f x = simd4f_min(a,b); 409 | should_be_equal_simd4f(x, simd4f_create(1.0f, -2.0f, -300000000.0f, -0.000002f), epsilon); 410 | 411 | } 412 | 413 | it("should have simd4f_max for choosing maximum elements") { 414 | simd4f a = simd4f_create(1.0f, 2.0f, -300000000.0f, -0.000002f); 415 | simd4f b = simd4f_create(2.0f, -2.0f, 300000000.0f, 0.000001f); 416 | 417 | simd4f x = simd4f_max(a,b); 418 | should_be_equal_simd4f(x, simd4f_create(2.0f, 2.0f, 300000000.0f, 0.000001f), epsilon); 419 | 420 | } 421 | 422 | 423 | 424 | } 425 | 426 | 427 | describe(simd4f, "zeroing") 428 | { 429 | 430 | it("should have simd4f_zero_w that zeros the last element") 431 | { 432 | const float nan = sqrtf(-1.0f); 433 | simd4f a = simd4f_create(1.0f, 2.0f, 3.0f, 4.0f); 434 | simd4f b = simd4f_create(1.0f, 2.0f, 3.0f, nan); 435 | simd4f x = simd4f_zero_w(a); 436 | should_be_equal_simd4f(x, simd4f_create(1.0f, 2.0f, 3.0f, 0.0f), epsilon); 437 | x = simd4f_zero_w(b); 438 | should_be_equal_simd4f(x, simd4f_create(1.0f, 2.0f, 3.0f, 0.0f), epsilon); 439 | } 440 | 441 | it("should have simd4f_zero_zw that zeros the last element") 442 | { 443 | const float nan = sqrtf(-1.0f); 444 | simd4f a = simd4f_create(1.0f, 2.0f, 3.0f, 4.0f); 445 | simd4f b = simd4f_create(1.0f, 2.0f, nan, nan); 446 | simd4f x = simd4f_zero_zw(a); 447 | should_be_equal_simd4f(x, simd4f_create(1.0f, 2.0f, 0.0f, 0.0f), epsilon); 448 | x = simd4f_zero_zw(b); 449 | should_be_equal_simd4f(x, simd4f_create(1.0f, 2.0f, 0.0f, 0.0f), epsilon); 450 | } 451 | 452 | } 453 | 454 | 455 | 456 | 457 | 458 | --------------------------------------------------------------------------------