├── .gitignore
├── Makefile
├── README.md
└── source
    ├── bvec4.h
    ├── dmat2.h
    ├── dmat4.h
    ├── dvec2.h
    ├── dvec4.h
    ├── ivec4.h
    ├── main.cpp
    ├── mat4.h
    ├── swizzle2.h
    ├── swizzle4.h
    ├── tests
    │   ├── test.h
    │   ├── vec4.cpp
    │   └── vec4.h
    ├── uvec4.h
    └── vec4.h

/.gitignore:
--------------------------------------------------------------------------------
1 | .build/
2 | 
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
1 | #---------------------------------------------------------------------------------
2 | .SUFFIXES:
3 | .SILENT:
4 | 
5 | #---------------------------------------------------------------------------------
6 | # TARGET is the name of the output
7 | # EXT is the extension of the application (example: .exe)
8 | # BUILD is the directory where object files & intermediate files will be placed
9 | # SOURCES is a list of directories containing source code
10 | # PACKAGES is a list of packages to link to the project (example: freefont)
11 | # CROSS is a toolchain prefix for cross compilation, ending with a dash (example: mingw32-msvc-)
12 | # VERSION is the GCC version suffix (example: -3.4)
13 | #---------------------------------------------------------------------------------
14 | TARGET := $(shell basename $(CURDIR))
15 | EXT :=
16 | BUILD := .build
17 | SOURCES := source source/tests
18 | PACKAGES :=
19 | CROSS :=
20 | VERSION :=
21 | 
22 | #---------------------------------------------------------------------------------
23 | # options for code generation
24 | #---------------------------------------------------------------------------------
25 | ASFLAGS :=
26 | CFLAGS := -W -Wall -msse2 -g -O2
27 | CXXFLAGS := -fno-rtti -fno-exceptions -fomit-frame-pointer
28 | LDFLAGS :=
29 | 
30 | #---------------------------------------------------------------------------------
31 | # any extra libraries we wish to link with the project
32 | #---------------------------------------------------------------------------------
33 | LIBS :=
34 | 
35 | #---------------------------------------------------------------------------------
36 | # everything is automatic from here on
37 | #---------------------------------------------------------------------------------
38 | CFLAGS += $(INCLUDE) $(foreach pkg,$(PACKAGES),`pkg-config --cflags $(pkg)`)
39 | LIBS += $(foreach pkg,$(PACKAGES),`pkg-config --libs $(pkg)`)
40 | CXXFLAGS += $(CFLAGS)
41 | LDFLAGS += $(CFLAGS)
42 | 
43 | export AS := $(CROSS)as$(VERSION)
44 | export CC := $(CROSS)gcc$(VERSION)
45 | export CXX := $(CROSS)g++$(VERSION)
46 | 
47 | #---------------------------------------------------------------------------------
48 | %.o: %.cpp
49 | 	@echo $(notdir $<)
50 | 	$(CXX) -MMD -MP -MF $(DEPSDIR)/$*.d $(CXXFLAGS) -c $< -o $@
51 | 
52 | #---------------------------------------------------------------------------------
53 | %.o: %.c
54 | 	@echo $(notdir $<)
55 | 	$(CC) -MMD -MP -MF $(DEPSDIR)/$*.d $(CFLAGS) -c $< -o $@
56 | 
57 | #---------------------------------------------------------------------------------
58 | %.o: %.s
59 | 	@echo $(notdir $<)
60 | 	$(AS) --MD $(DEPSDIR)/$*.d $(ASFLAGS) $< -o$@
61 | 
62 | #---------------------------------------------------------------------------------
63 | ifneq ($(BUILD),$(notdir $(CURDIR)))
64 | #---------------------------------------------------------------------------------
65 | 
66 | export OUTPUT := $(CURDIR)/$(TARGET)$(EXT)
67 | export VPATH := $(foreach dir,$(SOURCES),$(CURDIR)/$(dir)) \
68 | 	$(foreach dir,$(DATA),$(CURDIR)/$(dir))
69 | export DEPSDIR := $(CURDIR)/$(BUILD)
70 | 
71 | ASFILES := $(foreach dir,$(SOURCES),$(notdir $(wildcard $(dir)/*.s)))
72 | CFILES := $(foreach dir,$(SOURCES),$(notdir $(wildcard $(dir)/*.c)))
73 | CPPFILES := $(foreach dir,$(SOURCES),$(notdir $(wildcard $(dir)/*.cpp)))
74 | 
75 | #---------------------------------------------------------------------------------
76 | # use CXX for linking C++ projects, CC for standard C
77 | #---------------------------------------------------------------------------------
78 | ifeq ($(strip $(CPPFILES)),)
79 | export LD := $(CC)
80 | else
81 | export LD := $(CXX)
82 | endif
83 | #---------------------------------------------------------------------------------
84 | 
85 | export OFILES := $(CPPFILES:.cpp=.o) $(CFILES:.c=.o)
86 | export INCLUDE := $(foreach dir,$(INCLUDES),-I$(CURDIR)/$(dir)) \
87 | 	$(foreach dir,$(LIBDIRS),-I$(dir)/include) \
88 | 	-I$(CURDIR)/$(BUILD)
89 | export LIBPATHS := $(foreach dir,$(LIBDIRS),-L$(dir)/lib)
90 | 
91 | .PHONY: $(BUILD) clean all Makefile
92 | 
93 | #---------------------------------------------------------------------------------
94 | all: $(BUILD)
95 | 
96 | $(BUILD):
97 | 	@[ -d $@ ] || mkdir -p $@
98 | 	@$(MAKE) --no-print-directory -C $(BUILD) -f $(CURDIR)/Makefile
99 | 
100 | #---------------------------------------------------------------------------------
101 | clean:
102 | 	@echo clean ...
103 | 	$(RM) -rf $(BUILD) $(OUTPUT)
104 | 
105 | else
106 | 
107 | DEPENDS := $(OFILES:.o=.d)
108 | 
109 | #---------------------------------------------------------------------------------
110 | # main target
111 | #---------------------------------------------------------------------------------
112 | $(OUTPUT) : $(OFILES)
113 | 	@echo linking...
114 | 	@$(LD) $(LDFLAGS) $(OFILES) $(LIBPATHS) $(LIBS) -o $@
115 | 	@$(CROSS)strip $@
116 | 	@echo built $(notdir $@)
117 | 
118 | -include $(DEPENDS)
119 | 
120 | endif
121 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # glsl-sse2
2 | 
3 | glsl-sse2 is a header-only abstraction library aimed at providing the comfort
4 | of the GLSL programming language with the efficiency of SSE2. In short, it is
5 | an optimized SIMD vector library that behaves like the GLSL shading language.
6 | 
7 | ## Why?
8 | 
9 | SSE2 is a very powerful extension to the ia32 and amd64 instruction sets that
10 | is often overlooked by programmers who are seeking a performance boost for
11 | their applications.
12 | 
13 | Programmers who are aware of SSE either lack the knowledge and know-how to
14 | fully utilize it, or are forced to write unportable, unmaintainable assembly
15 | code, or to use SSE intrinsics. The quality of intrinsic-based code depends
16 | heavily on the compiler, and very often the result is code that its scalar
17 | equivalent will out-perform (for more information on the subject,
18 | see http://www.liranuna.com/?p=984).
19 | 
20 | glsl-sse2 takes care of all the nasty stuff while providing the familiar API of
21 | GLSL - no ugly assembly, no cache misses due to bad compiler output, and no need
22 | to resort to unmaintainable or unportable code.
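
To give a feel for the API, here is a minimal usage sketch. This is hypothetical
code rather than part of the bundled tests, and it assumes `vec4` mirrors the
fill constructor and component-wise operators shown for `dvec2`/`dvec4` in the
headers below; the member swizzles are the same ones used in the `cross()`
example in the next section:

    #include "vec4.h"

    // Hypothetical sketch - constructor and operator forms are assumed to
    // mirror the dvec2/dvec4 headers; swizzles are plain members.
    vec4 example(const vec4 &a, const vec4 &b)
    {
        vec4 half(0.5f);                // fill constructor
        return a.wzyx * b.xyzw + half;  // swizzle arithmetic, GLSL style
    }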
23 | 
24 | ## Example code and output
25 | 
26 | glsl-sse2 is geared toward performance, and does its best to signal to the
27 | compiler how to best utilize the situation at hand.
28 | Take this example of a cross product between two `vec4`s (the code assumes the
29 | `w` component is meaningless):
30 | 
31 |     vec4 cross(const vec4 &a, const vec4 &b)
32 |     {
33 |         return a.yzxw * b.zxyw - a.zxyw * b.yzxw;
34 |     }
35 | 
36 | With glsl-sse2 and GCC 4.4, it compiles to the following assembly:
37 | 
38 |     _Z5crossRK4vec4S1_:
39 |         movaps  (%rsi), %xmm1
40 |         movaps  (%rdx), %xmm2
41 |         pshufd  $201, %xmm1, %xmm5
42 |         pshufd  $210, %xmm2, %xmm0
43 |         pshufd  $210, %xmm1, %xmm4
44 |         pshufd  $201, %xmm2, %xmm3
45 |         mulps   %xmm0, %xmm5
46 |         mulps   %xmm3, %xmm4
47 |         subps   %xmm4, %xmm5
48 |         movaps  %xmm5, (%rdi)
49 |         ret
50 | 
51 | Minimal instructions and compiler hints help GCC output hand-written-quality
52 | assembly, complete with instruction pairing and little overhead.
53 | 
54 | ## Getting maximum performance out of glsl-sse2
55 | 
56 | glsl-sse2 is already written in a way that tries to make the compiler output
57 | the best possible code. However, some compilers are unable to produce the best
58 | possible code - compilers that ignore instruction pairing hints and parallel
59 | operations. It is best to avoid those compilers if you are seeking high
60 | performance.
61 | The most notable compiler that produces poor code is MSVC 2008 and below.
62 | 
63 | Another factor that will affect code performance is the architecture. glsl-sse2
64 | will be most effective in a 64-bit environment, where there are double the SSE
65 | registers to operate on, causing less register pressure, as well as a guarantee
66 | of SSE2 being present - without the need to check CPUID (although most, if not
67 | all, CPUs today support SSE2, which was released in 2001, a decade ago!).
68 | 
69 | ## Status
70 | 
71 | Currently the project is in a stable state; however, it has not been subjected
72 | to real-world (ab)use yet. You are welcome to try it at your own risk, and to
73 | file bug reports if you find any problems.
74 | 
75 | ## What about `vec3`? Where is `mat2`?
76 | 
77 | Because of alignment issues, only types that map directly to SSE2 registers
78 | will be implemented. Types such as `vec3` and `mat3`, and types that depend on
79 | them, will not be implemented.
80 | 
81 | If you need an unsupported type, simply use a bigger one - you will still get
82 | the best out of the vectorization power of SSE (there are, of course, several
83 | exceptions, such as `mat2` and `vec2`, depending on the operations performed).
84 | 
85 | ## Tested compilers
86 | 
87 | * GNU C Compiler 4.x and above
88 | * Microsoft Visual C++ 2008 and above
89 | * Intel C++ Compiler 10.0 and above
90 | * clang 2.8
91 | * llvm-gcc using DragonEgg 2.8
92 | 
93 | ## Recommended compilers
94 | 
95 | Since not all compilers are equal and they behave differently, glsl-sse2 does
96 | its best to make most compilers output similar code. However, some compilers
97 | take hints better than others. The following compilers are best suited to
98 | extracting the most performance out of glsl-sse2, ordered from best to worst:
99 | 
100 | * Intel C++ Compiler 12.0+
101 | * GNU C Compiler 4.4+
102 | * Microsoft Visual C++ 2010+
103 | * clang 2.8 / DragonEgg
104 | 
105 | The use of LLVM-based compilers (clang, DragonEgg) is highly discouraged, as
106 | LLVM does not output code that makes good use of instruction pairing, which can
107 | result in a flat 100% speed increase when used correctly (as ICC and GCC do).
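
One practical note on the CPUID remark above: on 64-bit targets SSE2 is
architecturally guaranteed, but a 32-bit build may still want a one-time runtime
check before using the library. Below is a minimal sketch of such a check; this
helper is not part of glsl-sse2, and it relies on the `<cpuid.h>` header shipped
with GCC and clang:

    #include <cpuid.h>

    // Hypothetical helper, not part of glsl-sse2: returns true when the CPU
    // reports SSE2 support (CPUID leaf 1, EDX bit 26). On x86-64 this is
    // always the case, matching the note above.
    static bool cpu_has_sse2()
    {
        unsigned int eax, ebx, ecx, edx;
        if (!__get_cpuid(1, &eax, &ebx, &ecx, &edx))
            return false;
        return (edx & (1u << 26)) != 0;
    }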
108 | 109 | ## TODO 110 | - Downswizzle `dvec4` => `dvec2` (Circular dependency issues) 111 | - Refactor swizzling, it's a mess (`new_swizzle` branch only works on GCC 4.4) 112 | - Conversion functions between vectors 113 | - Missing classes: `dmat2x4`, `dmat4x2` 114 | - Better tests 115 | - Namespacing 116 | - Complete rewrite of `bvec4` 117 | - swizzling of booleans 118 | - `bvec4` generators for all vector classes 119 | - Exponential functions for `vec4`, `dvec4` 120 | - Trigonometric functions for `vec4`, `dvec4` 121 | - Division for `ivec4`, `uvec4` 122 | -------------------------------------------------------------------------------- /source/bvec4.h: -------------------------------------------------------------------------------- 1 | #ifndef __BVEC4_H__ 2 | #define __BVEC4_H__ 3 | 4 | #include "vec4.h" 5 | 6 | typedef union bvec4 7 | { 8 | public: 9 | bvec4(bool _x, bool _y, bool _z, bool _w): 10 | x(_x), y(_y), z(_z), w(_w) { 11 | // Empty 12 | } 13 | 14 | // ----------------------------------------------------------------- // 15 | /* 16 | // not is a reserved word in C++03. 17 | friend inline const bvec4 not(const vec4 &v0, const vec4 &v1) { 18 | return bvec4(0xF - all); 19 | } 20 | */ 21 | friend inline const bvec4 equal(const vec4 &v0, const vec4 &v1); 22 | 23 | friend inline const bvec4 notEqual(const vec4 &v0, const vec4 &v1); 24 | 25 | friend inline const bvec4 greaterThan(const vec4 &v0, const vec4 &v1); 26 | 27 | friend inline const bvec4 greaterThanEqual(const vec4 &v0, const vec4 &v1); 28 | 29 | friend inline const bvec4 lessThan(const vec4 &v0, const vec4 &v1); 30 | 31 | friend inline const bvec4 lessThanEqual(const vec4 &v0, const vec4 &v1); 32 | 33 | // ----------------------------------------------------------------- // 34 | 35 | friend inline const bvec4 isnan(const vec4 &v); 36 | 37 | friend inline const bvec4 isinf(const vec4 &v); 38 | 39 | // ----------------------------------------------------------------- // 40 | 41 | friend inline bool any(const bvec4 &b); 42 | 43 | friend inline bool all(const bvec4 &b); 44 | 45 | // ----------------------------------------------------------------- // 46 | 47 | friend inline bool operator == (const bvec4 &b0, const bvec4 &b1) { 48 | return (b0.all & 0xF) == (b1.all & 0xF); 49 | } 50 | 51 | friend inline bool operator != (const bvec4 &b0, const bvec4 &b1) { 52 | return (b0.all & 0xF) != (b1.all & 0xF); 53 | } 54 | 55 | // ----------------------------------------------------------------- // 56 | 57 | // Vertex / Vector 58 | struct { 59 | bool x :1; 60 | bool y :1; 61 | bool z :1; 62 | bool w :1; 63 | }; 64 | // Color 65 | struct { 66 | bool r :1; 67 | bool g :1; 68 | bool b :1; 69 | bool a :1; 70 | }; 71 | // Texture coordinates 72 | struct { 73 | bool s :1; 74 | bool t :1; 75 | bool p :1; 76 | bool q :1; 77 | }; 78 | 79 | private: 80 | 81 | // Mask created from _mm_movemask_ps 82 | explicit bvec4(int mask) { 83 | all = mask; 84 | }; 85 | 86 | unsigned char all; 87 | } bvec4; 88 | 89 | inline const bvec4 equal(const vec4 &v0, const vec4 &v1) { 90 | return bvec4(_mm_movemask_ps(_mm_cmpeq_ps(v0.m, v1.m))); 91 | } 92 | 93 | inline const bvec4 notEqual(const vec4 &v0, const vec4 &v1) { 94 | return bvec4(_mm_movemask_ps(_mm_cmpneq_ps(v0.m, v1.m))); 95 | } 96 | 97 | inline const bvec4 greaterThan(const vec4 &v0, const vec4 &v1) { 98 | return bvec4(_mm_movemask_ps(_mm_cmpgt_ps(v0.m, v1.m))); 99 | } 100 | 101 | inline const bvec4 greaterThanEqual(const vec4 &v0, const vec4 &v1) { 102 | return bvec4(_mm_movemask_ps(_mm_cmpge_ps(v0.m, 
v1.m))); 103 | } 104 | 105 | inline const bvec4 lessThan(const vec4 &v0, const vec4 &v1) { 106 | return bvec4(_mm_movemask_ps(_mm_cmplt_ps(v0.m, v1.m))); 107 | } 108 | 109 | inline const bvec4 lessThanEqual(const vec4 &v0, const vec4 &v1) { 110 | return bvec4(_mm_movemask_ps(_mm_cmple_ps(v0.m, v1.m))); 111 | } 112 | 113 | // ------------------------------------------------------------------------- // 114 | 115 | inline const bvec4 isnan(const vec4 &v) { 116 | return bvec4(_mm_movemask_ps(_mm_cmpunord_ps(v.m, v.m))); 117 | } 118 | 119 | inline const bvec4 isinf(const vec4 &v) { 120 | return bvec4(_mm_movemask_ps(_mm_cmpeq_ps( 121 | _mm_andnot_ps(_mm_set1_ps(-0.f), v.m), 122 | _mm_castsi128_ps(_mm_set1_epi32(0x7F800000))))); 123 | } 124 | 125 | // ------------------------------------------------------------------------- // 126 | 127 | inline bool any(const bvec4 &b) { 128 | return (b.all & 0xF) != 0x0; 129 | } 130 | 131 | inline bool all(const bvec4 &b) { 132 | return (b.all & 0xF) == 0xF; 133 | } 134 | 135 | #endif 136 | -------------------------------------------------------------------------------- /source/dmat2.h: -------------------------------------------------------------------------------- 1 | #include "dvec2.h" 2 | 3 | #ifndef __DMAT2_H__ 4 | #define __DMAT2_H__ 5 | 6 | class dmat2 7 | { 8 | public: 9 | // Identity matrix 10 | inline dmat2() { 11 | m1 = _mm_setr_pd(1.0, 0.0); 12 | m2 = _mm_setr_pd(0.0, 0.0); 13 | } 14 | 15 | // Scaled matrix 16 | explicit inline dmat2(double d) { 17 | m1 = _mm_setr_pd( d, 0.0); 18 | m2 = _mm_setr_pd(0.0, d); 19 | } 20 | 21 | // 4 vectors constructor 22 | inline dmat2(const dvec2 &_v1, const dvec2 &_v2) { 23 | m1 = _v1.m; 24 | m2 = _v2.m; 25 | } 26 | 27 | // Full scalar constructor 28 | inline dmat2(double _d1, double _d2, double _d3, double _d4) { 29 | m1 = _mm_setr_pd( _d1, _d2); 30 | m2 = _mm_setr_pd( _d3, _d4); 31 | } 32 | 33 | // Copy constructor 34 | inline dmat2(const dmat2 &m) { 35 | m1 = m.m1; 36 | m2 = m.m2; 37 | } 38 | 39 | // ----------------------------------------------------------------- // 40 | 41 | inline void* operator new(size_t size) throw() { 42 | return _mm_malloc(size, 16); 43 | } 44 | 45 | inline void operator delete(void* ptr) { 46 | _mm_free(ptr); 47 | } 48 | 49 | // ----------------------------------------------------------------- // 50 | 51 | // Write direct access operator 52 | inline dvec2& operator[](int index) { 53 | return reinterpret_cast(m[index]); 54 | } 55 | 56 | // Read direct access operator 57 | inline const dvec2& operator[](int index) const { 58 | return reinterpret_cast(m[index]); 59 | } 60 | 61 | // Cast operator 62 | inline operator double*() { 63 | return reinterpret_cast(this); 64 | } 65 | 66 | // Const cast operator 67 | inline operator const double*() const { 68 | return reinterpret_cast(this); 69 | } 70 | 71 | // ----------------------------------------------------------------- // 72 | 73 | inline dmat2& operator += (double d) { 74 | __m128d dd = _mm_set1_pd(d); 75 | m1 = _mm_add_pd(m1, dd); 76 | m2 = _mm_add_pd(m2, dd); 77 | 78 | return *this; 79 | } 80 | 81 | inline dmat2& operator += (const dmat2 &m) { 82 | m1 = _mm_add_pd(m1, m.m1); 83 | m2 = _mm_add_pd(m2, m.m2); 84 | 85 | return *this; 86 | } 87 | 88 | inline dmat2& operator -= (double d) { 89 | __m128d dd = _mm_set1_pd(d); 90 | m1 = _mm_sub_pd(m1, dd); 91 | m2 = _mm_sub_pd(m2, dd); 92 | 93 | return *this; 94 | } 95 | 96 | inline dmat2& operator -= (const dmat2 &m) { 97 | m1 = _mm_sub_pd(m1, m.m1); 98 | m2 = _mm_sub_pd(m2, m.m2); 99 | 100 | 
return *this; 101 | } 102 | 103 | inline dmat2& operator *= (double d) { 104 | __m128d dd = _mm_set1_pd(d); 105 | m1 = _mm_mul_pd(m1, dd); 106 | m2 = _mm_mul_pd(m2, dd); 107 | 108 | return *this; 109 | } 110 | 111 | inline dmat2& operator *= (const dmat2 &m) { 112 | m1 = _mm_add_pd(_mm_mul_pd(_mm_unpacklo_pd(m1, m1), m.m1), 113 | _mm_mul_pd(_mm_unpackhi_pd(m1, m1), m.m2)); 114 | m2 = _mm_add_pd(_mm_mul_pd(_mm_unpacklo_pd(m2, m2), m.m1), 115 | _mm_mul_pd(_mm_unpackhi_pd(m2, m2), m.m2)); 116 | 117 | return *this; 118 | } 119 | 120 | inline dmat2& operator /= (double d) { 121 | __m128d dd = _mm_set1_pd(d); 122 | m1 = _mm_div_pd(m1, dd); 123 | m2 = _mm_div_pd(m2, dd); 124 | 125 | return *this; 126 | } 127 | 128 | inline dmat2& operator /= (const dmat2 &m) { 129 | m1 = _mm_div_pd(m1, m.m1); 130 | m2 = _mm_div_pd(m2, m.m2); 131 | 132 | return *this; 133 | } 134 | 135 | // ----------------------------------------------------------------- // 136 | 137 | friend inline dmat2 operator + (const dmat2 &m, double d) { 138 | __m128d dd = _mm_set1_pd(d); 139 | return dmat2(_mm_add_pd(m.m1, dd), _mm_add_pd(m.m2, dd)); 140 | } 141 | 142 | friend inline dmat2 operator + (const dmat2 &m0, const dmat2 &m1) { 143 | return dmat2(_mm_add_pd(m0.m1, m1.m1), _mm_add_pd(m0.m2, m1.m2)); 144 | } 145 | 146 | friend inline dmat2 operator - (const dmat2 &m, double d) { 147 | __m128d dd = _mm_set1_pd(d); 148 | return dmat2(_mm_sub_pd(m.m1, dd), _mm_sub_pd(m.m2, dd)); 149 | } 150 | 151 | friend inline dmat2 operator - (double d, const dmat2 &m) { 152 | __m128d dd = _mm_set1_pd(d); 153 | return dmat2(_mm_sub_pd(dd, m.m1), _mm_sub_pd(dd, m.m2)); 154 | } 155 | 156 | friend inline dmat2 operator - (const dmat2 &m0, const dmat2 &m1) { 157 | return dmat2(_mm_sub_pd(m0.m1, m1.m1), _mm_sub_pd(m0.m2, m1.m2)); 158 | } 159 | 160 | friend inline dmat2 operator * (const dmat2 &m, double d) { 161 | __m128d dd = _mm_set1_pd(d); 162 | return dmat2(_mm_mul_pd(m.m1, dd), _mm_mul_pd(m.m2, dd)); 163 | } 164 | 165 | friend inline dvec2 operator * (const dmat2 &m, const dvec2 &v) { 166 | return _mm_add_pd(_mm_mul_pd(_mm_unpacklo_pd(v.m, v.m), m.m1), 167 | _mm_mul_pd(_mm_unpackhi_pd(v.m, v.m), m.m2)); 168 | } 169 | 170 | friend inline dvec2 operator * (const dvec2 &v, const dmat2 &m) { 171 | return _mm_add_pd(_mm_mul_pd(_mm_unpacklo_pd(v.m, v.m), 172 | _mm_unpacklo_pd(m.m1, m.m2)), 173 | _mm_mul_pd(_mm_unpackhi_pd(v.m, v.m), 174 | _mm_unpackhi_pd(m.m1, m.m2))); 175 | } 176 | 177 | friend inline dmat2 operator * (const dmat2 &m0, const dmat2 &m1) { 178 | return dmat2(_mm_add_pd(_mm_mul_pd(_mm_unpacklo_pd(m0.m1, m0.m1), m1.m1), 179 | _mm_mul_pd(_mm_unpackhi_pd(m0.m1, m0.m1), m1.m2)), 180 | _mm_add_pd(_mm_mul_pd(_mm_unpacklo_pd(m0.m2, m0.m2), m1.m1), 181 | _mm_mul_pd(_mm_unpackhi_pd(m0.m2, m0.m2), m1.m2))); 182 | } 183 | 184 | friend inline dmat2 operator / (const dmat2 &m, double d) { 185 | __m128d dd = _mm_set1_pd(d); 186 | return dmat2(_mm_div_pd(m.m1, dd), _mm_div_pd(m.m2, dd)); 187 | } 188 | 189 | friend inline dmat2 operator / (double d, const dmat2 &m) { 190 | __m128d dd = _mm_set1_pd(d); 191 | return dmat2(_mm_div_pd(dd, m.m1), _mm_div_pd(dd, m.m2)); 192 | } 193 | 194 | friend inline dmat2 operator / (const dmat2 &m0, const dmat2 &m1) { 195 | return dmat2(_mm_div_pd(m0.m1, m1.m1), _mm_div_pd(m0.m2, m1.m2)); 196 | } 197 | 198 | // ----------------------------------------------------------------- // 199 | 200 | friend inline dmat2 matrixCompMult(const dmat2 &m0, const dmat2 &m1) { 201 | return dmat2(_mm_mul_pd(m0.m1, m1.m1), _mm_mul_pd(m0.m2, 
m1.m2)); 202 | } 203 | 204 | // ----------------------------------------------------------------- // 205 | 206 | friend inline dmat2 transpose(const dmat2 &m) { 207 | return dmat2(_mm_unpackhi_pd(m.m1, m.m2), 208 | _mm_unpacklo_pd(m.m1, m.m2)); 209 | } 210 | 211 | friend inline double determinant(const dmat2 &m) { 212 | __m128d d = _mm_mul_pd(m.m1, _mm_shuffle_pd(m.m2, m.m2, 0x01)); 213 | return _mm_cvtsd_f64(_mm_sub_pd(d, _mm_shuffle_pd(d, d, 0x01))); 214 | } 215 | 216 | friend inline dmat2 inverse(const dmat2 &m) { 217 | __m128d d = _mm_mul_pd(m.m1, _mm_shuffle_pd(m.m2, m.m2, 0x01)); 218 | d = _mm_sub_pd(d, _mm_shuffle_pd(d, d, 0x01)); 219 | d = _mm_div_pd(_mm_set1_pd(1.0), _mm_unpacklo_pd(d, d)); 220 | return dmat2(_mm_mul_pd(_mm_xor_pd(_mm_unpackhi_pd(m.m2, m.m1), 221 | _mm_set_pd(-0.0, 0.0)), d), 222 | _mm_mul_pd(_mm_xor_pd(_mm_unpacklo_pd(m.m2, m.m1), 223 | _mm_set_pd( 0.0, -0.0)), d)); 224 | } 225 | 226 | // ----------------------------------------------------------------- // 227 | 228 | private: 229 | // SSE constructor 230 | inline dmat2(const __m128d &_m1, const __m128d &_m2) { 231 | m1 = _m1; 232 | m2 = _m2; 233 | } 234 | 235 | union { 236 | __m128d m[2]; 237 | struct { 238 | __m128d m1; 239 | __m128d m2; 240 | }; 241 | 242 | /* // This code is waiting for unrestricted unions feature in c++0x 243 | dvec2 v[2]; 244 | struct { 245 | dvec2 v1; 246 | dvec2 v2; 247 | }; 248 | */ 249 | }; 250 | }; 251 | 252 | #endif 253 | -------------------------------------------------------------------------------- /source/dmat4.h: -------------------------------------------------------------------------------- 1 | #include "dvec4.h" 2 | 3 | #ifndef __DMAT4_H__ 4 | #define __DMAT4_H__ 5 | 6 | class dmat4 7 | { 8 | public: 9 | // Identity matrix 10 | inline dmat4() { 11 | m11 = _mm_setr_pd(1.0, 0.0); 12 | m12 = _mm_setr_pd(0.0, 0.0); 13 | 14 | m21 = _mm_setr_pd(0.0, 1.0); 15 | m22 = _mm_setr_pd(0.0, 0.0); 16 | 17 | m31 = _mm_setr_pd(0.0, 0.0); 18 | m32 = _mm_setr_pd(1.0, 0.0); 19 | 20 | m41 = _mm_setr_pd(0.0, 0.0); 21 | m42 = _mm_setr_pd(0.0, 1.0); 22 | } 23 | 24 | // Scaled matrix 25 | explicit inline dmat4(double d) { 26 | m11 = _mm_setr_pd( d, 0.0); 27 | m12 = _mm_setr_pd(0.0, 0.0); 28 | 29 | m21 = _mm_setr_pd(0.0, d); 30 | m22 = _mm_setr_pd(0.0, 0.0); 31 | 32 | m31 = _mm_setr_pd(0.0, 0.0); 33 | m32 = _mm_setr_pd( d, 0.0); 34 | 35 | m41 = _mm_setr_pd(0.0, 0.0); 36 | m42 = _mm_setr_pd(0.0, d); 37 | } 38 | 39 | // 4 vectors constructor 40 | inline dmat4(const dvec4 &_v1, const dvec4 &_v2, 41 | const dvec4 &_v3, const dvec4 &_v4) { 42 | m11 = _v1.m1; 43 | m12 = _v1.m2; 44 | 45 | m21 = _v2.m1; 46 | m22 = _v2.m2; 47 | 48 | m31 = _v3.m1; 49 | m32 = _v3.m2; 50 | 51 | m41 = _v4.m1; 52 | m42 = _v4.m2; 53 | } 54 | 55 | // Full scalar constructor 56 | inline dmat4(double _d1, double _d2, double _d3, double _d4, 57 | double _d5, double _d6, double _d7, double _d8, 58 | double _d9, double _d10, double _d11, double _d12, 59 | double _d13, double _d14, double _d15, double _d16) { 60 | m11 = _mm_setr_pd( _d1, _d2); 61 | m12 = _mm_setr_pd( _d3, _d4); 62 | 63 | m21 = _mm_setr_pd( _d5, _d6); 64 | m22 = _mm_setr_pd( _d7, _d8); 65 | 66 | m31 = _mm_setr_pd( _d9, _d10); 67 | m32 = _mm_setr_pd(_d11, _d12); 68 | 69 | m41 = _mm_setr_pd(_d13, _d14); 70 | m42 = _mm_setr_pd(_d15, _d16); 71 | } 72 | 73 | // Copy constructor 74 | inline dmat4(const dmat4 &m) { 75 | m11 = m.m11; 76 | m12 = m.m12; 77 | 78 | m21 = m.m21; 79 | m22 = m.m22; 80 | 81 | m31 = m.m31; 82 | m32 = m.m32; 83 | 84 | m41 = m.m41; 85 | m42 = m.m42; 86 | } 87 
| 88 | // ----------------------------------------------------------------- // 89 | 90 | inline void* operator new(size_t size) throw() { 91 | return _mm_malloc(size, 16); 92 | } 93 | 94 | inline void operator delete(void* ptr) { 95 | _mm_free(ptr); 96 | } 97 | 98 | // ----------------------------------------------------------------- // 99 | 100 | // Write direct access operator 101 | inline dvec4& operator[](int index) { 102 | return reinterpret_cast(m[index]); 103 | } 104 | 105 | // Read direct access operator 106 | inline const dvec4& operator[](int index) const { 107 | return reinterpret_cast(m[index]); 108 | } 109 | 110 | // Cast operator 111 | inline operator double*() { 112 | return reinterpret_cast(this); 113 | } 114 | 115 | // Const cast operator 116 | inline operator const double*() const { 117 | return reinterpret_cast(this); 118 | } 119 | 120 | // ----------------------------------------------------------------- // 121 | 122 | inline dmat4& operator += (double d) { 123 | __m128d dd = _mm_set1_pd(d); 124 | m11 = _mm_add_pd(m11, dd); 125 | m12 = _mm_add_pd(m12, dd); 126 | 127 | m21 = _mm_add_pd(m21, dd); 128 | m22 = _mm_add_pd(m22, dd); 129 | 130 | m31 = _mm_add_pd(m31, dd); 131 | m32 = _mm_add_pd(m32, dd); 132 | 133 | m41 = _mm_add_pd(m41, dd); 134 | m42 = _mm_add_pd(m42, dd); 135 | 136 | return *this; 137 | } 138 | 139 | inline dmat4& operator += (const dmat4 &m) { 140 | m11 = _mm_add_pd(m11, m.m11); 141 | m12 = _mm_add_pd(m12, m.m12); 142 | 143 | m21 = _mm_add_pd(m21, m.m21); 144 | m22 = _mm_add_pd(m22, m.m22); 145 | 146 | m31 = _mm_add_pd(m31, m.m31); 147 | m32 = _mm_add_pd(m32, m.m32); 148 | 149 | m41 = _mm_add_pd(m41, m.m41); 150 | m42 = _mm_add_pd(m42, m.m42); 151 | 152 | return *this; 153 | } 154 | 155 | inline dmat4& operator -= (double d) { 156 | __m128d dd = _mm_set1_pd(d); 157 | m11 = _mm_sub_pd(m11, dd); 158 | m12 = _mm_sub_pd(m12, dd); 159 | 160 | m21 = _mm_sub_pd(m21, dd); 161 | m22 = _mm_sub_pd(m22, dd); 162 | 163 | m31 = _mm_sub_pd(m31, dd); 164 | m32 = _mm_sub_pd(m32, dd); 165 | 166 | m41 = _mm_sub_pd(m41, dd); 167 | m42 = _mm_sub_pd(m42, dd); 168 | 169 | return *this; 170 | } 171 | 172 | inline dmat4& operator -= (const dmat4 &m) { 173 | m11 = _mm_sub_pd(m11, m.m11); 174 | m12 = _mm_sub_pd(m12, m.m12); 175 | 176 | m21 = _mm_sub_pd(m21, m.m21); 177 | m22 = _mm_sub_pd(m22, m.m22); 178 | 179 | m31 = _mm_sub_pd(m31, m.m31); 180 | m32 = _mm_sub_pd(m32, m.m32); 181 | 182 | m41 = _mm_sub_pd(m41, m.m41); 183 | m42 = _mm_sub_pd(m42, m.m42); 184 | 185 | return *this; 186 | } 187 | 188 | inline dmat4& operator *= (double d) { 189 | __m128d dd = _mm_set1_pd(d); 190 | m11 = _mm_mul_pd(m11, dd); 191 | m12 = _mm_mul_pd(m12, dd); 192 | 193 | m21 = _mm_mul_pd(m21, dd); 194 | m22 = _mm_mul_pd(m22, dd); 195 | 196 | m31 = _mm_mul_pd(m31, dd); 197 | m32 = _mm_mul_pd(m32, dd); 198 | 199 | m41 = _mm_mul_pd(m41, dd); 200 | m42 = _mm_mul_pd(m42, dd); 201 | 202 | return *this; 203 | } 204 | 205 | inline dmat4& operator *= (const dmat4 &m) { 206 | __m128d xx1 = _mm_unpacklo_pd(m11, m11); 207 | __m128d yy1 = _mm_unpackhi_pd(m11, m11); 208 | __m128d zz1 = _mm_unpacklo_pd(m12, m12); 209 | __m128d ww1 = _mm_unpackhi_pd(m12, m12); 210 | __m128d xx2 = _mm_unpacklo_pd(m21, m21); 211 | __m128d yy2 = _mm_unpackhi_pd(m21, m21); 212 | __m128d zz2 = _mm_unpacklo_pd(m22, m22); 213 | __m128d ww2 = _mm_unpackhi_pd(m22, m22); 214 | __m128d xx3 = _mm_unpacklo_pd(m31, m31); 215 | __m128d yy3 = _mm_unpackhi_pd(m31, m31); 216 | __m128d zz3 = _mm_unpacklo_pd(m32, m32); 217 | __m128d ww3 = 
_mm_unpackhi_pd(m32, m32); 218 | __m128d xx4 = _mm_unpacklo_pd(m41, m41); 219 | __m128d yy4 = _mm_unpackhi_pd(m41, m41); 220 | __m128d zz4 = _mm_unpacklo_pd(m42, m42); 221 | __m128d ww4 = _mm_unpackhi_pd(m42, m42); 222 | m11 = _mm_add_pd(_mm_add_pd(_mm_mul_pd(m.m11, xx1), 223 | _mm_mul_pd(m.m21, yy1)), 224 | _mm_add_pd(_mm_mul_pd(m.m31, zz1), 225 | _mm_mul_pd(m.m41, ww1))); 226 | m12 = _mm_add_pd(_mm_add_pd(_mm_mul_pd(m.m12, xx1), 227 | _mm_mul_pd(m.m22, yy1)), 228 | _mm_add_pd(_mm_mul_pd(m.m32, zz1), 229 | _mm_mul_pd(m.m42, ww1))); 230 | m21 = _mm_add_pd(_mm_add_pd(_mm_mul_pd(m.m11, xx2), 231 | _mm_mul_pd(m.m21, yy2)), 232 | _mm_add_pd(_mm_mul_pd(m.m31, zz2), 233 | _mm_mul_pd(m.m41, ww2))); 234 | m22 = _mm_add_pd(_mm_add_pd(_mm_mul_pd(m.m12, xx2), 235 | _mm_mul_pd(m.m22, yy2)), 236 | _mm_add_pd(_mm_mul_pd(m.m32, zz2), 237 | _mm_mul_pd(m.m42, ww2))); 238 | m31 = _mm_add_pd(_mm_add_pd(_mm_mul_pd(m.m11, xx3), 239 | _mm_mul_pd(m.m21, yy3)), 240 | _mm_add_pd(_mm_mul_pd(m.m31, zz3), 241 | _mm_mul_pd(m.m41, ww3))); 242 | m32 = _mm_add_pd(_mm_add_pd(_mm_mul_pd(m.m12, xx3), 243 | _mm_mul_pd(m.m22, yy3)), 244 | _mm_add_pd(_mm_mul_pd(m.m32, zz3), 245 | _mm_mul_pd(m.m42, ww3))); 246 | m41 = _mm_add_pd(_mm_add_pd(_mm_mul_pd(m.m11, xx4), 247 | _mm_mul_pd(m.m21, yy4)), 248 | _mm_add_pd(_mm_mul_pd(m.m31, zz4), 249 | _mm_mul_pd(m.m41, ww4))); 250 | m42 = _mm_add_pd(_mm_add_pd(_mm_mul_pd(m.m12, xx4), 251 | _mm_mul_pd(m.m22, yy4)), 252 | _mm_add_pd(_mm_mul_pd(m.m32, zz4), 253 | _mm_mul_pd(m.m42, ww4))); 254 | 255 | return *this; 256 | } 257 | 258 | inline dmat4& operator /= (double d) { 259 | __m128d dd = _mm_set1_pd(d); 260 | m11 = _mm_div_pd(m11, dd); 261 | m12 = _mm_div_pd(m12, dd); 262 | 263 | m21 = _mm_div_pd(m21, dd); 264 | m22 = _mm_div_pd(m22, dd); 265 | 266 | m31 = _mm_div_pd(m31, dd); 267 | m32 = _mm_div_pd(m32, dd); 268 | 269 | m41 = _mm_div_pd(m41, dd); 270 | m42 = _mm_div_pd(m42, dd); 271 | 272 | return *this; 273 | } 274 | 275 | inline dmat4& operator /= (const dmat4 &m) { 276 | m11 = _mm_div_pd(m11, m.m11); 277 | m12 = _mm_div_pd(m12, m.m12); 278 | 279 | m21 = _mm_div_pd(m21, m.m21); 280 | m22 = _mm_div_pd(m22, m.m22); 281 | 282 | m31 = _mm_div_pd(m31, m.m31); 283 | m32 = _mm_div_pd(m32, m.m32); 284 | 285 | m41 = _mm_div_pd(m41, m.m41); 286 | m42 = _mm_div_pd(m42, m.m42); 287 | 288 | return *this; 289 | } 290 | 291 | // ----------------------------------------------------------------- // 292 | 293 | friend inline dmat4 operator + (const dmat4 &m, double d) { 294 | __m128d dd = _mm_set1_pd(d); 295 | return dmat4(_mm_add_pd(m.m11, dd), _mm_add_pd(m.m12, dd), 296 | _mm_add_pd(m.m21, dd), _mm_add_pd(m.m22, dd), 297 | _mm_add_pd(m.m31, dd), _mm_add_pd(m.m32, dd), 298 | _mm_add_pd(m.m41, dd), _mm_add_pd(m.m42, dd)); 299 | } 300 | 301 | friend inline dmat4 operator + (const dmat4 &m0, const dmat4 &m1) { 302 | return dmat4(_mm_add_pd(m0.m11, m1.m11), _mm_add_pd(m0.m12, m1.m12), 303 | _mm_add_pd(m0.m21, m1.m21), _mm_add_pd(m0.m22, m1.m22), 304 | _mm_add_pd(m0.m31, m1.m31), _mm_add_pd(m0.m32, m1.m32), 305 | _mm_add_pd(m0.m41, m1.m41), _mm_add_pd(m0.m42, m1.m42)); 306 | } 307 | 308 | friend inline dmat4 operator - (const dmat4 &m, double d) { 309 | __m128d dd = _mm_set1_pd(d); 310 | return dmat4(_mm_sub_pd(m.m11, dd), _mm_sub_pd(m.m12, dd), 311 | _mm_sub_pd(m.m21, dd), _mm_sub_pd(m.m22, dd), 312 | _mm_sub_pd(m.m31, dd), _mm_sub_pd(m.m32, dd), 313 | _mm_sub_pd(m.m41, dd), _mm_sub_pd(m.m42, dd)); 314 | } 315 | 316 | friend inline dmat4 operator - (double d, const dmat4 &m) { 317 | __m128d dd = 
_mm_set1_pd(d); 318 | return dmat4(_mm_sub_pd(dd, m.m11), _mm_sub_pd(dd, m.m12), 319 | _mm_sub_pd(dd, m.m21), _mm_sub_pd(dd, m.m22), 320 | _mm_sub_pd(dd, m.m31), _mm_sub_pd(dd, m.m32), 321 | _mm_sub_pd(dd, m.m41), _mm_sub_pd(dd, m.m42)); 322 | } 323 | 324 | friend inline dmat4 operator - (const dmat4 &m0, const dmat4 &m1) { 325 | return dmat4(_mm_sub_pd(m0.m11, m1.m11), _mm_sub_pd(m0.m12, m1.m12), 326 | _mm_sub_pd(m0.m21, m1.m21), _mm_sub_pd(m0.m22, m1.m22), 327 | _mm_sub_pd(m0.m31, m1.m31), _mm_sub_pd(m0.m32, m1.m32), 328 | _mm_sub_pd(m0.m41, m1.m41), _mm_sub_pd(m0.m42, m1.m42)); 329 | } 330 | 331 | friend inline dmat4 operator * (const dmat4 &m, double d) { 332 | __m128d dd = _mm_set1_pd(d); 333 | return dmat4(_mm_mul_pd(m.m11, dd), _mm_mul_pd(m.m12, dd), 334 | _mm_mul_pd(m.m21, dd), _mm_mul_pd(m.m22, dd), 335 | _mm_mul_pd(m.m31, dd), _mm_mul_pd(m.m32, dd), 336 | _mm_mul_pd(m.m41, dd), _mm_mul_pd(m.m42, dd)); 337 | } 338 | 339 | friend inline dvec4 operator * (const dmat4 &m, const dvec4 &v) { 340 | __m128d _xx = _mm_unpacklo_pd(v.m1, v.m1); 341 | __m128d _yy = _mm_unpackhi_pd(v.m1, v.m1); 342 | __m128d _zz = _mm_unpacklo_pd(v.m2, v.m2); 343 | __m128d _ww = _mm_unpackhi_pd(v.m2, v.m2); 344 | return dvec4(_mm_add_pd(_mm_add_pd(_mm_mul_pd(m.m11, _xx), 345 | _mm_mul_pd(m.m21, _yy)), 346 | _mm_add_pd(_mm_mul_pd(m.m31, _zz), 347 | _mm_mul_pd(m.m41, _ww))), 348 | _mm_add_pd(_mm_add_pd(_mm_mul_pd(m.m12, _xx), 349 | _mm_mul_pd(m.m22, _yy)), 350 | _mm_add_pd(_mm_mul_pd(m.m32, _zz), 351 | _mm_mul_pd(m.m42, _ww)))); 352 | } 353 | 354 | friend inline dvec4 operator * (const dvec4 &v, const dmat4 &m) { 355 | __m128d _xx = _mm_unpacklo_pd(v.m1, v.m1); 356 | __m128d _yy = _mm_unpackhi_pd(v.m1, v.m1); 357 | __m128d _zz = _mm_unpacklo_pd(v.m2, v.m2); 358 | __m128d _ww = _mm_unpackhi_pd(v.m2, v.m2); 359 | return dvec4(_mm_add_pd(_mm_add_pd(_mm_mul_pd(_mm_unpacklo_pd(m.m11, m.m21), _xx), 360 | _mm_mul_pd(_mm_unpackhi_pd(m.m11, m.m21), _yy)), 361 | _mm_add_pd(_mm_mul_pd(_mm_unpacklo_pd(m.m12, m.m22), _zz), 362 | _mm_mul_pd(_mm_unpackhi_pd(m.m12, m.m22), _ww))), 363 | _mm_add_pd(_mm_add_pd(_mm_mul_pd(_mm_unpacklo_pd(m.m31, m.m41), _xx), 364 | _mm_mul_pd(_mm_unpackhi_pd(m.m31, m.m41), _yy)), 365 | _mm_add_pd(_mm_mul_pd(_mm_unpacklo_pd(m.m32, m.m42), _zz), 366 | _mm_mul_pd(_mm_unpackhi_pd(m.m32, m.m42), _ww)))); 367 | } 368 | 369 | friend inline dmat4 operator * (const dmat4 &m0, const dmat4 &m1) { 370 | __m128d xx1 = _mm_unpacklo_pd(m0[0].m1, m0[0].m1); 371 | __m128d yy1 = _mm_unpackhi_pd(m0[0].m1, m0[0].m1); 372 | __m128d zz1 = _mm_unpacklo_pd(m0[0].m2, m0[0].m2); 373 | __m128d ww1 = _mm_unpackhi_pd(m0[0].m2, m0[0].m2); 374 | __m128d xx2 = _mm_unpacklo_pd(m0[1].m1, m0[1].m1); 375 | __m128d yy2 = _mm_unpackhi_pd(m0[1].m1, m0[1].m1); 376 | __m128d zz2 = _mm_unpacklo_pd(m0[1].m2, m0[1].m2); 377 | __m128d ww2 = _mm_unpackhi_pd(m0[1].m2, m0[1].m2); 378 | __m128d xx3 = _mm_unpacklo_pd(m0[2].m1, m0[2].m1); 379 | __m128d yy3 = _mm_unpackhi_pd(m0[2].m1, m0[2].m1); 380 | __m128d zz3 = _mm_unpacklo_pd(m0[2].m2, m0[2].m2); 381 | __m128d ww3 = _mm_unpackhi_pd(m0[2].m2, m0[2].m2); 382 | __m128d xx4 = _mm_unpacklo_pd(m0[3].m1, m0[3].m1); 383 | __m128d yy4 = _mm_unpackhi_pd(m0[3].m1, m0[3].m1); 384 | __m128d zz4 = _mm_unpacklo_pd(m0[3].m2, m0[3].m2); 385 | __m128d ww4 = _mm_unpackhi_pd(m0[3].m2, m0[3].m2); 386 | return dmat4(_mm_add_pd(_mm_add_pd(_mm_mul_pd(m1.m11, xx1), 387 | _mm_mul_pd(m1.m21, yy1)), 388 | _mm_add_pd(_mm_mul_pd(m1.m31, zz1), 389 | _mm_mul_pd(m1.m41, ww1))), 390 | _mm_add_pd(_mm_add_pd(_mm_mul_pd(m1.m12, 
xx1), 391 | _mm_mul_pd(m1.m22, yy1)), 392 | _mm_add_pd(_mm_mul_pd(m1.m32, zz1), 393 | _mm_mul_pd(m1.m42, ww1))), 394 | _mm_add_pd(_mm_add_pd(_mm_mul_pd(m1.m11, xx2), 395 | _mm_mul_pd(m1.m21, yy2)), 396 | _mm_add_pd(_mm_mul_pd(m1.m31, zz2), 397 | _mm_mul_pd(m1.m41, ww2))), 398 | _mm_add_pd(_mm_add_pd(_mm_mul_pd(m1.m12, xx2), 399 | _mm_mul_pd(m1.m22, yy2)), 400 | _mm_add_pd(_mm_mul_pd(m1.m32, zz2), 401 | _mm_mul_pd(m1.m42, ww2))), 402 | _mm_add_pd(_mm_add_pd(_mm_mul_pd(m1.m11, xx3), 403 | _mm_mul_pd(m1.m21, yy3)), 404 | _mm_add_pd(_mm_mul_pd(m1.m31, zz3), 405 | _mm_mul_pd(m1.m41, ww3))), 406 | _mm_add_pd(_mm_add_pd(_mm_mul_pd(m1.m12, xx3), 407 | _mm_mul_pd(m1.m22, yy3)), 408 | _mm_add_pd(_mm_mul_pd(m1.m32, zz3), 409 | _mm_mul_pd(m1.m42, ww3))), 410 | _mm_add_pd(_mm_add_pd(_mm_mul_pd(m1.m11, xx4), 411 | _mm_mul_pd(m1.m21, yy4)), 412 | _mm_add_pd(_mm_mul_pd(m1.m31, zz4), 413 | _mm_mul_pd(m1.m41, ww4))), 414 | _mm_add_pd(_mm_add_pd(_mm_mul_pd(m1.m12, xx4), 415 | _mm_mul_pd(m1.m22, yy4)), 416 | _mm_add_pd(_mm_mul_pd(m1.m32, zz4), 417 | _mm_mul_pd(m1.m42, ww4)))); 418 | } 419 | 420 | friend inline dmat4 operator / (const dmat4 &m, double d) { 421 | __m128d dd = _mm_set1_pd(d); 422 | return dmat4(_mm_div_pd(m.m11, dd), _mm_div_pd(m.m12, dd), 423 | _mm_div_pd(m.m21, dd), _mm_div_pd(m.m22, dd), 424 | _mm_div_pd(m.m31, dd), _mm_div_pd(m.m32, dd), 425 | _mm_div_pd(m.m41, dd), _mm_div_pd(m.m42, dd)); 426 | } 427 | 428 | friend inline dmat4 operator / (double d, const dmat4 &m) { 429 | __m128d dd = _mm_set1_pd(d); 430 | return dmat4(_mm_div_pd(dd, m.m11), _mm_div_pd(dd, m.m12), 431 | _mm_div_pd(dd, m.m21), _mm_div_pd(dd, m.m22), 432 | _mm_div_pd(dd, m.m31), _mm_div_pd(dd, m.m32), 433 | _mm_div_pd(dd, m.m41), _mm_div_pd(dd, m.m42)); 434 | } 435 | 436 | friend inline dmat4 operator / (const dmat4 &m0, const dmat4 &m1) { 437 | return dmat4(_mm_div_pd(m0.m11, m1.m11), _mm_div_pd(m0.m12, m1.m12), 438 | _mm_div_pd(m0.m21, m1.m21), _mm_div_pd(m0.m22, m1.m22), 439 | _mm_div_pd(m0.m31, m1.m31), _mm_div_pd(m0.m32, m1.m32), 440 | _mm_div_pd(m0.m41, m1.m41), _mm_div_pd(m0.m42, m1.m42)); 441 | } 442 | 443 | // ----------------------------------------------------------------- // 444 | 445 | friend inline dmat4 matrixCompMult(const dmat4 &m0, const dmat4 &m1) { 446 | return dmat4(_mm_mul_pd(m0.m11, m1.m11), _mm_mul_pd(m0.m12, m1.m12), 447 | _mm_mul_pd(m0.m21, m1.m21), _mm_mul_pd(m0.m22, m1.m22), 448 | _mm_mul_pd(m0.m31, m1.m31), _mm_mul_pd(m0.m32, m1.m32), 449 | _mm_mul_pd(m0.m41, m1.m41), _mm_mul_pd(m0.m42, m1.m42)); 450 | } 451 | 452 | // ----------------------------------------------------------------- // 453 | 454 | friend inline dmat4 transpose(const dmat4 &m) { 455 | return dmat4(_mm_unpacklo_pd(m.m11, m.m21), _mm_unpacklo_pd(m.m31, m.m41), 456 | _mm_unpackhi_pd(m.m11, m.m21), _mm_unpackhi_pd(m.m31, m.m41), 457 | _mm_unpacklo_pd(m.m12, m.m22), _mm_unpacklo_pd(m.m32, m.m42), 458 | _mm_unpackhi_pd(m.m12, m.m22), _mm_unpackhi_pd(m.m32, m.m42)); 459 | } 460 | 461 | friend inline double determinant(const dmat4 &m) { 462 | __m128d r1 = _mm_mul_pd(m.m11, _mm_shuffle_pd(m.m21, m.m21, 0x01)); 463 | __m128d r2 = _mm_mul_pd(m.m12, _mm_shuffle_pd(m.m22, m.m22, 0x01)); 464 | __m128d r3 = _mm_mul_pd(m.m31, _mm_shuffle_pd(m.m41, m.m41, 0x01)); 465 | __m128d r4 = _mm_mul_pd(m.m32, _mm_shuffle_pd(m.m42, m.m42, 0x01)); 466 | __m128d c1 = _mm_sub_pd(_mm_mul_pd(m.m31, _mm_unpackhi_pd(m.m42, m.m42)), 467 | _mm_mul_pd(m.m41, _mm_unpackhi_pd(m.m32, m.m32))); 468 | __m128d c2 = _mm_sub_pd(_mm_mul_pd(m.m41, _mm_unpacklo_pd(m.m32, 
m.m32)), 469 | _mm_mul_pd(m.m31, _mm_unpacklo_pd(m.m42, m.m42))); 470 | __m128d d = _mm_add_pd(_mm_mul_pd(_mm_sub_pd( 471 | _mm_mul_pd(m.m12, _mm_unpackhi_pd(m.m21, m.m21)), 472 | _mm_mul_pd(m.m22, _mm_unpackhi_pd(m.m11, m.m11))), 473 | _mm_unpacklo_pd(c1, c2)), 474 | _mm_mul_pd(_mm_sub_pd( 475 | _mm_mul_pd(m.m22, _mm_unpacklo_pd(m.m11, m.m11)), 476 | _mm_mul_pd(m.m12, _mm_unpacklo_pd(m.m21, m.m21))), 477 | _mm_unpackhi_pd(c1, c2))); 478 | r1 = _mm_sub_sd(r1, _mm_unpackhi_pd(r1, r1)); 479 | r2 = _mm_sub_sd(r2, _mm_unpackhi_pd(r2, r2)); 480 | r3 = _mm_sub_sd(r3, _mm_unpackhi_pd(r3, r3)); 481 | r4 = _mm_sub_sd(r4, _mm_unpackhi_pd(r4, r4)); 482 | return _mm_cvtsd_f64(_mm_sub_sd(_mm_add_sd(_mm_mul_sd(r1, r4), 483 | _mm_mul_sd(r2, r3)), 484 | _mm_add_sd(_mm_unpackhi_pd(d, d), d))); 485 | } 486 | 487 | friend inline dmat4 inverse(const dmat4 &m) { 488 | __m128d r1 = _mm_mul_pd(m.m11, _mm_shuffle_pd(m.m21, m.m21, 0x01)); 489 | __m128d r2 = _mm_mul_pd(m.m12, _mm_shuffle_pd(m.m22, m.m22, 0x01)); 490 | __m128d r3 = _mm_mul_pd(m.m31, _mm_shuffle_pd(m.m41, m.m41, 0x01)); 491 | __m128d r4 = _mm_mul_pd(m.m32, _mm_shuffle_pd(m.m42, m.m42, 0x01)); 492 | __m128d v11 = _mm_sub_pd(_mm_mul_pd(_mm_unpackhi_pd(m.m21, m.m21), m.m12), 493 | _mm_mul_pd(_mm_unpackhi_pd(m.m11, m.m11), m.m22)); 494 | __m128d v12 = _mm_sub_pd(_mm_mul_pd(_mm_unpacklo_pd(m.m11, m.m11), m.m22), 495 | _mm_mul_pd(_mm_unpacklo_pd(m.m21, m.m21), m.m12)); 496 | __m128d v21 = _mm_sub_pd(_mm_mul_pd(_mm_unpackhi_pd(m.m42, m.m42), m.m31), 497 | _mm_mul_pd(_mm_unpackhi_pd(m.m32, m.m32), m.m41)); 498 | __m128d v22 = _mm_sub_pd(_mm_mul_pd(_mm_unpacklo_pd(m.m32, m.m32), m.m41), 499 | _mm_mul_pd(_mm_unpacklo_pd(m.m42, m.m42), m.m31)); 500 | __m128d d = _mm_add_pd(_mm_mul_pd(v11, _mm_unpacklo_pd(v21, v22)), 501 | _mm_mul_pd(v12, _mm_unpackhi_pd(v21, v22))); 502 | r1 = _mm_sub_sd(r1, _mm_unpackhi_pd(r1, r1)); 503 | r2 = _mm_sub_sd(r2, _mm_unpackhi_pd(r2, r2)); 504 | r3 = _mm_sub_sd(r3, _mm_unpackhi_pd(r3, r3)); 505 | r4 = _mm_sub_sd(r4, _mm_unpackhi_pd(r4, r4)); 506 | d = _mm_add_sd(_mm_unpackhi_pd(d, d), d); 507 | d = _mm_div_sd(_mm_set_sd(1.0), 508 | _mm_sub_sd(_mm_add_sd(_mm_mul_sd(r1, r4), 509 | _mm_mul_sd(r2, r3)), d)); 510 | d = _mm_unpacklo_pd( d, d); 511 | r1 = _mm_unpacklo_pd(r1, r1); 512 | r2 = _mm_unpacklo_pd(r2, r2); 513 | r3 = _mm_unpacklo_pd(r3, r3); 514 | r4 = _mm_unpacklo_pd(r4, r4); 515 | __m128d i11 = _mm_sub_pd(_mm_mul_pd(m.m11, r4), _mm_add_pd( 516 | _mm_mul_pd(v21, _mm_unpacklo_pd(m.m12, m.m12)), 517 | _mm_mul_pd(v22, _mm_unpackhi_pd(m.m12, m.m12)))); 518 | __m128d i12 = _mm_sub_pd(_mm_mul_pd(m.m21, r4), _mm_add_pd( 519 | _mm_mul_pd(v21, _mm_unpacklo_pd(m.m22, m.m22)), 520 | _mm_mul_pd(v22, _mm_unpackhi_pd(m.m22, m.m22)))); 521 | __m128d i41 = _mm_sub_pd(_mm_mul_pd(m.m32, r1), _mm_add_pd( 522 | _mm_mul_pd(v11, _mm_unpacklo_pd(m.m31, m.m31)), 523 | _mm_mul_pd(v12, _mm_unpackhi_pd(m.m31, m.m31)))); 524 | __m128d i42 = _mm_sub_pd(_mm_mul_pd(m.m42, r1), _mm_add_pd( 525 | _mm_mul_pd(v11, _mm_unpacklo_pd(m.m41, m.m41)), 526 | _mm_mul_pd(v12, _mm_unpackhi_pd(m.m41, m.m41)))); 527 | __m128d i21 = _mm_sub_pd(_mm_mul_pd(m.m31, r2), _mm_sub_pd( 528 | _mm_mul_pd(_mm_shuffle_pd(v12, v11, 0x01), m.m32), 529 | _mm_mul_pd(_mm_shuffle_pd(m.m32, m.m32, 0x01), 530 | _mm_shuffle_pd(v12, v11, 0x02)))); 531 | __m128d i22 = _mm_sub_pd(_mm_mul_pd(m.m41, r2), _mm_sub_pd( 532 | _mm_mul_pd(_mm_shuffle_pd(v12, v11, 0x01), m.m42), 533 | _mm_mul_pd(_mm_shuffle_pd(m.m42, m.m42, 0x01), 534 | _mm_shuffle_pd(v12, v11, 0x02)))); 535 | __m128d i31 = 
_mm_sub_pd(_mm_mul_pd(m.m12, r3), _mm_sub_pd( 536 | _mm_mul_pd(_mm_shuffle_pd(v22, v21, 0x01), m.m11), 537 | _mm_mul_pd(_mm_shuffle_pd(m.m11, m.m11, 0x01), 538 | _mm_shuffle_pd(v22, v21, 0x02)))); 539 | __m128d i32 = _mm_sub_pd(_mm_mul_pd(m.m22, r3), _mm_sub_pd( 540 | _mm_mul_pd(_mm_shuffle_pd(v22, v21, 0x01), m.m21), 541 | _mm_mul_pd(_mm_shuffle_pd(m.m21, m.m21, 0x01), 542 | _mm_shuffle_pd(v22, v21, 0x02)))); 543 | __m128d d1 = _mm_xor_pd(d, _mm_setr_pd( 0.0, -0.0)); 544 | __m128d d2 = _mm_xor_pd(d, _mm_setr_pd(-0.0, 0.0)); 545 | return dmat4(_mm_mul_pd(_mm_unpackhi_pd(i12, i11), d1), 546 | _mm_mul_pd(_mm_unpackhi_pd(i22, i21), d1), 547 | _mm_mul_pd(_mm_unpacklo_pd(i12, i11), d2), 548 | _mm_mul_pd(_mm_unpacklo_pd(i22, i21), d2), 549 | _mm_mul_pd(_mm_unpackhi_pd(i32, i31), d1), 550 | _mm_mul_pd(_mm_unpackhi_pd(i42, i41), d1), 551 | _mm_mul_pd(_mm_unpacklo_pd(i32, i31), d2), 552 | _mm_mul_pd(_mm_unpacklo_pd(i42, i41), d2)); 553 | } 554 | 555 | // ----------------------------------------------------------------- // 556 | 557 | private: 558 | // SSE constructor 559 | inline dmat4(const __m128d &_m11, const __m128d &_m12, 560 | const __m128d &_m21, const __m128d &_m22, 561 | const __m128d &_m31, const __m128d &_m32, 562 | const __m128d &_m41, const __m128d &_m42) { 563 | m11 = _m11; 564 | m12 = _m12; 565 | 566 | m21 = _m21; 567 | m22 = _m22; 568 | 569 | m31 = _m31; 570 | m32 = _m32; 571 | 572 | m41 = _m41; 573 | m42 = _m42; 574 | } 575 | 576 | union { 577 | __m128d m[4][2]; 578 | struct { 579 | __m128d m11, m12; 580 | __m128d m21, m22; 581 | __m128d m31, m32; 582 | __m128d m41, m42; 583 | }; 584 | 585 | /* // This code is waiting for unrestricted unions feature in c++0x 586 | dvec4 v[4]; 587 | struct { 588 | dvec4 v1; 589 | dvec4 v2; 590 | dvec4 v3; 591 | dvec4 v4; 592 | }; 593 | */ 594 | }; 595 | }; 596 | 597 | #endif 598 | -------------------------------------------------------------------------------- /source/dvec2.h: -------------------------------------------------------------------------------- 1 | #ifndef __DVEC2_H__ 2 | #define __DVEC2_H__ 3 | 4 | #include 5 | 6 | #include "dvec4.h" 7 | 8 | class dvec2 9 | { 10 | private: 11 | // Merges mask `target` with `m` into one unified mask that does the same sequential shuffle 12 | template 13 | struct _mask_merger 14 | { 15 | enum 16 | { 17 | ROW0 = ((target >> (((m >> 0) & 1) << 1)) & 1) << 0, 18 | ROW1 = ((target >> (((m >> 1) & 1) << 1)) & 1) << 1, 19 | 20 | MASK = ROW0 | ROW1, 21 | }; 22 | 23 | private: 24 | _mask_merger(); 25 | }; 26 | 27 | // Since we are working in little endian land, this reverses the shuffle mask 28 | template 29 | struct _mask_reverser 30 | { 31 | enum 32 | { 33 | ROW0 = 0 << (((m >> 0) & 3) << 1), 34 | ROW1 = 1 << (((m >> 2) & 3) << 1), 35 | 36 | MASK = ROW0 | ROW1, 37 | }; 38 | 39 | private: 40 | _mask_reverser(); 41 | }; 42 | 43 | // Splits a mask to two low and high masks 44 | template 45 | struct _mask_splitter 46 | { 47 | enum 48 | { 49 | HI = ((mask >> 0) & 1) | ((mask >> 2) & 1) << 1, 50 | LO = ((mask >> 4) & 1) | ((mask >> 6) & 1) << 1, 51 | }; 52 | 53 | private: 54 | _mask_splitter(); 55 | }; 56 | 57 | // Swizzle helper (Read only) 58 | template 59 | struct _swzl_ro 60 | { 61 | friend class dvec2; 62 | 63 | public: 64 | inline operator const dvec2 () const { 65 | return _mm_shuffle_pd(v.m, v.m, mask); 66 | } 67 | 68 | inline double operator[](int index) const { 69 | return v[(mask >> (index << 1)) & 0x3]; 70 | } 71 | 72 | // Swizzle of the swizzle, read only const (2) 73 | template 74 | inline 
_swzl_ro<_mask_merger::MASK> shuffle2_ro2() const { 75 | typedef _mask_merger merged; 76 | return _swzl_ro(v); 77 | } 78 | 79 | // Swizzle of the swizzle, read/write const 80 | template 81 | inline _swzl_ro<_mask_merger::MASK> shuffle2_rw2() const { 82 | typedef _mask_merger merged; 83 | return _swzl_ro(v); 84 | } 85 | 86 | const double &x, &y; 87 | const double &r, &g; 88 | const double &s, &t; 89 | 90 | private: 91 | // This massive constructor maps a vector to references 92 | inline _swzl_ro(const dvec2 &v): 93 | x(v[(mask >> 0) & 0x1]), y(v[(mask >> 1) & 0x1]), 94 | r(v[(mask >> 0) & 0x1]), g(v[(mask >> 1) & 0x1]), 95 | s(v[(mask >> 0) & 0x1]), t(v[(mask >> 1) & 0x1]), 96 | 97 | v(v) { 98 | // Empty 99 | } 100 | 101 | // Reference to unswizzled self 102 | const dvec2 &v; 103 | }; 104 | 105 | // Swizzle helper (Read/Write) 106 | template 107 | struct _swzl_rw 108 | { 109 | friend class dvec2; 110 | 111 | public: 112 | inline operator const dvec2 () const { 113 | return _mm_shuffle_pd(v.m, v.m, mask); 114 | } 115 | 116 | inline double& operator[](int index) { 117 | return v[(mask >> (index << 1)) & 0x3]; 118 | } 119 | 120 | // Swizzle from dvec2 121 | inline dvec2& operator = (const dvec2 &r) { 122 | return v = _mm_shuffle_pd(r.m, r.m, _mask_reverser::MASK); 123 | } 124 | 125 | // Swizzle from same r/o mask (v1.xyzw = v2.xyzw) 126 | inline dvec2& operator = (const _swzl_ro &s) { 127 | return v = s.v; 128 | } 129 | 130 | // Swizzle from same mask (v1.xyzw = v2.xyzw) 131 | inline dvec2& operator = (const _swzl_rw &s) { 132 | return v = s.v; 133 | } 134 | 135 | // Swizzle mask => other_mask, r/o (v1.zwxy = v2.xyxy) 136 | template 137 | inline dvec2& operator = (const _swzl_ro &s) { 138 | typedef _mask_merger::MASK> merged; 139 | 140 | return v = _mm_shuffle_pd(s.v.m, s.v.m, merged::MASK); 141 | } 142 | 143 | // Swizzle mask => other_mask (v1.zwxy = v2.xyzw) 144 | template 145 | inline dvec2& operator = (const _swzl_rw &s) { 146 | typedef _mask_merger::MASK> merged; 147 | 148 | return v = _mm_shuffle_pd(s.v.m, s.v.m, merged::MASK); 149 | } 150 | 151 | // Swizzle of the swizzle, read only (v.xxxx.yyyy) (2) 152 | template 153 | inline _swzl_ro<_mask_merger::MASK> shuffle4_ro2() const { 154 | typedef _mask_merger merged; 155 | 156 | return _swzl_ro(v); 157 | } 158 | 159 | // Swizzle of the swizzle, read only (v.xxxx.yyyy) (4) 160 | template 161 | inline _swzl_ro<_mask_merger::MASK> shuffle2_ro2() const { 162 | typedef _mask_merger merged; 163 | 164 | return _swzl_ro(v); 165 | } 166 | 167 | // Swizzle of the swizzle, read/write (v1.zyxw.wzyx = ...) 
168 | template 169 | inline _swzl_rw<_mask_merger::MASK> shuffle2_rw2() { 170 | typedef _mask_merger merged; 171 | 172 | return _swzl_rw(v); 173 | } 174 | 175 | // ----------------------------------------------------------------- // 176 | 177 | inline dvec2& operator += (double s) { 178 | return v += s; 179 | } 180 | 181 | inline dvec2& operator += (const dvec2 &v0) { 182 | return v += v0.shuffle2_ro2(); 183 | } 184 | 185 | inline dvec2& operator -= (double s) { 186 | return v -= s; 187 | } 188 | 189 | inline dvec2& operator -= (const dvec2 &v0) { 190 | return v -= v0.shuffle2_ro2(); 191 | } 192 | 193 | inline dvec2& operator *= (double s) { 194 | return v *= s; 195 | } 196 | 197 | inline dvec2& operator *= (const dvec2 &v0) { 198 | return v *= v0.shuffle2_ro2(); 199 | } 200 | 201 | inline dvec2& operator /= (double s) { 202 | return v /= s; 203 | } 204 | 205 | inline dvec2& operator /= (const dvec2 &v0) { 206 | return v /= v0.shuffle2_ro2(); 207 | } 208 | 209 | // ----------------------------------------------------------------- // 210 | 211 | double &x, &y; 212 | double &r, &g; 213 | double &s, &t; 214 | 215 | private: 216 | // This massive contructor maps a vector to references 217 | inline _swzl_rw(dvec2 &v): 218 | x(v[(mask >> 0) & 0x1]), y(v[(mask >> 1) & 0x1]), 219 | r(v[(mask >> 0) & 0x1]), g(v[(mask >> 1) & 0x1]), 220 | s(v[(mask >> 0) & 0x1]), t(v[(mask >> 1) & 0x1]), 221 | 222 | v(v) { 223 | // Empty 224 | } 225 | 226 | // Reference to unswizzled self 227 | dvec2 &v; 228 | }; 229 | 230 | public: 231 | // Empty constructor 232 | inline dvec2() { 233 | m = _mm_setzero_pd(); 234 | } 235 | 236 | // Fill constructor 237 | explicit inline dvec2(double d) { 238 | m = _mm_set1_pd(d); 239 | } 240 | 241 | // 4 var init constructor 242 | inline dvec2(double _x, double _y) { 243 | m = _mm_setr_pd(_x, _y); 244 | } 245 | 246 | // Integer array constructor 247 | inline dvec2(const double* dv) { 248 | m = _mm_loadu_pd(dv); 249 | } 250 | 251 | // Copy constructor 252 | inline dvec2(const dvec2 &v) { 253 | m = v.m; 254 | } 255 | 256 | // SSE2 compatible constructor 257 | inline dvec2(const __m128d &_m) { 258 | m = _m; 259 | } 260 | 261 | // ----------------------------------------------------------------- // 262 | 263 | inline void* operator new(size_t size) throw() { 264 | return _mm_malloc(size, 16); 265 | } 266 | 267 | inline void operator delete(void* ptr) { 268 | _mm_free(ptr); 269 | } 270 | 271 | // ----------------------------------------------------------------- // 272 | 273 | // Write direct access operator 274 | inline double& operator[](int index) { 275 | return reinterpret_cast(this)[index]; 276 | } 277 | 278 | // Read direct access operator 279 | inline const double& operator[](int index) const { 280 | return reinterpret_cast(this)[index]; 281 | } 282 | 283 | // Cast operator 284 | inline operator double* () { 285 | return reinterpret_cast(this); 286 | } 287 | 288 | // Const cast operator 289 | inline operator const double* () const { 290 | return reinterpret_cast(this); 291 | } 292 | 293 | // ----------------------------------------------------------------- // 294 | 295 | // Read-write swizzle 296 | template 297 | inline _swzl_rw shuffle2_rw2() { 298 | return _swzl_rw(*this); 299 | } 300 | 301 | // Read-write swizzle, const, actually read only 302 | template 303 | inline _swzl_ro shuffle2_rw2() const { 304 | return _swzl_ro(*this); 305 | } 306 | 307 | // Read-only swizzle (2) 308 | template 309 | inline _swzl_ro shuffle2_ro2() const { 310 | return _swzl_ro(*this); 311 | } 312 | 313 
| // Read-only swizzle (4) 314 | template 315 | inline dvec4 shuffle4_ro2() const { 316 | return dvec4(_mm_shuffle_pd(m, m, _mask_splitter::HI), 317 | _mm_shuffle_pd(m, m, _mask_splitter::LO)); 318 | } 319 | 320 | // ----------------------------------------------------------------- // 321 | 322 | friend inline dvec2& operator += (dvec2 &v, double d) { 323 | v.m = _mm_add_pd(v.m, _mm_set1_pd(d)); 324 | return v; 325 | } 326 | 327 | friend inline dvec2& operator += (dvec2 &v0, const dvec2 &v1) { 328 | v0.m = _mm_add_pd(v0.m, v1.m); 329 | return v0; 330 | } 331 | 332 | friend inline dvec2& operator -= (dvec2 &v, double d) { 333 | v.m = _mm_sub_pd(v.m, _mm_set1_pd(d)); 334 | return v; 335 | } 336 | 337 | friend inline dvec2& operator -= (dvec2 &v0, const dvec2 &v1) { 338 | v0.m = _mm_sub_pd(v0.m, v1.m); 339 | return v0; 340 | } 341 | 342 | friend inline dvec2& operator *= (dvec2 &v, double d) { 343 | v.m = _mm_mul_pd(v.m, _mm_set1_pd(d)); 344 | return v; 345 | } 346 | 347 | friend inline dvec2& operator *= (dvec2 &v0, const dvec2 &v1) { 348 | v0.m = _mm_mul_pd(v0.m, v1.m); 349 | return v0; 350 | } 351 | 352 | friend inline dvec2& operator /= (dvec2 &v, double d) { 353 | v.m = _mm_div_pd(v.m, _mm_set1_pd(d)); 354 | return v; 355 | } 356 | 357 | friend inline dvec2& operator /= (dvec2 &v0, const dvec2 &v1) { 358 | v0.m = _mm_div_pd(v0.m, v1.m); 359 | return v0; 360 | } 361 | 362 | // ----------------------------------------------------------------- // 363 | 364 | friend inline const dvec2 operator + (double d, const dvec2 &v) { 365 | return _mm_add_pd(_mm_set1_pd(d), v.m); 366 | } 367 | 368 | friend inline const dvec2 operator + (const dvec2 &v, double d) { 369 | return _mm_add_pd(v.m, _mm_set1_pd(d)); 370 | } 371 | 372 | friend inline const dvec2 operator + (const dvec2 &v0, const dvec2 &v1) { 373 | return _mm_add_pd(v0.m, v1.m); 374 | } 375 | 376 | friend inline const dvec2 operator - (const dvec2 &v) { 377 | return _mm_xor_pd(v.m, _mm_set1_pd(-0.0)); 378 | } 379 | 380 | friend inline const dvec2 operator - (double d, const dvec2 &v) { 381 | return _mm_sub_pd(_mm_set1_pd(d), v.m); 382 | } 383 | 384 | friend inline const dvec2 operator - (const dvec2 &v, double d) { 385 | return _mm_sub_pd(v.m, _mm_set1_pd(d)); 386 | } 387 | 388 | friend inline const dvec2 operator - (const dvec2 &v0, const dvec2 &v1) { 389 | return _mm_sub_pd(v0.m, v1.m); 390 | } 391 | 392 | friend inline const dvec2 operator * (double d, const dvec2 &v) { 393 | return _mm_mul_pd(_mm_set1_pd(d), v.m); 394 | } 395 | 396 | friend inline const dvec2 operator * (const dvec2 &v, double d) { 397 | return _mm_mul_pd(v.m, _mm_set1_pd(d)); 398 | } 399 | 400 | friend inline const dvec2 operator * (const dvec2 &v0, const dvec2 &v1) { 401 | return _mm_mul_pd(v0.m, v1.m); 402 | } 403 | 404 | friend inline const dvec2 operator / (double d, const dvec2 &v) { 405 | return _mm_div_pd(_mm_set1_pd(d), v.m); 406 | } 407 | 408 | friend inline const dvec2 operator / (const dvec2 &v, double d) { 409 | return _mm_div_pd(v.m, _mm_set1_pd(d)); 410 | } 411 | 412 | friend inline const dvec2 operator / (const dvec2 &v0, const dvec2 &v1) { 413 | return _mm_div_pd(v0.m, v1.m); 414 | } 415 | 416 | // ----------------------------------------------------------------- // 417 | /* 418 | friend inline const dvec2 pow(const dvec2 &v0, const dvec2 &v1) { 419 | // TODO 420 | } 421 | 422 | friend inline const dvec2 exp(const dvec2 &v) { 423 | // TODO 424 | } 425 | */ 426 | friend inline const dvec2 log(const dvec2 &v) { 427 | return log2(v) * 0.69314718055994530942; 
428 | } 429 | /* 430 | friend inline const dvec2 exp2(const dvec2 &v) { 431 | // TODO 432 | } 433 | */ 434 | friend inline const dvec2 log2(const dvec2 &v) { 435 | __m128d o = _mm_set1_pd(1.0); 436 | __m128d c = _mm_castsi128_pd(_mm_set1_epi64x(0x7FF0000000000000LL)); 437 | __m128d f = _mm_sub_pd(_mm_or_pd(_mm_andnot_pd(c, v.m), 438 | _mm_and_pd(c, o)), o); 439 | __m128i a = _mm_sub_epi32(_mm_srli_epi32(_mm_castpd_si128(v.m), 20), 440 | _mm_set1_epi32(1023)); 441 | __m128d hi = _mm_add_pd(_mm_mul_pd(_mm_set1_pd( 3.61276447184348752E-05), f), 442 | _mm_set1_pd(-4.16662127033480827E-04)); 443 | __m128d lo = _mm_add_pd(_mm_mul_pd(_mm_set1_pd(-1.43988260692073185E-01), f), 444 | _mm_set1_pd( 1.60245637034704267E-01)); 445 | hi = _mm_add_pd(_mm_mul_pd(f, hi), _mm_set1_pd( 2.28193656337578229E-03)); 446 | lo = _mm_add_pd(_mm_mul_pd(f, lo), _mm_set1_pd(-1.80329036970820794E-01)); 447 | hi = _mm_add_pd(_mm_mul_pd(f, hi), _mm_set1_pd(-7.93793829370930689E-03)); 448 | lo = _mm_add_pd(_mm_mul_pd(f, lo), _mm_set1_pd( 2.06098446037376922E-01)); 449 | hi = _mm_add_pd(_mm_mul_pd(f, hi), _mm_set1_pd( 1.98461565426430164E-02)); 450 | lo = _mm_add_pd(_mm_mul_pd(f, lo), _mm_set1_pd(-2.40449108727688962E-01)); 451 | hi = _mm_add_pd(_mm_mul_pd(f, hi), _mm_set1_pd(-3.84093543662501949E-02)); 452 | lo = _mm_add_pd(_mm_mul_pd(f, lo), _mm_set1_pd( 2.88539004851839364E-01)); 453 | hi = _mm_add_pd(_mm_mul_pd(f, hi), _mm_set1_pd( 6.08335872067172597E-02)); 454 | lo = _mm_add_pd(_mm_mul_pd(f, lo), _mm_set1_pd(-3.60673760117245982E-01)); 455 | hi = _mm_add_pd(_mm_mul_pd(f, hi), _mm_set1_pd(-8.27937055456904317E-02)); 456 | lo = _mm_add_pd(_mm_mul_pd(f, lo), _mm_set1_pd( 4.80898346961226595E-01)); 457 | hi = _mm_add_pd(_mm_mul_pd(f, hi), _mm_set1_pd( 1.01392360727236079E-01)); 458 | lo = _mm_add_pd(_mm_mul_pd(f, lo), _mm_set1_pd(-7.21347520444469934E-01)); 459 | hi = _mm_add_pd(_mm_mul_pd(f, hi), _mm_set1_pd(-1.16530490533844182E-01)); 460 | lo = _mm_add_pd(_mm_mul_pd(f, lo), _mm_set1_pd( 0.44269504088896339E+00)); 461 | hi = _mm_add_pd(_mm_mul_pd(f, hi), _mm_set1_pd( 1.30009193360025350E-01)); 462 | __m128d x2 = _mm_mul_pd(f, f); 463 | __m128d x10 = _mm_mul_pd(x2, x2); 464 | x10 = _mm_mul_pd(x10, x10); 465 | x10 = _mm_mul_pd(x2, x10); 466 | return _mm_add_pd(_mm_add_pd(_mm_mul_pd( 467 | _mm_add_pd(_mm_mul_pd(x10, hi), lo), f), f), 468 | _mm_cvtepi32_pd(_mm_shuffle_epi32(a, 0x0D))); 469 | } 470 | 471 | friend inline const dvec2 sqrt(const dvec2 &v) { 472 | return _mm_sqrt_pd(v.m); 473 | } 474 | 475 | friend inline const dvec2 inversesqrt(const dvec2 &v) { 476 | return _mm_div_pd(_mm_set1_pd(1.0), _mm_sqrt_pd(v.m)); 477 | } 478 | 479 | // ----------------------------------------------------------------- // 480 | 481 | friend inline const dvec2 abs(const dvec2 &v) { 482 | return _mm_andnot_pd(_mm_set1_pd(-0.0), v.m); 483 | } 484 | 485 | friend inline const dvec2 ceil(const dvec2 &v) { 486 | return _mm_cvtepi32_pd(_mm_cvtpd_epi32(_mm_add_pd(v.m, _mm_set1_pd(0.5)))); 487 | } 488 | 489 | friend inline const dvec2 clamp(const dvec2 &v, double d1, double d2) { 490 | return _mm_max_pd(_mm_min_pd(v.m, _mm_set1_pd(d2)), 491 | _mm_set1_pd(d1)); 492 | } 493 | 494 | friend inline const dvec2 clamp(const dvec2 &v0, 495 | const dvec2 &v1, const dvec2 &v2) { 496 | return _mm_max_pd(_mm_min_pd(v0.m, v2.m), v1.m); 497 | } 498 | 499 | friend inline const dvec2 floor(const dvec2 &v) { 500 | return _mm_cvtepi32_pd(_mm_srai_epi32(_mm_cvtpd_epi32( 501 | _mm_sub_pd(_mm_add_pd(v.m, v.m), _mm_set1_pd(0.5))), 1)); 502 | } 503 | 504 | friend 
inline const dvec2 fract(const dvec2 &v) { 505 | return _mm_sub_pd(v.m, _mm_cvtepi32_pd(_mm_srai_epi32( 506 | _mm_cvtpd_epi32(_mm_sub_pd(_mm_add_pd(v.m, v.m), 507 | _mm_set1_pd(0.5))), 1))); 508 | } 509 | 510 | friend inline const dvec2 max(const dvec2 &v, double d) { 511 | return _mm_max_pd(v.m, _mm_set1_pd(d)); 512 | } 513 | 514 | friend inline const dvec2 max(const dvec2 &v0, const dvec2 &v1) { 515 | return _mm_max_pd(v0.m, v1.m); 516 | } 517 | 518 | friend inline const dvec2 min(const dvec2 &v, double d) { 519 | return _mm_min_pd(v.m, _mm_set1_pd(d)); 520 | } 521 | 522 | friend inline const dvec2 min(const dvec2 &v0, const dvec2 &v1) { 523 | return _mm_min_pd(v0.m, v1.m); 524 | } 525 | 526 | friend inline const dvec2 mix(const dvec2 &v0, const dvec2 &v1, 527 | double d) { 528 | __m128d dd = _mm_set1_pd(d); 529 | return _mm_add_pd(_mm_mul_pd(v0.m, _mm_sub_pd(_mm_set1_pd(1.0), dd)), 530 | _mm_mul_pd(v1.m, dd)); 531 | } 532 | 533 | friend inline const dvec2 mix(const dvec2 &v0, const dvec2 &v1, 534 | const dvec2 &v2) { 535 | return _mm_add_pd(_mm_mul_pd(v0.m, _mm_sub_pd(_mm_set1_pd(1.0), v2.m)), 536 | _mm_mul_pd(v1.m, v2.m)); 537 | } 538 | 539 | friend inline const dvec2 mod(const dvec2 &v, double d) { 540 | __m128d dd = _mm_set1_pd(d); 541 | __m128d d1 = _mm_div_pd(v.m, dd); 542 | return _mm_sub_pd(v.m, _mm_mul_pd(dd, _mm_cvtepi32_pd( 543 | _mm_srai_epi32(_mm_cvtpd_epi32(_mm_sub_pd( 544 | _mm_add_pd(d1, d1), _mm_set1_pd(0.5))), 1)))); 545 | } 546 | 547 | friend inline const dvec2 mod(const dvec2 &v0, const dvec2 &v1) { 548 | __m128d d1 = _mm_div_pd(v0.m, v1.m); 549 | return _mm_sub_pd(v0.m, _mm_mul_pd(v1.m, _mm_cvtepi32_pd( 550 | _mm_srai_epi32(_mm_cvtpd_epi32(_mm_sub_pd( 551 | _mm_add_pd(d1, d1), _mm_set1_pd(0.5))), 1)))); 552 | } 553 | 554 | friend inline const dvec2 modf(const dvec2 &v0, dvec2 &v1) { 555 | v1.m = _mm_or_pd(_mm_cvtepi32_pd(_mm_cvttpd_epi32(v0.m)), 556 | _mm_and_pd(_mm_set1_pd(-0.0), v0.m)); 557 | return _mm_sub_pd(v0.m, v1.m); 558 | } 559 | 560 | friend inline const dvec2 sign(const dvec2 &v) { 561 | return _mm_and_pd(_mm_or_pd(_mm_and_pd(v.m, _mm_set1_pd(-0.0)), _mm_set1_pd(1)), 562 | _mm_cmpneq_pd(v.m, _mm_setzero_pd())); 563 | } 564 | 565 | friend inline const dvec2 smoothstep(double d1, double d2, 566 | const dvec2 &v) { 567 | __m128d dd1 = _mm_set1_pd(d1); 568 | __m128d c = _mm_max_pd(_mm_min_pd(_mm_div_pd(_mm_sub_pd(v.m, dd1), 569 | _mm_sub_pd(_mm_set1_pd(d2), dd1)), 570 | _mm_set1_pd(1.0)), _mm_setzero_pd()); 571 | return _mm_mul_pd(_mm_mul_pd(c, c), 572 | _mm_sub_pd(_mm_set1_pd(3.0), _mm_add_pd(c, c))); 573 | } 574 | 575 | friend inline const dvec2 smoothstep(const dvec2 &v0, 576 | const dvec2 &v1, const dvec2 &v2) { 577 | __m128d c = _mm_max_pd(_mm_min_pd(_mm_div_pd(_mm_sub_pd(v2.m, v0.m), 578 | _mm_sub_pd(v1.m, v0.m)), _mm_set1_pd(1.0)), 579 | _mm_setzero_pd()); 580 | return _mm_mul_pd(_mm_mul_pd(c, c), 581 | _mm_sub_pd(_mm_set1_pd(3.0), _mm_add_pd(c, c))); 582 | } 583 | 584 | friend inline const dvec2 step(double d, const dvec2 &v) { 585 | return _mm_and_pd(_mm_cmple_pd(v.m, _mm_set1_pd(d)), 586 | _mm_set1_pd(1.0)); 587 | } 588 | 589 | friend inline const dvec2 step(const dvec2 &v0, const dvec2 &v1) { 590 | return _mm_and_pd(_mm_cmple_pd(v0.m, v1.m), _mm_set1_pd(1.0)); 591 | } 592 | 593 | friend inline const dvec2 trunc(const dvec2 &v) { 594 | return _mm_cvtepi32_pd(_mm_cvtpd_epi32(_mm_sub_pd(v.m, 595 | _mm_or_pd(_mm_and_pd(v.m, _mm_set1_pd(-0.0)), 596 | _mm_set1_pd(0.5))))); 597 | } 598 | 599 | // 
----------------------------------------------------------------- // 600 | 601 | friend inline double distance(const dvec2 &v0, const dvec2 &v1) { 602 | __m128d d = _mm_sub_pd(v0.m, v1.m); 603 | __m128d l = _mm_mul_pd(d, d); 604 | return _mm_cvtsd_f64(_mm_sqrt_pd(_mm_add_pd(l, _mm_shuffle_pd(l, l, 0x01)))); 605 | } 606 | 607 | friend inline double dot(const dvec2 &v0, const dvec2 &v1) { 608 | __m128d l = _mm_mul_pd(v0.m, v1.m); 609 | return _mm_cvtsd_f64(_mm_add_pd(l, _mm_shuffle_pd(l, l, 0x01))); 610 | } 611 | 612 | friend inline const dvec2 faceforward(const dvec2 &v0, 613 | const dvec2 &v1, const dvec2 &v2) { 614 | __m128d l = _mm_mul_pd(v2.m, v1.m); 615 | return _mm_xor_pd(_mm_and_pd(_mm_cmpnlt_pd( 616 | _mm_add_pd(l, _mm_shuffle_pd(l, l, 0x01)), 617 | _mm_setzero_pd()), _mm_set1_pd(-0.f)), v0.m); 618 | } 619 | 620 | friend inline double length(const dvec2 &v) { 621 | __m128d l = _mm_mul_pd(v.m, v.m); 622 | return _mm_cvtsd_f64(_mm_sqrt_pd(_mm_add_pd(l, _mm_shuffle_pd(l, l, 0x01)))); 623 | } 624 | 625 | friend inline const dvec2 normalize(const dvec2 &v) { 626 | __m128d l = _mm_mul_pd(v.m, v.m); 627 | return _mm_div_pd(v.m, _mm_sqrt_pd(_mm_add_pd(l, _mm_shuffle_pd(l, l, 0x01)))); 628 | } 629 | 630 | friend inline const dvec2 reflect(const dvec2 &v0, const dvec2 &v1) { 631 | __m128d l = _mm_mul_pd(v0.m, v1.m); 632 | __m128d d = _mm_add_pd(l, _mm_shuffle_pd(l, l, 0x01)); 633 | return _mm_sub_pd(v0.m, _mm_mul_pd(_mm_add_pd(d, d), v1.m)); 634 | 635 | } 636 | 637 | friend inline const dvec2 refract(const dvec2 &v0, const dvec2 &v1, 638 | double d) { 639 | __m128d o = _mm_set1_pd(1.0); 640 | __m128d e = _mm_set1_pd(d); 641 | __m128d l = _mm_mul_pd(v0.m, v1.m); 642 | __m128d dd = _mm_add_pd(l, _mm_shuffle_pd(l, l, 0x01)); 643 | __m128d k = _mm_sub_pd(o, _mm_mul_pd(_mm_mul_pd(e, e), 644 | _mm_sub_pd(o, _mm_mul_pd(dd, dd)))); 645 | return _mm_and_pd(_mm_cmpnlt_pd(k, _mm_setzero_pd()), 646 | _mm_mul_pd(_mm_mul_pd(e, _mm_sub_pd(v0.m, 647 | _mm_mul_pd(_mm_mul_pd(e, dd), _mm_sqrt_pd(k)))), v1.m)); 648 | } 649 | 650 | // ----------------------------------------------------------------- // 651 | 652 | friend inline bool operator == (const dvec2 &v0, const dvec2 &v1) { 653 | return _mm_movemask_pd(_mm_cmpeq_pd(v0.m, v1.m)) == 0x03; 654 | } 655 | 656 | friend inline bool operator != (const dvec2 &v0, const dvec2 &v1) { 657 | return _mm_movemask_pd(_mm_cmpneq_pd(v0.m, v1.m)) != 0x00; 658 | } 659 | 660 | // ----------------------------------------------------------------- // 661 | 662 | union { 663 | // Vertex / Vector 664 | struct { 665 | double x, y; 666 | }; 667 | // Color 668 | struct { 669 | double r, g; 670 | }; 671 | // Texture coordinates 672 | struct { 673 | double s, t; 674 | }; 675 | 676 | // SSE2 registers 677 | __m128d m; 678 | }; 679 | }; 680 | 681 | #include "swizzle2.h" 682 | #include "swizzle4.h" 683 | 684 | #endif 685 | -------------------------------------------------------------------------------- /source/dvec4.h: -------------------------------------------------------------------------------- 1 | #ifndef __DVEC4_H__ 2 | #define __DVEC4_H__ 3 | 4 | class dvec4 5 | { 6 | private: 7 | // The actual swizzle code, since we operate on two xmm registers 8 | template 9 | static inline dvec4 shuffle(const dvec4 &v) 10 | { 11 | const __m128d &S1 = v.m[(mask >> 1) & 1]; 12 | const __m128d &S2 = v.m[(mask >> 3) & 1]; 13 | const __m128d &S3 = v.m[(mask >> 5) & 1]; 14 | const __m128d &S4 = v.m[(mask >> 7) & 1]; 15 | 16 | return dvec4(_mm_shuffle_pd(S1, S2, _mask_splitter::HI), 17 | _mm_shuffle_pd(S3, S4, 
_mask_splitter::LO)); 18 | } 19 | 20 | // Merges mask `target` with `m` into one unified mask that does the same sequential shuffle 21 | template 22 | struct _mask_merger 23 | { 24 | enum 25 | { 26 | ROW0 = ((target >> (((m >> 0) & 3) << 1)) & 3) << 0, 27 | ROW1 = ((target >> (((m >> 2) & 3) << 1)) & 3) << 2, 28 | ROW2 = ((target >> (((m >> 4) & 3) << 1)) & 3) << 4, 29 | ROW3 = ((target >> (((m >> 6) & 3) << 1)) & 3) << 6, 30 | 31 | MASK = ROW0 | ROW1 | ROW2 | ROW3, 32 | }; 33 | 34 | private: 35 | _mask_merger(); 36 | }; 37 | 38 | // Since we are working in little endian land, this reverses the shuffle mask 39 | template 40 | struct _mask_reverser 41 | { 42 | enum 43 | { 44 | ROW0 = 0 << (((m >> 0) & 3) << 1), 45 | ROW1 = 1 << (((m >> 2) & 3) << 1), 46 | ROW2 = 2 << (((m >> 4) & 3) << 1), 47 | ROW3 = 3 << (((m >> 6) & 3) << 1), 48 | 49 | MASK = ROW0 | ROW1 | ROW2 | ROW3, 50 | }; 51 | 52 | private: 53 | _mask_reverser(); 54 | }; 55 | 56 | // Splits a mask to two low and high masks 57 | template 58 | struct _mask_splitter 59 | { 60 | enum 61 | { 62 | HI = ((mask >> 0) & 1) | ((mask >> 2) & 1) << 1, 63 | LO = ((mask >> 4) & 1) | ((mask >> 6) & 1) << 1, 64 | }; 65 | 66 | private: 67 | _mask_splitter(); 68 | }; 69 | 70 | // Swizzle helper (Read only) 71 | template 72 | struct _swzl_ro 73 | { 74 | friend class dvec4; 75 | 76 | public: 77 | inline operator const dvec4 () const { 78 | return shuffle(v); 79 | } 80 | 81 | inline double operator[](int index) const { 82 | return v[(mask >> (index << 1)) & 0x3]; 83 | } 84 | 85 | // Swizzle of the swizzle, read only const (2) 86 | template 87 | inline _swzl_ro<_mask_merger::MASK> shuffle4_ro2() const { 88 | typedef _mask_merger merged; 89 | return _swzl_ro(v); 90 | } 91 | 92 | // Swizzle of the swizzle, read only const (4) 93 | template 94 | inline _swzl_ro<_mask_merger::MASK> shuffle4_ro4() const { 95 | typedef _mask_merger merged; 96 | return _swzl_ro(v); 97 | } 98 | 99 | // Swizzle of the swizzle, read/write const 100 | template 101 | inline _swzl_ro<_mask_merger::MASK> shuffle4_rw4() const { 102 | typedef _mask_merger merged; 103 | return _swzl_ro(v); 104 | } 105 | 106 | const double &x, &y, &z, &w; 107 | const double &r, &g, &b, &a; 108 | const double &s, &t, &p, &q; 109 | 110 | private: 111 | // This massive constructor maps a vector to references 112 | inline _swzl_ro(const dvec4 &v): 113 | x(v[(mask >> 0) & 0x3]), y(v[(mask >> 2) & 0x3]), 114 | z(v[(mask >> 4) & 0x3]), w(v[(mask >> 6) & 0x3]), 115 | 116 | r(v[(mask >> 0) & 0x3]), g(v[(mask >> 2) & 0x3]), 117 | b(v[(mask >> 4) & 0x3]), a(v[(mask >> 6) & 0x3]), 118 | 119 | s(v[(mask >> 0) & 0x3]), t(v[(mask >> 2) & 0x3]), 120 | p(v[(mask >> 4) & 0x3]), q(v[(mask >> 6) & 0x3]), 121 | 122 | v(v) { 123 | // Empty 124 | } 125 | 126 | // Reference to unswizzled self 127 | const dvec4 &v; 128 | }; 129 | 130 | // Swizzle helper (Read/Write) 131 | template 132 | struct _swzl_rw 133 | { 134 | friend class dvec4; 135 | 136 | public: 137 | inline operator const dvec4 () const { 138 | return shuffle(v); 139 | } 140 | 141 | inline double& operator[](int index) { 142 | return v[(mask >> (index << 1)) & 0x3]; 143 | } 144 | 145 | // Swizzle from dvec4 146 | inline dvec4& operator = (const dvec4 &r) { 147 | return v = shuffle<_mask_reverser::MASK>(r); 148 | } 149 | 150 | // Swizzle from same r/o mask (v1.xyzw = v2.xyzw) 151 | inline dvec4& operator = (const _swzl_ro &s) { 152 | return v = s.v; 153 | } 154 | 155 | // Swizzle from same mask (v1.xyzw = v2.xyzw) 156 | inline dvec4& operator = (const _swzl_rw &s) { 157 
| return v = s.v; 158 | } 159 | 160 | // Swizzle mask => other_mask, r/o (v1.zwxy = v2.xyxy) 161 | template 162 | inline dvec4& operator = (const _swzl_ro &s) { 163 | typedef _mask_merger::MASK> merged; 164 | 165 | return v = shuffle(s.v); 166 | } 167 | 168 | // Swizzle mask => other_mask (v1.zwxy = v2.xyzw) 169 | template 170 | inline dvec4& operator = (const _swzl_rw &s) { 171 | typedef _mask_merger::MASK> merged; 172 | 173 | return v = shuffle(s.v); 174 | } 175 | 176 | // Swizzle of the swizzle, read only (v.xxxx.yyyy) (2) 177 | template 178 | inline _swzl_ro<_mask_merger::MASK> shuffle4_ro2() const { 179 | typedef _mask_merger merged; 180 | 181 | return _swzl_ro(v); 182 | } 183 | 184 | // Swizzle of the swizzle, read only (v.xxxx.yyyy) (4) 185 | template 186 | inline _swzl_ro<_mask_merger::MASK> shuffle4_ro4() const { 187 | typedef _mask_merger merged; 188 | 189 | return _swzl_ro(v); 190 | } 191 | 192 | // Swizzle of the swizzle, read/write (v1.zyxw.wzyx = ...) 193 | template 194 | inline _swzl_rw<_mask_merger::MASK> shuffle4_rw4() { 195 | typedef _mask_merger merged; 196 | 197 | return _swzl_rw(v); 198 | } 199 | 200 | // ----------------------------------------------------------------- // 201 | 202 | inline dvec4& operator += (double s) { 203 | return v += s; 204 | } 205 | 206 | inline dvec4& operator += (const dvec4 &v0) { 207 | return v += v0.shuffle4_ro4(); 208 | } 209 | 210 | inline dvec4& operator -= (double s) { 211 | return v -= s; 212 | } 213 | 214 | inline dvec4& operator -= (const dvec4 &v0) { 215 | return v -= v0.shuffle4_ro4(); 216 | } 217 | 218 | inline dvec4& operator *= (double s) { 219 | return v *= s; 220 | } 221 | 222 | inline dvec4& operator *= (const dvec4 &v0) { 223 | return v *= v0.shuffle4_ro4(); 224 | } 225 | 226 | inline dvec4& operator /= (double s) { 227 | return v /= s; 228 | } 229 | 230 | inline dvec4& operator /= (const dvec4 &v0) { 231 | return v /= v0.shuffle4_ro4(); 232 | } 233 | 234 | // ----------------------------------------------------------------- // 235 | 236 | double &x, &y, &z, &w; 237 | double &r, &g, &b, &a; 238 | double &s, &t, &p, &q; 239 | 240 | private: 241 | // This massive contructor maps a vector to references 242 | inline _swzl_rw(dvec4 &v): 243 | x(v[(mask >> 0) & 0x3]), y(v[(mask >> 2) & 0x3]), 244 | z(v[(mask >> 4) & 0x3]), w(v[(mask >> 6) & 0x3]), 245 | 246 | r(v[(mask >> 0) & 0x3]), g(v[(mask >> 2) & 0x3]), 247 | b(v[(mask >> 4) & 0x3]), a(v[(mask >> 6) & 0x3]), 248 | 249 | s(v[(mask >> 0) & 0x3]), t(v[(mask >> 2) & 0x3]), 250 | p(v[(mask >> 4) & 0x3]), q(v[(mask >> 6) & 0x3]), 251 | 252 | v(v) { 253 | // Empty 254 | } 255 | 256 | // Reference to unswizzled self 257 | dvec4 &v; 258 | }; 259 | 260 | public: 261 | // Empty constructor 262 | inline dvec4() { 263 | m1 = _mm_setzero_pd(); 264 | m2 = _mm_setzero_pd(); 265 | } 266 | 267 | // Fill constructor 268 | explicit inline dvec4(double d) { 269 | m1 = _mm_set1_pd(d); 270 | m2 = _mm_set1_pd(d); 271 | } 272 | 273 | // 4 var init constructor 274 | inline dvec4(double _x, double _y, double _z, double _w) { 275 | m1 = _mm_setr_pd(_x, _y); 276 | m2 = _mm_setr_pd(_z, _w); 277 | } 278 | 279 | // Integer array constructor 280 | inline dvec4(const double* dv) { 281 | m1 = _mm_loadu_pd(dv); 282 | m2 = _mm_loadu_pd(dv + 2); 283 | } 284 | 285 | // Copy constructor 286 | inline dvec4(const dvec4 &v) { 287 | m1 = v.m1; 288 | m2 = v.m2; 289 | } 290 | 291 | // SSE2 compatible constructor 292 | inline dvec4(const __m128d &_m1, const __m128d &_m2) { 293 | m1 = _m1; 294 | m2 = _m2; 295 | } 296 | 
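	// A dvec4 spans two XMM registers: m1 holds the x/y pair and m2 holds z/w
	// (see the union at the end of the class), so the operators below issue each
	// SSE2 instruction twice, once per register.
	// Illustrative example (hypothetical values, not taken from the sources):
	//   dvec4 a(1.0, 2.0, 3.0, 4.0);   // m1 = {1.0, 2.0}, m2 = {3.0, 4.0}
	//   dvec4 b = a + 0.5;             // adds 0.5 to all four lanes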
297 | // ----------------------------------------------------------------- // 298 | 299 | inline void* operator new(size_t size) throw() { 300 | return _mm_malloc(size, 16); 301 | } 302 | 303 | inline void operator delete(void* ptr) { 304 | _mm_free(ptr); 305 | } 306 | 307 | // ----------------------------------------------------------------- // 308 | 309 | // Write direct access operator 310 | inline double& operator[](int index) { 311 | return reinterpret_cast(this)[index]; 312 | } 313 | 314 | // Read direct access operator 315 | inline const double& operator[](int index) const { 316 | return reinterpret_cast(this)[index]; 317 | } 318 | 319 | // Cast operator 320 | inline operator double* () { 321 | return reinterpret_cast(this); 322 | } 323 | 324 | // Const cast operator 325 | inline operator const double* () const { 326 | return reinterpret_cast(this); 327 | } 328 | 329 | // ----------------------------------------------------------------- // 330 | 331 | // Read-write swizzle 332 | template 333 | inline _swzl_rw shuffle4_rw4() { 334 | return _swzl_rw(*this); 335 | } 336 | 337 | // Read-write swizzle, const, actually read only 338 | template 339 | inline _swzl_ro shuffle4_rw4() const { 340 | return _swzl_ro(*this); 341 | } 342 | 343 | // Read-only swizzle (2) 344 | template 345 | inline _swzl_ro shuffle4_ro2() const { 346 | return _swzl_ro(*this); 347 | } 348 | 349 | // Read-only swizzle (4) 350 | template 351 | inline _swzl_ro shuffle4_ro4() const { 352 | return _swzl_ro(*this); 353 | } 354 | 355 | // ----------------------------------------------------------------- // 356 | 357 | friend inline dvec4& operator += (dvec4 &v, double d) { 358 | __m128d dd = _mm_set1_pd(d); 359 | v.m1 = _mm_add_pd(v.m1, dd); 360 | v.m2 = _mm_add_pd(v.m2, dd); 361 | return v; 362 | } 363 | 364 | friend inline dvec4& operator += (dvec4 &v0, const dvec4 &v1) { 365 | v0.m1 = _mm_add_pd(v0.m1, v1.m1); 366 | v0.m2 = _mm_add_pd(v0.m2, v1.m2); 367 | return v0; 368 | } 369 | 370 | friend inline dvec4& operator -= (dvec4 &v, double d) { 371 | __m128d dd = _mm_set1_pd(d); 372 | v.m1 = _mm_sub_pd(v.m1, dd); 373 | v.m2 = _mm_sub_pd(v.m2, dd); 374 | return v; 375 | } 376 | 377 | friend inline dvec4& operator -= (dvec4 &v0, const dvec4 &v1) { 378 | v0.m1 = _mm_sub_pd(v0.m1, v1.m1); 379 | v0.m2 = _mm_sub_pd(v0.m2, v1.m2); 380 | return v0; 381 | } 382 | 383 | friend inline dvec4& operator *= (dvec4 &v, double d) { 384 | __m128d dd = _mm_set1_pd(d); 385 | v.m1 = _mm_mul_pd(v.m1, dd); 386 | v.m2 = _mm_mul_pd(v.m2, dd); 387 | return v; 388 | } 389 | 390 | friend inline dvec4& operator *= (dvec4 &v0, const dvec4 &v1) { 391 | v0.m1 = _mm_mul_pd(v0.m1, v1.m1); 392 | v0.m2 = _mm_mul_pd(v0.m2, v1.m2); 393 | return v0; 394 | } 395 | 396 | friend inline dvec4& operator /= (dvec4 &v, double d) { 397 | __m128d dd = _mm_set1_pd(d); 398 | v.m1 = _mm_div_pd(v.m1, dd); 399 | v.m2 = _mm_div_pd(v.m2, dd); 400 | return v; 401 | } 402 | 403 | friend inline dvec4& operator /= (dvec4 &v0, const dvec4 &v1) { 404 | v0.m1 = _mm_div_pd(v0.m1, v1.m1); 405 | v0.m2 = _mm_div_pd(v0.m2, v1.m2); 406 | return v0; 407 | } 408 | 409 | // ----------------------------------------------------------------- // 410 | 411 | friend inline const dvec4 operator + (double d, const dvec4 &v) { 412 | __m128d dd = _mm_set1_pd(d); 413 | return dvec4(_mm_add_pd(dd, v.m1), _mm_add_pd(dd, v.m2)); 414 | } 415 | 416 | friend inline const dvec4 operator + (const dvec4 &v, double d) { 417 | __m128d dd = _mm_set1_pd(d); 418 | return dvec4(_mm_add_pd(v.m1, dd), _mm_add_pd(v.m2, 
dd)); 419 | } 420 | 421 | friend inline const dvec4 operator + (const dvec4 &v0, const dvec4 &v1) { 422 | return dvec4(_mm_add_pd(v0.m1, v1.m1), _mm_add_pd(v0.m2, v1.m2)); 423 | } 424 | 425 | friend inline const dvec4 operator - (const dvec4 &v) { 426 | __m128d nz = _mm_set1_pd(-0.0); 427 | return dvec4(_mm_xor_pd(v.m1, nz), _mm_xor_pd(v.m2, nz)); 428 | } 429 | 430 | friend inline const dvec4 operator - (double d, const dvec4 &v) { 431 | __m128d dd = _mm_set1_pd(d); 432 | return dvec4(_mm_sub_pd(dd, v.m1), _mm_sub_pd(dd, v.m2)); 433 | } 434 | 435 | friend inline const dvec4 operator - (const dvec4 &v, double d) { 436 | __m128d dd = _mm_set1_pd(d); 437 | return dvec4(_mm_sub_pd(v.m1, dd), _mm_sub_pd(v.m2, dd)); 438 | } 439 | 440 | friend inline const dvec4 operator - (const dvec4 &v0, const dvec4 &v1) { 441 | return dvec4(_mm_sub_pd(v0.m1, v1.m1), _mm_sub_pd(v0.m2, v1.m2)); 442 | } 443 | 444 | friend inline const dvec4 operator * (double d, const dvec4 &v) { 445 | __m128d dd = _mm_set1_pd(d); 446 | return dvec4(_mm_mul_pd(dd, v.m1), _mm_mul_pd(dd, v.m2)); 447 | } 448 | 449 | friend inline const dvec4 operator * (const dvec4 &v, double d) { 450 | __m128d dd = _mm_set1_pd(d); 451 | return dvec4(_mm_mul_pd(v.m1, dd), _mm_mul_pd(v.m2, dd)); 452 | } 453 | 454 | friend inline const dvec4 operator * (const dvec4 &v0, const dvec4 &v1) { 455 | return dvec4(_mm_mul_pd(v0.m1, v1.m1), _mm_mul_pd(v0.m2, v1.m2)); 456 | } 457 | 458 | friend inline const dvec4 operator / (double d, const dvec4 &v) { 459 | __m128d dd = _mm_set1_pd(d); 460 | return dvec4(_mm_div_pd(dd, v.m1), _mm_div_pd(dd, v.m2)); 461 | } 462 | 463 | friend inline const dvec4 operator / (const dvec4 &v, double d) { 464 | __m128d dd = _mm_set1_pd(d); 465 | return dvec4(_mm_div_pd(v.m1, dd), _mm_div_pd(v.m2, dd)); 466 | } 467 | 468 | friend inline const dvec4 operator / (const dvec4 &v0, const dvec4 &v1) { 469 | return dvec4(_mm_div_pd(v0.m1, v1.m1), _mm_div_pd(v0.m2, v1.m2)); 470 | } 471 | 472 | // ----------------------------------------------------------------- // 473 | /* 474 | friend inline const dvec4 pow(const dvec4 &v0, const dvec4 &v1) { 475 | // TODO 476 | } 477 | 478 | friend inline const dvec4 exp(const dvec4 &v) { 479 | // TODO 480 | } 481 | */ 482 | friend inline const dvec4 log(const dvec4 &v) { 483 | return log2(v) * 0.69314718055994530942; 484 | } 485 | /* 486 | friend inline const dvec4 exp2(const dvec4 &v) { 487 | // TODO 488 | } 489 | */ 490 | friend inline const dvec4 log2(const dvec4 &v) { 491 | __m128d p0 = _mm_set1_pd( 3.61276447184348752e-05); 492 | __m128d p1 = _mm_set1_pd(-4.16662127033480827e-04); 493 | __m128d p2 = _mm_set1_pd(-1.43988260692073185e-01); 494 | __m128d p3 = _mm_set1_pd( 1.60245637034704267e-01); 495 | __m128d p4 = _mm_set1_pd( 2.28193656337578229e-03); 496 | __m128d p5 = _mm_set1_pd(-1.80329036970820794e-01); 497 | __m128d p6 = _mm_set1_pd(-7.93793829370930689e-03); 498 | __m128d p7 = _mm_set1_pd( 2.06098446037376922e-01); 499 | __m128d p8 = _mm_set1_pd( 1.98461565426430164e-02); 500 | __m128d p9 = _mm_set1_pd(-2.40449108727688962e-01); 501 | __m128d p10 = _mm_set1_pd(-3.84093543662501949e-02); 502 | __m128d p11 = _mm_set1_pd( 2.88539004851839364e-01); 503 | __m128d p12 = _mm_set1_pd( 6.08335872067172597e-02); 504 | __m128d p13 = _mm_set1_pd(-3.60673760117245982e-01); 505 | __m128d p14 = _mm_set1_pd(-8.27937055456904317e-02); 506 | __m128d p15 = _mm_set1_pd( 4.80898346961226595e-01); 507 | __m128d p16 = _mm_set1_pd( 1.01392360727236079e-01); 508 | __m128d p17 = _mm_set1_pd(-7.21347520444469934e-01); 
509 | __m128d p18 = _mm_set1_pd(-1.16530490533844182e-01); 510 | __m128d p19 = _mm_set1_pd( 0.44269504088896339e+00); 511 | __m128d p20 = _mm_set1_pd( 1.30009193360025350e-01); 512 | __m128d o = _mm_set1_pd(1.0); 513 | __m128i t = _mm_set1_epi32(1023); 514 | __m128d c = _mm_castsi128_pd(_mm_set1_epi64x(0x7FF0000000000000LL)); 515 | __m128d co = _mm_and_pd(c, o); 516 | __m128d f1 = _mm_sub_pd(_mm_or_pd(_mm_andnot_pd(c, v.m1), co), o); 517 | __m128d f2 = _mm_sub_pd(_mm_or_pd(_mm_andnot_pd(c, v.m2), co), o); 518 | __m128i a1 = _mm_sub_epi32(_mm_srli_epi32(_mm_castpd_si128(v.m1), 20), t); 519 | __m128i a2 = _mm_sub_epi32(_mm_srli_epi32(_mm_castpd_si128(v.m2), 20), t); 520 | __m128d hi1 = _mm_add_pd(_mm_mul_pd(p0, f1), p1); 521 | __m128d hi2 = _mm_add_pd(_mm_mul_pd(p0, f2), p1); 522 | __m128d lo1 = _mm_add_pd(_mm_mul_pd(p2, f1), p3); 523 | __m128d lo2 = _mm_add_pd(_mm_mul_pd(p2, f2), p3); 524 | hi1 = _mm_add_pd(_mm_mul_pd(f1, hi1), p4); 525 | hi2 = _mm_add_pd(_mm_mul_pd(f2, hi2), p4); 526 | lo1 = _mm_add_pd(_mm_mul_pd(f1, lo1), p5); 527 | lo2 = _mm_add_pd(_mm_mul_pd(f2, lo2), p5); 528 | hi1 = _mm_add_pd(_mm_mul_pd(f1, hi1), p6); 529 | hi2 = _mm_add_pd(_mm_mul_pd(f2, hi2), p6); 530 | lo1 = _mm_add_pd(_mm_mul_pd(f1, lo1), p7); 531 | lo2 = _mm_add_pd(_mm_mul_pd(f2, lo2), p7); 532 | hi1 = _mm_add_pd(_mm_mul_pd(f1, hi1), p8); 533 | hi2 = _mm_add_pd(_mm_mul_pd(f2, hi2), p8); 534 | lo1 = _mm_add_pd(_mm_mul_pd(f1, lo1), p9); 535 | lo2 = _mm_add_pd(_mm_mul_pd(f2, lo2), p9); 536 | hi1 = _mm_add_pd(_mm_mul_pd(f1, hi1), p10); 537 | hi2 = _mm_add_pd(_mm_mul_pd(f2, hi2), p10); 538 | lo1 = _mm_add_pd(_mm_mul_pd(f1, lo1), p11); 539 | lo2 = _mm_add_pd(_mm_mul_pd(f2, lo2), p11); 540 | hi1 = _mm_add_pd(_mm_mul_pd(f1, hi1), p12); 541 | hi2 = _mm_add_pd(_mm_mul_pd(f2, hi2), p12); 542 | lo1 = _mm_add_pd(_mm_mul_pd(f1, lo1), p13); 543 | lo2 = _mm_add_pd(_mm_mul_pd(f2, lo2), p13); 544 | hi1 = _mm_add_pd(_mm_mul_pd(f1, hi1), p14); 545 | hi2 = _mm_add_pd(_mm_mul_pd(f2, hi2), p14); 546 | lo1 = _mm_add_pd(_mm_mul_pd(f1, lo1), p15); 547 | lo2 = _mm_add_pd(_mm_mul_pd(f2, lo2), p15); 548 | hi1 = _mm_add_pd(_mm_mul_pd(f1, hi1), p16); 549 | hi2 = _mm_add_pd(_mm_mul_pd(f2, hi2), p16); 550 | lo1 = _mm_add_pd(_mm_mul_pd(f1, lo1), p17); 551 | lo2 = _mm_add_pd(_mm_mul_pd(f2, lo2), p17); 552 | hi1 = _mm_add_pd(_mm_mul_pd(f1, hi1), p18); 553 | hi2 = _mm_add_pd(_mm_mul_pd(f2, hi2), p18); 554 | lo1 = _mm_add_pd(_mm_mul_pd(f1, lo1), p19); 555 | lo2 = _mm_add_pd(_mm_mul_pd(f2, lo2), p19); 556 | hi1 = _mm_add_pd(_mm_mul_pd(f1, hi1), p20); 557 | hi2 = _mm_add_pd(_mm_mul_pd(f2, hi2), p20); 558 | __m128d x21 = _mm_mul_pd(f1, f1); 559 | __m128d x101 = _mm_mul_pd(x21, x21); 560 | __m128d x22 = _mm_mul_pd(f2, f2); 561 | __m128d x102 = _mm_mul_pd(x22, x22); 562 | x101 = _mm_mul_pd(x101, x101); 563 | x101 = _mm_mul_pd(x21, x101); 564 | x102 = _mm_mul_pd(x102, x102); 565 | x102 = _mm_mul_pd(x22, x102); 566 | return dvec4(_mm_add_pd(_mm_add_pd(_mm_mul_pd( 567 | _mm_add_pd(_mm_mul_pd(x101, hi1), lo1), f1), f1), 568 | _mm_cvtepi32_pd(_mm_shuffle_epi32(a1, 0x0D))), 569 | _mm_add_pd(_mm_add_pd(_mm_mul_pd( 570 | _mm_add_pd(_mm_mul_pd(x102, hi2), lo2), f2), f2), 571 | _mm_cvtepi32_pd(_mm_shuffle_epi32(a2, 0x0D)))); 572 | } 573 | 574 | friend inline const dvec4 sqrt(const dvec4 &v) { 575 | return dvec4(_mm_sqrt_pd(v.m1), _mm_sqrt_pd(v.m2)); 576 | } 577 | 578 | friend inline const dvec4 inversesqrt(const dvec4 &v) { 579 | __m128d o = _mm_set1_pd(1.0); 580 | return dvec4(_mm_div_pd(o, _mm_sqrt_pd(v.m1)), 581 | _mm_div_pd(o, _mm_sqrt_pd(v.m2))); 582 | } 583 
| 584 | // ----------------------------------------------------------------- // 585 | 586 | friend inline const dvec4 abs(const dvec4 &v) { 587 | __m128d nz = _mm_set1_pd(-0.0); 588 | return dvec4(_mm_andnot_pd(nz, v.m1), _mm_andnot_pd(nz, v.m2)); 589 | } 590 | 591 | friend inline const dvec4 ceil(const dvec4 &v) { 592 | __m128d h = _mm_set1_pd(0.5); 593 | return dvec4(_mm_cvtepi32_pd(_mm_cvtpd_epi32(_mm_add_pd(v.m1, h))), 594 | _mm_cvtepi32_pd(_mm_cvtpd_epi32(_mm_add_pd(v.m2, h)))); 595 | } 596 | 597 | friend inline const dvec4 clamp(const dvec4 &v, double d1, double d2) { 598 | __m128d dd1 = _mm_set1_pd(d1); 599 | __m128d dd2 = _mm_set1_pd(d2); 600 | return dvec4(_mm_max_pd(_mm_min_pd(v.m1, dd2), dd1), 601 | _mm_max_pd(_mm_min_pd(v.m2, dd2), dd1)); 602 | } 603 | 604 | friend inline const dvec4 clamp(const dvec4 &v0, 605 | const dvec4 &v1, const dvec4 &v2) { 606 | return dvec4(_mm_max_pd(_mm_min_pd(v0.m1, v2.m1), v1.m1), 607 | _mm_max_pd(_mm_min_pd(v0.m2, v2.m2), v1.m2)); 608 | } 609 | 610 | friend inline const dvec4 floor(const dvec4 &v) { 611 | __m128d h = _mm_set1_pd(0.5); 612 | return dvec4(_mm_cvtepi32_pd(_mm_srai_epi32( 613 | _mm_cvtpd_epi32(_mm_sub_pd(_mm_add_pd(v.m1, v.m1), h)), 1)), 614 | _mm_cvtepi32_pd(_mm_srai_epi32( 615 | _mm_cvtpd_epi32(_mm_sub_pd(_mm_add_pd(v.m2, v.m2), h)), 1))); 616 | } 617 | 618 | friend inline const dvec4 fract(const dvec4 &v) { 619 | __m128d h = _mm_set1_pd(0.5); 620 | return dvec4(_mm_sub_pd(v.m1, _mm_cvtepi32_pd(_mm_srai_epi32( 621 | _mm_cvtpd_epi32(_mm_sub_pd(_mm_add_pd(v.m1, v.m1), h)), 1))), 622 | _mm_sub_pd(v.m2, _mm_cvtepi32_pd(_mm_srai_epi32( 623 | _mm_cvtpd_epi32(_mm_sub_pd(_mm_add_pd(v.m2, v.m2), h)), 1)))); 624 | } 625 | 626 | friend inline const dvec4 max(const dvec4 &v, double d) { 627 | __m128d dd = _mm_set1_pd(d); 628 | return dvec4(_mm_max_pd(v.m1, dd), _mm_max_pd(v.m2, dd)); 629 | } 630 | 631 | friend inline const dvec4 max(const dvec4 &v0, const dvec4 &v1) { 632 | return dvec4(_mm_max_pd(v0.m1, v1.m1), _mm_max_pd(v0.m2, v1.m2)); 633 | } 634 | 635 | friend inline const dvec4 min(const dvec4 &v, double d) { 636 | __m128d dd = _mm_set1_pd(d); 637 | return dvec4(_mm_min_pd(v.m1, dd), _mm_min_pd(v.m2, dd)); 638 | } 639 | 640 | friend inline const dvec4 min(const dvec4 &v0, const dvec4 &v1) { 641 | return dvec4(_mm_min_pd(v0.m1, v1.m1), _mm_min_pd(v0.m2, v1.m2)); 642 | } 643 | 644 | friend inline const dvec4 mix(const dvec4 &v0, const dvec4 &v1, 645 | double d) { 646 | __m128d o = _mm_set1_pd(1.0); 647 | __m128d dd = _mm_set1_pd(d); 648 | return dvec4(_mm_add_pd(_mm_mul_pd(v0.m1, _mm_sub_pd(o, dd)), 649 | _mm_mul_pd(v1.m1, dd)), 650 | _mm_add_pd(_mm_mul_pd(v0.m2, _mm_sub_pd(o, dd)), 651 | _mm_mul_pd(v1.m2, dd))); 652 | } 653 | 654 | friend inline const dvec4 mix(const dvec4 &v0, const dvec4 &v1, 655 | const dvec4 &v2) { 656 | __m128d o = _mm_set1_pd(1.0); 657 | return dvec4(_mm_add_pd(_mm_mul_pd(v0.m1, _mm_sub_pd(o, v2.m1)), 658 | _mm_mul_pd(v1.m1, v2.m1)), 659 | _mm_add_pd(_mm_mul_pd(v0.m2, _mm_sub_pd(o, v2.m2)), 660 | _mm_mul_pd(v1.m2, v2.m2))); 661 | } 662 | 663 | friend inline const dvec4 mod(const dvec4 &v, double d) { 664 | __m128d h = _mm_set1_pd(0.5); 665 | __m128d dd = _mm_set1_pd(d); 666 | __m128d d1 = _mm_div_pd(v.m1, dd); 667 | __m128d d2 = _mm_div_pd(v.m2, dd); 668 | return dvec4(_mm_sub_pd(v.m1, _mm_mul_pd(dd, _mm_cvtepi32_pd( 669 | _mm_srai_epi32(_mm_cvtpd_epi32(_mm_sub_pd( 670 | _mm_add_pd(d1, d1), h)), 1)))), 671 | _mm_sub_pd(v.m2, _mm_mul_pd(dd, _mm_cvtepi32_pd( 672 | _mm_srai_epi32(_mm_cvtpd_epi32(_mm_sub_pd( 673 | 
_mm_add_pd(d2, d2), h)), 1))))); 674 | } 675 | 676 | friend inline const dvec4 mod(const dvec4 &v0, const dvec4 &v1) { 677 | __m128d h = _mm_set1_pd(0.5); 678 | __m128d d1 = _mm_div_pd(v0.m1, v1.m1); 679 | __m128d d2 = _mm_div_pd(v0.m2, v1.m2); 680 | return dvec4(_mm_sub_pd(v0.m1, _mm_mul_pd(v1.m1, _mm_cvtepi32_pd( 681 | _mm_srai_epi32(_mm_cvtpd_epi32(_mm_sub_pd( 682 | _mm_add_pd(d1, d1), h)), 1)))), 683 | _mm_sub_pd(v0.m2, _mm_mul_pd(v1.m2, _mm_cvtepi32_pd( 684 | _mm_srai_epi32(_mm_cvtpd_epi32(_mm_sub_pd( 685 | _mm_add_pd(d2, d2), h)), 1))))); 686 | } 687 | 688 | friend inline const dvec4 modf(const dvec4 &v0, dvec4 &v1) { 689 | __m128d nz = _mm_set1_pd(-0.0); 690 | v1.m1 = _mm_or_pd(_mm_cvtepi32_pd(_mm_cvttpd_epi32(v0.m1)), 691 | _mm_and_pd(nz, v0.m1)); 692 | v1.m2 = _mm_or_pd(_mm_cvtepi32_pd(_mm_cvttpd_epi32(v0.m2)), 693 | _mm_and_pd(nz, v0.m2)); 694 | return dvec4(_mm_sub_pd(v0.m1, v1.m1), 695 | _mm_sub_pd(v0.m2, v1.m2)); 696 | } 697 | 698 | friend inline const dvec4 sign(const dvec4 &v) { 699 | __m128d o = _mm_set1_pd(1); 700 | __m128d z = _mm_setzero_pd(); 701 | __m128d nz = _mm_set1_pd(-0.0); 702 | return dvec4(_mm_and_pd(_mm_or_pd(_mm_and_pd(v.m1, nz), o), 703 | _mm_cmpneq_pd(v.m1, z)), 704 | _mm_and_pd(_mm_or_pd(_mm_and_pd(v.m2, nz), o), 705 | _mm_cmpneq_pd(v.m2, z))); 706 | } 707 | 708 | friend inline const dvec4 smoothstep(double d1, double d2, 709 | const dvec4 &v) { 710 | __m128d z = _mm_setzero_pd(); 711 | __m128d o = _mm_set1_pd(1.0); 712 | __m128d t = _mm_set1_pd(3.0); 713 | __m128d dd1 = _mm_set1_pd(d1); 714 | __m128d dd2 = _mm_set1_pd(d2); 715 | __m128d r1 = _mm_max_pd(_mm_min_pd(_mm_div_pd( 716 | _mm_sub_pd(v.m1, dd1), 717 | _mm_sub_pd(dd2, dd1)), o), z); 718 | __m128d r2 = _mm_max_pd(_mm_min_pd(_mm_div_pd( 719 | _mm_sub_pd(v.m2, dd1), 720 | _mm_sub_pd(dd2, dd1)), o), z); 721 | return dvec4(_mm_mul_pd(_mm_mul_pd(r1, r1), 722 | _mm_sub_pd(t, _mm_add_pd(r1, r1))), 723 | _mm_mul_pd(_mm_mul_pd(r2, r2), 724 | _mm_sub_pd(t, _mm_add_pd(r2, r2)))); 725 | } 726 | 727 | friend inline const dvec4 smoothstep(const dvec4 &v0, 728 | const dvec4 &v1, const dvec4 &v2) { 729 | __m128d z = _mm_setzero_pd(); 730 | __m128d o = _mm_set1_pd(1.0); 731 | __m128d t = _mm_set1_pd(3.0); 732 | __m128d r1 = _mm_max_pd(_mm_min_pd(_mm_div_pd( 733 | _mm_sub_pd(v2.m1, v0.m1), 734 | _mm_sub_pd(v1.m1, v0.m1)), o), z); 735 | __m128d r2 = _mm_max_pd(_mm_min_pd(_mm_div_pd( 736 | _mm_sub_pd(v2.m2, v0.m2), 737 | _mm_sub_pd(v1.m2, v0.m2)), o), z); 738 | return dvec4(_mm_mul_pd(_mm_mul_pd(r1, r1), 739 | _mm_sub_pd(t, _mm_add_pd(r1, r1))), 740 | _mm_mul_pd(_mm_mul_pd(r2, r2), 741 | _mm_sub_pd(t, _mm_add_pd(r2, r2)))); 742 | } 743 | 744 | friend inline const dvec4 step(double d, const dvec4 &v) { 745 | __m128d o = _mm_set1_pd(1.0); 746 | __m128d dd = _mm_set1_pd(d); 747 | return dvec4(_mm_and_pd(_mm_cmple_pd(v.m1, dd), o), 748 | _mm_and_pd(_mm_cmple_pd(v.m2, dd), o)); 749 | } 750 | 751 | friend inline const dvec4 step(const dvec4 &v0, const dvec4 &v1) { 752 | __m128d o = _mm_set1_pd(1.0); 753 | return dvec4(_mm_and_pd(_mm_cmple_pd(v0.m1, v1.m1), o), 754 | _mm_and_pd(_mm_cmple_pd(v0.m2, v1.m2), o)); 755 | } 756 | 757 | friend inline const dvec4 trunc(const dvec4 &v) { 758 | __m128d h = _mm_set1_pd(0.5); 759 | __m128d nz = _mm_set1_pd(-0.0); 760 | return dvec4(_mm_cvtepi32_pd(_mm_cvtpd_epi32(_mm_sub_pd(v.m1, 761 | _mm_or_pd(_mm_and_pd(v.m1, nz), h)))), 762 | _mm_cvtepi32_pd(_mm_cvtpd_epi32(_mm_sub_pd(v.m2, 763 | _mm_or_pd(_mm_and_pd(v.m2, nz), h))))); 764 | } 765 | 766 | // 
----------------------------------------------------------------- // 767 | 768 | friend inline double distance(const dvec4 &v0, const dvec4 &v1) { 769 | __m128d dd1 = _mm_sub_pd(v0.m1, v1.m1); 770 | __m128d dd2 = _mm_sub_pd(v0.m2, v1.m2); 771 | __m128d ll1 = _mm_mul_pd(dd1, dd1); 772 | __m128d ll2 = _mm_mul_pd(dd2, dd2); 773 | return _mm_cvtsd_f64(_mm_sqrt_pd(_mm_add_pd( 774 | _mm_add_pd(ll1, ll2), 775 | _mm_add_pd(_mm_shuffle_pd(ll1, ll1, 0x01), 776 | _mm_shuffle_pd(ll2, ll2, 0x01))))); 777 | } 778 | 779 | friend inline double dot(const dvec4 &v0, const dvec4 &v1) { 780 | __m128d ll1 = _mm_mul_pd(v0.m1, v1.m1); 781 | __m128d ll2 = _mm_mul_pd(v0.m2, v1.m2); 782 | return _mm_cvtsd_f64(_mm_add_pd( 783 | _mm_add_pd(ll1, ll2), 784 | _mm_add_pd(_mm_shuffle_pd(ll1, ll1, 0x01), 785 | _mm_shuffle_pd(ll2, ll2, 0x01)))); 786 | } 787 | 788 | friend inline const dvec4 faceforward(const dvec4 &v0, 789 | const dvec4 &v1, const dvec4 &v2) { 790 | __m128d z = _mm_setzero_pd(); 791 | __m128d nz = _mm_set1_pd(-0.0); 792 | __m128d ll1 = _mm_mul_pd(v2.m1, v1.m1); 793 | __m128d ll2 = _mm_mul_pd(v2.m2, v1.m2); 794 | __m128d dot = _mm_add_pd( 795 | _mm_add_pd(ll1, ll2), 796 | _mm_add_pd(_mm_shuffle_pd(ll1, ll1, 0x01), 797 | _mm_shuffle_pd(ll2, ll2, 0x01))); 798 | return dvec4(_mm_xor_pd(_mm_and_pd(_mm_cmpnlt_pd(dot, z), nz), v0.m1), 799 | _mm_xor_pd(_mm_and_pd(_mm_cmpnlt_pd(dot, z), nz), v0.m2)); 800 | } 801 | 802 | friend inline double length(const dvec4 &v) { 803 | __m128d ll1 = _mm_mul_pd(v.m1, v.m1); 804 | __m128d ll2 = _mm_mul_pd(v.m2, v.m2); 805 | return _mm_cvtsd_f64(_mm_sqrt_pd(_mm_add_pd( 806 | _mm_add_pd(ll1, ll2), 807 | _mm_add_pd(_mm_shuffle_pd(ll1, ll1, 0x01), 808 | _mm_shuffle_pd(ll2, ll2, 0x01))))); 809 | } 810 | 811 | friend inline const dvec4 normalize(const dvec4 &v) { 812 | __m128d ll1 = _mm_mul_pd(v.m1, v.m1); 813 | __m128d ll2 = _mm_mul_pd(v.m2, v.m2); 814 | __m128d len = _mm_sqrt_pd(_mm_add_pd( 815 | _mm_add_pd(ll1, ll2), 816 | _mm_add_pd(_mm_shuffle_pd(ll1, ll1, 0x01), 817 | _mm_shuffle_pd(ll2, ll2, 0x01)))); 818 | return dvec4(_mm_div_pd(v.m1, len), _mm_div_pd(v.m2, len)); 819 | } 820 | 821 | friend inline const dvec4 reflect(const dvec4 &v0, const dvec4 &v1) { 822 | __m128d ll1 = _mm_mul_pd(v0.m1, v1.m1); 823 | __m128d ll2 = _mm_mul_pd(v0.m2, v1.m2); 824 | __m128d res = _mm_add_pd(_mm_add_pd(ll1, ll2), 825 | _mm_add_pd(_mm_shuffle_pd(ll1, ll1, 0x01), 826 | _mm_shuffle_pd(ll2, ll2, 0x01))); 827 | res = _mm_add_pd(res, res); 828 | return dvec4(_mm_sub_pd(v0.m1, _mm_mul_pd(res, v1.m1)), 829 | _mm_sub_pd(v0.m2, _mm_mul_pd(res, v1.m2))); 830 | } 831 | 832 | friend inline const dvec4 refract(const dvec4 &v0, const dvec4 &v1, 833 | double d) { 834 | __m128d o = _mm_set1_pd(1.0); 835 | __m128d z = _mm_set1_pd(0.0); 836 | __m128d e = _mm_set1_pd(d); 837 | __m128d ll1 = _mm_mul_pd(v1.m1, v0.m1); 838 | __m128d ll2 = _mm_mul_pd(v1.m2, v0.m2); 839 | __m128d dot = _mm_add_pd(_mm_add_pd(ll1, ll2), 840 | _mm_add_pd(_mm_shuffle_pd(ll1, ll1, 0x01), 841 | _mm_shuffle_pd(ll2, ll2, 0x01))); 842 | __m128d k = _mm_sub_pd(o, _mm_mul_pd(_mm_mul_pd(e, e), _mm_sub_pd(o, 843 | _mm_mul_pd(dot, dot)))); 844 | return dvec4(_mm_and_pd(_mm_cmpnlt_pd(k, z), _mm_mul_pd( 845 | _mm_mul_pd(e, _mm_sub_pd(v0.m1, _mm_mul_pd( 846 | _mm_mul_pd(e, dot), _mm_sqrt_pd(k)))), v1.m1)), 847 | _mm_and_pd(_mm_cmpnlt_pd(k, z), _mm_mul_pd( 848 | _mm_mul_pd(e, _mm_sub_pd(v0.m2, _mm_mul_pd( 849 | _mm_mul_pd(e, dot), _mm_sqrt_pd(k)))), v1.m2))); 850 | } 851 | 852 | // ----------------------------------------------------------------- // 853 | 
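	// _mm_movemask_pd packs the sign bit of each double lane into bits 0-1, so
	// after _mm_cmpeq_pd a mask of 0x03 means both lanes matched; the results for
	// m1 and m2 are AND-ed so that operator == only holds when all four
	// components are equal.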
854 | friend inline bool operator == (const dvec4 &v0, const dvec4 &v1) { 855 | return _mm_movemask_pd(_mm_and_pd( 856 | _mm_cmpeq_pd(v0.m1, v1.m1), 857 | _mm_cmpeq_pd(v0.m2, v1.m2))) == 0x03; 858 | } 859 | 860 | friend inline bool operator != (const dvec4 &v0, const dvec4 &v1) { 861 | return _mm_movemask_pd(_mm_and_pd( 862 | _mm_cmpneq_pd(v0.m1, v1.m1), 863 | _mm_cmpneq_pd(v0.m2, v1.m2))) != 0x00; 864 | } 865 | 866 | // ----------------------------------------------------------------- // 867 | 868 | union { 869 | // Vertex / Vector 870 | struct { 871 | double x, y, z, w; 872 | }; 873 | // Color 874 | struct { 875 | double r, g, b, a; 876 | }; 877 | // Texture coordinates 878 | struct { 879 | double s, t, p, q; 880 | }; 881 | 882 | // SSE2 registers 883 | __m128d m[2]; 884 | struct { 885 | __m128d m1; 886 | __m128d m2; 887 | }; 888 | }; 889 | }; 890 | 891 | #include "swizzle4.h" 892 | 893 | // Template specialization for mask 0xE4 (No shuffle) 894 | template <> 895 | inline dvec4 dvec4::shuffle<0xE4>(const dvec4 &v) 896 | { 897 | return v; 898 | } 899 | 900 | #endif 901 | -------------------------------------------------------------------------------- /source/ivec4.h: -------------------------------------------------------------------------------- 1 | #ifndef __IVEC4_H__ 2 | #define __IVEC4_H__ 3 | 4 | #include 5 | 6 | class ivec4 7 | { 8 | private: 9 | template 10 | static inline __m128i shuffle(const __m128i &xmm) { 11 | return _mm_shuffle_epi32(xmm, mask); 12 | } 13 | 14 | // Merges mask `target` with `m` into one unified mask that does the same sequential shuffle 15 | template 16 | struct _mask_merger 17 | { 18 | enum 19 | { 20 | ROW0 = ((target >> (((m >> 0) & 3) << 1)) & 3) << 0, 21 | ROW1 = ((target >> (((m >> 2) & 3) << 1)) & 3) << 2, 22 | ROW2 = ((target >> (((m >> 4) & 3) << 1)) & 3) << 4, 23 | ROW3 = ((target >> (((m >> 6) & 3) << 1)) & 3) << 6, 24 | 25 | MASK = ROW0 | ROW1 | ROW2 | ROW3, 26 | }; 27 | 28 | private: 29 | _mask_merger(); 30 | }; 31 | 32 | // Since we are working in little endian land, this reverses the shuffle mask 33 | template 34 | struct _mask_reverser 35 | { 36 | enum 37 | { 38 | ROW0 = 0 << (((m >> 0) & 3) << 1), 39 | ROW1 = 1 << (((m >> 2) & 3) << 1), 40 | ROW2 = 2 << (((m >> 4) & 3) << 1), 41 | ROW3 = 3 << (((m >> 6) & 3) << 1), 42 | 43 | MASK = ROW0 | ROW1 | ROW2 | ROW3, 44 | }; 45 | 46 | private: 47 | _mask_reverser(); 48 | }; 49 | 50 | // Swizzle helper (Read only) 51 | template 52 | struct _swzl_ro 53 | { 54 | friend class ivec4; 55 | 56 | public: 57 | inline operator const ivec4 () const { 58 | return shuffle(v.m); 59 | } 60 | 61 | inline int32_t operator[](int index) const { 62 | return v[(mask >> (index << 1)) & 0x3]; 63 | } 64 | 65 | // Swizzle of the swizzle, read only const (2) 66 | template 67 | inline _swzl_ro<_mask_merger::MASK> shuffle4_ro2() const { 68 | typedef _mask_merger merged; 69 | return _swzl_ro(v); 70 | } 71 | 72 | // Swizzle of the swizzle, read only const (4) 73 | template 74 | inline _swzl_ro<_mask_merger::MASK> shuffle4_ro4() const { 75 | typedef _mask_merger merged; 76 | return _swzl_ro(v); 77 | } 78 | 79 | // Swizzle of the swizzle, read/write const 80 | template 81 | inline _swzl_ro<_mask_merger::MASK> shuffle4_rw4() const { 82 | typedef _mask_merger merged; 83 | return _swzl_ro(v); 84 | } 85 | 86 | const int32_t &x, &y, &z, &w; 87 | const int32_t &r, &g, &b, &a; 88 | const int32_t &s, &t, &p, &q; 89 | 90 | private: 91 | // This massive constructor maps a vector to references 92 | inline _swzl_ro(const ivec4 &v): 93 | 
x(v[(mask >> 0) & 0x3]), y(v[(mask >> 2) & 0x3]), 94 | z(v[(mask >> 4) & 0x3]), w(v[(mask >> 6) & 0x3]), 95 | 96 | r(v[(mask >> 0) & 0x3]), g(v[(mask >> 2) & 0x3]), 97 | b(v[(mask >> 4) & 0x3]), a(v[(mask >> 6) & 0x3]), 98 | 99 | s(v[(mask >> 0) & 0x3]), t(v[(mask >> 2) & 0x3]), 100 | p(v[(mask >> 4) & 0x3]), q(v[(mask >> 6) & 0x3]), 101 | 102 | v(v) { 103 | // Empty 104 | } 105 | 106 | // Reference to unswizzled self 107 | const ivec4 &v; 108 | }; 109 | 110 | // Swizzle helper (Read/Write) 111 | template 112 | struct _swzl_rw 113 | { 114 | friend class ivec4; 115 | 116 | public: 117 | inline operator const ivec4 () const { 118 | return shuffle(v.m); 119 | } 120 | 121 | inline int32_t& operator[](int index) const { 122 | return v[(mask >> (index << 1)) & 0x3]; 123 | } 124 | 125 | // Swizzle from ivec4 126 | inline ivec4& operator = (const ivec4 &r) { 127 | return v = shuffle<_mask_reverser::MASK>(r.m); 128 | } 129 | 130 | // Swizzle from same r/o mask (v1.xyzw = v2.xyzw) 131 | inline ivec4& operator = (const _swzl_ro &s) { 132 | return v = s.v; 133 | } 134 | 135 | // Swizzle from same mask (v1.xyzw = v2.xyzw) 136 | inline ivec4& operator = (const _swzl_rw &s) { 137 | return v = s.v; 138 | } 139 | 140 | // Swizzle mask => other_mask, r/o (v1.zwxy = v2.xyxy) 141 | template 142 | inline ivec4& operator = (const _swzl_ro &s) { 143 | typedef _mask_merger::MASK> merged; 144 | 145 | return v = shuffle(s.v.m);; 146 | } 147 | 148 | // Swizzle mask => other_mask (v1.zwxy = v2.xyxy) 149 | template 150 | inline ivec4& operator = (const _swzl_rw &s) { 151 | typedef _mask_merger::MASK> merged; 152 | 153 | return v = shuffle(s.v.m); 154 | } 155 | 156 | // Swizzle of the swizzle, read only (v.xxxx.yyyy) (2) 157 | template 158 | inline _swzl_ro<_mask_merger::MASK> shuffle4_ro2() const { 159 | typedef _mask_merger merged; 160 | 161 | return _swzl_ro(v); 162 | } 163 | 164 | // Swizzle of the swizzle, read only (v.xxxx.yyyy) (4) 165 | template 166 | inline _swzl_ro<_mask_merger::MASK> shuffle4_ro4() const { 167 | typedef _mask_merger merged; 168 | 169 | return _swzl_ro(v); 170 | } 171 | 172 | // Swizzle of the swizzle, read/write (v1.zyxw.wzyx = ...) 
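		// (_mask_merger folds the outer swizzle mask into the inner one at
		//  compile time, so materializing a chained swizzle still takes a single
		//  _mm_shuffle_epi32.)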
173 | template 174 | inline _swzl_rw<_mask_merger::MASK> shuffle4_rw4() { 175 | typedef _mask_merger merged; 176 | 177 | return _swzl_rw(v); 178 | } 179 | 180 | // ----------------------------------------------------------------- // 181 | 182 | inline ivec4& operator += (int32_t s) { 183 | return v += s; 184 | } 185 | 186 | inline ivec4& operator += (const ivec4 &v0) { 187 | return v += v0.shuffle4_ro4(); 188 | } 189 | 190 | inline ivec4& operator -= (int32_t s) { 191 | return v -= s; 192 | } 193 | 194 | inline ivec4& operator -= (const ivec4 &v0) { 195 | return v -= v0.shuffle4_ro4(); 196 | } 197 | 198 | inline ivec4& operator *= (int32_t s) { 199 | return v *= s; 200 | } 201 | 202 | inline ivec4& operator *= (const ivec4 &v0) { 203 | return v *= v0.shuffle4_ro4(); 204 | } 205 | 206 | inline ivec4& operator /= (int32_t s) { 207 | return v /= s; 208 | } 209 | 210 | inline ivec4& operator /= (const ivec4 &v0) { 211 | return v /= v0.shuffle4_ro4(); 212 | } 213 | 214 | // ----------------------------------------------------------------- // 215 | 216 | int32_t &x, &y, &z, &w; 217 | int32_t &r, &g, &b, &a; 218 | int32_t &s, &t, &p, &q; 219 | 220 | private: 221 | // This massive contructor maps a vector to references 222 | inline _swzl_rw(ivec4 &v): 223 | x(v[(mask >> 0) & 0x3]), y(v[(mask >> 2) & 0x3]), 224 | z(v[(mask >> 4) & 0x3]), w(v[(mask >> 6) & 0x3]), 225 | 226 | r(v[(mask >> 0) & 0x3]), g(v[(mask >> 2) & 0x3]), 227 | b(v[(mask >> 4) & 0x3]), a(v[(mask >> 6) & 0x3]), 228 | 229 | s(v[(mask >> 0) & 0x3]), t(v[(mask >> 2) & 0x3]), 230 | p(v[(mask >> 4) & 0x3]), q(v[(mask >> 6) & 0x3]), 231 | 232 | v(v) { 233 | // Empty 234 | } 235 | 236 | // Refrence to unswizzled self 237 | ivec4 &v; 238 | }; 239 | 240 | // ----------------------------------------------------------------- // 241 | 242 | public: 243 | // Empty constructor 244 | inline ivec4() { 245 | m = _mm_setzero_si128(); 246 | } 247 | 248 | // Fill constructor 249 | explicit inline ivec4(int32_t i) { 250 | m = _mm_set1_epi32(i); 251 | } 252 | 253 | // 4 var init constructor 254 | inline ivec4(int32_t _x, int32_t _y, int32_t _z, int32_t _w) { 255 | m = _mm_setr_epi32(_x, _y, _z, _w); 256 | } 257 | 258 | // Integer array constructor 259 | inline ivec4(const int32_t* fv) { 260 | m = _mm_castps_si128(_mm_loadu_ps((const float*)fv)); 261 | } 262 | 263 | // Copy constructor 264 | inline ivec4(const ivec4 &v) { 265 | m = v.m; 266 | } 267 | 268 | // SSE2 compatible constructor 269 | inline ivec4(const __m128i &_m) { 270 | m = _m; 271 | } 272 | 273 | // ----------------------------------------------------------------- // 274 | 275 | inline void* operator new(size_t size) throw() { 276 | return _mm_malloc(size, 16); 277 | } 278 | 279 | inline void operator delete(void* ptr) { 280 | _mm_free(ptr); 281 | } 282 | 283 | // ----------------------------------------------------------------- // 284 | 285 | // Read-write swizzle 286 | template 287 | inline _swzl_rw shuffle4_rw4() { 288 | return _swzl_rw(*this); 289 | } 290 | 291 | // Read-write swizzle, const, actually read only 292 | template 293 | inline _swzl_ro shuffle4_rw4() const { 294 | return _swzl_ro(*this); 295 | } 296 | 297 | // Read-only swizzle (2) 298 | template 299 | inline _swzl_ro shuffle4_ro2() const { 300 | return _swzl_ro(*this); 301 | } 302 | 303 | // Read-only swizzle (4) 304 | template 305 | inline _swzl_ro shuffle4_ro4() const { 306 | return _swzl_ro(*this); 307 | } 308 | 309 | // ----------------------------------------------------------------- // 310 | 311 | // Write direct 
access operator 312 | inline int32_t& operator[](int index) { 313 | return reinterpret_cast(this)[index]; 314 | } 315 | 316 | // Read direct access operator 317 | inline const int32_t& operator[](int index) const { 318 | return reinterpret_cast(this)[index]; 319 | } 320 | 321 | // Cast operator 322 | inline operator int32_t* () { 323 | return reinterpret_cast(this); 324 | } 325 | 326 | // Const cast operator 327 | inline operator const int32_t* () const { 328 | return reinterpret_cast(this); 329 | } 330 | 331 | // ----------------------------------------------------------------- // 332 | 333 | friend inline ivec4& operator += (ivec4 &v, int32_t i) { 334 | v.m = _mm_add_epi32(v.m, _mm_set1_epi32(i)); 335 | return v; 336 | } 337 | 338 | friend inline ivec4& operator += (ivec4 &v0, const ivec4 &v1) { 339 | v0.m = _mm_add_epi32(v0.m, v1.m); 340 | return v0; 341 | } 342 | 343 | friend inline ivec4& operator -= (ivec4 &v, int32_t i) { 344 | v.m = _mm_sub_epi32(v.m, _mm_set1_epi32(i)); 345 | return v; 346 | } 347 | 348 | friend inline ivec4& operator -= (ivec4 &v0, const ivec4 &v1) { 349 | v0.m = _mm_sub_epi32(v0.m, v1.m); 350 | return v0; 351 | } 352 | 353 | friend inline ivec4& operator *= (ivec4 &v, int32_t i) { 354 | __m128i ii = _mm_set1_epi32(i); 355 | v.m = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(_mm_mul_epu32(v.m, ii)), 356 | _mm_castsi128_ps(_mm_mul_epu32(_mm_srli_si128(v.m, 4), 357 | _mm_srli_si128(ii, 4))), 0x88)); 358 | return v; 359 | } 360 | 361 | friend inline ivec4& operator *= (ivec4 &v0, const ivec4 &v1) { 362 | v0.m = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(_mm_mul_epu32(v0.m, v1.m)), 363 | _mm_castsi128_ps(_mm_mul_epu32(_mm_srli_si128(v0.m, 4), 364 | _mm_srli_si128(v1.m, 4))), 0x88)); 365 | return v0; 366 | } 367 | /* 368 | friend inline ivec4& operator /= (ivec4 &v, int32_t f) { 369 | // TODO 370 | } 371 | 372 | friend inline ivec4& operator /= (iivec4 &v0, const ivec4 &v1) { 373 | // TODO 374 | } 375 | */ 376 | // ----------------------------------------------------------------- // 377 | 378 | friend inline const ivec4 operator + (int32_t i, const ivec4 &v) { 379 | return _mm_add_epi32(_mm_set1_epi32(i), v.m); 380 | } 381 | 382 | friend inline const ivec4 operator + (const ivec4 &v, int32_t i) { 383 | return _mm_add_epi32(v.m, _mm_set1_epi32(i)); 384 | } 385 | 386 | friend inline const ivec4 operator + (const ivec4 &v0, const ivec4 &v1) { 387 | return _mm_add_epi32(v0.m, v1.m); 388 | } 389 | 390 | friend inline const ivec4 operator - (const ivec4 &v) { 391 | return _mm_sub_epi32(_mm_setzero_si128(), v.m); 392 | } 393 | 394 | friend inline const ivec4 operator - (int32_t i, const ivec4 &v) { 395 | return _mm_sub_epi32(_mm_set1_epi32(i), v.m); 396 | } 397 | 398 | friend inline const ivec4 operator - (const ivec4 &v, int32_t i) { 399 | return _mm_sub_epi32(v.m, _mm_set1_epi32(i)); 400 | } 401 | 402 | friend inline const ivec4 operator - (const ivec4 &v0, const ivec4 &v1) { 403 | return _mm_sub_epi32(v0.m, v1.m); 404 | } 405 | 406 | friend inline const ivec4 operator * (int32_t i, const ivec4 &v) { 407 | __m128i ii = _mm_set1_epi32(i); 408 | return _mm_castps_si128(_mm_shuffle_ps( 409 | _mm_castsi128_ps(_mm_mul_epu32(ii, v.m)), 410 | _mm_castsi128_ps(_mm_mul_epu32(_mm_srli_si128(ii, 4), 411 | _mm_srli_si128(v.m, 4))), 0x88)); 412 | } 413 | 414 | friend inline const ivec4 operator * (const ivec4 &v, int32_t i) { 415 | __m128i ii = _mm_set1_epi32(i); 416 | return _mm_castps_si128(_mm_shuffle_ps( 417 | _mm_castsi128_ps(_mm_mul_epu32(v.m, ii)), 418 | 
_mm_castsi128_ps(_mm_mul_epu32(_mm_srli_si128(v.m, 4), 419 | _mm_srli_si128(ii, 4))), 0x88)); 420 | } 421 | 422 | friend inline const ivec4 operator * (const ivec4 &v0, const ivec4 &v1) { 423 | return _mm_castps_si128(_mm_shuffle_ps( 424 | _mm_castsi128_ps(_mm_mul_epu32(v0.m, v1.m)), 425 | _mm_castsi128_ps(_mm_mul_epu32(_mm_srli_si128(v0.m, 4), 426 | _mm_srli_si128(v1.m, 4))), 0x88)); 427 | } 428 | /* 429 | friend inline const ivec4 operator / (int32_t f, const ivec4 &v) { 430 | // TODO 431 | } 432 | 433 | friend inline const ivec4 operator / (const ivec4 &v, int32_t f) { 434 | // TODO 435 | } 436 | 437 | friend inline const ivec4 operator / (const ivec4 &v0, const ivec4 &v1) { 438 | // TODO 439 | } 440 | */ 441 | // ----------------------------------------------------------------- // 442 | 443 | friend inline const ivec4 abs(const ivec4 &v) { 444 | __m128i mask = _mm_srai_epi32(v.m, 31); 445 | return _mm_xor_si128(_mm_add_epi32(v.m, mask), mask); 446 | } 447 | 448 | friend inline const ivec4 clamp(const ivec4 &v, int32_t f1, int32_t f2) { 449 | return max(min(v, f2), f1); 450 | } 451 | 452 | friend inline const ivec4 clamp(const ivec4 &v0, 453 | const ivec4 &v1, const ivec4 &v2) { 454 | return max(v1, min(v2, v0)); 455 | } 456 | 457 | friend inline const ivec4 max(const ivec4 &v, int32_t i) { 458 | __m128i ii = _mm_set1_epi32(i); 459 | __m128i m = _mm_cmplt_epi32(v.m, ii); 460 | return _mm_or_si128(_mm_andnot_si128(m, v.m), _mm_and_si128(ii, m)); 461 | } 462 | 463 | friend inline const ivec4 max(const ivec4 &v0, const ivec4 &v1) { 464 | __m128i m = _mm_cmplt_epi32(v0.m, v1.m); 465 | return _mm_or_si128(_mm_andnot_si128(m, v0.m), _mm_and_si128(v1.m, m)); 466 | } 467 | 468 | friend inline const ivec4 min(const ivec4 &v, int32_t i) { 469 | __m128i ii = _mm_set1_epi32(i); 470 | __m128i m = _mm_cmpgt_epi32(v.m, ii); 471 | return _mm_or_si128(_mm_andnot_si128(m, v.m), _mm_and_si128(ii, m)); 472 | } 473 | 474 | friend inline const ivec4 min(const ivec4 &v0, const ivec4 &v1) { 475 | __m128i m = _mm_cmpgt_epi32(v0.m, v1.m); 476 | return _mm_or_si128(_mm_andnot_si128(m, v0.m), _mm_and_si128(v1.m, m)); 477 | } 478 | 479 | friend inline const ivec4 sign(const ivec4 &v) { 480 | return _mm_or_si128(_mm_add_epi32(_mm_cmpeq_epi32(v.m, _mm_setzero_si128()), 481 | _mm_set1_epi32(1)),_mm_srai_epi32(v.m, 31)); 482 | } 483 | 484 | // ----------------------------------------------------------------- // 485 | 486 | friend inline bool operator == (const ivec4 &v0, const ivec4 &v1) { 487 | return (_mm_movemask_ps(_mm_castsi128_ps(_mm_cmpeq_epi32(v0.m, v1.m))) == 0xF); 488 | } 489 | 490 | friend inline bool operator != (const ivec4 &v0, const ivec4 &v1) { 491 | return (_mm_movemask_ps(_mm_castsi128_ps(_mm_cmpeq_epi32(v0.m, v1.m))) != 0xF); 492 | } 493 | 494 | // ----------------------------------------------------------------- // 495 | 496 | union { 497 | // Vertex / Vector 498 | struct { 499 | int32_t x, y, z, w; 500 | }; 501 | // Color 502 | struct { 503 | int32_t r, g, b, a; 504 | }; 505 | // Texture coordinates 506 | struct { 507 | int32_t s, t, p, q; 508 | }; 509 | 510 | // SSE2 register 511 | __m128i m; 512 | }; 513 | }; 514 | 515 | // Template specialization for mask 0xE4 (No shuffle) 516 | template<> 517 | inline __m128i ivec4::shuffle<0xE4>(const __m128i &xmm) { 518 | return xmm; 519 | } 520 | 521 | #include "swizzle4.h" 522 | 523 | #endif 524 | -------------------------------------------------------------------------------- /source/main.cpp: 
-------------------------------------------------------------------------------- 1 | #include "tests/vec4.h" 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | /* 9 | __m128 color32_to_vec4(uint32_t c) 10 | { 11 | __m128 m = _mm_cvtpu8_ps(_mm_cvtsi32_si64(c)); 12 | m = _mm_shuffle_ps(m, m, 0x1B); 13 | return _mm_mul_ps(m, _mm_set1_ps(1.0f / 255.0f)); 14 | } 15 | 16 | uint32_t vec4_to_color32(__m128 m) 17 | { 18 | m = _mm_shuffle_ps(m, m, 0x1B); 19 | m = _mm_mul_ps(m, _mm_set1_ps(255.0f)); 20 | 21 | __m64 ll = _mm_cvtps_pi16(m); 22 | ll = _mm_or_si64(ll, _m_psrlqi(ll, 8)); 23 | ll = _mm_and_si64(ll, _m_from_int64(0xFFFF0000FFFFLL)); 24 | ll = _mm_or_si64(ll, _m_psrlqi(ll, 16)); 25 | return _m_to_int(ll); 26 | } 27 | */ 28 | 29 | #include "vec4.h" 30 | #include "dvec4.h" 31 | #include "ivec4.h" 32 | #include "uvec4.h" 33 | #include "bvec4.h" 34 | 35 | void printv(const vec4 &v) 36 | { 37 | printf("%f, %f, %f, %f\n", v.x, v.y, v.z, v.w); 38 | } 39 | 40 | void printv(const dvec4 &v) 41 | { 42 | printf("%f, %f, %f, %f\n", v.x, v.y, v.z, v.w); 43 | } 44 | 45 | void printv(const ivec4 &v) 46 | { 47 | printf("%d, %d, %d, %d\n", v.x, v.y, v.z, v.w); 48 | } 49 | 50 | void printv(const uvec4 &v) 51 | { 52 | printf("%u, %u, %u, %u\n", v.x, v.y, v.z, v.w); 53 | } 54 | 55 | void printv(const bvec4 &v) 56 | { 57 | printf("%s, %s, %s, %s\n", 58 | v.x ? "true" : "false", 59 | v.y ? "true" : "false", 60 | v.z ? "true" : "false", 61 | v.w ? "true" : "false"); 62 | } 63 | 64 | int main() 65 | { 66 | tests::vec4::testEquality(); 67 | tests::vec4::testAccessors(); 68 | tests::vec4::testSwizzleEquality(); 69 | tests::vec4::testSwizzleWrite(); 70 | tests::vec4::testUnary(); 71 | 72 | #define VECTOR dvec4 73 | 74 | VECTOR v (4, 3, 2, 1); 75 | VECTOR res(1, 2, 3, 4); 76 | 77 | v.xyzw = res.wzyx; 78 | assert(v == VECTOR(4, 3, 2, 1)); 79 | 80 | v.xyzw = res.wzyx.xxyy; 81 | assert(v == VECTOR(4, 4, 3, 3)); 82 | 83 | v.xyzw = res; 84 | assert(v == res); 85 | 86 | res = VECTOR(5, 6, 7, 8); 87 | assert(res == VECTOR(5, 6, 7, 8)); 88 | 89 | v.wzyx = VECTOR(0, 2, 1, 3); 90 | assert(v == VECTOR(3, 1, 2, 0)); 91 | 92 | v.wzyx = res.xyxy; 93 | assert(v == VECTOR(6, 5, 6, 5)); 94 | 95 | v.wzyx = res.xxxx.yyyy; 96 | assert(v == VECTOR(5, 5, 5, 5)); 97 | 98 | v.wzyx.wzyx = VECTOR(1, 2, 3, 4); 99 | assert(v == VECTOR(1, 2, 3, 4)); 100 | 101 | v.wzyx.wzyx = v.xxxx; 102 | assert(v == VECTOR(1, 1, 1, 1)); 103 | 104 | v.xyzw.xyzw = VECTOR(1, 2, 3, 4); 105 | assert(v == VECTOR(1, 2, 3, 4)); 106 | 107 | v.wzyx.wzyx = v.xyzw.xxxx; 108 | assert(v == VECTOR(1, 1, 1, 1)); 109 | 110 | res.wzyx.y += 1; 111 | assert(res == VECTOR(5, 6, 8, 8)); 112 | 113 | res.wzyx[0] += 1; 114 | assert(res == VECTOR(5, 6, 8, 9)); 115 | 116 | res.wzyx += res.xyxy; 117 | assert(res == VECTOR(11, 11, 14, 14)); 118 | 119 | res = clamp(res.zxwy, 12, 13); 120 | assert(res == VECTOR(13, 12, 13, 12)); 121 | 122 | res = res.xxxx + res.xyzw; 123 | assert(res == VECTOR(26, 25, 26, 25)); 124 | 125 | res = res.xyzw + res.zwxy; 126 | assert(res == VECTOR(52, 50, 52, 50)); 127 | 128 | // const correctness 129 | const VECTOR c(1, 2, 3, 4); 130 | assert(c == VECTOR(1, 2, 3, 4)); 131 | assert(c.xxxx == VECTOR(1, 1, 1, 1)); 132 | assert(c.xxxx.x == 1); 133 | assert(c.xyzw == c.xyzw); 134 | assert(c.wyzx == VECTOR(4, 2, 3, 1)); 135 | assert(c.wyzx.xxxx == VECTOR(4, 4, 4, 4)); 136 | assert(c.wyzx.xzyw == c.wzyx); 137 | assert(c.wyzx.wyzx == c); 138 | assert(c.wyzx.wyzx.wyzx == c.wyzx); 139 | assert(c.wyzx.wyzx.wyzx.wyzx == c); 140 | assert(c.wyzx.wyzx.wyzx.wyzx.x == 
c.x); 141 | 142 | printf("All tests passed\n"); 143 | 144 | return 0; 145 | } 146 | -------------------------------------------------------------------------------- /source/mat4.h: -------------------------------------------------------------------------------- 1 | #include "vec4.h" 2 | 3 | #ifndef __MAT4_H__ 4 | #define __MAT4_H__ 5 | 6 | class mat4 7 | { 8 | private: 9 | // Most compilers don't use pshufd (SSE2) when _mm_shuffle(x, x, mask) is used 10 | // This macro saves 2-3 movaps instructions when shuffling 11 | // This has to be a macro since mask HAS to be an immidiate value 12 | #define _mm_shufd(xmm, mask) _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(xmm), mask)) 13 | 14 | public: 15 | // Identity matrix 16 | inline mat4() { 17 | m1 = _mm_setr_ps(1.0f, 0.0f, 0.0f, 0.0f); 18 | m2 = _mm_setr_ps(0.0f, 1.0f, 0.0f, 0.0f); 19 | m3 = _mm_setr_ps(0.0f, 0.0f, 1.0f, 0.0f); 20 | m4 = _mm_setr_ps(0.0f, 0.0f, 0.0f, 1.0f); 21 | } 22 | 23 | // Scaled matrix 24 | explicit inline mat4(float f) { 25 | m1 = _mm_setr_ps( f, 0.0f, 0.0f, 0.0f); 26 | m2 = _mm_setr_ps(0.0f, f, 0.0f, 0.0f); 27 | m3 = _mm_setr_ps(0.0f, 0.0f, f, 0.0f); 28 | m4 = _mm_setr_ps(0.0f, 0.0f, 0.0f, f); 29 | } 30 | 31 | // 4 vectors constructor 32 | inline mat4(const vec4 &_v1, const vec4 &_v2, 33 | const vec4 &_v3, const vec4 &_v4) { 34 | m1 = _v1.m; 35 | m2 = _v2.m; 36 | m3 = _v3.m; 37 | m4 = _v4.m; 38 | } 39 | 40 | // Full scalar constructor 41 | inline mat4(float _f1, float _f2, float _f3, float _f4, 42 | float _f5, float _f6, float _f7, float _f8, 43 | float _f9, float _f10, float _f11, float _f12, 44 | float _f13, float _f14, float _f15, float _f16) { 45 | m1 = _mm_setr_ps( _f1, _f2, _f3, _f4); 46 | m2 = _mm_setr_ps( _f5, _f6, _f7, _f8); 47 | m3 = _mm_setr_ps( _f9, _f10, _f11, _f12); 48 | m4 = _mm_setr_ps(_f13, _f14, _f15, _f16); 49 | } 50 | 51 | // Copy constructor 52 | inline mat4(const mat4 &m) { 53 | m1 = m.m1; 54 | m2 = m.m2; 55 | m3 = m.m3; 56 | m4 = m.m4; 57 | } 58 | 59 | // ----------------------------------------------------------------- // 60 | 61 | inline void* operator new(size_t size) throw() { 62 | return _mm_malloc(size, 16); 63 | } 64 | 65 | inline void operator delete(void* ptr) { 66 | _mm_free(ptr); 67 | } 68 | 69 | // ----------------------------------------------------------------- // 70 | 71 | // Write direct access operator 72 | inline vec4& operator[](int index) { 73 | return reinterpret_cast(m[index]); 74 | } 75 | 76 | // Read direct access operator 77 | inline const vec4& operator[](int index) const { 78 | return reinterpret_cast(m[index]); 79 | } 80 | 81 | // Cast operator 82 | inline operator float*() { 83 | return reinterpret_cast(this); 84 | } 85 | 86 | // Const cast operator 87 | inline operator const float*() const { 88 | return reinterpret_cast(this); 89 | } 90 | 91 | // ----------------------------------------------------------------- // 92 | 93 | inline mat4& operator += (float f) { 94 | __m128 ff = _mm_set1_ps(f); 95 | m1 = _mm_add_ps(m1, ff); 96 | m2 = _mm_add_ps(m2, ff); 97 | m3 = _mm_add_ps(m3, ff); 98 | m4 = _mm_add_ps(m4, ff); 99 | 100 | return *this; 101 | } 102 | 103 | inline mat4& operator += (const mat4 &m) { 104 | m1 = _mm_add_ps(m1, m.m1); 105 | m2 = _mm_add_ps(m2, m.m2); 106 | m3 = _mm_add_ps(m3, m.m3); 107 | m4 = _mm_add_ps(m4, m.m4); 108 | 109 | return *this; 110 | } 111 | 112 | inline mat4& operator -= (float f) { 113 | __m128 ff = _mm_set1_ps(f); 114 | m1 = _mm_sub_ps(m1, ff); 115 | m2 = _mm_sub_ps(m2, ff); 116 | m3 = _mm_sub_ps(m3, ff); 117 | m4 = _mm_sub_ps(m4, ff); 
118 | 119 | return *this; 120 | } 121 | 122 | inline mat4& operator -= (const mat4 &m) { 123 | m1 = _mm_sub_ps(m1, m.m1); 124 | m2 = _mm_sub_ps(m2, m.m2); 125 | m3 = _mm_sub_ps(m3, m.m3); 126 | m4 = _mm_sub_ps(m4, m.m4); 127 | 128 | return *this; 129 | } 130 | 131 | inline mat4& operator *= (float f) { 132 | __m128 ff = _mm_set1_ps(f); 133 | m1 = _mm_mul_ps(m1, ff); 134 | m2 = _mm_mul_ps(m2, ff); 135 | m3 = _mm_mul_ps(m3, ff); 136 | m4 = _mm_mul_ps(m4, ff); 137 | 138 | return *this; 139 | } 140 | 141 | inline mat4& operator *= (const mat4 &m) { 142 | m1 = _mm_add_ps(_mm_add_ps( 143 | _mm_add_ps(_mm_mul_ps(_mm_shufd(m1, 0x00), m.m1), 144 | _mm_mul_ps(_mm_shufd(m1, 0x55), m.m2)), 145 | _mm_mul_ps(_mm_shufd(m1, 0xAA), m.m3)), 146 | _mm_mul_ps(_mm_shufd(m1, 0xFF), m.m4)); 147 | m2 = _mm_add_ps(_mm_add_ps( 148 | _mm_add_ps(_mm_mul_ps(_mm_shufd(m2, 0x00), m.m1), 149 | _mm_mul_ps(_mm_shufd(m2, 0x55), m.m2)), 150 | _mm_mul_ps(_mm_shufd(m2, 0xAA), m.m3)), 151 | _mm_mul_ps(_mm_shufd(m2, 0xFF), m.m4)); 152 | m3 = _mm_add_ps(_mm_add_ps( 153 | _mm_add_ps(_mm_mul_ps(_mm_shufd(m3, 0x00), m.m1), 154 | _mm_mul_ps(_mm_shufd(m3, 0x55), m.m2)), 155 | _mm_mul_ps(_mm_shufd(m3, 0xAA), m.m3)), 156 | _mm_mul_ps(_mm_shufd(m3, 0xFF), m.m4)); 157 | m4 = _mm_add_ps(_mm_add_ps( 158 | _mm_add_ps(_mm_mul_ps(_mm_shufd(m4, 0x00), m.m1), 159 | _mm_mul_ps(_mm_shufd(m4, 0x55), m.m2)), 160 | _mm_mul_ps(_mm_shufd(m4, 0xAA), m.m3)), 161 | _mm_mul_ps(_mm_shufd(m4, 0xFF), m.m4)); 162 | return *this; 163 | } 164 | 165 | inline mat4& operator /= (float f) { 166 | __m128 ff = _mm_set1_ps(f); 167 | m1 = _mm_div_ps(m1, ff); 168 | m2 = _mm_div_ps(m2, ff); 169 | m3 = _mm_div_ps(m3, ff); 170 | m4 = _mm_div_ps(m4, ff); 171 | 172 | return *this; 173 | } 174 | 175 | inline mat4& operator /= (const mat4 &m) { 176 | m1 = _mm_div_ps(m1, m.m1); 177 | m2 = _mm_div_ps(m2, m.m2); 178 | m3 = _mm_div_ps(m3, m.m3); 179 | m4 = _mm_div_ps(m4, m.m4); 180 | 181 | return *this; 182 | } 183 | 184 | // ----------------------------------------------------------------- // 185 | 186 | friend inline mat4 operator + (const mat4 &m, float f) { 187 | __m128 ff = _mm_set1_ps(f); 188 | return mat4(_mm_add_ps(m.m1, ff), _mm_add_ps(m.m2, ff), 189 | _mm_add_ps(m.m3, ff), _mm_add_ps(m.m4, ff)); 190 | } 191 | 192 | friend inline mat4 operator + (const mat4 &m0, const mat4 &m1) { 193 | return mat4(_mm_add_ps(m0.m1, m1.m1), _mm_add_ps(m0.m2, m1.m2), 194 | _mm_add_ps(m0.m3, m1.m3), _mm_add_ps(m0.m4, m1.m4)); 195 | } 196 | 197 | friend inline mat4 operator - (const mat4 &m, float f) { 198 | __m128 ff = _mm_set1_ps(f); 199 | return mat4(_mm_sub_ps(m.m1, ff), _mm_sub_ps(m.m2, ff), 200 | _mm_sub_ps(m.m3, ff), _mm_sub_ps(m.m4, ff)); 201 | } 202 | 203 | friend inline mat4 operator - (float f, const mat4 &m) { 204 | __m128 ff = _mm_set1_ps(f); 205 | return mat4(_mm_sub_ps(ff, m.m1), _mm_sub_ps(ff, m.m2), 206 | _mm_sub_ps(ff, m.m3), _mm_sub_ps(ff, m.m4)); 207 | } 208 | 209 | friend inline mat4 operator - (const mat4 &m0, const mat4 &m1) { 210 | return mat4(_mm_sub_ps(m0.m1, m1.m1), _mm_sub_ps(m0.m2, m1.m2), 211 | _mm_sub_ps(m0.m3, m1.m3), _mm_sub_ps(m0.m4, m1.m4)); 212 | } 213 | 214 | friend inline mat4 operator * (const mat4 &m, float f) { 215 | __m128 ff = _mm_set1_ps(f); 216 | return mat4(_mm_mul_ps(m.m1, ff), _mm_mul_ps(m.m2, ff), 217 | _mm_mul_ps(m.m3, ff), _mm_mul_ps(m.m4, ff)); 218 | } 219 | 220 | friend inline vec4 operator * (const mat4 &m, const vec4 &v) { 221 | return _mm_add_ps(_mm_add_ps( 222 | _mm_mul_ps(m.m1, _mm_shufd(v.m, 0x00)), 223 | _mm_mul_ps(m.m2, 
_mm_shufd(v.m, 0x55))), 224 | _mm_add_ps(_mm_mul_ps(m.m3, _mm_shufd(v.m, 0xAA)), 225 | _mm_mul_ps(m.m4, _mm_shufd(v.m, 0xFF)))); 226 | } 227 | 228 | friend inline vec4 operator * (const vec4 &v, const mat4 &m) { 229 | __m128 t1 = _mm_unpacklo_ps(m.m1, m.m2); 230 | __m128 t2 = _mm_unpacklo_ps(m.m3, m.m4); 231 | __m128 t3 = _mm_unpackhi_ps(m.m1, m.m2); 232 | __m128 t4 = _mm_unpackhi_ps(m.m3, m.m4); 233 | return _mm_add_ps(_mm_add_ps( 234 | _mm_mul_ps(_mm_movelh_ps(t1, t2), 235 | _mm_shufd(v.m, 0x00)), 236 | _mm_mul_ps(_mm_movehl_ps(t2, t1), 237 | _mm_shufd(v.m, 0x55))), 238 | _mm_add_ps(_mm_mul_ps(_mm_movelh_ps(t3, t4), 239 | _mm_shufd(v.m, 0xAA)), 240 | _mm_mul_ps(_mm_movehl_ps(t4, t3), 241 | _mm_shufd(v.m, 0xFF)))); 242 | } 243 | 244 | friend inline mat4 operator * (const mat4 &m0, const mat4 &m1) { 245 | return mat4(_mm_add_ps(_mm_add_ps( 246 | _mm_add_ps(_mm_mul_ps(_mm_shufd(m0.m1, 0x00), m1.m1), 247 | _mm_mul_ps(_mm_shufd(m0.m1, 0x55), m1.m2)), 248 | _mm_mul_ps(_mm_shufd(m0.m1, 0xAA), m1.m3)), 249 | _mm_mul_ps(_mm_shufd(m0.m1, 0xFF), m1.m4)), 250 | _mm_add_ps(_mm_add_ps( 251 | _mm_add_ps(_mm_mul_ps(_mm_shufd(m0.m2, 0x00), m1.m1), 252 | _mm_mul_ps(_mm_shufd(m0.m2, 0x55), m1.m2)), 253 | _mm_mul_ps(_mm_shufd(m0.m2, 0xAA), m1.m3)), 254 | _mm_mul_ps(_mm_shufd(m0.m2, 0xFF), m1.m4)), 255 | _mm_add_ps(_mm_add_ps( 256 | _mm_add_ps(_mm_mul_ps(_mm_shufd(m0.m3, 0x00), m1.m1), 257 | _mm_mul_ps(_mm_shufd(m0.m3, 0x55), m1.m2)), 258 | _mm_mul_ps(_mm_shufd(m0.m3, 0xAA), m1.m3)), 259 | _mm_mul_ps(_mm_shufd(m0.m3, 0xFF), m1.m4)), 260 | _mm_add_ps(_mm_add_ps( 261 | _mm_add_ps(_mm_mul_ps(_mm_shufd(m0.m4, 0x00), m1.m1), 262 | _mm_mul_ps(_mm_shufd(m0.m4, 0x55), m1.m2)), 263 | _mm_mul_ps(_mm_shufd(m0.m4, 0xAA), m1.m3)), 264 | _mm_mul_ps(_mm_shufd(m0.m4, 0xFF), m1.m4))); 265 | } 266 | 267 | friend inline mat4 operator / (const mat4 &m, float f) { 268 | __m128 ff = _mm_set1_ps(f); 269 | return mat4(_mm_div_ps(m.m1, ff), _mm_div_ps(m.m2, ff), 270 | _mm_div_ps(m.m3, ff), _mm_div_ps(m.m4, ff)); 271 | } 272 | 273 | friend inline mat4 operator / (float f, const mat4 &m) { 274 | __m128 ff = _mm_set1_ps(f); 275 | return mat4(_mm_div_ps(ff, m.m1), _mm_div_ps(ff, m.m2), 276 | _mm_div_ps(ff, m.m3), _mm_div_ps(ff, m.m4)); 277 | } 278 | 279 | friend inline mat4 operator / (const mat4 &m0, const mat4 &m1) { 280 | return mat4(_mm_div_ps(m0.m1, m1.m1), _mm_div_ps(m0.m2, m1.m2), 281 | _mm_div_ps(m0.m3, m1.m3), _mm_div_ps(m0.m4, m1.m4)); 282 | } 283 | 284 | // ----------------------------------------------------------------- // 285 | 286 | friend inline mat4 matrixCompMult(const mat4 &m0, const mat4 &m1) { 287 | return mat4(_mm_mul_ps(m0.m1, m1.m1), _mm_mul_ps(m0.m2, m1.m2), 288 | _mm_mul_ps(m0.m3, m1.m3), _mm_mul_ps(m0.m4, m1.m4)); 289 | } 290 | 291 | // ----------------------------------------------------------------- // 292 | 293 | friend inline mat4 transpose(const mat4 &m) { 294 | __m128 t1 = _mm_unpacklo_ps(m.m1, m.m2); 295 | __m128 t2 = _mm_unpacklo_ps(m.m3, m.m4); 296 | __m128 t3 = _mm_unpackhi_ps(m.m1, m.m2); 297 | __m128 t4 = _mm_unpackhi_ps(m.m3, m.m4); 298 | return mat4(_mm_movelh_ps(t1, t2), _mm_movehl_ps(t2, t1), 299 | _mm_movelh_ps(t3, t4), _mm_movehl_ps(t4, t3)); 300 | } 301 | 302 | friend inline float determinant(const mat4 &m) { 303 | __m128 r = _mm_shufd(m.m3, 0x39 ); 304 | __m128 v1 = _mm_mul_ps(r, m.m4 ); 305 | __m128 v2 = _mm_mul_ps(r, _mm_shufd(m.m4, 0x4E)); 306 | __m128 v3 = _mm_mul_ps(r, _mm_shufd(m.m4, 0x93)); 307 | __m128 r1 = _mm_sub_ps(_mm_shufd(v2, 0x39), 308 | _mm_shufd(v1, 0x4E)); 309 | __m128 r2 
= _mm_sub_ps(_mm_shufd(v3, 0x4E), v3); 310 | __m128 r3 = _mm_sub_ps(v2, _mm_shufd(v1, 0x39)); 311 | 312 | v1 = _mm_shufd(m.m2, 0x93); 313 | v2 = _mm_shufd(m.m2, 0x39); 314 | v3 = _mm_shufd(m.m2, 0x4E); 315 | __m128 d = _mm_mul_ps(_mm_add_ps(_mm_add_ps( 316 | _mm_mul_ps(v2, r1), 317 | _mm_mul_ps(v3, r2)), 318 | _mm_mul_ps(v1, r3)), m.m1); 319 | d = _mm_add_ps(d, _mm_shufd(d, 0x4E)); 320 | d = _mm_sub_ss(d, _mm_shufd(d, 0x11)); 321 | return _mm_cvtss_f32(d); 322 | } 323 | 324 | friend inline mat4 inverse(const mat4 &m) { 325 | __m128 f1 = _mm_sub_ps(_mm_mul_ps(_mm_shuffle_ps(m.m3, m.m2, 0xAA), 326 | _mm_shufd(_mm_shuffle_ps(m.m4, m.m3, 0xFF), 0x80)), 327 | _mm_mul_ps(_mm_shufd(_mm_shuffle_ps(m.m4, m.m3, 0xAA), 0x80), 328 | _mm_shuffle_ps(m.m3, m.m2, 0xFF))); 329 | __m128 f2 = _mm_sub_ps(_mm_mul_ps(_mm_shuffle_ps(m.m3, m.m2, 0x55), 330 | _mm_shufd(_mm_shuffle_ps(m.m4, m.m3, 0xFF), 0x80)), 331 | _mm_mul_ps(_mm_shufd(_mm_shuffle_ps(m.m4, m.m3, 0x55), 0x80), 332 | _mm_shuffle_ps(m.m3, m.m2, 0xFF))); 333 | __m128 f3 = _mm_sub_ps(_mm_mul_ps(_mm_shuffle_ps(m.m3, m.m2, 0x55), 334 | _mm_shufd(_mm_shuffle_ps(m.m4, m.m3, 0xAA), 0x80)), 335 | _mm_mul_ps(_mm_shufd(_mm_shuffle_ps(m.m4, m.m3, 0x55), 0x80), 336 | _mm_shuffle_ps(m.m3, m.m2, 0xAA))); 337 | __m128 f4 = _mm_sub_ps(_mm_mul_ps(_mm_shuffle_ps(m.m3, m.m2, 0x00), 338 | _mm_shufd(_mm_shuffle_ps(m.m4, m.m3, 0xFF), 0x80)), 339 | _mm_mul_ps(_mm_shufd(_mm_shuffle_ps(m.m4, m.m3, 0x00), 0x80), 340 | _mm_shuffle_ps(m.m3, m.m2, 0xFF))); 341 | __m128 f5 = _mm_sub_ps(_mm_mul_ps(_mm_shuffle_ps(m.m3, m.m2, 0x00), 342 | _mm_shufd(_mm_shuffle_ps(m.m4, m.m3, 0xAA), 0x80)), 343 | _mm_mul_ps(_mm_shufd(_mm_shuffle_ps(m.m4, m.m3, 0x00), 0x80), 344 | _mm_shuffle_ps(m.m3, m.m2, 0xAA))); 345 | __m128 f6 = _mm_sub_ps(_mm_mul_ps(_mm_shuffle_ps(m.m3, m.m2, 0x00), 346 | _mm_shufd(_mm_shuffle_ps(m.m4, m.m3, 0x55), 0x80)), 347 | _mm_mul_ps(_mm_shufd(_mm_shuffle_ps(m.m4, m.m3, 0x00), 0x80), 348 | _mm_shuffle_ps(m.m3, m.m2, 0x55))); 349 | __m128 v1 = _mm_shufd(_mm_shuffle_ps(m.m2, m.m1, 0x00), 0xA8); 350 | __m128 v2 = _mm_shufd(_mm_shuffle_ps(m.m2, m.m1, 0x55), 0xA8); 351 | __m128 v3 = _mm_shufd(_mm_shuffle_ps(m.m2, m.m1, 0xAA), 0xA8); 352 | __m128 v4 = _mm_shufd(_mm_shuffle_ps(m.m2, m.m1, 0xFF), 0xA8); 353 | __m128 s1 = _mm_set_ps(-0.0f, 0.0f, -0.0f, 0.0f); 354 | __m128 s2 = _mm_set_ps( 0.0f, -0.0f, 0.0f, -0.0f); 355 | __m128 i1 = _mm_xor_ps(s1, _mm_add_ps( 356 | _mm_sub_ps(_mm_mul_ps(v2, f1), 357 | _mm_mul_ps(v3, f2)), 358 | _mm_mul_ps(v4, f3))); 359 | __m128 i2 = _mm_xor_ps(s2, _mm_add_ps( 360 | _mm_sub_ps(_mm_mul_ps(v1, f1), 361 | _mm_mul_ps(v3, f4)), 362 | _mm_mul_ps(v4, f5))); 363 | __m128 i3 = _mm_xor_ps(s1, _mm_add_ps( 364 | _mm_sub_ps(_mm_mul_ps(v1, f2), 365 | _mm_mul_ps(v2, f4)), 366 | _mm_mul_ps(v4, f6))); 367 | __m128 i4 = _mm_xor_ps(s2, _mm_add_ps( 368 | _mm_sub_ps(_mm_mul_ps(v1, f3), 369 | _mm_mul_ps(v2, f5)), 370 | _mm_mul_ps(v3, f6))); 371 | __m128 d = _mm_mul_ps(m.m1, _mm_movelh_ps(_mm_unpacklo_ps(i1, i2), 372 | _mm_unpacklo_ps(i3, i4))); 373 | d = _mm_add_ps(d, _mm_shufd(d, 0x4E)); 374 | d = _mm_add_ps(d, _mm_shufd(d, 0x11)); 375 | d = _mm_div_ps(_mm_set1_ps(1.0f), d); 376 | return mat4(vec4(_mm_mul_ps(i1, d)), 377 | vec4(_mm_mul_ps(i2, d)), 378 | vec4(_mm_mul_ps(i3, d)), 379 | vec4(_mm_mul_ps(i4, d))); 380 | } 381 | 382 | // ----------------------------------------------------------------- // 383 | 384 | private: 385 | // SSE constructor 386 | inline mat4(const __m128 &_m1, const __m128 &_m2, 387 | const __m128 &_m3, const __m128 &_m4) { 388 | m1 = _m1; 389 | 
m2 = _m2; 390 | m3 = _m3; 391 | m4 = _m4; 392 | } 393 | 394 | union { 395 | __m128 m[4]; 396 | struct { 397 | __m128 m1; 398 | __m128 m2; 399 | __m128 m3; 400 | __m128 m4; 401 | }; 402 | 403 | /* // This code is waiting for unrestricted unions feature in c++0x 404 | vec4 v[4]; 405 | struct { 406 | vec4 v1; 407 | vec4 v2; 408 | vec4 v3; 409 | vec4 v4; 410 | }; 411 | */ 412 | }; 413 | 414 | // Avoid pollution 415 | #undef _mm_shufd 416 | }; 417 | 418 | #endif 419 | -------------------------------------------------------------------------------- /source/swizzle2.h: -------------------------------------------------------------------------------- 1 | #ifndef __SWIZZLE2_H__ 2 | #define __SWIZZLE2_H__ 3 | 4 | #define xy shuffle2_rw2<_MM_SHUFFLE2(1,0)>() 5 | #define yx shuffle2_rw2<_MM_SHUFFLE2(0,1)>() 6 | 7 | #define xx shuffle2_ro2<_MM_SHUFFLE2(0,0)>() 8 | #define yy shuffle2_ro2<_MM_SHUFFLE2(1,1)>() 9 | 10 | #define xxxx shuffle4_ro2<_MM_SHUFFLE(0,0,0,0)>() 11 | #define xxxy shuffle4_ro2<_MM_SHUFFLE(1,0,0,0)>() 12 | #define xxyx shuffle4_ro2<_MM_SHUFFLE(0,1,0,0)>() 13 | #define xxyy shuffle4_ro2<_MM_SHUFFLE(1,1,0,0)>() 14 | #define xyxx shuffle4_ro2<_MM_SHUFFLE(0,0,1,0)>() 15 | #define xyxy shuffle4_ro2<_MM_SHUFFLE(1,0,1,0)>() 16 | #define xyyx shuffle4_ro2<_MM_SHUFFLE(0,1,1,0)>() 17 | #define xyyy shuffle4_ro2<_MM_SHUFFLE(1,1,1,0)>() 18 | #define yxxx shuffle4_ro2<_MM_SHUFFLE(0,0,0,1)>() 19 | #define yxxy shuffle4_ro2<_MM_SHUFFLE(1,0,0,1)>() 20 | #define yxyx shuffle4_ro2<_MM_SHUFFLE(0,1,0,1)>() 21 | #define yxyy shuffle4_ro2<_MM_SHUFFLE(1,1,0,1)>() 22 | #define yyxx shuffle4_ro2<_MM_SHUFFLE(0,0,1,1)>() 23 | #define yyxy shuffle4_ro2<_MM_SHUFFLE(1,0,1,1)>() 24 | #define yyyx shuffle4_ro2<_MM_SHUFFLE(0,1,1,1)>() 25 | #define yyyy shuffle4_ro2<_MM_SHUFFLE(1,1,1,1)>() 26 | 27 | #endif 28 | -------------------------------------------------------------------------------- /source/swizzle4.h: -------------------------------------------------------------------------------- 1 | #ifndef __SWIZZLE4_H__ 2 | #define __SWIZZLE4_H__ 3 | 4 | #define wzyx shuffle4_rw4<_MM_SHUFFLE(0,1,2,3)>() 5 | #define zwyx shuffle4_rw4<_MM_SHUFFLE(0,1,3,2)>() 6 | #define wyzx shuffle4_rw4<_MM_SHUFFLE(0,2,1,3)>() 7 | #define ywzx shuffle4_rw4<_MM_SHUFFLE(0,2,3,1)>() 8 | #define zywx shuffle4_rw4<_MM_SHUFFLE(0,3,1,2)>() 9 | #define yzwx shuffle4_rw4<_MM_SHUFFLE(0,3,2,1)>() 10 | #define wzxy shuffle4_rw4<_MM_SHUFFLE(1,0,2,3)>() 11 | #define zwxy shuffle4_rw4<_MM_SHUFFLE(1,0,3,2)>() 12 | #define wxzy shuffle4_rw4<_MM_SHUFFLE(1,2,0,3)>() 13 | #define xwzy shuffle4_rw4<_MM_SHUFFLE(1,2,3,0)>() 14 | #define zxwy shuffle4_rw4<_MM_SHUFFLE(1,3,0,2)>() 15 | #define xzwy shuffle4_rw4<_MM_SHUFFLE(1,3,2,0)>() 16 | #define wyxz shuffle4_rw4<_MM_SHUFFLE(2,0,1,3)>() 17 | #define ywxz shuffle4_rw4<_MM_SHUFFLE(2,0,3,1)>() 18 | #define wxyz shuffle4_rw4<_MM_SHUFFLE(2,1,0,3)>() 19 | #define xwyz shuffle4_rw4<_MM_SHUFFLE(2,1,3,0)>() 20 | #define yxwz shuffle4_rw4<_MM_SHUFFLE(2,3,0,1)>() 21 | #define xywz shuffle4_rw4<_MM_SHUFFLE(2,3,1,0)>() 22 | #define zyxw shuffle4_rw4<_MM_SHUFFLE(3,0,1,2)>() 23 | #define yzxw shuffle4_rw4<_MM_SHUFFLE(3,0,2,1)>() 24 | #define zxyw shuffle4_rw4<_MM_SHUFFLE(3,1,0,2)>() 25 | #define xzyw shuffle4_rw4<_MM_SHUFFLE(3,1,2,0)>() 26 | #define yxzw shuffle4_rw4<_MM_SHUFFLE(3,2,0,1)>() 27 | #define xyzw shuffle4_rw4<_MM_SHUFFLE(3,2,1,0)>() 28 | 29 | #define xxxz shuffle4_ro4<_MM_SHUFFLE(2,0,0,0)>() 30 | #define xxxw shuffle4_ro4<_MM_SHUFFLE(3,0,0,0)>() 31 | #define xxyz shuffle4_ro4<_MM_SHUFFLE(2,1,0,0)>() 32 | #define 
xxyw shuffle4_ro4<_MM_SHUFFLE(3,1,0,0)>() 33 | #define xxzx shuffle4_ro4<_MM_SHUFFLE(0,2,0,0)>() 34 | #define xxzy shuffle4_ro4<_MM_SHUFFLE(1,2,0,0)>() 35 | #define xxzz shuffle4_ro4<_MM_SHUFFLE(2,2,0,0)>() 36 | #define xxzw shuffle4_ro4<_MM_SHUFFLE(3,2,0,0)>() 37 | #define xxwx shuffle4_ro4<_MM_SHUFFLE(0,3,0,0)>() 38 | #define xxwy shuffle4_ro4<_MM_SHUFFLE(1,3,0,0)>() 39 | #define xxwz shuffle4_ro4<_MM_SHUFFLE(2,3,0,0)>() 40 | #define xxww shuffle4_ro4<_MM_SHUFFLE(3,3,0,0)>() 41 | #define xyxz shuffle4_ro4<_MM_SHUFFLE(2,0,1,0)>() 42 | #define xyxw shuffle4_ro4<_MM_SHUFFLE(3,0,1,0)>() 43 | #define xyyz shuffle4_ro4<_MM_SHUFFLE(2,1,1,0)>() 44 | #define xyyw shuffle4_ro4<_MM_SHUFFLE(3,1,1,0)>() 45 | #define xyzx shuffle4_ro4<_MM_SHUFFLE(0,2,1,0)>() 46 | #define xyzy shuffle4_ro4<_MM_SHUFFLE(1,2,1,0)>() 47 | #define xyzz shuffle4_ro4<_MM_SHUFFLE(2,2,1,0)>() 48 | #define xywx shuffle4_ro4<_MM_SHUFFLE(0,3,1,0)>() 49 | #define xywy shuffle4_ro4<_MM_SHUFFLE(1,3,1,0)>() 50 | #define xyww shuffle4_ro4<_MM_SHUFFLE(3,3,1,0)>() 51 | #define xzxx shuffle4_ro4<_MM_SHUFFLE(0,0,2,0)>() 52 | #define xzxy shuffle4_ro4<_MM_SHUFFLE(1,0,2,0)>() 53 | #define xzxz shuffle4_ro4<_MM_SHUFFLE(2,0,2,0)>() 54 | #define xzxw shuffle4_ro4<_MM_SHUFFLE(3,0,2,0)>() 55 | #define xzyx shuffle4_ro4<_MM_SHUFFLE(0,1,2,0)>() 56 | #define xzyy shuffle4_ro4<_MM_SHUFFLE(1,1,2,0)>() 57 | #define xzyz shuffle4_ro4<_MM_SHUFFLE(2,1,2,0)>() 58 | #define xzzx shuffle4_ro4<_MM_SHUFFLE(0,2,2,0)>() 59 | #define xzzy shuffle4_ro4<_MM_SHUFFLE(1,2,2,0)>() 60 | #define xzzz shuffle4_ro4<_MM_SHUFFLE(2,2,2,0)>() 61 | #define xzzw shuffle4_ro4<_MM_SHUFFLE(3,2,2,0)>() 62 | #define xzwx shuffle4_ro4<_MM_SHUFFLE(0,3,2,0)>() 63 | #define xzwz shuffle4_ro4<_MM_SHUFFLE(2,3,2,0)>() 64 | #define xzww shuffle4_ro4<_MM_SHUFFLE(3,3,2,0)>() 65 | #define xwxx shuffle4_ro4<_MM_SHUFFLE(0,0,3,0)>() 66 | #define xwxy shuffle4_ro4<_MM_SHUFFLE(1,0,3,0)>() 67 | #define xwxz shuffle4_ro4<_MM_SHUFFLE(2,0,3,0)>() 68 | #define xwxw shuffle4_ro4<_MM_SHUFFLE(3,0,3,0)>() 69 | #define xwyx shuffle4_ro4<_MM_SHUFFLE(0,1,3,0)>() 70 | #define xwyy shuffle4_ro4<_MM_SHUFFLE(1,1,3,0)>() 71 | #define xwyw shuffle4_ro4<_MM_SHUFFLE(3,1,3,0)>() 72 | #define xwzx shuffle4_ro4<_MM_SHUFFLE(0,2,3,0)>() 73 | #define xwzz shuffle4_ro4<_MM_SHUFFLE(2,2,3,0)>() 74 | #define xwzw shuffle4_ro4<_MM_SHUFFLE(3,2,3,0)>() 75 | #define xwwx shuffle4_ro4<_MM_SHUFFLE(0,3,3,0)>() 76 | #define xwwy shuffle4_ro4<_MM_SHUFFLE(1,3,3,0)>() 77 | #define xwwz shuffle4_ro4<_MM_SHUFFLE(2,3,3,0)>() 78 | #define xwww shuffle4_ro4<_MM_SHUFFLE(3,3,3,0)>() 79 | #define yxxz shuffle4_ro4<_MM_SHUFFLE(2,0,0,1)>() 80 | #define yxxw shuffle4_ro4<_MM_SHUFFLE(3,0,0,1)>() 81 | #define yxyz shuffle4_ro4<_MM_SHUFFLE(2,1,0,1)>() 82 | #define yxyw shuffle4_ro4<_MM_SHUFFLE(3,1,0,1)>() 83 | #define yxzx shuffle4_ro4<_MM_SHUFFLE(0,2,0,1)>() 84 | #define yxzy shuffle4_ro4<_MM_SHUFFLE(1,2,0,1)>() 85 | #define yxzz shuffle4_ro4<_MM_SHUFFLE(2,2,0,1)>() 86 | #define yxwx shuffle4_ro4<_MM_SHUFFLE(0,3,0,1)>() 87 | #define yxwy shuffle4_ro4<_MM_SHUFFLE(1,3,0,1)>() 88 | #define yxww shuffle4_ro4<_MM_SHUFFLE(3,3,0,1)>() 89 | #define yyxz shuffle4_ro4<_MM_SHUFFLE(2,0,1,1)>() 90 | #define yyxw shuffle4_ro4<_MM_SHUFFLE(3,0,1,1)>() 91 | #define yyyz shuffle4_ro4<_MM_SHUFFLE(2,1,1,1)>() 92 | #define yyyw shuffle4_ro4<_MM_SHUFFLE(3,1,1,1)>() 93 | #define yyzx shuffle4_ro4<_MM_SHUFFLE(0,2,1,1)>() 94 | #define yyzy shuffle4_ro4<_MM_SHUFFLE(1,2,1,1)>() 95 | #define yyzz shuffle4_ro4<_MM_SHUFFLE(2,2,1,1)>() 96 | #define yyzw 
shuffle4_ro4<_MM_SHUFFLE(3,2,1,1)>() 97 | #define yywx shuffle4_ro4<_MM_SHUFFLE(0,3,1,1)>() 98 | #define yywy shuffle4_ro4<_MM_SHUFFLE(1,3,1,1)>() 99 | #define yywz shuffle4_ro4<_MM_SHUFFLE(2,3,1,1)>() 100 | #define yyww shuffle4_ro4<_MM_SHUFFLE(3,3,1,1)>() 101 | #define yzxx shuffle4_ro4<_MM_SHUFFLE(0,0,2,1)>() 102 | #define yzxy shuffle4_ro4<_MM_SHUFFLE(1,0,2,1)>() 103 | #define yzxz shuffle4_ro4<_MM_SHUFFLE(2,0,2,1)>() 104 | #define yzyx shuffle4_ro4<_MM_SHUFFLE(0,1,2,1)>() 105 | #define yzyy shuffle4_ro4<_MM_SHUFFLE(1,1,2,1)>() 106 | #define yzyz shuffle4_ro4<_MM_SHUFFLE(2,1,2,1)>() 107 | #define yzyw shuffle4_ro4<_MM_SHUFFLE(3,1,2,1)>() 108 | #define yzzx shuffle4_ro4<_MM_SHUFFLE(0,2,2,1)>() 109 | #define yzzy shuffle4_ro4<_MM_SHUFFLE(1,2,2,1)>() 110 | #define yzzz shuffle4_ro4<_MM_SHUFFLE(2,2,2,1)>() 111 | #define yzzw shuffle4_ro4<_MM_SHUFFLE(3,2,2,1)>() 112 | #define yzwy shuffle4_ro4<_MM_SHUFFLE(1,3,2,1)>() 113 | #define yzwz shuffle4_ro4<_MM_SHUFFLE(2,3,2,1)>() 114 | #define yzww shuffle4_ro4<_MM_SHUFFLE(3,3,2,1)>() 115 | #define ywxx shuffle4_ro4<_MM_SHUFFLE(0,0,3,1)>() 116 | #define ywxy shuffle4_ro4<_MM_SHUFFLE(1,0,3,1)>() 117 | #define ywxw shuffle4_ro4<_MM_SHUFFLE(3,0,3,1)>() 118 | #define ywyx shuffle4_ro4<_MM_SHUFFLE(0,1,3,1)>() 119 | #define ywyy shuffle4_ro4<_MM_SHUFFLE(1,1,3,1)>() 120 | #define ywyz shuffle4_ro4<_MM_SHUFFLE(2,1,3,1)>() 121 | #define ywyw shuffle4_ro4<_MM_SHUFFLE(3,1,3,1)>() 122 | #define ywzy shuffle4_ro4<_MM_SHUFFLE(1,2,3,1)>() 123 | #define ywzz shuffle4_ro4<_MM_SHUFFLE(2,2,3,1)>() 124 | #define ywzw shuffle4_ro4<_MM_SHUFFLE(3,2,3,1)>() 125 | #define ywwx shuffle4_ro4<_MM_SHUFFLE(0,3,3,1)>() 126 | #define ywwy shuffle4_ro4<_MM_SHUFFLE(1,3,3,1)>() 127 | #define ywwz shuffle4_ro4<_MM_SHUFFLE(2,3,3,1)>() 128 | #define ywww shuffle4_ro4<_MM_SHUFFLE(3,3,3,1)>() 129 | #define zxxx shuffle4_ro4<_MM_SHUFFLE(0,0,0,2)>() 130 | #define zxxy shuffle4_ro4<_MM_SHUFFLE(1,0,0,2)>() 131 | #define zxxz shuffle4_ro4<_MM_SHUFFLE(2,0,0,2)>() 132 | #define zxxw shuffle4_ro4<_MM_SHUFFLE(3,0,0,2)>() 133 | #define zxyx shuffle4_ro4<_MM_SHUFFLE(0,1,0,2)>() 134 | #define zxyy shuffle4_ro4<_MM_SHUFFLE(1,1,0,2)>() 135 | #define zxyz shuffle4_ro4<_MM_SHUFFLE(2,1,0,2)>() 136 | #define zxzx shuffle4_ro4<_MM_SHUFFLE(0,2,0,2)>() 137 | #define zxzy shuffle4_ro4<_MM_SHUFFLE(1,2,0,2)>() 138 | #define zxzz shuffle4_ro4<_MM_SHUFFLE(2,2,0,2)>() 139 | #define zxzw shuffle4_ro4<_MM_SHUFFLE(3,2,0,2)>() 140 | #define zxwx shuffle4_ro4<_MM_SHUFFLE(0,3,0,2)>() 141 | #define zxwz shuffle4_ro4<_MM_SHUFFLE(2,3,0,2)>() 142 | #define zxww shuffle4_ro4<_MM_SHUFFLE(3,3,0,2)>() 143 | #define zyxx shuffle4_ro4<_MM_SHUFFLE(0,0,1,2)>() 144 | #define zyxy shuffle4_ro4<_MM_SHUFFLE(1,0,1,2)>() 145 | #define zyxz shuffle4_ro4<_MM_SHUFFLE(2,0,1,2)>() 146 | #define zyyx shuffle4_ro4<_MM_SHUFFLE(0,1,1,2)>() 147 | #define zyyy shuffle4_ro4<_MM_SHUFFLE(1,1,1,2)>() 148 | #define zyyz shuffle4_ro4<_MM_SHUFFLE(2,1,1,2)>() 149 | #define zyyw shuffle4_ro4<_MM_SHUFFLE(3,1,1,2)>() 150 | #define zyzx shuffle4_ro4<_MM_SHUFFLE(0,2,1,2)>() 151 | #define zyzy shuffle4_ro4<_MM_SHUFFLE(1,2,1,2)>() 152 | #define zyzz shuffle4_ro4<_MM_SHUFFLE(2,2,1,2)>() 153 | #define zyzw shuffle4_ro4<_MM_SHUFFLE(3,2,1,2)>() 154 | #define zywy shuffle4_ro4<_MM_SHUFFLE(1,3,1,2)>() 155 | #define zywz shuffle4_ro4<_MM_SHUFFLE(2,3,1,2)>() 156 | #define zyww shuffle4_ro4<_MM_SHUFFLE(3,3,1,2)>() 157 | #define zzxx shuffle4_ro4<_MM_SHUFFLE(0,0,2,2)>() 158 | #define zzxy shuffle4_ro4<_MM_SHUFFLE(1,0,2,2)>() 159 | #define zzxz 
shuffle4_ro4<_MM_SHUFFLE(2,0,2,2)>() 160 | #define zzxw shuffle4_ro4<_MM_SHUFFLE(3,0,2,2)>() 161 | #define zzyx shuffle4_ro4<_MM_SHUFFLE(0,1,2,2)>() 162 | #define zzyy shuffle4_ro4<_MM_SHUFFLE(1,1,2,2)>() 163 | #define zzyz shuffle4_ro4<_MM_SHUFFLE(2,1,2,2)>() 164 | #define zzyw shuffle4_ro4<_MM_SHUFFLE(3,1,2,2)>() 165 | #define zzzx shuffle4_ro4<_MM_SHUFFLE(0,2,2,2)>() 166 | #define zzzy shuffle4_ro4<_MM_SHUFFLE(1,2,2,2)>() 167 | #define zzzz shuffle4_ro4<_MM_SHUFFLE(2,2,2,2)>() 168 | #define zzzw shuffle4_ro4<_MM_SHUFFLE(3,2,2,2)>() 169 | #define zzwx shuffle4_ro4<_MM_SHUFFLE(0,3,2,2)>() 170 | #define zzwy shuffle4_ro4<_MM_SHUFFLE(1,3,2,2)>() 171 | #define zzwz shuffle4_ro4<_MM_SHUFFLE(2,3,2,2)>() 172 | #define zzww shuffle4_ro4<_MM_SHUFFLE(3,3,2,2)>() 173 | #define zwxx shuffle4_ro4<_MM_SHUFFLE(0,0,3,2)>() 174 | #define zwxz shuffle4_ro4<_MM_SHUFFLE(2,0,3,2)>() 175 | #define zwxw shuffle4_ro4<_MM_SHUFFLE(3,0,3,2)>() 176 | #define zwyy shuffle4_ro4<_MM_SHUFFLE(1,1,3,2)>() 177 | #define zwyz shuffle4_ro4<_MM_SHUFFLE(2,1,3,2)>() 178 | #define zwyw shuffle4_ro4<_MM_SHUFFLE(3,1,3,2)>() 179 | #define zwzx shuffle4_ro4<_MM_SHUFFLE(0,2,3,2)>() 180 | #define zwzy shuffle4_ro4<_MM_SHUFFLE(1,2,3,2)>() 181 | #define zwzz shuffle4_ro4<_MM_SHUFFLE(2,2,3,2)>() 182 | #define zwzw shuffle4_ro4<_MM_SHUFFLE(3,2,3,2)>() 183 | #define zwwx shuffle4_ro4<_MM_SHUFFLE(0,3,3,2)>() 184 | #define zwwy shuffle4_ro4<_MM_SHUFFLE(1,3,3,2)>() 185 | #define zwwz shuffle4_ro4<_MM_SHUFFLE(2,3,3,2)>() 186 | #define zwww shuffle4_ro4<_MM_SHUFFLE(3,3,3,2)>() 187 | #define wxxx shuffle4_ro4<_MM_SHUFFLE(0,0,0,3)>() 188 | #define wxxy shuffle4_ro4<_MM_SHUFFLE(1,0,0,3)>() 189 | #define wxxz shuffle4_ro4<_MM_SHUFFLE(2,0,0,3)>() 190 | #define wxxw shuffle4_ro4<_MM_SHUFFLE(3,0,0,3)>() 191 | #define wxyx shuffle4_ro4<_MM_SHUFFLE(0,1,0,3)>() 192 | #define wxyy shuffle4_ro4<_MM_SHUFFLE(1,1,0,3)>() 193 | #define wxyw shuffle4_ro4<_MM_SHUFFLE(3,1,0,3)>() 194 | #define wxzx shuffle4_ro4<_MM_SHUFFLE(0,2,0,3)>() 195 | #define wxzz shuffle4_ro4<_MM_SHUFFLE(2,2,0,3)>() 196 | #define wxzw shuffle4_ro4<_MM_SHUFFLE(3,2,0,3)>() 197 | #define wxwx shuffle4_ro4<_MM_SHUFFLE(0,3,0,3)>() 198 | #define wxwy shuffle4_ro4<_MM_SHUFFLE(1,3,0,3)>() 199 | #define wxwz shuffle4_ro4<_MM_SHUFFLE(2,3,0,3)>() 200 | #define wxww shuffle4_ro4<_MM_SHUFFLE(3,3,0,3)>() 201 | #define wyxx shuffle4_ro4<_MM_SHUFFLE(0,0,1,3)>() 202 | #define wyxy shuffle4_ro4<_MM_SHUFFLE(1,0,1,3)>() 203 | #define wyxw shuffle4_ro4<_MM_SHUFFLE(3,0,1,3)>() 204 | #define wyyx shuffle4_ro4<_MM_SHUFFLE(0,1,1,3)>() 205 | #define wyyy shuffle4_ro4<_MM_SHUFFLE(1,1,1,3)>() 206 | #define wyyz shuffle4_ro4<_MM_SHUFFLE(2,1,1,3)>() 207 | #define wyyw shuffle4_ro4<_MM_SHUFFLE(3,1,1,3)>() 208 | #define wyzy shuffle4_ro4<_MM_SHUFFLE(1,2,1,3)>() 209 | #define wyzz shuffle4_ro4<_MM_SHUFFLE(2,2,1,3)>() 210 | #define wyzw shuffle4_ro4<_MM_SHUFFLE(3,2,1,3)>() 211 | #define wywx shuffle4_ro4<_MM_SHUFFLE(0,3,1,3)>() 212 | #define wywy shuffle4_ro4<_MM_SHUFFLE(1,3,1,3)>() 213 | #define wywz shuffle4_ro4<_MM_SHUFFLE(2,3,1,3)>() 214 | #define wyww shuffle4_ro4<_MM_SHUFFLE(3,3,1,3)>() 215 | #define wzxx shuffle4_ro4<_MM_SHUFFLE(0,0,2,3)>() 216 | #define wzxz shuffle4_ro4<_MM_SHUFFLE(2,0,2,3)>() 217 | #define wzxw shuffle4_ro4<_MM_SHUFFLE(3,0,2,3)>() 218 | #define wzyy shuffle4_ro4<_MM_SHUFFLE(1,1,2,3)>() 219 | #define wzyz shuffle4_ro4<_MM_SHUFFLE(2,1,2,3)>() 220 | #define wzyw shuffle4_ro4<_MM_SHUFFLE(3,1,2,3)>() 221 | #define wzzx shuffle4_ro4<_MM_SHUFFLE(0,2,2,3)>() 222 | #define wzzy 
shuffle4_ro4<_MM_SHUFFLE(1,2,2,3)>() 223 | #define wzzz shuffle4_ro4<_MM_SHUFFLE(2,2,2,3)>() 224 | #define wzzw shuffle4_ro4<_MM_SHUFFLE(3,2,2,3)>() 225 | #define wzwx shuffle4_ro4<_MM_SHUFFLE(0,3,2,3)>() 226 | #define wzwy shuffle4_ro4<_MM_SHUFFLE(1,3,2,3)>() 227 | #define wzwz shuffle4_ro4<_MM_SHUFFLE(2,3,2,3)>() 228 | #define wzww shuffle4_ro4<_MM_SHUFFLE(3,3,2,3)>() 229 | #define wwxx shuffle4_ro4<_MM_SHUFFLE(0,0,3,3)>() 230 | #define wwxy shuffle4_ro4<_MM_SHUFFLE(1,0,3,3)>() 231 | #define wwxz shuffle4_ro4<_MM_SHUFFLE(2,0,3,3)>() 232 | #define wwxw shuffle4_ro4<_MM_SHUFFLE(3,0,3,3)>() 233 | #define wwyx shuffle4_ro4<_MM_SHUFFLE(0,1,3,3)>() 234 | #define wwyy shuffle4_ro4<_MM_SHUFFLE(1,1,3,3)>() 235 | #define wwyz shuffle4_ro4<_MM_SHUFFLE(2,1,3,3)>() 236 | #define wwyw shuffle4_ro4<_MM_SHUFFLE(3,1,3,3)>() 237 | #define wwzx shuffle4_ro4<_MM_SHUFFLE(0,2,3,3)>() 238 | #define wwzy shuffle4_ro4<_MM_SHUFFLE(1,2,3,3)>() 239 | #define wwzz shuffle4_ro4<_MM_SHUFFLE(2,2,3,3)>() 240 | #define wwzw shuffle4_ro4<_MM_SHUFFLE(3,2,3,3)>() 241 | #define wwwx shuffle4_ro4<_MM_SHUFFLE(0,3,3,3)>() 242 | #define wwwy shuffle4_ro4<_MM_SHUFFLE(1,3,3,3)>() 243 | #define wwwz shuffle4_ro4<_MM_SHUFFLE(2,3,3,3)>() 244 | #define wwww shuffle4_ro4<_MM_SHUFFLE(3,3,3,3)>() 245 | 246 | #endif 247 | -------------------------------------------------------------------------------- /source/tests/test.h: -------------------------------------------------------------------------------- 1 | #ifndef TEST_H 2 | #define TEST_H 3 | 4 | #include 5 | 6 | namespace tests 7 | { 8 | 9 | class test 10 | { 11 | private: 12 | test(); 13 | }; 14 | 15 | } 16 | 17 | #endif /* TEST_H */ 18 | -------------------------------------------------------------------------------- /source/tests/vec4.cpp: -------------------------------------------------------------------------------- 1 | #include "vec4.h" 2 | #include "../vec4.h" 3 | 4 | #include 5 | 6 | namespace tests 7 | { 8 | 9 | void vec4::testEquality() 10 | { 11 | ::vec4 v(1, 2, 3, 4); 12 | 13 | assert(v == v); 14 | assert(v == ::vec4(1, 2, 3, 4)); 15 | assert(::vec4(1, 2, 3, 4) == v); 16 | assert(::vec4(1, 2, 3, 4) == ::vec4(1, 2, 3, 4)); 17 | } 18 | 19 | void vec4::testSwizzleEquality() 20 | { 21 | ::vec4 v(1, 2, 3, 4); 22 | 23 | assert(v.wzyx == ::vec4(4, 3, 2, 1)); 24 | assert(::vec4(4, 3, 2, 1) == v.wzyx); 25 | 26 | assert(v.xyzw == v); 27 | assert(v.yyyy == v.yyyy); 28 | assert(v.wzzw == v.wzzw); 29 | 30 | assert(v.xyzw.xyzw == v); 31 | assert(v.wzyx.wzyx == v); 32 | 33 | assert(v.yzwx.yzwx == v.zwxy); 34 | assert(v.xwzy.xwzy == v.xyzw); 35 | 36 | assert(v.xxxx.yyyy == ::vec4(1)); 37 | assert(v.yyyy.xxxx == ::vec4(2)); 38 | assert(v.wwww.xyzw == ::vec4(4)); 39 | 40 | assert(v.zzzz.xyzw == ::vec4(3)); 41 | assert(v.wwww.yyzz == ::vec4(4)); 42 | } 43 | 44 | void vec4::testSwizzleAccessors() 45 | { 46 | ::vec4 v(1, 2, 3, 4); 47 | /* 48 | assert(v.xyzw.x == v.x); 49 | assert(v.xyzw.y == v.y); 50 | 51 | assert(v.wzyx.x == v.w); 52 | assert(v.wzyx.y == v.z); 53 | */ 54 | } 55 | 56 | void vec4::testSwizzleWrite() 57 | { 58 | ::vec4 v; 59 | 60 | v.xyzw = ::vec4(1, 2, 3, 4); 61 | assert(v == ::vec4(1, 2, 3, 4)); 62 | 63 | v.wzyx = ::vec4(1, 2, 3, 4); 64 | assert(v == ::vec4(4, 3, 2, 1)); 65 | 66 | v.yzwx = ::vec4(1, 2, 3, 4); 67 | assert(v == ::vec4(4, 1, 2, 3)); 68 | 69 | v.yxzw = ::vec4(1, 2, 3, 4); 70 | assert(v == ::vec4(2, 1, 3, 4)); 71 | 72 | v.zywx = ::vec4(1,2,3,4); 73 | assert(v == ::vec4(4, 2, 1, 3)); 74 | 75 | v.zywx = ::vec4(1,2,3,4).xyzw; 76 | assert(v == ::vec4(4, 2, 1, 3)); 77 | 
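	// A write-swizzle scatters the right-hand components into the named
	// slots: v.zywx = (a, b, c, d) stores a->z, b->y, c->w, d->x, so the
	// vector reads back in xyzw order as (d, b, a, c) -- here (4, 2, 1, 3).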
78 | v.zywx = ::vec4(1,2,3,4).wxyz; 79 | assert(v == ::vec4(3, 1, 4, 2)); 80 | 81 | v.yzwx = ::vec4(1, 2, 3, 4).yzwx; 82 | assert(v == ::vec4(1, 2, 3, 4)); 83 | } 84 | 85 | void vec4::testUnary() 86 | { 87 | ::vec4 v(-1337, 42, -0, 85070591730234615865843651857942052864.F); // bignum = 2**126 88 | 89 | ::vec4 v_sqrt = sqrt(v); 90 | assert(std::isnan(v_sqrt.x)); 91 | assert(approxEqual(v_sqrt.y, 6.4807407F)); 92 | assert(v_sqrt.z == 0); 93 | assert(v_sqrt.w == 9223372036854775808.F); // bignum = 2**63 94 | } 95 | 96 | void vec4::testAccessors() 97 | { 98 | ::vec4 v(1, 2, 3, 4); 99 | 100 | assert(v.x == 1); 101 | assert(v.y == 2); 102 | assert(v.z == 3); 103 | assert(v.w == 4); 104 | 105 | assert(v.r == 1); 106 | assert(v.g == 2); 107 | assert(v.b == 3); 108 | assert(v.a == 4); 109 | 110 | assert(v.s == 1); 111 | assert(v.t == 2); 112 | assert(v.p == 3); 113 | assert(v.q == 4); 114 | 115 | assert(v[0] == 1); 116 | assert(v[1] == 2); 117 | assert(v[2] == 3); 118 | assert(v[3] == 4); 119 | 120 | assert(&v.y == &v.x + 1); 121 | assert(&v.z == &v.y + 1); 122 | assert(&v.w == &v.z + 1); 123 | 124 | assert(&v.g == &v.r + 1); 125 | assert(&v.b == &v.g + 1); 126 | assert(&v.a == &v.b + 1); 127 | 128 | assert(&v.t == &v.s + 1); 129 | assert(&v.p == &v.t + 1); 130 | assert(&v.q == &v.p + 1); 131 | 132 | assert(&v.x == &v.r); 133 | assert(&v.y == &v.g); 134 | assert(&v.z == &v.b); 135 | assert(&v.w == &v.a); 136 | 137 | assert(&v.x == &v.s); 138 | assert(&v.y == &v.t); 139 | assert(&v.z == &v.p); 140 | assert(&v.w == &v.q); 141 | 142 | float *f = v; 143 | 144 | assert(f[0] == 1); 145 | assert(f[1] == 2); 146 | assert(f[2] == 3); 147 | assert(f[3] == 4); 148 | 149 | assert(sizeof(v) == 4 * sizeof(float)); 150 | } 151 | 152 | template 153 | bool vec4::approxEqual(T a, T b, T fuzziness) 154 | { 155 | T diff = a - b; 156 | return std::abs(diff) <= fuzziness; 157 | } 158 | 159 | } 160 | -------------------------------------------------------------------------------- /source/tests/vec4.h: -------------------------------------------------------------------------------- 1 | #ifndef TESTS_VEC4_H 2 | #define TESTS_VEC4_H 3 | 4 | #include "test.h" 5 | 6 | namespace tests 7 | { 8 | 9 | class vec4 : 10 | public test 11 | { 12 | public: 13 | static void testEquality(); 14 | static void testInequality(); 15 | 16 | static void testSwizzleEquality(); 17 | static void testSwizzleAccessors(); 18 | static void testSwizzleRead(); 19 | static void testSwizzleWrite(); 20 | static void testSwizzleReadWrite(); 21 | 22 | static void testUnary(); 23 | 24 | static void testAccessors(); 25 | 26 | protected: 27 | template 28 | static bool approxEqual(T a, T b, T fuzziness = 1.0F / (1 << 20)); 29 | }; 30 | 31 | } 32 | 33 | #endif /* TESTS_VEC4_H */ 34 | -------------------------------------------------------------------------------- /source/uvec4.h: -------------------------------------------------------------------------------- 1 | #ifndef __UVEC4_H__ 2 | #define __UVEC4_H__ 3 | 4 | #include 5 | #include 6 | 7 | class uvec4 8 | { 9 | private: 10 | template 11 | static inline __m128i shuffle(const __m128i &xmm) { 12 | return _mm_shuffle_epi32(xmm, mask); 13 | } 14 | 15 | // Merges mask `target` with `m` into one unified mask that does the same sequential shuffle 16 | template 17 | struct _mask_merger 18 | { 19 | enum 20 | { 21 | ROW0 = ((target >> (((m >> 0) & 3) << 1)) & 3) << 0, 22 | ROW1 = ((target >> (((m >> 2) & 3) << 1)) & 3) << 2, 23 | ROW2 = ((target >> (((m >> 4) & 3) << 1)) & 3) << 4, 24 | ROW3 = ((target >> (((m >> 6) & 
3) << 1)) & 3) << 6, 25 | 26 | MASK = ROW0 | ROW1 | ROW2 | ROW3, 27 | }; 28 | 29 | private: 30 | _mask_merger(); 31 | }; 32 | 33 | // Since we are working in little endian land, this reverses the shuffle mask 34 | template 35 | struct _mask_reverser 36 | { 37 | enum 38 | { 39 | ROW0 = 0 << (((m >> 0) & 3) << 1), 40 | ROW1 = 1 << (((m >> 2) & 3) << 1), 41 | ROW2 = 2 << (((m >> 4) & 3) << 1), 42 | ROW3 = 3 << (((m >> 6) & 3) << 1), 43 | 44 | MASK = ROW0 | ROW1 | ROW2 | ROW3, 45 | }; 46 | 47 | private: 48 | _mask_reverser(); 49 | }; 50 | 51 | // Swizzle helper (Read only) 52 | template 53 | struct _swzl_ro 54 | { 55 | friend class uvec4; 56 | 57 | public: 58 | inline operator const uvec4 () const { 59 | return shuffle(v.m); 60 | } 61 | 62 | inline uint32_t operator[](int index) const { 63 | return v[(mask >> (index << 1)) & 0x3]; 64 | } 65 | 66 | // Swizzle of the swizzle, read only const 67 | template 68 | inline _swzl_ro<_mask_merger::MASK> shuffle4_ro2() const { 69 | typedef _mask_merger merged; 70 | return _swzl_ro(v); 71 | } 72 | 73 | // Swizzle of the swizzle, read only const (4) 74 | template 75 | inline _swzl_ro<_mask_merger::MASK> shuffle4_ro4() const { 76 | typedef _mask_merger merged; 77 | return _swzl_ro(v); 78 | } 79 | 80 | // Swizzle of the swizzle, read/write const 81 | template 82 | inline _swzl_ro<_mask_merger::MASK> shuffle4_rw4() const { 83 | typedef _mask_merger merged; 84 | return _swzl_ro(v); 85 | } 86 | 87 | const uint32_t &x, &y, &z, &w; 88 | const uint32_t &r, &g, &b, &a; 89 | const uint32_t &s, &t, &p, &q; 90 | 91 | private: 92 | // This massive constructor maps a vector to references 93 | inline _swzl_ro(const uvec4 &v): 94 | x(v[(mask >> 0) & 0x3]), y(v[(mask >> 2) & 0x3]), 95 | z(v[(mask >> 4) & 0x3]), w(v[(mask >> 6) & 0x3]), 96 | 97 | r(v[(mask >> 0) & 0x3]), g(v[(mask >> 2) & 0x3]), 98 | b(v[(mask >> 4) & 0x3]), a(v[(mask >> 6) & 0x3]), 99 | 100 | s(v[(mask >> 0) & 0x3]), t(v[(mask >> 2) & 0x3]), 101 | p(v[(mask >> 4) & 0x3]), q(v[(mask >> 6) & 0x3]), 102 | 103 | v(v) { 104 | // Empty 105 | } 106 | 107 | // Reference to unswizzled self 108 | const uvec4 &v; 109 | }; 110 | 111 | // Swizzle helper (Read/Write) 112 | template 113 | struct _swzl_rw 114 | { 115 | friend class uvec4; 116 | 117 | public: 118 | inline operator const uvec4 () const { 119 | return shuffle(v.m); 120 | } 121 | 122 | inline uint32_t& operator[](int index) const { 123 | return v[(mask >> (index << 1)) & 0x3]; 124 | } 125 | 126 | // Swizzle from uvec4 127 | inline uvec4& operator = (const uvec4 &r) { 128 | return v = shuffle<_mask_reverser::MASK>(r.m); 129 | } 130 | 131 | // Swizzle from same r/o mask (v1.xyzw = v2.xyzw) 132 | inline uvec4& operator = (const _swzl_ro &s) { 133 | return v = s.v; 134 | } 135 | 136 | // Swizzle from same mask (v1.xyzw = v2.xyzw) 137 | inline uvec4& operator = (const _swzl_rw &s) { 138 | return v = s.v; 139 | } 140 | 141 | // Swizzle mask => other_mask, r/o (v1.zwxy = v2.xyxy) 142 | template 143 | inline uvec4& operator = (const _swzl_ro &s) { 144 | typedef _mask_merger::MASK> merged; 145 | 146 | return v = shuffle(s.v.m); 147 | } 148 | 149 | // Swizzle mask => other_mask (v1.zwxy = v2.xyxy) 150 | template 151 | inline uvec4& operator = (const _swzl_rw &s) { 152 | typedef _mask_merger::MASK> merged; 153 | 154 | return v = shuffle(s.v.m); 155 | } 156 | 157 | // Swizzle of the swizzle, read only (v.xxxx.yyyy) (2) 158 | template 159 | inline _swzl_ro<_mask_merger::MASK> shuffle4_ro2() const { 160 | typedef _mask_merger merged; 161 | 162 | return _swzl_ro(v); 163 | } 
164 | 165 | // Swizzle of the swizzle, read only (v.xxxx.yyyy) (4) 166 | template 167 | inline _swzl_ro<_mask_merger::MASK> shuffle4_ro4() const { 168 | typedef _mask_merger merged; 169 | 170 | return _swzl_ro(v); 171 | } 172 | 173 | // Swizzle of the swizzle, read/write (v1.zyxw.wzyx = ...) 174 | template 175 | inline _swzl_rw<_mask_merger::MASK> shuffle4_rw4() { 176 | typedef _mask_merger merged; 177 | 178 | return _swzl_rw(v); 179 | } 180 | 181 | // ----------------------------------------------------------------- // 182 | 183 | inline uvec4& operator += (uint32_t s) { 184 | return v += s; 185 | } 186 | 187 | inline uvec4& operator += (const uvec4 &v0) { 188 | return v += v0.shuffle4_ro4(); 189 | } 190 | 191 | inline uvec4& operator -= (uint32_t s) { 192 | return v -= s; 193 | } 194 | 195 | inline uvec4& operator -= (const uvec4 &v0) { 196 | return v -= v0.shuffle4_ro4(); 197 | } 198 | 199 | inline uvec4& operator *= (uint32_t s) { 200 | return v *= s; 201 | } 202 | 203 | inline uvec4& operator *= (const uvec4 &v0) { 204 | return v *= v0.shuffle4_ro4(); 205 | } 206 | 207 | inline uvec4& operator /= (uint32_t s) { 208 | return v /= s; 209 | } 210 | 211 | inline uvec4& operator /= (const uvec4 &v0) { 212 | return v /= v0.shuffle4_ro4(); 213 | } 214 | 215 | // ----------------------------------------------------------------- // 216 | 217 | uint32_t &x, &y, &z, &w; 218 | uint32_t &r, &g, &b, &a; 219 | uint32_t &s, &t, &p, &q; 220 | 221 | private: 222 | // This massive contructor maps a vector to references 223 | inline _swzl_rw(uvec4 &v): 224 | x(v[(mask >> 0) & 0x3]), y(v[(mask >> 2) & 0x3]), 225 | z(v[(mask >> 4) & 0x3]), w(v[(mask >> 6) & 0x3]), 226 | 227 | r(v[(mask >> 0) & 0x3]), g(v[(mask >> 2) & 0x3]), 228 | b(v[(mask >> 4) & 0x3]), a(v[(mask >> 6) & 0x3]), 229 | 230 | s(v[(mask >> 0) & 0x3]), t(v[(mask >> 2) & 0x3]), 231 | p(v[(mask >> 4) & 0x3]), q(v[(mask >> 6) & 0x3]), 232 | 233 | v(v) { 234 | // Empty 235 | } 236 | 237 | // Refrence to unswizzled self 238 | uvec4 &v; 239 | }; 240 | 241 | // ----------------------------------------------------------------- // 242 | 243 | public: 244 | // Empty constructor 245 | inline uvec4() { 246 | m = _mm_setzero_si128(); 247 | } 248 | 249 | // Fill constructor 250 | explicit inline uvec4(uint32_t i) { 251 | m = _mm_set1_epi32(i); 252 | } 253 | 254 | // 4 var init constructor 255 | inline uvec4(uint32_t _x, uint32_t _y, uint32_t _z, uint32_t _w) { 256 | m = _mm_setr_epi32(_x, _y, _z, _w); 257 | } 258 | 259 | // Integer array constructor 260 | inline uvec4(const uint32_t* fv) { 261 | m = _mm_castps_si128(_mm_loadu_ps((const float*)fv)); 262 | } 263 | 264 | // Copy constructor 265 | inline uvec4(const uvec4 &v) { 266 | m = v.m; 267 | } 268 | 269 | // SSE2 compatible constructor 270 | inline uvec4(const __m128i &_m) { 271 | m = _m; 272 | } 273 | 274 | // ----------------------------------------------------------------- // 275 | 276 | inline void* operator new(size_t size) throw() { 277 | return _mm_malloc(size, 16); 278 | } 279 | 280 | inline void operator delete(void* ptr) { 281 | _mm_free(ptr); 282 | } 283 | 284 | // ----------------------------------------------------------------- // 285 | 286 | // Read-write swizzle 287 | template 288 | inline _swzl_rw shuffle4_rw4() { 289 | return _swzl_rw(*this); 290 | } 291 | 292 | // Read-write swizzle, const, actually read only 293 | template 294 | inline _swzl_ro shuffle4_rw4() const { 295 | return _swzl_ro(*this); 296 | } 297 | 298 | // Read-only swizzle (2) 299 | template 300 | inline _swzl_ro 
shuffle4_ro2() const { 301 | return _swzl_ro(*this); 302 | } 303 | 304 | // Read-only swizzle (4) 305 | template 306 | inline _swzl_ro shuffle4_ro4() const { 307 | return _swzl_ro(*this); 308 | } 309 | 310 | // ----------------------------------------------------------------- // 311 | 312 | // Write direct access operator 313 | inline uint32_t& operator[](int index) { 314 | return reinterpret_cast(this)[index]; 315 | } 316 | 317 | // Read direct access operator 318 | inline const uint32_t& operator[](int index) const { 319 | return reinterpret_cast(this)[index]; 320 | } 321 | 322 | // Cast operator 323 | inline operator uint32_t* () { 324 | return reinterpret_cast(this); 325 | } 326 | 327 | // Const cast operator 328 | inline operator const uint32_t* () const { 329 | return reinterpret_cast(this); 330 | } 331 | 332 | // ----------------------------------------------------------------- // 333 | 334 | friend inline uvec4& operator += (uvec4 &v, uint32_t u) { 335 | v.m = _mm_add_epi32(v.m, _mm_set1_epi32(u)); 336 | return v; 337 | } 338 | 339 | friend inline uvec4& operator += (uvec4 &v0, const uvec4 &v1) { 340 | v0.m = _mm_add_epi32(v0.m, v1.m); 341 | return v0; 342 | } 343 | 344 | friend inline uvec4& operator -= (uvec4 &v, uint32_t u) { 345 | v.m = _mm_sub_epi32(v.m, _mm_set1_epi32(u)); 346 | return v; 347 | } 348 | 349 | friend inline uvec4& operator -= (uvec4 &v0, const uvec4 &v1) { 350 | v0.m = _mm_sub_epi32(v0.m, v1.m); 351 | return v0; 352 | } 353 | 354 | friend inline uvec4& operator *= (uvec4 &v, uint32_t u) { 355 | __m128i uu = _mm_set1_epi32(u); 356 | v.m = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(_mm_mul_epu32(v.m, uu)), 357 | _mm_castsi128_ps(_mm_mul_epu32(_mm_srli_si128(v.m, 4), 358 | _mm_srli_si128(uu, 4))), 0x88)); 359 | return v; 360 | } 361 | 362 | friend inline uvec4& operator *= (uvec4 &v0, const uvec4 &v1) { 363 | v0.m = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(_mm_mul_epu32(v0.m, v1.m)), 364 | _mm_castsi128_ps(_mm_mul_epu32(_mm_srli_si128(v0.m, 4), 365 | _mm_srli_si128(v1.m, 4))), 0x88)); 366 | return v0; 367 | } 368 | /* 369 | friend inline uvec4& operator /= (uvec4 &v, uint32_t f) { 370 | // TODO 371 | } 372 | 373 | friend inline uvec4& operator /= (iuvec4 &v0, const uvec4 &v1) { 374 | // TODO 375 | } 376 | */ 377 | // ----------------------------------------------------------------- // 378 | 379 | friend inline const uvec4 operator + (uint32_t i, const uvec4 &v) { 380 | return _mm_add_epi32(_mm_set1_epi32(i), v.m); 381 | } 382 | 383 | friend inline const uvec4 operator + (const uvec4 &v, uint32_t i) { 384 | return _mm_add_epi32(v.m, _mm_set1_epi32(i)); 385 | } 386 | 387 | friend inline const uvec4 operator + (const uvec4 &v0, const uvec4 &v1) { 388 | return _mm_add_epi32(v0.m, v1.m); 389 | } 390 | 391 | friend inline const uvec4 operator - (uint32_t u, const uvec4 &v) { 392 | return _mm_sub_epi32(_mm_set1_epi32(u), v.m); 393 | } 394 | 395 | friend inline const uvec4 operator - (const uvec4 &v, uint32_t u) { 396 | return _mm_sub_epi32(v.m, _mm_set1_epi32(u)); 397 | } 398 | 399 | friend inline const uvec4 operator - (const uvec4 &v0, const uvec4 &v1) { 400 | return _mm_sub_epi32(v0.m, v1.m); 401 | } 402 | 403 | friend inline const uvec4 operator * (uint32_t u, const uvec4 &v) { 404 | __m128i uu = _mm_set1_epi32(u); 405 | return _mm_castps_si128(_mm_shuffle_ps( 406 | _mm_castsi128_ps(_mm_mul_epu32(uu, v.m)), 407 | _mm_castsi128_ps(_mm_mul_epu32(_mm_srli_si128(uu, 4), 408 | _mm_srli_si128(v.m, 4))), 0x88)); 409 | } 410 | 411 | friend inline const uvec4 
operator * (const uvec4 &v, uint32_t u) { 412 | __m128i uu = _mm_set1_epi32(u); 413 | return _mm_castps_si128(_mm_shuffle_ps( 414 | _mm_castsi128_ps(_mm_mul_epu32(v.m, uu)), 415 | _mm_castsi128_ps(_mm_mul_epu32(_mm_srli_si128(v.m, 4), 416 | _mm_srli_si128(uu, 4))), 0x88)); 417 | } 418 | 419 | friend inline const uvec4 operator * (const uvec4 &v0, const uvec4 &v1) { 420 | return _mm_castps_si128(_mm_shuffle_ps( 421 | _mm_castsi128_ps(_mm_mul_epu32(v0.m, v1.m)), 422 | _mm_castsi128_ps(_mm_mul_epu32(_mm_srli_si128(v0.m, 4), 423 | _mm_srli_si128(v1.m, 4))), 0x88)); 424 | } 425 | /* 426 | friend inline const uvec4 operator / (uint32_t u, const uvec4 &v) { 427 | // TODO 428 | } 429 | 430 | friend inline const uvec4 operator / (const uvec4 &v, uint32_t u) { 431 | // TODO 432 | } 433 | 434 | friend inline const uvec4 operator / (const uvec4 &v0, const uvec4 &v1) { 435 | // TODO 436 | } 437 | */ 438 | // ----------------------------------------------------------------- // 439 | 440 | friend inline const uvec4 clamp(const uvec4 &v, uint32_t u1, uint32_t u2) { 441 | return max(min(v, u2), u1); 442 | } 443 | 444 | friend inline const uvec4 clamp(const uvec4 &v0, 445 | const uvec4 &v1, const uvec4 &v2) { 446 | return max(v1, min(v2, v0)); 447 | } 448 | 449 | friend inline const uvec4 max(const uvec4 &v, uint32_t u) { 450 | __m128i uu = _mm_set1_epi32(u); 451 | __m128i m = _mm_set1_epi32(0x80000000); 452 | __m128i mm = _mm_cmplt_epi32(_mm_xor_si128(v.m, m), _mm_xor_si128(uu, m)); 453 | return _mm_or_si128(_mm_andnot_si128(mm, v.m), _mm_and_si128(uu, mm)); 454 | } 455 | 456 | friend inline const uvec4 max(const uvec4 &v0, const uvec4 &v1) { 457 | __m128i m = _mm_set1_epi32(0x80000000); 458 | __m128i mm = _mm_cmplt_epi32(_mm_xor_si128(v0.m, m), _mm_xor_si128(v1.m, m)); 459 | return _mm_or_si128(_mm_andnot_si128(mm, v0.m), _mm_and_si128(v1.m, mm)); 460 | } 461 | 462 | friend inline const uvec4 min(const uvec4 &v, uint32_t u) { 463 | __m128i uu = _mm_set1_epi32(u); 464 | __m128i m = _mm_set1_epi32(0x80000000); 465 | __m128i mm = _mm_cmpgt_epi32(_mm_xor_si128(v.m, m), _mm_xor_si128(uu, m)); 466 | return _mm_or_si128(_mm_andnot_si128(mm, v.m), _mm_and_si128(uu, mm)); 467 | } 468 | 469 | friend inline const uvec4 min(const uvec4 &v0, const uvec4 &v1) { 470 | __m128i m = _mm_set1_epi32(0x80000000); 471 | __m128i mm = _mm_cmpgt_epi32(_mm_xor_si128(v0.m, m), _mm_xor_si128(v1.m, m)); 472 | return _mm_or_si128(_mm_andnot_si128(mm, v0.m), _mm_and_si128(v1.m, mm)); 473 | } 474 | 475 | // ----------------------------------------------------------------- // 476 | 477 | friend inline bool operator == (const uvec4 &v0, const uvec4 &v1) { 478 | return (_mm_movemask_ps(_mm_castsi128_ps(_mm_cmpeq_epi32(v0.m, v1.m))) == 0xF); 479 | } 480 | 481 | friend inline bool operator != (const uvec4 &v0, const uvec4 &v1) { 482 | return (_mm_movemask_ps(_mm_castsi128_ps(_mm_cmpeq_epi32(v0.m, v1.m))) != 0xF); 483 | } 484 | 485 | // ----------------------------------------------------------------- // 486 | 487 | union { 488 | // Vertex / Vector 489 | struct { 490 | uint32_t x, y, z, w; 491 | }; 492 | // Color 493 | struct { 494 | uint32_t r, g, b, a; 495 | }; 496 | // Texture coordinates 497 | struct { 498 | uint32_t s, t, p, q; 499 | }; 500 | 501 | // SSE2 register 502 | __m128i m; 503 | }; 504 | }; 505 | 506 | // Template specialization for mask 0xE4 (No shuffle) 507 | template<> 508 | inline __m128i uvec4::shuffle<0xE4>(const __m128i &xmm) { 509 | return xmm; 510 | } 511 | 512 | #include "swizzle4.h" 513 | 514 | #endif 515 | 
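A note on the unsigned min/max above: SSE2 has no unsigned 32-bit comparison, so uvec4 XORs both operands with 0x80000000, which maps unsigned order onto signed order, and then uses the signed compare result to drive an and/andnot/or blend (the multiplies work around a similar gap, since a full 32-bit lane multiply only arrives with SSE4.1, by widening even/odd lanes with _mm_mul_epu32 and recombining them). Below is a minimal standalone sketch of that sign-bias trick; it is not part of the library, the helper name min_epu32_sse2 is made up for illustration, and it assumes a compiler given -msse2 as the Makefile already passes.

#include <emmintrin.h>
#include <stdint.h>
#include <assert.h>

// Unsigned 32-bit per-lane minimum using only SSE2: biasing both inputs by
// 0x80000000 turns unsigned order into signed order, so the signed compare
// produces the correct per-lane mask.
static inline __m128i min_epu32_sse2(__m128i a, __m128i b) {
	const __m128i bias = _mm_set1_epi32(0x80000000);
	__m128i gt = _mm_cmpgt_epi32(_mm_xor_si128(a, bias),
	                             _mm_xor_si128(b, bias));
	// Select b where a > b, otherwise a (and/andnot/or acts as a lane blend).
	return _mm_or_si128(_mm_andnot_si128(gt, a), _mm_and_si128(b, gt));
}

int main() {
	__m128i a = _mm_setr_epi32(1, (int)0xFFFFFFFFu, 7, 0);
	__m128i b = _mm_setr_epi32(2, 5, 7, (int)0x80000000u);
	uint32_t out[4];
	_mm_storeu_si128((__m128i*)out, min_epu32_sse2(a, b));
	assert(out[0] == 1 && out[1] == 5 && out[2] == 7 && out[3] == 0);
	return 0;
}

The library's max() applies the same bias but compares with _mm_cmplt_epi32 and selects the other operand; either direction works as long as the blend picks the operand that matches the mask.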
-------------------------------------------------------------------------------- /source/vec4.h: -------------------------------------------------------------------------------- 1 | #ifndef __VEC4_H__ 2 | #define __VEC4_H__ 3 | 4 | #include 5 | 6 | class vec4 7 | { 8 | private: 9 | // Most compilers don't use pshufd (SSE2) when _mm_shuffle(x, x, mask) is used 10 | // This macro saves 2-3 movaps instructions when shuffling 11 | // This has to be a macro since mask HAS to be an immidiate value 12 | #define _mm_shufd(xmm, mask) _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(xmm), mask)) 13 | 14 | template 15 | static inline __m128 shuffle(const __m128 &xmm) { 16 | return _mm_shufd(xmm, mask); 17 | } 18 | 19 | // Merges mask `target` with `m` into one unified mask that does the same sequential shuffle 20 | template 21 | struct _mask_merger 22 | { 23 | enum 24 | { 25 | ROW0 = ((target >> (((m >> 0) & 3) << 1)) & 3) << 0, 26 | ROW1 = ((target >> (((m >> 2) & 3) << 1)) & 3) << 2, 27 | ROW2 = ((target >> (((m >> 4) & 3) << 1)) & 3) << 4, 28 | ROW3 = ((target >> (((m >> 6) & 3) << 1)) & 3) << 6, 29 | 30 | MASK = ROW0 | ROW1 | ROW2 | ROW3, 31 | }; 32 | 33 | private: 34 | _mask_merger(); 35 | }; 36 | 37 | // Since we are working in little endian land, this reverses the shuffle mask 38 | template 39 | struct _mask_reverser 40 | { 41 | enum 42 | { 43 | ROW0 = 0 << (((m >> 0) & 3) << 1), 44 | ROW1 = 1 << (((m >> 2) & 3) << 1), 45 | ROW2 = 2 << (((m >> 4) & 3) << 1), 46 | ROW3 = 3 << (((m >> 6) & 3) << 1), 47 | 48 | MASK = ROW0 | ROW1 | ROW2 | ROW3, 49 | }; 50 | 51 | private: 52 | _mask_reverser(); 53 | }; 54 | 55 | // Swizzle helper (Read only) 56 | template 57 | struct _swzl_ro 58 | { 59 | friend class vec4; 60 | 61 | public: 62 | inline operator const vec4 () const { 63 | return shuffle(v.m); 64 | } 65 | 66 | inline float operator[](int index) const { 67 | return v[(mask >> (index << 1)) & 0x3]; 68 | } 69 | 70 | // Swizzle of the swizzle, read only const (2) 71 | template 72 | inline _swzl_ro<_mask_merger::MASK> shuffle4_ro2() const { 73 | typedef _mask_merger merged; 74 | return _swzl_ro(v); 75 | } 76 | 77 | // Swizzle of the swizzle, read only const (4) 78 | template 79 | inline _swzl_ro<_mask_merger::MASK> shuffle4_ro4() const { 80 | typedef _mask_merger merged; 81 | return _swzl_ro(v); 82 | } 83 | 84 | // Swizzle of the swizzle, read/write const 85 | template 86 | inline _swzl_ro<_mask_merger::MASK> shuffle4_rw4() const { 87 | typedef _mask_merger merged; 88 | return _swzl_ro(v); 89 | } 90 | 91 | const float &x, &y, &z, &w; 92 | const float &r, &g, &b, &a; 93 | const float &s, &t, &p, &q; 94 | 95 | private: 96 | // This massive constructor maps a vector to references 97 | inline _swzl_ro(const vec4 &v): 98 | x(v[(mask >> 0) & 0x3]), y(v[(mask >> 2) & 0x3]), 99 | z(v[(mask >> 4) & 0x3]), w(v[(mask >> 6) & 0x3]), 100 | 101 | r(v[(mask >> 0) & 0x3]), g(v[(mask >> 2) & 0x3]), 102 | b(v[(mask >> 4) & 0x3]), a(v[(mask >> 6) & 0x3]), 103 | 104 | s(v[(mask >> 0) & 0x3]), t(v[(mask >> 2) & 0x3]), 105 | p(v[(mask >> 4) & 0x3]), q(v[(mask >> 6) & 0x3]), 106 | 107 | v(v) { 108 | // Empty 109 | } 110 | 111 | // Reference to unswizzled self 112 | const vec4 &v; 113 | }; 114 | 115 | // Swizzle helper (Read/Write) 116 | template 117 | struct _swzl_rw 118 | { 119 | friend class vec4; 120 | 121 | public: 122 | inline operator const vec4 () const { 123 | return shuffle(v.m); 124 | } 125 | 126 | inline float& operator[](int index) { 127 | return v[(mask >> (index << 1)) & 0x3]; 128 | } 129 | 130 | // Swizzle 
from vec4 131 | inline vec4& operator = (const vec4 &r) { 132 | return v = shuffle<_mask_reverser::MASK>(r.m); 133 | } 134 | 135 | // Swizzle from same r/o mask (v1.xyzw = v2.xyzw) 136 | inline vec4& operator = (const _swzl_ro &s) { 137 | return v = s.v; 138 | } 139 | 140 | // Swizzle from same mask (v1.xyzw = v2.xyzw) 141 | inline vec4& operator = (const _swzl_rw &s) { 142 | return v = s.v; 143 | } 144 | 145 | // Swizzle mask => other_mask, r/o (v1.zwxy = v2.xyxy) 146 | template 147 | inline vec4& operator = (const _swzl_ro &s) { 148 | typedef _mask_merger::MASK> merged; 149 | 150 | return v = shuffle(s.v.m); 151 | } 152 | 153 | // Swizzle mask => other_mask (v1.zwxy = v2.xyxy) 154 | template 155 | inline vec4& operator = (const _swzl_rw &s) { 156 | typedef _mask_merger::MASK> merged; 157 | 158 | return v = shuffle(s.v.m); 159 | } 160 | 161 | // Swizzle of the swizzle, read only (v.xxxx.yyyy) (2) 162 | template 163 | inline _swzl_ro<_mask_merger::MASK> shuffle4_ro2() const { 164 | typedef _mask_merger merged; 165 | 166 | return _swzl_ro(v); 167 | } 168 | 169 | // Swizzle of the swizzle, read only (v.xxxx.yyyy) (4) 170 | template 171 | inline _swzl_ro<_mask_merger::MASK> shuffle4_ro4() const { 172 | typedef _mask_merger merged; 173 | 174 | return _swzl_ro(v); 175 | } 176 | 177 | // Swizzle of the swizzle, read/write (v1.zyxw.wzyx = ...) 178 | template 179 | inline _swzl_rw<_mask_merger::MASK> shuffle4_rw4() { 180 | typedef _mask_merger merged; 181 | 182 | return _swzl_rw(v); 183 | } 184 | 185 | // ----------------------------------------------------------------- // 186 | 187 | inline vec4& operator += (float s) { 188 | return v += s; 189 | } 190 | 191 | inline vec4& operator += (const vec4 &v0) { 192 | return v += v0.shuffle4_ro4(); 193 | } 194 | 195 | inline vec4& operator -= (float s) { 196 | return v -= s; 197 | } 198 | 199 | inline vec4& operator -= (const vec4 &v0) { 200 | return v -= v0.shuffle4_ro4(); 201 | } 202 | 203 | inline vec4& operator *= (float s) { 204 | return v *= s; 205 | } 206 | 207 | inline vec4& operator *= (const vec4 &v0) { 208 | return v *= v0.shuffle4_ro4(); 209 | } 210 | 211 | inline vec4& operator /= (float s) { 212 | return v /= s; 213 | } 214 | 215 | inline vec4& operator /= (const vec4 &v0) { 216 | return v /= v0.shuffle4_ro4(); 217 | } 218 | 219 | // ----------------------------------------------------------------- // 220 | 221 | float &x, &y, &z, &w; 222 | float &r, &g, &b, &a; 223 | float &s, &t, &p, &q; 224 | 225 | private: 226 | // This massive contructor maps a vector to references 227 | inline _swzl_rw(vec4 &v): 228 | x(v[(mask >> 0) & 0x3]), y(v[(mask >> 2) & 0x3]), 229 | z(v[(mask >> 4) & 0x3]), w(v[(mask >> 6) & 0x3]), 230 | 231 | r(v[(mask >> 0) & 0x3]), g(v[(mask >> 2) & 0x3]), 232 | b(v[(mask >> 4) & 0x3]), a(v[(mask >> 6) & 0x3]), 233 | 234 | s(v[(mask >> 0) & 0x3]), t(v[(mask >> 2) & 0x3]), 235 | p(v[(mask >> 4) & 0x3]), q(v[(mask >> 6) & 0x3]), 236 | 237 | v(v) { 238 | // Empty 239 | } 240 | 241 | // Refrence to unswizzled self 242 | vec4 &v; 243 | }; 244 | 245 | // ----------------------------------------------------------------- // 246 | 247 | public: 248 | // Empty constructor 249 | inline vec4() { 250 | m = _mm_setzero_ps(); 251 | } 252 | 253 | // Fill constructor 254 | explicit inline vec4(float f) { 255 | m = _mm_set1_ps(f); 256 | } 257 | 258 | // 4 var init constructor 259 | inline vec4(float _x, float _y, float _z, float _w) { 260 | m = _mm_setr_ps(_x, _y, _z, _w); 261 | } 262 | 263 | // Float array constructor 264 | inline vec4(const 
float* fv) { 265 | m = _mm_loadu_ps(fv); 266 | } 267 | 268 | // Copy constructor 269 | inline vec4(const vec4 &v) { 270 | m = v.m; 271 | } 272 | 273 | // SSE compatible constructor 274 | inline vec4(const __m128 &_m) { 275 | m = _m; 276 | } 277 | 278 | // ----------------------------------------------------------------- // 279 | 280 | inline void* operator new(size_t size) throw() { 281 | return _mm_malloc(size, 16); 282 | } 283 | 284 | inline void operator delete(void* ptr) { 285 | _mm_free(ptr); 286 | } 287 | 288 | // ----------------------------------------------------------------- // 289 | 290 | // Read-write swizzle 291 | template 292 | inline _swzl_rw shuffle4_rw4() { 293 | return _swzl_rw(*this); 294 | } 295 | 296 | // Read-write swizzle, const, actually read only 297 | template 298 | inline _swzl_ro shuffle4_rw4() const { 299 | return _swzl_ro(*this); 300 | } 301 | 302 | // Read-only swizzle (2) 303 | template 304 | inline _swzl_ro shuffle4_ro2() const { 305 | return _swzl_ro(*this); 306 | } 307 | 308 | // Read-only swizzle (4) 309 | template 310 | inline _swzl_ro shuffle4_ro4() const { 311 | return _swzl_ro(*this); 312 | } 313 | 314 | // ----------------------------------------------------------------- // 315 | 316 | // Write direct access operator 317 | inline float& operator[](int index) { 318 | return reinterpret_cast(this)[index]; 319 | } 320 | 321 | // Read direct access operator 322 | inline const float& operator[](int index) const { 323 | return reinterpret_cast(this)[index]; 324 | } 325 | 326 | // Cast operator 327 | inline operator float* () { 328 | return reinterpret_cast(this); 329 | } 330 | 331 | // Const cast operator 332 | inline operator const float* () const { 333 | return reinterpret_cast(this); 334 | } 335 | 336 | // ----------------------------------------------------------------- // 337 | 338 | friend inline vec4& operator += (vec4 &v, float f) { 339 | v.m = _mm_add_ps(v.m, _mm_set1_ps(f)); 340 | return v; 341 | } 342 | 343 | friend inline vec4& operator += (vec4 &v0, const vec4 &v1) { 344 | v0.m = _mm_add_ps(v0.m, v1.m); 345 | return v0; 346 | } 347 | 348 | friend inline vec4& operator -= (vec4 &v, float f) { 349 | v.m = _mm_sub_ps(v.m, _mm_set1_ps(f)); 350 | return v; 351 | } 352 | 353 | friend inline vec4& operator -= (vec4 &v0, const vec4 &v1) { 354 | v0.m = _mm_sub_ps(v0.m, v1.m); 355 | return v0; 356 | } 357 | 358 | friend inline vec4& operator *= (vec4 &v, float f) { 359 | v.m = _mm_mul_ps(v.m, _mm_set1_ps(f)); 360 | return v; 361 | } 362 | 363 | friend inline vec4& operator *= (vec4 &v0, const vec4 &v1) { 364 | v0.m = _mm_mul_ps(v0.m, v1.m); 365 | return v0; 366 | } 367 | 368 | friend inline vec4& operator /= (vec4 &v, float f) { 369 | v.m = _mm_div_ps(v.m, _mm_set1_ps(f)); 370 | return v; 371 | } 372 | 373 | friend inline vec4& operator /= (vec4 &v0, const vec4 &v1) { 374 | v0.m = _mm_div_ps(v0.m, v1.m); 375 | return v0; 376 | } 377 | 378 | // ----------------------------------------------------------------- // 379 | 380 | friend inline const vec4 operator + (float f, const vec4 &v) { 381 | return _mm_add_ps(_mm_set1_ps(f), v.m); 382 | } 383 | 384 | friend inline const vec4 operator + (const vec4 &v, float f) { 385 | return _mm_add_ps(v.m, _mm_set1_ps(f)); 386 | } 387 | 388 | friend inline const vec4 operator + (const vec4 &v0, const vec4 &v1) { 389 | return _mm_add_ps(v0.m, v1.m); 390 | } 391 | 392 | friend inline const vec4 operator - (const vec4 &v) { 393 | return _mm_xor_ps(v.m, _mm_set1_ps(-0.f)); 394 | } 395 | 396 | friend inline const vec4 
operator - (float f, const vec4 &v) { 397 | return _mm_sub_ps( _mm_set1_ps(f), v.m); 398 | } 399 | 400 | friend inline const vec4 operator - (const vec4 &v, float f) { 401 | return _mm_sub_ps(v.m, _mm_set1_ps(f)); 402 | } 403 | 404 | friend inline const vec4 operator - (const vec4 &v0, const vec4 &v1) { 405 | return _mm_sub_ps(v0.m, v1.m); 406 | } 407 | 408 | friend inline const vec4 operator * (float f, const vec4 &v) { 409 | return _mm_mul_ps(_mm_set1_ps(f), v.m); 410 | } 411 | 412 | friend inline const vec4 operator * (const vec4 &v, float f) { 413 | return _mm_mul_ps(v.m, _mm_set1_ps(f)); 414 | } 415 | 416 | friend inline const vec4 operator * (const vec4 &v0, const vec4 &v1) { 417 | return _mm_mul_ps(v0.m, v1.m); 418 | } 419 | 420 | friend inline const vec4 operator / (float f, const vec4 &v) { 421 | return _mm_div_ps(_mm_set1_ps(f), v.m); 422 | } 423 | 424 | friend inline const vec4 operator / (const vec4 &v, float f) { 425 | return _mm_div_ps(v.m, _mm_set1_ps(f)); 426 | } 427 | 428 | friend inline const vec4 operator / (const vec4 &v0, const vec4 &v1) { 429 | return _mm_div_ps(v0.m, v1.m); 430 | } 431 | 432 | // ----------------------------------------------------------------- // 433 | 434 | friend inline const vec4 pow(const vec4 &v0, const vec4 &v1) { 435 | return exp2(log2(abs(v0)) * v1); 436 | } 437 | /* 438 | friend inline const vec4 exp(const vec4 &v) { 439 | // TODO 440 | } 441 | */ 442 | friend inline const vec4 log(const vec4 &v) { 443 | return log2(v) * 0.69314718055995f; 444 | } 445 | 446 | friend inline const vec4 exp2(const vec4 &v) { 447 | __m128i ix = _mm_cvttps_epi32(_mm_add_ps(v.m, _mm_castsi128_ps( 448 | _mm_andnot_si128(_mm_srai_epi32( 449 | _mm_cvttps_epi32(v.m), 31), 450 | _mm_set1_epi32(0x3F7FFFFF))))); 451 | __m128 f = _mm_mul_ps(_mm_sub_ps(_mm_cvtepi32_ps(ix), v.m), 452 | _mm_set1_ps(0.69314718055994530942f)); 453 | __m128 hi = _mm_add_ps(_mm_mul_ps(f, _mm_set1_ps(-0.0001413161f)), 454 | _mm_set1_ps( 0.0013298820f)); 455 | __m128 lo = _mm_add_ps(_mm_mul_ps(f, _mm_set1_ps(-0.1666653019f)), 456 | _mm_set1_ps( 0.4999999206f)); 457 | hi = _mm_add_ps(_mm_mul_ps(f, hi), _mm_set1_ps(-0.0083013598f)); 458 | hi = _mm_add_ps(_mm_mul_ps(f, hi), _mm_set1_ps( 0.0416573475f)); 459 | lo = _mm_add_ps(_mm_mul_ps(f, lo), _mm_set1_ps(-0.9999999995f)); 460 | lo = _mm_add_ps(_mm_mul_ps(f, lo), _mm_set1_ps(1.0f)); 461 | __m128 f2 = _mm_mul_ps(f, f); 462 | return _mm_or_ps(_mm_mul_ps(_mm_add_ps( 463 | _mm_mul_ps(_mm_mul_ps(f2, f2), hi), lo), 464 | _mm_castsi128_ps(_mm_and_si128(_mm_slli_epi32(( 465 | _mm_add_epi32(ix, _mm_set1_epi32(127))), 23), 466 | _mm_cmpgt_epi32(ix, _mm_set1_epi32(-128))))), 467 | _mm_castsi128_ps(_mm_srli_epi32( 468 | _mm_cmpgt_epi32(ix, _mm_set1_epi32( 128)), 1))); 469 | } 470 | 471 | friend inline const vec4 log2(const vec4 &v) { 472 | __m128i e = _mm_sub_epi32(_mm_srli_epi32(_mm_castps_si128( 473 | _mm_andnot_ps(_mm_set1_ps(-0.0f), v.m)), 23), 474 | _mm_set1_epi32(127)); 475 | __m128 y = _mm_sub_ps(_mm_castsi128_ps(_mm_sub_epi32( 476 | _mm_castps_si128(v.m), _mm_slli_epi32(e, 23))), 477 | _mm_set1_ps(1.0f)); 478 | __m128 x2 = _mm_mul_ps(y, y); 479 | __m128 x4 = _mm_mul_ps(x2, x2); 480 | __m128 hi = _mm_add_ps(_mm_mul_ps(y, _mm_set1_ps(-0.00931049621349f)), 481 | _mm_set1_ps( 0.05206469089414f)); 482 | __m128 lo = _mm_add_ps(_mm_mul_ps(y, _mm_set1_ps( 0.47868480909345f)), 483 | _mm_set1_ps(-0.72116591947498f)); 484 | hi = _mm_add_ps(_mm_mul_ps(y, hi), _mm_set1_ps(-0.13753123777116f)); 485 | hi = _mm_add_ps(_mm_mul_ps(y, hi), _mm_set1_ps( 
    friend inline const vec4 log2(const vec4 &v) {
        __m128i e = _mm_sub_epi32(_mm_srli_epi32(_mm_castps_si128(
                        _mm_andnot_ps(_mm_set1_ps(-0.0f), v.m)), 23),
                        _mm_set1_epi32(127));
        __m128 y = _mm_sub_ps(_mm_castsi128_ps(_mm_sub_epi32(
                       _mm_castps_si128(v.m), _mm_slli_epi32(e, 23))),
                       _mm_set1_ps(1.0f));
        __m128 x2 = _mm_mul_ps(y, y);
        __m128 x4 = _mm_mul_ps(x2, x2);
        __m128 hi = _mm_add_ps(_mm_mul_ps(y, _mm_set1_ps(-0.00931049621349f)),
                               _mm_set1_ps( 0.05206469089414f));
        __m128 lo = _mm_add_ps(_mm_mul_ps(y, _mm_set1_ps( 0.47868480909345f)),
                               _mm_set1_ps(-0.72116591947498f));
        hi = _mm_add_ps(_mm_mul_ps(y, hi), _mm_set1_ps(-0.13753123777116f));
        hi = _mm_add_ps(_mm_mul_ps(y, hi), _mm_set1_ps( 0.24187369696082f));
        hi = _mm_add_ps(_mm_mul_ps(y, hi), _mm_set1_ps(-0.34730547155299f));
        lo = _mm_add_ps(_mm_mul_ps(y, lo), _mm_set1_ps(1.442689881667200f));
        return _mm_add_ps(_mm_add_ps(_mm_mul_ps(x4, hi),
                          _mm_mul_ps(y, lo)), _mm_cvtepi32_ps(e));
    }

    friend inline const vec4 sqrt(const vec4 &v) {
        return _mm_sqrt_ps(v.m);
    }

    friend inline const vec4 inversesqrt(const vec4 &v) {
        return _mm_div_ps(_mm_set1_ps(1.0f), _mm_sqrt_ps(v.m));
    }

    // ----------------------------------------------------------------- //

    friend inline const vec4 abs(const vec4 &v) {
        return _mm_andnot_ps(_mm_set1_ps(-0.f), v.m);
    }

    friend inline const vec4 ceil(const vec4 &v) {
        __m128 z = _mm_set1_ps(-0.f);
        __m128 i = _mm_cvtepi32_ps(_mm_cvttps_epi32(v.m));
        __m128 m = _mm_cmple_ps(_mm_andnot_ps(z, v.m),
                                _mm_set1_ps(8388608.0f));
        return _mm_xor_ps(_mm_andnot_ps(m, v.m), _mm_and_ps(m,
                   _mm_or_ps(_mm_and_ps(z, v.m), _mm_add_ps(i,
                       _mm_and_ps(_mm_cmplt_ps(i, v.m),
                                  _mm_set1_ps(1.0f))))));
    }

    friend inline const vec4 clamp(const vec4 &v0, float f1, float f2) {
        return _mm_max_ps(_mm_set1_ps(f1),
                          _mm_min_ps(_mm_set1_ps(f2), v0.m));
    }

    friend inline const vec4 clamp(const vec4 &v0,
                                   const vec4 &v1, const vec4 &v2) {
        return _mm_max_ps(v1.m, _mm_min_ps(v2.m, v0.m));
    }

    friend inline const vec4 floor(const vec4 &v) {
        __m128 z = _mm_set1_ps(-0.f);
        __m128 i = _mm_cvtepi32_ps(_mm_cvttps_epi32(v.m));
        __m128 m = _mm_cmple_ps(_mm_andnot_ps(z, v.m),
                                _mm_set1_ps(8388608.0f));
        return _mm_xor_ps(_mm_andnot_ps(m, v.m), _mm_and_ps(m,
                   _mm_or_ps(_mm_and_ps(z, v.m), _mm_sub_ps(i,
                       _mm_and_ps(_mm_cmpgt_ps(i, v.m),
                                  _mm_set1_ps(1.0f))))));
    }

    friend inline const vec4 fract(const vec4 &v) {
        __m128 z = _mm_set1_ps(-0.f);
        __m128 m = _mm_cmple_ps(_mm_andnot_ps(z, v.m),
                                _mm_set1_ps(8388608.f));
        return _mm_or_ps(_mm_and_ps(z, v.m), _mm_or_ps(_mm_andnot_ps(m,
                   _mm_and_ps(_mm_cmpunord_ps(v.m, v.m), v.m)),
                   _mm_and_ps(m, _mm_sub_ps(v.m, _mm_cvtepi32_ps(
                       _mm_cvttps_epi32(v.m))))));
    }

    friend inline const vec4 max(const vec4 &v, float f) {
        return _mm_max_ps(v.m, _mm_set1_ps(f));
    }

    friend inline const vec4 max(const vec4 &v0, const vec4 &v1) {
        return _mm_max_ps(v0.m, v1.m);
    }

    friend inline const vec4 min(const vec4 &v, float f) {
        return _mm_min_ps(v.m, _mm_set1_ps(f));
    }

    friend inline const vec4 min(const vec4 &v0, const vec4 &v1) {
        return _mm_min_ps(v0.m, v1.m);
    }

    friend inline const vec4 mix(const vec4 &v0, const vec4 &v1,
                                 float f) {
        __m128 ff = _mm_set1_ps(f);
        return _mm_add_ps(_mm_mul_ps(v0.m, _mm_sub_ps(_mm_set1_ps(1.f), ff)),
                          _mm_mul_ps(v1.m, ff));
    }

        // mix(x, y, a) = x*(1 - a) + y*a; the blend factor is the third argument
    friend inline const vec4 mix(const vec4 &v0, const vec4 &v1,
                                 const vec4 &v2) {
        return _mm_add_ps(_mm_mul_ps(v0.m, _mm_sub_ps(_mm_set1_ps(1.f), v2.m)),
                          _mm_mul_ps(v1.m, v2.m));
    }
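
        // ceil(), floor() and fract() above, and round(), trunc(), mod() and
        // modf() below, all compare |x| against 8388608.0f (2^23): any float
        // at or above that magnitude is already an integer and is passed
        // through untouched, while values below it fit safely in the int32
        // range used by the _mm_cvt*_epi32 conversions. The -0.f masks keep
        // the sign of negative zero intact, e.g. ceil(-0.25f) yields -0.0f
        // rather than +0.0f. roundEven() uses the same constant as a magic
        // number: adding and then subtracting (sign | 2^23) rounds in the
        // default round-to-nearest-even mode.
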
    friend inline const vec4 mod(const vec4 &v, float f) {
        __m128 ff = _mm_set1_ps(f);
        __m128 d = _mm_div_ps(v.m, ff);
        __m128 m = _mm_cmpunord_ps(d,
                       _mm_cmpge_ps(_mm_andnot_ps(_mm_set1_ps(-0.0f), d),
                                    _mm_set1_ps(8388608.0f)));
        return _mm_sub_ps(v.m, _mm_mul_ps(ff, _mm_or_ps(_mm_andnot_ps(m,
                   _mm_cvtepi32_ps(_mm_cvtps_epi32(
                       _mm_sub_ps(d, _mm_set1_ps(0.5f))))),
                   _mm_and_ps(m, d))));
    }

    friend inline const vec4 mod(const vec4 &v0, const vec4 &v1) {
        __m128 d = _mm_div_ps(v0.m, v1.m);
        __m128 m = _mm_cmpunord_ps(d,
                       _mm_cmpge_ps(_mm_andnot_ps(_mm_set1_ps(-0.0f), d),
                                    _mm_set1_ps(8388608.0f)));
        return _mm_sub_ps(v0.m, _mm_mul_ps(v1.m, _mm_or_ps(_mm_andnot_ps(m,
                   _mm_cvtepi32_ps(_mm_cvtps_epi32(
                       _mm_sub_ps(d, _mm_set1_ps(0.5f))))),
                   _mm_and_ps(m, d))));
    }

    friend inline const vec4 modf(const vec4 &v0, vec4 &v1) {
        __m128 m = _mm_cmpunord_ps(v0.m,
                       _mm_cmpge_ps(_mm_andnot_ps(_mm_set1_ps(-0.0f), v0.m),
                                    _mm_set1_ps(8388608.0f)));
        v1.m = _mm_or_ps(_mm_andnot_ps(m, _mm_cvtepi32_ps(_mm_cvtps_epi32(
                   _mm_sub_ps(v0.m, _mm_set1_ps(0.5f))))),
                   _mm_and_ps(m, v0.m));
        return _mm_sub_ps(v0.m, v1.m);
    }

    friend inline const vec4 round(const vec4 &v) {
        __m128 z = _mm_set1_ps(-0.f);
        __m128 s = _mm_and_ps(v.m, z);
        __m128 m = _mm_cmple_ps(_mm_andnot_ps(z, v.m),
                                _mm_set1_ps(8388608.0f));
        return _mm_or_ps(_mm_or_ps(_mm_andnot_ps(m, v.m), _mm_and_ps(
                   _mm_cvtepi32_ps(_mm_cvttps_epi32(_mm_add_ps(v.m,
                       _mm_or_ps(_mm_set1_ps(0.5f), s)))), m)), s);
    }

    friend inline const vec4 roundEven(const vec4 &v) {
        __m128 m = _mm_or_ps(_mm_and_ps(_mm_set1_ps(-0.0f), v.m),
                             _mm_set1_ps(8388608.0f));
        return _mm_sub_ps(_mm_add_ps(v.m, m), m);
    }

    friend inline const vec4 sign(const vec4 &v) {
        return _mm_and_ps(_mm_or_ps(_mm_and_ps(v.m, _mm_set1_ps(-0.f)),
                                    _mm_set1_ps(1.0f)),
                          _mm_cmpneq_ps(v.m, _mm_setzero_ps()));
    }

    friend inline const vec4 smoothstep(float f1, float f2,
                                        const vec4 &v) {
        __m128 ff1 = _mm_set1_ps(f1);
        __m128 c = _mm_max_ps(_mm_min_ps(_mm_div_ps(_mm_sub_ps(v.m, ff1),
                       _mm_sub_ps(_mm_set1_ps(f2), ff1)),
                       _mm_set1_ps(1.f)), _mm_setzero_ps());
        return _mm_mul_ps(_mm_mul_ps(c, c),
                          _mm_sub_ps(_mm_set1_ps(3.0f), _mm_add_ps(c, c)));
    }

    friend inline const vec4 smoothstep(const vec4 &v0,
                                        const vec4 &v1, const vec4 &v2) {
        __m128 c = _mm_max_ps(_mm_min_ps(_mm_div_ps(_mm_sub_ps(v2.m, v0.m),
                       _mm_sub_ps(v1.m, v0.m)), _mm_set1_ps(1.f)),
                       _mm_setzero_ps());
        return _mm_mul_ps(_mm_mul_ps(c, c),
                          _mm_sub_ps(_mm_set1_ps(3.0f), _mm_add_ps(c, c)));
    }

        // step(edge, x): 0.0 where x < edge, 1.0 elsewhere (edge is broadcast)
    friend inline const vec4 step(float f, const vec4 &v) {
        return _mm_and_ps(_mm_cmple_ps(_mm_set1_ps(f), v.m),
                          _mm_set1_ps(1.0f));
    }

    friend inline const vec4 step(const vec4 &v0, const vec4 &v1) {
        return _mm_and_ps(_mm_cmple_ps(v0.m, v1.m), _mm_set1_ps(1.0f));
    }

    friend inline const vec4 trunc(const vec4 &v) {
        __m128 z = _mm_set1_ps(-0.f);
        __m128 m = _mm_cmple_ps(_mm_andnot_ps(z, v.m),
                                _mm_set1_ps(8388608.f));
        return _mm_or_ps(_mm_andnot_ps(m, v.m), _mm_and_ps(m, _mm_or_ps(
                   _mm_and_ps(z, v.m), _mm_cvtepi32_ps(
                       _mm_cvttps_epi32(v.m)))));
    }

    // ----------------------------------------------------------------- //
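
        // The geometric functions below reduce the four component products
        // to a single sum with two shuffle-and-add steps: _mm_shufd(l, 0x4E)
        // swaps the two 64-bit halves (adding x+z and y+w), then
        // _mm_shufd(l, 0x11) brings the other partial sum into the low lane,
        // so the scalar result can be read out with _mm_cvtss_f32.
        // For example, dot(a, a) equals length(a)*length(a) up to rounding,
        // and normalize(a) divides every lane of a by that length.
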

    friend inline float distance(const vec4 &v0, const vec4 &v1) {
        __m128 l = _mm_sub_ps(v0.m, v1.m);
        l = _mm_mul_ps(l, l);
        l = _mm_add_ps(l, _mm_shufd(l, 0x4E));
        return _mm_cvtss_f32(_mm_sqrt_ss(_mm_add_ss(l,
                             _mm_shufd(l, 0x11))));
    }

    friend inline float dot(const vec4 &v0, const vec4 &v1) {
        __m128 l = _mm_mul_ps(v0.m, v1.m);
        l = _mm_add_ps(l, _mm_shufd(l, 0x4E));
        return _mm_cvtss_f32(_mm_add_ss(l, _mm_shufd(l, 0x11)));
    }

    friend inline const vec4 faceforward(const vec4 &v0,
                                         const vec4 &v1, const vec4 &v2) {
        __m128 l = _mm_mul_ps(v2.m, v1.m);
        l = _mm_add_ps(l, _mm_shufd(l, 0x4E));
        return _mm_xor_ps(_mm_and_ps(_mm_cmpnlt_ps(
                   _mm_add_ps(l, _mm_shufd(l, 0x11)),
                   _mm_setzero_ps()), _mm_set1_ps(-0.f)), v0.m);
    }

    friend inline float length(const vec4 &v) {
        __m128 l = _mm_mul_ps(v.m, v.m);
        l = _mm_add_ps(l, _mm_shufd(l, 0x4E));
        return _mm_cvtss_f32(_mm_sqrt_ss(_mm_add_ss(l,
                             _mm_shufd(l, 0x11))));
    }

    friend inline const vec4 normalize(const vec4 &v) {
        __m128 l = _mm_mul_ps(v.m, v.m);
        l = _mm_add_ps(l, _mm_shufd(l, 0x4E));
        return _mm_div_ps(v.m, _mm_sqrt_ps(_mm_add_ps(l,
                          _mm_shufd(l, 0x11))));
    }

    friend inline const vec4 reflect(const vec4 &v0, const vec4 &v1) {
        __m128 l = _mm_mul_ps(v1.m, v0.m);
        l = _mm_add_ps(l, _mm_shufd(l, 0x4E));
        l = _mm_add_ps(l, _mm_shufd(l, 0x11));
        return _mm_sub_ps(v0.m, _mm_mul_ps(_mm_add_ps(l, l), v1.m));
    }

        // refract(I, N, eta): k = 1 - eta^2*(1 - dot(N,I)^2);
        // 0 where k < 0, else eta*I - (eta*dot(N,I) + sqrt(k))*N
    friend inline const vec4 refract(const vec4 &v0, const vec4 &v1,
                                     float f) {
        __m128 o = _mm_set1_ps(1.0f);
        __m128 e = _mm_set1_ps(f);
        __m128 d = _mm_mul_ps(v1.m, v0.m);
        d = _mm_add_ps(d, _mm_shufd(d, 0x4E));
        d = _mm_add_ps(d, _mm_shufd(d, 0x11));
        __m128 k = _mm_sub_ps(o, _mm_mul_ps(_mm_mul_ps(e, e),
                                 _mm_sub_ps(o, _mm_mul_ps(d, d))));
        return _mm_and_ps(_mm_cmpnlt_ps(k, _mm_setzero_ps()),
                   _mm_sub_ps(_mm_mul_ps(e, v0.m),
                       _mm_mul_ps(_mm_add_ps(_mm_mul_ps(e, d),
                           _mm_sqrt_ps(k)), v1.m)));
    }

    // ----------------------------------------------------------------- //

    friend inline bool operator == (const vec4 &v0, const vec4 &v1) {
        return (_mm_movemask_ps(_mm_cmpeq_ps(v0.m, v1.m)) == 0xF);
    }

    friend inline bool operator != (const vec4 &v0, const vec4 &v1) {
        return (_mm_movemask_ps(_mm_cmpneq_ps(v0.m, v1.m)) != 0x0);
    }

    // ----------------------------------------------------------------- //

    union {
            // Vertex / Vector
        struct {
            float x, y, z, w;
        };
            // Color
        struct {
            float r, g, b, a;
        };
            // Texture coordinates
        struct {
            float s, t, p, q;
        };

            // SSE register
        __m128 m;
    };

        // Avoid pollution
    #undef _mm_shufd
};

// Template specialization for mask 0xE4 (No shuffle)
template<>
inline __m128 vec4::shuffle<0xE4>(const __m128 &xmm) {
    return xmm;
}

#include "swizzle2.h"
#include "swizzle4.h"

#endif

--------------------------------------------------------------------------------