├── README.md
├── include
│   ├── Deform.h
│   ├── Singular_Value_Decomposition_Givens_QR_Factorization_Kernel.hpp
│   ├── Singular_Value_Decomposition_Jacobi_Conjugation_Kernel.hpp
│   ├── Singular_Value_Decomposition_Kernel_Declarations.hpp
│   ├── Singular_Value_Decomposition_Main_Kernel_Body.hpp
│   ├── Singular_Value_Decomposition_Preamble.hpp
│   ├── WunderSVD3x3.h
│   └── arap_wrapper.h
├── lib
│   ├── arap_debug.dll
│   ├── arap_debug.lib
│   ├── arap_release.dll
│   └── arap_release.lib
└── src
    ├── Deform.cpp
    └── WunderSVD3x3.cpp

/README.md:
--------------------------------------------------------------------------------
1 | # ARAP
2 | A C++ implementation of as-rigid-as-possible surface modeling:
3 | 
4 | Sorkine, Olga, and Marc Alexa. "As-rigid-as-possible surface modeling." Symposium on Geometry Processing. Vol. 4. 2007.
5 | 
--------------------------------------------------------------------------------
/include/Deform.h:
--------------------------------------------------------------------------------
1 | #ifndef _Deform_H
2 | #define _Deform_H
3 | 
4 | #define ARAP_DLL_EXPORTS
5 | 
6 | #include <iostream>
7 | #include <vector>
8 | #include <algorithm>
9 | 
10 | #include <Eigen/Dense>
11 | #include <Eigen/Sparse>
12 | 
13 | #include "WunderSVD3x3.h"
14 | 
15 | #include "arap_wrapper.h"
16 | 
17 | class ARAP_DLL_API Deform
18 | {
19 | public:
20 | 
21 |     enum ArapType
22 |     {
23 |         ORIGIN_HARD,
24 |         HARD_SOFT
25 |     };
26 | 
27 |     typedef std::vector< std::vector<int> > AdjList;
28 |     typedef std::vector< std::vector<int> > TriangleList;
29 |     typedef std::vector< float > VectorF;
30 |     typedef std::vector< int > VectorI;
31 | 
32 |     typedef std::vector< Eigen::Vector3i > FaceList;
33 | 
34 |     struct Constraint
35 |     {
36 |         Constraint(Eigen::Vector3f _ctr, int _cid){ ctr = _ctr; cid = _cid; }
37 | 
38 |         Eigen::Vector3f ctr;
39 |         int cid;
40 |     };
41 | 
42 |     struct ConstraintCompare
43 |     {
44 |         bool operator()(const Constraint& ctr1, const Constraint& ctr2)
45 |         {
46 |             return ctr1.cid < ctr2.cid;
47 |         }
48 |     };
49 | 
50 |     typedef std::vector< Constraint > ConstraintSet;
51 | 
52 | public:
53 |     Deform() {}
54 |     ~Deform() {}
55 |     Deform(const float *P_data, int P_Num, const AdjList &adj_list, const TriangleList &triangle_list);
56 | 
57 |     // complete deform call
58 |     float *do_Deform();
59 | 
60 |     // deform call with a fixed iteration count
61 |     float *do_Deform(int iter_num);
62 | 
63 |     // get current P'; point i is stored starting at P_Prime[3*i + 0]
64 |     float *get_P_Prime();
65 | 
66 |     // perform one deform iteration; returns the current P'
67 |     float *do_Deform_Iter(float &delta);
68 | 
69 |     // energy value
70 |     float energy();
71 | 
72 |     // set energy tolerance
73 |     void set_tolerance(float tol);
74 | 
75 |     // set max iteration count
76 |     void set_max_iteration(int iter_num);
77 | 
78 |     // set lambda; default is 5
79 |     void set_lambda(float lambda);
80 | 
81 |     // initialize hard constraints
82 |     void set_hard_ctrs(const VectorF &T, const VectorI &idx_T);
83 | 
84 |     // initialize soft constraints
85 |     void set_soft_ctrs(const VectorF &T, const VectorI &idx_T);
86 | 
87 |     // set the ARAP energy type
88 |     void set_arap_type(ArapType at);
89 | 
90 | private:
91 |     // pre-build weight matrix
92 |     void build_weight_matrix();
93 | 
94 |     // pre-build Laplacian matrix
95 |     void build_laplacian_matrix();
96 | 
97 |     // build right-hand side d
98 |     void build_rh_d();
99 | 
100 |     // set up the linear equation system
101 |     void set_linear_sys();
102 | 
103 |     // update rotations R_i
104 |     void update_Ri();
105 | 
106 |     // update P'
107 |     float update_P_Prime();
108 | 
109 |     // pre-compute the weight w_ij of edge (p1, p2); p3 and p4 are the opposite vertices
110 |     float compute_wij(const float *p1, const float *p2, const float *p3, const float *p4 = nullptr);
111 | 
112 |     // find the vertices shared by the triangles incident to edge (i, j)
113 |     void find_share_vertex(int i, int j, VectorI &share_vertex);
114 | 
115 |     // hard constraint or not
116 |     int is_hard_ctrs(int id);
117 | 
118 |     // soft constraint or not
119 |     int is_soft_ctrs(int id);
120 | 
121 | private:
122 |     ArapType at;
123 | 
124 |     const float* P_data;
125 |     Eigen::Matrix3Xf P_Prime;
126 |     Eigen::Matrix3Xf P;
127 |     AdjList adj_list;
128 |     FaceList face_list;
129 | 
130 |     ConstraintSet hard_ctrs;
131 |     ConstraintSet soft_ctrs;
132 | 
133 |     std::vector<Eigen::Matrix3f> R;
134 | 
135 |     Eigen::SparseMatrix<float> Weight;
136 |     Eigen::SparseMatrix<float> L;
137 |     Eigen::SparseLU< Eigen::SparseMatrix<float> > chol;
138 |     // Eigen::SimplicialCholesky< Eigen::SparseMatrix<float> > chol;
139 |     Eigen::MatrixX3f d;
140 | 
141 |     int P_Num;
142 |     int max_iter;
143 |     double min_tol;
144 |     float lambda;
145 | };
146 | 
147 | #endif
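For orientation, here is a minimal usage sketch of the `Deform` API declared above. Only the member signatures come from the header; the mesh inputs (`positions`, `adjacency`, `triangles`) and the choice of constrained vertex are hypothetical placeholders.

```cpp
#include "Deform.h"

// A minimal sketch, assuming a mesh loaded elsewhere: `positions` holds xyz
// triples, `adjacency` the one-ring neighbours of each vertex, `triangles`
// the incident triangles -- all hypothetical placeholders.
void deform_example(const std::vector<float> &positions,
                    const Deform::AdjList &adjacency,
                    const Deform::TriangleList &triangles)
{
    int n = static_cast<int>(positions.size()) / 3;
    Deform deform(positions.data(), n, adjacency, triangles);

    // Pin vertex 0 at its current location as a hard constraint.
    Deform::VectorF ctr_pos(positions.begin(), positions.begin() + 3);
    Deform::VectorI ctr_idx = { 0 };
    deform.set_hard_ctrs(ctr_pos, ctr_idx);

    deform.set_max_iteration(50);
    deform.set_tolerance(1e-3f);

    // Alternate the local (rotation) and global (position) steps until the
    // energy tolerance or iteration cap is reached; the result is packed as
    // x,y,z per vertex, point i starting at P_Prime[3*i + 0].
    float *P_Prime = deform.do_Deform();
    (void)P_Prime;
}
```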
--------------------------------------------------------------------------------
/include/Singular_Value_Decomposition_Givens_QR_Factorization_Kernel.hpp:
--------------------------------------------------------------------------------
1 | //#####################################################################
2 | // Copyright (c) 2010-2011, Eftychios Sifakis.
3 | //
4 | // Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
5 | //   * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
6 | //   * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or
7 | //     other materials provided with the distribution.
8 | //
9 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING,
10 | // BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT
11 | // SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
12 | // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
13 | // INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
14 | // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
15 | //##################################################################### 16 | 17 | //########################################################### 18 | // Compute the Givens half-angle, construct the Givens quaternion and the rotation sine/cosine (for the full angle) 19 | //########################################################### 20 | 21 | ENABLE_SCALAR_IMPLEMENTATION(Ssh.f=SANPIVOT.f*SANPIVOT.f;) ENABLE_SSE_IMPLEMENTATION(Vsh=_mm_mul_ps(VANPIVOT,VANPIVOT);) ENABLE_AVX_IMPLEMENTATION(Vsh=_mm256_mul_ps(VANPIVOT,VANPIVOT);) 22 | ENABLE_SCALAR_IMPLEMENTATION(Ssh.ui=(Ssh.f>=Ssmall_number.f)?0xffffffff:0;) ENABLE_SSE_IMPLEMENTATION(Vsh=_mm_cmpge_ps(Vsh,Vsmall_number);) ENABLE_AVX_IMPLEMENTATION(Vsh=_mm256_cmp_ps(Vsh,Vsmall_number, _CMP_GE_OS);) //ENABLE_AVX_IMPLEMENTATION(Vsh=_mm256_cmpge_ps(Vsh,Vsmall_number);) 23 | ENABLE_SCALAR_IMPLEMENTATION(Ssh.ui=Ssh.ui&SANPIVOT.ui;) ENABLE_SSE_IMPLEMENTATION(Vsh=_mm_and_ps(Vsh,VANPIVOT);) ENABLE_AVX_IMPLEMENTATION(Vsh=_mm256_and_ps(Vsh,VANPIVOT);) 24 | 25 | ENABLE_SCALAR_IMPLEMENTATION(Stmp5.f=0.;) ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_xor_ps(Vtmp5,Vtmp5);) ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_xor_ps(Vtmp5,Vtmp5);) 26 | ENABLE_SCALAR_IMPLEMENTATION(Sch.f=Stmp5.f-SAPIVOT.f;) ENABLE_SSE_IMPLEMENTATION(Vch=_mm_sub_ps(Vtmp5,VAPIVOT);) ENABLE_AVX_IMPLEMENTATION(Vch=_mm256_sub_ps(Vtmp5,VAPIVOT);) 27 | ENABLE_SCALAR_IMPLEMENTATION(Sch.f=std::max(Sch.f,SAPIVOT.f);) ENABLE_SSE_IMPLEMENTATION(Vch=_mm_max_ps(Vch,VAPIVOT);) ENABLE_AVX_IMPLEMENTATION(Vch=_mm256_max_ps(Vch,VAPIVOT);) 28 | ENABLE_SCALAR_IMPLEMENTATION(Sch.f=std::max(Sch.f,Ssmall_number.f);) ENABLE_SSE_IMPLEMENTATION(Vch=_mm_max_ps(Vch,Vsmall_number);) ENABLE_AVX_IMPLEMENTATION(Vch=_mm256_max_ps(Vch,Vsmall_number);) 29 | ENABLE_SCALAR_IMPLEMENTATION(Stmp5.ui=(SAPIVOT.f>=Stmp5.f)?0xffffffff:0;) ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_cmpge_ps(VAPIVOT,Vtmp5);) ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_cmp_ps(VAPIVOT,Vtmp5, _CMP_GE_OS);) //ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_cmpge_ps(VAPIVOT,Vtmp5);) 30 | 31 | ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=Sch.f*Sch.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_mul_ps(Vch,Vch);) ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_mul_ps(Vch,Vch);) 32 | ENABLE_SCALAR_IMPLEMENTATION(Stmp2.f=Ssh.f*Ssh.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp2=_mm_mul_ps(Vsh,Vsh);) ENABLE_AVX_IMPLEMENTATION(Vtmp2=_mm256_mul_ps(Vsh,Vsh);) 33 | ENABLE_SCALAR_IMPLEMENTATION(Stmp2.f=Stmp1.f+Stmp2.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp2=_mm_add_ps(Vtmp1,Vtmp2);) ENABLE_AVX_IMPLEMENTATION(Vtmp2=_mm256_add_ps(Vtmp1,Vtmp2);) 34 | ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=rsqrt(Stmp2.f);) ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_rsqrt_ps(Vtmp2);) ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_rsqrt_ps(Vtmp2);) 35 | 36 | ENABLE_SCALAR_IMPLEMENTATION(Stmp4.f=Stmp1.f*Sone_half.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp4=_mm_mul_ps(Vtmp1,Vone_half);) ENABLE_AVX_IMPLEMENTATION(Vtmp4=_mm256_mul_ps(Vtmp1,Vone_half);) 37 | ENABLE_SCALAR_IMPLEMENTATION(Stmp3.f=Stmp1.f*Stmp4.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp3=_mm_mul_ps(Vtmp1,Vtmp4);) ENABLE_AVX_IMPLEMENTATION(Vtmp3=_mm256_mul_ps(Vtmp1,Vtmp4);) 38 | ENABLE_SCALAR_IMPLEMENTATION(Stmp3.f=Stmp1.f*Stmp3.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp3=_mm_mul_ps(Vtmp1,Vtmp3);) ENABLE_AVX_IMPLEMENTATION(Vtmp3=_mm256_mul_ps(Vtmp1,Vtmp3);) 39 | ENABLE_SCALAR_IMPLEMENTATION(Stmp3.f=Stmp2.f*Stmp3.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp3=_mm_mul_ps(Vtmp2,Vtmp3);) ENABLE_AVX_IMPLEMENTATION(Vtmp3=_mm256_mul_ps(Vtmp2,Vtmp3);) 40 | ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=Stmp1.f+Stmp4.f;) 
ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_add_ps(Vtmp1,Vtmp4);) ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_add_ps(Vtmp1,Vtmp4);) 41 | ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=Stmp1.f-Stmp3.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_sub_ps(Vtmp1,Vtmp3);) ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_sub_ps(Vtmp1,Vtmp3);) 42 | ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=Stmp1.f*Stmp2.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_mul_ps(Vtmp1,Vtmp2);) ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_mul_ps(Vtmp1,Vtmp2);) 43 | 44 | ENABLE_SCALAR_IMPLEMENTATION(Sch.f=Sch.f+Stmp1.f;) ENABLE_SSE_IMPLEMENTATION(Vch=_mm_add_ps(Vch,Vtmp1);) ENABLE_AVX_IMPLEMENTATION(Vch=_mm256_add_ps(Vch,Vtmp1);) 45 | 46 | ENABLE_SCALAR_IMPLEMENTATION(Stmp1.ui=~Stmp5.ui&Ssh.ui;) ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_andnot_ps(Vtmp5,Vsh);) ENABLE_AVX_IMPLEMENTATION(Vtmp1=Vch;) 47 | ENABLE_SCALAR_IMPLEMENTATION(Stmp2.ui=~Stmp5.ui&Sch.ui;) ENABLE_SSE_IMPLEMENTATION(Vtmp2=_mm_andnot_ps(Vtmp5,Vch);) ENABLE_AVX_IMPLEMENTATION(Vch=_mm256_blendv_ps(Vsh,Vch,Vtmp5);) 48 | ENABLE_SCALAR_IMPLEMENTATION(Sch.ui=Stmp5.ui&Sch.ui;) ENABLE_SSE_IMPLEMENTATION(Vch=_mm_and_ps(Vtmp5,Vch);) ENABLE_AVX_IMPLEMENTATION(Vsh=_mm256_blendv_ps(Vtmp1,Vsh,Vtmp5);) 49 | ENABLE_SCALAR_IMPLEMENTATION(Ssh.ui=Stmp5.ui&Ssh.ui;) ENABLE_SSE_IMPLEMENTATION(Vsh=_mm_and_ps(Vtmp5,Vsh);) 50 | ENABLE_SCALAR_IMPLEMENTATION(Sch.ui=Sch.ui|Stmp1.ui;) ENABLE_SSE_IMPLEMENTATION(Vch=_mm_or_ps(Vch,Vtmp1);) 51 | ENABLE_SCALAR_IMPLEMENTATION(Ssh.ui=Ssh.ui|Stmp2.ui;) ENABLE_SSE_IMPLEMENTATION(Vsh=_mm_or_ps(Vsh,Vtmp2);) 52 | 53 | ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=Sch.f*Sch.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_mul_ps(Vch,Vch);) ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_mul_ps(Vch,Vch);) 54 | ENABLE_SCALAR_IMPLEMENTATION(Stmp2.f=Ssh.f*Ssh.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp2=_mm_mul_ps(Vsh,Vsh);) ENABLE_AVX_IMPLEMENTATION(Vtmp2=_mm256_mul_ps(Vsh,Vsh);) 55 | ENABLE_SCALAR_IMPLEMENTATION(Stmp2.f=Stmp1.f+Stmp2.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp2=_mm_add_ps(Vtmp1,Vtmp2);) ENABLE_AVX_IMPLEMENTATION(Vtmp2=_mm256_add_ps(Vtmp1,Vtmp2);) 56 | ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=rsqrt(Stmp2.f);) ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_rsqrt_ps(Vtmp2);) ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_rsqrt_ps(Vtmp2);) 57 | 58 | ENABLE_SCALAR_IMPLEMENTATION(Stmp4.f=Stmp1.f*Sone_half.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp4=_mm_mul_ps(Vtmp1,Vone_half);) ENABLE_AVX_IMPLEMENTATION(Vtmp4=_mm256_mul_ps(Vtmp1,Vone_half);) 59 | ENABLE_SCALAR_IMPLEMENTATION(Stmp3.f=Stmp1.f*Stmp4.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp3=_mm_mul_ps(Vtmp1,Vtmp4);) ENABLE_AVX_IMPLEMENTATION(Vtmp3=_mm256_mul_ps(Vtmp1,Vtmp4);) 60 | ENABLE_SCALAR_IMPLEMENTATION(Stmp3.f=Stmp1.f*Stmp3.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp3=_mm_mul_ps(Vtmp1,Vtmp3);) ENABLE_AVX_IMPLEMENTATION(Vtmp3=_mm256_mul_ps(Vtmp1,Vtmp3);) 61 | ENABLE_SCALAR_IMPLEMENTATION(Stmp3.f=Stmp2.f*Stmp3.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp3=_mm_mul_ps(Vtmp2,Vtmp3);) ENABLE_AVX_IMPLEMENTATION(Vtmp3=_mm256_mul_ps(Vtmp2,Vtmp3);) 62 | ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=Stmp1.f+Stmp4.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_add_ps(Vtmp1,Vtmp4);) ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_add_ps(Vtmp1,Vtmp4);) 63 | ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=Stmp1.f-Stmp3.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_sub_ps(Vtmp1,Vtmp3);) ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_sub_ps(Vtmp1,Vtmp3);) 64 | 65 | ENABLE_SCALAR_IMPLEMENTATION(Sch.f=Sch.f*Stmp1.f;) ENABLE_SSE_IMPLEMENTATION(Vch=_mm_mul_ps(Vch,Vtmp1);) ENABLE_AVX_IMPLEMENTATION(Vch=_mm256_mul_ps(Vch,Vtmp1);) 66 | ENABLE_SCALAR_IMPLEMENTATION(Ssh.f=Ssh.f*Stmp1.f;) 
ENABLE_SSE_IMPLEMENTATION(Vsh=_mm_mul_ps(Vsh,Vtmp1);) ENABLE_AVX_IMPLEMENTATION(Vsh=_mm256_mul_ps(Vsh,Vtmp1);) 67 | 68 | ENABLE_SCALAR_IMPLEMENTATION(Sc.f=Sch.f*Sch.f;) ENABLE_SSE_IMPLEMENTATION(Vc=_mm_mul_ps(Vch,Vch);) ENABLE_AVX_IMPLEMENTATION(Vc=_mm256_mul_ps(Vch,Vch);)ENABLE_SCALAR_IMPLEMENTATION(Ss.f=Ssh.f*Ssh.f;) ENABLE_SSE_IMPLEMENTATION(Vs=_mm_mul_ps(Vsh,Vsh);) ENABLE_AVX_IMPLEMENTATION(Vs=_mm256_mul_ps(Vsh,Vsh);) 69 | ENABLE_SCALAR_IMPLEMENTATION(Sc.f=Sc.f-Ss.f;) ENABLE_SSE_IMPLEMENTATION(Vc=_mm_sub_ps(Vc,Vs);) ENABLE_AVX_IMPLEMENTATION(Vc=_mm256_sub_ps(Vc,Vs);) 70 | ENABLE_SCALAR_IMPLEMENTATION(Ss.f=Ssh.f*Sch.f;) ENABLE_SSE_IMPLEMENTATION(Vs=_mm_mul_ps(Vsh,Vch);) ENABLE_AVX_IMPLEMENTATION(Vs=_mm256_mul_ps(Vsh,Vch);) 71 | ENABLE_SCALAR_IMPLEMENTATION(Ss.f=Ss.f+Ss.f;) ENABLE_SSE_IMPLEMENTATION(Vs=_mm_add_ps(Vs,Vs);) ENABLE_AVX_IMPLEMENTATION(Vs=_mm256_add_ps(Vs,Vs);) 72 | 73 | //########################################################### 74 | // Rotate matrix A 75 | //########################################################### 76 | 77 | ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=Ss.f*SA11.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_mul_ps(Vs,VA11);) ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_mul_ps(Vs,VA11);) 78 | ENABLE_SCALAR_IMPLEMENTATION(Stmp2.f=Ss.f*SA21.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp2=_mm_mul_ps(Vs,VA21);) ENABLE_AVX_IMPLEMENTATION(Vtmp2=_mm256_mul_ps(Vs,VA21);) 79 | ENABLE_SCALAR_IMPLEMENTATION(SA11.f=Sc.f*SA11.f;) ENABLE_SSE_IMPLEMENTATION(VA11=_mm_mul_ps(Vc,VA11);) ENABLE_AVX_IMPLEMENTATION(VA11=_mm256_mul_ps(Vc,VA11);) 80 | ENABLE_SCALAR_IMPLEMENTATION(SA21.f=Sc.f*SA21.f;) ENABLE_SSE_IMPLEMENTATION(VA21=_mm_mul_ps(Vc,VA21);) ENABLE_AVX_IMPLEMENTATION(VA21=_mm256_mul_ps(Vc,VA21);) 81 | ENABLE_SCALAR_IMPLEMENTATION(SA11.f=SA11.f+Stmp2.f;) ENABLE_SSE_IMPLEMENTATION(VA11=_mm_add_ps(VA11,Vtmp2);) ENABLE_AVX_IMPLEMENTATION(VA11=_mm256_add_ps(VA11,Vtmp2);) 82 | ENABLE_SCALAR_IMPLEMENTATION(SA21.f=SA21.f-Stmp1.f;) ENABLE_SSE_IMPLEMENTATION(VA21=_mm_sub_ps(VA21,Vtmp1);) ENABLE_AVX_IMPLEMENTATION(VA21=_mm256_sub_ps(VA21,Vtmp1);) 83 | 84 | ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=Ss.f*SA12.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_mul_ps(Vs,VA12);) ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_mul_ps(Vs,VA12);) 85 | ENABLE_SCALAR_IMPLEMENTATION(Stmp2.f=Ss.f*SA22.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp2=_mm_mul_ps(Vs,VA22);) ENABLE_AVX_IMPLEMENTATION(Vtmp2=_mm256_mul_ps(Vs,VA22);) 86 | ENABLE_SCALAR_IMPLEMENTATION(SA12.f=Sc.f*SA12.f;) ENABLE_SSE_IMPLEMENTATION(VA12=_mm_mul_ps(Vc,VA12);) ENABLE_AVX_IMPLEMENTATION(VA12=_mm256_mul_ps(Vc,VA12);) 87 | ENABLE_SCALAR_IMPLEMENTATION(SA22.f=Sc.f*SA22.f;) ENABLE_SSE_IMPLEMENTATION(VA22=_mm_mul_ps(Vc,VA22);) ENABLE_AVX_IMPLEMENTATION(VA22=_mm256_mul_ps(Vc,VA22);) 88 | ENABLE_SCALAR_IMPLEMENTATION(SA12.f=SA12.f+Stmp2.f;) ENABLE_SSE_IMPLEMENTATION(VA12=_mm_add_ps(VA12,Vtmp2);) ENABLE_AVX_IMPLEMENTATION(VA12=_mm256_add_ps(VA12,Vtmp2);) 89 | ENABLE_SCALAR_IMPLEMENTATION(SA22.f=SA22.f-Stmp1.f;) ENABLE_SSE_IMPLEMENTATION(VA22=_mm_sub_ps(VA22,Vtmp1);) ENABLE_AVX_IMPLEMENTATION(VA22=_mm256_sub_ps(VA22,Vtmp1);) 90 | 91 | ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=Ss.f*SA13.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_mul_ps(Vs,VA13);) ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_mul_ps(Vs,VA13);) 92 | ENABLE_SCALAR_IMPLEMENTATION(Stmp2.f=Ss.f*SA23.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp2=_mm_mul_ps(Vs,VA23);) ENABLE_AVX_IMPLEMENTATION(Vtmp2=_mm256_mul_ps(Vs,VA23);) 93 | ENABLE_SCALAR_IMPLEMENTATION(SA13.f=Sc.f*SA13.f;) ENABLE_SSE_IMPLEMENTATION(VA13=_mm_mul_ps(Vc,VA13);) 
ENABLE_AVX_IMPLEMENTATION(VA13=_mm256_mul_ps(Vc,VA13);) 94 | ENABLE_SCALAR_IMPLEMENTATION(SA23.f=Sc.f*SA23.f;) ENABLE_SSE_IMPLEMENTATION(VA23=_mm_mul_ps(Vc,VA23);) ENABLE_AVX_IMPLEMENTATION(VA23=_mm256_mul_ps(Vc,VA23);) 95 | ENABLE_SCALAR_IMPLEMENTATION(SA13.f=SA13.f+Stmp2.f;) ENABLE_SSE_IMPLEMENTATION(VA13=_mm_add_ps(VA13,Vtmp2);) ENABLE_AVX_IMPLEMENTATION(VA13=_mm256_add_ps(VA13,Vtmp2);) 96 | ENABLE_SCALAR_IMPLEMENTATION(SA23.f=SA23.f-Stmp1.f;) ENABLE_SSE_IMPLEMENTATION(VA23=_mm_sub_ps(VA23,Vtmp1);) ENABLE_AVX_IMPLEMENTATION(VA23=_mm256_sub_ps(VA23,Vtmp1);) 97 | 98 | //########################################################### 99 | // Update matrix U 100 | //########################################################### 101 | 102 | #ifdef COMPUTE_U_AS_MATRIX 103 | ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=Ss.f*SU11.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_mul_ps(Vs,VU11);) ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_mul_ps(Vs,VU11);) 104 | ENABLE_SCALAR_IMPLEMENTATION(Stmp2.f=Ss.f*SU12.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp2=_mm_mul_ps(Vs,VU12);) ENABLE_AVX_IMPLEMENTATION(Vtmp2=_mm256_mul_ps(Vs,VU12);) 105 | ENABLE_SCALAR_IMPLEMENTATION(SU11.f=Sc.f*SU11.f;) ENABLE_SSE_IMPLEMENTATION(VU11=_mm_mul_ps(Vc,VU11);) ENABLE_AVX_IMPLEMENTATION(VU11=_mm256_mul_ps(Vc,VU11);) 106 | ENABLE_SCALAR_IMPLEMENTATION(SU12.f=Sc.f*SU12.f;) ENABLE_SSE_IMPLEMENTATION(VU12=_mm_mul_ps(Vc,VU12);) ENABLE_AVX_IMPLEMENTATION(VU12=_mm256_mul_ps(Vc,VU12);) 107 | ENABLE_SCALAR_IMPLEMENTATION(SU11.f=SU11.f+Stmp2.f;) ENABLE_SSE_IMPLEMENTATION(VU11=_mm_add_ps(VU11,Vtmp2);) ENABLE_AVX_IMPLEMENTATION(VU11=_mm256_add_ps(VU11,Vtmp2);) 108 | ENABLE_SCALAR_IMPLEMENTATION(SU12.f=SU12.f-Stmp1.f;) ENABLE_SSE_IMPLEMENTATION(VU12=_mm_sub_ps(VU12,Vtmp1);) ENABLE_AVX_IMPLEMENTATION(VU12=_mm256_sub_ps(VU12,Vtmp1);) 109 | 110 | ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=Ss.f*SU21.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_mul_ps(Vs,VU21);) ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_mul_ps(Vs,VU21);) 111 | ENABLE_SCALAR_IMPLEMENTATION(Stmp2.f=Ss.f*SU22.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp2=_mm_mul_ps(Vs,VU22);) ENABLE_AVX_IMPLEMENTATION(Vtmp2=_mm256_mul_ps(Vs,VU22);) 112 | ENABLE_SCALAR_IMPLEMENTATION(SU21.f=Sc.f*SU21.f;) ENABLE_SSE_IMPLEMENTATION(VU21=_mm_mul_ps(Vc,VU21);) ENABLE_AVX_IMPLEMENTATION(VU21=_mm256_mul_ps(Vc,VU21);) 113 | ENABLE_SCALAR_IMPLEMENTATION(SU22.f=Sc.f*SU22.f;) ENABLE_SSE_IMPLEMENTATION(VU22=_mm_mul_ps(Vc,VU22);) ENABLE_AVX_IMPLEMENTATION(VU22=_mm256_mul_ps(Vc,VU22);) 114 | ENABLE_SCALAR_IMPLEMENTATION(SU21.f=SU21.f+Stmp2.f;) ENABLE_SSE_IMPLEMENTATION(VU21=_mm_add_ps(VU21,Vtmp2);) ENABLE_AVX_IMPLEMENTATION(VU21=_mm256_add_ps(VU21,Vtmp2);) 115 | ENABLE_SCALAR_IMPLEMENTATION(SU22.f=SU22.f-Stmp1.f;) ENABLE_SSE_IMPLEMENTATION(VU22=_mm_sub_ps(VU22,Vtmp1);) ENABLE_AVX_IMPLEMENTATION(VU22=_mm256_sub_ps(VU22,Vtmp1);) 116 | 117 | ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=Ss.f*SU31.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_mul_ps(Vs,VU31);) ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_mul_ps(Vs,VU31);) 118 | ENABLE_SCALAR_IMPLEMENTATION(Stmp2.f=Ss.f*SU32.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp2=_mm_mul_ps(Vs,VU32);) ENABLE_AVX_IMPLEMENTATION(Vtmp2=_mm256_mul_ps(Vs,VU32);) 119 | ENABLE_SCALAR_IMPLEMENTATION(SU31.f=Sc.f*SU31.f;) ENABLE_SSE_IMPLEMENTATION(VU31=_mm_mul_ps(Vc,VU31);) ENABLE_AVX_IMPLEMENTATION(VU31=_mm256_mul_ps(Vc,VU31);) 120 | ENABLE_SCALAR_IMPLEMENTATION(SU32.f=Sc.f*SU32.f;) ENABLE_SSE_IMPLEMENTATION(VU32=_mm_mul_ps(Vc,VU32);) ENABLE_AVX_IMPLEMENTATION(VU32=_mm256_mul_ps(Vc,VU32);) 121 | ENABLE_SCALAR_IMPLEMENTATION(SU31.f=SU31.f+Stmp2.f;) 
ENABLE_SSE_IMPLEMENTATION(VU31=_mm_add_ps(VU31,Vtmp2);) ENABLE_AVX_IMPLEMENTATION(VU31=_mm256_add_ps(VU31,Vtmp2);) 122 | ENABLE_SCALAR_IMPLEMENTATION(SU32.f=SU32.f-Stmp1.f;) ENABLE_SSE_IMPLEMENTATION(VU32=_mm_sub_ps(VU32,Vtmp1);) ENABLE_AVX_IMPLEMENTATION(VU32=_mm256_sub_ps(VU32,Vtmp1);) 123 | #endif 124 | -------------------------------------------------------------------------------- /include/Singular_Value_Decomposition_Jacobi_Conjugation_Kernel.hpp: -------------------------------------------------------------------------------- 1 | //##################################################################### 2 | // Copyright (c) 2010-2011, Eftychios Sifakis. 3 | // 4 | // Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 5 | // * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 6 | // * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or 7 | // other materials provided with the distribution. 8 | // 9 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, 10 | // BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT 11 | // SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 12 | // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 13 | // INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 14 | // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
15 | //##################################################################### 16 | 17 | //########################################################### 18 | // Compute the Givens angle (and half-angle) 19 | //########################################################### 20 | 21 | ENABLE_SCALAR_IMPLEMENTATION(Ssh.f=SS21.f*Sone_half.f;) ENABLE_SSE_IMPLEMENTATION(Vsh=_mm_mul_ps(VS21,Vone_half);) ENABLE_AVX_IMPLEMENTATION(Vsh=_mm256_mul_ps(VS21,Vone_half);) 22 | ENABLE_SCALAR_IMPLEMENTATION(Stmp5.f=SS11.f-SS22.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_sub_ps(VS11,VS22);) ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_sub_ps(VS11,VS22);) 23 | 24 | ENABLE_SCALAR_IMPLEMENTATION(Stmp2.f=Ssh.f*Ssh.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp2=_mm_mul_ps(Vsh,Vsh);) ENABLE_AVX_IMPLEMENTATION(Vtmp2=_mm256_mul_ps(Vsh,Vsh);) 25 | ENABLE_SCALAR_IMPLEMENTATION(Stmp1.ui=(Stmp2.f>=Stiny_number.f)?0xffffffff:0;) ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_cmpge_ps(Vtmp2,Vtiny_number);) ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_cmp_ps(Vtmp2,Vtiny_number, _CMP_GE_OS);) //ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_cmpge_ps(Vtmp2,Vtiny_number);) 26 | ENABLE_SCALAR_IMPLEMENTATION(Ssh.ui=Stmp1.ui&Ssh.ui;) ENABLE_SSE_IMPLEMENTATION(Vsh=_mm_and_ps(Vtmp1,Vsh);) ENABLE_AVX_IMPLEMENTATION(Vsh=_mm256_and_ps(Vtmp1,Vsh);) 27 | ENABLE_SCALAR_IMPLEMENTATION(Sch.ui=Stmp1.ui&Stmp5.ui;) ENABLE_SSE_IMPLEMENTATION(Vch=_mm_and_ps(Vtmp1,Vtmp5);) ENABLE_AVX_IMPLEMENTATION(Vch=_mm256_blendv_ps(Vone,Vtmp5,Vtmp1);) 28 | ENABLE_SCALAR_IMPLEMENTATION(Stmp2.ui=~Stmp1.ui&Sone.ui;) ENABLE_SSE_IMPLEMENTATION(Vtmp2=_mm_andnot_ps(Vtmp1,Vone);) 29 | ENABLE_SCALAR_IMPLEMENTATION(Sch.ui=Sch.ui|Stmp2.ui;) ENABLE_SSE_IMPLEMENTATION(Vch=_mm_or_ps(Vch,Vtmp2);) 30 | 31 | ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=Ssh.f*Ssh.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_mul_ps(Vsh,Vsh);) ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_mul_ps(Vsh,Vsh);) 32 | ENABLE_SCALAR_IMPLEMENTATION(Stmp2.f=Sch.f*Sch.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp2=_mm_mul_ps(Vch,Vch);) ENABLE_AVX_IMPLEMENTATION(Vtmp2=_mm256_mul_ps(Vch,Vch);) 33 | ENABLE_SCALAR_IMPLEMENTATION(Stmp3.f=Stmp1.f+Stmp2.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp3=_mm_add_ps(Vtmp1,Vtmp2);) ENABLE_AVX_IMPLEMENTATION(Vtmp3=_mm256_add_ps(Vtmp1,Vtmp2);) 34 | ENABLE_SCALAR_IMPLEMENTATION(Stmp4.f=rsqrt(Stmp3.f);) ENABLE_SSE_IMPLEMENTATION(Vtmp4=_mm_rsqrt_ps(Vtmp3);) ENABLE_AVX_IMPLEMENTATION(Vtmp4=_mm256_rsqrt_ps(Vtmp3);) 35 | 36 | #ifdef USE_ACCURATE_RSQRT_IN_JACOBI_CONJUGATION 37 | ENABLE_SCALAR_IMPLEMENTATION(Ss.f=Stmp4.f*Sone_half.f;) ENABLE_SSE_IMPLEMENTATION(Vs=_mm_mul_ps(Vtmp4,Vone_half);) ENABLE_AVX_IMPLEMENTATION(Vs=_mm256_mul_ps(Vtmp4,Vone_half);) 38 | ENABLE_SCALAR_IMPLEMENTATION(Sc.f=Stmp4.f*Ss.f;) ENABLE_SSE_IMPLEMENTATION(Vc=_mm_mul_ps(Vtmp4,Vs);) ENABLE_AVX_IMPLEMENTATION(Vc=_mm256_mul_ps(Vtmp4,Vs);) 39 | ENABLE_SCALAR_IMPLEMENTATION(Sc.f=Stmp4.f*Sc.f;) ENABLE_SSE_IMPLEMENTATION(Vc=_mm_mul_ps(Vtmp4,Vc);) ENABLE_AVX_IMPLEMENTATION(Vc=_mm256_mul_ps(Vtmp4,Vc);) 40 | ENABLE_SCALAR_IMPLEMENTATION(Sc.f=Stmp3.f*Sc.f;) ENABLE_SSE_IMPLEMENTATION(Vc=_mm_mul_ps(Vtmp3,Vc);) ENABLE_AVX_IMPLEMENTATION(Vc=_mm256_mul_ps(Vtmp3,Vc);) 41 | ENABLE_SCALAR_IMPLEMENTATION(Stmp4.f=Stmp4.f+Ss.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp4=_mm_add_ps(Vtmp4,Vs);) ENABLE_AVX_IMPLEMENTATION(Vtmp4=_mm256_add_ps(Vtmp4,Vs);) 42 | ENABLE_SCALAR_IMPLEMENTATION(Stmp4.f=Stmp4.f-Sc.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp4=_mm_sub_ps(Vtmp4,Vc);) ENABLE_AVX_IMPLEMENTATION(Vtmp4=_mm256_sub_ps(Vtmp4,Vc);) 43 | #endif 44 | 45 | ENABLE_SCALAR_IMPLEMENTATION(Ssh.f=Stmp4.f*Ssh.f;) 
ENABLE_SSE_IMPLEMENTATION(Vsh=_mm_mul_ps(Vtmp4,Vsh);) ENABLE_AVX_IMPLEMENTATION(Vsh=_mm256_mul_ps(Vtmp4,Vsh);) 46 | ENABLE_SCALAR_IMPLEMENTATION(Sch.f=Stmp4.f*Sch.f;) ENABLE_SSE_IMPLEMENTATION(Vch=_mm_mul_ps(Vtmp4,Vch);) ENABLE_AVX_IMPLEMENTATION(Vch=_mm256_mul_ps(Vtmp4,Vch);) 47 | 48 | ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=Sfour_gamma_squared.f*Stmp1.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_mul_ps(Vfour_gamma_squared,Vtmp1);) ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_mul_ps(Vfour_gamma_squared,Vtmp1);) 49 | ENABLE_SCALAR_IMPLEMENTATION(Stmp1.ui=(Stmp2.f<=Stmp1.f)?0xffffffff:0;) ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_cmple_ps(Vtmp2,Vtmp1);) ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_cmp_ps(Vtmp2,Vtmp1, _CMP_LE_OS);) //ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_cmple_ps(Vtmp2,Vtmp1);) 50 | 51 | ENABLE_SCALAR_IMPLEMENTATION(Stmp2.ui=Ssine_pi_over_eight.ui&Stmp1.ui;) ENABLE_SSE_IMPLEMENTATION(Vtmp2=_mm_and_ps(Vsine_pi_over_eight,Vtmp1);) ENABLE_AVX_IMPLEMENTATION(Vsh=_mm256_blendv_ps(Vsh,Vsine_pi_over_eight,Vtmp1);) 52 | ENABLE_SCALAR_IMPLEMENTATION(Ssh.ui=~Stmp1.ui&Ssh.ui;) ENABLE_SSE_IMPLEMENTATION(Vsh=_mm_andnot_ps(Vtmp1,Vsh);) 53 | ENABLE_SCALAR_IMPLEMENTATION(Ssh.ui=Ssh.ui|Stmp2.ui;) ENABLE_SSE_IMPLEMENTATION(Vsh=_mm_or_ps(Vsh,Vtmp2);) 54 | ENABLE_SCALAR_IMPLEMENTATION(Stmp2.ui=Scosine_pi_over_eight.ui&Stmp1.ui;) ENABLE_SSE_IMPLEMENTATION(Vtmp2=_mm_and_ps(Vcosine_pi_over_eight,Vtmp1);) ENABLE_AVX_IMPLEMENTATION(Vch=_mm256_blendv_ps(Vch,Vcosine_pi_over_eight,Vtmp1);) 55 | ENABLE_SCALAR_IMPLEMENTATION(Sch.ui=~Stmp1.ui&Sch.ui;) ENABLE_SSE_IMPLEMENTATION(Vch=_mm_andnot_ps(Vtmp1,Vch);) 56 | ENABLE_SCALAR_IMPLEMENTATION(Sch.ui=Sch.ui|Stmp2.ui;) ENABLE_SSE_IMPLEMENTATION(Vch=_mm_or_ps(Vch,Vtmp2);) 57 | 58 | ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=Ssh.f*Ssh.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_mul_ps(Vsh,Vsh);) ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_mul_ps(Vsh,Vsh);) 59 | ENABLE_SCALAR_IMPLEMENTATION(Stmp2.f=Sch.f*Sch.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp2=_mm_mul_ps(Vch,Vch);) ENABLE_AVX_IMPLEMENTATION(Vtmp2=_mm256_mul_ps(Vch,Vch);) 60 | ENABLE_SCALAR_IMPLEMENTATION(Sc.f=Stmp2.f-Stmp1.f;) ENABLE_SSE_IMPLEMENTATION(Vc=_mm_sub_ps(Vtmp2,Vtmp1);) ENABLE_AVX_IMPLEMENTATION(Vc=_mm256_sub_ps(Vtmp2,Vtmp1);) 61 | ENABLE_SCALAR_IMPLEMENTATION(Ss.f=Sch.f*Ssh.f;) ENABLE_SSE_IMPLEMENTATION(Vs=_mm_mul_ps(Vch,Vsh);) ENABLE_AVX_IMPLEMENTATION(Vs=_mm256_mul_ps(Vch,Vsh);) 62 | ENABLE_SCALAR_IMPLEMENTATION(Ss.f=Ss.f+Ss.f;) ENABLE_SSE_IMPLEMENTATION(Vs=_mm_add_ps(Vs,Vs);) ENABLE_AVX_IMPLEMENTATION(Vs=_mm256_add_ps(Vs,Vs);) 63 | 64 | //########################################################### 65 | // Perform the actual Givens conjugation 66 | //########################################################### 67 | 68 | #ifndef USE_ACCURATE_RSQRT_IN_JACOBI_CONJUGATION 69 | ENABLE_SCALAR_IMPLEMENTATION(Stmp3.f=Stmp1.f+Stmp2.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp3=_mm_add_ps(Vtmp1,Vtmp2);) ENABLE_AVX_IMPLEMENTATION(Vtmp3=_mm256_add_ps(Vtmp1,Vtmp2);) 70 | ENABLE_SCALAR_IMPLEMENTATION(SS33.f=SS33.f*Stmp3.f;) ENABLE_SSE_IMPLEMENTATION(VS33=_mm_mul_ps(VS33,Vtmp3);) ENABLE_AVX_IMPLEMENTATION(VS33=_mm256_mul_ps(VS33,Vtmp3);) 71 | ENABLE_SCALAR_IMPLEMENTATION(SS31.f=SS31.f*Stmp3.f;) ENABLE_SSE_IMPLEMENTATION(VS31=_mm_mul_ps(VS31,Vtmp3);) ENABLE_AVX_IMPLEMENTATION(VS31=_mm256_mul_ps(VS31,Vtmp3);) 72 | ENABLE_SCALAR_IMPLEMENTATION(SS32.f=SS32.f*Stmp3.f;) ENABLE_SSE_IMPLEMENTATION(VS32=_mm_mul_ps(VS32,Vtmp3);) ENABLE_AVX_IMPLEMENTATION(VS32=_mm256_mul_ps(VS32,Vtmp3);) 73 | ENABLE_SCALAR_IMPLEMENTATION(SS33.f=SS33.f*Stmp3.f;) 
ENABLE_SSE_IMPLEMENTATION(VS33=_mm_mul_ps(VS33,Vtmp3);) ENABLE_AVX_IMPLEMENTATION(VS33=_mm256_mul_ps(VS33,Vtmp3);) 74 | #endif 75 | 76 | ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=Ss.f*SS31.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_mul_ps(Vs,VS31);) ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_mul_ps(Vs,VS31);) 77 | ENABLE_SCALAR_IMPLEMENTATION(Stmp2.f=Ss.f*SS32.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp2=_mm_mul_ps(Vs,VS32);) ENABLE_AVX_IMPLEMENTATION(Vtmp2=_mm256_mul_ps(Vs,VS32);) 78 | ENABLE_SCALAR_IMPLEMENTATION(SS31.f=Sc.f*SS31.f;) ENABLE_SSE_IMPLEMENTATION(VS31=_mm_mul_ps(Vc,VS31);) ENABLE_AVX_IMPLEMENTATION(VS31=_mm256_mul_ps(Vc,VS31);) 79 | ENABLE_SCALAR_IMPLEMENTATION(SS32.f=Sc.f*SS32.f;) ENABLE_SSE_IMPLEMENTATION(VS32=_mm_mul_ps(Vc,VS32);) ENABLE_AVX_IMPLEMENTATION(VS32=_mm256_mul_ps(Vc,VS32);) 80 | ENABLE_SCALAR_IMPLEMENTATION(SS31.f=Stmp2.f+SS31.f;) ENABLE_SSE_IMPLEMENTATION(VS31=_mm_add_ps(Vtmp2,VS31);) ENABLE_AVX_IMPLEMENTATION(VS31=_mm256_add_ps(Vtmp2,VS31);) 81 | ENABLE_SCALAR_IMPLEMENTATION(SS32.f=SS32.f-Stmp1.f;) ENABLE_SSE_IMPLEMENTATION(VS32=_mm_sub_ps(VS32,Vtmp1);) ENABLE_AVX_IMPLEMENTATION(VS32=_mm256_sub_ps(VS32,Vtmp1);) 82 | 83 | ENABLE_SCALAR_IMPLEMENTATION(Stmp2.f=Ss.f*Ss.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp2=_mm_mul_ps(Vs,Vs);) ENABLE_AVX_IMPLEMENTATION(Vtmp2=_mm256_mul_ps(Vs,Vs);) 84 | ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=SS22.f*Stmp2.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_mul_ps(VS22,Vtmp2);) ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_mul_ps(VS22,Vtmp2);) 85 | ENABLE_SCALAR_IMPLEMENTATION(Stmp3.f=SS11.f*Stmp2.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp3=_mm_mul_ps(VS11,Vtmp2);) ENABLE_AVX_IMPLEMENTATION(Vtmp3=_mm256_mul_ps(VS11,Vtmp2);) 86 | ENABLE_SCALAR_IMPLEMENTATION(Stmp4.f=Sc.f*Sc.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp4=_mm_mul_ps(Vc,Vc);) ENABLE_AVX_IMPLEMENTATION(Vtmp4=_mm256_mul_ps(Vc,Vc);) 87 | ENABLE_SCALAR_IMPLEMENTATION(SS11.f=SS11.f*Stmp4.f;) ENABLE_SSE_IMPLEMENTATION(VS11=_mm_mul_ps(VS11,Vtmp4);) ENABLE_AVX_IMPLEMENTATION(VS11=_mm256_mul_ps(VS11,Vtmp4);) 88 | ENABLE_SCALAR_IMPLEMENTATION(SS22.f=SS22.f*Stmp4.f;) ENABLE_SSE_IMPLEMENTATION(VS22=_mm_mul_ps(VS22,Vtmp4);) ENABLE_AVX_IMPLEMENTATION(VS22=_mm256_mul_ps(VS22,Vtmp4);) 89 | ENABLE_SCALAR_IMPLEMENTATION(SS11.f=SS11.f+Stmp1.f;) ENABLE_SSE_IMPLEMENTATION(VS11=_mm_add_ps(VS11,Vtmp1);) ENABLE_AVX_IMPLEMENTATION(VS11=_mm256_add_ps(VS11,Vtmp1);) 90 | ENABLE_SCALAR_IMPLEMENTATION(SS22.f=SS22.f+Stmp3.f;) ENABLE_SSE_IMPLEMENTATION(VS22=_mm_add_ps(VS22,Vtmp3);) ENABLE_AVX_IMPLEMENTATION(VS22=_mm256_add_ps(VS22,Vtmp3);) 91 | ENABLE_SCALAR_IMPLEMENTATION(Stmp4.f=Stmp4.f-Stmp2.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp4=_mm_sub_ps(Vtmp4,Vtmp2);) ENABLE_AVX_IMPLEMENTATION(Vtmp4=_mm256_sub_ps(Vtmp4,Vtmp2);) 92 | ENABLE_SCALAR_IMPLEMENTATION(Stmp2.f=SS21.f+SS21.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp2=_mm_add_ps(VS21,VS21);) ENABLE_AVX_IMPLEMENTATION(Vtmp2=_mm256_add_ps(VS21,VS21);) 93 | ENABLE_SCALAR_IMPLEMENTATION(SS21.f=SS21.f*Stmp4.f;) ENABLE_SSE_IMPLEMENTATION(VS21=_mm_mul_ps(VS21,Vtmp4);) ENABLE_AVX_IMPLEMENTATION(VS21=_mm256_mul_ps(VS21,Vtmp4);) 94 | ENABLE_SCALAR_IMPLEMENTATION(Stmp4.f=Sc.f*Ss.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp4=_mm_mul_ps(Vc,Vs);) ENABLE_AVX_IMPLEMENTATION(Vtmp4=_mm256_mul_ps(Vc,Vs);) 95 | ENABLE_SCALAR_IMPLEMENTATION(Stmp2.f=Stmp2.f*Stmp4.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp2=_mm_mul_ps(Vtmp2,Vtmp4);) ENABLE_AVX_IMPLEMENTATION(Vtmp2=_mm256_mul_ps(Vtmp2,Vtmp4);) 96 | ENABLE_SCALAR_IMPLEMENTATION(Stmp5.f=Stmp5.f*Stmp4.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_mul_ps(Vtmp5,Vtmp4);) ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_mul_ps(Vtmp5,Vtmp4);) 
97 | ENABLE_SCALAR_IMPLEMENTATION(SS11.f=SS11.f+Stmp2.f;) ENABLE_SSE_IMPLEMENTATION(VS11=_mm_add_ps(VS11,Vtmp2);) ENABLE_AVX_IMPLEMENTATION(VS11=_mm256_add_ps(VS11,Vtmp2);) 98 | ENABLE_SCALAR_IMPLEMENTATION(SS21.f=SS21.f-Stmp5.f;) ENABLE_SSE_IMPLEMENTATION(VS21=_mm_sub_ps(VS21,Vtmp5);) ENABLE_AVX_IMPLEMENTATION(VS21=_mm256_sub_ps(VS21,Vtmp5);) 99 | ENABLE_SCALAR_IMPLEMENTATION(SS22.f=SS22.f-Stmp2.f;) ENABLE_SSE_IMPLEMENTATION(VS22=_mm_sub_ps(VS22,Vtmp2);) ENABLE_AVX_IMPLEMENTATION(VS22=_mm256_sub_ps(VS22,Vtmp2);) 100 | 101 | //########################################################### 102 | // Compute the cumulative rotation, in quaternion form 103 | //########################################################### 104 | 105 | ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=Ssh.f*Sqvvx.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_mul_ps(Vsh,Vqvvx);) ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_mul_ps(Vsh,Vqvvx);) 106 | ENABLE_SCALAR_IMPLEMENTATION(Stmp2.f=Ssh.f*Sqvvy.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp2=_mm_mul_ps(Vsh,Vqvvy);) ENABLE_AVX_IMPLEMENTATION(Vtmp2=_mm256_mul_ps(Vsh,Vqvvy);) 107 | ENABLE_SCALAR_IMPLEMENTATION(Stmp3.f=Ssh.f*Sqvvz.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp3=_mm_mul_ps(Vsh,Vqvvz);) ENABLE_AVX_IMPLEMENTATION(Vtmp3=_mm256_mul_ps(Vsh,Vqvvz);) 108 | ENABLE_SCALAR_IMPLEMENTATION(Ssh.f=Ssh.f*Sqvs.f;) ENABLE_SSE_IMPLEMENTATION(Vsh=_mm_mul_ps(Vsh,Vqvs);) ENABLE_AVX_IMPLEMENTATION(Vsh=_mm256_mul_ps(Vsh,Vqvs);) 109 | 110 | ENABLE_SCALAR_IMPLEMENTATION(Sqvs.f=Sch.f*Sqvs.f;) ENABLE_SSE_IMPLEMENTATION(Vqvs=_mm_mul_ps(Vch,Vqvs);) ENABLE_AVX_IMPLEMENTATION(Vqvs=_mm256_mul_ps(Vch,Vqvs);) 111 | ENABLE_SCALAR_IMPLEMENTATION(Sqvvx.f=Sch.f*Sqvvx.f;) ENABLE_SSE_IMPLEMENTATION(Vqvvx=_mm_mul_ps(Vch,Vqvvx);) ENABLE_AVX_IMPLEMENTATION(Vqvvx=_mm256_mul_ps(Vch,Vqvvx);) 112 | ENABLE_SCALAR_IMPLEMENTATION(Sqvvy.f=Sch.f*Sqvvy.f;) ENABLE_SSE_IMPLEMENTATION(Vqvvy=_mm_mul_ps(Vch,Vqvvy);) ENABLE_AVX_IMPLEMENTATION(Vqvvy=_mm256_mul_ps(Vch,Vqvvy);) 113 | ENABLE_SCALAR_IMPLEMENTATION(Sqvvz.f=Sch.f*Sqvvz.f;) ENABLE_SSE_IMPLEMENTATION(Vqvvz=_mm_mul_ps(Vch,Vqvvz);) ENABLE_AVX_IMPLEMENTATION(Vqvvz=_mm256_mul_ps(Vch,Vqvvz);) 114 | 115 | ENABLE_SCALAR_IMPLEMENTATION(SQVVZ.f=SQVVZ.f+Ssh.f;) ENABLE_SSE_IMPLEMENTATION(VQVVZ=_mm_add_ps(VQVVZ,Vsh);) ENABLE_AVX_IMPLEMENTATION(VQVVZ=_mm256_add_ps(VQVVZ,Vsh);) 116 | ENABLE_SCALAR_IMPLEMENTATION(Sqvs.f=Sqvs.f-STMP3.f;) ENABLE_SSE_IMPLEMENTATION(Vqvs=_mm_sub_ps(Vqvs,VTMP3);) ENABLE_AVX_IMPLEMENTATION(Vqvs=_mm256_sub_ps(Vqvs,VTMP3);) 117 | ENABLE_SCALAR_IMPLEMENTATION(SQVVX.f=SQVVX.f+STMP2.f;) ENABLE_SSE_IMPLEMENTATION(VQVVX=_mm_add_ps(VQVVX,VTMP2);) ENABLE_AVX_IMPLEMENTATION(VQVVX=_mm256_add_ps(VQVVX,VTMP2);) 118 | ENABLE_SCALAR_IMPLEMENTATION(SQVVY.f=SQVVY.f-STMP1.f;) ENABLE_SSE_IMPLEMENTATION(VQVVY=_mm_sub_ps(VQVVY,VTMP1);) ENABLE_AVX_IMPLEMENTATION(VQVVY=_mm256_sub_ps(VQVVY,VTMP1);) 119 | -------------------------------------------------------------------------------- /include/Singular_Value_Decomposition_Kernel_Declarations.hpp: -------------------------------------------------------------------------------- 1 | //##################################################################### 2 | // Copyright (c) 2010-2011, Eftychios Sifakis. 3 | // 4 | // Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 5 | // * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 
6 | // * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or 7 | // other materials provided with the distribution. 8 | // 9 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, 10 | // BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT 11 | // SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 12 | // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 13 | // INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 14 | // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 15 | //##################################################################### 16 | 17 | //########################################################### 18 | // Local variable declarations 19 | //########################################################### 20 | 21 | #ifdef PRINT_DEBUGGING_OUTPUT 22 | 23 | #ifdef USE_SSE_IMPLEMENTATION 24 | float buf[4]; 25 | float A11,A21,A31,A12,A22,A32,A13,A23,A33; 26 | float S11,S21,S31,S22,S32,S33; 27 | #ifdef COMPUTE_V_AS_QUATERNION 28 | float QVS,QVVX,QVVY,QVVZ; 29 | #endif 30 | #ifdef COMPUTE_V_AS_MATRIX 31 | float V11,V21,V31,V12,V22,V32,V13,V23,V33; 32 | #endif 33 | #ifdef COMPUTE_U_AS_QUATERNION 34 | float QUS,QUVX,QUVY,QUVZ; 35 | #endif 36 | #ifdef COMPUTE_U_AS_MATRIX 37 | float U11,U21,U31,U12,U22,U32,U13,U23,U33; 38 | #endif 39 | #endif 40 | 41 | #ifdef USE_AVX_IMPLEMENTATION 42 | float buf[8]; 43 | float A11,A21,A31,A12,A22,A32,A13,A23,A33; 44 | float S11,S21,S31,S22,S32,S33; 45 | #ifdef COMPUTE_V_AS_QUATERNION 46 | float QVS,QVVX,QVVY,QVVZ; 47 | #endif 48 | #ifdef COMPUTE_V_AS_MATRIX 49 | float V11,V21,V31,V12,V22,V32,V13,V23,V33; 50 | #endif 51 | #ifdef COMPUTE_U_AS_QUATERNION 52 | float QUS,QUVX,QUVY,QUVZ; 53 | #endif 54 | #ifdef COMPUTE_U_AS_MATRIX 55 | float U11,U21,U31,U12,U22,U32,U13,U23,U33; 56 | #endif 57 | #endif 58 | 59 | #endif 60 | 61 | const float Four_Gamma_Squared=sqrt(8.)+3.; 62 | const float Sine_Pi_Over_Eight=.5*sqrt(2.-sqrt(2.)); 63 | const float Cosine_Pi_Over_Eight=.5*sqrt(2.+sqrt(2.)); 64 | 65 | ENABLE_SCALAR_IMPLEMENTATION(union {float f;unsigned int ui;} Sfour_gamma_squared;) ENABLE_SSE_IMPLEMENTATION(__m128 Vfour_gamma_squared;) ENABLE_AVX_IMPLEMENTATION(__m256 Vfour_gamma_squared;) 66 | ENABLE_SCALAR_IMPLEMENTATION(union {float f;unsigned int ui;} Ssine_pi_over_eight;) ENABLE_SSE_IMPLEMENTATION(__m128 Vsine_pi_over_eight;) ENABLE_AVX_IMPLEMENTATION(__m256 Vsine_pi_over_eight;) 67 | ENABLE_SCALAR_IMPLEMENTATION(union {float f;unsigned int ui;} Scosine_pi_over_eight;) ENABLE_SSE_IMPLEMENTATION(__m128 Vcosine_pi_over_eight;) ENABLE_AVX_IMPLEMENTATION(__m256 Vcosine_pi_over_eight;) 68 | ENABLE_SCALAR_IMPLEMENTATION(union {float f;unsigned int ui;} Sone_half;) ENABLE_SSE_IMPLEMENTATION(__m128 Vone_half;) ENABLE_AVX_IMPLEMENTATION(__m256 Vone_half;) 69 | ENABLE_SCALAR_IMPLEMENTATION(union {float f;unsigned int ui;} Sone;) ENABLE_SSE_IMPLEMENTATION(__m128 Vone;) ENABLE_AVX_IMPLEMENTATION(__m256 Vone;) 70 | ENABLE_SCALAR_IMPLEMENTATION(union {float f;unsigned int ui;} Stiny_number;) ENABLE_SSE_IMPLEMENTATION(__m128 
Vtiny_number;) ENABLE_AVX_IMPLEMENTATION(__m256 Vtiny_number;) 71 | ENABLE_SCALAR_IMPLEMENTATION(union {float f;unsigned int ui;} Ssmall_number;) ENABLE_SSE_IMPLEMENTATION(__m128 Vsmall_number;) ENABLE_AVX_IMPLEMENTATION(__m256 Vsmall_number;) 72 | 73 | ENABLE_SCALAR_IMPLEMENTATION(Sfour_gamma_squared.f=Four_Gamma_Squared;) ENABLE_SSE_IMPLEMENTATION(Vfour_gamma_squared=_mm_set1_ps(Four_Gamma_Squared);) ENABLE_AVX_IMPLEMENTATION(Vfour_gamma_squared=_mm256_set1_ps(Four_Gamma_Squared);) 74 | ENABLE_SCALAR_IMPLEMENTATION(Ssine_pi_over_eight.f=Sine_Pi_Over_Eight;) ENABLE_SSE_IMPLEMENTATION(Vsine_pi_over_eight=_mm_set1_ps(Sine_Pi_Over_Eight);) ENABLE_AVX_IMPLEMENTATION(Vsine_pi_over_eight=_mm256_set1_ps(Sine_Pi_Over_Eight);) 75 | ENABLE_SCALAR_IMPLEMENTATION(Scosine_pi_over_eight.f=Cosine_Pi_Over_Eight;) ENABLE_SSE_IMPLEMENTATION(Vcosine_pi_over_eight=_mm_set1_ps(Cosine_Pi_Over_Eight);) ENABLE_AVX_IMPLEMENTATION(Vcosine_pi_over_eight=_mm256_set1_ps(Cosine_Pi_Over_Eight);) 76 | ENABLE_SCALAR_IMPLEMENTATION(Sone_half.f=.5;) ENABLE_SSE_IMPLEMENTATION(Vone_half=_mm_set1_ps(.5);) ENABLE_AVX_IMPLEMENTATION(Vone_half=_mm256_set1_ps(.5);) 77 | ENABLE_SCALAR_IMPLEMENTATION(Sone.f=1.;) ENABLE_SSE_IMPLEMENTATION(Vone=_mm_set1_ps(1.);) ENABLE_AVX_IMPLEMENTATION(Vone=_mm256_set1_ps(1.);) 78 | ENABLE_SCALAR_IMPLEMENTATION(Stiny_number.f=1.e-20;) ENABLE_SSE_IMPLEMENTATION(Vtiny_number=_mm_set1_ps(1.e-20);) ENABLE_AVX_IMPLEMENTATION(Vtiny_number=_mm256_set1_ps(1.e-20);) 79 | ENABLE_SCALAR_IMPLEMENTATION(Ssmall_number.f=1.e-12;) ENABLE_SSE_IMPLEMENTATION(Vsmall_number=_mm_set1_ps(1.e-12);) ENABLE_AVX_IMPLEMENTATION(Vsmall_number=_mm256_set1_ps(1.e-12);) 80 | 81 | ENABLE_SCALAR_IMPLEMENTATION(union {float f;unsigned int ui;} Sa11;) ENABLE_SSE_IMPLEMENTATION(__m128 Va11;) ENABLE_AVX_IMPLEMENTATION(__m256 Va11;) 82 | ENABLE_SCALAR_IMPLEMENTATION(union {float f;unsigned int ui;} Sa21;) ENABLE_SSE_IMPLEMENTATION(__m128 Va21;) ENABLE_AVX_IMPLEMENTATION(__m256 Va21;) 83 | ENABLE_SCALAR_IMPLEMENTATION(union {float f;unsigned int ui;} Sa31;) ENABLE_SSE_IMPLEMENTATION(__m128 Va31;) ENABLE_AVX_IMPLEMENTATION(__m256 Va31;) 84 | ENABLE_SCALAR_IMPLEMENTATION(union {float f;unsigned int ui;} Sa12;) ENABLE_SSE_IMPLEMENTATION(__m128 Va12;) ENABLE_AVX_IMPLEMENTATION(__m256 Va12;) 85 | ENABLE_SCALAR_IMPLEMENTATION(union {float f;unsigned int ui;} Sa22;) ENABLE_SSE_IMPLEMENTATION(__m128 Va22;) ENABLE_AVX_IMPLEMENTATION(__m256 Va22;) 86 | ENABLE_SCALAR_IMPLEMENTATION(union {float f;unsigned int ui;} Sa32;) ENABLE_SSE_IMPLEMENTATION(__m128 Va32;) ENABLE_AVX_IMPLEMENTATION(__m256 Va32;) 87 | ENABLE_SCALAR_IMPLEMENTATION(union {float f;unsigned int ui;} Sa13;) ENABLE_SSE_IMPLEMENTATION(__m128 Va13;) ENABLE_AVX_IMPLEMENTATION(__m256 Va13;) 88 | ENABLE_SCALAR_IMPLEMENTATION(union {float f;unsigned int ui;} Sa23;) ENABLE_SSE_IMPLEMENTATION(__m128 Va23;) ENABLE_AVX_IMPLEMENTATION(__m256 Va23;) 89 | ENABLE_SCALAR_IMPLEMENTATION(union {float f;unsigned int ui;} Sa33;) ENABLE_SSE_IMPLEMENTATION(__m128 Va33;) ENABLE_AVX_IMPLEMENTATION(__m256 Va33;) 90 | 91 | #ifdef COMPUTE_V_AS_MATRIX 92 | ENABLE_SCALAR_IMPLEMENTATION(union {float f;unsigned int ui;} Sv11;) ENABLE_SSE_IMPLEMENTATION(__m128 Vv11;) ENABLE_AVX_IMPLEMENTATION(__m256 Vv11;) 93 | ENABLE_SCALAR_IMPLEMENTATION(union {float f;unsigned int ui;} Sv21;) ENABLE_SSE_IMPLEMENTATION(__m128 Vv21;) ENABLE_AVX_IMPLEMENTATION(__m256 Vv21;) 94 | ENABLE_SCALAR_IMPLEMENTATION(union {float f;unsigned int ui;} Sv31;) ENABLE_SSE_IMPLEMENTATION(__m128 Vv31;) ENABLE_AVX_IMPLEMENTATION(__m256 Vv31;) 95 | 
ENABLE_SCALAR_IMPLEMENTATION(union {float f;unsigned int ui;} Sv12;) ENABLE_SSE_IMPLEMENTATION(__m128 Vv12;) ENABLE_AVX_IMPLEMENTATION(__m256 Vv12;) 96 | ENABLE_SCALAR_IMPLEMENTATION(union {float f;unsigned int ui;} Sv22;) ENABLE_SSE_IMPLEMENTATION(__m128 Vv22;) ENABLE_AVX_IMPLEMENTATION(__m256 Vv22;) 97 | ENABLE_SCALAR_IMPLEMENTATION(union {float f;unsigned int ui;} Sv32;) ENABLE_SSE_IMPLEMENTATION(__m128 Vv32;) ENABLE_AVX_IMPLEMENTATION(__m256 Vv32;) 98 | ENABLE_SCALAR_IMPLEMENTATION(union {float f;unsigned int ui;} Sv13;) ENABLE_SSE_IMPLEMENTATION(__m128 Vv13;) ENABLE_AVX_IMPLEMENTATION(__m256 Vv13;) 99 | ENABLE_SCALAR_IMPLEMENTATION(union {float f;unsigned int ui;} Sv23;) ENABLE_SSE_IMPLEMENTATION(__m128 Vv23;) ENABLE_AVX_IMPLEMENTATION(__m256 Vv23;) 100 | ENABLE_SCALAR_IMPLEMENTATION(union {float f;unsigned int ui;} Sv33;) ENABLE_SSE_IMPLEMENTATION(__m128 Vv33;) ENABLE_AVX_IMPLEMENTATION(__m256 Vv33;) 101 | #endif 102 | 103 | #ifdef COMPUTE_V_AS_QUATERNION 104 | ENABLE_SCALAR_IMPLEMENTATION(union {float f;unsigned int ui;} Sqvs;) ENABLE_SSE_IMPLEMENTATION(__m128 Vqvs;) ENABLE_AVX_IMPLEMENTATION(__m256 Vqvs;) 105 | ENABLE_SCALAR_IMPLEMENTATION(union {float f;unsigned int ui;} Sqvvx;) ENABLE_SSE_IMPLEMENTATION(__m128 Vqvvx;) ENABLE_AVX_IMPLEMENTATION(__m256 Vqvvx;) 106 | ENABLE_SCALAR_IMPLEMENTATION(union {float f;unsigned int ui;} Sqvvy;) ENABLE_SSE_IMPLEMENTATION(__m128 Vqvvy;) ENABLE_AVX_IMPLEMENTATION(__m256 Vqvvy;) 107 | ENABLE_SCALAR_IMPLEMENTATION(union {float f;unsigned int ui;} Sqvvz;) ENABLE_SSE_IMPLEMENTATION(__m128 Vqvvz;) ENABLE_AVX_IMPLEMENTATION(__m256 Vqvvz;) 108 | #endif 109 | 110 | #ifdef COMPUTE_U_AS_MATRIX 111 | ENABLE_SCALAR_IMPLEMENTATION(union {float f;unsigned int ui;} Su11;) ENABLE_SSE_IMPLEMENTATION(__m128 Vu11;) ENABLE_AVX_IMPLEMENTATION(__m256 Vu11;) 112 | ENABLE_SCALAR_IMPLEMENTATION(union {float f;unsigned int ui;} Su21;) ENABLE_SSE_IMPLEMENTATION(__m128 Vu21;) ENABLE_AVX_IMPLEMENTATION(__m256 Vu21;) 113 | ENABLE_SCALAR_IMPLEMENTATION(union {float f;unsigned int ui;} Su31;) ENABLE_SSE_IMPLEMENTATION(__m128 Vu31;) ENABLE_AVX_IMPLEMENTATION(__m256 Vu31;) 114 | ENABLE_SCALAR_IMPLEMENTATION(union {float f;unsigned int ui;} Su12;) ENABLE_SSE_IMPLEMENTATION(__m128 Vu12;) ENABLE_AVX_IMPLEMENTATION(__m256 Vu12;) 115 | ENABLE_SCALAR_IMPLEMENTATION(union {float f;unsigned int ui;} Su22;) ENABLE_SSE_IMPLEMENTATION(__m128 Vu22;) ENABLE_AVX_IMPLEMENTATION(__m256 Vu22;) 116 | ENABLE_SCALAR_IMPLEMENTATION(union {float f;unsigned int ui;} Su32;) ENABLE_SSE_IMPLEMENTATION(__m128 Vu32;) ENABLE_AVX_IMPLEMENTATION(__m256 Vu32;) 117 | ENABLE_SCALAR_IMPLEMENTATION(union {float f;unsigned int ui;} Su13;) ENABLE_SSE_IMPLEMENTATION(__m128 Vu13;) ENABLE_AVX_IMPLEMENTATION(__m256 Vu13;) 118 | ENABLE_SCALAR_IMPLEMENTATION(union {float f;unsigned int ui;} Su23;) ENABLE_SSE_IMPLEMENTATION(__m128 Vu23;) ENABLE_AVX_IMPLEMENTATION(__m256 Vu23;) 119 | ENABLE_SCALAR_IMPLEMENTATION(union {float f;unsigned int ui;} Su33;) ENABLE_SSE_IMPLEMENTATION(__m128 Vu33;) ENABLE_AVX_IMPLEMENTATION(__m256 Vu33;) 120 | #endif 121 | 122 | #ifdef COMPUTE_U_AS_QUATERNION 123 | ENABLE_SCALAR_IMPLEMENTATION(union {float f;unsigned int ui;} Squs;) ENABLE_SSE_IMPLEMENTATION(__m128 Vqus;) ENABLE_AVX_IMPLEMENTATION(__m256 Vqus;) 124 | ENABLE_SCALAR_IMPLEMENTATION(union {float f;unsigned int ui;} Squvx;) ENABLE_SSE_IMPLEMENTATION(__m128 Vquvx;) ENABLE_AVX_IMPLEMENTATION(__m256 Vquvx;) 125 | ENABLE_SCALAR_IMPLEMENTATION(union {float f;unsigned int ui;} Squvy;) ENABLE_SSE_IMPLEMENTATION(__m128 Vquvy;) 
ENABLE_AVX_IMPLEMENTATION(__m256 Vquvy;) 126 | ENABLE_SCALAR_IMPLEMENTATION(union {float f;unsigned int ui;} Squvz;) ENABLE_SSE_IMPLEMENTATION(__m128 Vquvz;) ENABLE_AVX_IMPLEMENTATION(__m256 Vquvz;) 127 | #endif 128 | 129 | ENABLE_SCALAR_IMPLEMENTATION(union {float f;unsigned int ui;} Sc;) ENABLE_SSE_IMPLEMENTATION(__m128 Vc;) ENABLE_AVX_IMPLEMENTATION(__m256 Vc;) 130 | ENABLE_SCALAR_IMPLEMENTATION(union {float f;unsigned int ui;} Ss;) ENABLE_SSE_IMPLEMENTATION(__m128 Vs;) ENABLE_AVX_IMPLEMENTATION(__m256 Vs;) 131 | ENABLE_SCALAR_IMPLEMENTATION(union {float f;unsigned int ui;} Sch;) ENABLE_SSE_IMPLEMENTATION(__m128 Vch;) ENABLE_AVX_IMPLEMENTATION(__m256 Vch;) 132 | ENABLE_SCALAR_IMPLEMENTATION(union {float f;unsigned int ui;} Ssh;) ENABLE_SSE_IMPLEMENTATION(__m128 Vsh;) ENABLE_AVX_IMPLEMENTATION(__m256 Vsh;) 133 | ENABLE_SCALAR_IMPLEMENTATION(union {float f;unsigned int ui;} Stmp1;) ENABLE_SSE_IMPLEMENTATION(__m128 Vtmp1;) ENABLE_AVX_IMPLEMENTATION(__m256 Vtmp1;) 134 | ENABLE_SCALAR_IMPLEMENTATION(union {float f;unsigned int ui;} Stmp2;) ENABLE_SSE_IMPLEMENTATION(__m128 Vtmp2;) ENABLE_AVX_IMPLEMENTATION(__m256 Vtmp2;) 135 | ENABLE_SCALAR_IMPLEMENTATION(union {float f;unsigned int ui;} Stmp3;) ENABLE_SSE_IMPLEMENTATION(__m128 Vtmp3;) ENABLE_AVX_IMPLEMENTATION(__m256 Vtmp3;) 136 | ENABLE_SCALAR_IMPLEMENTATION(union {float f;unsigned int ui;} Stmp4;) ENABLE_SSE_IMPLEMENTATION(__m128 Vtmp4;) ENABLE_AVX_IMPLEMENTATION(__m256 Vtmp4;) 137 | ENABLE_SCALAR_IMPLEMENTATION(union {float f;unsigned int ui;} Stmp5;) ENABLE_SSE_IMPLEMENTATION(__m128 Vtmp5;) ENABLE_AVX_IMPLEMENTATION(__m256 Vtmp5;) 138 | -------------------------------------------------------------------------------- /include/Singular_Value_Decomposition_Main_Kernel_Body.hpp: -------------------------------------------------------------------------------- 1 | //##################################################################### 2 | // Copyright (c) 2010-2011, Eftychios Sifakis. 3 | // 4 | // Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 5 | // * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 6 | // * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or 7 | // other materials provided with the distribution. 8 | // 9 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, 10 | // BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT 11 | // SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 12 | // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 13 | // INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 14 | // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
15 | //##################################################################### 16 | 17 | #ifdef __INTEL_COMPILER 18 | #pragma warning( disable : 592 ) 19 | #endif 20 | 21 | // #define USE_ACCURATE_RSQRT_IN_JACOBI_CONJUGATION 22 | // #define PERFORM_STRICT_QUATERNION_RENORMALIZATION 23 | 24 | { // Begin block : Scope of qV (if not maintained) 25 | 26 | #ifndef COMPUTE_V_AS_QUATERNION 27 | ENABLE_SCALAR_IMPLEMENTATION(union {float f;unsigned int ui;} Sqvs;) ENABLE_SSE_IMPLEMENTATION(__m128 Vqvs;) ENABLE_AVX_IMPLEMENTATION(__m256 Vqvs;) 28 | ENABLE_SCALAR_IMPLEMENTATION(union {float f;unsigned int ui;} Sqvvx;) ENABLE_SSE_IMPLEMENTATION(__m128 Vqvvx;) ENABLE_AVX_IMPLEMENTATION(__m256 Vqvvx;) 29 | ENABLE_SCALAR_IMPLEMENTATION(union {float f;unsigned int ui;} Sqvvy;) ENABLE_SSE_IMPLEMENTATION(__m128 Vqvvy;) ENABLE_AVX_IMPLEMENTATION(__m256 Vqvvy;) 30 | ENABLE_SCALAR_IMPLEMENTATION(union {float f;unsigned int ui;} Sqvvz;) ENABLE_SSE_IMPLEMENTATION(__m128 Vqvvz;) ENABLE_AVX_IMPLEMENTATION(__m256 Vqvvz;) 31 | #endif 32 | 33 | { // Begin block : Symmetric eigenanalysis 34 | 35 | ENABLE_SCALAR_IMPLEMENTATION(union {float f;unsigned int ui;} Ss11;) ENABLE_SSE_IMPLEMENTATION(__m128 Vs11;) ENABLE_AVX_IMPLEMENTATION(__m256 Vs11;) 36 | ENABLE_SCALAR_IMPLEMENTATION(union {float f;unsigned int ui;} Ss21;) ENABLE_SSE_IMPLEMENTATION(__m128 Vs21;) ENABLE_AVX_IMPLEMENTATION(__m256 Vs21;) 37 | ENABLE_SCALAR_IMPLEMENTATION(union {float f;unsigned int ui;} Ss31;) ENABLE_SSE_IMPLEMENTATION(__m128 Vs31;) ENABLE_AVX_IMPLEMENTATION(__m256 Vs31;) 38 | ENABLE_SCALAR_IMPLEMENTATION(union {float f;unsigned int ui;} Ss22;) ENABLE_SSE_IMPLEMENTATION(__m128 Vs22;) ENABLE_AVX_IMPLEMENTATION(__m256 Vs22;) 39 | ENABLE_SCALAR_IMPLEMENTATION(union {float f;unsigned int ui;} Ss32;) ENABLE_SSE_IMPLEMENTATION(__m128 Vs32;) ENABLE_AVX_IMPLEMENTATION(__m256 Vs32;) 40 | ENABLE_SCALAR_IMPLEMENTATION(union {float f;unsigned int ui;} Ss33;) ENABLE_SSE_IMPLEMENTATION(__m128 Vs33;) ENABLE_AVX_IMPLEMENTATION(__m256 Vs33;) 41 | 42 | ENABLE_SCALAR_IMPLEMENTATION(Sqvs.f=1.;) ENABLE_SSE_IMPLEMENTATION(Vqvs=Vone;) ENABLE_AVX_IMPLEMENTATION(Vqvs=Vone;) 43 | ENABLE_SCALAR_IMPLEMENTATION(Sqvvx.f=0.;) ENABLE_SSE_IMPLEMENTATION(Vqvvx=_mm_xor_ps(Vqvvx,Vqvvx);) ENABLE_AVX_IMPLEMENTATION(Vqvvx=_mm256_xor_ps(Vqvvx,Vqvvx);) 44 | ENABLE_SCALAR_IMPLEMENTATION(Sqvvy.f=0.;) ENABLE_SSE_IMPLEMENTATION(Vqvvy=_mm_xor_ps(Vqvvy,Vqvvy);) ENABLE_AVX_IMPLEMENTATION(Vqvvy=_mm256_xor_ps(Vqvvy,Vqvvy);) 45 | ENABLE_SCALAR_IMPLEMENTATION(Sqvvz.f=0.;) ENABLE_SSE_IMPLEMENTATION(Vqvvz=_mm_xor_ps(Vqvvz,Vqvvz);) ENABLE_AVX_IMPLEMENTATION(Vqvvz=_mm256_xor_ps(Vqvvz,Vqvvz);) 46 | 47 | //########################################################### 48 | // Compute normal equations matrix 49 | //########################################################### 50 | 51 | ENABLE_SCALAR_IMPLEMENTATION(Ss11.f=Sa11.f*Sa11.f;) ENABLE_SSE_IMPLEMENTATION(Vs11=_mm_mul_ps(Va11,Va11);) ENABLE_AVX_IMPLEMENTATION(Vs11=_mm256_mul_ps(Va11,Va11);) 52 | ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=Sa21.f*Sa21.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_mul_ps(Va21,Va21);) ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_mul_ps(Va21,Va21);) 53 | ENABLE_SCALAR_IMPLEMENTATION(Ss11.f=Stmp1.f+Ss11.f;) ENABLE_SSE_IMPLEMENTATION(Vs11=_mm_add_ps(Vtmp1,Vs11);) ENABLE_AVX_IMPLEMENTATION(Vs11=_mm256_add_ps(Vtmp1,Vs11);) 54 | ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=Sa31.f*Sa31.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_mul_ps(Va31,Va31);) ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_mul_ps(Va31,Va31);) 55 | 
ENABLE_SCALAR_IMPLEMENTATION(Ss11.f=Stmp1.f+Ss11.f;) ENABLE_SSE_IMPLEMENTATION(Vs11=_mm_add_ps(Vtmp1,Vs11);) ENABLE_AVX_IMPLEMENTATION(Vs11=_mm256_add_ps(Vtmp1,Vs11);) 56 | 57 | ENABLE_SCALAR_IMPLEMENTATION(Ss21.f=Sa12.f*Sa11.f;) ENABLE_SSE_IMPLEMENTATION(Vs21=_mm_mul_ps(Va12,Va11);) ENABLE_AVX_IMPLEMENTATION(Vs21=_mm256_mul_ps(Va12,Va11);) 58 | ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=Sa22.f*Sa21.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_mul_ps(Va22,Va21);) ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_mul_ps(Va22,Va21);) 59 | ENABLE_SCALAR_IMPLEMENTATION(Ss21.f=Stmp1.f+Ss21.f;) ENABLE_SSE_IMPLEMENTATION(Vs21=_mm_add_ps(Vtmp1,Vs21);) ENABLE_AVX_IMPLEMENTATION(Vs21=_mm256_add_ps(Vtmp1,Vs21);) 60 | ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=Sa32.f*Sa31.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_mul_ps(Va32,Va31);) ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_mul_ps(Va32,Va31);) 61 | ENABLE_SCALAR_IMPLEMENTATION(Ss21.f=Stmp1.f+Ss21.f;) ENABLE_SSE_IMPLEMENTATION(Vs21=_mm_add_ps(Vtmp1,Vs21);) ENABLE_AVX_IMPLEMENTATION(Vs21=_mm256_add_ps(Vtmp1,Vs21);) 62 | 63 | ENABLE_SCALAR_IMPLEMENTATION(Ss31.f=Sa13.f*Sa11.f;) ENABLE_SSE_IMPLEMENTATION(Vs31=_mm_mul_ps(Va13,Va11);) ENABLE_AVX_IMPLEMENTATION(Vs31=_mm256_mul_ps(Va13,Va11);) 64 | ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=Sa23.f*Sa21.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_mul_ps(Va23,Va21);) ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_mul_ps(Va23,Va21);) 65 | ENABLE_SCALAR_IMPLEMENTATION(Ss31.f=Stmp1.f+Ss31.f;) ENABLE_SSE_IMPLEMENTATION(Vs31=_mm_add_ps(Vtmp1,Vs31);) ENABLE_AVX_IMPLEMENTATION(Vs31=_mm256_add_ps(Vtmp1,Vs31);) 66 | ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=Sa33.f*Sa31.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_mul_ps(Va33,Va31);) ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_mul_ps(Va33,Va31);) 67 | ENABLE_SCALAR_IMPLEMENTATION(Ss31.f=Stmp1.f+Ss31.f;) ENABLE_SSE_IMPLEMENTATION(Vs31=_mm_add_ps(Vtmp1,Vs31);) ENABLE_AVX_IMPLEMENTATION(Vs31=_mm256_add_ps(Vtmp1,Vs31);) 68 | 69 | ENABLE_SCALAR_IMPLEMENTATION(Ss22.f=Sa12.f*Sa12.f;) ENABLE_SSE_IMPLEMENTATION(Vs22=_mm_mul_ps(Va12,Va12);) ENABLE_AVX_IMPLEMENTATION(Vs22=_mm256_mul_ps(Va12,Va12);) 70 | ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=Sa22.f*Sa22.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_mul_ps(Va22,Va22);) ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_mul_ps(Va22,Va22);) 71 | ENABLE_SCALAR_IMPLEMENTATION(Ss22.f=Stmp1.f+Ss22.f;) ENABLE_SSE_IMPLEMENTATION(Vs22=_mm_add_ps(Vtmp1,Vs22);) ENABLE_AVX_IMPLEMENTATION(Vs22=_mm256_add_ps(Vtmp1,Vs22);) 72 | ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=Sa32.f*Sa32.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_mul_ps(Va32,Va32);) ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_mul_ps(Va32,Va32);) 73 | ENABLE_SCALAR_IMPLEMENTATION(Ss22.f=Stmp1.f+Ss22.f;) ENABLE_SSE_IMPLEMENTATION(Vs22=_mm_add_ps(Vtmp1,Vs22);) ENABLE_AVX_IMPLEMENTATION(Vs22=_mm256_add_ps(Vtmp1,Vs22);) 74 | 75 | ENABLE_SCALAR_IMPLEMENTATION(Ss32.f=Sa13.f*Sa12.f;) ENABLE_SSE_IMPLEMENTATION(Vs32=_mm_mul_ps(Va13,Va12);) ENABLE_AVX_IMPLEMENTATION(Vs32=_mm256_mul_ps(Va13,Va12);) 76 | ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=Sa23.f*Sa22.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_mul_ps(Va23,Va22);) ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_mul_ps(Va23,Va22);) 77 | ENABLE_SCALAR_IMPLEMENTATION(Ss32.f=Stmp1.f+Ss32.f;) ENABLE_SSE_IMPLEMENTATION(Vs32=_mm_add_ps(Vtmp1,Vs32);) ENABLE_AVX_IMPLEMENTATION(Vs32=_mm256_add_ps(Vtmp1,Vs32);) 78 | ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=Sa33.f*Sa32.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_mul_ps(Va33,Va32);) ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_mul_ps(Va33,Va32);) 79 | ENABLE_SCALAR_IMPLEMENTATION(Ss32.f=Stmp1.f+Ss32.f;) 
ENABLE_SSE_IMPLEMENTATION(Vs32=_mm_add_ps(Vtmp1,Vs32);) ENABLE_AVX_IMPLEMENTATION(Vs32=_mm256_add_ps(Vtmp1,Vs32);) 80 | 81 | ENABLE_SCALAR_IMPLEMENTATION(Ss33.f=Sa13.f*Sa13.f;) ENABLE_SSE_IMPLEMENTATION(Vs33=_mm_mul_ps(Va13,Va13);) ENABLE_AVX_IMPLEMENTATION(Vs33=_mm256_mul_ps(Va13,Va13);) 82 | ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=Sa23.f*Sa23.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_mul_ps(Va23,Va23);) ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_mul_ps(Va23,Va23);) 83 | ENABLE_SCALAR_IMPLEMENTATION(Ss33.f=Stmp1.f+Ss33.f;) ENABLE_SSE_IMPLEMENTATION(Vs33=_mm_add_ps(Vtmp1,Vs33);) ENABLE_AVX_IMPLEMENTATION(Vs33=_mm256_add_ps(Vtmp1,Vs33);) 84 | ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=Sa33.f*Sa33.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_mul_ps(Va33,Va33);) ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_mul_ps(Va33,Va33);) 85 | ENABLE_SCALAR_IMPLEMENTATION(Ss33.f=Stmp1.f+Ss33.f;) ENABLE_SSE_IMPLEMENTATION(Vs33=_mm_add_ps(Vtmp1,Vs33);) ENABLE_AVX_IMPLEMENTATION(Vs33=_mm256_add_ps(Vtmp1,Vs33);) 86 | 87 | //########################################################### 88 | // Solve symmetric eigenproblem using Jacobi iteration 89 | //########################################################### 90 | 91 | for(int sweep=1;sweep<=4;sweep++){ 92 | 93 | // First Jacobi conjugation 94 | 95 | #define SS11 Ss11 96 | #define SS21 Ss21 97 | #define SS31 Ss31 98 | #define SS22 Ss22 99 | #define SS32 Ss32 100 | #define SS33 Ss33 101 | #define SQVVX Sqvvx 102 | #define SQVVY Sqvvy 103 | #define SQVVZ Sqvvz 104 | #define STMP1 Stmp1 105 | #define STMP2 Stmp2 106 | #define STMP3 Stmp3 107 | 108 | #define VS11 Vs11 109 | #define VS21 Vs21 110 | #define VS31 Vs31 111 | #define VS22 Vs22 112 | #define VS32 Vs32 113 | #define VS33 Vs33 114 | #define VQVVX Vqvvx 115 | #define VQVVY Vqvvy 116 | #define VQVVZ Vqvvz 117 | #define VTMP1 Vtmp1 118 | #define VTMP2 Vtmp2 119 | #define VTMP3 Vtmp3 120 | 121 | #include "Singular_Value_Decomposition_Jacobi_Conjugation_Kernel.hpp" 122 | 123 | #undef SS11 124 | #undef SS21 125 | #undef SS31 126 | #undef SS22 127 | #undef SS32 128 | #undef SS33 129 | #undef SQVVX 130 | #undef SQVVY 131 | #undef SQVVZ 132 | #undef STMP1 133 | #undef STMP2 134 | #undef STMP3 135 | 136 | #undef VS11 137 | #undef VS21 138 | #undef VS31 139 | #undef VS22 140 | #undef VS32 141 | #undef VS33 142 | #undef VQVVX 143 | #undef VQVVY 144 | #undef VQVVZ 145 | #undef VTMP1 146 | #undef VTMP2 147 | #undef VTMP3 148 | 149 | // Second Jacobi conjugation 150 | 151 | #define SS11 Ss22 152 | #define SS21 Ss32 153 | #define SS31 Ss21 154 | #define SS22 Ss33 155 | #define SS32 Ss31 156 | #define SS33 Ss11 157 | #define SQVVX Sqvvy 158 | #define SQVVY Sqvvz 159 | #define SQVVZ Sqvvx 160 | #define STMP1 Stmp2 161 | #define STMP2 Stmp3 162 | #define STMP3 Stmp1 163 | 164 | #define VS11 Vs22 165 | #define VS21 Vs32 166 | #define VS31 Vs21 167 | #define VS22 Vs33 168 | #define VS32 Vs31 169 | #define VS33 Vs11 170 | #define VQVVX Vqvvy 171 | #define VQVVY Vqvvz 172 | #define VQVVZ Vqvvx 173 | #define VTMP1 Vtmp2 174 | #define VTMP2 Vtmp3 175 | #define VTMP3 Vtmp1 176 | 177 | #include "Singular_Value_Decomposition_Jacobi_Conjugation_Kernel.hpp" 178 | 179 | #undef SS11 180 | #undef SS21 181 | #undef SS31 182 | #undef SS22 183 | #undef SS32 184 | #undef SS33 185 | #undef SQVVX 186 | #undef SQVVY 187 | #undef SQVVZ 188 | #undef STMP1 189 | #undef STMP2 190 | #undef STMP3 191 | 192 | #undef VS11 193 | #undef VS21 194 | #undef VS31 195 | #undef VS22 196 | #undef VS32 197 | #undef VS33 198 | #undef VQVVX 199 | #undef VQVVY 200 | #undef VQVVZ 201 | #undef 
VTMP1
202 | #undef VTMP2
203 | #undef VTMP3
204 |
205 | // Third Jacobi conjugation
206 |
207 | #define SS11 Ss33
208 | #define SS21 Ss31
209 | #define SS31 Ss32
210 | #define SS22 Ss11
211 | #define SS32 Ss21
212 | #define SS33 Ss22
213 | #define SQVVX Sqvvz
214 | #define SQVVY Sqvvx
215 | #define SQVVZ Sqvvy
216 | #define STMP1 Stmp3
217 | #define STMP2 Stmp1
218 | #define STMP3 Stmp2
219 |
220 | #define VS11 Vs33
221 | #define VS21 Vs31
222 | #define VS31 Vs32
223 | #define VS22 Vs11
224 | #define VS32 Vs21
225 | #define VS33 Vs22
226 | #define VQVVX Vqvvz
227 | #define VQVVY Vqvvx
228 | #define VQVVZ Vqvvy
229 | #define VTMP1 Vtmp3
230 | #define VTMP2 Vtmp1
231 | #define VTMP3 Vtmp2
232 |
233 | #include "Singular_Value_Decomposition_Jacobi_Conjugation_Kernel.hpp"
234 |
235 | #undef SS11
236 | #undef SS21
237 | #undef SS31
238 | #undef SS22
239 | #undef SS32
240 | #undef SS33
241 | #undef SQVVX
242 | #undef SQVVY
243 | #undef SQVVZ
244 | #undef STMP1
245 | #undef STMP2
246 | #undef STMP3
247 |
248 | #undef VS11
249 | #undef VS21
250 | #undef VS31
251 | #undef VS22
252 | #undef VS32
253 | #undef VS33
254 | #undef VQVVX
255 | #undef VQVVY
256 | #undef VQVVZ
257 | #undef VTMP1
258 | #undef VTMP2
259 | #undef VTMP3
260 | }
261 |
262 | #ifdef PRINT_DEBUGGING_OUTPUT
263 | #ifdef USE_SCALAR_IMPLEMENTATION
264 | std::cout<<"Scalar S ="<<std::endl;
265 | std::cout<<std::setw(12)<<Ss11.f<<std::endl;
266 | std::cout<<std::setw(12)<<Ss21.f<<"  "<<std::setw(12)<<Ss22.f<<std::endl;
267 | std::cout<<std::setw(12)<<Ss31.f<<"  "<<std::setw(12)<<Ss32.f<<"  "<<std::setw(12)<<Ss33.f<<std::endl;
268 | #endif
269 | #endif
--------------------------------------------------------------------------------
/include/Singular_Value_Decomposition_Preamble.hpp:
--------------------------------------------------------------------------------
17 | #ifdef PRINT_DEBUGGING_OUTPUT
18 | #include <iomanip>
19 | #include <iostream>
20 | #endif
21 |
22 | #ifdef USE_SCALAR_IMPLEMENTATION
23 | #define ENABLE_SCALAR_IMPLEMENTATION(X) X
24 | #else
25 | #define ENABLE_SCALAR_IMPLEMENTATION(X)
26 | #endif
27 |
28 | #ifdef USE_SSE_IMPLEMENTATION
29 | #define ENABLE_SSE_IMPLEMENTATION(X) X
30 | #else
31 | #define ENABLE_SSE_IMPLEMENTATION(X)
32 | #endif
33 |
34 | #ifdef USE_AVX_IMPLEMENTATION
35 | #include <immintrin.h>
36 | #define ENABLE_AVX_IMPLEMENTATION(X) X
37 | #else
38 | #include <xmmintrin.h>
39 | #define ENABLE_AVX_IMPLEMENTATION(X)
40 | #endif
41 |
42 | #ifdef USE_SCALAR_IMPLEMENTATION
43 | // Changed to inline
44 | inline float rsqrt(const float f)
45 | {
46 |     float buf[4];
47 |     buf[0]=f;
48 |     __m128 v=_mm_loadu_ps(buf);
49 |     v=_mm_rsqrt_ss(v);
50 |     _mm_storeu_ps(buf,v);
51 |     return buf[0];
52 | }
53 | #endif
54 |
55 |
56 |
--------------------------------------------------------------------------------
/include/WunderSVD3x3.h:
--------------------------------------------------------------------------------
1 | #include <Eigen/Dense>
2 |
3 | // Super fast 3x3 SVD according to http://pages.cs.wisc.edu/~sifakis/project_pages/svd.html
4 | // The resulting decomposition is A = U * diag(S[0], S[1], S[2]) * V'
5 | // BEWARE: this SVD algorithm guarantees that det(U) = det(V) = 1, but this
6 | // comes at the cost that 'sigma3' can be negative
7 | // for computing the polar decomposition it's great, because all we need to do is U*V'
8 | // and the result will automatically have a positive determinant
9 |
10 | template <typename T>
11 | void wunderSVD3x3(const Eigen::Matrix<T, 3, 3>& A, Eigen::Matrix<T, 3, 3> &U, Eigen::Matrix<T, 3, 1> &S, Eigen::Matrix<T, 3, 3> &V);
12 |
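Given the guarantees in the comment above, the rotation factor of the polar decomposition of A is simply U*V'. A minimal sketch of that use follows; the helper name `closest_rotation` is hypothetical and not part of this header:

    #include <Eigen/Dense>
    #include "WunderSVD3x3.h"

    // Rotation factor of the polar decomposition of A.
    // Because det(U) = det(V) = 1, the product is a proper rotation (det = +1)
    // even when A contains a reflection; the sign is absorbed into S(2).
    inline Eigen::Matrix3f closest_rotation(const Eigen::Matrix3f &A)
    {
        Eigen::Matrix3f U, V;
        Eigen::Vector3f S;
        wunderSVD3x3(A, U, S, V);  // S(2) may come out negative, by design
        return U * V.transpose();
    }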
--------------------------------------------------------------------------------
/include/arap_wrapper.h:
--------------------------------------------------------------------------------
1 | #ifndef ARAP_WRAPPER_H
2 | #define ARAP_WRAPPER_H
3 |
4 | #ifdef WIN32
5 | //----------------------------------------------------------------------
6 | // To compile the code into a Windows DLL, you must define the
7 | // symbol ARAP_DLL_EXPORTS.
8 | //
9 | // To compile the code statically into a Windows executable
10 | // (i.e. not using a separate DLL) define ARAP_STATIC.
11 | //----------------------------------------------------------------------
12 | #ifdef ARAP_STATIC
13 | #define ARAP_DLL_API // since ARAP_STATIC is defined, code is statically
14 |                      // linked and no exports or imports are needed
15 | #else
16 |
17 | #ifdef ARAP_DLL_EXPORTS
18 | #define ARAP_DLL_API __declspec(dllexport)
19 | #else
20 | #define ARAP_DLL_API __declspec(dllimport)
21 | #endif
22 |
23 | #endif
24 |
25 | #else
26 | //----------------------------------------------------------------------
27 | // ARAP_DLL_API is ignored for all other systems
28 | //----------------------------------------------------------------------
29 | #define ARAP_DLL_API
30 | #endif
31 |
32 | #endif
--------------------------------------------------------------------------------
/lib/arap_debug.dll:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fanxiaochen/ARAP/7247cd90991567f8e2b0619c8d2fa678676f9c51/lib/arap_debug.dll
--------------------------------------------------------------------------------
/lib/arap_debug.lib:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fanxiaochen/ARAP/7247cd90991567f8e2b0619c8d2fa678676f9c51/lib/arap_debug.lib
--------------------------------------------------------------------------------
/lib/arap_release.dll:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fanxiaochen/ARAP/7247cd90991567f8e2b0619c8d2fa678676f9c51/lib/arap_release.dll
--------------------------------------------------------------------------------
/lib/arap_release.lib:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fanxiaochen/ARAP/7247cd90991567f8e2b0619c8d2fa678676f9c51/lib/arap_release.lib
--------------------------------------------------------------------------------
/src/Deform.cpp:
--------------------------------------------------------------------------------
1 | #include "Deform.h"
2 |
3 | using namespace std;
4 | using namespace Eigen;
5 |
6 | Deform::Deform(const float *P_data, int P_Num, const AdjList &adj_list, const TriangleList &triangle_list)
7 |     :at(ORIGIN_HARD),
8 |     P_data(P_data),
9 |     P_Num(P_Num),
10 |     max_iter(10),
11 |     min_tol(1e-3),
12 |     lambda(5),
13 |     adj_list(adj_list)
14 | {
15 |
16 |     for (size_t i = 0, i_end = triangle_list.size(); i < i_end; ++ i)
17 |     {
18 |         std::vector<int> face = triangle_list[i];
19 |         std::sort(face.begin(), face.end());
20 |         assert(face.size() == 3);
21 |         face_list.push_back(Eigen::Vector3i(face[0], face[1], face[2]));
22 |     }
23 |
24 |     P.resize(3, P_Num);
25 |     for (int i = 0; i != P_Num; ++i) {P.col(i) << P_data[3*i], P_data[3*i+1], P_data[3*i+2];}
26 |     P_Prime = P;
27 |     R = vector<Matrix3f>(P_Num, Matrix3f::Identity());
28 |
29 |     // Weight
30 |     build_weight_matrix();
31 | }
32 |
33 | float *Deform::do_Deform()
34 | {
35 |     int iter = 0;
36 |     double delta = 0;
37 |
38 |     set_linear_sys();
39 |
40 |     do {
41 |         delta = update_P_Prime();
42 |         update_Ri();
43 |         build_rh_d();
44 |         cout << "iter: " << ++ iter << "\tdelta: " << delta << endl;
45 |
46 |     } while(delta > min_tol && iter < max_iter);
47 |
48 |     return P_Prime.data();
49 | }
50 |
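do_Deform() above alternates the global linear solve (update_P_Prime) with the per-vertex rotation fit (update_Ri) until the relative change delta falls below the tolerance or max_iter is reached. A hedged driver sketch on a toy mesh; this program is not part of the repository, and the tetrahedron data is made up purely for illustration:

    #include "Deform.h"

    int main()
    {
        // One tetrahedron: 4 vertices (xyz-interleaved), 4 triangular faces.
        float P[] = { 0,0,0,  1,0,0,  0,1,0,  0,0,1 };
        Deform::AdjList adj = { {1,2,3}, {0,2,3}, {0,1,3}, {0,1,2} };
        Deform::TriangleList tris = { {0,1,2}, {0,1,3}, {0,2,3}, {1,2,3} };

        Deform solver(P, 4, adj, tris);
        solver.set_max_iteration(20);
        solver.set_tolerance(1e-4f);

        // Pin vertex 0 in place and drag vertex 3 upward (hard constraints).
        Deform::VectorF targets = { 0,0,0,  0,0,1.5f };
        Deform::VectorI ids = { 0, 3 };
        solver.set_hard_ctrs(targets, ids);

        float *P_prime = solver.do_Deform();  // P_prime[3*i + k] = coordinate k of vertex i
        return 0;
    }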
51 | float *Deform::do_Deform(int iter_num)
52 | {
53 |     int iter = 0;
54 |     double delta = 0;
55 |
56 |     set_linear_sys();
57 |
58 |     do {
59 |         delta = update_P_Prime();
60 |         update_Ri();
61 |         build_rh_d();
62 |         cout << "iter: " << ++ iter << "\tdelta: " << delta << endl;
63 |
64 |     } while(iter < iter_num);
65 |
66 |     return P_Prime.data();
67 | }
68 |
69 | float *Deform::get_P_Prime()
70 | {
71 |     return P_Prime.data();
72 | }
73 |
74 | float* Deform::do_Deform_Iter(float &delta)
75 | {
76 |     set_linear_sys();
77 |     delta = update_P_Prime();
78 |     return P_Prime.data();
79 | }
80 |
81 | void Deform::set_arap_type(ArapType at)
82 | {
83 |     this->at = at;
84 | }
85 |
86 | void Deform::set_tolerance(float tol)
87 | {
88 |     min_tol = tol;
89 | }
90 |
91 | void Deform::set_max_iteration(int iter_num)
92 | {
93 |     max_iter = iter_num;
94 | }
95 |
96 | void Deform::set_lambda(float lambda)
97 | {
98 |     this->lambda = lambda;
99 | }
100 |
101 | void Deform::set_hard_ctrs(const VectorF &T, const VectorI &idx_T)
102 | {
103 |     assert(T.size()/3 == idx_T.size());
104 |
105 |     for (int i = 0, i_end = idx_T.size(); i < i_end; ++ i)
106 |     {
107 |         int idx = idx_T[i];
108 |         P_Prime.col(idx) << T[3*i], T[3*i+1], T[3*i+2];
109 |     }
110 |
111 |     for (int i = 0, i_end = idx_T.size(); i < i_end; ++ i)
112 |     {
113 |         int cid = idx_T[i];
114 |
115 |         Eigen::Vector3f ctr;
116 |         ctr << T[3*i], T[3*i+1], T[3*i+2];
117 |
118 |         hard_ctrs.push_back(Constraint(ctr, cid));
119 |     }
120 |
121 |     std::sort(hard_ctrs.begin(), hard_ctrs.end(), ConstraintCompare());
122 | }
123 |
124 | void Deform::set_soft_ctrs(const VectorF &T, const VectorI &idx_T)
125 | {
126 |     assert(T.size()/3 == idx_T.size());
127 |
128 |     for (int i = 0, i_end = idx_T.size(); i < i_end; ++ i)
129 |     {
130 |         int cid = idx_T[i];
131 |
132 |         Eigen::Vector3f ctr;
133 |         ctr << T[3*i], T[3*i+1], T[3*i+2];
134 |
135 |         soft_ctrs.push_back(Constraint(ctr, cid));
136 |     }
137 |
138 |     std::sort(soft_ctrs.begin(), soft_ctrs.end(), ConstraintCompare());
139 | }
140 |
141 | float Deform::energy()
142 | {
143 |     // to do (see the hedged sketch after build_rh_d below)
144 |     return 0;
145 | }
146 |
147 | void Deform::build_weight_matrix()
148 | {
149 |     vector<Triplet<float>> weight_list;
150 |     weight_list.reserve(3*7*P_Num); // assume roughly 7 connected vertices per vertex
151 |
152 |     for (int i = 0; i != P_Num; ++i)
153 |     {
154 |         for (decltype(adj_list[i].size()) j = 0; j != adj_list[i].size(); ++j)
155 |         {
156 |             int id_j = adj_list[i][j];
157 |
158 |             vector<int> share_vertex;
159 |             find_share_vertex(i, id_j, share_vertex);
160 |
161 |             float wij = 0;
162 |             if (share_vertex.size()==2) wij = compute_wij(&P_data[3*i], &P_data[3*id_j],
163 |                 &P_data[3*share_vertex[0]], &P_data[3*share_vertex[1]]);
164 |             else wij = compute_wij(&P_data[3*i], &P_data[3*id_j], &P_data[3*share_vertex[0]]);
165 |
166 |             weight_list.push_back(Triplet<float>(i, id_j, wij));
167 |             weight_list.push_back(Triplet<float>(i+P_Num, id_j+P_Num, wij));
168 |             weight_list.push_back(Triplet<float>(i+2*P_Num, id_j+2*P_Num, wij));
169 |         }
170 |     }
171 |
172 |     Weight.resize(3*P_Num, 3*P_Num);
173 |     Weight.setFromTriplets(weight_list.begin(), weight_list.end());
174 | }
175 |
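The per-edge weights assembled above are the cotangent weights of Sorkine & Alexa:

    w_ij = (cot(alpha_ij) + cot(beta_ij)) / 2

where alpha_ij and beta_ij are the two angles opposite the edge (i, j), and only one angle exists for a boundary edge (the three-argument call above). compute_wij below recovers each cotangent from edge lengths via the law of cosines, using cot(theta) = cos(theta) / sqrt(1 - cos(theta)^2); note that its fabs() forces the cotangents of obtuse angles positive, a robustness tweak that keeps all weights nonnegative and deviates slightly from the paper.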
176 | void Deform::build_laplacian_matrix()
177 | {
178 |     SparseMatrix<float> weight = Weight;
179 |
180 |     vector<Triplet<float>> weight_sum;
181 |     weight_sum.reserve(3*P_Num);
182 |
183 |     for (int i = 0; i != P_Num; ++i)
184 |     {
185 |         float wi = 0;
186 |
187 |         for (decltype(adj_list[i].size()) j = 0; j != adj_list[i].size(); ++j)
188 |         {
189 |             int id_j = adj_list[i][j];
190 |
191 |             if (is_hard_ctrs(i) == -1)
192 |                 wi += Weight.coeffRef(i, id_j);
193 |             else
194 |             {
195 |                 weight.coeffRef(i, id_j) = 0;
196 |                 weight.coeffRef(i+P_Num, id_j+P_Num) = 0;
197 |                 weight.coeffRef(i+2*P_Num, id_j+2*P_Num) = 0;
198 |
199 |                 wi = 1;
200 |             }
201 |         }
202 |
203 |         if (is_soft_ctrs(i) != -1 && at == HARD_SOFT)
204 |             wi += lambda / 2;
205 |
206 |         weight_sum.push_back(Triplet<float>(i, i, wi));
207 |         weight_sum.push_back(Triplet<float>(i+P_Num, i+P_Num, wi));
208 |         weight_sum.push_back(Triplet<float>(i+2*P_Num, i+2*P_Num, wi));
209 |     }
210 |
211 |     SparseMatrix<float> Weight_sum(3*P_Num, 3*P_Num);
212 |     Weight_sum.setFromTriplets(weight_sum.begin(), weight_sum.end());
213 |
214 |     L = Weight_sum - weight;
215 |
216 |     chol.analyzePattern(L);
217 |     chol.factorize(L);
218 | }
219 |
220 | int Deform::is_hard_ctrs(int id)
221 | {
222 |     // binary search on the sorted constraint set
223 |     // (the upper bound must start at size()-1; starting at size() could read one past the end)
224 |     int k = 0, k_end = (int)hard_ctrs.size() - 1;
225 |
226 |     while (k <= k_end)
227 |     {
228 |         int m = (k + k_end) / 2;
229 |
230 |         if (id == hard_ctrs[m].cid)
231 |             return m;
232 |         else if (id < hard_ctrs[m].cid)
233 |             k_end = m - 1;
234 |         else
235 |             k = m + 1;
236 |     }
237 |
238 |     return -1;
239 | }
240 |
241 | int Deform::is_soft_ctrs(int id)
242 | {
243 |     // binary search on the sorted constraint set (same bound fix as is_hard_ctrs)
244 |     int k = 0, k_end = (int)soft_ctrs.size() - 1;
245 |
246 |     while (k <= k_end)
247 |     {
248 |         int m = (k + k_end) / 2;
249 |
250 |         if (id == soft_ctrs[m].cid)
251 |             return m;
252 |         else if (id < soft_ctrs[m].cid)
253 |             k_end = m - 1;
254 |         else
255 |             k = m + 1;
256 |     }
257 |
258 |     return -1;
259 | }
260 |
261 | void Deform::build_rh_d()
262 | {
263 |     d = MatrixX3f::Zero(P_Num, 3);
264 |     for (int i = 0; i != P_Num; ++i) {
265 |
266 |         int hctr_idx = is_hard_ctrs(i);
267 |         if (hctr_idx != -1)
268 |             d.row(i) = hard_ctrs[hctr_idx].ctr.transpose();
269 |         else
270 |         {
271 |             // if there is no isolated (unconnected) vertex, this loop can be written more efficiently
272 |             for (decltype(adj_list[i].size()) j = 0; j != adj_list[i].size(); ++j) {
273 |                 d.row(i) += ((Weight.coeffRef(i, adj_list[i][j])/2)*(R[i]+R[adj_list[i][j]])*(P.col(i) - P.col(adj_list[i][j]))).transpose();
274 |             }
275 |         }
276 |
277 |         int sctr_idx = is_soft_ctrs(i);
278 |         if (sctr_idx != -1)
279 |             d.row(i) += (lambda / 2) * soft_ctrs[sctr_idx].ctr.transpose();
280 |     }
281 | }
282 |
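build_rh_d assembles the right-hand side of the global step, b_i = sum_j (w_ij / 2) (R_i + R_j) (p_i - p_j), as in Sorkine & Alexa's equations (8)-(9), plus the hard/soft constraint rows. The energy() member above is still a stub; a hedged sketch of the quantity it would report, following the paper's equation (3). This body is not in the repository and ignores the soft-constraint penalty term:

    // E(P') = sum_i sum_{j in N(i)} w_ij * || (p'_i - p'_j) - R_i (p_i - p_j) ||^2
    float Deform::energy()
    {
        float e = 0;
        for (int i = 0; i != P_Num; ++i) {
            for (decltype(adj_list[i].size()) j = 0; j != adj_list[i].size(); ++j) {
                int id_j = adj_list[i][j];
                Vector3f r = (P_Prime.col(i) - P_Prime.col(id_j)) - R[i] * (P.col(i) - P.col(id_j));
                e += Weight.coeffRef(i, id_j) * r.squaredNorm();
            }
        }
        return e;
    }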
283 | float Deform::compute_wij(const float *p1, const float *p2, const float *p3, const float *p4)
284 | {
285 |     float e1 = sqrt((p1[0]-p2[0])*(p1[0]-p2[0])+(p1[1]-p2[1])*(p1[1]-p2[1])+(p1[2]-p2[2])*(p1[2]-p2[2]));
286 |     float e2 = sqrt((p1[0]-p3[0])*(p1[0]-p3[0])+(p1[1]-p3[1])*(p1[1]-p3[1])+(p1[2]-p3[2])*(p1[2]-p3[2]));
287 |     float e3 = sqrt((p3[0]-p2[0])*(p3[0]-p2[0])+(p3[1]-p2[1])*(p3[1]-p2[1])+(p3[2]-p2[2])*(p3[2]-p2[2]));
288 |     float alpha_cos = fabs((e3*e3+e2*e2-e1*e1)/(2*e3*e2)); // law of cosines; fabs keeps the weight positive for obtuse angles
289 |     float beta_cos = 0;
290 |     if (p4 != nullptr) {
291 |         float e4 = sqrt((p1[0]-p4[0])*(p1[0]-p4[0])+(p1[1]-p4[1])*(p1[1]-p4[1])+(p1[2]-p4[2])*(p1[2]-p4[2]));
292 |         float e5 = sqrt((p4[0]-p2[0])*(p4[0]-p2[0])+(p4[1]-p2[1])*(p4[1]-p2[1])+(p4[2]-p2[2])*(p4[2]-p2[2]));
293 |         beta_cos = fabs((e4*e4+e5*e5-e1*e1)/(2*e4*e5));
294 |     }
295 |     return ((alpha_cos/sqrt(1-alpha_cos*alpha_cos))+(beta_cos/sqrt(1-beta_cos*beta_cos)))/2; // (cot(alpha) + cot(beta)) / 2
296 | }
297 |
298 | void Deform::find_share_vertex(int pi, int pj, VectorI &share_vertex)
299 | {
300 |     vector<int> vertices;
301 |     set_intersection(adj_list[pi].begin(), adj_list[pi].end(), adj_list[pj].begin(), adj_list[pj].end(), back_inserter(vertices));
302 |     for (auto &i : vertices) {
303 |         vector<int> f;
304 |         f.push_back(pi);
305 |         f.push_back(pj);
306 |         f.push_back(i);
307 |         sort(f.begin(), f.end());
308 |         vector<Vector3i>::iterator it = find(face_list.begin(), face_list.end(), Map<Vector3i>(&f[0]));
309 |         if (it != face_list.end()) {
310 |             if ((*it)(0) != pi && (*it)(0) != pj) share_vertex.push_back((*it)(0));
311 |             else if ((*it)(1) != pi && (*it)(1) != pj) share_vertex.push_back((*it)(1));
312 |             else share_vertex.push_back((*it)(2));
313 |         }
314 |     }
315 |     if (share_vertex.size() > 2) {
316 |         cout << "share vertices number warning: " << share_vertex.size() << endl;
317 |     }
318 | }
319 |
320 | void Deform::update_Ri()
321 | {
322 |     Matrix3f Si;
323 |     MatrixXf Di;
324 |     Matrix3Xf Pi_Prime;
325 |     Matrix3Xf Pi;
326 |     for (int i = 0; i != P_Num; ++i) {
327 |         Di = MatrixXf::Zero(adj_list[i].size(), adj_list[i].size());
328 |         Pi_Prime.resize(3, adj_list[i].size());
329 |         Pi.resize(3, adj_list[i].size());
330 |         // if there is no isolated (unconnected) vertex, this loop can be written more efficiently
331 |         for (decltype(adj_list[i].size()) j = 0; j != adj_list[i].size(); ++j) {
332 |             Di(j, j) = Weight.coeffRef(i, adj_list[i][j]);
333 |             Pi.col(j) = P.col(i) - P.col(adj_list[i][j]);
334 |             Pi_Prime.col(j) = P_Prime.col(i) - P_Prime.col(adj_list[i][j]);
335 |         }
336 |         Si = Pi * Di * Pi_Prime.transpose();
337 |         Matrix3f Ui;
338 |         Vector3f Wi;
339 |         Matrix3f Vi;
340 |         wunderSVD3x3(Si, Ui, Wi, Vi);
341 |         R[i] = Vi * Ui.transpose();
342 |
343 |         if (R[i].determinant() < 0)
344 |             std::cout << "determinant is negative!" << std::endl;
345 |     }
346 | }
347 |
348 | float Deform::update_P_Prime()
349 | {
350 |     VectorXf d_vec(VectorXf::Map(d.data(), d.cols()*d.rows()));
351 |     VectorXf P_Prime_vec = chol.solve(d_vec);
352 |     P_Prime.row(0) = P_Prime_vec.segment(0, P_Num);
353 |     P_Prime.row(1) = P_Prime_vec.segment(0+P_Num, P_Num);
354 |     P_Prime.row(2) = P_Prime_vec.segment(0+2*P_Num, P_Num);
355 |
356 |     return (P_Prime - P).norm() / P.norm();
357 | }
358 |
359 |
360 | void Deform::set_linear_sys()
361 | {
362 |     build_laplacian_matrix();
363 |     build_rh_d();
364 | }
365 |
--------------------------------------------------------------------------------
/src/WunderSVD3x3.cpp:
--------------------------------------------------------------------------------
1 | #include "WunderSVD3x3.h"
2 |
3 | #include <cmath>
4 | #include <algorithm>
5 |
6 | #define USE_SCALAR_IMPLEMENTATION
7 | #undef USE_SSE_IMPLEMENTATION
8 | #undef USE_AVX_IMPLEMENTATION
9 | #define COMPUTE_U_AS_MATRIX
10 | #define COMPUTE_V_AS_MATRIX
11 | #include "Singular_Value_Decomposition_Preamble.hpp"
12 |
13 | #pragma runtime_checks( "u", off ) // disable runtime checks on "xor eax,eax"-style uninitialized reads (doesn't always work; disable explicitly in compiler settings if needed)
14 | template <typename T>
15 | void wunderSVD3x3(const Eigen::Matrix<T, 3, 3>& A, Eigen::Matrix<T, 3, 3> &U, Eigen::Matrix<T, 3, 1> &S, Eigen::Matrix<T, 3, 3> &V)
16 | {
17 |     // this code only supports the scalar version (otherwise we'd need to pass arrays of matrices)
18 |
19 | #include "Singular_Value_Decomposition_Kernel_Declarations.hpp"
20 |
21 | ENABLE_SCALAR_IMPLEMENTATION(Sa11.f=A(0,0);) ENABLE_SSE_IMPLEMENTATION(Va11=_mm_loadu_ps(a11);) ENABLE_AVX_IMPLEMENTATION(Va11=_mm256_loadu_ps(a11);)
22 | ENABLE_SCALAR_IMPLEMENTATION(Sa21.f=A(1,0);) ENABLE_SSE_IMPLEMENTATION(Va21=_mm_loadu_ps(a21);) ENABLE_AVX_IMPLEMENTATION(Va21=_mm256_loadu_ps(a21);)
23 | ENABLE_SCALAR_IMPLEMENTATION(Sa31.f=A(2,0);) ENABLE_SSE_IMPLEMENTATION(Va31=_mm_loadu_ps(a31);) ENABLE_AVX_IMPLEMENTATION(Va31=_mm256_loadu_ps(a31);)
24 | ENABLE_SCALAR_IMPLEMENTATION(Sa12.f=A(0,1);) ENABLE_SSE_IMPLEMENTATION(Va12=_mm_loadu_ps(a12);) ENABLE_AVX_IMPLEMENTATION(Va12=_mm256_loadu_ps(a12);)
25 | ENABLE_SCALAR_IMPLEMENTATION(Sa22.f=A(1,1);) ENABLE_SSE_IMPLEMENTATION(Va22=_mm_loadu_ps(a22);) ENABLE_AVX_IMPLEMENTATION(Va22=_mm256_loadu_ps(a22);)
26 | ENABLE_SCALAR_IMPLEMENTATION(Sa32.f=A(2,1);) ENABLE_SSE_IMPLEMENTATION(Va32=_mm_loadu_ps(a32);)
ENABLE_AVX_IMPLEMENTATION(Va32=_mm256_loadu_ps(a32);) 27 | ENABLE_SCALAR_IMPLEMENTATION(Sa13.f=A(0,2);) ENABLE_SSE_IMPLEMENTATION(Va13=_mm_loadu_ps(a13);) ENABLE_AVX_IMPLEMENTATION(Va13=_mm256_loadu_ps(a13);) 28 | ENABLE_SCALAR_IMPLEMENTATION(Sa23.f=A(1,2);) ENABLE_SSE_IMPLEMENTATION(Va23=_mm_loadu_ps(a23);) ENABLE_AVX_IMPLEMENTATION(Va23=_mm256_loadu_ps(a23);) 29 | ENABLE_SCALAR_IMPLEMENTATION(Sa33.f=A(2,2);) ENABLE_SSE_IMPLEMENTATION(Va33=_mm_loadu_ps(a33);) ENABLE_AVX_IMPLEMENTATION(Va33=_mm256_loadu_ps(a33);) 30 | 31 | #include "Singular_Value_Decomposition_Main_Kernel_Body.hpp" 32 | 33 | ENABLE_SCALAR_IMPLEMENTATION(U(0,0)=Su11.f;) ENABLE_SSE_IMPLEMENTATION(_mm_storeu_ps(u11,Vu11);) ENABLE_AVX_IMPLEMENTATION(_mm256_storeu_ps(u11,Vu11);) 34 | ENABLE_SCALAR_IMPLEMENTATION(U(1,0)=Su21.f;) ENABLE_SSE_IMPLEMENTATION(_mm_storeu_ps(u21,Vu21);) ENABLE_AVX_IMPLEMENTATION(_mm256_storeu_ps(u21,Vu21);) 35 | ENABLE_SCALAR_IMPLEMENTATION(U(2,0)=Su31.f;) ENABLE_SSE_IMPLEMENTATION(_mm_storeu_ps(u31,Vu31);) ENABLE_AVX_IMPLEMENTATION(_mm256_storeu_ps(u31,Vu31);) 36 | ENABLE_SCALAR_IMPLEMENTATION(U(0,1)=Su12.f;) ENABLE_SSE_IMPLEMENTATION(_mm_storeu_ps(u12,Vu12);) ENABLE_AVX_IMPLEMENTATION(_mm256_storeu_ps(u12,Vu12);) 37 | ENABLE_SCALAR_IMPLEMENTATION(U(1,1)=Su22.f;) ENABLE_SSE_IMPLEMENTATION(_mm_storeu_ps(u22,Vu22);) ENABLE_AVX_IMPLEMENTATION(_mm256_storeu_ps(u22,Vu22);) 38 | ENABLE_SCALAR_IMPLEMENTATION(U(2,1)=Su32.f;) ENABLE_SSE_IMPLEMENTATION(_mm_storeu_ps(u32,Vu32);) ENABLE_AVX_IMPLEMENTATION(_mm256_storeu_ps(u32,Vu32);) 39 | ENABLE_SCALAR_IMPLEMENTATION(U(0,2)=Su13.f;) ENABLE_SSE_IMPLEMENTATION(_mm_storeu_ps(u13,Vu13);) ENABLE_AVX_IMPLEMENTATION(_mm256_storeu_ps(u13,Vu13);) 40 | ENABLE_SCALAR_IMPLEMENTATION(U(1,2)=Su23.f;) ENABLE_SSE_IMPLEMENTATION(_mm_storeu_ps(u23,Vu23);) ENABLE_AVX_IMPLEMENTATION(_mm256_storeu_ps(u23,Vu23);) 41 | ENABLE_SCALAR_IMPLEMENTATION(U(2,2)=Su33.f;) ENABLE_SSE_IMPLEMENTATION(_mm_storeu_ps(u33,Vu33);) ENABLE_AVX_IMPLEMENTATION(_mm256_storeu_ps(u33,Vu33);) 42 | 43 | ENABLE_SCALAR_IMPLEMENTATION(V(0,0)=Sv11.f;) ENABLE_SSE_IMPLEMENTATION(_mm_storeu_ps(v11,Vv11);) ENABLE_AVX_IMPLEMENTATION(_mm256_storeu_ps(v11,Vv11);) 44 | ENABLE_SCALAR_IMPLEMENTATION(V(1,0)=Sv21.f;) ENABLE_SSE_IMPLEMENTATION(_mm_storeu_ps(v21,Vv21);) ENABLE_AVX_IMPLEMENTATION(_mm256_storeu_ps(v21,Vv21);) 45 | ENABLE_SCALAR_IMPLEMENTATION(V(2,0)=Sv31.f;) ENABLE_SSE_IMPLEMENTATION(_mm_storeu_ps(v31,Vv31);) ENABLE_AVX_IMPLEMENTATION(_mm256_storeu_ps(v31,Vv31);) 46 | ENABLE_SCALAR_IMPLEMENTATION(V(0,1)=Sv12.f;) ENABLE_SSE_IMPLEMENTATION(_mm_storeu_ps(v12,Vv12);) ENABLE_AVX_IMPLEMENTATION(_mm256_storeu_ps(v12,Vv12);) 47 | ENABLE_SCALAR_IMPLEMENTATION(V(1,1)=Sv22.f;) ENABLE_SSE_IMPLEMENTATION(_mm_storeu_ps(v22,Vv22);) ENABLE_AVX_IMPLEMENTATION(_mm256_storeu_ps(v22,Vv22);) 48 | ENABLE_SCALAR_IMPLEMENTATION(V(2,1)=Sv32.f;) ENABLE_SSE_IMPLEMENTATION(_mm_storeu_ps(v32,Vv32);) ENABLE_AVX_IMPLEMENTATION(_mm256_storeu_ps(v32,Vv32);) 49 | ENABLE_SCALAR_IMPLEMENTATION(V(0,2)=Sv13.f;) ENABLE_SSE_IMPLEMENTATION(_mm_storeu_ps(v13,Vv13);) ENABLE_AVX_IMPLEMENTATION(_mm256_storeu_ps(v13,Vv13);) 50 | ENABLE_SCALAR_IMPLEMENTATION(V(1,2)=Sv23.f;) ENABLE_SSE_IMPLEMENTATION(_mm_storeu_ps(v23,Vv23);) ENABLE_AVX_IMPLEMENTATION(_mm256_storeu_ps(v23,Vv23);) 51 | ENABLE_SCALAR_IMPLEMENTATION(V(2,2)=Sv33.f;) ENABLE_SSE_IMPLEMENTATION(_mm_storeu_ps(v33,Vv33);) ENABLE_AVX_IMPLEMENTATION(_mm256_storeu_ps(v33,Vv33);) 52 | 53 | ENABLE_SCALAR_IMPLEMENTATION(S(0,0)=Sa11.f;) ENABLE_SSE_IMPLEMENTATION(_mm_storeu_ps(sigma1,Va11);) 
ENABLE_AVX_IMPLEMENTATION(_mm256_storeu_ps(sigma1,Va11);)
54 | ENABLE_SCALAR_IMPLEMENTATION(S(1,0)=Sa22.f;) ENABLE_SSE_IMPLEMENTATION(_mm_storeu_ps(sigma2,Va22);) ENABLE_AVX_IMPLEMENTATION(_mm256_storeu_ps(sigma2,Va22);)
55 | ENABLE_SCALAR_IMPLEMENTATION(S(2,0)=Sa33.f;) ENABLE_SSE_IMPLEMENTATION(_mm_storeu_ps(sigma3,Va33);) ENABLE_AVX_IMPLEMENTATION(_mm256_storeu_ps(sigma3,Va33);)
56 | }
57 | #pragma runtime_checks( "u", restore )
58 |
59 | // forced instantiation for float
60 | template void wunderSVD3x3<float>(const Eigen::Matrix<float, 3, 3>& A, Eigen::Matrix<float, 3, 3> &U, Eigen::Matrix<float, 3, 1> &S, Eigen::Matrix<float, 3, 3> &V);
61 | // an instantiation for double wouldn't even make sense, because the wunder-SVD code is only single precision anyway...
--------------------------------------------------------------------------------
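A standalone sanity check for the decomposition and its sign convention; this test program is not part of the repository, assumes Eigen is on the include path, and must be linked against the compiled WunderSVD3x3.cpp above (the test matrix is arbitrary):

    #include <iostream>
    #include <Eigen/Dense>
    #include "WunderSVD3x3.h"

    int main()
    {
        Eigen::Matrix3f A;
        A << 2, 0, 1,
             0, 1, 0,
             1, 0, 2;

        Eigen::Matrix3f U, V;
        Eigen::Vector3f S;
        wunderSVD3x3(A, U, S, V);

        // U * diag(S) * V' should reproduce A up to single precision;
        // the kernels use an approximate rsqrt, so expect roughly 1e-3 rather than 1e-7.
        float err = (A - U * S.asDiagonal() * V.transpose()).norm();
        std::cout << "reconstruction error: " << err << "\n";
        std::cout << "det(U) = " << U.determinant()
                  << ", det(V) = " << V.determinant() << "\n"; // both should be +1
        return 0;
    }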