├── speedup.png ├── libflame_sources ├── FLA_HQRRP_UT_blk_var2.h ├── Makefile ├── simple_test.c └── FLA_HQRRP_UT_blk_var2.c ├── lapack_compatible_sources ├── NoFLA_HQRRP_WY_blk_var4.h ├── Makefile ├── simple_test.c └── NoFLA_HQRRP_WY_blk_var4.c ├── License.txt └── README.md /speedup.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/flame/hqrrp/HEAD/speedup.png -------------------------------------------------------------------------------- /libflame_sources/FLA_HQRRP_UT_blk_var2.h: -------------------------------------------------------------------------------- 1 | #include "FLAME.h" 2 | 3 | int FLA_HQRRP_UT_blk_var2( FLA_Obj A, FLA_Obj p, FLA_Obj s, 4 | int nb_alg, int pp, int panel_pivoting ); 5 | 6 | -------------------------------------------------------------------------------- /lapack_compatible_sources/NoFLA_HQRRP_WY_blk_var4.h: -------------------------------------------------------------------------------- 1 | 2 | void dgeqp4( int * m, int * n, double * A, int * lda, int * jpvt, double * tau, 3 | double * work, int * lwork, int * info ); 4 | 5 | int NoFLA_HQRRP_WY_blk_var4( int m_A, int n_A, double * buff_A, int ldim_A, 6 | int * buff_jpvt, double * buff_tau, 7 | int nb_alg, int pp, int panel_pivoting ); 8 | 9 | -------------------------------------------------------------------------------- /lapack_compatible_sources/Makefile: -------------------------------------------------------------------------------- 1 | 2 | # Defintions of variables. 3 | 4 | CC = gcc 5 | CCFLAGS = -O -fopenmp 6 | LD = gcc 7 | LDFLAGS = -O -fopenmp 8 | 9 | # Defintions of rules. 10 | 11 | simple_test.x : simple_test.o NoFLA_HQRRP_WY_blk_var4.o 12 | $(LD) $(LDFLAGS) \ 13 | -o simple_test.x \ 14 | simple_test.o \ 15 | NoFLA_HQRRP_WY_blk_var4.c -lm \ 16 | /usr/local/lapack/liblapack_340_p4b64_gf.a \ 17 | /usr/local/lapack/mt_openblas/lib/libopenblas_haswellp-r0.2.14.a \ 18 | -lgfortran 19 | 20 | simple_test.o : simple_test.c 21 | $(CC) $(CCFLAGS) -O -fopenmp -c simple_test.c 22 | 23 | %.o : %.c 24 | $(CC) $(CCFLAGS) -c $< -o $@ 25 | 26 | clean: 27 | rm -f a.out *.x *.o *~ core 28 | 29 | -------------------------------------------------------------------------------- /libflame_sources/Makefile: -------------------------------------------------------------------------------- 1 | 2 | # Defintions of variables. 3 | 4 | LIBFLAME_PATH = /usr/local/libflame-current-gc 5 | 6 | CC = gcc 7 | CCFLAGS = -O -fopenmp -I$(LIBFLAME_PATH)/include-x86_64-r -msse3 8 | LD = gcc 9 | LDFLAGS = -O -fopenmp -I$(LIBFLAME_PATH)/include-x86_64-r -msse3 10 | 11 | # Defintions of rules. 12 | 13 | simple_test.x : simple_test.o FLA_HQRRP_UT_blk_var2.o 14 | $(LD) $(LDFLAGS) \ 15 | -o simple_test.x \ 16 | simple_test.o \ 17 | FLA_HQRRP_UT_blk_var2.o \ 18 | -L$(LIBFLAME_PATH)/lib -lflame-x86_64-r \ 19 | /usr/local/lapack/mt_openblas/lib/libopenblas_haswellp-r0.2.14.a \ 20 | -lm 21 | 22 | simple_test.o : simple_test.c 23 | $(CC) $(CCFLAGS) -c simple_test.c 24 | 25 | %.o : %.c 26 | $(CC) $(CCFLAGS) -c $< -o $@ 27 | 28 | clean: 29 | rm -f a.out *.x *.o *~ core 30 | 31 | -------------------------------------------------------------------------------- /License.txt: -------------------------------------------------------------------------------- 1 | HQRRP software 2 | License 3 | --- 4 | 5 | The HQRRP software is licensed under the following license, typically 6 | known as the "new" or "modified" or "3-clause" BSD license. 7 | 8 | Copyright (C) 2016, 9 | Universitat Jaume I, 10 | University of Colorado at Boulder, 11 | The University of Texas at Austin. 12 | 13 | Redistribution and use in source and binary forms, with or without 14 | modification, are permitted provided that the following conditions are 15 | met: 16 | - Redistributions of source code must retain the above copyright 17 | notice, this list of conditions and the following disclaimer. 18 | - Redistributions in binary form must reproduce the above copyright 19 | notice, this list of conditions and the following disclaimer in the 20 | documentation and/or other materials provided with the distribution. 21 | - Neither the name of the universities of its contributors nor the names 22 | of its contributors may be used to endorse or promote products 23 | derived from this software without specific prior written permission. 24 | 25 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 26 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 27 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 28 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 29 | HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 30 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 31 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 32 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 33 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 34 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 35 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 36 | 37 | -------------------------------------------------------------------------------- /libflame_sources/simple_test.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include "FLAME.h" 5 | 6 | #define PRINT_DATA 7 | 8 | 9 | // ============================================================================ 10 | // Declaration of local prototypes. 11 | 12 | static void matrix_generate( FLA_Obj A ); 13 | 14 | static void init_pvt( FLA_Obj p ); 15 | 16 | 17 | // ============================================================================ 18 | int main( int argc, char *argv[] ) { 19 | int m_A, n_A; 20 | FLA_Obj A, p, tau; 21 | 22 | // Initialize FLAME. 23 | FLA_Init(); 24 | 25 | // Some initializations. 26 | m_A = 5; 27 | n_A = 5; 28 | 29 | // Create FLAME objects, and attach buffers. 30 | FLA_Obj_create( FLA_DOUBLE, m_A, n_A, 0, 0, & A ); 31 | FLA_Obj_create( FLA_INT, n_A, 1, 0, 0, & p ); 32 | FLA_Obj_create( FLA_DOUBLE, n_A, 1, 0, 0, & tau ); 33 | 34 | // Generate matrix. 35 | //// FLA_Random_matrix( A ); 36 | matrix_generate( A ); 37 | 38 | // Initialize vector with pivots. 39 | init_pvt( p ); 40 | 41 | // Print initial data. 42 | #ifdef PRINT_DATA 43 | FLA_Obj_show( " Ai = [ ", A, "%le", " ];" ); 44 | FLA_Obj_show( " pi = [ ", p, "%d", " ];" ); 45 | FLA_Obj_show( " taui = [ ", tau, "%le", " ];" ); 46 | #endif 47 | 48 | // Factorize matrix. 49 | printf( "%% Just before computing factorization.\n" ); 50 | // New factorization. 51 | FLA_HQRRP_UT_blk_var2( A, p, tau, 64, 10, 1 ); 52 | printf( "%% Just after computing factorization.\n" ); 53 | 54 | // Print results. 55 | #ifdef PRINT_DATA 56 | FLA_Obj_show( " Af = [ ", A, "%le", " ];" ); 57 | FLA_Obj_show( " pf = [ ", p, "%d", " ];" ); 58 | FLA_Obj_show( " tauf = [ ", tau, "%le", " ];" ); 59 | #endif 60 | 61 | // Free objects. 62 | FLA_Obj_free( & A ); 63 | FLA_Obj_free( & p ); 64 | FLA_Obj_free( & tau ); 65 | 66 | // Finalize FLAME. 67 | printf( "%% End of Program\n" ); 68 | FLA_Finalize(); 69 | 70 | 71 | return 0; 72 | } 73 | 74 | // ============================================================================ 75 | static void matrix_generate( FLA_Obj A ) { 76 | double * buff_A; 77 | int m_A, n_A, ldim_A; 78 | int i, j, num; 79 | 80 | buff_A = ( double * ) FLA_Obj_buffer_at_view( A ); 81 | m_A = FLA_Obj_length( A ); 82 | n_A = FLA_Obj_width ( A ); 83 | ldim_A = FLA_Obj_col_stride( A ); 84 | 85 | // 86 | // Matrix with integer values. 87 | // --------------------------- 88 | // 89 | if( ( m_A > 0 )&&( n_A > 0 ) ) { 90 | num = 1; 91 | for ( j = 0; j < n_A; j++ ) { 92 | for ( i = ( j % m_A ); i < m_A; i++ ) { 93 | buff_A[ i + j * ldim_A ] = ( double ) num; 94 | num++; 95 | } 96 | for ( i = 0; i < ( j % m_A ); i++ ) { 97 | buff_A[ i + j * ldim_A ] = ( double ) num; 98 | num++; 99 | } 100 | } 101 | if( ( m_A > 0 )&&( n_A > 0 ) ) { 102 | buff_A[ 0 + 0 * ldim_A ] = 1.2; 103 | } 104 | #if 0 105 | // Scale down matrix. 106 | if( num == 0.0 ) { 107 | rnum = 1.0; 108 | } else { 109 | rnum = 1.0 / num; 110 | } 111 | for ( j = 0; j < n_A; j++ ) { 112 | for ( i = 0; i < m_A; i++ ) { 113 | buff_A[ i + j * ldim_A ] *= rnum; 114 | } 115 | } 116 | #endif 117 | } 118 | } 119 | 120 | // ============================================================================ 121 | static void init_pvt( FLA_Obj p ) { 122 | int * buff_p, n_p; 123 | int i; 124 | 125 | buff_p = ( int * ) FLA_Obj_buffer_at_view( p ); 126 | n_p = FLA_Obj_length( p ); 127 | for( i = 0; i < n_p; i++ ) { 128 | buff_p[ i ] = ( i + 1 ); 129 | } 130 | } 131 | 132 | 133 | -------------------------------------------------------------------------------- /lapack_compatible_sources/simple_test.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #define max( a, b ) ( (a) > (b) ? (a) : (b) ) 6 | #define min( a, b ) ( (a) < (b) ? (a) : (b) ) 7 | 8 | #define PRINT_DATA 9 | 10 | 11 | // ============================================================================ 12 | // Declaration of local prototypes. 13 | 14 | static void matrix_generate( int m_A, int n_A, double * buff_A, int ldim_A ); 15 | 16 | static void print_double_matrix( char * name, int m_A, int n_A, 17 | double * buff_A, int ldim_A ); 18 | 19 | static void print_double_vector( char * name, int n, double * vector ); 20 | 21 | static void print_int_vector( char * name, int n, int * vector ); 22 | 23 | static void init_pvt( int n, int * vector ); 24 | 25 | static void set_pvt_to_zero( int n_p, int * buff_p ); 26 | 27 | 28 | 29 | // ============================================================================ 30 | int main( int argc, char *argv[] ) { 31 | int nb_alg, pp, m_A, n_A, mn_A, ldim_A, ldim_Q, info, lwork; 32 | double * buff_A, * buff_tau, * buff_Q, * buff_wk_qp4, * buff_wk_orgqr; 33 | int * buff_p; 34 | 35 | // Create matrix A, vector p, vector s, and matrix Q. 36 | m_A = 7; 37 | n_A = 5; 38 | mn_A = min( m_A, n_A ); 39 | buff_A = ( double * ) malloc( m_A * n_A * sizeof( double ) ); 40 | ldim_A = max( 1, m_A ); 41 | 42 | buff_p = ( int * ) malloc( n_A * sizeof( int ) ); 43 | 44 | buff_tau = ( double * ) malloc( n_A * sizeof( double ) ); 45 | 46 | buff_Q = ( double * ) malloc( m_A * mn_A * sizeof( double ) ); 47 | ldim_Q = max( 1, m_A ); 48 | 49 | // Generate matrix. 50 | matrix_generate( m_A, n_A, buff_A, ldim_A ); 51 | 52 | #ifdef PRINT_DATA 53 | print_double_matrix( "ai", m_A, n_A, buff_A, ldim_A ); 54 | print_double_vector( "taui", n_A, buff_tau ); 55 | #endif 56 | 57 | // Initialize vector with pivots. 58 | set_pvt_to_zero( n_A, buff_p ); 59 | buff_p[ 0 ] = 0; 60 | buff_p[ 1 ] = 1; 61 | buff_p[ 2 ] = 1; 62 | buff_p[ 3 ] = 0; 63 | buff_p[ 4 ] = 1; 64 | #ifdef PRINT_DATA 65 | print_int_vector( "pi", n_A, buff_p ); 66 | #endif 67 | 68 | // Create workspace. 69 | lwork = max( 1, 128 * n_A ); 70 | buff_wk_qp4 = ( double * ) malloc( lwork * sizeof( double ) ); 71 | 72 | // Factorize matrix. 73 | printf( "%% Just before computing factorization.\n" ); 74 | // New factorization. 75 | dgeqp4( & m_A, & n_A, buff_A, & ldim_A, buff_p, buff_tau, 76 | buff_wk_qp4, & lwork, & info ); 77 | // Current factorization. 78 | // dgeqp3_( & m_A, & n_A, buff_A, & ldim_A, buff_p, buff_tau, 79 | // buff_wk_qp4, & lwork, & info ); 80 | printf( "%% Just after computing factorization.\n" ); 81 | 82 | printf( "%% Info after factorization: %d \n", info ); 83 | printf( "%% Work[ 0 ] after factorization: %d \n", ( int ) buff_wk_qp4[ 0 ] ); 84 | 85 | // Remove workspace. 86 | free( buff_wk_qp4 ); 87 | 88 | // Build matrix Q. 89 | lwork = max( 1, 128 * n_A ); 90 | buff_wk_orgqr = ( double * ) malloc( lwork * sizeof( double ) ); 91 | dlacpy_( "All", & m_A, & mn_A, buff_A, & ldim_A, buff_Q, & ldim_Q ); 92 | dorgqr_( & m_A, & mn_A, & mn_A, buff_Q, & ldim_Q, buff_tau, 93 | buff_wk_orgqr, & lwork, & info ); 94 | if( info != 0 ) { 95 | fprintf( stderr, "Error in dorgqr: Info: %d\n", info ); 96 | } 97 | free( buff_wk_orgqr ); 98 | 99 | // Print results. 100 | #ifdef PRINT_DATA 101 | print_double_matrix( "af", m_A, n_A, buff_A, ldim_A ); 102 | print_int_vector( "pf", n_A, buff_p ); 103 | print_double_vector( "tauf", n_A, buff_tau ); 104 | print_double_matrix( "qf", m_A, mn_A, buff_Q, ldim_Q ); 105 | #endif 106 | 107 | // Free matrices and vectors. 108 | free( buff_A ); 109 | free( buff_p ); 110 | free( buff_tau ); 111 | free( buff_Q ); 112 | 113 | printf( "%% End of Program\n" ); 114 | 115 | return 0; 116 | } 117 | 118 | // ============================================================================ 119 | static void matrix_generate( int m_A, int n_A, double * buff_A, int ldim_A ) { 120 | int i, j, num; 121 | 122 | // 123 | // Matrix with integer values. 124 | // --------------------------- 125 | // 126 | if( ( m_A > 0 )&&( n_A > 0 ) ) { 127 | num = 1; 128 | for ( j = 0; j < n_A; j++ ) { 129 | for ( i = ( j % m_A ); i < m_A; i++ ) { 130 | buff_A[ i + j * ldim_A ] = ( double ) num; 131 | num++; 132 | } 133 | for ( i = 0; i < ( j % m_A ); i++ ) { 134 | buff_A[ i + j * ldim_A ] = ( double ) num; 135 | num++; 136 | } 137 | } 138 | if( ( m_A > 0 )&&( n_A > 0 ) ) { 139 | buff_A[ 0 + 0 * ldim_A ] = 1.2; 140 | } 141 | #if 0 142 | // Scale down matrix. 143 | if( num == 0.0 ) { 144 | rnum = 1.0; 145 | } else { 146 | rnum = 1.0 / num; 147 | } 148 | for ( j = 0; j < n_A; j++ ) { 149 | for ( i = 0; i < m_A; i++ ) { 150 | buff_A[ i + j * ldim_A ] *= rnum; 151 | } 152 | } 153 | #endif 154 | } 155 | } 156 | 157 | // ============================================================================ 158 | static void print_double_matrix( char * name, int m_A, int n_A, 159 | double * buff_A, int ldim_A ) { 160 | int i, j; 161 | 162 | printf( "%s = [\n", name ); 163 | for( i = 0; i < m_A; i++ ) { 164 | for( j = 0; j < n_A; j++ ) { 165 | printf( "%le ", buff_A[ i + j * ldim_A ] ); 166 | } 167 | printf( "\n" ); 168 | } 169 | printf( "];\n" ); 170 | } 171 | 172 | // ============================================================================ 173 | static void print_double_vector( char * name, int n_v, double * buff_v ) { 174 | int i, j; 175 | 176 | printf( "%s = [\n", name ); 177 | for( i = 0; i < n_v; i++ ) { 178 | printf( "%le\n", buff_v[ i ] ); 179 | } 180 | printf( "\n" ); 181 | printf( "];\n" ); 182 | } 183 | 184 | // ============================================================================ 185 | static void print_int_vector( char * name, int n_v, int * buff_v ) { 186 | int i, j; 187 | 188 | printf( "%s = [\n", name ); 189 | for( i = 0; i < n_v; i++ ) { 190 | printf( "%d\n", buff_v[ i ] ); 191 | } 192 | printf( "];\n" ); 193 | } 194 | 195 | // ============================================================================ 196 | static void init_pvt( int n_p, int * buff_p ) { 197 | int i; 198 | 199 | for( i = 0; i < n_p; i++ ) { 200 | buff_p[ i ] = ( i + 1 ); 201 | } 202 | } 203 | 204 | // ============================================================================ 205 | static void set_pvt_to_zero( int n_p, int * buff_p ) { 206 | int i; 207 | 208 | for( i = 0; i < n_p; i++ ) { 209 | buff_p[ i ] = 0; 210 | } 211 | } 212 | 213 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # HQRRP 2 | 3 | ## Authors 4 | 5 | * Per-Gunnar Martinsson, 6 | Dept. of Applied Mathematics, 7 | University of Colorado at Boulder, 8 | 526 UCB, Boulder, CO 80309-0526, USA. 9 | 10 | * Gregorio Quintana-Orti, 11 | Depto. de Ingenieria y Ciencia de Computadores, 12 | Universitat Jaume I, 13 | 12.071 Castellon, Spain. 14 | 15 | * Nathan Heavner, 16 | Dept. of Applied Mathematics, 17 | University of Colorado at Boulder, 18 | 526 UCB, Boulder, CO 80309-0526, USA. 19 | 20 | * Robert van de Geijn, 21 | Dept. of Computer Science and Institute for Computational Engineering and 22 | Sciences, 23 | The University of Texas at Austin, 24 | Austin, TX, USA. 25 | 26 | ## Correspondence 27 | 28 | Please send correspondence about the code to 29 | Gregorio Quintana-Ortí: 30 | 31 | Correspondence about the paper should be sent to 32 | Per-Gunnar J. Martinsson: 33 | 34 | ## License 35 | 36 | New 3-clause BSD. 37 | See file License.txt for more details. 38 | 39 | ## Disclaimer 40 | 41 | This code is distributed in the hope that it will be useful, but 42 | WITHOUT ANY WARRANTY EXPRESSED OR IMPLIED. 43 | 44 | ## Description 45 | 46 | Householder transformation based QR factorization with column pivoting is an 47 | important algorithm for, for example, determining an approximate basis for 48 | the column space of a matrix. It is, unfortunately, notoriously difficult to 49 | implement for high performance. 50 | 51 | Recently, techniques that use randomized sampling have been developed that 52 | do achieve high performance by casting most computation in terms of 53 | matrix-matrix multiplication. 54 | 55 | For example, we describe such an algorithm in our recent paper: 56 | 57 | * P.-G. Martinsson, G. Quintana-Orti, N. Heavner, R. van de Geijn. 58 | "Householder QR Factorization: Adding Randomization for Column Pivoting. 59 | FLAME Working Note #78" 60 | http://arxiv.org/abs/1512.02671 61 | 62 | This directory contains an implementation that we call Householder QR 63 | factorization with Randomization for Pivoting (HQRRP), based on the insights 64 | in that paper. 65 | 66 | The new code outperforms LAPACK's core routine DGEQP3 both in unicore and 67 | multicore architectures for medium and large matrix sizes, often by large 68 | factor. The new implementation comes with an interface that is plug 69 | compatible with DGEQP3. 70 | 71 | The new code can be downloaded from https://github.com/flame/hqrrp/. 72 | 73 | The algorithm was originally implemented using the FLAME/C API with 74 | a variation of the compact WY transform we call the UT transform. 75 | In addition, 76 | we also provide an implementation that instead uses the original compact 77 | WY transform so that many routines from LAPACK can be employed in a 78 | seamless fashion. 79 | 80 | This implementation as well as the original implementation based on the UT 81 | transform will eventually be included in the libflame library: 82 | https://github.com/flame/libflame/ 83 | 84 | We will appreciate feedback from the community on the use of this code. 85 | 86 | ## Performance benefit 87 | 88 | ![alt tag](./speedup.png) 89 | 90 | ## Citing this work 91 | 92 | We ask those who benefit from this work 93 | to cite the following articles: 94 | 95 | ``` 96 | @article{doi:10.1137/16M1081270, 97 | author = {Martinsson, P. and Quintana Ort\'{\i}, G. 98 | and Heavner, N. and van de Geijn, R.}, 99 | title = {Householder QR Factorization 100 | With Randomization for Column Pivoting (HQRRP)}, 101 | journal = {SIAM Journal on Scientific Computing}, 102 | volume = {39}, 103 | number = {2}, 104 | pages = {C96-C115}, 105 | year = {2017}, 106 | doi = {10.1137/16M1081270}, 107 | URL = {https://doi.org/10.1137/16M1081270}, 108 | eprint = {https://doi.org/10.1137/16M1081270} 109 | } 110 | 111 | @ARTICLE{martinsson2015blocked, 112 | title = {Blocked rank-revealing QR factorizations: How randomized 113 | sampling can be used to avoid single-vector pivoting}, 114 | author = {Martinsson, Per-Gunnar}, 115 | journal = {arXiv preprint arXiv:1505.08115}, 116 | year = {2015}, 117 | month = {may} 118 | } 119 | 120 | @ARTICLE{2015arXiv151202671M, 121 | author = {{Martinsson}, P.-G. and {Quintana-Ort\’{\i}}, G. 122 | and {Heavner}, N. and {van de Geijn}, R.}, 123 | title = "{Householder {QR} Factorization: Adding Randomization for 124 | Column Pivoting. {FLAME} {W}orking {N}ote \#78}", 125 | journal = {ArXiv e-prints}, 126 | archivePrefix = "arXiv", 127 | eprint = {1512.02671}, 128 | primaryClass = "math.NA", 129 | keywords = {Mathematics - Numerical Analysis, Computer Science - 130 | Numerical Analysis}, 131 | year = 2015, 132 | month = dec, 133 | adsurl = {http://adsabs.harvard.edu/abs/2015arXiv151202671M}, 134 | adsnote = {Provided by the SAO/NASA Astrophysics Data System} 135 | } 136 | ``` 137 | 138 | ## Details 139 | 140 | We offer two variants of the code: 141 | 142 | * LAPACK-compatible pure C code: 143 | It uses compact WY transformations. 144 | The sources are stored in the `lapack_compatible_sources` directory. 145 | 146 | * LAPACK-like libflame code: 147 | It uses compact UT transformations. 148 | This code resembles the algorithm in the paper. 149 | The sources are stored in the `libflame_sources` directory. 150 | 151 | ### Details of LAPACK-compatible pure C code: 152 | 153 | The new code contains the following two main routines: 154 | 155 | ``` 156 | void dgeqp4( int * m, int * n, double * A, int * lda, int * jpvt, double * tau, 157 | double * work, int * lwork, int * info ); 158 | // 159 | // This routine is plug compatible with LAPACK's routine dgeqp3. 160 | // It computes the new HQRRP while keeping the same header as LAPACK's dgeqp3. 161 | // It uses dgeqpf or dgeqp3 for small matrices. The thresholds are defined in 162 | // constants THRESHOLD_FOR_DGEQPF and THRESHOLD_FOR_DGEQP3. 163 | // This routine calls the next one with block size 64 and oversampling 10. 164 | // 165 | 166 | int NoFLA_HQRRP_WY_blk_var4( int m_A, int n_A, double * buff_A, int ldim_A, 167 | int * buff_jpvt, double * buff_tau, 168 | int nb_alg, int pp, int panel_pivoting ); 169 | // 170 | // This routine is not plug compatible with LAPACK's routine dgeqp3. 171 | // It computes the new HQRRP and allows the user to fine tune more arguments, 172 | // such as the block size, oversampling, etc. 173 | // 174 | ``` 175 | 176 | These two routines are stored in the `NoFLA_HQRRP_WY_blk_var4.c` file. 177 | The `simple_test.c` file in this directory 178 | contains a main program to test routine `dgeqp4`. 179 | 180 | ### Details of LAPACK-like libflame code: 181 | 182 | The new code contains the following main routine: 183 | 184 | ``` 185 | int FLA_HQRRP_UT_blk_var2( FLA_Obj A, FLA_Obj p, FLA_Obj s, 186 | int nb_alg, int pp, int panel_pivoting ); 187 | // 188 | // This routine is not plug compatible with LAPACK's routine dgeqp3. 189 | // It computes the new HQRRP and allows the user to fine tune more arguments, 190 | // such as the block size, oversampling, etc. 191 | // 192 | ``` 193 | 194 | This routine is stored in the `FLA_HQRRP_UT_blk_var2.c` file. 195 | The `simple_test.c` file in this directory contains a main program to test it. 196 | 197 | -------------------------------------------------------------------------------- /lapack_compatible_sources/NoFLA_HQRRP_WY_blk_var4.c: -------------------------------------------------------------------------------- 1 | /* 2 | =============================================================================== 3 | Authors 4 | =============================================================================== 5 | 6 | Per-Gunnar Martinsson 7 | Dept. of Applied Mathematics, 8 | University of Colorado at Boulder, 9 | 526 UCB, Boulder, CO 80309-0526, USA 10 | 11 | Gregorio Quintana-Orti 12 | Depto. de Ingenieria y Ciencia de Computadores, 13 | Universitat Jaume I, 14 | 12.071 Castellon, Spain 15 | 16 | Nathan Heavner 17 | Dept. of Applied Mathematics, 18 | University of Colorado at Boulder, 19 | 526 UCB, Boulder, CO 80309-0526, USA 20 | 21 | Robert van de Geijn 22 | Dept. of Computer Science and Institute for Computational Engineering and 23 | Sciences, 24 | The University of Texas at Austin 25 | Austin, TX. 26 | 27 | =============================================================================== 28 | Copyright 29 | =============================================================================== 30 | 31 | Copyright (C) 2016, 32 | Universitat Jaume I, 33 | University of Colorado at Boulder, 34 | The University of Texas at Austin. 35 | 36 | =============================================================================== 37 | Disclaimer 38 | =============================================================================== 39 | 40 | This code is distributed in the hope that it will be useful, but 41 | WITHOUT ANY WARRANTY EXPRESSED OR IMPLIED. 42 | 43 | */ 44 | 45 | #include 46 | #include 47 | #include 48 | #include "NoFLA_HQRRP_WY_blk_var4.h" 49 | 50 | 51 | // Matrices with dimensions smaller than THRESHOLD_FOR_DGEQPF are processed 52 | // with LAPACK's routine dgeqpf. 53 | // Matrices with dimensions between THRESHOLD_FOR_DGEQPF and 54 | // THRESHOLD_FOR_DGEQP3 are processed with LAPACK's routine dgeqp3. 55 | // Matrices with dimensions larger than THRESHOLD_FOR_DGEQP3 are processed 56 | // with the new HQRRP code. 57 | #define THRESHOLD_FOR_DGEQPF 250 58 | #define THRESHOLD_FOR_DGEQP3 1000 59 | 60 | 61 | // ============================================================================ 62 | // Definition of macros. 63 | 64 | #define max( a, b ) ( (a) > (b) ? (a) : (b) ) 65 | #define min( a, b ) ( (a) > (b) ? (b) : (a) ) 66 | #define dabs( a ) ( (a) >= 0.0 ? (a) : -(a) ) 67 | 68 | // ============================================================================ 69 | // Compilation declarations. 70 | 71 | #undef CHECK_DOWNDATING_OF_Y 72 | 73 | 74 | // ============================================================================ 75 | // Declaration of local prototypes. 76 | 77 | static int NoFLA_Normal_random_matrix( int m_A, int n_A, 78 | double * buff_A, int ldim_A ); 79 | 80 | static double NoFLA_Normal_random_number( double mu, double sigma ); 81 | 82 | static int NoFLA_Downdate_Y( 83 | int m_U11, int n_U11, double * buff_U11, int ldim_U11, 84 | int m_U21, int n_U21, double * buff_U21, int ldim_U21, 85 | int m_A12, int n_A12, double * buff_A12, int ldim_A12, 86 | int m_T, int n_T, double * buff_T, int ldim_T, 87 | int m_Y2, int n_Y2, double * buff_Y2, int ldim_Y2, 88 | int m_G1, int n_G1, double * buff_G1, int ldim_G1, 89 | int m_G2, int n_G2, double * buff_G2, int ldim_G2 ); 90 | 91 | static int NoFLA_Apply_Q_WY_lhfc_blk_var4( 92 | int m_U, int n_U, double * buff_U, int ldim_U, 93 | int m_T, int n_T, double * buff_T, int ldim_T, 94 | int m_B, int n_B, double * buff_B, int ldim_B ); 95 | 96 | static int NoFLA_Apply_Q_WY_rnfc_blk_var4( 97 | int m_U, int n_U, double * buff_U, int ldim_U, 98 | int m_T, int n_T, double * buff_T, int ldim_T, 99 | int m_B, int n_B, double * buff_B, int ldim_B ); 100 | 101 | static int NoFLA_QRPmod_WY_unb_var4( int pivoting, int num_stages, 102 | int m_A, int n_A, double * buff_A, int ldim_A, 103 | int * buff_p, double * buff_t, 104 | int pivot_B, int m_B, double * buff_B, int ldim_B, 105 | int pivot_C, int m_C, double * buff_C, int ldim_C, 106 | int build_T, double * buff_T, int ldim_T ); 107 | 108 | static int NoFLA_QRP_compute_norms( 109 | int m_A, int n_A, double * buff_A, int ldim_A, 110 | double * buff_d, double * buff_e ); 111 | 112 | static int NoFLA_QRP_downdate_partial_norms( int m_A, int n_A, 113 | double * buff_d, int st_d, 114 | double * buff_e, int st_e, 115 | double * buff_wt, int st_wt, 116 | double * buff_A, int ldim_A ); 117 | 118 | static int NoFLA_QRP_pivot_G_B_C( int j_max_col, 119 | int m_G, double * buff_G, int ldim_G, 120 | int pivot_B, int m_B, double * buff_B, int ldim_B, 121 | int pivot_C, int m_C, double * buff_C, int ldim_C, 122 | int * buff_p, 123 | double * buff_d, double * buff_e ); 124 | 125 | 126 | // ============================================================================ 127 | void dgeqp4( int * m, int * n, double * A, int * lda, int * jpvt, double * tau, 128 | double * work, int * lwork, int * info ) { 129 | // 130 | // This routine is plug compatible with LAPACK's routine dgeqp3. 131 | // It computes the new HQRRP while keeping the same header as LAPACK's dgeqp3. 132 | // It uses dgeqpf or dgeqp3 for small matrices. The thresholds are defined in 133 | // constants THRESHOLD_FOR_DGEQPF and THRESHOLD_FOR_DGEQP3. 134 | // 135 | int INB = 1; 136 | int i_one = 1, i_minus_one = -1, 137 | m_A, n_A, mn_A, ldim_A, lquery, nb, num_factorized_fixed_cols, 138 | minus_info, iws, lwkopt, j, k, num_fixed_cols, n_rest, itmp; 139 | int * previous_jpvt; 140 | int ilaenv_(); 141 | 142 | // Some initializations. 143 | m_A = * m; 144 | n_A = * n; 145 | mn_A = min( m_A, n_A ); 146 | ldim_A = * lda; 147 | 148 | // Check input arguments. 149 | * info = 0; 150 | lquery = ( * lwork == -1 ); 151 | if( m_A < 0 ) { 152 | * info = -1; 153 | } else if ( n_A < 0 ) { 154 | * info = -2; 155 | } else if ( ldim_A < max( 1, m_A ) ) { 156 | * info = -4; 157 | } 158 | 159 | if( *info == 0 ) { 160 | if( mn_A == 0 ) { 161 | iws = 1; 162 | lwkopt = 1; 163 | } else { 164 | iws = 3 * n_A + 1; 165 | nb = ilaenv_( & INB, "DGEQRF", ' ', & m_A, & n_A, & i_minus_one, 166 | & i_minus_one ); 167 | lwkopt = 2 * n_A + ( n_A + 1 ) * nb; 168 | } 169 | work[ 0 ] = ( double ) lwkopt; 170 | 171 | if ( ( * lwork < iws )&&( ! lquery ) ) { 172 | * info = -8; 173 | } 174 | } 175 | 176 | if( * info != 0 ) { 177 | minus_info = - * info; 178 | xerbla_( "DGEQP3", & minus_info ); 179 | return; 180 | } else if( lquery ) { 181 | return; 182 | } 183 | 184 | // Quick return if possible. 185 | if( mn_A == 0 ) { 186 | return; 187 | } 188 | 189 | // Use LAPACK's DGEQPF or DGEQP3 for small matrices. 190 | if( mn_A < THRESHOLD_FOR_DGEQPF ) { 191 | // Call to LAPACK routine. 192 | //// printf( "Calling dgeqpf\n" ); 193 | dgeqpf_( m, n, A, lda, jpvt, tau, work, info ); 194 | return; 195 | } else if( mn_A < THRESHOLD_FOR_DGEQP3 ) { 196 | //// printf( "Calling dgeqp3\n" ); 197 | dgeqp3_( m, n, A, lda, jpvt, tau, work, lwork, info ); 198 | return; 199 | } 200 | 201 | // Move initial columns up front. 202 | num_fixed_cols = 0; 203 | for( j = 0; j < n_A; j++ ) { 204 | if( jpvt[ j ] != 0 ) { 205 | if( j != num_fixed_cols ) { 206 | //// printf( "Swapping columns: %d %d \n", j, num_fixed_cols ); 207 | dswap_( & m_A, & A[ 0 + j * ldim_A ], & i_one, 208 | & A[ 0 + num_fixed_cols * ldim_A ], & i_one ); 209 | jpvt[ j ] = jpvt[ num_fixed_cols ]; 210 | jpvt[ num_fixed_cols ] = j + 1; 211 | } else { 212 | jpvt[ j ] = j + 1 ; 213 | } 214 | num_fixed_cols++; 215 | } else { 216 | jpvt[ j ] = j + 1 ; 217 | } 218 | } 219 | 220 | // Factorize fixed columns at the front. 221 | num_factorized_fixed_cols = min( m_A, num_fixed_cols ); 222 | if( num_factorized_fixed_cols > 0 ) { 223 | dgeqrf_( & m_A, & num_factorized_fixed_cols, A, & ldim_A, tau, work, lwork, 224 | info ); 225 | if( * info != 0 ) { 226 | fprintf( stderr, "ERROR in dgeqrf: Info: %d \n", * info ); 227 | } 228 | iws = max( iws, ( int ) work[ 0 ] ); 229 | if( num_factorized_fixed_cols < n_A ) { 230 | n_rest = n_A - num_factorized_fixed_cols; 231 | dormqr_( "Left", "Transpose", 232 | & m_A, & n_rest, & num_factorized_fixed_cols, 233 | A, & ldim_A, tau, 234 | & A[ 0 + num_factorized_fixed_cols * ldim_A ], & ldim_A, 235 | work, lwork, info ); 236 | if( * info != 0 ) { 237 | fprintf( stderr, "ERROR in dormqr: Info: %d \n", * info ); 238 | } 239 | 240 | iws = max( iws, ( int ) work[ 0 ] ); 241 | } 242 | } 243 | 244 | // Create intermediate jpvt vector. 245 | previous_jpvt = ( int * ) malloc( n_A * sizeof( int ) ); 246 | 247 | // Save a copy of jpvt vector. 248 | if( num_factorized_fixed_cols > 0 ) { 249 | // Copy vector. 250 | for( j = 0; j < n_A; j++ ) { 251 | previous_jpvt[ j ] = jpvt[ j ]; 252 | } 253 | } 254 | 255 | // Factorize free columns at the bottom with default values: 256 | // nb_alg = 64, pp = 10, panel_pivoting = 1. 257 | if( num_factorized_fixed_cols < mn_A ) { 258 | * info = NoFLA_HQRRP_WY_blk_var4( 259 | m_A - num_factorized_fixed_cols, n_A - num_factorized_fixed_cols, 260 | & A[ num_factorized_fixed_cols + num_factorized_fixed_cols * ldim_A ], 261 | ldim_A, 262 | & jpvt[ num_factorized_fixed_cols ], 263 | & tau[ num_factorized_fixed_cols ], 264 | 64, 10, 1 ); 265 | } 266 | 267 | // Pivot block above factorized block by NoFLA_HQRRP. 268 | if( num_factorized_fixed_cols > 0 ) { 269 | // Pivot block above factorized block. 270 | for( j = num_factorized_fixed_cols; j < n_A; j++ ) { 271 | //// printf( "%% Processing j: %d \n", j ); 272 | for( k = j; k < n_A; k++ ) { 273 | if( jpvt[ j ] == previous_jpvt[ k ] ) { 274 | //// printf( "%% Found j: %d k: %d \n", j, k ); 275 | break; 276 | } 277 | } 278 | // Swap vector previous_jpvt and block above factorized block. 279 | if( k != j ) { 280 | // Swap elements in previous_jpvt. 281 | //// printf( "%% Swapping j: %d k: %d \n", j, k ); 282 | itmp = previous_jpvt[ j ]; 283 | previous_jpvt[ j ] = previous_jpvt[ k ]; 284 | previous_jpvt[ k ] = itmp; 285 | 286 | // Swap columns in block above factorized block. 287 | dswap_( & num_factorized_fixed_cols, 288 | & A[ 0 + j * ldim_A ], & i_one, 289 | & A[ 0 + k * ldim_A ], & i_one ); 290 | } 291 | } 292 | } 293 | 294 | // Remove intermediate jpvt vector. 295 | free( previous_jpvt ); 296 | 297 | // Return workspace length required. 298 | work[ 0 ] = iws; 299 | return; 300 | } 301 | 302 | // ============================================================================ 303 | int NoFLA_HQRRP_WY_blk_var4( int m_A, int n_A, double * buff_A, int ldim_A, 304 | int * buff_jpvt, double * buff_tau, 305 | int nb_alg, int pp, int panel_pivoting ) { 306 | // 307 | // HQRRP: It computes the Householder QR with Randomized Pivoting of matrix A. 308 | // This routine is almost compatible with LAPACK's dgeqp3. 309 | // The main difference is that this routine does not manage fixed columns. 310 | // 311 | // Main features: 312 | // * BLAS-3 based. 313 | // * Norm downdating method by Drmac. 314 | // * Downdating for computing Y. 315 | // * No use of libflame. 316 | // * Compact WY transformations are used instead of UT transformations. 317 | // * LAPACK's routine dlarfb is used to apply block transformations. 318 | // 319 | // Arguments: 320 | // ---------- 321 | // m_A: Number of rows of matrix A. 322 | // n_A: Number of columns of matrix A. 323 | // buff_A: Address/pointer of/to data in matrix A. Matrix A must be 324 | // stored in column-order. 325 | // ldim_A: Leading dimension of matrix A. 326 | // buff_jpvt: Input/output vector with the pivots. 327 | // buff_tau: Output vector with the tau values of the Householder factors. 328 | // nb_alg: Block size. 329 | // Usual values for nb_alg are 32, 64, etc. 330 | // pp: Oversampling size. 331 | // Usual values for pp are 5, 10, etc. 332 | // panel_pivoting: If panel_pivoting==1, QR with pivoting is applied to 333 | // factorize the panels of matrix A. Otherwise, QR without 334 | // pivoting is used. Usual value for panel_pivoting is 1. 335 | // Final comments: 336 | // --------------- 337 | // This code has been created from a libflame code. Hence, you can find some 338 | // commented calls to libflame routines. We have left them to make it easier 339 | // to interpret the meaning of the C code. 340 | // 341 | int b, j, last_iter, mn_A, m_Y, n_Y, ldim_Y, m_V, n_V, ldim_V, 342 | m_W, n_W, ldim_W, n_VR, m_AB1, n_AB1, ldim_T1_T, 343 | m_A11, n_A11, m_A12, n_A12, m_A21, n_A21, m_A22, 344 | m_G, n_G, ldim_G; 345 | double * buff_Y, * buff_V, * buff_W, * buff_VR, * buff_YR, 346 | * buff_s, * buff_sB, * buff_s1, 347 | * buff_AR, * buff_AB1, * buff_A01, * buff_Y1, * buff_T1_T, 348 | * buff_A11, * buff_A21, * buff_A12, 349 | * buff_Y2, * buff_G, * buff_G1, * buff_G2; 350 | int * buff_p, * buff_pB, * buff_p1; 351 | double d_zero = 0.0; 352 | double d_one = 1.0; 353 | 354 | // Executable Statements. 355 | //// printf( "%% NoFLA_HQRRP_WY_blk_var4.\n" ); 356 | 357 | // Check arguments. 358 | if( m_A < 0 ) { 359 | fprintf( stderr, 360 | "ERROR in NoFLA_HQRRP_WY_blk_var4: m_A is < 0.\n" ); 361 | } if( n_A < 0 ) { 362 | fprintf( stderr, 363 | "ERROR in NoFLA_HQRRP_WY_blk_var4: n_A is < 0.\n" ); 364 | } if( ldim_A < max( 1, m_A ) ) { 365 | fprintf( stderr, 366 | "ERROR in NoFLA_HQRRP_WY_blk_var4: ldim_A is < max( 1, m_A ).\n" ); 367 | } 368 | 369 | // Some initializations. 370 | mn_A = min( m_A, n_A ); 371 | buff_p = buff_jpvt; 372 | buff_s = buff_tau; 373 | 374 | // Quick return. 375 | if( mn_A == 0 ) { 376 | return 0; 377 | } 378 | 379 | // Initialize the seed for the generator of random numbers. 380 | srand( 12 ); 381 | 382 | // Create auxiliary objects. 383 | m_Y = nb_alg + pp; 384 | n_Y = n_A; 385 | buff_Y = ( double * ) malloc( m_Y * n_Y * sizeof( double ) ); 386 | ldim_Y = m_Y; 387 | 388 | m_V = nb_alg + pp; 389 | n_V = n_A; 390 | buff_V = ( double * ) malloc( m_V * n_V * sizeof( double ) ); 391 | ldim_V = m_V; 392 | 393 | m_W = nb_alg; 394 | n_W = n_A; 395 | buff_W = ( double * ) malloc( m_W * n_W * sizeof( double ) ); 396 | ldim_W = m_W; 397 | 398 | m_G = nb_alg + pp; 399 | n_G = m_A; 400 | buff_G = ( double * ) malloc( m_G * n_G * sizeof( double ) ); 401 | ldim_G = m_G; 402 | 403 | // Initialize matrices G and Y. 404 | NoFLA_Normal_random_matrix( nb_alg + pp, m_A, buff_G, ldim_G ); 405 | //// FLA_Gemm( FLA_NO_TRANSPOSE, FLA_NO_TRANSPOSE, 406 | //// FLA_ONE, G, A, FLA_ZERO, Y ); 407 | dgemm_( "No tranpose", "No transpose", & m_Y, & n_Y, & m_A, 408 | & d_one, buff_G, & ldim_G, buff_A, & ldim_A, 409 | & d_zero, buff_Y, & ldim_Y ); 410 | 411 | // Main Loop. 412 | for( j = 0; j < mn_A; j += nb_alg ) { 413 | b = min( nb_alg, min( n_A - j, m_A - j ) ); 414 | 415 | // Check whether it is the last iteration. 416 | last_iter = ( ( ( j + nb_alg >= m_A )||( j + nb_alg >= n_A ) ) ? 1 : 0 ); 417 | 418 | // Some initializations for the iteration of this loop. 419 | n_VR = n_V - j; 420 | buff_VR = & buff_V[ 0 + j * ldim_V ]; 421 | buff_YR = & buff_Y[ 0 + j * ldim_Y ]; 422 | buff_pB = & buff_p[ j ]; 423 | buff_sB = & buff_s[ j ]; 424 | buff_AR = & buff_A[ 0 + j * ldim_A ]; 425 | 426 | m_AB1 = m_A - j; 427 | n_AB1 = b; 428 | buff_AB1 = & buff_A[ j + j * ldim_A ]; 429 | buff_p1 = & buff_p[ j ]; 430 | buff_s1 = & buff_s[ j ]; 431 | buff_A01 = & buff_A[ 0 + j * ldim_A ]; 432 | buff_Y1 = & buff_Y[ 0 + j * ldim_Y ]; 433 | buff_T1_T = & buff_W[ 0 + j * ldim_W ]; 434 | ldim_T1_T = ldim_W; 435 | 436 | buff_A11 = & buff_A[ j + j * ldim_A ]; 437 | m_A11 = b; 438 | n_A11 = b; 439 | 440 | buff_A21 = & buff_A[ min( m_A - 1, j + nb_alg ) + j * ldim_A ]; 441 | m_A21 = max( 0, m_A - j - b ); 442 | n_A21 = b; 443 | 444 | buff_A12 = & buff_A[ j + min( n_A - 1, j + b ) * ldim_A ]; 445 | m_A12 = b; 446 | n_A12 = max( 0, n_A - j - b ); 447 | 448 | //// buff_A22 = & buff_A[ min( m_A - 1, j + b ) + 449 | //// min( n_A - 1, j + b ) * ldim_A ]; 450 | m_A22 = max( 0, m_A - j - b ); 451 | //// n_A22 = max( 0, n_A - j - b ); 452 | 453 | buff_Y2 = & buff_Y[ 0 + min( n_Y - 1, j + b ) * ldim_Y ]; 454 | buff_G1 = & buff_G[ 0 + j * ldim_G ]; 455 | buff_G2 = & buff_G[ 0 + min( n_G - 1, j + b ) * ldim_G ]; 456 | 457 | #ifdef CHECK_DOWNDATING_OF_Y 458 | // Check downdating of matrix Y: Compare downdated matrix Y with 459 | // matrix Y computed from scratch. 460 | int m_cyr, n_cyr, ldim_cyr, m_ABR, ii, jj; 461 | double * buff_cyr, aux, sum; 462 | 463 | m_cyr = m_Y; 464 | n_cyr = n_Y - j; 465 | ldim_cyr = m_cyr; 466 | m_ABR = m_A - j; 467 | buff_cyr = ( double * ) malloc( m_cyr * n_cyr * sizeof( double ) ); 468 | 469 | //// FLA_Gemm( FLA_NO_TRANSPOSE, FLA_NO_TRANSPOSE, 470 | //// FLA_ONE, GR, ABR, FLA_ZERO, CYR ); 471 | dgemm_( "No tranpose", "No transpose", & m_cyr, & n_cyr, & m_ABR, 472 | & d_one, & buff_G[ 0 + j * ldim_G ], & ldim_G, 473 | & buff_A[ j + j * ldim_A ], & ldim_A, 474 | & d_zero, & buff_cyr[ 0 + 0 * ldim_cyr ], & ldim_cyr ); 475 | 476 | //// print_double_matrix( "cyr", m_cyr, n_cyr, buff_cyr, ldim_cyr ); 477 | //// print_double_matrix( "y", m_Y, n_Y, buff_Y, ldim_Y ); 478 | sum = 0.0; 479 | for( jj = 0; jj < n_cyr; jj++ ) { 480 | for( ii = 0; ii < m_cyr; ii++ ) { 481 | aux = buff_Y[ ii + ( j + jj ) * ldim_Y ] - 482 | buff_cyr[ ii + jj * ldim_cyr ]; 483 | sum += aux * aux; 484 | } 485 | } 486 | sum = sqrt( sum ); 487 | printf( "%% diff between Y and downdated Y: %le\n", sum ); 488 | 489 | free( buff_cyr ); 490 | #endif 491 | 492 | if( last_iter == 0 ) { 493 | // Compute QRP of YR, and apply permutations to matrix AR. 494 | // A copy of YR is made into VR, and permutations are applied to YR. 495 | //// FLA_Merge_2x1( ATR, 496 | //// ABR, & AR ); 497 | //// FLA_Copy( YR, VR ); 498 | //// FLA_QRPmod_WY_unb_var4( 1, bRow, VR, pB, sB, 1, AR, 1, YR, 0, None ); 499 | 500 | dlacpy_( "All", & m_V, & n_VR, buff_YR, & ldim_Y, 501 | buff_VR, & ldim_V ); 502 | NoFLA_QRPmod_WY_unb_var4( 1, b, 503 | m_V, n_VR, buff_VR, ldim_V, buff_pB, buff_sB, 504 | 1, m_A, buff_AR, ldim_A, 505 | 1, m_Y, buff_YR, ldim_Y, 506 | 0, buff_Y, ldim_Y ); 507 | } 508 | 509 | // 510 | // Compute QRP of panel AB1 = [ A11; A21 ]. 511 | // Apply same permutations to A01 and Y1, and build T1_T. 512 | // 513 | //// FLA_Part_2x1( W1, & T1_T, 514 | //// & None, b, FLA_TOP ); 515 | //// FLA_Merge_2x1( A11, 516 | //// A21, & AB1 ); 517 | //// FLA_QRPmod_WY_unb_var4( panel_pivoting, -1, AB1, p1, s1, 518 | //// 1, A01, 1, Y1, 1, T1_T ); 519 | 520 | NoFLA_QRPmod_WY_unb_var4( panel_pivoting, -1, 521 | m_AB1, n_AB1, buff_AB1, ldim_A, buff_p1, buff_s1, 522 | 1, j, buff_A01, ldim_A, 523 | 1, m_Y, buff_Y1, ldim_Y, 524 | 1, buff_T1_T, ldim_W ); 525 | 526 | // 527 | // Update the rest of the matrix. 528 | // 529 | if ( ( j + b ) < n_A ) { 530 | // Apply the Householder transforms associated with AB1 = [ A11; A21 ] 531 | // and T1_T to [ A12; A22 ]: 532 | // / A12 \ := QB1' / A12 \ 533 | // \ A22 / \ A22 / 534 | // where QB1 is formed from AB1 and T1_T. 535 | //// MyFLA_Apply_Q_WY_lhfc_blk_var4( A11, A21, T1_T, A12, A22 ); 536 | 537 | NoFLA_Apply_Q_WY_lhfc_blk_var4( 538 | m_A11 + m_A21, n_A11, buff_A11, ldim_A, 539 | b, b, buff_T1_T, ldim_W, 540 | m_A12 + m_A22, n_A12, buff_A12, ldim_A ); 541 | } 542 | 543 | // 544 | // Downdate matrix Y. 545 | // 546 | if ( ! last_iter ) { 547 | //// MyFLA_Downdate_Y( A11, A21, A12, T1_T, Y2, G1, G2 ); 548 | 549 | NoFLA_Downdate_Y( 550 | m_A11, n_A11, buff_A11, ldim_A, 551 | m_A21, n_A21, buff_A21, ldim_A, 552 | m_A12, n_A12, buff_A12, ldim_A, 553 | b, b, buff_T1_T, ldim_T1_T, 554 | m_Y, max( 0, n_Y - j - b ), buff_Y2, ldim_Y, 555 | m_G, b, buff_G1, ldim_G, 556 | m_G, max( 0, n_G - j - b ), buff_G2, ldim_G ); 557 | } 558 | } 559 | 560 | // Remove auxiliary objects. 561 | //// FLA_Obj_free( & G ); 562 | //// FLA_Obj_free( & Y ); 563 | //// FLA_Obj_free( & V ); 564 | //// FLA_Obj_free( & W ); 565 | free( buff_G ); 566 | free( buff_Y ); 567 | free( buff_V ); 568 | free( buff_W ); 569 | 570 | return 0; 571 | } 572 | 573 | 574 | // ============================================================================ 575 | static int NoFLA_Normal_random_matrix( int m_A, int n_A, 576 | double * buff_A, int ldim_A ) { 577 | // 578 | // It generates a random matrix with normal distribution. 579 | // 580 | int i, j; 581 | 582 | // Main loop. 583 | for ( j = 0; j < n_A; j++ ) { 584 | for ( i = 0; i < m_A; i++ ) { 585 | buff_A[ i + j * ldim_A ] = NoFLA_Normal_random_number( 0.0, 1.0 ); 586 | } 587 | } 588 | 589 | return 0; 590 | } 591 | 592 | /* ========================================================================= */ 593 | static double NoFLA_Normal_random_number( double mu, double sigma ) { 594 | static int alternate_calls = 0; 595 | static double b1, b2; 596 | double c1, c2, a, factor; 597 | 598 | // Quick return. 599 | if( alternate_calls == 1 ) { 600 | alternate_calls = ! alternate_calls; 601 | return( mu + sigma * b2 ); 602 | } 603 | // Main loop. 604 | do { 605 | c1 = -1.0 + 2.0 * ( (double) rand() / RAND_MAX ); 606 | c2 = -1.0 + 2.0 * ( (double) rand() / RAND_MAX ); 607 | a = c1 * c1 + c2 * c2; 608 | } while ( ( a == 0 )||( a >= 1 ) ); 609 | factor = sqrt( ( -2 * log( a ) ) / a ); 610 | b1 = c1 * factor; 611 | b2 = c2 * factor; 612 | alternate_calls = ! alternate_calls; 613 | return( mu + sigma * b1 ); 614 | } 615 | 616 | // ============================================================================ 617 | static int NoFLA_Downdate_Y( 618 | int m_U11, int n_U11, double * buff_U11, int ldim_U11, 619 | int m_U21, int n_U21, double * buff_U21, int ldim_U21, 620 | int m_A12, int n_A12, double * buff_A12, int ldim_A12, 621 | int m_T, int n_T, double * buff_T, int ldim_T, 622 | int m_Y2, int n_Y2, double * buff_Y2, int ldim_Y2, 623 | int m_G1, int n_G1, double * buff_G1, int ldim_G1, 624 | int m_G2, int n_G2, double * buff_G2, int ldim_G2 ) { 625 | // 626 | // It downdates matrix Y, and updates matrix G. 627 | // Only Y2 of Y is updated. 628 | // Only G1 and G2 of G are updated. 629 | // 630 | // Y2 = Y2 - ( G1 - ( G1*U11 + G2*U21 ) * T11 * U11' ) * R12. 631 | // 632 | int i, j; 633 | double * buff_B; 634 | double d_one = 1.0; 635 | double d_minus_one = -1.0; 636 | int m_B = m_G1; 637 | int n_B = n_G1; 638 | int ldim_B = m_G1; 639 | 640 | // Create object B. 641 | //// FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, G1, & B ); 642 | buff_B = ( double * ) malloc( m_B * n_B * sizeof( double ) ); 643 | 644 | // B = G1. 645 | //// FLA_Copy( G1, B ); 646 | dlacpy_( "All", & m_G1, & n_G1, buff_G1, & ldim_G1, 647 | buff_B, & ldim_B ); 648 | 649 | // B = B * U11. 650 | //// FLA_Trmm( FLA_RIGHT, FLA_LOWER_TRIANGULAR, 651 | //// FLA_NO_TRANSPOSE, FLA_UNIT_DIAG, 652 | //// FLA_ONE, U11, B ); 653 | dtrmm_( "Right", "Lower", "No transpose", "Unit", & m_B, & n_B, 654 | & d_one, buff_U11, & ldim_U11, buff_B, & ldim_B ); 655 | 656 | // B = B + G2 * U21. 657 | //// FLA_Gemm( FLA_NO_TRANSPOSE, FLA_NO_TRANSPOSE, 658 | //// FLA_ONE, G2, U21, FLA_ONE, B ); 659 | dgemm_( "No transpose", "No tranpose", & m_B, & n_B, & m_U21, 660 | & d_one, buff_G2, & ldim_G2, buff_U21, & ldim_U21, 661 | & d_one, buff_B, & ldim_B ); 662 | 663 | // B = B * T11. 664 | //// FLA_Trsm( FLA_RIGHT, FLA_UPPER_TRIANGULAR, 665 | //// FLA_NO_TRANSPOSE, FLA_NONUNIT_DIAG, 666 | //// FLA_ONE, T, B ); 667 | //// dtrsm_( "Right", "Upper", "No transpose", "Non-unit", & m_B, & n_B, 668 | //// & d_one, buff_T, & ldim_T, buff_B, & ldim_B ); 669 | // Used dtrmm instead of dtrsm because of using compact WY instead of UT. 670 | dtrmm_( "Right", "Upper", "No transpose", "Non-unit", & m_B, & n_B, 671 | & d_one, buff_T, & ldim_T, buff_B, & ldim_B ); 672 | 673 | // B = - B * U11^H. 674 | //// FLA_Trmm( FLA_RIGHT, FLA_LOWER_TRIANGULAR, 675 | //// FLA_CONJ_TRANSPOSE, FLA_UNIT_DIAG, 676 | //// FLA_MINUS_ONE, U11, B ); 677 | dtrmm_( "Right", "Lower", "Conj_tranpose", "Unit", & m_B, & n_B, 678 | & d_minus_one, buff_U11, & ldim_U11, buff_B, & ldim_B ); 679 | 680 | // B = G1 + B. 681 | //// FLA_Axpy( FLA_ONE, G1, B ); 682 | for( j = 0; j < n_B; j++ ) { 683 | for( i = 0; i < m_B; i++ ) { 684 | buff_B[ i + j * ldim_B ] += buff_G1[ i + j * ldim_G1 ]; 685 | } 686 | } 687 | 688 | // Y2 = Y2 - B * R12. 689 | //// FLA_Gemm( FLA_NO_TRANSPOSE, FLA_NO_TRANSPOSE, 690 | //// FLA_MINUS_ONE, B, A12, FLA_ONE, Y2 ); 691 | dgemm_( "No transpose", "No transpose", & m_Y2, & n_Y2, & m_A12, 692 | & d_minus_one, buff_B, & ldim_B, buff_A12, & ldim_A12, 693 | & d_one, buff_Y2, & ldim_Y2 ); 694 | 695 | // 696 | // GR = GR * Q 697 | // 698 | NoFLA_Apply_Q_WY_rnfc_blk_var4( 699 | m_U11 + m_U21, n_U11, buff_U11, ldim_U11, 700 | m_T, n_T, buff_T, ldim_T, 701 | m_G1, n_G1 + n_G2, buff_G1, ldim_G1 ); 702 | 703 | // Remove object B. 704 | //// FLA_Obj_free( & B ); 705 | free( buff_B ); 706 | 707 | return 0; 708 | } 709 | 710 | // ============================================================================ 711 | static int NoFLA_Apply_Q_WY_lhfc_blk_var4( 712 | int m_U, int n_U, double * buff_U, int ldim_U, 713 | int m_T, int n_T, double * buff_T, int ldim_T, 714 | int m_B, int n_B, double * buff_B, int ldim_B ) { 715 | // 716 | // It applies the transpose of a block transformation Q to a matrix B from 717 | // the left: 718 | // B := Q' * B 719 | // where: 720 | // Q = I - U * T' * U'. 721 | // 722 | double * buff_W; 723 | int ldim_W; 724 | 725 | // Create auxiliary object. 726 | //// FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, B1, & W ); 727 | buff_W = ( double * ) malloc( n_B * n_U * sizeof( double ) ); 728 | ldim_W = max( 1, n_B ); 729 | 730 | // Apply the block transformation. 731 | dlarfb_( "Left", "Transpose", "Forward", "Columnwise", 732 | & m_B, & n_B, & n_U, buff_U, & ldim_U, buff_T, & ldim_T, 733 | buff_B, & ldim_B, buff_W, & ldim_W ); 734 | 735 | // Remove auxiliary object. 736 | //// FLA_Obj_free( & W ); 737 | free( buff_W ); 738 | 739 | return 0; 740 | } 741 | 742 | // ============================================================================ 743 | static int NoFLA_Apply_Q_WY_rnfc_blk_var4( 744 | int m_U, int n_U, double * buff_U, int ldim_U, 745 | int m_T, int n_T, double * buff_T, int ldim_T, 746 | int m_B, int n_B, double * buff_B, int ldim_B ) { 747 | // 748 | // It applies a block transformation Q to a matrix B from the right: 749 | // B = B * Q 750 | // where: 751 | // Q = I - U * T' * U'. 752 | // 753 | double * buff_W; 754 | int ldim_W; 755 | 756 | // Create auxiliary object. 757 | //// FLA_Obj_create_conf_to( FLA_TRANSPOSE, B1, & W ); 758 | buff_W = ( double * ) malloc( m_B * n_U * sizeof( double ) ); 759 | ldim_W = max( 1, m_B ); 760 | 761 | // Apply the block transformation. 762 | dlarfb_( "Right", "No transpose", "Forward", "Columnwise", 763 | & m_B, & n_B, & n_U, buff_U, & ldim_U, buff_T, & ldim_T, 764 | buff_B, & ldim_B, buff_W, & ldim_W ); 765 | 766 | // Remove auxiliary object. 767 | //// FLA_Obj_free( & W ); 768 | free( buff_W ); 769 | 770 | return 0; 771 | } 772 | 773 | // ============================================================================ 774 | static int NoFLA_QRPmod_WY_unb_var4( int pivoting, int num_stages, 775 | int m_A, int n_A, double * buff_A, int ldim_A, 776 | int * buff_p, double * buff_t, 777 | int pivot_B, int m_B, double * buff_B, int ldim_B, 778 | int pivot_C, int m_C, double * buff_C, int ldim_C, 779 | int build_T, double * buff_T, int ldim_T ) { 780 | // 781 | // It computes an unblocked QR factorization of matrix A with or without 782 | // pivoting. Matrices B and C are optionally pivoted, and matrix T is 783 | // optionally built. 784 | // 785 | // Arguments: 786 | // "pivoting": If pivoting==1, then QR factorization with pivoting is used. 787 | // "numstages": It tells the number of columns that are factorized. 788 | // If "num_stages" is negative, the whole matrix A is factorized. 789 | // If "num_stages" is positive, only the first "num_stages" are factorized. 790 | // "pivot_B": if "pivot_B" is true, matrix "B" is pivoted too. 791 | // "pivot_C": if "pivot_C" is true, matrix "C" is pivoted too. 792 | // "build_T": if "build_T" is true, matrix "T" is built. 793 | // 794 | int j, mn_A, m_a21, m_A22, n_A22, n_dB, idx_max_col, 795 | i_one = 1, n_house_vector, m_rest; 796 | double * buff_d, * buff_e, * buff_workspace, diag; 797 | int idamax_(); 798 | 799 | //// printf( "NoFLA_QRPmod_WY_unb_var4. pivoting: %d \n", pivoting ); 800 | 801 | // Some initializations. 802 | mn_A = min( m_A, n_A ); 803 | 804 | // Set the number of stages, if needed. 805 | if( num_stages < 0 ) { 806 | num_stages = mn_A; 807 | } 808 | 809 | // Create auxiliary vectors. 810 | buff_d = ( double * ) malloc( n_A * sizeof( double ) ); 811 | buff_e = ( double * ) malloc( n_A * sizeof( double ) ); 812 | buff_workspace = ( double * ) malloc( n_A * sizeof( double ) ); 813 | 814 | if( pivoting == 1 ) { 815 | // Compute initial norms of A into d and e. 816 | NoFLA_QRP_compute_norms( m_A, n_A, buff_A, ldim_A, buff_d, buff_e ); 817 | } 818 | 819 | // Main Loop. 820 | for( j = 0; j < num_stages; j++ ) { 821 | n_dB = n_A - j; 822 | m_a21 = m_A - j - 1; 823 | m_A22 = m_A - j - 1; 824 | n_A22 = n_A - j - 1; 825 | 826 | if( pivoting == 1 ) { 827 | // Obtain the index of the column with largest 2-norm. 828 | idx_max_col = idamax_( & n_dB, & buff_d[ j ], & i_one ) - 1; 829 | 830 | // Swap columns of A, B, C, pivots, and norms vectors. 831 | NoFLA_QRP_pivot_G_B_C( idx_max_col, 832 | m_A, & buff_A[ 0 + j * ldim_A ], ldim_A, 833 | pivot_B, m_B, & buff_B[ 0 + j * ldim_B ], ldim_B, 834 | pivot_C, m_C, & buff_C[ 0 + j * ldim_C ], ldim_C, 835 | & buff_p[ j ], 836 | & buff_d[ j ], 837 | & buff_e[ j ] ); 838 | } 839 | 840 | // Compute tau1 and u21 from alpha11 and a21 such that tau1 and u21 841 | // determine a Householder transform H such that applying H from the 842 | // left to the column vector consisting of alpha11 and a21 annihilates 843 | // the entries in a21 (and updates alpha11). 844 | n_house_vector = m_a21 + 1; 845 | dlarfg_( & n_house_vector, 846 | & buff_A[ j + j * ldim_A ], 847 | & buff_A[ min( m_A-1, j+1 ) + j * ldim_A ], & i_one, 848 | & buff_t[ j ] ); 849 | 850 | // / a12t \ = H / a12t \ 851 | // \ A22 / \ A22 / 852 | // 853 | // where H is formed from tau1 and u21. 854 | diag = buff_A[ j + j * ldim_A ]; 855 | buff_A[ j + j * ldim_A ] = 1.0; 856 | m_rest = m_A22 + 1; 857 | dlarf_( "Left", & m_rest, & n_A22, 858 | & buff_A[ j + j * ldim_A ], & i_one, 859 | & buff_t[ j ], 860 | & buff_A[ j + ( j+1 ) * ldim_A ], & ldim_A, 861 | buff_workspace ); 862 | buff_A[ j + j * ldim_A ] = diag; 863 | 864 | if( pivoting == 1 ) { 865 | // Update partial column norms. 866 | NoFLA_QRP_downdate_partial_norms( m_A22, n_A22, 867 | & buff_d[ j+1 ], 1, 868 | & buff_e[ j+1 ], 1, 869 | & buff_A[ j + ( j+1 ) * ldim_A ], ldim_A, 870 | & buff_A[ ( j+1 ) + min( n_A-1, ( j+1 ) ) * ldim_A ], ldim_A ); 871 | } 872 | } 873 | 874 | // Build T. 875 | if( build_T ) { 876 | dlarft_( "Forward", "Columnwise", & m_A, & num_stages, buff_A, & ldim_A, 877 | buff_t, buff_T, & ldim_T ); 878 | } 879 | 880 | // Remove auxiliary vectors. 881 | free( buff_d ); 882 | free( buff_e ); 883 | free( buff_workspace ); 884 | 885 | return 0; 886 | } 887 | 888 | // ============================================================================ 889 | static int NoFLA_QRP_compute_norms( 890 | int m_A, int n_A, double * buff_A, int ldim_A, 891 | double * buff_d, double * buff_e ) { 892 | // 893 | // It computes the column norms of matrix A. The norms are stored into 894 | // vectors d and e. 895 | // 896 | int j, i_one = 1; 897 | double dnrm2_(); 898 | 899 | // Main loop. 900 | for( j = 0; j < n_A; j++ ) { 901 | * buff_d = dnrm2_( & m_A, buff_A, & i_one ); 902 | * buff_e = * buff_d; 903 | buff_A += ldim_A; 904 | buff_d++; 905 | buff_e++; 906 | } 907 | 908 | return 0; 909 | } 910 | 911 | // ============================================================================ 912 | static int NoFLA_QRP_downdate_partial_norms( int m_A, int n_A, 913 | double * buff_d, int st_d, 914 | double * buff_e, int st_e, 915 | double * buff_wt, int st_wt, 916 | double * buff_A, int ldim_A ) { 917 | // 918 | // It updates (downdates) the column norms of matrix A. It uses Drmac's method. 919 | // 920 | int j, i_one = 1; 921 | double * ptr_d, * ptr_e, * ptr_wt, * ptr_A; 922 | double temp, temp2, temp5, tol3z; 923 | double dnrm2_(), dlamch_(); 924 | 925 | /* 926 | * 927 | * Update partial column norms 928 | * 929 | DO 30 J = I + 1, N 930 | IF( WORK( J ).NE.ZERO ) THEN 931 | * 932 | * NOTE: The following 4 lines follow from the analysis in 933 | * Lapack Working Note 176. 934 | * 935 | TEMP = ABS( A( I, J ) ) / WORK( J ) 936 | TEMP = MAX( ZERO, ( ONE+TEMP )*( ONE-TEMP ) ) 937 | TEMP2 = TEMP*( WORK( J ) / WORK( N+J ) )**2 938 | IF( TEMP2 .LE. TOL3Z ) THEN 939 | IF( M-I.GT.0 ) THEN 940 | WORK( J ) = DNRM2( M-I, A( I+1, J ), 1 ) 941 | WORK( N+J ) = WORK( J ) 942 | ELSE 943 | WORK( J ) = ZERO 944 | WORK( N+J ) = ZERO 945 | END IF 946 | ELSE 947 | WORK( J ) = WORK( J )*SQRT( TEMP ) 948 | END IF 949 | END IF 950 | 30 CONTINUE 951 | */ 952 | 953 | // Some initializations. 954 | tol3z = sqrt( dlamch_( "Epsilon" ) ); 955 | ptr_d = buff_d; 956 | ptr_e = buff_e; 957 | ptr_wt = buff_wt; 958 | ptr_A = buff_A; 959 | 960 | // Main loop. 961 | for( j = 0; j < n_A; j++ ) { 962 | if( * ptr_d != 0.0 ) { 963 | temp = dabs( * ptr_wt ) / * ptr_d; 964 | temp = max( 0.0, ( 1.0 + temp ) * ( 1 - temp ) ); 965 | temp5 = * ptr_d / * ptr_e; 966 | temp2 = temp * temp5 * temp5; 967 | if( temp2 <= tol3z ) { 968 | if( m_A > 0 ) { 969 | * ptr_d = dnrm2_( & m_A, ptr_A, & i_one ); 970 | * ptr_e = *ptr_d; 971 | } else { 972 | * ptr_d = 0.0; 973 | * ptr_e = 0.0; 974 | } 975 | } else { 976 | * ptr_d = * ptr_d * sqrt( temp ); 977 | } 978 | } 979 | ptr_A += ldim_A; 980 | ptr_d += st_d; 981 | ptr_e += st_e; 982 | ptr_wt += st_wt; 983 | } 984 | 985 | return 0; 986 | } 987 | 988 | 989 | // ============================================================================ 990 | static int NoFLA_QRP_pivot_G_B_C( int j_max_col, 991 | int m_G, double * buff_G, int ldim_G, 992 | int pivot_B, int m_B, double * buff_B, int ldim_B, 993 | int pivot_C, int m_C, double * buff_C, int ldim_C, 994 | int * buff_p, 995 | double * buff_d, double * buff_e ) { 996 | // 997 | // It pivots matrix G, pivot vector p, and norms vectors d and e. 998 | // Matrices B and C are optionally pivoted. 999 | // 1000 | int ival, i_one = 1; 1001 | double * ptr_g1, * ptr_g2, * ptr_b1, * ptr_b2, * ptr_c1, * ptr_c2; 1002 | 1003 | // Swap columns of G, pivots, and norms. 1004 | if( j_max_col != 0 ) { 1005 | 1006 | // Swap full column 0 and column "j_max_col" of G. 1007 | ptr_g1 = & buff_G[ 0 + 0 * ldim_G ]; 1008 | ptr_g2 = & buff_G[ 0 + j_max_col * ldim_G ]; 1009 | dswap_( & m_G, ptr_g1, & i_one, ptr_g2, & i_one ); 1010 | 1011 | // Swap full column 0 and column "j_max_col" of B. 1012 | if( pivot_B ) { 1013 | ptr_b1 = & buff_B[ 0 + 0 * ldim_B ]; 1014 | ptr_b2 = & buff_B[ 0 + j_max_col * ldim_B ]; 1015 | dswap_( & m_B, ptr_b1, & i_one, ptr_b2, & i_one ); 1016 | } 1017 | 1018 | // Swap full column 0 and column "j_max_col" of C. 1019 | if( pivot_C ) { 1020 | ptr_c1 = & buff_C[ 0 + 0 * ldim_C ]; 1021 | ptr_c2 = & buff_C[ 0 + j_max_col * ldim_C ]; 1022 | dswap_( & m_C, ptr_c1, & i_one, ptr_c2, & i_one ); 1023 | } 1024 | 1025 | // Swap element 0 and element "j_max_col" of pivot vector "p". 1026 | ival = buff_p[ j_max_col ]; 1027 | buff_p[ j_max_col ] = buff_p[ 0 ]; 1028 | buff_p[ 0 ] = ival; 1029 | 1030 | // Copy norms of column 0 to column "j_max_col". 1031 | buff_d[ j_max_col ] = buff_d[ 0 ]; 1032 | buff_e[ j_max_col ] = buff_e[ 0 ]; 1033 | } 1034 | 1035 | return 0; 1036 | } 1037 | 1038 | -------------------------------------------------------------------------------- /libflame_sources/FLA_HQRRP_UT_blk_var2.c: -------------------------------------------------------------------------------- 1 | /* 2 | =============================================================================== 3 | Authors 4 | =============================================================================== 5 | 6 | Per-Gunnar Martinsson 7 | Dept. of Applied Mathematics, 8 | University of Colorado at Boulder, 9 | 526 UCB, Boulder, CO 80309-0526, USA 10 | 11 | Gregorio Quintana-Orti 12 | Depto. de Ingenieria y Ciencia de Computadores, 13 | Universitat Jaume I, 14 | 12.071 Castellon, Spain 15 | 16 | Nathan Heavner 17 | Dept. of Applied Mathematics, 18 | University of Colorado at Boulder, 19 | 526 UCB, Boulder, CO 80309-0526, USA 20 | 21 | Robert van de Geijn 22 | Dept. of Computer Science and Institute for Computational Engineering and 23 | Sciences, 24 | The University of Texas at Austin 25 | Austin, TX. 26 | 27 | =============================================================================== 28 | Copyright 29 | =============================================================================== 30 | 31 | Copyright (C) 2016, 32 | Universitat Jaume I, 33 | University of Colorado at Boulder, 34 | The University of Texas at Austin. 35 | 36 | =============================================================================== 37 | Disclaimer 38 | =============================================================================== 39 | 40 | This code is distributed in the hope that it will be useful, but 41 | WITHOUT ANY WARRANTY EXPRESSED OR IMPLIED. 42 | 43 | */ 44 | 45 | #include 46 | #include 47 | #include 48 | #include "FLAME.h" 49 | #include "FLA_HQRRP_UT_blk_var2.h" 50 | 51 | 52 | /* 53 | // Matrices with dimensions smaller than THRESHOLD_FOR_DGEQPF are processed 54 | // with LAPACK's routine dgeqpf. 55 | // Matrices with dimensions between THRESHOLD_FOR_DGEQPF and 56 | // THRESHOLD_FOR_DGEQP3 are processed with LAPACK's routine dgeqp3. 57 | // Matrices with dimensions larger than THRESHOLD_FOR_DGEQP3 are processed 58 | // with the new HQRRP code. 59 | #define THRESHOLD_FOR_DGEQPF 250 60 | #define THRESHOLD_FOR_DGEQP3 1000 61 | */ 62 | 63 | 64 | // ============================================================================ 65 | // Compilation declarations. 66 | 67 | #undef CHECK_DOWNDATING_OF_Y 68 | #undef PROFILE 69 | 70 | 71 | // ============================================================================ 72 | // Declaration of local prototypes. 73 | 74 | static int MyFLA_Normal_random_matrix( FLA_Obj A ); 75 | 76 | static double MyFLA_Normal_random_number( double mu, double sigma ); 77 | 78 | static FLA_Error MyFLA_Downdate_Y( FLA_Obj U11, FLA_Obj U21, FLA_Obj A12, 79 | FLA_Obj T, 80 | FLA_Obj Y2, FLA_Obj G1, FLA_Obj G2 ); 81 | 82 | static FLA_Error MyFLA_Apply_Q_UT_lhfc_blk_var2( FLA_Obj U11, FLA_Obj U21, 83 | FLA_Obj T, 84 | FLA_Obj B1, FLA_Obj B2 ); 85 | 86 | static FLA_Error MyFLA_Apply_Q_UT_rnfc_blk_var2( FLA_Obj U11, FLA_Obj U21, 87 | FLA_Obj T, 88 | FLA_Obj B1, FLA_Obj B2 ); 89 | 90 | static FLA_Error MyFLA_QRPmod_UT_unb_var2( int pivoting, int num_stages, 91 | FLA_Obj A, FLA_Obj p, FLA_Obj t, 92 | int pivot_B, FLA_Obj B, 93 | int pivot_C, FLA_Obj C, 94 | int build_T, FLA_Obj T ); 95 | 96 | static FLA_Error MyFLA_Apply_H2_UT_l_opd_var2( 97 | int m_u2_A2, 98 | int n_a1t, 99 | double * tau, 100 | double * u2, int inc_u2, 101 | double * a1t, int inc_a1t, 102 | double * A2, int ldim_A2, 103 | double * workspace ); 104 | 105 | static FLA_Error MyFLA_QRP_compute_norms( FLA_Obj A, FLA_Obj d, FLA_Obj e ); 106 | 107 | static FLA_Error NoFLA_QRP_pivot_B_C( int j_max_col, 108 | int m_G, double * buff_G, int ldim_G, 109 | int pivot_B, int m_B, double * buff_B, int ldim_B, 110 | int pivot_C, int m_C, double * buff_C, int ldim_C, 111 | int * buff_p, 112 | double * buff_d, double * buff_e ); 113 | 114 | static FLA_Error NoFLA_QRP_downdate_partial_norms( int m_A, int n_A, 115 | double * buff_d, int st_d, 116 | double * buff_e, int st_e, 117 | double * buff_wt, int st_wt, 118 | double * buff_A, int ldim_A ); 119 | 120 | static int NoFLA_my_idamax( int n, double * buff_data ); 121 | 122 | 123 | // ============================================================================ 124 | int FLA_HQRRP_UT_blk_var2( FLA_Obj A, FLA_Obj p, FLA_Obj s, 125 | int nb_alg, int pp, int panel_pivoting ) { 126 | // 127 | // HQRRP: It computes the Householder QR with Randomized Pivoting of matrix A. 128 | // This routine is almost compatible with LAPACK's dgeqp3. 129 | // The main difference is that this routine does not manage fixed columns. 130 | // 131 | // Main features: 132 | // * BLAS-3 based. 133 | // * Norm downdating method by Drmac. 134 | // * Downdating for computing Y. 135 | // * Use of libflame. 136 | // * UT transformations are used. 137 | // 138 | // Arguments: 139 | // ---------- 140 | // A: Matrix to be factorized. 141 | // p: output vector with the pivots. 142 | // s: Output vector with the tau values of the Householder factors. 143 | // nb_alg: Block size. Usual values for nb_alg are 32, 64, etc. 144 | // pp: Oversampling size. Usual values for pp are 5, 10, etc. 145 | // panel_pivoting: If panel_pivoting==1, QR with pivoting is applied to 146 | // factorize the panels of matrix A. Otherwise, QR without 147 | // pivoting is used. Usual value for panel_pivoting is 1. 148 | // 149 | // Declaration of variables. 150 | FLA_Obj ATL, ATR, A00, A01, A02, 151 | ABL, ABR, A10, A11, A12, 152 | A20, A21, A22; 153 | FLA_Obj pT, p0, 154 | pB, p1, 155 | p2; 156 | FLA_Obj sT, s0, 157 | sB, s1, 158 | s2; 159 | FLA_Obj GL, GR, G0, G1, G2; 160 | FLA_Obj YL, YR, Y0, Y1, Y2; 161 | FLA_Obj VL, VR, V0, V1, V2; 162 | FLA_Obj WL, WR, W0, W1, W2; 163 | FLA_Obj AR, AB1, G, Y, V, W, T1_T, None; 164 | int bRow, m_A, n_A, mn_A, dtype_A, last_iter; 165 | #ifdef PROFILE 166 | double t1, t2, tt_qrp1, tt_qrp2, tt_updt, tt_down; 167 | #endif 168 | 169 | // Executable Statements. 170 | //// printf( "%% FLA_HQRRP_UT_blk_var2.\n" ); 171 | 172 | // Some initializations. 173 | m_A = FLA_Obj_length( A ); 174 | n_A = FLA_Obj_width ( A ); 175 | mn_A = min( m_A, n_A ); 176 | dtype_A = FLA_Obj_datatype( A ); 177 | 178 | // Quick return. 179 | if( mn_A == 0 ) { 180 | return 0; 181 | } 182 | 183 | // Initialize the seed for the generator of random numbers. 184 | srand( 12 ); 185 | 186 | #ifdef PROFILE 187 | tt_qrp1 = 0.0; 188 | tt_qrp2 = 0.0; 189 | tt_updt = 0.0; 190 | tt_down = 0.0; 191 | #endif 192 | 193 | // Create and initialize auxiliary objects. 194 | FLA_Obj_create( dtype_A, nb_alg + pp, m_A, 0, 0, & G ); 195 | FLA_Obj_create( dtype_A, nb_alg + pp, n_A, 0, 0, & Y ); 196 | FLA_Obj_create( dtype_A, nb_alg + pp, n_A, 0, 0, & V ); 197 | FLA_Obj_create( dtype_A, nb_alg, n_A, 0, 0, & W ); 198 | 199 | // Initialize matrices G and Y. 200 | MyFLA_Normal_random_matrix( G ); 201 | FLA_Gemm( FLA_NO_TRANSPOSE, FLA_NO_TRANSPOSE, 202 | FLA_ONE, G, A, FLA_ZERO, Y ); 203 | 204 | // Initial Partitioning. 205 | FLA_Part_2x2( A, & ATL, & ATR, 206 | & ABL, & ABR, 0, 0, FLA_TL ); 207 | FLA_Part_2x1( p, & pT, 208 | & pB, 0, FLA_TOP ); 209 | FLA_Part_2x1( s, & sT, 210 | & sB, 0, FLA_TOP ); 211 | FLA_Part_1x2( G, & GL, & GR, 0, FLA_LEFT ); 212 | FLA_Part_1x2( Y, & YL, & YR, 0, FLA_LEFT ); 213 | FLA_Part_1x2( V, & VL, & VR, 0, FLA_LEFT ); 214 | FLA_Part_1x2( W, & WL, & WR, 0, FLA_LEFT ); 215 | 216 | // Main Loop. 217 | while( ( FLA_Obj_length( ATL ) < FLA_Obj_length( A ) )&& 218 | ( FLA_Obj_width ( ATL ) < FLA_Obj_width ( A ) ) ) { 219 | bRow = min( FLA_Obj_min_dim( ABR ), nb_alg ); 220 | 221 | // Iteration Initial Partitioning. 222 | FLA_Repart_2x2_to_3x3( ATL, /**/ ATR, &A00, /**/ &A01, &A02, 223 | /* ************* */ /* ******************** */ 224 | &A10, /**/ &A11, &A12, 225 | ABL, /**/ ABR, &A20, /**/ &A21, &A22, 226 | bRow, bRow, FLA_BR ); 227 | FLA_Repart_2x1_to_3x1( pT, &p0, 228 | /* ** */ /* *** */ 229 | &p1, 230 | pB, &p2, bRow, FLA_BOTTOM ); 231 | FLA_Repart_2x1_to_3x1( sT, &s0, 232 | /* ** */ /* ** */ 233 | &s1, 234 | sB, &s2, bRow, FLA_BOTTOM ); 235 | FLA_Repart_1x2_to_1x3( GL, /**/ GR, &G0, /**/ &G1, &G2, 236 | bRow, FLA_RIGHT ); 237 | FLA_Repart_1x2_to_1x3( YL, /**/ YR, &Y0, /**/ &Y1, &Y2, 238 | bRow, FLA_RIGHT ); 239 | FLA_Repart_1x2_to_1x3( VL, /**/ VR, &V0, /**/ &V1, &V2, 240 | bRow, FLA_RIGHT ); 241 | FLA_Repart_1x2_to_1x3( WL, /**/ WR, &W0, /**/ &W1, &W2, 242 | bRow, FLA_RIGHT ); 243 | 244 | // ------------------------------------------------------------------------ 245 | 246 | //// printf( "Iter: %ld \n", FLA_Obj_length( ATL ) ); 247 | 248 | // Check whether it is the last iteration. 249 | last_iter = ( FLA_Obj_min_dim( A22 ) <= 0 ? 1 : 0 ); 250 | 251 | #ifdef CHECK_DOWNDATING_OF_Y 252 | // Check downdating of matrix Y: Compare downdated matrix Y with 253 | // matrix Y computed from scratch. 254 | int m_cyr, n_cyr, ldim_cyr, ldim_YR, ii, jj; 255 | double * buff_cyr, * buff_YR, aux, sum; 256 | FLA_Obj CYR; 257 | 258 | FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, YR, & CYR ); 259 | 260 | FLA_Gemm( FLA_NO_TRANSPOSE, FLA_NO_TRANSPOSE, 261 | FLA_ONE, GR, ABR, FLA_ZERO, CYR ); 262 | 263 | //// FLA_Obj_show( " CYR = [ ", CYR, "%le", " ];" ); 264 | //// FLA_Obj_show( " Y = [ ", Y, "%le", " ];" ); 265 | 266 | m_cyr = FLA_Obj_length( CYR ); 267 | n_cyr = FLA_Obj_width( CYR ); 268 | buff_cyr = ( double * ) FLA_Obj_buffer_at_view( CYR ); 269 | ldim_cyr = FLA_Obj_col_stride( CYR ); 270 | buff_YR = ( double * ) FLA_Obj_buffer_at_view( YR ); 271 | ldim_YR = FLA_Obj_col_stride( YR ); 272 | sum = 0.0; 273 | for( jj = 0; jj < n_cyr; jj++ ) { 274 | for( ii = 0; ii < m_cyr; ii++ ) { 275 | aux = buff_YR [ ii + jj * ldim_YR ] - 276 | buff_cyr[ ii + jj * ldim_cyr ]; 277 | sum += aux * aux; 278 | } 279 | } 280 | sum = sqrt( sum ); 281 | printf( "%% diff between Y and downdated Y: %le\n", sum ); 282 | 283 | FLA_Obj_free( & CYR ); 284 | #endif 285 | 286 | if( last_iter == 0 ) { 287 | // Compute QRP of YR, and apply permutations to matrix AR. 288 | // A copy of YR is made into VR, and permutations are applied to YR. 289 | #ifdef PROFILE 290 | t1 = FLA_Clock(); 291 | #endif 292 | FLA_Merge_2x1( ATR, 293 | ABR, & AR ); 294 | FLA_Copy( YR, VR ); 295 | MyFLA_QRPmod_UT_unb_var2( 1, bRow, VR, pB, sB, 1, AR, 1, YR, 0, None ); 296 | 297 | #ifdef PROFILE 298 | t2 = FLA_Clock(); 299 | tt_qrp1 += ( t2 - t1 ); 300 | #endif 301 | } 302 | 303 | // 304 | // Compute QRP of panel AB1 = [ A11; A21 ]. 305 | // Apply same permutations to A01 and Y1, and build T1_T. 306 | // 307 | #ifdef PROFILE 308 | t1 = FLA_Clock(); 309 | #endif 310 | 311 | FLA_Part_2x1( W1, & T1_T, 312 | & None, bRow, FLA_TOP ); 313 | FLA_Merge_2x1( A11, 314 | A21, & AB1 ); 315 | MyFLA_QRPmod_UT_unb_var2( panel_pivoting, -1, AB1, p1, s1, 316 | 1, A01, 1, Y1, 1, T1_T ); 317 | 318 | #ifdef PROFILE 319 | t2 = FLA_Clock(); 320 | tt_qrp2 += ( t2 - t1 ); 321 | #endif 322 | 323 | // 324 | // Update the rest of the matrix. 325 | // 326 | if ( FLA_Obj_width( A12 ) > 0 ) { 327 | // Apply the Householder transforms associated with AB1 = [ A11; A21 ] 328 | // and T1_T to [ A12; A22 ]: 329 | // / A12 \ := QB1' / A12 \ 330 | // \ A22 / \ A22 / 331 | // where QB1 is formed from AB1 and T1_T. 332 | #ifdef PROFILE 333 | t1 = FLA_Clock(); 334 | #endif 335 | 336 | MyFLA_Apply_Q_UT_lhfc_blk_var2( A11, A21, T1_T, A12, A22 ); 337 | 338 | #ifdef PROFILE 339 | t2 = FLA_Clock(); 340 | tt_updt += ( t2 - t1 ); 341 | #endif 342 | } 343 | 344 | // 345 | // Downdate matrix Y. 346 | // 347 | if ( FLA_Obj_width( Y2 ) > 0 ) { 348 | #ifdef PROFILE 349 | t1 = FLA_Clock(); 350 | #endif 351 | 352 | MyFLA_Downdate_Y( A11, A21, A12, T1_T, Y2, G1, G2 ); 353 | 354 | #ifdef PROFILE 355 | t2 = FLA_Clock(); 356 | tt_down += ( t2 - t1 ); 357 | #endif 358 | } 359 | 360 | // ------------------------------------------------------------------------ 361 | // Iteration Final Repartitioning. 362 | 363 | FLA_Cont_with_3x3_to_2x2( &ATL, /**/ &ATR, A00, A01, /**/ A02, 364 | A10, A11, /**/ A12, 365 | /* ************** */ /* ****************** */ 366 | &ABL, /**/ &ABR, A20, A21, /**/ A22, 367 | FLA_TL ); 368 | FLA_Cont_with_3x1_to_2x1( &pT, p0, 369 | p1, 370 | /* ** */ /* *** */ 371 | &pB, p2, FLA_TOP ); 372 | FLA_Cont_with_3x1_to_2x1( &sT, s0, 373 | s1, 374 | /* ** */ /* ** */ 375 | &sB, s2, FLA_TOP ); 376 | FLA_Cont_with_1x3_to_1x2( &GL, /**/ &GR, G0, G1, /**/ G2, 377 | FLA_LEFT ); 378 | FLA_Cont_with_1x3_to_1x2( &YL, /**/ &YR, Y0, Y1, /**/ Y2, 379 | FLA_LEFT ); 380 | FLA_Cont_with_1x3_to_1x2( &VL, /**/ &VR, V0, V1, /**/ V2, 381 | FLA_LEFT ); 382 | FLA_Cont_with_1x3_to_1x2( &WL, /**/ &WR, W0, W1, /**/ W2, 383 | FLA_LEFT ); 384 | } 385 | 386 | // Remove auxiliary objects. 387 | FLA_Obj_free( & G ); 388 | FLA_Obj_free( & Y ); 389 | FLA_Obj_free( & V ); 390 | FLA_Obj_free( & W ); 391 | 392 | #ifdef PROFILE 393 | printf( "%% tt_qrp_1: %le\n", tt_qrp1 ); 394 | printf( "%% tt_qrp_2: %le\n", tt_qrp2 ); 395 | printf( "%% tt_updt_rest: %le\n", tt_updt ); 396 | printf( "%% tt_downdating: %le\n", tt_down ); 397 | printf( "%% total_time: %le\n", tt_qrp1 + tt_qrp2 + tt_updt + tt_down); 398 | #endif 399 | 400 | return 0; 401 | } 402 | 403 | 404 | // ============================================================================ 405 | static int MyFLA_Normal_random_matrix( FLA_Obj A ) { 406 | // 407 | // It sets a random matrix with normal distribution. 408 | // 409 | 410 | switch( FLA_Obj_datatype( A ) ) { 411 | 412 | case FLA_DOUBLE: 413 | { 414 | double * buff_A; 415 | int m_A, n_A, ldim_A, i, j; 416 | 417 | // Some initializations. 418 | m_A = FLA_Obj_length( A ); 419 | n_A = FLA_Obj_width ( A ); 420 | buff_A = ( double * ) FLA_Obj_buffer_at_view( A ); 421 | ldim_A = FLA_Obj_col_stride( A ); 422 | 423 | // Main loop. 424 | for ( j = 0; j < n_A; j++ ) { 425 | for ( i = 0; i < m_A; i++ ) { 426 | buff_A[ i + j * ldim_A ] = MyFLA_Normal_random_number( 0.0, 1.0 ); 427 | } 428 | } 429 | } 430 | break; 431 | 432 | default: 433 | fprintf( stderr, "+++ ERROR in MyFLA_Normal_random_matrix: " ); 434 | fprintf( stderr, "Datatype not implemented: %d\n", FLA_Obj_datatype( A ) ); 435 | } 436 | 437 | return FLA_SUCCESS; 438 | } 439 | 440 | /* ========================================================================= */ 441 | static double MyFLA_Normal_random_number( double mu, double sigma ) { 442 | static int alternate_calls = 0; 443 | static double b1, b2; 444 | double c1, c2, a, factor; 445 | 446 | // Quick return. 447 | if( alternate_calls == 1 ) { 448 | alternate_calls = ! alternate_calls; 449 | return( mu + sigma * b2 ); 450 | } 451 | // Main loop. 452 | do { 453 | c1 = -1.0 + 2.0 * ( (double) rand() / RAND_MAX ); 454 | c2 = -1.0 + 2.0 * ( (double) rand() / RAND_MAX ); 455 | a = c1 * c1 + c2 * c2; 456 | } while ( ( a == 0 )||( a >= 1 ) ); 457 | factor = sqrt( ( -2 * log( a ) ) / a ); 458 | b1 = c1 * factor; 459 | b2 = c2 * factor; 460 | alternate_calls = ! alternate_calls; 461 | return( mu + sigma * b1 ); 462 | } 463 | 464 | // ============================================================================ 465 | static FLA_Error MyFLA_Downdate_Y( FLA_Obj U11, FLA_Obj U21, FLA_Obj A12, 466 | FLA_Obj T, 467 | FLA_Obj Y2, FLA_Obj G1, FLA_Obj G2 ) { 468 | // 469 | // It downdates matrix Y, and updates matrix G. 470 | // Only Y2 of Y is updated. 471 | // Only G1 and G2 of G are updated. 472 | // 473 | // Y2 = Y2 - ( G1 - ( G1*U11 + G2*U21 ) * inv( T11 ) * U11' ) * R12. 474 | // 475 | FLA_Obj B; 476 | 477 | // Create auxiliary object B. 478 | FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, G1, & B ); 479 | 480 | // B = G1. 481 | FLA_Copy( G1, B ); 482 | 483 | // B = B * U11. 484 | FLA_Trmm( FLA_RIGHT, FLA_LOWER_TRIANGULAR, 485 | FLA_NO_TRANSPOSE, FLA_UNIT_DIAG, 486 | FLA_ONE, U11, B ); 487 | 488 | // B = B + G2 * U21. 489 | FLA_Gemm( FLA_NO_TRANSPOSE, FLA_NO_TRANSPOSE, 490 | FLA_ONE, G2, U21, FLA_ONE, B ); 491 | 492 | // B = B * inv( T11 ). 493 | FLA_Trsm( FLA_RIGHT, FLA_UPPER_TRIANGULAR, 494 | FLA_NO_TRANSPOSE, FLA_NONUNIT_DIAG, 495 | FLA_ONE, T, B ); 496 | 497 | // B = - B * U11^H. 498 | FLA_Trmm( FLA_RIGHT, FLA_LOWER_TRIANGULAR, 499 | FLA_CONJ_TRANSPOSE, FLA_UNIT_DIAG, 500 | FLA_MINUS_ONE, U11, B ); 501 | 502 | // B = G1 + B. 503 | FLA_Axpy( FLA_ONE, G1, B ); 504 | 505 | // Y2 = Y2 - B * R12. 506 | FLA_Gemm( FLA_NO_TRANSPOSE, FLA_NO_TRANSPOSE, 507 | FLA_MINUS_ONE, B, A12, FLA_ONE, Y2 ); 508 | 509 | // 510 | // GR = GR * Q 511 | // 512 | MyFLA_Apply_Q_UT_rnfc_blk_var2( U11, U21, T, G1, G2 ); 513 | 514 | // Remove auxiliary object B. 515 | FLA_Obj_free( & B ); 516 | 517 | return FLA_SUCCESS; 518 | } 519 | 520 | // ============================================================================ 521 | static FLA_Error MyFLA_Apply_Q_UT_lhfc_blk_var2( FLA_Obj U11, FLA_Obj U21, 522 | FLA_Obj T, 523 | FLA_Obj B1, FLA_Obj B2 ) { 524 | // 525 | // It applies the conjugate-transpose of a unitary matrix Q to a matrix B from 526 | // the left: 527 | // B := Q' B 528 | // where: 529 | // B = [ B1; B2 ] 530 | // U = [ U11; U21 ] 531 | // Q = ( I - U * inv(T)' * U' )'. 532 | // 533 | FLA_Obj W; 534 | 535 | // Create auxiliary object. 536 | FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, B1, & W ); 537 | 538 | // W = B1; 539 | 540 | FLA_Copyt( FLA_NO_TRANSPOSE, B1, W ); 541 | 542 | // U11 = trilu( U11 ); 543 | // U21 = U21; 544 | // W = triu( T )' * ( U11' * B1 + U21' * B2 ); 545 | 546 | FLA_Trmm( FLA_LEFT, FLA_LOWER_TRIANGULAR, 547 | FLA_CONJ_TRANSPOSE, FLA_UNIT_DIAG, 548 | FLA_ONE, U11, W ); 549 | 550 | FLA_Gemm( FLA_CONJ_TRANSPOSE, FLA_NO_TRANSPOSE, 551 | FLA_ONE, U21, B2, FLA_ONE, W ); 552 | 553 | FLA_Trsm( FLA_LEFT, FLA_UPPER_TRIANGULAR, 554 | FLA_CONJ_TRANSPOSE, FLA_NONUNIT_DIAG, 555 | FLA_ONE, T, W ); 556 | 557 | // B2 = B2 - U21 * W; 558 | // B1 = B1 - U11 * W; 559 | 560 | FLA_Gemm( FLA_NO_TRANSPOSE, FLA_NO_TRANSPOSE, 561 | FLA_MINUS_ONE, U21, W, FLA_ONE, B2 ); 562 | 563 | FLA_Trmm( FLA_LEFT, FLA_LOWER_TRIANGULAR, 564 | FLA_NO_TRANSPOSE, FLA_UNIT_DIAG, 565 | FLA_MINUS_ONE, U11, W ); 566 | 567 | FLA_Axpyt( FLA_NO_TRANSPOSE, FLA_ONE, W, B1 ); 568 | 569 | // Remove auxiliary object. 570 | FLA_Obj_free( & W ); 571 | 572 | return FLA_SUCCESS; 573 | } 574 | 575 | // ============================================================================ 576 | static FLA_Error MyFLA_Apply_Q_UT_rnfc_blk_var2( FLA_Obj U11, FLA_Obj U21, 577 | FLA_Obj T, 578 | FLA_Obj B1, FLA_Obj B2 ) { 579 | // 580 | // It applies a unitary matrix Q to a matrix B from the right: 581 | // B = B * Q 582 | // where: 583 | // B = [ B1; B2 ] 584 | // U = [ U11; U21 ] 585 | // Q = ( I - U * inv(T)' * U' ). 586 | // 587 | FLA_Obj W; 588 | 589 | // Create auxiliary object. 590 | FLA_Obj_create_conf_to( FLA_TRANSPOSE, B1, & W ); 591 | 592 | // W = B1^T; 593 | 594 | FLA_Copyt( FLA_TRANSPOSE, B1, W ); 595 | 596 | // U11 = trilu( U11 ); 597 | // U21 = U21; 598 | // Let W^T be conformal to B1. 599 | // W^T = ( B1 * U11 + B2 * U21 ) * inv( triu(T) ); 600 | // W = triu(T)' * ( U11' * B1' + U21' * B2' ); 601 | 602 | FLA_Trmm( FLA_LEFT, FLA_LOWER_TRIANGULAR, 603 | FLA_TRANSPOSE, FLA_UNIT_DIAG, 604 | FLA_ONE, U11, W ); 605 | 606 | FLA_Gemm( FLA_TRANSPOSE, FLA_TRANSPOSE, 607 | FLA_ONE, U21, B2, FLA_ONE, W ); 608 | 609 | FLA_Trsm( FLA_LEFT, FLA_UPPER_TRIANGULAR, 610 | FLA_TRANSPOSE, FLA_NONUNIT_DIAG, 611 | FLA_ONE, T, W ); 612 | 613 | // B2 = B2 - W^T * U21'; 614 | // B1 = B1 - W^T * U11'; 615 | // = B1 - ( conj(U11) * W )^T; 616 | 617 | FLA_Gemm( FLA_TRANSPOSE, FLA_CONJ_TRANSPOSE, 618 | FLA_MINUS_ONE, W, U21, FLA_ONE, B2 ); 619 | 620 | FLA_Trmm( FLA_LEFT, FLA_LOWER_TRIANGULAR, 621 | FLA_CONJ_NO_TRANSPOSE, FLA_UNIT_DIAG, 622 | FLA_MINUS_ONE, U11, W ); 623 | 624 | FLA_Axpyt( FLA_TRANSPOSE, FLA_ONE, W, B1 ); 625 | 626 | // Remove auxiliary object. 627 | FLA_Obj_free( & W ); 628 | 629 | return FLA_SUCCESS; 630 | } 631 | 632 | /* ========================================================================= */ 633 | static FLA_Error MyFLA_QRPmod_UT_unb_var2( int pivoting, int num_stages, 634 | FLA_Obj A, FLA_Obj p, FLA_Obj t, 635 | int pivot_B, FLA_Obj B, 636 | int pivot_C, FLA_Obj C, 637 | int build_T, FLA_Obj T ) { 638 | // 639 | // It computes an unblocked QR factorization of matrix A with or without 640 | // pivoting. Matrices B and C are optionally pivoted, and matrix T is 641 | // optionally built. 642 | // 643 | // Arguments: 644 | // "pivoting": If pivoting==1, then QR factorization with pivoting is used. 645 | // "numstages": It tells the number of columns that are factorized. 646 | // If "num_stages" is negative, the whole matrix A is factorized. 647 | // If "num_stages" is positive, only the first "num_stages" are factorized. 648 | // "pivot_B": if "pivot_B" is true, matrix "B" is pivoted too. 649 | // "pivot_C": if "pivot_C" is true, matrix "C" is pivoted too. 650 | // "build_T": if "build_T" is true, matrix "T" is built. 651 | // 652 | // Declaration of variables. 653 | FLA_Obj d, e, workspace; 654 | int j, m_A, n_A, mn_A, m_B, m_C, dtype_A, ldim_A, ldim_B, ldim_C, 655 | ldim_T, m_A20, n_A20, m_a21, m_A22, n_A22, n_dB, idx_max_col, 656 | i_one = 1; 657 | double * buff_A, * buff_t, * buff_B, * buff_C, * buff_T, 658 | * buff_d, * buff_e, * buff_workspace, d_one = 1.0; 659 | int * buff_p; 660 | 661 | int idamax_(); 662 | 663 | //// printf( " %% MyFLA_QRPmod_UT_unb_var2. build_T: %d\n", build_T ); 664 | 665 | // Some initializations. 666 | dtype_A = FLA_Obj_datatype( A ); 667 | m_A = FLA_Obj_length( A ); 668 | n_A = FLA_Obj_width ( A ); 669 | mn_A = min( m_A, n_A ); 670 | ldim_A = FLA_Obj_col_stride( A ); 671 | buff_A = ( double * ) FLA_Obj_buffer_at_view( A ); 672 | 673 | buff_p = ( int * ) FLA_Obj_buffer_at_view( p ); 674 | 675 | buff_t = ( double * ) FLA_Obj_buffer_at_view( t ); 676 | 677 | if( pivot_B ) { 678 | m_B = FLA_Obj_length( B ); 679 | buff_B = ( double * ) FLA_Obj_buffer_at_view( B ); 680 | ldim_B = FLA_Obj_col_stride( B ); 681 | } 682 | 683 | if( pivot_C ) { 684 | m_C = FLA_Obj_length( C ); 685 | buff_C = ( double * ) FLA_Obj_buffer_at_view( C ); 686 | ldim_C = FLA_Obj_col_stride( C ); 687 | } 688 | 689 | if( build_T ) { 690 | buff_T = ( double * ) FLA_Obj_buffer_at_view( T ); 691 | ldim_T = FLA_Obj_col_stride( T ); 692 | } 693 | 694 | // Set the number of stages, if needed. 695 | if( num_stages < 0 ) { 696 | num_stages = mn_A; 697 | } 698 | 699 | // Create auxiliary objects. 700 | FLA_Obj_create( dtype_A, n_A, 1, 0, 0, & d ); 701 | FLA_Obj_create( dtype_A, n_A, 1, 0, 0, & e ); 702 | FLA_Obj_create( dtype_A, n_A, 1, 0, 0, & workspace ); 703 | 704 | buff_d = ( double * ) FLA_Obj_buffer_at_view( d ); 705 | buff_e = ( double * ) FLA_Obj_buffer_at_view( e ); 706 | buff_workspace = ( double * ) FLA_Obj_buffer_at_view( workspace ); 707 | 708 | if( pivoting == 1 ) { 709 | // Compute initial norms of A into d and e. 710 | MyFLA_QRP_compute_norms( A, d, e ); 711 | } 712 | 713 | // Main Loop. 714 | //// printf( " m x n: %d x %d num_stages: %d \n", m_A, n_A, num_stages ); 715 | for( j = 0; j < num_stages; j++ ) { 716 | //// printf( "%% ----------------------------------\n" ); 717 | //// printf( "%% Iter: %d \n", j ); 718 | //// printf( "%% ----------------------------------\n" ); 719 | 720 | n_dB = n_A - j; 721 | m_a21 = m_A - j - 1; 722 | m_A22 = m_A - j - 1; 723 | n_A22 = n_A - j - 1; 724 | m_A20 = m_A - j - 1; 725 | n_A20 = j; 726 | 727 | if( pivoting == 1 ) { 728 | // Obtain the index of the column with largest 2-norm. 729 | //// FLA_Amax( dB, max_col ); 730 | // Function idamax of MKL can fail by returning an index equal or larger 731 | // than the dimension when working with small numbers. Hence, an own 732 | // implementation is used. 733 | //// printf( " Chivato 901. n_dB: %d my_idamax: %d\n", 734 | //// n_dB, NoFLA_my_idamax( n_dB, & buff_d[ j ] ) ); 735 | //// printf( " Chivato 902. n_dB: %d idx_max_col: %d\n", 736 | //// n_dB, idamax_( & n_dB, & buff_d[ j ], & i_one ) - 1 ); 737 | //// idx_max_col = idamax_( & n_dB, & buff_d[ j ], & i_one ) - 1; 738 | idx_max_col = NoFLA_my_idamax( n_dB, & buff_d[ j ] ); 739 | 740 | // Swap columns of G, pivots, and norms. 741 | //// FLA_QRP_pivot_B( max_col, GR, BR, pB, dB, eB ); 742 | NoFLA_QRP_pivot_B_C( idx_max_col, 743 | m_A, & buff_A[ 0 + j * ldim_A ], ldim_A, 744 | pivot_B, m_B, & buff_B[ 0 + j * ldim_B ], ldim_B, 745 | pivot_C, m_C, & buff_C[ 0 + j * ldim_C ], ldim_C, 746 | & buff_p[ j ], 747 | & buff_d[ j ], 748 | & buff_e[ j ] ); 749 | } 750 | 751 | // Compute tau1 and u21 from alpha11 and a21 such that tau1 and u21 752 | // determine a Householder transform H such that applying H from the 753 | // left to the column vector consisting of alpha11 and a21 annihilates 754 | // the entries in a21 (and updates alpha11). 755 | //// FLA_Househ2( alpha11, 756 | //// a21, tau1 ); 757 | //// FLA_Househ2_UT( FLA_LEFT, 758 | //// alpha11, 759 | //// a21, tau1 ); 760 | 761 | FLA_Househ2_UT_l_opd( m_a21, 762 | & buff_A[ j + j * ldim_A ], 763 | & buff_A[ ( j+1 ) + j * ldim_A ], i_one, 764 | & buff_t[ j ] ); 765 | 766 | // / a12t \ = H / a12t \ 767 | // \ A22 / \ A22 / 768 | // 769 | // where H is formed from tau1 and u21. 770 | //// FLA_QR_Update_Rest_blk_b2( tau1, a21, a12t, 771 | //// A22 ); 772 | /// FLA_Apply_H2_UT( FLA_LEFT, tau1, a21, a12t, 773 | /// A22 ); 774 | 775 | MyFLA_Apply_H2_UT_l_opd_var2( 776 | m_A22, //// m_u2_A2, 777 | n_A22, //// n_a1t, 778 | & buff_t[ j ], //// tau, 779 | & buff_A[ ( j+1 ) + j * ldim_A ], 1, //// u2,inc_u2, 780 | & buff_A[ j + ( j+1 ) * ldim_A ], ldim_A, //// a1t,inc_a1t, 781 | & buff_A[ ( j+1 ) + ( j+1 ) * ldim_A ], ldim_A, 782 | buff_workspace ); //// A2,rs_A2,cs_A2); 783 | 784 | // Build T. 785 | if( build_T ) { 786 | // rho11 = tau1; 787 | //// FLA_Copy( tau1, rho11 ); 788 | buff_T[ j + j * ldim_T ] = buff_t[ j ]; 789 | 790 | // t01 = a10t' + A20' * u21; 791 | //// FLA_Copyt_external( FLA_CONJ_TRANSPOSE, a10t, r01 ); 792 | dcopy_( & j, & buff_A[ j + 0 * ldim_A ], & ldim_A, 793 | & buff_T[ 0 + j * ldim_T ], & i_one ); 794 | 795 | //// FLA_Gemv_external( FLA_CONJ_TRANSPOSE, FLA_ONE, A20, a21, 796 | //// FLA_ONE, r01 ); 797 | dgemv_( "Transpose", & m_A20, & n_A20, 798 | & d_one, 799 | & buff_A[ ( j+1 ) + 0 * ldim_A ], & ldim_A, 800 | & buff_A[ ( j+1 ) + j * ldim_A ], & i_one, 801 | & d_one, 802 | & buff_T[ 0 + j * ldim_T ], & i_one ); 803 | } 804 | 805 | if( pivoting == 1 ) { 806 | // Update partial column norms. 807 | //// FLA_QRP_update_partial_norms( dB, eB, a12t, A22 ); 808 | 809 | NoFLA_QRP_downdate_partial_norms( m_A22, n_A22, 810 | & buff_d[ j+1 ], 1, 811 | & buff_e[ j+1 ], 1, 812 | & buff_A[ j + ( j+1 ) * ldim_A ], ldim_A, 813 | & buff_A[ ( j+1 ) + ( j+1 ) * ldim_A ], ldim_A ); 814 | } 815 | } 816 | 817 | // Remove auxiliary objects. 818 | FLA_Obj_free( & d ); 819 | FLA_Obj_free( & e ); 820 | FLA_Obj_free( & workspace ); 821 | 822 | return FLA_SUCCESS; 823 | } 824 | 825 | /* ========================================================================= */ 826 | static FLA_Error MyFLA_Apply_H2_UT_l_opd_var2( 827 | int m_u2_A2, 828 | int n_a1t, 829 | double * tau, 830 | double * u2, int inc_u2, 831 | double * a1t, int inc_a1t, 832 | double * A2, int ldim_A2, 833 | double * workspace ) { 834 | 835 | double one_p = 1.0; 836 | double minus_one_p = -1.0; 837 | double rtau; 838 | int inc_w1t; 839 | 840 | // FLA_Obj w1t; 841 | double * w1t; 842 | 843 | // if ( FLA_Obj_has_zero_dim( a1t ) ) return FLA_SUCCESS; 844 | if ( n_a1t == 0 ) return FLA_SUCCESS; 845 | 846 | // FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, a1t, &w1t ); 847 | //// w1t = ( double* ) FLA_malloc( n_a1t * sizeof( *a1t ) ); 848 | w1t = workspace; 849 | inc_w1t = 1; 850 | 851 | // // w1t = a1t; 852 | // FLA_Copy_external( a1t, w1t ); 853 | //// bl1_dcopyv( BLIS1_NO_CONJUGATE, 854 | //// n_a1t, 855 | //// a1t, inc_a1t, 856 | //// w1t, inc_w1t ); 857 | dcopy_( & n_a1t, 858 | a1t, & inc_a1t, 859 | w1t, & inc_w1t ); 860 | 861 | // // w1t = w1t + u2' * A2; 862 | // // w1t = w1t + A2^T * conj(u2); 863 | // FLA_Gemvc_external( FLA_TRANSPOSE, FLA_CONJUGATE, FLA_ONE, A2, u2, 864 | // FLA_ONE, w1t ); 865 | //// bl1_dgemv( BLIS1_TRANSPOSE, 866 | //// BLIS1_CONJUGATE, 867 | //// m_u2_A2, 868 | //// n_a1t, 869 | //// one_p, 870 | //// A2, rs_A2, cs_A2, 871 | //// u2, inc_u2, 872 | //// one_p, 873 | //// w1t, inc_w1t ); 874 | dgemv_( "Transpose", 875 | & m_u2_A2, 876 | & n_a1t, 877 | & one_p, 878 | A2, & ldim_A2, 879 | u2, & inc_u2, 880 | & one_p, 881 | w1t, & inc_w1t ); 882 | 883 | // // w1t = w1t / tau; 884 | // FLA_Inv_scalc_external( FLA_NO_CONJUGATE, tau, w1t ); 885 | //// bl1_dinvscalv( BLIS1_NO_CONJUGATE, 886 | //// n_a1t, 887 | //// tau, 888 | //// w1t, inc_w1t ); 889 | if( * tau == 0.0 ) { 890 | fprintf( stderr, "ERROR in MyFLA_Apply_H2_UT_l_opd_var2: Tau is zero.\n" ); 891 | } else { 892 | rtau = 1.0 / ( * tau ); 893 | } 894 | dscal_( & n_a1t, 895 | & rtau, 896 | w1t, & inc_w1t ); 897 | 898 | // // a1t = - w1t + a1t; 899 | // FLA_Axpy_external( FLA_MINUS_ONE, w1t, a1t ); 900 | //// bl1_daxpyv( BLIS1_NO_CONJUGATE, 901 | //// n_a1t, 902 | //// minus_one_p, 903 | //// w1t, inc_w1t, 904 | //// a1t, inc_a1t ); 905 | daxpy_( & n_a1t, 906 | & minus_one_p, 907 | w1t, & inc_w1t, 908 | a1t, & inc_a1t ); 909 | 910 | // // A2 = - u2 * w1t + A2; 911 | // FLA_Ger_external( FLA_MINUS_ONE, u2, w1t, A2 ); 912 | //// bl1_dger( BLIS1_NO_CONJUGATE, 913 | //// BLIS1_NO_CONJUGATE, 914 | //// m_u2_A2, 915 | //// n_a1t, 916 | //// minus_one_p, 917 | //// u2, inc_u2, 918 | //// w1t, inc_w1t, 919 | //// A2, rs_A2, cs_A2 ); 920 | dger_( & m_u2_A2, 921 | & n_a1t, 922 | & minus_one_p, 923 | u2, & inc_u2, 924 | w1t, & inc_w1t, 925 | A2, & ldim_A2 ); 926 | 927 | //// // FLA_Obj_free( &w1t ); 928 | //// FLA_free( w1t ); 929 | 930 | return FLA_SUCCESS; 931 | } 932 | 933 | /* ========================================================================= */ 934 | static FLA_Error MyFLA_QRP_compute_norms( FLA_Obj A, FLA_Obj d, FLA_Obj e ) { 935 | 936 | switch( FLA_Obj_datatype( A ) ) { 937 | 938 | case FLA_DOUBLE: { 939 | int m_A, n_A, ld_A, st_d, st_e, j, i_one = 1; 940 | double * ptr_A, * ptr_d, * ptr_e, dnrm2_(); 941 | 942 | m_A = FLA_Obj_length( A ); 943 | n_A = FLA_Obj_width( A ); 944 | ptr_A = ( double * ) FLA_Obj_buffer_at_view( A ); 945 | ptr_d = ( double * ) FLA_Obj_buffer_at_view( d ); 946 | ptr_e = ( double * ) FLA_Obj_buffer_at_view( e ); 947 | ld_A = FLA_Obj_col_stride( A ); 948 | 949 | st_d = 0; 950 | if( FLA_Obj_length( d ) == 1 ) { 951 | st_d = FLA_Obj_col_stride( d ); 952 | } else if( FLA_Obj_width( d ) == 1 ) { 953 | st_d = 1; 954 | } else { 955 | FLA_Print_message( "MyFLA_QRP_compute_norms: Object d is not a vector", 956 | __FILE__, __LINE__ ); 957 | FLA_Abort(); 958 | } 959 | st_e = 0; 960 | if( FLA_Obj_length( e ) == 1 ) { 961 | st_e = FLA_Obj_col_stride( e ); 962 | } else if( FLA_Obj_width( e ) == 1 ) { 963 | st_e = 1; 964 | } else { 965 | FLA_Print_message( "MyFLA_QRP_compute_norms: Object e is not a vector", 966 | __FILE__, __LINE__ ); 967 | FLA_Abort(); 968 | } 969 | 970 | for( j = 0; j < n_A; j++ ) { 971 | *ptr_d = dnrm2_( & m_A, ptr_A, & i_one ); 972 | *ptr_e = *ptr_d; 973 | ptr_A += ld_A; 974 | ptr_d += st_d; 975 | ptr_e += st_e; 976 | } 977 | 978 | break; 979 | } 980 | default: 981 | FLA_Print_message( "MyFLA_QRP_compute_norms: datatype not yet implemented", 982 | __FILE__, __LINE__ ); 983 | FLA_Abort(); 984 | } 985 | 986 | return FLA_SUCCESS; 987 | } 988 | 989 | /* ========================================================================= */ 990 | static FLA_Error NoFLA_QRP_pivot_B_C( int j_max_col, 991 | int m_G, double * buff_G, int ldim_G, 992 | int pivot_B, int m_B, double * buff_B, int ldim_B, 993 | int pivot_C, int m_C, double * buff_C, int ldim_C, 994 | int * buff_p, 995 | double * buff_d, double * buff_e ) { 996 | // Matrices B and C are optionally pivoted. 997 | 998 | // Declaration of variables. 999 | int ival, i_one = 1; 1000 | double * ptr_g1, * ptr_g2, * ptr_b1, * ptr_b2, * ptr_c1, * ptr_c2; 1001 | 1002 | //// j_max_col = *( ( int * ) FLA_Obj_buffer_at_view( max_col ) ); 1003 | //// printf( "j_max_col: %d\n", j_max_col ); 1004 | //// FLA_Obj_show( " max_col = [ ", max_col, "%d", " ];" ); 1005 | 1006 | // Swap columns of G, pivots, and norms. 1007 | if( j_max_col != 0 ) { 1008 | 1009 | // Swap full column 0 and column "j_max_col" of G. 1010 | ptr_g1 = & buff_G[ 0 + 0 * ldim_G ]; 1011 | ptr_g2 = & buff_G[ 0 + j_max_col * ldim_G ]; 1012 | dswap_( & m_G, ptr_g1, & i_one, ptr_g2, & i_one ); 1013 | 1014 | // Swap full column 0 and column "j_max_col" of B. 1015 | if( pivot_B ) { 1016 | ptr_b1 = & buff_B[ 0 + 0 * ldim_B ]; 1017 | ptr_b2 = & buff_B[ 0 + j_max_col * ldim_B ]; 1018 | dswap_( & m_B, ptr_b1, & i_one, ptr_b2, & i_one ); 1019 | } 1020 | 1021 | // Swap full column 0 and column "j_max_col" of C. 1022 | if( pivot_C ) { 1023 | ptr_c1 = & buff_C[ 0 + 0 * ldim_C ]; 1024 | ptr_c2 = & buff_C[ 0 + j_max_col * ldim_C ]; 1025 | dswap_( & m_C, ptr_c1, & i_one, ptr_c2, & i_one ); 1026 | } 1027 | 1028 | // Swap element 0 and element "j_max_col" of pivot vector "p". 1029 | ival = buff_p[ j_max_col ]; 1030 | buff_p[ j_max_col ] = buff_p[ 0 ]; 1031 | buff_p[ 0 ] = ival; 1032 | 1033 | // Copy norms of column 0 to column "j_max_col". 1034 | buff_d[ j_max_col ] = buff_d[ 0 ]; 1035 | buff_e[ j_max_col ] = buff_e[ 0 ]; 1036 | } 1037 | 1038 | return FLA_SUCCESS; 1039 | } 1040 | 1041 | /* ========================================================================= */ 1042 | static FLA_Error NoFLA_QRP_downdate_partial_norms( int m_A, int n_A, 1043 | double * buff_d, int st_d, 1044 | double * buff_e, int st_e, 1045 | double * buff_wt, int st_wt, 1046 | double * buff_A, int ldim_A ) { 1047 | //// printf( " NoFLA_QRP_downdate_partial_norms by Drmac. \n" ); 1048 | 1049 | int j, i_one = 1; 1050 | double * ptr_d, * ptr_e, * ptr_wt, * ptr_A; 1051 | double temp, temp2, temp5, tol3z; 1052 | double dnrm2_(); 1053 | 1054 | /* 1055 | * 1056 | * Update partial column norms 1057 | * 1058 | DO 30 J = I + 1, N 1059 | IF( WORK( J ).NE.ZERO ) THEN 1060 | * 1061 | * NOTE: The following 4 lines follow from the analysis in 1062 | * Lapack Working Note 176. 1063 | * 1064 | TEMP = ABS( A( I, J ) ) / WORK( J ) 1065 | TEMP = MAX( ZERO, ( ONE+TEMP )*( ONE-TEMP ) ) 1066 | TEMP2 = TEMP*( WORK( J ) / WORK( N+J ) )**2 1067 | IF( TEMP2 .LE. TOL3Z ) THEN 1068 | IF( M-I.GT.0 ) THEN 1069 | WORK( J ) = DNRM2( M-I, A( I+1, J ), 1 ) 1070 | WORK( N+J ) = WORK( J ) 1071 | ELSE 1072 | WORK( J ) = ZERO 1073 | WORK( N+J ) = ZERO 1074 | END IF 1075 | ELSE 1076 | WORK( J ) = WORK( J )*SQRT( TEMP ) 1077 | END IF 1078 | END IF 1079 | 30 CONTINUE 1080 | */ 1081 | 1082 | // Some initializations. 1083 | tol3z = sqrt( FLA_Mach_params_opd( FLA_MACH_EPS ) ); 1084 | //// printf( "tol3z: %24.17le\n", tol3z ); 1085 | //// printf( "eps: %24.17le\n", FLA_Mach_params_opd( FLA_MACH_EPS ) ); 1086 | ptr_d = buff_d; 1087 | ptr_e = buff_e; 1088 | ptr_wt = buff_wt; 1089 | ptr_A = buff_A; 1090 | 1091 | // Main loop. 1092 | for( j = 0; j < n_A; j++ ) { 1093 | if( * ptr_d != 0.0 ) { 1094 | temp = dabs( * ptr_wt ) / * ptr_d; 1095 | temp = max( 0.0, ( 1.0 + temp ) * ( 1 - temp ) ); 1096 | temp5 = * ptr_d / * ptr_e; 1097 | temp2 = temp * temp5 * temp5; 1098 | //// printf( "temp2: %24.17lf tol3z: %24.17lf\n", temp2, tol3z ); 1099 | if( temp2 <= tol3z ) { 1100 | //// printf( "F. Cancel Drm %4d %14.6le %14.6le\n", 1101 | //// j, * ptr_wt, * ptr_d ); 1102 | if( m_A > 0 ) { 1103 | * ptr_d = dnrm2_( & m_A, ptr_A, & i_one ); 1104 | * ptr_e = *ptr_d; 1105 | } else { 1106 | * ptr_d = 0.0; 1107 | * ptr_e = 0.0; 1108 | } 1109 | } else { 1110 | * ptr_d = * ptr_d * sqrt( temp ); 1111 | } 1112 | } 1113 | ptr_A += ldim_A; 1114 | ptr_d += st_d; 1115 | ptr_e += st_e; 1116 | ptr_wt += st_wt; 1117 | } 1118 | 1119 | return FLA_SUCCESS; 1120 | } 1121 | 1122 | /* ========================================================================= */ 1123 | static int NoFLA_my_idamax( int n, double * buff_data ) { 1124 | int i, result; 1125 | double dmax; 1126 | 1127 | // IDAMAX = 0 1128 | // IF (N.LT.1 .OR. INCX.LE.0) RETURN 1129 | if( n < 1 ) { 1130 | return -1; 1131 | } 1132 | 1133 | // IDAMAX = 1 1134 | // IF (N.EQ.1) RETURN 1135 | if( n == 1 ) { 1136 | return 0; 1137 | } 1138 | 1139 | // * 1140 | // * code for increment equal to 1 1141 | // * 1142 | // DMAX = DABS(DX(1)) 1143 | // DO I = 2,N 1144 | // IF (DABS(DX(I)).GT.DMAX) THEN 1145 | // IDAMAX = I 1146 | // DMAX = DABS(DX(I)) 1147 | // END IF 1148 | // END DO 1149 | 1150 | /* 1151 | for( i = 0; i < n; i++ ) { 1152 | printf( " dabs( buff_data[ %d ] ) = %25.16le\n", 1153 | i, dabs( buff_data[ i ] ) ); 1154 | } */ 1155 | 1156 | result = 0; 1157 | dmax = dabs( buff_data[ 0 ] ); 1158 | for( i = 1; i < n; i++ ) { 1159 | if( dmax < dabs( buff_data[ i ] ) ) { 1160 | result = i; 1161 | dmax = dabs( buff_data[ i ] ); 1162 | } 1163 | } 1164 | 1165 | return result; 1166 | } 1167 | 1168 | 1169 | 1170 | 1171 | 1172 | 1173 | 1174 | 1175 | 1176 | 1177 | 1178 | 1179 | 1180 | 1181 | 1182 | 1183 | 1184 | 1185 | 1186 | 1187 | 1188 | 1189 | 1190 | 1191 | 1192 | 1193 | 1194 | 1195 | 1196 | 1197 | 1198 | 1199 | 1200 | --------------------------------------------------------------------------------