├── speedup.png
├── libflame_sources
    ├── FLA_HQRRP_UT_blk_var2.h
    ├── Makefile
    ├── simple_test.c
    └── FLA_HQRRP_UT_blk_var2.c
├── lapack_compatible_sources
    ├── NoFLA_HQRRP_WY_blk_var4.h
    ├── Makefile
    ├── simple_test.c
    └── NoFLA_HQRRP_WY_blk_var4.c
├── License.txt
└── README.md


/speedup.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/flame/hqrrp/HEAD/speedup.png


--------------------------------------------------------------------------------
/libflame_sources/FLA_HQRRP_UT_blk_var2.h:
--------------------------------------------------------------------------------
1 | #include "FLAME.h"
2 | 
3 | int FLA_HQRRP_UT_blk_var2( FLA_Obj A, FLA_Obj p, FLA_Obj s, 
4 |         int nb_alg, int pp, int panel_pivoting );
5 | 
6 | 


--------------------------------------------------------------------------------
/lapack_compatible_sources/NoFLA_HQRRP_WY_blk_var4.h:
--------------------------------------------------------------------------------
1 | 
2 | void dgeqp4( int * m, int * n, double * A, int * lda, int * jpvt, double * tau,
3 |          double * work, int * lwork, int * info );
4 | 
5 | int NoFLA_HQRRP_WY_blk_var4( int m_A, int n_A, double * buff_A, int ldim_A,
6 |         int * buff_jpvt, double * buff_tau,
7 |         int nb_alg, int pp, int panel_pivoting );
8 | 
9 | 


--------------------------------------------------------------------------------
/lapack_compatible_sources/Makefile:
--------------------------------------------------------------------------------
 1 | 
 2 | # Defintions of variables.
 3 | 
 4 | CC      = gcc
 5 | CCFLAGS = -O -fopenmp
 6 | LD      = gcc
 7 | LDFLAGS = -O -fopenmp
 8 | 
 9 | # Defintions of rules.
10 | 
11 | simple_test.x : simple_test.o NoFLA_HQRRP_WY_blk_var4.o
12 | 	$(LD) $(LDFLAGS) \
13 |             -o simple_test.x \
14 |             simple_test.o \
15 |             NoFLA_HQRRP_WY_blk_var4.c -lm  \
16 |             /usr/local/lapack/liblapack_340_p4b64_gf.a \
17 |             /usr/local/lapack/mt_openblas/lib/libopenblas_haswellp-r0.2.14.a \
18 |             -lgfortran
19 | 
20 | simple_test.o : simple_test.c
21 | 	$(CC) $(CCFLAGS) -O -fopenmp -c simple_test.c
22 | 
23 | %.o : %.c
24 | 	$(CC) $(CCFLAGS) -c $< -o $@
25 | 
26 | clean: 
27 | 	rm -f a.out *.x *.o *~ core
28 | 
29 | 


--------------------------------------------------------------------------------
/libflame_sources/Makefile:
--------------------------------------------------------------------------------
 1 | 
 2 | # Defintions of variables.
 3 | 
 4 | LIBFLAME_PATH = /usr/local/libflame-current-gc
 5 | 
 6 | CC      = gcc
 7 | CCFLAGS = -O -fopenmp -I$(LIBFLAME_PATH)/include-x86_64-r -msse3
 8 | LD      = gcc
 9 | LDFLAGS = -O -fopenmp -I$(LIBFLAME_PATH)/include-x86_64-r -msse3
10 | 
11 | # Defintions of rules.
12 | 
13 | simple_test.x : simple_test.o FLA_HQRRP_UT_blk_var2.o
14 | 	$(LD) $(LDFLAGS) \
15 |             -o simple_test.x \
16 |             simple_test.o \
17 |             FLA_HQRRP_UT_blk_var2.o \
18 |             -L$(LIBFLAME_PATH)/lib -lflame-x86_64-r  \
19 |             /usr/local/lapack/mt_openblas/lib/libopenblas_haswellp-r0.2.14.a \
20 |             -lm
21 | 
22 | simple_test.o : simple_test.c
23 | 	$(CC) $(CCFLAGS) -c simple_test.c
24 | 
25 | %.o : %.c
26 | 	$(CC) $(CCFLAGS) -c $< -o $@
27 | 
28 | clean: 
29 | 	rm -f a.out *.x *.o *~ core
30 | 
31 | 


--------------------------------------------------------------------------------
/License.txt:
--------------------------------------------------------------------------------
 1 | HQRRP software
 2 | License
 3 | ---
 4 | 
 5 | The HQRRP software is licensed under the following license, typically
 6 | known as the "new" or "modified" or "3-clause" BSD license.
 7 | 
 8 | Copyright (C) 2016, 
 9 |   Universitat Jaume I,
10 |   University of Colorado at Boulder,
11 |   The University of Texas at Austin.
12 | 
13 | Redistribution and use in source and binary forms, with or without
14 | modification, are permitted provided that the following conditions are
15 | met:
16 |  - Redistributions of source code must retain the above copyright
17 |    notice, this list of conditions and the following disclaimer.
18 |  - Redistributions in binary form must reproduce the above copyright
19 |    notice, this list of conditions and the following disclaimer in the
20 |    documentation and/or other materials provided with the distribution.
21 |  - Neither the name of the universities of its contributors nor the names
22 |    of its contributors may be used to endorse or promote products
23 |    derived from this software without specific prior written permission.
24 | 
25 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
26 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
27 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
28 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
29 | HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
30 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
31 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
32 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
33 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
34 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
35 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
36 | 
37 | 


--------------------------------------------------------------------------------
/libflame_sources/simple_test.c:
--------------------------------------------------------------------------------
  1 | #include <math.h>
  2 | #include <stdlib.h>
  3 | #include <stdio.h>
  4 | #include "FLAME.h"
  5 | 
  6 | #define PRINT_DATA
  7 | 
  8 | 
  9 | // ============================================================================
 10 | // Declaration of local prototypes.
 11 | 
 12 | static void matrix_generate( FLA_Obj A );
 13 | 
 14 | static void init_pvt( FLA_Obj p );
 15 | 
 16 | 
 17 | // ============================================================================
 18 | int main( int argc, char *argv[] ) {
 19 |   int      m_A, n_A;
 20 |   FLA_Obj  A, p, tau;
 21 | 
 22 |   // Initialize FLAME.
 23 |   FLA_Init();
 24 | 
 25 |   // Some initializations.
 26 |   m_A = 5;
 27 |   n_A = 5;
 28 | 
 29 |   // Create FLAME objects, and attach buffers.
 30 |   FLA_Obj_create( FLA_DOUBLE, m_A, n_A, 0, 0, & A );
 31 |   FLA_Obj_create( FLA_INT,    n_A, 1,   0, 0, & p );
 32 |   FLA_Obj_create( FLA_DOUBLE, n_A, 1,   0, 0, & tau );
 33 | 
 34 |   // Generate matrix.
 35 |   //// FLA_Random_matrix( A );
 36 |   matrix_generate( A );
 37 | 
 38 |   // Initialize vector with pivots.
 39 |   init_pvt( p );
 40 | 
 41 |   // Print initial data.
 42 | #ifdef PRINT_DATA
 43 |   FLA_Obj_show( " Ai = [ ", A, "%le", " ];" );
 44 |   FLA_Obj_show( " pi = [ ", p, "%d", " ];" );
 45 |   FLA_Obj_show( " taui = [ ", tau, "%le", " ];" );
 46 | #endif
 47 | 
 48 |   // Factorize matrix.
 49 |   printf( "%% Just before computing factorization.\n" );
 50 |   // New factorization.
 51 |   FLA_HQRRP_UT_blk_var2( A, p, tau, 64, 10, 1 );
 52 |   printf( "%% Just after computing factorization.\n" );
 53 | 
 54 |   // Print results.
 55 | #ifdef PRINT_DATA
 56 |   FLA_Obj_show( " Af = [ ", A, "%le", " ];" );
 57 |   FLA_Obj_show( " pf = [ ", p, "%d", " ];" );
 58 |   FLA_Obj_show( " tauf = [ ", tau, "%le", " ];" );
 59 | #endif
 60 | 
 61 |   // Free objects.
 62 |   FLA_Obj_free( & A );
 63 |   FLA_Obj_free( & p );
 64 |   FLA_Obj_free( & tau );
 65 | 
 66 |   // Finalize FLAME.
 67 |   printf( "%% End of Program\n" );
 68 |   FLA_Finalize();
 69 | 
 70 | 
 71 |   return 0;
 72 | }
 73 | 
 74 | // ============================================================================
 75 | static void matrix_generate( FLA_Obj A ) {
 76 |   double  * buff_A;
 77 |   int     m_A, n_A, ldim_A;
 78 |   int     i, j, num;
 79 | 
 80 |   buff_A = ( double * ) FLA_Obj_buffer_at_view( A );
 81 |   m_A    = FLA_Obj_length( A );
 82 |   n_A    = FLA_Obj_width ( A );
 83 |   ldim_A = FLA_Obj_col_stride( A );
 84 | 
 85 |   //
 86 |   // Matrix with integer values.
 87 |   // ---------------------------
 88 |   //
 89 |   if( ( m_A > 0 )&&( n_A > 0 ) ) {
 90 |     num = 1;
 91 |     for ( j = 0; j < n_A; j++ ) {
 92 |       for ( i = ( j % m_A ); i < m_A; i++ ) {
 93 |         buff_A[ i + j * ldim_A ] = ( double ) num;
 94 |         num++;
 95 |       }
 96 |       for ( i = 0; i < ( j % m_A ); i++ ) {
 97 |         buff_A[ i + j * ldim_A ] = ( double ) num;
 98 |         num++;
 99 |       }
100 |     }
101 |     if( ( m_A > 0 )&&( n_A > 0 ) ) {
102 |       buff_A[ 0 + 0 * ldim_A ] = 1.2;
103 |     }
104 | #if 0
105 |     // Scale down matrix.
106 |     if( num == 0.0 ) {
107 |       rnum = 1.0;
108 |     } else {
109 |       rnum = 1.0 / num;
110 |     }
111 |     for ( j = 0; j < n_A; j++ ) {
112 |       for ( i = 0; i < m_A; i++ ) {
113 |         buff_A[ i + j * ldim_A ] *= rnum;
114 |       }
115 |     }
116 | #endif
117 |   }
118 | }
119 | 
120 | // ============================================================================
121 | static void init_pvt( FLA_Obj p ) {
122 |   int  * buff_p, n_p;
123 |   int  i;
124 | 
125 |   buff_p = ( int * ) FLA_Obj_buffer_at_view( p );
126 |   n_p    = FLA_Obj_length( p );
127 |   for( i = 0; i < n_p; i++ ) {
128 |     buff_p[ i ] = ( i + 1 );
129 |   }
130 | }
131 | 
132 | 
133 | 


--------------------------------------------------------------------------------
/lapack_compatible_sources/simple_test.c:
--------------------------------------------------------------------------------
  1 | #include <math.h>
  2 | #include <stdlib.h>
  3 | #include <stdio.h>
  4 | 
  5 | #define max( a, b ) ( (a) > (b) ? (a) : (b) )
  6 | #define min( a, b ) ( (a) < (b) ? (a) : (b) )
  7 | 
  8 | #define PRINT_DATA
  9 | 
 10 | 
 11 | // ============================================================================
 12 | // Declaration of local prototypes.
 13 | 
 14 | static void matrix_generate( int m_A, int n_A, double * buff_A, int ldim_A );
 15 | 
 16 | static void print_double_matrix( char * name, int m_A, int n_A, 
 17 |                 double * buff_A, int ldim_A );
 18 | 
 19 | static void print_double_vector( char * name, int n, double * vector );
 20 | 
 21 | static void print_int_vector( char * name, int n, int * vector );
 22 | 
 23 | static void init_pvt( int n, int * vector );
 24 | 
 25 | static void set_pvt_to_zero( int n_p, int * buff_p );
 26 | 
 27 | 
 28 | 
 29 | // ============================================================================
 30 | int main( int argc, char *argv[] ) {
 31 |   int     nb_alg, pp, m_A, n_A, mn_A, ldim_A, ldim_Q, info, lwork;
 32 |   double  * buff_A, * buff_tau, * buff_Q, * buff_wk_qp4, * buff_wk_orgqr;
 33 |   int     * buff_p;
 34 | 
 35 |   // Create matrix A, vector p, vector s, and matrix Q.
 36 |   m_A      = 7;
 37 |   n_A      = 5;
 38 |   mn_A     = min( m_A, n_A );
 39 |   buff_A   = ( double * ) malloc( m_A * n_A * sizeof( double ) );
 40 |   ldim_A   = max( 1, m_A );
 41 | 
 42 |   buff_p   = ( int * ) malloc( n_A * sizeof( int ) );
 43 | 
 44 |   buff_tau = ( double * ) malloc( n_A * sizeof( double ) );
 45 | 
 46 |   buff_Q   = ( double * ) malloc( m_A * mn_A * sizeof( double ) );
 47 |   ldim_Q   = max( 1, m_A );
 48 | 
 49 |   // Generate matrix.
 50 |   matrix_generate( m_A, n_A, buff_A, ldim_A );
 51 | 
 52 | #ifdef PRINT_DATA
 53 |   print_double_matrix( "ai", m_A, n_A, buff_A, ldim_A );
 54 |   print_double_vector( "taui", n_A, buff_tau );
 55 | #endif
 56 | 
 57 |   // Initialize vector with pivots.
 58 |   set_pvt_to_zero( n_A, buff_p );
 59 |   buff_p[ 0 ] = 0;
 60 |   buff_p[ 1 ] = 1;
 61 |   buff_p[ 2 ] = 1;
 62 |   buff_p[ 3 ] = 0;
 63 |   buff_p[ 4 ] = 1;
 64 | #ifdef PRINT_DATA
 65 |   print_int_vector( "pi", n_A, buff_p );
 66 | #endif
 67 | 
 68 |   // Create workspace.
 69 |   lwork       = max( 1, 128 * n_A );
 70 |   buff_wk_qp4 = ( double * ) malloc( lwork * sizeof( double ) );
 71 | 
 72 |   // Factorize matrix.
 73 |   printf( "%% Just before computing factorization.\n" );
 74 |   // New factorization.
 75 |   dgeqp4( & m_A, & n_A, buff_A, & ldim_A, buff_p, buff_tau, 
 76 |           buff_wk_qp4, & lwork, & info );
 77 |   // Current factorization.
 78 |   // dgeqp3_( & m_A, & n_A, buff_A, & ldim_A, buff_p, buff_tau, 
 79 |   //          buff_wk_qp4, & lwork, & info );
 80 |   printf( "%% Just after computing factorization.\n" );
 81 | 
 82 |   printf( "%% Info after factorization:      %d \n", info );
 83 |   printf( "%% Work[ 0 ] after factorization: %d \n", ( int ) buff_wk_qp4[ 0 ] );
 84 | 
 85 |   // Remove workspace.
 86 |   free( buff_wk_qp4 );
 87 | 
 88 |   // Build matrix Q.
 89 |   lwork     = max( 1, 128 * n_A );
 90 |   buff_wk_orgqr = ( double * ) malloc( lwork * sizeof( double ) );
 91 |   dlacpy_( "All", & m_A, & mn_A, buff_A, & ldim_A, buff_Q, & ldim_Q );
 92 |   dorgqr_( & m_A, & mn_A, & mn_A, buff_Q, & ldim_Q, buff_tau,
 93 |            buff_wk_orgqr, & lwork, & info );
 94 |   if( info != 0 ) {
 95 |     fprintf( stderr, "Error in dorgqr: Info: %d\n", info );
 96 |   }
 97 |   free( buff_wk_orgqr );
 98 | 
 99 |   // Print results.
100 | #ifdef PRINT_DATA
101 |   print_double_matrix( "af", m_A, n_A, buff_A, ldim_A );
102 |   print_int_vector( "pf", n_A, buff_p );
103 |   print_double_vector( "tauf", n_A, buff_tau );
104 |   print_double_matrix( "qf", m_A, mn_A, buff_Q, ldim_Q );
105 | #endif
106 | 
107 |   // Free matrices and vectors.
108 |   free( buff_A );
109 |   free( buff_p );
110 |   free( buff_tau );
111 |   free( buff_Q );
112 | 
113 |   printf( "%% End of Program\n" );
114 | 
115 |   return 0;
116 | }
117 | 
118 | // ============================================================================
119 | static void matrix_generate( int m_A, int n_A, double * buff_A, int ldim_A ) {
120 |   int  i, j, num;
121 | 
122 |   //
123 |   // Matrix with integer values.
124 |   // ---------------------------
125 |   //
126 |   if( ( m_A > 0 )&&( n_A > 0 ) ) {
127 |     num = 1;
128 |     for ( j = 0; j < n_A; j++ ) {
129 |       for ( i = ( j % m_A ); i < m_A; i++ ) {
130 |         buff_A[ i + j * ldim_A ] = ( double ) num;
131 |         num++;
132 |       }
133 |       for ( i = 0; i < ( j % m_A ); i++ ) {
134 |         buff_A[ i + j * ldim_A ] = ( double ) num;
135 |         num++;
136 |       }
137 |     }
138 |     if( ( m_A > 0 )&&( n_A > 0 ) ) {
139 |       buff_A[ 0 + 0 * ldim_A ] = 1.2;
140 |     }
141 | #if 0
142 |     // Scale down matrix.
143 |     if( num == 0.0 ) {
144 |       rnum = 1.0;
145 |     } else {
146 |       rnum = 1.0 / num;
147 |     }
148 |     for ( j = 0; j < n_A; j++ ) {
149 |       for ( i = 0; i < m_A; i++ ) {
150 |         buff_A[ i + j * ldim_A ] *= rnum;
151 |       }
152 |     }
153 | #endif
154 |   }
155 | }
156 | 
157 | // ============================================================================
158 | static void print_double_matrix( char * name, int m_A, int n_A, 
159 |                 double * buff_A, int ldim_A ) {
160 |   int  i, j;
161 | 
162 |   printf( "%s = [\n", name );
163 |   for( i = 0; i < m_A; i++ ) {
164 |     for( j = 0; j < n_A; j++ ) {
165 |       printf( "%le ", buff_A[ i + j * ldim_A ] );
166 |     }
167 |     printf( "\n" );
168 |   }
169 |   printf( "];\n" );
170 | }
171 | 
172 | // ============================================================================
173 | static void print_double_vector( char * name, int n_v, double * buff_v ) {
174 |   int  i, j;
175 | 
176 |   printf( "%s = [\n", name );
177 |   for( i = 0; i < n_v; i++ ) {
178 |     printf( "%le\n", buff_v[ i ] );
179 |   }
180 |   printf( "\n" );
181 |   printf( "];\n" );
182 | }
183 | 
184 | // ============================================================================
185 | static void print_int_vector( char * name, int n_v, int * buff_v ) {
186 |   int  i, j;
187 | 
188 |   printf( "%s = [\n", name );
189 |   for( i = 0; i < n_v; i++ ) {
190 |     printf( "%d\n", buff_v[ i ] );
191 |   }
192 |   printf( "];\n" );
193 | }
194 | 
195 | // ============================================================================
196 | static void init_pvt( int n_p, int * buff_p ) {
197 |   int  i;
198 | 
199 |   for( i = 0; i < n_p; i++ ) {
200 |     buff_p[ i ] = ( i + 1 );
201 |   }
202 | }
203 | 
204 | // ============================================================================
205 | static void set_pvt_to_zero( int n_p, int * buff_p ) {
206 |   int  i;
207 | 
208 |   for( i = 0; i < n_p; i++ ) {
209 |     buff_p[ i ] = 0;
210 |   }
211 | }
212 | 
213 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # HQRRP
  2 | 
  3 | ## Authors
  4 | 
  5 | * Per-Gunnar Martinsson,
  6 |   Dept. of Applied Mathematics,
  7 |   University of Colorado at Boulder,
  8 |   526 UCB, Boulder, CO 80309-0526, USA.
  9 | 
 10 | * Gregorio Quintana-Orti,
 11 |   Depto. de Ingenieria y Ciencia de Computadores,
 12 |   Universitat Jaume I,
 13 |   12.071 Castellon, Spain.
 14 | 
 15 | * Nathan Heavner,
 16 |   Dept. of Applied Mathematics,
 17 |   University of Colorado at Boulder,
 18 |   526 UCB, Boulder, CO 80309-0526, USA.
 19 | 
 20 | * Robert van de Geijn,
 21 |   Dept. of Computer Science and Institute for Computational Engineering and
 22 |   Sciences,
 23 |   The University of Texas at Austin,
 24 |   Austin, TX, USA.
 25 | 
 26 | ## Correspondence
 27 | 
 28 | Please send correspondence about the code to 
 29 | Gregorio Quintana-Ortí: <gquintan@icc.uji.es>
 30 | 
 31 | Correspondence about the paper should be sent to
 32 | Per-Gunnar J. Martinsson: <Per-gunnar.Martinsson@colorado.edu>
 33 | 
 34 | ## License
 35 | 
 36 | New 3-clause BSD.
 37 | See file License.txt for more details.
 38 | 
 39 | ## Disclaimer
 40 | 
 41 | This code is distributed in the hope that it will be useful, but
 42 | WITHOUT ANY WARRANTY EXPRESSED OR IMPLIED. 
 43 | 
 44 | ## Description
 45 | 
 46 | Householder transformation based QR factorization with column pivoting is an 
 47 | important algorithm for, for example, determining an approximate basis for 
 48 | the column space of a matrix. It is, unfortunately, notoriously difficult to 
 49 | implement for high performance.
 50 | 
 51 | Recently, techniques that use randomized sampling have been developed that
 52 | do achieve high performance by casting most computation in terms of
 53 | matrix-matrix multiplication.
 54 | 
 55 | For example, we describe such an algorithm in our recent paper:
 56 | 
 57 |   * P.-G. Martinsson, G. Quintana-Orti, N. Heavner, R. van de Geijn.
 58 |     "Householder QR Factorization: Adding Randomization for Column Pivoting.
 59 |     FLAME Working Note #78" 
 60 |     http://arxiv.org/abs/1512.02671
 61 | 
 62 | This directory contains an implementation that we call Householder QR
 63 | factorization with Randomization for Pivoting (HQRRP), based on the insights 
 64 | in that paper.
 65 | 
 66 | The new code outperforms LAPACK's core routine DGEQP3 both in unicore and 
 67 | multicore architectures for medium and large matrix sizes, often by large 
 68 | factor. The new implementation comes with an interface that is plug 
 69 | compatible with DGEQP3. 
 70 | 
 71 | The new code can be downloaded from https://github.com/flame/hqrrp/.
 72 | 
 73 | The algorithm was originally implemented using the FLAME/C API with 
 74 | a variation of the compact WY transform we call the UT transform. 
 75 | In addition, 
 76 | we also provide an implementation that instead uses the original compact 
 77 | WY transform so that many routines from LAPACK can be employed in a 
 78 | seamless fashion.  
 79 | 
 80 | This implementation as well as the original implementation based on the UT
 81 | transform will eventually be included in the libflame library: 
 82 | https://github.com/flame/libflame/
 83 | 
 84 | We will appreciate feedback from the community on the use of this code.
 85 | 
 86 | ## Performance benefit
 87 | 
 88 | ![alt tag](./speedup.png)
 89 | 
 90 | ## Citing this work
 91 | 
 92 | We ask those who benefit from this work 
 93 | to cite the following articles:
 94 | 
 95 | ```
 96 | @article{doi:10.1137/16M1081270,
 97 | author  = {Martinsson, P. and Quintana Ort\'{\i}, G.
 98 |            and Heavner, N. and van de Geijn, R.},
 99 | title   = {Householder QR Factorization  
100 |            With Randomization for Column Pivoting (HQRRP)},
101 | journal = {SIAM Journal on Scientific Computing},
102 | volume  = {39},
103 | number  = {2},
104 | pages   = {C96-C115},
105 | year    = {2017},
106 | doi     = {10.1137/16M1081270},
107 | URL     = {https://doi.org/10.1137/16M1081270},
108 | eprint  = {https://doi.org/10.1137/16M1081270}
109 | }
110 | 
111 | @ARTICLE{martinsson2015blocked,
112 | title   = {Blocked rank-revealing QR factorizations: How randomized 
113 |            sampling can be used to avoid single-vector pivoting},
114 | author  = {Martinsson, Per-Gunnar},
115 | journal = {arXiv preprint arXiv:1505.08115},
116 | year    = {2015},
117 | month   = {may}
118 | }
119 | 
120 | @ARTICLE{2015arXiv151202671M,
121 | author        = {{Martinsson}, P.-G. and {Quintana-Ort\’{\i}}, G. 
122 |                  and {Heavner}, N. and {van de Geijn}, R.},
123 | title         = "{Householder {QR} Factorization: Adding Randomization for 
124 |                  Column Pivoting. {FLAME} {W}orking {N}ote \#78}",
125 | journal       = {ArXiv e-prints},
126 | archivePrefix = "arXiv",
127 | eprint        = {1512.02671},
128 | primaryClass  = "math.NA",
129 | keywords      = {Mathematics - Numerical Analysis, Computer Science - 
130 |                  Numerical Analysis},
131 | year          = 2015,
132 | month         = dec,
133 | adsurl        = {http://adsabs.harvard.edu/abs/2015arXiv151202671M},
134 | adsnote       = {Provided by the SAO/NASA Astrophysics Data System}
135 | }
136 | ```
137 | 
138 | ## Details
139 | 
140 | We offer two variants of the code:
141 | 
142 | * LAPACK-compatible pure C code: 
143 |   It uses compact WY transformations.
144 |   The sources are stored in the `lapack_compatible_sources` directory.
145 | 
146 | * LAPACK-like libflame code: 
147 |   It uses compact UT transformations.
148 |   This code resembles the algorithm in the paper.
149 |   The sources are stored in the `libflame_sources` directory.
150 | 
151 | ### Details of LAPACK-compatible pure C code: 
152 | 
153 | The new code contains the following two main routines:
154 | 
155 | ```
156 | void dgeqp4( int * m, int * n, double * A, int * lda, int * jpvt, double * tau,
157 |          double * work, int * lwork, int * info );
158 | // 
159 | // This routine is plug compatible with LAPACK's routine dgeqp3.
160 | // It computes the new HQRRP while keeping the same header as LAPACK's dgeqp3.
161 | // It uses dgeqpf or dgeqp3 for small matrices. The thresholds are defined in
162 | // constants THRESHOLD_FOR_DGEQPF and THRESHOLD_FOR_DGEQP3.
163 | // This routine calls the next one with block size 64 and oversampling 10.
164 | //
165 | 
166 | int NoFLA_HQRRP_WY_blk_var4( int m_A, int n_A, double * buff_A, int ldim_A,
167 |         int * buff_jpvt, double * buff_tau,
168 |         int nb_alg, int pp, int panel_pivoting );
169 | // 
170 | // This routine is not plug compatible with LAPACK's routine dgeqp3.
171 | // It computes the new HQRRP and allows the user to fine tune more arguments,
172 | // such as the block size, oversampling, etc.
173 | //
174 | ```
175 | 
176 | These two routines are stored in the `NoFLA_HQRRP_WY_blk_var4.c` file.
177 | The `simple_test.c` file in this directory
178 | contains a main program to test routine `dgeqp4`.
179 | 
180 | ### Details of LAPACK-like libflame code: 
181 | 
182 | The new code contains the following main routine:
183 | 
184 | ```
185 | int FLA_HQRRP_UT_blk_var2( FLA_Obj A, FLA_Obj p, FLA_Obj s, 
186 |         int nb_alg, int pp, int panel_pivoting );
187 | // 
188 | // This routine is not plug compatible with LAPACK's routine dgeqp3.
189 | // It computes the new HQRRP and allows the user to fine tune more arguments,
190 | // such as the block size, oversampling, etc.
191 | //
192 | ```
193 | 
194 | This routine is stored in the `FLA_HQRRP_UT_blk_var2.c` file.
195 | The `simple_test.c` file in this directory contains a main program to test it.
196 | 
197 | 


--------------------------------------------------------------------------------
/lapack_compatible_sources/NoFLA_HQRRP_WY_blk_var4.c:
--------------------------------------------------------------------------------
   1 | /*
   2 | ===============================================================================
   3 | Authors
   4 | ===============================================================================
   5 | 
   6 | Per-Gunnar Martinsson
   7 |   Dept. of Applied Mathematics, 
   8 |   University of Colorado at Boulder, 
   9 |   526 UCB, Boulder, CO 80309-0526, USA
  10 | 
  11 | Gregorio Quintana-Orti
  12 |   Depto. de Ingenieria y Ciencia de Computadores, 
  13 |   Universitat Jaume I, 
  14 |   12.071 Castellon, Spain
  15 | 
  16 | Nathan Heavner
  17 |   Dept. of Applied Mathematics, 
  18 |   University of Colorado at Boulder, 
  19 |   526 UCB, Boulder, CO 80309-0526, USA
  20 | 
  21 | Robert van de Geijn
  22 |   Dept. of Computer Science and Institute for Computational Engineering and 
  23 |   Sciences, 
  24 |   The University of Texas at Austin
  25 |   Austin, TX.
  26 | 
  27 | ===============================================================================
  28 | Copyright
  29 | ===============================================================================
  30 | 
  31 | Copyright (C) 2016, 
  32 |   Universitat Jaume I,
  33 |   University of Colorado at Boulder,
  34 |   The University of Texas at Austin.
  35 | 
  36 | ===============================================================================
  37 | Disclaimer
  38 | ===============================================================================
  39 | 
  40 | This code is distributed in the hope that it will be useful, but
  41 | WITHOUT ANY WARRANTY EXPRESSED OR IMPLIED.
  42 | 
  43 | */
  44 | 
  45 | #include <stdlib.h>
  46 | #include <stdio.h>
  47 | #include <math.h>
  48 | #include "NoFLA_HQRRP_WY_blk_var4.h"
  49 | 
  50 | 
  51 | // Matrices with dimensions smaller than THRESHOLD_FOR_DGEQPF are processed 
  52 | // with LAPACK's routine dgeqpf.
  53 | // Matrices with dimensions between THRESHOLD_FOR_DGEQPF and 
  54 | // THRESHOLD_FOR_DGEQP3 are processed with LAPACK's routine dgeqp3.
  55 | // Matrices with dimensions larger than THRESHOLD_FOR_DGEQP3 are processed 
  56 | // with the new HQRRP code.
  57 | #define THRESHOLD_FOR_DGEQPF   250
  58 | #define THRESHOLD_FOR_DGEQP3  1000
  59 | 
  60 | 
  61 | // ============================================================================
  62 | // Definition of macros.
  63 | 
  64 | #define max( a, b )  ( (a) > (b) ? (a) : (b) )
  65 | #define min( a, b )  ( (a) > (b) ? (b) : (a) )
  66 | #define dabs( a )    ( (a) >= 0.0 ? (a) : -(a) )
  67 | 
  68 | // ============================================================================
  69 | // Compilation declarations.
  70 | 
  71 | #undef CHECK_DOWNDATING_OF_Y
  72 | 
  73 | 
  74 | // ============================================================================
  75 | // Declaration of local prototypes.
  76 | 
  77 | static int NoFLA_Normal_random_matrix( int m_A, int n_A, 
  78 |                double * buff_A, int ldim_A );
  79 | 
  80 | static double NoFLA_Normal_random_number( double mu, double sigma );
  81 | 
  82 | static int NoFLA_Downdate_Y( 
  83 |                int m_U11, int n_U11, double * buff_U11, int ldim_U11,
  84 |                int m_U21, int n_U21, double * buff_U21, int ldim_U21,
  85 |                int m_A12, int n_A12, double * buff_A12, int ldim_A12,
  86 |                int m_T, int n_T, double * buff_T, int ldim_T,
  87 |                int m_Y2, int n_Y2, double * buff_Y2, int ldim_Y2,
  88 |                int m_G1, int n_G1, double * buff_G1, int ldim_G1,
  89 |                int m_G2, int n_G2, double * buff_G2, int ldim_G2 );
  90 | 
  91 | static int NoFLA_Apply_Q_WY_lhfc_blk_var4( 
  92 |                int m_U, int n_U, double * buff_U, int ldim_U,
  93 |                int m_T, int n_T, double * buff_T, int ldim_T,
  94 |                int m_B, int n_B, double * buff_B, int ldim_B );
  95 | 
  96 | static int NoFLA_Apply_Q_WY_rnfc_blk_var4( 
  97 |                int m_U, int n_U, double * buff_U, int ldim_U,
  98 |                int m_T, int n_T, double * buff_T, int ldim_T,
  99 |                int m_B, int n_B, double * buff_B, int ldim_B );
 100 | 
 101 | static int NoFLA_QRPmod_WY_unb_var4( int pivoting, int num_stages, 
 102 |                int m_A, int n_A, double * buff_A, int ldim_A,
 103 |                int * buff_p, double * buff_t, 
 104 |                int pivot_B, int m_B, double * buff_B, int ldim_B,
 105 |                int pivot_C, int m_C, double * buff_C, int ldim_C,
 106 |                int build_T, double * buff_T, int ldim_T );
 107 | 
 108 | static int NoFLA_QRP_compute_norms(
 109 |                int m_A, int n_A, double * buff_A, int ldim_A,
 110 |                double * buff_d, double * buff_e );
 111 | 
 112 | static int NoFLA_QRP_downdate_partial_norms( int m_A, int n_A,
 113 |                double * buff_d,  int st_d,
 114 |                double * buff_e,  int st_e,
 115 |                double * buff_wt, int st_wt,
 116 |                double * buff_A,  int ldim_A );
 117 | 
 118 | static int NoFLA_QRP_pivot_G_B_C( int j_max_col,
 119 |                int m_G, double * buff_G, int ldim_G, 
 120 |                int pivot_B, int m_B, double * buff_B, int ldim_B, 
 121 |                int pivot_C, int m_C, double * buff_C, int ldim_C, 
 122 |                int * buff_p,
 123 |                double * buff_d, double * buff_e );
 124 | 
 125 | 
 126 | // ============================================================================
 127 | void dgeqp4( int * m, int * n, double * A, int * lda, int * jpvt, double * tau,
 128 |         double * work, int * lwork, int * info ) {
 129 | // 
 130 | // This routine is plug compatible with LAPACK's routine dgeqp3.
 131 | // It computes the new HQRRP while keeping the same header as LAPACK's dgeqp3.
 132 | // It uses dgeqpf or dgeqp3 for small matrices. The thresholds are defined in
 133 | // constants THRESHOLD_FOR_DGEQPF and THRESHOLD_FOR_DGEQP3.
 134 | //
 135 |   int     INB = 1;
 136 |   int     i_one = 1, i_minus_one = -1, 
 137 |           m_A, n_A, mn_A, ldim_A, lquery, nb, num_factorized_fixed_cols, 
 138 |           minus_info, iws, lwkopt, j, k, num_fixed_cols, n_rest, itmp;
 139 |   int     * previous_jpvt;
 140 |   int     ilaenv_();
 141 | 
 142 |   // Some initializations.
 143 |   m_A    = * m;
 144 |   n_A    = * n;
 145 |   mn_A   = min( m_A, n_A );
 146 |   ldim_A = * lda;
 147 | 
 148 |   // Check input arguments.
 149 |   * info = 0;
 150 |   lquery = ( * lwork == -1 );
 151 |   if( m_A < 0 ) {
 152 |      * info = -1;
 153 |   } else if ( n_A < 0 ) {
 154 |      * info = -2;
 155 |   } else if ( ldim_A < max( 1, m_A ) ) {
 156 |      * info = -4;
 157 |   }
 158 | 
 159 |   if( *info == 0 ) {
 160 |     if( mn_A == 0 ) {
 161 |       iws    = 1;
 162 |       lwkopt = 1;
 163 |     } else {
 164 |       iws    = 3 * n_A + 1;
 165 |       nb     = ilaenv_( & INB, "DGEQRF", ' ', & m_A, & n_A, & i_minus_one, 
 166 |                         & i_minus_one );
 167 |       lwkopt = 2 * n_A + ( n_A + 1 ) * nb;
 168 |     }
 169 |     work[ 0 ] = ( double ) lwkopt;
 170 | 
 171 |     if ( ( * lwork < iws )&&( ! lquery ) ) {
 172 |       * info = -8;
 173 |     }
 174 |   }
 175 | 
 176 |   if( * info != 0 ) {
 177 |     minus_info = - * info;
 178 |     xerbla_( "DGEQP3", & minus_info );
 179 |     return;
 180 |   } else if( lquery ) {
 181 |     return;
 182 |   }
 183 | 
 184 |   // Quick return if possible.
 185 |   if( mn_A == 0 ) {
 186 |     return;
 187 |   }
 188 | 
 189 |   // Use LAPACK's DGEQPF or DGEQP3 for small matrices.
 190 |   if( mn_A < THRESHOLD_FOR_DGEQPF ) {
 191 |     // Call to LAPACK routine.
 192 |     //// printf( "Calling dgeqpf\n" );
 193 |     dgeqpf_( m, n, A, lda, jpvt, tau, work, info );
 194 |     return;
 195 |   } else if( mn_A < THRESHOLD_FOR_DGEQP3 ) {
 196 |     //// printf( "Calling dgeqp3\n" );
 197 |     dgeqp3_( m, n, A, lda, jpvt, tau, work, lwork, info );
 198 |     return;
 199 |   }
 200 | 
 201 |   // Move initial columns up front.
 202 |   num_fixed_cols = 0;
 203 |   for( j = 0; j < n_A; j++ ) {
 204 |     if( jpvt[ j ] != 0 ) {
 205 |       if( j != num_fixed_cols ) {
 206 |         //// printf( "Swapping columns: %d %d \n", j, num_fixed_cols );
 207 |         dswap_( & m_A, & A[ 0 + j              * ldim_A ], & i_one, 
 208 |                        & A[ 0 + num_fixed_cols * ldim_A ], & i_one );
 209 |         jpvt[ j ] = jpvt[ num_fixed_cols ];
 210 |         jpvt[ num_fixed_cols ] = j + 1;
 211 |       } else {
 212 |         jpvt[ j ] = j + 1 ;
 213 |       }
 214 |       num_fixed_cols++;
 215 |     } else {
 216 |       jpvt[ j ] = j + 1 ;
 217 |     }
 218 |   }
 219 | 
 220 |   // Factorize fixed columns at the front.
 221 |   num_factorized_fixed_cols = min( m_A, num_fixed_cols );
 222 |   if( num_factorized_fixed_cols > 0 ) {
 223 |     dgeqrf_( & m_A, & num_factorized_fixed_cols, A, & ldim_A, tau, work, lwork,
 224 |              info );
 225 |     if( * info != 0 ) {
 226 |       fprintf( stderr, "ERROR in dgeqrf: Info: %d \n", * info );
 227 |     }
 228 |     iws = max( iws, ( int ) work[ 0 ] );
 229 |     if( num_factorized_fixed_cols < n_A ) {
 230 |       n_rest = n_A - num_factorized_fixed_cols;
 231 |       dormqr_( "Left", "Transpose", 
 232 |                & m_A, & n_rest, & num_factorized_fixed_cols,
 233 |                A, & ldim_A, tau,
 234 |                & A[ 0 + num_factorized_fixed_cols * ldim_A ], & ldim_A, 
 235 |                work, lwork, info );
 236 |       if( * info != 0 ) {
 237 |         fprintf( stderr, "ERROR in dormqr: Info: %d \n", * info );
 238 |       }
 239 | 
 240 |       iws = max( iws, ( int ) work[ 0 ] );
 241 |     }
 242 |   }
 243 | 
 244 |   // Create intermediate jpvt vector.
 245 |   previous_jpvt = ( int * ) malloc( n_A * sizeof( int ) );
 246 | 
 247 |   // Save a copy of jpvt vector.
 248 |   if( num_factorized_fixed_cols > 0 ) {
 249 |     // Copy vector.
 250 |     for( j = 0; j < n_A; j++ ) {
 251 |       previous_jpvt[ j ] = jpvt[ j ];
 252 |     }
 253 |   }
 254 | 
 255 |   // Factorize free columns at the bottom with default values:
 256 |   // nb_alg = 64, pp = 10, panel_pivoting = 1.
 257 |   if( num_factorized_fixed_cols < mn_A ) {
 258 |     * info = NoFLA_HQRRP_WY_blk_var4( 
 259 |         m_A - num_factorized_fixed_cols, n_A - num_factorized_fixed_cols, 
 260 |         & A[ num_factorized_fixed_cols + num_factorized_fixed_cols * ldim_A ], 
 261 |             ldim_A,
 262 |         & jpvt[ num_factorized_fixed_cols ], 
 263 |         & tau[ num_factorized_fixed_cols ],
 264 |         64, 10, 1 );
 265 |   }
 266 | 
 267 |   // Pivot block above factorized block by NoFLA_HQRRP.
 268 |   if( num_factorized_fixed_cols > 0 ) {
 269 |     // Pivot block above factorized block.
 270 |     for( j = num_factorized_fixed_cols; j < n_A; j++ ) {
 271 |       //// printf( "%% Processing j: %d \n", j );
 272 |       for( k = j; k < n_A; k++ ) {
 273 |         if( jpvt[ j ] == previous_jpvt[ k ] ) {
 274 |           //// printf( "%%   Found j: %d  k: %d \n", j, k );
 275 |           break;
 276 |         }
 277 |       }
 278 |       // Swap vector previous_jpvt and block above factorized block.
 279 |       if( k != j ) { 
 280 |         // Swap elements in previous_jpvt.
 281 |         //// printf( "%%   Swapping  j: %d  k: %d \n", j, k );
 282 |         itmp = previous_jpvt[ j ];
 283 |         previous_jpvt[ j ] = previous_jpvt[ k ];
 284 |         previous_jpvt[ k ] = itmp;
 285 | 
 286 |         // Swap columns in block above factorized block.
 287 |         dswap_( & num_factorized_fixed_cols,
 288 |                 & A[ 0 + j * ldim_A ], & i_one,
 289 |                 & A[ 0 + k * ldim_A ], & i_one );
 290 |       }
 291 |     }
 292 |   }
 293 | 
 294 |   // Remove intermediate jpvt vector.
 295 |   free( previous_jpvt );
 296 | 
 297 |   // Return workspace length required.
 298 |   work[ 0 ] = iws;
 299 |   return;
 300 | }
 301 | 
 302 | // ============================================================================
 303 | int NoFLA_HQRRP_WY_blk_var4( int m_A, int n_A, double * buff_A, int ldim_A,
 304 |         int * buff_jpvt, double * buff_tau,
 305 |         int nb_alg, int pp, int panel_pivoting ) {
 306 | //
 307 | // HQRRP: It computes the Householder QR with Randomized Pivoting of matrix A.
 308 | // This routine is almost compatible with LAPACK's dgeqp3.
 309 | // The main difference is that this routine does not manage fixed columns.
 310 | //
 311 | // Main features:
 312 | //   * BLAS-3 based.
 313 | //   * Norm downdating method by Drmac.
 314 | //   * Downdating for computing Y.
 315 | //   * No use of libflame.
 316 | //   * Compact WY transformations are used instead of UT transformations.
 317 | //   * LAPACK's routine dlarfb is used to apply block transformations.
 318 | //
 319 | // Arguments:
 320 | // ----------
 321 | // m_A:            Number of rows of matrix A.
 322 | // n_A:            Number of columns of matrix A.
 323 | // buff_A:         Address/pointer of/to data in matrix A. Matrix A must be 
 324 | //                 stored in column-order.
 325 | // ldim_A:         Leading dimension of matrix A.
 326 | // buff_jpvt:      Input/output vector with the pivots.
 327 | // buff_tau:       Output vector with the tau values of the Householder factors.
 328 | // nb_alg:         Block size. 
 329 | //                 Usual values for nb_alg are 32, 64, etc.
 330 | // pp:             Oversampling size.
 331 | //                 Usual values for pp are 5, 10, etc.
 332 | // panel_pivoting: If panel_pivoting==1, QR with pivoting is applied to 
 333 | //                 factorize the panels of matrix A. Otherwise, QR without 
 334 | //                 pivoting is used. Usual value for panel_pivoting is 1.
 335 | // Final comments:
 336 | // ---------------
 337 | // This code has been created from a libflame code. Hence, you can find some
 338 | // commented calls to libflame routines. We have left them to make it easier
 339 | // to interpret the meaning of the C code.
 340 | //
 341 |   int     b, j, last_iter, mn_A, m_Y, n_Y, ldim_Y, m_V, n_V, ldim_V, 
 342 |           m_W, n_W, ldim_W, n_VR, m_AB1, n_AB1, ldim_T1_T,
 343 |           m_A11, n_A11, m_A12, n_A12, m_A21, n_A21, m_A22,
 344 |           m_G, n_G, ldim_G;
 345 |   double  * buff_Y, * buff_V, * buff_W, * buff_VR, * buff_YR, 
 346 |           * buff_s, * buff_sB, * buff_s1, 
 347 |           * buff_AR, * buff_AB1, * buff_A01, * buff_Y1, * buff_T1_T,
 348 |           * buff_A11, * buff_A21, * buff_A12,
 349 |           * buff_Y2, * buff_G, * buff_G1, * buff_G2;
 350 |   int     * buff_p, * buff_pB, * buff_p1;
 351 |   double  d_zero = 0.0;
 352 |   double  d_one  = 1.0;
 353 | 
 354 |   // Executable Statements.
 355 |   //// printf( "%% NoFLA_HQRRP_WY_blk_var4.\n" );
 356 | 
 357 |   // Check arguments.
 358 |   if( m_A < 0 ) {
 359 |     fprintf( stderr, 
 360 |              "ERROR in NoFLA_HQRRP_WY_blk_var4: m_A is < 0.\n" );
 361 |   } if( n_A < 0 ) {
 362 |     fprintf( stderr, 
 363 |              "ERROR in NoFLA_HQRRP_WY_blk_var4: n_A is < 0.\n" );
 364 |   } if( ldim_A < max( 1, m_A ) ) {
 365 |     fprintf( stderr, 
 366 |              "ERROR in NoFLA_HQRRP_WY_blk_var4: ldim_A is < max( 1, m_A ).\n" );
 367 |   }
 368 | 
 369 |   // Some initializations.
 370 |   mn_A   = min( m_A, n_A );
 371 |   buff_p = buff_jpvt;
 372 |   buff_s = buff_tau;
 373 | 
 374 |   // Quick return.
 375 |   if( mn_A == 0 ) {
 376 |     return 0;
 377 |   }
 378 | 
 379 |   // Initialize the seed for the generator of random numbers.
 380 |   srand( 12 );
 381 | 
 382 |   // Create auxiliary objects.
 383 |   m_Y     = nb_alg + pp;
 384 |   n_Y     = n_A;
 385 |   buff_Y  = ( double * ) malloc( m_Y * n_Y * sizeof( double ) );
 386 |   ldim_Y  = m_Y;
 387 | 
 388 |   m_V     = nb_alg + pp;
 389 |   n_V     = n_A;
 390 |   buff_V  = ( double * ) malloc( m_V * n_V * sizeof( double ) );
 391 |   ldim_V  = m_V;
 392 | 
 393 |   m_W     = nb_alg;
 394 |   n_W     = n_A;
 395 |   buff_W  = ( double * ) malloc( m_W * n_W * sizeof( double ) );
 396 |   ldim_W  = m_W;
 397 | 
 398 |   m_G     = nb_alg + pp;
 399 |   n_G     = m_A;
 400 |   buff_G  = ( double * ) malloc( m_G * n_G * sizeof( double ) );
 401 |   ldim_G  = m_G;
 402 | 
 403 |   // Initialize matrices G and Y.
 404 |   NoFLA_Normal_random_matrix( nb_alg + pp, m_A, buff_G, ldim_G );
 405 |   //// FLA_Gemm( FLA_NO_TRANSPOSE, FLA_NO_TRANSPOSE, 
 406 |   ////           FLA_ONE, G, A, FLA_ZERO, Y );
 407 |   dgemm_( "No tranpose", "No transpose", & m_Y, & n_Y, & m_A, 
 408 |           & d_one, buff_G, & ldim_G, buff_A, & ldim_A, 
 409 |           & d_zero, buff_Y, & ldim_Y );
 410 | 
 411 |   // Main Loop.
 412 |   for( j = 0; j < mn_A; j += nb_alg ) {
 413 |     b = min( nb_alg, min( n_A - j, m_A - j ) );
 414 | 
 415 |     // Check whether it is the last iteration.
 416 |     last_iter = ( ( ( j + nb_alg >= m_A )||( j + nb_alg >= n_A ) ) ? 1 : 0 );
 417 | 
 418 |     // Some initializations for the iteration of this loop.
 419 |     n_VR = n_V - j;
 420 |     buff_VR = & buff_V[ 0 + j * ldim_V ];
 421 |     buff_YR = & buff_Y[ 0 + j * ldim_Y ];
 422 |     buff_pB = & buff_p[ j ];
 423 |     buff_sB = & buff_s[ j ];
 424 |     buff_AR = & buff_A[ 0 + j * ldim_A ];
 425 | 
 426 |     m_AB1     = m_A - j;
 427 |     n_AB1     = b;
 428 |     buff_AB1  = & buff_A[ j + j * ldim_A ];
 429 |     buff_p1   = & buff_p[ j ];
 430 |     buff_s1   = & buff_s[ j ];
 431 |     buff_A01  = & buff_A[ 0 + j * ldim_A ];
 432 |     buff_Y1   = & buff_Y[ 0 + j * ldim_Y ];
 433 |     buff_T1_T = & buff_W[ 0 + j * ldim_W ];
 434 |     ldim_T1_T = ldim_W;
 435 | 
 436 |     buff_A11 = & buff_A[ j + j * ldim_A ];
 437 |     m_A11 = b;
 438 |     n_A11 = b;
 439 | 
 440 |     buff_A21 = & buff_A[ min( m_A - 1, j + nb_alg ) + j * ldim_A ];
 441 |     m_A21 = max( 0, m_A - j - b );
 442 |     n_A21 = b;
 443 | 
 444 |     buff_A12 = & buff_A[ j + min( n_A - 1, j + b ) * ldim_A ];
 445 |     m_A12 = b;
 446 |     n_A12 = max( 0, n_A - j - b );
 447 | 
 448 |     //// buff_A22 = & buff_A[ min( m_A - 1, j + b ) + 
 449 |     ////                      min( n_A - 1, j + b ) * ldim_A ];
 450 |     m_A22 = max( 0, m_A - j - b );
 451 |     //// n_A22 = max( 0, n_A - j - b );
 452 | 
 453 |     buff_Y2 = & buff_Y[ 0 + min( n_Y - 1, j + b ) * ldim_Y ];
 454 |     buff_G1 = & buff_G[ 0 + j * ldim_G ];
 455 |     buff_G2 = & buff_G[ 0 + min( n_G - 1, j + b ) * ldim_G ];
 456 |       
 457 | #ifdef CHECK_DOWNDATING_OF_Y
 458 |     // Check downdating of matrix Y: Compare downdated matrix Y with 
 459 |     // matrix Y computed from scratch.
 460 |     int     m_cyr, n_cyr, ldim_cyr, m_ABR, ii, jj;
 461 |     double  * buff_cyr, aux, sum;
 462 | 
 463 |     m_cyr    = m_Y;
 464 |     n_cyr    = n_Y - j;
 465 |     ldim_cyr = m_cyr;
 466 |     m_ABR    = m_A - j;
 467 |     buff_cyr = ( double * ) malloc( m_cyr * n_cyr * sizeof( double ) );
 468 |  
 469 |     //// FLA_Gemm( FLA_NO_TRANSPOSE, FLA_NO_TRANSPOSE, 
 470 |     ////           FLA_ONE, GR, ABR, FLA_ZERO, CYR ); 
 471 |     dgemm_( "No tranpose", "No transpose", & m_cyr, & n_cyr, & m_ABR,
 472 |             & d_one, & buff_G[ 0 + j * ldim_G ], & ldim_G,
 473 |                      & buff_A[ j + j * ldim_A ], & ldim_A,
 474 |             & d_zero, & buff_cyr[ 0 + 0 * ldim_cyr ], & ldim_cyr );
 475 | 
 476 |     //// print_double_matrix( "cyr", m_cyr, n_cyr, buff_cyr, ldim_cyr );
 477 |     //// print_double_matrix( "y", m_Y, n_Y, buff_Y, ldim_Y );
 478 |     sum = 0.0;
 479 |     for( jj = 0; jj < n_cyr; jj++ ) {
 480 |       for( ii = 0; ii < m_cyr; ii++ ) {
 481 |         aux = buff_Y[ ii + ( j + jj ) * ldim_Y ] -
 482 |               buff_cyr[ ii + jj * ldim_cyr ];
 483 |         sum += aux * aux;
 484 |       }
 485 |     }
 486 |     sum = sqrt( sum );
 487 |     printf( "%%  diff between Y and downdated Y: %le\n", sum );
 488 | 
 489 |     free( buff_cyr );
 490 | #endif
 491 | 
 492 |     if( last_iter == 0 ) {
 493 |       // Compute QRP of YR, and apply permutations to matrix AR.
 494 |       // A copy of YR is made into VR, and permutations are applied to YR.
 495 |       //// FLA_Merge_2x1( ATR,
 496 |       ////                ABR,   & AR );
 497 |       //// FLA_Copy( YR, VR );
 498 |       //// FLA_QRPmod_WY_unb_var4( 1, bRow, VR, pB, sB, 1, AR, 1, YR, 0, None );
 499 | 
 500 |       dlacpy_( "All", & m_V, & n_VR, buff_YR, & ldim_Y,
 501 |                                      buff_VR, & ldim_V );
 502 |       NoFLA_QRPmod_WY_unb_var4( 1, b,
 503 |           m_V, n_VR, buff_VR, ldim_V, buff_pB, buff_sB,
 504 |           1, m_A, buff_AR, ldim_A,
 505 |           1, m_Y, buff_YR, ldim_Y,
 506 |           0, buff_Y, ldim_Y );
 507 |     }
 508 | 
 509 |     //
 510 |     // Compute QRP of panel AB1 = [ A11; A21 ].
 511 |     // Apply same permutations to A01 and Y1, and build T1_T.
 512 |     //
 513 |     //// FLA_Part_2x1( W1,   & T1_T,
 514 |     ////                     & None,    b, FLA_TOP );
 515 |     //// FLA_Merge_2x1( A11,
 516 |     ////                A21,   & AB1 );
 517 |     //// FLA_QRPmod_WY_unb_var4( panel_pivoting, -1, AB1, p1, s1, 
 518 |     ////                         1, A01, 1, Y1, 1, T1_T );
 519 | 
 520 |     NoFLA_QRPmod_WY_unb_var4( panel_pivoting, -1,
 521 |         m_AB1, n_AB1, buff_AB1, ldim_A, buff_p1, buff_s1,
 522 |         1, j, buff_A01, ldim_A,
 523 |         1, m_Y, buff_Y1, ldim_Y,
 524 |         1, buff_T1_T, ldim_W );
 525 | 
 526 |     //
 527 |     // Update the rest of the matrix.
 528 |     //
 529 |     if ( ( j + b ) < n_A ) {
 530 |       // Apply the Householder transforms associated with AB1 = [ A11; A21 ] 
 531 |       // and T1_T to [ A12; A22 ]:
 532 |       //   / A12 \ := QB1' / A12 \
 533 |       //   \ A22 /         \ A22 /
 534 |       // where QB1 is formed from AB1 and T1_T.
 535 |       //// MyFLA_Apply_Q_WY_lhfc_blk_var4( A11, A21, T1_T, A12, A22 );
 536 | 
 537 |       NoFLA_Apply_Q_WY_lhfc_blk_var4( 
 538 |           m_A11 + m_A21, n_A11, buff_A11, ldim_A,
 539 |           b, b, buff_T1_T, ldim_W,
 540 |           m_A12 + m_A22, n_A12, buff_A12, ldim_A );
 541 |     }
 542 | 
 543 |     //
 544 |     // Downdate matrix Y.
 545 |     //
 546 |     if ( ! last_iter ) {
 547 |       //// MyFLA_Downdate_Y( A11, A21, A12, T1_T, Y2, G1, G2 );
 548 | 
 549 |       NoFLA_Downdate_Y(
 550 |           m_A11, n_A11, buff_A11, ldim_A,
 551 |           m_A21, n_A21, buff_A21, ldim_A,
 552 |           m_A12, n_A12, buff_A12, ldim_A,
 553 |           b, b, buff_T1_T, ldim_T1_T,
 554 |           m_Y, max( 0, n_Y - j - b ), buff_Y2, ldim_Y,
 555 |           m_G, b, buff_G1, ldim_G,
 556 |           m_G, max( 0, n_G - j - b ), buff_G2, ldim_G );
 557 |     }
 558 |   }
 559 | 
 560 |   // Remove auxiliary objects.
 561 |   //// FLA_Obj_free( & G );
 562 |   //// FLA_Obj_free( & Y );
 563 |   //// FLA_Obj_free( & V );
 564 |   //// FLA_Obj_free( & W );
 565 |   free( buff_G );
 566 |   free( buff_Y );
 567 |   free( buff_V );
 568 |   free( buff_W );
 569 | 
 570 |   return 0;
 571 | }
 572 | 
 573 | 
 574 | // ============================================================================
 575 | static int NoFLA_Normal_random_matrix( int m_A, int n_A, 
 576 |                double * buff_A, int ldim_A ) {
 577 | //
 578 | // It generates a random matrix with normal distribution.
 579 | //
 580 |   int  i, j;
 581 | 
 582 |   // Main loop.
 583 |   for ( j = 0; j < n_A; j++ ) {
 584 |     for ( i = 0; i < m_A; i++ ) {
 585 |       buff_A[ i + j * ldim_A ] = NoFLA_Normal_random_number( 0.0, 1.0 );
 586 |     }
 587 |   }
 588 | 
 589 |   return 0;
 590 | }
 591 | 
 592 | /* ========================================================================= */
 593 | static double NoFLA_Normal_random_number( double mu, double sigma ) {
 594 |   static int     alternate_calls = 0;
 595 |   static double  b1, b2;
 596 |   double         c1, c2, a, factor;
 597 | 
 598 |   // Quick return.
 599 |   if( alternate_calls == 1 ) {
 600 |     alternate_calls = ! alternate_calls;
 601 |     return( mu + sigma * b2 );
 602 |   }
 603 |   // Main loop.
 604 |   do {
 605 |     c1 = -1.0 + 2.0 * ( (double) rand() / RAND_MAX );
 606 |     c2 = -1.0 + 2.0 * ( (double) rand() / RAND_MAX );
 607 |     a = c1 * c1 + c2 * c2;
 608 |   } while ( ( a == 0 )||( a >= 1 ) );
 609 |   factor = sqrt( ( -2 * log( a ) ) / a );
 610 |   b1 = c1 * factor;
 611 |   b2 = c2 * factor;
 612 |   alternate_calls = ! alternate_calls;
 613 |   return( mu + sigma * b1 );
 614 | }
 615 | 
 616 | // ============================================================================
 617 | static int NoFLA_Downdate_Y( 
 618 |                int m_U11, int n_U11, double * buff_U11, int ldim_U11,
 619 |                int m_U21, int n_U21, double * buff_U21, int ldim_U21,
 620 |                int m_A12, int n_A12, double * buff_A12, int ldim_A12,
 621 |                int m_T, int n_T, double * buff_T, int ldim_T,
 622 |                int m_Y2, int n_Y2, double * buff_Y2, int ldim_Y2,
 623 |                int m_G1, int n_G1, double * buff_G1, int ldim_G1,
 624 |                int m_G2, int n_G2, double * buff_G2, int ldim_G2 ) {
 625 | //
 626 | // It downdates matrix Y, and updates matrix G.
 627 | // Only Y2 of Y is updated.
 628 | // Only G1 and G2 of G are updated.
 629 | //
 630 | // Y2 = Y2 - ( G1 - ( G1*U11 + G2*U21 ) * T11 * U11' ) * R12.
 631 | //
 632 |   int    i, j;
 633 |   double * buff_B;
 634 |   double d_one       = 1.0;
 635 |   double d_minus_one = -1.0;
 636 |   int    m_B         = m_G1;
 637 |   int    n_B         = n_G1;
 638 |   int    ldim_B      = m_G1;
 639 | 
 640 |   // Create object B.
 641 |   //// FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, G1, & B );
 642 |   buff_B = ( double * ) malloc( m_B * n_B * sizeof( double ) );
 643 | 
 644 |   // B = G1.
 645 |   //// FLA_Copy( G1, B );
 646 |   dlacpy_( "All", & m_G1, & n_G1, buff_G1, & ldim_G1,
 647 |                                   buff_B, & ldim_B );
 648 | 
 649 |   // B = B * U11.
 650 |   //// FLA_Trmm( FLA_RIGHT, FLA_LOWER_TRIANGULAR,
 651 |   ////           FLA_NO_TRANSPOSE, FLA_UNIT_DIAG,
 652 |   ////           FLA_ONE, U11, B );
 653 |   dtrmm_( "Right", "Lower", "No transpose", "Unit", & m_B, & n_B,
 654 |           & d_one, buff_U11, & ldim_U11, buff_B, & ldim_B );
 655 | 
 656 |   // B = B + G2 * U21.
 657 |   //// FLA_Gemm( FLA_NO_TRANSPOSE, FLA_NO_TRANSPOSE,
 658 |   ////           FLA_ONE, G2, U21, FLA_ONE, B );
 659 |   dgemm_( "No transpose", "No tranpose", & m_B, & n_B, & m_U21,
 660 |           & d_one, buff_G2, & ldim_G2, buff_U21, & ldim_U21,
 661 |           & d_one, buff_B, & ldim_B );
 662 | 
 663 |   // B = B * T11.
 664 |   //// FLA_Trsm( FLA_RIGHT, FLA_UPPER_TRIANGULAR,
 665 |   ////           FLA_NO_TRANSPOSE, FLA_NONUNIT_DIAG,
 666 |   ////           FLA_ONE, T, B );
 667 |   //// dtrsm_( "Right", "Upper", "No transpose", "Non-unit", & m_B, & n_B,
 668 |   ////         & d_one, buff_T, & ldim_T, buff_B, & ldim_B );
 669 |   // Used dtrmm instead of dtrsm because of using compact WY instead of UT.
 670 |   dtrmm_( "Right", "Upper", "No transpose", "Non-unit", & m_B, & n_B,
 671 |           & d_one, buff_T, & ldim_T, buff_B, & ldim_B );
 672 | 
 673 |   // B = - B * U11^H.
 674 |   //// FLA_Trmm( FLA_RIGHT, FLA_LOWER_TRIANGULAR,
 675 |   ////           FLA_CONJ_TRANSPOSE, FLA_UNIT_DIAG,
 676 |   ////           FLA_MINUS_ONE, U11, B );
 677 |   dtrmm_( "Right", "Lower", "Conj_tranpose", "Unit", & m_B, & n_B,
 678 |           & d_minus_one, buff_U11, & ldim_U11, buff_B, & ldim_B );
 679 | 
 680 |   // B = G1 + B.
 681 |   //// FLA_Axpy( FLA_ONE, G1, B );
 682 |   for( j = 0; j < n_B; j++ ) {
 683 |     for( i = 0; i < m_B; i++ ) {
 684 |       buff_B[ i + j * ldim_B ] += buff_G1[ i + j * ldim_G1 ];
 685 |     }
 686 |   }
 687 | 
 688 |   // Y2 = Y2 - B * R12.
 689 |   //// FLA_Gemm( FLA_NO_TRANSPOSE, FLA_NO_TRANSPOSE,
 690 |   ////           FLA_MINUS_ONE, B, A12, FLA_ONE, Y2 );
 691 |   dgemm_( "No transpose", "No transpose", & m_Y2, & n_Y2, & m_A12,
 692 |           & d_minus_one, buff_B, & ldim_B, buff_A12, & ldim_A12,
 693 |           & d_one, buff_Y2, & ldim_Y2 );
 694 | 
 695 |   //
 696 |   // GR = GR * Q
 697 |   //
 698 |   NoFLA_Apply_Q_WY_rnfc_blk_var4( 
 699 |           m_U11 + m_U21, n_U11, buff_U11, ldim_U11,
 700 |           m_T, n_T, buff_T, ldim_T,
 701 |           m_G1, n_G1 + n_G2, buff_G1, ldim_G1 );
 702 | 
 703 |   // Remove object B.
 704 |   //// FLA_Obj_free( & B );
 705 |   free( buff_B );
 706 | 
 707 |   return 0;
 708 | }
 709 | 
 710 | // ============================================================================
 711 | static int NoFLA_Apply_Q_WY_lhfc_blk_var4( 
 712 |                int m_U, int n_U, double * buff_U, int ldim_U,
 713 |                int m_T, int n_T, double * buff_T, int ldim_T,
 714 |                int m_B, int n_B, double * buff_B, int ldim_B ) {
 715 | //
 716 | // It applies the transpose of a block transformation Q to a matrix B from 
 717 | // the left:
 718 | //   B := Q' * B
 719 | // where:
 720 | //   Q = I - U * T' * U'.
 721 | //
 722 |   double  * buff_W;
 723 |   int     ldim_W;
 724 | 
 725 |   // Create auxiliary object.
 726 |   //// FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, B1, & W );
 727 |   buff_W = ( double * ) malloc( n_B * n_U * sizeof( double ) );
 728 |   ldim_W = max( 1, n_B );
 729 |  
 730 |   // Apply the block transformation. 
 731 |   dlarfb_( "Left", "Transpose", "Forward", "Columnwise", 
 732 |            & m_B, & n_B, & n_U, buff_U, & ldim_U, buff_T, & ldim_T, 
 733 |            buff_B, & ldim_B, buff_W, & ldim_W );
 734 | 
 735 |   // Remove auxiliary object.
 736 |   //// FLA_Obj_free( & W );
 737 |   free( buff_W );
 738 | 
 739 |   return 0;
 740 | }
 741 | 
 742 | // ============================================================================
 743 | static int NoFLA_Apply_Q_WY_rnfc_blk_var4( 
 744 |                int m_U, int n_U, double * buff_U, int ldim_U,
 745 |                int m_T, int n_T, double * buff_T, int ldim_T,
 746 |                int m_B, int n_B, double * buff_B, int ldim_B ) {
 747 | //
 748 | // It applies a block transformation Q to a matrix B from the right:
 749 | //   B = B * Q
 750 | // where:
 751 | //   Q = I - U * T' * U'.
 752 | //
 753 |   double  * buff_W;
 754 |   int     ldim_W;
 755 | 
 756 |   // Create auxiliary object.
 757 |   //// FLA_Obj_create_conf_to( FLA_TRANSPOSE, B1, & W );
 758 |   buff_W = ( double * ) malloc( m_B * n_U * sizeof( double ) );
 759 |   ldim_W = max( 1, m_B );
 760 |   
 761 |   // Apply the block transformation. 
 762 |   dlarfb_( "Right", "No transpose", "Forward", "Columnwise", 
 763 |            & m_B, & n_B, & n_U, buff_U, & ldim_U, buff_T, & ldim_T, 
 764 |            buff_B, & ldim_B, buff_W, & ldim_W );
 765 | 
 766 |   // Remove auxiliary object.
 767 |   //// FLA_Obj_free( & W );
 768 |   free( buff_W );
 769 | 
 770 |   return 0;
 771 | }
 772 | 
 773 | // ============================================================================
 774 | static int NoFLA_QRPmod_WY_unb_var4( int pivoting, int num_stages, 
 775 |                int m_A, int n_A, double * buff_A, int ldim_A,
 776 |                int * buff_p, double * buff_t, 
 777 |                int pivot_B, int m_B, double * buff_B, int ldim_B,
 778 |                int pivot_C, int m_C, double * buff_C, int ldim_C,
 779 |                int build_T, double * buff_T, int ldim_T ) {
 780 | //
 781 | // It computes an unblocked QR factorization of matrix A with or without 
 782 | // pivoting. Matrices B and C are optionally pivoted, and matrix T is
 783 | // optionally built.
 784 | //
 785 | // Arguments:
 786 | // "pivoting": If pivoting==1, then QR factorization with pivoting is used.
 787 | // "numstages": It tells the number of columns that are factorized.
 788 | //   If "num_stages" is negative, the whole matrix A is factorized.
 789 | //   If "num_stages" is positive, only the first "num_stages" are factorized.
 790 | // "pivot_B": if "pivot_B" is true, matrix "B" is pivoted too.
 791 | // "pivot_C": if "pivot_C" is true, matrix "C" is pivoted too.
 792 | // "build_T": if "build_T" is true, matrix "T" is built.
 793 | //
 794 |   int     j, mn_A, m_a21, m_A22, n_A22, n_dB, idx_max_col, 
 795 |           i_one = 1, n_house_vector, m_rest;
 796 |   double  * buff_d, * buff_e, * buff_workspace, diag;
 797 |   int     idamax_();
 798 | 
 799 |   //// printf( "NoFLA_QRPmod_WY_unb_var4. pivoting: %d \n", pivoting );
 800 | 
 801 |   // Some initializations.
 802 |   mn_A    = min( m_A, n_A );
 803 | 
 804 |   // Set the number of stages, if needed.
 805 |   if( num_stages < 0 ) {
 806 |     num_stages = mn_A;
 807 |   }
 808 | 
 809 |   // Create auxiliary vectors.
 810 |   buff_d         = ( double * ) malloc( n_A * sizeof( double ) );
 811 |   buff_e         = ( double * ) malloc( n_A * sizeof( double ) );
 812 |   buff_workspace = ( double * ) malloc( n_A * sizeof( double ) );
 813 | 
 814 |   if( pivoting == 1 ) {
 815 |     // Compute initial norms of A into d and e.
 816 |     NoFLA_QRP_compute_norms( m_A, n_A, buff_A, ldim_A, buff_d, buff_e );
 817 |   }
 818 | 
 819 |   // Main Loop.
 820 |   for( j = 0; j < num_stages; j++ ) {
 821 |     n_dB  = n_A - j;
 822 |     m_a21 = m_A - j - 1;
 823 |     m_A22 = m_A - j - 1;
 824 |     n_A22 = n_A - j - 1;
 825 | 
 826 |     if( pivoting == 1 ) {
 827 |       // Obtain the index of the column with largest 2-norm.
 828 |       idx_max_col = idamax_( & n_dB, & buff_d[ j ], & i_one ) - 1;
 829 | 
 830 |       // Swap columns of A, B, C, pivots, and norms vectors.
 831 |       NoFLA_QRP_pivot_G_B_C( idx_max_col,
 832 |           m_A, & buff_A[ 0 + j * ldim_A ], ldim_A,
 833 |           pivot_B, m_B, & buff_B[ 0 + j * ldim_B ], ldim_B,
 834 |           pivot_C, m_C, & buff_C[ 0 + j * ldim_C ], ldim_C,
 835 |           & buff_p[ j ],
 836 |           & buff_d[ j ],
 837 |           & buff_e[ j ] );
 838 |     }
 839 | 
 840 |     // Compute tau1 and u21 from alpha11 and a21 such that tau1 and u21
 841 |     // determine a Householder transform H such that applying H from the
 842 |     // left to the column vector consisting of alpha11 and a21 annihilates
 843 |     // the entries in a21 (and updates alpha11).
 844 |     n_house_vector = m_a21 + 1;
 845 |     dlarfg_( & n_house_vector,
 846 |              & buff_A[ j + j * ldim_A ],
 847 |              & buff_A[ min( m_A-1, j+1 ) + j * ldim_A ], & i_one,
 848 |              & buff_t[ j ] );
 849 | 
 850 |     // / a12t \ =  H / a12t \
 851 |     // \ A22  /      \ A22  /
 852 |     //
 853 |     // where H is formed from tau1 and u21.
 854 |     diag = buff_A[ j + j * ldim_A ];
 855 |     buff_A[ j + j * ldim_A ] = 1.0;
 856 |     m_rest = m_A22 + 1;
 857 |     dlarf_( "Left", & m_rest, & n_A22, 
 858 |         & buff_A[ j + j * ldim_A ], & i_one,
 859 |         & buff_t[ j ],
 860 |         & buff_A[ j + ( j+1 ) * ldim_A ], & ldim_A,
 861 |         buff_workspace );
 862 |     buff_A[ j + j * ldim_A ] = diag;
 863 | 
 864 |     if( pivoting == 1 ) {
 865 |       // Update partial column norms.
 866 |       NoFLA_QRP_downdate_partial_norms( m_A22, n_A22, 
 867 |           & buff_d[ j+1 ], 1,
 868 |           & buff_e[ j+1 ], 1,
 869 |           & buff_A[ j + ( j+1 ) * ldim_A ], ldim_A,
 870 |           & buff_A[ ( j+1 ) + min( n_A-1, ( j+1 ) ) * ldim_A ], ldim_A );
 871 |     }
 872 |   }
 873 | 
 874 |   // Build T.
 875 |   if( build_T ) {
 876 |     dlarft_( "Forward", "Columnwise", & m_A, & num_stages, buff_A, & ldim_A, 
 877 |              buff_t, buff_T, & ldim_T );
 878 |   }
 879 | 
 880 |   // Remove auxiliary vectors.
 881 |   free( buff_d );
 882 |   free( buff_e );
 883 |   free( buff_workspace );
 884 | 
 885 |   return 0;
 886 | }
 887 | 
 888 | // ============================================================================
 889 | static int NoFLA_QRP_compute_norms(
 890 |                int m_A, int n_A, double * buff_A, int ldim_A,
 891 |                double * buff_d, double * buff_e ) {
 892 | //
 893 | // It computes the column norms of matrix A. The norms are stored into 
 894 | // vectors d and e.
 895 | //
 896 |   int     j, i_one = 1;
 897 |   double  dnrm2_();
 898 | 
 899 |   // Main loop.
 900 |   for( j = 0; j < n_A; j++ ) {
 901 |     * buff_d = dnrm2_( & m_A, buff_A, & i_one );
 902 |     * buff_e = * buff_d;
 903 |     buff_A += ldim_A;
 904 |     buff_d++;
 905 |     buff_e++;
 906 |   }
 907 | 
 908 |   return 0;
 909 | }
 910 | 
 911 | // ============================================================================
 912 | static int NoFLA_QRP_downdate_partial_norms( int m_A, int n_A,
 913 |                double * buff_d,  int st_d,
 914 |                double * buff_e,  int st_e,
 915 |                double * buff_wt, int st_wt,
 916 |                double * buff_A,  int ldim_A ) {
 917 | //
 918 | // It updates (downdates) the column norms of matrix A. It uses Drmac's method.
 919 | //
 920 |   int     j, i_one = 1;
 921 |   double  * ptr_d, * ptr_e, * ptr_wt, * ptr_A;
 922 |   double  temp, temp2, temp5, tol3z;
 923 |   double  dnrm2_(), dlamch_();
 924 | 
 925 |   /*
 926 | *
 927 | *           Update partial column norms
 928 | *
 929 |           DO 30 J = I + 1, N
 930 |              IF( WORK( J ).NE.ZERO ) THEN
 931 | *
 932 | *                 NOTE: The following 4 lines follow from the analysis in
 933 | *                 Lapack Working Note 176.
 934 | *                 
 935 |                 TEMP = ABS( A( I, J ) ) / WORK( J )
 936 |                 TEMP = MAX( ZERO, ( ONE+TEMP )*( ONE-TEMP ) )
 937 |                 TEMP2 = TEMP*( WORK( J ) / WORK( N+J ) )**2
 938 |                 IF( TEMP2 .LE. TOL3Z ) THEN 
 939 |                    IF( M-I.GT.0 ) THEN
 940 |                       WORK( J ) = DNRM2( M-I, A( I+1, J ), 1 )
 941 |                       WORK( N+J ) = WORK( J )
 942 |                    ELSE
 943 |                       WORK( J ) = ZERO
 944 |                       WORK( N+J ) = ZERO
 945 |                    END IF
 946 |                 ELSE
 947 |                    WORK( J ) = WORK( J )*SQRT( TEMP )
 948 |                 END IF
 949 |              END IF
 950 |  30       CONTINUE
 951 |   */
 952 | 
 953 |   // Some initializations.
 954 |   tol3z = sqrt( dlamch_( "Epsilon" ) );
 955 |   ptr_d  = buff_d;
 956 |   ptr_e  = buff_e;
 957 |   ptr_wt = buff_wt;
 958 |   ptr_A  = buff_A;
 959 | 
 960 |   // Main loop.
 961 |   for( j = 0; j < n_A; j++ ) {
 962 |     if( * ptr_d != 0.0 ) {
 963 |       temp = dabs( * ptr_wt ) / * ptr_d;
 964 |       temp = max( 0.0, ( 1.0 + temp ) * ( 1 - temp ) );
 965 |       temp5 = * ptr_d / * ptr_e;
 966 |       temp2 = temp * temp5 * temp5;
 967 |       if( temp2 <= tol3z ) {
 968 |         if( m_A > 0 ) {
 969 |           * ptr_d = dnrm2_( & m_A, ptr_A, & i_one );
 970 |           * ptr_e = *ptr_d;
 971 |         } else {
 972 |           * ptr_d = 0.0;
 973 |           * ptr_e = 0.0;
 974 |         }
 975 |       } else {
 976 |         * ptr_d = * ptr_d * sqrt( temp );
 977 |       }
 978 |     } 
 979 |     ptr_A  += ldim_A;
 980 |     ptr_d  += st_d;
 981 |     ptr_e  += st_e;
 982 |     ptr_wt += st_wt;
 983 |   }
 984 | 
 985 |   return 0;
 986 | }
 987 | 
 988 | 
 989 | // ============================================================================
 990 | static int NoFLA_QRP_pivot_G_B_C( int j_max_col,
 991 |                int m_G, double * buff_G, int ldim_G, 
 992 |                int pivot_B, int m_B, double * buff_B, int ldim_B, 
 993 |                int pivot_C, int m_C, double * buff_C, int ldim_C, 
 994 |                int * buff_p,
 995 |                double * buff_d, double * buff_e ) {
 996 | //
 997 | // It pivots matrix G, pivot vector p, and norms vectors d and e.
 998 | // Matrices B and C are optionally pivoted.
 999 | //
1000 |   int     ival, i_one = 1;
1001 |   double  * ptr_g1, * ptr_g2, * ptr_b1, * ptr_b2, * ptr_c1, * ptr_c2;
1002 | 
1003 |   // Swap columns of G, pivots, and norms.
1004 |   if( j_max_col != 0 ) {
1005 | 
1006 |     // Swap full column 0 and column "j_max_col" of G.
1007 |     ptr_g1 = & buff_G[ 0 + 0         * ldim_G ];
1008 |     ptr_g2 = & buff_G[ 0 + j_max_col * ldim_G ];
1009 |     dswap_( & m_G, ptr_g1, & i_one, ptr_g2, & i_one );
1010 | 
1011 |     // Swap full column 0 and column "j_max_col" of B.
1012 |     if( pivot_B ) {
1013 |       ptr_b1 = & buff_B[ 0 + 0         * ldim_B ];
1014 |       ptr_b2 = & buff_B[ 0 + j_max_col * ldim_B ];
1015 |       dswap_( & m_B, ptr_b1, & i_one, ptr_b2, & i_one );
1016 |     }
1017 | 
1018 |     // Swap full column 0 and column "j_max_col" of C.
1019 |     if( pivot_C ) {
1020 |       ptr_c1 = & buff_C[ 0 + 0         * ldim_C ];
1021 |       ptr_c2 = & buff_C[ 0 + j_max_col * ldim_C ];
1022 |       dswap_( & m_C, ptr_c1, & i_one, ptr_c2, & i_one );
1023 |     }
1024 | 
1025 |     // Swap element 0 and element "j_max_col" of pivot vector "p".
1026 |     ival = buff_p[ j_max_col ];
1027 |     buff_p[ j_max_col ] = buff_p[ 0 ];
1028 |     buff_p[ 0 ] = ival;
1029 | 
1030 |     // Copy norms of column 0 to column "j_max_col".
1031 |     buff_d[ j_max_col ] = buff_d[ 0 ];
1032 |     buff_e[ j_max_col ] = buff_e[ 0 ];
1033 |   }
1034 | 
1035 |   return 0;
1036 | }
1037 | 
1038 | 


--------------------------------------------------------------------------------
/libflame_sources/FLA_HQRRP_UT_blk_var2.c:
--------------------------------------------------------------------------------
   1 | /*
   2 | ===============================================================================
   3 | Authors
   4 | ===============================================================================
   5 | 
   6 | Per-Gunnar Martinsson
   7 |   Dept. of Applied Mathematics, 
   8 |   University of Colorado at Boulder, 
   9 |   526 UCB, Boulder, CO 80309-0526, USA
  10 | 
  11 | Gregorio Quintana-Orti
  12 |   Depto. de Ingenieria y Ciencia de Computadores, 
  13 |   Universitat Jaume I, 
  14 |   12.071 Castellon, Spain
  15 | 
  16 | Nathan Heavner
  17 |   Dept. of Applied Mathematics, 
  18 |   University of Colorado at Boulder, 
  19 |   526 UCB, Boulder, CO 80309-0526, USA
  20 | 
  21 | Robert van de Geijn
  22 |   Dept. of Computer Science and Institute for Computational Engineering and 
  23 |   Sciences, 
  24 |   The University of Texas at Austin
  25 |   Austin, TX.
  26 | 
  27 | ===============================================================================
  28 | Copyright
  29 | ===============================================================================
  30 | 
  31 | Copyright (C) 2016, 
  32 |   Universitat Jaume I,
  33 |   University of Colorado at Boulder,
  34 |   The University of Texas at Austin.
  35 | 
  36 | ===============================================================================
  37 | Disclaimer
  38 | ===============================================================================
  39 | 
  40 | This code is distributed in the hope that it will be useful, but
  41 | WITHOUT ANY WARRANTY EXPRESSED OR IMPLIED.
  42 | 
  43 | */
  44 | 
  45 | #include <stdlib.h>
  46 | #include <stdio.h>
  47 | #include <math.h>
  48 | #include "FLAME.h"
  49 | #include "FLA_HQRRP_UT_blk_var2.h"
  50 | 
  51 | 
  52 | /*
  53 | // Matrices with dimensions smaller than THRESHOLD_FOR_DGEQPF are processed 
  54 | // with LAPACK's routine dgeqpf.
  55 | // Matrices with dimensions between THRESHOLD_FOR_DGEQPF and 
  56 | // THRESHOLD_FOR_DGEQP3 are processed with LAPACK's routine dgeqp3.
  57 | // Matrices with dimensions larger than THRESHOLD_FOR_DGEQP3 are processed 
  58 | // with the new HQRRP code.
  59 | #define THRESHOLD_FOR_DGEQPF   250
  60 | #define THRESHOLD_FOR_DGEQP3  1000
  61 | */
  62 | 
  63 | 
  64 | // ============================================================================
  65 | // Compilation declarations.
  66 | 
  67 | #undef CHECK_DOWNDATING_OF_Y
  68 | #undef PROFILE
  69 | 
  70 | 
  71 | // ============================================================================
  72 | // Declaration of local prototypes.
  73 | 
  74 | static int MyFLA_Normal_random_matrix( FLA_Obj A ); 
  75 | 
  76 | static double MyFLA_Normal_random_number( double mu, double sigma );
  77 | 
  78 | static FLA_Error MyFLA_Downdate_Y( FLA_Obj U11, FLA_Obj U21, FLA_Obj A12,
  79 |                      FLA_Obj T, 
  80 |                      FLA_Obj Y2, FLA_Obj G1, FLA_Obj G2 );
  81 | 
  82 | static FLA_Error MyFLA_Apply_Q_UT_lhfc_blk_var2( FLA_Obj U11, FLA_Obj U21,
  83 |                      FLA_Obj T, 
  84 |                      FLA_Obj B1, FLA_Obj B2 );
  85 | 
  86 | static FLA_Error MyFLA_Apply_Q_UT_rnfc_blk_var2( FLA_Obj U11, FLA_Obj U21,
  87 |                      FLA_Obj T, 
  88 |                      FLA_Obj B1, FLA_Obj B2 );
  89 | 
  90 | static FLA_Error MyFLA_QRPmod_UT_unb_var2( int pivoting, int num_stages, 
  91 |                      FLA_Obj A, FLA_Obj p, FLA_Obj t, 
  92 |                      int pivot_B, FLA_Obj B,
  93 |                      int pivot_C, FLA_Obj C,
  94 |                      int build_T, FLA_Obj T );
  95 | 
  96 | static FLA_Error MyFLA_Apply_H2_UT_l_opd_var2( 
  97 |                      int m_u2_A2,
  98 |                      int n_a1t,
  99 |                      double * tau,
 100 |                      double * u2, int inc_u2,
 101 |                      double * a1t, int inc_a1t,
 102 |                      double * A2, int ldim_A2,
 103 |                      double * workspace );
 104 | 
 105 | static FLA_Error MyFLA_QRP_compute_norms( FLA_Obj A, FLA_Obj d, FLA_Obj e );
 106 | 
 107 | static FLA_Error NoFLA_QRP_pivot_B_C( int j_max_col,
 108 |                      int m_G, double * buff_G, int ldim_G, 
 109 |                      int pivot_B, int m_B, double * buff_B, int ldim_B, 
 110 |                      int pivot_C, int m_C, double * buff_C, int ldim_C, 
 111 |                      int * buff_p,
 112 |                      double * buff_d, double * buff_e );
 113 | 
 114 | static FLA_Error NoFLA_QRP_downdate_partial_norms( int m_A, int n_A,
 115 |                      double * buff_d,  int st_d,
 116 |                      double * buff_e,  int st_e,
 117 |                      double * buff_wt, int st_wt,
 118 |                      double * buff_A,  int ldim_A );
 119 | 
 120 | static int NoFLA_my_idamax( int n, double * buff_data );
 121 | 
 122 | 
 123 | // ============================================================================
 124 | int FLA_HQRRP_UT_blk_var2( FLA_Obj A, FLA_Obj p, FLA_Obj s, 
 125 |         int nb_alg, int pp, int panel_pivoting ) {
 126 | //
 127 | // HQRRP: It computes the Householder QR with Randomized Pivoting of matrix A.
 128 | // This routine is almost compatible with LAPACK's dgeqp3.
 129 | // The main difference is that this routine does not manage fixed columns.
 130 | //
 131 | // Main features:
 132 | //   * BLAS-3 based.
 133 | //   * Norm downdating method by Drmac.
 134 | //   * Downdating for computing Y.
 135 | //   * Use of libflame.
 136 | //   * UT transformations are used.
 137 | //
 138 | // Arguments:
 139 | // ----------
 140 | // A:              Matrix to be factorized.
 141 | // p:              output vector with the pivots.
 142 | // s:              Output vector with the tau values of the Householder factors.
 143 | // nb_alg:         Block size. Usual values for nb_alg are 32, 64, etc.
 144 | // pp:             Oversampling size. Usual values for pp are 5, 10, etc.
 145 | // panel_pivoting: If panel_pivoting==1, QR with pivoting is applied to 
 146 | //                 factorize the panels of matrix A. Otherwise, QR without 
 147 | //                 pivoting is used. Usual value for panel_pivoting is 1.
 148 | //
 149 |   // Declaration of variables.
 150 |   FLA_Obj ATL, ATR,    A00, A01, A02, 
 151 |           ABL, ABR,    A10, A11, A12,
 152 |                        A20, A21, A22;
 153 |   FLA_Obj pT,          p0,
 154 |           pB,          p1,
 155 |                        p2;
 156 |   FLA_Obj sT,          s0,
 157 |           sB,          s1,
 158 |                        s2;
 159 |   FLA_Obj GL,  GR,     G0,   G1,  G2;
 160 |   FLA_Obj YL,  YR,     Y0,   Y1,  Y2;
 161 |   FLA_Obj VL,  VR,     V0,   V1,  V2;
 162 |   FLA_Obj WL,  WR,     W0,   W1,  W2;
 163 |   FLA_Obj AR, AB1, G, Y, V, W, T1_T, None;
 164 |   int     bRow, m_A, n_A, mn_A, dtype_A, last_iter;
 165 | #ifdef PROFILE
 166 |   double  t1, t2, tt_qrp1, tt_qrp2, tt_updt, tt_down;
 167 | #endif
 168 | 
 169 |   // Executable Statements.
 170 |   //// printf( "%% FLA_HQRRP_UT_blk_var2.\n" );
 171 | 
 172 |   // Some initializations.
 173 |   m_A     = FLA_Obj_length( A );
 174 |   n_A     = FLA_Obj_width ( A );
 175 |   mn_A    = min( m_A, n_A );
 176 |   dtype_A = FLA_Obj_datatype( A );
 177 | 
 178 |   // Quick return.
 179 |   if( mn_A == 0 ) {
 180 |     return 0;
 181 |   }
 182 | 
 183 |   // Initialize the seed for the generator of random numbers.
 184 |   srand( 12 );
 185 | 
 186 | #ifdef PROFILE
 187 |   tt_qrp1 = 0.0;
 188 |   tt_qrp2 = 0.0;
 189 |   tt_updt = 0.0;
 190 |   tt_down = 0.0;
 191 | #endif
 192 | 
 193 |   // Create and initialize auxiliary objects.
 194 |   FLA_Obj_create( dtype_A, nb_alg + pp, m_A,    0, 0, & G );
 195 |   FLA_Obj_create( dtype_A, nb_alg + pp, n_A,    0, 0, & Y );
 196 |   FLA_Obj_create( dtype_A, nb_alg + pp, n_A,    0, 0, & V );
 197 |   FLA_Obj_create( dtype_A, nb_alg,      n_A,    0, 0, & W );
 198 | 
 199 |   // Initialize matrices G and Y.
 200 |   MyFLA_Normal_random_matrix( G );
 201 |   FLA_Gemm( FLA_NO_TRANSPOSE, FLA_NO_TRANSPOSE, 
 202 |             FLA_ONE, G, A, FLA_ZERO, Y );
 203 | 
 204 |   // Initial Partitioning.
 205 |   FLA_Part_2x2( A,    & ATL, & ATR,
 206 |                       & ABL, & ABR,    0, 0, FLA_TL );
 207 |   FLA_Part_2x1( p,    & pT, 
 208 |                       & pB,            0, FLA_TOP );
 209 |   FLA_Part_2x1( s,    & sT,
 210 |                       & sB,            0, FLA_TOP );
 211 |   FLA_Part_1x2( G,    & GL,  & GR,     0, FLA_LEFT );
 212 |   FLA_Part_1x2( Y,    & YL,  & YR,     0, FLA_LEFT );
 213 |   FLA_Part_1x2( V,    & VL,  & VR,     0, FLA_LEFT );
 214 |   FLA_Part_1x2( W,    & WL,  & WR,     0, FLA_LEFT );
 215 | 
 216 |   // Main Loop.
 217 |   while( ( FLA_Obj_length( ATL ) < FLA_Obj_length( A ) )&&
 218 |          ( FLA_Obj_width ( ATL ) < FLA_Obj_width ( A ) ) ) {
 219 |     bRow = min( FLA_Obj_min_dim( ABR ), nb_alg );
 220 | 
 221 |     // Iteration Initial Partitioning.
 222 |     FLA_Repart_2x2_to_3x3( ATL, /**/ ATR,       &A00, /**/ &A01, &A02,
 223 |                         /* ************* */   /* ******************** */
 224 |                                                 &A10, /**/ &A11, &A12,
 225 |                            ABL, /**/ ABR,       &A20, /**/ &A21, &A22,
 226 |                            bRow, bRow, FLA_BR );
 227 |     FLA_Repart_2x1_to_3x1( pT,                  &p0, 
 228 |                         /* ** */              /* *** */
 229 |                                                 &p1, 
 230 |                            pB,                  &p2,    bRow, FLA_BOTTOM );
 231 |     FLA_Repart_2x1_to_3x1( sT,                  &s0, 
 232 |                         /* ** */              /* ** */
 233 |                                                 &s1, 
 234 |                            sB,                  &s2,        bRow, FLA_BOTTOM );
 235 |     FLA_Repart_1x2_to_1x3( GL,   /**/ GR,       &G0,  /**/ &G1,  &G2,
 236 |                            bRow, FLA_RIGHT );
 237 |     FLA_Repart_1x2_to_1x3( YL,   /**/ YR,       &Y0,  /**/ &Y1,  &Y2,
 238 |                            bRow, FLA_RIGHT );
 239 |     FLA_Repart_1x2_to_1x3( VL,   /**/ VR,       &V0,  /**/ &V1,  &V2,
 240 |                            bRow, FLA_RIGHT );
 241 |     FLA_Repart_1x2_to_1x3( WL,   /**/ WR,       &W0,  /**/ &W1,  &W2,
 242 |                            bRow, FLA_RIGHT );
 243 | 
 244 |     // ------------------------------------------------------------------------
 245 | 
 246 |     //// printf( "Iter: %ld \n", FLA_Obj_length( ATL ) );
 247 | 
 248 |     // Check whether it is the last iteration.
 249 |     last_iter = ( FLA_Obj_min_dim( A22 ) <= 0 ? 1 : 0 );
 250 | 
 251 | #ifdef CHECK_DOWNDATING_OF_Y
 252 |     // Check downdating of matrix Y: Compare downdated matrix Y with 
 253 |     // matrix Y computed from scratch.
 254 |     int      m_cyr, n_cyr, ldim_cyr, ldim_YR, ii, jj; 
 255 |     double   * buff_cyr, * buff_YR, aux, sum;
 256 |     FLA_Obj  CYR;
 257 | 
 258 |     FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, YR, & CYR );
 259 | 
 260 |     FLA_Gemm( FLA_NO_TRANSPOSE, FLA_NO_TRANSPOSE, 
 261 |               FLA_ONE, GR, ABR, FLA_ZERO, CYR ); 
 262 | 
 263 |     //// FLA_Obj_show( " CYR = [ ", CYR, "%le", " ];" );
 264 |     //// FLA_Obj_show( " Y  = [ ", Y, "%le", " ];" );
 265 | 
 266 |     m_cyr    = FLA_Obj_length( CYR );
 267 |     n_cyr    = FLA_Obj_width( CYR );
 268 |     buff_cyr = ( double * ) FLA_Obj_buffer_at_view( CYR );
 269 |     ldim_cyr = FLA_Obj_col_stride( CYR );
 270 |     buff_YR   = ( double * ) FLA_Obj_buffer_at_view( YR );
 271 |     ldim_YR  = FLA_Obj_col_stride( YR );
 272 |     sum = 0.0;
 273 |     for( jj = 0; jj < n_cyr; jj++ ) {
 274 |       for( ii = 0; ii < m_cyr; ii++ ) {
 275 |         aux = buff_YR [ ii + jj * ldim_YR ] -
 276 |               buff_cyr[ ii + jj * ldim_cyr ];
 277 |         sum += aux * aux;
 278 |       }
 279 |     }
 280 |     sum = sqrt( sum );
 281 |     printf( "%%  diff between Y and downdated Y: %le\n", sum );
 282 | 
 283 |     FLA_Obj_free( & CYR );
 284 | #endif
 285 | 
 286 |     if( last_iter == 0 ) {
 287 |       // Compute QRP of YR, and apply permutations to matrix AR.
 288 |       // A copy of YR is made into VR, and permutations are applied to YR.
 289 | #ifdef PROFILE
 290 |       t1 = FLA_Clock();
 291 | #endif
 292 |       FLA_Merge_2x1( ATR,
 293 |                      ABR,   & AR );
 294 |       FLA_Copy( YR, VR );
 295 |       MyFLA_QRPmod_UT_unb_var2( 1, bRow, VR, pB, sB, 1, AR, 1, YR, 0, None );
 296 | 
 297 | #ifdef PROFILE
 298 |       t2 = FLA_Clock();
 299 |       tt_qrp1 += ( t2 - t1 );
 300 | #endif
 301 |     }
 302 | 
 303 |     //
 304 |     // Compute QRP of panel AB1 = [ A11; A21 ].
 305 |     // Apply same permutations to A01 and Y1, and build T1_T.
 306 |     //
 307 | #ifdef PROFILE
 308 |     t1 = FLA_Clock();
 309 | #endif
 310 | 
 311 |     FLA_Part_2x1( W1,   & T1_T,
 312 |                         & None,    bRow, FLA_TOP );
 313 |     FLA_Merge_2x1( A11,
 314 |                    A21,   & AB1 );
 315 |     MyFLA_QRPmod_UT_unb_var2( panel_pivoting, -1, AB1, p1, s1, 
 316 |         1, A01, 1, Y1, 1, T1_T );
 317 | 
 318 | #ifdef PROFILE
 319 |     t2 = FLA_Clock();
 320 |     tt_qrp2 += ( t2 - t1 );
 321 | #endif
 322 | 
 323 |     //
 324 |     // Update the rest of the matrix.
 325 |     //
 326 |     if ( FLA_Obj_width( A12 ) > 0 ) {
 327 |       // Apply the Householder transforms associated with AB1 = [ A11; A21 ] 
 328 |       // and T1_T to [ A12; A22 ]:
 329 |       //   / A12 \ := QB1' / A12 \
 330 |       //   \ A22 /         \ A22 /
 331 |       // where QB1 is formed from AB1 and T1_T.
 332 | #ifdef PROFILE
 333 |       t1 = FLA_Clock();
 334 | #endif
 335 | 
 336 |       MyFLA_Apply_Q_UT_lhfc_blk_var2( A11, A21, T1_T, A12, A22 );
 337 | 
 338 | #ifdef PROFILE
 339 |       t2 = FLA_Clock();
 340 |       tt_updt += ( t2 - t1 );
 341 | #endif
 342 |     }
 343 | 
 344 |     //
 345 |     // Downdate matrix Y.
 346 |     //
 347 |     if ( FLA_Obj_width( Y2 ) > 0 ) {
 348 | #ifdef PROFILE
 349 |       t1 = FLA_Clock();
 350 | #endif
 351 | 
 352 |       MyFLA_Downdate_Y( A11, A21, A12, T1_T, Y2, G1, G2 );
 353 | 
 354 | #ifdef PROFILE
 355 |       t2 = FLA_Clock();
 356 |       tt_down += ( t2 - t1 );
 357 | #endif
 358 |     }
 359 | 
 360 |     // ------------------------------------------------------------------------
 361 |     // Iteration Final Repartitioning.
 362 | 
 363 |     FLA_Cont_with_3x3_to_2x2( &ATL, /**/ &ATR,       A00, A01, /**/ A02,
 364 |                                                      A10, A11, /**/ A12,
 365 |                             /* ************** */  /* ****************** */
 366 |                               &ABL, /**/ &ABR,       A20, A21, /**/ A22,
 367 |                               FLA_TL );
 368 |     FLA_Cont_with_3x1_to_2x1( &pT,                   p0, 
 369 |                                                      p1, 
 370 |                             /* ** */              /* *** */
 371 |                               &pB,                   p2,     FLA_TOP );
 372 |     FLA_Cont_with_3x1_to_2x1( &sT,                   s0, 
 373 |                                                      s1, 
 374 |                             /* ** */              /* ** */
 375 |                               &sB,                   s2,     FLA_TOP );
 376 |     FLA_Cont_with_1x3_to_1x2( &GL,   /**/ &GR,       G0,  G1,  /**/ G2,
 377 |                               FLA_LEFT );
 378 |     FLA_Cont_with_1x3_to_1x2( &YL,   /**/ &YR,       Y0,  Y1,  /**/ Y2,
 379 |                               FLA_LEFT );
 380 |     FLA_Cont_with_1x3_to_1x2( &VL,   /**/ &VR,       V0,  V1,  /**/ V2,
 381 |                               FLA_LEFT );
 382 |     FLA_Cont_with_1x3_to_1x2( &WL,   /**/ &WR,       W0,  W1,  /**/ W2,
 383 |                               FLA_LEFT );
 384 |   }
 385 | 
 386 |   // Remove auxiliary objects.
 387 |   FLA_Obj_free( & G );
 388 |   FLA_Obj_free( & Y );
 389 |   FLA_Obj_free( & V );
 390 |   FLA_Obj_free( & W );
 391 | 
 392 | #ifdef PROFILE
 393 |   printf( "%% tt_qrp_1:       %le\n", tt_qrp1 );
 394 |   printf( "%% tt_qrp_2:       %le\n", tt_qrp2 );
 395 |   printf( "%% tt_updt_rest:   %le\n", tt_updt );
 396 |   printf( "%% tt_downdating:  %le\n", tt_down );
 397 |   printf( "%% total_time:     %le\n", tt_qrp1 + tt_qrp2 + tt_updt + tt_down);
 398 | #endif
 399 | 
 400 |   return 0;
 401 | }
 402 | 
 403 | 
 404 | // ============================================================================
 405 | static int MyFLA_Normal_random_matrix( FLA_Obj A ) {
 406 | //
 407 | // It sets a random matrix with normal distribution.
 408 | //
 409 | 
 410 |   switch( FLA_Obj_datatype( A ) ) {
 411 | 
 412 |   case FLA_DOUBLE:
 413 |     {
 414 |       double  * buff_A;
 415 |       int     m_A, n_A, ldim_A, i, j;
 416 | 
 417 |       // Some initializations.
 418 |       m_A     = FLA_Obj_length( A );
 419 |       n_A     = FLA_Obj_width ( A );
 420 |       buff_A  = ( double * ) FLA_Obj_buffer_at_view( A );
 421 |       ldim_A  = FLA_Obj_col_stride( A );
 422 | 
 423 |       // Main loop.
 424 |       for ( j = 0; j < n_A; j++ ) {
 425 |         for ( i = 0; i < m_A; i++ ) {
 426 |           buff_A[ i + j * ldim_A ] = MyFLA_Normal_random_number( 0.0, 1.0 );
 427 |         }
 428 |       }
 429 |     }
 430 |     break;
 431 | 
 432 |   default:
 433 |     fprintf( stderr, "+++ ERROR in MyFLA_Normal_random_matrix:  " );
 434 |     fprintf( stderr, "Datatype not implemented: %d\n", FLA_Obj_datatype( A ) );
 435 |   }
 436 | 
 437 |   return FLA_SUCCESS;
 438 | }
 439 | 
 440 | /* ========================================================================= */
 441 | static double MyFLA_Normal_random_number( double mu, double sigma ) {
 442 |   static int     alternate_calls = 0;
 443 |   static double  b1, b2;
 444 |   double         c1, c2, a, factor;
 445 | 
 446 |   // Quick return.
 447 |   if( alternate_calls == 1 ) {
 448 |     alternate_calls = ! alternate_calls;
 449 |     return( mu + sigma * b2 );
 450 |   }
 451 |   // Main loop.
 452 |   do {
 453 |     c1 = -1.0 + 2.0 * ( (double) rand() / RAND_MAX );
 454 |     c2 = -1.0 + 2.0 * ( (double) rand() / RAND_MAX );
 455 |     a = c1 * c1 + c2 * c2;
 456 |   } while ( ( a == 0 )||( a >= 1 ) );
 457 |   factor = sqrt( ( -2 * log( a ) ) / a );
 458 |   b1 = c1 * factor;
 459 |   b2 = c2 * factor;
 460 |   alternate_calls = ! alternate_calls;
 461 |   return( mu + sigma * b1 );
 462 | }
 463 | 
 464 | // ============================================================================
 465 | static FLA_Error MyFLA_Downdate_Y( FLA_Obj U11, FLA_Obj U21, FLA_Obj A12,
 466 |                      FLA_Obj T, 
 467 |                      FLA_Obj Y2, FLA_Obj G1, FLA_Obj G2 ) {
 468 | //
 469 | // It downdates matrix Y, and updates matrix G.
 470 | // Only Y2 of Y is updated.
 471 | // Only G1 and G2 of G are updated.
 472 | //
 473 | // Y2 = Y2 - ( G1 - ( G1*U11 + G2*U21 ) * inv( T11 ) * U11' ) * R12.
 474 | //
 475 |   FLA_Obj  B;
 476 | 
 477 |   // Create auxiliary object B.
 478 |   FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, G1, & B );
 479 | 
 480 |   // B = G1.
 481 |   FLA_Copy( G1, B );
 482 | 
 483 |   // B = B * U11.
 484 |   FLA_Trmm( FLA_RIGHT, FLA_LOWER_TRIANGULAR,
 485 |             FLA_NO_TRANSPOSE, FLA_UNIT_DIAG,
 486 |             FLA_ONE, U11, B );
 487 | 
 488 |   // B = B + G2 * U21.
 489 |   FLA_Gemm( FLA_NO_TRANSPOSE, FLA_NO_TRANSPOSE,
 490 |             FLA_ONE, G2, U21, FLA_ONE, B );
 491 | 
 492 |   // B = B * inv( T11 ).
 493 |   FLA_Trsm( FLA_RIGHT, FLA_UPPER_TRIANGULAR,
 494 |             FLA_NO_TRANSPOSE, FLA_NONUNIT_DIAG,
 495 |             FLA_ONE, T, B );
 496 | 
 497 |   // B = - B * U11^H.
 498 |   FLA_Trmm( FLA_RIGHT, FLA_LOWER_TRIANGULAR,
 499 |             FLA_CONJ_TRANSPOSE, FLA_UNIT_DIAG,
 500 |             FLA_MINUS_ONE, U11, B );
 501 | 
 502 |   // B = G1 + B.
 503 |   FLA_Axpy( FLA_ONE, G1, B );
 504 | 
 505 |   // Y2 = Y2 - B * R12.
 506 |   FLA_Gemm( FLA_NO_TRANSPOSE, FLA_NO_TRANSPOSE,
 507 |             FLA_MINUS_ONE, B, A12, FLA_ONE, Y2 );
 508 | 
 509 |   //
 510 |   // GR = GR * Q
 511 |   //
 512 |   MyFLA_Apply_Q_UT_rnfc_blk_var2( U11, U21, T, G1, G2 );
 513 | 
 514 |   // Remove auxiliary object B.
 515 |   FLA_Obj_free( & B );
 516 | 
 517 |   return FLA_SUCCESS;
 518 | }
 519 | 
 520 | // ============================================================================
 521 | static FLA_Error MyFLA_Apply_Q_UT_lhfc_blk_var2( FLA_Obj U11, FLA_Obj U21,
 522 |                      FLA_Obj T, 
 523 |                      FLA_Obj B1, FLA_Obj B2 ) {
 524 | //
 525 | // It applies the conjugate-transpose of a unitary matrix Q to a matrix B from 
 526 | // the left:
 527 | //   B := Q' B
 528 | // where:
 529 | //   B = [ B1; B2 ]
 530 | //   U = [ U11; U21 ]
 531 | //   Q = ( I - U * inv(T)' * U' )'.
 532 | //
 533 |   FLA_Obj  W;
 534 | 
 535 |   // Create auxiliary object.
 536 |   FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, B1, & W );
 537 | 
 538 |   // W = B1;
 539 | 
 540 |   FLA_Copyt( FLA_NO_TRANSPOSE, B1, W );
 541 | 
 542 |   // U11 = trilu( U11 );
 543 |   // U21 = U21;
 544 |   // W = triu( T )' * ( U11' * B1 + U21' * B2 );
 545 | 
 546 |   FLA_Trmm( FLA_LEFT, FLA_LOWER_TRIANGULAR,
 547 |             FLA_CONJ_TRANSPOSE, FLA_UNIT_DIAG,
 548 |             FLA_ONE, U11, W );
 549 | 
 550 |   FLA_Gemm( FLA_CONJ_TRANSPOSE, FLA_NO_TRANSPOSE, 
 551 |             FLA_ONE, U21, B2, FLA_ONE, W );
 552 | 
 553 |   FLA_Trsm( FLA_LEFT, FLA_UPPER_TRIANGULAR,
 554 |             FLA_CONJ_TRANSPOSE, FLA_NONUNIT_DIAG,
 555 |             FLA_ONE, T, W );
 556 | 
 557 |   // B2 = B2 - U21 * W;
 558 |   // B1 = B1 - U11 * W;
 559 | 
 560 |   FLA_Gemm( FLA_NO_TRANSPOSE, FLA_NO_TRANSPOSE,
 561 |             FLA_MINUS_ONE, U21, W, FLA_ONE, B2 );
 562 | 
 563 |   FLA_Trmm( FLA_LEFT, FLA_LOWER_TRIANGULAR,
 564 |             FLA_NO_TRANSPOSE, FLA_UNIT_DIAG,
 565 |             FLA_MINUS_ONE, U11, W );
 566 | 
 567 |   FLA_Axpyt( FLA_NO_TRANSPOSE, FLA_ONE, W, B1 );
 568 | 
 569 |   // Remove auxiliary object.
 570 |   FLA_Obj_free( & W );
 571 | 
 572 |   return FLA_SUCCESS;
 573 | }
 574 | 
 575 | // ============================================================================
 576 | static FLA_Error MyFLA_Apply_Q_UT_rnfc_blk_var2( FLA_Obj U11, FLA_Obj U21, 
 577 |                      FLA_Obj T,
 578 |                      FLA_Obj B1, FLA_Obj B2 ) {
 579 | //
 580 | // It applies a unitary matrix Q to a matrix B from the right:
 581 | //   B = B * Q
 582 | // where:
 583 | //   B = [ B1; B2 ]
 584 | //   U = [ U11; U21 ]
 585 | //   Q = ( I - U * inv(T)' * U' ).
 586 | //
 587 |   FLA_Obj  W;
 588 | 
 589 |   // Create auxiliary object.
 590 |   FLA_Obj_create_conf_to( FLA_TRANSPOSE, B1, & W );
 591 | 
 592 |   // W = B1^T;
 593 | 
 594 |   FLA_Copyt( FLA_TRANSPOSE, B1, W );
 595 | 
 596 |   // U11 = trilu( U11 );
 597 |   // U21 = U21;
 598 |   // Let W^T be conformal to B1.
 599 |   // W^T = ( B1 * U11 + B2 * U21 ) * inv( triu(T) );
 600 |   // W   = triu(T)' * ( U11' * B1' + U21' * B2' );
 601 | 
 602 |   FLA_Trmm( FLA_LEFT, FLA_LOWER_TRIANGULAR,
 603 |             FLA_TRANSPOSE, FLA_UNIT_DIAG,
 604 |             FLA_ONE, U11, W );
 605 | 
 606 |   FLA_Gemm( FLA_TRANSPOSE, FLA_TRANSPOSE, 
 607 |             FLA_ONE, U21, B2, FLA_ONE, W );
 608 | 
 609 |   FLA_Trsm( FLA_LEFT, FLA_UPPER_TRIANGULAR,
 610 |             FLA_TRANSPOSE, FLA_NONUNIT_DIAG,
 611 |             FLA_ONE, T, W );
 612 | 
 613 |   // B2 = B2 - W^T * U21';
 614 |   // B1 = B1 - W^T * U11';
 615 |   //    = B1 - ( conj(U11) * W )^T;
 616 | 
 617 |   FLA_Gemm( FLA_TRANSPOSE, FLA_CONJ_TRANSPOSE,
 618 |             FLA_MINUS_ONE, W, U21, FLA_ONE, B2 );
 619 | 
 620 |   FLA_Trmm( FLA_LEFT, FLA_LOWER_TRIANGULAR,
 621 |             FLA_CONJ_NO_TRANSPOSE, FLA_UNIT_DIAG,
 622 |             FLA_MINUS_ONE, U11, W );
 623 | 
 624 |   FLA_Axpyt( FLA_TRANSPOSE, FLA_ONE, W, B1 );
 625 | 
 626 |   // Remove auxiliary object.
 627 |   FLA_Obj_free( & W );
 628 | 
 629 |   return FLA_SUCCESS;
 630 | }
 631 | 
 632 | /* ========================================================================= */
 633 | static FLA_Error MyFLA_QRPmod_UT_unb_var2( int pivoting, int num_stages, 
 634 |                      FLA_Obj A, FLA_Obj p, FLA_Obj t, 
 635 |                      int pivot_B, FLA_Obj B,
 636 |                      int pivot_C, FLA_Obj C,
 637 |                      int build_T, FLA_Obj T ) {
 638 | //
 639 | // It computes an unblocked QR factorization of matrix A with or without 
 640 | // pivoting. Matrices B and C are optionally pivoted, and matrix T is
 641 | // optionally built.
 642 | //
 643 | // Arguments:
 644 | // "pivoting": If pivoting==1, then QR factorization with pivoting is used.
 645 | // "numstages": It tells the number of columns that are factorized.
 646 | //   If "num_stages" is negative, the whole matrix A is factorized.
 647 | //   If "num_stages" is positive, only the first "num_stages" are factorized.
 648 | // "pivot_B": if "pivot_B" is true, matrix "B" is pivoted too.
 649 | // "pivot_C": if "pivot_C" is true, matrix "C" is pivoted too.
 650 | // "build_T": if "build_T" is true, matrix "T" is built.
 651 | //
 652 |   // Declaration of variables.
 653 |   FLA_Obj  d, e, workspace;
 654 |   int      j, m_A, n_A, mn_A, m_B, m_C, dtype_A, ldim_A, ldim_B, ldim_C, 
 655 |            ldim_T, m_A20, n_A20, m_a21, m_A22, n_A22, n_dB, idx_max_col, 
 656 |            i_one = 1;
 657 |   double   * buff_A, * buff_t, * buff_B, * buff_C, * buff_T, 
 658 |            * buff_d, * buff_e, * buff_workspace, d_one = 1.0;
 659 |   int      * buff_p;
 660 | 
 661 |   int idamax_();
 662 | 
 663 |   //// printf( "  %% MyFLA_QRPmod_UT_unb_var2. build_T: %d\n", build_T );
 664 | 
 665 |   // Some initializations.
 666 |   dtype_A = FLA_Obj_datatype( A );
 667 |   m_A     = FLA_Obj_length( A );
 668 |   n_A     = FLA_Obj_width ( A );
 669 |   mn_A    = min( m_A, n_A );
 670 |   ldim_A  = FLA_Obj_col_stride( A );
 671 |   buff_A  = ( double * ) FLA_Obj_buffer_at_view( A );
 672 | 
 673 |   buff_p  = ( int * )    FLA_Obj_buffer_at_view( p );
 674 | 
 675 |   buff_t  = ( double * ) FLA_Obj_buffer_at_view( t );
 676 | 
 677 |   if( pivot_B ) {
 678 |     m_B     = FLA_Obj_length( B );
 679 |     buff_B  = ( double * ) FLA_Obj_buffer_at_view( B );
 680 |     ldim_B  = FLA_Obj_col_stride( B );
 681 |   }
 682 | 
 683 |   if( pivot_C ) {
 684 |     m_C     = FLA_Obj_length( C );
 685 |     buff_C  = ( double * ) FLA_Obj_buffer_at_view( C );
 686 |     ldim_C  = FLA_Obj_col_stride( C );
 687 |   }
 688 | 
 689 |   if( build_T ) {
 690 |     buff_T  = ( double * ) FLA_Obj_buffer_at_view( T );
 691 |     ldim_T  = FLA_Obj_col_stride( T );
 692 |   }
 693 | 
 694 |   // Set the number of stages, if needed.
 695 |   if( num_stages < 0 ) {
 696 |     num_stages = mn_A;
 697 |   }
 698 | 
 699 |   // Create auxiliary objects.
 700 |   FLA_Obj_create( dtype_A, n_A, 1, 0, 0, & d );
 701 |   FLA_Obj_create( dtype_A, n_A, 1, 0, 0, & e );
 702 |   FLA_Obj_create( dtype_A, n_A, 1, 0, 0, & workspace );
 703 | 
 704 |   buff_d = ( double * ) FLA_Obj_buffer_at_view( d );
 705 |   buff_e = ( double * ) FLA_Obj_buffer_at_view( e );
 706 |   buff_workspace = ( double * ) FLA_Obj_buffer_at_view( workspace );
 707 | 
 708 |   if( pivoting == 1 ) {
 709 |     // Compute initial norms of A into d and e.
 710 |     MyFLA_QRP_compute_norms( A, d, e );
 711 |   }
 712 | 
 713 |   // Main Loop.
 714 |   //// printf( "  m x n: %d x %d  num_stages: %d \n", m_A, n_A, num_stages );
 715 |   for( j = 0; j < num_stages; j++ ) {
 716 |     //// printf( "%% ----------------------------------\n" );
 717 |     //// printf( "%% Iter: %d \n", j );
 718 |     //// printf( "%% ----------------------------------\n" );
 719 | 
 720 |     n_dB  = n_A - j;
 721 |     m_a21 = m_A - j - 1;
 722 |     m_A22 = m_A - j - 1;
 723 |     n_A22 = n_A - j - 1;
 724 |     m_A20 = m_A - j - 1;
 725 |     n_A20 = j;
 726 | 
 727 |     if( pivoting == 1 ) {
 728 |       // Obtain the index of the column with largest 2-norm.
 729 |       //// FLA_Amax( dB, max_col );
 730 |       // Function idamax of MKL can fail by returning an index equal or larger 
 731 |       // than the dimension when working with small numbers. Hence, an own 
 732 |       // implementation is used.
 733 |       //// printf( "  Chivato 901. n_dB: %d  my_idamax:   %d\n", 
 734 |       ////         n_dB, NoFLA_my_idamax( n_dB, & buff_d[ j ] ) );
 735 |       //// printf( "  Chivato 902. n_dB: %d  idx_max_col: %d\n", 
 736 |       ////         n_dB, idamax_( & n_dB, & buff_d[ j ], & i_one ) - 1 );
 737 |       //// idx_max_col = idamax_( & n_dB, & buff_d[ j ], & i_one ) - 1;
 738 |       idx_max_col = NoFLA_my_idamax( n_dB, & buff_d[ j ] );
 739 | 
 740 |       // Swap columns of G, pivots, and norms.
 741 |       //// FLA_QRP_pivot_B( max_col, GR, BR, pB, dB, eB );
 742 |       NoFLA_QRP_pivot_B_C( idx_max_col,
 743 |           m_A, & buff_A[ 0 + j * ldim_A ], ldim_A,
 744 |           pivot_B, m_B, & buff_B[ 0 + j * ldim_B ], ldim_B,
 745 |           pivot_C, m_C, & buff_C[ 0 + j * ldim_C ], ldim_C,
 746 |           & buff_p[ j ],
 747 |           & buff_d[ j ],
 748 |           & buff_e[ j ] );
 749 |     }
 750 | 
 751 |     // Compute tau1 and u21 from alpha11 and a21 such that tau1 and u21
 752 |     // determine a Householder transform H such that applying H from the
 753 |     // left to the column vector consisting of alpha11 and a21 annihilates
 754 |     // the entries in a21 (and updates alpha11).
 755 |     //// FLA_Househ2( alpha11,
 756 |     ////              a21, tau1 );
 757 |     //// FLA_Househ2_UT( FLA_LEFT,
 758 |     ////                 alpha11,
 759 |     ////                 a21, tau1 );
 760 | 
 761 |     FLA_Househ2_UT_l_opd( m_a21,
 762 |                           & buff_A[ j + j * ldim_A ],
 763 |                           & buff_A[ ( j+1 ) + j * ldim_A ], i_one,
 764 |                           & buff_t[ j ] );
 765 | 
 766 |     // / a12t \ =  H / a12t \
 767 |     // \ A22  /      \ A22  /
 768 |     //
 769 |     // where H is formed from tau1 and u21.
 770 |     //// FLA_QR_Update_Rest_blk_b2( tau1, a21, a12t,
 771 |     ////                                       A22 );
 772 |     /// FLA_Apply_H2_UT( FLA_LEFT, tau1, a21, a12t,
 773 |     ///                                       A22 );
 774 | 
 775 |     MyFLA_Apply_H2_UT_l_opd_var2(
 776 |         m_A22, //// m_u2_A2,
 777 |         n_A22, //// n_a1t,
 778 |         & buff_t[ j ], //// tau,
 779 |         & buff_A[ ( j+1 ) + j * ldim_A ], 1, //// u2,inc_u2,
 780 |         & buff_A[ j + ( j+1 ) * ldim_A ], ldim_A, //// a1t,inc_a1t,
 781 |         & buff_A[ ( j+1 ) + ( j+1 ) * ldim_A ], ldim_A,
 782 |         buff_workspace ); //// A2,rs_A2,cs_A2);
 783 | 
 784 |     // Build T.
 785 |     if( build_T ) {
 786 |       // rho11 = tau1;
 787 |       //// FLA_Copy( tau1, rho11 );
 788 |       buff_T[ j + j * ldim_T ] = buff_t[ j ];
 789 | 
 790 |       // t01 = a10t' + A20' * u21;
 791 |       //// FLA_Copyt_external( FLA_CONJ_TRANSPOSE, a10t, r01 );
 792 |       dcopy_( & j, & buff_A[ j + 0 * ldim_A ], & ldim_A,
 793 |                    & buff_T[ 0 + j * ldim_T ], & i_one );
 794 | 
 795 |       //// FLA_Gemv_external( FLA_CONJ_TRANSPOSE, FLA_ONE, A20, a21, 
 796 |       ////                    FLA_ONE, r01 );
 797 |       dgemv_( "Transpose", & m_A20, & n_A20,
 798 |           & d_one,
 799 |           & buff_A[ ( j+1 ) + 0 * ldim_A ], & ldim_A,
 800 |           & buff_A[ ( j+1 ) + j * ldim_A ], & i_one,
 801 |           & d_one,
 802 |           & buff_T[ 0 + j * ldim_T ], & i_one );
 803 |     }
 804 | 
 805 |     if( pivoting == 1 ) {
 806 |       // Update partial column norms.
 807 |       //// FLA_QRP_update_partial_norms( dB, eB, a12t, A22 );
 808 | 
 809 |       NoFLA_QRP_downdate_partial_norms( m_A22, n_A22, 
 810 |           & buff_d[ j+1 ], 1,
 811 |           & buff_e[ j+1 ], 1,
 812 |           & buff_A[ j + ( j+1 ) * ldim_A ], ldim_A,
 813 |           & buff_A[ ( j+1 ) + ( j+1 ) * ldim_A ], ldim_A );
 814 |     }
 815 |   }
 816 | 
 817 |   // Remove auxiliary objects.
 818 |   FLA_Obj_free( & d );
 819 |   FLA_Obj_free( & e );
 820 |   FLA_Obj_free( & workspace );
 821 | 
 822 |   return FLA_SUCCESS;
 823 | }
 824 | 
 825 | /* ========================================================================= */
 826 | static FLA_Error MyFLA_Apply_H2_UT_l_opd_var2( 
 827 |                      int m_u2_A2,
 828 |                      int n_a1t,
 829 |                      double * tau,
 830 |                      double * u2, int inc_u2,
 831 |                      double * a1t, int inc_a1t,
 832 |                      double * A2, int ldim_A2, 
 833 |                      double * workspace ) {
 834 | 
 835 |   double  one_p       = 1.0;
 836 |   double  minus_one_p = -1.0;
 837 |   double  rtau;
 838 |   int     inc_w1t;
 839 | 
 840 |   // FLA_Obj w1t;
 841 |   double * w1t;
 842 | 
 843 |   // if ( FLA_Obj_has_zero_dim( a1t ) ) return FLA_SUCCESS;
 844 |   if ( n_a1t == 0 ) return FLA_SUCCESS;
 845 | 
 846 |   // FLA_Obj_create_conf_to( FLA_NO_TRANSPOSE, a1t, &w1t );
 847 |   //// w1t = ( double* ) FLA_malloc( n_a1t * sizeof( *a1t ) );
 848 |   w1t = workspace;
 849 |   inc_w1t = 1;
 850 | 
 851 |   // // w1t = a1t;
 852 |   // FLA_Copy_external( a1t, w1t );
 853 |   //// bl1_dcopyv( BLIS1_NO_CONJUGATE,
 854 |   ////             n_a1t,
 855 |   ////             a1t, inc_a1t, 
 856 |   ////             w1t, inc_w1t ); 
 857 |   dcopy_( & n_a1t,
 858 |           a1t, & inc_a1t, 
 859 |           w1t, & inc_w1t ); 
 860 | 
 861 |   // // w1t = w1t + u2' * A2;
 862 |   // // w1t = w1t + A2^T * conj(u2);
 863 |   // FLA_Gemvc_external( FLA_TRANSPOSE, FLA_CONJUGATE, FLA_ONE, A2, u2, 
 864 |   //     FLA_ONE, w1t );
 865 |   //// bl1_dgemv( BLIS1_TRANSPOSE,
 866 |   ////            BLIS1_CONJUGATE,
 867 |   ////            m_u2_A2,
 868 |   ////            n_a1t,
 869 |   ////            one_p,
 870 |   ////            A2, rs_A2, cs_A2,
 871 |   ////            u2, inc_u2,
 872 |   ////            one_p,
 873 |   ////            w1t, inc_w1t );
 874 |   dgemv_( "Transpose",
 875 |           & m_u2_A2,
 876 |           & n_a1t,
 877 |           & one_p,
 878 |           A2, & ldim_A2,
 879 |           u2, & inc_u2,
 880 |           & one_p,
 881 |           w1t, & inc_w1t );
 882 | 
 883 |   // // w1t = w1t / tau;
 884 |   // FLA_Inv_scalc_external( FLA_NO_CONJUGATE, tau, w1t );
 885 |   //// bl1_dinvscalv( BLIS1_NO_CONJUGATE,
 886 |   ////                n_a1t,
 887 |   ////                tau,
 888 |   ////                w1t, inc_w1t );
 889 |   if( * tau == 0.0 ) {
 890 |     fprintf( stderr, "ERROR in MyFLA_Apply_H2_UT_l_opd_var2: Tau is zero.\n" );
 891 |   } else {
 892 |     rtau = 1.0 / ( * tau );
 893 |   }
 894 |   dscal_( & n_a1t,
 895 |           & rtau,
 896 |           w1t, & inc_w1t );
 897 | 
 898 |   // // a1t = - w1t + a1t;
 899 |   // FLA_Axpy_external( FLA_MINUS_ONE, w1t, a1t );
 900 |   //// bl1_daxpyv( BLIS1_NO_CONJUGATE,
 901 |   ////             n_a1t,
 902 |   ////             minus_one_p,
 903 |   ////             w1t, inc_w1t,
 904 |   ////             a1t, inc_a1t );
 905 |   daxpy_( & n_a1t,
 906 |           & minus_one_p,
 907 |           w1t, & inc_w1t,
 908 |           a1t, & inc_a1t );
 909 | 
 910 |   // // A2 = - u2 * w1t + A2;
 911 |   // FLA_Ger_external( FLA_MINUS_ONE, u2, w1t, A2 );
 912 |   //// bl1_dger( BLIS1_NO_CONJUGATE,
 913 |   ////           BLIS1_NO_CONJUGATE,
 914 |   ////           m_u2_A2,
 915 |   ////           n_a1t,
 916 |   ////           minus_one_p,
 917 |   ////           u2, inc_u2,
 918 |   ////           w1t, inc_w1t,
 919 |   ////           A2, rs_A2, cs_A2 );
 920 |   dger_( & m_u2_A2,
 921 |          & n_a1t,
 922 |          & minus_one_p,
 923 |          u2, & inc_u2,
 924 |          w1t, & inc_w1t,
 925 |          A2, & ldim_A2 );
 926 | 
 927 |   //// // FLA_Obj_free( &w1t );
 928 |   //// FLA_free( w1t );
 929 | 
 930 |   return FLA_SUCCESS;
 931 | }
 932 | 
 933 | /* ========================================================================= */
 934 | static FLA_Error MyFLA_QRP_compute_norms( FLA_Obj A, FLA_Obj d, FLA_Obj e ) {
 935 | 
 936 |   switch( FLA_Obj_datatype( A ) ) {
 937 | 
 938 |   case FLA_DOUBLE: {
 939 |     int     m_A, n_A, ld_A, st_d, st_e, j, i_one = 1;
 940 |     double  * ptr_A, * ptr_d, * ptr_e, dnrm2_();
 941 | 
 942 |     m_A   = FLA_Obj_length( A );
 943 |     n_A   = FLA_Obj_width( A );
 944 |     ptr_A = ( double * ) FLA_Obj_buffer_at_view( A );
 945 |     ptr_d = ( double * ) FLA_Obj_buffer_at_view( d );
 946 |     ptr_e = ( double * ) FLA_Obj_buffer_at_view( e );
 947 |     ld_A  = FLA_Obj_col_stride( A );
 948 | 
 949 |     st_d = 0;
 950 |     if( FLA_Obj_length( d ) == 1 ) {
 951 |       st_d = FLA_Obj_col_stride( d );
 952 |     } else if( FLA_Obj_width( d ) == 1 ) {
 953 |       st_d = 1;
 954 |     } else {
 955 |       FLA_Print_message( "MyFLA_QRP_compute_norms: Object d is not a vector", 
 956 |                          __FILE__, __LINE__ );
 957 |       FLA_Abort();
 958 |     }
 959 |     st_e = 0;
 960 |     if( FLA_Obj_length( e ) == 1 ) {
 961 |       st_e = FLA_Obj_col_stride( e );
 962 |     } else if( FLA_Obj_width( e ) == 1 ) {
 963 |       st_e = 1;
 964 |     } else {
 965 |       FLA_Print_message( "MyFLA_QRP_compute_norms: Object e is not a vector", 
 966 |                          __FILE__, __LINE__ );
 967 |       FLA_Abort();
 968 |     }
 969 | 
 970 |     for( j = 0; j < n_A; j++ ) {
 971 |       *ptr_d = dnrm2_( & m_A, ptr_A, & i_one );
 972 |       *ptr_e = *ptr_d;
 973 |       ptr_A += ld_A;
 974 |       ptr_d += st_d;
 975 |       ptr_e += st_e;
 976 |     }
 977 | 
 978 |     break;
 979 |   }
 980 |   default:
 981 |     FLA_Print_message( "MyFLA_QRP_compute_norms: datatype not yet implemented", 
 982 |                        __FILE__, __LINE__ );
 983 |     FLA_Abort();
 984 |   }
 985 | 
 986 |   return FLA_SUCCESS;
 987 | }
 988 | 
 989 | /* ========================================================================= */
 990 | static FLA_Error NoFLA_QRP_pivot_B_C( int j_max_col,
 991 |                      int m_G, double * buff_G, int ldim_G, 
 992 |                      int pivot_B, int m_B, double * buff_B, int ldim_B, 
 993 |                      int pivot_C, int m_C, double * buff_C, int ldim_C, 
 994 |                      int * buff_p,
 995 |                      double * buff_d, double * buff_e ) {
 996 | // Matrices B and C are optionally pivoted.
 997 | 
 998 |   // Declaration of variables.
 999 |   int     ival, i_one = 1;
1000 |   double  * ptr_g1, * ptr_g2, * ptr_b1, * ptr_b2, * ptr_c1, * ptr_c2;
1001 | 
1002 |   //// j_max_col = *( ( int * ) FLA_Obj_buffer_at_view( max_col ) );
1003 |   //// printf( "j_max_col: %d\n", j_max_col );
1004 |   //// FLA_Obj_show( " max_col = [ ", max_col, "%d", " ];" );
1005 | 
1006 |   // Swap columns of G, pivots, and norms.
1007 |   if( j_max_col != 0 ) {
1008 | 
1009 |     // Swap full column 0 and column "j_max_col" of G.
1010 |     ptr_g1 = & buff_G[ 0 + 0         * ldim_G ];
1011 |     ptr_g2 = & buff_G[ 0 + j_max_col * ldim_G ];
1012 |     dswap_( & m_G, ptr_g1, & i_one, ptr_g2, & i_one );
1013 | 
1014 |     // Swap full column 0 and column "j_max_col" of B.
1015 |     if( pivot_B ) {
1016 |       ptr_b1 = & buff_B[ 0 + 0         * ldim_B ];
1017 |       ptr_b2 = & buff_B[ 0 + j_max_col * ldim_B ];
1018 |       dswap_( & m_B, ptr_b1, & i_one, ptr_b2, & i_one );
1019 |     }
1020 | 
1021 |     // Swap full column 0 and column "j_max_col" of C.
1022 |     if( pivot_C ) {
1023 |       ptr_c1 = & buff_C[ 0 + 0         * ldim_C ];
1024 |       ptr_c2 = & buff_C[ 0 + j_max_col * ldim_C ];
1025 |       dswap_( & m_C, ptr_c1, & i_one, ptr_c2, & i_one );
1026 |     }
1027 | 
1028 |     // Swap element 0 and element "j_max_col" of pivot vector "p".
1029 |     ival = buff_p[ j_max_col ];
1030 |     buff_p[ j_max_col ] = buff_p[ 0 ];
1031 |     buff_p[ 0 ] = ival;
1032 | 
1033 |     // Copy norms of column 0 to column "j_max_col".
1034 |     buff_d[ j_max_col ] = buff_d[ 0 ];
1035 |     buff_e[ j_max_col ] = buff_e[ 0 ];
1036 |   }
1037 | 
1038 |   return FLA_SUCCESS;
1039 | }
1040 | 
1041 | /* ========================================================================= */
1042 | static FLA_Error NoFLA_QRP_downdate_partial_norms( int m_A, int n_A,
1043 |                      double * buff_d,  int st_d,
1044 |                      double * buff_e,  int st_e,
1045 |                      double * buff_wt, int st_wt,
1046 |                      double * buff_A,  int ldim_A ) {
1047 |   //// printf( "  NoFLA_QRP_downdate_partial_norms by Drmac. \n" );
1048 | 
1049 |   int     j, i_one = 1;
1050 |   double  * ptr_d, * ptr_e, * ptr_wt, * ptr_A;
1051 |   double  temp, temp2, temp5, tol3z;
1052 |   double  dnrm2_();
1053 | 
1054 |   /*
1055 | *
1056 | *           Update partial column norms
1057 | *
1058 |           DO 30 J = I + 1, N
1059 |              IF( WORK( J ).NE.ZERO ) THEN
1060 | *
1061 | *                 NOTE: The following 4 lines follow from the analysis in
1062 | *                 Lapack Working Note 176.
1063 | *                 
1064 |                 TEMP = ABS( A( I, J ) ) / WORK( J )
1065 |                 TEMP = MAX( ZERO, ( ONE+TEMP )*( ONE-TEMP ) )
1066 |                 TEMP2 = TEMP*( WORK( J ) / WORK( N+J ) )**2
1067 |                 IF( TEMP2 .LE. TOL3Z ) THEN 
1068 |                    IF( M-I.GT.0 ) THEN
1069 |                       WORK( J ) = DNRM2( M-I, A( I+1, J ), 1 )
1070 |                       WORK( N+J ) = WORK( J )
1071 |                    ELSE
1072 |                       WORK( J ) = ZERO
1073 |                       WORK( N+J ) = ZERO
1074 |                    END IF
1075 |                 ELSE
1076 |                    WORK( J ) = WORK( J )*SQRT( TEMP )
1077 |                 END IF
1078 |              END IF
1079 |  30       CONTINUE
1080 |   */
1081 | 
1082 |   // Some initializations.
1083 |   tol3z = sqrt( FLA_Mach_params_opd( FLA_MACH_EPS ) );
1084 |   //// printf( "tol3z: %24.17le\n", tol3z );
1085 |   //// printf( "eps:   %24.17le\n", FLA_Mach_params_opd( FLA_MACH_EPS ) );
1086 |   ptr_d  = buff_d;
1087 |   ptr_e  = buff_e;
1088 |   ptr_wt = buff_wt;
1089 |   ptr_A  = buff_A;
1090 | 
1091 |   // Main loop.
1092 |   for( j = 0; j < n_A; j++ ) {
1093 |     if( * ptr_d != 0.0 ) {
1094 |       temp = dabs( * ptr_wt ) / * ptr_d;
1095 |       temp = max( 0.0, ( 1.0 + temp ) * ( 1 - temp ) );
1096 |       temp5 = * ptr_d / * ptr_e;
1097 |       temp2 = temp * temp5 * temp5;
1098 |       //// printf( "temp2: %24.17lf   tol3z: %24.17lf\n", temp2, tol3z );
1099 |       if( temp2 <= tol3z ) {
1100 |         //// printf( "F. Cancel Drm %4d %14.6le %14.6le\n", 
1101 |         ////         j, * ptr_wt, * ptr_d );
1102 |         if( m_A > 0 ) {
1103 |           * ptr_d = dnrm2_( & m_A, ptr_A, & i_one );
1104 |           * ptr_e = *ptr_d;
1105 |         } else {
1106 |           * ptr_d = 0.0;
1107 |           * ptr_e = 0.0;
1108 |         }
1109 |       } else {
1110 |         * ptr_d = * ptr_d * sqrt( temp );
1111 |       }
1112 |     } 
1113 |     ptr_A  += ldim_A;
1114 |     ptr_d  += st_d;
1115 |     ptr_e  += st_e;
1116 |     ptr_wt += st_wt;
1117 |   }
1118 | 
1119 |   return FLA_SUCCESS;
1120 | }
1121 | 
1122 | /* ========================================================================= */
1123 | static int NoFLA_my_idamax( int n, double * buff_data ) {
1124 |   int     i, result;
1125 |   double  dmax;
1126 | 
1127 |   // IDAMAX = 0
1128 |   // IF (N.LT.1 .OR. INCX.LE.0) RETURN
1129 |   if( n < 1 ) {
1130 |     return -1;
1131 |   }
1132 | 
1133 |   // IDAMAX = 1
1134 |   // IF (N.EQ.1) RETURN
1135 |   if( n == 1 ) {
1136 |     return 0;
1137 |   }
1138 | 
1139 |   // *
1140 |   // * code for increment equal to 1
1141 |   // *
1142 |   // DMAX = DABS(DX(1))
1143 |   // DO I = 2,N
1144 |   //   IF (DABS(DX(I)).GT.DMAX) THEN
1145 |   //     IDAMAX = I
1146 |   //     DMAX = DABS(DX(I))
1147 |   //   END IF
1148 |   // END DO
1149 | 
1150 |   /*
1151 |   for( i = 0; i < n; i++ ) {
1152 |     printf( "    dabs( buff_data[ %d ] ) = %25.16le\n", 
1153 |             i, dabs( buff_data[ i ] ) );
1154 |   } */
1155 | 
1156 |   result = 0;
1157 |   dmax   = dabs( buff_data[ 0 ] );
1158 |   for( i = 1; i < n; i++ ) {
1159 |     if( dmax < dabs( buff_data[ i ] ) ) {
1160 |       result = i;
1161 |       dmax   = dabs( buff_data[ i ] );
1162 |     }
1163 |   }
1164 | 
1165 |   return result;
1166 | }
1167 | 
1168 | 
1169 | 
1170 | 
1171 | 
1172 | 
1173 | 
1174 | 
1175 | 
1176 | 
1177 | 
1178 | 
1179 | 
1180 | 
1181 | 
1182 | 
1183 | 
1184 | 
1185 | 
1186 | 
1187 | 
1188 | 
1189 | 
1190 | 
1191 | 
1192 | 
1193 | 
1194 | 
1195 | 
1196 | 
1197 | 
1198 | 
1199 | 
1200 | 


--------------------------------------------------------------------------------