├── src
    ├── Makefile
    ├── CMakeLists.txt
    ├── mellipke.c
    ├── pdgenm2.c
    ├── mellipj.c
    └── pdgeqdwh.c
├── .gitignore
├── .gitmodules
├── polar.pc.in
├── LICENSE
├── include
    ├── polar.h
    ├── myscalapack.h
    └── flops.h
├── scalapack-cmake-libsltmg.patch
├── timing
    ├── CMakeLists.txt
    ├── timing.c
    ├── timing_pdgeqdwh.c
    └── timing_pdgezolopd.c
├── testing
    ├── CMakeLists.txt
    ├── testing.c
    ├── testing_pdgeqdwh.c
    └── testing_pdgezolopd.c
├── Jenkinsfile
├── README.md
├── CMakeLists.txt
└── scalapack-cmake-libsltmg-only.patch


/src/Makefile:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | *.o
2 | 


--------------------------------------------------------------------------------
/.gitmodules:
--------------------------------------------------------------------------------
1 | [submodule "cmake_modules/ecrc"]
2 | 	path = cmake_modules/ecrc
3 | 	url = https://github.com/ecrc/ecrc_cmake.git
4 | 


--------------------------------------------------------------------------------
/polar.pc.in:
--------------------------------------------------------------------------------
 1 | prefix=@prefix@
 2 | libdir=@libdir@
 3 | 
 4 | Name: polar
 5 | Description: QR-based Dynamically Weighted Halley (POLAR)
 6 | Version: @POLAR_VERSION@
 7 | URL: http://github.com/ecrc/polar
 8 | Libs: -L${libdir} -lpolar
 9 | Requires: mpi sclapack
10 | 


--------------------------------------------------------------------------------
/src/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | # Define source files
 2 | set( POLAR_SRC pdgenm2.c
 3 |                mellipj.c
 4 |                mellipke.c
 5 |                pdgezolopd.c
 6 |                pdgeqdwh.c
 7 |                )
 8 | 
 9 | 
10 | list( APPEND CMAKE_SHARED_LINKER_FLAGS "-Wl,--enable-new-dtags")
11 | # Transform the list into a space separate string
12 | string(REPLACE ";" " " CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS}")
13 | 
14 | # Build POLAR
15 | add_library( ${POLAR_LIBNAME} ${POLAR_SRC} )
16 | # Libraries to link
17 | target_link_libraries( ${POLAR_LIBNAME} ${MPI_C_LIBRARIES} ${SCALAPACK_LIBRARIES} ${EXTRA_LIBS} )
18 | 
19 | # Installation
20 | install( TARGETS ${POLAR_LIBNAME} DESTINATION lib )
21 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | BSD 3-Clause License
 2 | 
 3 | Copyright (c) 2017, King Abdullah University of Science and Technology
 4 | All rights reserved.
 5 | 
 6 | Redistribution and use in source and binary forms, with or without
 7 | modification, are permitted provided that the following conditions are met:
 8 | 
 9 | * Redistributions of source code must retain the above copyright notice, this
10 |   list of conditions and the following disclaimer.
11 | 
12 | * Redistributions in binary form must reproduce the above copyright notice,
13 |   this list of conditions and the following disclaimer in the documentation
14 |   and/or other materials provided with the distribution.
15 | 
16 | * Neither the name of the copyright holder nor the names of its
17 |   contributors may be used to endorse or promote products derived from
18 |   this software without specific prior written permission.
19 | 
20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 | 


--------------------------------------------------------------------------------
/include/polar.h:
--------------------------------------------------------------------------------
 1 | /**
 2 |  *
 3 |  * Copyright (c) 2017, King Abdullah University of Science and Technology
 4 |  * All rights reserved.
 5 |  *
 6 |  **/
 7 | 
 8 | #include <stdlib.h>
 9 | #include <stdio.h>
10 | #include <string.h>
11 | #include <assert.h>
12 | #include <math.h>
13 | #include <complex.h>
14 | #include <getopt.h>
15 | #include <unistd.h>
16 | #include <sys/time.h>
17 | #include <sys/resource.h>
18 | #include <mpi.h>
19 | #include <mkl_lapack.h>
20 | #include <mkl_lapacke.h>
21 | #include "myscalapack.h"
22 | #include "flops.h"
23 | 
24 | #ifndef max
25 | #define max(a, b) ((a) > (b) ? (a) : (b))
26 | #endif
27 | #ifndef min
28 | #define min(a, b) ((a) < (b) ? (a) : (b))
29 | #endif
30 | 
31 | 
32 | int pdgeqdwh( char *jobh, int m, int n,
33 | 	      double *A, int iA, int jA, int *descA, 
34 |               double *H, int iH, int jH, int *descH,
35 |               double *Work1, int lWork1, 
36 |               double *Work2, int lWork2, 
37 |               int *info);
38 | 
39 | int pdgezolopd( char *jobh, int m, int n,
40 | 	      double *A, int iA, int jA, int *descA, 
41 |               double *H, int iH, int jH, int *descH,
42 |               double *Work1, int lWork1, 
43 |               double *Work2, int lWork2, 
44 |               int *info);
45 | 
46 | void pdgenm2( double *A, int M, int N, int descA[9], 
47 |               double *W, int descW[9], double *Sx, int descSx[9], 
48 |               double *e, double tol);
49 | 
50 | int mellipj( double u, double alpha, 
51 |              double *sn, double *cn, double *dn, 
52 |              double *work);
53 | 
54 | int mellipke( double alpha,  
55 |               double *k, double *e);
56 | 
57 | 


--------------------------------------------------------------------------------
/scalapack-cmake-libsltmg.patch:
--------------------------------------------------------------------------------
 1 | diff -Naur -x BLACS -x PBLAS a/CMakeLists.txt b/CMakeLists.txt
 2 | --- a/CMakeLists.txt	2012-05-02 06:58:24.000000000 +0300
 3 | +++ b/CMakeLists.txt	2017-02-15 11:26:02.000000000 +0300
 4 | @@ -140,6 +140,11 @@
 5 |  IF(LAPACK_LIBRARIES)
 6 |    include(CheckFortranFunctionExists)
 7 |    message(STATUS "--> LAPACK supplied by user is ${LAPACK_LIBRARIES}.")
 8 | +# receive in white space separated list, and transform into cmake list (;)
 9 | +SEPARATE_ARGUMENTS(LAPACK_LIBRARIES) 
10 | +set( AUXSTR ${LAPACK_LIBRARIES} )
11 | +set(LAPACK_LIBRARIES "${AUXSTR}" CACHE STRING "User provided list of lapack libraries." FORCE)
12 | +  message(STATUS "----> LAPACK supplied by user is ${LAPACK_LIBRARIES}.")
13 |    set(CMAKE_REQUIRED_LIBRARIES ${LAPACK_LIBRARIES})
14 |    CHECK_FORTRAN_FUNCTION_EXISTS("dgesv" LAPACK_FOUND)
15 |    unset( CMAKE_REQUIRED_LIBRARIES )
16 | diff -Naur -x BLACS -x PBLAS a/TESTING/EIG/CMakeLists.txt b/TESTING/EIG/CMakeLists.txt
17 | --- a/TESTING/EIG/CMakeLists.txt	2011-11-08 23:48:31.000000000 +0300
18 | +++ b/TESTING/EIG/CMakeLists.txt	2017-02-20 15:15:08.000000000 +0300
19 | @@ -97,3 +97,12 @@
20 |  target_link_libraries(xshseqr scalapack ${LAPACK_LIBRARIES} ${BLAS_LIBRARIES})
21 |  target_link_libraries(xdhseqr scalapack ${LAPACK_LIBRARIES} ${BLAS_LIBRARIES})
22 |  
23 | +file(GLOB SLTMG_SRC "*.f")
24 | +add_library( sltmg STATIC ${SLTMG_SRC})
25 | +scalapack_install_library( sltmg )
26 | +#install( TARGETS sltmg DESTINATION lib )
27 | +ADD_CUSTOM_TARGET(install.sltmg
28 | +    ${CMAKE_COMMAND}
29 | +    -DBUILD_TYPE=${CMAKE_BUILD_TYPE}
30 | +    -P ${CMAKE_CURRENT_BINARY_DIR}/cmake_install.cmake)
31 | +ADD_DEPENDENCIES(install.sltmg sltmg) 
32 | 


--------------------------------------------------------------------------------
/timing/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | # Defines the source files of the tests. Each file generates 1 test
 2 | file(GLOB tests_files "*.c")
 3 | 
 4 | include_directories( ${CMAKE_SOURCE_DIR}/include )
 5 | link_directories( ${CMAKE_BINARY_DIR}/lib )
 6 | 
 7 | # Uses RUNPATH instead of RPATH
 8 | #set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -Wl,--enable-new-dtags")
 9 | list( APPEND CMAKE_EXE_LINKER_FLAGS "-Wl,--enable-new-dtags")
10 | # Transform the list into a space separate string
11 | string(REPLACE ";" " " CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS}")
12 | 
13 | foreach( test_src ${tests_files} )
14 |     get_filename_component( test_exe ${test_src}  NAME_WE )
15 |     if( "${test_exe}" STREQUAL "timing" )
16 |         # skip timing.c
17 |         continue()
18 |     endif()
19 | 
20 |     add_executable( ${test_exe} timing.c ${test_src} )
21 |     add_dependencies( ${test_exe} ${POLAR_LIBNAME} )
22 |     if( _build_netlib-scalapack )
23 |         add_dependencies( ${test_exe} NETLIB-SCALAPACK )
24 |     endif()
25 | 
26 |     target_link_libraries( ${test_exe} ${POLAR_LIBNAME}
27 |                                        ${MPI_C_LIBRARIES}
28 |                                        ${SLTMG_LIBRARIES}
29 |                                        ${SCALAPACK_LIBRARIES}
30 |                                        ${LAPACKE_LIBRARIES}
31 |                                        ${LAPACK_LIBRARIES}
32 |                                        ${BLAS_LIBRARIES}
33 |                                        ${EXTRA_LIBS} )
34 |     install( TARGETS ${test_exe} DESTINATION lib/timing )
35 | endforeach()
36 | #set( _p 4 )
37 | #set( _q 4 )
38 | #math( EXPR _pq "${_p} * ${_q}" )
39 | 
40 | set( _p ${P} )
41 | set( _q ${Q} )
42 | set( _pq ${ncores} )
43 | 
44 | # Add basic tests. To run them type: 'make test' in build directory
45 | # add_test( NAME "timing_pdgesvd:A" COMMAND mpirun -np ${_pq} timing_pdgesvd --nprow ${_p} --npcol ${_q} --nb 64 --n_range 512:512:512 --niter 3 --check --qwmr )
46 | # add_test( NAME "timing_pdgesvd:B" COMMAND mpirun -np ${_pq} timing_pdgesvd --nprow ${_p} --npcol ${_q} --nb 64 --n_range 512:512:512 --niter 3 --check --qwdc )
47 | # add_test( NAME "timing_pdgesvd:C" COMMAND mpirun -np ${_pq} timing_pdgesvd --nprow ${_p} --npcol ${_q} --nb 64 --n_range 512:512:512 --niter 3 --check --qwel )
48 | add_test( NAME "timing_pdgeqdwh:A"   COMMAND mpirun -np ${_pq} ./timing_pdgeqdwh --nprow ${_p} --npcol ${_q} --nb 64 --n_range 512:512:512 --niter 3 --check )
49 | add_test( NAME "timing_pdgezolopd:B" COMMAND mpirun -np ${_pq} ./timing_pdgezolopd --nprow ${_p} --npcol ${_q} --nb 64 --n_range 512:512:512 --niter 3 --check --cond 1 )
50 | 


--------------------------------------------------------------------------------
/testing/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | # Defines the source files of the tests. Each file generates 1 test
 2 | file(GLOB tests_files "*.c")
 3 | 
 4 | include_directories( ${CMAKE_SOURCE_DIR}/include )
 5 | link_directories( ${CMAKE_BINARY_DIR}/lib )
 6 | 
 7 | # Uses RUNPATH instead of RPATH
 8 | #set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -Wl,--enable-new-dtags")
 9 | list( APPEND CMAKE_EXE_LINKER_FLAGS "-Wl,--enable-new-dtags")
10 | # Transform the list into a space separate string
11 | string(REPLACE ";" " " CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS}")
12 | 
13 | foreach( test_src ${tests_files} )
14 |     get_filename_component( test_exe ${test_src}  NAME_WE )
15 |     if( "${test_exe}" STREQUAL "testing" )
16 |         # skip testing.c
17 |         continue()
18 |     endif()
19 | 
20 |     add_executable( ${test_exe} testing.c ${test_src}  )
21 |     add_dependencies( ${test_exe} ${POLAR_LIBNAME} )
22 |     if( _build_netlib-scalapack )
23 |         add_dependencies( ${test_exe} NETLIB-SCALAPACK )
24 |     endif()
25 | 
26 |     target_link_libraries( ${test_exe} ${POLAR_LIBNAME}
27 |                                        ${MPI_C_LIBRARIES}
28 |                                        ${SLTMG_LIBRARIES}
29 |                                        ${SCALAPACK_LIBRARIES}
30 |                                        ${LAPACKE_LIBRARIES}
31 |                                        ${LAPACK_LIBRARIES}
32 |                                        ${BLAS_LIBRARIES}
33 |                                        ${EXTRA_LIBS} )
34 |     install( TARGETS ${test_exe} DESTINATION lib/testing )
35 | endforeach()
36 | #set( _p 4 )
37 | #set( _q 4 )
38 | #math( EXPR _pq "${_p} * ${_q}" )
39 | 
40 | set( _p ${P} )
41 | set( _q ${Q} )
42 | set( _pq ${ncores} )
43 | 
44 | # Add basic tests. To run them type: 'make test' in build directory
45 | # add_test( NAME "testing_pdgesvd:A" COMMAND mpirun -np ${_pq} testing_pdgesvd --nprow ${_p} --npcol ${_q} --nb 64 --n_range 512:512:512 --niter 3 --check --qwmr )
46 | # add_test( NAME "testing_pdgesvd:B" COMMAND mpirun -np ${_pq} testing_pdgesvd --nprow ${_p} --npcol ${_q} --nb 64 --n_range 512:512:512 --niter 3 --check --qwdc )
47 | # add_test( NAME "testing_pdgesvd:C" COMMAND mpirun -np ${_pq} testing_pdgesvd --nprow ${_p} --npcol ${_q} --nb 64 --n_range 512:512:512 --niter 3 --check --qwel )
48 | add_test( NAME "testing_pdgeqdwh:A"   COMMAND mpirun -np ${_pq} ./testing_pdgeqdwh --nprow ${_p} --npcol ${_q} --nb 64 --n_range 512:512:512 --niter 3 --check )
49 | add_test( NAME "testing_pdgezolopd:B" COMMAND mpirun -np ${_pq} ./testing_pdgezolopd --nprow ${_p} --npcol ${_q} --nb 64 --n_range 512:512:512 --niter 3 --check --cond 1 )
50 | 
51 | 


--------------------------------------------------------------------------------
/timing/timing.c:
--------------------------------------------------------------------------------
 1 | /**
 2 |  *
 3 |  * Copyright (c) 2017, King Abdullah University of Science and Technology
 4 |  * All rights reserved.
 5 |  *
 6 |  **/
 7 | 
 8 | /**
 9 |  *
10 |  * @file timing.c
11 |  *
12 |  *  POLAR is a high performance software framework for computing 
13 |  *  the polar decomposition on distributed-memory manycore systems provided by KAUST
14 |  *
15 |  * @version 3.0.0
16 |  * @author Dalal Sukkari
17 |  * @author Hatem Ltaief
18 |  * @date 2018-11-08
19 |  *
20 |  **/
21 | 
22 | #include <stdio.h>
23 | #include <stdlib.h>
24 | #include <string.h>
25 | int
26 | get_range(char *range, int *start_p, int *stop_p, int *step_p) {
27 |     char *s, *s1, buf[21];
28 |     int colon_count, copy_len, nbuf=20, n;
29 |     int start=1000, stop=10000, step=1000;
30 | 
31 |     colon_count = 0;
32 |     for (s = strchr( range, ':'); s; s = strchr( s+1, ':'))
33 |         colon_count++;
34 | 
35 |     if (colon_count == 0) { /* No colon in range. */
36 |         if (sscanf( range, "%d", &start ) < 1 || start < 1)
37 |             return -1;
38 |         step = start / 10;
39 |         if (step < 1) step = 1;
40 |         stop = start + 10 * step;
41 | 
42 |     } else if (colon_count == 1) { /* One colon in range.*/
43 |         /* First, get the second number (after colon): the stop value. */
44 |         s = strchr( range, ':' );
45 |         if (sscanf( s+1, "%d", &stop ) < 1 || stop < 1)
46 |             return -1;
47 | 
48 |         /* Next, get the first number (before colon): the start value. */
49 |         n = s - range;
50 |         copy_len = n > nbuf ? nbuf : n;
51 |         strncpy( buf, range, copy_len );
52 |         buf[copy_len] = 0;
53 |         if (sscanf( buf, "%d", &start ) < 1 || start > stop || start < 1)
54 |             return -1;
55 | 
56 |         /* Let's have 10 steps or less. */
57 |         step = (stop - start) / 10;
58 |         if (step < 1)
59 |             step = 1;
60 |     } else if (colon_count == 2) { /* Two colons in range. */
61 |         /* First, get the first number (before the first colon): the start value. */
62 |         s = strchr( range, ':' );
63 |         n = s - range;
64 |         copy_len = n > nbuf ? nbuf : n;
65 |         strncpy( buf, range, copy_len );
66 |         buf[copy_len] = 0;
67 |         if (sscanf( buf, "%d", &start ) < 1 || start < 1)
68 |             return -1;
69 | 
70 |         /* Next, get the second number (after the first colon): the stop value. */
71 |         s1 = strchr( s+1, ':' );
72 |         n = s1 - (s + 1);
73 |         copy_len = n > nbuf ? nbuf : n;
74 |         strncpy( buf, s+1, copy_len );
75 |         buf[copy_len] = 0;
76 |         if (sscanf( buf, "%d", &stop ) < 1 || stop < start)
77 |             return -1;
78 | 
79 |         /* Finally, get the third number (after the second colon): the step value. */
80 |         if (sscanf( s1+1, "%d", &step ) < 1 || step < 1)
81 |             return -1;
82 |     } else
83 | 
84 |         return -1;
85 | 
86 |     *start_p = start;
87 |     *stop_p = stop;
88 |     *step_p = step;
89 | 
90 |     return 0;
91 | }
92 | 


--------------------------------------------------------------------------------
/testing/testing.c:
--------------------------------------------------------------------------------
 1 | /**
 2 |  *
 3 |  * Copyright (c) 2017, King Abdullah University of Science and Technology
 4 |  * All rights reserved.
 5 |  *
 6 |  **/
 7 | 
 8 | /**
 9 |  *
10 |  * @file testing.c
11 |  *
12 |  *  POLAR is a high performance software framework for computing 
13 |  *  the polar decomposition on distributed-memory manycore systems provided by KAUST
14 |  *
15 |  * @version 3.0.0
16 |  * @author Dalal Sukkari
17 |  * @author Hatem Ltaief
18 |  * @date 2018-11-08
19 |  *
20 |  **/
21 | 
22 | #include <stdio.h>
23 | #include <stdlib.h>
24 | #include <string.h>
25 | int
26 | get_range(char *range, int *start_p, int *stop_p, int *step_p) {
27 |     char *s, *s1, buf[21];
28 |     int colon_count, copy_len, nbuf=20, n;
29 |     int start=1000, stop=10000, step=1000;
30 | 
31 |     colon_count = 0;
32 |     for (s = strchr( range, ':'); s; s = strchr( s+1, ':'))
33 |         colon_count++;
34 | 
35 |     if (colon_count == 0) { /* No colon in range. */
36 |         if (sscanf( range, "%d", &start ) < 1 || start < 1)
37 |             return -1;
38 |         step = start / 10;
39 |         if (step < 1) step = 1;
40 |         stop = start + 10 * step;
41 | 
42 |     } else if (colon_count == 1) { /* One colon in range.*/
43 |         /* First, get the second number (after colon): the stop value. */
44 |         s = strchr( range, ':' );
45 |         if (sscanf( s+1, "%d", &stop ) < 1 || stop < 1)
46 |             return -1;
47 | 
48 |         /* Next, get the first number (before colon): the start value. */
49 |         n = s - range;
50 |         copy_len = n > nbuf ? nbuf : n;
51 |         strncpy( buf, range, copy_len );
52 |         buf[copy_len] = 0;
53 |         if (sscanf( buf, "%d", &start ) < 1 || start > stop || start < 1)
54 |             return -1;
55 | 
56 |         /* Let's have 10 steps or less. */
57 |         step = (stop - start) / 10;
58 |         if (step < 1)
59 |             step = 1;
60 |     } else if (colon_count == 2) { /* Two colons in range. */
61 |         /* First, get the first number (before the first colon): the start value. */
62 |         s = strchr( range, ':' );
63 |         n = s - range;
64 |         copy_len = n > nbuf ? nbuf : n;
65 |         strncpy( buf, range, copy_len );
66 |         buf[copy_len] = 0;
67 |         if (sscanf( buf, "%d", &start ) < 1 || start < 1)
68 |             return -1;
69 | 
70 |         /* Next, get the second number (after the first colon): the stop value. */
71 |         s1 = strchr( s+1, ':' );
72 |         n = s1 - (s + 1);
73 |         copy_len = n > nbuf ? nbuf : n;
74 |         strncpy( buf, s+1, copy_len );
75 |         buf[copy_len] = 0;
76 |         if (sscanf( buf, "%d", &stop ) < 1 || stop < start)
77 |             return -1;
78 | 
79 |         /* Finally, get the third number (after the second colon): the step value. */
80 |         if (sscanf( s1+1, "%d", &step ) < 1 || step < 1)
81 |             return -1;
82 |     } else
83 | 
84 |         return -1;
85 | 
86 |     *start_p = start;
87 |     *stop_p = stop;
88 |     *step_p = step;
89 | 
90 |     return 0;
91 | }
92 | 


--------------------------------------------------------------------------------
/src/mellipke.c:
--------------------------------------------------------------------------------
 1 | /**
 2 |  ZOLOPD-SVD
 3 |  *
 4 |  * (C) Copyright 2016 King Abdullah University of Science and Technology
 5 |  * Authors:
 6 |  * Dalal Sukkari (dalal.sukkari@kaust.edu.sa)
 7 |  * David Keyes (david.keyes@kaust.edu.sa)
 8 |  * Hatem Ltaief (hatem.ltaief@kaust.edu.sa)
 9 |  *  
10 |  * Redistribution  and  use  in  source and binary forms, with or without
11 |  * modification,  are  permitted  provided  that the following conditions
12 |  * are met:
13 |  * 
14 |  * Redistributions  of  source  code  must  retain  the above copyright
15 |  * notice,  this  list  of  conditions  and  the  following  disclaimer.
16 |  * Redistributions  in  binary  form must reproduce the above copyright
17 |  * notice,  this list of conditions and the following disclaimer in the
18 |  * documentation  and/or other materials provided with the distribution.
19 |  * Neither  the  name of the King Abdullah University of Science and
20 |  * Technology nor the names of its contributors may be used to endorse
21 |  * or promote products derived from this software without specific prior
22 |  * written permission.
23 |  *
24 |  *
25 |  THIS  SOFTWARE  IS  PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
26 |  ``AS IS''  AND  ANY  EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
27 |  LIMITED  TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
28 |  A  PARTICULAR  PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
29 |  HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
30 |  SPECIAL,  EXEMPLARY,  OR  CONSEQUENTIAL  DAMAGES  (INCLUDING,  BUT NOT
31 |  LIMITED  TO,  PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
32 |  DATA,  OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
33 |  THEORY  OF  LIABILITY,  WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
34 |  (INCLUDING  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
35 |  OF  THIS  SOFTWARE,  EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
36 |  **/
37 | #include "polar.h"
38 | 
39 | int mellipke( double alpha, double *k, double *e)
40 | {
41 |     /* ELLIPKE Complete elliptic integral. Modified from Matlab's built-in code 
42 |      * for improved accuracy
43 |      */
44 |     double m, m1, mm, a0, b0, s0;
45 |     double a1, b1, c1, w1;
46 |     double eps, tol;
47 |     double mmold;
48 |     int i1;
49 | 
50 |     m = sin(alpha) * sin(alpha);
51 |     m1 = cos(alpha) * cos(alpha);
52 | 
53 |     eps = LAPACKE_dlamch_work('e');
54 |     tol = eps;
55 |     
56 | 
57 |     if (m == 0.0) {k[0] = 0.0; *e = k[0]; return 0;}
58 |     if (m < 0 || m > 1){ 
59 | 	fprintf(stderr, "error(ellipke:MOutOfRange)") ;
60 | 	return -1;
61 |     }
62 | 
63 |     a0 = 1.;
64 |     b0 = cos(alpha);
65 |     s0 = m;
66 |     i1 = 0.; mm = 1.;
67 | 
68 |     while ( mm > tol){
69 |         a1 = (a0+b0)/2;
70 |         b1 = sqrt(a0*b0);
71 |         c1 = (a0-b0)/2;
72 |         i1 = i1 + 1;
73 |         w1 = cpow(2,i1)*c1*c1;
74 |         mm = w1;
75 |         s0 = s0 + w1;
76 |         a0 = a1;
77 |         b0 = b1;
78 |     }
79 | 
80 |     *k = M_PI/(2*a1);
81 |     *e = k[0]*(1-s0/2);
82 | return 0;
83 | }
84 | 


--------------------------------------------------------------------------------
/src/pdgenm2.c:
--------------------------------------------------------------------------------
 1 | /**
 2 |  *
 3 |  * Copyright (c) 2017, King Abdullah University of Science and Technology
 4 |  * All rights reserved.
 5 |  *
 6 |  **/
 7 | 
 8 | /**
 9 |  *
10 |  * @file pdgenm2.c
11 |  *
12 |  *  ZOLOPD is a high performance software framework for computing 
13 |  *  the polar decomposition on distributed-memory manycore systems provided by KAUST
14 |  *
15 |  * @version 3.0.0
16 |  * @author Dalal Sukkari
17 |  * @author Hatem Ltaief
18 |  * @date 2018-11-08
19 |  *
20 |  **/
21 | 
22 | #include "polar.h"
23 | 
24 | void pdgenm2( double *A, int M, int N, int descA[9], 
25 |               double *W, int descW[9], 
26 |               double *Sx, int descSx[9], 
27 |               double *e, double tol)
28 | {
29 |     /*
30 |     *
31 |     *   NORMEST Estimate the matrix 2-norm.
32 |     *   NORMEST(S) is an estimate of the 2-norm of the matrix S.
33 |     *   NORMEST(S,tol) uses relative error tol instead of 1.e-6.
34 |     *   [nrm,cnt] = NORMEST(..) also gives the number of iterations used.
35 |     *
36 |     *   This function is intended primarily for sparse matrices,
37 |     *   although it works correctly and may be useful for large, full
38 |     *   matrices as well.  Use NORMEST when your problem is large
39 |     *   enough that NORM takes too long to compute and an approximate
40 |     *   norm is acceptable.
41 |     *
42 |     */
43 | 
44 |     int maxiter = 100; /* should never take this many iterations. */
45 |     int i1 = 1;
46 |     int cnt = 0;
47 |     int info;
48 |     double e0, alpha, beta, normx, normSx;
49 |     //x = sum(abs(S),1)';
50 |     /* Since in qdwh.c we are finding norm1 of the matrix A, 
51 |      * then we already have the sum of the columns saved in W
52 |      * otherwise we should call the following pdlange
53 |      */
54 |     //e0 = pdlange_ ( "1", &M, &N, A, &i1, &i1, descA, W); 
55 |     double *w  = (double *)malloc(1*sizeof(double)) ;
56 |     
57 |     *e = pdlange_ ( "f", &N, &i1, W, &i1, &i1, descW, w);
58 |     //pdnrm2_( &N, e, W, &i1 , &i1, descW , &i1);
59 |     
60 |     if (*e == 0){ return;}
61 |     //x = x/e;
62 |     alpha = 1.0;
63 |     pdlascl_( "G", e, &alpha, &N, &i1, W, &i1, &i1, descW, &info);
64 | 
65 |     e0 = 0;
66 |     while ( (cnt < maxiter) &&
67 |            (fabs((*e) - e0) > (tol * (*e))) )
68 |    {
69 |         e0 = *e; alpha = 1.0; beta = 0.0;
70 |         pdgemv_ ("N", &M, &N, &alpha, A, &i1, &i1, descA, W, &i1, &i1, descW, &i1, &beta, Sx, &i1, &i1, descSx, &i1);
71 |         normSx = pdlange_ ( "f", &N, &i1, Sx, &i1, &i1, descSx, w);
72 | 
73 |         //if nnz(Sx) == 0
74 |         //    Sx = rand(size(Sx),class(Sx));
75 |         //end
76 | 
77 |         pdgemv_ ("T", &M, &N, &alpha, A, &i1, &i1, descA, Sx, &i1, &i1, descSx, &i1, &beta, W, &i1, &i1, descW, &i1);
78 |         normx = pdlange_ ( "f", &N, &i1, W, &i1, &i1, descW, w);
79 |    
80 |         *e = normx/normSx;
81 |         pdlascl_( "G", &normx, &alpha, &N, &i1, W, &i1, &i1, descW, &info);
82 |         cnt = cnt+1;
83 |         if ( (cnt >= maxiter) &&
84 |             (fabs((*e) - e0) > (tol * (*e))) ) {
85 |             fprintf(stderr, "normest: didn't converge\n");
86 |         }
87 |     }
88 |     return;
89 | }
90 | 


--------------------------------------------------------------------------------
/Jenkinsfile:
--------------------------------------------------------------------------------
  1 | pipeline {
  2 | /*
  3 |  * Defining where to run
  4 |  */
  5 | //// Any:
  6 | // agent any
  7 | //// By agent label:
  8 | //      agent { label 'sandybridge' }
  9 | 
 10 |     agent { label 'jenkinsfile' }
 11 |     triggers {
 12 |         pollSCM('H/10 * * * *')
 13 |     }
 14 | 
 15 |     options {
 16 |         disableConcurrentBuilds()
 17 |         buildDiscarder(logRotator(numToKeepStr: '50'))
 18 |         timestamps()
 19 |     }
 20 | 
 21 |     stages {
 22 |         stage ('build') {
 23 |             steps {
 24 |                 sh '''#!/bin/bash -le
 25 |                     # loads modules
 26 |                     module purge
 27 |                     module load cmake/3.9.6
 28 |                     module load intel/2017
 29 |                     module load intelmpi/2017/intel-2017
 30 | 
 31 |                     set -x
 32 |                     module list
 33 | 
 34 |                     export CC=icc # just in case
 35 |                     export FC=ifort # just in case
 36 |                     export F90=ifort # just in case
 37 |                     export I_MPI_CC="$CC"
 38 |                     export I_MPI_FC="$FC"
 39 |                     export I_MPI_F90="$F90"
 40 | 
 41 |                     mkdir -p build
 42 |                     cd build && rm -rf ./*
 43 |                     cmake .. -DCMAKE_INSTALL_PREFIX=$PWD/installdir -DPOLAR_TESTING:BOOL=ON -DEXTRA_LIBS="ifcore"
 44 | 
 45 |                     # build
 46 |                     make
 47 | 
 48 |                     # install
 49 |                     make install
 50 | 
 51 |                 '''
 52 |             }
 53 |         }
 54 |         stage ('test') {
 55 |             steps {
 56 |                 sh '''#!/bin/bash -le
 57 |                     # loads modules
 58 |                     module purge
 59 |                     module load cmake/3.9.6
 60 |                     module load intel/2017
 61 |                     module load intelmpi/2017/intel-2017
 62 | 
 63 |                     set -x
 64 | 
 65 |                     module list
 66 | 
 67 |                     # Delete previous CTest results and run tests
 68 |                     rm -rf $WORKSPACE/build/Testing
 69 |                     cd $WORKSPACE/build
 70 |                     export PATH=$PATH:. 
 71 |                     ctest --no-compress-output -T Test
 72 |                 '''
 73 |             }
 74 |         }
 75 |         stage ('package') {
 76 |             steps {
 77 |                 sh 'cd build && make package'
 78 |                 archiveArtifacts allowEmptyArchive: true, artifacts: 'build/POLAR-3.0.0-Linux.tar.gz'
 79 |             }
 80 |         }
 81 |     }
 82 |     // Post build actions
 83 |     post {
 84 |         //always {
 85 |         //}
 86 |         //success {
 87 |         //}
 88 |         //unstable {
 89 |         //}
 90 |         //failure {
 91 |         //}
 92 |         unstable {
 93 |                 emailext body: "${env.JOB_NAME} - Please go to ${env.BUILD_URL}", subject: "Jenkins Pipeline build is UNSTABLE", recipientProviders: [[$class: 'CulpritsRecipientProvider'], [$class: 'RequesterRecipientProvider']]
 94 |         }
 95 |         failure {
 96 |                 emailext body: "${env.JOB_NAME} - Please go to ${env.BUILD_URL}", subject: "Jenkins Pipeline build FAILED", recipientProviders: [[$class: 'CulpritsRecipientProvider'], [$class: 'RequesterRecipientProvider']]
 97 |         }
 98 |     }
 99 | }
100 | 


--------------------------------------------------------------------------------
/src/mellipj.c:
--------------------------------------------------------------------------------
  1 | 
  2 | /**
  3 |  ZOLOPD-SVD
  4 |  *
  5 |  * (C) Copyright 2016 King Abdullah University of Science and Technology
  6 |  * Authors:
  7 |  * Dalal Sukkari (dalal.sukkari@kaust.edu.sa)
  8 |  * David Keyes (david.keyes@kaust.edu.sa)
  9 |  * Hatem Ltaief (hatem.ltaief@kaust.edu.sa)
 10 |  *  
 11 |  * Redistribution  and  use  in  source and binary forms, with or without
 12 |  * modification,  are  permitted  provided  that the following conditions
 13 |  * are met:
 14 |  * 
 15 |  * Redistributions  of  source  code  must  retain  the above copyright
 16 |  * notice,  this  list  of  conditions  and  the  following  disclaimer.
 17 |  * Redistributions  in  binary  form must reproduce the above copyright
 18 |  * notice,  this list of conditions and the following disclaimer in the
 19 |  * documentation  and/or other materials provided with the distribution.
 20 |  * Neither  the  name of the King Abdullah University of Science and
 21 |  * Technology nor the names of its contributors may be used to endorse
 22 |  * or promote products derived from this software without specific prior
 23 |  * written permission.
 24 |  *
 25 |  *
 26 |  THIS  SOFTWARE  IS  PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 27 |  ``AS IS''  AND  ANY  EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 28 |  LIMITED  TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 29 |  A  PARTICULAR  PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 30 |  HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 31 |  SPECIAL,  EXEMPLARY,  OR  CONSEQUENTIAL  DAMAGES  (INCLUDING,  BUT NOT
 32 |  LIMITED  TO,  PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 33 |  DATA,  OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 34 |  THEORY  OF  LIABILITY,  WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 35 |  (INCLUDING  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 36 |  OF  THIS  SOFTWARE,  EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 37 | function [sn,cn,dn] = mellipj(u,alpha,tol)
 38 | mELLIPJ Jacobi elliptic functions. MATLAB's built-in code, modified for improved accuracy. 
 39 |    [SN,CN,DN] = ELLIPJ(U,M) returns the values of the Jacobi elliptic 
 40 |    functions Sn, Cn and Dn, evaluated for corresponding elements of 
 41 |    argument U and parameter M.  U and M must be arrays of the same 
 42 |    size or either can be scalar.  As currently implemented, M is 
 43 |    limited to 0 <= M <= 1. 
 44 | 
 45 |    [SN,CN,DN] = ELLIPJ(U,M,TOL) computes the elliptic functions to
 46 |    the accuracy TOL instead of the default TOL = EPS.  
 47 | */
 48 | #include "polar.h"
 49 | 
 50 | int mellipj(double u, double alpha, double *sn, double *cn, double *dn, double *work)
 51 | {
 52 | 
 53 |     double eps, tol;
 54 |     int chunk = 10, i, n, in;
 55 |      
 56 |     double *a    = work;
 57 |     double *b    = a + chunk + 1;
 58 |     double *c    = b + chunk + 1;
 59 | 
 60 |     double m1, m, mmax;
 61 | 
 62 |     memset(a, 0, (chunk + 1)* sizeof(double));
 63 |     memset(b, 0, (chunk + 1)* sizeof(double));
 64 |     memset(c, 0, (chunk + 1)* sizeof(double));
 65 | 
 66 |     u=creal(u);
 67 |     alpha=creal(alpha);
 68 | 
 69 |     m  = sin(alpha) * sin(alpha);
 70 |     m1 = cos(alpha) * cos(alpha);
 71 | 
 72 |     eps = LAPACKE_dlamch_work('e');
 73 |     tol = eps;
 74 | 
 75 |     mmax = 1;
 76 | 
 77 |     *cn=u;
 78 |     *sn = cn[0]; *dn = sn[0];
 79 | 
 80 | 
 81 |     if ((m < 0) || (m > 1)){ 
 82 | 	fprintf(stderr, "error(ellipj:MOutOfRange)") ;
 83 | 	return -1;
 84 |     }
 85 | 
 86 |     // pre-allocate space and augment if needed
 87 |     chunk = 10;
 88 |     c[0] = creal(sin(alpha));
 89 |     b[0] = creal(cos(alpha));
 90 |     a[0] = c[0];
 91 | 
 92 |     a[0]=1;
 93 | 
 94 |     n = 0.;
 95 |     i = 1;
 96 |     in = 0;
 97 |     while ((fabs(c[i-1]) > tol) && i<1000 ){
 98 |         i = i + 1;
 99 |         if (i > chunk+1){
100 | 
101 |            a = work;
102 |            b = a + (2*chunk + 1);
103 |            c = b + (2*chunk + 1);
104 |            memset(a, 0, (2*chunk + 1)* sizeof(double));
105 |            memset(b, 0, (2*chunk + 1)* sizeof(double));
106 |            memset(c, 0, (2*chunk + 1)* sizeof(double));
107 |         }
108 |         a[i-1] = 0.5 * (a[i-2] + b[i-2]);
109 |         b[i-1] = sqrt(a[i-2] * b[i-2]);
110 |         c[i-1] = 0.5 * (a[i-2] - b[i-2]);
111 |         if ((fabs(c[i-1]) <= tol) && (fabs(c[i-2]) > tol)) {in = 1;}
112 |         if (in != 0){
113 |            n = i-1;
114 |         }
115 |     }
116 |     double *phin = c + (2*chunk + 1);
117 |     memset(phin, 0, i * sizeof(double));
118 |     phin[0]=u;
119 | 
120 |     phin[i-1] = cpow(2,n)*a[i-1]*u;
121 |     while (i > 1){
122 |         i = i - 1;
123 |         if (n >= i) {in = 1;}
124 |         phin[i-1] = phin[i];
125 |         if (in != 0){
126 |            phin[i-1] = 0.5 * (asin(c[i]*sin(creal(phin[i]))/a[i]) + phin[i]);
127 |         }
128 |     }
129 |     *sn = sin(creal(phin[0]));
130 |     *cn = cos(creal(phin[0]));
131 |     *dn = sqrt(1 - m * sn[0]*sn[0]);
132 | 
133 | return 0;
134 | }
135 | 
136 | int choosem(double con, int *m){  
137 |     //choose the Zolotarev degree according to the condnum
138 |     if (con < 1.001)         {*m=2;  }        
139 |     else if (con <= 1.01)    {*m = 3;}            
140 |     else if (con <= 1.1)     {*m = 4;}            
141 |     else if (con <= 1.2)     {*m = 5;}            
142 |     else if (con <= 1.5)     {*m = 6;}
143 |     else if (con <= 2)       {*m = 8;}  // one-step convergence till here
144 |     else if (con < 6.5)      {*m = 2;}
145 |     else if (con < 180)      {*m = 3;}       
146 |     else if (con < 1.5*1e4) {*m = 4;}
147 |     else if (con < 2*1e6)   {*m = 5;}
148 |     else if (con < 1*1e9)   {*m = 6;}
149 |     else if (con < 3*1e12)  {*m = 7;}
150 |     else                     {*m = 8;}
151 |     
152 | return 0;
153 | }
154 | 
155 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | POLAR
  2 | ================
  3 | POLAR is a high performance open-source software package to compute the polar decomposition ((PD)) of a dense matrix A = UH
  4 | based on QR-based Dynamically Weighted Halley (QDWH) and ZOLO-PD algorithms.
  5 | 
  6 | QDWH
  7 | ================
  8 | 
  9 | The **QR-based Dynamically Weighted Halley** (QDWH) is one of
 10 | the most popular algorithms to compute the polar decomposition. It is backward stable and converges
 11 | in at most six iterations. 
 12 | 
 13 | ZOLO-PD
 14 | ================
 15 | 
 16 | ZOLO-PD relies on the **Zolotarev function which is the best rational approximant to the sign function**  
 17 | for computing the polar decomposition.
 18 | Building upon on the QR-based
 19 | Dynamically Weighted Halley (QDWH) algorithm, the key idea
 20 | lies in finding the best rational approximation for the scalar sign function,
 21 | which also corresponds to the polar factor for symmetric matrices,
 22 | to further accelerate the QDWH convergence.
 23 | Based on the Zolotarev rational functions---introduced by Zolotarev (ZOLO) in
 24 | 1877--- this new PD algorithm ZOLO-PD converges within two iterations even for ill-conditioned matrices,
 25 | instead of the original six iterations needed for QDWH.
 26 | ZOLO-PD uses the property of Zolotarev functions that optimality is maintained when
 27 | two functions are composed in an appropriate manner.
 28 | The resulting ZOLO-PD has a convergence rate up to seventeen,
 29 | in contrast to the cubic convergence rate for QDWH.
 30 | This comes at the price of higher arithmetic costs and memory footprint. These
 31 | extra floating-point operations can, however, be processed in an
 32 | embarrassingly parallel fashion. 
 33 | 
 34 | Current Features of QDWH/ZOLOPD
 35 | ===========================
 36 | 
 37 | - Written in C.
 38 | - Support for Double Precision.
 39 | - Support for Two-Dimensional Block Cyclic Data Distribution.
 40 | - ScaLAPACK Interface / Native Interface.
 41 | - ScaLAPACK-Compliant Error Handling.
 42 | - ScaLAPACK-Derived Testing Suite
 43 | - ScaLAPACK-Compliant Accuracy.
 44 |  
 45 | Programming models and backends:
 46 | 1.  MPI
 47 | 2.  ScaLAPACK
 48 | 
 49 | 
 50 | Installation
 51 | ============
 52 | 
 53 | The installation requires at least **CMake** of version 3.2.3. To build the polar decomposition based on QDWH/ZOLOPD,
 54 | follow these instructions:
 55 | 
 56 | 1.  Get polar from git repository
 57 | 
 58 |         git clone git@github.com:ecrc/polar
 59 | 
 60 | 2.  Go into polar folder
 61 | 
 62 |         cd polar
 63 | 
 64 | 3.  Create build directory and go there
 65 | 
 66 |         mkdir build && cd build
 67 | 
 68 | 4.  Use CMake to get all the dependencies
 69 | 
 70 |         cmake .. -DCMAKE_INSTALL_PREFIX=/path/to/install/ 
 71 | 
 72 | 5.  To use exist dependencies
 73 | 
 74 |         cmake .. -DCMAKE_INSTALL_PREFIX=/path/to/install/ -DSCALAPACK_DIR=/path/to/scalapack/install/ -DSLTMG_LIBRARIES=/path/to/scalapack/install/lib/libsltmg.a
 75 | 
 76 | 5.  To build the testing binaries (optional) add the following:
 77 | 
 78 |         -DPOLAR_TESTING:BOOL=ON
 79 | 
 80 | 5.  Build polar
 81 | 
 82 |         make -j
 83 | 
 84 | 6.  Install polar
 85 | 
 86 |         make install
 87 | 
 88 | 7. Add line
 89 | 
 90 |         export PKG_CONFIG_PATH=/path/to/install/lib/pkgconfig:$PKG_CONFIG_PATH
 91 | 
 92 |     to your .bashrc file.
 93 | 
 94 | Now you can use pkg-config executable to collect compiler and linker flags for
 95 | polar based on QDWH/ZOLO-PD.
 96 | 
 97 | Testing and Timing
 98 | ==================
 99 | 
100 | The directories testing and timing contain an example 
101 | to test the accuracy and the performance of QDWH/ZOLO-PD using
102 | ill (with condition number less than 5.e12) and well-conditioned random matrices.
103 | 
104 |    The complete list of options is available below with -h option:
105 |   
106 |   ```
107 |        "======= QDWH/ZOLOPD testing using ScaLAPACK\n"
108 |        " -p      --nprow         : Number of MPI process rows\n"
109 |        " -q      --npcol         : Number of MPI process cols\n"
110 |        " -jl     --lvec          : Compute left singular vectors\n"
111 |        " -jr     --rvec          : Compute right singular vectors\n"
112 |        " -n      --N             : Dimension of the matrix\n"
113 |        " -b      --nb            : Block size\n"
114 |        " -m      --mode          : [1:6] Mode from pdlatms used to generate the matrix\n"
115 |        " -k      --cond          : Condition number used to generate the matrix\n"
116 |        " -o      --optcond       : Estimate Condition number using QR\n"
117 |        " -i      --niter         : Number of iterations\n"
118 |        " -r      --n_range       : Range for matrix sizes Start:Stop:Step\n"
119 |        " -c      --check         : Check the solution\n"
120 |        " -v      --verbose       : Verbose\n"
121 |        " -h      --help          : Print this help\n" );
122 | ```
123 |      On Cray systems, the launching command typically looks like:
124 |     
125 |        srun --ntasks=nT --hint=nomultithread ./main --nprow p --npcol q  --b 64 --cond 1e16 --niter 1 --n_range start:stop:step  --check
126 | 
127 |      1. The number of the nodes is N, the number of tasks (nT) = N * (number_of_cores per node ). The programming model is pure MPI (no OpenMP, i.e., sequential BLAS).
128 |      2. PxQ is the process grid configuration, where (nT - PxQ = 0)
129 | 
130 | 
131 | TODO List
132 | =========
133 | 
134 | 1.  Add support for the other precisions 
135 | 2.  Extend task-based programming model
136 | 3.  Port to various dynamic runtime systems
137 | 
138 | 
139 | References
140 | ==========
141 | 1. H. Ltaief, D. Sukkari, A. Esposito, Y. Nakatsukasa and D. Keyes, Massively Parallel 
142 | Polar Decomposition on Distributed-Memory Systems, *Submitted to IEEE Transactions on 
143 | Parallel Computing TOPC*, http://hdl.handle.net/10754/626359.1, 2018.
144 | 2. D. Sukkari, H. Ltaief, A. Esposito and D. Keyes, A QDWH-Based SVD Software Framework on
145 | Distributed-Memory Manycore Systems, *Submitted to ACM Transactions on Mathematical Software TOMS*, 
146 | http://hdl.handle.net/10754/626212, 2017.
147 | 3. D. Sukkari, H. Ltaief, M. Faverge, and D. Keyes, Asynchronous Task-Based Polar
148 | Decomposition on Massively Parallel Systems, *IEEE Transactions on Parallel and 
149 | Distributed Systems TPDS*, volume 29, pages 312–323, https://ieeexplore.ieee.org/document/8053812/, 2017.
150 | 4. D. Sukkari, H. Ltaief and D. Keyes, A High Performance QDWH-SVD Solver using
151 | Hardware Accelerators, *ACM Transactions on Mathematical Software TOMS*, vol. 43 (1), pp. 1-25, 2016.
152 | 5. D. Sukkari, H. Ltaief and D. Keyes, High Performance Polar Decomposition for SVD
153 | Solvers on Distributed Memory Systems, Best Papers, *Proceedings of the 22nd International 
154 | Euro-Par Conference*, https://doi.org/10.1007/978-3-319-43659-3_44, 2016.
155 | 6. D.Sukkari, H. Ltaief and D. Keyes, A High Performance QDWH-SVD Solver using 
156 | Hardware Accelerators, *ACM Transactions on Mathematical Software TOMS*, 
157 | http://doi.acm. org/10.1145/2894747, volume 43, pages 6:1–6:25, 2016.
158 | 7. Y. Nakatsukasa and N. J. Higham, Stable and Efficient Spectral Divide and Conquer 
159 | Algorithms for the Symmetric Eigenvalue Decomposition and the SVD, *SIAM Journal on Scientific Computing*,
160 | vol. 35, no. 3, pp. A1325–A1349, http://epubs.siam.org/doi/abs/10.1137/120876605, 2013.
161 | 8. Y. Nakatsukasa, R. Freund, using Zolotarev's Rational Approximation for Computing the Polar, 
162 | Symmetric Eigenvalue, and Singular Value Decompositions, *SIAM Review*, 
163 | https://books.google.com.sa/books?id=a9d7rgEACAAJ, 2016.
164 | 
165 | 
166 | Questions?
167 | ==========
168 | Please feel free to create an issue on Github for any questions and inquiries.
169 | 
170 | 


--------------------------------------------------------------------------------
/include/myscalapack.h:
--------------------------------------------------------------------------------
  1 | /**
  2 |  *
  3 |  * Copyright (c) 2017, King Abdullah University of Science and Technology
  4 |  * All rights reserved.
  5 |  *
  6 |  **/
  7 | 
  8 | /*
  9 |  * Copyright (c) 2009-2010 The University of Tennessee and The University
 10 |  *                         of Tennessee Research Foundation.  All rights
 11 |  *                         reserved.
 12 |  * Copyright (c) 2010      University of Denver, Colorado.
 13 |  */
 14 | 
 15 | #ifndef MYSCALAPACK_H
 16 | #define MYSCALAPACK_H
 17 | 
 18 | #ifdef SUP_
 19 | #define pdpotrf_  pdpotrf
 20 | #define pdpotri_  pdpotri
 21 | #define pdpotrs_  pdpotrs
 22 | #define pdsymm_   pdsymm
 23 | #define pdgetrf_  pdgetrf 
 24 | #define pdlansy_  pdlansy 
 25 | #define pdmatgen_ pdmatgen
 26 | #define pdtrsm_   pdtrsm
 27 | #define psgesv_   psgesv
 28 | #define pdgesv_   pdgesv
 29 | #define psgemm_   psgemm
 30 | #define pdgemm_   pdgemm
 31 | #define numroc_   numroc
 32 | #define pslange_  pslange
 33 | #define pdlange_  pdlange
 34 | #define pslacpy_  pslacpy
 35 | #define pdlacpy_  pdlacpy
 36 | #define pdgeqrf_  pdgeqrf
 37 | #define pdormqr_  pdormqr
 38 | #define psgesvd_  psgesvd
 39 | #define pdgesvd_  pdgesvd
 40 | #define pslaset_  pslaset
 41 | #define pdlaset_  pdlaset
 42 | #define pselset_  pselset
 43 | #define pdelset_  pdelset
 44 | #define pslamch_  pslamch
 45 | #define pdlamch_  pdlamch
 46 | #define indxg2p_  indxg2p
 47 | #define indxg2l_  indxg2l
 48 | #define descinit_ descinit
 49 | #define pslawrite_ pslawrite
 50 | #define pdlawrite_ pdlawrite
 51 | #define blacs_get_      blacs_get
 52 | #define blacs_pinfo_    blacs_pinfo
 53 | #define blacs_gridinit_ blacs_gridinit
 54 | #define blacs_gridinfo_ blacs_gridinfo
 55 | #define blacs_gridexit_ blacs_gridexit
 56 | #define blacs_exit_     blacs_exit
 57 | #endif
 58 | 
 59 | extern void Cblacs_pinfo( int* mypnum, int* nprocs);
 60 | extern void Cblacs_get( int context, int request, int* value);
 61 | extern int  Cblacs_gridinit( int* context, char * order, int np_row, int np_col);
 62 | extern void Cblacs_gridinfo( int context, int*  np_row, int* np_col, int*  my_row, int*  my_col);
 63 | extern void Cblacs_gridexit( int context);
 64 | extern void Cblacs_exit( int error_code);
 65 | 
 66 | extern void blacs_pinfo_( int *mypnum, int *nprocs);
 67 | extern void blacs_get_( int *context, int *request, int* value);
 68 | extern void blacs_gridinit_( int* context, char *order, int *np_row, int *np_col);
 69 | extern void blacs_gridinfo_( int *context, int *np_row, int *np_col, int *my_row, int *my_col);
 70 | extern void blacs_gridexit_( int *context);
 71 | extern void blacs_exit_( int *error_code);
 72 | 
 73 | extern void pdgeqrf_( int *m, int *n, double *a, int *ia, int *ja, int *desca, double *tau, double *work, int *lwork, int *info );
 74 | extern void pdormqr_( char *side, char *trans, int *m, int *n, int *k, double *a, int *ia,
 75 | 		int *ja, int *desca, double *tau, double *c, int *ic, int *jc, int *descc, double *work, int *lwork, int *info );
 76 | extern void pdtrsm_ ( char *side, char *uplo, char *transa, char *diag, int *m, int *n, double *alpha, double *a, int *ia,
 77 | 		int *ja, int *desca, double *b, int *ib, int *jb, int *descb );
 78 | 
 79 | extern float  pslange_( char *norm, int *m, int *n, float     *A, int *ia, int *ja, int *descA, float *work);
 80 | extern double pdlange_( char *norm, int *m, int *n, double    *A, int *ia, int *ja, int *descA, double *work);
 81 | 
 82 | 
 83 | extern void pslacpy_( char *uplo, int *m, int *n, float     *A, int *ia, int *ja, int *descA,
 84 |                                                     float     *B, int *ib, int *jb, int *descB);
 85 | extern void pdlacpy_( char *uplo, int *m, int *n, double     *A, int *ia, int *ja, int *descA,
 86 |                                                     double     *B, int *ib, int *jb, int *descB);
 87 | 
 88 | extern void psgesv_( int *n, int *nrhs, float     *A, int *ia, int *ja, int *descA, int *ipiv,
 89 |                                           float     *B, int *ib, int *jb, int *descB, int *info);
 90 | extern void pdgesv_( int *n, int *nrhs, double    *A, int *ia, int *ja, int *descA, int *ipiv,
 91 |                                           double    *B, int *ib, int *jb, int *descB, int *info);
 92 | 
 93 | extern void psgemm_( char *transa, char *transb, int *M, int *N, int *K,
 94 |                                           float     *alpha,
 95 |                                           float     *A, int *ia, int *ja, int *descA,
 96 |                                           float     *B, int *ib, int *jb, int *descB,
 97 |                                           float     *beta,
 98 |                                           float     *C, int *ic, int *jc, int *descC );
 99 | extern void pdgemm_( char *transa, char *transb, int *M, int *N, int *K,
100 |                                           double    *alpha,
101 |                                           double    *A, int *ia, int *ja, int *descA,
102 |                                           double    *B, int *ib, int *jb, int *descB,
103 |                                           double    *beta,
104 |                                           double    *C, int *ic, int *jc, int *descC );
105 | 
106 | extern void psgesvd_( char *jobu, char *jobvt, int *m, int *n,
107 |                                   float     *A, int *ia, int *ja, int *descA,
108 |                                   float     *s,
109 |                                   float     *U, int *iu, int *ju, int *descU,
110 |                                   float     *VT, int *ivt, int *jvt, int *descVT,
111 |                                   float     *work, int *lwork, int *info);
112 | extern void pdgesvd_( char *jobu, char *jobvt, int *m, int *n,
113 |                                   double    *A, int *ia, int *ja, int *descA,
114 |                                   double    *s,
115 |                                   double    *U, int *iu, int *ju, int *descU,
116 |                                   double    *VT, int *ivt, int *jvt, int *descVT,
117 |                                   double    *work, int *lwork, int *info);
118 | 
119 | extern void pslaset_( char *uplo, int *m, int *n, float     *alpha, float     *beta, float     *A, int *ia, int *ja, int *descA );
120 | extern void pdlaset_( char *uplo, int *m, int *n, double    *alpha, double    *beta, double    *A, int *ia, int *ja, int *descA );
121 | 
122 | extern void pselset_( float     *A, int *ia, int *ja, int *descA, float     *alpha);
123 | extern void pdelset_( double    *A, int *ia, int *ja, int *descA, double    *alpha);
124 | 
125 | extern void pslawrite_( char **filenam, int *m, int *n, float  *A, int *ia, int *ja, int *descA, int *irwrit, int *icwrit, float  *work);
126 | extern void pdlawrite_( char **filenam, int *m, int *n, double *A, int *ia, int *ja, int *descA, int *irwrit, int *icwrit, double *work);
127 | 
128 | extern float pslamch_( int *ictxt, char *cmach);
129 | extern double pdlamch_( int *ictxt, char *cmach);
130 | 
131 | extern int indxg2p_( int *indxglob, int *nb, int *iproc, int *isrcproc, int *nprocs);
132 | extern int indxg2l_( int *indxglob, int *nb, int *iproc, int *isrcproc, int *nprocs);
133 | extern int numroc_( int *n, int *nb, int *iproc, int *isrcproc, int *nprocs);
134 | extern void descinit_( int *desc, int *m, int *n, int *mb, int *nb, int *irsrc, int *icsrc,
135 | 				int *ictxt, int *lld, int *info);
136 | 
137 | extern void   pdgetrf_ ( int* m, int *n, double *a, int *i1, int *i2, int *desca, int* ipiv, int *info );
138 | extern void   pdgetrs_ ( char* trans, int* n, int* nrhs, double* A, int* ia, int* ja, int* descA, int* ippiv, double* B, int* ib, int* jb, int* descB, int* info);
139 | extern double pdlansy_ ( char *norm, char *uplo, int *n, double *a, int *ia, int *ja, int *desca, double *work );
140 | extern void   pdmatgen_( int *ictxt, char *aform, char *diag, int *m, int *n, int *mb, int *nb, double *a, int *lda, int *iarow, int *iacol, int *iseed, int *iroff, int *irnum, int *icoff, int *icnum, int *myrow, int *mycol, int *nprow, int *npcol );
141 | 
142 | 
143 | extern void pdpotrf_( char *uplo, int *n, double *a, int *ia, int *ja, int *desca, int *info );
144 | extern void pdpotri_( char *uplo, int *n, double *a, int *ia, int *ja, int *desca, int *info );
145 | extern void pdpotrs_( char *uplo, int *n, int *nrhs, double *a, int *ia, int *ja, int *desca, double *b, int *ib, int *jb, int *descb, int *info );
146 | 
147 | extern void pdsymm_(char *side, char *uplo, int *m, int *n, double *alpha, double *a, int *ia, int *ja, int *desca, double *b, int *ib, int *jb, int *descb, double *beta, double *c, int *ic, int *jc, int *descc);
148 | 
149 | extern void pdposv_(char *uplo, int *n, int *nrhs, double *a, int *ia, int *ja,
150 |   int *desca, double *b, int *ib, int *jb, int *descb, int *info);
151 | extern void pdgeadd_(char *trans, int *m, int *n, double *alpha, double *a,
152 |   int *ia, int *ja, int *desca, double *beta, double *c, int *ic, int *jc,
153 |   int *descc);
154 | extern void pdgemv_(char *trans, int *m, int *n, double *alpha, double *a,
155 |   int *ia, int *ja, int *desca, double *x, int *ix, int *jx, int *descx,
156 |   int *incx, double *beta, double *y, int *iy, int *jy, int *descy, int *incy);
157 | extern void chk1mat_(int *ma, int *mapos0, int *na, int *napos0, int *ia,
158 |   int *ja, int *desca, int *descapos0, int *info);
159 | extern void pchk1mat_(int *ma, int *mapos0, int *na, int *napos0, int *ia,
160 |   int *ja, int *desca, int *descapos0, int *nextra, int *ex, int *expos,
161 |   int *info);
162 | extern void pdlascl_(char *type, double *cfrom, double *cto, int *m, int *n,
163 |   double *a, int *ia, int *ja, int *desca, int *info);
164 | extern void pdorgqr_(int *m, int *n, int *k, double *a, int *ia, int *ja,
165 |   int *desca, double *tau, double *work, int *lwork, int *info);
166 | extern void pdgecon_(char *norm, int *n, double *a, int *ia, int *ja,
167 |   int *desca, double *anorm, double *rcond, double *work, int *lwork,
168 |   int *iwork, int *liwork, int *info);
169 | extern void pxerbla_(int *ictxt, char *srname, int *info);
170 | extern void pdtrtri_(char *uplo, char *diag, int *n, double *a, int *ia,
171 |   int *ja, int *desca, int *info);
172 | 
173 | #endif
174 | 


--------------------------------------------------------------------------------
/CMakeLists.txt:
--------------------------------------------------------------------------------
  1 | cmake_minimum_required(VERSION 3.2.3)
  2 | # directly make an error if in-source build
  3 | if("${CMAKE_SOURCE_DIR}" STREQUAL "${CMAKE_BINARY_DIR}")
  4 |     message(FATAL_ERROR "In-source builds are not allowed.\n"
  5 |                         "Please create a build directory first and execute cmake configuration from "
  6 |                         "this directory. Example: mkdir build && cd build && cmake ..")
  7 | endif()
  8 | 
  9 | project(POLAR Fortran C)
 10 | 
 11 | #########################
 12 | ### VARIABLES ###########
 13 | #########################
 14 | set( POLAR_LIBNAME polar )
 15 | option( POLAR_TESTING "Generates testing binaries" OFF)
 16 | option( BUILD_SHARED_LIBS "Generates shared libraries instead of static" OFF)
 17 | option( SHARED_NETLIBSCALAPACK "Build NETLIB-SCALAPACK as shared library." ${BUILD_SHARED_LIBS})
 18 | option( USE_PARALLEL_BLAS "If possible, prefer parallel blas libraries" ON )
 19 | set(EXTRA_LIBS "" CACHE STRING "Additional libraries that may be needed to compile/link" )
 20 | 
 21 | # Version 3.0.0
 22 | SET( POLAR_VERSION_MAJOR 3 )
 23 | SET( POLAR_VERSION_MINOR 0 )
 24 | SET( POLAR_VERSION_PATCH 0 )
 25 | set( POLAR_VERSION "${POLAR_VERSION_MAJOR}.${POLAR_VERSION_MINOR}.${POLAR_VERSION_PATCH}" )
 26 | #execute_process(COMMAND git rev-parse --short HEAD OUTPUT_VARIABLE POLAR_VERSION_RELEASE )
 27 | 
 28 | 
 29 | # Set the RPATH config
 30 | # --------------------
 31 | # use, i.e. don't skip the full RPATH for the build tree
 32 | set(CMAKE_SKIP_BUILD_RPATH  FALSE)
 33 | # when building, use the install RPATH
 34 | # (automated test will need this)
 35 | set(CMAKE_BUILD_WITH_INSTALL_RPATH TRUE)
 36 | # the RPATH to be used when installing
 37 | set(CMAKE_INSTALL_RPATH ${CMAKE_INSTALL_PREFIX}/lib)
 38 | 
 39 | 
 40 | # Packaging (make package)
 41 | SET(CPACK_PACKAGE_VERSION ${POLAR_VERSION})
 42 | SET(CPACK_GENERATOR "TGZ")
 43 | INCLUDE(CPack)
 44 | 
 45 | 
 46 | # This project needs C99 standard to compile properly
 47 | # (could have used CMAKE_C_FLAGS instead)
 48 | #ADD_DEFINITIONS( -std=c99 )
 49 | #set( CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -std=c99")
 50 | 
 51 | link_directories( ${CMAKE_INSTALL_PREFIX}/lib )
 52 | include_directories( ${CMAKE_SOURCE_DIR}/include )
 53 | 
 54 | ## CMAKE MODULES :: ECRC
 55 | ## REQUIRED FOR TESTS TO LINK LIBRARIES
 56 | if(NOT EXISTS "${PROJECT_SOURCE_DIR}/cmake_modules/ecrc/modules" )
 57 |     find_package(Git REQUIRED)
 58 |     execute_process(COMMAND ${GIT_EXECUTABLE} submodule init WORKING_DIRECTORY ${PROJECT_SOURCE_DIR} RESULT_VARIABLE _res_init OUTPUT_QUIET ERROR_QUIET)
 59 |     execute_process(COMMAND ${GIT_EXECUTABLE} submodule update WORKING_DIRECTORY ${PROJECT_SOURCE_DIR} RESULT_VARIABLE _res_update OUTPUT_QUIET ERROR_QUIET)
 60 |     if( ${_res_init} GREATER 0 OR ${_res_update} GREATER 0 )
 61 |         message(FATAL_ERROR "ECRC CMake modules were not found.\n"
 62 |                             "We tried: 'git submodule init && git submodule update' and resulted in error" )
 63 |     endif()
 64 | endif()
 65 | ## ECRC INITIALIZATION
 66 | list(APPEND CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake_modules)
 67 | list(APPEND CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/cmake_modules/ecrc/modules")
 68 | set(ECRC_CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake_modules/ecrc/modules )
 69 | include(EcrcInit)
 70 | 
 71 | # MPI
 72 | find_package( MPIEXT REQUIRED )
 73 | if( MPI_FOUND )
 74 |     # TODO check MPI_C_COMPILER
 75 |     include_directories( ${MPI_C_INCLUDE_PATH} )
 76 |     set( CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${MPI_C_LINK_FLAGS}" )
 77 | endif()
 78 | 
 79 | ## BLAS
 80 | find_package(BLASEXT)
 81 | if(BLAS_FOUND)
 82 | # Use parallel blas
 83 |     if( USE_PARALLEL_BLAS AND NOT "${BLAS_PAR_LIBRARIES}" STREQUAL "" )
 84 |         message( STATUS "Using Parallel Blas" )
 85 |         set( BLAS_LIBRARIES "${BLAS_PAR_LIBRARIES}" )
 86 |     endif()
 87 |     if (BLAS_LIBRARY_DIRS)
 88 |         # the RPATH to be used when installing
 89 |         list( APPEND CMAKE_INSTALL_RPATH ${BLAS_LIBRARY_DIRS})
 90 |         link_directories( ${BLAS_LIBRARY_DIRS} )
 91 |     endif()
 92 |     if(BLAS_LINKER_FLAGS)
 93 |         list(APPEND CMAKE_EXE_LINKER_FLAGS ${BLAS_LINKER_FLAGS})
 94 |         list(APPEND CMAKE_SHARED_LINKER_FLAGS ${BLAS_LINKER_FLAGS})
 95 |         #set( CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${BLAS_LINKER_FLAGS}")
 96 |         list( REMOVE_DUPLICATES CMAKE_EXE_LINKER_FLAGS )
 97 |     endif()
 98 | else()
 99 |     message( FATAL_ERROR "BLAS library has not been found")
100 | endif()
101 | 
102 | # LAPACK
103 | #find_package(LAPACK REQUIRED)
104 | find_package(LAPACKEXT REQUIRED)
105 | if(LAPACK_FOUND)
106 |     include_directories(${LAPACK_INCLUDE_DIRS})
107 |     if(LAPACK_LIBRARY_DIRS)
108 |         # the RPATH to be used when installing
109 |         list(APPEND CMAKE_INSTALL_RPATH ${LAPACK_LIBRARY_DIRS})
110 |         link_directories( ${LAPACK_LIBRARY_DIRS} )
111 |     endif()
112 |     if(LAPACK_LINKER_FLAGS)
113 |         list( REMOVE_DUPLICATES LAPACK_LINKER_FLAGS )
114 |         list( APPEND CMAKE_EXE_LINKER_FLAGS ${LAPACK_LINKER_FLAGS})
115 |         list( REMOVE_DUPLICATES CMAKE_EXE_LINKER_FLAGS )
116 |     endif()
117 | endif()
118 | 
119 | # SCALAPACK
120 | find_package( SCALAPACK )
121 | if(SCALAPACK_FOUND)
122 |     if (SCALAPACK_LIBRARY_DIRS)
123 |         # the RPATH to be used when installing
124 |         list(APPEND CMAKE_INSTALL_RPATH ${SCALAPACK_LIBRARY_DIRS})
125 |         link_directories(${SCALAPACK_LIBRARY_DIRS})
126 |     endif()
127 |     if(SCALAPACK_LINKER_FLAGS)
128 |         list( REMOVE_DUPLICATES SCALAPACK_LINKER_FLAGS )
129 |         list( APPEND CMAKE_EXE_LINKER_FLAGS ${SCALAPACK_LINKER_FLAGS})
130 |         list( REMOVE_DUPLICATES CMAKE_EXE_LINKER_FLAGS )
131 |     endif()
132 | endif()
133 | 
134 | 
135 | include(ExternalProject)
136 | 
137 | 
138 | # SLTMG - look for sltmg (only used for tests )
139 | if( POLAR_TESTING )
140 |     set( SLTMG_LIBRARIES "" CACHE STRING "User provided sltmg library" )
141 |     set( _notfound_msg_ "SLTMG not found, NETLIB-SCALAPACK will be downloaded in order to generate libsltmg.a\n If you want to use your own SLTMG, define SLTMG_LIBRARIES and make sure SCALAPACK is found." )
142 |     if( SLTMG_LIBRARIES )
143 |         set(CMAKE_REQUIRED_LIBRARIES "${SLTMG_LIBRARIES};${SCALAPACK_LIBRARIES}")
144 |         check_fortran_function_exists("pdlatms" SLTMG_WORKS)
145 |         set(CMAKE_REQUIRED_LIBRARIES)
146 |         if( SLTMG_WORKS AND SCALAPACK_FOUND )
147 |             message(STATUS "Found SLTMG : ${SLTMG_LIBRARIES}")
148 |         else()
149 |             message( STATUS "${_notfound_msg_}")
150 |         endif()
151 |     else()
152 |         message( STATUS "${_notfound_msg_}")
153 |     endif()
154 | endif()
155 | 
156 | if( NOT SCALAPACK_FOUND OR ( POLAR_TESTING AND NOT SLTMG_WORKS ) )
157 |     # Download NETLIB-SCALAPACK
158 |     set( _build_netlib-scalapack )
159 |     set( NETLIB_SCALAPACK_URL "http://www.netlib.org/scalapack/scalapack-2.0.2.tgz" CACHE STRING "URL to download NETLIB-SCALAPACK project" )
160 |     if(NOT SCALAPACK_FOUND)
161 |         set( SCALAPACK_LIBRARY_DIRS "${CMAKE_INSTALL_PREFIX}/lib")
162 |         set( SCALAPACK_LIBRARIES "scalapack")
163 |         set( _install_target "install" )
164 |         set( _build_target "all" )
165 |         set( _patch_file "scalapack-cmake-libsltmg.patch" )
166 |         message( "## NETLIB-SCALAPACK will be downloaded from ${NETLIB_SCALAPACK_URL} and built along with libsltmg.a " )
167 |     else()
168 |         set( _install_target "install.sltmg" )
169 |         set( _build_target "sltmg" )
170 |         set( _patch_file "scalapack-cmake-libsltmg-only.patch" )
171 |         message( "## NETLIB-SCALAPACK will be downloaded from ${NETLIB_SCALAPACK_URL} in order to generate libsltmg.a " )
172 |     endif()
173 | 
174 |     # Use the built sltmg
175 |     set( SLTMG_LIBRARIES "sltmg" )
176 | 
177 |     # build scalapack for sltmg
178 |     string(REPLACE ";" " " LAPACK_LIBRARIES_STR "${LAPACK_LIBRARIES}")
179 |     ExternalProject_Add( NETLIB-SCALAPACK
180 |                          PREFIX dependencies
181 |                          URL ${NETLIB_SCALAPACK_URL}
182 |                          #URL /tmp/scalapack-2.0.2.tgz
183 |                          CMAKE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${CMAKE_INSTALL_PREFIX} -DBUILD_SHARED_LIBS:BOOL=${SHARED_NETLIBSCALAPACK} -DLAPACK_LIBRARIES:STRING='${LAPACK_LIBRARIES_STR}'
184 |                          PATCH_COMMAND patch -p1 -i ${CMAKE_SOURCE_DIR}/${_patch_file}
185 |                          BUILD_COMMAND ${CMAKE_COMMAND} --build . --target ${_build_target} -- -j -l ${NUMBER_OF_CPU}
186 |                          INSTALL_COMMAND ${CMAKE_COMMAND} --build . --target ${_install_target} --
187 |                          #LOG_CONFIGURE 1
188 |                          #LOG_BUILD 1
189 |                          #LOG_INSTALL 1
190 |     )
191 | 
192 | endif()
193 | 
194 | 
195 | # POLAR code
196 | add_subdirectory(src)
197 | 
198 | # TESTS
199 | if( POLAR_TESTING )
200 | 
201 |     ## LAPACKE
202 |     find_package(LAPACKE COMPONENTS LAPACKEXT)
203 |     if(LAPACK_FOUND AND LAPACK_LIBRARY_DIRS)
204 |         # the RPATH to be used when installing
205 |         list(APPEND CMAKE_INSTALL_RPATH ${LAPACK_LIBRARY_DIRS})
206 |     else()
207 |         message(FATAL_ERROR "A LAPACK library is required but has not been found")
208 |     endif()
209 |     if(LAPACKE_FOUND)
210 |         if ( LAPACKE_INCLUDE_DIRS ) # Not always this variable is set, especially when is included in LAPACK
211 |             include_directories(${LAPACKE_INCLUDE_DIRS})
212 |         endif()
213 |         if(LAPACKE_LIBRARY_DIRS)
214 |             # the RPATH to be used when installing
215 |             list(APPEND CMAKE_INSTALL_RPATH "${LAPACKE_LIBRARY_DIRS}")
216 |         endif()
217 |         if(LAPACKE_LINKER_FLAGS)
218 |             list(APPEND CMAKE_EXE_LINKER_FLAGS ${LAPACKE_LINKER_FLAGS})
219 |         endif()
220 |     else()
221 |         if(ECRC_VERBOSE_FIND_PACKAGE)
222 |             if (LAPACKE_STANDALONE OR NOT LAPACKE_WORKS)
223 |                 if (NOT LAPACKE_lapacke.h_DIRS)
224 |                     Print_Find_Header_Status(lapacke lapacke.h)
225 |                 endif ()
226 |                 if (NOT LAPACKE_lapacke_LIBRARY)
227 |                     Print_Find_Library_Status(lapacke liblapacke)
228 |                 endif ()
229 |             endif()
230 |         else()
231 |             message(WARNING "LAPACKE library has not been found and ECRC_VERBOSE_FIND_PACKAGE is set to OFF"
232 |                             "Try to activate ECRC_VERBOSE_FIND_PACKAGE option (-DECRC_VERBOSE_FIND_PACKAGE=ON) to get some hints for the detection")
233 |         endif()
234 |         #message(FATAL_ERROR "A LAPACKE library is required but has not been found")
235 |     endif()
236 | 
237 |     # BUILD TESTS
238 |     enable_testing() # enables ctest
239 |     add_definitions(-DHAVE_GETOPT_LONG)
240 |     add_subdirectory(testing)
241 |     add_subdirectory(timing)
242 | endif()
243 | 
244 | install( FILES ${CMAKE_SOURCE_DIR}/include/polar.h DESTINATION include )
245 | 
246 | # PKGCONFIG
247 | set(prefix ${CMAKE_INSTALL_PREFIX})
248 | set(libdir ${CMAKE_INSTALL_PREFIX}/lib)
249 | set(PKG_CONFIG_DIR ${libdir}/pkgconfig)
250 | configure_file(${CMAKE_SOURCE_DIR}/polar.pc.in ${CMAKE_CURRENT_BINARY_DIR}/polar.pc)
251 | install( FILES ${CMAKE_BINARY_DIR}/polar.pc
252 |          DESTINATION ${PKG_CONFIG_DIR}
253 |        )
254 | 


--------------------------------------------------------------------------------
/scalapack-cmake-libsltmg-only.patch:
--------------------------------------------------------------------------------
  1 | diff -Naur -x BLACS -x PBLAS a/CMakeLists.txt c/CMakeLists.txt
  2 | --- a/CMakeLists.txt	2012-05-02 06:58:24.000000000 +0300
  3 | +++ c/CMakeLists.txt	2017-02-22 15:58:25.000000000 +0300
  4 | @@ -1,311 +1,11 @@
  5 |  cmake_minimum_required(VERSION 2.8)
  6 | -project(SCALAPACK C Fortran)
  7 | -# Configure the warning and code coverage suppression file
  8 | -configure_file( 
  9 | -  "${SCALAPACK_SOURCE_DIR}/CMAKE/CTestCustom.cmake.in"
 10 | -  "${SCALAPACK_BINARY_DIR}/CTestCustom.cmake"
 11 | -  COPYONLY
 12 | -)
 13 | -
 14 | -# Add the CMake directory for custon CMake modules
 15 | -set(CMAKE_MODULE_PATH "${SCALAPACK_SOURCE_DIR}/CMAKE" ${CMAKE_MODULE_PATH})
 16 | -
 17 | -if (UNIX)
 18 | -   if ( "${CMAKE_Fortran_COMPILER}" MATCHES "ifort" )
 19 | -  set( CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} -fltconsistency -fp_port" )
 20 | -   endif ()
 21 | -endif ()
 22 | -
 23 | -#
 24 | -#   MPI
 25 | -#
 26 | -#set(MPI_BASE_DIR "/Users/julie/opt/openmpi/" CACHE PATH "MPI Path")
 27 | -#set(MPI_BASE_DIR "/Users/julie/opt/mpich2/" CACHE PATH "MPI Path")
 28 | -set(CMAKE_PREFIX_PATH "${MPI_BASE_DIR};${CMAKE_PREFIX_PATH}")
 29 | -#set(MPI_COMPILER ${MPI_BASE_DIR}/bin/mpicc)
 30 | -
 31 | -find_package(MPI)
 32 | -if (MPI_FOUND)
 33 | -   message(STATUS "Found MPI_LIBRARY : ${MPI_FOUND} ")
 34 | -   INCLUDE_DIRECTORIES(${MPI_INCLUDE_PATH})
 35 | -
 36 | -   find_program(MPI_C_COMPILER
 37 | -      NAMES mpicc
 38 | -      HINTS "${MPI_BASE_DIR}"
 39 | -      PATH_SUFFIXES bin
 40 | -      DOC "MPI C compiler.")
 41 | -   MARK_AS_ADVANCED(MPI_C_COMPILER)
 42 | -   if ("${MPI_C_COMPILER}" STREQUAL "MPI_C_COMPILER-NOTFOUND")
 43 | -      message(ERROR "--> MPI C Compiler NOT FOUND (please set MPI_BASE_DIR accordingly")
 44 | -   else()
 45 | -      message(STATUS "--> MPI C Compiler : ${MPI_C_COMPILER}")
 46 | -      SET(CMAKE_C_COMPILER "${MPI_C_COMPILER}")
 47 | -      message(STATUS "--> C Compiler : ${CMAKE_C_COMPILER}")
 48 | -   endif()
 49 | -   find_program(MPI_Fortran_COMPILER
 50 | -      NAMES mpif77
 51 | -      HINTS "${MPI_BASE_DIR}"
 52 | -      PATH_SUFFIXES bin
 53 | -      DOC "MPI Fortran compiler.")
 54 | -   MARK_AS_ADVANCED(MPI_Fortran_COMPILER)
 55 | -   
 56 | -   
 57 | -
 58 | -   if ("${MPI_Fortran_COMPILER}" STREQUAL "MPI_Fortran_COMPILER-NOTFOUND")
 59 | -      message(ERROR "--> MPI Fortran Compiler NOT FOUND (please set MPI_BASE_DIR accordingly")
 60 | -   else()
 61 | -      message(STATUS "--> MPI Fortran Compiler : ${MPI_Fortran_COMPILER}")
 62 | -      SET(Fortran_COMPILER "${CMAKE_Fortran_COMPILER}")
 63 | -      SET(CMAKE_Fortran_COMPILER "${MPI_Fortran_COMPILER}")
 64 | -      message(STATUS "--> Fortran Compiler : ${CMAKE_Fortran_COMPILER}")
 65 | -   endif()
 66 | -   
 67 | -else()
 68 | -   message(STATUS "Found MPI_LIBRARY : ${MPI_FOUND} ")
 69 | -   set(MPI_BASE_DIR ${MPI_BASE_DIR} CACHE PATH "MPI Path")
 70 | -   UNSET(MPIEXEC CACHE)
 71 | -   UNSET(MPIEXEC_POSTFLAGS CACHE)
 72 | -   UNSET(MPIEXEC_PREFLAGS CACHE)
 73 | -   UNSET(MPIEXEC_MAX_NUMPROCS CACHE)
 74 | -   UNSET(MPIEXEC_NUMPROC_FLAG CACHE)
 75 | -   UNSET(MPI_COMPILE_FLAGS CACHE)
 76 | -   UNSET(MPI_LINK_FLAGS CACHE)
 77 | -   UNSET(MPI_INCLUDE_PATH CACHE)
 78 | -   message(FATAL_ERROR "--> MPI Library NOT FOUND -- please set MPI_BASE_DIR accordingly --")
 79 | -endif()
 80 | -
 81 | -
 82 | -if (UNIX)
 83 | -   if ( "${CMAKE_Fortran_COMPILER}" MATCHES "ifort" )
 84 | -  set( CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} -fltconsistency -fp_port" )
 85 | -   endif ()
 86 | -endif ()
 87 | -
 88 | -macro(SCALAPACK_install_library lib)
 89 | -  install(TARGETS ${lib} EXPORT scalapack-targets
 90 | -    ARCHIVE DESTINATION lib${LIB_SUFFIX}
 91 | -    LIBRARY DESTINATION lib${LIB_SUFFIX}
 92 | -    RUNTIME DESTINATION Testing
 93 | -  )
 94 | -endmacro()
 95 | -
 96 | -# --------------------------------------------------
 97 | -# Testing
 98 | -SET(DART_TESTING_TIMEOUT 600)
 99 | -
100 | -enable_testing()
101 | -include(CTest)
102 | -enable_testing()
103 | -# --------------------------------------------------
104 | -
105 | -# Organize output files.  On Windows this also keeps .dll files next
106 | -# to the .exe files that need them, making tests easy to run.
107 | -set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${SCALAPACK_BINARY_DIR}/TESTING)
108 | -set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${SCALAPACK_BINARY_DIR}/lib)
109 | -set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${SCALAPACK_BINARY_DIR}/lib)
110 | -
111 | -# --------------------------------------------------
112 | -# Check for any necessary platform specific compiler flags
113 | -include( CheckBLACSCompilerFlags )
114 | -CheckBLACSCompilerFlags()
115 | -
116 | -set(prefix ${CMAKE_INSTALL_PREFIX})
117 | -set(libdir ${CMAKE_INSTALL_PREFIX}/lib${LIB_SUFFIX})
118 | -set(PKG_CONFIG_DIR ${libdir}/pkgconfig)
119 | -
120 | -# --------------------------------------------------
121 | -# BLACS Internal variables
122 | -#
123 | -#   Fortran Mangling, MPI Tests and BLACS settings
124 | -#
125 | -include(FortranMangling)
126 | -COMPILE(install_COMPILED)
127 | -
128 | -FORTRAN_MANGLING(CDEFS)
129 | -#MESSAGE(STATUS "Setting CDEFS = ${CDEFS}")
130 | -#set(CDEFS ${CDEFS} CACHE STRING "Fortran Mangling" FORCE)
131 | -MESSAGE(STATUS "=========")
132 | -
133 | -# --------------------------------------------------
134 | -# Compiler Flags
135 | -ADD_DEFINITIONS( "-D${CDEFS}")
136 | -
137 | -# --------------------------------------------------
138 | -# ScaLAPACK needs BLAS and LAPACK
139 | -
140 | -option(USE_OPTIMIZED_LAPACK_BLAS "Whether or not to search for optimized LAPACK and BLAS libraries on your machine (if not found, Reference LAPACK and BLAS will be downloaded and installed)" ON)
141 | -
142 | -message(STATUS "CHECKING BLAS AND LAPACK LIBRARIES")
143 | -
144 | -IF(LAPACK_LIBRARIES)
145 | -  include(CheckFortranFunctionExists)
146 | -  message(STATUS "--> LAPACK supplied by user is ${LAPACK_LIBRARIES}.")
147 | -  set(CMAKE_REQUIRED_LIBRARIES ${LAPACK_LIBRARIES})
148 | -  CHECK_FORTRAN_FUNCTION_EXISTS("dgesv" LAPACK_FOUND)
149 | -  unset( CMAKE_REQUIRED_LIBRARIES )
150 | -  message(STATUS "--> LAPACK routine dgesv is found: ${LAPACK_FOUND}.")
151 | -ENDIF()
152 | -
153 | -if(LAPACK_FOUND)
154 | -      message(STATUS "--> LAPACK supplied by user is WORKING, will use ${LAPACK_LIBRARIES}.")
155 | -else(LAPACK_FOUND)
156 | -   if(USE_OPTIMIZED_LAPACK_BLAS)
157 | -      message(STATUS "--> Searching for optimized LAPACK and BLAS libraries on your machine.")
158 | -      find_package(LAPACK)
159 | -   ENDIF(USE_OPTIMIZED_LAPACK_BLAS)
160 | -   if(NOT LAPACK_FOUND)
161 | -      message(STATUS "--> LAPACK and BLAS were not found. Reference LAPACK and BLAS will be downloaded and installed")
162 | -      include(ExternalProject)
163 | -      ExternalProject_Add(
164 | -		lapack
165 | -		URL http://www.netlib.org/lapack/lapack.tgz
166 | -		CMAKE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${SCALAPACK_BINARY_DIR}
167 | -        PREFIX ${SCALAPACK_BINARY_DIR}/dependencies
168 | -      )
169 | -	  if (UNIX)
170 | -         SET(LAPACK_LIBRARIES ${SCALAPACK_BINARY_DIR}/lib/liblapack.a CACHE STRING "LAPACK library" FORCE)
171 | -         SET(BLAS_LIBRARIES ${SCALAPACK_BINARY_DIR}/lib/libblas.a CACHE STRING "BLAS library" FORCE)
172 | -	  else (UNIX) # On Windows
173 | -         SET(LAPACK_LIBRARIES ${SCALAPACK_BINARY_DIR}/lib/liblapack.lib CACHE STRING "LAPACK library" FORCE)
174 | -         SET(BLAS_LIBRARIES ${SCALAPACK_BINARY_DIR}/lib/libblas.lib CACHE STRING "BLAS library" FORCE)
175 | -      endif (UNIX)
176 | -   ENDIF()
177 | -ENDIF()
178 | -
179 | -message(STATUS "BLAS library: ${BLAS_LIBRARIES}")
180 | -message(STATUS "LAPACK library: ${LAPACK_LIBRARIES}")
181 | -
182 | -MESSAGE(STATUS "=========")
183 | -
184 | -# --------------------------------------------------
185 | -# By default static library
186 | -OPTION(BUILD_SHARED_LIBS "Build shared libraries" OFF )
187 | -OPTION(BUILD_STATIC_LIBS "Build static libraries" ON )
188 | -
189 | -# --------------------------------------------------
190 | -# Subdirectories that need to be processed
191 | -   
192 | -macro(append_subdir_files variable dirname)
193 | -get_directory_property(holder DIRECTORY ${dirname} DEFINITION ${variable})
194 | -foreach(depfile ${holder})
195 | -  list(APPEND ${variable} "${dirname}/${depfile}")
196 | -endforeach()
197 | -endmacro()
198 | -
199 | -
200 | -#
201 | -# BLACS
202 | -#
203 | -add_subdirectory(BLACS)
204 | -append_subdir_files(blacs "BLACS/SRC")
205 | -
206 | -#
207 | -# TOOLS
208 | -#
209 | -add_subdirectory(TOOLS)
210 | -append_subdir_files(tools TOOLS)
211 | -append_subdir_files(tools-C TOOLS)
212 | -append_subdir_files(extra_lapack "TOOLS/LAPACK")
213 | -
214 | -#
215 | -# PBLAS
216 | -#
217 | -add_subdirectory(PBLAS)
218 | -append_subdir_files(pblas "PBLAS/SRC")
219 | -append_subdir_files(pblas-F "PBLAS/SRC")
220 | -append_subdir_files(pbblas "PBLAS/SRC/PBBLAS")
221 | -append_subdir_files(ptzblas "PBLAS/SRC/PTZBLAS")
222 | -append_subdir_files(ptools "PBLAS/SRC/PTOOLS")
223 | -
224 | -#
225 | -# REDIST
226 | -#
227 | -add_subdirectory(REDIST)
228 | -append_subdir_files(redist "REDIST/SRC")
229 | -
230 | -#
231 | -# SRC
232 | -#
233 | -add_subdirectory(SRC)
234 | -append_subdir_files(src "SRC")
235 | -append_subdir_files(src-C "SRC")
236 | -
237 | -if (UNIX)
238 | -   add_library(scalapack ${blacs} ${tools} ${tools-C} ${extra_lapack} ${pblas} ${pblas-F} ${ptzblas} ${ptools} ${pbblas} ${redist} ${src} ${src-C})
239 | -   target_link_libraries( scalapack ${LAPACK_LIBRARIES} ${BLAS_LIBRARIES})
240 | -   scalapack_install_library(scalapack)
241 | -else (UNIX) # Need to separate Fortran and C Code
242 | -   OPTION(BUILD_SHARED_LIBS "Build shared libraries" ON )
243 | -   add_library(scalapack ${blacs} ${tools-C} ${pblas} ${ptools} ${redist} ${src-C})
244 | -   target_link_libraries( scalapack ${LAPACK_LIBRARIES} ${BLAS_LIBRARIES})
245 | -   add_library(scalapack-F ${pblas-F} ${pbblas} ${ptzblas} ${tools} ${src} ${extra_lapack} )
246 | -   target_link_libraries( scalapack-F ${LAPACK_LIBRARIES} ${BLAS_LIBRARIES})
247 | -   scalapack_install_library(scalapack)
248 | -   scalapack_install_library(scalapack-F)
249 | -endif (UNIX)
250 | -add_subdirectory(TESTING)
251 | -
252 | -# --------------------------------------------------
253 | -# CPACK Packaging 
254 | -
255 | -SET(CPACK_PACKAGE_NAME "ScaLAPACK")
256 | -SET(CPACK_PACKAGE_VENDOR "University of Tennessee, Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd")
257 | -SET(CPACK_PACKAGE_DESCRIPTION_SUMMARY "ScaLAPACK- Linear Algebra Package")
258 | -set(SCALAPACK_VERSION 2.0.2)
259 | -set(CPACK_PACKAGE_VERSION_MAJOR 2)
260 | -set(CPACK_PACKAGE_VERSION_MINOR 0)
261 | -set(CPACK_PACKAGE_VERSION_PATCH 2)
262 | -set(CPACK_RESOURCE_FILE_LICENSE "${CMAKE_CURRENT_SOURCE_DIR}/LICENSE")
263 | -SET(CPACK_PACKAGE_INSTALL_DIRECTORY "SCALAPACK")
264 | -IF(WIN32 AND NOT UNIX)
265 | -  # There is a bug in NSI that does not handle full unix paths properly. Make
266 | -  # sure there is at least one set of four (4) backlasshes.
267 | -  SET(CPACK_NSIS_HELP_LINK "http:\\\\\\\\http://icl.cs.utk.edu/lapack-forum")
268 | -  SET(CPACK_NSIS_URL_INFO_ABOUT "http:\\\\\\\\www.netlib.org/scalapack")
269 | -  SET(CPACK_NSIS_CONTACT "scalapack@eecs.utk.edu")
270 | -  SET(CPACK_NSIS_MODIFY_PATH ON)
271 | -  SET(CPACK_NSIS_DISPLAY_NAME "SCALAPACK-${SCALAPACK_VERSION}")
272 | -  set(CPACK_PACKAGE_RELOCATABLE "true")
273 | -ELSE(WIN32 AND NOT UNIX)
274 | -  SET(CPACK_GENERATOR "TGZ")
275 | -  SET(CPACK_SOURCE_GENERATOR TGZ)
276 | -  SET(CPACK_SOURCE_PACKAGE_FILE_NAME "scalapack-${SCALAPACK_VERSION}" )
277 | -  SET(CPACK_SOURCE_IGNORE_FILES ~$ .svn ${CPACK_SOURCE_IGNORE_FILES} )
278 | -ENDIF(WIN32 AND NOT UNIX)
279 | -INCLUDE(CPack)
280 | -
281 | -
282 | -# --------------------------------------------------
283 | -
284 | -
285 | -export(TARGETS scalapack FILE scalapack-targets.cmake)
286 | -
287 | -if( NOT LAPACK_FOUND )
288 | - install(FILES
289 | -  ${BLAS_LIBRARIES} ${LAPACK_LIBRARIES}
290 | -  DESTINATION lib
291 | -   )
292 | -endif( NOT LAPACK_FOUND )
293 | -
294 | -configure_file(${SCALAPACK_SOURCE_DIR}/CMAKE/scalapack-config-version.cmake.in
295 | -  ${SCALAPACK_BINARY_DIR}/scalapack-config-version.cmake @ONLY)
296 | -configure_file(${SCALAPACK_SOURCE_DIR}/CMAKE/scalapack-config-build.cmake.in
297 | -  ${SCALAPACK_BINARY_DIR}/scalapack-config.cmake @ONLY)
298 | -
299 | -configure_file(${CMAKE_CURRENT_SOURCE_DIR}/scalapack.pc.in ${CMAKE_CURRENT_BINARY_DIR}/scalapack.pc)
300 | - install(FILES
301 | -  ${CMAKE_CURRENT_BINARY_DIR}/scalapack.pc
302 | -  DESTINATION ${PKG_CONFIG_DIR}
303 | -   )
304 | -
305 | -configure_file(${SCALAPACK_SOURCE_DIR}/CMAKE/scalapack-config-install.cmake.in
306 | -  ${SCALAPACK_BINARY_DIR}/CMakeFiles/scalapack-config.cmake @ONLY)
307 | -install(FILES
308 | -  ${SCALAPACK_BINARY_DIR}/CMakeFiles/scalapack-config.cmake
309 | -  ${SCALAPACK_BINARY_DIR}/scalapack-config-version.cmake
310 | -  DESTINATION lib/cmake/scalapack-${SCALAPACK_VERSION}
311 | -  )
312 | -
313 | -install(EXPORT scalapack-targets
314 | -  DESTINATION lib/cmake/scalapack-${SCALAPACK_VERSION})
315 | +project(SLTMG Fortran)
316 |  
317 | +file(GLOB SLTMG_SRC "TESTING/EIG/*.f")
318 | +add_library( sltmg STATIC ${SLTMG_SRC})
319 | +install( TARGETS sltmg DESTINATION lib )
320 | +ADD_CUSTOM_TARGET(install.sltmg
321 | +    ${CMAKE_COMMAND}
322 | +    -DBUILD_TYPE=${CMAKE_BUILD_TYPE}
323 | +    -P ${CMAKE_CURRENT_BINARY_DIR}/cmake_install.cmake)
324 | +ADD_DEPENDENCIES(install.sltmg sltmg) 
325 | 


--------------------------------------------------------------------------------
/testing/testing_pdgeqdwh.c:
--------------------------------------------------------------------------------
  1 | /**
  2 |  *
  3 |  * Copyright (c) 2017, King Abdullah University of Science and Technology
  4 |  * All rights reserved.
  5 |  *
  6 |  **/
  7 | 
  8 | /**
  9 |  *
 10 |  * @file testing_pdgeqdwh.c
 11 |  *
 12 |  *  QDWH is a high performance software framework for computing 
 13 |  *  the polar decomposition on distributed-memory manycore systems provided by KAUST
 14 |  *
 15 |  * @version 3.0.0
 16 |  * @author Dalal Sukkari
 17 |  * @author Hatem Ltaief
 18 |  * @date 2018-11-08
 19 |  *
 20 |  **/
 21 | 
 22 | #include <stdlib.h>
 23 | #include "polar.h"
 24 | 
 25 | 
 26 | /* Default values of parameters */
 27 | int nprow         = 1;
 28 | int npcol         = 1;
 29 | int lvec          = 1;
 30 | int rvec          = 1;
 31 | int n             = 5120;
 32 | int nb            = 128;
 33 | int mode          = 4;
 34 | double cond       = 9.0072e+15;
 35 | int optcond       = 0;
 36 | int start         = 5120;
 37 | int stop          = 5120;
 38 | int step          = 1;
 39 | int niter         = 1;
 40 | int check         = 0;
 41 | int verbose       = 0;
 42 | 
 43 | static inline double cWtime(void)
 44 | {
 45 |     struct timeval tp;
 46 |     gettimeofday( &tp, NULL );
 47 |     return tp.tv_sec + 1e-6 * tp.tv_usec;
 48 | }
 49 | 
 50 | void print_usage(void)
 51 | {
 52 |     fprintf(stderr,
 53 |             "======= QDWH testing using ScaLAPACK\n"
 54 |             " -p      --nprow         : Number of MPI process rows\n"
 55 |             " -q      --npcol         : Number of MPI process cols\n"
 56 |             " -jl     --lvec          : Compute left singular vectors\n"
 57 |             " -jr     --rvec          : Compute right singular vectors\n"
 58 |             " -n      --N             : Dimension of the matrix\n"
 59 |             " -b      --nb            : Block size\n"
 60 |             " -m      --mode          : [1:6] Mode from pdlatms used to generate the matrix\n"
 61 |             " -k      --cond          : Condition number used to generate the matrix\n"
 62 |             " -o      --optcond       : Estimate Condition number using QR\n"
 63 |             " -i      --niter         : Number of iterations\n"
 64 |             " -r      --n_range       : Range for matrix sizes Start:Stop:Step\n"
 65 |             " -c      --check         : Check the solution\n"
 66 |             " -v      --verbose       : Verbose\n"
 67 |             " -h      --help          : Print this help\n" );
 68 | }
 69 | 
 70 | #define GETOPT_STRING "p:q:x:y:n:b:m:i:o:r:Q,S:s:w:e:c:f:t:v:h"
 71 | 
 72 | static struct option long_options[] =
 73 |     {
 74 |         /* PaRSEC specific options */
 75 |         {"nprow",      required_argument,  0, 'p'},
 76 |         {"npcol",      required_argument,  0, 'q'},
 77 |         {"jl",         no_argument,        0, 'x'},
 78 |         {"lvec",       no_argument,        0, 'x'},
 79 |         {"jr",         no_argument,        0, 'y'},
 80 |         {"rvec",       no_argument,        0, 'y'},
 81 |         {"N",          required_argument,  0, 'n'},
 82 |         {"n",          required_argument,  0, 'n'},
 83 |         {"nb",         required_argument,  0, 'b'},
 84 |         {"b",          required_argument,  0, 'b'},
 85 |         {"mode",       required_argument,  0, 'm'},
 86 |         {"m",          required_argument,  0, 'm'},
 87 |         {"cond",       required_argument,  0, 'k'},
 88 |         {"k",          required_argument,  0, 'k'},
 89 |         {"optcond",    required_argument,  0, 'o'},
 90 |         {"o",          required_argument,  0, 'o'},
 91 |         {"i",          required_argument,  0, 'i'},
 92 |         {"niter",      required_argument,  0, 'i'},
 93 |         {"r",          required_argument,  0, 'r'},
 94 |         {"n_range",    required_argument,  0, 'r'},
 95 |         {"check",      no_argument,        0, 'c'},
 96 |         {"verbose",    no_argument,        0, 'v'},
 97 |         {"help",       no_argument,        0, 'h'},
 98 |         {"h",          no_argument,        0, 'h'},
 99 |         {0, 0, 0, 0}
100 |     };
101 | 
102 | static void parse_arguments(int argc, char** argv)
103 | {
104 |     int opt = 0;
105 |     int c;
106 |     int myrank_mpi;
107 | 
108 |     MPI_Comm_rank(MPI_COMM_WORLD, &myrank_mpi);
109 | 
110 |     do {
111 | #if defined(HAVE_GETOPT_LONG)
112 |         c = getopt_long_only(argc, argv, "",
113 |                         long_options, &opt);
114 | #else
115 |         c = getopt(argc, argv, GETOPT_STRING);
116 |         (void) opt;
117 | #endif  /* defined(HAVE_GETOPT_LONG) */
118 | 
119 |         switch(c) {
120 |         case 'p': nprow     = atoi(optarg); break;
121 |         case 'q': npcol     = atoi(optarg); break;
122 |         case 'n': n         = atoi(optarg); start = n; stop = n; step = 1; break;
123 |         case 'b': nb        = atoi(optarg); break;
124 |         case 'm': mode      = atoi(optarg); break;
125 |         case 'k': cond      = atof(optarg); break;
126 |         case 'o': optcond   = atof(optarg); break;
127 |         case 'i': niter     = atoi(optarg); break;
128 |         case 'r': get_range( optarg, &start, &stop, &step ); break;
129 |         case 'c': check     = 1; break;
130 |         case 'v': verbose   = 1; break;
131 |         case 'h':
132 |             if (myrank_mpi == 0) print_usage(); MPI_Finalize(); exit(0);
133 |             break;
134 |         default:
135 |             break;
136 |         }
137 |     } while(-1 != c);
138 | }
139 | 
140 | int main(int argc, char **argv) {
141 | 
142 | 
143 |     int myrank_mpi, nprocs_mpi;
144 |     int ictxt, myrow, mycol;
145 |     int mloc, nloc, mlocW;
146 |     int mpi_comm_rows, mpi_comm_cols;
147 |     int i, j, k, iter, size, info_facto_mr, info_facto_dc, info_facto_qw, info_facto_el, info_facto_sl, info, iseed;
148 |     int my_info_facto;
149 |     int i0 = 0, i1 = 1;
150 |     int lwork, liwork, ldw;
151 | 
152 |     /* Allocation for the input/output matrices */
153 |     int descA[9], descH[9];
154 |     double *A=NULL, *H=NULL;
155 | 
156 |     /* Allocation to check the results */
157 |     int descAcpy[9], descC[9]; 
158 |     double *Acpy=NULL, *C=NULL;
159 | 
160 |     /* Allocation for pdlatsm */
161 |     double *Wloc1=NULL, *Wloc2=NULL, *D=NULL;
162 | 
163 |     double eps = LAPACKE_dlamch_work('e');
164 |     int iprepad, ipostpad, sizemqrleft, sizemqrright, sizeqrf, sizeqtq,
165 |         sizechk, sizesyevx, isizesyevx,
166 |         sizesubtst, isizesubtst, sizetst,
167 |         isizetst;
168 | /**/
169 | 
170 |     double flops, GFLOPS;
171 | 
172 |     double orth_Uqw, berr_UHqw;
173 |     double frobA;
174 | 
175 |     double alpha, beta;
176 |     char *jobu, *jobvt;
177 | 
178 | 
179 | /**/
180 | 
181 |     if (verbose & myrank_mpi == 0) fprintf(stderr, "Program starts... \n");
182 | 
183 |     MPI_Init(&argc, &argv);
184 |     MPI_Comm_rank(MPI_COMM_WORLD, &myrank_mpi);
185 |     MPI_Comm_size(MPI_COMM_WORLD, &nprocs_mpi);
186 | 
187 |     if (verbose & myrank_mpi == 0) fprintf(stderr, "MPI Init done\n");
188 | 
189 |     parse_arguments(argc, argv);
190 | 
191 |     if (verbose & myrank_mpi == 0) fprintf(stderr, "Checking arguments done\n");
192 | 
193 |     Cblacs_get( -1, 0, &ictxt );
194 |     Cblacs_gridinit( &ictxt, "R", nprow, npcol );
195 |     Cblacs_gridinfo( ictxt, &nprow, &npcol, &myrow, &mycol );
196 | 
197 |     if (verbose & myrank_mpi == 0) fprintf(stderr, "BLACS Init done\n");
198 | 
199 |     if (myrank_mpi == 0) {
200 |         fprintf(stderr, "# \n");
201 |         fprintf(stderr, "# NPROCS %d P %d Q %d\n", nprocs_mpi, nprow, npcol);
202 |         fprintf(stderr, "# niter %d\n", niter);
203 |         fprintf(stderr, "# n_range %d:%d:%d mode: %d cond: %2.4e \n", start, stop, step, mode, cond);
204 |         fprintf(stderr, "# \n");
205 |     }
206 | 
207 |     if (verbose & myrank_mpi == 0) fprintf(stderr, "Range loop starts\n");
208 | 
209 | 
210 |      // Begin loop over range
211 |     for (size = start; size <= stop; size += step) {
212 |         while ( (int)((double)size / (double)nb) < ( max(nprow , npcol) )){
213 |             if (myrank_mpi == 0) fprintf(stderr, " Matrix size is small to be facrorized using this number of processors \n");
214 |                size += step;
215 |         }
216 |         n = size; ldw = 2*n;
217 | 	mloc  = numroc_( &n, &nb, &myrow, &i0, &nprow );
218 | 	nloc  = numroc_( &n, &nb, &mycol, &i0, &npcol );
219 | 	mlocW = numroc_( &ldw, &nb, &myrow, &i0, &nprow );
220 |         if (verbose & myrank_mpi == 0) fprintf(stderr, "Desc Init starts %d\n", mloc);
221 | 
222 | 	descinit_( descA, &n, &n, &nb, &nb, &i0, &i0, &ictxt, &mloc, &info );
223 | 	descinit_( descAcpy, &n, &n, &nb, &nb, &i0, &i0, &ictxt, &mloc, &info );
224 | 	descinit_( descC, &n, &n, &nb, &nb, &i0, &i0, &ictxt, &mloc, &info );
225 | 	descinit_( descH, &n, &n, &nb, &nb, &i0, &i0, &ictxt, &mloc, &info );
226 |         if (verbose & myrank_mpi == 0) fprintf(stderr, "Desc Init ends %d\n", mloc);
227 | 
228 | 	A     = (double *)malloc(mloc*nloc*sizeof(double)) ;
229 | 	H     = (double *)malloc(mloc*nloc*sizeof(double)) ;
230 | 	C     = (double *)malloc(mloc*nloc*sizeof(double)) ;
231 | 	Acpy  = (double *)malloc(mloc*nloc*sizeof(double)) ;
232 | 	D     = (double *)malloc(n*sizeof(double)) ;
233 | 
234 |         
235 |         /* Generate matrix by pdlatms */
236 |         {
237 |            char   *dist = "N"; /* NORMAL( 0, 1 )  ( 'N' for normal ) */
238 |            int    iseed[4] = {1, 0, 0, 1};
239 |            char   *sym = "P"; /* The generated matrix is symmetric, with
240 |                                 eigenvalues (= singular values) specified by D, COND,
241 |                                 MODE, and DMAX; they will not be negative.
242 |                                 "N" not supported. */
243 |            //int    mode = 4; /* sets D(i)=1 - (i-1)/(N-1)*(1 - 1/COND) */
244 |            //double cond = 1.0/eps;
245 |            double dmax = 1.0;
246 |            int    kl   = n;
247 |            int    ku   = n;
248 |            char   *pack = "N"; /* no packing */
249 |            int    order = n;
250 |            int    info;
251 |          
252 |            pdlasizesep_( descA, 
253 |                          &iprepad, &ipostpad, &sizemqrleft, &sizemqrright, &sizeqrf, 
254 |                          &lwork, 
255 |                          &sizeqtq, &sizechk, &sizesyevx, &isizesyevx, &sizesubtst, 
256 |                          &isizesubtst, &sizetst, &isizetst );
257 |            if (verbose & myrank_mpi == 0) fprintf(stderr, "Setting lwork done\n");
258 |            Wloc1 = (double *)calloc(lwork,sizeof(double)) ;
259 |            pdlatms_(&n, &n, dist,
260 |                     iseed, sym, D, &mode, &cond, &dmax,
261 |                     &kl, &ku, pack, 
262 |                     A, &i1, &i1, descA, &order, 
263 |                     Wloc1, &lwork, &info);
264 |            if (verbose & myrank_mpi == 0) fprintf(stderr, "MatGen done\n");
265 |            if (info != 0) {
266 |                fprintf(stderr, "An error occured during matrix generation: %d\n", info );
267 |                return EXIT_FAILURE;
268 |            }
269 |            pdlacpy_( "All", &n, &n, 
270 |                      A, &i1, &i1, descA, 
271 |                      Acpy, &i1, &i1, descAcpy ); 
272 |            frobA  = pdlange_ ( "f", &n, &n, A, &i1, &i1, descA, Wloc1);
273 |            beta = 0.0;
274 | 
275 |            if (verbose & myrank_mpi == 0) fprintf(stderr, "Copy to Acpy done\n");
276 | 
277 |            free( Wloc1 );
278 |         }
279 | 
280 |         if (myrank_mpi == 0) fprintf(stderr, "\n\n");
281 |         if (myrank_mpi == 0) fprintf(stderr, "/////////////////////////////////////////////////////////////////////////\n");
282 |         if (myrank_mpi == 0) fprintf(stderr, "/////////////////////////////////////////////////////////////////////////\n");
283 |         if (myrank_mpi == 0) fprintf(stderr, "/////////////////////////////////////////////////////////////////////////\n");
284 | 
285 |         // QDWH
286 | 	//if ( qwmr || qwdc || qwel || polarqdwh) {
287 |         jobu   = lvec ? "V" : "N";
288 |         int lWork1, lWork2, lWi;
289 |         Wloc1  = (double *)calloc(1,sizeof(double)) ;
290 |         Wloc2  = (double *)calloc(1,sizeof(double)) ;
291 |         lWork1 = -1; 
292 |         lWork2 = -1; 
293 | 
294 |         pdgeqdwh( "H", n, n, 
295 |                   A, i1, i1, descA, 
296 |                   H, i1, i1, descH, 
297 |                   //NULL, lWork, //debug
298 |                   Wloc1, lWork1,
299 |                   Wloc2, lWork2,
300 |                   &my_info_facto);
301 | 
302 |         lWork1 = Wloc1[0];
303 |         lWork2 = Wloc2[0];
304 | 
305 |         
306 |         //Wloc  = (double *)malloc(lWork*n*sizeof(double));
307 | 	Wloc1  = (double *)malloc((lWork1*nloc)*sizeof(double));
308 | 	Wloc2  = (double *)malloc((lWork2*nloc)*sizeof(double));
309 | 
310 |         //int MB3 = 3*n;
311 |         //int mlocW3 = numroc_( &MB3, &nb, &myrow, &i0, &nprow );
312 |         //mlocW3 = ((mlocW3+(nb-1))/nb)*nb;
313 | 	//Wloc  = (double *)malloc((mlocW3*nloc)*sizeof(double));
314 |                
315 | 
316 |         for (iter = 0; iter < niter; iter++) {
317 |             pdlacpy_( "All", &n, &n, 
318 |                       Acpy, &i1, &i1, descAcpy, 
319 |                       A,    &i1, &i1, descA ); 
320 |             flops = 0.0;
321 |             if (verbose & myrank_mpi == 0) fprintf(stderr, "\nQDWH + ScaLAPACK EIG done\n");
322 |             if (verbose & myrank_mpi == 0) fprintf(stderr, "QDWH starts...\n");
323 | 
324 |             /*
325 |              * Find polar decomposition using QDWH. 
326 |              * C contains the positive-definite factor. 
327 |              * A contains the orthogonal polar factor.
328 |              */
329 | 
330 |             pdgeqdwh( "H", n, n,
331 |                       A, i1, i1, descA, 
332 |                       H, i1, i1, descH, 
333 |                       //NULL, lWork, //debug
334 |                       Wloc1, lWork1,
335 |                       Wloc2, lWork2,
336 |                       &my_info_facto);
337 | 
338 | 	    MPI_Allreduce( &my_info_facto, &info_facto_qw, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);
339 | 
340 |             if (verbose & myrank_mpi == 0) fprintf(stderr, "QDWH ends...\n");
341 |             if (verbose & check & myrank_mpi == 0) fprintf(stderr, "Testing QDWH starts...\n");
342 |             /*
343 |              * Checking the polar factorization
344 |              */
345 |             if(check ){
346 |                /*
347 |                 * checking orthogonality of Up
348 |                 */ 
349 |                 alpha = 0.0; beta = 1.0;
350 |                 pdlaset_( "G", &n, &n, &alpha, &beta, C, &i1, &i1, descC);
351 |                 alpha = 1.0; beta = -1.0;
352 |                 pdgemm_( "T", "N", &n, &n, &n, 
353 |                          &alpha, 
354 |                          A, &i1, &i1, descA, 
355 |                          A, &i1, &i1, descA, 
356 |                          &beta, 
357 |                          C, &i1, &i1, descC);
358 |                 orth_Uqw  = pdlange_ ( "f", &n, &n, C, &i1, &i1, descC, Wloc1)/sqrt(n);
359 |                /*
360 |                 * checking the factorization |A-Up*H|
361 |                 */ 
362 |                 pdlacpy_( "A", &n, &n, Acpy, &i1, &i1, descAcpy, C, &i1, &i1, descC );
363 |                 alpha = 1.0; beta = -1.0;
364 |                 pdgemm_( "N", "N", &n, &n, &n, 
365 |                          &alpha, 
366 |                          A, &i1, &i1, descA, 
367 |                          H, &i1, &i1, descH, 
368 |                          &beta, 
369 |                          C, &i1, &i1, descC);
370 |                 berr_UHqw  = pdlange_ ( "f", &n, &n, C, &i1, &i1, descC, Wloc1)/frobA;
371 |                 if (  myrank_mpi == 0) {
372 |                      fprintf(stderr, "# QDWH \n"); 
373 |                      fprintf(stderr, "#\n");
374 | 	             fprintf(stderr, "# \tN      \tNB    \tNP    \tP   \tQ    \tinfo  \tBerr_UpH  \tOrth_Up     \n");
375 | 	             fprintf(stderr, "   %6d \t%4d \t%4d \t%3d \t%3d", n, nb, nprocs_mpi, nprow, npcol);
376 | 	             fprintf(stderr, "\t%d \t%2.4e \t%2.4e \n", info_facto_qw, berr_UHqw, orth_Uqw);
377 |                      fprintf(stderr, "/////////////////////////////////////////////////////////////////////////\n");
378 |                 }
379 |             }
380 |         }
381 | 
382 | 
383 | 
384 |         if (myrank_mpi == 0) fprintf(stderr, "/////////////////////////////////////////////////////////////////////////\n");
385 | 
386 | 	free( A );
387 | 	free( Acpy );
388 | 	free( C );
389 | 	free( H );
390 | 	free( D );
391 |         free(Wloc1);
392 |         free(Wloc2);
393 |         if (verbose & myrank_mpi == 0) fprintf(stderr, "Free matrices done\n");
394 |     } // End loop over range
395 | 
396 | 
397 |         if (verbose & myrank_mpi == 0) fprintf(stderr, "Range loop ends\n");
398 | 
399 | 	blacs_gridexit_( &i0 );
400 | 	MPI_Finalize();
401 |         if (verbose & myrank_mpi == 0) fprintf(stderr, "Program ends...\n");
402 | 	return 0;
403 | }
404 | 


--------------------------------------------------------------------------------
/testing/testing_pdgezolopd.c:
--------------------------------------------------------------------------------
  1 | /**
  2 |  *
  3 |  * Copyright (c) 2017, King Abdullah University of Science and Technology
  4 |  * All rights reserved.
  5 |  *
  6 |  **/
  7 | 
  8 | /**
  9 |  *
 10 |  * @file testing_pdgezolopd.c
 11 |  *
 12 |  *  ZOLOPD is a high performance software framework for computing 
 13 |  *  the polar decomposition on distributed-memory manycore systems provided by KAUST
 14 |  *
 15 |  * @version 3.0.0
 16 |  * @author Dalal Sukkari
 17 |  * @author Hatem Ltaief
 18 |  * @date 2018-11-08
 19 |  *
 20 |  **/
 21 | 
 22 | #include "polar.h"
 23 | 
 24 | 
 25 | /* Default values of parameters */
 26 | int nprow         = 1;
 27 | int npcol         = 1;
 28 | int lvec          = 1;
 29 | int rvec          = 1;
 30 | int n             = 5120;
 31 | int nb            = 128;
 32 | int mode          = 4;
 33 | double cond       = 9.0072e+15;
 34 | int optcond       = 0;
 35 | int start         = 5120;
 36 | int stop          = 5120;
 37 | int step          = 1;
 38 | int niter         = 1;
 39 | int check         = 0;
 40 | int verbose       = 0;
 41 | 
 42 | static inline double cWtime(void)
 43 | {
 44 |     struct timeval tp;
 45 |     gettimeofday( &tp, NULL );
 46 |     return tp.tv_sec + 1e-6 * tp.tv_usec;
 47 | }
 48 | 
 49 | void print_usage(void)
 50 | {
 51 |     fprintf(stderr,
 52 |             "======= ZOLOPD testing using ScaLAPACK\n"
 53 |             " -p      --nprow         : Number of MPI process rows\n"
 54 |             " -q      --npcol         : Number of MPI process cols\n"
 55 |             " -jl     --lvec          : Compute left singular vectors\n"
 56 |             " -jr     --rvec          : Compute right singular vectors\n"
 57 |             " -n      --N             : Dimension of the matrix\n"
 58 |             " -b      --nb            : Block size\n"
 59 |             " -m      --mode          : [1:6] Mode from pdlatms used to generate the matrix\n"
 60 |             " -k      --cond          : Condition number used to generate the matrix\n"
 61 |             " -o      --optcond       : Estimate Condition number using QR\n"
 62 |             " -i      --niter         : Number of iterations\n"
 63 |             " -r      --n_range       : Range for matrix sizes Start:Stop:Step\n"
 64 |             " -c      --check         : Check the solution\n"
 65 |             " -v      --verbose       : Verbose\n"
 66 |             " -h      --help          : Print this help\n" );
 67 | }
 68 | 
 69 | #define GETOPT_STRING "p:q:x:y:n:b:m:i:o:r:Q,S:s:w:e:c:f:t:v:h"
 70 | 
 71 | static struct option long_options[] =
 72 |     {
 73 |         /* PaRSEC specific options */
 74 |         {"nprow",      required_argument,  0, 'p'},
 75 |         {"npcol",      required_argument,  0, 'q'},
 76 |         {"jl",         no_argument,        0, 'x'},
 77 |         {"lvec",       no_argument,        0, 'x'},
 78 |         {"jr",         no_argument,        0, 'y'},
 79 |         {"rvec",       no_argument,        0, 'y'},
 80 |         {"N",          required_argument,  0, 'n'},
 81 |         {"n",          required_argument,  0, 'n'},
 82 |         {"nb",         required_argument,  0, 'b'},
 83 |         {"b",          required_argument,  0, 'b'},
 84 |         {"mode",       required_argument,  0, 'm'},
 85 |         {"m",          required_argument,  0, 'm'},
 86 |         {"cond",       required_argument,  0, 'k'},
 87 |         {"k",          required_argument,  0, 'k'},
 88 |         {"optcond",    required_argument,  0, 'o'},
 89 |         {"o",          required_argument,  0, 'o'},
 90 |         {"i",          required_argument,  0, 'i'},
 91 |         {"niter",      required_argument,  0, 'i'},
 92 |         {"r",          required_argument,  0, 'r'},
 93 |         {"n_range",    required_argument,  0, 'r'},
 94 |         {"check",      no_argument,        0, 'c'},
 95 |         {"verbose",    no_argument,        0, 'v'},
 96 |         {"help",       no_argument,        0, 'h'},
 97 |         {"h",          no_argument,        0, 'h'},
 98 |         {0, 0, 0, 0}
 99 |     };
100 | 
101 | static void parse_arguments(int argc, char** argv)
102 | {
103 |     int opt = 0;
104 |     int c;
105 |     int myrank_mpi;
106 | 
107 |     MPI_Comm_rank(MPI_COMM_WORLD, &myrank_mpi);
108 | 
109 |     do {
110 | #if defined(HAVE_GETOPT_LONG)
111 |         c = getopt_long_only(argc, argv, "",
112 |                         long_options, &opt);
113 | #else
114 |         c = getopt(argc, argv, GETOPT_STRING);
115 |         (void) opt;
116 | #endif  /* defined(HAVE_GETOPT_LONG) */
117 | 
118 |         switch(c) {
119 |         case 'p': nprow     = atoi(optarg); break;
120 |         case 'q': npcol     = atoi(optarg); break;
121 |         case 'n': n         = atoi(optarg); start = n; stop = n; step = 1; break;
122 |         case 'b': nb        = atoi(optarg); break;
123 |         case 'm': mode      = atoi(optarg); break;
124 |         case 'k': cond      = atof(optarg); break;
125 |         case 'o': optcond   = atof(optarg); break;
126 |         case 'i': niter     = atoi(optarg); break;
127 |         case 'r': get_range( optarg, &start, &stop, &step ); break;
128 |         case 'c': check     = 1; break;
129 |         case 'v': verbose   = 1; break;
130 |         case 'h':
131 |             if (myrank_mpi == 0) print_usage(); MPI_Finalize(); exit(0);
132 |             break;
133 |         default:
134 |             break;
135 |         }
136 |     } while(-1 != c);
137 | }
138 | 
139 | int main(int argc, char **argv) {
140 | 
141 | 
142 |     int myrank_mpi, nprocs_mpi;
143 |     int ictxt, myrow, mycol;
144 |     int mloc, nloc, mlocW;
145 |     int mpi_comm_rows, mpi_comm_cols;
146 |     int i, j, k, iter, size, info_facto_mr, info_facto_dc, info_facto_zo, info_facto_el, info_facto_sl, info, iseed;
147 |     int my_info_facto;
148 |     int i0 = 0, i1 = 1;
149 |     int lwork, liwork, ldw;
150 | 
151 |     /* Allocation for the input/output matrices */
152 |     int descA[9], descH[9];
153 |     double *A=NULL, *H=NULL;
154 | 
155 |     /* Allocation to check the results */
156 |     int descAcpy[9], descC[9]; 
157 |     double *Acpy=NULL, *C=NULL;
158 | 
159 |     /* Allocation for pdlatsm */
160 |     double *Wloc1=NULL, *Wloc2=NULL, *D=NULL;
161 | 
162 |     double eps = LAPACKE_dlamch_work('e');
163 |     int iprepad, ipostpad, sizemqrleft, sizemqrright, sizeqrf, sizeqtq,
164 |         sizechk, sizesyevx, isizesyevx,
165 |         sizesubtst, isizesubtst, sizetst,
166 |         isizetst;
167 | /**/
168 | 
169 |     double flops, GFLOPS;
170 | 
171 |     double orth_Uzo, berr_UHzo;
172 |     double frobA;
173 | 
174 |     double alpha, beta;
175 |     char *jobu, *jobvt;
176 | 
177 | 
178 | /**/
179 | 
180 |     if (verbose & myrank_mpi == 0) fprintf(stderr, "Program starts... \n");
181 | 
182 |     MPI_Init(&argc, &argv);
183 |     MPI_Comm_rank(MPI_COMM_WORLD, &myrank_mpi);
184 |     MPI_Comm_size(MPI_COMM_WORLD, &nprocs_mpi);
185 | 
186 |     if (verbose & myrank_mpi == 0) fprintf(stderr, "MPI Init done\n");
187 | 
188 |     parse_arguments(argc, argv);
189 | 
190 |     if (verbose & myrank_mpi == 0) fprintf(stderr, "Checking arguments done\n");
191 | 
192 |     Cblacs_get( -1, 0, &ictxt );
193 |     Cblacs_gridinit( &ictxt, "R", nprow, npcol );
194 |     Cblacs_gridinfo( ictxt, &nprow, &npcol, &myrow, &mycol );
195 | 
196 |     if (verbose & myrank_mpi == 0) fprintf(stderr, "BLACS Init done\n");
197 | 
198 |     if (myrank_mpi == 0) {
199 |         fprintf(stderr, "# \n");
200 |         fprintf(stderr, "# NPROCS %d P %d Q %d\n", nprocs_mpi, nprow, npcol);
201 |         fprintf(stderr, "# niter %d\n", niter);
202 |         fprintf(stderr, "# n_range %d:%d:%d mode: %d cond: %2.4e \n", start, stop, step, mode, cond);
203 |         fprintf(stderr, "# \n");
204 |     }
205 | 
206 |     if (verbose & myrank_mpi == 0) fprintf(stderr, "Range loop starts\n");
207 | 
208 | 
209 |      // Begin loop over range
210 |     for (size = start; size <= stop; size += step) {
211 |         while ( (int)((double)size / (double)nb) < ( max(nprow , npcol) )){
212 |             if (myrank_mpi == 0) fprintf(stderr, " Matrix size is small to be facrorized using this number of processors \n");
213 |                size += step;
214 |         }
215 |         n = size; ldw = 2*n;
216 | 	mloc  = numroc_( &n, &nb, &myrow, &i0, &nprow );
217 | 	nloc  = numroc_( &n, &nb, &mycol, &i0, &npcol );
218 | 	mlocW = numroc_( &ldw, &nb, &myrow, &i0, &nprow );
219 |         if (verbose & myrank_mpi == 0) fprintf(stderr, "Desc Init starts %d\n", mloc);
220 | 
221 | 	descinit_( descA, &n, &n, &nb, &nb, &i0, &i0, &ictxt, &mloc, &info );
222 | 	descinit_( descAcpy, &n, &n, &nb, &nb, &i0, &i0, &ictxt, &mloc, &info );
223 | 	descinit_( descC, &n, &n, &nb, &nb, &i0, &i0, &ictxt, &mloc, &info );
224 | 	descinit_( descH, &n, &n, &nb, &nb, &i0, &i0, &ictxt, &mloc, &info );
225 |         if (verbose & myrank_mpi == 0) fprintf(stderr, "Desc Init ends %d\n", mloc);
226 | 
227 | 	A     = (double *)malloc(mloc*nloc*sizeof(double)) ;
228 | 	H     = (double *)malloc(mloc*nloc*sizeof(double)) ;
229 | 	C     = (double *)malloc(mloc*nloc*sizeof(double)) ;
230 | 	Acpy  = (double *)malloc(mloc*nloc*sizeof(double)) ;
231 | 	D     = (double *)malloc(n*sizeof(double)) ;
232 | 
233 |         
234 |         /* Generate matrix by pdlatms */
235 |         {
236 |            char   *dist = "N"; /* NORMAL( 0, 1 )  ( 'N' for normal ) */
237 |            int    iseed[4] = {1, 0, 0, 1};
238 |            char   *sym = "P"; /* The generated matrix is symmetric, with
239 |                                 eigenvalues (= singular values) specified by D, COND,
240 |                                 MODE, and DMAX; they will not be negative.
241 |                                 "N" not supported. */
242 |            //int    mode = 4; /* sets D(i)=1 - (i-1)/(N-1)*(1 - 1/COND) */
243 |            //double cond = 1.0/eps;
244 |            double dmax = 1.0;
245 |            int    kl   = n;
246 |            int    ku   = n;
247 |            char   *pack = "N"; /* no packing */
248 |            int    order = n;
249 |            int    info;
250 |          
251 |            pdlasizesep_( descA, 
252 |                          &iprepad, &ipostpad, &sizemqrleft, &sizemqrright, &sizeqrf, 
253 |                          &lwork, 
254 |                          &sizeqtq, &sizechk, &sizesyevx, &isizesyevx, &sizesubtst, 
255 |                          &isizesubtst, &sizetst, &isizetst );
256 |            if (verbose & myrank_mpi == 0) fprintf(stderr, "Setting lwork done\n");
257 |            Wloc1 = (double *)calloc(lwork,sizeof(double)) ;
258 |            pdlatms_(&n, &n, dist,
259 |                     iseed, sym, D, &mode, &cond, &dmax,
260 |                     &kl, &ku, pack, 
261 |                     A, &i1, &i1, descA, &order, 
262 |                     Wloc1, &lwork, &info);
263 |            if (verbose & myrank_mpi == 0) fprintf(stderr, "MatGen done\n");
264 |            if (info != 0) {
265 |                fprintf(stderr, "An error occured during matrix generation: %d\n", info );
266 |                return EXIT_FAILURE;
267 |            }
268 |            pdlacpy_( "All", &n, &n, 
269 |                      A, &i1, &i1, descA, 
270 |                      Acpy, &i1, &i1, descAcpy ); 
271 |            frobA  = pdlange_ ( "f", &n, &n, A, &i1, &i1, descA, Wloc1);
272 |            beta = 0.0;
273 | 
274 |            if (verbose & myrank_mpi == 0) fprintf(stderr, "Copy to Acpy done\n");
275 | 
276 |            free( Wloc1 );
277 |         }
278 | 
279 |         if (myrank_mpi == 0) fprintf(stderr, "\n\n");
280 |         if (myrank_mpi == 0) fprintf(stderr, "/////////////////////////////////////////////////////////////////////////\n");
281 |         if (myrank_mpi == 0) fprintf(stderr, "/////////////////////////////////////////////////////////////////////////\n");
282 |         if (myrank_mpi == 0) fprintf(stderr, "/////////////////////////////////////////////////////////////////////////\n");
283 | 
284 |         // ZOLOPD
285 | 	//if ( zomr || zodc || zoel || polarzolopd) {
286 |         jobu   = lvec ? "V" : "N";
287 |         int lWork1, lWork2, lWi;
288 |         Wloc1  = (double *)calloc(1,sizeof(double)) ;
289 |         Wloc2  = (double *)calloc(1,sizeof(double)) ;
290 |         lWork1 = -1; 
291 |         lWork2 = -1; 
292 | 
293 |         pdgezolopd( "H", n, n, 
294 |                   A, i1, i1, descA, 
295 |                   H, i1, i1, descH, 
296 |                   //NULL, lWork, //debug
297 |                   Wloc1, lWork1,
298 |                   Wloc2, lWork2,
299 |                   &my_info_facto);
300 | 
301 |         lWork1 = Wloc1[0];
302 |         lWork2 = Wloc2[0];
303 | 
304 |         
305 |         //Wloc  = (double *)malloc(lWork*n*sizeof(double));
306 | 	Wloc1  = (double *)malloc((lWork1*nloc)*sizeof(double));
307 | 	Wloc2  = (double *)malloc((lWork2*nloc)*sizeof(double));
308 | 
309 |         //int MB3 = 3*n;
310 |         //int mlocW3 = numroc_( &MB3, &nb, &myrow, &i0, &nprow );
311 |         //mlocW3 = ((mlocW3+(nb-1))/nb)*nb;
312 | 	//Wloc  = (double *)malloc((mlocW3*nloc)*sizeof(double));
313 |                
314 | 
315 |         for (iter = 0; iter < niter; iter++) {
316 |             pdlacpy_( "All", &n, &n, 
317 |                       Acpy, &i1, &i1, descAcpy, 
318 |                       A,    &i1, &i1, descA ); 
319 |             flops = 0.0;
320 |             if (verbose & myrank_mpi == 0) fprintf(stderr, "\nZOLOPD + ScaLAPACK EIG done\n");
321 |             if (verbose & myrank_mpi == 0) fprintf(stderr, "ZOLOPD starts...\n");
322 | 
323 |             /*
324 |              * Find polar decomposition using ZOLOPD. 
325 |              * C contains the positive-definite factor. 
326 |              * A contains the orthogonal polar factor.
327 |              */
328 | 
329 |             pdgezolopd( "H", n, n,
330 |                       A, i1, i1, descA, 
331 |                       H, i1, i1, descH, 
332 |                       //NULL, lWork, //debug
333 |                       Wloc1, lWork1,
334 |                       Wloc2, lWork2,
335 |                       &my_info_facto);
336 | 
337 | 	    MPI_Allreduce( &my_info_facto, &info_facto_zo, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);
338 | 
339 |             if (verbose & myrank_mpi == 0) fprintf(stderr, "ZOLOPD ends...\n");
340 |             if (verbose & check & myrank_mpi == 0) fprintf(stderr, "Testing ZOLOPD starts...\n");
341 |             /*
342 |              * Checking the polar factorization
343 |              */
344 |             if(check ){
345 |                /*
346 |                 * checking orthogonality of Up
347 |                 */ 
348 |                 alpha = 0.0; beta = 1.0;
349 |                 pdlaset_( "G", &n, &n, &alpha, &beta, C, &i1, &i1, descC);
350 |                 alpha = 1.0; beta = -1.0;
351 |                 pdgemm_( "T", "N", &n, &n, &n, 
352 |                          &alpha, 
353 |                          A, &i1, &i1, descA, 
354 |                          A, &i1, &i1, descA, 
355 |                          &beta, 
356 |                          C, &i1, &i1, descC);
357 |                 orth_Uzo  = pdlange_ ( "f", &n, &n, C, &i1, &i1, descC, Wloc1)/sqrt(n);
358 |                /*
359 |                 * checking the factorization |A-Up*H|
360 |                 */ 
361 |                 pdlacpy_( "A", &n, &n, Acpy, &i1, &i1, descAcpy, C, &i1, &i1, descC );
362 |                 alpha = 1.0; beta = -1.0;
363 |                 pdgemm_( "N", "N", &n, &n, &n, 
364 |                          &alpha, 
365 |                          A, &i1, &i1, descA, 
366 |                          H, &i1, &i1, descH, 
367 |                          &beta, 
368 |                          C, &i1, &i1, descC);
369 |                 berr_UHzo  = pdlange_ ( "f", &n, &n, C, &i1, &i1, descC, Wloc1)/frobA;
370 |                 if (  myrank_mpi == 0) {
371 |                      fprintf(stderr, "# ZOLOPD \n"); 
372 |                      fprintf(stderr, "#\n");
373 | 	             fprintf(stderr, "# \tN      \tNB    \tNP    \tP   \tQ    \tinfo  \tBerr_UpH  \tOrth_Up     \n");
374 | 	             fprintf(stderr, "   %6d \t%4d \t%4d \t%3d \t%3d", n, nb, nprocs_mpi, nprow, npcol);
375 | 	             fprintf(stderr, "\t%d \t%2.4e \t%2.4e \n", info_facto_zo, berr_UHzo, orth_Uzo);
376 |                      fprintf(stderr, "/////////////////////////////////////////////////////////////////////////\n");
377 |                 }
378 |             }
379 |         }
380 | 
381 | 
382 | 
383 |         if (myrank_mpi == 0) fprintf(stderr, "/////////////////////////////////////////////////////////////////////////\n");
384 | 
385 | 	free( A );
386 | 	free( Acpy );
387 | 	free( C );
388 | 	free( H );
389 | 	free( D );
390 |         free(Wloc1);
391 |         free(Wloc2);
392 |         if (verbose & myrank_mpi == 0) fprintf(stderr, "Free matrices done\n");
393 |     } // End loop over range
394 | 
395 | 
396 |         if (verbose & myrank_mpi == 0) fprintf(stderr, "Range loop ends\n");
397 | 
398 | 	blacs_gridexit_( &i0 );
399 | 	MPI_Finalize();
400 |         if (verbose & myrank_mpi == 0) fprintf(stderr, "Program ends...\n");
401 | 	return 0;
402 | }
403 | 


--------------------------------------------------------------------------------
/timing/timing_pdgeqdwh.c:
--------------------------------------------------------------------------------
  1 | /**
  2 |  *
  3 |  * Copyright (c) 2017, King Abdullah University of Science and Technology
  4 |  * All rights reserved.
  5 |  *
  6 |  **/
  7 | 
  8 | /**
  9 |  *
 10 |  * @file timing_pdgeqdwh.c
 11 |  *
 12 |  *  QDWH is a high performance software framework for computing 
 13 |  *  the polar decomposition on distributed-memory manycore systems provided by KAUST
 14 |  *
 15 |  * @version 3.0.0
 16 |  * @author Dalal Sukkari
 17 |  * @author Hatem Ltaief
 18 |  * @date 2018-11-08
 19 |  *
 20 |  **/
 21 | 
 22 | #include <stdlib.h>
 23 | #include "polar.h"
 24 | 
 25 | 
 26 | /* Default values of parameters */
 27 | int nprow         = 1;
 28 | int npcol         = 1;
 29 | int lvec          = 1;
 30 | int rvec          = 1;
 31 | int n             = 5120;
 32 | int nb            = 128;
 33 | int mode          = 4;
 34 | double cond       = 9.0072e+15;
 35 | int optcond       = 0;
 36 | int start         = 5120;
 37 | int stop          = 5120;
 38 | int step          = 1;
 39 | int niter         = 1;
 40 | int check         = 0;
 41 | int verbose       = 0;
 42 | 
 43 | static inline double cWtime(void)
 44 | {
 45 |     struct timeval tp;
 46 |     gettimeofday( &tp, NULL );
 47 |     return tp.tv_sec + 1e-6 * tp.tv_usec;
 48 | }
 49 | 
 50 | void print_usage(void)
 51 | {
 52 |     fprintf(stderr,
 53 |             "======= QDWH timing using ScaLAPACK\n"
 54 |             " -p      --nprow         : Number of MPI process rows\n"
 55 |             " -q      --npcol         : Number of MPI process cols\n"
 56 |             " -jl     --lvec          : Compute left singular vectors\n"
 57 |             " -jr     --rvec          : Compute right singular vectors\n"
 58 |             " -n      --N             : Dimension of the matrix\n"
 59 |             " -b      --nb            : Block size\n"
 60 |             " -m      --mode          : [1:6] Mode from pdlatms used to generate the matrix\n"
 61 |             " -k      --cond          : Condition number used to generate the matrix\n"
 62 |             " -o      --optcond       : Estimate Condition number using QR\n"
 63 |             " -i      --niter         : Number of iterations\n"
 64 |             " -r      --n_range       : Range for matrix sizes Start:Stop:Step\n"
 65 |             " -c      --check         : Check the solution\n"
 66 |             " -v      --verbose       : Verbose\n"
 67 |             " -h      --help          : Print this help\n" );
 68 | }
 69 | 
 70 | #define GETOPT_STRING "p:q:x:y:n:b:m:i:o:r:Q,S:s:w:e:c:f:t:v:h"
 71 | 
 72 | static struct option long_options[] =
 73 |     {
 74 |         /* PaRSEC specific options */
 75 |         {"nprow",      required_argument,  0, 'p'},
 76 |         {"npcol",      required_argument,  0, 'q'},
 77 |         {"jl",         no_argument,        0, 'x'},
 78 |         {"lvec",       no_argument,        0, 'x'},
 79 |         {"jr",         no_argument,        0, 'y'},
 80 |         {"rvec",       no_argument,        0, 'y'},
 81 |         {"N",          required_argument,  0, 'n'},
 82 |         {"n",          required_argument,  0, 'n'},
 83 |         {"nb",         required_argument,  0, 'b'},
 84 |         {"b",          required_argument,  0, 'b'},
 85 |         {"mode",       required_argument,  0, 'm'},
 86 |         {"m",          required_argument,  0, 'm'},
 87 |         {"cond",       required_argument,  0, 'k'},
 88 |         {"k",          required_argument,  0, 'k'},
 89 |         {"optcond",    required_argument,  0, 'o'},
 90 |         {"o",          required_argument,  0, 'o'},
 91 |         {"i",          required_argument,  0, 'i'},
 92 |         {"niter",      required_argument,  0, 'i'},
 93 |         {"r",          required_argument,  0, 'r'},
 94 |         {"n_range",    required_argument,  0, 'r'},
 95 |         {"check",      no_argument,        0, 'c'},
 96 |         {"verbose",    no_argument,        0, 'v'},
 97 |         {"help",       no_argument,        0, 'h'},
 98 |         {"h",          no_argument,        0, 'h'},
 99 |         {0, 0, 0, 0}
100 |     };
101 | 
102 | static void parse_arguments(int argc, char** argv)
103 | {
104 |     int opt = 0;
105 |     int c;
106 |     int myrank_mpi;
107 | 
108 |     MPI_Comm_rank(MPI_COMM_WORLD, &myrank_mpi);
109 | 
110 |     do {
111 | #if defined(HAVE_GETOPT_LONG)
112 |         c = getopt_long_only(argc, argv, "",
113 |                         long_options, &opt);
114 | #else
115 |         c = getopt(argc, argv, GETOPT_STRING);
116 |         (void) opt;
117 | #endif  /* defined(HAVE_GETOPT_LONG) */
118 | 
119 |         switch(c) {
120 |         case 'p': nprow     = atoi(optarg); break;
121 |         case 'q': npcol     = atoi(optarg); break;
122 |         case 'n': n         = atoi(optarg); start = n; stop = n; step = 1; break;
123 |         case 'b': nb        = atoi(optarg); break;
124 |         case 'm': mode      = atoi(optarg); break;
125 |         case 'k': cond      = atof(optarg); break;
126 |         case 'o': optcond   = atof(optarg); break;
127 |         case 'i': niter     = atoi(optarg); break;
128 |         case 'r': get_range( optarg, &start, &stop, &step ); break;
129 |         case 'c': check     = 1; break;
130 |         case 'v': verbose   = 1; break;
131 |         case 'h':
132 |             if (myrank_mpi == 0) print_usage(); MPI_Finalize(); exit(0);
133 |             break;
134 |         default:
135 |             break;
136 |         }
137 |     } while(-1 != c);
138 | }
139 | 
140 | int main(int argc, char **argv) {
141 | 
142 | 
143 |     int myrank_mpi, nprocs_mpi;
144 |     int ictxt, myrow, mycol;
145 |     int mloc, nloc, mlocW;
146 |     int mpi_comm_rows, mpi_comm_cols;
147 |     int i, j, k, iter, size, info_facto_mr, info_facto_dc, info_facto_qw, info_facto_el, info_facto_sl, info, iseed;
148 |     int my_info_facto;
149 |     int i0 = 0, i1 = 1;
150 |     int lwork, liwork, ldw;
151 | 
152 |     /* Allocation for the input/output matrices */
153 |     int descA[9], descH[9];
154 |     double *A=NULL, *H=NULL;
155 | 
156 |     /* Allocation to check the results */
157 |     int descAcpy[9], descC[9]; 
158 |     double *Acpy=NULL, *C=NULL;
159 | 
160 |     /* Allocation for pdlatsm */
161 |     double *Wloc1=NULL, *Wloc2=NULL, *D=NULL;
162 | 
163 |     double eps = LAPACKE_dlamch_work('e');
164 |     int iprepad, ipostpad, sizemqrleft, sizemqrright, sizeqrf, sizeqtq,
165 |         sizechk, sizesyevx, isizesyevx,
166 |         sizesubtst, isizesubtst, sizetst,
167 |         isizetst;
168 | /**/
169 | 
170 |     double flops, GFLOPS;
171 | 
172 |     double orth_Uqw, berr_UHqw;
173 |     double frobA;
174 | 
175 |     double alpha, beta;
176 |     char *jobu, *jobvt;
177 | 
178 |     double my_elapsed_polarqdwh = 0.0, elapsed_polarqdwh = 0.0, sumtime_polarqdwh = 0.0;
179 |     double max_time_polarqdwh  = 0.0, min_time_polarqdwh  = 1e20;
180 | 
181 | /**/
182 | 
183 |     if (verbose & myrank_mpi == 0) fprintf(stderr, "Program starts... \n");
184 | 
185 |     MPI_Init(&argc, &argv);
186 |     MPI_Comm_rank(MPI_COMM_WORLD, &myrank_mpi);
187 |     MPI_Comm_size(MPI_COMM_WORLD, &nprocs_mpi);
188 | 
189 |     if (verbose & myrank_mpi == 0) fprintf(stderr, "MPI Init done\n");
190 | 
191 |     parse_arguments(argc, argv);
192 | 
193 |     if (verbose & myrank_mpi == 0) fprintf(stderr, "Checking arguments done\n");
194 | 
195 |     Cblacs_get( -1, 0, &ictxt );
196 |     Cblacs_gridinit( &ictxt, "R", nprow, npcol );
197 |     Cblacs_gridinfo( ictxt, &nprow, &npcol, &myrow, &mycol );
198 | 
199 |     if (verbose & myrank_mpi == 0) fprintf(stderr, "BLACS Init done\n");
200 | 
201 |     if (myrank_mpi == 0) {
202 |         fprintf(stderr, "# \n");
203 |         fprintf(stderr, "# NPROCS %d P %d Q %d\n", nprocs_mpi, nprow, npcol);
204 |         fprintf(stderr, "# niter %d\n", niter);
205 |         fprintf(stderr, "# n_range %d:%d:%d mode: %d cond: %2.4e \n", start, stop, step, mode, cond);
206 |         fprintf(stderr, "# \n");
207 |     }
208 | 
209 |     if (verbose & myrank_mpi == 0) fprintf(stderr, "Range loop starts\n");
210 | 
211 | 
212 |      // Begin loop over range
213 |     for (size = start; size <= stop; size += step) {
214 |         while ( (int)((double)size / (double)nb) < ( max(nprow , npcol) )){
215 |             if (myrank_mpi == 0) fprintf(stderr, " Matrix size is small to be facrorized using this number of processors \n");
216 |                size += step;
217 |         }
218 |         n = size; ldw = 2*n;
219 | 	mloc  = numroc_( &n, &nb, &myrow, &i0, &nprow );
220 | 	nloc  = numroc_( &n, &nb, &mycol, &i0, &npcol );
221 | 	mlocW = numroc_( &ldw, &nb, &myrow, &i0, &nprow );
222 |         if (verbose & myrank_mpi == 0) fprintf(stderr, "Desc Init starts %d\n", mloc);
223 | 
224 | 	descinit_( descA, &n, &n, &nb, &nb, &i0, &i0, &ictxt, &mloc, &info );
225 | 	descinit_( descAcpy, &n, &n, &nb, &nb, &i0, &i0, &ictxt, &mloc, &info );
226 | 	descinit_( descC, &n, &n, &nb, &nb, &i0, &i0, &ictxt, &mloc, &info );
227 | 	descinit_( descH, &n, &n, &nb, &nb, &i0, &i0, &ictxt, &mloc, &info );
228 |         if (verbose & myrank_mpi == 0) fprintf(stderr, "Desc Init ends %d\n", mloc);
229 | 
230 | 	A     = (double *)malloc(mloc*nloc*sizeof(double)) ;
231 | 	H     = (double *)malloc(mloc*nloc*sizeof(double)) ;
232 | 	C     = (double *)malloc(mloc*nloc*sizeof(double)) ;
233 | 	Acpy  = (double *)malloc(mloc*nloc*sizeof(double)) ;
234 | 	D     = (double *)malloc(n*sizeof(double)) ;
235 | 
236 |         
237 |         /* Initialize the timing counters */
238 |         my_elapsed_polarqdwh = 0.0, elapsed_polarqdwh = 0.0, sumtime_polarqdwh = 0.0; 
239 |         max_time_polarqdwh  = 0.0, min_time_polarqdwh  = 1e20;
240 | 
241 |         /* Generate matrix by pdlatms */
242 |         {
243 |            char   *dist = "N"; /* NORMAL( 0, 1 )  ( 'N' for normal ) */
244 |            int    iseed[4] = {1, 0, 0, 1};
245 |            char   *sym = "P"; /* The generated matrix is symmetric, with
246 |                                 eigenvalues (= singular values) specified by D, COND,
247 |                                 MODE, and DMAX; they will not be negative.
248 |                                 "N" not supported. */
249 |            //int    mode = 4; /* sets D(i)=1 - (i-1)/(N-1)*(1 - 1/COND) */
250 |            //double cond = 1.0/eps;
251 |            double dmax = 1.0;
252 |            int    kl   = n;
253 |            int    ku   = n;
254 |            char   *pack = "N"; /* no packing */
255 |            int    order = n;
256 |            int    info;
257 |          
258 |            pdlasizesep_( descA, 
259 |                          &iprepad, &ipostpad, &sizemqrleft, &sizemqrright, &sizeqrf, 
260 |                          &lwork, 
261 |                          &sizeqtq, &sizechk, &sizesyevx, &isizesyevx, &sizesubtst, 
262 |                          &isizesubtst, &sizetst, &isizetst );
263 |            if (verbose & myrank_mpi == 0) fprintf(stderr, "Setting lwork done\n");
264 |            Wloc1 = (double *)calloc(lwork,sizeof(double)) ;
265 |            pdlatms_(&n, &n, dist,
266 |                     iseed, sym, D, &mode, &cond, &dmax,
267 |                     &kl, &ku, pack, 
268 |                     A, &i1, &i1, descA, &order, 
269 |                     Wloc1, &lwork, &info);
270 |            if (verbose & myrank_mpi == 0) fprintf(stderr, "MatGen done\n");
271 |            if (info != 0) {
272 |                fprintf(stderr, "An error occured during matrix generation: %d\n", info );
273 |                return EXIT_FAILURE;
274 |            }
275 |            pdlacpy_( "All", &n, &n, 
276 |                      A, &i1, &i1, descA, 
277 |                      Acpy, &i1, &i1, descAcpy ); 
278 |            frobA  = pdlange_ ( "f", &n, &n, A, &i1, &i1, descA, Wloc1);
279 |            beta = 0.0;
280 | 
281 |            if (verbose & myrank_mpi == 0) fprintf(stderr, "Copy to Acpy done\n");
282 | 
283 |            free( Wloc1 );
284 |         }
285 | 
286 |         if (myrank_mpi == 0) fprintf(stderr, "\n\n");
287 |         if (myrank_mpi == 0) fprintf(stderr, "/////////////////////////////////////////////////////////////////////////\n");
288 |         if (myrank_mpi == 0) fprintf(stderr, "/////////////////////////////////////////////////////////////////////////\n");
289 |         if (myrank_mpi == 0) fprintf(stderr, "/////////////////////////////////////////////////////////////////////////\n");
290 | 
291 |         // QDWH
292 | 	//if ( qwmr || qwdc || qwel || polarqdwh) {
293 |         jobu   = lvec ? "V" : "N";
294 |         int lWork1, lWork2, lWi;
295 |         Wloc1  = (double *)calloc(1,sizeof(double)) ;
296 |         Wloc2  = (double *)calloc(1,sizeof(double)) ;
297 |         lWork1 = -1; 
298 |         lWork2 = -1; 
299 | 
300 |         pdgeqdwh( "H", n, n,
301 |                   A, i1, i1, descA, 
302 |                   H, i1, i1, descH, 
303 |                   //NULL, lWork, //debug
304 |                   Wloc1, lWork1,
305 |                   Wloc2, lWork2,
306 |                   &my_info_facto);
307 | 
308 |         lWork1 = Wloc1[0];
309 |         lWork2 = Wloc2[0];
310 | 
311 |         
312 |         //Wloc  = (double *)malloc(lWork*n*sizeof(double));
313 | 	Wloc1  = (double *)malloc((lWork1*nloc)*sizeof(double));
314 | 	Wloc2  = (double *)malloc((lWork2*nloc)*sizeof(double));
315 | 
316 |         //int MB3 = 3*n;
317 |         //int mlocW3 = numroc_( &MB3, &nb, &myrow, &i0, &nprow );
318 |         //mlocW3 = ((mlocW3+(nb-1))/nb)*nb;
319 | 	//Wloc  = (double *)malloc((mlocW3*nloc)*sizeof(double));
320 |                
321 | 
322 |         for (iter = 0; iter < niter; iter++) {
323 |             pdlacpy_( "All", &n, &n, 
324 |                       Acpy, &i1, &i1, descAcpy, 
325 |                       A,    &i1, &i1, descA ); 
326 |             flops = 0.0;
327 |             if (verbose & myrank_mpi == 0) fprintf(stderr, "\nQDWH + ScaLAPACK EIG done\n");
328 |             if (verbose & myrank_mpi == 0) fprintf(stderr, "QDWH starts...\n");
329 | 
330 |             /*
331 |              * Find polar decomposition using QDWH. 
332 |              * C contains the positive-definite factor. 
333 |              * A contains the orthogonal polar factor.
334 |              */
335 |             my_elapsed_polarqdwh   = 0.0;
336 |             my_elapsed_polarqdwh   =- MPI_Wtime();
337 | 
338 |             pdgeqdwh( "H", n, n,
339 |                       A, i1, i1, descA, 
340 |                       H, i1, i1, descH, 
341 |                       //NULL, lWork, //debug
342 |                       Wloc1, lWork1,
343 |                       Wloc2, lWork2,
344 |                       &my_info_facto);
345 | 
346 |             my_elapsed_polarqdwh   += MPI_Wtime();
347 |             MPI_Allreduce( &my_elapsed_polarqdwh, &elapsed_polarqdwh, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD);
348 |             sumtime_polarqdwh += elapsed_polarqdwh;
349 |             if ( elapsed_polarqdwh >= max_time_polarqdwh ) { max_time_polarqdwh = elapsed_polarqdwh;} 
350 |             if ( elapsed_polarqdwh <= min_time_polarqdwh ) { min_time_polarqdwh = elapsed_polarqdwh;} 
351 | 	    MPI_Allreduce( &my_info_facto, &info_facto_qw, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);
352 | 
353 | 	    MPI_Allreduce( &my_info_facto, &info_facto_qw, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);
354 | 
355 |             if (verbose & myrank_mpi == 0) fprintf(stderr, "QDWH ends...\n");
356 |             if (verbose & check & myrank_mpi == 0) fprintf(stderr, "Testing QDWH starts...\n");
357 |             /*
358 |              * Checking the polar factorization
359 |              */
360 |             if(check ){
361 |                /*
362 |                 * checking orthogonality of Up
363 |                 */ 
364 |                 alpha = 0.0; beta = 1.0;
365 |                 pdlaset_( "G", &n, &n, &alpha, &beta, C, &i1, &i1, descC);
366 |                 alpha = 1.0; beta = -1.0;
367 |                 pdgemm_( "T", "N", &n, &n, &n, 
368 |                          &alpha, 
369 |                          A, &i1, &i1, descA, 
370 |                          A, &i1, &i1, descA, 
371 |                          &beta, 
372 |                          C, &i1, &i1, descC);
373 |                 orth_Uqw  = pdlange_ ( "f", &n, &n, C, &i1, &i1, descC, Wloc1)/frobA;
374 |                /*
375 |                 * checking the factorization |A-Up*H|
376 |                 */ 
377 |                 pdlacpy_( "A", &n, &n, Acpy, &i1, &i1, descAcpy, C, &i1, &i1, descC );
378 |                 alpha = 1.0; beta = -1.0;
379 |                 pdgemm_( "N", "N", &n, &n, &n, 
380 |                          &alpha, 
381 |                          A, &i1, &i1, descA, 
382 |                          H, &i1, &i1, descH, 
383 |                          &beta, 
384 |                          C, &i1, &i1, descC);
385 |                 berr_UHqw  = pdlange_ ( "f", &n, &n, C, &i1, &i1, descC, Wloc1)/frobA;
386 |             }
387 |         }
388 |         if (  myrank_mpi == 0) {
389 |             fprintf(stderr, "# QDWH \n"); 
390 |             fprintf(stderr, "#\n");
391 | 	    fprintf(stderr, "# \tN     \tNB   \tNP   \tP   \tQ   \tGflop/s \tAvg-Time     \tMax-Time    \tMin-Time    \tBerr_UpH  \tOrth_Up  \tinfo     \n");
392 | 	    fprintf(stderr, "   %6d \t%4d \t%4d \t%3d \t%3d \t%8.2f", n, nb, nprocs_mpi, nprow, npcol, flops/1e9/min_time_polarqdwh);
393 | 	    fprintf(stderr, "\t%6.2f \t\t%6.2f \t\t%6.2f \t\t%2.4e \t%2.4e \t%d \n", sumtime_polarqdwh/niter, max_time_polarqdwh, min_time_polarqdwh, berr_UHqw, orth_Uqw,info_facto_qw);
394 |             fprintf(stderr, "/////////////////////////////////////////////////////////////////////////\n");
395 |          }
396 | 
397 | 
398 | 
399 |         if (myrank_mpi == 0) fprintf(stderr, "/////////////////////////////////////////////////////////////////////////\n");
400 | 
401 | 	free( A );
402 | 	free( Acpy );
403 | 	free( C );
404 | 	free( H );
405 | 	free( D );
406 |         free(Wloc1);
407 |         free(Wloc2);
408 |         if (verbose & myrank_mpi == 0) fprintf(stderr, "Free matrices done\n");
409 |     } // End loop over range
410 | 
411 | 
412 |         if (verbose & myrank_mpi == 0) fprintf(stderr, "Range loop ends\n");
413 | 
414 | 	blacs_gridexit_( &i0 );
415 | 	MPI_Finalize();
416 |         if (verbose & myrank_mpi == 0) fprintf(stderr, "Program ends...\n");
417 | 	return 0;
418 | }
419 | 


--------------------------------------------------------------------------------
/timing/timing_pdgezolopd.c:
--------------------------------------------------------------------------------
  1 | /**
  2 |  *
  3 |  * Copyright (c) 2017, King Abdullah University of Science and Technology
  4 |  * All rights reserved.
  5 |  *
  6 |  **/
  7 | 
  8 | /**
  9 |  *
 10 |  * @file timing_pdgezolopd.c
 11 |  *
 12 |  *  ZOLOPD is a high performance software framework for computing 
 13 |  *  the polar decomposition on distributed-memory manycore systems provided by KAUST
 14 |  *
 15 |  * @version 3.0.0
 16 |  * @author Dalal Sukkari
 17 |  * @author Hatem Ltaief
 18 |  * @date 2018-11-08
 19 |  *
 20 |  **/
 21 | 
 22 | #include <stdlib.h>
 23 | #include "polar.h"
 24 | 
 25 | 
 26 | /* Default values of parameters */
 27 | int nprow         = 1;
 28 | int npcol         = 1;
 29 | int lvec          = 1;
 30 | int rvec          = 1;
 31 | int n             = 5120;
 32 | int nb            = 128;
 33 | int mode          = 4;
 34 | double cond       = 9.0072e+15;
 35 | int optcond       = 0;
 36 | int start         = 5120;
 37 | int stop          = 5120;
 38 | int step          = 1;
 39 | int niter         = 1;
 40 | int check         = 0;
 41 | int verbose       = 0;
 42 | 
 43 | static inline double cWtime(void)
 44 | {
 45 |     struct timeval tp;
 46 |     gettimeofday( &tp, NULL );
 47 |     return tp.tv_sec + 1e-6 * tp.tv_usec;
 48 | }
 49 | 
 50 | void print_usage(void)
 51 | {
 52 |     fprintf(stderr,
 53 |             "======= ZOLOPD timing using ScaLAPACK\n"
 54 |             " -p      --nprow         : Number of MPI process rows\n"
 55 |             " -q      --npcol         : Number of MPI process cols\n"
 56 |             " -jl     --lvec          : Compute left singular vectors\n"
 57 |             " -jr     --rvec          : Compute right singular vectors\n"
 58 |             " -n      --N             : Dimension of the matrix\n"
 59 |             " -b      --nb            : Block size\n"
 60 |             " -m      --mode          : [1:6] Mode from pdlatms used to generate the matrix\n"
 61 |             " -k      --cond          : Condition number used to generate the matrix\n"
 62 |             " -o      --optcond       : Estimate Condition number using QR\n"
 63 |             " -i      --niter         : Number of iterations\n"
 64 |             " -r      --n_range       : Range for matrix sizes Start:Stop:Step\n"
 65 |             " -c      --check         : Check the solution\n"
 66 |             " -v      --verbose       : Verbose\n"
 67 |             " -h      --help          : Print this help\n" );
 68 | }
 69 | 
 70 | #define GETOPT_STRING "p:q:x:y:n:b:m:i:o:r:Q,S:s:w:e:c:f:t:v:h"
 71 | 
 72 | static struct option long_options[] =
 73 |     {
 74 |         /* PaRSEC specific options */
 75 |         {"nprow",      required_argument,  0, 'p'},
 76 |         {"npcol",      required_argument,  0, 'q'},
 77 |         {"jl",         no_argument,        0, 'x'},
 78 |         {"lvec",       no_argument,        0, 'x'},
 79 |         {"jr",         no_argument,        0, 'y'},
 80 |         {"rvec",       no_argument,        0, 'y'},
 81 |         {"N",          required_argument,  0, 'n'},
 82 |         {"n",          required_argument,  0, 'n'},
 83 |         {"nb",         required_argument,  0, 'b'},
 84 |         {"b",          required_argument,  0, 'b'},
 85 |         {"mode",       required_argument,  0, 'm'},
 86 |         {"m",          required_argument,  0, 'm'},
 87 |         {"cond",       required_argument,  0, 'k'},
 88 |         {"k",          required_argument,  0, 'k'},
 89 |         {"optcond",    required_argument,  0, 'o'},
 90 |         {"o",          required_argument,  0, 'o'},
 91 |         {"i",          required_argument,  0, 'i'},
 92 |         {"niter",      required_argument,  0, 'i'},
 93 |         {"r",          required_argument,  0, 'r'},
 94 |         {"n_range",    required_argument,  0, 'r'},
 95 |         {"check",      no_argument,        0, 'c'},
 96 |         {"verbose",    no_argument,        0, 'v'},
 97 |         {"help",       no_argument,        0, 'h'},
 98 |         {"h",          no_argument,        0, 'h'},
 99 |         {0, 0, 0, 0}
100 |     };
101 | 
102 | static void parse_arguments(int argc, char** argv)
103 | {
104 |     int opt = 0;
105 |     int c;
106 |     int myrank_mpi;
107 | 
108 |     MPI_Comm_rank(MPI_COMM_WORLD, &myrank_mpi);
109 | 
110 |     do {
111 | #if defined(HAVE_GETOPT_LONG)
112 |         c = getopt_long_only(argc, argv, "",
113 |                         long_options, &opt);
114 | #else
115 |         c = getopt(argc, argv, GETOPT_STRING);
116 |         (void) opt;
117 | #endif  /* defined(HAVE_GETOPT_LONG) */
118 | 
119 |         switch(c) {
120 |         case 'p': nprow     = atoi(optarg); break;
121 |         case 'q': npcol     = atoi(optarg); break;
122 |         case 'n': n         = atoi(optarg); start = n; stop = n; step = 1; break;
123 |         case 'b': nb        = atoi(optarg); break;
124 |         case 'm': mode      = atoi(optarg); break;
125 |         case 'k': cond      = atof(optarg); break;
126 |         case 'o': optcond   = atof(optarg); break;
127 |         case 'i': niter     = atoi(optarg); break;
128 |         case 'r': get_range( optarg, &start, &stop, &step ); break;
129 |         case 'c': check     = 1; break;
130 |         case 'v': verbose   = 1; break;
131 |         case 'h':
132 |             if (myrank_mpi == 0) print_usage(); MPI_Finalize(); exit(0);
133 |             break;
134 |         default:
135 |             break;
136 |         }
137 |     } while(-1 != c);
138 | }
139 | 
140 | int main(int argc, char **argv) {
141 | 
142 | 
143 |     int myrank_mpi, nprocs_mpi;
144 |     int ictxt, myrow, mycol;
145 |     int mloc, nloc, mlocW;
146 |     int mpi_comm_rows, mpi_comm_cols;
147 |     int i, j, k, iter, size, info_facto_mr, info_facto_dc, info_facto_zo, info_facto_el, info_facto_sl, info, iseed;
148 |     int my_info_facto;
149 |     int i0 = 0, i1 = 1;
150 |     int lwork, liwork, ldw;
151 | 
152 |     /* Allocation for the input/output matrices */
153 |     int descA[9], descH[9];
154 |     double *A=NULL, *H=NULL;
155 | 
156 |     /* Allocation to check the results */
157 |     int descAcpy[9], descC[9]; 
158 |     double *Acpy=NULL, *C=NULL;
159 | 
160 |     /* Allocation for pdlatsm */
161 |     double *Wloc1=NULL, *Wloc2=NULL, *D=NULL;
162 | 
163 |     double eps = LAPACKE_dlamch_work('e');
164 |     int iprepad, ipostpad, sizemqrleft, sizemqrright, sizeqrf, sizeqtq,
165 |         sizechk, sizesyevx, isizesyevx,
166 |         sizesubtst, isizesubtst, sizetst,
167 |         isizetst;
168 | /**/
169 | 
170 |     double flops, GFLOPS;
171 | 
172 |     double orth_Uzo, berr_UHzo;
173 |     double frobA;
174 | 
175 |     double alpha, beta;
176 |     char *jobu, *jobvt;
177 | 
178 |     double my_elapsed_polarzolopd = 0.0, elapsed_polarzolopd = 0.0, sumtime_polarzolopd = 0.0;
179 |     double max_time_polarzolopd  = 0.0, min_time_polarzolopd  = 1e20;
180 | 
181 | /**/
182 | 
183 |     if (verbose & myrank_mpi == 0) fprintf(stderr, "Program starts... \n");
184 | 
185 |     MPI_Init(&argc, &argv);
186 |     MPI_Comm_rank(MPI_COMM_WORLD, &myrank_mpi);
187 |     MPI_Comm_size(MPI_COMM_WORLD, &nprocs_mpi);
188 | 
189 |     if (verbose & myrank_mpi == 0) fprintf(stderr, "MPI Init done\n");
190 | 
191 |     parse_arguments(argc, argv);
192 | 
193 |     if (verbose & myrank_mpi == 0) fprintf(stderr, "Checking arguments done\n");
194 | 
195 |     Cblacs_get( -1, 0, &ictxt );
196 |     Cblacs_gridinit( &ictxt, "R", nprow, npcol );
197 |     Cblacs_gridinfo( ictxt, &nprow, &npcol, &myrow, &mycol );
198 | 
199 |     if (verbose & myrank_mpi == 0) fprintf(stderr, "BLACS Init done\n");
200 | 
201 |     if (myrank_mpi == 0) {
202 |         fprintf(stderr, "# \n");
203 |         fprintf(stderr, "# NPROCS %d P %d Q %d\n", nprocs_mpi, nprow, npcol);
204 |         fprintf(stderr, "# niter %d\n", niter);
205 |         fprintf(stderr, "# n_range %d:%d:%d mode: %d cond: %2.4e \n", start, stop, step, mode, cond);
206 |         fprintf(stderr, "# \n");
207 |     }
208 | 
209 |     if (verbose & myrank_mpi == 0) fprintf(stderr, "Range loop starts\n");
210 | 
211 | 
212 |      // Begin loop over range
213 |     for (size = start; size <= stop; size += step) {
214 |         while ( (int)((double)size / (double)nb) < ( max(nprow , npcol) )){
215 |             if (myrank_mpi == 0) fprintf(stderr, " Matrix size is small to be facrorized using this number of processors \n");
216 |                size += step;
217 |         }
218 |         n = size; ldw = 2*n;
219 | 	mloc  = numroc_( &n, &nb, &myrow, &i0, &nprow );
220 | 	nloc  = numroc_( &n, &nb, &mycol, &i0, &npcol );
221 | 	mlocW = numroc_( &ldw, &nb, &myrow, &i0, &nprow );
222 |         if (verbose & myrank_mpi == 0) fprintf(stderr, "Desc Init starts %d\n", mloc);
223 | 
224 | 	descinit_( descA, &n, &n, &nb, &nb, &i0, &i0, &ictxt, &mloc, &info );
225 | 	descinit_( descAcpy, &n, &n, &nb, &nb, &i0, &i0, &ictxt, &mloc, &info );
226 | 	descinit_( descC, &n, &n, &nb, &nb, &i0, &i0, &ictxt, &mloc, &info );
227 | 	descinit_( descH, &n, &n, &nb, &nb, &i0, &i0, &ictxt, &mloc, &info );
228 |         if (verbose & myrank_mpi == 0) fprintf(stderr, "Desc Init ends %d\n", mloc);
229 | 
230 | 	A     = (double *)malloc(mloc*nloc*sizeof(double)) ;
231 | 	H     = (double *)malloc(mloc*nloc*sizeof(double)) ;
232 | 	C     = (double *)malloc(mloc*nloc*sizeof(double)) ;
233 | 	Acpy  = (double *)malloc(mloc*nloc*sizeof(double)) ;
234 | 	D     = (double *)malloc(n*sizeof(double)) ;
235 | 
236 |         
237 |         /* Initialize the timing counters */
238 |         my_elapsed_polarzolopd = 0.0, elapsed_polarzolopd = 0.0, sumtime_polarzolopd = 0.0; 
239 |         max_time_polarzolopd  = 0.0, min_time_polarzolopd  = 1e20;
240 | 
241 |         /* Generate matrix by pdlatms */
242 |         {
243 |            char   *dist = "N"; /* NORMAL( 0, 1 )  ( 'N' for normal ) */
244 |            int    iseed[4] = {1, 0, 0, 1};
245 |            char   *sym = "P"; /* The generated matrix is symmetric, with
246 |                                 eigenvalues (= singular values) specified by D, COND,
247 |                                 MODE, and DMAX; they will not be negative.
248 |                                 "N" not supported. */
249 |            //int    mode = 4; /* sets D(i)=1 - (i-1)/(N-1)*(1 - 1/COND) */
250 |            //double cond = 1.0/eps;
251 |            double dmax = 1.0;
252 |            int    kl   = n;
253 |            int    ku   = n;
254 |            char   *pack = "N"; /* no packing */
255 |            int    order = n;
256 |            int    info;
257 |          
258 |            pdlasizesep_( descA, 
259 |                          &iprepad, &ipostpad, &sizemqrleft, &sizemqrright, &sizeqrf, 
260 |                          &lwork, 
261 |                          &sizeqtq, &sizechk, &sizesyevx, &isizesyevx, &sizesubtst, 
262 |                          &isizesubtst, &sizetst, &isizetst );
263 |            if (verbose & myrank_mpi == 0) fprintf(stderr, "Setting lwork done\n");
264 |            Wloc1 = (double *)calloc(lwork,sizeof(double)) ;
265 |            pdlatms_(&n, &n, dist,
266 |                     iseed, sym, D, &mode, &cond, &dmax,
267 |                     &kl, &ku, pack, 
268 |                     A, &i1, &i1, descA, &order, 
269 |                     Wloc1, &lwork, &info);
270 |            if (verbose & myrank_mpi == 0) fprintf(stderr, "MatGen done\n");
271 |            if (info != 0) {
272 |                fprintf(stderr, "An error occured during matrix generation: %d\n", info );
273 |                return EXIT_FAILURE;
274 |            }
275 |            pdlacpy_( "All", &n, &n, 
276 |                      A, &i1, &i1, descA, 
277 |                      Acpy, &i1, &i1, descAcpy ); 
278 |            frobA  = pdlange_ ( "f", &n, &n, A, &i1, &i1, descA, Wloc1);
279 |            beta = 0.0;
280 | 
281 |            if (verbose & myrank_mpi == 0) fprintf(stderr, "Copy to Acpy done\n");
282 | 
283 |            free( Wloc1 );
284 |         }
285 | 
286 |         if (myrank_mpi == 0) fprintf(stderr, "\n\n");
287 |         if (myrank_mpi == 0) fprintf(stderr, "/////////////////////////////////////////////////////////////////////////\n");
288 |         if (myrank_mpi == 0) fprintf(stderr, "/////////////////////////////////////////////////////////////////////////\n");
289 |         if (myrank_mpi == 0) fprintf(stderr, "/////////////////////////////////////////////////////////////////////////\n");
290 | 
291 |         // ZOLOPD
292 | 	//if ( zomr || zodc || zoel || polarzolopd) {
293 |         jobu   = lvec ? "V" : "N";
294 |         int lWork1, lWork2, lWi;
295 |         Wloc1  = (double *)calloc(1,sizeof(double)) ;
296 |         Wloc2  = (double *)calloc(1,sizeof(double)) ;
297 |         lWork1 = -1; 
298 |         lWork2 = -1; 
299 | 
300 |         pdgezolopd( "H", n, n,
301 |                   A, i1, i1, descA, 
302 |                   H, i1, i1, descH, 
303 |                   //NULL, lWork, //debug
304 |                   Wloc1, lWork1,
305 |                   Wloc2, lWork2,
306 |                   &my_info_facto);
307 | 
308 |         lWork1 = Wloc1[0];
309 |         lWork2 = Wloc2[0];
310 | 
311 |         
312 |         //Wloc  = (double *)malloc(lWork*n*sizeof(double));
313 | 	Wloc1  = (double *)malloc((lWork1*nloc)*sizeof(double));
314 | 	Wloc2  = (double *)malloc((lWork2*nloc)*sizeof(double));
315 | 
316 |         //int MB3 = 3*n;
317 |         //int mlocW3 = numroc_( &MB3, &nb, &myrow, &i0, &nprow );
318 |         //mlocW3 = ((mlocW3+(nb-1))/nb)*nb;
319 | 	//Wloc  = (double *)malloc((mlocW3*nloc)*sizeof(double));
320 |                
321 | 
322 |         for (iter = 0; iter < niter; iter++) {
323 |             pdlacpy_( "All", &n, &n, 
324 |                       Acpy, &i1, &i1, descAcpy, 
325 |                       A,    &i1, &i1, descA ); 
326 |             flops = 0.0;
327 |             if (verbose & myrank_mpi == 0) fprintf(stderr, "\nZOLOPD + ScaLAPACK EIG done\n");
328 |             if (verbose & myrank_mpi == 0) fprintf(stderr, "ZOLOPD starts...\n");
329 | 
330 |             /*
331 |              * Find polar decomposition using ZOLOPD. 
332 |              * C contains the positive-definite factor. 
333 |              * A contains the orthogonal polar factor.
334 |              */
335 |             my_elapsed_polarzolopd   = 0.0;
336 |             my_elapsed_polarzolopd   =- MPI_Wtime();
337 | 
338 |             pdgezolopd( "H", n, n,
339 |                       A, i1, i1, descA, 
340 |                       H, i1, i1, descH, 
341 |                       //NULL, lWork, //debug
342 |                       Wloc1, lWork1,
343 |                       Wloc2, lWork2,
344 |                       &my_info_facto);
345 | 
346 |             my_elapsed_polarzolopd   += MPI_Wtime();
347 |             MPI_Allreduce( &my_elapsed_polarzolopd, &elapsed_polarzolopd, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD);
348 |             sumtime_polarzolopd += elapsed_polarzolopd;
349 |             if ( elapsed_polarzolopd >= max_time_polarzolopd ) { max_time_polarzolopd = elapsed_polarzolopd;} 
350 |             if ( elapsed_polarzolopd <= min_time_polarzolopd ) { min_time_polarzolopd = elapsed_polarzolopd;} 
351 | 	    MPI_Allreduce( &my_info_facto, &info_facto_zo, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);
352 | 
353 | 	    MPI_Allreduce( &my_info_facto, &info_facto_zo, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);
354 | 
355 |             if (verbose & myrank_mpi == 0) fprintf(stderr, "ZOLOPD ends...\n");
356 |             if (verbose & check & myrank_mpi == 0) fprintf(stderr, "Testing ZOLOPD starts...\n");
357 |             /*
358 |              * Checking the polar factorization
359 |              */
360 |             if(check ){
361 |                /*
362 |                 * checking orthogonality of Up
363 |                 */ 
364 |                 alpha = 0.0; beta = 1.0;
365 |                 pdlaset_( "G", &n, &n, &alpha, &beta, C, &i1, &i1, descC);
366 |                 alpha = 1.0; beta = -1.0;
367 |                 pdgemm_( "T", "N", &n, &n, &n, 
368 |                          &alpha, 
369 |                          A, &i1, &i1, descA, 
370 |                          A, &i1, &i1, descA, 
371 |                          &beta, 
372 |                          C, &i1, &i1, descC);
373 |                 orth_Uzo  = pdlange_ ( "f", &n, &n, C, &i1, &i1, descC, Wloc1)/frobA;
374 |                /*
375 |                 * checking the factorization |A-Up*H|
376 |                 */ 
377 |                 pdlacpy_( "A", &n, &n, Acpy, &i1, &i1, descAcpy, C, &i1, &i1, descC );
378 |                 alpha = 1.0; beta = -1.0;
379 |                 pdgemm_( "N", "N", &n, &n, &n, 
380 |                          &alpha, 
381 |                          A, &i1, &i1, descA, 
382 |                          H, &i1, &i1, descH, 
383 |                          &beta, 
384 |                          C, &i1, &i1, descC);
385 |                 berr_UHzo  = pdlange_ ( "f", &n, &n, C, &i1, &i1, descC, Wloc1)/frobA;
386 |             }
387 |         }
388 |         if (  myrank_mpi == 0) {
389 |             fprintf(stderr, "# ZOLOPD \n"); 
390 |             fprintf(stderr, "#\n");
391 | 	    fprintf(stderr, "# \tN     \tNB   \tNP   \tP   \tQ   \tGflop/s \tAvg-Time     \tMax-Time    \tMin-Time    \tBerr_UpH  \tOrth_Up  \tinfo     \n");
392 | 	    fprintf(stderr, "   %6d \t%4d \t%4d \t%3d \t%3d \t%8.2f", n, nb, nprocs_mpi, nprow, npcol, flops/1e9/min_time_polarzolopd);
393 | 	    fprintf(stderr, "\t%6.2f \t\t%6.2f \t\t%6.2f \t\t%2.4e \t%2.4e \t%d \n", sumtime_polarzolopd/niter, max_time_polarzolopd, min_time_polarzolopd, berr_UHzo, orth_Uzo,info_facto_zo);
394 |             fprintf(stderr, "/////////////////////////////////////////////////////////////////////////\n");
395 |          }
396 | 
397 | 
398 | 
399 |         if (myrank_mpi == 0) fprintf(stderr, "/////////////////////////////////////////////////////////////////////////\n");
400 | 
401 | 	free( A );
402 | 	free( Acpy );
403 | 	free( C );
404 | 	free( H );
405 | 	free( D );
406 |         free(Wloc1);
407 |         free(Wloc2);
408 |         if (verbose & myrank_mpi == 0) fprintf(stderr, "Free matrices done\n");
409 |     } // End loop over range
410 | 
411 | 
412 |         if (verbose & myrank_mpi == 0) fprintf(stderr, "Range loop ends\n");
413 | 
414 | 	blacs_gridexit_( &i0 );
415 | 	MPI_Finalize();
416 |         if (verbose & myrank_mpi == 0) fprintf(stderr, "Program ends...\n");
417 | 	return 0;
418 | }
419 | 


--------------------------------------------------------------------------------
/include/flops.h:
--------------------------------------------------------------------------------
  1 | /**
  2 |  *
  3 |  * Copyright (c) 2017, King Abdullah University of Science and Technology
  4 |  * All rights reserved.
  5 |  *
  6 |  **/
  7 | 
  8 | /**
  9 |  *
 10 |  * @file flops.h
 11 |  *
 12 |  *  File provided by Univ. of Tennessee,
 13 |  *
 14 |  * @version 3.0.0
 15 |  * @author Mathieu Faverge
 16 |  * @date 2018-11-08
 17 |  *
 18 |  **/
 19 | /*
 20 |  * This file provide the flops formula for all Level 3 BLAS and some
 21 |  * Lapack routines.  Each macro uses the same size parameters as the
 22 |  * function associated and provide one formula for additions and one
 23 |  * for multiplications. Example to use these macros:
 24 |  *
 25 |  *    FLOPS_ZGEMM( m, n, k )
 26 |  *
 27 |  * All the formula are reported in the LAPACK Lawn 41:
 28 |  *     http://www.netlib.org/lapack/lawns/lawn41.ps
 29 |  */
 30 | #ifndef _FLOPS_H_
 31 | #define _FLOPS_H_
 32 | 
 33 | /************************************************************************
 34 |  *           Generic formula coming from LAWN 41
 35 |  ***********************************************************************/
 36 | 
 37 | /*
 38 |  * Level 2 BLAS 
 39 |  */  
 40 | #define FMULS_GEMV(__m, __n) ((double)(__m) * (double)(__n) + 2. * (double)(__m))
 41 | #define FADDS_GEMV(__m, __n) ((double)(__m) * (double)(__n)                     )
 42 | 
 43 | #define FMULS_SYMV(__n) FMULS_GEMV( (__n), (__n) )
 44 | #define FADDS_SYMV(__n) FADDS_GEMV( (__n), (__n) )
 45 | #define FMULS_HEMV FMULS_SYMV
 46 | #define FADDS_HEMV FADDS_SYMV
 47 | 
 48 | /*
 49 |  * Level 3 BLAS 
 50 |  */
 51 | #define FMULS_GEMM(__m, __n, __k) ((double)(__m) * (double)(__n) * (double)(__k))
 52 | #define FADDS_GEMM(__m, __n, __k) ((double)(__m) * (double)(__n) * (double)(__k))
 53 | 
 54 | #define FMULS_SYMM(__side, __m, __n) ( ( (__side) == Left ) ? FMULS_GEMM((__m), (__m), (__n)) : FMULS_GEMM((__m), (__n), (__n)) )
 55 | #define FADDS_SYMM(__side, __m, __n) ( ( (__side) == Left ) ? FADDS_GEMM((__m), (__m), (__n)) : FADDS_GEMM((__m), (__n), (__n)) )
 56 | #define FMULS_HEMM FMULS_SYMM
 57 | #define FADDS_HEMM FADDS_SYMM
 58 | 
 59 | #define FMULS_SYRK(__k, __n) (0.5 * (double)(__k) * (double)(__n) * ((double)(__n)+1.))
 60 | #define FADDS_SYRK(__k, __n) (0.5 * (double)(__k) * (double)(__n) * ((double)(__n)+1.))
 61 | #define FMULS_HERK FMULS_SYRK
 62 | #define FADDS_HERK FADDS_SYRK
 63 | 
 64 | #define FMULS_SYR2K(__k, __n) ((double)(__k) * (double)(__n) * (double)(__n)                )
 65 | #define FADDS_SYR2K(__k, __n) ((double)(__k) * (double)(__n) * (double)(__n) + (double)(__n))
 66 | #define FMULS_HER2K FMULS_SYR2K
 67 | #define FADDS_HER2K FADDS_SYR2K
 68 | 
 69 | #define FMULS_TRMM_2(__m, __n) (0.5 * (double)(__n) * (double)(__m) * ((double)(__m)+1.))
 70 | #define FADDS_TRMM_2(__m, __n) (0.5 * (double)(__n) * (double)(__m) * ((double)(__m)-1.))
 71 | 
 72 | 
 73 | #define FMULS_TRMM(__side, __m, __n) ( ( (__side) == 'L' ) ? FMULS_TRMM_2((__m), (__n)) : FMULS_TRMM_2((__n), (__m)) )
 74 | #define FADDS_TRMM(__side, __m, __n) ( ( (__side) == 'L' ) ? FADDS_TRMM_2((__m), (__n)) : FADDS_TRMM_2((__n), (__m)) )
 75 | 
 76 | #define FMULS_TRSM FMULS_TRMM
 77 | #define FADDS_TRSM FMULS_TRMM
 78 | 
 79 | /*
 80 |  * Lapack
 81 |  */
 82 | #define FMULS_GETRF(__m, __n) ( ((__m) < (__n)) ? (0.5 * (double)(__m) * ((double)(__m) * ((double)(__n) - (1./3.) * (__m) - 1. ) + (double)(__n)) + (2. / 3.) * (__m)) \
 83 |                                 :                 (0.5 * (double)(__n) * ((double)(__n) * ((double)(__m) - (1./3.) * (__n) - 1. ) + (double)(__m)) + (2. / 3.) * (__n)) )
 84 | #define FADDS_GETRF(__m, __n) ( ((__m) < (__n)) ? (0.5 * (double)(__m) * ((double)(__m) * ((double)(__n) - (1./3.) * (__m)      ) - (double)(__n)) + (1. / 6.) * (__m)) \
 85 |                                 :                 (0.5 * (double)(__n) * ((double)(__n) * ((double)(__m) - (1./3.) * (__n)      ) - (double)(__m)) + (1. / 6.) * (__n)) )
 86 | 
 87 | #define FMULS_GETRI(__n) ( (double)(__n) * ((5. / 6.) + (double)(__n) * ((2. / 3.) * (double)(__n) + 0.5)) )
 88 | #define FADDS_GETRI(__n) ( (double)(__n) * ((5. / 6.) + (double)(__n) * ((2. / 3.) * (double)(__n) - 1.5)) )
 89 | 
 90 | #define FMULS_GETRS(__n, __nrhs) ((double)(__nrhs) * (double)(__n) *  (double)(__n)       )
 91 | #define FADDS_GETRS(__n, __nrhs) ((double)(__nrhs) * (double)(__n) * ((double)(__n) - 1. ))
 92 | 
 93 | #define FMULS_POTRF(__n) ((double)(__n) * (((1. / 6.) * (double)(__n) + 0.5) * (double)(__n) + (1. / 3.)))
 94 | #define FADDS_POTRF(__n) ((double)(__n) * (((1. / 6.) * (double)(__n)      ) * (double)(__n) - (1. / 6.)))
 95 | 
 96 | #define FMULS_POTRI(__n) ( (double)(__n) * ((2. / 3.) + (double)(__n) * ((1. / 3.) * (double)(__n) + 1. )) )
 97 | #define FADDS_POTRI(__n) ( (double)(__n) * ((1. / 6.) + (double)(__n) * ((1. / 3.) * (double)(__n) - 0.5)) )
 98 | 
 99 | #define FMULS_POTRS(__n, __nrhs) ((double)(__nrhs) * (double)(__n) * ((double)(__n) + 1. ))
100 | #define FADDS_POTRS(__n, __nrhs) ((double)(__nrhs) * (double)(__n) * ((double)(__n) - 1. ))
101 | 
102 | //SPBTRF
103 | //SPBTRS
104 | //SSYTRF
105 | //SSYTRI
106 | //SSYTRS
107 | 
108 | #define FMULS_GEQRF(__m, __n) (((__m) > (__n)) ? ((double)(__n) * ((double)(__n) * (  0.5-(1./3.) * (double)(__n) + (double)(__m)) +    (double)(__m) + 23. / 6.)) \
109 |                                :                 ((double)(__m) * ((double)(__m) * ( -0.5-(1./3.) * (double)(__m) + (double)(__n)) + 2.*(double)(__n) + 23. / 6.)) )
110 | #define FADDS_GEQRF(__m, __n) (((__m) > (__n)) ? ((double)(__n) * ((double)(__n) * (  0.5-(1./3.) * (double)(__n) + (double)(__m))                    +  5. / 6.)) \
111 |                                :                 ((double)(__m) * ((double)(__m) * ( -0.5-(1./3.) * (double)(__m) + (double)(__n)) +    (double)(__n) +  5. / 6.)) )
112 | 
113 | #define FMULS_GEQLF(__m, __n) FMULS_GEQRF(__m, __n)
114 | #define FADDS_GEQLF(__m, __n) FADDS_GEQRF(__m, __n)
115 | 
116 | #define FMULS_GERQF(__m, __n) (((__m) > (__n)) ? ((double)(__n) * ((double)(__n) * (  0.5-(1./3.) * (double)(__n) + (double)(__m)) +    (double)(__m) + 29. / 6.)) \
117 |                                :                 ((double)(__m) * ((double)(__m) * ( -0.5-(1./3.) * (double)(__m) + (double)(__n)) + 2.*(double)(__n) + 29. / 6.)) )
118 | #define FADDS_GERQF(__m, __n) (((__m) > (__n)) ? ((double)(__n) * ((double)(__n) * ( -0.5-(1./3.) * (double)(__n) + (double)(__m)) +    (double)(__m) +  5. / 6.)) \
119 |                                :                 ((double)(__m) * ((double)(__m) * (  0.5-(1./3.) * (double)(__m) + (double)(__n)) +                  +  5. / 6.)) )
120 | 
121 | #define FMULS_GELQF(__m, __n) FMULS_GERQF(__m, __n)
122 | #define FADDS_GELQF(__m, __n) FADDS_GERQF(__m, __n)
123 | 
124 | #define FMULS_UNGQR(__m, __n, __k) ((double)(__k) * (2.* (double)(__m) * (double)(__n) +            2. * (double)(__n) - 5./3. + (double)(__k) * ( 2./3. * (double)(__k) - ((double)(__m) + (double)(__n)) - 1.)))
125 | #define FADDS_UNGQR(__m, __n, __k) ((double)(__k) * (2.* (double)(__m) * (double)(__n) + (double)(__n) - (double)(__m) + 1./3. + (double)(__k) * ( 2./3. * (double)(__k) - ((double)(__m) + (double)(__n))     )))
126 | #define FMULS_UNGQL FMULS_UNGQR
127 | #define FMULS_ORGQR FMULS_UNGQR
128 | #define FMULS_ORGQL FMULS_UNGQR
129 | #define FADDS_UNGQL FADDS_UNGQR
130 | #define FADDS_ORGQR FADDS_UNGQR
131 | #define FADDS_ORGQL FADDS_UNGQR
132 | 
133 | #define FMULS_UNGRQ(__m, __n, __k) ((double)(__k) * (2.* (double)(__m) * (double)(__n) + (double)(__m) + (double)(__n) - 2./3. + (double)(__k) * ( 2./3. * (double)(__k) - ((double)(__m) + (double)(__n)) - 1.)))
134 | #define FADDS_UNGRQ(__m, __n, __k) ((double)(__k) * (2.* (double)(__m) * (double)(__n) + (double)(__m) - (double)(__n) + 1./3. + (double)(__k) * ( 2./3. * (double)(__k) - ((double)(__m) + (double)(__n))     )))
135 | #define FMULS_UNGLQ FMULS_UNGRQ
136 | #define FMULS_ORGRQ FMULS_UNGRQ
137 | #define FMULS_ORGLQ FMULS_UNGRQ
138 | #define FADDS_UNGLQ FADDS_UNGRQ
139 | #define FADDS_ORGRQ FADDS_UNGRQ
140 | #define FADDS_ORGLQ FADDS_UNGRQ
141 | 
142 | #define FMULS_GEQRS(__m, __n, __nrhs) ((double)(__nrhs) * ((double)(__n) * ( 2.* (double)(__m) - 0.5 * (double)(__n) + 2.5)))
143 | #define FADDS_GEQRS(__m, __n, __nrhs) ((double)(__nrhs) * ((double)(__n) * ( 2.* (double)(__m) - 0.5 * (double)(__n) + 0.5)))
144 | 
145 | //UNMQR, UNMLQ, UNMQL, UNMRQ (Left)
146 | //UNMQR, UNMLQ, UNMQL, UNMRQ (Right)
147 | 
148 | #define FMULS_TRTRI(__n) ((double)(__n) * ((double)(__n) * ( 1./6. * (double)(__n) + 0.5 ) + 1./3.))
149 | #define FADDS_TRTRI(__n) ((double)(__n) * ((double)(__n) * ( 1./6. * (double)(__n) - 0.5 ) + 1./3.))
150 | 
151 | #define FMULS_GEHRD(__n) ( (double)(__n) * ((double)(__n) * (5./3. *(double)(__n) + 0.5) - 7./6.) - 13. )
152 | #define FADDS_GEHRD(__n) ( (double)(__n) * ((double)(__n) * (5./3. *(double)(__n) - 1. ) - 2./3.) -  8. )
153 | 
154 | #define FMULS_SYTRD(__n) ( (double)(__n) *  ( (double)(__n) * ( 2./3. * (double)(__n) + 2.5 ) - 1./6. ) - 15.)
155 | #define FADDS_SYTRD(__n) ( (double)(__n) *  ( (double)(__n) * ( 2./3. * (double)(__n) + 1.  ) - 8./3. ) -  4.)
156 | #define FMULS_HETRD FMULS_SYTRD
157 | #define FADDS_HETRD FADDS_SYTRD
158 | 
159 | #define FMULS_GEBRD(__m, __n) ( ((__m) >= (__n)) ? ((double)(__n) * ((double)(__n) * (2. * (double)(__m) - 2./3. * (double)(__n) + 2. )                 + 20./3.)) \
160 |                                 :                  ((double)(__m) * ((double)(__m) * (2. * (double)(__n) - 2./3. * (double)(__m) + 2. )                 + 20./3.)) )
161 | #define FADDS_GEBRD(__m, __n) ( ((__m) >= (__n)) ? ((double)(__n) * ((double)(__n) * (2. * (double)(__m) - 2./3. * (double)(__n) + 1. ) - (double)(__m) +  5./3.)) \
162 |                                 :                  ((double)(__m) * ((double)(__m) * (2. * (double)(__n) - 2./3. * (double)(__m) + 1. ) - (double)(__n) +  5./3.)) )
163 | 
164 | 
165 | /*******************************************************************************
166 |  *               Users functions
167 |  ******************************************************************************/
168 | 
169 | /*
170 |  * Level 2 BLAS 
171 |  */  
172 | #define FLOPS_ZGEMV(__m, __n) (6. * FMULS_GEMV((__m), (__n)) + 2.0 * FADDS_GEMV((__m), (__n)) )
173 | #define FLOPS_CGEMV(__m, __n) (6. * FMULS_GEMV((__m), (__n)) + 2.0 * FADDS_GEMV((__m), (__n)) )
174 | #define FLOPS_DGEMV(__m, __n) (     FMULS_GEMV((__m), (__n)) +       FADDS_GEMV((__m), (__n)) )
175 | #define FLOPS_SGEMV(__m, __n) (     FMULS_GEMV((__m), (__n)) +       FADDS_GEMV((__m), (__n)) )
176 |  
177 | #define FLOPS_ZHEMV(__n) (6. * FMULS_HEMV((__n)) + 2.0 * FADDS_HEMV((__n)) )
178 | #define FLOPS_CHEMV(__n) (6. * FMULS_HEMV((__n)) + 2.0 * FADDS_HEMV((__n)) )
179 | 
180 | #define FLOPS_ZSYMV(__n) (6. * FMULS_SYMV((__n)) + 2.0 * FADDS_SYMV((__n)) )
181 | #define FLOPS_CSYMV(__n) (6. * FMULS_SYMV((__n)) + 2.0 * FADDS_SYMV((__n)) )
182 | #define FLOPS_DSYMV(__n) (     FMULS_SYMV((__n)) +       FADDS_SYMV((__n)) )
183 | #define FLOPS_SSYMV(__n) (     FMULS_SYMV((__n)) +       FADDS_SYMV((__n)) )
184 |  
185 | /*
186 |  * Level 3 BLAS 
187 |  */
188 | #define FLOPS_ZGEMM(__m, __n, __k) (6. * FMULS_GEMM((__m), (__n), (__k)) + 2.0 * FADDS_GEMM((__m), (__n), (__k)) )
189 | #define FLOPS_CGEMM(__m, __n, __k) (6. * FMULS_GEMM((__m), (__n), (__k)) + 2.0 * FADDS_GEMM((__m), (__n), (__k)) )
190 | #define FLOPS_DGEMM(__m, __n, __k) (     FMULS_GEMM((__m), (__n), (__k)) +       FADDS_GEMM((__m), (__n), (__k)) )
191 | #define FLOPS_SGEMM(__m, __n, __k) (     FMULS_GEMM((__m), (__n), (__k)) +       FADDS_GEMM((__m), (__n), (__k)) )
192 | 
193 | #define FLOPS_ZHEMM(__side, __m, __n) (6. * FMULS_HEMM(__side, (__m), (__n)) + 2.0 * FADDS_HEMM(__side, (__m), (__n)) )
194 | #define FLOPS_CHEMM(__side, __m, __n) (6. * FMULS_HEMM(__side, (__m), (__n)) + 2.0 * FADDS_HEMM(__side, (__m), (__n)) )
195 | 
196 | #define FLOPS_ZSYMM(__side, __m, __n) (6. * FMULS_SYMM(__side, (__m), (__n)) + 2.0 * FADDS_SYMM(__side, (__m), (__n)) )
197 | #define FLOPS_CSYMM(__side, __m, __n) (6. * FMULS_SYMM(__side, (__m), (__n)) + 2.0 * FADDS_SYMM(__side, (__m), (__n)) )
198 | #define FLOPS_DSYMM(__side, __m, __n) (     FMULS_SYMM(__side, (__m), (__n)) +       FADDS_SYMM(__side, (__m), (__n)) )
199 | #define FLOPS_SSYMM(__side, __m, __n) (     FMULS_SYMM(__side, (__m), (__n)) +       FADDS_SYMM(__side, (__m), (__n)) )
200 |  
201 | #define FLOPS_ZHERK(__k, __n) (6. * FMULS_HERK((__k), (__n)) + 2.0 * FADDS_HERK((__k), (__n)) )
202 | #define FLOPS_CHERK(__k, __n) (6. * FMULS_HERK((__k), (__n)) + 2.0 * FADDS_HERK((__k), (__n)) )
203 | 
204 | #define FLOPS_ZSYRK(__k, __n) (6. * FMULS_SYRK((__k), (__n)) + 2.0 * FADDS_SYRK((__k), (__n)) )
205 | #define FLOPS_CSYRK(__k, __n) (6. * FMULS_SYRK((__k), (__n)) + 2.0 * FADDS_SYRK((__k), (__n)) )
206 | #define FLOPS_DSYRK(__k, __n) (     FMULS_SYRK((__k), (__n)) +       FADDS_SYRK((__k), (__n)) )
207 | #define FLOPS_SSYRK(__k, __n) (     FMULS_SYRK((__k), (__n)) +       FADDS_SYRK((__k), (__n)) )
208 |  
209 | #define FLOPS_ZHER2K(__k, __n) (6. * FMULS_HER2K((__k), (__n)) + 2.0 * FADDS_HER2K((__k), (__n)) )
210 | #define FLOPS_CHER2K(__k, __n) (6. * FMULS_HER2K((__k), (__n)) + 2.0 * FADDS_HER2K((__k), (__n)) )
211 | 
212 | #define FLOPS_ZSYR2K(__k, __n) (6. * FMULS_SYR2K((__k), (__n)) + 2.0 * FADDS_SYR2K((__k), (__n)) )
213 | #define FLOPS_CSYR2K(__k, __n) (6. * FMULS_SYR2K((__k), (__n)) + 2.0 * FADDS_SYR2K((__k), (__n)) )
214 | #define FLOPS_DSYR2K(__k, __n) (     FMULS_SYR2K((__k), (__n)) +       FADDS_SYR2K((__k), (__n)) )
215 | #define FLOPS_SSYR2K(__k, __n) (     FMULS_SYR2K((__k), (__n)) +       FADDS_SYR2K((__k), (__n)) )
216 |  
217 | #define FLOPS_ZTRMM(__side, __m, __n) (6. * FMULS_TRMM(__side, (__m), (__n)) + 2.0 * FADDS_TRMM(__side, (__m), (__n)) )
218 | #define FLOPS_CTRMM(__side, __m, __n) (6. * FMULS_TRMM(__side, (__m), (__n)) + 2.0 * FADDS_TRMM(__side, (__m), (__n)) )
219 | #define FLOPS_DTRMM(__side, __m, __n) (     FMULS_TRMM(__side, (__m), (__n)) +       FADDS_TRMM(__side, (__m), (__n)) )
220 | #define FLOPS_STRMM(__side, __m, __n) (     FMULS_TRMM(__side, (__m), (__n)) +       FADDS_TRMM(__side, (__m), (__n)) )
221 |  
222 | #define FLOPS_ZTRSM(__side, __m, __n) (6. * FMULS_TRSM(__side, (__m), (__n)) + 2.0 * FADDS_TRSM(__side, (__m), (__n)) )
223 | #define FLOPS_CTRSM(__side, __m, __n) (6. * FMULS_TRSM(__side, (__m), (__n)) + 2.0 * FADDS_TRSM(__side, (__m), (__n)) )
224 | #define FLOPS_DTRSM(__side, __m, __n) (     FMULS_TRSM(__side, (__m), (__n)) +       FADDS_TRSM(__side, (__m), (__n)) )
225 | #define FLOPS_STRSM(__side, __m, __n) (     FMULS_TRSM(__side, (__m), (__n)) +       FADDS_TRSM(__side, (__m), (__n)) )
226 |  
227 | /*
228 |  * Lapack
229 |  */
230 | #define FLOPS_ZGETRF(__m, __n) (6. * FMULS_GETRF((__m), (__n)) + 2.0 * FADDS_GETRF((__m), (__n)) )
231 | #define FLOPS_CGETRF(__m, __n) (6. * FMULS_GETRF((__m), (__n)) + 2.0 * FADDS_GETRF((__m), (__n)) )
232 | #define FLOPS_DGETRF(__m, __n) (     FMULS_GETRF((__m), (__n)) +       FADDS_GETRF((__m), (__n)) )
233 | #define FLOPS_SGETRF(__m, __n) (     FMULS_GETRF((__m), (__n)) +       FADDS_GETRF((__m), (__n)) )
234 | 
235 | #define FLOPS_ZGETRI(__n) (6. * FMULS_GETRI((__n)) + 2.0 * FADDS_GETRI((__n)) )
236 | #define FLOPS_CGETRI(__n) (6. * FMULS_GETRI((__n)) + 2.0 * FADDS_GETRI((__n)) )
237 | #define FLOPS_DGETRI(__n) (     FMULS_GETRI((__n)) +       FADDS_GETRI((__n)) )
238 | #define FLOPS_SGETRI(__n) (     FMULS_GETRI((__n)) +       FADDS_GETRI((__n)) )
239 | 
240 | #define FLOPS_ZGETRS(__n, __nrhs) (6. * FMULS_GETRS((__n), (__nrhs)) + 2.0 * FADDS_GETRS((__n), (__nrhs)) )
241 | #define FLOPS_CGETRS(__n, __nrhs) (6. * FMULS_GETRS((__n), (__nrhs)) + 2.0 * FADDS_GETRS((__n), (__nrhs)) )
242 | #define FLOPS_DGETRS(__n, __nrhs) (     FMULS_GETRS((__n), (__nrhs)) +       FADDS_GETRS((__n), (__nrhs)) )
243 | #define FLOPS_SGETRS(__n, __nrhs) (     FMULS_GETRS((__n), (__nrhs)) +       FADDS_GETRS((__n), (__nrhs)) )
244 | 
245 | #define FLOPS_ZPOTRF(__n) (6. * FMULS_POTRF((__n)) + 2.0 * FADDS_POTRF((__n)) )
246 | #define FLOPS_CPOTRF(__n) (6. * FMULS_POTRF((__n)) + 2.0 * FADDS_POTRF((__n)) )
247 | #define FLOPS_DPOTRF(__n) (     FMULS_POTRF((__n)) +       FADDS_POTRF((__n)) )
248 | #define FLOPS_SPOTRF(__n) (     FMULS_POTRF((__n)) +       FADDS_POTRF((__n)) )
249 | 
250 | #define FLOPS_ZPOTRI(__n) (6. * FMULS_POTRI((__n)) + 2.0 * FADDS_POTRI((__n)) )
251 | #define FLOPS_CPOTRI(__n) (6. * FMULS_POTRI((__n)) + 2.0 * FADDS_POTRI((__n)) )
252 | #define FLOPS_DPOTRI(__n) (     FMULS_POTRI((__n)) +       FADDS_POTRI((__n)) )
253 | #define FLOPS_SPOTRI(__n) (     FMULS_POTRI((__n)) +       FADDS_POTRI((__n)) )
254 | 
255 | #define FLOPS_ZPOTRS(__n, __nrhs) (6. * FMULS_POTRS((__n), (__nrhs)) + 2.0 * FADDS_POTRS((__n), (__nrhs)) )
256 | #define FLOPS_CPOTRS(__n, __nrhs) (6. * FMULS_POTRS((__n), (__nrhs)) + 2.0 * FADDS_POTRS((__n), (__nrhs)) )
257 | #define FLOPS_DPOTRS(__n, __nrhs) (     FMULS_POTRS((__n), (__nrhs)) +       FADDS_POTRS((__n), (__nrhs)) )
258 | #define FLOPS_SPOTRS(__n, __nrhs) (     FMULS_POTRS((__n), (__nrhs)) +       FADDS_POTRS((__n), (__nrhs)) )
259 | 
260 | #define FLOPS_ZGEQRF(__m, __n) (6. * FMULS_GEQRF((__m), (__n)) + 2.0 * FADDS_GEQRF((__m), (__n)) )
261 | #define FLOPS_CGEQRF(__m, __n) (6. * FMULS_GEQRF((__m), (__n)) + 2.0 * FADDS_GEQRF((__m), (__n)) )
262 | #define FLOPS_DGEQRF(__m, __n) (     FMULS_GEQRF((__m), (__n)) +       FADDS_GEQRF((__m), (__n)) )
263 | #define FLOPS_SGEQRF(__m, __n) (     FMULS_GEQRF((__m), (__n)) +       FADDS_GEQRF((__m), (__n)) )
264 | 
265 | #define FLOPS_ZGEQLF(__m, __n) (6. * FMULS_GEQLF((__m), (__n)) + 2.0 * FADDS_GEQLF((__m), (__n)) )
266 | #define FLOPS_CGEQLF(__m, __n) (6. * FMULS_GEQLF((__m), (__n)) + 2.0 * FADDS_GEQLF((__m), (__n)) )
267 | #define FLOPS_DGEQLF(__m, __n) (     FMULS_GEQLF((__m), (__n)) +       FADDS_GEQLF((__m), (__n)) )
268 | #define FLOPS_SGEQLF(__m, __n) (     FMULS_GEQLF((__m), (__n)) +       FADDS_GEQLF((__m), (__n)) )
269 | 
270 | #define FLOPS_ZGERQF(__m, __n) (6. * FMULS_GERQF((__m), (__n)) + 2.0 * FADDS_GERQF((__m), (__n)) )
271 | #define FLOPS_CGERQF(__m, __n) (6. * FMULS_GERQF((__m), (__n)) + 2.0 * FADDS_GERQF((__m), (__n)) )
272 | #define FLOPS_DGERQF(__m, __n) (     FMULS_GERQF((__m), (__n)) +       FADDS_GERQF((__m), (__n)) )
273 | #define FLOPS_SGERQF(__m, __n) (     FMULS_GERQF((__m), (__n)) +       FADDS_GERQF((__m), (__n)) )
274 | 
275 | #define FLOPS_ZGELQF(__m, __n) (6. * FMULS_GELQF((__m), (__n)) + 2.0 * FADDS_GELQF((__m), (__n)) )
276 | #define FLOPS_CGELQF(__m, __n) (6. * FMULS_GELQF((__m), (__n)) + 2.0 * FADDS_GELQF((__m), (__n)) )
277 | #define FLOPS_DGELQF(__m, __n) (     FMULS_GELQF((__m), (__n)) +       FADDS_GELQF((__m), (__n)) )
278 | #define FLOPS_SGELQF(__m, __n) (     FMULS_GELQF((__m), (__n)) +       FADDS_GELQF((__m), (__n)) )
279 | 
280 | #define FLOPS_ZUNGQR(__m, __n, __k) (6. * FMULS_UNGQR((__m), (__n), (__k)) + 2.0 * FADDS_UNGQR((__m), (__n), (__k)) )
281 | #define FLOPS_CUNGQR(__m, __n, __k) (6. * FMULS_UNGQR((__m), (__n), (__k)) + 2.0 * FADDS_UNGQR((__m), (__n), (__k)) )
282 | #define FLOPS_DUNGQR(__m, __n, __k) (     FMULS_UNGQR((__m), (__n), (__k)) +       FADDS_UNGQR((__m), (__n), (__k)) )
283 | #define FLOPS_SUNGQR(__m, __n, __k) (     FMULS_UNGQR((__m), (__n), (__k)) +       FADDS_UNGQR((__m), (__n), (__k)) )
284 | 
285 | #define FLOPS_ZUNGQL(__m, __n, __k) (6. * FMULS_UNGQL((__m), (__n), (__k)) + 2.0 * FADDS_UNGQL((__m), (__n), (__k)) )
286 | #define FLOPS_CUNGQL(__m, __n, __k) (6. * FMULS_UNGQL((__m), (__n), (__k)) + 2.0 * FADDS_UNGQL((__m), (__n), (__k)) )
287 | #define FLOPS_DUNGQL(__m, __n, __k) (     FMULS_UNGQL((__m), (__n), (__k)) +       FADDS_UNGQL((__m), (__n), (__k)) )
288 | #define FLOPS_SUNGQL(__m, __n, __k) (     FMULS_UNGQL((__m), (__n), (__k)) +       FADDS_UNGQL((__m), (__n), (__k)) )
289 | 
290 | #define FLOPS_ZORGQR(__m, __n, __k) (6. * FMULS_ORGQR((__m), (__n), (__k)) + 2.0 * FADDS_ORGQR((__m), (__n), (__k)) )
291 | #define FLOPS_CORGQR(__m, __n, __k) (6. * FMULS_ORGQR((__m), (__n), (__k)) + 2.0 * FADDS_ORGQR((__m), (__n), (__k)) )
292 | #define FLOPS_DORGQR(__m, __n, __k) (     FMULS_ORGQR((__m), (__n), (__k)) +       FADDS_ORGQR((__m), (__n), (__k)) )
293 | #define FLOPS_SORGQR(__m, __n, __k) (     FMULS_ORGQR((__m), (__n), (__k)) +       FADDS_ORGQR((__m), (__n), (__k)) )
294 | 
295 | #define FLOPS_ZORGQL(__m, __n, __k) (6. * FMULS_ORGQL((__m), (__n), (__k)) + 2.0 * FADDS_ORGQL((__m), (__n), (__k)) )
296 | #define FLOPS_CORGQL(__m, __n, __k) (6. * FMULS_ORGQL((__m), (__n), (__k)) + 2.0 * FADDS_ORGQL((__m), (__n), (__k)) )
297 | #define FLOPS_DORGQL(__m, __n, __k) (     FMULS_ORGQL((__m), (__n), (__k)) +       FADDS_ORGQL((__m), (__n), (__k)) )
298 | #define FLOPS_SORGQL(__m, __n, __k) (     FMULS_ORGQL((__m), (__n), (__k)) +       FADDS_ORGQL((__m), (__n), (__k)) )
299 | 
300 | #define FLOPS_ZUNGRQ(__m, __n, __k) (6. * FMULS_UNGRQ((__m), (__n), (__k)) + 2.0 * FADDS_UNGRQ((__m), (__n), (__k)) )
301 | #define FLOPS_CUNGRQ(__m, __n, __k) (6. * FMULS_UNGRQ((__m), (__n), (__k)) + 2.0 * FADDS_UNGRQ((__m), (__n), (__k)) )
302 | #define FLOPS_DUNGRQ(__m, __n, __k) (     FMULS_UNGRQ((__m), (__n), (__k)) +       FADDS_UNGRQ((__m), (__n), (__k)) )
303 | #define FLOPS_SUNGRQ(__m, __n, __k) (     FMULS_UNGRQ((__m), (__n), (__k)) +       FADDS_UNGRQ((__m), (__n), (__k)) )
304 | 
305 | #define FLOPS_ZUNGLQ(__m, __n, __k) (6. * FMULS_UNGLQ((__m), (__n), (__k)) + 2.0 * FADDS_UNGLQ((__m), (__n), (__k)) )
306 | #define FLOPS_CUNGLQ(__m, __n, __k) (6. * FMULS_UNGLQ((__m), (__n), (__k)) + 2.0 * FADDS_UNGLQ((__m), (__n), (__k)) )
307 | #define FLOPS_DUNGLQ(__m, __n, __k) (     FMULS_UNGLQ((__m), (__n), (__k)) +       FADDS_UNGLQ((__m), (__n), (__k)) )
308 | #define FLOPS_SUNGLQ(__m, __n, __k) (     FMULS_UNGLQ((__m), (__n), (__k)) +       FADDS_UNGLQ((__m), (__n), (__k)) )
309 | 
310 | #define FLOPS_ZORGRQ(__m, __n, __k) (6. * FMULS_ORGRQ((__m), (__n), (__k)) + 2.0 * FADDS_ORGRQ((__m), (__n), (__k)) )
311 | #define FLOPS_CORGRQ(__m, __n, __k) (6. * FMULS_ORGRQ((__m), (__n), (__k)) + 2.0 * FADDS_ORGRQ((__m), (__n), (__k)) )
312 | #define FLOPS_DORGRQ(__m, __n, __k) (     FMULS_ORGRQ((__m), (__n), (__k)) +       FADDS_ORGRQ((__m), (__n), (__k)) )
313 | #define FLOPS_SORGRQ(__m, __n, __k) (     FMULS_ORGRQ((__m), (__n), (__k)) +       FADDS_ORGRQ((__m), (__n), (__k)) )
314 | 
315 | #define FLOPS_ZORGLQ(__m, __n, __k) (6. * FMULS_ORGLQ((__m), (__n), (__k)) + 2.0 * FADDS_ORGLQ((__m), (__n), (__k)) )
316 | #define FLOPS_CORGLQ(__m, __n, __k) (6. * FMULS_ORGLQ((__m), (__n), (__k)) + 2.0 * FADDS_ORGLQ((__m), (__n), (__k)) )
317 | #define FLOPS_DORGLQ(__m, __n, __k) (     FMULS_ORGLQ((__m), (__n), (__k)) +       FADDS_ORGLQ((__m), (__n), (__k)) )
318 | #define FLOPS_SORGLQ(__m, __n, __k) (     FMULS_ORGLQ((__m), (__n), (__k)) +       FADDS_ORGLQ((__m), (__n), (__k)) )
319 | 
320 | ///////
321 | #define FMULS_UNMQR(m_, n_, k_, side_) (( (side_) == Left ) \
322 |     ?  (2.*(n_)*(m_)*(k_) - (n_)*(k_)*(k_) + 2.*(n_)*(k_)) \
323 |     :  (2.*(n_)*(m_)*(k_) - (m_)*(k_)*(k_) + (m_)*(k_) + (n_)*(k_) - 0.5*(k_)*(k_) + 0.5*(k_)))
324 | #define FADDS_UNMQR(m_, n_, k_, side_) (( ((side_)) == Left ) \
325 |     ?  (2.*(n_)*(m_)*(k_) - (n_)*(k_)*(k_) + (n_)*(k_)) \
326 |     :  (2.*(n_)*(m_)*(k_) - (m_)*(k_)*(k_) + (m_)*(k_)))
327 | #define FLOPS_DORMQR(m_, n_, k_, side_) (     FMULS_UNMQR((double)(m_), (double)(n_), (double)(k_), (side_)) +       FADDS_UNMQR((double)(m_), (double)(n_), (double)(k_), (side_)) )
328 | //////
329 | 
330 | #define FLOPS_ZGEQRS(__m, __n, __nrhs) (6. * FMULS_GEQRS((__m), (__n), (__nrhs)) + 2.0 * FADDS_GEQRS((__m), (__n), (__nrhs)) )
331 | #define FLOPS_CGEQRS(__m, __n, __nrhs) (6. * FMULS_GEQRS((__m), (__n), (__nrhs)) + 2.0 * FADDS_GEQRS((__m), (__n), (__nrhs)) )
332 | #define FLOPS_DGEQRS(__m, __n, __nrhs) (     FMULS_GEQRS((__m), (__n), (__nrhs)) +       FADDS_GEQRS((__m), (__n), (__nrhs)) )
333 | #define FLOPS_SGEQRS(__m, __n, __nrhs) (     FMULS_GEQRS((__m), (__n), (__nrhs)) +       FADDS_GEQRS((__m), (__n), (__nrhs)) )
334 | 
335 | #define FLOPS_ZTRTRI(__n) (6. * FMULS_TRTRI((__n)) + 2.0 * FADDS_TRTRI((__n)) )
336 | #define FLOPS_CTRTRI(__n) (6. * FMULS_TRTRI((__n)) + 2.0 * FADDS_TRTRI((__n)) )
337 | #define FLOPS_DTRTRI(__n) (     FMULS_TRTRI((__n)) +       FADDS_TRTRI((__n)) )
338 | #define FLOPS_STRTRI(__n) (     FMULS_TRTRI((__n)) +       FADDS_TRTRI((__n)) )
339 | 
340 | #define FLOPS_ZGEHRD(__n) (6. * FMULS_GEHRD((__n)) + 2.0 * FADDS_GEHRD((__n)) )
341 | #define FLOPS_CGEHRD(__n) (6. * FMULS_GEHRD((__n)) + 2.0 * FADDS_GEHRD((__n)) )
342 | #define FLOPS_DGEHRD(__n) (     FMULS_GEHRD((__n)) +       FADDS_GEHRD((__n)) )
343 | #define FLOPS_SGEHRD(__n) (     FMULS_GEHRD((__n)) +       FADDS_GEHRD((__n)) )
344 | 
345 | #define FLOPS_ZHETRD(__n) (6. * FMULS_HETRD((__n)) + 2.0 * FADDS_HETRD((__n)) )
346 | #define FLOPS_CHETRD(__n) (6. * FMULS_HETRD((__n)) + 2.0 * FADDS_HETRD((__n)) )
347 | 
348 | #define FLOPS_ZSYTRD(__n) (6. * FMULS_SYTRD((__n)) + 2.0 * FADDS_SYTRD((__n)) )
349 | #define FLOPS_CSYTRD(__n) (6. * FMULS_SYTRD((__n)) + 2.0 * FADDS_SYTRD((__n)) )
350 | #define FLOPS_DSYTRD(__n) (     FMULS_SYTRD((__n)) +       FADDS_SYTRD((__n)) )
351 | #define FLOPS_SSYTRD(__n) (     FMULS_SYTRD((__n)) +       FADDS_SYTRD((__n)) )
352 | 
353 | #define FLOPS_ZGEBRD(__m, __n) (6. * FMULS_GEBRD((__m), (__n)) + 2.0 * FADDS_GEBRD((__m), (__n)) )
354 | #define FLOPS_CGEBRD(__m, __n) (6. * FMULS_GEBRD((__m), (__n)) + 2.0 * FADDS_GEBRD((__m), (__n)) )
355 | #define FLOPS_DGEBRD(__m, __n) (     FMULS_GEBRD((__m), (__n)) +       FADDS_GEBRD((__m), (__n)) )
356 | #define FLOPS_SGEBRD(__m, __n) (     FMULS_GEBRD((__m), (__n)) +       FADDS_GEBRD((__m), (__n)) )
357 | 
358 | #endif /* _FLOPS_H_ */
359 | 


--------------------------------------------------------------------------------
/src/pdgeqdwh.c:
--------------------------------------------------------------------------------
  1 | /**
  2 |  *
  3 |  * Copyright (c) 2017, King Abdullah University of Science and Technology
  4 |  * All rights reserved.
  5 |  *
  6 |  **/
  7 | 
  8 | /**
  9 |  *
 10 |  * @file pdgeqdwh.c
 11 |  *
 12 |  *  QDWH is a high performance software framework for computing 
 13 |  *  the polar decomposition on distributed-memory manycore systems provided by KAUST
 14 |  *
 15 |  * @version 3.0.0
 16 |  * @author Dalal Sukkari
 17 |  * @author Hatem Ltaief
 18 |  * @date 2018-11-08
 19 |  *
 20 |  **/
 21 | 
 22 | #include "polar.h"
 23 | 
 24 | extern void pdgenm2( double *A, int M, int N, int descA[9], double *W, int descW[9], double *Sx, int descSx[9], double *e, double tol);
 25 | 
 26 | #ifndef max
 27 | #define max(a, b) ((a) > (b) ? (a) : (b))
 28 | #endif
 29 | #ifndef min
 30 | #define min(a, b) ((a) < (b) ? (a) : (b))
 31 | #endif
 32 | 
 33 | 
 34 |  /*******************************************************************************
 35 |  *  .. Scalar Arguments ..
 36 |  *  INTEGER            IA, INFO, JA, LWORK, M, N
 37 |  *   ..
 38 |  *  .. Array Arguments ..
 39 |  *  INTEGER            DESCA( * )
 40 |     DOUBLE PRECISION   A( * ), TAU( * ), WORK( * )
 41 |  *     ..
 42 |  *
 43 |  *  Purpose
 44 |  *  =======
 45 |  *  
 46 |  *  PDGDWH computes the polar decomposition of a real distributed M-by-N  
 47 |  *
 48 |  *  matrix A = U * H.
 49 |  *
 50 |  *  Notes
 51 |  *  =====
 52 |  *
 53 |  *  Each global data object is described by an associated description
 54 |  *  vector.  This vector stores the information required to establish
 55 |  *  the mapping between an object element and its corresponding process
 56 |  *  and memory location.
 57 |  *
 58 |  *  Let A be a generic term for any 2D block cyclicly distributed array.
 59 |  *  Such a global array has an associated description vector DESCA.
 60 |  *  In the following comments, the character _ should be read as
 61 |  *  "of the global array".
 62 |  *
 63 |  *  NOTATION        STORED IN      EXPLANATION
 64 |  *  --------------- -------------- --------------------------------------
 65 |  *  DTYPE_A(global) DESCA( DTYPE_ )The descriptor type.  In this case,
 66 |  *                                 DTYPE_A = 1.
 67 |  *  CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating
 68 |  *                                 the BLACS process grid A is distribu-
 69 |  *                                 ted over. The context itself is glo-
 70 |  *                                 bal, but the handle (the integer
 71 |  *                                 value) may vary.
 72 |  *  M_A    (global) DESCA( M_ )    The number of rows in the global
 73 |  *                                 array A.
 74 |  *  N_A    (global) DESCA( N_ )    The number of columns in the global
 75 |  *                                 array A.
 76 |  *  MB_A   (global) DESCA( MB_ )   The blocking factor used to distribute
 77 |  *                                 the rows of the array.
 78 |  *  NB_A   (global) DESCA( NB_ )   The blocking factor used to distribute
 79 |  *                                 the columns of the array.
 80 |  *  RSRC_A (global) DESCA( RSRC_ ) The process row over which the first
 81 |  *                                 row of the array A is distributed.
 82 |  *  CSRC_A (global) DESCA( CSRC_ ) The process column over which the
 83 |  *                                 first column of the array A is
 84 |  *                                 distributed.
 85 |  *  LLD_A  (local)  DESCA( LLD_ )  The leading dimension of the local
 86 |  *                                 array.  LLD_A >= MAX(1,LOCr(M_A)).
 87 |  *
 88 |  *  Let K be the number of rows or columns of a distributed matrix,
 89 |  *  and assume that its process grid has dimension p x q.
 90 |  *  LOCr( K ) denotes the number of elements of K that a process
 91 |  *  would receive if K were distributed over the p processes of its
 92 |  *  process column.
 93 |  *  Similarly, LOCc( K ) denotes the number of elements of K that a
 94 |  *  process would receive if K were distributed over the q processes of
 95 |  *  its process row.
 96 |  *  The values of LOCr() and LOCc() may be determined via a call to the
 97 |  *  ScaLAPACK tool function, NUMROC:
 98 |  *          LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ),
 99 |  *          LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ).
100 |  *  An upper bound for these quantities may be computed by:
101 |  *          LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A
102 |  *          LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A
103 |  *
104 |  *  Arguments
105 |  *************
106 |  *  JOBH    (global input) CHARACTER*1
107 |  *          Specifies options for computing H:
108 |  *          = 'H':  the H (the symmetric positive
109 |  *                  semidefinite polar factor) are returned in the array H;
110 |  *          = 'N':  no columns of H (no symmetric positive semidefinite polar factor) are
111 |  *                  computed.
112 |  *
113 |  *  M       (global input) INTEGER
114 |  *          The number of rows of the input matrix A.  M >= 0.
115 |  *
116 |  *  N       (global input) INTEGER
117 |  *          The number of columns of the input matrix A.  N >= 0.
118 |  *
119 |  *  A       (local input/output) block cyclic DOUBLE PRECISION
120 |  *          array,
121 |  *          global dimension (M, N), local dimension (MP, NQ)
122 |  *          On entry, this array contains the matrix to be factorized 
123 |  *          On exit, it contain the orthogonal polar factor A_P
124 |  *
125 |  *  IA      (global input) INTEGER
126 |  *          The row index in the global array A indicating the first
127 |  *          row of sub( A ).
128 |  *
129 |  *  JA      (global input) INTEGER
130 |  *          The column index in the global array A indicating the
131 |  *          first column of sub( A ).
132 |  *
133 |  *  DESCA   (global input) INTEGER array of dimension DLEN_
134 |  *          The array descriptor for the distributed matrix A.
135 |  *
136 |  *  H       (local output) block cyclic DOUBLE PRECISION
137 |  *          array,
138 |  *          global dimension (M, N), local dimension (MP, NQ)
139 |  *          On exit, this array contains the symmetric positive semidefinite polar factor H 
140 |  *
141 |  *  IH      (global input) INTEGER
142 |  *          The row index in the global array H indicating the first
143 |  *          row of sub( H ).
144 |  *
145 |  *  JH      (global input) INTEGER
146 |  *          The column index in the global array H indicating the
147 |  *          first column of sub( H ).
148 |  *
149 |  *  DESCH   (global input) INTEGER array of dimension DLEN_
150 |  *          The array descriptor for the distributed matrix H.
151 |  *
152 |  *  WORK    (local workspace/output) DOUBLE PRECISION   array, dimension
153 |  *          (LWORK* NQ)
154 |  *          On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
155 |  *
156 |  * LWORK    (local input) INTEGER
157 |  *          The dimension of the array WORK.
158 |  *          LWORK must be at least LWORK >= 3*MP*NQ
159 |  *          If LWORK = -1, then LWORK is global input and a workspace    
160 |  *          query is assumed; the routine only calculates the minimum
161 |  *          and optimal size for all work arrays. Each of these
162 |  *          values is returned in the first entry of the corresponding
163 |  *          work array, and no error message is issued by PXERBLA.         
164 |  *
165 |  *  INFO (global output) INTEGER
166 |  *          = 0:  successful exit
167 |  *          < 0:  If the i-th argument is an array and the j-entry had
168 |  *                an illegal value, then INFO = -(i*100+j), if the i-th
169 |  *                argument is a scalar and had an illegal value, then
170 |  *                INFO = -i.
171 |  *
172 |  ******************************************************************************/
173 | 
174 | 
175 | int pdgeqdwh( char *jobh, int m, int n,
176 | 	      double *A, int iA, int jA, int *descA, 
177 |               double *H, int iH, int jH, int *descH,
178 |               double *Work1, int lWork1, 
179 |               double *Work2, int lWork2, 
180 |               int *info)
181 | {
182 | 
183 |     int    init = 0;
184 |     double eps;
185 |     double tol1;
186 |     double tol3;
187 | 
188 |     complex dd, sqd, a1;
189 |     double conv = 100.;
190 |     double a, b, c, L2, Liconv, alpha, beta, Anorm, Ainvnorm, Li, norm_est;
191 |     double tol = 3.e-1;
192 |     double flops_dgeqrf, flops_dorgqr, flops_dgemm, flops_dpotrf, flops_dtrsm;
193 |     int MB = 2*m;
194 |     int it, itconv, facto = -1;
195 |     int itqr = 0, itpo = 0, alloc_qr = 0;
196 |     int i1 =1, i0 = 0, iM = m+1;
197 |     int myrank_mpi;
198 |     double qwtime, litime, nrmtime, potime, qrtime, Htime;
199 |     double sync_time_elapsed, reduced_time_elapsed;
200 | 
201 |     int verbose = 0, prof = 0, optcond = 0;
202 |     double flops;
203 |     flops = 0.0;
204 | 
205 |     int mloc, nloc, mlocW, nb;   
206 |     int myrow, mycol, nprow, npcol;   
207 |     int ctxt_ = 1, nb_ = 5;
208 |     int ictxt;
209 |     int wantH;
210 | 
211 |     int lWi, lwork_qr;
212 |     int *Wi = (int *)calloc(1,sizeof(int)) ;
213 |     double *W  = (double *)calloc(1,sizeof(double)) ;
214 | 
215 |     int iinfo;
216 | 
217 |     /*
218 |      * Get the grid parameters
219 |      */
220 |     ictxt = descA[ctxt_];
221 |     Cblacs_get( -1, 0, &ictxt );
222 |     nb = descA[nb_];
223 |     Cblacs_gridinfo( ictxt, &nprow, &npcol, &myrow, &mycol );
224 |     mloc  = numroc_( &m, &nb, &myrow, &i0, &nprow );
225 |     nloc  = numroc_( &n, &nb, &mycol, &i0, &npcol );
226 |     mlocW = numroc_( &MB, &nb, &myrow, &i0, &nprow );
227 |  
228 |     int lmin1, lmin2, lquery;
229 |     *info = 0; 
230 |     lquery =  (lWork1 == -1 || lWork2 == -1); 
231 |     wantH = 0;
232 | 
233 |    /*
234 |     * Test the input parameters
235 |     */
236 |     if( nprow == -1 ){
237 |        *info = -(700+ctxt_);
238 |     }
239 |     else { 
240 |        if ( m < n ){
241 | 	   fprintf(stderr, "error(m >= n is required)") ;
242 | 	   return -1;
243 |        }
244 |        if (jobh[0] == 'H' || jobh[0] == 'h'){
245 |            wantH = 1;
246 |        }
247 | 
248 |        int i2 = 2, i3 = 3, i7 = 7, i11 = 11;
249 |        int *idum1, *idum2;
250 |        idum1 = (int *)malloc(2*sizeof(int)) ;
251 |        idum2 = (int *)malloc(2*sizeof(int)) ;
252 |        chk1mat_(&m, &i2, &n, &i3, &iA, &jA, descA, &i7, info);
253 |        if (wantH){
254 |           chk1mat_(&m, &i2, &n, &i3, &iH, &jH, descH, &i11, info);
255 |        }
256 |        //igamx2d_(descA[ctxt_], "A", " ", &i1, &i1, info, &i1, &i1, &i1, &i_1, &i_1, &i0);
257 | 
258 |        lquery =  (lWork1 == -1 || lWork2 == -1); 
259 |        if (*info == 0){
260 |           lmin1 = mloc;
261 |           lmin2 = mlocW;
262 |           Work1[0] = lmin1;
263 |           Work2[0] = lmin2;
264 |           lquery =  (lWork1 == -1 || lWork2 == -1); 
265 |           if( (lWork1 < lmin1) & !lquery ){
266 |               *info = -13;
267 |           }
268 |           if( (lWork2 < lmin2) & !lquery ){
269 |               *info = -15;
270 |           }
271 |        }
272 | 
273 |        idum1[0] = wantH;
274 |        if( lWork1 == -1 || lWork2 == -1) {
275 |              idum1[1] = -1;
276 |        }
277 |        else {
278 |              idum1[1] =  1;
279 |        }
280 |        idum2[0] =  1;
281 |        idum2[1] =  15;
282 |        pchk1mat_( &m, &i2, &n, &i3, &iA, &jA, descA, &i7, &i2, idum1, idum2,
283 |                         info );
284 |        if ((*info == 0) && wantH){
285 |           pchk1mat_( &m, &i2, &n, &i3, &iH, &jH, descH, &i11, &i0, idum1, idum2,
286 |                         info );
287 |        }
288 |     }
289 |       
290 |     if( *info != 0 ){
291 |         pxerbla_( &ictxt, "PDGEQDWH", &(int){-1*info[0]} ); 
292 |         return 0;
293 |     }
294 |     else if ( lquery ){
295 |     //lquery =  (lWork1 == -1 || lWork2 == -1); 
296 |     //if ( lquery ){
297 |     /*
298 |      * Find Workspace 
299 |      */
300 |       /*
301 |        int lwork_qr = -1, lwork_cn = -1;
302 |        pdgecon_ ("1", &m, H, &iH, &jH, descH, 
303 |                  &Anorm, &Li, 
304 |                  Work, &lwork_cn, Wi, &lWi, info);
305 |        lwork_cn  = (int)Work[0];
306 |        lWi = N;//(int)iWloc[0];
307 | 
308 |        pdgeqrf_(&MB, &n, H, &iH, &iH, descH, 
309 |                 tau, Work, &lwork_qr, info);
310 |        lwork_qr  = Work[0];
311 |        lWork  = max ( lwork_cn, lwork_qr);
312 |        lWi = N;
313 |        */
314 |        //Work[0]  = mlocW_; //B = Work;
315 |        //Work[0] = 3*mloc;
316 |        //Work[0] = 3*m;
317 |        //Work[0] = ((3*m+nb)/nb)*nb;
318 |        Work1[0] = mloc;
319 |        Work2[0] = mlocW;
320 |        return 0;
321 |     } 
322 | 
323 |     /* Quick return if possible */
324 |     if ( m == 0 || n == 0 ){
325 |          return 0;
326 |     }
327 | 
328 |     /**
329 |      * Create the required workspaces
330 |      * Needed for debugging the code
331 |      */
332 | 
333 |     double *U=NULL, *B=NULL;
334 |     int descU[9], descB[9];
335 | 
336 |     //int MB3 = 3*m;
337 |     //int mlocW3 = numroc_( &MB3, &nb, &myrow, &i0, &nprow );
338 |     if ( Work1 == NULL ) {
339 | 	U  = (double *)malloc(mloc*nloc*sizeof(double));
340 | 	//B  = (double *)malloc(mlocW*nloc*sizeof(double));
341 |     }
342 |     if ( Work2 == NULL ) {
343 | 	//A  = (double *)malloc(mloc*nloc*sizeof(double));
344 | 	B  = (double *)malloc(mlocW*nloc*sizeof(double));
345 |     }
346 |     else {
347 |         U = Work1;
348 |         //B = A + mlocW3*nloc;
349 |         B = Work2;
350 |     }
351 | 
352 |     descinit_( descU, &m, &n, &nb, &nb, &i0, &i0, &ictxt, &mloc, &iinfo );
353 |     descinit_( descB, &MB, &n, &nb, &nb, &i0, &i0, &ictxt, &mlocW, &iinfo ); //B = A + mloc*nloc;
354 | 
355 |     //lWork = 3*m; MB = 2*m;
356 |     //descinit_( descA, &lWork, &n, &nb, &nb, &MB, &i0, &ictxt, &mloc, &iinfo );
357 |     //descinit_( descB, &lWork, &N, &nb, &nb, &i0, &i0, &ictxt, &mlocW, &iinfo ); //B = Work;
358 | 
359 |     double *tau   = (double *)malloc(nloc*sizeof(double)) ;
360 | 
361 |     if ( !optcond ){
362 |         lWi = n; 
363 |         Wi  = (int *)malloc(lWi*sizeof(int)) ;
364 |     }
365 | 
366 |     MPI_Comm_rank(MPI_COMM_WORLD, &myrank_mpi);
367 | 
368 |     if (verbose && (myrank_mpi == 0)) { fprintf(stderr, "Entering QDWH\n");}
369 |     if (verbose && (myrank_mpi == 0)) { fprintf(stderr, "Preparing workspace for QDWH\n");}
370 |     qwtime = 0.0;
371 |     if(prof) {qwtime -= MPI_Wtime();}
372 | 
373 |     if (!init) {
374 | 	eps  = pdlamch_( &ictxt, "E" ); 
375 | 	tol1 = 5. * eps;
376 | 	tol3 = pow(tol1, 1./3.);
377 | 	init = 1;
378 |     }
379 | 
380 | 
381 |     if (verbose && (myrank_mpi == 0)) { fprintf(stderr, "Finish preparing workspace for QDWH\n");}
382 | 
383 |     /*
384 |      * Save copy of A ==> H = U'*A
385 |      */
386 |     pdlacpy_ ( "A", &m, &n, A, &i1, &i1, descA, U, &i1, &i1, descU );
387 | 
388 |     if (verbose && (myrank_mpi == 0)) { fprintf(stderr, "Cond estimate starts\n");}
389 |     /*
390 |      * Calculate Li: reciprocal of condition number estimation
391 |      */
392 | 
393 |     litime = 0.0;
394 |     if(prof) {litime =- MPI_Wtime();}
395 | 
396 |     pdlacpy_ ( "A", &m, &n, A, &i1, &i1, descA, B, &i1, &i1, descB );
397 |     if (verbose && (myrank_mpi == 0)) { fprintf(stderr, "lacpy ends\n");}
398 |     Anorm = pdlange_ ( "1", &m, &n, U, &i1, &i1, descU, H);
399 |     if (verbose && (myrank_mpi == 0)) { fprintf(stderr, "dlange ends\n");}
400 | 
401 |     alpha = 1.0; 
402 |     pdgenm2( U, m, n, descU, B, descB, H, descH, &norm_est, tol);
403 |     pdlascl_( "G", &norm_est, &alpha, &m, &n, A, &i1, &i1, descA, &iinfo);
404 |     //pdlascl_( "G", &alpha, &norm_est, &m, &n, A, &i1, &i1, descA, &iinfo);
405 | 
406 | 
407 |     /* estimate condition number using QR */
408 |     if ( optcond ){
409 |         pdgeqrf_(&m, &n, B, &i1, &i1, descB, tau, H, &lWork1, &iinfo);
410 | 
411 |         sync_time_elapsed =- MPI_Wtime();
412 |         pdtrtri_( "U", "N", &n, B, &i1, &i1, descB, &iinfo );
413 |         sync_time_elapsed += MPI_Wtime();
414 |         MPI_Allreduce( &sync_time_elapsed, &reduced_time_elapsed, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD);
415 | 
416 |         Ainvnorm = pdlange_ ( "1", &m, &n, B, &i1, &i1, descB, H);
417 |         Li = ( 1.0 / Ainvnorm)/Anorm;    
418 |         Li = norm_est/1.1*Li;    
419 |         flops += FLOPS_DGEQRF( m, n )
420 |                + FLOPS_DTRTRI(  n );
421 |     }
422 |     /* estimate condition number using LU */
423 |     else {
424 |         pdgetrf_ ( &m, &n, B, &i1, &i1, descB, Wi, &iinfo );
425 |         if (verbose && (myrank_mpi == 0)) { fprintf(stderr, "LU ends\n");}
426 | 
427 |         int lwork_cn = -1;
428 |         pdgecon_ ("1", &m, B, &i1, &i1, descB, &Anorm, &Li, H, &lwork_cn, Wi, &lWi, &iinfo);
429 |         lwork_cn = H[0];
430 | 
431 |         pdgecon_ ("1", &m, B, &i1, &i1, descB, &Anorm, &Li, H, &lwork_cn, Wi, &lWi, &iinfo);
432 |         Li = norm_est/1.1*Li;    
433 |         //flops += FLOPS_DGETRF(n, n) + 2. * FLOPS_DTRSM( 'L', n, 1 );
434 |         flops += FLOPS_DGETRF(n, n);
435 |     }
436 | 
437 |     if(prof) {litime += MPI_Wtime();}
438 |     if (verbose && (myrank_mpi == 0)) { fprintf(stderr, "Cond estimate ends\n");}
439 | 
440 |     /*
441 |      * Calculate norm_est
442 |      * Scal the matrix by norm_est
443 |      */
444 | 
445 |     if (verbose && (myrank_mpi == 0)) { fprintf(stderr, "Normest starts\n");}
446 |     nrmtime = 0.0;
447 |     if(prof) {nrmtime =- MPI_Wtime();}
448 | 
449 | 
450 |     if(prof) {nrmtime += MPI_Wtime();}
451 |     if (verbose && (myrank_mpi == 0)) { fprintf(stderr, "Normest ends\n");}
452 | 
453 | 
454 |     itconv = 0; Liconv = Li;
455 | 
456 |     int itcqr = 0, itcpo = 0;     
457 |     while(itconv == 0 || fabs(1-Liconv) > tol1 ) {
458 | 	/* To find the minimum number of iterations to converge. 
459 |          * itconv = number of iterations needed until |Li - 1| < tol1 
460 | 	 * This should have converged in less than 50 iterations
461 |          */
462 | 	if (itconv > 100) {
463 | 	    exit(-1);
464 | 	    break;
465 | 	}
466 | 	itconv++;
467 | 
468 | 	L2  = Liconv * Liconv;
469 | 	dd  = cpow( 4. * (1. - L2 ) / (L2 * L2), 1./3. );
470 | 	sqd = sqrt(1. + dd);
471 | 	a1  = sqd + sqrt( 8. - 4. * dd + 8. * (2. - L2) / (L2 * sqd) ) / 2.;
472 | 	a   = creal(a1);
473 | 	b   = (a - 1.) * (a - 1.) / 4.;
474 | 	c   = a + b - 1.;
475 |         if (c > 100) {itcqr += 1; alloc_qr = 1;} else {itcpo += 1;}
476 | 	// Update Liconv
477 | 	Liconv  = Liconv * (a + b * L2) / (1. + c * L2);
478 |     }
479 |     if (verbose && (myrank_mpi == 0)) { fprintf(stderr, "QDWH loop starts\n");}
480 |     if (myrank_mpi == 0) { fprintf(stderr, "\nItConv %d itcqr %d itcpo %d norm_est %2.4e Li %2.4e \n", itconv, itcqr, itcpo, norm_est, Li); fprintf(stderr, "It Facto Conv\n");}
481 |     it = 0;
482 | 
483 |     if ( alloc_qr ){
484 |          lwork_qr = -1;
485 |          pdgeqrf_(&MB, &n, B, &i1, &i1, descB, 
486 |                   tau, W, &lwork_qr, &iinfo);
487 |          lwork_qr  = W[0];
488 |          W  = (double *)malloc((lwork_qr)*sizeof(double)) ;
489 |     }
490 | 
491 | 
492 |     while(conv > tol3 || it < itconv ) {
493 | 	/* This should have converged in less than 50 iterations */
494 | 	if (it > 100) {
495 | 	    exit(-1);
496 | 	    break;
497 | 	}
498 | 	it++;
499 | 
500 | 	/* Copy U into B1 */
501 |         //pdlacpy_( "A", &m, &n, U, &i1, &i1, descU, C, &i1, &i1, descC );
502 | 
503 | 	// Compute parameters L,a,b,c (second, equivalent way).
504 | 	L2  = Li * Li;
505 | 	dd  = cpow( 4. * (1. - L2 ) / (L2 * L2), 1./3. );
506 | 	sqd = sqrt(1. + dd);
507 | 	a1  = sqd + sqrt( 8. - 4. * dd + 8. * (2. - L2) / (L2 * sqd) ) / 2.;
508 | 	a   = creal(a1);
509 | 	b   = (a - 1.) * (a - 1.) / 4.;
510 | 	c   = a + b - 1.;
511 | 	// Update Li
512 | 	Li  = Li * (a + b * L2) / (1. + c * L2);
513 | 
514 | 	if ( c > 100) {
515 |             
516 |             qrtime = 0.0;
517 |             if(prof) {qrtime =- MPI_Wtime();}
518 | 
519 | 	    /* Copy U into C to check the convergence of QDWH */
520 |             if (it >= itconv ){
521 |                 pdlacpy_( "A", &m, &n, A, &i1, &i1, descA, H, &i1, &i1, descH );
522 |             }
523 | 
524 | 	    /**
525 | 	     * Generate the matrix B = [ B1 ] = [ sqrt(c) * U ]
526 | 	     *                         [ B2 ] = [ Id          ]
527 | 	     */
528 |             pdlacpy_( "A", &m, &n, A, &i1, &i1, descA, B, &i1, &i1, descB );
529 |             alpha = 1.0; beta = sqrt(c);
530 |             pdlascl_( "G", &alpha, &beta, &m, &n, B, &i1, &i1, descB, &iinfo);
531 |             alpha = 0.; beta =1.; 
532 |             pdlaset_( "G", &m, &n, &alpha, &beta, B, &iM, &i1, descB);
533 | 
534 | 	    /**
535 | 	     * Factorize B = QR, and generate the associated Q
536 | 	     */
537 |             sync_time_elapsed =- MPI_Wtime();
538 | 
539 |             pdgeqrf_(&MB, &n, B, &i1, &i1, descB, tau, W, &lwork_qr, &iinfo);
540 |             pdorgqr_(&MB, &n, &n, B, &i1, &i1, descB, tau, W, &lwork_qr, &iinfo);
541 | 
542 |             sync_time_elapsed += MPI_Wtime();
543 |             MPI_Allreduce( &sync_time_elapsed, &reduced_time_elapsed, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD);
544 | 
545 | 	    /**
546 | 	     * Gemm to find the conv-norm
547 | 	     *  U = ( (a-b/c)/sqrt(c) ) * Q1 * Q2' + (b/c) * U
548 | 	     */
549 |             alpha = (a-b/c)/sqrt(c); beta = (b/c);
550 |             pdgemm_( "N", "T", &m, &n, &n, &alpha, B, &i1, &i1, descB, B, &iM, &i1, 
551 |                      descB, &beta, A, &i1, &i1, descA);
552 | 
553 |             if(prof) {qrtime += MPI_Wtime();}
554 | 
555 | 	    /* Main flops used in this step */
556 | 	    flops_dgeqrf = FLOPS_DGEQRF( 2*m, n );
557 | 	    flops_dorgqr = FLOPS_DORGQR( 2*m, n, n );
558 | 	    flops_dgemm  = FLOPS_DGEMM( m, n, n );
559 | 	    flops += flops_dgeqrf + flops_dorgqr + flops_dgemm;
560 | 
561 |             itqr += 1;
562 | 	    facto = 0;
563 | 	}
564 | 	else {
565 | 	    /**
566 | 	     * Compute Q1 = c * U * U' + I
567 | 	     */
568 |             alloc_qr = 0;
569 |             potime = 0.0;
570 |             if(prof) {potime =- MPI_Wtime();}
571 | 
572 |             alpha = 0.; beta =1.; 
573 |             pdlaset_( "G", &m, &n, &alpha, &beta, H, &i1, &i1, descH);
574 | 
575 |             sync_time_elapsed =- MPI_Wtime();
576 | 
577 |             pdgemm_( "T", "N", &m, &n, &n, &c, A, &i1, &i1, descA, A, &i1, &i1, 
578 |                      descA, &beta, H, &i1, &i1, descH);
579 | 
580 |             sync_time_elapsed += MPI_Wtime();
581 |             MPI_Allreduce( &sync_time_elapsed, &reduced_time_elapsed, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD);
582 | 
583 | 	    /**
584 | 	     * Solve Q1 x = Q2, with Q2 = U
585 | 	     */
586 |             alpha = 1.0; beta = 0.0;
587 |             pdgeadd_( "T", &m, &n, &alpha, A, &i1, &i1, descA, &beta, B, &i1, &i1, descB);
588 | 
589 |             sync_time_elapsed =- MPI_Wtime();
590 | 
591 |             pdposv_( "U", &m, &n, H, &i1, &i1, descH, B, &i1, &i1, descB, &iinfo);
592 | 
593 |             sync_time_elapsed += MPI_Wtime();
594 |             MPI_Allreduce( &sync_time_elapsed, &reduced_time_elapsed, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD);
595 | 
596 | 	    /* Copy U into H to check the convergence of QDWH */
597 |             if (it >= itconv ){
598 |                 pdlacpy_( "A", &m, &n, A, &i1, &i1, descA, H, &i1, &i1, descH );
599 |             }
600 | 
601 | 	    /**
602 | 	     * Compute U =  (a-b/c) * Q2' + (b/c) * U
603 | 	     */
604 |             alpha = (a-b/c); beta = (b/c);
605 |             pdgeadd_ ( "T", &m, &n, &alpha, B, &i1, &i1, descB, &beta, A, &i1, &i1, descA);
606 | 
607 |             if(prof) {potime += MPI_Wtime();}
608 | 
609 | 	    /* Main flops used in this step */
610 | 	    flops_dgemm  = FLOPS_DGEMM( m, n, n );
611 | 	    flops_dpotrf = FLOPS_DPOTRF( m );
612 | 	    flops_dtrsm  = FLOPS_DTRSM( 'L', m, n );
613 | 	    flops += flops_dgemm + flops_dpotrf + 2. * flops_dtrsm;
614 | 
615 |             itpo += 1;
616 | 	    facto = 1;
617 |         }
618 | 
619 | 	/**
620 | 	 * Compute the norm of the symmetric matrix U - B1
621 | 	 */
622 |         conv = 10.;
623 |         if(it >= itconv ){
624 |             alpha = 1.0; beta = -1.0;
625 |             pdgeadd_ ( "N", &m, &n, &alpha, A, &i1, &i1, descA, &beta, H, &i1, &i1, descH);
626 | 
627 |             sync_time_elapsed =- MPI_Wtime();
628 | 
629 |             conv = pdlange_( "F", &m, &n, H, &i1, &i1, descH, W);
630 | 
631 |             sync_time_elapsed += MPI_Wtime();
632 |             MPI_Allreduce( &sync_time_elapsed, &reduced_time_elapsed, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD);
633 |         }
634 |         if (verbose && (myrank_mpi == 0)) fprintf(stderr, "%02d %-5s %e\n", it,
635 | 	       facto == 0 ? "QR" : "PO", conv );
636 |     }
637 |     if (verbose && (myrank_mpi == 0)) { fprintf(stderr, "QDWH loop ends\n");}
638 | 
639 |     /*
640 |      * A = U*H ==> H = U'*A ==> H = 0.5*(H'+H)
641 |      */
642 | 
643 |      Htime = 0.0;
644 |      if(prof) {Htime =- MPI_Wtime();}
645 | 
646 |      alpha = 1.0; beta = 0.0;
647 |      pdgemm_( "T", "N", &m, &n, &n, &alpha, A, &i1, &i1, descA, U, &i1, &i1, 
648 |               descU, &beta, H, &i1, &i1, descH);
649 |      pdlacpy_( "A", &m, &n, H, &i1, &i1, descH, B, &i1, &i1, descB );
650 |      alpha = 0.5; 
651 |      pdgeadd_ ( "T", &m, &n, &alpha, B, &i1, &i1, descB, &alpha, H, &i1, &i1, descH);
652 | 
653 |      if(prof) {Htime += MPI_Wtime();}
654 | 
655 |      flops_dgemm  = FLOPS_DGEMM( m, n, n );
656 |      flops += flops_dgemm;
657 | 
658 | 
659 |     if(prof) {qwtime += MPI_Wtime();}
660 | 
661 |     if (prof && (myrank_mpi == 0)) {
662 |         fprintf(stderr, "# QDWH Profiling \n"); 
663 |         fprintf(stderr, "#\n");
664 |         fprintf(stderr, "# \tn    \ttimeQDWH     \ttimeLi     \ttimeNrm    \ttime1itQR   \t#QR    \ttime1itPO   \t#PO    \ttimeFormH \n");
665 | 	fprintf(stderr, "  \t%d \t%2.4e \t%2.4e \t%2.4e \t%2.4e \t%d \t%2.4e \t%d \t%2.4e \n", m, qwtime, litime, nrmtime, qrtime, itqr, potime, itpo, Htime);
666 |     }
667 |     if (myrank_mpi == 0) {
668 |         fprintf(stderr, "#\n");
669 |         fprintf(stderr, "# \t#QR  \t#PO  \n");
670 | 	fprintf(stderr, "  \t%d  \t%d \n", itqr, itpo);
671 |     }
672 | 
673 |     free( tau );
674 |     if ( !optcond ){
675 |         free( Wi );
676 |     }
677 |     if ( Work1 == NULL ) {
678 |         free( U );
679 |         //free( B );
680 |     }
681 |     if ( Work2 == NULL ) {
682 |         //free( A );
683 |         free( B );
684 |     }
685 | 
686 |     if (verbose && (myrank_mpi == 0)) { fprintf(stderr, "Exiting QDWH\n");}
687 |     return 0;
688 | 
689 | }
690 | 


--------------------------------------------------------------------------------