├── src ├── Makefile ├── CMakeLists.txt ├── mellipke.c ├── pdgenm2.c ├── mellipj.c └── pdgeqdwh.c ├── .gitignore ├── .gitmodules ├── polar.pc.in ├── LICENSE ├── include ├── polar.h ├── myscalapack.h └── flops.h ├── scalapack-cmake-libsltmg.patch ├── timing ├── CMakeLists.txt ├── timing.c ├── timing_pdgeqdwh.c └── timing_pdgezolopd.c ├── testing ├── CMakeLists.txt ├── testing.c ├── testing_pdgeqdwh.c └── testing_pdgezolopd.c ├── Jenkinsfile ├── README.md ├── CMakeLists.txt └── scalapack-cmake-libsltmg-only.patch /src/Makefile: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.o 2 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "cmake_modules/ecrc"] 2 | path = cmake_modules/ecrc 3 | url = https://github.com/ecrc/ecrc_cmake.git 4 | -------------------------------------------------------------------------------- /polar.pc.in: -------------------------------------------------------------------------------- 1 | prefix=@prefix@ 2 | libdir=@libdir@ 3 | 4 | Name: polar 5 | Description: QR-based Dynamically Weighted Halley (POLAR) 6 | Version: @POLAR_VERSION@ 7 | URL: http://github.com/ecrc/polar 8 | Libs: -L${libdir} -lpolar 9 | Requires: mpi sclapack 10 | -------------------------------------------------------------------------------- /src/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Define source files 2 | set( POLAR_SRC pdgenm2.c 3 | mellipj.c 4 | mellipke.c 5 | pdgezolopd.c 6 | pdgeqdwh.c 7 | ) 8 | 9 | 10 | list( APPEND CMAKE_SHARED_LINKER_FLAGS "-Wl,--enable-new-dtags") 11 | # Transform the list into a space separate string 12 | string(REPLACE ";" " " CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS}") 13 | 14 | # Build POLAR 15 | add_library( ${POLAR_LIBNAME} ${POLAR_SRC} ) 16 | # Libraries to link 17 | target_link_libraries( ${POLAR_LIBNAME} ${MPI_C_LIBRARIES} ${SCALAPACK_LIBRARIES} ${EXTRA_LIBS} ) 18 | 19 | # Installation 20 | install( TARGETS ${POLAR_LIBNAME} DESTINATION lib ) 21 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | BSD 3-Clause License 2 | 3 | Copyright (c) 2017, King Abdullah University of Science and Technology 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | 9 | * Redistributions of source code must retain the above copyright notice, this 10 | list of conditions and the following disclaimer. 11 | 12 | * Redistributions in binary form must reproduce the above copyright notice, 13 | this list of conditions and the following disclaimer in the documentation 14 | and/or other materials provided with the distribution. 15 | 16 | * Neither the name of the copyright holder nor the names of its 17 | contributors may be used to endorse or promote products derived from 18 | this software without specific prior written permission. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 | -------------------------------------------------------------------------------- /include/polar.h: -------------------------------------------------------------------------------- 1 | /** 2 | * 3 | * Copyright (c) 2017, King Abdullah University of Science and Technology 4 | * All rights reserved. 5 | * 6 | **/ 7 | 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include "myscalapack.h" 22 | #include "flops.h" 23 | 24 | #ifndef max 25 | #define max(a, b) ((a) > (b) ? (a) : (b)) 26 | #endif 27 | #ifndef min 28 | #define min(a, b) ((a) < (b) ? (a) : (b)) 29 | #endif 30 | 31 | 32 | int pdgeqdwh( char *jobh, int m, int n, 33 | double *A, int iA, int jA, int *descA, 34 | double *H, int iH, int jH, int *descH, 35 | double *Work1, int lWork1, 36 | double *Work2, int lWork2, 37 | int *info); 38 | 39 | int pdgezolopd( char *jobh, int m, int n, 40 | double *A, int iA, int jA, int *descA, 41 | double *H, int iH, int jH, int *descH, 42 | double *Work1, int lWork1, 43 | double *Work2, int lWork2, 44 | int *info); 45 | 46 | void pdgenm2( double *A, int M, int N, int descA[9], 47 | double *W, int descW[9], double *Sx, int descSx[9], 48 | double *e, double tol); 49 | 50 | int mellipj( double u, double alpha, 51 | double *sn, double *cn, double *dn, 52 | double *work); 53 | 54 | int mellipke( double alpha, 55 | double *k, double *e); 56 | 57 | -------------------------------------------------------------------------------- /scalapack-cmake-libsltmg.patch: -------------------------------------------------------------------------------- 1 | diff -Naur -x BLACS -x PBLAS a/CMakeLists.txt b/CMakeLists.txt 2 | --- a/CMakeLists.txt 2012-05-02 06:58:24.000000000 +0300 3 | +++ b/CMakeLists.txt 2017-02-15 11:26:02.000000000 +0300 4 | @@ -140,6 +140,11 @@ 5 | IF(LAPACK_LIBRARIES) 6 | include(CheckFortranFunctionExists) 7 | message(STATUS "--> LAPACK supplied by user is ${LAPACK_LIBRARIES}.") 8 | +# receive in white space separated list, and transform into cmake list (;) 9 | +SEPARATE_ARGUMENTS(LAPACK_LIBRARIES) 10 | +set( AUXSTR ${LAPACK_LIBRARIES} ) 11 | +set(LAPACK_LIBRARIES "${AUXSTR}" CACHE STRING "User provided list of lapack libraries." FORCE) 12 | + message(STATUS "----> LAPACK supplied by user is ${LAPACK_LIBRARIES}.") 13 | set(CMAKE_REQUIRED_LIBRARIES ${LAPACK_LIBRARIES}) 14 | CHECK_FORTRAN_FUNCTION_EXISTS("dgesv" LAPACK_FOUND) 15 | unset( CMAKE_REQUIRED_LIBRARIES ) 16 | diff -Naur -x BLACS -x PBLAS a/TESTING/EIG/CMakeLists.txt b/TESTING/EIG/CMakeLists.txt 17 | --- a/TESTING/EIG/CMakeLists.txt 2011-11-08 23:48:31.000000000 +0300 18 | +++ b/TESTING/EIG/CMakeLists.txt 2017-02-20 15:15:08.000000000 +0300 19 | @@ -97,3 +97,12 @@ 20 | target_link_libraries(xshseqr scalapack ${LAPACK_LIBRARIES} ${BLAS_LIBRARIES}) 21 | target_link_libraries(xdhseqr scalapack ${LAPACK_LIBRARIES} ${BLAS_LIBRARIES}) 22 | 23 | +file(GLOB SLTMG_SRC "*.f") 24 | +add_library( sltmg STATIC ${SLTMG_SRC}) 25 | +scalapack_install_library( sltmg ) 26 | +#install( TARGETS sltmg DESTINATION lib ) 27 | +ADD_CUSTOM_TARGET(install.sltmg 28 | + ${CMAKE_COMMAND} 29 | + -DBUILD_TYPE=${CMAKE_BUILD_TYPE} 30 | + -P ${CMAKE_CURRENT_BINARY_DIR}/cmake_install.cmake) 31 | +ADD_DEPENDENCIES(install.sltmg sltmg) 32 | -------------------------------------------------------------------------------- /timing/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Defines the source files of the tests. Each file generates 1 test 2 | file(GLOB tests_files "*.c") 3 | 4 | include_directories( ${CMAKE_SOURCE_DIR}/include ) 5 | link_directories( ${CMAKE_BINARY_DIR}/lib ) 6 | 7 | # Uses RUNPATH instead of RPATH 8 | #set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -Wl,--enable-new-dtags") 9 | list( APPEND CMAKE_EXE_LINKER_FLAGS "-Wl,--enable-new-dtags") 10 | # Transform the list into a space separate string 11 | string(REPLACE ";" " " CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS}") 12 | 13 | foreach( test_src ${tests_files} ) 14 | get_filename_component( test_exe ${test_src} NAME_WE ) 15 | if( "${test_exe}" STREQUAL "timing" ) 16 | # skip timing.c 17 | continue() 18 | endif() 19 | 20 | add_executable( ${test_exe} timing.c ${test_src} ) 21 | add_dependencies( ${test_exe} ${POLAR_LIBNAME} ) 22 | if( _build_netlib-scalapack ) 23 | add_dependencies( ${test_exe} NETLIB-SCALAPACK ) 24 | endif() 25 | 26 | target_link_libraries( ${test_exe} ${POLAR_LIBNAME} 27 | ${MPI_C_LIBRARIES} 28 | ${SLTMG_LIBRARIES} 29 | ${SCALAPACK_LIBRARIES} 30 | ${LAPACKE_LIBRARIES} 31 | ${LAPACK_LIBRARIES} 32 | ${BLAS_LIBRARIES} 33 | ${EXTRA_LIBS} ) 34 | install( TARGETS ${test_exe} DESTINATION lib/timing ) 35 | endforeach() 36 | #set( _p 4 ) 37 | #set( _q 4 ) 38 | #math( EXPR _pq "${_p} * ${_q}" ) 39 | 40 | set( _p ${P} ) 41 | set( _q ${Q} ) 42 | set( _pq ${ncores} ) 43 | 44 | # Add basic tests. To run them type: 'make test' in build directory 45 | # add_test( NAME "timing_pdgesvd:A" COMMAND mpirun -np ${_pq} timing_pdgesvd --nprow ${_p} --npcol ${_q} --nb 64 --n_range 512:512:512 --niter 3 --check --qwmr ) 46 | # add_test( NAME "timing_pdgesvd:B" COMMAND mpirun -np ${_pq} timing_pdgesvd --nprow ${_p} --npcol ${_q} --nb 64 --n_range 512:512:512 --niter 3 --check --qwdc ) 47 | # add_test( NAME "timing_pdgesvd:C" COMMAND mpirun -np ${_pq} timing_pdgesvd --nprow ${_p} --npcol ${_q} --nb 64 --n_range 512:512:512 --niter 3 --check --qwel ) 48 | add_test( NAME "timing_pdgeqdwh:A" COMMAND mpirun -np ${_pq} ./timing_pdgeqdwh --nprow ${_p} --npcol ${_q} --nb 64 --n_range 512:512:512 --niter 3 --check ) 49 | add_test( NAME "timing_pdgezolopd:B" COMMAND mpirun -np ${_pq} ./timing_pdgezolopd --nprow ${_p} --npcol ${_q} --nb 64 --n_range 512:512:512 --niter 3 --check --cond 1 ) 50 | -------------------------------------------------------------------------------- /testing/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Defines the source files of the tests. Each file generates 1 test 2 | file(GLOB tests_files "*.c") 3 | 4 | include_directories( ${CMAKE_SOURCE_DIR}/include ) 5 | link_directories( ${CMAKE_BINARY_DIR}/lib ) 6 | 7 | # Uses RUNPATH instead of RPATH 8 | #set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -Wl,--enable-new-dtags") 9 | list( APPEND CMAKE_EXE_LINKER_FLAGS "-Wl,--enable-new-dtags") 10 | # Transform the list into a space separate string 11 | string(REPLACE ";" " " CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS}") 12 | 13 | foreach( test_src ${tests_files} ) 14 | get_filename_component( test_exe ${test_src} NAME_WE ) 15 | if( "${test_exe}" STREQUAL "testing" ) 16 | # skip testing.c 17 | continue() 18 | endif() 19 | 20 | add_executable( ${test_exe} testing.c ${test_src} ) 21 | add_dependencies( ${test_exe} ${POLAR_LIBNAME} ) 22 | if( _build_netlib-scalapack ) 23 | add_dependencies( ${test_exe} NETLIB-SCALAPACK ) 24 | endif() 25 | 26 | target_link_libraries( ${test_exe} ${POLAR_LIBNAME} 27 | ${MPI_C_LIBRARIES} 28 | ${SLTMG_LIBRARIES} 29 | ${SCALAPACK_LIBRARIES} 30 | ${LAPACKE_LIBRARIES} 31 | ${LAPACK_LIBRARIES} 32 | ${BLAS_LIBRARIES} 33 | ${EXTRA_LIBS} ) 34 | install( TARGETS ${test_exe} DESTINATION lib/testing ) 35 | endforeach() 36 | #set( _p 4 ) 37 | #set( _q 4 ) 38 | #math( EXPR _pq "${_p} * ${_q}" ) 39 | 40 | set( _p ${P} ) 41 | set( _q ${Q} ) 42 | set( _pq ${ncores} ) 43 | 44 | # Add basic tests. To run them type: 'make test' in build directory 45 | # add_test( NAME "testing_pdgesvd:A" COMMAND mpirun -np ${_pq} testing_pdgesvd --nprow ${_p} --npcol ${_q} --nb 64 --n_range 512:512:512 --niter 3 --check --qwmr ) 46 | # add_test( NAME "testing_pdgesvd:B" COMMAND mpirun -np ${_pq} testing_pdgesvd --nprow ${_p} --npcol ${_q} --nb 64 --n_range 512:512:512 --niter 3 --check --qwdc ) 47 | # add_test( NAME "testing_pdgesvd:C" COMMAND mpirun -np ${_pq} testing_pdgesvd --nprow ${_p} --npcol ${_q} --nb 64 --n_range 512:512:512 --niter 3 --check --qwel ) 48 | add_test( NAME "testing_pdgeqdwh:A" COMMAND mpirun -np ${_pq} ./testing_pdgeqdwh --nprow ${_p} --npcol ${_q} --nb 64 --n_range 512:512:512 --niter 3 --check ) 49 | add_test( NAME "testing_pdgezolopd:B" COMMAND mpirun -np ${_pq} ./testing_pdgezolopd --nprow ${_p} --npcol ${_q} --nb 64 --n_range 512:512:512 --niter 3 --check --cond 1 ) 50 | 51 | -------------------------------------------------------------------------------- /timing/timing.c: -------------------------------------------------------------------------------- 1 | /** 2 | * 3 | * Copyright (c) 2017, King Abdullah University of Science and Technology 4 | * All rights reserved. 5 | * 6 | **/ 7 | 8 | /** 9 | * 10 | * @file timing.c 11 | * 12 | * POLAR is a high performance software framework for computing 13 | * the polar decomposition on distributed-memory manycore systems provided by KAUST 14 | * 15 | * @version 3.0.0 16 | * @author Dalal Sukkari 17 | * @author Hatem Ltaief 18 | * @date 2018-11-08 19 | * 20 | **/ 21 | 22 | #include 23 | #include 24 | #include 25 | int 26 | get_range(char *range, int *start_p, int *stop_p, int *step_p) { 27 | char *s, *s1, buf[21]; 28 | int colon_count, copy_len, nbuf=20, n; 29 | int start=1000, stop=10000, step=1000; 30 | 31 | colon_count = 0; 32 | for (s = strchr( range, ':'); s; s = strchr( s+1, ':')) 33 | colon_count++; 34 | 35 | if (colon_count == 0) { /* No colon in range. */ 36 | if (sscanf( range, "%d", &start ) < 1 || start < 1) 37 | return -1; 38 | step = start / 10; 39 | if (step < 1) step = 1; 40 | stop = start + 10 * step; 41 | 42 | } else if (colon_count == 1) { /* One colon in range.*/ 43 | /* First, get the second number (after colon): the stop value. */ 44 | s = strchr( range, ':' ); 45 | if (sscanf( s+1, "%d", &stop ) < 1 || stop < 1) 46 | return -1; 47 | 48 | /* Next, get the first number (before colon): the start value. */ 49 | n = s - range; 50 | copy_len = n > nbuf ? nbuf : n; 51 | strncpy( buf, range, copy_len ); 52 | buf[copy_len] = 0; 53 | if (sscanf( buf, "%d", &start ) < 1 || start > stop || start < 1) 54 | return -1; 55 | 56 | /* Let's have 10 steps or less. */ 57 | step = (stop - start) / 10; 58 | if (step < 1) 59 | step = 1; 60 | } else if (colon_count == 2) { /* Two colons in range. */ 61 | /* First, get the first number (before the first colon): the start value. */ 62 | s = strchr( range, ':' ); 63 | n = s - range; 64 | copy_len = n > nbuf ? nbuf : n; 65 | strncpy( buf, range, copy_len ); 66 | buf[copy_len] = 0; 67 | if (sscanf( buf, "%d", &start ) < 1 || start < 1) 68 | return -1; 69 | 70 | /* Next, get the second number (after the first colon): the stop value. */ 71 | s1 = strchr( s+1, ':' ); 72 | n = s1 - (s + 1); 73 | copy_len = n > nbuf ? nbuf : n; 74 | strncpy( buf, s+1, copy_len ); 75 | buf[copy_len] = 0; 76 | if (sscanf( buf, "%d", &stop ) < 1 || stop < start) 77 | return -1; 78 | 79 | /* Finally, get the third number (after the second colon): the step value. */ 80 | if (sscanf( s1+1, "%d", &step ) < 1 || step < 1) 81 | return -1; 82 | } else 83 | 84 | return -1; 85 | 86 | *start_p = start; 87 | *stop_p = stop; 88 | *step_p = step; 89 | 90 | return 0; 91 | } 92 | -------------------------------------------------------------------------------- /testing/testing.c: -------------------------------------------------------------------------------- 1 | /** 2 | * 3 | * Copyright (c) 2017, King Abdullah University of Science and Technology 4 | * All rights reserved. 5 | * 6 | **/ 7 | 8 | /** 9 | * 10 | * @file testing.c 11 | * 12 | * POLAR is a high performance software framework for computing 13 | * the polar decomposition on distributed-memory manycore systems provided by KAUST 14 | * 15 | * @version 3.0.0 16 | * @author Dalal Sukkari 17 | * @author Hatem Ltaief 18 | * @date 2018-11-08 19 | * 20 | **/ 21 | 22 | #include 23 | #include 24 | #include 25 | int 26 | get_range(char *range, int *start_p, int *stop_p, int *step_p) { 27 | char *s, *s1, buf[21]; 28 | int colon_count, copy_len, nbuf=20, n; 29 | int start=1000, stop=10000, step=1000; 30 | 31 | colon_count = 0; 32 | for (s = strchr( range, ':'); s; s = strchr( s+1, ':')) 33 | colon_count++; 34 | 35 | if (colon_count == 0) { /* No colon in range. */ 36 | if (sscanf( range, "%d", &start ) < 1 || start < 1) 37 | return -1; 38 | step = start / 10; 39 | if (step < 1) step = 1; 40 | stop = start + 10 * step; 41 | 42 | } else if (colon_count == 1) { /* One colon in range.*/ 43 | /* First, get the second number (after colon): the stop value. */ 44 | s = strchr( range, ':' ); 45 | if (sscanf( s+1, "%d", &stop ) < 1 || stop < 1) 46 | return -1; 47 | 48 | /* Next, get the first number (before colon): the start value. */ 49 | n = s - range; 50 | copy_len = n > nbuf ? nbuf : n; 51 | strncpy( buf, range, copy_len ); 52 | buf[copy_len] = 0; 53 | if (sscanf( buf, "%d", &start ) < 1 || start > stop || start < 1) 54 | return -1; 55 | 56 | /* Let's have 10 steps or less. */ 57 | step = (stop - start) / 10; 58 | if (step < 1) 59 | step = 1; 60 | } else if (colon_count == 2) { /* Two colons in range. */ 61 | /* First, get the first number (before the first colon): the start value. */ 62 | s = strchr( range, ':' ); 63 | n = s - range; 64 | copy_len = n > nbuf ? nbuf : n; 65 | strncpy( buf, range, copy_len ); 66 | buf[copy_len] = 0; 67 | if (sscanf( buf, "%d", &start ) < 1 || start < 1) 68 | return -1; 69 | 70 | /* Next, get the second number (after the first colon): the stop value. */ 71 | s1 = strchr( s+1, ':' ); 72 | n = s1 - (s + 1); 73 | copy_len = n > nbuf ? nbuf : n; 74 | strncpy( buf, s+1, copy_len ); 75 | buf[copy_len] = 0; 76 | if (sscanf( buf, "%d", &stop ) < 1 || stop < start) 77 | return -1; 78 | 79 | /* Finally, get the third number (after the second colon): the step value. */ 80 | if (sscanf( s1+1, "%d", &step ) < 1 || step < 1) 81 | return -1; 82 | } else 83 | 84 | return -1; 85 | 86 | *start_p = start; 87 | *stop_p = stop; 88 | *step_p = step; 89 | 90 | return 0; 91 | } 92 | -------------------------------------------------------------------------------- /src/mellipke.c: -------------------------------------------------------------------------------- 1 | /** 2 | ZOLOPD-SVD 3 | * 4 | * (C) Copyright 2016 King Abdullah University of Science and Technology 5 | * Authors: 6 | * Dalal Sukkari (dalal.sukkari@kaust.edu.sa) 7 | * David Keyes (david.keyes@kaust.edu.sa) 8 | * Hatem Ltaief (hatem.ltaief@kaust.edu.sa) 9 | * 10 | * Redistribution and use in source and binary forms, with or without 11 | * modification, are permitted provided that the following conditions 12 | * are met: 13 | * 14 | * Redistributions of source code must retain the above copyright 15 | * notice, this list of conditions and the following disclaimer. 16 | * Redistributions in binary form must reproduce the above copyright 17 | * notice, this list of conditions and the following disclaimer in the 18 | * documentation and/or other materials provided with the distribution. 19 | * Neither the name of the King Abdullah University of Science and 20 | * Technology nor the names of its contributors may be used to endorse 21 | * or promote products derived from this software without specific prior 22 | * written permission. 23 | * 24 | * 25 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 26 | ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 27 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 28 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 29 | HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 30 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 31 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 32 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 33 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 34 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 35 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 36 | **/ 37 | #include "polar.h" 38 | 39 | int mellipke( double alpha, double *k, double *e) 40 | { 41 | /* ELLIPKE Complete elliptic integral. Modified from Matlab's built-in code 42 | * for improved accuracy 43 | */ 44 | double m, m1, mm, a0, b0, s0; 45 | double a1, b1, c1, w1; 46 | double eps, tol; 47 | double mmold; 48 | int i1; 49 | 50 | m = sin(alpha) * sin(alpha); 51 | m1 = cos(alpha) * cos(alpha); 52 | 53 | eps = LAPACKE_dlamch_work('e'); 54 | tol = eps; 55 | 56 | 57 | if (m == 0.0) {k[0] = 0.0; *e = k[0]; return 0;} 58 | if (m < 0 || m > 1){ 59 | fprintf(stderr, "error(ellipke:MOutOfRange)") ; 60 | return -1; 61 | } 62 | 63 | a0 = 1.; 64 | b0 = cos(alpha); 65 | s0 = m; 66 | i1 = 0.; mm = 1.; 67 | 68 | while ( mm > tol){ 69 | a1 = (a0+b0)/2; 70 | b1 = sqrt(a0*b0); 71 | c1 = (a0-b0)/2; 72 | i1 = i1 + 1; 73 | w1 = cpow(2,i1)*c1*c1; 74 | mm = w1; 75 | s0 = s0 + w1; 76 | a0 = a1; 77 | b0 = b1; 78 | } 79 | 80 | *k = M_PI/(2*a1); 81 | *e = k[0]*(1-s0/2); 82 | return 0; 83 | } 84 | -------------------------------------------------------------------------------- /src/pdgenm2.c: -------------------------------------------------------------------------------- 1 | /** 2 | * 3 | * Copyright (c) 2017, King Abdullah University of Science and Technology 4 | * All rights reserved. 5 | * 6 | **/ 7 | 8 | /** 9 | * 10 | * @file pdgenm2.c 11 | * 12 | * ZOLOPD is a high performance software framework for computing 13 | * the polar decomposition on distributed-memory manycore systems provided by KAUST 14 | * 15 | * @version 3.0.0 16 | * @author Dalal Sukkari 17 | * @author Hatem Ltaief 18 | * @date 2018-11-08 19 | * 20 | **/ 21 | 22 | #include "polar.h" 23 | 24 | void pdgenm2( double *A, int M, int N, int descA[9], 25 | double *W, int descW[9], 26 | double *Sx, int descSx[9], 27 | double *e, double tol) 28 | { 29 | /* 30 | * 31 | * NORMEST Estimate the matrix 2-norm. 32 | * NORMEST(S) is an estimate of the 2-norm of the matrix S. 33 | * NORMEST(S,tol) uses relative error tol instead of 1.e-6. 34 | * [nrm,cnt] = NORMEST(..) also gives the number of iterations used. 35 | * 36 | * This function is intended primarily for sparse matrices, 37 | * although it works correctly and may be useful for large, full 38 | * matrices as well. Use NORMEST when your problem is large 39 | * enough that NORM takes too long to compute and an approximate 40 | * norm is acceptable. 41 | * 42 | */ 43 | 44 | int maxiter = 100; /* should never take this many iterations. */ 45 | int i1 = 1; 46 | int cnt = 0; 47 | int info; 48 | double e0, alpha, beta, normx, normSx; 49 | //x = sum(abs(S),1)'; 50 | /* Since in qdwh.c we are finding norm1 of the matrix A, 51 | * then we already have the sum of the columns saved in W 52 | * otherwise we should call the following pdlange 53 | */ 54 | //e0 = pdlange_ ( "1", &M, &N, A, &i1, &i1, descA, W); 55 | double *w = (double *)malloc(1*sizeof(double)) ; 56 | 57 | *e = pdlange_ ( "f", &N, &i1, W, &i1, &i1, descW, w); 58 | //pdnrm2_( &N, e, W, &i1 , &i1, descW , &i1); 59 | 60 | if (*e == 0){ return;} 61 | //x = x/e; 62 | alpha = 1.0; 63 | pdlascl_( "G", e, &alpha, &N, &i1, W, &i1, &i1, descW, &info); 64 | 65 | e0 = 0; 66 | while ( (cnt < maxiter) && 67 | (fabs((*e) - e0) > (tol * (*e))) ) 68 | { 69 | e0 = *e; alpha = 1.0; beta = 0.0; 70 | pdgemv_ ("N", &M, &N, &alpha, A, &i1, &i1, descA, W, &i1, &i1, descW, &i1, &beta, Sx, &i1, &i1, descSx, &i1); 71 | normSx = pdlange_ ( "f", &N, &i1, Sx, &i1, &i1, descSx, w); 72 | 73 | //if nnz(Sx) == 0 74 | // Sx = rand(size(Sx),class(Sx)); 75 | //end 76 | 77 | pdgemv_ ("T", &M, &N, &alpha, A, &i1, &i1, descA, Sx, &i1, &i1, descSx, &i1, &beta, W, &i1, &i1, descW, &i1); 78 | normx = pdlange_ ( "f", &N, &i1, W, &i1, &i1, descW, w); 79 | 80 | *e = normx/normSx; 81 | pdlascl_( "G", &normx, &alpha, &N, &i1, W, &i1, &i1, descW, &info); 82 | cnt = cnt+1; 83 | if ( (cnt >= maxiter) && 84 | (fabs((*e) - e0) > (tol * (*e))) ) { 85 | fprintf(stderr, "normest: didn't converge\n"); 86 | } 87 | } 88 | return; 89 | } 90 | -------------------------------------------------------------------------------- /Jenkinsfile: -------------------------------------------------------------------------------- 1 | pipeline { 2 | /* 3 | * Defining where to run 4 | */ 5 | //// Any: 6 | // agent any 7 | //// By agent label: 8 | // agent { label 'sandybridge' } 9 | 10 | agent { label 'jenkinsfile' } 11 | triggers { 12 | pollSCM('H/10 * * * *') 13 | } 14 | 15 | options { 16 | disableConcurrentBuilds() 17 | buildDiscarder(logRotator(numToKeepStr: '50')) 18 | timestamps() 19 | } 20 | 21 | stages { 22 | stage ('build') { 23 | steps { 24 | sh '''#!/bin/bash -le 25 | # loads modules 26 | module purge 27 | module load cmake/3.9.6 28 | module load intel/2017 29 | module load intelmpi/2017/intel-2017 30 | 31 | set -x 32 | module list 33 | 34 | export CC=icc # just in case 35 | export FC=ifort # just in case 36 | export F90=ifort # just in case 37 | export I_MPI_CC="$CC" 38 | export I_MPI_FC="$FC" 39 | export I_MPI_F90="$F90" 40 | 41 | mkdir -p build 42 | cd build && rm -rf ./* 43 | cmake .. -DCMAKE_INSTALL_PREFIX=$PWD/installdir -DPOLAR_TESTING:BOOL=ON -DEXTRA_LIBS="ifcore" 44 | 45 | # build 46 | make 47 | 48 | # install 49 | make install 50 | 51 | ''' 52 | } 53 | } 54 | stage ('test') { 55 | steps { 56 | sh '''#!/bin/bash -le 57 | # loads modules 58 | module purge 59 | module load cmake/3.9.6 60 | module load intel/2017 61 | module load intelmpi/2017/intel-2017 62 | 63 | set -x 64 | 65 | module list 66 | 67 | # Delete previous CTest results and run tests 68 | rm -rf $WORKSPACE/build/Testing 69 | cd $WORKSPACE/build 70 | export PATH=$PATH:. 71 | ctest --no-compress-output -T Test 72 | ''' 73 | } 74 | } 75 | stage ('package') { 76 | steps { 77 | sh 'cd build && make package' 78 | archiveArtifacts allowEmptyArchive: true, artifacts: 'build/POLAR-3.0.0-Linux.tar.gz' 79 | } 80 | } 81 | } 82 | // Post build actions 83 | post { 84 | //always { 85 | //} 86 | //success { 87 | //} 88 | //unstable { 89 | //} 90 | //failure { 91 | //} 92 | unstable { 93 | emailext body: "${env.JOB_NAME} - Please go to ${env.BUILD_URL}", subject: "Jenkins Pipeline build is UNSTABLE", recipientProviders: [[$class: 'CulpritsRecipientProvider'], [$class: 'RequesterRecipientProvider']] 94 | } 95 | failure { 96 | emailext body: "${env.JOB_NAME} - Please go to ${env.BUILD_URL}", subject: "Jenkins Pipeline build FAILED", recipientProviders: [[$class: 'CulpritsRecipientProvider'], [$class: 'RequesterRecipientProvider']] 97 | } 98 | } 99 | } 100 | -------------------------------------------------------------------------------- /src/mellipj.c: -------------------------------------------------------------------------------- 1 | 2 | /** 3 | ZOLOPD-SVD 4 | * 5 | * (C) Copyright 2016 King Abdullah University of Science and Technology 6 | * Authors: 7 | * Dalal Sukkari (dalal.sukkari@kaust.edu.sa) 8 | * David Keyes (david.keyes@kaust.edu.sa) 9 | * Hatem Ltaief (hatem.ltaief@kaust.edu.sa) 10 | * 11 | * Redistribution and use in source and binary forms, with or without 12 | * modification, are permitted provided that the following conditions 13 | * are met: 14 | * 15 | * Redistributions of source code must retain the above copyright 16 | * notice, this list of conditions and the following disclaimer. 17 | * Redistributions in binary form must reproduce the above copyright 18 | * notice, this list of conditions and the following disclaimer in the 19 | * documentation and/or other materials provided with the distribution. 20 | * Neither the name of the King Abdullah University of Science and 21 | * Technology nor the names of its contributors may be used to endorse 22 | * or promote products derived from this software without specific prior 23 | * written permission. 24 | * 25 | * 26 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 27 | ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 28 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 29 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 30 | HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 31 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 32 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 33 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 34 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 35 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 36 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 37 | function [sn,cn,dn] = mellipj(u,alpha,tol) 38 | mELLIPJ Jacobi elliptic functions. MATLAB's built-in code, modified for improved accuracy. 39 | [SN,CN,DN] = ELLIPJ(U,M) returns the values of the Jacobi elliptic 40 | functions Sn, Cn and Dn, evaluated for corresponding elements of 41 | argument U and parameter M. U and M must be arrays of the same 42 | size or either can be scalar. As currently implemented, M is 43 | limited to 0 <= M <= 1. 44 | 45 | [SN,CN,DN] = ELLIPJ(U,M,TOL) computes the elliptic functions to 46 | the accuracy TOL instead of the default TOL = EPS. 47 | */ 48 | #include "polar.h" 49 | 50 | int mellipj(double u, double alpha, double *sn, double *cn, double *dn, double *work) 51 | { 52 | 53 | double eps, tol; 54 | int chunk = 10, i, n, in; 55 | 56 | double *a = work; 57 | double *b = a + chunk + 1; 58 | double *c = b + chunk + 1; 59 | 60 | double m1, m, mmax; 61 | 62 | memset(a, 0, (chunk + 1)* sizeof(double)); 63 | memset(b, 0, (chunk + 1)* sizeof(double)); 64 | memset(c, 0, (chunk + 1)* sizeof(double)); 65 | 66 | u=creal(u); 67 | alpha=creal(alpha); 68 | 69 | m = sin(alpha) * sin(alpha); 70 | m1 = cos(alpha) * cos(alpha); 71 | 72 | eps = LAPACKE_dlamch_work('e'); 73 | tol = eps; 74 | 75 | mmax = 1; 76 | 77 | *cn=u; 78 | *sn = cn[0]; *dn = sn[0]; 79 | 80 | 81 | if ((m < 0) || (m > 1)){ 82 | fprintf(stderr, "error(ellipj:MOutOfRange)") ; 83 | return -1; 84 | } 85 | 86 | // pre-allocate space and augment if needed 87 | chunk = 10; 88 | c[0] = creal(sin(alpha)); 89 | b[0] = creal(cos(alpha)); 90 | a[0] = c[0]; 91 | 92 | a[0]=1; 93 | 94 | n = 0.; 95 | i = 1; 96 | in = 0; 97 | while ((fabs(c[i-1]) > tol) && i<1000 ){ 98 | i = i + 1; 99 | if (i > chunk+1){ 100 | 101 | a = work; 102 | b = a + (2*chunk + 1); 103 | c = b + (2*chunk + 1); 104 | memset(a, 0, (2*chunk + 1)* sizeof(double)); 105 | memset(b, 0, (2*chunk + 1)* sizeof(double)); 106 | memset(c, 0, (2*chunk + 1)* sizeof(double)); 107 | } 108 | a[i-1] = 0.5 * (a[i-2] + b[i-2]); 109 | b[i-1] = sqrt(a[i-2] * b[i-2]); 110 | c[i-1] = 0.5 * (a[i-2] - b[i-2]); 111 | if ((fabs(c[i-1]) <= tol) && (fabs(c[i-2]) > tol)) {in = 1;} 112 | if (in != 0){ 113 | n = i-1; 114 | } 115 | } 116 | double *phin = c + (2*chunk + 1); 117 | memset(phin, 0, i * sizeof(double)); 118 | phin[0]=u; 119 | 120 | phin[i-1] = cpow(2,n)*a[i-1]*u; 121 | while (i > 1){ 122 | i = i - 1; 123 | if (n >= i) {in = 1;} 124 | phin[i-1] = phin[i]; 125 | if (in != 0){ 126 | phin[i-1] = 0.5 * (asin(c[i]*sin(creal(phin[i]))/a[i]) + phin[i]); 127 | } 128 | } 129 | *sn = sin(creal(phin[0])); 130 | *cn = cos(creal(phin[0])); 131 | *dn = sqrt(1 - m * sn[0]*sn[0]); 132 | 133 | return 0; 134 | } 135 | 136 | int choosem(double con, int *m){ 137 | //choose the Zolotarev degree according to the condnum 138 | if (con < 1.001) {*m=2; } 139 | else if (con <= 1.01) {*m = 3;} 140 | else if (con <= 1.1) {*m = 4;} 141 | else if (con <= 1.2) {*m = 5;} 142 | else if (con <= 1.5) {*m = 6;} 143 | else if (con <= 2) {*m = 8;} // one-step convergence till here 144 | else if (con < 6.5) {*m = 2;} 145 | else if (con < 180) {*m = 3;} 146 | else if (con < 1.5*1e4) {*m = 4;} 147 | else if (con < 2*1e6) {*m = 5;} 148 | else if (con < 1*1e9) {*m = 6;} 149 | else if (con < 3*1e12) {*m = 7;} 150 | else {*m = 8;} 151 | 152 | return 0; 153 | } 154 | 155 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | POLAR 2 | ================ 3 | POLAR is a high performance open-source software package to compute the polar decomposition ((PD)) of a dense matrix A = UH 4 | based on QR-based Dynamically Weighted Halley (QDWH) and ZOLO-PD algorithms. 5 | 6 | QDWH 7 | ================ 8 | 9 | The **QR-based Dynamically Weighted Halley** (QDWH) is one of 10 | the most popular algorithms to compute the polar decomposition. It is backward stable and converges 11 | in at most six iterations. 12 | 13 | ZOLO-PD 14 | ================ 15 | 16 | ZOLO-PD relies on the **Zolotarev function which is the best rational approximant to the sign function** 17 | for computing the polar decomposition. 18 | Building upon on the QR-based 19 | Dynamically Weighted Halley (QDWH) algorithm, the key idea 20 | lies in finding the best rational approximation for the scalar sign function, 21 | which also corresponds to the polar factor for symmetric matrices, 22 | to further accelerate the QDWH convergence. 23 | Based on the Zolotarev rational functions---introduced by Zolotarev (ZOLO) in 24 | 1877--- this new PD algorithm ZOLO-PD converges within two iterations even for ill-conditioned matrices, 25 | instead of the original six iterations needed for QDWH. 26 | ZOLO-PD uses the property of Zolotarev functions that optimality is maintained when 27 | two functions are composed in an appropriate manner. 28 | The resulting ZOLO-PD has a convergence rate up to seventeen, 29 | in contrast to the cubic convergence rate for QDWH. 30 | This comes at the price of higher arithmetic costs and memory footprint. These 31 | extra floating-point operations can, however, be processed in an 32 | embarrassingly parallel fashion. 33 | 34 | Current Features of QDWH/ZOLOPD 35 | =========================== 36 | 37 | - Written in C. 38 | - Support for Double Precision. 39 | - Support for Two-Dimensional Block Cyclic Data Distribution. 40 | - ScaLAPACK Interface / Native Interface. 41 | - ScaLAPACK-Compliant Error Handling. 42 | - ScaLAPACK-Derived Testing Suite 43 | - ScaLAPACK-Compliant Accuracy. 44 | 45 | Programming models and backends: 46 | 1. MPI 47 | 2. ScaLAPACK 48 | 49 | 50 | Installation 51 | ============ 52 | 53 | The installation requires at least **CMake** of version 3.2.3. To build the polar decomposition based on QDWH/ZOLOPD, 54 | follow these instructions: 55 | 56 | 1. Get polar from git repository 57 | 58 | git clone git@github.com:ecrc/polar 59 | 60 | 2. Go into polar folder 61 | 62 | cd polar 63 | 64 | 3. Create build directory and go there 65 | 66 | mkdir build && cd build 67 | 68 | 4. Use CMake to get all the dependencies 69 | 70 | cmake .. -DCMAKE_INSTALL_PREFIX=/path/to/install/ 71 | 72 | 5. To use exist dependencies 73 | 74 | cmake .. -DCMAKE_INSTALL_PREFIX=/path/to/install/ -DSCALAPACK_DIR=/path/to/scalapack/install/ -DSLTMG_LIBRARIES=/path/to/scalapack/install/lib/libsltmg.a 75 | 76 | 5. To build the testing binaries (optional) add the following: 77 | 78 | -DPOLAR_TESTING:BOOL=ON 79 | 80 | 5. Build polar 81 | 82 | make -j 83 | 84 | 6. Install polar 85 | 86 | make install 87 | 88 | 7. Add line 89 | 90 | export PKG_CONFIG_PATH=/path/to/install/lib/pkgconfig:$PKG_CONFIG_PATH 91 | 92 | to your .bashrc file. 93 | 94 | Now you can use pkg-config executable to collect compiler and linker flags for 95 | polar based on QDWH/ZOLO-PD. 96 | 97 | Testing and Timing 98 | ================== 99 | 100 | The directories testing and timing contain an example 101 | to test the accuracy and the performance of QDWH/ZOLO-PD using 102 | ill (with condition number less than 5.e12) and well-conditioned random matrices. 103 | 104 | The complete list of options is available below with -h option: 105 | 106 | ``` 107 | "======= QDWH/ZOLOPD testing using ScaLAPACK\n" 108 | " -p --nprow : Number of MPI process rows\n" 109 | " -q --npcol : Number of MPI process cols\n" 110 | " -jl --lvec : Compute left singular vectors\n" 111 | " -jr --rvec : Compute right singular vectors\n" 112 | " -n --N : Dimension of the matrix\n" 113 | " -b --nb : Block size\n" 114 | " -m --mode : [1:6] Mode from pdlatms used to generate the matrix\n" 115 | " -k --cond : Condition number used to generate the matrix\n" 116 | " -o --optcond : Estimate Condition number using QR\n" 117 | " -i --niter : Number of iterations\n" 118 | " -r --n_range : Range for matrix sizes Start:Stop:Step\n" 119 | " -c --check : Check the solution\n" 120 | " -v --verbose : Verbose\n" 121 | " -h --help : Print this help\n" ); 122 | ``` 123 | On Cray systems, the launching command typically looks like: 124 | 125 | srun --ntasks=nT --hint=nomultithread ./main --nprow p --npcol q --b 64 --cond 1e16 --niter 1 --n_range start:stop:step --check 126 | 127 | 1. The number of the nodes is N, the number of tasks (nT) = N * (number_of_cores per node ). The programming model is pure MPI (no OpenMP, i.e., sequential BLAS). 128 | 2. PxQ is the process grid configuration, where (nT - PxQ = 0) 129 | 130 | 131 | TODO List 132 | ========= 133 | 134 | 1. Add support for the other precisions 135 | 2. Extend task-based programming model 136 | 3. Port to various dynamic runtime systems 137 | 138 | 139 | References 140 | ========== 141 | 1. H. Ltaief, D. Sukkari, A. Esposito, Y. Nakatsukasa and D. Keyes, Massively Parallel 142 | Polar Decomposition on Distributed-Memory Systems, *Submitted to IEEE Transactions on 143 | Parallel Computing TOPC*, http://hdl.handle.net/10754/626359.1, 2018. 144 | 2. D. Sukkari, H. Ltaief, A. Esposito and D. Keyes, A QDWH-Based SVD Software Framework on 145 | Distributed-Memory Manycore Systems, *Submitted to ACM Transactions on Mathematical Software TOMS*, 146 | http://hdl.handle.net/10754/626212, 2017. 147 | 3. D. Sukkari, H. Ltaief, M. Faverge, and D. Keyes, Asynchronous Task-Based Polar 148 | Decomposition on Massively Parallel Systems, *IEEE Transactions on Parallel and 149 | Distributed Systems TPDS*, volume 29, pages 312–323, https://ieeexplore.ieee.org/document/8053812/, 2017. 150 | 4. D. Sukkari, H. Ltaief and D. Keyes, A High Performance QDWH-SVD Solver using 151 | Hardware Accelerators, *ACM Transactions on Mathematical Software TOMS*, vol. 43 (1), pp. 1-25, 2016. 152 | 5. D. Sukkari, H. Ltaief and D. Keyes, High Performance Polar Decomposition for SVD 153 | Solvers on Distributed Memory Systems, Best Papers, *Proceedings of the 22nd International 154 | Euro-Par Conference*, https://doi.org/10.1007/978-3-319-43659-3_44, 2016. 155 | 6. D.Sukkari, H. Ltaief and D. Keyes, A High Performance QDWH-SVD Solver using 156 | Hardware Accelerators, *ACM Transactions on Mathematical Software TOMS*, 157 | http://doi.acm. org/10.1145/2894747, volume 43, pages 6:1–6:25, 2016. 158 | 7. Y. Nakatsukasa and N. J. Higham, Stable and Efficient Spectral Divide and Conquer 159 | Algorithms for the Symmetric Eigenvalue Decomposition and the SVD, *SIAM Journal on Scientific Computing*, 160 | vol. 35, no. 3, pp. A1325–A1349, http://epubs.siam.org/doi/abs/10.1137/120876605, 2013. 161 | 8. Y. Nakatsukasa, R. Freund, using Zolotarev's Rational Approximation for Computing the Polar, 162 | Symmetric Eigenvalue, and Singular Value Decompositions, *SIAM Review*, 163 | https://books.google.com.sa/books?id=a9d7rgEACAAJ, 2016. 164 | 165 | 166 | Questions? 167 | ========== 168 | Please feel free to create an issue on Github for any questions and inquiries. 169 | 170 | -------------------------------------------------------------------------------- /include/myscalapack.h: -------------------------------------------------------------------------------- 1 | /** 2 | * 3 | * Copyright (c) 2017, King Abdullah University of Science and Technology 4 | * All rights reserved. 5 | * 6 | **/ 7 | 8 | /* 9 | * Copyright (c) 2009-2010 The University of Tennessee and The University 10 | * of Tennessee Research Foundation. All rights 11 | * reserved. 12 | * Copyright (c) 2010 University of Denver, Colorado. 13 | */ 14 | 15 | #ifndef MYSCALAPACK_H 16 | #define MYSCALAPACK_H 17 | 18 | #ifdef SUP_ 19 | #define pdpotrf_ pdpotrf 20 | #define pdpotri_ pdpotri 21 | #define pdpotrs_ pdpotrs 22 | #define pdsymm_ pdsymm 23 | #define pdgetrf_ pdgetrf 24 | #define pdlansy_ pdlansy 25 | #define pdmatgen_ pdmatgen 26 | #define pdtrsm_ pdtrsm 27 | #define psgesv_ psgesv 28 | #define pdgesv_ pdgesv 29 | #define psgemm_ psgemm 30 | #define pdgemm_ pdgemm 31 | #define numroc_ numroc 32 | #define pslange_ pslange 33 | #define pdlange_ pdlange 34 | #define pslacpy_ pslacpy 35 | #define pdlacpy_ pdlacpy 36 | #define pdgeqrf_ pdgeqrf 37 | #define pdormqr_ pdormqr 38 | #define psgesvd_ psgesvd 39 | #define pdgesvd_ pdgesvd 40 | #define pslaset_ pslaset 41 | #define pdlaset_ pdlaset 42 | #define pselset_ pselset 43 | #define pdelset_ pdelset 44 | #define pslamch_ pslamch 45 | #define pdlamch_ pdlamch 46 | #define indxg2p_ indxg2p 47 | #define indxg2l_ indxg2l 48 | #define descinit_ descinit 49 | #define pslawrite_ pslawrite 50 | #define pdlawrite_ pdlawrite 51 | #define blacs_get_ blacs_get 52 | #define blacs_pinfo_ blacs_pinfo 53 | #define blacs_gridinit_ blacs_gridinit 54 | #define blacs_gridinfo_ blacs_gridinfo 55 | #define blacs_gridexit_ blacs_gridexit 56 | #define blacs_exit_ blacs_exit 57 | #endif 58 | 59 | extern void Cblacs_pinfo( int* mypnum, int* nprocs); 60 | extern void Cblacs_get( int context, int request, int* value); 61 | extern int Cblacs_gridinit( int* context, char * order, int np_row, int np_col); 62 | extern void Cblacs_gridinfo( int context, int* np_row, int* np_col, int* my_row, int* my_col); 63 | extern void Cblacs_gridexit( int context); 64 | extern void Cblacs_exit( int error_code); 65 | 66 | extern void blacs_pinfo_( int *mypnum, int *nprocs); 67 | extern void blacs_get_( int *context, int *request, int* value); 68 | extern void blacs_gridinit_( int* context, char *order, int *np_row, int *np_col); 69 | extern void blacs_gridinfo_( int *context, int *np_row, int *np_col, int *my_row, int *my_col); 70 | extern void blacs_gridexit_( int *context); 71 | extern void blacs_exit_( int *error_code); 72 | 73 | extern void pdgeqrf_( int *m, int *n, double *a, int *ia, int *ja, int *desca, double *tau, double *work, int *lwork, int *info ); 74 | extern void pdormqr_( char *side, char *trans, int *m, int *n, int *k, double *a, int *ia, 75 | int *ja, int *desca, double *tau, double *c, int *ic, int *jc, int *descc, double *work, int *lwork, int *info ); 76 | extern void pdtrsm_ ( char *side, char *uplo, char *transa, char *diag, int *m, int *n, double *alpha, double *a, int *ia, 77 | int *ja, int *desca, double *b, int *ib, int *jb, int *descb ); 78 | 79 | extern float pslange_( char *norm, int *m, int *n, float *A, int *ia, int *ja, int *descA, float *work); 80 | extern double pdlange_( char *norm, int *m, int *n, double *A, int *ia, int *ja, int *descA, double *work); 81 | 82 | 83 | extern void pslacpy_( char *uplo, int *m, int *n, float *A, int *ia, int *ja, int *descA, 84 | float *B, int *ib, int *jb, int *descB); 85 | extern void pdlacpy_( char *uplo, int *m, int *n, double *A, int *ia, int *ja, int *descA, 86 | double *B, int *ib, int *jb, int *descB); 87 | 88 | extern void psgesv_( int *n, int *nrhs, float *A, int *ia, int *ja, int *descA, int *ipiv, 89 | float *B, int *ib, int *jb, int *descB, int *info); 90 | extern void pdgesv_( int *n, int *nrhs, double *A, int *ia, int *ja, int *descA, int *ipiv, 91 | double *B, int *ib, int *jb, int *descB, int *info); 92 | 93 | extern void psgemm_( char *transa, char *transb, int *M, int *N, int *K, 94 | float *alpha, 95 | float *A, int *ia, int *ja, int *descA, 96 | float *B, int *ib, int *jb, int *descB, 97 | float *beta, 98 | float *C, int *ic, int *jc, int *descC ); 99 | extern void pdgemm_( char *transa, char *transb, int *M, int *N, int *K, 100 | double *alpha, 101 | double *A, int *ia, int *ja, int *descA, 102 | double *B, int *ib, int *jb, int *descB, 103 | double *beta, 104 | double *C, int *ic, int *jc, int *descC ); 105 | 106 | extern void psgesvd_( char *jobu, char *jobvt, int *m, int *n, 107 | float *A, int *ia, int *ja, int *descA, 108 | float *s, 109 | float *U, int *iu, int *ju, int *descU, 110 | float *VT, int *ivt, int *jvt, int *descVT, 111 | float *work, int *lwork, int *info); 112 | extern void pdgesvd_( char *jobu, char *jobvt, int *m, int *n, 113 | double *A, int *ia, int *ja, int *descA, 114 | double *s, 115 | double *U, int *iu, int *ju, int *descU, 116 | double *VT, int *ivt, int *jvt, int *descVT, 117 | double *work, int *lwork, int *info); 118 | 119 | extern void pslaset_( char *uplo, int *m, int *n, float *alpha, float *beta, float *A, int *ia, int *ja, int *descA ); 120 | extern void pdlaset_( char *uplo, int *m, int *n, double *alpha, double *beta, double *A, int *ia, int *ja, int *descA ); 121 | 122 | extern void pselset_( float *A, int *ia, int *ja, int *descA, float *alpha); 123 | extern void pdelset_( double *A, int *ia, int *ja, int *descA, double *alpha); 124 | 125 | extern void pslawrite_( char **filenam, int *m, int *n, float *A, int *ia, int *ja, int *descA, int *irwrit, int *icwrit, float *work); 126 | extern void pdlawrite_( char **filenam, int *m, int *n, double *A, int *ia, int *ja, int *descA, int *irwrit, int *icwrit, double *work); 127 | 128 | extern float pslamch_( int *ictxt, char *cmach); 129 | extern double pdlamch_( int *ictxt, char *cmach); 130 | 131 | extern int indxg2p_( int *indxglob, int *nb, int *iproc, int *isrcproc, int *nprocs); 132 | extern int indxg2l_( int *indxglob, int *nb, int *iproc, int *isrcproc, int *nprocs); 133 | extern int numroc_( int *n, int *nb, int *iproc, int *isrcproc, int *nprocs); 134 | extern void descinit_( int *desc, int *m, int *n, int *mb, int *nb, int *irsrc, int *icsrc, 135 | int *ictxt, int *lld, int *info); 136 | 137 | extern void pdgetrf_ ( int* m, int *n, double *a, int *i1, int *i2, int *desca, int* ipiv, int *info ); 138 | extern void pdgetrs_ ( char* trans, int* n, int* nrhs, double* A, int* ia, int* ja, int* descA, int* ippiv, double* B, int* ib, int* jb, int* descB, int* info); 139 | extern double pdlansy_ ( char *norm, char *uplo, int *n, double *a, int *ia, int *ja, int *desca, double *work ); 140 | extern void pdmatgen_( int *ictxt, char *aform, char *diag, int *m, int *n, int *mb, int *nb, double *a, int *lda, int *iarow, int *iacol, int *iseed, int *iroff, int *irnum, int *icoff, int *icnum, int *myrow, int *mycol, int *nprow, int *npcol ); 141 | 142 | 143 | extern void pdpotrf_( char *uplo, int *n, double *a, int *ia, int *ja, int *desca, int *info ); 144 | extern void pdpotri_( char *uplo, int *n, double *a, int *ia, int *ja, int *desca, int *info ); 145 | extern void pdpotrs_( char *uplo, int *n, int *nrhs, double *a, int *ia, int *ja, int *desca, double *b, int *ib, int *jb, int *descb, int *info ); 146 | 147 | extern void pdsymm_(char *side, char *uplo, int *m, int *n, double *alpha, double *a, int *ia, int *ja, int *desca, double *b, int *ib, int *jb, int *descb, double *beta, double *c, int *ic, int *jc, int *descc); 148 | 149 | extern void pdposv_(char *uplo, int *n, int *nrhs, double *a, int *ia, int *ja, 150 | int *desca, double *b, int *ib, int *jb, int *descb, int *info); 151 | extern void pdgeadd_(char *trans, int *m, int *n, double *alpha, double *a, 152 | int *ia, int *ja, int *desca, double *beta, double *c, int *ic, int *jc, 153 | int *descc); 154 | extern void pdgemv_(char *trans, int *m, int *n, double *alpha, double *a, 155 | int *ia, int *ja, int *desca, double *x, int *ix, int *jx, int *descx, 156 | int *incx, double *beta, double *y, int *iy, int *jy, int *descy, int *incy); 157 | extern void chk1mat_(int *ma, int *mapos0, int *na, int *napos0, int *ia, 158 | int *ja, int *desca, int *descapos0, int *info); 159 | extern void pchk1mat_(int *ma, int *mapos0, int *na, int *napos0, int *ia, 160 | int *ja, int *desca, int *descapos0, int *nextra, int *ex, int *expos, 161 | int *info); 162 | extern void pdlascl_(char *type, double *cfrom, double *cto, int *m, int *n, 163 | double *a, int *ia, int *ja, int *desca, int *info); 164 | extern void pdorgqr_(int *m, int *n, int *k, double *a, int *ia, int *ja, 165 | int *desca, double *tau, double *work, int *lwork, int *info); 166 | extern void pdgecon_(char *norm, int *n, double *a, int *ia, int *ja, 167 | int *desca, double *anorm, double *rcond, double *work, int *lwork, 168 | int *iwork, int *liwork, int *info); 169 | extern void pxerbla_(int *ictxt, char *srname, int *info); 170 | extern void pdtrtri_(char *uplo, char *diag, int *n, double *a, int *ia, 171 | int *ja, int *desca, int *info); 172 | 173 | #endif 174 | -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.2.3) 2 | # directly make an error if in-source build 3 | if("${CMAKE_SOURCE_DIR}" STREQUAL "${CMAKE_BINARY_DIR}") 4 | message(FATAL_ERROR "In-source builds are not allowed.\n" 5 | "Please create a build directory first and execute cmake configuration from " 6 | "this directory. Example: mkdir build && cd build && cmake ..") 7 | endif() 8 | 9 | project(POLAR Fortran C) 10 | 11 | ######################### 12 | ### VARIABLES ########### 13 | ######################### 14 | set( POLAR_LIBNAME polar ) 15 | option( POLAR_TESTING "Generates testing binaries" OFF) 16 | option( BUILD_SHARED_LIBS "Generates shared libraries instead of static" OFF) 17 | option( SHARED_NETLIBSCALAPACK "Build NETLIB-SCALAPACK as shared library." ${BUILD_SHARED_LIBS}) 18 | option( USE_PARALLEL_BLAS "If possible, prefer parallel blas libraries" ON ) 19 | set(EXTRA_LIBS "" CACHE STRING "Additional libraries that may be needed to compile/link" ) 20 | 21 | # Version 3.0.0 22 | SET( POLAR_VERSION_MAJOR 3 ) 23 | SET( POLAR_VERSION_MINOR 0 ) 24 | SET( POLAR_VERSION_PATCH 0 ) 25 | set( POLAR_VERSION "${POLAR_VERSION_MAJOR}.${POLAR_VERSION_MINOR}.${POLAR_VERSION_PATCH}" ) 26 | #execute_process(COMMAND git rev-parse --short HEAD OUTPUT_VARIABLE POLAR_VERSION_RELEASE ) 27 | 28 | 29 | # Set the RPATH config 30 | # -------------------- 31 | # use, i.e. don't skip the full RPATH for the build tree 32 | set(CMAKE_SKIP_BUILD_RPATH FALSE) 33 | # when building, use the install RPATH 34 | # (automated test will need this) 35 | set(CMAKE_BUILD_WITH_INSTALL_RPATH TRUE) 36 | # the RPATH to be used when installing 37 | set(CMAKE_INSTALL_RPATH ${CMAKE_INSTALL_PREFIX}/lib) 38 | 39 | 40 | # Packaging (make package) 41 | SET(CPACK_PACKAGE_VERSION ${POLAR_VERSION}) 42 | SET(CPACK_GENERATOR "TGZ") 43 | INCLUDE(CPack) 44 | 45 | 46 | # This project needs C99 standard to compile properly 47 | # (could have used CMAKE_C_FLAGS instead) 48 | #ADD_DEFINITIONS( -std=c99 ) 49 | #set( CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -std=c99") 50 | 51 | link_directories( ${CMAKE_INSTALL_PREFIX}/lib ) 52 | include_directories( ${CMAKE_SOURCE_DIR}/include ) 53 | 54 | ## CMAKE MODULES :: ECRC 55 | ## REQUIRED FOR TESTS TO LINK LIBRARIES 56 | if(NOT EXISTS "${PROJECT_SOURCE_DIR}/cmake_modules/ecrc/modules" ) 57 | find_package(Git REQUIRED) 58 | execute_process(COMMAND ${GIT_EXECUTABLE} submodule init WORKING_DIRECTORY ${PROJECT_SOURCE_DIR} RESULT_VARIABLE _res_init OUTPUT_QUIET ERROR_QUIET) 59 | execute_process(COMMAND ${GIT_EXECUTABLE} submodule update WORKING_DIRECTORY ${PROJECT_SOURCE_DIR} RESULT_VARIABLE _res_update OUTPUT_QUIET ERROR_QUIET) 60 | if( ${_res_init} GREATER 0 OR ${_res_update} GREATER 0 ) 61 | message(FATAL_ERROR "ECRC CMake modules were not found.\n" 62 | "We tried: 'git submodule init && git submodule update' and resulted in error" ) 63 | endif() 64 | endif() 65 | ## ECRC INITIALIZATION 66 | list(APPEND CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake_modules) 67 | list(APPEND CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/cmake_modules/ecrc/modules") 68 | set(ECRC_CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake_modules/ecrc/modules ) 69 | include(EcrcInit) 70 | 71 | # MPI 72 | find_package( MPIEXT REQUIRED ) 73 | if( MPI_FOUND ) 74 | # TODO check MPI_C_COMPILER 75 | include_directories( ${MPI_C_INCLUDE_PATH} ) 76 | set( CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${MPI_C_LINK_FLAGS}" ) 77 | endif() 78 | 79 | ## BLAS 80 | find_package(BLASEXT) 81 | if(BLAS_FOUND) 82 | # Use parallel blas 83 | if( USE_PARALLEL_BLAS AND NOT "${BLAS_PAR_LIBRARIES}" STREQUAL "" ) 84 | message( STATUS "Using Parallel Blas" ) 85 | set( BLAS_LIBRARIES "${BLAS_PAR_LIBRARIES}" ) 86 | endif() 87 | if (BLAS_LIBRARY_DIRS) 88 | # the RPATH to be used when installing 89 | list( APPEND CMAKE_INSTALL_RPATH ${BLAS_LIBRARY_DIRS}) 90 | link_directories( ${BLAS_LIBRARY_DIRS} ) 91 | endif() 92 | if(BLAS_LINKER_FLAGS) 93 | list(APPEND CMAKE_EXE_LINKER_FLAGS ${BLAS_LINKER_FLAGS}) 94 | list(APPEND CMAKE_SHARED_LINKER_FLAGS ${BLAS_LINKER_FLAGS}) 95 | #set( CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${BLAS_LINKER_FLAGS}") 96 | list( REMOVE_DUPLICATES CMAKE_EXE_LINKER_FLAGS ) 97 | endif() 98 | else() 99 | message( FATAL_ERROR "BLAS library has not been found") 100 | endif() 101 | 102 | # LAPACK 103 | #find_package(LAPACK REQUIRED) 104 | find_package(LAPACKEXT REQUIRED) 105 | if(LAPACK_FOUND) 106 | include_directories(${LAPACK_INCLUDE_DIRS}) 107 | if(LAPACK_LIBRARY_DIRS) 108 | # the RPATH to be used when installing 109 | list(APPEND CMAKE_INSTALL_RPATH ${LAPACK_LIBRARY_DIRS}) 110 | link_directories( ${LAPACK_LIBRARY_DIRS} ) 111 | endif() 112 | if(LAPACK_LINKER_FLAGS) 113 | list( REMOVE_DUPLICATES LAPACK_LINKER_FLAGS ) 114 | list( APPEND CMAKE_EXE_LINKER_FLAGS ${LAPACK_LINKER_FLAGS}) 115 | list( REMOVE_DUPLICATES CMAKE_EXE_LINKER_FLAGS ) 116 | endif() 117 | endif() 118 | 119 | # SCALAPACK 120 | find_package( SCALAPACK ) 121 | if(SCALAPACK_FOUND) 122 | if (SCALAPACK_LIBRARY_DIRS) 123 | # the RPATH to be used when installing 124 | list(APPEND CMAKE_INSTALL_RPATH ${SCALAPACK_LIBRARY_DIRS}) 125 | link_directories(${SCALAPACK_LIBRARY_DIRS}) 126 | endif() 127 | if(SCALAPACK_LINKER_FLAGS) 128 | list( REMOVE_DUPLICATES SCALAPACK_LINKER_FLAGS ) 129 | list( APPEND CMAKE_EXE_LINKER_FLAGS ${SCALAPACK_LINKER_FLAGS}) 130 | list( REMOVE_DUPLICATES CMAKE_EXE_LINKER_FLAGS ) 131 | endif() 132 | endif() 133 | 134 | 135 | include(ExternalProject) 136 | 137 | 138 | # SLTMG - look for sltmg (only used for tests ) 139 | if( POLAR_TESTING ) 140 | set( SLTMG_LIBRARIES "" CACHE STRING "User provided sltmg library" ) 141 | set( _notfound_msg_ "SLTMG not found, NETLIB-SCALAPACK will be downloaded in order to generate libsltmg.a\n If you want to use your own SLTMG, define SLTMG_LIBRARIES and make sure SCALAPACK is found." ) 142 | if( SLTMG_LIBRARIES ) 143 | set(CMAKE_REQUIRED_LIBRARIES "${SLTMG_LIBRARIES};${SCALAPACK_LIBRARIES}") 144 | check_fortran_function_exists("pdlatms" SLTMG_WORKS) 145 | set(CMAKE_REQUIRED_LIBRARIES) 146 | if( SLTMG_WORKS AND SCALAPACK_FOUND ) 147 | message(STATUS "Found SLTMG : ${SLTMG_LIBRARIES}") 148 | else() 149 | message( STATUS "${_notfound_msg_}") 150 | endif() 151 | else() 152 | message( STATUS "${_notfound_msg_}") 153 | endif() 154 | endif() 155 | 156 | if( NOT SCALAPACK_FOUND OR ( POLAR_TESTING AND NOT SLTMG_WORKS ) ) 157 | # Download NETLIB-SCALAPACK 158 | set( _build_netlib-scalapack ) 159 | set( NETLIB_SCALAPACK_URL "http://www.netlib.org/scalapack/scalapack-2.0.2.tgz" CACHE STRING "URL to download NETLIB-SCALAPACK project" ) 160 | if(NOT SCALAPACK_FOUND) 161 | set( SCALAPACK_LIBRARY_DIRS "${CMAKE_INSTALL_PREFIX}/lib") 162 | set( SCALAPACK_LIBRARIES "scalapack") 163 | set( _install_target "install" ) 164 | set( _build_target "all" ) 165 | set( _patch_file "scalapack-cmake-libsltmg.patch" ) 166 | message( "## NETLIB-SCALAPACK will be downloaded from ${NETLIB_SCALAPACK_URL} and built along with libsltmg.a " ) 167 | else() 168 | set( _install_target "install.sltmg" ) 169 | set( _build_target "sltmg" ) 170 | set( _patch_file "scalapack-cmake-libsltmg-only.patch" ) 171 | message( "## NETLIB-SCALAPACK will be downloaded from ${NETLIB_SCALAPACK_URL} in order to generate libsltmg.a " ) 172 | endif() 173 | 174 | # Use the built sltmg 175 | set( SLTMG_LIBRARIES "sltmg" ) 176 | 177 | # build scalapack for sltmg 178 | string(REPLACE ";" " " LAPACK_LIBRARIES_STR "${LAPACK_LIBRARIES}") 179 | ExternalProject_Add( NETLIB-SCALAPACK 180 | PREFIX dependencies 181 | URL ${NETLIB_SCALAPACK_URL} 182 | #URL /tmp/scalapack-2.0.2.tgz 183 | CMAKE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${CMAKE_INSTALL_PREFIX} -DBUILD_SHARED_LIBS:BOOL=${SHARED_NETLIBSCALAPACK} -DLAPACK_LIBRARIES:STRING='${LAPACK_LIBRARIES_STR}' 184 | PATCH_COMMAND patch -p1 -i ${CMAKE_SOURCE_DIR}/${_patch_file} 185 | BUILD_COMMAND ${CMAKE_COMMAND} --build . --target ${_build_target} -- -j -l ${NUMBER_OF_CPU} 186 | INSTALL_COMMAND ${CMAKE_COMMAND} --build . --target ${_install_target} -- 187 | #LOG_CONFIGURE 1 188 | #LOG_BUILD 1 189 | #LOG_INSTALL 1 190 | ) 191 | 192 | endif() 193 | 194 | 195 | # POLAR code 196 | add_subdirectory(src) 197 | 198 | # TESTS 199 | if( POLAR_TESTING ) 200 | 201 | ## LAPACKE 202 | find_package(LAPACKE COMPONENTS LAPACKEXT) 203 | if(LAPACK_FOUND AND LAPACK_LIBRARY_DIRS) 204 | # the RPATH to be used when installing 205 | list(APPEND CMAKE_INSTALL_RPATH ${LAPACK_LIBRARY_DIRS}) 206 | else() 207 | message(FATAL_ERROR "A LAPACK library is required but has not been found") 208 | endif() 209 | if(LAPACKE_FOUND) 210 | if ( LAPACKE_INCLUDE_DIRS ) # Not always this variable is set, especially when is included in LAPACK 211 | include_directories(${LAPACKE_INCLUDE_DIRS}) 212 | endif() 213 | if(LAPACKE_LIBRARY_DIRS) 214 | # the RPATH to be used when installing 215 | list(APPEND CMAKE_INSTALL_RPATH "${LAPACKE_LIBRARY_DIRS}") 216 | endif() 217 | if(LAPACKE_LINKER_FLAGS) 218 | list(APPEND CMAKE_EXE_LINKER_FLAGS ${LAPACKE_LINKER_FLAGS}) 219 | endif() 220 | else() 221 | if(ECRC_VERBOSE_FIND_PACKAGE) 222 | if (LAPACKE_STANDALONE OR NOT LAPACKE_WORKS) 223 | if (NOT LAPACKE_lapacke.h_DIRS) 224 | Print_Find_Header_Status(lapacke lapacke.h) 225 | endif () 226 | if (NOT LAPACKE_lapacke_LIBRARY) 227 | Print_Find_Library_Status(lapacke liblapacke) 228 | endif () 229 | endif() 230 | else() 231 | message(WARNING "LAPACKE library has not been found and ECRC_VERBOSE_FIND_PACKAGE is set to OFF" 232 | "Try to activate ECRC_VERBOSE_FIND_PACKAGE option (-DECRC_VERBOSE_FIND_PACKAGE=ON) to get some hints for the detection") 233 | endif() 234 | #message(FATAL_ERROR "A LAPACKE library is required but has not been found") 235 | endif() 236 | 237 | # BUILD TESTS 238 | enable_testing() # enables ctest 239 | add_definitions(-DHAVE_GETOPT_LONG) 240 | add_subdirectory(testing) 241 | add_subdirectory(timing) 242 | endif() 243 | 244 | install( FILES ${CMAKE_SOURCE_DIR}/include/polar.h DESTINATION include ) 245 | 246 | # PKGCONFIG 247 | set(prefix ${CMAKE_INSTALL_PREFIX}) 248 | set(libdir ${CMAKE_INSTALL_PREFIX}/lib) 249 | set(PKG_CONFIG_DIR ${libdir}/pkgconfig) 250 | configure_file(${CMAKE_SOURCE_DIR}/polar.pc.in ${CMAKE_CURRENT_BINARY_DIR}/polar.pc) 251 | install( FILES ${CMAKE_BINARY_DIR}/polar.pc 252 | DESTINATION ${PKG_CONFIG_DIR} 253 | ) 254 | -------------------------------------------------------------------------------- /scalapack-cmake-libsltmg-only.patch: -------------------------------------------------------------------------------- 1 | diff -Naur -x BLACS -x PBLAS a/CMakeLists.txt c/CMakeLists.txt 2 | --- a/CMakeLists.txt 2012-05-02 06:58:24.000000000 +0300 3 | +++ c/CMakeLists.txt 2017-02-22 15:58:25.000000000 +0300 4 | @@ -1,311 +1,11 @@ 5 | cmake_minimum_required(VERSION 2.8) 6 | -project(SCALAPACK C Fortran) 7 | -# Configure the warning and code coverage suppression file 8 | -configure_file( 9 | - "${SCALAPACK_SOURCE_DIR}/CMAKE/CTestCustom.cmake.in" 10 | - "${SCALAPACK_BINARY_DIR}/CTestCustom.cmake" 11 | - COPYONLY 12 | -) 13 | - 14 | -# Add the CMake directory for custon CMake modules 15 | -set(CMAKE_MODULE_PATH "${SCALAPACK_SOURCE_DIR}/CMAKE" ${CMAKE_MODULE_PATH}) 16 | - 17 | -if (UNIX) 18 | - if ( "${CMAKE_Fortran_COMPILER}" MATCHES "ifort" ) 19 | - set( CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} -fltconsistency -fp_port" ) 20 | - endif () 21 | -endif () 22 | - 23 | -# 24 | -# MPI 25 | -# 26 | -#set(MPI_BASE_DIR "/Users/julie/opt/openmpi/" CACHE PATH "MPI Path") 27 | -#set(MPI_BASE_DIR "/Users/julie/opt/mpich2/" CACHE PATH "MPI Path") 28 | -set(CMAKE_PREFIX_PATH "${MPI_BASE_DIR};${CMAKE_PREFIX_PATH}") 29 | -#set(MPI_COMPILER ${MPI_BASE_DIR}/bin/mpicc) 30 | - 31 | -find_package(MPI) 32 | -if (MPI_FOUND) 33 | - message(STATUS "Found MPI_LIBRARY : ${MPI_FOUND} ") 34 | - INCLUDE_DIRECTORIES(${MPI_INCLUDE_PATH}) 35 | - 36 | - find_program(MPI_C_COMPILER 37 | - NAMES mpicc 38 | - HINTS "${MPI_BASE_DIR}" 39 | - PATH_SUFFIXES bin 40 | - DOC "MPI C compiler.") 41 | - MARK_AS_ADVANCED(MPI_C_COMPILER) 42 | - if ("${MPI_C_COMPILER}" STREQUAL "MPI_C_COMPILER-NOTFOUND") 43 | - message(ERROR "--> MPI C Compiler NOT FOUND (please set MPI_BASE_DIR accordingly") 44 | - else() 45 | - message(STATUS "--> MPI C Compiler : ${MPI_C_COMPILER}") 46 | - SET(CMAKE_C_COMPILER "${MPI_C_COMPILER}") 47 | - message(STATUS "--> C Compiler : ${CMAKE_C_COMPILER}") 48 | - endif() 49 | - find_program(MPI_Fortran_COMPILER 50 | - NAMES mpif77 51 | - HINTS "${MPI_BASE_DIR}" 52 | - PATH_SUFFIXES bin 53 | - DOC "MPI Fortran compiler.") 54 | - MARK_AS_ADVANCED(MPI_Fortran_COMPILER) 55 | - 56 | - 57 | - 58 | - if ("${MPI_Fortran_COMPILER}" STREQUAL "MPI_Fortran_COMPILER-NOTFOUND") 59 | - message(ERROR "--> MPI Fortran Compiler NOT FOUND (please set MPI_BASE_DIR accordingly") 60 | - else() 61 | - message(STATUS "--> MPI Fortran Compiler : ${MPI_Fortran_COMPILER}") 62 | - SET(Fortran_COMPILER "${CMAKE_Fortran_COMPILER}") 63 | - SET(CMAKE_Fortran_COMPILER "${MPI_Fortran_COMPILER}") 64 | - message(STATUS "--> Fortran Compiler : ${CMAKE_Fortran_COMPILER}") 65 | - endif() 66 | - 67 | -else() 68 | - message(STATUS "Found MPI_LIBRARY : ${MPI_FOUND} ") 69 | - set(MPI_BASE_DIR ${MPI_BASE_DIR} CACHE PATH "MPI Path") 70 | - UNSET(MPIEXEC CACHE) 71 | - UNSET(MPIEXEC_POSTFLAGS CACHE) 72 | - UNSET(MPIEXEC_PREFLAGS CACHE) 73 | - UNSET(MPIEXEC_MAX_NUMPROCS CACHE) 74 | - UNSET(MPIEXEC_NUMPROC_FLAG CACHE) 75 | - UNSET(MPI_COMPILE_FLAGS CACHE) 76 | - UNSET(MPI_LINK_FLAGS CACHE) 77 | - UNSET(MPI_INCLUDE_PATH CACHE) 78 | - message(FATAL_ERROR "--> MPI Library NOT FOUND -- please set MPI_BASE_DIR accordingly --") 79 | -endif() 80 | - 81 | - 82 | -if (UNIX) 83 | - if ( "${CMAKE_Fortran_COMPILER}" MATCHES "ifort" ) 84 | - set( CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} -fltconsistency -fp_port" ) 85 | - endif () 86 | -endif () 87 | - 88 | -macro(SCALAPACK_install_library lib) 89 | - install(TARGETS ${lib} EXPORT scalapack-targets 90 | - ARCHIVE DESTINATION lib${LIB_SUFFIX} 91 | - LIBRARY DESTINATION lib${LIB_SUFFIX} 92 | - RUNTIME DESTINATION Testing 93 | - ) 94 | -endmacro() 95 | - 96 | -# -------------------------------------------------- 97 | -# Testing 98 | -SET(DART_TESTING_TIMEOUT 600) 99 | - 100 | -enable_testing() 101 | -include(CTest) 102 | -enable_testing() 103 | -# -------------------------------------------------- 104 | - 105 | -# Organize output files. On Windows this also keeps .dll files next 106 | -# to the .exe files that need them, making tests easy to run. 107 | -set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${SCALAPACK_BINARY_DIR}/TESTING) 108 | -set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${SCALAPACK_BINARY_DIR}/lib) 109 | -set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${SCALAPACK_BINARY_DIR}/lib) 110 | - 111 | -# -------------------------------------------------- 112 | -# Check for any necessary platform specific compiler flags 113 | -include( CheckBLACSCompilerFlags ) 114 | -CheckBLACSCompilerFlags() 115 | - 116 | -set(prefix ${CMAKE_INSTALL_PREFIX}) 117 | -set(libdir ${CMAKE_INSTALL_PREFIX}/lib${LIB_SUFFIX}) 118 | -set(PKG_CONFIG_DIR ${libdir}/pkgconfig) 119 | - 120 | -# -------------------------------------------------- 121 | -# BLACS Internal variables 122 | -# 123 | -# Fortran Mangling, MPI Tests and BLACS settings 124 | -# 125 | -include(FortranMangling) 126 | -COMPILE(install_COMPILED) 127 | - 128 | -FORTRAN_MANGLING(CDEFS) 129 | -#MESSAGE(STATUS "Setting CDEFS = ${CDEFS}") 130 | -#set(CDEFS ${CDEFS} CACHE STRING "Fortran Mangling" FORCE) 131 | -MESSAGE(STATUS "=========") 132 | - 133 | -# -------------------------------------------------- 134 | -# Compiler Flags 135 | -ADD_DEFINITIONS( "-D${CDEFS}") 136 | - 137 | -# -------------------------------------------------- 138 | -# ScaLAPACK needs BLAS and LAPACK 139 | - 140 | -option(USE_OPTIMIZED_LAPACK_BLAS "Whether or not to search for optimized LAPACK and BLAS libraries on your machine (if not found, Reference LAPACK and BLAS will be downloaded and installed)" ON) 141 | - 142 | -message(STATUS "CHECKING BLAS AND LAPACK LIBRARIES") 143 | - 144 | -IF(LAPACK_LIBRARIES) 145 | - include(CheckFortranFunctionExists) 146 | - message(STATUS "--> LAPACK supplied by user is ${LAPACK_LIBRARIES}.") 147 | - set(CMAKE_REQUIRED_LIBRARIES ${LAPACK_LIBRARIES}) 148 | - CHECK_FORTRAN_FUNCTION_EXISTS("dgesv" LAPACK_FOUND) 149 | - unset( CMAKE_REQUIRED_LIBRARIES ) 150 | - message(STATUS "--> LAPACK routine dgesv is found: ${LAPACK_FOUND}.") 151 | -ENDIF() 152 | - 153 | -if(LAPACK_FOUND) 154 | - message(STATUS "--> LAPACK supplied by user is WORKING, will use ${LAPACK_LIBRARIES}.") 155 | -else(LAPACK_FOUND) 156 | - if(USE_OPTIMIZED_LAPACK_BLAS) 157 | - message(STATUS "--> Searching for optimized LAPACK and BLAS libraries on your machine.") 158 | - find_package(LAPACK) 159 | - ENDIF(USE_OPTIMIZED_LAPACK_BLAS) 160 | - if(NOT LAPACK_FOUND) 161 | - message(STATUS "--> LAPACK and BLAS were not found. Reference LAPACK and BLAS will be downloaded and installed") 162 | - include(ExternalProject) 163 | - ExternalProject_Add( 164 | - lapack 165 | - URL http://www.netlib.org/lapack/lapack.tgz 166 | - CMAKE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${SCALAPACK_BINARY_DIR} 167 | - PREFIX ${SCALAPACK_BINARY_DIR}/dependencies 168 | - ) 169 | - if (UNIX) 170 | - SET(LAPACK_LIBRARIES ${SCALAPACK_BINARY_DIR}/lib/liblapack.a CACHE STRING "LAPACK library" FORCE) 171 | - SET(BLAS_LIBRARIES ${SCALAPACK_BINARY_DIR}/lib/libblas.a CACHE STRING "BLAS library" FORCE) 172 | - else (UNIX) # On Windows 173 | - SET(LAPACK_LIBRARIES ${SCALAPACK_BINARY_DIR}/lib/liblapack.lib CACHE STRING "LAPACK library" FORCE) 174 | - SET(BLAS_LIBRARIES ${SCALAPACK_BINARY_DIR}/lib/libblas.lib CACHE STRING "BLAS library" FORCE) 175 | - endif (UNIX) 176 | - ENDIF() 177 | -ENDIF() 178 | - 179 | -message(STATUS "BLAS library: ${BLAS_LIBRARIES}") 180 | -message(STATUS "LAPACK library: ${LAPACK_LIBRARIES}") 181 | - 182 | -MESSAGE(STATUS "=========") 183 | - 184 | -# -------------------------------------------------- 185 | -# By default static library 186 | -OPTION(BUILD_SHARED_LIBS "Build shared libraries" OFF ) 187 | -OPTION(BUILD_STATIC_LIBS "Build static libraries" ON ) 188 | - 189 | -# -------------------------------------------------- 190 | -# Subdirectories that need to be processed 191 | - 192 | -macro(append_subdir_files variable dirname) 193 | -get_directory_property(holder DIRECTORY ${dirname} DEFINITION ${variable}) 194 | -foreach(depfile ${holder}) 195 | - list(APPEND ${variable} "${dirname}/${depfile}") 196 | -endforeach() 197 | -endmacro() 198 | - 199 | - 200 | -# 201 | -# BLACS 202 | -# 203 | -add_subdirectory(BLACS) 204 | -append_subdir_files(blacs "BLACS/SRC") 205 | - 206 | -# 207 | -# TOOLS 208 | -# 209 | -add_subdirectory(TOOLS) 210 | -append_subdir_files(tools TOOLS) 211 | -append_subdir_files(tools-C TOOLS) 212 | -append_subdir_files(extra_lapack "TOOLS/LAPACK") 213 | - 214 | -# 215 | -# PBLAS 216 | -# 217 | -add_subdirectory(PBLAS) 218 | -append_subdir_files(pblas "PBLAS/SRC") 219 | -append_subdir_files(pblas-F "PBLAS/SRC") 220 | -append_subdir_files(pbblas "PBLAS/SRC/PBBLAS") 221 | -append_subdir_files(ptzblas "PBLAS/SRC/PTZBLAS") 222 | -append_subdir_files(ptools "PBLAS/SRC/PTOOLS") 223 | - 224 | -# 225 | -# REDIST 226 | -# 227 | -add_subdirectory(REDIST) 228 | -append_subdir_files(redist "REDIST/SRC") 229 | - 230 | -# 231 | -# SRC 232 | -# 233 | -add_subdirectory(SRC) 234 | -append_subdir_files(src "SRC") 235 | -append_subdir_files(src-C "SRC") 236 | - 237 | -if (UNIX) 238 | - add_library(scalapack ${blacs} ${tools} ${tools-C} ${extra_lapack} ${pblas} ${pblas-F} ${ptzblas} ${ptools} ${pbblas} ${redist} ${src} ${src-C}) 239 | - target_link_libraries( scalapack ${LAPACK_LIBRARIES} ${BLAS_LIBRARIES}) 240 | - scalapack_install_library(scalapack) 241 | -else (UNIX) # Need to separate Fortran and C Code 242 | - OPTION(BUILD_SHARED_LIBS "Build shared libraries" ON ) 243 | - add_library(scalapack ${blacs} ${tools-C} ${pblas} ${ptools} ${redist} ${src-C}) 244 | - target_link_libraries( scalapack ${LAPACK_LIBRARIES} ${BLAS_LIBRARIES}) 245 | - add_library(scalapack-F ${pblas-F} ${pbblas} ${ptzblas} ${tools} ${src} ${extra_lapack} ) 246 | - target_link_libraries( scalapack-F ${LAPACK_LIBRARIES} ${BLAS_LIBRARIES}) 247 | - scalapack_install_library(scalapack) 248 | - scalapack_install_library(scalapack-F) 249 | -endif (UNIX) 250 | -add_subdirectory(TESTING) 251 | - 252 | -# -------------------------------------------------- 253 | -# CPACK Packaging 254 | - 255 | -SET(CPACK_PACKAGE_NAME "ScaLAPACK") 256 | -SET(CPACK_PACKAGE_VENDOR "University of Tennessee, Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd") 257 | -SET(CPACK_PACKAGE_DESCRIPTION_SUMMARY "ScaLAPACK- Linear Algebra Package") 258 | -set(SCALAPACK_VERSION 2.0.2) 259 | -set(CPACK_PACKAGE_VERSION_MAJOR 2) 260 | -set(CPACK_PACKAGE_VERSION_MINOR 0) 261 | -set(CPACK_PACKAGE_VERSION_PATCH 2) 262 | -set(CPACK_RESOURCE_FILE_LICENSE "${CMAKE_CURRENT_SOURCE_DIR}/LICENSE") 263 | -SET(CPACK_PACKAGE_INSTALL_DIRECTORY "SCALAPACK") 264 | -IF(WIN32 AND NOT UNIX) 265 | - # There is a bug in NSI that does not handle full unix paths properly. Make 266 | - # sure there is at least one set of four (4) backlasshes. 267 | - SET(CPACK_NSIS_HELP_LINK "http:\\\\\\\\http://icl.cs.utk.edu/lapack-forum") 268 | - SET(CPACK_NSIS_URL_INFO_ABOUT "http:\\\\\\\\www.netlib.org/scalapack") 269 | - SET(CPACK_NSIS_CONTACT "scalapack@eecs.utk.edu") 270 | - SET(CPACK_NSIS_MODIFY_PATH ON) 271 | - SET(CPACK_NSIS_DISPLAY_NAME "SCALAPACK-${SCALAPACK_VERSION}") 272 | - set(CPACK_PACKAGE_RELOCATABLE "true") 273 | -ELSE(WIN32 AND NOT UNIX) 274 | - SET(CPACK_GENERATOR "TGZ") 275 | - SET(CPACK_SOURCE_GENERATOR TGZ) 276 | - SET(CPACK_SOURCE_PACKAGE_FILE_NAME "scalapack-${SCALAPACK_VERSION}" ) 277 | - SET(CPACK_SOURCE_IGNORE_FILES ~$ .svn ${CPACK_SOURCE_IGNORE_FILES} ) 278 | -ENDIF(WIN32 AND NOT UNIX) 279 | -INCLUDE(CPack) 280 | - 281 | - 282 | -# -------------------------------------------------- 283 | - 284 | - 285 | -export(TARGETS scalapack FILE scalapack-targets.cmake) 286 | - 287 | -if( NOT LAPACK_FOUND ) 288 | - install(FILES 289 | - ${BLAS_LIBRARIES} ${LAPACK_LIBRARIES} 290 | - DESTINATION lib 291 | - ) 292 | -endif( NOT LAPACK_FOUND ) 293 | - 294 | -configure_file(${SCALAPACK_SOURCE_DIR}/CMAKE/scalapack-config-version.cmake.in 295 | - ${SCALAPACK_BINARY_DIR}/scalapack-config-version.cmake @ONLY) 296 | -configure_file(${SCALAPACK_SOURCE_DIR}/CMAKE/scalapack-config-build.cmake.in 297 | - ${SCALAPACK_BINARY_DIR}/scalapack-config.cmake @ONLY) 298 | - 299 | -configure_file(${CMAKE_CURRENT_SOURCE_DIR}/scalapack.pc.in ${CMAKE_CURRENT_BINARY_DIR}/scalapack.pc) 300 | - install(FILES 301 | - ${CMAKE_CURRENT_BINARY_DIR}/scalapack.pc 302 | - DESTINATION ${PKG_CONFIG_DIR} 303 | - ) 304 | - 305 | -configure_file(${SCALAPACK_SOURCE_DIR}/CMAKE/scalapack-config-install.cmake.in 306 | - ${SCALAPACK_BINARY_DIR}/CMakeFiles/scalapack-config.cmake @ONLY) 307 | -install(FILES 308 | - ${SCALAPACK_BINARY_DIR}/CMakeFiles/scalapack-config.cmake 309 | - ${SCALAPACK_BINARY_DIR}/scalapack-config-version.cmake 310 | - DESTINATION lib/cmake/scalapack-${SCALAPACK_VERSION} 311 | - ) 312 | - 313 | -install(EXPORT scalapack-targets 314 | - DESTINATION lib/cmake/scalapack-${SCALAPACK_VERSION}) 315 | +project(SLTMG Fortran) 316 | 317 | +file(GLOB SLTMG_SRC "TESTING/EIG/*.f") 318 | +add_library( sltmg STATIC ${SLTMG_SRC}) 319 | +install( TARGETS sltmg DESTINATION lib ) 320 | +ADD_CUSTOM_TARGET(install.sltmg 321 | + ${CMAKE_COMMAND} 322 | + -DBUILD_TYPE=${CMAKE_BUILD_TYPE} 323 | + -P ${CMAKE_CURRENT_BINARY_DIR}/cmake_install.cmake) 324 | +ADD_DEPENDENCIES(install.sltmg sltmg) 325 | -------------------------------------------------------------------------------- /testing/testing_pdgeqdwh.c: -------------------------------------------------------------------------------- 1 | /** 2 | * 3 | * Copyright (c) 2017, King Abdullah University of Science and Technology 4 | * All rights reserved. 5 | * 6 | **/ 7 | 8 | /** 9 | * 10 | * @file testing_pdgeqdwh.c 11 | * 12 | * QDWH is a high performance software framework for computing 13 | * the polar decomposition on distributed-memory manycore systems provided by KAUST 14 | * 15 | * @version 3.0.0 16 | * @author Dalal Sukkari 17 | * @author Hatem Ltaief 18 | * @date 2018-11-08 19 | * 20 | **/ 21 | 22 | #include 23 | #include "polar.h" 24 | 25 | 26 | /* Default values of parameters */ 27 | int nprow = 1; 28 | int npcol = 1; 29 | int lvec = 1; 30 | int rvec = 1; 31 | int n = 5120; 32 | int nb = 128; 33 | int mode = 4; 34 | double cond = 9.0072e+15; 35 | int optcond = 0; 36 | int start = 5120; 37 | int stop = 5120; 38 | int step = 1; 39 | int niter = 1; 40 | int check = 0; 41 | int verbose = 0; 42 | 43 | static inline double cWtime(void) 44 | { 45 | struct timeval tp; 46 | gettimeofday( &tp, NULL ); 47 | return tp.tv_sec + 1e-6 * tp.tv_usec; 48 | } 49 | 50 | void print_usage(void) 51 | { 52 | fprintf(stderr, 53 | "======= QDWH testing using ScaLAPACK\n" 54 | " -p --nprow : Number of MPI process rows\n" 55 | " -q --npcol : Number of MPI process cols\n" 56 | " -jl --lvec : Compute left singular vectors\n" 57 | " -jr --rvec : Compute right singular vectors\n" 58 | " -n --N : Dimension of the matrix\n" 59 | " -b --nb : Block size\n" 60 | " -m --mode : [1:6] Mode from pdlatms used to generate the matrix\n" 61 | " -k --cond : Condition number used to generate the matrix\n" 62 | " -o --optcond : Estimate Condition number using QR\n" 63 | " -i --niter : Number of iterations\n" 64 | " -r --n_range : Range for matrix sizes Start:Stop:Step\n" 65 | " -c --check : Check the solution\n" 66 | " -v --verbose : Verbose\n" 67 | " -h --help : Print this help\n" ); 68 | } 69 | 70 | #define GETOPT_STRING "p:q:x:y:n:b:m:i:o:r:Q,S:s:w:e:c:f:t:v:h" 71 | 72 | static struct option long_options[] = 73 | { 74 | /* PaRSEC specific options */ 75 | {"nprow", required_argument, 0, 'p'}, 76 | {"npcol", required_argument, 0, 'q'}, 77 | {"jl", no_argument, 0, 'x'}, 78 | {"lvec", no_argument, 0, 'x'}, 79 | {"jr", no_argument, 0, 'y'}, 80 | {"rvec", no_argument, 0, 'y'}, 81 | {"N", required_argument, 0, 'n'}, 82 | {"n", required_argument, 0, 'n'}, 83 | {"nb", required_argument, 0, 'b'}, 84 | {"b", required_argument, 0, 'b'}, 85 | {"mode", required_argument, 0, 'm'}, 86 | {"m", required_argument, 0, 'm'}, 87 | {"cond", required_argument, 0, 'k'}, 88 | {"k", required_argument, 0, 'k'}, 89 | {"optcond", required_argument, 0, 'o'}, 90 | {"o", required_argument, 0, 'o'}, 91 | {"i", required_argument, 0, 'i'}, 92 | {"niter", required_argument, 0, 'i'}, 93 | {"r", required_argument, 0, 'r'}, 94 | {"n_range", required_argument, 0, 'r'}, 95 | {"check", no_argument, 0, 'c'}, 96 | {"verbose", no_argument, 0, 'v'}, 97 | {"help", no_argument, 0, 'h'}, 98 | {"h", no_argument, 0, 'h'}, 99 | {0, 0, 0, 0} 100 | }; 101 | 102 | static void parse_arguments(int argc, char** argv) 103 | { 104 | int opt = 0; 105 | int c; 106 | int myrank_mpi; 107 | 108 | MPI_Comm_rank(MPI_COMM_WORLD, &myrank_mpi); 109 | 110 | do { 111 | #if defined(HAVE_GETOPT_LONG) 112 | c = getopt_long_only(argc, argv, "", 113 | long_options, &opt); 114 | #else 115 | c = getopt(argc, argv, GETOPT_STRING); 116 | (void) opt; 117 | #endif /* defined(HAVE_GETOPT_LONG) */ 118 | 119 | switch(c) { 120 | case 'p': nprow = atoi(optarg); break; 121 | case 'q': npcol = atoi(optarg); break; 122 | case 'n': n = atoi(optarg); start = n; stop = n; step = 1; break; 123 | case 'b': nb = atoi(optarg); break; 124 | case 'm': mode = atoi(optarg); break; 125 | case 'k': cond = atof(optarg); break; 126 | case 'o': optcond = atof(optarg); break; 127 | case 'i': niter = atoi(optarg); break; 128 | case 'r': get_range( optarg, &start, &stop, &step ); break; 129 | case 'c': check = 1; break; 130 | case 'v': verbose = 1; break; 131 | case 'h': 132 | if (myrank_mpi == 0) print_usage(); MPI_Finalize(); exit(0); 133 | break; 134 | default: 135 | break; 136 | } 137 | } while(-1 != c); 138 | } 139 | 140 | int main(int argc, char **argv) { 141 | 142 | 143 | int myrank_mpi, nprocs_mpi; 144 | int ictxt, myrow, mycol; 145 | int mloc, nloc, mlocW; 146 | int mpi_comm_rows, mpi_comm_cols; 147 | int i, j, k, iter, size, info_facto_mr, info_facto_dc, info_facto_qw, info_facto_el, info_facto_sl, info, iseed; 148 | int my_info_facto; 149 | int i0 = 0, i1 = 1; 150 | int lwork, liwork, ldw; 151 | 152 | /* Allocation for the input/output matrices */ 153 | int descA[9], descH[9]; 154 | double *A=NULL, *H=NULL; 155 | 156 | /* Allocation to check the results */ 157 | int descAcpy[9], descC[9]; 158 | double *Acpy=NULL, *C=NULL; 159 | 160 | /* Allocation for pdlatsm */ 161 | double *Wloc1=NULL, *Wloc2=NULL, *D=NULL; 162 | 163 | double eps = LAPACKE_dlamch_work('e'); 164 | int iprepad, ipostpad, sizemqrleft, sizemqrright, sizeqrf, sizeqtq, 165 | sizechk, sizesyevx, isizesyevx, 166 | sizesubtst, isizesubtst, sizetst, 167 | isizetst; 168 | /**/ 169 | 170 | double flops, GFLOPS; 171 | 172 | double orth_Uqw, berr_UHqw; 173 | double frobA; 174 | 175 | double alpha, beta; 176 | char *jobu, *jobvt; 177 | 178 | 179 | /**/ 180 | 181 | if (verbose & myrank_mpi == 0) fprintf(stderr, "Program starts... \n"); 182 | 183 | MPI_Init(&argc, &argv); 184 | MPI_Comm_rank(MPI_COMM_WORLD, &myrank_mpi); 185 | MPI_Comm_size(MPI_COMM_WORLD, &nprocs_mpi); 186 | 187 | if (verbose & myrank_mpi == 0) fprintf(stderr, "MPI Init done\n"); 188 | 189 | parse_arguments(argc, argv); 190 | 191 | if (verbose & myrank_mpi == 0) fprintf(stderr, "Checking arguments done\n"); 192 | 193 | Cblacs_get( -1, 0, &ictxt ); 194 | Cblacs_gridinit( &ictxt, "R", nprow, npcol ); 195 | Cblacs_gridinfo( ictxt, &nprow, &npcol, &myrow, &mycol ); 196 | 197 | if (verbose & myrank_mpi == 0) fprintf(stderr, "BLACS Init done\n"); 198 | 199 | if (myrank_mpi == 0) { 200 | fprintf(stderr, "# \n"); 201 | fprintf(stderr, "# NPROCS %d P %d Q %d\n", nprocs_mpi, nprow, npcol); 202 | fprintf(stderr, "# niter %d\n", niter); 203 | fprintf(stderr, "# n_range %d:%d:%d mode: %d cond: %2.4e \n", start, stop, step, mode, cond); 204 | fprintf(stderr, "# \n"); 205 | } 206 | 207 | if (verbose & myrank_mpi == 0) fprintf(stderr, "Range loop starts\n"); 208 | 209 | 210 | // Begin loop over range 211 | for (size = start; size <= stop; size += step) { 212 | while ( (int)((double)size / (double)nb) < ( max(nprow , npcol) )){ 213 | if (myrank_mpi == 0) fprintf(stderr, " Matrix size is small to be facrorized using this number of processors \n"); 214 | size += step; 215 | } 216 | n = size; ldw = 2*n; 217 | mloc = numroc_( &n, &nb, &myrow, &i0, &nprow ); 218 | nloc = numroc_( &n, &nb, &mycol, &i0, &npcol ); 219 | mlocW = numroc_( &ldw, &nb, &myrow, &i0, &nprow ); 220 | if (verbose & myrank_mpi == 0) fprintf(stderr, "Desc Init starts %d\n", mloc); 221 | 222 | descinit_( descA, &n, &n, &nb, &nb, &i0, &i0, &ictxt, &mloc, &info ); 223 | descinit_( descAcpy, &n, &n, &nb, &nb, &i0, &i0, &ictxt, &mloc, &info ); 224 | descinit_( descC, &n, &n, &nb, &nb, &i0, &i0, &ictxt, &mloc, &info ); 225 | descinit_( descH, &n, &n, &nb, &nb, &i0, &i0, &ictxt, &mloc, &info ); 226 | if (verbose & myrank_mpi == 0) fprintf(stderr, "Desc Init ends %d\n", mloc); 227 | 228 | A = (double *)malloc(mloc*nloc*sizeof(double)) ; 229 | H = (double *)malloc(mloc*nloc*sizeof(double)) ; 230 | C = (double *)malloc(mloc*nloc*sizeof(double)) ; 231 | Acpy = (double *)malloc(mloc*nloc*sizeof(double)) ; 232 | D = (double *)malloc(n*sizeof(double)) ; 233 | 234 | 235 | /* Generate matrix by pdlatms */ 236 | { 237 | char *dist = "N"; /* NORMAL( 0, 1 ) ( 'N' for normal ) */ 238 | int iseed[4] = {1, 0, 0, 1}; 239 | char *sym = "P"; /* The generated matrix is symmetric, with 240 | eigenvalues (= singular values) specified by D, COND, 241 | MODE, and DMAX; they will not be negative. 242 | "N" not supported. */ 243 | //int mode = 4; /* sets D(i)=1 - (i-1)/(N-1)*(1 - 1/COND) */ 244 | //double cond = 1.0/eps; 245 | double dmax = 1.0; 246 | int kl = n; 247 | int ku = n; 248 | char *pack = "N"; /* no packing */ 249 | int order = n; 250 | int info; 251 | 252 | pdlasizesep_( descA, 253 | &iprepad, &ipostpad, &sizemqrleft, &sizemqrright, &sizeqrf, 254 | &lwork, 255 | &sizeqtq, &sizechk, &sizesyevx, &isizesyevx, &sizesubtst, 256 | &isizesubtst, &sizetst, &isizetst ); 257 | if (verbose & myrank_mpi == 0) fprintf(stderr, "Setting lwork done\n"); 258 | Wloc1 = (double *)calloc(lwork,sizeof(double)) ; 259 | pdlatms_(&n, &n, dist, 260 | iseed, sym, D, &mode, &cond, &dmax, 261 | &kl, &ku, pack, 262 | A, &i1, &i1, descA, &order, 263 | Wloc1, &lwork, &info); 264 | if (verbose & myrank_mpi == 0) fprintf(stderr, "MatGen done\n"); 265 | if (info != 0) { 266 | fprintf(stderr, "An error occured during matrix generation: %d\n", info ); 267 | return EXIT_FAILURE; 268 | } 269 | pdlacpy_( "All", &n, &n, 270 | A, &i1, &i1, descA, 271 | Acpy, &i1, &i1, descAcpy ); 272 | frobA = pdlange_ ( "f", &n, &n, A, &i1, &i1, descA, Wloc1); 273 | beta = 0.0; 274 | 275 | if (verbose & myrank_mpi == 0) fprintf(stderr, "Copy to Acpy done\n"); 276 | 277 | free( Wloc1 ); 278 | } 279 | 280 | if (myrank_mpi == 0) fprintf(stderr, "\n\n"); 281 | if (myrank_mpi == 0) fprintf(stderr, "/////////////////////////////////////////////////////////////////////////\n"); 282 | if (myrank_mpi == 0) fprintf(stderr, "/////////////////////////////////////////////////////////////////////////\n"); 283 | if (myrank_mpi == 0) fprintf(stderr, "/////////////////////////////////////////////////////////////////////////\n"); 284 | 285 | // QDWH 286 | //if ( qwmr || qwdc || qwel || polarqdwh) { 287 | jobu = lvec ? "V" : "N"; 288 | int lWork1, lWork2, lWi; 289 | Wloc1 = (double *)calloc(1,sizeof(double)) ; 290 | Wloc2 = (double *)calloc(1,sizeof(double)) ; 291 | lWork1 = -1; 292 | lWork2 = -1; 293 | 294 | pdgeqdwh( "H", n, n, 295 | A, i1, i1, descA, 296 | H, i1, i1, descH, 297 | //NULL, lWork, //debug 298 | Wloc1, lWork1, 299 | Wloc2, lWork2, 300 | &my_info_facto); 301 | 302 | lWork1 = Wloc1[0]; 303 | lWork2 = Wloc2[0]; 304 | 305 | 306 | //Wloc = (double *)malloc(lWork*n*sizeof(double)); 307 | Wloc1 = (double *)malloc((lWork1*nloc)*sizeof(double)); 308 | Wloc2 = (double *)malloc((lWork2*nloc)*sizeof(double)); 309 | 310 | //int MB3 = 3*n; 311 | //int mlocW3 = numroc_( &MB3, &nb, &myrow, &i0, &nprow ); 312 | //mlocW3 = ((mlocW3+(nb-1))/nb)*nb; 313 | //Wloc = (double *)malloc((mlocW3*nloc)*sizeof(double)); 314 | 315 | 316 | for (iter = 0; iter < niter; iter++) { 317 | pdlacpy_( "All", &n, &n, 318 | Acpy, &i1, &i1, descAcpy, 319 | A, &i1, &i1, descA ); 320 | flops = 0.0; 321 | if (verbose & myrank_mpi == 0) fprintf(stderr, "\nQDWH + ScaLAPACK EIG done\n"); 322 | if (verbose & myrank_mpi == 0) fprintf(stderr, "QDWH starts...\n"); 323 | 324 | /* 325 | * Find polar decomposition using QDWH. 326 | * C contains the positive-definite factor. 327 | * A contains the orthogonal polar factor. 328 | */ 329 | 330 | pdgeqdwh( "H", n, n, 331 | A, i1, i1, descA, 332 | H, i1, i1, descH, 333 | //NULL, lWork, //debug 334 | Wloc1, lWork1, 335 | Wloc2, lWork2, 336 | &my_info_facto); 337 | 338 | MPI_Allreduce( &my_info_facto, &info_facto_qw, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD); 339 | 340 | if (verbose & myrank_mpi == 0) fprintf(stderr, "QDWH ends...\n"); 341 | if (verbose & check & myrank_mpi == 0) fprintf(stderr, "Testing QDWH starts...\n"); 342 | /* 343 | * Checking the polar factorization 344 | */ 345 | if(check ){ 346 | /* 347 | * checking orthogonality of Up 348 | */ 349 | alpha = 0.0; beta = 1.0; 350 | pdlaset_( "G", &n, &n, &alpha, &beta, C, &i1, &i1, descC); 351 | alpha = 1.0; beta = -1.0; 352 | pdgemm_( "T", "N", &n, &n, &n, 353 | &alpha, 354 | A, &i1, &i1, descA, 355 | A, &i1, &i1, descA, 356 | &beta, 357 | C, &i1, &i1, descC); 358 | orth_Uqw = pdlange_ ( "f", &n, &n, C, &i1, &i1, descC, Wloc1)/sqrt(n); 359 | /* 360 | * checking the factorization |A-Up*H| 361 | */ 362 | pdlacpy_( "A", &n, &n, Acpy, &i1, &i1, descAcpy, C, &i1, &i1, descC ); 363 | alpha = 1.0; beta = -1.0; 364 | pdgemm_( "N", "N", &n, &n, &n, 365 | &alpha, 366 | A, &i1, &i1, descA, 367 | H, &i1, &i1, descH, 368 | &beta, 369 | C, &i1, &i1, descC); 370 | berr_UHqw = pdlange_ ( "f", &n, &n, C, &i1, &i1, descC, Wloc1)/frobA; 371 | if ( myrank_mpi == 0) { 372 | fprintf(stderr, "# QDWH \n"); 373 | fprintf(stderr, "#\n"); 374 | fprintf(stderr, "# \tN \tNB \tNP \tP \tQ \tinfo \tBerr_UpH \tOrth_Up \n"); 375 | fprintf(stderr, " %6d \t%4d \t%4d \t%3d \t%3d", n, nb, nprocs_mpi, nprow, npcol); 376 | fprintf(stderr, "\t%d \t%2.4e \t%2.4e \n", info_facto_qw, berr_UHqw, orth_Uqw); 377 | fprintf(stderr, "/////////////////////////////////////////////////////////////////////////\n"); 378 | } 379 | } 380 | } 381 | 382 | 383 | 384 | if (myrank_mpi == 0) fprintf(stderr, "/////////////////////////////////////////////////////////////////////////\n"); 385 | 386 | free( A ); 387 | free( Acpy ); 388 | free( C ); 389 | free( H ); 390 | free( D ); 391 | free(Wloc1); 392 | free(Wloc2); 393 | if (verbose & myrank_mpi == 0) fprintf(stderr, "Free matrices done\n"); 394 | } // End loop over range 395 | 396 | 397 | if (verbose & myrank_mpi == 0) fprintf(stderr, "Range loop ends\n"); 398 | 399 | blacs_gridexit_( &i0 ); 400 | MPI_Finalize(); 401 | if (verbose & myrank_mpi == 0) fprintf(stderr, "Program ends...\n"); 402 | return 0; 403 | } 404 | -------------------------------------------------------------------------------- /testing/testing_pdgezolopd.c: -------------------------------------------------------------------------------- 1 | /** 2 | * 3 | * Copyright (c) 2017, King Abdullah University of Science and Technology 4 | * All rights reserved. 5 | * 6 | **/ 7 | 8 | /** 9 | * 10 | * @file testing_pdgezolopd.c 11 | * 12 | * ZOLOPD is a high performance software framework for computing 13 | * the polar decomposition on distributed-memory manycore systems provided by KAUST 14 | * 15 | * @version 3.0.0 16 | * @author Dalal Sukkari 17 | * @author Hatem Ltaief 18 | * @date 2018-11-08 19 | * 20 | **/ 21 | 22 | #include "polar.h" 23 | 24 | 25 | /* Default values of parameters */ 26 | int nprow = 1; 27 | int npcol = 1; 28 | int lvec = 1; 29 | int rvec = 1; 30 | int n = 5120; 31 | int nb = 128; 32 | int mode = 4; 33 | double cond = 9.0072e+15; 34 | int optcond = 0; 35 | int start = 5120; 36 | int stop = 5120; 37 | int step = 1; 38 | int niter = 1; 39 | int check = 0; 40 | int verbose = 0; 41 | 42 | static inline double cWtime(void) 43 | { 44 | struct timeval tp; 45 | gettimeofday( &tp, NULL ); 46 | return tp.tv_sec + 1e-6 * tp.tv_usec; 47 | } 48 | 49 | void print_usage(void) 50 | { 51 | fprintf(stderr, 52 | "======= ZOLOPD testing using ScaLAPACK\n" 53 | " -p --nprow : Number of MPI process rows\n" 54 | " -q --npcol : Number of MPI process cols\n" 55 | " -jl --lvec : Compute left singular vectors\n" 56 | " -jr --rvec : Compute right singular vectors\n" 57 | " -n --N : Dimension of the matrix\n" 58 | " -b --nb : Block size\n" 59 | " -m --mode : [1:6] Mode from pdlatms used to generate the matrix\n" 60 | " -k --cond : Condition number used to generate the matrix\n" 61 | " -o --optcond : Estimate Condition number using QR\n" 62 | " -i --niter : Number of iterations\n" 63 | " -r --n_range : Range for matrix sizes Start:Stop:Step\n" 64 | " -c --check : Check the solution\n" 65 | " -v --verbose : Verbose\n" 66 | " -h --help : Print this help\n" ); 67 | } 68 | 69 | #define GETOPT_STRING "p:q:x:y:n:b:m:i:o:r:Q,S:s:w:e:c:f:t:v:h" 70 | 71 | static struct option long_options[] = 72 | { 73 | /* PaRSEC specific options */ 74 | {"nprow", required_argument, 0, 'p'}, 75 | {"npcol", required_argument, 0, 'q'}, 76 | {"jl", no_argument, 0, 'x'}, 77 | {"lvec", no_argument, 0, 'x'}, 78 | {"jr", no_argument, 0, 'y'}, 79 | {"rvec", no_argument, 0, 'y'}, 80 | {"N", required_argument, 0, 'n'}, 81 | {"n", required_argument, 0, 'n'}, 82 | {"nb", required_argument, 0, 'b'}, 83 | {"b", required_argument, 0, 'b'}, 84 | {"mode", required_argument, 0, 'm'}, 85 | {"m", required_argument, 0, 'm'}, 86 | {"cond", required_argument, 0, 'k'}, 87 | {"k", required_argument, 0, 'k'}, 88 | {"optcond", required_argument, 0, 'o'}, 89 | {"o", required_argument, 0, 'o'}, 90 | {"i", required_argument, 0, 'i'}, 91 | {"niter", required_argument, 0, 'i'}, 92 | {"r", required_argument, 0, 'r'}, 93 | {"n_range", required_argument, 0, 'r'}, 94 | {"check", no_argument, 0, 'c'}, 95 | {"verbose", no_argument, 0, 'v'}, 96 | {"help", no_argument, 0, 'h'}, 97 | {"h", no_argument, 0, 'h'}, 98 | {0, 0, 0, 0} 99 | }; 100 | 101 | static void parse_arguments(int argc, char** argv) 102 | { 103 | int opt = 0; 104 | int c; 105 | int myrank_mpi; 106 | 107 | MPI_Comm_rank(MPI_COMM_WORLD, &myrank_mpi); 108 | 109 | do { 110 | #if defined(HAVE_GETOPT_LONG) 111 | c = getopt_long_only(argc, argv, "", 112 | long_options, &opt); 113 | #else 114 | c = getopt(argc, argv, GETOPT_STRING); 115 | (void) opt; 116 | #endif /* defined(HAVE_GETOPT_LONG) */ 117 | 118 | switch(c) { 119 | case 'p': nprow = atoi(optarg); break; 120 | case 'q': npcol = atoi(optarg); break; 121 | case 'n': n = atoi(optarg); start = n; stop = n; step = 1; break; 122 | case 'b': nb = atoi(optarg); break; 123 | case 'm': mode = atoi(optarg); break; 124 | case 'k': cond = atof(optarg); break; 125 | case 'o': optcond = atof(optarg); break; 126 | case 'i': niter = atoi(optarg); break; 127 | case 'r': get_range( optarg, &start, &stop, &step ); break; 128 | case 'c': check = 1; break; 129 | case 'v': verbose = 1; break; 130 | case 'h': 131 | if (myrank_mpi == 0) print_usage(); MPI_Finalize(); exit(0); 132 | break; 133 | default: 134 | break; 135 | } 136 | } while(-1 != c); 137 | } 138 | 139 | int main(int argc, char **argv) { 140 | 141 | 142 | int myrank_mpi, nprocs_mpi; 143 | int ictxt, myrow, mycol; 144 | int mloc, nloc, mlocW; 145 | int mpi_comm_rows, mpi_comm_cols; 146 | int i, j, k, iter, size, info_facto_mr, info_facto_dc, info_facto_zo, info_facto_el, info_facto_sl, info, iseed; 147 | int my_info_facto; 148 | int i0 = 0, i1 = 1; 149 | int lwork, liwork, ldw; 150 | 151 | /* Allocation for the input/output matrices */ 152 | int descA[9], descH[9]; 153 | double *A=NULL, *H=NULL; 154 | 155 | /* Allocation to check the results */ 156 | int descAcpy[9], descC[9]; 157 | double *Acpy=NULL, *C=NULL; 158 | 159 | /* Allocation for pdlatsm */ 160 | double *Wloc1=NULL, *Wloc2=NULL, *D=NULL; 161 | 162 | double eps = LAPACKE_dlamch_work('e'); 163 | int iprepad, ipostpad, sizemqrleft, sizemqrright, sizeqrf, sizeqtq, 164 | sizechk, sizesyevx, isizesyevx, 165 | sizesubtst, isizesubtst, sizetst, 166 | isizetst; 167 | /**/ 168 | 169 | double flops, GFLOPS; 170 | 171 | double orth_Uzo, berr_UHzo; 172 | double frobA; 173 | 174 | double alpha, beta; 175 | char *jobu, *jobvt; 176 | 177 | 178 | /**/ 179 | 180 | if (verbose & myrank_mpi == 0) fprintf(stderr, "Program starts... \n"); 181 | 182 | MPI_Init(&argc, &argv); 183 | MPI_Comm_rank(MPI_COMM_WORLD, &myrank_mpi); 184 | MPI_Comm_size(MPI_COMM_WORLD, &nprocs_mpi); 185 | 186 | if (verbose & myrank_mpi == 0) fprintf(stderr, "MPI Init done\n"); 187 | 188 | parse_arguments(argc, argv); 189 | 190 | if (verbose & myrank_mpi == 0) fprintf(stderr, "Checking arguments done\n"); 191 | 192 | Cblacs_get( -1, 0, &ictxt ); 193 | Cblacs_gridinit( &ictxt, "R", nprow, npcol ); 194 | Cblacs_gridinfo( ictxt, &nprow, &npcol, &myrow, &mycol ); 195 | 196 | if (verbose & myrank_mpi == 0) fprintf(stderr, "BLACS Init done\n"); 197 | 198 | if (myrank_mpi == 0) { 199 | fprintf(stderr, "# \n"); 200 | fprintf(stderr, "# NPROCS %d P %d Q %d\n", nprocs_mpi, nprow, npcol); 201 | fprintf(stderr, "# niter %d\n", niter); 202 | fprintf(stderr, "# n_range %d:%d:%d mode: %d cond: %2.4e \n", start, stop, step, mode, cond); 203 | fprintf(stderr, "# \n"); 204 | } 205 | 206 | if (verbose & myrank_mpi == 0) fprintf(stderr, "Range loop starts\n"); 207 | 208 | 209 | // Begin loop over range 210 | for (size = start; size <= stop; size += step) { 211 | while ( (int)((double)size / (double)nb) < ( max(nprow , npcol) )){ 212 | if (myrank_mpi == 0) fprintf(stderr, " Matrix size is small to be facrorized using this number of processors \n"); 213 | size += step; 214 | } 215 | n = size; ldw = 2*n; 216 | mloc = numroc_( &n, &nb, &myrow, &i0, &nprow ); 217 | nloc = numroc_( &n, &nb, &mycol, &i0, &npcol ); 218 | mlocW = numroc_( &ldw, &nb, &myrow, &i0, &nprow ); 219 | if (verbose & myrank_mpi == 0) fprintf(stderr, "Desc Init starts %d\n", mloc); 220 | 221 | descinit_( descA, &n, &n, &nb, &nb, &i0, &i0, &ictxt, &mloc, &info ); 222 | descinit_( descAcpy, &n, &n, &nb, &nb, &i0, &i0, &ictxt, &mloc, &info ); 223 | descinit_( descC, &n, &n, &nb, &nb, &i0, &i0, &ictxt, &mloc, &info ); 224 | descinit_( descH, &n, &n, &nb, &nb, &i0, &i0, &ictxt, &mloc, &info ); 225 | if (verbose & myrank_mpi == 0) fprintf(stderr, "Desc Init ends %d\n", mloc); 226 | 227 | A = (double *)malloc(mloc*nloc*sizeof(double)) ; 228 | H = (double *)malloc(mloc*nloc*sizeof(double)) ; 229 | C = (double *)malloc(mloc*nloc*sizeof(double)) ; 230 | Acpy = (double *)malloc(mloc*nloc*sizeof(double)) ; 231 | D = (double *)malloc(n*sizeof(double)) ; 232 | 233 | 234 | /* Generate matrix by pdlatms */ 235 | { 236 | char *dist = "N"; /* NORMAL( 0, 1 ) ( 'N' for normal ) */ 237 | int iseed[4] = {1, 0, 0, 1}; 238 | char *sym = "P"; /* The generated matrix is symmetric, with 239 | eigenvalues (= singular values) specified by D, COND, 240 | MODE, and DMAX; they will not be negative. 241 | "N" not supported. */ 242 | //int mode = 4; /* sets D(i)=1 - (i-1)/(N-1)*(1 - 1/COND) */ 243 | //double cond = 1.0/eps; 244 | double dmax = 1.0; 245 | int kl = n; 246 | int ku = n; 247 | char *pack = "N"; /* no packing */ 248 | int order = n; 249 | int info; 250 | 251 | pdlasizesep_( descA, 252 | &iprepad, &ipostpad, &sizemqrleft, &sizemqrright, &sizeqrf, 253 | &lwork, 254 | &sizeqtq, &sizechk, &sizesyevx, &isizesyevx, &sizesubtst, 255 | &isizesubtst, &sizetst, &isizetst ); 256 | if (verbose & myrank_mpi == 0) fprintf(stderr, "Setting lwork done\n"); 257 | Wloc1 = (double *)calloc(lwork,sizeof(double)) ; 258 | pdlatms_(&n, &n, dist, 259 | iseed, sym, D, &mode, &cond, &dmax, 260 | &kl, &ku, pack, 261 | A, &i1, &i1, descA, &order, 262 | Wloc1, &lwork, &info); 263 | if (verbose & myrank_mpi == 0) fprintf(stderr, "MatGen done\n"); 264 | if (info != 0) { 265 | fprintf(stderr, "An error occured during matrix generation: %d\n", info ); 266 | return EXIT_FAILURE; 267 | } 268 | pdlacpy_( "All", &n, &n, 269 | A, &i1, &i1, descA, 270 | Acpy, &i1, &i1, descAcpy ); 271 | frobA = pdlange_ ( "f", &n, &n, A, &i1, &i1, descA, Wloc1); 272 | beta = 0.0; 273 | 274 | if (verbose & myrank_mpi == 0) fprintf(stderr, "Copy to Acpy done\n"); 275 | 276 | free( Wloc1 ); 277 | } 278 | 279 | if (myrank_mpi == 0) fprintf(stderr, "\n\n"); 280 | if (myrank_mpi == 0) fprintf(stderr, "/////////////////////////////////////////////////////////////////////////\n"); 281 | if (myrank_mpi == 0) fprintf(stderr, "/////////////////////////////////////////////////////////////////////////\n"); 282 | if (myrank_mpi == 0) fprintf(stderr, "/////////////////////////////////////////////////////////////////////////\n"); 283 | 284 | // ZOLOPD 285 | //if ( zomr || zodc || zoel || polarzolopd) { 286 | jobu = lvec ? "V" : "N"; 287 | int lWork1, lWork2, lWi; 288 | Wloc1 = (double *)calloc(1,sizeof(double)) ; 289 | Wloc2 = (double *)calloc(1,sizeof(double)) ; 290 | lWork1 = -1; 291 | lWork2 = -1; 292 | 293 | pdgezolopd( "H", n, n, 294 | A, i1, i1, descA, 295 | H, i1, i1, descH, 296 | //NULL, lWork, //debug 297 | Wloc1, lWork1, 298 | Wloc2, lWork2, 299 | &my_info_facto); 300 | 301 | lWork1 = Wloc1[0]; 302 | lWork2 = Wloc2[0]; 303 | 304 | 305 | //Wloc = (double *)malloc(lWork*n*sizeof(double)); 306 | Wloc1 = (double *)malloc((lWork1*nloc)*sizeof(double)); 307 | Wloc2 = (double *)malloc((lWork2*nloc)*sizeof(double)); 308 | 309 | //int MB3 = 3*n; 310 | //int mlocW3 = numroc_( &MB3, &nb, &myrow, &i0, &nprow ); 311 | //mlocW3 = ((mlocW3+(nb-1))/nb)*nb; 312 | //Wloc = (double *)malloc((mlocW3*nloc)*sizeof(double)); 313 | 314 | 315 | for (iter = 0; iter < niter; iter++) { 316 | pdlacpy_( "All", &n, &n, 317 | Acpy, &i1, &i1, descAcpy, 318 | A, &i1, &i1, descA ); 319 | flops = 0.0; 320 | if (verbose & myrank_mpi == 0) fprintf(stderr, "\nZOLOPD + ScaLAPACK EIG done\n"); 321 | if (verbose & myrank_mpi == 0) fprintf(stderr, "ZOLOPD starts...\n"); 322 | 323 | /* 324 | * Find polar decomposition using ZOLOPD. 325 | * C contains the positive-definite factor. 326 | * A contains the orthogonal polar factor. 327 | */ 328 | 329 | pdgezolopd( "H", n, n, 330 | A, i1, i1, descA, 331 | H, i1, i1, descH, 332 | //NULL, lWork, //debug 333 | Wloc1, lWork1, 334 | Wloc2, lWork2, 335 | &my_info_facto); 336 | 337 | MPI_Allreduce( &my_info_facto, &info_facto_zo, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD); 338 | 339 | if (verbose & myrank_mpi == 0) fprintf(stderr, "ZOLOPD ends...\n"); 340 | if (verbose & check & myrank_mpi == 0) fprintf(stderr, "Testing ZOLOPD starts...\n"); 341 | /* 342 | * Checking the polar factorization 343 | */ 344 | if(check ){ 345 | /* 346 | * checking orthogonality of Up 347 | */ 348 | alpha = 0.0; beta = 1.0; 349 | pdlaset_( "G", &n, &n, &alpha, &beta, C, &i1, &i1, descC); 350 | alpha = 1.0; beta = -1.0; 351 | pdgemm_( "T", "N", &n, &n, &n, 352 | &alpha, 353 | A, &i1, &i1, descA, 354 | A, &i1, &i1, descA, 355 | &beta, 356 | C, &i1, &i1, descC); 357 | orth_Uzo = pdlange_ ( "f", &n, &n, C, &i1, &i1, descC, Wloc1)/sqrt(n); 358 | /* 359 | * checking the factorization |A-Up*H| 360 | */ 361 | pdlacpy_( "A", &n, &n, Acpy, &i1, &i1, descAcpy, C, &i1, &i1, descC ); 362 | alpha = 1.0; beta = -1.0; 363 | pdgemm_( "N", "N", &n, &n, &n, 364 | &alpha, 365 | A, &i1, &i1, descA, 366 | H, &i1, &i1, descH, 367 | &beta, 368 | C, &i1, &i1, descC); 369 | berr_UHzo = pdlange_ ( "f", &n, &n, C, &i1, &i1, descC, Wloc1)/frobA; 370 | if ( myrank_mpi == 0) { 371 | fprintf(stderr, "# ZOLOPD \n"); 372 | fprintf(stderr, "#\n"); 373 | fprintf(stderr, "# \tN \tNB \tNP \tP \tQ \tinfo \tBerr_UpH \tOrth_Up \n"); 374 | fprintf(stderr, " %6d \t%4d \t%4d \t%3d \t%3d", n, nb, nprocs_mpi, nprow, npcol); 375 | fprintf(stderr, "\t%d \t%2.4e \t%2.4e \n", info_facto_zo, berr_UHzo, orth_Uzo); 376 | fprintf(stderr, "/////////////////////////////////////////////////////////////////////////\n"); 377 | } 378 | } 379 | } 380 | 381 | 382 | 383 | if (myrank_mpi == 0) fprintf(stderr, "/////////////////////////////////////////////////////////////////////////\n"); 384 | 385 | free( A ); 386 | free( Acpy ); 387 | free( C ); 388 | free( H ); 389 | free( D ); 390 | free(Wloc1); 391 | free(Wloc2); 392 | if (verbose & myrank_mpi == 0) fprintf(stderr, "Free matrices done\n"); 393 | } // End loop over range 394 | 395 | 396 | if (verbose & myrank_mpi == 0) fprintf(stderr, "Range loop ends\n"); 397 | 398 | blacs_gridexit_( &i0 ); 399 | MPI_Finalize(); 400 | if (verbose & myrank_mpi == 0) fprintf(stderr, "Program ends...\n"); 401 | return 0; 402 | } 403 | -------------------------------------------------------------------------------- /timing/timing_pdgeqdwh.c: -------------------------------------------------------------------------------- 1 | /** 2 | * 3 | * Copyright (c) 2017, King Abdullah University of Science and Technology 4 | * All rights reserved. 5 | * 6 | **/ 7 | 8 | /** 9 | * 10 | * @file timing_pdgeqdwh.c 11 | * 12 | * QDWH is a high performance software framework for computing 13 | * the polar decomposition on distributed-memory manycore systems provided by KAUST 14 | * 15 | * @version 3.0.0 16 | * @author Dalal Sukkari 17 | * @author Hatem Ltaief 18 | * @date 2018-11-08 19 | * 20 | **/ 21 | 22 | #include 23 | #include "polar.h" 24 | 25 | 26 | /* Default values of parameters */ 27 | int nprow = 1; 28 | int npcol = 1; 29 | int lvec = 1; 30 | int rvec = 1; 31 | int n = 5120; 32 | int nb = 128; 33 | int mode = 4; 34 | double cond = 9.0072e+15; 35 | int optcond = 0; 36 | int start = 5120; 37 | int stop = 5120; 38 | int step = 1; 39 | int niter = 1; 40 | int check = 0; 41 | int verbose = 0; 42 | 43 | static inline double cWtime(void) 44 | { 45 | struct timeval tp; 46 | gettimeofday( &tp, NULL ); 47 | return tp.tv_sec + 1e-6 * tp.tv_usec; 48 | } 49 | 50 | void print_usage(void) 51 | { 52 | fprintf(stderr, 53 | "======= QDWH timing using ScaLAPACK\n" 54 | " -p --nprow : Number of MPI process rows\n" 55 | " -q --npcol : Number of MPI process cols\n" 56 | " -jl --lvec : Compute left singular vectors\n" 57 | " -jr --rvec : Compute right singular vectors\n" 58 | " -n --N : Dimension of the matrix\n" 59 | " -b --nb : Block size\n" 60 | " -m --mode : [1:6] Mode from pdlatms used to generate the matrix\n" 61 | " -k --cond : Condition number used to generate the matrix\n" 62 | " -o --optcond : Estimate Condition number using QR\n" 63 | " -i --niter : Number of iterations\n" 64 | " -r --n_range : Range for matrix sizes Start:Stop:Step\n" 65 | " -c --check : Check the solution\n" 66 | " -v --verbose : Verbose\n" 67 | " -h --help : Print this help\n" ); 68 | } 69 | 70 | #define GETOPT_STRING "p:q:x:y:n:b:m:i:o:r:Q,S:s:w:e:c:f:t:v:h" 71 | 72 | static struct option long_options[] = 73 | { 74 | /* PaRSEC specific options */ 75 | {"nprow", required_argument, 0, 'p'}, 76 | {"npcol", required_argument, 0, 'q'}, 77 | {"jl", no_argument, 0, 'x'}, 78 | {"lvec", no_argument, 0, 'x'}, 79 | {"jr", no_argument, 0, 'y'}, 80 | {"rvec", no_argument, 0, 'y'}, 81 | {"N", required_argument, 0, 'n'}, 82 | {"n", required_argument, 0, 'n'}, 83 | {"nb", required_argument, 0, 'b'}, 84 | {"b", required_argument, 0, 'b'}, 85 | {"mode", required_argument, 0, 'm'}, 86 | {"m", required_argument, 0, 'm'}, 87 | {"cond", required_argument, 0, 'k'}, 88 | {"k", required_argument, 0, 'k'}, 89 | {"optcond", required_argument, 0, 'o'}, 90 | {"o", required_argument, 0, 'o'}, 91 | {"i", required_argument, 0, 'i'}, 92 | {"niter", required_argument, 0, 'i'}, 93 | {"r", required_argument, 0, 'r'}, 94 | {"n_range", required_argument, 0, 'r'}, 95 | {"check", no_argument, 0, 'c'}, 96 | {"verbose", no_argument, 0, 'v'}, 97 | {"help", no_argument, 0, 'h'}, 98 | {"h", no_argument, 0, 'h'}, 99 | {0, 0, 0, 0} 100 | }; 101 | 102 | static void parse_arguments(int argc, char** argv) 103 | { 104 | int opt = 0; 105 | int c; 106 | int myrank_mpi; 107 | 108 | MPI_Comm_rank(MPI_COMM_WORLD, &myrank_mpi); 109 | 110 | do { 111 | #if defined(HAVE_GETOPT_LONG) 112 | c = getopt_long_only(argc, argv, "", 113 | long_options, &opt); 114 | #else 115 | c = getopt(argc, argv, GETOPT_STRING); 116 | (void) opt; 117 | #endif /* defined(HAVE_GETOPT_LONG) */ 118 | 119 | switch(c) { 120 | case 'p': nprow = atoi(optarg); break; 121 | case 'q': npcol = atoi(optarg); break; 122 | case 'n': n = atoi(optarg); start = n; stop = n; step = 1; break; 123 | case 'b': nb = atoi(optarg); break; 124 | case 'm': mode = atoi(optarg); break; 125 | case 'k': cond = atof(optarg); break; 126 | case 'o': optcond = atof(optarg); break; 127 | case 'i': niter = atoi(optarg); break; 128 | case 'r': get_range( optarg, &start, &stop, &step ); break; 129 | case 'c': check = 1; break; 130 | case 'v': verbose = 1; break; 131 | case 'h': 132 | if (myrank_mpi == 0) print_usage(); MPI_Finalize(); exit(0); 133 | break; 134 | default: 135 | break; 136 | } 137 | } while(-1 != c); 138 | } 139 | 140 | int main(int argc, char **argv) { 141 | 142 | 143 | int myrank_mpi, nprocs_mpi; 144 | int ictxt, myrow, mycol; 145 | int mloc, nloc, mlocW; 146 | int mpi_comm_rows, mpi_comm_cols; 147 | int i, j, k, iter, size, info_facto_mr, info_facto_dc, info_facto_qw, info_facto_el, info_facto_sl, info, iseed; 148 | int my_info_facto; 149 | int i0 = 0, i1 = 1; 150 | int lwork, liwork, ldw; 151 | 152 | /* Allocation for the input/output matrices */ 153 | int descA[9], descH[9]; 154 | double *A=NULL, *H=NULL; 155 | 156 | /* Allocation to check the results */ 157 | int descAcpy[9], descC[9]; 158 | double *Acpy=NULL, *C=NULL; 159 | 160 | /* Allocation for pdlatsm */ 161 | double *Wloc1=NULL, *Wloc2=NULL, *D=NULL; 162 | 163 | double eps = LAPACKE_dlamch_work('e'); 164 | int iprepad, ipostpad, sizemqrleft, sizemqrright, sizeqrf, sizeqtq, 165 | sizechk, sizesyevx, isizesyevx, 166 | sizesubtst, isizesubtst, sizetst, 167 | isizetst; 168 | /**/ 169 | 170 | double flops, GFLOPS; 171 | 172 | double orth_Uqw, berr_UHqw; 173 | double frobA; 174 | 175 | double alpha, beta; 176 | char *jobu, *jobvt; 177 | 178 | double my_elapsed_polarqdwh = 0.0, elapsed_polarqdwh = 0.0, sumtime_polarqdwh = 0.0; 179 | double max_time_polarqdwh = 0.0, min_time_polarqdwh = 1e20; 180 | 181 | /**/ 182 | 183 | if (verbose & myrank_mpi == 0) fprintf(stderr, "Program starts... \n"); 184 | 185 | MPI_Init(&argc, &argv); 186 | MPI_Comm_rank(MPI_COMM_WORLD, &myrank_mpi); 187 | MPI_Comm_size(MPI_COMM_WORLD, &nprocs_mpi); 188 | 189 | if (verbose & myrank_mpi == 0) fprintf(stderr, "MPI Init done\n"); 190 | 191 | parse_arguments(argc, argv); 192 | 193 | if (verbose & myrank_mpi == 0) fprintf(stderr, "Checking arguments done\n"); 194 | 195 | Cblacs_get( -1, 0, &ictxt ); 196 | Cblacs_gridinit( &ictxt, "R", nprow, npcol ); 197 | Cblacs_gridinfo( ictxt, &nprow, &npcol, &myrow, &mycol ); 198 | 199 | if (verbose & myrank_mpi == 0) fprintf(stderr, "BLACS Init done\n"); 200 | 201 | if (myrank_mpi == 0) { 202 | fprintf(stderr, "# \n"); 203 | fprintf(stderr, "# NPROCS %d P %d Q %d\n", nprocs_mpi, nprow, npcol); 204 | fprintf(stderr, "# niter %d\n", niter); 205 | fprintf(stderr, "# n_range %d:%d:%d mode: %d cond: %2.4e \n", start, stop, step, mode, cond); 206 | fprintf(stderr, "# \n"); 207 | } 208 | 209 | if (verbose & myrank_mpi == 0) fprintf(stderr, "Range loop starts\n"); 210 | 211 | 212 | // Begin loop over range 213 | for (size = start; size <= stop; size += step) { 214 | while ( (int)((double)size / (double)nb) < ( max(nprow , npcol) )){ 215 | if (myrank_mpi == 0) fprintf(stderr, " Matrix size is small to be facrorized using this number of processors \n"); 216 | size += step; 217 | } 218 | n = size; ldw = 2*n; 219 | mloc = numroc_( &n, &nb, &myrow, &i0, &nprow ); 220 | nloc = numroc_( &n, &nb, &mycol, &i0, &npcol ); 221 | mlocW = numroc_( &ldw, &nb, &myrow, &i0, &nprow ); 222 | if (verbose & myrank_mpi == 0) fprintf(stderr, "Desc Init starts %d\n", mloc); 223 | 224 | descinit_( descA, &n, &n, &nb, &nb, &i0, &i0, &ictxt, &mloc, &info ); 225 | descinit_( descAcpy, &n, &n, &nb, &nb, &i0, &i0, &ictxt, &mloc, &info ); 226 | descinit_( descC, &n, &n, &nb, &nb, &i0, &i0, &ictxt, &mloc, &info ); 227 | descinit_( descH, &n, &n, &nb, &nb, &i0, &i0, &ictxt, &mloc, &info ); 228 | if (verbose & myrank_mpi == 0) fprintf(stderr, "Desc Init ends %d\n", mloc); 229 | 230 | A = (double *)malloc(mloc*nloc*sizeof(double)) ; 231 | H = (double *)malloc(mloc*nloc*sizeof(double)) ; 232 | C = (double *)malloc(mloc*nloc*sizeof(double)) ; 233 | Acpy = (double *)malloc(mloc*nloc*sizeof(double)) ; 234 | D = (double *)malloc(n*sizeof(double)) ; 235 | 236 | 237 | /* Initialize the timing counters */ 238 | my_elapsed_polarqdwh = 0.0, elapsed_polarqdwh = 0.0, sumtime_polarqdwh = 0.0; 239 | max_time_polarqdwh = 0.0, min_time_polarqdwh = 1e20; 240 | 241 | /* Generate matrix by pdlatms */ 242 | { 243 | char *dist = "N"; /* NORMAL( 0, 1 ) ( 'N' for normal ) */ 244 | int iseed[4] = {1, 0, 0, 1}; 245 | char *sym = "P"; /* The generated matrix is symmetric, with 246 | eigenvalues (= singular values) specified by D, COND, 247 | MODE, and DMAX; they will not be negative. 248 | "N" not supported. */ 249 | //int mode = 4; /* sets D(i)=1 - (i-1)/(N-1)*(1 - 1/COND) */ 250 | //double cond = 1.0/eps; 251 | double dmax = 1.0; 252 | int kl = n; 253 | int ku = n; 254 | char *pack = "N"; /* no packing */ 255 | int order = n; 256 | int info; 257 | 258 | pdlasizesep_( descA, 259 | &iprepad, &ipostpad, &sizemqrleft, &sizemqrright, &sizeqrf, 260 | &lwork, 261 | &sizeqtq, &sizechk, &sizesyevx, &isizesyevx, &sizesubtst, 262 | &isizesubtst, &sizetst, &isizetst ); 263 | if (verbose & myrank_mpi == 0) fprintf(stderr, "Setting lwork done\n"); 264 | Wloc1 = (double *)calloc(lwork,sizeof(double)) ; 265 | pdlatms_(&n, &n, dist, 266 | iseed, sym, D, &mode, &cond, &dmax, 267 | &kl, &ku, pack, 268 | A, &i1, &i1, descA, &order, 269 | Wloc1, &lwork, &info); 270 | if (verbose & myrank_mpi == 0) fprintf(stderr, "MatGen done\n"); 271 | if (info != 0) { 272 | fprintf(stderr, "An error occured during matrix generation: %d\n", info ); 273 | return EXIT_FAILURE; 274 | } 275 | pdlacpy_( "All", &n, &n, 276 | A, &i1, &i1, descA, 277 | Acpy, &i1, &i1, descAcpy ); 278 | frobA = pdlange_ ( "f", &n, &n, A, &i1, &i1, descA, Wloc1); 279 | beta = 0.0; 280 | 281 | if (verbose & myrank_mpi == 0) fprintf(stderr, "Copy to Acpy done\n"); 282 | 283 | free( Wloc1 ); 284 | } 285 | 286 | if (myrank_mpi == 0) fprintf(stderr, "\n\n"); 287 | if (myrank_mpi == 0) fprintf(stderr, "/////////////////////////////////////////////////////////////////////////\n"); 288 | if (myrank_mpi == 0) fprintf(stderr, "/////////////////////////////////////////////////////////////////////////\n"); 289 | if (myrank_mpi == 0) fprintf(stderr, "/////////////////////////////////////////////////////////////////////////\n"); 290 | 291 | // QDWH 292 | //if ( qwmr || qwdc || qwel || polarqdwh) { 293 | jobu = lvec ? "V" : "N"; 294 | int lWork1, lWork2, lWi; 295 | Wloc1 = (double *)calloc(1,sizeof(double)) ; 296 | Wloc2 = (double *)calloc(1,sizeof(double)) ; 297 | lWork1 = -1; 298 | lWork2 = -1; 299 | 300 | pdgeqdwh( "H", n, n, 301 | A, i1, i1, descA, 302 | H, i1, i1, descH, 303 | //NULL, lWork, //debug 304 | Wloc1, lWork1, 305 | Wloc2, lWork2, 306 | &my_info_facto); 307 | 308 | lWork1 = Wloc1[0]; 309 | lWork2 = Wloc2[0]; 310 | 311 | 312 | //Wloc = (double *)malloc(lWork*n*sizeof(double)); 313 | Wloc1 = (double *)malloc((lWork1*nloc)*sizeof(double)); 314 | Wloc2 = (double *)malloc((lWork2*nloc)*sizeof(double)); 315 | 316 | //int MB3 = 3*n; 317 | //int mlocW3 = numroc_( &MB3, &nb, &myrow, &i0, &nprow ); 318 | //mlocW3 = ((mlocW3+(nb-1))/nb)*nb; 319 | //Wloc = (double *)malloc((mlocW3*nloc)*sizeof(double)); 320 | 321 | 322 | for (iter = 0; iter < niter; iter++) { 323 | pdlacpy_( "All", &n, &n, 324 | Acpy, &i1, &i1, descAcpy, 325 | A, &i1, &i1, descA ); 326 | flops = 0.0; 327 | if (verbose & myrank_mpi == 0) fprintf(stderr, "\nQDWH + ScaLAPACK EIG done\n"); 328 | if (verbose & myrank_mpi == 0) fprintf(stderr, "QDWH starts...\n"); 329 | 330 | /* 331 | * Find polar decomposition using QDWH. 332 | * C contains the positive-definite factor. 333 | * A contains the orthogonal polar factor. 334 | */ 335 | my_elapsed_polarqdwh = 0.0; 336 | my_elapsed_polarqdwh =- MPI_Wtime(); 337 | 338 | pdgeqdwh( "H", n, n, 339 | A, i1, i1, descA, 340 | H, i1, i1, descH, 341 | //NULL, lWork, //debug 342 | Wloc1, lWork1, 343 | Wloc2, lWork2, 344 | &my_info_facto); 345 | 346 | my_elapsed_polarqdwh += MPI_Wtime(); 347 | MPI_Allreduce( &my_elapsed_polarqdwh, &elapsed_polarqdwh, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD); 348 | sumtime_polarqdwh += elapsed_polarqdwh; 349 | if ( elapsed_polarqdwh >= max_time_polarqdwh ) { max_time_polarqdwh = elapsed_polarqdwh;} 350 | if ( elapsed_polarqdwh <= min_time_polarqdwh ) { min_time_polarqdwh = elapsed_polarqdwh;} 351 | MPI_Allreduce( &my_info_facto, &info_facto_qw, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD); 352 | 353 | MPI_Allreduce( &my_info_facto, &info_facto_qw, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD); 354 | 355 | if (verbose & myrank_mpi == 0) fprintf(stderr, "QDWH ends...\n"); 356 | if (verbose & check & myrank_mpi == 0) fprintf(stderr, "Testing QDWH starts...\n"); 357 | /* 358 | * Checking the polar factorization 359 | */ 360 | if(check ){ 361 | /* 362 | * checking orthogonality of Up 363 | */ 364 | alpha = 0.0; beta = 1.0; 365 | pdlaset_( "G", &n, &n, &alpha, &beta, C, &i1, &i1, descC); 366 | alpha = 1.0; beta = -1.0; 367 | pdgemm_( "T", "N", &n, &n, &n, 368 | &alpha, 369 | A, &i1, &i1, descA, 370 | A, &i1, &i1, descA, 371 | &beta, 372 | C, &i1, &i1, descC); 373 | orth_Uqw = pdlange_ ( "f", &n, &n, C, &i1, &i1, descC, Wloc1)/frobA; 374 | /* 375 | * checking the factorization |A-Up*H| 376 | */ 377 | pdlacpy_( "A", &n, &n, Acpy, &i1, &i1, descAcpy, C, &i1, &i1, descC ); 378 | alpha = 1.0; beta = -1.0; 379 | pdgemm_( "N", "N", &n, &n, &n, 380 | &alpha, 381 | A, &i1, &i1, descA, 382 | H, &i1, &i1, descH, 383 | &beta, 384 | C, &i1, &i1, descC); 385 | berr_UHqw = pdlange_ ( "f", &n, &n, C, &i1, &i1, descC, Wloc1)/frobA; 386 | } 387 | } 388 | if ( myrank_mpi == 0) { 389 | fprintf(stderr, "# QDWH \n"); 390 | fprintf(stderr, "#\n"); 391 | fprintf(stderr, "# \tN \tNB \tNP \tP \tQ \tGflop/s \tAvg-Time \tMax-Time \tMin-Time \tBerr_UpH \tOrth_Up \tinfo \n"); 392 | fprintf(stderr, " %6d \t%4d \t%4d \t%3d \t%3d \t%8.2f", n, nb, nprocs_mpi, nprow, npcol, flops/1e9/min_time_polarqdwh); 393 | fprintf(stderr, "\t%6.2f \t\t%6.2f \t\t%6.2f \t\t%2.4e \t%2.4e \t%d \n", sumtime_polarqdwh/niter, max_time_polarqdwh, min_time_polarqdwh, berr_UHqw, orth_Uqw,info_facto_qw); 394 | fprintf(stderr, "/////////////////////////////////////////////////////////////////////////\n"); 395 | } 396 | 397 | 398 | 399 | if (myrank_mpi == 0) fprintf(stderr, "/////////////////////////////////////////////////////////////////////////\n"); 400 | 401 | free( A ); 402 | free( Acpy ); 403 | free( C ); 404 | free( H ); 405 | free( D ); 406 | free(Wloc1); 407 | free(Wloc2); 408 | if (verbose & myrank_mpi == 0) fprintf(stderr, "Free matrices done\n"); 409 | } // End loop over range 410 | 411 | 412 | if (verbose & myrank_mpi == 0) fprintf(stderr, "Range loop ends\n"); 413 | 414 | blacs_gridexit_( &i0 ); 415 | MPI_Finalize(); 416 | if (verbose & myrank_mpi == 0) fprintf(stderr, "Program ends...\n"); 417 | return 0; 418 | } 419 | -------------------------------------------------------------------------------- /timing/timing_pdgezolopd.c: -------------------------------------------------------------------------------- 1 | /** 2 | * 3 | * Copyright (c) 2017, King Abdullah University of Science and Technology 4 | * All rights reserved. 5 | * 6 | **/ 7 | 8 | /** 9 | * 10 | * @file timing_pdgezolopd.c 11 | * 12 | * ZOLOPD is a high performance software framework for computing 13 | * the polar decomposition on distributed-memory manycore systems provided by KAUST 14 | * 15 | * @version 3.0.0 16 | * @author Dalal Sukkari 17 | * @author Hatem Ltaief 18 | * @date 2018-11-08 19 | * 20 | **/ 21 | 22 | #include 23 | #include "polar.h" 24 | 25 | 26 | /* Default values of parameters */ 27 | int nprow = 1; 28 | int npcol = 1; 29 | int lvec = 1; 30 | int rvec = 1; 31 | int n = 5120; 32 | int nb = 128; 33 | int mode = 4; 34 | double cond = 9.0072e+15; 35 | int optcond = 0; 36 | int start = 5120; 37 | int stop = 5120; 38 | int step = 1; 39 | int niter = 1; 40 | int check = 0; 41 | int verbose = 0; 42 | 43 | static inline double cWtime(void) 44 | { 45 | struct timeval tp; 46 | gettimeofday( &tp, NULL ); 47 | return tp.tv_sec + 1e-6 * tp.tv_usec; 48 | } 49 | 50 | void print_usage(void) 51 | { 52 | fprintf(stderr, 53 | "======= ZOLOPD timing using ScaLAPACK\n" 54 | " -p --nprow : Number of MPI process rows\n" 55 | " -q --npcol : Number of MPI process cols\n" 56 | " -jl --lvec : Compute left singular vectors\n" 57 | " -jr --rvec : Compute right singular vectors\n" 58 | " -n --N : Dimension of the matrix\n" 59 | " -b --nb : Block size\n" 60 | " -m --mode : [1:6] Mode from pdlatms used to generate the matrix\n" 61 | " -k --cond : Condition number used to generate the matrix\n" 62 | " -o --optcond : Estimate Condition number using QR\n" 63 | " -i --niter : Number of iterations\n" 64 | " -r --n_range : Range for matrix sizes Start:Stop:Step\n" 65 | " -c --check : Check the solution\n" 66 | " -v --verbose : Verbose\n" 67 | " -h --help : Print this help\n" ); 68 | } 69 | 70 | #define GETOPT_STRING "p:q:x:y:n:b:m:i:o:r:Q,S:s:w:e:c:f:t:v:h" 71 | 72 | static struct option long_options[] = 73 | { 74 | /* PaRSEC specific options */ 75 | {"nprow", required_argument, 0, 'p'}, 76 | {"npcol", required_argument, 0, 'q'}, 77 | {"jl", no_argument, 0, 'x'}, 78 | {"lvec", no_argument, 0, 'x'}, 79 | {"jr", no_argument, 0, 'y'}, 80 | {"rvec", no_argument, 0, 'y'}, 81 | {"N", required_argument, 0, 'n'}, 82 | {"n", required_argument, 0, 'n'}, 83 | {"nb", required_argument, 0, 'b'}, 84 | {"b", required_argument, 0, 'b'}, 85 | {"mode", required_argument, 0, 'm'}, 86 | {"m", required_argument, 0, 'm'}, 87 | {"cond", required_argument, 0, 'k'}, 88 | {"k", required_argument, 0, 'k'}, 89 | {"optcond", required_argument, 0, 'o'}, 90 | {"o", required_argument, 0, 'o'}, 91 | {"i", required_argument, 0, 'i'}, 92 | {"niter", required_argument, 0, 'i'}, 93 | {"r", required_argument, 0, 'r'}, 94 | {"n_range", required_argument, 0, 'r'}, 95 | {"check", no_argument, 0, 'c'}, 96 | {"verbose", no_argument, 0, 'v'}, 97 | {"help", no_argument, 0, 'h'}, 98 | {"h", no_argument, 0, 'h'}, 99 | {0, 0, 0, 0} 100 | }; 101 | 102 | static void parse_arguments(int argc, char** argv) 103 | { 104 | int opt = 0; 105 | int c; 106 | int myrank_mpi; 107 | 108 | MPI_Comm_rank(MPI_COMM_WORLD, &myrank_mpi); 109 | 110 | do { 111 | #if defined(HAVE_GETOPT_LONG) 112 | c = getopt_long_only(argc, argv, "", 113 | long_options, &opt); 114 | #else 115 | c = getopt(argc, argv, GETOPT_STRING); 116 | (void) opt; 117 | #endif /* defined(HAVE_GETOPT_LONG) */ 118 | 119 | switch(c) { 120 | case 'p': nprow = atoi(optarg); break; 121 | case 'q': npcol = atoi(optarg); break; 122 | case 'n': n = atoi(optarg); start = n; stop = n; step = 1; break; 123 | case 'b': nb = atoi(optarg); break; 124 | case 'm': mode = atoi(optarg); break; 125 | case 'k': cond = atof(optarg); break; 126 | case 'o': optcond = atof(optarg); break; 127 | case 'i': niter = atoi(optarg); break; 128 | case 'r': get_range( optarg, &start, &stop, &step ); break; 129 | case 'c': check = 1; break; 130 | case 'v': verbose = 1; break; 131 | case 'h': 132 | if (myrank_mpi == 0) print_usage(); MPI_Finalize(); exit(0); 133 | break; 134 | default: 135 | break; 136 | } 137 | } while(-1 != c); 138 | } 139 | 140 | int main(int argc, char **argv) { 141 | 142 | 143 | int myrank_mpi, nprocs_mpi; 144 | int ictxt, myrow, mycol; 145 | int mloc, nloc, mlocW; 146 | int mpi_comm_rows, mpi_comm_cols; 147 | int i, j, k, iter, size, info_facto_mr, info_facto_dc, info_facto_zo, info_facto_el, info_facto_sl, info, iseed; 148 | int my_info_facto; 149 | int i0 = 0, i1 = 1; 150 | int lwork, liwork, ldw; 151 | 152 | /* Allocation for the input/output matrices */ 153 | int descA[9], descH[9]; 154 | double *A=NULL, *H=NULL; 155 | 156 | /* Allocation to check the results */ 157 | int descAcpy[9], descC[9]; 158 | double *Acpy=NULL, *C=NULL; 159 | 160 | /* Allocation for pdlatsm */ 161 | double *Wloc1=NULL, *Wloc2=NULL, *D=NULL; 162 | 163 | double eps = LAPACKE_dlamch_work('e'); 164 | int iprepad, ipostpad, sizemqrleft, sizemqrright, sizeqrf, sizeqtq, 165 | sizechk, sizesyevx, isizesyevx, 166 | sizesubtst, isizesubtst, sizetst, 167 | isizetst; 168 | /**/ 169 | 170 | double flops, GFLOPS; 171 | 172 | double orth_Uzo, berr_UHzo; 173 | double frobA; 174 | 175 | double alpha, beta; 176 | char *jobu, *jobvt; 177 | 178 | double my_elapsed_polarzolopd = 0.0, elapsed_polarzolopd = 0.0, sumtime_polarzolopd = 0.0; 179 | double max_time_polarzolopd = 0.0, min_time_polarzolopd = 1e20; 180 | 181 | /**/ 182 | 183 | if (verbose & myrank_mpi == 0) fprintf(stderr, "Program starts... \n"); 184 | 185 | MPI_Init(&argc, &argv); 186 | MPI_Comm_rank(MPI_COMM_WORLD, &myrank_mpi); 187 | MPI_Comm_size(MPI_COMM_WORLD, &nprocs_mpi); 188 | 189 | if (verbose & myrank_mpi == 0) fprintf(stderr, "MPI Init done\n"); 190 | 191 | parse_arguments(argc, argv); 192 | 193 | if (verbose & myrank_mpi == 0) fprintf(stderr, "Checking arguments done\n"); 194 | 195 | Cblacs_get( -1, 0, &ictxt ); 196 | Cblacs_gridinit( &ictxt, "R", nprow, npcol ); 197 | Cblacs_gridinfo( ictxt, &nprow, &npcol, &myrow, &mycol ); 198 | 199 | if (verbose & myrank_mpi == 0) fprintf(stderr, "BLACS Init done\n"); 200 | 201 | if (myrank_mpi == 0) { 202 | fprintf(stderr, "# \n"); 203 | fprintf(stderr, "# NPROCS %d P %d Q %d\n", nprocs_mpi, nprow, npcol); 204 | fprintf(stderr, "# niter %d\n", niter); 205 | fprintf(stderr, "# n_range %d:%d:%d mode: %d cond: %2.4e \n", start, stop, step, mode, cond); 206 | fprintf(stderr, "# \n"); 207 | } 208 | 209 | if (verbose & myrank_mpi == 0) fprintf(stderr, "Range loop starts\n"); 210 | 211 | 212 | // Begin loop over range 213 | for (size = start; size <= stop; size += step) { 214 | while ( (int)((double)size / (double)nb) < ( max(nprow , npcol) )){ 215 | if (myrank_mpi == 0) fprintf(stderr, " Matrix size is small to be facrorized using this number of processors \n"); 216 | size += step; 217 | } 218 | n = size; ldw = 2*n; 219 | mloc = numroc_( &n, &nb, &myrow, &i0, &nprow ); 220 | nloc = numroc_( &n, &nb, &mycol, &i0, &npcol ); 221 | mlocW = numroc_( &ldw, &nb, &myrow, &i0, &nprow ); 222 | if (verbose & myrank_mpi == 0) fprintf(stderr, "Desc Init starts %d\n", mloc); 223 | 224 | descinit_( descA, &n, &n, &nb, &nb, &i0, &i0, &ictxt, &mloc, &info ); 225 | descinit_( descAcpy, &n, &n, &nb, &nb, &i0, &i0, &ictxt, &mloc, &info ); 226 | descinit_( descC, &n, &n, &nb, &nb, &i0, &i0, &ictxt, &mloc, &info ); 227 | descinit_( descH, &n, &n, &nb, &nb, &i0, &i0, &ictxt, &mloc, &info ); 228 | if (verbose & myrank_mpi == 0) fprintf(stderr, "Desc Init ends %d\n", mloc); 229 | 230 | A = (double *)malloc(mloc*nloc*sizeof(double)) ; 231 | H = (double *)malloc(mloc*nloc*sizeof(double)) ; 232 | C = (double *)malloc(mloc*nloc*sizeof(double)) ; 233 | Acpy = (double *)malloc(mloc*nloc*sizeof(double)) ; 234 | D = (double *)malloc(n*sizeof(double)) ; 235 | 236 | 237 | /* Initialize the timing counters */ 238 | my_elapsed_polarzolopd = 0.0, elapsed_polarzolopd = 0.0, sumtime_polarzolopd = 0.0; 239 | max_time_polarzolopd = 0.0, min_time_polarzolopd = 1e20; 240 | 241 | /* Generate matrix by pdlatms */ 242 | { 243 | char *dist = "N"; /* NORMAL( 0, 1 ) ( 'N' for normal ) */ 244 | int iseed[4] = {1, 0, 0, 1}; 245 | char *sym = "P"; /* The generated matrix is symmetric, with 246 | eigenvalues (= singular values) specified by D, COND, 247 | MODE, and DMAX; they will not be negative. 248 | "N" not supported. */ 249 | //int mode = 4; /* sets D(i)=1 - (i-1)/(N-1)*(1 - 1/COND) */ 250 | //double cond = 1.0/eps; 251 | double dmax = 1.0; 252 | int kl = n; 253 | int ku = n; 254 | char *pack = "N"; /* no packing */ 255 | int order = n; 256 | int info; 257 | 258 | pdlasizesep_( descA, 259 | &iprepad, &ipostpad, &sizemqrleft, &sizemqrright, &sizeqrf, 260 | &lwork, 261 | &sizeqtq, &sizechk, &sizesyevx, &isizesyevx, &sizesubtst, 262 | &isizesubtst, &sizetst, &isizetst ); 263 | if (verbose & myrank_mpi == 0) fprintf(stderr, "Setting lwork done\n"); 264 | Wloc1 = (double *)calloc(lwork,sizeof(double)) ; 265 | pdlatms_(&n, &n, dist, 266 | iseed, sym, D, &mode, &cond, &dmax, 267 | &kl, &ku, pack, 268 | A, &i1, &i1, descA, &order, 269 | Wloc1, &lwork, &info); 270 | if (verbose & myrank_mpi == 0) fprintf(stderr, "MatGen done\n"); 271 | if (info != 0) { 272 | fprintf(stderr, "An error occured during matrix generation: %d\n", info ); 273 | return EXIT_FAILURE; 274 | } 275 | pdlacpy_( "All", &n, &n, 276 | A, &i1, &i1, descA, 277 | Acpy, &i1, &i1, descAcpy ); 278 | frobA = pdlange_ ( "f", &n, &n, A, &i1, &i1, descA, Wloc1); 279 | beta = 0.0; 280 | 281 | if (verbose & myrank_mpi == 0) fprintf(stderr, "Copy to Acpy done\n"); 282 | 283 | free( Wloc1 ); 284 | } 285 | 286 | if (myrank_mpi == 0) fprintf(stderr, "\n\n"); 287 | if (myrank_mpi == 0) fprintf(stderr, "/////////////////////////////////////////////////////////////////////////\n"); 288 | if (myrank_mpi == 0) fprintf(stderr, "/////////////////////////////////////////////////////////////////////////\n"); 289 | if (myrank_mpi == 0) fprintf(stderr, "/////////////////////////////////////////////////////////////////////////\n"); 290 | 291 | // ZOLOPD 292 | //if ( zomr || zodc || zoel || polarzolopd) { 293 | jobu = lvec ? "V" : "N"; 294 | int lWork1, lWork2, lWi; 295 | Wloc1 = (double *)calloc(1,sizeof(double)) ; 296 | Wloc2 = (double *)calloc(1,sizeof(double)) ; 297 | lWork1 = -1; 298 | lWork2 = -1; 299 | 300 | pdgezolopd( "H", n, n, 301 | A, i1, i1, descA, 302 | H, i1, i1, descH, 303 | //NULL, lWork, //debug 304 | Wloc1, lWork1, 305 | Wloc2, lWork2, 306 | &my_info_facto); 307 | 308 | lWork1 = Wloc1[0]; 309 | lWork2 = Wloc2[0]; 310 | 311 | 312 | //Wloc = (double *)malloc(lWork*n*sizeof(double)); 313 | Wloc1 = (double *)malloc((lWork1*nloc)*sizeof(double)); 314 | Wloc2 = (double *)malloc((lWork2*nloc)*sizeof(double)); 315 | 316 | //int MB3 = 3*n; 317 | //int mlocW3 = numroc_( &MB3, &nb, &myrow, &i0, &nprow ); 318 | //mlocW3 = ((mlocW3+(nb-1))/nb)*nb; 319 | //Wloc = (double *)malloc((mlocW3*nloc)*sizeof(double)); 320 | 321 | 322 | for (iter = 0; iter < niter; iter++) { 323 | pdlacpy_( "All", &n, &n, 324 | Acpy, &i1, &i1, descAcpy, 325 | A, &i1, &i1, descA ); 326 | flops = 0.0; 327 | if (verbose & myrank_mpi == 0) fprintf(stderr, "\nZOLOPD + ScaLAPACK EIG done\n"); 328 | if (verbose & myrank_mpi == 0) fprintf(stderr, "ZOLOPD starts...\n"); 329 | 330 | /* 331 | * Find polar decomposition using ZOLOPD. 332 | * C contains the positive-definite factor. 333 | * A contains the orthogonal polar factor. 334 | */ 335 | my_elapsed_polarzolopd = 0.0; 336 | my_elapsed_polarzolopd =- MPI_Wtime(); 337 | 338 | pdgezolopd( "H", n, n, 339 | A, i1, i1, descA, 340 | H, i1, i1, descH, 341 | //NULL, lWork, //debug 342 | Wloc1, lWork1, 343 | Wloc2, lWork2, 344 | &my_info_facto); 345 | 346 | my_elapsed_polarzolopd += MPI_Wtime(); 347 | MPI_Allreduce( &my_elapsed_polarzolopd, &elapsed_polarzolopd, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD); 348 | sumtime_polarzolopd += elapsed_polarzolopd; 349 | if ( elapsed_polarzolopd >= max_time_polarzolopd ) { max_time_polarzolopd = elapsed_polarzolopd;} 350 | if ( elapsed_polarzolopd <= min_time_polarzolopd ) { min_time_polarzolopd = elapsed_polarzolopd;} 351 | MPI_Allreduce( &my_info_facto, &info_facto_zo, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD); 352 | 353 | MPI_Allreduce( &my_info_facto, &info_facto_zo, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD); 354 | 355 | if (verbose & myrank_mpi == 0) fprintf(stderr, "ZOLOPD ends...\n"); 356 | if (verbose & check & myrank_mpi == 0) fprintf(stderr, "Testing ZOLOPD starts...\n"); 357 | /* 358 | * Checking the polar factorization 359 | */ 360 | if(check ){ 361 | /* 362 | * checking orthogonality of Up 363 | */ 364 | alpha = 0.0; beta = 1.0; 365 | pdlaset_( "G", &n, &n, &alpha, &beta, C, &i1, &i1, descC); 366 | alpha = 1.0; beta = -1.0; 367 | pdgemm_( "T", "N", &n, &n, &n, 368 | &alpha, 369 | A, &i1, &i1, descA, 370 | A, &i1, &i1, descA, 371 | &beta, 372 | C, &i1, &i1, descC); 373 | orth_Uzo = pdlange_ ( "f", &n, &n, C, &i1, &i1, descC, Wloc1)/frobA; 374 | /* 375 | * checking the factorization |A-Up*H| 376 | */ 377 | pdlacpy_( "A", &n, &n, Acpy, &i1, &i1, descAcpy, C, &i1, &i1, descC ); 378 | alpha = 1.0; beta = -1.0; 379 | pdgemm_( "N", "N", &n, &n, &n, 380 | &alpha, 381 | A, &i1, &i1, descA, 382 | H, &i1, &i1, descH, 383 | &beta, 384 | C, &i1, &i1, descC); 385 | berr_UHzo = pdlange_ ( "f", &n, &n, C, &i1, &i1, descC, Wloc1)/frobA; 386 | } 387 | } 388 | if ( myrank_mpi == 0) { 389 | fprintf(stderr, "# ZOLOPD \n"); 390 | fprintf(stderr, "#\n"); 391 | fprintf(stderr, "# \tN \tNB \tNP \tP \tQ \tGflop/s \tAvg-Time \tMax-Time \tMin-Time \tBerr_UpH \tOrth_Up \tinfo \n"); 392 | fprintf(stderr, " %6d \t%4d \t%4d \t%3d \t%3d \t%8.2f", n, nb, nprocs_mpi, nprow, npcol, flops/1e9/min_time_polarzolopd); 393 | fprintf(stderr, "\t%6.2f \t\t%6.2f \t\t%6.2f \t\t%2.4e \t%2.4e \t%d \n", sumtime_polarzolopd/niter, max_time_polarzolopd, min_time_polarzolopd, berr_UHzo, orth_Uzo,info_facto_zo); 394 | fprintf(stderr, "/////////////////////////////////////////////////////////////////////////\n"); 395 | } 396 | 397 | 398 | 399 | if (myrank_mpi == 0) fprintf(stderr, "/////////////////////////////////////////////////////////////////////////\n"); 400 | 401 | free( A ); 402 | free( Acpy ); 403 | free( C ); 404 | free( H ); 405 | free( D ); 406 | free(Wloc1); 407 | free(Wloc2); 408 | if (verbose & myrank_mpi == 0) fprintf(stderr, "Free matrices done\n"); 409 | } // End loop over range 410 | 411 | 412 | if (verbose & myrank_mpi == 0) fprintf(stderr, "Range loop ends\n"); 413 | 414 | blacs_gridexit_( &i0 ); 415 | MPI_Finalize(); 416 | if (verbose & myrank_mpi == 0) fprintf(stderr, "Program ends...\n"); 417 | return 0; 418 | } 419 | -------------------------------------------------------------------------------- /include/flops.h: -------------------------------------------------------------------------------- 1 | /** 2 | * 3 | * Copyright (c) 2017, King Abdullah University of Science and Technology 4 | * All rights reserved. 5 | * 6 | **/ 7 | 8 | /** 9 | * 10 | * @file flops.h 11 | * 12 | * File provided by Univ. of Tennessee, 13 | * 14 | * @version 3.0.0 15 | * @author Mathieu Faverge 16 | * @date 2018-11-08 17 | * 18 | **/ 19 | /* 20 | * This file provide the flops formula for all Level 3 BLAS and some 21 | * Lapack routines. Each macro uses the same size parameters as the 22 | * function associated and provide one formula for additions and one 23 | * for multiplications. Example to use these macros: 24 | * 25 | * FLOPS_ZGEMM( m, n, k ) 26 | * 27 | * All the formula are reported in the LAPACK Lawn 41: 28 | * http://www.netlib.org/lapack/lawns/lawn41.ps 29 | */ 30 | #ifndef _FLOPS_H_ 31 | #define _FLOPS_H_ 32 | 33 | /************************************************************************ 34 | * Generic formula coming from LAWN 41 35 | ***********************************************************************/ 36 | 37 | /* 38 | * Level 2 BLAS 39 | */ 40 | #define FMULS_GEMV(__m, __n) ((double)(__m) * (double)(__n) + 2. * (double)(__m)) 41 | #define FADDS_GEMV(__m, __n) ((double)(__m) * (double)(__n) ) 42 | 43 | #define FMULS_SYMV(__n) FMULS_GEMV( (__n), (__n) ) 44 | #define FADDS_SYMV(__n) FADDS_GEMV( (__n), (__n) ) 45 | #define FMULS_HEMV FMULS_SYMV 46 | #define FADDS_HEMV FADDS_SYMV 47 | 48 | /* 49 | * Level 3 BLAS 50 | */ 51 | #define FMULS_GEMM(__m, __n, __k) ((double)(__m) * (double)(__n) * (double)(__k)) 52 | #define FADDS_GEMM(__m, __n, __k) ((double)(__m) * (double)(__n) * (double)(__k)) 53 | 54 | #define FMULS_SYMM(__side, __m, __n) ( ( (__side) == Left ) ? FMULS_GEMM((__m), (__m), (__n)) : FMULS_GEMM((__m), (__n), (__n)) ) 55 | #define FADDS_SYMM(__side, __m, __n) ( ( (__side) == Left ) ? FADDS_GEMM((__m), (__m), (__n)) : FADDS_GEMM((__m), (__n), (__n)) ) 56 | #define FMULS_HEMM FMULS_SYMM 57 | #define FADDS_HEMM FADDS_SYMM 58 | 59 | #define FMULS_SYRK(__k, __n) (0.5 * (double)(__k) * (double)(__n) * ((double)(__n)+1.)) 60 | #define FADDS_SYRK(__k, __n) (0.5 * (double)(__k) * (double)(__n) * ((double)(__n)+1.)) 61 | #define FMULS_HERK FMULS_SYRK 62 | #define FADDS_HERK FADDS_SYRK 63 | 64 | #define FMULS_SYR2K(__k, __n) ((double)(__k) * (double)(__n) * (double)(__n) ) 65 | #define FADDS_SYR2K(__k, __n) ((double)(__k) * (double)(__n) * (double)(__n) + (double)(__n)) 66 | #define FMULS_HER2K FMULS_SYR2K 67 | #define FADDS_HER2K FADDS_SYR2K 68 | 69 | #define FMULS_TRMM_2(__m, __n) (0.5 * (double)(__n) * (double)(__m) * ((double)(__m)+1.)) 70 | #define FADDS_TRMM_2(__m, __n) (0.5 * (double)(__n) * (double)(__m) * ((double)(__m)-1.)) 71 | 72 | 73 | #define FMULS_TRMM(__side, __m, __n) ( ( (__side) == 'L' ) ? FMULS_TRMM_2((__m), (__n)) : FMULS_TRMM_2((__n), (__m)) ) 74 | #define FADDS_TRMM(__side, __m, __n) ( ( (__side) == 'L' ) ? FADDS_TRMM_2((__m), (__n)) : FADDS_TRMM_2((__n), (__m)) ) 75 | 76 | #define FMULS_TRSM FMULS_TRMM 77 | #define FADDS_TRSM FMULS_TRMM 78 | 79 | /* 80 | * Lapack 81 | */ 82 | #define FMULS_GETRF(__m, __n) ( ((__m) < (__n)) ? (0.5 * (double)(__m) * ((double)(__m) * ((double)(__n) - (1./3.) * (__m) - 1. ) + (double)(__n)) + (2. / 3.) * (__m)) \ 83 | : (0.5 * (double)(__n) * ((double)(__n) * ((double)(__m) - (1./3.) * (__n) - 1. ) + (double)(__m)) + (2. / 3.) * (__n)) ) 84 | #define FADDS_GETRF(__m, __n) ( ((__m) < (__n)) ? (0.5 * (double)(__m) * ((double)(__m) * ((double)(__n) - (1./3.) * (__m) ) - (double)(__n)) + (1. / 6.) * (__m)) \ 85 | : (0.5 * (double)(__n) * ((double)(__n) * ((double)(__m) - (1./3.) * (__n) ) - (double)(__m)) + (1. / 6.) * (__n)) ) 86 | 87 | #define FMULS_GETRI(__n) ( (double)(__n) * ((5. / 6.) + (double)(__n) * ((2. / 3.) * (double)(__n) + 0.5)) ) 88 | #define FADDS_GETRI(__n) ( (double)(__n) * ((5. / 6.) + (double)(__n) * ((2. / 3.) * (double)(__n) - 1.5)) ) 89 | 90 | #define FMULS_GETRS(__n, __nrhs) ((double)(__nrhs) * (double)(__n) * (double)(__n) ) 91 | #define FADDS_GETRS(__n, __nrhs) ((double)(__nrhs) * (double)(__n) * ((double)(__n) - 1. )) 92 | 93 | #define FMULS_POTRF(__n) ((double)(__n) * (((1. / 6.) * (double)(__n) + 0.5) * (double)(__n) + (1. / 3.))) 94 | #define FADDS_POTRF(__n) ((double)(__n) * (((1. / 6.) * (double)(__n) ) * (double)(__n) - (1. / 6.))) 95 | 96 | #define FMULS_POTRI(__n) ( (double)(__n) * ((2. / 3.) + (double)(__n) * ((1. / 3.) * (double)(__n) + 1. )) ) 97 | #define FADDS_POTRI(__n) ( (double)(__n) * ((1. / 6.) + (double)(__n) * ((1. / 3.) * (double)(__n) - 0.5)) ) 98 | 99 | #define FMULS_POTRS(__n, __nrhs) ((double)(__nrhs) * (double)(__n) * ((double)(__n) + 1. )) 100 | #define FADDS_POTRS(__n, __nrhs) ((double)(__nrhs) * (double)(__n) * ((double)(__n) - 1. )) 101 | 102 | //SPBTRF 103 | //SPBTRS 104 | //SSYTRF 105 | //SSYTRI 106 | //SSYTRS 107 | 108 | #define FMULS_GEQRF(__m, __n) (((__m) > (__n)) ? ((double)(__n) * ((double)(__n) * ( 0.5-(1./3.) * (double)(__n) + (double)(__m)) + (double)(__m) + 23. / 6.)) \ 109 | : ((double)(__m) * ((double)(__m) * ( -0.5-(1./3.) * (double)(__m) + (double)(__n)) + 2.*(double)(__n) + 23. / 6.)) ) 110 | #define FADDS_GEQRF(__m, __n) (((__m) > (__n)) ? ((double)(__n) * ((double)(__n) * ( 0.5-(1./3.) * (double)(__n) + (double)(__m)) + 5. / 6.)) \ 111 | : ((double)(__m) * ((double)(__m) * ( -0.5-(1./3.) * (double)(__m) + (double)(__n)) + (double)(__n) + 5. / 6.)) ) 112 | 113 | #define FMULS_GEQLF(__m, __n) FMULS_GEQRF(__m, __n) 114 | #define FADDS_GEQLF(__m, __n) FADDS_GEQRF(__m, __n) 115 | 116 | #define FMULS_GERQF(__m, __n) (((__m) > (__n)) ? ((double)(__n) * ((double)(__n) * ( 0.5-(1./3.) * (double)(__n) + (double)(__m)) + (double)(__m) + 29. / 6.)) \ 117 | : ((double)(__m) * ((double)(__m) * ( -0.5-(1./3.) * (double)(__m) + (double)(__n)) + 2.*(double)(__n) + 29. / 6.)) ) 118 | #define FADDS_GERQF(__m, __n) (((__m) > (__n)) ? ((double)(__n) * ((double)(__n) * ( -0.5-(1./3.) * (double)(__n) + (double)(__m)) + (double)(__m) + 5. / 6.)) \ 119 | : ((double)(__m) * ((double)(__m) * ( 0.5-(1./3.) * (double)(__m) + (double)(__n)) + + 5. / 6.)) ) 120 | 121 | #define FMULS_GELQF(__m, __n) FMULS_GERQF(__m, __n) 122 | #define FADDS_GELQF(__m, __n) FADDS_GERQF(__m, __n) 123 | 124 | #define FMULS_UNGQR(__m, __n, __k) ((double)(__k) * (2.* (double)(__m) * (double)(__n) + 2. * (double)(__n) - 5./3. + (double)(__k) * ( 2./3. * (double)(__k) - ((double)(__m) + (double)(__n)) - 1.))) 125 | #define FADDS_UNGQR(__m, __n, __k) ((double)(__k) * (2.* (double)(__m) * (double)(__n) + (double)(__n) - (double)(__m) + 1./3. + (double)(__k) * ( 2./3. * (double)(__k) - ((double)(__m) + (double)(__n)) ))) 126 | #define FMULS_UNGQL FMULS_UNGQR 127 | #define FMULS_ORGQR FMULS_UNGQR 128 | #define FMULS_ORGQL FMULS_UNGQR 129 | #define FADDS_UNGQL FADDS_UNGQR 130 | #define FADDS_ORGQR FADDS_UNGQR 131 | #define FADDS_ORGQL FADDS_UNGQR 132 | 133 | #define FMULS_UNGRQ(__m, __n, __k) ((double)(__k) * (2.* (double)(__m) * (double)(__n) + (double)(__m) + (double)(__n) - 2./3. + (double)(__k) * ( 2./3. * (double)(__k) - ((double)(__m) + (double)(__n)) - 1.))) 134 | #define FADDS_UNGRQ(__m, __n, __k) ((double)(__k) * (2.* (double)(__m) * (double)(__n) + (double)(__m) - (double)(__n) + 1./3. + (double)(__k) * ( 2./3. * (double)(__k) - ((double)(__m) + (double)(__n)) ))) 135 | #define FMULS_UNGLQ FMULS_UNGRQ 136 | #define FMULS_ORGRQ FMULS_UNGRQ 137 | #define FMULS_ORGLQ FMULS_UNGRQ 138 | #define FADDS_UNGLQ FADDS_UNGRQ 139 | #define FADDS_ORGRQ FADDS_UNGRQ 140 | #define FADDS_ORGLQ FADDS_UNGRQ 141 | 142 | #define FMULS_GEQRS(__m, __n, __nrhs) ((double)(__nrhs) * ((double)(__n) * ( 2.* (double)(__m) - 0.5 * (double)(__n) + 2.5))) 143 | #define FADDS_GEQRS(__m, __n, __nrhs) ((double)(__nrhs) * ((double)(__n) * ( 2.* (double)(__m) - 0.5 * (double)(__n) + 0.5))) 144 | 145 | //UNMQR, UNMLQ, UNMQL, UNMRQ (Left) 146 | //UNMQR, UNMLQ, UNMQL, UNMRQ (Right) 147 | 148 | #define FMULS_TRTRI(__n) ((double)(__n) * ((double)(__n) * ( 1./6. * (double)(__n) + 0.5 ) + 1./3.)) 149 | #define FADDS_TRTRI(__n) ((double)(__n) * ((double)(__n) * ( 1./6. * (double)(__n) - 0.5 ) + 1./3.)) 150 | 151 | #define FMULS_GEHRD(__n) ( (double)(__n) * ((double)(__n) * (5./3. *(double)(__n) + 0.5) - 7./6.) - 13. ) 152 | #define FADDS_GEHRD(__n) ( (double)(__n) * ((double)(__n) * (5./3. *(double)(__n) - 1. ) - 2./3.) - 8. ) 153 | 154 | #define FMULS_SYTRD(__n) ( (double)(__n) * ( (double)(__n) * ( 2./3. * (double)(__n) + 2.5 ) - 1./6. ) - 15.) 155 | #define FADDS_SYTRD(__n) ( (double)(__n) * ( (double)(__n) * ( 2./3. * (double)(__n) + 1. ) - 8./3. ) - 4.) 156 | #define FMULS_HETRD FMULS_SYTRD 157 | #define FADDS_HETRD FADDS_SYTRD 158 | 159 | #define FMULS_GEBRD(__m, __n) ( ((__m) >= (__n)) ? ((double)(__n) * ((double)(__n) * (2. * (double)(__m) - 2./3. * (double)(__n) + 2. ) + 20./3.)) \ 160 | : ((double)(__m) * ((double)(__m) * (2. * (double)(__n) - 2./3. * (double)(__m) + 2. ) + 20./3.)) ) 161 | #define FADDS_GEBRD(__m, __n) ( ((__m) >= (__n)) ? ((double)(__n) * ((double)(__n) * (2. * (double)(__m) - 2./3. * (double)(__n) + 1. ) - (double)(__m) + 5./3.)) \ 162 | : ((double)(__m) * ((double)(__m) * (2. * (double)(__n) - 2./3. * (double)(__m) + 1. ) - (double)(__n) + 5./3.)) ) 163 | 164 | 165 | /******************************************************************************* 166 | * Users functions 167 | ******************************************************************************/ 168 | 169 | /* 170 | * Level 2 BLAS 171 | */ 172 | #define FLOPS_ZGEMV(__m, __n) (6. * FMULS_GEMV((__m), (__n)) + 2.0 * FADDS_GEMV((__m), (__n)) ) 173 | #define FLOPS_CGEMV(__m, __n) (6. * FMULS_GEMV((__m), (__n)) + 2.0 * FADDS_GEMV((__m), (__n)) ) 174 | #define FLOPS_DGEMV(__m, __n) ( FMULS_GEMV((__m), (__n)) + FADDS_GEMV((__m), (__n)) ) 175 | #define FLOPS_SGEMV(__m, __n) ( FMULS_GEMV((__m), (__n)) + FADDS_GEMV((__m), (__n)) ) 176 | 177 | #define FLOPS_ZHEMV(__n) (6. * FMULS_HEMV((__n)) + 2.0 * FADDS_HEMV((__n)) ) 178 | #define FLOPS_CHEMV(__n) (6. * FMULS_HEMV((__n)) + 2.0 * FADDS_HEMV((__n)) ) 179 | 180 | #define FLOPS_ZSYMV(__n) (6. * FMULS_SYMV((__n)) + 2.0 * FADDS_SYMV((__n)) ) 181 | #define FLOPS_CSYMV(__n) (6. * FMULS_SYMV((__n)) + 2.0 * FADDS_SYMV((__n)) ) 182 | #define FLOPS_DSYMV(__n) ( FMULS_SYMV((__n)) + FADDS_SYMV((__n)) ) 183 | #define FLOPS_SSYMV(__n) ( FMULS_SYMV((__n)) + FADDS_SYMV((__n)) ) 184 | 185 | /* 186 | * Level 3 BLAS 187 | */ 188 | #define FLOPS_ZGEMM(__m, __n, __k) (6. * FMULS_GEMM((__m), (__n), (__k)) + 2.0 * FADDS_GEMM((__m), (__n), (__k)) ) 189 | #define FLOPS_CGEMM(__m, __n, __k) (6. * FMULS_GEMM((__m), (__n), (__k)) + 2.0 * FADDS_GEMM((__m), (__n), (__k)) ) 190 | #define FLOPS_DGEMM(__m, __n, __k) ( FMULS_GEMM((__m), (__n), (__k)) + FADDS_GEMM((__m), (__n), (__k)) ) 191 | #define FLOPS_SGEMM(__m, __n, __k) ( FMULS_GEMM((__m), (__n), (__k)) + FADDS_GEMM((__m), (__n), (__k)) ) 192 | 193 | #define FLOPS_ZHEMM(__side, __m, __n) (6. * FMULS_HEMM(__side, (__m), (__n)) + 2.0 * FADDS_HEMM(__side, (__m), (__n)) ) 194 | #define FLOPS_CHEMM(__side, __m, __n) (6. * FMULS_HEMM(__side, (__m), (__n)) + 2.0 * FADDS_HEMM(__side, (__m), (__n)) ) 195 | 196 | #define FLOPS_ZSYMM(__side, __m, __n) (6. * FMULS_SYMM(__side, (__m), (__n)) + 2.0 * FADDS_SYMM(__side, (__m), (__n)) ) 197 | #define FLOPS_CSYMM(__side, __m, __n) (6. * FMULS_SYMM(__side, (__m), (__n)) + 2.0 * FADDS_SYMM(__side, (__m), (__n)) ) 198 | #define FLOPS_DSYMM(__side, __m, __n) ( FMULS_SYMM(__side, (__m), (__n)) + FADDS_SYMM(__side, (__m), (__n)) ) 199 | #define FLOPS_SSYMM(__side, __m, __n) ( FMULS_SYMM(__side, (__m), (__n)) + FADDS_SYMM(__side, (__m), (__n)) ) 200 | 201 | #define FLOPS_ZHERK(__k, __n) (6. * FMULS_HERK((__k), (__n)) + 2.0 * FADDS_HERK((__k), (__n)) ) 202 | #define FLOPS_CHERK(__k, __n) (6. * FMULS_HERK((__k), (__n)) + 2.0 * FADDS_HERK((__k), (__n)) ) 203 | 204 | #define FLOPS_ZSYRK(__k, __n) (6. * FMULS_SYRK((__k), (__n)) + 2.0 * FADDS_SYRK((__k), (__n)) ) 205 | #define FLOPS_CSYRK(__k, __n) (6. * FMULS_SYRK((__k), (__n)) + 2.0 * FADDS_SYRK((__k), (__n)) ) 206 | #define FLOPS_DSYRK(__k, __n) ( FMULS_SYRK((__k), (__n)) + FADDS_SYRK((__k), (__n)) ) 207 | #define FLOPS_SSYRK(__k, __n) ( FMULS_SYRK((__k), (__n)) + FADDS_SYRK((__k), (__n)) ) 208 | 209 | #define FLOPS_ZHER2K(__k, __n) (6. * FMULS_HER2K((__k), (__n)) + 2.0 * FADDS_HER2K((__k), (__n)) ) 210 | #define FLOPS_CHER2K(__k, __n) (6. * FMULS_HER2K((__k), (__n)) + 2.0 * FADDS_HER2K((__k), (__n)) ) 211 | 212 | #define FLOPS_ZSYR2K(__k, __n) (6. * FMULS_SYR2K((__k), (__n)) + 2.0 * FADDS_SYR2K((__k), (__n)) ) 213 | #define FLOPS_CSYR2K(__k, __n) (6. * FMULS_SYR2K((__k), (__n)) + 2.0 * FADDS_SYR2K((__k), (__n)) ) 214 | #define FLOPS_DSYR2K(__k, __n) ( FMULS_SYR2K((__k), (__n)) + FADDS_SYR2K((__k), (__n)) ) 215 | #define FLOPS_SSYR2K(__k, __n) ( FMULS_SYR2K((__k), (__n)) + FADDS_SYR2K((__k), (__n)) ) 216 | 217 | #define FLOPS_ZTRMM(__side, __m, __n) (6. * FMULS_TRMM(__side, (__m), (__n)) + 2.0 * FADDS_TRMM(__side, (__m), (__n)) ) 218 | #define FLOPS_CTRMM(__side, __m, __n) (6. * FMULS_TRMM(__side, (__m), (__n)) + 2.0 * FADDS_TRMM(__side, (__m), (__n)) ) 219 | #define FLOPS_DTRMM(__side, __m, __n) ( FMULS_TRMM(__side, (__m), (__n)) + FADDS_TRMM(__side, (__m), (__n)) ) 220 | #define FLOPS_STRMM(__side, __m, __n) ( FMULS_TRMM(__side, (__m), (__n)) + FADDS_TRMM(__side, (__m), (__n)) ) 221 | 222 | #define FLOPS_ZTRSM(__side, __m, __n) (6. * FMULS_TRSM(__side, (__m), (__n)) + 2.0 * FADDS_TRSM(__side, (__m), (__n)) ) 223 | #define FLOPS_CTRSM(__side, __m, __n) (6. * FMULS_TRSM(__side, (__m), (__n)) + 2.0 * FADDS_TRSM(__side, (__m), (__n)) ) 224 | #define FLOPS_DTRSM(__side, __m, __n) ( FMULS_TRSM(__side, (__m), (__n)) + FADDS_TRSM(__side, (__m), (__n)) ) 225 | #define FLOPS_STRSM(__side, __m, __n) ( FMULS_TRSM(__side, (__m), (__n)) + FADDS_TRSM(__side, (__m), (__n)) ) 226 | 227 | /* 228 | * Lapack 229 | */ 230 | #define FLOPS_ZGETRF(__m, __n) (6. * FMULS_GETRF((__m), (__n)) + 2.0 * FADDS_GETRF((__m), (__n)) ) 231 | #define FLOPS_CGETRF(__m, __n) (6. * FMULS_GETRF((__m), (__n)) + 2.0 * FADDS_GETRF((__m), (__n)) ) 232 | #define FLOPS_DGETRF(__m, __n) ( FMULS_GETRF((__m), (__n)) + FADDS_GETRF((__m), (__n)) ) 233 | #define FLOPS_SGETRF(__m, __n) ( FMULS_GETRF((__m), (__n)) + FADDS_GETRF((__m), (__n)) ) 234 | 235 | #define FLOPS_ZGETRI(__n) (6. * FMULS_GETRI((__n)) + 2.0 * FADDS_GETRI((__n)) ) 236 | #define FLOPS_CGETRI(__n) (6. * FMULS_GETRI((__n)) + 2.0 * FADDS_GETRI((__n)) ) 237 | #define FLOPS_DGETRI(__n) ( FMULS_GETRI((__n)) + FADDS_GETRI((__n)) ) 238 | #define FLOPS_SGETRI(__n) ( FMULS_GETRI((__n)) + FADDS_GETRI((__n)) ) 239 | 240 | #define FLOPS_ZGETRS(__n, __nrhs) (6. * FMULS_GETRS((__n), (__nrhs)) + 2.0 * FADDS_GETRS((__n), (__nrhs)) ) 241 | #define FLOPS_CGETRS(__n, __nrhs) (6. * FMULS_GETRS((__n), (__nrhs)) + 2.0 * FADDS_GETRS((__n), (__nrhs)) ) 242 | #define FLOPS_DGETRS(__n, __nrhs) ( FMULS_GETRS((__n), (__nrhs)) + FADDS_GETRS((__n), (__nrhs)) ) 243 | #define FLOPS_SGETRS(__n, __nrhs) ( FMULS_GETRS((__n), (__nrhs)) + FADDS_GETRS((__n), (__nrhs)) ) 244 | 245 | #define FLOPS_ZPOTRF(__n) (6. * FMULS_POTRF((__n)) + 2.0 * FADDS_POTRF((__n)) ) 246 | #define FLOPS_CPOTRF(__n) (6. * FMULS_POTRF((__n)) + 2.0 * FADDS_POTRF((__n)) ) 247 | #define FLOPS_DPOTRF(__n) ( FMULS_POTRF((__n)) + FADDS_POTRF((__n)) ) 248 | #define FLOPS_SPOTRF(__n) ( FMULS_POTRF((__n)) + FADDS_POTRF((__n)) ) 249 | 250 | #define FLOPS_ZPOTRI(__n) (6. * FMULS_POTRI((__n)) + 2.0 * FADDS_POTRI((__n)) ) 251 | #define FLOPS_CPOTRI(__n) (6. * FMULS_POTRI((__n)) + 2.0 * FADDS_POTRI((__n)) ) 252 | #define FLOPS_DPOTRI(__n) ( FMULS_POTRI((__n)) + FADDS_POTRI((__n)) ) 253 | #define FLOPS_SPOTRI(__n) ( FMULS_POTRI((__n)) + FADDS_POTRI((__n)) ) 254 | 255 | #define FLOPS_ZPOTRS(__n, __nrhs) (6. * FMULS_POTRS((__n), (__nrhs)) + 2.0 * FADDS_POTRS((__n), (__nrhs)) ) 256 | #define FLOPS_CPOTRS(__n, __nrhs) (6. * FMULS_POTRS((__n), (__nrhs)) + 2.0 * FADDS_POTRS((__n), (__nrhs)) ) 257 | #define FLOPS_DPOTRS(__n, __nrhs) ( FMULS_POTRS((__n), (__nrhs)) + FADDS_POTRS((__n), (__nrhs)) ) 258 | #define FLOPS_SPOTRS(__n, __nrhs) ( FMULS_POTRS((__n), (__nrhs)) + FADDS_POTRS((__n), (__nrhs)) ) 259 | 260 | #define FLOPS_ZGEQRF(__m, __n) (6. * FMULS_GEQRF((__m), (__n)) + 2.0 * FADDS_GEQRF((__m), (__n)) ) 261 | #define FLOPS_CGEQRF(__m, __n) (6. * FMULS_GEQRF((__m), (__n)) + 2.0 * FADDS_GEQRF((__m), (__n)) ) 262 | #define FLOPS_DGEQRF(__m, __n) ( FMULS_GEQRF((__m), (__n)) + FADDS_GEQRF((__m), (__n)) ) 263 | #define FLOPS_SGEQRF(__m, __n) ( FMULS_GEQRF((__m), (__n)) + FADDS_GEQRF((__m), (__n)) ) 264 | 265 | #define FLOPS_ZGEQLF(__m, __n) (6. * FMULS_GEQLF((__m), (__n)) + 2.0 * FADDS_GEQLF((__m), (__n)) ) 266 | #define FLOPS_CGEQLF(__m, __n) (6. * FMULS_GEQLF((__m), (__n)) + 2.0 * FADDS_GEQLF((__m), (__n)) ) 267 | #define FLOPS_DGEQLF(__m, __n) ( FMULS_GEQLF((__m), (__n)) + FADDS_GEQLF((__m), (__n)) ) 268 | #define FLOPS_SGEQLF(__m, __n) ( FMULS_GEQLF((__m), (__n)) + FADDS_GEQLF((__m), (__n)) ) 269 | 270 | #define FLOPS_ZGERQF(__m, __n) (6. * FMULS_GERQF((__m), (__n)) + 2.0 * FADDS_GERQF((__m), (__n)) ) 271 | #define FLOPS_CGERQF(__m, __n) (6. * FMULS_GERQF((__m), (__n)) + 2.0 * FADDS_GERQF((__m), (__n)) ) 272 | #define FLOPS_DGERQF(__m, __n) ( FMULS_GERQF((__m), (__n)) + FADDS_GERQF((__m), (__n)) ) 273 | #define FLOPS_SGERQF(__m, __n) ( FMULS_GERQF((__m), (__n)) + FADDS_GERQF((__m), (__n)) ) 274 | 275 | #define FLOPS_ZGELQF(__m, __n) (6. * FMULS_GELQF((__m), (__n)) + 2.0 * FADDS_GELQF((__m), (__n)) ) 276 | #define FLOPS_CGELQF(__m, __n) (6. * FMULS_GELQF((__m), (__n)) + 2.0 * FADDS_GELQF((__m), (__n)) ) 277 | #define FLOPS_DGELQF(__m, __n) ( FMULS_GELQF((__m), (__n)) + FADDS_GELQF((__m), (__n)) ) 278 | #define FLOPS_SGELQF(__m, __n) ( FMULS_GELQF((__m), (__n)) + FADDS_GELQF((__m), (__n)) ) 279 | 280 | #define FLOPS_ZUNGQR(__m, __n, __k) (6. * FMULS_UNGQR((__m), (__n), (__k)) + 2.0 * FADDS_UNGQR((__m), (__n), (__k)) ) 281 | #define FLOPS_CUNGQR(__m, __n, __k) (6. * FMULS_UNGQR((__m), (__n), (__k)) + 2.0 * FADDS_UNGQR((__m), (__n), (__k)) ) 282 | #define FLOPS_DUNGQR(__m, __n, __k) ( FMULS_UNGQR((__m), (__n), (__k)) + FADDS_UNGQR((__m), (__n), (__k)) ) 283 | #define FLOPS_SUNGQR(__m, __n, __k) ( FMULS_UNGQR((__m), (__n), (__k)) + FADDS_UNGQR((__m), (__n), (__k)) ) 284 | 285 | #define FLOPS_ZUNGQL(__m, __n, __k) (6. * FMULS_UNGQL((__m), (__n), (__k)) + 2.0 * FADDS_UNGQL((__m), (__n), (__k)) ) 286 | #define FLOPS_CUNGQL(__m, __n, __k) (6. * FMULS_UNGQL((__m), (__n), (__k)) + 2.0 * FADDS_UNGQL((__m), (__n), (__k)) ) 287 | #define FLOPS_DUNGQL(__m, __n, __k) ( FMULS_UNGQL((__m), (__n), (__k)) + FADDS_UNGQL((__m), (__n), (__k)) ) 288 | #define FLOPS_SUNGQL(__m, __n, __k) ( FMULS_UNGQL((__m), (__n), (__k)) + FADDS_UNGQL((__m), (__n), (__k)) ) 289 | 290 | #define FLOPS_ZORGQR(__m, __n, __k) (6. * FMULS_ORGQR((__m), (__n), (__k)) + 2.0 * FADDS_ORGQR((__m), (__n), (__k)) ) 291 | #define FLOPS_CORGQR(__m, __n, __k) (6. * FMULS_ORGQR((__m), (__n), (__k)) + 2.0 * FADDS_ORGQR((__m), (__n), (__k)) ) 292 | #define FLOPS_DORGQR(__m, __n, __k) ( FMULS_ORGQR((__m), (__n), (__k)) + FADDS_ORGQR((__m), (__n), (__k)) ) 293 | #define FLOPS_SORGQR(__m, __n, __k) ( FMULS_ORGQR((__m), (__n), (__k)) + FADDS_ORGQR((__m), (__n), (__k)) ) 294 | 295 | #define FLOPS_ZORGQL(__m, __n, __k) (6. * FMULS_ORGQL((__m), (__n), (__k)) + 2.0 * FADDS_ORGQL((__m), (__n), (__k)) ) 296 | #define FLOPS_CORGQL(__m, __n, __k) (6. * FMULS_ORGQL((__m), (__n), (__k)) + 2.0 * FADDS_ORGQL((__m), (__n), (__k)) ) 297 | #define FLOPS_DORGQL(__m, __n, __k) ( FMULS_ORGQL((__m), (__n), (__k)) + FADDS_ORGQL((__m), (__n), (__k)) ) 298 | #define FLOPS_SORGQL(__m, __n, __k) ( FMULS_ORGQL((__m), (__n), (__k)) + FADDS_ORGQL((__m), (__n), (__k)) ) 299 | 300 | #define FLOPS_ZUNGRQ(__m, __n, __k) (6. * FMULS_UNGRQ((__m), (__n), (__k)) + 2.0 * FADDS_UNGRQ((__m), (__n), (__k)) ) 301 | #define FLOPS_CUNGRQ(__m, __n, __k) (6. * FMULS_UNGRQ((__m), (__n), (__k)) + 2.0 * FADDS_UNGRQ((__m), (__n), (__k)) ) 302 | #define FLOPS_DUNGRQ(__m, __n, __k) ( FMULS_UNGRQ((__m), (__n), (__k)) + FADDS_UNGRQ((__m), (__n), (__k)) ) 303 | #define FLOPS_SUNGRQ(__m, __n, __k) ( FMULS_UNGRQ((__m), (__n), (__k)) + FADDS_UNGRQ((__m), (__n), (__k)) ) 304 | 305 | #define FLOPS_ZUNGLQ(__m, __n, __k) (6. * FMULS_UNGLQ((__m), (__n), (__k)) + 2.0 * FADDS_UNGLQ((__m), (__n), (__k)) ) 306 | #define FLOPS_CUNGLQ(__m, __n, __k) (6. * FMULS_UNGLQ((__m), (__n), (__k)) + 2.0 * FADDS_UNGLQ((__m), (__n), (__k)) ) 307 | #define FLOPS_DUNGLQ(__m, __n, __k) ( FMULS_UNGLQ((__m), (__n), (__k)) + FADDS_UNGLQ((__m), (__n), (__k)) ) 308 | #define FLOPS_SUNGLQ(__m, __n, __k) ( FMULS_UNGLQ((__m), (__n), (__k)) + FADDS_UNGLQ((__m), (__n), (__k)) ) 309 | 310 | #define FLOPS_ZORGRQ(__m, __n, __k) (6. * FMULS_ORGRQ((__m), (__n), (__k)) + 2.0 * FADDS_ORGRQ((__m), (__n), (__k)) ) 311 | #define FLOPS_CORGRQ(__m, __n, __k) (6. * FMULS_ORGRQ((__m), (__n), (__k)) + 2.0 * FADDS_ORGRQ((__m), (__n), (__k)) ) 312 | #define FLOPS_DORGRQ(__m, __n, __k) ( FMULS_ORGRQ((__m), (__n), (__k)) + FADDS_ORGRQ((__m), (__n), (__k)) ) 313 | #define FLOPS_SORGRQ(__m, __n, __k) ( FMULS_ORGRQ((__m), (__n), (__k)) + FADDS_ORGRQ((__m), (__n), (__k)) ) 314 | 315 | #define FLOPS_ZORGLQ(__m, __n, __k) (6. * FMULS_ORGLQ((__m), (__n), (__k)) + 2.0 * FADDS_ORGLQ((__m), (__n), (__k)) ) 316 | #define FLOPS_CORGLQ(__m, __n, __k) (6. * FMULS_ORGLQ((__m), (__n), (__k)) + 2.0 * FADDS_ORGLQ((__m), (__n), (__k)) ) 317 | #define FLOPS_DORGLQ(__m, __n, __k) ( FMULS_ORGLQ((__m), (__n), (__k)) + FADDS_ORGLQ((__m), (__n), (__k)) ) 318 | #define FLOPS_SORGLQ(__m, __n, __k) ( FMULS_ORGLQ((__m), (__n), (__k)) + FADDS_ORGLQ((__m), (__n), (__k)) ) 319 | 320 | /////// 321 | #define FMULS_UNMQR(m_, n_, k_, side_) (( (side_) == Left ) \ 322 | ? (2.*(n_)*(m_)*(k_) - (n_)*(k_)*(k_) + 2.*(n_)*(k_)) \ 323 | : (2.*(n_)*(m_)*(k_) - (m_)*(k_)*(k_) + (m_)*(k_) + (n_)*(k_) - 0.5*(k_)*(k_) + 0.5*(k_))) 324 | #define FADDS_UNMQR(m_, n_, k_, side_) (( ((side_)) == Left ) \ 325 | ? (2.*(n_)*(m_)*(k_) - (n_)*(k_)*(k_) + (n_)*(k_)) \ 326 | : (2.*(n_)*(m_)*(k_) - (m_)*(k_)*(k_) + (m_)*(k_))) 327 | #define FLOPS_DORMQR(m_, n_, k_, side_) ( FMULS_UNMQR((double)(m_), (double)(n_), (double)(k_), (side_)) + FADDS_UNMQR((double)(m_), (double)(n_), (double)(k_), (side_)) ) 328 | ////// 329 | 330 | #define FLOPS_ZGEQRS(__m, __n, __nrhs) (6. * FMULS_GEQRS((__m), (__n), (__nrhs)) + 2.0 * FADDS_GEQRS((__m), (__n), (__nrhs)) ) 331 | #define FLOPS_CGEQRS(__m, __n, __nrhs) (6. * FMULS_GEQRS((__m), (__n), (__nrhs)) + 2.0 * FADDS_GEQRS((__m), (__n), (__nrhs)) ) 332 | #define FLOPS_DGEQRS(__m, __n, __nrhs) ( FMULS_GEQRS((__m), (__n), (__nrhs)) + FADDS_GEQRS((__m), (__n), (__nrhs)) ) 333 | #define FLOPS_SGEQRS(__m, __n, __nrhs) ( FMULS_GEQRS((__m), (__n), (__nrhs)) + FADDS_GEQRS((__m), (__n), (__nrhs)) ) 334 | 335 | #define FLOPS_ZTRTRI(__n) (6. * FMULS_TRTRI((__n)) + 2.0 * FADDS_TRTRI((__n)) ) 336 | #define FLOPS_CTRTRI(__n) (6. * FMULS_TRTRI((__n)) + 2.0 * FADDS_TRTRI((__n)) ) 337 | #define FLOPS_DTRTRI(__n) ( FMULS_TRTRI((__n)) + FADDS_TRTRI((__n)) ) 338 | #define FLOPS_STRTRI(__n) ( FMULS_TRTRI((__n)) + FADDS_TRTRI((__n)) ) 339 | 340 | #define FLOPS_ZGEHRD(__n) (6. * FMULS_GEHRD((__n)) + 2.0 * FADDS_GEHRD((__n)) ) 341 | #define FLOPS_CGEHRD(__n) (6. * FMULS_GEHRD((__n)) + 2.0 * FADDS_GEHRD((__n)) ) 342 | #define FLOPS_DGEHRD(__n) ( FMULS_GEHRD((__n)) + FADDS_GEHRD((__n)) ) 343 | #define FLOPS_SGEHRD(__n) ( FMULS_GEHRD((__n)) + FADDS_GEHRD((__n)) ) 344 | 345 | #define FLOPS_ZHETRD(__n) (6. * FMULS_HETRD((__n)) + 2.0 * FADDS_HETRD((__n)) ) 346 | #define FLOPS_CHETRD(__n) (6. * FMULS_HETRD((__n)) + 2.0 * FADDS_HETRD((__n)) ) 347 | 348 | #define FLOPS_ZSYTRD(__n) (6. * FMULS_SYTRD((__n)) + 2.0 * FADDS_SYTRD((__n)) ) 349 | #define FLOPS_CSYTRD(__n) (6. * FMULS_SYTRD((__n)) + 2.0 * FADDS_SYTRD((__n)) ) 350 | #define FLOPS_DSYTRD(__n) ( FMULS_SYTRD((__n)) + FADDS_SYTRD((__n)) ) 351 | #define FLOPS_SSYTRD(__n) ( FMULS_SYTRD((__n)) + FADDS_SYTRD((__n)) ) 352 | 353 | #define FLOPS_ZGEBRD(__m, __n) (6. * FMULS_GEBRD((__m), (__n)) + 2.0 * FADDS_GEBRD((__m), (__n)) ) 354 | #define FLOPS_CGEBRD(__m, __n) (6. * FMULS_GEBRD((__m), (__n)) + 2.0 * FADDS_GEBRD((__m), (__n)) ) 355 | #define FLOPS_DGEBRD(__m, __n) ( FMULS_GEBRD((__m), (__n)) + FADDS_GEBRD((__m), (__n)) ) 356 | #define FLOPS_SGEBRD(__m, __n) ( FMULS_GEBRD((__m), (__n)) + FADDS_GEBRD((__m), (__n)) ) 357 | 358 | #endif /* _FLOPS_H_ */ 359 | -------------------------------------------------------------------------------- /src/pdgeqdwh.c: -------------------------------------------------------------------------------- 1 | /** 2 | * 3 | * Copyright (c) 2017, King Abdullah University of Science and Technology 4 | * All rights reserved. 5 | * 6 | **/ 7 | 8 | /** 9 | * 10 | * @file pdgeqdwh.c 11 | * 12 | * QDWH is a high performance software framework for computing 13 | * the polar decomposition on distributed-memory manycore systems provided by KAUST 14 | * 15 | * @version 3.0.0 16 | * @author Dalal Sukkari 17 | * @author Hatem Ltaief 18 | * @date 2018-11-08 19 | * 20 | **/ 21 | 22 | #include "polar.h" 23 | 24 | extern void pdgenm2( double *A, int M, int N, int descA[9], double *W, int descW[9], double *Sx, int descSx[9], double *e, double tol); 25 | 26 | #ifndef max 27 | #define max(a, b) ((a) > (b) ? (a) : (b)) 28 | #endif 29 | #ifndef min 30 | #define min(a, b) ((a) < (b) ? (a) : (b)) 31 | #endif 32 | 33 | 34 | /******************************************************************************* 35 | * .. Scalar Arguments .. 36 | * INTEGER IA, INFO, JA, LWORK, M, N 37 | * .. 38 | * .. Array Arguments .. 39 | * INTEGER DESCA( * ) 40 | DOUBLE PRECISION A( * ), TAU( * ), WORK( * ) 41 | * .. 42 | * 43 | * Purpose 44 | * ======= 45 | * 46 | * PDGDWH computes the polar decomposition of a real distributed M-by-N 47 | * 48 | * matrix A = U * H. 49 | * 50 | * Notes 51 | * ===== 52 | * 53 | * Each global data object is described by an associated description 54 | * vector. This vector stores the information required to establish 55 | * the mapping between an object element and its corresponding process 56 | * and memory location. 57 | * 58 | * Let A be a generic term for any 2D block cyclicly distributed array. 59 | * Such a global array has an associated description vector DESCA. 60 | * In the following comments, the character _ should be read as 61 | * "of the global array". 62 | * 63 | * NOTATION STORED IN EXPLANATION 64 | * --------------- -------------- -------------------------------------- 65 | * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, 66 | * DTYPE_A = 1. 67 | * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating 68 | * the BLACS process grid A is distribu- 69 | * ted over. The context itself is glo- 70 | * bal, but the handle (the integer 71 | * value) may vary. 72 | * M_A (global) DESCA( M_ ) The number of rows in the global 73 | * array A. 74 | * N_A (global) DESCA( N_ ) The number of columns in the global 75 | * array A. 76 | * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute 77 | * the rows of the array. 78 | * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute 79 | * the columns of the array. 80 | * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first 81 | * row of the array A is distributed. 82 | * CSRC_A (global) DESCA( CSRC_ ) The process column over which the 83 | * first column of the array A is 84 | * distributed. 85 | * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local 86 | * array. LLD_A >= MAX(1,LOCr(M_A)). 87 | * 88 | * Let K be the number of rows or columns of a distributed matrix, 89 | * and assume that its process grid has dimension p x q. 90 | * LOCr( K ) denotes the number of elements of K that a process 91 | * would receive if K were distributed over the p processes of its 92 | * process column. 93 | * Similarly, LOCc( K ) denotes the number of elements of K that a 94 | * process would receive if K were distributed over the q processes of 95 | * its process row. 96 | * The values of LOCr() and LOCc() may be determined via a call to the 97 | * ScaLAPACK tool function, NUMROC: 98 | * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), 99 | * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). 100 | * An upper bound for these quantities may be computed by: 101 | * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A 102 | * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A 103 | * 104 | * Arguments 105 | ************* 106 | * JOBH (global input) CHARACTER*1 107 | * Specifies options for computing H: 108 | * = 'H': the H (the symmetric positive 109 | * semidefinite polar factor) are returned in the array H; 110 | * = 'N': no columns of H (no symmetric positive semidefinite polar factor) are 111 | * computed. 112 | * 113 | * M (global input) INTEGER 114 | * The number of rows of the input matrix A. M >= 0. 115 | * 116 | * N (global input) INTEGER 117 | * The number of columns of the input matrix A. N >= 0. 118 | * 119 | * A (local input/output) block cyclic DOUBLE PRECISION 120 | * array, 121 | * global dimension (M, N), local dimension (MP, NQ) 122 | * On entry, this array contains the matrix to be factorized 123 | * On exit, it contain the orthogonal polar factor A_P 124 | * 125 | * IA (global input) INTEGER 126 | * The row index in the global array A indicating the first 127 | * row of sub( A ). 128 | * 129 | * JA (global input) INTEGER 130 | * The column index in the global array A indicating the 131 | * first column of sub( A ). 132 | * 133 | * DESCA (global input) INTEGER array of dimension DLEN_ 134 | * The array descriptor for the distributed matrix A. 135 | * 136 | * H (local output) block cyclic DOUBLE PRECISION 137 | * array, 138 | * global dimension (M, N), local dimension (MP, NQ) 139 | * On exit, this array contains the symmetric positive semidefinite polar factor H 140 | * 141 | * IH (global input) INTEGER 142 | * The row index in the global array H indicating the first 143 | * row of sub( H ). 144 | * 145 | * JH (global input) INTEGER 146 | * The column index in the global array H indicating the 147 | * first column of sub( H ). 148 | * 149 | * DESCH (global input) INTEGER array of dimension DLEN_ 150 | * The array descriptor for the distributed matrix H. 151 | * 152 | * WORK (local workspace/output) DOUBLE PRECISION array, dimension 153 | * (LWORK* NQ) 154 | * On exit, if INFO = 0, WORK(1) returns the optimal LWORK. 155 | * 156 | * LWORK (local input) INTEGER 157 | * The dimension of the array WORK. 158 | * LWORK must be at least LWORK >= 3*MP*NQ 159 | * If LWORK = -1, then LWORK is global input and a workspace 160 | * query is assumed; the routine only calculates the minimum 161 | * and optimal size for all work arrays. Each of these 162 | * values is returned in the first entry of the corresponding 163 | * work array, and no error message is issued by PXERBLA. 164 | * 165 | * INFO (global output) INTEGER 166 | * = 0: successful exit 167 | * < 0: If the i-th argument is an array and the j-entry had 168 | * an illegal value, then INFO = -(i*100+j), if the i-th 169 | * argument is a scalar and had an illegal value, then 170 | * INFO = -i. 171 | * 172 | ******************************************************************************/ 173 | 174 | 175 | int pdgeqdwh( char *jobh, int m, int n, 176 | double *A, int iA, int jA, int *descA, 177 | double *H, int iH, int jH, int *descH, 178 | double *Work1, int lWork1, 179 | double *Work2, int lWork2, 180 | int *info) 181 | { 182 | 183 | int init = 0; 184 | double eps; 185 | double tol1; 186 | double tol3; 187 | 188 | complex dd, sqd, a1; 189 | double conv = 100.; 190 | double a, b, c, L2, Liconv, alpha, beta, Anorm, Ainvnorm, Li, norm_est; 191 | double tol = 3.e-1; 192 | double flops_dgeqrf, flops_dorgqr, flops_dgemm, flops_dpotrf, flops_dtrsm; 193 | int MB = 2*m; 194 | int it, itconv, facto = -1; 195 | int itqr = 0, itpo = 0, alloc_qr = 0; 196 | int i1 =1, i0 = 0, iM = m+1; 197 | int myrank_mpi; 198 | double qwtime, litime, nrmtime, potime, qrtime, Htime; 199 | double sync_time_elapsed, reduced_time_elapsed; 200 | 201 | int verbose = 0, prof = 0, optcond = 0; 202 | double flops; 203 | flops = 0.0; 204 | 205 | int mloc, nloc, mlocW, nb; 206 | int myrow, mycol, nprow, npcol; 207 | int ctxt_ = 1, nb_ = 5; 208 | int ictxt; 209 | int wantH; 210 | 211 | int lWi, lwork_qr; 212 | int *Wi = (int *)calloc(1,sizeof(int)) ; 213 | double *W = (double *)calloc(1,sizeof(double)) ; 214 | 215 | int iinfo; 216 | 217 | /* 218 | * Get the grid parameters 219 | */ 220 | ictxt = descA[ctxt_]; 221 | Cblacs_get( -1, 0, &ictxt ); 222 | nb = descA[nb_]; 223 | Cblacs_gridinfo( ictxt, &nprow, &npcol, &myrow, &mycol ); 224 | mloc = numroc_( &m, &nb, &myrow, &i0, &nprow ); 225 | nloc = numroc_( &n, &nb, &mycol, &i0, &npcol ); 226 | mlocW = numroc_( &MB, &nb, &myrow, &i0, &nprow ); 227 | 228 | int lmin1, lmin2, lquery; 229 | *info = 0; 230 | lquery = (lWork1 == -1 || lWork2 == -1); 231 | wantH = 0; 232 | 233 | /* 234 | * Test the input parameters 235 | */ 236 | if( nprow == -1 ){ 237 | *info = -(700+ctxt_); 238 | } 239 | else { 240 | if ( m < n ){ 241 | fprintf(stderr, "error(m >= n is required)") ; 242 | return -1; 243 | } 244 | if (jobh[0] == 'H' || jobh[0] == 'h'){ 245 | wantH = 1; 246 | } 247 | 248 | int i2 = 2, i3 = 3, i7 = 7, i11 = 11; 249 | int *idum1, *idum2; 250 | idum1 = (int *)malloc(2*sizeof(int)) ; 251 | idum2 = (int *)malloc(2*sizeof(int)) ; 252 | chk1mat_(&m, &i2, &n, &i3, &iA, &jA, descA, &i7, info); 253 | if (wantH){ 254 | chk1mat_(&m, &i2, &n, &i3, &iH, &jH, descH, &i11, info); 255 | } 256 | //igamx2d_(descA[ctxt_], "A", " ", &i1, &i1, info, &i1, &i1, &i1, &i_1, &i_1, &i0); 257 | 258 | lquery = (lWork1 == -1 || lWork2 == -1); 259 | if (*info == 0){ 260 | lmin1 = mloc; 261 | lmin2 = mlocW; 262 | Work1[0] = lmin1; 263 | Work2[0] = lmin2; 264 | lquery = (lWork1 == -1 || lWork2 == -1); 265 | if( (lWork1 < lmin1) & !lquery ){ 266 | *info = -13; 267 | } 268 | if( (lWork2 < lmin2) & !lquery ){ 269 | *info = -15; 270 | } 271 | } 272 | 273 | idum1[0] = wantH; 274 | if( lWork1 == -1 || lWork2 == -1) { 275 | idum1[1] = -1; 276 | } 277 | else { 278 | idum1[1] = 1; 279 | } 280 | idum2[0] = 1; 281 | idum2[1] = 15; 282 | pchk1mat_( &m, &i2, &n, &i3, &iA, &jA, descA, &i7, &i2, idum1, idum2, 283 | info ); 284 | if ((*info == 0) && wantH){ 285 | pchk1mat_( &m, &i2, &n, &i3, &iH, &jH, descH, &i11, &i0, idum1, idum2, 286 | info ); 287 | } 288 | } 289 | 290 | if( *info != 0 ){ 291 | pxerbla_( &ictxt, "PDGEQDWH", &(int){-1*info[0]} ); 292 | return 0; 293 | } 294 | else if ( lquery ){ 295 | //lquery = (lWork1 == -1 || lWork2 == -1); 296 | //if ( lquery ){ 297 | /* 298 | * Find Workspace 299 | */ 300 | /* 301 | int lwork_qr = -1, lwork_cn = -1; 302 | pdgecon_ ("1", &m, H, &iH, &jH, descH, 303 | &Anorm, &Li, 304 | Work, &lwork_cn, Wi, &lWi, info); 305 | lwork_cn = (int)Work[0]; 306 | lWi = N;//(int)iWloc[0]; 307 | 308 | pdgeqrf_(&MB, &n, H, &iH, &iH, descH, 309 | tau, Work, &lwork_qr, info); 310 | lwork_qr = Work[0]; 311 | lWork = max ( lwork_cn, lwork_qr); 312 | lWi = N; 313 | */ 314 | //Work[0] = mlocW_; //B = Work; 315 | //Work[0] = 3*mloc; 316 | //Work[0] = 3*m; 317 | //Work[0] = ((3*m+nb)/nb)*nb; 318 | Work1[0] = mloc; 319 | Work2[0] = mlocW; 320 | return 0; 321 | } 322 | 323 | /* Quick return if possible */ 324 | if ( m == 0 || n == 0 ){ 325 | return 0; 326 | } 327 | 328 | /** 329 | * Create the required workspaces 330 | * Needed for debugging the code 331 | */ 332 | 333 | double *U=NULL, *B=NULL; 334 | int descU[9], descB[9]; 335 | 336 | //int MB3 = 3*m; 337 | //int mlocW3 = numroc_( &MB3, &nb, &myrow, &i0, &nprow ); 338 | if ( Work1 == NULL ) { 339 | U = (double *)malloc(mloc*nloc*sizeof(double)); 340 | //B = (double *)malloc(mlocW*nloc*sizeof(double)); 341 | } 342 | if ( Work2 == NULL ) { 343 | //A = (double *)malloc(mloc*nloc*sizeof(double)); 344 | B = (double *)malloc(mlocW*nloc*sizeof(double)); 345 | } 346 | else { 347 | U = Work1; 348 | //B = A + mlocW3*nloc; 349 | B = Work2; 350 | } 351 | 352 | descinit_( descU, &m, &n, &nb, &nb, &i0, &i0, &ictxt, &mloc, &iinfo ); 353 | descinit_( descB, &MB, &n, &nb, &nb, &i0, &i0, &ictxt, &mlocW, &iinfo ); //B = A + mloc*nloc; 354 | 355 | //lWork = 3*m; MB = 2*m; 356 | //descinit_( descA, &lWork, &n, &nb, &nb, &MB, &i0, &ictxt, &mloc, &iinfo ); 357 | //descinit_( descB, &lWork, &N, &nb, &nb, &i0, &i0, &ictxt, &mlocW, &iinfo ); //B = Work; 358 | 359 | double *tau = (double *)malloc(nloc*sizeof(double)) ; 360 | 361 | if ( !optcond ){ 362 | lWi = n; 363 | Wi = (int *)malloc(lWi*sizeof(int)) ; 364 | } 365 | 366 | MPI_Comm_rank(MPI_COMM_WORLD, &myrank_mpi); 367 | 368 | if (verbose && (myrank_mpi == 0)) { fprintf(stderr, "Entering QDWH\n");} 369 | if (verbose && (myrank_mpi == 0)) { fprintf(stderr, "Preparing workspace for QDWH\n");} 370 | qwtime = 0.0; 371 | if(prof) {qwtime -= MPI_Wtime();} 372 | 373 | if (!init) { 374 | eps = pdlamch_( &ictxt, "E" ); 375 | tol1 = 5. * eps; 376 | tol3 = pow(tol1, 1./3.); 377 | init = 1; 378 | } 379 | 380 | 381 | if (verbose && (myrank_mpi == 0)) { fprintf(stderr, "Finish preparing workspace for QDWH\n");} 382 | 383 | /* 384 | * Save copy of A ==> H = U'*A 385 | */ 386 | pdlacpy_ ( "A", &m, &n, A, &i1, &i1, descA, U, &i1, &i1, descU ); 387 | 388 | if (verbose && (myrank_mpi == 0)) { fprintf(stderr, "Cond estimate starts\n");} 389 | /* 390 | * Calculate Li: reciprocal of condition number estimation 391 | */ 392 | 393 | litime = 0.0; 394 | if(prof) {litime =- MPI_Wtime();} 395 | 396 | pdlacpy_ ( "A", &m, &n, A, &i1, &i1, descA, B, &i1, &i1, descB ); 397 | if (verbose && (myrank_mpi == 0)) { fprintf(stderr, "lacpy ends\n");} 398 | Anorm = pdlange_ ( "1", &m, &n, U, &i1, &i1, descU, H); 399 | if (verbose && (myrank_mpi == 0)) { fprintf(stderr, "dlange ends\n");} 400 | 401 | alpha = 1.0; 402 | pdgenm2( U, m, n, descU, B, descB, H, descH, &norm_est, tol); 403 | pdlascl_( "G", &norm_est, &alpha, &m, &n, A, &i1, &i1, descA, &iinfo); 404 | //pdlascl_( "G", &alpha, &norm_est, &m, &n, A, &i1, &i1, descA, &iinfo); 405 | 406 | 407 | /* estimate condition number using QR */ 408 | if ( optcond ){ 409 | pdgeqrf_(&m, &n, B, &i1, &i1, descB, tau, H, &lWork1, &iinfo); 410 | 411 | sync_time_elapsed =- MPI_Wtime(); 412 | pdtrtri_( "U", "N", &n, B, &i1, &i1, descB, &iinfo ); 413 | sync_time_elapsed += MPI_Wtime(); 414 | MPI_Allreduce( &sync_time_elapsed, &reduced_time_elapsed, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD); 415 | 416 | Ainvnorm = pdlange_ ( "1", &m, &n, B, &i1, &i1, descB, H); 417 | Li = ( 1.0 / Ainvnorm)/Anorm; 418 | Li = norm_est/1.1*Li; 419 | flops += FLOPS_DGEQRF( m, n ) 420 | + FLOPS_DTRTRI( n ); 421 | } 422 | /* estimate condition number using LU */ 423 | else { 424 | pdgetrf_ ( &m, &n, B, &i1, &i1, descB, Wi, &iinfo ); 425 | if (verbose && (myrank_mpi == 0)) { fprintf(stderr, "LU ends\n");} 426 | 427 | int lwork_cn = -1; 428 | pdgecon_ ("1", &m, B, &i1, &i1, descB, &Anorm, &Li, H, &lwork_cn, Wi, &lWi, &iinfo); 429 | lwork_cn = H[0]; 430 | 431 | pdgecon_ ("1", &m, B, &i1, &i1, descB, &Anorm, &Li, H, &lwork_cn, Wi, &lWi, &iinfo); 432 | Li = norm_est/1.1*Li; 433 | //flops += FLOPS_DGETRF(n, n) + 2. * FLOPS_DTRSM( 'L', n, 1 ); 434 | flops += FLOPS_DGETRF(n, n); 435 | } 436 | 437 | if(prof) {litime += MPI_Wtime();} 438 | if (verbose && (myrank_mpi == 0)) { fprintf(stderr, "Cond estimate ends\n");} 439 | 440 | /* 441 | * Calculate norm_est 442 | * Scal the matrix by norm_est 443 | */ 444 | 445 | if (verbose && (myrank_mpi == 0)) { fprintf(stderr, "Normest starts\n");} 446 | nrmtime = 0.0; 447 | if(prof) {nrmtime =- MPI_Wtime();} 448 | 449 | 450 | if(prof) {nrmtime += MPI_Wtime();} 451 | if (verbose && (myrank_mpi == 0)) { fprintf(stderr, "Normest ends\n");} 452 | 453 | 454 | itconv = 0; Liconv = Li; 455 | 456 | int itcqr = 0, itcpo = 0; 457 | while(itconv == 0 || fabs(1-Liconv) > tol1 ) { 458 | /* To find the minimum number of iterations to converge. 459 | * itconv = number of iterations needed until |Li - 1| < tol1 460 | * This should have converged in less than 50 iterations 461 | */ 462 | if (itconv > 100) { 463 | exit(-1); 464 | break; 465 | } 466 | itconv++; 467 | 468 | L2 = Liconv * Liconv; 469 | dd = cpow( 4. * (1. - L2 ) / (L2 * L2), 1./3. ); 470 | sqd = sqrt(1. + dd); 471 | a1 = sqd + sqrt( 8. - 4. * dd + 8. * (2. - L2) / (L2 * sqd) ) / 2.; 472 | a = creal(a1); 473 | b = (a - 1.) * (a - 1.) / 4.; 474 | c = a + b - 1.; 475 | if (c > 100) {itcqr += 1; alloc_qr = 1;} else {itcpo += 1;} 476 | // Update Liconv 477 | Liconv = Liconv * (a + b * L2) / (1. + c * L2); 478 | } 479 | if (verbose && (myrank_mpi == 0)) { fprintf(stderr, "QDWH loop starts\n");} 480 | if (myrank_mpi == 0) { fprintf(stderr, "\nItConv %d itcqr %d itcpo %d norm_est %2.4e Li %2.4e \n", itconv, itcqr, itcpo, norm_est, Li); fprintf(stderr, "It Facto Conv\n");} 481 | it = 0; 482 | 483 | if ( alloc_qr ){ 484 | lwork_qr = -1; 485 | pdgeqrf_(&MB, &n, B, &i1, &i1, descB, 486 | tau, W, &lwork_qr, &iinfo); 487 | lwork_qr = W[0]; 488 | W = (double *)malloc((lwork_qr)*sizeof(double)) ; 489 | } 490 | 491 | 492 | while(conv > tol3 || it < itconv ) { 493 | /* This should have converged in less than 50 iterations */ 494 | if (it > 100) { 495 | exit(-1); 496 | break; 497 | } 498 | it++; 499 | 500 | /* Copy U into B1 */ 501 | //pdlacpy_( "A", &m, &n, U, &i1, &i1, descU, C, &i1, &i1, descC ); 502 | 503 | // Compute parameters L,a,b,c (second, equivalent way). 504 | L2 = Li * Li; 505 | dd = cpow( 4. * (1. - L2 ) / (L2 * L2), 1./3. ); 506 | sqd = sqrt(1. + dd); 507 | a1 = sqd + sqrt( 8. - 4. * dd + 8. * (2. - L2) / (L2 * sqd) ) / 2.; 508 | a = creal(a1); 509 | b = (a - 1.) * (a - 1.) / 4.; 510 | c = a + b - 1.; 511 | // Update Li 512 | Li = Li * (a + b * L2) / (1. + c * L2); 513 | 514 | if ( c > 100) { 515 | 516 | qrtime = 0.0; 517 | if(prof) {qrtime =- MPI_Wtime();} 518 | 519 | /* Copy U into C to check the convergence of QDWH */ 520 | if (it >= itconv ){ 521 | pdlacpy_( "A", &m, &n, A, &i1, &i1, descA, H, &i1, &i1, descH ); 522 | } 523 | 524 | /** 525 | * Generate the matrix B = [ B1 ] = [ sqrt(c) * U ] 526 | * [ B2 ] = [ Id ] 527 | */ 528 | pdlacpy_( "A", &m, &n, A, &i1, &i1, descA, B, &i1, &i1, descB ); 529 | alpha = 1.0; beta = sqrt(c); 530 | pdlascl_( "G", &alpha, &beta, &m, &n, B, &i1, &i1, descB, &iinfo); 531 | alpha = 0.; beta =1.; 532 | pdlaset_( "G", &m, &n, &alpha, &beta, B, &iM, &i1, descB); 533 | 534 | /** 535 | * Factorize B = QR, and generate the associated Q 536 | */ 537 | sync_time_elapsed =- MPI_Wtime(); 538 | 539 | pdgeqrf_(&MB, &n, B, &i1, &i1, descB, tau, W, &lwork_qr, &iinfo); 540 | pdorgqr_(&MB, &n, &n, B, &i1, &i1, descB, tau, W, &lwork_qr, &iinfo); 541 | 542 | sync_time_elapsed += MPI_Wtime(); 543 | MPI_Allreduce( &sync_time_elapsed, &reduced_time_elapsed, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD); 544 | 545 | /** 546 | * Gemm to find the conv-norm 547 | * U = ( (a-b/c)/sqrt(c) ) * Q1 * Q2' + (b/c) * U 548 | */ 549 | alpha = (a-b/c)/sqrt(c); beta = (b/c); 550 | pdgemm_( "N", "T", &m, &n, &n, &alpha, B, &i1, &i1, descB, B, &iM, &i1, 551 | descB, &beta, A, &i1, &i1, descA); 552 | 553 | if(prof) {qrtime += MPI_Wtime();} 554 | 555 | /* Main flops used in this step */ 556 | flops_dgeqrf = FLOPS_DGEQRF( 2*m, n ); 557 | flops_dorgqr = FLOPS_DORGQR( 2*m, n, n ); 558 | flops_dgemm = FLOPS_DGEMM( m, n, n ); 559 | flops += flops_dgeqrf + flops_dorgqr + flops_dgemm; 560 | 561 | itqr += 1; 562 | facto = 0; 563 | } 564 | else { 565 | /** 566 | * Compute Q1 = c * U * U' + I 567 | */ 568 | alloc_qr = 0; 569 | potime = 0.0; 570 | if(prof) {potime =- MPI_Wtime();} 571 | 572 | alpha = 0.; beta =1.; 573 | pdlaset_( "G", &m, &n, &alpha, &beta, H, &i1, &i1, descH); 574 | 575 | sync_time_elapsed =- MPI_Wtime(); 576 | 577 | pdgemm_( "T", "N", &m, &n, &n, &c, A, &i1, &i1, descA, A, &i1, &i1, 578 | descA, &beta, H, &i1, &i1, descH); 579 | 580 | sync_time_elapsed += MPI_Wtime(); 581 | MPI_Allreduce( &sync_time_elapsed, &reduced_time_elapsed, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD); 582 | 583 | /** 584 | * Solve Q1 x = Q2, with Q2 = U 585 | */ 586 | alpha = 1.0; beta = 0.0; 587 | pdgeadd_( "T", &m, &n, &alpha, A, &i1, &i1, descA, &beta, B, &i1, &i1, descB); 588 | 589 | sync_time_elapsed =- MPI_Wtime(); 590 | 591 | pdposv_( "U", &m, &n, H, &i1, &i1, descH, B, &i1, &i1, descB, &iinfo); 592 | 593 | sync_time_elapsed += MPI_Wtime(); 594 | MPI_Allreduce( &sync_time_elapsed, &reduced_time_elapsed, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD); 595 | 596 | /* Copy U into H to check the convergence of QDWH */ 597 | if (it >= itconv ){ 598 | pdlacpy_( "A", &m, &n, A, &i1, &i1, descA, H, &i1, &i1, descH ); 599 | } 600 | 601 | /** 602 | * Compute U = (a-b/c) * Q2' + (b/c) * U 603 | */ 604 | alpha = (a-b/c); beta = (b/c); 605 | pdgeadd_ ( "T", &m, &n, &alpha, B, &i1, &i1, descB, &beta, A, &i1, &i1, descA); 606 | 607 | if(prof) {potime += MPI_Wtime();} 608 | 609 | /* Main flops used in this step */ 610 | flops_dgemm = FLOPS_DGEMM( m, n, n ); 611 | flops_dpotrf = FLOPS_DPOTRF( m ); 612 | flops_dtrsm = FLOPS_DTRSM( 'L', m, n ); 613 | flops += flops_dgemm + flops_dpotrf + 2. * flops_dtrsm; 614 | 615 | itpo += 1; 616 | facto = 1; 617 | } 618 | 619 | /** 620 | * Compute the norm of the symmetric matrix U - B1 621 | */ 622 | conv = 10.; 623 | if(it >= itconv ){ 624 | alpha = 1.0; beta = -1.0; 625 | pdgeadd_ ( "N", &m, &n, &alpha, A, &i1, &i1, descA, &beta, H, &i1, &i1, descH); 626 | 627 | sync_time_elapsed =- MPI_Wtime(); 628 | 629 | conv = pdlange_( "F", &m, &n, H, &i1, &i1, descH, W); 630 | 631 | sync_time_elapsed += MPI_Wtime(); 632 | MPI_Allreduce( &sync_time_elapsed, &reduced_time_elapsed, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD); 633 | } 634 | if (verbose && (myrank_mpi == 0)) fprintf(stderr, "%02d %-5s %e\n", it, 635 | facto == 0 ? "QR" : "PO", conv ); 636 | } 637 | if (verbose && (myrank_mpi == 0)) { fprintf(stderr, "QDWH loop ends\n");} 638 | 639 | /* 640 | * A = U*H ==> H = U'*A ==> H = 0.5*(H'+H) 641 | */ 642 | 643 | Htime = 0.0; 644 | if(prof) {Htime =- MPI_Wtime();} 645 | 646 | alpha = 1.0; beta = 0.0; 647 | pdgemm_( "T", "N", &m, &n, &n, &alpha, A, &i1, &i1, descA, U, &i1, &i1, 648 | descU, &beta, H, &i1, &i1, descH); 649 | pdlacpy_( "A", &m, &n, H, &i1, &i1, descH, B, &i1, &i1, descB ); 650 | alpha = 0.5; 651 | pdgeadd_ ( "T", &m, &n, &alpha, B, &i1, &i1, descB, &alpha, H, &i1, &i1, descH); 652 | 653 | if(prof) {Htime += MPI_Wtime();} 654 | 655 | flops_dgemm = FLOPS_DGEMM( m, n, n ); 656 | flops += flops_dgemm; 657 | 658 | 659 | if(prof) {qwtime += MPI_Wtime();} 660 | 661 | if (prof && (myrank_mpi == 0)) { 662 | fprintf(stderr, "# QDWH Profiling \n"); 663 | fprintf(stderr, "#\n"); 664 | fprintf(stderr, "# \tn \ttimeQDWH \ttimeLi \ttimeNrm \ttime1itQR \t#QR \ttime1itPO \t#PO \ttimeFormH \n"); 665 | fprintf(stderr, " \t%d \t%2.4e \t%2.4e \t%2.4e \t%2.4e \t%d \t%2.4e \t%d \t%2.4e \n", m, qwtime, litime, nrmtime, qrtime, itqr, potime, itpo, Htime); 666 | } 667 | if (myrank_mpi == 0) { 668 | fprintf(stderr, "#\n"); 669 | fprintf(stderr, "# \t#QR \t#PO \n"); 670 | fprintf(stderr, " \t%d \t%d \n", itqr, itpo); 671 | } 672 | 673 | free( tau ); 674 | if ( !optcond ){ 675 | free( Wi ); 676 | } 677 | if ( Work1 == NULL ) { 678 | free( U ); 679 | //free( B ); 680 | } 681 | if ( Work2 == NULL ) { 682 | //free( A ); 683 | free( B ); 684 | } 685 | 686 | if (verbose && (myrank_mpi == 0)) { fprintf(stderr, "Exiting QDWH\n");} 687 | return 0; 688 | 689 | } 690 | --------------------------------------------------------------------------------