├── LICENSE ├── Makefile ├── README.md ├── alg ├── LU │ ├── Makefile │ ├── lu_25d_pvt.cxx │ ├── lu_25d_pvt.h │ ├── lu_offload.cxx │ ├── lu_offload.h │ ├── partial_pvt.cxx │ ├── partial_pvt.h │ ├── tnmt_pvt.cxx │ └── tnmt_pvt.h ├── MM │ ├── Makefile │ ├── charm_splitdim_cannon │ │ ├── Makefile │ │ ├── run_vspc.cxx │ │ ├── spcannon_internal.h │ │ ├── vpblock.cxx │ │ ├── vspcannon.ci │ │ ├── vspcannon.cxx │ │ └── vspcannon.h │ ├── splitdim_cannon │ │ ├── Makefile │ │ ├── spcannon.cxx │ │ ├── spcannon.h │ │ ├── spcannon_internal.h │ │ └── unicannon.cxx │ └── topo_pdgemm │ │ ├── Makefile │ │ ├── d25_summa.cxx │ │ ├── dual_cannon.cxx │ │ ├── summa.cxx │ │ ├── topo_pdgemm_algs.h │ │ ├── topo_pdgemm_bench.cxx │ │ └── topo_pdgemm_unit.cxx ├── Makefile ├── QR │ ├── Makefile │ ├── hh_recon │ │ ├── Makefile │ │ ├── hh_recon.cxx │ │ ├── hh_recon.h │ │ ├── yamamoto.cxx │ │ └── yamamoto.h │ ├── qr_2d │ │ ├── Makefile │ │ ├── qr_2d.cxx │ │ ├── qr_2d.h │ │ ├── qr_butterfly_2d.cxx │ │ ├── qr_tree_2d.cxx │ │ ├── qr_y2d.cxx │ │ └── qr_y2d.h │ └── tsqr │ │ ├── Makefile │ │ ├── apply_butterfly_tsqr_QT.cxx │ │ ├── apply_tsqr_QT.cxx │ │ ├── bitree_tsqr.cxx │ │ ├── bitree_tsqr.h │ │ ├── butterfly_construct_Q.cxx │ │ ├── butterfly_tsqr.cxx │ │ ├── butterfly_tsqr.h │ │ └── construct_tsqr_Q.cxx ├── SE │ ├── CANSE.h │ ├── Makefile │ ├── dmatrix.cxx │ ├── dmatrix.h │ ├── drive_band_to_band.cxx │ ├── full_to_band.cxx │ ├── full_to_band_3d.cxx │ └── full_to_band_scala.cxx └── shared │ ├── Makefile │ ├── comm.h │ ├── lapack.cxx │ ├── lapack.h │ ├── pmpi.h │ ├── timer.cxx │ ├── timer.h │ ├── util.cxx │ └── util.h ├── bench ├── LU │ ├── Makefile │ ├── lu_25d_pvt_bench.cxx │ ├── par_tnmt_bench.cxx │ └── pblas_lu.c ├── MM │ ├── Makefile │ ├── bench_spc.cxx │ └── topo_pdgemm_bench.cxx ├── Makefile ├── QR │ ├── Makefile │ ├── bench_hh_recon.cxx │ ├── bench_qr_2d.cxx │ ├── bench_qr_2d_hh_scala.cxx │ ├── bench_qr_butterfly_2d.cxx │ ├── bench_qr_seq.cxx │ ├── bench_qr_tree_2d.cxx │ ├── bench_qr_y2d.cxx │ 
└── bench_scala_qr.cxx └── SE │ ├── Makefile │ ├── bench_elpa_sym_eig.cxx │ ├── bench_full2band.cxx │ ├── bench_full2band_3d.cxx │ ├── bench_scala_sym_eig.cxx │ └── compare_sytrd.cxx ├── bin ├── benchmarks │ └── .gitignore └── tests │ └── .gitignore ├── configure ├── include └── CANDMC.h ├── lib └── .gitignore ├── scripts ├── bench_all.sh └── test_all.sh └── test ├── LU ├── Makefile ├── lu_25d_pvt_unit_test.cxx ├── lu_25d_unit_test.cxx ├── par_pivot_unit_test.cxx ├── par_tnmt_unit_test.cxx ├── pvt_unit_test.cxx ├── seq_tnmt_unit_test.cxx ├── unit_test.cxx └── unit_test.h ├── MM ├── Makefile ├── test_spc.cxx └── topo_pdgemm_unit.cxx ├── Makefile ├── QR ├── Makefile ├── test_bitree_tsqr.cxx ├── test_construct_tsqr_Q.cxx ├── test_hh_recon.cxx ├── test_qr_2d.cxx ├── test_qr_butterfly_2d.cxx ├── test_qr_tree_2d.cxx ├── test_qr_y2d.cxx └── test_scala_qr_2d.cxx └── SE ├── Makefile ├── test_full2band.cxx ├── test_full2band_3d.cxx ├── test_full2band_scala.cxx └── test_scala_sym_eig.cxx /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2015, Edgar Solomonik 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without modification, 5 | are permitted provided that the following conditions are met: 6 | 7 | * Redistributions of source code must retain the above copyright notice, this 8 | list of conditions and the following disclaimer. 9 | 10 | * Redistributions in binary form must reproduce the above copyright notice, this 11 | list of conditions and the following disclaimer in the documentation and/or 12 | other materials provided with the distribution. 13 | 14 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 15 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 16 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 17 | DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR 18 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 19 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 20 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON 21 | ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 22 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 23 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 24 | -------------------------------------------------------------------------------- /Makefile: --------------------------------------------------------------------------------
1 | # Top-level Makefile: forwards library, test and benchmark goals to the
2 | # sub-makes in alg/, test/ and bench/.
3 | include config.mk
4 | 
5 | TESTS := lu_25d_np_test lu_25d_pp_test lu_25d_tp_test test_bitree_tsqr \
6 |          test_construct_tsqr_Q test_hh_recon test_qr_2d test_qr_y2d test_qr_butterfly_2d \
7 |          test_qr_tree_2d test_scala_qr_2d test_spc topo_pdgemm_unit test_scala_sym_eig \
8 |          test_full2band test_full2band_scala test_full2band_3d
9 | 
10 | BENCHMARKS := bench_hh_recon bench_qr_2d bench_qr_butterfly_2d bench_qr_tree_2d \
11 |               bench_scala_qr bench_qr_2d_hh_scala bench_spc bench_scala_sym_eig lu_25d_np_bench \
12 |               lu_25d_pp_bench lu_25d_tp_bench topo_pdgemm_bench bench_qr_seq \
13 |               bench_full2band bench_elpa_sym_eig bench_full2band_3d bench_qr_y2d
14 | 
15 | # Every goal below is a command forwarded to a sub-make, never a file created
16 | # by this Makefile; lib, test and bench are moreover existing directories.
17 | # Declaring the goals phony keeps them from being treated as up-to-date files.
18 | .PHONY: lib test bench clean CANMM CANLU CANQR CANSE CANDMC $(TESTS) $(BENCHMARKS)
19 | 
20 | # Default goal: build the combined library.
21 | lib: CANDMC
22 | 
23 | # Library goals are built by alg/Makefile under the same goal name.
24 | CANMM CANLU CANQR CANSE CANDMC:
25 | 	$(MAKE) $@ -C alg
26 | 
27 | test: CANDMC
28 | 	$(MAKE) $@ -C test
29 | 
30 | bench: CANDMC
31 | 	$(MAKE) $@ -C bench
32 | 
33 | # Individual test and benchmark executables, forwarded by name.
34 | $(TESTS): CANDMC
35 | 	$(MAKE) $@ -C test
36 | 
37 | $(BENCHMARKS): CANDMC
38 | 	$(MAKE) $@ -C bench
39 | 
40 | # Remove the libraries and executables, then clean each sub-tree. Paths are
41 | # spelled out instead of using cd, and every command has its own recipe line,
42 | # so a missing directory cannot shift the working directory of later commands
43 | # and a failing sub-make is reported instead of being ignored.
44 | clean:
45 | 	rm -f lib/libCANDMC.a lib/libCANMM.a lib/libCANLU.a lib/libCANQR.a lib/libCANSE.a lib/libCANShared.a
46 | 	rm -f $(addprefix bin/tests/,$(TESTS))
47 | 	rm -f $(addprefix bin/benchmarks/,$(BENCHMARKS))
48 | 	$(MAKE) $@ -C alg
49 | 	$(MAKE) $@ -C test
50 | 	$(MAKE) $@ -C bench
51 | 
-------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | CANDMC 2 | ===== 3 | 4 | ## **Communication Avoiding Numerical Dense Matrix Computations** 5 | 6 | **Purpose:** 7 | 8 | This repository contains studies for algorithms to perform matrix multiplication and dense matrix factorizations, currently: LU, QR, and the symmetric eigensolve. 9 | 10 | **Requirements:** 11 | 12 | Some version of BLAS and LAPACK is required for any build. LAPACK version 3.4.0 or higher is required to build the QR codes. 13 | 14 | **Build Instructions:** 15 | 16 | Running ./configure will generate a config.mk file with build parameters and a Makefile. 17 | See the configure file for build options. Profiling may be activated with flag -DPROFILE (add to DEFS in config.mk). 18 | After running configure, running 'make' will build the library and place it into lib/libCANDMC.a. 19 | Specific contents may be built individually: 20 | * library of all routines 'CANDMC' 21 | * library of all shared routines required for use of any individual algorithmic library component 'CANShared' 22 | * library for matrix multiplication algorithms 'CANMM' 23 | * library for LU factorization algorithms 'CANLU' 24 | * library for QR factorization algorithms 'CANQR' 25 | * library for symmetric eigensolve algorithms 'CANSE' 26 | * all unit tests 'test', executables appear in bin/tests/ 27 | * all benchmarks 'bench', executables appear in bin/benchmarks/ 28 | 29 | **Accreditation:** 30 | 31 | Code is available under a two-clause BSD license. 32 | 33 | Repository created and maintained by Edgar Solomonik (ETH Zurich). Please contact solomonik@inf.ethz.ch with any questions or inquiries. 34 | 35 | Thanks to the following developers, snippets of whose code are used in a few places of this repository. 
36 | * Grey Ballard (Sandia Laboratory) 37 | * Mathias Jacquelin (Lawrence Berkeley National Laboratory) 38 | * Devin Matthews (University of Texas at Austin) 39 | 40 | -------------------------------------------------------------------------------- /alg/LU/Makefile: --------------------------------------------------------------------------------
1 | include ../../config.mk
2 | 
3 | LIB_DIR = ../../lib
4 | 
5 | # Objects that make up the LU library.
6 | LU_OBJS := lu_25d_pvt.o tnmt_pvt.o partial_pvt.o
7 | 
8 | # CANLU and clean are commands, not files.
9 | .PHONY: CANLU clean
10 | 
11 | CANLU: $(LIB_DIR)/libCANLU.a
12 | 
13 | # NOTE(review): the glob archives every object file present in this directory,
14 | # not only $(LU_OBJS) -- confirm that this is intended before narrowing it.
15 | $(LIB_DIR)/libCANLU.a: $(LU_OBJS)
16 | 	$(AR) -crs $@ *.o
17 | 
18 | # One rule for all objects: each is rebuilt when its source or its same-named
19 | # header changes ($< is the .cxx file because it is listed first).
20 | $(LU_OBJS): %.o: %.cxx %.h
21 | 	$(CXX) -c $< -o $@ $(CXXFLAGS) $(DEFS)
22 | 
23 | clean:
24 | 	rm -f *.o
25 | 
-------------------------------------------------------------------------------- /alg/LU/lu_25d_pvt.h: -------------------------------------------------------------------------------- 1 | #ifndef __LU_25D_PVT_H__ 2 | #define __LU_25D_PVT_H__ 3 | 4 | #include "../shared/comm.h" 5 | #include "../shared/util.h" 6 | 7 | //#define SHARE_MIC 2 8 | 9 | typedef struct lu_25d_pvt_params { 10 | int pvt; /* 1-> do pivoting, 0-> no pivoting */ 11 | int is_tnmt_pvt; 12 | int myRank; 13 | int c_rep; 14 | int matrixDim; 15 | int blockDim; 16 | int big_blockDim; 17 | int num_pes_dim; 18 | int layerRank; 19 | int myRow; 20 | int myCol; 21 | CommData cdt_row; 22 | CommData cdt_col; 23 | CommData cdt_kdir; 24 | CommData cdt_kcol; 25 | } lu_25d_pvt_params_t; 26 | 27 | 28 | void lu_25d_pvt(lu_25d_pvt_params_t *p, 29 | double *mat_A, 30 | int *mat_pvt, 31 | int *pvt_buffer, 32 | double *buffer, 33 | int is_alloced=0); 34 | 35 | #endif //__LU_25D_PVT_H__ 36 | -------------------------------------------------------------------------------- /alg/LU/lu_offload.h: 
-------------------------------------------------------------------------------- 1 | #ifndef __LU_OFFLOAD_H__ 2 | #define __LU_OFFLOAD_H__ 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include "../shared/util.h" 10 | 11 | 12 | #ifdef USE_MIC 13 | #include "mkl.h" 14 | #include "omp.h" 15 | #define ASYNC_GEMM 16 | #define MIC_PER_NODE 2 17 | #endif 18 | 19 | enum OFF_MAT { OFF_A, OFF_L, OFF_U }; 20 | 21 | #ifdef USE_MIC 22 | __declspec(target(mic:mic_rank)) 23 | #endif 24 | double * get_mat_handle(OFF_MAT omat); 25 | 26 | void set_mic_rank(int mic_rank); 27 | 28 | void wait_gemm(); 29 | 30 | void offload_gemm_A(char tA, 31 | char tB, 32 | int m, 33 | int n, 34 | int k, 35 | double alpha, 36 | int offset_A, 37 | OFF_MAT omat_A, 38 | int lda_A, 39 | int offset_B, 40 | OFF_MAT omat_B, 41 | int lda_B, 42 | double beta, 43 | int offset_C, 44 | OFF_MAT omat_C, 45 | int lda_C); 46 | 47 | void download_lda_cpy(int nrow, 48 | int ncol, 49 | int lda_A, 50 | int lda_B, 51 | int offset_A, 52 | double * B, 53 | OFF_MAT omat_A); 54 | 55 | void download_lda_cpy(int nrow, 56 | int ncol, 57 | int lda_A, 58 | int lda_B, 59 | int offset_A, 60 | double * B, 61 | OFF_MAT omat_A); 62 | 63 | 64 | void upload_lda_cpy(int nrow, 65 | int ncol, 66 | int lda_A, 67 | int lda_B, 68 | double const * A, 69 | int offset_B, 70 | OFF_MAT omat_B); 71 | 72 | 73 | void upload_lda_cpy(int nrow, 74 | int ncol, 75 | int lda_A, 76 | int lda_B, 77 | double const * A, 78 | int offset_B, 79 | OFF_MAT omat_B); 80 | 81 | void offload_sparse_rw(int nrow, 82 | int ncol, 83 | int lda_B, 84 | double * A, 85 | int lda_A, 86 | int * offsets_transfer, 87 | OFF_MAT omat_B, 88 | char rw); 89 | 90 | void upload_A(); 91 | void alloc_A(int64_t size, double * ptr); 92 | void alloc_L(int64_t size); 93 | void alloc_U(int64_t size); 94 | void alloc_transfer(int64_t size); 95 | void free_offload_A();//int64_t size); 96 | void free_offload_L();//int64_t size); 97 | void free_offload_U();//int64_t 
size); 98 | void free_offload_transfer();//int64_t size); 99 | 100 | 101 | #endif 102 | -------------------------------------------------------------------------------- /alg/LU/partial_pvt.cxx: -------------------------------------------------------------------------------- 1 | /* Copyright (c) Edgar Solomonik 2015, all rights reserved. This code is part of the CANDMC repository, protected under a two-clause BSD license. */ 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include "../shared/util.h" 8 | #include "partial_pvt.h" 9 | #include "tnmt_pvt.h" 10 | #include 11 | 12 | extern "C"{ 13 | int idamax_(int const * N, double const * A, int const * inc_A); 14 | } 15 | //void dscal_(int const * n, double * dA, double * dX, int const * incX); 16 | //void dger_( int const * M, 17 | // int const * N, 18 | // double const * alpha, 19 | // double const * X, 20 | // int const * incX, 21 | // double const * Y, 22 | // int const * incY, 23 | // double * A, 24 | // int const * lda); 25 | //} 26 | // 27 | int cidamax(int const N, double const * A, int const inc_A){ 28 | return idamax_(&N, A, &inc_A); 29 | } 30 | // 31 | //void cdscal(int const n, double dA, double * dX, int const incX){ 32 | // dscal_(&n, &dA, dX, &incX); 33 | //} 34 | // 35 | //void cdger(int const M, 36 | // int const N, 37 | // double const alpha, 38 | // double const * X, 39 | // int const incX, 40 | // double const * Y, 41 | // int const incY, 42 | // double * A, 43 | // int const lda){ 44 | // dger_(&M, &N, &alpha, X, &incX, Y, &incY, A, &lda); 45 | //} 46 | 47 | 48 | 49 | /** 50 | * \brief performs parallel partial pivoting on a tall-skinny matrix 51 | * 52 | * \param[in,out] A pointer to nb-by-b block in lda-by-b buffer 53 | * \param[in] lda leading dimension of A 54 | * \param[in,out] P pivot matrix which contains the initial index of each row of A 55 | * \param[in] nb number of local rows of A 56 | * \param[in] b number of columns in A 57 | * \param[in] myRank rank in column 58 
| * \param[in] numPes number of processors in column 59 | * \param[in] root root process in column 60 | * \param[in] cdt communicator 61 | */ 62 | void partial_pvt(double * A, 63 | int const lda, 64 | int * P, 65 | int64_t const nb, 66 | int64_t const b, 67 | int const myRank, 68 | int const numPes, 69 | int const root, 70 | CommData_t cdt){ 71 | int i, j, imax, all_imax, npiv; 72 | double dmax, all_dmax; 73 | double * buffer = (double*)malloc(sizeof(double)*b); 74 | double * col = (double*)malloc(sizeof(double)*nb); 75 | int * pivoted_rows = (int*)malloc(sizeof(int)*b); 76 | int * P_start = (int*)malloc(sizeof(int)*nb); 77 | MPI_Status stat; 78 | 79 | TAU_FSTART(partial_pvt_inner); 80 | 81 | memcpy(P_start, P, nb*sizeof(int)); 82 | 83 | npiv = 0; 84 | for (i=0; i= 0) 92 | dmax = fabs(col[imax]); 93 | else 94 | dmax = 0.0; 95 | 96 | MPI_Allreduce(&dmax, &all_dmax, 1, MPI_DOUBLE, MPI_MAX, cdt.cm); 97 | assert(all_dmax!=0.0); 98 | 99 | if (dmax == all_dmax) all_imax = myRank; 100 | else all_imax = -1; 101 | 102 | MPI_Allreduce(MPI_IN_PLACE, &all_imax, 1, MPI_INT, MPI_MAX, cdt.cm); 103 | 104 | if (myRank == all_imax){ 105 | pivoted_rows[npiv] = imax; 106 | npiv++; 107 | lda_cpy(1, b, lda, 1, A+imax, buffer); 108 | 109 | if (myRank != root) 110 | MPI_Sendrecv_replace(P+imax, 1, MPI_INT, root, 2*i, 111 | root, 2*i+1, cdt.cm, &stat); 112 | else { 113 | P[i] = P_start[imax]; 114 | } 115 | col[imax] = 0.0; 116 | 117 | } 118 | if (myRank == root && all_imax != root){ 119 | MPI_Sendrecv_replace(P+i, 1, MPI_INT, all_imax, 2*i+1, 120 | all_imax, 2*i, cdt.cm, &stat); 121 | } 122 | TAU_FSTOP(select_pivot); 123 | TAU_FSTART(update_thin_panel); 124 | MPI_Bcast(buffer, b, MPI_DOUBLE, all_imax, cdt.cm); 125 | 126 | cdscal(nb, 1.0/buffer[i], col, 1); 127 | 128 | cdger(nb, b-i-1, -1.0, 129 | col, 1, 130 | buffer+i+1, 1, 131 | A+(i+1)*lda, lda); 132 | for (j=0; j /* is row major ? 
*/ 61 | void local_tournament(double *A, /* n by b matrix */ 62 | double *R, /* input: n by b buffer of opaque memory 63 | output: b by b best rows; n-b by b buffer of opaque memory*/ 64 | int *P, /* output: b length pivot array */ 65 | int n, 66 | int b, 67 | int lda_A); 68 | 69 | /* Perform tournament pivoting over a ring of processors */ 70 | void tnmt_pvt_1d(double *R, /* input: b by b matrix of my best rows */ 71 | double *R_out, /* out: b by b matrix of best rows */ 72 | int *P_in, /* in: b by 1 global ranks of my best rows */ 73 | double *R_buf, /* out: 2b by b buffer for rows */ 74 | int *P_out, /* 3b by 1 buffer for local best rank calc 75 | out: b by 1 (2b by 1 buffer) global 76 | ranks of best rows in communicator */ 77 | const int b, 78 | const int myRank, 79 | const int pe_start, 80 | const int numPes, 81 | const int root, 82 | CommData cdt); 83 | 84 | 85 | /* Apply a distributed pivot matrix over a column of processors */ 86 | void par_pivot(double *A, /* input: matrix */ 87 | double *buffer,/* buffer space */ 88 | const int npiv, /* number of rows to pivot */ 89 | const int ncol, /* number of columns */ 90 | const int b, /* dimension of small block */ 91 | const int lda_A, /* lda of A */ 92 | const int idx_off,/* local offset of A and P_* from top */ 93 | const int glb_off,/* global offset from top */ 94 | int *P_r, /* old->new row source indices */ 95 | int *P_app, /* permutation matrix to apply */ 96 | /* owned by root only */ 97 | /* 3*npiv length */ 98 | const int myRank, /* rank in processor oclumn comm */ 99 | const int lrRank, /* rank in processor oclumn comm */ 100 | const int numPes, /* num pes in column */ 101 | const int numLrs, /* num of layers */ 102 | const int root, /* rank in processor oclumn comm */ 103 | const int st_blk, /* first block this layer owns */ 104 | const int num_blk,/* number of blocks this layer owns */ 105 | const CommData cdt_col, /* column communiccator */ 106 | const CommData cdt_col_kdir, /* kdir communiccator */ 
107 | int * pvt_buffer=NULL, 108 | int const is_tnmt_pvt=1, 109 | int const nloaded=0);//rows already uploaded (only used in skinyy offload mode 110 | 111 | /* Collect rows of U */ 112 | void pvt_collc(double *A_fw, /* input: next U offset by idx_off*/ 113 | double *A_bw, /* input: matrix A offset by idx_off */ 114 | double *buffer,/* buffer space */ 115 | const int b, /* block size */ 116 | const int nrow, /* number of rows involved */ 117 | const int ncol_fw,/* number of columns after current i_big */ 118 | const int ncol_bw,/* number of columns before current i_big */ 119 | const int lda_A, /* lda of A */ 120 | const int idx_off,/* local offset of A and P_* from top */ 121 | const int glb_off,/* global offset from top */ 122 | int *P_r, /* old->new row source indices */ 123 | int *P_mine,/* The rows I need */ 124 | int *P_buf, /* buffer space */ 125 | const int myRank, /* rank in processor oclumn comm */ 126 | const int numPes, /* num pes in column */ 127 | const CommData cdt); /* column communiccator */ 128 | 129 | 130 | #define local_tournament_row_maj(A,R,P,n,b,lda_A) \ 131 | do { \ 132 | local_tournament<1>(A,R,P,n,b,lda_A); \ 133 | } while (0) 134 | 135 | #define local_tournament_col_maj(A,R,P,n,b,lda_A) \ 136 | do { \ 137 | local_tournament<0>(A,R,P,n,b,lda_A); \ 138 | } while (0) 139 | 140 | 141 | #endif 142 | -------------------------------------------------------------------------------- /alg/MM/Makefile: --------------------------------------------------------------------------------
1 | include ../../config.mk
2 | 
3 | LIB_DIR = ../../lib
4 | 
5 | # These goals are commands, not files produced by this Makefile.
6 | .PHONY: CANMM CANMMrec clean
7 | 
8 | # Build the matrix-multiplication library: run the sub-makes in both
9 | # sub-directories, then archive the objects found there.
10 | CANMM: CANMMrec $(LIB_DIR)/libCANMM.a
11 | 
12 | # Each sub-make runs on its own recipe line so that a failure stops the build
13 | # instead of being masked by the exit status of the last command.
14 | CANMMrec:
15 | 	$(MAKE) CANMM -C splitdim_cannon
16 | 	$(MAKE) CANMM -C topo_pdgemm
17 | 
18 | # The objects come from the sub-directories, hence the dependency on CANMMrec
19 | # and the globs in place of a prerequisite list.
20 | $(LIB_DIR)/libCANMM.a: CANMMrec
21 | 	$(AR) -crs $@ splitdim_cannon/*.o topo_pdgemm/*.o
22 | 
23 | clean:
24 | 	$(MAKE) $@ -C splitdim_cannon
25 | 	$(MAKE) $@ -C topo_pdgemm
26 | 
-------------------------------------------------------------------------------- /alg/MM/charm_splitdim_cannon/Makefile: -------------------------------------------------------------------------------- 1 | CHARM_HOME=/home/edgar/work/charm 2 | OPTS=-g -O0 3 | CHARMC=$(CHARM_HOME)/bin/charmc $(OPTS) 4 | 5 | MATH_LIB = -lesslbg -lesslsmpbg -lxlf90_r \ 6 | -lmass -lmassv -lxlfmath -lxlomp_ser -lxlsmp -lpthread 7 | 8 | LOCAL_LIBS = -L/soft/apps/LAPACK \ 9 | -L/opt/ibmcmp/xlf/bg/11.1/bglib \ 10 | -L/opt/ibmcmp/xlsmp/bg/1.7/bglib \ 11 | -L/bgsys/ibm_essl/sles10/prod/opt/ibmmath/lib \ 12 | -L/bgsys/drivers/ppcfloor/gnu-linux/powerpc-bgp-linux/lib \ 13 | -L/gpfs/home/bohm/zlib/lib 14 | 15 | #LIBS = $(MATH_LIB) $(LOCAL_LIBS) 16 | LIBS = -lblas -llapack 17 | 18 | #INCLUDE = -I/bgsys/ibm_essl/sles10/prod/opt/ibmmath/include 19 | 20 | 21 | OBJS = vspcannon.o 22 | 23 | all: bench_vspc test_vspc 24 | 25 | bench_vspc: $(OBJS) bench_vspc.o 26 | $(CHARMC) -language charm++ $(LIBS) -o bench_vspc $(OBJS) bench_vspc.o 27 | 28 | test_vspc: $(OBJS) test_vspc.o 29 | $(CHARMC) -language charm++ $(LIBS) -o test_vspc $(OBJS) test_vspc.o 30 | 31 | projections: $(OBJS) bench_vspc.o 32 | $(CHARMC) -language charm++ $(LIBS) -tracemode projections -lz -o bench_vspc.prj $(OBJS) bench_vspc.o 33 | 34 | summary: $(OBJS) bench_vspc.o 35 | $(CHARMC) -language charm++ $(LIBS) -tracemode summary -lz -o bench_vspc.sum $(OBJS) bench_vspc.o 36 | 37 | vspcannon.decl.h: vspcannon.ci 38 | $(CHARMC) vspcannon.ci 39 | 40 | 41 | bench_vspc.o: run_vspc.cxx vspcannon.h vspcannon.decl.h spcannon_internal.h 42 | $(CHARMC) -c $(INCLUDE) run_vspc.cxx -o bench_vspc.o -DBENCH 43 | 44 | test_vspc.o: run_vspc.cxx vspcannon.h vspcannon.decl.h spcannon_internal.h 45 | $(CHARMC) -c $(INCLUDE) run_vspc.cxx -o test_vspc.o -DTEST 46 | 47 | vspcannon.o: vspcannon.cxx vspcannon.h vspcannon.decl.h spcannon_internal.h 48 | $(CHARMC) -c $(INCLUDE) vspcannon.cxx 49 | 50 | clean: 51 | rm -f *.decl.h *.def.h conv-host *.o bench_vspc 
test_vspc bench_vspc.prj charmrun *~ 52 | 53 | -------------------------------------------------------------------------------- /alg/MM/charm_splitdim_cannon/spcannon_internal.h: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2011, Edgar Solomonik> 2 | * All rights reserved. 3 | * 4 | * Redistribution and use in source and binary forms, with or without 5 | * modification, are permitted provided that the following 6 | * conditions are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 12 | * 13 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 14 | * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 15 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 16 | * ARE DISCLAIMED. IN NO EVENT SHALL EDGAR SOLOMONIK BE LIABLE FOR ANY 17 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 18 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 19 | * SERVICES LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 20 | * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 21 | * LIABILITY, OR TORT(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 22 | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 23 | * SUCH DAMAGE. 
*/ 24 | 25 | #ifndef __SPCANNON_INTERNAL_H__ 26 | #define __SPCANNON_INTERNAL_H__ 27 | 28 | #include 29 | #include 30 | #include 31 | 32 | #define ALIGN 16 33 | #define MALLOC(s) malloc(s) 34 | #define DGEMM cdgemm 35 | #define BLAS_DGEMM dgemm_ 36 | #define TRANSPOSE naive_transp 37 | #ifndef WRAP 38 | #define WRAP(a,b) ((a + b)%b) 39 | #endif 40 | 41 | 42 | extern "C" 43 | void dgemm_(const char *, const char *, 44 | const int *, const int *, 45 | const int *, const double *, 46 | const double *, const int *, 47 | const double *, const int *, 48 | const double *, double *, 49 | const int *); 50 | 51 | inline 52 | void DGEMM(const char transa, const char transb, 53 | const int m, const int n, 54 | const int k, const double a, 55 | const double * A, const int lda, 56 | const double * B, const int ldb, 57 | const double b, double * C, 58 | const int ldc){ 59 | BLAS_DGEMM(&transa, &transb, &m, &n, &k, &a, A, 60 | &lda, B, &ldb, &b, C, &ldc); 61 | 62 | } 63 | 64 | inline 65 | void TRANSPOSE(int const lda_fr, int const lda_to, double * A, double * buf){ 66 | int i,j; 67 | for (i=0; i 2 | * All rights reserved. 3 | * 4 | * Redistribution and use in source and binary forms, with or without 5 | * modification, are permitted provided that the following 6 | * conditions are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 12 | * 13 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 14 | * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 15 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 16 | * ARE DISCLAIMED. 
IN NO EVENT SHALL EDGAR SOLOMONIK BE LIABLE FOR ANY 17 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 18 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 19 | * SERVICES LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 20 | * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 21 | * LIABILITY, OR TORT(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 22 | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 23 | * SUCH DAMAGE. */ 24 | 25 | #ifndef __VSPCANNON_H__ 26 | #define __VSPCANNON_H__ 27 | 28 | #include 29 | 30 | #ifdef TAU 31 | #include 32 | #define TAU_FSTART(ARG) \ 33 | TAU_PROFILE_TIMER(timer##ARG, #ARG, "", TAU_USER); \ 34 | TAU_PROFILE_START(timer##ARG) 35 | 36 | #define TAU_FSTOP(ARG) \ 37 | TAU_PROFILE_STOP(timer##ARG) 38 | 39 | #else 40 | #define TAU_PROFILE(NAME,ARG,USER) 41 | #define TAU_PROFILE_TIMER(ARG1, ARG2, ARG3, ARG4) 42 | #define TAU_PROFILE_STOP(ARG) 43 | #define TAU_PROFILE_START(ARG) 44 | #define TAU_FSTART(ARG) 45 | #define TAU_FSTOP(ARG) 46 | #endif 47 | class ShftMsg : public CMessage_ShftMsg { 48 | public: 49 | double * data; 50 | int dim; 51 | int level; 52 | int pidx; 53 | }; 54 | 55 | class StgrMsg : public CMessage_StgrMsg { 56 | public: 57 | double * data; 58 | int dim; 59 | int level; 60 | }; 61 | 62 | class Main : public CBase_Main { 63 | public: 64 | int * dim_len; 65 | int np, iter, niter, nwarm, warmup, witer; 66 | int n, m, k; 67 | double st, end; 68 | double alpha, beta; 69 | 70 | Main(CkArgMsg* m); 71 | void done(); 72 | void run_spc(); 73 | void reduceC(CkReductionMsg * msg); 74 | }; 75 | 76 | class Mapper : public CBase_Mapper { 77 | public: 78 | int np; 79 | int *mapping; 80 | 81 | Mapper(int ndim, int kary, int * dim_len); 82 | ~Mapper(); 83 | int procNum(int, const CkArrayIndex &idx); 84 | }; 85 | 86 | 87 | class VPblock : public CBase_VPblock { 88 | public: 89 | int n, m, k; 90 | double alpha, beta; 91 | 
int pidx, stgr_set, shft_set, level, nmsg; 92 | 93 | int * stgr_acc_table; 94 | int * shft_acc_table; 95 | int * shft_pidx; 96 | 97 | std::vector< ShftMsg* > shft_queue_A; 98 | std::vector< ShftMsg* > shft_queue_B; 99 | std::vector< StgrMsg* > stgr_queue_A; 100 | std::vector< StgrMsg* > stgr_queue_B; 101 | 102 | double *A, *B, *C; 103 | 104 | VPblock(); 105 | VPblock(CkMigrateMessage *msg); 106 | 107 | void init_sindex(int nb, int mb, int kb); 108 | void init_rand(int n, int m, int k); 109 | void contract(double alpha, double beta); 110 | void staggerA(StgrMsg * msg); 111 | void staggerB(StgrMsg * msg); 112 | void stagger(); 113 | void shiftA(ShftMsg * msg); 114 | void shiftB(ShftMsg * msg); 115 | void loc_shiftA(int im); 116 | void loc_shiftB(int im); 117 | void loc_staggerA(int im); 118 | void loc_staggerB(int im); 119 | void shift(); 120 | void start_shift(); 121 | void gatherC(); 122 | }; 123 | 124 | #endif// __VSPCANNON_H__ 125 | 126 | -------------------------------------------------------------------------------- /alg/MM/splitdim_cannon/Makefile: -------------------------------------------------------------------------------- 1 | include ../../../config.mk 2 | 3 | LIB_DIR = ../../../lib 4 | 5 | CANMM: $(LIB_DIR)/libCANMM.a 6 | 7 | $(LIB_DIR)/libCANMM.a: spcannon.o 8 | 9 | spcannon.o: spcannon.cxx spcannon.h spcannon_internal.h 10 | $(CXX) -c spcannon.cxx -o spcannon.o $(CXXFLAGS) $(DEFS) 11 | 12 | clean: 13 | rm -f *.o 14 | -------------------------------------------------------------------------------- /alg/MM/splitdim_cannon/spcannon.h: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2011, Edgar Solomonik> 2 | * All rights reserved. 
3 | * 4 | * Redistribution and use in source and binary forms, with or without 5 | * modification, are permitted provided that the following 6 | * conditions are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 12 | * 13 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 14 | * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 15 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 16 | * ARE DISCLAIMED. IN NO EVENT SHALL EDGAR SOLOMONIK BE LIABLE FOR ANY 17 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 18 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 19 | * SERVICES LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 20 | * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 21 | * LIABILITY, OR TORT(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 22 | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 23 | * SUCH DAMAGE. 
*/ 24 | 25 | #ifndef __SPCANNON_H__ 26 | #define __SPCANNON_H__ 27 | 28 | #include "mpi.h" 29 | 30 | 31 | void kput_cannon(int const rank, 32 | int const kary, 33 | int const ndim, 34 | MPI_Comm const comm, 35 | int const n, 36 | int const m, 37 | int const k, 38 | char const transp_A, 39 | double const alpha, 40 | double * A, 41 | char const transp_B, 42 | double const beta, 43 | double * B, 44 | double * C); 45 | 46 | void kuni_cannon(int const rank, 47 | int const kary, 48 | int const ndim, 49 | MPI_Comm const comm, 50 | int const n, 51 | int const m, 52 | int const k, 53 | char const transp_A, 54 | double const alpha, 55 | double * A, 56 | char const transp_B, 57 | double const beta, 58 | double * B, 59 | double * C); 60 | 61 | 62 | #endif// __SPCANNON_H__ 63 | 64 | -------------------------------------------------------------------------------- /alg/MM/splitdim_cannon/spcannon_internal.h: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2011, Edgar Solomonik> 2 | * All rights reserved. 3 | * 4 | * Redistribution and use in source and binary forms, with or without 5 | * modification, are permitted provided that the following 6 | * conditions are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 12 | * 13 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 14 | * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 15 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 16 | * ARE DISCLAIMED. 
IN NO EVENT SHALL EDGAR SOLOMONIK BE LIABLE FOR ANY 17 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 18 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 19 | * SERVICES LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 20 | * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 21 | * LIABILITY, OR TORT(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 22 | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 23 | * SUCH DAMAGE. */ 24 | 25 | #ifndef __SPCANNON_INTERNAL_H__ 26 | #define __SPCANNON_INTERNAL_H__ 27 | 28 | #include 29 | #include 30 | #include 31 | 32 | #define MALLOC(s) malloc(s) 33 | #define DGEMM cdgemm 34 | #define BLAS_DGEMM dgemm_ 35 | #define TRANSPOSE naive_transp 36 | #ifndef WRAP 37 | #define WRAP(a,b) ((a + b)%b) 38 | #endif 39 | 40 | 41 | extern "C" 42 | void dgemm_(const char *, const char *, 43 | const int *, const int *, 44 | const int *, const double *, 45 | const double *, const int *, 46 | const double *, const int *, 47 | const double *, double *, 48 | const int *); 49 | 50 | inline 51 | void DGEMM(const char transa, const char transb, 52 | const int m, const int n, 53 | const int k, const double a, 54 | const double * A, const int lda, 55 | const double * B, const int ldb, 56 | const double b, double * C, 57 | const int ldc){ 58 | BLAS_DGEMM(&transa, &transb, &m, &n, &k, &a, A, 59 | &lda, B, &ldb, &b, C, &ldc); 60 | 61 | } 62 | 63 | inline 64 | void TRANSPOSE(int const lda_fr, int const lda_to, double * A, double * buf){ 65 | int i,j; 66 | for (i=0; i 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include "topo_pdgemm_algs.h" 10 | #include "../../shared/util.h" 11 | 12 | #ifndef ASSERT 13 | #define ASSERT(...) 
\ 14 | do{ \ 15 | assert(__VA_ARGS__); \ 16 | } while (0) 17 | #endif 18 | 19 | /* returns the number of bytes of buffer space 20 | we need */ 21 | static 22 | int64_t buffer_space_req(int64_t b){ 23 | return 4*b*b*sizeof(double); 24 | } 25 | 26 | void summa(ctb_args_t const * args, 27 | double const * mat_A, 28 | double const * mat_B, 29 | double * mat_C, 30 | double * buffer, 31 | CommData_t cdt_row, 32 | CommData_t cdt_col){ 33 | int64_t i; 34 | 35 | const int np_row = cdt_col.np; 36 | const int np_col = cdt_row.np; 37 | const int my_row = cdt_col.rank; 38 | const int my_col = cdt_row.rank; 39 | 40 | const int64_t n = args->n; 41 | const int64_t b = n / np_col; 42 | 43 | /* make sure we have enough buffer space */ 44 | ASSERT(args->buffer_size >= buffer_space_req(b)); 45 | ASSERT(np_row == np_col); 46 | ASSERT(n % np_row == 0); 47 | 48 | double * loc_A = buffer; 49 | double * loc_B = loc_A+b*b; 50 | double * buf_A = loc_B+b*b; 51 | double * buf_B = buf_A+b*b; 52 | 53 | lda_cpy(b,b,args->lda_A,b,mat_A,loc_A); 54 | lda_cpy(b,b,args->lda_B,b,mat_B,loc_B); 55 | 56 | MPI_Request req1, req2; 57 | 58 | TAU_FSTART(d2_topo_bcast_gemm); 59 | for (i=0; itrans_A, args->trans_B, 97 | b, b, b, 1.0, buf_A, b, buf_B, b, (i>0)*1.0, mat_C, args->lda_C); 98 | 99 | } 100 | TAU_FSTOP(d2_topo_bcast_gemm); 101 | } 102 | 103 | 104 | 105 | 106 | 107 | 108 | -------------------------------------------------------------------------------- /alg/MM/topo_pdgemm/topo_pdgemm_algs.h: -------------------------------------------------------------------------------- 1 | #ifndef __TOPO_PDGEMM_ALGS_H__ 2 | #define __TOPO_PDGEMM_ALGS_H__ 3 | 4 | #include "../../shared/comm.h" 5 | 6 | typedef struct ctb_args { 7 | char trans_A; 8 | char trans_B; 9 | int64_t n; 10 | int64_t lda_A; 11 | int64_t lda_B; 12 | int64_t lda_C; 13 | int64_t buffer_size; 14 | int ovp; 15 | } ctb_args_t; 16 | 17 | void summa(ctb_args_t const * args, 18 | double const * mat_A, 19 | double const * mat_B, 20 | double * mat_C, 21 | 
double * buffer, 22 | CommData_t cdt_row, 23 | CommData_t cdt_col); 24 | 25 | void d25_summa(ctb_args_t const * args, 26 | double * mat_A, 27 | double * mat_B, 28 | double * mat_C, 29 | double * buffer, 30 | #ifdef USE_MIC 31 | int mic_portion, 32 | int mic_id, 33 | #endif 34 | CommData_t cdt_row, 35 | CommData_t cdt_col, 36 | CommData_t cdt_kdir); 37 | 38 | void d25_summa_ovp(ctb_args_t const * args, 39 | double * mat_A, 40 | double * mat_B, 41 | double * mat_C, 42 | double * buffer, 43 | #ifdef USE_MIC 44 | int mic_portion, 45 | int mic_id, 46 | #endif 47 | CommData_t cdt_row, 48 | CommData_t cdt_col, 49 | CommData_t cdt_kdir); 50 | 51 | void bcast_cannon_4d(ctb_args_t const * args, 52 | double * mat_A, 53 | double * mat_B, 54 | double * mat_C, 55 | double * buffer, 56 | CommData_t cdt_x1, 57 | CommData_t cdt_y1, 58 | CommData_t cdt_x2, 59 | CommData_t cdt_y2); 60 | 61 | 62 | #endif 63 | 64 | 65 | -------------------------------------------------------------------------------- /alg/Makefile: -------------------------------------------------------------------------------- 1 | include ../config.mk 2 | 3 | LIB_DIR = ../lib 4 | 5 | CANDMC: $(LIB_DIR)/libCANDMC.a 6 | 7 | $(LIB_DIR)/libCANDMC.a: $(LIB_DIR)/libCANQR.a $(LIB_DIR)/libCANLU.a \ 8 | $(LIB_DIR)/libCANMM.a $(LIB_DIR)/libCANShared.a $(LIB_DIR)/libCANSE.a 9 | ifneq (,$(findstring DLAPACKHASTSQR=1,$(DEFS))) 10 | $(AR) -x $(LIB_DIR)/libCANSE.a; \ 11 | $(AR) -x $(LIB_DIR)/libCANQR.a; \ 12 | $(AR) -x $(LIB_DIR)/libCANMM.a; \ 13 | $(AR) -x $(LIB_DIR)/libCANShared.a; \ 14 | $(AR) -crs $(LIB_DIR)/libCANDMC.a *.o; rm *.o; 15 | else 16 | $(AR) -x $(LIB_DIR)/libCANMM.a; $(AR) -x $(LIB_DIR)/libCANLU.a; \ 17 | $(AR) -x $(LIB_DIR)/libCANShared.a; \ 18 | $(AR) -crs $(LIB_DIR)/libCANDMC.a *.o; rm *.o; 19 | endif 20 | 21 | $(LIB_DIR)/libCANQR.a: $(LIB_DIR)/libCANShared.a CANQR 22 | CANQR: 23 | ifneq (,$(findstring DLAPACKHASTSQR=1,$(DEFS))) 24 | $(MAKE) $@ -C QR; 25 | else 26 | $(AR) -crs $(LIB_DIR)/libCANQR.a; 27 | endif 28 | 
29 | $(LIB_DIR)/libCANSE.a: $(LIB_DIR)/libCANQR.a CANSE
30 | CANSE:
31 | 	$(MAKE) $@ -C SE
32 | 
33 | $(LIB_DIR)/libCANMM.a: $(LIB_DIR)/libCANShared.a CANMM
34 | CANMM:
35 | 	$(MAKE) $@ -C MM
36 | 
37 | $(LIB_DIR)/libCANShared.a: CANShared
38 | CANShared:
39 | 	$(MAKE) $@ -C shared
40 | 
41 | $(LIB_DIR)/libCANLU.a: $(LIB_DIR)/libCANShared.a CANLU
42 | CANLU:
43 | 	$(MAKE) $@ -C LU
44 | 
45 | # One recipe line per directory: a failing sub-make now fails `make clean`
46 | # instead of being masked by the exit status of the last command in a `;` chain.
47 | clean:
48 | 	$(MAKE) $@ -C MM
49 | 	$(MAKE) $@ -C LU
50 | 	$(MAKE) $@ -C QR
51 | 	$(MAKE) $@ -C SE
52 | 	$(MAKE) $@ -C shared
53 | 
54 | # The CAN* names are commands that delegate to the Makefile in the matching
55 | # subdirectory, not files; declare them phony so a stray file cannot mask them.
56 | .PHONY: CANDMC CANQR CANSE CANMM CANShared CANLU clean
57 | 
--------------------------------------------------------------------------------
/alg/QR/Makefile:
--------------------------------------------------------------------------------
1 | include ../../config.mk
2 | 
3 | LIB_DIR := ../../lib
4 | 
5 | # Build the objects in every QR subdirectory, then archive them.
6 | CANQR: CANQRrec
7 | 	$(MAKE) $(LIB_DIR)/libCANQR.a
8 | 
9 | # Sub-makes run in the same order as before (tsqr, hh_recon, qr_2d) and the
10 | # first failure aborts the build. The former nested `if ...; then ...; fi`
11 | # exited 0 when tsqr or hh_recon failed, so the error was silently dropped.
12 | CANQRrec:
13 | 	$(MAKE) CANQR -C tsqr
14 | 	$(MAKE) CANQR -C hh_recon
15 | 	$(MAKE) CANQR -C qr_2d
16 | 
17 | # Re-archive whenever anything in the three subdirectories changes.
18 | $(LIB_DIR)/libCANQR.a: tsqr/* hh_recon/* qr_2d/*
19 | 	$(AR) -crs $@ tsqr/*.o hh_recon/*.o qr_2d/*.o
20 | 
21 | clean:
22 | 	$(MAKE) $@ -C tsqr
23 | 	$(MAKE) $@ -C hh_recon
24 | 	$(MAKE) $@ -C qr_2d
25 | 
26 | .PHONY: CANQR CANQRrec clean
27 | 
--------------------------------------------------------------------------------
/alg/QR/hh_recon/Makefile:
--------------------------------------------------------------------------------
1 | include ../../../config.mk
2 | 
3 | LIB_DIR := ../../../lib
4 | 
5 | CANQR: $(LIB_DIR)/libCANQR.a
6 | 
7 | # No recipe here: this directory only builds the objects; the archive
8 | # command is the libCANQR.a rule in ../Makefile.
9 | $(LIB_DIR)/libCANQR.a: hh_recon.o yamamoto.o
10 | 
11 | # Each object is rebuilt when its own source or header changes.
12 | hh_recon.o yamamoto.o: %.o: %.cxx %.h
13 | 	$(CXX) -c $< -o $@ $(CXXFLAGS) $(DEFS)
14 | 
15 | clean:
16 | 	$(RM) *.o
17 | 
18 | .PHONY: CANQR clean
19 | 
--------------------------------------------------------------------------------
/alg/QR/hh_recon/hh_recon.h:
-------------------------------------------------------------------------------- 1 | #ifndef __HH_RECON_H__ 2 | #define __HH_RECON_H__ 3 | 4 | #include "../../shared/comm.h" 5 | 6 | /** 7 | * \brief perform sequential b-by-b 8 | * TRSM to compute invT from W matrix (output of hh_recon QR) 9 | * \param[in] W b-by-b triangular factor -T*Y1' 10 | * \param[in] b dimension of W and T 11 | * \param[in,out] invT preallocated space for T^-1 12 | */ 13 | void compute_invT_from_W(double const * W, 14 | int64_t b, 15 | double * invT); 16 | 17 | /** 18 | * \brief LU factorization of a b-by-b matrix that also applies signs to R (NOTE(review): the earlier brief was copied from the TSQR docs; presumably this is the non-pivoted signed LU, confirm against hh_recon.cxx) 19 | * 20 | * \param[in,out] A b-by-b dense square matrix, L\U on output 21 | * \param[in,out] R b-by-b upper-triangular matrix, gets multiplied by signs 22 | * \param[in] b number of rows/columns in A 23 | * \param[in] lda_A leading dimension (number of buffer rows) in A 24 | * \param[in] lda_R leading dimension (number of buffer rows) in R 25 | * \param[out] signs signs of R, filled if not NULL 26 | **/ 27 | void signed_NLU(double * A, 28 | double * R, 29 | int64_t b, 30 | int64_t lda_A, 31 | int64_t lda_R, 32 | int64_t * signs); 33 | 34 | /** 35 | * \brief Recursive counterpart of signed_NLU (NOTE(review): inferred from the name and the matching parameter list; the earlier brief was copied from the TSQR docs, confirm against hh_recon.cxx) 36 | * 37 | * \param[in,out] A b-by-b dense square matrix, L\U on output 38 | * \param[in,out] R b-by-b upper-triangular matrix, gets multiplied by signs 39 | * \param[in] b number of rows/columns in A 40 | * \param[in] lda_A leading dimension (number of buffer rows) in A 41 | * \param[in] lda_R leading dimension (number of buffer rows) in R 42 | * \param[in] signs signs of R rows to use 43 | **/ 44 | void recursive_NLU( double * A, 45 | double * R, 46 | int64_t b, 47 | int64_t lda_A, 48 | int64_t lda_R, 49 | int64_t * signs); 50 | 51 | /** 52 | * \brief Perform TSQR and reconstruct YT on a (sub)-column of processors 53 | * 54 | * \param[in,out] A m-by-b dense tall-skinny matrix, Y\R on output 55 | * \param[in] lda_A lda of A 56 | * \param[in] m number of rows in A 57 | * \param[in]
b number of columns in A 58 | * \param[in,out] W b-by-b upper triangular matrix -T*Y1' 59 | * \param[in,out] preallocated buffer for upper triangular W matrix 60 | * \param[in] myRank rank in communicator column 61 | * \param[in] numPes number of processes in column 62 | * \param[in] root the root of the tree (who will own R at the end) 63 | * \param[in] req_id request id to use for send/recv 64 | * \param[in] cdt MPI communicator for column 65 | **/ 66 | void hh_recon_qr(double * A, 67 | int64_t lda_A, 68 | int64_t m, 69 | int64_t b, 70 | double * W, 71 | int64_t myRank, 72 | int64_t numPes, 73 | int64_t root, 74 | int64_t req_id, 75 | CommData_t cdt); 76 | #endif 77 | -------------------------------------------------------------------------------- /alg/QR/hh_recon/yamamoto.cxx: -------------------------------------------------------------------------------- 1 | /* Copyright (c) Edgar Solomonik 2015, all rights reserved. This code is part of the CANDMC repository, protected under a two-clause BSD license. 
*/ 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include "../../shared/util.h" 9 | #include "../tsqr/bitree_tsqr.h" 10 | #include "../tsqr/butterfly_tsqr.h" 11 | #include "yamamoto.h" 12 | 13 | #define BSIZE 64 14 | 15 | /** 16 | * \brief Perform signed LU 17 | * 18 | * \param[in,out] A dense lda_A-by-b matrix, 19 | * on output the top b-by-b matrix contains L and U factors of itself minus S 20 | * \param[in,out] sign matrix S 21 | * \param[in] b number of rows/columns in A 22 | * \param[in] lda_A leading dimension (number of buffer rows) in A 23 | **/ 24 | void signed_YLU(double * A, 25 | int64_t b, 26 | int64_t lda_A, 27 | int * signs){ 28 | int64_t info, i, j, pos; 29 | 30 | for (i=0; i 0) 32 | signs[i] = -1; 33 | else 34 | signs[i] = 1; 35 | //assert(signs[i]==1); 36 | A[i*lda_A+i] = A[i*lda_A+i] - signs[i]; 37 | //A[i*lda_A+i] = 1 - signs[i]*A[i*lda_A+i]; 38 | 39 | for (j=1; j (myRank + numPes - root) % numPes) mb+=b; 116 | 117 | R = (double*)malloc(sizeof(double)*b*b); 118 | tau = (double*)malloc(sizeof(double)*b); 119 | tree_data = (double*)malloc(sizeof(double)*2*b*b*(log(numPes)+2)); 120 | 121 | 122 | TAU_FSTART(TSQR); 123 | #ifdef BUTTERFLY_QR 124 | butterfly_tsqr(A, lda_A, tau, m, b, myRank, numPes, root, req_id, cdt, tree_data); 125 | #else 126 | bitree_tsqr(A, lda_A, R, tau, m, b, myRank, numPes, root, req_id, cdt, 1, tree_data); 127 | if (myRank == root) 128 | copy_upper(R, A, b, b, lda_A, 0); 129 | #endif 130 | MPI_Barrier(cdt.cm); 131 | TAU_FSTOP(TSQR); 132 | 133 | MPI_Barrier(cdt.cm); 134 | TAU_FSTART(Construct_Q1); 135 | #ifdef BUTTERFLY_QR 136 | butterfly_construct_Q1(A, lda_A, tau, Qm, lda_Qm, m, b, myRank, numPes, root, cdt, tree_data); 137 | //construct_Q1(A, lda_A, tau, Q1, mb, m, b, b, myRank, numPes, root, cdt, tree_data); 138 | #else 139 | construct_Q1(A, lda_A, tau, Qm, lda_Qm, m, b, b, myRank, numPes, root, cdt, tree_data); 140 | #endif 141 | 142 | MPI_Barrier(cdt.cm); 143 | TAU_FSTOP(Construct_Q1); 144 | if 
(myRank == root){ 145 | TAU_FSTART(LU_of_Q1_minus_I); 146 | int * signs = (int*)malloc(sizeof(int)*b); 147 | int * pivs = (int*)malloc(sizeof(int)*b); 148 | double * wk = (double*)malloc(sizeof(double)*4*b); 149 | //compute LU(Q-S) 150 | lda_cpy(b, b, lda_Qm, b, Qm, W); 151 | recursive_YLU(W, b, b, signs); 152 | for (int i=0; i 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include "../../shared/util.h" 9 | #include "../tsqr/bitree_tsqr.h" 10 | #include "../tsqr/butterfly_tsqr.h" 11 | #include "../hh_recon/hh_recon.h" 12 | 13 | //#define USE_BINARY_TREE 14 | 15 | /** 16 | * \param[in,out] A m-by-k dense matrix on input, YR where (I-YTY^T)R 17 | * \param[in] lda_A lda of A 18 | * \param[in] m number of rows in A 19 | * \param[in] k number of columns in A 20 | * \param[in] b number of Householder vectors 21 | * \param[in] myRank my global processorrank 22 | * \param[in] numPes number of processors 23 | * \param[in] root_row current row root 24 | * \param[in] root_col current column root 25 | * \param[in] cdt_row MPI communicator for row 26 | * \param[in] cdt_col MPI communicator for column 27 | * \param[in] cdt_world MPI communicator for world 28 | * \param[in] stop_at if negative ignored, if positive, stop QR after this many 29 | * cols/rows 30 | **/ 31 | void QR_butterfly_2D( double * A, 32 | int64_t const lda_A, 33 | int64_t const m, 34 | int64_t const k, 35 | int64_t const b, 36 | int64_t const myRank, 37 | int64_t const numPes, 38 | int64_t const root_row, 39 | int64_t const root_col, 40 | CommData_t cdt_row, 41 | CommData_t cdt_col, 42 | CommData_t cdt_world, 43 | int64_t const _stop_at){ 44 | int64_t i, j, pe_st_new, move_ptr; 45 | double * R, * Y, * tau, * tree_data; 46 | int64_t mb = (m+root_row*b)/cdt_col.np; 47 | if (cdt_col.rank < root_row) mb-=b; 48 | int64_t kb = (k+root_col*b)/cdt_row.np; 49 | if (cdt_row.rank <= root_col) kb-=b; 50 | int64_t tdsz = (psz_upr(b) + MIN(b,TAU_BLK)*b)*(log2(cdt_col.np)+2); 51 | 52 | int64_t stop_at; 53 | if 
(_stop_at >= 0){ 54 | stop_at = MIN(MIN(m,k),_stop_at); 55 | } else { 56 | stop_at = MIN(m,k); 57 | } 58 | 59 | R = (double*)malloc(sizeof(double)*b*b); 60 | Y = (double*)malloc(sizeof(double)*mb*b); 61 | tau = (double*)malloc(sizeof(double)*b); 62 | tree_data = (double*)malloc(sizeof(double)*tdsz); 63 | /* TSQR on block column */ 64 | TAU_FSTART(TSQR); 65 | if (cdt_row.rank == root_col){ 66 | #ifndef USE_BINARY_TREE 67 | if ( kb < cdt_col.np){ 68 | //if (cdt_col.rank == 0) printf("Binary tree\n"); 69 | bitree_tsqr(A, lda_A, R, tau, m, MIN(b,stop_at), cdt_col.rank, cdt_col.np, 70 | root_row, 0, cdt_col, 1, tree_data); 71 | if (cdt_col.rank == root_row) 72 | copy_upper(R, A, b, b, lda_A, 0); 73 | } else { 74 | //if (cdt_col.rank == 0) printf("Butterfly tree\n"); 75 | butterfly_tsqr(A, lda_A, tau, m, MIN(b,stop_at), cdt_col.rank, cdt_col.np, 76 | root_row, 13, cdt_col, tree_data); 77 | } 78 | #else 79 | bitree_tsqr(A, lda_A, R, tau, m, MIN(b,stop_at), cdt_col.rank, cdt_col.np, 80 | root_row, 0, cdt_col, 1, tree_data); 81 | if (cdt_col.rank == root_row) 82 | copy_upper(R, A, b, b, lda_A, 0); 83 | #endif 84 | } 85 | /* Iterate over panels */ 86 | if (m-b>0 || k-b>0){ 87 | if (cdt_row.rank == root_col) 88 | lda_cpy(mb, b, lda_A, mb, A, Y); 89 | MPI_Barrier(cdt_world.cm); 90 | TAU_FSTOP(TSQR); 91 | TAU_FSTART(Bcast_update); 92 | MPI_Bcast(Y, mb*b, MPI_DOUBLE, root_col, cdt_row.cm); 93 | MPI_Bcast(tree_data, tdsz, MPI_DOUBLE, root_col, cdt_row.cm); 94 | MPI_Bcast(tau, b, MPI_DOUBLE, root_col, cdt_row.cm); 95 | TAU_FSTOP(Bcast_update); 96 | move_ptr = 0; 97 | /* Update 2D distributed matrix */ 98 | /* if (myRank == 0) printf("before apply Q:\n"); 99 | double * A_ptr = A; 100 | for (int rr=0; rr 0){ 114 | #ifndef USE_BINARY_TREE 115 | if ( kb < cdt_col.np) 116 | apply_tsqr_QT(Y, mb, tau, A+move_ptr, lda_A, m, b, kb, cdt_col.rank, cdt_col.np, root_row, cdt_col, tree_data); 117 | else 118 | apply_butterfly_tsqr_QT(Y, mb, tau, A+move_ptr, lda_A, m, b, kb, cdt_col.rank, 
cdt_col.np, root_row, cdt_col, tree_data); 119 | #else 120 | apply_tsqr_QT(Y, mb, tau, A+move_ptr, lda_A, m, b, kb, cdt_col.rank, cdt_col.np, root_row, cdt_col, tree_data); 121 | #endif 122 | } 123 | /* if (myRank == 0) printf("after apply Q:\n"); 124 | A_ptr = A; 125 | for (int rr=0; rr 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include "../../shared/util.h" 9 | #include "../tsqr/bitree_tsqr.h" 10 | #include "../hh_recon/hh_recon.h" 11 | 12 | 13 | /** 14 | * \param[in,out] A m-by-k dense matrix on input, YR where (I-YTY^T)R 15 | * \param[in] lda_A lda of A 16 | * \param[in] m number of rows in A 17 | * \param[in] k number of columns in A 18 | * \param[in] b number of Householder vectors 19 | * \param[in] myRank my global processorrank 20 | * \param[in] numPes number of processors 21 | * \param[in] root_row current row root 22 | * \param[in] root_col current column root 23 | * \param[in] cdt_row MPI communicator for row 24 | * \param[in] cdt_col MPI communicator for column 25 | * \param[in] cdt_world MPI communicator for world 26 | * \param[in] stop_at if negative ignored, if positive, stop QR after this many 27 | * cols/rows 28 | **/ 29 | void QR_tree_2D(double * A, 30 | int64_t const lda_A, 31 | int64_t const m, 32 | int64_t const k, 33 | int64_t const b, 34 | int64_t const myRank, 35 | int64_t const numPes, 36 | int64_t const root_row, 37 | int64_t const root_col, 38 | CommData_t cdt_row, 39 | CommData_t cdt_col, 40 | CommData_t cdt_world, 41 | int64_t const _stop_at){ 42 | int64_t i, j, pe_st_new, move_ptr; 43 | double * R, * Y, * tau, * tree_data; 44 | int64_t mb = (m+root_row*b)/cdt_col.np; 45 | if (cdt_col.rank < root_row) mb-=b; 46 | int64_t kb = (k+root_col*b)/cdt_row.np; 47 | if (cdt_row.rank <= root_col) kb-=b; 48 | int64_t tdsz = (psz_upr(b) + MIN(b,TAU_BLK)*b)*(log2(cdt_col.np)+2); 49 | 50 | int64_t stop_at; 51 | if (_stop_at >= 0){ 52 | stop_at = MIN(MIN(m,k),_stop_at); 53 | } else { 54 | stop_at = MIN(m,k); 55 | } 56 | 57 | R = 
(double*)malloc(sizeof(double)*b*b); 58 | Y = (double*)malloc(sizeof(double)*mb*b); 59 | tau = (double*)malloc(sizeof(double)*b); 60 | tree_data = (double*)malloc(sizeof(double)*tdsz); 61 | /* TSQR on block column */ 62 | TAU_FSTART(TSQR); 63 | if (cdt_row.rank == root_col){ 64 | bitree_tsqr(A, lda_A, R, tau, m, MIN(b,stop_at), cdt_col.rank, cdt_col.np, 65 | root_row, 0, cdt_col, 1, tree_data); 66 | if (cdt_col.rank == root_row) 67 | copy_upper(R, A, b, b, lda_A, 0); 68 | } 69 | /* Iterate over panels */ 70 | if (m-b>0 || k-b>0){ 71 | if (cdt_row.rank == root_col) 72 | lda_cpy(mb, b, lda_A, mb, A, Y); 73 | MPI_Barrier(cdt_world.cm); 74 | TAU_FSTOP(TSQR); 75 | TAU_FSTART(Bcast_update); 76 | MPI_Bcast(Y, mb*b, MPI_DOUBLE, root_col, cdt_row.cm); 77 | MPI_Bcast(tree_data, tdsz, MPI_DOUBLE, root_col, cdt_row.cm); 78 | MPI_Bcast(tau, b, MPI_DOUBLE, root_col, cdt_row.cm); 79 | TAU_FSTOP(Bcast_update); 80 | MPI_Barrier(cdt_world.cm); 81 | move_ptr = 0; 82 | if (cdt_row.rank == root_col){ 83 | move_ptr = b*lda_A; 84 | } 85 | /* Update 2D distributed matrix */ 86 | /*if (myRank == 0) printf("before apply Q:\n"); 87 | double * A_ptr = A+move_ptr;; 88 | for (int rr=0; rr 0) 97 | apply_tsqr_QT(Y, mb, tau, A+move_ptr, lda_A, m, b, kb, cdt_col.rank, cdt_col.np, root_row, cdt_col, tree_data); 98 | /*if (myRank == 0) printf("after apply Q:\n"); 99 | A_ptr = A+move_ptr;; 100 | for (int rr=0; rr 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include "../../shared/util.h" 9 | #include "bitree_tsqr.h" 10 | 11 | 12 | /** 13 | * \brief Apply Q^T to B, where Q^T is represented implicitly by Y and 14 | * tree_data, obtained by running binary tree TSQR 15 | * 16 | * \param[in] Y m-by-b matrix of Householder vectors from TSQR 17 | * \param[in] lda_Y lda of Y 18 | * \param[in] tau TAU values associated with the first level of TSQR HH vecs 19 | * \param[in,out] B m-by-k matrix to apply QT to 20 | * \param[in] lda_B length of leading dimension of B 21 | * \param[in] m number of rows 
in Y 22 | * \param[in] b number of columns in Y 23 | * \param[in] k number of columns of B to apply Y to 24 | * \param[in] myRank rank in communicator column 25 | * \param[in] numPes number of processes in column 26 | * \param[in] root the root of the tree (who will own R at the end) 27 | * \param[in] cdt MPI communicator for column 28 | * \param[in] tree_data TAU and Y data for the TSQR tree, 29 | * must be of size ((log2(p)+1)*b+b)/2-by-b 30 | **/ 31 | void apply_tsqr_QT( double const * Y, 32 | int64_t const lda_Y, 33 | double const * tau, 34 | double * B, 35 | int64_t const lda_B, 36 | int64_t const m, 37 | int64_t const b, 38 | int64_t const k, 39 | int64_t const myRank, 40 | int64_t const numPes, 41 | int64_t const root, 42 | CommData_t cdt, 43 | double * tree_data){ 44 | int req_id = 0; 45 | MPI_Request req; 46 | MPI_Status stat; 47 | int info; 48 | int64_t comm_pe, np, myr, offset, mb, bmb, i, buf_sz, j, np_work, coff; 49 | double * T, * Y2, * buffer, * B_buf; 50 | 51 | mb = (m+root*b)/numPes; 52 | bmb = (m+root*b)/numPes; 53 | if (myRank < root){ 54 | offset = (numPes-root)*mb+myRank*(mb-b); 55 | mb-=b; 56 | } else 57 | offset = (myRank-root)*mb; 58 | 59 | if (mb <= 0){ 60 | return; 61 | } else { 62 | myr = myRank-root; 63 | if (myr < 0) 64 | myr += numPes; 65 | } 66 | if (numPes * b > m) 67 | np_work = m/b; 68 | else 69 | np_work = numPes; 70 | buf_sz = mb*MAX(k,b); 71 | 72 | buffer = (double*)malloc(sizeof(double)*buf_sz); 73 | 74 | // exit early if only one process involved 75 | if (np_work == 1){ 76 | TAU_FSTART(apply_tsqr_QT_local); 77 | if (myr >= 0){ 78 | cdormqr('L', 'T', m, k, b, Y, lda_Y, tau, B, lda_B, buffer, buf_sz, &info); 79 | } 80 | TAU_FSTOP(apply_tsqr_QT_local); 81 | free(buffer); 82 | return; 83 | } 84 | 85 | Y2 = (double*)malloc(sizeof(double)*b*b*2); 86 | B_buf = (double*)malloc(sizeof(double)*b*k*2); 87 | T = (double*)malloc(sizeof(double)*b*b); 88 | std::fill(Y2, Y2+b*b*2, 0.0); 89 | 90 | TAU_FSTART(apply_tsqr_QT_local); 91 | if (myr 
>= 0){ 92 | //cdormqr('L', 'T', mb, k, b, Y, lda_Y, tau, B, lda_B, buffer, buf_sz, &info); 93 | cdlarft('F', 'C', mb, b, Y, lda_Y, tau, T, b); 94 | cdlarfb('L', 'T', 'F', 'C', mb, k, b, Y, lda_Y, T, b, B, lda_B, buffer, k); 95 | } 96 | TAU_FSTOP(apply_tsqr_QT_local); 97 | 98 | lda_cpy(b,k,lda_B,2*b,B,B_buf); 99 | 100 | 101 | TAU_FSTART(apply_tsqr_QT_tree); 102 | for (np = np_work; np > 1; np = np/2+(np%2)){ 103 | /* If I am in second half of processor list send my data to lower half */ 104 | if ((myr > np/2 || myr*2 == np) && myr < np ){ 105 | comm_pe = myr-(np+1)/2; 106 | comm_pe = comm_pe + root; 107 | if (comm_pe >= numPes) 108 | comm_pe = comm_pe - numPes; 109 | 110 | if ((np%2 == 0 || myr != np/2)){ 111 | if (tree_data == NULL){ 112 | pack_upper(Y, buffer, b, lda_Y); 113 | MPI_Send(buffer, psz_upr(b), MPI_DOUBLE, comm_pe, req_id, cdt.cm); 114 | } 115 | lda_cpy(b, k, 2*b, b, B_buf, buffer); 116 | MPI_Send(buffer, k*b, MPI_DOUBLE, comm_pe, req_id, cdt.cm); 117 | MPI_Irecv(buffer, k*b, MPI_DOUBLE, comm_pe, req_id, cdt.cm, &req); 118 | MPI_Wait(&req, &stat); 119 | lda_cpy(b, k, b, 2*b, buffer, B_buf); 120 | } 121 | } else if (myr < np/2 && myr >= 0) { 122 | TAU_FSTART(ctQ_tree_worker); 123 | comm_pe = myr+(np+1)/2; 124 | comm_pe = comm_pe + root; 125 | if (comm_pe >= numPes){ 126 | comm_pe = comm_pe - numPes; 127 | coff = comm_pe*(bmb-b)+(numPes-root)*bmb; 128 | } else 129 | coff = (comm_pe-root)*bmb; 130 | MPI_Irecv(buffer, k*b, MPI_DOUBLE, comm_pe, req_id, cdt.cm, &req); 131 | MPI_Wait(&req, &stat); 132 | lda_cpy(b, k, b, 2*b, buffer, B_buf+b); 133 | if (tree_data == NULL){ 134 | MPI_Irecv(buffer, psz_upr(b), MPI_DOUBLE, comm_pe, req_id, cdt.cm, &req); 135 | MPI_Wait(&req, &stat); 136 | unpack_upper(buffer, Y2+b, b, 2*b); 137 | tau_recon('U', b, b, 2*b, Y2+b, T); 138 | cdormqr('L', 'N', 2*b, k, b, Y2, 2*b, T, B_buf, 2*b, 139 | buffer, buf_sz, &info); 140 | } else { 141 | memcpy(T, tree_data, b*MIN(b,TAU_BLK)*sizeof(double)); 142 | tree_data += b*MIN(b,TAU_BLK); 
143 | unpack_upper(tree_data, Y2+b, b, 2*b); 144 | tree_data += psz_upr(b); 145 | TAU_FSTART(cdtpmqrt); 146 | cdtpmqrt('L', 'T', b, k, b, b, MIN(b,TAU_BLK), Y2+b, 2*b, T, MIN(b,TAU_BLK), B_buf, 2*b, 147 | B_buf+b, 2*b, buffer, &info); 148 | TAU_FSTOP(cdtpmqrt); 149 | } 150 | lda_cpy(b,k,2*b,b,B_buf+b,buffer); 151 | MPI_Send(buffer, k*b, MPI_DOUBLE, comm_pe, req_id, cdt.cm); 152 | TAU_FSTOP(ctQ_tree_worker); 153 | } 154 | } 155 | TAU_FSTOP(apply_tsqr_QT_tree); 156 | lda_cpy(b,k,2*b,lda_B,B_buf,B); 157 | 158 | 159 | free(T); 160 | free(B_buf); 161 | free(buffer); 162 | free(Y2); 163 | } 164 | 165 | 166 | -------------------------------------------------------------------------------- /alg/QR/tsqr/bitree_tsqr.h: -------------------------------------------------------------------------------- 1 | #ifndef __BITREE_TSQR_H__ 2 | #define __BITREE_TSQR_H__ 3 | 4 | #include "../../shared/comm.h" 5 | 6 | #ifndef TAU_BLK 7 | #define TAU_BLK 16 8 | #endif 9 | 10 | /** 11 | * \brief Perform TSQR on a matrix of two stacked upper-trinagular Rs 12 | * 13 | * \param[in,out] A 2b-by-b [R1; R2] tall-skinny matrix, Y\R on output 14 | * \param[in,out] buf 2b-by-b buffer 15 | * \param[in] b number of columns in A 16 | * \param[in] tau if not NULL put T matrix here 17 | **/ 18 | void tree_tsqr(double * A, 19 | double * buf, 20 | int64_t const b, 21 | double * tau=NULL); 22 | 23 | /** 24 | * \brief Perform TSQR over a (sub)-column of processors 25 | * 26 | * \param[in,out] A m-by-b dense tall-skinny matrix, Y\R on output 27 | * \param[in,out] R b-by-b upper-triangular matrix for which A=QR 28 | * \param[in] m number of rows in A 29 | * \param[in] b number of columns in A 30 | * \param[in] lda leading dimension (number of buffer rows) in A 31 | * \param[in] tau if not NULL put T matrix here 32 | **/ 33 | void local_tsqr(double * A, 34 | double * R, 35 | int64_t const m, 36 | int64_t const b, 37 | int64_t const lda_A, 38 | double * tau = NULL); 39 | 40 | /** 41 | * \brief Perform TSQR over a 
(sub)-column of processors 42 | * 43 | * \param[in,out] A m-by-b dense tall-skinny matrix 44 | * \param[in] lda_A lda of A 45 | * \param[in,out] R b-by-b upper-triangular matrix of, for which A=QR 46 | * \param[in,out] tau length b vector of tau values for the first tree level 47 | * \param[in] m number of rows in A 48 | * \param[in] b number of columns in A 49 | * \param[in] myRank rank in communicator column 50 | * \param[in] numPes number of processes in column 51 | * \param[in] root the root of the tree (who will own R at the end) 52 | * \param[in] req_id request id to use for send/recv 53 | * \param[in] cdt MPI communicator for column 54 | * \param[in] output_Y whether to overwrite A with Y 55 | * \param[in,out] tree_data TAU and Y data for the TSQR tree, 56 | * must be of size ((log2(p)+1)*b+b)/2-by-b 57 | **/ 58 | void bitree_tsqr( double * A, 59 | int64_t const lda_A, 60 | double * R, 61 | double * tau, 62 | int64_t const m, 63 | int64_t const b, 64 | int64_t const myRank, 65 | int64_t const numPes, 66 | int64_t const root, 67 | int64_t const req_id, 68 | CommData_t cdt, 69 | int64_t const output_Y=0, 70 | double * tree_data = NULL); 71 | 72 | /** 73 | * \brief Perform TSQR over a (sub)-column of processors 74 | * 75 | * \param[in] Y m-by-b matrix of Householder vectors from TSQR 76 | * \param[in] lda_Y lda of Y 77 | * \param[in] tau TAU values associated with the first level of TSQR HH vecs 78 | * \param[out] Q1 first b columns of Q (buffer should be prealloced) 79 | * \param[in] lda_Q length of leading dimension of Q1 80 | * \param[in] m number of rows in Y 81 | * \param[in] b number of columns in Y 82 | * \param[in] k number of columns of Q1 to compute 83 | * \param[in] myRank rank in communicator column 84 | * \param[in] numPes number of processes in column 85 | * \param[in] root the root of the tree (who will own R at the end) 86 | * \param[in] cdt MPI communicator for column 87 | * \param[in,out] tree_data TAU and Y data for the TSQR tree, 88 | * must 
be of size ((log2(p)+1)*b+b)/2-by-b 89 | * \param[in] ID b-by-k matrix which replaces the identity on the tree root if not NULL 90 | **/ 91 | void construct_Q1(double const * Y, 92 | int64_t const lda_Y, 93 | double const * tau, 94 | double * Q1, 95 | int64_t const lda_Q, 96 | int64_t const m, 97 | int64_t const b, 98 | int64_t const k, 99 | int64_t const myRank, 100 | int64_t const numPes, 101 | int64_t const root, 102 | CommData_t cdt, 103 | double * tree_data = NULL, 104 | double * ID = NULL); 105 | 106 | /** 107 | * \brief Apply Q^T to B, where Q^T is represented implicitly by Y and 108 | * tree_data, obtained by running binary tree TSQR 109 | * 110 | * \param[in] Y m-by-b matrix of Householder vectors from TSQR 111 | * \param[in] lda_Y lda of Y 112 | * \param[in] tau TAU values associated with the first level of TSQR HH vecs 113 | * \param[in,out] B m-by-k matrix to apply QT to 114 | * \param[in] lda_B length of leading dimension of B 115 | * \param[in] m number of rows in Y 116 | * \param[in] b number of columns in Y 117 | * \param[in] k number of columns of B to apply Y to 118 | * \param[in] myRank rank in communicator column 119 | * \param[in] numPes number of processes in column 120 | * \param[in] root the root of the tree (who will own R at the end) 121 | * \param[in] req_id request id to use for send/recv 122 | * \param[in] cdt MPI communicator for column 123 | * \param[in] tree_data TAU and Y data for the TSQR tree, 124 | * must be of size ((log2(p)+1)*b+b)/2-by-b 125 | **/ 126 | void apply_tsqr_QT( double const * Y, 127 | int64_t const lda_Y, 128 | double const * tau, 129 | double * B, 130 | int64_t const lda_B, 131 | int64_t const m, 132 | int64_t const b, 133 | int64_t const k, 134 | int64_t const myRank, 135 | int64_t const numPes, 136 | int64_t const root, 137 | CommData_t cdt, 138 | double * tree_data = NULL); 139 | 140 | #endif 141 | -------------------------------------------------------------------------------- /alg/QR/tsqr/butterfly_tsqr.cxx: 
-------------------------------------------------------------------------------- 1 | /* Copyright (c) Edgar Solomonik 2015, all rights reserved. This code is part of the CANDMC repository, protected under a two-clause BSD license. */ 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include "../../shared/util.h" 9 | #include "bitree_tsqr.h" 10 | #include "butterfly_tsqr.h" 11 | #include "mpi.h" 12 | 13 | /** 14 | * \brief Perform TSQR over a (sub)-column of processors 15 | * 16 | * \param[in,out] A m-by-b dense tall-skinny matrix 17 | * \param[in] lda_A lda of A 18 | * \param[in,out] R b-by-b upper-triangular matrix of, for which A=QR 19 | * \param[in,out] tau length b vector of tau values for the first tree level 20 | * \param[in] m number of rows in A 21 | * \param[in] b number of columns in A 22 | * \param[in] myRank rank in communicator column 23 | * \param[in] numPes number of processes in column 24 | * \param[in] root the root of the tree (who will own R at the end) 25 | * \param[in] req_id request id to use for send/recv 26 | * \param[in] cdt MPI communicator for column 27 | * \param[in] tree_data TAU and Y data for the TSQR tree, 28 | * must be of size ((log2(p)+1)*b+b)/2-by-b 29 | **/ 30 | void butterfly_tsqr(double * A, 31 | int64_t const lda_A, 32 | double * tau, 33 | int64_t const m, 34 | int64_t const b, 35 | int64_t const myRank, 36 | int64_t const numPes, 37 | int64_t const root, 38 | int64_t const req_id, 39 | CommData_t cdt, 40 | double * tree_data){ 41 | int64_t comm_pe, mb, i, info, np_work; 42 | double * R_buf, * S_buf, * work; 43 | 44 | mb = ((m/b)/numPes)*b; 45 | if ((m/b) % numPes > (myRank + numPes - root) % numPes) mb+=b; 46 | 47 | /* This is not an invarient butterfly, there is an up */ 48 | int myr = myRank-root; 49 | if (myr < 0) 50 | myr += numPes; 51 | 52 | /* determine the number of processors that have a block involved */ 53 | if (numPes * b > m){ 54 | np_work = m/b; 55 | } else 56 | np_work = numPes; 57 | 58 
| /* find the smallest power of two less than or equal to number of working processors */ 59 | int inp = np_work; 60 | int pow2_np_work = 1; 61 | while (inp > 1){ 62 | pow2_np_work = pow2_np_work*2; 63 | inp = inp/2; 64 | } 65 | assert(pow2_np_work <= np_work); 66 | 67 | work = (double*)malloc(sizeof(double)*mb*b); 68 | // exit early if only one process involved 69 | if (np_work == 1){ 70 | /*R_buf = (double*)malloc(sizeof(double)*m*b); 71 | lda_cpy(mb, b, lda_A, mb, A, R_buf);*/ 72 | if (myr == 0){ 73 | TAU_FSTART(Local_panel_TSQR); 74 | local_tsqr(A, work, m, b, lda_A, tau); 75 | TAU_FSTOP(Local_panel_TSQR); 76 | } 77 | // free(R_buf); 78 | 79 | return; 80 | } 81 | 82 | // compute the first R on each m-by-b starting block 83 | R_buf = (double*)malloc(sizeof(double)*2*b*b); 84 | double * R = (double*)malloc(sizeof(double)*b*b); 85 | 86 | if (myr < np_work){ 87 | TAU_FSTART(Local_panel_TSQR); 88 | local_tsqr(A, work, mb, b, lda_A, tau); 89 | pack_upper(A, R, b, lda_A); 90 | TAU_FSTOP(Local_panel_TSQR); 91 | } 92 | double * R_recv_buf = (double*)malloc(sizeof(double)*b*b); 93 | 94 | // if processor count now power of two do one clipped butterfly level 95 | TAU_FSTART(TSQR_clipped_wing); 96 | if (np_work > pow2_np_work){ 97 | if ((myr >= pow2_np_work && myr < np_work) || (myr < np_work - pow2_np_work)){ 98 | int parity = (myr>=pow2_np_work); 99 | int vcomm_pe = myr - parity*pow2_np_work + (1-parity)*pow2_np_work; 100 | int comm_pe = (vcomm_pe + root)%numPes; 101 | MPI_Status stat; 102 | MPI_Sendrecv(R, psz_upr(b), MPI_DOUBLE, comm_pe, req_id+parity, 103 | R_recv_buf, psz_upr(b), MPI_DOUBLE, comm_pe, req_id+1-parity, 104 | cdt.cm, &stat); 105 | 106 | unpack_upper(R, R_buf+parity*b, b, 2*b); 107 | unpack_upper(R_recv_buf, R_buf+(1-parity)*b, b, 2*b); 108 | 109 | tree_tsqr(R_buf, work, b, tree_data); 110 | tree_data += b*MIN(b,TAU_BLK); 111 | pack_upper(R_buf+b,tree_data, b, 2*b); 112 | tree_data += psz_upr(b); 113 | pack_upper(R_buf, R, b, 2*b); 114 | } 115 | } 116 | 
#ifdef PROFILE 117 | MPI_Barrier(cdt.cm); 118 | #endif 119 | TAU_FSTOP(TSQR_clipped_wing); 120 | 121 | TAU_FSTART(TSQR_butterfly); 122 | if (pow2_np_work > 1 && myr < pow2_np_work){ 123 | /* Tournament tree is a butterfly */ 124 | for (int level=pow2_np_work; level>1; level=level/2){ 125 | /* parity determines which buttefly wing this proc is on */ 126 | int parity = (myr%level)>=level/2; 127 | /* comm_pe finds the other wing */ 128 | int vcomm_pe = level*(myr/level) + (((myr%level)+(level/2))%level); 129 | int comm_pe = (vcomm_pe + root)%numPes; 130 | 131 | MPI_Status stat; 132 | MPI_Sendrecv(R, psz_upr(b), MPI_DOUBLE, comm_pe, req_id+parity, 133 | R_recv_buf, psz_upr(b), MPI_DOUBLE, comm_pe, req_id+1-parity, 134 | cdt.cm, &stat); 135 | 136 | unpack_upper(R, R_buf+parity*b, b, 2*b); 137 | unpack_upper(R_recv_buf, R_buf+(1-parity)*b, b, 2*b); 138 | 139 | tree_tsqr(R_buf, work, b, tree_data); 140 | tree_data += b*MIN(b,TAU_BLK); 141 | pack_upper(R_buf+b,tree_data, b, 2*b); 142 | tree_data += psz_upr(b); 143 | pack_upper(R_buf, R, b, 2*b); 144 | } 145 | } 146 | #ifdef PROFILE 147 | MPI_Barrier(cdt.cm); 148 | #endif 149 | TAU_FSTOP(TSQR_butterfly); 150 | if (myRank == root){ 151 | copy_upper(R_buf, A, b, 2*b, lda_A, 0); 152 | } 153 | 154 | free(R); 155 | free(R_buf); 156 | free(R_recv_buf); 157 | } 158 | 159 | 160 | 161 | -------------------------------------------------------------------------------- /alg/QR/tsqr/butterfly_tsqr.h: -------------------------------------------------------------------------------- 1 | #ifndef __BUTTERFLY_TSQR_H__ 2 | #define __BUTTERFLY_TSQR_H__ 3 | 4 | #include "../../shared/comm.h" 5 | 6 | #ifndef TAU_BLK 7 | #define TAU_BLK 16 8 | #endif 9 | 10 | /** 11 | * \brief Perform TSQR over a (sub)-column of processors 12 | * 13 | * \param[in,out] A m-by-b dense tall-skinny matrix 14 | * \param[in] lda_A lda of A 15 | * \param[in,out] R b-by-b upper-triangular matrix of, for which A=QR 16 | * \param[in,out] tau length b vector of tau values 
for the first tree level 17 | * \param[in] m number of rows in A 18 | * \param[in] b number of columns in A 19 | * \param[in] myRank rank in communicator column 20 | * \param[in] numPes number of processes in column 21 | * \param[in] root the root of the tree (who will own R at the end) 22 | * \param[in] req_id request id to use for send/recv 23 | * \param[in] cdt MPI communicator for column 24 | * \param[in,out] tree_data TAU and Y data for the TSQR tree, 25 | * must be of size ((log2(p)+1)*b+b)/2-by-b 26 | **/ 27 | void butterfly_tsqr(double * A, 28 | int64_t const lda_A, 29 | double * tau, 30 | int64_t const m, 31 | int64_t const b, 32 | int64_t const myRank, 33 | int64_t const numPes, 34 | int64_t const root, 35 | int64_t const req_id, 36 | CommData_t cdt, 37 | double * tree_data); 38 | 39 | /** 40 | * \brief Construct Q from butterfly TSQR over a (sub)-column of processors 41 | * 42 | * \param[in] Y m-by-b matrix of Householder vectors from TSQR 43 | * \param[in] lda_Y lda of Y 44 | * \param[in] tau TAU values associated with the first level of TSQR HH vecs 45 | * \param[in,out] Q1 first k columns of Q (buffer should be prealloced and 46 | * preset if is_form_q set to 0) 47 | * \param[in] lda_Q length of leading dimension of Q1 48 | * \param[in] m number of rows in Y 49 | * \param[in] b number of columns in Y, number of columns of Q1 to compute 50 | * \param[in] myRank rank in communicator column 51 | * \param[in] numPes number of processes in column 52 | * \param[in] root the root of the tree (who will own R at the end) 53 | * \param[in] cdt MPI communicator for column 54 | * \param[in] tree_data TAU and Y data for the TSQR tree, 55 | * must be of size ((log2(p)+1)*b+b)/2-by-b 56 | **/ 57 | void butterfly_construct_Q1(double const * Y, 58 | int64_t lda_Y, 59 | double const * tau, 60 | double * Q1, 61 | int64_t lda_Q, 62 | int64_t m, 63 | int64_t b, 64 | int64_t myRank, 65 | int64_t numPes, 66 | int64_t root, 67 | CommData_t cdt, 68 | double * tree_data); 69 | 
70 | /** 71 | * \brief Apply Q^T to B, where Q^T is represented implicitly by Y and 72 | * tree_data, obtained by running binary tree TSQR 73 | * 74 | * \param[in] Y m-by-b matrix of Householder vectors from TSQR 75 | * \param[in] lda_Y lda of Y 76 | * \param[in] tau TAU values associated with the first level of TSQR HH vecs 77 | * \param[in,out] B m-by-k matrix to apply QT to 78 | * \param[in] lda_B length of leading dimension of B 79 | * \param[in] m number of rows in Y 80 | * \param[in] b number of columns in Y 81 | * \param[in] k number of columns of B to apply Y to 82 | * \param[in] myRank rank in communicator column 83 | * \param[in] numPes number of processes in column 84 | * \param[in] root the root of the tree (who will own R at the end) 85 | * \param[in] cdt MPI communicator for column 86 | * \param[in] tree_data TAU and Y data for the TSQR tree, 87 | * must be of size ((log2(p)+1)*b+b)/2-by-b 88 | **/ 89 | void apply_butterfly_tsqr_QT( double const * Y, 90 | int64_t const lda_Y, 91 | double const * tau, 92 | double * B, 93 | int64_t const lda_B, 94 | int64_t const m, 95 | int64_t const b, 96 | int64_t const k, 97 | int64_t const myRank, 98 | int64_t const numPes, 99 | int64_t const root, 100 | CommData_t cdt, 101 | double * tree_data); 102 | 103 | #endif 104 | -------------------------------------------------------------------------------- /alg/SE/CANSE.h: -------------------------------------------------------------------------------- 1 | #ifndef __CANSE_H__ 2 | #define __CANSE_H__ 3 | 4 | 5 | /** 6 | * \brief Perform reduction to banded using 2D QR 7 | * 8 | * \param[in,out] A n-by-n dense symmetric matrix, stored unpacked 9 | pointer should refer to current working corner of A 10 | * \param[in] lda_A lda of A 11 | * \param[in] n number of rows and columns in A 12 | * \param[in] b is the large block to which we are reducing the band, 13 | b must be a multiple of b_sub 14 | * \param[in] b_sub small block size with which matrix is distributed 15 | * 
\param[in] pv current processor grid view oriented at corner of A 16 | **/ 17 | void sym_full2band(double * A, 18 | int64_t lda_A, 19 | int64_t n, 20 | int64_t b, 21 | int64_t b_sub, 22 | pview * pv); 23 | 24 | /** 25 | * \brief Perform reduction to banded using a 3D algorithm 26 | * 27 | * \param[in,out] A n-by-n dense symmetric matrix, stored unpacked 28 | * pointer should refer to current working corner of A 29 | * blocked accrows crow and ccol pv and replicated across clyr 30 | * \param[in] lda_A lda of A 31 | * \param[in] n number of rows and columns in A 32 | * \param[in] b_agg is the number of U vectors to aggregate before applying to 33 | * full trailing matrix (must be multiply of b_qr) 34 | * \param[in] bw is the bandwidth to reduce to (2D QR size) 35 | * \param[in] b_sub small block size with which matrix is distributed 36 | * \param[in] pv current processor grid view oriented at corner of A 37 | **/ 38 | void sym_full2band_3d(double * A, 39 | int64_t lda_A, 40 | int64_t n, 41 | int64_t b_agg, 42 | int64_t bw, 43 | int64_t b_sub, 44 | pview_3d * pv); 45 | 46 | /** 47 | * \brief Perform reduction to banded using 2D QR 48 | * 49 | * \param[in,out] A n-by-n dense symmetric matrix, stored unpacked 50 | pointer should refer to current working corner of A 51 | * \param[in] lda_A lda of A 52 | * \param[in] n number of rows and columns in A 53 | * \param[in] b is the large block to which we are reducing the band, 54 | b must be a multiple of b_sub 55 | * \param[in] b_sub small block size with which matrix is distributed 56 | * \param[in] pv current processor grid view oriented at corner of A 57 | * \param[in] desc_A descriptor for whole A matrix 58 | * \param[in] org_A pointer to top left corner of A matrix 59 | * \param[in] IA row index offset 60 | * \param[in] JA column index offset 61 | **/ 62 | void sym_full2band_scala(double * A, 63 | int64_t lda_A, 64 | int64_t n, 65 | int64_t b, 66 | int64_t b_sub, 67 | pview * pv, 68 | int const * desc_A, 69 | double * 
org_A,
70 |                          int64_t IA=1,
71 |                          int64_t JA=1);
72 | #endif
73 |
--------------------------------------------------------------------------------
/alg/SE/Makefile:
--------------------------------------------------------------------------------
1 | include ../../config.mk
2 |
3 | LIB_DIR = ../../lib
4 |
5 | # full_to_band.o is always compiled; the other objects only get a recipe when
6 | # DEFS enables ScaLAPACK, so only then are they listed for the archive.
7 | OBJS := full_to_band.o
8 | ifneq (,$(findstring DUSE_SCALAPACK,$(DEFS)))
9 | OBJS += full_to_band_scala.o dmatrix.o full_to_band_3d.o
10 | endif
11 |
12 | .PHONY: CANSE clean
13 |
14 | # Convenience alias for the static library.
15 | CANSE: $(LIB_DIR)/libCANSE.a
16 |
17 | # Archive exactly the objects listed in OBJS instead of every *.o lying in
18 | # the directory, so stale or unrelated objects never end up in the library.
19 | $(LIB_DIR)/libCANSE.a: $(OBJS)
20 | 	$(AR) -crs $@ $^
21 |
22 | # ScaLAPACK-only object: the recipe is empty when ScaLAPACK is not enabled.
23 | dmatrix.o: dmatrix.cxx
24 | ifneq (,$(findstring DUSE_SCALAPACK,$(DEFS)))
25 | 	$(CXX) -c $< -o $@ $(CXXFLAGS) $(DEFS)
26 | endif
27 |
28 | full_to_band.o: full_to_band.cxx CANSE.h
29 | 	$(CXX) -c $< -o $@ $(CXXFLAGS) $(DEFS)
30 |
31 | # ScaLAPACK-only object; the first prerequisite is dmatrix.o, so the source
32 | # file is named explicitly rather than through $<.
33 | full_to_band_3d.o: dmatrix.o full_to_band_3d.cxx CANSE.h
34 | ifneq (,$(findstring DUSE_SCALAPACK,$(DEFS)))
35 | 	$(CXX) -c full_to_band_3d.cxx -o $@ $(CXXFLAGS) $(DEFS)
36 | endif
37 |
38 | # ScaLAPACK-only object: the recipe is empty when ScaLAPACK is not enabled.
39 | full_to_band_scala.o: full_to_band_scala.cxx CANSE.h
40 | ifneq (,$(findstring DUSE_SCALAPACK,$(DEFS)))
41 | 	$(CXX) -c $< -o $@ $(CXXFLAGS) $(DEFS)
42 | endif
43 |
44 | clean:
45 | 	$(RM) *.o
46 |
--------------------------------------------------------------------------------
/alg/shared/Makefile:
--------------------------------------------------------------------------------
1 | include ../../config.mk
2 |
3 | LIB_DIR=../../lib
4 |
5 | OBJS := util.o timer.o lapack.o
6 |
7 | .PHONY: CANShared clean
8 |
9 | # Convenience alias for the static library.
10 | CANShared: $(LIB_DIR)/libCANShared.a
11 |
12 | $(LIB_DIR)/libCANShared.a: $(OBJS)
13 | 	$(AR) -crs $@ $^
14 |
15 | # Each object is rebuilt from the header and .cxx source sharing its stem.
16 | $(OBJS): %.o: %.h %.cxx
17 | 	$(CXX) -c $*.cxx -o $@ $(CXXFLAGS) $(DEFS)
18 |
19 | clean:
20 | 	$(RM) *.o
21 |
--------------------------------------------------------------------------------
/alg/shared/comm.h: -------------------------------------------------------------------------------- 1 | /** Copyright Edgar Solomonik 2015, all rights reserved. This code is part of the CANDMC repository, protected under a two-clause BSD license. **/ 2 | 3 | #ifndef __COMM_H__ 4 | #define __COMM_H__ 5 | 6 | #define USE_MPI 7 | 8 | #include 9 | 10 | #ifdef USE_MPI 11 | /********************************************************* 12 | * * 13 | * MPI * 14 | * * 15 | *********************************************************/ 16 | #include "mpi.h" 17 | #include "util.h" 18 | //latency time per message 19 | #define COST_LATENCY 1.e-6 20 | //memory bandwidth: time per per byte 21 | #define COST_MEMBW 1.e-9 22 | //network bandwidth: time per byte 23 | #define COST_NETWBW 5.e-10 24 | //flop cost: time per flop 25 | #define COST_FLOP 2.e-11 26 | //flop cost: time per flop 27 | #define COST_OFFLOADBW 5.e-10 28 | 29 | 30 | //typedef MPI_Comm COMM; 31 | 32 | typedef class CommData { 33 | public: 34 | MPI_Comm cm; 35 | int np; 36 | int rank; 37 | int color; 38 | int alive; 39 | 40 | double estimate_bcast_time(long_int msg_sz) { 41 | #ifdef BGQ 42 | return msg_sz*(double)COST_NETWBW+COST_LATENCY; 43 | #else 44 | return msg_sz*(double)log2((double)np)*COST_NETWBW; 45 | #endif 46 | } 47 | 48 | double estimate_allred_time(long_int msg_sz) { 49 | #ifdef BGQ 50 | return msg_sz*(double)(2.*COST_MEMBW+COST_NETWBW)+COST_LATENCY; 51 | #else 52 | return msg_sz*(double)log2((double)np)*(2.*COST_MEMBW+COST_FLOP+COST_NETWBW); 53 | #endif 54 | } 55 | 56 | double estimate_alltoall_time(long_int chunk_sz) { 57 | return chunk_sz*np*log2((double)np)*COST_NETWBW+2.*log2((double)np)*COST_LATENCY; 58 | } 59 | 60 | double estimate_alltoallv_time(long_int tot_sz) { 61 | return 2.*tot_sz*log2((double)np)*COST_NETWBW+2.*log2((double)np)*COST_LATENCY; 62 | } 63 | } CommData_t; 64 | 65 | //2d procesor grid local processor view 66 | class pview { 67 | public: 68 | //current root row 69 | int rrow; 70 | 
//current root col 71 | int rcol; 72 | //row communicatir 73 | CommData_t crow; 74 | //column communicator 75 | CommData_t ccol; 76 | //diagonal communicator 77 | CommData_t cdiag; 78 | //world communicator 79 | CommData_t cworld; 80 | #ifdef USE_SCALAPACK 81 | //scalapack context for 2D grid 82 | int ictxt; 83 | #endif 84 | }; 85 | 86 | 87 | //3d procesor grid local processor view 88 | class pview_3d { 89 | public: 90 | //context for 2D rectangular folding of 3D grid 91 | pview prect; 92 | 93 | //context for my 2D layer 94 | pview plyr; 95 | 96 | //layer (replication dimension) communicator 97 | CommData_t clyr; 98 | 99 | //column-layers of the processor grid (there is crow.np of these in total) 100 | CommData_t cworld; 101 | }; 102 | 103 | #ifdef PRINTALL 104 | #define CPRINTF(cdt,...) \ 105 | do { if (cdt.rank == 0) printf(__VA_ARGS__); } while (0) 106 | #else 107 | #define CPRINTF(...) 108 | #endif 109 | 110 | #define POST_BCAST(buf, sz, type, root, cdt, bcast_req) \ 111 | do { \ 112 | MPI_Bcast(buf, sz, type, root, cdt.cm); } while(0) 113 | 114 | #define WAIT_BCAST(cdt, bcast_req) 115 | 116 | 117 | #define SET_COMM(_cm, _rank, _np, _cdt) \ 118 | do { \ 119 | _cdt.cm = _cm; \ 120 | _cdt.rank = _rank; \ 121 | _cdt.np = _np; \ 122 | _cdt.alive = 1; \ 123 | } while (0) 124 | 125 | #define RINIT_COMM(numPes, myRank, nr, nb, cdt) \ 126 | do { \ 127 | INIT_COMM(numPes, myRank, nr, cdt); \ 128 | } while(0) 129 | 130 | #define INIT_COMM(numPes, myRank, nr, cdt) \ 131 | do { \ 132 | MPI_Init(&argc, &argv); \ 133 | MPI_Comm_size(MPI_COMM_WORLD, &numPes); \ 134 | MPI_Comm_rank(MPI_COMM_WORLD, &myRank); \ 135 | SET_COMM(MPI_COMM_WORLD, myRank, numPes, cdt); \ 136 | } while(0) 137 | 138 | 139 | #define COMM_EXIT \ 140 | do{ \ 141 | MPI_Finalize(); } while(0) 142 | 143 | #define SETUP_SUB_COMM(cdt_master, cdt, commrank, bcolor, p) \ 144 | do { \ 145 | cdt.rank = commrank; \ 146 | cdt.np = p; \ 147 | cdt.color = bcolor; \ 148 | cdt.alive = 1; \ 149 | 
MPI_Comm_split(cdt_master.cm, \ 150 | bcolor, \ 151 | commrank, \ 152 | &cdt.cm); } while(0) 153 | 154 | #define SETUP_SUB_COMM_SHELL(cdt_master, cdt, commrank, bcolor, p) \ 155 | do { \ 156 | cdt.rank = commrank; \ 157 | cdt.np = p; \ 158 | cdt.color = bcolor; \ 159 | cdt.alive = 0; \ 160 | } while(0) 161 | 162 | #define SHELL_SPLIT(cdt_master, cdt) \ 163 | do { \ 164 | cdt.alive = 1; \ 165 | MPI_Comm_split(cdt_master.cm, \ 166 | cdt.color, \ 167 | cdt.rank, \ 168 | &cdt.cm); } while(0) 169 | 170 | 171 | #define RSETUP_KDIR_COMM(myRank, p, c, cdt, commrank, color) \ 172 | do { \ 173 | commrank = myRank/(p/c); \ 174 | color = myRank%(p/c); \ 175 | cdt.rank = commrank; \ 176 | cdt.np = c; \ 177 | MPI_Comm_split(MPI_COMM_WORLD, \ 178 | color, \ 179 | commrank, \ 180 | &(cdt.cm)); } while(0) 181 | 182 | 183 | #define RSETUP_LAYER_COMM(pesdim, commrank, color, cdt_row, cdt_col, row, col) \ 184 | do { \ 185 | MPI_Comm MPI_INTRALAYER_COMM; \ 186 | MPI_Comm_split(MPI_COMM_WORLD, commrank, color, &MPI_INTRALAYER_COMM);\ 187 | row = color / pesdim; \ 188 | col = color % pesdim; \ 189 | MPI_Comm_split(MPI_INTRALAYER_COMM, myRow, myCol, &(cdt_row.cm)); \ 190 | MPI_Comm_split(MPI_INTRALAYER_COMM, myCol, myRow, &(cdt_col.cm)); \ 191 | cdt_row.np = pesdim; \ 192 | cdt_row.rank = col; \ 193 | cdt_col.np = pesdim; \ 194 | cdt_col.rank = row; \ 195 | } while(0) 196 | 197 | 198 | 199 | #define FREE_CDT(cdt) \ 200 | do { \ 201 | MPI_Comm_free(&(cdt->cm)); } while(0) 202 | 203 | #endif 204 | 205 | #endif 206 | -------------------------------------------------------------------------------- /alg/shared/pmpi.h: -------------------------------------------------------------------------------- 1 | #ifndef __PMPI_H__ 2 | #define __PMPI_H__ 3 | 4 | #include "mpi.h" 5 | 6 | #ifdef PMPI 7 | #define MPI_Bcast(...) \ 8 | do { CTF_Timer __t("MPI_Bcast"); \ 9 | __t.start(); \ 10 | PMPI_Bcast(__VA_ARGS__); \ 11 | __t.stop(); } while (0) 12 | #define MPI_Reduce(...) 
\ 13 | do { CTF_Timer __t("MPI_Reduce"); \ 14 | __t.start(); \ 15 | PMPI_Reduce(__VA_ARGS__); \ 16 | __t.stop(); }while (0) 17 | #define MPI_Wait(...) \ 18 | do { CTF_Timer __t("MPI_Wait"); \ 19 | __t.start(); \ 20 | PMPI_Wait(__VA_ARGS__); \ 21 | __t.stop(); } while (0) 22 | #define MPI_Send(...) \ 23 | do { CTF_Timer __t("MPI_Send"); \ 24 | __t.start(); \ 25 | PMPI_Send(__VA_ARGS__); \ 26 | __t.stop(); } while (0) 27 | #define MPI_Recv(...) \ 28 | do { CTF_Timer __t("MPI_Recv"); \ 29 | __t.start(); \ 30 | PMPI_Recv(__VA_ARGS__); \ 31 | __t.stop(); } while (0) 32 | #define MPI_Sendrecv(...) \ 33 | do { CTF_Timer __t("MPI_Sendrecv"); \ 34 | __t.start(); \ 35 | PMPI_Sendrecv(__VA_ARGS__); \ 36 | __t.stop(); } while (0) 37 | #define MPI_Allreduce(...) \ 38 | do { CTF_Timer __t("MPI_Allreduce"); \ 39 | __t.start(); \ 40 | PMPI_Allreduce(__VA_ARGS__); \ 41 | __t.stop(); } while (0) 42 | #define MPI_Allgather(...) \ 43 | do { CTF_Timer __t("MPI_Allgather"); \ 44 | __t.start(); \ 45 | PMPI_Allgather(__VA_ARGS__); \ 46 | __t.stop(); } while (0) 47 | #define MPI_Scatter(...) \ 48 | do { CTF_Timer __t("MPI_Scatter"); \ 49 | __t.start(); \ 50 | PMPI_Scatter(__VA_ARGS__); \ 51 | __t.stop(); } while (0) 52 | #define MPI_Alltoall(...) \ 53 | do { CTF_Timer __t("MPI_Alltoall"); \ 54 | __t.start(); \ 55 | PMPI_Alltoall(__VA_ARGS__); \ 56 | __t.stop(); } while (0) 57 | #define MPI_Alltoallv(...) \ 58 | do { CTF_Timer __t("MPI_Alltoallv"); \ 59 | __t.start(); \ 60 | PMPI_Alltoallv(__VA_ARGS__); \ 61 | __t.stop(); } while (0) 62 | #define MPI_Gatherv(...) \ 63 | do { CTF_Timer __t("MPI_Gatherv"); \ 64 | __t.start(); \ 65 | PMPI_Gatherv(__VA_ARGS__); \ 66 | __t.stop(); } while (0) 67 | #define MPI_Scatterv(...) \ 68 | do { CTF_Timer __t("MPI_Scatterv"); \ 69 | __t.start(); \ 70 | PMPI_Scatterv(__VA_ARGS__); \ 71 | __t.stop(); } while (0) 72 | #define MPI_Waitall(...) 
\ 73 | do { CTF_Timer __t("MPI_Waitall"); \ 74 | __t.start(); \ 75 | PMPI_Waitall(__VA_ARGS__); \ 76 | __t.stop(); } while (0) 77 | #define MPI_Barrier(...) \ 78 | do { CTF_Timer __t("MPI_Barrier"); \ 79 | __t.start(); \ 80 | PMPI_Barrier(__VA_ARGS__); \ 81 | __t.stop(); } while (0) 82 | #endif 83 | 84 | #endif 85 | -------------------------------------------------------------------------------- /alg/shared/timer.h: -------------------------------------------------------------------------------- 1 | #ifndef __TIMER_H__ 2 | #define __TIMER_H__ 3 | 4 | #include "util.h" 5 | #define MAX_NAME_LENGTH 53 6 | 7 | /** 8 | * \defgroup timer Timing and cost measurement 9 | * @{ 10 | *//** 11 | * \brief local process walltime measurement 12 | */ 13 | void CTF_set_main_args(int argc, const char * const * argv); 14 | 15 | /** 16 | * \defgroup timer Timing and cost measurement 17 | * @{ 18 | *//** 19 | * \brief local process walltime measurement 20 | */ 21 | class CTF_Timer{ 22 | public: 23 | char const * timer_name; 24 | int index; 25 | int exited; 26 | int original; 27 | 28 | public: 29 | CTF_Timer(char const * name); 30 | ~CTF_Timer(); 31 | void stop(); 32 | void start(); 33 | void exit(); 34 | 35 | }; 36 | 37 | 38 | class CTF_Function_timer{ 39 | public: 40 | char name[MAX_NAME_LENGTH]; 41 | double start_time; 42 | double start_excl_time; 43 | double acc_time; 44 | double acc_excl_time; 45 | int calls; 46 | 47 | double total_time; 48 | double total_excl_time; 49 | int total_calls; 50 | 51 | public: 52 | CTF_Function_timer(char const * name_, 53 | double const start_time_, 54 | double const start_excl_time_); 55 | void compute_totals(MPI_Comm comm); 56 | bool operator<(CTF_Function_timer const & w) const ; 57 | void print(FILE * output, 58 | MPI_Comm const comm, 59 | int const rank, 60 | int const np); 61 | }; 62 | 63 | /** 64 | * \brief epoch during which to measure timers 65 | */ 66 | class CTF_Timer_epoch{ 67 | private: 68 | CTF_Timer * tmr_inner; 69 | CTF_Timer * 
tmr_outer;
70 |     std::vector saved_function_timers;
71 |     double save_excl_time;
72 |   public:
73 |
74 |   public:
75 |     char const * name;
76 |     //create epoch called name
77 |     CTF_Timer_epoch(char const * name_);
78 |
79 |     CTF_Timer_epoch(){
80 |       saved_function_timers.clear();
81 |     }
82 |
83 |
84 |     //clears timers and begins epoch
85 |     void begin();
86 |
87 |     //prints timers and clears them
88 |     void end();
89 | };
90 |
91 | void CTF_set_context(MPI_Comm ctxt);
92 |
93 | #endif
94 |
95 |
--------------------------------------------------------------------------------
/bench/LU/Makefile:
--------------------------------------------------------------------------------
1 | include ../../config.mk
2 |
3 | LU_BENCHMARKS = lu_25d_np_bench lu_25d_pp_bench lu_25d_tp_bench
4 |
5 | INCLUDES := -I../../include/
6 | BIN_DIR = ../../bin/benchmarks
7 | LIB_DIR = ../../lib
8 |
9 | # Shared compile-and-link command: builds the rule's target ($@) from its
10 | # first prerequisite ($<). Defined with '=' so the automatic variables are
11 | # expanded inside each recipe that uses it.
12 | LINK_BENCH = $(CXX) -o $@ $< $(CXXFLAGS) $(DEFS) $(INCLUDES) \
13 |              -L$(LIB_DIR) -lCANLU -lCANShared $(BLAS_LIBS) $(LDFLAGS) -lm
14 |
15 | .PHONY: $(LU_BENCHMARKS) LU_benchmarks clean
16 |
17 | # Short names for the binaries placed in $(BIN_DIR).
18 | lu_25d_pp_bench: $(BIN_DIR)/lu_25d_pp_bench
19 | lu_25d_tp_bench: $(BIN_DIR)/lu_25d_tp_bench
20 | lu_25d_np_bench: $(BIN_DIR)/lu_25d_np_bench
21 |
22 | LU_benchmarks: $(LU_BENCHMARKS)
23 |
24 | # The three LU benchmarks come from the same source; they differ only in the
25 | # pivoting macro defined on the command line.
26 | $(BIN_DIR)/lu_25d_np_bench: lu_25d_pvt_bench.cxx
27 | 	$(LINK_BENCH) -DNO_PVT
28 |
29 | $(BIN_DIR)/lu_25d_pp_bench: lu_25d_pvt_bench.cxx
30 | 	$(LINK_BENCH) -DPARTIAL_PVT
31 |
32 | $(BIN_DIR)/lu_25d_tp_bench: lu_25d_pvt_bench.cxx
33 | 	$(LINK_BENCH) -DTNMT_PVT
34 |
35 | # Write the binary to the rule's own target in $(BIN_DIR); it was previously
36 | # written to ./par_tnmt_bench, so the declared target never came into existence.
37 | $(BIN_DIR)/par_tnmt_bench: par_tnmt_bench.cxx
38 | 	$(LINK_BENCH)
39 |
40 | clean:
41 | 	$(RM) *.o
42 |
-------------------------------------------------------------------------------- /bench/LU/par_tnmt_bench.cxx: -------------------------------------------------------------------------------- 1 | /* Copyright (c) Edgar Solomonik 2015, all rights reserved. This code is part of the CANDMC repository, protected under a two-clause BSD license. */ 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include "tnmt_pvt.h" 8 | #include "partial_pvt.h" 9 | //#include "../shared/comm.h" 10 | #include "../shared/util.h" 11 | #include "../shared/seq_lu.h" 12 | 13 | /* test parallel tournament pivoting 14 | * b is the size of the panel */ 15 | void par_tnmt_bench(int n, int b, int myRank, int numPes, 16 | int req_id, CommData *cdt, int num_iter){ 17 | // if (myRank == 0) printf("benchmarking parallel tournament pivoting with b=%d...\n",b); 18 | double *A,*A_buf,*R; 19 | int *P, *P_br; 20 | int i,j,row,col,info,it; 21 | //double frb_norm_tnmt[1], max_norm_tnmt[1]; 22 | // double * max_norm_tnmt = (double*)malloc(sizeof(double)); 23 | // double * frb_norm_tnmt = (double*)malloc(sizeof(double)); 24 | //// double frb_norm_gepp, max_norm_gepp; 25 | // double * tot_max_norm_tnmt = (double*)malloc(sizeof(double)); 26 | // double * tot_frb_norm_tnmt = (double*)malloc(sizeof(double)); 27 | // double tot_frb_norm_gepp, tot_max_norm_gepp; 28 | double start_time, tnmt_time, out_pivot_time, barrier_time, partial_time; 29 | 30 | int seed_offset = 1000; 31 | assert(n%numPes==0); 32 | int nb = n / numPes; 33 | 34 | assert(0==(posix_memalign((void**)&A, 35 | ALIGN_BYTES, 36 | nb*b*sizeof(double)))); 37 | assert(0==(posix_memalign((void**)&A_buf, 38 | ALIGN_BYTES, 39 | nb*b*sizeof(double)))); 40 | assert(0==(posix_memalign((void**)&R, 41 | ALIGN_BYTES, 42 | nb*b*sizeof(double)))); 43 | assert(0==(posix_memalign((void**)&P_br, 44 | ALIGN_BYTES, 45 | 3*nb*sizeof(int)))); 46 | assert(0==(posix_memalign((void**)&P, 47 | ALIGN_BYTES, 48 | nb*sizeof(int)))); 49 | 50 | 51 | 52 | 
COMM_BARRIER(cdt); 53 | start_time = TIME_SEC(); 54 | for (i=0; i(A, R, P, nb, b, nb); 76 | tnmt_pvt_1d(R,A,P,A_buf,P_br,b,myRank,0,numPes,0,req_id,cdt); 77 | COMM_BARRIER(cdt); 78 | tnmt_time += TIME_SEC()-start_time; 79 | } 80 | tnmt_time = tnmt_time/num_iter; 81 | tnmt_time = tnmt_time-barrier_time; 82 | 83 | COMM_BARRIER(cdt); 84 | start_time = TIME_SEC(); 85 | for (it=0; it= myRank*b && P[i] < (myRank+1)*b){ 95 | for (j=0; j 1) n = atoi(argv[1]); 146 | else n = 64; 147 | if (argc > 2) b_min = atoi(argv[2]); 148 | else b_min = 16; 149 | if (argc > 3) b_max = atoi(argv[3]); 150 | else b_max = 32; 151 | if (argc > 4) num_iter = atoi(argv[4]); 152 | else num_iter = 25; 153 | 154 | if (myRank == 0) { 155 | printf("benchmarking tournament pivoting panel of length %d for block sizes from b_min = %d to b_max = %d (p=%d)\n",n,b_min,b_max,numPes); 156 | printf("performing %d iterations\n", num_iter); 157 | printf("b\ttnmt(ms)\toutp(ms)\tpartial(ms)\tbarr(ms)\ttotal(ms)\n"); 158 | } 159 | 160 | GLOBAL_BARRIER(cdt_glb); 161 | for (b = b_min; b <= b_max; b = b*2){ 162 | par_tnmt_bench(n, b, myRank, numPes, 0, cdt_glb, num_iter); 163 | } 164 | GLOBAL_BARRIER(cdt_glb); 165 | COMM_EXIT; 166 | return 0; 167 | } 168 | -------------------------------------------------------------------------------- /bench/LU/pblas_lu.c: -------------------------------------------------------------------------------- 1 | #include "mpi.h" 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | #ifndef __C_SRC 9 | #define __C_SRC 10 | #endif 11 | 12 | #define NUM_ITER 5 13 | 14 | //proper modulus for 'a' in the range of [-b inf] 15 | #define WRAP(a,b) ((a + b)%b) 16 | #define MIN( a, b ) ( ((a) < (b)) ? 
(a) : (b) ) 17 | 18 | void Cblacs_pinfo(int*,int*); 19 | 20 | void Cblacs_get(int,int,int*); 21 | 22 | int Cblacs_gridinit(int*,char*,int,int); 23 | 24 | void descinit(int *, int *, 25 | int *, int *, 26 | int *, int *, 27 | int *, int *, 28 | int *, int *); 29 | 30 | static void cdesc_init(int * desc, 31 | int m, int n, 32 | int mb, int nb, 33 | int irsrc, int icsrc, 34 | int ictxt, int LLD, 35 | int * info){ 36 | descinit(desc,&m,&n,&mb,&nb,&irsrc,&icsrc, 37 | &ictxt, &LLD, info); 38 | } 39 | 40 | static void pdgetrf(int *, int *, 41 | double *, int *, 42 | int *, int *, 43 | int *, int *); 44 | 45 | static void cpdgetrf(int m, int n, 46 | double *A, int ia, 47 | int ja, int * desca, 48 | int *IPIV, int * info){ 49 | pdgetrf(&m,&n,A,&ia,&ja,desca,IPIV,info); 50 | } 51 | 52 | 53 | int main(int argc, char **argv) { 54 | /*void pbm() { 55 | 56 | int argc; 57 | char **argv;*/ 58 | int myRank, numPes; 59 | 60 | MPI_Init(&argc, &argv); 61 | MPI_Comm_size(MPI_COMM_WORLD, &numPes); 62 | MPI_Comm_rank(MPI_COMM_WORLD, &myRank); 63 | MPI_Request req[4]; 64 | MPI_Status status[4]; 65 | 66 | int log_numPes = uint_log2(numPes); 67 | 68 | 69 | if (argc < 4 || argc > 5) { 70 | if (myRank == 0) 71 | printf("%s [log2_mat_dim] [log2_pe_mat_lda] [log2_blk_dim] [number of iterations]\n", argv[0]); 72 | MPI_Abort(MPI_COMM_WORLD, -1); 73 | } 74 | 75 | int log_matrixDim = atoi(argv[1]); 76 | int log_blockDim = atoi(argv[2]); 77 | int log_sbDim = atoi(argv[3]); 78 | int matrixDim = 1< 4) num_iter = atoi(argv[4]); 84 | else num_iter = NUM_ITER; 85 | 86 | if (myRank == 0){ 87 | printf("PDGETRFOF SQUARE MATRIX\n"); 88 | printf("MATRIX DIMENSION IS %d\n", matrixDim); 89 | printf("BLOCK DIMENSION IS %d\n", sbDim); 90 | printf("PERFORMING %d ITERATIONS\n", num_iter); 91 | #ifdef RAND 92 | printf("WITH RANDOM DATA\n"); 93 | #else 94 | printf("WITH DATA=INDEX\n"); 95 | #endif 96 | } 97 | 98 | if (matrixDim < blockDim || matrixDim % blockDim != 0) { 99 | if (myRank == 0) printf("array_size_X \% 
block_size_X != 0!\n"); 100 | MPI_Abort(MPI_COMM_WORLD, -1); 101 | } 102 | if (matrixDim < blockDim || matrixDim % blockDim != 0) { 103 | if (myRank == 0) printf("array_size_Y \% block_size_Y != 0!\n"); 104 | MPI_Abort(MPI_COMM_WORLD, -1); 105 | } 106 | 107 | int log_num_blocks_dim = log_matrixDim - log_blockDim; 108 | int num_blocks_dim = 1< 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | 14 | #include "CANDMC.h" 15 | 16 | using namespace std; 17 | 18 | /** 19 | * \brief Benchmark TSQR and HH reconstruction 20 | * 21 | * \param[in] m number of rows in A 22 | * \param[in] b number of columns in A 23 | * \param[in] niter number of iterations 24 | * \param[in] myRank rank in communicator column 25 | * \param[in] numPes number of processes in column 26 | * \param[in] req_id request id to use for send/recv 27 | * \param[in] comm MPI communicator for column 28 | **/ 29 | void hh_recon_bench(int64_t const m, 30 | int64_t const b, 31 | int64_t const niter, 32 | int64_t const myRank, 33 | int64_t const numPes, 34 | int64_t const req_id, 35 | CommData_t cdt){ 36 | if (myRank == 0) 37 | printf("benchmarking parallel TSQR with YT reconstruction...\n"); 38 | double *A; 39 | double time; 40 | int64_t i,mb,iter; 41 | 42 | int64_t seed_offset = 99900; 43 | assert(m%numPes == 0); 44 | mb = m / numPes; 45 | 46 | assert(0==(posix_memalign((void**)&A, 47 | ALIGN_BYTES, 48 | mb*b*sizeof(double)))); 49 | double * W; 50 | assert(0==(posix_memalign((void**)&W, 51 | ALIGN_BYTES, 52 | b*b*sizeof(double)))); 53 | srand48(seed_offset); 54 | 55 | time = MPI_Wtime(); 56 | for (iter=0; iter ./exe \n"); 81 | ABORT; 82 | } 83 | niter = 10; 84 | 85 | if (argc == 1) { 86 | b = 17; 87 | m = 39*numPes; 88 | } 89 | if (argc >= 3) { 90 | m = atoi(argv[1]); 91 | b = atoi(argv[2]); 92 | assert(m > 0); 93 | assert(b > 0); 94 | assert(m % numPes == 0); 95 | assert(m / numPes >= b); 96 | } 97 | if (argc >= 4) 98 | 
niter = atoi(argv[3]); 99 | TAU_PROFILE_TIMER(timer, "main", "int (int, char**)", TAU_USER); 100 | TAU_PROFILE_START(timer); 101 | TAU_PROFILE_SET_NODE(myRank); 102 | TAU_PROFILE_SET_CONTEXT(0); 103 | 104 | 105 | hh_recon_bench(m, b, niter, myRank, numPes, 0, cdt_glb); 106 | TAU_PROFILE_STOP(timer); 107 | 108 | COMM_EXIT; 109 | return 0; 110 | } 111 | -------------------------------------------------------------------------------- /bench/QR/bench_qr_2d.cxx: -------------------------------------------------------------------------------- 1 | /* Copyright (c) Edgar Solomonik 2015, all rights reserved. This code is part of the CANDMC repository, protected under a two-clause BSD license. */ 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | 14 | #include "CANDMC.h" 15 | using namespace std; 16 | 17 | /** 18 | * \brief Benchmark TSQR and HH reconstruction 19 | * 20 | * \param[in] m number of rows in A 21 | * \param[in] k number of columns in A 22 | * \param[in] b2 outer block size of A 23 | * \param[in] b block size of A 24 | * \param[in] nprow number of procesor rows 25 | * \param[in] npcol number of procesor columns 26 | * \param[in] niter number of iterations 27 | * \param[in] myRank rank in communicator column 28 | * \param[in] numPes number of processes in column 29 | * \param[in] req_id request id to use for send/recv 30 | * \param[in] comm MPI communicator for column 31 | **/ 32 | void qr_2d_bench( int64_t const m, 33 | int64_t const k, 34 | int64_t const b2, 35 | int64_t const b, 36 | int64_t const nprow, 37 | int64_t const npcol, 38 | int64_t const niter, 39 | int64_t const myRank, 40 | int64_t const numPes, 41 | int64_t const req_id, 42 | CommData_t cdt_glb){ 43 | if (myRank == 0) 44 | printf("benchmarking parallel TSQR with YT reconstruction...\n"); 45 | double *A; 46 | double time; 47 | int64_t i,mb,iter,kb; 48 | 49 | int64_t seed_offset = 99900; 50 | 
CommData_t cdt_row, cdt_col; 51 | SETUP_SUB_COMM(cdt_glb, (cdt_row), 52 | myRank/nprow, 53 | myRank%nprow, 54 | npcol); 55 | SETUP_SUB_COMM(cdt_glb, (cdt_col), 56 | myRank%nprow, 57 | myRank/nprow, 58 | nprow); 59 | 60 | 61 | mb = m / nprow; 62 | kb = k / npcol; 63 | srand48(seed_offset); 64 | 65 | assert(0==(posix_memalign((void**)&A, 66 | ALIGN_BYTES, 67 | mb*kb*sizeof(double)))); 68 | time = MPI_Wtime(); 69 | for (iter=0; iter ./exe "); 101 | printf(" \n"); 102 | ABORT; 103 | } 104 | if (argc > 4) nprow = atoi(argv[4]); 105 | else { 106 | nprow = sqrt(numPes); 107 | while (numPes%nprow!=0) nprow++; 108 | } 109 | npcol = numPes/nprow; 110 | if (argc > 1) m = atoi(argv[1]); 111 | else m = 32*nprow; 112 | if (argc > 2) k = atoi(argv[2]); 113 | else k = 16*nprow; 114 | if (argc > 3) b = atoi(argv[3]); 115 | else b = MIN(m,MIN(k,8)); 116 | if (argc > 5) b2 = atoi(argv[5]); 117 | else b2 = m; 118 | if (argc > 6) niter = atoi(argv[6]); 119 | else niter = 10; 120 | if (argc > 7) transp_fact = atoi(argv[7]); 121 | else transp_fact = 1; 122 | if (myRank == 0){ 123 | printf("m=" PRId64 ", k=" PRId64 ", b = " PRId64 ", b2 = " PRId64 ", nprow = " PRId64 ", npcol = " PRId64 ", niter = " PRId64 ", transp_fact = " PRId64 "\n", 124 | m,k,b,b2,nprow,npcol,niter,transp_fact); 125 | } 126 | 127 | #ifdef TAU 128 | TAU_PROFILE_TIMER(timer, "main", "int (int, char**)", TAU_USER); 129 | TAU_PROFILE_START(timer); 130 | TAU_PROFILE_INIT(argc, argv); 131 | TAU_PROFILE_SET_NODE(myRank); 132 | TAU_PROFILE_SET_CONTEXT(0); 133 | #endif 134 | 135 | if (transp_fact == 1) 136 | qr_2d_bench(m, k, b2, b, nprow, npcol, niter, myRank, numPes, 0, cdt_glb); 137 | else { 138 | CommData_t cdt_glb_transp; 139 | int myrow, mycol; 140 | myrow = myRank / transp_fact; 141 | mycol = myRank % transp_fact; 142 | SETUP_SUB_COMM(cdt_glb, (cdt_glb_transp), (mycol*transp_fact+myrow), 0, numPes); 143 | qr_2d_bench(m, k, b2, b, nprow, npcol, niter, myRank, numPes, 0, cdt_glb_transp); 144 | 145 | } 146 | 
TAU_PROFILE_STOP(timer); 147 | 148 | COMM_EXIT; 149 | return 0; 150 | } 151 | -------------------------------------------------------------------------------- /bench/QR/bench_qr_2d_hh_scala.cxx: -------------------------------------------------------------------------------- 1 | /* Copyright (c) Edgar Solomonik 2015, all rights reserved. This code is part of the CANDMC repository, protected under a two-clause BSD license. */ 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | 14 | #include "CANDMC.h" 15 | 16 | using namespace std; 17 | 18 | /** 19 | * \brief Benchmark TSQR and HH reconstruction 20 | * 21 | * \param[in] m number of rows in A 22 | * \param[in] k number of columns in A 23 | * \param[in] b2 outer block size of A 24 | * \param[in] b block size of A 25 | * \param[in] nprow number of procesor rows 26 | * \param[in] npcol number of procesor columns 27 | * \param[in] niter number of iterations 28 | * \param[in] myRank rank in communicator column 29 | * \param[in] numPes number of processes in column 30 | * \param[in] req_id request id to use for send/recv 31 | * \param[in] comm MPI communicator for column 32 | **/ 33 | void scala_qr_2d_bench( int64_t const m, 34 | int64_t const k, 35 | int64_t const b2, 36 | int64_t const b, 37 | int64_t const nprow, 38 | int64_t const npcol, 39 | int64_t const niter, 40 | int64_t const myRank, 41 | int64_t const numPes, 42 | int64_t const req_id, 43 | CommData_t cdt_glb){ 44 | if (myRank == 0) 45 | printf("benchmarking Scalapack with Aggregation...\n"); 46 | #ifndef USE_SCALAPACK 47 | assert(0); 48 | #else 49 | double *A; 50 | double time; 51 | int64_t i,mb,iter,kb; 52 | 53 | int64_t seed_offset = 99900; 54 | CommData_t cdt_row, cdt_col; 55 | SETUP_SUB_COMM(cdt_glb, (cdt_row), 56 | myRank/nprow, 57 | myRank%nprow, 58 | npcol); 59 | SETUP_SUB_COMM(cdt_glb, (cdt_col), 60 | myRank%nprow, 61 | myRank/nprow, 62 | nprow); 63 | 64 | 
65 | mb = m / nprow; 66 | kb = k / npcol; 67 | srand48(seed_offset); 68 | 69 | assert(0==(posix_memalign((void**)&A, 70 | ALIGN_BYTES, 71 | mb*kb*sizeof(double)))); 72 | int icontxt, iam, inprocs, info; 73 | char cC = 'C'; 74 | int desc_A[9]; 75 | Cblacs_pinfo(&iam,&inprocs); 76 | Cblacs_get(-1, 0, &icontxt); 77 | int pr = cdt_col.np; 78 | int pc = cdt_row.np; 79 | Cblacs_gridinit(&icontxt, &cC, pr, pc); 80 | cdescinit(desc_A, m, k, 81 | b, b, 82 | 0, 0, 83 | icontxt, m/pr, 84 | &info); 85 | assert(info==0); 86 | 87 | time = MPI_Wtime(); 88 | for (iter=0; iter ./exe "); 121 | printf(" \n"); 122 | ABORT; 123 | } 124 | if (argc > 4) nprow = atoi(argv[4]); 125 | else { 126 | nprow = sqrt(numPes); 127 | while (numPes%nprow!=0) nprow++; 128 | } 129 | npcol = numPes/nprow; 130 | if (argc > 1) m = atoi(argv[1]); 131 | else m = 32*nprow; 132 | if (argc > 2) k = atoi(argv[2]); 133 | else k = 16*nprow; 134 | if (argc > 3) b = atoi(argv[3]); 135 | else b = MIN(m,MIN(k,8)); 136 | if (argc > 5) b2 = atoi(argv[5]); 137 | else b2 = m; 138 | if (argc > 6) niter = atoi(argv[6]); 139 | else niter = 10; 140 | if (argc > 7) transp_fact = atoi(argv[7]); 141 | else transp_fact = 1; 142 | if (myRank == 0){ 143 | printf("m=" PRId64 ", k=" PRId64 ", b = " PRId64 ", b2 = " PRId64 ", nprow = " PRId64 ", npcol = " PRId64 ", niter = " PRId64 ", transp_fact = " PRId64 "\n", 144 | m,k,b,b2,nprow,npcol,niter,transp_fact); 145 | } 146 | 147 | #ifdef TAU 148 | TAU_PROFILE_TIMER(timer, "main", "int (int, char**)", TAU_USER); 149 | TAU_PROFILE_START(timer); 150 | TAU_PROFILE_INIT(argc, argv); 151 | TAU_PROFILE_SET_NODE(myRank); 152 | TAU_PROFILE_SET_CONTEXT(0); 153 | #endif 154 | 155 | if (transp_fact == 1) 156 | scala_qr_2d_bench(m, k, b2, b, nprow, npcol, niter, myRank, numPes, 0, cdt_glb); 157 | else { 158 | CommData_t cdt_glb_transp; 159 | int myrow, mycol; 160 | myrow = myRank / transp_fact; 161 | mycol = myRank % transp_fact; 162 | SETUP_SUB_COMM(cdt_glb, (cdt_glb_transp), 
(mycol*transp_fact+myrow), 0, numPes); 163 | scala_qr_2d_bench(m, k, b2, b, nprow, npcol, niter, myRank, numPes, 0, cdt_glb_transp); 164 | 165 | } 166 | TAU_PROFILE_STOP(timer); 167 | 168 | COMM_EXIT; 169 | return 0; 170 | } 171 | -------------------------------------------------------------------------------- /bench/QR/bench_qr_butterfly_2d.cxx: -------------------------------------------------------------------------------- 1 | /* Copyright (c) Edgar Solomonik 2015, all rights reserved. This code is part of the CANDMC repository, protected under a two-clause BSD license. */ 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include "CANDMC.h" 14 | using namespace std; 15 | 16 | /** 17 | * \brief Benchmark TSQR and HH reconstruction 18 | * 19 | * \param[in] m number of rows in A 20 | * \param[in] k number of columns in A 21 | * \param[in] b block size of A 22 | * \param[in] nprow number of procesor rows 23 | * \param[in] npcol number of procesor columns 24 | * \param[in] niter number of iterations 25 | * \param[in] myRank rank in communicator column 26 | * \param[in] numPes number of processes in column 27 | * \param[in] req_id request id to use for send/recv 28 | * \param[in] comm MPI communicator for column 29 | **/ 30 | void qr_butterfly_2d_bench( int64_t const m, 31 | int64_t const k, 32 | int64_t const b, 33 | int64_t const nprow, 34 | int64_t const npcol, 35 | int64_t const niter, 36 | int64_t const myRank, 37 | int64_t const numPes, 38 | int64_t const req_id, 39 | CommData_t cdt_glb){ 40 | if (myRank == 0) 41 | printf("benchmarking 2D QR with TSQR implicit update...\n"); 42 | double *A; 43 | double time; 44 | int64_t i,mb,iter,kb; 45 | 46 | int64_t seed_offset = 99900; 47 | CommData_t cdt_row, cdt_col; 48 | SETUP_SUB_COMM(cdt_glb, (cdt_row), 49 | myRank/nprow, 50 | myRank%nprow, 51 | npcol); 52 | SETUP_SUB_COMM(cdt_glb, (cdt_col), 53 | myRank%nprow, 54 | 
myRank/nprow, 55 | nprow); 56 | 57 | 58 | mb = m / nprow; 59 | kb = k / npcol; 60 | srand48(seed_offset); 61 | 62 | assert(0==(posix_memalign((void**)&A, 63 | ALIGN_BYTES, 64 | mb*kb*sizeof(double)))); 65 | 66 | time = MPI_Wtime(); 67 | for (iter=0; iter ./exe "); 92 | printf(" \n"); 93 | ABORT; 94 | } 95 | if (argc > 4) nprow = atoi(argv[4]); 96 | else { 97 | nprow = sqrt(numPes); 98 | while (numPes%nprow!=0) nprow++; 99 | } 100 | npcol = numPes/nprow; 101 | if (argc > 1) m = atoi(argv[1]); 102 | else m = 32*nprow; 103 | if (argc > 2) k = atoi(argv[2]); 104 | else k = 16*nprow; 105 | if (argc > 3) b = atoi(argv[3]); 106 | else b = MIN(m,MIN(k,8)); 107 | if (argc > 5) niter = atoi(argv[5]); 108 | else niter = 10; 109 | if (argc > 6) transp_fact = atoi(argv[6]); 110 | else transp_fact = 1; 111 | if (myRank == 0){ 112 | printf("m=" PRId64 ", k=" PRId64 ", b = " PRId64 ", nprow = " PRId64 ", npcol = " PRId64 ", niter = " PRId64 ", transp_fact = " PRId64 "\n", 113 | m,k,b,nprow,npcol,niter,transp_fact); 114 | } 115 | 116 | #ifdef TAU 117 | TAU_PROFILE_TIMER(timer, "main", "int (int, char**)", TAU_USER); 118 | TAU_PROFILE_START(timer); 119 | TAU_PROFILE_INIT(argc, argv); 120 | TAU_PROFILE_SET_NODE(myRank); 121 | TAU_PROFILE_SET_CONTEXT(0); 122 | #endif 123 | 124 | if (transp_fact == 1) 125 | qr_butterfly_2d_bench(m, k, b, nprow, npcol, niter, myRank, numPes, 0, cdt_glb); 126 | else { 127 | CommData_t cdt_glb_transp; 128 | int myrow, mycol; 129 | myrow = myRank / transp_fact; 130 | mycol = myRank % transp_fact; 131 | SETUP_SUB_COMM(cdt_glb, (cdt_glb_transp), (mycol*transp_fact+myrow), 0, numPes); 132 | qr_butterfly_2d_bench(m, k, b, nprow, npcol, niter, myRank, numPes, 0, cdt_glb_transp); 133 | 134 | } 135 | TAU_PROFILE_STOP(timer); 136 | 137 | COMM_EXIT; 138 | return 0; 139 | } 140 | -------------------------------------------------------------------------------- /bench/QR/bench_qr_seq.cxx: -------------------------------------------------------------------------------- 
1 | /* Copyright (c) Edgar Solomonik 2015, all rights reserved. This code is part of the CANDMC repository, protected under a two-clause BSD license. */ 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | 15 | #include "../../alg/shared/util.h" 16 | #include "../../alg/shared/comm.h" 17 | 18 | using namespace std; 19 | 20 | /** 21 | * \brief Benchmark lapack QR routines 22 | * 23 | * \param[in] m numer of rows in A 24 | * \param[in] k numer of columns in A 25 | * \param[in] b block size of A 26 | * \param[in] nprow numer of procesor rows 27 | * \param[in] npcol numer of procesor columns 28 | * \param[in] niter numer of iterations 29 | **/ 30 | void qr_seq_bench(int64_t const m, 31 | int64_t const k, 32 | int64_t const b, 33 | int64_t const niter){ 34 | printf("benchmarking sequential QR\n"); 35 | double * A, * buf, * tau; 36 | double * B; 37 | double time_qr, time_ap, tick; 38 | int64_t i, iter; 39 | int info; 40 | 41 | int64_t seed_offset = 99900; 42 | 43 | srand48(seed_offset); 44 | 45 | assert(0==(posix_memalign((void**)&A, 46 | ALIGN_BYTES, 47 | m*k*sizeof(double)))); 48 | assert(0==(posix_memalign((void**)&B, 49 | ALIGN_BYTES, 50 | m*k*sizeof(double)))); 51 | assert(0==(posix_memalign((void**)&buf, 52 | ALIGN_BYTES, 53 | m*k*sizeof(double)))); 54 | assert(0==(posix_memalign((void**)&tau, 55 | ALIGN_BYTES, 56 | m*k*sizeof(double)))); 57 | 58 | time_qr = 0.0; 59 | time_ap = 0.0; 60 | for (iter=0; iter "); 119 | printf(" \n"); 120 | ABORT; 121 | } 122 | if (argc > 1) m = atoi(argv[1]); 123 | else m = 64; 124 | if (argc > 2) k = atoi(argv[2]); 125 | else k = 32; 126 | if (argc > 3) b = atoi(argv[3]); 127 | else b = 8; 128 | if (argc > 4) niter = atoi(argv[4]); 129 | else niter = 10; 130 | if (myRank == 0){ 131 | printf("m=" PRId64 ", k=" PRId64 ", b = " PRId64 ", niter = " PRId64 "\n", 132 | m,k,b,niter); 133 | } 134 | 135 | if (myRank == 0) 136 | 
qr_seq_bench(m, k, b, niter); 137 | 138 | COMM_EXIT; 139 | return 0; 140 | } 141 | -------------------------------------------------------------------------------- /bench/QR/bench_qr_tree_2d.cxx: -------------------------------------------------------------------------------- 1 | /* Copyright (c) Edgar Solomonik 2015, all rights reserved. This code is part of the CANDMC repository, protected under a two-clause BSD license. */ 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | 14 | #include "CANDMC.h" 15 | 16 | using namespace std; 17 | 18 | /** 19 | * \brief Benchmark TSQR and HH reconstruction 20 | * 21 | * \param[in] m number of rows in A 22 | * \param[in] k number of columns in A 23 | * \param[in] b block size of A 24 | * \param[in] nprow number of procesor rows 25 | * \param[in] npcol number of procesor columns 26 | * \param[in] niter number of iterations 27 | * \param[in] myRank rank in communicator column 28 | * \param[in] numPes number of processes in column 29 | * \param[in] req_id request id to use for send/recv 30 | * \param[in] comm MPI communicator for column 31 | **/ 32 | void qr_tree_2d_bench( int64_t const m, 33 | int64_t const k, 34 | int64_t const b, 35 | int64_t const nprow, 36 | int64_t const npcol, 37 | int64_t const niter, 38 | int64_t const myRank, 39 | int64_t const numPes, 40 | int64_t const req_id, 41 | CommData_t cdt_glb){ 42 | if (myRank == 0) 43 | printf("benchmarking 2D QR with TSQR implicit update...\n"); 44 | double *A; 45 | double time; 46 | int64_t i,mb,iter,kb; 47 | 48 | int64_t seed_offset = 99900; 49 | CommData_t cdt_row, cdt_col; 50 | SETUP_SUB_COMM(cdt_glb, (cdt_row), 51 | myRank/nprow, 52 | myRank%nprow, 53 | npcol); 54 | SETUP_SUB_COMM(cdt_glb, (cdt_col), 55 | myRank%nprow, 56 | myRank/nprow, 57 | nprow); 58 | 59 | 60 | mb = m / nprow; 61 | kb = k / npcol; 62 | srand48(seed_offset); 63 | 64 | 
assert(0==(posix_memalign((void**)&A, 65 | ALIGN_BYTES, 66 | mb*kb*sizeof(double)))); 67 | 68 | time = MPI_Wtime(); 69 | for (iter=0; iter ./exe "); 94 | printf(" \n"); 95 | ABORT; 96 | } 97 | if (argc > 4) nprow = atoi(argv[4]); 98 | else { 99 | nprow = sqrt(numPes); 100 | while (numPes%nprow!=0) nprow++; 101 | } 102 | npcol = numPes/nprow; 103 | if (argc > 1) m = atoi(argv[1]); 104 | else m = 32*nprow; 105 | if (argc > 2) k = atoi(argv[2]); 106 | else k = 16*nprow; 107 | if (argc > 3) b = atoi(argv[3]); 108 | else b = MIN(m,MIN(k,8)); 109 | if (argc > 5) niter = atoi(argv[5]); 110 | else niter = 10; 111 | if (argc > 6) transp_fact = atoi(argv[6]); 112 | else transp_fact = 1; 113 | if (myRank == 0){ 114 | printf("m=" PRId64 ", k=" PRId64 ", b = " PRId64 ", nprow = " PRId64 ", npcol = " PRId64 ", niter = " PRId64 ", transp_fact = " PRId64 "\n", 115 | m,k,b,nprow,npcol,niter,transp_fact); 116 | } 117 | 118 | #ifdef TAU 119 | TAU_PROFILE_TIMER(timer, "main", "int (int, char**)", TAU_USER); 120 | TAU_PROFILE_START(timer); 121 | TAU_PROFILE_INIT(argc, argv); 122 | TAU_PROFILE_SET_NODE(myRank); 123 | TAU_PROFILE_SET_CONTEXT(0); 124 | #endif 125 | 126 | if (transp_fact == 1) 127 | qr_tree_2d_bench(m, k, b, nprow, npcol, niter, myRank, numPes, 0, cdt_glb); 128 | else { 129 | CommData_t cdt_glb_transp; 130 | int myrow, mycol; 131 | myrow = myRank / transp_fact; 132 | mycol = myRank % transp_fact; 133 | SETUP_SUB_COMM(cdt_glb, (cdt_glb_transp), (mycol*transp_fact+myrow), 0, numPes); 134 | qr_tree_2d_bench(m, k, b, nprow, npcol, niter, myRank, numPes, 0, cdt_glb_transp); 135 | 136 | } 137 | TAU_PROFILE_STOP(timer); 138 | 139 | COMM_EXIT; 140 | return 0; 141 | } 142 | -------------------------------------------------------------------------------- /bench/QR/bench_qr_y2d.cxx: -------------------------------------------------------------------------------- 1 | /* Copyright (c) Edgar Solomonik 2015, all rights reserved. 
This code is part of the CANDMC repository, protected under a two-clause BSD license. */ 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | 14 | #include "CANDMC.h" 15 | using namespace std; 16 | 17 | /** 18 | * \brief Benchmark Yamamoto-s algorithm 19 | * 20 | * \param[in] m number of rows in A 21 | * \param[in] k number of columns in A 22 | * \param[in] b2 outer block size of A 23 | * \param[in] b block size of A 24 | * \param[in] nprow number of procesor rows 25 | * \param[in] npcol number of procesor columns 26 | * \param[in] niter number of iterations 27 | * \param[in] myRank rank in communicator column 28 | * \param[in] numPes number of processes in column 29 | * \param[in] req_id request id to use for send/recv 30 | * \param[in] comm MPI communicator for column 31 | **/ 32 | void qr_y2d_bench(int64_t m, 33 | int64_t k, 34 | int64_t b2, 35 | int64_t b, 36 | int64_t nprow, 37 | int64_t npcol, 38 | int64_t niter, 39 | int64_t myRank, 40 | int64_t numPes, 41 | int64_t req_id, 42 | CommData_t cdt_glb){ 43 | if (myRank == 0) 44 | printf("benchmarking parallel TSQR with Yamamoto...\n"); 45 | double *A; 46 | double time; 47 | int64_t i,mb,iter,kb; 48 | 49 | int64_t seed_offset = 99900; 50 | CommData_t cdt_row, cdt_col; 51 | SETUP_SUB_COMM(cdt_glb, (cdt_row), 52 | myRank/nprow, 53 | myRank%nprow, 54 | npcol); 55 | SETUP_SUB_COMM(cdt_glb, (cdt_col), 56 | myRank%nprow, 57 | myRank/nprow, 58 | nprow); 59 | 60 | 61 | mb = m / nprow; 62 | kb = k / npcol; 63 | srand48(seed_offset); 64 | 65 | assert(0==(posix_memalign((void**)&A, 66 | ALIGN_BYTES, 67 | mb*kb*sizeof(double)))); 68 | time = MPI_Wtime(); 69 | for (iter=0; iter ./exe "); 101 | printf(" \n"); 102 | ABORT; 103 | } 104 | if (argc > 4) nprow = atoi(argv[4]); 105 | else { 106 | nprow = sqrt(numPes); 107 | while (numPes%nprow!=0) nprow++; 108 | } 109 | npcol = numPes/nprow; 110 | if (argc > 1) m = atoi(argv[1]); 111 | 
else m = 32*nprow; 112 | if (argc > 2) k = atoi(argv[2]); 113 | else k = 16*nprow; 114 | if (argc > 3) b = atoi(argv[3]); 115 | else b = MIN(m,MIN(k,8)); 116 | if (argc > 5) b2 = atoi(argv[5]); 117 | else b2 = m; 118 | if (argc > 6) niter = atoi(argv[6]); 119 | else niter = 10; 120 | if (argc > 7) transp_fact = atoi(argv[7]); 121 | else transp_fact = 1; 122 | if (myRank == 0){ 123 | printf("m=" PRId64 ", k=" PRId64 ", b = " PRId64 ", b2 = " PRId64 ", nprow = " PRId64 ", npcol = " PRId64 ", niter = " PRId64 ", transp_fact = " PRId64 "\n", 124 | m,k,b,b2,nprow,npcol,niter,transp_fact); 125 | } 126 | 127 | // if (b2 < min(m,k)) 128 | // printf("ERROR: need b2>=min(m,k), because Yamamoto's update aggregation is not yet implemented\n"); 129 | 130 | #ifdef TAU 131 | TAU_PROFILE_TIMER(timer, "main", "int (int, char**)", TAU_USER); 132 | TAU_PROFILE_START(timer); 133 | TAU_PROFILE_INIT(argc, argv); 134 | TAU_PROFILE_SET_NODE(myRank); 135 | TAU_PROFILE_SET_CONTEXT(0); 136 | #endif 137 | 138 | if (transp_fact == 1) 139 | qr_y2d_bench(m, k, b2, b, nprow, npcol, niter, myRank, numPes, 0, cdt_glb); 140 | else { 141 | CommData_t cdt_glb_transp; 142 | int myrow, mycol; 143 | myrow = myRank / transp_fact; 144 | mycol = myRank % transp_fact; 145 | SETUP_SUB_COMM(cdt_glb, (cdt_glb_transp), (mycol*transp_fact+myrow), 0, numPes); 146 | qr_y2d_bench(m, k, b2, b, nprow, npcol, niter, myRank, numPes, 0, cdt_glb_transp); 147 | 148 | } 149 | TAU_PROFILE_STOP(timer); 150 | 151 | COMM_EXIT; 152 | return 0; 153 | } 154 | -------------------------------------------------------------------------------- /bench/SE/Makefile: -------------------------------------------------------------------------------- 1 | include ../../config.mk 2 | 3 | INCLUDES := -I../../include/ 4 | BIN_DIR = ../../bin/benchmarks 5 | LIB_DIR = ../../lib 6 | 7 | SE_BENCHMARKS = bench_scala_sym_eig bench_full2band bench_elpa_sym_eig bench_full2band_3d 8 | .PHONY: $(SE_BENCHMARKS) 9 | bench_scala_sym_eig SE_benchmarks: 
$(BIN_DIR)/bench_scala_sym_eig 10 | bench_elpa_sym_eig SE_benchmarks: $(BIN_DIR)/bench_elpa_sym_eig 11 | bench_full2band SE_benchmarks: $(BIN_DIR)/bench_full2band 12 | bench_full2band_3d SE_benchmarks: $(BIN_DIR)/bench_full2band_3d 13 | 14 | #$(BIN_DIR)/compare_sytrd 15 | 16 | $(BIN_DIR)/bench_elpa_sym_eig: bench_elpa_sym_eig.cxx $(LIB_DIR)/libCANSE.a $(LIB_DIR)/libCANShared.a 17 | ifneq (,$(findstring DUSE_ELPA,$(DEFS))) 18 | $(CXX) -o $(BIN_DIR)/bench_elpa_sym_eig bench_elpa_sym_eig.cxx $(CXXFLAGS) $(DEFS) $(INCLUDES) \ 19 | -L$(LIB_DIR) -lCANSE -lCANShared $(BLAS_LIBS) $(LDFLAGS) -lm 20 | endif 21 | 22 | $(BIN_DIR)/bench_scala_sym_eig: bench_scala_sym_eig.cxx $(LIB_DIR)/libCANSE.a $(LIB_DIR)/libCANShared.a 23 | ifneq (,$(findstring DUSE_SCALAPACK,$(DEFS))) 24 | $(CXX) -o $(BIN_DIR)/bench_scala_sym_eig bench_scala_sym_eig.cxx $(CXXFLAGS) $(DEFS) $(INCLUDES) \ 25 | -L$(LIB_DIR) -lCANSE -lCANShared $(BLAS_LIBS) $(LDFLAGS) -lm 26 | endif 27 | 28 | $(BIN_DIR)/bench_full2band: bench_full2band.cxx $(LIB_DIR)/libCANSE.a $(LIB_DIR)/libCANShared.a 29 | ifneq (,$(findstring DUSE_SCALAPACK,$(DEFS))) 30 | $(CXX) -o $(BIN_DIR)/bench_full2band bench_full2band.cxx $(CXXFLAGS) $(DEFS) $(INCLUDES) \ 31 | -L$(LIB_DIR) -lCANSE -lCANQR -lCANShared $(BLAS_LIBS) $(LDFLAGS) -lm 32 | endif 33 | 34 | $(BIN_DIR)/bench_full2band_3d: bench_full2band_3d.cxx $(LIB_DIR)/libCANSE.a $(LIB_DIR)/libCANShared.a 35 | ifneq (,$(findstring DUSE_SCALAPACK,$(DEFS))) 36 | $(CXX) -o $(BIN_DIR)/bench_full2band_3d bench_full2band_3d.cxx $(CXXFLAGS) $(DEFS) $(INCLUDES) \ 37 | -L$(LIB_DIR) -lCANSE -lCANQR -lCANShared $(BLAS_LIBS) $(LDFLAGS) -lm 38 | endif 39 | 40 | #$(BIN_DIR)/compare_sytrd: compare_sytrd.cxx $(LIB_DIR)/libCANSE.a $(LIB_DIR)/libCANShared.a 41 | # $(CXX) -o $(BIN_DIR)/compare_sytrd compare_sytrd.cxx $(CXXFLAGS) $(DEFS) $(INCLUDES) \ 42 | # -L$(LIB_DIR) -lCANSE -lCANShared $(BLAS_LIBS) $(LDFLAGS) -lm 43 | 44 | clean: 45 | rm -f *.o 46 | 
-------------------------------------------------------------------------------- /bench/SE/bench_full2band.cxx: -------------------------------------------------------------------------------- 1 | /* Copyright (c) Edgar Solomonik 2015, all rights reserved. This code is part of the CANDMC repository, protected under a two-clause BSD license. */ 2 | #include "mpi.h" 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include "CANDMC.h" 9 | #include "../../alg/shared/util.h" 10 | 11 | #define NUM_ITER 3 12 | 13 | static 14 | char* getopt(char ** begin, char ** end, const std::string & option){ 15 | char ** itr = std::find(begin, end, option); 16 | if (itr != end && ++itr != end){ 17 | return *itr; 18 | } 19 | return 0; 20 | } 21 | 22 | 23 | 24 | int main(int argc, char **argv) { 25 | int myRank, numPes, niter, pr, pc, ipr, ipc, iter; 26 | int64_t n, b, i, b_agg; 27 | double * loc_A; 28 | double time; 29 | 30 | CommData_t cdt_glb; 31 | CommData_t cdt_row, cdt_col; 32 | INIT_COMM(numPes, myRank, 1, cdt_glb); 33 | CommData_t cdt_diag; 34 | 35 | 36 | if (myRank == 0) 37 | printf("Usage: %s -n 'matrix dimension' -b 'distribution blocking factor' -b_agg 'aggregation blocking factor' -niter 'number of iterations'\n", argv[0]); 38 | 39 | pr = sqrt(numPes); 40 | if (pr != sqrt(numPes)){ 41 | if (myRank == 0) 42 | printf("Full to banded benchmark needs square processor grid, terminating...\n"); 43 | return 0; 44 | } 45 | assert(numPes%pr == 0); 46 | if ( getopt(argv, argv+argc, "-niter") && 47 | atoi(getopt(argv, argv+argc, "-niter")) > 0 ) 48 | niter = atoi(getopt(argv, argv+argc, "-niter")); 49 | else 50 | niter = NUM_ITER; 51 | if ( getopt(argv, argv+argc, "-b") && 52 | atoi(getopt(argv, argv+argc, "-b")) > 0 ) 53 | b = atoi(getopt(argv, argv+argc, "-b")); 54 | else 55 | b = 16; 56 | if ( getopt(argv, argv+argc, "-b_agg") && 57 | atoi(getopt(argv, argv+argc, "-b_agg")) > 0 ) 58 | b_agg = atoi(getopt(argv, argv+argc, "-b_agg")); 59 | else 60 | b_agg = 32; 61 
| if ( getopt(argv, argv+argc, "-n") && 62 | atoi(getopt(argv, argv+argc, "-n")) > 0 ) 63 | n = atoi(getopt(argv, argv+argc, "-n")); 64 | else 65 | n = 8*b*pr; 66 | 67 | if (myRank == 0) 68 | printf("Executed as '%s -n %ld b_agg %ld -b %ld -niter %d'\n", 69 | argv[0], n, b_agg, b, niter); 70 | 71 | if (numPes % pr != 0) { 72 | if (myRank == 0){ 73 | printf("%d mod %d != 0 Number of processor grid ", numPes, pr); 74 | printf("rows must divide into number of processors\n"); 75 | } 76 | MPI_Abort(MPI_COMM_WORLD, -1); 77 | } 78 | if (n % pr != 0) { 79 | if (myRank == 0){ 80 | printf("%ld mod %d != 0 Number of processor grid ", n, pr); 81 | printf("rows must divide into the matrix dimension\n"); 82 | } 83 | MPI_Abort(MPI_COMM_WORLD, -1); 84 | } 85 | pc = numPes / pr; 86 | if (numPes % pr != 0) { 87 | if (myRank == 0){ 88 | printf("%ld mod %d != 0 Number of processor grid ", n, pc); 89 | printf("columns must divide into the matrix dimension\n"); 90 | } 91 | MPI_Abort(MPI_COMM_WORLD, -1); 92 | } 93 | ipc = myRank / pr; 94 | ipr = myRank % pr; 95 | 96 | 97 | if (myRank == 0){ 98 | printf("Benchmarking symmetric eigensolve full to bandwidth %ld of ",b_agg); 99 | printf("%ld-by-%ld matrix with block size %ld\n",n,n,b); 100 | printf("Using %d processors in %d-by-%d grid.\n", numPes, pr, pc); 101 | } 102 | SETUP_SUB_COMM(cdt_glb, cdt_row, 103 | myRank/pr, 104 | myRank%pr, 105 | pc); 106 | SETUP_SUB_COMM(cdt_glb, cdt_col, 107 | myRank%pr, 108 | myRank/pr, 109 | pr); 110 | if (ipr == ipc){ 111 | SETUP_SUB_COMM(cdt_glb, cdt_diag, 112 | ipr, 113 | 0, 114 | pr); 115 | } else { 116 | SETUP_SUB_COMM(cdt_glb, cdt_diag, 117 | myRank, 118 | 1, 119 | pr); 120 | } 121 | TAU_PROFILE_TIMER(timer, "main", "int (int, char**)", TAU_USER); 122 | TAU_PROFILE_START(timer); 123 | TAU_PROFILE_SET_NODE(myRank); 124 | TAU_PROFILE_SET_CONTEXT(0); 125 | 126 | loc_A = (double*)malloc(n*n*sizeof(double)/numPes); 127 | srand48(666*myRank); 128 | 129 | time = MPI_Wtime(); 130 | 131 | CTF_Timer_epoch 
ep1("full2band_with_TSQR"); 132 | ep1.begin(); 133 | for (iter=0; iter 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include "tnmt_pvt.h" 8 | //#include "../shared/comm.h" 9 | #include "unit_test.h" 10 | #include "../shared/util.h" 11 | #include "../shared/seq_lu.h" 12 | 13 | /* test parallel tournament pivoting parallel swap function */ 14 | void par_pivot_unit_test(int b_sm, 15 | int mat_dim, 16 | int myRank, 17 | int numPes, 18 | int req_id, 19 | CommData cdt){ 20 | if (myRank == 0) 21 | printf("unit testing block cyclic parallel tournament pivoting...\n"); 22 | double *A,*A_buf,*R,*R_out; 23 | int *P, *P_br, *P_I; 24 | int i,j,row,col,info; 25 | double * max_norm_tnmt = (double*)malloc(sizeof(double)); 26 | double * frb_norm_tnmt = (double*)malloc(sizeof(double)); 27 | double * tot_max_norm_tnmt = (double*)malloc(sizeof(double)); 28 | double * tot_frb_norm_tnmt = (double*)malloc(sizeof(double)); 29 | int idx_off; 30 | bool passed = true; 31 | double val; 32 | 33 | const int mat_subdim = mat_dim/numPes; 34 | 35 | int seed_offset = 1000; 36 | 37 | assert(0==(posix_memalign((void**)&A, 38 | ALIGN_BYTES, 39 | 2*mat_subdim*b_sm*sizeof(double)))); 40 | assert(0==(posix_memalign((void**)&A_buf, 41 | ALIGN_BYTES, 42 | 2*mat_subdim*b_sm*sizeof(double)))); 43 | assert(0==(posix_memalign((void**)&R, 44 | ALIGN_BYTES, 45 | 2*mat_subdim*b_sm*sizeof(double)))); 46 | assert(0==(posix_memalign((void**)&R_out, 47 | ALIGN_BYTES, 48 | 2*mat_subdim*b_sm*sizeof(double)))); 49 | assert(0==(posix_memalign((void**)&P_br, 50 | ALIGN_BYTES, 51 | 4*mat_subdim*sizeof(int)))); 52 | assert(0==(posix_memalign((void**)&P, 53 | ALIGN_BYTES, 54 | 4*mat_subdim*sizeof(int)))); 55 | assert(0==(posix_memalign((void**)&P_I, 56 | ALIGN_BYTES, 57 | 4*mat_dim*sizeof(int)))); 58 | 59 | 60 | for (idx_off=0; idx_off < mat_subdim; idx_off+=b_sm){ 61 | RANK_PRINTF(myRank,0,"idx_off=%d\n",idx_off); 62 | for (i=0; i 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include "tnmt_pvt.h" 8 | 
//#include "../shared/comm.h" 9 | #include "unit_test.h" 10 | #include "../shared/util.h" 11 | #include "../shared/seq_lu.h" 12 | 13 | /* test parallel tournament pivoting 14 | * b is the size of the panel */ 15 | void par_tnmt_unit_test(int b, int myRank, int numPes, int req_id, CommData *cdt){ 16 | if (myRank == 0) printf("unit testing parallel tournament pivoting...\n"); 17 | double *A,*A_buf,*R,*whole_A; 18 | int *P, *P_br, *P_I; 19 | int i,j,row,col,info; 20 | //double frb_norm_tnmt[1], max_norm_tnmt[1]; 21 | double * max_norm_tnmt = (double*)malloc(sizeof(double)); 22 | double * frb_norm_tnmt = (double*)malloc(sizeof(double)); 23 | // double frb_norm_gepp, max_norm_gepp; 24 | double * tot_max_norm_tnmt = (double*)malloc(sizeof(double)); 25 | double * tot_frb_norm_tnmt = (double*)malloc(sizeof(double)); 26 | // double tot_frb_norm_gepp, tot_max_norm_gepp; 27 | 28 | int seed_offset = 1000; 29 | 30 | assert(0==(posix_memalign((void**)&A, 31 | ALIGN_BYTES, 32 | 2*b*b*sizeof(double)))); 33 | assert(0==(posix_memalign((void**)&A_buf, 34 | ALIGN_BYTES, 35 | 2*b*b*sizeof(double)))); 36 | assert(0==(posix_memalign((void**)&R, 37 | ALIGN_BYTES, 38 | 2*b*b*sizeof(double)))); 39 | assert(0==(posix_memalign((void**)&whole_A, 40 | ALIGN_BYTES, 41 | numPes*b*b*sizeof(double)))); 42 | assert(0==(posix_memalign((void**)&P_br, 43 | ALIGN_BYTES, 44 | 3*b*sizeof(int)))); 45 | assert(0==(posix_memalign((void**)&P, 46 | ALIGN_BYTES, 47 | 2*b*sizeof(int)))); 48 | assert(0==(posix_memalign((void**)&P_I, 49 | ALIGN_BYTES, 50 | numPes*b*sizeof(int)))); 51 | 52 | 53 | for (col=0; col= myRank*b && P[i] < (myRank+1)*b){ 79 | for (j=0; j 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include "tnmt_pvt.h" 8 | #include "unit_test.h" 9 | #include "../shared/util.h" 10 | #include "../shared/seq_lu.h" 11 | 12 | /* test sequetnial tournament pivoting 13 | * b is the size of the panel */ 14 | void seq_tnmt_unit_test(int b){ 15 | printf("unit testing local tournament pivoting...\n"); 16 
| double *A,*B; 17 | int *P, *P_br, *P_I; 18 | int i,row,col,info; 19 | double frb_norm_tnmt, max_norm_tnmt; 20 | double frb_norm_gepp, max_norm_gepp; 21 | 22 | int seed_offset = 1000; 23 | 24 | assert(0==(posix_memalign((void**)&A, 25 | ALIGN_BYTES, 26 | 2*b*b*sizeof(double)))); 27 | assert(0==(posix_memalign((void**)&B, 28 | ALIGN_BYTES, 29 | 2*b*b*sizeof(double)))); 30 | assert(0==(posix_memalign((void**)&P_br, 31 | ALIGN_BYTES, 32 | b*sizeof(int)))); 33 | assert(0==(posix_memalign((void**)&P, 34 | ALIGN_BYTES, 35 | 2*b*sizeof(int)))); 36 | assert(0==(posix_memalign((void**)&P_I, 37 | ALIGN_BYTES, 38 | 2*b*sizeof(int)))); 39 | 40 | for (col=0; col= frb_norm_tnmt/100. && max_norm_gepp >= max_norm_tnmt/100.){ 80 | printf("test passed (local tournament pivoting test)\n"); 81 | } else { 82 | printf("TEST FAILED (local tournament pivoting test)\n"); 83 | printf("diff between frb = %E\n",frb_norm_tnmt-frb_norm_gepp); 84 | printf("diff between max = %E\n",max_norm_tnmt-max_norm_gepp); 85 | } 86 | 87 | free(A); 88 | free(B); 89 | free(P_br); 90 | free(P); 91 | free(P_I); 92 | } 93 | -------------------------------------------------------------------------------- /test/LU/unit_test.cxx: -------------------------------------------------------------------------------- 1 | /* Copyright (c) Edgar Solomonik 2015, all rights reserved. This code is part of the CANDMC repository, protected under a two-clause BSD license. 
*/ 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include "CANDMC.h" 8 | #include "unit_test.h" 9 | 10 | /* confirms LU factorization of a matrix 11 | * with arbitrary pivoting 12 | * assumes the matrix was creating with 13 | * A[row,col] = rand48() with seed48(seed+col*dim + row) */ 14 | void pvt_con_lu(int const nrows, 15 | int const ncols, 16 | int const seed, 17 | double const* LU, 18 | int const* P){ 19 | 20 | 21 | int row, col, i, j, k; 22 | 23 | double *A, *A_piv, div; 24 | 25 | assert(ncols > 0); 26 | assert(nrows >= ncols); 27 | assert(seed >= 0); 28 | 29 | assert(0==(posix_memalign((void**)&A, 30 | ALIGN_BYTES, 31 | nrows*ncols*sizeof(double)))); 32 | assert(0==(posix_memalign((void**)&A_piv, 33 | ALIGN_BYTES, 34 | nrows*ncols*sizeof(double)))); 35 | for (col=0; col 1E-6 && 64 | fabs((LU[i*nrows+j] - A_piv[i*nrows+j])/A_piv[i*nrows+j]) > 1E-6){ 65 | DEBUG_PRINTF("LU[%d][%d] = %lf, should have been %lf\n", 66 | j,i,LU[i*nrows+j],A_piv[i*nrows+j]); 67 | correct = false; 68 | } 69 | } 70 | } 71 | if (correct) printf("given the pivot matrix, the answer is CORRECT\n"); 72 | else printf("given the pivot matrix, the answer is INCORRECT\n"); 73 | 74 | double * max_norm_tnmt = (double*)malloc(sizeof(double)); 75 | double * frb_norm_tnmt = (double*)malloc(sizeof(double)); 76 | backerr_lu(nrows,ncols,seed,A_piv, 77 | P,frb_norm_tnmt,max_norm_tnmt, 78 | 0,0,nrows,ncols); 79 | printf("with this pivot matrix blas 2 LU gets backward norms |(A-LU)|, frobenius = %E, max = %E\n", 80 | frb_norm_tnmt[0],max_norm_tnmt[0]); 81 | } 82 | /* confirms LU factorization of a matrix 83 | * by computing the backward error norm 84 | * assumes the matrix was creating with 85 | * A[row,col] = rand48() with seed48(seed+col*dim + row) */ 86 | void backerr_lu(int const nrows, 87 | int const ncols, 88 | int const seed, 89 | double const* LU, 90 | int const* P, 91 | double* frb_norm, 92 | double* max_norm, 93 | int const row_st, 94 | int const col_st, 95 | int const 
num_row_chk, 96 | int const num_col_chk){ 97 | 98 | 99 | int row, col, i; 100 | 101 | double *A, *A_piv, val, err; 102 | 103 | assert(ncols > 0); 104 | assert(nrows >= ncols); 105 | assert(seed >= 0); 106 | 107 | assert(0==(posix_memalign((void**)&A, 108 | ALIGN_BYTES, 109 | nrows*ncols*sizeof(double)))); 110 | assert(0==(posix_memalign((void**)&A_piv, 111 | ALIGN_BYTES, 112 | nrows*ncols*sizeof(double)))); 113 | for (col=0; col 1) b_sm = atoi(argv[1]); 160 | else b_sm = 16; 161 | 162 | if (argc > 2) b_lrg = atoi(argv[2]); 163 | else b_lrg = b_sm*sqrt(numPes)*2; 164 | 165 | if (argc > 3) n = atoi(argv[3]); 166 | else n = b_lrg*2; 167 | 168 | if (argc > 4) c_rep = atoi(argv[4]); 169 | else c_rep = 1; 170 | 171 | if (argc > 5) test_mask = atoi(argv[5]); 172 | else test_mask = 0x1F; 173 | 174 | if (myRank == 0) { 175 | printf("starting unit tests for 2.5D LU."); 176 | printf("b_sm=%d, b_lrg=%d, n=%d\n",b_sm,b_lrg,n); 177 | } 178 | 179 | if (test_mask&0x1){ 180 | if (myRank == 0) { 181 | seq_tnmt_unit_test(b_sm); 182 | } 183 | GLOBAL_BARRIER(cdt_glb); 184 | } 185 | if (test_mask&0x2){ 186 | par_tnmt_unit_test(b_sm, myRank, numPes, 0, cdt_glb); 187 | GLOBAL_BARRIER(cdt_glb); 188 | } 189 | if (test_mask&0x4){ 190 | par_pivot_unit_test(b_sm, n, myRank, numPes, 0, cdt_glb); 191 | GLOBAL_BARRIER(cdt_glb); 192 | } 193 | if (test_mask&0x8){ 194 | lu_25d_unit_test(n, b_sm, b_lrg, myRank, numPes, c_rep, cdt_glb); 195 | GLOBAL_BARRIER(cdt_glb); 196 | } 197 | if (test_mask&0x10){ 198 | lu_25d_pvt_unit_test(n, b_sm, b_lrg, myRank, numPes, c_rep, cdt_glb); 199 | GLOBAL_BARRIER(cdt_glb); 200 | } 201 | COMM_EXIT; 202 | return 0; 203 | } 204 | #endif 205 | -------------------------------------------------------------------------------- /test/LU/unit_test.h: -------------------------------------------------------------------------------- 1 | #ifndef __UNIT_TEST_H__ 2 | #define __UNIT_TEST_H__ 3 | 4 | /* confirms LU factorization of a matrix 5 | * with arbitrary pivoting 6 | * assumes 
the matrix was creating with 7 | * A[row,col] = rand48() with seed48(seed+col*dim + row) */ 8 | void pvt_con_lu(int const nrows, 9 | int const ncols, 10 | int const seed, 11 | double const* LU, 12 | int const* P); 13 | 14 | /* confirms LU factorization of a matrix 15 | * by computing the backward error norm 16 | * assumes the matrix was created with 17 | * A[i,j] = rand48() with seed48(seed+i*dim + j) (NOTE(review): the comment on the definition in unit_test.cxx gives the seeding as seed+col*dim + row -- confirm which indexing is right) */ 18 | void backerr_lu(int const nrows, 19 | int const ncols, 20 | int const seed, 21 | double const* LU, 22 | int const* P, 23 | double* frb_norm, 24 | double* max_norm); 25 | /* overload of backerr_lu taking four extra arguments: row_st, col_st, num_row_chk, num_col_chk. Presumably the starting row/column and the number of rows/columns to check; pvt_con_lu passes 0, 0, nrows, ncols -- TODO confirm against the definition */ 26 | void backerr_lu(int const nrows, 27 | int const ncols, 28 | int const seed, 29 | double const* LU, 30 | int const* P, 31 | double* frb_norm, 32 | double* max_norm, 33 | int const row_st, 34 | int const col_st, 35 | int const num_row_chk, 36 | int const num_col_chk); 37 | 38 | /* test sequential tournament pivoting 39 | * b is the size of the panel */ 40 | void seq_tnmt_unit_test(int b); 41 | 42 | /* test parallel tournament pivoting 43 | * b is the size of the panel */ 44 | void par_tnmt_unit_test(int b, int myRank, int numPes, int req_id, CommData cdt); 45 | 46 | /* test parallel tournament pivoting parallel swap function */ 47 | void par_pivot_unit_test(int b_sm, 48 | int mat_dim, 49 | int myRank, 50 | int numPes, 51 | int req_id, 52 | CommData cdt); 53 | 54 | /* test parallel tournament pivoting (NOTE(review): this summary repeats the one on par_tnmt_unit_test and may be copy-pasted; the function name suggests a 2.5D LU test -- confirm) 55 | * n is the test matrix dimension 56 | * b_sm is the small block dimension 57 | * b_lrg is the large block dimension */ 58 | void lu_25d_unit_test(int const n, 59 | int const b_sm, 60 | int const b_lrg, 61 | int const myRank, 62 | int const numPes, 63 | int const c_rep, 64 | CommData const cdt); 65 | /* takes the same parameter list as lu_25d_unit_test; presumably the pivoted variant, going by the _pvt suffix -- TODO confirm */ 66 | void lu_25d_pvt_unit_test( int const n, 67 | int const b_sm, 68 | int const b_lrg, 69 | int const myRank, 70 | int const numPes, 71 | int const c_rep, 72 | CommData const cdt); 73 | 74 | 75 | 76 | #endif //__UNIT_TEST_H__ 77 | 78 | 
--------------------------------------------------------------------------------
/test/MM/Makefile:
--------------------------------------------------------------------------------
1 | include ../../config.mk
2 | 
3 | INCLUDES := -I../../include/
4 | BIN_DIR := ../../bin/tests
5 | LIB_DIR := ../../lib
6 | 
7 | # Names of the MM unit tests. This list has to be assigned before the rules
8 | # below: make expands a rule's targets and prerequisites while reading it, so
9 | # a list assigned afterwards would leave MM_tests with nothing to build.
10 | MM_TESTS := test_spc topo_pdgemm_unit
11 | MM_LIBS := $(LIB_DIR)/libCANMM.a $(LIB_DIR)/libCANShared.a
12 | 
13 | # These names are commands, not files; the binaries are written to $(BIN_DIR).
14 | .PHONY: MM_tests $(MM_TESTS) clean
15 | 
16 | MM_tests: $(MM_TESTS)
17 | 
18 | # `make NAME` is shorthand for building $(BIN_DIR)/NAME.
19 | $(MM_TESTS): %: $(BIN_DIR)/%
20 | 
21 | # Every test binary is one .cxx file linked against the MM and shared libraries.
22 | $(addprefix $(BIN_DIR)/,$(MM_TESTS)): $(BIN_DIR)/%: %.cxx $(MM_LIBS)
23 | 	$(CXX) -o $@ $< $(CXXFLAGS) $(DEFS) $(INCLUDES) \
24 | 	-L$(LIB_DIR) -lCANMM -lCANShared $(BLAS_LIBS) $(LDFLAGS) -lm
25 | 
26 | clean:
27 | 	rm -f *.o
28 | 
--------------------------------------------------------------------------------
/test/Makefile:
--------------------------------------------------------------------------------
1 | include ../config.mk
2 | 
3 | # Default goal: build every test suite.
4 | test: MM_tests LU_tests QR_tests SE_tests
5 | 
6 | MM_TESTS := test_spc topo_pdgemm_unit
7 | LU_TESTS := lu_25d_np_test lu_25d_pp_test lu_25d_tp_test
8 | QR_TESTS := test_bitree_tsqr test_construct_tsqr_Q test_hh_recon test_qr_y2d \
9 |             test_qr_2d test_qr_butterfly_2d test_qr_tree_2d test_scala_qr_2d
10 | # test_band_to_band is deliberately not listed: test/SE has neither a source
11 | # file nor a rule for it, so forwarding that name could only fail.
12 | SE_TESTS := test_scala_sym_eig test_full2band test_full2band_scala \
13 |             test_full2band_3d
14 | 
15 | # Every target here is a command that forwards to a sub-directory makefile.
16 | .PHONY: test clean MM_tests LU_tests QR_tests SE_tests \
17 |         $(MM_TESTS) $(LU_TESTS) $(QR_TESTS) $(SE_TESTS)
18 | 
19 | MM_tests $(MM_TESTS):
20 | 	$(MAKE) -C MM $@
21 | 
22 | LU_tests $(LU_TESTS):
23 | 	$(MAKE) -C LU $@
24 | 
25 | # The QR and SE suites are only forwarded when DEFS contains DLAPACKHASTSQR=1;
26 | # otherwise report the skip instead of silently doing nothing.
27 | ifneq (,$(findstring DLAPACKHASTSQR=1,$(DEFS)))
28 | QR_tests $(QR_TESTS):
29 | 	$(MAKE) -C QR $@
30 | 
31 | SE_tests $(SE_TESTS):
32 | 	$(MAKE) -C SE $@
33 | else
34 | QR_tests $(QR_TESTS) SE_tests $(SE_TESTS):
35 | 	@echo "skipping $@: DEFS does not contain DLAPACKHASTSQR=1"
36 | endif
37 | 
38 | # One recipe line per directory, so a failing sub-make stops the clean and is
39 | # reported instead of being masked by the exit status of a later command.
40 | clean:
41 | 	$(MAKE) -C MM $@
42 | 	$(MAKE) -C LU $@
43 | 	$(MAKE) -C QR $@
44 | 	$(MAKE) -C SE $@
45 | 
--------------------------------------------------------------------------------
/test/QR/Makefile:
--------------------------------------------------------------------------------
1 | include ../../config.mk
2 | 
3 | INCLUDES := -I../../include/
4 | BIN_DIR := ../../bin/tests
5 | LIB_DIR := ../../lib
6 | 
7 | QR_TESTS := test_bitree_tsqr test_construct_tsqr_Q test_hh_recon \
8 |             test_qr_2d test_qr_y2d test_qr_butterfly_2d test_qr_tree_2d test_scala_qr_2d
9 | QR_LIBS := $(LIB_DIR)/libCANQR.a $(LIB_DIR)/libCANShared.a
10 | 
11 | # These names are commands, not files; the binaries are written to $(BIN_DIR).
12 | .PHONY: QR_tests $(QR_TESTS) clean
13 | 
14 | QR_tests: $(QR_TESTS)
15 | 
16 | # `make NAME` is shorthand for building $(BIN_DIR)/NAME.
17 | $(QR_TESTS): %: $(BIN_DIR)/%
18 | 
19 | # One rule for all tests: each binary is built from the .cxx file of the same
20 | # name and linked against the QR and shared libraries.
21 | $(addprefix $(BIN_DIR)/,$(QR_TESTS)): $(BIN_DIR)/%: %.cxx $(QR_LIBS)
22 | 	$(CXX) -o $@ $< $(CXXFLAGS) $(DEFS) $(INCLUDES) \
23 | 	-L$(LIB_DIR) -lCANQR -lCANShared $(BLAS_LIBS) $(LDFLAGS) -lm
24 | 
25 | clean:
26 | 	rm -f *.o
27 | 
28 | 
--------------------------------------------------------------------------------
/test/QR/test_qr_butterfly_2d.cxx:
--------------------------------------------------------------------------------
1 | /* Copyright (c) Edgar Solomonik 2015, all rights reserved. This code is part of the CANDMC repository, protected under a two-clause BSD license.
*/ 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | 14 | #include "CANDMC.h" 15 | 16 | using namespace std; 17 | 18 | /** 19 | * \brief Test TSQR 20 | * 21 | * \param[in] m number of rows in A 22 | * \param[in] b number of columns in A 23 | * \param[in] myRank rank in communicator column 24 | * \param[in] numPes number of processes in column 25 | * \param[in] req_id request id to use for send/recv 26 | * \param[in] comm MPI communicator for column 27 | **/ 28 | #ifdef READ_FILE 29 | void qr_2d_unit_test( const char * filename, 30 | int64_t const myRank, 31 | int64_t const numPes, 32 | int64_t const req_id, 33 | CommData_t cdt_row, 34 | CommData_t cdt_col, 35 | CommData_t cdt){ 36 | #else 37 | void qr_2d_unit_test(int64_t const m, 38 | int64_t const k, 39 | int64_t const b, 40 | int64_t const myRank, 41 | int64_t const numPes, 42 | int64_t const req_id, 43 | CommData_t cdt_row, 44 | CommData_t cdt_col, 45 | CommData_t cdt){ 46 | #endif 47 | if (myRank == 0) 48 | printf("unit testing parallel 2D butterfly QR...\n"); 49 | double *A,*whole_A,*collect_YR,*whole_YR,*swork,*tau,*bw_A; 50 | double norm, yn2; 51 | int info; 52 | int64_t i,j,row,col,mb,kb,pass; 53 | int64_t npcol, nprow, mycol, myrow; 54 | npcol = cdt_row.np; 55 | nprow = cdt_col.np; 56 | mycol = cdt_row.rank; 57 | myrow = cdt_col.rank; 58 | 59 | 60 | int64_t seed_offset = 99900; 61 | 62 | mb = m / nprow; 63 | kb = k / npcol; 64 | assert(0==(posix_memalign((void**)&collect_YR, 65 | ALIGN_BYTES, 66 | 2*m*k*sizeof(double)))); 67 | assert(0==(posix_memalign((void**)&whole_YR, 68 | ALIGN_BYTES, 69 | 2*m*k*sizeof(double)))); 70 | assert(0==(posix_memalign((void**)&whole_A, 71 | ALIGN_BYTES, 72 | 2*m*k*sizeof(double)))); 73 | assert(0==(posix_memalign((void**)&A, 74 | ALIGN_BYTES, 75 | 2*mb*kb*sizeof(double)))); 76 | 77 | for (col=0; col 131 | (mycol*b + (col%b) + (col/b)*b*npcol)){ 132 | fnorm += 
A[row+(kb+col)*mb]*A[row+(kb+col)*mb]; 133 | } else { 134 | fnorm += MIN(pow(A[row+col*mb]-A[row+(kb+col)*mb],2),pow((A[row+col*mb]-A[row+(kb+col)*mb])/A[row+col*mb],2)); 135 | } 136 | } 137 | } 138 | fnorm = sqrtf(fnorm); 139 | if (fnorm <1.E-6) pass = 1; 140 | else pass = 0; 141 | if (myRank == 0){ 142 | printf("QR of [A,A] gives [Y/R_1,0/R_2], checking norm ||0/R_2-R_1||_F=%E\n",fnorm); 143 | if (pass) printf("Test passed.\n"); 144 | else printf("Test FAILED!\n"); 145 | } 146 | MPI_Barrier(cdt.cm); 147 | 148 | free(A); 149 | free(whole_A); 150 | free(whole_YR); 151 | free(collect_YR); 152 | free(tau); 153 | free(swork); 154 | } 155 | 156 | int main(int argc, char **argv) { 157 | int myRank, numPes; 158 | int64_t m, k, b, nprow, npcol; 159 | 160 | CommData_t cdt_glb; 161 | CommData_t cdt_row, cdt_col; 162 | INIT_COMM(numPes, myRank, 1, cdt_glb); 163 | 164 | /*string filename; 165 | if (argc != 2 ) 166 | printf("Usage: mpirun -np ./exe \n"); 167 | 168 | if (argc == 2) { 169 | filename.append(argv[1]); 170 | } 171 | qr_2d_unit_test(filename.c_str(), myRank, numPes, 0, cdt_glb); 172 | */ 173 | 174 | nprow = sqrt(numPes); 175 | while (numPes % nprow != 0) nprow++; 176 | npcol = numPes/nprow; 177 | if (argc == 1) { 178 | b = 2; 179 | m = 8*nprow; 180 | k = 4*npcol; 181 | } else if (argc > 3) { 182 | m = atoi(argv[1]); 183 | k = atoi(argv[2]); 184 | b = atoi(argv[3]); 185 | if (argc > 4) nprow = atoi(argv[4]); 186 | assert(m > 0); 187 | assert(b > 0); 188 | assert(k > 0); 189 | assert(nprow > 0); 190 | assert(nprow <= numPes); 191 | } else { 192 | printf("Usage: mpirun -np ./exe "); 193 | printf(" \n"); 194 | ABORT; 195 | } 196 | npcol = numPes/nprow; 197 | if (myRank == 0){ 198 | printf("m=" PRId64 ", k=" PRId64 ", b = " PRId64 ", nprow = " PRId64 ", npcol = " PRId64 "\n", 199 | m,k,b,nprow,npcol); 200 | } 201 | SETUP_SUB_COMM(cdt_glb, (cdt_row), 202 | myRank/nprow, 203 | myRank%nprow, 204 | npcol); 205 | SETUP_SUB_COMM(cdt_glb, (cdt_col), 206 | myRank%nprow, 207 | 
myRank/nprow, 208 | nprow); 209 | 210 | qr_2d_unit_test(m, k, b, myRank, numPes, 0, cdt_row, cdt_col, cdt_glb); 211 | 212 | 213 | COMM_EXIT; 214 | return 0; 215 | } 216 | -------------------------------------------------------------------------------- /test/QR/test_qr_tree_2d.cxx: -------------------------------------------------------------------------------- 1 | /* Copyright (c) Edgar Solomonik 2015, all rights reserved. This code is part of the CANDMC repository, protected under a two-clause BSD license. */ 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | 14 | 15 | #include "CANDMC.h" 16 | 17 | using namespace std; 18 | 19 | /** 20 | * \brief Test TSQR 21 | * 22 | * \param[in] m number of rows in A 23 | * \param[in] b number of columns in A 24 | * \param[in] myRank rank in communicator column 25 | * \param[in] numPes number of processes in column 26 | * \param[in] req_id request id to use for send/recv 27 | * \param[in] comm MPI communicator for column 28 | **/ 29 | #ifdef READ_FILE 30 | void qr_2d_unit_test( const char * filename, 31 | int64_t const myRank, 32 | int64_t const numPes, 33 | int64_t const req_id, 34 | CommData_t cdt_row, 35 | CommData_t cdt_col, 36 | CommData_t cdt){ 37 | #else 38 | void qr_2d_unit_test(int64_t const m, 39 | int64_t const k, 40 | int64_t const b, 41 | int64_t const myRank, 42 | int64_t const numPes, 43 | int64_t const req_id, 44 | CommData_t cdt_row, 45 | CommData_t cdt_col, 46 | CommData_t cdt){ 47 | #endif 48 | if (myRank == 0) 49 | printf("unit testing parallel 2D tree QR...\n"); 50 | double *A,*whole_A,*collect_YR,*whole_YR,*swork,*tau,*bw_A; 51 | double norm, yn2; 52 | int info; 53 | int64_t i,j,row,col,mb,kb,pass; 54 | int64_t npcol, nprow, mycol, myrow; 55 | npcol = cdt_row.np; 56 | nprow = cdt_col.np; 57 | mycol = cdt_row.rank; 58 | myrow = cdt_col.rank; 59 | 60 | 61 | int64_t seed_offset = 99900; 62 | 63 | mb = m / nprow; 
64 | kb = k / npcol; 65 | assert(0==(posix_memalign((void**)&collect_YR, 66 | ALIGN_BYTES, 67 | 2*m*k*sizeof(double)))); 68 | assert(0==(posix_memalign((void**)&whole_YR, 69 | ALIGN_BYTES, 70 | 2*m*k*sizeof(double)))); 71 | assert(0==(posix_memalign((void**)&whole_A, 72 | ALIGN_BYTES, 73 | 2*m*k*sizeof(double)))); 74 | assert(0==(posix_memalign((void**)&A, 75 | ALIGN_BYTES, 76 | 2*mb*kb*sizeof(double)))); 77 | 78 | for (col=0; col 132 | (mycol*b + (col%b) + (col/b)*b*npcol)){ 133 | fnorm += A[row+(kb+col)*mb]*A[row+(kb+col)*mb]; 134 | } else { 135 | fnorm += MIN(pow(A[row+col*mb]-A[row+(kb+col)*mb],2),pow((A[row+col*mb]-A[row+(kb+col)*mb])/A[row+col*mb],2)); 136 | } 137 | } 138 | } 139 | fnorm = sqrtf(fnorm); 140 | if (fnorm <1.E-6) pass = 1; 141 | else pass = 0; 142 | if (myRank == 0){ 143 | printf("QR of [A,A] gives [Y/R_1,0/R_2], checking norm ||0/R_2-R_1||_F=%E\n",fnorm); 144 | if (pass) printf("Test passed.\n"); 145 | else printf("Test FAILED!\n"); 146 | } 147 | MPI_Barrier(cdt.cm); 148 | 149 | free(A); 150 | free(whole_A); 151 | free(whole_YR); 152 | free(collect_YR); 153 | free(tau); 154 | free(swork); 155 | } 156 | 157 | int main(int argc, char **argv) { 158 | int myRank, numPes; 159 | int64_t m, k, b, nprow, npcol; 160 | 161 | CommData_t cdt_glb; 162 | CommData_t cdt_row, cdt_col; 163 | INIT_COMM(numPes, myRank, 1, cdt_glb); 164 | 165 | /*string filename; 166 | if (argc != 2 ) 167 | printf("Usage: mpirun -np ./exe \n"); 168 | 169 | if (argc == 2) { 170 | filename.append(argv[1]); 171 | } 172 | qr_2d_unit_test(filename.c_str(), myRank, numPes, 0, cdt_glb); 173 | */ 174 | 175 | nprow = sqrt(numPes); 176 | while (numPes % nprow != 0) nprow++; 177 | npcol = numPes/nprow; 178 | if (argc == 1) { 179 | b = 2; 180 | m = 8*nprow; 181 | k = 4*npcol; 182 | } else if (argc > 3) { 183 | m = atoi(argv[1]); 184 | k = atoi(argv[2]); 185 | b = atoi(argv[3]); 186 | if (argc > 4) nprow = atoi(argv[4]); 187 | assert(m > 0); 188 | assert(b > 0); 189 | assert(k > 0); 190 | 
assert(nprow > 0);
191 |     assert(nprow <= numPes);
192 |   } else {
193 |     printf("Usage: mpirun -np ./exe ");
194 |     printf(" \n");
195 |     ABORT;
196 |   }
197 |   npcol = numPes/nprow;
198 |   if (myRank == 0){
199 |     printf("m=%" PRId64 ", k=%" PRId64 ", b = %" PRId64 ", nprow = %" PRId64 ", npcol = %" PRId64 "\n",
200 |            m,k,b,nprow,npcol); /* PRId64 is only the length/conversion part; each one needs its own leading % */
201 |   }
202 |   SETUP_SUB_COMM(cdt_glb, (cdt_row),
203 |                  myRank/nprow,
204 |                  myRank%nprow,
205 |                  npcol);
206 |   SETUP_SUB_COMM(cdt_glb, (cdt_col),
207 |                  myRank%nprow,
208 |                  myRank/nprow,
209 |                  nprow);
210 | 
211 |   qr_2d_unit_test(m, k, b, myRank, numPes, 0, cdt_row, cdt_col, cdt_glb);
212 | 
213 | 
214 |   COMM_EXIT;
215 |   return 0;
216 | }
217 | 
--------------------------------------------------------------------------------
/test/SE/Makefile:
--------------------------------------------------------------------------------
1 | include ../../config.mk
2 | 
3 | INCLUDES := -I../../include/
4 | BIN_DIR := ../../bin/tests
5 | LIB_DIR := ../../lib
6 | 
7 | # SE_TESTS is the suite built by `make SE_tests`. SE_EXTRA_TESTS have build
8 | # rules and can be requested by name, but are not part of that suite.
9 | SE_TESTS := test_scala_sym_eig test_full2band
10 | SE_EXTRA_TESTS := test_full2band_3d test_full2band_scala
11 | SE_ALL_TESTS := $(SE_TESTS) $(SE_EXTRA_TESTS)
12 | SE_BINS := $(addprefix $(BIN_DIR)/,$(SE_ALL_TESTS))
13 | SE_LIBS := $(LIB_DIR)/libCANSE.a $(LIB_DIR)/libCANQR.a $(LIB_DIR)/libCANShared.a
14 | 
15 | # These names are commands, not files; the binaries are written to $(BIN_DIR).
16 | .PHONY: SE_tests $(SE_ALL_TESTS) clean
17 | 
18 | SE_tests: $(SE_TESTS)
19 | 
20 | # `make NAME` is shorthand for building $(BIN_DIR)/NAME.
21 | $(SE_ALL_TESTS): %: $(BIN_DIR)/%
22 | 
23 | ifneq (,$(findstring DUSE_SCALAPACK,$(DEFS)))
24 | # One rule for all tests: each binary is built from the .cxx file of the same
25 | # name and linked against the SE, QR and shared libraries.
26 | $(SE_BINS): $(BIN_DIR)/%: %.cxx $(SE_LIBS)
27 | 	$(CXX) -o $@ $< $(CXXFLAGS) $(DEFS) $(INCLUDES) \
28 | 	-L$(LIB_DIR) -lCANSE -lCANQR -lCANShared $(BLAS_LIBS) $(LDFLAGS) -lm
29 | else
30 | # Nothing is compiled when DEFS lacks DUSE_SCALAPACK; report the skip rather
31 | # than running an empty recipe.
32 | $(SE_BINS):
33 | 	@echo "skipping $(@F): DEFS does not contain DUSE_SCALAPACK"
34 | endif
35 | 
36 | clean:
37 | 	rm -f *.o
38 | 
--------------------------------------------------------------------------------