├── src
    ├── fp16_gpu_kernels.cu
    ├── gpu_init_kernels.cu
    ├── device_macros.h
    ├── highammgen.hpp
    ├── log.hpp
    ├── fp16_gpu_kernels.h
    ├── gpu_init_kernels.h
    ├── getrf_nopiv.hpp
    ├── panel_check.hpp
    ├── higham_mat_impl.cpp
    ├── gpu_init.hpp
    ├── panel_norm.hpp
    ├── CMakeLists.txt
    ├── cuda_device_macros.h
    ├── iterative_refinement.hpp
    ├── hpl_rand.hpp
    ├── gpu_init_kernels.cpp
    ├── svesim.hpp
    ├── rocm_device_macros.h
    ├── fp16sim.hpp
    ├── grid.hpp
    ├── otf_gemv.cpp
    ├── timer.hpp
    ├── fp16_gpu_kernels.cpp
    └── matgen.hpp
├── doc
    ├── CUDA_HIP_Macros.xlsx
    ├── load_modules_frontier.sh
    ├── build_OpenMxP_frontier.sh
    ├── OpenMxP.slurm
    └── crusher_example_32x32.out
├── LICENSE
├── LICENSE.Fugaku
└── README.md


/src/fp16_gpu_kernels.cu:
--------------------------------------------------------------------------------
1 | // Import GPU code
2 | #include "fp16_gpu_kernels.cpp"
3 | 


--------------------------------------------------------------------------------
/src/gpu_init_kernels.cu:
--------------------------------------------------------------------------------
1 | // Import GPU code
2 | #include "gpu_init_kernels.cpp"
3 | 


--------------------------------------------------------------------------------
/doc/CUDA_HIP_Macros.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/at-aaims/OpenMxP/HEAD/doc/CUDA_HIP_Macros.xlsx


--------------------------------------------------------------------------------
/doc/load_modules_frontier.sh:
--------------------------------------------------------------------------------
 1 | #module load PrgEnv-cray/8.3.3
 2 | 
 3 | module load PrgEnv-gnu/8.3.3
 4 | module load gcc
 5 | 
 6 | module load rocm/5.1.0
 7 | 
 8 | module load cray-mpich/8.1.18
 9 | 
10 | #module load cray-libsci/21.08.1.2
11 | 
12 | module load craype-x86-trento
13 | module load craype-network-ofi
14 | module load craype-accel-amd-gfx90a
15 | module load cmake/3.23.2
16 | 
17 | 


--------------------------------------------------------------------------------
/src/device_macros.h:
--------------------------------------------------------------------------------
 1 | 
 2 | #ifndef __HPLAI_DEVICE_MACROS__
 3 | #define __HPLAI_DEVICE_MACROS__
 4 | 
 5 | // Temp
 6 | /*
 7 | #if 0
 8 |     #define CUDA_OLCF_PLATFORM 1
 9 | 
10 | #elif
11 |     #define ROCM_OLCF_PLATFORM 1
12 | 
13 | #else
14 | 
15 | #endif
16 | */
17 | 
18 | #define ROCM_OLCF_PLATFORM 1
19 | 
20 | // SUMMIT
21 | #ifdef CUDA_OLCF_PLATFORM
22 | 
23 | #include "cuda_device_macros.h"
24 | 
25 | 
26 | // FRONTIER (& SPOCK)
27 | #elif defined(ROCM_OLCF_PLATFORM)
28 | 
29 | #include "rocm_device_macros.h"
30 | 
31 | #endif
32 | 
33 | 
34 | #endif // __HPLAI_DEVICE_MACROS__
35 | 


--------------------------------------------------------------------------------
/doc/build_OpenMxP_frontier.sh:
--------------------------------------------------------------------------------
 1 | 
 2 | source ../doc/load_modules_frontier.sh
 3 | 
 4 | SRC_DIR=../src
 5 | 
 6 | export LD_LIBRARY_PATH=/opt/rocm-5.1.0/llvm/lib:/opt/rocm-5.1.0/llvm/lib:${LD_LIBRARY_PATH}
 7 | export HIPCC_COMPILE_FLAGS_APPEND="$HIPCC_COMPILE_FLAGS_APPEND -std=c++14 -O3 -fopenmp --offload-arch=gfx90a"
 8 | rm -rf CMakeCache.txt CMakeFiles externals Makefile
 9 | cmake \
10 |     -DCMAKE_VERBOSE_MAKEFILE:BOOL=ON -DCMAKE_BUILD_TYPE=Release \
11 |     $SRC_DIR 2>&1 | tee CMAKE.OUTPUT
12 | #   -DCMAKE_HIP_COMPILER_FORCED=True \
13 | 
14 | make VERBOSE=1 -j1 2>&1 | tee MAKE.OUTPUT
15 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright © 2022 , UT-Battelle, LLC 
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/LICENSE.Fugaku:
--------------------------------------------------------------------------------
 1 | .Copyright (c) 2021, RIKEN
 2 | 
 3 | All rights reserved.
 4 | Redistribution and use in source and binary forms, with or without
 5 | modification, are permitted provided that the following conditions are met: 
 6 | 1. Redistributions of source code must retain the above copyright notice,
 7 |    this list of conditions and the following disclaimer. 
 8 | 2. Redistributions in binary form must reproduce the above copyright notice,
 9 |    this list of conditions and the following disclaimer in the documentation
10 |    and/or other materials provided with the distribution. 
11 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
12 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
13 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
14 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
15 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
16 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
17 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
18 | ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
19 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
20 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
21 | 


--------------------------------------------------------------------------------
/src/highammgen.hpp:
--------------------------------------------------------------------------------
 1 | #ifndef _HIGHAMMGEN_HPP
 2 | #define _HIGHAMMGEN_HPP
 3 | extern "C" double higham_mat_comp_beta(int n, double cond, double rho);
 4 | template <typename F> struct HMGen {
 5 |     // generator of the Higham's HPL-AI matrix in
 6 |     // https://github.com/higham/hpl-ai-matrix It generates the matrix A=LU with
 7 |     // L and U have the special structure. L is the lower-triangular matrix with
 8 |     // the diagonals = 1 and the strictly lower-triangular part = alpha. U is
 9 |     // the upper-triangular matrix with the diagonals = 1 and the strictly
10 |     // upper-triangular part = beta.
11 |     int n; // the matrix size
12 |     double alpha, beta;
13 |     double scalea, scaleb; // scaling for left and right panels.
14 |     F *diag;               // diagonal part of the matrix.
15 |     HMGen(int n, double cond, double rho, F *diag) : n(n), diag(diag) {
16 |         // alpha and beta are automatically computed from the condition number
17 |         // and others
18 |         beta = higham_mat_comp_beta(n, cond, rho);
19 |         alpha = rho * beta; // rhos is the ration of alhpa and beta.
20 | #if 0
21 | 		// this scaling may be lead to too good result
22 | 		scalea = 1. / (alpha * 32.);
23 | 		scaleb = 1. / (beta * 16.);
24 | #else
25 |         // we observed that alpha~beta~O(1/n).
26 |         scalea = n / (32.);
27 |         scaleb = n / (16.);
28 | #endif
29 |     }
30 | };
31 | #endif
32 | 


--------------------------------------------------------------------------------
/src/log.hpp:
--------------------------------------------------------------------------------
 1 | #ifndef LOG_HPP
 2 | #define LOG_HPP
 3 | 
 4 | // remove the following will compile away debug message, but keep info
 5 | // or make it conditional by: #ifdef DEBUG
 6 | #if 0
 7 | #define SPDLOG_ACTIVE_LEVEL SPDLOG_LEVEL_TRACE
 8 | #include "spdlog/cfg/env.h"
 9 | #include "spdlog/sinks/basic_file_sink.h"
10 | #include "spdlog/sinks/stdout_color_sinks.h"
11 | #include "spdlog/spdlog.h"
12 | #endif
13 | 
14 | //extern std::shared_ptr<spdlog::logger> LOG;
15 |   
16 | extern int grank, gsize, glog;
17 | 
18 | // #define INFO(...) {LOG->info(__VA_ARGS__); }
19 | 
20 | // #define WARN(...) {LOG->warn(__VA_ARGS__); }
21 | 
22 | #if 0
23 | #ifdef NDEBUG
24 | #define TRACE(...) (void)0
25 | #else
26 | #define TRACE(...) SPDLOG_LOGGER_CALL(spdlog::default_logger_raw(), spdlog::level::trace, __VA_ARGS__) 
27 | #endif
28 | #endif
29 | 
30 | void PrintMsg(const char *fmt...) {
31 |     if (grank == 0) {
32 | //      LOG->info(fmt);
33 |     }
34 | }
35 | 
36 | // #define PrintLogMsg( ... ) { if ( grank == 0  &&  glog == 1 ) LOG->info( __VA_ARGS__ ); }
37 | 
38 | #define OUTPUT if ( grank == 0  &&  glog == 1 ) 
39 | 
40 | inline
41 | void PrintLogMsg( const char * fmt, ... )
42 | {
43 |    if ( grank == 0  &&  glog == 1 )
44 |    {
45 |       va_list argPtr;
46 |       va_start( argPtr, fmt );
47 |       vfprintf( stdout, fmt, argPtr );
48 |       va_end( argPtr );
49 |       fflush( stdout );
50 |    }
51 | }
52 | 
53 | #endif
54 | 


--------------------------------------------------------------------------------
/doc/OpenMxP.slurm:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | #SBATCH -A stf018
 3 | #SBATCH -J openmxp
 4 | #SBATCH -p batch
 5 | 
 6 | #SBATCH -N 32
 7 | 
 8 | ##SBATCH -C nvme
 9 | 
10 | #SBATCH -t 0:30:00
11 | 
12 | #SBATCH -o frontier_OpenMxP_01.out
13 | 
14 | module load PrgEnv-gnu/8.3.3
15 | module load gcc
16 | 
17 | module load rocm/5.1.0
18 | module load cray-mpich/8.1.18
19 | module load craype-x86-trento
20 | module load craype-network-ofi
21 | module load craype-accel-host
22 | 
23 | export LD_LIBRARY_PATH=/opt/rocm-5.1.0/llvm/lib:${LD_LIBRARY_PATH}
24 | export LD_LIBRARY_PATH=/opt/rocm-5.1.0/lib:${LD_LIBRARY_PATH}
25 | 
26 | #export LD_LIBRARY_PATH=/opt/rocm-5.4.0/llvm/lib:${LD_LIBRARY_PATH}
27 | #export LD_LIBRARY_PATH=/opt/rocm-5.4.0/lib:${LD_LIBRARY_PATH}
28 | 
29 | export MPIR_CVAR_GPU_EAGER_DEVICE_MEM=0
30 | export MPICH_GPU_SUPPORT_ENABLED=1
31 | export MPICH_SMP_SINGLE_COPY_MODE=CMA
32 | 
33 | echo "JobID          : $SLURM_JOB_ID"
34 | echo "Number of Nodes: $SLURM_JOB_NUM_NODES"
35 | 
36 | #SLURM_JOB_NODELIST SLURM_JOB_ID
37 | #echo $SLURM_JOB_NODELIST > "frontier_$SLURM_JOB_ID.nodes"
38 | #echo $SLURM_JOB_NODELIST
39 | 
40 | nt=$(expr $SLURM_JOB_NUM_NODES \* 8)
41 | 
42 | pq=24
43 | pq=28
44 | pq=32
45 | 
46 | pq=16
47 | 
48 | comm=4
49 | 
50 | b=2560
51 | 
52 | ln=92160
53 | ln=120320
54 | ln=122880
55 | ln=125440
56 | 
57 | N=$(expr $pq \* $ln)
58 | 
59 | export OMP_NUM_THREADS=7
60 | 
61 | export NOTE="OpenMxP rocm 5.1.0 GPU-Direct"
62 | 
63 | export CMD="../build/OpenMxP.x86_64 $N $b $pq -1 -comm 2 -alt 1 "
64 | srun -N $SLURM_JOB_NUM_NODES -n $nt -c 7 --ntasks-per-node=8 --gpus-per-task=1 --gpu-bind=closest $CMD
65 | 
66 | export CMD="../build/OpenMxP.x86_64 $N $b $pq -1 -comm 2 -alt 2 "
67 | srun -N $SLURM_JOB_NUM_NODES -n $nt -c 7 --ntasks-per-node=8 --gpus-per-task=1 --gpu-bind=closest $CMD
68 | 
69 | 


--------------------------------------------------------------------------------
/src/fp16_gpu_kernels.h:
--------------------------------------------------------------------------------
 1 | #ifndef __fp16GPUKer
 2 | #define __fp16GPUKer
 3 | 
 4 | #include <assert.h>
 5 | //#include <cublas_v2.h>
 6 | //#include <cuda_runtime.h>
 7 | #include <iostream>
 8 | #include <stdlib.h>
 9 | #include <unistd.h>
10 | //#include <chrono>
11 | 
12 | #include "device_macros.h"
13 | 
14 | using namespace std;
15 | 
16 | 
17 | __global__ void copyCoalesced(__half *odata, const float *idata, int olda, int ilda);
18 | /// HUGE ASSUMTION OF ilda = olda for copy lower and upper
19 | __global__ void copyCoalesced_lower(__half *odata, const float *idata, int olda, int ilda);
20 | __global__ void copyCoalesced_upper(__half *odata, const float *idata, int olda, int ilda);
21 | __global__ void copyCoalesced_lower(float *odata, const __half *idata, int olda, int ilda);
22 | __global__ void copyCoalesced_upper(float *odata, const __half *idata, int olda, int ilda);
23 | __global__ void transposeCoalesced(__half *odata, const float *idata, int olda, int ilda);
24 | 
25 | // HOST CALLS TO LAUNCH KERNELS (LEFT)
26 | __host__ void half_conversion_left( const float *C, int b, int plda, __half *lp, int lplda);
27 | 
28 | // HOST CALLS TO LAUNCH KERNELS (RIGHT TRANS)
29 | __host__ void half_conversion_right_trans(const float *C, int b, int plda, __half * rp, int rplda);
30 | 
31 | // Copy cast kernel
32 | __global__ void copyFtoH(__half *odata, const float *idata, long olda, long ilda);
33 | __host__ void downCast_copy_general(__half* out, long olda, long nrow, long ncol, float* C, long ilda);
34 | __host__ void downCast_copy_lower(__half* out, long olda, long nrow, long ncol, float* C, long ilda);
35 | __host__ void downCast_copy_upper(__half* out, long olda, long nrow, long ncol, float* C, long ilda);
36 | 
37 | // Debug purpose
38 | __host__ void upCast_copy_lower(float* out, long olda, long nrow, long ncol, __half * C, long ilda);
39 | __host__ void upCast_copy_upper(float* out, long olda, long nrow, long ncol, __half * C, long ilda);
40 | 
41 | // Alt solver
42 | __host__ void gen_identity_mat( float * out, long nrow, long ncol);
43 | __global__ void gen_identity_mat_kernel( float * out, long nrow, long ncol);
44 | __global__ void gen_identity_mat_kernel( __half * out, long nrow, long ncol);
45 | __host__ void gen_identity_mat( __half * out, long nrow, long ncol);
46 | 
47 | // trans cast kernel
48 | __host__ void downCast_trans_general(__half* out, long olda, long nrow, long ncol , float* C, long ilda);
49 | __host__ void downCast_trans_general(__half* out, int olda, int nrow, int ncol , float* C, int ilda);
50 | 
51 | #define VECTOR_OP_THREADS 128
52 | 
53 | #endif
54 | 
55 | 
56 | 


--------------------------------------------------------------------------------
/src/gpu_init_kernels.h:
--------------------------------------------------------------------------------
 1 | #ifndef GPU_INIT_KERNELS
 2 | #define GPU_INIT_KERNELS
 3 | #include <unistd.h>
 4 | #include <iostream>
 5 | #include <stdlib.h>
 6 | #include <assert.h>
 7 | 
 8 | #include "hpl_rand.hpp"
 9 | #include "device_macros.h"
10 | 
11 | #define MATGEM_THREADS 512
12 | 
13 | using namespace std;
14 | /*
15 | template <typename T>
16 | void fill_panel_diag_host(int n, int b, T *A, int lda, RandStat stat_ij, RandCoeff incl1,
17 |         RandCoeff jump_thread, int rowstart, RandStat stat_00, double* Diag, int nthreads);
18 | 
19 | template <typename T>
20 | void fill_panel_host(int n, int b, T *A, int lda,
21 |         RandStat stat_ij, RandCoeff incl1, RandCoeff jump_thread, int nthreads );
22 | 
23 | template <typename T>
24 | __global__ void fill_panel_dev(int n, int b, T * __restrict__ A,
25 |         int lda, RandStat stat_ij, RandCoeff incl1, RandCoeff jump_dn);
26 | 
27 | template <typename T>
28 | __global__ void fill_panel_diag_dev(int n, int nb, T * __restrict__ A, int lda, RandStat stat_ij, RandCoeff incl1,
29 |         RandCoeff jump_dn, RandStat stat_00, int rowstart, double* Diag);
30 | */
31 | 
32 | void fill_random(float *A, long m, long n, int n_threads, int blocksize_x, int work_per_thread);
33 | 
34 | void fill_random(double *v, long m, int n_threads);
35 | 
36 | __host__ void fill_random_fugaku(uint64_t N, RandStat stat0, RandCoeff inc1, float *A, double *row_sums, long m, long n, long i1, long j1, long b, long istride, long jstride, int n_threads, int blocksize_x, int work_per_thread);
37 | 
38 | template <typename F> __global__ void fill_random_fugaku_d(uint64_t N, RandStat stat0, RandCoeff inc1, F *A, double *row_sums, long m, long n, long i1, long j1, long b, long istride, long jstride, int work_per_thread);
39 |     
40 | __host__ void compute_row_sums(float *A, double *row_sums, long m, long n, long i1, long j1, long b, long istride, long jstride, int n_threads, int blocksize_x, int work_per_thread);
41 | 
42 | __global__ void compute_row_sums_d(float *A, double *row_sums, long m, long n, long i1, long j1, long b, long istride, long jstride, int work_per_thread);
43 | 
44 | __host__ void fill_diag_rhs(int my_init, uint64_t N, RandStat stat0, RandCoeff inc1, float *A, double *row_sums_d, double *rhs, double *rhs_d, long m, long *diag_i_steps, long *diag_j_steps, long n_diag_blocks, long i1, long b, long istride, int n_threads);
45 | 
46 | __global__ void fill_diag_rhs_d(int my_init, uint64_t N, RandStat stat0, RandCoeff inc1, float *A, double *row_sums_d, double *rhs, double *rhs_d, long m, long *diag_i_steps, long *diag_j_steps, long n_diag_entries, long i1, long b, long istride);
47 | 
48 | template <typename F> __global__ void minus_05(F *A, long m, long n, int work_per_thread);
49 | 
50 | #endif
51 | 


--------------------------------------------------------------------------------
/src/getrf_nopiv.hpp:
--------------------------------------------------------------------------------
  1 | #ifndef GETRF_NOPIV
  2 | #define GETRF_NOPIV
  3 | #include <cstdlib>
  4 | #include <omp.h>
  5 | //#include "schur_updator.hpp"
  6 | 
  7 | extern "C" void dtrsm_(...);
  8 | extern "C" void strsm_(...);
  9 | inline void trsmR(int m, int n, const double *a, int lda, double *b, int ldb) {
 10 |     double one = 1.;
 11 |     dtrsm_("R", "U", "N", "N", &m, &n, &one, a, &lda, b, &ldb);
 12 | }
 13 | inline void trsmR(int m, int n, const float *a, int lda, float *b, int ldb) {
 14 |     float one = 1.f;
 15 |     strsm_("R", "U", "N", "N", &m, &n, &one, a, &lda, b, &ldb);
 16 | }
 17 | inline void trsmL(int m, int n, double const *a, int lda, double *b, int ldb) {
 18 |     double one = 1.;
 19 |     dtrsm_("L", "L", "N", "U", &m, &n, &one, a, &lda, b, &ldb);
 20 | }
 21 | inline void trsmL(int m, int n, float const *a, int lda, float *b, int ldb) {
 22 |     float one = 1.f;
 23 |     strsm_("L", "L", "N", "U", &m, &n, &one, a, &lda, b, &ldb);
 24 | }
 25 | /****** Use as future reference for improvment ******
 26 | 
 27 | 
 28 | #define NSMALL 16
 29 | template<typename F>
 30 | void getrf_nopiv_small(int n, F* a, size_t lda) {
 31 |         for(int k=0; k<n; k++){
 32 |                 F inv = static_cast<F>(1) / a[lda*k + k];
 33 |                 for(int i=k+1; i<n; ++i) a[k*lda + i] *= inv;
 34 |                 #pragma loop novrec
 35 |                 #pragma loop unroll(2)
 36 |                 for(int j=k+1; j<n; ++j){
 37 |                         F akj = a[j*lda + k];
 38 |                         #pragma loop novrec
 39 |                         for(int i=k+1; i<n; ++i){
 40 |                                 a[j*lda + i] -= a[k*lda + i] * akj;
 41 |                         }
 42 |                 }
 43 |         }
 44 | }
 45 | 
 46 | template<typename F>
 47 | void getrf_nopiv(int n, F* a, size_t lda, bool warmup=false) {
 48 |         for(int k=0; k<n; k+=NSMALL){
 49 |                 int nn = n-k<NSMALL? n-k: NSMALL;
 50 |                 getrf_nopiv_small(nn, a+lda*k+k, lda);
 51 |                 if(n-k>nn){
 52 |                         trsmL(nn, n-k-nn, a+lda*k+k, lda, a+lda*(k+nn)+k, lda);
 53 |                         trsmR(n-k-nn, nn, a+lda*k+k, lda, a+lda*k+k+nn, lda);
 54 |                         gemmschur(n-k-nn, n-k-nn, nn, a+lda*k+k+nn, lda,
 55 | a+lda*(k+nn)+k, lda, a+lda*(k+nn)+(k+nn), lda);
 56 |                 }
 57 |         }
 58 | }
 59 | 
 60 | #if defined(__FUJITSU) || defined(__CLANG_FUJITSU)
 61 | extern void sgetrf_nopiv_tuned(int n, float *a, size_t lda);
 62 | #endif
 63 | template<>
 64 | void getrf_nopiv<float>(int n, float* a, size_t lda, bool warmup) {
 65 | #if defined(__FUJITSU) || defined(__CLANG_FUJITSU)
 66 |         if(!warmup){
 67 |                 sgetrf_nopiv_tuned(n, a, lda);
 68 |                 return;
 69 |         }
 70 | #endif
 71 |         for(int k=0; k<n; k+=NSMALL){
 72 |                 int nn = n-k<NSMALL? n-k: NSMALL;
 73 |                 getrf_nopiv_small(nn, a+lda*k+k, lda);
 74 |                 if(n-k>nn){
 75 |                         trsmL(nn, n-k-nn, a+lda*k+k, lda, a+lda*(k+nn)+k, lda);
 76 |                         trsmR(n-k-nn, nn, a+lda*k+k, lda, a+lda*k+k+nn, lda);
 77 |                         gemmschur(n-k-nn, n-k-nn, nn, a+lda*k+k+nn, lda,
 78 | a+lda*(k+nn)+k, lda, a+lda*(k+nn)+(k+nn), lda);
 79 |                 }
 80 |         }
 81 | }
 82 | 
 83 | // copy-first version
 84 | template<typename F>
 85 | void getrf_nopiv(int n, F* a, size_t lda, F* piv, int ldpiv) {
 86 |                 #pragma omp parallel for
 87 |                 for(int j=0; j<n; ++j)
 88 | #if 0
 89 |                         for(int i=0; i<n; ++i)
 90 |                                 piv[j*ldpiv + i] = a[j*lda + i];
 91 | #else
 92 |                 {
 93 |                         const F * __restrict__ src = &a[j*lda];
 94 |                         F * __restrict__ dst = &piv[j*ldpiv];
 95 |                         for(int i=0; i<n; ++i) dst[i] = src[i];
 96 |                 }
 97 | #endif
 98 | 
 99 |                 getrf_nopiv(n, piv, ldpiv);
100 | }*/
101 | 
102 | template <typename F> void warmup_trf(int n, F *a, size_t lda) {
103 |     for (int k = 0; k < n; k++) {
104 |         a[k + lda * k] = 1.0;
105 |     }
106 |     getrf_nopiv(n, a, lda, true);
107 | }
108 | #endif
109 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # OpenMxP: Open-Source Mixed Precision Computing
  2 | - [OpenMxP: Open-Source Mixed Precision Computing](#openmxp-open-source-mixed-precision-computing)
  3 |   - [Build instructions ( Frontier/Crusher )](#build-instructions--frontiercrusher-)
  4 |   - [Running instructions ( Frontier/Crusher )](#running-instructions--frontiercrusher-)
  5 |     - [Comments](#comments)
  6 |   - [Build instruction (Summit)](#build-instruction-summit)
  7 |   - [Tuning Parameters](#tuning-parameters)
  8 |   - [Citation](#citation)
  9 |     - [Code Repo](#code-repo)
 10 |     - [SC22 Paper](#sc22-paper)
 11 |   - [Developers](#developers)
 12 |   - [Contributors](#contributors)
 13 | 
 14 | 
 15 | 
 16 | 
 17 | ## Build instructions ( Frontier/Crusher )
 18 | 
 19 | ```sh
 20 | cd OpenMxP
 21 | mkdir build
 22 | cd build
 23 | cp ../doc/build_OpenMxP_frontier.sh .
 24 | ```
 25 | That script runs `../doc/load_modules_frontier.sh` which may need to be modified for different rocm versions.
 26 | 
 27 | ```sh
 28 | ./build_OpenMxP_frontier.sh
 29 | ```
 30 | You should now have a OpenMxP.x86_64 binary.
 31 | 
 32 | 
 33 | ## Running instructions ( Frontier/Crusher )
 34 | 
 35 | ```sh
 36 | mkdir jobs
 37 | cd jobs
 38 | cp ../doc/OpenMxP.slurm
 39 | ```
 40 | Change this script to meet your needs.
 41 | 
 42 | ```sh
 43 | sbatch OpenMxP.slurm
 44 | ```
 45 | The output from crusher is in `doc/crusher_example_32x32.out`.
 46 | 
 47 | Constraints are PxQ=#GPUs, PxLN=QxLN, B need to be divisiable by TILE size.
 48 | Must have at least 3 omp threads.
 49 | 
 50 | ### Comments
 51 | 
 52 | OpenMxP is designed to run at scale.   When it is run at a few number of nodes,
 53 | the performance will suffer due to the Iterative Refinement (IR).
 54 | At larger scales, this time becomes insignificant in the run.
 55 | 
 56 | There are requirements between N, B, PxQ ( process grid ), and the local grid.
 57 | Some are enforced while others are not.  It is usually easier to run square
 58 | ( PxQ ) that are multiples of 8.  The best B tends to be 2560 and the best
 59 | performing local N (LN) tends to be 125440.   So this will give a N of P*LN.
 60 | 
 61 | 
 62 | ## Build instruction (Summit)
 63 | 
 64 | ```sh
 65 | module load cmake gcc/7.4.0 cuda/11.2.0 openblas
 66 | git clone git@github.com:at-aaims/OpenMxP
 67 | cd hpl-ai && make build && cd build 
 68 | ```
 69 | 
 70 | For release build:
 71 | 
 72 | ```sh
 73 | cmake -DCMAKE_BUILD_TYPE=Release ..
 74 | make
 75 | ```
 76 | 
 77 | The default optimization level is `-O3`.
 78 | 
 79 | For debug build:
 80 | 
 81 | ```sh
 82 | cmake -DCMAKE_BUILD_TYPE=Debug ..
 83 | make
 84 | ```
 85 | This will have debug info built in.
 86 | 
 87 | 
 88 | ## Tuning Parameters
 89 | ```
 90 | -log 1 ( print rank 0 messages )
 91 | 
 92 | -solv 0 ( use blas )
 93 |       1 ( use solver ) # default (fastest)
 94 | 
 95 | -comm 0 ( use ibcast )
 96 |       1 ( use bcast )    
 97 |       2 ( use 1ring )       # default
 98 |       3 ( use 1ringM )
 99 |       4 ( use 2ringM )
100 | 
101 | --numa 0 (Global Column Major)   # default
102 |        1 ( Node Grid - 2x3C )    
103 |        2 ( Node Grid - 3x2C )       
104 |        3 ( Global Row Major )    
105 |        4 ( Node Grid - 2x4R )
106 |        5 ( Node Grid - 2x4C )
107 | 
108 | -alt 0 (TRSM L/U panel)
109 |      1 (TRSM for Diagonal inverse)
110 |      2 (TRTRI for Diagonal inverse)
111 | 
112 | 
113 | -sync ( enable cuda device sync after sgemm - currently only for bcast )
114 | ```
115 | 
116 | ## Citation
117 | 
118 | 
119 | ### Code Repo
120 | 
121 | ```
122 | @misc{doecode_102701,
123 |   title = {OpenMxP - Open Source Mixed Precision Computing},
124 |   author = {Lu, Hao and Matheson, Michael and Wang, Feiyi and Joubert, Wayne and Ellis, Austin and Oles, Vladyslav},
125 |   doi = {10.11578/dc.20230315.3},
126 |   url = {https://doi.org/10.11578/dc.20230315.3},
127 |   howpublished = {[Computer Software] \url{https://doi.org/10.11578/dc.20230315.3}},
128 |   year = {2023},
129 |   month = {mar}
130 | }
131 | ```
132 | 
133 | ### SC22 Paper 
134 | 
135 | ```bibtex
136 | @inproceedings{10.5555/3571885.3571988,
137 |     author = {Lu, Hao and Matheson, Michael and Oles, Vladyslav and Ellis, Austin and Joubert, Wayne and Wang, Feiyi},
138 |     title = {Climbing the Summit and Pushing the Frontier of Mixed Precision Benchmarks at Extreme Scale},
139 |     year = {2022},
140 |     isbn = {9784665454445},
141 |     publisher = {IEEE Press},
142 |     booktitle = {Proceedings of the International Conference on High Performance Computing, Networking, Storage and Analysis},
143 |     articleno = {78},
144 |     numpages = {15},
145 |     doi = {10.1109/SC41404.2022.00083},
146 |     keywords = {linear algebra, parallel programming, exascale computing, high performance computing},
147 |     location = {Dallas, Texas},
148 |     series = {SC '22}
149 | }
150 | ```
151 | 
152 | 
153 | ## Developers
154 | * Hao Lu, <luh1@ornl.gov>
155 | * Michael Matheson, <mathesonma@ornl.gov> (Main Contact)
156 | * Wayne Joubert, <joubert@ornl.gov>
157 | * Feiyi Wang, <fwang2@ornl.gov>
158 | * Vlad Oles, <olesv@ornl.gov> (Past)
159 | * Austin Ellis, <ellisja@ornl.gov> (Past)
160 | 
161 | ## Contributors
162 | * Jakub Kurzak <Jakub.Kurzak@amd.com>
163 | * Alessandro Fanfarillo <Alessandro.Famfarillo@amd.com>
164 | * Noel Chalmers <Noel.Chalmers@amd.com>
165 | * Nicolas Malaya Nicholas <Nicolas.Malaya@amd.com>
166 | * Pak Niu Lui <Pak.Lui@amd.com>
167 | * Hui Liu <Hui.Lui1@amd.com>
168 | * Mazda Sabony <Mazada.Sabony@amd.com>
169 | 


--------------------------------------------------------------------------------
/src/panel_check.hpp:
--------------------------------------------------------------------------------
  1 | #ifndef PANEL_CHECK_HPP
  2 | #define PANEL_CHECK_HPP
  3 | // computing checksum of matrix for debuging.
  4 | #include "grid.hpp"
  5 | #include "highammgen.hpp"
  6 | #include "panel.hpp"
  7 | #include <mpi.h>
  8 | 
  9 | template <typename FPanel> void panel_check(Panels<FPanel> const &p, Grid &g) {
 10 |     double sigs[3] = {0., 0., 0.};
 11 |     double sigd = 0., sigu = 0., sigl = 0.;
 12 |     size_t lda = p.lda;
 13 |     int b = p.b;
 14 |     int i1 = p.i1;
 15 |     int j1 = p.j1;
 16 |     int istride = p.istride;
 17 |     int jstride = p.jstride;
 18 |     int nprow = p.nprow;
 19 |     int npcol = p.npcol;
 20 |     //#pragma omp parallel for collapse(2) schedule(dynamic)
 21 |     //reduction(+:sigd,sigu,sigl)
 22 |     for (int j = 0; j < npcol; ++j) {
 23 |         for (int i = 0; i < nprow; ++i) {
 24 |             int jpos = j1 + j * jstride;
 25 |             int ipos = i1 + i * istride;
 26 |             FPanel const *data = p(i, j);
 27 |             if (ipos == jpos) {
 28 |                 for (int jj = 0; jj < b; ++jj) {
 29 |                     for (int ii = 0; ii < jj; ++ii) {
 30 |                         double t = fabs(data[jj * lda + ii]);
 31 |                         sigu += t;
 32 |                     }
 33 |                     sigd += fabs(data[jj * lda + jj]);
 34 |                     for (int ii = jj + 1; ii < b; ++ii) {
 35 |                         double t = fabs(data[jj * lda + ii]);
 36 |                         sigl += t;
 37 |                     }
 38 |                 }
 39 |             } else if (ipos < jpos) {
 40 |                 for (int jj = 0; jj < b; ++jj)
 41 |                     for (int ii = 0; ii < b; ++ii) {
 42 |                         double t = fabs(data[jj * lda + ii]);
 43 |                         sigu += t;
 44 |                     }
 45 |             } else {
 46 |                 for (int jj = 0; jj < b; ++jj)
 47 |                     for (int ii = 0; ii < b; ++ii) {
 48 |                         double t = fabs(data[jj * lda + ii]);
 49 |                         sigl += t;
 50 |                     }
 51 |             }
 52 |         }
 53 |     }
 54 |     sigs[0] = sigd;
 55 |     sigs[1] = sigu;
 56 |     sigs[2] = sigl;
 57 |     MPI_Allreduce(MPI_IN_PLACE, sigs, 3, MPI_DOUBLE, MPI_SUM, g.commworld);
 58 |     if (g.row == 0 && g.col == 0) {
 59 |         std::printf("check %22.17e %22.17e, %22.17e\n", sigs[0], sigs[1],
 60 |                     sigs[2]);
 61 |         std::fflush(stdout);
 62 |     }
 63 | }
 64 | 
 65 | template <typename FPanel>
 66 | void panel_check(HMGen<double> const &hmg, Panels<FPanel> const &p, Grid &g) {
 67 |     double sigs[3] = {0., 0., 0.};
 68 |     double sigd = 0., sigu = 0., sigl = 0.;
 69 |     size_t lda = p.lda;
 70 |     int b = p.b;
 71 |     int i1 = p.i1;
 72 |     int j1 = p.j1;
 73 |     int istride = p.istride;
 74 |     int jstride = p.jstride;
 75 |     int nprow = p.nprow;
 76 |     int npcol = p.npcol;
 77 |     double alpha = hmg.alpha;
 78 |     double beta = hmg.beta;
 79 |     double done = 1;
 80 |     //#pragma omp parallel for collapse(2) schedule(dynamic)
 81 |     //reduction(+:sigd,sigu,sigl)
 82 |     for (int j = 0; j < npcol; ++j) {
 83 |         for (int i = 0; i < nprow; ++i) {
 84 |             int jpos = j1 + j * jstride;
 85 |             int ipos = i1 + i * istride;
 86 |             FPanel const *data = p(i, j);
 87 |             double blockmax = 0.;
 88 |             if (ipos == jpos) {
 89 |                 for (int jj = 0; jj < b; ++jj) {
 90 |                     for (int ii = 0; ii < jj; ++ii) {
 91 |                         double t = fabs(data[jj * lda + ii] - beta) / beta;
 92 |                         // printf("9871 %d %d %e %e %e\n", b*ipos+ii, b*jpos+jj,
 93 |                         // data[jj*lda+ii], beta, t);
 94 |                         sigu = sigu > t ? sigu : t;
 95 |                         blockmax = blockmax > t ? blockmax : t;
 96 |                     }
 97 |                     double t = fabs(data[jj * lda + jj] - done);
 98 |                     // if(t>1e-1) printf("9871 %d %d %e %e %e\n", b*jpos+jj,
 99 |                     // b*jpos+jj, data[jj*lda+jj], done, t);
100 |                     sigd = sigd > t ? sigd : t;
101 |                     blockmax = blockmax > t ? blockmax : t;
102 |                     for (int ii = jj + 1; ii < b; ++ii) {
103 |                         double t = fabs(data[jj * lda + ii] - alpha) / alpha;
104 |                         // printf("9871 %d %d %e %e %e\n", b*ipos+ii, b*jpos+jj,
105 |                         // data[jj*lda+ii], alpha, t);
106 |                         sigl = sigl > t ? sigl : t;
107 |                         blockmax = blockmax > t ? blockmax : t;
108 |                     }
109 |                 }
110 |             } else if (ipos < jpos) {
111 |                 for (int jj = 0; jj < b; ++jj)
112 |                     for (int ii = 0; ii < b; ++ii) {
113 |                         double t = fabs(data[jj * lda + ii] - beta) / beta;
114 |                         // printf("9871 %d %d %e %e %e\n", b*ipos+ii, b*jpos+jj,
115 |                         // data[jj*lda+ii], beta, t);
116 |                         sigu = sigu > t ? sigu : t;
117 |                         blockmax = blockmax > t ? blockmax : t;
118 |                     }
119 |             } else {
120 |                 for (int jj = 0; jj < b; ++jj)
121 |                     for (int ii = 0; ii < b; ++ii) {
122 |                         double t = fabs(data[jj * lda + ii] - alpha) / alpha;
123 |                         // printf("9871 %d %d %e %e %e\n", b*ipos+ii, b*jpos+jj,
124 |                         // data[jj*lda+ii], alpha, t);
125 |                         sigl = sigl > t ? sigl : t;
126 |                         blockmax = blockmax > t ? blockmax : t;
127 |                     }
128 |             }
129 |             /*if(g.row==0&&g.col==0) {
130 |                     printf("9871 %d %d %e\n", ipos, jpos, blockmax);
131 |                     fflush(stdout);
132 |             }*/
133 |         }
134 |     }
135 |     sigs[0] = sigd;
136 |     sigs[1] = sigu;
137 |     sigs[2] = sigl;
138 |     MPI_Allreduce(MPI_IN_PLACE, sigs, 3, MPI_DOUBLE, MPI_MAX, g.commworld);
139 |     if (g.row == 0 && g.col == 0) {
140 |         std::printf("check %22.17e %22.17e, %22.17e\n", sigs[0], sigs[1],
141 |                     sigs[2]);
142 |         std::fflush(stdout);
143 |     }
144 | }
145 | 
146 | #endif
147 | 


--------------------------------------------------------------------------------
/src/higham_mat_impl.cpp:
--------------------------------------------------------------------------------
  1 | // Copyright (c) 2020, Massimiliano Fasi and Nicholas J. Higham
  2 | // All rights reserved.
  3 | //
  4 | // Redistribution and use in source and binary forms, with or without
  5 | // modification, are permitted provided that the following conditions are met:
  6 | //
  7 | // * Redistributions of source code must retain the above copyright notice, this
  8 | //   list of conditions and the following disclaimer.
  9 | //
 10 | // * Redistributions in binary form must reproduce the above copyright notice,
 11 | //   this list of conditions and the following disclaimer in the documentation
 12 | //   and/or other materials provided with the distribution.
 13 | //
 14 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 15 | // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 16 | // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 17 | // ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 18 | // LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 19 | // CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 20 | // SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 21 | // INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 22 | // CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 23 | // ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 24 | // POSSIBILITY OF SUCH DAMAGE.
 25 | 
 26 | // The above copyright notice and the code are from
 27 | // https://github.com/higham/hpl-ai-matrix
 28 | 
 29 | // This file is manual translation of the above software.
 30 | #include <assert.h>
 31 | #include <float.h>
 32 | #include <math.h>
 33 | #include <stdio.h>
 34 | #define MAX(A, B) ((A) > (B) ? (A) : (B))
 35 | #define MIN(A, B) ((A) > (B) ? (B) : (A))
 36 | double fhpl(int n, double alpha, double beta) {
 37 |     // compute the inf-norm condition number of the matrix with alpha and beta
 38 |     // FHPL   Value of cond(A,inf) for matrix A(n,a,b).
 39 |     if (isnan(alpha))
 40 |         alpha = beta / 2;
 41 |     double a = alpha, b = beta;
 42 |     double lambda_1 = 1 + (n - 1) * b;
 43 |     int idash = MIN((int)floor(1. / a), n);
 44 |     int k = MIN((int)floor((1 + b) / b), n - 1);
 45 |     double lambda_idash =
 46 |         1 + (2 * k - idash + 1) * a + (n - idash) * b +
 47 |         (-k * k + k + 3 * idash * (idash - 1) / 2 - n * idash + n) * a * b;
 48 |     double lambda_n =
 49 |         1 + (2 * k - n + 1) * a + (-k * k + k + n * (n - 1) / 2) * a * b;
 50 |     double na_est = MAX(MAX(lambda_1, lambda_idash), lambda_n);
 51 |     double r = (1 + a) * (1 + b);
 52 |     int i = 1;
 53 |     double delta1 =
 54 |         (1 + a) *
 55 |         (1. / (1 + a) + (r == 0. ? 0. : b * (1 - pow(r, n - 1)) / (1 - r)));
 56 |     double deltan = pow(1 + a, n) * (1. / (1 + a));
 57 |     double ninva_est = MAX(delta1, deltan);
 58 |     // printf("Z %e %e %e %e %e %e %e %e %e\n", a, b, lambda_1, lambda_idash,
 59 |     // lambda_n, delta1, deltan, na_est, ninva_est);
 60 |     double ret = na_est * ninva_est;
 61 |     if (isinf(ret))
 62 |         return DBL_MAX;
 63 |     else
 64 |         return ret;
 65 | }
 66 | 
 67 | template <typename F> double zero_find(F f, double left, double right) {
 68 |     // bisection method
 69 |     // the brent method consumes half # of f evaluations, it's not good enought
 70 |     // for complication
 71 |     double fl = f(left);
 72 |     double fr = f(right);
 73 |     if (fl > 0. || fr < 0.)
 74 |         return 0. / 0.; // nan
 75 |     while (true) {
 76 |         double tol1 = (2. * fabs(right) + 0.5) * DBL_EPSILON;
 77 |         if (right - left < tol1)
 78 |             break;
 79 |         double middle = (left + right) / 2;
 80 |         if (middle == left || middle == right)
 81 |             break;
 82 |         double fm = f(middle);
 83 |         // printf("%e %e %e :: %e %e %e\n", left, middle, right, fl, fm, fr);
 84 |         if (fm == 0.)
 85 |             return middle;
 86 |         else if (fm < 0.) {
 87 |             left = middle;
 88 |             fl = fm;
 89 |         } else {
 90 |             right = middle;
 91 |             fr = fm;
 92 |         }
 93 |     }
 94 |     return right;
 95 | }
 96 | 
 97 | extern "C" double higham_mat_comp_beta(int n, double kappa, double rho) {
 98 |     // % Compute alpha and beta to give cond(A,inf) = kappa.
 99 |     double left = DBL_EPSILON;
100 |     double left_val = fhpl(n, rho * left, left) - kappa;
101 |     assert(left_val < 0.);
102 |     double right = 1. / rho;
103 |     int k = 1;
104 |     while (true) {
105 |         double right_val = fhpl(n, rho * right, right) - kappa;
106 |         if (isfinite(right_val) && right_val > 0.)
107 |             break;
108 |         // %fprintf('F at right endpoint, right = %9.2e, is %9.2e.\n', right,
109 |         // right_val)
110 |         right *= 0.5;
111 |         ++k;
112 |         if (k == 100)
113 |             break;
114 |     }
115 |     double beta = zero_find(
116 |         [=](double x) -> double { return fhpl(n, rho * x, x) - kappa; }, left,
117 |         right);
118 |     double alpha = rho * beta;
119 |     while (alpha > 1.) {
120 |         // fprintf('Initial alpha = %9.2e exceeds 1 so recomputing.\n', alpha)
121 |         right *= 0.5;
122 |         right = right / 2;
123 |         beta = zero_find(
124 |             [=](double x) -> double { return fhpl(n, rho * x, x) - kappa; },
125 |             left, right);
126 |         alpha = rho * beta;
127 |     }
128 |     return beta;
129 | }
130 | 
131 | extern "C" void hplai_matrix_impl(int n, double *a, int lda, double alpha,
132 |                                   double beta) {
133 |     for (int j = 0; j < n; ++j) {
134 |         for (int i = 0; i < j; ++i) {
135 |             a[j * lda + i] = -beta + alpha * beta * i;
136 |         }
137 |         a[j * lda + j] = 1. + alpha * beta * j;
138 |         for (int i = j + 1; i < n; ++i) {
139 |             a[j * lda + i] = -alpha + alpha * beta * j;
140 |         }
141 |     }
142 | }
143 | 
144 | extern "C" void hplai_matrix(int n, double *a, int lda, double kappa) {
145 |     double rho = 0.5;
146 |     double beta = higham_mat_comp_beta(n, kappa, rho);
147 |     double alpha = rho * beta;
148 |     hplai_matrix_impl(n, a, lda, alpha, beta);
149 | }
150 | 
151 | #if 0
152 | #include <stdio.h>
153 | int main()
154 | {
155 | 	int n = 10;
156 | 	double kappa = 1000;
157 | 	double rho = 0.125;
158 | 	for(int n=10; n<100000000; n=(n*3)/2){
159 | 		double beta = comp_beta(n, kappa, rho);
160 | 		printf("%d %e %e\n", n, beta, rho*beta*beta*n);
161 | 	}
162 | 	return 0;
163 | }
164 | #endif
165 | 


--------------------------------------------------------------------------------
/src/gpu_init.hpp:
--------------------------------------------------------------------------------
  1 | #ifndef GPU_INIT
  2 | #define GPU_INIT
  3 | #include <unistd.h>
  4 | #include <iostream>
  5 | #include <stdlib.h>
  6 | #include <assert.h>
  7 | #include <mpi.h>
  8 | 
  9 | #include "gpu_init_kernels.h"
 10 | #include "hpl_rand.hpp"
 11 | #include "panel.hpp"
 12 | 
 13 | //#include "device_macros.h"
 14 | 
 15 | using namespace std;
 16 | 
 17 | #if 0
 18 | template <typename F>
 19 | void gpu_pgen(int my_init, Matgen<double> &mg, Panels<F> &p, LRPanels<__half> *lr, int n, double *diag, double *rhs)
 20 | {
 21 |     F *pp_d = p(0, 0, 'd'), *pp = p(0, 0);
 22 |     long const nprow = p.nprow, npcol = p.npcol, b = p.b, i1 = p.i1, j1 = p.j1, istride = p.istride, jstride = p.jstride, p_m = nprow * b, p_n = npcol * b;
 23 |     size_t const p_size = (size_t)p_m * (size_t)p_n;
 24 |     int n_threads = 1024, blocksize_x = 32, work_per_thread = 32;
 25 |     double *local_row_sums, *local_row_sums_d = (double *)lr[0].d_p;
 26 | 
 27 |     // Generate matrix entries in [-0.5, 0.5] and compute local row sums and copy them to host
 28 |     if (my_init == 1)
 29 |     {
 30 |         fill_random(pp_d, p_m, p_n, n_threads, blocksize_x, work_per_thread);
 31 |         compute_row_sums(pp_d, local_row_sums_d, p_m, p_n, i1, j1, b, istride, jstride, n_threads, blocksize_x, work_per_thread);
 32 |     }
 33 |     else if (my_init == 3)
 34 |     {
 35 |         fill_random_fugaku(mg.n, RandStat::initialize(mg.seed), RandCoeff::default_vals(), pp_d, local_row_sums_d, p_m, p_n, i1, j1, b, istride, jstride, n_threads, blocksize_x, work_per_thread);
 36 |     }
 37 |     GPU_DEVICE_SYNCHRONIZE();
 38 |     checkGPU(GPU_MALLOC_HOST((void **)&local_row_sums, n * sizeof(double)),
 39 |             "<FATAL:%s> %s\n", hostRank, "Allocating host[cpu] local row sums");
 40 |     GPU_MEMCPY(local_row_sums, local_row_sums_d, n * sizeof(double), GPU_MEMCPY_DEVICE_TO_HOST);
 41 | 
 42 |     // Compute global row sums and copy them to device
 43 |     double *global_row_sums, *global_row_sums_d = (double *)lr[1].d_p;
 44 | 
 45 |     checkGPU(GPU_MALLOC_HOST((void **)&global_row_sums, n * sizeof(double)),
 46 |              "<FATAL:%s> %s\n", hostRank, "Allocating host[cpu] global row sums");
 47 |     MPI_Allreduce(local_row_sums, global_row_sums, n, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
 48 |     GPU_MEMCPY(global_row_sums_d, global_row_sums, n * sizeof(double), GPU_MEMCPY_HOST_TO_DEVICE);
 49 | 
 50 |     // Find diagonal blocks owned by the rank
 51 |     long j, j_step, local_row, local_col, global_row, diag_entry_idx;
 52 |     double diag_val;
 53 |     long n_diag_blocks = 0, *diag_i_steps, *diag_j_steps;
 54 |     checkGPU( GPU_MALLOC_MANAGED( &diag_i_steps, nprow * sizeof(long) ),
 55 | 	     "<FATAL:%s> %s\n", hostRank, "Allocating managed memory - diag_i_steps" );
 56 |     checkGPU( GPU_MALLOC_MANAGED( &diag_j_steps, npcol * sizeof(long) ),
 57 | 	     "<FATAL:%s> %s\n", hostRank, "Allocating managed memory - diag_j_steps" );
 58 | 
 59 |     for (long i_step=0; i_step < nprow; i_step++)
 60 |     {
 61 |         // j == i in diagonal blocks
 62 |         j = i1 + i_step * istride;
 63 |         j_step = (j - j1) / jstride;
 64 |         if ((j - j1) % jstride == 0 && j_step >= 0 && j_step < npcol)
 65 |         {
 66 |             // rank owns diagonal entries j*b through j*b + b - 1
 67 |             diag_i_steps[n_diag_blocks] = i_step;
 68 |             diag_j_steps[n_diag_blocks] = j_step;
 69 |             n_diag_blocks++;
 70 | 
 71 |         }
 72 |     }
 73 | 
 74 | 
 75 |     // If the rank owns any diagonal entries...
 76 |     if (n_diag_blocks)
 77 |     {
 78 |         // Fill in the diagonal and generate rhs
 79 |         long n_diag_entries = n_diag_blocks * b;
 80 |         double *rhs_d;
 81 |         
 82 |         checkGPU(GPU_MALLOC((void**)&rhs_d, n_diag_entries * sizeof(double)),
 83 |                  "<FATAL:%s> %s\n", hostRank, "Allocate device[gpu] rhs");
 84 |         if (my_init == 1)
 85 |         {
 86 |             fill_random(rhs_d, n_diag_entries, n_threads);        
 87 |         }
 88 |         
 89 |         fill_diag_rhs(my_init, mg.n, RandStat::initialize(mg.seed), RandCoeff::default_vals(), pp_d, global_row_sums_d, rhs, rhs_d, p_m, diag_i_steps, diag_j_steps, n_diag_blocks, i1, b, istride, n_threads);
 90 |         
 91 |         // Copy diag to host
 92 |         long i_step;
 93 |         for (long diag_block_idx = 0; diag_block_idx < n_diag_blocks; diag_block_idx++)
 94 |         {
 95 |             i_step = diag_i_steps[diag_block_idx];
 96 |             GPU_MEMCPY(diag + i_step*b, global_row_sums_d + (i1 + i_step*istride)*b, b * sizeof(double), GPU_MEMCPY_DEVICE_TO_HOST);
 97 |         }
 98 | 
 99 |         // Copy rhs to host
100 |         GPU_MEMCPY(rhs, rhs_d, n_diag_entries * sizeof(double), GPU_MEMCPY_DEVICE_TO_HOST);
101 | 
102 |     }
103 | 
104 |     // Copy panel to host
105 |     GPU_MEMCPY(pp, pp_d, p_size * sizeof(float), GPU_MEMCPY_DEVICE_TO_HOST);
106 | 
107 |     // Copy panel to permanent storage if using our generation
108 |     if (my_init == 1)
109 |     {
110 |         checkGPU(GPU_MALLOC_HOST((void **)&p.p_init, p.alloc_size),
111 |                  "<FATAL:%s> %s\n", hostRank, "Allocate host[cpu] p_init");
112 |         memcpy(p.p_init, pp, p_size * sizeof(F));
113 |     }
114 | }
115 | 
116 | template <typename F>
117 | void gpu_pgen2(Panels<F> &p, Grid& grid)
118 | {
119 | 	F *pp_d = p(0, 0, 'd'), *pp = p(0, 0);
120 | 	int b = p.b;
121 | 	fill_random2(p(0,0,'d') , p.nprow*b, p.npcol*b);
122 | 
123 | 	GPU_MEMCPY(p(0,0),p(0,0,'d'), (size_t)p.nprow* (size_t)b* (size_t)p.npcol*(size_t)b*sizeof(F),
124 | 					GPU_MEMCPY_DEVICE_TO_HOST);
125 | 	double *local_row_sums, *global_row_sums;
126 | 	checkGPU( GPU_MALLOC_MANAGED( (void**)&local_row_sums, b*sizeof(double) ),
127 | 	         "<FATAL:%s> %s\n", hostRank, "Allocate managed memory - local row sums" );
128 |    	checkGPU( GPU_MALLOC_MANAGED( (void**)&global_row_sums, b*sizeof(double) ),
129 | 	         "<FATAL:%s> %s\n", hostRank, "Allocate managed memory - global row sums" );
130 | 
131 | 	for (int k = 0; k < p.nblocks; ++k) {
132 | 		// position of the panels to decomp in process grid
133 | 		int const rootrow = k % grid.nrow;
134 | 		int const rootcol = k % grid.ncol;
135 | 		// position of the panels to decomp in local matrix
136 | 		int i = k / grid.nrow + (rootrow > grid.row ? 1 : 0);
137 | 		int j = k / grid.ncol + (rootcol > grid.col ? 1 : 0);
138 | 
139 | 		if (rootrow == grid.row && rootcol == grid.col) {
140 | 			compute_row_sums2(p(i,0,'d'),local_row_sums, b, p.npcol*b, p.lda);
141 | 			GPU_DEVICE_SYNCHRONIZE();
142 | 
143 | 			MPI_Allreduce(local_row_sums, global_row_sums, b, MPI_DOUBLE, MPI_SUM, grid.hcomm);
144 | 			pp = p(i,j);
145 | 			#pragma omp parallel for
146 | 			for (int i_step=0; i_step < b; i_step++)
147 |     		{
148 | 				pp[ i_step + i_step*p.lda]= static_cast<F>(global_row_sums[i_step]);
149 | 			}
150 | 		}
151 | 		else if(rootrow != grid.row)
152 | 		{
153 | 			//chill
154 | 		}
155 | 		else if(rootrow == grid.row)
156 | 		{
157 | 			compute_row_sums2(p(i,0,'d'),local_row_sums, b, p.npcol*b, p.lda);
158 | 			GPU_DEVICE_SYNCHRONIZE();
159 | 			MPI_Allreduce(local_row_sums, global_row_sums, b, MPI_DOUBLE, MPI_SUM, grid.hcomm);
160 | 		}
161 | 	}
162 | 	GPU_FREE(local_row_sums);
163 | 	GPU_FREE(global_row_sums);
164 | }
165 | #endif
166 | 
167 | #endif
168 | 


--------------------------------------------------------------------------------
/src/panel_norm.hpp:
--------------------------------------------------------------------------------
  1 | #ifndef PANEL_NORM_HPP
  2 | #define PANEL_NORM_HPP
  3 | #include "grid.hpp"
  4 | #include "highammgen.hpp"
  5 | #include "panel.hpp"
  6 | #include <mpi.h>
  7 | #define NORM_THREADS 1024
  8 | //#include <thrust/extrema.h>
  9 | //#include <thrust/execution_policy.h>
 10 | 
 11 | __global__ void calc_infnorm_d(int b, double* __restrict__ x, double* __restrict__ result) {
 12 |     __shared__ double sdata[NORM_THREADS];
 13 |     
 14 |     size_t id = threadIdx.x;
 15 |     sdata[id] = 0.0;
 16 | 
 17 |     for (int j = 0; j + id < b; j += NORM_THREADS) 
 18 |     {
 19 |         sdata[id] = fmax(sdata[id], fabs(x[id+j]));
 20 |     }
 21 |     __syncthreads();
 22 | 
 23 |     for (unsigned int s = NORM_THREADS / 2; s > 0; s >>= 1) {
 24 |         if (id < s) {
 25 |             sdata[id] = fmax(sdata[id], sdata[id + s]);
 26 |         }
 27 |         __syncthreads();
 28 |     }
 29 | 
 30 |     if (id == 0) {
 31 |         result[id] = sdata[0];
 32 |     }
 33 | }
 34 | 
 35 | template <typename F>
 36 | double colv_infnorm_h(Panels<F> const &p, double *dx, Grid &g, double* workspace) {
 37 |     // computes the inf-norm of the distributed column vector dx.
 38 |     // descriptros are derived from p
 39 |     int nprow = p.nprow;
 40 |     int b = p.b;
 41 |     int i1 = p.i1;
 42 |     int j1 = p.j1;
 43 |     int istride = p.istride;
 44 |     int jstride = p.jstride;
 45 |     double norm = 0.;
 46 |     double t = 0;
 47 |     for (int i = 0; i < nprow; ++i) {
 48 |         int ipos = i1 + i * istride;
 49 |         if ((ipos % jstride) == j1) {
 50 |             calc_infnorm_d<<<1,NORM_THREADS>>>(b, dx + b * i, workspace);
 51 |             GPU_DEVICE_SYNCHRONIZE();
 52 |             GPU_MEMCPY(&t, workspace, sizeof(double), GPU_MEMCPY_DEVICE_TO_HOST);
 53 |             norm = norm >= t ? norm : t;
 54 |         }
 55 |     }
 56 |     MPI_Allreduce(MPI_IN_PLACE, &norm, 1, MPI_DOUBLE, MPI_MAX, g.commworld);
 57 |     return norm;
 58 | }
 59 | 
 60 | template <typename F> F calc_infnorm(int n, F const *x) {
 61 |     F norm = static_cast<F>(0);
 62 | #pragma omp parallel for simd reduction(max : norm)
 63 |     for (int i = 0; i < n; ++i) {
 64 |         F t = x[i];
 65 |         t = (t >= static_cast<F>(0) ? t : -t);
 66 |         norm = (norm >= t ? norm : t);
 67 |     }
 68 |     return norm;
 69 | }
 70 | template <typename F>
 71 | double colv_infnorm(Panels<F> const &p, double *dx, Grid &g) {
 72 |     // computes the inf-norm of the distributed column vector dx.
 73 |     // descriptros are derived from p
 74 |     int nprow = p.nprow;
 75 |     int b = p.b;
 76 |     int i1 = p.i1;
 77 |     int j1 = p.j1;
 78 |     int istride = p.istride;
 79 |     int jstride = p.jstride;
 80 |     double norm = 0.;
 81 |     for (int i = 0; i < nprow; ++i) {
 82 |         int ipos = i1 + i * istride;
 83 |         if ((ipos % jstride) == j1) {
 84 |             double t = calc_infnorm(b, dx + b * i);
 85 |             norm = norm >= t ? norm : t;
 86 |         }
 87 |     }
 88 |     MPI_Allreduce(MPI_IN_PLACE, &norm, 1, MPI_DOUBLE, MPI_MAX, g.commworld);
 89 |     return norm;
 90 | }
 91 | 
 92 | template <typename FPanel>
 93 | double panel_infnorm(Matgen<double> const &mg, Panels<FPanel> const &p,
 94 |                      double *w, double *piv, Grid &g) {
 95 |     // compute the inf-norm of the matrix.
 96 |     // w and piv are working buffer.
 97 | 
 98 |     // matrix inf-norm is the inf-norm of the row 1-norms.
 99 |     int b = p.b;
100 |     int i1 = p.i1;
101 |     int j1 = p.j1;
102 |     int istride = p.istride;
103 |     int jstride = p.jstride;
104 |     int nprow = p.nprow;
105 |     int npcol = p.npcol;
106 |     for (int i = 0; i < b * nprow; ++i)
107 |         w[i] = 0.;
108 |     for (int j = 0; j < npcol; ++j) {
109 |         int jpos = j1 + j * jstride;
110 |         for (int i = 0; i < nprow; ++i) {
111 |             int ipos = i1 + i * istride;
112 |             fill_one_panel_with_rand(mg.n, b * ipos, b * jpos, b, b, piv, b,
113 |                                      mg.seed, true);
114 |             for (int jj = 0; jj < b; ++jj)
115 |                 for (int ii = 0; ii < b; ++ii) {
116 |                     double t = piv[jj * b + ii];
117 |                     w[b * i + ii] += (t < 0. ? -t : t);
118 |                 }
119 |         }
120 |     }
121 |     MPI_Allreduce(MPI_IN_PLACE, w, b * nprow, MPI_DOUBLE, MPI_SUM, g.hcomm);
122 |     double norm = 0.;
123 |     for (int i = 0; i < b * nprow; ++i)
124 |         norm = (norm >= w[i] ? norm : w[i]);
125 |     MPI_Allreduce(MPI_IN_PLACE, &norm, 1, MPI_DOUBLE, MPI_MAX, g.vcomm);
126 |     return norm;
127 | }
128 | 
129 | template <typename F>
130 | double hpl_infnorm(Panels<F> const &p, double *d, Grid &g) {
131 |     // the diagonal of the hpl-ai matrix is the sum of the absolute values of
132 |     // the off-diagonals on the same row. therefore, twice of the diagonal is
133 |     // the l1-norm of that row.
134 |     return 2. * colv_infnorm(p, d, g);
135 | }
136 | 
137 | template <typename FPanel>
138 | double higham_infnorm(HMGen<double> const &mg, Panels<FPanel> const &p,
139 |                       double *w, Grid &g) {
140 |     int b = p.b;
141 |     int i1 = p.i1;
142 |     int j1 = p.j1;
143 |     int istride = p.istride;
144 |     int jstride = p.jstride;
145 |     int nprow = p.nprow;
146 |     int npcol = p.npcol;
147 |     double alpha = mg.alpha;
148 |     double beta = mg.beta;
149 |     double ab = alpha * beta;
150 |     for (int i = 0; i < b * nprow; ++i)
151 |         w[i] = 0.;
152 | #pragma omp parallel for
153 |     for (int i = 0; i < nprow; ++i) {
154 |         int ipos = i1 + i * istride;
155 |         for (int j = 0; j < npcol; ++j) {
156 |             int jpos = j1 + j * jstride;
157 |             if (ipos == jpos) {
158 |                 for (int jj = 0; jj < b; ++jj) {
159 |                     for (int ii = 0; ii < jj; ++ii) {
160 |                         double aij = beta + ab * (b * ipos + ii);
161 |                         w[b * i + ii] += aij;
162 |                     }
163 |                     w[b * j + jj] += 1. + ab * (b * jpos + jj);
164 |                     for (int ii = jj + 1; ii < b; ++ii) {
165 |                         double aij = alpha + ab * (b * jpos + jj);
166 |                         w[b * i + ii] += aij;
167 |                     }
168 |                 }
169 |             } else if (ipos < jpos) {
170 |                 for (int jj = 0; jj < b; ++jj) {
171 |                     for (int ii = 0; ii < b; ++ii) {
172 |                         double aij = beta + ab * (b * ipos + ii);
173 |                         w[b * i + ii] += aij;
174 |                     }
175 |                 }
176 |             } else {
177 |                 for (int jj = 0; jj < b; ++jj) {
178 |                     for (int ii = 0; ii < b; ++ii) {
179 |                         double aij = alpha + ab * (b * jpos + jj);
180 |                         w[b * i + ii] += aij;
181 |                     }
182 |                 }
183 |             }
184 |         }
185 |     }
186 |     MPI_Allreduce(MPI_IN_PLACE, w, b * nprow, MPI_DOUBLE, MPI_SUM, g.hcomm);
187 |     double norm = 0.;
188 |     for (int i = 0; i < b * nprow; ++i)
189 |         norm = (norm >= w[i] ? norm : w[i]);
190 |     MPI_Allreduce(MPI_IN_PLACE, &norm, 1, MPI_DOUBLE, MPI_MAX, g.vcomm);
191 |     return norm;
192 | }
193 | 
194 | #endif
195 | 


--------------------------------------------------------------------------------
/src/CMakeLists.txt:
--------------------------------------------------------------------------------
  1 | EXECUTE_PROCESS( COMMAND uname -m COMMAND tr -d '\n' OUTPUT_VARIABLE ARCHITECTURE )
  2 | message( STATUS "Building HPL-AI for Architecture: ${ARCHITECTURE}" )
  3 | 
  4 | # FRONTIER / SPOCK language and definitions
  5 | if( ${ARCHITECTURE} STREQUAL "x86_64" )
  6 | 
  7 |     # Requires atleast `module load cmake/3.21.2-dev``
  8 |     cmake_minimum_required(VERSION 3.21 FATAL_ERROR)
  9 |     set(ACCEL_COMPILER HIP)
 10 |     add_compile_definitions(ROCM_OLCF_PLATFORM)
 11 | 
 12 | # SUMMIT language and definitions
 13 | elseif( ${ARCHITECTURE} STREQUAL "ppc64le")
 14 |     cmake_minimum_required(VERSION 3.20 FATAL_ERROR)
 15 |     set(ACCEL_COMPILER CUDA)
 16 |     add_compile_definitions(CUDA_OLCF_PLATFORM)
 17 | 
 18 | endif()
 19 | 
 20 | # enable CUDA or HIP
 21 | enable_language(${ACCEL_COMPILER})
 22 | project(openMxP LANGUAGES CXX)
 23 | 
 24 | # FRONTIER / SPOCK packages
 25 | if( ${ARCHITECTURE} STREQUAL "x86_64" )
 26 |     set(HIP_PATH $ENV{HIP_PATH} CACHE PATH "PATH to which HIP has been installed")
 27 |     set(ROCM_PATH $ENV{ROCM_PATH} CACHE PATH "PATH to which ROCM has been installed")
 28 |     set(CRAY_MPICH_DIR $ENV{CRAY_MPICH_DIR} CACHE PATH "PATH to Cray MPICH ROOT")
 29 | 
 30 |     set( GPU_TARGETS "gfx90a" CACHE STRING "GPU TARGETS" )
 31 |     set( AMPGPU_TARGETS "gfx90a" CACHE STRING "GPU TARGETS" )
 32 |     set(CMAKE_MODULE_PATH "${HIP_PATH}/cmake" ${CMAKE_MODULE_PATH})
 33 |     find_package(HIP REQUIRED)
 34 |     #    find_package(MPI REQUIRED)
 35 |     find_package(rocblas REQUIRED)
 36 |     find_package(rocsolver REQUIRED)
 37 |     find_package(rocrand REQUIRED)
 38 |     find_package(hiprand REQUIRED)
 39 |     find_package(OpenMP REQUIRED)
 40 | 
 41 |     # For GPU Direct on AMD must export these variables:
 42 |     # export MPIR_CVAR_GPU_EAGER_DEVICE_MEM=0
 43 |     # export MPICH_GPU_SUPPORT_ENABLED=1
 44 |     find_library(GTL mpi_gtl_hsa /opt/cray/pe/mpich/default/gtl/lib)
 45 | 
 46 |     if(HIP_FOUND)
 47 |         message(STATUS "Found HIP: " ${HIP_VERSION})
 48 |     else()
 49 |         message(FATAL_ERROR "Could not find HIP. Ensure that HIP is either installed in /opt/rocm-4.x.x/hip or the variable HIP_PATH is set to point to the right location.")
 50 |     endif()
 51 | 
 52 |     #    find_library(GTL
 53 |     #        NAMES mpi_gtl_hsa
 54 |     #        PATHS ${CRAY_MPICH_ROOTDIR}/gtl/lib
 55 |     #        REQUIRED)
 56 |     #file(GLOB HIP_CLANGRT_LIB_SEARCH_PATHS "${CMAKE_HIP_COMPILER}/../lib/clang/*/lib/*")
 57 | #   file(GLOB HIP_CLANGRT_LIB_SEARCH_PATHS "/opt/rocm-5.0.2/llvm/lib/clang/14.0.0/lib/linux/*" )
 58 |     find_library(CLANGRT_BUILTINS clang_rt.builtins-x86_64 /opt/rocm-5.0.2/llvm/lib/clang/14.0.0/lib/linux )
 59 | 
 60 | # SUMMIT packages and compiling
 61 | elseif( ${ARCHITECTURE} STREQUAL "ppc64le")
 62 |     if (NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
 63 |         set(CMAKE_CUDA_ARCHITECTURES 70)
 64 |     endif()
 65 | 
 66 |     # ensure atomicAdd() is implemented for FP64
 67 |     if(CMAKE_CUDA_ARCHITECTURES LESS 60)
 68 |         set(CMAKE_CUDA_ARCHITECTURES 60)
 69 |     endif()
 70 | 
 71 |     find_package(CUDA 11.4 REQUIRED)
 72 |     find_package(MPI)
 73 | 
 74 |     set(BLA_VENDOR OpenBLAS)
 75 |     find_package(BLAS)
 76 | 
 77 | endif()
 78 | 
 79 | 
 80 | # NON-ACCELERATOR SPECIFIC
 81 | 
 82 | # c++ standard
 83 | set(CMAKE_CXX_STANDARD 17)
 84 | set(CMAKE_CXX_STANDARD_REQUIRED ON)
 85 | 
 86 | # gcc version check
 87 | if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
 88 |     if (CMAKE_CXX_COMPILER_VERSION VERSION_LESS "6.4.0")
 89 |          message(FATAL_ERROR "Insufficient gcc version, require 6.4.0+")
 90 |     endif()
 91 | endif()
 92 | 
 93 | 
 94 | # Get the current working branch
 95 | execute_process(
 96 |     COMMAND git rev-parse --abbrev-ref HEAD
 97 |     WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}
 98 |     OUTPUT_VARIABLE GIT_BRANCH
 99 |     OUTPUT_STRIP_TRAILING_WHITESPACE)
100 | 
101 | # Get the latest commit hash
102 | execute_process(
103 |     COMMAND git rev-parse HEAD
104 |     WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}
105 |     OUTPUT_VARIABLE GIT_COMMIT_HASH
106 |     OUTPUT_STRIP_TRAILING_WHITESPACE)
107 | 
108 | 
109 | # disable in source build
110 | if ( ${CMAKE_SOURCE_DIR} STREQUAL ${CMAKE_BINARY_DIR}  )
111 |         file(REMOVE test.txt)
112 |         message( FATAL_ERROR "In-source builds not allowed. Please make a new
113 |         directory (called a build directory) and run CMake from there. You may
114 |         need to remove CMakeCache.txt."  )
115 | endif()
116 | 
117 | 
118 | # add other sources
119 | #add_subdirectory(externals)
120 | 
121 | set(HPL_AI_SRCS
122 |       main.cpp
123 |       higham_mat_impl.cpp
124 |       otf_gemv.cpp
125 |       sgetrf_nopiv.cpp
126 |     )
127 | 
128 | add_definitions(-DEXTERNAL_CONV -DOTF_GEMV_OPTIMIZED -DNO_WARN_X86_INTRINSICS)
129 | 
130 | # FRONTIER / SPOCK build and linking
131 | if( ${ARCHITECTURE} STREQUAL "x86_64" )
132 | 
133 |     set(GPU_SRCS
134 |         gpu_init_kernels.cpp
135 |         fp16_gpu_kernels.cpp
136 |     )
137 | 
138 |     set_source_files_properties(${HPL_AI_SRCS} ${GPU_SRCS} PROPERTIES HIP_SOURCE_PROPERTY_FORMAT 1)
139 |     hip_add_executable(OpenMxP ${HPL_AI_SRCS} ${GPU_SRCS})
140 | 
141 |     set(CRAY_LIBSCI_PREFIX_DIR $ENV{CRAY_LIBSCI_PREFIX_DIR} CACHE PATH "Path to the libsci libraries")
142 | 
143 |     message(STATUS "Using libsci BLAS libraries in ${CRAY_LIBSCI_PREFIX_DIR}/lib ")
144 |     target_include_directories(OpenMxP PUBLIC
145 |         #        ${MPI_CXX_INCLUDE_DIRS}
146 |         ${CRAY_MPICH_DIR}/include
147 |         ${ROCM_PATH}/include
148 |         ${ROCM_PATH}/hiprand/include
149 |         ${ROCM_PATH}/rocrand/include
150 | 	${ROCM_PATH}/rocthrust/include
151 |         )
152 | 
153 | 
154 |     target_link_directories(OpenMxP PUBLIC
155 |         ${ROCM_PATH}/lib
156 | #       ${CRAY_LIBSCI_PREFIX_DIR}/lib
157 |         ${CRAY_MPICH_DIR}/lib
158 |         )
159 | 
160 |     target_link_libraries(OpenMxP
161 |         #        ${MPI_CXX_LIBRARIES}
162 | #CCE
163 |         #mpi_cray
164 | #GCC
165 |         mpi
166 |         ${GTL}
167 |         amdhip64
168 |         rocblas
169 |         rocsolver
170 |         hiprand
171 |         hsa-runtime64
172 | #       sci_cray_mp hugetlbfs
173 |         ${OpenMP_CXX_LIBRARIES}
174 | #       spdlog cli11
175 |         )
176 | 
177 |     # Set HIP target architecture for SPOCK MI100 GPUs (May need to change for MI200s)
178 |     #    set_property(TARGET driver PROPERTY HIP_ARCHITECTURES gfx908)
179 | #   set_property(TARGET driver PROPERTY HIP_ARCHITECTURES gfx908)
180 |     set_property(TARGET OpenMxP PROPERTY HIP_ARCHITECTURES gfx90a)
181 | #   set( AMDGPU_TARGETS gfx90a )
182 |     set_property(TARGET OpenMxP PROPERTY CMAKE_HIP_ARCHITECTURES gfx90a)
183 |     target_compile_options(OpenMxP PUBLIC -std=c++14 -O3 )
184 | 
185 | 
186 | # SUMMIT build and linking
187 | elseif( ${ARCHITECTURE} STREQUAL "ppc64le")
188 | 
189 |     set(GPU_SRCS
190 |         gpu_init_kernels.cu
191 |         fp16_gpu_kernels.cu
192 |     )
193 | 
194 |     set_source_files_properties(GPU_SRCS PROPERTIES LANGUAGE CUDA)
195 | 
196 |     add_executable(OpenMxP ${HPL_AI_SRCS} ${GPU_SRCS})
197 | 
198 |     target_include_directories(OpenMxP PUBLIC
199 |         ${MPI_INCLUDE_PATH})
200 | 
201 |     target_link_libraries(OpenMxP
202 |       ${MPI_LIBRARIES}
203 |       ${CUDA_LIBRARIES}
204 |       ${CUDA_CUBLAS_LIBRARIES}
205 |       ${CUDA_cusolver_LIBRARY}
206 |       ${BLAS_LIBRARIES}
207 |       ${CUDA_curand_LIBRARY}
208 |       spdlog cli11
209 |       )
210 | 
211 |     target_link_options(OpenMxP PUBLIC "-fopenmp")
212 |     target_link_options(OpenMxP PUBLIC "-mcpu=native")
213 |     target_compile_options(OpenMxP PUBLIC  -W -Wall -fopenmp -mcpu=native -std=c++14 -O3)
214 | 
215 | endif()
216 | 
217 | 
218 | target_compile_definitions(OpenMxP PRIVATE
219 |     "-DGIT_COMMIT_HASH=\"${GIT_COMMIT_HASH}\"")
220 | 
221 | target_compile_definitions(OpenMxP PRIVATE
222 |     "-DGIT_BRANCH=\"${GIT_BRANCH}\"")
223 | 
224 | set_target_properties(OpenMxP
225 |   PROPERTIES
226 |     OUTPUT_NAME "OpenMxP."
227 |     SUFFIX ${CMAKE_HOST_SYSTEM_PROCESSOR}
228 | )
229 | 


--------------------------------------------------------------------------------
/src/cuda_device_macros.h:
--------------------------------------------------------------------------------
  1 | 
  2 | #ifndef __HPLAI_CUDA_DEVICE_MACROS__
  3 | #define __HPLAI_CUDA_DEVICE_MACROS__
  4 | 
  5 | #include <cuda.h>
  6 | #include <cuda_runtime.h>
  7 | #include <cublas_v2.h>
  8 | #include <cusolverDn.h>
  9 | #include <curand.h>
 10 | #include <curand_kernel.h>
 11 | #include <cuda_fp16.h>
 12 | 
 13 | // *** BASIC CUDA MACROS ***
 14 | // Kernel Macros
 15 | #define GPU_BLOCKIDX_X \
 16 |     blockIdx.x
 17 | 
 18 | #define GPU_BLOCKIDX_Y \
 19 |     blockIdx.y
 20 | 
 21 | #define GPU_BLOCKIDX_Z \
 22 |     blockIdx.z
 23 | 
 24 | 
 25 | #define GPU_THREADIDX_X \
 26 |     threadIdx.x
 27 | 
 28 | #define GPU_THREADIDX_Y \
 29 |     threadIdx.y
 30 | 
 31 | #define GPU_THREADIDX_Z \
 32 |     threadIdx.z
 33 | 
 34 | 
 35 | #define GPU_BLOCKDIM_X \
 36 |     blockDim.x
 37 | 
 38 | #define GPU_BLOCKDIM_Y \
 39 |     blockDim.y
 40 | 
 41 | #define GPU_BLOCKDIM_Z \
 42 |     blockDim.z
 43 | 
 44 | 
 45 | #define GPU_GRIDDIM_X \
 46 |     gridDim.x
 47 | 
 48 | #define GPU_GRIDDIM_Y \
 49 |     gridDim.y
 50 | 
 51 | #define GPU_GRIDDIM_Z \
 52 |     gridDim.z
 53 | 
 54 | 
 55 | // Types
 56 | #define GPU_ERROR_T \
 57 |     cudaError_t
 58 | 
 59 | #define GPU_STREAM_T \
 60 |     cudaStream_t
 61 | 
 62 | 
 63 | // Enums
 64 | #define GPU_SUCCESS \
 65 |     cudaSuccess
 66 | 
 67 | #define GPU_STREAM_NON_BLOCKING \
 68 |     cudaStreamNonBlocking
 69 | 
 70 | #define GPU_R_16F \
 71 |     CUDA_R_16F
 72 | 
 73 | #define GPU_R_32F \
 74 |     CUDA_R_32F
 75 | 
 76 | #define GPU_MEMCPY_DEVICE_TO_HOST \
 77 |     cudaMemcpyDeviceToHost
 78 | 
 79 | #define GPU_MEMCPY_HOST_TO_DEVICE \
 80 |     cudaMemcpyHostToDevice
 81 | 
 82 | // Kernels
 83 | #define GPU_DEVICE_RESET() \
 84 |     cudaDeviceReset()
 85 | 
 86 | #define GPU_SET_DEVICE(deviceID) \
 87 |     cudaSetDevice(deviceID)
 88 | 
 89 | #define GPU_DEVICE_SYNCHRONIZE() \
 90 |     cudaDeviceSynchronize()
 91 | 
 92 | #define GPU_FREE(memPointer) \
 93 |     cudaFree(memPointer)
 94 | 
 95 | #define GPU_FREE_HOST(memPointer) \
 96 |     cudaFreeHost(memPointer)
 97 | 
 98 | #define GPU_GET_ERROR_STRING(cudaError) \
 99 |     cudaGetErrorString(cudaError)
100 | 
101 | #define GPU_GET_LAST_ERROR() \
102 |     cudaGetLastError()
103 | 
104 | #define GPU_MALLOC(memAddress, numBytes) \
105 |     cudaMalloc(memAddress, numBytes)
106 | 
107 | #define GPU_MALLOC_HOST(memAddress, numBytes) \
108 |     cudaMallocHost(memAddress, numBytes)
109 | 
110 | #define GPU_MALLOC_MANAGED(memAddress, numBytes) \
111 |     cudaMallocManaged(memAddress, numBytes)
112 | 
113 | #define GPU_MEMCPY(memPointer_to, memPointer_from, numBytes, directionEnum) \
114 |     cudaMemcpy(memPointer_to, memPointer_from, numBytes, directionEnum)
115 | 
116 | #define GPU_MEMCPY_2D(memPointer_to, pitchBytes_to, memPointer_from, pitchBytes_from, numBytes_W, numBytes_H, directionEnum) \
117 |     cudaMemcpy2D(memPointer_to, pitchBytes_to, memPointer_from, pitchBytes_from, numBytes_W, numBytes_H, directionEnum)
118 | 
119 | #define GPU_MEMCPY_DEVICE_TO_DEVICE \
120 |     cudaMemcpyDeviceToDevice
121 | 
122 | #define GPU_MEM_GET_INFO(freeMem, totalMem) \
123 |     cudaMemGetInfo(freeMem, totalMem)
124 | 
125 | #define GPU_STREAM_CREATE_WITH_FLAGS(cudaStream, streamTypeEnum) \
126 |     cudaStreamCreateWithFlags(cudaStream, streamTypeEnum)
127 | 
128 | 
129 | 
130 | 
131 | // *** CUBLAS MACROS ***
132 | // Types
133 | #define GPUBLAS_HANDLE_T \
134 |     cublasHandle_t
135 | 
136 | #define GPUBLAS_STATUS_T \
137 |     cublasStatus_t
138 | 
139 | // Enums
140 | #define GPUBLAS_STATUS_SUCCESS \
141 |     CUBLAS_STATUS_SUCCESS
142 | 
143 | #define GPUBLAS_STATUS_NOT_INITIALIZED \
144 |     CUBLAS_STATUS_NOT_INITIALIZED
145 | 
146 | #define GPUBLAS_STATUS_ALLOC_FAILED \
147 |     CUBLAS_STATUS_ALLOC_FAILED
148 | 
149 | #define GPUBLAS_STATUS_INVALID_VALUE \
150 |     CUBLAS_STATUS_INVALID_VALUE
151 | 
152 | #define GPUBLAS_STATUS_ARCH_MISMATCH \
153 |     CUBLAS_STATUS_ARCH_MISMATCH
154 | 
155 | #define GPUBLAS_STATUS_MAPPING_ERROR \
156 |     CUBLAS_STATUS_MAPPING_ERROR
157 | 
158 | #define GPUBLAS_STATUS_EXECUTION_FAILED \
159 |     CUBLAS_STATUS_EXECUTION_FAILED
160 | 
161 | #define GPUBLAS_STATUS_INTERNAL_ERROR \
162 |     CUBLAS_STATUS_INTERNAL_ERROR
163 | 
164 | #define GPUBLAS_OP_N \
165 |     CUBLAS_OP_N
166 | 
167 | #define GPUBLAS_OP_T \
168 |     CUBLAS_OP_T
169 | 
170 | #define GPUBLAS_SIDE_RIGHT \
171 |     CUBLAS_SIDE_RIGHT
172 | 
173 | #define GPUBLAS_SIDE_LEFT \
174 |     CUBLAS_SIDE_LEFT
175 | 
176 | #define GPUBLAS_FILL_MODE_UPPER \
177 |     CUBLAS_FILL_MODE_UPPER
178 | 
179 | #define GPUBLAS_FILL_MODE_LOWER \
180 |     CUBLAS_FILL_MODE_LOWER
181 | 
182 | #define GPUBLAS_DIAG_UNIT \
183 |     CUBLAS_DIAG_UNIT
184 | 
185 | #define GPUBLAS_DIAG_NON_UNIT \
186 |     CUBLAS_DIAG_NON_UNIT
187 | 
188 | // Kernels
189 | #define GPUBLAS_CREATE(cublasHandle) \
190 |     cublasCreate_v2(cublasHandle)
191 | 
192 | #define GPUBLAS_SET_STREAM(cublasHandle, cudaStream) \
193 |     cublasSetStream(cublasHandle, cudaStream)
194 | 
195 | #define GPUBLAS_SGETRF_BATCHED(cublasHandle, N_dim, memPointer_A, lda, memPointer_Pivot, memPointer_Info, batchSize) \
196 |     cublasSgetrfBatched(cublasHandle, N_dim, memPointer_A, lda, memPointer_Pivot, memPointer_Info, batchSize)
197 | 
198 | #define GPUBLAS_STRSM(cublasHandle, cuSide, cuFill, cuOp, cuDiag, M_dim, N_dim, alpha, memPointer_A, lda, memPointer_B, ldb) \
199 |     cublasStrsm(cublasHandle, cuSide, cuFill, cuOp, cuDiag, M_dim, N_dim, alpha, memPointer_A, lda, memPointer_B, ldb)
200 | 
201 | // Non-Simple Kernels
202 | //#define GPUBLAS_GET_ERROR_STRING(cublasStatus) \
203 | //    cublasGetErrorString(cublasStatus)
204 | 
205 | #define GPUBLAS_SGEMM_EX(cublasHandle, cuOp_A, cuOp_B, M_dim, N_dim, k_dim, alpha, memPointer_A, datatype_A, lda, memPointer_B, datatype_B, ldb, beta, memPointer_C, datatype_C, ldc) \
206 |     cublasSgemmEx(cublasHandle, cuOp_A, cuOp_B, M_dim, N_dim, k_dim, alpha, memPointer_A, datatype_A, lda, memPointer_B, datatype_B, ldb, beta, memPointer_C, datatype_C, ldc)
207 | 
208 | 
209 | 
210 | // *** CUSOLVER MACROS ***
211 | // Types
212 | #define GPUSOLVER_HANDLE_T \
213 |     cusolverDnHandle_t
214 | 
215 | #define GPUSOLVER_STATUS_T \
216 |     cusolverStatus_t
217 | 
218 | 
219 | // Enums
220 | #define GPUSOLVER_STATUS_SUCCESS \
221 |     CUSOLVER_STATUS_SUCCESS
222 | 
223 | 
224 | // Kernels
225 | #define GPUSOLVER_CREATE(cusolverDnHandle) \
226 |     cusolverDnCreate(cusolverDnHandle)
227 | 
228 | #define GPUSOLVER_SGETRF(cusolverDnHandle, M_dim, N_dim, memPointer, lda, memAddress_Buffer, memPointer_Pivot, memPointer_Info) \
229 |     cusolverDnSgetrf(cusolverDnHandle, M_dim, N_dim, memPointer, lda, memAddress_Buffer, memPointer_Pivot, memPointer_Info)
230 | 
231 | //#define GPUSOLVER_SGETRF_BUFFERSIZE(cusolverDnHandle, M_dim, N_dim, memPointer, lda, memAddress_Buffer, memPointer_Pivot, memPointer_Info) \
232 | //    cusolverDnSgetrf_bufferSize(cusolverDnHandle, M_dim, N_dim, memPointer, lda, memAddress_Buffer, memPointer_Pivot, memPointer_Info)
233 | 
234 | #define GPUSOLVER_SGETRF_BUFFERSIZE(cusolverDnHandle, M_dim, N_dim, memPointer, lda, memAddress_Buffer) \
235 |     cusolverDnSgetrf_bufferSize(cusolverDnHandle, M_dim, N_dim, memPointer, lda, memAddress_Buffer)
236 | 
237 | #define GPUSOLVER_SET_STREAM(cusolverDnHandle, cudaStream) \
238 |     cusolverDnSetStream(cusolverDnHandle, cudaStream)
239 | 
240 | // *** CURAND MACROS ***
241 | // Types
242 | #define GPURAND_GENERATOR_T \
243 |     curandGenerator_t
244 | 
245 | 
246 | // Enums
247 | #define GPURAND_RNG_PSEUDO_DEFAULT \
248 |     CURAND_RNG_PSEUDO_DEFAULT
249 | 
250 | 
251 | // Kernels
252 | #define GPURAND_CREATE_GENERATOR(curandGenerator, curandRngType) \
253 |     curandCreateGenerator(curandGenerator, curandRngType)
254 | 
255 | #define GPURAND_DESTROY_GENERATOR(curandGenerator) \
256 |     curandDestroyGenerator(curandGenerator)
257 | 
258 | #define GPURAND_GENERATE_UNIFORM(curandGenerator, memPointer, numBytes) \
259 |     curandGenerateUniform(curandGenerator, memPointer, numBytes)
260 | 
261 | #define GPURAND_GENERATE_UNIFORM_DOUBLE(curandGenerator, memPointer, numBytes) \
262 |     curandGenerateUniformDouble(curandGenerator, memPointer, numBytes)
263 | 
264 | #define GPURAND_SET_PSEUDO_RANDOM_GENERATOR_SEED(curandGenerator, seed) \
265 |     curandSetPseudoRandomGeneratorSeed(curandGenerator, seed)
266 | 
267 | 
268 | 
269 | #endif // __HPLAI_CUDA_DEVICE_MACROS__
270 | 


--------------------------------------------------------------------------------
/src/iterative_refinement.hpp:
--------------------------------------------------------------------------------
  1 | #ifndef ITERATIVE_REFINEMENT_HPP
  2 | #define ITERATIVE_REFINEMENT_HPP
  3 | #include "grid.hpp"
  4 | #include "hpl_rand.hpp"
  5 | #include "panel.hpp"
  6 | #include "panel_gemv.hpp"
  7 | #include "panel_norm.hpp"
  8 | #include "panel_trsv.hpp"
  9 | #include "timer.hpp"
 10 | #include <cfloat>
 11 | #include <cstdio>
 12 | 
 13 | struct IRErrors {
 14 |     double residual;
 15 |     double hpl_harness;
 16 | };
 17 | 
 18 | template <typename FPanel, template <class> class Matgen>
 19 | void iterative_tester(int my_init, Panels<FPanel> &p, Matgen<double> &mg,
 20 |                               double *x, double *w, size_t ldv, double *rhs,
 21 |                               double norma, double normb, int maxit,
 22 |                               Grid &grid) {
 23 |     int const nb = p.nblocks;
 24 |     int const n = nb * p.b;
 25 |     size_t wlen = ldv * 10 + 2 * p.b * p.b;
 26 |     double *r = NULL;
 27 |     double *v = NULL;
 28 |     double *vw1_dev;
 29 |     FPanel *vw2_dev;
 30 |     double *dx_dev;
 31 |     double *testing_r = static_cast<double*>(std::malloc(sizeof(double) * ldv));
 32 |     r = w;
 33 |     v = w + ldv;
 34 |     double normr=0;
 35 | 
 36 |     size_t bytes = (size_t)(p.b * p.nprow) * (size_t)(p.b * p.npcol) * (size_t)sizeof(float);
 37 |     GPU_MEMCPY(p(0, 0), p(0, 0, 'd'), bytes, GPU_MEMCPY_DEVICE_TO_HOST);
 38 |     GPU_MEMCPY(testing_r, r, ldv*sizeof(double), GPU_MEMCPY_DEVICE_TO_HOST);
 39 |     printf("A from rank %d %d\n",grank,p.i1);
 40 | 	print_matrix(p(0, 0),p.lda, p.lda*p.lda);
 41 |     fflush(stdout);
 42 | 
 43 |     GPU_MEMCPY(testing_r, rhs, ldv*sizeof(double), GPU_MEMCPY_DEVICE_TO_HOST);
 44 |     printf("rhs from rank %d\n",grank);
 45 | 	print_matrix(testing_r,ldv/2,ldv/2);
 46 |     fflush(stdout);
 47 | 
 48 |     //trsvL_h(p, rhs, vw1_dev, ldv, grid);
 49 |     GPU_MEMCPY(testing_r, rhs, ldv*sizeof(double), GPU_MEMCPY_DEVICE_TO_HOST);
 50 |     printf("solve from rank %d\n",grank);
 51 | 	print_matrix(testing_r,ldv/2,ldv/2);
 52 |     fflush(stdout);
 53 |     return;
 54 | 
 55 |     
 56 | 
 57 |     if(my_init == 1 || my_init == 4)
 58 |     {
 59 |         //panel_copycolv_dev(p, rhs, x);
 60 |         copycolv_h(p, rhs, x);
 61 |         divcolv_h(p, mg.diag_dev, x);
 62 |         copycolv_h(p, rhs, r);
 63 |         colv2rowv_h(p, x, v );
 64 |         if(my_init == 1){
 65 |             panel_gemv_h(-1., p, mg, false, v, 1., r, grid );
 66 |         }
 67 |        
 68 |         normr = colv_infnorm_h(p, r, grid, vw1_dev);
 69 |         trsvU_h(p, r, vw1_dev, ldv, grid);
 70 |         //panel_trsvL(p, r, vw1_dev, vw2_dev, ldv, grid);
 71 |         /*GPU_MEMCPY(testing_r, r, ldv*sizeof(double), GPU_MEMCPY_DEVICE_TO_HOST);
 72 |         if(p.i1 == p.j1 && grank == 10){
 73 |     	    printf("MV from rank %d %f\n",grank, normr);
 74 | 		    print_matrix(testing_r,ldv/2,ldv/2);
 75 |             fflush(stdout);
 76 |         }*/
 77 |     }
 78 |     else
 79 |     {
 80 |         copycolv(p, rhs, x);
 81 |         divcolv(p, mg.diag, x);
 82 |         copycolv(p, rhs, r);
 83 |         colv2rowv(p, x, v);
 84 |         panel_gemv(my_init, -1., p, mg, false, v, 1., r, grid);
 85 |         normr = colv_infnorm(p, r, grid);
 86 |         if(p.i1 == p.j1 && grank == 10){
 87 |     	    printf("MV from rank %d %f\n",grank, normr);
 88 | 		   // print_matrix(r,ldv/2,ldv/2);
 89 |             fflush(stdout);
 90 |         }
 91 |     }
 92 | 
 93 |    
 94 | 
 95 | }
 96 | 
 97 | #if 1
 98 | template <typename FPanel, template <class> class Matgen>
 99 | IRErrors iterative_refinement2(int my_init, Panels<FPanel> const &p, Matgen<double> &mg,
100 |                               double *x, double *w, size_t ldv, double *rhs,
101 |                               double norma, double normb, int maxit,
102 |                               Grid &grid) {
103 |     // do IR with approximated LU factors in p and the accurate initial matrix
104 |     // which is generated by mg. x is solution. rhs is the right-hand-side
105 |     // vector. w is working vectors. ldv is the leading dimention of w. set good
106 |     // ldv for better performance. norma is inf-norm of the initial matrix.
107 |     // normb is the inf-norm of rhs.
108 |     int const nb = p.nblocks;
109 |     int const n = nb * p.b;
110 |     double *r = w;
111 |     double *v = w + ldv;
112 | 
113 |     //double tmisc=0, time_trsvL=0, time_trsvU=0, time_pgv = 0, begintr_time, endtr_time;
114 |     if(my_init != 1){
115 |         copycolv(p, rhs, x);
116 |         divcolv(p, mg.diag, x);        
117 |         for (int iter = 0; iter < maxit; ++iter) {
118 |             copycolv(p, rhs, r);
119 |             double normx = colv_infnorm(p, x, grid);
120 |             colv2rowv(p, x, v);
121 |             panel_gemv(my_init, -1., p, mg, false, v, 1., r, grid);
122 |             double normr = colv_infnorm(p, r, grid);
123 |             MPI_Barrier(MPI_COMM_WORLD);
124 |             double hplerror =
125 |                 normr / (norma * normx + normb) * 1. / (n * DBL_EPSILON / 2);
126 |             if (grid.row == 0 && grid.col == 0) {
127 |                 printf("# iterative refinement: step=%d, residual=%e, "
128 |                             "hpl-harness=%f\n",
129 |                             iter, normr, hplerror);
130 |             }
131 |             if (hplerror < 16.) {
132 |                 return {normr, hplerror};
133 |             }
134 |             fflush(stdout);
135 | 
136 |             // x_1 = x_0 + (LU)^{-1} r
137 |             panel_trsvL(p, r, v, ldv, grid);
138 |             panel_trsvU(p, r, v, ldv, grid);
139 |             addcolv(p, r, x);
140 |         }
141 |     }else{
142 |         copycolv_h(p, rhs, x);
143 |         divcolv_h(p, mg.diag_dev, x);
144 |         for (int iter = 0; iter < maxit; ++iter) {
145 |             copycolv_h(p, rhs, r);
146 |             double normx = colv_infnorm_h(p, x, grid,v);
147 |             colv2rowv_h(p, x, v);
148 |             panel_gemv_h(-1., p, mg, false, v, 1., r, grid);
149 |             
150 |             GPU_THREAD_SYNCHRONIZE(0); 
151 |             MPI_Barrier(MPI_COMM_WORLD);
152 |             
153 |             double normr = colv_infnorm_h(p, r, grid,v);
154 |             // hplerror := \|b-Ax\|_\infty / (\|A\|_\infty \|x\|_\infty +
155 |             // \|b\|_\infty) * (n * \epsilon)^{-1}
156 |             double hplerror =
157 |                 normr / (norma * normx + normb) * 1. / (n * DBL_EPSILON / 2);
158 |             if (grid.row == 0 && grid.col == 0) {
159 |                 printf("# iterative refinement: step=%d, residual=%e, "
160 |                             "hpl-harness=%f\n",
161 |                             iter, normr, hplerror);
162 |             }
163 |             if (hplerror < 16.) {
164 |                 return {normr, hplerror};
165 |             }
166 |             // x_1 = x_0 + (LU)^{-1} r
167 |             trsvL_h(p, r, v, ldv, grid);
168 |             trsvU_h(p, r, v, ldv, grid);
169 |             addcolv_h(p, r, x);
170 |         }
171 |     }
172 |     // OMG!
173 |     return {-1., -1.};
174 | }
175 | #endif
176 | 
177 | /*template<typename FPanel>
178 | IRErrors iterative_refinement(Panels<FPanel>const&p, HMGen<double>& mg,
179 |         double* x, double* w, size_t ldv, double* rhs, double norma, double
180 | normb, int maxit, Grid&grid)
181 | {
182 |         int const nb = p.nblocks;
183 |         int const n = nb * p.b;
184 |         double*r = w;
185 |         double*v = w + ldv;
186 |         // initial approximation
187 |         // trsv
188 |         copycolv(p, rhs, x);
189 |         panel_trsvL(p, x, v, ldv, grid);
190 |         panel_trsvU(p, x, v, ldv, grid);
191 | 
192 |         for(int iter=0; iter<maxit; ++iter){
193 |                 copycolv(p, rhs, r);
194 |                 double normx = colv_infnorm(p, x, grid);
195 |                 colv2rowv(p, x, v);
196 |                 // compute residual
197 |                 panel_gemv(-1., p, mg, false, v, 1., r, grid);
198 |                 double normr = colv_infnorm(p, r, grid);
199 |                 double hplerror = normr / (norma*normx + normb) * 1./(n *
200 | DBL_EPSILON/2); if(grid.row==0 && grid.col==0){ std::printf("# iterative
201 | refinement: step=%3d, residual=%20.16e hpl-harness=%f\n", iter, normr,
202 | hplerror); fflush(stdout);
203 |                 }
204 |                 if(hplerror < 16.) return {normr, hplerror};
205 | 
206 |                 panel_trsvL(p, r, v, ldv, grid);
207 |                 panel_trsvU(p, r, v, ldv, grid);
208 |                 addcolv(p, r, x);
209 |         }
210 |         // OMG!
211 |         return {-1., -1.};
212 | }*/
213 | 
214 | #endif
215 | 


--------------------------------------------------------------------------------
/src/hpl_rand.hpp:
--------------------------------------------------------------------------------
  1 | #ifndef HPL_RAND_HPP
  2 | #define HPL_RAND_HPP
  3 | 
  4 | #include <math.h>
  5 | #include <stdint.h>
  6 | #include <stdlib.h>
  7 | 
  8 | #include "device_macros.h"
  9 | 
 10 | extern int grank;
 11 | 
 12 | struct RandCoeff {
 13 |     uint64_t a;
 14 |     uint64_t c;
 15 | 
 16 |     static RandCoeff default_vals() { return {6364136223846793005, 1}; }
 17 | 
 18 |     __host__ __device__ RandCoeff operator*(const RandCoeff &rhs) const {
 19 |         return {a * rhs.a, a * rhs.c + c};
 20 |     }
 21 |     __host__ __device__ RandCoeff pow_fugaku(const uint64_t n) const {
 22 |         if(n==0) return RandCoeff{1, 0};
 23 |         RandCoeff tmp = pow(n / 2);
 24 |         tmp = tmp * tmp;
 25 |         if (n % 2) {
 26 |             return tmp * (*this);
 27 |         } else {
 28 |             return tmp;
 29 |         }
 30 |     }
 31 |     __host__ __device__ RandCoeff pow(const uint64_t n) const {
 32 |         uint64_t exponent = n;
 33 |         RandCoeff tmp = *(this);
 34 |         RandCoeff result = RandCoeff{1, 0};
 35 |   
 36 |         if (exponent == 0) return result;
 37 |         if(exponent & 1) result = result * tmp;
 38 | 
 39 |         exponent= exponent>> 1;
 40 |         while (exponent > 0) {
 41 |             tmp = tmp * tmp;
 42 |             if(exponent & 1)
 43 |                 result = result * tmp;
 44 | 
 45 |             exponent= exponent>> 1;
 46 |         }
 47 |         return result;
 48 |     }
 49 | };
 50 | 
 51 | struct RandStat {
 52 |     uint64_t x;
 53 | 
 54 |     __host__ __device__ static RandStat initialize(uint64_t seed,
 55 |                                RandCoeff coef = RandCoeff::default_vals()) {
 56 |         return coef * RandStat{seed};
 57 |     }
 58 | 
 59 |     __host__ __device__ friend RandStat inline operator*(RandCoeff coef, RandStat stat) {
 60 |         return {coef.a * stat.x + coef.c};
 61 |     }
 62 | 
 63 |     // returns [-0.5:0.5]
 64 |     __host__ __device__ inline operator double() const {
 65 |         return static_cast<int64_t>(x) * 0x1.fffffffffffffP-65;
 66 |       	//24x24
 67 | 		// return static_cast<int64_t>(x) * 0x1.fffffffffffffP-73;
 68 |     	// 96x 96
 69 | 		//return static_cast<int64_t>(x) * 0x1.fffffffffffffP-75;
 70 | 		// 162x 162
 71 | 		// return static_cast<int64_t>(x) * 0x1.fffffffffffffP-76; //maybe 75 will work, faster convergence
 72 | 
 73 | 	}
 74 |     __host__ __device__ operator float() const {
 75 |         float tmp = static_cast<double>(*this);
 76 |         return tmp;
 77 |     }
 78 | };
 79 | 
 80 | // fill subumat (i0:nrow-1,  j0:ncol-1) of fullmat (0:nrow-1, 0:ncol-1)
 81 | template<typename F>
 82 | static void panel_fill_one_with_rand(
 83 |         int const       n,
 84 |         int const       i0,
 85 |         int const       j0,
 86 |         int const       nrow,
 87 |         int const       ncol,
 88 |         F               *a,
 89 |         size_t const    lda,
 90 |         uint64_t const  seed,
 91 |         bool const      calc_diag = true)
 92 | {
 93 |     RandStat stat_00 = RandStat::initialize(seed);
 94 | 
 95 |     RandCoeff inc1 = RandCoeff::default_vals();
 96 |     RandCoeff jump_one_col = inc1.pow(n);
 97 |     RandCoeff jump_ij = inc1.pow(i0 + n * static_cast<uint64_t>(j0));
 98 | 
 99 |     RandStat stat_ij = jump_ij * stat_00;
100 | 
101 |     RandStat at_0j = stat_ij;
102 |     for(int j=0; j<ncol; j++){
103 |         RandStat at_ij = at_0j;
104 |         for(int i=0; i<nrow; i++){
105 |             double t = static_cast<double>(at_ij);
106 |             a[j*lda + i] = static_cast<F>(t);
107 |             at_ij = inc1 * at_ij;
108 |         }
109 |         at_0j = jump_one_col * at_0j;
110 |     }
111 | 
112 |     if (calc_diag && (i0 == j0) && (nrow==ncol)){
113 |         RandCoeff jump_i0 = inc1.pow(i0);
114 | 
115 |         RandStat stat_i0 = jump_i0 * stat_00;
116 |         for(int i=0; i<(nrow<ncol?nrow:ncol); i++){
117 |             RandStat stat_ij = stat_i0;
118 |             double sum = 0.0;
119 |             for(int j=0; j<n; j++){
120 |                 if(i0+i!=j) 
121 |                     sum += fabs(double(stat_ij));
122 |                 stat_ij = jump_one_col * stat_ij;
123 |             }
124 | 
125 |             a[lda*i + i] = static_cast<F>(sum);
126 |             stat_i0 = inc1 * stat_i0;
127 |         }
128 |     }
129 | }
130 | 
131 | template <typename F>
132 | static void fill_one_panel_with_rand2(int const n, int const i0, int const j0,
133 |                                      int const nrow, int const ncol, F *a,
134 |                                      size_t const lda, uint64_t const seed,
135 |                                      double* localSum) {
136 |     RandStat stat_00 = RandStat::initialize(seed);
137 | 
138 |     RandCoeff inc1 = RandCoeff::default_vals();
139 |     RandCoeff jump_one_col = inc1.pow(n);
140 |     RandCoeff jump_ij = inc1.pow(i0 + n * static_cast<uint64_t>(j0));
141 | 
142 |     RandStat stat_ij = jump_ij * stat_00;
143 | 
144 |     RandStat at_0j = stat_ij;
145 |     long max_idx = 0;
146 |     for (int j = 0; j < ncol; j++) {
147 |         RandStat at_ij = at_0j;
148 | 		for (int i = 0; i < nrow; i++) {
149 |                    double t = static_cast<double>(at_ij);
150 | 		   if ( ( i0 + i ) != ( j0 + j ) ) localSum[ i0+i ] += fabs( double( t ) );
151 | if ( ( j * lda + i ) > max_idx ) max_idx = j * lda + i;
152 |             a[j * lda + i] = static_cast<F> ( t );
153 |             at_ij = inc1 * at_ij;
154 |         }
155 |         at_0j = jump_one_col * at_0j;
156 |     }
157 | }
158 | // fill subumat (i0:nrow-1,  j0:ncol-1) of fullmat (0:nrow-1, 0:ncol-1)
159 | template <typename F>
160 | static void fill_one_panel_with_rand(int const n, int const i0, int const j0,
161 |                                      int const nrow, int const ncol, F *a,
162 |                                      size_t const lda, uint64_t const seed,
163 |                                      bool const calc_diag = true) {
164 |     RandStat stat_00 = RandStat::initialize(seed);
165 | 
166 |     RandCoeff inc1 = RandCoeff::default_vals();
167 |     RandCoeff jump_one_col = inc1.pow(n);
168 |     RandCoeff jump_ij = inc1.pow(i0 + n * static_cast<uint64_t>(j0));
169 | 
170 |     RandStat stat_ij = jump_ij * stat_00;
171 | 
172 |     RandStat at_0j = stat_ij;
173 |     for (int j = 0; j < ncol; j++) {
174 |         RandStat at_ij = at_0j;
175 |         for (int i = 0; i < nrow; i++) {
176 |             double t = static_cast<double>(at_ij);
177 |             a[j * lda + i] = static_cast<F>( t );
178 |             at_ij = inc1 * at_ij;
179 |         }
180 |         at_0j = jump_one_col * at_0j;
181 |     }
182 |     if (calc_diag && (i0 == j0) && (nrow == ncol)) {
183 |         RandCoeff jump_i0 = inc1.pow(i0);
184 | 
185 |         RandStat stat_i0 = jump_i0 * stat_00;
186 |         for (int i = 0; i < (nrow < ncol ? nrow : ncol); i++) {
187 |             RandStat stat_ij = stat_i0;
188 |             double sum = 0.0;
189 |             for (int j = 0; j < n; j++) {
190 |                 if (i0 + i != j)
191 |                     sum += fabs(double(stat_ij));
192 |                 stat_ij = jump_one_col * stat_ij;
193 |             }
194 |             a[lda * i + i] = static_cast<F>( sum );
195 |             stat_i0 = inc1 * stat_i0;
196 |         }
197 |     }
198 | }
199 | 
200 | static inline double calc_diag(int const i, int const n, uint64_t const seed) {
201 |     RandStat stat_00 = RandStat::initialize(seed);
202 |     RandCoeff inc1 = RandCoeff::default_vals();
203 |     RandCoeff jump_one_col = inc1.pow(n);
204 |     RandCoeff jump_i0 = inc1.pow(i);
205 |     RandStat stat_ij = jump_i0 * stat_00;
206 | 
207 |     double sum = 0.0;
208 |     for (int j = 0; j < n; j++) {
209 |         if (i != j)
210 |             sum += fabs(double(stat_ij));
211 |         stat_ij = jump_one_col * stat_ij;
212 |     }
213 |     return sum;
214 | }
215 | 
216 | // debug
217 | static inline double mat_elem(int n, int i, int j, int seed) {
218 |     RandStat stat_00 = RandStat::initialize(seed);
219 |     RandCoeff inc1 = RandCoeff::default_vals();
220 |     RandCoeff jump_ij = inc1.pow(i + static_cast<uint64_t>(n) * j);
221 |     return double(jump_ij * stat_00);
222 | }
223 | 
224 | template <typename F> struct Matgen {
225 |     uint64_t seed;
226 |     int n;
227 |     F const *diag;
228 |     F const *diag_dev;
229 |     
230 |     RandCoeff incl1, jumpn, jumpi, jumpj;
231 | 
232 |     enum { NUM_POWERS = 16 };
233 |     RandCoeff powers[NUM_POWERS];
234 | 
235 |     double scalea, scaleb;
236 | 
237 |     Matgen(uint64_t seed, int n, int iskip, int jskip, F const *diag)
238 |         : seed(seed), n(n), diag(diag) {
239 |         incl1 = RandCoeff::default_vals();
240 |         jumpn = incl1.pow(n);
241 |         jumpi = incl1.pow(iskip);
242 |         jumpj = incl1.pow(n * static_cast<uint64_t>(jskip));
243 |         for (int i = 0; i < NUM_POWERS; i++) {
244 |             powers[i] = incl1.pow(i);
245 |         }
246 |         scalea = sqrt(n * sqrt(n));
247 |         scaleb = 1;
248 |     }
249 |     RandCoeff jump(int i, int j) const {
250 |         return incl1.pow(i + n * static_cast<uint64_t>(j));
251 |     }
252 |     RandCoeff jump(uint64_t i, uint64_t j) const {
253 |         return incl1.pow(i + n * static_cast<uint64_t>(j));
254 |     }
255 | };
256 | 
257 | #endif
258 | 


--------------------------------------------------------------------------------
/src/gpu_init_kernels.cpp:
--------------------------------------------------------------------------------
  1 | 
  2 | #include "gpu_init_kernels.h"
  3 | 
  4 | using namespace std;
  5 | 
  6 | 
  7 | 
  8 | #if 0
  9 | // Generate random entries of matrix A in single precision
 10 | void fill_random(float* A, long m, long n, int n_threads, int blocksize_x, int work_per_thread)
 11 | {
 12 |     GPURAND_GENERATOR_T generator;
 13 | 
 14 |     GPURAND_CREATE_GENERATOR(&generator, GPURAND_RNG_PSEUDO_DEFAULT);
 15 |     GPURAND_SET_PSEUDO_RANDOM_GENERATOR_SEED(generator, (int)time(NULL));
 16 |     GPURAND_GENERATE_UNIFORM(generator, A, (size_t) m *(size_t)n);
 17 | 	// Push entries to (-0.5, 0.5) range
 18 |     int blocksize_y = n_threads / blocksize_x;
 19 |     dim3 thread_dims(blocksize_x, blocksize_y, 1);
 20 |     int block_dim_x = ceil((float)m / blocksize_x), block_dim_y = ceil((float)n / (work_per_thread * blocksize_y));
 21 |     dim3 block_dims(block_dim_x, block_dim_y, 1);
 22 | 
 23 | #ifdef CUDA_OLCF_PLATFORM
 24 |     minus_05<<<block_dims, thread_dims>>>(A, m, n, work_per_thread);
 25 | #elif defined(ROCM_OLCF_PLATFORM)
 26 |     hipLaunchKernelGGL(minus_05, block_dims, thread_dims, 0, 0,
 27 |             A, m, n, work_per_thread);
 28 | #else
 29 |     throw std::runtime_error("Error with build platform, neither CUDA nor ROCM. See CMake output.")
 30 | #endif
 31 | 
 32 |     GPURAND_DESTROY_GENERATOR(generator);
 33 | }
 34 | 
 35 | // Generate random entries of vector v in double precision
 36 | void fill_random(double* v, long m, int n_threads)
 37 | {
 38 |     GPURAND_GENERATOR_T generator;
 39 | 
 40 |     GPURAND_CREATE_GENERATOR(&generator, GPURAND_RNG_PSEUDO_DEFAULT);
 41 |     GPURAND_SET_PSEUDO_RANDOM_GENERATOR_SEED(generator, (int)time(NULL));
 42 |     GPURAND_GENERATE_UNIFORM_DOUBLE(generator, v, m);
 43 | 	// Push entries to (-0.5, 0.5) range
 44 |     dim3 block_dims(ceil((float) m / n_threads), 1, 1);
 45 |     dim3 thread_dims(n_threads, 1, 1);
 46 |     
 47 | #ifdef CUDA_OLCF_PLATFORM
 48 |     minus_05<<<block_dims, thread_dims>>>(v, m, 1, 1);
 49 | #elif defined(ROCM_OLCF_PLATFORM)
 50 |     hipLaunchKernelGGL(minus_05, block_dims, thread_dims, 0, 0, v, m, 1, 1);
 51 | #else
 52 |     throw std::runtime_error("Error with build platform, neither CUDA nor ROCM. See CMake output.")
 53 | #endif
 54 | 
 55 |     GPURAND_DESTROY_GENERATOR(generator);
 56 | }
 57 | 
 58 | // Generate random entries of matrix A using Fugaku's logic
 59 | __host__  void fill_random_fugaku(uint64_t N, RandStat stat0, RandCoeff inc1, float *A, double *row_sums, long m, long n, long i1, long j1, long b, long istride, long jstride, int n_threads, int blocksize_x, int work_per_thread)
 60 | {
 61 |     int blocksize_y = n_threads / blocksize_x;
 62 |     dim3 thread_dims(blocksize_x, blocksize_y, 1);
 63 |     int matrix_dim_x = ceil((float)m / blocksize_x), matrix_dim_y = ceil((float)n / (work_per_thread * blocksize_y));
 64 |     dim3 matrix_dims(matrix_dim_x, matrix_dim_y, 1);
 65 | 
 66 | #ifdef CUDA_OLCF_PLATFORM
 67 |     fill_random_fugaku_d<<<matrix_dims, thread_dims>>>(N, stat0, inc1, A, row_sums, m, n, i1, j1, b, istride, jstride, work_per_thread);
 68 | #elif defined(ROCM_OLCF_PLATFORM)
 69 |     hipLaunchKernelGGL(fill_random_fugaku_d, matrix_dims, thread_dims, 0, 0,
 70 |             N, stat0, inc1, A, row_sums, m, n, i1, j1, b, istride, jstride, work_per_thread);
 71 | #else
 72 |     throw std::runtime_error("Error with build platform, neither CUDA nor ROCM. See CMake output.")
 73 | #endif
 74 | 
 75 | }
 76 | 
 77 | template <typename F> __global__ void fill_random_fugaku_d(uint64_t N, RandStat stat0, RandCoeff inc1, F *A, double *row_sums, long m, long n, long i1, long j1, long b, long istride, long jstride, int work_per_thread)
 78 | {
 79 | 	int idx_x = GPU_BLOCKIDX_X * GPU_BLOCKDIM_X + GPU_THREADIDX_X;
 80 | 	int idx_y = GPU_BLOCKIDX_Y * GPU_BLOCKDIM_Y + GPU_THREADIDX_Y;
 81 |     const uint64_t A_row = idx_x, global_row = (i1 + (A_row / (uint64_t)b) * istride) * (uint64_t)b + (A_row % (uint64_t)b);
 82 |     uint64_t A_col, global_col, A_idx, global_idx;
 83 |     double a_ij, row_sum_increase = 0;
 84 | 
 85 |     if (A_row < m)
 86 |     {
 87 |         for(int i = 0 ; i < work_per_thread; i++)
 88 |         {
 89 |             A_col = work_per_thread * idx_y + i;
 90 |             global_col = (j1 + (A_col / (uint64_t)b) * jstride) * (uint64_t)b + (A_col % (uint64_t)b);
 91 |             if (A_col < n)
 92 |             {
 93 |                 A_idx = A_row + A_col * (uint64_t)m;
 94 |                 global_idx = global_col * N + global_row;
 95 |                 a_ij = static_cast<double>(inc1.pow(global_idx) * stat0);
 96 |                 A[A_idx] = static_cast<F>(a_ij);
 97 |                 row_sum_increase += fabs(a_ij);
 98 |             }
 99 |         }
100 |         atomicAdd(&row_sums[global_row], row_sum_increase);
101 |     }
102 | }
103 | #endif
104 | 
105 | 
106 | __host__ void compute_row_sums(float *A, double *row_sums, long m, long n, long i1, long j1, long b, long istride, long jstride, int n_threads, int blocksize_x, int work_per_thread)
107 | {
108 |     int blocksize_y = n_threads / blocksize_x;
109 |     dim3 thread_dims(blocksize_x, blocksize_y, 1);
110 |     int block_dim_x = ceil((float)m / blocksize_x), block_dim_y = ceil((float)n / (work_per_thread * blocksize_y));
111 |     dim3 block_dims(block_dim_x, block_dim_y, 1);
112 | 
113 | #ifdef CUDA_OLCF_PLATFORM
114 |     compute_row_sums_d<<<block_dims, thread_dims>>>(A, row_sums, m, n, i1, j1, b, istride, jstride, work_per_thread);
115 | #elif defined(ROCM_OLCF_PLATFORM)
116 |     hipLaunchKernelGGL(compute_row_sums_d, block_dims, thread_dims, 0, 0,
117 |             A, row_sums, m, n, i1, j1, b, istride, jstride, work_per_thread);
118 | #else
119 |     throw std::runtime_error("Error with build platform, neither CUDA nor ROCM. See CMake output.")
120 | #endif
121 | }
122 | 
123 | __global__ void compute_row_sums_d(float *A, double *row_sums, long m, long n, long i1, long j1, long b, long istride, long jstride, int work_per_thread)
124 | {
125 |     int idx_x = GPU_BLOCKIDX_X * GPU_BLOCKDIM_X + GPU_THREADIDX_X;
126 |     int idx_y = GPU_BLOCKIDX_Y * GPU_BLOCKDIM_Y + GPU_THREADIDX_Y;
127 |     const size_t A_row = idx_x, global_row_block = i1 + (A_row / b) * istride, global_row = global_row_block * b + (A_row % b);
128 |     size_t A_col;
129 |     double row_sum_increase = 0;
130 | 
131 |     if (A_row < m)
132 |     {
133 |         for(int i = 0 ; i < work_per_thread; i++)
134 |         {
135 |             A_col = work_per_thread * idx_y + i;
136 |             if (A_col < n)
137 |                 row_sum_increase += fabs(A[A_row + A_col * m]);
138 | 
139 |         }
140 |         atomicAdd(&row_sums[global_row], row_sum_increase);
141 |     }
142 | 
143 | }
144 | 
145 | __host__ void fill_diag_rhs(int my_init, uint64_t N, RandStat stat0, RandCoeff inc1, float *A, double *row_sums_d, double *rhs, double *rhs_d, long m, long *diag_i_steps, long *diag_j_steps, long n_diag_blocks, long i1, long b, long istride, int n_threads)
146 | {
147 |     int n_diag_entries = n_diag_blocks * b;
148 |     dim3 block_dims(ceil((float) n_diag_entries / n_threads), 1, 1);
149 |     dim3 thread_dims(n_threads, 1, 1);
150 | 
151 | #ifdef CUDA_OLCF_PLATFORM
152 |     fill_diag_rhs_d<<<block_dims, thread_dims>>>(my_init, N, stat0, inc1, A, row_sums_d, rhs, rhs_d, m, diag_i_steps, diag_j_steps, n_diag_entries, i1, b, istride);
153 | #elif defined(ROCM_OLCF_PLATFORM)
154 |     hipLaunchKernelGGL(fill_diag_rhs_d, block_dims, thread_dims, 0, 0,
155 |             my_init, N, stat0, inc1, A, row_sums_d, rhs, rhs_d, m, diag_i_steps, diag_j_steps, n_diag_entries, i1, b, istride);
156 | #else
157 |     throw std::runtime_error("Error with build platform, neither CUDA nor ROCM. See CMake output.")
158 | #endif
159 | }
160 | 
161 | 
162 | __global__ void fill_diag_rhs_d(int my_init, uint64_t N, RandStat stat0, RandCoeff inc1, float *A, double *row_sums_d, double *rhs, double *rhs_d, long m, long *diag_i_steps, long *diag_j_steps, long n_diag_entries, long i1, long b, long istride)
163 | {
164 |     int idx = GPU_BLOCKIDX_X * GPU_BLOCKDIM_X + GPU_THREADIDX_X;
165 |     
166 |     if (idx < n_diag_entries)
167 |     {
168 |         size_t diag_block_idx = idx / b, offset_in_block = idx % b;
169 |         size_t diag_i_step = diag_i_steps[diag_block_idx], diag_j_step = diag_j_steps[diag_block_idx];
170 |         size_t A_idx = (diag_j_step * m + diag_i_step) * b + offset_in_block * (m + 1);
171 |         size_t global_diag_idx = (i1 + diag_i_step * istride) * b + offset_in_block;
172 |         
173 |         row_sums_d[global_diag_idx] -= A[A_idx];
174 |         A[A_idx] = row_sums_d[global_diag_idx];
175 |         if (my_init == 3)
176 |         {
177 |             size_t local_diag_idx = diag_i_step * b + offset_in_block;
178 |             rhs_d[local_diag_idx] = static_cast<double>(inc1.pow(N*N + global_diag_idx) * stat0);
179 |         }
180 |     }
181 | }
182 | 
183 | template <typename F> __global__ void minus_05(F *A, long m, long n, int work_per_thread)
184 | {
185 | 	int idx_x = GPU_BLOCKIDX_X * GPU_BLOCKDIM_X + GPU_THREADIDX_X;
186 | 	int idx_y = GPU_BLOCKIDX_Y * GPU_BLOCKDIM_Y + GPU_THREADIDX_Y;
187 |     const uint64_t A_row = idx_x;
188 |     uint64_t A_col = work_per_thread * idx_y, A_idx;
189 |     
190 |     if (A_row < m)
191 |     {
192 |         for(int i = 0 ; i < work_per_thread; i++)
193 |         {
194 |             A_col++;
195 |             if (A_col < n)
196 |             {
197 |                 A_idx = A_row + A_col * (uint64_t)m;
198 |                 A[A_idx] -= 0.5;
199 |             }
200 |         }
201 |     }
202 | }
203 | 
204 | 


--------------------------------------------------------------------------------
/src/svesim.hpp:
--------------------------------------------------------------------------------
  1 | #ifndef SVESIM_HPP
  2 | #define SVESIM_HPP
  3 | 
  4 | // ARM sve wrapper.
  5 | // Reimpelment with SSE, AVX, or sometihng if performance is important.
  6 | // (basically, we do not use this wrapper in time-consuming parts.)
  7 | // Add functions if needed.
  8 | 
  9 | #ifdef __ARM_FEATURE_SVE
 10 | #include <arm_sve.h>
 11 | #else
 12 | #include "fp16sim.hpp"
 13 | #include <cstdint>
 14 | #include <math.h>
 15 | #define SVE_VLEN 64
 16 | #define svcntd() (SVE_VLEN / 8)
 17 | #define svcntw() (SVE_VLEN / 4)
 18 | #define svcnth() (SVE_VLEN / 2)
 19 | struct svbool_t {
 20 |     bool x[SVE_VLEN];
 21 | };
 22 | static svbool_t svptrue_b64() {
 23 |     svbool_t r;
 24 |     for (int i = 0; i < SVE_VLEN / 8; ++i)
 25 |         r.x[8 * i] = true;
 26 |     return r;
 27 | }
 28 | static svbool_t svwhilelt_b64(int64_t begin, int64_t end) {
 29 |     svbool_t r;
 30 |     for (int i = 0; i < SVE_VLEN / 8; ++i)
 31 |         r.x[8 * i] = i + begin < end;
 32 |     return r;
 33 | }
 34 | static svbool_t svptrue_b32() {
 35 |     svbool_t r;
 36 |     for (int i = 0; i < SVE_VLEN / 4; ++i)
 37 |         r.x[4 * i] = true;
 38 |     return r;
 39 | }
 40 | static svbool_t svwhilelt_b32(int64_t begin, int64_t end) {
 41 |     svbool_t r;
 42 |     for (int i = 0; i < SVE_VLEN / 4; ++i)
 43 |         r.x[4 * i] = i + begin < end;
 44 |     return r;
 45 | }
 46 | static svbool_t svptrue_b16() {
 47 |     svbool_t r;
 48 |     for (int i = 0; i < SVE_VLEN / 2; ++i)
 49 |         r.x[2 * i] = true;
 50 |     return r;
 51 | }
 52 | static svbool_t svwhilelt_b16(int64_t begin, int64_t end) {
 53 |     svbool_t r;
 54 |     for (int i = 0; i < SVE_VLEN / 2; ++i)
 55 |         r.x[2 * i] = i + begin < end;
 56 |     return r;
 57 | }
 58 | 
 59 | struct svint64_t {
 60 |     int64_t x[SVE_VLEN / 8];
 61 | };
 62 | static svint64_t svdup_s64(int64_t x) {
 63 |     svint64_t r;
 64 |     for (int i = 0; i < SVE_VLEN / 8; ++i)
 65 |         r.x[i] = x;
 66 |     return r;
 67 | }
 68 | static svint64_t svld1_s64(svbool_t t, int64_t const *x) {
 69 |     svint64_t r;
 70 |     for (int i = 0; i < SVE_VLEN / 8; ++i)
 71 |         r.x[i] = (t.x[8 * i] ? x[i] : 0ll);
 72 |     return r;
 73 | }
 74 | static svint64_t svmad_s64_x(svbool_t t, svint64_t a, svint64_t b,
 75 |                              svint64_t c) {
 76 |     svint64_t r;
 77 |     for (int i = 0; i < SVE_VLEN / 8; ++i)
 78 |         r.x[i] = (t.x[8 * i] ? a.x[i] * b.x[i] + c.x[i] : a.x[i]);
 79 |     return r;
 80 | }
 81 | static svint64_t svindex_s64(int64_t base, int64_t step) {
 82 |     svint64_t r;
 83 |     for (int i = 0; i < SVE_VLEN / 8; ++i)
 84 |         r.x[i] = base + i * step;
 85 |     return r;
 86 | }
 87 | struct svfloat64_t {
 88 |     double x[SVE_VLEN / 8];
 89 | };
 90 | static svfloat64_t svdup_f64(double x) {
 91 |     svfloat64_t r;
 92 |     for (int i = 0; i < SVE_VLEN / 8; ++i)
 93 |         r.x[i] = x;
 94 |     return r;
 95 | }
 96 | static svfloat64_t svld1_vnum_f64(svbool_t t, double const *x, int vnum) {
 97 |     svfloat64_t r;
 98 |     for (int i = 0; i < SVE_VLEN / 8; ++i)
 99 |         r.x[i] = (t.x[8 * i] ? x[vnum * SVE_VLEN / 8 + i] : 0.);
100 |     return r;
101 | }
102 | static void svst1_vnum_f64(svbool_t t, double *x, int vnum, svfloat64_t r) {
103 |     for (int i = 0; i < SVE_VLEN / 8; ++i) {
104 |         if (t.x[8 * i])
105 |             x[vnum * SVE_VLEN / 8 + i] = r.x[i];
106 |     }
107 | }
108 | static svfloat64_t svadd_f64_x(svbool_t t, svfloat64_t a, svfloat64_t b) {
109 |     svfloat64_t r;
110 |     for (int i = 0; i < SVE_VLEN / 8; ++i)
111 |         r.x[i] = (t.x[8 * i] ? a.x[i] + b.x[i] : a.x[i]);
112 |     return r;
113 | }
114 | static svfloat64_t svmla_f64_x(svbool_t t, svfloat64_t a, svfloat64_t b,
115 |                                svfloat64_t c) {
116 |     svfloat64_t r;
117 |     for (int i = 0; i < SVE_VLEN / 8; ++i)
118 |         r.x[i] = (t.x[8 * i] ? a.x[i] + b.x[i] * c.x[i] : a.x[i]);
119 |     return r;
120 | }
121 | static svfloat64_t svmla_n_f64_x(svbool_t t, svfloat64_t a, svfloat64_t b,
122 |                                  float c) {
123 |     svfloat64_t r;
124 |     for (int i = 0; i < SVE_VLEN / 8; ++i)
125 |         r.x[i] = (t.x[8 * i] ? a.x[i] + b.x[i] * c : a.x[i]);
126 |     return r;
127 | }
128 | static svfloat64_t svcvt_f64_s64_x(svbool_t t, svint64_t x) {
129 |     svfloat64_t r;
130 |     for (int i = 0; i < SVE_VLEN / 8; ++i)
131 |         r.x[i] = (t.x[8 * i] ? x.x[i] : 0.);
132 |     return r;
133 | }
134 | struct svfloat32_t {
135 |     float x[SVE_VLEN / 4];
136 | };
137 | static svfloat32_t svdup_f32(float x) {
138 |     svfloat32_t r;
139 |     for (int i = 0; i < SVE_VLEN / 4; ++i)
140 |         r.x[i] = x;
141 |     return r;
142 | }
143 | static svfloat32_t svld1_vnum_f32(svbool_t t, float const *x, int vnum) {
144 |     svfloat32_t r;
145 |     for (int i = 0; i < SVE_VLEN / 4; ++i)
146 |         r.x[i] = (t.x[4 * i] ? x[vnum * SVE_VLEN / 4 + i] : 0.f);
147 |     return r;
148 | }
149 | static void svst1_vnum_f32(svbool_t t, float *x, int vnum, svfloat32_t r) {
150 |     for (int i = 0; i < SVE_VLEN / 4; ++i) {
151 |         if (t.x[4 * i])
152 |             x[vnum * SVE_VLEN / 4 + i] = r.x[i];
153 |     }
154 | }
155 | static svfloat32_t svadd_f32_x(svbool_t t, svfloat32_t a, svfloat32_t b) {
156 |     svfloat32_t r;
157 |     for (int i = 0; i < SVE_VLEN / 4; ++i)
158 |         r.x[i] = (t.x[4 * i] ? a.x[i] + b.x[i] : a.x[i]);
159 |     return r;
160 | }
161 | static svfloat32_t svmla_f32_x(svbool_t t, svfloat32_t a, svfloat32_t b,
162 |                                svfloat32_t c) {
163 |     svfloat32_t r;
164 |     for (int i = 0; i < SVE_VLEN / 4; ++i)
165 |         r.x[i] = (t.x[4 * i] ? a.x[i] + b.x[i] * c.x[i] : a.x[i]);
166 |     return r;
167 | }
168 | static svfloat32_t svmls_f32_x(svbool_t t, svfloat32_t a, svfloat32_t b,
169 |                                svfloat32_t c) {
170 |     svfloat32_t r;
171 |     for (int i = 0; i < SVE_VLEN / 4; ++i)
172 |         r.x[i] = (t.x[4 * i] ? a.x[i] - b.x[i] * c.x[i] : a.x[i]);
173 |     return r;
174 | }
175 | 
176 | static svfloat32_t svnmls_f32_x(svbool_t t, svfloat32_t a, svfloat32_t b,
177 |                                 svfloat32_t c) {
178 |     svfloat32_t r;
179 |     for (int i = 0; i < SVE_VLEN / 4; ++i)
180 |         r.x[i] = (t.x[4 * i] ? b.x[i] * c.x[i] - a.x[i] : a.x[i]);
181 |     return r;
182 | }
183 | 
184 | struct svfloat16_t {
185 |     fp16 x[SVE_VLEN / 2];
186 | };
187 | struct svint16_t {
188 |     short x[SVE_VLEN / 2];
189 | };
190 | 
191 | 
192 | //JAE Edit for Clang/LLVM
193 | static svfloat16_t svdup_f16(fp16 *x) {
194 |     svfloat16_t r;
195 |     for (int i = 0; i < SVE_VLEN / 2; ++i)
196 |         r.x[i] = *x;
197 |     return r;
198 | }
199 | 
200 | 
201 | static svfloat16_t svld1_vnum_f16(svbool_t t, fp16 const *x, int vnum) {
202 |     svfloat16_t r;
203 |     for (int i = 0; i < SVE_VLEN / 2; ++i)
204 |         r.x[i] = (t.x[2 * i] ? x[vnum * SVE_VLEN / 2 + i] : fp16(0.f));
205 |     return r;
206 | }
207 | static void svst1_vnum_f16(svbool_t t, fp16 *x, int vnum, svfloat16_t r) {
208 |     for (int i = 0; i < SVE_VLEN / 2; ++i) {
209 |         if (t.x[2 * i])
210 |             x[vnum * SVE_VLEN / 2 + i] = r.x[i];
211 |     }
212 | }
213 | static svfloat16_t svadd_f16_x(svbool_t t, svfloat16_t a, svfloat16_t b) {
214 |     svfloat16_t r;
215 |     for (int i = 0; i < SVE_VLEN / 2; ++i)
216 |         r.x[i] = (t.x[2 * i] ? a.x[i] + b.x[i] : a.x[i]);
217 |     return r;
218 | }
219 | static svfloat16_t svsub_f16_x(svbool_t t, svfloat16_t a, svfloat16_t b) {
220 |     svfloat16_t r;
221 |     for (int i = 0; i < SVE_VLEN / 2; ++i)
222 |         r.x[i] = (t.x[2 * i] ? a.x[i] - b.x[i] : a.x[i]);
223 |     return r;
224 | }
225 | static svfloat16_t svmul_f16_x(svbool_t t, svfloat16_t a, svfloat16_t b) {
226 |     svfloat16_t r;
227 |     for (int i = 0; i < SVE_VLEN / 2; ++i)
228 |         r.x[i] = (t.x[2 * i] ? a.x[i] * b.x[i] : a.x[i]);
229 |     return r;
230 | }
231 | static svfloat16_t svmla_f16_x(svbool_t t, svfloat16_t a, svfloat16_t b,
232 |                                svfloat16_t c) {
233 |     svfloat16_t r;
234 |     for (int i = 0; i < SVE_VLEN / 2; ++i)
235 |         r.x[i] = (t.x[2 * i] ? a.x[i] + b.x[i] * c.x[i] : a.x[i]);
236 |     return r;
237 | }
238 | static svfloat16_t svmls_f16_x(svbool_t t, svfloat16_t a, svfloat16_t b,
239 |                                svfloat16_t c) {
240 |     svfloat16_t r;
241 |     for (int i = 0; i < SVE_VLEN / 2; ++i)
242 |         r.x[i] = (t.x[2 * i] ? a.x[i] - b.x[i] * c.x[i] : a.x[i]);
243 |     return r;
244 | }
245 | static svfloat16_t svnmls_f16_x(svbool_t t, svfloat16_t a, svfloat16_t b,
246 |                                 svfloat16_t c) {
247 |     svfloat16_t r;
248 |     for (int i = 0; i < SVE_VLEN / 2; ++i)
249 |         r.x[i] = (t.x[2 * i] ? b.x[i] * c.x[i] - a.x[i] : a.x[i]);
250 |     return r;
251 | }
252 | static svfloat16_t svrintn_f16_x(svbool_t t, svfloat16_t a) {
253 |     svfloat16_t r;
254 |     for (int i = 0; i < SVE_VLEN / 2; ++i)
255 |         r.x[i] = (t.x[2 * i] ? fp16(roundf((float)a.x[i])) : a.x[i]);
256 |     return r;
257 | }
258 | static svint16_t svcvt_s16_f16_x(svbool_t t, svfloat16_t a) {
259 |     svint16_t r;
260 |     for (int i = 0; i < SVE_VLEN / 2; ++i)
261 |         r.x[i] = (t.x[2 * i] ? (short)(float)a.x[i] : (short)0);
262 |     return r;
263 | }
264 | 
265 | static svint16_t svld1_vnum_s16(svbool_t t, short const *x, int vnum) {
266 |     svint16_t r;
267 |     for (int i = 0; i < SVE_VLEN / 2; ++i)
268 |         r.x[i] = (t.x[2 * i] ? x[vnum * SVE_VLEN / 2 + i] : (short)0);
269 |     return r;
270 | }
271 | static void svst1_vnum_s16(svbool_t t, short *x, int vnum, svint16_t r) {
272 |     for (int i = 0; i < SVE_VLEN / 2; ++i) {
273 |         if (t.x[2 * i])
274 |             x[vnum * SVE_VLEN / 2 + i] = r.x[i];
275 |     }
276 | }
277 | static svint16_t svqadd_s16(svint16_t a, svint16_t b) {
278 |     int const max = (1 << 15) - 1;
279 |     int const min = -(1 << 15);
280 |     svint16_t r;
281 |     for (int i = 0; i < SVE_VLEN / 2; ++i) {
282 |         int x = a.x[i];
283 |         int y = b.x[i];
284 |         if (x + y >= max)
285 |             r.x[i] = max;
286 |         else if (x + y <= min)
287 |             r.x[i] = min;
288 |         else
289 |             r.x[i] = x + y;
290 |     }
291 |     return r;
292 | }
293 | #endif
294 | #endif
295 | 


--------------------------------------------------------------------------------
/src/rocm_device_macros.h:
--------------------------------------------------------------------------------
  1 | 
  2 | #ifndef __HPLAI_HIP_DEVICE_MACROS__
  3 | #define __HPLAI_HIP_DEVICE_MACROS__
  4 | 
  5 | //#include <hip/hip.h>
  6 | #include <hip/hip_runtime.h>
  7 | #include <hip/hip_vector_types.h>
  8 | #include <hip/hip_fp16.h>
  9 | #include <hipblas.h>
 10 | #include <rocblas.h>
 11 | #include <rocsolver.h>
 12 | //#include <hiprand.h>
 13 | 
 14 | #define CHECK_BIT(var,pos) ( (var>>pos) & 1 ) 
 15 | 
 16 | // #define GPU_EVENT
 17 | 
 18 | #ifdef GPU_EVENT
 19 | #define GPU_EVENTCREATE(a)       hipEventCreate(a)
 20 | #define GPU_EVENTRECORD(a,b)     hipEventRecord(a,b)
 21 | #define GPU_EVENTSYNC(a)         hipEventSynchronize(a)
 22 | #define GPU_EVENTELAPSED(a,b,c)  hipEventElapsedTime(a,b,c)
 23 | #define GPU_EVENTDESTROY(a)      hipEventDestroy(a)
 24 | #else
 25 | #define GPU_EVENTCREATE(a)       { }
 26 | #define GPU_EVENTRECORD(a,b)     { }
 27 | #define GPU_EVENTSYNC(a)         { }
 28 | #define GPU_EVENTELAPSED(a,b,c)  { }
 29 | #define GPU_EVENTDESTROY(a)      { }
 30 | #endif
 31 | 
 32 | //#define SKIP       
 33 | #ifdef SKIP
 34 | #define checkGPUblas(a) { }
 35 | #endif
 36 | 
 37 | //#define DOUBLE
 38 | 
 39 | // *** BASIC HIP MACROS ***
 40 | // Kernel Macros
 41 | #define GPU_BLOCKIDX_X \
 42 |     hipBlockIdx_x
 43 | 
 44 | #define GPU_BLOCKIDX_Y \
 45 |     hipBlockIdx_y
 46 | 
 47 | #define GPU_BLOCKIDX_Z \
 48 |     hipBlockIdx_z
 49 | 
 50 | 
 51 | #define GPU_THREADIDX_X \
 52 |     hipThreadIdx_x
 53 | 
 54 | #define GPU_THREADIDX_Y \
 55 |     hipThreadIdx_y
 56 | 
 57 | #define GPU_THREADIDX_Z \
 58 |     hipThreadIdx_z
 59 | 
 60 | 
 61 | #define GPU_BLOCKDIM_X \
 62 |     hipBlockDim_x
 63 | 
 64 | #define GPU_BLOCKDIM_Y \
 65 |     hipBlockDim_y
 66 | 
 67 | #define GPU_BLOCKDIM_Z \
 68 |     hipBlockDim_z
 69 | 
 70 | 
 71 | #define GPU_GRIDDIM_X \
 72 |     hipGridDim_x
 73 | 
 74 | #define GPU_GRIDDIM_Y \
 75 |     hipGridDim_y
 76 | 
 77 | #define GPU_GRIDDIM_Z \
 78 |     hipGridDim_z
 79 | 
 80 | 
 81 | // Types
 82 | #define GPU_ERROR_T \
 83 |     hipError_t
 84 | 
 85 | #define GPU_STREAM_T \
 86 |     hipStream_t
 87 | 
 88 | 
 89 | // Enums
 90 | #define GPU_SUCCESS \
 91 |     hipSuccess
 92 | 
 93 | #define GPU_STREAM_NON_BLOCKING \
 94 |     hipStreamNonBlocking
 95 | 
 96 | #ifdef DOUBLE 
 97 | #define GPU_R_16F \
 98 |     rocblas_datatype_f32_r
 99 | //  rocblas_datatype_f64_r
100 | 
101 | #define GPU_R_32F \
102 |     rocblas_datatype_f32_r
103 | //  rocblas_datatype_f64_r
104 | 
105 | #else
106 | 
107 | #define GPU_R_16F \
108 |     rocblas_datatype_f16_r
109 | 
110 | #define GPU_R_32F \
111 |     rocblas_datatype_f32_r
112 | #endif
113 | 
114 | #define GPU_R_64F \
115 |     rocblas_datatype_f64_r
116 | 
117 | #define GPU_MEMCPY_DEVICE_TO_HOST \
118 |     hipMemcpyDeviceToHost
119 | 
120 | #define GPU_MEMCPY_HOST_TO_DEVICE \
121 |     hipMemcpyHostToDevice
122 | 
123 | // Kernels
124 | #define GPU_DEVICE_RESET() \
125 |     hipDeviceReset()
126 | 
127 | #define GPU_SET_DEVICE(deviceID) \
128 |     hipSetDevice(deviceID)
129 | 
130 | #define GPU_DEVICE_SYNCHRONIZE() \
131 |     hipDeviceSynchronize()
132 | 
133 | #define GPU_THREAD_SYNCHRONIZE(threadID) \
134 |     hipStreamSynchronize(threadID); 
135 | 
136 | #define GPU_FREE(memPointer) \
137 |     hipFree(memPointer)
138 | 
139 | #define GPU_FREE_HOST(memPointer) \
140 |     hipHostFree(memPointer)
141 | 
142 | #define GPU_GET_ERROR_STRING(hipError) \
143 |     hipGetErrorString(hipError)
144 | 
145 | #define GPU_GET_LAST_ERROR() \
146 |     hipGetLastError()
147 | 
148 | #define GPU_MALLOC(memAddress, numBytes) \
149 |     hipMalloc(memAddress, numBytes)
150 | 
151 | #define GPU_MALLOC_HOST(memAddress, numBytes) \
152 |     hipHostMalloc(memAddress, numBytes)
153 | 
154 | #define GPU_MALLOC_MANAGED(memAddress, numBytes) \
155 |     hipMallocManaged(memAddress, numBytes)
156 | 
157 | #define GPU_MEMCPY(memPointer_to, memPointer_from, numBytes, directionEnum) \
158 |     hipMemcpy(memPointer_to, memPointer_from, numBytes, directionEnum)
159 | 
160 | #define GPU_MEMCPY_2D(memPointer_to, pitchBytes_to, memPointer_from, pitchBytes_from, numBytes_W, numBytes_H, directionEnum) \
161 |     hipMemcpy2D(memPointer_to, pitchBytes_to, memPointer_from, pitchBytes_from, numBytes_W, numBytes_H, directionEnum)
162 | 
163 | #define GPU_MEMCPY_DEVICE_TO_DEVICE \
164 |     hipMemcpyDeviceToDevice
165 | 
166 | #define GPU_MEM_GET_INFO(freeMem, totalMem) \
167 |     hipMemGetInfo(freeMem, totalMem)
168 | 
169 | #define GPU_STREAM_CREATE_WITH_FLAGS(hipStream, streamTypeEnum) \
170 |     hipStreamCreateWithFlags(hipStream, streamTypeEnum)
171 | 
172 | #define GPU_DAXPY(blasHandle, size, alpha, A, Ainc, B, Binc) \
173 |     rocblas_daxpy(blasHandle, size, alpha, A, Ainc, B, Binc)
174 | 
175 | #define GPU_DTRSV(blasHandle, rocFill, rocOp, rocDiag, M_dim, memPointer_A, lda, memPointer_B, Binc ) \
176 |     rocblas_dtrsv(blasHandle, rocFill, rocOp, rocDiag, M_dim, memPointer_A, lda, memPointer_B, Binc)
177 | 
178 | 
179 | 
180 | // *** HIPBLAS MACROS ***
181 | // Types
182 | #define GPUBLAS_HANDLE_T \
183 |     rocblas_handle
184 | 
185 | #define GPUBLAS_STATUS_T \
186 |     rocblas_status
187 | 
188 | // Enums
189 | #define GPUBLAS_STATUS_SUCCESS \
190 |     rocblas_status_success
191 | 
192 | #define GPUBLAS_STATUS_NOT_INITIALIZED \
193 |     rocblas_status_invalid_handle
194 | 
195 | #define GPUBLAS_STATUS_ALLOC_FAILED \
196 |     rocblas_status_memory_error
197 | 
198 | #define GPUBLAS_STATUS_INVALID_VALUE \
199 |     rocblas_status_invalid_value
200 | 
201 | // No good option for rocblas_status
202 | #define GPUBLAS_STATUS_ARCH_MISMATCH \
203 |     rocblas_status_perf_degraded
204 | 
205 | #define GPUBLAS_STATUS_MAPPING_ERROR \
206 |     rocblas_status_invalid_pointer
207 | 
208 | #define GPUBLAS_STATUS_EXECUTION_FAILED \
209 |     rocblas_status_invalid_size
210 | 
211 | #define GPUBLAS_STATUS_INTERNAL_ERROR \
212 |     rocblas_status_internal_error
213 | 
214 | #define GPUBLAS_OP_N \
215 |     rocblas_operation_none
216 | 
217 | #define GPUBLAS_OP_T \
218 |     rocblas_operation_transpose
219 | 
220 | #define GPUBLAS_SIDE_RIGHT \
221 |     rocblas_side_right
222 | 
223 | #define GPUBLAS_SIDE_LEFT \
224 |     rocblas_side_left
225 | 
226 | #define GPUBLAS_FILL_MODE_UPPER \
227 |     rocblas_fill_upper
228 | 
229 | #define GPUBLAS_FILL_MODE_LOWER \
230 |     rocblas_fill_lower
231 | 
232 | #define GPUBLAS_DIAG_UNIT \
233 |     rocblas_diagonal_unit
234 | 
235 | #define GPUBLAS_DIAG_NON_UNIT \
236 |     rocblas_diagonal_non_unit
237 | 
238 | // Kernels
239 | #define GPUBLAS_CREATE(rocblasHandle) \
240 |     rocblas_create_handle(rocblasHandle)
241 | 
242 | #define GPUBLAS_SET_STREAM(rocblasHandle, hipStream) \
243 |     rocblas_set_stream(rocblasHandle, hipStream)
244 | 
245 | //#define GPUBLAS_SGETRF_BATCHED(hipblasHandle, N_dim, memPointer_A, lda, memPointer_Pivot, memPointer_Info, batchSize) \
246 | //    hipblasSgetrfBatched(hipblasHandle, N_dim, memPointer_A, lda, memPointer_Pivot, memPointer_Info, batchSize)
247 | 
248 | #define GPUBLAS_STRSM(rocblasHandle, rocSide, rocFill, rocOp, rocDiag, M_dim, N_dim, alpha, memPointer_A, lda, memPointer_B, ldb) \
249 |     rocblas_strsm(rocblasHandle, rocSide, rocFill, rocOp, rocDiag, M_dim, N_dim, alpha, memPointer_A, lda, memPointer_B, ldb)
250 | //dbl  rocblas_dtrsm(rocblasHandle, rocSide, rocFill, rocOp, rocDiag, M_dim, N_dim, alpha, memPointer_A, lda, memPointer_B, ldb)
251 | 
252 | #define GPU_daxpy(handle, n, alpha, x, incx, y, incy) \
253 |     rocblas_daxpy(handle, n, alpha, x, incx, y, incy)
254 | 
255 | #define GPU_dscal(handle, n, alpha, x, incx) \
256 |     rocblas_dscal(handle, n, alpha, x, incx)
257 | 
258 | #define GPU_setValue(x, value, size) \
259 |     hipMemset(x, value, size)
260 | 
261 | 
262 | // Non-Simple Kernels
263 | //#define GPUBLAS_GET_ERROR_STRING(hipblasStatus) \
264 | //    hipblasGetErrorString(hipblasStatus)
265 | 
266 | 
267 | #define GPUBLAS_SGEMM_EX(hipblasHandle, hipOp_A, hipOp_B, M_dim, N_dim, k_dim, alpha, memPointer_A, datatype_A, lda, memPointer_B, datatype_B, ldb, beta, memPointer_C, datatype_C, ldc) \
268 |     rocblas_gemm_ex(hipblasHandle, hipOp_A, hipOp_B, M_dim, N_dim, k_dim, alpha, memPointer_A, datatype_A, lda, memPointer_B, datatype_B, ldb, beta, memPointer_C, datatype_C, ldc, memPointer_C, datatype_C, ldc, datatype_C, rocblas_gemm_algo_standard, 0, 0)
269 | 
270 | 
271 | 
272 | // *** HIPSOLVER MACROS ***
273 | // Types
274 | #define GPUSOLVER_HANDLE_T \
275 |     rocblas_handle
276 | 
277 | #define GPUSOLVER_STATUS_T \
278 |     rocblas_status
279 | 
280 | 
281 | // Enums
282 | #define GPUSOLVER_STATUS_SUCCESS \
283 |     rocblas_status_success
284 | 
285 | 
286 | // Kernels
287 | #define GPUSOLVER_CREATE(rocsolverHandle) \
288 |     rocblas_create_handle(rocsolverHandle)
289 | 
290 | #define GPUSOLVER_SGETRF(rocsolverHandle, M_dim, N_dim, memPointer, lda, memAddress_Buffer, memPointer_Pivot, memPointer_Info) \
291 |       rocsolver_sgetrf_npvt(rocsolverHandle, M_dim, N_dim, memPointer, lda, memPointer_Info)
292 | //dbl rocsolver_dgetrf_npvt(rocsolverHandle, M_dim, N_dim, memPointer, lda, memPointer_Info)
293 | 
294 | //#define GPUSOLVER_SGETRF_BUFFERSIZE(hipsolverDnHandle, M_dim, N_dim, memPointer, lda, memAddress_Buffer, memPointer_Pivot, memPointer_Info) \
295 | //    hipsolverDnSgetrf_bufferSize(hipsolverDnHandle, M_dim, N_dim, memPointer, lda, memAddress_Buffer, memPointer_Pivot, memPointer_Info)
296 | 
297 | //#define GPUSOLVER_SGETRF_BUFFERSIZE(hipsolverDnHandle, M_dim, N_dim, memPointer, lda, memAddress_Buffer) \
298 | //    hipsolverDnSgetrf_bufferSize(hipsolverDnHandle, M_dim, N_dim, memPointer, lda, memAddress_Buffer)
299 | 
300 | #define GPUSOLVER_SET_STREAM(rocsolverHandle, hipStream) \
301 |     rocblas_set_stream(rocsolverHandle, hipStream)
302 | 
303 | 
304 | 
305 | #if 0
306 | // *** HIPRAND MACROS ***
307 | // Types
308 | #define GPURAND_GENERATOR_T \
309 |     hiprandGenerator_t
310 | 
311 | 
312 | // Enums
313 | #define GPURAND_RNG_PSEUDO_DEFAULT \
314 |     HIPRAND_RNG_PSEUDO_DEFAULT
315 | 
316 | 
317 | // Kernels
318 | #define GPURAND_CREATE_GENERATOR(hiprandGenerator, hiprandRngType) \
319 |     hiprandCreateGenerator(hiprandGenerator, hiprandRngType)
320 | 
321 | #define GPURAND_DESTROY_GENERATOR(hiprandGenerator) \
322 |     hiprandDestroyGenerator(hiprandGenerator)
323 | 
324 | #define GPURAND_GENERATE_UNIFORM(hiprandGenerator, memPointer, numBytes) \
325 |     hiprandGenerateUniform(hiprandGenerator, memPointer, numBytes)
326 | 
327 | #define GPURAND_GENERATE_UNIFORM_DOUBLE(hiprandGenerator, memPointer, numBytes) \
328 |     hiprandGenerateUniformDouble(hiprandGenerator, memPointer, numBytes)
329 | 
330 | #define GPURAND_SET_PSEUDO_RANDOM_GENERATOR_SEED(hiprandGenerator, seed) \
331 |     hiprandSetPseudoRandomGeneratorSeed(hiprandGenerator, seed)
332 | 
333 | #endif
334 | 
335 | #endif // __HPLAI_HIP_DEVICE_MACROS__
336 | 


--------------------------------------------------------------------------------
/src/fp16sim.hpp:
--------------------------------------------------------------------------------
  1 | #ifndef FP16SIM_HPP
  2 | #define FP16SIM_HPP
  3 | 
  4 | #include "device_macros.h"
  5 | 
  6 | // a very small wrapper for fp16.
  7 | 
  8 | //#define BF_NMANT 3
  9 | 
 10 | #ifdef __aarch64__
 11 | #if !defined(__FUJITSU) && !defined(__CLANG_FUJITSU)
 12 | //#    define FP16_NATIVE_SUPPORT
 13 | #define FP16_AUTO_PROMOTION
 14 | #else
 15 | #define FP16_FUJITSU_TRAD_MODE
 16 | #endif
 17 | #elif defined(BF_NMANT)
 18 | #define FP16_BFLIKE_FLOAT
 19 | #if BF_NMANT > 7 || BF_NMANT <= 1
 20 | #error "too large or small mantissa for BFLIKE_FLOAT"
 21 | #endif
 22 | #elif defined(__AVX2__)
 23 | #define FP16_AVX2_EMULATION
 24 | #elif defined(__clang__) && __clang_major__ >= 8
 25 | #define FP16_AUTO_PROMOTION
 26 | #else
 27 | #define FP16_IS_NOT_SUPPORTED
 28 | #endif
 29 | 
 30 | #ifdef FP16_NATIVE_SUPPORT
 31 | typedef _Float16 fp16;
 32 | 
 33 | inline void hgemm(int m, int n, int k, float alpha, fp16 const *a, int lda,
 34 |                   fp16 const *b, int ldb, float beta, fp16 *c, int ldc) {
 35 |     // HGEMM
 36 |     // replace with native one for performance.
 37 |     for (int i = 0; i < n; ++i) {
 38 |         for (int j = 0; j < m; ++j) {
 39 |             fp16 temp(0.f);
 40 |             for (int l = 0; l < k; ++l)
 41 |                 temp = a[l * lda + j] * b[i * ldb + l] + temp;
 42 |             c[ldc * i + j] = c[ldc * i + j] * beta + temp * alpha;
 43 |         }
 44 |     }
 45 | }
 46 | inline void shgemm(int m, int n, int k, float alpha, fp16 const *a, int lda,
 47 |                    fp16 const *b, int ldb, float beta, float *c, int ldc) {
 48 |     // SHGEMM. HGEMM with fp32 accumulator.
 49 |     // replace with native one for performance.
 50 |     for (int i = 0; i < n; ++i) {
 51 |         for (int j = 0; j < m; ++j) {
 52 |             float temp = 0.f;
 53 |             for (int l = 0; l < k; ++l)
 54 |                 temp = a[l * lda + j] * b[i * ldb + l] + temp;
 55 |             c[ldc * i + j] = c[ldc * i + j] * beta + temp * alpha;
 56 |         }
 57 |     }
 58 | }
 59 | #endif
 60 | 
 61 | #ifdef FP16_FUJITSU_TRAD_MODE
 62 | // and CLANG mode
 63 | #include <stdlib.h>
 64 | extern "C" void fjblas_gemm_r16_(...);
 65 | typedef __fp16 fp16;
 66 | inline void hgemm(int m, int n, int k, float alpha, fp16 const *a, int lda,
 67 |                   fp16 const *b, int ldb, float beta, fp16 *c, int ldc) {
 68 |     (void)alpha;
 69 |     (void)beta;
 70 |     short one = 15360;   // == 1.
 71 |     short mone = -17408; // == -1.
 72 |     fjblas_gemm_r16_("N", "N", &m, &n, &k, &mone, a, &lda, b, &ldb, &one, c,
 73 |                      &ldc);
 74 | }
 75 | inline void shgemm(int, int, int, float, fp16 const *, int, fp16 const *, int,
 76 |                    float, float *, int) {
 77 |     abort();
 78 | }
 79 | #endif
 80 | 
 81 | #ifdef FP16_AUTO_PROMOTION
 82 | typedef __fp16 fp16;
 83 | 
 84 | inline void hgemm(int m, int n, int k, float alpha, fp16 const *a, int lda,
 85 |                   fp16 const *b, int ldb, float beta, fp16 *c, int ldc) {
 86 |     for (int i = 0; i < n; ++i) {
 87 |         for (int j = 0; j < m; ++j) {
 88 |             fp16 temp(0);
 89 |             for (int l = 0; l < k; ++l)
 90 |                 temp = a[l * lda + j] * b[i * ldb + l] + temp;
 91 |             c[ldc * i + j] = c[ldc * i + j] * beta + temp * alpha;
 92 |         }
 93 |     }
 94 | }
 95 | inline void shgemm(int m, int n, int k, float alpha, fp16 const *a, int lda,
 96 |                    fp16 const *b, int ldb, float beta, float *c, int ldc) {
 97 |     for (int i = 0; i < n; ++i) {
 98 |         for (int j = 0; j < m; ++j) {
 99 |             float temp = 0.f;
100 |             for (int l = 0; l < k; ++l)
101 |                 temp = a[l * lda + j] * b[i * ldb + l] + temp;
102 |             c[ldc * i + j] = c[ldc * i + j] * beta + temp * alpha;
103 |         }
104 |     }
105 | }
106 | #endif
107 | 
108 | #if defined(FP16_AVX2_EMULATION) || defined(FP16_BFLIKE_FLOAT)
109 | #ifdef FP16_AVX2_EMULATION
110 | #include <x86intrin.h>
111 | struct fp16 {
112 |     unsigned short x;
113 |     fp16() {}
114 |     fp16(const fp16 &rhs) : x(rhs.x) {}
115 |     fp16 &operator=(fp16 rhs) {
116 |         x = rhs.x;
117 |         return *this;
118 |     }
119 |     fp16(float t) { x = _cvtss_sh(t, 0); }
120 |     float convert_to_float() const { return _cvtsh_ss(x); }
121 |     explicit operator float() const { return convert_to_float(); }
122 |     explicit operator double() const {
123 |         return static_cast<double>(convert_to_float());
124 |     }
125 | 
126 |     fp16 operator+(fp16 rhs) const {
127 |         return this->convert_to_float() + rhs.convert_to_float();
128 |     }
129 |     fp16 operator-(fp16 rhs) const {
130 |         return this->convert_to_float() - rhs.convert_to_float();
131 |     }
132 |     fp16 operator*(fp16 rhs) const {
133 |         return this->convert_to_float() * rhs.convert_to_float();
134 |     }
135 | };
136 | 
137 | #endif
138 | 
139 | #ifdef FP16_BFLIKE_FLOAT
140 | #include <math.h>
141 | #include <stdint.h>
142 | #include <stdio.h>
143 | struct fp16 {
144 |     uint16_t x;
145 |     fp16() {}
146 |     fp16(const fp16 &rhs) : x(rhs.x) {}
147 |     fp16 &operator=(fp16 rhs) {
148 |         x = rhs.x;
149 |         return *this;
150 |     }
151 |     fp16(float f) {
152 |         uint32_t t = *reinterpret_cast<uint32_t *>(&f);
153 |         uint32_t exp = t & 0x7f800000u;
154 |         uint32_t mant = t & 0x007fffffu;
155 |         int shift = 16 + 7 - BF_NMANT;
156 |         x = (t >> shift);
157 |         if (mant & (1u << (shift - 1))) {
158 |             uint32_t lowmant = mant & ((1u << shift) - 1u);
159 |             uint32_t halfway = 1u << (shift - 1);
160 |             if (lowmant > halfway || (x & 0x1u))
161 |                 ++x;
162 |         }
163 |         /*{
164 |                 float o = this->convert_to_float();
165 |                 float e = (f==0.f? fabs(o-f): fabs(o-f)/fabs(f));
166 |                 if(e>1e-1) printf("XX %x %x %.15e -> %.15e :: %f\n", t,
167 |         (uint32_t)x, f, o, e);
168 |         }*/
169 |     }
170 |     float convert_to_float() const {
171 |         // upcast is easy
172 |         uint32_t t = ((uint32_t)x) << (16 + 7 - BF_NMANT);
173 |         return *(float *)&t;
174 |     }
175 |     explicit operator float() const { return convert_to_float(); }
176 |     explicit operator double() const {
177 |         return static_cast<double>(convert_to_float());
178 |     }
179 | 
180 |     fp16 operator+(fp16 rhs) const {
181 |         return this->convert_to_float() + rhs.convert_to_float();
182 |     }
183 |     fp16 operator-(fp16 rhs) const {
184 |         return this->convert_to_float() - rhs.convert_to_float();
185 |     }
186 |     fp16 operator*(fp16 rhs) const {
187 |         return this->convert_to_float() * rhs.convert_to_float();
188 |     }
189 | };
190 | #endif
191 | 
192 | // double rounding causes larger error in very rare case. we ignore it for
193 | // performance
194 | inline float fp16_fma(fp16 a, fp16 b, fp16 c) {
195 |     float fa = a.convert_to_float();
196 |     float fb = b.convert_to_float();
197 |     float fc = c.convert_to_float();
198 |     return fa * fb + fc;
199 | }
200 | 
201 | void hgemm_opt(int m, int n, int k, float alpha, fp16 const *a, int lda,
202 |                fp16 const *b, int ldb, float /*beta*/, fp16 *c, int ldc);
203 | inline void hgemm(int m, int n, int k, float alpha, fp16 const *a, int lda,
204 |                   fp16 const *b, int ldb, float beta, fp16 *c, int ldc) {
205 | #ifdef FP16_AVX2_EMULATION
206 |     if (beta == 1.f) { // remove this in the case for test
207 |         hgemm_opt(m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
208 |         return;
209 |     }
210 | #endif
211 |     for (int j = 0; j < m; ++j) {
212 |         for (int i = 0; i < n; ++i) {
213 |             fp16 temp(0.f);
214 |             for (int l = 0; l < k; ++l)
215 |                 temp = fp16_fma(a[l * lda + j], b[i * ldb + l], temp);
216 |             c[ldc * i + j] = (c[ldc * i + j].convert_to_float() * beta +
217 |                               temp.convert_to_float() * alpha);
218 |         }
219 |     }
220 | }
221 | 
222 | void shgemm_opt(int m, int n, int k, float alpha, fp16 const *a, int lda,
223 |                 fp16 const *b, int ldb, float /*beta*/, float *c, int ldc);
224 | inline void shgemm(int m, int n, int k, float alpha, fp16 const *a, int lda,
225 |                    fp16 const *b, int ldb, float beta, float *c, int ldc) {
226 | #ifdef FP16_AVX2_EMULATION
227 |     if (beta == 1.f) { // remove this in the case of test
228 |         shgemm_opt(m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
229 |         return;
230 |     }
231 | #endif
232 |     for (int j = 0; j < m; ++j) {
233 |         for (int i = 0; i < n; ++i) {
234 |             float temp = 0.f;
235 |             for (int l = 0; l < k; ++l)
236 |                 temp = a[l * lda + j].convert_to_float() *
237 |                            b[i * ldb + l].convert_to_float() +
238 |                        temp;
239 |             c[ldc * i + j] = c[ldc * i + j] * beta + temp * alpha;
240 |         }
241 |     }
242 | }
243 | 
244 | #endif
245 | 
246 | #ifdef FP16_IS_NOT_SUPPORTED
247 | #warning "FP16 IS NOT SUPPORTED"
248 | typedef unsigned short fp16;
249 | 
250 | // do nothing.
251 | inline void hgemm(...) {}
252 | inline void shgemm(...) {}
253 | 
254 | #endif
255 | 
256 | #if 0
257 | // test code
258 | #include "fp16sim.hpp"
259 | #include <cstdio>
260 | #include <cstdlib>
261 | 
262 | int main(){
263 | 	int m = 300, n = 210, k=101;
264 | 	fp16*a = (fp16*)malloc(sizeof(fp16)*m*k);
265 | 	fp16*b = (fp16*)malloc(sizeof(fp16)*k*n);
266 | 	fp16*c = (fp16*)malloc(sizeof(fp16)*m*n);
267 | 	fp16*c2 = (fp16*)malloc(sizeof(fp16)*m*n);
268 | 	for(int j=0; j<k; ++j) for(int i=0; i<m; ++i) a[m*j+i] = (float)std::rand()/RAND_MAX;
269 | 	for(int j=0; j<n; ++j) for(int i=0; i<k; ++i) b[k*j+i] = (float)std::rand()/RAND_MAX;
270 | 	for(int j=0; j<n; ++j) for(int i=0; i<m; ++i) c[m*j+i] = 0.f;
271 | 	for(int j=0; j<n; ++j) for(int i=0; i<m; ++i) c2[m*j+i] = 0.f;
272 | 	hgemm(m, n, k, -1.f, a, m, b, k, 1.f, c, m);
273 | 	hgemm_opt(m, n, k, -1.f, a, m, b, k, 1.f, c2, m);
274 | 	double error = 0.;
275 | 	for(int j=0; j<n; ++j) for(int i=0; i<m; ++i) {
276 | 		double t = (double)c[m*j+i] - (double)c2[m*j+i];
277 | 		t = t < 0. ? -t: t;
278 | 		error = t > error ? t: error;
279 | 		//std::printf("%d %d %e %e\n", i, j, (float)c[m*j+i], (float)c2[m*j+i]);
280 | 	}
281 | 	std::printf("hgemm error = %e\n", error);
282 | 
283 | 	float*sc = (float*)malloc(sizeof(float)*m*n);
284 | 	float*sc2 = (float*)malloc(sizeof(float)*m*n);
285 | 	for(int j=0; j<n; ++j) for(int i=0; i<m; ++i) sc[m*j+i] = 0.f;
286 | 	for(int j=0; j<n; ++j) for(int i=0; i<m; ++i) sc2[m*j+i] = 0.f;
287 | 	shgemm(m, n, k, -1.f, a, m, b, k, 1.f, sc, m);
288 | 	shgemm_opt(m, n, k, -1.f, a, m, b, k, 1.f, sc2, m);
289 | 	error = 0.;
290 | 	for(int j=0; j<n; ++j) for(int i=0; i<m; ++i) {
291 | 		double t = (double)sc[m*j+i] - (double)sc2[m*j+i];
292 | 		t = t < 0. ? -t: t;
293 | 		error = t > error ? t: error;
294 | 		//std::printf("%d %d %e %e\n", i, j, (float)sc[m*j+i], (float)sc2[m*j+i]);
295 | 	}
296 | 	std::printf("shgemm error = %e\n", error);
297 | 
298 | 	return 0;
299 | }
300 | #endif
301 | #endif
302 | 


--------------------------------------------------------------------------------
/src/grid.hpp:
--------------------------------------------------------------------------------
  1 | #ifndef GRID_HPP
  2 | #define GRID_HPP
  3 | #include "fp16sim.hpp"
  4 | #include <cassert>
  5 | #include <cstring>
  6 | #include <mpi.h>
  7 | #include "log.hpp"
  8 | 
  9 | #ifdef __APPLE__
 10 | #define aligned_alloc(alignment, size) malloc(size)
 11 | #endif
 12 | 
 13 | extern int grank, gsize; 
 14 | extern int reorder[ 8 ];
 15 | 
 16 | enum NumaMap {
 17 |     // How to destribute NUMA processes to the process grid.
 18 |     ROWCONT, // continuous in row
 19 |     COLCONT, // continuous in column
 20 |     ROWDIST, // distributed (cyclic) over row
 21 |     COLDIST, // distributed (cyclic) over column
 22 |     CONT2D   // continuous in 2x2. this is only for nnuma==4
 23 | };
 24 | 
 25 | struct Grid {
 26 |     // vcomm is a communicator for vertical communication (inside a column)
 27 |     // row = id(vcomm), nrow = sz(vcomm)
 28 |     // hcomm is a communicator for horizontal communication (inside a row)
 29 |     // col = id(hcomm), ncol = sz(hcomm)
 30 |     int row, col;
 31 |     int nrow, ncol;
 32 |     int idnuma, nnuma;
 33 |     MPI_Comm vcomm, hcomm, commworld;
 34 |     Grid(MPI_Comm comm, int nrow, int numasize = 0,
 35 |          NumaMap map = NumaMap::ROWCONT)
 36 |         : commworld(comm) {
 37 |         assert(numasize >= 0);
 38 |         assert(numasize != 0 || map != NumaMap::ROWCONT);
 39 | 
 40 |         int rank, size;
 41 |         MPI_Comm_rank(comm, &rank);
 42 |         MPI_Comm_size(comm, &size);
 43 |         if ( gsize % nrow ) MPI_Abort(MPI_COMM_WORLD, 4);
 44 |         int ncol = gsize / nrow;
 45 |         int myrow, mycol;
 46 |         if ( numasize == 1 ) {
 47 |      	    PrintMsg("\tNode Grid - 2x3C");
 48 | 	    idnuma = 0;
 49 |             nnuma = 1;
 50 |             int myNode = grank / 6;
 51 |             int myLocalID = grank % 6;
 52 |             int nodeRow = (myNode % (nrow / 2)) * 2;
 53 |             int nodeCol = (myNode / (nrow / 2)) * 3;
 54 |             myrow = (myLocalID % 2) + nodeRow;
 55 |             mycol = (myLocalID / 2) + nodeCol;
 56 |         }
 57 | 	else if ( numasize == 2 ) {
 58 |        	    PrintMsg("\tNode Grid - 3x2C");
 59 | 	    idnuma = 0;
 60 |             nnuma = 1;
 61 |             int myNode = grank / 6;
 62 |             int myLocalID = grank % 6;
 63 |             int nodeRow = (myNode % (nrow / 3)) * 2;
 64 |             int nodeCol = (myNode / (nrow / 3)) * 2;
 65 |             myrow = (myLocalID % 3) + nodeRow;
 66 |             mycol = (myLocalID / 3) + nodeCol;
 67 |         }
 68 |         else if ( numasize == 0 )
 69 | 	{
 70 |             PrintMsg("\tGlobal Column Major");
 71 | 	    idnuma = 0;
 72 |             nnuma = 1;
 73 |             myrow = grank % nrow;
 74 |             mycol = grank / nrow;
 75 |         }
 76 |         else if ( numasize == 3 )
 77 | 	{
 78 |             PrintMsg("\tGlobal Row Major");
 79 | 	    idnuma = 0;
 80 |             nnuma = 1;
 81 |             myrow = grank / ncol;
 82 |             mycol = grank % ncol;
 83 |         }
 84 | 	else if ( numasize == 4 )
 85 | 	{
 86 |             PrintMsg("\tNode Grid - 2x4R");
 87 | 	    idnuma = 0;
 88 |             nnuma = 1;
 89 | 	    int myNode    = grank / 8;
 90 |             int myLocalID = grank % 8;
 91 |             int nodeRow = (myNode % (nrow / 2)) * 2;
 92 |             int nodeCol = (myNode / (nrow / 2)) * 4;
 93 | 	    myrow       = myLocalID / 4;
 94 | 	    mycol       = myLocalID % 4;
 95 | 	    myrow      += nodeRow;
 96 |             mycol      += nodeCol;
 97 | 	}
 98 | 	else if ( numasize == 5 )
 99 | 	{
100 |             PrintMsg("\tNode Grid - 2x4C");
101 | 	    idnuma = 0;
102 |             nnuma = 1;
103 | 	    int myNode    = grank / 8;
104 |             int myLocalID = grank % 8;
105 |             int nodeRow = (myNode % (nrow / 2)) * 2;
106 |             int nodeCol = (myNode / (nrow / 2)) * 4;
107 | 	    myrow       = myLocalID % 2;
108 | 	    mycol       = myLocalID / 2;
109 | 	    myrow      += nodeRow;
110 |             mycol      += nodeCol;
111 | 	}
112 | 	else if ( numasize == 6 )
113 |         {
114 |        	    PrintMsg("\tNode Grid - 4x2R");
115 | 	    idnuma = 0;
116 |             nnuma = 1;
117 |             int myNode = grank / 8;
118 |             int myLocalID = grank % 8;
119 |             int nodeRow = (myNode % (nrow / 4)) * 4;
120 |             int nodeCol = (myNode / (nrow / 4)) * 2;
121 | 	    myrow       = myLocalID / 2;
122 | 	    mycol       = myLocalID % 2;
123 | 	    myrow      += nodeRow;
124 |             mycol      += nodeCol;
125 |         }
126 | 	else if ( numasize == 7 )
127 |         {
128 |        	    PrintMsg("\tNode Grid - 4x2C");
129 | 	    idnuma = 0;
130 |             nnuma = 1;
131 |             int myNode = grank / 8;
132 |             int myLocalID = grank % 8;
133 |             int nodeRow = (myNode % (nrow / 4)) * 4;
134 |             int nodeCol = (myNode / (nrow / 4)) * 2;
135 | 	    myrow       = myLocalID % 4;
136 | 	    mycol       = myLocalID / 4;
137 | 	    myrow      += nodeRow;
138 |             mycol      += nodeCol;
139 |         }
140 | 	else if ( numasize == 8 )
141 |         {
142 |        	    PrintMsg("\tNode Grid - 1x8C");
143 | 	    idnuma = 0;
144 |             nnuma = 1;
145 | 	    int myNode    = grank / 8;
146 |             int myLocalID = grank % 8;
147 |             int nodeRow = ( myNode % nrow );
148 |             int nodeCol = ( myNode / nrow ) * 8;
149 | 	    myrow       = myLocalID / 8;
150 | 	    mycol       = myLocalID % 8;
151 | 	    myrow      += nodeRow;
152 |             mycol      += nodeCol;
153 |         }
154 | 	else if ( numasize == 9 )
155 |         {
156 |             PrintMsg("\tNode Grid - 4x4");
157 |             idnuma = 0;
158 |             nnuma = 1;
159 |             int myNode = grank / 16;
160 |             int myLocalID = grank % 16;
161 |             int nodeRow = (myNode % (nrow / 4)) * 4;
162 |             int nodeCol = (myNode / (nrow / 4)) * 4;
163 |             switch(myLocalID)
164 |             {
165 |                 case 0:
166 |                     myrow= 0;
167 |                     mycol= 0;
168 |                     break;
169 |                 case 1:
170 |                     myrow= 1;
171 |                     mycol= 1;
172 |                     break;
173 |                 case 2:
174 |                     myrow= 2;
175 |                     mycol= 2;
176 |                     break;
177 |                 case 3:
178 |                     myrow= 3;
179 |                     mycol= 3;
180 |                     break;
181 |                 case 4:
182 |                     myrow= 0;
183 |                     mycol= 1;
184 |                     break;
185 |                 case 5:
186 |                     myrow= 1;
187 |                     mycol= 2;
188 |                     break;
189 |                 case 6:
190 |                     myrow= 2;
191 |                     mycol= 3;
192 |                     break;
193 |                 case 7:
194 |                     myrow= 3;
195 |                     mycol= 0;
196 |                     break;
197 |                 case 8:
198 |                     myrow= 0;
199 |                     mycol= 2;
200 |                     break;
201 |                 case 9:
202 |                     myrow= 1;
203 |                     mycol= 3;
204 |                     break;
205 |                 case 10:
206 |                     myrow= 2;
207 |                     mycol= 0;
208 |                     break;
209 |                 case 11:
210 |                     myrow= 3;
211 |                     mycol= 1;
212 |                     break;
213 |                 case 12:
214 |                     myrow= 0;
215 |                     mycol= 3;
216 |                     break;
217 |                 case 13:
218 |                     myrow= 1;
219 |                     mycol= 0;
220 |                     break;
221 |                 case 14:
222 |                     myrow= 2;
223 |                     mycol= 1;
224 |                     break;
225 |                 case 15:
226 |                     myrow= 3;
227 |                     mycol= 2;
228 |                     break;
229 | 	    }
230 |         }
231 | 	else if ( numasize == 10 )
232 | 	{
233 |             PrintMsg("\tNode Grid - Reorder 2x4C");
234 | 	    idnuma = 0;
235 |             nnuma = 1;
236 | 	    int myNode    = grank / 8;
237 |             int myLocalID = grank % 8;
238 |             int nodeRow = (myNode % (nrow / 2)) * 2;
239 |             int nodeCol = (myNode / (nrow / 2)) * 4;
240 | 	    int reorderID;
241 | 	    for ( int ii = 0; ii < 8; ii++ )
242 | 	    {
243 | 	       if ( myLocalID == reorder[ ii ] )
244 | 	       {
245 | 		  reorderID = ii;
246 | 		  break;
247 | 	       }
248 | 	    }
249 | 	    myrow       = reorderID % 2;
250 | 	    mycol       = reorderID / 2;
251 | 	    myrow      += nodeRow;
252 |             mycol      += nodeCol;
253 | 	}
254 | 	else {
255 | #if 0	//Future possible usage, dont delete
256 |             assert(size % numasize == 0);
257 |             idnuma = rank % numasize;
258 |             nnuma = numasize;
259 |             switch (map) {
260 |             case NumaMap::ROWCONT: {
261 |                 assert(nrow % nnuma == 0);
262 |                 myrow = rank % nrow;
263 |                 mycol = rank / nrow;
264 |             } break;
265 | 
266 |             case NumaMap::COLCONT: {
267 |                 assert((size / nrow) % nnuma == 0);
268 |                 int t = rank / nnuma;
269 |                 myrow = t % nrow;
270 |                 mycol = (t / nrow) * nnuma + idnuma;
271 |             } break;
272 | 
273 |             case NumaMap::ROWDIST: {
274 |                 assert(nrow % nnuma == 0);
275 |                 int rs = nrow / nnuma;
276 |                 int t = rank / nnuma;
277 |                 myrow = (t % rs) + idnuma * rs;
278 |                 mycol = rank / nrow;
279 |             } break;
280 | 
281 |             case NumaMap::COLDIST: {
282 |                 assert((size / nrow) % nnuma == 0);
283 |                 int t = rank / nnuma + (size / nnuma) * idnuma;
284 |                 myrow = t % nrow;
285 |                 mycol = t / nrow;
286 |             } break;
287 | 
288 |             case NumaMap::CONT2D: {
289 |                 assert(nnuma % 2 == 0); // others are not implemented yet
290 |                 assert(nrow % 2 == 0);
291 |                 assert((size / nrow) % (nnuma / 2) == 0);
292 |                 int t = rank / nnuma;
293 |                 int grow = t % (nrow / 2);
294 |                 int gcol = t / (nrow / 2);
295 |                 myrow = grow * 2 + idnuma % 2;
296 |                 mycol = gcol * (nnuma / 2) + idnuma / 2;
297 |             } break;
298 |             default:
299 |                 std::abort();
300 |             }
301 | #endif
302 |         }
303 | 
304 |         MPI_Comm_split(comm, mycol, myrow, &vcomm);
305 |         MPI_Comm_split(comm, myrow, mycol, &hcomm);
306 |         this->row = myrow;
307 |         this->col = mycol;
308 |         this->nrow = nrow;
309 |         this->ncol = ncol;
310 |     }
311 |     ~Grid() {
312 |         MPI_Comm_free(&vcomm);
313 |         MPI_Comm_free(&hcomm);
314 |     }
315 | };
316 | 
317 | template <typename T> struct Mpi_type_wrappe {};
318 | 
319 | template <> struct Mpi_type_wrappe<fp16> {
320 |     operator MPI_Datatype() { return MPI_SHORT; }
321 | };
322 | 
323 | template <> struct Mpi_type_wrappe<__half> {
324 |     operator MPI_Datatype() { return MPI_SHORT; }
325 | };
326 | 
327 | template <> struct Mpi_type_wrappe<float> {
328 |     operator MPI_Datatype() { return MPI_FLOAT; }
329 | };
330 | 
331 | template <> struct Mpi_type_wrappe<double> {
332 |     operator MPI_Datatype() { return MPI_DOUBLE; }
333 | };
334 | 
335 | template <typename F> struct T2MPI { static Mpi_type_wrappe<F> type; };
336 | 
337 | template <typename F> Mpi_type_wrappe<F> T2MPI<F>::type;
338 | 
339 | #endif
340 | 


--------------------------------------------------------------------------------
/src/otf_gemv.cpp:
--------------------------------------------------------------------------------
  1 | #include "hpl_rand.hpp"
  2 | #include "svesim.hpp"
  3 | #include <stdint.h>
  4 | 
  5 | extern "C" void otf_gemv_kernel(int64_t n, int mb, int nb, double alpha,
  6 |                                 double const *__restrict__ x,
  7 |                                 double *__restrict__ y, uint64_t seed) {
  8 |     // on-the-fly GEMV computes y = y + alpha * A * x
  9 |     // see hpl_rand.hpp for details of the matrix generation. it is LCG.
 10 |     // n is the dimension of the whole matrix.
 11 |     // mb \times nb is the dimension of the sub-matrix to compute.
 12 |     // Note that the sub-matrix cannot have diagonals. Use serial code for the
 13 |     // part includes diagonals instead.
 14 |     const int vlen = svcntd();
 15 |     const int nn = svcntd() * 4;
 16 | 
 17 |     RandCoeff c0 = RandCoeff::default_vals();
 18 |     RandStat s0;
 19 |     s0.x = seed;
 20 |     int64_t rinit[vlen];
 21 |     for (int i = 0; i < vlen; ++i) {
 22 |         rinit[i] = s0.x;
 23 |         s0 = c0 * s0;
 24 |     }
 25 |     RandCoeff c8 = c0.pow(vlen);
 26 |     RandCoeff c32 = c8.pow(4);
 27 |     RandCoeff cn = c0.pow(n);
 28 |     RandCoeff cn2 = cn.pow(2);
 29 | 
 30 |     auto t64 = svptrue_b64();
 31 |     svint64_t stat00 = svld1_s64(t64, rinit);
 32 |     alpha *= 0x1.fffffffffffffP-65;
 33 | // 24x24
 34 | //    alpha *= 0x1.fffffffffffffP-73;
 35 | // 96x96
 36 | //    alpha *= 0x1.fffffffffffffP-75;
 37 |     svfloat64_t sv0 = svdup_f64(0.);
 38 | 
 39 |     int64_t jend = nb - 2;
 40 |     for (int64_t i = 0; i < mb; i += nn) {
 41 |         __builtin_prefetch(&y[i]);
 42 |         svfloat64_t y00 = sv0;
 43 |         svfloat64_t y10 = sv0;
 44 |         svfloat64_t y20 = sv0;
 45 |         svfloat64_t y30 = sv0;
 46 |         svfloat64_t y01 = sv0;
 47 |         svfloat64_t y11 = sv0;
 48 |         svfloat64_t y21 = sv0;
 49 |         svfloat64_t y31 = sv0;
 50 |         svint64_t sc00, sc10, sc20, sc30, sc01, sc11, sc21, sc31;
 51 |         {
 52 |             svint64_t a, c;
 53 |             a = svdup_s64(c8.a);
 54 |             c = svdup_s64(c8.c);
 55 |             sc00 = stat00;
 56 |             sc10 = svmad_s64_x(t64, sc00, a, c);
 57 |             sc20 = svmad_s64_x(t64, sc10, a, c);
 58 |             sc30 = svmad_s64_x(t64, sc20, a, c);
 59 |             a = svdup_s64(cn.a);
 60 |             c = svdup_s64(cn.c);
 61 |             sc01 = svmad_s64_x(t64, sc00, a, c);
 62 |             sc11 = svmad_s64_x(t64, sc10, a, c);
 63 |             sc21 = svmad_s64_x(t64, sc20, a, c);
 64 |             sc31 = svmad_s64_x(t64, sc30, a, c);
 65 |         }
 66 |         svint64_t sva, svc;
 67 |         sva = svdup_s64(cn2.a);
 68 |         svc = svdup_s64(cn2.c);
 69 |         for (int64_t j = 0; j <= jend; j += 2) {
 70 |             svfloat64_t svx1 = svdup_f64(x[j]);
 71 |             svfloat64_t r00 = svcvt_f64_s64_x(t64, sc00);
 72 |             svfloat64_t r10 = svcvt_f64_s64_x(t64, sc10);
 73 |             svfloat64_t r20 = svcvt_f64_s64_x(t64, sc20);
 74 |             svfloat64_t r30 = svcvt_f64_s64_x(t64, sc30);
 75 |             y00 = svmla_f64_x(t64, y00, r00, svx1);
 76 |             y10 = svmla_f64_x(t64, y10, r10, svx1);
 77 |             y20 = svmla_f64_x(t64, y20, r20, svx1);
 78 |             y30 = svmla_f64_x(t64, y30, r30, svx1);
 79 |             sc00 = svmad_s64_x(t64, sc00, sva, svc);
 80 |             sc10 = svmad_s64_x(t64, sc10, sva, svc);
 81 |             sc20 = svmad_s64_x(t64, sc20, sva, svc);
 82 |             sc30 = svmad_s64_x(t64, sc30, sva, svc);
 83 | 
 84 |             svfloat64_t svx2 = svdup_f64(x[j + 1]);
 85 |             svfloat64_t r01 = svcvt_f64_s64_x(t64, sc01);
 86 |             svfloat64_t r11 = svcvt_f64_s64_x(t64, sc11);
 87 |             svfloat64_t r21 = svcvt_f64_s64_x(t64, sc21);
 88 |             svfloat64_t r31 = svcvt_f64_s64_x(t64, sc31);
 89 |             y01 = svmla_f64_x(t64, y01, r01, svx2);
 90 |             y11 = svmla_f64_x(t64, y11, r11, svx2);
 91 |             y21 = svmla_f64_x(t64, y21, r21, svx2);
 92 |             y31 = svmla_f64_x(t64, y31, r31, svx2);
 93 |             sc01 = svmad_s64_x(t64, sc01, sva, svc);
 94 |             sc11 = svmad_s64_x(t64, sc11, sva, svc);
 95 |             sc21 = svmad_s64_x(t64, sc21, sva, svc);
 96 |             sc31 = svmad_s64_x(t64, sc31, sva, svc);
 97 |         }
 98 |         if (__builtin_expect(!!(nb & 0x1u), 0)) {
 99 |             svfloat64_t r00 = svcvt_f64_s64_x(t64, sc00);
100 |             svfloat64_t r10 = svcvt_f64_s64_x(t64, sc10);
101 |             svfloat64_t r20 = svcvt_f64_s64_x(t64, sc20);
102 |             svfloat64_t r30 = svcvt_f64_s64_x(t64, sc30);
103 | 
104 |             svfloat64_t svx = svdup_f64(x[nb - 1]);
105 |             y00 = svmla_f64_x(t64, y00, r00, svx);
106 |             y10 = svmla_f64_x(t64, y10, r10, svx);
107 |             y20 = svmla_f64_x(t64, y20, r20, svx);
108 |             y30 = svmla_f64_x(t64, y30, r30, svx);
109 |         }
110 |         auto pg0 = svwhilelt_b64(i, (int64_t)(mb - 0 * vlen));
111 |         auto pg1 = svwhilelt_b64(i, (int64_t)(mb - 1 * vlen));
112 |         auto pg2 = svwhilelt_b64(i, (int64_t)(mb - 2 * vlen));
113 |         auto pg3 = svwhilelt_b64(i, (int64_t)(mb - 3 * vlen));
114 |         y00 = svadd_f64_x(t64, y00, y01);
115 |         y10 = svadd_f64_x(t64, y10, y11);
116 |         y20 = svadd_f64_x(t64, y20, y21);
117 |         y30 = svadd_f64_x(t64, y30, y31);
118 |         y00 = svmla_n_f64_x(pg0, svld1_vnum_f64(pg0, y + i, 0), y00, alpha);
119 |         y10 = svmla_n_f64_x(pg1, svld1_vnum_f64(pg1, y + i, 1), y10, alpha);
120 |         y20 = svmla_n_f64_x(pg2, svld1_vnum_f64(pg2, y + i, 2), y20, alpha);
121 |         y30 = svmla_n_f64_x(pg3, svld1_vnum_f64(pg3, y + i, 3), y30, alpha);
122 | 
123 |         svst1_vnum_f64(pg0, y + i, 0, y00);
124 |         svst1_vnum_f64(pg1, y + i, 1, y10);
125 |         svst1_vnum_f64(pg2, y + i, 2, y20);
126 |         svst1_vnum_f64(pg3, y + i, 3, y30);
127 | 
128 |         svint64_t sva32, svc32;
129 |         sva32 = svdup_s64(c32.a);
130 |         svc32 = svdup_s64(c32.c);
131 |         stat00 = svmad_s64_x(pg0, stat00, sva32, svc32);
132 |     }
133 | }
134 | 
135 | extern "C" void hmg_gemv_up(int istart, double a, double b, int mb, int nb,
136 |                             double alpha, double const *__restrict__ x,
137 |                             double *__restrict__ y) {
138 |     // same as abobe, but for the upper-triangular part of the Higham's hpl-ai
139 |     // matrix. istart is the row position of the sub-matrix. jstart is not
140 |     // needed because of the structure of the matrix. a and b are the paramter
141 |     // of the Higham's matrix.
142 |     const int vlen = svcntd();
143 |     const int nn = svcntd() * 4;
144 |     auto t64 = svptrue_b64();
145 |     double ab = a * b;
146 | 
147 |     svint64_t iindex0, iindex1, iindex2, iindex3;
148 |     iindex0 = svindex_s64(istart, 1);
149 |     iindex1 = svindex_s64(istart + vlen, 1);
150 |     iindex2 = svindex_s64(istart + 2 * vlen, 1);
151 |     iindex3 = svindex_s64(istart + 3 * vlen, 1);
152 |     svfloat64_t findex0 = svcvt_f64_s64_x(t64, iindex0);
153 |     svfloat64_t findex1 = svcvt_f64_s64_x(t64, iindex1);
154 |     svfloat64_t findex2 = svcvt_f64_s64_x(t64, iindex2);
155 |     svfloat64_t findex3 = svcvt_f64_s64_x(t64, iindex3);
156 |     svfloat64_t svincr = svdup_f64((double)nn);
157 |     svfloat64_t sva = svdup_f64(b);
158 |     svfloat64_t svab = svdup_f64(ab);
159 |     svfloat64_t sv0 = svdup_f64(0.);
160 | 
161 |     int64_t jend = nb - 2;
162 |     for (int64_t i = 0; i < mb; i += nn) {
163 |         __builtin_prefetch(&y[i]);
164 |         svfloat64_t y00 = sv0;
165 |         svfloat64_t y10 = sv0;
166 |         svfloat64_t y20 = sv0;
167 |         svfloat64_t y30 = sv0;
168 |         svfloat64_t y01 = sv0;
169 |         svfloat64_t y11 = sv0;
170 |         svfloat64_t y21 = sv0;
171 |         svfloat64_t y31 = sv0;
172 |         // a+ab*(j+nn) = (a+ab*j) + ab*nn  is better for performance, but we
173 |         // recompute them for accuracy
174 |         svfloat64_t ai0 = svmla_f64_x(t64, sva, svab, findex0);
175 |         svfloat64_t ai1 = svmla_f64_x(t64, sva, svab, findex1);
176 |         svfloat64_t ai2 = svmla_f64_x(t64, sva, svab, findex2);
177 |         svfloat64_t ai3 = svmla_f64_x(t64, sva, svab, findex3);
178 |         for (int64_t j = 0; j <= jend; j += 2) {
179 |             svfloat64_t svx1 = svdup_f64(x[j]);
180 |             y00 = svmla_f64_x(t64, y00, ai0, svx1);
181 |             y10 = svmla_f64_x(t64, y10, ai1, svx1);
182 |             y20 = svmla_f64_x(t64, y20, ai2, svx1);
183 |             y30 = svmla_f64_x(t64, y30, ai3, svx1);
184 | 
185 |             svfloat64_t svx2 = svdup_f64(x[j + 1]);
186 |             y01 = svmla_f64_x(t64, y01, ai0, svx2);
187 |             y11 = svmla_f64_x(t64, y11, ai1, svx2);
188 |             y21 = svmla_f64_x(t64, y21, ai2, svx2);
189 |             y31 = svmla_f64_x(t64, y31, ai3, svx2);
190 |         }
191 |         if (__builtin_expect(!!(nb & 0x1u), 0)) {
192 |             svfloat64_t svx = svdup_f64(x[nb - 1]);
193 |             y00 = svmla_f64_x(t64, y00, ai0, svx);
194 |             y10 = svmla_f64_x(t64, y10, ai1, svx);
195 |             y20 = svmla_f64_x(t64, y20, ai2, svx);
196 |             y30 = svmla_f64_x(t64, y30, ai3, svx);
197 |         }
198 |         auto pg0 = svwhilelt_b64(i, (int64_t)(mb - 0 * vlen));
199 |         auto pg1 = svwhilelt_b64(i, (int64_t)(mb - 1 * vlen));
200 |         auto pg2 = svwhilelt_b64(i, (int64_t)(mb - 2 * vlen));
201 |         auto pg3 = svwhilelt_b64(i, (int64_t)(mb - 3 * vlen));
202 |         y00 = svadd_f64_x(t64, y00, y01);
203 |         y10 = svadd_f64_x(t64, y10, y11);
204 |         y20 = svadd_f64_x(t64, y20, y21);
205 |         y30 = svadd_f64_x(t64, y30, y31);
206 |         y00 = svmla_n_f64_x(pg0, svld1_vnum_f64(pg0, y + i, 0), y00, alpha);
207 |         y10 = svmla_n_f64_x(pg1, svld1_vnum_f64(pg1, y + i, 1), y10, alpha);
208 |         y20 = svmla_n_f64_x(pg2, svld1_vnum_f64(pg2, y + i, 2), y20, alpha);
209 |         y30 = svmla_n_f64_x(pg3, svld1_vnum_f64(pg3, y + i, 3), y30, alpha);
210 | 
211 |         svst1_vnum_f64(pg0, y + i, 0, y00);
212 |         svst1_vnum_f64(pg1, y + i, 1, y10);
213 |         svst1_vnum_f64(pg2, y + i, 2, y20);
214 |         svst1_vnum_f64(pg3, y + i, 3, y30);
215 | 
216 |         findex0 = svadd_f64_x(t64, findex0, svincr);
217 |         findex1 = svadd_f64_x(t64, findex1, svincr);
218 |         findex2 = svadd_f64_x(t64, findex2, svincr);
219 |         findex3 = svadd_f64_x(t64, findex3, svincr);
220 |     }
221 | }
222 | 
223 | extern "C" void hmg_gemv_low(int istart, double a, double b, int mb, int nb,
224 |                              double alpha, double const *__restrict__ x,
225 |                              double *__restrict__ y) {
226 |     // same as above, but for lower-triangular part.
227 |     const int vlen = svcntd();
228 |     const int nn = svcntd() * 4;
229 |     auto t64 = svptrue_b64();
230 |     double ab = a * b;
231 | 
232 |     svfloat64_t svincr = svdup_f64((double)2);
233 |     svfloat64_t sva = svdup_f64(a);
234 |     svfloat64_t svab = svdup_f64(ab);
235 |     svfloat64_t sv0 = svdup_f64(0.);
236 | 
237 |     int64_t jend = nb - 2;
238 |     for (int64_t i = 0; i < mb; i += nn) {
239 |         svfloat64_t findex0 = svdup_f64((double)istart);
240 |         svfloat64_t findex1 = svdup_f64((double)(istart + 1));
241 |         __builtin_prefetch(&y[i]);
242 |         svfloat64_t y00 = sv0;
243 |         svfloat64_t y10 = sv0;
244 |         svfloat64_t y20 = sv0;
245 |         svfloat64_t y30 = sv0;
246 |         svfloat64_t y01 = sv0;
247 |         svfloat64_t y11 = sv0;
248 |         svfloat64_t y21 = sv0;
249 |         svfloat64_t y31 = sv0;
250 |         for (int64_t j = 0; j <= jend; j += 2) {
251 |             // a+ab*(j+nn) = (a+ab*j) + ab*nn  is better for performance, but we
252 |             // recompute them for accuracy
253 |             svfloat64_t svx1 = svdup_f64(x[j]);
254 |             svfloat64_t aj0 = svmla_f64_x(t64, sva, svab, findex0);
255 |             svfloat64_t aj1 = svmla_f64_x(t64, sva, svab, findex1);
256 |             y00 = svmla_f64_x(t64, y00, aj0, svx1);
257 |             y10 = svmla_f64_x(t64, y10, aj0, svx1);
258 |             y20 = svmla_f64_x(t64, y20, aj0, svx1);
259 |             y30 = svmla_f64_x(t64, y30, aj0, svx1);
260 | 
261 |             svfloat64_t svx2 = svdup_f64(x[j + 1]);
262 |             y01 = svmla_f64_x(t64, y01, aj1, svx2);
263 |             y11 = svmla_f64_x(t64, y11, aj1, svx2);
264 |             y21 = svmla_f64_x(t64, y21, aj1, svx2);
265 |             y31 = svmla_f64_x(t64, y31, aj1, svx2);
266 |             findex0 = svadd_f64_x(t64, findex0, svincr);
267 |             findex1 = svadd_f64_x(t64, findex1, svincr);
268 |         }
269 |         if (__builtin_expect(!!(nb & 0x1u), 0)) {
270 |             svfloat64_t svx = svdup_f64(x[nb - 1]);
271 |             svfloat64_t aj0 = svmla_f64_x(t64, sva, svab, findex0);
272 |             y00 = svmla_f64_x(t64, y00, aj0, svx);
273 |             y10 = svmla_f64_x(t64, y10, aj0, svx);
274 |             y20 = svmla_f64_x(t64, y20, aj0, svx);
275 |             y30 = svmla_f64_x(t64, y30, aj0, svx);
276 |         }
277 |         auto pg0 = svwhilelt_b64(i, (int64_t)(mb - 0 * vlen));
278 |         auto pg1 = svwhilelt_b64(i, (int64_t)(mb - 1 * vlen));
279 |         auto pg2 = svwhilelt_b64(i, (int64_t)(mb - 2 * vlen));
280 |         auto pg3 = svwhilelt_b64(i, (int64_t)(mb - 3 * vlen));
281 |         y00 = svadd_f64_x(t64, y00, y01);
282 |         y10 = svadd_f64_x(t64, y10, y11);
283 |         y20 = svadd_f64_x(t64, y20, y21);
284 |         y30 = svadd_f64_x(t64, y30, y31);
285 |         y00 = svmla_n_f64_x(pg0, svld1_vnum_f64(pg0, y + i, 0), y00, alpha);
286 |         y10 = svmla_n_f64_x(pg1, svld1_vnum_f64(pg1, y + i, 1), y10, alpha);
287 |         y20 = svmla_n_f64_x(pg2, svld1_vnum_f64(pg2, y + i, 2), y20, alpha);
288 |         y30 = svmla_n_f64_x(pg3, svld1_vnum_f64(pg3, y + i, 3), y30, alpha);
289 | 
290 |         svst1_vnum_f64(pg0, y + i, 0, y00);
291 |         svst1_vnum_f64(pg1, y + i, 1, y10);
292 |         svst1_vnum_f64(pg2, y + i, 2, y20);
293 |         svst1_vnum_f64(pg3, y + i, 3, y30);
294 |     }
295 | }
296 | 
297 | extern "C" void hmg_gemv_diag(int istart, int jstart, double a, double b,
298 |                               int mb, int nb, double alpha,
299 |                               double const *__restrict__ x,
300 |                               double *__restrict__ y) {
301 |     // same as above, but for the sub-matrix whic includes diagonals.
302 |     double ab = a * b;
303 |     for (int i = 0; i < mb; ++i) {
304 |         double d = 0.;
305 |         for (int j = 0; j < i; ++j) {
306 |             double aj = b + ab * (jstart + j);
307 |             d += aj * x[j];
308 |         }
309 |         d += (1. + ab * (istart + i)) * x[i];
310 |         double ai = a + ab * (istart + i);
311 |         for (int j = i + 1; j < nb; ++j) {
312 |             d += ai * x[j];
313 |         }
314 |         y[i] += alpha * d;
315 |     }
316 | }
317 | 


--------------------------------------------------------------------------------
/src/timer.hpp:
--------------------------------------------------------------------------------
  1 | #pragma once
  2 | #include <cstdint>
  3 | #include <cstdio>
  4 | 
  5 | // Default take statistics
  6 | // #define TIMER_VERBOSE // save all the timing for visualization
  7 | // #define TIMER_SILENT // Disable all timer
  8 | //
  9 | 
 10 | #ifdef TIMER_VERBOSE
 11 | #include <vector>
 12 | #define TIMER_NUM 2
 13 | #elif defined(TIMER_SILENT)
 14 | #define TIMER_NUM 0
 15 | #else
 16 | #define TIMER_NUM 1
 17 | #endif
 18 | 
 19 | extern "C" int MPI_Get_processor_name(char *, int *);
 20 | 
 21 | #ifdef __aarch64__
 22 | static int64_t get_utime() {
 23 |     uint64_t tsc;
 24 |     asm volatile("mrs %0, cntvct_el0" : "=r"(tsc));
 25 |     return tsc;
 26 | }
 27 | static double tick2second(uint64_t tick) {
 28 |     auto frequency = [] {
 29 |         uint64_t frq;
 30 |         asm volatile("mrs %0, cntfrq_el0" : "=r"(frq));
 31 |         return frq;
 32 |     };
 33 |     static double invfreq = 1.0 / frequency();
 34 |     return invfreq * (double)tick;
 35 | }
 36 | #else
 37 | #ifdef __APPLE__
 38 | #include <sys/time.h>
 39 | static int64_t get_utime() {
 40 |     timeval tv;
 41 |     gettimeofday(&tv, NULL);
 42 | 
 43 |     return tv.tv_usec * 1000ll + tv.tv_sec * 1000000000ll;
 44 | }
 45 | #elif defined __linux__
 46 | #include <time.h>
 47 | static int64_t get_utime() {
 48 |     timespec ts;
 49 |     clock_gettime(CLOCK_REALTIME, &ts);
 50 | 
 51 |     return ts.tv_nsec + ts.tv_sec * 1000000000ll;
 52 | }
 53 | #endif
 54 | static double tick2second(uint64_t tick) { return 1.e-9 * (double)tick; }
 55 | #endif
 56 | 
 57 | struct Timer_template_base {
 58 |     enum Items {
 59 |         DIAG_BCAST = 0,
 60 |         LCOL_BCAST,
 61 |         RROW_BCAST,
 62 |         TEST,
 63 |         WAIT,
 64 |         DIAG_LU,
 65 |         TRSM_L,
 66 |         TRSM_R,
 67 |         CONV_L,
 68 |         CONV_R,
 69 |         GEMM_UPDATE,
 70 |         GEMM_PROGRESS,
 71 |         LAZY_INIT,
 72 |         WRITE_BACK,
 73 |         IR_GEMV,
 74 |         IR_GEMV_COMM,
 75 |         IR_TRSV,
 76 |         IR_TRSV_MV,
 77 |         IR_TRSV_COMM,
 78 | 	// ORNL modification
 79 |         INIT,
 80 |         MEMCPY,
 81 |         // don't touch below
 82 |         TOTAL,
 83 |         MISC,
 84 |         NUM_ITEMS,
 85 |     };
 86 | };
 87 | template <int> struct Timer_template : Timer_template_base {
 88 |     // caleld fromo initialize()
 89 |     static void flush() {}
 90 |     // initialize timer with 0
 91 |     static void initialize() {}
 92 |     // start a region
 93 |     //   elem: which item to collect
 94 |     //   reuse: reuse last timing for reduce timer cost.
 95 |     static void beg(const Items /*elem*/, bool const /*reuse*/ = false) {}
 96 |     // stop a region
 97 |     //   elem: which item to collect
 98 |     //   reuse: reuse last timing for reduce timer cost.
 99 |     //   acc: number of operations in this region. ex. flops, mips, or, bytes
100 |     static void end(const Items /*elem*/, bool const /*reuse*/ = false,
101 |                     int64_t /*acc*/ = 0ll) {}
102 |     // save a timing
103 |     static double put(const Items /*elem*/, bool const /*reuse*/ = false) {
104 |         return 0.0;
105 |     }
106 |     // dump out all the data
107 |     static void show(FILE * /*fp*/ = stderr, char const * /*fmt*/ = "") {}
108 |     // open file and call show()
109 |     //    size: number of procs
110 |     //    rank: process id
111 |     //    row : row id
112 |     //    col : col id
113 |     //    filename : filename
114 |     static void dump_mp(const int /*size*/, const int /*rank*/,
115 |                         const int /*row*/, const int /*col*/,
116 |                         const char * /*filename*/ = "") {} // do nothing
117 | };
118 | 
119 | template <> struct Timer_template<1> : Timer_template_base {
120 |     static char const *name(int const i) {
121 |         static const char *strs[NUM_ITEMS] = {
122 |             "DIAG_BCAST",
123 |             "LCOL_BCAST",
124 |             "RROW_BCAST",
125 |             "TEST",
126 |             "WAIT",
127 |             "DIAG_LU",
128 |             "TRSM_L",
129 |             "TRSM_R",
130 |             "CONV_L",
131 |             "CONV_R",
132 |             "GEMM_UPDATE",
133 |             "GEMM_PROGRESS",
134 |             "LAZY_INIT",
135 |             "WRITE_BACK",
136 |             "IR_GEMV",
137 |             "IR_GEMV_COMM",
138 |             "IR_TRSV",
139 |             "IR_TRSV_MV",
140 |             "IR_TRSV_COMM",
141 |             // ORNL modification
142 |             "INIT",
143 |             "MEMCPY",
144 |             "TOTAL",
145 |             "MISC",
146 |         };
147 |         return strs[i];
148 |     }
149 | 
150 |     static void flush() {
151 |         for (int i = 0; i < NUM_ITEMS; i++) {
152 |             time(i) = 0ll;
153 |             accum(i) = 0ll;
154 |         }
155 |     }
156 | 
157 |     static void initialize() {
158 |         flush();
159 |         tprev(1) = get_utime();
160 |     }
161 | 
162 |     static void beg(const Items elem, bool const reuse = false) {
163 |         if (reuse)
164 |             time(elem) -= tprev();
165 |         else
166 |             time(elem) -= (tprev() = get_utime());
167 |     }
168 | 
169 |     static void end(const Items elem, bool const reuse = false) {
170 |         if (reuse)
171 |             time(elem) += tprev();
172 |         else
173 |             time(elem) += (tprev() = get_utime());
174 |     }
175 |     static void end(const Items elem, bool const reuse, int64_t acc) {
176 |         end(elem, reuse);
177 |         accum(elem) += acc;
178 |     }
179 | 
180 |     static double put(const Items /*elem*/, bool const reuse = false) {
181 |         uint64_t tt = reuse ? get_utime() : tprev();
182 |         return tick2second(tt - tprev(1));
183 |     }
184 | 
185 |     static void
186 |     show(FILE *fp = stderr,
187 |          const char *fmt = " %-12s : %e sec : %6.2f %% : %20lld : %e Gop/s\n")
188 | 
189 |     {
190 |         fflush(fp);
191 | 
192 |         time(MISC) = time(TOTAL);
193 | 
194 |         for (int i = 0; i < NUM_ITEMS - 2; i++) {
195 |             time(MISC) -= time(i);
196 |         }
197 | 
198 |         for (int i = 0; i < NUM_ITEMS; i++) {
199 |             fprintf(fp, fmt, name(i), rtime(i), 100.0 * time(i) / time(TOTAL),
200 |                     accum(i), 1e-9 * accum(i) / rtime(i));
201 |         }
202 |         const double flops =
203 |             (double)(accum(GEMM_UPDATE) + accum(GEMM_PROGRESS)) /
204 |             (rtime(GEMM_UPDATE) + rtime(GEMM_PROGRESS));
205 |         fprintf(fp, "GEMM_TOTAL: %f Tflops\n", 1.e-12 * flops);
206 | 
207 |         fflush(fp);
208 |     }
209 | 
210 |     static void dump_mp(const int size, const int rank, const int row,
211 |                         const int col,
212 |                         const char *fmt = "Timerdump.%04d.%04d") {
213 |         static char filename[1024];
214 |         sprintf(filename, fmt, size, rank);
215 |         FILE *fp = fopen(filename, "w");
216 |         if (fp) {
217 |             int len;
218 |             MPI_Get_processor_name(filename, &len);
219 |             fprintf(fp, "# row=%d, col=%d, host=%s\n", row, col, filename);
220 |             show(fp);
221 |             fclose(fp);
222 |         }
223 |     }
224 | 
225 | 	static double rtime(int const i) { return tick2second(time(i)); }
226 | 
227 |   private:
228 |     static int64_t &time(int const i) {
229 |         static int64_t buf[NUM_ITEMS];
230 |         return buf[i];
231 |     }
232 |     static int64_t &accum(int const i) {
233 |         static int64_t buf[NUM_ITEMS];
234 |         return buf[i];
235 |     }
236 |     static int64_t &tprev(int const ch = 0) {
237 |         static int64_t t[2]; /* 0 : previous time */
238 |                              /* 1 : initial time  */
239 |         return t[ch];
240 |     }
241 |     //static double rtime(int const i) { return tick2second(time(i)); }
242 | };
243 | 
244 | #ifdef TIMER_VERBOSE
245 | // pritout all the timings with binary format
246 | template <> struct Timer_template<2> : Timer_template_base {
247 |     static char const *name(int const i) {
248 |         static const char *strs[NUM_ITEMS] = {
249 |             "DIAG_BCAST",
250 |             "LCOL_BCAST",
251 |             "RROW_BCAST",
252 |             "TEST",
253 |             "WAIT",
254 |             "DIAG_LU",
255 |             "TRSM_L",
256 |             "TRSM_R",
257 |             "CONV_L",
258 |             "CONV_R",
259 |             "GEMM_UPDATE",
260 |             "GEMM_PROGRESS",
261 |             "LAZY_INIT",
262 |             "WRITE_BACK",
263 |             "IR_GEMV",
264 |             "IR_GEMV_COMM",
265 |             "IR_TRSV",
266 |             "IR_TRSV_MV",
267 |             "IR_TRSV_COMM",
268 | 	    // ORNL modification
269 |             "INIT",
270 |             "MEMCPY",
271 |             "TOTAL",
272 |             "MISC",
273 |         };
274 |         return strs[i];
275 |     }
276 | 
277 |     static void flush() {
278 |         for (int i = 0; i < NUM_ITEMS; i++) {
279 |             time(i) = 0ll;
280 |             accum(i) = 0ll;
281 |             tvec_beg(i).clear();
282 |             tvec_end(i).clear();
283 |             tvec_put(i).clear();
284 |             avec(i).clear();
285 |         }
286 |     }
287 | 
288 |     static void initialize() {
289 |         flush();
290 |         for (int i = 0; i < NUM_ITEMS; i++) {
291 |             tvec_beg(i).reserve(10000);
292 |             tvec_end(i).reserve(10000);
293 |             tvec_put(i).reserve(10000);
294 |             avec(i).reserve(10000);
295 |         }
296 |         tprev(1) = get_utime();
297 |     }
298 | 
299 |     static void beg(const Items elem, bool const reuse = false) {
300 |         // fprintf(stderr, "%s: DEBUG BEG %s\n", hostname(), name(elem));
301 |         // fflush(stderr);
302 |         if (reuse)
303 |             time(elem) -= tprev();
304 |         else
305 |             time(elem) -= (tprev() = get_utime());
306 | 
307 |         tvec_beg(elem).push_back(tprev());
308 |     }
309 | 
310 |     static void end(const Items elem, bool const reuse = false,
311 |                     int64_t acc = 0ll) {
312 |         // fprintf(stderr, "%s: DEBUG END %s\n", hostname(), name(elem));
313 |         // fflush(stderr);
314 |         if (reuse)
315 |             time(elem) += tprev();
316 |         else
317 |             time(elem) += (tprev() = get_utime());
318 |         accum(elem) += acc;
319 | 
320 |         tvec_end(elem).push_back(tprev());
321 |         avec(elem).push_back(acc);
322 |     }
323 | 
324 |     static double put(const Items elem, bool const reuse = false) {
325 |         uint64_t tt = reuse ? get_utime() : tprev();
326 |         tvec_put(elem).push_back(tt);
327 |         return tick2second(tt - tprev(1));
328 |     }
329 | 
330 |     static void
331 |     show(FILE *fp = stderr,
332 |          const char *fmt = " %-12s : %e sec : %6.2f %% : %20ld : %e Gop/s\n",
333 |          FILE *fp2 = nullptr) {
334 |         fflush(fp);
335 | 
336 |         time(MISC) = time(TOTAL);
337 | 
338 |         for (int i = 0; i < NUM_ITEMS - 2; i++) {
339 |             time(MISC) -= time(i);
340 |         }
341 | 
342 |         for (int i = 0; i < NUM_ITEMS; i++) {
343 |             fprintf(fp, fmt, name(i), rtime(i), 100.0 * time(i) / time(TOTAL),
344 |                     accum(i), 1e-9 * accum(i) / rtime(i));
345 |         }
346 |         const double flops =
347 |             (double)(accum(GEMM_UPDATE) + accum(GEMM_PROGRESS)) /
348 |             (rtime(GEMM_UPDATE) + rtime(GEMM_PROGRESS));
349 |         fprintf(fp, "GEMM_TOTAL: %f Tflops\n", 1.e-12 * flops);
350 | 
351 |         fflush(fp);
352 | 
353 |         // dump event vectors
354 |         if (!fp2)
355 |             fp2 = fp;
356 |         for (int i = 0; i < NUM_ITEMS; i++) {
357 |             dump_vector(fp2, tvec_beg(i), "BEG_", name(i));
358 |             dump_vector(fp2, tvec_end(i), "END_", name(i));
359 |             dump_vector(fp2, tvec_put(i), "PUT_", name(i));
360 |             dump_accum(fp2, avec(i), "ACC_", name(i));
361 |         }
362 |         fflush(fp);
363 |     }
364 | 
365 |     static void dump_mp(const int /*size*/, const int rank, const int row,
366 |                         const int col, const char *filename) {
367 |         fprintf(stderr, "%d: (%d, %d)\n", rank, row, col);
368 |         FILE *fp = fopen(filename, "w");
369 |         if (fp) {
370 |             int len;
371 |             char hostname[1024];
372 |             MPI_Get_processor_name(hostname, &len);
373 |             fprintf(fp, "# row=%d, col=%d, host=%s\n", row, col, hostname);
374 |             show(fp);
375 |             fclose(fp);
376 |         }
377 |     }
378 | 
379 |     static const char *hostname() {
380 |         static char name[1024] = {
381 |             0,
382 |         };
383 |         if (!name[0]) {
384 |             int len;
385 |             MPI_Get_processor_name(name, &len);
386 |         }
387 |         return name;
388 |     }
389 | 
390 |   private:
391 |     static int64_t &time(int const i) {
392 |         static int64_t buf[NUM_ITEMS];
393 |         return buf[i];
394 |     }
395 |     static int64_t &accum(int const i) {
396 |         static int64_t buf[NUM_ITEMS];
397 |         return buf[i];
398 |     }
399 | 
400 |     using tvec = std::vector<int64_t>;
401 |     static tvec &tvec_beg(int const i) {
402 |         static tvec buf[NUM_ITEMS];
403 |         return buf[i];
404 |     }
405 |     static tvec &tvec_end(int const i) {
406 |         static tvec buf[NUM_ITEMS];
407 |         return buf[i];
408 |     }
409 |     static tvec &tvec_put(int const i) {
410 |         static tvec buf[NUM_ITEMS];
411 |         return buf[i];
412 |     }
413 |     static tvec &avec(int const i) {
414 |         static tvec buf[NUM_ITEMS];
415 |         return buf[i];
416 |     }
417 | 
418 |     static void dump_vector(FILE *fp, const tvec &v, const char *s0,
419 |                             const char *s1) {
420 |         const int n = v.size();
421 |         if (fp != stderr) {
422 |             fprintf(fp, "bio, %d, %s%s\n", n, s0, s1);
423 |             for (int i = 0; i < n; i++) {
424 |                 unsigned long utime = v[i];
425 |                 double dtime = tick2second(utime - tprev(1));
426 |                 fwrite(&dtime, sizeof(double), 1, fp);
427 |             }
428 |         } else {
429 |             for (int i = 0; i < n; i++) {
430 |                 unsigned long utime = v[i];
431 |                 double dtime = tick2second(utime - tprev(1));
432 | 
433 |                 fprintf(fp, "%ld, %16.12f, %s%s, %d\n", utime, dtime, s0, s1,
434 |                         i);
435 |             }
436 |         }
437 |     }
438 | 
439 |     static void dump_accum(FILE *fp, const tvec &v, const char *s0,
440 |                            const char *s1) {
441 |         const int n = v.size();
442 |         if (fp != stderr) {
443 |             fprintf(fp, "bio, %d, %s%s\n", n, s0, s1);
444 |             fwrite(v.data(), sizeof(int64_t), n, fp);
445 |         } else {
446 |             for (int i = 0; i < n; i++) {
447 |                 fprintf(fp, "%ld, 0.0, %s%s, %d\n", v[i], s0, s1, i);
448 |             }
449 |         }
450 |     }
451 | 
452 |     static int64_t &tprev(int const ch = 0) {
453 |         static int64_t t[2];
454 |         return t[ch];
455 |     }
456 |     static double rtime(int const i) { return tick2second(time(i)); }
457 | };
458 | #endif
459 | 
460 | using Timer = Timer_template<TIMER_NUM>;
461 | 


--------------------------------------------------------------------------------
/src/fp16_gpu_kernels.cpp:
--------------------------------------------------------------------------------
  1 | #include "fp16_gpu_kernels.h"
  2 | 
  3 | using namespace std;
  4 | 
  5 | #define TILE_DIM 64
  6 | #define TILE_DIM_TRANS 64
  7 | #define TILE_DIM_ID 64
  8 | #define BLOCK_ROWS 4
  9 | 
 10 | // LEFT CONVERT KERNELS
 11 | __global__ void copyCoalesced(__half *odata, const float *idata, int olda,
 12 |                               int ilda) {
 13 |     /*__shared__ float tile[TILE_DIM][TILE_DIM];
 14 | 
 15 |     int x = GPU_BLOCKIDX_X * TILE_DIM + GPU_THREADIDX_X;
 16 |     int y = GPU_BLOCKIDX_Y * TILE_DIM + GPU_THREADIDX_Y;
 17 |     // int width = gridDim.x * TILE_DIM;
 18 | 
 19 |     for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS)
 20 |         tile[(GPU_THREADIDX_Y + j)][GPU_THREADIDX_X] =
 21 |             idata[(long(y) +long( j)) * long(ilda) +long( x)];
 22 | 
 23 |     __syncthreads();
 24 | 
 25 |     for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS)
 26 |         odata[(long(y) +long( j)) *long( olda) +long( x)] =
 27 |             __float2half(tile[(GPU_THREADIDX_Y + j)][GPU_THREADIDX_X]);*/
 28 | 
 29 |     int x = GPU_BLOCKIDX_X * TILE_DIM + GPU_THREADIDX_X;
 30 |     int y = GPU_BLOCKIDX_Y * TILE_DIM + GPU_THREADIDX_Y;
 31 |     // int width = gridDim.x * TILE_DIM;
 32 | 
 33 |     for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS)
 34 |         odata[(long(y) +long( j)) *long( olda) +long( x)] =
 35 |             __float2half(idata[(long(y) +long( j)) * long(ilda) +long( x)]);
 36 | }
 37 | 
 38 | /// HUGE ASSUMTION OF ilda = olda
 39 | __global__ void copyCoalesced_lower(__half *odata, const float *idata, int olda,
 40 |                               int ilda) {
 41 |     if(GPU_BLOCKIDX_Y > GPU_BLOCKIDX_X){ return;};
 42 |     if(GPU_BLOCKIDX_X == GPU_BLOCKIDX_Y)
 43 |     {
 44 |         int x = GPU_BLOCKIDX_X * TILE_DIM + GPU_THREADIDX_X;
 45 |         int y = GPU_BLOCKIDX_Y * TILE_DIM + GPU_THREADIDX_Y;
 46 |         // int width = gridDim.x * TILE_DIM;
 47 | 
 48 |         for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS)
 49 |         {
 50 |             if(x > y+j)
 51 |                 odata[(y+j) * olda + x] =
 52 |                     __float2half(idata[(y+j) * olda + x]);
 53 |         }
 54 |         return;
 55 |     }
 56 |     
 57 |     if(GPU_BLOCKIDX_X > GPU_BLOCKIDX_Y)
 58 |     {
 59 |         int x = GPU_BLOCKIDX_X * TILE_DIM + GPU_THREADIDX_X;
 60 |         int y = GPU_BLOCKIDX_Y * TILE_DIM + GPU_THREADIDX_Y;
 61 |         for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS)
 62 |             odata[(y+j) * olda + x] =
 63 |                 __float2half(idata[(y+j) * olda + x]);
 64 |         return;
 65 |     }
 66 | }
 67 | 
 68 | /// HUGE ASSUMTION OF ilda = olda
 69 | __global__ void copyCoalesced_upper(__half *odata, const float *idata, int olda,
 70 |                               int ilda) {
 71 |     if(GPU_BLOCKIDX_Y < GPU_BLOCKIDX_X){ return;};
 72 |     if(GPU_BLOCKIDX_X == GPU_BLOCKIDX_Y)
 73 |     {
 74 |         int x = GPU_BLOCKIDX_X * TILE_DIM + GPU_THREADIDX_X;
 75 |         int y = GPU_BLOCKIDX_Y * TILE_DIM + GPU_THREADIDX_Y;
 76 |         for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS)
 77 |         {
 78 |             if(x < y+j+1)
 79 |                 odata[(y+j) * olda + x] =
 80 |                     __float2half(idata[(y+j) * olda + x]);
 81 |         }
 82 |         return;
 83 |     }
 84 |     
 85 |     if(GPU_BLOCKIDX_X < GPU_BLOCKIDX_Y)
 86 |     {
 87 |         int x = GPU_BLOCKIDX_X * TILE_DIM + GPU_THREADIDX_X;
 88 |         int y = GPU_BLOCKIDX_Y * TILE_DIM + GPU_THREADIDX_Y;
 89 |         for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS)
 90 |             odata[(y+j) * olda + x] =
 91 |                 __float2half(idata[(y+j) * olda + x]);
 92 |         return;
 93 |     }
 94 | }
 95 | 
 96 | /////// UPCAST FOR DEBUG
 97 | /// HUGE ASSUMTION OF ilda = olda
 98 | __global__ void copyCoalesced_lower(float *odata, const __half *idata, int olda,
 99 |                               int ilda) {
100 |     if(GPU_BLOCKIDX_Y > GPU_BLOCKIDX_X){ return;};
101 |     if(GPU_BLOCKIDX_X == GPU_BLOCKIDX_Y)
102 |     {
103 |         int x = GPU_BLOCKIDX_X * TILE_DIM + GPU_THREADIDX_X;
104 |         int y = GPU_BLOCKIDX_Y * TILE_DIM + GPU_THREADIDX_Y;
105 |         // int width = gridDim.x * TILE_DIM;
106 | 
107 |         for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS)
108 |         {
109 |             if(x > y+j)
110 |                 odata[(y+j) * olda + x] =
111 |                     __half2float(idata[(y+j) * olda + x]);
112 |         }
113 |         return;
114 |     }
115 |     
116 |     if(GPU_BLOCKIDX_X > GPU_BLOCKIDX_Y)
117 |     {
118 |         int x = GPU_BLOCKIDX_X * TILE_DIM + GPU_THREADIDX_X;
119 |         int y = GPU_BLOCKIDX_Y * TILE_DIM + GPU_THREADIDX_Y;
120 |         for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS)
121 |             odata[(y+j) * olda + x] =
122 |                 __half2float(idata[(y+j) * olda + x]);
123 |         return;
124 |     }
125 | }
126 | 
127 | /// HUGE ASSUMTION OF ilda = olda
128 | __global__ void copyCoalesced_upper(float *odata, const __half *idata, int olda,
129 |                               int ilda) {
130 |   
131 |     if(GPU_BLOCKIDX_Y < GPU_BLOCKIDX_X){ return;};
132 |     if(GPU_BLOCKIDX_X == GPU_BLOCKIDX_Y)
133 |     {
134 |         int x = GPU_BLOCKIDX_X * TILE_DIM + GPU_THREADIDX_X;
135 |         int y = GPU_BLOCKIDX_Y * TILE_DIM + GPU_THREADIDX_Y;
136 |         for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS)
137 |         {
138 |             if(x <= y+j)
139 |                 odata[(y+j) * olda + x] =
140 |                     __half2float(idata[(y+j) * olda + x]);
141 |         }
142 |         return;
143 |     }
144 |     
145 |     if(GPU_BLOCKIDX_X < GPU_BLOCKIDX_Y)
146 |     {
147 |         int x = GPU_BLOCKIDX_X * TILE_DIM + GPU_THREADIDX_X;
148 |         int y = GPU_BLOCKIDX_Y * TILE_DIM + GPU_THREADIDX_Y;
149 |         for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS)
150 |             odata[(y+j) * olda + x] =
151 |                 __half2float(idata[(y+j) * olda + x]);
152 |         return;
153 |     }
154 | }
155 | 
156 | 
157 | // RIGHT TRANSPOSE CONVERT KERNELS
158 | __global__ void transposeCoalesced(__half *odata, const float *idata, int olda,
159 |                                    int ilda) {
160 |     __shared__ float tile[TILE_DIM_TRANS][TILE_DIM_TRANS+1];
161 | 
162 |     int x = GPU_BLOCKIDX_X * TILE_DIM_TRANS + GPU_THREADIDX_X;
163 |     int y = GPU_BLOCKIDX_Y * TILE_DIM_TRANS + GPU_THREADIDX_Y;
164 |     for (int j = 0; j < TILE_DIM_TRANS; j += BLOCK_ROWS)
165 |         tile[GPU_THREADIDX_Y + j][GPU_THREADIDX_X] = 
166 |             idata[(long(y) +long( j)) * long(ilda) +long( x)];
167 | 
168 |     __syncthreads();
169 | 
170 |     x = GPU_BLOCKIDX_Y * TILE_DIM_TRANS + GPU_THREADIDX_X; // transpose block offset
171 |     y = GPU_BLOCKIDX_X * TILE_DIM_TRANS + GPU_THREADIDX_Y;
172 | 
173 |     for (int j = 0; j < TILE_DIM_TRANS; j += BLOCK_ROWS)
174 |         odata[(long(y) + long(j)) *long( olda) +long( x)] =
175 |             __float2half(tile[GPU_THREADIDX_X][GPU_THREADIDX_Y + j]);
176 | }
177 | 
178 | 
179 | // HOST CALLS TO LAUNCH KERNELS (LEFT)
180 | __host__ void half_conversion_left( const float *C, int b, int plda, __half *lp, int lplda) {
181 | 	if(lplda == 0) return;
182 |     dim3 block_dims(lplda / TILE_DIM, b / TILE_DIM, 1);
183 |     dim3 thread_dims(TILE_DIM, BLOCK_ROWS, 1);
184 |     copyCoalesced<<<block_dims, thread_dims>>>(lp, C, lplda, plda);
185 | /*#elif defined(ROCM_OLCF_PLATFORM)
186 |     hipLaunchKernelGGL(copyCoalesced, block_dims, thread_dims, 0, 0,
187 |             lp, C, lplda, plda);
188 | #else
189 |     throw std::runtime_error("Error with build platform, neither CUDA nor ROCM. See CMake output.")
190 | #endif*/
191 | 
192 | }
193 | 
194 | // HOST CALLS TO LAUNCH KERNELS (RIGHT TRANS)
195 | 
196 | __host__ void half_conversion_right_trans(const float *C, int b, int plda, __half * rp, int rplda) {
197 |     //printf("rplda:%d\n",rplda);
198 | 	if(rplda == 0) return;
199 |     dim3 block_dims(b / TILE_DIM_TRANS, rplda / TILE_DIM_TRANS, 1);
200 |     dim3 thread_dims(TILE_DIM_TRANS, BLOCK_ROWS, 1);
201 | 
202 | //#ifdef CUDA_OLCF_PLATFORM
203 |     transposeCoalesced<<<block_dims, thread_dims>>>(rp, C, rplda, plda);
204 | /*#elif defined(ROCM_OLCF_PLATFORM)
205 |     hipLaunchKernelGGL(transposeCoalesced, block_dims, thread_dims, 0, 0,
206 |             rp, C, rplda, plda);
207 | #else
208 |     throw std::runtime_error("Error with build platform, neither CUDA nor ROCM. See CMake output.")
209 | #endif*/
210 | 
211 | }
212 | 
213 | 
214 | // Copy cast kernel
215 | __global__ void copyFtoH(__half *odata, const float *idata, long olda,
216 |                               long ilda) {
217 |    /* __shared__ float tile[TILE_DIM][TILE_DIM];
218 | 
219 |     // int width = gridDim.x * TILE_DIM;
220 |     int x = GPU_BLOCKIDX_X * TILE_DIM + GPU_THREADIDX_X;
221 |     int y = GPU_BLOCKIDX_Y * TILE_DIM + GPU_THREADIDX_Y;
222 |  
223 |     for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS)
224 |         tile[(GPU_THREADIDX_Y + j)][GPU_THREADIDX_X] =
225 |             idata[(long(y) +long( j)) * long(ilda) +long( x)];
226 | 
227 |     __syncthreads();
228 | 
229 |     for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS)
230 |         odata[(long(y) +long( j)) *long( olda) +long( x)] =
231 |             __float2half(tile[(GPU_THREADIDX_Y + j)][GPU_THREADIDX_X]);
232 |     */
233 |     int x = GPU_BLOCKIDX_X * TILE_DIM + GPU_THREADIDX_X;
234 |     int y = GPU_BLOCKIDX_Y * TILE_DIM + GPU_THREADIDX_Y;
235 |  
236 |     for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS)
237 |         odata[(long(y) +long( j)) *long( olda) +long( x)] =
238 |             idata[(long(y) +long( j)) * long(ilda) +long( x)];
239 | }
240 | 
241 | __host__ void downCast_copy_general(__half* out, long olda, long nrow, long ncol, float* C, long ilda)
242 | {
243 |         dim3 dimGrid(nrow/TILE_DIM,ncol/TILE_DIM, 1);
244 |         dim3 dimBlock(TILE_DIM, BLOCK_ROWS, 1);
245 | //#ifdef CUDA_OLCF_PLATFORM
246 |         copyFtoH<<<dimGrid, dimBlock>>>(out, C, olda, ilda);
247 | /*#elif defined(ROCM_OLCF_PLATFORM)
248 |     hipLaunchKernelGGL(copyFtoH, dimGrid, dimBlock,0,0,
249 |                                                  out, C, olda, ilda);
250 | #else
251 |     throw std::runtime_error("Error with build platform, neither CUDA nor ROCM. See CMake output.")
252 | #endif
253 | */        //GPU_DEVICE_SYNCHRONIZE();
254 | }
255 | 
256 | __host__ void downCast_copy_lower(__half* out, long olda, long nrow, long ncol, float* C, long ilda)
257 | {
258 |         dim3 dimGrid(nrow/TILE_DIM,ncol/TILE_DIM, 1);
259 |         dim3 dimBlock(TILE_DIM, BLOCK_ROWS, 1);
260 |         copyCoalesced_lower<<<dimGrid, dimBlock>>>(out, C, olda, ilda);
261 |         //GPU_DEVICE_SYNCHRONIZE();
262 | }
263 | 
264 | __host__ void downCast_copy_upper(__half* out, long olda, long nrow, long ncol, float* C, long ilda)
265 | {
266 |         dim3 dimGrid(nrow/TILE_DIM,ncol/TILE_DIM, 1);
267 |         dim3 dimBlock(TILE_DIM, BLOCK_ROWS, 1);
268 |         copyCoalesced_upper<<<dimGrid, dimBlock>>>(out, C, olda, ilda);
269 |         //GPU_DEVICE_SYNCHRONIZE();
270 | }
271 | 
272 | // Debug purpose
273 | __host__ void upCast_copy_lower(float* out, long olda, long nrow, long ncol, __half * C, long ilda)
274 | {
275 |         dim3 dimGrid(nrow/TILE_DIM,ncol/TILE_DIM, 1);
276 |         dim3 dimBlock(TILE_DIM, BLOCK_ROWS, 1);
277 |         copyCoalesced_lower<<<dimGrid, dimBlock>>>(out, C, olda, ilda);
278 |         //GPU_DEVICE_SYNCHRONIZE();
279 | }
280 | 
281 | __host__ void upCast_copy_upper(float* out, long olda, long nrow, long ncol, __half * C, long ilda)
282 | {
283 |         dim3 dimGrid(nrow/TILE_DIM,ncol/TILE_DIM, 1);
284 |         dim3 dimBlock(TILE_DIM, BLOCK_ROWS, 1);
285 |         copyCoalesced_upper<<<dimGrid, dimBlock>>>(out, C, olda, ilda);
286 |         //GPU_DEVICE_SYNCHRONIZE();
287 | }
288 | 
289 | __host__ void downCast_trans_general(__half* out, long olda, long nrow, long ncol , float* C, long ilda)
290 | {
291 |         // rplda should be passed in as npcol-start_col
292 |         dim3 dimGrid(nrow/TILE_DIM_TRANS, ncol/TILE_DIM_TRANS, 1);
293 |         dim3 dimBlock(TILE_DIM_TRANS, BLOCK_ROWS, 1);
294 | //#ifdef CUDA_OLCF_PLATFORM
295 |         transposeCoalesced<<<dimGrid, dimBlock>>>(out, C, olda, ilda);
296 | /*#elif defined(ROCM_OLCF_PLATFORM)
297 |     hipLaunchKernelGGL(transposeCoalesced, dimGrid, dimBlock,0,0,
298 |                                                  out, C, olda, ilda);
299 | #else
300 |     throw std::runtime_error("Error with build platform, neither CUDA nor ROCM. See CMake output.")
301 | #endif*/
302 |         //GPU_DEVICE_SYNCHRONIZE();
303 | }
304 | 
305 | __host__ void gen_identity_mat( float * out, long nrow, long ncol)
306 | {
307 |         // ASSUMING IT B has to be divisible by TILE_DIM
308 |         dim3 dimGrid(nrow/TILE_DIM_ID, ncol/TILE_DIM_ID,1);
309 |         dim3 dimBlock(TILE_DIM_ID, BLOCK_ROWS,1);
310 | 
311 | #ifdef CUDA_OLCF_PLATFORM
312 |         gen_identity_mat_kernel<<<dimGrid, dimBlock>>>(out, nrow, ncol);
313 | #elif defined(ROCM_OLCF_PLATFORM)
314 |     hipLaunchKernelGGL(gen_identity_mat_kernel, dimGrid, dimBlock,0,0,
315 |                                                  out, nrow, ncol);
316 | #else
317 |     throw std::runtime_error("Error with build platform, neither CUDA nor ROCM. See CMake output.")
318 | #endif
319 |         GPU_DEVICE_SYNCHRONIZE();
320 | }
321 | 
322 | __global__ void gen_identity_mat_kernel( float * out, long nrow, long ncol) {
323 |         int x = GPU_BLOCKIDX_X * TILE_DIM_ID + GPU_THREADIDX_X;
324 |         int y = GPU_BLOCKIDX_Y * TILE_DIM_ID + GPU_THREADIDX_Y;
325 |         float a = 1.0;
326 |         float b = 0.0;
327 |         for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS)
328 |         {
329 |             //y += j;
330 |             if(x == y+j){
331 |                 out[long(y+j)*long(nrow) + long( x)] = a;
332 |             }
333 |             else{
334 |                 out[long(y+j)*long(nrow) + long( x)] = b;
335 |             }
336 |         }
337 | }
338 | 
339 | __global__ void gen_identity_mat_kernel( __half * out, long nrow, long ncol) {
340 |         int x = GPU_BLOCKIDX_X * TILE_DIM_ID + GPU_THREADIDX_X;
341 |         int y = GPU_BLOCKIDX_Y * TILE_DIM_ID + GPU_THREADIDX_Y;
342 |         __half a = 1.0;
343 |         __half b = 0.0;
344 |         for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS)
345 |         {
346 |             //y += j;
347 |             if(x == y+j){
348 |                 out[long(y+j)*long(nrow) + long( x)] = a;
349 |             }
350 |             else{
351 |                 out[long(y+j)*long(nrow) + long( x)] = b;
352 |             }
353 |         }
354 | }
355 | 
356 | __host__ void gen_identity_mat( __half * out, long nrow, long ncol)
357 | {
358 |         // ASSUMING IT B has to be divisible by TILE_DIM
359 |         dim3 dimGrid(nrow/TILE_DIM_ID, ncol/TILE_DIM_ID,1);
360 |         dim3 dimBlock(TILE_DIM_ID, BLOCK_ROWS,1);
361 | 
362 | #ifdef CUDA_OLCF_PLATFORM
363 |         gen_identity_mat_kernel<<<dimGrid, dimBlock>>>(out, nrow, ncol);
364 | #elif defined(ROCM_OLCF_PLATFORM)
365 |     hipLaunchKernelGGL(gen_identity_mat_kernel, dimGrid, dimBlock,0,0,
366 |                                                  out, nrow, ncol);
367 | #else
368 |     throw std::runtime_error("Error with build platform, neither CUDA nor ROCM. See CMake output.")
369 | #endif
370 |         GPU_DEVICE_SYNCHRONIZE();
371 | 
372 | }
373 | 
374 | 
375 | __host__ void downCast_trans_general(__half* out, int olda, int nrow, int ncol , float* C, int ilda)
376 | {
377 |   // rplda should be passed in as npcol-start_col
378 |   
379 |   dim3 dimGrid(nrow/TILE_DIM_TRANS, ncol/TILE_DIM_TRANS, 1); 
380 |   dim3 dimBlock(TILE_DIM_TRANS, BLOCK_ROWS, 1);
381 | #ifdef CUDA_OLCF_PLATFORM
382 |   transposeCoalesced<<<dimGrid, dimBlock>>>(out, C, (long)olda, (long)ilda);
383 | #endif
384 |  // GPU_DEVICE_SYNCHRONIZE();
385 | }
386 | 
387 | 


--------------------------------------------------------------------------------
/src/matgen.hpp:
--------------------------------------------------------------------------------
  1 | #ifndef MATGEN_HPP
  2 | #define MATGEN_HPP
  3 | // functions to initialize matrix.
  4 | #include "hpl_rand.hpp"
  5 | #include "panel.hpp"
  6 | 
  7 | #define MATGEM_THREADS 512
  8 | template <typename T>
  9 | __global__ void fill_panel_diag_dev(int n, int b, T * __restrict__ A, int lda, RandStat stat_ij, RandCoeff incl1,
 10 |         RandCoeff jump_thread, RandStat stat_00, int row_start, double* Diag)
 11 | {
 12 |     int cur_row = blockIdx.x;
 13 |     int id = threadIdx.x;
 14 |     __shared__ double sh[MATGEM_THREADS];
 15 | 
 16 |     if (cur_row < b) {
 17 |         RandCoeff jump_id = incl1.pow(id * n + cur_row);
 18 |         RandStat stat_tj = jump_id*stat_ij;
 19 |         
 20 |         for (int j = 0; j + id < b; j += blockDim.x) {
 21 |             A[(j + id) * lda + cur_row] = static_cast<T>(stat_tj);
 22 |             stat_tj = jump_thread * stat_tj;
 23 |         }
 24 |     
 25 |         // diagonal 
 26 |         double t = 0;
 27 |         jump_id = incl1.pow(id * n + cur_row + row_start); //Jump from 0 column
 28 |         stat_tj = jump_id*stat_00; //Stat_i_id
 29 | 
 30 |         for (int j = 0; j + id < n; j += blockDim.x) {
 31 |             if (j + id != cur_row + row_start) {
 32 |                 t += fabs(static_cast<double>(stat_tj));
 33 |             }
 34 |             stat_tj = jump_thread * stat_tj;
 35 |         }
 36 |         sh[id] = t;
 37 |         __syncthreads();
 38 | 
 39 |         if (id < 256) sh[id] += sh[id + 256];
 40 |         __syncthreads();
 41 | 
 42 |         if (id < 128) sh[id] += sh[id + 128];
 43 |         __syncthreads();
 44 |         if (id < 64) sh[id] += sh[id + 64];
 45 |         __syncthreads();
 46 | 
 47 |         if (id < 32) sh[id] += sh[id + 32];
 48 |         __syncthreads();
 49 | 
 50 |         if (id < 16) sh[id] += sh[id + 16];
 51 |         __syncthreads();
 52 | 
 53 |         if (id < 8) sh[id] += sh[id + 8];
 54 |         __syncthreads();
 55 | 
 56 |         if (id < 4) sh[id] += sh[id + 4];
 57 |         __syncthreads();
 58 | 
 59 |         // diagonal 
 60 |         if (id == 0) {
 61 |             Diag[cur_row] = sh[0] + sh[1] + sh[2] + sh[3];
 62 |             A[cur_row * lda + cur_row] = Diag[cur_row];
 63 |         }
 64 |     }
 65 | }
 66 | 
 67 | template <typename T>
 68 | __global__ void fill_panel_dev(int n, int b, T * __restrict__ A,
 69 |         int lda, RandStat stat_ij, RandCoeff incl1, RandCoeff jump_thread)
 70 | {
 71 |     int cur_row = blockIdx.x;
 72 |     int id = threadIdx.x;
 73 | 
 74 |     if (cur_row < b) {
 75 |         RandCoeff jump_id = incl1.pow(id * n + cur_row);
 76 |         RandStat stat_tj = jump_id*stat_ij;
 77 |         for (int j = 0; j + id < b; j += blockDim.x) {
 78 |             A[(j + id) * lda + cur_row] = static_cast<T>(static_cast<double>(stat_tj));
 79 |             stat_tj = jump_thread * stat_tj;
 80 |         }
 81 |     }
 82 | }
 83 | 
 84 | template <typename F>
 85 | void panel_matgen_dev(Matgen<F> const& mg, Panels<F>& p, double* Diag)
 86 | {
 87 |     int const n = mg.n;
 88 |     int const b = p.b;
 89 |     int const i1 = p.i1;
 90 |     int const j1 = p.j1;
 91 |     int const istride = p.istride;
 92 |     int const jstride = p.jstride;
 93 |     int const nprow = p.nprow;
 94 |     int const npcol = p.npcol;
 95 |     size_t const lda = p.lda;
 96 |     RandCoeff incl1 = mg.incl1;
 97 |     RandStat stat_00 = RandStat::initialize(mg.seed);
 98 |     RandCoeff jump_thread = incl1.pow(MATGEM_THREADS * n);
 99 | 
100 |     for (int pj = 0; pj < npcol; ++pj) {
101 |         int j0 = j1 + pj * jstride;
102 | 
103 |         for (int pi = 0; pi < nprow; ++pi) {
104 |             int i0 = i1 + pi * istride;
105 |             RandCoeff jump_ij = mg.jump(b * static_cast<uint64_t>(i0), b * static_cast<uint64_t>(j0));
106 |             //RandCoeff jump_ij = inc1.pow(b*i0 + n * static_cast<uint64_t>(j0));
107 |             RandStat stat_ij = jump_ij * stat_00;
108 | 
109 |             if (i0 != j0) {
110 |                 fill_panel_dev<<<b, MATGEM_THREADS, 0>>>(n, b, p(pi, pj, 'd'), lda, stat_ij, incl1, jump_thread);
111 |             }
112 |             else {
113 |                 fill_panel_diag_dev<<<b, MATGEM_THREADS, 0>>>(n, b, p(pi, pj, 'd'), lda, stat_ij, incl1, jump_thread, stat_00, b*i0, &Diag[pi*b]);
114 |             }
115 |         }
116 |     }
117 | }
118 | 
119 | template <typename F> void pmatgen2(Matgen<F> const &mg, Panels<F> &p, double* localSum) {
120 |     int const bs = p.b;
121 |     size_t const lda = p.lda;
122 |     int const nprow = p.nprow;
123 |     int const npcol = p.npcol;
124 | 
125 |     for (int pj = 0; pj < npcol; ++pj)
126 | #pragma omp parallel for
127 |         for (int pi = 0; pi < nprow; ++pi) {
128 |             F *pp = p(pi, pj);
129 |             const int i = bs * (p.i1 + pi * p.istride);
130 |             const int j = bs * (p.j1 + pj * p.jstride);
131 | 
132 |             fill_one_panel_with_rand2(mg.n, i, j, bs, bs, pp, lda, mg.seed,localSum);
133 |         }
134 | 
135 |     MPI_Barrier( MPI_COMM_WORLD );
136 | 
137 | }
138 | 
139 | 
140 | template <typename F> void pmatgen(Matgen<F> const &mg, Panels<F> &p) {
141 |     int const bs = p.b;
142 |     size_t const lda = p.lda;
143 |     int const nprow = p.nprow;
144 |     int const npcol = p.npcol;
145 | 
146 |     for (int pj = 0; pj < npcol; ++pj)
147 |         for (int pi = 0; pi < nprow; ++pi) {
148 |             F *pp = p(pi, pj);
149 |             const int i = bs * (p.i1 + pi * p.istride);
150 |             const int j = bs * (p.j1 + pj * p.jstride);
151 | 
152 |             fill_one_panel_with_rand(mg.n, i, j, bs, bs, pp, lda, mg.seed,
153 |                                      true);
154 |         }
155 | }
156 | 
157 | template <typename F> void pmatgen(HMGen<F> const &mg, Panels<F> &p) {
158 |     size_t const lda = p.lda;
159 |     int const nprow = p.nprow;
160 |     int const npcol = p.npcol;
161 |     int const b = p.b;
162 |     int const i1 = p.i1;
163 |     int const j1 = p.j1;
164 |     int const istride = p.istride;
165 |     int const jstride = p.jstride;
166 |     F const alpha = mg.alpha;
167 |     F const beta = mg.beta;
168 |     F const ab = alpha * beta;
169 |     F const done = 1;
170 | 
171 | #pragma omp parallel for collapse(2)
172 |     for (int pj = 0; pj < npcol; ++pj) {
173 |         for (int j = 0; j < b; ++j) {
174 |             int jstart = b * (j1 + pj * jstride);
175 |             F const fpjj = jstart + j;
176 |             for (int pi = 0; pi < nprow; ++pi) {
177 |                 int istart = b * (i1 + pi * istride);
178 |                 F *to = p(pi, pj);
179 |                 if (pi < pj) {
180 |                     for (int i = 0; i < b; ++i) {
181 |                         // assuming no diag.
182 |                         F aij = beta + ab * (istart + i);
183 |                         to[j * lda + i] = aij;
184 |                     }
185 |                 } else if (pi > pj) {
186 |                     for (int i = 0; i < b; ++i) {
187 |                         // assuming no diag.
188 |                         F aij = alpha + ab * fpjj;
189 |                         to[j * lda + i] = aij;
190 |                     }
191 |                 } else {
192 |                     for (int i = 0; i < j; ++i) {
193 |                         // assuming no diag.
194 |                         F aij = beta + ab * (jstart + i);
195 |                         to[j * lda + i] = aij;
196 |                     }
197 |                     F aij = done + ab * fpjj;
198 |                     to[j * lda + j] = aij;
199 |                     for (int i = j + 1; i < b; ++i) {
200 |                         // assuming no diag.
201 |                         F aij = alpha + ab * fpjj;
202 |                         to[j * lda + i] = aij;
203 |                     }
204 |                 }
205 |             }
206 |         }
207 |     }
208 | }
209 | 
210 | template <typename F> void pmatgen0(Panels<F> &p) {
211 |     // initialize with zero
212 |     int const bs = p.b;
213 |     size_t const lda = p.lda;
214 |     int const nprow = p.nprow;
215 |     int const npcol = p.npcol;
216 |     F const dzero = static_cast<F>(0);
217 | 
218 |     if (p.is_tile) {
219 | #pragma omp parallel for collapse(2)
220 |         for (int pj = 0; pj < npcol; ++pj)
221 |             for (int pi = 0; pi < nprow; ++pi) {
222 |                 F *pp = p(pi, pj);
223 |                 for (int j = 0; j < bs; ++j)
224 |                     for (int i = 0; i < bs; ++i)
225 |                         pp[j * lda + i] = dzero;
226 |             }
227 |     } else {
228 |         F *ptr = p(0, 0);
229 |         size_t size = static_cast<size_t>(p.ldpp) * npcol;
230 | #pragma omp parallel for simd
231 |         for (size_t i = 0; i < size; ++i)
232 |             ptr[i] = dzero;
233 |     }
234 | }
235 | 
236 | template <typename F> void pmatl1est(Matgen<F> const &mg, Panels<F> &p) {
237 |     // approximation of the decomposition
238 |     int const bs = p.b;
239 |     size_t const lda = p.lda;
240 |     int const nprow = p.nprow;
241 |     int const npcol = p.npcol;
242 | 
243 | #pragma omp parallel for
244 |     for (int pj = 0; pj < npcol; ++pj) {
245 |         double buf[bs];
246 |         const int j = bs * (p.j1 + pj * p.jstride);
247 |         for (int jj = 0; jj < bs; ++jj)
248 |             buf[jj] = 1. / calc_diag(j + jj, mg.n, mg.seed);
249 |         for (int pi = 0; pi < nprow; ++pi) {
250 |             F *pp = p(pi, pj);
251 |             const int i = bs * (p.i1 + pi * p.istride);
252 |             if (i < j)
253 |                 continue;
254 |             if (i == j) {
255 |                 for (int jj = 0; jj < bs; ++jj) {
256 |                     F d = buf[jj];
257 |                     for (int ii = 0; ii < bs; ++ii) {
258 |                         if (i + ii > j + jj) {
259 |                             pp[jj * lda + ii] *= d;
260 |                         }
261 |                     }
262 |                 }
263 |             } else {
264 |                 for (int jj = 0; jj < bs; ++jj) {
265 |                     F d = buf[jj];
266 |                     for (int ii = 0; ii < bs; ++ii) {
267 |                         pp[jj * lda + ii] *= d;
268 |                     }
269 |                 }
270 |             }
271 |         }
272 |     }
273 | }
274 | 
275 | template <typename F> void pmatl1est(HMGen<F> const &mg, Panels<F> &p) {
276 |     // approximation of the decomposition
277 |     int const bs = p.b;
278 |     size_t const lda = p.lda;
279 |     int const nprow = p.nprow;
280 |     int const npcol = p.npcol;
281 |     F const alpha = mg.alpha;
282 |     F const beta = mg.beta;
283 |     F const done = 1;
284 | 
285 | #pragma omp parallel for collapse(2) schedule(dynamic)
286 |     for (int pj = 0; pj < npcol; ++pj)
287 |         for (int pi = 0; pi < nprow; ++pi) {
288 |             F *pp = p(pi, pj);
289 |             const int i = bs * (p.i1 + pi * p.istride);
290 |             const int j = bs * (p.j1 + pj * p.jstride);
291 |             if (i < j) {
292 |                 for (int jj = 0; jj < bs; ++jj) {
293 |                     for (int ii = 0; ii < bs; ++ii) {
294 |                         pp[jj * lda + ii] = beta;
295 |                     }
296 |                 }
297 |             } else if (i > j) {
298 |                 for (int jj = 0; jj < bs; ++jj) {
299 |                     for (int ii = 0; ii < bs; ++ii) {
300 |                         pp[jj * lda + ii] = alpha;
301 |                     }
302 |                 }
303 |             } else {
304 |                 for (int jj = 0; jj < bs; ++jj) {
305 |                     for (int ii = 0; ii < jj; ++ii) {
306 |                         pp[jj * lda + ii] = beta;
307 |                     }
308 |                     pp[jj * lda + jj] = done;
309 |                     for (int ii = jj + 1; ii < bs; ++ii) {
310 |                         pp[jj * lda + ii] = alpha;
311 |                     }
312 |                 }
313 |             }
314 |         }
315 | }
316 | 
317 | template <typename F, typename FPanel>
318 | void pcolvgen(Matgen<F> const &mg, Panels<FPanel> const &p, double *dx) {
319 |     int nprow = p.nprow;
320 |     int b = p.b;
321 |     int i1 = p.i1;
322 |     int j1 = p.j1;
323 |     int istride = p.istride;
324 |     int jstride = p.jstride;
325 |     for (int i = 0; i < nprow; ++i) {
326 |         int ipos = i1 + i * istride;
327 |         if (ipos % jstride == j1) {
328 |             fill_one_panel_with_rand(mg.n, b * ipos, mg.n, b, 1, dx + b * i, 1,
329 |                                      mg.seed, false);
330 |         }
331 |     }
332 | }
333 | 
334 | template <typename F, typename FPanel>
335 | void pdiaggen2(Matgen<F> const &mg, Panels<FPanel> &p, double *dx, double* localSum) {
336 |     int nprow = p.nprow;
337 |     int b = p.b;
338 |     int i1 = p.i1;
339 |     int j1 = p.j1;
340 |     int istride = p.istride;
341 |     int jstride = p.jstride;
342 | 	int const bs = p.b;
343 |     size_t const lda = p.lda;
344 |     int const npcol = p.npcol;
345 | 
346 |     double* globalSum = (double*)malloc(mg.n*sizeof(double));
347 |     if ( globalSum == NULL )
348 |     {
349 |        printf( "Allocation of globalSum failed\n" );
350 |        exit( 10 );
351 |     }
352 |     memset(globalSum, 0, mg.n*sizeof(double)); 
353 |     MPI_Allreduce(localSum, globalSum, mg.n, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
354 |  
355 | #pragma omp parallel for
356 |     for (int i = 0; i < nprow; ++i) {
357 |         int ipos = i1 + i * istride;
358 |         if (ipos % jstride == j1) {
359 |             for (int k = 0; k < b; ++k)
360 |                 dx[b * i + k] = globalSum[ b * ipos + k ];
361 |         }
362 |     }
363 | // fill A back
364 |     for (int pj = 0; pj < npcol; ++pj) {
365 |         for (int pi = 0; pi < nprow; ++pi) {
366 |             F *pp = p(pi, pj);
367 |             const int i0 = bs * (p.i1 + pi * p.istride);
368 |             const int j0 = bs * (p.j1 + pj * p.jstride);
369 | 		  	if( i0 == j0 )
370 | 			{
371 | #pragma omp parallel for 
372 | 			   for (int ii = 0; ii < b; ii++)
373 |                               pp[ lda*ii + ii ] = static_cast<F>( globalSum[ i0+ii ] );
374 | 			}
375 |         }
376 |     }
377 |     std::free( globalSum );
378 | 
379 | }
380 | 
381 | template <typename F, typename FPanel>
382 | void pdiaggen(Matgen<F> const &mg, Panels<FPanel> const &p, double *dx) {
383 |     int nprow = p.nprow;
384 |     int b = p.b;
385 |     int i1 = p.i1;
386 |     int j1 = p.j1;
387 |     int istride = p.istride;
388 |     int jstride = p.jstride;
389 |     for (int i = 0; i < nprow; ++i) {
390 |         int ipos = i1 + i * istride;
391 |         if (ipos % jstride == j1) {
392 | #pragma omp parallel for
393 |             for (int k = 0; k < b; ++k)
394 |                 dx[b * i + k] = calc_diag(b * ipos + k, mg.n, mg.seed);
395 |         }
396 |     }
397 | }
398 | 
399 | template <typename F, typename FPanel>
400 | void pdiaggen(HMGen<F> const &mg, Panels<FPanel> const &p, double *dx) {
401 |     int nprow = p.nprow;
402 |     int b = p.b;
403 |     int i1 = p.i1;
404 |     int j1 = p.j1;
405 |     int istride = p.istride;
406 |     int jstride = p.jstride;
407 |     F const ab = mg.alpha * mg.beta;
408 |     F const done = 1;
409 |     for (int i = 0; i < nprow; ++i) {
410 |         int ipos = i1 + i * istride;
411 |         if (ipos % jstride == j1) {
412 | #pragma omp parallel for
413 |             for (int k = 0; k < b; ++k)
414 |                 dx[b * i + k] = done + ab * (b * ipos + k);
415 |         }
416 |     }
417 | }
418 | 
419 | template<typename F, typename FPanel>
420 | void panel_colvgen(Matgen<F> const& mg, Panels<FPanel>const& p, double* dx) {
421 |     int nprow = p.nprow;
422 |     int b = p.b;
423 |     int i1 = p.i1;
424 |     int j1 = p.j1;
425 |     int istride = p.istride;
426 |     int jstride = p.jstride;
427 | 
428 | #pragma omp parallel for
429 |     for (int i = 0; i < nprow; ++i) {
430 |         int ipos = i1 + i * istride;
431 | 
432 |         if (ipos % jstride == j1) {
433 |             panel_fill_one_with_rand(mg.n, b * ipos, mg.n, b, 1, dx + b * i, 1, mg.seed, false);
434 |         }
435 |     }
436 | }
437 | 
438 | template<typename F, typename FPanel>
439 | void panel_diaggen(Matgen<F> const& mg, Panels<FPanel>const& p, double* dx) {
440 |     int nprow = p.nprow;
441 |     int b = p.b;
442 |     int i1 = p.i1;
443 |     int j1 = p.j1;
444 |     int istride = p.istride;
445 |     int jstride = p.jstride;
446 | 
447 | //  PrintLogMsg( "nprow ... %d n ... %d\n", nprow, mg.n );
448 | 
449 | #pragma omp parallel for
450 |     for (int i = 0; i < nprow; ++i) {
451 |         int ipos = i1 + i * istride;
452 | 
453 |         if (ipos % jstride == j1) {
454 |             for (int k = 0; k < b; ++k) {
455 |                 //dx[b * i + k] = calc_diag(b * ipos + k, mg.n, mg.seed);
456 |                 dx[b * i + k] = 0.25 * mg.n;
457 |             }
458 |         }
459 |     }
460 | }
461 | 
462 | #endif
463 | 


--------------------------------------------------------------------------------
/doc/crusher_example_32x32.out:
--------------------------------------------------------------------------------
  1 | ===============================================================================
  2 | OpenMxP - High Performance Mixed Precision Benchmark - NCCS/OLCF
  3 | ===============================================================================
  4 | Build Info: git branch: [], hash-id: []
  5 | Running configuration:
  6 | 	Ranks = 1024
  7 | 	OpenMP threads = 7
  8 | 	PxQ 32x32
  9 | 	cusolver/rocsolver Sgetrf
 10 | 	2ringM comm
 11 | 	1ring dcomm
 12 | 	Node Grid - 2x4R pxq grid
 13 | 	no sync on sgemm
 14 | 	default stream sgemm
 15 | 	gpu direct mpi
 16 | 	GPU Initialization
 17 | 	Alt-Trsm Method
 18 | 	Building panels
 19 | 	gpu p[62940774400] l[1284505600] r[1284505600] ... tot[65509785600]
 20 | 	Finished building panels
 21 | 	gpu d_piv[26214400]
 22 | 	Initialization 1 = 2.1105 sec
 23 | 	Gpu memory (free/total) 2978675712/68702699520	jobid=290129
 24 | 	n=4014080 ln=125440 b=2560 r=32 c=32 epoch_size=81920
 25 | #BEGIN_: Mon Mar 27 04:30:59 2023
 26 | Entering LU
 27 | Step 10, Rank 0, Elapsed 5.80, Work left 98.10%, Time left: 299.50, Est Total: 305.30, GFlops: 141233934.16, TFlops/GPU: 137.92
 28 | Step 20, Rank 0, Elapsed 11.19, Work left 96.22%, Time left: 285.07, Est Total: 296.27, GFlops: 145540752.65, TFlops/GPU: 142.13
 29 | Step 30, Rank 0, Elapsed 16.27, Work left 94.37%, Time left: 272.65, Est Total: 288.92, GFlops: 149241859.39, TFlops/GPU: 145.74
 30 | Step 40, Rank 0, Elapsed 21.67, Work left 92.54%, Time left: 268.80, Est Total: 290.46, GFlops: 148447771.05, TFlops/GPU: 144.97
 31 | Step 50, Rank 0, Elapsed 26.81, Work left 90.74%, Time left: 262.57, Est Total: 289.38, GFlops: 149005030.04, TFlops/GPU: 145.51
 32 | Step 60, Rank 0, Elapsed 31.87, Work left 88.95%, Time left: 256.63, Est Total: 288.49, GFlops: 149462321.26, TFlops/GPU: 145.96
 33 | Step 70, Rank 0, Elapsed 37.07, Work left 87.20%, Time left: 252.46, Est Total: 289.53, GFlops: 148928266.10, TFlops/GPU: 145.44
 34 | Step 80, Rank 0, Elapsed 41.79, Work left 85.46%, Time left: 245.63, Est Total: 287.42, GFlops: 150020347.73, TFlops/GPU: 146.50
 35 | Step 90, Rank 0, Elapsed 46.81, Work left 83.75%, Time left: 241.26, Est Total: 288.08, GFlops: 149678958.04, TFlops/GPU: 146.17
 36 | Step 100, Rank 0, Elapsed 51.87, Work left 82.06%, Time left: 237.28, Est Total: 289.15, GFlops: 149122148.49, TFlops/GPU: 145.63
 37 | Step 110, Rank 0, Elapsed 56.42, Work left 80.40%, Time left: 231.39, Est Total: 287.81, GFlops: 149817072.91, TFlops/GPU: 146.31
 38 | Step 120, Rank 0, Elapsed 61.32, Work left 78.75%, Time left: 227.28, Est Total: 288.60, GFlops: 149406456.37, TFlops/GPU: 145.90
 39 | Step 130, Rank 0, Elapsed 66.06, Work left 77.13%, Time left: 222.84, Est Total: 288.90, GFlops: 149249133.14, TFlops/GPU: 145.75
 40 | Step 140, Rank 0, Elapsed 70.46, Work left 75.53%, Time left: 217.53, Est Total: 287.98, GFlops: 149726580.18, TFlops/GPU: 146.22
 41 | Step 150, Rank 0, Elapsed 75.06, Work left 73.96%, Time left: 213.19, Est Total: 288.25, GFlops: 149587093.28, TFlops/GPU: 146.08
 42 | Step 160, Rank 0, Elapsed 79.31, Work left 72.41%, Time left: 208.09, Est Total: 287.40, GFlops: 150031527.43, TFlops/GPU: 146.52
 43 | Step 170, Rank 0, Elapsed 83.98, Work left 70.87%, Time left: 204.34, Est Total: 288.32, GFlops: 149550099.31, TFlops/GPU: 146.05
 44 | Step 180, Rank 0, Elapsed 88.45, Work left 69.36%, Time left: 200.25, Est Total: 288.69, GFlops: 149358082.93, TFlops/GPU: 145.86
 45 | Step 190, Rank 0, Elapsed 92.67, Work left 67.87%, Time left: 195.79, Est Total: 288.46, GFlops: 149481419.72, TFlops/GPU: 145.98
 46 | Step 200, Rank 0, Elapsed 97.01, Work left 66.41%, Time left: 191.78, Est Total: 288.79, GFlops: 149309802.78, TFlops/GPU: 145.81
 47 | Step 210, Rank 0, Elapsed 101.15, Work left 64.96%, Time left: 187.55, Est Total: 288.70, GFlops: 149355346.72, TFlops/GPU: 145.85
 48 | Step 220, Rank 0, Elapsed 105.30, Work left 63.54%, Time left: 183.49, Est Total: 288.78, GFlops: 149312297.08, TFlops/GPU: 145.81
 49 | Step 230, Rank 0, Elapsed 109.39, Work left 62.13%, Time left: 179.50, Est Total: 288.89, GFlops: 149255627.55, TFlops/GPU: 145.76
 50 | Step 240, Rank 0, Elapsed 113.16, Work left 60.75%, Time left: 175.15, Est Total: 288.30, GFlops: 149560060.54, TFlops/GPU: 146.05
 51 | Step 250, Rank 0, Elapsed 117.20, Work left 59.39%, Time left: 171.39, Est Total: 288.59, GFlops: 149409467.15, TFlops/GPU: 145.91
 52 | Step 260, Rank 0, Elapsed 121.22, Work left 58.05%, Time left: 167.73, Est Total: 288.96, GFlops: 149221845.23, TFlops/GPU: 145.72
 53 | Step 270, Rank 0, Elapsed 124.88, Work left 56.73%, Time left: 163.71, Est Total: 288.59, GFlops: 149410745.17, TFlops/GPU: 145.91
 54 | Step 280, Rank 0, Elapsed 128.76, Work left 55.43%, Time left: 160.10, Est Total: 288.85, GFlops: 149275326.59, TFlops/GPU: 145.78
 55 | Step 290, Rank 0, Elapsed 132.58, Work left 54.14%, Time left: 156.54, Est Total: 289.12, GFlops: 149137334.16, TFlops/GPU: 145.64
 56 | Step 300, Rank 0, Elapsed 136.07, Work left 52.88%, Time left: 152.73, Est Total: 288.80, GFlops: 149305621.26, TFlops/GPU: 145.81
 57 | Step 310, Rank 0, Elapsed 139.64, Work left 51.64%, Time left: 149.13, Est Total: 288.77, GFlops: 149319831.15, TFlops/GPU: 145.82
 58 | Step 320, Rank 0, Elapsed 143.02, Work left 50.42%, Time left: 145.45, Est Total: 288.47, GFlops: 149471644.59, TFlops/GPU: 145.97
 59 | Step 330, Rank 0, Elapsed 146.69, Work left 49.22%, Time left: 142.17, Est Total: 288.86, GFlops: 149274742.08, TFlops/GPU: 145.78
 60 | Step 340, Rank 0, Elapsed 150.10, Work left 48.03%, Time left: 138.75, Est Total: 288.84, GFlops: 149280135.18, TFlops/GPU: 145.78
 61 | Step 350, Rank 0, Elapsed 153.38, Work left 46.87%, Time left: 135.31, Est Total: 288.70, GFlops: 149357426.93, TFlops/GPU: 145.86
 62 | Step 360, Rank 0, Elapsed 156.84, Work left 45.73%, Time left: 132.14, Est Total: 288.98, GFlops: 149208174.53, TFlops/GPU: 145.71
 63 | Step 370, Rank 0, Elapsed 160.12, Work left 44.60%, Time left: 128.90, Est Total: 289.02, GFlops: 149187823.39, TFlops/GPU: 145.69
 64 | Step 380, Rank 0, Elapsed 163.25, Work left 43.49%, Time left: 125.64, Est Total: 288.89, GFlops: 149256953.84, TFlops/GPU: 145.76
 65 | Step 390, Rank 0, Elapsed 166.55, Work left 42.40%, Time left: 122.61, Est Total: 289.16, GFlops: 149115792.58, TFlops/GPU: 145.62
 66 | Step 400, Rank 0, Elapsed 169.47, Work left 41.33%, Time left: 119.39, Est Total: 288.86, GFlops: 149273109.20, TFlops/GPU: 145.77
 67 | Step 410, Rank 0, Elapsed 172.63, Work left 40.28%, Time left: 116.44, Est Total: 289.07, GFlops: 149165550.90, TFlops/GPU: 145.67
 68 | Step 420, Rank 0, Elapsed 175.89, Work left 39.25%, Time left: 113.62, Est Total: 289.50, GFlops: 148940771.20, TFlops/GPU: 145.45
 69 | Step 430, Rank 0, Elapsed 178.74, Work left 38.23%, Time left: 110.62, Est Total: 289.35, GFlops: 149017949.20, TFlops/GPU: 145.53
 70 | Step 440, Rank 0, Elapsed 181.67, Work left 37.23%, Time left: 107.75, Est Total: 289.42, GFlops: 148984267.03, TFlops/GPU: 145.49
 71 | Step 450, Rank 0, Elapsed 184.76, Work left 36.25%, Time left: 105.05, Est Total: 289.82, GFlops: 148778772.01, TFlops/GPU: 145.29
 72 | Step 460, Rank 0, Elapsed 187.67, Work left 35.28%, Time left: 102.32, Est Total: 289.99, GFlops: 148689121.80, TFlops/GPU: 145.20
 73 | Step 470, Rank 0, Elapsed 190.59, Work left 34.34%, Time left: 99.67, Est Total: 290.26, GFlops: 148551035.05, TFlops/GPU: 145.07
 74 | Step 480, Rank 0, Elapsed 193.40, Work left 33.41%, Time left: 97.02, Est Total: 290.42, GFlops: 148470412.52, TFlops/GPU: 144.99
 75 | Step 490, Rank 0, Elapsed 196.30, Work left 32.50%, Time left: 94.50, Est Total: 290.80, GFlops: 148276490.22, TFlops/GPU: 144.80
 76 | Step 500, Rank 0, Elapsed 198.98, Work left 31.60%, Time left: 91.92, Est Total: 290.90, GFlops: 148223368.54, TFlops/GPU: 144.75
 77 | Step 510, Rank 0, Elapsed 201.66, Work left 30.72%, Time left: 89.42, Est Total: 291.08, GFlops: 148133057.74, TFlops/GPU: 144.66
 78 | Step 520, Rank 0, Elapsed 204.52, Work left 29.86%, Time left: 87.06, Est Total: 291.58, GFlops: 147879597.86, TFlops/GPU: 144.41
 79 | Step 530, Rank 0, Elapsed 207.32, Work left 29.01%, Time left: 84.72, Est Total: 292.04, GFlops: 147646915.93, TFlops/GPU: 144.19
 80 | Step 540, Rank 0, Elapsed 209.89, Work left 28.18%, Time left: 82.36, Est Total: 292.25, GFlops: 147542523.61, TFlops/GPU: 144.08
 81 | Step 550, Rank 0, Elapsed 212.77, Work left 27.37%, Time left: 80.16, Est Total: 292.93, GFlops: 147198233.90, TFlops/GPU: 143.75
 82 | Step 560, Rank 0, Elapsed 215.48, Work left 26.57%, Time left: 77.96, Est Total: 293.44, GFlops: 146941771.26, TFlops/GPU: 143.50
 83 | Step 570, Rank 0, Elapsed 218.30, Work left 25.78%, Time left: 75.84, Est Total: 294.14, GFlops: 146594025.82, TFlops/GPU: 143.16
 84 | Step 580, Rank 0, Elapsed 221.14, Work left 25.02%, Time left: 73.78, Est Total: 294.92, GFlops: 146206048.63, TFlops/GPU: 142.78
 85 | Step 590, Rank 0, Elapsed 223.38, Work left 24.26%, Time left: 71.57, Est Total: 294.96, GFlops: 146187623.15, TFlops/GPU: 142.76
 86 | Step 600, Rank 0, Elapsed 225.77, Work left 23.53%, Time left: 69.46, Est Total: 295.23, GFlops: 146050029.72, TFlops/GPU: 142.63
 87 | Step 610, Rank 0, Elapsed 228.33, Work left 22.81%, Time left: 67.46, Est Total: 295.79, GFlops: 145777103.24, TFlops/GPU: 142.36
 88 | Step 620, Rank 0, Elapsed 230.38, Work left 22.10%, Time left: 65.36, Est Total: 295.74, GFlops: 145798381.88, TFlops/GPU: 142.38
 89 | Step 630, Rank 0, Elapsed 232.51, Work left 21.41%, Time left: 63.33, Est Total: 295.85, GFlops: 145746978.11, TFlops/GPU: 142.33
 90 | Step 640, Rank 0, Elapsed 234.55, Work left 20.73%, Time left: 61.34, Est Total: 295.89, GFlops: 145724876.59, TFlops/GPU: 142.31
 91 | Step 650, Rank 0, Elapsed 236.68, Work left 20.07%, Time left: 59.42, Est Total: 296.09, GFlops: 145625613.43, TFlops/GPU: 142.21
 92 | Step 660, Rank 0, Elapsed 238.65, Work left 19.42%, Time left: 57.51, Est Total: 296.16, GFlops: 145590555.99, TFlops/GPU: 142.18
 93 | Step 670, Rank 0, Elapsed 240.52, Work left 18.78%, Time left: 55.63, Est Total: 296.15, GFlops: 145599001.44, TFlops/GPU: 142.19
 94 | Step 680, Rank 0, Elapsed 242.66, Work left 18.16%, Time left: 53.86, Est Total: 296.52, GFlops: 145416335.10, TFlops/GPU: 142.01
 95 | Step 690, Rank 0, Elapsed 244.73, Work left 17.56%, Time left: 52.12, Est Total: 296.84, GFlops: 145257936.33, TFlops/GPU: 141.85
 96 | Step 700, Rank 0, Elapsed 246.51, Work left 16.96%, Time left: 50.36, Est Total: 296.88, GFlops: 145242252.85, TFlops/GPU: 141.84
 97 | Step 710, Rank 0, Elapsed 248.49, Work left 16.38%, Time left: 48.69, Est Total: 297.18, GFlops: 145094182.05, TFlops/GPU: 141.69
 98 | Step 720, Rank 0, Elapsed 250.11, Work left 15.82%, Time left: 47.00, Est Total: 297.11, GFlops: 145129124.89, TFlops/GPU: 141.73
 99 | Step 730, Rank 0, Elapsed 251.83, Work left 15.26%, Time left: 45.37, Est Total: 297.19, GFlops: 145087115.70, TFlops/GPU: 141.69
100 | Step 740, Rank 0, Elapsed 253.82, Work left 14.72%, Time left: 43.83, Est Total: 297.64, GFlops: 144867494.55, TFlops/GPU: 141.47
101 | Step 750, Rank 0, Elapsed 255.48, Work left 14.20%, Time left: 42.27, Est Total: 297.75, GFlops: 144813197.16, TFlops/GPU: 141.42
102 | Step 760, Rank 0, Elapsed 257.17, Work left 13.68%, Time left: 40.77, Est Total: 297.94, GFlops: 144721851.75, TFlops/GPU: 141.33
103 | Step 770, Rank 0, Elapsed 259.19, Work left 13.18%, Time left: 39.35, Est Total: 298.54, GFlops: 144433250.11, TFlops/GPU: 141.05
104 | Step 780, Rank 0, Elapsed 260.62, Work left 12.69%, Time left: 37.89, Est Total: 298.51, GFlops: 144446447.66, TFlops/GPU: 141.06
105 | Step 790, Rank 0, Elapsed 262.24, Work left 12.22%, Time left: 36.49, Est Total: 298.74, GFlops: 144337785.04, TFlops/GPU: 140.95
106 | Step 800, Rank 0, Elapsed 263.77, Work left 11.75%, Time left: 35.12, Est Total: 298.89, GFlops: 144261263.33, TFlops/GPU: 140.88
107 | Step 810, Rank 0, Elapsed 265.26, Work left 11.30%, Time left: 33.78, Est Total: 299.04, GFlops: 144191800.49, TFlops/GPU: 140.81
108 | Step 820, Rank 0, Elapsed 266.68, Work left 10.86%, Time left: 32.48, Est Total: 299.16, GFlops: 144134032.45, TFlops/GPU: 140.76
109 | Step 830, Rank 0, Elapsed 267.88, Work left 10.43%, Time left: 31.18, Est Total: 299.07, GFlops: 144178542.86, TFlops/GPU: 140.80
110 | Step 840, Rank 0, Elapsed 269.42, Work left 10.01%, Time left: 29.96, Est Total: 299.38, GFlops: 144024953.37, TFlops/GPU: 140.65
111 | Step 850, Rank 0, Elapsed 270.83, Work left 9.60%, Time left: 28.77, Est Total: 299.59, GFlops: 143923933.29, TFlops/GPU: 140.55
112 | Step 860, Rank 0, Elapsed 272.05, Work left 9.21%, Time left: 27.58, Est Total: 299.64, GFlops: 143903822.32, TFlops/GPU: 140.53
113 | Step 870, Rank 0, Elapsed 273.46, Work left 8.82%, Time left: 26.46, Est Total: 299.91, GFlops: 143770997.23, TFlops/GPU: 140.40
114 | Step 880, Rank 0, Elapsed 274.54, Work left 8.45%, Time left: 25.33, Est Total: 299.88, GFlops: 143789234.27, TFlops/GPU: 140.42
115 | Step 890, Rank 0, Elapsed 275.72, Work left 8.08%, Time left: 24.25, Est Total: 299.97, GFlops: 143742864.39, TFlops/GPU: 140.37
116 | Step 900, Rank 0, Elapsed 277.05, Work left 7.73%, Time left: 23.22, Est Total: 300.27, GFlops: 143601737.67, TFlops/GPU: 140.24
117 | Step 910, Rank 0, Elapsed 278.04, Work left 7.39%, Time left: 22.19, Est Total: 300.23, GFlops: 143620576.13, TFlops/GPU: 140.25
118 | Step 920, Rank 0, Elapsed 279.08, Work left 7.06%, Time left: 21.19, Est Total: 300.28, GFlops: 143596488.77, TFlops/GPU: 140.23
119 | Step 930, Rank 0, Elapsed 280.31, Work left 6.74%, Time left: 20.25, Est Total: 300.56, GFlops: 143463799.97, TFlops/GPU: 140.10
120 | Step 940, Rank 0, Elapsed 281.25, Work left 6.42%, Time left: 19.31, Est Total: 300.56, GFlops: 143460769.82, TFlops/GPU: 140.10
121 | Step 950, Rank 0, Elapsed 282.29, Work left 6.12%, Time left: 18.41, Est Total: 300.70, GFlops: 143397108.83, TFlops/GPU: 140.04
122 | Step 960, Rank 0, Elapsed 283.28, Work left 5.83%, Time left: 17.54, Est Total: 300.82, GFlops: 143337182.00, TFlops/GPU: 139.98
123 | Step 970, Rank 0, Elapsed 284.28, Work left 5.55%, Time left: 16.70, Est Total: 300.97, GFlops: 143264125.56, TFlops/GPU: 139.91
124 | Step 980, Rank 0, Elapsed 285.20, Work left 5.27%, Time left: 15.88, Est Total: 301.08, GFlops: 143214038.71, TFlops/GPU: 139.86
125 | Step 990, Rank 0, Elapsed 285.99, Work left 5.01%, Time left: 15.08, Est Total: 301.07, GFlops: 143216965.77, TFlops/GPU: 139.86
126 | Step 1000, Rank 0, Elapsed 287.05, Work left 4.75%, Time left: 14.33, Est Total: 301.38, GFlops: 143071779.56, TFlops/GPU: 139.72
127 | Step 1010, Rank 0, Elapsed 287.85, Work left 4.51%, Time left: 13.58, Est Total: 301.43, GFlops: 143045912.50, TFlops/GPU: 139.69
128 | Step 1020, Rank 0, Elapsed 288.59, Work left 4.27%, Time left: 12.87, Est Total: 301.46, GFlops: 143035383.98, TFlops/GPU: 139.68
129 | Step 1030, Rank 0, Elapsed 289.53, Work left 4.04%, Time left: 12.19, Est Total: 301.72, GFlops: 142912294.41, TFlops/GPU: 139.56
130 | Step 1040, Rank 0, Elapsed 290.22, Work left 3.82%, Time left: 11.52, Est Total: 301.74, GFlops: 142898236.78, TFlops/GPU: 139.55
131 | Step 1050, Rank 0, Elapsed 291.01, Work left 3.61%, Time left: 10.88, Est Total: 301.89, GFlops: 142829714.18, TFlops/GPU: 139.48
132 | Step 1060, Rank 0, Elapsed 291.89, Work left 3.40%, Time left: 10.28, Est Total: 302.17, GFlops: 142698214.25, TFlops/GPU: 139.35
133 | Step 1070, Rank 0, Elapsed 292.49, Work left 3.20%, Time left: 9.68, Est Total: 302.17, GFlops: 142698739.05, TFlops/GPU: 139.35
134 | Step 1080, Rank 0, Elapsed 293.18, Work left 3.01%, Time left: 9.11, Est Total: 302.29, GFlops: 142640986.57, TFlops/GPU: 139.30
135 | Step 1090, Rank 0, Elapsed 294.00, Work left 2.83%, Time left: 8.57, Est Total: 302.57, GFlops: 142507256.86, TFlops/GPU: 139.17
136 | Step 1100, Rank 0, Elapsed 294.55, Work left 2.66%, Time left: 8.05, Est Total: 302.60, GFlops: 142494919.89, TFlops/GPU: 139.16
137 | Step 1110, Rank 0, Elapsed 295.21, Work left 2.49%, Time left: 7.54, Est Total: 302.76, GFlops: 142419895.25, TFlops/GPU: 139.08
138 | Step 1120, Rank 0, Elapsed 295.81, Work left 2.33%, Time left: 7.06, Est Total: 302.88, GFlops: 142363952.48, TFlops/GPU: 139.03
139 | Step 1130, Rank 0, Elapsed 296.52, Work left 2.18%, Time left: 6.61, Est Total: 303.12, GFlops: 142248370.75, TFlops/GPU: 138.91
140 | Step 1140, Rank 0, Elapsed 297.17, Work left 2.03%, Time left: 6.17, Est Total: 303.34, GFlops: 142148242.29, TFlops/GPU: 138.82
141 | Step 1150, Rank 0, Elapsed 297.68, Work left 1.89%, Time left: 5.75, Est Total: 303.43, GFlops: 142106780.17, TFlops/GPU: 138.78
142 | Step 1160, Rank 0, Elapsed 298.33, Work left 1.76%, Time left: 5.35, Est Total: 303.68, GFlops: 141988061.71, TFlops/GPU: 138.66
143 | Step 1170, Rank 0, Elapsed 298.83, Work left 1.64%, Time left: 4.97, Est Total: 303.80, GFlops: 141933525.99, TFlops/GPU: 138.61
144 | Step 1180, Rank 0, Elapsed 299.24, Work left 1.52%, Time left: 4.60, Est Total: 303.85, GFlops: 141909903.95, TFlops/GPU: 138.58
145 | Step 1190, Rank 0, Elapsed 299.84, Work left 1.40%, Time left: 4.26, Est Total: 304.10, GFlops: 141792392.34, TFlops/GPU: 138.47
146 | Step 1200, Rank 0, Elapsed 300.21, Work left 1.29%, Time left: 3.93, Est Total: 304.14, GFlops: 141770595.14, TFlops/GPU: 138.45
147 | Step 1210, Rank 0, Elapsed 300.66, Work left 1.19%, Time left: 3.62, Est Total: 304.28, GFlops: 141708376.49, TFlops/GPU: 138.39
148 | Step 1220, Rank 0, Elapsed 301.22, Work left 1.09%, Time left: 3.33, Est Total: 304.55, GFlops: 141582908.57, TFlops/GPU: 138.26
149 | Step 1230, Rank 0, Elapsed 301.54, Work left 1.00%, Time left: 3.05, Est Total: 304.59, GFlops: 141561903.63, TFlops/GPU: 138.24
150 | Step 1240, Rank 0, Elapsed 301.98, Work left 0.92%, Time left: 2.79, Est Total: 304.77, GFlops: 141478466.06, TFlops/GPU: 138.16
151 | Step 1250, Rank 0, Elapsed 302.48, Work left 0.83%, Time left: 2.54, Est Total: 305.02, GFlops: 141361659.10, TFlops/GPU: 138.05
152 | Step 1260, Rank 0, Elapsed 302.80, Work left 0.76%, Time left: 2.31, Est Total: 305.11, GFlops: 141322184.99, TFlops/GPU: 138.01
153 | Step 1270, Rank 0, Elapsed 303.19, Work left 0.69%, Time left: 2.10, Est Total: 305.28, GFlops: 141242033.69, TFlops/GPU: 137.93
154 | Step 1280, Rank 0, Elapsed 303.54, Work left 0.62%, Time left: 1.89, Est Total: 305.44, GFlops: 141171796.70, TFlops/GPU: 137.86
155 | Step 1290, Rank 0, Elapsed 303.93, Work left 0.56%, Time left: 1.70, Est Total: 305.63, GFlops: 141081286.12, TFlops/GPU: 137.77
156 | Step 1300, Rank 0, Elapsed 304.21, Work left 0.50%, Time left: 1.53, Est Total: 305.74, GFlops: 141031490.65, TFlops/GPU: 137.73
157 | Step 1310, Rank 0, Elapsed 304.45, Work left 0.45%, Time left: 1.36, Est Total: 305.81, GFlops: 140999214.86, TFlops/GPU: 137.69
158 | Step 1320, Rank 0, Elapsed 304.84, Work left 0.40%, Time left: 1.21, Est Total: 306.05, GFlops: 140888473.63, TFlops/GPU: 137.59
159 | Step 1330, Rank 0, Elapsed 305.12, Work left 0.35%, Time left: 1.07, Est Total: 306.19, GFlops: 140821428.41, TFlops/GPU: 137.52
160 | Step 1340, Rank 0, Elapsed 305.36, Work left 0.31%, Time left: 0.94, Est Total: 306.30, GFlops: 140774068.42, TFlops/GPU: 137.47
161 | Step 1350, Rank 0, Elapsed 305.68, Work left 0.27%, Time left: 0.82, Est Total: 306.51, GFlops: 140677603.88, TFlops/GPU: 137.38
162 | Step 1360, Rank 0, Elapsed 305.86, Work left 0.23%, Time left: 0.72, Est Total: 306.58, GFlops: 140644495.05, TFlops/GPU: 137.35
163 | Step 1370, Rank 0, Elapsed 306.10, Work left 0.20%, Time left: 0.62, Est Total: 306.72, GFlops: 140582461.99, TFlops/GPU: 137.29
164 | Step 1380, Rank 0, Elapsed 306.39, Work left 0.17%, Time left: 0.53, Est Total: 306.92, GFlops: 140488994.72, TFlops/GPU: 137.20
165 | Step 1390, Rank 0, Elapsed 306.58, Work left 0.15%, Time left: 0.45, Est Total: 307.03, GFlops: 140438193.65, TFlops/GPU: 137.15
166 | Step 1400, Rank 0, Elapsed 306.75, Work left 0.12%, Time left: 0.38, Est Total: 307.13, GFlops: 140393825.08, TFlops/GPU: 137.10
167 | Step 1410, Rank 0, Elapsed 307.00, Work left 0.10%, Time left: 0.31, Est Total: 307.31, GFlops: 140309193.43, TFlops/GPU: 137.02
168 | Step 1420, Rank 0, Elapsed 307.14, Work left 0.08%, Time left: 0.26, Est Total: 307.39, GFlops: 140271859.43, TFlops/GPU: 136.98
169 | Step 1430, Rank 0, Elapsed 307.31, Work left 0.07%, Time left: 0.21, Est Total: 307.52, GFlops: 140215444.68, TFlops/GPU: 136.93
170 | Step 1440, Rank 0, Elapsed 307.46, Work left 0.05%, Time left: 0.17, Est Total: 307.63, GFlops: 140164201.61, TFlops/GPU: 136.88
171 | Step 1450, Rank 0, Elapsed 307.63, Work left 0.04%, Time left: 0.13, Est Total: 307.76, GFlops: 140105788.63, TFlops/GPU: 136.82
172 | Step 1460, Rank 0, Elapsed 307.75, Work left 0.03%, Time left: 0.10, Est Total: 307.86, GFlops: 140061931.47, TFlops/GPU: 136.78
173 | Step 1470, Rank 0, Elapsed 307.86, Work left 0.02%, Time left: 0.08, Est Total: 307.93, GFlops: 140026884.42, TFlops/GPU: 136.75
174 | Step 1480, Rank 0, Elapsed 308.00, Work left 0.02%, Time left: 0.05, Est Total: 308.06, GFlops: 139969577.12, TFlops/GPU: 136.69
175 | Step 1490, Rank 0, Elapsed 308.11, Work left 0.01%, Time left: 0.04, Est Total: 308.15, GFlops: 139929101.58, TFlops/GPU: 136.65
176 | Step 1500, Rank 0, Elapsed 308.21, Work left 0.01%, Time left: 0.03, Est Total: 308.23, GFlops: 139890339.15, TFlops/GPU: 136.61
177 | Step 1510, Rank 0, Elapsed 308.34, Work left 0.01%, Time left: 0.02, Est Total: 308.35, GFlops: 139836613.55, TFlops/GPU: 136.56
178 | Step 1520, Rank 0, Elapsed 308.43, Work left 0.00%, Time left: 0.01, Est Total: 308.44, GFlops: 139798441.15, TFlops/GPU: 136.52
179 | Step 1530, Rank 0, Elapsed 308.52, Work left 0.00%, Time left: 0.00, Est Total: 308.53, GFlops: 139756324.91, TFlops/GPU: 136.48
180 | Step 1540, Rank 0, Elapsed 308.64, Work left 0.00%, Time left: 0.00, Est Total: 308.64, GFlops: 139706058.14, TFlops/GPU: 136.43
181 | Step 1550, Rank 0, Elapsed 308.72, Work left 0.00%, Time left: 0.00, Est Total: 308.72, GFlops: 139670934.41, TFlops/GPU: 136.40
182 | Step 1560, Rank 0, Elapsed 308.80, Work left 0.00%, Time left: 0.00, Est Total: 308.80, GFlops: 139632822.28, TFlops/GPU: 136.36
183 | LU factorization 308.86 sec
184 | 
185 | IR Start with GPU
186 | # iterative refinement: step=0, residual=8.778080e-04, hpl-harness=1311160.547096
187 | # iterative refinement: step=1, residual=4.172508e-07, hpl-harness=623.022736
188 | # iterative refinement: step=2, residual=1.682939e-10, hpl-harness=0.251290
189 | 	 IR Time 5.94 sec
190 | #END___: Mon Mar 27 04:36:14 2023
191 | 
192 | 314.822 sec. 136962679.949 GFlop/s resid = 1.682938764204205e-10 hpl-harness = 0.251289911 TFlop/s per GPU = 133.753


--------------------------------------------------------------------------------