├── src ├── fp16_gpu_kernels.cu ├── gpu_init_kernels.cu ├── device_macros.h ├── highammgen.hpp ├── log.hpp ├── fp16_gpu_kernels.h ├── gpu_init_kernels.h ├── getrf_nopiv.hpp ├── panel_check.hpp ├── higham_mat_impl.cpp ├── gpu_init.hpp ├── panel_norm.hpp ├── CMakeLists.txt ├── cuda_device_macros.h ├── iterative_refinement.hpp ├── hpl_rand.hpp ├── gpu_init_kernels.cpp ├── svesim.hpp ├── rocm_device_macros.h ├── fp16sim.hpp ├── grid.hpp ├── otf_gemv.cpp ├── timer.hpp ├── fp16_gpu_kernels.cpp └── matgen.hpp ├── doc ├── CUDA_HIP_Macros.xlsx ├── load_modules_frontier.sh ├── build_OpenMxP_frontier.sh ├── OpenMxP.slurm └── crusher_example_32x32.out ├── LICENSE ├── LICENSE.Fugaku └── README.md /src/fp16_gpu_kernels.cu: -------------------------------------------------------------------------------- 1 | // Import GPU code 2 | #include "fp16_gpu_kernels.cpp" 3 | -------------------------------------------------------------------------------- /src/gpu_init_kernels.cu: -------------------------------------------------------------------------------- 1 | // Import GPU code 2 | #include "gpu_init_kernels.cpp" 3 | -------------------------------------------------------------------------------- /doc/CUDA_HIP_Macros.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/at-aaims/OpenMxP/HEAD/doc/CUDA_HIP_Macros.xlsx -------------------------------------------------------------------------------- /doc/load_modules_frontier.sh: -------------------------------------------------------------------------------- 1 | #module load PrgEnv-cray/8.3.3 2 | 3 | module load PrgEnv-gnu/8.3.3 4 | module load gcc 5 | 6 | module load rocm/5.1.0 7 | 8 | module load cray-mpich/8.1.18 9 | 10 | #module load cray-libsci/21.08.1.2 11 | 12 | module load craype-x86-trento 13 | module load craype-network-ofi 14 | module load craype-accel-amd-gfx90a 15 | module load cmake/3.23.2 16 | 17 | -------------------------------------------------------------------------------- /src/device_macros.h: -------------------------------------------------------------------------------- 1 | 2 | #ifndef __HPLAI_DEVICE_MACROS__ 3 | #define __HPLAI_DEVICE_MACROS__ 4 | 5 | // Temp 6 | /* 7 | #if 0 8 | #define CUDA_OLCF_PLATFORM 1 9 | 10 | #elif 11 | #define ROCM_OLCF_PLATFORM 1 12 | 13 | #else 14 | 15 | #endif 16 | */ 17 | 18 | #define ROCM_OLCF_PLATFORM 1 19 | 20 | // SUMMIT 21 | #ifdef CUDA_OLCF_PLATFORM 22 | 23 | #include "cuda_device_macros.h" 24 | 25 | 26 | // FRONTIER (& SPOCK) 27 | #elif defined(ROCM_OLCF_PLATFORM) 28 | 29 | #include "rocm_device_macros.h" 30 | 31 | #endif 32 | 33 | 34 | #endif // __HPLAI_DEVICE_MACROS__ 35 | -------------------------------------------------------------------------------- /doc/build_OpenMxP_frontier.sh: -------------------------------------------------------------------------------- 1 | 2 | source ../doc/load_modules_frontier.sh 3 | 4 | SRC_DIR=../src 5 | 6 | export LD_LIBRARY_PATH=/opt/rocm-5.1.0/llvm/lib:/opt/rocm-5.1.0/llvm/lib:${LD_LIBRARY_PATH} 7 | export HIPCC_COMPILE_FLAGS_APPEND="$HIPCC_COMPILE_FLAGS_APPEND -std=c++14 -O3 -fopenmp --offload-arch=gfx90a" 8 | rm -rf CMakeCache.txt CMakeFiles externals Makefile 9 | cmake \ 10 | -DCMAKE_VERBOSE_MAKEFILE:BOOL=ON -DCMAKE_BUILD_TYPE=Release \ 11 | $SRC_DIR 2>&1 | tee CMAKE.OUTPUT 12 | # -DCMAKE_HIP_COMPILER_FORCED=True \ 13 | 14 | make VERBOSE=1 -j1 2>&1 | tee MAKE.OUTPUT 15 | -------------------------------------------------------------------------------- /LICENSE: 
-------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright © 2022 UT-Battelle, LLC 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /LICENSE.Fugaku: -------------------------------------------------------------------------------- 1 | Copyright (c) 2021, RIKEN 2 | 3 | All rights reserved. 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted provided that the following conditions are met: 6 | 1. Redistributions of source code must retain the above copyright notice, 7 | this list of conditions and the following disclaimer. 8 | 2. Redistributions in binary form must reproduce the above copyright notice, 9 | this list of conditions and the following disclaimer in the documentation 10 | and/or other materials provided with the distribution. 11 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 12 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 13 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 14 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR 15 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 16 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 17 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 18 | ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 19 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 20 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 21 | -------------------------------------------------------------------------------- /src/highammgen.hpp: -------------------------------------------------------------------------------- 1 | #ifndef _HIGHAMMGEN_HPP 2 | #define _HIGHAMMGEN_HPP 3 | extern "C" double higham_mat_comp_beta(int n, double cond, double rho); 4 | template <typename F> struct HMGen { 5 | // generator of Higham's HPL-AI matrix from 6 | // https://github.com/higham/hpl-ai-matrix. It generates the matrix A=LU where 7 | // L and U have a special structure. L is a lower-triangular matrix with 8 | // unit diagonal and strictly lower-triangular part = alpha. U is 9 | // an upper-triangular matrix with unit diagonal and strictly 10 | // upper-triangular part = beta.
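// Reference note (derived from hplai_matrix_impl() in src/higham_mat_impl.cpp below): with this L and U, the product A = L*U has the closed-form entries a(i,j) = -beta + alpha*beta*i for i < j, a(i,i) = 1 + alpha*beta*i, and a(i,j) = -alpha + alpha*beta*j for i > j (0-based indices).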
11 | int n; // the matrix size 12 | double alpha, beta; 13 | double scalea, scaleb; // scaling for left and right panels. 14 | F *diag; // diagonal part of the matrix. 15 | HMGen(int n, double cond, double rho, F *diag) : n(n), diag(diag) { 16 | // alpha and beta are automatically computed from the condition number 17 | // and the ratio rho 18 | beta = higham_mat_comp_beta(n, cond, rho); 19 | alpha = rho * beta; // rho is the ratio of alpha to beta. 20 | #if 0 21 | // this scaling may lead to results that are too good 22 | scalea = 1. / (alpha * 32.); 23 | scaleb = 1. / (beta * 16.); 24 | #else 25 | // we observed that alpha~beta~O(1/n). 26 | scalea = n / (32.); 27 | scaleb = n / (16.); 28 | #endif 29 | } 30 | }; 31 | #endif 32 | -------------------------------------------------------------------------------- /src/log.hpp: -------------------------------------------------------------------------------- 1 | #ifndef LOG_HPP 2 | #define LOG_HPP 3 | 4 | // removing the following will compile away debug messages, but keep info 5 | // or make it conditional by: #ifdef DEBUG 6 | #if 0 7 | #define SPDLOG_ACTIVE_LEVEL SPDLOG_LEVEL_TRACE 8 | #include "spdlog/cfg/env.h" 9 | #include "spdlog/sinks/basic_file_sink.h" 10 | #include "spdlog/sinks/stdout_color_sinks.h" 11 | #include "spdlog/spdlog.h" 12 | #endif 13 | 14 | //extern std::shared_ptr<spdlog::logger> LOG; 15 | 16 | extern int grank, gsize, glog; 17 | 18 | // #define INFO(...) {LOG->info(__VA_ARGS__); } 19 | 20 | // #define WARN(...) {LOG->warn(__VA_ARGS__); } 21 | 22 | #if 0 23 | #ifdef NDEBUG 24 | #define TRACE(...) (void)0 25 | #else 26 | #define TRACE(...) SPDLOG_LOGGER_CALL(spdlog::default_logger_raw(), spdlog::level::trace, __VA_ARGS__) 27 | #endif 28 | #endif 29 | 30 | void PrintMsg(const char *fmt...) { 31 | if (grank == 0) { 32 | // LOG->info(fmt); 33 | } 34 | } 35 | 36 | // #define PrintLogMsg( ... ) { if ( grank == 0 && glog == 1 ) LOG->info( __VA_ARGS__ ); } 37 | 38 | #define OUTPUT if ( grank == 0 && glog == 1 ) 39 | 40 | inline 41 | void PrintLogMsg( const char * fmt, ...
) 42 | { 43 | if ( grank == 0 && glog == 1 ) 44 | { 45 | va_list argPtr; 46 | va_start( argPtr, fmt ); 47 | vfprintf( stdout, fmt, argPtr ); 48 | va_end( argPtr ); 49 | fflush( stdout ); 50 | } 51 | } 52 | 53 | #endif 54 | -------------------------------------------------------------------------------- /doc/OpenMxP.slurm: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH -A stf018 3 | #SBATCH -J openmxp 4 | #SBATCH -p batch 5 | 6 | #SBATCH -N 32 7 | 8 | ##SBATCH -C nvme 9 | 10 | #SBATCH -t 0:30:00 11 | 12 | #SBATCH -o frontier_OpenMxP_01.out 13 | 14 | module load PrgEnv-gnu/8.3.3 15 | module load gcc 16 | 17 | module load rocm/5.1.0 18 | module load cray-mpich/8.1.18 19 | module load craype-x86-trento 20 | module load craype-network-ofi 21 | module load craype-accel-host 22 | 23 | export LD_LIBRARY_PATH=/opt/rocm-5.1.0/llvm/lib:${LD_LIBRARY_PATH} 24 | export LD_LIBRARY_PATH=/opt/rocm-5.1.0/lib:${LD_LIBRARY_PATH} 25 | 26 | #export LD_LIBRARY_PATH=/opt/rocm-5.4.0/llvm/lib:${LD_LIBRARY_PATH} 27 | #export LD_LIBRARY_PATH=/opt/rocm-5.4.0/lib:${LD_LIBRARY_PATH} 28 | 29 | export MPIR_CVAR_GPU_EAGER_DEVICE_MEM=0 30 | export MPICH_GPU_SUPPORT_ENABLED=1 31 | export MPICH_SMP_SINGLE_COPY_MODE=CMA 32 | 33 | echo "JobID : $SLURM_JOB_ID" 34 | echo "Number of Nodes: $SLURM_JOB_NUM_NODES" 35 | 36 | #SLURM_JOB_NODELIST SLURM_JOB_ID 37 | #echo $SLURM_JOB_NODELIST > "frontier_$SLURM_JOB_ID.nodes" 38 | #echo $SLURM_JOB_NODELIST 39 | 40 | nt=$(expr $SLURM_JOB_NUM_NODES \* 8) 41 | 42 | pq=24 43 | pq=28 44 | pq=32 45 | 46 | pq=16 47 | 48 | comm=4 49 | 50 | b=2560 51 | 52 | ln=92160 53 | ln=120320 54 | ln=122880 55 | ln=125440 56 | 57 | N=$(expr $pq \* $ln) 58 | 59 | export OMP_NUM_THREADS=7 60 | 61 | export NOTE="OpenMxP rocm 5.1.0 GPU-Direct" 62 | 63 | export CMD="../build/OpenMxP.x86_64 $N $b $pq -1 -comm 2 -alt 1 " 64 | srun -N $SLURM_JOB_NUM_NODES -n $nt -c 7 --ntasks-per-node=8 --gpus-per-task=1 --gpu-bind=closest $CMD 65 | 66 | export CMD="../build/OpenMxP.x86_64 $N $b $pq -1 -comm 2 -alt 2 " 67 | srun -N $SLURM_JOB_NUM_NODES -n $nt -c 7 --ntasks-per-node=8 --gpus-per-task=1 --gpu-bind=closest $CMD 68 | 69 | -------------------------------------------------------------------------------- /src/fp16_gpu_kernels.h: -------------------------------------------------------------------------------- 1 | #ifndef __fp16GPUKer 2 | #define __fp16GPUKer 3 | 4 | #include 5 | //#include 6 | //#include 7 | #include 8 | #include 9 | #include 10 | //#include 11 | 12 | #include "device_macros.h" 13 | 14 | using namespace std; 15 | 16 | 17 | __global__ void copyCoalesced(__half *odata, const float *idata, int olda, int ilda); 18 | /// HUGE ASSUMTION OF ilda = olda for copy lower and upper 19 | __global__ void copyCoalesced_lower(__half *odata, const float *idata, int olda, int ilda); 20 | __global__ void copyCoalesced_upper(__half *odata, const float *idata, int olda, int ilda); 21 | __global__ void copyCoalesced_lower(float *odata, const __half *idata, int olda, int ilda); 22 | __global__ void copyCoalesced_upper(float *odata, const __half *idata, int olda, int ilda); 23 | __global__ void transposeCoalesced(__half *odata, const float *idata, int olda, int ilda); 24 | 25 | // HOST CALLS TO LAUNCH KERNELS (LEFT) 26 | __host__ void half_conversion_left( const float *C, int b, int plda, __half *lp, int lplda); 27 | 28 | // HOST CALLS TO LAUNCH KERNELS (RIGHT TRANS) 29 | __host__ void half_conversion_right_trans(const float *C, int b, int plda, __half * rp, int rplda); 30 
| 31 | // Copy cast kernel 32 | __global__ void copyFtoH(__half *odata, const float *idata, long olda, long ilda); 33 | __host__ void downCast_copy_general(__half* out, long olda, long nrow, long ncol, float* C, long ilda); 34 | __host__ void downCast_copy_lower(__half* out, long olda, long nrow, long ncol, float* C, long ilda); 35 | __host__ void downCast_copy_upper(__half* out, long olda, long nrow, long ncol, float* C, long ilda); 36 | 37 | // Debug purpose 38 | __host__ void upCast_copy_lower(float* out, long olda, long nrow, long ncol, __half * C, long ilda); 39 | __host__ void upCast_copy_upper(float* out, long olda, long nrow, long ncol, __half * C, long ilda); 40 | 41 | // Alt solver 42 | __host__ void gen_identity_mat( float * out, long nrow, long ncol); 43 | __global__ void gen_identity_mat_kernel( float * out, long nrow, long ncol); 44 | __global__ void gen_identity_mat_kernel( __half * out, long nrow, long ncol); 45 | __host__ void gen_identity_mat( __half * out, long nrow, long ncol); 46 | 47 | // trans cast kernel 48 | __host__ void downCast_trans_general(__half* out, long olda, long nrow, long ncol , float* C, long ilda); 49 | __host__ void downCast_trans_general(__half* out, int olda, int nrow, int ncol , float* C, int ilda); 50 | 51 | #define VECTOR_OP_THREADS 128 52 | 53 | #endif 54 | 55 | 56 | -------------------------------------------------------------------------------- /src/gpu_init_kernels.h: -------------------------------------------------------------------------------- 1 | #ifndef GPU_INIT_KERNELS 2 | #define GPU_INIT_KERNELS 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | #include "hpl_rand.hpp" 9 | #include "device_macros.h" 10 | 11 | #define MATGEM_THREADS 512 12 | 13 | using namespace std; 14 | /* 15 | template 16 | void fill_panel_diag_host(int n, int b, T *A, int lda, RandStat stat_ij, RandCoeff incl1, 17 | RandCoeff jump_thread, int rowstart, RandStat stat_00, double* Diag, int nthreads); 18 | 19 | template 20 | void fill_panel_host(int n, int b, T *A, int lda, 21 | RandStat stat_ij, RandCoeff incl1, RandCoeff jump_thread, int nthreads ); 22 | 23 | template 24 | __global__ void fill_panel_dev(int n, int b, T * __restrict__ A, 25 | int lda, RandStat stat_ij, RandCoeff incl1, RandCoeff jump_dn); 26 | 27 | template 28 | __global__ void fill_panel_diag_dev(int n, int nb, T * __restrict__ A, int lda, RandStat stat_ij, RandCoeff incl1, 29 | RandCoeff jump_dn, RandStat stat_00, int rowstart, double* Diag); 30 | */ 31 | 32 | void fill_random(float *A, long m, long n, int n_threads, int blocksize_x, int work_per_thread); 33 | 34 | void fill_random(double *v, long m, int n_threads); 35 | 36 | __host__ void fill_random_fugaku(uint64_t N, RandStat stat0, RandCoeff inc1, float *A, double *row_sums, long m, long n, long i1, long j1, long b, long istride, long jstride, int n_threads, int blocksize_x, int work_per_thread); 37 | 38 | template __global__ void fill_random_fugaku_d(uint64_t N, RandStat stat0, RandCoeff inc1, F *A, double *row_sums, long m, long n, long i1, long j1, long b, long istride, long jstride, int work_per_thread); 39 | 40 | __host__ void compute_row_sums(float *A, double *row_sums, long m, long n, long i1, long j1, long b, long istride, long jstride, int n_threads, int blocksize_x, int work_per_thread); 41 | 42 | __global__ void compute_row_sums_d(float *A, double *row_sums, long m, long n, long i1, long j1, long b, long istride, long jstride, int work_per_thread); 43 | 44 | __host__ void fill_diag_rhs(int my_init, uint64_t N, RandStat 
stat0, RandCoeff inc1, float *A, double *row_sums_d, double *rhs, double *rhs_d, long m, long *diag_i_steps, long *diag_j_steps, long n_diag_blocks, long i1, long b, long istride, int n_threads); 45 | 46 | __global__ void fill_diag_rhs_d(int my_init, uint64_t N, RandStat stat0, RandCoeff inc1, float *A, double *row_sums_d, double *rhs, double *rhs_d, long m, long *diag_i_steps, long *diag_j_steps, long n_diag_entries, long i1, long b, long istride); 47 | 48 | template __global__ void minus_05(F *A, long m, long n, int work_per_thread); 49 | 50 | #endif 51 | -------------------------------------------------------------------------------- /src/getrf_nopiv.hpp: -------------------------------------------------------------------------------- 1 | #ifndef GETRF_NOPIV 2 | #define GETRF_NOPIV 3 | #include 4 | #include 5 | //#include "schur_updator.hpp" 6 | 7 | extern "C" void dtrsm_(...); 8 | extern "C" void strsm_(...); 9 | inline void trsmR(int m, int n, const double *a, int lda, double *b, int ldb) { 10 | double one = 1.; 11 | dtrsm_("R", "U", "N", "N", &m, &n, &one, a, &lda, b, &ldb); 12 | } 13 | inline void trsmR(int m, int n, const float *a, int lda, float *b, int ldb) { 14 | float one = 1.f; 15 | strsm_("R", "U", "N", "N", &m, &n, &one, a, &lda, b, &ldb); 16 | } 17 | inline void trsmL(int m, int n, double const *a, int lda, double *b, int ldb) { 18 | double one = 1.; 19 | dtrsm_("L", "L", "N", "U", &m, &n, &one, a, &lda, b, &ldb); 20 | } 21 | inline void trsmL(int m, int n, float const *a, int lda, float *b, int ldb) { 22 | float one = 1.f; 23 | strsm_("L", "L", "N", "U", &m, &n, &one, a, &lda, b, &ldb); 24 | } 25 | /****** Use as future reference for improvment ****** 26 | 27 | 28 | #define NSMALL 16 29 | template 30 | void getrf_nopiv_small(int n, F* a, size_t lda) { 31 | for(int k=0; k(1) / a[lda*k + k]; 33 | for(int i=k+1; i 47 | void getrf_nopiv(int n, F* a, size_t lda, bool warmup=false) { 48 | for(int k=0; knn){ 52 | trsmL(nn, n-k-nn, a+lda*k+k, lda, a+lda*(k+nn)+k, lda); 53 | trsmR(n-k-nn, nn, a+lda*k+k, lda, a+lda*k+k+nn, lda); 54 | gemmschur(n-k-nn, n-k-nn, nn, a+lda*k+k+nn, lda, 55 | a+lda*(k+nn)+k, lda, a+lda*(k+nn)+(k+nn), lda); 56 | } 57 | } 58 | } 59 | 60 | #if defined(__FUJITSU) || defined(__CLANG_FUJITSU) 61 | extern void sgetrf_nopiv_tuned(int n, float *a, size_t lda); 62 | #endif 63 | template<> 64 | void getrf_nopiv(int n, float* a, size_t lda, bool warmup) { 65 | #if defined(__FUJITSU) || defined(__CLANG_FUJITSU) 66 | if(!warmup){ 67 | sgetrf_nopiv_tuned(n, a, lda); 68 | return; 69 | } 70 | #endif 71 | for(int k=0; knn){ 75 | trsmL(nn, n-k-nn, a+lda*k+k, lda, a+lda*(k+nn)+k, lda); 76 | trsmR(n-k-nn, nn, a+lda*k+k, lda, a+lda*k+k+nn, lda); 77 | gemmschur(n-k-nn, n-k-nn, nn, a+lda*k+k+nn, lda, 78 | a+lda*(k+nn)+k, lda, a+lda*(k+nn)+(k+nn), lda); 79 | } 80 | } 81 | } 82 | 83 | // copy-first version 84 | template 85 | void getrf_nopiv(int n, F* a, size_t lda, F* piv, int ldpiv) { 86 | #pragma omp parallel for 87 | for(int j=0; j void warmup_trf(int n, F *a, size_t lda) { 103 | for (int k = 0; k < n; k++) { 104 | a[k + lda * k] = 1.0; 105 | } 106 | getrf_nopiv(n, a, lda, true); 107 | } 108 | #endif 109 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # OpenMxP: Open-Source Mixed Precision Computing 2 | - [OpenMxP: Open-Source Mixed Precision Computing](#openmxp-open-source-mixed-precision-computing) 3 | - [Build instructions ( Frontier/Crusher 
)](#build-instructions--frontiercrusher-) 4 | - [Running instructions ( Frontier/Crusher )](#running-instructions--frontiercrusher-) 5 | - [Comments](#comments) 6 | - [Build instruction (Summit)](#build-instruction-summit) 7 | - [Tuning Parameters](#tuning-parameters) 8 | - [Citation](#citation) 9 | - [Code Repo](#code-repo) 10 | - [SC22 Paper](#sc22-paper) 11 | - [Developers](#developers) 12 | - [Contributors](#contributors) 13 | 14 | 15 | 16 | 17 | ## Build instructions ( Frontier/Crusher ) 18 | 19 | ```sh 20 | cd OpenMxP 21 | mkdir build 22 | cd build 23 | cp ../doc/build_OpenMxP_frontier.sh . 24 | ``` 25 | That script runs `../doc/load_modules_frontier.sh`, which may need to be modified for different ROCm versions. 26 | 27 | ```sh 28 | ./build_OpenMxP_frontier.sh 29 | ``` 30 | You should now have an `OpenMxP.x86_64` binary. 31 | 32 | 33 | ## Running instructions ( Frontier/Crusher ) 34 | 35 | ```sh 36 | mkdir jobs 37 | cd jobs 38 | cp ../doc/OpenMxP.slurm . 39 | ``` 40 | Change this script to meet your needs. 41 | 42 | ```sh 43 | sbatch OpenMxP.slurm 44 | ``` 45 | Example output from Crusher is in `doc/crusher_example_32x32.out`. 46 | 47 | Constraints: PxQ = #GPUs, PxLN = QxLN, and B must be divisible by the TILE size. 48 | You must have at least 3 OpenMP threads. 49 | 50 | ### Comments 51 | 52 | OpenMxP is designed to run at scale. When it is run on a small number of nodes, 53 | performance will suffer due to the cost of the Iterative Refinement (IR). 54 | At larger scales, the IR time becomes an insignificant fraction of the run. 55 | 56 | There are requirements between N, B, PxQ ( the process grid ), and the local grid. 57 | Some are enforced while others are not. It is usually easiest to run square 58 | process grids ( PxQ ) whose dimensions are multiples of 8. The best B tends to be 2560, and the best 59 | performing local N (LN) tends to be 125440. This gives an N of P*LN. 60 | 61 | 62 | ## Build instruction (Summit) 63 | 64 | ```sh 65 | module load cmake gcc/7.4.0 cuda/11.2.0 openblas 66 | git clone git@github.com:at-aaims/OpenMxP 67 | cd OpenMxP && mkdir build && cd build 68 | ``` 69 | 70 | For a release build: 71 | 72 | ```sh 73 | cmake -DCMAKE_BUILD_TYPE=Release .. 74 | make 75 | ``` 76 | 77 | The default optimization level is `-O3`. 78 | 79 | For a debug build: 80 | 81 | ```sh 82 | cmake -DCMAKE_BUILD_TYPE=Debug .. 83 | make 84 | ``` 85 | This will have debug info built in.
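The tuning flags documented in the next section are passed on the command line after the positional size arguments. As a rough illustration, a hypothetical small-scale launch (values chosen only to satisfy the constraints above, reusing the `srun` options and argument order from `doc/OpenMxP.slurm`; these numbers are for shape, not tuned):

```sh
pq=4                      # 4x4 process grid -> needs 16 GPUs (2 nodes x 8 GPUs)
b=2560                    # block size
ln=122880                 # local N, a multiple of b
N=$(expr $pq \* $ln)      # N = P*LN
srun -N 2 -n 16 -c 7 --ntasks-per-node=8 --gpus-per-task=1 --gpu-bind=closest \
    ../build/OpenMxP.x86_64 $N $b $pq -1 -comm 2 -alt 1
```

Per the comments above, expect poor efficiency at a scale this small.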
86 | 87 | 88 | ## Tuning Parameters 89 | ``` 90 | -log 1 ( print rank 0 messages ) 91 | 92 | -solv 0 ( use blas ) 93 | 1 ( use solver ) # default (fastest) 94 | 95 | -comm 0 ( use ibcast ) 96 | 1 ( use bcast ) 97 | 2 ( use 1ring ) # default 98 | 3 ( use 1ringM ) 99 | 4 ( use 2ringM ) 100 | 101 | --numa 0 (Global Column Major) # default 102 | 1 ( Node Grid - 2x3C ) 103 | 2 ( Node Grid - 3x2C ) 104 | 3 ( Global Row Major ) 105 | 4 ( Node Grid - 2x4R ) 106 | 5 ( Node Grid - 2x4C ) 107 | 108 | -alt 0 (TRSM L/U panel) 109 | 1 (TRSM for Diagonal inverse) 110 | 2 (TRTRI for Diagonal inverse) 111 | 112 | 113 | -sync ( enable cuda device sync after sgemm - currently only for bcast ) 114 | ``` 115 | 116 | ## Citation 117 | 118 | 119 | ### Code Repo 120 | 121 | ``` 122 | @misc{doecode_102701, 123 | title = {OpenMxP - Open Source Mixed Precision Computing}, 124 | author = {Lu, Hao and Matheson, Michael and Wang, Feiyi and Joubert, Wayne and Ellis, Austin and Oles, Vladyslav}, 125 | doi = {10.11578/dc.20230315.3}, 126 | url = {https://doi.org/10.11578/dc.20230315.3}, 127 | howpublished = {[Computer Software] \url{https://doi.org/10.11578/dc.20230315.3}}, 128 | year = {2023}, 129 | month = {mar} 130 | } 131 | ``` 132 | 133 | ### SC22 Paper 134 | 135 | ```bibtex 136 | @inproceedings{10.5555/3571885.3571988, 137 | author = {Lu, Hao and Matheson, Michael and Oles, Vladyslav and Ellis, Austin and Joubert, Wayne and Wang, Feiyi}, 138 | title = {Climbing the Summit and Pushing the Frontier of Mixed Precision Benchmarks at Extreme Scale}, 139 | year = {2022}, 140 | isbn = {9784665454445}, 141 | publisher = {IEEE Press}, 142 | booktitle = {Proceedings of the International Conference on High Performance Computing, Networking, Storage and Analysis}, 143 | articleno = {78}, 144 | numpages = {15}, 145 | doi = {10.1109/SC41404.2022.00083}, 146 | keywords = {linear algebra, parallel programming, exascale computing, high performance computing}, 147 | location = {Dallas, Texas}, 148 | series = {SC '22} 149 | } 150 | ``` 151 | 152 | 153 | ## Developers 154 | * Hao Lu, 155 | * Michael Matheson, (Main Contact) 156 | * Wayne Joubert, 157 | * Feiyi Wang, 158 | * Vlad Oles, (Past) 159 | * Austin Ellis, (Past) 160 | 161 | ## Contributors 162 | * Jakub Kurzak 163 | * Alessandro Fanfarillo 164 | * Noel Chalmers 165 | * Nicolas Malaya Nicholas 166 | * Pak Niu Lui 167 | * Hui Liu 168 | * Mazda Sabony 169 | -------------------------------------------------------------------------------- /src/panel_check.hpp: -------------------------------------------------------------------------------- 1 | #ifndef PANEL_CHECK_HPP 2 | #define PANEL_CHECK_HPP 3 | // computing checksum of matrix for debuging. 
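// Descriptive note: the first panel_check() overload below accumulates
// sum(|a_ij|) separately over the diagonal (sigd), strictly upper (sigu), and
// strictly lower (sigl) parts across all ranks (MPI_SUM), so the printed
// "check" line can be diffed between runs. The HMGen overload instead reports
// the maximum relative deviation of each part from the expected 1 / beta /
// alpha pattern (MPI_MAX).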
4 | #include "grid.hpp" 5 | #include "highammgen.hpp" 6 | #include "panel.hpp" 7 | #include 8 | 9 | template void panel_check(Panels const &p, Grid &g) { 10 | double sigs[3] = {0., 0., 0.}; 11 | double sigd = 0., sigu = 0., sigl = 0.; 12 | size_t lda = p.lda; 13 | int b = p.b; 14 | int i1 = p.i1; 15 | int j1 = p.j1; 16 | int istride = p.istride; 17 | int jstride = p.jstride; 18 | int nprow = p.nprow; 19 | int npcol = p.npcol; 20 | //#pragma omp parallel for collapse(2) schedule(dynamic) 21 | //reduction(+:sigd,sigu,sigl) 22 | for (int j = 0; j < npcol; ++j) { 23 | for (int i = 0; i < nprow; ++i) { 24 | int jpos = j1 + j * jstride; 25 | int ipos = i1 + i * istride; 26 | FPanel const *data = p(i, j); 27 | if (ipos == jpos) { 28 | for (int jj = 0; jj < b; ++jj) { 29 | for (int ii = 0; ii < jj; ++ii) { 30 | double t = fabs(data[jj * lda + ii]); 31 | sigu += t; 32 | } 33 | sigd += fabs(data[jj * lda + jj]); 34 | for (int ii = jj + 1; ii < b; ++ii) { 35 | double t = fabs(data[jj * lda + ii]); 36 | sigl += t; 37 | } 38 | } 39 | } else if (ipos < jpos) { 40 | for (int jj = 0; jj < b; ++jj) 41 | for (int ii = 0; ii < b; ++ii) { 42 | double t = fabs(data[jj * lda + ii]); 43 | sigu += t; 44 | } 45 | } else { 46 | for (int jj = 0; jj < b; ++jj) 47 | for (int ii = 0; ii < b; ++ii) { 48 | double t = fabs(data[jj * lda + ii]); 49 | sigl += t; 50 | } 51 | } 52 | } 53 | } 54 | sigs[0] = sigd; 55 | sigs[1] = sigu; 56 | sigs[2] = sigl; 57 | MPI_Allreduce(MPI_IN_PLACE, sigs, 3, MPI_DOUBLE, MPI_SUM, g.commworld); 58 | if (g.row == 0 && g.col == 0) { 59 | std::printf("check %22.17e %22.17e, %22.17e\n", sigs[0], sigs[1], 60 | sigs[2]); 61 | std::fflush(stdout); 62 | } 63 | } 64 | 65 | template 66 | void panel_check(HMGen const &hmg, Panels const &p, Grid &g) { 67 | double sigs[3] = {0., 0., 0.}; 68 | double sigd = 0., sigu = 0., sigl = 0.; 69 | size_t lda = p.lda; 70 | int b = p.b; 71 | int i1 = p.i1; 72 | int j1 = p.j1; 73 | int istride = p.istride; 74 | int jstride = p.jstride; 75 | int nprow = p.nprow; 76 | int npcol = p.npcol; 77 | double alpha = hmg.alpha; 78 | double beta = hmg.beta; 79 | double done = 1; 80 | //#pragma omp parallel for collapse(2) schedule(dynamic) 81 | //reduction(+:sigd,sigu,sigl) 82 | for (int j = 0; j < npcol; ++j) { 83 | for (int i = 0; i < nprow; ++i) { 84 | int jpos = j1 + j * jstride; 85 | int ipos = i1 + i * istride; 86 | FPanel const *data = p(i, j); 87 | double blockmax = 0.; 88 | if (ipos == jpos) { 89 | for (int jj = 0; jj < b; ++jj) { 90 | for (int ii = 0; ii < jj; ++ii) { 91 | double t = fabs(data[jj * lda + ii] - beta) / beta; 92 | // printf("9871 %d %d %e %e %e\n", b*ipos+ii, b*jpos+jj, 93 | // data[jj*lda+ii], beta, t); 94 | sigu = sigu > t ? sigu : t; 95 | blockmax = blockmax > t ? blockmax : t; 96 | } 97 | double t = fabs(data[jj * lda + jj] - done); 98 | // if(t>1e-1) printf("9871 %d %d %e %e %e\n", b*jpos+jj, 99 | // b*jpos+jj, data[jj*lda+jj], done, t); 100 | sigd = sigd > t ? sigd : t; 101 | blockmax = blockmax > t ? blockmax : t; 102 | for (int ii = jj + 1; ii < b; ++ii) { 103 | double t = fabs(data[jj * lda + ii] - alpha) / alpha; 104 | // printf("9871 %d %d %e %e %e\n", b*ipos+ii, b*jpos+jj, 105 | // data[jj*lda+ii], alpha, t); 106 | sigl = sigl > t ? sigl : t; 107 | blockmax = blockmax > t ? 
blockmax : t; 108 | } 109 | } 110 | } else if (ipos < jpos) { 111 | for (int jj = 0; jj < b; ++jj) 112 | for (int ii = 0; ii < b; ++ii) { 113 | double t = fabs(data[jj * lda + ii] - beta) / beta; 114 | // printf("9871 %d %d %e %e %e\n", b*ipos+ii, b*jpos+jj, 115 | // data[jj*lda+ii], beta, t); 116 | sigu = sigu > t ? sigu : t; 117 | blockmax = blockmax > t ? blockmax : t; 118 | } 119 | } else { 120 | for (int jj = 0; jj < b; ++jj) 121 | for (int ii = 0; ii < b; ++ii) { 122 | double t = fabs(data[jj * lda + ii] - alpha) / alpha; 123 | // printf("9871 %d %d %e %e %e\n", b*ipos+ii, b*jpos+jj, 124 | // data[jj*lda+ii], alpha, t); 125 | sigl = sigl > t ? sigl : t; 126 | blockmax = blockmax > t ? blockmax : t; 127 | } 128 | } 129 | /*if(g.row==0&&g.col==0) { 130 | printf("9871 %d %d %e\n", ipos, jpos, blockmax); 131 | fflush(stdout); 132 | }*/ 133 | } 134 | } 135 | sigs[0] = sigd; 136 | sigs[1] = sigu; 137 | sigs[2] = sigl; 138 | MPI_Allreduce(MPI_IN_PLACE, sigs, 3, MPI_DOUBLE, MPI_MAX, g.commworld); 139 | if (g.row == 0 && g.col == 0) { 140 | std::printf("check %22.17e %22.17e, %22.17e\n", sigs[0], sigs[1], 141 | sigs[2]); 142 | std::fflush(stdout); 143 | } 144 | } 145 | 146 | #endif 147 | -------------------------------------------------------------------------------- /src/higham_mat_impl.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2020, Massimiliano Fasi and Nicholas J. Higham 2 | // All rights reserved. 3 | // 4 | // Redistribution and use in source and binary forms, with or without 5 | // modification, are permitted provided that the following conditions are met: 6 | // 7 | // * Redistributions of source code must retain the above copyright notice, this 8 | // list of conditions and the following disclaimer. 9 | // 10 | // * Redistributions in binary form must reproduce the above copyright notice, 11 | // this list of conditions and the following disclaimer in the documentation 12 | // and/or other materials provided with the distribution. 13 | // 14 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 15 | // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 16 | // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 17 | // ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE 18 | // LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 19 | // CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 20 | // SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 21 | // INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 22 | // CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 23 | // ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 24 | // POSSIBILITY OF SUCH DAMAGE. 25 | 26 | // The above copyright notice and the code are from 27 | // https://github.com/higham/hpl-ai-matrix 28 | 29 | // This file is a manual translation of the above software. 30 | #include <assert.h> 31 | #include <float.h> 32 | #include <math.h> 33 | #include <stdio.h> 34 | #define MAX(A, B) ((A) > (B) ? (A) : (B)) 35 | #define MIN(A, B) ((A) > (B) ? (B) : (A)) 36 | double fhpl(int n, double alpha, double beta) { 37 | // compute the inf-norm condition number of the matrix with alpha and beta 38 | // FHPL Value of cond(A,inf) for matrix A(n,a,b).
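// Explanatory note: cond(A,inf) = ||A||_inf * ||A^-1||_inf. Below, na_est
// estimates ||A||_inf as the largest of three candidate row sums (lambda_1,
// lambda_idash, lambda_n), ninva_est estimates ||A^-1||_inf from two candidate
// row sums of the inverse (delta1, deltan), and fhpl returns their product
// (clamped to DBL_MAX if it overflows).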
39 | if (isnan(alpha)) 40 | alpha = beta / 2; 41 | double a = alpha, b = beta; 42 | double lambda_1 = 1 + (n - 1) * b; 43 | int idash = MIN((int)floor(1. / a), n); 44 | int k = MIN((int)floor((1 + b) / b), n - 1); 45 | double lambda_idash = 46 | 1 + (2 * k - idash + 1) * a + (n - idash) * b + 47 | (-k * k + k + 3 * idash * (idash - 1) / 2 - n * idash + n) * a * b; 48 | double lambda_n = 49 | 1 + (2 * k - n + 1) * a + (-k * k + k + n * (n - 1) / 2) * a * b; 50 | double na_est = MAX(MAX(lambda_1, lambda_idash), lambda_n); 51 | double r = (1 + a) * (1 + b); 52 | int i = 1; 53 | double delta1 = 54 | (1 + a) * 55 | (1. / (1 + a) + (r == 0. ? 0. : b * (1 - pow(r, n - 1)) / (1 - r))); 56 | double deltan = pow(1 + a, n) * (1. / (1 + a)); 57 | double ninva_est = MAX(delta1, deltan); 58 | // printf("Z %e %e %e %e %e %e %e %e %e\n", a, b, lambda_1, lambda_idash, 59 | // lambda_n, delta1, deltan, na_est, ninva_est); 60 | double ret = na_est * ninva_est; 61 | if (isinf(ret)) 62 | return DBL_MAX; 63 | else 64 | return ret; 65 | } 66 | 67 | template double zero_find(F f, double left, double right) { 68 | // bisection method 69 | // the brent method consumes half # of f evaluations, it's not good enought 70 | // for complication 71 | double fl = f(left); 72 | double fr = f(right); 73 | if (fl > 0. || fr < 0.) 74 | return 0. / 0.; // nan 75 | while (true) { 76 | double tol1 = (2. * fabs(right) + 0.5) * DBL_EPSILON; 77 | if (right - left < tol1) 78 | break; 79 | double middle = (left + right) / 2; 80 | if (middle == left || middle == right) 81 | break; 82 | double fm = f(middle); 83 | // printf("%e %e %e :: %e %e %e\n", left, middle, right, fl, fm, fr); 84 | if (fm == 0.) 85 | return middle; 86 | else if (fm < 0.) { 87 | left = middle; 88 | fl = fm; 89 | } else { 90 | right = middle; 91 | fr = fm; 92 | } 93 | } 94 | return right; 95 | } 96 | 97 | extern "C" double higham_mat_comp_beta(int n, double kappa, double rho) { 98 | // % Compute alpha and beta to give cond(A,inf) = kappa. 99 | double left = DBL_EPSILON; 100 | double left_val = fhpl(n, rho * left, left) - kappa; 101 | assert(left_val < 0.); 102 | double right = 1. / rho; 103 | int k = 1; 104 | while (true) { 105 | double right_val = fhpl(n, rho * right, right) - kappa; 106 | if (isfinite(right_val) && right_val > 0.) 107 | break; 108 | // %fprintf('F at right endpoint, right = %9.2e, is %9.2e.\n', right, 109 | // right_val) 110 | right *= 0.5; 111 | ++k; 112 | if (k == 100) 113 | break; 114 | } 115 | double beta = zero_find( 116 | [=](double x) -> double { return fhpl(n, rho * x, x) - kappa; }, left, 117 | right); 118 | double alpha = rho * beta; 119 | while (alpha > 1.) { 120 | // fprintf('Initial alpha = %9.2e exceeds 1 so recomputing.\n', alpha) 121 | right *= 0.5; 122 | right = right / 2; 123 | beta = zero_find( 124 | [=](double x) -> double { return fhpl(n, rho * x, x) - kappa; }, 125 | left, right); 126 | alpha = rho * beta; 127 | } 128 | return beta; 129 | } 130 | 131 | extern "C" void hplai_matrix_impl(int n, double *a, int lda, double alpha, 132 | double beta) { 133 | for (int j = 0; j < n; ++j) { 134 | for (int i = 0; i < j; ++i) { 135 | a[j * lda + i] = -beta + alpha * beta * i; 136 | } 137 | a[j * lda + j] = 1. 
+ alpha * beta * j; 138 | for (int i = j + 1; i < n; ++i) { 139 | a[j * lda + i] = -alpha + alpha * beta * j; 140 | } 141 | } 142 | } 143 | 144 | extern "C" void hplai_matrix(int n, double *a, int lda, double kappa) { 145 | double rho = 0.5; 146 | double beta = higham_mat_comp_beta(n, kappa, rho); 147 | double alpha = rho * beta; 148 | hplai_matrix_impl(n, a, lda, alpha, beta); 149 | } 150 | 151 | #if 0 152 | #include 153 | int main() 154 | { 155 | int n = 10; 156 | double kappa = 1000; 157 | double rho = 0.125; 158 | for(int n=10; n<100000000; n=(n*3)/2){ 159 | double beta = comp_beta(n, kappa, rho); 160 | printf("%d %e %e\n", n, beta, rho*beta*beta*n); 161 | } 162 | return 0; 163 | } 164 | #endif 165 | -------------------------------------------------------------------------------- /src/gpu_init.hpp: -------------------------------------------------------------------------------- 1 | #ifndef GPU_INIT 2 | #define GPU_INIT 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | #include "gpu_init_kernels.h" 10 | #include "hpl_rand.hpp" 11 | #include "panel.hpp" 12 | 13 | //#include "device_macros.h" 14 | 15 | using namespace std; 16 | 17 | #if 0 18 | template 19 | void gpu_pgen(int my_init, Matgen &mg, Panels &p, LRPanels<__half> *lr, int n, double *diag, double *rhs) 20 | { 21 | F *pp_d = p(0, 0, 'd'), *pp = p(0, 0); 22 | long const nprow = p.nprow, npcol = p.npcol, b = p.b, i1 = p.i1, j1 = p.j1, istride = p.istride, jstride = p.jstride, p_m = nprow * b, p_n = npcol * b; 23 | size_t const p_size = (size_t)p_m * (size_t)p_n; 24 | int n_threads = 1024, blocksize_x = 32, work_per_thread = 32; 25 | double *local_row_sums, *local_row_sums_d = (double *)lr[0].d_p; 26 | 27 | // Generate matrix entries in [-0.5, 0.5] and compute local row sums and copy them to host 28 | if (my_init == 1) 29 | { 30 | fill_random(pp_d, p_m, p_n, n_threads, blocksize_x, work_per_thread); 31 | compute_row_sums(pp_d, local_row_sums_d, p_m, p_n, i1, j1, b, istride, jstride, n_threads, blocksize_x, work_per_thread); 32 | } 33 | else if (my_init == 3) 34 | { 35 | fill_random_fugaku(mg.n, RandStat::initialize(mg.seed), RandCoeff::default_vals(), pp_d, local_row_sums_d, p_m, p_n, i1, j1, b, istride, jstride, n_threads, blocksize_x, work_per_thread); 36 | } 37 | GPU_DEVICE_SYNCHRONIZE(); 38 | checkGPU(GPU_MALLOC_HOST((void **)&local_row_sums, n * sizeof(double)), 39 | " %s\n", hostRank, "Allocating host[cpu] local row sums"); 40 | GPU_MEMCPY(local_row_sums, local_row_sums_d, n * sizeof(double), GPU_MEMCPY_DEVICE_TO_HOST); 41 | 42 | // Compute global row sums and copy them to device 43 | double *global_row_sums, *global_row_sums_d = (double *)lr[1].d_p; 44 | 45 | checkGPU(GPU_MALLOC_HOST((void **)&global_row_sums, n * sizeof(double)), 46 | " %s\n", hostRank, "Allocating host[cpu] global row sums"); 47 | MPI_Allreduce(local_row_sums, global_row_sums, n, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD); 48 | GPU_MEMCPY(global_row_sums_d, global_row_sums, n * sizeof(double), GPU_MEMCPY_HOST_TO_DEVICE); 49 | 50 | // Find diagonal blocks owned by the rank 51 | long j, j_step, local_row, local_col, global_row, diag_entry_idx; 52 | double diag_val; 53 | long n_diag_blocks = 0, *diag_i_steps, *diag_j_steps; 54 | checkGPU( GPU_MALLOC_MANAGED( &diag_i_steps, nprow * sizeof(long) ), 55 | " %s\n", hostRank, "Allocating managed memory - diag_i_steps" ); 56 | checkGPU( GPU_MALLOC_MANAGED( &diag_j_steps, npcol * sizeof(long) ), 57 | " %s\n", hostRank, "Allocating managed memory - diag_j_steps" ); 58 | 59 | for (long i_step=0; i_step < 
nprow; i_step++) 60 | { 61 | // j == i in diagonal blocks 62 | j = i1 + i_step * istride; 63 | j_step = (j - j1) / jstride; 64 | if ((j - j1) % jstride == 0 && j_step >= 0 && j_step < npcol) 65 | { 66 | // rank owns diagonal entries j*b through j*b + b - 1 67 | diag_i_steps[n_diag_blocks] = i_step; 68 | diag_j_steps[n_diag_blocks] = j_step; 69 | n_diag_blocks++; 70 | 71 | } 72 | } 73 | 74 | 75 | // If the rank owns any diagonal entries... 76 | if (n_diag_blocks) 77 | { 78 | // Fill in the diagonal and generate rhs 79 | long n_diag_entries = n_diag_blocks * b; 80 | double *rhs_d; 81 | 82 | checkGPU(GPU_MALLOC((void**)&rhs_d, n_diag_entries * sizeof(double)), 83 | " %s\n", hostRank, "Allocate device[gpu] rhs"); 84 | if (my_init == 1) 85 | { 86 | fill_random(rhs_d, n_diag_entries, n_threads); 87 | } 88 | 89 | fill_diag_rhs(my_init, mg.n, RandStat::initialize(mg.seed), RandCoeff::default_vals(), pp_d, global_row_sums_d, rhs, rhs_d, p_m, diag_i_steps, diag_j_steps, n_diag_blocks, i1, b, istride, n_threads); 90 | 91 | // Copy diag to host 92 | long i_step; 93 | for (long diag_block_idx = 0; diag_block_idx < n_diag_blocks; diag_block_idx++) 94 | { 95 | i_step = diag_i_steps[diag_block_idx]; 96 | GPU_MEMCPY(diag + i_step*b, global_row_sums_d + (i1 + i_step*istride)*b, b * sizeof(double), GPU_MEMCPY_DEVICE_TO_HOST); 97 | } 98 | 99 | // Copy rhs to host 100 | GPU_MEMCPY(rhs, rhs_d, n_diag_entries * sizeof(double), GPU_MEMCPY_DEVICE_TO_HOST); 101 | 102 | } 103 | 104 | // Copy panel to host 105 | GPU_MEMCPY(pp, pp_d, p_size * sizeof(float), GPU_MEMCPY_DEVICE_TO_HOST); 106 | 107 | // Copy panel to permanent storage if using our generation 108 | if (my_init == 1) 109 | { 110 | checkGPU(GPU_MALLOC_HOST((void **)&p.p_init, p.alloc_size), 111 | " %s\n", hostRank, "Allocate host[cpu] p_init"); 112 | memcpy(p.p_init, pp, p_size * sizeof(F)); 113 | } 114 | } 115 | 116 | template 117 | void gpu_pgen2(Panels &p, Grid& grid) 118 | { 119 | F *pp_d = p(0, 0, 'd'), *pp = p(0, 0); 120 | int b = p.b; 121 | fill_random2(p(0,0,'d') , p.nprow*b, p.npcol*b); 122 | 123 | GPU_MEMCPY(p(0,0),p(0,0,'d'), (size_t)p.nprow* (size_t)b* (size_t)p.npcol*(size_t)b*sizeof(F), 124 | GPU_MEMCPY_DEVICE_TO_HOST); 125 | double *local_row_sums, *global_row_sums; 126 | checkGPU( GPU_MALLOC_MANAGED( (void**)&local_row_sums, b*sizeof(double) ), 127 | " %s\n", hostRank, "Allocate managed memory - local row sums" ); 128 | checkGPU( GPU_MALLOC_MANAGED( (void**)&global_row_sums, b*sizeof(double) ), 129 | " %s\n", hostRank, "Allocate managed memory - global row sums" ); 130 | 131 | for (int k = 0; k < p.nblocks; ++k) { 132 | // position of the panels to decomp in process grid 133 | int const rootrow = k % grid.nrow; 134 | int const rootcol = k % grid.ncol; 135 | // position of the panels to decomp in local matrix 136 | int i = k / grid.nrow + (rootrow > grid.row ? 1 : 0); 137 | int j = k / grid.ncol + (rootcol > grid.col ? 
1 : 0); 138 | 139 | if (rootrow == grid.row && rootcol == grid.col) { 140 | compute_row_sums2(p(i,0,'d'),local_row_sums, b, p.npcol*b, p.lda); 141 | GPU_DEVICE_SYNCHRONIZE(); 142 | 143 | MPI_Allreduce(local_row_sums, global_row_sums, b, MPI_DOUBLE, MPI_SUM, grid.hcomm); 144 | pp = p(i,j); 145 | #pragma omp parallel for 146 | for (int i_step=0; i_step < b; i_step++) 147 | { 148 | pp[ i_step + i_step*p.lda]= static_cast(global_row_sums[i_step]); 149 | } 150 | } 151 | else if(rootrow != grid.row) 152 | { 153 | //chill 154 | } 155 | else if(rootrow == grid.row) 156 | { 157 | compute_row_sums2(p(i,0,'d'),local_row_sums, b, p.npcol*b, p.lda); 158 | GPU_DEVICE_SYNCHRONIZE(); 159 | MPI_Allreduce(local_row_sums, global_row_sums, b, MPI_DOUBLE, MPI_SUM, grid.hcomm); 160 | } 161 | } 162 | GPU_FREE(local_row_sums); 163 | GPU_FREE(global_row_sums); 164 | } 165 | #endif 166 | 167 | #endif 168 | -------------------------------------------------------------------------------- /src/panel_norm.hpp: -------------------------------------------------------------------------------- 1 | #ifndef PANEL_NORM_HPP 2 | #define PANEL_NORM_HPP 3 | #include "grid.hpp" 4 | #include "highammgen.hpp" 5 | #include "panel.hpp" 6 | #include 7 | #define NORM_THREADS 1024 8 | //#include 9 | //#include 10 | 11 | __global__ void calc_infnorm_d(int b, double* __restrict__ x, double* __restrict__ result) { 12 | __shared__ double sdata[NORM_THREADS]; 13 | 14 | size_t id = threadIdx.x; 15 | sdata[id] = 0.0; 16 | 17 | for (int j = 0; j + id < b; j += NORM_THREADS) 18 | { 19 | sdata[id] = fmax(sdata[id], fabs(x[id+j])); 20 | } 21 | __syncthreads(); 22 | 23 | for (unsigned int s = NORM_THREADS / 2; s > 0; s >>= 1) { 24 | if (id < s) { 25 | sdata[id] = fmax(sdata[id], sdata[id + s]); 26 | } 27 | __syncthreads(); 28 | } 29 | 30 | if (id == 0) { 31 | result[id] = sdata[0]; 32 | } 33 | } 34 | 35 | template 36 | double colv_infnorm_h(Panels const &p, double *dx, Grid &g, double* workspace) { 37 | // computes the inf-norm of the distributed column vector dx. 38 | // descriptros are derived from p 39 | int nprow = p.nprow; 40 | int b = p.b; 41 | int i1 = p.i1; 42 | int j1 = p.j1; 43 | int istride = p.istride; 44 | int jstride = p.jstride; 45 | double norm = 0.; 46 | double t = 0; 47 | for (int i = 0; i < nprow; ++i) { 48 | int ipos = i1 + i * istride; 49 | if ((ipos % jstride) == j1) { 50 | calc_infnorm_d<<<1,NORM_THREADS>>>(b, dx + b * i, workspace); 51 | GPU_DEVICE_SYNCHRONIZE(); 52 | GPU_MEMCPY(&t, workspace, sizeof(double), GPU_MEMCPY_DEVICE_TO_HOST); 53 | norm = norm >= t ? norm : t; 54 | } 55 | } 56 | MPI_Allreduce(MPI_IN_PLACE, &norm, 1, MPI_DOUBLE, MPI_MAX, g.commworld); 57 | return norm; 58 | } 59 | 60 | template F calc_infnorm(int n, F const *x) { 61 | F norm = static_cast(0); 62 | #pragma omp parallel for simd reduction(max : norm) 63 | for (int i = 0; i < n; ++i) { 64 | F t = x[i]; 65 | t = (t >= static_cast(0) ? t : -t); 66 | norm = (norm >= t ? norm : t); 67 | } 68 | return norm; 69 | } 70 | template 71 | double colv_infnorm(Panels const &p, double *dx, Grid &g) { 72 | // computes the inf-norm of the distributed column vector dx. 73 | // descriptros are derived from p 74 | int nprow = p.nprow; 75 | int b = p.b; 76 | int i1 = p.i1; 77 | int j1 = p.j1; 78 | int istride = p.istride; 79 | int jstride = p.jstride; 80 | double norm = 0.; 81 | for (int i = 0; i < nprow; ++i) { 82 | int ipos = i1 + i * istride; 83 | if ((ipos % jstride) == j1) { 84 | double t = calc_infnorm(b, dx + b * i); 85 | norm = norm >= t ? 
norm : t; 86 | } 87 | } 88 | MPI_Allreduce(MPI_IN_PLACE, &norm, 1, MPI_DOUBLE, MPI_MAX, g.commworld); 89 | return norm; 90 | } 91 | 92 | template 93 | double panel_infnorm(Matgen const &mg, Panels const &p, 94 | double *w, double *piv, Grid &g) { 95 | // compute the inf-norm of the matrix. 96 | // w and piv are working buffer. 97 | 98 | // matrix inf-norm is the inf-norm of the row 1-norms. 99 | int b = p.b; 100 | int i1 = p.i1; 101 | int j1 = p.j1; 102 | int istride = p.istride; 103 | int jstride = p.jstride; 104 | int nprow = p.nprow; 105 | int npcol = p.npcol; 106 | for (int i = 0; i < b * nprow; ++i) 107 | w[i] = 0.; 108 | for (int j = 0; j < npcol; ++j) { 109 | int jpos = j1 + j * jstride; 110 | for (int i = 0; i < nprow; ++i) { 111 | int ipos = i1 + i * istride; 112 | fill_one_panel_with_rand(mg.n, b * ipos, b * jpos, b, b, piv, b, 113 | mg.seed, true); 114 | for (int jj = 0; jj < b; ++jj) 115 | for (int ii = 0; ii < b; ++ii) { 116 | double t = piv[jj * b + ii]; 117 | w[b * i + ii] += (t < 0. ? -t : t); 118 | } 119 | } 120 | } 121 | MPI_Allreduce(MPI_IN_PLACE, w, b * nprow, MPI_DOUBLE, MPI_SUM, g.hcomm); 122 | double norm = 0.; 123 | for (int i = 0; i < b * nprow; ++i) 124 | norm = (norm >= w[i] ? norm : w[i]); 125 | MPI_Allreduce(MPI_IN_PLACE, &norm, 1, MPI_DOUBLE, MPI_MAX, g.vcomm); 126 | return norm; 127 | } 128 | 129 | template 130 | double hpl_infnorm(Panels const &p, double *d, Grid &g) { 131 | // the diagonal of the hpl-ai matrix is the sum of the absolute values of 132 | // the off-diagonals on the same row. therefore, twice of the diagonal is 133 | // the l1-norm of that row. 134 | return 2. * colv_infnorm(p, d, g); 135 | } 136 | 137 | template 138 | double higham_infnorm(HMGen const &mg, Panels const &p, 139 | double *w, Grid &g) { 140 | int b = p.b; 141 | int i1 = p.i1; 142 | int j1 = p.j1; 143 | int istride = p.istride; 144 | int jstride = p.jstride; 145 | int nprow = p.nprow; 146 | int npcol = p.npcol; 147 | double alpha = mg.alpha; 148 | double beta = mg.beta; 149 | double ab = alpha * beta; 150 | for (int i = 0; i < b * nprow; ++i) 151 | w[i] = 0.; 152 | #pragma omp parallel for 153 | for (int i = 0; i < nprow; ++i) { 154 | int ipos = i1 + i * istride; 155 | for (int j = 0; j < npcol; ++j) { 156 | int jpos = j1 + j * jstride; 157 | if (ipos == jpos) { 158 | for (int jj = 0; jj < b; ++jj) { 159 | for (int ii = 0; ii < jj; ++ii) { 160 | double aij = beta + ab * (b * ipos + ii); 161 | w[b * i + ii] += aij; 162 | } 163 | w[b * j + jj] += 1. + ab * (b * jpos + jj); 164 | for (int ii = jj + 1; ii < b; ++ii) { 165 | double aij = alpha + ab * (b * jpos + jj); 166 | w[b * i + ii] += aij; 167 | } 168 | } 169 | } else if (ipos < jpos) { 170 | for (int jj = 0; jj < b; ++jj) { 171 | for (int ii = 0; ii < b; ++ii) { 172 | double aij = beta + ab * (b * ipos + ii); 173 | w[b * i + ii] += aij; 174 | } 175 | } 176 | } else { 177 | for (int jj = 0; jj < b; ++jj) { 178 | for (int ii = 0; ii < b; ++ii) { 179 | double aij = alpha + ab * (b * jpos + jj); 180 | w[b * i + ii] += aij; 181 | } 182 | } 183 | } 184 | } 185 | } 186 | MPI_Allreduce(MPI_IN_PLACE, w, b * nprow, MPI_DOUBLE, MPI_SUM, g.hcomm); 187 | double norm = 0.; 188 | for (int i = 0; i < b * nprow; ++i) 189 | norm = (norm >= w[i] ? 
norm : w[i]); 190 | MPI_Allreduce(MPI_IN_PLACE, &norm, 1, MPI_DOUBLE, MPI_MAX, g.vcomm); 191 | return norm; 192 | } 193 | 194 | #endif 195 | -------------------------------------------------------------------------------- /src/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | EXECUTE_PROCESS( COMMAND uname -m COMMAND tr -d '\n' OUTPUT_VARIABLE ARCHITECTURE ) 2 | message( STATUS "Building HPL-AI for Architecture: ${ARCHITECTURE}" ) 3 | 4 | # FRONTIER / SPOCK language and definitions 5 | if( ${ARCHITECTURE} STREQUAL "x86_64" ) 6 | 7 | # Requires atleast `module load cmake/3.21.2-dev`` 8 | cmake_minimum_required(VERSION 3.21 FATAL_ERROR) 9 | set(ACCEL_COMPILER HIP) 10 | add_compile_definitions(ROCM_OLCF_PLATFORM) 11 | 12 | # SUMMIT language and definitions 13 | elseif( ${ARCHITECTURE} STREQUAL "ppc64le") 14 | cmake_minimum_required(VERSION 3.20 FATAL_ERROR) 15 | set(ACCEL_COMPILER CUDA) 16 | add_compile_definitions(CUDA_OLCF_PLATFORM) 17 | 18 | endif() 19 | 20 | # enable CUDA or HIP 21 | enable_language(${ACCEL_COMPILER}) 22 | project(openMxP LANGUAGES CXX) 23 | 24 | # FRONTIER / SPOCK packages 25 | if( ${ARCHITECTURE} STREQUAL "x86_64" ) 26 | set(HIP_PATH $ENV{HIP_PATH} CACHE PATH "PATH to which HIP has been installed") 27 | set(ROCM_PATH $ENV{ROCM_PATH} CACHE PATH "PATH to which ROCM has been installed") 28 | set(CRAY_MPICH_DIR $ENV{CRAY_MPICH_DIR} CACHE PATH "PATH to Cray MPICH ROOT") 29 | 30 | set( GPU_TARGETS "gfx90a" CACHE STRING "GPU TARGETS" ) 31 | set( AMPGPU_TARGETS "gfx90a" CACHE STRING "GPU TARGETS" ) 32 | set(CMAKE_MODULE_PATH "${HIP_PATH}/cmake" ${CMAKE_MODULE_PATH}) 33 | find_package(HIP REQUIRED) 34 | # find_package(MPI REQUIRED) 35 | find_package(rocblas REQUIRED) 36 | find_package(rocsolver REQUIRED) 37 | find_package(rocrand REQUIRED) 38 | find_package(hiprand REQUIRED) 39 | find_package(OpenMP REQUIRED) 40 | 41 | # For GPU Direct on AMD must export these variables: 42 | # export MPIR_CVAR_GPU_EAGER_DEVICE_MEM=0 43 | # export MPICH_GPU_SUPPORT_ENABLED=1 44 | find_library(GTL mpi_gtl_hsa /opt/cray/pe/mpich/default/gtl/lib) 45 | 46 | if(HIP_FOUND) 47 | message(STATUS "Found HIP: " ${HIP_VERSION}) 48 | else() 49 | message(FATAL_ERROR "Could not find HIP. 
Ensure that HIP is either installed in /opt/rocm-4.x.x/hip or the variable HIP_PATH is set to point to the right location.") 50 | endif() 51 | 52 | # find_library(GTL 53 | # NAMES mpi_gtl_hsa 54 | # PATHS ${CRAY_MPICH_ROOTDIR}/gtl/lib 55 | # REQUIRED) 56 | #file(GLOB HIP_CLANGRT_LIB_SEARCH_PATHS "${CMAKE_HIP_COMPILER}/../lib/clang/*/lib/*") 57 | # file(GLOB HIP_CLANGRT_LIB_SEARCH_PATHS "/opt/rocm-5.0.2/llvm/lib/clang/14.0.0/lib/linux/*" ) 58 | find_library(CLANGRT_BUILTINS clang_rt.builtins-x86_64 /opt/rocm-5.0.2/llvm/lib/clang/14.0.0/lib/linux ) 59 | 60 | # SUMMIT packages and compiling 61 | elseif( ${ARCHITECTURE} STREQUAL "ppc64le") 62 | if (NOT DEFINED CMAKE_CUDA_ARCHITECTURES) 63 | set(CMAKE_CUDA_ARCHITECTURES 70) 64 | endif() 65 | 66 | # ensure atomicAdd() is implemented for FP64 67 | if(CMAKE_CUDA_ARCHITECTURES LESS 60) 68 | set(CMAKE_CUDA_ARCHITECTURES 60) 69 | endif() 70 | 71 | find_package(CUDA 11.4 REQUIRED) 72 | find_package(MPI) 73 | 74 | set(BLA_VENDOR OpenBLAS) 75 | find_package(BLAS) 76 | 77 | endif() 78 | 79 | 80 | # NON-ACCELERATOR SPECIFIC 81 | 82 | # c++ standard 83 | set(CMAKE_CXX_STANDARD 17) 84 | set(CMAKE_CXX_STANDARD_REQUIRED ON) 85 | 86 | # gcc version check 87 | if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU") 88 | if (CMAKE_CXX_COMPILER_VERSION VERSION_LESS "6.4.0") 89 | message(FATAL_ERROR "Insufficient gcc version, require 6.4.0+") 90 | endif() 91 | endif() 92 | 93 | 94 | # Get the current working branch 95 | execute_process( 96 | COMMAND git rev-parse --abbrev-ref HEAD 97 | WORKING_DIRECTORY ${CMAKE_SOURCE_DIR} 98 | OUTPUT_VARIABLE GIT_BRANCH 99 | OUTPUT_STRIP_TRAILING_WHITESPACE) 100 | 101 | # Get the latest commit hash 102 | execute_process( 103 | COMMAND git rev-parse HEAD 104 | WORKING_DIRECTORY ${CMAKE_SOURCE_DIR} 105 | OUTPUT_VARIABLE GIT_COMMIT_HASH 106 | OUTPUT_STRIP_TRAILING_WHITESPACE) 107 | 108 | 109 | # disable in source build 110 | if ( ${CMAKE_SOURCE_DIR} STREQUAL ${CMAKE_BINARY_DIR} ) 111 | file(REMOVE test.txt) 112 | message( FATAL_ERROR "In-source builds not allowed. Please make a new 113 | directory (called a build directory) and run CMake from there. You may 114 | need to remove CMakeCache.txt." 
) 115 | endif() 116 | 117 | 118 | # add other sources 119 | #add_subdirectory(externals) 120 | 121 | set(HPL_AI_SRCS 122 | main.cpp 123 | higham_mat_impl.cpp 124 | otf_gemv.cpp 125 | sgetrf_nopiv.cpp 126 | ) 127 | 128 | add_definitions(-DEXTERNAL_CONV -DOTF_GEMV_OPTIMIZED -DNO_WARN_X86_INTRINSICS) 129 | 130 | # FRONTIER / SPOCK build and linking 131 | if( ${ARCHITECTURE} STREQUAL "x86_64" ) 132 | 133 | set(GPU_SRCS 134 | gpu_init_kernels.cpp 135 | fp16_gpu_kernels.cpp 136 | ) 137 | 138 | set_source_files_properties(${HPL_AI_SRCS} ${GPU_SRCS} PROPERTIES HIP_SOURCE_PROPERTY_FORMAT 1) 139 | hip_add_executable(OpenMxP ${HPL_AI_SRCS} ${GPU_SRCS}) 140 | 141 | set(CRAY_LIBSCI_PREFIX_DIR $ENV{CRAY_LIBSCI_PREFIX_DIR} CACHE PATH "Path to the libsci libraries") 142 | 143 | message(STATUS "Using libsci BLAS libraries in ${CRAY_LIBSCI_PREFIX_DIR}/lib ") 144 | target_include_directories(OpenMxP PUBLIC 145 | # ${MPI_CXX_INCLUDE_DIRS} 146 | ${CRAY_MPICH_DIR}/include 147 | ${ROCM_PATH}/include 148 | ${ROCM_PATH}/hiprand/include 149 | ${ROCM_PATH}/rocrand/include 150 | ${ROCM_PATH}/rocthrust/include 151 | ) 152 | 153 | 154 | target_link_directories(OpenMxP PUBLIC 155 | ${ROCM_PATH}/lib 156 | # ${CRAY_LIBSCI_PREFIX_DIR}/lib 157 | ${CRAY_MPICH_DIR}/lib 158 | ) 159 | 160 | target_link_libraries(OpenMxP 161 | # ${MPI_CXX_LIBRARIES} 162 | #CCE 163 | #mpi_cray 164 | #GCC 165 | mpi 166 | ${GTL} 167 | amdhip64 168 | rocblas 169 | rocsolver 170 | hiprand 171 | hsa-runtime64 172 | # sci_cray_mp hugetlbfs 173 | ${OpenMP_CXX_LIBRARIES} 174 | # spdlog cli11 175 | ) 176 | 177 | # Set HIP target architecture for SPOCK MI100 GPUs (May need to change for MI200s) 178 | # set_property(TARGET driver PROPERTY HIP_ARCHITECTURES gfx908) 179 | # set_property(TARGET driver PROPERTY HIP_ARCHITECTURES gfx908) 180 | set_property(TARGET OpenMxP PROPERTY HIP_ARCHITECTURES gfx90a) 181 | # set( AMDGPU_TARGETS gfx90a ) 182 | set_property(TARGET OpenMxP PROPERTY CMAKE_HIP_ARCHITECTURES gfx90a) 183 | target_compile_options(OpenMxP PUBLIC -std=c++14 -O3 ) 184 | 185 | 186 | # SUMMIT build and linking 187 | elseif( ${ARCHITECTURE} STREQUAL "ppc64le") 188 | 189 | set(GPU_SRCS 190 | gpu_init_kernels.cu 191 | fp16_gpu_kernels.cu 192 | ) 193 | 194 | set_source_files_properties(GPU_SRCS PROPERTIES LANGUAGE CUDA) 195 | 196 | add_executable(OpenMxP ${HPL_AI_SRCS} ${GPU_SRCS}) 197 | 198 | target_include_directories(OpenMxP PUBLIC 199 | ${MPI_INCLUDE_PATH}) 200 | 201 | target_link_libraries(OpenMxP 202 | ${MPI_LIBRARIES} 203 | ${CUDA_LIBRARIES} 204 | ${CUDA_CUBLAS_LIBRARIES} 205 | ${CUDA_cusolver_LIBRARY} 206 | ${BLAS_LIBRARIES} 207 | ${CUDA_curand_LIBRARY} 208 | spdlog cli11 209 | ) 210 | 211 | target_link_options(OpenMxP PUBLIC "-fopenmp") 212 | target_link_options(OpenMxP PUBLIC "-mcpu=native") 213 | target_compile_options(OpenMxP PUBLIC -W -Wall -fopenmp -mcpu=native -std=c++14 -O3) 214 | 215 | endif() 216 | 217 | 218 | target_compile_definitions(OpenMxP PRIVATE 219 | "-DGIT_COMMIT_HASH=\"${GIT_COMMIT_HASH}\"") 220 | 221 | target_compile_definitions(OpenMxP PRIVATE 222 | "-DGIT_BRANCH=\"${GIT_BRANCH}\"") 223 | 224 | set_target_properties(OpenMxP 225 | PROPERTIES 226 | OUTPUT_NAME "OpenMxP." 
227 | SUFFIX ${CMAKE_HOST_SYSTEM_PROCESSOR} 228 | ) 229 | -------------------------------------------------------------------------------- /src/cuda_device_macros.h: -------------------------------------------------------------------------------- 1 | 2 | #ifndef __HPLAI_CUDA_DEVICE_MACROS__ 3 | #define __HPLAI_CUDA_DEVICE_MACROS__ 4 | 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | 13 | // *** BASIC CUDA MACROS *** 14 | // Kernel Macros 15 | #define GPU_BLOCKIDX_X \ 16 | blockIdx.x 17 | 18 | #define GPU_BLOCKIDX_Y \ 19 | blockIdx.y 20 | 21 | #define GPU_BLOCKIDX_Z \ 22 | blockIdx.z 23 | 24 | 25 | #define GPU_THREADIDX_X \ 26 | threadIdx.x 27 | 28 | #define GPU_THREADIDX_Y \ 29 | threadIdx.y 30 | 31 | #define GPU_THREADIDX_Z \ 32 | threadIdx.z 33 | 34 | 35 | #define GPU_BLOCKDIM_X \ 36 | blockDim.x 37 | 38 | #define GPU_BLOCKDIM_Y \ 39 | blockDim.y 40 | 41 | #define GPU_BLOCKDIM_Z \ 42 | blockDim.z 43 | 44 | 45 | #define GPU_GRIDDIM_X \ 46 | gridDim.x 47 | 48 | #define GPU_GRIDDIM_Y \ 49 | gridDim.y 50 | 51 | #define GPU_GRIDDIM_Z \ 52 | gridDim.z 53 | 54 | 55 | // Types 56 | #define GPU_ERROR_T \ 57 | cudaError_t 58 | 59 | #define GPU_STREAM_T \ 60 | cudaStream_t 61 | 62 | 63 | // Enums 64 | #define GPU_SUCCESS \ 65 | cudaSuccess 66 | 67 | #define GPU_STREAM_NON_BLOCKING \ 68 | cudaStreamNonBlocking 69 | 70 | #define GPU_R_16F \ 71 | CUDA_R_16F 72 | 73 | #define GPU_R_32F \ 74 | CUDA_R_32F 75 | 76 | #define GPU_MEMCPY_DEVICE_TO_HOST \ 77 | cudaMemcpyDeviceToHost 78 | 79 | #define GPU_MEMCPY_HOST_TO_DEVICE \ 80 | cudaMemcpyHostToDevice 81 | 82 | // Kernels 83 | #define GPU_DEVICE_RESET() \ 84 | cudaDeviceReset() 85 | 86 | #define GPU_SET_DEVICE(deviceID) \ 87 | cudaSetDevice(deviceID) 88 | 89 | #define GPU_DEVICE_SYNCHRONIZE() \ 90 | cudaDeviceSynchronize() 91 | 92 | #define GPU_FREE(memPointer) \ 93 | cudaFree(memPointer) 94 | 95 | #define GPU_FREE_HOST(memPointer) \ 96 | cudaFreeHost(memPointer) 97 | 98 | #define GPU_GET_ERROR_STRING(cudaError) \ 99 | cudaGetErrorString(cudaError) 100 | 101 | #define GPU_GET_LAST_ERROR() \ 102 | cudaGetLastError() 103 | 104 | #define GPU_MALLOC(memAddress, numBytes) \ 105 | cudaMalloc(memAddress, numBytes) 106 | 107 | #define GPU_MALLOC_HOST(memAddress, numBytes) \ 108 | cudaMallocHost(memAddress, numBytes) 109 | 110 | #define GPU_MALLOC_MANAGED(memAddress, numBytes) \ 111 | cudaMallocManaged(memAddress, numBytes) 112 | 113 | #define GPU_MEMCPY(memPointer_to, memPointer_from, numBytes, directionEnum) \ 114 | cudaMemcpy(memPointer_to, memPointer_from, numBytes, directionEnum) 115 | 116 | #define GPU_MEMCPY_2D(memPointer_to, pitchBytes_to, memPointer_from, pitchBytes_from, numBytes_W, numBytes_H, directionEnum) \ 117 | cudaMemcpy2D(memPointer_to, pitchBytes_to, memPointer_from, pitchBytes_from, numBytes_W, numBytes_H, directionEnum) 118 | 119 | #define GPU_MEMCPY_DEVICE_TO_DEVICE \ 120 | cudaMemcpyDeviceToDevice 121 | 122 | #define GPU_MEM_GET_INFO(freeMem, totalMem) \ 123 | cudaMemGetInfo(freeMem, totalMem) 124 | 125 | #define GPU_STREAM_CREATE_WITH_FLAGS(cudaStream, streamTypeEnum) \ 126 | cudaStreamCreateWithFlags(cudaStream, streamTypeEnum) 127 | 128 | 129 | 130 | 131 | // *** CUBLAS MACROS *** 132 | // Types 133 | #define GPUBLAS_HANDLE_T \ 134 | cublasHandle_t 135 | 136 | #define GPUBLAS_STATUS_T \ 137 | cublasStatus_t 138 | 139 | // Enums 140 | #define GPUBLAS_STATUS_SUCCESS \ 141 | CUBLAS_STATUS_SUCCESS 142 | 143 | #define GPUBLAS_STATUS_NOT_INITIALIZED \ 144 | CUBLAS_STATUS_NOT_INITIALIZED 145 | 146 
| #define GPUBLAS_STATUS_ALLOC_FAILED \ 147 | CUBLAS_STATUS_ALLOC_FAILED 148 | 149 | #define GPUBLAS_STATUS_INVALID_VALUE \ 150 | CUBLAS_STATUS_INVALID_VALUE 151 | 152 | #define GPUBLAS_STATUS_ARCH_MISMATCH \ 153 | CUBLAS_STATUS_ARCH_MISMATCH 154 | 155 | #define GPUBLAS_STATUS_MAPPING_ERROR \ 156 | CUBLAS_STATUS_MAPPING_ERROR 157 | 158 | #define GPUBLAS_STATUS_EXECUTION_FAILED \ 159 | CUBLAS_STATUS_EXECUTION_FAILED 160 | 161 | #define GPUBLAS_STATUS_INTERNAL_ERROR \ 162 | CUBLAS_STATUS_INTERNAL_ERROR 163 | 164 | #define GPUBLAS_OP_N \ 165 | CUBLAS_OP_N 166 | 167 | #define GPUBLAS_OP_T \ 168 | CUBLAS_OP_T 169 | 170 | #define GPUBLAS_SIDE_RIGHT \ 171 | CUBLAS_SIDE_RIGHT 172 | 173 | #define GPUBLAS_SIDE_LEFT \ 174 | CUBLAS_SIDE_LEFT 175 | 176 | #define GPUBLAS_FILL_MODE_UPPER \ 177 | CUBLAS_FILL_MODE_UPPER 178 | 179 | #define GPUBLAS_FILL_MODE_LOWER \ 180 | CUBLAS_FILL_MODE_LOWER 181 | 182 | #define GPUBLAS_DIAG_UNIT \ 183 | CUBLAS_DIAG_UNIT 184 | 185 | #define GPUBLAS_DIAG_NON_UNIT \ 186 | CUBLAS_DIAG_NON_UNIT 187 | 188 | // Kernels 189 | #define GPUBLAS_CREATE(cublasHandle) \ 190 | cublasCreate_v2(cublasHandle) 191 | 192 | #define GPUBLAS_SET_STREAM(cublasHandle, cudaStream) \ 193 | cublasSetStream(cublasHandle, cudaStream) 194 | 195 | #define GPUBLAS_SGETRF_BATCHED(cublasHandle, N_dim, memPointer_A, lda, memPointer_Pivot, memPointer_Info, batchSize) \ 196 | cublasSgetrfBatched(cublasHandle, N_dim, memPointer_A, lda, memPointer_Pivot, memPointer_Info, batchSize) 197 | 198 | #define GPUBLAS_STRSM(cublasHandle, cuSide, cuFill, cuOp, cuDiag, M_dim, N_dim, alpha, memPointer_A, lda, memPointer_B, ldb) \ 199 | cublasStrsm(cublasHandle, cuSide, cuFill, cuOp, cuDiag, M_dim, N_dim, alpha, memPointer_A, lda, memPointer_B, ldb) 200 | 201 | // Non-Simple Kernels 202 | //#define GPUBLAS_GET_ERROR_STRING(cublasStatus) \ 203 | // cublasGetErrorString(cublasStatus) 204 | 205 | #define GPUBLAS_SGEMM_EX(cublasHandle, cuOp_A, cuOp_B, M_dim, N_dim, k_dim, alpha, memPointer_A, datatype_A, lda, memPointer_B, datatype_B, ldb, beta, memPointer_C, datatype_C, ldc) \ 206 | cublasSgemmEx(cublasHandle, cuOp_A, cuOp_B, M_dim, N_dim, k_dim, alpha, memPointer_A, datatype_A, lda, memPointer_B, datatype_B, ldb, beta, memPointer_C, datatype_C, ldc) 207 | 208 | 209 | 210 | // *** CUSOLVER MACROS *** 211 | // Types 212 | #define GPUSOLVER_HANDLE_T \ 213 | cusolverDnHandle_t 214 | 215 | #define GPUSOLVER_STATUS_T \ 216 | cusolverStatus_t 217 | 218 | 219 | // Enums 220 | #define GPUSOLVER_STATUS_SUCCESS \ 221 | CUSOLVER_STATUS_SUCCESS 222 | 223 | 224 | // Kernels 225 | #define GPUSOLVER_CREATE(cusolverDnHandle) \ 226 | cusolverDnCreate(cusolverDnHandle) 227 | 228 | #define GPUSOLVER_SGETRF(cusolverDnHandle, M_dim, N_dim, memPointer, lda, memAddress_Buffer, memPointer_Pivot, memPointer_Info) \ 229 | cusolverDnSgetrf(cusolverDnHandle, M_dim, N_dim, memPointer, lda, memAddress_Buffer, memPointer_Pivot, memPointer_Info) 230 | 231 | //#define GPUSOLVER_SGETRF_BUFFERSIZE(cusolverDnHandle, M_dim, N_dim, memPointer, lda, memAddress_Buffer, memPointer_Pivot, memPointer_Info) \ 232 | // cusolverDnSgetrf_bufferSize(cusolverDnHandle, M_dim, N_dim, memPointer, lda, memAddress_Buffer, memPointer_Pivot, memPointer_Info) 233 | 234 | #define GPUSOLVER_SGETRF_BUFFERSIZE(cusolverDnHandle, M_dim, N_dim, memPointer, lda, memAddress_Buffer) \ 235 | cusolverDnSgetrf_bufferSize(cusolverDnHandle, M_dim, N_dim, memPointer, lda, memAddress_Buffer) 236 | 237 | #define GPUSOLVER_SET_STREAM(cusolverDnHandle, cudaStream) \ 238 | 
cusolverDnSetStream(cusolverDnHandle, cudaStream) 239 | 240 | // *** CURAND MACROS *** 241 | // Types 242 | #define GPURAND_GENERATOR_T \ 243 | curandGenerator_t 244 | 245 | 246 | // Enums 247 | #define GPURAND_RNG_PSEUDO_DEFAULT \ 248 | CURAND_RNG_PSEUDO_DEFAULT 249 | 250 | 251 | // Kernels 252 | #define GPURAND_CREATE_GENERATOR(curandGenerator, curandRngType) \ 253 | curandCreateGenerator(curandGenerator, curandRngType) 254 | 255 | #define GPURAND_DESTROY_GENERATOR(curandGenerator) \ 256 | curandDestroyGenerator(curandGenerator) 257 | 258 | #define GPURAND_GENERATE_UNIFORM(curandGenerator, memPointer, numBytes) \ 259 | curandGenerateUniform(curandGenerator, memPointer, numBytes) 260 | 261 | #define GPURAND_GENERATE_UNIFORM_DOUBLE(curandGenerator, memPointer, numBytes) \ 262 | curandGenerateUniformDouble(curandGenerator, memPointer, numBytes) 263 | 264 | #define GPURAND_SET_PSEUDO_RANDOM_GENERATOR_SEED(curandGenerator, seed) \ 265 | curandSetPseudoRandomGeneratorSeed(curandGenerator, seed) 266 | 267 | 268 | 269 | #endif // __HPLAI_CUDA_DEVICE_MACROS__ 270 | -------------------------------------------------------------------------------- /src/iterative_refinement.hpp: -------------------------------------------------------------------------------- 1 | #ifndef ITERATIVE_REFINEMENT_HPP 2 | #define ITERATIVE_REFINEMENT_HPP 3 | #include "grid.hpp" 4 | #include "hpl_rand.hpp" 5 | #include "panel.hpp" 6 | #include "panel_gemv.hpp" 7 | #include "panel_norm.hpp" 8 | #include "panel_trsv.hpp" 9 | #include "timer.hpp" 10 | #include 11 | #include 12 | 13 | struct IRErrors { 14 | double residual; 15 | double hpl_harness; 16 | }; 17 | 18 | template class Matgen> 19 | void iterative_tester(int my_init, Panels &p, Matgen &mg, 20 | double *x, double *w, size_t ldv, double *rhs, 21 | double norma, double normb, int maxit, 22 | Grid &grid) { 23 | int const nb = p.nblocks; 24 | int const n = nb * p.b; 25 | size_t wlen = ldv * 10 + 2 * p.b * p.b; 26 | double *r = NULL; 27 | double *v = NULL; 28 | double *vw1_dev; 29 | FPanel *vw2_dev; 30 | double *dx_dev; 31 | double *testing_r = static_cast(std::malloc(sizeof(double) * ldv)); 32 | r = w; 33 | v = w + ldv; 34 | double normr=0; 35 | 36 | size_t bytes = (size_t)(p.b * p.nprow) * (size_t)(p.b * p.npcol) * (size_t)sizeof(float); 37 | GPU_MEMCPY(p(0, 0), p(0, 0, 'd'), bytes, GPU_MEMCPY_DEVICE_TO_HOST); 38 | GPU_MEMCPY(testing_r, r, ldv*sizeof(double), GPU_MEMCPY_DEVICE_TO_HOST); 39 | printf("A from rank %d %d\n",grank,p.i1); 40 | print_matrix(p(0, 0),p.lda, p.lda*p.lda); 41 | fflush(stdout); 42 | 43 | GPU_MEMCPY(testing_r, rhs, ldv*sizeof(double), GPU_MEMCPY_DEVICE_TO_HOST); 44 | printf("rhs from rank %d\n",grank); 45 | print_matrix(testing_r,ldv/2,ldv/2); 46 | fflush(stdout); 47 | 48 | //trsvL_h(p, rhs, vw1_dev, ldv, grid); 49 | GPU_MEMCPY(testing_r, rhs, ldv*sizeof(double), GPU_MEMCPY_DEVICE_TO_HOST); 50 | printf("solve from rank %d\n",grank); 51 | print_matrix(testing_r,ldv/2,ldv/2); 52 | fflush(stdout); 53 | return; 54 | 55 | 56 | 57 | if(my_init == 1 || my_init == 4) 58 | { 59 | //panel_copycolv_dev(p, rhs, x); 60 | copycolv_h(p, rhs, x); 61 | divcolv_h(p, mg.diag_dev, x); 62 | copycolv_h(p, rhs, r); 63 | colv2rowv_h(p, x, v ); 64 | if(my_init == 1){ 65 | panel_gemv_h(-1., p, mg, false, v, 1., r, grid ); 66 | } 67 | 68 | normr = colv_infnorm_h(p, r, grid, vw1_dev); 69 | trsvU_h(p, r, vw1_dev, ldv, grid); 70 | //panel_trsvL(p, r, vw1_dev, vw2_dev, ldv, grid); 71 | /*GPU_MEMCPY(testing_r, r, ldv*sizeof(double), GPU_MEMCPY_DEVICE_TO_HOST); 72 | if(p.i1 == p.j1 
&& grank == 10){ 73 | printf("MV from rank %d %f\n",grank, normr); 74 | print_matrix(testing_r,ldv/2,ldv/2); 75 | fflush(stdout); 76 | }*/ 77 | } 78 | else 79 | { 80 | copycolv(p, rhs, x); 81 | divcolv(p, mg.diag, x); 82 | copycolv(p, rhs, r); 83 | colv2rowv(p, x, v); 84 | panel_gemv(my_init, -1., p, mg, false, v, 1., r, grid); 85 | normr = colv_infnorm(p, r, grid); 86 | if(p.i1 == p.j1 && grank == 10){ 87 | printf("MV from rank %d %f\n",grank, normr); 88 | // print_matrix(r,ldv/2,ldv/2); 89 | fflush(stdout); 90 | } 91 | } 92 | 93 | 94 | 95 | } 96 | 97 | #if 1 98 | template class Matgen> 99 | IRErrors iterative_refinement2(int my_init, Panels const &p, Matgen &mg, 100 | double *x, double *w, size_t ldv, double *rhs, 101 | double norma, double normb, int maxit, 102 | Grid &grid) { 103 | // do IR with approximated LU factors in p and the accurate initial matrix 104 | // which is generated by mg. x is the solution. rhs is the right-hand-side 105 | // vector. w holds the working vectors. ldv is the leading dimension of w. Set a good 106 | // ldv for better performance. norma is the inf-norm of the initial matrix. 107 | // normb is the inf-norm of rhs. 108 | int const nb = p.nblocks; 109 | int const n = nb * p.b; 110 | double *r = w; 111 | double *v = w + ldv; 112 | 113 | //double tmisc=0, time_trsvL=0, time_trsvU=0, time_pgv = 0, begintr_time, endtr_time; 114 | if(my_init != 1){ 115 | copycolv(p, rhs, x); 116 | divcolv(p, mg.diag, x); 117 | for (int iter = 0; iter < maxit; ++iter) { 118 | copycolv(p, rhs, r); 119 | double normx = colv_infnorm(p, x, grid); 120 | colv2rowv(p, x, v); 121 | panel_gemv(my_init, -1., p, mg, false, v, 1., r, grid); 122 | double normr = colv_infnorm(p, r, grid); 123 | MPI_Barrier(MPI_COMM_WORLD); 124 | double hplerror = 125 | normr / (norma * normx + normb) * 1. / (n * DBL_EPSILON / 2); 126 | if (grid.row == 0 && grid.col == 0) { 127 | printf("# iterative refinement: step=%d, residual=%e, " 128 | "hpl-harness=%f\n", 129 | iter, normr, hplerror); 130 | } 131 | if (hplerror < 16.) { 132 | return {normr, hplerror}; 133 | } 134 | fflush(stdout); 135 | 136 | // x_1 = x_0 + (LU)^{-1} r 137 | panel_trsvL(p, r, v, ldv, grid); 138 | panel_trsvU(p, r, v, ldv, grid); 139 | addcolv(p, r, x); 140 | } 141 | }else{ 142 | copycolv_h(p, rhs, x); 143 | divcolv_h(p, mg.diag_dev, x); 144 | for (int iter = 0; iter < maxit; ++iter) { 145 | copycolv_h(p, rhs, r); 146 | double normx = colv_infnorm_h(p, x, grid,v); 147 | colv2rowv_h(p, x, v); 148 | panel_gemv_h(-1., p, mg, false, v, 1., r, grid); 149 | 150 | GPU_THREAD_SYNCHRONIZE(0); 151 | MPI_Barrier(MPI_COMM_WORLD); 152 | 153 | double normr = colv_infnorm_h(p, r, grid,v); 154 | // hplerror := \|b-Ax\|_\infty / (\|A\|_\infty \|x\|_\infty + 155 | // \|b\|_\infty) * (n * \epsilon)^{-1} 156 | double hplerror = 157 | normr / (norma * normx + normb) * 1. / (n * DBL_EPSILON / 2); 158 | if (grid.row == 0 && grid.col == 0) { 159 | printf("# iterative refinement: step=%d, residual=%e, " 160 | "hpl-harness=%f\n", 161 | iter, normr, hplerror); 162 | } 163 | if (hplerror < 16.) { 164 | return {normr, hplerror}; 165 | } 166 | // x_1 = x_0 + (LU)^{-1} r 167 | trsvL_h(p, r, v, ldv, grid); 168 | trsvU_h(p, r, v, ldv, grid); 169 | addcolv_h(p, r, x); 170 | } 171 | }
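// In exact arithmetic both branches above run the same recurrence:
//   r_k = b - A*x_k,   x_{k+1} = x_k + (LU)^{-1} r_k,
// so the error e_k = x - x_k is multiplied by (I - (LU)^{-1}A) each step,
// and the loop exits once the scaled residual
//   ||r||_inf / (||A||_inf*||x||_inf + ||b||_inf) * 2/(n*eps)
// drops below the HPL acceptance threshold of 16.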
172 | // OMG! (no convergence within maxit iterations) 173 | return {-1., -1.}; 174 | } 175 | #endif 176 | 177 | /*template 178 | IRErrors iterative_refinement(Panelsconst&p, HMGen& mg, 179 | double* x, double* w, size_t ldv, double* rhs, double norma, double 180 | normb, int maxit, Grid&grid) 181 | { 182 | int const nb = p.nblocks; 183 | int const n = nb * p.b; 184 | double*r = w; 185 | double*v = w + ldv; 186 | // initial approximation 187 | // trsv 188 | copycolv(p, rhs, x); 189 | panel_trsvL(p, x, v, ldv, grid); 190 | panel_trsvU(p, x, v, ldv, grid); 191 | 192 | for(int iter=0; iter<maxit; ++iter){ -------------------------------------------------------------------------------- /src/hpl_rand.hpp: -------------------------------------------------------------------------------- 1 | #ifndef HPL_RAND_HPP 2 | #define HPL_RAND_HPP 3 | 4 | #include 5 | #include 6 | #include 7 | 8 | #include "device_macros.h" 9 | 10 | extern int grank; 11 | 12 | struct RandCoeff { 13 | uint64_t a; 14 | uint64_t c; 15 | 16 | static RandCoeff default_vals() { return {6364136223846793005, 1}; } 17 | 18 | __host__ __device__ RandCoeff operator*(const RandCoeff &rhs) const { 19 | return {a * rhs.a, a * rhs.c + c}; 20 | } 21 | __host__ __device__ RandCoeff pow_fugaku(const uint64_t n) const { 22 | if(n==0) return RandCoeff{1, 0}; 23 | RandCoeff tmp = pow(n / 2); 24 | tmp = tmp * tmp; 25 | if (n % 2) { 26 | return tmp * (*this); 27 | } else { 28 | return tmp; 29 | } 30 | } 31 | __host__ __device__ RandCoeff pow(const uint64_t n) const { 32 | uint64_t exponent = n; 33 | RandCoeff tmp = *(this); 34 | RandCoeff result = RandCoeff{1, 0}; 35 | 36 | if (exponent == 0) return result; 37 | if(exponent & 1) result = result * tmp; 38 | 39 | exponent = exponent >> 1; 40 | while (exponent > 0) { 41 | tmp = tmp * tmp; 42 | if(exponent & 1) 43 | result = result * tmp; 44 | 45 | exponent = exponent >> 1; 46 | } 47 | return result; 48 | } 49 | }; 50 | 51 | struct RandStat { 52 | uint64_t x; 53 | 54 | __host__ __device__ static RandStat initialize(uint64_t seed, 55 | RandCoeff coef = RandCoeff::default_vals()) { 56 | return coef * RandStat{seed}; 57 | } 58 | 59 | __host__ __device__ friend RandStat inline operator*(RandCoeff coef, RandStat stat) { 60 | return {coef.a * stat.x + coef.c}; 61 | } 62 | 63 | // returns [-0.5:0.5] 64 | __host__ __device__ inline operator double() const { 65 | return static_cast<int64_t>(x) * 0x1.fffffffffffffP-65; 66 | //24x24 67 | // return static_cast<int64_t>(x) * 0x1.fffffffffffffP-73; 68 | // 96x 96 69 | //return static_cast<int64_t>(x) * 0x1.fffffffffffffP-75; 70 | // 162x 162 71 | // return static_cast<int64_t>(x) * 0x1.fffffffffffffP-76; //maybe 75 will work, faster convergence 72 | 73 | } 74 | __host__ __device__ operator float() const { 75 | float tmp = static_cast<double>(*this); 76 | return tmp; 77 | } 78 | }; 79 | 80 | // fill submat (i0:nrow-1, j0:ncol-1) of fullmat (0:nrow-1, 0:ncol-1) 81 | template <typename F> 82 | static void panel_fill_one_with_rand( 83 | int const n, 84 | int const i0, 85 | int const j0, 86 | int const nrow, 87 | int const ncol, 88 | F *a, 89 | size_t const lda, 90 | uint64_t const seed, 91 | bool const calc_diag = true) 92 | { 93 | RandStat stat_00 = RandStat::initialize(seed); 94 | 95 | RandCoeff inc1 = RandCoeff::default_vals(); 96 | RandCoeff jump_one_col = inc1.pow(n); 97 | RandCoeff jump_ij = inc1.pow(i0 + n * static_cast<uint64_t>(j0)); 98 | 99 | RandStat stat_ij = jump_ij * stat_00; 100 | 101 | RandStat at_0j = stat_ij; 102 | for(int j=0; j<ncol; ++j){ 103 | RandStat at_ij = at_0j; 104 | for(int i=0; i<nrow; ++i){ 105 | double t = static_cast<double>(at_ij); 106 | a[j*lda + i] = static_cast<F>(t); 107 | at_ij = inc1 * at_ij; 108 | } 109 | at_0j = jump_one_col * at_0j; 110 | } 111 | 112 | if (calc_diag && (i0 == j0) && (nrow==ncol)){ 113 | RandCoeff jump_i0 = inc1.pow(i0); 114 | 115 | RandStat stat_i0 = jump_i0 * stat_00; 116 | for(int i=0; i<(nrow<ncol? nrow:ncol); ++i){ 117 | RandStat stat_ij = stat_i0; 118 | double sum = 0.0; 119 | for(int j=0; j<n; ++j){ 120 | if (i0 + i != j) 121 | sum += fabs(double(stat_ij)); 122 | stat_ij = jump_one_col * stat_ij; 123 | } 124 | 125 | a[lda * i + i] = static_cast<F>(sum); 126 | stat_i0 = inc1 * stat_i0; 127 | } 128 | } 129 | } 130 |
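// Jump-ahead algebra behind RandCoeff/RandStat (summary): one LCG step is
// the affine map x -> a*x + c (mod 2^64), and operator* composes two such
// maps, (a1,c1)*(a2,c2) = (a1*a2, a1*c2 + c1), so pow(k) builds the k-step
// map with O(log k) multiplies by binary exponentiation (e.g. pow(5) =
// pow(4)*pow(1) since 5 = 0b101). This is what lets the fill routines here
// start generation at any (i0, j0) via jump_ij = inc1.pow(i0 + n*j0)
// instead of streaming through all preceding states.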
131 | template <typename F> 132 | static void fill_one_panel_with_rand2(int const n, int const i0, int const j0, 133 | int const nrow, int const ncol, F *a, 134 | size_t const lda, uint64_t const seed, 135 | double* localSum) { 136 | RandStat stat_00 = RandStat::initialize(seed); 137 | 138 | RandCoeff inc1 = RandCoeff::default_vals(); 139 | RandCoeff jump_one_col = inc1.pow(n); 140 | RandCoeff jump_ij = inc1.pow(i0 + n * static_cast<uint64_t>(j0)); 141 | 142 | RandStat stat_ij = jump_ij * stat_00; 143 | 144 | RandStat at_0j = stat_ij; 145 | long max_idx = 0; 146 | for (int j = 0; j < ncol; j++) { 147 | RandStat at_ij = at_0j; 148 | for (int i = 0; i < nrow; i++) { 149 | double t = static_cast<double>(at_ij); 150 | if ( ( i0 + i ) != ( j0 + j ) ) localSum[ i0+i ] += fabs( double( t ) ); 151 | if ( ( j * lda + i ) > max_idx ) max_idx = j * lda + i; 152 | a[j * lda + i] = static_cast<F>( t ); 153 | at_ij = inc1 * at_ij; 154 | } 155 | at_0j = jump_one_col * at_0j; 156 | } 157 | } 158 | // fill submat (i0:nrow-1, j0:ncol-1) of fullmat (0:nrow-1, 0:ncol-1) 159 | template <typename F> 160 | static void fill_one_panel_with_rand(int const n, int const i0, int const j0, 161 | int const nrow, int const ncol, F *a, 162 | size_t const lda, uint64_t const seed, 163 | bool const calc_diag = true) { 164 | RandStat stat_00 = RandStat::initialize(seed); 165 | 166 | RandCoeff inc1 = RandCoeff::default_vals(); 167 | RandCoeff jump_one_col = inc1.pow(n); 168 | RandCoeff jump_ij = inc1.pow(i0 + n * static_cast<uint64_t>(j0)); 169 | 170 | RandStat stat_ij = jump_ij * stat_00; 171 | 172 | RandStat at_0j = stat_ij; 173 | for (int j = 0; j < ncol; j++) { 174 | RandStat at_ij = at_0j; 175 | for (int i = 0; i < nrow; i++) { 176 | double t = static_cast<double>(at_ij); 177 | a[j * lda + i] = static_cast<F>( t ); 178 | at_ij = inc1 * at_ij; 179 | } 180 | at_0j = jump_one_col * at_0j; 181 | } 182 | if (calc_diag && (i0 == j0) && (nrow == ncol)) { 183 | RandCoeff jump_i0 = inc1.pow(i0); 184 | 185 | RandStat stat_i0 = jump_i0 * stat_00; 186 | for (int i = 0; i < (nrow < ncol ?
nrow : ncol); i++) { 187 | RandStat stat_ij = stat_i0; 188 | double sum = 0.0; 189 | for (int j = 0; j < n; j++) { 190 | if (i0 + i != j) 191 | sum += fabs(double(stat_ij)); 192 | stat_ij = jump_one_col * stat_ij; 193 | } 194 | a[lda * i + i] = static_cast<F>( sum ); 195 | stat_i0 = inc1 * stat_i0; 196 | } 197 | } 198 | } 199 | 200 | static inline double calc_diag(int const i, int const n, uint64_t const seed) { 201 | RandStat stat_00 = RandStat::initialize(seed); 202 | RandCoeff inc1 = RandCoeff::default_vals(); 203 | RandCoeff jump_one_col = inc1.pow(n); 204 | RandCoeff jump_i0 = inc1.pow(i); 205 | RandStat stat_ij = jump_i0 * stat_00; 206 | 207 | double sum = 0.0; 208 | for (int j = 0; j < n; j++) { 209 | if (i != j) 210 | sum += fabs(double(stat_ij)); 211 | stat_ij = jump_one_col * stat_ij; 212 | } 213 | return sum; 214 | } 215 | 216 | // debug 217 | static inline double mat_elem(int n, int i, int j, int seed) { 218 | RandStat stat_00 = RandStat::initialize(seed); 219 | RandCoeff inc1 = RandCoeff::default_vals(); 220 | RandCoeff jump_ij = inc1.pow(i + static_cast<uint64_t>(n) * j); 221 | return double(jump_ij * stat_00); 222 | } 223 | 224 | template <typename F> struct Matgen { 225 | uint64_t seed; 226 | int n; 227 | F const *diag; 228 | F const *diag_dev; 229 | 230 | RandCoeff incl1, jumpn, jumpi, jumpj; 231 | 232 | enum { NUM_POWERS = 16 }; 233 | RandCoeff powers[NUM_POWERS]; 234 | 235 | double scalea, scaleb; 236 | 237 | Matgen(uint64_t seed, int n, int iskip, int jskip, F const *diag) 238 | : seed(seed), n(n), diag(diag) { 239 | incl1 = RandCoeff::default_vals(); 240 | jumpn = incl1.pow(n); 241 | jumpi = incl1.pow(iskip); 242 | jumpj = incl1.pow(n * static_cast<uint64_t>(jskip)); 243 | for (int i = 0; i < NUM_POWERS; i++) { 244 | powers[i] = incl1.pow(i); 245 | } 246 | scalea = sqrt(n * sqrt(n)); 247 | scaleb = 1; 248 | } 249 | RandCoeff jump(int i, int j) const { 250 | return incl1.pow(i + n * static_cast<uint64_t>(j)); 251 | } 252 | RandCoeff jump(uint64_t i, uint64_t j) const { 253 | return incl1.pow(i + n * static_cast<uint64_t>(j)); 254 | } 255 | }; 256 |
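// Minimal self-test sketch (added illustration, not built by default; it
// follows the "#if 0 test code" convention used in fp16sim.hpp and assumes
// a hosted build with <cstdio>): checks that one O(log k) jump from
// RandCoeff::pow matches k sequential LCG steps.
#if 0
#include <cstdio>
int main() {
    RandCoeff inc1 = RandCoeff::default_vals();
    RandStat s = RandStat::initialize(42);
    RandStat seq = s;
    for (int k = 0; k < 1000; ++k)
        seq = inc1 * seq;                  // 1000 single steps
    RandStat jmp = inc1.pow(1000) * s;     // one composed jump
    std::printf("pow(1000) %s\n", seq.x == jmp.x ? "matches" : "MISMATCH");
    return 0;
}
#endif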
257 | #endif 258 | -------------------------------------------------------------------------------- /src/gpu_init_kernels.cpp: -------------------------------------------------------------------------------- 1 | 2 | #include "gpu_init_kernels.h" 3 | 4 | using namespace std; 5 | 6 | 7 | 8 | #if 0 9 | // Generate random entries of matrix A in single precision 10 | void fill_random(float* A, long m, long n, int n_threads, int blocksize_x, int work_per_thread) 11 | { 12 | GPURAND_GENERATOR_T generator; 13 | 14 | GPURAND_CREATE_GENERATOR(&generator, GPURAND_RNG_PSEUDO_DEFAULT); 15 | GPURAND_SET_PSEUDO_RANDOM_GENERATOR_SEED(generator, (int)time(NULL)); 16 | GPURAND_GENERATE_UNIFORM(generator, A, (size_t) m *(size_t)n); 17 | // Push entries to (-0.5, 0.5) range 18 | int blocksize_y = n_threads / blocksize_x; 19 | dim3 thread_dims(blocksize_x, blocksize_y, 1); 20 | int block_dim_x = ceil((float)m / blocksize_x), block_dim_y = ceil((float)n / (work_per_thread * blocksize_y)); 21 | dim3 block_dims(block_dim_x, block_dim_y, 1); 22 | 23 | #ifdef CUDA_OLCF_PLATFORM 24 | minus_05<<<block_dims, thread_dims>>>(A, m, n, work_per_thread); 25 | #elif defined(ROCM_OLCF_PLATFORM) 26 | hipLaunchKernelGGL(minus_05, block_dims, thread_dims, 0, 0, 27 | A, m, n, work_per_thread); 28 | #else 29 | throw std::runtime_error("Error with build platform, neither CUDA nor ROCM. See CMake output."); 30 | #endif 31 | 32 | GPURAND_DESTROY_GENERATOR(generator); 33 | } 34 | 35 | // Generate random entries of vector v in double precision 36 | void fill_random(double* v, long m, int n_threads) 37 | { 38 | GPURAND_GENERATOR_T generator; 39 | 40 | GPURAND_CREATE_GENERATOR(&generator, GPURAND_RNG_PSEUDO_DEFAULT); 41 | GPURAND_SET_PSEUDO_RANDOM_GENERATOR_SEED(generator, (int)time(NULL)); 42 | GPURAND_GENERATE_UNIFORM_DOUBLE(generator, v, m); 43 | // Push entries to (-0.5, 0.5) range 44 | dim3 block_dims(ceil((float) m / n_threads), 1, 1); 45 | dim3 thread_dims(n_threads, 1, 1); 46 | 47 | #ifdef CUDA_OLCF_PLATFORM 48 | minus_05<<<block_dims, thread_dims>>>(v, m, 1, 1); 49 | #elif defined(ROCM_OLCF_PLATFORM) 50 | hipLaunchKernelGGL(minus_05, block_dims, thread_dims, 0, 0, v, m, 1, 1); 51 | #else 52 | throw std::runtime_error("Error with build platform, neither CUDA nor ROCM. See CMake output."); 53 | #endif 54 | 55 | GPURAND_DESTROY_GENERATOR(generator); 56 | } 57 |
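// Launch-geometry arithmetic used by the wrappers in this file (worked
// example with arbitrary sizes, added for illustration): n_threads = 256
// and blocksize_x = 64 give blocksize_y = 256/64 = 4, i.e. a 64x4 thread
// tile per block; since every thread covers work_per_thread consecutive
// columns, an m x n = 1000 x 512 panel with work_per_thread = 4 needs
// ceil(1000/64) = 16 blocks in x and ceil(512/(4*4)) = 32 blocks in y.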
58 | // Generate random entries of matrix A using Fugaku's logic 59 | __host__ void fill_random_fugaku(uint64_t N, RandStat stat0, RandCoeff inc1, float *A, double *row_sums, long m, long n, long i1, long j1, long b, long istride, long jstride, int n_threads, int blocksize_x, int work_per_thread) 60 | { 61 | int blocksize_y = n_threads / blocksize_x; 62 | dim3 thread_dims(blocksize_x, blocksize_y, 1); 63 | int matrix_dim_x = ceil((float)m / blocksize_x), matrix_dim_y = ceil((float)n / (work_per_thread * blocksize_y)); 64 | dim3 matrix_dims(matrix_dim_x, matrix_dim_y, 1); 65 | 66 | #ifdef CUDA_OLCF_PLATFORM 67 | fill_random_fugaku_d<<<matrix_dims, thread_dims>>>(N, stat0, inc1, A, row_sums, m, n, i1, j1, b, istride, jstride, work_per_thread); 68 | #elif defined(ROCM_OLCF_PLATFORM) 69 | hipLaunchKernelGGL(fill_random_fugaku_d, matrix_dims, thread_dims, 0, 0, 70 | N, stat0, inc1, A, row_sums, m, n, i1, j1, b, istride, jstride, work_per_thread); 71 | #else 72 | throw std::runtime_error("Error with build platform, neither CUDA nor ROCM. See CMake output."); 73 | #endif 74 | 75 | } 76 | 77 | template <typename F> __global__ void fill_random_fugaku_d(uint64_t N, RandStat stat0, RandCoeff inc1, F *A, double *row_sums, long m, long n, long i1, long j1, long b, long istride, long jstride, int work_per_thread) 78 | { 79 | int idx_x = GPU_BLOCKIDX_X * GPU_BLOCKDIM_X + GPU_THREADIDX_X; 80 | int idx_y = GPU_BLOCKIDX_Y * GPU_BLOCKDIM_Y + GPU_THREADIDX_Y; 81 | const uint64_t A_row = idx_x, global_row = (i1 + (A_row / (uint64_t)b) * istride) * (uint64_t)b + (A_row % (uint64_t)b); 82 | uint64_t A_col, global_col, A_idx, global_idx; 83 | double a_ij, row_sum_increase = 0; 84 | 85 | if (A_row < m) 86 | { 87 | for(int i = 0 ; i < work_per_thread; i++) 88 | { 89 | A_col = work_per_thread * idx_y + i; 90 | global_col = (j1 + (A_col / (uint64_t)b) * jstride) * (uint64_t)b + (A_col % (uint64_t)b); 91 | if (A_col < n) 92 | { 93 | A_idx = A_row + A_col * (uint64_t)m; 94 | global_idx = global_col * N + global_row; 95 | a_ij = static_cast<double>(inc1.pow(global_idx) * stat0); 96 | A[A_idx] = static_cast<F>(a_ij); 97 | row_sum_increase += fabs(a_ij); 98 | } 99 | } 100 | atomicAdd(&row_sums[global_row], row_sum_increase); 101 | } 102 | } 103 | #endif 104 | 105 |
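// Reduction pattern shared by fill_random_fugaku_d above and
// compute_row_sums_d below: each thread first sums its work_per_thread
// |a_ij| contributions into a private register (row_sum_increase) and then
// issues a single atomicAdd per row, so atomic traffic on row_sums[] is
// reduced by a factor of work_per_thread compared to one atomic per element.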
106 | __host__ void compute_row_sums(float *A, double *row_sums, long m, long n, long i1, long j1, long b, long istride, long jstride, int n_threads, int blocksize_x, int work_per_thread) 107 | { 108 | int blocksize_y = n_threads / blocksize_x; 109 | dim3 thread_dims(blocksize_x, blocksize_y, 1); 110 | int block_dim_x = ceil((float)m / blocksize_x), block_dim_y = ceil((float)n / (work_per_thread * blocksize_y)); 111 | dim3 block_dims(block_dim_x, block_dim_y, 1); 112 | 113 | #ifdef CUDA_OLCF_PLATFORM 114 | compute_row_sums_d<<<block_dims, thread_dims>>>(A, row_sums, m, n, i1, j1, b, istride, jstride, work_per_thread); 115 | #elif defined(ROCM_OLCF_PLATFORM) 116 | hipLaunchKernelGGL(compute_row_sums_d, block_dims, thread_dims, 0, 0, 117 | A, row_sums, m, n, i1, j1, b, istride, jstride, work_per_thread); 118 | #else 119 | throw std::runtime_error("Error with build platform, neither CUDA nor ROCM. See CMake output."); 120 | #endif 121 | } 122 | 123 | __global__ void compute_row_sums_d(float *A, double *row_sums, long m, long n, long i1, long j1, long b, long istride, long jstride, int work_per_thread) 124 | { 125 | int idx_x = GPU_BLOCKIDX_X * GPU_BLOCKDIM_X + GPU_THREADIDX_X; 126 | int idx_y = GPU_BLOCKIDX_Y * GPU_BLOCKDIM_Y + GPU_THREADIDX_Y; 127 | const size_t A_row = idx_x, global_row_block = i1 + (A_row / b) * istride, global_row = global_row_block * b + (A_row % b); 128 | size_t A_col; 129 | double row_sum_increase = 0; 130 | 131 | if (A_row < m) 132 | { 133 | for(int i = 0 ; i < work_per_thread; i++) 134 | { 135 | A_col = work_per_thread * idx_y + i; 136 | if (A_col < n) 137 | row_sum_increase += fabs(A[A_row + A_col * m]); 138 | 139 | } 140 | atomicAdd(&row_sums[global_row], row_sum_increase); 141 | } 142 | 143 | } 144 | 145 | __host__ void fill_diag_rhs(int my_init, uint64_t N, RandStat stat0, RandCoeff inc1, float *A, double *row_sums_d, double *rhs, double *rhs_d, long m, long *diag_i_steps, long *diag_j_steps, long n_diag_blocks, long i1, long b, long istride, int n_threads) 146 | { 147 | int n_diag_entries = n_diag_blocks * b; 148 | dim3 block_dims(ceil((float) n_diag_entries / n_threads), 1, 1); 149 | dim3 thread_dims(n_threads, 1, 1); 150 | 151 | #ifdef CUDA_OLCF_PLATFORM 152 | fill_diag_rhs_d<<<block_dims, thread_dims>>>(my_init, N, stat0, inc1, A, row_sums_d, rhs, rhs_d, m, diag_i_steps, diag_j_steps, n_diag_entries, i1, b, istride); 153 | #elif defined(ROCM_OLCF_PLATFORM) 154 | hipLaunchKernelGGL(fill_diag_rhs_d, block_dims, thread_dims, 0, 0, 155 | my_init, N, stat0, inc1, A, row_sums_d, rhs, rhs_d, m, diag_i_steps, diag_j_steps, n_diag_entries, i1, b, istride); 156 | #else 157 | throw std::runtime_error("Error with build platform, neither CUDA nor ROCM. See CMake output."); 158 | #endif 159 | } 160 | 161 |
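// What the kernel below does: at this point row_sums_d[] holds the
// accumulated |a_ij| of each row, including the current diagonal entry, so
//   row_sums_d[i] -= A[ii];  A[ii] = row_sums_d[i];
// replaces the diagonal with the sum of the off-diagonal magnitudes,
//   a_ii = sum_{j != i} |a_ij|,
// which makes the panel diagonally dominant so that LU without pivoting is
// safe. For my_init == 3 the same pass also draws the rhs entries from the
// tail of the LCG stream at offset N*N.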
162 | __global__ void fill_diag_rhs_d(int my_init, uint64_t N, RandStat stat0, RandCoeff inc1, float *A, double *row_sums_d, double *rhs, double *rhs_d, long m, long *diag_i_steps, long *diag_j_steps, long n_diag_entries, long i1, long b, long istride) 163 | { 164 | int idx = GPU_BLOCKIDX_X * GPU_BLOCKDIM_X + GPU_THREADIDX_X; 165 | 166 | if (idx < n_diag_entries) 167 | { 168 | size_t diag_block_idx = idx / b, offset_in_block = idx % b; 169 | size_t diag_i_step = diag_i_steps[diag_block_idx], diag_j_step = diag_j_steps[diag_block_idx]; 170 | size_t A_idx = (diag_j_step * m + diag_i_step) * b + offset_in_block * (m + 1); 171 | size_t global_diag_idx = (i1 + diag_i_step * istride) * b + offset_in_block; 172 | 173 | row_sums_d[global_diag_idx] -= A[A_idx]; 174 | A[A_idx] = row_sums_d[global_diag_idx]; 175 | if (my_init == 3) 176 | { 177 | size_t local_diag_idx = diag_i_step * b + offset_in_block; 178 | rhs_d[local_diag_idx] = static_cast<double>(inc1.pow(N*N + global_diag_idx) * stat0); 179 | } 180 | } 181 | } 182 | 183 | template <typename F> __global__ void minus_05(F *A, long m, long n, int work_per_thread) 184 | { 185 | int idx_x = GPU_BLOCKIDX_X * GPU_BLOCKDIM_X + GPU_THREADIDX_X; 186 | int idx_y = GPU_BLOCKIDX_Y * GPU_BLOCKDIM_Y + GPU_THREADIDX_Y; 187 | const uint64_t A_row = idx_x; 188 | uint64_t A_col, A_idx; 189 | 190 | if (A_row < m) 191 | { 192 | for(int i = 0 ; i < work_per_thread; i++) 193 | { 194 | A_col = work_per_thread * idx_y + i; 195 | if (A_col < n) 196 | { 197 | A_idx = A_row + A_col * (uint64_t)m; 198 | A[A_idx] -= 0.5; 199 | } 200 | } 201 | } 202 | } 203 | 204 | -------------------------------------------------------------------------------- /src/svesim.hpp: -------------------------------------------------------------------------------- 1 | #ifndef SVESIM_HPP 2 | #define SVESIM_HPP 3 | 4 | // ARM SVE wrapper. 5 | // Reimplement with SSE, AVX, or something if performance is important. 6 | // (basically, we do not use this wrapper in time-consuming parts.) 7 | // Add functions if needed.
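// Usage sketch (added illustration, not part of the original header): the
// fallback below models each intrinsic as a loop over struct-of-lanes
// values guarded by the predicate, so predicated code such as this axpy
// compiles against both real SVE and the emulation:
#if 0
static void axpy_sve(int64_t n, float a, double const *x, double *y) {
    for (int64_t i = 0; i < n; i += svcntd()) {
        svbool_t pg = svwhilelt_b64(i, n);           // masks off the tail
        svfloat64_t vx = svld1_vnum_f64(pg, x + i, 0);
        svfloat64_t vy = svld1_vnum_f64(pg, y + i, 0);
        svst1_vnum_f64(pg, y + i, 0, svmla_n_f64_x(pg, vy, vx, a)); // y += a*x
    }
}
#endif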
8 | 9 | #ifdef __ARM_FEATURE_SVE 10 | #include 11 | #else 12 | #include "fp16sim.hpp" 13 | #include 14 | #include 15 | #define SVE_VLEN 64 16 | #define svcntd() (SVE_VLEN / 8) 17 | #define svcntw() (SVE_VLEN / 4) 18 | #define svcnth() (SVE_VLEN / 2) 19 | struct svbool_t { 20 | bool x[SVE_VLEN]; 21 | }; 22 | static svbool_t svptrue_b64() { 23 | svbool_t r; 24 | for (int i = 0; i < SVE_VLEN / 8; ++i) 25 | r.x[8 * i] = true; 26 | return r; 27 | } 28 | static svbool_t svwhilelt_b64(int64_t begin, int64_t end) { 29 | svbool_t r; 30 | for (int i = 0; i < SVE_VLEN / 8; ++i) 31 | r.x[8 * i] = i + begin < end; 32 | return r; 33 | } 34 | static svbool_t svptrue_b32() { 35 | svbool_t r; 36 | for (int i = 0; i < SVE_VLEN / 4; ++i) 37 | r.x[4 * i] = true; 38 | return r; 39 | } 40 | static svbool_t svwhilelt_b32(int64_t begin, int64_t end) { 41 | svbool_t r; 42 | for (int i = 0; i < SVE_VLEN / 4; ++i) 43 | r.x[4 * i] = i + begin < end; 44 | return r; 45 | } 46 | static svbool_t svptrue_b16() { 47 | svbool_t r; 48 | for (int i = 0; i < SVE_VLEN / 2; ++i) 49 | r.x[2 * i] = true; 50 | return r; 51 | } 52 | static svbool_t svwhilelt_b16(int64_t begin, int64_t end) { 53 | svbool_t r; 54 | for (int i = 0; i < SVE_VLEN / 2; ++i) 55 | r.x[2 * i] = i + begin < end; 56 | return r; 57 | } 58 | 59 | struct svint64_t { 60 | int64_t x[SVE_VLEN / 8]; 61 | }; 62 | static svint64_t svdup_s64(int64_t x) { 63 | svint64_t r; 64 | for (int i = 0; i < SVE_VLEN / 8; ++i) 65 | r.x[i] = x; 66 | return r; 67 | } 68 | static svint64_t svld1_s64(svbool_t t, int64_t const *x) { 69 | svint64_t r; 70 | for (int i = 0; i < SVE_VLEN / 8; ++i) 71 | r.x[i] = (t.x[8 * i] ? x[i] : 0ll); 72 | return r; 73 | } 74 | static svint64_t svmad_s64_x(svbool_t t, svint64_t a, svint64_t b, 75 | svint64_t c) { 76 | svint64_t r; 77 | for (int i = 0; i < SVE_VLEN / 8; ++i) 78 | r.x[i] = (t.x[8 * i] ? a.x[i] * b.x[i] + c.x[i] : a.x[i]); 79 | return r; 80 | } 81 | static svint64_t svindex_s64(int64_t base, int64_t step) { 82 | svint64_t r; 83 | for (int i = 0; i < SVE_VLEN / 8; ++i) 84 | r.x[i] = base + i * step; 85 | return r; 86 | } 87 | struct svfloat64_t { 88 | double x[SVE_VLEN / 8]; 89 | }; 90 | static svfloat64_t svdup_f64(double x) { 91 | svfloat64_t r; 92 | for (int i = 0; i < SVE_VLEN / 8; ++i) 93 | r.x[i] = x; 94 | return r; 95 | } 96 | static svfloat64_t svld1_vnum_f64(svbool_t t, double const *x, int vnum) { 97 | svfloat64_t r; 98 | for (int i = 0; i < SVE_VLEN / 8; ++i) 99 | r.x[i] = (t.x[8 * i] ? x[vnum * SVE_VLEN / 8 + i] : 0.); 100 | return r; 101 | } 102 | static void svst1_vnum_f64(svbool_t t, double *x, int vnum, svfloat64_t r) { 103 | for (int i = 0; i < SVE_VLEN / 8; ++i) { 104 | if (t.x[8 * i]) 105 | x[vnum * SVE_VLEN / 8 + i] = r.x[i]; 106 | } 107 | } 108 | static svfloat64_t svadd_f64_x(svbool_t t, svfloat64_t a, svfloat64_t b) { 109 | svfloat64_t r; 110 | for (int i = 0; i < SVE_VLEN / 8; ++i) 111 | r.x[i] = (t.x[8 * i] ? a.x[i] + b.x[i] : a.x[i]); 112 | return r; 113 | } 114 | static svfloat64_t svmla_f64_x(svbool_t t, svfloat64_t a, svfloat64_t b, 115 | svfloat64_t c) { 116 | svfloat64_t r; 117 | for (int i = 0; i < SVE_VLEN / 8; ++i) 118 | r.x[i] = (t.x[8 * i] ? a.x[i] + b.x[i] * c.x[i] : a.x[i]); 119 | return r; 120 | } 121 | static svfloat64_t svmla_n_f64_x(svbool_t t, svfloat64_t a, svfloat64_t b, 122 | float c) { 123 | svfloat64_t r; 124 | for (int i = 0; i < SVE_VLEN / 8; ++i) 125 | r.x[i] = (t.x[8 * i] ? 
a.x[i] + b.x[i] * c : a.x[i]); 126 | return r; 127 | } 128 | static svfloat64_t svcvt_f64_s64_x(svbool_t t, svint64_t x) { 129 | svfloat64_t r; 130 | for (int i = 0; i < SVE_VLEN / 8; ++i) 131 | r.x[i] = (t.x[8 * i] ? x.x[i] : 0.); 132 | return r; 133 | } 134 | struct svfloat32_t { 135 | float x[SVE_VLEN / 4]; 136 | }; 137 | static svfloat32_t svdup_f32(float x) { 138 | svfloat32_t r; 139 | for (int i = 0; i < SVE_VLEN / 4; ++i) 140 | r.x[i] = x; 141 | return r; 142 | } 143 | static svfloat32_t svld1_vnum_f32(svbool_t t, float const *x, int vnum) { 144 | svfloat32_t r; 145 | for (int i = 0; i < SVE_VLEN / 4; ++i) 146 | r.x[i] = (t.x[4 * i] ? x[vnum * SVE_VLEN / 4 + i] : 0.f); 147 | return r; 148 | } 149 | static void svst1_vnum_f32(svbool_t t, float *x, int vnum, svfloat32_t r) { 150 | for (int i = 0; i < SVE_VLEN / 4; ++i) { 151 | if (t.x[4 * i]) 152 | x[vnum * SVE_VLEN / 4 + i] = r.x[i]; 153 | } 154 | } 155 | static svfloat32_t svadd_f32_x(svbool_t t, svfloat32_t a, svfloat32_t b) { 156 | svfloat32_t r; 157 | for (int i = 0; i < SVE_VLEN / 4; ++i) 158 | r.x[i] = (t.x[4 * i] ? a.x[i] + b.x[i] : a.x[i]); 159 | return r; 160 | } 161 | static svfloat32_t svmla_f32_x(svbool_t t, svfloat32_t a, svfloat32_t b, 162 | svfloat32_t c) { 163 | svfloat32_t r; 164 | for (int i = 0; i < SVE_VLEN / 4; ++i) 165 | r.x[i] = (t.x[4 * i] ? a.x[i] + b.x[i] * c.x[i] : a.x[i]); 166 | return r; 167 | } 168 | static svfloat32_t svmls_f32_x(svbool_t t, svfloat32_t a, svfloat32_t b, 169 | svfloat32_t c) { 170 | svfloat32_t r; 171 | for (int i = 0; i < SVE_VLEN / 4; ++i) 172 | r.x[i] = (t.x[4 * i] ? a.x[i] - b.x[i] * c.x[i] : a.x[i]); 173 | return r; 174 | } 175 | 176 | static svfloat32_t svnmls_f32_x(svbool_t t, svfloat32_t a, svfloat32_t b, 177 | svfloat32_t c) { 178 | svfloat32_t r; 179 | for (int i = 0; i < SVE_VLEN / 4; ++i) 180 | r.x[i] = (t.x[4 * i] ? b.x[i] * c.x[i] - a.x[i] : a.x[i]); 181 | return r; 182 | } 183 | 184 | struct svfloat16_t { 185 | fp16 x[SVE_VLEN / 2]; 186 | }; 187 | struct svint16_t { 188 | short x[SVE_VLEN / 2]; 189 | }; 190 | 191 | 192 | //JAE Edit for Clang/LLVM 193 | static svfloat16_t svdup_f16(fp16 *x) { 194 | svfloat16_t r; 195 | for (int i = 0; i < SVE_VLEN / 2; ++i) 196 | r.x[i] = *x; 197 | return r; 198 | } 199 | 200 | 201 | static svfloat16_t svld1_vnum_f16(svbool_t t, fp16 const *x, int vnum) { 202 | svfloat16_t r; 203 | for (int i = 0; i < SVE_VLEN / 2; ++i) 204 | r.x[i] = (t.x[2 * i] ? x[vnum * SVE_VLEN / 2 + i] : fp16(0.f)); 205 | return r; 206 | } 207 | static void svst1_vnum_f16(svbool_t t, fp16 *x, int vnum, svfloat16_t r) { 208 | for (int i = 0; i < SVE_VLEN / 2; ++i) { 209 | if (t.x[2 * i]) 210 | x[vnum * SVE_VLEN / 2 + i] = r.x[i]; 211 | } 212 | } 213 | static svfloat16_t svadd_f16_x(svbool_t t, svfloat16_t a, svfloat16_t b) { 214 | svfloat16_t r; 215 | for (int i = 0; i < SVE_VLEN / 2; ++i) 216 | r.x[i] = (t.x[2 * i] ? a.x[i] + b.x[i] : a.x[i]); 217 | return r; 218 | } 219 | static svfloat16_t svsub_f16_x(svbool_t t, svfloat16_t a, svfloat16_t b) { 220 | svfloat16_t r; 221 | for (int i = 0; i < SVE_VLEN / 2; ++i) 222 | r.x[i] = (t.x[2 * i] ? a.x[i] - b.x[i] : a.x[i]); 223 | return r; 224 | } 225 | static svfloat16_t svmul_f16_x(svbool_t t, svfloat16_t a, svfloat16_t b) { 226 | svfloat16_t r; 227 | for (int i = 0; i < SVE_VLEN / 2; ++i) 228 | r.x[i] = (t.x[2 * i] ? 
a.x[i] * b.x[i] : a.x[i]); 229 | return r; 230 | } 231 | static svfloat16_t svmla_f16_x(svbool_t t, svfloat16_t a, svfloat16_t b, 232 | svfloat16_t c) { 233 | svfloat16_t r; 234 | for (int i = 0; i < SVE_VLEN / 2; ++i) 235 | r.x[i] = (t.x[2 * i] ? a.x[i] + b.x[i] * c.x[i] : a.x[i]); 236 | return r; 237 | } 238 | static svfloat16_t svmls_f16_x(svbool_t t, svfloat16_t a, svfloat16_t b, 239 | svfloat16_t c) { 240 | svfloat16_t r; 241 | for (int i = 0; i < SVE_VLEN / 2; ++i) 242 | r.x[i] = (t.x[2 * i] ? a.x[i] - b.x[i] * c.x[i] : a.x[i]); 243 | return r; 244 | } 245 | static svfloat16_t svnmls_f16_x(svbool_t t, svfloat16_t a, svfloat16_t b, 246 | svfloat16_t c) { 247 | svfloat16_t r; 248 | for (int i = 0; i < SVE_VLEN / 2; ++i) 249 | r.x[i] = (t.x[2 * i] ? b.x[i] * c.x[i] - a.x[i] : a.x[i]); 250 | return r; 251 | } 252 | static svfloat16_t svrintn_f16_x(svbool_t t, svfloat16_t a) { 253 | svfloat16_t r; 254 | for (int i = 0; i < SVE_VLEN / 2; ++i) 255 | r.x[i] = (t.x[2 * i] ? fp16(roundf((float)a.x[i])) : a.x[i]); 256 | return r; 257 | } 258 | static svint16_t svcvt_s16_f16_x(svbool_t t, svfloat16_t a) { 259 | svint16_t r; 260 | for (int i = 0; i < SVE_VLEN / 2; ++i) 261 | r.x[i] = (t.x[2 * i] ? (short)(float)a.x[i] : (short)0); 262 | return r; 263 | } 264 | 265 | static svint16_t svld1_vnum_s16(svbool_t t, short const *x, int vnum) { 266 | svint16_t r; 267 | for (int i = 0; i < SVE_VLEN / 2; ++i) 268 | r.x[i] = (t.x[2 * i] ? x[vnum * SVE_VLEN / 2 + i] : (short)0); 269 | return r; 270 | } 271 | static void svst1_vnum_s16(svbool_t t, short *x, int vnum, svint16_t r) { 272 | for (int i = 0; i < SVE_VLEN / 2; ++i) { 273 | if (t.x[2 * i]) 274 | x[vnum * SVE_VLEN / 2 + i] = r.x[i]; 275 | } 276 | } 277 | static svint16_t svqadd_s16(svint16_t a, svint16_t b) { 278 | int const max = (1 << 15) - 1; 279 | int const min = -(1 << 15); 280 | svint16_t r; 281 | for (int i = 0; i < SVE_VLEN / 2; ++i) { 282 | int x = a.x[i]; 283 | int y = b.x[i]; 284 | if (x + y >= max) 285 | r.x[i] = max; 286 | else if (x + y <= min) 287 | r.x[i] = min; 288 | else 289 | r.x[i] = x + y; 290 | } 291 | return r; 292 | } 293 | #endif 294 | #endif 295 | -------------------------------------------------------------------------------- /src/rocm_device_macros.h: -------------------------------------------------------------------------------- 1 | 2 | #ifndef __HPLAI_HIP_DEVICE_MACROS__ 3 | #define __HPLAI_HIP_DEVICE_MACROS__ 4 | 5 | //#include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | //#include 13 | 14 | #define CHECK_BIT(var,pos) ( (var>>pos) & 1 ) 15 | 16 | // #define GPU_EVENT 17 | 18 | #ifdef GPU_EVENT 19 | #define GPU_EVENTCREATE(a) hipEventCreate(a) 20 | #define GPU_EVENTRECORD(a,b) hipEventRecord(a,b) 21 | #define GPU_EVENTSYNC(a) hipEventSynchronize(a) 22 | #define GPU_EVENTELAPSED(a,b,c) hipEventElapsedTime(a,b,c) 23 | #define GPU_EVENTDESTROY(a) hipEventDestroy(a) 24 | #else 25 | #define GPU_EVENTCREATE(a) { } 26 | #define GPU_EVENTRECORD(a,b) { } 27 | #define GPU_EVENTSYNC(a) { } 28 | #define GPU_EVENTELAPSED(a,b,c) { } 29 | #define GPU_EVENTDESTROY(a) { } 30 | #endif 31 | 32 | //#define SKIP 33 | #ifdef SKIP 34 | #define checkGPUblas(a) { } 35 | #endif 36 | 37 | //#define DOUBLE 38 | 39 | // *** BASIC HIP MACROS *** 40 | // Kernel Macros 41 | #define GPU_BLOCKIDX_X \ 42 | hipBlockIdx_x 43 | 44 | #define GPU_BLOCKIDX_Y \ 45 | hipBlockIdx_y 46 | 47 | #define GPU_BLOCKIDX_Z \ 48 | hipBlockIdx_z 49 | 50 | 51 | #define GPU_THREADIDX_X \ 52 | hipThreadIdx_x 53 | 54 | #define 
GPU_THREADIDX_Y \ 55 | hipThreadIdx_y 56 | 57 | #define GPU_THREADIDX_Z \ 58 | hipThreadIdx_z 59 | 60 | 61 | #define GPU_BLOCKDIM_X \ 62 | hipBlockDim_x 63 | 64 | #define GPU_BLOCKDIM_Y \ 65 | hipBlockDim_y 66 | 67 | #define GPU_BLOCKDIM_Z \ 68 | hipBlockDim_z 69 | 70 | 71 | #define GPU_GRIDDIM_X \ 72 | hipGridDim_x 73 | 74 | #define GPU_GRIDDIM_Y \ 75 | hipGridDim_y 76 | 77 | #define GPU_GRIDDIM_Z \ 78 | hipGridDim_z 79 | 80 | 81 | // Types 82 | #define GPU_ERROR_T \ 83 | hipError_t 84 | 85 | #define GPU_STREAM_T \ 86 | hipStream_t 87 | 88 | 89 | // Enums 90 | #define GPU_SUCCESS \ 91 | hipSuccess 92 | 93 | #define GPU_STREAM_NON_BLOCKING \ 94 | hipStreamNonBlocking 95 | 96 | #ifdef DOUBLE 97 | #define GPU_R_16F \ 98 | rocblas_datatype_f32_r 99 | // rocblas_datatype_f64_r 100 | 101 | #define GPU_R_32F \ 102 | rocblas_datatype_f32_r 103 | // rocblas_datatype_f64_r 104 | 105 | #else 106 | 107 | #define GPU_R_16F \ 108 | rocblas_datatype_f16_r 109 | 110 | #define GPU_R_32F \ 111 | rocblas_datatype_f32_r 112 | #endif 113 | 114 | #define GPU_R_64F \ 115 | rocblas_datatype_f64_r 116 | 117 | #define GPU_MEMCPY_DEVICE_TO_HOST \ 118 | hipMemcpyDeviceToHost 119 | 120 | #define GPU_MEMCPY_HOST_TO_DEVICE \ 121 | hipMemcpyHostToDevice 122 | 123 | // Kernels 124 | #define GPU_DEVICE_RESET() \ 125 | hipDeviceReset() 126 | 127 | #define GPU_SET_DEVICE(deviceID) \ 128 | hipSetDevice(deviceID) 129 | 130 | #define GPU_DEVICE_SYNCHRONIZE() \ 131 | hipDeviceSynchronize() 132 | 133 | #define GPU_THREAD_SYNCHRONIZE(threadID) \ 134 | hipStreamSynchronize(threadID); 135 | 136 | #define GPU_FREE(memPointer) \ 137 | hipFree(memPointer) 138 | 139 | #define GPU_FREE_HOST(memPointer) \ 140 | hipHostFree(memPointer) 141 | 142 | #define GPU_GET_ERROR_STRING(hipError) \ 143 | hipGetErrorString(hipError) 144 | 145 | #define GPU_GET_LAST_ERROR() \ 146 | hipGetLastError() 147 | 148 | #define GPU_MALLOC(memAddress, numBytes) \ 149 | hipMalloc(memAddress, numBytes) 150 | 151 | #define GPU_MALLOC_HOST(memAddress, numBytes) \ 152 | hipHostMalloc(memAddress, numBytes) 153 | 154 | #define GPU_MALLOC_MANAGED(memAddress, numBytes) \ 155 | hipMallocManaged(memAddress, numBytes) 156 | 157 | #define GPU_MEMCPY(memPointer_to, memPointer_from, numBytes, directionEnum) \ 158 | hipMemcpy(memPointer_to, memPointer_from, numBytes, directionEnum) 159 | 160 | #define GPU_MEMCPY_2D(memPointer_to, pitchBytes_to, memPointer_from, pitchBytes_from, numBytes_W, numBytes_H, directionEnum) \ 161 | hipMemcpy2D(memPointer_to, pitchBytes_to, memPointer_from, pitchBytes_from, numBytes_W, numBytes_H, directionEnum) 162 | 163 | #define GPU_MEMCPY_DEVICE_TO_DEVICE \ 164 | hipMemcpyDeviceToDevice 165 | 166 | #define GPU_MEM_GET_INFO(freeMem, totalMem) \ 167 | hipMemGetInfo(freeMem, totalMem) 168 | 169 | #define GPU_STREAM_CREATE_WITH_FLAGS(hipStream, streamTypeEnum) \ 170 | hipStreamCreateWithFlags(hipStream, streamTypeEnum) 171 | 172 | #define GPU_DAXPY(blasHandle, size, alpha, A, Ainc, B, Binc) \ 173 | rocblas_daxpy(blasHandle, size, alpha, A, Ainc, B, Binc) 174 | 175 | #define GPU_DTRSV(blasHandle, rocFill, rocOp, rocDiag, M_dim, memPointer_A, lda, memPointer_B, Binc ) \ 176 | rocblas_dtrsv(blasHandle, rocFill, rocOp, rocDiag, M_dim, memPointer_A, lda, memPointer_B, Binc) 177 | 178 | 179 | 180 | // *** HIPBLAS MACROS *** 181 | // Types 182 | #define GPUBLAS_HANDLE_T \ 183 | rocblas_handle 184 | 185 | #define GPUBLAS_STATUS_T \ 186 | rocblas_status 187 | 188 | // Enums 189 | #define GPUBLAS_STATUS_SUCCESS \ 190 | rocblas_status_success 191 | 192 | #define 
GPUBLAS_STATUS_NOT_INITIALIZED \ 193 | rocblas_status_invalid_handle 194 | 195 | #define GPUBLAS_STATUS_ALLOC_FAILED \ 196 | rocblas_status_memory_error 197 | 198 | #define GPUBLAS_STATUS_INVALID_VALUE \ 199 | rocblas_status_invalid_value 200 | 201 | // No good option for rocblas_status 202 | #define GPUBLAS_STATUS_ARCH_MISMATCH \ 203 | rocblas_status_perf_degraded 204 | 205 | #define GPUBLAS_STATUS_MAPPING_ERROR \ 206 | rocblas_status_invalid_pointer 207 | 208 | #define GPUBLAS_STATUS_EXECUTION_FAILED \ 209 | rocblas_status_invalid_size 210 | 211 | #define GPUBLAS_STATUS_INTERNAL_ERROR \ 212 | rocblas_status_internal_error 213 | 214 | #define GPUBLAS_OP_N \ 215 | rocblas_operation_none 216 | 217 | #define GPUBLAS_OP_T \ 218 | rocblas_operation_transpose 219 | 220 | #define GPUBLAS_SIDE_RIGHT \ 221 | rocblas_side_right 222 | 223 | #define GPUBLAS_SIDE_LEFT \ 224 | rocblas_side_left 225 | 226 | #define GPUBLAS_FILL_MODE_UPPER \ 227 | rocblas_fill_upper 228 | 229 | #define GPUBLAS_FILL_MODE_LOWER \ 230 | rocblas_fill_lower 231 | 232 | #define GPUBLAS_DIAG_UNIT \ 233 | rocblas_diagonal_unit 234 | 235 | #define GPUBLAS_DIAG_NON_UNIT \ 236 | rocblas_diagonal_non_unit 237 | 238 | // Kernels 239 | #define GPUBLAS_CREATE(rocblasHandle) \ 240 | rocblas_create_handle(rocblasHandle) 241 | 242 | #define GPUBLAS_SET_STREAM(rocblasHandle, hipStream) \ 243 | rocblas_set_stream(rocblasHandle, hipStream) 244 | 245 | //#define GPUBLAS_SGETRF_BATCHED(hipblasHandle, N_dim, memPointer_A, lda, memPointer_Pivot, memPointer_Info, batchSize) \ 246 | // hipblasSgetrfBatched(hipblasHandle, N_dim, memPointer_A, lda, memPointer_Pivot, memPointer_Info, batchSize) 247 | 248 | #define GPUBLAS_STRSM(rocblasHandle, rocSide, rocFill, rocOp, rocDiag, M_dim, N_dim, alpha, memPointer_A, lda, memPointer_B, ldb) \ 249 | rocblas_strsm(rocblasHandle, rocSide, rocFill, rocOp, rocDiag, M_dim, N_dim, alpha, memPointer_A, lda, memPointer_B, ldb) 250 | //dbl rocblas_dtrsm(rocblasHandle, rocSide, rocFill, rocOp, rocDiag, M_dim, N_dim, alpha, memPointer_A, lda, memPointer_B, ldb) 251 | 252 | #define GPU_daxpy(handle, n, alpha, x, incx, y, incy) \ 253 | rocblas_daxpy(handle, n, alpha, x, incx, y, incy) 254 | 255 | #define GPU_dscal(handle, n, alpha, x, incx) \ 256 | rocblas_dscal(handle, n, alpha, x, incx) 257 | 258 | #define GPU_setValue(x, value, size) \ 259 | hipMemset(x, value, size) 260 | 261 | 262 | // Non-Simple Kernels 263 | //#define GPUBLAS_GET_ERROR_STRING(hipblasStatus) \ 264 | // hipblasGetErrorString(hipblasStatus) 265 | 266 | 267 | #define GPUBLAS_SGEMM_EX(hipblasHandle, hipOp_A, hipOp_B, M_dim, N_dim, k_dim, alpha, memPointer_A, datatype_A, lda, memPointer_B, datatype_B, ldb, beta, memPointer_C, datatype_C, ldc) \ 268 | rocblas_gemm_ex(hipblasHandle, hipOp_A, hipOp_B, M_dim, N_dim, k_dim, alpha, memPointer_A, datatype_A, lda, memPointer_B, datatype_B, ldb, beta, memPointer_C, datatype_C, ldc, memPointer_C, datatype_C, ldc, datatype_C, rocblas_gemm_algo_standard, 0, 0) 269 | 270 | 271 | 272 | // *** HIPSOLVER MACROS *** 273 | // Types 274 | #define GPUSOLVER_HANDLE_T \ 275 | rocblas_handle 276 | 277 | #define GPUSOLVER_STATUS_T \ 278 | rocblas_status 279 | 280 | 281 | // Enums 282 | #define GPUSOLVER_STATUS_SUCCESS \ 283 | rocblas_status_success 284 | 285 | 286 | // Kernels 287 | #define GPUSOLVER_CREATE(rocsolverHandle) \ 288 | rocblas_create_handle(rocsolverHandle) 289 | 290 | #define GPUSOLVER_SGETRF(rocsolverHandle, M_dim, N_dim, memPointer, lda, memAddress_Buffer, memPointer_Pivot, memPointer_Info) \ 291 | 
rocsolver_sgetrf_npvt(rocsolverHandle, M_dim, N_dim, memPointer, lda, memPointer_Info) 292 | //dbl rocsolver_dgetrf_npvt(rocsolverHandle, M_dim, N_dim, memPointer, lda, memPointer_Info) 293 | 294 | //#define GPUSOLVER_SGETRF_BUFFERSIZE(hipsolverDnHandle, M_dim, N_dim, memPointer, lda, memAddress_Buffer, memPointer_Pivot, memPointer_Info) \ 295 | // hipsolverDnSgetrf_bufferSize(hipsolverDnHandle, M_dim, N_dim, memPointer, lda, memAddress_Buffer, memPointer_Pivot, memPointer_Info) 296 | 297 | //#define GPUSOLVER_SGETRF_BUFFERSIZE(hipsolverDnHandle, M_dim, N_dim, memPointer, lda, memAddress_Buffer) \ 298 | // hipsolverDnSgetrf_bufferSize(hipsolverDnHandle, M_dim, N_dim, memPointer, lda, memAddress_Buffer) 299 | 300 | #define GPUSOLVER_SET_STREAM(rocsolverHandle, hipStream) \ 301 | rocblas_set_stream(rocsolverHandle, hipStream) 302 | 303 | 304 | 305 | #if 0 306 | // *** HIPRAND MACROS *** 307 | // Types 308 | #define GPURAND_GENERATOR_T \ 309 | hiprandGenerator_t 310 | 311 | 312 | // Enums 313 | #define GPURAND_RNG_PSEUDO_DEFAULT \ 314 | HIPRAND_RNG_PSEUDO_DEFAULT 315 | 316 | 317 | // Kernels 318 | #define GPURAND_CREATE_GENERATOR(hiprandGenerator, hiprandRngType) \ 319 | hiprandCreateGenerator(hiprandGenerator, hiprandRngType) 320 | 321 | #define GPURAND_DESTROY_GENERATOR(hiprandGenerator) \ 322 | hiprandDestroyGenerator(hiprandGenerator) 323 | 324 | #define GPURAND_GENERATE_UNIFORM(hiprandGenerator, memPointer, numBytes) \ 325 | hiprandGenerateUniform(hiprandGenerator, memPointer, numBytes) 326 | 327 | #define GPURAND_GENERATE_UNIFORM_DOUBLE(hiprandGenerator, memPointer, numBytes) \ 328 | hiprandGenerateUniformDouble(hiprandGenerator, memPointer, numBytes) 329 | 330 | #define GPURAND_SET_PSEUDO_RANDOM_GENERATOR_SEED(hiprandGenerator, seed) \ 331 | hiprandSetPseudoRandomGeneratorSeed(hiprandGenerator, seed) 332 | 333 | #endif 334 | 335 | #endif // __HPLAI_HIP_DEVICE_MACROS__ 336 | -------------------------------------------------------------------------------- /src/fp16sim.hpp: -------------------------------------------------------------------------------- 1 | #ifndef FP16SIM_HPP 2 | #define FP16SIM_HPP 3 | 4 | #include "device_macros.h" 5 | 6 | // a very small wrapper for fp16. 7 | 8 | //#define BF_NMANT 3 9 | 10 | #ifdef __aarch64__ 11 | #if !defined(__FUJITSU) && !defined(__CLANG_FUJITSU) 12 | //# define FP16_NATIVE_SUPPORT 13 | #define FP16_AUTO_PROMOTION 14 | #else 15 | #define FP16_FUJITSU_TRAD_MODE 16 | #endif 17 | #elif defined(BF_NMANT) 18 | #define FP16_BFLIKE_FLOAT 19 | #if BF_NMANT > 7 || BF_NMANT <= 1 20 | #error "too large or small mantissa for BFLIKE_FLOAT" 21 | #endif 22 | #elif defined(__AVX2__) 23 | #define FP16_AVX2_EMULATION 24 | #elif defined(__clang__) && __clang_major__ >= 8 25 | #define FP16_AUTO_PROMOTION 26 | #else 27 | #define FP16_IS_NOT_SUPPORTED 28 | #endif 29 | 30 | #ifdef FP16_NATIVE_SUPPORT 31 | typedef _Float16 fp16; 32 | 33 | inline void hgemm(int m, int n, int k, float alpha, fp16 const *a, int lda, 34 | fp16 const *b, int ldb, float beta, fp16 *c, int ldc) { 35 | // HGEMM 36 | // replace with native one for performance. 37 | for (int i = 0; i < n; ++i) { 38 | for (int j = 0; j < m; ++j) { 39 | fp16 temp(0.f); 40 | for (int l = 0; l < k; ++l) 41 | temp = a[l * lda + j] * b[i * ldb + l] + temp; 42 | c[ldc * i + j] = c[ldc * i + j] * beta + temp * alpha; 43 | } 44 | } 45 | } 46 | inline void shgemm(int m, int n, int k, float alpha, fp16 const *a, int lda, 47 | fp16 const *b, int ldb, float beta, float *c, int ldc) { 48 | // SHGEMM. 
HGEMM with fp32 accumulator. 49 | // replace with native one for performance. 50 | for (int i = 0; i < n; ++i) { 51 | for (int j = 0; j < m; ++j) { 52 | float temp = 0.f; 53 | for (int l = 0; l < k; ++l) 54 | temp = a[l * lda + j] * b[i * ldb + l] + temp; 55 | c[ldc * i + j] = c[ldc * i + j] * beta + temp * alpha; 56 | } 57 | } 58 | } 59 | #endif 60 | 61 | #ifdef FP16_FUJITSU_TRAD_MODE 62 | // and CLANG mode 63 | #include 64 | extern "C" void fjblas_gemm_r16_(...); 65 | typedef __fp16 fp16; 66 | inline void hgemm(int m, int n, int k, float alpha, fp16 const *a, int lda, 67 | fp16 const *b, int ldb, float beta, fp16 *c, int ldc) { 68 | (void)alpha; 69 | (void)beta; 70 | short one = 15360; // == 1. 71 | short mone = -17408; // == -1. 72 | fjblas_gemm_r16_("N", "N", &m, &n, &k, &mone, a, &lda, b, &ldb, &one, c, 73 | &ldc); 74 | } 75 | inline void shgemm(int, int, int, float, fp16 const *, int, fp16 const *, int, 76 | float, float *, int) { 77 | abort(); 78 | } 79 | #endif 80 | 81 | #ifdef FP16_AUTO_PROMOTION 82 | typedef __fp16 fp16; 83 | 84 | inline void hgemm(int m, int n, int k, float alpha, fp16 const *a, int lda, 85 | fp16 const *b, int ldb, float beta, fp16 *c, int ldc) { 86 | for (int i = 0; i < n; ++i) { 87 | for (int j = 0; j < m; ++j) { 88 | fp16 temp(0); 89 | for (int l = 0; l < k; ++l) 90 | temp = a[l * lda + j] * b[i * ldb + l] + temp; 91 | c[ldc * i + j] = c[ldc * i + j] * beta + temp * alpha; 92 | } 93 | } 94 | } 95 | inline void shgemm(int m, int n, int k, float alpha, fp16 const *a, int lda, 96 | fp16 const *b, int ldb, float beta, float *c, int ldc) { 97 | for (int i = 0; i < n; ++i) { 98 | for (int j = 0; j < m; ++j) { 99 | float temp = 0.f; 100 | for (int l = 0; l < k; ++l) 101 | temp = a[l * lda + j] * b[i * ldb + l] + temp; 102 | c[ldc * i + j] = c[ldc * i + j] * beta + temp * alpha; 103 | } 104 | } 105 | } 106 | #endif 107 | 108 | #if defined(FP16_AVX2_EMULATION) || defined(FP16_BFLIKE_FLOAT) 109 | #ifdef FP16_AVX2_EMULATION 110 | #include 111 | struct fp16 { 112 | unsigned short x; 113 | fp16() {} 114 | fp16(const fp16 &rhs) : x(rhs.x) {} 115 | fp16 &operator=(fp16 rhs) { 116 | x = rhs.x; 117 | return *this; 118 | } 119 | fp16(float t) { x = _cvtss_sh(t, 0); } 120 | float convert_to_float() const { return _cvtsh_ss(x); } 121 | explicit operator float() const { return convert_to_float(); } 122 | explicit operator double() const { 123 | return static_cast(convert_to_float()); 124 | } 125 | 126 | fp16 operator+(fp16 rhs) const { 127 | return this->convert_to_float() + rhs.convert_to_float(); 128 | } 129 | fp16 operator-(fp16 rhs) const { 130 | return this->convert_to_float() - rhs.convert_to_float(); 131 | } 132 | fp16 operator*(fp16 rhs) const { 133 | return this->convert_to_float() * rhs.convert_to_float(); 134 | } 135 | }; 136 | 137 | #endif 138 | 139 | #ifdef FP16_BFLIKE_FLOAT 140 | #include 141 | #include 142 | #include 143 | struct fp16 { 144 | uint16_t x; 145 | fp16() {} 146 | fp16(const fp16 &rhs) : x(rhs.x) {} 147 | fp16 &operator=(fp16 rhs) { 148 | x = rhs.x; 149 | return *this; 150 | } 151 | fp16(float f) { 152 | uint32_t t = *reinterpret_cast(&f); 153 | uint32_t exp = t & 0x7f800000u; 154 | uint32_t mant = t & 0x007fffffu; 155 | int shift = 16 + 7 - BF_NMANT; 156 | x = (t >> shift); 157 | if (mant & (1u << (shift - 1))) { 158 | uint32_t lowmant = mant & ((1u << shift) - 1u); 159 | uint32_t halfway = 1u << (shift - 1); 160 | if (lowmant > halfway || (x & 0x1u)) 161 | ++x; 162 | } 163 | /*{ 164 | float o = this->convert_to_float(); 165 | float e = (f==0.f? 
fabs(o-f): fabs(o-f)/fabs(f)); 166 | if(e>1e-1) printf("XX %x %x %.15e -> %.15e :: %f\n", t, 167 | (uint32_t)x, f, o, e); 168 | }*/ 169 | } 170 | float convert_to_float() const { 171 | // upcast is easy 172 | uint32_t t = ((uint32_t)x) << (16 + 7 - BF_NMANT); 173 | return *(float *)&t; 174 | } 175 | explicit operator float() const { return convert_to_float(); } 176 | explicit operator double() const { 177 | return static_cast(convert_to_float()); 178 | } 179 | 180 | fp16 operator+(fp16 rhs) const { 181 | return this->convert_to_float() + rhs.convert_to_float(); 182 | } 183 | fp16 operator-(fp16 rhs) const { 184 | return this->convert_to_float() - rhs.convert_to_float(); 185 | } 186 | fp16 operator*(fp16 rhs) const { 187 | return this->convert_to_float() * rhs.convert_to_float(); 188 | } 189 | }; 190 | #endif 191 | 192 | // double rounding causes larger error in very rare case. we ignore it for 193 | // performance 194 | inline float fp16_fma(fp16 a, fp16 b, fp16 c) { 195 | float fa = a.convert_to_float(); 196 | float fb = b.convert_to_float(); 197 | float fc = c.convert_to_float(); 198 | return fa * fb + fc; 199 | } 200 | 201 | void hgemm_opt(int m, int n, int k, float alpha, fp16 const *a, int lda, 202 | fp16 const *b, int ldb, float /*beta*/, fp16 *c, int ldc); 203 | inline void hgemm(int m, int n, int k, float alpha, fp16 const *a, int lda, 204 | fp16 const *b, int ldb, float beta, fp16 *c, int ldc) { 205 | #ifdef FP16_AVX2_EMULATION 206 | if (beta == 1.f) { // remove this in the case for test 207 | hgemm_opt(m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); 208 | return; 209 | } 210 | #endif 211 | for (int j = 0; j < m; ++j) { 212 | for (int i = 0; i < n; ++i) { 213 | fp16 temp(0.f); 214 | for (int l = 0; l < k; ++l) 215 | temp = fp16_fma(a[l * lda + j], b[i * ldb + l], temp); 216 | c[ldc * i + j] = (c[ldc * i + j].convert_to_float() * beta + 217 | temp.convert_to_float() * alpha); 218 | } 219 | } 220 | } 221 | 222 | void shgemm_opt(int m, int n, int k, float alpha, fp16 const *a, int lda, 223 | fp16 const *b, int ldb, float /*beta*/, float *c, int ldc); 224 | inline void shgemm(int m, int n, int k, float alpha, fp16 const *a, int lda, 225 | fp16 const *b, int ldb, float beta, float *c, int ldc) { 226 | #ifdef FP16_AVX2_EMULATION 227 | if (beta == 1.f) { // remove this in the case of test 228 | shgemm_opt(m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); 229 | return; 230 | } 231 | #endif 232 | for (int j = 0; j < m; ++j) { 233 | for (int i = 0; i < n; ++i) { 234 | float temp = 0.f; 235 | for (int l = 0; l < k; ++l) 236 | temp = a[l * lda + j].convert_to_float() * 237 | b[i * ldb + l].convert_to_float() + 238 | temp; 239 | c[ldc * i + j] = c[ldc * i + j] * beta + temp * alpha; 240 | } 241 | } 242 | } 243 | 244 | #endif 245 | 246 | #ifdef FP16_IS_NOT_SUPPORTED 247 | #warning "FP16 IS NOT SUPPORTED" 248 | typedef unsigned short fp16; 249 | 250 | // do nothing. 251 | inline void hgemm(...) {} 252 | inline void shgemm(...) {} 253 | 254 | #endif 255 | 256 | #if 0 257 | // test code 258 | #include "fp16sim.hpp" 259 | #include 260 | #include 261 | 262 | int main(){ 263 | int m = 300, n = 210, k=101; 264 | fp16*a = (fp16*)malloc(sizeof(fp16)*m*k); 265 | fp16*b = (fp16*)malloc(sizeof(fp16)*k*n); 266 | fp16*c = (fp16*)malloc(sizeof(fp16)*m*n); 267 | fp16*c2 = (fp16*)malloc(sizeof(fp16)*m*n); 268 | for(int j=0; j error ? 
t: error; 279 | //std::printf("%d %d %e %e\n", i, j, (float)c[m*j+i], (float)c2[m*j+i]); 280 | } 281 | std::printf("hgemm error = %e\n", error); 282 | 283 | float*sc = (float*)malloc(sizeof(float)*m*n); 284 | float*sc2 = (float*)malloc(sizeof(float)*m*n); 285 | for(int j=0; j error ? t: error; 294 | //std::printf("%d %d %e %e\n", i, j, (float)sc[m*j+i], (float)sc2[m*j+i]); 295 | } 296 | std::printf("shgemm error = %e\n", error); 297 | 298 | return 0; 299 | } 300 | #endif 301 | #endif 302 | -------------------------------------------------------------------------------- /src/grid.hpp: -------------------------------------------------------------------------------- 1 | #ifndef GRID_HPP 2 | #define GRID_HPP 3 | #include "fp16sim.hpp" 4 | #include 5 | #include 6 | #include 7 | #include "log.hpp" 8 | 9 | #ifdef __APPLE__ 10 | #define aligned_alloc(alignment, size) malloc(size) 11 | #endif 12 | 13 | extern int grank, gsize; 14 | extern int reorder[ 8 ]; 15 | 16 | enum NumaMap { 17 | // How to destribute NUMA processes to the process grid. 18 | ROWCONT, // continuous in row 19 | COLCONT, // continuous in column 20 | ROWDIST, // distributed (cyclic) over row 21 | COLDIST, // distributed (cyclic) over column 22 | CONT2D // continuous in 2x2. this is only for nnuma==4 23 | }; 24 | 25 | struct Grid { 26 | // vcomm is a communicator for vertical communication (inside a column) 27 | // row = id(vcomm), nrow = sz(vcomm) 28 | // hcomm is a communicator for horizontal communication (inside a row) 29 | // col = id(hcomm), ncol = sz(hcomm) 30 | int row, col; 31 | int nrow, ncol; 32 | int idnuma, nnuma; 33 | MPI_Comm vcomm, hcomm, commworld; 34 | Grid(MPI_Comm comm, int nrow, int numasize = 0, 35 | NumaMap map = NumaMap::ROWCONT) 36 | : commworld(comm) { 37 | assert(numasize >= 0); 38 | assert(numasize != 0 || map != NumaMap::ROWCONT); 39 | 40 | int rank, size; 41 | MPI_Comm_rank(comm, &rank); 42 | MPI_Comm_size(comm, &size); 43 | if ( gsize % nrow ) MPI_Abort(MPI_COMM_WORLD, 4); 44 | int ncol = gsize / nrow; 45 | int myrow, mycol; 46 | if ( numasize == 1 ) { 47 | PrintMsg("\tNode Grid - 2x3C"); 48 | idnuma = 0; 49 | nnuma = 1; 50 | int myNode = grank / 6; 51 | int myLocalID = grank % 6; 52 | int nodeRow = (myNode % (nrow / 2)) * 2; 53 | int nodeCol = (myNode / (nrow / 2)) * 3; 54 | myrow = (myLocalID % 2) + nodeRow; 55 | mycol = (myLocalID / 2) + nodeCol; 56 | } 57 | else if ( numasize == 2 ) { 58 | PrintMsg("\tNode Grid - 3x2C"); 59 | idnuma = 0; 60 | nnuma = 1; 61 | int myNode = grank / 6; 62 | int myLocalID = grank % 6; 63 | int nodeRow = (myNode % (nrow / 3)) * 2; 64 | int nodeCol = (myNode / (nrow / 3)) * 2; 65 | myrow = (myLocalID % 3) + nodeRow; 66 | mycol = (myLocalID / 3) + nodeCol; 67 | } 68 | else if ( numasize == 0 ) 69 | { 70 | PrintMsg("\tGlobal Column Major"); 71 | idnuma = 0; 72 | nnuma = 1; 73 | myrow = grank % nrow; 74 | mycol = grank / nrow; 75 | } 76 | else if ( numasize == 3 ) 77 | { 78 | PrintMsg("\tGlobal Row Major"); 79 | idnuma = 0; 80 | nnuma = 1; 81 | myrow = grank / ncol; 82 | mycol = grank % ncol; 83 | } 84 | else if ( numasize == 4 ) 85 | { 86 | PrintMsg("\tNode Grid - 2x4R"); 87 | idnuma = 0; 88 | nnuma = 1; 89 | int myNode = grank / 8; 90 | int myLocalID = grank % 8; 91 | int nodeRow = (myNode % (nrow / 2)) * 2; 92 | int nodeCol = (myNode / (nrow / 2)) * 4; 93 | myrow = myLocalID / 4; 94 | mycol = myLocalID % 4; 95 | myrow += nodeRow; 96 | mycol += nodeCol; 97 | } 98 | else if ( numasize == 5 ) 99 | { 100 | PrintMsg("\tNode Grid - 2x4C"); 101 | idnuma = 0; 102 | nnuma = 1; 
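// 2x4C packs eight ranks per node as a 2-row x 4-column tile, filling it
// column-major before moving to the next node. Worked example with nrow = 4:
// grank 13 -> myNode = 1, myLocalID = 5; nodeRow = (1 % 2) * 2 = 2,
// nodeCol = (1 / 2) * 4 = 0; myrow = 5 % 2 + 2 = 3, mycol = 5 / 2 + 0 = 2,
// so rank 13 lands at process-grid position (3, 2).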
103 | int myNode = grank / 8; 104 | int myLocalID = grank % 8; 105 | int nodeRow = (myNode % (nrow / 2)) * 2; 106 | int nodeCol = (myNode / (nrow / 2)) * 4; 107 | myrow = myLocalID % 2; 108 | mycol = myLocalID / 2; 109 | myrow += nodeRow; 110 | mycol += nodeCol; 111 | } 112 | else if ( numasize == 6 ) 113 | { 114 | PrintMsg("\tNode Grid - 4x2R"); 115 | idnuma = 0; 116 | nnuma = 1; 117 | int myNode = grank / 8; 118 | int myLocalID = grank % 8; 119 | int nodeRow = (myNode % (nrow / 4)) * 4; 120 | int nodeCol = (myNode / (nrow / 4)) * 2; 121 | myrow = myLocalID / 2; 122 | mycol = myLocalID % 2; 123 | myrow += nodeRow; 124 | mycol += nodeCol; 125 | } 126 | else if ( numasize == 7 ) 127 | { 128 | PrintMsg("\tNode Grid - 4x2C"); 129 | idnuma = 0; 130 | nnuma = 1; 131 | int myNode = grank / 8; 132 | int myLocalID = grank % 8; 133 | int nodeRow = (myNode % (nrow / 4)) * 4; 134 | int nodeCol = (myNode / (nrow / 4)) * 2; 135 | myrow = myLocalID % 4; 136 | mycol = myLocalID / 4; 137 | myrow += nodeRow; 138 | mycol += nodeCol; 139 | } 140 | else if ( numasize == 8 ) 141 | { 142 | PrintMsg("\tNode Grid - 1x8C"); 143 | idnuma = 0; 144 | nnuma = 1; 145 | int myNode = grank / 8; 146 | int myLocalID = grank % 8; 147 | int nodeRow = ( myNode % nrow ); 148 | int nodeCol = ( myNode / nrow ) * 8; 149 | myrow = myLocalID / 8; 150 | mycol = myLocalID % 8; 151 | myrow += nodeRow; 152 | mycol += nodeCol; 153 | } 154 | else if ( numasize == 9 ) 155 | { 156 | PrintMsg("\tNode Grid - 4x4"); 157 | idnuma = 0; 158 | nnuma = 1; 159 | int myNode = grank / 16; 160 | int myLocalID = grank % 16; 161 | int nodeRow = (myNode % (nrow / 4)) * 4; 162 | int nodeCol = (myNode / (nrow / 4)) * 4; 163 | switch(myLocalID) 164 | { 165 | case 0: 166 | myrow= 0; 167 | mycol= 0; 168 | break; 169 | case 1: 170 | myrow= 1; 171 | mycol= 1; 172 | break; 173 | case 2: 174 | myrow= 2; 175 | mycol= 2; 176 | break; 177 | case 3: 178 | myrow= 3; 179 | mycol= 3; 180 | break; 181 | case 4: 182 | myrow= 0; 183 | mycol= 1; 184 | break; 185 | case 5: 186 | myrow= 1; 187 | mycol= 2; 188 | break; 189 | case 6: 190 | myrow= 2; 191 | mycol= 3; 192 | break; 193 | case 7: 194 | myrow= 3; 195 | mycol= 0; 196 | break; 197 | case 8: 198 | myrow= 0; 199 | mycol= 2; 200 | break; 201 | case 9: 202 | myrow= 1; 203 | mycol= 3; 204 | break; 205 | case 10: 206 | myrow= 2; 207 | mycol= 0; 208 | break; 209 | case 11: 210 | myrow= 3; 211 | mycol= 1; 212 | break; 213 | case 12: 214 | myrow= 0; 215 | mycol= 3; 216 | break; 217 | case 13: 218 | myrow= 1; 219 | mycol= 0; 220 | break; 221 | case 14: 222 | myrow= 2; 223 | mycol= 1; 224 | break; 225 | case 15: 226 | myrow= 3; 227 | mycol= 2; 228 | break; 229 | } 230 | } 231 | else if ( numasize == 10 ) 232 | { 233 | PrintMsg("\tNode Grid - Reorder 2x4C"); 234 | idnuma = 0; 235 | nnuma = 1; 236 | int myNode = grank / 8; 237 | int myLocalID = grank % 8; 238 | int nodeRow = (myNode % (nrow / 2)) * 2; 239 | int nodeCol = (myNode / (nrow / 2)) * 4; 240 | int reorderID; 241 | for ( int ii = 0; ii < 8; ii++ ) 242 | { 243 | if ( myLocalID == reorder[ ii ] ) 244 | { 245 | reorderID = ii; 246 | break; 247 | } 248 | } 249 | myrow = reorderID % 2; 250 | mycol = reorderID / 2; 251 | myrow += nodeRow; 252 | mycol += nodeCol; 253 | } 254 | else { 255 | #if 0 //Future possible usage, dont delete 256 | assert(size % numasize == 0); 257 | idnuma = rank % numasize; 258 | nnuma = numasize; 259 | switch (map) { 260 | case NumaMap::ROWCONT: { 261 | assert(nrow % nnuma == 0); 262 | myrow = rank % nrow; 263 | mycol = rank / nrow; 264 | } break; 265 | 
266 | case NumaMap::COLCONT: {
267 | assert((size / nrow) % nnuma == 0);
268 | int t = rank / nnuma;
269 | myrow = t % nrow;
270 | mycol = (t / nrow) * nnuma + idnuma;
271 | } break;
272 |
273 | case NumaMap::ROWDIST: {
274 | assert(nrow % nnuma == 0);
275 | int rs = nrow / nnuma;
276 | int t = rank / nnuma;
277 | myrow = (t % rs) + idnuma * rs;
278 | mycol = rank / nrow;
279 | } break;
280 |
281 | case NumaMap::COLDIST: {
282 | assert((size / nrow) % nnuma == 0);
283 | int t = rank / nnuma + (size / nnuma) * idnuma;
284 | myrow = t % nrow;
285 | mycol = t / nrow;
286 | } break;
287 |
288 | case NumaMap::CONT2D: {
289 | assert(nnuma % 2 == 0); // others are not implemented yet
290 | assert(nrow % 2 == 0);
291 | assert((size / nrow) % (nnuma / 2) == 0);
292 | int t = rank / nnuma;
293 | int grow = t % (nrow / 2);
294 | int gcol = t / (nrow / 2);
295 | myrow = grow * 2 + idnuma % 2;
296 | mycol = gcol * (nnuma / 2) + idnuma / 2;
297 | } break;
298 | default:
299 | std::abort();
300 | }
301 | #endif
302 | }
303 |
304 | MPI_Comm_split(comm, mycol, myrow, &vcomm);
305 | MPI_Comm_split(comm, myrow, mycol, &hcomm);
306 | this->row = myrow;
307 | this->col = mycol;
308 | this->nrow = nrow;
309 | this->ncol = ncol;
310 | }
311 | ~Grid() {
312 | MPI_Comm_free(&vcomm);
313 | MPI_Comm_free(&hcomm);
314 | }
315 | };
316 |
317 | template <typename T> struct Mpi_type_wrappe {};
318 |
319 | template <> struct Mpi_type_wrappe<fp16> {
320 | operator MPI_Datatype() { return MPI_SHORT; }
321 | };
322 |
323 | template <> struct Mpi_type_wrappe<__half> {
324 | operator MPI_Datatype() { return MPI_SHORT; }
325 | };
326 |
327 | template <> struct Mpi_type_wrappe<float> {
328 | operator MPI_Datatype() { return MPI_FLOAT; }
329 | };
330 |
331 | template <> struct Mpi_type_wrappe<double> {
332 | operator MPI_Datatype() { return MPI_DOUBLE; }
333 | };
334 |
335 | template <typename T> struct T2MPI { static Mpi_type_wrappe<T> type; };
336 |
337 | template <typename T> Mpi_type_wrappe<T> T2MPI<T>::type;
338 |
339 | #endif
-------------------------------------------------------------------------------- /src/otf_gemv.cpp: --------------------------------------------------------------------------------
1 | #include "hpl_rand.hpp"
2 | #include "svesim.hpp"
3 | #include <cstdint>
4 |
5 | extern "C" void otf_gemv_kernel(int64_t n, int mb, int nb, double alpha,
6 | double const *__restrict__ x,
7 | double *__restrict__ y, uint64_t seed) {
8 | // on-the-fly GEMV computes y = y + alpha * A * x
9 | // see hpl_rand.hpp for details of the matrix generation; it is an LCG.
10 | // n is the dimension of the whole matrix.
11 | // mb \times nb is the dimension of the sub-matrix to compute.
12 | // Note that the sub-matrix cannot have diagonals. Use serial code for the
13 | // part that includes diagonals instead.
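    // Editor's scalar sketch of what the vectorized loop below computes
    // (illustration only, assuming the RandStat/RandCoeff API of
    // hpl_rand.hpp): entry A(i,j) is the LCG state reached after i + j*n
    // steps from the seed, mapped to roughly (-1,1) by the ~2^-64 scale
    // that is folded into alpha.
    //
    //   RandStat s0; s0.x = seed;                  // state of A(0,0) of this block
    //   RandCoeff c0 = RandCoeff::default_vals();
    //   RandCoeff cn = c0.pow(n);                  // advance by one column
    //   for (int64_t i = 0; i < mb; ++i) {
    //       RandStat s = c0.pow(i) * s0;           // state of A(i,0)
    //       double acc = 0;
    //       for (int64_t j = 0; j < nb; ++j) {
    //           acc += (double)(int64_t)s.x * x[j];
    //           s = cn * s;                        // state of A(i,j+1)
    //       }
    //       y[i] += alpha * acc;                   // alpha carries the 2^-64 scale
    //   }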
14 | const int vlen = svcntd(); 15 | const int nn = svcntd() * 4; 16 | 17 | RandCoeff c0 = RandCoeff::default_vals(); 18 | RandStat s0; 19 | s0.x = seed; 20 | int64_t rinit[vlen]; 21 | for (int i = 0; i < vlen; ++i) { 22 | rinit[i] = s0.x; 23 | s0 = c0 * s0; 24 | } 25 | RandCoeff c8 = c0.pow(vlen); 26 | RandCoeff c32 = c8.pow(4); 27 | RandCoeff cn = c0.pow(n); 28 | RandCoeff cn2 = cn.pow(2); 29 | 30 | auto t64 = svptrue_b64(); 31 | svint64_t stat00 = svld1_s64(t64, rinit); 32 | alpha *= 0x1.fffffffffffffP-65; 33 | // 24x24 34 | // alpha *= 0x1.fffffffffffffP-73; 35 | // 96x96 36 | // alpha *= 0x1.fffffffffffffP-75; 37 | svfloat64_t sv0 = svdup_f64(0.); 38 | 39 | int64_t jend = nb - 2; 40 | for (int64_t i = 0; i < mb; i += nn) { 41 | __builtin_prefetch(&y[i]); 42 | svfloat64_t y00 = sv0; 43 | svfloat64_t y10 = sv0; 44 | svfloat64_t y20 = sv0; 45 | svfloat64_t y30 = sv0; 46 | svfloat64_t y01 = sv0; 47 | svfloat64_t y11 = sv0; 48 | svfloat64_t y21 = sv0; 49 | svfloat64_t y31 = sv0; 50 | svint64_t sc00, sc10, sc20, sc30, sc01, sc11, sc21, sc31; 51 | { 52 | svint64_t a, c; 53 | a = svdup_s64(c8.a); 54 | c = svdup_s64(c8.c); 55 | sc00 = stat00; 56 | sc10 = svmad_s64_x(t64, sc00, a, c); 57 | sc20 = svmad_s64_x(t64, sc10, a, c); 58 | sc30 = svmad_s64_x(t64, sc20, a, c); 59 | a = svdup_s64(cn.a); 60 | c = svdup_s64(cn.c); 61 | sc01 = svmad_s64_x(t64, sc00, a, c); 62 | sc11 = svmad_s64_x(t64, sc10, a, c); 63 | sc21 = svmad_s64_x(t64, sc20, a, c); 64 | sc31 = svmad_s64_x(t64, sc30, a, c); 65 | } 66 | svint64_t sva, svc; 67 | sva = svdup_s64(cn2.a); 68 | svc = svdup_s64(cn2.c); 69 | for (int64_t j = 0; j <= jend; j += 2) { 70 | svfloat64_t svx1 = svdup_f64(x[j]); 71 | svfloat64_t r00 = svcvt_f64_s64_x(t64, sc00); 72 | svfloat64_t r10 = svcvt_f64_s64_x(t64, sc10); 73 | svfloat64_t r20 = svcvt_f64_s64_x(t64, sc20); 74 | svfloat64_t r30 = svcvt_f64_s64_x(t64, sc30); 75 | y00 = svmla_f64_x(t64, y00, r00, svx1); 76 | y10 = svmla_f64_x(t64, y10, r10, svx1); 77 | y20 = svmla_f64_x(t64, y20, r20, svx1); 78 | y30 = svmla_f64_x(t64, y30, r30, svx1); 79 | sc00 = svmad_s64_x(t64, sc00, sva, svc); 80 | sc10 = svmad_s64_x(t64, sc10, sva, svc); 81 | sc20 = svmad_s64_x(t64, sc20, sva, svc); 82 | sc30 = svmad_s64_x(t64, sc30, sva, svc); 83 | 84 | svfloat64_t svx2 = svdup_f64(x[j + 1]); 85 | svfloat64_t r01 = svcvt_f64_s64_x(t64, sc01); 86 | svfloat64_t r11 = svcvt_f64_s64_x(t64, sc11); 87 | svfloat64_t r21 = svcvt_f64_s64_x(t64, sc21); 88 | svfloat64_t r31 = svcvt_f64_s64_x(t64, sc31); 89 | y01 = svmla_f64_x(t64, y01, r01, svx2); 90 | y11 = svmla_f64_x(t64, y11, r11, svx2); 91 | y21 = svmla_f64_x(t64, y21, r21, svx2); 92 | y31 = svmla_f64_x(t64, y31, r31, svx2); 93 | sc01 = svmad_s64_x(t64, sc01, sva, svc); 94 | sc11 = svmad_s64_x(t64, sc11, sva, svc); 95 | sc21 = svmad_s64_x(t64, sc21, sva, svc); 96 | sc31 = svmad_s64_x(t64, sc31, sva, svc); 97 | } 98 | if (__builtin_expect(!!(nb & 0x1u), 0)) { 99 | svfloat64_t r00 = svcvt_f64_s64_x(t64, sc00); 100 | svfloat64_t r10 = svcvt_f64_s64_x(t64, sc10); 101 | svfloat64_t r20 = svcvt_f64_s64_x(t64, sc20); 102 | svfloat64_t r30 = svcvt_f64_s64_x(t64, sc30); 103 | 104 | svfloat64_t svx = svdup_f64(x[nb - 1]); 105 | y00 = svmla_f64_x(t64, y00, r00, svx); 106 | y10 = svmla_f64_x(t64, y10, r10, svx); 107 | y20 = svmla_f64_x(t64, y20, r20, svx); 108 | y30 = svmla_f64_x(t64, y30, r30, svx); 109 | } 110 | auto pg0 = svwhilelt_b64(i, (int64_t)(mb - 0 * vlen)); 111 | auto pg1 = svwhilelt_b64(i, (int64_t)(mb - 1 * vlen)); 112 | auto pg2 = svwhilelt_b64(i, (int64_t)(mb - 2 * vlen)); 113 
| auto pg3 = svwhilelt_b64(i, (int64_t)(mb - 3 * vlen));
114 | y00 = svadd_f64_x(t64, y00, y01);
115 | y10 = svadd_f64_x(t64, y10, y11);
116 | y20 = svadd_f64_x(t64, y20, y21);
117 | y30 = svadd_f64_x(t64, y30, y31);
118 | y00 = svmla_n_f64_x(pg0, svld1_vnum_f64(pg0, y + i, 0), y00, alpha);
119 | y10 = svmla_n_f64_x(pg1, svld1_vnum_f64(pg1, y + i, 1), y10, alpha);
120 | y20 = svmla_n_f64_x(pg2, svld1_vnum_f64(pg2, y + i, 2), y20, alpha);
121 | y30 = svmla_n_f64_x(pg3, svld1_vnum_f64(pg3, y + i, 3), y30, alpha);
122 |
123 | svst1_vnum_f64(pg0, y + i, 0, y00);
124 | svst1_vnum_f64(pg1, y + i, 1, y10);
125 | svst1_vnum_f64(pg2, y + i, 2, y20);
126 | svst1_vnum_f64(pg3, y + i, 3, y30);
127 |
128 | svint64_t sva32, svc32;
129 | sva32 = svdup_s64(c32.a);
130 | svc32 = svdup_s64(c32.c);
131 | stat00 = svmad_s64_x(pg0, stat00, sva32, svc32);
132 | }
133 | }
134 |
135 | extern "C" void hmg_gemv_up(int istart, double a, double b, int mb, int nb,
136 | double alpha, double const *__restrict__ x,
137 | double *__restrict__ y) {
138 | // same as above, but for the upper-triangular part of the Higham's hpl-ai
139 | // matrix. istart is the row position of the sub-matrix. jstart is not
140 | // needed because of the structure of the matrix. a and b are the parameters
141 | // of the Higham matrix.
142 | const int vlen = svcntd();
143 | const int nn = svcntd() * 4;
144 | auto t64 = svptrue_b64();
145 | double ab = a * b;
146 |
147 | svint64_t iindex0, iindex1, iindex2, iindex3;
148 | iindex0 = svindex_s64(istart, 1);
149 | iindex1 = svindex_s64(istart + vlen, 1);
150 | iindex2 = svindex_s64(istart + 2 * vlen, 1);
151 | iindex3 = svindex_s64(istart + 3 * vlen, 1);
152 | svfloat64_t findex0 = svcvt_f64_s64_x(t64, iindex0);
153 | svfloat64_t findex1 = svcvt_f64_s64_x(t64, iindex1);
154 | svfloat64_t findex2 = svcvt_f64_s64_x(t64, iindex2);
155 | svfloat64_t findex3 = svcvt_f64_s64_x(t64, iindex3);
156 | svfloat64_t svincr = svdup_f64((double)nn);
157 | svfloat64_t sva = svdup_f64(b);
158 | svfloat64_t svab = svdup_f64(ab);
159 | svfloat64_t sv0 = svdup_f64(0.);
160 |
161 | int64_t jend = nb - 2;
162 | for (int64_t i = 0; i < mb; i += nn) {
163 | __builtin_prefetch(&y[i]);
164 | svfloat64_t y00 = sv0;
165 | svfloat64_t y10 = sv0;
166 | svfloat64_t y20 = sv0;
167 | svfloat64_t y30 = sv0;
168 | svfloat64_t y01 = sv0;
169 | svfloat64_t y11 = sv0;
170 | svfloat64_t y21 = sv0;
171 | svfloat64_t y31 = sv0;
172 | // a+ab*(j+nn) = (a+ab*j) + ab*nn is better for performance, but we
173 | // recompute them for accuracy
174 | svfloat64_t ai0 = svmla_f64_x(t64, sva, svab, findex0);
175 | svfloat64_t ai1 = svmla_f64_x(t64, sva, svab, findex1);
176 | svfloat64_t ai2 = svmla_f64_x(t64, sva, svab, findex2);
177 | svfloat64_t ai3 = svmla_f64_x(t64, sva, svab, findex3);
178 | for (int64_t j = 0; j <= jend; j += 2) {
179 | svfloat64_t svx1 = svdup_f64(x[j]);
180 | y00 = svmla_f64_x(t64, y00, ai0, svx1);
181 | y10 = svmla_f64_x(t64, y10, ai1, svx1);
182 | y20 = svmla_f64_x(t64, y20, ai2, svx1);
183 | y30 = svmla_f64_x(t64, y30, ai3, svx1);
184 |
185 | svfloat64_t svx2 = svdup_f64(x[j + 1]);
186 | y01 = svmla_f64_x(t64, y01, ai0, svx2);
187 | y11 = svmla_f64_x(t64, y11, ai1, svx2);
188 | y21 = svmla_f64_x(t64, y21, ai2, svx2);
189 | y31 = svmla_f64_x(t64, y31, ai3, svx2);
190 | }
191 | if (__builtin_expect(!!(nb & 0x1u), 0)) {
192 | svfloat64_t svx = svdup_f64(x[nb - 1]);
193 | y00 = svmla_f64_x(t64, y00, ai0, svx);
194 | y10 = svmla_f64_x(t64, y10, ai1, svx);
195 | y20 = svmla_f64_x(t64, y20, ai2, svx);
196 | y30 = 
svmla_f64_x(t64, y30, ai3, svx); 197 | } 198 | auto pg0 = svwhilelt_b64(i, (int64_t)(mb - 0 * vlen)); 199 | auto pg1 = svwhilelt_b64(i, (int64_t)(mb - 1 * vlen)); 200 | auto pg2 = svwhilelt_b64(i, (int64_t)(mb - 2 * vlen)); 201 | auto pg3 = svwhilelt_b64(i, (int64_t)(mb - 3 * vlen)); 202 | y00 = svadd_f64_x(t64, y00, y01); 203 | y10 = svadd_f64_x(t64, y10, y11); 204 | y20 = svadd_f64_x(t64, y20, y21); 205 | y30 = svadd_f64_x(t64, y30, y31); 206 | y00 = svmla_n_f64_x(pg0, svld1_vnum_f64(pg0, y + i, 0), y00, alpha); 207 | y10 = svmla_n_f64_x(pg1, svld1_vnum_f64(pg1, y + i, 1), y10, alpha); 208 | y20 = svmla_n_f64_x(pg2, svld1_vnum_f64(pg2, y + i, 2), y20, alpha); 209 | y30 = svmla_n_f64_x(pg3, svld1_vnum_f64(pg3, y + i, 3), y30, alpha); 210 | 211 | svst1_vnum_f64(pg0, y + i, 0, y00); 212 | svst1_vnum_f64(pg1, y + i, 1, y10); 213 | svst1_vnum_f64(pg2, y + i, 2, y20); 214 | svst1_vnum_f64(pg3, y + i, 3, y30); 215 | 216 | findex0 = svadd_f64_x(t64, findex0, svincr); 217 | findex1 = svadd_f64_x(t64, findex1, svincr); 218 | findex2 = svadd_f64_x(t64, findex2, svincr); 219 | findex3 = svadd_f64_x(t64, findex3, svincr); 220 | } 221 | } 222 | 223 | extern "C" void hmg_gemv_low(int istart, double a, double b, int mb, int nb, 224 | double alpha, double const *__restrict__ x, 225 | double *__restrict__ y) { 226 | // same as above, but for lower-triangular part. 227 | const int vlen = svcntd(); 228 | const int nn = svcntd() * 4; 229 | auto t64 = svptrue_b64(); 230 | double ab = a * b; 231 | 232 | svfloat64_t svincr = svdup_f64((double)2); 233 | svfloat64_t sva = svdup_f64(a); 234 | svfloat64_t svab = svdup_f64(ab); 235 | svfloat64_t sv0 = svdup_f64(0.); 236 | 237 | int64_t jend = nb - 2; 238 | for (int64_t i = 0; i < mb; i += nn) { 239 | svfloat64_t findex0 = svdup_f64((double)istart); 240 | svfloat64_t findex1 = svdup_f64((double)(istart + 1)); 241 | __builtin_prefetch(&y[i]); 242 | svfloat64_t y00 = sv0; 243 | svfloat64_t y10 = sv0; 244 | svfloat64_t y20 = sv0; 245 | svfloat64_t y30 = sv0; 246 | svfloat64_t y01 = sv0; 247 | svfloat64_t y11 = sv0; 248 | svfloat64_t y21 = sv0; 249 | svfloat64_t y31 = sv0; 250 | for (int64_t j = 0; j <= jend; j += 2) { 251 | // a+ab*(j+nn) = (a+ab*j) + ab*nn is better for performance, but we 252 | // recompute them for accuracy 253 | svfloat64_t svx1 = svdup_f64(x[j]); 254 | svfloat64_t aj0 = svmla_f64_x(t64, sva, svab, findex0); 255 | svfloat64_t aj1 = svmla_f64_x(t64, sva, svab, findex1); 256 | y00 = svmla_f64_x(t64, y00, aj0, svx1); 257 | y10 = svmla_f64_x(t64, y10, aj0, svx1); 258 | y20 = svmla_f64_x(t64, y20, aj0, svx1); 259 | y30 = svmla_f64_x(t64, y30, aj0, svx1); 260 | 261 | svfloat64_t svx2 = svdup_f64(x[j + 1]); 262 | y01 = svmla_f64_x(t64, y01, aj1, svx2); 263 | y11 = svmla_f64_x(t64, y11, aj1, svx2); 264 | y21 = svmla_f64_x(t64, y21, aj1, svx2); 265 | y31 = svmla_f64_x(t64, y31, aj1, svx2); 266 | findex0 = svadd_f64_x(t64, findex0, svincr); 267 | findex1 = svadd_f64_x(t64, findex1, svincr); 268 | } 269 | if (__builtin_expect(!!(nb & 0x1u), 0)) { 270 | svfloat64_t svx = svdup_f64(x[nb - 1]); 271 | svfloat64_t aj0 = svmla_f64_x(t64, sva, svab, findex0); 272 | y00 = svmla_f64_x(t64, y00, aj0, svx); 273 | y10 = svmla_f64_x(t64, y10, aj0, svx); 274 | y20 = svmla_f64_x(t64, y20, aj0, svx); 275 | y30 = svmla_f64_x(t64, y30, aj0, svx); 276 | } 277 | auto pg0 = svwhilelt_b64(i, (int64_t)(mb - 0 * vlen)); 278 | auto pg1 = svwhilelt_b64(i, (int64_t)(mb - 1 * vlen)); 279 | auto pg2 = svwhilelt_b64(i, (int64_t)(mb - 2 * vlen)); 280 | auto pg3 = svwhilelt_b64(i, 
(int64_t)(mb - 3 * vlen));
281 | y00 = svadd_f64_x(t64, y00, y01);
282 | y10 = svadd_f64_x(t64, y10, y11);
283 | y20 = svadd_f64_x(t64, y20, y21);
284 | y30 = svadd_f64_x(t64, y30, y31);
285 | y00 = svmla_n_f64_x(pg0, svld1_vnum_f64(pg0, y + i, 0), y00, alpha);
286 | y10 = svmla_n_f64_x(pg1, svld1_vnum_f64(pg1, y + i, 1), y10, alpha);
287 | y20 = svmla_n_f64_x(pg2, svld1_vnum_f64(pg2, y + i, 2), y20, alpha);
288 | y30 = svmla_n_f64_x(pg3, svld1_vnum_f64(pg3, y + i, 3), y30, alpha);
289 |
290 | svst1_vnum_f64(pg0, y + i, 0, y00);
291 | svst1_vnum_f64(pg1, y + i, 1, y10);
292 | svst1_vnum_f64(pg2, y + i, 2, y20);
293 | svst1_vnum_f64(pg3, y + i, 3, y30);
294 | }
295 | }
296 |
297 | extern "C" void hmg_gemv_diag(int istart, int jstart, double a, double b,
298 | int mb, int nb, double alpha,
299 | double const *__restrict__ x,
300 | double *__restrict__ y) {
301 | // same as above, but for the sub-matrix which includes diagonals.
302 | double ab = a * b;
303 | for (int i = 0; i < mb; ++i) {
304 | double d = 0.;
305 | for (int j = 0; j < i; ++j) {
306 | double aj = b + ab * (jstart + j);
307 | d += aj * x[j];
308 | }
309 | d += (1. + ab * (istart + i)) * x[i];
310 | double ai = a + ab * (istart + i);
311 | for (int j = i + 1; j < nb; ++j) {
312 | d += ai * x[j];
313 | }
314 | y[i] += alpha * d;
315 | }
316 | }
317 |
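To make the split between the three hmg_gemv_* kernels concrete, the dense entry they generate on the fly can be written in scalar form (editor's sketch, not part of the source; it follows the (alpha, beta) convention of hmg_gemv_up/hmg_gemv_low and of pmatgen in matgen.hpp, and uses 0-based global indices; hmg_gemv_diag receives its a/b in whatever order the caller passes):

// Editor's sketch: entry A(i,j) of the Higham HPL-AI matrix A = LU.
static inline double hmg_entry(long i, long j, double alpha, double beta) {
    const double ab = alpha * beta;
    if (i < j) return beta + ab * i;  // strictly upper: depends on the row
    if (i > j) return alpha + ab * j; // strictly lower: depends on the column
    return 1.0 + ab * i;              // diagonal
}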
-------------------------------------------------------------------------------- /src/timer.hpp: --------------------------------------------------------------------------------
1 | #pragma once
2 | #include <cstdio>
3 | #include <cstdint>
4 |
5 | // Default: take statistics
6 | // #define TIMER_VERBOSE // save all the timings for visualization
7 | // #define TIMER_SILENT // disable all timers
8 | //
9 |
10 | #ifdef TIMER_VERBOSE
11 | #include <vector>
12 | #define TIMER_NUM 2
13 | #elif defined(TIMER_SILENT)
14 | #define TIMER_NUM 0
15 | #else
16 | #define TIMER_NUM 1
17 | #endif
18 |
19 | extern "C" int MPI_Get_processor_name(char *, int *);
20 |
21 | #ifdef __aarch64__
22 | static int64_t get_utime() {
23 | uint64_t tsc;
24 | asm volatile("mrs %0, cntvct_el0" : "=r"(tsc));
25 | return tsc;
26 | }
27 | static double tick2second(uint64_t tick) {
28 | auto frequency = [] {
29 | uint64_t frq;
30 | asm volatile("mrs %0, cntfrq_el0" : "=r"(frq));
31 | return frq;
32 | };
33 | static double invfreq = 1.0 / frequency();
34 | return invfreq * (double)tick;
35 | }
36 | #else
37 | #ifdef __APPLE__
38 | #include <sys/time.h>
39 | static int64_t get_utime() {
40 | timeval tv;
41 | gettimeofday(&tv, NULL);
42 |
43 | return tv.tv_usec * 1000ll + tv.tv_sec * 1000000000ll;
44 | }
45 | #elif defined __linux__
46 | #include <time.h>
47 | static int64_t get_utime() {
48 | timespec ts;
49 | clock_gettime(CLOCK_REALTIME, &ts);
50 |
51 | return ts.tv_nsec + ts.tv_sec * 1000000000ll;
52 | }
53 | #endif
54 | static double tick2second(uint64_t tick) { return 1.e-9 * (double)tick; }
55 | #endif
56 |
57 | struct Timer_template_base {
58 | enum Items {
59 | DIAG_BCAST = 0,
60 | LCOL_BCAST,
61 | RROW_BCAST,
62 | TEST,
63 | WAIT,
64 | DIAG_LU,
65 | TRSM_L,
66 | TRSM_R,
67 | CONV_L,
68 | CONV_R,
69 | GEMM_UPDATE,
70 | GEMM_PROGRESS,
71 | LAZY_INIT,
72 | WRITE_BACK,
73 | IR_GEMV,
74 | IR_GEMV_COMM,
75 | IR_TRSV,
76 | IR_TRSV_MV,
77 | IR_TRSV_COMM,
78 | // ORNL modification
79 | INIT,
80 | MEMCPY,
81 | // don't touch below
82 | TOTAL,
83 | MISC,
84 | NUM_ITEMS,
85 | };
86 | };
87 | template <int N> struct Timer_template : Timer_template_base {
88 | // called from initialize()
89 | static void flush() {}
90 | // initialize timer with 0
91 | static void initialize() {}
92 | // start a region
93 | // elem: which item to collect
94 | // reuse: reuse the last timestamp to reduce timer overhead.
95 | static void beg(const Items /*elem*/, bool const /*reuse*/ = false) {}
96 | // stop a region
97 | // elem: which item to collect
98 | // reuse: reuse the last timestamp to reduce timer overhead.
99 | // acc: number of operations in this region, e.g. flops, mips, or bytes
100 | static void end(const Items /*elem*/, bool const /*reuse*/ = false,
101 | int64_t /*acc*/ = 0ll) {}
102 | // save a timing
103 | static double put(const Items /*elem*/, bool const /*reuse*/ = false) {
104 | return 0.0;
105 | }
106 | // dump out all the data
107 | static void show(FILE * /*fp*/ = stderr, char const * /*fmt*/ = "") {}
108 | // open file and call show()
109 | // size: number of procs
110 | // rank: process id
111 | // row : row id
112 | // col : col id
113 | // filename : filename
114 | static void dump_mp(const int /*size*/, const int /*rank*/,
115 | const int /*row*/, const int /*col*/,
116 | const char * /*filename*/ = "") {} // do nothing
117 | };
118 |
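// Editor's usage sketch of the interface documented above (not part of the
// source). Timer is the alias defined at the bottom of this header, and
// GEMM_UPDATE is one of the Items inherited from Timer_template_base:
//
//   Timer::initialize();
//   Timer::beg(Timer::GEMM_UPDATE);
//   /* ... perform the trailing-matrix update ... */
//   Timer::end(Timer::GEMM_UPDATE, false, 2ll * m * n * k); // accumulate flops
//   Timer::show(stderr); // per-item time, share of TOTAL, and Gop/s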
119 | template <> struct Timer_template<1> : Timer_template_base {
120 | static char const *name(int const i) {
121 | static const char *strs[NUM_ITEMS] = {
122 | "DIAG_BCAST",
123 | "LCOL_BCAST",
124 | "RROW_BCAST",
125 | "TEST",
126 | "WAIT",
127 | "DIAG_LU",
128 | "TRSM_L",
129 | "TRSM_R",
130 | "CONV_L",
131 | "CONV_R",
132 | "GEMM_UPDATE",
133 | "GEMM_PROGRESS",
134 | "LAZY_INIT",
135 | "WRITE_BACK",
136 | "IR_GEMV",
137 | "IR_GEMV_COMM",
138 | "IR_TRSV",
139 | "IR_TRSV_MV",
140 | "IR_TRSV_COMM",
141 | // ORNL modification
142 | "INIT",
143 | "MEMCPY",
144 | "TOTAL",
145 | "MISC",
146 | };
147 | return strs[i];
148 | }
149 |
150 | static void flush() {
151 | for (int i = 0; i < NUM_ITEMS; i++) {
152 | time(i) = 0ll;
153 | accum(i) = 0ll;
154 | }
155 | }
156 |
157 | static void initialize() {
158 | flush();
159 | tprev(1) = get_utime();
160 | }
161 |
162 | static void beg(const Items elem, bool const reuse = false) {
163 | if (reuse)
164 | time(elem) -= tprev();
165 | else
166 | time(elem) -= (tprev() = get_utime());
167 | }
168 |
169 | static void end(const Items elem, bool const reuse = false) {
170 | if (reuse)
171 | time(elem) += tprev();
172 | else
173 | time(elem) += (tprev() = get_utime());
174 | }
175 | static void end(const Items elem, bool const reuse, int64_t acc) {
176 | end(elem, reuse);
177 | accum(elem) += acc;
178 | }
179 |
180 | static double put(const Items /*elem*/, bool const reuse = false) {
181 | uint64_t tt = reuse ? get_utime() : tprev();
182 | return tick2second(tt - tprev(1));
183 | }
184 |
185 | static void
186 | show(FILE *fp = stderr,
187 | const char *fmt = " %-12s : %e sec : %6.2f %% : %20lld : %e Gop/s\n")
188 |
189 | {
190 | fflush(fp);
191 |
192 | time(MISC) = time(TOTAL);
193 |
194 | for (int i = 0; i < NUM_ITEMS - 2; i++) {
195 | time(MISC) -= time(i);
196 | }
197 |
198 | for (int i = 0; i < NUM_ITEMS; i++) {
199 | fprintf(fp, fmt, name(i), rtime(i), 100.0 * time(i) / time(TOTAL),
200 | accum(i), 1e-9 * accum(i) / rtime(i));
201 | }
202 | const double flops =
203 | (double)(accum(GEMM_UPDATE) + accum(GEMM_PROGRESS)) /
204 | (rtime(GEMM_UPDATE) + rtime(GEMM_PROGRESS));
205 | fprintf(fp, "GEMM_TOTAL: %f Tflops\n", 1.e-12 * flops);
206 |
207 | fflush(fp);
208 | }
209 |
210 | static void dump_mp(const int size, const int rank, const int row,
211 | const int col,
212 | const char *fmt = "Timerdump.%04d.%04d") {
213 | static char filename[1024];
214 | sprintf(filename, fmt, size, rank);
215 | FILE *fp = fopen(filename, "w");
216 | if (fp) {
217 | int len;
218 | MPI_Get_processor_name(filename, &len);
219 | fprintf(fp, "# row=%d, col=%d, host=%s\n", row, col, filename);
220 | show(fp);
221 | fclose(fp);
222 | }
223 | }
224 |
225 | static double rtime(int const i) { return tick2second(time(i)); }
226 |
227 | private:
228 | static int64_t &time(int const i) {
229 | static int64_t buf[NUM_ITEMS];
230 | return buf[i];
231 | }
232 | static int64_t &accum(int const i) {
233 | static int64_t buf[NUM_ITEMS];
234 | return buf[i];
235 | }
236 | static int64_t &tprev(int const ch = 0) {
237 | static int64_t t[2]; /* 0 : previous time */
238 | /* 1 : initial time */
239 | return t[ch];
240 | }
241 | //static double rtime(int const i) { return tick2second(time(i)); }
242 | };
243 |
244 | #ifdef TIMER_VERBOSE
245 | // print out all the timings in binary format
246 | template <> struct Timer_template<2> : Timer_template_base {
247 | static char const *name(int const i) {
248 | static const char *strs[NUM_ITEMS] = {
249 | "DIAG_BCAST",
250 | "LCOL_BCAST",
251 | "RROW_BCAST",
252 | "TEST",
253 | "WAIT",
254 | "DIAG_LU",
255 | "TRSM_L",
256 | "TRSM_R",
257 | "CONV_L",
258 | "CONV_R",
259 | "GEMM_UPDATE",
260 | "GEMM_PROGRESS",
261 | "LAZY_INIT",
262 | "WRITE_BACK",
263 | "IR_GEMV",
264 | "IR_GEMV_COMM",
265 | "IR_TRSV",
266 | "IR_TRSV_MV",
267 | "IR_TRSV_COMM",
268 | // ORNL modification
269 | "INIT",
270 | "MEMCPY",
271 | "TOTAL",
272 | "MISC",
273 | };
274 | return strs[i];
275 | }
276 |
277 | static void flush() {
278 | for (int i = 0; i < NUM_ITEMS; i++) {
279 | time(i) = 0ll;
280 | accum(i) = 0ll;
281 | tvec_beg(i).clear();
282 | tvec_end(i).clear();
283 | tvec_put(i).clear();
284 | avec(i).clear();
285 | }
286 | }
287 |
288 | static void initialize() {
289 | flush();
290 | for (int i = 0; i < NUM_ITEMS; i++) {
291 | tvec_beg(i).reserve(10000);
292 | tvec_end(i).reserve(10000);
293 | tvec_put(i).reserve(10000);
294 | avec(i).reserve(10000);
295 | }
296 | tprev(1) = get_utime();
297 | }
298 |
299 | static void beg(const Items elem, bool const reuse = false) {
300 | // fprintf(stderr, "%s: DEBUG BEG %s\n", hostname(), name(elem));
301 | // fflush(stderr);
302 | if (reuse)
303 | time(elem) -= tprev();
304 | else
305 | time(elem) -= (tprev() = get_utime());
306 |
307 | tvec_beg(elem).push_back(tprev());
308 | }
309 |
310 | static void end(const Items elem, bool const reuse = false,
311 | int64_t acc = 0ll) {
312 | // fprintf(stderr, "%s: DEBUG END %s\n", hostname(), name(elem));
313 | // fflush(stderr);
314 | if (reuse)
315 | time(elem) += tprev();
316 | else
317 | time(elem) += (tprev() = get_utime());
318 | accum(elem) += acc;
319 |
320 | tvec_end(elem).push_back(tprev());
321 | avec(elem).push_back(acc);
322 | }
323 |
324 | static double put(const Items elem, bool const reuse = false) {
325 | uint64_t tt = reuse ? get_utime() : tprev();
326 | tvec_put(elem).push_back(tt);
327 | return tick2second(tt - tprev(1));
328 | }
329 |
330 | static void
331 | show(FILE *fp = stderr,
332 | const char *fmt = " %-12s : %e sec : %6.2f %% : %20ld : %e Gop/s\n",
333 | FILE *fp2 = nullptr) {
334 | fflush(fp);
335 |
336 | time(MISC) = time(TOTAL);
337 |
338 | for (int i = 0; i < NUM_ITEMS - 2; i++) {
339 | time(MISC) -= time(i);
340 | }
341 |
342 | for (int i = 0; i < NUM_ITEMS; i++) {
343 | fprintf(fp, fmt, name(i), rtime(i), 100.0 * time(i) / time(TOTAL),
344 | accum(i), 1e-9 * accum(i) / rtime(i));
345 | }
346 | const double flops =
347 | (double)(accum(GEMM_UPDATE) + accum(GEMM_PROGRESS)) /
348 | (rtime(GEMM_UPDATE) + rtime(GEMM_PROGRESS));
349 | fprintf(fp, "GEMM_TOTAL: %f Tflops\n", 1.e-12 * flops);
350 |
351 | fflush(fp);
352 |
353 | // dump event vectors
354 | if (!fp2)
355 | fp2 = fp;
356 | for (int i = 0; i < NUM_ITEMS; i++) {
357 | dump_vector(fp2, tvec_beg(i), "BEG_", name(i));
358 | dump_vector(fp2, tvec_end(i), "END_", name(i));
359 | dump_vector(fp2, tvec_put(i), "PUT_", name(i));
360 | dump_accum(fp2, avec(i), "ACC_", name(i));
361 | }
362 | fflush(fp);
363 | }
364 |
365 | static void dump_mp(const int /*size*/, const int rank, const int row,
366 | const int col, const char *filename) {
367 | fprintf(stderr, "%d: (%d, %d)\n", rank, row, col);
368 | FILE *fp = fopen(filename, "w");
369 | if (fp) {
370 | int len;
371 | char hostname[1024];
372 | MPI_Get_processor_name(hostname, &len);
373 | fprintf(fp, "# row=%d, col=%d, host=%s\n", row, col, hostname);
374 | show(fp);
375 | fclose(fp);
376 | }
377 | }
378 |
379 | static const char *hostname() {
380 | static char name[1024] = {
381 | 0,
382 | };
383 | if (!name[0]) {
384 | int len;
385 | MPI_Get_processor_name(name, &len);
386 | }
387 | return name;
388 | }
389 |
390 | private:
391 | static int64_t &time(int const i) {
392 | static int64_t buf[NUM_ITEMS];
393 | return buf[i];
394 | }
395 | static int64_t &accum(int const i) {
396 | static int64_t buf[NUM_ITEMS];
397 | return buf[i];
398 | }
399 |
400 | using tvec = std::vector<int64_t>;
401 | static tvec &tvec_beg(int const i) {
402 | static tvec buf[NUM_ITEMS];
403 | return buf[i];
404 | }
405 | static tvec &tvec_end(int const i) {
406 | static tvec buf[NUM_ITEMS];
407 | return buf[i];
408 | }
409 | static tvec &tvec_put(int const i) {
410 | static tvec buf[NUM_ITEMS];
411 | return buf[i];
412 | }
413 | static tvec &avec(int const i) {
414 | static tvec buf[NUM_ITEMS];
415 | return buf[i];
416 | }
417 |
418 | static void dump_vector(FILE *fp, const tvec &v, const char *s0,
419 | const char *s1) {
420 | const int n = v.size();
421 | if (fp != stderr) {
422 | fprintf(fp, "bio, %d, %s%s\n", n, s0, s1);
423 | for (int i = 0; i < n; i++) {
424 | unsigned long utime = v[i];
425 | double dtime = tick2second(utime - tprev(1));
426 | fwrite(&dtime, sizeof(double), 1, fp);
427 | }
428 | } else {
429 | for (int i = 0; i < n; i++) {
430 | unsigned long utime = v[i];
431 | double dtime = tick2second(utime - tprev(1));
432 |
433 | fprintf(fp, "%ld, %16.12f, %s%s, %d\n", utime, dtime, s0, s1,
434 | i);
435 | }
436 | }
437 | }
438 |
439 | static void dump_accum(FILE *fp, const tvec &v, const char *s0,
440 | const char *s1) {
441 | const int n = v.size();
442 | if (fp != stderr) {
443 | fprintf(fp, "bio, %d, %s%s\n", n, s0, s1);
444 | fwrite(v.data(), sizeof(int64_t), n, fp);
445 | } else {
446 | for (int i = 0; i < n; i++) {
447 | fprintf(fp, "%ld, 0.0, %s%s, %d\n", v[i], s0, s1, i);
448 | }
449 | }
450 | }
451 |
452 | static int64_t &tprev(int const ch = 0) {
453 | static int64_t t[2];
454 | return t[ch];
455 | }
456 | static double rtime(int const i) { return tick2second(time(i)); }
457 | };
458 | #endif
459 |
460 | using Timer = Timer_template<TIMER_NUM>;
461 |
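When TIMER_VERBOSE is set, show() appends per-event records to the dump file: a text header "bio, <count>, <TAG>_<ITEM>" followed by <count> raw binary values, where BEG_/END_/PUT_ payloads are doubles (seconds since tprev(1)) and ACC_ payloads are int64_t counters. A reader sketch (editor's example; read_bio_record is a hypothetical name, not part of the source):

#include <cstdio>
#include <cstdint>
#include <vector>

// Editor's sketch: read one "bio" record written by Timer_template<2>::show().
static bool read_bio_record(FILE *fp, char tag[64], std::vector<double> &vals) {
    int n = 0;
    if (std::fscanf(fp, "bio, %d, %63s", &n, tag) != 2)
        return false;
    std::fgetc(fp); // consume the newline before the binary payload
    vals.resize(n);
    // Assumes a BEG_/END_/PUT_ record; ACC_ records carry int64_t instead.
    return std::fread(vals.data(), sizeof(double), n, fp) == (size_t)n;
}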
-------------------------------------------------------------------------------- /src/fp16_gpu_kernels.cpp: --------------------------------------------------------------------------------
1 | #include "fp16_gpu_kernels.h"
2 |
3 | using namespace std;
4 |
5 | #define TILE_DIM 64
6 | #define TILE_DIM_TRANS 64
7 | #define TILE_DIM_ID 64
8 | #define BLOCK_ROWS 4
9 |
10 | // LEFT CONVERT KERNELS
11 | __global__ void copyCoalesced(__half *odata, const float *idata, int olda,
12 | int ilda) {
13 | /*__shared__ float tile[TILE_DIM][TILE_DIM];
14 |
15 | int x = GPU_BLOCKIDX_X * TILE_DIM + GPU_THREADIDX_X;
16 | int y = GPU_BLOCKIDX_Y * TILE_DIM + GPU_THREADIDX_Y;
17 | // int width = gridDim.x * TILE_DIM;
18 |
19 | for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS)
20 | tile[(GPU_THREADIDX_Y + j)][GPU_THREADIDX_X] =
21 | idata[(long(y) +long( j)) * long(ilda) +long( x)];
22 |
23 | __syncthreads();
24 |
25 | for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS)
26 | odata[(long(y) +long( j)) *long( olda) +long( x)] =
27 | __float2half(tile[(GPU_THREADIDX_Y + j)][GPU_THREADIDX_X]);*/
28 |
29 | int x = GPU_BLOCKIDX_X * TILE_DIM + GPU_THREADIDX_X;
30 | int y = GPU_BLOCKIDX_Y * TILE_DIM + GPU_THREADIDX_Y;
31 | // int width = gridDim.x * TILE_DIM;
32 |
33 | for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS)
34 | odata[(long(y) +long( j)) *long( olda) +long( x)] =
35 | __float2half(idata[(long(y) +long( j)) * long(ilda) +long( x)]);
36 | }
37 |
38 | /// HUGE ASSUMPTION OF ilda == olda
39 | __global__ void copyCoalesced_lower(__half *odata, const float *idata, int olda,
40 | int ilda) {
41 | if(GPU_BLOCKIDX_Y > GPU_BLOCKIDX_X){ return; } // tile strictly above the diagonal: nothing to do
42 | if(GPU_BLOCKIDX_X == GPU_BLOCKIDX_Y)
43 | {
44 | int x = GPU_BLOCKIDX_X * TILE_DIM + GPU_THREADIDX_X;
45 | int y = GPU_BLOCKIDX_Y * TILE_DIM + GPU_THREADIDX_Y;
46 | // int width = gridDim.x * TILE_DIM;
47 |
48 | for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS)
49 | {
50 | if(x > y+j) // diagonal tile: convert only strictly below the diagonal
51 | odata[(y+j) * olda + x] =
52 | __float2half(idata[(y+j) * olda + x]);
53 | }
54 | return;
55 | }
56 |
57 | if(GPU_BLOCKIDX_X > GPU_BLOCKIDX_Y)
58 | {
59 | int x = GPU_BLOCKIDX_X * TILE_DIM + GPU_THREADIDX_X;
60 | int y = GPU_BLOCKIDX_Y * TILE_DIM + GPU_THREADIDX_Y;
61 | for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS)
62 | odata[(y+j) * olda + x] =
63 | __float2half(idata[(y+j) * olda + x]);
64 | return;
65 | }
66 | }
67 |
68 | /// HUGE ASSUMPTION OF ilda == olda
69 | __global__ void copyCoalesced_upper(__half *odata, const float *idata, int olda,
70 | int ilda) {
71 | if(GPU_BLOCKIDX_Y < GPU_BLOCKIDX_X){ return; } // tile strictly below the diagonal: nothing to do
72 | if(GPU_BLOCKIDX_X == GPU_BLOCKIDX_Y)
73 | {
74 | int x = GPU_BLOCKIDX_X * TILE_DIM + GPU_THREADIDX_X;
75 | int y = GPU_BLOCKIDX_Y * TILE_DIM + GPU_THREADIDX_Y;
76 | for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS)
77 | {
78 | if(x < y+j+1) // diagonal tile: convert on and above the diagonal
79 | odata[(y+j) * olda + x] =
80 | __float2half(idata[(y+j) * olda + x]);
81 | }
82 | return;
83 | }
84 |
85 | if(GPU_BLOCKIDX_X < GPU_BLOCKIDX_Y)
86 | {
87 | int x = GPU_BLOCKIDX_X * TILE_DIM + GPU_THREADIDX_X;
88 | int y = GPU_BLOCKIDX_Y * TILE_DIM + GPU_THREADIDX_Y;
89 | for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS)
90 | odata[(y+j) * olda + x] =
91 | __float2half(idata[(y+j) * olda + x]);
92 | return;
93 | }
94 | }
95 |
96 | /////// UPCAST FOR DEBUG
97 | /// HUGE ASSUMPTION OF ilda == olda
98 | __global__ void copyCoalesced_lower(float *odata, const __half *idata, int olda,
99 | int ilda) {
100 | if(GPU_BLOCKIDX_Y > GPU_BLOCKIDX_X){ return; }
101 | if(GPU_BLOCKIDX_X == GPU_BLOCKIDX_Y)
102 | {
103 | int x = GPU_BLOCKIDX_X * TILE_DIM + GPU_THREADIDX_X;
104 | int y = GPU_BLOCKIDX_Y * TILE_DIM + GPU_THREADIDX_Y;
105 | // int width = gridDim.x * TILE_DIM;
106 |
107 | for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS)
108 | {
109 | if(x > y+j)
110 | odata[(y+j) * olda + x] =
111 | __half2float(idata[(y+j) * olda + x]);
112 | }
113 | return;
114 | }
115 |
116 | if(GPU_BLOCKIDX_X > GPU_BLOCKIDX_Y)
117 | {
118 | int x = GPU_BLOCKIDX_X * TILE_DIM + GPU_THREADIDX_X;
119 | int y = GPU_BLOCKIDX_Y * TILE_DIM + GPU_THREADIDX_Y;
120 | for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS)
121 | odata[(y+j) * olda + x] =
122 | __half2float(idata[(y+j) * olda + x]);
123 | return;
124 | }
125 | }
126 |
127 | /// HUGE ASSUMPTION OF ilda == olda
128 | __global__ void copyCoalesced_upper(float *odata, const __half *idata, int olda,
129 | int ilda) {
130 |
131 | if(GPU_BLOCKIDX_Y < GPU_BLOCKIDX_X){ return; }
132 | if(GPU_BLOCKIDX_X == GPU_BLOCKIDX_Y)
133 | {
134 | int x = GPU_BLOCKIDX_X * TILE_DIM + GPU_THREADIDX_X;
135 | int y = GPU_BLOCKIDX_Y * TILE_DIM + GPU_THREADIDX_Y;
136 | for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS)
137 | {
138 | if(x <= y+j)
139 | odata[(y+j) * olda + x] =
140 | __half2float(idata[(y+j) * olda + x]);
141 | }
142 | return;
143 | }
144 |
145 | if(GPU_BLOCKIDX_X < GPU_BLOCKIDX_Y)
146 | {
147 | int x = GPU_BLOCKIDX_X * TILE_DIM + GPU_THREADIDX_X;
148 | int y = GPU_BLOCKIDX_Y * TILE_DIM + GPU_THREADIDX_Y;
149 | for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS)
150 | odata[(y+j) * olda + x] =
151 | __half2float(idata[(y+j) * olda + x]);
152 | return;
153 | }
154 | }
155 |
156 |
157 | // RIGHT TRANSPOSE CONVERT KERNELS
158 | __global__ void transposeCoalesced(__half *odata, const float *idata, int olda,
159 | int ilda) {
160 | __shared__ float tile[TILE_DIM_TRANS][TILE_DIM_TRANS+1]; // +1 column avoids shared-memory bank conflicts on the transposed read
161 |
162 | int x = GPU_BLOCKIDX_X * TILE_DIM_TRANS + GPU_THREADIDX_X;
163 | int y = GPU_BLOCKIDX_Y * TILE_DIM_TRANS + GPU_THREADIDX_Y;
164 | for (int j = 0; j < TILE_DIM_TRANS; j += BLOCK_ROWS)
165 | tile[GPU_THREADIDX_Y + j][GPU_THREADIDX_X] =
166 | idata[(long(y) +long( j)) * long(ilda) +long( x)];
167 |
168 | __syncthreads();
169 |
170 | x = GPU_BLOCKIDX_Y * TILE_DIM_TRANS + GPU_THREADIDX_X; // transpose block offset
171 | y = GPU_BLOCKIDX_X * TILE_DIM_TRANS + GPU_THREADIDX_Y;
172 |
173 | for (int j = 0; j < TILE_DIM_TRANS; j += BLOCK_ROWS)
174 | odata[(long(y) + long(j)) *long( olda) +long( x)] =
175 | __float2half(tile[GPU_THREADIDX_X][GPU_THREADIDX_Y + j]);
176 | }
177 |
178 |
179 | // HOST CALLS TO LAUNCH KERNELS (LEFT)
180 | __host__ void half_conversion_left( const float *C, int b, int plda, __half *lp, int lplda) {
181 | if(lplda == 0) return;
182 | dim3 block_dims(lplda / TILE_DIM, b / TILE_DIM, 1);
183 | dim3 thread_dims(TILE_DIM, BLOCK_ROWS, 1);
184 | copyCoalesced<<<block_dims, thread_dims>>>(lp, C, lplda, plda);
185 | /*#elif defined(ROCM_OLCF_PLATFORM)
186 | hipLaunchKernelGGL(copyCoalesced, block_dims, thread_dims, 0, 0,
187 | lp, C, lplda, plda);
188 | #else
189 | throw std::runtime_error("Error with build platform, neither CUDA nor ROCM. See CMake output.")
190 | #endif*/
191 |
192 | }
193 |
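// Editor's worked example of the launch geometry above (illustration only;
// b = 2560 is taken from the sample run in doc/crusher_example_32x32.out):
//   block_dims  = dim3(lplda / 64, b / 64) = dim3(40, 40)  -> 1600 tiles
//   thread_dims = dim3(64, 4)                              -> 256 threads/tile
// Each block covers one 64x64 tile, its threads stepping j = 0, 4, ..., 60
// over the tile's rows, which is why both extents must be divisible by
// TILE_DIM.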
See CMake output.") 190 | #endif*/ 191 | 192 | } 193 | 194 | // HOST CALLS TO LAUNCH KERNELS (RIGHT TRANS) 195 | 196 | __host__ void half_conversion_right_trans(const float *C, int b, int plda, __half * rp, int rplda) { 197 | //printf("rplda:%d\n",rplda); 198 | if(rplda == 0) return; 199 | dim3 block_dims(b / TILE_DIM_TRANS, rplda / TILE_DIM_TRANS, 1); 200 | dim3 thread_dims(TILE_DIM_TRANS, BLOCK_ROWS, 1); 201 | 202 | //#ifdef CUDA_OLCF_PLATFORM 203 | transposeCoalesced<<>>(rp, C, rplda, plda); 204 | /*#elif defined(ROCM_OLCF_PLATFORM) 205 | hipLaunchKernelGGL(transposeCoalesced, block_dims, thread_dims, 0, 0, 206 | rp, C, rplda, plda); 207 | #else 208 | throw std::runtime_error("Error with build platform, neither CUDA nor ROCM. See CMake output.") 209 | #endif*/ 210 | 211 | } 212 | 213 | 214 | // Copy cast kernel 215 | __global__ void copyFtoH(__half *odata, const float *idata, long olda, 216 | long ilda) { 217 | /* __shared__ float tile[TILE_DIM][TILE_DIM]; 218 | 219 | // int width = gridDim.x * TILE_DIM; 220 | int x = GPU_BLOCKIDX_X * TILE_DIM + GPU_THREADIDX_X; 221 | int y = GPU_BLOCKIDX_Y * TILE_DIM + GPU_THREADIDX_Y; 222 | 223 | for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS) 224 | tile[(GPU_THREADIDX_Y + j)][GPU_THREADIDX_X] = 225 | idata[(long(y) +long( j)) * long(ilda) +long( x)]; 226 | 227 | __syncthreads(); 228 | 229 | for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS) 230 | odata[(long(y) +long( j)) *long( olda) +long( x)] = 231 | __float2half(tile[(GPU_THREADIDX_Y + j)][GPU_THREADIDX_X]); 232 | */ 233 | int x = GPU_BLOCKIDX_X * TILE_DIM + GPU_THREADIDX_X; 234 | int y = GPU_BLOCKIDX_Y * TILE_DIM + GPU_THREADIDX_Y; 235 | 236 | for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS) 237 | odata[(long(y) +long( j)) *long( olda) +long( x)] = 238 | idata[(long(y) +long( j)) * long(ilda) +long( x)]; 239 | } 240 | 241 | __host__ void downCast_copy_general(__half* out, long olda, long nrow, long ncol, float* C, long ilda) 242 | { 243 | dim3 dimGrid(nrow/TILE_DIM,ncol/TILE_DIM, 1); 244 | dim3 dimBlock(TILE_DIM, BLOCK_ROWS, 1); 245 | //#ifdef CUDA_OLCF_PLATFORM 246 | copyFtoH<<>>(out, C, olda, ilda); 247 | /*#elif defined(ROCM_OLCF_PLATFORM) 248 | hipLaunchKernelGGL(copyFtoH, dimGrid, dimBlock,0,0, 249 | out, C, olda, ilda); 250 | #else 251 | throw std::runtime_error("Error with build platform, neither CUDA nor ROCM. 
See CMake output.") 252 | #endif 253 | */ //GPU_DEVICE_SYNCHRONIZE(); 254 | } 255 | 256 | __host__ void downCast_copy_lower(__half* out, long olda, long nrow, long ncol, float* C, long ilda) 257 | { 258 | dim3 dimGrid(nrow/TILE_DIM,ncol/TILE_DIM, 1); 259 | dim3 dimBlock(TILE_DIM, BLOCK_ROWS, 1); 260 | copyCoalesced_lower<<>>(out, C, olda, ilda); 261 | //GPU_DEVICE_SYNCHRONIZE(); 262 | } 263 | 264 | __host__ void downCast_copy_upper(__half* out, long olda, long nrow, long ncol, float* C, long ilda) 265 | { 266 | dim3 dimGrid(nrow/TILE_DIM,ncol/TILE_DIM, 1); 267 | dim3 dimBlock(TILE_DIM, BLOCK_ROWS, 1); 268 | copyCoalesced_upper<<>>(out, C, olda, ilda); 269 | //GPU_DEVICE_SYNCHRONIZE(); 270 | } 271 | 272 | // Debug purpose 273 | __host__ void upCast_copy_lower(float* out, long olda, long nrow, long ncol, __half * C, long ilda) 274 | { 275 | dim3 dimGrid(nrow/TILE_DIM,ncol/TILE_DIM, 1); 276 | dim3 dimBlock(TILE_DIM, BLOCK_ROWS, 1); 277 | copyCoalesced_lower<<>>(out, C, olda, ilda); 278 | //GPU_DEVICE_SYNCHRONIZE(); 279 | } 280 | 281 | __host__ void upCast_copy_upper(float* out, long olda, long nrow, long ncol, __half * C, long ilda) 282 | { 283 | dim3 dimGrid(nrow/TILE_DIM,ncol/TILE_DIM, 1); 284 | dim3 dimBlock(TILE_DIM, BLOCK_ROWS, 1); 285 | copyCoalesced_upper<<>>(out, C, olda, ilda); 286 | //GPU_DEVICE_SYNCHRONIZE(); 287 | } 288 | 289 | __host__ void downCast_trans_general(__half* out, long olda, long nrow, long ncol , float* C, long ilda) 290 | { 291 | // rplda should be passed in as npcol-start_col 292 | dim3 dimGrid(nrow/TILE_DIM_TRANS, ncol/TILE_DIM_TRANS, 1); 293 | dim3 dimBlock(TILE_DIM_TRANS, BLOCK_ROWS, 1); 294 | //#ifdef CUDA_OLCF_PLATFORM 295 | transposeCoalesced<<>>(out, C, olda, ilda); 296 | /*#elif defined(ROCM_OLCF_PLATFORM) 297 | hipLaunchKernelGGL(transposeCoalesced, dimGrid, dimBlock,0,0, 298 | out, C, olda, ilda); 299 | #else 300 | throw std::runtime_error("Error with build platform, neither CUDA nor ROCM. See CMake output.") 301 | #endif*/ 302 | //GPU_DEVICE_SYNCHRONIZE(); 303 | } 304 | 305 | __host__ void gen_identity_mat( float * out, long nrow, long ncol) 306 | { 307 | // ASSUMING IT B has to be divisible by TILE_DIM 308 | dim3 dimGrid(nrow/TILE_DIM_ID, ncol/TILE_DIM_ID,1); 309 | dim3 dimBlock(TILE_DIM_ID, BLOCK_ROWS,1); 310 | 311 | #ifdef CUDA_OLCF_PLATFORM 312 | gen_identity_mat_kernel<<>>(out, nrow, ncol); 313 | #elif defined(ROCM_OLCF_PLATFORM) 314 | hipLaunchKernelGGL(gen_identity_mat_kernel, dimGrid, dimBlock,0,0, 315 | out, nrow, ncol); 316 | #else 317 | throw std::runtime_error("Error with build platform, neither CUDA nor ROCM. 
See CMake output.") 318 | #endif 319 | GPU_DEVICE_SYNCHRONIZE(); 320 | } 321 | 322 | __global__ void gen_identity_mat_kernel( float * out, long nrow, long ncol) { 323 | int x = GPU_BLOCKIDX_X * TILE_DIM_ID + GPU_THREADIDX_X; 324 | int y = GPU_BLOCKIDX_Y * TILE_DIM_ID + GPU_THREADIDX_Y; 325 | float a = 1.0; 326 | float b = 0.0; 327 | for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS) 328 | { 329 | //y += j; 330 | if(x == y+j){ 331 | out[long(y+j)*long(nrow) + long( x)] = a; 332 | } 333 | else{ 334 | out[long(y+j)*long(nrow) + long( x)] = b; 335 | } 336 | } 337 | } 338 | 339 | __global__ void gen_identity_mat_kernel( __half * out, long nrow, long ncol) { 340 | int x = GPU_BLOCKIDX_X * TILE_DIM_ID + GPU_THREADIDX_X; 341 | int y = GPU_BLOCKIDX_Y * TILE_DIM_ID + GPU_THREADIDX_Y; 342 | __half a = 1.0; 343 | __half b = 0.0; 344 | for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS) 345 | { 346 | //y += j; 347 | if(x == y+j){ 348 | out[long(y+j)*long(nrow) + long( x)] = a; 349 | } 350 | else{ 351 | out[long(y+j)*long(nrow) + long( x)] = b; 352 | } 353 | } 354 | } 355 | 356 | __host__ void gen_identity_mat( __half * out, long nrow, long ncol) 357 | { 358 | // ASSUMING IT B has to be divisible by TILE_DIM 359 | dim3 dimGrid(nrow/TILE_DIM_ID, ncol/TILE_DIM_ID,1); 360 | dim3 dimBlock(TILE_DIM_ID, BLOCK_ROWS,1); 361 | 362 | #ifdef CUDA_OLCF_PLATFORM 363 | gen_identity_mat_kernel<<>>(out, nrow, ncol); 364 | #elif defined(ROCM_OLCF_PLATFORM) 365 | hipLaunchKernelGGL(gen_identity_mat_kernel, dimGrid, dimBlock,0,0, 366 | out, nrow, ncol); 367 | #else 368 | throw std::runtime_error("Error with build platform, neither CUDA nor ROCM. See CMake output.") 369 | #endif 370 | GPU_DEVICE_SYNCHRONIZE(); 371 | 372 | } 373 | 374 | 375 | __host__ void downCast_trans_general(__half* out, int olda, int nrow, int ncol , float* C, int ilda) 376 | { 377 | // rplda should be passed in as npcol-start_col 378 | 379 | dim3 dimGrid(nrow/TILE_DIM_TRANS, ncol/TILE_DIM_TRANS, 1); 380 | dim3 dimBlock(TILE_DIM_TRANS, BLOCK_ROWS, 1); 381 | #ifdef CUDA_OLCF_PLATFORM 382 | transposeCoalesced<<>>(out, C, (long)olda, (long)ilda); 383 | #endif 384 | // GPU_DEVICE_SYNCHRONIZE(); 385 | } 386 | 387 | -------------------------------------------------------------------------------- /src/matgen.hpp: -------------------------------------------------------------------------------- 1 | #ifndef MATGEN_HPP 2 | #define MATGEN_HPP 3 | // functions to initialize matrix. 
4 | #include "hpl_rand.hpp"
5 | #include "panel.hpp"
6 |
7 | #define MATGEM_THREADS 512
8 | template <typename T>
9 | __global__ void fill_panel_diag_dev(int n, int b, T * __restrict__ A, int lda, RandStat stat_ij, RandCoeff incl1,
10 | RandCoeff jump_thread, RandStat stat_00, int row_start, double* Diag)
11 | {
12 | int cur_row = blockIdx.x;
13 | int id = threadIdx.x;
14 | __shared__ double sh[MATGEM_THREADS];
15 |
16 | if (cur_row < b) {
17 | RandCoeff jump_id = incl1.pow(id * n + cur_row);
18 | RandStat stat_tj = jump_id*stat_ij;
19 |
20 | for (int j = 0; j + id < b; j += blockDim.x) {
21 | A[(j + id) * lda + cur_row] = static_cast<T>(stat_tj);
22 | stat_tj = jump_thread * stat_tj;
23 | }
24 |
25 | // diagonal
26 | double t = 0;
27 | jump_id = incl1.pow(id * n + cur_row + row_start); //Jump from 0 column
28 | stat_tj = jump_id*stat_00; //Stat_i_id
29 |
30 | for (int j = 0; j + id < n; j += blockDim.x) {
31 | if (j + id != cur_row + row_start) {
32 | t += fabs(static_cast<double>(stat_tj));
33 | }
34 | stat_tj = jump_thread * stat_tj;
35 | }
36 | sh[id] = t;
37 | __syncthreads();
38 |
39 | if (id < 256) sh[id] += sh[id + 256];
40 | __syncthreads();
41 |
42 | if (id < 128) sh[id] += sh[id + 128];
43 | __syncthreads();
44 | if (id < 64) sh[id] += sh[id + 64];
45 | __syncthreads();
46 |
47 | if (id < 32) sh[id] += sh[id + 32];
48 | __syncthreads();
49 |
50 | if (id < 16) sh[id] += sh[id + 16];
51 | __syncthreads();
52 |
53 | if (id < 8) sh[id] += sh[id + 8];
54 | __syncthreads();
55 |
56 | if (id < 4) sh[id] += sh[id + 4];
57 | __syncthreads();
58 |
59 | // diagonal
60 | if (id == 0) {
61 | Diag[cur_row] = sh[0] + sh[1] + sh[2] + sh[3];
62 | A[cur_row * lda + cur_row] = Diag[cur_row];
63 | }
64 | }
65 | }
66 |
67 | template <typename T>
68 | __global__ void fill_panel_dev(int n, int b, T * __restrict__ A,
69 | int lda, RandStat stat_ij, RandCoeff incl1, RandCoeff jump_thread)
70 | {
71 | int cur_row = blockIdx.x;
72 | int id = threadIdx.x;
73 |
74 | if (cur_row < b) {
75 | RandCoeff jump_id = incl1.pow(id * n + cur_row);
76 | RandStat stat_tj = jump_id*stat_ij;
77 | for (int j = 0; j + id < b; j += blockDim.x) {
78 | A[(j + id) * lda + cur_row] = static_cast<T>(static_cast<double>(stat_tj));
79 | stat_tj = jump_thread * stat_tj;
80 | }
81 | }
82 | }
83 |
84 | template <typename F>
85 | void panel_matgen_dev(Matgen const& mg, Panels<F>& p, double* Diag)
86 | {
87 | int const n = mg.n;
88 | int const b = p.b;
89 | int const i1 = p.i1;
90 | int const j1 = p.j1;
91 | int const istride = p.istride;
92 | int const jstride = p.jstride;
93 | int const nprow = p.nprow;
94 | int const npcol = p.npcol;
95 | size_t const lda = p.lda;
96 | RandCoeff incl1 = mg.incl1;
97 | RandStat stat_00 = RandStat::initialize(mg.seed);
98 | RandCoeff jump_thread = incl1.pow(MATGEM_THREADS * n);
99 |
100 | for (int pj = 0; pj < npcol; ++pj) {
101 | int j0 = j1 + pj * jstride;
102 |
103 | for (int pi = 0; pi < nprow; ++pi) {
104 | int i0 = i1 + pi * istride;
105 | RandCoeff jump_ij = mg.jump(b * static_cast<int64_t>(i0), b * static_cast<int64_t>(j0));
106 | //RandCoeff jump_ij = inc1.pow(b*i0 + n * static_cast<int64_t>(j0));
107 | RandStat stat_ij = jump_ij * stat_00;
108 |
109 | if (i0 != j0) {
110 | fill_panel_dev<<<b, MATGEM_THREADS>>>(n, b, p(pi, pj, 'd'), lda, stat_ij, incl1, jump_thread);
111 | }
112 | else {
113 | fill_panel_diag_dev<<<b, MATGEM_THREADS>>>(n, b, p(pi, pj, 'd'), lda, stat_ij, incl1, jump_thread, stat_00, b*i0, &Diag[pi*b]);
114 | }
115 | }
116 | }
117 | }
118 |
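// Editor's note on the reduction above (not part of the source): each
// diagonal entry is set to the sum of absolute values of all off-diagonal
// entries in its global row, so the generated matrix is diagonally dominant
// and factorizing without pivoting (cf. getrf_nopiv.hpp) is safe. A scalar
// equivalent of fill_panel_diag_dev's second phase, for global row i:
//
//   double t = 0;
//   for (int64_t j = 0; j < n; ++j)
//       if (j != i) t += fabs(a(i, j));  // a(i,j): the LCG value, as above
//   Diag[i - row_start] = t;             // also written back to A(i,i)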
119 | template <typename F> void pmatgen2(Matgen const &mg, Panels<F> &p, double* localSum) {
120 | int const bs = p.b;
121 | size_t const lda = p.lda;
122 | int const nprow = p.nprow;
123 | int const npcol = p.npcol;
124 |
125 | for (int pj = 0; pj < npcol; ++pj)
126 | #pragma omp parallel for
127 | for (int pi = 0; pi < nprow; ++pi) {
128 | F *pp = p(pi, pj);
129 | const int i = bs * (p.i1 + pi * p.istride);
130 | const int j = bs * (p.j1 + pj * p.jstride);
131 |
132 | fill_one_panel_with_rand2(mg.n, i, j, bs, bs, pp, lda, mg.seed,localSum);
133 | }
134 |
135 | MPI_Barrier( MPI_COMM_WORLD );
136 |
137 | }
138 |
139 |
140 | template <typename F> void pmatgen(Matgen const &mg, Panels<F> &p) {
141 | int const bs = p.b;
142 | size_t const lda = p.lda;
143 | int const nprow = p.nprow;
144 | int const npcol = p.npcol;
145 |
146 | for (int pj = 0; pj < npcol; ++pj)
147 | for (int pi = 0; pi < nprow; ++pi) {
148 | F *pp = p(pi, pj);
149 | const int i = bs * (p.i1 + pi * p.istride);
150 | const int j = bs * (p.j1 + pj * p.jstride);
151 |
152 | fill_one_panel_with_rand(mg.n, i, j, bs, bs, pp, lda, mg.seed,
153 | true);
154 | }
155 | }
156 |
157 | template <typename F> void pmatgen(HMGen<F> const &mg, Panels<F> &p) {
158 | size_t const lda = p.lda;
159 | int const nprow = p.nprow;
160 | int const npcol = p.npcol;
161 | int const b = p.b;
162 | int const i1 = p.i1;
163 | int const j1 = p.j1;
164 | int const istride = p.istride;
165 | int const jstride = p.jstride;
166 | F const alpha = mg.alpha;
167 | F const beta = mg.beta;
168 | F const ab = alpha * beta;
169 | F const done = 1;
170 |
171 | #pragma omp parallel for collapse(2)
172 | for (int pj = 0; pj < npcol; ++pj) {
173 | for (int j = 0; j < b; ++j) {
174 | int jstart = b * (j1 + pj * jstride);
175 | F const fpjj = jstart + j;
176 | for (int pi = 0; pi < nprow; ++pi) {
177 | int istart = b * (i1 + pi * istride);
178 | F *to = p(pi, pj);
179 | if (pi < pj) {
180 | for (int i = 0; i < b; ++i) {
181 | // assuming no diag.
182 | F aij = beta + ab * (istart + i);
183 | to[j * lda + i] = aij;
184 | }
185 | } else if (pi > pj) {
186 | for (int i = 0; i < b; ++i) {
187 | // assuming no diag.
188 | F aij = alpha + ab * fpjj;
189 | to[j * lda + i] = aij;
190 | }
191 | } else {
192 | for (int i = 0; i < j; ++i) {
193 | // assuming no diag.
194 | F aij = beta + ab * (jstart + i);
195 | to[j * lda + i] = aij;
196 | }
197 | F aij = done + ab * fpjj;
198 | to[j * lda + j] = aij;
199 | for (int i = j + 1; i < b; ++i) {
200 | // assuming no diag.
201 | F aij = alpha + ab * fpjj;
202 | to[j * lda + i] = aij;
203 | }
204 | }
205 | }
206 | }
207 | }
208 | }
209 |
210 | template <typename F> void pmatgen0(Panels<F> &p) {
211 | // initialize with zero
212 | int const bs = p.b;
213 | size_t const lda = p.lda;
214 | int const nprow = p.nprow;
215 | int const npcol = p.npcol;
216 | F const dzero = static_cast<F>(0);
217 |
218 | if (p.is_tile) {
219 | #pragma omp parallel for collapse(2)
220 | for (int pj = 0; pj < npcol; ++pj)
221 | for (int pi = 0; pi < nprow; ++pi) {
222 | F *pp = p(pi, pj);
223 | for (int j = 0; j < bs; ++j)
224 | for (int i = 0; i < bs; ++i)
225 | pp[j * lda + i] = dzero;
226 | }
227 | } else {
228 | F *ptr = p(0, 0);
229 | size_t size = static_cast<size_t>(p.ldpp) * npcol;
230 | #pragma omp parallel for simd
231 | for (size_t i = 0; i < size; ++i)
232 | ptr[i] = dzero;
233 | }
234 | }
235 |
236 | template <typename F> void pmatl1est(Matgen const &mg, Panels<F> &p) {
237 | // approximation of the decomposition
238 | int const bs = p.b;
239 | size_t const lda = p.lda;
240 | int const nprow = p.nprow;
241 | int const npcol = p.npcol;
242 |
243 | #pragma omp parallel for
244 | for (int pj = 0; pj < npcol; ++pj) {
245 | double buf[bs];
246 | const int j = bs * (p.j1 + pj * p.jstride);
247 | for (int jj = 0; jj < bs; ++jj)
248 | buf[jj] = 1. / calc_diag(j + jj, mg.n, mg.seed);
249 | for (int pi = 0; pi < nprow; ++pi) {
250 | F *pp = p(pi, pj);
251 | const int i = bs * (p.i1 + pi * p.istride);
252 | if (i < j)
253 | continue;
254 | if (i == j) {
255 | for (int jj = 0; jj < bs; ++jj) {
256 | F d = buf[jj];
257 | for (int ii = 0; ii < bs; ++ii) {
258 | if (i + ii > j + jj) {
259 | pp[jj * lda + ii] *= d;
260 | }
261 | }
262 | }
263 | } else {
264 | for (int jj = 0; jj < bs; ++jj) {
265 | F d = buf[jj];
266 | for (int ii = 0; ii < bs; ++ii) {
267 | pp[jj * lda + ii] *= d;
268 | }
269 | }
270 | }
271 | }
272 | }
273 | }
274 |
275 | template <typename F> void pmatl1est(HMGen<F> const &mg, Panels<F> &p) {
276 | // approximation of the decomposition
277 | int const bs = p.b;
278 | size_t const lda = p.lda;
279 | int const nprow = p.nprow;
280 | int const npcol = p.npcol;
281 | F const alpha = mg.alpha;
282 | F const beta = mg.beta;
283 | F const done = 1;
284 |
285 | #pragma omp parallel for collapse(2) schedule(dynamic)
286 | for (int pj = 0; pj < npcol; ++pj)
287 | for (int pi = 0; pi < nprow; ++pi) {
288 | F *pp = p(pi, pj);
289 | const int i = bs * (p.i1 + pi * p.istride);
290 | const int j = bs * (p.j1 + pj * p.jstride);
291 | if (i < j) {
292 | for (int jj = 0; jj < bs; ++jj) {
293 | for (int ii = 0; ii < bs; ++ii) {
294 | pp[jj * lda + ii] = beta;
295 | }
296 | }
297 | } else if (i > j) {
298 | for (int jj = 0; jj < bs; ++jj) {
299 | for (int ii = 0; ii < bs; ++ii) {
300 | pp[jj * lda + ii] = alpha;
301 | }
302 | }
303 | } else {
304 | for (int jj = 0; jj < bs; ++jj) {
305 | for (int ii = 0; ii < jj; ++ii) {
306 | pp[jj * lda + ii] = beta;
307 | }
308 | pp[jj * lda + jj] = done;
309 | for (int ii = jj + 1; ii < bs; ++ii) {
310 | pp[jj * lda + ii] = alpha;
311 | }
312 | }
313 | }
314 | }
315 | }
316 |
317 | template <typename F>
318 | void pcolvgen(Matgen const &mg, Panels<F> const &p, double *dx) {
319 | int nprow = p.nprow;
320 | int b = p.b;
321 | int i1 = p.i1;
322 | int j1 = p.j1;
323 | int istride = p.istride;
324 | int jstride = p.jstride;
325 | for (int i = 0; i < nprow; ++i) {
326 | int ipos = i1 + i * istride;
327 | if (ipos % jstride == j1) {
328 | fill_one_panel_with_rand(mg.n, b * ipos, mg.n, b, 1, dx + b * i, 1,
329 | mg.seed, false);
330 | }
331 | }
332 | }
333 |
334 | template <typename F>
335 | void pdiaggen2(Matgen const &mg, Panels<F> &p, double *dx, double* localSum) {
336 | int nprow = p.nprow;
337 | int b = p.b;
338 | int i1 = p.i1;
339 | int j1 = p.j1;
340 | int istride = p.istride;
341 | int jstride = p.jstride;
342 | int const bs = p.b;
343 | size_t const lda = p.lda;
344 | int const npcol = p.npcol;
345 |
346 | double* globalSum = (double*)malloc(mg.n*sizeof(double));
347 | if ( globalSum == NULL )
348 | {
349 | printf( "Allocation of globalSum failed\n" );
350 | exit( 10 );
351 | }
352 | memset(globalSum, 0, mg.n*sizeof(double));
353 | MPI_Allreduce(localSum, globalSum, mg.n, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
354 |
355 | #pragma omp parallel for
356 | for (int i = 0; i < nprow; ++i) {
357 | int ipos = i1 + i * istride;
358 | if (ipos % jstride == j1) {
359 | for (int k = 0; k < b; ++k)
360 | dx[b * i + k] = globalSum[ b * ipos + k ];
361 | }
362 | }
363 | // fill A back
364 | for (int pj = 0; pj < npcol; ++pj) {
365 | for (int pi = 0; pi < nprow; ++pi) {
366 | F *pp = p(pi, pj);
367 | const int i0 = bs * (p.i1 + pi * p.istride);
368 | const int j0 = bs * (p.j1 + pj * p.jstride);
369 | if( i0 == j0 )
370 | {
371 | #pragma omp parallel for
372 | for (int ii = 0; ii < b; ii++)
373 | pp[ lda*ii + ii ] = static_cast<F>( globalSum[ i0+ii ] );
374 | }
375 | }
376 | }
377 | std::free( globalSum );
378 |
379 | }
380 |
381 | template <typename F>
382 | void pdiaggen(Matgen const &mg, Panels<F> const &p, double *dx) {
383 | int nprow = p.nprow;
384 | int b = p.b;
385 | int i1 = p.i1;
386 | int j1 = p.j1;
387 | int istride = p.istride;
388 | int jstride = p.jstride;
389 | for (int i = 0; i < nprow; ++i) {
390 | int ipos = i1 + i * istride;
391 | if (ipos % jstride == j1) {
392 | #pragma omp parallel for
393 | for (int k = 0; k < b; ++k)
394 | dx[b * i + k] = calc_diag(b * ipos + k, mg.n, mg.seed);
395 | }
396 | }
397 | }
398 |
399 | template <typename F>
400 | void pdiaggen(HMGen<F> const &mg, Panels<F> const &p, double *dx) {
401 | int nprow = p.nprow;
402 | int b = p.b;
403 | int i1 = p.i1;
404 | int j1 = p.j1;
405 | int istride = p.istride;
406 | int jstride = p.jstride;
407 | F const ab = mg.alpha * mg.beta;
408 | F const done = 1;
409 | for (int i = 0; i < nprow; ++i) {
410 | int ipos = i1 + i * istride;
411 | if (ipos % jstride == j1) {
412 | #pragma omp parallel for
413 | for (int k = 0; k < b; ++k)
414 | dx[b * i + k] = done + ab * (b * ipos + k);
415 | }
416 | }
417 | }
418 |
419 | template <typename F>
420 | void panel_colvgen(Matgen const& mg, Panels<F> const& p, double* dx) {
421 | int nprow = p.nprow;
422 | int b = p.b;
423 | int i1 = p.i1;
424 | int j1 = p.j1;
425 | int istride = p.istride;
426 | int jstride = p.jstride;
427 |
428 | #pragma omp parallel for
429 | for (int i = 0; i < nprow; ++i) {
430 | int ipos = i1 + i * istride;
431 |
432 | if (ipos % jstride == j1) {
433 | panel_fill_one_with_rand(mg.n, b * ipos, mg.n, b, 1, dx + b * i, 1, mg.seed, false);
434 | }
435 | }
436 | }
437 |
438 | template <typename F>
439 | void panel_diaggen(Matgen const& mg, Panels<F> const& p, double* dx) {
440 | int nprow = p.nprow;
441 | int b = p.b;
442 | int i1 = p.i1;
443 | int j1 = p.j1;
444 | int istride = p.istride;
445 | int jstride = p.jstride;
446 |
447 | // PrintLogMsg( "nprow ... 
template <typename F>
void panel_diaggen(Matgen<F> const &mg, Panels<F> const &p, double *dx) {
    int nprow = p.nprow;
    int b = p.b;
    int i1 = p.i1;
    int j1 = p.j1;
    int istride = p.istride;
    int jstride = p.jstride;

    // PrintLogMsg( "nprow ... %d n ... %d\n", nprow, mg.n );

    #pragma omp parallel for
    for (int i = 0; i < nprow; ++i) {
        int ipos = i1 + i * istride;

        if (ipos % jstride == j1) {
            for (int k = 0; k < b; ++k) {
                // dx[b * i + k] = calc_diag(b * ipos + k, mg.n, mg.seed);
                dx[b * i + k] = 0.25 * mg.n;
            }
        }
    }
}

#endif // _HIGHAMMGEN_HPP
--------------------------------------------------------------------------------
/doc/crusher_example_32x32.out:
--------------------------------------------------------------------------------
===============================================================================
OpenMxP - High Performance Mixed Precision Benchmark - NCCS/OLCF
===============================================================================
Build Info: git branch: [], hash-id: []
Running configuration:
Ranks = 1024
OpenMP threads = 7
PxQ 32x32
cusolver/rocsolver Sgetrf
2ringM comm
1ring dcomm
Node Grid - 2x4R pxq grid
no sync on sgemm
default stream sgemm
gpu direct mpi
GPU Initialization
Alt-Trsm Method
Building panels
gpu p[62940774400] l[1284505600] r[1284505600] ... tot[65509785600]
Finished building panels
gpu d_piv[26214400]
Initialization 1 = 2.1105 sec
Gpu memory (free/total) 2978675712/68702699520 jobid=290129
n=4014080 ln=125440 b=2560 r=32 c=32 epoch_size=81920
#BEGIN_: Mon Mar 27 04:30:59 2023
Entering LU
Step 10, Rank 0, Elapsed 5.80, Work left 98.10%, Time left: 299.50, Est Total: 305.30, GFlops: 141233934.16, TFlops/GPU: 137.92
Step 20, Rank 0, Elapsed 11.19, Work left 96.22%, Time left: 285.07, Est Total: 296.27, GFlops: 145540752.65, TFlops/GPU: 142.13
Step 30, Rank 0, Elapsed 16.27, Work left 94.37%, Time left: 272.65, Est Total: 288.92, GFlops: 149241859.39, TFlops/GPU: 145.74
Step 40, Rank 0, Elapsed 21.67, Work left 92.54%, Time left: 268.80, Est Total: 290.46, GFlops: 148447771.05, TFlops/GPU: 144.97
Step 50, Rank 0, Elapsed 26.81, Work left 90.74%, Time left: 262.57, Est Total: 289.38, GFlops: 149005030.04, TFlops/GPU: 145.51
Step 60, Rank 0, Elapsed 31.87, Work left 88.95%, Time left: 256.63, Est Total: 288.49, GFlops: 149462321.26, TFlops/GPU: 145.96
Step 70, Rank 0, Elapsed 37.07, Work left 87.20%, Time left: 252.46, Est Total: 289.53, GFlops: 148928266.10, TFlops/GPU: 145.44
Step 80, Rank 0, Elapsed 41.79, Work left 85.46%, Time left: 245.63, Est Total: 287.42, GFlops: 150020347.73, TFlops/GPU: 146.50
Step 90, Rank 0, Elapsed 46.81, Work left 83.75%, Time left: 241.26, Est Total: 288.08, GFlops: 149678958.04, TFlops/GPU: 146.17
Step 100, Rank 0, Elapsed 51.87, Work left 82.06%, Time left: 237.28, Est Total: 289.15, GFlops: 149122148.49, TFlops/GPU: 145.63
Step 110, Rank 0, Elapsed 56.42, Work left 80.40%, Time left: 231.39, Est Total: 287.81, GFlops: 149817072.91, TFlops/GPU: 146.31
Step 120, Rank 0, Elapsed 61.32, Work left 78.75%, Time left: 227.28, Est Total: 288.60, GFlops: 149406456.37, TFlops/GPU: 145.90
Step 130, Rank 0, Elapsed 66.06, Work left 77.13%, Time left: 222.84, Est Total: 288.90, GFlops: 149249133.14, TFlops/GPU: 145.75
Step 140, Rank 0, Elapsed 70.46, Work left 75.53%, Time left: 217.53, Est Total: 287.98, GFlops: 149726580.18, TFlops/GPU: 146.22
Step 150, Rank 0, Elapsed 75.06, Work left 73.96%, Time left: 213.19, Est Total: 288.25, GFlops: 149587093.28, TFlops/GPU: 146.08
Step 160, Rank 0, Elapsed 79.31, Work left 72.41%, Time left: 208.09, Est Total: 287.40, GFlops: 150031527.43, TFlops/GPU: 146.52
Step 170, Rank 0, Elapsed 83.98, Work left 70.87%, Time left: 204.34, Est Total: 288.32, GFlops: 149550099.31, TFlops/GPU: 146.05
Step 180, Rank 0, Elapsed 88.45, Work left 69.36%, Time left: 200.25, Est Total: 288.69, GFlops: 149358082.93, TFlops/GPU: 145.86
Step 190, Rank 0, Elapsed 92.67, Work left 67.87%, Time left: 195.79, Est Total: 288.46, GFlops: 149481419.72, TFlops/GPU: 145.98
Step 200, Rank 0, Elapsed 97.01, Work left 66.41%, Time left: 191.78, Est Total: 288.79, GFlops: 149309802.78, TFlops/GPU: 145.81
Step 210, Rank 0, Elapsed 101.15, Work left 64.96%, Time left: 187.55, Est Total: 288.70, GFlops: 149355346.72, TFlops/GPU: 145.85
Step 220, Rank 0, Elapsed 105.30, Work left 63.54%, Time left: 183.49, Est Total: 288.78, GFlops: 149312297.08, TFlops/GPU: 145.81
Step 230, Rank 0, Elapsed 109.39, Work left 62.13%, Time left: 179.50, Est Total: 288.89, GFlops: 149255627.55, TFlops/GPU: 145.76
Step 240, Rank 0, Elapsed 113.16, Work left 60.75%, Time left: 175.15, Est Total: 288.30, GFlops: 149560060.54, TFlops/GPU: 146.05
Step 250, Rank 0, Elapsed 117.20, Work left 59.39%, Time left: 171.39, Est Total: 288.59, GFlops: 149409467.15, TFlops/GPU: 145.91
Step 260, Rank 0, Elapsed 121.22, Work left 58.05%, Time left: 167.73, Est Total: 288.96, GFlops: 149221845.23, TFlops/GPU: 145.72
Step 270, Rank 0, Elapsed 124.88, Work left 56.73%, Time left: 163.71, Est Total: 288.59, GFlops: 149410745.17, TFlops/GPU: 145.91
Step 280, Rank 0, Elapsed 128.76, Work left 55.43%, Time left: 160.10, Est Total: 288.85, GFlops: 149275326.59, TFlops/GPU: 145.78
Step 290, Rank 0, Elapsed 132.58, Work left 54.14%, Time left: 156.54, Est Total: 289.12, GFlops: 149137334.16, TFlops/GPU: 145.64
Step 300, Rank 0, Elapsed 136.07, Work left 52.88%, Time left: 152.73, Est Total: 288.80, GFlops: 149305621.26, TFlops/GPU: 145.81
Step 310, Rank 0, Elapsed 139.64, Work left 51.64%, Time left: 149.13, Est Total: 288.77, GFlops: 149319831.15, TFlops/GPU: 145.82
Step 320, Rank 0, Elapsed 143.02, Work left 50.42%, Time left: 145.45, Est Total: 288.47, GFlops: 149471644.59, TFlops/GPU: 145.97
Step 330, Rank 0, Elapsed 146.69, Work left 49.22%, Time left: 142.17, Est Total: 288.86, GFlops: 149274742.08, TFlops/GPU: 145.78
Step 340, Rank 0, Elapsed 150.10, Work left 48.03%, Time left: 138.75, Est Total: 288.84, GFlops: 149280135.18, TFlops/GPU: 145.78
Step 350, Rank 0, Elapsed 153.38, Work left 46.87%, Time left: 135.31, Est Total: 288.70, GFlops: 149357426.93, TFlops/GPU: 145.86
Step 360, Rank 0, Elapsed 156.84, Work left 45.73%, Time left: 132.14, Est Total: 288.98, GFlops: 149208174.53, TFlops/GPU: 145.71
Step 370, Rank 0, Elapsed 160.12, Work left 44.60%, Time left: 128.90, Est Total: 289.02, GFlops: 149187823.39, TFlops/GPU: 145.69
Step 380, Rank 0, Elapsed 163.25, Work left 43.49%, Time left: 125.64, Est Total: 288.89, GFlops: 149256953.84, TFlops/GPU: 145.76
Step 390, Rank 0, Elapsed 166.55, Work left 42.40%, Time left: 122.61, Est Total: 289.16, GFlops: 149115792.58, TFlops/GPU: 145.62
Step 400, Rank 0, Elapsed 169.47, Work left 41.33%, Time left: 119.39, Est Total: 288.86, GFlops: 149273109.20, TFlops/GPU: 145.77
Step 410, Rank 0, Elapsed 172.63, Work left 40.28%, Time left: 116.44, Est Total: 289.07, GFlops: 149165550.90, TFlops/GPU: 145.67
Step 420, Rank 0, Elapsed 175.89, Work left 39.25%, Time left: 113.62, Est Total: 289.50, GFlops: 148940771.20, TFlops/GPU: 145.45
Step 430, Rank 0, Elapsed 178.74, Work left 38.23%, Time left: 110.62, Est Total: 289.35, GFlops: 149017949.20, TFlops/GPU: 145.53
Step 440, Rank 0, Elapsed 181.67, Work left 37.23%, Time left: 107.75, Est Total: 289.42, GFlops: 148984267.03, TFlops/GPU: 145.49
Step 450, Rank 0, Elapsed 184.76, Work left 36.25%, Time left: 105.05, Est Total: 289.82, GFlops: 148778772.01, TFlops/GPU: 145.29
Step 460, Rank 0, Elapsed 187.67, Work left 35.28%, Time left: 102.32, Est Total: 289.99, GFlops: 148689121.80, TFlops/GPU: 145.20
Step 470, Rank 0, Elapsed 190.59, Work left 34.34%, Time left: 99.67, Est Total: 290.26, GFlops: 148551035.05, TFlops/GPU: 145.07
Step 480, Rank 0, Elapsed 193.40, Work left 33.41%, Time left: 97.02, Est Total: 290.42, GFlops: 148470412.52, TFlops/GPU: 144.99
Step 490, Rank 0, Elapsed 196.30, Work left 32.50%, Time left: 94.50, Est Total: 290.80, GFlops: 148276490.22, TFlops/GPU: 144.80
Step 500, Rank 0, Elapsed 198.98, Work left 31.60%, Time left: 91.92, Est Total: 290.90, GFlops: 148223368.54, TFlops/GPU: 144.75
Step 510, Rank 0, Elapsed 201.66, Work left 30.72%, Time left: 89.42, Est Total: 291.08, GFlops: 148133057.74, TFlops/GPU: 144.66
Step 520, Rank 0, Elapsed 204.52, Work left 29.86%, Time left: 87.06, Est Total: 291.58, GFlops: 147879597.86, TFlops/GPU: 144.41
Step 530, Rank 0, Elapsed 207.32, Work left 29.01%, Time left: 84.72, Est Total: 292.04, GFlops: 147646915.93, TFlops/GPU: 144.19
Step 540, Rank 0, Elapsed 209.89, Work left 28.18%, Time left: 82.36, Est Total: 292.25, GFlops: 147542523.61, TFlops/GPU: 144.08
Step 550, Rank 0, Elapsed 212.77, Work left 27.37%, Time left: 80.16, Est Total: 292.93, GFlops: 147198233.90, TFlops/GPU: 143.75
Step 560, Rank 0, Elapsed 215.48, Work left 26.57%, Time left: 77.96, Est Total: 293.44, GFlops: 146941771.26, TFlops/GPU: 143.50
Step 570, Rank 0, Elapsed 218.30, Work left 25.78%, Time left: 75.84, Est Total: 294.14, GFlops: 146594025.82, TFlops/GPU: 143.16
Step 580, Rank 0, Elapsed 221.14, Work left 25.02%, Time left: 73.78, Est Total: 294.92, GFlops: 146206048.63, TFlops/GPU: 142.78
Step 590, Rank 0, Elapsed 223.38, Work left 24.26%, Time left: 71.57, Est Total: 294.96, GFlops: 146187623.15, TFlops/GPU: 142.76
Step 600, Rank 0, Elapsed 225.77, Work left 23.53%, Time left: 69.46, Est Total: 295.23, GFlops: 146050029.72, TFlops/GPU: 142.63
Step 610, Rank 0, Elapsed 228.33, Work left 22.81%, Time left: 67.46, Est Total: 295.79, GFlops: 145777103.24, TFlops/GPU: 142.36
Step 620, Rank 0, Elapsed 230.38, Work left 22.10%, Time left: 65.36, Est Total: 295.74, GFlops: 145798381.88, TFlops/GPU: 142.38
Step 630, Rank 0, Elapsed 232.51, Work left 21.41%, Time left: 63.33, Est Total: 295.85, GFlops: 145746978.11, TFlops/GPU: 142.33
Step 640, Rank 0, Elapsed 234.55, Work left 20.73%, Time left: 61.34, Est Total: 295.89, GFlops: 145724876.59, TFlops/GPU: 142.31
Step 650, Rank 0, Elapsed 236.68, Work left 20.07%, Time left: 59.42, Est Total: 296.09, GFlops: 145625613.43, TFlops/GPU: 142.21
Step 660, Rank 0, Elapsed 238.65, Work left 19.42%, Time left: 57.51, Est Total: 296.16, GFlops: 145590555.99, TFlops/GPU: 142.18
Step 670, Rank 0, Elapsed 240.52, Work left 18.78%, Time left: 55.63, Est Total: 296.15, GFlops: 145599001.44, TFlops/GPU: 142.19
Step 680, Rank 0, Elapsed 242.66, Work left 18.16%, Time left: 53.86, Est Total: 296.52, GFlops: 145416335.10, TFlops/GPU: 142.01
Step 690, Rank 0, Elapsed 244.73, Work left 17.56%, Time left: 52.12, Est Total: 296.84, GFlops: 145257936.33, TFlops/GPU: 141.85
Step 700, Rank 0, Elapsed 246.51, Work left 16.96%, Time left: 50.36, Est Total: 296.88, GFlops: 145242252.85, TFlops/GPU: 141.84
Step 710, Rank 0, Elapsed 248.49, Work left 16.38%, Time left: 48.69, Est Total: 297.18, GFlops: 145094182.05, TFlops/GPU: 141.69
Step 720, Rank 0, Elapsed 250.11, Work left 15.82%, Time left: 47.00, Est Total: 297.11, GFlops: 145129124.89, TFlops/GPU: 141.73
Step 730, Rank 0, Elapsed 251.83, Work left 15.26%, Time left: 45.37, Est Total: 297.19, GFlops: 145087115.70, TFlops/GPU: 141.69
Step 740, Rank 0, Elapsed 253.82, Work left 14.72%, Time left: 43.83, Est Total: 297.64, GFlops: 144867494.55, TFlops/GPU: 141.47
Step 750, Rank 0, Elapsed 255.48, Work left 14.20%, Time left: 42.27, Est Total: 297.75, GFlops: 144813197.16, TFlops/GPU: 141.42
Step 760, Rank 0, Elapsed 257.17, Work left 13.68%, Time left: 40.77, Est Total: 297.94, GFlops: 144721851.75, TFlops/GPU: 141.33
Step 770, Rank 0, Elapsed 259.19, Work left 13.18%, Time left: 39.35, Est Total: 298.54, GFlops: 144433250.11, TFlops/GPU: 141.05
Step 780, Rank 0, Elapsed 260.62, Work left 12.69%, Time left: 37.89, Est Total: 298.51, GFlops: 144446447.66, TFlops/GPU: 141.06
Step 790, Rank 0, Elapsed 262.24, Work left 12.22%, Time left: 36.49, Est Total: 298.74, GFlops: 144337785.04, TFlops/GPU: 140.95
Step 800, Rank 0, Elapsed 263.77, Work left 11.75%, Time left: 35.12, Est Total: 298.89, GFlops: 144261263.33, TFlops/GPU: 140.88
Step 810, Rank 0, Elapsed 265.26, Work left 11.30%, Time left: 33.78, Est Total: 299.04, GFlops: 144191800.49, TFlops/GPU: 140.81
Step 820, Rank 0, Elapsed 266.68, Work left 10.86%, Time left: 32.48, Est Total: 299.16, GFlops: 144134032.45, TFlops/GPU: 140.76
Step 830, Rank 0, Elapsed 267.88, Work left 10.43%, Time left: 31.18, Est Total: 299.07, GFlops: 144178542.86, TFlops/GPU: 140.80
Step 840, Rank 0, Elapsed 269.42, Work left 10.01%, Time left: 29.96, Est Total: 299.38, GFlops: 144024953.37, TFlops/GPU: 140.65
Step 850, Rank 0, Elapsed 270.83, Work left 9.60%, Time left: 28.77, Est Total: 299.59, GFlops: 143923933.29, TFlops/GPU: 140.55
Step 860, Rank 0, Elapsed 272.05, Work left 9.21%, Time left: 27.58, Est Total: 299.64, GFlops: 143903822.32, TFlops/GPU: 140.53
Step 870, Rank 0, Elapsed 273.46, Work left 8.82%, Time left: 26.46, Est Total: 299.91, GFlops: 143770997.23, TFlops/GPU: 140.40
Step 880, Rank 0, Elapsed 274.54, Work left 8.45%, Time left: 25.33, Est Total: 299.88, GFlops: 143789234.27, TFlops/GPU: 140.42
Step 890, Rank 0, Elapsed 275.72, Work left 8.08%, Time left: 24.25, Est Total: 299.97, GFlops: 143742864.39, TFlops/GPU: 140.37
Step 900, Rank 0, Elapsed 277.05, Work left 7.73%, Time left: 23.22, Est Total: 300.27, GFlops: 143601737.67, TFlops/GPU: 140.24
Step 910, Rank 0, Elapsed 278.04, Work left 7.39%, Time left: 22.19, Est Total: 300.23, GFlops: 143620576.13, TFlops/GPU: 140.25
Step 920, Rank 0, Elapsed 279.08, Work left 7.06%, Time left: 21.19, Est Total: 300.28, GFlops: 143596488.77, TFlops/GPU: 140.23
Step 930, Rank 0, Elapsed 280.31, Work left 6.74%, Time left: 20.25, Est Total: 300.56, GFlops: 143463799.97, TFlops/GPU: 140.10
Step 940, Rank 0, Elapsed 281.25, Work left 6.42%, Time left: 19.31, Est Total: 300.56, GFlops: 143460769.82, TFlops/GPU: 140.10
Step 950, Rank 0, Elapsed 282.29, Work left 6.12%, Time left: 18.41, Est Total: 300.70, GFlops: 143397108.83, TFlops/GPU: 140.04
Step 960, Rank 0, Elapsed 283.28, Work left 5.83%, Time left: 17.54, Est Total: 300.82, GFlops: 143337182.00, TFlops/GPU: 139.98
Step 970, Rank 0, Elapsed 284.28, Work left 5.55%, Time left: 16.70, Est Total: 300.97, GFlops: 143264125.56, TFlops/GPU: 139.91
Step 980, Rank 0, Elapsed 285.20, Work left 5.27%, Time left: 15.88, Est Total: 301.08, GFlops: 143214038.71, TFlops/GPU: 139.86
Step 990, Rank 0, Elapsed 285.99, Work left 5.01%, Time left: 15.08, Est Total: 301.07, GFlops: 143216965.77, TFlops/GPU: 139.86
Step 1000, Rank 0, Elapsed 287.05, Work left 4.75%, Time left: 14.33, Est Total: 301.38, GFlops: 143071779.56, TFlops/GPU: 139.72
Step 1010, Rank 0, Elapsed 287.85, Work left 4.51%, Time left: 13.58, Est Total: 301.43, GFlops: 143045912.50, TFlops/GPU: 139.69
Step 1020, Rank 0, Elapsed 288.59, Work left 4.27%, Time left: 12.87, Est Total: 301.46, GFlops: 143035383.98, TFlops/GPU: 139.68
Step 1030, Rank 0, Elapsed 289.53, Work left 4.04%, Time left: 12.19, Est Total: 301.72, GFlops: 142912294.41, TFlops/GPU: 139.56
Step 1040, Rank 0, Elapsed 290.22, Work left 3.82%, Time left: 11.52, Est Total: 301.74, GFlops: 142898236.78, TFlops/GPU: 139.55
Step 1050, Rank 0, Elapsed 291.01, Work left 3.61%, Time left: 10.88, Est Total: 301.89, GFlops: 142829714.18, TFlops/GPU: 139.48
Step 1060, Rank 0, Elapsed 291.89, Work left 3.40%, Time left: 10.28, Est Total: 302.17, GFlops: 142698214.25, TFlops/GPU: 139.35
Step 1070, Rank 0, Elapsed 292.49, Work left 3.20%, Time left: 9.68, Est Total: 302.17, GFlops: 142698739.05, TFlops/GPU: 139.35
Step 1080, Rank 0, Elapsed 293.18, Work left 3.01%, Time left: 9.11, Est Total: 302.29, GFlops: 142640986.57, TFlops/GPU: 139.30
Step 1090, Rank 0, Elapsed 294.00, Work left 2.83%, Time left: 8.57, Est Total: 302.57, GFlops: 142507256.86, TFlops/GPU: 139.17
Step 1100, Rank 0, Elapsed 294.55, Work left 2.66%, Time left: 8.05, Est Total: 302.60, GFlops: 142494919.89, TFlops/GPU: 139.16
Step 1110, Rank 0, Elapsed 295.21, Work left 2.49%, Time left: 7.54, Est Total: 302.76, GFlops: 142419895.25, TFlops/GPU: 139.08
Step 1120, Rank 0, Elapsed 295.81, Work left 2.33%, Time left: 7.06, Est Total: 302.88, GFlops: 142363952.48, TFlops/GPU: 139.03
Step 1130, Rank 0, Elapsed 296.52, Work left 2.18%, Time left: 6.61, Est Total: 303.12, GFlops: 142248370.75, TFlops/GPU: 138.91
Step 1140, Rank 0, Elapsed 297.17, Work left 2.03%, Time left: 6.17, Est Total: 303.34, GFlops: 142148242.29, TFlops/GPU: 138.82
Step 1150, Rank 0, Elapsed 297.68, Work left 1.89%, Time left: 5.75, Est Total: 303.43, GFlops: 142106780.17, TFlops/GPU: 138.78
Step 1160, Rank 0, Elapsed 298.33, Work left 1.76%, Time left: 5.35, Est Total: 303.68, GFlops: 141988061.71, TFlops/GPU: 138.66
Step 1170, Rank 0, Elapsed 298.83, Work left 1.64%, Time left: 4.97, Est Total: 303.80, GFlops: 141933525.99, TFlops/GPU: 138.61
Step 1180, Rank 0, Elapsed 299.24, Work left 1.52%, Time left: 4.60, Est Total: 303.85, GFlops: 141909903.95, TFlops/GPU: 138.58
Step 1190, Rank 0, Elapsed 299.84, Work left 1.40%, Time left: 4.26, Est Total: 304.10, GFlops: 141792392.34, TFlops/GPU: 138.47
Step 1200, Rank 0, Elapsed 300.21, Work left 1.29%, Time left: 3.93, Est Total: 304.14, GFlops: 141770595.14, TFlops/GPU: 138.45
Step 1210, Rank 0, Elapsed 300.66, Work left 1.19%, Time left: 3.62, Est Total: 304.28, GFlops: 141708376.49, TFlops/GPU: 138.39
Step 1220, Rank 0, Elapsed 301.22, Work left 1.09%, Time left: 3.33, Est Total: 304.55, GFlops: 141582908.57, TFlops/GPU: 138.26
Step 1230, Rank 0, Elapsed 301.54, Work left 1.00%, Time left: 3.05, Est Total: 304.59, GFlops: 141561903.63, TFlops/GPU: 138.24
Step 1240, Rank 0, Elapsed 301.98, Work left 0.92%, Time left: 2.79, Est Total: 304.77, GFlops: 141478466.06, TFlops/GPU: 138.16
Step 1250, Rank 0, Elapsed 302.48, Work left 0.83%, Time left: 2.54, Est Total: 305.02, GFlops: 141361659.10, TFlops/GPU: 138.05
Step 1260, Rank 0, Elapsed 302.80, Work left 0.76%, Time left: 2.31, Est Total: 305.11, GFlops: 141322184.99, TFlops/GPU: 138.01
Step 1270, Rank 0, Elapsed 303.19, Work left 0.69%, Time left: 2.10, Est Total: 305.28, GFlops: 141242033.69, TFlops/GPU: 137.93
Step 1280, Rank 0, Elapsed 303.54, Work left 0.62%, Time left: 1.89, Est Total: 305.44, GFlops: 141171796.70, TFlops/GPU: 137.86
Step 1290, Rank 0, Elapsed 303.93, Work left 0.56%, Time left: 1.70, Est Total: 305.63, GFlops: 141081286.12, TFlops/GPU: 137.77
Step 1300, Rank 0, Elapsed 304.21, Work left 0.50%, Time left: 1.53, Est Total: 305.74, GFlops: 141031490.65, TFlops/GPU: 137.73
Step 1310, Rank 0, Elapsed 304.45, Work left 0.45%, Time left: 1.36, Est Total: 305.81, GFlops: 140999214.86, TFlops/GPU: 137.69
Step 1320, Rank 0, Elapsed 304.84, Work left 0.40%, Time left: 1.21, Est Total: 306.05, GFlops: 140888473.63, TFlops/GPU: 137.59
Step 1330, Rank 0, Elapsed 305.12, Work left 0.35%, Time left: 1.07, Est Total: 306.19, GFlops: 140821428.41, TFlops/GPU: 137.52
Step 1340, Rank 0, Elapsed 305.36, Work left 0.31%, Time left: 0.94, Est Total: 306.30, GFlops: 140774068.42, TFlops/GPU: 137.47
Step 1350, Rank 0, Elapsed 305.68, Work left 0.27%, Time left: 0.82, Est Total: 306.51, GFlops: 140677603.88, TFlops/GPU: 137.38
Step 1360, Rank 0, Elapsed 305.86, Work left 0.23%, Time left: 0.72, Est Total: 306.58, GFlops: 140644495.05, TFlops/GPU: 137.35
Step 1370, Rank 0, Elapsed 306.10, Work left 0.20%, Time left: 0.62, Est Total: 306.72, GFlops: 140582461.99, TFlops/GPU: 137.29
Step 1380, Rank 0, Elapsed 306.39, Work left 0.17%, Time left: 0.53, Est Total: 306.92, GFlops: 140488994.72, TFlops/GPU: 137.20
Step 1390, Rank 0, Elapsed 306.58, Work left 0.15%, Time left: 0.45, Est Total: 307.03, GFlops: 140438193.65, TFlops/GPU: 137.15
Step 1400, Rank 0, Elapsed 306.75, Work left 0.12%, Time left: 0.38, Est Total: 307.13, GFlops: 140393825.08, TFlops/GPU: 137.10
Step 1410, Rank 0, Elapsed 307.00, Work left 0.10%, Time left: 0.31, Est Total: 307.31, GFlops: 140309193.43, TFlops/GPU: 137.02
Step 1420, Rank 0, Elapsed 307.14, Work left 0.08%, Time left: 0.26, Est Total: 307.39, GFlops: 140271859.43, TFlops/GPU: 136.98
Step 1430, Rank 0, Elapsed 307.31, Work left 0.07%, Time left: 0.21, Est Total: 307.52, GFlops: 140215444.68, TFlops/GPU: 136.93
Step 1440, Rank 0, Elapsed 307.46, Work left 0.05%, Time left: 0.17, Est Total: 307.63, GFlops: 140164201.61, TFlops/GPU: 136.88
Step 1450, Rank 0, Elapsed 307.63, Work left 0.04%, Time left: 0.13, Est Total: 307.76, GFlops: 140105788.63, TFlops/GPU: 136.82
Step 1460, Rank 0, Elapsed 307.75, Work left 0.03%, Time left: 0.10, Est Total: 307.86, GFlops: 140061931.47, TFlops/GPU: 136.78
Step 1470, Rank 0, Elapsed 307.86, Work left 0.02%, Time left: 0.08, Est Total: 307.93, GFlops: 140026884.42, TFlops/GPU: 136.75
Step 1480, Rank 0, Elapsed 308.00, Work left 0.02%, Time left: 0.05, Est Total: 308.06, GFlops: 139969577.12, TFlops/GPU: 136.69
Step 1490, Rank 0, Elapsed 308.11, Work left 0.01%, Time left: 0.04, Est Total: 308.15, GFlops: 139929101.58, TFlops/GPU: 136.65
Step 1500, Rank 0, Elapsed 308.21, Work left 0.01%, Time left: 0.03, Est Total: 308.23, GFlops: 139890339.15, TFlops/GPU: 136.61
Step 1510, Rank 0, Elapsed 308.34, Work left 0.01%, Time left: 0.02, Est Total: 308.35, GFlops: 139836613.55, TFlops/GPU: 136.56
Step 1520, Rank 0, Elapsed 308.43, Work left 0.00%, Time left: 0.01, Est Total: 308.44, GFlops: 139798441.15, TFlops/GPU: 136.52
Step 1530, Rank 0, Elapsed 308.52, Work left 0.00%, Time left: 0.00, Est Total: 308.53, GFlops: 139756324.91, TFlops/GPU: 136.48
Step 1540, Rank 0, Elapsed 308.64, Work left 0.00%, Time left: 0.00, Est Total: 308.64, GFlops: 139706058.14, TFlops/GPU: 136.43
Step 1550, Rank 0, Elapsed 308.72, Work left 0.00%, Time left: 0.00, Est Total: 308.72, GFlops: 139670934.41, TFlops/GPU: 136.40
Step 1560, Rank 0, Elapsed 308.80, Work left 0.00%, Time left: 0.00, Est Total: 308.80, GFlops: 139632822.28, TFlops/GPU: 136.36
LU factorization 308.86 sec

IR Start with GPU
# iterative refinement: step=0, residual=8.778080e-04, hpl-harness=1311160.547096
# iterative refinement: step=1, residual=4.172508e-07, hpl-harness=623.022736
# iterative refinement: step=2, residual=1.682939e-10, hpl-harness=0.251290
IR Time 5.94 sec
#END___: Mon Mar 27 04:36:14 2023

314.822 sec. 136962679.949 GFlop/s resid = 1.682938764204205e-10 hpl-harness = 0.251289911 TFlop/s per GPU = 133.753
--------------------------------------------------------------------------------