├── .clang-format
├── .github
│   └── CODEOWNERS
├── .gitignore
├── CMakeLists.txt
├── LICENSE
├── README.md
├── cmake
│   └── Dependencies.cmake
├── include
│   ├── hpl.hpp
│   ├── hpl_auxil.hpp
│   ├── hpl_blas.hpp
│   ├── hpl_comm.hpp
│   ├── hpl_grid.hpp
│   ├── hpl_hip_ex.hpp
│   ├── hpl_misc.hpp
│   ├── hpl_panel.hpp
│   ├── hpl_pauxil.hpp
│   ├── hpl_pfact.hpp
│   ├── hpl_pgesv.hpp
│   ├── hpl_pmatgen.hpp
│   ├── hpl_pmisc.hpp
│   ├── hpl_ptest.hpp
│   ├── hpl_ptimer.hpp
│   └── hpl_version.hpp.in
├── install.sh
├── scripts
│   ├── HPL.dat
│   ├── mpirun_rochpl.in
│   └── run_rochpl.in
└── src
    ├── HPL_InitGPU.cpp
    ├── HPL_pddriver.cpp
    ├── HPL_pdinfo.cpp
    ├── HPL_pdtest.cpp
    ├── HPL_pdwarmup.cpp
    ├── auxil
    │   ├── HPL_abort.cpp
    │   ├── HPL_dlacpy_device.cpp
    │   ├── HPL_dlamch.cpp
    │   ├── HPL_dlaprnt.cpp
    │   ├── HPL_dlatcpy_device.cpp
    │   ├── HPL_fprintf.cpp
    │   └── HPL_warn.cpp
    ├── comm
    │   ├── HPL_all_reduce.cpp
    │   ├── HPL_all_reduce_dmxswp.cpp
    │   ├── HPL_allgatherv.cpp
    │   ├── HPL_barrier.cpp
    │   ├── HPL_bcast.cpp
    │   ├── HPL_bcast_1rinM.cpp
    │   ├── HPL_bcast_1ring.cpp
    │   ├── HPL_bcast_2rinM.cpp
    │   ├── HPL_bcast_2ring.cpp
    │   ├── HPL_bcast_blonM.cpp
    │   ├── HPL_bcast_blong.cpp
    │   ├── HPL_broadcast.cpp
    │   ├── HPL_recv.cpp
    │   ├── HPL_reduce.cpp
    │   ├── HPL_scatterv.cpp
    │   ├── HPL_sdrv.cpp
    │   └── HPL_send.cpp
    ├── grid
    │   ├── HPL_grid_exit.cpp
    │   ├── HPL_grid_info.cpp
    │   └── HPL_grid_init.cpp
    ├── matgen
    │   ├── HPL_pdmatgen.cpp
    │   ├── HPL_pdrandmat_device.cpp
    │   └── HPL_xjumpm.cpp
    ├── panel
    │   ├── HPL_pdpanel_bcast.cpp
    │   ├── HPL_pdpanel_free.cpp
    │   ├── HPL_pdpanel_init.cpp
    │   ├── HPL_pdpanel_new.cpp
    │   ├── HPL_pdpanel_swapids.cpp
    │   └── HPL_pdpanel_wait.cpp
    ├── pauxil
    │   ├── HPL_dlaswp00N_device.cpp
    │   ├── HPL_dlaswp01T_device.cpp
    │   ├── HPL_dlaswp02T_device.cpp
    │   ├── HPL_dlaswp03T_device.cpp
    │   ├── HPL_dlaswp04T_device.cpp
    │   ├── HPL_dlaswp10N_device.cpp
    │   ├── HPL_indxg2l.cpp
    │   ├── HPL_indxg2lp.cpp
    │   ├── HPL_indxg2p.cpp
    │   ├── HPL_indxl2g.cpp
    │   ├── HPL_infog2l.cpp
    │   ├── HPL_numroc.cpp
    │   ├── HPL_numrocI.cpp
    │   ├── HPL_pabort.cpp
    │   ├── HPL_pdlamch.cpp
    │   ├── HPL_pdlange_device.cpp
    │   └── HPL_pwarn.cpp
    ├── pfact
    │   ├── HPL_pdfact.cpp
    │   ├── HPL_pdmxswp.cpp
    │   ├── HPL_pdpancrN_device.cpp
    │   ├── HPL_pdpancrT_device.cpp
    │   ├── HPL_pdpanllN_device.cpp
    │   ├── HPL_pdpanllT_device.cpp
    │   ├── HPL_pdpanrlN_device.cpp
    │   ├── HPL_pdpanrlT_device.cpp
    │   ├── HPL_pdrpancrN.cpp
    │   ├── HPL_pdrpancrT.cpp
    │   ├── HPL_pdrpanllN.cpp
    │   ├── HPL_pdrpanllT.cpp
    │   ├── HPL_pdrpanrlN.cpp
    │   └── HPL_pdrpanrlT.cpp
    ├── pgesv
    │   ├── HPL_pdgesv.cpp
    │   ├── HPL_pdlaswp.cpp
    │   ├── HPL_pdtrsv_device.cpp
    │   ├── HPL_pdupdateNT.cpp
    │   ├── HPL_pdupdateTT.cpp
    │   ├── HPL_perm.cpp
    │   ├── HPL_pipid.cpp
    │   ├── HPL_piplen.cpp
    │   └── HPL_plindx.cpp
    └── timer
        ├── HPL_ptimer.cpp
        ├── HPL_ptimer_cputime.cpp
        └── HPL_ptimer_walltime.cpp
/.clang-format:
--------------------------------------------------------------------------------
1 | ---
2 | Language: Cpp
3 | AccessModifierOffset: 0
4 | AlignAfterOpenBracket: Align
5 | AlignConsecutiveAssignments: true
6 | AlignConsecutiveDeclarations: true
7 | AlignEscapedNewlinesLeft: true
8 | AlignOperands: true
9 | AlignTrailingComments: true
10 | AllowAllParametersOfDeclarationOnNextLine: false
11 | AllowShortBlocksOnASingleLine: true
12 | AllowShortCaseLabelsOnASingleLine: true
13 | AllowShortFunctionsOnASingleLine: true
14 | AllowShortIfStatementsOnASingleLine: true
15 | AllowShortLoopsOnASingleLine: true
16 | AlwaysBreakAfterDefinitionReturnType: None
17 | AlwaysBreakAfterReturnType: None
18 | AlwaysBreakBeforeMultilineStrings: false
19 | AlwaysBreakTemplateDeclarations: true
20 | BinPackArguments: false
21 | BinPackParameters: false
22 | BraceWrapping:
23 | AfterClass: false
24 | AfterControlStatement: false
25 | AfterEnum: false
26 | AfterFunction: false
27 | AfterNamespace: false
28 | AfterObjCDeclaration:
false 29 | AfterStruct: false 30 | AfterUnion: false 31 | BeforeCatch: false 32 | BeforeElse: false 33 | IndentBraces: false 34 | BreakBeforeBinaryOperators: None 35 | BreakBeforeBraces: Custom 36 | BreakBeforeTernaryOperators: true 37 | BreakConstructorInitializersBeforeComma: false 38 | ColumnLimit: 80 39 | CommentPragmas: '^ IWYU pragma:' 40 | ConstructorInitializerAllOnOneLineOrOnePerLine: true 41 | ConstructorInitializerIndentWidth: 4 42 | ContinuationIndentWidth: 4 43 | Cpp11BracedListStyle: true 44 | DerivePointerAlignment: false 45 | DisableFormat: false 46 | ExperimentalAutoDetectBinPacking: false 47 | ForEachMacros: [ foreach, Q_FOREACH, BOOST_FOREACH ] 48 | IncludeCategories: 49 | - Regex: '^"(llvm|llvm-c|clang|clang-c)/' 50 | Priority: 2 51 | - Regex: '^(<|"(gtest|isl|json)/)' 52 | Priority: 3 53 | - Regex: '.*' 54 | Priority: 1 55 | IndentCaseLabels: true 56 | IndentWidth: 2 57 | IndentWrappedFunctionNames: false 58 | KeepEmptyLinesAtTheStartOfBlocks: true 59 | MacroBlockBegin: '' 60 | MacroBlockEnd: '' 61 | MaxEmptyLinesToKeep: 1 62 | NamespaceIndentation: None 63 | ObjCBlockIndentWidth: 2 64 | ObjCSpaceAfterProperty: false 65 | ObjCSpaceBeforeProtocolList: true 66 | PenaltyBreakBeforeFirstCallParameter: 19 67 | PenaltyBreakComment: 300 68 | PenaltyBreakFirstLessLess: 120 69 | PenaltyBreakString: 1000 70 | PenaltyExcessCharacter: 1000000 71 | PenaltyReturnTypeOnItsOwnLine: 1000 72 | PointerAlignment: Left 73 | ReflowComments: true 74 | SortIncludes: false 75 | SpaceAfterCStyleCast: false 76 | # SpaceAfterTemplateKeyword: true 77 | SpaceBeforeAssignmentOperators: true 78 | SpaceBeforeParens: Never 79 | SpaceInEmptyParentheses: false 80 | SpacesBeforeTrailingComments: 1 81 | SpacesInAngles: false 82 | SpacesInContainerLiterals: true 83 | SpacesInCStyleCastParentheses: false 84 | SpacesInParentheses: false 85 | SpacesInSquareBrackets: false 86 | Standard: Cpp11 87 | TabWidth: 2 88 | UseTab: Never 89 | ... 90 | 91 | -------------------------------------------------------------------------------- /.github/CODEOWNERS: -------------------------------------------------------------------------------- 1 | * @noelchalmers @pbauman 2 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Compiled Object files 2 | *.slo 3 | *.lo 4 | *.o 5 | *.obj 6 | 7 | # Precompiled Headers 8 | *.gch 9 | *.pch 10 | 11 | # Compiled Dynamic libraries 12 | *.so 13 | *.dylib 14 | *.dll 15 | 16 | # Fortran module files 17 | *.mod 18 | 19 | # Compiled Static libraries 20 | *.lai 21 | *.la 22 | *.a 23 | *.lib 24 | 25 | # Executables 26 | *.exe 27 | *.out 28 | *.app 29 | 30 | # vim tags 31 | tags 32 | .tags 33 | .*.swp 34 | 35 | # Editors 36 | .vscode 37 | 38 | # build-in-source directory 39 | build 40 | 41 | # doc directory 42 | docBin 43 | _build 44 | 45 | #third-party software 46 | tpl/ 47 | ltmain.sh -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | ====================================================================== 2 | -- High Performance Computing Linpack Benchmark (HPL) 3 | HPL - 2.2 - February 24, 2016 4 | Antoine P. 
Petitet 5 | University of Tennessee, Knoxville 6 | Innovative Computing Laboratory 7 | (C) Copyright 2000-2008 All Rights Reserved 8 | 9 | -- Copyright notice and Licensing terms: 10 | 11 | Redistribution and use in source and binary forms, with or without 12 | modification, are permitted provided that the following conditions 13 | are met: 14 | 15 | 1. Redistributions of source code must retain the above copyright 16 | notice, this list of conditions and the following disclaimer. 17 | 18 | 2. Redistributions in binary form must reproduce the above copyright 19 | notice, this list of conditions, and the following disclaimer in the 20 | documentation and/or other materials provided with the distribution. 21 | 22 | 3. All advertising materials mentioning features or use of this 23 | software must display the following acknowledgement: 24 | This product includes software developed at the University of 25 | Tennessee, Knoxville, Innovative Computing Laboratory. 26 | 27 | 4. The name of the University, the name of the Laboratory, or the 28 | names of its contributors may not be used to endorse or promote 29 | products derived from this software without specific written 30 | permission. 31 | 32 | -- Disclaimer: 33 | 34 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 35 | ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 36 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 37 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY 38 | OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 39 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 40 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 41 | DATA OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 42 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 43 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 44 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 45 | ====================================================================== 46 | 47 | Modifications (c) 2018-2025 Advanced Micro Devices, Inc. 48 | Modified by: Noel Chalmers 49 | 50 | Redistribution and use in source and binary forms, with or without modification, 51 | are permitted provided that the following conditions are met: 52 | 53 | 1. Redistributions of source code must retain the above copyright notice, this 54 | list of conditions and the following disclaimer. 55 | 2. Redistributions in binary form must reproduce the above copyright notice, 56 | this list of conditions and the following disclaimer in the documentation 57 | and/or other materials provided with the distribution. 58 | 3. Neither the name of the copyright holder nor the names of its contributors 59 | may be used to endorse or promote products derived from this software without 60 | specific prior written permission. 61 | 62 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 63 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 64 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
65 | IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, 66 | INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 67 | BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, 68 | OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, 69 | WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 70 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 71 | POSSIBILITY OF SUCH DAMAGE. 72 | -------------------------------------------------------------------------------- /cmake/Dependencies.cmake: -------------------------------------------------------------------------------- 1 | # Modifications (c) 2019-2022 Advanced Micro Devices, Inc. 2 | # 3 | # Redistribution and use in source and binary forms, with or without modification, 4 | # are permitted provided that the following conditions are met: 5 | # 6 | # 1. Redistributions of source code must retain the above copyright notice, this 7 | # list of conditions and the following disclaimer. 8 | # 2. Redistributions in binary form must reproduce the above copyright notice, 9 | # this list of conditions and the following disclaimer in the documentation 10 | # and/or other materials provided with the distribution. 11 | # 3. Neither the name of the copyright holder nor the names of its contributors 12 | # may be used to endorse or promote products derived from this software without 13 | # specific prior written permission. 14 | # 15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 16 | # ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 17 | # WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 18 | # IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, 19 | # INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 20 | # BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, 21 | # OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, 22 | # WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 23 | # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 24 | # POSSIBILITY OF SUCH DAMAGE. 
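# A minimal configure sketch for the dependency lookup below; the paths are
# illustrative assumptions, not requirements. ROCM_PATH, ROCBLAS_PATH,
# HPL_MPI_DIR, and HPL_TRACING are the cache variables this file reads:
#
#   cmake -DROCM_PATH=/opt/rocm -DROCBLAS_PATH=/opt/rocm \
#         -DHPL_MPI_DIR=/usr/lib/openmpi -DHPL_TRACING=OFF ..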
25 | 26 | # Dependencies 27 | 28 | # Git 29 | find_package(Git REQUIRED) 30 | 31 | # Find OpenMP package 32 | find_package(OpenMP REQUIRED) 33 | 34 | # MPI 35 | set(MPI_HOME ${HPL_MPI_DIR}) 36 | find_package(MPI REQUIRED) 37 | 38 | # Add some paths 39 | list(APPEND CMAKE_PREFIX_PATH ${ROCBLAS_PATH} ${ROCM_PATH} ) 40 | list(APPEND CMAKE_MODULE_PATH ${ROCM_PATH}/lib/cmake/hip ) 41 | 42 | # Find HIP package 43 | find_package(HIP REQUIRED) 44 | 45 | # rocblas 46 | find_package(rocblas REQUIRED) 47 | 48 | get_target_property(rocblas_LIBRARIES roc::rocblas IMPORTED_LOCATION_RELEASE) 49 | 50 | message("-- rocBLAS version: ${rocblas_VERSION}") 51 | message("-- rocBLAS include dirs: ${rocblas_INCLUDE_DIRS}") 52 | message("-- rocBLAS libraries: ${rocblas_LIBRARIES}") 53 | 54 | get_filename_component(ROCBLAS_LIB_PATH ${rocblas_LIBRARIES} DIRECTORY) 55 | 56 | if(HPL_TRACING) 57 | find_library(ROCTRACER NAMES roctracer64 58 | PATHS ${ROCM_PATH}/lib 59 | NO_DEFAULT_PATH) 60 | find_library(ROCTX NAMES roctx64 61 | PATHS ${ROCM_PATH}/lib 62 | NO_DEFAULT_PATH) 63 | 64 | message("-- roctracer: ${ROCTRACER}") 65 | message("-- roctx: ${ROCTX}") 66 | 67 | add_library(roc::roctracer SHARED IMPORTED) 68 | set_target_properties(roc::roctracer PROPERTIES 69 | INTERFACE_INCLUDE_DIRECTORIES "${ROCM_PATH}/include" 70 | INTERFACE_LINK_LIBRARIES "hip::host" 71 | IMPORTED_LOCATION "${ROCTRACER}" 72 | IMPORTED_SONAME "libroctracer.so") 73 | add_library(roc::roctx SHARED IMPORTED) 74 | set_target_properties(roc::roctx PROPERTIES 75 | INTERFACE_INCLUDE_DIRECTORIES "${ROCM_PATH}/include" 76 | INTERFACE_LINK_LIBRARIES "hip::host" 77 | IMPORTED_LOCATION "${ROCTX}" 78 | IMPORTED_SONAME "libroctx64.so") 79 | endif() 80 | 81 | # ROCm cmake package 82 | find_package(ROCmCMakeBuildTools QUIET CONFIG PATHS ${CMAKE_PREFIX_PATH}) 83 | if(NOT ROCM_FOUND) 84 | set(PROJECT_EXTERN_DIR ${CMAKE_CURRENT_BINARY_DIR}/extern) 85 | set(rocm_cmake_tag "master" CACHE STRING "rocm-cmake tag to download") 86 | file(DOWNLOAD https://github.com/RadeonOpenCompute/rocm-cmake/archive/${rocm_cmake_tag}.zip 87 | ${PROJECT_EXTERN_DIR}/rocm-cmake-${rocm_cmake_tag}.zip STATUS status LOG log) 88 | 89 | list(GET status 0 status_code) 90 | list(GET status 1 status_string) 91 | 92 | if(NOT status_code EQUAL 0) 93 | message(FATAL_ERROR "error: downloading 94 | 'https://github.com/RadeonOpenCompute/rocm-cmake/archive/${rocm_cmake_tag}.zip' failed 95 | status_code: ${status_code} 96 | status_string: ${status_string} 97 | log: ${log} 98 | ") 99 | endif() 100 | 101 | execute_process(COMMAND ${CMAKE_COMMAND} -E tar xzf ${PROJECT_EXTERN_DIR}/rocm-cmake-${rocm_cmake_tag}.zip 102 | WORKING_DIRECTORY ${PROJECT_EXTERN_DIR}) 103 | 104 | find_package(ROCmCMakeBuildTools REQUIRED CONFIG PATHS ${PROJECT_EXTERN_DIR}/rocm-cmake-${rocm_cmake_tag}) 105 | endif() 106 | 107 | include(ROCMSetupVersion) 108 | include(ROCMCreatePackage) 109 | include(ROCMInstallTargets) 110 | include(ROCMPackageConfigHelpers) 111 | include(ROCMInstallSymlinks) 112 | include(ROCMCheckTargetIds OPTIONAL) 113 | -------------------------------------------------------------------------------- /include/hpl.hpp: -------------------------------------------------------------------------------- 1 | /* --------------------------------------------------------------------- 2 | * -- High Performance Computing Linpack Benchmark (HPL) 3 | * HPL - 2.2 - February 24, 2016 4 | * Antoine P. 
Petitet
5 | * University of Tennessee, Knoxville
6 | * Innovative Computing Laboratory
7 | * (C) Copyright 2000-2008 All Rights Reserved
8 | *
9 | * Modified by: Noel Chalmers
10 | * (C) 2018-2025 Advanced Micro Devices, Inc.
11 | * See the rocHPL/LICENCE file for details.
12 | *
13 | * SPDX-License-Identifier: (BSD-3-Clause)
14 | * ---------------------------------------------------------------------
15 | */
16 | #ifndef HPL_HPP
17 | #define HPL_HPP
18 | /*
19 | * ---------------------------------------------------------------------
20 | * HPL default compile options that can be overridden in the CMake configuration
21 | * ---------------------------------------------------------------------
22 | */
23 | #ifndef HPL_DETAILED_TIMING /* Do not enable detailed timings */
24 | #define HPL_NO_DETAILED_TIMING
25 | #endif
26 |
27 | #undef HPL_USE_COLLECTIVES
28 | // #define HPL_USE_COLLECTIVES
29 |
30 | /*
31 | Enabling atomics will potentially allow more performance optimization,
32 | but may also lead to residual values which vary from run to run
33 | */
34 | #undef HPL_ROCBLAS_ALLOW_ATOMICS
35 | // #define HPL_ROCBLAS_ALLOW_ATOMICS
36 |
37 | /*
38 | * ---------------------------------------------------------------------
39 | * Include files
40 | * ---------------------------------------------------------------------
41 | */
42 | #include
43 |
44 | // NC: hipcc in ROCm 3.7 complains if __HIP_PLATFORM_HCC__ is defined in the
45 | // compile line
46 | #ifdef __HIPCC__
47 | #ifdef __HIP_PLATFORM_HCC__
48 | #undef __HIP_PLATFORM_HCC__
49 | #endif
50 | #endif
51 | #include "hip/hip_runtime_api.h"
52 |
53 | #ifdef HPL_TRACING
54 | #include
55 | #include
56 | #endif
57 |
58 | #include "hpl_version.hpp"
59 | #include "hpl_misc.hpp"
60 | #include "hpl_blas.hpp"
61 | #include "hpl_auxil.hpp"
62 |
63 | #include "hpl_pmisc.hpp"
64 | #include "hpl_pauxil.hpp"
65 | #include "hpl_panel.hpp"
66 | #include "hpl_pfact.hpp"
67 | #include "hpl_pgesv.hpp"
68 |
69 | #include "hpl_ptimer.hpp"
70 | #include "hpl_pmatgen.hpp"
71 | #include "hpl_ptest.hpp"
72 |
73 | #endif
74 | /*
75 | * End of hpl.hpp
76 | */
77 |
--------------------------------------------------------------------------------
/include/hpl_auxil.hpp:
--------------------------------------------------------------------------------
1 | /* ---------------------------------------------------------------------
2 | * -- High Performance Computing Linpack Benchmark (HPL)
3 | * HPL - 2.2 - February 24, 2016
4 | * Antoine P. Petitet
5 | * University of Tennessee, Knoxville
6 | * Innovative Computing Laboratory
7 | * (C) Copyright 2000-2008 All Rights Reserved
8 | *
9 | * Modified by: Noel Chalmers
10 | * (C) 2018-2025 Advanced Micro Devices, Inc.
11 | * See the rocHPL/LICENCE file for details.
12 | * 13 | * SPDX-License-Identifier: (BSD-3-Clause) 14 | * --------------------------------------------------------------------- 15 | */ 16 | #ifndef HPL_AUXIL_HPP 17 | #define HPL_AUXIL_HPP 18 | /* 19 | * --------------------------------------------------------------------- 20 | * Include files 21 | * --------------------------------------------------------------------- 22 | */ 23 | #include "hpl_misc.hpp" 24 | #include "hpl_blas.hpp" 25 | /* 26 | * --------------------------------------------------------------------- 27 | * typedef definitions 28 | * --------------------------------------------------------------------- 29 | */ 30 | typedef enum { 31 | HPL_NORM_A = 800, 32 | HPL_NORM_1 = 801, 33 | HPL_NORM_I = 802 34 | } HPL_T_NORM; 35 | 36 | typedef enum { 37 | HPL_MACH_EPS = 900, /* relative machine precision */ 38 | HPL_MACH_SFMIN = 901, /* safe minimum st 1/sfmin does not overflow */ 39 | HPL_MACH_BASE = 902, /* base = base of the machine */ 40 | HPL_MACH_PREC = 903, /* prec = eps*base */ 41 | HPL_MACH_MLEN = 904, /* number of (base) digits in the mantissa */ 42 | HPL_MACH_RND = 905, /* 1.0 if rounding occurs in addition */ 43 | HPL_MACH_EMIN = 906, /* min exponent before (gradual) underflow */ 44 | HPL_MACH_RMIN = 907, /* underflow threshold base**(emin-1) */ 45 | HPL_MACH_EMAX = 908, /* largest exponent before overflow */ 46 | HPL_MACH_RMAX = 909 /* overflow threshold - (base**emax)*(1-eps) */ 47 | 48 | } HPL_T_MACH; 49 | /* 50 | * --------------------------------------------------------------------- 51 | * Function prototypes 52 | * --------------------------------------------------------------------- 53 | */ 54 | void HPL_fprintf(FILE*, const char*, ...); 55 | void HPL_warn(FILE*, int, const char*, const char*, ...); 56 | void HPL_abort(int, const char*, const char*, ...); 57 | 58 | void HPL_dlacpy(const int, 59 | const int, 60 | const double*, 61 | const int, 62 | double*, 63 | const int); 64 | 65 | void HPL_dlatcpy(const int, 66 | const int, 67 | const double*, 68 | const int, 69 | double*, 70 | const int); 71 | 72 | double HPL_dlamch(const HPL_T_MACH); 73 | 74 | #endif 75 | /* 76 | * End of hpl_auxil.hpp 77 | */ 78 | -------------------------------------------------------------------------------- /include/hpl_blas.hpp: -------------------------------------------------------------------------------- 1 | /* --------------------------------------------------------------------- 2 | * -- High Performance Computing Linpack Benchmark (HPL) 3 | * HPL - 2.2 - February 24, 2016 4 | * Antoine P. Petitet 5 | * University of Tennessee, Knoxville 6 | * Innovative Computing Laboratory 7 | * (C) Copyright 2000-2008 All Rights Reserved 8 | * 9 | * Modified by: Noel Chalmers 10 | * (C) 2018-2025 Advanced Micro Devices, Inc. 11 | * See the rocHPL/LICENCE file for details. 
12 | * 13 | * SPDX-License-Identifier: (BSD-3-Clause) 14 | * --------------------------------------------------------------------- 15 | */ 16 | #ifndef HPL_BLAS_HPP 17 | #define HPL_BLAS_HPP 18 | /* 19 | * --------------------------------------------------------------------- 20 | * Include files 21 | * --------------------------------------------------------------------- 22 | */ 23 | 24 | #include "hpl_misc.hpp" 25 | #include 26 | #include 27 | 28 | extern rocblas_handle handle; 29 | extern hipStream_t computeStream; 30 | extern hipStream_t dataStream; 31 | 32 | #define CHECK_HIP_ERROR(val) hipCheck((val), #val, __FILE__, __LINE__) 33 | inline void hipCheck(hipError_t err, 34 | const char* const func, 35 | const char* const file, 36 | const int line) { 37 | if(err != hipSuccess) { 38 | std::cerr << "HIP Runtime Error at: " << file << ":" << line << std::endl; 39 | std::cerr << hipGetErrorString(err) << " " << func << std::endl; 40 | std::exit(-1); 41 | } 42 | } 43 | 44 | #define CHECK_ROCBLAS_ERROR(val) rocBLASCheck((val), #val, __FILE__, __LINE__) 45 | inline void rocBLASCheck(rocblas_status err, 46 | const char* const func, 47 | const char* const file, 48 | const int line) { 49 | if(err != rocblas_status_success) { 50 | std::cerr << "rocBLAS Reports Error at: " << file << ":" << line 51 | << std::endl; 52 | std::cerr << rocblas_status_to_string(err) << " " << func << std::endl; 53 | std::exit(-1); 54 | } 55 | } 56 | 57 | #endif 58 | /* 59 | * hpl_blas.hpp 60 | */ 61 | -------------------------------------------------------------------------------- /include/hpl_comm.hpp: -------------------------------------------------------------------------------- 1 | /* --------------------------------------------------------------------- 2 | * -- High Performance Computing Linpack Benchmark (HPL) 3 | * HPL - 2.2 - February 24, 2016 4 | * Antoine P. Petitet 5 | * University of Tennessee, Knoxville 6 | * Innovative Computing Laboratory 7 | * (C) Copyright 2000-2008 All Rights Reserved 8 | * 9 | * Modified by: Noel Chalmers 10 | * (C) 2018-2025 Advanced Micro Devices, Inc. 11 | * See the rocHPL/LICENCE file for details. 
12 | * 13 | * SPDX-License-Identifier: (BSD-3-Clause) 14 | * --------------------------------------------------------------------- 15 | */ 16 | #ifndef HPL_COMM_HPP 17 | #define HPL_COMM_HPP 18 | /* 19 | * --------------------------------------------------------------------- 20 | * Include files 21 | * --------------------------------------------------------------------- 22 | */ 23 | #include "hpl_pmisc.hpp" 24 | #include "hpl_panel.hpp" 25 | 26 | /* 27 | * --------------------------------------------------------------------- 28 | * #typedefs and data structures 29 | * --------------------------------------------------------------------- 30 | */ 31 | typedef enum { 32 | HPL_1RING = 401, /* Unidirectional ring */ 33 | HPL_1RING_M = 402, /* Unidirectional ring (modified) */ 34 | HPL_2RING = 403, /* Bidirectional ring */ 35 | HPL_2RING_M = 404, /* Bidirectional ring (modified) */ 36 | HPL_BLONG = 405, /* long broadcast */ 37 | HPL_BLONG_M = 406, /* long broadcast (modified) */ 38 | } HPL_T_TOP; 39 | 40 | typedef MPI_Op HPL_T_OP; 41 | 42 | #define HPL_SUM MPI_SUM 43 | #define HPL_MAX MPI_MAX 44 | #define HPL_MIN MPI_MIN 45 | 46 | extern MPI_Op HPL_DMXSWP; 47 | extern MPI_Datatype PDFACT_ROW; 48 | /* 49 | * --------------------------------------------------------------------- 50 | * #define macro constants 51 | * --------------------------------------------------------------------- 52 | */ 53 | #define HPL_FAILURE 0 54 | #define HPL_SUCCESS 1 55 | /* 56 | * --------------------------------------------------------------------- 57 | * comm function prototypes 58 | * --------------------------------------------------------------------- 59 | */ 60 | int HPL_send(double*, int, int, int, MPI_Comm); 61 | int HPL_recv(double*, int, int, int, MPI_Comm); 62 | int HPL_sdrv(double*, int, int, double*, int, int, int, MPI_Comm); 63 | int HPL_bcast(double*, int, int, MPI_Comm, HPL_T_TOP top); 64 | int HPL_bcast_1ring(double* SBUF, int SCOUNT, int ROOT, MPI_Comm COMM); 65 | int HPL_bcast_1rinM(double* SBUF, int SCOUNT, int ROOT, MPI_Comm COMM); 66 | int HPL_bcast_2ring(double* SBUF, int SCOUNT, int ROOT, MPI_Comm COMM); 67 | int HPL_bcast_2rinM(double* SBUF, int SCOUNT, int ROOT, MPI_Comm COMM); 68 | int HPL_bcast_blong(double* SBUF, int SCOUNT, int ROOT, MPI_Comm COMM); 69 | int HPL_bcast_blonM(double* SBUF, int SCOUNT, int ROOT, MPI_Comm COMM); 70 | int HPL_scatterv(double*, const int*, const int*, const int, int, MPI_Comm); 71 | int HPL_allgatherv(double*, const int, const int*, const int*, MPI_Comm); 72 | int HPL_barrier(MPI_Comm); 73 | int HPL_broadcast(void*, const int, const HPL_T_TYPE, const int, MPI_Comm); 74 | 75 | int HPL_reduce(void*, 76 | const int, 77 | const HPL_T_TYPE, 78 | const HPL_T_OP, 79 | const int, 80 | MPI_Comm); 81 | 82 | int HPL_all_reduce(void*, 83 | const int, 84 | const HPL_T_TYPE, 85 | const HPL_T_OP, 86 | MPI_Comm); 87 | 88 | void HPL_dmxswp(void*, void*, int*, MPI_Datatype*); 89 | void HPL_all_reduce_dmxswp(double*, const int, const int, MPI_Comm, double*); 90 | 91 | #endif 92 | /* 93 | * End of hpl_comm.hpp 94 | */ 95 | -------------------------------------------------------------------------------- /include/hpl_grid.hpp: -------------------------------------------------------------------------------- 1 | /* --------------------------------------------------------------------- 2 | * -- High Performance Computing Linpack Benchmark (HPL) 3 | * HPL - 2.2 - February 24, 2016 4 | * Antoine P. 
Petitet 5 | * University of Tennessee, Knoxville 6 | * Innovative Computing Laboratory 7 | * (C) Copyright 2000-2008 All Rights Reserved 8 | * 9 | * Modified by: Noel Chalmers 10 | * (C) 2018-2025 Advanced Micro Devices, Inc. 11 | * See the rocHPL/LICENCE file for details. 12 | * 13 | * SPDX-License-Identifier: (BSD-3-Clause) 14 | * --------------------------------------------------------------------- 15 | */ 16 | #ifndef HPL_GRID_H 17 | #define HPL_GRID_H 18 | /* 19 | * --------------------------------------------------------------------- 20 | * Include files 21 | * --------------------------------------------------------------------- 22 | */ 23 | #include "hpl_pmisc.hpp" 24 | 25 | /* 26 | * --------------------------------------------------------------------- 27 | * #typedefs and data structures 28 | * --------------------------------------------------------------------- 29 | */ 30 | typedef enum { HPL_INT = 100, HPL_DOUBLE = 101 } HPL_T_TYPE; 31 | 32 | typedef enum { HPL_ROW_MAJOR = 201, HPL_COLUMN_MAJOR = 202 } HPL_T_ORDER; 33 | 34 | typedef struct HPL_S_grid { 35 | MPI_Comm all_comm; /* grid communicator */ 36 | MPI_Comm row_comm; /* row communicator */ 37 | MPI_Comm col_comm; /* column communicator */ 38 | HPL_T_ORDER order; /* ordering of the procs in the grid */ 39 | int iam; /* my rank in the grid */ 40 | int myrow; /* my row number in the grid */ 41 | int mycol; /* my column number in the grid */ 42 | int nprow; /* the total # of rows in the grid */ 43 | int npcol; /* the total # of columns in the grid */ 44 | int local_myrow; /* my row number in the node-local grid */ 45 | int local_mycol; /* my column number in the node-local grid */ 46 | int local_nprow; /* the total # of rows in the node-local grid */ 47 | int local_npcol; /* the total # of columns in the node-local grid */ 48 | int nprocs; /* the total # of procs in the grid */ 49 | int row_ip2; /* largest power of two <= nprow */ 50 | int row_hdim; /* row_ip2 procs hypercube dimension */ 51 | int row_ip2m1; /* largest power of two <= nprow-1 */ 52 | int row_mask; /* row_ip2m1 procs hypercube mask */ 53 | int col_ip2; /* largest power of two <= npcol */ 54 | int col_hdim; /* col_ip2 procs hypercube dimension */ 55 | int col_ip2m1; /* largest power of two <= npcol-1 */ 56 | int col_mask; /* col_ip2m1 procs hypercube mask */ 57 | } HPL_T_grid; 58 | 59 | /* 60 | * --------------------------------------------------------------------- 61 | * #define macros definitions 62 | * --------------------------------------------------------------------- 63 | */ 64 | #define HPL_2_MPI_TYPE(typ) ((typ == HPL_INT ? MPI_INT : MPI_DOUBLE)) 65 | /* 66 | * The following macros perform common modulo operations; All functions 67 | * except MPosMod assume arguments are < d (i.e., arguments are themsel- 68 | * ves within modulo range). 69 | */ 70 | /* increment with mod */ 71 | #define MModInc(I, d) \ 72 | if(++(I) == (d)) (I) = 0 73 | /* decrement with mod */ 74 | #define MModDec(I, d) \ 75 | if(--(I) == -1) (I) = (d) - 1 76 | /* positive modulo */ 77 | #define MPosMod(I, d) ((I) - ((I) / (d)) * (d)) 78 | /* add two numbers */ 79 | #define MModAdd(I1, I2, d) \ 80 | (((I1) + (I2) < (d)) ? (I1) + (I2) : (I1) + (I2) - (d)) 81 | /* add 1 to # */ 82 | #define MModAdd1(I, d) (((I) != (d) - 1) ? (I) + 1 : 0) 83 | /* subtract two numbers */ 84 | #define MModSub(I1, I2, d) (((I1) < (I2)) ? (d) + (I1) - (I2) : (I1) - (I2)) 85 | /* sub 1 from # */ 86 | #define MModSub1(I, d) (((I) != 0) ? 
(I) - 1 : (d) - 1) 87 | /* 88 | * --------------------------------------------------------------------- 89 | * grid function prototypes 90 | * --------------------------------------------------------------------- 91 | */ 92 | int HPL_grid_init(MPI_Comm, 93 | const HPL_T_ORDER, 94 | const int, 95 | const int, 96 | const int, 97 | const int, 98 | HPL_T_grid*); 99 | 100 | int HPL_grid_exit(HPL_T_grid*); 101 | int HPL_grid_info(const HPL_T_grid*, int*, int*, int*, int*); 102 | 103 | #endif 104 | /* 105 | * End of hpl_grid.hpp 106 | */ 107 | -------------------------------------------------------------------------------- /include/hpl_misc.hpp: -------------------------------------------------------------------------------- 1 | /* --------------------------------------------------------------------- 2 | * -- High Performance Computing Linpack Benchmark (HPL) 3 | * HPL - 2.2 - February 24, 2016 4 | * Antoine P. Petitet 5 | * University of Tennessee, Knoxville 6 | * Innovative Computing Laboratory 7 | * (C) Copyright 2000-2008 All Rights Reserved 8 | * 9 | * Modified by: Noel Chalmers 10 | * (C) 2018-2025 Advanced Micro Devices, Inc. 11 | * See the rocHPL/LICENCE file for details. 12 | * 13 | * SPDX-License-Identifier: (BSD-3-Clause) 14 | * --------------------------------------------------------------------- 15 | */ 16 | #ifndef HPL_MISC_HPP 17 | #define HPL_MISC_HPP 18 | /* 19 | * --------------------------------------------------------------------- 20 | * Include files 21 | * --------------------------------------------------------------------- 22 | */ 23 | 24 | #include 25 | #include 26 | #include 27 | #include 28 | #include 29 | 30 | /* 31 | * --------------------------------------------------------------------- 32 | * #define macro constants 33 | * --------------------------------------------------------------------- 34 | */ 35 | #define HPL_rone 1.0 36 | #define HPL_rtwo 2.0 37 | #define HPL_rzero 0.0 38 | /* 39 | * --------------------------------------------------------------------- 40 | * #define macros definitions 41 | * --------------------------------------------------------------------- 42 | */ 43 | #define Mabs(a_) (((a_) < 0) ? -(a_) : (a_)) 44 | #define Mmin(a_, b_) (((a_) < (b_)) ? (a_) : (b_)) 45 | #define Mmax(a_, b_) (((a_) > (b_)) ? (a_) : (b_)) 46 | 47 | #define Mfloor(a, b) (((a) > 0) ? (((a) / (b))) : (-(((-(a)) + (b) - 1) / (b)))) 48 | #define Mceil(a, b) (((a) + (b) - 1) / (b)) 49 | #define Miceil(a, b) (((a) > 0) ? ((((a) + (b) - 1) / (b))) : (-((-(a)) / (b)))) 50 | 51 | #define Mupcase(C) (((C) > 96 && (C) < 123) ? (C) & 0xDF : (C)) 52 | #define Mlowcase(C) (((C) > 64 && (C) < 91) ? (C) | 32 : (C)) 53 | /* 54 | * Mptr returns a pointer to a_( i_, j_ ) for readability reasons and 55 | * also less silly errors ... 
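* For example, for a column-major array A with leading dimension lda,
* Mptr(A, i, j, lda) evaluates to A + (size_t)i + (size_t)j * (size_t)lda,
* i.e., the address of entry (i, j); the size_t casts keep the index
* arithmetic from overflowing a 32-bit int for large problem sizes.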
56 | */ 57 | #define Mptr(a_, i_, j_, lda_) \ 58 | ((a_) + (size_t)(i_) + (size_t)(j_) * (size_t)(lda_)) 59 | /* 60 | * Align pointer 61 | */ 62 | #define HPL_PTR(ptr_, al_) ((((size_t)(ptr_) + (al_) - 1) / (al_)) * (al_)) 63 | #endif 64 | 65 | #ifdef HPL_TRACING 66 | #define HPL_TracingPush(label) roctxRangePush(label) 67 | #define HPL_TracingPop(label) roctxRangePop() 68 | #else 69 | #define HPL_TracingPush(label) 70 | #define HPL_TracingPop(label) 71 | #endif 72 | /* 73 | * End of hpl_misc.hpp 74 | */ 75 | -------------------------------------------------------------------------------- /include/hpl_panel.hpp: -------------------------------------------------------------------------------- 1 | /* --------------------------------------------------------------------- 2 | * -- High Performance Computing Linpack Benchmark (HPL) 3 | * HPL - 2.2 - February 24, 2016 4 | * Antoine P. Petitet 5 | * University of Tennessee, Knoxville 6 | * Innovative Computing Laboratory 7 | * (C) Copyright 2000-2008 All Rights Reserved 8 | * 9 | * Modified by: Noel Chalmers 10 | * (C) 2018-2025 Advanced Micro Devices, Inc. 11 | * See the rocHPL/LICENCE file for details. 12 | * 13 | * SPDX-License-Identifier: (BSD-3-Clause) 14 | * --------------------------------------------------------------------- 15 | */ 16 | #ifndef HPL_PANEL_HPP 17 | #define HPL_PANEL_HPP 18 | /* 19 | * --------------------------------------------------------------------- 20 | * Include files 21 | * --------------------------------------------------------------------- 22 | */ 23 | #include "hpl_pmisc.hpp" 24 | #include "hpl_grid.hpp" 25 | 26 | /* 27 | * --------------------------------------------------------------------- 28 | * Data Structures 29 | * --------------------------------------------------------------------- 30 | */ 31 | typedef struct HPL_S_panel { 32 | struct HPL_S_grid* grid; /* ptr to the process grid */ 33 | struct HPL_S_palg* algo; /* ptr to the algo parameters */ 34 | struct HPL_S_pmat* pmat; /* ptr to the local array info */ 35 | double* A; /* ptr to trailing part of A */ 36 | double* A0; /* ptr to current panel of A */ 37 | double* L2; /* ptr to L */ 38 | double* L1; /* ptr to jb x jb upper block of A */ 39 | double* U0; /* ptr to U */ 40 | double* U1; /* ptr to U1 */ 41 | double* U2; /* ptr to U2 */ 42 | int* IWORK; /* integer workspace for swapping */ 43 | int* ipiv; 44 | int* dipiv; 45 | int nu0; 46 | int nu1; 47 | int nu2; 48 | int ldu0; 49 | int ldu1; 50 | int ldu2; 51 | int lda0; /* local leading dim of array A0 */ 52 | int ldl2; /* local leading dim of array L2 */ 53 | int len; /* length of the buffer to broadcast */ 54 | void* buffers[2]; /* buffers for panel bcast */ 55 | int counts[2]; /* counts for panel bcast */ 56 | MPI_Datatype dtypes[2]; /* data types for panel bcast */ 57 | MPI_Request request[1]; /* requests for panel bcast */ 58 | MPI_Status status[1]; /* status for panel bcast */ 59 | int nb; /* distribution blocking factor */ 60 | int jb; /* panel width */ 61 | int m; /* global # of rows of trailing part of A */ 62 | int n; /* global # of cols of trailing part of A */ 63 | int ia; /* global row index of trailing part of A */ 64 | int ja; /* global col index of trailing part of A */ 65 | int mp; /* local # of rows of trailing part of A */ 66 | int nq; /* local # of cols of trailing part of A */ 67 | int ii; /* local row index of trailing part of A */ 68 | int jj; /* local col index of trailing part of A */ 69 | int lda; /* local leading dim of array A */ 70 | int prow; /* proc. 
row owning 1st row of trail. A */ 71 | int pcol; /* proc. col owning 1st col of trail. A */ 72 | int msgid; /* message id for panel bcast */ 73 | } HPL_T_panel; 74 | 75 | /* 76 | * --------------------------------------------------------------------- 77 | * panel function prototypes 78 | * --------------------------------------------------------------------- 79 | */ 80 | #include "hpl_pgesv.hpp" 81 | 82 | typedef struct HPL_S_test HPL_T_test; 83 | 84 | int HPL_pdpanel_new(HPL_T_test*, 85 | HPL_T_grid*, 86 | HPL_T_palg*, 87 | HPL_T_pmat*, 88 | HPL_T_panel*, 89 | size_t&); 90 | 91 | void HPL_pdpanel_init(HPL_T_grid*, 92 | HPL_T_palg*, 93 | const int, 94 | const int, 95 | const int, 96 | HPL_T_pmat*, 97 | const int, 98 | const int, 99 | const int, 100 | HPL_T_panel*); 101 | 102 | int HPL_pdpanel_free(HPL_T_panel*); 103 | void HPL_pdpanel_SendToHost(HPL_T_panel*); 104 | void HPL_pdpanel_SendToDevice(HPL_T_panel*); 105 | void HPL_pdpanel_swapids(HPL_T_panel* PANEL); 106 | void HPL_pdpanel_copyL1(HPL_T_panel* PANEL); 107 | void HPL_pdpanel_Wait(HPL_T_panel* PANEL); 108 | int HPL_pdpanel_bcast(HPL_T_panel*); 109 | #endif 110 | /* 111 | * End of hpl_panel.hpp 112 | */ 113 | -------------------------------------------------------------------------------- /include/hpl_pfact.hpp: -------------------------------------------------------------------------------- 1 | /* --------------------------------------------------------------------- 2 | * -- High Performance Computing Linpack Benchmark (HPL) 3 | * HPL - 2.2 - February 24, 2016 4 | * Antoine P. Petitet 5 | * University of Tennessee, Knoxville 6 | * Innovative Computing Laboratory 7 | * (C) Copyright 2000-2008 All Rights Reserved 8 | * 9 | * Modified by: Noel Chalmers 10 | * (C) 2018-2025 Advanced Micro Devices, Inc. 11 | * See the rocHPL/LICENCE file for details. 
12 | * 13 | * SPDX-License-Identifier: (BSD-3-Clause) 14 | * --------------------------------------------------------------------- 15 | */ 16 | #ifndef HPL_PFACT_HPP 17 | #define HPL_PFACT_HPP 18 | /* 19 | * --------------------------------------------------------------------- 20 | * Include files 21 | * --------------------------------------------------------------------- 22 | */ 23 | #include "hpl_misc.hpp" 24 | #include "hpl_blas.hpp" 25 | 26 | #include "hpl_pgesv.hpp" 27 | #include "hpl_pmisc.hpp" 28 | #include "hpl_pauxil.hpp" 29 | #include "hpl_panel.hpp" 30 | 31 | /* 32 | * --------------------------------------------------------------------- 33 | * #typedefs and data structures 34 | * --------------------------------------------------------------------- 35 | */ 36 | typedef void (*HPL_T_PFA_FUN)(HPL_T_panel*, const int, const int, const int); 37 | 38 | typedef void (*HPL_T_RFA_FUN)(HPL_T_panel*, const int, const int, const int); 39 | /* 40 | * --------------------------------------------------------------------- 41 | * Function prototypes 42 | * --------------------------------------------------------------------- 43 | */ 44 | void HPL_pdmxswp(HPL_T_panel*, const int, const int, const int, double*); 45 | 46 | void HPL_pdpancrN(HPL_T_panel*, const int, const int, const int); 47 | 48 | void HPL_pdpancrT(HPL_T_panel*, const int, const int, const int); 49 | 50 | void HPL_pdpanllN(HPL_T_panel*, const int, const int, const int); 51 | 52 | void HPL_pdpanllT(HPL_T_panel*, const int, const int, const int); 53 | 54 | void HPL_pdpanrlN(HPL_T_panel*, const int, const int, const int); 55 | 56 | void HPL_pdpanrlT(HPL_T_panel*, const int, const int, const int); 57 | 58 | void HPL_pdrpancrN(HPL_T_panel*, const int, const int, const int); 59 | 60 | void HPL_pdrpancrT(HPL_T_panel*, const int, const int, const int); 61 | 62 | void HPL_pdrpanllN(HPL_T_panel*, const int, const int, const int); 63 | 64 | void HPL_pdrpanllT(HPL_T_panel*, const int, const int, const int); 65 | 66 | void HPL_pdrpanrlN(HPL_T_panel*, const int, const int, const int); 67 | 68 | void HPL_pdrpanrlT(HPL_T_panel*, const int, const int, const int); 69 | 70 | void HPL_pdfact(HPL_T_panel*); 71 | 72 | #endif 73 | /* 74 | * End of hpl_pfact.hpp 75 | */ 76 | -------------------------------------------------------------------------------- /include/hpl_pmatgen.hpp: -------------------------------------------------------------------------------- 1 | /* --------------------------------------------------------------------- 2 | * -- High Performance Computing Linpack Benchmark (HPL) 3 | * HPL - 2.2 - February 24, 2016 4 | * Antoine P. Petitet 5 | * University of Tennessee, Knoxville 6 | * Innovative Computing Laboratory 7 | * (C) Copyright 2000-2008 All Rights Reserved 8 | * 9 | * Modified by: Noel Chalmers 10 | * (C) 2018-2025 Advanced Micro Devices, Inc. 11 | * See the rocHPL/LICENCE file for details. 
12 | * 13 | * SPDX-License-Identifier: (BSD-3-Clause) 14 | * --------------------------------------------------------------------- 15 | */ 16 | #ifndef HPL_PMATGEN_HPP 17 | #define HPL_PMATGEN_HPP 18 | /* 19 | * --------------------------------------------------------------------- 20 | * Include files 21 | * --------------------------------------------------------------------- 22 | */ 23 | #include "hpl_misc.hpp" 24 | 25 | #include "hpl_pmisc.hpp" 26 | #include "hpl_pauxil.hpp" 27 | #include "hpl_pgesv.hpp" 28 | #include "hpl_ptest.hpp" 29 | 30 | /* 31 | * --------------------------------------------------------------------- 32 | * #define macro constants 33 | * --------------------------------------------------------------------- 34 | */ 35 | #define HPL_MULT 6364136223846793005UL 36 | #define HPL_IADD 1UL 37 | #define HPL_DIVFAC 2147483648.0 38 | #define HPL_POW16 65536.0 39 | #define HPL_HALF 0.5 40 | /* 41 | * --------------------------------------------------------------------- 42 | * Function prototypes 43 | * --------------------------------------------------------------------- 44 | */ 45 | void HPL_xjumpm(const int JUMPM, 46 | const uint64_t MULT, 47 | const uint64_t IADD, 48 | const uint64_t IRANN, 49 | uint64_t& IRANM, 50 | uint64_t& IAM, 51 | uint64_t& ICM); 52 | 53 | void HPL_pdrandmat(const HPL_T_grid*, 54 | const int, 55 | const int, 56 | const int, 57 | double*, 58 | const int, 59 | const int); 60 | 61 | int HPL_pdmatgen(HPL_T_test*, 62 | HPL_T_grid*, 63 | HPL_T_palg*, 64 | HPL_T_pmat*, 65 | const int, 66 | const int); 67 | 68 | void HPL_pdmatfree(HPL_T_pmat*); 69 | 70 | #endif 71 | /* 72 | * End of hpl_pmatgen.hpp 73 | */ 74 | -------------------------------------------------------------------------------- /include/hpl_pmisc.hpp: -------------------------------------------------------------------------------- 1 | /* --------------------------------------------------------------------- 2 | * -- High Performance Computing Linpack Benchmark (HPL) 3 | * HPL - 2.2 - February 24, 2016 4 | * Antoine P. Petitet 5 | * University of Tennessee, Knoxville 6 | * Innovative Computing Laboratory 7 | * (C) Copyright 2000-2008 All Rights Reserved 8 | * 9 | * Modified by: Noel Chalmers 10 | * (C) 2018-2025 Advanced Micro Devices, Inc. 11 | * See the rocHPL/LICENCE file for details. 12 | * 13 | * SPDX-License-Identifier: (BSD-3-Clause) 14 | * --------------------------------------------------------------------- 15 | */ 16 | #ifndef HPL_PMISC_HPP 17 | #define HPL_PMISC_HPP 18 | /* 19 | * --------------------------------------------------------------------- 20 | * Include files 21 | * --------------------------------------------------------------------- 22 | */ 23 | #include "hpl_misc.hpp" 24 | #include "mpi.h" 25 | 26 | #endif 27 | /* 28 | * End of hpl_pmisc.hpp 29 | */ 30 | -------------------------------------------------------------------------------- /include/hpl_ptest.hpp: -------------------------------------------------------------------------------- 1 | /* --------------------------------------------------------------------- 2 | * -- High Performance Computing Linpack Benchmark (HPL) 3 | * HPL - 2.2 - February 24, 2016 4 | * Antoine P. Petitet 5 | * University of Tennessee, Knoxville 6 | * Innovative Computing Laboratory 7 | * (C) Copyright 2000-2008 All Rights Reserved 8 | * 9 | * Modified by: Noel Chalmers 10 | * (C) 2018-2025 Advanced Micro Devices, Inc. 11 | * See the rocHPL/LICENCE file for details. 
12 | * 13 | * SPDX-License-Identifier: (BSD-3-Clause) 14 | * --------------------------------------------------------------------- 15 | */ 16 | #ifndef HPL_PTEST_HPP 17 | #define HPL_PTEST_HPP 18 | /* 19 | * --------------------------------------------------------------------- 20 | * Include files 21 | * --------------------------------------------------------------------- 22 | */ 23 | #include "hpl_misc.hpp" 24 | #include "hpl_blas.hpp" 25 | #include "hpl_auxil.hpp" 26 | 27 | #include "hpl_pmisc.hpp" 28 | #include "hpl_pauxil.hpp" 29 | #include "hpl_panel.hpp" 30 | #include "hpl_pgesv.hpp" 31 | 32 | #include "hpl_ptimer.hpp" 33 | #include "hpl_pmatgen.hpp" 34 | 35 | /* 36 | * --------------------------------------------------------------------- 37 | * Data Structures 38 | * --------------------------------------------------------------------- 39 | */ 40 | typedef struct HPL_S_test { 41 | double epsil; /* epsilon machine */ 42 | double thrsh; /* threshold */ 43 | FILE* outfp; /* output stream (only in proc 0) */ 44 | int kfail; /* # of tests failed */ 45 | int kpass; /* # of tests passed */ 46 | int kskip; /* # of tests skipped */ 47 | int ktest; /* total number of tests */ 48 | } HPL_T_test; 49 | 50 | /* 51 | * --------------------------------------------------------------------- 52 | * #define macro constants for testing only 53 | * --------------------------------------------------------------------- 54 | */ 55 | #define HPL_LINE_MAX 256 56 | #define HPL_MAX_PARAM 20 57 | #define HPL_ISEED 100 58 | /* 59 | * --------------------------------------------------------------------- 60 | * global timers for timing analysis only 61 | * --------------------------------------------------------------------- 62 | */ 63 | #define HPL_TIMING_BEG 11 /* timer 0 reserved, used by main */ 64 | #define HPL_TIMING_N 8 /* number of timers defined below */ 65 | #define HPL_TIMING_RPFACT 11 /* starting from here, contiguous */ 66 | #define HPL_TIMING_PFACT 12 67 | #define HPL_TIMING_MXSWP 13 68 | #define HPL_TIMING_COPY 14 69 | #define HPL_TIMING_LBCAST 15 70 | #define HPL_TIMING_LASWP 16 71 | #define HPL_TIMING_UPDATE 17 72 | #define HPL_TIMING_PTRSV 18 73 | /* 74 | * --------------------------------------------------------------------- 75 | * Function prototypes 76 | * --------------------------------------------------------------------- 77 | */ 78 | void HPL_pdinfo(int ARGC, 79 | char** ARGV, 80 | HPL_T_test*, 81 | int*, 82 | int*, 83 | int*, 84 | int*, 85 | HPL_T_ORDER*, 86 | int*, 87 | int*, 88 | int*, 89 | int*, 90 | int*, 91 | int*, 92 | HPL_T_FACT*, 93 | int*, 94 | int*, 95 | int*, 96 | int*, 97 | int*, 98 | HPL_T_FACT*, 99 | int*, 100 | HPL_T_TOP*, 101 | int*, 102 | int*, 103 | HPL_T_SWAP*, 104 | int*, 105 | int*, 106 | int*, 107 | int*, 108 | int*, 109 | double*, 110 | int*); 111 | 112 | int HPL_pdwarmup(HPL_T_test* TEST, 113 | HPL_T_grid* GRID, 114 | HPL_T_palg* ALGO, 115 | HPL_T_pmat* mat); 116 | 117 | void HPL_pdtest(HPL_T_test*, HPL_T_grid*, HPL_T_palg*, const int, const int); 118 | void HPL_InitGPU(const HPL_T_grid* GRID); 119 | void HPL_FreeGPU(); 120 | 121 | #endif 122 | /* 123 | * End of hpl_ptest.hpp 124 | */ 125 | -------------------------------------------------------------------------------- /include/hpl_ptimer.hpp: -------------------------------------------------------------------------------- 1 | /* --------------------------------------------------------------------- 2 | * -- High Performance Computing Linpack Benchmark (HPL) 3 | * HPL - 2.2 - February 24, 2016 4 | * Antoine P. 
Petitet 5 | * University of Tennessee, Knoxville 6 | * Innovative Computing Laboratory 7 | * (C) Copyright 2000-2008 All Rights Reserved 8 | * 9 | * Modified by: Noel Chalmers 10 | * (C) 2018-2025 Advanced Micro Devices, Inc. 11 | * See the rocHPL/LICENCE file for details. 12 | * 13 | * SPDX-License-Identifier: (BSD-3-Clause) 14 | * --------------------------------------------------------------------- 15 | */ 16 | #ifndef HPL_PTIMER_HPP 17 | #define HPL_PTIMER_HPP 18 | /* 19 | * --------------------------------------------------------------------- 20 | * Include files 21 | * --------------------------------------------------------------------- 22 | */ 23 | #include "hpl_pmisc.hpp" 24 | 25 | /* 26 | * --------------------------------------------------------------------- 27 | * #define macro constants 28 | * --------------------------------------------------------------------- 29 | */ 30 | #define HPL_NPTIMER 64 31 | #define HPL_PTIMER_STARTFLAG 5.0 32 | #define HPL_PTIMER_ERROR -1.0 33 | /* 34 | * --------------------------------------------------------------------- 35 | * type definitions 36 | * --------------------------------------------------------------------- 37 | */ 38 | typedef enum { HPL_WALL_PTIME = 101, HPL_CPU_PTIME = 102 } HPL_T_PTIME; 39 | 40 | typedef enum { 41 | HPL_AMAX_PTIME = 201, 42 | HPL_AMIN_PTIME = 202, 43 | HPL_SUM_PTIME = 203 44 | } HPL_T_PTIME_OP; 45 | /* 46 | * --------------------------------------------------------------------- 47 | * Function prototypes 48 | * --------------------------------------------------------------------- 49 | */ 50 | double HPL_ptimer_cputime(void); 51 | double HPL_ptimer_walltime(void); 52 | void HPL_ptimer(const int); 53 | void HPL_ptimer_boot(void); 54 | 55 | void HPL_ptimer_combine(MPI_Comm comm, 56 | const HPL_T_PTIME_OP, 57 | const HPL_T_PTIME, 58 | const int, 59 | const int, 60 | double*); 61 | 62 | void HPL_ptimer_disable(void); 63 | void HPL_ptimer_enable(void); 64 | double HPL_ptimer_inquire(const HPL_T_PTIME, const int); 65 | void HPL_ptimer_stepReset(const int, const int); 66 | double HPL_ptimer_getStep(const int); 67 | 68 | #endif 69 | /* 70 | * End of hpl_ptimer.hpp 71 | */ 72 | -------------------------------------------------------------------------------- /include/hpl_version.hpp.in: -------------------------------------------------------------------------------- 1 | /* --------------------------------------------------------------------- 2 | * -- High Performance Computing Linpack Benchmark (HPL) 3 | * Noel Chalmers 4 | * (C) 2018-2025 Advanced Micro Devices, Inc. 5 | * See the rocHPL/LICENCE file for details. 
6 | *
7 | * SPDX-License-Identifier: (BSD-3-Clause)
8 | * ---------------------------------------------------------------------
9 | */
10 |
11 | #ifndef HPL_VERSION_HPP
12 | #define HPL_VERSION_HPP
13 |
14 | // clang-format off
15 | #define __ROCHPL_VER_MAJOR @rochpl_VERSION_MAJOR@
16 | #define __ROCHPL_VER_MINOR @rochpl_VERSION_MINOR@
17 | #define __ROCHPL_VER_PATCH @rochpl_VERSION_PATCH@
18 | #define __ROCHPL_VER_TWEAK @rochpl_VERSION_TWEAK@
19 | // clang-format on
20 |
21 | #define __ROCHPL_VER \
22 | 10000 * __ROCHPL_VER_MAJOR + 100 * __ROCHPL_VER_MINOR + __ROCHPL_VER_PATCH
23 |
24 | #endif // VERSION_HPP
25 |
--------------------------------------------------------------------------------
/scripts/HPL.dat:
--------------------------------------------------------------------------------
1 | HPLinpack benchmark input file
2 | Innovative Computing Laboratory, University of Tennessee
3 | HPL.out output file name (if any)
4 | 0 device out (6=stdout,7=stderr,file)
5 | 1 # of problem sizes (N)
6 | 45312 Ns
7 | 1 # of NBs
8 | 384 NBs
9 | 1 PMAP process mapping (0=Row-,1=Column-major)
10 | 1 # of process grids (P x Q)
11 | 1 Ps
12 | 1 Qs
13 | 16.0 threshold
14 | 1 # of panel fact
15 | 2 PFACTs (0=left, 1=Crout, 2=Right)
16 | 1 # of recursive stopping criteria
17 | 32 NBMINs (>= 1)
18 | 1 # of panels in recursion
19 | 2 NDIVs
20 | 1 # of recursive panel fact.
21 | 2 RFACTs (0=left, 1=Crout, 2=Right)
22 | 1 # of broadcast
23 | 0 BCASTs (0=1rg,1=1rM,2=2rg,3=2rM,4=Lng,5=LnM)
24 | 1 # of lookahead depth
25 | 1 DEPTHs (>=0)
26 | 1 SWAP (0=bin-exch,1=long,2=mix)
27 | 64 swapping threshold
28 | 0 L1 in (0=transposed,1=no-transposed) form
29 | 0 U in (0=transposed,1=no-transposed) form
30 | 0 Equilibration (0=no,1=yes)
31 | 8 memory alignment in double (> 0)
32 |
--------------------------------------------------------------------------------
/src/HPL_InitGPU.cpp:
--------------------------------------------------------------------------------
1 | /* ---------------------------------------------------------------------
2 | * -- High Performance Computing Linpack Benchmark (HPL)
3 | * Noel Chalmers
4 | * (C) 2018-2025 Advanced Micro Devices, Inc.
5 | * See the rocHPL/LICENCE file for details.
6 | *
7 | * SPDX-License-Identifier: (BSD-3-Clause)
8 | * ---------------------------------------------------------------------
9 | */
10 |
11 | #include "hpl.hpp"
12 | #include
13 |
14 | rocblas_handle handle;
15 |
16 | hipStream_t computeStream, dataStream;
17 |
18 | hipEvent_t swapStartEvent[HPL_N_UPD], update[HPL_N_UPD];
19 | hipEvent_t dgemmStart[HPL_N_UPD], dgemmStop[HPL_N_UPD];
20 | hipEvent_t pfactStart, pfactStop;
21 |
22 | static char host_name[MPI_MAX_PROCESSOR_NAME];
23 |
24 | /*
25 | This function finds out how many MPI processes are running on the same node
26 | and assigns a local rank that can be used to map a process to a device.
27 | This function needs to be called by all the MPI processes.
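As a concrete illustration (assuming, say, 8 ranks per node on a node with
8 visible devices): each rank computes localRank = rank % localSize below and
is pinned to device localRank % deviceCount, so node-local ranks 0..7 land
on devices 0..7.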
28 | */ 29 | void HPL_InitGPU(const HPL_T_grid* GRID) { 30 | char host_name[MPI_MAX_PROCESSOR_NAME]; 31 | 32 | int i, n, namelen, rank, nprocs; 33 | int dev; 34 | 35 | int nprow, npcol, myrow, mycol; 36 | (void)HPL_grid_info(GRID, &nprow, &npcol, &myrow, &mycol); 37 | 38 | MPI_Comm_rank(MPI_COMM_WORLD, &rank); 39 | MPI_Comm_size(MPI_COMM_WORLD, &nprocs); 40 | 41 | MPI_Get_processor_name(host_name, &namelen); 42 | 43 | int localSize = GRID->local_npcol * GRID->local_nprow; 44 | int localRank = rank % localSize; 45 | 46 | /* Find out how many GPUs are in the system and their device number */ 47 | int deviceCount; 48 | CHECK_HIP_ERROR(hipGetDeviceCount(&deviceCount)); 49 | 50 | if(deviceCount < 1) { 51 | if(localRank == 0) 52 | HPL_pwarn(stderr, 53 | __LINE__, 54 | "HPL_InitGPU", 55 | "Node %s found no GPUs. Is the ROCm kernel module loaded?", 56 | host_name); 57 | MPI_Finalize(); 58 | exit(1); 59 | } 60 | 61 | dev = localRank % deviceCount; 62 | 63 | /* Assign device to MPI process, initialize BLAS and probe device properties 64 | */ 65 | CHECK_HIP_ERROR(hipSetDevice(dev)); 66 | 67 | CHECK_HIP_ERROR(hipStreamCreate(&computeStream)); 68 | CHECK_HIP_ERROR(hipStreamCreate(&dataStream)); 69 | 70 | CHECK_HIP_ERROR(hipEventCreate(swapStartEvent + HPL_LOOK_AHEAD)); 71 | CHECK_HIP_ERROR(hipEventCreate(swapStartEvent + HPL_UPD_1)); 72 | CHECK_HIP_ERROR(hipEventCreate(swapStartEvent + HPL_UPD_2)); 73 | 74 | CHECK_HIP_ERROR(hipEventCreate(update + HPL_LOOK_AHEAD)); 75 | CHECK_HIP_ERROR(hipEventCreate(update + HPL_UPD_1)); 76 | CHECK_HIP_ERROR(hipEventCreate(update + HPL_UPD_2)); 77 | 78 | CHECK_HIP_ERROR(hipEventCreate(dgemmStart + HPL_LOOK_AHEAD)); 79 | CHECK_HIP_ERROR(hipEventCreate(dgemmStart + HPL_UPD_1)); 80 | CHECK_HIP_ERROR(hipEventCreate(dgemmStart + HPL_UPD_2)); 81 | 82 | CHECK_HIP_ERROR(hipEventCreate(dgemmStop + HPL_LOOK_AHEAD)); 83 | CHECK_HIP_ERROR(hipEventCreate(dgemmStop + HPL_UPD_1)); 84 | CHECK_HIP_ERROR(hipEventCreate(dgemmStop + HPL_UPD_2)); 85 | 86 | CHECK_HIP_ERROR(hipEventCreate(&pfactStart)); 87 | CHECK_HIP_ERROR(hipEventCreate(&pfactStop)); 88 | 89 | /* Create a rocBLAS handle */ 90 | CHECK_ROCBLAS_ERROR(rocblas_create_handle(&handle)); 91 | CHECK_ROCBLAS_ERROR( 92 | rocblas_set_pointer_mode(handle, rocblas_pointer_mode_host)); 93 | CHECK_ROCBLAS_ERROR(rocblas_set_stream(handle, computeStream)); 94 | 95 | rocblas_initialize(); 96 | 97 | #ifdef HPL_ROCBLAS_ALLOW_ATOMICS 98 | CHECK_ROCBLAS_ERROR( 99 | rocblas_set_atomics_mode(handle, rocblas_atomics_allowed)); 100 | #else 101 | CHECK_ROCBLAS_ERROR( 102 | rocblas_set_atomics_mode(handle, rocblas_atomics_not_allowed)); 103 | #endif 104 | } 105 | 106 | void HPL_FreeGPU() { 107 | CHECK_ROCBLAS_ERROR(rocblas_destroy_handle(handle)); 108 | 109 | CHECK_HIP_ERROR(hipEventDestroy(swapStartEvent[HPL_LOOK_AHEAD])); 110 | CHECK_HIP_ERROR(hipEventDestroy(swapStartEvent[HPL_UPD_1])); 111 | CHECK_HIP_ERROR(hipEventDestroy(swapStartEvent[HPL_UPD_2])); 112 | 113 | CHECK_HIP_ERROR(hipEventDestroy(update[HPL_LOOK_AHEAD])); 114 | CHECK_HIP_ERROR(hipEventDestroy(update[HPL_UPD_1])); 115 | CHECK_HIP_ERROR(hipEventDestroy(update[HPL_UPD_2])); 116 | 117 | CHECK_HIP_ERROR(hipEventDestroy(dgemmStart[HPL_LOOK_AHEAD])); 118 | CHECK_HIP_ERROR(hipEventDestroy(dgemmStart[HPL_UPD_1])); 119 | CHECK_HIP_ERROR(hipEventDestroy(dgemmStart[HPL_UPD_2])); 120 | 121 | CHECK_HIP_ERROR(hipEventDestroy(dgemmStop[HPL_LOOK_AHEAD])); 122 | CHECK_HIP_ERROR(hipEventDestroy(dgemmStop[HPL_UPD_1])); 123 | CHECK_HIP_ERROR(hipEventDestroy(dgemmStop[HPL_UPD_2])); 124 | 125 | 
CHECK_HIP_ERROR(hipEventDestroy(pfactStart));
126 |   CHECK_HIP_ERROR(hipEventDestroy(pfactStop));
127 | 
128 |   CHECK_HIP_ERROR(hipStreamDestroy(computeStream));
129 |   CHECK_HIP_ERROR(hipStreamDestroy(dataStream));
130 | }
131 | 
--------------------------------------------------------------------------------
/src/HPL_pdwarmup.cpp:
--------------------------------------------------------------------------------
1 | /* ---------------------------------------------------------------------
2 | * -- High Performance Computing Linpack Benchmark (HPL)
3 | *    HPL - 2.2 - February 24, 2016
4 | *    Antoine P. Petitet
5 | *    University of Tennessee, Knoxville
6 | *    Innovative Computing Laboratory
7 | *    (C) Copyright 2000-2008 All Rights Reserved
8 | *
9 | *    Modified by: Noel Chalmers
10 | *    (C) 2018-2025 Advanced Micro Devices, Inc.
11 | *    See the rocHPL/LICENCE file for details.
12 | *
13 | * SPDX-License-Identifier: (BSD-3-Clause)
14 | * ---------------------------------------------------------------------
15 | */
16 | #include "hpl.hpp"
17 | 
18 | int HPL_pdwarmup(HPL_T_test* TEST,
19 |                  HPL_T_grid* GRID,
20 |                  HPL_T_palg* ALGO,
21 |                  HPL_T_pmat* mat) {
22 | 
23 |   int N  = mat->n;
24 |   int NB = mat->nb;
25 | 
26 |   HPL_T_UPD_FUN HPL_pdupdate = ALGO->upfun;
27 | 
28 |   HPL_T_panel* p0 = &(mat->panel[0]);
29 |   HPL_T_panel* p1 = &(mat->panel[1]);
30 | 
31 |   HPL_pdpanel_init(
32 |       GRID, ALGO, N, N + 1, Mmin(N, NB), mat, 0, 0, MSGID_BEGIN_FACT, p0);
33 |   HPL_pdpanel_init(
34 |       GRID, ALGO, N, N + 1, Mmin(N, NB), mat, 0, 0, MSGID_BEGIN_FACT, p1);
35 | 
36 |   int mm = Mmin(p0->mp, p0->jb);
37 |   int nn = Mmin(p0->nq, p0->jb);
38 | 
39 |   // Fill the matrix with values
40 |   HPL_pdrandmat(GRID, N, N + 1, NB, mat->A, mat->ld, HPL_ISEED);
41 | 
42 |   // Do a pfact on all columns
43 |   p0->pcol = p0->grid->mycol;
44 |   HPL_pdfact(p0);
45 |   HPL_pdpanel_swapids(p0);
46 |   HPL_pdpanel_Wait(p0);
47 |   p0->A -= p0->jb * static_cast<size_t>(p0->lda);
48 | 
49 |   // Broadcast to register with MPI
50 |   p0->pcol = 0;
51 |   HPL_pdpanel_bcast(p0);
52 | 
53 |   p0->nu0  = nn;
54 |   p0->ldu0 = nn;
55 |   HPL_pdlaswp_start(p0, HPL_LOOK_AHEAD);
56 |   HPL_pdlaswp_exchange(p0, HPL_LOOK_AHEAD);
57 |   HPL_pdlaswp_end(p0, HPL_LOOK_AHEAD);
58 |   HPL_pdupdate(p0, HPL_LOOK_AHEAD);
59 |   p0->nu0 = 0;
60 | 
61 |   HPL_pdlaswp_start(p0, HPL_UPD_1);
62 |   HPL_pdlaswp_exchange(p0, HPL_UPD_1);
63 |   HPL_pdlaswp_end(p0, HPL_UPD_1);
64 |   HPL_pdupdate(p0, HPL_UPD_1);
65 | 
66 |   HPL_pdlaswp_start(p0, HPL_UPD_2);
67 |   HPL_pdlaswp_exchange(p0, HPL_UPD_2);
68 |   HPL_pdlaswp_end(p0, HPL_UPD_2);
69 |   HPL_pdupdate(p0, HPL_UPD_2);
70 | 
71 |   CHECK_HIP_ERROR(hipDeviceSynchronize());
72 | 
73 |   // Do a pfact on all columns
74 |   p1->pcol = p1->grid->mycol;
75 |   HPL_pdfact(p1);
76 |   HPL_pdpanel_swapids(p1);
77 |   HPL_pdpanel_Wait(p1);
78 |   p1->A -= p1->jb * static_cast<size_t>(p1->lda);
79 | 
80 |   // Broadcast to register with MPI
81 |   p1->pcol = 0;
82 |   HPL_pdpanel_bcast(p1);
83 | 
84 |   p1->nu0  = nn;
85 |   p1->ldu0 = nn;
86 |   HPL_pdlaswp_start(p1, HPL_LOOK_AHEAD);
87 |   HPL_pdlaswp_exchange(p1, HPL_LOOK_AHEAD);
88 |   HPL_pdlaswp_end(p1, HPL_LOOK_AHEAD);
89 |   HPL_pdupdate(p1, HPL_LOOK_AHEAD);
90 |   p1->nu0 = 0;
91 | 
92 |   HPL_pdlaswp_start(p1, HPL_UPD_1);
93 |   HPL_pdlaswp_exchange(p1, HPL_UPD_1);
94 |   HPL_pdlaswp_end(p1, HPL_UPD_1);
95 |   HPL_pdupdate(p1, HPL_UPD_1);
96 | 
97 |   HPL_pdlaswp_start(p1, HPL_UPD_2);
98 |   HPL_pdlaswp_exchange(p1, HPL_UPD_2);
99 |   HPL_pdlaswp_end(p1, HPL_UPD_2);
100 |   HPL_pdupdate(p1, HPL_UPD_2);
101 | 
102 |   HPL_pdtrsv(GRID, mat);
103 | 
104 |   return HPL_SUCCESS;
105 | }
106 | 
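/*
 * Editorial note: the warmup above runs both panel workspaces (p0 and p1)
 * through one full iteration -- pfact, swap-id generation, panel broadcast,
 * row swaps, the three trailing updates, and the triangular solve -- so that
 * MPI buffer registration, rocBLAS kernel selection, and first-touch page
 * mapping all happen off the timed path. A sketch of the intended call order
 * in a driver (a hypothetical outline, not the actual HPL_pdtest.cpp; note
 * the warmup factors the matrix, so it must be regenerated before timing):
 *
 *   HPL_pdwarmup(&test, &grid, &algo, &mat);                    // untimed
 *   HPL_pdrandmat(GRID, N, N + 1, NB, mat.A, mat.ld, HPL_ISEED); // regenerate
 *   HPL_ptimer(0);                                              // start clock
 *   HPL_pdgesv(&grid, &algo, &mat);                             // timed solve
 */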
--------------------------------------------------------------------------------
/src/auxil/HPL_abort.cpp:
--------------------------------------------------------------------------------
1 | /* ---------------------------------------------------------------------
2 | * -- High Performance Computing Linpack Benchmark (HPL)
3 | *    HPL - 2.2 - February 24, 2016
4 | *    Antoine P. Petitet
5 | *    University of Tennessee, Knoxville
6 | *    Innovative Computing Laboratory
7 | *    (C) Copyright 2000-2008 All Rights Reserved
8 | *
9 | *    Modified by: Noel Chalmers
10 | *    (C) 2018-2025 Advanced Micro Devices, Inc.
11 | *    See the rocHPL/LICENCE file for details.
12 | *
13 | * SPDX-License-Identifier: (BSD-3-Clause)
14 | * ---------------------------------------------------------------------
15 | */
16 | 
17 | #include "hpl.hpp"
18 | 
19 | void HPL_abort(int LINE, const char* SRNAME, const char* FORM, ...) {
20 | /*
21 |  * Purpose
22 |  * =======
23 |  *
24 |  * HPL_abort displays an error message on stderr and halts execution.
25 |  *
26 |  *
27 |  * Arguments
28 |  * =========
29 |  *
30 |  * LINE    (local input)                 int
31 |  *         On entry, LINE specifies the line number in the file where
32 |  *         the error has occurred. When LINE is not a positive line
33 |  *         number, it is ignored.
34 |  *
35 |  * SRNAME  (local input)                 const char *
36 |  *         On entry, SRNAME should be the name of the routine calling
37 |  *         this error handler.
38 |  *
39 |  * FORM    (local input)                 const char *
40 |  *         On entry, FORM specifies the format, i.e., how the subsequent
41 |  *         arguments are converted for output.
42 |  *
43 |  *         (local input)                 ...
44 |  *         On entry, ... is the list of arguments to be printed within
45 |  *         the format string.
46 |  *
47 |  * ---------------------------------------------------------------------
48 |  */
49 | 
50 |   va_list argptr;
51 |   char    cline[128];
52 | 
53 |   va_start(argptr, FORM);
54 |   (void)vsprintf(cline, FORM, argptr);
55 |   va_end(argptr);
56 |   /*
57 |    * Display an error message
58 |    */
59 |   if(LINE <= 0)
60 |     HPL_fprintf(stderr,
61 |                 "%s %s:\n>>> %s <<< Abort ...\n\n",
62 |                 "HPL ERROR in function",
63 |                 SRNAME,
64 |                 cline);
65 |   else
66 |     HPL_fprintf(stderr,
67 |                 "%s %d %s %s:\n>>> %s <<< Abort ...\n\n",
68 |                 "HPL ERROR on line",
69 |                 LINE,
70 |                 "of function",
71 |                 SRNAME,
72 |                 cline);
73 |   exit(0);
74 | }
75 | 
--------------------------------------------------------------------------------
/src/auxil/HPL_dlacpy_device.cpp:
--------------------------------------------------------------------------------
1 | /* ---------------------------------------------------------------------
2 | * -- High Performance Computing Linpack Benchmark (HPL)
3 | *    HPL - 2.2 - February 24, 2016
4 | *    Antoine P. Petitet
5 | *    University of Tennessee, Knoxville
6 | *    Innovative Computing Laboratory
7 | *    (C) Copyright 2000-2008 All Rights Reserved
8 | *
9 | *    Modified by: Noel Chalmers
10 | *    (C) 2018-2025 Advanced Micro Devices, Inc.
11 | *    See the rocHPL/LICENCE file for details.
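 *
 * Editorial aside on the launch geometry below: each thread block has
 * TILE_DIM x BLOCK_ROWS = 64 x 16 threads and covers a 64 x 64 tile, so
 * every thread copies up to four elements at column offsets 0, 16, 32 and
 * 48. As a quick worked example, the default N = 45312 from scripts/HPL.dat
 * gives a grid of ceil(45312/64) = 708 blocks per dimension, i.e.
 * 708 x 708 = 501,264 blocks for a full square copy.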
12 | *
13 | * SPDX-License-Identifier: (BSD-3-Clause)
14 | * ---------------------------------------------------------------------
15 | */
16 | 
17 | #include "hpl.hpp"
18 | #include <hip/hip_runtime.h>
19 | 
20 | #define TILE_DIM 64
21 | #define BLOCK_ROWS 16
22 | 
23 | __global__ void dlacpy(const int M,
24 |                        const int N,
25 |                        const double* __restrict__ A,
26 |                        const int LDA,
27 |                        double* __restrict__ B,
28 |                        const int LDB) {
29 | 
30 |   const int I = blockIdx.x * TILE_DIM + threadIdx.x;
31 |   const int J = blockIdx.y * TILE_DIM + threadIdx.y;
32 | 
33 |   if(I < M) {
34 |     if(J + 0 < N)
35 |       B[I + static_cast<size_t>(LDB) * (J + 0)] =
36 |           A[I + static_cast<size_t>(LDA) * (J + 0)];
37 |     if(J + 16 < N)
38 |       B[I + static_cast<size_t>(LDB) * (J + 16)] =
39 |           A[I + static_cast<size_t>(LDA) * (J + 16)];
40 |     if(J + 32 < N)
41 |       B[I + static_cast<size_t>(LDB) * (J + 32)] =
42 |           A[I + static_cast<size_t>(LDA) * (J + 32)];
43 |     if(J + 48 < N)
44 |       B[I + static_cast<size_t>(LDB) * (J + 48)] =
45 |           A[I + static_cast<size_t>(LDA) * (J + 48)];
46 |   }
47 | }
48 | 
49 | void HPL_dlacpy(const int M,
50 |                 const int N,
51 |                 const double* A,
52 |                 const int LDA,
53 |                 double* B,
54 |                 const int LDB) {
55 | /*
56 |  * Purpose
57 |  * =======
58 |  *
59 |  * HPL_dlacpy copies an array A into an array B.
60 |  *
61 |  *
62 |  * Arguments
63 |  * =========
64 |  *
65 |  * M       (local input)                 const int
66 |  *         On entry, M specifies the number of rows of the arrays A and
67 |  *         B. M must be at least zero.
68 |  *
69 |  * N       (local input)                 const int
70 |  *         On entry, N specifies the number of columns of the arrays A
71 |  *         and B. N must be at least zero.
72 |  *
73 |  * A       (local input)                 const double *
74 |  *         On entry, A points to an array of dimension (LDA,N).
75 |  *
76 |  * LDA     (local input)                 const int
77 |  *         On entry, LDA specifies the leading dimension of the array A.
78 |  *         LDA must be at least MAX(1,M).
79 |  *
80 |  * B       (local output)                double *
81 |  *         On entry, B points to an array of dimension (LDB,N). On exit,
82 |  *         B is overwritten with A.
83 |  *
84 |  * LDB     (local input)                 const int
85 |  *         On entry, LDB specifies the leading dimension of the array B.
86 |  *         LDB must be at least MAX(1,M).
87 |  *
88 |  * ---------------------------------------------------------------------
89 |  */
90 | 
91 |   if((M <= 0) || (N <= 0)) return;
92 | 
93 |   hipStream_t stream;
94 |   CHECK_ROCBLAS_ERROR(rocblas_get_stream(handle, &stream));
95 | 
96 |   dim3 grid_size((M + TILE_DIM - 1) / TILE_DIM, (N + TILE_DIM - 1) / TILE_DIM);
97 |   dim3 block_size(TILE_DIM, BLOCK_ROWS);
98 |   dlacpy<<<grid_size, block_size, 0, stream>>>(M, N, A, LDA, B, LDB);
99 |   CHECK_HIP_ERROR(hipGetLastError());
100 | }
101 | 
--------------------------------------------------------------------------------
/src/auxil/HPL_dlaprnt.cpp:
--------------------------------------------------------------------------------
1 | /* ---------------------------------------------------------------------
2 | * -- High Performance Computing Linpack Benchmark (HPL)
3 | *    HPL - 2.2 - February 24, 2016
4 | *    Antoine P. Petitet
5 | *    University of Tennessee, Knoxville
6 | *    Innovative Computing Laboratory
7 | *    (C) Copyright 2000-2008 All Rights Reserved
8 | *
9 | *    Modified by: Noel Chalmers
10 | *    (C) 2018-2025 Advanced Micro Devices, Inc.
11 | *    See the rocHPL/LICENCE file for details.
12 | * 13 | * SPDX-License-Identifier: (BSD-3-Clause) 14 | * --------------------------------------------------------------------- 15 | */ 16 | 17 | #include "hpl.hpp" 18 | 19 | void HPL_dlaprnt(const int M, 20 | const int N, 21 | double* A, 22 | const int IA, 23 | const int JA, 24 | const int LDA, 25 | const char* CMATNM) { 26 | /* 27 | * Purpose 28 | * ======= 29 | * 30 | * HPL_dlaprnt prints to standard error an M-by-N matrix A. 31 | * 32 | * 33 | * Arguments 34 | * ========= 35 | * 36 | * M (local input) const int 37 | * On entry, M specifies the number of rows of A. M must be at 38 | * least zero. 39 | * 40 | * N (local input) const int 41 | * On entry, N specifies the number of columns of A. N must be 42 | * at least zero. 43 | * 44 | * A (local input) double * 45 | * On entry, A points to an array of dimension (LDA,N). 46 | * 47 | * IA (local input) const int 48 | * On entry, IA specifies the starting row index to be printed. 49 | * 50 | * JA (local input) const int 51 | * On entry, JA specifies the starting column index to be 52 | * printed. 53 | * 54 | * LDA (local input) const int 55 | * On entry, LDA specifies the leading dimension of the array A. 56 | * LDA must be at least max(1,M). 57 | * 58 | * CMATNM (local input) const char * 59 | * On entry, CMATNM is the name of the matrix to be printed. 60 | * 61 | * --------------------------------------------------------------------- 62 | */ 63 | 64 | int i, j; 65 | 66 | for(j = 0; j < N; j++) { 67 | for(i = 0; i < M; i++) { 68 | HPL_fprintf(stderr, 69 | "%s(%6d,%6d)=%30.18f\n", 70 | CMATNM, 71 | IA + i, 72 | JA + j, 73 | *(Mptr(A, i, j, LDA))); 74 | } 75 | } 76 | } 77 | -------------------------------------------------------------------------------- /src/auxil/HPL_dlatcpy_device.cpp: -------------------------------------------------------------------------------- 1 | /* --------------------------------------------------------------------- 2 | * -- High Performance Computing Linpack Benchmark (HPL) 3 | * HPL - 2.2 - February 24, 2016 4 | * Antoine P. Petitet 5 | * University of Tennessee, Knoxville 6 | * Innovative Computing Laboratory 7 | * (C) Copyright 2000-2008 All Rights Reserved 8 | * 9 | * Modified by: Noel Chalmers 10 | * (C) 2018-2025 Advanced Micro Devices, Inc. 11 | * See the rocHPL/LICENCE file for details. 
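 *
 * Editorial aside: unlike the straight copy in HPL_dlacpy, the transpose
 * below stages each 64 x 64 tile through shared memory, declared as
 * s_tile[TILE_DIM][TILE_DIM + 1]. The extra pad column makes consecutive
 * rows start in different LDS banks, which avoids bank conflicts on the
 * transposed read, so both the strided load and the store stay coalesced.
 * The tile costs 64 * 65 * 8 = 33,280 bytes, comfortably within a 64 KiB
 * LDS allocation.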
12 | *
13 | * SPDX-License-Identifier: (BSD-3-Clause)
14 | * ---------------------------------------------------------------------
15 | */
16 | 
17 | #include "hpl.hpp"
18 | #include <hip/hip_runtime.h>
19 | 
20 | #define TILE_DIM 64
21 | #define BLOCK_ROWS 16
22 | 
23 | __global__ void dlatcpy(const int M,
24 |                         const int N,
25 |                         const double* __restrict__ A,
26 |                         const int LDA,
27 |                         double* __restrict__ B,
28 |                         const int LDB) {
29 | 
30 |   __shared__ double s_tile[TILE_DIM][TILE_DIM + 1];
31 | 
32 |   int I = blockIdx.x * TILE_DIM + threadIdx.y;
33 |   int J = blockIdx.y * TILE_DIM + threadIdx.x;
34 | 
35 |   if(J < N) {
36 |     if(I + 0 < M)
37 |       s_tile[threadIdx.y + 0][threadIdx.x] = A[((size_t)I + 0) * LDA + J];
38 |     if(I + 16 < M)
39 |       s_tile[threadIdx.y + 16][threadIdx.x] = A[((size_t)I + 16) * LDA + J];
40 |     if(I + 32 < M)
41 |       s_tile[threadIdx.y + 32][threadIdx.x] = A[((size_t)I + 32) * LDA + J];
42 |     if(I + 48 < M)
43 |       s_tile[threadIdx.y + 48][threadIdx.x] = A[((size_t)I + 48) * LDA + J];
44 |   }
45 | 
46 |   I = blockIdx.x * TILE_DIM + threadIdx.x;
47 |   J = blockIdx.y * TILE_DIM + threadIdx.y;
48 | 
49 |   __syncthreads();
50 | 
51 |   if(I < M) {
52 |     if(J + 0 < N)
53 |       B[I + ((size_t)J + 0) * LDB] = s_tile[threadIdx.x][threadIdx.y + 0];
54 |     if(J + 16 < N)
55 |       B[I + ((size_t)J + 16) * LDB] = s_tile[threadIdx.x][threadIdx.y + 16];
56 |     if(J + 32 < N)
57 |       B[I + ((size_t)J + 32) * LDB] = s_tile[threadIdx.x][threadIdx.y + 32];
58 |     if(J + 48 < N)
59 |       B[I + ((size_t)J + 48) * LDB] = s_tile[threadIdx.x][threadIdx.y + 48];
60 |   }
61 | }
62 | 
63 | void HPL_dlatcpy(const int M,
64 |                  const int N,
65 |                  const double* A,
66 |                  const int LDA,
67 |                  double* B,
68 |                  const int LDB) {
69 | /*
70 |  * Purpose
71 |  * =======
72 |  *
73 |  * HPL_dlatcpy copies the transpose of an array A into an array B.
74 |  *
75 |  *
76 |  * Arguments
77 |  * =========
78 |  *
79 |  * M       (local input)                 const int
80 |  *         On entry, M specifies the number of rows of the array B and
81 |  *         the number of columns of A. M must be at least zero.
82 |  *
83 |  * N       (local input)                 const int
84 |  *         On entry, N specifies the number of rows of the array A and
85 |  *         the number of columns of B. N must be at least zero.
86 |  *
87 |  * A       (local input)                 const double *
88 |  *         On entry, A points to an array of dimension (LDA,M).
89 |  *
90 |  * LDA     (local input)                 const int
91 |  *         On entry, LDA specifies the leading dimension of the array A.
92 |  *         LDA must be at least MAX(1,N).
93 |  *
94 |  * B       (local output)                double *
95 |  *         On entry, B points to an array of dimension (LDB,N). On exit,
96 |  *         B is overwritten with the transpose of A.
97 |  *
98 |  * LDB     (local input)                 const int
99 |  *         On entry, LDB specifies the leading dimension of the array B.
100 |  *         LDB must be at least MAX(1,M).
101 |  *
102 |  * ---------------------------------------------------------------------
103 |  */
104 | 
105 |   if((M <= 0) || (N <= 0)) return;
106 | 
107 |   hipStream_t stream;
108 |   CHECK_ROCBLAS_ERROR(rocblas_get_stream(handle, &stream));
109 | 
110 |   dim3 grid_size((M + TILE_DIM - 1) / TILE_DIM, (N + TILE_DIM - 1) / TILE_DIM);
111 |   dim3 block_size(TILE_DIM, BLOCK_ROWS);
112 |   dlatcpy<<<grid_size, block_size, 0, stream>>>(M, N, A, LDA, B, LDB);
113 |   CHECK_HIP_ERROR(hipGetLastError());
114 | }
115 | 
--------------------------------------------------------------------------------
/src/auxil/HPL_fprintf.cpp:
--------------------------------------------------------------------------------
1 | /* ---------------------------------------------------------------------
2 | * -- High Performance Computing Linpack Benchmark (HPL)
3 | *    HPL - 2.2 - February 24, 2016
4 | *    Antoine P. Petitet
5 | *    University of Tennessee, Knoxville
6 | *    Innovative Computing Laboratory
7 | *    (C) Copyright 2000-2008 All Rights Reserved
8 | *
9 | *    Modified by: Noel Chalmers
10 | *    (C) 2018-2025 Advanced Micro Devices, Inc.
11 | *    See the rocHPL/LICENCE file for details.
12 | *
13 | * SPDX-License-Identifier: (BSD-3-Clause)
14 | * ---------------------------------------------------------------------
15 | */
16 | 
17 | #include "hpl.hpp"
18 | 
19 | void HPL_fprintf(FILE* STREAM, const char* FORM, ...) {
20 | /*
21 |  * Purpose
22 |  * =======
23 |  *
24 |  * HPL_fprintf is a wrapper around fprintf flushing the output stream.
25 |  *
26 |  *
27 |  * Arguments
28 |  * =========
29 |  *
30 |  * STREAM  (local input)                 FILE *
31 |  *         On entry, STREAM specifies the output stream.
32 |  *
33 |  * FORM    (local input)                 const char *
34 |  *         On entry, FORM specifies the format, i.e., how the subsequent
35 |  *         arguments are converted for output.
36 |  *
37 |  *         (local input)                 ...
38 |  *         On entry, ... is the list of arguments to be printed within
39 |  *         the format string.
40 |  *
41 |  * ---------------------------------------------------------------------
42 |  */
43 | 
44 |   va_list argptr;
45 |   char    cline[256];
46 | 
47 |   va_start(argptr, FORM);
48 |   (void)vsprintf(cline, FORM, argptr);
49 |   va_end(argptr);
50 | 
51 |   (void)fprintf(STREAM, "%s", cline);
52 |   (void)fflush(STREAM);
53 | }
54 | 
--------------------------------------------------------------------------------
/src/auxil/HPL_warn.cpp:
--------------------------------------------------------------------------------
1 | /* ---------------------------------------------------------------------
2 | * -- High Performance Computing Linpack Benchmark (HPL)
3 | *    HPL - 2.2 - February 24, 2016
4 | *    Antoine P. Petitet
5 | *    University of Tennessee, Knoxville
6 | *    Innovative Computing Laboratory
7 | *    (C) Copyright 2000-2008 All Rights Reserved
8 | *
9 | *    Modified by: Noel Chalmers
10 | *    (C) 2018-2025 Advanced Micro Devices, Inc.
11 | *    See the rocHPL/LICENCE file for details.
12 | *
13 | * SPDX-License-Identifier: (BSD-3-Clause)
14 | * ---------------------------------------------------------------------
15 | */
16 | 
17 | #include "hpl.hpp"
18 | 
19 | void HPL_warn(FILE* STREAM,
20 |               int LINE,
21 |               const char* SRNAME,
22 |               const char* FORM,
23 |               ...) {
24 | /*
25 |  * Purpose
26 |  * =======
27 |  *
28 |  * HPL_warn displays an error message.
29 |  *
30 |  *
31 |  * Arguments
32 |  * =========
33 |  *
34 |  * STREAM  (local input)                 FILE *
35 |  *         On entry, STREAM specifies the output stream.
36 |  *
37 |  * LINE    (local input)                 int
38 |  *         On entry, LINE specifies the line number in the file where
39 |  *         the error has occurred.
When LINE is not a positive line 40 | * number, it is ignored. 41 | * 42 | * SRNAME (local input) const char * 43 | * On entry, SRNAME should be the name of the routine calling 44 | * this error handler. 45 | * 46 | * FORM (local input) const char * 47 | * On entry, FORM specifies the format, i.e., how the subsequent 48 | * arguments are converted for output. 49 | * 50 | * (local input) ... 51 | * On entry, ... is the list of arguments to be printed within 52 | * the format string. 53 | * 54 | * --------------------------------------------------------------------- 55 | */ 56 | 57 | va_list argptr; 58 | char cline[128]; 59 | 60 | va_start(argptr, FORM); 61 | (void)vsprintf(cline, FORM, argptr); 62 | va_end(argptr); 63 | /* 64 | * Display an error message 65 | */ 66 | if(LINE <= 0) 67 | HPL_fprintf(STREAM, 68 | "%s %s:\n>>> %s <<<\n\n", 69 | "HPL ERROR in function", 70 | SRNAME, 71 | cline); 72 | else 73 | HPL_fprintf(STREAM, 74 | "%s %d %s %s:\n>>> %s <<<\n\n", 75 | "HPL ERROR on line", 76 | LINE, 77 | "of function", 78 | SRNAME, 79 | cline); 80 | } 81 | -------------------------------------------------------------------------------- /src/comm/HPL_all_reduce.cpp: -------------------------------------------------------------------------------- 1 | /* --------------------------------------------------------------------- 2 | * -- High Performance Computing Linpack Benchmark (HPL) 3 | * HPL - 2.2 - February 24, 2016 4 | * Antoine P. Petitet 5 | * University of Tennessee, Knoxville 6 | * Innovative Computing Laboratory 7 | * (C) Copyright 2000-2008 All Rights Reserved 8 | * 9 | * Modified by: Noel Chalmers 10 | * (C) 2018-2025 Advanced Micro Devices, Inc. 11 | * See the rocHPL/LICENCE file for details. 12 | * 13 | * SPDX-License-Identifier: (BSD-3-Clause) 14 | * --------------------------------------------------------------------- 15 | */ 16 | 17 | #include "hpl.hpp" 18 | 19 | int HPL_all_reduce(void* BUFFER, 20 | const int COUNT, 21 | const HPL_T_TYPE DTYPE, 22 | const HPL_T_OP OP, 23 | MPI_Comm COMM) { 24 | /* 25 | * Purpose 26 | * ======= 27 | * 28 | * HPL_all_reduce performs a global reduce operation across all 29 | * processes of a group leaving the results on all processes. 30 | * 31 | * Arguments 32 | * ========= 33 | * 34 | * BUFFER (local input/global output) void * 35 | * On entry, BUFFER points to the buffer to be combined. On 36 | * exit, this array contains the combined data and is identical 37 | * on all processes in the group. 38 | * 39 | * COUNT (global input) const int 40 | * On entry, COUNT indicates the number of entries in BUFFER. 41 | * COUNT must be at least zero. 42 | * 43 | * DTYPE (global input) const HPL_T_TYPE 44 | * On entry, DTYPE specifies the type of the buffers operands. 45 | * 46 | * OP (global input) const HPL_T_OP 47 | * On entry, OP is a pointer to the local combine function. 48 | * 49 | * COMM (global/local input) MPI_Comm 50 | * The MPI communicator identifying the process collection. 51 | * 52 | * --------------------------------------------------------------------- 53 | */ 54 | 55 | int ierr = MPI_Allreduce( 56 | MPI_IN_PLACE, BUFFER, COUNT, HPL_2_MPI_TYPE(DTYPE), OP, COMM); 57 | 58 | return ((ierr == MPI_SUCCESS ? 
HPL_SUCCESS : HPL_FAILURE));
59 | }
60 | 
--------------------------------------------------------------------------------
/src/comm/HPL_allgatherv.cpp:
--------------------------------------------------------------------------------
1 | /* ---------------------------------------------------------------------
2 | * -- High Performance Computing Linpack Benchmark (HPL)
3 | *    HPL - 2.2 - February 24, 2016
4 | *    Antoine P. Petitet
5 | *    University of Tennessee, Knoxville
6 | *    Innovative Computing Laboratory
7 | *    (C) Copyright 2000-2008 All Rights Reserved
8 | *
9 | *    Modified by: Noel Chalmers
10 | *    (C) 2018-2025 Advanced Micro Devices, Inc.
11 | *    See the rocHPL/LICENCE file for details.
12 | *
13 | * SPDX-License-Identifier: (BSD-3-Clause)
14 | * ---------------------------------------------------------------------
15 | */
16 | 
17 | #include "hpl.hpp"
18 | 
19 | int HPL_allgatherv(double* BUF,
20 |                    const int SCOUNT,
21 |                    const int* RCOUNT,
22 |                    const int* DISPL,
23 |                    MPI_Comm COMM) {
24 | /*
25 |  * Purpose
26 |  * =======
27 |  *
28 |  * HPL_allgatherv is a simple wrapper around an in-place MPI_Allgatherv.
29 |  * Its main purpose is to allow for some experimentation / tuning
30 |  * of this simple routine. Successful completion is indicated by
31 |  * the returned error code HPL_SUCCESS.
32 |  *
33 |  * Arguments
34 |  * =========
35 |  *
36 |  * BUF     (local input/output)          double *
37 |  *         On entry, BUF specifies the starting address of the buffer
38 |  *         to be gathered in place.
39 |  *
40 |  * SCOUNT  (local input)                 const int
41 |  *         On entry, SCOUNT specifies the number of double precision
42 |  *         entries in BUF that this process contributes to the gather.
43 |  *         SCOUNT must be at least zero.
44 |  *
45 |  * RCOUNT  (local input)                 const int *
46 |  *         On entry, RCOUNT is an array of length SIZE specifying
47 |  *         the number of double precision entries in BUF to receive from
48 |  *         each process.
49 |  *
50 |  * DISPL   (local input)                 const int *
51 |  *         On entry, DISPL is an array of length SIZE specifying the
52 |  *         displacement (relative to BUF) at which to place the incoming
53 |  *         data from each process.
54 |  *
55 |  * COMM    (local input)                 MPI_Comm
56 |  *         The MPI communicator identifying the communication space.
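 *
 * Editorial worked example of the ring schedule below (SIZE = 4, blocks
 * numbered by owning rank): in step k, ranks pair up alternately as
 * (0,1)(2,3) and then (1,2)(3,0), and each pair exchanges the block it
 * acquired floor(k/2) steps earlier. Rank 0 therefore starts with block 0,
 * gains block 1 at k = 0, block 3 at k = 1, and block 2 at k = 2 -- after
 * SIZE - 1 = 3 steps every rank holds all four blocks.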
57 |  *
58 |  * ---------------------------------------------------------------------
59 |  */
60 | 
61 |   HPL_TracingPush("HPL_Allgatherv");
62 | 
63 | #ifdef HPL_USE_COLLECTIVES
64 | 
65 |   int ierr = MPI_Allgatherv(
66 |       MPI_IN_PLACE, SCOUNT, MPI_DOUBLE, BUF, RCOUNT, DISPL, MPI_DOUBLE, COMM);
67 | 
68 | #else
69 | 
70 |   int rank, size, ierr = MPI_SUCCESS;
71 |   MPI_Comm_rank(COMM, &rank);
72 |   MPI_Comm_size(COMM, &size);
73 | 
74 |   /*
75 |    * Ring exchange
76 |    */
77 |   const int npm1 = size - 1;
78 |   const int prev = MModSub1(rank, size);
79 |   const int next = MModAdd1(rank, size);
80 | 
81 |   const int tag = 0;
82 | 
83 |   for(int k = 0; k < npm1; k++) {
84 |     MPI_Request request;
85 |     MPI_Status  status;
86 |     const int   l = (int)((unsigned int)(k) >> 1);
87 | 
88 |     int il, lengthS, lengthR, partner, ibufS, ibufR;
89 |     if(((rank + k) & 1) != 0) {
90 |       il      = MModAdd(rank, l, size);
91 |       ibufS   = DISPL[il];
92 |       lengthS = RCOUNT[il];
93 |       il      = MModSub(rank, l + 1, size);
94 |       ibufR   = DISPL[il];
95 |       lengthR = RCOUNT[il];
96 |       partner = prev;
97 |     } else {
98 |       il      = MModSub(rank, l, size);
99 |       ibufS   = DISPL[il];
100 |       lengthS = RCOUNT[il];
101 |       il      = MModAdd(rank, l + 1, size);
102 |       ibufR   = DISPL[il];
103 |       lengthR = RCOUNT[il];
104 |       partner = next;
105 |     }
106 | 
107 |     if(lengthR > 0) {
108 |       if(ierr == MPI_SUCCESS)
109 |         ierr = MPI_Irecv(
110 |             BUF + ibufR, lengthR, MPI_DOUBLE, partner, tag, COMM, &request);
111 |     }
112 | 
113 |     if(lengthS > 0) {
114 |       if(ierr == MPI_SUCCESS)
115 |         ierr = MPI_Send(BUF + ibufS, lengthS, MPI_DOUBLE, partner, tag, COMM);
116 |     }
117 | 
118 |     if(lengthR > 0) {
119 |       if(ierr == MPI_SUCCESS) ierr = MPI_Wait(&request, &status);
120 |     }
121 |   }
122 | 
123 | #endif
124 | 
125 |   HPL_TracingPop();
126 | 
127 |   return ((ierr == MPI_SUCCESS ? HPL_SUCCESS : HPL_FAILURE));
128 | }
129 | 
--------------------------------------------------------------------------------
/src/comm/HPL_barrier.cpp:
--------------------------------------------------------------------------------
1 | /* ---------------------------------------------------------------------
2 | * -- High Performance Computing Linpack Benchmark (HPL)
3 | *    HPL - 2.2 - February 24, 2016
4 | *    Antoine P. Petitet
5 | *    University of Tennessee, Knoxville
6 | *    Innovative Computing Laboratory
7 | *    (C) Copyright 2000-2008 All Rights Reserved
8 | *
9 | *    Modified by: Noel Chalmers
10 | *    (C) 2018-2025 Advanced Micro Devices, Inc.
11 | *    See the rocHPL/LICENCE file for details.
12 | *
13 | * SPDX-License-Identifier: (BSD-3-Clause)
14 | * ---------------------------------------------------------------------
15 | */
16 | 
17 | #include "hpl.hpp"
18 | 
19 | int HPL_barrier(MPI_Comm COMM) {
20 | /*
21 |  * Purpose
22 |  * =======
23 |  *
24 |  * HPL_barrier blocks the caller until all process members have called it.
25 |  * The call returns at any process only after all group members have
26 |  * entered the call.
27 |  *
28 |  * Arguments
29 |  * =========
30 |  *
31 |  * COMM    (global/local input)          MPI_Comm
32 |  *         The MPI communicator identifying the process collection.
33 |  *
34 |  * ---------------------------------------------------------------------
35 |  */
36 | 
37 |   int ierr = MPI_Barrier(COMM);
38 | 
39 |   return ((ierr == MPI_SUCCESS ? 
HPL_SUCCESS : HPL_FAILURE)); 40 | } 41 | -------------------------------------------------------------------------------- /src/comm/HPL_bcast.cpp: -------------------------------------------------------------------------------- 1 | /* --------------------------------------------------------------------- 2 | * -- High Performance Computing Linpack Benchmark (HPL) 3 | * HPL - 2.2 - February 24, 2016 4 | * Antoine P. Petitet 5 | * University of Tennessee, Knoxville 6 | * Innovative Computing Laboratory 7 | * (C) Copyright 2000-2008 All Rights Reserved 8 | * 9 | * Modified by: Noel Chalmers 10 | * (C) 2018-2025 Advanced Micro Devices, Inc. 11 | * See the rocHPL/LICENCE file for details. 12 | * 13 | * SPDX-License-Identifier: (BSD-3-Clause) 14 | * --------------------------------------------------------------------- 15 | */ 16 | 17 | #include "hpl.hpp" 18 | 19 | int HPL_bcast(double* SBUF, 20 | int SCOUNT, 21 | int ROOT, 22 | MPI_Comm COMM, 23 | HPL_T_TOP top) { 24 | /* 25 | * Purpose 26 | * ======= 27 | * 28 | * HPL_bcast is a simple wrapper around MPI_Bcast. Its main purpose is 29 | * to allow for some experimentation / tuning of this simple routine. 30 | * Successful completion is indicated by the returned error code 31 | * HPL_SUCCESS. In the case of messages of length less than or equal to 32 | * zero, this function returns immediately. 33 | * 34 | * Arguments 35 | * ========= 36 | * 37 | * SBUF (local input) double * 38 | * On entry, SBUF specifies the starting address of buffer to be 39 | * broadcast. 40 | * 41 | * SCOUNT (local input) int 42 | * On entry, SCOUNT specifies the number of double precision 43 | * entries in SBUF. SCOUNT must be at least zero. 44 | * 45 | * ROOT (local input) int 46 | * On entry, ROOT specifies the rank of the origin process in 47 | * the communication space defined by COMM. 48 | * 49 | * COMM (local input) MPI_Comm 50 | * The MPI communicator identifying the communication space. 51 | * 52 | * --------------------------------------------------------------------- 53 | */ 54 | 55 | if(SCOUNT <= 0) return (HPL_SUCCESS); 56 | 57 | int ierr; 58 | 59 | HPL_TracingPush("HPL_Bcast"); 60 | 61 | #ifdef HPL_USE_COLLECTIVES 62 | 63 | ierr = MPI_Bcast(SBUF, SCOUNT, MPI_DOUBLE, ROOT, COMM); 64 | 65 | #else 66 | 67 | switch(top) { 68 | case HPL_1RING_M: ierr = HPL_bcast_1rinM(SBUF, SCOUNT, ROOT, COMM); break; 69 | case HPL_1RING: ierr = HPL_bcast_1ring(SBUF, SCOUNT, ROOT, COMM); break; 70 | case HPL_2RING_M: ierr = HPL_bcast_2rinM(SBUF, SCOUNT, ROOT, COMM); break; 71 | case HPL_2RING: ierr = HPL_bcast_2ring(SBUF, SCOUNT, ROOT, COMM); break; 72 | case HPL_BLONG_M: ierr = HPL_bcast_blonM(SBUF, SCOUNT, ROOT, COMM); break; 73 | case HPL_BLONG: ierr = HPL_bcast_blong(SBUF, SCOUNT, ROOT, COMM); break; 74 | default: ierr = HPL_FAILURE; 75 | } 76 | 77 | #endif 78 | 79 | HPL_TracingPop(); 80 | 81 | return ((ierr == MPI_SUCCESS ? HPL_SUCCESS : HPL_FAILURE)); 82 | } 83 | -------------------------------------------------------------------------------- /src/comm/HPL_bcast_1rinM.cpp: -------------------------------------------------------------------------------- 1 | /* --------------------------------------------------------------------- 2 | * -- High Performance Computing Linpack Benchmark (HPL) 3 | * HPL - 2.2 - February 24, 2016 4 | * Antoine P. Petitet 5 | * University of Tennessee, Knoxville 6 | * Innovative Computing Laboratory 7 | * (C) Copyright 2000-2008 All Rights Reserved 8 | * 9 | * Modified by: Noel Chalmers 10 | * (C) 2018-2025 Advanced Micro Devices, Inc. 
11 | * See the rocHPL/LICENCE file for details. 12 | * 13 | * SPDX-License-Identifier: (BSD-3-Clause) 14 | * --------------------------------------------------------------------- 15 | */ 16 | 17 | #include "hpl.hpp" 18 | 19 | int HPL_bcast_1rinM(double* SBUF, int SCOUNT, int ROOT, MPI_Comm COMM) { 20 | 21 | int rank, size; 22 | MPI_Comm_rank(COMM, &rank); 23 | MPI_Comm_size(COMM, &size); 24 | 25 | if(size <= 1) return (MPI_SUCCESS); 26 | 27 | /*Root immediately sends to ROOT+1*/ 28 | if(rank == ROOT) { 29 | MPI_Send(SBUF, SCOUNT, MPI_DOUBLE, MModAdd1(ROOT, size), ROOT, COMM); 30 | } else if(rank == MModAdd1(ROOT, size)) { 31 | MPI_Recv(SBUF, SCOUNT, MPI_DOUBLE, ROOT, ROOT, COMM, MPI_STATUS_IGNORE); 32 | return MPI_SUCCESS; 33 | } 34 | 35 | if(size == 2) return (MPI_SUCCESS); 36 | 37 | /*One ring exchange to rule them all*/ 38 | int chunk_size = 512 * 512; // 2MB 39 | 40 | chunk_size = std::min(chunk_size, SCOUNT); 41 | 42 | MPI_Request request[2]; 43 | 44 | request[0] = MPI_REQUEST_NULL; 45 | request[1] = MPI_REQUEST_NULL; 46 | 47 | const int Nchunks = (SCOUNT + chunk_size - 1) / chunk_size; 48 | 49 | const int tag = rank; 50 | const int next = 51 | (rank == ROOT) ? MModAdd(ROOT, 2, size) : MModAdd1(rank, size); 52 | const int prev = 53 | (rank == MModAdd(ROOT, 2, size)) ? ROOT : MModSub1(rank, size); 54 | 55 | double* RBUF = SBUF; 56 | 57 | /*Shift to ROOT=0*/ 58 | rank = MModSub(rank, ROOT, size); 59 | 60 | int Nsend = (rank == size - 1) ? 0 : SCOUNT; 61 | int Nrecv = (rank == 0) ? 0 : SCOUNT; 62 | 63 | /*Recv from left*/ 64 | int Nr = std::min(Nrecv, chunk_size); 65 | if(Nr > 0) { MPI_Irecv(RBUF, Nr, MPI_DOUBLE, prev, prev, COMM, request + 0); } 66 | 67 | /*Send to right if there is data present to send*/ 68 | int Ns = std::min(Nsend - Nrecv, chunk_size); 69 | if(Ns > 0) { MPI_Isend(SBUF, Ns, MPI_DOUBLE, next, tag, COMM, request + 1); } 70 | 71 | while(Nsend > 0 || Nrecv > 0) { 72 | int index = -1; 73 | MPI_Waitany(2, request, &index, MPI_STATUSES_IGNORE); 74 | 75 | if(index == 0) { /*Recv'd from left*/ 76 | /*If we're waiting on this recv in order to send, send now*/ 77 | if(Nrecv == Nsend) { 78 | Ns = Nr; 79 | MPI_Isend(SBUF, Ns, MPI_DOUBLE, next, tag, COMM, request + 1); 80 | } 81 | 82 | /*Count the recv'd amounts*/ 83 | Nrecv -= Nr; 84 | RBUF += Nr; 85 | 86 | /*Post next recv if needed*/ 87 | Nr = std::min(Nrecv, chunk_size); 88 | if(Nr > 0) { 89 | MPI_Irecv(RBUF, Nr, MPI_DOUBLE, prev, prev, COMM, request + 0); 90 | } else { 91 | request[0] = MPI_REQUEST_NULL; 92 | } 93 | 94 | } else if(index == 1) { /*Sent to right */ 95 | Nsend -= Ns; 96 | SBUF += Ns; 97 | 98 | /*Send to right if there is data present to send*/ 99 | Ns = std::min(Nsend - Nrecv, chunk_size); 100 | if(Ns > 0) { 101 | MPI_Isend(SBUF, Ns, MPI_DOUBLE, next, tag, COMM, request + 1); 102 | } else { 103 | request[1] = MPI_REQUEST_NULL; 104 | } 105 | } 106 | } 107 | 108 | return MPI_SUCCESS; 109 | } 110 | -------------------------------------------------------------------------------- /src/comm/HPL_bcast_1ring.cpp: -------------------------------------------------------------------------------- 1 | /* --------------------------------------------------------------------- 2 | * -- High Performance Computing Linpack Benchmark (HPL) 3 | * HPL - 2.2 - February 24, 2016 4 | * Antoine P. Petitet 5 | * University of Tennessee, Knoxville 6 | * Innovative Computing Laboratory 7 | * (C) Copyright 2000-2008 All Rights Reserved 8 | * 9 | * Modified by: Noel Chalmers 10 | * (C) 2018-2025 Advanced Micro Devices, Inc. 
11 | * See the rocHPL/LICENCE file for details. 12 | * 13 | * SPDX-License-Identifier: (BSD-3-Clause) 14 | * --------------------------------------------------------------------- 15 | */ 16 | 17 | #include "hpl.hpp" 18 | 19 | int HPL_bcast_1ring(double* SBUF, int SCOUNT, int ROOT, MPI_Comm COMM) { 20 | 21 | int rank, size; 22 | MPI_Comm_rank(COMM, &rank); 23 | MPI_Comm_size(COMM, &size); 24 | 25 | if(size <= 1) return (MPI_SUCCESS); 26 | 27 | /*One ring exchange to rule them all*/ 28 | int chunk_size = 512 * 512; // 2MB 29 | // int chunk_size = 64 * 512; // 256KB 30 | 31 | chunk_size = std::min(chunk_size, SCOUNT); 32 | 33 | MPI_Request request[2]; 34 | 35 | request[0] = MPI_REQUEST_NULL; 36 | request[1] = MPI_REQUEST_NULL; 37 | 38 | const int Nchunks = (SCOUNT + chunk_size - 1) / chunk_size; 39 | 40 | const int tag = rank; 41 | const int next = MModAdd1(rank, size); 42 | const int prev = MModSub1(rank, size); 43 | 44 | /*Mid point of message*/ 45 | double* RBUF = SBUF; 46 | 47 | /*Shift to ROOT=0*/ 48 | rank = MModSub(rank, ROOT, size); 49 | 50 | int Nsend = (rank == size - 1) ? 0 : SCOUNT; 51 | int Nrecv = (rank == 0) ? 0 : SCOUNT; 52 | 53 | /*Recv from left*/ 54 | int Nr = std::min(Nrecv, chunk_size); 55 | if(Nr > 0) { MPI_Irecv(RBUF, Nr, MPI_DOUBLE, prev, prev, COMM, request + 0); } 56 | 57 | /*Send to right if there is data present to send*/ 58 | int Ns = std::min(Nsend - Nrecv, chunk_size); 59 | if(Ns > 0) { MPI_Isend(SBUF, Ns, MPI_DOUBLE, next, tag, COMM, request + 1); } 60 | 61 | while(Nsend > 0 || Nrecv > 0) { 62 | int index = -1; 63 | MPI_Waitany(2, request, &index, MPI_STATUSES_IGNORE); 64 | 65 | if(index == 0) { /*Recv'd from left*/ 66 | /*If we're waiting on this recv in order to send, send now*/ 67 | if(Nrecv == Nsend) { 68 | Ns = Nr; 69 | MPI_Isend(SBUF, Ns, MPI_DOUBLE, next, tag, COMM, request + 1); 70 | } 71 | 72 | /*Count the recv'd amounts*/ 73 | Nrecv -= Nr; 74 | RBUF += Nr; 75 | 76 | /*Post next recv if needed*/ 77 | Nr = std::min(Nrecv, chunk_size); 78 | if(Nr > 0) { 79 | MPI_Irecv(RBUF, Nr, MPI_DOUBLE, prev, prev, COMM, request + 0); 80 | } else { 81 | request[0] = MPI_REQUEST_NULL; 82 | } 83 | 84 | } else if(index == 1) { /*Sent to right */ 85 | Nsend -= Ns; 86 | SBUF += Ns; 87 | 88 | /*Send to right if there is data present to send*/ 89 | Ns = std::min(Nsend - Nrecv, chunk_size); 90 | if(Ns > 0) { 91 | MPI_Isend(SBUF, Ns, MPI_DOUBLE, next, tag, COMM, request + 1); 92 | } else { 93 | request[1] = MPI_REQUEST_NULL; 94 | } 95 | } 96 | } 97 | 98 | return MPI_SUCCESS; 99 | } 100 | -------------------------------------------------------------------------------- /src/comm/HPL_bcast_2rinM.cpp: -------------------------------------------------------------------------------- 1 | /* --------------------------------------------------------------------- 2 | * -- High Performance Computing Linpack Benchmark (HPL) 3 | * HPL - 2.2 - February 24, 2016 4 | * Antoine P. Petitet 5 | * University of Tennessee, Knoxville 6 | * Innovative Computing Laboratory 7 | * (C) Copyright 2000-2008 All Rights Reserved 8 | * 9 | * Modified by: Noel Chalmers 10 | * (C) 2018-2025 Advanced Micro Devices, Inc. 11 | * See the rocHPL/LICENCE file for details. 
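 *
 * Editorial aside on the chunking used by these ring broadcasts: the
 * chunk_size of 512 * 512 doubles is 262,144 * 8 bytes = 2 MiB, as the
 * inline comments note. Splitting SCOUNT into Nchunks pieces pipelines the
 * ring, so the last rank finishes after roughly (Nchunks + P - 2) chunk
 * transfers instead of the (P - 1) full-message hops of an unpipelined
 * ring -- a large win once Nchunks is much greater than P.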
12 | * 13 | * SPDX-License-Identifier: (BSD-3-Clause) 14 | * --------------------------------------------------------------------- 15 | */ 16 | 17 | #include "hpl.hpp" 18 | 19 | int HPL_bcast_2rinM(double* SBUF, int SCOUNT, int ROOT, MPI_Comm COMM) { 20 | 21 | int rank, size; 22 | MPI_Comm_rank(COMM, &rank); 23 | MPI_Comm_size(COMM, &size); 24 | 25 | if(size <= 1) return (MPI_SUCCESS); 26 | 27 | /*Root immediately sends to ROOT+1*/ 28 | if(rank == ROOT) { 29 | MPI_Send(SBUF, SCOUNT, MPI_DOUBLE, MModAdd1(ROOT, size), ROOT, COMM); 30 | } else if(rank == MModAdd1(ROOT, size)) { 31 | MPI_Recv(SBUF, SCOUNT, MPI_DOUBLE, ROOT, ROOT, COMM, MPI_STATUS_IGNORE); 32 | return MPI_SUCCESS; 33 | } 34 | 35 | if(size == 2) return (MPI_SUCCESS); 36 | 37 | /*One ring exchange to rule them all*/ 38 | int chunk_size = 512 * 512; // 2MB 39 | 40 | chunk_size = std::min(chunk_size, SCOUNT); 41 | 42 | MPI_Request request[4]; 43 | 44 | request[0] = MPI_REQUEST_NULL; 45 | request[1] = MPI_REQUEST_NULL; 46 | request[2] = MPI_REQUEST_NULL; 47 | request[3] = MPI_REQUEST_NULL; 48 | 49 | const int Nchunks = (SCOUNT + chunk_size - 1) / chunk_size; 50 | const int NchunksHalf = (Nchunks + 1) / 2; 51 | 52 | const int tag = rank; 53 | const int next = 54 | (rank == ROOT) ? MModAdd(ROOT, 2, size) : MModAdd1(rank, size); 55 | const int prev = 56 | (rank == MModAdd(ROOT, 2, size)) ? ROOT : MModSub1(rank, size); 57 | 58 | /*Mid point of message*/ 59 | double* SBUF0 = SBUF; 60 | double* SBUF1 = SBUF + NchunksHalf * chunk_size; 61 | 62 | double* RBUF0 = SBUF0; 63 | double* RBUF1 = SBUF1; 64 | 65 | /*Shift to ROOT=0*/ 66 | rank = MModSub(rank, ROOT, size); 67 | 68 | int Nsend0 = (rank == size - 1) ? 0 : NchunksHalf * chunk_size; 69 | int Nsend1 = (rank == 2) ? 0 : SCOUNT - NchunksHalf * chunk_size; 70 | 71 | int Nrecv0 = (rank == 0) ? 0 : NchunksHalf * chunk_size; 72 | int Nrecv1 = (rank == 0) ? 
0 : SCOUNT - NchunksHalf * chunk_size; 73 | 74 | /*Recv from left*/ 75 | int Nr0 = std::min(Nrecv0, chunk_size); 76 | if(Nr0 > 0) { 77 | MPI_Irecv(RBUF0, Nr0, MPI_DOUBLE, prev, prev, COMM, request + 0); 78 | } 79 | 80 | /*Recv from right*/ 81 | int Nr1 = std::min(Nrecv1, chunk_size); 82 | if(Nr1 > 0) { 83 | MPI_Irecv(RBUF1, Nr1, MPI_DOUBLE, next, next, COMM, request + 1); 84 | } 85 | 86 | /*Send to right if there is data present to send*/ 87 | int Ns0 = std::min(Nsend0 - Nrecv0, chunk_size); 88 | if(Ns0 > 0) { 89 | MPI_Isend(SBUF0, Ns0, MPI_DOUBLE, next, tag, COMM, request + 2); 90 | } 91 | 92 | /*Send to left if there is data present to send*/ 93 | int Ns1 = std::min(Nsend1 - Nrecv1, chunk_size); 94 | if(Ns1 > 0) { 95 | MPI_Isend(SBUF1, Ns1, MPI_DOUBLE, prev, tag, COMM, request + 3); 96 | } 97 | 98 | while(Nsend0 > 0 || Nsend1 > 0 || Nrecv0 > 0 || Nrecv1 > 0) { 99 | int index = -1; 100 | MPI_Waitany(4, request, &index, MPI_STATUSES_IGNORE); 101 | 102 | if(index == 0) { /*Recv'd from left*/ 103 | /*If we're waiting on this recv in order to send, send now*/ 104 | if(Nrecv0 == Nsend0) { 105 | Ns0 = Nr0; 106 | MPI_Isend(SBUF0, Ns0, MPI_DOUBLE, next, tag, COMM, request + 2); 107 | } 108 | 109 | /*Count the recv'd amounts*/ 110 | Nrecv0 -= Nr0; 111 | RBUF0 += Nr0; 112 | 113 | /*Post next recv if needed*/ 114 | Nr0 = std::min(Nrecv0, chunk_size); 115 | if(Nr0 > 0) { 116 | MPI_Irecv(RBUF0, Nr0, MPI_DOUBLE, prev, prev, COMM, request + 0); 117 | } else { 118 | request[0] = MPI_REQUEST_NULL; 119 | } 120 | 121 | } else if(index == 1) { /*Recv'd from right*/ 122 | /*If we're waiting on this recv in order to send, send now*/ 123 | if(Nrecv1 == Nsend1) { 124 | Ns1 = Nr1; 125 | MPI_Isend(SBUF1, Ns1, MPI_DOUBLE, prev, tag, COMM, request + 3); 126 | } 127 | 128 | /*Count the recv'd amounts*/ 129 | Nrecv1 -= Nr1; 130 | RBUF1 += Nr1; 131 | 132 | /*Post next recv if needed*/ 133 | Nr1 = std::min(Nrecv1, chunk_size); 134 | if(Nr1 > 0) { 135 | MPI_Irecv(RBUF1, Nr1, MPI_DOUBLE, next, next, COMM, request + 1); 136 | } else { 137 | request[1] = MPI_REQUEST_NULL; 138 | } 139 | 140 | } else if(index == 2) { /*Sent to right */ 141 | Nsend0 -= Ns0; 142 | SBUF0 += Ns0; 143 | 144 | /*Send to right if there is data present to send*/ 145 | Ns0 = std::min(Nsend0 - Nrecv0, chunk_size); 146 | if(Ns0 > 0) { 147 | MPI_Isend(SBUF0, Ns0, MPI_DOUBLE, next, tag, COMM, request + 2); 148 | } else { 149 | request[2] = MPI_REQUEST_NULL; 150 | } 151 | } else { /*index==3, Sent to left */ 152 | Nsend1 -= Ns1; 153 | SBUF1 += Ns1; 154 | 155 | Ns1 = std::min(Nsend1 - Nrecv1, chunk_size); 156 | if(Ns1 > 0) { 157 | MPI_Isend(SBUF1, Ns1, MPI_DOUBLE, prev, tag, COMM, request + 3); 158 | } else { 159 | request[3] = MPI_REQUEST_NULL; 160 | } 161 | } 162 | } 163 | 164 | return MPI_SUCCESS; 165 | } 166 | -------------------------------------------------------------------------------- /src/comm/HPL_bcast_2ring.cpp: -------------------------------------------------------------------------------- 1 | /* --------------------------------------------------------------------- 2 | * -- High Performance Computing Linpack Benchmark (HPL) 3 | * HPL - 2.2 - February 24, 2016 4 | * Antoine P. Petitet 5 | * University of Tennessee, Knoxville 6 | * Innovative Computing Laboratory 7 | * (C) Copyright 2000-2008 All Rights Reserved 8 | * 9 | * Modified by: Noel Chalmers 10 | * (C) 2018-2025 Advanced Micro Devices, Inc. 11 | * See the rocHPL/LICENCE file for details. 
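 *
 * Editorial aside: the two-ring variants split the buffer at the
 * NchunksHalf * chunk_size boundary and circulate the low half rightward
 * and the high half leftward at the same time. Each rank then receives on
 * two links at once and each direction carries only about SCOUNT / 2
 * doubles, roughly halving the per-link traffic of the one-ring schedule
 * at the cost of twice as many outstanding requests.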
12 | * 13 | * SPDX-License-Identifier: (BSD-3-Clause) 14 | * --------------------------------------------------------------------- 15 | */ 16 | 17 | #include "hpl.hpp" 18 | 19 | int HPL_bcast_2ring(double* SBUF, int SCOUNT, int ROOT, MPI_Comm COMM) { 20 | 21 | int rank, size; 22 | MPI_Comm_rank(COMM, &rank); 23 | MPI_Comm_size(COMM, &size); 24 | 25 | if(size <= 1) return (MPI_SUCCESS); 26 | 27 | /*One ring exchange to rule them all*/ 28 | int chunk_size = 512 * 512; // 2MB 29 | 30 | chunk_size = std::min(chunk_size, SCOUNT); 31 | 32 | MPI_Request request[4]; 33 | 34 | request[0] = MPI_REQUEST_NULL; 35 | request[1] = MPI_REQUEST_NULL; 36 | request[2] = MPI_REQUEST_NULL; 37 | request[3] = MPI_REQUEST_NULL; 38 | 39 | const int Nchunks = (SCOUNT + chunk_size - 1) / chunk_size; 40 | const int NchunksHalf = (Nchunks + 1) / 2; 41 | 42 | const int tag = rank; 43 | const int next = MModAdd1(rank, size); 44 | const int prev = MModSub1(rank, size); 45 | 46 | /*Mid point of message*/ 47 | double* SBUF0 = SBUF; 48 | double* SBUF1 = SBUF + NchunksHalf * chunk_size; 49 | 50 | double* RBUF0 = SBUF0; 51 | double* RBUF1 = SBUF1; 52 | 53 | /*Shift to ROOT=0*/ 54 | rank = MModSub(rank, ROOT, size); 55 | 56 | int Nsend0 = (rank == size - 1) ? 0 : NchunksHalf * chunk_size; 57 | int Nsend1 = (rank == 1) ? 0 : SCOUNT - NchunksHalf * chunk_size; 58 | 59 | int Nrecv0 = (rank == 0) ? 0 : NchunksHalf * chunk_size; 60 | int Nrecv1 = (rank == 0) ? 0 : SCOUNT - NchunksHalf * chunk_size; 61 | 62 | /*Recv from left*/ 63 | int Nr0 = std::min(Nrecv0, chunk_size); 64 | if(Nr0 > 0) { 65 | MPI_Irecv(RBUF0, Nr0, MPI_DOUBLE, prev, prev, COMM, request + 0); 66 | } 67 | 68 | /*Recv from right*/ 69 | int Nr1 = std::min(Nrecv1, chunk_size); 70 | if(Nr1 > 0) { 71 | MPI_Irecv(RBUF1, Nr1, MPI_DOUBLE, next, next, COMM, request + 1); 72 | } 73 | 74 | /*Send to right if there is data present to send*/ 75 | int Ns0 = std::min(Nsend0 - Nrecv0, chunk_size); 76 | if(Ns0 > 0) { 77 | MPI_Isend(SBUF0, Ns0, MPI_DOUBLE, next, tag, COMM, request + 2); 78 | } 79 | 80 | /*Send to left if there is data present to send*/ 81 | int Ns1 = std::min(Nsend1 - Nrecv1, chunk_size); 82 | if(Ns1 > 0) { 83 | MPI_Isend(SBUF1, Ns1, MPI_DOUBLE, prev, tag, COMM, request + 3); 84 | } 85 | 86 | while(Nsend0 > 0 || Nsend1 > 0 || Nrecv0 > 0 || Nrecv1 > 0) { 87 | int index = -1; 88 | MPI_Waitany(4, request, &index, MPI_STATUSES_IGNORE); 89 | 90 | if(index == 0) { /*Recv'd from left*/ 91 | /*If we're waiting on this recv in order to send, send now*/ 92 | if(Nrecv0 == Nsend0) { 93 | Ns0 = Nr0; 94 | MPI_Isend(SBUF0, Ns0, MPI_DOUBLE, next, tag, COMM, request + 2); 95 | } 96 | 97 | /*Count the recv'd amounts*/ 98 | Nrecv0 -= Nr0; 99 | RBUF0 += Nr0; 100 | 101 | /*Post next recv if needed*/ 102 | Nr0 = std::min(Nrecv0, chunk_size); 103 | if(Nr0 > 0) { 104 | MPI_Irecv(RBUF0, Nr0, MPI_DOUBLE, prev, prev, COMM, request + 0); 105 | } else { 106 | request[0] = MPI_REQUEST_NULL; 107 | } 108 | 109 | } else if(index == 1) { /*Recv'd from right*/ 110 | /*If we're waiting on this recv in order to send, send now*/ 111 | if(Nrecv1 == Nsend1) { 112 | Ns1 = Nr1; 113 | MPI_Isend(SBUF1, Ns1, MPI_DOUBLE, prev, tag, COMM, request + 3); 114 | } 115 | 116 | /*Count the recv'd amounts*/ 117 | Nrecv1 -= Nr1; 118 | RBUF1 += Nr1; 119 | 120 | /*Post next recv if needed*/ 121 | Nr1 = std::min(Nrecv1, chunk_size); 122 | if(Nr1 > 0) { 123 | MPI_Irecv(RBUF1, Nr1, MPI_DOUBLE, next, next, COMM, request + 1); 124 | } else { 125 | request[1] = MPI_REQUEST_NULL; 126 | } 127 | 128 | } else if(index == 2) { /*Sent 
to right */ 129 | Nsend0 -= Ns0; 130 | SBUF0 += Ns0; 131 | 132 | /*Send to right if there is data present to send*/ 133 | Ns0 = std::min(Nsend0 - Nrecv0, chunk_size); 134 | if(Ns0 > 0) { 135 | MPI_Isend(SBUF0, Ns0, MPI_DOUBLE, next, tag, COMM, request + 2); 136 | } else { 137 | request[2] = MPI_REQUEST_NULL; 138 | } 139 | } else { /*index==3, Sent to left */ 140 | Nsend1 -= Ns1; 141 | SBUF1 += Ns1; 142 | 143 | Ns1 = std::min(Nsend1 - Nrecv1, chunk_size); 144 | if(Ns1 > 0) { 145 | MPI_Isend(SBUF1, Ns1, MPI_DOUBLE, prev, tag, COMM, request + 3); 146 | } else { 147 | request[3] = MPI_REQUEST_NULL; 148 | } 149 | } 150 | } 151 | 152 | return MPI_SUCCESS; 153 | } 154 | -------------------------------------------------------------------------------- /src/comm/HPL_bcast_blonM.cpp: -------------------------------------------------------------------------------- 1 | /* --------------------------------------------------------------------- 2 | * -- High Performance Computing Linpack Benchmark (HPL) 3 | * HPL - 2.2 - February 24, 2016 4 | * Antoine P. Petitet 5 | * University of Tennessee, Knoxville 6 | * Innovative Computing Laboratory 7 | * (C) Copyright 2000-2008 All Rights Reserved 8 | * 9 | * Modified by: Noel Chalmers 10 | * (C) 2018-2025 Advanced Micro Devices, Inc. 11 | * See the rocHPL/LICENCE file for details. 12 | * 13 | * SPDX-License-Identifier: (BSD-3-Clause) 14 | * --------------------------------------------------------------------- 15 | */ 16 | 17 | #include "hpl.hpp" 18 | 19 | int HPL_bcast_blonM(double* SBUF, int SCOUNT, int ROOT, MPI_Comm COMM) { 20 | 21 | int rank, size; 22 | MPI_Comm_rank(COMM, &rank); 23 | MPI_Comm_size(COMM, &size); 24 | 25 | if(size <= 1) return (MPI_SUCCESS); 26 | 27 | /* 28 | * Cast phase: ROOT process sends to its right neighbor, then spread 29 | * the panel on the other npcol - 2 processes. If I am not the ROOT 30 | * process, probe for message received. If the message is there, then 31 | * receive it. If I am just after the ROOT process, return. Otherwise, 32 | * keep spreading on those npcol - 2 processes. Otherwise, inform the 33 | * caller that the panel has still not been received. 34 | */ 35 | int count, ierr = MPI_SUCCESS, ibuf, ibufR, ibufS, indx, ip2 = 1, k, l, lbuf, 36 | lbufR, lbufS, mask = 1, mydist, mydist2, next, npm1, npm2, partner, 37 | prev; 38 | 39 | const int tag = ROOT; 40 | next = MModAdd1(rank, size); 41 | prev = MModSub1(rank, size); 42 | 43 | if(rank == ROOT) { 44 | if(ierr == MPI_SUCCESS) 45 | ierr = 46 | MPI_Send(SBUF, SCOUNT, MPI_DOUBLE, MModAdd1(rank, size), tag, COMM); 47 | } else if(prev == ROOT) { 48 | if(ierr == MPI_SUCCESS) 49 | ierr = MPI_Recv( 50 | SBUF, SCOUNT, MPI_DOUBLE, ROOT, tag, COMM, MPI_STATUS_IGNORE); 51 | } 52 | /* 53 | * if I am just after the ROOT, exit now. The message receive completed 54 | * successfully, this guy is done. If there are only 2 processes in each 55 | * row of processes, we are done as well. 
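 * As an editorial note, this "modified" (M) variant appears to exist for
 * lookahead: the process just after ROOT is the one that needs the next
 * panel first, so it is handed the entire panel in a single message up
 * front and drops out, rather than waiting for pieces to arrive through
 * the spread/roll phases that the remaining npcol - 2 processes perform
 * below.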
56 | */ 57 | if((prev == ROOT) || (size == 2)) return ierr; 58 | /* 59 | * Otherwise, proceed with broadcast - Spread the panel across process 60 | * columns 61 | */ 62 | npm2 = (npm1 = size - 1) - 1; 63 | 64 | k = npm2; 65 | while(k > 1) { 66 | k >>= 1; 67 | ip2 <<= 1; 68 | mask <<= 1; 69 | mask++; 70 | } 71 | if(rank == ROOT) 72 | mydist2 = (mydist = 0); 73 | else 74 | mydist2 = (mydist = MModSub(rank, ROOT, size) - 1); 75 | 76 | indx = ip2; 77 | count = SCOUNT / npm1; 78 | count = Mmax(count, 1); 79 | 80 | do { 81 | mask ^= ip2; 82 | 83 | if((mydist & mask) == 0) { 84 | lbuf = SCOUNT - (ibuf = indx * count); 85 | if(indx + ip2 < npm1) { 86 | l = ip2 * count; 87 | lbuf = Mmin(lbuf, l); 88 | } 89 | 90 | partner = mydist ^ ip2; 91 | 92 | if((mydist & ip2) != 0) { 93 | partner = MModAdd(ROOT, partner, size); 94 | if(partner != ROOT) partner = MModAdd1(partner, size); 95 | 96 | if(lbuf > 0) { 97 | if(ierr == MPI_SUCCESS) 98 | ierr = MPI_Recv(SBUF + ibuf, 99 | lbuf, 100 | MPI_DOUBLE, 101 | partner, 102 | tag, 103 | COMM, 104 | MPI_STATUS_IGNORE); 105 | } 106 | } else if(partner < npm1) { 107 | partner = MModAdd(ROOT, partner, size); 108 | if(partner != ROOT) partner = MModAdd1(partner, size); 109 | 110 | if(lbuf > 0) { 111 | if(ierr == MPI_SUCCESS) 112 | ierr = MPI_Send(SBUF + ibuf, lbuf, MPI_DOUBLE, partner, tag, COMM); 113 | } 114 | } 115 | } 116 | 117 | if(mydist2 < ip2) { 118 | ip2 >>= 1; 119 | indx -= ip2; 120 | } else { 121 | mydist2 -= ip2; 122 | ip2 >>= 1; 123 | indx += ip2; 124 | } 125 | 126 | } while(ip2 > 0); 127 | /* 128 | * Roll the pieces 129 | */ 130 | if(MModSub1(prev, size) == ROOT) prev = ROOT; 131 | if(rank == ROOT) next = MModAdd1(next, size); 132 | 133 | for(k = 0; k < npm2; k++) { 134 | l = (k >> 1); 135 | /* 136 | * Who is sending to who and how much 137 | */ 138 | if(((mydist + k) & 1) != 0) { 139 | ibufS = (indx = MModAdd(mydist, l, npm1)) * count; 140 | lbufS = (indx == npm2 ? SCOUNT : ibufS + count); 141 | lbufS = Mmin(SCOUNT, lbufS) - ibufS; 142 | lbufS = Mmax(0, lbufS); 143 | 144 | ibufR = (indx = MModSub(mydist, l + 1, npm1)) * count; 145 | lbufR = (indx == npm2 ? SCOUNT : ibufR + count); 146 | lbufR = Mmin(SCOUNT, lbufR) - ibufR; 147 | lbufR = Mmax(0, lbufR); 148 | 149 | partner = prev; 150 | } else { 151 | ibufS = (indx = MModSub(mydist, l, npm1)) * count; 152 | lbufS = (indx == npm2 ? SCOUNT : ibufS + count); 153 | lbufS = Mmin(SCOUNT, lbufS) - ibufS; 154 | lbufS = Mmax(0, lbufS); 155 | 156 | ibufR = (indx = MModAdd(mydist, l + 1, npm1)) * count; 157 | lbufR = (indx == npm2 ? 
SCOUNT : ibufR + count); 158 | lbufR = Mmin(SCOUNT, lbufR) - ibufR; 159 | lbufR = Mmax(0, lbufR); 160 | 161 | partner = next; 162 | } 163 | /* 164 | * Exchange the messages 165 | */ 166 | MPI_Request request; 167 | MPI_Status status; 168 | 169 | if(lbufR > 0) { 170 | if(ierr == MPI_SUCCESS) 171 | ierr = MPI_Irecv( 172 | SBUF + ibufR, lbufR, MPI_DOUBLE, partner, tag, COMM, &request); 173 | } 174 | 175 | if(lbufS > 0) { 176 | if(ierr == MPI_SUCCESS) 177 | ierr = MPI_Send(SBUF + ibufS, lbufS, MPI_DOUBLE, partner, tag, COMM); 178 | } 179 | 180 | if(lbufR > 0) 181 | if(ierr == MPI_SUCCESS) ierr = MPI_Wait(&request, &status); 182 | } 183 | 184 | return ierr; 185 | } 186 | -------------------------------------------------------------------------------- /src/comm/HPL_bcast_blong.cpp: -------------------------------------------------------------------------------- 1 | /* --------------------------------------------------------------------- 2 | * -- High Performance Computing Linpack Benchmark (HPL) 3 | * HPL - 2.2 - February 24, 2016 4 | * Antoine P. Petitet 5 | * University of Tennessee, Knoxville 6 | * Innovative Computing Laboratory 7 | * (C) Copyright 2000-2008 All Rights Reserved 8 | * 9 | * Modified by: Noel Chalmers 10 | * (C) 2018-2025 Advanced Micro Devices, Inc. 11 | * See the rocHPL/LICENCE file for details. 12 | * 13 | * SPDX-License-Identifier: (BSD-3-Clause) 14 | * --------------------------------------------------------------------- 15 | */ 16 | 17 | #include "hpl.hpp" 18 | 19 | int HPL_bcast_blong(double* SBUF, int SCOUNT, int ROOT, MPI_Comm COMM) { 20 | 21 | int rank, size; 22 | MPI_Comm_rank(COMM, &rank); 23 | MPI_Comm_size(COMM, &size); 24 | 25 | if(size <= 1) return (MPI_SUCCESS); 26 | 27 | /* 28 | * Cast phase: If I am the ROOT process, start spreading the panel. If 29 | * I am not the ROOT process, test for message receive completion. If 30 | * the message is there, then receive it, and keep spreading in a 31 | * blocking fashion this time. Otherwise, inform the caller that the 32 | * panel has still not been received. 
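 * Editorial worked example (size = 4, count = SCOUNT / 4): the hypercube
 * spread first moves blocks 2-3 from ROOT to the distance-2 process, then
 * blocks 1 and 3 each move one more step, leaving distance 0 with
 * everything, distance 1 with block 1, distance 2 with blocks 2-3, and
 * distance 3 with block 3. The size - 1 = 3 "roll" steps below then act as
 * a ring allgather that fills in the rest. Total traffic per rank is about
 * 2 * (size - 1) / size * SCOUNT, the classic bandwidth-optimal
 * scatter-plus-allgather ("long") broadcast.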
33 | */ 34 | int count, ierr = MPI_SUCCESS, ibuf, ibufR, ibufS, indx, ip2, k, l, lbuf, 35 | lbufR, lbufS, mask, mydist, mydist2, npm1, partner, next, prev; 36 | 37 | const int tag = 0; 38 | 39 | // ip2 : largest power of two <= size-1; 40 | // mask : ip2 procs hypercube mask; 41 | mask = ip2 = 1; 42 | k = size - 1; 43 | while(k > 1) { 44 | k >>= 1; 45 | ip2 <<= 1; 46 | mask <<= 1; 47 | mask++; 48 | } 49 | 50 | npm1 = size - 1; 51 | mydist2 = (mydist = MModSub(rank, ROOT, size)); 52 | indx = ip2; 53 | count = SCOUNT / size; 54 | count = Mmax(count, 1); 55 | /* 56 | * Spread the panel across process columns 57 | */ 58 | do { 59 | mask ^= ip2; 60 | 61 | if((mydist & mask) == 0) { 62 | lbuf = SCOUNT - (ibuf = indx * count); 63 | if(indx + ip2 < size) { 64 | l = ip2 * count; 65 | lbuf = Mmin(lbuf, l); 66 | } 67 | 68 | partner = mydist ^ ip2; 69 | 70 | if((mydist & ip2) != 0) { 71 | partner = MModAdd(ROOT, partner, size); 72 | 73 | if(lbuf > 0) { 74 | if(ierr == MPI_SUCCESS) 75 | ierr = MPI_Recv(SBUF + ibuf, 76 | lbuf, 77 | MPI_DOUBLE, 78 | partner, 79 | tag, 80 | COMM, 81 | MPI_STATUS_IGNORE); 82 | } 83 | } else if(partner < size) { 84 | partner = MModAdd(ROOT, partner, size); 85 | 86 | if(lbuf > 0) { 87 | if(ierr == MPI_SUCCESS) 88 | ierr = MPI_Send(SBUF + ibuf, lbuf, MPI_DOUBLE, partner, tag, COMM); 89 | } 90 | } 91 | } 92 | 93 | if(mydist2 < ip2) { 94 | ip2 >>= 1; 95 | indx -= ip2; 96 | } else { 97 | mydist2 -= ip2; 98 | ip2 >>= 1; 99 | indx += ip2; 100 | } 101 | 102 | } while(ip2 > 0); 103 | /* 104 | * Roll the pieces 105 | */ 106 | prev = MModSub1(rank, size); 107 | next = MModAdd1(rank, size); 108 | 109 | for(k = 0; k < npm1; k++) { 110 | l = (k >> 1); 111 | /* 112 | * Who is sending to who and how much 113 | */ 114 | if(((mydist + k) & 1) != 0) { 115 | ibufS = (indx = MModAdd(mydist, l, size)) * count; 116 | lbufS = (indx == npm1 ? SCOUNT : ibufS + count); 117 | lbufS = Mmin(SCOUNT, lbufS) - ibufS; 118 | lbufS = Mmax(0, lbufS); 119 | 120 | ibufR = (indx = MModSub(mydist, l + 1, size)) * count; 121 | lbufR = (indx == npm1 ? SCOUNT : ibufR + count); 122 | lbufR = Mmin(SCOUNT, lbufR) - ibufR; 123 | lbufR = Mmax(0, lbufR); 124 | 125 | partner = prev; 126 | } else { 127 | ibufS = (indx = MModSub(mydist, l, size)) * count; 128 | lbufS = (indx == npm1 ? SCOUNT : ibufS + count); 129 | lbufS = Mmin(SCOUNT, lbufS) - ibufS; 130 | lbufS = Mmax(0, lbufS); 131 | 132 | ibufR = (indx = MModAdd(mydist, l + 1, size)) * count; 133 | lbufR = (indx == npm1 ? SCOUNT : ibufR + count); 134 | lbufR = Mmin(SCOUNT, lbufR) - ibufR; 135 | lbufR = Mmax(0, lbufR); 136 | 137 | partner = next; 138 | } 139 | /* 140 | * Exchange the messages 141 | */ 142 | MPI_Request request; 143 | MPI_Status status; 144 | 145 | if(lbufR > 0) { 146 | if(ierr == MPI_SUCCESS) 147 | ierr = MPI_Irecv( 148 | SBUF + ibufR, lbufR, MPI_DOUBLE, partner, tag, COMM, &request); 149 | } 150 | 151 | if(lbufS > 0) { 152 | if(ierr == MPI_SUCCESS) 153 | ierr = MPI_Send(SBUF + ibufS, lbufS, MPI_DOUBLE, partner, tag, COMM); 154 | } 155 | 156 | if(lbufR > 0) 157 | if(ierr == MPI_SUCCESS) ierr = MPI_Wait(&request, &status); 158 | } 159 | 160 | return ierr; 161 | } 162 | -------------------------------------------------------------------------------- /src/comm/HPL_broadcast.cpp: -------------------------------------------------------------------------------- 1 | /* --------------------------------------------------------------------- 2 | * -- High Performance Computing Linpack Benchmark (HPL) 3 | * HPL - 2.2 - February 24, 2016 4 | * Antoine P. 
Petitet 5 | * University of Tennessee, Knoxville 6 | * Innovative Computing Laboratory 7 | * (C) Copyright 2000-2008 All Rights Reserved 8 | * 9 | * Modified by: Noel Chalmers 10 | * (C) 2018-2025 Advanced Micro Devices, Inc. 11 | * See the rocHPL/LICENCE file for details. 12 | * 13 | * SPDX-License-Identifier: (BSD-3-Clause) 14 | * --------------------------------------------------------------------- 15 | */ 16 | 17 | #include "hpl.hpp" 18 | 19 | int HPL_broadcast(void* BUFFER, 20 | const int COUNT, 21 | const HPL_T_TYPE DTYPE, 22 | const int ROOT, 23 | MPI_Comm COMM) { 24 | /* 25 | * Purpose 26 | * ======= 27 | * 28 | * HPL_broadcast broadcasts a message from the process with rank ROOT to 29 | * all processes in the group. 30 | * 31 | * Arguments 32 | * ========= 33 | * 34 | * BUFFER (local input/output) void * 35 | * On entry, BUFFER points to the buffer to be broadcast. On 36 | * exit, this array contains the broadcast data and is identical 37 | * on all processes in the group. 38 | * 39 | * COUNT (global input) const int 40 | * On entry, COUNT indicates the number of entries in BUFFER. 41 | * COUNT must be at least zero. 42 | * 43 | * DTYPE (global input) const HPL_T_TYPE 44 | * On entry, DTYPE specifies the type of the buffers operands. 45 | * 46 | * ROOT (global input) const int 47 | * On entry, ROOT is the coordinate of the source process. 48 | * 49 | * COMM (global/local input) MPI_Comm 50 | * The MPI communicator identifying the process collection. 51 | * 52 | * --------------------------------------------------------------------- 53 | */ 54 | 55 | int ierr = MPI_Bcast(BUFFER, COUNT, HPL_2_MPI_TYPE(DTYPE), ROOT, COMM); 56 | 57 | return ((ierr == MPI_SUCCESS ? HPL_SUCCESS : HPL_FAILURE)); 58 | } 59 | -------------------------------------------------------------------------------- /src/comm/HPL_recv.cpp: -------------------------------------------------------------------------------- 1 | /* --------------------------------------------------------------------- 2 | * -- High Performance Computing Linpack Benchmark (HPL) 3 | * HPL - 2.2 - February 24, 2016 4 | * Antoine P. Petitet 5 | * University of Tennessee, Knoxville 6 | * Innovative Computing Laboratory 7 | * (C) Copyright 2000-2008 All Rights Reserved 8 | * 9 | * Modified by: Noel Chalmers 10 | * (C) 2018-2025 Advanced Micro Devices, Inc. 11 | * See the rocHPL/LICENCE file for details. 12 | * 13 | * SPDX-License-Identifier: (BSD-3-Clause) 14 | * --------------------------------------------------------------------- 15 | */ 16 | 17 | #include "hpl.hpp" 18 | 19 | int HPL_recv(double* RBUF, int RCOUNT, int SRC, int RTAG, MPI_Comm COMM) { 20 | /* 21 | * Purpose 22 | * ======= 23 | * 24 | * HPL_recv is a simple wrapper around MPI_Recv. Its main purpose is 25 | * to allow for some experimentation / tuning of this simple routine. 26 | * Successful completion is indicated by the returned error code 27 | * HPL_SUCCESS. In the case of messages of length less than or equal to 28 | * zero, this function returns immediately. 29 | * 30 | * Arguments 31 | * ========= 32 | * 33 | * RBUF (local output) double * 34 | * On entry, RBUF specifies the starting address of buffer to be 35 | * received. 36 | * 37 | * RCOUNT (local input) int 38 | * On entry, RCOUNT specifies the number of double precision 39 | * entries in RBUF. RCOUNT must be at least zero. 40 | * 41 | * SRC (local input) int 42 | * On entry, SRC specifies the rank of the sending process in 43 | * the communication space defined by COMM. 
44 |  *
45 |  * RTAG    (local input)                 int
46 |  *         On entry, RTAG specifies the message tag to be used for this
47 |  *         communication operation.
48 |  *
49 |  * COMM    (local input)                 MPI_Comm
50 |  *         The MPI communicator identifying the communication space.
51 |  *
52 |  * ---------------------------------------------------------------------
53 |  */
54 | 
55 |   if(RCOUNT <= 0) return (HPL_SUCCESS);
56 | 
57 |   MPI_Status status;
58 | 
59 |   int ierr =
60 |       MPI_Recv((void*)(RBUF), RCOUNT, MPI_DOUBLE, SRC, RTAG, COMM, &status);
61 | 
62 |   return ((ierr == MPI_SUCCESS ? HPL_SUCCESS : HPL_FAILURE));
63 | }
64 | 
--------------------------------------------------------------------------------
/src/comm/HPL_reduce.cpp:
--------------------------------------------------------------------------------
 1 | /* ---------------------------------------------------------------------
 2 |  * -- High Performance Computing Linpack Benchmark (HPL)
 3 |  *    HPL - 2.2 - February 24, 2016
 4 |  *    Antoine P. Petitet
 5 |  *    University of Tennessee, Knoxville
 6 |  *    Innovative Computing Laboratory
 7 |  *    (C) Copyright 2000-2008 All Rights Reserved
 8 |  *
 9 |  * Modified by: Noel Chalmers
10 |  * (C) 2018-2025 Advanced Micro Devices, Inc.
11 |  * See the rocHPL/LICENCE file for details.
12 |  *
13 |  * SPDX-License-Identifier: (BSD-3-Clause)
14 |  * ---------------------------------------------------------------------
15 |  */
16 | 
17 | #include "hpl.hpp"
18 | 
19 | int HPL_reduce(void*            BUFFER,
20 |                const int        COUNT,
21 |                const HPL_T_TYPE DTYPE,
22 |                const HPL_T_OP   OP,
23 |                const int        ROOT,
24 |                MPI_Comm         COMM) {
25 |   /*
26 |    * Purpose
27 |    * =======
28 |    *
29 |    * HPL_reduce performs a global reduce operation across all processes of
30 |    * a group. Note that the input buffer is used as a workspace and that,
31 |    * in all processes except the accumulating one, its original data is
32 |    * corrupted.
33 |    *
34 |    * Arguments
35 |    * =========
36 |    *
37 |    * BUFFER  (local input/output)          void *
38 |    *         On entry, BUFFER points to the buffer to be reduced. On
39 |    *         exit, in the process of rank ROOT, this array contains the
40 |    *         reduced data. This buffer is also used as workspace during
41 |    *         the operation in the other processes of the group.
42 |    *
43 |    * COUNT   (global input)                const int
44 |    *         On entry, COUNT indicates the number of entries in BUFFER.
45 |    *         COUNT must be at least zero.
46 |    *
47 |    * DTYPE   (global input)                const HPL_T_TYPE
48 |    *         On entry, DTYPE specifies the type of the buffer operands.
49 |    *
50 |    * OP      (global input)                const HPL_T_OP
51 |    *         On entry, OP is a pointer to the local combine function.
52 |    *
53 |    * ROOT    (global input)                const int
54 |    *         On entry, ROOT is the coordinate of the accumulating process.
55 |    *
56 |    * COMM    (global/local input)          MPI_Comm
57 |    *         The MPI communicator identifying the process collection.
58 |    *
59 |    * ---------------------------------------------------------------------
60 |    */
61 | 
62 |   int ierr;
63 | 
64 |   int rank;
65 |   MPI_Comm_rank(COMM, &rank);
66 | 
67 |   if(rank == ROOT)
68 |     ierr = MPI_Reduce(
69 |         MPI_IN_PLACE, BUFFER, COUNT, HPL_2_MPI_TYPE(DTYPE), OP, ROOT, COMM);
70 |   else
71 |     ierr =
72 |         MPI_Reduce(BUFFER, NULL, COUNT, HPL_2_MPI_TYPE(DTYPE), OP, ROOT, COMM);
73 | 
74 |   return ((ierr == MPI_SUCCESS ?
HPL_SUCCESS : HPL_FAILURE));
75 | }
76 | 
--------------------------------------------------------------------------------
/src/comm/HPL_scatterv.cpp:
--------------------------------------------------------------------------------
 1 | /* ---------------------------------------------------------------------
 2 |  * -- High Performance Computing Linpack Benchmark (HPL)
 3 |  *    HPL - 2.2 - February 24, 2016
 4 |  *    Antoine P. Petitet
 5 |  *    University of Tennessee, Knoxville
 6 |  *    Innovative Computing Laboratory
 7 |  *    (C) Copyright 2000-2008 All Rights Reserved
 8 |  *
 9 |  * Modified by: Noel Chalmers
10 |  * (C) 2018-2025 Advanced Micro Devices, Inc.
11 |  * See the rocHPL/LICENCE file for details.
12 |  *
13 |  * SPDX-License-Identifier: (BSD-3-Clause)
14 |  * ---------------------------------------------------------------------
15 |  */
16 | 
17 | #include "hpl.hpp"
18 | 
19 | int HPL_scatterv(double*    BUF,
20 |                  const int* SCOUNT,
21 |                  const int* DISPL,
22 |                  const int  RCOUNT,
23 |                  int        ROOT,
24 |                  MPI_Comm   COMM) {
25 |   /*
26 |    * Purpose
27 |    * =======
28 |    *
29 |    * HPL_scatterv is a simple wrapper around an in-place MPI_Scatterv.
30 |    * Its main purpose is to allow for some experimentation / tuning
31 |    * of this simple routine. Successful completion is indicated by
32 |    * the returned error code HPL_SUCCESS.
33 |    *
34 |    * Arguments
35 |    * =========
36 |    *
37 |    * BUF     (local input/output)          double *
38 |    *         On entry, on the root process BUF specifies the starting
39 |    *         address of the buffer to be scattered. On non-root processes,
40 |    *         BUF specifies the starting point of the received buffer.
41 |    *
42 |    * SCOUNT  (local input)                 int *
43 |    *         On entry, SCOUNT is an array of length SIZE specifying
44 |    *         the number of double precision entries in BUF to send to
45 |    *         each process.
46 |    *
47 |    * DISPL   (local input)                 int *
48 |    *         On entry, DISPL is an array of length SIZE specifying the
49 |    *         displacement (relative to BUF) from which to take the outgoing
50 |    *         data to each process from the root process, and the displacement
51 |    *         (relative to BUF) at which to receive the incoming data on
52 |    *         each non-root process.
53 |    *
54 |    * RCOUNT  (local input)                 int
55 |    *         On entry, RCOUNT specifies the number of double precision
56 |    *         entries in BUF to be received from the ROOT process.
57 |    *
58 |    * ROOT    (local input)                 int
59 |    *         On entry, ROOT specifies the rank of the origin process in
60 |    *         the communication space defined by COMM.
61 |    *
62 |    * COMM    (local input)                 MPI_Comm
63 |    *         The MPI communicator identifying the communication space.
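   *
   * Usage sketch (an editorial example; buf, scount, displ, rank and
   * comm are hypothetical local variables, not part of this file):
   *
   *   int scount[4] = {256, 256, 256, 256}; // entries per process
   *   int displ[4]  = {0, 256, 512, 768};   // offsets into buf
   *   // called on every rank of a 4-process comm; the root sends and
   *   // the other ranks receive scount[rank] doubles into buf:
   *   HPL_scatterv(buf, scount, displ, scount[rank], 0, comm);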
64 | * 65 | * --------------------------------------------------------------------- 66 | */ 67 | 68 | int rank, ierr = MPI_SUCCESS; 69 | MPI_Comm_rank(COMM, &rank); 70 | 71 | HPL_TracingPush("HPL_Scatterv"); 72 | 73 | #ifdef HPL_USE_COLLECTIVES 74 | 75 | if(rank == ROOT) { 76 | ierr = MPI_Scatterv(BUF, 77 | SCOUNT, 78 | DISPL, 79 | MPI_DOUBLE, 80 | MPI_IN_PLACE, 81 | RCOUNT, 82 | MPI_DOUBLE, 83 | ROOT, 84 | COMM); 85 | } else { 86 | ierr = MPI_Scatterv( 87 | NULL, SCOUNT, DISPL, MPI_DOUBLE, BUF, RCOUNT, MPI_DOUBLE, ROOT, COMM); 88 | } 89 | 90 | #else 91 | 92 | int size; 93 | MPI_Comm_size(COMM, &size); 94 | 95 | const int tag = ROOT; 96 | if(rank == ROOT) { 97 | MPI_Request requests[size]; 98 | 99 | /*Just send size-1 messages*/ 100 | for(int i = 0; i < size; ++i) { 101 | 102 | requests[i] = MPI_REQUEST_NULL; 103 | 104 | if(i == ROOT) { continue; } 105 | const int ibuf = DISPL[i]; 106 | const int lbuf = SCOUNT[i]; 107 | 108 | if(lbuf > 0) { 109 | (void)MPI_Isend( 110 | BUF + ibuf, lbuf, MPI_DOUBLE, i, tag, COMM, requests + i); 111 | } 112 | } 113 | 114 | MPI_Waitall(size, requests, MPI_STATUSES_IGNORE); 115 | } else { 116 | if(RCOUNT > 0) 117 | ierr = 118 | MPI_Recv(BUF, RCOUNT, MPI_DOUBLE, ROOT, tag, COMM, MPI_STATUS_IGNORE); 119 | } 120 | 121 | #endif 122 | HPL_TracingPop(); 123 | 124 | return ((ierr == MPI_SUCCESS ? HPL_SUCCESS : HPL_FAILURE)); 125 | } 126 | -------------------------------------------------------------------------------- /src/comm/HPL_sdrv.cpp: -------------------------------------------------------------------------------- 1 | /* --------------------------------------------------------------------- 2 | * -- High Performance Computing Linpack Benchmark (HPL) 3 | * HPL - 2.2 - February 24, 2016 4 | * Antoine P. Petitet 5 | * University of Tennessee, Knoxville 6 | * Innovative Computing Laboratory 7 | * (C) Copyright 2000-2008 All Rights Reserved 8 | * 9 | * Modified by: Noel Chalmers 10 | * (C) 2018-2025 Advanced Micro Devices, Inc. 11 | * See the rocHPL/LICENCE file for details. 12 | * 13 | * SPDX-License-Identifier: (BSD-3-Clause) 14 | * --------------------------------------------------------------------- 15 | */ 16 | 17 | #include "hpl.hpp" 18 | 19 | int HPL_sdrv(double* SBUF, 20 | int SCOUNT, 21 | int STAG, 22 | double* RBUF, 23 | int RCOUNT, 24 | int RTAG, 25 | int PARTNER, 26 | MPI_Comm COMM) { 27 | /* 28 | * Purpose 29 | * ======= 30 | * 31 | * HPL_sdrv is a simple wrapper around MPI_Sendrecv. Its main purpose is 32 | * to allow for some experimentation and tuning of this simple function. 33 | * Messages of length less than or equal to zero are not sent nor 34 | * received. Successful completion is indicated by the returned error 35 | * code HPL_SUCCESS. 36 | * 37 | * Arguments 38 | * ========= 39 | * 40 | * SBUF (local input) double * 41 | * On entry, SBUF specifies the starting address of buffer to be 42 | * sent. 43 | * 44 | * SCOUNT (local input) int 45 | * On entry, SCOUNT specifies the number of double precision 46 | * entries in SBUF. SCOUNT must be at least zero. 47 | * 48 | * STAG (local input) int 49 | * On entry, STAG specifies the message tag to be used for the 50 | * sending communication operation. 51 | * 52 | * RBUF (local output) double * 53 | * On entry, RBUF specifies the starting address of buffer to be 54 | * received. 55 | * 56 | * RCOUNT (local input) int 57 | * On entry, RCOUNT specifies the number of double precision 58 | * entries in RBUF. RCOUNT must be at least zero. 
59 | * 60 | * RTAG (local input) int 61 | * On entry, RTAG specifies the message tag to be used for the 62 | * receiving communication operation. 63 | * 64 | * PARTNER (local input) int 65 | * On entry, PARTNER specifies the rank of the collaborative 66 | * process in the communication space defined by COMM. 67 | * 68 | * COMM (local input) MPI_Comm 69 | * The MPI communicator identifying the communication space. 70 | * 71 | * --------------------------------------------------------------------- 72 | */ 73 | 74 | MPI_Status status; 75 | int ierr; 76 | 77 | ierr = MPI_Sendrecv(SBUF, 78 | SCOUNT, 79 | MPI_DOUBLE, 80 | PARTNER, 81 | STAG, 82 | RBUF, 83 | RCOUNT, 84 | MPI_DOUBLE, 85 | PARTNER, 86 | RTAG, 87 | COMM, 88 | &status); 89 | 90 | return ((ierr == MPI_SUCCESS ? HPL_SUCCESS : HPL_FAILURE)); 91 | } 92 | -------------------------------------------------------------------------------- /src/comm/HPL_send.cpp: -------------------------------------------------------------------------------- 1 | /* --------------------------------------------------------------------- 2 | * -- High Performance Computing Linpack Benchmark (HPL) 3 | * HPL - 2.2 - February 24, 2016 4 | * Antoine P. Petitet 5 | * University of Tennessee, Knoxville 6 | * Innovative Computing Laboratory 7 | * (C) Copyright 2000-2008 All Rights Reserved 8 | * 9 | * Modified by: Noel Chalmers 10 | * (C) 2018-2025 Advanced Micro Devices, Inc. 11 | * See the rocHPL/LICENCE file for details. 12 | * 13 | * SPDX-License-Identifier: (BSD-3-Clause) 14 | * --------------------------------------------------------------------- 15 | */ 16 | 17 | #include "hpl.hpp" 18 | 19 | int HPL_send(double* SBUF, int SCOUNT, int DEST, int STAG, MPI_Comm COMM) { 20 | /* 21 | * Purpose 22 | * ======= 23 | * 24 | * HPL_send is a simple wrapper around MPI_Send. Its main purpose is 25 | * to allow for some experimentation / tuning of this simple routine. 26 | * Successful completion is indicated by the returned error code 27 | * MPI_SUCCESS. In the case of messages of length less than or equal to 28 | * zero, this function returns immediately. 29 | * 30 | * Arguments 31 | * ========= 32 | * 33 | * SBUF (local input) double * 34 | * On entry, SBUF specifies the starting address of buffer to be 35 | * sent. 36 | * 37 | * SCOUNT (local input) int 38 | * On entry, SCOUNT specifies the number of double precision 39 | * entries in SBUF. SCOUNT must be at least zero. 40 | * 41 | * DEST (local input) int 42 | * On entry, DEST specifies the rank of the receiving process in 43 | * the communication space defined by COMM. 44 | * 45 | * STAG (local input) int 46 | * On entry, STAG specifies the message tag to be used for this 47 | * communication operation. 48 | * 49 | * COMM (local input) MPI_Comm 50 | * The MPI communicator identifying the communication space. 51 | * 52 | * --------------------------------------------------------------------- 53 | */ 54 | 55 | if(SCOUNT <= 0) return (HPL_SUCCESS); 56 | 57 | int ierr = MPI_Send((void*)(SBUF), SCOUNT, MPI_DOUBLE, DEST, STAG, COMM); 58 | 59 | return ((ierr == MPI_SUCCESS ? HPL_SUCCESS : HPL_FAILURE)); 60 | } 61 | -------------------------------------------------------------------------------- /src/grid/HPL_grid_exit.cpp: -------------------------------------------------------------------------------- 1 | /* --------------------------------------------------------------------- 2 | * -- High Performance Computing Linpack Benchmark (HPL) 3 | * HPL - 2.2 - February 24, 2016 4 | * Antoine P. 
Petitet 5 | * University of Tennessee, Knoxville 6 | * Innovative Computing Laboratory 7 | * (C) Copyright 2000-2008 All Rights Reserved 8 | * 9 | * Modified by: Noel Chalmers 10 | * (C) 2018-2025 Advanced Micro Devices, Inc. 11 | * See the rocHPL/LICENCE file for details. 12 | * 13 | * SPDX-License-Identifier: (BSD-3-Clause) 14 | * --------------------------------------------------------------------- 15 | */ 16 | 17 | #include "hpl.hpp" 18 | 19 | int HPL_grid_exit(HPL_T_grid* GRID) { 20 | /* 21 | * Purpose 22 | * ======= 23 | * 24 | * HPL_grid_exit marks the process grid object for deallocation. The 25 | * returned error code MPI_SUCCESS indicates successful completion. 26 | * Other error codes are (MPI) implementation dependent. 27 | * 28 | * Arguments 29 | * ========= 30 | * 31 | * GRID (local input/output) HPL_T_grid * 32 | * On entry, GRID points to the data structure containing the 33 | * process grid to be released. 34 | * 35 | * --------------------------------------------------------------------- 36 | */ 37 | 38 | int hplerr = MPI_SUCCESS, mpierr; 39 | 40 | if(GRID->all_comm != MPI_COMM_NULL) { 41 | mpierr = MPI_Comm_free(&(GRID->row_comm)); 42 | if(mpierr != MPI_SUCCESS) hplerr = mpierr; 43 | mpierr = MPI_Comm_free(&(GRID->col_comm)); 44 | if(mpierr != MPI_SUCCESS) hplerr = mpierr; 45 | mpierr = MPI_Comm_free(&(GRID->all_comm)); 46 | if(mpierr != MPI_SUCCESS) hplerr = mpierr; 47 | } 48 | 49 | GRID->order = HPL_COLUMN_MAJOR; 50 | 51 | GRID->iam = GRID->myrow = GRID->mycol = -1; 52 | GRID->nprow = GRID->npcol = GRID->nprocs = -1; 53 | 54 | GRID->row_ip2 = GRID->row_hdim = GRID->row_ip2m1 = GRID->row_mask = -1; 55 | GRID->col_ip2 = GRID->col_hdim = GRID->col_ip2m1 = GRID->col_mask = -1; 56 | 57 | return (hplerr); 58 | } 59 | -------------------------------------------------------------------------------- /src/grid/HPL_grid_info.cpp: -------------------------------------------------------------------------------- 1 | /* --------------------------------------------------------------------- 2 | * -- High Performance Computing Linpack Benchmark (HPL) 3 | * HPL - 2.2 - February 24, 2016 4 | * Antoine P. Petitet 5 | * University of Tennessee, Knoxville 6 | * Innovative Computing Laboratory 7 | * (C) Copyright 2000-2008 All Rights Reserved 8 | * 9 | * Modified by: Noel Chalmers 10 | * (C) 2018-2025 Advanced Micro Devices, Inc. 11 | * See the rocHPL/LICENCE file for details. 12 | * 13 | * SPDX-License-Identifier: (BSD-3-Clause) 14 | * --------------------------------------------------------------------- 15 | */ 16 | 17 | #include "hpl.hpp" 18 | 19 | int HPL_grid_info(const HPL_T_grid* GRID, 20 | int* NPROW, 21 | int* NPCOL, 22 | int* MYROW, 23 | int* MYCOL) { 24 | /* 25 | * Purpose 26 | * ======= 27 | * 28 | * HPL_grid_info returns the grid shape and the coordinates in the grid 29 | * of the calling process. Successful completion is indicated by the 30 | * returned error code MPI_SUCCESS. Other error codes depend on the MPI 31 | * implementation. 32 | * 33 | * Arguments 34 | * ========= 35 | * 36 | * GRID (local input) const HPL_T_grid * 37 | * On entry, GRID points to the data structure containing the 38 | * process grid information. 39 | * 40 | * NPROW (global output) int * 41 | * On exit, NPROW specifies the number of process rows in the 42 | * grid. NPROW is at least one. 43 | * 44 | * NPCOL (global output) int * 45 | * On exit, NPCOL specifies the number of process columns in 46 | * the grid. NPCOL is at least one. 
47 |  *
48 |  * MYROW   (global output)               int *
49 |  *         On exit, MYROW specifies my row process coordinate in the
50 |  *         grid. MYROW is greater than or equal to zero and less than
51 |  *         NPROW.
52 |  *
53 |  * MYCOL   (global output)               int *
54 |  *         On exit, MYCOL specifies my column process coordinate in the
55 |  *         grid. MYCOL is greater than or equal to zero and less than
56 |  *         NPCOL.
57 |  *
58 |  * ---------------------------------------------------------------------
59 |  */
60 | 
61 |   *NPROW = GRID->nprow;
62 |   *NPCOL = GRID->npcol;
63 |   *MYROW = GRID->myrow;
64 |   *MYCOL = GRID->mycol;
65 |   return (MPI_SUCCESS);
66 | }
67 | 
--------------------------------------------------------------------------------
/src/matgen/HPL_xjumpm.cpp:
--------------------------------------------------------------------------------
 1 | /* ---------------------------------------------------------------------
 2 |  * -- High Performance Computing Linpack Benchmark (HPL)
 3 |  *    HPL - 2.2 - February 24, 2016
 4 |  *    Antoine P. Petitet
 5 |  *    University of Tennessee, Knoxville
 6 |  *    Innovative Computing Laboratory
 7 |  *    (C) Copyright 2000-2008 All Rights Reserved
 8 |  *
 9 |  * Modified by: Noel Chalmers
10 |  * (C) 2018-2025 Advanced Micro Devices, Inc.
11 |  * See the rocHPL/LICENCE file for details.
12 |  *
13 |  * SPDX-License-Identifier: (BSD-3-Clause)
14 |  * ---------------------------------------------------------------------
15 |  */
16 | 
17 | #include "hpl.hpp"
18 | 
19 | void HPL_xjumpm(const int      JUMPM,
20 |                 const uint64_t MULT,
21 |                 const uint64_t IADD,
22 |                 const uint64_t IRANN,
23 |                 uint64_t&      IRANM,
24 |                 uint64_t&      IAM,
25 |                 uint64_t&      ICM) {
26 |   /*
27 |    * Purpose
28 |    * =======
29 |    *
30 |    * HPL_xjumpm computes the constants A and C to jump JUMPM numbers in
31 |    * the random sequence: X(n+JUMPM) = A*X(n)+C. The constants encoded in
32 |    * MULT and IADD specify how to jump from one entry in the sequence to
33 |    * the next.
34 |    *
35 |    * Arguments
36 |    * =========
37 |    *
38 |    * JUMPM   (local input)                 const int
39 |    *         On entry, JUMPM specifies the number of entries in the
40 |    *         sequence to jump over. When JUMPM is less than or equal to
41 |    *         zero, A and C are not computed, and IRANM is set to IRANN,
42 |    *         corresponding to a jump of size zero.
43 |    *
44 |    * MULT    (local input)                 const uint64_t
45 |    *         On entry, MULT contains the constant a used to jump from
46 |    *         X(n) to X(n+1) = a*X(n) + c in the random sequence.
47 |    *
48 |    * IADD    (local input)                 const uint64_t
49 |    *         On entry, IADD contains the constant c used to jump from
50 |    *         X(n) to X(n+1) = a*X(n) + c in the random sequence.
51 |    *
52 |    * IRANN   (local input)                 const uint64_t
53 |    *         On entry, IRANN contains the encoding of X(n).
54 |    *
55 |    * IRANM   (local output)                uint64_t &
56 |    *         On exit, IRANM contains the encoding of X(n+JUMPM).
57 |    *
58 |    * IAM     (local output)                uint64_t &
59 |    *         On exit, when JUMPM is greater than zero, IAM contains the
60 |    *         constant A used to jump from X(n) to X(n+JUMPM) in the
61 |    *         random sequence. When JUMPM is less than or equal to zero,
62 |    *         IAM is not referenced.
63 |  *
64 |  * ICM     (local output)                uint64_t &
65 |  *         On exit, when JUMPM is greater than zero, ICM contains the
66 |  *         constant C used to jump from X(n) to X(n+JUMPM) in the
67 |  *         random sequence. When JUMPM is less than or equal to zero,
68 |  *         ICM is not referenced.
69 |  *
70 |  * ---------------------------------------------------------------------
71 |  */
72 |   /*
73 |    * With X(n+1) = a*X(n) + c, jumping m = JUMPM steps gives
74 |    * X(n+m) = a^m * X(n) + c*(a^(m-1) + ... + a + 1). The loop below
75 |    * accumulates A = a^m and C = c*(a^(m-1) + ... + a + 1), with all
76 |    * arithmetic performed modulo 2^64 through uint64_t wrap-around.
77 |    */
78 |   if(JUMPM > 0) {
79 |     IAM = MULT;
80 |     ICM = IADD;
81 |     for(int k = 1; k <= JUMPM - 1; k++) {
82 |       IAM *= MULT;
83 |       ICM = ICM * MULT + IADD;
84 |     }
85 |     IRANM = IRANN * IAM + ICM;
86 |   } else {
87 |     IRANM = IRANN;
88 |   }
89 | }
90 | 
--------------------------------------------------------------------------------
/src/panel/HPL_pdpanel_bcast.cpp:
--------------------------------------------------------------------------------
 1 | /* ---------------------------------------------------------------------
 2 |  * -- High Performance Computing Linpack Benchmark (HPL)
 3 |  *    HPL - 2.2 - February 24, 2016
 4 |  *    Antoine P. Petitet
 5 |  *    University of Tennessee, Knoxville
 6 |  *    Innovative Computing Laboratory
 7 |  *    (C) Copyright 2000-2008 All Rights Reserved
 8 |  *
 9 |  * Modified by: Noel Chalmers
10 |  * (C) 2018-2025 Advanced Micro Devices, Inc.
11 |  * See the rocHPL/LICENCE file for details.
12 |  *
13 |  * SPDX-License-Identifier: (BSD-3-Clause)
14 |  * ---------------------------------------------------------------------
15 |  */
16 | 
17 | #include "hpl.hpp"
18 | 
19 | int HPL_pdpanel_bcast(HPL_T_panel* PANEL) {
20 |   /*
21 |    * Purpose
22 |    * =======
23 |    *
24 |    * HPL_pdpanel_bcast broadcasts the current panel. Successful completion
25 |    * is indicated by a return code of HPL_SUCCESS.
26 |    *
27 |    * Arguments
28 |    * =========
29 |    *
30 |    * PANEL   (input/output)                HPL_T_panel *
31 |    *         On entry, PANEL points to the current panel data structure
32 |    *         being broadcast.
33 |    *
34 |    * ---------------------------------------------------------------------
35 |    */
36 | 
37 |   if(PANEL == NULL) { return HPL_SUCCESS; }
38 |   if(PANEL->grid->npcol <= 1) { return HPL_SUCCESS; }
39 | 
40 |   MPI_Comm comm = PANEL->grid->row_comm;
41 |   int      root = PANEL->pcol;
42 | 
43 | #ifdef HPL_DETAILED_TIMING
44 |   HPL_ptimer(HPL_TIMING_LBCAST);
45 | #endif
46 |   /*
47 |    * Single Bcast call
48 |    */
49 |   int err = HPL_bcast(PANEL->A0, PANEL->len, root, comm, PANEL->algo->btopo);
50 | 
51 | #ifdef HPL_DETAILED_TIMING
52 |   HPL_ptimer(HPL_TIMING_LBCAST);
53 | #endif
54 | 
55 |   if(PANEL->grid->mycol != root) {
56 |     // retrieve some host-side pivoting info from bcast message
57 |     int* dipA  = PANEL->dipiv + 4 * PANEL->jb;
58 |     int* ipA   = PANEL->ipiv + 5 * PANEL->jb;
59 |     int  nprow = PANEL->grid->nprow;
60 | 
61 |     CHECK_HIP_ERROR(hipMemcpyAsync(ipA,
62 |                                    dipA,
63 |                                    (1 + nprow + 1) * sizeof(int),
64 |                                    hipMemcpyDeviceToHost,
65 |                                    dataStream));
66 |     CHECK_HIP_ERROR(hipStreamSynchronize(dataStream));
67 |   }
68 | 
69 |   return err;
70 | }
71 | 
--------------------------------------------------------------------------------
/src/panel/HPL_pdpanel_free.cpp:
--------------------------------------------------------------------------------
 1 | /* ---------------------------------------------------------------------
 2 |  * -- High Performance Computing Linpack Benchmark (HPL)
 3 |  *    HPL - 2.2 - February 24, 2016
 4 |  *    Antoine P.
Petitet
 5 |  *    University of Tennessee, Knoxville
 6 |  *    Innovative Computing Laboratory
 7 |  *    (C) Copyright 2000-2008 All Rights Reserved
 8 |  *
 9 |  * Modified by: Noel Chalmers
10 |  * (C) 2018-2025 Advanced Micro Devices, Inc.
11 |  * See the rocHPL/LICENCE file for details.
12 |  *
13 |  * SPDX-License-Identifier: (BSD-3-Clause)
14 |  * ---------------------------------------------------------------------
15 |  */
16 | #include "hpl.hpp"
17 | 
18 | int HPL_pdpanel_free(HPL_T_panel* PANEL) {
19 |   /*
20 |    * Purpose
21 |    * =======
22 |    *
23 |    * HPL_pdpanel_free deallocates the device and host workspace buffers
24 |    * attached to the panel data structure.
25 |    *
26 |    * Arguments
27 |    * =========
28 |    *
29 |    * PANEL   (local input/output)          HPL_T_panel *
30 |    *         On entry, PANEL points to the panel data structure from
31 |    *         which the resources should be deallocated.
32 |    *
33 |    * ---------------------------------------------------------------------
34 |    */
35 | 
36 |   if(PANEL->IWORK) {
37 |     CHECK_HIP_ERROR(hipHostFree(PANEL->IWORK));
38 |     PANEL->IWORK = nullptr;
39 |   }
40 |   if(PANEL->U2) {
41 |     CHECK_HIP_ERROR(hipFree(PANEL->U2));
42 |     PANEL->U2 = nullptr;
43 |   }
44 |   if(PANEL->U1) {
45 |     CHECK_HIP_ERROR(hipFree(PANEL->U1));
46 |     PANEL->U1 = nullptr;
47 |   }
48 |   if(PANEL->U0) {
49 |     CHECK_HIP_ERROR(hipFree(PANEL->U0));
50 |     PANEL->U0 = nullptr;
51 |   }
52 |   if(PANEL->A0) {
53 |     CHECK_HIP_ERROR(hipFree(PANEL->A0));
54 |     PANEL->A0 = nullptr;
55 |   }
56 | 
57 |   return (HPL_SUCCESS);
58 | }
59 | 
--------------------------------------------------------------------------------
/src/panel/HPL_pdpanel_swapids.cpp:
--------------------------------------------------------------------------------
 1 | 
 2 | /* ---------------------------------------------------------------------
 3 |  * -- High Performance Computing Linpack Benchmark (HPL)
 4 |  *    Noel Chalmers
 5 |  *    (C) 2018-2025 Advanced Micro Devices, Inc.
 6 |  *    See the rocHPL/LICENCE file for details.
 7 |  *
 8 |  *    SPDX-License-Identifier: (BSD-3-Clause)
 9 |  * ---------------------------------------------------------------------
10 |  */
11 | #include "hpl.hpp"
12 | 
13 | /* Assemble the host-side row-swap index arrays for the current panel,
14 |    then copy them to the device. */
15 | void HPL_pdpanel_swapids(HPL_T_panel* PANEL) {
16 |   int jb, i;
17 | 
18 |   jb = PANEL->jb;
19 | 
20 |   int nprow = PANEL->grid->nprow;
21 | 
22 |   if(jb <= 0) return;
23 | 
24 |   if(nprow == 1) {
25 |     // unroll pivoting
26 |     int* ipiv  = PANEL->ipiv;
27 |     int* permU = PANEL->ipiv + jb;
28 |     int* ipl   = permU + 2 * jb;
29 |     int* ipID  = ipl + 1;
30 | 
31 |     for(i = 0; i < jb; i++) permU[i + jb] = -1;
32 | 
33 |     HPL_pipid(PANEL, ipl, ipID);
34 | 
35 |     for(i = 0; i < *ipl; i += 2) {
36 |       int src = ipID[i] - PANEL->ia;
37 |       int dst = ipID[i + 1] - PANEL->ia;
38 |       if(dst < jb) {
39 |         permU[dst] = src;
40 |       } else {
41 |         permU[src + jb] = dst;
42 |       }
43 |     }
44 | 
45 |     int* dpermU = PANEL->dipiv;
46 | 
47 |     // send pivoting ids to device
48 |     CHECK_HIP_ERROR(hipMemcpyAsync(dpermU,
49 |                                    permU,
50 |                                    2 * jb * sizeof(int),
51 |                                    hipMemcpyHostToDevice,
52 |                                    dataStream));
53 |     CHECK_HIP_ERROR(hipStreamSynchronize(dataStream));
54 | 
55 |   } else {
56 | 
57 |     int* permU   = PANEL->ipiv + jb;
58 |     int* lindxU  = permU + jb;
59 |     int* lindxA  = lindxU + jb;
60 |     int* lindxAU = lindxA + jb;
61 |     int* ipA     = lindxAU + jb;
62 |     int* iplen   = ipA + 1;
63 | 
64 |     int* ipl   = iplen + nprow + 1;
65 |     int* ipID  = ipl + 1;
66 |     int* iwork = ipID + 4 * jb;
67 | 
68 |     HPL_pipid(PANEL, ipl, ipID);
69 |     HPL_plindx(
70 |         PANEL, *ipl, ipID, ipA, lindxU, lindxAU, lindxA, iplen, permU, iwork);
71 | 
72 |     int* dpermU = PANEL->dipiv;
73 | 
74 |     // send pivoting ids to device
75 |     CHECK_HIP_ERROR(hipMemcpyAsync(dpermU,
76 |                                    permU,
77 |                                    (4 * jb + 1 + nprow + 1) * sizeof(int),
78 |                                    hipMemcpyHostToDevice,
79 |                                    dataStream));
80 |     CHECK_HIP_ERROR(hipStreamSynchronize(dataStream));
81 |   }
82 | }
83 | 
--------------------------------------------------------------------------------
/src/pauxil/HPL_dlaswp00N_device.cpp:
--------------------------------------------------------------------------------
 1 | /* ---------------------------------------------------------------------
 2 |  * -- High Performance Computing Linpack Benchmark (HPL)
 3 |  *    HPL - 2.2 - February 24, 2016
 4 |  *    Antoine P. Petitet
 5 |  *    University of Tennessee, Knoxville
 6 |  *    Innovative Computing Laboratory
 7 |  *    (C) Copyright 2000-2008 All Rights Reserved
 8 |  *
 9 |  * Modified by: Noel Chalmers
10 |  * (C) 2018-2025 Advanced Micro Devices, Inc.
11 |  * See the rocHPL/LICENCE file for details.
12 |  *
13 |  * SPDX-License-Identifier: (BSD-3-Clause)
14 |  * ---------------------------------------------------------------------
15 |  */
16 | 
17 | #include "hpl.hpp"
18 | #include <hip/hip_runtime.h>
19 | 
20 | #define BLOCK_SIZE 512
21 | 
22 | __global__ void dlaswp00N(const int N,
23 |                           const int M,
24 |                           double* __restrict__ A,
25 |                           const int LDA,
26 |                           const int* __restrict__ IPIV) {
27 | 
28 |   __shared__ double s_An_init[2048];
29 |   __shared__ double s_An_ipiv[2048];
30 | 
31 |   const int m = threadIdx.x;
32 |   const int n = blockIdx.x;
33 | 
34 |   // read in block column
35 |   for(int i = m; i < M; i += blockDim.x)
36 |     s_An_init[i] = A[i + n * ((size_t)LDA)];
37 | 
38 |   __syncthreads();
39 | 
40 |   // local block
41 |   for(int i = m; i < M; i += blockDim.x) {
42 |     const int ip = IPIV[i];
43 | 
44 |     if(ip < M) { // local swap
45 |       s_An_ipiv[i] = s_An_init[ip];
46 |     } else { // non local swap
47 |       s_An_ipiv[i] = A[ip + n * ((size_t)LDA)];
48 |     }
49 |   }
50 |   __syncthreads();
51 | 
52 |   // write out local block
53 |   for(int i = m; i < M; i += blockDim.x)
54 |     A[i + n * ((size_t)LDA)] = s_An_ipiv[i];
55 | 
56 |   // remaining swaps in column
57 |   for(int i = m; i < M; i += blockDim.x) {
58 |     const int ip_ex = IPIV[i + M];
59 | 
60 |     if(ip_ex > -1) { A[ip_ex + n * ((size_t)LDA)] = s_An_init[i]; }
61 |   }
62 | }
63 | 
64 | void HPL_dlaswp00N(const int  M,
65 |                    const int  N,
66 |                    double*    A,
67 |                    const int  LDA,
68 |                    const int* IPIV) {
69 |   /*
70 |    * Purpose
71 |    * =======
72 |    *
73 |    * HPL_dlaswp00N performs a series of local row interchanges on a matrix
74 |    * A. One row interchange is initiated for rows 0 through M-1 of A.
75 |    *
76 |    * Arguments
77 |    * =========
78 |    *
79 |    * M       (local input)                 const int
80 |    *         On entry, M specifies the number of rows of the array A to be
81 |    *         interchanged. M must be at least zero.
82 |    *
83 |    * N       (local input)                 const int
84 |    *         On entry, N specifies the number of columns of the array A.
85 |    *         N must be at least zero.
86 |    *
87 |    * A       (local input/output)          double *
88 |    *         On entry, A points to an array of dimension (LDA,N) to which
89 |    *         the row interchanges will be applied. On exit, the permuted
90 |    *         matrix.
91 |    *
92 |    * LDA     (local input)                 const int
93 |    *         On entry, LDA specifies the leading dimension of the array A.
94 |    *         LDA must be at least MAX(1,M).
95 |    *
96 |    * IPIV    (local input)                 const int *
97 |    *         On entry, IPIV is an array of size M that contains the
98 |    *         pivoting information. For k in [0..M), IPIV[k]=IROFF + l
99 |    *         implies that local rows k and l are to be interchanged.
100 |    *
101 |    * ---------------------------------------------------------------------
102 |    */
103 | 
104 |   if((M <= 0) || (N <= 0)) return;
105 | 
106 |   hipStream_t stream;
107 |   CHECK_ROCBLAS_ERROR(rocblas_get_stream(handle, &stream));
108 | 
109 |   int grid_size = N;
110 |   dlaswp00N<<<grid_size, BLOCK_SIZE, 0, stream>>>(N, M, A, LDA, IPIV);
111 |   CHECK_HIP_ERROR(hipGetLastError());
112 | }
113 | 
--------------------------------------------------------------------------------
/src/pauxil/HPL_dlaswp01T_device.cpp:
--------------------------------------------------------------------------------
 1 | /* ---------------------------------------------------------------------
 2 |  * -- High Performance Computing Linpack Benchmark (HPL)
 3 |  *    HPL - 2.2 - February 24, 2016
 4 |  *    Antoine P. Petitet
 5 |  *    University of Tennessee, Knoxville
 6 |  *    Innovative Computing Laboratory
 7 |  *    (C) Copyright 2000-2008 All Rights Reserved
 8 |  *
 9 |  * Modified by: Noel Chalmers
10 |  * (C) 2018-2025 Advanced Micro Devices, Inc.
11 |  * See the rocHPL/LICENCE file for details.
12 |  *
13 |  * SPDX-License-Identifier: (BSD-3-Clause)
14 |  * ---------------------------------------------------------------------
15 |  */
16 | 
17 | #include "hpl.hpp"
18 | #include <hip/hip_runtime.h>
19 | 
20 | #define TILE_DIM 32
21 | #define BLOCK_ROWS 8
22 | 
23 | /* Build U matrix from rows of A */
24 | __global__ void dlaswp01T(const int M,
25 |                           const int N,
26 |                           double* __restrict__ A,
27 |                           const int LDA,
28 |                           double* __restrict__ U,
29 |                           const int LDU,
30 |                           const int* __restrict__ LINDXU) {
31 | 
32 |   __shared__ double s_U[TILE_DIM][TILE_DIM + 1];
33 | 
34 |   const int m = threadIdx.x + TILE_DIM * blockIdx.x;
35 |   const int n = threadIdx.y + TILE_DIM * blockIdx.y;
36 | 
37 |   if(m < M) {
38 |     const int ipa = LINDXU[m];
39 | 
40 |     // save in LDS for the moment
41 |     // possible cache-hits if ipas are close
42 |     s_U[threadIdx.x][threadIdx.y + 0] =
43 |         (n + 0 < N) ? A[ipa + (n + 0) * ((size_t)LDA)] : 0.0;
44 |     s_U[threadIdx.x][threadIdx.y + 8] =
45 |         (n + 8 < N) ? A[ipa + (n + 8) * ((size_t)LDA)] : 0.0;
46 |     s_U[threadIdx.x][threadIdx.y + 16] =
47 |         (n + 16 < N) ? A[ipa + (n + 16) * ((size_t)LDA)] : 0.0;
48 |     s_U[threadIdx.x][threadIdx.y + 24] =
49 |         (n + 24 < N) ? A[ipa + (n + 24) * ((size_t)LDA)] : 0.0;
50 |   }
51 | 
52 |   __syncthreads();
53 | 
54 |   const int um = threadIdx.y + TILE_DIM * blockIdx.x;
55 |   const int un = threadIdx.x + TILE_DIM * blockIdx.y;
56 | 
57 |   if(un < N) {
58 |     // write out chunks of U
59 |     if((um + 0) < M)
60 |       U[un + (um + 0) * ((size_t)LDU)] = s_U[threadIdx.y + 0][threadIdx.x];
61 |     if((um + 8) < M)
62 |       U[un + (um + 8) * ((size_t)LDU)] = s_U[threadIdx.y + 8][threadIdx.x];
63 |     if((um + 16) < M)
64 |       U[un + (um + 16) * ((size_t)LDU)] = s_U[threadIdx.y + 16][threadIdx.x];
65 |     if((um + 24) < M)
66 |       U[un + (um + 24) * ((size_t)LDU)] = s_U[threadIdx.y + 24][threadIdx.x];
67 |   }
68 | }
69 | 
70 | void HPL_dlaswp01T(const int  M,
71 |                    const int  N,
72 |                    double*    A,
73 |                    const int  LDA,
74 |                    double*    U,
75 |                    const int  LDU,
76 |                    const int* LINDXU) {
77 |   /*
78 |    * Purpose
79 |    * =======
80 |    *
81 |    * HPL_dlaswp01T copies scattered rows of A into an array U. The
82 |    * row offsets in A of the source rows are specified by LINDXU.
83 |    * Rows of A are stored as columns in U.
84 |    *
85 |    * Arguments
86 |    * =========
87 |    *
88 |    * M       (local input)                 const int
89 |    *         On entry, M specifies the number of rows of A that should be
90 |    *         copied into U. M must be at least zero.
91 |    *
92 |    * N       (local input)                 const int
93 |    *         On entry, N specifies the length of rows of A that should be
94 |    *         copied into U. N must be at least zero.
95 |    *
96 |    * A       (local input)                 double *
97 |    *         On entry, A points to an array of dimension (LDA,N). The rows
98 |    *         of this array specified by LINDXU are copied into U.
99 |    *
100 |    * LDA     (local input)                 const int
101 |    *         On entry, LDA specifies the leading dimension of the array A.
102 |    *         LDA must be at least MAX(1,M).
103 |    *
104 |    * U       (local output)                double *
105 |    *         On entry, U points to an array of dimension (LDU,M). On exit,
106 |    *         it contains the rows of A specified by LINDXU, stored as
107 |    *         columns.
108 |    *
109 |    * LDU     (local input)                 const int
110 |    *         On entry, LDU specifies the leading dimension of the array U.
111 |    *         LDU must be at least MAX(1,N).
112 |    *
113 |    * LINDXU  (local input)                 const int *
114 |    *         On entry, LINDXU is an array of dimension M that contains the
115 |    *         local row indexes of A that should be copied into U.
116 |    *
117 |    * ---------------------------------------------------------------------
118 |    */
119 |   /*
120 |    * .. Local Variables ..
121 |    */
122 | 
123 |   if((M <= 0) || (N <= 0)) return;
124 | 
125 |   dim3 grid_size((M + TILE_DIM - 1) / TILE_DIM, (N + TILE_DIM - 1) / TILE_DIM);
126 |   dim3 block_size(TILE_DIM, BLOCK_ROWS);
127 |   dlaswp01T<<<grid_size, block_size, 0, computeStream>>>(
128 |       M, N, A, LDA, U, LDU, LINDXU);
129 |   CHECK_HIP_ERROR(hipGetLastError());
130 | 
131 |   /*
132 |    * End of HPL_dlaswp01T
133 |    */
134 | }
135 | 
--------------------------------------------------------------------------------
/src/pauxil/HPL_dlaswp02T_device.cpp:
--------------------------------------------------------------------------------
 1 | /* ---------------------------------------------------------------------
 2 |  * -- High Performance Computing Linpack Benchmark (HPL)
 3 |  *    HPL - 2.2 - February 24, 2016
 4 |  *    Antoine P. Petitet
 5 |  *    University of Tennessee, Knoxville
 6 |  *    Innovative Computing Laboratory
 7 |  *    (C) Copyright 2000-2008 All Rights Reserved
 8 |  *
 9 |  * Modified by: Noel Chalmers
10 |  * (C) 2018-2025 Advanced Micro Devices, Inc.
11 |  * See the rocHPL/LICENCE file for details.
12 |  *
13 |  * SPDX-License-Identifier: (BSD-3-Clause)
14 |  * ---------------------------------------------------------------------
15 |  */
16 | 
17 | #include "hpl.hpp"
18 | #include <hip/hip_runtime.h>
19 | #include <cassert>
20 | 
21 | #define assertm(exp, msg) assert(((void)msg, exp))
22 | 
23 | /* Perform any local row swaps of A */
24 | __global__ void dlaswp02T(const int M,
25 |                           const int N,
26 |                           double* __restrict__ A,
27 |                           const int LDA,
28 |                           const int* __restrict__ LINDXAU,
29 |                           const int* __restrict__ LINDXA) {
30 | 
31 |   const int n = blockIdx.x;
32 |   const int m = threadIdx.x;
33 | 
34 |   const int ipau = LINDXAU[m]; // src row
35 |   const int ipa  = LINDXA[m];  // dst row
36 | 
37 |   const double An = A[ipau + n * ((size_t)LDA)];
38 | 
39 |   __syncthreads();
40 | 
41 |   A[ipa + n * ((size_t)LDA)] = An;
42 | }
43 | 
44 | void HPL_dlaswp02T(const int  M,
45 |                    const int  N,
46 |                    double*    A,
47 |                    const int  LDA,
48 |                    const int* LINDXAU,
49 |                    const int* LINDXA) {
50 |   /*
51 |    * Purpose
52 |    * =======
53 |    *
54 |    * HPL_dlaswp02T copies scattered rows of A into itself. The row
55 |    * offsets in A of the source rows are specified by LINDXAU, and the
56 |    * row offsets of their destinations by LINDXA.
57 |    *
58 |    * Arguments
59 |    * =========
60 |    *
61 |    * M       (local input)                 const int
62 |    *         On entry, M specifies the number of rows of A that should be
63 |    *         moved within A. M must be at least zero.
64 |    *
65 |    * N       (local input)                 const int
66 |    *         On entry, N specifies the length of rows of A that should be
67 |    *         moved within A. N must be at least zero.
68 |    *
69 |    * A       (local input/output)          double *
70 |    *         On entry, A points to an array of dimension (LDA,N). The rows
71 |    *         of this array specified by LINDXAU are moved within A.
72 |    *
73 |    * LDA     (local input)                 const int
74 |    *         On entry, LDA specifies the leading dimension of the array A.
75 |    *         LDA must be at least MAX(1,M).
76 |    *
77 |    * LINDXAU (local input)                 const int *
78 |    *         On entry, LINDXAU is an array of dimension M that contains the
79 |    *         local source row indexes of A that should be moved within A.
80 |    *
81 |    * LINDXA  (local input)                 const int *
82 |    *         On entry, LINDXA is an array of dimension M that contains the
83 |    *         local row indexes of A to which the rows should be copied.
84 |    *
85 |    * ---------------------------------------------------------------------
86 |    */
87 |   /*
88 |    * .. Local Variables ..
89 |    */
90 | 
91 |   if((M <= 0) || (N <= 0)) return;
92 | 
93 |   assertm(M <= 1024, "NB too large in HPL_dlaswp02T");
94 | 
95 |   dim3 grid_size(N);
96 |   dim3 block_size(M);
97 |   dlaswp02T<<<grid_size, block_size, 0, computeStream>>>(
98 |       M, N, A, LDA, LINDXAU, LINDXA);
99 |   CHECK_HIP_ERROR(hipGetLastError());
100 |   /*
101 |    * End of HPL_dlaswp02T
102 |    */
103 | }
104 | 
--------------------------------------------------------------------------------
/src/pauxil/HPL_dlaswp03T_device.cpp:
--------------------------------------------------------------------------------
 1 | /* ---------------------------------------------------------------------
 2 |  * -- High Performance Computing Linpack Benchmark (HPL)
 3 |  *    HPL - 2.2 - February 24, 2016
 4 |  *    Antoine P. Petitet
 5 |  *    University of Tennessee, Knoxville
 6 |  *    Innovative Computing Laboratory
 7 |  *    (C) Copyright 2000-2008 All Rights Reserved
 8 |  *
 9 |  * Modified by: Noel Chalmers
10 |  * (C) 2018-2025 Advanced Micro Devices, Inc.
11 |  * See the rocHPL/LICENCE file for details.
12 |  *
13 |  * SPDX-License-Identifier: (BSD-3-Clause)
14 |  * ---------------------------------------------------------------------
15 |  */
16 | 
17 | #include "hpl.hpp"
18 | #include <hip/hip_runtime.h>
19 | 
20 | #define TILE_DIM 32
21 | #define BLOCK_ROWS 8
22 | 
23 | /* Build W matrix from rows of A */
24 | __global__ void dlaswp03T(const int M,
25 |                           const int N,
26 |                           double* __restrict__ A,
27 |                           const int LDA,
28 |                           double* __restrict__ W,
29 |                           const int LDW,
30 |                           const int* __restrict__ LINDXU) {
31 | 
32 |   __shared__ double s_W[TILE_DIM][TILE_DIM + 1];
33 | 
34 |   const int m = threadIdx.x + TILE_DIM * blockIdx.x;
35 |   const int n = threadIdx.y + TILE_DIM * blockIdx.y;
36 | 
37 |   if(m < M) {
38 |     const int ipa = LINDXU[m];
39 | 
40 |     // save in LDS for the moment
41 |     // possible cache-hits if ipas are close
42 |     s_W[threadIdx.x][threadIdx.y + 0] =
43 |         (n + 0 < N) ? A[ipa + (n + 0) * ((size_t)LDA)] : 0.0;
44 |     s_W[threadIdx.x][threadIdx.y + 8] =
45 |         (n + 8 < N) ? A[ipa + (n + 8) * ((size_t)LDA)] : 0.0;
46 |     s_W[threadIdx.x][threadIdx.y + 16] =
47 |         (n + 16 < N) ? A[ipa + (n + 16) * ((size_t)LDA)] : 0.0;
48 |     s_W[threadIdx.x][threadIdx.y + 24] =
49 |         (n + 24 < N) ? A[ipa + (n + 24) * ((size_t)LDA)] : 0.0;
50 |   }
51 | 
52 |   __syncthreads();
53 | 
54 |   const int wm = threadIdx.y + TILE_DIM * blockIdx.x;
55 |   const int wn = threadIdx.x + TILE_DIM * blockIdx.y;
56 | 
57 |   if(wn < N) {
58 |     // write out chunks of W
59 |     if((wm + 0) < M)
60 |       W[wn + (wm + 0) * ((size_t)LDW)] = s_W[threadIdx.y + 0][threadIdx.x];
61 |     if((wm + 8) < M)
62 |       W[wn + (wm + 8) * ((size_t)LDW)] = s_W[threadIdx.y + 8][threadIdx.x];
63 |     if((wm + 16) < M)
64 |       W[wn + (wm + 16) * ((size_t)LDW)] = s_W[threadIdx.y + 16][threadIdx.x];
65 |     if((wm + 24) < M)
66 |       W[wn + (wm + 24) * ((size_t)LDW)] = s_W[threadIdx.y + 24][threadIdx.x];
67 |   }
68 | }
69 | 
70 | void HPL_dlaswp03T(const int  M,
71 |                    const int  N,
72 |                    double*    A,
73 |                    const int  LDA,
74 |                    double*    W,
75 |                    const int  LDW,
76 |                    const int* LINDXU) {
77 |   /*
78 |    * Purpose
79 |    * =======
80 |    *
81 |    * HPL_dlaswp03T packs scattered rows of an array A into workspace W.
82 |    * The row offsets in A are specified by LINDXU.
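   *
   * In effect (an editor's sketch of the data movement, not original
   * HPL text): for each m in [0..M), column m of W receives row
   * LINDXU[m] of A, i.e.
   *
   *   W[n + m*LDW] = A[LINDXU[m] + n*LDA]   for n in [0..N),
   *
   * a gather plus transpose, staged through 32x32 LDS tiles so that
   * both the row reads of A and the column writes of W stay coalesced.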
83 |    *
84 |    * Arguments
85 |    * =========
86 |    *
87 |    * M       (local input)                 const int
88 |    *         On entry, M specifies the number of rows of A that should be
89 |    *         copied into columns of W. M must be at least zero.
90 |    *
91 |    * N       (local input)                 const int
92 |    *         On entry, N specifies the length of the rows of A that should
93 |    *         be copied into W. N must be at least zero.
94 |    *
95 |    * A       (local input)                 double *
96 |    *         On entry, A points to an array of dimension (LDA,N) whose
97 |    *         rows specified by LINDXU are copied into W.
98 |    *
99 |    * LDA     (local input)                 const int
100 |    *         On entry, LDA specifies the leading dimension of the array A.
101 |    *         LDA must be at least MAX(1,M).
102 |    *
103 |    * W       (local output)                double *
104 |    *         On entry, W points to an array of dimension (LDW,*). On exit,
105 |    *         its columns contain the selected rows of A.
106 |    *
107 |    * LDW     (local input)                 const int
108 |    *         On entry, LDW specifies the leading dimension of the array W.
109 |    *         LDW must be at least MAX(1,N).
110 |    *
111 |    * LINDXU  (local input)                 const int *
112 |    *         On entry, LINDXU is an array of dimension M that contains the
113 |    *         local row indexes of A that should be copied into W.
114 |    *
115 |    * ---------------------------------------------------------------------
116 |    */
117 |   /*
118 |    * .. Local Variables ..
119 |    */
120 | 
121 |   if((M <= 0) || (N <= 0)) return;
122 | 
123 |   dim3 grid_size((M + TILE_DIM - 1) / TILE_DIM, (N + TILE_DIM - 1) / TILE_DIM);
124 |   dim3 block_size(TILE_DIM, BLOCK_ROWS);
125 |   dlaswp03T<<<grid_size, block_size, 0, computeStream>>>(
126 |       M, N, A, LDA, W, LDW, LINDXU);
127 |   CHECK_HIP_ERROR(hipGetLastError());
128 |   /*
129 |    * End of HPL_dlaswp03T
130 |    */
131 | }
132 | 
--------------------------------------------------------------------------------
/src/pauxil/HPL_dlaswp04T_device.cpp:
--------------------------------------------------------------------------------
 1 | /* ---------------------------------------------------------------------
 2 |  * -- High Performance Computing Linpack Benchmark (HPL)
 3 |  *    HPL - 2.2 - February 24, 2016
 4 |  *    Antoine P. Petitet
 5 |  *    University of Tennessee, Knoxville
 6 |  *    Innovative Computing Laboratory
 7 |  *    (C) Copyright 2000-2008 All Rights Reserved
 8 |  *
 9 |  * Modified by: Noel Chalmers
10 |  * (C) 2018-2025 Advanced Micro Devices, Inc.
11 |  * See the rocHPL/LICENCE file for details.
12 |  *
13 |  * SPDX-License-Identifier: (BSD-3-Clause)
14 |  * ---------------------------------------------------------------------
15 |  */
16 | 
17 | #include "hpl.hpp"
18 | #include <hip/hip_runtime.h>
19 | 
20 | #define TILE_DIM 32
21 | #define BLOCK_ROWS 8
22 | 
23 | static __global__ void dlaswp04T(const int M,
24 |                                  const int N,
25 |                                  double* __restrict__ A,
26 |                                  const int LDA,
27 |                                  double* __restrict__ W,
28 |                                  const int LDW,
29 |                                  const int* __restrict__ LINDXU) {
30 | 
31 |   __shared__ double s_W[TILE_DIM][TILE_DIM + 1];
32 | 
33 |   const int am = threadIdx.x + TILE_DIM * blockIdx.x;
34 |   const int an = threadIdx.y + TILE_DIM * blockIdx.y;
35 | 
36 |   const int wm = threadIdx.y + TILE_DIM * blockIdx.x;
37 |   const int wn = threadIdx.x + TILE_DIM * blockIdx.y;
38 | 
39 |   if(wn < N) {
40 |     s_W[threadIdx.y + 0][threadIdx.x] =
41 |         (wm + 0 < M) ? W[wn + (wm + 0) * ((size_t)LDW)] : 0.0;
42 |     s_W[threadIdx.y + 8][threadIdx.x] =
43 |         (wm + 8 < M) ? W[wn + (wm + 8) * ((size_t)LDW)] : 0.0;
44 |     s_W[threadIdx.y + 16][threadIdx.x] =
45 |         (wm + 16 < M) ? W[wn + (wm + 16) * ((size_t)LDW)] : 0.0;
46 |     s_W[threadIdx.y + 24][threadIdx.x] =
47 |         (wm + 24 < M) ?
W[wn + (wm + 24) * ((size_t)LDW)] : 0.0;
48 |   }
49 | 
50 |   __syncthreads();
51 | 
52 |   if(am < M) {
53 |     const int aip = LINDXU[am];
54 |     if((an + 0) < N)
55 |       A[aip + (an + 0) * ((size_t)LDA)] = s_W[threadIdx.x][threadIdx.y + 0];
56 |     if((an + 8) < N)
57 |       A[aip + (an + 8) * ((size_t)LDA)] = s_W[threadIdx.x][threadIdx.y + 8];
58 |     if((an + 16) < N)
59 |       A[aip + (an + 16) * ((size_t)LDA)] = s_W[threadIdx.x][threadIdx.y + 16];
60 |     if((an + 24) < N)
61 |       A[aip + (an + 24) * ((size_t)LDA)] = s_W[threadIdx.x][threadIdx.y + 24];
62 |   }
63 | }
64 | 
65 | void HPL_dlaswp04T(const int  M,
66 |                    const int  N,
67 |                    double*    A,
68 |                    const int  LDA,
69 |                    double*    W,
70 |                    const int  LDW,
71 |                    const int* LINDXU) {
72 |   /*
73 |    * Purpose
74 |    * =======
75 |    *
76 |    * HPL_dlaswp04T writes columns of W into rows of A at positions
77 |    * indicated by LINDXU.
78 |    *
79 |    * Arguments
80 |    * =========
81 |    *
82 |    * M       (local input)                 const int
83 |    *         On entry, M specifies the number of rows of A that should be
84 |    *         replaced with columns of W. M must be at least zero.
85 |    *
86 |    * N       (local input)                 const int
87 |    *         On entry, N specifies the length of the rows of A that should
88 |    *         be replaced with columns of W. N must be at least zero.
89 |    *
90 |    * A       (local output)                double *
91 |    *         On entry, A points to an array of dimension (LDA,N). On exit,
92 |    *         the rows of this array specified by LINDXU are replaced by
93 |    *         columns of W.
94 |    *
95 |    * LDA     (local input)                 const int
96 |    *         On entry, LDA specifies the leading dimension of the array A.
97 |    *         LDA must be at least MAX(1,M).
98 |    *
99 |    * W       (local input)                 double *
100 |    *         On entry, W points to an array of dimension (LDW,*). This
101 |    *         array contains the columns of W that are to be written to
102 |    *         rows of A.
103 |    *
104 |    * LDW     (local input)                 const int
105 |    *         On entry, LDW specifies the leading dimension of the array W.
106 |    *         LDW must be at least MAX(1,N).
107 |    *
108 |    * LINDXU  (local input)                 const int *
109 |    *         On entry, LINDXU is an array of dimension M that contains the
110 |    *         local row indexes of A that should be replaced with W.
111 |    *
112 |    * ---------------------------------------------------------------------
113 |    */
114 |   /*
115 |    * .. Local Variables ..
116 |    */
117 | 
118 |   if((M <= 0) || (N <= 0)) return;
119 | 
120 |   dim3 grid_size((M + TILE_DIM - 1) / TILE_DIM, (N + TILE_DIM - 1) / TILE_DIM);
121 |   dim3 block_size(TILE_DIM, BLOCK_ROWS);
122 |   dlaswp04T<<<grid_size, block_size, 0, computeStream>>>(
123 |       M, N, A, LDA, W, LDW, LINDXU);
124 |   CHECK_HIP_ERROR(hipGetLastError());
125 |   /*
126 |    * End of HPL_dlaswp04T
127 |    */
128 | }
129 | 
--------------------------------------------------------------------------------
/src/pauxil/HPL_dlaswp10N_device.cpp:
--------------------------------------------------------------------------------
 1 | /* ---------------------------------------------------------------------
 2 |  * -- High Performance Computing Linpack Benchmark (HPL)
 3 |  *    HPL - 2.2 - February 24, 2016
 4 |  *    Antoine P. Petitet
 5 |  *    University of Tennessee, Knoxville
 6 |  *    Innovative Computing Laboratory
 7 |  *    (C) Copyright 2000-2008 All Rights Reserved
 8 |  *
 9 |  * Modified by: Noel Chalmers
10 |  * (C) 2018-2025 Advanced Micro Devices, Inc.
11 |  * See the rocHPL/LICENCE file for details.
12 |  *
13 |  * SPDX-License-Identifier: (BSD-3-Clause)
14 |  * ---------------------------------------------------------------------
15 |  */
16 | 
17 | #include "hpl.hpp"
18 | #include <hip/hip_runtime.h>
19 | 
20 | #define BLOCK_SIZE 512
21 | 
22 | __global__ void dlaswp10N(const int M,
23 |                           const int N,
24 |                           double* __restrict__ A,
25 |                           const int LDA,
26 |                           const int* __restrict__ IPIV) {
27 | 
28 |   const int m = threadIdx.x + BLOCK_SIZE * blockIdx.x;
29 | 
30 |   if(m < M) {
31 |     for(int i = 0; i < N; i++) {
32 |       const int ip = IPIV[i];
33 | 
34 |       if(ip != i) {
35 |         // swap
36 |         const double Ai  = A[m + i * ((size_t)LDA)];
37 |         const double Aip = A[m + ip * ((size_t)LDA)];
38 |         A[m + i * ((size_t)LDA)]  = Aip;
39 |         A[m + ip * ((size_t)LDA)] = Ai;
40 |       }
41 |     }
42 |   }
43 | }
44 | 
45 | void HPL_dlaswp10N(const int  M,
46 |                    const int  N,
47 |                    double*    A,
48 |                    const int  LDA,
49 |                    const int* IPIV) {
50 |   /*
51 |    * Purpose
52 |    * =======
53 |    *
54 |    * HPL_dlaswp10N performs a sequence of local column interchanges on a
55 |    * matrix A. One column interchange is initiated for columns 0 through
56 |    * N-1 of A.
57 |    *
58 |    * Arguments
59 |    * =========
60 |    *
61 |    * M       (local input)                 const int
62 |    *         On entry, M specifies the number of rows of the array A. M
63 |    *         must be at least zero.
64 |    *
65 |    * N       (local input)                 const int
66 |    *         On entry, N specifies the number of columns of the array A. N
67 |    *         must be at least zero.
68 |    *
69 |    * A       (local input/output)          double *
70 |    *         On entry, A points to an array of dimension (LDA,N). This
71 |    *         array contains the columns onto which the interchanges should
72 |    *         be applied. On exit, A contains the permuted matrix.
73 |    *
74 |    * LDA     (local input)                 const int
75 |    *         On entry, LDA specifies the leading dimension of the array A.
76 |    *         LDA must be at least MAX(1,M).
77 |    *
78 |    * IPIV    (local input)                 const int *
79 |    *         On entry, IPIV is an array of size N that contains the column
80 |    *         interchange information: for i in [0..N), columns i and
81 |    *         IPIV[i] are interchanged.
82 |    *
83 |    * ---------------------------------------------------------------------
84 |    */
85 | 
86 |   if((M <= 0) || (N <= 0)) return;
87 | 
88 |   hipStream_t stream;
89 |   CHECK_ROCBLAS_ERROR(rocblas_get_stream(handle, &stream));
90 | 
91 |   dim3 grid_size((M + BLOCK_SIZE - 1) / BLOCK_SIZE);
92 |   dlaswp10N<<<grid_size, dim3(BLOCK_SIZE), 0, stream>>>(M, N, A, LDA, IPIV);
93 |   CHECK_HIP_ERROR(hipGetLastError());
94 | }
95 | 
--------------------------------------------------------------------------------
/src/pauxil/HPL_indxg2l.cpp:
--------------------------------------------------------------------------------
 1 | /* ---------------------------------------------------------------------
 2 |  * -- High Performance Computing Linpack Benchmark (HPL)
 3 |  *    HPL - 2.2 - February 24, 2016
 4 |  *    Antoine P. Petitet
 5 |  *    University of Tennessee, Knoxville
 6 |  *    Innovative Computing Laboratory
 7 |  *    (C) Copyright 2000-2008 All Rights Reserved
 8 |  *
 9 |  * Modified by: Noel Chalmers
10 |  * (C) 2018-2025 Advanced Micro Devices, Inc.
11 |  * See the rocHPL/LICENCE file for details.
12 |  *
13 |  * SPDX-License-Identifier: (BSD-3-Clause)
14 |  * ---------------------------------------------------------------------
15 |  */
16 | 
17 | #include "hpl.hpp"
18 | 
19 | int HPL_indxg2l(const int IG,
20 |                 const int INB,
21 |                 const int NB,
22 |                 const int SRCPROC,
23 |                 const int NPROCS) {
24 |   /*
25 |    * Purpose
26 |    * =======
27 |    *
28 |    * HPL_indxg2l computes the local index of a matrix entry pointed to by
29 |    * the global index IG. This local returned index is the same in all
30 |    * processes.
31 |    *
32 |    * Arguments
33 |    * =========
34 |    *
35 |    * IG      (input)                       const int
36 |    *         On entry, IG specifies the global index of the matrix entry.
37 | * IG must be at least zero. 38 | * 39 | * INB (input) const int 40 | * On entry, INB specifies the size of the first block of the 41 | * global matrix. INB must be at least one. 42 | * 43 | * NB (input) const int 44 | * On entry, NB specifies the blocking factor used to partition 45 | * and distribute the matrix. NB must be larger than one. 46 | * 47 | * SRCPROC (input) const int 48 | * On entry, if SRCPROC = -1, the data is not distributed but 49 | * replicated, in which case this routine returns IG in all 50 | * processes. Otherwise, the value of SRCPROC is ignored. 51 | * 52 | * NPROCS (input) const int 53 | * On entry, NPROCS specifies the total number of process rows 54 | * or columns over which the matrix is distributed. NPROCS must 55 | * be at least one. 56 | * 57 | * --------------------------------------------------------------------- 58 | */ 59 | 60 | int i, j; 61 | 62 | if((IG < INB) || (SRCPROC == -1) || (NPROCS == 1)) 63 | /* 64 | * IG belongs to the first block, or the data is not distributed, or 65 | * there is just one process in this dimension of the grid. 66 | */ 67 | return (IG); 68 | /* 69 | * IG = INB - NB + ( l * NPROCS + MYROC ) * NB + X with 0 <= X < NB, 70 | * thus IG is to be found in the block (IG-INB+NB) / NB = l*NPROCS+MYROC 71 | * with 0 <= MYROC < NPROCS. The local index to be returned depends on 72 | * whether IG resides in the process owning the first partial block of 73 | * size INB (MYROC=0). To determine this cheaply, let i = (IG-INB) / NB, 74 | * so that if NPROCS divides i+1, i.e. MYROC=0, we have i+1 = l*NPROCS. 75 | * If we set j = i / NPROCS, it follows that j = l-1. Therefore, i+1 is 76 | * equal to (j+1) * NPROCS. Conversely, if NPROCS does not divide i+1, 77 | * then i+1 = l*NPROCS + MYROC with 1 <= MYROC < NPROCS. It follows that 78 | * j=l and thus (j+1)*NPROCS > i+1. 79 | */ 80 | j = (i = (IG - INB) / NB) / NPROCS; 81 | /* 82 | * When IG resides in the process owning the first partial block of size 83 | * INB (MYROC = 0), then the result IL can be written as: 84 | * IL = INB - NB + l * NB + X = IG + ( l - (l * NPROCS + MYROC) ) * NB. 85 | * Using the above notation, we have i+1 = l*NPROCS + MYROC = l*NPROCS, 86 | * i.e l = ( i+1 ) / NPROCS = j+1, since NPROCS divides i+1, therefore 87 | * IL = IG + ( j + 1 - ( i + 1 ) ) * NB. 88 | * 89 | * Otherwise when MYROC >= 1, the result IL can be written as: 90 | * IL = l * NB + X = IG - INB + ( ( l+1 ) - ( l * NPROCS + MYROC ) )*NB. 91 | * We still have i+1 = l*NPROCS+MYROC. Since NPROCS does not divide i+1, 92 | * we have j = (l*NPROCS+MYROC-1) / NPROCS = l, i.e 93 | * IL = IG - INB + ( j + 1 - ( i + 1 ) ) * NB. 94 | */ 95 | return (NB * (j - i) + ((i + 1 - (j + 1) * NPROCS) ? IG - INB : IG)); 96 | } 97 | -------------------------------------------------------------------------------- /src/pauxil/HPL_indxg2lp.cpp: -------------------------------------------------------------------------------- 1 | /* --------------------------------------------------------------------- 2 | * -- High Performance Computing Linpack Benchmark (HPL) 3 | * HPL - 2.2 - February 24, 2016 4 | * Antoine P. Petitet 5 | * University of Tennessee, Knoxville 6 | * Innovative Computing Laboratory 7 | * (C) Copyright 2000-2008 All Rights Reserved 8 | * 9 | * Modified by: Noel Chalmers 10 | * (C) 2018-2025 Advanced Micro Devices, Inc. 11 | * See the rocHPL/LICENCE file for details. 
12 | * 13 | * SPDX-License-Identifier: (BSD-3-Clause) 14 | * --------------------------------------------------------------------- 15 | */ 16 | 17 | #include "hpl.hpp" 18 | 19 | void HPL_indxg2lp(int* IL, 20 | int* PROC, 21 | const int IG, 22 | const int INB, 23 | const int NB, 24 | const int SRCPROC, 25 | const int NPROCS) { 26 | /* 27 | * Purpose 28 | * ======= 29 | * 30 | * HPL_indxg2lp computes the local index of a matrix entry pointed to by 31 | * the global index IG as well as the process coordinate which possesses 32 | * this entry. The returned local index is the same in all processes. 33 | * 34 | * Arguments 35 | * ========= 36 | * 37 | * IL (output) int * 38 | * On exit, IL specifies the local index corresponding to IG. IL 39 | * is at least zero. 40 | * 41 | * PROC (output) int * 42 | * On exit, PROC is the coordinate of the process owning the 43 | * entry specified by the global index IG. PROC is at least zero 44 | * and less than NPROCS. 45 | * 46 | * IG (input) const int 47 | * On entry, IG specifies the global index of the matrix entry. 48 | * IG must be at least zero. 49 | * 50 | * INB (input) const int 51 | * On entry, INB specifies the size of the first block of the 52 | * global matrix. INB must be at least one. 53 | * 54 | * NB (input) const int 55 | * On entry, NB specifies the blocking factor used to partition 56 | * and distribute the matrix A. NB must be larger than one. 57 | * 58 | * SRCPROC (input) const int 59 | * On entry, if SRCPROC = -1, the data is not distributed but 60 | * replicated, in which case this routine returns IG in all 61 | * processes. Otherwise, the value of SRCPROC is ignored. 62 | * 63 | * NPROCS (input) const int 64 | * On entry, NPROCS specifies the total number of process rows 65 | * or columns over which the matrix is distributed. NPROCS must 66 | * be at least one. 67 | * 68 | * --------------------------------------------------------------------- 69 | */ 70 | 71 | int i, j; 72 | 73 | if((IG < INB) || (SRCPROC == -1) || (NPROCS == 1)) { 74 | /* 75 | * IG belongs to the first block, or the data is not distributed, or 76 | * there is just one process in this dimension of the grid. 77 | */ 78 | *IL = IG; 79 | *PROC = SRCPROC; 80 | } else { 81 | /* 82 | * IG = INB - NB + ( l * NPROCS + MYROC ) * NB + X with 0 <= X < NB, 83 | * thus IG is to be found in the block (IG-INB+NB) / NB = l*NPROCS+MYROC 84 | * with 0 <= MYROC < NPROCS. The local index to be returned depends on 85 | * whether IG resides in the process owning the first partial block of 86 | * size INB (MYROC=0). To determine this cheaply, let i = (IG-INB) / NB, 87 | * so that if NPROCS divides i+1, i.e. MYROC=0, we have i+1 = l*NPROCS. 88 | * If we set j = i / NPROCS, it follows that j = l-1. Therefore, i+1 is 89 | * equal to (j+1) * NPROCS. Conversely, if NPROCS does not divide i+1, 90 | * then i+1 = l*NPROCS + MYROC with 1 <= MYROC < NPROCS. It follows that 91 | * j=l and thus (j+1)*NPROCS > i+1. 92 | */ 93 | j = (i = (IG - INB) / NB) / NPROCS; 94 | /* 95 | * IG is in block 1 + ( IG - INB ) / NB. Add this to SRCPROC and take 96 | * the NPROCS modulo (definition of the block-cyclic data distribution). 97 | */ 98 | *PROC = SRCPROC + 1 + i; 99 | *PROC = MPosMod(*PROC, NPROCS); 100 | /* 101 | * When IG resides in the process owning the first partial block of size 102 | * INB (MYROC = 0), then the result IL can be written as: 103 | * IL = INB - NB + l * NB + X = IG + ( l - (l * NPROCS + MYROC) ) * NB.
104 | * Using the above notation, we have i+1 = l*NPROCS + MYROC = l*NPROCS, 105 | * i.e. l = ( i+1 ) / NPROCS = j+1, since NPROCS divides i+1, therefore 106 | * IL = IG + ( j + 1 - ( i + 1 ) ) * NB. 107 | * 108 | * Otherwise when MYROC >= 1, the result IL can be written as: 109 | * IL = l * NB + X = IG - INB + ( ( l+1 ) - ( l * NPROCS + MYROC ) )*NB. 110 | * We still have i+1 = l*NPROCS+MYROC. Since NPROCS does not divide i+1, 111 | * we have j = (l*NPROCS+MYROC-1) / NPROCS = l, i.e. 112 | * IL = IG - INB + ( j + 1 - ( i + 1 ) ) * NB. 113 | */ 114 | *IL = NB * (j - i) + ((i + 1 - (j + 1) * NPROCS) ? IG - INB : IG); 115 | } 116 | } 117 | -------------------------------------------------------------------------------- /src/pauxil/HPL_indxg2p.cpp: -------------------------------------------------------------------------------- 1 | /* --------------------------------------------------------------------- 2 | * -- High Performance Computing Linpack Benchmark (HPL) 3 | * HPL - 2.2 - February 24, 2016 4 | * Antoine P. Petitet 5 | * University of Tennessee, Knoxville 6 | * Innovative Computing Laboratory 7 | * (C) Copyright 2000-2008 All Rights Reserved 8 | * 9 | * Modified by: Noel Chalmers 10 | * (C) 2018-2025 Advanced Micro Devices, Inc. 11 | * See the rocHPL/LICENCE file for details. 12 | * 13 | * SPDX-License-Identifier: (BSD-3-Clause) 14 | * --------------------------------------------------------------------- 15 | */ 16 | 17 | #include "hpl.hpp" 18 | 19 | int HPL_indxg2p(const int IG, 20 | const int INB, 21 | const int NB, 22 | const int SRCPROC, 23 | const int NPROCS) { 24 | /* 25 | * Purpose 26 | * ======= 27 | * 28 | * HPL_indxg2p computes the process coordinate which possesses the entry 29 | * of a matrix specified by a global index IG. 30 | * 31 | * Arguments 32 | * ========= 33 | * 34 | * IG (input) const int 35 | * On entry, IG specifies the global index of the matrix entry. 36 | * IG must be at least zero. 37 | * 38 | * INB (input) const int 39 | * On entry, INB specifies the size of the first block of the 40 | * global matrix. INB must be at least one. 41 | * 42 | * NB (input) const int 43 | * On entry, NB specifies the blocking factor used to partition 44 | * and distribute the matrix A. NB must be larger than one. 45 | * 46 | * SRCPROC (input) const int 47 | * On entry, SRCPROC specifies the coordinate of the process 48 | * that possesses the first row or column of the matrix. SRCPROC 49 | * must be at least zero and strictly less than NPROCS. 50 | * 51 | * NPROCS (input) const int 52 | * On entry, NPROCS specifies the total number of process rows 53 | * or columns over which the matrix is distributed. NPROCS must 54 | * be at least one. 55 | * 56 | * --------------------------------------------------------------------- 57 | */ 58 | 59 | int proc; 60 | 61 | if((IG < INB) || (SRCPROC == -1) || (NPROCS == 1)) 62 | /* 63 | * IG belongs to the first block, or the data is not distributed, or 64 | * there is just one process in this dimension of the grid. 65 | */ 66 | return (SRCPROC); 67 | /* 68 | * Otherwise, IG is in block 1 + ( IG - INB ) / NB. Add this to SRCPROC 69 | * and take the NPROCS modulo (definition of the block-cyclic data dis- 70 | * tribution).
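 *
 * For illustration (worked example added editorially): with INB = 2,
 * NB = 3, SRCPROC = 0 and NPROCS = 4, global rows 0-1 form the first
 * block on process 0, rows 2-4 the next block on process 1, and so on.
 * IG = 10 then lies in block 1 + (10-2)/3 = 3 and is owned by process
 * MPosMod(0+3, 4) = 3, while IG = 13 lies in block 4 and wraps around
 * to process 0.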
71 | */ 72 | proc = SRCPROC + 1 + (IG - INB) / NB; 73 | return (MPosMod(proc, NPROCS)); 74 | } 75 | -------------------------------------------------------------------------------- /src/pauxil/HPL_indxl2g.cpp: -------------------------------------------------------------------------------- 1 | /* --------------------------------------------------------------------- 2 | * -- High Performance Computing Linpack Benchmark (HPL) 3 | * HPL - 2.2 - February 24, 2016 4 | * Antoine P. Petitet 5 | * University of Tennessee, Knoxville 6 | * Innovative Computing Laboratory 7 | * (C) Copyright 2000-2008 All Rights Reserved 8 | * 9 | * Modified by: Noel Chalmers 10 | * (C) 2018-2025 Advanced Micro Devices, Inc. 11 | * See the rocHPL/LICENCE file for details. 12 | * 13 | * SPDX-License-Identifier: (BSD-3-Clause) 14 | * --------------------------------------------------------------------- 15 | */ 16 | 17 | #include "hpl.hpp" 18 | 19 | int HPL_indxl2g(const int IL, 20 | const int INB, 21 | const int NB, 22 | const int PROC, 23 | const int SRCPROC, 24 | const int NPROCS) { 25 | /* 26 | * Purpose 27 | * ======= 28 | * 29 | * HPL_indxl2g computes the global index of a matrix entry pointed to 30 | * by the local index IL of the process indicated by PROC. 31 | * 32 | * Arguments 33 | * ========= 34 | * 35 | * IL (input) const int 36 | * On entry, IL specifies the local index of the matrix entry. 37 | * IL must be at least zero. 38 | * 39 | * INB (input) const int 40 | * On entry, INB specifies the size of the first block of the 41 | * global matrix. INB must be at least one. 42 | * 43 | * NB (input) const int 44 | * On entry, NB specifies the blocking factor used to partition 45 | * and distribute the matrix A. NB must be larger than one. 46 | * 47 | * PROC (input) const int 48 | * On entry, PROC specifies the coordinate of the process whose 49 | * local array row or column is to be determined. PROC must be 50 | * at least zero and strictly less than NPROCS. 51 | * 52 | * SRCPROC (input) const int 53 | * On entry, SRCPROC specifies the coordinate of the process 54 | * that possesses the first row or column of the matrix. SRCPROC 55 | * must be at least zero and strictly less than NPROCS. 56 | * 57 | * NPROCS (input) const int 58 | * On entry, NPROCS specifies the total number of process rows 59 | * or columns over which the matrix is distributed. NPROCS must 60 | * be at least one. 61 | * 62 | * --------------------------------------------------------------------- 63 | */ 64 | 65 | if((SRCPROC == -1) || (NPROCS == 1)) { 66 | /* 67 | * The data is not distributed, or there is just one process in this di- 68 | * mension of the grid. 69 | */ 70 | return (IL); 71 | } else if(PROC == SRCPROC) { 72 | /* 73 | * If I am SRCPROC, my first block is of size INB 74 | */ 75 | if(IL < INB) 76 | /* 77 | * If IL belongs to the first block, the local and global indexes are 78 | * equal. 79 | */ 80 | return (IL); 81 | /* 82 | * The number of entire blocks before the one IL belongs to is 83 | * ( IL - INB ) / NB + 1. In the other NPROCS-1 processes, there are 84 | * thus NB*( ( IL-INB )/NB + 1 ) entries, that are globally before the 85 | * global entry corresponding to IL. 86 | */ 87 | return ((NPROCS - 1) * NB * ((IL - INB) / NB + 1) + IL); 88 | } else if(PROC < SRCPROC) { 89 | /* 90 | * Otherwise, the process of coordinate MOD(SRCPROC+1, NPROCS) owns the 91 | * second block. 
Let IPROC = PROC-SRCPROC-1+NPROCS be the number of pro- 92 | * cesses between this process and PROC not included when going from 93 | * left to right on the process line with possible wrap around. These 94 | * IPROC processes have one more NB block than the other processes, who 95 | * own IL / NB blocks of size NB. 96 | */ 97 | return (NB * ((NPROCS - 1) * (IL / NB) + PROC - SRCPROC - 1 + NPROCS) + IL + 98 | INB); 99 | } else { 100 | /* 101 | * Same reasoning as above with IPROC = PROC - SRCPROC - 1. 102 | */ 103 | return (NB * ((NPROCS - 1) * (IL / NB) + PROC - SRCPROC - 1) + IL + INB); 104 | } 105 | } 106 | -------------------------------------------------------------------------------- /src/pauxil/HPL_numroc.cpp: -------------------------------------------------------------------------------- 1 | /* --------------------------------------------------------------------- 2 | * -- High Performance Computing Linpack Benchmark (HPL) 3 | * HPL - 2.2 - February 24, 2016 4 | * Antoine P. Petitet 5 | * University of Tennessee, Knoxville 6 | * Innovative Computing Laboratory 7 | * (C) Copyright 2000-2008 All Rights Reserved 8 | * 9 | * Modified by: Noel Chalmers 10 | * (C) 2018-2025 Advanced Micro Devices, Inc. 11 | * See the rocHPL/LICENCE file for details. 12 | * 13 | * SPDX-License-Identifier: (BSD-3-Clause) 14 | * --------------------------------------------------------------------- 15 | */ 16 | 17 | #include "hpl.hpp" 18 | 19 | int HPL_numroc(const int N, 20 | const int INB, 21 | const int NB, 22 | const int PROC, 23 | const int SRCPROC, 24 | const int NPROCS) { 25 | /* 26 | * Purpose 27 | * ======= 28 | * 29 | * HPL_numroc returns the local number of matrix rows/columns process 30 | * PROC will get if we give out N rows/columns starting from global 31 | * index 0. 32 | * 33 | * Arguments 34 | * ========= 35 | * 36 | * N (input) const int 37 | * On entry, N specifies the number of rows/columns being dealt 38 | * out. N must be at least zero. 39 | * 40 | * INB (input) const int 41 | * On entry, INB specifies the size of the first block of the 42 | * global matrix. INB must be at least one. 43 | * 44 | * NB (input) const int 45 | * On entry, NB specifies the blocking factor used to partition 46 | * and distribute the matrix A. NB must be larger than one. 47 | * 48 | * PROC (input) const int 49 | * On entry, PROC specifies the coordinate of the process whose 50 | * local portion is determined. PROC must be at least zero and 51 | * strictly less than NPROCS. 52 | * 53 | * SRCPROC (input) const int 54 | * On entry, SRCPROC specifies the coordinate of the process 55 | * that possesses the first row or column of the matrix. SRCPROC 56 | * must be at least zero and strictly less than NPROCS. 57 | * 58 | * NPROCS (input) const int 59 | * On entry, NPROCS specifies the total number of process rows 60 | * or columns over which the matrix is distributed. NPROCS must 61 | * be at least one. 62 | * 63 | * --------------------------------------------------------------------- 64 | */ 65 | 66 | return (HPL_numrocI(N, 0, INB, NB, PROC, SRCPROC, NPROCS)); 67 | } 68 | -------------------------------------------------------------------------------- /src/pauxil/HPL_pabort.cpp: -------------------------------------------------------------------------------- 1 | /* --------------------------------------------------------------------- 2 | * -- High Performance Computing Linpack Benchmark (HPL) 3 | * HPL - 2.2 - February 24, 2016 4 | * Antoine P. 
Petitet 5 | * University of Tennessee, Knoxville 6 | * Innovative Computing Laboratory 7 | * (C) Copyright 2000-2008 All Rights Reserved 8 | * 9 | * Modified by: Noel Chalmers 10 | * (C) 2018-2025 Advanced Micro Devices, Inc. 11 | * See the rocHPL/LICENCE file for details. 12 | * 13 | * SPDX-License-Identifier: (BSD-3-Clause) 14 | * --------------------------------------------------------------------- 15 | */ 16 | 17 | #include "hpl.hpp" 18 | 19 | void HPL_pabort(int LINE, const char* SRNAME, const char* FORM, ...) { 20 | /* 21 | * Purpose 22 | * ======= 23 | * 24 | * HPL_pabort displays an error message on stderr and halts execution. 25 | * 26 | * 27 | * Arguments 28 | * ========= 29 | * 30 | * LINE (local input) int 31 | * On entry, LINE specifies the line number in the file where 32 | * the error has occurred. When LINE is not a positive line 33 | * number, it is ignored. 34 | * 35 | * SRNAME (local input) const char * 36 | * On entry, SRNAME should be the name of the routine calling 37 | * this error handler. 38 | * 39 | * FORM (local input) const char * 40 | * On entry, FORM specifies the format, i.e., how the subsequent 41 | * arguments are converted for output. 42 | * 43 | * (local input) ... 44 | * On entry, ... is the list of arguments to be printed within 45 | * the format string. 46 | * 47 | * --------------------------------------------------------------------- 48 | */ 49 | 50 | va_list argptr; 51 | int rank; 52 | char cline[128]; 53 | 54 | va_start(argptr, FORM); 55 | (void)vsnprintf(cline, sizeof(cline), FORM, argptr); /* bounded to avoid overflowing cline */ 56 | va_end(argptr); 57 | 58 | MPI_Comm_rank(MPI_COMM_WORLD, &rank); 59 | /* 60 | * Display an error message 61 | */ 62 | if(LINE <= 0) 63 | HPL_fprintf(stderr, 64 | "%s %s %d, %s %s:\n>>> %s <<< Abort ...\n\n", 65 | "HPL ERROR", 66 | "from process #", 67 | rank, 68 | "in function", 69 | SRNAME, 70 | cline); 71 | else 72 | HPL_fprintf(stderr, 73 | "%s %s %d, %s %d %s %s:\n>>> %s <<< Abort ...\n\n", 74 | "HPL ERROR", 75 | "from process #", 76 | rank, 77 | "on line", 78 | LINE, 79 | "of function", 80 | SRNAME, 81 | cline); 82 | 83 | MPI_Abort(MPI_COMM_WORLD, -1); 84 | exit(-1); 85 | } 86 | -------------------------------------------------------------------------------- /src/pauxil/HPL_pdlamch.cpp: -------------------------------------------------------------------------------- 1 | /* --------------------------------------------------------------------- 2 | * -- High Performance Computing Linpack Benchmark (HPL) 3 | * HPL - 2.2 - February 24, 2016 4 | * Antoine P. Petitet 5 | * University of Tennessee, Knoxville 6 | * Innovative Computing Laboratory 7 | * (C) Copyright 2000-2008 All Rights Reserved 8 | * 9 | * Modified by: Noel Chalmers 10 | * (C) 2018-2025 Advanced Micro Devices, Inc. 11 | * See the rocHPL/LICENCE file for details.
12 | * 13 | * SPDX-License-Identifier: (BSD-3-Clause) 14 | * --------------------------------------------------------------------- 15 | */ 16 | 17 | #include "hpl.hpp" 18 | 19 | double HPL_pdlamch(MPI_Comm COMM, const HPL_T_MACH CMACH) { 20 | /* 21 | * Purpose 22 | * ======= 23 | * 24 | * HPL_pdlamch determines machine-specific arithmetic constants such as 25 | * the relative machine precision (eps), the safe minimum (sfmin) such that 26 | * 1/sfmin does not overflow, the base of the machine (base), the precision 27 | * (prec), the number of (base) digits in the mantissa (t), whether 28 | * rounding occurs in addition (rnd = 1.0 and 0.0 otherwise), the minimum 29 | * exponent before (gradual) underflow (emin), the underflow threshold 30 | * (rmin) = base**(emin-1), the largest exponent before overflow (emax), the 31 | * overflow threshold (rmax) = (base**emax)*(1-eps). 32 | * 33 | * Arguments 34 | * ========= 35 | * 36 | * COMM (global/local input) MPI_Comm 37 | * The MPI communicator identifying the process collection. 38 | * 39 | * CMACH (global input) const HPL_T_MACH 40 | * Specifies the value to be returned by HPL_pdlamch 41 | * = HPL_MACH_EPS, HPL_pdlamch := eps (default) 42 | * = HPL_MACH_SFMIN, HPL_pdlamch := sfmin 43 | * = HPL_MACH_BASE, HPL_pdlamch := base 44 | * = HPL_MACH_PREC, HPL_pdlamch := eps*base 45 | * = HPL_MACH_MLEN, HPL_pdlamch := t 46 | * = HPL_MACH_RND, HPL_pdlamch := rnd 47 | * = HPL_MACH_EMIN, HPL_pdlamch := emin 48 | * = HPL_MACH_RMIN, HPL_pdlamch := rmin 49 | * = HPL_MACH_EMAX, HPL_pdlamch := emax 50 | * = HPL_MACH_RMAX, HPL_pdlamch := rmax 51 | * 52 | * where 53 | * 54 | * eps = relative machine precision, 55 | * sfmin = safe minimum, 56 | * base = base of the machine, 57 | * prec = eps*base, 58 | * t = number of digits in the mantissa, 59 | * rnd = 1.0 if rounding occurs in addition, 60 | * emin = minimum exponent before underflow, 61 | * rmin = underflow threshold, 62 | * emax = largest exponent before overflow, 63 | * rmax = overflow threshold. 64 | * 65 | * --------------------------------------------------------------------- 66 | */ 67 | 68 | double param; 69 | 70 | param = HPL_dlamch(CMACH); 71 | 72 | switch(CMACH) { 73 | case HPL_MACH_EPS: 74 | case HPL_MACH_SFMIN: 75 | case HPL_MACH_EMIN: 76 | case HPL_MACH_RMIN: 77 | (void)HPL_all_reduce((void*)(&param), 1, HPL_DOUBLE, HPL_MAX, COMM); 78 | break; 79 | case HPL_MACH_EMAX: 80 | case HPL_MACH_RMAX: 81 | (void)HPL_all_reduce((void*)(&param), 1, HPL_DOUBLE, HPL_MIN, COMM); 82 | break; 83 | default: break; 84 | } 85 | 86 | return (param); 87 | } 88 | -------------------------------------------------------------------------------- /src/pauxil/HPL_pwarn.cpp: -------------------------------------------------------------------------------- 1 | /* --------------------------------------------------------------------- 2 | * -- High Performance Computing Linpack Benchmark (HPL) 3 | * HPL - 2.2 - February 24, 2016 4 | * Antoine P. Petitet 5 | * University of Tennessee, Knoxville 6 | * Innovative Computing Laboratory 7 | * (C) Copyright 2000-2008 All Rights Reserved 8 | * 9 | * Modified by: Noel Chalmers 10 | * (C) 2018-2025 Advanced Micro Devices, Inc. 11 | * See the rocHPL/LICENCE file for details. 12 | * 13 | * SPDX-License-Identifier: (BSD-3-Clause) 14 | * --------------------------------------------------------------------- 15 | */ 16 | 17 | #include "hpl.hpp" 18 | 19 | void HPL_pwarn(FILE* STREAM, 20 | int LINE, 21 | const char* SRNAME, 22 | const char* FORM, 23 | ...)
{ 24 | /* 25 | * Purpose 26 | * ======= 27 | * 28 | * HPL_pwarn displays an error message. 29 | * 30 | * 31 | * Arguments 32 | * ========= 33 | * 34 | * STREAM (local input) FILE * 35 | * On entry, STREAM specifies the output stream. 36 | * 37 | * LINE (local input) int 38 | * On entry, LINE specifies the line number in the file where 39 | * the error has occurred. When LINE is not a positive line 40 | * number, it is ignored. 41 | * 42 | * SRNAME (local input) const char * 43 | * On entry, SRNAME should be the name of the routine calling 44 | * this error handler. 45 | * 46 | * FORM (local input) const char * 47 | * On entry, FORM specifies the format, i.e., how the subsequent 48 | * arguments are converted for output. 49 | * 50 | * (local input) ... 51 | * On entry, ... is the list of arguments to be printed within 52 | * the format string. 53 | * 54 | * --------------------------------------------------------------------- 55 | */ 56 | 57 | va_list argptr; 58 | int rank; 59 | char cline[128]; 60 | 61 | va_start(argptr, FORM); 62 | (void)vsnprintf(cline, sizeof(cline), FORM, argptr); /* bounded to avoid overflowing cline */ 63 | va_end(argptr); 64 | 65 | MPI_Comm_rank(MPI_COMM_WORLD, &rank); 66 | /* 67 | * Display an error message 68 | */ 69 | if(LINE <= 0) { 70 | HPL_fprintf(STREAM, 71 | "%s %s %d, %s %s:\n>>> %s <<<\n\n", 72 | "HPL ERROR", 73 | "from process #", 74 | rank, 75 | "in function", 76 | SRNAME, 77 | cline); 78 | if(STREAM != stderr) { 79 | HPL_fprintf(stderr, 80 | "%s %s %d, %s %s:\n>>> %s <<<\n\n", 81 | "HPL ERROR", 82 | "from process #", 83 | rank, 84 | "in function", 85 | SRNAME, 86 | cline); 87 | } 88 | } else { 89 | HPL_fprintf(STREAM, 90 | "%s %s %d, %s %d %s %s:\n>>> %s <<<\n\n", 91 | "HPL ERROR", 92 | "from process #", 93 | rank, 94 | "on line", 95 | LINE, 96 | "of function", 97 | SRNAME, 98 | cline); 99 | 100 | if(STREAM != stderr) { 101 | HPL_fprintf(stderr, 102 | "%s %s %d, %s %d %s %s:\n>>> %s <<<\n\n", 103 | "HPL ERROR", 104 | "from process #", 105 | rank, 106 | "on line", 107 | LINE, 108 | "of function", 109 | SRNAME, 110 | cline); 111 | } 112 | } 113 | } 114 | -------------------------------------------------------------------------------- /src/pfact/HPL_pdfact.cpp: -------------------------------------------------------------------------------- 1 | /* --------------------------------------------------------------------- 2 | * -- High Performance Computing Linpack Benchmark (HPL) 3 | * HPL - 2.2 - February 24, 2016 4 | * Antoine P. Petitet 5 | * University of Tennessee, Knoxville 6 | * Innovative Computing Laboratory 7 | * (C) Copyright 2000-2008 All Rights Reserved 8 | * 9 | * Modified by: Noel Chalmers 10 | * (C) 2018-2025 Advanced Micro Devices, Inc. 11 | * See the rocHPL/LICENCE file for details. 12 | * 13 | * SPDX-License-Identifier: (BSD-3-Clause) 14 | * --------------------------------------------------------------------- 15 | */ 16 | 17 | #include "hpl.hpp" 18 | #include <hip/hip_runtime.h> 19 | 20 | void HPL_pdfact(HPL_T_panel* PANEL) { 21 | /* 22 | * Purpose 23 | * ======= 24 | * 25 | * HPL_pdfact recursively factorizes a 1-dimensional panel of columns. 26 | * The RPFACT function pointer specifies the recursive algorithm to be 27 | * used, either Crout, Left- or Right looking. NBMIN allows one to vary 28 | * the recursive stopping criterion in terms of the number of columns in 29 | * the panel, and NDIV allows one to specify the number of subpanels each 30 | * panel should be divided into. Usually a value of 2 will be chosen.
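 * (Illustration added editorially: with a panel of NB = 512 columns,
 * NDIV = 2 and NBMIN = 8, the recursion halves the panel --
 * 512, 256, 128, ..., 8 -- and each 8-column leaf subpanel is then
 * factored with the non-recursive algorithm described next.)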
Finally 31 | * PFACT is a function pointer specifying the non-recursive algorithm 32 | * to be used on at most NBMIN columns. One can also choose here between 33 | * Crout, Left- or Right looking. Empirical tests seem to indicate that 34 | * values of 4 or 8 for NBMIN give the best results. 35 | * 36 | * Bi-directional exchange is used to perform the swap::broadcast 37 | * operations at once for one column in the panel. This results in a 38 | * lower number of slightly larger messages than usual. On P processes 39 | * and assuming bi-directional links, the running time of this function 40 | * can be approximated by (when N is equal to N0): 41 | * 42 | * N0 * log_2( P ) * ( lat + ( 2*N0 + 4 ) / bdwth ) + 43 | * N0^2 * ( M - N0/3 ) * gam2-3 44 | * 45 | * where M is the local number of rows of the panel, lat and bdwth are 46 | * the latency and bandwidth of the network for double precision real 47 | * words, and gam2-3 is an estimate of the Level 2 and Level 3 BLAS 48 | * rate of execution. The recursive algorithm indeed allows the panel 49 | * factorization to almost achieve Level 3 BLAS performance. On a 50 | * large number of modern machines, this operation is however latency 51 | * bound, meaning that its cost can be estimated by only the latency 52 | * portion N0 * log_2(P) * lat. Mono-directional links will double this 53 | * communication cost. 54 | * 55 | * Arguments 56 | * ========= 57 | * 58 | * PANEL (local input/output) HPL_T_panel * 59 | * On entry, PANEL points to the data structure containing the 60 | * panel information. 61 | * 62 | * --------------------------------------------------------------------- 63 | */ 64 | 65 | int jb = PANEL->jb; 66 | PANEL->n -= jb; 67 | PANEL->ja += jb; 68 | 69 | if((PANEL->grid->mycol != PANEL->pcol) || (jb <= 0)) return; 70 | 71 | #ifdef HPL_DETAILED_TIMING 72 | HPL_ptimer(HPL_TIMING_RPFACT); 73 | #endif 74 | 75 | /*Copy current panel into workspace*/ 76 | HPL_dlacpy( 77 | PANEL->mp, PANEL->jb, PANEL->A, PANEL->lda, PANEL->A0, PANEL->lda0); 78 | 79 | /* 80 | * Factor the panel - Update the panel pointers 81 | */ 82 | HPL_TracingPush("pdfact"); 83 | 84 | hipStream_t stream; 85 | CHECK_ROCBLAS_ERROR(rocblas_get_stream(handle, &stream)); 86 | CHECK_HIP_ERROR(hipEventRecord(pfactStart, stream)); 87 | 88 | PANEL->algo->rffun(PANEL, PANEL->mp, jb, 0); 89 | 90 | CHECK_HIP_ERROR(hipEventRecord(pfactStop, stream)); 91 | 92 | HPL_TracingPop(); 93 | 94 | /*Copy L1 back into A*/ 95 | if(PANEL->grid->myrow == PANEL->prow) { 96 | if(PANEL->algo->L1notran) { 97 | HPL_dlacpy(jb, jb, PANEL->L1, jb, PANEL->A, PANEL->lda); 98 | } else { 99 | HPL_dlatcpy(jb, jb, PANEL->L1, jb, PANEL->A, PANEL->lda); 100 | } 101 | } 102 | 103 | PANEL->A = Mptr(PANEL->A, 0, jb, PANEL->lda); 104 | PANEL->nq -= jb; 105 | PANEL->jj += jb; 106 | 107 | #ifdef HPL_DETAILED_TIMING 108 | HPL_ptimer(HPL_TIMING_RPFACT); 109 | #endif 110 | } 111 | -------------------------------------------------------------------------------- /src/pfact/HPL_pdmxswp.cpp: -------------------------------------------------------------------------------- 1 | /* --------------------------------------------------------------------- 2 | * -- High Performance Computing Linpack Benchmark (HPL) 3 | * HPL - 2.2 - February 24, 2016 4 | * Antoine P. Petitet 5 | * University of Tennessee, Knoxville 6 | * Innovative Computing Laboratory 7 | * (C) Copyright 2000-2008 All Rights Reserved 8 | * 9 | * Modified by: Noel Chalmers 10 | * (C) 2018-2025 Advanced Micro Devices, Inc. 11 | * See the rocHPL/LICENCE file for details.
12 | * 13 | * SPDX-License-Identifier: (BSD-3-Clause) 14 | * --------------------------------------------------------------------- 15 | */ 16 | 17 | #include "hpl.hpp" 18 | 19 | void HPL_pdmxswp(HPL_T_panel* PANEL, 20 | const int M, 21 | const int II, 22 | const int JJ, 23 | double* WORK) { 24 | /* 25 | * Purpose 26 | * ======= 27 | * 28 | * HPL_pdmxswp swaps and broadcasts the absolute value max row using 29 | * bi-directional exchange. The buffer is partially set by HPL_dlocmax. 30 | * 31 | * Bi-directional exchange is used to perform the swap::broadcast 32 | * operations at once for one column in the panel. This results in a 33 | * lower number of slightly larger messages than usual. On P processes 34 | * and assuming bi-directional links, the running time of this function 35 | * can be approximated by 36 | * 37 | * log_2( P ) * ( lat + ( 2 * N0 + 4 ) / bdwth ) 38 | * 39 | * where lat and bdwth are the latency and bandwidth of the network for 40 | * double precision real elements. Communication only occurs in one 41 | * process column. Mono-directional links will cause the communication 42 | * cost to double. 43 | * 44 | * Arguments 45 | * ========= 46 | * 47 | * PANEL (local input/output) HPL_T_panel * 48 | * On entry, PANEL points to the data structure containing the 49 | * panel information. 50 | * 51 | * M (local input) const int 52 | * On entry, M specifies the local number of rows of the matrix 53 | * column on which this function operates. 54 | * 55 | * II (local input) const int 56 | * On entry, II specifies the row offset where the column to be 57 | * operated on starts with respect to the panel. 58 | * 59 | * JJ (local input) const int 60 | * On entry, JJ specifies the column offset where the column to 61 | * be operated on starts with respect to the panel. 62 | * 63 | * WORK (local workspace) double * 64 | * On entry, WORK is a workarray of size at least 2 * (4+2*N0). 65 | * It is assumed that HPL_dlocmax was called prior to this 66 | * routine to initialize the first four entries of this array. 67 | * On exit, the N0 length max row is stored in WORK[4:4+N0-1]; 68 | * Note that this is also the JJth row (or column) of L1. The 69 | * remaining part is used as a temporary array. 70 | * 71 | * --------------------------------------------------------------------- 72 | */ 73 | 74 | double* Wwork; 75 | HPL_T_grid* grid; 76 | MPI_Comm comm; 77 | int cnt0, icurrow, myrow, nprow; 78 | 79 | /* .. 80 | * .. Executable Statements .. 81 | */ 82 | #ifdef HPL_DETAILED_TIMING 83 | HPL_ptimer(HPL_TIMING_MXSWP); 84 | #endif 85 | grid = PANEL->grid; 86 | comm = grid->col_comm; 87 | myrow = grid->myrow; 88 | nprow = grid->nprow; 89 | int JB = PANEL->jb; 90 | int NB = PANEL->nb; 91 | icurrow = PANEL->prow; 92 | 93 | cnt0 = 4 + 2 * JB; 94 | Wwork = WORK + cnt0; 95 | 96 | if(M > 0) { 97 | int ilindx = static_cast<int>(WORK[1]); 98 | int kk = PANEL->ii + II + (ilindx); 99 | int igindx = 0; 100 | Mindxl2g(igindx, kk, NB, NB, myrow, 0, nprow); 101 | /* 102 | * WORK[0] := local maximum absolute value scalar, 103 | * WORK[1] := corresponding local row index, 104 | * WORK[2] := corresponding global row index, 105 | * WORK[3] := coordinate of process owning this max.
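 *
 * (Editorial note: entries WORK[4:3+JB] carry the JB-length candidate
 * pivot row itself, so this scalar header and the row travel together
 * in the single exchange performed by HPL_all_reduce_dmxswp below.)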
106 | */ 107 | WORK[2] = (double)(igindx); 108 | WORK[3] = (double)(myrow); 109 | 110 | } else { 111 | WORK[0] = WORK[1] = WORK[2] = HPL_rzero; 112 | WORK[3] = (double)(PANEL->grid->nprow); 113 | } 114 | 115 | /* Perform swap-broadcast */ 116 | HPL_all_reduce_dmxswp(WORK, cnt0, icurrow, comm, Wwork); 117 | 118 | /* 119 | * Save the global pivot index in pivot array 120 | */ 121 | (PANEL->ipiv)[JJ] = (int)WORK[2]; 122 | 123 | #ifdef HPL_DETAILED_TIMING 124 | HPL_ptimer(HPL_TIMING_MXSWP); 125 | #endif 126 | } 127 | -------------------------------------------------------------------------------- /src/pgesv/HPL_perm.cpp: -------------------------------------------------------------------------------- 1 | /* --------------------------------------------------------------------- 2 | * -- High Performance Computing Linpack Benchmark (HPL) 3 | * HPL - 2.2 - February 24, 2016 4 | * Antoine P. Petitet 5 | * University of Tennessee, Knoxville 6 | * Innovative Computing Laboratory 7 | * (C) Copyright 2000-2008 All Rights Reserved 8 | * 9 | * Modified by: Noel Chalmers 10 | * (C) 2018-2025 Advanced Micro Devices, Inc. 11 | * See the rocHPL/LICENCE file for details. 12 | * 13 | * SPDX-License-Identifier: (BSD-3-Clause) 14 | * --------------------------------------------------------------------- 15 | */ 16 | 17 | #include "hpl.hpp" 18 | 19 | void HPL_perm(const int N, int* LINDXA, int* LINDXAU, int* IWORK) { 20 | /* 21 | * Purpose 22 | * ======= 23 | * 24 | * HPL_perm combines two index arrays and generates the corresponding 25 | * permutation. First, this function computes the inverse of LINDXA and 26 | * then combines it with LINDXAU. Second, in order to be able to perform 27 | * the permutation in place, LINDXAU is overwritten by the sequence of 28 | * permutations producing the same result. What we ultimately want to 29 | * achieve is: U[LINDXAU[i]] := U[LINDXA[i]] for i in [0..N). After the 30 | * call to this function, this in-place permutation can be performed by: 31 | * for i in [0..N), swap U[i] with U[LINDXAU[i]]. 32 | * 33 | * Arguments 34 | * ========= 35 | * 36 | * N (global input) const int 37 | * On entry, N specifies the length of the arrays LINDXA and 38 | * LINDXAU. N should be at least zero. 39 | * 40 | * LINDXA (global input/output) int * 41 | * On entry, LINDXA is an array of dimension N containing the 42 | * source indexes. On exit, LINDXA contains the combined index 43 | * array. 44 | * 45 | * LINDXAU (global input/output) int * 46 | * On entry, LINDXAU is an array of dimension N containing the 47 | * target indexes. On exit, LINDXAU contains the sequence of 48 | * permutations that should be applied in increasing order to 49 | * permute the underlying array U in place. 50 | * 51 | * IWORK (workspace) int * 52 | * On entry, IWORK is a workarray of dimension N.
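 *
 * For illustration (editorial sketch; swap() is hypothetical): after
 * this routine returns, a caller applies the permutation in place with
 *
 *    for(i = 0; i < N; i++) swap(U[i], U[LINDXAU[i]]);
 *
 * which realizes U[LINDXAU[i]] := U[LINDXA[i]] for the original index
 * arrays.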
53 | * 54 | * --------------------------------------------------------------------- 55 | */ 56 | 57 | int i, j, k, fndd; 58 | 59 | /* 60 | * Inverse LINDXA - combine LINDXA and LINDXAU - Initialize IWORK 61 | */ 62 | for(i = 0; i < N; i++) { IWORK[LINDXA[i]] = i; } 63 | for(i = 0; i < N; i++) { 64 | LINDXA[i] = LINDXAU[IWORK[i]]; 65 | IWORK[i] = i; 66 | } 67 | 68 | for(i = 0; i < N; i++) { 69 | /* search LINDXA such that LINDXA[j] == i */ 70 | j = 0; 71 | do { 72 | fndd = (LINDXA[j] == i); 73 | j++; 74 | } while(!fndd); 75 | j--; 76 | /* search IWORK such that IWORK[k] == j */ 77 | k = 0; 78 | do { 79 | fndd = (IWORK[k] == j); 80 | k++; 81 | } while(!fndd); 82 | k--; 83 | /* swap IWORK[i] and IWORK[k]; LINDXAU[i] = k */ 84 | j = IWORK[i]; 85 | IWORK[i] = IWORK[k]; 86 | IWORK[k] = j; 87 | LINDXAU[i] = k; 88 | } 89 | } 90 | -------------------------------------------------------------------------------- /src/pgesv/HPL_piplen.cpp: -------------------------------------------------------------------------------- 1 | /* --------------------------------------------------------------------- 2 | * -- High Performance Computing Linpack Benchmark (HPL) 3 | * HPL - 2.2 - February 24, 2016 4 | * Antoine P. Petitet 5 | * University of Tennessee, Knoxville 6 | * Innovative Computing Laboratory 7 | * (C) Copyright 2000-2008 All Rights Reserved 8 | * 9 | * Modified by: Noel Chalmers 10 | * (C) 2018-2025 Advanced Micro Devices, Inc. 11 | * See the rocHPL/LICENCE file for details. 12 | * 13 | * SPDX-License-Identifier: (BSD-3-Clause) 14 | * --------------------------------------------------------------------- 15 | */ 16 | 17 | #include "hpl.hpp" 18 | 19 | void HPL_piplen(HPL_T_panel* PANEL, 20 | const int K, 21 | const int* IPID, 22 | int* IPLEN, 23 | int* IWORK) { 24 | 25 | const int nprow = PANEL->grid->nprow; 26 | const int jb = PANEL->jb; 27 | const int nb = PANEL->nb; 28 | const int ia = PANEL->ia; 29 | const int icurrow = PANEL->prow; 30 | 31 | int* iwork = IWORK + jb; 32 | 33 | /* 34 | * Compute IPLEN 35 | */ 36 | for(int i = 0; i <= nprow; i++) IPLEN[i] = 0; 37 | 38 | /* 39 | * IPLEN[i] is the number of rows of A in the processes before 40 | * process i, with the convention that IPLEN[nprow] is the total 41 | * number of rows. 42 | * In other words, IPLEN[i+1] - IPLEN[i] is the local number of 43 | * rows of A that should be moved for each process. 44 | */ 45 | for(int i = 0; i < K; i += 2) { 46 | const int src = IPID[i]; 47 | int srcrow; 48 | Mindxg2p(src, nb, nb, srcrow, 0, nprow); 49 | if(srcrow == icurrow) { 50 | const int dst = IPID[i + 1]; 51 | int dstrow; 52 | Mindxg2p(dst, nb, nb, dstrow, 0, nprow); 53 | if((dstrow != srcrow) || (dst - ia < jb)) IPLEN[dstrow + 1]++; 54 | } 55 | } 56 | 57 | for(int i = 1; i <= nprow; i++) { IPLEN[i] += IPLEN[i - 1]; } 58 | } 59 | -------------------------------------------------------------------------------- /src/timer/HPL_ptimer_cputime.cpp: -------------------------------------------------------------------------------- 1 | /* --------------------------------------------------------------------- 2 | * -- High Performance Computing Linpack Benchmark (HPL) 3 | * HPL - 2.2 - February 24, 2016 4 | * Antoine P. Petitet 5 | * University of Tennessee, Knoxville 6 | * Innovative Computing Laboratory 7 | * (C) Copyright 2000-2008 All Rights Reserved 8 | * 9 | * Modified by: Noel Chalmers 10 | * (C) 2018-2025 Advanced Micro Devices, Inc. 11 | * See the rocHPL/LICENCE file for details. 
12 | * 13 | * SPDX-License-Identifier: (BSD-3-Clause) 14 | * --------------------------------------------------------------------- 15 | */ 16 | 17 | #include "hpl.hpp" 18 | 19 | /* 20 | * Purpose 21 | * ======= 22 | * 23 | * HPL_ptimer_cputime returns the CPU time. 24 | * The clock() function is used to return an approximation of processor 25 | * time used by the program. The value returned is the CPU time used so 26 | * far as a clock_t; to get the number of seconds used, the result is 27 | * divided by CLOCKS_PER_SEC. This function is part of the ANSI/ISO C 28 | * standard library. 29 | * 30 | * --------------------------------------------------------------------- 31 | */ 32 | 33 | #include <time.h> 34 | 35 | double HPL_ptimer_cputime(void) { 36 | static double cps = CLOCKS_PER_SEC; 37 | double d; 38 | clock_t t1; 39 | static clock_t t0 = 0; 40 | 41 | if(t0 == 0) t0 = clock(); 42 | t1 = clock() - t0; 43 | d = (double)(t1) / cps; 44 | return (d); 45 | } 46 | -------------------------------------------------------------------------------- /src/timer/HPL_ptimer_walltime.cpp: -------------------------------------------------------------------------------- 1 | /* --------------------------------------------------------------------- 2 | * -- High Performance Computing Linpack Benchmark (HPL) 3 | * HPL - 2.2 - February 24, 2016 4 | * Antoine P. Petitet 5 | * University of Tennessee, Knoxville 6 | * Innovative Computing Laboratory 7 | * (C) Copyright 2000-2008 All Rights Reserved 8 | * 9 | * Modified by: Noel Chalmers 10 | * (C) 2018-2025 Advanced Micro Devices, Inc. 11 | * See the rocHPL/LICENCE file for details. 12 | * 13 | * SPDX-License-Identifier: (BSD-3-Clause) 14 | * --------------------------------------------------------------------- 15 | */ 16 | 17 | #include "hpl.hpp" 18 | 19 | /* 20 | * Purpose 21 | * ======= 22 | * 23 | * HPL_ptimer_walltime returns the elapsed (wall-clock) time. 24 | * 25 | * 26 | * --------------------------------------------------------------------- 27 | */ 28 | 29 | double HPL_ptimer_walltime(void) { return (MPI_Wtime()); } 30 | --------------------------------------------------------------------------------
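A closing editorial sketch, not part of the repository: the three index helpers in src/pauxil are mutually consistent, and a host-only round trip makes the block-cyclic mapping concrete. The stand-alone test file below is hypothetical; HPL_indxg2lp, HPL_indxg2p and HPL_indxl2g are the routines listed above, and linking against the library is assumed.

#include "hpl.hpp"
#include <cassert>

/* Hypothetical round-trip check: map every global index to its
 * (owner process, local index) pair and back again, for a 1-D
 * block-cyclic layout whose first block has a distinct size INB. */
int main(void) {
  const int INB = 2, NB = 3, SRCPROC = 0, NPROCS = 4;
  for(int ig = 0; ig < 64; ig++) {
    int il, proc;
    HPL_indxg2lp(&il, &proc, ig, INB, NB, SRCPROC, NPROCS);
    assert(proc == HPL_indxg2p(ig, INB, NB, SRCPROC, NPROCS));
    assert(ig == HPL_indxl2g(il, INB, NB, proc, SRCPROC, NPROCS));
  }
  return 0;
}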