├── .clang-format
├── .github
│   └── CODEOWNERS
├── .gitignore
├── CMakeLists.txt
├── LICENSE
├── README.md
├── cmake
│   └── Dependencies.cmake
├── include
│   ├── hpl.hpp
│   ├── hpl_auxil.hpp
│   ├── hpl_blas.hpp
│   ├── hpl_comm.hpp
│   ├── hpl_grid.hpp
│   ├── hpl_hip_ex.hpp
│   ├── hpl_misc.hpp
│   ├── hpl_panel.hpp
│   ├── hpl_pauxil.hpp
│   ├── hpl_pfact.hpp
│   ├── hpl_pgesv.hpp
│   ├── hpl_pmatgen.hpp
│   ├── hpl_pmisc.hpp
│   ├── hpl_ptest.hpp
│   ├── hpl_ptimer.hpp
│   └── hpl_version.hpp.in
├── install.sh
├── scripts
│   ├── HPL.dat
│   ├── mpirun_rochpl.in
│   └── run_rochpl.in
└── src
    ├── HPL_InitGPU.cpp
    ├── HPL_pddriver.cpp
    ├── HPL_pdinfo.cpp
    ├── HPL_pdtest.cpp
    ├── HPL_pdwarmup.cpp
    ├── auxil
    │   ├── HPL_abort.cpp
    │   ├── HPL_dlacpy_device.cpp
    │   ├── HPL_dlamch.cpp
    │   ├── HPL_dlaprnt.cpp
    │   ├── HPL_dlatcpy_device.cpp
    │   ├── HPL_fprintf.cpp
    │   └── HPL_warn.cpp
    ├── comm
    │   ├── HPL_all_reduce.cpp
    │   ├── HPL_all_reduce_dmxswp.cpp
    │   ├── HPL_allgatherv.cpp
    │   ├── HPL_barrier.cpp
    │   ├── HPL_bcast.cpp
    │   ├── HPL_bcast_1rinM.cpp
    │   ├── HPL_bcast_1ring.cpp
    │   ├── HPL_bcast_2rinM.cpp
    │   ├── HPL_bcast_2ring.cpp
    │   ├── HPL_bcast_blonM.cpp
    │   ├── HPL_bcast_blong.cpp
    │   ├── HPL_broadcast.cpp
    │   ├── HPL_recv.cpp
    │   ├── HPL_reduce.cpp
    │   ├── HPL_scatterv.cpp
    │   ├── HPL_sdrv.cpp
    │   └── HPL_send.cpp
    ├── grid
    │   ├── HPL_grid_exit.cpp
    │   ├── HPL_grid_info.cpp
    │   └── HPL_grid_init.cpp
    ├── matgen
    │   ├── HPL_pdmatgen.cpp
    │   ├── HPL_pdrandmat_device.cpp
    │   └── HPL_xjumpm.cpp
    ├── panel
    │   ├── HPL_pdpanel_bcast.cpp
    │   ├── HPL_pdpanel_free.cpp
    │   ├── HPL_pdpanel_init.cpp
    │   ├── HPL_pdpanel_new.cpp
    │   ├── HPL_pdpanel_swapids.cpp
    │   └── HPL_pdpanel_wait.cpp
    ├── pauxil
    │   ├── HPL_dlaswp00N_device.cpp
    │   ├── HPL_dlaswp01T_device.cpp
    │   ├── HPL_dlaswp02T_device.cpp
    │   ├── HPL_dlaswp03T_device.cpp
    │   ├── HPL_dlaswp04T_device.cpp
    │   ├── HPL_dlaswp10N_device.cpp
    │   ├── HPL_indxg2l.cpp
    │   ├── HPL_indxg2lp.cpp
    │   ├── HPL_indxg2p.cpp
    │   ├── HPL_indxl2g.cpp
    │   ├── HPL_infog2l.cpp
    │   ├── HPL_numroc.cpp
    │   ├── HPL_numrocI.cpp
    │   ├── HPL_pabort.cpp
    │   ├── HPL_pdlamch.cpp
    │   ├── HPL_pdlange_device.cpp
    │   └── HPL_pwarn.cpp
    ├── pfact
    │   ├── HPL_pdfact.cpp
    │   ├── HPL_pdmxswp.cpp
    │   ├── HPL_pdpancrN_device.cpp
    │   ├── HPL_pdpancrT_device.cpp
    │   ├── HPL_pdpanllN_device.cpp
    │   ├── HPL_pdpanllT_device.cpp
    │   ├── HPL_pdpanrlN_device.cpp
    │   ├── HPL_pdpanrlT_device.cpp
    │   ├── HPL_pdrpancrN.cpp
    │   ├── HPL_pdrpancrT.cpp
    │   ├── HPL_pdrpanllN.cpp
    │   ├── HPL_pdrpanllT.cpp
    │   ├── HPL_pdrpanrlN.cpp
    │   └── HPL_pdrpanrlT.cpp
    ├── pgesv
    │   ├── HPL_pdgesv.cpp
    │   ├── HPL_pdlaswp.cpp
    │   ├── HPL_pdtrsv_device.cpp
    │   ├── HPL_pdupdateNT.cpp
    │   ├── HPL_pdupdateTT.cpp
    │   ├── HPL_perm.cpp
    │   ├── HPL_pipid.cpp
    │   ├── HPL_piplen.cpp
    │   └── HPL_plindx.cpp
    └── timer
        ├── HPL_ptimer.cpp
        ├── HPL_ptimer_cputime.cpp
        └── HPL_ptimer_walltime.cpp
/.clang-format:
--------------------------------------------------------------------------------
1 | ---
2 | Language: Cpp
3 | AccessModifierOffset: 0
4 | AlignAfterOpenBracket: Align
5 | AlignConsecutiveAssignments: true
6 | AlignConsecutiveDeclarations: true
7 | AlignEscapedNewlinesLeft: true
8 | AlignOperands: true
9 | AlignTrailingComments: true
10 | AllowAllParametersOfDeclarationOnNextLine: false
11 | AllowShortBlocksOnASingleLine: true
12 | AllowShortCaseLabelsOnASingleLine: true
13 | AllowShortFunctionsOnASingleLine: true
14 | AllowShortIfStatementsOnASingleLine: true
15 | AllowShortLoopsOnASingleLine: true
16 | AlwaysBreakAfterDefinitionReturnType: None
17 | AlwaysBreakAfterReturnType: None
18 | AlwaysBreakBeforeMultilineStrings: false
19 | AlwaysBreakTemplateDeclarations: true
20 | BinPackArguments: false
21 | BinPackParameters: false
22 | BraceWrapping:
23 | AfterClass: false
24 | AfterControlStatement: false
25 | AfterEnum: false
26 | AfterFunction: false
27 | AfterNamespace: false
28 | AfterObjCDeclaration:
false 29 | AfterStruct: false 30 | AfterUnion: false 31 | BeforeCatch: false 32 | BeforeElse: false 33 | IndentBraces: false 34 | BreakBeforeBinaryOperators: None 35 | BreakBeforeBraces: Custom 36 | BreakBeforeTernaryOperators: true 37 | BreakConstructorInitializersBeforeComma: false 38 | ColumnLimit: 80 39 | CommentPragmas: '^ IWYU pragma:' 40 | ConstructorInitializerAllOnOneLineOrOnePerLine: true 41 | ConstructorInitializerIndentWidth: 4 42 | ContinuationIndentWidth: 4 43 | Cpp11BracedListStyle: true 44 | DerivePointerAlignment: false 45 | DisableFormat: false 46 | ExperimentalAutoDetectBinPacking: false 47 | ForEachMacros: [ foreach, Q_FOREACH, BOOST_FOREACH ] 48 | IncludeCategories: 49 | - Regex: '^"(llvm|llvm-c|clang|clang-c)/' 50 | Priority: 2 51 | - Regex: '^(<|"(gtest|isl|json)/)' 52 | Priority: 3 53 | - Regex: '.*' 54 | Priority: 1 55 | IndentCaseLabels: true 56 | IndentWidth: 2 57 | IndentWrappedFunctionNames: false 58 | KeepEmptyLinesAtTheStartOfBlocks: true 59 | MacroBlockBegin: '' 60 | MacroBlockEnd: '' 61 | MaxEmptyLinesToKeep: 1 62 | NamespaceIndentation: None 63 | ObjCBlockIndentWidth: 2 64 | ObjCSpaceAfterProperty: false 65 | ObjCSpaceBeforeProtocolList: true 66 | PenaltyBreakBeforeFirstCallParameter: 19 67 | PenaltyBreakComment: 300 68 | PenaltyBreakFirstLessLess: 120 69 | PenaltyBreakString: 1000 70 | PenaltyExcessCharacter: 1000000 71 | PenaltyReturnTypeOnItsOwnLine: 1000 72 | PointerAlignment: Left 73 | ReflowComments: true 74 | SortIncludes: false 75 | SpaceAfterCStyleCast: false 76 | # SpaceAfterTemplateKeyword: true 77 | SpaceBeforeAssignmentOperators: true 78 | SpaceBeforeParens: Never 79 | SpaceInEmptyParentheses: false 80 | SpacesBeforeTrailingComments: 1 81 | SpacesInAngles: false 82 | SpacesInContainerLiterals: true 83 | SpacesInCStyleCastParentheses: false 84 | SpacesInParentheses: false 85 | SpacesInSquareBrackets: false 86 | Standard: Cpp11 87 | TabWidth: 2 88 | UseTab: Never 89 | ... 90 | 91 | -------------------------------------------------------------------------------- /.github/CODEOWNERS: -------------------------------------------------------------------------------- 1 | * @noelchalmers @pbauman 2 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Compiled Object files 2 | *.slo 3 | *.lo 4 | *.o 5 | *.obj 6 | 7 | # Precompiled Headers 8 | *.gch 9 | *.pch 10 | 11 | # Compiled Dynamic libraries 12 | *.so 13 | *.dylib 14 | *.dll 15 | 16 | # Fortran module files 17 | *.mod 18 | 19 | # Compiled Static libraries 20 | *.lai 21 | *.la 22 | *.a 23 | *.lib 24 | 25 | # Executables 26 | *.exe 27 | *.out 28 | *.app 29 | 30 | # vim tags 31 | tags 32 | .tags 33 | .*.swp 34 | 35 | # Editors 36 | .vscode 37 | 38 | # build-in-source directory 39 | build 40 | 41 | # doc directory 42 | docBin 43 | _build 44 | 45 | #third-party software 46 | tpl/ 47 | ltmain.sh -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | ====================================================================== 2 | -- High Performance Computing Linpack Benchmark (HPL) 3 | HPL - 2.2 - February 24, 2016 4 | Antoine P. 
Petitet 5 | University of Tennessee, Knoxville 6 | Innovative Computing Laboratory 7 | (C) Copyright 2000-2008 All Rights Reserved 8 | 9 | -- Copyright notice and Licensing terms: 10 | 11 | Redistribution and use in source and binary forms, with or without 12 | modification, are permitted provided that the following conditions 13 | are met: 14 | 15 | 1. Redistributions of source code must retain the above copyright 16 | notice, this list of conditions and the following disclaimer. 17 | 18 | 2. Redistributions in binary form must reproduce the above copyright 19 | notice, this list of conditions, and the following disclaimer in the 20 | documentation and/or other materials provided with the distribution. 21 | 22 | 3. All advertising materials mentioning features or use of this 23 | software must display the following acknowledgement: 24 | This product includes software developed at the University of 25 | Tennessee, Knoxville, Innovative Computing Laboratory. 26 | 27 | 4. The name of the University, the name of the Laboratory, or the 28 | names of its contributors may not be used to endorse or promote 29 | products derived from this software without specific written 30 | permission. 31 | 32 | -- Disclaimer: 33 | 34 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 35 | ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 36 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 37 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY 38 | OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 39 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 40 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 41 | DATA OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 42 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 43 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 44 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 45 | ====================================================================== 46 | 47 | Modifications (c) 2018-2025 Advanced Micro Devices, Inc. 48 | Modified by: Noel Chalmers 49 | 50 | Redistribution and use in source and binary forms, with or without modification, 51 | are permitted provided that the following conditions are met: 52 | 53 | 1. Redistributions of source code must retain the above copyright notice, this 54 | list of conditions and the following disclaimer. 55 | 2. Redistributions in binary form must reproduce the above copyright notice, 56 | this list of conditions and the following disclaimer in the documentation 57 | and/or other materials provided with the distribution. 58 | 3. Neither the name of the copyright holder nor the names of its contributors 59 | may be used to endorse or promote products derived from this software without 60 | specific prior written permission. 61 | 62 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 63 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 64 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
65 | IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, 66 | INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 67 | BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, 68 | OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, 69 | WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 70 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 71 | POSSIBILITY OF SUCH DAMAGE. 72 | -------------------------------------------------------------------------------- /cmake/Dependencies.cmake: -------------------------------------------------------------------------------- 1 | # Modifications (c) 2019-2022 Advanced Micro Devices, Inc. 2 | # 3 | # Redistribution and use in source and binary forms, with or without modification, 4 | # are permitted provided that the following conditions are met: 5 | # 6 | # 1. Redistributions of source code must retain the above copyright notice, this 7 | # list of conditions and the following disclaimer. 8 | # 2. Redistributions in binary form must reproduce the above copyright notice, 9 | # this list of conditions and the following disclaimer in the documentation 10 | # and/or other materials provided with the distribution. 11 | # 3. Neither the name of the copyright holder nor the names of its contributors 12 | # may be used to endorse or promote products derived from this software without 13 | # specific prior written permission. 14 | # 15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 16 | # ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 17 | # WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 18 | # IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, 19 | # INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 20 | # BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, 21 | # OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, 22 | # WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 23 | # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 24 | # POSSIBILITY OF SUCH DAMAGE. 
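# A minimal configure sketch for the dependency lookup below; the paths are
# illustrative assumptions, not requirements. ROCM_PATH, ROCBLAS_PATH,
# HPL_MPI_DIR, and HPL_TRACING are the cache variables this file reads:
#
#   cmake -DROCM_PATH=/opt/rocm -DROCBLAS_PATH=/opt/rocm \
#         -DHPL_MPI_DIR=/usr/lib/openmpi -DHPL_TRACING=OFF ..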
25 | 26 | # Dependencies 27 | 28 | # Git 29 | find_package(Git REQUIRED) 30 | 31 | # Find OpenMP package 32 | find_package(OpenMP REQUIRED) 33 | 34 | # MPI 35 | set(MPI_HOME ${HPL_MPI_DIR}) 36 | find_package(MPI REQUIRED) 37 | 38 | # Add some paths 39 | list(APPEND CMAKE_PREFIX_PATH ${ROCBLAS_PATH} ${ROCM_PATH} ) 40 | list(APPEND CMAKE_MODULE_PATH ${ROCM_PATH}/lib/cmake/hip ) 41 | 42 | # Find HIP package 43 | find_package(HIP REQUIRED) 44 | 45 | # rocblas 46 | find_package(rocblas REQUIRED) 47 | 48 | get_target_property(rocblas_LIBRARIES roc::rocblas IMPORTED_LOCATION_RELEASE) 49 | 50 | message("-- rocBLAS version: ${rocblas_VERSION}") 51 | message("-- rocBLAS include dirs: ${rocblas_INCLUDE_DIRS}") 52 | message("-- rocBLAS libraries: ${rocblas_LIBRARIES}") 53 | 54 | get_filename_component(ROCBLAS_LIB_PATH ${rocblas_LIBRARIES} DIRECTORY) 55 | 56 | if(HPL_TRACING) 57 | find_library(ROCTRACER NAMES roctracer64 58 | PATHS ${ROCM_PATH}/lib 59 | NO_DEFAULT_PATH) 60 | find_library(ROCTX NAMES roctx64 61 | PATHS ${ROCM_PATH}/lib 62 | NO_DEFAULT_PATH) 63 | 64 | message("-- roctracer: ${ROCTRACER}") 65 | message("-- roctx: ${ROCTX}") 66 | 67 | add_library(roc::roctracer SHARED IMPORTED) 68 | set_target_properties(roc::roctracer PROPERTIES 69 | INTERFACE_INCLUDE_DIRECTORIES "${ROCM_PATH}/include" 70 | INTERFACE_LINK_LIBRARIES "hip::host" 71 | IMPORTED_LOCATION "${ROCTRACER}" 72 | IMPORTED_SONAME "libroctracer.so") 73 | add_library(roc::roctx SHARED IMPORTED) 74 | set_target_properties(roc::roctx PROPERTIES 75 | INTERFACE_INCLUDE_DIRECTORIES "${ROCM_PATH}/include" 76 | INTERFACE_LINK_LIBRARIES "hip::host" 77 | IMPORTED_LOCATION "${ROCTX}" 78 | IMPORTED_SONAME "libroctx64.so") 79 | endif() 80 | 81 | # ROCm cmake package 82 | find_package(ROCmCMakeBuildTools QUIET CONFIG PATHS ${CMAKE_PREFIX_PATH}) 83 | if(NOT ROCM_FOUND) 84 | set(PROJECT_EXTERN_DIR ${CMAKE_CURRENT_BINARY_DIR}/extern) 85 | set(rocm_cmake_tag "master" CACHE STRING "rocm-cmake tag to download") 86 | file(DOWNLOAD https://github.com/RadeonOpenCompute/rocm-cmake/archive/${rocm_cmake_tag}.zip 87 | ${PROJECT_EXTERN_DIR}/rocm-cmake-${rocm_cmake_tag}.zip STATUS status LOG log) 88 | 89 | list(GET status 0 status_code) 90 | list(GET status 1 status_string) 91 | 92 | if(NOT status_code EQUAL 0) 93 | message(FATAL_ERROR "error: downloading 94 | 'https://github.com/RadeonOpenCompute/rocm-cmake/archive/${rocm_cmake_tag}.zip' failed 95 | status_code: ${status_code} 96 | status_string: ${status_string} 97 | log: ${log} 98 | ") 99 | endif() 100 | 101 | execute_process(COMMAND ${CMAKE_COMMAND} -E tar xzf ${PROJECT_EXTERN_DIR}/rocm-cmake-${rocm_cmake_tag}.zip 102 | WORKING_DIRECTORY ${PROJECT_EXTERN_DIR}) 103 | 104 | find_package(ROCmCMakeBuildTools REQUIRED CONFIG PATHS ${PROJECT_EXTERN_DIR}/rocm-cmake-${rocm_cmake_tag}) 105 | endif() 106 | 107 | include(ROCMSetupVersion) 108 | include(ROCMCreatePackage) 109 | include(ROCMInstallTargets) 110 | include(ROCMPackageConfigHelpers) 111 | include(ROCMInstallSymlinks) 112 | include(ROCMCheckTargetIds OPTIONAL) 113 | -------------------------------------------------------------------------------- /include/hpl.hpp: -------------------------------------------------------------------------------- 1 | /* --------------------------------------------------------------------- 2 | * -- High Performance Computing Linpack Benchmark (HPL) 3 | * HPL - 2.2 - February 24, 2016 4 | * Antoine P. 
Petitet
5 | * University of Tennessee, Knoxville
6 | * Innovative Computing Laboratory
7 | * (C) Copyright 2000-2008 All Rights Reserved
8 | *
9 | * Modified by: Noel Chalmers
10 | * (C) 2018-2025 Advanced Micro Devices, Inc.
11 | * See the rocHPL/LICENCE file for details.
12 | *
13 | * SPDX-License-Identifier: (BSD-3-Clause)
14 | * ---------------------------------------------------------------------
15 | */
16 | #ifndef HPL_HPP
17 | #define HPL_HPP
18 | /*
19 | * ---------------------------------------------------------------------
20 | * HPL default compile options that can be overridden in the CMake configuration
21 | * ---------------------------------------------------------------------
22 | */
23 | #ifndef HPL_DETAILED_TIMING /* Do not enable detailed timings */
24 | #define HPL_NO_DETAILED_TIMING
25 | #endif
26 |
27 | #undef HPL_USE_COLLECTIVES
28 | // #define HPL_USE_COLLECTIVES
29 |
30 | /*
31 | Enabling atomics will potentially allow more performance optimization,
32 | but may also lead to residual values which vary from run to run
33 | */
34 | #undef HPL_ROCBLAS_ALLOW_ATOMICS
35 | // #define HPL_ROCBLAS_ALLOW_ATOMICS
36 |
37 | /*
38 | * ---------------------------------------------------------------------
39 | * Include files
40 | * ---------------------------------------------------------------------
41 | */
42 | #include
43 |
44 | // NC: hipcc in ROCm 3.7 complains if __HIP_PLATFORM_HCC__ is defined in the
45 | // compile line
46 | #ifdef __HIPCC__
47 | #ifdef __HIP_PLATFORM_HCC__
48 | #undef __HIP_PLATFORM_HCC__
49 | #endif
50 | #endif
51 | #include "hip/hip_runtime_api.h"
52 |
53 | #ifdef HPL_TRACING
54 | #include
55 | #include
56 | #endif
57 |
58 | #include "hpl_version.hpp"
59 | #include "hpl_misc.hpp"
60 | #include "hpl_blas.hpp"
61 | #include "hpl_auxil.hpp"
62 |
63 | #include "hpl_pmisc.hpp"
64 | #include "hpl_pauxil.hpp"
65 | #include "hpl_panel.hpp"
66 | #include "hpl_pfact.hpp"
67 | #include "hpl_pgesv.hpp"
68 |
69 | #include "hpl_ptimer.hpp"
70 | #include "hpl_pmatgen.hpp"
71 | #include "hpl_ptest.hpp"
72 |
73 | #endif
74 | /*
75 | * End of hpl.hpp
76 | */
77 |
--------------------------------------------------------------------------------
/include/hpl_auxil.hpp:
--------------------------------------------------------------------------------
1 | /* ---------------------------------------------------------------------
2 | * -- High Performance Computing Linpack Benchmark (HPL)
3 | * HPL - 2.2 - February 24, 2016
4 | * Antoine P. Petitet
5 | * University of Tennessee, Knoxville
6 | * Innovative Computing Laboratory
7 | * (C) Copyright 2000-2008 All Rights Reserved
8 | *
9 | * Modified by: Noel Chalmers
10 | * (C) 2018-2025 Advanced Micro Devices, Inc.
11 | * See the rocHPL/LICENCE file for details.
12 | * 13 | * SPDX-License-Identifier: (BSD-3-Clause) 14 | * --------------------------------------------------------------------- 15 | */ 16 | #ifndef HPL_AUXIL_HPP 17 | #define HPL_AUXIL_HPP 18 | /* 19 | * --------------------------------------------------------------------- 20 | * Include files 21 | * --------------------------------------------------------------------- 22 | */ 23 | #include "hpl_misc.hpp" 24 | #include "hpl_blas.hpp" 25 | /* 26 | * --------------------------------------------------------------------- 27 | * typedef definitions 28 | * --------------------------------------------------------------------- 29 | */ 30 | typedef enum { 31 | HPL_NORM_A = 800, 32 | HPL_NORM_1 = 801, 33 | HPL_NORM_I = 802 34 | } HPL_T_NORM; 35 | 36 | typedef enum { 37 | HPL_MACH_EPS = 900, /* relative machine precision */ 38 | HPL_MACH_SFMIN = 901, /* safe minimum st 1/sfmin does not overflow */ 39 | HPL_MACH_BASE = 902, /* base = base of the machine */ 40 | HPL_MACH_PREC = 903, /* prec = eps*base */ 41 | HPL_MACH_MLEN = 904, /* number of (base) digits in the mantissa */ 42 | HPL_MACH_RND = 905, /* 1.0 if rounding occurs in addition */ 43 | HPL_MACH_EMIN = 906, /* min exponent before (gradual) underflow */ 44 | HPL_MACH_RMIN = 907, /* underflow threshold base**(emin-1) */ 45 | HPL_MACH_EMAX = 908, /* largest exponent before overflow */ 46 | HPL_MACH_RMAX = 909 /* overflow threshold - (base**emax)*(1-eps) */ 47 | 48 | } HPL_T_MACH; 49 | /* 50 | * --------------------------------------------------------------------- 51 | * Function prototypes 52 | * --------------------------------------------------------------------- 53 | */ 54 | void HPL_fprintf(FILE*, const char*, ...); 55 | void HPL_warn(FILE*, int, const char*, const char*, ...); 56 | void HPL_abort(int, const char*, const char*, ...); 57 | 58 | void HPL_dlacpy(const int, 59 | const int, 60 | const double*, 61 | const int, 62 | double*, 63 | const int); 64 | 65 | void HPL_dlatcpy(const int, 66 | const int, 67 | const double*, 68 | const int, 69 | double*, 70 | const int); 71 | 72 | double HPL_dlamch(const HPL_T_MACH); 73 | 74 | #endif 75 | /* 76 | * End of hpl_auxil.hpp 77 | */ 78 | -------------------------------------------------------------------------------- /include/hpl_blas.hpp: -------------------------------------------------------------------------------- 1 | /* --------------------------------------------------------------------- 2 | * -- High Performance Computing Linpack Benchmark (HPL) 3 | * HPL - 2.2 - February 24, 2016 4 | * Antoine P. Petitet 5 | * University of Tennessee, Knoxville 6 | * Innovative Computing Laboratory 7 | * (C) Copyright 2000-2008 All Rights Reserved 8 | * 9 | * Modified by: Noel Chalmers 10 | * (C) 2018-2025 Advanced Micro Devices, Inc. 11 | * See the rocHPL/LICENCE file for details. 
12 | * 13 | * SPDX-License-Identifier: (BSD-3-Clause) 14 | * --------------------------------------------------------------------- 15 | */ 16 | #ifndef HPL_BLAS_HPP 17 | #define HPL_BLAS_HPP 18 | /* 19 | * --------------------------------------------------------------------- 20 | * Include files 21 | * --------------------------------------------------------------------- 22 | */ 23 | 24 | #include "hpl_misc.hpp" 25 | #include 26 | #include 27 | 28 | extern rocblas_handle handle; 29 | extern hipStream_t computeStream; 30 | extern hipStream_t dataStream; 31 | 32 | #define CHECK_HIP_ERROR(val) hipCheck((val), #val, __FILE__, __LINE__) 33 | inline void hipCheck(hipError_t err, 34 | const char* const func, 35 | const char* const file, 36 | const int line) { 37 | if(err != hipSuccess) { 38 | std::cerr << "HIP Runtime Error at: " << file << ":" << line << std::endl; 39 | std::cerr << hipGetErrorString(err) << " " << func << std::endl; 40 | std::exit(-1); 41 | } 42 | } 43 | 44 | #define CHECK_ROCBLAS_ERROR(val) rocBLASCheck((val), #val, __FILE__, __LINE__) 45 | inline void rocBLASCheck(rocblas_status err, 46 | const char* const func, 47 | const char* const file, 48 | const int line) { 49 | if(err != rocblas_status_success) { 50 | std::cerr << "rocBLAS Reports Error at: " << file << ":" << line 51 | << std::endl; 52 | std::cerr << rocblas_status_to_string(err) << " " << func << std::endl; 53 | std::exit(-1); 54 | } 55 | } 56 | 57 | #endif 58 | /* 59 | * hpl_blas.hpp 60 | */ 61 | -------------------------------------------------------------------------------- /include/hpl_comm.hpp: -------------------------------------------------------------------------------- 1 | /* --------------------------------------------------------------------- 2 | * -- High Performance Computing Linpack Benchmark (HPL) 3 | * HPL - 2.2 - February 24, 2016 4 | * Antoine P. Petitet 5 | * University of Tennessee, Knoxville 6 | * Innovative Computing Laboratory 7 | * (C) Copyright 2000-2008 All Rights Reserved 8 | * 9 | * Modified by: Noel Chalmers 10 | * (C) 2018-2025 Advanced Micro Devices, Inc. 11 | * See the rocHPL/LICENCE file for details. 
12 | * 13 | * SPDX-License-Identifier: (BSD-3-Clause) 14 | * --------------------------------------------------------------------- 15 | */ 16 | #ifndef HPL_COMM_HPP 17 | #define HPL_COMM_HPP 18 | /* 19 | * --------------------------------------------------------------------- 20 | * Include files 21 | * --------------------------------------------------------------------- 22 | */ 23 | #include "hpl_pmisc.hpp" 24 | #include "hpl_panel.hpp" 25 | 26 | /* 27 | * --------------------------------------------------------------------- 28 | * #typedefs and data structures 29 | * --------------------------------------------------------------------- 30 | */ 31 | typedef enum { 32 | HPL_1RING = 401, /* Unidirectional ring */ 33 | HPL_1RING_M = 402, /* Unidirectional ring (modified) */ 34 | HPL_2RING = 403, /* Bidirectional ring */ 35 | HPL_2RING_M = 404, /* Bidirectional ring (modified) */ 36 | HPL_BLONG = 405, /* long broadcast */ 37 | HPL_BLONG_M = 406, /* long broadcast (modified) */ 38 | } HPL_T_TOP; 39 | 40 | typedef MPI_Op HPL_T_OP; 41 | 42 | #define HPL_SUM MPI_SUM 43 | #define HPL_MAX MPI_MAX 44 | #define HPL_MIN MPI_MIN 45 | 46 | extern MPI_Op HPL_DMXSWP; 47 | extern MPI_Datatype PDFACT_ROW; 48 | /* 49 | * --------------------------------------------------------------------- 50 | * #define macro constants 51 | * --------------------------------------------------------------------- 52 | */ 53 | #define HPL_FAILURE 0 54 | #define HPL_SUCCESS 1 55 | /* 56 | * --------------------------------------------------------------------- 57 | * comm function prototypes 58 | * --------------------------------------------------------------------- 59 | */ 60 | int HPL_send(double*, int, int, int, MPI_Comm); 61 | int HPL_recv(double*, int, int, int, MPI_Comm); 62 | int HPL_sdrv(double*, int, int, double*, int, int, int, MPI_Comm); 63 | int HPL_bcast(double*, int, int, MPI_Comm, HPL_T_TOP top); 64 | int HPL_bcast_1ring(double* SBUF, int SCOUNT, int ROOT, MPI_Comm COMM); 65 | int HPL_bcast_1rinM(double* SBUF, int SCOUNT, int ROOT, MPI_Comm COMM); 66 | int HPL_bcast_2ring(double* SBUF, int SCOUNT, int ROOT, MPI_Comm COMM); 67 | int HPL_bcast_2rinM(double* SBUF, int SCOUNT, int ROOT, MPI_Comm COMM); 68 | int HPL_bcast_blong(double* SBUF, int SCOUNT, int ROOT, MPI_Comm COMM); 69 | int HPL_bcast_blonM(double* SBUF, int SCOUNT, int ROOT, MPI_Comm COMM); 70 | int HPL_scatterv(double*, const int*, const int*, const int, int, MPI_Comm); 71 | int HPL_allgatherv(double*, const int, const int*, const int*, MPI_Comm); 72 | int HPL_barrier(MPI_Comm); 73 | int HPL_broadcast(void*, const int, const HPL_T_TYPE, const int, MPI_Comm); 74 | 75 | int HPL_reduce(void*, 76 | const int, 77 | const HPL_T_TYPE, 78 | const HPL_T_OP, 79 | const int, 80 | MPI_Comm); 81 | 82 | int HPL_all_reduce(void*, 83 | const int, 84 | const HPL_T_TYPE, 85 | const HPL_T_OP, 86 | MPI_Comm); 87 | 88 | void HPL_dmxswp(void*, void*, int*, MPI_Datatype*); 89 | void HPL_all_reduce_dmxswp(double*, const int, const int, MPI_Comm, double*); 90 | 91 | #endif 92 | /* 93 | * End of hpl_comm.hpp 94 | */ 95 | -------------------------------------------------------------------------------- /include/hpl_grid.hpp: -------------------------------------------------------------------------------- 1 | /* --------------------------------------------------------------------- 2 | * -- High Performance Computing Linpack Benchmark (HPL) 3 | * HPL - 2.2 - February 24, 2016 4 | * Antoine P. 
Petitet 5 | * University of Tennessee, Knoxville 6 | * Innovative Computing Laboratory 7 | * (C) Copyright 2000-2008 All Rights Reserved 8 | * 9 | * Modified by: Noel Chalmers 10 | * (C) 2018-2025 Advanced Micro Devices, Inc. 11 | * See the rocHPL/LICENCE file for details. 12 | * 13 | * SPDX-License-Identifier: (BSD-3-Clause) 14 | * --------------------------------------------------------------------- 15 | */ 16 | #ifndef HPL_GRID_H 17 | #define HPL_GRID_H 18 | /* 19 | * --------------------------------------------------------------------- 20 | * Include files 21 | * --------------------------------------------------------------------- 22 | */ 23 | #include "hpl_pmisc.hpp" 24 | 25 | /* 26 | * --------------------------------------------------------------------- 27 | * #typedefs and data structures 28 | * --------------------------------------------------------------------- 29 | */ 30 | typedef enum { HPL_INT = 100, HPL_DOUBLE = 101 } HPL_T_TYPE; 31 | 32 | typedef enum { HPL_ROW_MAJOR = 201, HPL_COLUMN_MAJOR = 202 } HPL_T_ORDER; 33 | 34 | typedef struct HPL_S_grid { 35 | MPI_Comm all_comm; /* grid communicator */ 36 | MPI_Comm row_comm; /* row communicator */ 37 | MPI_Comm col_comm; /* column communicator */ 38 | HPL_T_ORDER order; /* ordering of the procs in the grid */ 39 | int iam; /* my rank in the grid */ 40 | int myrow; /* my row number in the grid */ 41 | int mycol; /* my column number in the grid */ 42 | int nprow; /* the total # of rows in the grid */ 43 | int npcol; /* the total # of columns in the grid */ 44 | int local_myrow; /* my row number in the node-local grid */ 45 | int local_mycol; /* my column number in the node-local grid */ 46 | int local_nprow; /* the total # of rows in the node-local grid */ 47 | int local_npcol; /* the total # of columns in the node-local grid */ 48 | int nprocs; /* the total # of procs in the grid */ 49 | int row_ip2; /* largest power of two <= nprow */ 50 | int row_hdim; /* row_ip2 procs hypercube dimension */ 51 | int row_ip2m1; /* largest power of two <= nprow-1 */ 52 | int row_mask; /* row_ip2m1 procs hypercube mask */ 53 | int col_ip2; /* largest power of two <= npcol */ 54 | int col_hdim; /* col_ip2 procs hypercube dimension */ 55 | int col_ip2m1; /* largest power of two <= npcol-1 */ 56 | int col_mask; /* col_ip2m1 procs hypercube mask */ 57 | } HPL_T_grid; 58 | 59 | /* 60 | * --------------------------------------------------------------------- 61 | * #define macros definitions 62 | * --------------------------------------------------------------------- 63 | */ 64 | #define HPL_2_MPI_TYPE(typ) ((typ == HPL_INT ? MPI_INT : MPI_DOUBLE)) 65 | /* 66 | * The following macros perform common modulo operations; All functions 67 | * except MPosMod assume arguments are < d (i.e., arguments are themsel- 68 | * ves within modulo range). 69 | */ 70 | /* increment with mod */ 71 | #define MModInc(I, d) \ 72 | if(++(I) == (d)) (I) = 0 73 | /* decrement with mod */ 74 | #define MModDec(I, d) \ 75 | if(--(I) == -1) (I) = (d) - 1 76 | /* positive modulo */ 77 | #define MPosMod(I, d) ((I) - ((I) / (d)) * (d)) 78 | /* add two numbers */ 79 | #define MModAdd(I1, I2, d) \ 80 | (((I1) + (I2) < (d)) ? (I1) + (I2) : (I1) + (I2) - (d)) 81 | /* add 1 to # */ 82 | #define MModAdd1(I, d) (((I) != (d) - 1) ? (I) + 1 : 0) 83 | /* subtract two numbers */ 84 | #define MModSub(I1, I2, d) (((I1) < (I2)) ? (d) + (I1) - (I2) : (I1) - (I2)) 85 | /* sub 1 from # */ 86 | #define MModSub1(I, d) (((I) != 0) ? 
(I) - 1 : (d) - 1) 87 | /* 88 | * --------------------------------------------------------------------- 89 | * grid function prototypes 90 | * --------------------------------------------------------------------- 91 | */ 92 | int HPL_grid_init(MPI_Comm, 93 | const HPL_T_ORDER, 94 | const int, 95 | const int, 96 | const int, 97 | const int, 98 | HPL_T_grid*); 99 | 100 | int HPL_grid_exit(HPL_T_grid*); 101 | int HPL_grid_info(const HPL_T_grid*, int*, int*, int*, int*); 102 | 103 | #endif 104 | /* 105 | * End of hpl_grid.hpp 106 | */ 107 | -------------------------------------------------------------------------------- /include/hpl_misc.hpp: -------------------------------------------------------------------------------- 1 | /* --------------------------------------------------------------------- 2 | * -- High Performance Computing Linpack Benchmark (HPL) 3 | * HPL - 2.2 - February 24, 2016 4 | * Antoine P. Petitet 5 | * University of Tennessee, Knoxville 6 | * Innovative Computing Laboratory 7 | * (C) Copyright 2000-2008 All Rights Reserved 8 | * 9 | * Modified by: Noel Chalmers 10 | * (C) 2018-2025 Advanced Micro Devices, Inc. 11 | * See the rocHPL/LICENCE file for details. 12 | * 13 | * SPDX-License-Identifier: (BSD-3-Clause) 14 | * --------------------------------------------------------------------- 15 | */ 16 | #ifndef HPL_MISC_HPP 17 | #define HPL_MISC_HPP 18 | /* 19 | * --------------------------------------------------------------------- 20 | * Include files 21 | * --------------------------------------------------------------------- 22 | */ 23 | 24 | #include 25 | #include 26 | #include 27 | #include 28 | #include 29 | 30 | /* 31 | * --------------------------------------------------------------------- 32 | * #define macro constants 33 | * --------------------------------------------------------------------- 34 | */ 35 | #define HPL_rone 1.0 36 | #define HPL_rtwo 2.0 37 | #define HPL_rzero 0.0 38 | /* 39 | * --------------------------------------------------------------------- 40 | * #define macros definitions 41 | * --------------------------------------------------------------------- 42 | */ 43 | #define Mabs(a_) (((a_) < 0) ? -(a_) : (a_)) 44 | #define Mmin(a_, b_) (((a_) < (b_)) ? (a_) : (b_)) 45 | #define Mmax(a_, b_) (((a_) > (b_)) ? (a_) : (b_)) 46 | 47 | #define Mfloor(a, b) (((a) > 0) ? (((a) / (b))) : (-(((-(a)) + (b) - 1) / (b)))) 48 | #define Mceil(a, b) (((a) + (b) - 1) / (b)) 49 | #define Miceil(a, b) (((a) > 0) ? ((((a) + (b) - 1) / (b))) : (-((-(a)) / (b)))) 50 | 51 | #define Mupcase(C) (((C) > 96 && (C) < 123) ? (C) & 0xDF : (C)) 52 | #define Mlowcase(C) (((C) > 64 && (C) < 91) ? (C) | 32 : (C)) 53 | /* 54 | * Mptr returns a pointer to a_( i_, j_ ) for readability reasons and 55 | * also less silly errors ... 
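* For example, for a column-major array A with leading dimension lda,
* Mptr(A, i, j, lda) evaluates to A + (size_t)i + (size_t)j * (size_t)lda,
* i.e., the address of entry (i, j); the size_t casts keep the index
* arithmetic from overflowing a 32-bit int for large problem sizes.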
56 | */ 57 | #define Mptr(a_, i_, j_, lda_) \ 58 | ((a_) + (size_t)(i_) + (size_t)(j_) * (size_t)(lda_)) 59 | /* 60 | * Align pointer 61 | */ 62 | #define HPL_PTR(ptr_, al_) ((((size_t)(ptr_) + (al_) - 1) / (al_)) * (al_)) 63 | #endif 64 | 65 | #ifdef HPL_TRACING 66 | #define HPL_TracingPush(label) roctxRangePush(label) 67 | #define HPL_TracingPop(label) roctxRangePop() 68 | #else 69 | #define HPL_TracingPush(label) 70 | #define HPL_TracingPop(label) 71 | #endif 72 | /* 73 | * End of hpl_misc.hpp 74 | */ 75 | -------------------------------------------------------------------------------- /include/hpl_panel.hpp: -------------------------------------------------------------------------------- 1 | /* --------------------------------------------------------------------- 2 | * -- High Performance Computing Linpack Benchmark (HPL) 3 | * HPL - 2.2 - February 24, 2016 4 | * Antoine P. Petitet 5 | * University of Tennessee, Knoxville 6 | * Innovative Computing Laboratory 7 | * (C) Copyright 2000-2008 All Rights Reserved 8 | * 9 | * Modified by: Noel Chalmers 10 | * (C) 2018-2025 Advanced Micro Devices, Inc. 11 | * See the rocHPL/LICENCE file for details. 12 | * 13 | * SPDX-License-Identifier: (BSD-3-Clause) 14 | * --------------------------------------------------------------------- 15 | */ 16 | #ifndef HPL_PANEL_HPP 17 | #define HPL_PANEL_HPP 18 | /* 19 | * --------------------------------------------------------------------- 20 | * Include files 21 | * --------------------------------------------------------------------- 22 | */ 23 | #include "hpl_pmisc.hpp" 24 | #include "hpl_grid.hpp" 25 | 26 | /* 27 | * --------------------------------------------------------------------- 28 | * Data Structures 29 | * --------------------------------------------------------------------- 30 | */ 31 | typedef struct HPL_S_panel { 32 | struct HPL_S_grid* grid; /* ptr to the process grid */ 33 | struct HPL_S_palg* algo; /* ptr to the algo parameters */ 34 | struct HPL_S_pmat* pmat; /* ptr to the local array info */ 35 | double* A; /* ptr to trailing part of A */ 36 | double* A0; /* ptr to current panel of A */ 37 | double* L2; /* ptr to L */ 38 | double* L1; /* ptr to jb x jb upper block of A */ 39 | double* U0; /* ptr to U */ 40 | double* U1; /* ptr to U1 */ 41 | double* U2; /* ptr to U2 */ 42 | int* IWORK; /* integer workspace for swapping */ 43 | int* ipiv; 44 | int* dipiv; 45 | int nu0; 46 | int nu1; 47 | int nu2; 48 | int ldu0; 49 | int ldu1; 50 | int ldu2; 51 | int lda0; /* local leading dim of array A0 */ 52 | int ldl2; /* local leading dim of array L2 */ 53 | int len; /* length of the buffer to broadcast */ 54 | void* buffers[2]; /* buffers for panel bcast */ 55 | int counts[2]; /* counts for panel bcast */ 56 | MPI_Datatype dtypes[2]; /* data types for panel bcast */ 57 | MPI_Request request[1]; /* requests for panel bcast */ 58 | MPI_Status status[1]; /* status for panel bcast */ 59 | int nb; /* distribution blocking factor */ 60 | int jb; /* panel width */ 61 | int m; /* global # of rows of trailing part of A */ 62 | int n; /* global # of cols of trailing part of A */ 63 | int ia; /* global row index of trailing part of A */ 64 | int ja; /* global col index of trailing part of A */ 65 | int mp; /* local # of rows of trailing part of A */ 66 | int nq; /* local # of cols of trailing part of A */ 67 | int ii; /* local row index of trailing part of A */ 68 | int jj; /* local col index of trailing part of A */ 69 | int lda; /* local leading dim of array A */ 70 | int prow; /* proc. 
row owning 1st row of trail. A */ 71 | int pcol; /* proc. col owning 1st col of trail. A */ 72 | int msgid; /* message id for panel bcast */ 73 | } HPL_T_panel; 74 | 75 | /* 76 | * --------------------------------------------------------------------- 77 | * panel function prototypes 78 | * --------------------------------------------------------------------- 79 | */ 80 | #include "hpl_pgesv.hpp" 81 | 82 | typedef struct HPL_S_test HPL_T_test; 83 | 84 | int HPL_pdpanel_new(HPL_T_test*, 85 | HPL_T_grid*, 86 | HPL_T_palg*, 87 | HPL_T_pmat*, 88 | HPL_T_panel*, 89 | size_t&); 90 | 91 | void HPL_pdpanel_init(HPL_T_grid*, 92 | HPL_T_palg*, 93 | const int, 94 | const int, 95 | const int, 96 | HPL_T_pmat*, 97 | const int, 98 | const int, 99 | const int, 100 | HPL_T_panel*); 101 | 102 | int HPL_pdpanel_free(HPL_T_panel*); 103 | void HPL_pdpanel_SendToHost(HPL_T_panel*); 104 | void HPL_pdpanel_SendToDevice(HPL_T_panel*); 105 | void HPL_pdpanel_swapids(HPL_T_panel* PANEL); 106 | void HPL_pdpanel_copyL1(HPL_T_panel* PANEL); 107 | void HPL_pdpanel_Wait(HPL_T_panel* PANEL); 108 | int HPL_pdpanel_bcast(HPL_T_panel*); 109 | #endif 110 | /* 111 | * End of hpl_panel.hpp 112 | */ 113 | -------------------------------------------------------------------------------- /include/hpl_pfact.hpp: -------------------------------------------------------------------------------- 1 | /* --------------------------------------------------------------------- 2 | * -- High Performance Computing Linpack Benchmark (HPL) 3 | * HPL - 2.2 - February 24, 2016 4 | * Antoine P. Petitet 5 | * University of Tennessee, Knoxville 6 | * Innovative Computing Laboratory 7 | * (C) Copyright 2000-2008 All Rights Reserved 8 | * 9 | * Modified by: Noel Chalmers 10 | * (C) 2018-2025 Advanced Micro Devices, Inc. 11 | * See the rocHPL/LICENCE file for details. 
12 | * 13 | * SPDX-License-Identifier: (BSD-3-Clause) 14 | * --------------------------------------------------------------------- 15 | */ 16 | #ifndef HPL_PFACT_HPP 17 | #define HPL_PFACT_HPP 18 | /* 19 | * --------------------------------------------------------------------- 20 | * Include files 21 | * --------------------------------------------------------------------- 22 | */ 23 | #include "hpl_misc.hpp" 24 | #include "hpl_blas.hpp" 25 | 26 | #include "hpl_pgesv.hpp" 27 | #include "hpl_pmisc.hpp" 28 | #include "hpl_pauxil.hpp" 29 | #include "hpl_panel.hpp" 30 | 31 | /* 32 | * --------------------------------------------------------------------- 33 | * #typedefs and data structures 34 | * --------------------------------------------------------------------- 35 | */ 36 | typedef void (*HPL_T_PFA_FUN)(HPL_T_panel*, const int, const int, const int); 37 | 38 | typedef void (*HPL_T_RFA_FUN)(HPL_T_panel*, const int, const int, const int); 39 | /* 40 | * --------------------------------------------------------------------- 41 | * Function prototypes 42 | * --------------------------------------------------------------------- 43 | */ 44 | void HPL_pdmxswp(HPL_T_panel*, const int, const int, const int, double*); 45 | 46 | void HPL_pdpancrN(HPL_T_panel*, const int, const int, const int); 47 | 48 | void HPL_pdpancrT(HPL_T_panel*, const int, const int, const int); 49 | 50 | void HPL_pdpanllN(HPL_T_panel*, const int, const int, const int); 51 | 52 | void HPL_pdpanllT(HPL_T_panel*, const int, const int, const int); 53 | 54 | void HPL_pdpanrlN(HPL_T_panel*, const int, const int, const int); 55 | 56 | void HPL_pdpanrlT(HPL_T_panel*, const int, const int, const int); 57 | 58 | void HPL_pdrpancrN(HPL_T_panel*, const int, const int, const int); 59 | 60 | void HPL_pdrpancrT(HPL_T_panel*, const int, const int, const int); 61 | 62 | void HPL_pdrpanllN(HPL_T_panel*, const int, const int, const int); 63 | 64 | void HPL_pdrpanllT(HPL_T_panel*, const int, const int, const int); 65 | 66 | void HPL_pdrpanrlN(HPL_T_panel*, const int, const int, const int); 67 | 68 | void HPL_pdrpanrlT(HPL_T_panel*, const int, const int, const int); 69 | 70 | void HPL_pdfact(HPL_T_panel*); 71 | 72 | #endif 73 | /* 74 | * End of hpl_pfact.hpp 75 | */ 76 | -------------------------------------------------------------------------------- /include/hpl_pmatgen.hpp: -------------------------------------------------------------------------------- 1 | /* --------------------------------------------------------------------- 2 | * -- High Performance Computing Linpack Benchmark (HPL) 3 | * HPL - 2.2 - February 24, 2016 4 | * Antoine P. Petitet 5 | * University of Tennessee, Knoxville 6 | * Innovative Computing Laboratory 7 | * (C) Copyright 2000-2008 All Rights Reserved 8 | * 9 | * Modified by: Noel Chalmers 10 | * (C) 2018-2025 Advanced Micro Devices, Inc. 11 | * See the rocHPL/LICENCE file for details. 
12 | * 13 | * SPDX-License-Identifier: (BSD-3-Clause) 14 | * --------------------------------------------------------------------- 15 | */ 16 | #ifndef HPL_PMATGEN_HPP 17 | #define HPL_PMATGEN_HPP 18 | /* 19 | * --------------------------------------------------------------------- 20 | * Include files 21 | * --------------------------------------------------------------------- 22 | */ 23 | #include "hpl_misc.hpp" 24 | 25 | #include "hpl_pmisc.hpp" 26 | #include "hpl_pauxil.hpp" 27 | #include "hpl_pgesv.hpp" 28 | #include "hpl_ptest.hpp" 29 | 30 | /* 31 | * --------------------------------------------------------------------- 32 | * #define macro constants 33 | * --------------------------------------------------------------------- 34 | */ 35 | #define HPL_MULT 6364136223846793005UL 36 | #define HPL_IADD 1UL 37 | #define HPL_DIVFAC 2147483648.0 38 | #define HPL_POW16 65536.0 39 | #define HPL_HALF 0.5 40 | /* 41 | * --------------------------------------------------------------------- 42 | * Function prototypes 43 | * --------------------------------------------------------------------- 44 | */ 45 | void HPL_xjumpm(const int JUMPM, 46 | const uint64_t MULT, 47 | const uint64_t IADD, 48 | const uint64_t IRANN, 49 | uint64_t& IRANM, 50 | uint64_t& IAM, 51 | uint64_t& ICM); 52 | 53 | void HPL_pdrandmat(const HPL_T_grid*, 54 | const int, 55 | const int, 56 | const int, 57 | double*, 58 | const int, 59 | const int); 60 | 61 | int HPL_pdmatgen(HPL_T_test*, 62 | HPL_T_grid*, 63 | HPL_T_palg*, 64 | HPL_T_pmat*, 65 | const int, 66 | const int); 67 | 68 | void HPL_pdmatfree(HPL_T_pmat*); 69 | 70 | #endif 71 | /* 72 | * End of hpl_pmatgen.hpp 73 | */ 74 | -------------------------------------------------------------------------------- /include/hpl_pmisc.hpp: -------------------------------------------------------------------------------- 1 | /* --------------------------------------------------------------------- 2 | * -- High Performance Computing Linpack Benchmark (HPL) 3 | * HPL - 2.2 - February 24, 2016 4 | * Antoine P. Petitet 5 | * University of Tennessee, Knoxville 6 | * Innovative Computing Laboratory 7 | * (C) Copyright 2000-2008 All Rights Reserved 8 | * 9 | * Modified by: Noel Chalmers 10 | * (C) 2018-2025 Advanced Micro Devices, Inc. 11 | * See the rocHPL/LICENCE file for details. 12 | * 13 | * SPDX-License-Identifier: (BSD-3-Clause) 14 | * --------------------------------------------------------------------- 15 | */ 16 | #ifndef HPL_PMISC_HPP 17 | #define HPL_PMISC_HPP 18 | /* 19 | * --------------------------------------------------------------------- 20 | * Include files 21 | * --------------------------------------------------------------------- 22 | */ 23 | #include "hpl_misc.hpp" 24 | #include "mpi.h" 25 | 26 | #endif 27 | /* 28 | * End of hpl_pmisc.hpp 29 | */ 30 | -------------------------------------------------------------------------------- /include/hpl_ptest.hpp: -------------------------------------------------------------------------------- 1 | /* --------------------------------------------------------------------- 2 | * -- High Performance Computing Linpack Benchmark (HPL) 3 | * HPL - 2.2 - February 24, 2016 4 | * Antoine P. Petitet 5 | * University of Tennessee, Knoxville 6 | * Innovative Computing Laboratory 7 | * (C) Copyright 2000-2008 All Rights Reserved 8 | * 9 | * Modified by: Noel Chalmers 10 | * (C) 2018-2025 Advanced Micro Devices, Inc. 11 | * See the rocHPL/LICENCE file for details. 
12 | * 13 | * SPDX-License-Identifier: (BSD-3-Clause) 14 | * --------------------------------------------------------------------- 15 | */ 16 | #ifndef HPL_PTEST_HPP 17 | #define HPL_PTEST_HPP 18 | /* 19 | * --------------------------------------------------------------------- 20 | * Include files 21 | * --------------------------------------------------------------------- 22 | */ 23 | #include "hpl_misc.hpp" 24 | #include "hpl_blas.hpp" 25 | #include "hpl_auxil.hpp" 26 | 27 | #include "hpl_pmisc.hpp" 28 | #include "hpl_pauxil.hpp" 29 | #include "hpl_panel.hpp" 30 | #include "hpl_pgesv.hpp" 31 | 32 | #include "hpl_ptimer.hpp" 33 | #include "hpl_pmatgen.hpp" 34 | 35 | /* 36 | * --------------------------------------------------------------------- 37 | * Data Structures 38 | * --------------------------------------------------------------------- 39 | */ 40 | typedef struct HPL_S_test { 41 | double epsil; /* epsilon machine */ 42 | double thrsh; /* threshold */ 43 | FILE* outfp; /* output stream (only in proc 0) */ 44 | int kfail; /* # of tests failed */ 45 | int kpass; /* # of tests passed */ 46 | int kskip; /* # of tests skipped */ 47 | int ktest; /* total number of tests */ 48 | } HPL_T_test; 49 | 50 | /* 51 | * --------------------------------------------------------------------- 52 | * #define macro constants for testing only 53 | * --------------------------------------------------------------------- 54 | */ 55 | #define HPL_LINE_MAX 256 56 | #define HPL_MAX_PARAM 20 57 | #define HPL_ISEED 100 58 | /* 59 | * --------------------------------------------------------------------- 60 | * global timers for timing analysis only 61 | * --------------------------------------------------------------------- 62 | */ 63 | #define HPL_TIMING_BEG 11 /* timer 0 reserved, used by main */ 64 | #define HPL_TIMING_N 8 /* number of timers defined below */ 65 | #define HPL_TIMING_RPFACT 11 /* starting from here, contiguous */ 66 | #define HPL_TIMING_PFACT 12 67 | #define HPL_TIMING_MXSWP 13 68 | #define HPL_TIMING_COPY 14 69 | #define HPL_TIMING_LBCAST 15 70 | #define HPL_TIMING_LASWP 16 71 | #define HPL_TIMING_UPDATE 17 72 | #define HPL_TIMING_PTRSV 18 73 | /* 74 | * --------------------------------------------------------------------- 75 | * Function prototypes 76 | * --------------------------------------------------------------------- 77 | */ 78 | void HPL_pdinfo(int ARGC, 79 | char** ARGV, 80 | HPL_T_test*, 81 | int*, 82 | int*, 83 | int*, 84 | int*, 85 | HPL_T_ORDER*, 86 | int*, 87 | int*, 88 | int*, 89 | int*, 90 | int*, 91 | int*, 92 | HPL_T_FACT*, 93 | int*, 94 | int*, 95 | int*, 96 | int*, 97 | int*, 98 | HPL_T_FACT*, 99 | int*, 100 | HPL_T_TOP*, 101 | int*, 102 | int*, 103 | HPL_T_SWAP*, 104 | int*, 105 | int*, 106 | int*, 107 | int*, 108 | int*, 109 | double*, 110 | int*); 111 | 112 | int HPL_pdwarmup(HPL_T_test* TEST, 113 | HPL_T_grid* GRID, 114 | HPL_T_palg* ALGO, 115 | HPL_T_pmat* mat); 116 | 117 | void HPL_pdtest(HPL_T_test*, HPL_T_grid*, HPL_T_palg*, const int, const int); 118 | void HPL_InitGPU(const HPL_T_grid* GRID); 119 | void HPL_FreeGPU(); 120 | 121 | #endif 122 | /* 123 | * End of hpl_ptest.hpp 124 | */ 125 | -------------------------------------------------------------------------------- /include/hpl_ptimer.hpp: -------------------------------------------------------------------------------- 1 | /* --------------------------------------------------------------------- 2 | * -- High Performance Computing Linpack Benchmark (HPL) 3 | * HPL - 2.2 - February 24, 2016 4 | * Antoine P. 
Petitet 5 | * University of Tennessee, Knoxville 6 | * Innovative Computing Laboratory 7 | * (C) Copyright 2000-2008 All Rights Reserved 8 | * 9 | * Modified by: Noel Chalmers 10 | * (C) 2018-2025 Advanced Micro Devices, Inc. 11 | * See the rocHPL/LICENCE file for details. 12 | * 13 | * SPDX-License-Identifier: (BSD-3-Clause) 14 | * --------------------------------------------------------------------- 15 | */ 16 | #ifndef HPL_PTIMER_HPP 17 | #define HPL_PTIMER_HPP 18 | /* 19 | * --------------------------------------------------------------------- 20 | * Include files 21 | * --------------------------------------------------------------------- 22 | */ 23 | #include "hpl_pmisc.hpp" 24 | 25 | /* 26 | * --------------------------------------------------------------------- 27 | * #define macro constants 28 | * --------------------------------------------------------------------- 29 | */ 30 | #define HPL_NPTIMER 64 31 | #define HPL_PTIMER_STARTFLAG 5.0 32 | #define HPL_PTIMER_ERROR -1.0 33 | /* 34 | * --------------------------------------------------------------------- 35 | * type definitions 36 | * --------------------------------------------------------------------- 37 | */ 38 | typedef enum { HPL_WALL_PTIME = 101, HPL_CPU_PTIME = 102 } HPL_T_PTIME; 39 | 40 | typedef enum { 41 | HPL_AMAX_PTIME = 201, 42 | HPL_AMIN_PTIME = 202, 43 | HPL_SUM_PTIME = 203 44 | } HPL_T_PTIME_OP; 45 | /* 46 | * --------------------------------------------------------------------- 47 | * Function prototypes 48 | * --------------------------------------------------------------------- 49 | */ 50 | double HPL_ptimer_cputime(void); 51 | double HPL_ptimer_walltime(void); 52 | void HPL_ptimer(const int); 53 | void HPL_ptimer_boot(void); 54 | 55 | void HPL_ptimer_combine(MPI_Comm comm, 56 | const HPL_T_PTIME_OP, 57 | const HPL_T_PTIME, 58 | const int, 59 | const int, 60 | double*); 61 | 62 | void HPL_ptimer_disable(void); 63 | void HPL_ptimer_enable(void); 64 | double HPL_ptimer_inquire(const HPL_T_PTIME, const int); 65 | void HPL_ptimer_stepReset(const int, const int); 66 | double HPL_ptimer_getStep(const int); 67 | 68 | #endif 69 | /* 70 | * End of hpl_ptimer.hpp 71 | */ 72 | -------------------------------------------------------------------------------- /include/hpl_version.hpp.in: -------------------------------------------------------------------------------- 1 | /* --------------------------------------------------------------------- 2 | * -- High Performance Computing Linpack Benchmark (HPL) 3 | * Noel Chalmers 4 | * (C) 2018-2025 Advanced Micro Devices, Inc. 5 | * See the rocHPL/LICENCE file for details. 
6 | *
7 | * SPDX-License-Identifier: (BSD-3-Clause)
8 | * ---------------------------------------------------------------------
9 | */
10 |
11 | #ifndef HPL_VERSION_HPP
12 | #define HPL_VERSION_HPP
13 |
14 | // clang-format off
15 | #define __ROCHPL_VER_MAJOR @rochpl_VERSION_MAJOR@
16 | #define __ROCHPL_VER_MINOR @rochpl_VERSION_MINOR@
17 | #define __ROCHPL_VER_PATCH @rochpl_VERSION_PATCH@
18 | #define __ROCHPL_VER_TWEAK @rochpl_VERSION_TWEAK@
19 | // clang-format on
20 |
21 | #define __ROCHPL_VER \
22 | 10000 * __ROCHPL_VER_MAJOR + 100 * __ROCHPL_VER_MINOR + __ROCHPL_VER_PATCH
23 |
24 | #endif // VERSION_HPP
25 |
--------------------------------------------------------------------------------
/scripts/HPL.dat:
--------------------------------------------------------------------------------
1 | HPLinpack benchmark input file
2 | Innovative Computing Laboratory, University of Tennessee
3 | HPL.out output file name (if any)
4 | 0 device out (6=stdout,7=stderr,file)
5 | 1 # of problem sizes (N)
6 | 45312 Ns
7 | 1 # of NBs
8 | 384 NBs
9 | 1 PMAP process mapping (0=Row-,1=Column-major)
10 | 1 # of process grids (P x Q)
11 | 1 Ps
12 | 1 Qs
13 | 16.0 threshold
14 | 1 # of panel fact
15 | 2 PFACTs (0=left, 1=Crout, 2=Right)
16 | 1 # of recursive stopping criteria
17 | 32 NBMINs (>= 1)
18 | 1 # of panels in recursion
19 | 2 NDIVs
20 | 1 # of recursive panel fact.
21 | 2 RFACTs (0=left, 1=Crout, 2=Right)
22 | 1 # of broadcast
23 | 0 BCASTs (0=1rg,1=1rM,2=2rg,3=2rM,4=Lng,5=LnM)
24 | 1 # of lookahead depth
25 | 1 DEPTHs (>=0)
26 | 1 SWAP (0=bin-exch,1=long,2=mix)
27 | 64 swapping threshold
28 | 0 L1 in (0=transposed,1=no-transposed) form
29 | 0 U in (0=transposed,1=no-transposed) form
30 | 0 Equilibration (0=no,1=yes)
31 | 8 memory alignment in double (> 0)
32 |
--------------------------------------------------------------------------------
/src/HPL_InitGPU.cpp:
--------------------------------------------------------------------------------
1 | /* ---------------------------------------------------------------------
2 | * -- High Performance Computing Linpack Benchmark (HPL)
3 | * Noel Chalmers
4 | * (C) 2018-2025 Advanced Micro Devices, Inc.
5 | * See the rocHPL/LICENCE file for details.
6 | *
7 | * SPDX-License-Identifier: (BSD-3-Clause)
8 | * ---------------------------------------------------------------------
9 | */
10 |
11 | #include "hpl.hpp"
12 | #include
13 |
14 | rocblas_handle handle;
15 |
16 | hipStream_t computeStream, dataStream;
17 |
18 | hipEvent_t swapStartEvent[HPL_N_UPD], update[HPL_N_UPD];
19 | hipEvent_t dgemmStart[HPL_N_UPD], dgemmStop[HPL_N_UPD];
20 | hipEvent_t pfactStart, pfactStop;
21 |
22 | static char host_name[MPI_MAX_PROCESSOR_NAME];
23 |
24 | /*
25 | This function finds out how many MPI processes are running on the same node
26 | and assigns a local rank that can be used to map a process to a device.
27 | This function needs to be called by all the MPI processes.
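As a concrete illustration (assuming, say, 8 ranks per node on a node with
8 visible devices): each rank computes localRank = rank % localSize below and
is pinned to device localRank % deviceCount, so node-local ranks 0..7 land
on devices 0..7.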
28 | */ 29 | void HPL_InitGPU(const HPL_T_grid* GRID) { 30 | char host_name[MPI_MAX_PROCESSOR_NAME]; 31 | 32 | int i, n, namelen, rank, nprocs; 33 | int dev; 34 | 35 | int nprow, npcol, myrow, mycol; 36 | (void)HPL_grid_info(GRID, &nprow, &npcol, &myrow, &mycol); 37 | 38 | MPI_Comm_rank(MPI_COMM_WORLD, &rank); 39 | MPI_Comm_size(MPI_COMM_WORLD, &nprocs); 40 | 41 | MPI_Get_processor_name(host_name, &namelen); 42 | 43 | int localSize = GRID->local_npcol * GRID->local_nprow; 44 | int localRank = rank % localSize; 45 | 46 | /* Find out how many GPUs are in the system and their device number */ 47 | int deviceCount; 48 | CHECK_HIP_ERROR(hipGetDeviceCount(&deviceCount)); 49 | 50 | if(deviceCount < 1) { 51 | if(localRank == 0) 52 | HPL_pwarn(stderr, 53 | __LINE__, 54 | "HPL_InitGPU", 55 | "Node %s found no GPUs. Is the ROCm kernel module loaded?", 56 | host_name); 57 | MPI_Finalize(); 58 | exit(1); 59 | } 60 | 61 | dev = localRank % deviceCount; 62 | 63 | /* Assign device to MPI process, initialize BLAS and probe device properties 64 | */ 65 | CHECK_HIP_ERROR(hipSetDevice(dev)); 66 | 67 | CHECK_HIP_ERROR(hipStreamCreate(&computeStream)); 68 | CHECK_HIP_ERROR(hipStreamCreate(&dataStream)); 69 | 70 | CHECK_HIP_ERROR(hipEventCreate(swapStartEvent + HPL_LOOK_AHEAD)); 71 | CHECK_HIP_ERROR(hipEventCreate(swapStartEvent + HPL_UPD_1)); 72 | CHECK_HIP_ERROR(hipEventCreate(swapStartEvent + HPL_UPD_2)); 73 | 74 | CHECK_HIP_ERROR(hipEventCreate(update + HPL_LOOK_AHEAD)); 75 | CHECK_HIP_ERROR(hipEventCreate(update + HPL_UPD_1)); 76 | CHECK_HIP_ERROR(hipEventCreate(update + HPL_UPD_2)); 77 | 78 | CHECK_HIP_ERROR(hipEventCreate(dgemmStart + HPL_LOOK_AHEAD)); 79 | CHECK_HIP_ERROR(hipEventCreate(dgemmStart + HPL_UPD_1)); 80 | CHECK_HIP_ERROR(hipEventCreate(dgemmStart + HPL_UPD_2)); 81 | 82 | CHECK_HIP_ERROR(hipEventCreate(dgemmStop + HPL_LOOK_AHEAD)); 83 | CHECK_HIP_ERROR(hipEventCreate(dgemmStop + HPL_UPD_1)); 84 | CHECK_HIP_ERROR(hipEventCreate(dgemmStop + HPL_UPD_2)); 85 | 86 | CHECK_HIP_ERROR(hipEventCreate(&pfactStart)); 87 | CHECK_HIP_ERROR(hipEventCreate(&pfactStop)); 88 | 89 | /* Create a rocBLAS handle */ 90 | CHECK_ROCBLAS_ERROR(rocblas_create_handle(&handle)); 91 | CHECK_ROCBLAS_ERROR( 92 | rocblas_set_pointer_mode(handle, rocblas_pointer_mode_host)); 93 | CHECK_ROCBLAS_ERROR(rocblas_set_stream(handle, computeStream)); 94 | 95 | rocblas_initialize(); 96 | 97 | #ifdef HPL_ROCBLAS_ALLOW_ATOMICS 98 | CHECK_ROCBLAS_ERROR( 99 | rocblas_set_atomics_mode(handle, rocblas_atomics_allowed)); 100 | #else 101 | CHECK_ROCBLAS_ERROR( 102 | rocblas_set_atomics_mode(handle, rocblas_atomics_not_allowed)); 103 | #endif 104 | } 105 | 106 | void HPL_FreeGPU() { 107 | CHECK_ROCBLAS_ERROR(rocblas_destroy_handle(handle)); 108 | 109 | CHECK_HIP_ERROR(hipEventDestroy(swapStartEvent[HPL_LOOK_AHEAD])); 110 | CHECK_HIP_ERROR(hipEventDestroy(swapStartEvent[HPL_UPD_1])); 111 | CHECK_HIP_ERROR(hipEventDestroy(swapStartEvent[HPL_UPD_2])); 112 | 113 | CHECK_HIP_ERROR(hipEventDestroy(update[HPL_LOOK_AHEAD])); 114 | CHECK_HIP_ERROR(hipEventDestroy(update[HPL_UPD_1])); 115 | CHECK_HIP_ERROR(hipEventDestroy(update[HPL_UPD_2])); 116 | 117 | CHECK_HIP_ERROR(hipEventDestroy(dgemmStart[HPL_LOOK_AHEAD])); 118 | CHECK_HIP_ERROR(hipEventDestroy(dgemmStart[HPL_UPD_1])); 119 | CHECK_HIP_ERROR(hipEventDestroy(dgemmStart[HPL_UPD_2])); 120 | 121 | CHECK_HIP_ERROR(hipEventDestroy(dgemmStop[HPL_LOOK_AHEAD])); 122 | CHECK_HIP_ERROR(hipEventDestroy(dgemmStop[HPL_UPD_1])); 123 | CHECK_HIP_ERROR(hipEventDestroy(dgemmStop[HPL_UPD_2])); 124 | 125 | 
CHECK_HIP_ERROR(hipEventDestroy(pfactStart));
126 |   CHECK_HIP_ERROR(hipEventDestroy(pfactStop));
127 | 
128 |   CHECK_HIP_ERROR(hipStreamDestroy(computeStream));
129 |   CHECK_HIP_ERROR(hipStreamDestroy(dataStream));
130 | }
131 | 
--------------------------------------------------------------------------------
/src/HPL_pdwarmup.cpp:
--------------------------------------------------------------------------------
1 | /* ---------------------------------------------------------------------
2 | * -- High Performance Computing Linpack Benchmark (HPL)
3 | *    HPL - 2.2 - February 24, 2016
4 | *    Antoine P. Petitet
5 | *    University of Tennessee, Knoxville
6 | *    Innovative Computing Laboratory
7 | *    (C) Copyright 2000-2008 All Rights Reserved
8 | *
9 | *    Modified by: Noel Chalmers
10 | *    (C) 2018-2025 Advanced Micro Devices, Inc.
11 | *    See the rocHPL/LICENCE file for details.
12 | *
13 | * SPDX-License-Identifier: (BSD-3-Clause)
14 | * ---------------------------------------------------------------------
15 | */
16 | #include "hpl.hpp"
17 | 
18 | int HPL_pdwarmup(HPL_T_test* TEST,
19 |                  HPL_T_grid* GRID,
20 |                  HPL_T_palg* ALGO,
21 |                  HPL_T_pmat* mat) {
22 | 
23 |   int N  = mat->n;
24 |   int NB = mat->nb;
25 | 
26 |   HPL_T_UPD_FUN HPL_pdupdate = ALGO->upfun;
27 | 
28 |   HPL_T_panel* p0 = &(mat->panel[0]);
29 |   HPL_T_panel* p1 = &(mat->panel[1]);
30 | 
31 |   HPL_pdpanel_init(
32 |       GRID, ALGO, N, N + 1, Mmin(N, NB), mat, 0, 0, MSGID_BEGIN_FACT, p0);
33 |   HPL_pdpanel_init(
34 |       GRID, ALGO, N, N + 1, Mmin(N, NB), mat, 0, 0, MSGID_BEGIN_FACT, p1);
35 | 
36 |   int mm = Mmin(p0->mp, p0->jb);
37 |   int nn = Mmin(p0->nq, p0->jb);
38 | 
39 |   // Fill the matrix with values
40 |   HPL_pdrandmat(GRID, N, N + 1, NB, mat->A, mat->ld, HPL_ISEED);
41 | 
42 |   // Do a pfact on all columns
43 |   p0->pcol = p0->grid->mycol;
44 |   HPL_pdfact(p0);
45 |   HPL_pdpanel_swapids(p0);
46 |   HPL_pdpanel_Wait(p0);
47 |   p0->A -= p0->jb * static_cast<size_t>(p0->lda);
48 | 
49 |   // Broadcast to register with MPI
50 |   p0->pcol = 0;
51 |   HPL_pdpanel_bcast(p0);
52 | 
53 |   p0->nu0  = nn;
54 |   p0->ldu0 = nn;
55 |   HPL_pdlaswp_start(p0, HPL_LOOK_AHEAD);
56 |   HPL_pdlaswp_exchange(p0, HPL_LOOK_AHEAD);
57 |   HPL_pdlaswp_end(p0, HPL_LOOK_AHEAD);
58 |   HPL_pdupdate(p0, HPL_LOOK_AHEAD);
59 |   p0->nu0 = 0;
60 | 
61 |   HPL_pdlaswp_start(p0, HPL_UPD_1);
62 |   HPL_pdlaswp_exchange(p0, HPL_UPD_1);
63 |   HPL_pdlaswp_end(p0, HPL_UPD_1);
64 |   HPL_pdupdate(p0, HPL_UPD_1);
65 | 
66 |   HPL_pdlaswp_start(p0, HPL_UPD_2);
67 |   HPL_pdlaswp_exchange(p0, HPL_UPD_2);
68 |   HPL_pdlaswp_end(p0, HPL_UPD_2);
69 |   HPL_pdupdate(p0, HPL_UPD_2);
70 | 
71 |   CHECK_HIP_ERROR(hipDeviceSynchronize());
72 | 
73 |   // Do a pfact on all columns
74 |   p1->pcol = p1->grid->mycol;
75 |   HPL_pdfact(p1);
76 |   HPL_pdpanel_swapids(p1);
77 |   HPL_pdpanel_Wait(p1);
78 |   p1->A -= p1->jb * static_cast<size_t>(p1->lda);
79 | 
80 |   // Broadcast to register with MPI
81 |   p1->pcol = 0;
82 |   HPL_pdpanel_bcast(p1);
83 | 
84 |   p1->nu0  = nn;
85 |   p1->ldu0 = nn;
86 |   HPL_pdlaswp_start(p1, HPL_LOOK_AHEAD);
87 |   HPL_pdlaswp_exchange(p1, HPL_LOOK_AHEAD);
88 |   HPL_pdlaswp_end(p1, HPL_LOOK_AHEAD);
89 |   HPL_pdupdate(p1, HPL_LOOK_AHEAD);
90 |   p1->nu0 = 0;
91 | 
92 |   HPL_pdlaswp_start(p1, HPL_UPD_1);
93 |   HPL_pdlaswp_exchange(p1, HPL_UPD_1);
94 |   HPL_pdlaswp_end(p1, HPL_UPD_1);
95 |   HPL_pdupdate(p1, HPL_UPD_1);
96 | 
97 |   HPL_pdlaswp_start(p1, HPL_UPD_2);
98 |   HPL_pdlaswp_exchange(p1, HPL_UPD_2);
99 |   HPL_pdlaswp_end(p1, HPL_UPD_2);
100 |   HPL_pdupdate(p1, HPL_UPD_2);
101 | 
102 |   HPL_pdtrsv(GRID, mat);
103 | 
104 |   return HPL_SUCCESS;
105 | }
106 | 
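/*
 * Editorial note: the warmup above runs both panel workspaces (p0 and p1)
 * through one full iteration -- pfact, swap-id generation, panel broadcast,
 * row swaps, the three trailing updates, and the triangular solve -- so that
 * MPI buffer registration, rocBLAS kernel selection, and first-touch page
 * mapping all happen off the timed path. A sketch of the intended call order
 * in a driver (a hypothetical outline, not the actual HPL_pdtest.cpp; note
 * the warmup factors the matrix, so it must be regenerated before timing):
 *
 *   HPL_pdwarmup(&test, &grid, &algo, &mat);                    // untimed
 *   HPL_pdrandmat(GRID, N, N + 1, NB, mat.A, mat.ld, HPL_ISEED); // regenerate
 *   HPL_ptimer(0);                                              // start clock
 *   HPL_pdgesv(&grid, &algo, &mat);                             // timed solve
 */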
--------------------------------------------------------------------------------
/src/auxil/HPL_abort.cpp:
--------------------------------------------------------------------------------
1 | /* ---------------------------------------------------------------------
2 | * -- High Performance Computing Linpack Benchmark (HPL)
3 | *    HPL - 2.2 - February 24, 2016
4 | *    Antoine P. Petitet
5 | *    University of Tennessee, Knoxville
6 | *    Innovative Computing Laboratory
7 | *    (C) Copyright 2000-2008 All Rights Reserved
8 | *
9 | *    Modified by: Noel Chalmers
10 | *    (C) 2018-2025 Advanced Micro Devices, Inc.
11 | *    See the rocHPL/LICENCE file for details.
12 | *
13 | * SPDX-License-Identifier: (BSD-3-Clause)
14 | * ---------------------------------------------------------------------
15 | */
16 | 
17 | #include "hpl.hpp"
18 | 
19 | void HPL_abort(int LINE, const char* SRNAME, const char* FORM, ...) {
20 | /*
21 |  * Purpose
22 |  * =======
23 |  *
24 |  * HPL_abort displays an error message on stderr and halts execution.
25 |  *
26 |  *
27 |  * Arguments
28 |  * =========
29 |  *
30 |  * LINE    (local input)                 int
31 |  *         On entry, LINE specifies the line number in the file where
32 |  *         the error has occurred. When LINE is not a positive line
33 |  *         number, it is ignored.
34 |  *
35 |  * SRNAME  (local input)                 const char *
36 |  *         On entry, SRNAME should be the name of the routine calling
37 |  *         this error handler.
38 |  *
39 |  * FORM    (local input)                 const char *
40 |  *         On entry, FORM specifies the format, i.e., how the subsequent
41 |  *         arguments are converted for output.
42 |  *
43 |  *         (local input)                 ...
44 |  *         On entry, ... is the list of arguments to be printed within
45 |  *         the format string.
46 |  *
47 |  * ---------------------------------------------------------------------
48 |  */
49 | 
50 |   va_list argptr;
51 |   char    cline[128];
52 | 
53 |   va_start(argptr, FORM);
54 |   (void)vsprintf(cline, FORM, argptr);
55 |   va_end(argptr);
56 |   /*
57 |    * Display an error message
58 |    */
59 |   if(LINE <= 0)
60 |     HPL_fprintf(stderr,
61 |                 "%s %s:\n>>> %s <<< Abort ...\n\n",
62 |                 "HPL ERROR in function",
63 |                 SRNAME,
64 |                 cline);
65 |   else
66 |     HPL_fprintf(stderr,
67 |                 "%s %d %s %s:\n>>> %s <<< Abort ...\n\n",
68 |                 "HPL ERROR on line",
69 |                 LINE,
70 |                 "of function",
71 |                 SRNAME,
72 |                 cline);
73 |   exit(0);
74 | }
75 | 
--------------------------------------------------------------------------------
/src/auxil/HPL_dlacpy_device.cpp:
--------------------------------------------------------------------------------
1 | /* ---------------------------------------------------------------------
2 | * -- High Performance Computing Linpack Benchmark (HPL)
3 | *    HPL - 2.2 - February 24, 2016
4 | *    Antoine P. Petitet
5 | *    University of Tennessee, Knoxville
6 | *    Innovative Computing Laboratory
7 | *    (C) Copyright 2000-2008 All Rights Reserved
8 | *
9 | *    Modified by: Noel Chalmers
10 | *    (C) 2018-2025 Advanced Micro Devices, Inc.
11 | *    See the rocHPL/LICENCE file for details.
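 *
 * Editorial aside on the launch geometry below: each thread block has
 * TILE_DIM x BLOCK_ROWS = 64 x 16 threads and covers a 64 x 64 tile, so
 * every thread copies up to four elements at column offsets 0, 16, 32 and
 * 48. As a quick worked example, the default N = 45312 from scripts/HPL.dat
 * gives a grid of ceil(45312/64) = 708 blocks per dimension, i.e.
 * 708 x 708 = 501,264 blocks for a full square copy.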
12 | *
13 | * SPDX-License-Identifier: (BSD-3-Clause)
14 | * ---------------------------------------------------------------------
15 | */
16 | 
17 | #include "hpl.hpp"
18 | #include <hip/hip_runtime.h>
19 | 
20 | #define TILE_DIM 64
21 | #define BLOCK_ROWS 16
22 | 
23 | __global__ void dlacpy(const int M,
24 |                        const int N,
25 |                        const double* __restrict__ A,
26 |                        const int LDA,
27 |                        double* __restrict__ B,
28 |                        const int LDB) {
29 | 
30 |   const int I = blockIdx.x * TILE_DIM + threadIdx.x;
31 |   const int J = blockIdx.y * TILE_DIM + threadIdx.y;
32 | 
33 |   if(I < M) {
34 |     if(J + 0 < N)
35 |       B[I + static_cast<size_t>(LDB) * (J + 0)] =
36 |           A[I + static_cast<size_t>(LDA) * (J + 0)];
37 |     if(J + 16 < N)
38 |       B[I + static_cast<size_t>(LDB) * (J + 16)] =
39 |           A[I + static_cast<size_t>(LDA) * (J + 16)];
40 |     if(J + 32 < N)
41 |       B[I + static_cast<size_t>(LDB) * (J + 32)] =
42 |           A[I + static_cast<size_t>(LDA) * (J + 32)];
43 |     if(J + 48 < N)
44 |       B[I + static_cast<size_t>(LDB) * (J + 48)] =
45 |           A[I + static_cast<size_t>(LDA) * (J + 48)];
46 |   }
47 | }
48 | 
49 | void HPL_dlacpy(const int M,
50 |                 const int N,
51 |                 const double* A,
52 |                 const int LDA,
53 |                 double* B,
54 |                 const int LDB) {
55 | /*
56 |  * Purpose
57 |  * =======
58 |  *
59 |  * HPL_dlacpy copies an array A into an array B.
60 |  *
61 |  *
62 |  * Arguments
63 |  * =========
64 |  *
65 |  * M       (local input)                 const int
66 |  *         On entry, M specifies the number of rows of the arrays A and
67 |  *         B. M must be at least zero.
68 |  *
69 |  * N       (local input)                 const int
70 |  *         On entry, N specifies the number of columns of the arrays A
71 |  *         and B. N must be at least zero.
72 |  *
73 |  * A       (local input)                 const double *
74 |  *         On entry, A points to an array of dimension (LDA,N).
75 |  *
76 |  * LDA     (local input)                 const int
77 |  *         On entry, LDA specifies the leading dimension of the array A.
78 |  *         LDA must be at least MAX(1,M).
79 |  *
80 |  * B       (local output)                double *
81 |  *         On entry, B points to an array of dimension (LDB,N). On exit,
82 |  *         B is overwritten with A.
83 |  *
84 |  * LDB     (local input)                 const int
85 |  *         On entry, LDB specifies the leading dimension of the array B.
86 |  *         LDB must be at least MAX(1,M).
87 |  *
88 |  * ---------------------------------------------------------------------
89 |  */
90 | 
91 |   if((M <= 0) || (N <= 0)) return;
92 | 
93 |   hipStream_t stream;
94 |   CHECK_ROCBLAS_ERROR(rocblas_get_stream(handle, &stream));
95 | 
96 |   dim3 grid_size((M + TILE_DIM - 1) / TILE_DIM, (N + TILE_DIM - 1) / TILE_DIM);
97 |   dim3 block_size(TILE_DIM, BLOCK_ROWS);
98 |   dlacpy<<<grid_size, block_size, 0, stream>>>(M, N, A, LDA, B, LDB);
99 |   CHECK_HIP_ERROR(hipGetLastError());
100 | }
101 | 
--------------------------------------------------------------------------------
/src/auxil/HPL_dlaprnt.cpp:
--------------------------------------------------------------------------------
1 | /* ---------------------------------------------------------------------
2 | * -- High Performance Computing Linpack Benchmark (HPL)
3 | *    HPL - 2.2 - February 24, 2016
4 | *    Antoine P. Petitet
5 | *    University of Tennessee, Knoxville
6 | *    Innovative Computing Laboratory
7 | *    (C) Copyright 2000-2008 All Rights Reserved
8 | *
9 | *    Modified by: Noel Chalmers
10 | *    (C) 2018-2025 Advanced Micro Devices, Inc.
11 | *    See the rocHPL/LICENCE file for details.
12 | * 13 | * SPDX-License-Identifier: (BSD-3-Clause) 14 | * --------------------------------------------------------------------- 15 | */ 16 | 17 | #include "hpl.hpp" 18 | 19 | void HPL_dlaprnt(const int M, 20 | const int N, 21 | double* A, 22 | const int IA, 23 | const int JA, 24 | const int LDA, 25 | const char* CMATNM) { 26 | /* 27 | * Purpose 28 | * ======= 29 | * 30 | * HPL_dlaprnt prints to standard error an M-by-N matrix A. 31 | * 32 | * 33 | * Arguments 34 | * ========= 35 | * 36 | * M (local input) const int 37 | * On entry, M specifies the number of rows of A. M must be at 38 | * least zero. 39 | * 40 | * N (local input) const int 41 | * On entry, N specifies the number of columns of A. N must be 42 | * at least zero. 43 | * 44 | * A (local input) double * 45 | * On entry, A points to an array of dimension (LDA,N). 46 | * 47 | * IA (local input) const int 48 | * On entry, IA specifies the starting row index to be printed. 49 | * 50 | * JA (local input) const int 51 | * On entry, JA specifies the starting column index to be 52 | * printed. 53 | * 54 | * LDA (local input) const int 55 | * On entry, LDA specifies the leading dimension of the array A. 56 | * LDA must be at least max(1,M). 57 | * 58 | * CMATNM (local input) const char * 59 | * On entry, CMATNM is the name of the matrix to be printed. 60 | * 61 | * --------------------------------------------------------------------- 62 | */ 63 | 64 | int i, j; 65 | 66 | for(j = 0; j < N; j++) { 67 | for(i = 0; i < M; i++) { 68 | HPL_fprintf(stderr, 69 | "%s(%6d,%6d)=%30.18f\n", 70 | CMATNM, 71 | IA + i, 72 | JA + j, 73 | *(Mptr(A, i, j, LDA))); 74 | } 75 | } 76 | } 77 | -------------------------------------------------------------------------------- /src/auxil/HPL_dlatcpy_device.cpp: -------------------------------------------------------------------------------- 1 | /* --------------------------------------------------------------------- 2 | * -- High Performance Computing Linpack Benchmark (HPL) 3 | * HPL - 2.2 - February 24, 2016 4 | * Antoine P. Petitet 5 | * University of Tennessee, Knoxville 6 | * Innovative Computing Laboratory 7 | * (C) Copyright 2000-2008 All Rights Reserved 8 | * 9 | * Modified by: Noel Chalmers 10 | * (C) 2018-2025 Advanced Micro Devices, Inc. 11 | * See the rocHPL/LICENCE file for details. 
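 *
 * Editorial aside: unlike the straight copy in HPL_dlacpy, the transpose
 * below stages each 64 x 64 tile through shared memory, declared as
 * s_tile[TILE_DIM][TILE_DIM + 1]. The extra pad column makes consecutive
 * rows start in different LDS banks, which avoids bank conflicts on the
 * transposed read, so both the strided load and the store stay coalesced.
 * The tile costs 64 * 65 * 8 = 33,280 bytes, comfortably within a 64 KiB
 * LDS allocation.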
12 | *
13 | * SPDX-License-Identifier: (BSD-3-Clause)
14 | * ---------------------------------------------------------------------
15 | */
16 | 
17 | #include "hpl.hpp"
18 | #include <hip/hip_runtime.h>
19 | 
20 | #define TILE_DIM 64
21 | #define BLOCK_ROWS 16
22 | 
23 | __global__ void dlatcpy(const int M,
24 |                         const int N,
25 |                         const double* __restrict__ A,
26 |                         const int LDA,
27 |                         double* __restrict__ B,
28 |                         const int LDB) {
29 | 
30 |   __shared__ double s_tile[TILE_DIM][TILE_DIM + 1];
31 | 
32 |   int I = blockIdx.x * TILE_DIM + threadIdx.y;
33 |   int J = blockIdx.y * TILE_DIM + threadIdx.x;
34 | 
35 |   if(J < N) {
36 |     if(I + 0 < M)
37 |       s_tile[threadIdx.y + 0][threadIdx.x] = A[((size_t)I + 0) * LDA + J];
38 |     if(I + 16 < M)
39 |       s_tile[threadIdx.y + 16][threadIdx.x] = A[((size_t)I + 16) * LDA + J];
40 |     if(I + 32 < M)
41 |       s_tile[threadIdx.y + 32][threadIdx.x] = A[((size_t)I + 32) * LDA + J];
42 |     if(I + 48 < M)
43 |       s_tile[threadIdx.y + 48][threadIdx.x] = A[((size_t)I + 48) * LDA + J];
44 |   }
45 | 
46 |   I = blockIdx.x * TILE_DIM + threadIdx.x;
47 |   J = blockIdx.y * TILE_DIM + threadIdx.y;
48 | 
49 |   __syncthreads();
50 | 
51 |   if(I < M) {
52 |     if(J + 0 < N)
53 |       B[I + ((size_t)J + 0) * LDB] = s_tile[threadIdx.x][threadIdx.y + 0];
54 |     if(J + 16 < N)
55 |       B[I + ((size_t)J + 16) * LDB] = s_tile[threadIdx.x][threadIdx.y + 16];
56 |     if(J + 32 < N)
57 |       B[I + ((size_t)J + 32) * LDB] = s_tile[threadIdx.x][threadIdx.y + 32];
58 |     if(J + 48 < N)
59 |       B[I + ((size_t)J + 48) * LDB] = s_tile[threadIdx.x][threadIdx.y + 48];
60 |   }
61 | }
62 | 
63 | void HPL_dlatcpy(const int M,
64 |                  const int N,
65 |                  const double* A,
66 |                  const int LDA,
67 |                  double* B,
68 |                  const int LDB) {
69 | /*
70 |  * Purpose
71 |  * =======
72 |  *
73 |  * HPL_dlatcpy copies the transpose of an array A into an array B.
74 |  *
75 |  *
76 |  * Arguments
77 |  * =========
78 |  *
79 |  * M       (local input)                 const int
80 |  *         On entry, M specifies the number of rows of the array B and
81 |  *         the number of columns of A. M must be at least zero.
82 |  *
83 |  * N       (local input)                 const int
84 |  *         On entry, N specifies the number of rows of the array A and
85 |  *         the number of columns of B. N must be at least zero.
86 |  *
87 |  * A       (local input)                 const double *
88 |  *         On entry, A points to an array of dimension (LDA,M).
89 |  *
90 |  * LDA     (local input)                 const int
91 |  *         On entry, LDA specifies the leading dimension of the array A.
92 |  *         LDA must be at least MAX(1,N).
93 |  *
94 |  * B       (local output)                double *
95 |  *         On entry, B points to an array of dimension (LDB,N). On exit,
96 |  *         B is overwritten with the transpose of A.
97 |  *
98 |  * LDB     (local input)                 const int
99 |  *         On entry, LDB specifies the leading dimension of the array B.
100 |  *         LDB must be at least MAX(1,M).
101 |  *
102 |  * ---------------------------------------------------------------------
103 |  */
104 | 
105 |   if((M <= 0) || (N <= 0)) return;
106 | 
107 |   hipStream_t stream;
108 |   CHECK_ROCBLAS_ERROR(rocblas_get_stream(handle, &stream));
109 | 
110 |   dim3 grid_size((M + TILE_DIM - 1) / TILE_DIM, (N + TILE_DIM - 1) / TILE_DIM);
111 |   dim3 block_size(TILE_DIM, BLOCK_ROWS);
112 |   dlatcpy<<<grid_size, block_size, 0, stream>>>(M, N, A, LDA, B, LDB);
113 |   CHECK_HIP_ERROR(hipGetLastError());
114 | }
115 | 
--------------------------------------------------------------------------------
/src/auxil/HPL_fprintf.cpp:
--------------------------------------------------------------------------------
1 | /* ---------------------------------------------------------------------
2 | * -- High Performance Computing Linpack Benchmark (HPL)
3 | *    HPL - 2.2 - February 24, 2016
4 | *    Antoine P. Petitet
5 | *    University of Tennessee, Knoxville
6 | *    Innovative Computing Laboratory
7 | *    (C) Copyright 2000-2008 All Rights Reserved
8 | *
9 | *    Modified by: Noel Chalmers
10 | *    (C) 2018-2025 Advanced Micro Devices, Inc.
11 | *    See the rocHPL/LICENCE file for details.
12 | *
13 | * SPDX-License-Identifier: (BSD-3-Clause)
14 | * ---------------------------------------------------------------------
15 | */
16 | 
17 | #include "hpl.hpp"
18 | 
19 | void HPL_fprintf(FILE* STREAM, const char* FORM, ...) {
20 | /*
21 |  * Purpose
22 |  * =======
23 |  *
24 |  * HPL_fprintf is a wrapper around fprintf flushing the output stream.
25 |  *
26 |  *
27 |  * Arguments
28 |  * =========
29 |  *
30 |  * STREAM  (local input)                 FILE *
31 |  *         On entry, STREAM specifies the output stream.
32 |  *
33 |  * FORM    (local input)                 const char *
34 |  *         On entry, FORM specifies the format, i.e., how the subsequent
35 |  *         arguments are converted for output.
36 |  *
37 |  *         (local input)                 ...
38 |  *         On entry, ... is the list of arguments to be printed within
39 |  *         the format string.
40 |  *
41 |  * ---------------------------------------------------------------------
42 |  */
43 | 
44 |   va_list argptr;
45 |   char    cline[256];
46 | 
47 |   va_start(argptr, FORM);
48 |   (void)vsprintf(cline, FORM, argptr);
49 |   va_end(argptr);
50 | 
51 |   (void)fprintf(STREAM, "%s", cline);
52 |   (void)fflush(STREAM);
53 | }
54 | 
--------------------------------------------------------------------------------
/src/auxil/HPL_warn.cpp:
--------------------------------------------------------------------------------
1 | /* ---------------------------------------------------------------------
2 | * -- High Performance Computing Linpack Benchmark (HPL)
3 | *    HPL - 2.2 - February 24, 2016
4 | *    Antoine P. Petitet
5 | *    University of Tennessee, Knoxville
6 | *    Innovative Computing Laboratory
7 | *    (C) Copyright 2000-2008 All Rights Reserved
8 | *
9 | *    Modified by: Noel Chalmers
10 | *    (C) 2018-2025 Advanced Micro Devices, Inc.
11 | *    See the rocHPL/LICENCE file for details.
12 | *
13 | * SPDX-License-Identifier: (BSD-3-Clause)
14 | * ---------------------------------------------------------------------
15 | */
16 | 
17 | #include "hpl.hpp"
18 | 
19 | void HPL_warn(FILE* STREAM,
20 |               int LINE,
21 |               const char* SRNAME,
22 |               const char* FORM,
23 |               ...) {
24 | /*
25 |  * Purpose
26 |  * =======
27 |  *
28 |  * HPL_warn displays an error message.
29 |  *
30 |  *
31 |  * Arguments
32 |  * =========
33 |  *
34 |  * STREAM  (local input)                 FILE *
35 |  *         On entry, STREAM specifies the output stream.
36 |  *
37 |  * LINE    (local input)                 int
38 |  *         On entry, LINE specifies the line number in the file where
39 |  *         the error has occurred.
When LINE is not a positive line 40 | * number, it is ignored. 41 | * 42 | * SRNAME (local input) const char * 43 | * On entry, SRNAME should be the name of the routine calling 44 | * this error handler. 45 | * 46 | * FORM (local input) const char * 47 | * On entry, FORM specifies the format, i.e., how the subsequent 48 | * arguments are converted for output. 49 | * 50 | * (local input) ... 51 | * On entry, ... is the list of arguments to be printed within 52 | * the format string. 53 | * 54 | * --------------------------------------------------------------------- 55 | */ 56 | 57 | va_list argptr; 58 | char cline[128]; 59 | 60 | va_start(argptr, FORM); 61 | (void)vsprintf(cline, FORM, argptr); 62 | va_end(argptr); 63 | /* 64 | * Display an error message 65 | */ 66 | if(LINE <= 0) 67 | HPL_fprintf(STREAM, 68 | "%s %s:\n>>> %s <<<\n\n", 69 | "HPL ERROR in function", 70 | SRNAME, 71 | cline); 72 | else 73 | HPL_fprintf(STREAM, 74 | "%s %d %s %s:\n>>> %s <<<\n\n", 75 | "HPL ERROR on line", 76 | LINE, 77 | "of function", 78 | SRNAME, 79 | cline); 80 | } 81 | -------------------------------------------------------------------------------- /src/comm/HPL_all_reduce.cpp: -------------------------------------------------------------------------------- 1 | /* --------------------------------------------------------------------- 2 | * -- High Performance Computing Linpack Benchmark (HPL) 3 | * HPL - 2.2 - February 24, 2016 4 | * Antoine P. Petitet 5 | * University of Tennessee, Knoxville 6 | * Innovative Computing Laboratory 7 | * (C) Copyright 2000-2008 All Rights Reserved 8 | * 9 | * Modified by: Noel Chalmers 10 | * (C) 2018-2025 Advanced Micro Devices, Inc. 11 | * See the rocHPL/LICENCE file for details. 12 | * 13 | * SPDX-License-Identifier: (BSD-3-Clause) 14 | * --------------------------------------------------------------------- 15 | */ 16 | 17 | #include "hpl.hpp" 18 | 19 | int HPL_all_reduce(void* BUFFER, 20 | const int COUNT, 21 | const HPL_T_TYPE DTYPE, 22 | const HPL_T_OP OP, 23 | MPI_Comm COMM) { 24 | /* 25 | * Purpose 26 | * ======= 27 | * 28 | * HPL_all_reduce performs a global reduce operation across all 29 | * processes of a group leaving the results on all processes. 30 | * 31 | * Arguments 32 | * ========= 33 | * 34 | * BUFFER (local input/global output) void * 35 | * On entry, BUFFER points to the buffer to be combined. On 36 | * exit, this array contains the combined data and is identical 37 | * on all processes in the group. 38 | * 39 | * COUNT (global input) const int 40 | * On entry, COUNT indicates the number of entries in BUFFER. 41 | * COUNT must be at least zero. 42 | * 43 | * DTYPE (global input) const HPL_T_TYPE 44 | * On entry, DTYPE specifies the type of the buffers operands. 45 | * 46 | * OP (global input) const HPL_T_OP 47 | * On entry, OP is a pointer to the local combine function. 48 | * 49 | * COMM (global/local input) MPI_Comm 50 | * The MPI communicator identifying the process collection. 51 | * 52 | * --------------------------------------------------------------------- 53 | */ 54 | 55 | int ierr = MPI_Allreduce( 56 | MPI_IN_PLACE, BUFFER, COUNT, HPL_2_MPI_TYPE(DTYPE), OP, COMM); 57 | 58 | return ((ierr == MPI_SUCCESS ? 
HPL_SUCCESS : HPL_FAILURE));
59 | }
60 | 
--------------------------------------------------------------------------------
/src/comm/HPL_allgatherv.cpp:
--------------------------------------------------------------------------------
1 | /* ---------------------------------------------------------------------
2 | * -- High Performance Computing Linpack Benchmark (HPL)
3 | *    HPL - 2.2 - February 24, 2016
4 | *    Antoine P. Petitet
5 | *    University of Tennessee, Knoxville
6 | *    Innovative Computing Laboratory
7 | *    (C) Copyright 2000-2008 All Rights Reserved
8 | *
9 | *    Modified by: Noel Chalmers
10 | *    (C) 2018-2025 Advanced Micro Devices, Inc.
11 | *    See the rocHPL/LICENCE file for details.
12 | *
13 | * SPDX-License-Identifier: (BSD-3-Clause)
14 | * ---------------------------------------------------------------------
15 | */
16 | 
17 | #include "hpl.hpp"
18 | 
19 | int HPL_allgatherv(double* BUF,
20 |                    const int SCOUNT,
21 |                    const int* RCOUNT,
22 |                    const int* DISPL,
23 |                    MPI_Comm COMM) {
24 | /*
25 |  * Purpose
26 |  * =======
27 |  *
28 |  * HPL_allgatherv is a simple wrapper around an in-place MPI_Allgatherv.
29 |  * Its main purpose is to allow for some experimentation / tuning
30 |  * of this simple routine. Successful completion is indicated by
31 |  * the returned error code HPL_SUCCESS.
32 |  *
33 |  * Arguments
34 |  * =========
35 |  *
36 |  * BUF     (local input/output)          double *
37 |  *         On entry, BUF specifies the starting address of the buffer
38 |  *         to be gathered in place.
39 |  *
40 |  * SCOUNT  (local input)                 const int
41 |  *         On entry, SCOUNT specifies the number of double precision
42 |  *         entries in BUF that this process contributes to the gather.
43 |  *         SCOUNT must be at least zero.
44 |  *
45 |  * RCOUNT  (local input)                 const int *
46 |  *         On entry, RCOUNT is an array of length SIZE specifying
47 |  *         the number of double precision entries in BUF to receive from
48 |  *         each process.
49 |  *
50 |  * DISPL   (local input)                 const int *
51 |  *         On entry, DISPL is an array of length SIZE specifying the
52 |  *         displacement (relative to BUF) at which to place the incoming
53 |  *         data from each process.
54 |  *
55 |  * COMM    (local input)                 MPI_Comm
56 |  *         The MPI communicator identifying the communication space.
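 *
 * Editorial worked example of the ring schedule below (SIZE = 4, blocks
 * numbered by owning rank): in step k, ranks pair up alternately as
 * (0,1)(2,3) and then (1,2)(3,0), and each pair exchanges the block it
 * acquired floor(k/2) steps earlier. Rank 0 therefore starts with block 0,
 * gains block 1 at k = 0, block 3 at k = 1, and block 2 at k = 2 -- after
 * SIZE - 1 = 3 steps every rank holds all four blocks.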
57 |  *
58 |  * ---------------------------------------------------------------------
59 |  */
60 | 
61 |   HPL_TracingPush("HPL_Allgatherv");
62 | 
63 | #ifdef HPL_USE_COLLECTIVES
64 | 
65 |   int ierr = MPI_Allgatherv(
66 |       MPI_IN_PLACE, SCOUNT, MPI_DOUBLE, BUF, RCOUNT, DISPL, MPI_DOUBLE, COMM);
67 | 
68 | #else
69 | 
70 |   int rank, size, ierr = MPI_SUCCESS;
71 |   MPI_Comm_rank(COMM, &rank);
72 |   MPI_Comm_size(COMM, &size);
73 | 
74 |   /*
75 |    * Ring exchange
76 |    */
77 |   const int npm1 = size - 1;
78 |   const int prev = MModSub1(rank, size);
79 |   const int next = MModAdd1(rank, size);
80 | 
81 |   const int tag = 0;
82 | 
83 |   for(int k = 0; k < npm1; k++) {
84 |     MPI_Request request;
85 |     MPI_Status  status;
86 |     const int   l = (int)((unsigned int)(k) >> 1);
87 | 
88 |     int il, lengthS, lengthR, partner, ibufS, ibufR;
89 |     if(((rank + k) & 1) != 0) {
90 |       il      = MModAdd(rank, l, size);
91 |       ibufS   = DISPL[il];
92 |       lengthS = RCOUNT[il];
93 |       il      = MModSub(rank, l + 1, size);
94 |       ibufR   = DISPL[il];
95 |       lengthR = RCOUNT[il];
96 |       partner = prev;
97 |     } else {
98 |       il      = MModSub(rank, l, size);
99 |       ibufS   = DISPL[il];
100 |       lengthS = RCOUNT[il];
101 |       il      = MModAdd(rank, l + 1, size);
102 |       ibufR   = DISPL[il];
103 |       lengthR = RCOUNT[il];
104 |       partner = next;
105 |     }
106 | 
107 |     if(lengthR > 0) {
108 |       if(ierr == MPI_SUCCESS)
109 |         ierr = MPI_Irecv(
110 |             BUF + ibufR, lengthR, MPI_DOUBLE, partner, tag, COMM, &request);
111 |     }
112 | 
113 |     if(lengthS > 0) {
114 |       if(ierr == MPI_SUCCESS)
115 |         ierr = MPI_Send(BUF + ibufS, lengthS, MPI_DOUBLE, partner, tag, COMM);
116 |     }
117 | 
118 |     if(lengthR > 0) {
119 |       if(ierr == MPI_SUCCESS) ierr = MPI_Wait(&request, &status);
120 |     }
121 |   }
122 | 
123 | #endif
124 | 
125 |   HPL_TracingPop();
126 | 
127 |   return ((ierr == MPI_SUCCESS ? HPL_SUCCESS : HPL_FAILURE));
128 | }
129 | 
--------------------------------------------------------------------------------
/src/comm/HPL_barrier.cpp:
--------------------------------------------------------------------------------
1 | /* ---------------------------------------------------------------------
2 | * -- High Performance Computing Linpack Benchmark (HPL)
3 | *    HPL - 2.2 - February 24, 2016
4 | *    Antoine P. Petitet
5 | *    University of Tennessee, Knoxville
6 | *    Innovative Computing Laboratory
7 | *    (C) Copyright 2000-2008 All Rights Reserved
8 | *
9 | *    Modified by: Noel Chalmers
10 | *    (C) 2018-2025 Advanced Micro Devices, Inc.
11 | *    See the rocHPL/LICENCE file for details.
12 | *
13 | * SPDX-License-Identifier: (BSD-3-Clause)
14 | * ---------------------------------------------------------------------
15 | */
16 | 
17 | #include "hpl.hpp"
18 | 
19 | int HPL_barrier(MPI_Comm COMM) {
20 | /*
21 |  * Purpose
22 |  * =======
23 |  *
24 |  * HPL_barrier blocks the caller until all process members have called it.
25 |  * The call returns at any process only after all group members have
26 |  * entered the call.
27 |  *
28 |  * Arguments
29 |  * =========
30 |  *
31 |  * COMM    (global/local input)          MPI_Comm
32 |  *         The MPI communicator identifying the process collection.
33 |  *
34 |  * ---------------------------------------------------------------------
35 |  */
36 | 
37 |   int ierr = MPI_Barrier(COMM);
38 | 
39 |   return ((ierr == MPI_SUCCESS ? 
HPL_SUCCESS : HPL_FAILURE)); 40 | } 41 | -------------------------------------------------------------------------------- /src/comm/HPL_bcast.cpp: -------------------------------------------------------------------------------- 1 | /* --------------------------------------------------------------------- 2 | * -- High Performance Computing Linpack Benchmark (HPL) 3 | * HPL - 2.2 - February 24, 2016 4 | * Antoine P. Petitet 5 | * University of Tennessee, Knoxville 6 | * Innovative Computing Laboratory 7 | * (C) Copyright 2000-2008 All Rights Reserved 8 | * 9 | * Modified by: Noel Chalmers 10 | * (C) 2018-2025 Advanced Micro Devices, Inc. 11 | * See the rocHPL/LICENCE file for details. 12 | * 13 | * SPDX-License-Identifier: (BSD-3-Clause) 14 | * --------------------------------------------------------------------- 15 | */ 16 | 17 | #include "hpl.hpp" 18 | 19 | int HPL_bcast(double* SBUF, 20 | int SCOUNT, 21 | int ROOT, 22 | MPI_Comm COMM, 23 | HPL_T_TOP top) { 24 | /* 25 | * Purpose 26 | * ======= 27 | * 28 | * HPL_bcast is a simple wrapper around MPI_Bcast. Its main purpose is 29 | * to allow for some experimentation / tuning of this simple routine. 30 | * Successful completion is indicated by the returned error code 31 | * HPL_SUCCESS. In the case of messages of length less than or equal to 32 | * zero, this function returns immediately. 33 | * 34 | * Arguments 35 | * ========= 36 | * 37 | * SBUF (local input) double * 38 | * On entry, SBUF specifies the starting address of buffer to be 39 | * broadcast. 40 | * 41 | * SCOUNT (local input) int 42 | * On entry, SCOUNT specifies the number of double precision 43 | * entries in SBUF. SCOUNT must be at least zero. 44 | * 45 | * ROOT (local input) int 46 | * On entry, ROOT specifies the rank of the origin process in 47 | * the communication space defined by COMM. 48 | * 49 | * COMM (local input) MPI_Comm 50 | * The MPI communicator identifying the communication space. 51 | * 52 | * --------------------------------------------------------------------- 53 | */ 54 | 55 | if(SCOUNT <= 0) return (HPL_SUCCESS); 56 | 57 | int ierr; 58 | 59 | HPL_TracingPush("HPL_Bcast"); 60 | 61 | #ifdef HPL_USE_COLLECTIVES 62 | 63 | ierr = MPI_Bcast(SBUF, SCOUNT, MPI_DOUBLE, ROOT, COMM); 64 | 65 | #else 66 | 67 | switch(top) { 68 | case HPL_1RING_M: ierr = HPL_bcast_1rinM(SBUF, SCOUNT, ROOT, COMM); break; 69 | case HPL_1RING: ierr = HPL_bcast_1ring(SBUF, SCOUNT, ROOT, COMM); break; 70 | case HPL_2RING_M: ierr = HPL_bcast_2rinM(SBUF, SCOUNT, ROOT, COMM); break; 71 | case HPL_2RING: ierr = HPL_bcast_2ring(SBUF, SCOUNT, ROOT, COMM); break; 72 | case HPL_BLONG_M: ierr = HPL_bcast_blonM(SBUF, SCOUNT, ROOT, COMM); break; 73 | case HPL_BLONG: ierr = HPL_bcast_blong(SBUF, SCOUNT, ROOT, COMM); break; 74 | default: ierr = HPL_FAILURE; 75 | } 76 | 77 | #endif 78 | 79 | HPL_TracingPop(); 80 | 81 | return ((ierr == MPI_SUCCESS ? HPL_SUCCESS : HPL_FAILURE)); 82 | } 83 | -------------------------------------------------------------------------------- /src/comm/HPL_bcast_1rinM.cpp: -------------------------------------------------------------------------------- 1 | /* --------------------------------------------------------------------- 2 | * -- High Performance Computing Linpack Benchmark (HPL) 3 | * HPL - 2.2 - February 24, 2016 4 | * Antoine P. Petitet 5 | * University of Tennessee, Knoxville 6 | * Innovative Computing Laboratory 7 | * (C) Copyright 2000-2008 All Rights Reserved 8 | * 9 | * Modified by: Noel Chalmers 10 | * (C) 2018-2025 Advanced Micro Devices, Inc. 
11 | * See the rocHPL/LICENCE file for details. 12 | * 13 | * SPDX-License-Identifier: (BSD-3-Clause) 14 | * --------------------------------------------------------------------- 15 | */ 16 | 17 | #include "hpl.hpp" 18 | 19 | int HPL_bcast_1rinM(double* SBUF, int SCOUNT, int ROOT, MPI_Comm COMM) { 20 | 21 | int rank, size; 22 | MPI_Comm_rank(COMM, &rank); 23 | MPI_Comm_size(COMM, &size); 24 | 25 | if(size <= 1) return (MPI_SUCCESS); 26 | 27 | /*Root immediately sends to ROOT+1*/ 28 | if(rank == ROOT) { 29 | MPI_Send(SBUF, SCOUNT, MPI_DOUBLE, MModAdd1(ROOT, size), ROOT, COMM); 30 | } else if(rank == MModAdd1(ROOT, size)) { 31 | MPI_Recv(SBUF, SCOUNT, MPI_DOUBLE, ROOT, ROOT, COMM, MPI_STATUS_IGNORE); 32 | return MPI_SUCCESS; 33 | } 34 | 35 | if(size == 2) return (MPI_SUCCESS); 36 | 37 | /*One ring exchange to rule them all*/ 38 | int chunk_size = 512 * 512; // 2MB 39 | 40 | chunk_size = std::min(chunk_size, SCOUNT); 41 | 42 | MPI_Request request[2]; 43 | 44 | request[0] = MPI_REQUEST_NULL; 45 | request[1] = MPI_REQUEST_NULL; 46 | 47 | const int Nchunks = (SCOUNT + chunk_size - 1) / chunk_size; 48 | 49 | const int tag = rank; 50 | const int next = 51 | (rank == ROOT) ? MModAdd(ROOT, 2, size) : MModAdd1(rank, size); 52 | const int prev = 53 | (rank == MModAdd(ROOT, 2, size)) ? ROOT : MModSub1(rank, size); 54 | 55 | double* RBUF = SBUF; 56 | 57 | /*Shift to ROOT=0*/ 58 | rank = MModSub(rank, ROOT, size); 59 | 60 | int Nsend = (rank == size - 1) ? 0 : SCOUNT; 61 | int Nrecv = (rank == 0) ? 0 : SCOUNT; 62 | 63 | /*Recv from left*/ 64 | int Nr = std::min(Nrecv, chunk_size); 65 | if(Nr > 0) { MPI_Irecv(RBUF, Nr, MPI_DOUBLE, prev, prev, COMM, request + 0); } 66 | 67 | /*Send to right if there is data present to send*/ 68 | int Ns = std::min(Nsend - Nrecv, chunk_size); 69 | if(Ns > 0) { MPI_Isend(SBUF, Ns, MPI_DOUBLE, next, tag, COMM, request + 1); } 70 | 71 | while(Nsend > 0 || Nrecv > 0) { 72 | int index = -1; 73 | MPI_Waitany(2, request, &index, MPI_STATUSES_IGNORE); 74 | 75 | if(index == 0) { /*Recv'd from left*/ 76 | /*If we're waiting on this recv in order to send, send now*/ 77 | if(Nrecv == Nsend) { 78 | Ns = Nr; 79 | MPI_Isend(SBUF, Ns, MPI_DOUBLE, next, tag, COMM, request + 1); 80 | } 81 | 82 | /*Count the recv'd amounts*/ 83 | Nrecv -= Nr; 84 | RBUF += Nr; 85 | 86 | /*Post next recv if needed*/ 87 | Nr = std::min(Nrecv, chunk_size); 88 | if(Nr > 0) { 89 | MPI_Irecv(RBUF, Nr, MPI_DOUBLE, prev, prev, COMM, request + 0); 90 | } else { 91 | request[0] = MPI_REQUEST_NULL; 92 | } 93 | 94 | } else if(index == 1) { /*Sent to right */ 95 | Nsend -= Ns; 96 | SBUF += Ns; 97 | 98 | /*Send to right if there is data present to send*/ 99 | Ns = std::min(Nsend - Nrecv, chunk_size); 100 | if(Ns > 0) { 101 | MPI_Isend(SBUF, Ns, MPI_DOUBLE, next, tag, COMM, request + 1); 102 | } else { 103 | request[1] = MPI_REQUEST_NULL; 104 | } 105 | } 106 | } 107 | 108 | return MPI_SUCCESS; 109 | } 110 | -------------------------------------------------------------------------------- /src/comm/HPL_bcast_1ring.cpp: -------------------------------------------------------------------------------- 1 | /* --------------------------------------------------------------------- 2 | * -- High Performance Computing Linpack Benchmark (HPL) 3 | * HPL - 2.2 - February 24, 2016 4 | * Antoine P. Petitet 5 | * University of Tennessee, Knoxville 6 | * Innovative Computing Laboratory 7 | * (C) Copyright 2000-2008 All Rights Reserved 8 | * 9 | * Modified by: Noel Chalmers 10 | * (C) 2018-2025 Advanced Micro Devices, Inc. 
11 | * See the rocHPL/LICENCE file for details. 12 | * 13 | * SPDX-License-Identifier: (BSD-3-Clause) 14 | * --------------------------------------------------------------------- 15 | */ 16 | 17 | #include "hpl.hpp" 18 | 19 | int HPL_bcast_1ring(double* SBUF, int SCOUNT, int ROOT, MPI_Comm COMM) { 20 | 21 | int rank, size; 22 | MPI_Comm_rank(COMM, &rank); 23 | MPI_Comm_size(COMM, &size); 24 | 25 | if(size <= 1) return (MPI_SUCCESS); 26 | 27 | /*One ring exchange to rule them all*/ 28 | int chunk_size = 512 * 512; // 2MB 29 | // int chunk_size = 64 * 512; // 256KB 30 | 31 | chunk_size = std::min(chunk_size, SCOUNT); 32 | 33 | MPI_Request request[2]; 34 | 35 | request[0] = MPI_REQUEST_NULL; 36 | request[1] = MPI_REQUEST_NULL; 37 | 38 | const int Nchunks = (SCOUNT + chunk_size - 1) / chunk_size; 39 | 40 | const int tag = rank; 41 | const int next = MModAdd1(rank, size); 42 | const int prev = MModSub1(rank, size); 43 | 44 | /*Mid point of message*/ 45 | double* RBUF = SBUF; 46 | 47 | /*Shift to ROOT=0*/ 48 | rank = MModSub(rank, ROOT, size); 49 | 50 | int Nsend = (rank == size - 1) ? 0 : SCOUNT; 51 | int Nrecv = (rank == 0) ? 0 : SCOUNT; 52 | 53 | /*Recv from left*/ 54 | int Nr = std::min(Nrecv, chunk_size); 55 | if(Nr > 0) { MPI_Irecv(RBUF, Nr, MPI_DOUBLE, prev, prev, COMM, request + 0); } 56 | 57 | /*Send to right if there is data present to send*/ 58 | int Ns = std::min(Nsend - Nrecv, chunk_size); 59 | if(Ns > 0) { MPI_Isend(SBUF, Ns, MPI_DOUBLE, next, tag, COMM, request + 1); } 60 | 61 | while(Nsend > 0 || Nrecv > 0) { 62 | int index = -1; 63 | MPI_Waitany(2, request, &index, MPI_STATUSES_IGNORE); 64 | 65 | if(index == 0) { /*Recv'd from left*/ 66 | /*If we're waiting on this recv in order to send, send now*/ 67 | if(Nrecv == Nsend) { 68 | Ns = Nr; 69 | MPI_Isend(SBUF, Ns, MPI_DOUBLE, next, tag, COMM, request + 1); 70 | } 71 | 72 | /*Count the recv'd amounts*/ 73 | Nrecv -= Nr; 74 | RBUF += Nr; 75 | 76 | /*Post next recv if needed*/ 77 | Nr = std::min(Nrecv, chunk_size); 78 | if(Nr > 0) { 79 | MPI_Irecv(RBUF, Nr, MPI_DOUBLE, prev, prev, COMM, request + 0); 80 | } else { 81 | request[0] = MPI_REQUEST_NULL; 82 | } 83 | 84 | } else if(index == 1) { /*Sent to right */ 85 | Nsend -= Ns; 86 | SBUF += Ns; 87 | 88 | /*Send to right if there is data present to send*/ 89 | Ns = std::min(Nsend - Nrecv, chunk_size); 90 | if(Ns > 0) { 91 | MPI_Isend(SBUF, Ns, MPI_DOUBLE, next, tag, COMM, request + 1); 92 | } else { 93 | request[1] = MPI_REQUEST_NULL; 94 | } 95 | } 96 | } 97 | 98 | return MPI_SUCCESS; 99 | } 100 | -------------------------------------------------------------------------------- /src/comm/HPL_bcast_2rinM.cpp: -------------------------------------------------------------------------------- 1 | /* --------------------------------------------------------------------- 2 | * -- High Performance Computing Linpack Benchmark (HPL) 3 | * HPL - 2.2 - February 24, 2016 4 | * Antoine P. Petitet 5 | * University of Tennessee, Knoxville 6 | * Innovative Computing Laboratory 7 | * (C) Copyright 2000-2008 All Rights Reserved 8 | * 9 | * Modified by: Noel Chalmers 10 | * (C) 2018-2025 Advanced Micro Devices, Inc. 11 | * See the rocHPL/LICENCE file for details. 
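 *
 * Editorial aside on the chunking used by these ring broadcasts: the
 * chunk_size of 512 * 512 doubles is 262,144 * 8 bytes = 2 MiB, as the
 * inline comments note. Splitting SCOUNT into Nchunks pieces pipelines the
 * ring, so the last rank finishes after roughly (Nchunks + P - 2) chunk
 * transfers instead of the (P - 1) full-message hops of an unpipelined
 * ring -- a large win once Nchunks is much greater than P.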
12 | * 13 | * SPDX-License-Identifier: (BSD-3-Clause) 14 | * --------------------------------------------------------------------- 15 | */ 16 | 17 | #include "hpl.hpp" 18 | 19 | int HPL_bcast_2rinM(double* SBUF, int SCOUNT, int ROOT, MPI_Comm COMM) { 20 | 21 | int rank, size; 22 | MPI_Comm_rank(COMM, &rank); 23 | MPI_Comm_size(COMM, &size); 24 | 25 | if(size <= 1) return (MPI_SUCCESS); 26 | 27 | /*Root immediately sends to ROOT+1*/ 28 | if(rank == ROOT) { 29 | MPI_Send(SBUF, SCOUNT, MPI_DOUBLE, MModAdd1(ROOT, size), ROOT, COMM); 30 | } else if(rank == MModAdd1(ROOT, size)) { 31 | MPI_Recv(SBUF, SCOUNT, MPI_DOUBLE, ROOT, ROOT, COMM, MPI_STATUS_IGNORE); 32 | return MPI_SUCCESS; 33 | } 34 | 35 | if(size == 2) return (MPI_SUCCESS); 36 | 37 | /*One ring exchange to rule them all*/ 38 | int chunk_size = 512 * 512; // 2MB 39 | 40 | chunk_size = std::min(chunk_size, SCOUNT); 41 | 42 | MPI_Request request[4]; 43 | 44 | request[0] = MPI_REQUEST_NULL; 45 | request[1] = MPI_REQUEST_NULL; 46 | request[2] = MPI_REQUEST_NULL; 47 | request[3] = MPI_REQUEST_NULL; 48 | 49 | const int Nchunks = (SCOUNT + chunk_size - 1) / chunk_size; 50 | const int NchunksHalf = (Nchunks + 1) / 2; 51 | 52 | const int tag = rank; 53 | const int next = 54 | (rank == ROOT) ? MModAdd(ROOT, 2, size) : MModAdd1(rank, size); 55 | const int prev = 56 | (rank == MModAdd(ROOT, 2, size)) ? ROOT : MModSub1(rank, size); 57 | 58 | /*Mid point of message*/ 59 | double* SBUF0 = SBUF; 60 | double* SBUF1 = SBUF + NchunksHalf * chunk_size; 61 | 62 | double* RBUF0 = SBUF0; 63 | double* RBUF1 = SBUF1; 64 | 65 | /*Shift to ROOT=0*/ 66 | rank = MModSub(rank, ROOT, size); 67 | 68 | int Nsend0 = (rank == size - 1) ? 0 : NchunksHalf * chunk_size; 69 | int Nsend1 = (rank == 2) ? 0 : SCOUNT - NchunksHalf * chunk_size; 70 | 71 | int Nrecv0 = (rank == 0) ? 0 : NchunksHalf * chunk_size; 72 | int Nrecv1 = (rank == 0) ? 
0 : SCOUNT - NchunksHalf * chunk_size; 73 | 74 | /*Recv from left*/ 75 | int Nr0 = std::min(Nrecv0, chunk_size); 76 | if(Nr0 > 0) { 77 | MPI_Irecv(RBUF0, Nr0, MPI_DOUBLE, prev, prev, COMM, request + 0); 78 | } 79 | 80 | /*Recv from right*/ 81 | int Nr1 = std::min(Nrecv1, chunk_size); 82 | if(Nr1 > 0) { 83 | MPI_Irecv(RBUF1, Nr1, MPI_DOUBLE, next, next, COMM, request + 1); 84 | } 85 | 86 | /*Send to right if there is data present to send*/ 87 | int Ns0 = std::min(Nsend0 - Nrecv0, chunk_size); 88 | if(Ns0 > 0) { 89 | MPI_Isend(SBUF0, Ns0, MPI_DOUBLE, next, tag, COMM, request + 2); 90 | } 91 | 92 | /*Send to left if there is data present to send*/ 93 | int Ns1 = std::min(Nsend1 - Nrecv1, chunk_size); 94 | if(Ns1 > 0) { 95 | MPI_Isend(SBUF1, Ns1, MPI_DOUBLE, prev, tag, COMM, request + 3); 96 | } 97 | 98 | while(Nsend0 > 0 || Nsend1 > 0 || Nrecv0 > 0 || Nrecv1 > 0) { 99 | int index = -1; 100 | MPI_Waitany(4, request, &index, MPI_STATUSES_IGNORE); 101 | 102 | if(index == 0) { /*Recv'd from left*/ 103 | /*If we're waiting on this recv in order to send, send now*/ 104 | if(Nrecv0 == Nsend0) { 105 | Ns0 = Nr0; 106 | MPI_Isend(SBUF0, Ns0, MPI_DOUBLE, next, tag, COMM, request + 2); 107 | } 108 | 109 | /*Count the recv'd amounts*/ 110 | Nrecv0 -= Nr0; 111 | RBUF0 += Nr0; 112 | 113 | /*Post next recv if needed*/ 114 | Nr0 = std::min(Nrecv0, chunk_size); 115 | if(Nr0 > 0) { 116 | MPI_Irecv(RBUF0, Nr0, MPI_DOUBLE, prev, prev, COMM, request + 0); 117 | } else { 118 | request[0] = MPI_REQUEST_NULL; 119 | } 120 | 121 | } else if(index == 1) { /*Recv'd from right*/ 122 | /*If we're waiting on this recv in order to send, send now*/ 123 | if(Nrecv1 == Nsend1) { 124 | Ns1 = Nr1; 125 | MPI_Isend(SBUF1, Ns1, MPI_DOUBLE, prev, tag, COMM, request + 3); 126 | } 127 | 128 | /*Count the recv'd amounts*/ 129 | Nrecv1 -= Nr1; 130 | RBUF1 += Nr1; 131 | 132 | /*Post next recv if needed*/ 133 | Nr1 = std::min(Nrecv1, chunk_size); 134 | if(Nr1 > 0) { 135 | MPI_Irecv(RBUF1, Nr1, MPI_DOUBLE, next, next, COMM, request + 1); 136 | } else { 137 | request[1] = MPI_REQUEST_NULL; 138 | } 139 | 140 | } else if(index == 2) { /*Sent to right */ 141 | Nsend0 -= Ns0; 142 | SBUF0 += Ns0; 143 | 144 | /*Send to right if there is data present to send*/ 145 | Ns0 = std::min(Nsend0 - Nrecv0, chunk_size); 146 | if(Ns0 > 0) { 147 | MPI_Isend(SBUF0, Ns0, MPI_DOUBLE, next, tag, COMM, request + 2); 148 | } else { 149 | request[2] = MPI_REQUEST_NULL; 150 | } 151 | } else { /*index==3, Sent to left */ 152 | Nsend1 -= Ns1; 153 | SBUF1 += Ns1; 154 | 155 | Ns1 = std::min(Nsend1 - Nrecv1, chunk_size); 156 | if(Ns1 > 0) { 157 | MPI_Isend(SBUF1, Ns1, MPI_DOUBLE, prev, tag, COMM, request + 3); 158 | } else { 159 | request[3] = MPI_REQUEST_NULL; 160 | } 161 | } 162 | } 163 | 164 | return MPI_SUCCESS; 165 | } 166 | -------------------------------------------------------------------------------- /src/comm/HPL_bcast_2ring.cpp: -------------------------------------------------------------------------------- 1 | /* --------------------------------------------------------------------- 2 | * -- High Performance Computing Linpack Benchmark (HPL) 3 | * HPL - 2.2 - February 24, 2016 4 | * Antoine P. Petitet 5 | * University of Tennessee, Knoxville 6 | * Innovative Computing Laboratory 7 | * (C) Copyright 2000-2008 All Rights Reserved 8 | * 9 | * Modified by: Noel Chalmers 10 | * (C) 2018-2025 Advanced Micro Devices, Inc. 11 | * See the rocHPL/LICENCE file for details. 
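 *
 * Editorial aside: the two-ring variants split the buffer at the
 * NchunksHalf * chunk_size boundary and circulate the low half rightward
 * and the high half leftward at the same time. Each rank then receives on
 * two links at once and each direction carries only about SCOUNT / 2
 * doubles, roughly halving the per-link traffic of the one-ring schedule
 * at the cost of twice as many outstanding requests.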
12 | * 13 | * SPDX-License-Identifier: (BSD-3-Clause) 14 | * --------------------------------------------------------------------- 15 | */ 16 | 17 | #include "hpl.hpp" 18 | 19 | int HPL_bcast_2ring(double* SBUF, int SCOUNT, int ROOT, MPI_Comm COMM) { 20 | 21 | int rank, size; 22 | MPI_Comm_rank(COMM, &rank); 23 | MPI_Comm_size(COMM, &size); 24 | 25 | if(size <= 1) return (MPI_SUCCESS); 26 | 27 | /*One ring exchange to rule them all*/ 28 | int chunk_size = 512 * 512; // 2MB 29 | 30 | chunk_size = std::min(chunk_size, SCOUNT); 31 | 32 | MPI_Request request[4]; 33 | 34 | request[0] = MPI_REQUEST_NULL; 35 | request[1] = MPI_REQUEST_NULL; 36 | request[2] = MPI_REQUEST_NULL; 37 | request[3] = MPI_REQUEST_NULL; 38 | 39 | const int Nchunks = (SCOUNT + chunk_size - 1) / chunk_size; 40 | const int NchunksHalf = (Nchunks + 1) / 2; 41 | 42 | const int tag = rank; 43 | const int next = MModAdd1(rank, size); 44 | const int prev = MModSub1(rank, size); 45 | 46 | /*Mid point of message*/ 47 | double* SBUF0 = SBUF; 48 | double* SBUF1 = SBUF + NchunksHalf * chunk_size; 49 | 50 | double* RBUF0 = SBUF0; 51 | double* RBUF1 = SBUF1; 52 | 53 | /*Shift to ROOT=0*/ 54 | rank = MModSub(rank, ROOT, size); 55 | 56 | int Nsend0 = (rank == size - 1) ? 0 : NchunksHalf * chunk_size; 57 | int Nsend1 = (rank == 1) ? 0 : SCOUNT - NchunksHalf * chunk_size; 58 | 59 | int Nrecv0 = (rank == 0) ? 0 : NchunksHalf * chunk_size; 60 | int Nrecv1 = (rank == 0) ? 0 : SCOUNT - NchunksHalf * chunk_size; 61 | 62 | /*Recv from left*/ 63 | int Nr0 = std::min(Nrecv0, chunk_size); 64 | if(Nr0 > 0) { 65 | MPI_Irecv(RBUF0, Nr0, MPI_DOUBLE, prev, prev, COMM, request + 0); 66 | } 67 | 68 | /*Recv from right*/ 69 | int Nr1 = std::min(Nrecv1, chunk_size); 70 | if(Nr1 > 0) { 71 | MPI_Irecv(RBUF1, Nr1, MPI_DOUBLE, next, next, COMM, request + 1); 72 | } 73 | 74 | /*Send to right if there is data present to send*/ 75 | int Ns0 = std::min(Nsend0 - Nrecv0, chunk_size); 76 | if(Ns0 > 0) { 77 | MPI_Isend(SBUF0, Ns0, MPI_DOUBLE, next, tag, COMM, request + 2); 78 | } 79 | 80 | /*Send to left if there is data present to send*/ 81 | int Ns1 = std::min(Nsend1 - Nrecv1, chunk_size); 82 | if(Ns1 > 0) { 83 | MPI_Isend(SBUF1, Ns1, MPI_DOUBLE, prev, tag, COMM, request + 3); 84 | } 85 | 86 | while(Nsend0 > 0 || Nsend1 > 0 || Nrecv0 > 0 || Nrecv1 > 0) { 87 | int index = -1; 88 | MPI_Waitany(4, request, &index, MPI_STATUSES_IGNORE); 89 | 90 | if(index == 0) { /*Recv'd from left*/ 91 | /*If we're waiting on this recv in order to send, send now*/ 92 | if(Nrecv0 == Nsend0) { 93 | Ns0 = Nr0; 94 | MPI_Isend(SBUF0, Ns0, MPI_DOUBLE, next, tag, COMM, request + 2); 95 | } 96 | 97 | /*Count the recv'd amounts*/ 98 | Nrecv0 -= Nr0; 99 | RBUF0 += Nr0; 100 | 101 | /*Post next recv if needed*/ 102 | Nr0 = std::min(Nrecv0, chunk_size); 103 | if(Nr0 > 0) { 104 | MPI_Irecv(RBUF0, Nr0, MPI_DOUBLE, prev, prev, COMM, request + 0); 105 | } else { 106 | request[0] = MPI_REQUEST_NULL; 107 | } 108 | 109 | } else if(index == 1) { /*Recv'd from right*/ 110 | /*If we're waiting on this recv in order to send, send now*/ 111 | if(Nrecv1 == Nsend1) { 112 | Ns1 = Nr1; 113 | MPI_Isend(SBUF1, Ns1, MPI_DOUBLE, prev, tag, COMM, request + 3); 114 | } 115 | 116 | /*Count the recv'd amounts*/ 117 | Nrecv1 -= Nr1; 118 | RBUF1 += Nr1; 119 | 120 | /*Post next recv if needed*/ 121 | Nr1 = std::min(Nrecv1, chunk_size); 122 | if(Nr1 > 0) { 123 | MPI_Irecv(RBUF1, Nr1, MPI_DOUBLE, next, next, COMM, request + 1); 124 | } else { 125 | request[1] = MPI_REQUEST_NULL; 126 | } 127 | 128 | } else if(index == 2) { /*Sent 
to right */ 129 | Nsend0 -= Ns0; 130 | SBUF0 += Ns0; 131 | 132 | /*Send to right if there is data present to send*/ 133 | Ns0 = std::min(Nsend0 - Nrecv0, chunk_size); 134 | if(Ns0 > 0) { 135 | MPI_Isend(SBUF0, Ns0, MPI_DOUBLE, next, tag, COMM, request + 2); 136 | } else { 137 | request[2] = MPI_REQUEST_NULL; 138 | } 139 | } else { /*index==3, Sent to left */ 140 | Nsend1 -= Ns1; 141 | SBUF1 += Ns1; 142 | 143 | Ns1 = std::min(Nsend1 - Nrecv1, chunk_size); 144 | if(Ns1 > 0) { 145 | MPI_Isend(SBUF1, Ns1, MPI_DOUBLE, prev, tag, COMM, request + 3); 146 | } else { 147 | request[3] = MPI_REQUEST_NULL; 148 | } 149 | } 150 | } 151 | 152 | return MPI_SUCCESS; 153 | } 154 | -------------------------------------------------------------------------------- /src/comm/HPL_bcast_blonM.cpp: -------------------------------------------------------------------------------- 1 | /* --------------------------------------------------------------------- 2 | * -- High Performance Computing Linpack Benchmark (HPL) 3 | * HPL - 2.2 - February 24, 2016 4 | * Antoine P. Petitet 5 | * University of Tennessee, Knoxville 6 | * Innovative Computing Laboratory 7 | * (C) Copyright 2000-2008 All Rights Reserved 8 | * 9 | * Modified by: Noel Chalmers 10 | * (C) 2018-2025 Advanced Micro Devices, Inc. 11 | * See the rocHPL/LICENCE file for details. 12 | * 13 | * SPDX-License-Identifier: (BSD-3-Clause) 14 | * --------------------------------------------------------------------- 15 | */ 16 | 17 | #include "hpl.hpp" 18 | 19 | int HPL_bcast_blonM(double* SBUF, int SCOUNT, int ROOT, MPI_Comm COMM) { 20 | 21 | int rank, size; 22 | MPI_Comm_rank(COMM, &rank); 23 | MPI_Comm_size(COMM, &size); 24 | 25 | if(size <= 1) return (MPI_SUCCESS); 26 | 27 | /* 28 | * Cast phase: ROOT process sends to its right neighbor, then spread 29 | * the panel on the other npcol - 2 processes. If I am not the ROOT 30 | * process, probe for message received. If the message is there, then 31 | * receive it. If I am just after the ROOT process, return. Otherwise, 32 | * keep spreading on those npcol - 2 processes. Otherwise, inform the 33 | * caller that the panel has still not been received. 34 | */ 35 | int count, ierr = MPI_SUCCESS, ibuf, ibufR, ibufS, indx, ip2 = 1, k, l, lbuf, 36 | lbufR, lbufS, mask = 1, mydist, mydist2, next, npm1, npm2, partner, 37 | prev; 38 | 39 | const int tag = ROOT; 40 | next = MModAdd1(rank, size); 41 | prev = MModSub1(rank, size); 42 | 43 | if(rank == ROOT) { 44 | if(ierr == MPI_SUCCESS) 45 | ierr = 46 | MPI_Send(SBUF, SCOUNT, MPI_DOUBLE, MModAdd1(rank, size), tag, COMM); 47 | } else if(prev == ROOT) { 48 | if(ierr == MPI_SUCCESS) 49 | ierr = MPI_Recv( 50 | SBUF, SCOUNT, MPI_DOUBLE, ROOT, tag, COMM, MPI_STATUS_IGNORE); 51 | } 52 | /* 53 | * if I am just after the ROOT, exit now. The message receive completed 54 | * successfully, this guy is done. If there are only 2 processes in each 55 | * row of processes, we are done as well. 
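 * As an editorial note, this "modified" (M) variant appears to exist for
 * lookahead: the process just after ROOT is the one that needs the next
 * panel first, so it is handed the entire panel in a single message up
 * front and drops out, rather than waiting for pieces to arrive through
 * the spread/roll phases that the remaining npcol - 2 processes perform
 * below.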
56 | */ 57 | if((prev == ROOT) || (size == 2)) return ierr; 58 | /* 59 | * Otherwise, proceed with broadcast - Spread the panel across process 60 | * columns 61 | */ 62 | npm2 = (npm1 = size - 1) - 1; 63 | 64 | k = npm2; 65 | while(k > 1) { 66 | k >>= 1; 67 | ip2 <<= 1; 68 | mask <<= 1; 69 | mask++; 70 | } 71 | if(rank == ROOT) 72 | mydist2 = (mydist = 0); 73 | else 74 | mydist2 = (mydist = MModSub(rank, ROOT, size) - 1); 75 | 76 | indx = ip2; 77 | count = SCOUNT / npm1; 78 | count = Mmax(count, 1); 79 | 80 | do { 81 | mask ^= ip2; 82 | 83 | if((mydist & mask) == 0) { 84 | lbuf = SCOUNT - (ibuf = indx * count); 85 | if(indx + ip2 < npm1) { 86 | l = ip2 * count; 87 | lbuf = Mmin(lbuf, l); 88 | } 89 | 90 | partner = mydist ^ ip2; 91 | 92 | if((mydist & ip2) != 0) { 93 | partner = MModAdd(ROOT, partner, size); 94 | if(partner != ROOT) partner = MModAdd1(partner, size); 95 | 96 | if(lbuf > 0) { 97 | if(ierr == MPI_SUCCESS) 98 | ierr = MPI_Recv(SBUF + ibuf, 99 | lbuf, 100 | MPI_DOUBLE, 101 | partner, 102 | tag, 103 | COMM, 104 | MPI_STATUS_IGNORE); 105 | } 106 | } else if(partner < npm1) { 107 | partner = MModAdd(ROOT, partner, size); 108 | if(partner != ROOT) partner = MModAdd1(partner, size); 109 | 110 | if(lbuf > 0) { 111 | if(ierr == MPI_SUCCESS) 112 | ierr = MPI_Send(SBUF + ibuf, lbuf, MPI_DOUBLE, partner, tag, COMM); 113 | } 114 | } 115 | } 116 | 117 | if(mydist2 < ip2) { 118 | ip2 >>= 1; 119 | indx -= ip2; 120 | } else { 121 | mydist2 -= ip2; 122 | ip2 >>= 1; 123 | indx += ip2; 124 | } 125 | 126 | } while(ip2 > 0); 127 | /* 128 | * Roll the pieces 129 | */ 130 | if(MModSub1(prev, size) == ROOT) prev = ROOT; 131 | if(rank == ROOT) next = MModAdd1(next, size); 132 | 133 | for(k = 0; k < npm2; k++) { 134 | l = (k >> 1); 135 | /* 136 | * Who is sending to who and how much 137 | */ 138 | if(((mydist + k) & 1) != 0) { 139 | ibufS = (indx = MModAdd(mydist, l, npm1)) * count; 140 | lbufS = (indx == npm2 ? SCOUNT : ibufS + count); 141 | lbufS = Mmin(SCOUNT, lbufS) - ibufS; 142 | lbufS = Mmax(0, lbufS); 143 | 144 | ibufR = (indx = MModSub(mydist, l + 1, npm1)) * count; 145 | lbufR = (indx == npm2 ? SCOUNT : ibufR + count); 146 | lbufR = Mmin(SCOUNT, lbufR) - ibufR; 147 | lbufR = Mmax(0, lbufR); 148 | 149 | partner = prev; 150 | } else { 151 | ibufS = (indx = MModSub(mydist, l, npm1)) * count; 152 | lbufS = (indx == npm2 ? SCOUNT : ibufS + count); 153 | lbufS = Mmin(SCOUNT, lbufS) - ibufS; 154 | lbufS = Mmax(0, lbufS); 155 | 156 | ibufR = (indx = MModAdd(mydist, l + 1, npm1)) * count; 157 | lbufR = (indx == npm2 ? 
SCOUNT : ibufR + count); 158 | lbufR = Mmin(SCOUNT, lbufR) - ibufR; 159 | lbufR = Mmax(0, lbufR); 160 | 161 | partner = next; 162 | } 163 | /* 164 | * Exchange the messages 165 | */ 166 | MPI_Request request; 167 | MPI_Status status; 168 | 169 | if(lbufR > 0) { 170 | if(ierr == MPI_SUCCESS) 171 | ierr = MPI_Irecv( 172 | SBUF + ibufR, lbufR, MPI_DOUBLE, partner, tag, COMM, &request); 173 | } 174 | 175 | if(lbufS > 0) { 176 | if(ierr == MPI_SUCCESS) 177 | ierr = MPI_Send(SBUF + ibufS, lbufS, MPI_DOUBLE, partner, tag, COMM); 178 | } 179 | 180 | if(lbufR > 0) 181 | if(ierr == MPI_SUCCESS) ierr = MPI_Wait(&request, &status); 182 | } 183 | 184 | return ierr; 185 | } 186 | -------------------------------------------------------------------------------- /src/comm/HPL_bcast_blong.cpp: -------------------------------------------------------------------------------- 1 | /* --------------------------------------------------------------------- 2 | * -- High Performance Computing Linpack Benchmark (HPL) 3 | * HPL - 2.2 - February 24, 2016 4 | * Antoine P. Petitet 5 | * University of Tennessee, Knoxville 6 | * Innovative Computing Laboratory 7 | * (C) Copyright 2000-2008 All Rights Reserved 8 | * 9 | * Modified by: Noel Chalmers 10 | * (C) 2018-2025 Advanced Micro Devices, Inc. 11 | * See the rocHPL/LICENCE file for details. 12 | * 13 | * SPDX-License-Identifier: (BSD-3-Clause) 14 | * --------------------------------------------------------------------- 15 | */ 16 | 17 | #include "hpl.hpp" 18 | 19 | int HPL_bcast_blong(double* SBUF, int SCOUNT, int ROOT, MPI_Comm COMM) { 20 | 21 | int rank, size; 22 | MPI_Comm_rank(COMM, &rank); 23 | MPI_Comm_size(COMM, &size); 24 | 25 | if(size <= 1) return (MPI_SUCCESS); 26 | 27 | /* 28 | * Cast phase: If I am the ROOT process, start spreading the panel. If 29 | * I am not the ROOT process, test for message receive completion. If 30 | * the message is there, then receive it, and keep spreading in a 31 | * blocking fashion this time. Otherwise, inform the caller that the 32 | * panel has still not been received. 
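 * Editorial worked example (size = 4, count = SCOUNT / 4): the hypercube
 * spread first moves blocks 2-3 from ROOT to the distance-2 process, then
 * blocks 1 and 3 each move one more step, leaving distance 0 with
 * everything, distance 1 with block 1, distance 2 with blocks 2-3, and
 * distance 3 with block 3. The size - 1 = 3 "roll" steps below then act as
 * a ring allgather that fills in the rest. Total traffic per rank is about
 * 2 * (size - 1) / size * SCOUNT, the classic bandwidth-optimal
 * scatter-plus-allgather ("long") broadcast.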
33 | */ 34 | int count, ierr = MPI_SUCCESS, ibuf, ibufR, ibufS, indx, ip2, k, l, lbuf, 35 | lbufR, lbufS, mask, mydist, mydist2, npm1, partner, next, prev; 36 | 37 | const int tag = 0; 38 | 39 | // ip2 : largest power of two <= size-1; 40 | // mask : ip2 procs hypercube mask; 41 | mask = ip2 = 1; 42 | k = size - 1; 43 | while(k > 1) { 44 | k >>= 1; 45 | ip2 <<= 1; 46 | mask <<= 1; 47 | mask++; 48 | } 49 | 50 | npm1 = size - 1; 51 | mydist2 = (mydist = MModSub(rank, ROOT, size)); 52 | indx = ip2; 53 | count = SCOUNT / size; 54 | count = Mmax(count, 1); 55 | /* 56 | * Spread the panel across process columns 57 | */ 58 | do { 59 | mask ^= ip2; 60 | 61 | if((mydist & mask) == 0) { 62 | lbuf = SCOUNT - (ibuf = indx * count); 63 | if(indx + ip2 < size) { 64 | l = ip2 * count; 65 | lbuf = Mmin(lbuf, l); 66 | } 67 | 68 | partner = mydist ^ ip2; 69 | 70 | if((mydist & ip2) != 0) { 71 | partner = MModAdd(ROOT, partner, size); 72 | 73 | if(lbuf > 0) { 74 | if(ierr == MPI_SUCCESS) 75 | ierr = MPI_Recv(SBUF + ibuf, 76 | lbuf, 77 | MPI_DOUBLE, 78 | partner, 79 | tag, 80 | COMM, 81 | MPI_STATUS_IGNORE); 82 | } 83 | } else if(partner < size) { 84 | partner = MModAdd(ROOT, partner, size); 85 | 86 | if(lbuf > 0) { 87 | if(ierr == MPI_SUCCESS) 88 | ierr = MPI_Send(SBUF + ibuf, lbuf, MPI_DOUBLE, partner, tag, COMM); 89 | } 90 | } 91 | } 92 | 93 | if(mydist2 < ip2) { 94 | ip2 >>= 1; 95 | indx -= ip2; 96 | } else { 97 | mydist2 -= ip2; 98 | ip2 >>= 1; 99 | indx += ip2; 100 | } 101 | 102 | } while(ip2 > 0); 103 | /* 104 | * Roll the pieces 105 | */ 106 | prev = MModSub1(rank, size); 107 | next = MModAdd1(rank, size); 108 | 109 | for(k = 0; k < npm1; k++) { 110 | l = (k >> 1); 111 | /* 112 | * Who is sending to who and how much 113 | */ 114 | if(((mydist + k) & 1) != 0) { 115 | ibufS = (indx = MModAdd(mydist, l, size)) * count; 116 | lbufS = (indx == npm1 ? SCOUNT : ibufS + count); 117 | lbufS = Mmin(SCOUNT, lbufS) - ibufS; 118 | lbufS = Mmax(0, lbufS); 119 | 120 | ibufR = (indx = MModSub(mydist, l + 1, size)) * count; 121 | lbufR = (indx == npm1 ? SCOUNT : ibufR + count); 122 | lbufR = Mmin(SCOUNT, lbufR) - ibufR; 123 | lbufR = Mmax(0, lbufR); 124 | 125 | partner = prev; 126 | } else { 127 | ibufS = (indx = MModSub(mydist, l, size)) * count; 128 | lbufS = (indx == npm1 ? SCOUNT : ibufS + count); 129 | lbufS = Mmin(SCOUNT, lbufS) - ibufS; 130 | lbufS = Mmax(0, lbufS); 131 | 132 | ibufR = (indx = MModAdd(mydist, l + 1, size)) * count; 133 | lbufR = (indx == npm1 ? SCOUNT : ibufR + count); 134 | lbufR = Mmin(SCOUNT, lbufR) - ibufR; 135 | lbufR = Mmax(0, lbufR); 136 | 137 | partner = next; 138 | } 139 | /* 140 | * Exchange the messages 141 | */ 142 | MPI_Request request; 143 | MPI_Status status; 144 | 145 | if(lbufR > 0) { 146 | if(ierr == MPI_SUCCESS) 147 | ierr = MPI_Irecv( 148 | SBUF + ibufR, lbufR, MPI_DOUBLE, partner, tag, COMM, &request); 149 | } 150 | 151 | if(lbufS > 0) { 152 | if(ierr == MPI_SUCCESS) 153 | ierr = MPI_Send(SBUF + ibufS, lbufS, MPI_DOUBLE, partner, tag, COMM); 154 | } 155 | 156 | if(lbufR > 0) 157 | if(ierr == MPI_SUCCESS) ierr = MPI_Wait(&request, &status); 158 | } 159 | 160 | return ierr; 161 | } 162 | -------------------------------------------------------------------------------- /src/comm/HPL_broadcast.cpp: -------------------------------------------------------------------------------- 1 | /* --------------------------------------------------------------------- 2 | * -- High Performance Computing Linpack Benchmark (HPL) 3 | * HPL - 2.2 - February 24, 2016 4 | * Antoine P. 
Petitet 5 | * University of Tennessee, Knoxville 6 | * Innovative Computing Laboratory 7 | * (C) Copyright 2000-2008 All Rights Reserved 8 | * 9 | * Modified by: Noel Chalmers 10 | * (C) 2018-2025 Advanced Micro Devices, Inc. 11 | * See the rocHPL/LICENCE file for details. 12 | * 13 | * SPDX-License-Identifier: (BSD-3-Clause) 14 | * --------------------------------------------------------------------- 15 | */ 16 | 17 | #include "hpl.hpp" 18 | 19 | int HPL_broadcast(void* BUFFER, 20 | const int COUNT, 21 | const HPL_T_TYPE DTYPE, 22 | const int ROOT, 23 | MPI_Comm COMM) { 24 | /* 25 | * Purpose 26 | * ======= 27 | * 28 | * HPL_broadcast broadcasts a message from the process with rank ROOT to 29 | * all processes in the group. 30 | * 31 | * Arguments 32 | * ========= 33 | * 34 | * BUFFER (local input/output) void * 35 | * On entry, BUFFER points to the buffer to be broadcast. On 36 | * exit, this array contains the broadcast data and is identical 37 | * on all processes in the group. 38 | * 39 | * COUNT (global input) const int 40 | * On entry, COUNT indicates the number of entries in BUFFER. 41 | * COUNT must be at least zero. 42 | * 43 | * DTYPE (global input) const HPL_T_TYPE 44 | * On entry, DTYPE specifies the type of the buffers operands. 45 | * 46 | * ROOT (global input) const int 47 | * On entry, ROOT is the coordinate of the source process. 48 | * 49 | * COMM (global/local input) MPI_Comm 50 | * The MPI communicator identifying the process collection. 51 | * 52 | * --------------------------------------------------------------------- 53 | */ 54 | 55 | int ierr = MPI_Bcast(BUFFER, COUNT, HPL_2_MPI_TYPE(DTYPE), ROOT, COMM); 56 | 57 | return ((ierr == MPI_SUCCESS ? HPL_SUCCESS : HPL_FAILURE)); 58 | } 59 | -------------------------------------------------------------------------------- /src/comm/HPL_recv.cpp: -------------------------------------------------------------------------------- 1 | /* --------------------------------------------------------------------- 2 | * -- High Performance Computing Linpack Benchmark (HPL) 3 | * HPL - 2.2 - February 24, 2016 4 | * Antoine P. Petitet 5 | * University of Tennessee, Knoxville 6 | * Innovative Computing Laboratory 7 | * (C) Copyright 2000-2008 All Rights Reserved 8 | * 9 | * Modified by: Noel Chalmers 10 | * (C) 2018-2025 Advanced Micro Devices, Inc. 11 | * See the rocHPL/LICENCE file for details. 12 | * 13 | * SPDX-License-Identifier: (BSD-3-Clause) 14 | * --------------------------------------------------------------------- 15 | */ 16 | 17 | #include "hpl.hpp" 18 | 19 | int HPL_recv(double* RBUF, int RCOUNT, int SRC, int RTAG, MPI_Comm COMM) { 20 | /* 21 | * Purpose 22 | * ======= 23 | * 24 | * HPL_recv is a simple wrapper around MPI_Recv. Its main purpose is 25 | * to allow for some experimentation / tuning of this simple routine. 26 | * Successful completion is indicated by the returned error code 27 | * HPL_SUCCESS. In the case of messages of length less than or equal to 28 | * zero, this function returns immediately. 29 | * 30 | * Arguments 31 | * ========= 32 | * 33 | * RBUF (local output) double * 34 | * On entry, RBUF specifies the starting address of buffer to be 35 | * received. 36 | * 37 | * RCOUNT (local input) int 38 | * On entry, RCOUNT specifies the number of double precision 39 | * entries in RBUF. RCOUNT must be at least zero. 40 | * 41 | * SRC (local input) int 42 | * On entry, SRC specifies the rank of the sending process in 43 | * the communication space defined by COMM. 
44 |  *
45 |  * RTAG    (local input)                 int
46 |  *         On entry, RTAG specifies the message tag to be used for this
47 |  *         communication operation.
48 |  *
49 |  * COMM    (local input)                 MPI_Comm
50 |  *         The MPI communicator identifying the communication space.
51 |  *
52 |  * ---------------------------------------------------------------------
53 |  */
54 | 
55 |   if(RCOUNT <= 0) return (HPL_SUCCESS);
56 | 
57 |   MPI_Status status;
58 | 
59 |   int ierr =
60 |       MPI_Recv((void*)(RBUF), RCOUNT, MPI_DOUBLE, SRC, RTAG, COMM, &status);
61 | 
62 |   return ((ierr == MPI_SUCCESS ? HPL_SUCCESS : HPL_FAILURE));
63 | }
64 | 
--------------------------------------------------------------------------------
/src/comm/HPL_reduce.cpp:
--------------------------------------------------------------------------------
 1 | /* ---------------------------------------------------------------------
 2 |  * -- High Performance Computing Linpack Benchmark (HPL)
 3 |  *    HPL - 2.2 - February 24, 2016
 4 |  *    Antoine P. Petitet
 5 |  *    University of Tennessee, Knoxville
 6 |  *    Innovative Computing Laboratory
 7 |  *    (C) Copyright 2000-2008 All Rights Reserved
 8 |  *
 9 |  * Modified by: Noel Chalmers
10 |  * (C) 2018-2025 Advanced Micro Devices, Inc.
11 |  * See the rocHPL/LICENCE file for details.
12 |  *
13 |  * SPDX-License-Identifier: (BSD-3-Clause)
14 |  * ---------------------------------------------------------------------
15 |  */
16 | 
17 | #include "hpl.hpp"
18 | 
19 | int HPL_reduce(void*            BUFFER,
20 |                const int        COUNT,
21 |                const HPL_T_TYPE DTYPE,
22 |                const HPL_T_OP   OP,
23 |                const int        ROOT,
24 |                MPI_Comm         COMM) {
25 |   /*
26 |    * Purpose
27 |    * =======
28 |    *
29 |    * HPL_reduce performs a global reduce operation across all processes of
30 |    * a group. Note that the input buffer is used as a workspace and that,
31 |    * in all processes except the accumulating one, its original data is
32 |    * corrupted.
33 |    *
34 |    * Arguments
35 |    * =========
36 |    *
37 |    * BUFFER  (local input/output)          void *
38 |    *         On entry, BUFFER points to the buffer to be reduced. On
39 |    *         exit, in the process of rank ROOT, this array contains the
40 |    *         reduced data. This buffer is also used as workspace during
41 |    *         the operation in the other processes of the group.
42 |    *
43 |    * COUNT   (global input)                const int
44 |    *         On entry, COUNT indicates the number of entries in BUFFER.
45 |    *         COUNT must be at least zero.
46 |    *
47 |    * DTYPE   (global input)                const HPL_T_TYPE
48 |    *         On entry, DTYPE specifies the type of the buffer operands.
49 |    *
50 |    * OP      (global input)                const HPL_T_OP
51 |    *         On entry, OP is a pointer to the local combine function.
52 |    *
53 |    * ROOT    (global input)                const int
54 |    *         On entry, ROOT is the coordinate of the accumulating process.
55 |    *
56 |    * COMM    (global/local input)          MPI_Comm
57 |    *         The MPI communicator identifying the process collection.
58 |    *
59 |    * ---------------------------------------------------------------------
60 |    */
61 | 
62 |   int ierr;
63 | 
64 |   int rank;
65 |   MPI_Comm_rank(COMM, &rank);
66 | 
67 |   if(rank == ROOT)
68 |     ierr = MPI_Reduce(
69 |         MPI_IN_PLACE, BUFFER, COUNT, HPL_2_MPI_TYPE(DTYPE), OP, ROOT, COMM);
70 |   else
71 |     ierr =
72 |         MPI_Reduce(BUFFER, NULL, COUNT, HPL_2_MPI_TYPE(DTYPE), OP, ROOT, COMM);
73 | 
74 |   return ((ierr == MPI_SUCCESS ?
HPL_SUCCESS : HPL_FAILURE));
75 | }
76 | 
--------------------------------------------------------------------------------
/src/comm/HPL_scatterv.cpp:
--------------------------------------------------------------------------------
 1 | /* ---------------------------------------------------------------------
 2 |  * -- High Performance Computing Linpack Benchmark (HPL)
 3 |  *    HPL - 2.2 - February 24, 2016
 4 |  *    Antoine P. Petitet
 5 |  *    University of Tennessee, Knoxville
 6 |  *    Innovative Computing Laboratory
 7 |  *    (C) Copyright 2000-2008 All Rights Reserved
 8 |  *
 9 |  * Modified by: Noel Chalmers
10 |  * (C) 2018-2025 Advanced Micro Devices, Inc.
11 |  * See the rocHPL/LICENCE file for details.
12 |  *
13 |  * SPDX-License-Identifier: (BSD-3-Clause)
14 |  * ---------------------------------------------------------------------
15 |  */
16 | 
17 | #include "hpl.hpp"
18 | 
19 | int HPL_scatterv(double*    BUF,
20 |                  const int* SCOUNT,
21 |                  const int* DISPL,
22 |                  const int  RCOUNT,
23 |                  int        ROOT,
24 |                  MPI_Comm   COMM) {
25 |   /*
26 |    * Purpose
27 |    * =======
28 |    *
29 |    * HPL_scatterv is a simple wrapper around an in-place MPI_Scatterv.
30 |    * Its main purpose is to allow for some experimentation / tuning
31 |    * of this simple routine. Successful completion is indicated by
32 |    * the returned error code HPL_SUCCESS.
33 |    *
34 |    * Arguments
35 |    * =========
36 |    *
37 |    * BUF     (local input/output)          double *
38 |    *         On entry, on the root process BUF specifies the starting
39 |    *         address of the buffer to be scattered. On non-root processes,
40 |    *         BUF specifies the starting point of the received buffer.
41 |    *
42 |    * SCOUNT  (local input)                 int *
43 |    *         On entry, SCOUNT is an array of length SIZE specifying
44 |    *         the number of double precision entries in BUF to send to
45 |    *         each process.
46 |    *
47 |    * DISPL   (local input)                 int *
48 |    *         On entry, DISPL is an array of length SIZE specifying the
49 |    *         displacement (relative to BUF) from which to take the outgoing
50 |    *         data to each process from the root process, and the displacement
51 |    *         (relative to BUF) at which to receive the incoming data on
52 |    *         each non-root process.
53 |    *
54 |    * RCOUNT  (local input)                 int
55 |    *         On entry, RCOUNT specifies the number of double precision
56 |    *         entries in BUF to be received from the ROOT process.
57 |    *
58 |    * ROOT    (local input)                 int
59 |    *         On entry, ROOT specifies the rank of the origin process in
60 |    *         the communication space defined by COMM.
61 |    *
62 |    * COMM    (local input)                 MPI_Comm
63 |    *         The MPI communicator identifying the communication space.
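   *
   * Usage sketch (an editorial example; buf, scount, displ, rank and
   * comm are hypothetical local variables, not part of this file):
   *
   *   int scount[4] = {256, 256, 256, 256}; // entries per process
   *   int displ[4]  = {0, 256, 512, 768};   // offsets into buf
   *   // called on every rank of a 4-process comm; the root sends and
   *   // the other ranks receive scount[rank] doubles into buf:
   *   HPL_scatterv(buf, scount, displ, scount[rank], 0, comm);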
64 | * 65 | * --------------------------------------------------------------------- 66 | */ 67 | 68 | int rank, ierr = MPI_SUCCESS; 69 | MPI_Comm_rank(COMM, &rank); 70 | 71 | HPL_TracingPush("HPL_Scatterv"); 72 | 73 | #ifdef HPL_USE_COLLECTIVES 74 | 75 | if(rank == ROOT) { 76 | ierr = MPI_Scatterv(BUF, 77 | SCOUNT, 78 | DISPL, 79 | MPI_DOUBLE, 80 | MPI_IN_PLACE, 81 | RCOUNT, 82 | MPI_DOUBLE, 83 | ROOT, 84 | COMM); 85 | } else { 86 | ierr = MPI_Scatterv( 87 | NULL, SCOUNT, DISPL, MPI_DOUBLE, BUF, RCOUNT, MPI_DOUBLE, ROOT, COMM); 88 | } 89 | 90 | #else 91 | 92 | int size; 93 | MPI_Comm_size(COMM, &size); 94 | 95 | const int tag = ROOT; 96 | if(rank == ROOT) { 97 | MPI_Request requests[size]; 98 | 99 | /*Just send size-1 messages*/ 100 | for(int i = 0; i < size; ++i) { 101 | 102 | requests[i] = MPI_REQUEST_NULL; 103 | 104 | if(i == ROOT) { continue; } 105 | const int ibuf = DISPL[i]; 106 | const int lbuf = SCOUNT[i]; 107 | 108 | if(lbuf > 0) { 109 | (void)MPI_Isend( 110 | BUF + ibuf, lbuf, MPI_DOUBLE, i, tag, COMM, requests + i); 111 | } 112 | } 113 | 114 | MPI_Waitall(size, requests, MPI_STATUSES_IGNORE); 115 | } else { 116 | if(RCOUNT > 0) 117 | ierr = 118 | MPI_Recv(BUF, RCOUNT, MPI_DOUBLE, ROOT, tag, COMM, MPI_STATUS_IGNORE); 119 | } 120 | 121 | #endif 122 | HPL_TracingPop(); 123 | 124 | return ((ierr == MPI_SUCCESS ? HPL_SUCCESS : HPL_FAILURE)); 125 | } 126 | -------------------------------------------------------------------------------- /src/comm/HPL_sdrv.cpp: -------------------------------------------------------------------------------- 1 | /* --------------------------------------------------------------------- 2 | * -- High Performance Computing Linpack Benchmark (HPL) 3 | * HPL - 2.2 - February 24, 2016 4 | * Antoine P. Petitet 5 | * University of Tennessee, Knoxville 6 | * Innovative Computing Laboratory 7 | * (C) Copyright 2000-2008 All Rights Reserved 8 | * 9 | * Modified by: Noel Chalmers 10 | * (C) 2018-2025 Advanced Micro Devices, Inc. 11 | * See the rocHPL/LICENCE file for details. 12 | * 13 | * SPDX-License-Identifier: (BSD-3-Clause) 14 | * --------------------------------------------------------------------- 15 | */ 16 | 17 | #include "hpl.hpp" 18 | 19 | int HPL_sdrv(double* SBUF, 20 | int SCOUNT, 21 | int STAG, 22 | double* RBUF, 23 | int RCOUNT, 24 | int RTAG, 25 | int PARTNER, 26 | MPI_Comm COMM) { 27 | /* 28 | * Purpose 29 | * ======= 30 | * 31 | * HPL_sdrv is a simple wrapper around MPI_Sendrecv. Its main purpose is 32 | * to allow for some experimentation and tuning of this simple function. 33 | * Messages of length less than or equal to zero are not sent nor 34 | * received. Successful completion is indicated by the returned error 35 | * code HPL_SUCCESS. 36 | * 37 | * Arguments 38 | * ========= 39 | * 40 | * SBUF (local input) double * 41 | * On entry, SBUF specifies the starting address of buffer to be 42 | * sent. 43 | * 44 | * SCOUNT (local input) int 45 | * On entry, SCOUNT specifies the number of double precision 46 | * entries in SBUF. SCOUNT must be at least zero. 47 | * 48 | * STAG (local input) int 49 | * On entry, STAG specifies the message tag to be used for the 50 | * sending communication operation. 51 | * 52 | * RBUF (local output) double * 53 | * On entry, RBUF specifies the starting address of buffer to be 54 | * received. 55 | * 56 | * RCOUNT (local input) int 57 | * On entry, RCOUNT specifies the number of double precision 58 | * entries in RBUF. RCOUNT must be at least zero. 
59 | * 60 | * RTAG (local input) int 61 | * On entry, RTAG specifies the message tag to be used for the 62 | * receiving communication operation. 63 | * 64 | * PARTNER (local input) int 65 | * On entry, PARTNER specifies the rank of the collaborative 66 | * process in the communication space defined by COMM. 67 | * 68 | * COMM (local input) MPI_Comm 69 | * The MPI communicator identifying the communication space. 70 | * 71 | * --------------------------------------------------------------------- 72 | */ 73 | 74 | MPI_Status status; 75 | int ierr; 76 | 77 | ierr = MPI_Sendrecv(SBUF, 78 | SCOUNT, 79 | MPI_DOUBLE, 80 | PARTNER, 81 | STAG, 82 | RBUF, 83 | RCOUNT, 84 | MPI_DOUBLE, 85 | PARTNER, 86 | RTAG, 87 | COMM, 88 | &status); 89 | 90 | return ((ierr == MPI_SUCCESS ? HPL_SUCCESS : HPL_FAILURE)); 91 | } 92 | -------------------------------------------------------------------------------- /src/comm/HPL_send.cpp: -------------------------------------------------------------------------------- 1 | /* --------------------------------------------------------------------- 2 | * -- High Performance Computing Linpack Benchmark (HPL) 3 | * HPL - 2.2 - February 24, 2016 4 | * Antoine P. Petitet 5 | * University of Tennessee, Knoxville 6 | * Innovative Computing Laboratory 7 | * (C) Copyright 2000-2008 All Rights Reserved 8 | * 9 | * Modified by: Noel Chalmers 10 | * (C) 2018-2025 Advanced Micro Devices, Inc. 11 | * See the rocHPL/LICENCE file for details. 12 | * 13 | * SPDX-License-Identifier: (BSD-3-Clause) 14 | * --------------------------------------------------------------------- 15 | */ 16 | 17 | #include "hpl.hpp" 18 | 19 | int HPL_send(double* SBUF, int SCOUNT, int DEST, int STAG, MPI_Comm COMM) { 20 | /* 21 | * Purpose 22 | * ======= 23 | * 24 | * HPL_send is a simple wrapper around MPI_Send. Its main purpose is 25 | * to allow for some experimentation / tuning of this simple routine. 26 | * Successful completion is indicated by the returned error code 27 | * MPI_SUCCESS. In the case of messages of length less than or equal to 28 | * zero, this function returns immediately. 29 | * 30 | * Arguments 31 | * ========= 32 | * 33 | * SBUF (local input) double * 34 | * On entry, SBUF specifies the starting address of buffer to be 35 | * sent. 36 | * 37 | * SCOUNT (local input) int 38 | * On entry, SCOUNT specifies the number of double precision 39 | * entries in SBUF. SCOUNT must be at least zero. 40 | * 41 | * DEST (local input) int 42 | * On entry, DEST specifies the rank of the receiving process in 43 | * the communication space defined by COMM. 44 | * 45 | * STAG (local input) int 46 | * On entry, STAG specifies the message tag to be used for this 47 | * communication operation. 48 | * 49 | * COMM (local input) MPI_Comm 50 | * The MPI communicator identifying the communication space. 51 | * 52 | * --------------------------------------------------------------------- 53 | */ 54 | 55 | if(SCOUNT <= 0) return (HPL_SUCCESS); 56 | 57 | int ierr = MPI_Send((void*)(SBUF), SCOUNT, MPI_DOUBLE, DEST, STAG, COMM); 58 | 59 | return ((ierr == MPI_SUCCESS ? HPL_SUCCESS : HPL_FAILURE)); 60 | } 61 | -------------------------------------------------------------------------------- /src/grid/HPL_grid_exit.cpp: -------------------------------------------------------------------------------- 1 | /* --------------------------------------------------------------------- 2 | * -- High Performance Computing Linpack Benchmark (HPL) 3 | * HPL - 2.2 - February 24, 2016 4 | * Antoine P. 
Petitet 5 | * University of Tennessee, Knoxville 6 | * Innovative Computing Laboratory 7 | * (C) Copyright 2000-2008 All Rights Reserved 8 | * 9 | * Modified by: Noel Chalmers 10 | * (C) 2018-2025 Advanced Micro Devices, Inc. 11 | * See the rocHPL/LICENCE file for details. 12 | * 13 | * SPDX-License-Identifier: (BSD-3-Clause) 14 | * --------------------------------------------------------------------- 15 | */ 16 | 17 | #include "hpl.hpp" 18 | 19 | int HPL_grid_exit(HPL_T_grid* GRID) { 20 | /* 21 | * Purpose 22 | * ======= 23 | * 24 | * HPL_grid_exit marks the process grid object for deallocation. The 25 | * returned error code MPI_SUCCESS indicates successful completion. 26 | * Other error codes are (MPI) implementation dependent. 27 | * 28 | * Arguments 29 | * ========= 30 | * 31 | * GRID (local input/output) HPL_T_grid * 32 | * On entry, GRID points to the data structure containing the 33 | * process grid to be released. 34 | * 35 | * --------------------------------------------------------------------- 36 | */ 37 | 38 | int hplerr = MPI_SUCCESS, mpierr; 39 | 40 | if(GRID->all_comm != MPI_COMM_NULL) { 41 | mpierr = MPI_Comm_free(&(GRID->row_comm)); 42 | if(mpierr != MPI_SUCCESS) hplerr = mpierr; 43 | mpierr = MPI_Comm_free(&(GRID->col_comm)); 44 | if(mpierr != MPI_SUCCESS) hplerr = mpierr; 45 | mpierr = MPI_Comm_free(&(GRID->all_comm)); 46 | if(mpierr != MPI_SUCCESS) hplerr = mpierr; 47 | } 48 | 49 | GRID->order = HPL_COLUMN_MAJOR; 50 | 51 | GRID->iam = GRID->myrow = GRID->mycol = -1; 52 | GRID->nprow = GRID->npcol = GRID->nprocs = -1; 53 | 54 | GRID->row_ip2 = GRID->row_hdim = GRID->row_ip2m1 = GRID->row_mask = -1; 55 | GRID->col_ip2 = GRID->col_hdim = GRID->col_ip2m1 = GRID->col_mask = -1; 56 | 57 | return (hplerr); 58 | } 59 | -------------------------------------------------------------------------------- /src/grid/HPL_grid_info.cpp: -------------------------------------------------------------------------------- 1 | /* --------------------------------------------------------------------- 2 | * -- High Performance Computing Linpack Benchmark (HPL) 3 | * HPL - 2.2 - February 24, 2016 4 | * Antoine P. Petitet 5 | * University of Tennessee, Knoxville 6 | * Innovative Computing Laboratory 7 | * (C) Copyright 2000-2008 All Rights Reserved 8 | * 9 | * Modified by: Noel Chalmers 10 | * (C) 2018-2025 Advanced Micro Devices, Inc. 11 | * See the rocHPL/LICENCE file for details. 12 | * 13 | * SPDX-License-Identifier: (BSD-3-Clause) 14 | * --------------------------------------------------------------------- 15 | */ 16 | 17 | #include "hpl.hpp" 18 | 19 | int HPL_grid_info(const HPL_T_grid* GRID, 20 | int* NPROW, 21 | int* NPCOL, 22 | int* MYROW, 23 | int* MYCOL) { 24 | /* 25 | * Purpose 26 | * ======= 27 | * 28 | * HPL_grid_info returns the grid shape and the coordinates in the grid 29 | * of the calling process. Successful completion is indicated by the 30 | * returned error code MPI_SUCCESS. Other error codes depend on the MPI 31 | * implementation. 32 | * 33 | * Arguments 34 | * ========= 35 | * 36 | * GRID (local input) const HPL_T_grid * 37 | * On entry, GRID points to the data structure containing the 38 | * process grid information. 39 | * 40 | * NPROW (global output) int * 41 | * On exit, NPROW specifies the number of process rows in the 42 | * grid. NPROW is at least one. 43 | * 44 | * NPCOL (global output) int * 45 | * On exit, NPCOL specifies the number of process columns in 46 | * the grid. NPCOL is at least one. 
47 |  *
48 |  * MYROW   (global output)               int *
49 |  *         On exit, MYROW specifies my row process coordinate in the
50 |  *         grid. MYROW is greater than or equal to zero and less than
51 |  *         NPROW.
52 |  *
53 |  * MYCOL   (global output)               int *
54 |  *         On exit, MYCOL specifies my column process coordinate in the
55 |  *         grid. MYCOL is greater than or equal to zero and less than
56 |  *         NPCOL.
57 |  *
58 |  * ---------------------------------------------------------------------
59 |  */
60 | 
61 |   *NPROW = GRID->nprow;
62 |   *NPCOL = GRID->npcol;
63 |   *MYROW = GRID->myrow;
64 |   *MYCOL = GRID->mycol;
65 |   return (MPI_SUCCESS);
66 | }
67 | 
--------------------------------------------------------------------------------
/src/matgen/HPL_xjumpm.cpp:
--------------------------------------------------------------------------------
 1 | /* ---------------------------------------------------------------------
 2 |  * -- High Performance Computing Linpack Benchmark (HPL)
 3 |  *    HPL - 2.2 - February 24, 2016
 4 |  *    Antoine P. Petitet
 5 |  *    University of Tennessee, Knoxville
 6 |  *    Innovative Computing Laboratory
 7 |  *    (C) Copyright 2000-2008 All Rights Reserved
 8 |  *
 9 |  * Modified by: Noel Chalmers
10 |  * (C) 2018-2025 Advanced Micro Devices, Inc.
11 |  * See the rocHPL/LICENCE file for details.
12 |  *
13 |  * SPDX-License-Identifier: (BSD-3-Clause)
14 |  * ---------------------------------------------------------------------
15 |  */
16 | 
17 | #include "hpl.hpp"
18 | 
19 | void HPL_xjumpm(const int      JUMPM,
20 |                 const uint64_t MULT,
21 |                 const uint64_t IADD,
22 |                 const uint64_t IRANN,
23 |                 uint64_t&      IRANM,
24 |                 uint64_t&      IAM,
25 |                 uint64_t&      ICM) {
26 |   /*
27 |    * Purpose
28 |    * =======
29 |    *
30 |    * HPL_xjumpm computes the constants A and C to jump JUMPM numbers in
31 |    * the random sequence: X(n+JUMPM) = A*X(n)+C. The constants encoded in
32 |    * MULT and IADD specify how to jump from one entry in the sequence to
33 |    * the next.
34 |    *
35 |    * Arguments
36 |    * =========
37 |    *
38 |    * JUMPM   (local input)                 const int
39 |    *         On entry, JUMPM specifies the number of entries in the
40 |    *         sequence to jump over. When JUMPM is less than or equal to
41 |    *         zero, A and C are not computed, and IRANM is set to IRANN,
42 |    *         corresponding to a jump of size zero.
43 |    *
44 |    * MULT    (local input)                 const uint64_t
45 |    *         On entry, MULT contains the constant a used to jump from
46 |    *         X(n) to X(n+1) = a*X(n) + c in the random sequence.
47 |    *
48 |    * IADD    (local input)                 const uint64_t
49 |    *         On entry, IADD contains the constant c used to jump from
50 |    *         X(n) to X(n+1) = a*X(n) + c in the random sequence.
51 |    *
52 |    * IRANN   (local input)                 const uint64_t
53 |    *         On entry, IRANN contains the encoding of X(n).
54 |    *
55 |    * IRANM   (local output)                uint64_t &
56 |    *         On exit, IRANM contains the encoding of X(n+JUMPM).
57 |    *
58 |    * IAM     (local output)                uint64_t &
59 |    *         On exit, when JUMPM is greater than zero, IAM contains the
60 |    *         constant A used to jump from X(n) to X(n+JUMPM) in the
61 |    *         random sequence. When JUMPM is less than or equal to zero,
62 |    *         IAM is not referenced.
63 |  *
64 |  * ICM     (local output)                uint64_t &
65 |  *         On exit, when JUMPM is greater than zero, ICM contains the
66 |  *         constant C used to jump from X(n) to X(n+JUMPM) in the
67 |  *         random sequence. When JUMPM is less than or equal to zero,
68 |  *         ICM is not referenced.
69 |  *
70 |  * ---------------------------------------------------------------------
71 |  */
72 |   /*
73 |    * With X(n+1) = a*X(n) + c, jumping m = JUMPM steps gives
74 |    * X(n+m) = a^m * X(n) + c*(a^(m-1) + ... + a + 1). The loop below
75 |    * accumulates A = a^m and C = c*(a^(m-1) + ... + a + 1), with all
76 |    * arithmetic performed modulo 2^64 through uint64_t wrap-around.
77 |    */
78 |   if(JUMPM > 0) {
79 |     IAM = MULT;
80 |     ICM = IADD;
81 |     for(int k = 1; k <= JUMPM - 1; k++) {
82 |       IAM *= MULT;
83 |       ICM = ICM * MULT + IADD;
84 |     }
85 |     IRANM = IRANN * IAM + ICM;
86 |   } else {
87 |     IRANM = IRANN;
88 |   }
89 | }
90 | 
--------------------------------------------------------------------------------
/src/panel/HPL_pdpanel_bcast.cpp:
--------------------------------------------------------------------------------
 1 | /* ---------------------------------------------------------------------
 2 |  * -- High Performance Computing Linpack Benchmark (HPL)
 3 |  *    HPL - 2.2 - February 24, 2016
 4 |  *    Antoine P. Petitet
 5 |  *    University of Tennessee, Knoxville
 6 |  *    Innovative Computing Laboratory
 7 |  *    (C) Copyright 2000-2008 All Rights Reserved
 8 |  *
 9 |  * Modified by: Noel Chalmers
10 |  * (C) 2018-2025 Advanced Micro Devices, Inc.
11 |  * See the rocHPL/LICENCE file for details.
12 |  *
13 |  * SPDX-License-Identifier: (BSD-3-Clause)
14 |  * ---------------------------------------------------------------------
15 |  */
16 | 
17 | #include "hpl.hpp"
18 | 
19 | int HPL_pdpanel_bcast(HPL_T_panel* PANEL) {
20 |   /*
21 |    * Purpose
22 |    * =======
23 |    *
24 |    * HPL_pdpanel_bcast broadcasts the current panel. Successful completion
25 |    * is indicated by a return code of HPL_SUCCESS.
26 |    *
27 |    * Arguments
28 |    * =========
29 |    *
30 |    * PANEL   (input/output)                HPL_T_panel *
31 |    *         On entry, PANEL points to the current panel data structure
32 |    *         being broadcast.
33 |    *
34 |    * ---------------------------------------------------------------------
35 |    */
36 | 
37 |   if(PANEL == NULL) { return HPL_SUCCESS; }
38 |   if(PANEL->grid->npcol <= 1) { return HPL_SUCCESS; }
39 | 
40 |   MPI_Comm comm = PANEL->grid->row_comm;
41 |   int      root = PANEL->pcol;
42 | 
43 | #ifdef HPL_DETAILED_TIMING
44 |   HPL_ptimer(HPL_TIMING_LBCAST);
45 | #endif
46 |   /*
47 |    * Single Bcast call
48 |    */
49 |   int err = HPL_bcast(PANEL->A0, PANEL->len, root, comm, PANEL->algo->btopo);
50 | 
51 | #ifdef HPL_DETAILED_TIMING
52 |   HPL_ptimer(HPL_TIMING_LBCAST);
53 | #endif
54 | 
55 |   if(PANEL->grid->mycol != root) {
56 |     // retrieve some host-side pivoting info from bcast message
57 |     int* dipA  = PANEL->dipiv + 4 * PANEL->jb;
58 |     int* ipA   = PANEL->ipiv + 5 * PANEL->jb;
59 |     int  nprow = PANEL->grid->nprow;
60 | 
61 |     CHECK_HIP_ERROR(hipMemcpyAsync(ipA,
62 |                                    dipA,
63 |                                    (1 + nprow + 1) * sizeof(int),
64 |                                    hipMemcpyDeviceToHost,
65 |                                    dataStream));
66 |     CHECK_HIP_ERROR(hipStreamSynchronize(dataStream));
67 |   }
68 | 
69 |   return err;
70 | }
71 | 
--------------------------------------------------------------------------------
/src/panel/HPL_pdpanel_free.cpp:
--------------------------------------------------------------------------------
 1 | /* ---------------------------------------------------------------------
 2 |  * -- High Performance Computing Linpack Benchmark (HPL)
 3 |  *    HPL - 2.2 - February 24, 2016
 4 |  *    Antoine P.
Petitet
 5 |  *    University of Tennessee, Knoxville
 6 |  *    Innovative Computing Laboratory
 7 |  *    (C) Copyright 2000-2008 All Rights Reserved
 8 |  *
 9 |  * Modified by: Noel Chalmers
10 |  * (C) 2018-2025 Advanced Micro Devices, Inc.
11 |  * See the rocHPL/LICENCE file for details.
12 |  *
13 |  * SPDX-License-Identifier: (BSD-3-Clause)
14 |  * ---------------------------------------------------------------------
15 |  */
16 | #include "hpl.hpp"
17 | 
18 | int HPL_pdpanel_free(HPL_T_panel* PANEL) {
19 |   /*
20 |    * Purpose
21 |    * =======
22 |    *
23 |    * HPL_pdpanel_free deallocates the device and host workspace buffers
24 |    * attached to the panel data structure.
25 |    *
26 |    * Arguments
27 |    * =========
28 |    *
29 |    * PANEL   (local input/output)          HPL_T_panel *
30 |    *         On entry, PANEL points to the panel data structure from
31 |    *         which the resources should be deallocated.
32 |    *
33 |    * ---------------------------------------------------------------------
34 |    */
35 | 
36 |   if(PANEL->IWORK) {
37 |     CHECK_HIP_ERROR(hipHostFree(PANEL->IWORK));
38 |     PANEL->IWORK = nullptr;
39 |   }
40 |   if(PANEL->U2) {
41 |     CHECK_HIP_ERROR(hipFree(PANEL->U2));
42 |     PANEL->U2 = nullptr;
43 |   }
44 |   if(PANEL->U1) {
45 |     CHECK_HIP_ERROR(hipFree(PANEL->U1));
46 |     PANEL->U1 = nullptr;
47 |   }
48 |   if(PANEL->U0) {
49 |     CHECK_HIP_ERROR(hipFree(PANEL->U0));
50 |     PANEL->U0 = nullptr;
51 |   }
52 |   if(PANEL->A0) {
53 |     CHECK_HIP_ERROR(hipFree(PANEL->A0));
54 |     PANEL->A0 = nullptr;
55 |   }
56 | 
57 |   return (HPL_SUCCESS);
58 | }
59 | 
--------------------------------------------------------------------------------
/src/panel/HPL_pdpanel_swapids.cpp:
--------------------------------------------------------------------------------
 1 | 
 2 | /* ---------------------------------------------------------------------
 3 |  * -- High Performance Computing Linpack Benchmark (HPL)
 4 |  *    Noel Chalmers
 5 |  *    (C) 2018-2025 Advanced Micro Devices, Inc.
 6 |  *    See the rocHPL/LICENCE file for details.
 7 |  *
 8 |  *    SPDX-License-Identifier: (BSD-3-Clause)
 9 |  * ---------------------------------------------------------------------
10 |  */
11 | #include "hpl.hpp"
12 | 
13 | /* Assemble the host-side row-swap index arrays for the current panel,
14 |    then copy them to the device. */
15 | void HPL_pdpanel_swapids(HPL_T_panel* PANEL) {
16 |   int jb, i;
17 | 
18 |   jb = PANEL->jb;
19 | 
20 |   int nprow = PANEL->grid->nprow;
21 | 
22 |   if(jb <= 0) return;
23 | 
24 |   if(nprow == 1) {
25 |     // unroll pivoting
26 |     int* ipiv  = PANEL->ipiv;
27 |     int* permU = PANEL->ipiv + jb;
28 |     int* ipl   = permU + 2 * jb;
29 |     int* ipID  = ipl + 1;
30 | 
31 |     for(i = 0; i < jb; i++) permU[i + jb] = -1;
32 | 
33 |     HPL_pipid(PANEL, ipl, ipID);
34 | 
35 |     for(i = 0; i < *ipl; i += 2) {
36 |       int src = ipID[i] - PANEL->ia;
37 |       int dst = ipID[i + 1] - PANEL->ia;
38 |       if(dst < jb) {
39 |         permU[dst] = src;
40 |       } else {
41 |         permU[src + jb] = dst;
42 |       }
43 |     }
44 | 
45 |     int* dpermU = PANEL->dipiv;
46 | 
47 |     // send pivoting ids to device
48 |     CHECK_HIP_ERROR(hipMemcpyAsync(dpermU,
49 |                                    permU,
50 |                                    2 * jb * sizeof(int),
51 |                                    hipMemcpyHostToDevice,
52 |                                    dataStream));
53 |     CHECK_HIP_ERROR(hipStreamSynchronize(dataStream));
54 | 
55 |   } else {
56 | 
57 |     int* permU   = PANEL->ipiv + jb;
58 |     int* lindxU  = permU + jb;
59 |     int* lindxA  = lindxU + jb;
60 |     int* lindxAU = lindxA + jb;
61 |     int* ipA     = lindxAU + jb;
62 |     int* iplen   = ipA + 1;
63 | 
64 |     int* ipl   = iplen + nprow + 1;
65 |     int* ipID  = ipl + 1;
66 |     int* iwork = ipID + 4 * jb;
67 | 
68 |     HPL_pipid(PANEL, ipl, ipID);
69 |     HPL_plindx(
70 |         PANEL, *ipl, ipID, ipA, lindxU, lindxAU, lindxA, iplen, permU, iwork);
71 | 
72 |     int* dpermU = PANEL->dipiv;
73 | 
74 |     // send pivoting ids to device
75 |     CHECK_HIP_ERROR(hipMemcpyAsync(dpermU,
76 |                                    permU,
77 |                                    (4 * jb + 1 + nprow + 1) * sizeof(int),
78 |                                    hipMemcpyHostToDevice,
79 |                                    dataStream));
80 |     CHECK_HIP_ERROR(hipStreamSynchronize(dataStream));
81 |   }
82 | }
83 | 
--------------------------------------------------------------------------------
/src/pauxil/HPL_dlaswp00N_device.cpp:
--------------------------------------------------------------------------------
 1 | /* ---------------------------------------------------------------------
 2 |  * -- High Performance Computing Linpack Benchmark (HPL)
 3 |  *    HPL - 2.2 - February 24, 2016
 4 |  *    Antoine P. Petitet
 5 |  *    University of Tennessee, Knoxville
 6 |  *    Innovative Computing Laboratory
 7 |  *    (C) Copyright 2000-2008 All Rights Reserved
 8 |  *
 9 |  * Modified by: Noel Chalmers
10 |  * (C) 2018-2025 Advanced Micro Devices, Inc.
11 |  * See the rocHPL/LICENCE file for details.
12 |  *
13 |  * SPDX-License-Identifier: (BSD-3-Clause)
14 |  * ---------------------------------------------------------------------
15 |  */
16 | 
17 | #include "hpl.hpp"
18 | #include <hip/hip_runtime.h>
19 | 
20 | #define BLOCK_SIZE 512
21 | 
22 | __global__ void dlaswp00N(const int N,
23 |                           const int M,
24 |                           double* __restrict__ A,
25 |                           const int LDA,
26 |                           const int* __restrict__ IPIV) {
27 | 
28 |   __shared__ double s_An_init[2048];
29 |   __shared__ double s_An_ipiv[2048];
30 | 
31 |   const int m = threadIdx.x;
32 |   const int n = blockIdx.x;
33 | 
34 |   // read in block column
35 |   for(int i = m; i < M; i += blockDim.x)
36 |     s_An_init[i] = A[i + n * ((size_t)LDA)];
37 | 
38 |   __syncthreads();
39 | 
40 |   // local block
41 |   for(int i = m; i < M; i += blockDim.x) {
42 |     const int ip = IPIV[i];
43 | 
44 |     if(ip < M) { // local swap
45 |       s_An_ipiv[i] = s_An_init[ip];
46 |     } else { // non local swap
47 |       s_An_ipiv[i] = A[ip + n * ((size_t)LDA)];
48 |     }
49 |   }
50 |   __syncthreads();
51 | 
52 |   // write out local block
53 |   for(int i = m; i < M; i += blockDim.x)
54 |     A[i + n * ((size_t)LDA)] = s_An_ipiv[i];
55 | 
56 |   // remaining swaps in column
57 |   for(int i = m; i < M; i += blockDim.x) {
58 |     const int ip_ex = IPIV[i + M];
59 | 
60 |     if(ip_ex > -1) { A[ip_ex + n * ((size_t)LDA)] = s_An_init[i]; }
61 |   }
62 | }
63 | 
64 | void HPL_dlaswp00N(const int  M,
65 |                    const int  N,
66 |                    double*    A,
67 |                    const int  LDA,
68 |                    const int* IPIV) {
69 |   /*
70 |    * Purpose
71 |    * =======
72 |    *
73 |    * HPL_dlaswp00N performs a series of local row interchanges on a matrix
74 |    * A. One row interchange is initiated for rows 0 through M-1 of A.
75 |    *
76 |    * Arguments
77 |    * =========
78 |    *
79 |    * M       (local input)                 const int
80 |    *         On entry, M specifies the number of rows of the array A to be
81 |    *         interchanged. M must be at least zero.
82 |    *
83 |    * N       (local input)                 const int
84 |    *         On entry, N specifies the number of columns of the array A.
85 |    *         N must be at least zero.
86 |    *
87 |    * A       (local input/output)          double *
88 |    *         On entry, A points to an array of dimension (LDA,N) to which
89 |    *         the row interchanges will be applied. On exit, the permuted
90 |    *         matrix.
91 |    *
92 |    * LDA     (local input)                 const int
93 |    *         On entry, LDA specifies the leading dimension of the array A.
94 |    *         LDA must be at least MAX(1,M).
95 |    *
96 |    * IPIV    (local input)                 const int *
97 |    *         On entry, IPIV is an array of size M that contains the
98 |    *         pivoting information. For k in [0..M), IPIV[k]=IROFF + l
99 |    *         implies that local rows k and l are to be interchanged.
100 |    *
101 |    * ---------------------------------------------------------------------
102 |    */
103 | 
104 |   if((M <= 0) || (N <= 0)) return;
105 | 
106 |   hipStream_t stream;
107 |   CHECK_ROCBLAS_ERROR(rocblas_get_stream(handle, &stream));
108 | 
109 |   int grid_size = N;
110 |   dlaswp00N<<<grid_size, BLOCK_SIZE, 0, stream>>>(N, M, A, LDA, IPIV);
111 |   CHECK_HIP_ERROR(hipGetLastError());
112 | }
113 | 
--------------------------------------------------------------------------------
/src/pauxil/HPL_dlaswp01T_device.cpp:
--------------------------------------------------------------------------------
 1 | /* ---------------------------------------------------------------------
 2 |  * -- High Performance Computing Linpack Benchmark (HPL)
 3 |  *    HPL - 2.2 - February 24, 2016
 4 |  *    Antoine P. Petitet
 5 |  *    University of Tennessee, Knoxville
 6 |  *    Innovative Computing Laboratory
 7 |  *    (C) Copyright 2000-2008 All Rights Reserved
 8 |  *
 9 |  * Modified by: Noel Chalmers
10 |  * (C) 2018-2025 Advanced Micro Devices, Inc.
11 |  * See the rocHPL/LICENCE file for details.
12 |  *
13 |  * SPDX-License-Identifier: (BSD-3-Clause)
14 |  * ---------------------------------------------------------------------
15 |  */
16 | 
17 | #include "hpl.hpp"
18 | #include <hip/hip_runtime.h>
19 | 
20 | #define TILE_DIM 32
21 | #define BLOCK_ROWS 8
22 | 
23 | /* Build U matrix from rows of A */
24 | __global__ void dlaswp01T(const int M,
25 |                           const int N,
26 |                           double* __restrict__ A,
27 |                           const int LDA,
28 |                           double* __restrict__ U,
29 |                           const int LDU,
30 |                           const int* __restrict__ LINDXU) {
31 | 
32 |   __shared__ double s_U[TILE_DIM][TILE_DIM + 1];
33 | 
34 |   const int m = threadIdx.x + TILE_DIM * blockIdx.x;
35 |   const int n = threadIdx.y + TILE_DIM * blockIdx.y;
36 | 
37 |   if(m < M) {
38 |     const int ipa = LINDXU[m];
39 | 
40 |     // save in LDS for the moment
41 |     // possible cache-hits if ipas are close
42 |     s_U[threadIdx.x][threadIdx.y + 0] =
43 |         (n + 0 < N) ? A[ipa + (n + 0) * ((size_t)LDA)] : 0.0;
44 |     s_U[threadIdx.x][threadIdx.y + 8] =
45 |         (n + 8 < N) ? A[ipa + (n + 8) * ((size_t)LDA)] : 0.0;
46 |     s_U[threadIdx.x][threadIdx.y + 16] =
47 |         (n + 16 < N) ? A[ipa + (n + 16) * ((size_t)LDA)] : 0.0;
48 |     s_U[threadIdx.x][threadIdx.y + 24] =
49 |         (n + 24 < N) ? A[ipa + (n + 24) * ((size_t)LDA)] : 0.0;
50 |   }
51 | 
52 |   __syncthreads();
53 | 
54 |   const int um = threadIdx.y + TILE_DIM * blockIdx.x;
55 |   const int un = threadIdx.x + TILE_DIM * blockIdx.y;
56 | 
57 |   if(un < N) {
58 |     // write out chunks of U
59 |     if((um + 0) < M)
60 |       U[un + (um + 0) * ((size_t)LDU)] = s_U[threadIdx.y + 0][threadIdx.x];
61 |     if((um + 8) < M)
62 |       U[un + (um + 8) * ((size_t)LDU)] = s_U[threadIdx.y + 8][threadIdx.x];
63 |     if((um + 16) < M)
64 |       U[un + (um + 16) * ((size_t)LDU)] = s_U[threadIdx.y + 16][threadIdx.x];
65 |     if((um + 24) < M)
66 |       U[un + (um + 24) * ((size_t)LDU)] = s_U[threadIdx.y + 24][threadIdx.x];
67 |   }
68 | }
69 | 
70 | void HPL_dlaswp01T(const int  M,
71 |                    const int  N,
72 |                    double*    A,
73 |                    const int  LDA,
74 |                    double*    U,
75 |                    const int  LDU,
76 |                    const int* LINDXU) {
77 |   /*
78 |    * Purpose
79 |    * =======
80 |    *
81 |    * HPL_dlaswp01T copies scattered rows of A into an array U. The
82 |    * row offsets in A of the source rows are specified by LINDXU.
83 |    * Rows of A are stored as columns in U.
84 |    *
85 |    * Arguments
86 |    * =========
87 |    *
88 |    * M       (local input)                 const int
89 |    *         On entry, M specifies the number of rows of A that should be
90 |    *         copied into U. M must be at least zero.
91 |    *
92 |    * N       (local input)                 const int
93 |    *         On entry, N specifies the length of rows of A that should be
94 |    *         copied into U. N must be at least zero.
95 |    *
96 |    * A       (local input)                 double *
97 |    *         On entry, A points to an array of dimension (LDA,N). The rows
98 |    *         of this array specified by LINDXU are copied into U.
99 |    *
100 |    * LDA     (local input)                 const int
101 |    *         On entry, LDA specifies the leading dimension of the array A.
102 |    *         LDA must be at least MAX(1,M).
103 |    *
104 |    * U       (local output)                double *
105 |    *         On entry, U points to an array of dimension (LDU,M). On exit,
106 |    *         it contains the rows of A specified by LINDXU, stored as
107 |    *         columns.
108 |    *
109 |    * LDU     (local input)                 const int
110 |    *         On entry, LDU specifies the leading dimension of the array U.
111 |    *         LDU must be at least MAX(1,N).
112 |    *
113 |    * LINDXU  (local input)                 const int *
114 |    *         On entry, LINDXU is an array of dimension M that contains the
115 |    *         local row indexes of A that should be copied into U.
116 |    *
117 |    * ---------------------------------------------------------------------
118 |    */
119 |   /*
120 |    * .. Local Variables ..
121 |    */
122 | 
123 |   if((M <= 0) || (N <= 0)) return;
124 | 
125 |   dim3 grid_size((M + TILE_DIM - 1) / TILE_DIM, (N + TILE_DIM - 1) / TILE_DIM);
126 |   dim3 block_size(TILE_DIM, BLOCK_ROWS);
127 |   dlaswp01T<<<grid_size, block_size, 0, computeStream>>>(
128 |       M, N, A, LDA, U, LDU, LINDXU);
129 |   CHECK_HIP_ERROR(hipGetLastError());
130 | 
131 |   /*
132 |    * End of HPL_dlaswp01T
133 |    */
134 | }
135 | 
--------------------------------------------------------------------------------
/src/pauxil/HPL_dlaswp02T_device.cpp:
--------------------------------------------------------------------------------
 1 | /* ---------------------------------------------------------------------
 2 |  * -- High Performance Computing Linpack Benchmark (HPL)
 3 |  *    HPL - 2.2 - February 24, 2016
 4 |  *    Antoine P. Petitet
 5 |  *    University of Tennessee, Knoxville
 6 |  *    Innovative Computing Laboratory
 7 |  *    (C) Copyright 2000-2008 All Rights Reserved
 8 |  *
 9 |  * Modified by: Noel Chalmers
10 |  * (C) 2018-2025 Advanced Micro Devices, Inc.
11 |  * See the rocHPL/LICENCE file for details.
12 |  *
13 |  * SPDX-License-Identifier: (BSD-3-Clause)
14 |  * ---------------------------------------------------------------------
15 |  */
16 | 
17 | #include "hpl.hpp"
18 | #include <hip/hip_runtime.h>
19 | #include <cassert>
20 | 
21 | #define assertm(exp, msg) assert(((void)msg, exp))
22 | 
23 | /* Perform any local row swaps of A */
24 | __global__ void dlaswp02T(const int M,
25 |                           const int N,
26 |                           double* __restrict__ A,
27 |                           const int LDA,
28 |                           const int* __restrict__ LINDXAU,
29 |                           const int* __restrict__ LINDXA) {
30 | 
31 |   const int n = blockIdx.x;
32 |   const int m = threadIdx.x;
33 | 
34 |   const int ipau = LINDXAU[m]; // src row
35 |   const int ipa  = LINDXA[m];  // dst row
36 | 
37 |   const double An = A[ipau + n * ((size_t)LDA)];
38 | 
39 |   __syncthreads();
40 | 
41 |   A[ipa + n * ((size_t)LDA)] = An;
42 | }
43 | 
44 | void HPL_dlaswp02T(const int  M,
45 |                    const int  N,
46 |                    double*    A,
47 |                    const int  LDA,
48 |                    const int* LINDXAU,
49 |                    const int* LINDXA) {
50 |   /*
51 |    * Purpose
52 |    * =======
53 |    *
54 |    * HPL_dlaswp02T copies scattered rows of A into itself. The row
55 |    * offsets in A of the source rows are specified by LINDXAU, and the
56 |    * row offsets of their destinations by LINDXA.
57 |    *
58 |    * Arguments
59 |    * =========
60 |    *
61 |    * M       (local input)                 const int
62 |    *         On entry, M specifies the number of rows of A that should be
63 |    *         moved within A. M must be at least zero.
64 |    *
65 |    * N       (local input)                 const int
66 |    *         On entry, N specifies the length of rows of A that should be
67 |    *         moved within A. N must be at least zero.
68 |    *
69 |    * A       (local input/output)          double *
70 |    *         On entry, A points to an array of dimension (LDA,N). The rows
71 |    *         of this array specified by LINDXAU are moved within A.
72 |    *
73 |    * LDA     (local input)                 const int
74 |    *         On entry, LDA specifies the leading dimension of the array A.
75 |    *         LDA must be at least MAX(1,M).
76 |    *
77 |    * LINDXAU (local input)                 const int *
78 |    *         On entry, LINDXAU is an array of dimension M that contains the
79 |    *         local source row indexes of A that should be moved within A.
80 |    *
81 |    * LINDXA  (local input)                 const int *
82 |    *         On entry, LINDXA is an array of dimension M that contains the
83 |    *         local row indexes of A to which the rows should be copied.
84 |    *
85 |    * ---------------------------------------------------------------------
86 |    */
87 |   /*
88 |    * .. Local Variables ..
89 |    */
90 | 
91 |   if((M <= 0) || (N <= 0)) return;
92 | 
93 |   assertm(M <= 1024, "NB too large in HPL_dlaswp02T");
94 | 
95 |   dim3 grid_size(N);
96 |   dim3 block_size(M);
97 |   dlaswp02T<<<grid_size, block_size, 0, computeStream>>>(
98 |       M, N, A, LDA, LINDXAU, LINDXA);
99 |   CHECK_HIP_ERROR(hipGetLastError());
100 |   /*
101 |    * End of HPL_dlaswp02T
102 |    */
103 | }
104 | 
--------------------------------------------------------------------------------
/src/pauxil/HPL_dlaswp03T_device.cpp:
--------------------------------------------------------------------------------
 1 | /* ---------------------------------------------------------------------
 2 |  * -- High Performance Computing Linpack Benchmark (HPL)
 3 |  *    HPL - 2.2 - February 24, 2016
 4 |  *    Antoine P. Petitet
 5 |  *    University of Tennessee, Knoxville
 6 |  *    Innovative Computing Laboratory
 7 |  *    (C) Copyright 2000-2008 All Rights Reserved
 8 |  *
 9 |  * Modified by: Noel Chalmers
10 |  * (C) 2018-2025 Advanced Micro Devices, Inc.
11 |  * See the rocHPL/LICENCE file for details.
12 |  *
13 |  * SPDX-License-Identifier: (BSD-3-Clause)
14 |  * ---------------------------------------------------------------------
15 |  */
16 | 
17 | #include "hpl.hpp"
18 | #include <hip/hip_runtime.h>
19 | 
20 | #define TILE_DIM 32
21 | #define BLOCK_ROWS 8
22 | 
23 | /* Build W matrix from rows of A */
24 | __global__ void dlaswp03T(const int M,
25 |                           const int N,
26 |                           double* __restrict__ A,
27 |                           const int LDA,
28 |                           double* __restrict__ W,
29 |                           const int LDW,
30 |                           const int* __restrict__ LINDXU) {
31 | 
32 |   __shared__ double s_W[TILE_DIM][TILE_DIM + 1];
33 | 
34 |   const int m = threadIdx.x + TILE_DIM * blockIdx.x;
35 |   const int n = threadIdx.y + TILE_DIM * blockIdx.y;
36 | 
37 |   if(m < M) {
38 |     const int ipa = LINDXU[m];
39 | 
40 |     // save in LDS for the moment
41 |     // possible cache-hits if ipas are close
42 |     s_W[threadIdx.x][threadIdx.y + 0] =
43 |         (n + 0 < N) ? A[ipa + (n + 0) * ((size_t)LDA)] : 0.0;
44 |     s_W[threadIdx.x][threadIdx.y + 8] =
45 |         (n + 8 < N) ? A[ipa + (n + 8) * ((size_t)LDA)] : 0.0;
46 |     s_W[threadIdx.x][threadIdx.y + 16] =
47 |         (n + 16 < N) ? A[ipa + (n + 16) * ((size_t)LDA)] : 0.0;
48 |     s_W[threadIdx.x][threadIdx.y + 24] =
49 |         (n + 24 < N) ? A[ipa + (n + 24) * ((size_t)LDA)] : 0.0;
50 |   }
51 | 
52 |   __syncthreads();
53 | 
54 |   const int wm = threadIdx.y + TILE_DIM * blockIdx.x;
55 |   const int wn = threadIdx.x + TILE_DIM * blockIdx.y;
56 | 
57 |   if(wn < N) {
58 |     // write out chunks of W
59 |     if((wm + 0) < M)
60 |       W[wn + (wm + 0) * ((size_t)LDW)] = s_W[threadIdx.y + 0][threadIdx.x];
61 |     if((wm + 8) < M)
62 |       W[wn + (wm + 8) * ((size_t)LDW)] = s_W[threadIdx.y + 8][threadIdx.x];
63 |     if((wm + 16) < M)
64 |       W[wn + (wm + 16) * ((size_t)LDW)] = s_W[threadIdx.y + 16][threadIdx.x];
65 |     if((wm + 24) < M)
66 |       W[wn + (wm + 24) * ((size_t)LDW)] = s_W[threadIdx.y + 24][threadIdx.x];
67 |   }
68 | }
69 | 
70 | void HPL_dlaswp03T(const int  M,
71 |                    const int  N,
72 |                    double*    A,
73 |                    const int  LDA,
74 |                    double*    W,
75 |                    const int  LDW,
76 |                    const int* LINDXU) {
77 |   /*
78 |    * Purpose
79 |    * =======
80 |    *
81 |    * HPL_dlaswp03T packs scattered rows of an array A into workspace W.
82 |    * The row offsets in A are specified by LINDXU.
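   *
   * In effect (an editor's sketch of the data movement, not original
   * HPL text): for each m in [0..M), column m of W receives row
   * LINDXU[m] of A, i.e.
   *
   *   W[n + m*LDW] = A[LINDXU[m] + n*LDA]   for n in [0..N),
   *
   * a gather plus transpose, staged through 32x32 LDS tiles so that
   * both the row reads of A and the column writes of W stay coalesced.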
83 |    *
84 |    * Arguments
85 |    * =========
86 |    *
87 |    * M       (local input)                 const int
88 |    *         On entry, M specifies the number of rows of A that should be
89 |    *         copied into columns of W. M must be at least zero.
90 |    *
91 |    * N       (local input)                 const int
92 |    *         On entry, N specifies the length of the rows of A that should
93 |    *         be copied into W. N must be at least zero.
94 |    *
95 |    * A       (local input)                 double *
96 |    *         On entry, A points to an array of dimension (LDA,N) whose
97 |    *         rows specified by LINDXU are copied into W.
98 |    *
99 |    * LDA     (local input)                 const int
100 |    *         On entry, LDA specifies the leading dimension of the array A.
101 |    *         LDA must be at least MAX(1,M).
102 |    *
103 |    * W       (local output)                double *
104 |    *         On entry, W points to an array of dimension (LDW,*). On exit,
105 |    *         its columns contain the selected rows of A.
106 |    *
107 |    * LDW     (local input)                 const int
108 |    *         On entry, LDW specifies the leading dimension of the array W.
109 |    *         LDW must be at least MAX(1,N).
110 |    *
111 |    * LINDXU  (local input)                 const int *
112 |    *         On entry, LINDXU is an array of dimension M that contains the
113 |    *         local row indexes of A that should be copied into W.
114 |    *
115 |    * ---------------------------------------------------------------------
116 |    */
117 |   /*
118 |    * .. Local Variables ..
119 |    */
120 | 
121 |   if((M <= 0) || (N <= 0)) return;
122 | 
123 |   dim3 grid_size((M + TILE_DIM - 1) / TILE_DIM, (N + TILE_DIM - 1) / TILE_DIM);
124 |   dim3 block_size(TILE_DIM, BLOCK_ROWS);
125 |   dlaswp03T<<<grid_size, block_size, 0, computeStream>>>(
126 |       M, N, A, LDA, W, LDW, LINDXU);
127 |   CHECK_HIP_ERROR(hipGetLastError());
128 |   /*
129 |    * End of HPL_dlaswp03T
130 |    */
131 | }
132 | 
--------------------------------------------------------------------------------
/src/pauxil/HPL_dlaswp04T_device.cpp:
--------------------------------------------------------------------------------
 1 | /* ---------------------------------------------------------------------
 2 |  * -- High Performance Computing Linpack Benchmark (HPL)
 3 |  *    HPL - 2.2 - February 24, 2016
 4 |  *    Antoine P. Petitet
 5 |  *    University of Tennessee, Knoxville
 6 |  *    Innovative Computing Laboratory
 7 |  *    (C) Copyright 2000-2008 All Rights Reserved
 8 |  *
 9 |  * Modified by: Noel Chalmers
10 |  * (C) 2018-2025 Advanced Micro Devices, Inc.
11 |  * See the rocHPL/LICENCE file for details.
12 |  *
13 |  * SPDX-License-Identifier: (BSD-3-Clause)
14 |  * ---------------------------------------------------------------------
15 |  */
16 | 
17 | #include "hpl.hpp"
18 | #include <hip/hip_runtime.h>
19 | 
20 | #define TILE_DIM 32
21 | #define BLOCK_ROWS 8
22 | 
23 | static __global__ void dlaswp04T(const int M,
24 |                                  const int N,
25 |                                  double* __restrict__ A,
26 |                                  const int LDA,
27 |                                  double* __restrict__ W,
28 |                                  const int LDW,
29 |                                  const int* __restrict__ LINDXU) {
30 | 
31 |   __shared__ double s_W[TILE_DIM][TILE_DIM + 1];
32 | 
33 |   const int am = threadIdx.x + TILE_DIM * blockIdx.x;
34 |   const int an = threadIdx.y + TILE_DIM * blockIdx.y;
35 | 
36 |   const int wm = threadIdx.y + TILE_DIM * blockIdx.x;
37 |   const int wn = threadIdx.x + TILE_DIM * blockIdx.y;
38 | 
39 |   if(wn < N) {
40 |     s_W[threadIdx.y + 0][threadIdx.x] =
41 |         (wm + 0 < M) ? W[wn + (wm + 0) * ((size_t)LDW)] : 0.0;
42 |     s_W[threadIdx.y + 8][threadIdx.x] =
43 |         (wm + 8 < M) ? W[wn + (wm + 8) * ((size_t)LDW)] : 0.0;
44 |     s_W[threadIdx.y + 16][threadIdx.x] =
45 |         (wm + 16 < M) ? W[wn + (wm + 16) * ((size_t)LDW)] : 0.0;
46 |     s_W[threadIdx.y + 24][threadIdx.x] =
47 |         (wm + 24 < M) ?
W[wn + (wm + 24) * ((size_t)LDW)] : 0.0;
48 |   }
49 | 
50 |   __syncthreads();
51 | 
52 |   if(am < M) {
53 |     const int aip = LINDXU[am];
54 |     if((an + 0) < N)
55 |       A[aip + (an + 0) * ((size_t)LDA)] = s_W[threadIdx.x][threadIdx.y + 0];
56 |     if((an + 8) < N)
57 |       A[aip + (an + 8) * ((size_t)LDA)] = s_W[threadIdx.x][threadIdx.y + 8];
58 |     if((an + 16) < N)
59 |       A[aip + (an + 16) * ((size_t)LDA)] = s_W[threadIdx.x][threadIdx.y + 16];
60 |     if((an + 24) < N)
61 |       A[aip + (an + 24) * ((size_t)LDA)] = s_W[threadIdx.x][threadIdx.y + 24];
62 |   }
63 | }
64 | 
65 | void HPL_dlaswp04T(const int  M,
66 |                    const int  N,
67 |                    double*    A,
68 |                    const int  LDA,
69 |                    double*    W,
70 |                    const int  LDW,
71 |                    const int* LINDXU) {
72 |   /*
73 |    * Purpose
74 |    * =======
75 |    *
76 |    * HPL_dlaswp04T writes columns of W into rows of A at positions
77 |    * indicated by LINDXU.
78 |    *
79 |    * Arguments
80 |    * =========
81 |    *
82 |    * M       (local input)                 const int
83 |    *         On entry, M specifies the number of rows of A that should be
84 |    *         replaced with columns of W. M must be at least zero.
85 |    *
86 |    * N       (local input)                 const int
87 |    *         On entry, N specifies the length of the rows of A that should
88 |    *         be replaced with columns of W. N must be at least zero.
89 |    *
90 |    * A       (local output)                double *
91 |    *         On entry, A points to an array of dimension (LDA,N). On exit,
92 |    *         the rows of this array specified by LINDXU are replaced by
93 |    *         columns of W.
94 |    *
95 |    * LDA     (local input)                 const int
96 |    *         On entry, LDA specifies the leading dimension of the array A.
97 |    *         LDA must be at least MAX(1,M).
98 |    *
99 |    * W       (local input)                 double *
100 |    *         On entry, W points to an array of dimension (LDW,*). This
101 |    *         array contains the columns of W that are to be written to
102 |    *         rows of A.
103 |    *
104 |    * LDW     (local input)                 const int
105 |    *         On entry, LDW specifies the leading dimension of the array W.
106 |    *         LDW must be at least MAX(1,N).
107 |    *
108 |    * LINDXU  (local input)                 const int *
109 |    *         On entry, LINDXU is an array of dimension M that contains the
110 |    *         local row indexes of A that should be replaced with W.
111 |    *
112 |    * ---------------------------------------------------------------------
113 |    */
114 |   /*
115 |    * .. Local Variables ..
116 |    */
117 | 
118 |   if((M <= 0) || (N <= 0)) return;
119 | 
120 |   dim3 grid_size((M + TILE_DIM - 1) / TILE_DIM, (N + TILE_DIM - 1) / TILE_DIM);
121 |   dim3 block_size(TILE_DIM, BLOCK_ROWS);
122 |   dlaswp04T<<<grid_size, block_size, 0, computeStream>>>(
123 |       M, N, A, LDA, W, LDW, LINDXU);
124 |   CHECK_HIP_ERROR(hipGetLastError());
125 |   /*
126 |    * End of HPL_dlaswp04T
127 |    */
128 | }
129 | 
--------------------------------------------------------------------------------
/src/pauxil/HPL_dlaswp10N_device.cpp:
--------------------------------------------------------------------------------
 1 | /* ---------------------------------------------------------------------
 2 |  * -- High Performance Computing Linpack Benchmark (HPL)
 3 |  *    HPL - 2.2 - February 24, 2016
 4 |  *    Antoine P. Petitet
 5 |  *    University of Tennessee, Knoxville
 6 |  *    Innovative Computing Laboratory
 7 |  *    (C) Copyright 2000-2008 All Rights Reserved
 8 |  *
 9 |  * Modified by: Noel Chalmers
10 |  * (C) 2018-2025 Advanced Micro Devices, Inc.
11 |  * See the rocHPL/LICENCE file for details.
12 |  *
13 |  * SPDX-License-Identifier: (BSD-3-Clause)
14 |  * ---------------------------------------------------------------------
15 |  */
16 | 
17 | #include "hpl.hpp"
18 | #include <hip/hip_runtime.h>
19 | 
20 | #define BLOCK_SIZE 512
21 | 
22 | __global__ void dlaswp10N(const int M,
23 |                           const int N,
24 |                           double* __restrict__ A,
25 |                           const int LDA,
26 |                           const int* __restrict__ IPIV) {
27 | 
28 |   const int m = threadIdx.x + BLOCK_SIZE * blockIdx.x;
29 | 
30 |   if(m < M) {
31 |     for(int i = 0; i < N; i++) {
32 |       const int ip = IPIV[i];
33 | 
34 |       if(ip != i) {
35 |         // swap
36 |         const double Ai  = A[m + i * ((size_t)LDA)];
37 |         const double Aip = A[m + ip * ((size_t)LDA)];
38 |         A[m + i * ((size_t)LDA)]  = Aip;
39 |         A[m + ip * ((size_t)LDA)] = Ai;
40 |       }
41 |     }
42 |   }
43 | }
44 | 
45 | void HPL_dlaswp10N(const int  M,
46 |                    const int  N,
47 |                    double*    A,
48 |                    const int  LDA,
49 |                    const int* IPIV) {
50 |   /*
51 |    * Purpose
52 |    * =======
53 |    *
54 |    * HPL_dlaswp10N performs a sequence of local column interchanges on a
55 |    * matrix A. One column interchange is initiated for columns 0 through
56 |    * N-1 of A.
57 |    *
58 |    * Arguments
59 |    * =========
60 |    *
61 |    * M       (local input)                 const int
62 |    *         On entry, M specifies the number of rows of the array A. M
63 |    *         must be at least zero.
64 |    *
65 |    * N       (local input)                 const int
66 |    *         On entry, N specifies the number of columns of the array A. N
67 |    *         must be at least zero.
68 |    *
69 |    * A       (local input/output)          double *
70 |    *         On entry, A points to an array of dimension (LDA,N). This
71 |    *         array contains the columns onto which the interchanges should
72 |    *         be applied. On exit, A contains the permuted matrix.
73 |    *
74 |    * LDA     (local input)                 const int
75 |    *         On entry, LDA specifies the leading dimension of the array A.
76 |    *         LDA must be at least MAX(1,M).
77 |    *
78 |    * IPIV    (local input)                 const int *
79 |    *         On entry, IPIV is an array of size N that contains the column
80 |    *         interchange information: for i in [0..N), columns i and
81 |    *         IPIV[i] are interchanged.
82 |    *
83 |    * ---------------------------------------------------------------------
84 |    */
85 | 
86 |   if((M <= 0) || (N <= 0)) return;
87 | 
88 |   hipStream_t stream;
89 |   CHECK_ROCBLAS_ERROR(rocblas_get_stream(handle, &stream));
90 | 
91 |   dim3 grid_size((M + BLOCK_SIZE - 1) / BLOCK_SIZE);
92 |   dlaswp10N<<<grid_size, dim3(BLOCK_SIZE), 0, stream>>>(M, N, A, LDA, IPIV);
93 |   CHECK_HIP_ERROR(hipGetLastError());
94 | }
95 | 
--------------------------------------------------------------------------------
/src/pauxil/HPL_indxg2l.cpp:
--------------------------------------------------------------------------------
 1 | /* ---------------------------------------------------------------------
 2 |  * -- High Performance Computing Linpack Benchmark (HPL)
 3 |  *    HPL - 2.2 - February 24, 2016
 4 |  *    Antoine P. Petitet
 5 |  *    University of Tennessee, Knoxville
 6 |  *    Innovative Computing Laboratory
 7 |  *    (C) Copyright 2000-2008 All Rights Reserved
 8 |  *
 9 |  * Modified by: Noel Chalmers
10 |  * (C) 2018-2025 Advanced Micro Devices, Inc.
11 |  * See the rocHPL/LICENCE file for details.
12 |  *
13 |  * SPDX-License-Identifier: (BSD-3-Clause)
14 |  * ---------------------------------------------------------------------
15 |  */
16 | 
17 | #include "hpl.hpp"
18 | 
19 | int HPL_indxg2l(const int IG,
20 |                 const int INB,
21 |                 const int NB,
22 |                 const int SRCPROC,
23 |                 const int NPROCS) {
24 |   /*
25 |    * Purpose
26 |    * =======
27 |    *
28 |    * HPL_indxg2l computes the local index of a matrix entry pointed to by
29 |    * the global index IG. This local returned index is the same in all
30 |    * processes.
31 |    *
32 |    * Arguments
33 |    * =========
34 |    *
35 |    * IG      (input)                       const int
36 |    *         On entry, IG specifies the global index of the matrix entry.
37 | * IG must be at least zero. 38 | * 39 | * INB (input) const int 40 | * On entry, INB specifies the size of the first block of the 41 | * global matrix. INB must be at least one. 42 | * 43 | * NB (input) const int 44 | * On entry, NB specifies the blocking factor used to partition 45 | * and distribute the matrix. NB must be larger than one. 46 | * 47 | * SRCPROC (input) const int 48 | * On entry, if SRCPROC = -1, the data is not distributed but 49 | * replicated, in which case this routine returns IG in all 50 | * processes. Otherwise, the value of SRCPROC is ignored. 51 | * 52 | * NPROCS (input) const int 53 | * On entry, NPROCS specifies the total number of process rows 54 | * or columns over which the matrix is distributed. NPROCS must 55 | * be at least one. 56 | * 57 | * --------------------------------------------------------------------- 58 | */ 59 | 60 | int i, j; 61 | 62 | if((IG < INB) || (SRCPROC == -1) || (NPROCS == 1)) 63 | /* 64 | * IG belongs to the first block, or the data is not distributed, or 65 | * there is just one process in this dimension of the grid. 66 | */ 67 | return (IG); 68 | /* 69 | * IG = INB - NB + ( l * NPROCS + MYROC ) * NB + X with 0 <= X < NB, 70 | * thus IG is to be found in the block (IG-INB+NB) / NB = l*NPROCS+MYROC 71 | * with 0 <= MYROC < NPROCS. The local index to be returned depends on 72 | * whether IG resides in the process owning the first partial block of 73 | * size INB (MYROC=0). To determine this cheaply, let i = (IG-INB) / NB, 74 | * so that if NPROCS divides i+1, i.e. MYROC=0, we have i+1 = l*NPROCS. 75 | * If we set j = i / NPROCS, it follows that j = l-1. Therefore, i+1 is 76 | * equal to (j+1) * NPROCS. Conversely, if NPROCS does not divide i+1, 77 | * then i+1 = l*NPROCS + MYROC with 1 <= MYROC < NPROCS. It follows that 78 | * j=l and thus (j+1)*NPROCS > i+1. 79 | */ 80 | j = (i = (IG - INB) / NB) / NPROCS; 81 | /* 82 | * When IG resides in the process owning the first partial block of size 83 | * INB (MYROC = 0), then the result IL can be written as: 84 | * IL = INB - NB + l * NB + X = IG + ( l - (l * NPROCS + MYROC) ) * NB. 85 | * Using the above notation, we have i+1 = l*NPROCS + MYROC = l*NPROCS, 86 | * i.e l = ( i+1 ) / NPROCS = j+1, since NPROCS divides i+1, therefore 87 | * IL = IG + ( j + 1 - ( i + 1 ) ) * NB. 88 | * 89 | * Otherwise when MYROC >= 1, the result IL can be written as: 90 | * IL = l * NB + X = IG - INB + ( ( l+1 ) - ( l * NPROCS + MYROC ) )*NB. 91 | * We still have i+1 = l*NPROCS+MYROC. Since NPROCS does not divide i+1, 92 | * we have j = (l*NPROCS+MYROC-1) / NPROCS = l, i.e 93 | * IL = IG - INB + ( j + 1 - ( i + 1 ) ) * NB. 94 | */ 95 | return (NB * (j - i) + ((i + 1 - (j + 1) * NPROCS) ? IG - INB : IG)); 96 | } 97 | -------------------------------------------------------------------------------- /src/pauxil/HPL_indxg2lp.cpp: -------------------------------------------------------------------------------- 1 | /* --------------------------------------------------------------------- 2 | * -- High Performance Computing Linpack Benchmark (HPL) 3 | * HPL - 2.2 - February 24, 2016 4 | * Antoine P. Petitet 5 | * University of Tennessee, Knoxville 6 | * Innovative Computing Laboratory 7 | * (C) Copyright 2000-2008 All Rights Reserved 8 | * 9 | * Modified by: Noel Chalmers 10 | * (C) 2018-2025 Advanced Micro Devices, Inc. 11 | * See the rocHPL/LICENCE file for details. 
12 | * 13 | * SPDX-License-Identifier: (BSD-3-Clause) 14 | * --------------------------------------------------------------------- 15 | */ 16 | 17 | #include "hpl.hpp" 18 | 19 | void HPL_indxg2lp(int* IL, 20 | int* PROC, 21 | const int IG, 22 | const int INB, 23 | const int NB, 24 | const int SRCPROC, 25 | const int NPROCS) { 26 | /* 27 | * Purpose 28 | * ======= 29 | * 30 | * HPL_indxg2lp computes the local index of a matrix entry pointed to by 31 | * the global index IG as well as the process coordinate which possesses 32 | * this entry. The returned local index is the same in all processes. 33 | * 34 | * Arguments 35 | * ========= 36 | * 37 | * IL (output) int * 38 | * On exit, IL specifies the local index corresponding to IG. IL 39 | * is at least zero. 40 | * 41 | * PROC (output) int * 42 | * On exit, PROC is the coordinate of the process owning the 43 | * entry specified by the global index IG. PROC is at least zero 44 | * and less than NPROCS. 45 | * 46 | * IG (input) const int 47 | * On entry, IG specifies the global index of the matrix entry. 48 | * IG must be at least zero. 49 | * 50 | * INB (input) const int 51 | * On entry, INB specifies the size of the first block of the 52 | * global matrix. INB must be at least one. 53 | * 54 | * NB (input) const int 55 | * On entry, NB specifies the blocking factor used to partition 56 | * and distribute the matrix A. NB must be larger than one. 57 | * 58 | * SRCPROC (input) const int 59 | * On entry, if SRCPROC = -1, the data is not distributed but 60 | * replicated, in which case this routine returns IG in all 61 | * processes. Otherwise, the value of SRCPROC is ignored. 62 | * 63 | * NPROCS (input) const int 64 | * On entry, NPROCS specifies the total number of process rows 65 | * or columns over which the matrix is distributed. NPROCS must 66 | * be at least one. 67 | * 68 | * --------------------------------------------------------------------- 69 | */ 70 | 71 | int i, j; 72 | 73 | if((IG < INB) || (SRCPROC == -1) || (NPROCS == 1)) { 74 | /* 75 | * IG belongs to the first block, or the data is not distributed, or 76 | * there is just one process in this dimension of the grid. 77 | */ 78 | *IL = IG; 79 | *PROC = SRCPROC; 80 | } else { 81 | /* 82 | * IG = INB - NB + ( l * NPROCS + MYROC ) * NB + X with 0 <= X < NB, 83 | * thus IG is to be found in the block (IG-INB+NB) / NB = l*NPROCS+MYROC 84 | * with 0 <= MYROC < NPROCS. The local index to be returned depends on 85 | * whether IG resides in the process owning the first partial block of 86 | * size INB (MYROC=0). To determine this cheaply, let i = (IG-INB) / NB, 87 | * so that if NPROCS divides i+1, i.e. MYROC=0, we have i+1 = l*NPROCS. 88 | * If we set j = i / NPROCS, it follows that j = l-1. Therefore, i+1 is 89 | * equal to (j+1) * NPROCS. Conversely, if NPROCS does not divide i+1, 90 | * then i+1 = l*NPROCS + MYROC with 1 <= MYROC < NPROCS. It follows that 91 | * j=l and thus (j+1)*NPROCS > i+1. 92 | */ 93 | j = (i = (IG - INB) / NB) / NPROCS; 94 | /* 95 | * IG is in block 1 + ( IG - INB ) / NB. Add this to SRCPROC and take 96 | * the NPROCS modulo (definition of the block-cyclic data distribution). 97 | */ 98 | *PROC = SRCPROC + 1 + i; 99 | *PROC = MPosMod(*PROC, NPROCS); 100 | /* 101 | * When IG resides in the process owning the first partial block of size 102 | * INB (MYROC = 0), then the result IL can be written as: 103 | * IL = INB - NB + l * NB + X = IG + ( l - (l * NPROCS + MYROC) ) * NB.
104 | * Using the above notation, we have i+1 = l*NPROCS + MYROC = l*NPROCS, 105 | * i.e. l = ( i+1 ) / NPROCS = j+1, since NPROCS divides i+1, therefore 106 | * IL = IG + ( j + 1 - ( i + 1 ) ) * NB. 107 | * 108 | * Otherwise when MYROC >= 1, the result IL can be written as: 109 | * IL = l * NB + X = IG - INB + ( ( l+1 ) - ( l * NPROCS + MYROC ) )*NB. 110 | * We still have i+1 = l*NPROCS+MYROC. Since NPROCS does not divide i+1, 111 | * we have j = (l*NPROCS+MYROC-1) / NPROCS = l, i.e. 112 | * IL = IG - INB + ( j + 1 - ( i + 1 ) ) * NB. 113 | */ 114 | *IL = NB * (j - i) + ((i + 1 - (j + 1) * NPROCS) ? IG - INB : IG); 115 | } 116 | } 117 | -------------------------------------------------------------------------------- /src/pauxil/HPL_indxg2p.cpp: -------------------------------------------------------------------------------- 1 | /* --------------------------------------------------------------------- 2 | * -- High Performance Computing Linpack Benchmark (HPL) 3 | * HPL - 2.2 - February 24, 2016 4 | * Antoine P. Petitet 5 | * University of Tennessee, Knoxville 6 | * Innovative Computing Laboratory 7 | * (C) Copyright 2000-2008 All Rights Reserved 8 | * 9 | * Modified by: Noel Chalmers 10 | * (C) 2018-2025 Advanced Micro Devices, Inc. 11 | * See the rocHPL/LICENCE file for details. 12 | * 13 | * SPDX-License-Identifier: (BSD-3-Clause) 14 | * --------------------------------------------------------------------- 15 | */ 16 | 17 | #include "hpl.hpp" 18 | 19 | int HPL_indxg2p(const int IG, 20 | const int INB, 21 | const int NB, 22 | const int SRCPROC, 23 | const int NPROCS) { 24 | /* 25 | * Purpose 26 | * ======= 27 | * 28 | * HPL_indxg2p computes the process coordinate which possesses the entry 29 | * of a matrix specified by a global index IG. 30 | * 31 | * Arguments 32 | * ========= 33 | * 34 | * IG (input) const int 35 | * On entry, IG specifies the global index of the matrix entry. 36 | * IG must be at least zero. 37 | * 38 | * INB (input) const int 39 | * On entry, INB specifies the size of the first block of the 40 | * global matrix. INB must be at least one. 41 | * 42 | * NB (input) const int 43 | * On entry, NB specifies the blocking factor used to partition 44 | * and distribute the matrix A. NB must be larger than one. 45 | * 46 | * SRCPROC (input) const int 47 | * On entry, SRCPROC specifies the coordinate of the process 48 | * that possesses the first row or column of the matrix. SRCPROC 49 | * must be at least zero and strictly less than NPROCS. 50 | * 51 | * NPROCS (input) const int 52 | * On entry, NPROCS specifies the total number of process rows 53 | * or columns over which the matrix is distributed. NPROCS must 54 | * be at least one. 55 | * 56 | * --------------------------------------------------------------------- 57 | */ 58 | 59 | int proc; 60 | 61 | if((IG < INB) || (SRCPROC == -1) || (NPROCS == 1)) 62 | /* 63 | * IG belongs to the first block, or the data is not distributed, or 64 | * there is just one process in this dimension of the grid. 65 | */ 66 | return (SRCPROC); 67 | /* 68 | * Otherwise, IG is in block 1 + ( IG - INB ) / NB. Add this to SRCPROC 69 | * and take the NPROCS modulo (definition of the block-cyclic data dis- 70 | * tribution).
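 *
 * For illustration (worked example added editorially): with INB = 2,
 * NB = 3, SRCPROC = 0 and NPROCS = 4, global rows 0-1 form the first
 * block on process 0, rows 2-4 the next block on process 1, and so on.
 * IG = 10 then lies in block 1 + (10-2)/3 = 3 and is owned by process
 * MPosMod(0+3, 4) = 3, while IG = 13 lies in block 4 and wraps around
 * to process 0.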
71 | */ 72 | proc = SRCPROC + 1 + (IG - INB) / NB; 73 | return (MPosMod(proc, NPROCS)); 74 | } 75 | -------------------------------------------------------------------------------- /src/pauxil/HPL_indxl2g.cpp: -------------------------------------------------------------------------------- 1 | /* --------------------------------------------------------------------- 2 | * -- High Performance Computing Linpack Benchmark (HPL) 3 | * HPL - 2.2 - February 24, 2016 4 | * Antoine P. Petitet 5 | * University of Tennessee, Knoxville 6 | * Innovative Computing Laboratory 7 | * (C) Copyright 2000-2008 All Rights Reserved 8 | * 9 | * Modified by: Noel Chalmers 10 | * (C) 2018-2025 Advanced Micro Devices, Inc. 11 | * See the rocHPL/LICENCE file for details. 12 | * 13 | * SPDX-License-Identifier: (BSD-3-Clause) 14 | * --------------------------------------------------------------------- 15 | */ 16 | 17 | #include "hpl.hpp" 18 | 19 | int HPL_indxl2g(const int IL, 20 | const int INB, 21 | const int NB, 22 | const int PROC, 23 | const int SRCPROC, 24 | const int NPROCS) { 25 | /* 26 | * Purpose 27 | * ======= 28 | * 29 | * HPL_indxl2g computes the global index of a matrix entry pointed to 30 | * by the local index IL of the process indicated by PROC. 31 | * 32 | * Arguments 33 | * ========= 34 | * 35 | * IL (input) const int 36 | * On entry, IL specifies the local index of the matrix entry. 37 | * IL must be at least zero. 38 | * 39 | * INB (input) const int 40 | * On entry, INB specifies the size of the first block of the 41 | * global matrix. INB must be at least one. 42 | * 43 | * NB (input) const int 44 | * On entry, NB specifies the blocking factor used to partition 45 | * and distribute the matrix A. NB must be larger than one. 46 | * 47 | * PROC (input) const int 48 | * On entry, PROC specifies the coordinate of the process whose 49 | * local array row or column is to be determined. PROC must be 50 | * at least zero and strictly less than NPROCS. 51 | * 52 | * SRCPROC (input) const int 53 | * On entry, SRCPROC specifies the coordinate of the process 54 | * that possesses the first row or column of the matrix. SRCPROC 55 | * must be at least zero and strictly less than NPROCS. 56 | * 57 | * NPROCS (input) const int 58 | * On entry, NPROCS specifies the total number of process rows 59 | * or columns over which the matrix is distributed. NPROCS must 60 | * be at least one. 61 | * 62 | * --------------------------------------------------------------------- 63 | */ 64 | 65 | if((SRCPROC == -1) || (NPROCS == 1)) { 66 | /* 67 | * The data is not distributed, or there is just one process in this di- 68 | * mension of the grid. 69 | */ 70 | return (IL); 71 | } else if(PROC == SRCPROC) { 72 | /* 73 | * If I am SRCPROC, my first block is of size INB 74 | */ 75 | if(IL < INB) 76 | /* 77 | * If IL belongs to the first block, the local and global indexes are 78 | * equal. 79 | */ 80 | return (IL); 81 | /* 82 | * The number of entire blocks before the one IL belongs to is 83 | * ( IL - INB ) / NB + 1. In the other NPROCS-1 processes, there are 84 | * thus NB*( ( IL-INB )/NB + 1 ) entries, that are globally before the 85 | * global entry corresponding to IL. 86 | */ 87 | return ((NPROCS - 1) * NB * ((IL - INB) / NB + 1) + IL); 88 | } else if(PROC < SRCPROC) { 89 | /* 90 | * Otherwise, the process of coordinate MOD(SRCPROC+1, NPROCS) owns the 91 | * second block. 
Let IPROC = PROC-SRCPROC-1+NPROCS be the number of pro- 92 | * cesses between this process and PROC not included when going from 93 | * left to right on the process line with possible wrap around. These 94 | * IPROC processes have one more NB block than the other processes, who 95 | * own IL / NB blocks of size NB. 96 | */ 97 | return (NB * ((NPROCS - 1) * (IL / NB) + PROC - SRCPROC - 1 + NPROCS) + IL + 98 | INB); 99 | } else { 100 | /* 101 | * Same reasoning as above with IPROC = PROC - SRCPROC - 1. 102 | */ 103 | return (NB * ((NPROCS - 1) * (IL / NB) + PROC - SRCPROC - 1) + IL + INB); 104 | } 105 | } 106 | -------------------------------------------------------------------------------- /src/pauxil/HPL_numroc.cpp: -------------------------------------------------------------------------------- 1 | /* --------------------------------------------------------------------- 2 | * -- High Performance Computing Linpack Benchmark (HPL) 3 | * HPL - 2.2 - February 24, 2016 4 | * Antoine P. Petitet 5 | * University of Tennessee, Knoxville 6 | * Innovative Computing Laboratory 7 | * (C) Copyright 2000-2008 All Rights Reserved 8 | * 9 | * Modified by: Noel Chalmers 10 | * (C) 2018-2025 Advanced Micro Devices, Inc. 11 | * See the rocHPL/LICENCE file for details. 12 | * 13 | * SPDX-License-Identifier: (BSD-3-Clause) 14 | * --------------------------------------------------------------------- 15 | */ 16 | 17 | #include "hpl.hpp" 18 | 19 | int HPL_numroc(const int N, 20 | const int INB, 21 | const int NB, 22 | const int PROC, 23 | const int SRCPROC, 24 | const int NPROCS) { 25 | /* 26 | * Purpose 27 | * ======= 28 | * 29 | * HPL_numroc returns the local number of matrix rows/columns process 30 | * PROC will get if we give out N rows/columns starting from global 31 | * index 0. 32 | * 33 | * Arguments 34 | * ========= 35 | * 36 | * N (input) const int 37 | * On entry, N specifies the number of rows/columns being dealt 38 | * out. N must be at least zero. 39 | * 40 | * INB (input) const int 41 | * On entry, INB specifies the size of the first block of the 42 | * global matrix. INB must be at least one. 43 | * 44 | * NB (input) const int 45 | * On entry, NB specifies the blocking factor used to partition 46 | * and distribute the matrix A. NB must be larger than one. 47 | * 48 | * PROC (input) const int 49 | * On entry, PROC specifies the coordinate of the process whose 50 | * local portion is determined. PROC must be at least zero and 51 | * strictly less than NPROCS. 52 | * 53 | * SRCPROC (input) const int 54 | * On entry, SRCPROC specifies the coordinate of the process 55 | * that possesses the first row or column of the matrix. SRCPROC 56 | * must be at least zero and strictly less than NPROCS. 57 | * 58 | * NPROCS (input) const int 59 | * On entry, NPROCS specifies the total number of process rows 60 | * or columns over which the matrix is distributed. NPROCS must 61 | * be at least one. 62 | * 63 | * --------------------------------------------------------------------- 64 | */ 65 | 66 | return (HPL_numrocI(N, 0, INB, NB, PROC, SRCPROC, NPROCS)); 67 | } 68 | -------------------------------------------------------------------------------- /src/pauxil/HPL_pabort.cpp: -------------------------------------------------------------------------------- 1 | /* --------------------------------------------------------------------- 2 | * -- High Performance Computing Linpack Benchmark (HPL) 3 | * HPL - 2.2 - February 24, 2016 4 | * Antoine P. 
Petitet 5 | * University of Tennessee, Knoxville 6 | * Innovative Computing Laboratory 7 | * (C) Copyright 2000-2008 All Rights Reserved 8 | * 9 | * Modified by: Noel Chalmers 10 | * (C) 2018-2025 Advanced Micro Devices, Inc. 11 | * See the rocHPL/LICENCE file for details. 12 | * 13 | * SPDX-License-Identifier: (BSD-3-Clause) 14 | * --------------------------------------------------------------------- 15 | */ 16 | 17 | #include "hpl.hpp" 18 | 19 | void HPL_pabort(int LINE, const char* SRNAME, const char* FORM, ...) { 20 | /* 21 | * Purpose 22 | * ======= 23 | * 24 | * HPL_pabort displays an error message on stderr and halts execution. 25 | * 26 | * 27 | * Arguments 28 | * ========= 29 | * 30 | * LINE (local input) int 31 | * On entry, LINE specifies the line number in the file where 32 | * the error has occurred. When LINE is not a positive line 33 | * number, it is ignored. 34 | * 35 | * SRNAME (local input) const char * 36 | * On entry, SRNAME should be the name of the routine calling 37 | * this error handler. 38 | * 39 | * FORM (local input) const char * 40 | * On entry, FORM specifies the format, i.e., how the subsequent 41 | * arguments are converted for output. 42 | * 43 | * (local input) ... 44 | * On entry, ... is the list of arguments to be printed within 45 | * the format string. 46 | * 47 | * --------------------------------------------------------------------- 48 | */ 49 | 50 | va_list argptr; 51 | int rank; 52 | char cline[128]; 53 | 54 | va_start(argptr, FORM); 55 | (void)vsnprintf(cline, sizeof(cline), FORM, argptr); /* bounded to avoid overflowing cline */ 56 | va_end(argptr); 57 | 58 | MPI_Comm_rank(MPI_COMM_WORLD, &rank); 59 | /* 60 | * Display an error message 61 | */ 62 | if(LINE <= 0) 63 | HPL_fprintf(stderr, 64 | "%s %s %d, %s %s:\n>>> %s <<< Abort ...\n\n", 65 | "HPL ERROR", 66 | "from process #", 67 | rank, 68 | "in function", 69 | SRNAME, 70 | cline); 71 | else 72 | HPL_fprintf(stderr, 73 | "%s %s %d, %s %d %s %s:\n>>> %s <<< Abort ...\n\n", 74 | "HPL ERROR", 75 | "from process #", 76 | rank, 77 | "on line", 78 | LINE, 79 | "of function", 80 | SRNAME, 81 | cline); 82 | 83 | MPI_Abort(MPI_COMM_WORLD, -1); 84 | exit(-1); 85 | } 86 | -------------------------------------------------------------------------------- /src/pauxil/HPL_pdlamch.cpp: -------------------------------------------------------------------------------- 1 | /* --------------------------------------------------------------------- 2 | * -- High Performance Computing Linpack Benchmark (HPL) 3 | * HPL - 2.2 - February 24, 2016 4 | * Antoine P. Petitet 5 | * University of Tennessee, Knoxville 6 | * Innovative Computing Laboratory 7 | * (C) Copyright 2000-2008 All Rights Reserved 8 | * 9 | * Modified by: Noel Chalmers 10 | * (C) 2018-2025 Advanced Micro Devices, Inc. 11 | * See the rocHPL/LICENCE file for details.
12 | * 13 | * SPDX-License-Identifier: (BSD-3-Clause) 14 | * --------------------------------------------------------------------- 15 | */ 16 | 17 | #include "hpl.hpp" 18 | 19 | double HPL_pdlamch(MPI_Comm COMM, const HPL_T_MACH CMACH) { 20 | /* 21 | * Purpose 22 | * ======= 23 | * 24 | * HPL_pdlamch determines machine-specific arithmetic constants such as 25 | * the relative machine precision (eps), the safe minimum (sfmin) such that 26 | * 1/sfmin does not overflow, the base of the machine (base), the precision 27 | * (prec), the number of (base) digits in the mantissa (t), whether 28 | * rounding occurs in addition (rnd = 1.0 and 0.0 otherwise), the minimum 29 | * exponent before (gradual) underflow (emin), the underflow threshold 30 | * (rmin) = base**(emin-1), the largest exponent before overflow (emax), the 31 | * overflow threshold (rmax) = (base**emax)*(1-eps). 32 | * 33 | * Arguments 34 | * ========= 35 | * 36 | * COMM (global/local input) MPI_Comm 37 | * The MPI communicator identifying the process collection. 38 | * 39 | * CMACH (global input) const HPL_T_MACH 40 | * Specifies the value to be returned by HPL_pdlamch 41 | * = HPL_MACH_EPS, HPL_pdlamch := eps (default) 42 | * = HPL_MACH_SFMIN, HPL_pdlamch := sfmin 43 | * = HPL_MACH_BASE, HPL_pdlamch := base 44 | * = HPL_MACH_PREC, HPL_pdlamch := eps*base 45 | * = HPL_MACH_MLEN, HPL_pdlamch := t 46 | * = HPL_MACH_RND, HPL_pdlamch := rnd 47 | * = HPL_MACH_EMIN, HPL_pdlamch := emin 48 | * = HPL_MACH_RMIN, HPL_pdlamch := rmin 49 | * = HPL_MACH_EMAX, HPL_pdlamch := emax 50 | * = HPL_MACH_RMAX, HPL_pdlamch := rmax 51 | * 52 | * where 53 | * 54 | * eps = relative machine precision, 55 | * sfmin = safe minimum, 56 | * base = base of the machine, 57 | * prec = eps*base, 58 | * t = number of digits in the mantissa, 59 | * rnd = 1.0 if rounding occurs in addition, 60 | * emin = minimum exponent before underflow, 61 | * rmin = underflow threshold, 62 | * emax = largest exponent before overflow, 63 | * rmax = overflow threshold. 64 | * 65 | * --------------------------------------------------------------------- 66 | */ 67 | 68 | double param; 69 | 70 | param = HPL_dlamch(CMACH); 71 | 72 | switch(CMACH) { 73 | case HPL_MACH_EPS: 74 | case HPL_MACH_SFMIN: 75 | case HPL_MACH_EMIN: 76 | case HPL_MACH_RMIN: 77 | (void)HPL_all_reduce((void*)(&param), 1, HPL_DOUBLE, HPL_MAX, COMM); 78 | break; 79 | case HPL_MACH_EMAX: 80 | case HPL_MACH_RMAX: 81 | (void)HPL_all_reduce((void*)(&param), 1, HPL_DOUBLE, HPL_MIN, COMM); 82 | break; 83 | default: break; 84 | } 85 | 86 | return (param); 87 | } 88 | -------------------------------------------------------------------------------- /src/pauxil/HPL_pwarn.cpp: -------------------------------------------------------------------------------- 1 | /* --------------------------------------------------------------------- 2 | * -- High Performance Computing Linpack Benchmark (HPL) 3 | * HPL - 2.2 - February 24, 2016 4 | * Antoine P. Petitet 5 | * University of Tennessee, Knoxville 6 | * Innovative Computing Laboratory 7 | * (C) Copyright 2000-2008 All Rights Reserved 8 | * 9 | * Modified by: Noel Chalmers 10 | * (C) 2018-2025 Advanced Micro Devices, Inc. 11 | * See the rocHPL/LICENCE file for details. 12 | * 13 | * SPDX-License-Identifier: (BSD-3-Clause) 14 | * --------------------------------------------------------------------- 15 | */ 16 | 17 | #include "hpl.hpp" 18 | 19 | void HPL_pwarn(FILE* STREAM, 20 | int LINE, 21 | const char* SRNAME, 22 | const char* FORM, 23 | ...)
{ 24 | /* 25 | * Purpose 26 | * ======= 27 | * 28 | * HPL_pwarn displays an error message. 29 | * 30 | * 31 | * Arguments 32 | * ========= 33 | * 34 | * STREAM (local input) FILE * 35 | * On entry, STREAM specifies the output stream. 36 | * 37 | * LINE (local input) int 38 | * On entry, LINE specifies the line number in the file where 39 | * the error has occurred. When LINE is not a positive line 40 | * number, it is ignored. 41 | * 42 | * SRNAME (local input) const char * 43 | * On entry, SRNAME should be the name of the routine calling 44 | * this error handler. 45 | * 46 | * FORM (local input) const char * 47 | * On entry, FORM specifies the format, i.e., how the subsequent 48 | * arguments are converted for output. 49 | * 50 | * (local input) ... 51 | * On entry, ... is the list of arguments to be printed within 52 | * the format string. 53 | * 54 | * --------------------------------------------------------------------- 55 | */ 56 | 57 | va_list argptr; 58 | int rank; 59 | char cline[128]; 60 | 61 | va_start(argptr, FORM); 62 | (void)vsnprintf(cline, sizeof(cline), FORM, argptr); /* bounded to avoid overflowing cline */ 63 | va_end(argptr); 64 | 65 | MPI_Comm_rank(MPI_COMM_WORLD, &rank); 66 | /* 67 | * Display an error message 68 | */ 69 | if(LINE <= 0) { 70 | HPL_fprintf(STREAM, 71 | "%s %s %d, %s %s:\n>>> %s <<<\n\n", 72 | "HPL ERROR", 73 | "from process #", 74 | rank, 75 | "in function", 76 | SRNAME, 77 | cline); 78 | if(STREAM != stderr) { 79 | HPL_fprintf(stderr, 80 | "%s %s %d, %s %s:\n>>> %s <<<\n\n", 81 | "HPL ERROR", 82 | "from process #", 83 | rank, 84 | "in function", 85 | SRNAME, 86 | cline); 87 | } 88 | } else { 89 | HPL_fprintf(STREAM, 90 | "%s %s %d, %s %d %s %s:\n>>> %s <<<\n\n", 91 | "HPL ERROR", 92 | "from process #", 93 | rank, 94 | "on line", 95 | LINE, 96 | "of function", 97 | SRNAME, 98 | cline); 99 | 100 | if(STREAM != stderr) { 101 | HPL_fprintf(stderr, 102 | "%s %s %d, %s %d %s %s:\n>>> %s <<<\n\n", 103 | "HPL ERROR", 104 | "from process #", 105 | rank, 106 | "on line", 107 | LINE, 108 | "of function", 109 | SRNAME, 110 | cline); 111 | } 112 | } 113 | } 114 | -------------------------------------------------------------------------------- /src/pfact/HPL_pdfact.cpp: -------------------------------------------------------------------------------- 1 | /* --------------------------------------------------------------------- 2 | * -- High Performance Computing Linpack Benchmark (HPL) 3 | * HPL - 2.2 - February 24, 2016 4 | * Antoine P. Petitet 5 | * University of Tennessee, Knoxville 6 | * Innovative Computing Laboratory 7 | * (C) Copyright 2000-2008 All Rights Reserved 8 | * 9 | * Modified by: Noel Chalmers 10 | * (C) 2018-2025 Advanced Micro Devices, Inc. 11 | * See the rocHPL/LICENCE file for details. 12 | * 13 | * SPDX-License-Identifier: (BSD-3-Clause) 14 | * --------------------------------------------------------------------- 15 | */ 16 | 17 | #include "hpl.hpp" 18 | #include <hip/hip_runtime.h> 19 | 20 | void HPL_pdfact(HPL_T_panel* PANEL) { 21 | /* 22 | * Purpose 23 | * ======= 24 | * 25 | * HPL_pdfact recursively factorizes a 1-dimensional panel of columns. 26 | * The RPFACT function pointer specifies the recursive algorithm to be 27 | * used, either Crout, Left- or Right looking. NBMIN allows one to vary 28 | * the recursive stopping criterion in terms of the number of columns in 29 | * the panel, and NDIV allows one to specify the number of subpanels each 30 | * panel should be divided into. Usually a value of 2 will be chosen.
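 * (Illustration added editorially: with a panel of NB = 512 columns,
 * NDIV = 2 and NBMIN = 8, the recursion halves the panel --
 * 512, 256, 128, ..., 8 -- and each 8-column leaf subpanel is then
 * factored with the non-recursive algorithm described next.)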
Finally 31 | * PFACT is a function pointer specifying the non-recursive algorithm 32 | * to be used on at most NBMIN columns. One can also choose here between 33 | * Crout, Left- or Right looking. Empirical tests seem to indicate that 34 | * values of 4 or 8 for NBMIN give the best results. 35 | * 36 | * Bi-directional exchange is used to perform the swap::broadcast 37 | * operations at once for one column in the panel. This results in a 38 | * lower number of slightly larger messages than usual. On P processes 39 | * and assuming bi-directional links, the running time of this function 40 | * can be approximated by (when N is equal to N0): 41 | * 42 | * N0 * log_2( P ) * ( lat + ( 2*N0 + 4 ) / bdwth ) + 43 | * N0^2 * ( M - N0/3 ) * gam2-3 44 | * 45 | * where M is the local number of rows of the panel, lat and bdwth are 46 | * the latency and bandwidth of the network for double precision real 47 | * words, and gam2-3 is an estimate of the Level 2 and Level 3 BLAS 48 | * rate of execution. The recursive algorithm indeed allows the panel 49 | * factorization to almost achieve Level 3 BLAS performance. On a 50 | * large number of modern machines, this operation is however latency 51 | * bound, meaning that its cost can be estimated by only the latency 52 | * portion N0 * log_2(P) * lat. Mono-directional links will double this 53 | * communication cost. 54 | * 55 | * Arguments 56 | * ========= 57 | * 58 | * PANEL (local input/output) HPL_T_panel * 59 | * On entry, PANEL points to the data structure containing the 60 | * panel information. 61 | * 62 | * --------------------------------------------------------------------- 63 | */ 64 | 65 | int jb = PANEL->jb; 66 | PANEL->n -= jb; 67 | PANEL->ja += jb; 68 | 69 | if((PANEL->grid->mycol != PANEL->pcol) || (jb <= 0)) return; 70 | 71 | #ifdef HPL_DETAILED_TIMING 72 | HPL_ptimer(HPL_TIMING_RPFACT); 73 | #endif 74 | 75 | /*Copy current panel into workspace*/ 76 | HPL_dlacpy( 77 | PANEL->mp, PANEL->jb, PANEL->A, PANEL->lda, PANEL->A0, PANEL->lda0); 78 | 79 | /* 80 | * Factor the panel - Update the panel pointers 81 | */ 82 | HPL_TracingPush("pdfact"); 83 | 84 | hipStream_t stream; 85 | CHECK_ROCBLAS_ERROR(rocblas_get_stream(handle, &stream)); 86 | CHECK_HIP_ERROR(hipEventRecord(pfactStart, stream)); 87 | 88 | PANEL->algo->rffun(PANEL, PANEL->mp, jb, 0); 89 | 90 | CHECK_HIP_ERROR(hipEventRecord(pfactStop, stream)); 91 | 92 | HPL_TracingPop(); 93 | 94 | /*Copy L1 back into A*/ 95 | if(PANEL->grid->myrow == PANEL->prow) { 96 | if(PANEL->algo->L1notran) { 97 | HPL_dlacpy(jb, jb, PANEL->L1, jb, PANEL->A, PANEL->lda); 98 | } else { 99 | HPL_dlatcpy(jb, jb, PANEL->L1, jb, PANEL->A, PANEL->lda); 100 | } 101 | } 102 | 103 | PANEL->A = Mptr(PANEL->A, 0, jb, PANEL->lda); 104 | PANEL->nq -= jb; 105 | PANEL->jj += jb; 106 | 107 | #ifdef HPL_DETAILED_TIMING 108 | HPL_ptimer(HPL_TIMING_RPFACT); 109 | #endif 110 | } 111 | -------------------------------------------------------------------------------- /src/pfact/HPL_pdmxswp.cpp: -------------------------------------------------------------------------------- 1 | /* --------------------------------------------------------------------- 2 | * -- High Performance Computing Linpack Benchmark (HPL) 3 | * HPL - 2.2 - February 24, 2016 4 | * Antoine P. Petitet 5 | * University of Tennessee, Knoxville 6 | * Innovative Computing Laboratory 7 | * (C) Copyright 2000-2008 All Rights Reserved 8 | * 9 | * Modified by: Noel Chalmers 10 | * (C) 2018-2025 Advanced Micro Devices, Inc. 11 | * See the rocHPL/LICENCE file for details.
12 | * 13 | * SPDX-License-Identifier: (BSD-3-Clause) 14 | * --------------------------------------------------------------------- 15 | */ 16 | 17 | #include "hpl.hpp" 18 | 19 | void HPL_pdmxswp(HPL_T_panel* PANEL, 20 | const int M, 21 | const int II, 22 | const int JJ, 23 | double* WORK) { 24 | /* 25 | * Purpose 26 | * ======= 27 | * 28 | * HPL_pdmxswp swaps and broadcasts the absolute value max row using 29 | * bi-directional exchange. The buffer is partially set by HPL_dlocmax. 30 | * 31 | * Bi-directional exchange is used to perform the swap::broadcast 32 | * operations at once for one column in the panel. This results in a 33 | * lower number of slightly larger messages than usual. On P processes 34 | * and assuming bi-directional links, the running time of this function 35 | * can be approximated by 36 | * 37 | * log_2( P ) * ( lat + ( 2 * N0 + 4 ) / bdwth ) 38 | * 39 | * where lat and bdwth are the latency and bandwidth of the network for 40 | * double precision real elements. Communication only occurs in one 41 | * process column. Mono-directional links will cause the communication 42 | * cost to double. 43 | * 44 | * Arguments 45 | * ========= 46 | * 47 | * PANEL (local input/output) HPL_T_panel * 48 | * On entry, PANEL points to the data structure containing the 49 | * panel information. 50 | * 51 | * M (local input) const int 52 | * On entry, M specifies the local number of rows of the matrix 53 | * column on which this function operates. 54 | * 55 | * II (local input) const int 56 | * On entry, II specifies the row offset where the column to be 57 | * operated on starts with respect to the panel. 58 | * 59 | * JJ (local input) const int 60 | * On entry, JJ specifies the column offset where the column to 61 | * be operated on starts with respect to the panel. 62 | * 63 | * WORK (local workspace) double * 64 | * On entry, WORK is a workarray of size at least 2 * (4+2*N0). 65 | * It is assumed that HPL_dlocmax was called prior to this 66 | * routine to initialize the first four entries of this array. 67 | * On exit, the N0 length max row is stored in WORK[4:4+N0-1]; 68 | * Note that this is also the JJth row (or column) of L1. The 69 | * remaining part is used as a temporary array. 70 | * 71 | * --------------------------------------------------------------------- 72 | */ 73 | 74 | double* Wwork; 75 | HPL_T_grid* grid; 76 | MPI_Comm comm; 77 | int cnt0, icurrow, myrow, nprow; 78 | 79 | /* .. 80 | * .. Executable Statements .. 81 | */ 82 | #ifdef HPL_DETAILED_TIMING 83 | HPL_ptimer(HPL_TIMING_MXSWP); 84 | #endif 85 | grid = PANEL->grid; 86 | comm = grid->col_comm; 87 | myrow = grid->myrow; 88 | nprow = grid->nprow; 89 | int JB = PANEL->jb; 90 | int NB = PANEL->nb; 91 | icurrow = PANEL->prow; 92 | 93 | cnt0 = 4 + 2 * JB; 94 | Wwork = WORK + cnt0; 95 | 96 | if(M > 0) { 97 | int ilindx = static_cast<int>(WORK[1]); 98 | int kk = PANEL->ii + II + (ilindx); 99 | int igindx = 0; 100 | Mindxl2g(igindx, kk, NB, NB, myrow, 0, nprow); 101 | /* 102 | * WORK[0] := local maximum absolute value scalar, 103 | * WORK[1] := corresponding local row index, 104 | * WORK[2] := corresponding global row index, 105 | * WORK[3] := coordinate of process owning this max.
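 *
 * (Editorial note: entries WORK[4:3+JB] carry the JB-length candidate
 * pivot row itself, so this scalar header and the row travel together
 * in the single exchange performed by HPL_all_reduce_dmxswp below.)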
106 | */ 107 | WORK[2] = (double)(igindx); 108 | WORK[3] = (double)(myrow); 109 | 110 | } else { 111 | WORK[0] = WORK[1] = WORK[2] = HPL_rzero; 112 | WORK[3] = (double)(PANEL->grid->nprow); 113 | } 114 | 115 | /* Perform swap-broadcast */ 116 | HPL_all_reduce_dmxswp(WORK, cnt0, icurrow, comm, Wwork); 117 | 118 | /* 119 | * Save the global pivot index in pivot array 120 | */ 121 | (PANEL->ipiv)[JJ] = (int)WORK[2]; 122 | 123 | #ifdef HPL_DETAILED_TIMING 124 | HPL_ptimer(HPL_TIMING_MXSWP); 125 | #endif 126 | } 127 | -------------------------------------------------------------------------------- /src/pgesv/HPL_perm.cpp: -------------------------------------------------------------------------------- 1 | /* --------------------------------------------------------------------- 2 | * -- High Performance Computing Linpack Benchmark (HPL) 3 | * HPL - 2.2 - February 24, 2016 4 | * Antoine P. Petitet 5 | * University of Tennessee, Knoxville 6 | * Innovative Computing Laboratory 7 | * (C) Copyright 2000-2008 All Rights Reserved 8 | * 9 | * Modified by: Noel Chalmers 10 | * (C) 2018-2025 Advanced Micro Devices, Inc. 11 | * See the rocHPL/LICENCE file for details. 12 | * 13 | * SPDX-License-Identifier: (BSD-3-Clause) 14 | * --------------------------------------------------------------------- 15 | */ 16 | 17 | #include "hpl.hpp" 18 | 19 | void HPL_perm(const int N, int* LINDXA, int* LINDXAU, int* IWORK) { 20 | /* 21 | * Purpose 22 | * ======= 23 | * 24 | * HPL_perm combines two index arrays and generates the corresponding 25 | * permutation. First, this function computes the inverse of LINDXA and 26 | * then combines it with LINDXAU. Second, in order to be able to perform 27 | * the permutation in place, LINDXAU is overwritten by the sequence of 28 | * permutations producing the same result. What we ultimately want to 29 | * achieve is: U[LINDXAU[i]] := U[LINDXA[i]] for i in [0..N). After the 30 | * call to this function, this in-place permutation can be performed by: 31 | * for i in [0..N), swap U[i] with U[LINDXAU[i]]. 32 | * 33 | * Arguments 34 | * ========= 35 | * 36 | * N (global input) const int 37 | * On entry, N specifies the length of the arrays LINDXA and 38 | * LINDXAU. N should be at least zero. 39 | * 40 | * LINDXA (global input/output) int * 41 | * On entry, LINDXA is an array of dimension N containing the 42 | * source indexes. On exit, LINDXA contains the combined index 43 | * array. 44 | * 45 | * LINDXAU (global input/output) int * 46 | * On entry, LINDXAU is an array of dimension N containing the 47 | * target indexes. On exit, LINDXAU contains the sequence of 48 | * permutations that should be applied in increasing order to 49 | * permute the underlying array U in place. 50 | * 51 | * IWORK (workspace) int * 52 | * On entry, IWORK is a workarray of dimension N.
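 *
 * For illustration (editorial sketch; swap() is hypothetical): after
 * this routine returns, a caller applies the permutation in place with
 *
 *    for(i = 0; i < N; i++) swap(U[i], U[LINDXAU[i]]);
 *
 * which realizes U[LINDXAU[i]] := U[LINDXA[i]] for the original index
 * arrays.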
53 | * 54 | * --------------------------------------------------------------------- 55 | */ 56 | 57 | int i, j, k, fndd; 58 | 59 | /* 60 | * Inverse LINDXA - combine LINDXA and LINDXAU - Initialize IWORK 61 | */ 62 | for(i = 0; i < N; i++) { IWORK[LINDXA[i]] = i; } 63 | for(i = 0; i < N; i++) { 64 | LINDXA[i] = LINDXAU[IWORK[i]]; 65 | IWORK[i] = i; 66 | } 67 | 68 | for(i = 0; i < N; i++) { 69 | /* search LINDXA such that LINDXA[j] == i */ 70 | j = 0; 71 | do { 72 | fndd = (LINDXA[j] == i); 73 | j++; 74 | } while(!fndd); 75 | j--; 76 | /* search IWORK such that IWORK[k] == j */ 77 | k = 0; 78 | do { 79 | fndd = (IWORK[k] == j); 80 | k++; 81 | } while(!fndd); 82 | k--; 83 | /* swap IWORK[i] and IWORK[k]; LINDXAU[i] = k */ 84 | j = IWORK[i]; 85 | IWORK[i] = IWORK[k]; 86 | IWORK[k] = j; 87 | LINDXAU[i] = k; 88 | } 89 | } 90 | -------------------------------------------------------------------------------- /src/pgesv/HPL_piplen.cpp: -------------------------------------------------------------------------------- 1 | /* --------------------------------------------------------------------- 2 | * -- High Performance Computing Linpack Benchmark (HPL) 3 | * HPL - 2.2 - February 24, 2016 4 | * Antoine P. Petitet 5 | * University of Tennessee, Knoxville 6 | * Innovative Computing Laboratory 7 | * (C) Copyright 2000-2008 All Rights Reserved 8 | * 9 | * Modified by: Noel Chalmers 10 | * (C) 2018-2025 Advanced Micro Devices, Inc. 11 | * See the rocHPL/LICENCE file for details. 12 | * 13 | * SPDX-License-Identifier: (BSD-3-Clause) 14 | * --------------------------------------------------------------------- 15 | */ 16 | 17 | #include "hpl.hpp" 18 | 19 | void HPL_piplen(HPL_T_panel* PANEL, 20 | const int K, 21 | const int* IPID, 22 | int* IPLEN, 23 | int* IWORK) { 24 | 25 | const int nprow = PANEL->grid->nprow; 26 | const int jb = PANEL->jb; 27 | const int nb = PANEL->nb; 28 | const int ia = PANEL->ia; 29 | const int icurrow = PANEL->prow; 30 | 31 | int* iwork = IWORK + jb; 32 | 33 | /* 34 | * Compute IPLEN 35 | */ 36 | for(int i = 0; i <= nprow; i++) IPLEN[i] = 0; 37 | 38 | /* 39 | * IPLEN[i] is the number of rows of A in the processes before 40 | * process i, with the convention that IPLEN[nprow] is the total 41 | * number of rows. 42 | * In other words, IPLEN[i+1] - IPLEN[i] is the local number of 43 | * rows of A that should be moved for each process. 44 | */ 45 | for(int i = 0; i < K; i += 2) { 46 | const int src = IPID[i]; 47 | int srcrow; 48 | Mindxg2p(src, nb, nb, srcrow, 0, nprow); 49 | if(srcrow == icurrow) { 50 | const int dst = IPID[i + 1]; 51 | int dstrow; 52 | Mindxg2p(dst, nb, nb, dstrow, 0, nprow); 53 | if((dstrow != srcrow) || (dst - ia < jb)) IPLEN[dstrow + 1]++; 54 | } 55 | } 56 | 57 | for(int i = 1; i <= nprow; i++) { IPLEN[i] += IPLEN[i - 1]; } 58 | } 59 | -------------------------------------------------------------------------------- /src/timer/HPL_ptimer_cputime.cpp: -------------------------------------------------------------------------------- 1 | /* --------------------------------------------------------------------- 2 | * -- High Performance Computing Linpack Benchmark (HPL) 3 | * HPL - 2.2 - February 24, 2016 4 | * Antoine P. Petitet 5 | * University of Tennessee, Knoxville 6 | * Innovative Computing Laboratory 7 | * (C) Copyright 2000-2008 All Rights Reserved 8 | * 9 | * Modified by: Noel Chalmers 10 | * (C) 2018-2025 Advanced Micro Devices, Inc. 11 | * See the rocHPL/LICENCE file for details. 
12 | * 13 | * SPDX-License-Identifier: (BSD-3-Clause) 14 | * --------------------------------------------------------------------- 15 | */ 16 | 17 | #include "hpl.hpp" 18 | 19 | /* 20 | * Purpose 21 | * ======= 22 | * 23 | * HPL_ptimer_cputime returns the CPU time. 24 | * The clock() function is used to return an approximation of processor 25 | * time used by the program. The value returned is the CPU time used so 26 | * far as a clock_t; to get the number of seconds used, the result is 27 | * divided by CLOCKS_PER_SEC. This function is part of the ANSI/ISO C 28 | * standard library. 29 | * 30 | * --------------------------------------------------------------------- 31 | */ 32 | 33 | #include <time.h> 34 | 35 | double HPL_ptimer_cputime(void) { 36 | static double cps = CLOCKS_PER_SEC; 37 | double d; 38 | clock_t t1; 39 | static clock_t t0 = 0; 40 | 41 | if(t0 == 0) t0 = clock(); 42 | t1 = clock() - t0; 43 | d = (double)(t1) / cps; 44 | return (d); 45 | } 46 | -------------------------------------------------------------------------------- /src/timer/HPL_ptimer_walltime.cpp: -------------------------------------------------------------------------------- 1 | /* --------------------------------------------------------------------- 2 | * -- High Performance Computing Linpack Benchmark (HPL) 3 | * HPL - 2.2 - February 24, 2016 4 | * Antoine P. Petitet 5 | * University of Tennessee, Knoxville 6 | * Innovative Computing Laboratory 7 | * (C) Copyright 2000-2008 All Rights Reserved 8 | * 9 | * Modified by: Noel Chalmers 10 | * (C) 2018-2025 Advanced Micro Devices, Inc. 11 | * See the rocHPL/LICENCE file for details. 12 | * 13 | * SPDX-License-Identifier: (BSD-3-Clause) 14 | * --------------------------------------------------------------------- 15 | */ 16 | 17 | #include "hpl.hpp" 18 | 19 | /* 20 | * Purpose 21 | * ======= 22 | * 23 | * HPL_ptimer_walltime returns the elapsed (wall-clock) time. 24 | * 25 | * 26 | * --------------------------------------------------------------------- 27 | */ 28 | 29 | double HPL_ptimer_walltime(void) { return (MPI_Wtime()); } 30 | --------------------------------------------------------------------------------
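A closing editorial sketch, not part of the repository: the three index helpers in src/pauxil are mutually consistent, and a host-only round trip makes the block-cyclic mapping concrete. The stand-alone test file below is hypothetical; HPL_indxg2lp, HPL_indxg2p and HPL_indxl2g are the routines listed above, and linking against the library is assumed.

#include "hpl.hpp"
#include <cassert>

/* Hypothetical round-trip check: map every global index to its
 * (owner process, local index) pair and back again, for a 1-D
 * block-cyclic layout whose first block has a distinct size INB. */
int main(void) {
  const int INB = 2, NB = 3, SRCPROC = 0, NPROCS = 4;
  for(int ig = 0; ig < 64; ig++) {
    int il, proc;
    HPL_indxg2lp(&il, &proc, ig, INB, NB, SRCPROC, NPROCS);
    assert(proc == HPL_indxg2p(ig, INB, NB, SRCPROC, NPROCS));
    assert(ig == HPL_indxl2g(il, INB, NB, proc, SRCPROC, NPROCS));
  }
  return 0;
}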