├── .github
    └── workflows
    │   ├── build.sh
    │   ├── configure.sh
    │   ├── main.yml
    │   ├── setup_env.sh
    │   ├── test.sh
    │   └── variants.yml
├── .gitignore
├── CHANGELOG.md
├── CMakeLists.txt
├── GNUmakefile
├── GNUmakefile.subdir
├── INSTALL.md
├── LICENSE
├── README.md
├── blasppConfig.cmake.in
├── cmake
    ├── BLASConfig.cmake
    ├── BLASFinder.cmake
    ├── CBLASConfig.cmake
    ├── LAPACKFinder.cmake
    ├── config.cmake
    └── util.cmake
├── config
    ├── __init__.py
    ├── ansicodes.py
    ├── blas.cc
    ├── blis_version.cc
    ├── cblas.cc
    ├── compiler_cxx.cc
    ├── config.h
    ├── config.py
    ├── cublas.cc
    ├── essl_version.cc
    ├── hello.cc
    ├── lapack.py
    ├── lapack_potrf.cc
    ├── lapack_pstrf.cc
    ├── mkl_version.cc
    ├── onemkl.cc
    ├── openblas_version.cc
    ├── openmp.cc
    ├── return_complex.cc
    ├── return_complex_argument.cc
    ├── return_float.cc
    ├── return_float_f2c.cc
    ├── rocblas.cc
    └── std_atomic.cc
├── configure.py
├── docs
    └── doxygen
    │   ├── DoxygenLayout.xml
    │   ├── GNUmakefile
    │   ├── doxyfile.conf
    │   ├── errors.pl
    │   └── groups.dox
├── examples
    ├── CMakeLists.txt
    ├── Makefile
    ├── README.md
    ├── example_gemm.cc
    ├── example_util.cc
    └── util.hh
├── include
    ├── GNUmakefile
    ├── blas.hh
    └── blas
    │   ├── GNUmakefile
    │   ├── asum.hh
    │   ├── axpy.hh
    │   ├── batch_common.hh
    │   ├── config.h
    │   ├── copy.hh
    │   ├── counter.hh
    │   ├── defines.h.in
    │   ├── device.hh
    │   ├── device_blas.hh
    │   ├── dot.hh
    │   ├── dotu.hh
    │   ├── flops.hh
    │   ├── fortran.h
    │   ├── gemm.hh
    │   ├── gemv.hh
    │   ├── ger.hh
    │   ├── geru.hh
    │   ├── hemm.hh
    │   ├── hemv.hh
    │   ├── her.hh
    │   ├── her2.hh
    │   ├── her2k.hh
    │   ├── herk.hh
    │   ├── iamax.hh
    │   ├── mangling.h
    │   ├── nrm2.hh
    │   ├── rot.hh
    │   ├── rotg.hh
    │   ├── rotm.hh
    │   ├── rotmg.hh
    │   ├── scal.hh
    │   ├── swap.hh
    │   ├── symm.hh
    │   ├── symv.hh
    │   ├── syr.hh
    │   ├── syr2.hh
    │   ├── syr2k.hh
    │   ├── syrk.hh
    │   ├── trmm.hh
    │   ├── trmv.hh
    │   ├── trsm.hh
    │   ├── trsv.hh
    │   ├── util.hh
    │   └── wrappers.hh
├── lib
    └── pkgconfig
    │   └── blaspp.pc.in
├── make.inc.in
├── src
    ├── asum.cc
    ├── axpy.cc
    ├── batch_gemm.cc
    ├── batch_hemm.cc
    ├── batch_her2k.cc
    ├── batch_herk.cc
    ├── batch_symm.cc
    ├── batch_syr2k.cc
    ├── batch_syrk.cc
    ├── batch_trmm.cc
    ├── batch_trsm.cc
    ├── blas_internal.hh
    ├── copy.cc
    ├── cublas_wrappers.cc
    ├── cuda
    │   ├── device_conj.cu
    │   └── device_shift_vec.cu
    ├── device_asum.cc
    ├── device_axpy.cc
    ├── device_batch_gemm.cc
    ├── device_batch_gemm_group.cc
    ├── device_batch_hemm.cc
    ├── device_batch_her2k.cc
    ├── device_batch_herk.cc
    ├── device_batch_symm.cc
    ├── device_batch_syr2k.cc
    ├── device_batch_syrk.cc
    ├── device_batch_trmm.cc
    ├── device_batch_trsm.cc
    ├── device_copy.cc
    ├── device_dot.cc
    ├── device_error.cc
    ├── device_gemm.cc
    ├── device_gemv.cc
    ├── device_ger.cc
    ├── device_hemm.cc
    ├── device_hemv.cc
    ├── device_her.cc
    ├── device_her2.cc
    ├── device_her2k.cc
    ├── device_herk.cc
    ├── device_iamax.cc
    ├── device_internal.hh
    ├── device_nrm2.cc
    ├── device_queue.cc
    ├── device_rot.cc
    ├── device_rotg.cc
    ├── device_rotm.cc
    ├── device_rotmg.cc
    ├── device_scal.cc
    ├── device_swap.cc
    ├── device_symm.cc
    ├── device_symv.cc
    ├── device_syr.cc
    ├── device_syr2.cc
    ├── device_syr2k.cc
    ├── device_syrk.cc
    ├── device_trmm.cc
    ├── device_trmv.cc
    ├── device_trsm.cc
    ├── device_trsv.cc
    ├── device_utils.cc
    ├── dot.cc
    ├── gemm.cc
    ├── gemv.cc
    ├── ger.cc
    ├── hemm.cc
    ├── hemv.cc
    ├── her.cc
    ├── her2.cc
    ├── her2k.cc
    ├── herk.cc
    ├── hip
    │   ├── device_conj.hip
    │   └── device_shift_vec.hip
    ├── iamax.cc
    ├── nrm2.cc
    ├── onemkl_wrappers.cc
    ├── rocblas_wrappers.cc
    ├── rot.cc
    ├── rotg.cc
    ├── rotm.cc
    ├── rotmg.cc
    ├── scal.cc
    ├── swap.cc
    ├── symm.cc
    ├── symv.cc
    ├── syr.cc
    ├── syr2.cc
    ├── syr2k.cc
    ├── syrk.cc
    ├── trmm.cc
    ├── trmv.cc
    ├── trsm.cc
    ├── trsv.cc
    ├── util.cc
    └── version.cc
├── test
    ├── CMakeLists.txt
    ├── GNUmakefile
    ├── cblas_wrappers.cc
    ├── cblas_wrappers.hh
    ├── check_gemm.hh
    ├── lapack_wrappers.cc
    ├── lapack_wrappers.hh
    ├── print_matrix.hh
    ├── run_tests.py
    ├── test.cc
    ├── test.hh
    ├── test_asum.cc
    ├── test_asum_device.cc
    ├── test_axpy.cc
    ├── test_axpy_device.cc
    ├── test_batch_gemm.cc
    ├── test_batch_gemm_device.cc
    ├── test_batch_hemm.cc
    ├── test_batch_hemm_device.cc
    ├── test_batch_her2k.cc
    ├── test_batch_her2k_device.cc
    ├── test_batch_herk.cc
    ├── test_batch_herk_device.cc
    ├── test_batch_symm.cc
    ├── test_batch_symm_device.cc
    ├── test_batch_syr2k.cc
    ├── test_batch_syr2k_device.cc
    ├── test_batch_syrk.cc
    ├── test_batch_syrk_device.cc
    ├── test_batch_trmm.cc
    ├── test_batch_trmm_device.cc
    ├── test_batch_trsm.cc
    ├── test_batch_trsm_device.cc
    ├── test_copy.cc
    ├── test_copy_device.cc
    ├── test_dot.cc
    ├── test_dot_device.cc
    ├── test_error.cc
    ├── test_gemm.cc
    ├── test_gemm_device.cc
    ├── test_gemv.cc
    ├── test_gemv_device.cc
    ├── test_ger.cc
    ├── test_ger_device.cc
    ├── test_hemm.cc
    ├── test_hemm_device.cc
    ├── test_hemv.cc
    ├── test_hemv_device.cc
    ├── test_her.cc
    ├── test_her2.cc
    ├── test_her2_device.cc
    ├── test_her2k.cc
    ├── test_her2k_device.cc
    ├── test_her_device.cc
    ├── test_herk.cc
    ├── test_herk_device.cc
    ├── test_iamax.cc
    ├── test_iamax_device.cc
    ├── test_max.cc
    ├── test_memcpy.cc
    ├── test_memcpy_2d.cc
    ├── test_nrm2.cc
    ├── test_nrm2_device.cc
    ├── test_rot.cc
    ├── test_rot_device.cc
    ├── test_rotg.cc
    ├── test_rotg_device.cc
    ├── test_rotm.cc
    ├── test_rotm_device.cc
    ├── test_rotmg.cc
    ├── test_rotmg_device.cc
    ├── test_scal.cc
    ├── test_scal_device.cc
    ├── test_schur_gemm.cc
    ├── test_swap.cc
    ├── test_swap_device.cc
    ├── test_symm.cc
    ├── test_symm_device.cc
    ├── test_symv.cc
    ├── test_symv_device.cc
    ├── test_syr.cc
    ├── test_syr2.cc
    ├── test_syr2_device.cc
    ├── test_syr2k.cc
    ├── test_syr2k_device.cc
    ├── test_syr_device.cc
    ├── test_syrk.cc
    ├── test_syrk_device.cc
    ├── test_trmm.cc
    ├── test_trmm_device.cc
    ├── test_trmv.cc
    ├── test_trmv_device.cc
    ├── test_trsm.cc
    ├── test_trsm_device.cc
    ├── test_trsv.cc
    ├── test_trsv_device.cc
    └── test_util.cc
└── tools
    ├── copyright.py
    ├── doxygen-filter.pl
    ├── hooks
        └── pre-commit
    ├── make_release.py
    ├── release.py
    └── slate-style-reject.pl


/.github/workflows/build.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash -x
 2 | 
 3 | mydir=$(dirname $0)
 4 | source ${mydir}/setup_env.sh
 5 | 
 6 | print "======================================== Build"
 7 | make -j8 || exit 10
 8 | 
 9 | print "======================================== Install"
10 | make -j8 install || exit 11
11 | ls -R ${top}/install
12 | 
13 | print "======================================== Verify build"
14 | ldd_result=$(ldd test/tester) || exit 12
15 | echo "${ldd_result}"
16 | 
17 | # Verify that tester linked with cublas or rocblas as intended.
18 | if [ "${device}" = "gpu_nvidia" ]; then
19 |     echo "${ldd_result}" | grep cublas || exit 13
20 | 
21 | elif [ "${device}" = "gpu_amd" ]; then
22 |     echo "${ldd_result}" | grep rocblas || exit 14
23 | 
24 | else
25 |     # CPU-only build must not be linked with cublas or rocblas.
26 |     echo "${ldd_result}" | grep -P "cublas|rocblas" && exit 15
27 | fi
28 | 
29 | # Verify that tester linked with intended CPU BLAS.
30 | echo "${ldd_result}" | grep -P "lib\S*${blas}" || exit 16
31 | 
32 | # Verify that tester linked with intended ilp64 library, or not.
33 | if [[ $blas_int = "int64" ]] \
34 |     || [[ $bla_vendor = *64ilp* ]] \
35 |     || [[ $bla_vendor = *ilp64* ]] \
36 |     || [[ $BLAS_LIBRARIES = *ilp64* ]]; then
37 |     echo "${ldd_result}" | grep -P "libmkl_\S+_ilp64" || exit 17
38 | else
39 |     echo "${ldd_result}" | grep -P "libmkl_\S+_ilp64" && exit 18
40 | fi
41 | 
42 | print "======================================== Finished build"
43 | exit 0
44 | 


--------------------------------------------------------------------------------
/.github/workflows/configure.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash -x
 2 | 
 3 | if [ "${maker}" = "cmake" ]; then
 4 |     rm -rf build
 5 |     mkdir -p build
 6 | fi
 7 | 
 8 | mydir=$(dirname $0)
 9 | source ${mydir}/setup_env.sh
10 | 
11 | print "======================================== Environment"
12 | # Show environment variables, excluding functions.
13 | (set -o posix; set)
14 | 
15 | print "======================================== Modules"
16 | quiet module list -l
17 | 
18 | print "======================================== Query GPUs"
19 | if   [ "${device}" = "gpu_nvidia" ]; then
20 |     nvidia-smi
21 | elif [ "${device}" = "gpu_amd" ]; then
22 |     rocm-smi
23 | elif [ "${device}" = "gpu_intel" ]; then
24 |     clinfo
25 |     sycl-ls
26 | fi
27 | 
28 | print "======================================== Setup build"
29 | # Note: set all env variables in setup_env.sh,
30 | # else build.sh and test.sh won't see them.
31 | 
32 | rm -rf ${top}/install
33 | if [ "${maker}" = "make" ]; then
34 |     make distclean
35 |     make config prefix=${top}/install
36 |     err=$?
37 |     if [[ $err -ne 0 ]]; then
38 |         echo "<<<<<<<<<<<<<<<<<<<< begin config/log.txt"
39 |         cat config/log.txt
40 |         echo ">>>>>>>>>>>>>>>>>>>> end config/log.txt"
41 |         exit 10
42 |     fi
43 | 
44 | elif [ "${maker}" = "cmake" ]; then
45 |     # Translate env settings into -D options; bla_vendor overrides the rest.
46 |     if [[ $blas != "" ]]; then
47 |         export cmake_blas="-Dblas=$blas"
48 |     fi
49 |     if [[ $blas_int != "" ]]; then
50 |         export cmake_blas_int="-Dblas_int=$blas_int"
51 |     fi
52 |     if [[ $blas_threaded != "" ]]; then
53 |         export cmake_blas_threaded="-Dblas_threaded=$blas_threaded"
54 |     fi
55 |     if [[ $BLAS_LIBRARIES != "" ]]; then
56 |         export cmake_blas_libraries="-DBLAS_LIBRARIES=$BLAS_LIBRARIES"
57 |     fi
58 |     if [[ $bla_vendor != "" ]]; then
59 |         unset cmake_blas
60 |         unset cmake_blas_int
61 |         unset cmake_blas_threaded
62 |         unset cmake_blas_libraries
63 |         export cmake_bla_vendor="-DBLA_VENDOR=$bla_vendor"
64 |     fi
65 | 
66 |     # cmake_blas_libraries can have spaces, so quote it; :+ drops it when empty.
67 |     cmake -Dcolor=no \
68 |           -DCMAKE_INSTALL_PREFIX=${top}/install \
69 |           $cmake_blas $cmake_blas_int $cmake_blas_threaded \
70 |           ${cmake_blas_libraries:+"$cmake_blas_libraries"} $cmake_bla_vendor \
71 |           -Dgpu_backend=${gpu_backend} .. \
72 |           || exit 12
73 | fi
74 | 
75 | cat include/blas/defines.h
76 | 
77 | print "======================================== Finished configure"
78 | exit 0
79 | 


--------------------------------------------------------------------------------
/.github/workflows/main.yml:
--------------------------------------------------------------------------------
 1 | # Continuous Testing on every PR.
 2 | name: CT
 3 | 
 4 | # Controls when the workflow will run
 5 | on:
 6 |   # Triggers the workflow on push or pull request events but only for the master branch
 7 |   push:
 8 |     branches: [ master ]
 9 |   pull_request:
10 |     branches: [ master ]
11 | 
12 |   # Allows you to run this workflow manually from the Actions tab
13 |   workflow_dispatch:
14 | 
15 | # A workflow run is made up of one or more jobs that can run sequentially or in parallel
16 | #
17 | #               make        cmake
18 | # cpu           openblas    Intel10_64ilp
19 | # nvidia        mkl-int64   openblas
20 | # amd           blis        mkl-int64
21 | # intel         mkl         mkl
22 | #
23 | jobs:
24 |   icl_blaspp:
25 |     timeout-minutes: 120
26 |     strategy:
27 |       matrix:
28 |         maker:  [make, cmake]
29 |         device: [cpu, gpu_nvidia, gpu_amd, gpu_intel]
30 | 
31 |         include:
32 |             # Add `blas`, `blas_int`, or `bla_vendor` to existing configurations.
33 |             - maker:    make
34 |               device:   cpu
35 |               blas:     openblas
36 | 
37 |             - maker:    cmake
38 |               device:   cpu
39 |               bla_vendor: Intel10_64ilp     # MKL int64 using CMake's FindBLAS
40 | 
41 |             - maker:    make
42 |               device:   gpu_nvidia
43 |               blas:     mkl
44 |               blas_int: int64
45 | 
46 |             - maker:    cmake
47 |               device:   gpu_nvidia
48 |               blas:     openblas
49 | 
50 |             - maker:    make
51 |               device:   gpu_amd
52 |               blas:     blis
53 | 
54 |             - maker:    cmake
55 |               device:   gpu_amd
56 |               blas:     mkl
57 |               blas_int: int64
58 | 
59 |             # For both make, cmake
60 |             - device:   gpu_intel
61 |               blas:     mkl
62 | 
63 |       fail-fast: false
64 |     runs-on: ${{ matrix.device }}
65 |     name: ${{matrix.maker}} ${{matrix.device}} ${{matrix.blas}}${{matrix.bla_vendor}}${{matrix.blas_libraries}} ${{matrix.blas_int}}
66 | 
67 |     # See variants.yml for use of some of these variables.
68 |     env:
69 |         maker:      ${{matrix.maker}}
70 |         device:     ${{matrix.device}}
71 |         check:      ${{matrix.check}}
72 |         blas:       ${{matrix.blas}}
73 |         blas_int:   ${{matrix.blas_int}}
74 |         bla_vendor: ${{matrix.bla_vendor}}
75 |         BLAS_LIBRARIES: ${{matrix.blas_libraries}}
76 | 
77 |     steps:
78 |       - uses: actions/checkout@v4
79 | 
80 |       - name: Configure
81 |         run: .github/workflows/configure.sh
82 | 
83 |       - name: Build
84 |         run: .github/workflows/build.sh
85 | 
86 |       - name: Test
87 |         run: .github/workflows/test.sh
88 | 


--------------------------------------------------------------------------------
/.github/workflows/test.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash -x
 2 | 
 3 | mydir=$(dirname $0)
 4 | source ${mydir}/setup_env.sh
 5 | 
 6 | # Instead of exiting on the first failed test (bash -e),
 7 | # run all the tests and accumulate failures into $err.
 8 | err=0
 9 | 
10 | export OMP_NUM_THREADS=8
11 | 
12 | print "======================================== Tests"
13 | cd test
14 | 
15 | args="--quick"
16 | if [ "${device}" = "gpu_intel" ]; then
17 |     # Our Intel GPU supports only single precision.
18 |     args+=" --type s,c"
19 | fi
20 | 
21 | if [[ $check = "sanity" ]]; then
22 |     echo "Running only sanity checks"
23 |     ./run_tests.py ${args} herk dev-herk
24 |     (( err += $? ))
25 | else
26 |     ./run_tests.py ${args} --blas1 --blas2 --blas3
27 |     (( err += $? ))
28 | 
29 |     ./run_tests.py ${args} --batch-blas3
30 |     (( err += $? ))
31 | 
32 |     # CUDA, HIP, or SYCL. These fail gracefully when GPUs are absent.
33 |     ./run_tests.py ${args} --blas1-device --blas2-device --blas3-device
34 |     (( err += $? ))
35 | 
36 |     ./run_tests.py ${args} --batch-blas3-device
37 |     (( err += $? ))
38 | fi
39 | 
40 | print "======================================== Smoke tests"
41 | cd ${top}/examples
42 | 
43 | # Makefile or CMakeLists.txt picks up ${test_args}.
44 | if [ "${device}" = "gpu_intel" ]; then
45 |     # Our Intel GPU supports only single precision.
46 |     export test_args="s c"
47 | else
48 |     export test_args="s d c z"
49 | fi
50 | 
51 | if [ "${maker}" = "make" ]; then
52 |     export PKG_CONFIG_PATH+=:${top}/install/lib/pkgconfig
53 |     make clean || exit 20
54 | 
55 | elif [ "${maker}" = "cmake" ]; then
56 |     rm -rf build && mkdir build && cd build
57 |     cmake "-DCMAKE_PREFIX_PATH=${top}/install" .. || exit 30
58 | fi
59 | 
60 | # ARGS=-V (on `make test` below) causes CTest to print output. Makefile doesn't use it.
61 | make -j8 || exit 40
62 | make test ARGS=-V
63 | (( err += $? ))
64 | 
65 | print "======================================== Finished test"
66 | exit ${err}
67 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | *.bak
 2 | *.d
 3 | *.mod
 4 | *.o
 5 | *.pyc
 6 | *.svg
 7 | .DS_Store
 8 | .id
 9 | __pycache__
10 | blaspp-*
11 | build*
12 | config/blas
13 | config/blis_version
14 | config/cblas
15 | config/compiler_cxx
16 | config/cublas
17 | config/essl_version
18 | config/hello
19 | config/lapack_potrf
20 | config/lapack_pstrf
21 | config/log.txt
22 | config/mkl_version
23 | config/onemkl
24 | config/openblas_version
25 | config/openmp
26 | config/return_complex
27 | config/return_complex_argument
28 | config/return_float
29 | config/return_float_f2c
30 | config/rocblas
31 | docs/doxygen/errors.txt
32 | docs/html/
33 | files.txt
34 | include/blas/defines.h
35 | issues/
36 | lib/*.a
37 | lib/*.so
38 | lib/pkgconfig/*.pc
39 | make.inc
40 | test/tester
41 | wiki/
42 | 


--------------------------------------------------------------------------------
/CHANGELOG.md:
--------------------------------------------------------------------------------
  1 | 2025.05.28 (ABI 2.0.0)
  2 |   - Added Level 1 BLAS GPU wrappers
  3 |   - Added is_complex_v
  4 |   - Added support for BLIS and libFLAME (hence AOCL)
  5 |   - Removed support for ACML
  6 |   - Removed deprecated enum, memcpy routines
  7 |   - Moved [cz]symv and [cz]syr to BLAS++ from LAPACK++, since they
  8 |     logically fit in BLAS. Requires linking with an LAPACK library
  9 |   - Tester prints stats with --repeat
 10 |   - Fixed SYCL include path
 11 |   - Fixed testers for n = 0 case
 12 |   - Fixed template gemm for beta = 0 to not propagate NaN/Inf (strong zero)
 13 | 
 14 | 2024.10.26 (ABI 1.0.0)
 15 |   - Added PAPI SDE that counts flops
 16 |   - Use to_blas_int to convert int32 to int64
 17 | 
 18 | 2024.05.31 (ABI 1.0.0)
 19 |   - Added shared library ABI version
 20 |   - Updated enum parameters to have `to_string`, `from_string`;
 21 |     deprecate `<enum>2str`, `str2<enum>`
 22 |   - Removed some deprecated functions. Deprecated MemcpyKind
 23 |   - Added PAPI SDE counters (manually enable for now)
 24 |   - Support ROCm 5.6.0 (rocBLAS 3.0), trmm with 3 matrices (A, B, C)
 25 |   - Fixed bug in her2k with complex alpha and row-major matrix
 26 |   - Fixed bug in example_gemm
 27 |   - Use `sqrt`, etc. without `std::` to enable argument dependent lookup (ADL)
 28 | 
 29 | 2023.11.05
 30 |   - Fix Queue workspace
 31 |   - Update Fortran strlen handling
 32 |   - Fix CMake unity build
 33 |   - Fix CMake library ordering
 34 | 
 35 | 2023.08.25
 36 |   - Use yyyy.mm.dd version scheme, instead of yyyy.mm.release
 37 |   - Added oneAPI support to CMake
 38 |   - Fixed int64 support
 39 |   - More robust Makefile configure doesn't require CUDA or ROCm to be in
 40 |     compiler search paths (CPATH, LIBRARY_PATH, etc.)
 41 | 
 42 | 2023.06.00
 43 |   - Revised Queue class to allow creating Queue from an existing
 44 |     CUDA/HIP stream, cuBLAS/rocBLAS handle, or SYCL queue. Also
 45 |     allocates streams and workspace on demand, to make Queue creation
 46 |     much lighter weight.
 47 |   - Improved oneAPI support
 48 | 
 49 | 2023.01.00
 50 |   - Added oneAPI port (currently Makefile only)
 51 |       - Added queue argument to `device_malloc, device_free`, etc.;
 52 |         deprecated old versions
 53 |       - Deprecated `set_device, get_device`
 54 |       - Renamed `device_malloc_pinned` to `host_malloc_pinned`
 55 |       - Added `device_copy_{matrix,vector}`;
 56 |         deprecated `device_{set,get}{matrix,vector}`
 57 |   - Added more Level 1 BLAS on GPU device: axpy, dot, nrm2
 58 |   - Moved main repo to https://github.com/icl-utk-edu/blaspp/
 59 |   - Refactored routines for better maintainability
 60 |   - Use python3
 61 | 
 62 | 2022.07.00
 63 |   - Added workspace in queue; used in LAPACK++
 64 |   - Set device in memcpy, etc.
 65 |   - Updated Schur gemm test with tile layout
 66 | 
 67 | 2022.05.00
 68 |   - Added Level 3 BLAS template implementations
 69 |   - Added device copy, scal
 70 |   - Added Schur gemm test, batched tile and LAPACK formats
 71 |   - Fixed gbmm flops when rectangular
 72 |   - Fixed CMake when BLAS_LIBRARIES is empty
 73 | 
 74 | 2021.04.01
 75 |   - Fixed bug in `test_trsm_device` for row-major
 76 | 
 77 | 2021.04.00
 78 |   - Added HIP/ROCm support
 79 |   - Added include/blas/defines.h based on configuration
 80 |   - Various bug and CMake fixes
 81 | 
 82 | 2020.10.02
 83 |   - CMake support for including as subdirectory
 84 | 
 85 | 2020.10.01
 86 |   - Fixes: CMake always checks for CBLAS, in case LAPACK++ needs it
 87 | 
 88 | 2020.10.00
 89 |   - Fixes: CMake defines, version, ILP64; remove [cz]symv prototypes
 90 |   - Add `make check`
 91 | 
 92 | 2020.09.00
 93 |   - Clean up namespace
 94 |   - Makefile and CMake improvements
 95 | 
 96 | 2020.08.00
 97 |   - Initial release. Functionality:
 98 |     - Level 1, 2, 3 BLAS for CPU
 99 |     - Level 3 BLAS for GPU
100 |     - Level 3 batched BLAS for CPU and GPU
101 |     - cuBLAS GPU implementation
102 |     - Makefile and CMake build options
103 | 


--------------------------------------------------------------------------------
/GNUmakefile.subdir:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2017-2023, University of Tennessee. All rights reserved.
 2 | # SPDX-License-Identifier: BSD-3-Clause
 3 | # This program is free software: you can redistribute it and/or modify it under
 4 | # the terms of the BSD 3-Clause license. See the accompanying LICENSE file.
 5 | 
 6 | # Subdirectories include this makefile to forward rules to the top level makefile.
 7 | # Define ${top} for where the top level is.
 8 | # Example: src/GNUmakefile:
 9 | #     top = ..
10 | #     include ${top}/GNUmakefile.subdir
11 | 
12 | .SUFFIXES:
13 | 
14 | pwd     = ${shell pwd}
15 | abs_top = ${abspath ${top}}/
16 | abs_pwd = ${abspath ${pwd}}
17 | cdir    = ${subst ${abs_top},,${abs_pwd}}
18 | 
19 | # ------------------------------------------------------------------------------
20 | ifneq (${MAKECMDGOALS},)
21 | 
22 | # If arguments are given, presumably files like test.o, forward them to top
23 | # with cdir prefix.
24 | # All files are forwarded as one rule, based on first; rest are quietly ignored.
25 | goals   := ${filter-out echo ${DONT_FORWARD}, ${MAKECMDGOALS}}
26 | forward := ${addprefix ${cdir}/, ${goals}}
27 | first   := ${firstword ${goals}}
28 | rest    := ${wordlist 2, ${words ${goals}}, ${goals}}
29 | 
30 | ${first}: force
31 | 	cd ${top} && ${MAKE} ${forward}
32 | 
33 | ${rest}: force
34 | 	@echo > /dev/null
35 | 
36 | # ------------------------------------------------------------------------------
37 | else
38 | 
39 | # Otherwise, forward subdirectory name as target.
40 | .PHONY: ${cdir}
41 | 
42 | ${cdir}:
43 | 	cd ${top} && ${MAKE} $@
44 | 
45 | endif
46 | # ------------------------------------------------------------------------------
47 | 
48 | force: ;
49 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | Copyright (c) 2017-2023, University of Tennessee. All rights reserved.
 2 | 
 3 | Redistribution and use in source and binary forms, with or without
 4 | modification, are permitted provided that the following conditions are met:
 5 |     * Redistributions of source code must retain the above copyright
 6 |       notice, this list of conditions and the following disclaimer.
 7 |     * Redistributions in binary form must reproduce the above copyright
 8 |       notice, this list of conditions and the following disclaimer in the
 9 |       documentation and/or other materials provided with the distribution.
10 |     * Neither the name of the University of Tennessee nor the
11 |       names of its contributors may be used to endorse or promote products
12 |       derived from this software without specific prior written permission.
13 | 
14 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
15 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
16 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
17 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE
18 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
20 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
21 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR
22 | TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
23 | THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
24 | 


--------------------------------------------------------------------------------
/blasppConfig.cmake.in:
--------------------------------------------------------------------------------
 1 | cmake_minimum_required( VERSION 3.15 )
 2 | 
 3 | set( blaspp_use_openmp "@blaspp_use_openmp@" )
 4 | set( blaspp_use_cuda   "@blaspp_use_cuda@" )
 5 | set( blaspp_use_hip    "@blaspp_use_hip@" )
 6 | set( blaspp_use_sycl   "@blaspp_use_sycl@" )
 7 | set( blaspp_int        "@blaspp_int@" )
 8 | 
 9 | include( CMakeFindDependencyMacro )
10 | if (blaspp_use_openmp)
11 |     find_dependency( OpenMP )
12 | endif()
13 | 
14 | if (blaspp_use_cuda)
15 |     find_dependency( CUDAToolkit )
16 | endif()
17 | 
18 | if (blaspp_use_hip)
19 |     find_dependency( rocblas )
20 | endif()
21 | 
22 | # Export private variables used in LAPACK++.
23 | set( blaspp_defines         "@blaspp_defines@" )
24 | set( blaspp_libraries       "@blaspp_libraries@" )
25 | 
26 | set( blaspp_cblas_found     "@blaspp_cblas_found@" )
27 | set( blaspp_cblas_include   "@blaspp_cblas_include@" )
28 | set( blaspp_cblas_libraries "@blaspp_cblas_libraries@" )
29 | 
30 | include( "${CMAKE_CURRENT_LIST_DIR}/blasppTargets.cmake" )
31 | 


--------------------------------------------------------------------------------
/cmake/CBLASConfig.cmake:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2017-2023, University of Tennessee. All rights reserved.
 2 | # SPDX-License-Identifier: BSD-3-Clause
 3 | # This program is free software: you can redistribute it and/or modify it under
 4 | # the terms of the BSD 3-Clause license. See the accompanying LICENSE file.
 5 | 
 6 | include( "cmake/util.cmake" )
 7 | 
 8 | # Check if this file has already been run with these settings (see bottom).
 9 | set( run_ true )
10 | if (DEFINED cblas_config_cache
11 |     AND "${cblas_config_cache}" STREQUAL "${BLAS_LIBRARIES}")
12 | 
13 |     message( DEBUG "CBLAS config already done for '${BLAS_LIBRARIES}'" )
14 |     set( run_ false )
15 | endif()
16 | 
17 | #===============================================================================
18 | # Matching endif at bottom.
19 | if (run_)
20 | 
21 | #----------------------------------------
22 | # Apple puts cblas.h in weird places. If we can't find it,
23 | # use Accelerate/Accelerate.h, but that had issues compiling with g++. <sigh>
24 | if ("${blaspp_defs_}" MATCHES "HAVE_ACCELERATE")
25 |     set( dir_list
26 |         "/System/Library/Frameworks/Accelerate.framework/Frameworks/vecLib.framework/Headers"
27 |         "/Applications/Xcode.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk/System/Library/Frameworks/Accelerate.framework/Versions/A/Frameworks/vecLib.framework/Headers"
28 |     )
29 |     foreach (dir IN LISTS dir_list)
30 |         if (EXISTS "${dir}/cblas.h")
31 |             set( blaspp_cblas_include "${dir}" )
32 |             list( APPEND blaspp_defs_ "-DBLAS_HAVE_ACCELERATE_CBLAS_H" )
33 |             break()
34 |         endif()
35 |     endforeach()
36 | endif()
37 | 
38 | #-------------------------------------------------------------------------------
39 | set( lib_list ";-lcblas" )
40 | message( DEBUG "lib_list ${lib_list}" )
41 | 
42 | foreach (lib IN LISTS lib_list)
43 |     message( STATUS "Checking for CBLAS library ${lib}" )
44 | 
45 |     try_run(
46 |         run_result compile_result ${CMAKE_CURRENT_BINARY_DIR}
47 |         SOURCES
48 |             "${CMAKE_CURRENT_SOURCE_DIR}/config/cblas.cc"
49 |         LINK_LIBRARIES
50 |             ${lib} ${BLAS_LIBRARIES} ${openmp_lib} # not "..." quoted; screws up OpenMP
51 |         COMPILE_DEFINITIONS
52 |             ${blaspp_defs_}
53 |         CMAKE_FLAGS
54 |             "-DINCLUDE_DIRECTORIES=${blaspp_cblas_include}"
55 |         COMPILE_OUTPUT_VARIABLE
56 |             compile_output
57 |         RUN_OUTPUT_VARIABLE
58 |             run_output
59 |     )
60 |     # For cross-compiling, assume if it links, the run is okay.
61 |     if (CMAKE_CROSSCOMPILING AND compile_result)
62 |         message( DEBUG "cross: cblas" )
63 |         set( run_result "0"  CACHE STRING "" FORCE )
64 |         set( run_output "ok" CACHE STRING "" FORCE )
65 |     endif()
66 |     debug_try_run( "cblas.cc" "${compile_result}" "${compile_output}"
67 |                               "${run_result}" "${run_output}" )
68 | 
69 |     if (compile_result AND "${run_output}" MATCHES "ok")
70 |         list( APPEND blaspp_defs_ "-DBLAS_HAVE_CBLAS" )
71 |         set( blaspp_cblas_libraries "${lib}" CACHE INTERNAL "" )
72 |         set( blaspp_cblas_found true CACHE INTERNAL "" )
73 |         break()
74 |     endif()
75 | endforeach()
76 | 
77 | endif() # run_
78 | #===============================================================================
79 | 
80 | # Mark as already run (see top).
81 | set( cblas_config_cache "${BLAS_LIBRARIES}" CACHE INTERNAL "" )
82 | 
83 | #-------------------------------------------------------------------------------
84 | if (blaspp_cblas_found)
85 |     message( "${blue}   Found CBLAS library ${blaspp_cblas_libraries}${plain}" )
86 | else()
87 |     message( "${red}   CBLAS library not found. Tester cannot be built.${plain}" )
88 | endif()
89 | 
90 | message( DEBUG "
91 | blaspp_cblas_found     = '${blaspp_cblas_found}'
92 | blaspp_cblas_libraries = '${blaspp_cblas_libraries}'
93 | blaspp_defs_           = '${blaspp_defs_}'
94 | ")
95 | 


--------------------------------------------------------------------------------
/cmake/config.cmake:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2017-2024, University of Tennessee. All rights reserved.
 2 | # SPDX-License-Identifier: BSD-3-Clause
 3 | # This program is free software: you can redistribute it and/or modify it under
 4 | # the terms of the BSD 3-Clause license. See the accompanying LICENSE file.
 5 | 
 6 | #-------------------------------------------------------------------------------
 7 | # Tests whether using `std::atomic` requires linking with `-latomic`
 8 | # for 64-bit values, which is the case on some 32-bit systems.
 9 | # Sets variable `libatomic_required` (true or false) in the caller's scope.
10 | #
11 | function( check_libatomic )
12 |     message( STATUS "Checking whether std::atomic requires libatomic" )
13 |     set( libatomic_required false PARENT_SCOPE )
14 | 
15 |     try_compile(
16 |         link_result ${CMAKE_CURRENT_BINARY_DIR}
17 |         SOURCES
18 |             "${CMAKE_CURRENT_SOURCE_DIR}/config/std_atomic.cc"
19 |         OUTPUT_VARIABLE
20 |             link_output
21 |     )
22 |     debug_try_compile( "std_atomic.cc" "${link_result}" "${link_output}" )
23 | 
24 |     set( label "   std::atomic links without -latomic" )
25 |     pad_string( "${label}" 50 label )
26 |     if (link_result)
27 |         message( "${label} ${blue} yes${plain}" )
28 |     else()
29 |         message( "${label} ${red} no${plain}" )
30 |         # Retry the same test program, this time linking with -latomic.
31 |         try_compile(
32 |             link_result ${CMAKE_CURRENT_BINARY_DIR}
33 |             SOURCES
34 |                 "${CMAKE_CURRENT_SOURCE_DIR}/config/std_atomic.cc"
35 |             LINK_LIBRARIES
36 |                 "-latomic"
37 |             OUTPUT_VARIABLE
38 |                 link_output
39 |         )
40 |         debug_try_compile( "std_atomic.cc" "${link_result}" "${link_output}" )
41 | 
42 |         set( label "   std::atomic requires -latomic" )
43 |         pad_string( "${label}" 50 label )
44 |         if (link_result)
45 |             # PARENT_SCOPE is required: a plain set() is local to this function.
46 |             message( "${label} ${blue} yes${plain}" )
47 |             set( libatomic_required true PARENT_SCOPE )
48 |         else()
49 |             message( "${label} ${red} failed; cannot compile libatomic test${plain}" )
50 |         endif()
51 |     endif()
52 | endfunction()
53 | 


--------------------------------------------------------------------------------
/cmake/util.cmake:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2017-2023, University of Tennessee. All rights reserved.
 2 | # SPDX-License-Identifier: BSD-3-Clause
 3 | # This program is free software: you can redistribute it and/or modify it under
 4 | # the terms of the BSD 3-Clause license. See the accompanying LICENSE file.
 5 | 
 6 | if (color)  # ANSI escape variables are set only when `color` is true; otherwise they are not set here
 7 |     string( ASCII 27 Esc )  # Esc = the escape character (ASCII 27)
 8 |     set( ansi_reset    "${Esc}[0m"  )
 9 |     set( bold          "${Esc}[1m"  )
10 |     set( not_bold      "${Esc}[22m" )  # "normal"
11 |     set( italic        "${Esc}[3m"  )
12 |     set( not_italic    "${Esc}[23m" )
13 | 
14 |     set( black         "${Esc}[30m" )
15 |     set( red           "${Esc}[31m" )
16 |     set( green         "${Esc}[32m" )
17 |     set( yellow        "${Esc}[33m" )
18 |     set( blue          "${Esc}[34m" )
19 |     set( magenta       "${Esc}[35m" )
20 |     set( cyan          "${Esc}[36m" )
21 |     set( gray          "${Esc}[37m" )
22 |     set( default_color "${Esc}[39m" )
23 |     set( plain         "${Esc}[39m" )  # same escape code as default_color
24 | endif()
25 | 
26 | #-------------------------------------------------------------------------------
27 | # pad_string( input length output_variable )
28 | # Adds spaces to input up to length and saves to output_variable.
29 | #
30 | function( pad_string input length output_variable )
31 |     string( LENGTH "${input}" input_len )
32 |     set( padding "" )  # stays empty when input already fills the width
33 |     if (input_len LESS length)
34 |         math( EXPR n_spaces "${length} - ${input_len}" )
35 |         string( REPEAT " " ${n_spaces} padding )
36 |     endif()
37 |     set( ${output_variable} "${input}${padding}" PARENT_SCOPE )
38 | endfunction()
39 | 
40 | #-------------------------------------------------------------------------------
41 | # debug_try_compile( msg compile_result compile_output )
42 | # Prints compile_result at log level DEBUG (5);
43 | #        compile_output at log level TRACE (6).
44 | #
45 | function( debug_try_compile msg compile_result compile_output )
46 |     message( DEBUG "${msg}: compile_result '${compile_result}'" )  # short status line
47 |     message( TRACE "compile_output: <<<\n${compile_output}>>>" )  # captured output, delimited by <<< >>>
48 | endfunction()
49 | 
50 | #-------------------------------------------------------------------------------
51 | # debug_try_run( msg compile_result compile_output run_result run_output )
52 | # Prints {compile,run}_result at log level DEBUG (5);
53 | #        {compile,run}_output at log level TRACE (6).
54 | #
55 | function( debug_try_run msg compile_result compile_output run_result run_output )
56 |     message( DEBUG "${msg}: compile_result '${compile_result}', run_result '${run_result}'" )  # short status line
57 |     message( TRACE "compile_output: '''\n${compile_output}'''" )  # captured compile output
58 |     message( TRACE "run_output: '''\n${run_output}'''" )  # captured program output
59 | endfunction()
60 | 
61 | #-------------------------------------------------------------------------------
62 | # assert( condition )
63 | # Aborts if condition is not true. Condition is evaluated inside an `if`,
64 | # so it can have boolean operators like EQUAL:
65 | #     assert( x EQUAL 2 )
66 | #
67 | macro( assert )  # a macro, so the condition is evaluated in the caller's scope
68 |     if (NOT (${ARGN}))
69 |         message( FATAL_ERROR "\n${red}Assertion failed: ${ARGN}${default_color}\n" )  # condition tokens, ';'-separated
70 |     endif()
71 | endmacro()
72 | 
73 | #-------------------------------------------------------------------------------
74 | # match( regex str output )
75 | # If str matches regular expression in regex,
76 | # sets output to true, else sets it to false.
77 | #
78 | # Contrast this with: string( REGEX MATCH regex output str ),
79 | # which sets output to the match string itself, which could be false, e.g.,
80 | #     string( REGEX MATCH "(yes|no)" output "no" )
81 | # sets output = 'no' (interpreted as false in CMake), rather than true.
82 | #
83 | # The order of arguments here matches string( COMPARE EQUAL str1 str2 output ),
84 | # rather than string( REGEX MATCH regex output str ).
85 | #
86 | function( match regex str output )
87 |     set( matched "false" )  # default unless the regex matches
88 |     if ("${str}" MATCHES "${regex}")
89 |         set( matched "true" )
90 |     endif()
91 |     set( ${output} "${matched}" PARENT_SCOPE )
92 | endfunction()
93 | 


--------------------------------------------------------------------------------
/config/__init__.py:
--------------------------------------------------------------------------------
1 | from .config import *
2 | 


--------------------------------------------------------------------------------
/config/blas.cc:
--------------------------------------------------------------------------------
 1 | // Copyright (c) 2017-2023, University of Tennessee. All rights reserved.
 2 | // SPDX-License-Identifier: BSD-3-Clause
 3 | // This program is free software: you can redistribute it and/or modify it under
 4 | // the terms of the BSD 3-Clause license. See the accompanying LICENSE file.
 5 | 
 6 | #include <stdio.h>
 7 | #include <complex>
 8 | 
 9 | #include "config.h"
10 | 
11 | //------------------------------------------------------------------------------
12 | #define BLAS_ddot FORTRAN_NAME( ddot, DDOT )
13 | 
14 | // result return directly
15 | #ifdef __cplusplus
16 | extern "C"
17 | #endif
18 | double BLAS_ddot(
19 |     const blas_int* n,
20 |     const double* x, const blas_int* incx,
21 |     const double* y, const blas_int* incy );
22 | 
23 | //------------------------------------------------------------------------------
24 | int main()
25 | {
26 |     // n deliberately has two entries. If blas_int is 32-bit while BLAS
27 |     // reads it as 64-bit, BLAS sees n = 0x500000005 and segfaults.
28 |     // If blas_int is 64-bit, BLAS can read it as 32-bit or 64-bit,
29 |     // see n = 5, and pass.
30 |     blas_int n[] = { 5, 5 }, ione = 1;
31 |     double x[] = { 1, 2, 3, 4, 5 };
32 |     double y[] = { 5, 4, 3, 2, 1 };
33 |     for (int k = 0; k != n[0]; ++k) {
34 |         printf( "x[ %d ] = %.1f; y[ %d ] = %.1f\n",
35 |                 k, x[ k ], k, y[ k ] );
36 |     }
37 | 
38 |     // Expected dot product: 1*5 + 2*4 + 3*3 + 4*2 + 5*1 = 35.
39 |     const double dot = BLAS_ddot( n, x, &ione, y, &ione );
40 |     printf( "result = %.1f; should be 35.0\n", dot );
41 | 
42 |     const int status = (dot == 35) ? 0 : 1;
43 |     printf( "%s\n", status == 0 ? "ok" : "failed" );
44 |     return status;
45 | }
46 | 


--------------------------------------------------------------------------------
/config/blis_version.cc:
--------------------------------------------------------------------------------
 1 | // Copyright (c) 2017-2024, University of Tennessee. All rights reserved.
 2 | // SPDX-License-Identifier: BSD-3-Clause
 3 | // This program is free software: you can redistribute it and/or modify it under
 4 | // the terms of the BSD 3-Clause license. See the accompanying LICENSE file.
 5 | 
 6 | #include <stdio.h>
 7 | #include <blis.h>
 8 | 
 9 | int main()
10 | {
11 |     // Print the version string reported by the BLIS library.
12 |     printf( "BLIS_VERSION=%s\n", bli_info_get_version_str() );
13 |     return 0;
14 | }
15 | 


--------------------------------------------------------------------------------
/config/cblas.cc:
--------------------------------------------------------------------------------
 1 | // Copyright (c) 2017-2023, University of Tennessee. All rights reserved.
 2 | // SPDX-License-Identifier: BSD-3-Clause
 3 | // This program is free software: you can redistribute it and/or modify it under
 4 | // the terms of the BSD 3-Clause license. See the accompanying LICENSE file.
 5 | 
 6 | #include <stdio.h>
 7 | 
 8 | #if defined(BLAS_HAVE_MKL)
 9 |     #if defined(BLAS_ILP64) && ! defined(MKL_ILP64)
10 |         #define MKL_ILP64
11 |     #endif
12 |     #include <mkl_cblas.h>
13 | 
14 | #elif defined(BLAS_HAVE_ESSL)
15 |     #if defined(BLAS_ILP64) && ! defined(_ESV6464)
16 |         #define _ESV6464
17 |     #endif
18 |     #include <essl.h>
19 | 
20 | #elif defined(BLAS_HAVE_ACCELERATE)
21 |     // On macOS, the official way to include cblas is via Accelerate.h.
22 |     // Unfortunately with Xcode 10.3 and GNU g++ 9.3, that doesn't compile.
23 |     // If we can find cblas.h, use it, otherwise use Accelerate.h.
24 |     #ifdef BLAS_HAVE_ACCELERATE_CBLAS_H
25 |         #include <cblas.h>
26 |     #else
27 |         #include <Accelerate/Accelerate.h>
28 |     #endif
29 | #else
30 |     #ifdef __cplusplus
31 |         // Some ancient cblas.h don't include extern C. It's okay to nest.
32 |         extern "C" {
33 |         #include <cblas.h>
34 |         }
35 |     #else
36 |         #include <cblas.h>
37 |     #endif
38 | #endif
39 | 
40 | //------------------------------------------------------------------------------
41 | int main()
42 | {
43 |     const int n = 5;
44 |     double x[] = { 1, 2, 3, 4, 5 };
45 |     double y[] = { 5, 4, 3, 2, 1 };
46 |     for (int k = 0; k != n; ++k) {
47 |         printf( "x[ %d ] = %.1f; y[ %d ] = %.1f\n",
48 |                 k, x[ k ], k, y[ k ] );
49 |     }
50 | 
51 |     // Expected dot product: 1*5 + 2*4 + 3*3 + 4*2 + 5*1 = 35.
52 |     const double dot = cblas_ddot( n, x, 1, y, 1 );
53 |     printf( "result = %.1f; should be 35.0\n", dot );
54 | 
55 |     const int status = (dot == 35) ? 0 : 1;
56 |     printf( "%s\n", status == 0 ? "ok" : "failed" );
57 |     return status;
58 | }
59 | 


--------------------------------------------------------------------------------
/config/compiler_cxx.cc:
--------------------------------------------------------------------------------
 1 | // Copyright (c) 2017-2023, University of Tennessee. All rights reserved.
 2 | // SPDX-License-Identifier: BSD-3-Clause
 3 | // This program is free software: you can redistribute it and/or modify it under
 4 | // the terms of the BSD 3-Clause license. See the accompanying LICENSE file.
 5 | 
 6 | #ifdef __cplusplus
 7 |     #include <iostream>
 8 | #else
 9 |     #include <stdio.h>
10 | #endif
11 | 
12 | int main()
13 | {
14 |     // Order of the tests matters: xlc, icpx, and icx must be tested before
15 |     // clang, and clang and icc before gcc (presumably because those
16 |     // compilers also define the macros of the ones tested after them).
17 |     const char* compiler =
18 |     #ifdef __cplusplus
19 |         // IBM's documentation says __IBMCPP__, but xlc -qshowmacros shows
20 |         // __ibmxl_version__, so either one is accepted.
21 |         #if defined(__IBMCPP__) || defined(__ibmxl_version__)
22 |             "xlc++";
23 |         #elif defined(_CRAYC)
24 |             "cray";
25 |         #elif defined(__ICC)
26 |             "icpc";
27 |         #elif defined(__INTEL_LLVM_COMPILER)
28 |             "icpx";
29 |         #elif defined(_MSC_VER)
30 |             "MSC";
31 |         #elif defined(__clang__)
32 |             "clang++";
33 |         #elif defined(__GNUG__)
34 |             "g++";
35 |         #else
36 |             "unknown C++";
37 |         #endif
38 |     #else
39 |         #if defined(__IBMC__) || defined(__ibmxl_version__)
40 |             "xlc";
41 |         #elif defined(_CRAYC)
42 |             "cray";
43 |         #elif defined(__ICC)
44 |             "icc";
45 |         #elif defined(__INTEL_LLVM_COMPILER)
46 |             "icx";
47 |         #elif defined(_MSC_VER)
48 |             "MSC";
49 |         #elif defined(__clang__)
50 |             "clang";
51 |         #elif defined(__GNUC__)
52 |             "gcc";
53 |         #else
54 |             "unknown C";
55 |         #endif
56 |     #endif
57 | 
58 |     #ifdef __cplusplus
59 |         std::cout << compiler << "\n";
60 |     #else
61 |         printf( "%s\n", compiler );
62 |     #endif
63 |     return 0;
64 | }
65 | 


--------------------------------------------------------------------------------
/config/config.h:
--------------------------------------------------------------------------------
 1 | // Copyright (c) 2017-2023, University of Tennessee. All rights reserved.
 2 | // SPDX-License-Identifier: BSD-3-Clause
 3 | // This program is free software: you can redistribute it and/or modify it under
 4 | // the terms of the BSD 3-Clause license. See the accompanying LICENSE file.
 5 | 
 6 | #ifndef CONFIG_H
 7 | #define CONFIG_H
 8 | 
 9 | #include <stdint.h>
10 | 
11 | //------------------------------------------------------------------------------
12 | #if defined(FORTRAN_UPPER)
13 |     #define FORTRAN_NAME( lower, UPPER ) UPPER
14 | #elif defined(FORTRAN_LOWER)
15 |     #define FORTRAN_NAME( lower, UPPER ) lower
16 | #else
17 |     // default is ADD_
18 |     #define FORTRAN_NAME( lower, UPPER ) lower ## _
19 | #endif
20 | 
21 | //------------------------------------------------------------------------------
22 | #if defined(BLAS_ILP64) || defined(LAPACK_ILP64)
23 |     typedef int64_t blas_int;
24 |     typedef int64_t lapack_int;
25 | #else
26 |     typedef int blas_int;
27 |     typedef int lapack_int;
28 | #endif
29 | 
30 | //------------------------------------------------------------------------------
31 | #ifndef BLAS_FORTRAN_STRLEN_END  // default: defined but empty
32 | #define BLAS_FORTRAN_STRLEN_END
33 | #endif
34 | 
35 | #ifndef LAPACK_FORTRAN_STRLEN_END  // default: defined but empty, so `#ifdef` on it is true after this header
36 | #define LAPACK_FORTRAN_STRLEN_END
37 | #endif
38 | 
39 | #endif // CONFIG_H
40 | 


--------------------------------------------------------------------------------
/config/cublas.cc:
--------------------------------------------------------------------------------
  1 | // Copyright (c) 2017-2023, University of Tennessee. All rights reserved.
  2 | // SPDX-License-Identifier: BSD-3-Clause
  3 | // This program is free software: you can redistribute it and/or modify it under
  4 | // the terms of the BSD 3-Clause license. See the accompanying LICENSE file.
  5 | 
  6 | #include <cuda_runtime.h>
  7 | #include <cublas_v2.h>
  8 | 
  9 | #include <stdexcept>
 10 | #include <cassert>
 11 | #include <cstdio>
 12 | 
 13 | //------------------------------------------------------------------------------
 14 | void error_check_( cudaError_t err, const char* file, int line )
 15 | {
 16 |     if (err == cudaSuccess) { return; }
 17 |     // Report the CUDA error with its call site, then terminate.
 18 |     printf( "CUDA error %d: %s at %s:%d\n",
 19 |             err, cudaGetErrorString(err), file, line );
 20 |     exit(1);
 21 | }
 22 | 
 23 | //------------------------------------------------------------------------------
 24 | void error_check_( cublasStatus_t err, const char* file, int line )
 25 | {
 26 |     if (err == CUBLAS_STATUS_SUCCESS) { return; }
 27 |     // Report the cuBLAS status with its call site, then terminate.
 28 |     printf( "cuBLAS error %d at %s:%d\n",
 29 |             err, file, line );
 30 |     exit(1);
 31 | }
 32 | 
 33 | #define error_check( err ) \
 34 |         error_check_( (err), __FILE__, __LINE__ )
 35 | 
 36 | //------------------------------------------------------------------------------
 37 | int main()
 38 | {
 39 |     double alpha = 2, beta = 3;
 40 |     int n = 2;
 41 |     double A[] = { 1, 2, 3, 4 };
 42 |     double B[] = { 5, 4, 3, 2 };
 43 |     double C[] = { 2, 3, 1, 0 };
 44 |     double D[] = { 40, 61, 21, 28 };  // expected C after the gemm below
 45 | 
 46 |     cudaError_t err = cudaSetDevice( 0 );
 47 |     if (err != cudaSuccess) {
 48 |         printf( "cudaSetDevice failed: %s (%d).\n"
 49 |                 "Cannot run on GPU; skipping test.\n",
 50 |                 cudaGetErrorString(err), err );
 51 |         return 0;  // exits with status 0, without printing "ok" or "failed"
 52 |     }
 53 | 
 54 |     double *dA, *dB, *dC;  // device copies of A, B, C
 55 |     error_check(
 56 |         cudaMalloc( &dA, n*n*sizeof(double) ) );
 57 |     error_check(
 58 |         cudaMalloc( &dB, n*n*sizeof(double) ) );
 59 |     error_check(
 60 |         cudaMalloc( &dC, n*n*sizeof(double) ) );
 61 |     assert( dA != nullptr );
 62 |     assert( dB != nullptr );
 63 |     assert( dC != nullptr );
 64 | 
 65 |     // dA = A, dB = B, dC = C
 66 |     error_check(
 67 |         cudaMemcpy( dA, A, n*n*sizeof(double), cudaMemcpyDefault ) );
 68 |     error_check(
 69 |         cudaMemcpy( dB, B, n*n*sizeof(double), cudaMemcpyDefault ) );
 70 |     error_check(
 71 |         cudaMemcpy( dC, C, n*n*sizeof(double), cudaMemcpyDefault ) );
 72 | 
 73 |     // dC = alpha dA dB + beta dC, computed on the device
 74 |     cublasHandle_t handle;
 75 |     error_check(
 76 |         cublasCreate( &handle ) );
 77 |     error_check(
 78 |         cublasDgemm( handle, CUBLAS_OP_N, CUBLAS_OP_N, n, n, n,
 79 |                      &alpha, dA, n, dB, n, &beta, dC, n ) );
 80 |     error_check(
 81 |         cublasDestroy( handle ) );
 82 | 
 83 |     // C = dC (copy the result back to the host)
 84 |     error_check(
 85 |         cudaMemcpy( C, dC, n*n*sizeof(double), cudaMemcpyDefault ) );
 86 | 
 87 |     error_check(
 88 |         cudaFree( dA ) );
 89 |     error_check(
 90 |         cudaFree( dB ) );
 91 |     error_check(
 92 |         cudaFree( dC ) );
 93 | 
 94 |     // verify C == D: sum of absolute differences must be exactly zero
 95 |     double result = 0;
 96 |     for (int i = 0; i < n*n; ++i) {
 97 |         printf( "C[%d] = %.2f, D = %.2f\n", i, C[i], D[i] );
 98 |         result += std::abs( D[i] - C[i] );
 99 |     }
100 |     bool okay = (result == 0);
101 |     printf( "%s\n", okay ? "ok" : "failed" );
102 |     return ! okay;
103 | }
104 | 


--------------------------------------------------------------------------------
/config/essl_version.cc:
--------------------------------------------------------------------------------
 1 | // Copyright (c) 2017-2023, University of Tennessee. All rights reserved.
 2 | // SPDX-License-Identifier: BSD-3-Clause
 3 | // This program is free software: you can redistribute it and/or modify it under
 4 | // the terms of the BSD 3-Clause license. See the accompanying LICENSE file.
 5 | 
 6 | #include <stdio.h>
 7 | #include <essl.h>
 8 | 
 9 | int main()
10 | {
11 |     // Decode the value from iessl() as decimal fields VVRRMMPP: two digits
12 |     // each for release, modification, and PTF; the rest is the version.
13 |     const int packed = iessl();
14 |     printf( "ESSL_VERSION=%d.%d.%d.%d\n",
15 |             packed / 1000000,
16 |             (packed / 10000) % 100,
17 |             (packed / 100) % 100,
18 |             packed % 100 );
19 |     return 0;
20 | }
21 | 


--------------------------------------------------------------------------------
/config/hello.cc:
--------------------------------------------------------------------------------
 1 | // Copyright (c) 2017-2023, University of Tennessee. All rights reserved.
 2 | // SPDX-License-Identifier: BSD-3-Clause
 3 | // This program is free software: you can redistribute it and/or modify it under
 4 | // the terms of the BSD 3-Clause license. See the accompanying LICENSE file.
 5 | 
 6 | #include <stdio.h>
 7 | 
 8 | int main()
 9 | {
10 |     fputs( "ok\n", stdout );  // print "ok" and exit successfully
11 |     return 0;
12 | }
13 | 


--------------------------------------------------------------------------------
/config/lapack_potrf.cc:
--------------------------------------------------------------------------------
 1 | // Copyright (c) 2017-2023, University of Tennessee. All rights reserved.
 2 | // SPDX-License-Identifier: BSD-3-Clause
 3 | // This program is free software: you can redistribute it and/or modify it under
 4 | // the terms of the BSD 3-Clause license. See the accompanying LICENSE file.
 5 | 
 6 | #include <stdio.h>
 7 | #include <complex>
 8 | 
 9 | #include "config.h"
10 | 
11 | #define LAPACK_dpotrf_base FORTRAN_NAME( dpotrf, DPOTRF )
12 | 
13 | #ifdef __cplusplus
14 | extern "C"
15 | #endif
16 | void LAPACK_dpotrf_base(
17 |     const char* uplo, const lapack_int* n,
18 |     double* A, const lapack_int* lda,
19 |     lapack_int* info
20 |     #ifdef LAPACK_FORTRAN_STRLEN_END
21 |     , size_t uplo_len
22 |     #endif
23 |     );
24 | 
25 | #ifdef LAPACK_FORTRAN_STRLEN_END
26 |     #define LAPACK_dpotrf( ... ) LAPACK_dpotrf_base( __VA_ARGS__, 1 )
27 | #else
28 |     #define LAPACK_dpotrf( ... ) LAPACK_dpotrf_base( __VA_ARGS__ )
29 | #endif
30 | 
31 | //------------------------------------------------------------------------------
32 | int main()
33 | {
34 |     // n deliberately has two entries. If lapack_int is 32-bit while LAPACK
35 |     // reads it as 64-bit, LAPACK sees n = 0x500000005 and segfaults.
36 |     // If lapack_int is 64-bit, LAPACK can read it as 32-bit or 64-bit,
37 |     // see n = 5, and pass.
38 |     lapack_int n[] = { 5, 5 };
39 |     // symmetric positive definite A = L L^T, with exact L.
40 |     // -1 values in upper triangle (viewed column-major) are not referenced.
41 |     double A[] = {
42 |         4,  2,  0,  0,  0,
43 |        -1,  5,  2,  0,  0,
44 |        -1, -1,  5,  2,  0,
45 |        -1, -1, -1,  5,  2,
46 |        -1, -1, -1, -1,  5
47 |     };
48 |     double L[] = {
49 |          2,  1,  0,  0,  0,
50 |         -1,  2,  1,  0,  0,
51 |         -1, -1,  2,  1,  0,
52 |         -1, -1, -1,  2,  1,
53 |         -1, -1, -1, -1,  2
54 |     };
55 |     lapack_int info = -1;
56 |     LAPACK_dpotrf( "lower", n, A, n, &info );
57 |     int mismatches = (info == 0) ? 0 : 1;  // a nonzero info counts as failure
58 |     for (int k = 0; k != 5*5; ++k) {
59 |         if (A[k] != L[k]) { ++mismatches; }
60 |     }
61 |     printf( "%s\n", mismatches == 0 ? "ok" : "failed" );
62 |     return mismatches != 0;
63 | }
64 | 


--------------------------------------------------------------------------------
/config/lapack_pstrf.cc:
--------------------------------------------------------------------------------
 1 | // Copyright (c) 2017-2023, University of Tennessee. All rights reserved.
 2 | // SPDX-License-Identifier: BSD-3-Clause
 3 | // This program is free software: you can redistribute it and/or modify it under
 4 | // the terms of the BSD 3-Clause license. See the accompanying LICENSE file.
 5 | 
 6 | #include <stdio.h>
 7 | #include <complex>
 8 | 
 9 | #include "config.h"
10 | 
11 | #define LAPACK_dpstrf_base FORTRAN_NAME( dpstrf, DPSTRF )
12 | 
13 | #ifdef __cplusplus
14 | extern "C"
15 | #endif
16 | void LAPACK_dpstrf_base(
17 |     const char* uplo, const lapack_int* n,
18 |     double* A, const lapack_int* lda,
19 |     lapack_int* ipiv, lapack_int* rank,
20 |     const double* tol,
21 |     double* work,
22 |     lapack_int* info
23 |     #ifdef LAPACK_FORTRAN_STRLEN_END
24 |     , size_t uplo_len
25 |     #endif
26 |     );
27 | 
28 | #ifdef LAPACK_FORTRAN_STRLEN_END
29 |     #define LAPACK_dpstrf( ... ) LAPACK_dpstrf_base( __VA_ARGS__, 1 )
30 | #else
31 |     #define LAPACK_dpstrf( ... ) LAPACK_dpstrf_base( __VA_ARGS__ )
32 | #endif
33 | 
34 | //------------------------------------------------------------------------------
35 | int main()
36 | {
37 |     // n deliberately has two entries. If lapack_int is 32-bit while LAPACK
38 |     // reads it as 64-bit, LAPACK sees n = 0x500000005 and segfaults.
39 |     // If lapack_int is 64-bit, LAPACK can read it as 32-bit or 64-bit,
40 |     // see n = 5, and pass.
41 |     lapack_int n[] = { 5, 5 };
42 |     // symmetric positive definite A = L L^T.
43 |     // -1 values in upper triangle (viewed column-major) are not referenced.
44 |     double A[] = {
45 |         4,  2,  0,  0,  0,
46 |        -1,  5,  2,  0,  0,
47 |        -1, -1,  5,  2,  0,
48 |        -1, -1, -1,  5,  2,
49 |        -1, -1, -1, -1,  5
50 |     };
51 |     lapack_int ipiv[5] = { -1, -1, -1, -1, -1 };
52 |     lapack_int rank = -1;
53 |     double tol = -1;
54 |     double work[2*5];
55 |     lapack_int info = -1;
56 |     // pstrf pivots, so P^T A P = L2 L2^T and there is no exact L2 to
57 |     // compare against; only info and rank are checked.
58 |     LAPACK_dpstrf( "lower", n, A, n, ipiv, &rank, &tol, work, &info );
59 |     const int status = (info != 0 || rank != 5) ? 1 : 0;
60 |     printf( "%s\n", status == 0 ? "ok" : "failed" );
61 |     return status;
62 | }
63 | 


--------------------------------------------------------------------------------
/config/mkl_version.cc:
--------------------------------------------------------------------------------
 1 | // Copyright (c) 2017-2023, University of Tennessee. All rights reserved.
 2 | // SPDX-License-Identifier: BSD-3-Clause
 3 | // This program is free software: you can redistribute it and/or modify it under
 4 | // the terms of the BSD 3-Clause license. See the accompanying LICENSE file.
 5 | 
 6 | #include <stdio.h>
 7 | #include <mkl.h>
 8 | 
 9 | int main()
10 | {
11 |     // Query MKL for its version and print it as major.minor.update.
12 |     MKLVersion ver;
13 |     MKL_Get_Version( &ver );
14 |     printf( "MKL_VERSION=%d.%d.%d\n", ver.MajorVersion, ver.MinorVersion, ver.UpdateVersion );
15 |     return 0;
16 | }
17 | 


--------------------------------------------------------------------------------
/config/onemkl.cc:
--------------------------------------------------------------------------------
 1 | // Copyright (c) 2017-2023, University of Tennessee. All rights reserved.
 2 | // SPDX-License-Identifier: BSD-3-Clause
 3 | // This program is free software: you can redistribute it and/or modify it under
 4 | // the terms of the BSD 3-Clause license. See the accompanying LICENSE file.
 5 | 
 6 | #include <sycl/detail/cl.h>
 7 | #include <sycl/sycl.hpp>
 8 | #include <oneapi/mkl.hpp>
 9 | 
10 | #include <stdexcept>
11 | #include <cassert>
12 | #include <cstdio>
13 | 
14 | //------------------------------------------------------------------------------
15 | int main()
16 | {
17 |   try {
18 |     double alpha = 2, beta = 3;
19 |     int n = 2;
20 |     double A[] = { 1, 2, 3, 4 };
21 |     double B[] = { 5, 4, 3, 2 };
22 |     double C[] = { 2, 3, 1, 0 };
23 |     double D[] = { 40, 61, 21, 28 };  // expected C after the gemm below
24 | 
25 |     // enumerate GPU devices across all platforms
26 |     std::vector< sycl::device > devices;
27 |     auto platforms = sycl::platform::get_platforms();
28 |     for (auto& platform : platforms) {
29 |         auto all_devices = platform.get_devices();
30 |         for (auto& device : all_devices) {
31 |             if (device.is_gpu()) {
32 |                 devices.push_back( device );
33 |             }
34 |         }
35 |     }
36 |     if (devices.size() == 0) {
37 |         printf( "no sycl GPU devices\n" );
38 |         return -1;
39 |     }
40 | 
41 |     sycl::queue queue( devices[0] );
42 | 
43 |     double* dA = (double*) sycl::malloc_shared( n*n*sizeof(double), queue );
44 |     double* dB = (double*) sycl::malloc_shared( n*n*sizeof(double), queue );
45 |     double* dC = (double*) sycl::malloc_shared( n*n*sizeof(double), queue );
46 | 
47 |     // dA = A, dB = B, dC = C. queue.memcpy is asynchronous, so wait on each.
48 |     queue.memcpy( dA, A, n*n*sizeof(double) ).wait();
49 |     queue.memcpy( dB, B, n*n*sizeof(double) ).wait();
50 |     queue.memcpy( dC, C, n*n*sizeof(double) ).wait();
51 | 
52 |     // dC = alpha dA dB + beta dC
53 |     oneapi::mkl::blas::gemm(
54 |         queue,
55 |         oneapi::mkl::transpose::N, oneapi::mkl::transpose::N,
56 |         n, n, n,
57 |         alpha, dA, n, dB, n, beta, dC, n );
58 |     queue.wait();  // gemm must finish before dC is read back
59 | 
60 |     // C = dC; memcpy takes the destination first, then the source.
61 |     queue.memcpy( C, dC, n*n*sizeof(double) ).wait();
62 | 
63 |     sycl::free( dA, queue );
64 |     sycl::free( dB, queue );
65 |     sycl::free( dC, queue );
66 | 
67 |     // verify C == D: sum of absolute differences must be exactly zero
68 |     double result = 0;
69 |     for (int i = 0; i < n*n; ++i) {
70 |         printf( "C[%d] = %.2f, D = %.2f\n", i, C[i], D[i] );
71 |         result += std::abs( D[i] - C[i] );
72 |     }
73 |     bool okay = (result == 0);
74 |     printf( "%s\n", okay ? "ok" : "failed" );
75 |     return ! okay;
76 |   }
77 |   catch (...) {
78 |       // any exception from SYCL or oneMKL counts as a failed probe
79 |       printf( "caught error\n" );
80 |       return -2;
81 |   }
82 | }
82 | 


--------------------------------------------------------------------------------
/config/openblas_version.cc:
--------------------------------------------------------------------------------
 1 | // Copyright (c) 2017-2023, University of Tennessee. All rights reserved.
 2 | // SPDX-License-Identifier: BSD-3-Clause
 3 | // This program is free software: you can redistribute it and/or modify it under
 4 | // the terms of the BSD 3-Clause license. See the accompanying LICENSE file.
 5 | 
 6 | #include <stdio.h>
 7 | #include <cblas.h> // openblas_get_config
 8 | 
 9 | int main()
10 | {
11 |     // OPENBLAS_VERSION is defined in the header, so printing it may work
12 |     // even if we don't link with openblas.
13 |     printf( "OPENBLAS_VERSION=%s\n", OPENBLAS_VERSION );
14 | 
15 |     // Calling an OpenBLAS-specific function ensures we are linking
16 |     // with OpenBLAS.
17 |     const char* cfg = openblas_get_config();
18 |     printf( "openblas_get_config=%s\n", cfg );
19 | 
20 |     return 0;
21 | }
22 | 


--------------------------------------------------------------------------------
/config/openmp.cc:
--------------------------------------------------------------------------------
 1 | // Copyright (c) 2017-2023, University of Tennessee. All rights reserved.
 2 | // SPDX-License-Identifier: BSD-3-Clause
 3 | // This program is free software: you can redistribute it and/or modify it under
 4 | // the terms of the BSD 3-Clause license. See the accompanying LICENSE file.
 5 | 
 6 | #include <omp.h>
 7 | #include <stdio.h>
 8 | 
 9 | int main()
10 | {
11 |     // nthreads and tid are declared inside the parallel region so each
12 |     // thread has private copies; shared ones would be a data race.
13 |     #pragma omp parallel
14 |     {
15 |         int nthreads = omp_get_max_threads();
16 |         int tid = omp_get_thread_num();
17 |         printf( "tid %d, nthreads %d\n", tid, nthreads );
18 |     }
19 |     printf( "ok\n" );
20 |     return 0;
21 | }
22 | 


--------------------------------------------------------------------------------
/config/return_complex.cc:
--------------------------------------------------------------------------------
 1 | // Copyright (c) 2017-2023, University of Tennessee. All rights reserved.
 2 | // SPDX-License-Identifier: BSD-3-Clause
 3 | // This program is free software: you can redistribute it and/or modify it under
 4 | // the terms of the BSD 3-Clause license. See the accompanying LICENSE file.
 5 | 
 6 | #include <stdio.h>
 7 | #include <complex>
 8 | 
 9 | // Use C99 _Complex as return type to be compatible with extern C linkage.
10 | #include <complex.h>
11 | 
12 | #include "config.h"
13 | 
14 | //------------------------------------------------------------------------------
15 | #define BLAS_zdotc FORTRAN_NAME( zdotc, ZDOTC )
16 | 
17 | // result return directly
18 | #ifdef __cplusplus
19 | extern "C"
20 | #endif
21 | double _Complex BLAS_zdotc(
22 |     const blas_int* n,
23 |     const std::complex<double>* x, const blas_int* incx,
24 |     const std::complex<double>* y, const blas_int* incy );
25 | 
26 | //------------------------------------------------------------------------------
27 | int main()
28 | {
29 |     blas_int n = 5, ione = 1;
30 |     std::complex<double> x[] = { 1, 2, 3, 4, 5 };
31 |     std::complex<double> y[] = { 5, 4, 3, 2, 1 };
32 |     for (int k = 0; k != n; ++k) {
33 |         printf( "x[ %d ] = %.1f + %.1fi; y[ %d ] = %.1f + %.1fi\n",
34 |                 k, x[ k ].real(), x[ k ].imag(),
35 |                 k, y[ k ].real(), y[ k ].imag() );
36 |     }
37 | 
38 |     // View the C99 complex return value as a std::complex<double>.
39 |     double _Complex raw = BLAS_zdotc( &n, x, &ione, y, &ione );
40 |     std::complex<double> result = *reinterpret_cast< std::complex<double>* >( &raw );
41 |     printf( "result = %.1f + %.1fi; should be 35.0 + 0.0i\n", result.real(), result.imag() );
42 | 
43 |     // Only the real part is compared against the expected value.
44 |     const int status = (result.real() == 35) ? 0 : 1;
45 |     printf( "%s\n", status == 0 ? "ok" : "failed" );
46 |     return status;
47 | }
47 | 


--------------------------------------------------------------------------------
/config/return_complex_argument.cc:
--------------------------------------------------------------------------------
 1 | // Copyright (c) 2017-2023, University of Tennessee. All rights reserved.
 2 | // SPDX-License-Identifier: BSD-3-Clause
 3 | // This program is free software: you can redistribute it and/or modify it under
 4 | // the terms of the BSD 3-Clause license. See the accompanying LICENSE file.
 5 | 
 6 | #include <stdio.h>
 7 | #include <complex>
 8 | 
 9 | #include "config.h"
10 | 
11 | //------------------------------------------------------------------------------
12 | #define BLAS_zdotc FORTRAN_NAME( zdotc, ZDOTC )
13 | 
14 | // Probe variant: the result is written through a *hidden first argument*.
15 | #ifdef __cplusplus
16 | extern "C"
17 | #endif
18 | void BLAS_zdotc(
19 |     std::complex<double>* result,
20 |     const blas_int* n,
21 |     const std::complex<double>* x, const blas_int* incx,
22 |     const std::complex<double>* y, const blas_int* incy );
23 | 
24 | //------------------------------------------------------------------------------
25 | int main()
26 | {
27 |     blas_int n = 5, ione = 1;
28 |     std::complex<double> x[] = { 1, 2, 3, 4, 5 };
29 |     std::complex<double> y[] = { 5, 4, 3, 2, 1 };
30 |     for (int i = 0; i < n; ++i) {
31 |         printf( "x[ %d ] = %.1f + %.1fi; y[ %d ] = %.1f + %.1fi\n",
32 |                 i, real( x[ i ] ), imag( x[ i ] ),
33 |                 i, real( y[ i ] ), imag( y[ i ] ) );
34 |     }
35 |     // Call zdotc through the "hidden result argument" prototype declared above.
36 |     std::complex<double> result;
37 |     BLAS_zdotc( &result, &n, x, &ione, y, &ione );
38 |     printf( "result = %.1f + %.1fi; should be 35.0 + 0.0i\n",
39 |             real( result ), imag( result ) );
40 |     // Only the real part is compared; exit status is 0 when it equals 35.
41 |     bool okay = (real(result) == 35);
42 |     printf( "%s\n", okay ? "ok" : "failed" );
43 |     return ! okay;
44 | }
45 | 


--------------------------------------------------------------------------------
/config/return_float.cc:
--------------------------------------------------------------------------------
 1 | // Copyright (c) 2017-2023, University of Tennessee. All rights reserved.
 2 | // SPDX-License-Identifier: BSD-3-Clause
 3 | // This program is free software: you can redistribute it and/or modify it under
 4 | // the terms of the BSD 3-Clause license. See the accompanying LICENSE file.
 5 | 
 6 | #include <stdio.h>
 7 | 
 8 | #include "config.h"
 9 | 
10 | //------------------------------------------------------------------------------
11 | #define BLAS_sdot FORTRAN_NAME( sdot, SDOT )
12 | 
13 | // Probe variant: sdot is declared as returning *float*.
14 | #ifdef __cplusplus
15 | extern "C"
16 | #endif
17 | float  BLAS_sdot( const blas_int* n,
18 |                   const float* x, const blas_int* incx,
19 |                   const float* y, const blas_int* incy );
20 | 
21 | //------------------------------------------------------------------------------
22 | int main()
23 | {
24 |     blas_int n = 5, ione = 1;
25 |     float x[] = { 1, 2, 3, 4, 5 };
26 |     float y[] = { 5, 4, 3, 2, 1 };
27 |     for (int i = 0; i < n; ++i) {
28 |         printf( "x[ %d ] = %.1f; y[ %d ] = %.1f\n",
29 |                 i, x[ i ],
30 |                 i, y[ i ] );
31 |     }
32 |     // Dot product of x and y; 1*5 + 2*4 + 3*3 + 4*2 + 5*1 = 35.
33 |     float result = BLAS_sdot( &n, x, &ione, y, &ione );
34 |     printf( "result = %.1f; should be 35.0\n", result );
35 |     // Exit status is 0 only when the value read back through this prototype is 35.
36 |     bool okay = (result == 35);
37 |     printf( "%s\n", okay ? "ok" : "failed" );
38 |     return ! okay;
39 | }
40 | 


--------------------------------------------------------------------------------
/config/return_float_f2c.cc:
--------------------------------------------------------------------------------
 1 | // Copyright (c) 2017-2023, University of Tennessee. All rights reserved.
 2 | // SPDX-License-Identifier: BSD-3-Clause
 3 | // This program is free software: you can redistribute it and/or modify it under
 4 | // the terms of the BSD 3-Clause license. See the accompanying LICENSE file.
 5 | 
 6 | #include <stdio.h>
 7 | 
 8 | #include "config.h"
 9 | 
10 | //------------------------------------------------------------------------------
11 | #define BLAS_sdot FORTRAN_NAME( sdot, SDOT )
12 | 
13 | // Probe variant: sdot is declared as returning *double* (f2c-style, per the file name).
14 | #ifdef __cplusplus
15 | extern "C"
16 | #endif
17 | double BLAS_sdot( const blas_int* n,
18 |                   const float* x, const blas_int* incx,
19 |                   const float* y, const blas_int* incy );
20 | 
21 | //------------------------------------------------------------------------------
22 | int main()
23 | {
24 |     blas_int n = 5, ione = 1;
25 |     float x[] = { 1, 2, 3, 4, 5 };
26 |     float y[] = { 5, 4, 3, 2, 1 };
27 |     for (int i = 0; i < n; ++i) {
28 |         printf( "x[ %d ] = %.1f; y[ %d ] = %.1f\n",
29 |                 i, x[ i ],
30 |                 i, y[ i ] );
31 |     }
32 |     // The double returned by this prototype is narrowed to float on assignment.
33 |     float result = BLAS_sdot( &n, x, &ione, y, &ione );
34 |     printf( "result = %.1f; should be 35.0\n", result );
35 |     // Exit status is 0 only when the value read back through this prototype is 35.
36 |     bool okay = (result == 35);
37 |     printf( "%s\n", okay ? "ok" : "failed" );
38 |     return ! okay;
39 | }
40 | 


--------------------------------------------------------------------------------
/config/rocblas.cc:
--------------------------------------------------------------------------------
  1 | // Copyright (c) 2017-2023, University of Tennessee. All rights reserved.
  2 | // SPDX-License-Identifier: BSD-3-Clause
  3 | // This program is free software: you can redistribute it and/or modify it under
  4 | // the terms of the BSD 3-Clause license. See the accompanying LICENSE file.
  5 | 
  6 | #ifndef __HIP_PLATFORM_AMD__
  7 | #define __HIP_PLATFORM_AMD__
  8 | #endif
  9 | 
 10 | #include <hip/hip_runtime.h>
 11 | 
 12 | // Headers moved in ROCm 5.2
 13 | #if HIP_VERSION >= 50200000
 14 |     #include <rocblas/rocblas.h>
 15 | #else
 16 |     #include <rocblas.h>
 17 | #endif
 18 | 
 19 | #include <stdexcept>
 20 | #include <cassert>
 21 | #include <cstdio>
 22 | 
 23 | //------------------------------------------------------------------------------
 24 | void error_check_( hipError_t err, const char* file, int line )
 25 | {
 26 |     if (err == hipSuccess)
 27 |         return;  // nothing to report
 28 |     printf( "HIP error %d: %s at %s:%d\n",
 29 |             err, hipGetErrorString(err), file, line );
 30 |     exit(1);     // any HIP failure aborts the probe
 31 | }
 32 | 
 33 | //------------------------------------------------------------------------------
 34 | void error_check_( rocblas_status err, const char* file, int line )
 35 | {
 36 |     if (err == rocblas_status_success)
 37 |         return;  // nothing to report
 38 |     printf( "rocblas error %d: %s at %s:%d\n",
 39 |             err, rocblas_status_to_string(err), file, line );
 40 |     exit(1);     // any rocBLAS failure aborts the probe
 41 | }
 42 | 
 43 | #define error_check( err ) \
 44 |         error_check_( (err), __FILE__, __LINE__ )
 45 | 
 46 | //------------------------------------------------------------------------------
 47 | int main()
 48 | {
 49 |     double alpha = 2, beta = 3;
 50 |     int n = 2;
 51 |     double A[] = { 1, 2, 3, 4 };
 52 |     double B[] = { 5, 4, 3, 2 };
 53 |     double C[] = { 2, 3, 1, 0 };
 54 |     double D[] = { 40, 61, 21, 28 };  // expected alpha*A*B + beta*C, column-major
 55 | 
 56 |     hipError_t err = hipSetDevice( 0 );
 57 |     if (err != hipSuccess) {
 58 |         printf( "hipSetDevice failed: %s (%d).\n"
 59 |                 "Cannot run on GPU; skipping test.\n",
 60 |                 hipGetErrorString(err), err );
 61 |         return 0;  // exit status 0: the run is skipped rather than reported as failed
 62 |     }
 63 | 
 64 |     double *dA, *dB, *dC;
 65 |     error_check(
 66 |         hipMalloc( &dA, n*n*sizeof(double) ) );
 67 |     error_check(
 68 |         hipMalloc( &dB, n*n*sizeof(double) ) );
 69 |     error_check(
 70 |         hipMalloc( &dC, n*n*sizeof(double) ) );
 71 |     assert( dA != nullptr );
 72 |     assert( dB != nullptr );
 73 |     assert( dC != nullptr );
 74 | 
 75 |     // dA = A, dB = B, dC = C
 76 |     error_check(
 77 |         hipMemcpy( dA, A, n*n*sizeof(double), hipMemcpyDefault ) );
 78 |     error_check(
 79 |         hipMemcpy( dB, B, n*n*sizeof(double), hipMemcpyDefault ) );
 80 |     error_check(
 81 |         hipMemcpy( dC, C, n*n*sizeof(double), hipMemcpyDefault ) );
 82 | 
 83 |     // dC = alpha dA dB + beta dC, computed on the device
 84 |     rocblas_handle handle;
 85 |     error_check(
 86 |         rocblas_create_handle( &handle ) );
 87 |     error_check(
 88 |         rocblas_dgemm( handle, rocblas_operation_none, rocblas_operation_none,
 89 |                        n, n, n,
 90 |                        &alpha, dA, n, dB, n, &beta, dC, n ) );
 91 |     error_check(
 92 |         rocblas_destroy_handle( handle ) );
 93 | 
 94 |     // C = dC
 95 |     error_check(
 96 |         hipMemcpy( C, dC, n*n*sizeof(double), hipMemcpyDefault ) );
 97 | 
 98 |     error_check(
 99 |         hipFree( dA ) );
100 |     error_check(
101 |         hipFree( dB ) );
102 |     error_check(
103 |         hipFree( dC ) );
104 | 
105 |     // verify C == D: accumulate the absolute element-wise differences
106 |     double result = 0;
107 |     for (int i = 0; i < n*n; ++i) {
108 |         printf( "C[%d] = %.2f, D = %.2f\n", i, C[i], D[i] );
109 |         result += std::abs( D[i] - C[i] );
110 |     }
111 |     bool okay = (result == 0);  // exact match required
112 |     printf( "%s\n", okay ? "ok" : "failed" );
113 |     return ! okay;
114 | }
115 | 


--------------------------------------------------------------------------------
/config/std_atomic.cc:
--------------------------------------------------------------------------------
 1 | // Copyright (c) 2017-2024, University of Tennessee. All rights reserved.
 2 | // SPDX-License-Identifier: BSD-3-Clause
 3 | // This program is free software: you can redistribute it and/or modify it under
 4 | // the terms of the BSD 3-Clause license. See the accompanying LICENSE file.
 5 | 
 6 | #include <atomic>
 7 | #include <cstdint>
 8 | 
 9 | int main( int argc, char** argv )
10 | {
11 |     std::atomic<std::int64_t> x( 0 );  // direct-init: `= 0` copy-init is ill-formed before C++17
12 |     for (int i = 1; i < argc; ++i) {
13 |         ++x;  // 64-bit atomic increment; presumably what the libatomic probe exercises
14 |     }
15 |     return static_cast<int>( x.load() );  // exit status = number of command-line arguments
16 | }
17 | 


--------------------------------------------------------------------------------
/configure.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python3
  2 | #
  3 | # Copyright (c) 2017-2023, University of Tennessee. All rights reserved.
  4 | # SPDX-License-Identifier: BSD-3-Clause
  5 | # This program is free software: you can redistribute it and/or modify it under
  6 | # the terms of the BSD 3-Clause license. See the accompanying LICENSE file.
  7 | #
  8 | # Usage: python3 configure.py [--interactive]
  9 | 
 10 | from __future__ import print_function
 11 | 
 12 | import sys
 13 | import re
 14 | import config
 15 | from   config import Error, font, print_msg, print_warn, print_header
 16 | import config.lapack
 17 | 
 18 | #-------------------------------------------------------------------------------
 19 | # header
 20 | 
 21 | print( '-'*80 + '\n' +  # welcome banner, printed at import time before main() runs
 22 | font.bold( font.blue( '                              Welcome to BLAS++.' ) ) +
 23 | '''
 24 | 
 25 | By default, configure will automatically choose the first valid value it finds
 26 | for each option. You can set it to interactive to find all possible values and
 27 | give you a choice:
 28 |     ''' + font.blue( 'make config interactive=1' ) + '''
 29 | 
 30 | If you have multiple compilers, we suggest specifying your desired compiler by
 31 | setting CXX, as the automated search may prefer a different compiler.
 32 | 
 33 | For options, see the `INSTALL.md` file.
 34 | 
 35 | Configure assumes environment variables CPATH, LIBRARY_PATH, and LD_LIBRARY_PATH
 36 | are set so your compiler can find libraries. See INSTALL.md for more details.
 37 | ''' + '-'*80 )
 38 | 
 39 | #-------------------------------------------------------------------------------
 40 | def main():
 41 |     config.init( namespace='BLAS', prefix='/opt/slate' )
 42 |     config.prog_cxx()
 43 | 
 44 |     print_header( 'C++ compiler flags' )
 45 |     # Pick highest level supported. oneAPI needs C++17.
 46 |     # Crusher had issue with -std=c++20 (2022-07).
 47 |     config.prog_cxx_flag(
 48 |         ['-std=c++17', '-std=c++14', '-std=c++11'])
 49 |     config.prog_cxx_flag( '-O2' )
 50 |     config.prog_cxx_flag( '-MMD' )
 51 |     config.prog_cxx_flag( '-Wall' )
 52 |     config.prog_cxx_flag( '-Wno-unused-local-typedefs' )
 53 |     config.prog_cxx_flag( '-Wno-unused-function' )
 54 |    #config.prog_cxx_flag( '-pedantic',  # todo: conflict with ROCm 3.9.0
 55 |    #config.prog_cxx_flag( '-Wshadow',   # todo: conflict with ROCm 3.9.0
 56 |    #config.prog_cxx_flag( '-Wmissing-declarations' )
 57 |    #config.prog_cxx_flag( '-Wconversion' )
 58 |    #config.prog_cxx_flag( '-Werror' )
 59 | 
 60 |     print_header( 'Libraries' )
 61 |     config.libatomic()
 62 | 
 63 |     config.openmp()
 64 | 
 65 |     config.lapack.blas()
 66 |     print()
 67 |     config.lapack.blas_float_return()
 68 |     config.lapack.blas_complex_return()
 69 |     config.lapack.vendor_version()
 70 | 
 71 |     # Must test mkl_version before cblas and lapacke, to define HAVE_MKL.
 72 |     try:
 73 |         config.lapack.cblas()
 74 |     except Error:
 75 |         print_warn( 'BLAS++ needs CBLAS for testers.' )
 76 | 
 77 |     try:
 78 |         config.lapack.lapack()
 79 |     except Error as ex:
 80 |         print_warn( 'BLAS++ requires LAPACK for [cz]rot, [cz]syr, [cz]symv.' )
 81 |         raise( ex )
 82 | 
 83 |     config.gpu_blas()
 84 | 
 85 |     testsweeper = config.get_package(
 86 |         'TestSweeper',
 87 |         ['../testsweeper', './testsweeper'],
 88 |         'https://github.com/icl-utk-edu/testsweeper',
 89 |         'https://github.com/icl-utk-edu/testsweeper/tarball/master',
 90 |         'testsweeper.tar.gz' )
 91 |     if (not testsweeper):
 92 |         print_warn( 'BLAS++ needs TestSweeper for testers.' )
 93 | 
 94 |     config.extract_defines_from_flags( 'CXXFLAGS', 'blaspp_header_defines' )
 95 |     config.output_files( ['make.inc', 'include/blas/defines.h'] )
 96 |     print( 'log in config/log.txt' )
 97 | 
 98 |     print( '-'*80 )
 99 | # end
100 | 
101 | #-------------------------------------------------------------------------------
102 | try:
103 |     main()
104 | except Error as ex:
105 |     print_warn( 'A fatal error occurred. ' + str(ex) +
106 |                 '\nBLAS++ could not be configured. Log in config/log.txt' )
107 |     exit(1)
108 | 


--------------------------------------------------------------------------------
/docs/doxygen/GNUmakefile:
--------------------------------------------------------------------------------
1 | .PHONY: docs
2 | docs:
3 | 	cd ../.. && ${MAKE} docs


--------------------------------------------------------------------------------
/docs/doxygen/errors.pl:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/perl -p
 2 | #
 3 | # Usage: ./errors.pl errors.txt > errors2.txt
 4 | #
 5 | # Removes extraneous errors from Doxygen log.
 6 | 
 7 | s/.* warning: Member \w+\(.*(float|double|std::complex).*\) \(function\) of namespace blas is not documented\.\n//;  # drop warnings for undocumented blas functions whose signature mentions float, double or std::complex
 8 | s/.* warning: Member (real_t|scalar_t) \(typedef\) of class blas::traits\d* is not documented\.\n//;  # drop warnings for the real_t/scalar_t typedefs of blas::traits*
 9 | 
10 | #s/.* warning: Member \w+ \(variable\) of class Params is not documented\.\n//;
11 | #s/.* warning: Compound (blas::traits.*|blas::Error|Params) is not documented\.\n//;
12 | 


--------------------------------------------------------------------------------
/examples/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2017-2023, University of Tennessee. All rights reserved.
 2 | # SPDX-License-Identifier: BSD-3-Clause
 3 | # This program is free software: you can redistribute it and/or modify it under
 4 | # the terms of the BSD 3-Clause license. See the accompanying LICENSE file.
 5 | 
 6 | # CXX compiler must match the one used to compile BLAS++.
 7 | # Set it in your environment.
 8 | 
 9 | cmake_minimum_required( VERSION 3.8 )
10 | 
11 | project(
12 |     blaspp_example
13 |     LANGUAGES CXX
14 | )
15 | 
16 | #-------------------------------------------------------------------------------
17 | # Enforce out-of-source build
18 | string( TOLOWER "${CMAKE_CURRENT_SOURCE_DIR}" source_dir )  # lower-case both paths so the comparison ignores case
19 | string( TOLOWER "${CMAKE_CURRENT_BINARY_DIR}" binary_dir )
20 | if ("${source_dir}" STREQUAL "${binary_dir}")  # building inside the source tree: stop with instructions
21 |     message( FATAL_ERROR
22 |     "Compiling with CMake requires an out-of-source build. To proceed:
23 |     rm -rf CMakeCache.txt CMakeFiles/   # delete files in ${CMAKE_CURRENT_SOURCE_DIR}
24 |     mkdir build
25 |     cd build
26 |     cmake ..
27 |     make" )
28 | endif()
29 | 
30 | #-------------------------------------------------------------------------------
31 | find_package( blaspp REQUIRED )  # installed BLAS++ package; provides the blaspp target
32 | 
33 | #--------------------
34 | add_executable(
35 |     example_gemm
36 |     example_gemm.cc
37 | )
38 | target_link_libraries(
39 |     example_gemm
40 |     PRIVATE blaspp
41 | )
42 | 
43 | #--------------------
44 | add_executable(
45 |     example_util
46 |     example_util.cc
47 | )
48 | target_link_libraries(
49 |     example_util
50 |     PRIVATE blaspp
51 | )
52 | 
53 | #-------------------------------------------------------------------------------
54 | # CTest
55 | 
56 | # Get precisions to test (default: all four). See .github/workflows/test.sh
57 | set( test_args $ENV{test_args} )
58 | if (NOT test_args)
59 |     set( test_args "s d c z" )
60 | endif()
61 | string( REPLACE " " ";" test_args "${test_args}" ) # convert to list; quoted so the value stays one argument
62 | 
63 | enable_testing()
64 | add_test( NAME example_gemm COMMAND example_gemm ${test_args} )  # target name: CMake resolves the executable location
65 | add_test( NAME example_util COMMAND example_util ${test_args} )
66 | 


--------------------------------------------------------------------------------
/examples/Makefile:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2017-2023, University of Tennessee. All rights reserved.
 2 | # SPDX-License-Identifier: BSD-3-Clause
 3 | # This program is free software: you can redistribute it and/or modify it under
 4 | # the terms of the BSD 3-Clause license. See the accompanying LICENSE file.
 5 | 
 6 | # CXX compiler must match the one used to compile BLAS++.
 7 | # Set it in your environment or here.
 8 | # Sadly, pkg-config doesn't provide a way to query CXX,
 9 | # CXXFLAGS (only cflags), CPPFLAGS, or LDFLAGS.
10 | 
11 | #-------------------------------------------------------------------------------
12 | # Set CXXFLAGS and LIBS
13 | pkg_exists := $(shell pkg-config --exists blaspp; echo $$?)
14 | ifeq ($(pkg_exists),0)
15 | 
16 |   # Get flags from pkg-config. `:=` runs each query once instead of on every use.
17 |   CXX      := $(shell pkg-config --variable CXX blaspp)
18 |   CXXFLAGS := $(shell pkg-config --cflags blaspp)
19 |   LIBS     := $(shell pkg-config --libs   blaspp)
20 | 
21 | else
22 |   $(warning WARNING: pkg-config couldn't find blaspp. Using hard-coded flags in Makefile.)
23 | 
24 |   # BLAS++ not in pkg-config.
25 |   # Here's a hard-coded example using OpenBLAS.
26 |   CXXFLAGS = -I/usr/local/blaspp/include -std=c++11
27 |   LIBS     = -L/usr/local/blaspp/lib$(LIB_SUFFIX) -lblaspp -lopenblas
28 | 
29 | endif
30 | 
31 | #-------------------------------------------------------------------------------
32 | # Rules
33 | 
34 | exe = example_gemm example_util
35 | run = ${addsuffix .run, ${exe}}
36 | txt = ${addsuffix .txt, ${exe}}
37 | 
38 | .DELETE_ON_ERROR:
39 | .SECONDARY:
40 | .SUFFIXES:
41 | .DEFAULT_GOAL := all
42 | .PRECIOUS: ${txt}
43 | .PHONY: all check clean echo test txt
44 | # Serialize everything so that `make test` runs in serial.
45 | .NOTPARALLEL:
46 | 
47 | all: ${exe}
48 | 
49 | %: %.o
50 | 	$(CXX) -o $@ $^ $(LIBS)
51 | 
52 | %.o: %.cc
53 | 	$(CXX) $(CXXFLAGS) -c -o $@ $<
54 | 
55 | clean:
56 | 	-rm -f ${exe} ${txt} *.o *.d
57 | 
58 | # CMake uses `make test`, GNU autotools uses `make check`; allow both.
59 | test: check
60 | check: ${run}
61 | 
62 | # Run example, but don't save results; %.run is a dummy filename (kept out of .PHONY so the pattern rule applies).
63 | %.run: %
64 | 	@echo "----------------------------------------------------------------------"
65 | 	./$< ${test_args}
66 | 
67 | # Run example and save result in .txt file.
68 | txt: ${txt}
69 | %.txt: %
70 | 	./$< ${test_args} > $@
71 | 
72 | #-------------------------------------------------------------------------------
73 | # Debugging: `make echo` prints the variables resolved above.
74 | echo:
75 | 	@echo "PKG_CONFIG_PATH $(PKG_CONFIG_PATH)"
76 | 	@echo "pkg_exists $(pkg_exists)"
77 | 	@echo "CXX        $(CXX)"
78 | 	@echo "CXXFLAGS   $(CXXFLAGS)"
79 | 	@echo "LIBS       $(LIBS)"
80 | 	@echo
81 | 	@echo "exe       $(exe)"
82 | 	@echo "run       $(run)"
83 | 	@echo "txt       $(txt)"
84 | 


--------------------------------------------------------------------------------
/examples/README.md:
--------------------------------------------------------------------------------
 1 | BLAS++ Example
 2 | ================================================================================
 3 | 
 4 | This is designed as a minimal, standalone example to demonstrate
 5 | how to include and link with BLAS++. This assumes that BLAS++ has
 6 | been compiled and installed. There are two options:
 7 | 
 8 | ## Option 1: Makefile
 9 | 
10 | The Makefile must know the compiler used to compile BLAS++,
11 | CXXFLAGS, and LIBS. Set CXX to the compiler, either in your environment
12 | or in the Makefile. For the flags, there are two more options:
13 | 
14 | a. Using pkg-config to get CXXFLAGS and LIBS for BLAS++ (recommended).
15 | pkg-config must be able to locate the blaspp package. If it is installed
16 | outside the default search path (see `pkg-config --variable pc_path pkg-config`),
17 | it should be added to `$PKG_CONFIG_PATH`. For instance, if it is installed
18 | in /opt/slate:
19 | 
20 |     export PKG_CONFIG_PATH=/opt/slate/lib/pkgconfig  # for sh
21 |     setenv PKG_CONFIG_PATH /opt/slate/lib/pkgconfig  # for csh
22 | 
23 | b. Hard-code CXXFLAGS and LIBS for BLAS++ in the Makefile.
24 | 
25 | Then, to build and run `example_gemm` and `example_util` using the
26 | Makefile, run:
27 | 
28 |     make
29 |     make test
30 | 
31 | ## Option 2: CMake
32 | 
33 | CMake must know the compiler used to compile BLAS++. Set CXX to the
34 | compiler, in your environment.
35 | 
36 | Create a build directory:
37 | 
38 |     mkdir build && cd build
39 | 
40 | If BLAS++ is installed outside the default search path, tell cmake
41 | where, for example, in /opt/slate:
42 | 
43 |     cmake -DCMAKE_PREFIX_PATH=/opt/slate ..
44 | 
45 | Otherwise, simply run:
46 | 
47 |     cmake ..
48 | 
49 | Then, to build and run `example_gemm` and `example_util` using the
50 | resulting Makefile, run:
51 | 
52 |     make
53 |     make test
54 | 


--------------------------------------------------------------------------------
/examples/example_gemm.cc:
--------------------------------------------------------------------------------
  1 | // Copyright (c) 2017-2023, University of Tennessee. All rights reserved.
  2 | // SPDX-License-Identifier: BSD-3-Clause
  3 | // This program is free software: you can redistribute it and/or modify it under
  4 | // the terms of the BSD 3-Clause license. See the accompanying LICENSE file.
  5 | 
  6 | #include <blas.hh>
  7 | 
  8 | #include <vector>
  9 | #include <stdio.h>
 10 | 
 11 | #include "util.hh"
 12 | 
 13 | //------------------------------------------------------------------------------
 14 | template <typename T>
 15 | void test_gemm( int m, int n, int k )
 16 | {
 17 |     print_func();
 18 |     // Column-major storage: the leading dimension is the number of rows.
 19 |     int lda = m;
 20 |     int ldb = k;
 21 |     int ldc = m;
 22 |     std::vector<T> A( lda*k, 1.0 );  // m-by-k
 23 |     std::vector<T> B( ldb*n, 2.0 );  // k-by-n
 24 |     std::vector<T> C( ldc*n, 3.0 );  // m-by-n
 25 | 
 26 |     // ... fill in application data into A, B, C ...
 27 | 
 28 |     // C = -1.0*A*B + 1.0*C  (host gemm, no transposes)
 29 |     blas::gemm( blas::Layout::ColMajor, blas::Op::NoTrans, blas::Op::NoTrans,
 30 |                 m, n, k,
 31 |                 -1.0, A.data(), lda,
 32 |                       B.data(), ldb,
 33 |                  1.0, C.data(), ldc );
 34 | }
 35 | 
 36 | //------------------------------------------------------------------------------
 37 | template <typename T>
 38 | void test_device_gemm( int m, int n, int k )
 39 | {
 40 |     print_func();
 41 |     if (blas::get_device_count() == 0) {
 42 |         printf( "no GPU devices\n" );
 43 |     }
 44 |     else {
 45 |         int lda = m;
 46 |         int ldb = k;
 47 |         int ldc = m;
 48 |         std::vector<T> A( lda*k, 1.0 );  // m-by-k
 49 |         std::vector<T> B( ldb*n, 2.0 );  // k-by-n
 50 |         std::vector<T> C( ldc*n, 3.0 );  // m-by-n
 51 | 
 52 |         // ... fill in application data into A, B, C ...
 53 | 
 54 |         int device = 0;
 55 |         blas::Queue queue( device );
 56 |         // Device buffers with the same dimensions as the host matrices.
 57 |         T *dA = blas::device_malloc<T>( lda*k, queue );  // m-by-k
 58 |         T *dB = blas::device_malloc<T>( ldb*n, queue );  // k-by-n
 59 |         T *dC = blas::device_malloc<T>( ldc*n, queue );  // m-by-n
 60 |         // Copy A, B, C from the host vectors into the device buffers.
 61 |         blas::device_copy_matrix(
 62 |             m, k,
 63 |             A.data(), lda,      // src
 64 |             dA, lda, queue );   // dst
 65 | 
 66 |         blas::device_copy_matrix(
 67 |             k, n,
 68 |             B.data(), ldb,      // src
 69 |             dB, ldb, queue );   // dst
 70 | 
 71 |         blas::device_copy_matrix(
 72 |             m, n,
 73 |             C.data(), ldc,      // src
 74 |             dC, ldc, queue );   // dst
 75 | 
 76 |         // C = -1.0*A*B + 1.0*C  (same call as the host version, plus the queue argument)
 77 |         blas::gemm(
 78 |             blas::Layout::ColMajor, blas::Op::NoTrans, blas::Op::NoTrans,
 79 |             m, n, k,
 80 |             -1.0, dA, lda,
 81 |                   dB, ldb,
 82 |              1.0, dC, ldc,
 83 |             queue );
 84 |         // Copy the result from dC back into the host vector C.
 85 |         blas::device_copy_matrix(
 86 |             m, n,
 87 |             dC, ldc,                 // src
 88 |             C.data(), ldc, queue );  // dst
 89 |         // NOTE(review): presumably waits for the queued work above to finish -- confirm in blas::Queue.
 90 |         queue.sync();
 91 | 
 92 |         blas::device_free( dA, queue );  dA = nullptr;
 93 |         blas::device_free( dB, queue );  dB = nullptr;
 94 |         blas::device_free( dC, queue );  dC = nullptr;
 95 |     }
 96 | }
 97 | 
 98 | //------------------------------------------------------------------------------
 99 | int main( int argc, char** argv )
100 | {
101 |     try {
102 |         // The command line selects which of the s, d, c, z precisions run.
103 |         bool run[ 4 ];
104 |         parse_args( argc, argv, run );
105 | 
106 |         const int m = 100, n = 200, k = 50;
107 |         printf( "m %d, n %d, k %d\n", m, n, k );
108 | 
109 |         // Host gemm for each selected precision.
110 |         if (run[ 0 ])
111 |             test_gemm< float  >( m, n, k );
112 |         if (run[ 1 ])
113 |             test_gemm< double >( m, n, k );
114 |         if (run[ 2 ])
115 |             test_gemm< std::complex<float>  >( m, n, k );
116 |         if (run[ 3 ])
117 |             test_gemm< std::complex<double> >( m, n, k );
118 |         // Device gemm for each selected precision, in the same order.
119 |         if (run[ 0 ])
120 |             test_device_gemm< float  >( m, n, k );
121 |         if (run[ 1 ])
122 |             test_device_gemm< double >( m, n, k );
123 |         if (run[ 2 ])
124 |             test_device_gemm< std::complex<float>  >( m, n, k );
125 |         if (run[ 3 ])
126 |             test_device_gemm< std::complex<double> >( m, n, k );
127 |     }
128 |     catch (const std::exception& err) {
129 |         fprintf( stderr, "%s\n", err.what() );
130 |         return 1;
131 |     }
132 |     return 0;
133 | }
134 | 


--------------------------------------------------------------------------------
/examples/example_util.cc:
--------------------------------------------------------------------------------
 1 | // Copyright (c) 2017-2023, University of Tennessee. All rights reserved.
 2 | // SPDX-License-Identifier: BSD-3-Clause
 3 | // This program is free software: you can redistribute it and/or modify it under
 4 | // the terms of the BSD 3-Clause license. See the accompanying LICENSE file.
 5 | 
 6 | // example_util.cc
 7 | // BLAS++ utilities: blas::real_type, blas::is_complex, blas::conj
 8 | #include <blas.hh>
 9 | 
10 | #include "util.hh"
11 | 
12 | //------------------------------------------------------------------------------
13 | template <typename scalar_type>
14 | void test_util( scalar_type alpha )
15 | {
16 |     print_func();
17 | 
18 |     //--------------------
19 |     // demo blas::real_type
20 |     int64_t n=100;
21 |     std::vector<scalar_type> x( n, 1.0 );
22 |     // real_type names the type that receives nrm2's result for this scalar_type.
23 |     using real_type = blas::real_type< scalar_type >;
24 |     real_type norm = blas::nrm2( n, x.data(), 1 );
25 |     printf( "norm  %7.4f\n", norm );
26 | 
27 |     //--------------------
28 |     // demo blas::conj
29 |     scalar_type beta;
30 | 
31 |     // std::conj fails if alpha is real:
32 |     // error: cannot convert 'std::complex<double>' to 'double' in assignment
33 |     //beta = std::conj( alpha );
34 | 
35 |     // blas::conj works. Need `using`!
36 |     using blas::conj;
37 |     beta = conj( alpha );
38 | 
39 |     //--------------------
40 |     // demo blas::is_complex: choose the output format from the scalar type
41 |     using std::real;
42 |     using std::imag;
43 |     if (blas::is_complex_v<scalar_type>) {
44 |         printf( "alpha %7.4f + %7.4fi\n", real(alpha), imag(alpha) );
45 |         printf( "beta  %7.4f + %7.4fi\n", real(beta),  imag(beta)  );
46 |     }
47 |     else {
48 |         printf( "alpha %7.4f\n", real(alpha) );
49 |         printf( "beta  %7.4f\n", real(beta)  );
50 |     }
51 | }
52 | 
53 | //------------------------------------------------------------------------------
54 | int main( int argc, char** argv )
55 | {
56 |     try {
57 |         // Parse command line to set types for s, d, c, z precisions.
58 |         bool types[ 4 ];
59 |         parse_args( argc, argv, types );
60 | 
61 |         // Run tests, passing one sample alpha per selected precision;
62 |         // the complex samples have a nonzero imaginary part.
63 |         if (types[ 0 ])
64 |             test_util(  float(1.234) );
65 |         if (types[ 1 ])
66 |             test_util( double(2.468) );
67 |         if (types[ 2 ])
68 |             test_util( std::complex< float>( 3.1415, 0.5678 ) );
69 |         if (types[ 3 ])
70 |             test_util( std::complex<double>( 6.2830, 1.1356 ) );
71 |     }
72 |     catch (std::exception const& ex) {
73 |         fprintf( stderr, "%s\n", ex.what() );  // newline-terminated, as in example_gemm.cc
74 |         return 1;
75 |     }
76 |     return 0;
77 | }
78 | 


--------------------------------------------------------------------------------
/examples/util.hh:
--------------------------------------------------------------------------------
 1 | // Copyright (c) 2017-2023, University of Tennessee. All rights reserved.
 2 | // SPDX-License-Identifier: BSD-3-Clause
 3 | // This program is free software: you can redistribute it and/or modify it under
 4 | // the terms of the BSD 3-Clause license. See the accompanying LICENSE file.
 5 | 
 6 | #ifndef UTIL_H
 7 | #define UTIL_H
 8 | 
 9 | #include <stdio.h>
10 | #include <stdexcept>
11 | #include <string>
12 | //------------------------------------------------------------------------------
13 | inline void print_func_( const char* func )  // inline: this definition lives in a header
14 | {
15 |     printf( "\n%s\n", func );  // blank line, then the given function name
16 | }
17 | 
18 | #ifdef __GNUC__  // print_func() prints the name of the function it is used in
19 |     #define print_func() print_func_( __PRETTY_FUNCTION__ )
20 | #else  // other compilers: fall back to the standard __func__
21 |     #define print_func() print_func_( __func__ )
22 | #endif
23 | 
//------------------------------------------------------------------------------
/// Parse command line options selecting which precisions to run:
///     s = single,         sets types[ 0 ]
///     d = double,         sets types[ 1 ]
///     c = complex,        sets types[ 2 ]
///     z = double-complex, sets types[ 3 ]
/// If no options are given, sets all types to true.
///
/// Declared inline because it is defined in a header; this avoids duplicate
/// definitions if the header is included in several translation units.
///
/// @param[in] argc
///     Number of entries in argv, including the program name.
///
/// @param[in] argv
///     Command line arguments. argv[ 0 ] is used only in the usage message.
///
/// @param[out] types
///     Array of 4 flags; every entry is overwritten.
///
/// @throws std::runtime_error
///     If an option other than s, d, c, z is encountered. The message
///     includes the offending option and a usage line.
inline void parse_args( int argc, char** argv, bool types[ 4 ] )
{
    // With no options, everything is enabled. Using <= also covers an
    // empty argv (argc == 0), which would otherwise silently select nothing.
    const bool run_all = (argc <= 1);
    for (int t = 0; t < 4; ++t) {
        types[ t ] = run_all;
    }

    for (int i = 1; i < argc; ++i) {
        std::string arg = argv[ i ];
        if (arg == "s")
            types[ 0 ] = true;
        else if (arg == "d")
            types[ 1 ] = true;
        else if (arg == "c")
            types[ 2 ] = true;
        else if (arg == "z")
            types[ 3 ] = true;
        else {
            throw std::runtime_error(
                "unknown option: \"" + arg + "\"\n"
                + "Usage: " + argv[ 0 ] + " [s] [d] [c] [z]\n"
                + "for single, double, complex, double-complex.\n" );
        }
    }
}
58 | 
59 | #endif // UTIL_H
60 | 


--------------------------------------------------------------------------------
/include/GNUmakefile:
--------------------------------------------------------------------------------
1 | top = ..
2 | include ${top}/GNUmakefile.subdir
3 | 


--------------------------------------------------------------------------------
/include/blas.hh:
--------------------------------------------------------------------------------
 1 | // Copyright (c) 2017-2023, University of Tennessee. All rights reserved.
 2 | // SPDX-License-Identifier: BSD-3-Clause
 3 | // This program is free software: you can redistribute it and/or modify it under
 4 | // the terms of the BSD 3-Clause license. See the accompanying LICENSE file.
 5 | 
 6 | #ifndef BLAS_HH
 7 | #define BLAS_HH
 8 | 
 9 | #include "blas/defines.h"
10 | 
11 | #include "blas/counter.hh"
12 | 
13 | // Version is updated by make_release.py; DO NOT EDIT.
14 | // Version 2025.05.28
15 | #define BLASPP_VERSION 20250528
16 | 
17 | namespace blas {
18 | 
19 | int blaspp_version();
20 | const char* blaspp_id();
21 | 
22 | }  // namespace blas
23 | 
24 | #include "blas/wrappers.hh"
25 | 
26 | // =============================================================================
27 | // Level 1 BLAS template implementations
28 | 
29 | #include "blas/asum.hh"
30 | #include "blas/axpy.hh"
31 | #include "blas/copy.hh"
32 | #include "blas/dot.hh"
33 | #include "blas/dotu.hh"
34 | #include "blas/iamax.hh"
35 | #include "blas/nrm2.hh"
36 | #include "blas/rot.hh"
37 | #include "blas/rotg.hh"
38 | #include "blas/rotm.hh"
39 | #include "blas/rotmg.hh"
40 | #include "blas/scal.hh"
41 | #include "blas/swap.hh"
42 | 
43 | // =============================================================================
44 | // Level 2 BLAS template implementations
45 | 
46 | #include "blas/gemv.hh"
47 | #include "blas/ger.hh"
48 | #include "blas/geru.hh"
49 | #include "blas/hemv.hh"
50 | #include "blas/her.hh"
51 | #include "blas/her2.hh"
52 | #include "blas/symv.hh"
53 | #include "blas/syr.hh"
54 | #include "blas/syr2.hh"
55 | #include "blas/trmv.hh"
56 | #include "blas/trsv.hh"
57 | 
58 | // =============================================================================
59 | // Level 3 BLAS template implementations
60 | 
61 | #include "blas/gemm.hh"
62 | #include "blas/hemm.hh"
63 | #include "blas/herk.hh"
64 | #include "blas/her2k.hh"
65 | #include "blas/symm.hh"
66 | #include "blas/syrk.hh"
67 | #include "blas/syr2k.hh"
68 | #include "blas/trmm.hh"
69 | #include "blas/trsm.hh"
70 | 
71 | // =============================================================================
72 | // Device BLAS
73 | 
74 | #include "blas/device_blas.hh"
75 | 
76 | #endif        //  #ifndef BLAS_HH
77 | 


--------------------------------------------------------------------------------
/include/blas/GNUmakefile:
--------------------------------------------------------------------------------
1 | top = ../..
2 | include ${top}/GNUmakefile.subdir
3 | 


--------------------------------------------------------------------------------
/include/blas/asum.hh:
--------------------------------------------------------------------------------
 1 | // Copyright (c) 2017-2023, University of Tennessee. All rights reserved.
 2 | // SPDX-License-Identifier: BSD-3-Clause
 3 | // This program is free software: you can redistribute it and/or modify it under
 4 | // the terms of the BSD 3-Clause license. See the accompanying LICENSE file.
 5 | 
 6 | #ifndef BLAS_ASUM_HH
 7 | #define BLAS_ASUM_HH
 8 | 
 9 | #include "blas/util.hh"
10 | 
11 | #include <limits>
12 | 
13 | namespace blas {
14 | 
15 | // =============================================================================
16 | /// @return 1-norm of vector,
17 | ///     $|| Re(x) ||_1 + || Im(x) ||_1
18 | ///         = \sum_{i=0}^{n-1} |Re(x_i)| + |Im(x_i)|$.
19 | ///
20 | /// Generic implementation for arbitrary data types.
21 | ///
22 | /// @param[in] n
23 | ///     Number of elements in x. n >= 0.
24 | ///
25 | /// @param[in] x
26 | ///     The n-element vector x, in an array of length (n-1)*incx + 1.
27 | ///
28 | /// @param[in] incx
29 | ///     Stride between elements of x. incx > 0.
30 | ///
31 | /// @ingroup asum
32 | 
33 | template <typename T>
34 | real_type<T>
35 | asum(
36 |     int64_t n,
37 |     T const *x, int64_t incx )
38 | {
39 |     typedef real_type<T> real_t;
40 | 
41 |     // check arguments
42 |     blas_error_if( n < 0 );      // standard BLAS returns, doesn't fail
43 |     blas_error_if( incx <= 0 );  // standard BLAS returns, doesn't fail
44 | 
45 |     real_t result = 0;
46 |     if (incx == 1) {
47 |         // unit stride
48 |         for (int64_t i = 0; i < n; ++i) {
49 |             result += abs1( x[i] );
50 |         }
51 |     }
52 |     else {
53 |         // non-unit stride
54 |         int64_t ix = 0;
55 |         for (int64_t i = 0; i < n; ++i) {
56 |             result += abs1( x[ix] );
57 |             ix += incx;
58 |         }
59 |     }
60 |     return result;
61 | }
62 | 
63 | }  // namespace blas
64 | 
65 | #endif        //  #ifndef BLAS_ASUM_HH
66 | 


--------------------------------------------------------------------------------
/include/blas/axpy.hh:
--------------------------------------------------------------------------------
 1 | // Copyright (c) 2017-2023, University of Tennessee. All rights reserved.
 2 | // SPDX-License-Identifier: BSD-3-Clause
 3 | // This program is free software: you can redistribute it and/or modify it under
 4 | // the terms of the BSD 3-Clause license. See the accompanying LICENSE file.
 5 | 
 6 | #ifndef BLAS_AXPY_HH
 7 | #define BLAS_AXPY_HH
 8 | 
 9 | #include "blas/util.hh"
10 | 
11 | #include <limits>
12 | 
13 | namespace blas {
14 | 
15 | // =============================================================================
16 | /// Add scaled vector, $y = \alpha x + y$.
17 | ///
18 | /// Generic implementation for arbitrary data types.
19 | ///
20 | /// @param[in] n
21 | ///     Number of elements in x and y. n >= 0.
22 | ///
23 | /// @param[in] alpha
24 | ///     Scalar alpha. If alpha is zero, y is not updated.
25 | ///
26 | /// @param[in] x
27 | ///     The n-element vector x, in an array of length (n-1)*abs(incx) + 1.
28 | ///
29 | /// @param[in] incx
30 | ///     Stride between elements of x. incx must not be zero.
31 | ///     If incx < 0, uses elements of x in reverse order: x(n-1), ..., x(0).
32 | ///
33 | /// @param[in, out] y
34 | ///     The n-element vector y, in an array of length (n-1)*abs(incy) + 1.
35 | ///
36 | /// @param[in] incy
37 | ///     Stride between elements of y. incy must not be zero.
38 | ///     If incy < 0, uses elements of y in reverse order: y(n-1), ..., y(0).
39 | ///
40 | /// @ingroup axpy
41 | 
42 | template <typename TX, typename TY>
43 | void axpy(
44 |     int64_t n,
45 |     blas::scalar_type<TX, TY> alpha,
46 |     TX const *x, int64_t incx,
47 |     TY       *y, int64_t incy )
48 | {
49 |     typedef blas::scalar_type<TX, TY> scalar_t;
50 | 
51 |     // check arguments
52 |     blas_error_if( n < 0 );
53 |     blas_error_if( incx == 0 );
54 |     blas_error_if( incy == 0 );
55 | 
56 |     // quick return
57 |     if (alpha == scalar_t(0))
58 |         return;
59 | 
60 |     if (incx == 1 && incy == 1) {
61 |         // unit stride
62 |         for (int64_t i = 0; i < n; ++i) {
63 |             y[i] += alpha*x[i];
64 |         }
65 |     }
66 |     else {
67 |         // non-unit stride
68 |         int64_t ix = (incx > 0 ? 0 : (-n + 1)*incx);
69 |         int64_t iy = (incy > 0 ? 0 : (-n + 1)*incy);
70 |         for (int64_t i = 0; i < n; ++i) {
71 |             y[iy] += alpha * x[ix];
72 |             ix += incx;
73 |             iy += incy;
74 |         }
75 |     }
76 | }
77 | 
78 | }  // namespace blas
79 | 
80 | #endif        //  #ifndef BLAS_AXPY_HH
81 | 


--------------------------------------------------------------------------------
/include/blas/config.h:
--------------------------------------------------------------------------------
 1 | // Copyright (c) 2017-2023, University of Tennessee. All rights reserved.
 2 | // SPDX-License-Identifier: BSD-3-Clause
 3 | // This program is free software: you can redistribute it and/or modify it under
 4 | // the terms of the BSD 3-Clause license. See the accompanying LICENSE file.
 5 | 
 6 | #ifndef BLAS_CONFIG_H
 7 | #define BLAS_CONFIG_H
 8 | 
 9 | #include <stdint.h>
10 | 
11 | #include "blas/defines.h"
12 | 
/* Integer type named blas_int: int64_t when BLAS_ILP64 is defined, int
 * otherwise. The whole section is skipped if blas_int is already defined
 * as a macro. */
#ifndef blas_int
    #if defined(BLAS_ILP64)
        typedef int64_t blas_int;
    #else
        typedef int blas_int;
    #endif
    /* #define so that #ifdef works. */
    #define blas_int blas_int
#endif

/* f2c, hence MacOS Accelerate, returns double instead of float
 * for sdot, slange, clange, etc.
 * blas_float_return is double when BLAS_HAVE_F2C is defined, else float. */
#if defined(BLAS_HAVE_F2C)
    typedef double blas_float_return;
#else
    typedef float blas_float_return;
#endif

/* Complex types blas_complex_float and blas_complex_double, chosen from
 * three alternatives: std::complex (on request), a plain struct (MSVC),
 * or C99 _Complex (default). */
#if defined(BLAS_COMPLEX_CPP) || defined(LAPACK_COMPLEX_CPP)
    /* user has to specifically request std::complex,
     * as it isn't compatible as a return type from extern C functions. */
    #include <complex>
    typedef std::complex<float>  blas_complex_float;
    typedef std::complex<double> blas_complex_double;
#elif defined(_MSC_VER)
    /* MSVC has no C99 _Complex; use structs with real and imag members. */
    typedef struct { float real, imag; }  blas_complex_float;
    typedef struct { double real, imag; } blas_complex_double;
#else
    /* otherwise, by default use C99 _Complex */
    #include <complex.h>
    typedef float _Complex  blas_complex_float;
    typedef double _Complex blas_complex_double;
#endif

/* define so we can check later with ifdef */
#define blas_complex_float  blas_complex_float
#define blas_complex_double blas_complex_double
51 | 
52 | #endif        //  #ifndef BLAS_CONFIG_H
53 | 


--------------------------------------------------------------------------------
/include/blas/copy.hh:
--------------------------------------------------------------------------------
 1 | // Copyright (c) 2017-2023, University of Tennessee. All rights reserved.
 2 | // SPDX-License-Identifier: BSD-3-Clause
 3 | // This program is free software: you can redistribute it and/or modify it under
 4 | // the terms of the BSD 3-Clause license. See the accompanying LICENSE file.
 5 | 
 6 | #ifndef BLAS_COPY_HH
 7 | #define BLAS_COPY_HH
 8 | 
 9 | #include "blas/util.hh"
10 | 
11 | #include <limits>
12 | 
13 | namespace blas {
14 | 
15 | // =============================================================================
16 | /// Copy vector, $y = x$.
17 | ///
18 | /// Generic implementation for arbitrary data types.
19 | ///
20 | /// @param[in] n
21 | ///     Number of elements in x and y. n >= 0.
22 | ///
23 | /// @param[in] x
24 | ///     The n-element vector x, in an array of length (n-1)*abs(incx) + 1.
25 | ///
26 | /// @param[in] incx
27 | ///     Stride between elements of x. incx must not be zero.
28 | ///     If incx < 0, uses elements of x in reverse order: x(n-1), ..., x(0).
29 | ///
30 | /// @param[out] y
31 | ///     The n-element vector y, in an array of length (n-1)*abs(incy) + 1.
32 | ///
33 | /// @param[in] incy
34 | ///     Stride between elements of y. incy must not be zero.
35 | ///     If incy < 0, uses elements of y in reverse order: y(n-1), ..., y(0).
36 | ///
37 | /// @ingroup copy
38 | 
39 | template <typename TX, typename TY>
40 | void copy(
41 |     int64_t n,
42 |     TX const *x, int64_t incx,
43 |     TY       *y, int64_t incy )
44 | {
45 |     // check arguments
46 |     blas_error_if( n < 0 );
47 |     blas_error_if( incx == 0 );
48 |     blas_error_if( incy == 0 );
49 | 
50 |     if (incx == 1 && incy == 1) {
51 |         // unit stride
52 |         for (int64_t i = 0; i < n; ++i) {
53 |             y[i] = x[i];
54 |         }
55 |     }
56 |     else {
57 |         // non-unit stride
58 |         int64_t ix = (incx > 0 ? 0 : (-n + 1)*incx);
59 |         int64_t iy = (incy > 0 ? 0 : (-n + 1)*incy);
60 |         for (int64_t i = 0; i < n; ++i) {
61 |             y[iy] = x[ix];
62 |             ix += incx;
63 |             iy += incy;
64 |         }
65 |     }
66 | }
67 | 
68 | }  // namespace blas
69 | 
70 | #endif        //  #ifndef BLAS_COPY_HH
71 | 


--------------------------------------------------------------------------------
/include/blas/defines.h.in:
--------------------------------------------------------------------------------
 1 | // Copyright (c) 2017-2023, University of Tennessee. All rights reserved.
 2 | // SPDX-License-Identifier: BSD-3-Clause
 3 | // This program is free software: you can redistribute it and/or modify it under
 4 | // the terms of the BSD 3-Clause license. See the accompanying LICENSE file.
 5 | 
 6 | #ifndef BLAS_DEFINES_H
 7 | #define BLAS_DEFINES_H
 8 | 
 9 | // auto-generated by: @argv@
10 | // @CMAKE_COMMAND@ @CMAKE_VERSION@
11 | //
12 | // Definitions for:
13 | // CXX   = @CXX@
14 | //         @CMAKE_CXX_COMPILER@
15 | // LIBS  = @LIBS@
16 | //         @BLAS_LIBRARIES@
17 | 
18 | @blaspp_header_defines@
19 | 
20 | #endif        //  #ifndef BLAS_DEFINES_H
21 | 


--------------------------------------------------------------------------------
/include/blas/dot.hh:
--------------------------------------------------------------------------------
 1 | // Copyright (c) 2017-2023, University of Tennessee. All rights reserved.
 2 | // SPDX-License-Identifier: BSD-3-Clause
 3 | // This program is free software: you can redistribute it and/or modify it under
 4 | // the terms of the BSD 3-Clause license. See the accompanying LICENSE file.
 5 | 
 6 | #ifndef BLAS_DOT_HH
 7 | #define BLAS_DOT_HH
 8 | 
 9 | #include "blas/util.hh"
10 | 
11 | #include <limits>
12 | 
13 | namespace blas {
14 | 
15 | // =============================================================================
16 | /// @return dot product, $x^H y$.
17 | /// @see dotu for unconjugated version, $x^T y$.
18 | ///
19 | /// Generic implementation for arbitrary data types.
20 | ///
21 | /// @param[in] n
22 | ///     Number of elements in x and y. n >= 0.
23 | ///
24 | /// @param[in] x
25 | ///     The n-element vector x, in an array of length (n-1)*abs(incx) + 1.
26 | ///
27 | /// @param[in] incx
28 | ///     Stride between elements of x. incx must not be zero.
29 | ///     If incx < 0, uses elements of x in reverse order: x(n-1), ..., x(0).
30 | ///
31 | /// @param[in] y
32 | ///     The n-element vector y, in an array of length (n-1)*abs(incy) + 1.
33 | ///
34 | /// @param[in] incy
35 | ///     Stride between elements of y. incy must not be zero.
36 | ///     If incy < 0, uses elements of y in reverse order: y(n-1), ..., y(0).
37 | ///
38 | /// @ingroup dot
39 | 
40 | template <typename TX, typename TY>
41 | scalar_type<TX, TY> dot(
42 |     int64_t n,
43 |     TX const *x, int64_t incx,
44 |     TY const *y, int64_t incy )
45 | {
46 |     typedef scalar_type<TX, TY> scalar_t;
47 | 
48 |     // check arguments
49 |     blas_error_if( n < 0 );
50 |     blas_error_if( incx == 0 );
51 |     blas_error_if( incy == 0 );
52 | 
53 |     scalar_t result = 0;
54 |     if (incx == 1 && incy == 1) {
55 |         // unit stride
56 |         for (int64_t i = 0; i < n; ++i) {
57 |             result += conj(x[i]) * y[i];
58 |         }
59 |     }
60 |     else {
61 |         // non-unit stride
62 |         int64_t ix = (incx > 0 ? 0 : (-n + 1)*incx);
63 |         int64_t iy = (incy > 0 ? 0 : (-n + 1)*incy);
64 |         for (int64_t i = 0; i < n; ++i) {
65 |             result += conj(x[ix]) * y[iy];
66 |             ix += incx;
67 |             iy += incy;
68 |         }
69 |     }
70 |     return result;
71 | }
72 | 
73 | }  // namespace blas
74 | 
75 | #endif        //  #ifndef BLAS_DOT_HH
76 | 


--------------------------------------------------------------------------------
/include/blas/dotu.hh:
--------------------------------------------------------------------------------
 1 | // Copyright (c) 2017-2023, University of Tennessee. All rights reserved.
 2 | // SPDX-License-Identifier: BSD-3-Clause
 3 | // This program is free software: you can redistribute it and/or modify it under
 4 | // the terms of the BSD 3-Clause license. See the accompanying LICENSE file.
 5 | 
 6 | #ifndef BLAS_DOTU_HH
 7 | #define BLAS_DOTU_HH
 8 | 
 9 | #include "blas/util.hh"
10 | 
11 | #include <limits>
12 | 
13 | namespace blas {
14 | 
15 | // =============================================================================
16 | /// @return unconjugated dot product, $x^T y$.
17 | /// @see dot for conjugated version, $x^H y$.
18 | ///
19 | /// Generic implementation for arbitrary data types.
20 | ///
21 | /// @param[in] n
22 | ///     Number of elements in x and y. n >= 0.
23 | ///
24 | /// @param[in] x
25 | ///     The n-element vector x, in an array of length (n-1)*abs(incx) + 1.
26 | ///
27 | /// @param[in] incx
28 | ///     Stride between elements of x. incx must not be zero.
29 | ///     If incx < 0, uses elements of x in reverse order: x(n-1), ..., x(0).
30 | ///
31 | /// @param[in] y
32 | ///     The n-element vector y, in an array of length (n-1)*abs(incy) + 1.
33 | ///
34 | /// @param[in] incy
35 | ///     Stride between elements of y. incy must not be zero.
36 | ///     If incy < 0, uses elements of y in reverse order: y(n-1), ..., y(0).
37 | ///
38 | /// @ingroup dotu
39 | 
40 | template <typename TX, typename TY>
41 | scalar_type<TX, TY> dotu(
42 |     int64_t n,
43 |     TX const *x, int64_t incx,
44 |     TY const *y, int64_t incy )
45 | {
46 |     typedef scalar_type<TX, TY> scalar_t;
47 | 
48 |     // check arguments
49 |     blas_error_if( n < 0 );
50 |     blas_error_if( incx == 0 );
51 |     blas_error_if( incy == 0 );
52 | 
53 |     scalar_t result = 0;
54 |     if (incx == 1 && incy == 1) {
55 |         // unit stride
56 |         for (int64_t i = 0; i < n; ++i) {
57 |             result += x[i] * y[i];
58 |         }
59 |     }
60 |     else {
61 |         // non-unit stride
62 |         int64_t ix = (incx > 0 ? 0 : (-n + 1)*incx);
63 |         int64_t iy = (incy > 0 ? 0 : (-n + 1)*incy);
64 |         for (int64_t i = 0; i < n; ++i) {
65 |             result += x[ix] * y[iy];
66 |             ix += incx;
67 |             iy += incy;
68 |         }
69 |     }
70 |     return result;
71 | }
72 | 
73 | }  // namespace blas
74 | 
75 | #endif        //  #ifndef BLAS_DOTU_HH
76 | 


--------------------------------------------------------------------------------
/include/blas/geru.hh:
--------------------------------------------------------------------------------
  1 | // Copyright (c) 2017-2023, University of Tennessee. All rights reserved.
  2 | // SPDX-License-Identifier: BSD-3-Clause
  3 | // This program is free software: you can redistribute it and/or modify it under
  4 | // the terms of the BSD 3-Clause license. See the accompanying LICENSE file.
  5 | 
  6 | #ifndef BLAS_GERU_HH
  7 | #define BLAS_GERU_HH
  8 | 
  9 | #include "blas/util.hh"
 10 | #include "blas/ger.hh"
 11 | 
 12 | #include <limits>
 13 | 
 14 | namespace blas {
 15 | 
 16 | // =============================================================================
 17 | /// General matrix rank-1 update:
 18 | /// \[
 19 | ///     A = \alpha x y^T + A,
 20 | /// \]
 21 | /// where alpha is a scalar, x and y are vectors,
 22 | /// and A is an m-by-n matrix.
 23 | ///
 24 | /// Generic implementation for arbitrary data types.
 25 | ///
 26 | /// @param[in] layout
 27 | ///     Matrix storage, Layout::ColMajor or Layout::RowMajor.
 28 | ///
 29 | /// @param[in] m
 30 | ///     Number of rows of the matrix A. m >= 0.
 31 | ///
 32 | /// @param[in] n
 33 | ///     Number of columns of the matrix A. n >= 0.
 34 | ///
 35 | /// @param[in] alpha
 36 | ///     Scalar alpha. If alpha is zero, A is not updated.
 37 | ///
 38 | /// @param[in] x
 39 | ///     The m-element vector x, in an array of length (m-1)*abs(incx) + 1.
 40 | ///
 41 | /// @param[in] incx
 42 | ///     Stride between elements of x. incx must not be zero.
 43 | ///     If incx < 0, uses elements of x in reverse order: x(n-1), ..., x(0).
 44 | ///
 45 | /// @param[in] y
 46 | ///     The n-element vector y, in an array of length (n-1)*abs(incy) + 1.
 47 | ///
 48 | /// @param[in] incy
 49 | ///     Stride between elements of y. incy must not be zero.
 50 | ///     If incy < 0, uses elements of y in reverse order: y(n-1), ..., y(0).
 51 | ///
 52 | /// @param[in, out] A
 53 | ///     The m-by-n matrix A, stored in an lda-by-n array [RowMajor: m-by-lda].
 54 | ///
 55 | /// @param[in] lda
 56 | ///     Leading dimension of A. lda >= max(1, m) [RowMajor: lda >= max(1, n)].
 57 | ///
 58 | /// @ingroup geru
 59 | 
 60 | template <typename TA, typename TX, typename TY>
 61 | void geru(
 62 |     blas::Layout layout,
 63 |     int64_t m, int64_t n,
 64 |     blas::scalar_type<TA, TX, TY> alpha,
 65 |     TX const *x, int64_t incx,
 66 |     TY const *y, int64_t incy,
 67 |     TA *A, int64_t lda )
 68 | {
 69 |     typedef blas::scalar_type<TA, TX, TY> scalar_t;
 70 | 
 71 |     #define A(i_, j_) A[ (i_) + (j_)*lda ]
 72 | 
 73 |     // constants
 74 |     const scalar_t zero = 0;
 75 | 
 76 |     // check arguments
 77 |     blas_error_if( layout != Layout::ColMajor &&
 78 |                    layout != Layout::RowMajor );
 79 |     blas_error_if( m < 0 );
 80 |     blas_error_if( n < 0 );
 81 |     blas_error_if( incx == 0 );
 82 |     blas_error_if( incy == 0 );
 83 | 
 84 |     if (layout == Layout::ColMajor)
 85 |         blas_error_if( lda < m );
 86 |     else
 87 |         blas_error_if( lda < n );
 88 | 
 89 |     // quick return
 90 |     if (m == 0 || n == 0 || alpha == zero)
 91 |         return;
 92 | 
 93 |     // for row-major, simply swap dimensions and x <=> y
 94 |     // this doesn't work in the complex gerc case because y gets conj
 95 |     if (layout == Layout::RowMajor) {
 96 |         geru( Layout::ColMajor, n, m, alpha, y, incy, x, incx, A, lda );
 97 |         return;
 98 |     }
 99 | 
100 |     if (incx == 1 && incy == 1) {
101 |         // unit stride
102 |         for (int64_t j = 0; j < n; ++j) {
103 |             // note: NOT skipping if y[j] is zero, for consistent NAN handling
104 |             scalar_t tmp = alpha * y[j];
105 |             for (int64_t i = 0; i < m; ++i) {
106 |                 A(i, j) += x[i] * tmp;
107 |             }
108 |         }
109 |     }
110 |     else if (incx == 1) {
111 |         // x unit stride, y non-unit stride
112 |         int64_t jy = (incy > 0 ? 0 : (-n + 1)*incy);
113 |         for (int64_t j = 0; j < n; ++j) {
114 |             scalar_t tmp = alpha * y[jy];
115 |             for (int64_t i = 0; i < m; ++i) {
116 |                 A(i, j) += x[i] * tmp;
117 |             }
118 |             jy += incy;
119 |         }
120 |     }
121 |     else {
122 |         // x and y non-unit stride
123 |         int64_t kx = (incx > 0 ? 0 : (-m + 1)*incx);
124 |         int64_t jy = (incy > 0 ? 0 : (-n + 1)*incy);
125 |         for (int64_t j = 0; j < n; ++j) {
126 |             scalar_t tmp = alpha * y[jy];
127 |             int64_t ix = kx;
128 |             for (int64_t i = 0; i < m; ++i) {
129 |                 A(i, j) += x[ix] * tmp;
130 |                 ix += incx;
131 |             }
132 |             jy += incy;
133 |         }
134 |     }
135 | 
136 |     #undef A
137 | }
138 | 
139 | }  // namespace blas
140 | 
 141 | #endif        //  #ifndef BLAS_GERU_HH
142 | 


--------------------------------------------------------------------------------
/include/blas/iamax.hh:
--------------------------------------------------------------------------------
 1 | // Copyright (c) 2017-2023, University of Tennessee. All rights reserved.
 2 | // SPDX-License-Identifier: BSD-3-Clause
 3 | // This program is free software: you can redistribute it and/or modify it under
 4 | // the terms of the BSD 3-Clause license. See the accompanying LICENSE file.
 5 | 
 6 | #ifndef BLAS_IAMAX_HH
 7 | #define BLAS_IAMAX_HH
 8 | 
 9 | #include "blas/util.hh"
10 | 
11 | #include <limits>
12 | 
13 | namespace blas {
14 | 
15 | // =============================================================================
16 | /// @return Index of infinity-norm of vector, $|| x ||_{inf}$,
17 | ///     $\text{argmax}_{i=0}^{n-1} |Re(x_i)| + |Im(x_i)|$.
18 | /// Returns -1 if n = 0.
19 | ///
20 | /// Generic implementation for arbitrary data types.
21 | ///
22 | /// @param[in] n
23 | ///     Number of elements in x. n >= 0.
24 | ///
25 | /// @param[in] x
26 | ///     The n-element vector x, in an array of length (n-1)*incx + 1.
27 | ///
28 | /// @param[in] incx
29 | ///     Stride between elements of x. incx > 0.
30 | ///
31 | /// @ingroup iamax
32 | 
33 | template <typename T>
34 | int64_t iamax(
35 |     int64_t n,
36 |     T const *x, int64_t incx )
37 | {
38 |     typedef real_type<T> real_t;
39 | 
40 |     // check arguments
41 |     blas_error_if( n < 0 );      // standard BLAS returns, doesn't fail
42 |     blas_error_if( incx <= 0 );  // standard BLAS returns, doesn't fail
43 | 
44 |     // todo: check NAN
45 |     real_t result = -1;
46 |     int64_t index = -1;
47 |     if (incx == 1) {
48 |         // unit stride
49 |         for (int64_t i = 0; i < n; ++i) {
50 |             real_t tmp = abs1( x[i] );
51 |             if (tmp > result) {
52 |                 result = tmp;
53 |                 index = i;
54 |             }
55 |         }
56 |     }
57 |     else {
58 |         // non-unit stride
59 |         int64_t ix = 0;
60 |         for (int64_t i = 0; i < n; ++i) {
61 |             real_t tmp = abs1( x[ix] );
62 |             if (tmp > result) {
63 |                 result = tmp;
64 |                 index = i;
65 |             }
66 |             ix += incx;
67 |         }
68 |     }
69 |     return index;
70 | }
71 | 
72 | }  // namespace blas
73 | 
74 | #endif        //  #ifndef BLAS_IAMAX_HH
75 | 


--------------------------------------------------------------------------------
/include/blas/mangling.h:
--------------------------------------------------------------------------------
 1 | // Copyright (c) 2017-2023, University of Tennessee. All rights reserved.
 2 | // SPDX-License-Identifier: BSD-3-Clause
 3 | // This program is free software: you can redistribute it and/or modify it under
 4 | // the terms of the BSD 3-Clause license. See the accompanying LICENSE file.
 5 | 
 6 | #ifndef BLAS_MANGLING_H
 7 | #define BLAS_MANGLING_H
 8 | 
 9 | #include "blas/defines.h"
10 | 
11 | // -----------------------------------------------------------------------------
12 | // Fortran name mangling depends on compiler.
13 | // Define BLAS_FORTRAN_UPPER for uppercase,
14 | // define BLAS_FORTRAN_LOWER for lowercase (IBM xlf),
15 | // else the default is lowercase with appended underscore
16 | // (GNU gcc, Intel icc, PGI pgfortran, Cray ftn).
17 | #ifndef BLAS_FORTRAN_NAME
18 |     #if defined(BLAS_FORTRAN_UPPER)
19 |         #define BLAS_FORTRAN_NAME( lower, UPPER ) UPPER
20 |     #elif defined(BLAS_FORTRAN_LOWER)
21 |         #define BLAS_FORTRAN_NAME( lower, UPPER ) lower
22 |     #else
23 |         #define BLAS_FORTRAN_NAME( lower, UPPER ) lower##_
24 |     #endif
25 | #endif
26 | 
27 | #endif        //  #ifndef BLAS_MANGLING_H
28 | 


--------------------------------------------------------------------------------
/include/blas/nrm2.hh:
--------------------------------------------------------------------------------
 1 | // Copyright (c) 2017-2023, University of Tennessee. All rights reserved.
 2 | // SPDX-License-Identifier: BSD-3-Clause
 3 | // This program is free software: you can redistribute it and/or modify it under
 4 | // the terms of the BSD 3-Clause license. See the accompanying LICENSE file.
 5 | 
 6 | #ifndef BLAS_NRM2_HH
 7 | #define BLAS_NRM2_HH
 8 | 
 9 | #include "blas/util.hh"
10 | 
11 | #include <limits>
12 | 
13 | namespace blas {
14 | 
15 | // =============================================================================
16 | /// @return 2-norm of vector,
17 | ///     $|| x ||_2 = (\sum_{i=0}^{n-1} |x_i|^2)^{1/2}$.
18 | ///
19 | /// Generic implementation for arbitrary data types.
20 | /// TODO: generic implementation does not currently scale to avoid over- or underflow.
21 | ///
22 | /// @param[in] n
23 | ///     Number of elements in x. n >= 0.
24 | ///
25 | /// @param[in] x
26 | ///     The n-element vector x, in an array of length (n-1)*incx + 1.
27 | ///
28 | /// @param[in] incx
29 | ///     Stride between elements of x. incx > 0.
30 | ///
31 | /// @ingroup nrm2
32 | 
33 | template <typename T>
34 | real_type<T>
35 | nrm2(
36 |     int64_t n,
37 |     T const * x, int64_t incx )
38 | {
39 |     using std::sqrt;
40 |     using real_t = real_type<T>;
41 | 
42 |     // check arguments
43 |     blas_error_if( n < 0 );      // standard BLAS returns, doesn't fail
44 |     blas_error_if( incx <= 0 );  // standard BLAS returns, doesn't fail
45 | 
46 |     // todo: scale to avoid overflow & underflow
47 |     real_t result = 0;
48 |     if (incx == 1) {
49 |         // unit stride
50 |         for (int64_t i = 0; i < n; ++i) {
51 |             result += real(x[i]) * real(x[i]) + imag(x[i]) * imag(x[i]);
52 |         }
53 |     }
54 |     else {
55 |         // non-unit stride
56 |         int64_t ix = 0;
57 |         for (int64_t i = 0; i < n; ++i) {
58 |             result += real(x[ix]) * real(x[ix]) + imag(x[ix]) * imag(x[ix]);
59 |             ix += incx;
60 |         }
61 |     }
62 |     return sqrt( result );
63 | }
64 | 
65 | }  // namespace blas
66 | 
67 | #endif        //  #ifndef BLAS_NRM2_HH
68 | 


--------------------------------------------------------------------------------
/include/blas/rot.hh:
--------------------------------------------------------------------------------
 1 | // Copyright (c) 2017-2023, University of Tennessee. All rights reserved.
 2 | // SPDX-License-Identifier: BSD-3-Clause
 3 | // This program is free software: you can redistribute it and/or modify it under
 4 | // the terms of the BSD 3-Clause license. See the accompanying LICENSE file.
 5 | 
 6 | #ifndef BLAS_ROT_HH
 7 | #define BLAS_ROT_HH
 8 | 
 9 | #include "blas/util.hh"
10 | 
11 | #include <limits>
12 | 
13 | namespace blas {
14 | 
15 | // =============================================================================
16 | /// Apply plane rotation:
17 | /// \[
18 | ///       \begin{bmatrix} x^T   \\ y^T    \end{bmatrix}
19 | ///     = \begin{bmatrix} c & s \\ -s & c \end{bmatrix}
20 | ///       \begin{bmatrix} x^T   \\ y^T    \end{bmatrix}.
21 | /// \]
22 | ///
23 | /// @see rotg to generate the rotation.
24 | ///
25 | /// Generic implementation for arbitrary data types.
26 | ///
27 | /// @param[in] n
28 | ///     Number of elements in x and y. n >= 0.
29 | ///
30 | /// @param[in, out] x
31 | ///     The n-element vector x, in an array of length (n-1)*abs(incx) + 1.
32 | ///
33 | /// @param[in] incx
34 | ///     Stride between elements of x. incx must not be zero.
35 | ///     If incx < 0, uses elements of x in reverse order: x(n-1), ..., x(0).
36 | ///
37 | /// @param[in, out] y
38 | ///     The n-element vector y, in an array of length (n-1)*abs(incy) + 1.
39 | ///
40 | /// @param[in] incy
41 | ///     Stride between elements of y. incy must not be zero.
42 | ///     If incy < 0, uses elements of y in reverse order: y(n-1), ..., y(0).
43 | ///
44 | /// @param[in] c
45 | ///     Cosine of rotation; real.
46 | ///
47 | /// @param[in] s
48 | ///     Sine of rotation; complex.
49 | ///
50 | /// @ingroup rot
51 | 
52 | template <typename TX, typename TY>
53 | void rot(
54 |     int64_t n,
55 |     TX *x, int64_t incx,
56 |     TY *y, int64_t incy,
57 |     blas::real_type<TX, TY>   c,
58 |     blas::scalar_type<TX, TY> s )
59 | {
60 |     using scalar_t = scalar_type<TX, TY>;
61 | 
62 |     // validate inputs
63 |     blas_error_if( n < 0 );      // standard BLAS returns, doesn't fail
64 |     blas_error_if( incx == 0 );
65 |     blas_error_if( incy == 0 );
66 | 
67 |     // nothing to do for an empty vector or the identity rotation
68 |     bool const identity = (c == 1 && s == scalar_t( 0 ));
69 |     if (n == 0 || identity)
70 |         return;
71 | 
72 |     auto const s_conj = conj( s );  // loop-invariant factor for the y update
73 |     if (incx != 1 || incy != 1) {
74 |         // general strides; a negative stride starts from the far end
75 |         int64_t jx = (incx < 0 ? (1 - n)*incx : 0);
76 |         int64_t jy = (incy < 0 ? (1 - n)*incy : 0);
77 |         for (int64_t k = 0; k < n; ++k) {
78 |             scalar_t const x_new = c*x[jx] + s*y[jy];
79 |             y[jy] = c*y[jy] - s_conj*x[jx];
80 |             x[jx] = x_new;
81 |             jx += incx;
82 |             jy += incy;
83 |         }
84 |     }
85 |     else {
86 |         // both vectors contiguous
87 |         for (int64_t k = 0; k < n; ++k) {
88 |             scalar_t const x_new = c*x[k] + s*y[k];
89 |             y[k] = c*y[k] - s_conj*x[k];
90 |             x[k] = x_new;
91 |         }
92 |     }
93 | }
94 | 
95 | }  // namespace blas
96 | 
97 | #endif        //  #ifndef BLAS_ROT_HH
98 | 


--------------------------------------------------------------------------------
/include/blas/scal.hh:
--------------------------------------------------------------------------------
 1 | // Copyright (c) 2017-2023, University of Tennessee. All rights reserved.
 2 | // SPDX-License-Identifier: BSD-3-Clause
 3 | // This program is free software: you can redistribute it and/or modify it under
 4 | // the terms of the BSD 3-Clause license. See the accompanying LICENSE file.
 5 | 
 6 | #ifndef BLAS_SCAL_HH
 7 | #define BLAS_SCAL_HH
 8 | 
 9 | #include "blas/util.hh"
10 | 
11 | #include <limits>
12 | 
13 | namespace blas {
14 | 
15 | // =============================================================================
16 | /// Scale vector by constant, $x = \alpha x$.
17 | ///
18 | /// Generic implementation for arbitrary data types.
19 | ///
20 | /// @param[in] n
21 | ///     Number of elements in x. n >= 0.
22 | ///
23 | /// @param[in] alpha
24 | ///     Scalar alpha.
25 | ///
26 | /// @param[in, out] x
27 | ///     The n-element vector x, in an array of length (n-1)*incx + 1.
28 | ///
29 | /// @param[in] incx
30 | ///     Stride between elements of x. incx > 0.
31 | ///
32 | /// @ingroup scal
33 | 
34 | template <typename T>
35 | void scal(
36 |     int64_t n,
37 |     T alpha,
38 |     T* x, int64_t incx )
39 | {
40 |     // validate inputs
41 |     blas_error_if( n < 0 );      // standard BLAS returns, doesn't fail
42 |     blas_error_if( incx <= 0 );  // standard BLAS returns, doesn't fail
43 | 
44 |     if (incx != 1) {
45 |         // strided access: x[0], x[incx], ..., x[(n-1)*incx]
46 |         int64_t const stop = n*incx;
47 |         for (int64_t ix = 0; ix < stop; ix += incx)
48 |             x[ix] *= alpha;
49 |     }
50 |     else {
51 |         // contiguous access
52 |         for (int64_t k = 0; k < n; ++k) {
53 |             x[k] *= alpha;
54 |         }
55 |     }
56 | }
57 | 
58 | }  // namespace blas
59 | 
60 | #endif        //  #ifndef BLAS_SCAL_HH
61 | 


--------------------------------------------------------------------------------
/include/blas/swap.hh:
--------------------------------------------------------------------------------
 1 | // Copyright (c) 2017-2023, University of Tennessee. All rights reserved.
 2 | // SPDX-License-Identifier: BSD-3-Clause
 3 | // This program is free software: you can redistribute it and/or modify it under
 4 | // the terms of the BSD 3-Clause license. See the accompanying LICENSE file.
 5 | 
 6 | #ifndef BLAS_SWAP_HH
 7 | #define BLAS_SWAP_HH
 8 | 
 9 | #include "blas/util.hh"
10 | 
11 | #include <limits>
12 | 
13 | namespace blas {
14 | 
15 | // =============================================================================
16 | /// Swap vectors, $x <=> y$.
17 | ///
18 | /// Generic implementation for arbitrary data types.
19 | ///
20 | /// @param[in] n
21 | ///     Number of elements in x and y. n >= 0.
22 | ///
23 | /// @param[in, out] x
24 | ///     The n-element vector x, in an array of length (n-1)*abs(incx) + 1.
25 | ///
26 | /// @param[in] incx
27 | ///     Stride between elements of x. incx must not be zero.
28 | ///     If incx < 0, uses elements of x in reverse order: x(n-1), ..., x(0).
29 | ///
30 | /// @param[in, out] y
31 | ///     The n-element vector y, in an array of length (n-1)*abs(incy) + 1.
32 | ///
33 | /// @param[in] incy
34 | ///     Stride between elements of y. incy must not be zero.
35 | ///     If incy < 0, uses elements of y in reverse order: y(n-1), ..., y(0).
36 | ///
37 | /// @ingroup swap
38 | 
39 | template <typename TX, typename TY>
40 | void swap(
41 |     int64_t n,
42 |     TX *x, int64_t incx,
43 |     TY *y, int64_t incy )
44 | {
45 |     using std::swap;  // element swap; the unqualified calls below use it
46 | 
47 |     // validate inputs
48 |     blas_error_if( n < 0 );      // standard BLAS returns, doesn't fail
49 |     blas_error_if( incx == 0 );  // standard BLAS doesn't detect inc[xy] == 0
50 |     blas_error_if( incy == 0 );
51 | 
52 |     if (incx != 1 || incy != 1) {
53 |         // general strides; a negative stride starts from the far end
54 |         int64_t jx = (incx < 0 ? (1 - n)*incx : 0);
55 |         int64_t jy = (incy < 0 ? (1 - n)*incy : 0);
56 |         for (int64_t k = 0; k < n; ++k) {
57 |             swap( x[jx], y[jy] );
58 |             jx += incx;
59 |             jy += incy;
60 |         }
61 |     }
62 |     else {
63 |         // both vectors contiguous
64 |         for (int64_t k = 0; k < n; ++k) {
65 |             swap( x[k], y[k] );
66 |         }
67 |     }
68 | }
69 | 
70 | }  // namespace blas
71 | 
72 | #endif        //  #ifndef BLAS_SWAP_HH
73 | 


--------------------------------------------------------------------------------
/lib/pkgconfig/blaspp.pc.in:
--------------------------------------------------------------------------------
 1 | prefix=#PREFIX
 2 | exec_prefix=${prefix}
 3 | libdir=${exec_prefix}/lib
 4 | includedir=${prefix}/include
 5 | CXX=#CXX
 6 | 
 7 | Name: BLAS++
 8 | Description: C++ API to BLAS (basic linear algebra subroutines)
 9 | Version: #VERSION
10 | Cflags: #CXXFLAGS -I${includedir}
11 | # Some platforms require CUDA LIBS to be public.
12 | Libs: -L${libdir} -Wl,-rpath,${libdir} -lblaspp #LDFLAGS #LIBS
13 | Libs.private:
14 | Requires:
15 | Requires.private:
16 | 


--------------------------------------------------------------------------------
/make.inc.in:
--------------------------------------------------------------------------------
 1 | #-------------------------------------------------------------------------------
 2 | # make.inc file
 3 | # auto-generated by: @argv@
 4 | # host: @HOSTNAME@
 5 | # CPATH: @CPATH@
 6 | # LIBRARY_PATH: @LIBRARY_PATH@
 7 | #
 8 | CXX      = @CXX@
 9 | 
10 | CXXFLAGS = @CXXFLAGS@
11 | 
12 | # see include/blas/defines.h
13 | # @DEFINES@
14 | 
15 | LDFLAGS  = @LDFLAGS@
16 | 
17 | LIBS     = @LIBS@
18 | 
19 | prefix   = @prefix@
20 | 
21 | static   = @static@
22 | 


--------------------------------------------------------------------------------
/src/asum.cc:
--------------------------------------------------------------------------------
  1 | // Copyright (c) 2017-2023, University of Tennessee. All rights reserved.
  2 | // SPDX-License-Identifier: BSD-3-Clause
  3 | // This program is free software: you can redistribute it and/or modify it under
  4 | // the terms of the BSD 3-Clause license. See the accompanying LICENSE file.
  5 | 
  6 | #include "blas/fortran.h"
  7 | #include "blas.hh"
  8 | #include "blas_internal.hh"
  9 | #include "blas/counter.hh"
 10 | 
 11 | #include <limits>
 12 | #include <string.h>
 13 | 
 14 | namespace blas {
 15 | 
 16 | //==============================================================================
 17 | namespace internal {
 18 | 
 19 | //------------------------------------------------------------------------------
 20 | /// Low-level overload for float data; calls Fortran BLAS_sasum.
 21 | /// @ingroup asum_internal
 22 | inline float asum( blas_int n, float const* x, blas_int incx )
 23 | {
 24 |     // n and incx are passed to the Fortran routine by address
 25 |     float const sum = BLAS_sasum( &n, x, &incx );
 26 |     return sum;
 27 | }
 28 | 
 29 | //------------------------------------------------------------------------------
 30 | /// Low-level overload for double data; calls Fortran BLAS_dasum.
 31 | /// @ingroup asum_internal
 32 | inline double asum( blas_int n, double const* x, blas_int incx )
 33 | {
 34 |     // n and incx are passed to the Fortran routine by address
 35 |     double const sum = BLAS_dasum( &n, x, &incx );
 36 |     return sum;
 37 | }
 38 | 
 39 | //------------------------------------------------------------------------------
 40 | /// Low-level overload for complex<float> data; calls Fortran BLAS_scasum.
 41 | /// @ingroup asum_internal
 42 | inline float asum( blas_int n, std::complex<float> const* x, blas_int incx )
 43 | {
 44 |     // x is cast to the blas_complex_float* the Fortran prototype is called with
 45 |     float const sum = BLAS_scasum( &n, (blas_complex_float*) x, &incx );
 46 |     return sum;
 47 | }
 48 | 
 49 | //------------------------------------------------------------------------------
 50 | /// Low-level overload for complex<double> data; calls Fortran BLAS_dzasum.
 51 | /// @ingroup asum_internal
 52 | inline double asum( blas_int n, std::complex<double> const* x, blas_int incx )
 53 | {
 54 |     // x is cast to the blas_complex_double* the Fortran prototype is called with
 55 |     double const sum = BLAS_dzasum( &n, (blas_complex_double*) x, &incx );
 56 |     return sum;
 57 | }
 58 | 
 59 | }  // namespace internal
 60 | 
 61 | //==============================================================================
 62 | namespace impl {
 63 | 
 64 | //------------------------------------------------------------------------------
 65 | /// Mid-level templated wrapper: validates n and incx, converts them to
 66 | /// blas_int, then calls the matching low-level internal::asum overload.
 67 | /// @ingroup asum_internal
 68 | ///
 69 | template <typename scalar_t>
 70 | real_type<scalar_t> asum(
 71 |     int64_t n,
 72 |     scalar_t const* x, int64_t incx )
 73 | {
 74 |     // validate inputs
 75 |     blas_error_if( n < 0 );      // standard BLAS returns, doesn't fail
 76 |     blas_error_if( incx <= 0 );  // standard BLAS returns, doesn't fail
 77 | 
 78 |     #if defined( BLAS_HAVE_PAPI )
 79 |         // PAPI instrumentation: record this call, then add its flop count
 80 |         counter::asum_type record;
 81 |         memset( &record, 0, sizeof( record ) );
 82 |         record = { n };
 83 |         counter::insert( record, counter::Id::asum );
 84 | 
 85 |         double flops = 1e9 * blas::Gflop< scalar_t >::asum( n );
 86 |         counter::inc_flop_count( static_cast<long long int>( flops ) );
 87 |     #endif
 88 | 
 89 |     // convert to the integer type of the underlying BLAS
 90 |     blas_int n_blas    = to_blas_int( n );
 91 |     blas_int incx_blas = to_blas_int( incx );
 92 | 
 93 |     // dispatch to the low-level overload for scalar_t
 94 |     return internal::asum( n_blas, x, incx_blas );
 95 | }
 96 | 
 97 | }  // namespace impl
 98 | 
 99 | //==============================================================================
100 | // High-level overloaded wrappers call mid-level templated wrapper.
101 | 
102 | //------------------------------------------------------------------------------
103 | /// CPU, float version; forwards to impl::asum.
104 | /// @ingroup asum
105 | float asum(
106 |     int64_t n, float const* x, int64_t incx )
107 | {
108 |     float const result = impl::asum( n, x, incx );
109 |     return result;
110 | }
111 | 
112 | //------------------------------------------------------------------------------
113 | /// CPU, double version; forwards to impl::asum.
114 | /// @ingroup asum
115 | double asum(
116 |     int64_t n, double const* x, int64_t incx )
117 | {
118 |     double const result = impl::asum( n, x, incx );
119 |     return result;
120 | }
121 | 
122 | //------------------------------------------------------------------------------
123 | /// CPU, complex<float> version; forwards to impl::asum.
124 | /// @ingroup asum
125 | float asum(
126 |     int64_t n, std::complex<float> const* x, int64_t incx )
127 | {
128 |     float const result = impl::asum( n, x, incx );
129 |     return result;
130 | }
131 | 
132 | //------------------------------------------------------------------------------
133 | /// CPU, complex<double> version; forwards to impl::asum.
134 | /// @ingroup asum
135 | double asum(
136 |     int64_t n, std::complex<double> const* x, int64_t incx )
137 | {
138 |     double const result = impl::asum( n, x, incx );
139 |     return result;
140 | }
141 | 
142 | }  // namespace blas
143 | 


--------------------------------------------------------------------------------
/src/blas_internal.hh:
--------------------------------------------------------------------------------
 1 | // Copyright (c) 2017-2023, University of Tennessee. All rights reserved.
 2 | // SPDX-License-Identifier: BSD-3-Clause
 3 | // This program is free software: you can redistribute it and/or modify it under
 4 | // the terms of the BSD 3-Clause license. See the accompanying LICENSE file.
 5 | 
 6 | #ifndef BLAS_INTERNAL_HH
 7 | #define BLAS_INTERNAL_HH
 8 | 
 9 | #include "blas/util.hh"
10 | 
11 | namespace blas {
12 | 
13 | //------------------------------------------------------------------------------
14 | /// @see to_blas_int
15 | ///
16 | inline blas_int to_blas_int_( int64_t x, const char* x_str )
17 | {
18 |     if (sizeof(int64_t) > sizeof(blas_int)) {
19 |         blas_error_if_msg( x > std::numeric_limits<blas_int>::max(), "%s", x_str );
20 |     }
21 |     return blas_int( x );
22 | }
23 | 
24 | //----------------------------------------
25 | /// Convert int64_t to blas_int.
26 | /// If blas_int is 64-bit, this does nothing.
27 | /// If blas_int is 32-bit, throws if x > INT_MAX, so conversion would overflow.
28 | ///
29 | /// Note this is in src/blas_internal.hh, so this macro won't pollute
30 | /// the namespace when apps #include <blas.hh>.
31 | ///
32 | #define to_blas_int( x ) to_blas_int_( x, #x )
33 | 
34 | }  // namespace blas
35 | 
36 | #endif // BLAS_INTERNAL_HH
37 | 


--------------------------------------------------------------------------------
/src/cuda/device_conj.cu:
--------------------------------------------------------------------------------
  1 | #include "blas/device.hh"
  2 | #include "thrust/complex.h"
  3 | 
  4 | #if defined(BLAS_HAVE_CUBLAS)
  5 | 
  6 | namespace blas {
  7 | 
  8 | __device__ std::complex<float> conj_convert( std::complex<float> z )
  9 | {
 10 |     cuComplex* z_cu = reinterpret_cast<cuComplex*>( &z );  // alias the by-value copy
 11 |     z_cu->y *= -1;  // flip the sign of the .y component
 12 |     return z;
 13 | }
 14 | 
 15 | __device__ std::complex<double> conj_convert( std::complex<double> z )
 16 | {
 17 |     cuDoubleComplex* z_cu = reinterpret_cast<cuDoubleComplex*>( &z );  // alias the by-value copy
 18 |     z_cu->y *= -1;  // flip the sign of the .y component
 19 |     return z;
 20 | }
 21 | 
 22 | // Each thread conjugates 1 item: dst[ i*inc_dst + i_dst ] from src[ i*inc_src + i_src ].
 23 | template <typename TS, typename TD>
 24 | __global__ void conj_kernel(
 25 |     int64_t n,
 26 |     TS const* src, int64_t inc_src, int64_t i_src,
 27 |     TD* dst, int64_t inc_dst, int64_t i_dst)
 28 | {
 29 |     // Global element index, computed in 64 bits: the 32-bit product
 30 |     // blockIdx.x * blockDim.x would wrap for very large n.
 31 |     int64_t i = int64_t( blockIdx.x ) * blockDim.x + threadIdx.x;
 32 |     if (i < n)
 33 |         dst[ i*inc_dst + i_dst ] = conj_convert( src[ i*inc_src + i_src ] );
 34 | }
 35 | 
 36 | //------------------------------------------------------------------------------
 37 | /// Conjugates each element of the vector src and stores in dst.
 38 | ///
 39 | /// @param[in] n
 40 | ///     Number of elements in the vector. n >= 0.
 41 | ///
 42 | /// @param[in] src
 43 | ///     Pointer to the input vector of length n.
 44 | ///
 45 | /// @param[in] inc_src
 46 | ///     Stride between elements of src. inc_src >= 1.
 47 | ///
 48 | /// @param[out] dst
 49 | ///     Pointer to output vector
 50 | ///     On exit, each element dst[i] is updated as dst[i] = conj( src[i] ).
 51 | ///     dst may be the same as src.
 52 | ///
 53 | /// @param[in] inc_dst
 54 | ///     Stride between elements of dst. inc_dst >= 1.
 55 | ///
 56 | /// @param[in] queue
 57 | ///     BLAS++ queue to execute in.
 58 | ///
 59 | template <typename TS, typename TD>
 60 | void conj(
 61 |     int64_t n,
 62 |     TS const* src, int64_t inc_src,
 63 |     TD* dst, int64_t inc_dst,
 64 |     blas::Queue& queue )
 65 | {
 66 |     if (n <= 0) {
 67 |         return;  // quick return: no elements to conjugate
 68 |     }
 69 | 
 70 |     const int64_t BlockSize = 128;  // upper bound on threads per block
 71 | 
 72 |     int64_t n_threads = min( BlockSize, n );  // no more threads than elements
 73 |     int64_t n_blocks = ceildiv(n, n_threads);  // one thread per element (see conj_kernel)
 74 |     // starting offsets: 0 for a positive stride, (1 - n)*inc for a non-positive one
 75 |     int64_t i_src = (inc_src > 0 ? 0 : (1 - n) * inc_src);
 76 |     int64_t i_dst = (inc_dst > 0 ? 0 : (1 - n) * inc_dst);
 77 | 
 78 |     blas_dev_call(
 79 |         cudaSetDevice( queue.device() ) );
 80 |     // launch on the queue's stream with no dynamic shared memory
 81 |     conj_kernel<<<n_blocks, n_threads, 0, queue.stream()>>>(
 82 |         n, src, inc_src, i_src, dst, inc_dst, i_dst );
 83 |     // check the status left by the launch
 84 |     blas_dev_call(
 85 |         cudaGetLastError() );
 86 | }
 87 | 
 88 | //------------------------------------------------------------------------------
 89 | // Explicit instantiations for std::complex<float> and std::complex<double>.
 90 | template void conj(
 91 |     int64_t n,
 92 |     std::complex<float> const* src, int64_t inc_src,
 93 |     std::complex<float>* dst, int64_t inc_dst,
 94 |     blas::Queue& queue);
 95 | 
 96 | template void conj(
 97 |     int64_t n,
 98 |     std::complex<double> const* src, int64_t inc_src,
 99 |     std::complex<double>* dst, int64_t inc_dst,
100 |     blas::Queue& queue);
101 | 
102 | } // namespace blas
103 | 
104 | #endif // BLAS_HAVE_CUBLAS
105 | 


--------------------------------------------------------------------------------
/src/cuda/device_shift_vec.cu:
--------------------------------------------------------------------------------
 1 | #include "blas/device.hh"
 2 | 
 3 | #if defined(BLAS_HAVE_CUBLAS)
 4 | 
 5 | namespace blas {
 6 | 
 7 | // Adds c to v[0..n-1]. Indexing ignores blockIdx, so it expects a single-block launch.
 8 | template <typename scalar_t>
 9 | __global__ void shift_vec_kernel( int64_t n, scalar_t* v, scalar_t c )
10 | {
11 |     // 64-bit index: an int counter would overflow before reaching n > INT_MAX
12 |     for (int64_t i = threadIdx.x; i < n; i += blockDim.x) {
13 |         v[ i ] += c;
14 |     }
15 | }
16 | 
17 | //------------------------------------------------------------------------------
18 | /// Shifts each element of the vector v by a constant value c.
19 | ///
20 | /// @param[in] n
21 | ///     Number of elements in the vector. n >= 0.
22 | ///
23 | /// @param[in,out] v
24 | ///     Pointer to the vector of length n.
25 | ///     On exit, each element v[i] is updated as v[i] += c.
26 | ///
27 | /// @param[in] c
28 | ///     Scalar value to be added to each element of v.
29 | ///
30 | /// @param[in] queue
31 | ///     BLAS++ queue to execute in.
32 | ///
33 | template <typename scalar_t>
34 | void shift_vec(
35 |     int64_t n, scalar_t* v, scalar_t c,
36 |     blas::Queue& queue)
37 | {
38 |     // empty vector: nothing to launch
39 |     if (n == 0)
40 |         return;
41 | 
42 |     // one block of at most 1024 threads; the kernel loops over the rest of v
43 |     int64_t const max_threads = 1024;
44 |     int64_t const nthreads = (n < max_threads ? n : max_threads);
45 | 
46 |     blas_dev_call( cudaSetDevice( queue.device() ) );
47 | 
48 |     shift_vec_kernel<<<1, nthreads, 0, queue.stream()>>>( n, v, c );
49 | 
50 |     // check the status left by the launch
51 |     blas_dev_call( cudaGetLastError() );
52 | }
53 | 
54 | //------------------------------------------------------------------------------
55 | // Explicit instantiations for int64_t and int element types.
56 | template
57 | void shift_vec(
58 |     int64_t n, int64_t* v,
59 |     int64_t c,
60 |     blas::Queue& queue);
61 | 
62 | template
63 | void shift_vec(
64 |     int64_t n, int* v,
65 |     int c,
66 |     blas::Queue& queue);
67 | 
68 | } // namespace blas
69 | 
70 | #endif // BLAS_HAVE_CUBLAS
71 | 


--------------------------------------------------------------------------------
/src/device_asum.cc:
--------------------------------------------------------------------------------
  1 | // Copyright (c) 2017-2023, University of Tennessee. All rights reserved.
  2 | // SPDX-License-Identifier: BSD-3-Clause
  3 | // This program is free software: you can redistribute it and/or modify it under
  4 | // the terms of the BSD 3-Clause license. See the accompanying LICENSE file.
  5 | 
  6 | #include "blas/device_blas.hh"
  7 | #include "blas/counter.hh"
  8 | 
  9 | #include "device_internal.hh"
 10 | 
 11 | #include <limits>
 12 | 
 13 | namespace blas {
 14 | 
 15 | // =============================================================================
 16 | namespace impl {
 17 | 
 18 | //------------------------------------------------------------------------------
 19 | /// Mid-level templated wrapper checks and converts arguments,
 20 | /// then calls low-level wrapper.
 21 | /// @ingroup asum_internal
 22 | /// Throws blas::Error when built without BLAS_HAVE_DEVICE.
 23 | template <typename scalar_t>
 24 | void asum(
 25 |     int64_t n,
 26 |     scalar_t const* x, int64_t incx,
 27 |     real_type<scalar_t>* result,
 28 |     blas::Queue& queue)
 29 | {
 30 | #ifndef BLAS_HAVE_DEVICE
 31 |     throw blas::Error( "device BLAS not available", __func__ );
 32 | #else
 33 |     // check arguments
 34 |     blas_error_if( n < 0 );       // standard BLAS returns, doesn't fail
 35 |     blas_error_if( incx <= 0 );   // standard BLAS returns, doesn't fail
 36 | 
 37 |     #ifdef BLAS_HAVE_PAPI
 38 |         // PAPI instrumentation: record this call, then add its flop count
 39 |         counter::dev_asum_type element;
 40 |         memset( &element, 0, sizeof( element ) );
 41 |         element = { n };
 42 |         counter::insert( element, counter::Id::dev_asum );
 43 | 
 44 |         double gflops = 1e9 * blas::Gflop< scalar_t >::asum( n );  // Gflop value scaled by 1e9
 45 |         counter::inc_flop_count( (long long int)gflops );
 46 |     #endif
 47 | 
 48 |     // convert arguments to device_blas_int
 49 |     device_blas_int n_    = to_device_blas_int( n );
 50 |     device_blas_int incx_ = to_device_blas_int( incx );
 51 | 
 52 |     blas::internal_set_device( queue.device() );
 53 | 
 54 |     // call low-level wrapper
 55 |     #if defined( BLAS_HAVE_SYCL )
 56 |         sycl::queue syclq = queue.stream();
 57 |         // check how the result scalar was allocated
 58 |         auto result_ptr_type = sycl::get_pointer_type( result, syclq.get_context() );
 59 |         // if result was outside SYCL/USM memory allocation, use device workspace
 60 |         if (result_ptr_type == sycl::usm::alloc::unknown) {
 61 |             // use preallocated device workspace (resizing if needed); sized by sizeof(scalar_t)
 62 |             queue.work_ensure_size< char >( sizeof(scalar_t) );  // syncs if needed
 63 |             real_type<scalar_t>* dev_work = (real_type<scalar_t>*)queue.work();
 64 |             internal::asum( n_, x, incx_, dev_work, queue );
 65 |             blas::device_memcpy( result, dev_work, 1, queue );  // copy the one result element to result
 66 |         }
 67 |         else {
 68 |             internal::asum( n_, x, incx_, result, queue );  // result pointer is passed straight through
 69 |         }
 70 |     #else  // other devices (CUDA/HIP)
 71 |         internal::asum( n_, x, incx_, result, queue );
 72 |     #endif
 73 | #endif
 74 | }
 75 | 
 76 | } // namespace impl
 77 | 
 78 | //==============================================================================
 79 | // High-level overloaded wrappers call mid-level templated wrapper.
 80 | 
 81 | //------------------------------------------------------------------------------
 82 | /// GPU device, float version.
 83 | /// Unlike the CPU version, `result` is an output parameter here; it
 84 | /// receives the value when the asynchronous execution completes.
 85 | /// @ingroup asum
 86 | void asum(
 87 |     int64_t n, float const* x, int64_t incx,
 88 |     float* result,
 89 |     blas::Queue& queue)
 90 | {
 91 |     // forward to the shared templated implementation
 92 |     impl::asum<float>( n, x, incx, result, queue );
 93 | }
 94 | 
 95 | //------------------------------------------------------------------------------
 96 | /// GPU device, double version.
 97 | /// @ingroup asum
 98 | void asum(
 99 |     int64_t n, double const* x, int64_t incx,
100 |     double* result,
101 |     blas::Queue& queue)
102 | {
103 |     // forward to the shared templated implementation
104 |     impl::asum<double>( n, x, incx, result, queue );
105 | }
106 | 
107 | //------------------------------------------------------------------------------
108 | /// GPU device, complex<float> version.
109 | /// @ingroup asum
110 | void asum(
111 |     int64_t n, std::complex<float> const *x, int64_t incx,
112 |     float* result,
113 |     blas::Queue& queue)
114 | {
115 |     // forward to the shared templated implementation
116 |     impl::asum< std::complex<float> >( n, x, incx, result, queue );
117 | }
118 | 
119 | //------------------------------------------------------------------------------
120 | /// GPU device, complex<double> version.
121 | /// @ingroup asum
122 | void asum(
123 |     int64_t n, std::complex<double> const* x, int64_t incx,
124 |     double* result,
125 |     blas::Queue& queue)
126 | {
127 |     // forward to the shared templated implementation
128 |     impl::asum< std::complex<double> >( n, x, incx, result, queue );
129 | }
130 | 
131 | } // namespace blas


--------------------------------------------------------------------------------
/src/device_axpy.cc:
--------------------------------------------------------------------------------
  1 | // Copyright (c) 2017-2023, University of Tennessee. All rights reserved.
  2 | // SPDX-License-Identifier: BSD-3-Clause
  3 | // This program is free software: you can redistribute it and/or modify it under
  4 | // the terms of the BSD 3-Clause license. See the accompanying LICENSE file.
  5 | 
  6 | #include "blas/device_blas.hh"
  7 | #include "blas/counter.hh"
  8 | 
  9 | #include "device_internal.hh"
 10 | 
 11 | #include <limits>
 12 | 
 13 | namespace blas {
 14 | 
 15 | //==============================================================================
 16 | namespace impl {
 17 | 
 18 | //------------------------------------------------------------------------------
 19 | /// Mid-level templated wrapper: validates n, incx, incy, converts them to
 20 | /// device_blas_int, then calls the low-level internal::axpy.
 21 | /// @ingroup axpy_internal
 22 | ///
 23 | template <typename scalar_t>
 24 | void axpy(
 25 |     int64_t n,
 26 |     scalar_t alpha,
 27 |     scalar_t const* x, int64_t incx,
 28 |     scalar_t*       y, int64_t incy,
 29 |     blas::Queue& queue)
 30 | {
 31 | #if defined( BLAS_HAVE_DEVICE )
 32 |     // validate inputs
 33 |     blas_error_if( n < 0 );      // standard BLAS returns, doesn't fail
 34 |     blas_error_if( incx == 0 );  // standard BLAS doesn't detect inc[xy] == 0
 35 |     blas_error_if( incy == 0 );
 36 | 
 37 |     // convert to device_blas_int
 38 |     device_blas_int n_dev    = to_device_blas_int( n );
 39 |     device_blas_int incx_dev = to_device_blas_int( incx );
 40 |     device_blas_int incy_dev = to_device_blas_int( incy );
 41 | 
 42 |     blas::internal_set_device( queue.device() );
 43 | 
 44 |     // dispatch to the low-level wrapper
 45 |     internal::axpy( n_dev, alpha, x, incx_dev, y, incy_dev, queue );
 46 | #else
 47 |     throw blas::Error( "device BLAS not available", __func__ );
 48 | #endif
 49 | }
 50 | 
 51 | }  // namespace impl
 52 | 
 53 | //==============================================================================
 54 | // High-level overloaded wrappers call mid-level templated wrapper.
 55 | 
 56 | //------------------------------------------------------------------------------
 57 | /// GPU device, float version; forwards to impl::axpy.
 58 | /// @ingroup axpy
 59 | void axpy(
 60 |     int64_t n, float alpha,
 61 |     float const* x, int64_t incx,
 62 |     float*       y, int64_t incy,
 63 |     blas::Queue& queue)
 64 | {
 65 |     // explicit scalar type selects the float instantiation
 66 |     impl::axpy<float>( n, alpha, x, incx, y, incy, queue );
 67 | }
 68 | 
 69 | //------------------------------------------------------------------------------
 70 | /// GPU device, double version; forwards to impl::axpy.
 71 | /// @ingroup axpy
 72 | void axpy(
 73 |     int64_t n, double alpha,
 74 |     double const* x, int64_t incx,
 75 |     double*       y, int64_t incy,
 76 |     blas::Queue& queue)
 77 | {
 78 |     // explicit scalar type selects the double instantiation
 79 |     impl::axpy<double>( n, alpha, x, incx, y, incy, queue );
 80 | }
 81 | 
 82 | //------------------------------------------------------------------------------
 83 | /// GPU device, complex<float> version; forwards to impl::axpy.
 84 | /// @ingroup axpy
 85 | void axpy(
 86 |     int64_t n, std::complex<float> alpha,
 87 |     std::complex<float> const* x, int64_t incx,
 88 |     std::complex<float>*       y, int64_t incy,
 89 |     blas::Queue& queue)
 90 | {
 91 |     // explicit scalar type selects the complex<float> instantiation
 92 |     impl::axpy< std::complex<float> >( n, alpha, x, incx, y, incy, queue );
 93 | }
 94 | 
 95 | //------------------------------------------------------------------------------
 96 | /// GPU device, complex<double> version; forwards to impl::axpy.
 97 | /// @ingroup axpy
 98 | void axpy(
 99 |     int64_t n, std::complex<double> alpha,
100 |     std::complex<double> const* x, int64_t incx,
101 |     std::complex<double>*       y, int64_t incy,
102 |     blas::Queue& queue)
103 | {
104 |     // explicit scalar type selects the complex<double> instantiation
105 |     impl::axpy< std::complex<double> >( n, alpha, x, incx, y, incy, queue );
106 | }
107 | 
108 | }  // namespace blas
109 | 


--------------------------------------------------------------------------------
/src/device_copy.cc:
--------------------------------------------------------------------------------
  1 | // Copyright (c) 2017-2023, University of Tennessee. All rights reserved.
  2 | // SPDX-License-Identifier: BSD-3-Clause
  3 | // This program is free software: you can redistribute it and/or modify it under
  4 | // the terms of the BSD 3-Clause license. See the accompanying LICENSE file.
  5 | 
  6 | #include "blas/device_blas.hh"
  7 | #include "blas/counter.hh"
  8 | 
  9 | #include "device_internal.hh"
 10 | 
 11 | #include <limits>
 12 | #include <string.h>
 13 | 
 14 | namespace blas {
 15 | 
 16 | //==============================================================================
 17 | namespace impl {
 18 | 
 19 | //------------------------------------------------------------------------------
 20 | /// Mid-level templated wrapper: validates n, incx, incy, converts them to
 21 | /// device_blas_int, then calls the low-level internal::copy.
 22 | /// @ingroup copy_internal
 23 | ///
 24 | template <typename scalar_t>
 25 | void copy(
 26 |     int64_t n,
 27 |     scalar_t const* x, int64_t incx,
 28 |     scalar_t*       y, int64_t incy,
 29 |     blas::Queue& queue )
 30 | {
 31 | #if defined( BLAS_HAVE_DEVICE )
 32 |     // validate inputs
 33 |     blas_error_if( n < 0 );      // standard BLAS returns, doesn't fail
 34 |     blas_error_if( incx == 0 );  // standard BLAS doesn't detect inc[xy] == 0
 35 |     blas_error_if( incy == 0 );
 36 | 
 37 |     #if defined( BLAS_HAVE_PAPI )
 38 |         // PAPI instrumentation: record this call, then add its flop count
 39 |         counter::dev_copy_type record;
 40 |         memset( &record, 0, sizeof( record ) );
 41 |         record = { n };
 42 |         counter::insert( record, counter::Id::dev_copy );
 43 | 
 44 |         double flops = 1e9 * blas::Gflop< scalar_t >::copy( n );
 45 |         counter::inc_flop_count( static_cast<long long int>( flops ) );
 46 |     #endif
 47 | 
 48 |     // convert to device_blas_int
 49 |     device_blas_int n_dev    = to_device_blas_int( n );
 50 |     device_blas_int incx_dev = to_device_blas_int( incx );
 51 |     device_blas_int incy_dev = to_device_blas_int( incy );
 52 | 
 53 |     blas::internal_set_device( queue.device() );
 54 | 
 55 |     // dispatch to the low-level wrapper
 56 |     internal::copy( n_dev, x, incx_dev, y, incy_dev, queue );
 57 | #else
 58 |     throw blas::Error( "device BLAS not available", __func__ );
 59 | #endif
 60 | }
 61 | 
 62 | }  // namespace impl
 63 | 
 64 | //==============================================================================
 65 | // High-level overloaded wrappers call mid-level templated wrapper.
 66 | 
 67 | //------------------------------------------------------------------------------
 68 | /// GPU device, float version.
 69 | /// @ingroup copy
 70 | void copy(
 71 |     int64_t n,
 72 |     float const* x, int64_t incx,
 73 |     float*       y, int64_t incy,
 74 |     blas::Queue& queue )
 75 | {
 76 |     impl::copy( n, x, incx, y, incy, queue );
 77 | }
 78 | 
 79 | //------------------------------------------------------------------------------
 80 | /// GPU device, double version: forwards all arguments unchanged to impl::copy.
 81 | /// @ingroup copy
 82 | void copy(
 83 |     int64_t n,  // impl::copy flags n < 0 as an error
 84 |     double const* x, int64_t incx,  // source (const); incx == 0 is flagged as an error
 85 |     double*       y, int64_t incy,  // destination; incy == 0 is flagged as an error
 86 |     blas::Queue& queue )  // queue.device() is selected before the low-level call
 87 | {
 88 |     impl::copy( n, x, incx, y, incy, queue );  // scalar_t deduced as double
 89 | }
 90 | 
 91 | //------------------------------------------------------------------------------
 92 | /// GPU device, complex<float> version: forwards all arguments unchanged to impl::copy.
 93 | /// @ingroup copy
 94 | void copy(
 95 |     int64_t n,  // impl::copy flags n < 0 as an error
 96 |     std::complex<float> const* x, int64_t incx,  // source (const); incx == 0 is an error
 97 |     std::complex<float>*       y, int64_t incy,  // destination; incy == 0 is an error
 98 |     blas::Queue& queue )  // queue.device() is selected before the low-level call
 99 | {
100 |     impl::copy( n, x, incx, y, incy, queue );  // scalar_t deduced as std::complex<float>
101 | }
102 | 
103 | //------------------------------------------------------------------------------
104 | /// GPU device, complex<double> version: forwards all arguments unchanged to impl::copy.
105 | /// @ingroup copy
106 | void copy(
107 |     int64_t n,  // impl::copy flags n < 0 as an error
108 |     std::complex<double> const* x, int64_t incx,  // source (const); incx == 0 is an error
109 |     std::complex<double>*       y, int64_t incy,  // destination; incy == 0 is an error
110 |     blas::Queue& queue )  // queue.device() is selected before the low-level call
111 | {
112 |     impl::copy( n, x, incx, y, incy, queue );  // scalar_t deduced as std::complex<double>
113 | }
114 | 
115 | }  // namespace blas
116 | 


--------------------------------------------------------------------------------
/src/device_error.cc:
--------------------------------------------------------------------------------
 1 | // Copyright (c) 2017-2023, University of Tennessee. All rights reserved.
 2 | // SPDX-License-Identifier: BSD-3-Clause
 3 | // This program is free software: you can redistribute it and/or modify it under
 4 | // the terms of the BSD 3-Clause license. See the accompanying LICENSE file.
 5 | 
 6 | #include "blas/device.hh"
 7 | 
 8 | #include "device_internal.hh"
 9 | 
10 | #ifdef BLAS_HAVE_CUBLAS
11 | 
12 | // -----------------------------------------------------------------------------
13 | // Returns a string literal describing a cuBLAS status code; every message is prefixed "cublas: ".
14 | const char* blas::device_error_string( cublasStatus_t error )
15 | {
16 |     switch (error) {
17 |         case CUBLAS_STATUS_SUCCESS:
18 |             return "cublas: success";
19 | 
20 |         case CUBLAS_STATUS_NOT_INITIALIZED:
21 |             return "cublas: not initialized";
22 | 
23 |         case CUBLAS_STATUS_ALLOC_FAILED:
24 |             return "cublas: out of memory";
25 | 
26 |         case CUBLAS_STATUS_INVALID_VALUE:
27 |             return "cublas: invalid value";
28 | 
29 |         case CUBLAS_STATUS_ARCH_MISMATCH:
30 |             return "cublas: architecture mismatch";
31 | 
32 |         case CUBLAS_STATUS_MAPPING_ERROR:
33 |             return "cublas: memory mapping error";
34 | 
35 |         case CUBLAS_STATUS_EXECUTION_FAILED:
36 |             return "cublas: execution failed";
37 | 
38 |         case CUBLAS_STATUS_INTERNAL_ERROR:
39 |             return "cublas: internal error";
40 | 
41 |         case CUBLAS_STATUS_NOT_SUPPORTED:
42 |             return "cublas: functionality not supported";
43 | 
44 |         case CUBLAS_STATUS_LICENSE_ERROR:
45 |             return "cublas: license error";
46 | 
47 |         default:  // any status value not listed above
48 |             return "cublas: unknown error code";
49 |     }
50 | }
51 | 
52 | #endif  // BLAS_HAVE_CUBLAS
53 | 


--------------------------------------------------------------------------------
/src/device_iamax.cc:
--------------------------------------------------------------------------------
  1 | // Copyright (c) 2017-2023, University of Tennessee. All rights reserved.
  2 | // SPDX-License-Identifier: BSD-3-Clause
  3 | // This program is free software: you can redistribute it and/or modify it under
  4 | // the terms of the BSD 3-Clause license. See the accompanying LICENSE file.
  5 | 
  6 | #include "blas/device_blas.hh"
  7 | #include "blas/counter.hh"
  8 | 
  9 | #include "device_internal.hh"
 10 | 
 11 | #include <limits>
 12 | #include <string.h>
 13 | namespace blas {
 14 | 
 15 | // =============================================================================
 16 | namespace impl {
 17 | 
 18 | //------------------------------------------------------------------------------
 19 | /// Mid-level templated wrapper: checks n and incx, selects the queue's
 20 | /// device, then calls the low-level internal::iamax wrapper (no integer
 21 | /// conversion is done here). Throws blas::Error without BLAS_HAVE_DEVICE.
 22 | /// @ingroup iamax
 23 | template <typename scalar_t>
 24 | void iamax(
 25 |     int64_t n,
 26 |     scalar_t const* x, int64_t incx,
 27 |     int64_t* result,  // output location; on SYCL it may be staged through queue workspace
 28 |     blas::Queue& queue)
 29 | {
 30 | #ifndef BLAS_HAVE_DEVICE
 31 |     throw blas::Error( "device BLAS not available", __func__ );
 32 | #else
 33 |     // check arguments
 34 |     blas_error_if( n < 0 );       // standard BLAS returns, doesn't fail
 35 |     blas_error_if( incx <= 0 );   // standard BLAS returns, doesn't fail
 36 | 
 37 |     #ifdef BLAS_HAVE_PAPI
 38 |         // PAPI instrumentation: record n under counter::Id::dev_iamax
 39 |         counter::dev_iamax_type element;
 40 |         memset( &element, 0, sizeof( element ) );  // zero every byte before assigning fields
 41 |         element = { n };
 42 |         counter::insert( element, counter::Id::dev_iamax );
 43 | 
 44 |         double gflops = 1e9 * blas::Gflop< scalar_t >::iamax( n );  // Gflop value scaled by 1e9
 45 |         counter::inc_flop_count( (long long int)gflops );
 46 |     #endif  // BLAS_HAVE_PAPI
 47 | 
 48 |     blas::internal_set_device( queue.device() );
 49 | 
 50 |     // call low-level wrapper
 51 |     #if defined( BLAS_HAVE_SYCL )
 52 |         sycl::queue syclq = queue.stream();
 53 |         // check how the result scalar was allocated
 54 |         auto result_ptr_type = sycl::get_pointer_type( result, syclq.get_context() );
 55 |         // if result was outside SYCL/USM memory allocation, use device workspace
 56 |         if (result_ptr_type == sycl::usm::alloc::unknown) {
 57 |             // use preallocated device workspace (resizing if needed)
 58 |             queue.work_ensure_size< char >( sizeof(int64_t) );  // syncs if needed
 59 |             int64_t* dev_work = (int64_t*)queue.work();
 60 |             internal::iamax( n, x, incx, dev_work, queue );
 61 |             blas::device_memcpy( result, dev_work, 1, queue );  // copy the single int64_t to result
 62 |         }
 63 |         else {
 64 |             internal::iamax( n, x, incx, result, queue );  // pointer type is known: write directly
 65 |         }
 66 |     #else  // non-SYCL builds
 67 |         internal::iamax( n, x, incx, result, queue );
 68 |     #endif  // BLAS_HAVE_SYCL
 69 | #endif  // BLAS_HAVE_DEVICE
 70 | }
 71 | 
 72 | } // namespace impl
 73 | 
 74 | //==============================================================================
 75 | // High-level overloaded wrappers call mid-level templated wrapper.
 76 | 
 77 | //------------------------------------------------------------------------------
 78 | /// GPU device, float version: forwards all arguments unchanged to impl::iamax.
 79 | /// Unlike CPU version, here `result` is an output parameter,
 80 | /// to store the result when the asynchronous execution completes.
 81 | /// @ingroup iamax
 82 | void iamax(
 83 |     int64_t n,  // impl::iamax flags n < 0 as an error
 84 |     float const* x, int64_t incx,  // input (const); incx <= 0 is flagged as an error
 85 |     int64_t* result,  // output location, see note above
 86 |     blas::Queue& queue)  // queue.device() is selected before the low-level call
 87 | {
 88 |     impl::iamax( n, x, incx, result, queue );  // scalar_t deduced as float
 89 | }
 90 | 
 91 | //------------------------------------------------------------------------------
 92 | /// GPU device, double version: forwards all arguments unchanged to impl::iamax.
 93 | /// @ingroup iamax
 94 | void iamax(
 95 |     int64_t n,  // impl::iamax flags n < 0 as an error
 96 |     double const* x, int64_t incx,  // input (const); incx <= 0 is flagged as an error
 97 |     int64_t* result,  // output location for the result
 98 |     blas::Queue& queue)  // queue.device() is selected before the low-level call
 99 | {
100 |     impl::iamax( n, x, incx, result, queue );  // scalar_t deduced as double
101 | }
102 | 
103 | //------------------------------------------------------------------------------
104 | /// GPU device, complex<float> version: forwards all arguments unchanged to impl::iamax.
105 | /// @ingroup iamax
106 | void iamax(
107 |     int64_t n,  // impl::iamax flags n < 0 as an error
108 |     std::complex<float> const *x, int64_t incx,  // input (const); incx <= 0 is an error
109 |     int64_t* result,  // output location for the result
110 |     blas::Queue& queue)  // queue.device() is selected before the low-level call
111 | {
112 |     impl::iamax( n, x, incx, result, queue );  // scalar_t deduced as std::complex<float>
113 | }
114 | 
115 | //------------------------------------------------------------------------------
116 | /// GPU device, complex<double> version: forwards all arguments unchanged to impl::iamax.
117 | /// @ingroup iamax
118 | void iamax(
119 |     int64_t n,  // impl::iamax flags n < 0 as an error
120 |     std::complex<double> const* x, int64_t incx,  // input (const); incx <= 0 is an error
121 |     int64_t* result,  // output location for the result
122 |     blas::Queue& queue)  // queue.device() is selected before the low-level call
123 | {
124 |     impl::iamax( n, x, incx, result, queue );  // scalar_t deduced as std::complex<double>
125 | }
126 | 
127 | } // namespace blas
128 | 


--------------------------------------------------------------------------------
/src/device_nrm2.cc:
--------------------------------------------------------------------------------
  1 | // Copyright (c) 2017-2023, University of Tennessee. All rights reserved.
  2 | // SPDX-License-Identifier: BSD-3-Clause
  3 | // This program is free software: you can redistribute it and/or modify it under
  4 | // the terms of the BSD 3-Clause license. See the accompanying LICENSE file.
  5 | 
  6 | #include "blas/device_blas.hh"
  7 | #include "blas/counter.hh"
  8 | 
  9 | #include "device_internal.hh"
 10 | 
 11 | #include <limits>
 12 | #include <string.h>
 13 | 
 14 | namespace blas {
 15 | 
 16 | //==============================================================================
 17 | namespace impl {
 18 | 
 19 | //------------------------------------------------------------------------------
 20 | /// Mid-level templated wrapper: checks n and incx, converts them with
 21 | /// to_device_blas_int, selects the queue's device, then calls internal::nrm2.
 22 | /// Throws blas::Error when built without BLAS_HAVE_DEVICE.
 23 | /// @ingroup nrm2_internal
 24 | template <typename scalar_t>
 25 | void nrm2(
 26 |     int64_t n,
 27 |     scalar_t const* x, int64_t incx,
 28 |     real_type<scalar_t>* result,  // output location; real-typed even for complex scalar_t
 29 |     blas::Queue& queue )
 30 | {
 31 | #ifndef BLAS_HAVE_DEVICE
 32 |     throw blas::Error( "device BLAS not available", __func__ );
 33 | #else
 34 |     // check arguments
 35 |     blas_error_if( n < 0 );      // standard BLAS returns, doesn't fail
 36 |     blas_error_if( incx <= 0 );  // standard BLAS returns, doesn't fail
 37 | 
 38 |     #ifdef BLAS_HAVE_PAPI
 39 |         // PAPI instrumentation: record n under counter::Id::dev_nrm2
 40 |         counter::dev_nrm2_type element;
 41 |         memset( &element, 0, sizeof( element ) );  // zero every byte before assigning fields
 42 |         element = { n };
 43 |         counter::insert( element, counter::Id::dev_nrm2 );
 44 | 
 45 |         double gflops = 1e9 * blas::Gflop< scalar_t >::nrm2( n );  // Gflop value scaled by 1e9
 46 |         counter::inc_flop_count( (long long int)gflops );
 47 |     #endif  // BLAS_HAVE_PAPI
 48 | 
 49 |     // convert arguments
 50 |     device_blas_int n_    = to_device_blas_int( n );
 51 |     device_blas_int incx_ = to_device_blas_int( incx );
 52 | 
 53 |     blas::internal_set_device( queue.device() );
 54 | 
 55 |     // call low-level wrapper
 56 |     #if defined( BLAS_HAVE_SYCL )
 57 |         sycl::queue syclq = queue.stream();
 58 |         // check how the result scalar was allocated
 59 |         auto result_ptr_type = sycl::get_pointer_type( result, syclq.get_context() );
 60 |         // if result was outside SYCL/USM memory allocation, use device workspace
 61 |         if (result_ptr_type == sycl::usm::alloc::unknown) {
 62 |             // workspace holds one real_type<scalar_t>, the type stored and copied below
 63 |             queue.work_ensure_size< char >( sizeof(real_type<scalar_t>) );  // syncs if needed
 64 |             real_type<scalar_t>* dev_work = (real_type<scalar_t>*)queue.work();
 65 |             internal::nrm2( n_, x, incx_, dev_work, queue );
 66 |             blas::device_memcpy( result, dev_work, 1, queue );  // copy the single value to result
 67 |         }
 68 |         else {
 69 |             internal::nrm2( n_, x, incx_, result, queue );  // pointer type is known: write directly
 70 |         }
 71 |     #else // other devices (CUDA/HIP)
 72 |         internal::nrm2( n_, x, incx_, result, queue );
 73 |     #endif  // BLAS_HAVE_SYCL
 74 | #endif  // BLAS_HAVE_DEVICE
 75 | }
 76 | 
 77 | }  // namespace impl
 78 | 
 79 | //==============================================================================
 80 | // High-level overloaded wrappers call mid-level templated wrapper.
 81 | 
 82 | //------------------------------------------------------------------------------
 83 | /// GPU device, float version: forwards all arguments unchanged to impl::nrm2.
 84 | /// Unlike CPU version, here `result` is an output parameter,
 85 | /// to store the result when the asynchronous execution completes.
 86 | /// @ingroup nrm2
 87 | void nrm2(
 88 |     int64_t n,  // impl::nrm2 flags n < 0 as an error
 89 |     float const* x, int64_t incx,  // input (const); incx <= 0 is flagged as an error
 90 |     float* result,  // output location, see note above
 91 |     blas::Queue& queue )  // queue.device() is selected before the low-level call
 92 | {
 93 |     impl::nrm2( n, x, incx, result, queue );  // scalar_t deduced as float
 94 | }
 95 | 
 96 | //------------------------------------------------------------------------------
 97 | /// GPU device, double version: forwards all arguments unchanged to impl::nrm2.
 98 | /// @ingroup nrm2
 99 | void nrm2(
100 |     int64_t n,  // impl::nrm2 flags n < 0 as an error
101 |     double const* x, int64_t incx,  // input (const); incx <= 0 is flagged as an error
102 |     double* result,  // output location for the result
103 |     blas::Queue& queue )  // queue.device() is selected before the low-level call
104 | {
105 |     impl::nrm2( n, x, incx, result, queue );  // scalar_t deduced as double
106 | }
107 | 
108 | //------------------------------------------------------------------------------
109 | /// GPU device, complex<float> version: forwards all arguments unchanged to impl::nrm2.
110 | /// @ingroup nrm2
111 | void nrm2(
112 |     int64_t n,  // impl::nrm2 flags n < 0 as an error
113 |     std::complex<float> const* x, int64_t incx,  // input (const); incx <= 0 is an error
114 |     float* result,  // output location; real (float) although x is complex
115 |     blas::Queue& queue )  // queue.device() is selected before the low-level call
116 | {
117 |     impl::nrm2( n, x, incx, result, queue );  // scalar_t deduced as std::complex<float>
118 | }
119 | 
120 | //------------------------------------------------------------------------------
121 | /// GPU device, complex<double> version: forwards all arguments unchanged to impl::nrm2.
122 | /// @ingroup nrm2
123 | void nrm2(
124 |     int64_t n,  // impl::nrm2 flags n < 0 as an error
125 |     std::complex<double> const* x, int64_t incx,  // input (const); incx <= 0 is an error
126 |     double* result,  // output location; real (double) although x is complex
127 |     blas::Queue& queue )  // queue.device() is selected before the low-level call
128 | {
129 |     impl::nrm2( n, x, incx, result, queue );  // scalar_t deduced as std::complex<double>
130 | }
131 | 
132 | }  // namespace blas
133 | 


--------------------------------------------------------------------------------
/src/device_rotg.cc:
--------------------------------------------------------------------------------
  1 | // Copyright (c) 2017-2023, University of Tennessee. All rights reserved.
  2 | // SPDX-License-Identifier: BSD-3-Clause
  3 | // This program is free software: you can redistribute it and/or modify it under
  4 | // the terms of the BSD 3-Clause license. See the accompanying LICENSE file.
  5 | 
  6 | #include "blas/device_blas.hh"
  7 | #include "blas/counter.hh"
  8 | 
  9 | #include "device_internal.hh"
 10 | 
 11 | #include <limits>
 12 | #include <string.h>
 13 | 
 14 | namespace blas {
 15 | 
 16 | //==============================================================================
 17 | namespace impl {
 18 | 
 19 | //------------------------------------------------------------------------------
 20 | /// Mid-level templated wrapper: performs no argument checks or conversions;
 21 | /// it selects the queue's device and passes the pointers to internal::rotg.
 22 | /// Throws blas::Error when built without BLAS_HAVE_DEVICE.
 23 | /// @ingroup rotg_internal
 24 | template <typename scalar_t>
 25 | void rotg(
 26 |     scalar_t* a,
 27 |     scalar_t* b,
 28 |     real_type<scalar_t>* c,  // real-typed even when scalar_t is complex
 29 |     scalar_t* s,
 30 |     blas::Queue& queue )
 31 | {
 32 | #ifndef BLAS_HAVE_DEVICE
 33 |     throw blas::Error( "device BLAS not available", __func__ );
 34 | #else
 35 |     #ifdef BLAS_HAVE_PAPI
 36 |         // PAPI instrumentation: record a constant 1 under counter::Id::dev_rotg
 37 |         counter::dev_rotg_type element;
 38 |         memset( &element, 0, sizeof( element ) );  // zero every byte before assigning fields
 39 |         element = { 1 };
 40 |         counter::insert( element, counter::Id::dev_rotg );
 41 | 
 42 |         // This operation does not incur significant FLOPs, so no
 43 |         // need to call counter::inc_flop_count()
 44 |     #endif  // BLAS_HAVE_PAPI
 45 | 
 46 |     blas::internal_set_device( queue.device() );
 47 | 
 48 |     // call low-level wrapper
 49 |     internal::rotg( a, b, c, s, queue );
 50 | #endif  // BLAS_HAVE_DEVICE
 51 | }
 52 | 
 53 | }  // namespace impl
 54 | 
 55 | //==============================================================================
 56 | // High-level overloaded wrappers call mid-level templated wrapper.
 57 | 
 58 | //------------------------------------------------------------------------------
 59 | /// GPU device, float version: forwards all arguments unchanged to impl::rotg.
 60 | /// @ingroup rotg
 61 | void rotg(
 62 |     float *a,
 63 |     float *b,
 64 |     float *c,
 65 |     float *s,
 66 |     blas::Queue& queue )  // queue.device() is selected before the low-level call
 67 | {
 68 |     impl::rotg( a, b, c, s, queue );  // scalar_t deduced as float; no argument checks
 69 | }
 70 | 
 71 | //------------------------------------------------------------------------------
 72 | /// GPU device, double version: forwards all arguments unchanged to impl::rotg.
 73 | /// @ingroup rotg
 74 | void rotg(
 75 |     double *a,
 76 |     double *b,
 77 |     double *c,
 78 |     double *s,
 79 |     blas::Queue& queue )  // queue.device() is selected before the low-level call
 80 | {
 81 |     impl::rotg( a, b, c, s, queue );  // scalar_t deduced as double; no argument checks
 82 | }
 83 | 
 84 | //------------------------------------------------------------------------------
 85 | /// GPU device, complex<float> version: forwards all arguments unchanged to impl::rotg.
 86 | /// @ingroup rotg
 87 | void rotg(
 88 |     std::complex<float> *a,
 89 |     std::complex<float> *b,
 90 |     float *c,  // real (float), unlike a, b and s
 91 |     std::complex<float> *s,
 92 |     blas::Queue& queue )  // queue.device() is selected before the low-level call
 93 | {
 94 |     impl::rotg( a, b, c, s, queue );  // scalar_t deduced as std::complex<float>
 95 | }
 96 | 
 97 | //------------------------------------------------------------------------------
 98 | /// GPU device, complex<double> version: forwards all arguments unchanged to impl::rotg.
 99 | /// @ingroup rotg
100 | void rotg(
101 |     std::complex<double> *a,
102 |     std::complex<double> *b,
103 |     double *c,  // real (double), unlike a, b and s
104 |     std::complex<double> *s,
105 |     blas::Queue& queue )  // queue.device() is selected before the low-level call
106 | {
107 |     impl::rotg( a, b, c, s, queue );  // scalar_t deduced as std::complex<double>
108 | }
109 | 
110 | }  // namespace blas
111 | 


--------------------------------------------------------------------------------
/src/device_rotm.cc:
--------------------------------------------------------------------------------
 1 | // Copyright (c) 2017-2023, University of Tennessee. All rights reserved.
 2 | // SPDX-License-Identifier: BSD-3-Clause
 3 | // This program is free software: you can redistribute it and/or modify it under
 4 | // the terms of the BSD 3-Clause license. See the accompanying LICENSE file.
 5 | 
 6 | #include "blas/device_blas.hh"
 7 | #include "blas/counter.hh"
 8 | 
 9 | #include "device_internal.hh"
10 | 
11 | #include <limits>
12 | #include <string.h>
13 | 
14 | namespace blas {
15 | 
16 | //==============================================================================
17 | namespace impl {
18 | 
19 | //------------------------------------------------------------------------------
20 | /// Mid-level templated wrapper: checks n, incx and incy, converts them with
21 | /// to_device_blas_int, selects the queue's device, then calls internal::rotm.
22 | /// Throws blas::Error when built without BLAS_HAVE_DEVICE.
23 | /// @ingroup rotm_internal
24 | template <typename scalar_t>
25 | void rotm(
26 |     int64_t n,
27 |     scalar_t* x, int64_t incx,
28 |     scalar_t* y, int64_t incy,
29 |     const scalar_t* param,  // read-only; passed through to internal::rotm unchanged
30 |     blas::Queue& queue )
31 | {
32 | #ifndef BLAS_HAVE_DEVICE
33 |     throw blas::Error( "device BLAS not available", __func__ );
34 | #else
35 |     // check arguments
36 |     blas_error_if( n < 0 );      // standard BLAS returns, doesn't fail
37 |     blas_error_if( incx == 0 );  // standard BLAS doesn't detect inc[xy] == 0
38 |     blas_error_if( incy == 0 );
39 | 
40 |     #ifdef BLAS_HAVE_PAPI
41 |         // PAPI instrumentation: record n under counter::Id::dev_rotm
42 |         counter::dev_rotm_type element;
43 |         memset( &element, 0, sizeof( element ) );  // zero every byte before assigning fields
44 |         element = { n };
45 |         counter::insert( element, counter::Id::dev_rotm );
46 | 
47 |         double gflops = 1e9 * blas::Gflop< scalar_t >::rotm( n );  // Gflop value scaled by 1e9
48 |         counter::inc_flop_count( (long long int)gflops );
49 |     #endif  // BLAS_HAVE_PAPI
50 | 
51 |     // convert arguments
52 |     device_blas_int n_    = to_device_blas_int( n );
53 |     device_blas_int incx_ = to_device_blas_int( incx );
54 |     device_blas_int incy_ = to_device_blas_int( incy );
55 | 
56 |     blas::internal_set_device( queue.device() );
57 | 
58 |     // call low-level wrapper
59 |     internal::rotm( n_, x, incx_, y, incy_, param, queue );
60 | #endif  // BLAS_HAVE_DEVICE
61 | }
62 | 
63 | }  // namespace impl
64 | 
65 | //==============================================================================
66 | // High-level overloaded wrappers call mid-level templated wrapper.
67 | // Not available for complex.
68 | 
69 | //------------------------------------------------------------------------------
70 | /// GPU device, float version: forwards all arguments unchanged to impl::rotm.
71 | /// @ingroup rotm
72 | void rotm(
73 |     int64_t n,  // impl::rotm flags n < 0 as an error
74 |     float* x, int64_t incx,  // non-const; incx == 0 is flagged as an error
75 |     float*y, int64_t incy,  // non-const; incy == 0 is flagged as an error
76 |     const float* param,  // read-only
77 |     blas::Queue& queue )  // queue.device() is selected before the low-level call
78 | {
79 |     impl::rotm( n, x, incx, y, incy, param, queue );  // scalar_t deduced as float
80 | }
81 | 
82 | //------------------------------------------------------------------------------
83 | /// GPU device, double version: forwards all arguments unchanged to impl::rotm.
84 | /// @ingroup rotm
85 | void rotm(
86 |     int64_t n,  // impl::rotm flags n < 0 as an error
87 |     double* x, int64_t incx,  // non-const; incx == 0 is flagged as an error
88 |     double* y, int64_t incy,  // non-const; incy == 0 is flagged as an error
89 |     const double* param,  // read-only
90 |     blas::Queue& queue )  // queue.device() is selected before the low-level call
91 | {
92 |     impl::rotm( n, x, incx, y, incy, param, queue );  // scalar_t deduced as double
93 | }
94 | 
95 | } // namespace blas
96 | 


--------------------------------------------------------------------------------
/src/device_rotmg.cc:
--------------------------------------------------------------------------------
 1 | // Copyright (c) 2017-2023, University of Tennessee. All rights reserved.
 2 | // SPDX-License-Identifier: BSD-3-Clause
 3 | // This program is free software: you can redistribute it and/or modify it under
 4 | // the terms of the BSD 3-Clause license. See the accompanying LICENSE file.
 5 | 
 6 | #include "blas/device_blas.hh"
 7 | #include "blas/counter.hh"
 8 | 
 9 | #include "device_internal.hh"
10 | 
11 | #include <limits>
12 | #include <string.h>
13 | 
14 | namespace blas {
15 | 
16 | //==============================================================================
17 | namespace impl {
18 | 
19 | //------------------------------------------------------------------------------
20 | /// Mid-level templated wrapper: performs no argument checks or conversions;
21 | /// it selects the queue's device and passes the pointers to internal::rotmg.
22 | /// Throws blas::Error when built without BLAS_HAVE_DEVICE.
23 | /// @ingroup rotmg_internal
24 | template <typename scalar_t>
25 | void rotmg(
26 |     scalar_t* d1,
27 |     scalar_t* d2,
28 |     scalar_t* x1,
29 |     scalar_t* y1,
30 |     scalar_t* param,
31 |     blas::Queue& queue )
32 | {
33 | #ifndef BLAS_HAVE_DEVICE
34 |     throw blas::Error( "device BLAS not available", __func__ );
35 | #else
36 |     #ifdef BLAS_HAVE_PAPI
37 |         // PAPI instrumentation: record a constant 1 under counter::Id::dev_rotmg
38 |         counter::dev_rotmg_type element;
39 |         memset( &element, 0, sizeof( element ) );  // zero every byte before assigning fields
40 |         element = { 1 };
41 |         counter::insert( element, counter::Id::dev_rotmg );
42 | 
43 |         // This operation does not incur significant FLOPs, so no
44 |         // need to call counter::inc_flop_count()
45 |     #endif  // BLAS_HAVE_PAPI
46 | 
47 |     blas::internal_set_device( queue.device() );
48 | 
49 |     // call low-level wrapper
50 |     internal::rotmg( d1, d2, x1, y1, param, queue );
51 | #endif  // BLAS_HAVE_DEVICE
52 | }
53 | 
54 | }  // namespace impl
55 | 
56 | //==============================================================================
57 | // High-level overloaded wrappers call mid-level templated wrapper.
58 | // Not available for complex.
59 | 
60 | //------------------------------------------------------------------------------
61 | /// GPU device, float version: forwards all arguments unchanged to impl::rotmg.
62 | /// @ingroup rotmg
63 | void rotmg(
64 |     float* d1,
65 |     float* d2,
66 |     float* x1,
67 |     float* y1,
68 |     float* param,  // non-const here, unlike the const param taken by rotm
69 |     blas::Queue& queue )  // queue.device() is selected before the low-level call
70 | {
71 |     impl::rotmg( d1, d2, x1, y1, param, queue );  // scalar_t deduced as float; no argument checks
72 | }
73 | 
74 | //------------------------------------------------------------------------------
75 | /// GPU device, double version: forwards all arguments unchanged to impl::rotmg.
76 | /// @ingroup rotmg
77 | void rotmg(
78 |     double* d1,
79 |     double* d2,
80 |     double* x1,
81 |     double* y1,
82 |     double* param,  // non-const here, unlike the const param taken by rotm
83 |     blas::Queue& queue )  // queue.device() is selected before the low-level call
84 | {
85 |     impl::rotmg( d1, d2, x1, y1, param, queue );  // scalar_t deduced as double; no argument checks
86 | }
87 | 
88 | } // namespace blas
89 | 


--------------------------------------------------------------------------------
/src/device_scal.cc:
--------------------------------------------------------------------------------
  1 | // Copyright (c) 2017-2023, University of Tennessee. All rights reserved.
  2 | // SPDX-License-Identifier: BSD-3-Clause
  3 | // This program is free software: you can redistribute it and/or modify it under
  4 | // the terms of the BSD 3-Clause license. See the accompanying LICENSE file.
  5 | 
  6 | #include "blas/device_blas.hh"
  7 | #include "blas/counter.hh"
  8 | 
  9 | #include "device_internal.hh"
 10 | 
 11 | #include <limits>
 12 | #include <string.h>
 13 | 
 14 | namespace blas {
 15 | 
 16 | //==============================================================================
 17 | namespace impl {
 18 | 
 19 | //------------------------------------------------------------------------------
 20 | /// Mid-level templated wrapper: checks n and incx, converts them with
 21 | /// to_device_blas_int, selects the queue's device, then calls internal::scal.
 22 | /// Throws blas::Error when built without BLAS_HAVE_DEVICE.
 23 | /// @ingroup scal_internal
 24 | template <typename scalar_t>
 25 | void scal(
 26 |     int64_t n,
 27 |     scalar_t alpha,  // passed by value to internal::scal
 28 |     scalar_t* x, int64_t incx,
 29 |     blas::Queue& queue )
 30 | {
 31 | #ifndef BLAS_HAVE_DEVICE
 32 |     throw blas::Error( "device BLAS not available", __func__ );
 33 | #else
 34 |     // check arguments
 35 |     blas_error_if( n < 0 );      // standard BLAS returns, doesn't fail
 36 |     blas_error_if( incx <= 0 );  // standard BLAS returns, doesn't fail
 37 | 
 38 |     #ifdef BLAS_HAVE_PAPI
 39 |         // PAPI instrumentation: record n under counter::Id::dev_scal
 40 |         counter::dev_scal_type element;
 41 |         memset( &element, 0, sizeof( element ) );  // zero every byte before assigning fields
 42 |         element = { n };
 43 |         counter::insert( element, counter::Id::dev_scal );
 44 | 
 45 |         double gflops = 1e9 * blas::Gflop< scalar_t >::scal( n );  // Gflop value scaled by 1e9
 46 |         counter::inc_flop_count( (long long int)gflops );
 47 |     #endif  // BLAS_HAVE_PAPI
 48 | 
 49 |     // convert arguments
 50 |     device_blas_int n_    = to_device_blas_int( n );
 51 |     device_blas_int incx_ = to_device_blas_int( incx );
 52 | 
 53 |     blas::internal_set_device( queue.device() );
 54 | 
 55 |     // call low-level wrapper
 56 |     internal::scal( n_, alpha, x, incx_, queue );
 57 | #endif  // BLAS_HAVE_DEVICE
 58 | }
 59 | 
 60 | }  // namespace impl
 61 | 
 62 | //==============================================================================
 63 | // High-level overloaded wrappers call mid-level templated wrapper.
 64 | 
 65 | //------------------------------------------------------------------------------
 66 | /// GPU device, float version: forwards all arguments unchanged to impl::scal.
 67 | /// @ingroup scal
 68 | void scal(
 69 |     int64_t n,  // impl::scal flags n < 0 as an error
 70 |     float alpha,  // passed by value
 71 |     float* x, int64_t incx,  // non-const; incx <= 0 is flagged as an error
 72 |     blas::Queue& queue )  // queue.device() is selected before the low-level call
 73 | {
 74 |     impl::scal( n, alpha, x, incx, queue );  // scalar_t deduced as float
 75 | }
 76 | 
 77 | //------------------------------------------------------------------------------
 78 | /// GPU device, double version: forwards all arguments unchanged to impl::scal.
 79 | /// @ingroup scal
 80 | void scal(
 81 |     int64_t n,  // impl::scal flags n < 0 as an error
 82 |     double alpha,  // passed by value
 83 |     double* x, int64_t incx,  // non-const; incx <= 0 is flagged as an error
 84 |     blas::Queue& queue )  // queue.device() is selected before the low-level call
 85 | {
 86 |     impl::scal( n, alpha, x, incx, queue );  // scalar_t deduced as double
 87 | }
 88 | 
 89 | //------------------------------------------------------------------------------
 90 | /// GPU device, complex<float> version: forwards all arguments unchanged to impl::scal.
 91 | /// @ingroup scal
 92 | void scal(
 93 |     int64_t n,  // impl::scal flags n < 0 as an error
 94 |     std::complex<float> alpha,  // passed by value
 95 |     std::complex<float>* x, int64_t incx,  // non-const; incx <= 0 is flagged as an error
 96 |     blas::Queue& queue )  // queue.device() is selected before the low-level call
 97 | {
 98 |     impl::scal( n, alpha, x, incx, queue );  // scalar_t deduced as std::complex<float>
 99 | }
100 | 
101 | //------------------------------------------------------------------------------
102 | /// GPU device, complex<double> version: forwards all arguments unchanged to impl::scal.
103 | /// @ingroup scal
104 | void scal(
105 |     int64_t n,  // impl::scal flags n < 0 as an error
106 |     std::complex<double> alpha,  // passed by value
107 |     std::complex<double>* x, int64_t incx,  // non-const; incx <= 0 is flagged as an error
108 |     blas::Queue& queue )  // queue.device() is selected before the low-level call
109 | {
110 |     impl::scal( n, alpha, x, incx, queue );  // scalar_t deduced as std::complex<double>
111 | }
112 | 
113 | }  // namespace blas
114 | 


--------------------------------------------------------------------------------
/src/device_swap.cc:
--------------------------------------------------------------------------------
  1 | // Copyright (c) 2017-2023, University of Tennessee. All rights reserved.
  2 | // SPDX-License-Identifier: BSD-3-Clause
  3 | // This program is free software: you can redistribute it and/or modify it under
  4 | // the terms of the BSD 3-Clause license. See the accompanying LICENSE file.
  5 | 
  6 | #include "blas/device_blas.hh"
  7 | #include "blas/counter.hh"
  8 | 
  9 | #include "device_internal.hh"
 10 | 
 11 | #include <limits>
 12 | #include <string.h>
 13 | 
 14 | namespace blas {
 15 | 
 16 | //==============================================================================
 17 | namespace impl {
 18 | 
 19 | //------------------------------------------------------------------------------
 20 | /// Mid-level templated wrapper: checks n, incx and incy, converts them with
 21 | /// to_device_blas_int, selects the queue's device, then calls internal::swap.
 22 | /// Throws blas::Error when built without BLAS_HAVE_DEVICE.
 23 | /// @ingroup swap_internal
 24 | template <typename scalar_t>
 25 | void swap(
 26 |     int64_t n,
 27 |     scalar_t* x, int64_t incx,
 28 |     scalar_t* y, int64_t incy,
 29 |     blas::Queue& queue )
 30 | {
 31 | #ifndef BLAS_HAVE_DEVICE
 32 |     throw blas::Error( "device BLAS not available", __func__ );
 33 | #else
 34 |     // check arguments
 35 |     blas_error_if( n < 0 );      // standard BLAS returns, doesn't fail
 36 |     blas_error_if( incx == 0 );  // standard BLAS doesn't detect inc[xy] == 0
 37 |     blas_error_if( incy == 0 );
 38 | 
 39 |     #ifdef BLAS_HAVE_PAPI
 40 |         // PAPI instrumentation: record n under counter::Id::dev_swap
 41 |         counter::dev_swap_type element;
 42 |         memset( &element, 0, sizeof( element ) );  // zero every byte before assigning fields
 43 |         element = { n };
 44 |         counter::insert( element, counter::Id::dev_swap );
 45 | 
 46 |         double gflops = 1e9 * blas::Gflop< scalar_t >::swap( n );  // Gflop value scaled by 1e9
 47 |         counter::inc_flop_count( (long long int)gflops );
 48 |     #endif  // BLAS_HAVE_PAPI
 49 | 
 50 |     // convert arguments
 51 |     device_blas_int n_    = to_device_blas_int( n );
 52 |     device_blas_int incx_ = to_device_blas_int( incx );
 53 |     device_blas_int incy_ = to_device_blas_int( incy );
 54 | 
 55 |     blas::internal_set_device( queue.device() );
 56 | 
 57 |     // call low-level wrapper
 58 |     internal::swap( n_, x, incx_, y, incy_, queue );
 59 | #endif  // BLAS_HAVE_DEVICE
 60 | }
 61 | 
 62 | }  // namespace impl
 63 | 
 64 | //==============================================================================
 65 | // High-level overloaded wrappers call mid-level templated wrapper.
 66 | 
 67 | //------------------------------------------------------------------------------
 68 | /// GPU device, float version: forwards all arguments unchanged to impl::swap.
 69 | /// @ingroup swap
 70 | void swap(
 71 |     int64_t n,  // impl::swap flags n < 0 as an error
 72 |     float* x, int64_t incx,  // non-const; incx == 0 is flagged as an error
 73 |     float* y, int64_t incy,  // non-const; incy == 0 is flagged as an error
 74 |     blas::Queue& queue )  // queue.device() is selected before the low-level call
 75 | {
 76 |     impl::swap( n, x, incx, y, incy, queue );  // scalar_t deduced as float
 77 | }
 78 | 
 79 | //------------------------------------------------------------------------------
 80 | /// GPU device, double version: forwards all arguments unchanged to impl::swap.
 81 | /// @ingroup swap
 82 | void swap(
 83 |     int64_t n,  // impl::swap flags n < 0 as an error
 84 |     double* x, int64_t incx,  // non-const; incx == 0 is flagged as an error
 85 |     double* y, int64_t incy,  // non-const; incy == 0 is flagged as an error
 86 |     blas::Queue& queue )  // queue.device() is selected before the low-level call
 87 | {
 88 |     impl::swap( n, x, incx, y, incy, queue );  // scalar_t deduced as double
 89 | }
 90 | 
 91 | //------------------------------------------------------------------------------
 92 | /// GPU device, complex<float> version: forwards all arguments unchanged to impl::swap.
 93 | /// @ingroup swap
 94 | void swap(
 95 |     int64_t n,  // impl::swap flags n < 0 as an error
 96 |     std::complex<float>* x, int64_t incx,  // non-const; incx == 0 is flagged as an error
 97 |     std::complex<float>* y, int64_t incy,  // non-const; incy == 0 is flagged as an error
 98 |     blas::Queue& queue )  // queue.device() is selected before the low-level call
 99 | {
100 |     impl::swap( n, x, incx, y, incy, queue );  // scalar_t deduced as std::complex<float>
101 | }
102 | 
103 | //------------------------------------------------------------------------------
104 | /// GPU device, complex<double> version: forwards all arguments unchanged to impl::swap.
105 | /// @ingroup swap
106 | void swap(
107 |     int64_t n,  // impl::swap flags n < 0 as an error
108 |     std::complex<double>* x, int64_t incx,  // non-const; incx == 0 is flagged as an error
109 |     std::complex<double>* y, int64_t incy,  // non-const; incy == 0 is flagged as an error
110 |     blas::Queue& queue )  // queue.device() is selected before the low-level call
111 | {
112 |     impl::swap( n, x, incx, y, incy, queue );  // scalar_t deduced as std::complex<double>
113 | }
114 | 
115 | }  // namespace blas
116 | 


--------------------------------------------------------------------------------
/src/device_syr.cc:
--------------------------------------------------------------------------------
  1 | // Copyright (c) 2017-2023, University of Tennessee. All rights reserved.
  2 | // SPDX-License-Identifier: BSD-3-Clause
  3 | // This program is free software: you can redistribute it and/or modify it under
  4 | // the terms of the BSD 3-Clause license. See the accompanying LICENSE file.
  5 | 
  6 | #include "blas/device_blas.hh"
  7 | #include "blas/counter.hh"
  8 | 
  9 | #include "device_internal.hh"
 10 | 
 11 | #include <limits>
 12 | #include <string>
 13 | 
 14 | namespace blas {
 15 | 
 16 | //==============================================================================
 17 | namespace impl {
 18 | 
 19 | //------------------------------------------------------------------------------
 20 | /// Mid-level templated wrapper for device syr.
 21 | /// Checks the arguments, converts the integer ones to device_blas_int,
 22 | /// and then calls the low-level internal::syr wrapper on the given queue.
 23 | /// @ingroup syr_internal
 24 | template <typename scalar_t>
 25 | void syr(
 26 |     blas::Layout layout,
 27 |     blas::Uplo uplo,
 28 |     int64_t n,
 29 |     scalar_t alpha,
 30 |     scalar_t const* x, int64_t incx,
 31 |     scalar_t*       A, int64_t lda,
 32 |     blas::Queue& queue )
 33 | {
 34 | #ifndef BLAS_HAVE_DEVICE
 35 |     throw blas::Error( "device BLAS not available", __func__ );
 36 | #else
 37 |     // Validate arguments; the checks run in this fixed order.
 38 |     blas_error_if( layout != Layout::ColMajor && layout != Layout::RowMajor );
 39 |     blas_error_if( uplo != Uplo::Lower && uplo != Uplo::Upper );
 40 |     blas_error_if( n < 0 );
 41 |     blas_error_if( lda < n );
 42 |     blas_error_if( incx == 0 );
 43 | 
 44 |     #ifdef BLAS_HAVE_PAPI
 45 |         // PAPI instrumentation: record the call using the caller's uplo and n.
 46 |         counter::dev_syr_type record;
 47 |         memset( &record, 0, sizeof( record ) );
 48 |         record = { uplo, n };
 49 |         counter::insert( record, counter::Id::dev_syr );
 50 | 
 51 |         double const flops = 1e9 * blas::Gflop< scalar_t >::syr( n );
 52 |         counter::inc_flop_count( (long long int)flops );
 53 |     #endif
 54 | 
 55 |     // Convert the 64-bit arguments to the device BLAS integer type.
 56 |     device_blas_int n_dev    = to_device_blas_int( n );
 57 |     device_blas_int lda_dev  = to_device_blas_int( lda );
 58 |     device_blas_int incx_dev = to_device_blas_int( incx );
 59 | 
 60 |     blas::internal_set_device( queue.device() );
 61 | 
 62 |     // Row-major input is handled by swapping lower <=> upper.
 63 |     if (layout == Layout::RowMajor) {
 64 |         if (uplo == Uplo::Lower)
 65 |             uplo = Uplo::Upper;
 66 |         else
 67 |             uplo = Uplo::Lower;
 68 |     }
 69 |     queue.sync();  // NOTE(review): confirm this sync is required here
 70 | 
 71 |     // Call the low-level wrapper with the converted arguments.
 72 |     internal::syr( uplo, n_dev, alpha, x, incx_dev, A, lda_dev, queue );
 73 | #endif
 74 | }
 75 | 
 76 | }  // namespace impl
 77 | 
 78 | //==============================================================================
 79 | // High-level overloaded wrappers call mid-level templated wrapper.
 80 | 
 81 | //------------------------------------------------------------------------------
 82 | /// GPU device, float overload of syr.
 83 | /// @ingroup syr
 84 | void syr(
 85 |     blas::Layout layout,
 86 |     blas::Uplo uplo,
 87 |     int64_t n,
 88 |     float alpha,
 89 |     float const* x, int64_t incx,
 90 |     float*       A, int64_t lda,
 91 |     blas::Queue& queue )
 92 | {
 93 |     // Delegate to the templated implementation with scalar_t = float.
 94 |     impl::syr<float>( layout, uplo, n, alpha, x, incx, A, lda, queue );
 95 | }
 96 | 
 97 | //------------------------------------------------------------------------------
 98 | /// GPU device, double overload of syr.
 99 | /// @ingroup syr
100 | void syr(
101 |     blas::Layout layout,
102 |     blas::Uplo uplo,
103 |     int64_t n,
104 |     double alpha,
105 |     double const* x, int64_t incx,
106 |     double*       A, int64_t lda,
107 |     blas::Queue& queue )
108 | {
109 |     // Delegate to the templated implementation with scalar_t = double.
110 |     impl::syr<double>( layout, uplo, n, alpha, x, incx, A, lda, queue );
111 | }
112 | 
113 | //------------------------------------------------------------------------------
114 | /// GPU device, complex<float> overload of syr.
115 | /// @ingroup syr
116 | void syr(
117 |     blas::Layout layout,
118 |     blas::Uplo uplo,
119 |     int64_t n,
120 |     std::complex<float> alpha,
121 |     std::complex<float> const* x, int64_t incx,
122 |     std::complex<float>*       A, int64_t lda,
123 |     blas::Queue& queue )
124 | {
125 |     // Delegate to the templated implementation, scalar_t = complex<float>.
126 |     impl::syr< std::complex<float> >( layout, uplo, n, alpha, x, incx, A, lda, queue );
127 | }
128 | 
129 | //------------------------------------------------------------------------------
130 | /// GPU device, complex<double> overload of syr.
131 | /// @ingroup syr
132 | void syr(
133 |     blas::Layout layout,
134 |     blas::Uplo uplo,
135 |     int64_t n,
136 |     std::complex<double> alpha,
137 |     std::complex<double> const* x, int64_t incx,
138 |     std::complex<double>*       A, int64_t lda,
139 |     blas::Queue& queue )
140 | {
141 |     // Delegate to the templated implementation, scalar_t = complex<double>.
142 |     impl::syr< std::complex<double> >( layout, uplo, n, alpha, x, incx, A, lda, queue );
143 | }
144 | 
145 | }  // namespace blas
146 | 


--------------------------------------------------------------------------------
/src/hip/device_conj.hip:
--------------------------------------------------------------------------------
 1 | #include "blas/device.hh"
 2 | #include <hip/hip_complex.h>
 3 | 
 4 | #if defined(BLAS_HAVE_ROCBLAS)
 5 | 
 6 | namespace blas {
 7 | 
 8 | // Device helper: conjugates z by calling hipConjf on it viewed as hipFloatComplex.
 9 | __device__ std::complex<float> conj_convert( std::complex<float> z )
10 | {
11 |     hipFloatComplex conjugated = hipConjf( *reinterpret_cast<hipFloatComplex*>( &z ) );
12 |     return *reinterpret_cast<std::complex<float>*>( &conjugated );
13 | }
14 | 
15 | // Device helper: conjugates z by calling hipConj on it viewed as hipDoubleComplex.
16 | __device__ std::complex<double> conj_convert( std::complex<double> z )
17 | {
18 |     hipDoubleComplex conjugated = hipConj( *reinterpret_cast<hipDoubleComplex*>( &z ) );
19 |     return *reinterpret_cast<std::complex<double>*>( &conjugated );
20 | }
21 | 
22 | // Kernel: one thread per element i < n; index is 64-bit so large n cannot wrap.
23 | template <typename TS, typename TD>
24 | __global__ void conj_kernel(
25 |     int64_t n, TS const* src, int64_t inc_src, int64_t i_src,
26 |     TD* dst, int64_t inc_dst, int64_t i_dst)
27 | {
28 |     int64_t i = int64_t( blockIdx.x ) * blockDim.x + threadIdx.x;
29 |     if (i < n)
30 |         dst[ i*inc_dst + i_dst ] = conj_convert( src[ i*inc_src + i_src ] );
31 | }
32 | 
33 | // Host launcher for conj_kernel: conjugates n strided elements of src into dst
34 | // on queue's stream. Returns immediately when n <= 0.
35 | template <typename TS, typename TD>
36 | void conj(
37 |     int64_t n, TS const* src, int64_t inc_src,
38 |     TD* dst, int64_t inc_dst, blas::Queue& queue )
39 | {
40 |     if (n <= 0)
41 |         return;
42 | 
43 |     // Launch shape: at most 128 threads per block, enough blocks to cover n.
44 |     const int64_t max_threads = 128;
45 |     int64_t threads_per_block = std::min( max_threads, n );
46 |     int64_t block_count = ceildiv( n, threads_per_block );
47 | 
48 |     // Starting offsets: 0 for a positive increment, otherwise (1 - n) * inc.
49 |     int64_t i_src = 0, i_dst = 0;
50 |     if (inc_src <= 0) i_src = (1 - n) * inc_src;
51 |     if (inc_dst <= 0) i_dst = (1 - n) * inc_dst;
52 | 
53 |     blas_dev_call( hipSetDevice( queue.device() ) );
54 | 
55 |     conj_kernel<<<block_count, threads_per_block, 0, queue.stream()>>>(
56 |         n, src, inc_src, i_src, dst, inc_dst, i_dst );
57 | 
58 |     // Check the error state left by the launch.
59 |     blas_dev_call( hipGetLastError() );
60 | }
61 | 
62 | //------------------------------------------------------------------------------
63 | // Explicit instantiations for complex<float> and complex<double>.
64 | template void conj(
65 |     int64_t n,
66 |     std::complex<float> const* src, int64_t inc_src,
67 |     std::complex<float>* dst, int64_t inc_dst,
68 |     blas::Queue& queue);
69 | 
70 | template void conj(
71 |     int64_t n,
72 |     std::complex<double> const* src, int64_t inc_src,
73 |     std::complex<double>* dst, int64_t inc_dst,
74 |     blas::Queue& queue);
75 | 
76 | } // namespace blas
77 | 
78 | #endif // BLAS_HAVE_ROCBLAS
79 | 


--------------------------------------------------------------------------------
/src/hip/device_shift_vec.hip:
--------------------------------------------------------------------------------
 1 | #include "hip/hip_runtime.h"
 2 | #include "blas/device.hh"
 3 | 
 4 | #if defined(BLAS_HAVE_ROCBLAS)
 5 | 
 6 | namespace blas {
 7 | 
 8 | // Adds c to v[ i ] for i = threadIdx.x, threadIdx.x + blockDim.x, ... < n.
 9 | // The index is int64_t, matching n, so it cannot overflow for large n.
10 | template <typename scalar_t>
11 | __device__ void shift_vec_func( int64_t n, scalar_t* v, scalar_t c )
12 | {
13 |     for (int64_t i = threadIdx.x; i < n; i += blockDim.x) {
14 |         v[ i ] += c;
15 |     }
16 | }
17 | 
18 | // Kernel entry point; simply runs shift_vec_func on the same arguments.
19 | template <typename scalar_t>
20 | __global__ void shift_vec_kernel( int64_t n, scalar_t* v, scalar_t c )
21 | {
22 |     // Explicit template argument; deduction would pick the same type.
23 |     shift_vec_func<scalar_t>( n, v, c );
24 | }
25 | 
26 | // Host launcher: adds c to each of the n entries of array v, using one
27 | // block of at most 1024 threads on queue's stream. No-op when n <= 0.
28 | template <typename scalar_t>
29 | void shift_vec(
30 |     int64_t n, scalar_t* v, scalar_t c,
31 |     blas::Queue& queue)
32 | {
33 |     // A negative n would otherwise become the block size of the launch below.
34 |     if (n <= 0) {
35 |         return;
36 |     }
37 | 
38 |     int64_t nthreads = std::min( int64_t( 1024 ), n );
39 | 
40 |     blas_dev_call( hipSetDevice( queue.device() ) );
41 | 
42 |     shift_vec_kernel<<<1, nthreads, 0, queue.stream()>>>( n, v, c );
43 | 
44 |     // Check the error state left by the launch.
45 |     blas_dev_call( hipGetLastError() );
46 | }
47 | 
48 | //------------------------------------------------------------------------------
49 | // Explicit instantiation; int64_t is the only type instantiated in this file.
50 | template
51 | void shift_vec(
52 |     int64_t n, int64_t* v,
53 |     int64_t c,
54 |     blas::Queue& queue);
55 | 
56 | } // namespace blas
57 | 
58 | #endif // BLAS_HAVE_ROCBLAS
59 | 


--------------------------------------------------------------------------------
/src/iamax.cc:
--------------------------------------------------------------------------------
  1 | // Copyright (c) 2017-2023, University of Tennessee. All rights reserved.
  2 | // SPDX-License-Identifier: BSD-3-Clause
  3 | // This program is free software: you can redistribute it and/or modify it under
  4 | // the terms of the BSD 3-Clause license. See the accompanying LICENSE file.
  5 | 
  6 | #include "blas/fortran.h"
  7 | #include "blas.hh"
  8 | #include "blas_internal.hh"
  9 | #include "blas/counter.hh"
 10 | 
 11 | #include <limits>
 12 | #include <string.h>
 13 | 
 14 | namespace blas {
 15 | 
 16 | //==============================================================================
 17 | namespace internal {
 18 | 
 19 | //------------------------------------------------------------------------------
 20 | /// Low-level float overload: passes n and incx by address to BLAS_isamax.
 21 | /// @ingroup iamax_internal
 22 | inline blas_int iamax( blas_int n, float const* x, blas_int incx )
 23 | {
 24 |     // The Fortran routine's result is returned as-is.
 25 |     blas_int const fortran_result = BLAS_isamax( &n, x, &incx );
 26 |     return fortran_result;
 27 | }
 28 | 
 29 | //------------------------------------------------------------------------------
 30 | /// Low-level double overload: passes n and incx by address to BLAS_idamax.
 31 | /// @ingroup iamax_internal
 32 | inline blas_int iamax( blas_int n, double const* x, blas_int incx )
 33 | {
 34 |     // The Fortran routine's result is returned as-is.
 35 |     blas_int const fortran_result = BLAS_idamax( &n, x, &incx );
 36 |     return fortran_result;
 37 | }
 38 | 
 39 | //------------------------------------------------------------------------------
 40 | /// Low-level complex<float> overload: casts x to blas_complex_float*
 41 | /// and calls BLAS_icamax.
 42 | /// @ingroup iamax_internal
 43 | inline blas_int iamax(
 44 |     blas_int n, std::complex<float> const* x, blas_int incx )
 45 | {
 46 |     blas_complex_float* x_ = (blas_complex_float*) x;
 47 |     return BLAS_icamax( &n, x_, &incx );
 48 | }
 49 | 
 50 | //------------------------------------------------------------------------------
 51 | /// Low-level complex<double> overload: casts x to blas_complex_double*
 52 | /// and calls BLAS_izamax.
 53 | /// @ingroup iamax_internal
 54 | inline blas_int iamax(
 55 |     blas_int n, std::complex<double> const* x, blas_int incx )
 56 | {
 57 |     blas_complex_double* x_ = (blas_complex_double*) x;
 58 |     return BLAS_izamax( &n, x_, &incx );
 59 | }
 60 | 
 61 | }  // namespace internal
 62 | 
 63 | //==============================================================================
 64 | namespace impl {
 65 | 
 66 | //------------------------------------------------------------------------------
 67 | /// Mid-level templated wrapper for iamax.
 68 | /// Rejects n < 0 and incx <= 0, converts n and incx to blas_int, calls the
 69 | /// low-level wrapper, and returns that wrapper's result minus one.
 70 | /// @ingroup iamax_internal
 71 | template <typename scalar_t>
 72 | int64_t iamax(
 73 |     int64_t n, scalar_t const* x, int64_t incx )
 74 | {
 75 |     // Argument checks.
 76 |     blas_error_if( n < 0 );      // standard BLAS returns, doesn't fail
 77 |     blas_error_if( incx <= 0 );  // standard BLAS returns, doesn't fail
 78 | 
 79 |     #ifdef BLAS_HAVE_PAPI
 80 |         // PAPI instrumentation: record this call, then add its flop count.
 81 |         counter::iamax_type record;
 82 |         memset( &record, 0, sizeof( record ) );
 83 |         record = { n };
 84 |         counter::insert( record, counter::Id::iamax );
 85 | 
 86 |         double const flops = 1e9 * blas::Gflop< scalar_t >::iamax( n );
 87 |         counter::inc_flop_count( (long long int)flops );
 88 |     #endif
 89 | 
 90 |     // Convert the 64-bit arguments to the BLAS integer type.
 91 |     blas_int n_blas    = to_blas_int( n );
 92 |     blas_int incx_blas = to_blas_int( incx );
 93 | 
 94 |     // Fortran reports a 1-based position; subtract 1 for a 0-based index.
 95 |     blas_int const fortran_index = internal::iamax( n_blas, x, incx_blas );
 96 |     return fortran_index - 1;
 97 | }
 98 | 
 99 | }  // namespace impl
100 | 
101 | //==============================================================================
102 | // High-level overloaded wrappers call mid-level templated wrapper.
103 | 
104 | //------------------------------------------------------------------------------
105 | /// CPU, float overload of iamax.
106 | /// @ingroup iamax
107 | int64_t iamax( int64_t n, float const* x, int64_t incx )
108 | {
109 |     // Delegate to the templated implementation with scalar_t = float.
110 |     int64_t const index = impl::iamax<float>( n, x, incx );
111 |     return index;
112 | }
113 | 
114 | //------------------------------------------------------------------------------
115 | /// CPU, double overload of iamax.
116 | /// @ingroup iamax
117 | int64_t iamax( int64_t n, double const* x, int64_t incx )
118 | {
119 |     // Delegate to the templated implementation with scalar_t = double.
120 |     int64_t const index = impl::iamax<double>( n, x, incx );
121 |     return index;
122 | }
123 | 
124 | //------------------------------------------------------------------------------
125 | /// CPU, complex<float> overload of iamax.
126 | /// @ingroup iamax
127 | int64_t iamax( int64_t n, std::complex<float> const* x, int64_t incx )
128 | {
129 |     // Delegate to the templated implementation, scalar_t = complex<float>.
130 |     int64_t const index = impl::iamax< std::complex<float> >( n, x, incx );
131 |     return index;
132 | }
133 | 
134 | //------------------------------------------------------------------------------
135 | /// CPU, complex<double> overload of iamax.
136 | /// @ingroup iamax
137 | int64_t iamax( int64_t n, std::complex<double> const* x, int64_t incx )
138 | {
139 |     // Delegate to the templated implementation, scalar_t = complex<double>.
140 |     int64_t const index = impl::iamax< std::complex<double> >( n, x, incx );
141 |     return index;
142 | }
143 | 
144 | }  // namespace blas
145 | 


--------------------------------------------------------------------------------
/src/nrm2.cc:
--------------------------------------------------------------------------------
  1 | // Copyright (c) 2017-2023, University of Tennessee. All rights reserved.
  2 | // SPDX-License-Identifier: BSD-3-Clause
  3 | // This program is free software: you can redistribute it and/or modify it under
  4 | // the terms of the BSD 3-Clause license. See the accompanying LICENSE file.
  5 | 
  6 | #include "blas/fortran.h"
  7 | #include "blas.hh"
  8 | #include "blas_internal.hh"
  9 | #include "blas/counter.hh"
 10 | 
 11 | #include <limits>
 12 | #include <string.h>
 13 | 
 14 | namespace blas {
 15 | 
 16 | //==============================================================================
 17 | namespace internal {
 18 | 
 19 | //------------------------------------------------------------------------------
 20 | /// Low-level float overload: passes n and incx by address to BLAS_snrm2.
 21 | /// @ingroup nrm2_internal
 22 | inline float nrm2( blas_int n, float const* x, blas_int incx )
 23 | {
 24 |     // Any conversion to float happens here, as in a direct return.
 25 |     float const norm = BLAS_snrm2( &n, x, &incx );
 26 |     return norm;
 27 | }
 28 | 
 29 | //------------------------------------------------------------------------------
 30 | /// Low-level double overload: passes n and incx by address to BLAS_dnrm2.
 31 | /// @ingroup nrm2_internal
 32 | inline double nrm2( blas_int n, double const* x, blas_int incx )
 33 | {
 34 |     // Any conversion to double happens here, as in a direct return.
 35 |     double const norm = BLAS_dnrm2( &n, x, &incx );
 36 |     return norm;
 37 | }
 38 | 
 39 | //------------------------------------------------------------------------------
 40 | /// Low-level complex<float> overload: casts x and calls BLAS_scnrm2.
 41 | /// @ingroup nrm2_internal
 42 | inline float nrm2(
 43 |     blas_int n, std::complex<float> const* x, blas_int incx )
 44 | {
 45 |     blas_complex_float* x_ = (blas_complex_float*) x;
 46 |     return BLAS_scnrm2( &n, x_, &incx );
 47 | }
 48 | 
 49 | //------------------------------------------------------------------------------
 50 | /// Low-level complex<double> overload: casts x and calls BLAS_dznrm2.
 51 | /// @ingroup nrm2_internal
 52 | inline double nrm2(
 53 |     blas_int n, std::complex<double> const* x, blas_int incx )
 54 | {
 55 |     blas_complex_double* x_ = (blas_complex_double*) x;
 56 |     return BLAS_dznrm2( &n, x_, &incx );
 57 | }
 58 | 
 59 | }  // namespace internal
 60 | 
 61 | //==============================================================================
 62 | namespace impl {
 63 | 
 64 | //------------------------------------------------------------------------------
 65 | /// Mid-level templated wrapper for nrm2.
 66 | /// Rejects n < 0 and incx <= 0, converts n and incx to blas_int, and
 67 | /// returns the low-level wrapper's result unchanged.
 68 | /// @ingroup nrm2_internal
 69 | template <typename scalar_t>
 70 | real_type<scalar_t> nrm2(
 71 |     int64_t n, scalar_t const* x, int64_t incx )
 72 | {
 73 |     // Argument checks.
 74 |     blas_error_if( n < 0 );      // standard BLAS returns, doesn't fail
 75 |     blas_error_if( incx <= 0 );  // standard BLAS returns, doesn't fail
 76 | 
 77 |     #ifdef BLAS_HAVE_PAPI
 78 |         // PAPI instrumentation: record this call, then add its flop count.
 79 |         counter::nrm2_type record;
 80 |         memset( &record, 0, sizeof( record ) );
 81 |         record = { n };
 82 |         counter::insert( record, counter::Id::nrm2 );
 83 | 
 84 |         double const flops = 1e9 * blas::Gflop< scalar_t >::nrm2( n );
 85 |         counter::inc_flop_count( (long long int)flops );
 86 |     #endif
 87 | 
 88 |     // Convert the 64-bit arguments to the BLAS integer type.
 89 |     blas_int n_blas    = to_blas_int( n );
 90 |     blas_int incx_blas = to_blas_int( incx );
 91 | 
 92 |     // Call the low-level wrapper and hand its result back.
 93 |     real_type<scalar_t> const norm = internal::nrm2( n_blas, x, incx_blas );
 94 |     return norm;
 95 | }
 96 | 
 97 | }  // namespace impl
 98 | 
 99 | //==============================================================================
100 | // High-level overloaded wrappers call mid-level templated wrapper.
101 | 
102 | //------------------------------------------------------------------------------
103 | /// CPU, float overload of nrm2.
104 | /// @ingroup nrm2
105 | float nrm2( int64_t n, float const* x, int64_t incx )
106 | {
107 |     // Delegate to the templated implementation with scalar_t = float.
108 |     float const norm = impl::nrm2<float>( n, x, incx );
109 |     return norm;
110 | }
111 | 
112 | //------------------------------------------------------------------------------
113 | /// CPU, double overload of nrm2.
114 | /// @ingroup nrm2
115 | double nrm2( int64_t n, double const* x, int64_t incx )
116 | {
117 |     // Delegate to the templated implementation with scalar_t = double.
118 |     double const norm = impl::nrm2<double>( n, x, incx );
119 |     return norm;
120 | }
121 | 
122 | //------------------------------------------------------------------------------
123 | /// CPU, complex<float> overload of nrm2; the result is a real float.
124 | /// @ingroup nrm2
125 | float nrm2( int64_t n, std::complex<float> const* x, int64_t incx )
126 | {
127 |     // Delegate to the templated implementation, scalar_t = complex<float>.
128 |     float const norm = impl::nrm2< std::complex<float> >( n, x, incx );
129 |     return norm;
130 | }
131 | 
132 | //------------------------------------------------------------------------------
133 | /// CPU, complex<double> overload of nrm2; the result is a real double.
134 | /// @ingroup nrm2
135 | double nrm2( int64_t n, std::complex<double> const* x, int64_t incx )
136 | {
137 |     // Delegate to the templated implementation, scalar_t = complex<double>.
138 |     double const norm = impl::nrm2< std::complex<double> >( n, x, incx );
139 |     return norm;
140 | }
141 | 
142 | }  // namespace blas
143 | 


--------------------------------------------------------------------------------
/src/rotg.cc:
--------------------------------------------------------------------------------
  1 | // Copyright (c) 2017-2023, University of Tennessee. All rights reserved.
  2 | // SPDX-License-Identifier: BSD-3-Clause
  3 | // This program is free software: you can redistribute it and/or modify it under
  4 | // the terms of the BSD 3-Clause license. See the accompanying LICENSE file.
  5 | 
  6 | #include "blas/fortran.h"
  7 | #include "blas.hh"
  8 | #include "blas/counter.hh"
  9 | 
 10 | #include <limits>
 11 | #include <string.h>
 12 | 
 13 | namespace blas {
 14 | 
 15 | // =============================================================================
 16 | // Overloaded wrappers for s, d, c, z precisions.
 17 | 
 18 | // -----------------------------------------------------------------------------
 19 | /// CPU, float version; passes a, b, c, s straight through to BLAS_srotg.
 20 | /// @ingroup rotg
 21 | void rotg(
 22 |     float *a, float *b,
 23 |     float *c, float *s )
 24 | {
 25 |     #ifdef BLAS_HAVE_PAPI
 26 |         // PAPI instrumentation: log one rotg call.
 27 |         counter::rotg_type record;
 28 |         memset( &record, 0, sizeof( record ) );
 29 |         record = { 1 };
 30 |         counter::insert( record, counter::Id::rotg );
 31 | 
 32 |         // counter::inc_flop_count() is deliberately not called: this
 33 |         // operation does not incur significant FLOPs.
 34 |     #endif
 35 | 
 36 |     // Forward the four pointers unchanged.
 37 |     BLAS_srotg( a, b, c, s );
 38 | }
 39 | 
 40 | // -----------------------------------------------------------------------------
 41 | /// CPU, double version; passes a, b, c, s straight through to BLAS_drotg.
 42 | /// @ingroup rotg
 43 | void rotg(
 44 |     double *a, double *b,
 45 |     double *c, double *s )
 46 | {
 47 |     #ifdef BLAS_HAVE_PAPI
 48 |         // PAPI instrumentation: log one rotg call.
 49 |         counter::rotg_type record;
 50 |         memset( &record, 0, sizeof( record ) );
 51 |         record = { 1 };
 52 |         counter::insert( record, counter::Id::rotg );
 53 | 
 54 |         // counter::inc_flop_count() is deliberately not called: this
 55 |         // operation does not incur significant FLOPs.
 56 |     #endif
 57 | 
 58 |     // Forward the four pointers unchanged.
 59 |     BLAS_drotg( a, b, c, s );
 60 | }
 61 | 
 62 | // -----------------------------------------------------------------------------
 63 | /// CPU, complex<float> version; c is real, while a, b, s are complex.
 64 | /// @ingroup rotg
 65 | void rotg(
 66 |     std::complex<float> *a,
 67 |     std::complex<float> *b,  // const in BLAS implementation, oddly
 68 |     float *c,
 69 |     std::complex<float> *s )
 70 | {
 71 |     #ifdef BLAS_HAVE_PAPI
 72 |         // PAPI instrumentation: log one rotg call.
 73 |         counter::rotg_type record;
 74 |         memset( &record, 0, sizeof( record ) );
 75 |         record = { 1 };
 76 |         counter::insert( record, counter::Id::rotg );
 77 | 
 78 |         // counter::inc_flop_count() is deliberately not called: this
 79 |         // operation does not incur significant FLOPs.
 80 |     #endif
 81 | 
 82 |     // Cast the complex pointers to the Fortran interface's complex type.
 83 |     BLAS_crotg( (blas_complex_float*) a, (blas_complex_float*) b,
 84 |                 c, (blas_complex_float*) s );
 85 | }
 86 | 
 87 | // -----------------------------------------------------------------------------
 88 | /// CPU, complex<double> version; c is real, while a, b, s are complex.
 89 | /// @ingroup rotg
 90 | void rotg(
 91 |     std::complex<double> *a,
 92 |     std::complex<double> *b,  // const in BLAS implementation, oddly
 93 |     double *c,
 94 |     std::complex<double> *s )
 95 | {
 96 |     #ifdef BLAS_HAVE_PAPI
 97 |         // PAPI instrumentation: log one rotg call.
 98 |         counter::rotg_type record;
 99 |         memset( &record, 0, sizeof( record ) );
100 |         record = { 1 };
101 |         counter::insert( record, counter::Id::rotg );
102 | 
103 |         // counter::inc_flop_count() is deliberately not called: this
104 |         // operation does not incur significant FLOPs.
105 |     #endif
106 | 
107 |     // Cast the complex pointers to the Fortran interface's complex type.
108 |     BLAS_zrotg( (blas_complex_double*) a, (blas_complex_double*) b,
109 |                 c, (blas_complex_double*) s );
110 | }
111 | 
112 | }  // namespace blas
113 | 


--------------------------------------------------------------------------------
/src/rotm.cc:
--------------------------------------------------------------------------------
 1 | // Copyright (c) 2017-2023, University of Tennessee. All rights reserved.
 2 | // SPDX-License-Identifier: BSD-3-Clause
 3 | // This program is free software: you can redistribute it and/or modify it under
 4 | // the terms of the BSD 3-Clause license. See the accompanying LICENSE file.
 5 | 
 6 | #include "blas/fortran.h"
 7 | #include "blas.hh"
 8 | #include "blas_internal.hh"
 9 | #include "blas/counter.hh"
10 | 
11 | #include <limits>
12 | #include <string.h>
13 | 
14 | namespace blas {
15 | 
16 | // =============================================================================
17 | // Overloaded wrappers for s, d precisions.
18 | // Not available for complex.
19 | 
20 | // -----------------------------------------------------------------------------
21 | /// CPU, float version. Checks n, incx, incy, converts them to blas_int,
22 | /// and calls BLAS_srotm with x, y, and param passed through unchanged.
23 | /// @ingroup rotm
24 | void rotm(
25 |     int64_t n, float *x, int64_t incx, float *y, int64_t incy,
26 |     float const param[5] )
27 | {
28 |     // Argument checks.
29 |     blas_error_if( n < 0 );      // standard BLAS returns, doesn't fail
30 |     blas_error_if( incx == 0 );  // standard BLAS doesn't detect inc[xy] == 0
31 |     blas_error_if( incy == 0 );
32 | 
33 |     #ifdef BLAS_HAVE_PAPI
34 |         // PAPI instrumentation: record this call, then add its flop count.
35 |         counter::rotm_type record;
36 |         memset( &record, 0, sizeof( record ) );
37 |         record = { n };
38 |         counter::insert( record, counter::Id::rotm );
39 | 
40 |         double const flops = 1e9 * blas::Gflop< float >::rotm( n );
41 |         counter::inc_flop_count( (long long int)flops );
42 |     #endif
43 | 
44 |     // The Fortran routine takes its integer arguments by address.
45 |     blas_int n_blas    = to_blas_int( n );
46 |     blas_int incx_blas = to_blas_int( incx );
47 |     blas_int incy_blas = to_blas_int( incy );
48 |     BLAS_srotm( &n_blas, x, &incx_blas, y, &incy_blas, param );
49 | }
50 | 
51 | // -----------------------------------------------------------------------------
52 | /// CPU, double version. Checks n, incx, incy, converts them to blas_int,
53 | /// and calls BLAS_drotm with x, y, and param passed through unchanged.
54 | /// @ingroup rotm
55 | void rotm(
56 |     int64_t n, double *x, int64_t incx, double *y, int64_t incy,
57 |     double const param[5] )
58 | {
59 |     // Argument checks.
60 |     blas_error_if( n < 0 );      // standard BLAS returns, doesn't fail
61 |     blas_error_if( incx == 0 );  // standard BLAS doesn't detect inc[xy] == 0
62 |     blas_error_if( incy == 0 );
63 | 
64 |     #ifdef BLAS_HAVE_PAPI
65 |         // PAPI instrumentation: record this call, then add its flop count.
66 |         counter::rotm_type record;
67 |         memset( &record, 0, sizeof( record ) );
68 |         record = { n };
69 |         counter::insert( record, counter::Id::rotm );
70 | 
71 |         double const flops = 1e9 * blas::Gflop< double >::rotm( n );
72 |         counter::inc_flop_count( (long long int)flops );
73 |     #endif
74 | 
75 |     // The Fortran routine takes its integer arguments by address.
76 |     blas_int n_blas    = to_blas_int( n );
77 |     blas_int incx_blas = to_blas_int( incx );
78 |     blas_int incy_blas = to_blas_int( incy );
79 |     BLAS_drotm( &n_blas, x, &incx_blas, y, &incy_blas, param );
80 | }
81 | 
82 | }  // namespace blas
83 | 


--------------------------------------------------------------------------------
/src/rotmg.cc:
--------------------------------------------------------------------------------
 1 | // Copyright (c) 2017-2023, University of Tennessee. All rights reserved.
 2 | // SPDX-License-Identifier: BSD-3-Clause
 3 | // This program is free software: you can redistribute it and/or modify it under
 4 | // the terms of the BSD 3-Clause license. See the accompanying LICENSE file.
 5 | 
 6 | #include "blas/fortran.h"
 7 | #include "blas.hh"
 8 | #include "blas/counter.hh"
 9 | 
10 | #include <limits>
11 | #include <string.h>
12 | 
13 | namespace blas {
14 | 
15 | // =============================================================================
16 | // Overloaded wrappers for s, d precisions.
17 | // Not available for complex.
18 | 
19 | // -----------------------------------------------------------------------------
20 | /// CPU, float version; b is taken by value and its address is passed on.
21 | /// @ingroup rotmg
22 | void rotmg(
23 |     float *d1, float *d2,
24 |     float *a,  float  b,
25 |     float  param[5] )
26 | {
27 |     #ifdef BLAS_HAVE_PAPI
28 |         // PAPI instrumentation: log one rotmg call.
29 |         counter::rotmg_type record;
30 |         memset( &record, 0, sizeof( record ) );
31 |         record = { 1 };
32 |         counter::insert( record, counter::Id::rotmg );
33 | 
34 |         // counter::inc_flop_count() is deliberately not called: this
35 |         // operation does not incur significant FLOPs.
36 |     #endif
37 | 
38 |     // BLAS_srotmg receives the address of the local copy of b.
39 |     BLAS_srotmg( d1, d2, a, &b, param );
40 | }
41 | 
42 | // -----------------------------------------------------------------------------
43 | /// CPU, double version; b is taken by value and its address is passed on.
44 | /// @ingroup rotmg
45 | void rotmg(
46 |     double *d1, double *d2,
47 |     double *a,  double  b,
48 |     double  param[5] )
49 | {
50 |     #ifdef BLAS_HAVE_PAPI
51 |         // PAPI instrumentation: log one rotmg call.
52 |         counter::rotmg_type record;
53 |         memset( &record, 0, sizeof( record ) );
54 |         record = { 1 };
55 |         counter::insert( record, counter::Id::rotmg );
56 | 
57 |         // counter::inc_flop_count() is deliberately not called: this
58 |         // operation does not incur significant FLOPs.
59 |     #endif
60 | 
61 |     // BLAS_drotmg receives the address of the local copy of b.
62 |     BLAS_drotmg( d1, d2, a, &b, param );
63 | }
64 | 
65 | }  // namespace blas
66 | 


--------------------------------------------------------------------------------
/src/util.cc:
--------------------------------------------------------------------------------
 1 | // Copyright (c) 2017-2023, University of Tennessee. All rights reserved.
 2 | // SPDX-License-Identifier: BSD-3-Clause
 3 | // This program is free software: you can redistribute it and/or modify it under
 4 | // the terms of the BSD 3-Clause license. See the accompanying LICENSE file.
 5 | 
 6 | #include "blas/util.hh"
 7 | 
 8 | namespace blas {
 9 | // Help strings listing the accepted spellings of Layout, Op, Uplo, Diag, and Side values.
10 | const char* Layout_help = "one of: C or ColMajor; R or RowMajor";
11 | const char* Op_help     = "one of: N or NoTrans; T or Trans; C or ConjTrans";
12 | const char* Uplo_help   = "one of: L or Lower; U or Upper";
13 | const char* Diag_help   = "one of: N or NonUnit; U or Unit";
14 | const char* Side_help   = "one of: L or Left; R or Right";
15 | 
16 | }  // namespace blas
17 | 


--------------------------------------------------------------------------------
/src/version.cc:
--------------------------------------------------------------------------------
 1 | // Copyright (c) 2017-2023, University of Tennessee. All rights reserved.
 2 | // SPDX-License-Identifier: BSD-3-Clause
 3 | // This program is free software: you can redistribute it and/or modify it under
 4 | // the terms of the BSD 3-Clause license. See the accompanying LICENSE file.
 5 | 
 6 | #include "blas.hh"
 7 | 
 8 | namespace blas {
 9 | 
10 | //------------------------------------------------------------------------------
11 | /// Reports the BLAS++ version.
12 | /// @return Integer of the form yyyymmrr, where yyyy is the year, mm the
13 | /// month, and rr the release counter within the month, starting at 00.
14 | int blaspp_version()
15 | {
16 |     int const version = BLASPP_VERSION;
17 |     return version;
18 | }
19 | 
20 | // BLASPP_ID is the Mercurial or git commit hash ID, either
21 | // defined by `git rev-parse --short HEAD` in Makefile, or defined here by
22 | // make_release.py for release tar files; else "unknown". DO NOT EDIT.
23 | #ifndef BLASPP_ID
24 | #define BLASPP_ID "unknown"
25 | #endif
26 | 
27 | //------------------------------------------------------------------------------
28 | /// @return BLAS++ Mercurial or git commit hash ID (the BLASPP_ID macro).
29 | const char* blaspp_id()
30 | {
31 |     const char* const id = BLASPP_ID;
32 |     return id;
33 | }
34 | 
35 | }  // namespace blas
36 | 


--------------------------------------------------------------------------------
/test/GNUmakefile:
--------------------------------------------------------------------------------
1 | top = ..
2 | include ${top}/GNUmakefile.subdir
3 | # Everything else comes from the included ${top}/GNUmakefile.subdir.


--------------------------------------------------------------------------------
/test/print_matrix.hh:
--------------------------------------------------------------------------------
  1 | // Copyright (c) 2017-2023, University of Tennessee. All rights reserved.
  2 | // SPDX-License-Identifier: BSD-3-Clause
  3 | // This program is free software: you can redistribute it and/or modify it under
  4 | // the terms of the BSD 3-Clause license. See the accompanying LICENSE file.
  5 | 
  6 | #ifndef PRINT_HH
  7 | #define PRINT_HH
  8 | 
  9 | #include <assert.h>
 10 | #include <complex>
 11 | 
 12 | // -----------------------------------------------------------------------------
 13 | // Prints the m-by-n matrix A (entry (i, j) is A[ i + lda*j ]) as "[", one
 14 | // line per row, then "];"; each entry uses `format` preceded by a space.
 15 | template <typename T>
 16 | void print_matrix( int64_t m, int64_t n, T *A, int64_t lda,
 17 |                    const char* format="%9.4f" )
 18 | {
 19 |     assert( m >= 0 );
 20 |     assert( n >= 0 );
 21 |     assert( lda >= m );
 22 |     char format2[32];
 23 |     int len = snprintf( format2, sizeof(format2), " %s", format );
 24 |     assert( len >= 0 && size_t(len) < sizeof(format2) );  // not truncated
 25 |     (void) len;  // unused when NDEBUG disables assert
 26 | 
 27 |     printf( "[\n" );
 28 |     for (int64_t i = 0; i < m; ++i) {
 29 |         for (int64_t j = 0; j < n; ++j) {
 30 |             printf( format2, A[ i + size_t(lda)*j ] );
 31 |         }
 32 |         printf( "\n" );
 33 |     }
 34 |     printf( "];\n" );
 35 | }
 36 | 
 37 | // -----------------------------------------------------------------------------
 38 | template <typename T>
 39 | void print_matrix( int64_t m, int64_t n, std::complex<T>* A, int64_t lda,
 40 |                    const char* format="%9.4f" )
 41 | {
 42 |     // Prints the m-by-n column-major complex matrix A (leading dimension lda)
 43 |     // to stdout as "[ rows ];"; `format` is applied to real and imaginary parts.
 44 |     assert( m >= 0 );
 45 |     assert( n >= 0 );
 46 |     assert( lda >= m );
 47 | 
 48 |     printf( "[\n" );
 49 |     for (int64_t i = 0; i < m; ++i) {
 50 |         for (int64_t j = 0; j < n; ++j) {
 51 |             // Printed piecewise: composing " <re> + <im>i" in a 32-byte
 52 |             // buffer truncated any `format` longer than 13 characters.
 53 |             printf( " " );    printf( format, real( A[ i + size_t(lda)*j ] ) );
 54 |             printf( " + " );  printf( format, imag( A[ i + size_t(lda)*j ] ) );
 55 |             printf( "i" );
 56 |         }
 57 |         printf( "\n" );
 58 |     }
 59 |     printf( "];\n" );
 60 | }
 61 | 
 62 | // -----------------------------------------------------------------------------
 63 | template <typename T>
 64 | void print_vector( int64_t n, T *x, int64_t incx,
 65 |                    const char* format="%9.4f" )
 66 | {
 67 |     // Prints the n entries of x with stride incx to stdout as "[ ... ]';".
 68 |     // A negative incx starts from the far end, at x[ (1 - n)*incx ].
 69 |     assert( n >= 0 );
 70 |     assert( incx != 0 );
 71 |     printf( "[" );
 72 |     int64_t ix = (incx > 0 ? 0 : (-n + 1)*incx);
 73 |     for (int64_t i = 0; i < n; ++i) {
 74 |         // separate printf calls: no fixed-size buffer that truncates `format`
 75 |         printf( " " );
 76 |         printf( format, x[ix] );
 77 |         ix += incx;
 78 |     }
 79 |     printf( " ]';\n" );
 80 | }
 80 | 
 81 | // -----------------------------------------------------------------------------
 82 | template <typename T>
 83 | void print_vector( int64_t n, std::complex<T>* x, int64_t incx,
 84 |                    const char* format="%9.4f" )
 85 | {
 86 |     // Prints n complex entries of x (stride incx) as "[ <re> + <im>i ... ]';".
 87 |     // Printed piecewise so that no fixed-size buffer can truncate `format`.
 88 |     assert( n >= 0 );
 89 |     assert( incx != 0 );
 90 |     printf( "[" );
 91 |     int64_t ix = (incx > 0 ? 0 : (-n + 1)*incx);
 92 |     for (int64_t i = 0; i < n; ++i, ix += incx) {
 93 |         printf( " " );    printf( format, real( x[ix] ) );
 94 |         printf( " + " );  printf( format, imag( x[ix] ) );
 95 |         printf( "i" );
 96 |     }
 97 |     printf( " ]';\n" );
 98 | }
 99 | 
100 | #endif        //  #ifndef PRINT_HH
101 | 


--------------------------------------------------------------------------------
/test/test_asum.cc:
--------------------------------------------------------------------------------
  1 | // Copyright (c) 2017-2023, University of Tennessee. All rights reserved.
  2 | // SPDX-License-Identifier: BSD-3-Clause
  3 | // This program is free software: you can redistribute it and/or modify it under
  4 | // the terms of the BSD 3-Clause license. See the accompanying LICENSE file.
  5 | 
  6 | #include "test.hh"
  7 | #include "cblas_wrappers.hh"
  8 | #include "lapack_wrappers.hh"
  9 | #include "blas/flops.hh"
 10 | #include "print_matrix.hh"
 11 | 
 12 | // -----------------------------------------------------------------------------
 13 | template <typename T>
 14 | void test_asum_work( Params& params, bool run )
 15 | {   // Times blas::asum on a generated vector x and checks it against cblas_asum.
 16 |     using namespace testsweeper;
 17 |     using std::abs;
 18 |     using blas::max;
 19 |     using real_t   = blas::real_type< T >;
 20 | 
 21 |     // get & mark input values
 22 |     int64_t n       = params.dim.n();
 23 |     int64_t incx    = params.incx();
 24 |     int64_t verbose = params.verbose();
 25 | 
 26 |     // mark non-standard output values
 27 |     params.gflops();
 28 |     params.gbytes();
 29 |     params.ref_time();
 30 |     params.ref_gflops();
 31 |     params.ref_gbytes();
 32 | 
 33 |     // adjust header to msec
 34 |     params.time.name( "time (ms)" );
 35 |     params.ref_time.name( "ref time (ms)" );
 36 |     params.ref_time.width( 13 );
 37 | 
 38 |     if (! run)  // when run is false, only the parameter setup above is performed
 39 |         return;
 40 | 
 41 |     // setup: x has room for n entries spaced |incx| apart
 42 |     size_t size_x = max( (n - 1) * abs( incx ) + 1, 0 );
 43 |     T* x = new T[ size_x ];
 44 | 
 45 |     int64_t idist = 1;
 46 |     int iseed[4] = { 0, 0, 0, 1 };
 47 |     lapack_larnv( idist, iseed, size_x, x );  // presumably fills x with random numbers (LAPACK larnv)
 48 | 
 49 |     // test error exits: n < 0 and incx <= 0 are expected to throw blas::Error
 50 |     assert_throw( blas::asum( -1, x, incx ), blas::Error );
 51 |     assert_throw( blas::asum(  n, x,    0 ), blas::Error );
 52 |     assert_throw( blas::asum(  n, x,   -1 ), blas::Error );
 53 | 
 54 |     if (verbose >= 1) {
 55 |         printf( "\n"
 56 |                 "x n=%5lld, inc=%5lld, size=%10lld\n",
 57 |                 llong( n ), llong( incx ), llong( size_x ) );
 58 |     }
 59 |     if (verbose >= 2) {
 60 |         printf( "x = " ); print_vector( n, x, incx );
 61 |     }
 62 | 
 63 |     // run test; only the blas::asum call is inside the timed region
 64 |     testsweeper::flush_cache( params.cache() );
 65 |     double time = get_wtime();
 66 |     real_t result = blas::asum( n, x, incx );
 67 |     time = get_wtime() - time;
 68 | 
 69 |     double gflop = blas::Gflop< T >::asum( n );
 70 |     double gbyte = blas::Gbyte< T >::asum( n );
 71 |     params.time()   = time * 1000;  // msec
 72 |     params.gflops() = gflop / time;
 73 |     params.gbytes() = gbyte / time;
 74 | 
 75 |     if (verbose >= 1) {
 76 |         printf( "result = %.4e\n", result );
 77 |     }
 78 | 
 79 |     if (params.check() == 'y') {
 80 |         // run reference; only the cblas_asum call is inside the timed region
 81 |         testsweeper::flush_cache( params.cache() );
 82 |         time = get_wtime();
 83 |         real_t ref = cblas_asum( n, x, incx );
 84 |         time = get_wtime() - time;
 85 | 
 86 |         params.ref_time()   = time * 1000;  // msec
 87 |         params.ref_gflops() = gflop / time;
 88 |         params.ref_gbytes() = gbyte / time;
 89 | 
 90 |         if (verbose >= 1) {
 91 |             printf( "ref    = %.4e\n", ref );
 92 |         }
 93 | 
 94 |         // relative forward error: |ref - result| / (n * ref), left absolute if ref == 0
 95 |         // note: using sqrt(n) here gives failures
 96 |         real_t error = abs( ref - result );
 97 |         if (ref != 0) {
 98 |             error /= (n * ref);
 99 |         }
100 | 
101 |         // complex needs extra factor; see Higham, 2002, sec. 3.6.
102 |         if (blas::is_complex_v<T>) {
103 |             error /= 2*sqrt(2);
104 |         }
105 | 
106 |         real_t u = 0.5 * std::numeric_limits< real_t >::epsilon();  // pass threshold: half of machine epsilon
107 |         params.error() = error;
108 |         params.okay() = (error < u);
109 |     }
110 | 
111 |     delete[] x;
112 | }
113 | 
114 | // -----------------------------------------------------------------------------
115 | void test_asum( Params& params, bool run )
116 | {
117 |     // Runs test_asum_work instantiated for the scalar type that
118 |     // params.datatype() reports.
119 |     using testsweeper::DataType;
120 |     const auto dtype = params.datatype();
121 | 
122 |     if (dtype == DataType::Single) {
123 |         test_asum_work< float >( params, run );
124 |     }
125 |     else if (dtype == DataType::Double) {
126 |         test_asum_work< double >( params, run );
127 |     }
128 |     else if (dtype == DataType::SingleComplex) {
129 |         test_asum_work< std::complex<float> >( params, run );
130 |     }
131 |     else if (dtype == DataType::DoubleComplex) {
132 |         test_asum_work< std::complex<double> >( params, run );
133 |     }
134 |     else {
135 |         // any other datatype is not handled by this tester
136 |         throw std::exception();
137 |     }
138 | }
139 | 


--------------------------------------------------------------------------------
/test/test_error.cc:
--------------------------------------------------------------------------------
 1 | // Copyright (c) 2017-2023, University of Tennessee. All rights reserved.
 2 | // SPDX-License-Identifier: BSD-3-Clause
 3 | // This program is free software: you can redistribute it and/or modify it under
 4 | // the terms of the BSD 3-Clause license. See the accompanying LICENSE file.
 5 | 
 6 | #include "test.hh"
 7 | #include "blas/util.hh"
 8 | 
 9 | void test_error( Params& params, bool run )
10 | {
11 |     // Exercises the error-checking helpers; m picks which one is invoked,
12 |     // and each is given a condition that trips when m == n.
13 |     const int64_t m = params.dim.m();
14 |     const int64_t n = params.dim.n();
15 | 
16 |     if (! run) {
17 |         printf( "test error checks the internal error routines:\n"
18 |                 "if m == 100, checks: blas_error_if( m == n );\n"
19 |                 "if m == 200, checks: blas_error_if_msg( m == n, \"m %%d == n %%d\", m, n );\n"
20 |                 "if m == 300, checks: assert( m != n );\n\n" );
21 |         return;
22 |     }
23 | 
24 |     switch (m) {
25 |         case 100:  blas_error_if( m == n );  break;
26 |         case 200:
27 |             blas_error_if_msg( m == n, "m %lld == n %lld", llong( m ), llong( n ) );
28 |             break;
29 |         case 300:  assert( m != n );  break;
30 |     }
31 | }
32 | 


--------------------------------------------------------------------------------
/test/test_iamax.cc:
--------------------------------------------------------------------------------
  1 | // Copyright (c) 2017-2023, University of Tennessee. All rights reserved.
  2 | // SPDX-License-Identifier: BSD-3-Clause
  3 | // This program is free software: you can redistribute it and/or modify it under
  4 | // the terms of the BSD 3-Clause license. See the accompanying LICENSE file.
  5 | 
  6 | #include "test.hh"
  7 | #include "cblas_wrappers.hh"
  8 | #include "lapack_wrappers.hh"
  9 | #include "blas/flops.hh"
 10 | #include "print_matrix.hh"
 11 | 
 12 | // -----------------------------------------------------------------------------
 13 | template <typename T>
 14 | void test_iamax_work( Params& params, bool run )
 15 | {   // Times blas::iamax on a generated vector x and checks it against cblas_iamax.
 16 |     using namespace testsweeper;
 17 |     using real_t   = blas::real_type< T >;
 18 |     using std::abs;
 19 |     using blas::max;
 20 | 
 21 |     // get & mark input values
 22 |     int64_t n       = params.dim.n();
 23 |     int64_t incx    = params.incx();
 24 |     int64_t verbose = params.verbose();
 25 | 
 26 |     // mark non-standard output values
 27 |     params.gflops();
 28 |     params.gbytes();
 29 |     params.ref_time();
 30 |     params.ref_gflops();
 31 |     params.ref_gbytes();
 32 | 
 33 |     // adjust header to msec
 34 |     params.time.name( "time (ms)" );
 35 |     params.ref_time.name( "ref time (ms)" );
 36 |     params.ref_time.width( 13 );
 37 | 
 38 |     if (! run)  // when run is false, only the parameter setup above is performed
 39 |         return;
 40 | 
 41 |     // setup: x has room for n entries spaced |incx| apart
 42 |     size_t size_x = max( (n - 1) * abs( incx ) + 1, 0 );
 43 |     T* x = new T[ size_x ];
 44 | 
 45 |     int64_t idist = 1;
 46 |     int iseed[4] = { 0, 0, 0, 1 };
 47 |     lapack_larnv( idist, iseed, size_x, x );  // presumably fills x with random numbers (LAPACK larnv)
 48 | 
 49 |     // test error exits: n < 0 and incx <= 0 are expected to throw blas::Error
 50 |     assert_throw( blas::iamax( -1, x, incx ), blas::Error );
 51 |     assert_throw( blas::iamax(  n, x,    0 ), blas::Error );
 52 |     assert_throw( blas::iamax(  n, x,   -1 ), blas::Error );
 53 | 
 54 |     if (verbose >= 1) {
 55 |         printf( "\n"
 56 |                 "x n=%5lld, inc=%5lld, size=%10lld\n",
 57 |                 llong( n ), llong( incx ), llong( size_x ) );
 58 |     }
 59 |     if (verbose >= 2) {
 60 |         printf( "x = " ); print_vector( n, x, incx );
 61 |     }
 62 | 
 63 |     // run test; only the blas::iamax call is inside the timed region
 64 |     testsweeper::flush_cache( params.cache() );
 65 |     double time = get_wtime();
 66 |     int64_t result = blas::iamax( n, x, incx );
 67 |     time = get_wtime() - time;
 68 | 
 69 |     double gflop = blas::Gflop< T >::iamax( n );
 70 |     double gbyte = blas::Gbyte< T >::iamax( n );
 71 |     params.time()   = time * 1000;  // msec
 72 |     params.gflops() = gflop / time;
 73 |     params.gbytes() = gbyte / time;
 74 | 
 75 |     if (verbose >= 1) {
 76 |         printf( "result = %5lld\n", llong( result ) );
 77 |     }
 78 | 
 79 |     if (params.check() == 'y') {
 80 |         // run reference
 81 |         testsweeper::flush_cache( params.cache() );
 82 |         time = get_wtime();
 83 |         int64_t ref = cblas_iamax( n, x, incx );
 84 |         if (n == 0)  // NOTE(review): presumably cblas_iamax and blas::iamax differ by 1 for empty x - confirm
 85 |             ref -= 1;
 86 |         time = get_wtime() - time;
 87 | 
 88 |         params.ref_time()   = time * 1000;  // msec
 89 |         params.ref_gflops() = gflop / time;
 90 |         params.ref_gbytes() = gbyte / time;
 91 | 
 92 |         if (verbose >= 1) {
 93 |             printf( "ref    = %5lld\n", llong( ref ) );
 94 |         }
 95 | 
 96 |         // error = |ref - result|, the distance between the two returned indices
 97 |         real_t error = abs( ref - result );
 98 |         params.error() = error;
 99 | 
100 |         // iamax must be exact!
101 |         params.okay() = (error == 0);
102 |     }
103 | 
104 |     delete[] x;
105 | }
106 | 
107 | // -----------------------------------------------------------------------------
108 | void test_iamax( Params& params, bool run )
109 | {
110 |     // Runs test_iamax_work instantiated for the scalar type that
111 |     // params.datatype() reports.
112 |     using testsweeper::DataType;
113 |     const auto dtype = params.datatype();
114 | 
115 |     if (dtype == DataType::Single) {
116 |         test_iamax_work< float >( params, run );
117 |     }
118 |     else if (dtype == DataType::Double) {
119 |         test_iamax_work< double >( params, run );
120 |     }
121 |     else if (dtype == DataType::SingleComplex) {
122 |         test_iamax_work< std::complex<float> >( params, run );
123 |     }
124 |     else if (dtype == DataType::DoubleComplex) {
125 |         test_iamax_work< std::complex<double> >( params, run );
126 |     }
127 |     else {
128 |         // any other datatype is not handled by this tester
129 |         throw std::exception();
130 |     }
131 | }
132 | 


--------------------------------------------------------------------------------
/test/test_nrm2.cc:
--------------------------------------------------------------------------------
  1 | // Copyright (c) 2017-2023, University of Tennessee. All rights reserved.
  2 | // SPDX-License-Identifier: BSD-3-Clause
  3 | // This program is free software: you can redistribute it and/or modify it under
  4 | // the terms of the BSD 3-Clause license. See the accompanying LICENSE file.
  5 | 
  6 | #include "test.hh"
  7 | #include "cblas_wrappers.hh"
  8 | #include "lapack_wrappers.hh"
  9 | #include "blas/flops.hh"
 10 | #include "print_matrix.hh"
 11 | 
 12 | // -----------------------------------------------------------------------------
 13 | template <typename T>
 14 | void test_nrm2_work( Params& params, bool run )
 15 | {   // Times blas::nrm2 on a generated vector x and checks it against cblas_nrm2.
 16 |     using namespace testsweeper;
 17 |     using std::abs;
 18 |     using blas::max;
 19 |     using scalar_t = blas::scalar_type<T>;
 20 |     using real_t   = blas::real_type<scalar_t>;
 21 | 
 22 |     // get & mark input values
 23 |     int64_t n       = params.dim.n();
 24 |     int64_t incx    = params.incx();
 25 |     int64_t verbose = params.verbose();
 26 | 
 27 |     // mark non-standard output values
 28 |     params.gflops();
 29 |     params.gbytes();
 30 |     params.ref_time();
 31 |     params.ref_gflops();
 32 |     params.ref_gbytes();
 33 | 
 34 |     // adjust header to msec
 35 |     params.time.name( "time (ms)" );
 36 |     params.ref_time.name( "ref time (ms)" );
 37 |     params.ref_time.width( 13 );
 38 | 
 39 |     if (! run)  // when run is false, only the parameter setup above is performed
 40 |         return;
 41 | 
 42 |     // setup: x has room for n entries spaced |incx| apart
 43 |     size_t size_x = max( (n - 1) * abs( incx ) + 1, 0 );
 44 |     T* x = new T[ size_x ];
 45 | 
 46 |     int64_t idist = 1;
 47 |     int iseed[4] = { 0, 0, 0, 1 };
 48 |     lapack_larnv( idist, iseed, size_x, x );  // presumably fills x with random numbers (LAPACK larnv)
 49 | 
 50 |     // test error exits: n < 0 and incx <= 0 are expected to throw blas::Error
 51 |     assert_throw( blas::nrm2( -1, x, incx ), blas::Error );
 52 |     assert_throw( blas::nrm2(  n, x,    0 ), blas::Error );
 53 |     assert_throw( blas::nrm2(  n, x,   -1 ), blas::Error );
 54 | 
 55 |     if (verbose >= 1) {
 56 |         printf( "\n"
 57 |                 "x n=%5lld, inc=%5lld, size=%10lld\n",
 58 |                 llong( n ), llong( incx ), llong( size_x ) );
 59 |     }
 60 |     if (verbose >= 2) {
 61 |         printf( "x = " ); print_vector( n, x, incx );
 62 |     }
 63 | 
 64 |     // run test; only the blas::nrm2 call is inside the timed region
 65 |     testsweeper::flush_cache( params.cache() );
 66 |     double time = get_wtime();
 67 |     real_t result = blas::nrm2( n, x, incx );
 68 |     time = get_wtime() - time;
 69 | 
 70 |     double gflop = blas::Gflop< T >::nrm2( n );
 71 |     double gbyte = blas::Gbyte< T >::nrm2( n );
 72 |     params.time()   = time * 1000;  // msec
 73 |     params.gflops() = gflop / time;
 74 |     params.gbytes() = gbyte / time;
 75 | 
 76 |     if (verbose >= 2) {
 77 |         printf( "result = %.4e\n", result );
 78 |     }
 79 | 
 80 |     if (params.check() == 'y') {
 81 |         // run reference; note it is called with |incx| rather than incx
 82 |         testsweeper::flush_cache( params.cache() );
 83 |         time = get_wtime();
 84 |         real_t ref = cblas_nrm2( n, x, std::abs(incx) );
 85 |         time = get_wtime() - time;
 86 | 
 87 |         params.ref_time()   = time * 1000;  // msec
 88 |         params.ref_gflops() = gflop / time;
 89 |         params.ref_gbytes() = gbyte / time;
 90 | 
 91 |         if (verbose >= 2) {
 92 |             printf( "ref    = %.4e\n", ref );
 93 |         }
 94 | 
 95 |         // relative forward error: |ref - result| / (sqrt(n+1) * ref), left absolute if ref == 0
 96 |         real_t error = abs( ref - result );
 97 |         if (ref != 0) {
 98 |             error /= sqrt(n+1) * ref;
 99 |         }
100 | 
101 |         // complex needs extra factor; see Higham, 2002, sec. 3.6.
102 |         if (blas::is_complex_v<scalar_t>) {
103 |             error /= 2*sqrt(2);
104 |         }
105 | 
106 |         real_t u = 0.5 * std::numeric_limits< real_t >::epsilon();  // pass threshold: half of machine epsilon
107 |         params.error() = error;
108 |         params.okay() = (error < u);
109 |     }
110 | 
111 |     delete[] x;
112 | }
113 | 
114 | // -----------------------------------------------------------------------------
115 | void test_nrm2( Params& params, bool run )
116 | {
117 |     // Runs test_nrm2_work instantiated for the scalar type that
118 |     // params.datatype() reports.
119 |     using testsweeper::DataType;
120 |     const auto dtype = params.datatype();
121 | 
122 |     if (dtype == DataType::Single) {
123 |         test_nrm2_work< float >( params, run );
124 |     }
125 |     else if (dtype == DataType::Double) {
126 |         test_nrm2_work< double >( params, run );
127 |     }
128 |     else if (dtype == DataType::SingleComplex) {
129 |         test_nrm2_work< std::complex<float> >( params, run );
130 |     }
131 |     else if (dtype == DataType::DoubleComplex) {
132 |         test_nrm2_work< std::complex<double> >( params, run );
133 |     }
134 |     else {
135 |         // any other datatype is not handled by this tester
136 |         throw std::exception();
137 |     }
138 | }
139 | 


--------------------------------------------------------------------------------
/test/test_rotmg.cc:
--------------------------------------------------------------------------------
  1 | // Copyright (c) 2017-2023, University of Tennessee. All rights reserved.
  2 | // SPDX-License-Identifier: BSD-3-Clause
  3 | // This program is free software: you can redistribute it and/or modify it under
  4 | // the terms of the BSD 3-Clause license. See the accompanying LICENSE file.
  5 | 
  6 | #include "test.hh"
  7 | #include "cblas_wrappers.hh"
  8 | #include "lapack_wrappers.hh"
  9 | #include "print_matrix.hh"
 10 | 
 11 | // -----------------------------------------------------------------------------
 12 | template <typename T>
 13 | void test_rotmg_work( Params& params, bool run )
 14 | {
 15 |     // Times blas::rotmg on n generated inputs and checks outputs against cblas_rotmg.
 16 |     using namespace testsweeper;
 17 |     using std::abs;
 18 |     using real_t   = blas::real_type< T >;
 19 |     const real_t epsilon = std::numeric_limits< real_t >::epsilon();
 20 | 
 21 |     // get & mark input values
 22 |     int64_t n = params.dim.n();
 23 |     double tol      = params.tol() * epsilon;
 24 | 
 25 |     // mark non-standard output values
 26 |     params.ref_time();
 27 | 
 28 |     // adjust header to msec
 29 |     params.time.name( "time (ms)" );
 30 |     params.ref_time.name( "ref time (ms)" );
 31 |     params.ref_time.width( 13 );
 32 | 
 33 |     if (! run)
 34 |         return;
 35 | 
 36 |     // setup; buffers are passed as data() because &v[0] is undefined when n == 0
 37 |     std::vector<T> d1( n ), d1_ref( n );
 38 |     std::vector<T> d2( n ), d2_ref( n );
 39 |     std::vector<T> x1( n ), x1_ref( n );
 40 |     std::vector<T> y1( n ), y1_ref( n );
 41 |     std::vector<T> ps( 5*n ), ps_ref( 5*n );
 42 | 
 43 |     int64_t idist = 3;
 44 |     int iseed[4] = { 0, 0, 0, 1 };
 45 |     lapack_larnv( idist, iseed, n, d1.data() );
 46 |     lapack_larnv( idist, iseed, n, d2.data() );
 47 |     lapack_larnv( idist, iseed, n, x1.data() );
 48 |     lapack_larnv( idist, iseed, n, y1.data() );
 49 |     lapack_larnv( idist, iseed, 5*n, ps.data() );
 50 | 
 51 |     d1_ref = d1;
 52 |     d2_ref = d2;
 53 |     x1_ref = x1;
 54 |     y1_ref = y1;
 55 |     ps_ref = ps;
 56 | 
 57 |     // run test
 58 |     testsweeper::flush_cache( params.cache() );
 59 |     double time = get_wtime();
 60 |     for (int64_t i = 0; i < n; ++i) {
 61 |         blas::rotmg( &d1[i], &d2[i], &x1[i], y1[i], &ps[5*i] );
 62 |     }
 63 |     time = get_wtime() - time;
 64 |     params.time() = time * 1000;  // msec
 65 | 
 66 |     if (params.check() == 'y') {
 67 |         // run reference
 68 |         testsweeper::flush_cache( params.cache() );
 69 |         time = get_wtime();
 70 |         for (int64_t i = 0; i < n; ++i) {
 71 |             cblas_rotmg( &d1_ref[i], &d2_ref[i], &x1_ref[i], y1_ref[i], &ps_ref[5*i] );
 72 |         }
 73 |         time = get_wtime() - time;
 74 |         params.ref_time() = time * 1000;  // msec
 75 | 
 76 |         // get max error of all outputs. With n == 0 there is nothing to
 77 |         // compare, and element [0] of an empty vector must not be read.
 78 |         real_t error = 0;
 79 |         if (n > 0) {
 80 |             cblas_axpy(   n, -1.0, d1.data(), 1, d1_ref.data(), 1 );
 81 |             cblas_axpy(   n, -1.0, d2.data(), 1, d2_ref.data(), 1 );
 82 |             cblas_axpy(   n, -1.0, x1.data(), 1, x1_ref.data(), 1 );
 83 |             cblas_axpy( 5*n, -1.0, ps.data(), 1, ps_ref.data(), 1 );
 84 | 
 85 |             int64_t id1 = cblas_iamax(   n, d1_ref.data(), 1 );
 86 |             int64_t id2 = cblas_iamax(   n, d2_ref.data(), 1 );
 87 |             int64_t ix1 = cblas_iamax(   n, x1_ref.data(), 1 );
 88 |             int64_t ips = cblas_iamax( 5*n, ps_ref.data(), 1 );
 89 |             error = blas::max(
 90 |                 abs( d1_ref[ id1 ] ),
 91 |                 abs( d2_ref[ id2 ] ),
 92 |                 abs( x1_ref[ ix1 ] ),
 93 |                 abs( ps_ref[ ips ] )
 94 |             );
 95 |         }
 96 | 
 97 |         // error is normally 0, but allow for some rounding just in case.
 98 |         params.error() = error;
 99 |         params.okay() = (error < tol);
100 |     }
101 | }
102 | 
103 | // -----------------------------------------------------------------------------
104 | void test_rotmg( Params& params, bool run )
105 | {
106 |     // Runs test_rotmg_work for the real type that params.datatype() reports.
107 |     // modified Givens not available for complex
108 |     using testsweeper::DataType;
109 |     const auto dtype = params.datatype();
110 | 
111 |     if (dtype == DataType::Single) {
112 |         test_rotmg_work< float >( params, run );
113 |     }
114 |     else if (dtype == DataType::Double) {
115 |         test_rotmg_work< double >( params, run );
116 |     }
117 |     else {
118 |         // complex and any other datatype are rejected
119 |         throw std::exception();
120 |     }
121 | }
122 | 


--------------------------------------------------------------------------------
/tools/copyright.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # Thin command-line entry point: runs copyright() from the `release` module.
3 | import release
4 | release.copyright()
5 | 


--------------------------------------------------------------------------------
/tools/doxygen-filter.pl:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/perl
 2 | 
 3 | use strict;
 4 | 
 5 | # Returns the replacement for a `$` that was matched together with its
 6 | # optional prefix: `\`, `\f`, or the empty string.
 7 | sub dollar
 8 | {
 9 |     my $prefix = shift;
10 | 
11 |     # change \$ to $
12 |     return '$' if ($prefix eq '\\');
13 | 
14 |     # don't change \f$
15 |     return '\\f$' if ($prefix eq '\\f');
16 | 
17 |     # change $ to \f$
18 |     # (the captured prefix is kept in front of the result)
19 |     return $prefix . '\\f$';
20 | }
21 | 
22 | while (<>) {  # each input line (files named on the command line, else stdin) arrives in $_
23 |     # replace \[ and \] by \f[ and \f]
24 |     s/\\([\[\]])/\\f$1/g;
25 | 
26 |     # replace         $  by  \f$
27 |     # replace        \$  by  $
28 |     # don't change  \f$
29 |     s/(\\f|\\|)\$/dollar($1)/eg;
30 |     # print the (possibly rewritten) line held in $_
31 |     print
32 | }
33 | 


--------------------------------------------------------------------------------
/tools/hooks/pre-commit:
--------------------------------------------------------------------------------
 1 | #!/bin/sh
 2 | #
 3 | # Copyright (c) 2017-2023, University of Tennessee. All rights reserved.
 4 | # SPDX-License-Identifier: BSD-3-Clause
 5 | # This program is free software: you can redistribute it and/or modify it under
 6 | # the terms of the BSD 3-Clause license. See the accompanying LICENSE file.
 7 | #
 8 | # Does style checks before committing code.
 9 | # Unfortunately, at the moment astyle does not properly parse all the
10 | # C++ 11, 14, 17 syntax, so it formats things weirdly. Instead, this
11 | # hook uses a simple Perl script to check for a subset of errors to fix.
12 | 
13 | checker="tools/slate-style-reject.pl"
14 | 
15 | # Staged files (--diff-filter=ACMR) whose extension marks C, C++, or CUDA source.
16 | staged=$(git diff --cached --name-only --diff-filter=ACMR \
17 |          | grep -E "\.(c|cc|cpp|cxx|h|hh|hpp|hxx|cu|cuh)$")
18 | 
19 | # Unquoted on purpose: each staged file name becomes its own argument.
20 | $checker $staged
21 | status=$?
22 | # Comment out exit in .git/hooks/pre-commit to check files but allow the commit.
23 | exit $status
24 | 


--------------------------------------------------------------------------------
/tools/make_release.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # Thin entry point: calls release.make() with the package name and two source paths.
3 | import release
4 | release.make( 'blaspp', 'include/blas.hh', 'src/version.cc' )
5 | 


--------------------------------------------------------------------------------
/tools/slate-style-reject.pl:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/perl
  2 | #
  3 | # Catches certain common style issues and rejects them, returning non-zero exit code.
  4 | # Can be used for pre-commit and pre-push hook to prevent errors from coming in.
  5 | 
  6 | use strict;
  7 | use Getopt::Std;
  8 | 
  9 | my %opts = ();
 10 | getopts( 'vl', \%opts ) || exit(-1);
 11 | 
 12 | my $verbose = $opts{v};  # -v: also echo each offending source line
 13 | my $list    = $opts{l};  # -l: no per-line messages; print only names of offending files
 14 | 
 15 | # escape characters for ANSI colors; only $red and $black are used below
 16 | # see http://en.wikipedia.org/wiki/ANSI_escape_code
 17 | my $esc     = chr(0x1B) . "[";
 18 | my $red     = "${esc}31m";
 19 | my $green   = "${esc}32m";
 20 | my $yellow  = "${esc}33m";
 21 | my $blue    = "${esc}34m";
 22 | my $magenta = "${esc}35m";
 23 | my $cyan    = "${esc}36m";
 24 | my $white   = "${esc}37m";
 25 | my $black   = "${esc}0m";  # emitted after $red in messages to end the colored text
 26 | 
 27 | my $result = 0;  # OR of all per-file results; becomes the exit code
 28 | for my $ARG (@ARGV) {
 29 |     open( my $fileh, "<", $ARG ) or die( "Can't open $ARG: $!\n" );
 30 |     my $file_result = 0;  # set to 1 once any line of this file is flagged
 31 |     while (<$fileh>) {
 32 |         chomp;
 33 |         my $orig = $_;  # unmodified line, echoed with -v
 34 |         s@//.*\S@//@;     # ignore C++ comments
 35 |         s/".*?"/"..."/g;  # ignore strings
 36 | 
 37 |         my $line = 0;  # set to 1 when any check below flags this line
 38 |         if (m/\t/) {
 39 |             print( "$red$ARG:$.$black: tab\n" ) if (not $list);
 40 |             $line = 1;
 41 |         }
 42 | 
 43 |         if (m/ $/) {
 44 |             print( "$red$ARG:$.$black: trailing space\n" ) if (not $list);
 45 |             $line = 1;
 46 |         }
 47 | 
 48 |         if (m/^ *(if|for|while|switch|else if)\(/) {
 49 |             print( "$red$ARG:$.$black: missing space after `$1`\n" ) if (not $list);
 50 |             $line = 1;
 51 |         }
 52 | 
 53 |         if (m/^ *(if|for|while|switch|else if) \( +[^ ;]/) {
 54 |             print( "$red$ARG:$.$black: excess space inside parens after `$1`\n" ) if (not $list);
 55 |             $line = 1;
 56 |         }
 57 | 
 58 |         if (m/^ *\} *else/) {
 59 |             print( "$red$ARG:$.$black: don't cuddle } and else on same line\n" ) if (not $list);
 60 |             $line = 1;
 61 |         }
 62 | 
 63 |         if (m/\)\{/) {
 64 |             print( "$red$ARG:$.$black: missing space before { brace\n" ) if (not $list);
 65 |             $line = 1;
 66 |         }
 67 | 
 68 |         if (m/  +\\$/) {
 69 |             print( "$red$ARG:$.$black: excess space before line continuation\n" ) if (not $list);
 70 |             $line = 1;
 71 |         }
 72 | 
 73 |         # This checks 2-character operators.
 74 |         # It's hard to check < > = w/o full parser.
 75 |         if (m/[^ =](&&|\|\||==|<=|>=|!=|\+=|-=|\*=|\/=|\|=|\&=)[^ =]/) {
 76 |             print( "$red$ARG:$.$black: missing space around boolean operator\n" ) if (not $list);
 77 |             $line = 1;
 78 |         }
 79 | 
 80 |         # Prohibit space before , or ; unless at the beginning of a line.
 81 |         # Sometimes with #if conditions, the comma has to start the line.
 82 |         if (m/\S +[,;]/) {
 83 |             print( "$red$ARG:$.$black: excess space before comma or semi-colon\n" ) if (not $list);
 84 |             $line = 1;
 85 |         }
 86 | 
 87 |         # semi-colon ; must be at end, followed by space,
 88 |         # or followed by \n string as in printf( "];\n" ).
 89 |         if (m/(;(?!$|\s|\\n))/) {
 90 |             print( "$red$ARG:$.$black: missing space after semi-colon: <<$1>>\n" ) if (not $list);
 91 |             $line = 1;
 92 |         }
 93 | 
 94 |         # It's hard to check indentation w/o full parser, but at least
 95 |         # preprocessor, comments, and control keywords should be
 96 |         # indented correctly.
 97 |         if (m@^(    )* {1,3}(#|//|if|else|for|while|switch|case|default|break|throw)@) {
 98 |             print( "$red$ARG:$.$black: not 4-space indent\n" ) if (not $list);
 99 |             $line = 1;
100 |         }
101 | 
102 |         # Disabled check: hard limit on line length.
103 |         # if (m/.{85}/) {
104 |         #     print( "$red$ARG:$.$black: longer than 85 char hard limit\n" ) if (not $list);
105 |         #     $line = 1;
106 |         # }
107 |         $file_result |= $line;
108 |         if ($line and $verbose) {
109 |             print( "<$orig>\n\n" );
110 |         }
111 |     }
112 |     if ($file_result and $list) {
113 |         print( "$ARG\n" );
114 |     }
115 |     $result |= $file_result;
116 | }
117 | 
118 | #print( "result $result\n" );
119 | exit( $result );
120 | 


--------------------------------------------------------------------------------