├── .github └── workflows │ ├── build.sh │ ├── configure.sh │ ├── main.yml │ ├── setup_env.sh │ ├── test.sh │ └── variants.yml ├── .gitignore ├── CHANGELOG.md ├── CMakeLists.txt ├── GNUmakefile ├── GNUmakefile.subdir ├── INSTALL.md ├── LICENSE ├── README.md ├── blasppConfig.cmake.in ├── cmake ├── BLASConfig.cmake ├── BLASFinder.cmake ├── CBLASConfig.cmake ├── LAPACKFinder.cmake ├── config.cmake └── util.cmake ├── config ├── __init__.py ├── ansicodes.py ├── blas.cc ├── blis_version.cc ├── cblas.cc ├── compiler_cxx.cc ├── config.h ├── config.py ├── cublas.cc ├── essl_version.cc ├── hello.cc ├── lapack.py ├── lapack_potrf.cc ├── lapack_pstrf.cc ├── mkl_version.cc ├── onemkl.cc ├── openblas_version.cc ├── openmp.cc ├── return_complex.cc ├── return_complex_argument.cc ├── return_float.cc ├── return_float_f2c.cc ├── rocblas.cc └── std_atomic.cc ├── configure.py ├── docs └── doxygen │ ├── DoxygenLayout.xml │ ├── GNUmakefile │ ├── doxyfile.conf │ ├── errors.pl │ └── groups.dox ├── examples ├── CMakeLists.txt ├── Makefile ├── README.md ├── example_gemm.cc ├── example_util.cc └── util.hh ├── include ├── GNUmakefile ├── blas.hh └── blas │ ├── GNUmakefile │ ├── asum.hh │ ├── axpy.hh │ ├── batch_common.hh │ ├── config.h │ ├── copy.hh │ ├── counter.hh │ ├── defines.h.in │ ├── device.hh │ ├── device_blas.hh │ ├── dot.hh │ ├── dotu.hh │ ├── flops.hh │ ├── fortran.h │ ├── gemm.hh │ ├── gemv.hh │ ├── ger.hh │ ├── geru.hh │ ├── hemm.hh │ ├── hemv.hh │ ├── her.hh │ ├── her2.hh │ ├── her2k.hh │ ├── herk.hh │ ├── iamax.hh │ ├── mangling.h │ ├── nrm2.hh │ ├── rot.hh │ ├── rotg.hh │ ├── rotm.hh │ ├── rotmg.hh │ ├── scal.hh │ ├── swap.hh │ ├── symm.hh │ ├── symv.hh │ ├── syr.hh │ ├── syr2.hh │ ├── syr2k.hh │ ├── syrk.hh │ ├── trmm.hh │ ├── trmv.hh │ ├── trsm.hh │ ├── trsv.hh │ ├── util.hh │ └── wrappers.hh ├── lib └── pkgconfig │ └── blaspp.pc.in ├── make.inc.in ├── src ├── asum.cc ├── axpy.cc ├── batch_gemm.cc ├── batch_hemm.cc ├── batch_her2k.cc ├── batch_herk.cc ├── batch_symm.cc ├── 
batch_syr2k.cc ├── batch_syrk.cc ├── batch_trmm.cc ├── batch_trsm.cc ├── blas_internal.hh ├── copy.cc ├── cublas_wrappers.cc ├── cuda │ ├── device_conj.cu │ └── device_shift_vec.cu ├── device_asum.cc ├── device_axpy.cc ├── device_batch_gemm.cc ├── device_batch_gemm_group.cc ├── device_batch_hemm.cc ├── device_batch_her2k.cc ├── device_batch_herk.cc ├── device_batch_symm.cc ├── device_batch_syr2k.cc ├── device_batch_syrk.cc ├── device_batch_trmm.cc ├── device_batch_trsm.cc ├── device_copy.cc ├── device_dot.cc ├── device_error.cc ├── device_gemm.cc ├── device_gemv.cc ├── device_ger.cc ├── device_hemm.cc ├── device_hemv.cc ├── device_her.cc ├── device_her2.cc ├── device_her2k.cc ├── device_herk.cc ├── device_iamax.cc ├── device_internal.hh ├── device_nrm2.cc ├── device_queue.cc ├── device_rot.cc ├── device_rotg.cc ├── device_rotm.cc ├── device_rotmg.cc ├── device_scal.cc ├── device_swap.cc ├── device_symm.cc ├── device_symv.cc ├── device_syr.cc ├── device_syr2.cc ├── device_syr2k.cc ├── device_syrk.cc ├── device_trmm.cc ├── device_trmv.cc ├── device_trsm.cc ├── device_trsv.cc ├── device_utils.cc ├── dot.cc ├── gemm.cc ├── gemv.cc ├── ger.cc ├── hemm.cc ├── hemv.cc ├── her.cc ├── her2.cc ├── her2k.cc ├── herk.cc ├── hip │ ├── device_conj.hip │ └── device_shift_vec.hip ├── iamax.cc ├── nrm2.cc ├── onemkl_wrappers.cc ├── rocblas_wrappers.cc ├── rot.cc ├── rotg.cc ├── rotm.cc ├── rotmg.cc ├── scal.cc ├── swap.cc ├── symm.cc ├── symv.cc ├── syr.cc ├── syr2.cc ├── syr2k.cc ├── syrk.cc ├── trmm.cc ├── trmv.cc ├── trsm.cc ├── trsv.cc ├── util.cc └── version.cc ├── test ├── CMakeLists.txt ├── GNUmakefile ├── cblas_wrappers.cc ├── cblas_wrappers.hh ├── check_gemm.hh ├── lapack_wrappers.cc ├── lapack_wrappers.hh ├── print_matrix.hh ├── run_tests.py ├── test.cc ├── test.hh ├── test_asum.cc ├── test_asum_device.cc ├── test_axpy.cc ├── test_axpy_device.cc ├── test_batch_gemm.cc ├── test_batch_gemm_device.cc ├── test_batch_hemm.cc ├── test_batch_hemm_device.cc ├── 
test_batch_her2k.cc ├── test_batch_her2k_device.cc ├── test_batch_herk.cc ├── test_batch_herk_device.cc ├── test_batch_symm.cc ├── test_batch_symm_device.cc ├── test_batch_syr2k.cc ├── test_batch_syr2k_device.cc ├── test_batch_syrk.cc ├── test_batch_syrk_device.cc ├── test_batch_trmm.cc ├── test_batch_trmm_device.cc ├── test_batch_trsm.cc ├── test_batch_trsm_device.cc ├── test_copy.cc ├── test_copy_device.cc ├── test_dot.cc ├── test_dot_device.cc ├── test_error.cc ├── test_gemm.cc ├── test_gemm_device.cc ├── test_gemv.cc ├── test_gemv_device.cc ├── test_ger.cc ├── test_ger_device.cc ├── test_hemm.cc ├── test_hemm_device.cc ├── test_hemv.cc ├── test_hemv_device.cc ├── test_her.cc ├── test_her2.cc ├── test_her2_device.cc ├── test_her2k.cc ├── test_her2k_device.cc ├── test_her_device.cc ├── test_herk.cc ├── test_herk_device.cc ├── test_iamax.cc ├── test_iamax_device.cc ├── test_max.cc ├── test_memcpy.cc ├── test_memcpy_2d.cc ├── test_nrm2.cc ├── test_nrm2_device.cc ├── test_rot.cc ├── test_rot_device.cc ├── test_rotg.cc ├── test_rotg_device.cc ├── test_rotm.cc ├── test_rotm_device.cc ├── test_rotmg.cc ├── test_rotmg_device.cc ├── test_scal.cc ├── test_scal_device.cc ├── test_schur_gemm.cc ├── test_swap.cc ├── test_swap_device.cc ├── test_symm.cc ├── test_symm_device.cc ├── test_symv.cc ├── test_symv_device.cc ├── test_syr.cc ├── test_syr2.cc ├── test_syr2_device.cc ├── test_syr2k.cc ├── test_syr2k_device.cc ├── test_syr_device.cc ├── test_syrk.cc ├── test_syrk_device.cc ├── test_trmm.cc ├── test_trmm_device.cc ├── test_trmv.cc ├── test_trmv_device.cc ├── test_trsm.cc ├── test_trsm_device.cc ├── test_trsv.cc ├── test_trsv_device.cc └── test_util.cc └── tools ├── copyright.py ├── doxygen-filter.pl ├── hooks └── pre-commit ├── make_release.py ├── release.py └── slate-style-reject.pl /.github/workflows/build.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -x 2 | 3 | mydir=$(dirname $0) 4 | source ${mydir}/setup_env.sh 5 
| 6 | print "======================================== Build" 7 | make -j8 || exit 10 8 | 9 | print "======================================== Install" 10 | make -j8 install || exit 11 11 | ls -R ${top}/install 12 | 13 | print "======================================== Verify build" 14 | ldd_result=$(ldd test/tester) || exit 12 15 | echo "${ldd_result}" 16 | 17 | # Verify that tester linked with cublas or rocblas as intended. 18 | if [ "${device}" = "gpu_nvidia" ]; then 19 | echo "${ldd_result}" | grep cublas || exit 13 20 | 21 | elif [ "${device}" = "gpu_amd" ]; then 22 | echo "${ldd_result}" | grep rocblas || exit 14 23 | 24 | else 25 | # CPU-only not linked with cublas or rocblas. 26 | echo "${ldd_result}" | grep -P "cublas|rocblas" && exit 15 27 | fi 28 | 29 | # Verify that tester linked with intended CPU BLAS. 30 | echo "${ldd_result}" | grep -P "lib\S*${blas}" || exit 16 31 | 32 | # Verify that tester linked with intended ilp64 library, or not. 33 | if [[ $blas_int = "int64" ]] \ 34 | || [[ $bla_vendor = *64ilp* ]] \ 35 | || [[ $bla_vendor = *ilp64* ]] \ 36 | || [[ $BLAS_LIBRARIES = *ilp64* ]]; then 37 | echo "${ldd_result}" | grep -P "libmkl_\S+_ilp64" || exit 17 38 | else 39 | echo "${ldd_result}" | grep -P "libmkl_\S+_ilp64" && exit 18 40 | fi 41 | 42 | print "======================================== Finished build" 43 | exit 0 44 | -------------------------------------------------------------------------------- /.github/workflows/configure.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -x 2 | 3 | if [ "${maker}" = "cmake" ]; then 4 | rm -rf build 5 | mkdir -p build 6 | fi 7 | 8 | mydir=$(dirname $0) 9 | source ${mydir}/setup_env.sh 10 | 11 | print "======================================== Environment" 12 | # Show environment variables, excluding functions. 
13 | (set -o posix; set) 14 | 15 | print "======================================== Modules" 16 | quiet module list -l 17 | 18 | print "======================================== Query GPUs" 19 | if [ "${device}" = "gpu_nvidia" ]; then 20 | nvidia-smi 21 | elif [ "${device}" = "gpu_amd" ]; then 22 | rocm-smi 23 | elif [ "${device}" = "gpu_intel" ]; then 24 | clinfo 25 | sycl-ls 26 | fi 27 | 28 | print "======================================== Setup build" 29 | # Note: set all env variables in setup_env.sh, 30 | # else build.sh and test.sh won't see them. 31 | 32 | rm -rf ${top}/install 33 | if [ "${maker}" = "make" ]; then 34 | make distclean 35 | make config prefix=${top}/install 36 | err=$? 37 | if [[ $err -ne 0 ]]; then 38 | echo "<<<<<<<<<<<<<<<<<<<< begin config/log.txt" 39 | cat config/log.txt 40 | echo ">>>>>>>>>>>>>>>>>>>> end config/log.txt" 41 | exit 10 42 | fi 43 | 44 | elif [ "${maker}" = "cmake" ]; then 45 | 46 | if [[ $blas != "" ]]; then 47 | export cmake_blas="-Dblas=$blas" 48 | fi 49 | if [[ $blas_int != "" ]]; then 50 | export cmake_blas_int="-Dblas_int=$blas_int" 51 | fi 52 | if [[ $blas_threaded != "" ]]; then 53 | export cmake_blas_threaded="-Dblas_threaded=$blas_threaded" 54 | fi 55 | if [[ $BLAS_LIBRARIES != "" ]]; then 56 | export cmake_blas_libraries="-DBLAS_LIBRARIES=$BLAS_LIBRARIES" 57 | fi 58 | if [[ $bla_vendor != "" ]]; then 59 | unset cmake_blas 60 | unset cmake_blas_int 61 | unset cmake_blas_threaded 62 | unset cmake_blas_libraries 63 | export cmake_bla_vendor="-DBLA_VENDOR=$bla_vendor" 64 | fi 65 | 66 | # cmake_blas_libraries can have spaces; the rest do not expect spaces. 67 | cmake -Dcolor=no \ 68 | -DCMAKE_INSTALL_PREFIX=${top}/install \ 69 | $cmake_blas $cmake_blas_int $cmake_blas_threaded \ 70 | "$cmake_blas_libraries" $cmake_bla_vendor \ 71 | -Dgpu_backend=${gpu_backend} .. 
\ 72 | || exit 12 73 | fi 74 | 75 | cat include/blas/defines.h 76 | 77 | print "======================================== Finished configure" 78 | exit 0 79 | -------------------------------------------------------------------------------- /.github/workflows/main.yml: -------------------------------------------------------------------------------- 1 | # Continuous Testing on every PR. 2 | name: CT 3 | 4 | # Controls when the workflow will run 5 | on: 6 | # Triggers the workflow on push or pull request events but only for the master branch 7 | push: 8 | branches: [ master ] 9 | pull_request: 10 | branches: [ master ] 11 | 12 | # Allows you to run this workflow manually from the Actions tab 13 | workflow_dispatch: 14 | 15 | # A workflow run is made up of one or more jobs that can run sequentially or in parallel 16 | # 17 | # make cmake 18 | # cpu openblas Intel10_64ilp 19 | # nvidia mkl-int64 openblas 20 | # amd blis mkl-int64 21 | # intel mkl mkl 22 | # 23 | jobs: 24 | icl_blaspp: 25 | timeout-minutes: 120 26 | strategy: 27 | matrix: 28 | maker: [make, cmake] 29 | device: [cpu, gpu_nvidia, gpu_amd, gpu_intel] 30 | 31 | include: 32 | # Add `blas` and `blas_int` to existing configurations. 33 | - maker: make 34 | device: cpu 35 | blas: openblas 36 | 37 | - maker: cmake 38 | device: cpu 39 | bla_vendor: Intel10_64ilp # MKL int64 using CMake's FindBLAS 40 | 41 | - maker: make 42 | device: gpu_nvidia 43 | blas: mkl 44 | blas_int: int64 45 | 46 | - maker: cmake 47 | device: gpu_nvidia 48 | blas: openblas 49 | 50 | - maker: make 51 | device: gpu_amd 52 | blas: blis 53 | 54 | - maker: cmake 55 | device: gpu_amd 56 | blas: mkl 57 | blas_int: int64 58 | 59 | # For both make, cmake 60 | - device: gpu_intel 61 | blas: mkl 62 | 63 | fail-fast: false 64 | runs-on: ${{ matrix.device }} 65 | name: ${{matrix.maker}} ${{matrix.device}} ${{matrix.blas}}${{matrix.bla_vendor}}${{matrix.blas_libraries}} ${{matrix.blas_int}} 66 | 67 | # See variants.yml for use of some of these variables. 
68 | env: 69 | maker: ${{matrix.maker}} 70 | device: ${{matrix.device}} 71 | check: ${{matrix.check}} 72 | blas: ${{matrix.blas}} 73 | blas_int: ${{matrix.blas_int}} 74 | bla_vendor: ${{matrix.bla_vendor}} 75 | BLAS_LIBRARIES: ${{matrix.blas_libraries}} 76 | 77 | steps: 78 | - uses: actions/checkout@v4 79 | 80 | - name: Configure 81 | run: .github/workflows/configure.sh 82 | 83 | - name: Build 84 | run: .github/workflows/build.sh 85 | 86 | - name: Test 87 | run: .github/workflows/test.sh 88 | -------------------------------------------------------------------------------- /.github/workflows/test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -x 2 | 3 | mydir=$(dirname $0) 4 | source ${mydir}/setup_env.sh 5 | 6 | # Instead of exiting on the first failed test (bash -e), 7 | # run all the tests and accumulate failures into $err. 8 | err=0 9 | 10 | export OMP_NUM_THREADS=8 11 | 12 | print "======================================== Tests" 13 | cd test 14 | 15 | args="--quick" 16 | if [ "${device}" = "gpu_intel" ]; then 17 | # Our Intel GPU supports only single precision. 18 | args+=" --type s,c" 19 | fi 20 | 21 | if [[ $check = "sanity" ]]; then 22 | echo "Running only sanity checks" 23 | ./run_tests.py ${args} herk dev-herk 24 | (( err += $? )) 25 | else 26 | ./run_tests.py ${args} --blas1 --blas2 --blas3 27 | (( err += $? )) 28 | 29 | ./run_tests.py ${args} --batch-blas3 30 | (( err += $? )) 31 | 32 | # CUDA, HIP, or SYCL. These fail gracefully when GPUs are absent. 33 | ./run_tests.py ${args} --blas1-device --blas2-device --blas3-device 34 | (( err += $? )) 35 | 36 | ./run_tests.py ${args} --batch-blas3-device 37 | (( err += $? )) 38 | fi 39 | 40 | print "======================================== Smoke tests" 41 | cd ${top}/examples 42 | 43 | # Makefile or CMakeLists.txt picks up ${test_args}. 44 | if [ "${device}" = "gpu_intel" ]; then 45 | # Our Intel GPU supports only single precision. 
46 | export test_args="s c" 47 | else 48 | export test_args="s d c z" 49 | fi 50 | 51 | if [ "${maker}" = "make" ]; then 52 | export PKG_CONFIG_PATH+=:${top}/install/lib/pkgconfig 53 | make clean || exit 20 54 | 55 | elif [ "${maker}" = "cmake" ]; then 56 | rm -rf build && mkdir build && cd build 57 | cmake "-DCMAKE_PREFIX_PATH=${top}/install" .. || exit 30 58 | fi 59 | 60 | # ARGS=-V causes CTest to print output. Makefile doesn't use it. 61 | make -j8 || exit 40 62 | make test ARGS=-V 63 | (( err += $? )) 64 | 65 | print "======================================== Finished test" 66 | exit ${err} 67 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.bak 2 | *.d 3 | *.mod 4 | *.o 5 | *.pyc 6 | *.svg 7 | .DS_Store 8 | .id 9 | __pycache__ 10 | blaspp-* 11 | build* 12 | config/blas 13 | config/blis_version 14 | config/cblas 15 | config/compiler_cxx 16 | config/cublas 17 | config/essl_version 18 | config/hello 19 | config/lapack_potrf 20 | config/lapack_pstrf 21 | config/log.txt 22 | config/mkl_version 23 | config/onemkl 24 | config/openblas_version 25 | config/openmp 26 | config/return_complex 27 | config/return_complex_argument 28 | config/return_float 29 | config/return_float_f2c 30 | config/rocblas 31 | docs/doxygen/errors.txt 32 | docs/html/ 33 | files.txt 34 | include/blas/defines.h 35 | issues/ 36 | lib/*.a 37 | lib/*.so 38 | lib/pkgconfig/*.pc 39 | make.inc 40 | test/tester 41 | wiki/ 42 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | 2025.05.28 (ABI 2.0.0) 2 | - Added Level 1 BLAS GPU wrappers 3 | - Added is_complex_v 4 | - Added support for BLIS and libFLAME (hence AOCL) 5 | - Removed support for ACML 6 | - Removed deprecated enum, memcpy routines 7 | - Moved [cz]symv and [cz]syr to BLAS++ from 
LAPACK++, since they 8 | logically fit in BLAS. Requires linking with an LAPACK library 9 | - Tester prints stats with --repeat 10 | - Fixed SYCL include path 11 | - Fixed testers for n = 0 case 12 | - Fixed template gemm for beta = 0 to not propagate NaN/Inf (strong zero) 13 | 14 | 2024.10.26 (ABI 1.0.0) 15 | - Added PAPI SDE that counts flops 16 | - Use to_blas_int to convert int32 to int64 17 | 18 | 2024.05.31 (ABI 1.0.0) 19 | - Added shared library ABI version 20 | - Updated enum parameters to have `to_string`, `from_string`; 21 | deprecate `2str`, `str2` 22 | - Removed some deprecated functions. Deprecated MemcpyKind 23 | - Added PAPI SDE counters (manually enable for now) 24 | - Support ROCm 5.6.0 (rocBLAS 3.0), trmm with 3 matrices (A, B, C) 25 | - Fixed bug in her2k with complex alpha and row-major matrix 26 | - Fixed bug in example_gemm 27 | - Use `sqrt`, etc. without `std::` to enable argument dependent lookup (ADL) 28 | 29 | 2023.11.05 30 | - Fix Queue workspace 31 | - Update Fortran strlen handling 32 | - Fix CMake unity build 33 | - Fix CMake library ordering 34 | 35 | 2023.08.25 36 | - Use yyyy.mm.dd version scheme, instead of yyyy.mm.release 37 | - Added oneAPI support to CMake 38 | - Fixed int64 support 39 | - More robust Makefile configure doesn't require CUDA or ROCm to be in 40 | compiler search paths (CPATH, LIBRARY_PATH, etc.) 41 | 42 | 2023.06.00 43 | - Revised Queue class to allow creating Queue from an existing 44 | CUDA/HIP stream, cuBLAS/rocBLAS handle, or SYCL queue. Also 45 | allocates streams and workspace on demand, to make Queue creation 46 | much lighter weight. 
47 | - Improved oneAPI support 48 | 49 | 2023.01.00 50 | - Added oneAPI port (currently Makefile only) 51 | - Added queue argument to `device_malloc, device_free`, etc.; 52 | deprecated old versions 53 | - Deprecated `set_device, get_device` 54 | - Renamed `device_malloc_pinned` to `host_malloc_pinned` 55 | - Added `device_copy_{matrix,vector}`; 56 | deprecated `device_{set,get}{matrix,vector}` 57 | - Added more Level 1 BLAS on GPU device: axpy, dot, nrm2 58 | - Moved main repo to https://github.com/icl-utk-edu/blaspp/ 59 | - Refactored routines for better maintainability 60 | - Use python3 61 | 62 | 2022.07.00 63 | - Added workspace in queue; used in LAPACK++ 64 | - Set device in memcpy, etc. 65 | - Updated Schur gemm test with tile layout 66 | 67 | 2022.05.00 68 | - Added Level 3 BLAS template implementations 69 | - Added device copy, scal 70 | - Added Schur gemm test, batched tile and LAPACK formats 71 | - Fixed gbmm flops when rectangular 72 | - Fixed CMake when BLAS_LIBRARIES is empty 73 | 74 | 2021.04.01 75 | - Fixed bug in `test_trsm_device` for row-major 76 | 77 | 2021.04.00 78 | - Added HIP/ROCm support 79 | - Added include/blas/defines.h based on configuration 80 | - Various bug and CMake fixes 81 | 82 | 2020.10.02 83 | - CMake support for including as subdirectory 84 | 85 | 2020.10.01 86 | - Fixes: CMake always checks for CBLAS, in case LAPACK++ needs it 87 | 88 | 2020.10.00 89 | - Fixes: CMake defines, version, ILP64; remove [cz]symv prototypes 90 | - Add `make check` 91 | 92 | 2020.09.00 93 | - Clean up namespace 94 | - Makefile and CMake improvements 95 | 96 | 2020.08.00 97 | - Initial release. 
Functionality: 98 | - Level 1, 2, 3 BLAS for CPU 99 | - Level 3 BLAS for GPU 100 | - Level 3 batched BLAS for CPU and GPU 101 | - cuBLAS GPU implementation 102 | - Makefile and CMake build options 103 | -------------------------------------------------------------------------------- /GNUmakefile.subdir: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-2023, University of Tennessee. All rights reserved. 2 | # SPDX-License-Identifier: BSD-3-Clause 3 | # This program is free software: you can redistribute it and/or modify it under 4 | # the terms of the BSD 3-Clause license. See the accompanying LICENSE file. 5 | 6 | # Subdirectories include this makefile to forward rules to the top level makefile. 7 | # Define ${top} for where the top level is. 8 | # Example: src/GNUmakefile: 9 | # top = .. 10 | # include ${top}/GNUmakefile.subdir 11 | 12 | .SUFFIXES: 13 | 14 | pwd = ${shell pwd} 15 | abs_top = ${abspath ${top}}/ 16 | abs_pwd = ${abspath ${pwd}} 17 | cdir = ${subst ${abs_top},,${abs_pwd}} 18 | 19 | # ------------------------------------------------------------------------------ 20 | ifneq (${MAKECMDGOALS},) 21 | 22 | # If arguments are given, presumably files like test.o, forward them to top 23 | # with cdir prefix. 24 | # All files are forwarded as one rule, based on first; rest are quietly ignored. 25 | goals := ${filter-out echo ${DONT_FORWARD}, ${MAKECMDGOALS}} 26 | forward := ${addprefix ${cdir}/, ${goals}} 27 | first := ${firstword ${goals}} 28 | rest := ${wordlist 2, ${words ${goals}}, ${goals}} 29 | 30 | ${first}: force 31 | cd ${top} && ${MAKE} ${forward} 32 | 33 | ${rest}: force 34 | @echo > /dev/null 35 | 36 | # ------------------------------------------------------------------------------ 37 | else 38 | 39 | # Otherwise, forward subdirectory name as target. 
40 | .PHONY: ${cdir} 41 | 42 | ${cdir}: 43 | cd ${top} && ${MAKE} $@ 44 | 45 | endif 46 | # ------------------------------------------------------------------------------ 47 | 48 | force: ; 49 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2017-2023, University of Tennessee. All rights reserved. 2 | 3 | Redistribution and use in source and binary forms, with or without 4 | modification, are permitted provided that the following conditions are met: 5 | * Redistributions of source code must retain the above copyright 6 | notice, this list of conditions and the following disclaimer. 7 | * Redistributions in binary form must reproduce the above copyright 8 | notice, this list of conditions and the following disclaimer in the 9 | documentation and/or other materials provided with the distribution. 10 | * Neither the name of the University of Tennessee nor the 11 | names of its contributors may be used to endorse or promote products 12 | derived from this software without specific prior written permission. 13 | 14 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 15 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 16 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 17 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE 18 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 19 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 20 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 21 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR 22 | TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 23 | THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
24 | -------------------------------------------------------------------------------- /blasppConfig.cmake.in: -------------------------------------------------------------------------------- 1 | cmake_minimum_required( VERSION 3.15 ) 2 | 3 | set( blaspp_use_openmp "@blaspp_use_openmp@" ) 4 | set( blaspp_use_cuda "@blaspp_use_cuda@" ) 5 | set( blaspp_use_hip "@blaspp_use_hip@" ) 6 | set( blaspp_use_sycl "@blaspp_use_sycl@" ) 7 | set( blaspp_int "@blaspp_int@" ) 8 | 9 | include( CMakeFindDependencyMacro ) 10 | if (blaspp_use_openmp) 11 | find_dependency( OpenMP ) 12 | endif() 13 | 14 | if (blaspp_use_cuda) 15 | find_dependency( CUDAToolkit ) 16 | endif() 17 | 18 | if (blaspp_use_hip) 19 | find_dependency( rocblas ) 20 | endif() 21 | 22 | # Export private variables used in LAPACK++. 23 | set( blaspp_defines "@blaspp_defines@" ) 24 | set( blaspp_libraries "@blaspp_libraries@" ) 25 | 26 | set( blaspp_cblas_found "@blaspp_cblas_found@" ) 27 | set( blaspp_cblas_include "@blaspp_cblas_include@" ) 28 | set( blaspp_cblas_libraries "@blaspp_cblas_libraries@" ) 29 | 30 | include( "${CMAKE_CURRENT_LIST_DIR}/blasppTargets.cmake" ) 31 | -------------------------------------------------------------------------------- /cmake/CBLASConfig.cmake: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-2023, University of Tennessee. All rights reserved. 2 | # SPDX-License-Identifier: BSD-3-Clause 3 | # This program is free software: you can redistribute it and/or modify it under 4 | # the terms of the BSD 3-Clause license. See the accompanying LICENSE file. 5 | 6 | include( "cmake/util.cmake" ) 7 | 8 | # Check if this file has already been run with these settings (see bottom). 
9 | set( run_ true ) 10 | if (DEFINED cblas_config_cache 11 | AND "${cblas_config_cache}" STREQUAL "${BLAS_LIBRARIES}") 12 | 13 | message( DEBUG "CBLAS config already done for '${BLAS_LIBRARIES}'" ) 14 | set( run_ false ) 15 | endif() 16 | 17 | #=============================================================================== 18 | # Matching endif at bottom. 19 | if (run_) 20 | 21 | #---------------------------------------- 22 | # Apple puts cblas.h in weird places. If we can't find it, 23 | # use Accelerate/Accelerate.h, but that had issues compiling with g++. 24 | if ("${blaspp_defs_}" MATCHES "HAVE_ACCELERATE") 25 | set( dir_list 26 | "/System/Library/Frameworks/Accelerate.framework/Frameworks/vecLib.framework/Headers" 27 | "/Applications/Xcode.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk/System/Library/Frameworks/Accelerate.framework/Versions/A/Frameworks/vecLib.framework/Headers" 28 | ) 29 | foreach (dir IN LISTS dir_list) 30 | if (EXISTS "${dir}/cblas.h") 31 | set( blaspp_cblas_include "${dir}" ) 32 | list( APPEND blaspp_defs_ "-DBLAS_HAVE_ACCELERATE_CBLAS_H" ) 33 | break() 34 | endif() 35 | endforeach() 36 | endif() 37 | 38 | #------------------------------------------------------------------------------- 39 | set( lib_list ";-lcblas" ) 40 | message( DEBUG "lib_list ${lib_list}" ) 41 | 42 | foreach (lib IN LISTS lib_list) 43 | message( STATUS "Checking for CBLAS library ${lib}" ) 44 | 45 | try_run( 46 | run_result compile_result ${CMAKE_CURRENT_BINARY_DIR} 47 | SOURCES 48 | "${CMAKE_CURRENT_SOURCE_DIR}/config/cblas.cc" 49 | LINK_LIBRARIES 50 | ${lib} ${BLAS_LIBRARIES} ${openmp_lib} # not "..." quoted; screws up OpenMP 51 | COMPILE_DEFINITIONS 52 | ${blaspp_defs_} 53 | CMAKE_FLAGS 54 | "-DINCLUDE_DIRECTORIES=${blaspp_cblas_include}" 55 | COMPILE_OUTPUT_VARIABLE 56 | compile_output 57 | RUN_OUTPUT_VARIABLE 58 | run_output 59 | ) 60 | # For cross-compiling, assume if it links, the run is okay. 
61 | if (CMAKE_CROSSCOMPILING AND compile_result) 62 | message( DEBUG "cross: cblas" ) 63 | set( run_result "0" CACHE STRING "" FORCE ) 64 | set( run_output "ok" CACHE STRING "" FORCE ) 65 | endif() 66 | debug_try_run( "cblas.cc" "${compile_result}" "${compile_output}" 67 | "${run_result}" "${run_output}" ) 68 | 69 | if (compile_result AND "${run_output}" MATCHES "ok") 70 | list( APPEND blaspp_defs_ "-DBLAS_HAVE_CBLAS" ) 71 | set( blaspp_cblas_libraries "${lib}" CACHE INTERNAL "" ) 72 | set( blaspp_cblas_found true CACHE INTERNAL "" ) 73 | break() 74 | endif() 75 | endforeach() 76 | 77 | endif() # run_ 78 | #=============================================================================== 79 | 80 | # Mark as already run (see top). 81 | set( cblas_config_cache "${BLAS_LIBRARIES}" CACHE INTERNAL "" ) 82 | 83 | #------------------------------------------------------------------------------- 84 | if (blaspp_cblas_found) 85 | message( "${blue} Found CBLAS library ${blaspp_cblas_libraries}${plain}" ) 86 | else() 87 | message( "${red} CBLAS library not found. Tester cannot be built.${plain}" ) 88 | endif() 89 | 90 | message( DEBUG " 91 | blaspp_cblas_found = '${blaspp_cblas_found}' 92 | blaspp_cblas_libraries = '${blaspp_cblas_libraries}' 93 | blaspp_defs_ = '${blaspp_defs_}' 94 | ") 95 | -------------------------------------------------------------------------------- /cmake/config.cmake: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-2024, University of Tennessee. All rights reserved. 2 | # SPDX-License-Identifier: BSD-3-Clause 3 | # This program is free software: you can redistribute it and/or modify it under 4 | # the terms of the BSD 3-Clause license. See the accompanying LICENSE file. 5 | 6 | #------------------------------------------------------------------------------- 7 | # Tests whether using `std::atomic` requires linking with `-latomic` 8 | # for 64-bit values, which is the case on some 32-bit systems. 
9 | # Sets variable `libatomic_required` in the caller's scope (true or false). 10 | # 11 | function( check_libatomic ) 12 | message( STATUS "Checking whether std::atomic requires libatomic" ) 13 | set( libatomic_required false PARENT_SCOPE ) # PARENT_SCOPE: a plain set() would be local to this function 14 | 15 | try_compile( 16 | link_result ${CMAKE_CURRENT_BINARY_DIR} 17 | SOURCES 18 | "${CMAKE_CURRENT_SOURCE_DIR}/config/std_atomic.cc" 19 | OUTPUT_VARIABLE 20 | link_output 21 | ) 22 | debug_try_compile( "std_atomic.cc" "${link_result}" "${link_output}" ) 23 | 24 | set( label " std::atomic links without -latomic" ) 25 | pad_string( "${label}" 50 label ) 26 | if (link_result) 27 | message( "${label} ${blue} yes${plain}" ) 28 | else() 29 | message( "${label} ${red} no${plain}" ) 30 | 31 | try_compile( 32 | link_result ${CMAKE_CURRENT_BINARY_DIR} 33 | SOURCES 34 | "${CMAKE_CURRENT_SOURCE_DIR}/config/std_atomic.cc" 35 | LINK_LIBRARIES 36 | "-latomic" 37 | OUTPUT_VARIABLE 38 | link_output 39 | ) 40 | debug_try_compile( "std_atomic.cc" "${link_result}" "${link_output}" ) 41 | 42 | set( label " std::atomic requires -latomic" ) 43 | pad_string( "${label}" 50 label ) 44 | if (link_result) 45 | #target_link_libraries( ${tgt} PUBLIC "-latomic" ) 46 | message( "${label} ${blue} yes${plain}" ) 47 | set( libatomic_required true PARENT_SCOPE ) # report result to the caller 48 | else() 49 | message( "${label} ${red} failed; cannot compile libatomic test${plain}" ) 50 | endif() 51 | endif() 52 | endfunction() 53 | -------------------------------------------------------------------------------- /cmake/util.cmake: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-2023, University of Tennessee. All rights reserved. 2 | # SPDX-License-Identifier: BSD-3-Clause 3 | # This program is free software: you can redistribute it and/or modify it under 4 | # the terms of the BSD 3-Clause license. See the accompanying LICENSE file. 
5 | 6 | if (color) 7 | string( ASCII 27 Esc ) 8 | set( ansi_reset "${Esc}[0m" ) 9 | set( bold "${Esc}[1m" ) 10 | set( not_bold "${Esc}[22m" ) # "normal" 11 | set( italic "${Esc}[3m" ) 12 | set( not_italic "${Esc}[23m" ) 13 | 14 | set( black "${Esc}[30m" ) 15 | set( red "${Esc}[31m" ) 16 | set( green "${Esc}[32m" ) 17 | set( yellow "${Esc}[33m" ) 18 | set( blue "${Esc}[34m" ) 19 | set( magenta "${Esc}[35m" ) 20 | set( cyan "${Esc}[36m" ) 21 | set( gray "${Esc}[37m" ) 22 | set( default_color "${Esc}[39m" ) 23 | set( plain "${Esc}[39m" ) 24 | endif() 25 | 26 | #------------------------------------------------------------------------------- 27 | # pad_string( input length output_variable ) 28 | # Adds spaces to input up to length and saves to output_variable. 29 | # 30 | function( pad_string input length output_variable ) 31 | string( LENGTH "${input}" len ) 32 | math( EXPR pad_len "${length} - ${len}" ) 33 | if (pad_len LESS 0) 34 | set( pad_len 0 ) 35 | endif() 36 | string( REPEAT " " ${pad_len} pad ) 37 | set( ${output_variable} "${input}${pad}" PARENT_SCOPE ) 38 | endfunction() 39 | 40 | #------------------------------------------------------------------------------- 41 | # debug_try_compile( msg compile_result compile_output ) 42 | # Prints compile_result at log level DEBUG (5); 43 | # compile_output at log level TRACE (6). 44 | # 45 | function( debug_try_compile msg compile_result compile_output ) 46 | message( DEBUG "${msg}: compile_result '${compile_result}'" ) 47 | message( TRACE "compile_output: <<<\n${compile_output}>>>" ) 48 | endfunction() 49 | 50 | #------------------------------------------------------------------------------- 51 | # debug_try_run( msg compile_result compile_output run_result run_output ) 52 | # Prints {compile,run}_result at log level DEBUG (5); 53 | # {compile,run}_output at log level TRACE (6). 
#
function( debug_try_run msg compile_result compile_output run_result run_output )
    message( DEBUG "${msg}: compile_result '${compile_result}', run_result '${run_result}'" )
    message( TRACE "compile_output: '''\n${compile_output}'''" )
    message( TRACE "run_output: '''\n${run_output}'''" )
endfunction()

#-------------------------------------------------------------------------------
# assert( condition )
# Aborts if condition is not true. Condition is evaluated inside an `if`,
# so it can have boolean operators like EQUAL:
#     assert( x EQUAL 2 )
# On failure, the message shows the condition as it was written.
#
macro( assert )
    if (NOT (${ARGN}))
        # The macro has no named parameter, so report the condition from ARGN;
        # ARGN expands to a ;-separated list, which is joined with spaces here.
        # This is a macro, so assert_condition_ is set in the caller's scope;
        # that is harmless because FATAL_ERROR stops processing right after.
        string( REPLACE ";" " " assert_condition_ "${ARGN}" )
        message( FATAL_ERROR "\n${red}Assertion failed: ${assert_condition_}${default_color}\n" )
    endif()
endmacro()

#-------------------------------------------------------------------------------
# match( regex str output )
# If str matches regular expression in regex,
# sets output to true, else sets it to false.
#
# Contrast this with: string( REGEX MATCH regex output str ),
# which sets output to the match string itself, which could be false, e.g.,
#     string( REGEX MATCH "(yes|no)" output "no" )
# sets output = 'no' (interpreted as false in CMake), rather than true.
#
# The order of arguments here matches string( COMPARE EQUAL str1 str2 output ),
# rather than string( REGEX MATCH regex output str ).
#
# Stores the string "true" or "false" in `output` in the caller's scope.
function( match regex str output )
    if ("${str}" MATCHES "${regex}")
        set( ${output} "true" PARENT_SCOPE )
    else()
        set( ${output} "false" PARENT_SCOPE )
    endif()
endfunction()

--------------------------------------------------------------------------------
/config/__init__.py:
--------------------------------------------------------------------------------
from .config import *

--------------------------------------------------------------------------------
/config/blas.cc:
--------------------------------------------------------------------------------
// Copyright (c) 2017-2023, University of Tennessee. All rights reserved.
// SPDX-License-Identifier: BSD-3-Clause
// This program is free software: you can redistribute it and/or modify it under
// the terms of the BSD 3-Clause license. See the accompanying LICENSE file.

// NOTE(review): the operands of the bare #include lines are missing in this copy.
#include
#include

#include "config.h"

//------------------------------------------------------------------------------
#define BLAS_ddot FORTRAN_NAME( ddot, DDOT )

// result returned directly
#ifdef __cplusplus
extern "C"
#endif
double BLAS_ddot(
    const blas_int* n,
    const double* x, const blas_int* incx,
    const double* y, const blas_int* incy );

//------------------------------------------------------------------------------
// Calls Fortran ddot on two 5-element vectors, prints the inputs and result,
// then prints "ok" and returns 0 if the result equals 35, else prints "failed"
// and returns 1.
int main()
{
    // If blas_int is 32-bit, but BLAS actually interprets it as 64-bit,
    // BLAS will see n = 0x500000005 and segfault.
    // If blas_int is 64-bit, BLAS can interpret it as 32-bit or 64-bit
    // to see n = 5 and pass.
    blas_int n[] = { 5, 5 }, ione = 1;
    double x[] = { 1, 2, 3, 4, 5 };
    double y[] = { 5, 4, 3, 2, 1 };
    for (int i = 0; i < n[0]; ++i) {
        printf( "x[ %d ] = %.1f; y[ %d ] = %.1f\n",
                i, x[ i ],
                i, y[ i ] );
    }

    double result = BLAS_ddot( n, x, &ione, y, &ione );
    printf( "result = %.1f; should be 35.0\n", result );

    bool okay = (result == 35);
    printf( "%s\n", okay ? "ok" : "failed" );
    return ! okay;
}

--------------------------------------------------------------------------------
/config/blis_version.cc:
--------------------------------------------------------------------------------
// Copyright (c) 2017-2024, University of Tennessee. All rights reserved.
// SPDX-License-Identifier: BSD-3-Clause
// This program is free software: you can redistribute it and/or modify it under
// the terms of the BSD 3-Clause license. See the accompanying LICENSE file.

// NOTE(review): the operands of the bare #include lines are missing in this copy.
#include
#include

// Prints "BLIS_VERSION=<string>" using bli_info_get_version_str().
int main()
{
    const char* v = bli_info_get_version_str();
    printf( "BLIS_VERSION=%s\n", v );
    return 0;
}

--------------------------------------------------------------------------------
/config/cblas.cc:
--------------------------------------------------------------------------------
// Copyright (c) 2017-2023, University of Tennessee. All rights reserved.
// SPDX-License-Identifier: BSD-3-Clause
// This program is free software: you can redistribute it and/or modify it under
// the terms of the BSD 3-Clause license. See the accompanying LICENSE file.

// NOTE(review): the operands of the bare #include lines are missing in this copy.
#include

// Select the CBLAS header by vendor macro; when BLAS_ILP64 is set, the
// vendor's own ILP64 macro is defined before its header is included.
#if defined(BLAS_HAVE_MKL)
    #if defined(BLAS_ILP64) && ! defined(MKL_ILP64)
        #define MKL_ILP64
    #endif
    #include

#elif defined(BLAS_HAVE_ESSL)
    #if defined(BLAS_ILP64) && ! defined(_ESV6464)
        #define _ESV6464
    #endif
    #include

#elif defined(BLAS_HAVE_ACCELERATE)
    // On macOS, the official way to include cblas is via Accelerate.h.
    // Unfortunately with Xcode 10.3 and GNU g++ 9.3, that doesn't compile.
    // If we can find cblas.h, use it, otherwise use Accelerate.h.
    #ifdef BLAS_HAVE_ACCELERATE_CBLAS_H
        #include
    #else
        #include
    #endif
#else
    #ifdef __cplusplus
        // Some ancient cblas.h don't include extern C. It's okay to nest.
        extern "C" {
            #include
        }
    #else
        #include
    #endif
#endif

//------------------------------------------------------------------------------
// Calls cblas_ddot on two 5-element vectors, prints the inputs and result,
// then prints "ok" and returns 0 if the result equals 35, else prints "failed"
// and returns 1.
int main()
{
    int n = 5;
    double x[] = { 1, 2, 3, 4, 5 };
    double y[] = { 5, 4, 3, 2, 1 };
    for (int i = 0; i < n; ++i) {
        printf( "x[ %d ] = %.1f; y[ %d ] = %.1f\n",
                i, x[ i ],
                i, y[ i ] );
    }

    double result = cblas_ddot( n, x, 1, y, 1 );
    printf( "result = %.1f; should be 35.0\n", result );

    bool okay = (result == 35);
    printf( "%s\n", okay ? "ok" : "failed" );
    return ! okay;
}

--------------------------------------------------------------------------------
/config/compiler_cxx.cc:
--------------------------------------------------------------------------------
// Copyright (c) 2017-2023, University of Tennessee. All rights reserved.
// SPDX-License-Identifier: BSD-3-Clause
// This program is free software: you can redistribute it and/or modify it under
// the terms of the BSD 3-Clause license. See the accompanying LICENSE file.

// NOTE(review): the operands of the bare #include lines are missing in this copy.
#ifdef __cplusplus
    #include
#else
    #include
#endif

// Prints one line naming the compiler, chosen at preprocessing time from
// predefined macros. Builds as either C or C++; returns 0.
int main()
{
    // The first matching branch wins, so the order of the tests matters:
    // xlc must come before clang
    // clang and icc must come before gcc
    // icpx and icx must come before clang
    const char* compiler =
    #ifdef __cplusplus
        // IBM's documentation says __IBMCPP__,
        // but xlc -qshowmacros shows __ibmxl_version__.
        #if defined(__IBMCPP__) || defined(__ibmxl_version__)
            "xlc++";
        #elif defined(_CRAYC)
            "cray";
        #elif defined(__ICC)
            "icpc";
        #elif defined(__INTEL_LLVM_COMPILER)
            "icpx";
        #elif defined(_MSC_VER)
            "MSC";
        #elif defined(__clang__)
            "clang++";
        #elif defined(__GNUG__)
            "g++";
        #else
            "unknown C++";
        #endif
    #else
        #if defined(__IBMC__) || defined(__ibmxl_version__)
            "xlc";
        #elif defined(_CRAYC)
            "cray";
        #elif defined(__ICC)
            "icc";
        #elif defined(__INTEL_LLVM_COMPILER)
            "icx";
        #elif defined(_MSC_VER)
            "MSC";
        #elif defined(__clang__)
            "clang";
        #elif defined(__GNUC__)
            "gcc";
        #else
            "unknown C";
        #endif
    #endif

    #ifdef __cplusplus
        std::cout << compiler << "\n";
    #else
        printf( "%s\n", compiler );
    #endif
    return 0;
}

--------------------------------------------------------------------------------
/config/config.h:
--------------------------------------------------------------------------------
// Copyright (c) 2017-2023, University of Tennessee. All rights reserved.
// SPDX-License-Identifier: BSD-3-Clause
// This program is free software: you can redistribute it and/or modify it under
// the terms of the BSD 3-Clause license. See the accompanying LICENSE file.

#ifndef CONFIG_H
#define CONFIG_H

// NOTE(review): the operand of the bare #include line is missing in this copy.
#include

//------------------------------------------------------------------------------
// FORTRAN_NAME( lower, UPPER ) picks the spelling of a Fortran symbol:
// UPPER if FORTRAN_UPPER is defined, lower if FORTRAN_LOWER is defined,
// otherwise lower with a trailing underscore.
#if defined(FORTRAN_UPPER)
    #define FORTRAN_NAME( lower, UPPER ) UPPER
#elif defined(FORTRAN_LOWER)
    #define FORTRAN_NAME( lower, UPPER ) lower
#else
    // default is ADD_
    #define FORTRAN_NAME( lower, UPPER ) lower ## _
#endif

//------------------------------------------------------------------------------
// Integer type passed to BLAS/LAPACK: int64_t if either ILP64 macro is
// defined, plain int otherwise. Both typedefs always have the same width.
#if defined(BLAS_ILP64) || defined(LAPACK_ILP64)
    typedef int64_t blas_int;
    typedef int64_t lapack_int;
#else
    typedef int blas_int;
    typedef int lapack_int;
#endif

//------------------------------------------------------------------------------
// Both macros are defined (empty) here if not already defined, so any
// `#ifdef` test on them after this header is included is true.
#ifndef BLAS_FORTRAN_STRLEN_END
#define BLAS_FORTRAN_STRLEN_END
#endif

#ifndef LAPACK_FORTRAN_STRLEN_END
#define LAPACK_FORTRAN_STRLEN_END
#endif

#endif // CONFIG_H

--------------------------------------------------------------------------------
/config/cublas.cc:
--------------------------------------------------------------------------------
// Copyright (c) 2017-2023, University of Tennessee. All rights reserved.
// SPDX-License-Identifier: BSD-3-Clause
// This program is free software: you can redistribute it and/or modify it under
// the terms of the BSD 3-Clause license. See the accompanying LICENSE file.

// NOTE(review): the operands of the bare #include lines are missing in this copy.
#include
#include

#include
#include
#include

//------------------------------------------------------------------------------
// If err is not cudaSuccess, prints the error with its location and exits
// with status 1.
void error_check_( cudaError_t err, const char* file, int line )
{
    if (err != cudaSuccess) {
        printf( "CUDA error %d: %s at %s:%d\n",
                err, cudaGetErrorString(err), file, line );
        exit(1);
    }
}

//------------------------------------------------------------------------------
// If err is not CUBLAS_STATUS_SUCCESS, prints the error code with its location
// and exits with status 1.
void error_check_( cublasStatus_t err, const char* file, int line )
{
    if (err != CUBLAS_STATUS_SUCCESS) {
        printf( "cuBLAS error %d at %s:%d\n",
                err, file, line );
        exit(1);
    }
}

// Wraps a CUDA or cuBLAS call, passing the call site's file and line.
#define error_check( err ) \
    error_check_( (err), __FILE__, __LINE__ )

//------------------------------------------------------------------------------
// Runs a 2x2 cublasDgemm (C = alpha A B + beta C) on device 0 and compares the
// result element-wise with the precomputed D. Prints "ok" and returns 0 on an
// exact match, else prints "failed" and returns 1. If cudaSetDevice fails, the
// test is skipped and 0 is returned.
int main()
{
    double alpha = 2, beta = 3;
    int n = 2;
    double A[] = { 1, 2, 3, 4 };
    double B[] = { 5, 4, 3, 2 };
    double C[] = { 2, 3, 1, 0 };
    double D[] = { 40, 61, 21, 28 };

    cudaError_t err = cudaSetDevice( 0 );
    if (err != cudaSuccess) {
        printf( "cudaSetDevice failed: %s (%d).\n"
                "Cannot run on GPU; skipping test.\n",
                cudaGetErrorString(err), err );
        return 0;
    }

    double *dA, *dB, *dC;
    error_check(
        cudaMalloc( &dA, n*n*sizeof(double) ) );
    error_check(
        cudaMalloc( &dB, n*n*sizeof(double) ) );
    error_check(
        cudaMalloc( &dC, n*n*sizeof(double) ) );
    assert( dA != nullptr );
    assert( dB != nullptr );
    assert( dC != nullptr );

    // dA = A, dB = B, dC = C
    error_check(
        cudaMemcpy( dA, A, n*n*sizeof(double), cudaMemcpyDefault ) );
    error_check(
        cudaMemcpy( dB, B, n*n*sizeof(double), cudaMemcpyDefault ) );
    error_check(
        cudaMemcpy( dC, C, n*n*sizeof(double), cudaMemcpyDefault ) );

    // C = alpha A B + beta C
    cublasHandle_t handle;
    error_check(
        cublasCreate( &handle ) );
    error_check(
        cublasDgemm( handle, CUBLAS_OP_N, CUBLAS_OP_N, n, n, n,
                     &alpha, dA, n, dB, n, &beta, dC, n ) );
    error_check(
        cublasDestroy( handle ) );

    // C = dC
    error_check(
        cudaMemcpy( C, dC, n*n*sizeof(double), cudaMemcpyDefault ) );

    error_check(
        cudaFree( dA ) );
    error_check(
        cudaFree( dB ) );
    error_check(
        cudaFree( dC ) );

    // verify C == D
    double result = 0;
    for (int i = 0; i < n*n; ++i) {
        printf( "C[%d] = %.2f, D = %.2f\n", i, C[i], D[i] );
        result += std::abs( D[i] - C[i] );
    }
    bool okay = (result == 0);
    printf( "%s\n", okay ? "ok" : "failed" );
    return ! okay;
}

--------------------------------------------------------------------------------
/config/essl_version.cc:
--------------------------------------------------------------------------------
// Copyright (c) 2017-2023, University of Tennessee. All rights reserved.
// SPDX-License-Identifier: BSD-3-Clause
// This program is free software: you can redistribute it and/or modify it under
// the terms of the BSD 3-Clause license. See the accompanying LICENSE file.

// NOTE(review): the operands of the bare #include lines are missing in this copy.
#include
#include

// Prints "ESSL_VERSION=v.r.m.p", splitting the integer returned by iessl()
// into four decimal fields of widths (rest).2.2.2.
int main()
{
    int v = iessl();
    int version      = int( v / 1000000 );
    int release      = int( (v % 1000000) / 10000 );
    int modification = int( (v % 10000) / 100 );
    int ptf          = v % 100;

    printf( "ESSL_VERSION=%d.%d.%d.%d\n",
            version, release, modification, ptf );
    return 0;
}

--------------------------------------------------------------------------------
/config/hello.cc:
--------------------------------------------------------------------------------
// Copyright (c) 2017-2023, University of Tennessee. All rights reserved.
// SPDX-License-Identifier: BSD-3-Clause
// This program is free software: you can redistribute it and/or modify it under
// the terms of the BSD 3-Clause license. See the accompanying LICENSE file.

// NOTE(review): the operand of the bare #include line is missing in this copy.
#include

// Minimal program: prints "ok" and returns 0.
int main()
{
    printf( "ok\n" );
    return 0;
}

--------------------------------------------------------------------------------
/config/lapack_potrf.cc:
--------------------------------------------------------------------------------
// Copyright (c) 2017-2023, University of Tennessee. All rights reserved.
// SPDX-License-Identifier: BSD-3-Clause
// This program is free software: you can redistribute it and/or modify it under
// the terms of the BSD 3-Clause license. See the accompanying LICENSE file.

// NOTE(review): the operands of the bare #include lines are missing in this copy.
#include
#include

#include "config.h"

#define LAPACK_dpotrf_base FORTRAN_NAME( dpotrf, DPOTRF )

// Fortran dpotrf; when LAPACK_FORTRAN_STRLEN_END is defined, the prototype
// takes a trailing length argument for the `uplo` string.
#ifdef __cplusplus
extern "C"
#endif
void LAPACK_dpotrf_base(
    const char* uplo, const lapack_int* n,
    double* A, const lapack_int* lda,
    lapack_int* info
    #ifdef LAPACK_FORTRAN_STRLEN_END
    , size_t uplo_len
    #endif
    );

// Call wrapper that appends the string length (1) when the prototype has it.
#ifdef LAPACK_FORTRAN_STRLEN_END
    #define LAPACK_dpotrf( ... ) LAPACK_dpotrf_base( __VA_ARGS__, 1 )
#else
    #define LAPACK_dpotrf( ... ) LAPACK_dpotrf_base( __VA_ARGS__ )
#endif

//------------------------------------------------------------------------------
// Factors a 5x5 matrix with dpotrf and compares all 25 entries exactly with
// the expected L. Prints "ok" and returns 0 if info == 0 and every entry
// matches, else prints "failed" and returns 1.
int main()
{
    // If lapack_int is 32-bit, but LAPACK actually interprets it as 64-bit,
    // LAPACK will see n = 0x500000005 and segfault.
    // If lapack_int is 64-bit, LAPACK can interpret it as 32-bit or 64-bit
    // to see n = 5 and pass.
    lapack_int n[] = { 5, 5 };
    // symmetric positive definite A = L L^T, with exact L.
    // -1 values in upper triangle (viewed column-major) are not referenced.
    double A[] = {
         4,  2,  0,  0,  0,
        -1,  5,  2,  0,  0,
        -1, -1,  5,  2,  0,
        -1, -1, -1,  5,  2,
        -1, -1, -1, -1,  5
    };
    double L[] = {
         2,  1,  0,  0,  0,
        -1,  2,  1,  0,  0,
        -1, -1,  2,  1,  0,
        -1, -1, -1,  2,  1,
        -1, -1, -1, -1,  2
    };
    lapack_int info = -1;
    // n is passed for both the order and the leading dimension.
    LAPACK_dpotrf( "lower", n, A, n, &info );
    bool okay = (info == 0);
    for (int i = 0; i < 5*5; ++i) {
        okay = okay && (A[i] == L[i]);
    }
    printf( "%s\n", okay ? "ok" : "failed" );
    return ! okay;
}

--------------------------------------------------------------------------------
/config/lapack_pstrf.cc:
--------------------------------------------------------------------------------
// Copyright (c) 2017-2023, University of Tennessee. All rights reserved.
// SPDX-License-Identifier: BSD-3-Clause
// This program is free software: you can redistribute it and/or modify it under
// the terms of the BSD 3-Clause license. See the accompanying LICENSE file.

// NOTE(review): the operands of the bare #include lines are missing in this copy.
#include
#include

#include "config.h"

#define LAPACK_dpstrf_base FORTRAN_NAME( dpstrf, DPSTRF )

// Fortran dpstrf; when LAPACK_FORTRAN_STRLEN_END is defined, the prototype
// takes a trailing length argument for the `uplo` string.
#ifdef __cplusplus
extern "C"
#endif
void LAPACK_dpstrf_base(
    const char* uplo, const lapack_int* n,
    double* A, const lapack_int* lda,
    lapack_int* ipiv, lapack_int* rank,
    const double* tol,
    double* work,
    lapack_int* info
    #ifdef LAPACK_FORTRAN_STRLEN_END
    , size_t uplo_len
    #endif
    );

// Call wrapper that appends the string length (1) when the prototype has it.
#ifdef LAPACK_FORTRAN_STRLEN_END
    #define LAPACK_dpstrf( ... ) LAPACK_dpstrf_base( __VA_ARGS__, 1 )
#else
    #define LAPACK_dpstrf( ... ) LAPACK_dpstrf_base( __VA_ARGS__ )
#endif

//------------------------------------------------------------------------------
// Factors a 5x5 matrix with dpstrf. Prints "ok" and returns 0 if info == 0 and
// the reported rank is 5, else prints "failed" and returns 1.
int main()
{
    // If lapack_int is 32-bit, but LAPACK actually interprets it as 64-bit,
    // LAPACK will see n = 0x500000005 and segfault.
    // If lapack_int is 64-bit, LAPACK can interpret it as 32-bit or 64-bit
    // to see n = 5 and pass.
    lapack_int n[] = { 5, 5 };
    // symmetric positive definite A = L L^T.
    // -1 values in upper triangle (viewed column-major) are not referenced.
    double A[] = {
         4,  2,  0,  0,  0,
        -1,  5,  2,  0,  0,
        -1, -1,  5,  2,  0,
        -1, -1, -1,  5,  2,
        -1, -1, -1, -1,  5
    };
    lapack_int ipiv[5] = { -1, -1, -1, -1, -1 };
    lapack_int rank = -1;
    double tol = -1;
    double work[2*5];
    lapack_int info = -1;
    // With pivoting in pstrf, P^T A P = L2 L2^T.
    // Don't have exact L2 for comparison.
    LAPACK_dpstrf( "lower", n, A, n, ipiv, &rank, &tol, work, &info );
    bool okay = (info == 0) && (rank == 5);
    printf( "%s\n", okay ? "ok" : "failed" );
    return ! okay;
}

--------------------------------------------------------------------------------
/config/mkl_version.cc:
--------------------------------------------------------------------------------
// Copyright (c) 2017-2023, University of Tennessee. All rights reserved.
// SPDX-License-Identifier: BSD-3-Clause
// This program is free software: you can redistribute it and/or modify it under
// the terms of the BSD 3-Clause license. See the accompanying LICENSE file.

// NOTE(review): the operands of the bare #include lines are missing in this copy.
#include
#include

// Prints "MKL_VERSION=major.minor.update" from MKL_Get_Version().
int main()
{
    MKLVersion v;
    MKL_Get_Version( &v );
    printf( "MKL_VERSION=%d.%d.%d\n",
            v.MajorVersion, v.MinorVersion, v.UpdateVersion );
    return 0;
}

--------------------------------------------------------------------------------
/config/onemkl.cc:
--------------------------------------------------------------------------------
// Copyright (c) 2017-2023, University of Tennessee. All rights reserved.
// SPDX-License-Identifier: BSD-3-Clause
// This program is free software: you can redistribute it and/or modify it under
// the terms of the BSD 3-Clause license.
See the accompanying LICENSE file. 5 | 6 | #include 7 | #include 8 | #include 9 | 10 | #include 11 | #include 12 | #include 13 | 14 | //------------------------------------------------------------------------------ 15 | int main() 16 | { 17 | try { 18 | double alpha = 2, beta = 3; 19 | int n = 2; 20 | double A[] = { 1, 2, 3, 4 }; 21 | double B[] = { 5, 4, 3, 2 }; 22 | double C[] = { 2, 3, 1, 0 }; 23 | double D[] = { 40, 61, 21, 28 }; 24 | 25 | // enumerate devices 26 | std::vector< sycl::device > devices; 27 | auto platforms = sycl::platform::get_platforms(); 28 | for (auto& platform : platforms) { 29 | auto all_devices = platform.get_devices(); 30 | for (auto& device : all_devices) { 31 | if (device.is_gpu()) { 32 | devices.push_back( device ); 33 | } 34 | } 35 | } 36 | if (devices.size() == 0) { 37 | printf( "no sycl GPU devices\n" ); 38 | return -1; 39 | } 40 | 41 | sycl::queue queue( devices[0] ); 42 | 43 | double *dA, *dB, *dC; 44 | dA = (double*) sycl::malloc_shared( n*n*sizeof(double), queue ); 45 | dB = (double*) sycl::malloc_shared( n*n*sizeof(double), queue ); 46 | dC = (double*) sycl::malloc_shared( n*n*sizeof(double), queue ); 47 | 48 | // dA = A, dB = B, dC = c 49 | queue.memcpy( dA, A, n*n*sizeof(double) ); 50 | queue.memcpy( dB, B, n*n*sizeof(double) ); 51 | queue.memcpy( dC, C, n*n*sizeof(double) ); 52 | 53 | // C = alpha A B + beta C 54 | oneapi::mkl::blas::gemm( 55 | queue, 56 | oneapi::mkl::transpose::N, oneapi::mkl::transpose::N, 57 | n, n, n, 58 | alpha, dA, n, dB, n, beta, dC, n ); 59 | 60 | // C = dC 61 | queue.memcpy( dC, C, n*n*sizeof(double) ); 62 | 63 | sycl::free( dA, queue ); 64 | sycl::free( dB, queue ); 65 | sycl::free( dC, queue ); 66 | 67 | // verify C == D 68 | double result = 0; 69 | for (int i = 0; i < n*n; ++i) { 70 | printf( "C[%d] = %.2f, D = %.2f\n", i, C[i], D[i] ); 71 | result += std::abs( D[i] - C[i] ); 72 | } 73 | bool okay = (result == 0); 74 | printf( "%s\n", okay ? "ok" : "failed" ); 75 | return ! 
okay; 76 | } 77 | catch (...) { 78 | printf( "caught error\n" ); 79 | return -2; 80 | } 81 | } 82 | -------------------------------------------------------------------------------- /config/openblas_version.cc: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2017-2023, University of Tennessee. All rights reserved. 2 | // SPDX-License-Identifier: BSD-3-Clause 3 | // This program is free software: you can redistribute it and/or modify it under 4 | // the terms of the BSD 3-Clause license. See the accompanying LICENSE file. 5 | 6 | #include 7 | #include // openblas_get_config 8 | 9 | int main() 10 | { 11 | const char* v = OPENBLAS_VERSION; 12 | printf( "OPENBLAS_VERSION=%s\n", v ); 13 | 14 | // since OPENBLAS_VERSION is defined in the header, it may work even 15 | // if we don't link with openblas. Calling an OpenBLAS-specific 16 | // function ensures we are linking with OpenBLAS. 17 | const char* config = openblas_get_config(); 18 | printf( "openblas_get_config=%s\n", config ); 19 | 20 | return 0; 21 | } 22 | -------------------------------------------------------------------------------- /config/openmp.cc: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2017-2023, University of Tennessee. All rights reserved. 2 | // SPDX-License-Identifier: BSD-3-Clause 3 | // This program is free software: you can redistribute it and/or modify it under 4 | // the terms of the BSD 3-Clause license. See the accompanying LICENSE file. 
5 | 6 | #include 7 | #include 8 | 9 | int main() 10 | { 11 | int nthreads = 1; 12 | int tid = 0; 13 | #pragma omp parallel 14 | { 15 | nthreads = omp_get_max_threads(); 16 | tid = omp_get_thread_num(); 17 | printf( "tid %d, nthreads %d\n", tid, nthreads ); 18 | } 19 | printf( "ok\n" ); 20 | return 0; 21 | } 22 | -------------------------------------------------------------------------------- /config/return_complex.cc: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2017-2023, University of Tennessee. All rights reserved. 2 | // SPDX-License-Identifier: BSD-3-Clause 3 | // This program is free software: you can redistribute it and/or modify it under 4 | // the terms of the BSD 3-Clause license. See the accompanying LICENSE file. 5 | 6 | #include 7 | #include 8 | 9 | // Use C99 _Complex as return type to be compatible with extern C linkage. 10 | #include 11 | 12 | #include "config.h" 13 | 14 | //------------------------------------------------------------------------------ 15 | #define BLAS_zdotc FORTRAN_NAME( zdotc, ZDOTC ) 16 | 17 | // result return directly 18 | #ifdef __cplusplus 19 | extern "C" 20 | #endif 21 | double _Complex BLAS_zdotc( 22 | const blas_int* n, 23 | const std::complex* x, const blas_int* incx, 24 | const std::complex* y, const blas_int* incy ); 25 | 26 | //------------------------------------------------------------------------------ 27 | int main() 28 | { 29 | blas_int n = 5, ione = 1; 30 | std::complex x[] = { 1, 2, 3, 4, 5 }; 31 | std::complex y[] = { 5, 4, 3, 2, 1 }; 32 | for (int i = 0; i < n; ++i) { 33 | printf( "x[ %d ] = %.1f + %.1fi; y[ %d ] = %.1f + %.1fi\n", 34 | i, real( x[ i ] ), imag( x[ i ] ), 35 | i, real( y[ i ] ), imag( y[ i ] ) ); 36 | } 37 | 38 | double _Complex r = BLAS_zdotc( &n, x, &ione, y, &ione ); 39 | std::complex result = *reinterpret_cast< std::complex* >( &r ); 40 | printf( "result = %.1f + %.1fi; should be 35.0 + 0.0i\n", 41 | real( result ), imag( result ) ); 
42 | 43 | bool okay = (real(result) == 35); 44 | printf( "%s\n", okay ? "ok" : "failed" ); 45 | return ! okay; 46 | } 47 | -------------------------------------------------------------------------------- /config/return_complex_argument.cc: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2017-2023, University of Tennessee. All rights reserved. 2 | // SPDX-License-Identifier: BSD-3-Clause 3 | // This program is free software: you can redistribute it and/or modify it under 4 | // the terms of the BSD 3-Clause license. See the accompanying LICENSE file. 5 | 6 | #include 7 | #include 8 | 9 | #include "config.h" 10 | 11 | //------------------------------------------------------------------------------ 12 | #define BLAS_zdotc FORTRAN_NAME( zdotc, ZDOTC ) 13 | 14 | // result returned as *hidden argument* 15 | #ifdef __cplusplus 16 | extern "C" 17 | #endif 18 | void BLAS_zdotc( 19 | std::complex* result, 20 | const blas_int* n, 21 | const std::complex* x, const blas_int* incx, 22 | const std::complex* y, const blas_int* incy ); 23 | 24 | //------------------------------------------------------------------------------ 25 | int main() 26 | { 27 | blas_int n = 5, ione = 1; 28 | std::complex x[] = { 1, 2, 3, 4, 5 }; 29 | std::complex y[] = { 5, 4, 3, 2, 1 }; 30 | for (int i = 0; i < n; ++i) { 31 | printf( "x[ %d ] = %.1f + %.1fi; y[ %d ] = %.1f + %.1fi\n", 32 | i, real( x[ i ] ), imag( x[ i ] ), 33 | i, real( y[ i ] ), imag( y[ i ] ) ); 34 | } 35 | 36 | std::complex result; 37 | BLAS_zdotc( &result, &n, x, &ione, y, &ione ); 38 | printf( "result = %.1f + %.1fi; should be 35.0 + 0.0i\n", 39 | real( result ), imag( result ) ); 40 | 41 | bool okay = (real(result) == 35); 42 | printf( "%s\n", okay ? "ok" : "failed" ); 43 | return ! 
okay; 44 | } 45 | -------------------------------------------------------------------------------- /config/return_float.cc: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2017-2023, University of Tennessee. All rights reserved. 2 | // SPDX-License-Identifier: BSD-3-Clause 3 | // This program is free software: you can redistribute it and/or modify it under 4 | // the terms of the BSD 3-Clause license. See the accompanying LICENSE file. 5 | 6 | #include 7 | 8 | #include "config.h" 9 | 10 | //------------------------------------------------------------------------------ 11 | #define BLAS_sdot FORTRAN_NAME( sdot, SDOT ) 12 | 13 | // returns *float* 14 | #ifdef __cplusplus 15 | extern "C" 16 | #endif 17 | float BLAS_sdot( const blas_int* n, 18 | const float* x, const blas_int* incx, 19 | const float* y, const blas_int* incy ); 20 | 21 | //------------------------------------------------------------------------------ 22 | int main() 23 | { 24 | blas_int n = 5, ione = 1; 25 | float x[] = { 1, 2, 3, 4, 5 }; 26 | float y[] = { 5, 4, 3, 2, 1 }; 27 | for (int i = 0; i < n; ++i) { 28 | printf( "x[ %d ] = %.1f; y[ %d ] = %.1f\n", 29 | i, x[ i ], 30 | i, y[ i ] ); 31 | } 32 | 33 | float result = BLAS_sdot( &n, x, &ione, y, &ione ); 34 | printf( "result = %.1f; should be 35.0\n", result ); 35 | 36 | bool okay = (result == 35); 37 | printf( "%s\n", okay ? "ok" : "failed" ); 38 | return ! okay; 39 | } 40 | -------------------------------------------------------------------------------- /config/return_float_f2c.cc: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2017-2023, University of Tennessee. All rights reserved. 2 | // SPDX-License-Identifier: BSD-3-Clause 3 | // This program is free software: you can redistribute it and/or modify it under 4 | // the terms of the BSD 3-Clause license. See the accompanying LICENSE file. 
5 | 6 | #include 7 | 8 | #include "config.h" 9 | 10 | //------------------------------------------------------------------------------ 11 | #define BLAS_sdot FORTRAN_NAME( sdot, SDOT ) 12 | 13 | // returns *double* 14 | #ifdef __cplusplus 15 | extern "C" 16 | #endif 17 | double BLAS_sdot( const blas_int* n, 18 | const float* x, const blas_int* incx, 19 | const float* y, const blas_int* incy ); 20 | 21 | //------------------------------------------------------------------------------ 22 | int main() 23 | { 24 | blas_int n = 5, ione = 1; 25 | float x[] = { 1, 2, 3, 4, 5 }; 26 | float y[] = { 5, 4, 3, 2, 1 }; 27 | for (int i = 0; i < n; ++i) { 28 | printf( "x[ %d ] = %.1f; y[ %d ] = %.1f\n", 29 | i, x[ i ], 30 | i, y[ i ] ); 31 | } 32 | 33 | float result = BLAS_sdot( &n, x, &ione, y, &ione ); 34 | printf( "result = %.1f; should be 35.0\n", result ); 35 | 36 | bool okay = (result == 35); 37 | printf( "%s\n", okay ? "ok" : "failed" ); 38 | return ! okay; 39 | } 40 | -------------------------------------------------------------------------------- /config/rocblas.cc: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2017-2023, University of Tennessee. All rights reserved. 2 | // SPDX-License-Identifier: BSD-3-Clause 3 | // This program is free software: you can redistribute it and/or modify it under 4 | // the terms of the BSD 3-Clause license. See the accompanying LICENSE file. 

#ifndef __HIP_PLATFORM_AMD__
#define __HIP_PLATFORM_AMD__
#endif

// NOTE(review): the operands of the bare #include lines are missing in this copy.
#include

// Headers moved in ROCm 5.2
#if HIP_VERSION >= 50200000
    #include
#else
    #include
#endif

#include
#include
#include

//------------------------------------------------------------------------------
// If err is not hipSuccess, prints the error with its location and exits with
// status 1.
void error_check_( hipError_t err, const char* file, int line )
{
    if (err != hipSuccess) {
        printf( "HIP error %d: %s at %s:%d\n",
                err, hipGetErrorString(err), file, line );
        exit(1);
    }
}

//------------------------------------------------------------------------------
// If err is not rocblas_status_success, prints the error with its location and
// exits with status 1.
void error_check_( rocblas_status err, const char* file, int line )
{
    if (err != rocblas_status_success) {
        printf( "rocblas error %d: %s at %s:%d\n",
                err, rocblas_status_to_string(err), file, line );
        exit(1);
    }
}

// Wraps a HIP or rocBLAS call, passing the call site's file and line.
#define error_check( err ) \
    error_check_( (err), __FILE__, __LINE__ )

//------------------------------------------------------------------------------
// Runs a 2x2 rocblas_dgemm (C = alpha A B + beta C) on device 0 and compares
// the result element-wise with the precomputed D. Prints "ok" and returns 0 on
// an exact match, else prints "failed" and returns 1. If hipSetDevice fails,
// the test is skipped and 0 is returned.
int main()
{
    double alpha = 2, beta = 3;
    int n = 2;
    double A[] = { 1, 2, 3, 4 };
    double B[] = { 5, 4, 3, 2 };
    double C[] = { 2, 3, 1, 0 };
    double D[] = { 40, 61, 21, 28 };

    hipError_t err = hipSetDevice( 0 );
    if (err != hipSuccess) {
        printf( "hipSetDevice failed: %s (%d).\n"
                "Cannot run on GPU; skipping test.\n",
                hipGetErrorString(err), err );
        return 0;
    }

    double *dA, *dB, *dC;
    error_check(
        hipMalloc( &dA, n*n*sizeof(double) ) );
    error_check(
        hipMalloc( &dB, n*n*sizeof(double) ) );
    error_check(
        hipMalloc( &dC, n*n*sizeof(double) ) );
    assert( dA != nullptr );
    assert( dB != nullptr );
    assert( dC != nullptr );

    // dA = A, dB = B, dC = C
    error_check(
        hipMemcpy( dA, A, n*n*sizeof(double), hipMemcpyDefault ) );
    error_check(
        hipMemcpy( dB, B, n*n*sizeof(double), hipMemcpyDefault ) );
    error_check(
        hipMemcpy( dC, C, n*n*sizeof(double), hipMemcpyDefault ) );

    // C = alpha A B + beta C
    rocblas_handle handle;
    error_check(
        rocblas_create_handle( &handle ) );
    error_check(
        rocblas_dgemm( handle, rocblas_operation_none, rocblas_operation_none,
                       n, n, n,
                       &alpha, dA, n, dB, n, &beta, dC, n ) );
    error_check(
        rocblas_destroy_handle( handle ) );

    // C = dC
    error_check(
        hipMemcpy( C, dC, n*n*sizeof(double), hipMemcpyDefault ) );

    error_check(
        hipFree( dA ) );
    error_check(
        hipFree( dB ) );
    error_check(
        hipFree( dC ) );

    // verify C == D
    double result = 0;
    for (int i = 0; i < n*n; ++i) {
        printf( "C[%d] = %.2f, D = %.2f\n", i, C[i], D[i] );
        result += std::abs( D[i] - C[i] );
    }
    bool okay = (result == 0);
    printf( "%s\n", okay ? "ok" : "failed" );
    return ! okay;
}

--------------------------------------------------------------------------------
/config/std_atomic.cc:
--------------------------------------------------------------------------------
// Copyright (c) 2017-2024, University of Tennessee. All rights reserved.
// SPDX-License-Identifier: BSD-3-Clause
// This program is free software: you can redistribute it and/or modify it under
// the terms of the BSD 3-Clause license. See the accompanying LICENSE file.

// NOTE(review): the operands of the bare #include lines are missing in this
// copy, and std::atomic appears without its template argument.
#include
#include

// Increments an atomic counter once per command-line argument and returns the
// count as the exit status.
int main( int argc, char** argv )
{
    std::atomic x = 0;
    for (int i = 1; i < argc; ++i) {
        ++x;
    }
    return x;
}

--------------------------------------------------------------------------------
/configure.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3
#
# Copyright (c) 2017-2023, University of Tennessee. All rights reserved.
def main():
    """Run every BLAS++ configure check, then write the output files.

    Steps run in a fixed order: compiler, compiler flags, libraries
    (atomic, OpenMP, BLAS, CBLAS, LAPACK, GPU BLAS), TestSweeper, and
    finally `make.inc` and `include/blas/defines.h`.

    A missing CBLAS or TestSweeper only produces a warning. A missing
    LAPACK produces a warning and then re-raises the `Error`.
    """
    config.init( namespace='BLAS', prefix='/opt/slate' )
    config.prog_cxx()

    print_header( 'C++ compiler flags' )
    # First entry: pick highest C++ level supported. oneAPI needs C++17.
    # Crusher had issue with -std=c++20 (2022-07).
    # Flags kept disabled upstream:
    #   -pedantic, -Wshadow (todo: conflict with ROCm 3.9.0),
    #   -Wmissing-declarations, -Wconversion, -Werror
    cxx_flags = (
        ['-std=c++17', '-std=c++14', '-std=c++11'],
        '-O2',
        '-MMD',
        '-Wall',
        '-Wno-unused-local-typedefs',
        '-Wno-unused-function',
    )
    for flag in cxx_flags:
        config.prog_cxx_flag( flag )

    print_header( 'Libraries' )
    config.libatomic()
    config.openmp()

    blas_checks = config.lapack
    blas_checks.blas()
    print()
    blas_checks.blas_float_return()
    blas_checks.blas_complex_return()
    # Must test mkl_version before cblas and lapacke, to define HAVE_MKL.
    blas_checks.vendor_version()

    # CBLAS is only needed by the testers, so its absence is not fatal.
    try:
        blas_checks.cblas()
    except Error:
        print_warn( 'BLAS++ needs CBLAS for testers.' )

    # LAPACK is required: warn with the reason, then propagate the error.
    try:
        blas_checks.lapack()
    except Error as lapack_err:
        print_warn( 'BLAS++ requires LAPACK for [cz]rot, [cz]syr, [cz]symv.' )
        raise lapack_err

    config.gpu_blas()

    search_dirs = ['../testsweeper', './testsweeper']
    found = config.get_package(
        'TestSweeper',
        search_dirs,
        'https://github.com/icl-utk-edu/testsweeper',
        'https://github.com/icl-utk-edu/testsweeper/tarball/master',
        'testsweeper.tar.gz' )
    if not found:
        print_warn( 'BLAS++ needs TestSweeper for testers.' )

    config.extract_defines_from_flags( 'CXXFLAGS', 'blaspp_header_defines' )
    config.output_files( ['make.inc', 'include/blas/defines.h'] )
    print( 'log in config/log.txt' )

    print( '-' * 80 )
# end
8 | 9 | cmake_minimum_required( VERSION 3.8 ) 10 | 11 | project( 12 | blaspp_example 13 | LANGUAGES CXX 14 | ) 15 | 16 | #------------------------------------------------------------------------------- 17 | # Enforce out-of-source build 18 | string( TOLOWER "${CMAKE_CURRENT_SOURCE_DIR}" source_dir ) 19 | string( TOLOWER "${CMAKE_CURRENT_BINARY_DIR}" binary_dir ) 20 | if ("${source_dir}" STREQUAL "${binary_dir}") 21 | message( FATAL_ERROR 22 | "Compiling with CMake requires an out-of-source build. To proceed: 23 | rm -rf CMakeCache.txt CMakeFiles/ # delete files in ${CMAKE_CURRENT_SOURCE_DIR} 24 | mkdir build 25 | cd build 26 | cmake .. 27 | make" ) 28 | endif() 29 | 30 | #------------------------------------------------------------------------------- 31 | find_package( blaspp REQUIRED ) 32 | 33 | #-------------------- 34 | add_executable( 35 | example_gemm 36 | example_gemm.cc 37 | ) 38 | target_link_libraries( 39 | example_gemm 40 | blaspp 41 | ) 42 | 43 | #-------------------- 44 | add_executable( 45 | example_util 46 | example_util.cc 47 | ) 48 | target_link_libraries( 49 | example_util 50 | blaspp 51 | ) 52 | 53 | #------------------------------------------------------------------------------- 54 | # CTest 55 | 56 | # Get precisions to test. See .github/workflows/test.sh 57 | set( test_args $ENV{test_args} ) 58 | if (NOT test_args) 59 | set( test_args "s d c z" ) 60 | endif() 61 | string( REPLACE " " ";" test_args ${test_args} ) # convert to list 62 | 63 | enable_testing() 64 | add_test( NAME example_gemm COMMAND ./example_gemm ${test_args} ) 65 | add_test( NAME example_util COMMAND ./example_util ${test_args} ) 66 | -------------------------------------------------------------------------------- /examples/Makefile: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-2023, University of Tennessee. All rights reserved. 
2 | # SPDX-License-Identifier: BSD-3-Clause 3 | # This program is free software: you can redistribute it and/or modify it under 4 | # the terms of the BSD 3-Clause license. See the accompanying LICENSE file. 5 | 6 | # CXX compiler must match the one used to compile BLAS++. 7 | # Set it in your environment or here. 8 | # Sadly, pkg-config doesn't provide a way to query CXX, 9 | # CXXFLAGS (only cflags), CPPFLAGS, or LDFLAGS. 10 | 11 | #------------------------------------------------------------------------------- 12 | # Set CXXFLAGS and LIBS 13 | pkg_exists := $(shell pkg-config --exists blaspp; echo $$?) 14 | ifeq ($(pkg_exists),0) 15 | 16 | # Get flags from pkg-config. 17 | CXX = $(shell pkg-config --variable CXX blaspp) 18 | CXXFLAGS = $(shell pkg-config --cflags blaspp) 19 | LIBS = $(shell pkg-config --libs blaspp) 20 | 21 | else 22 | $(warning WARNING: pkg-config couldn't find blaspp. Using hard-coded flags in Makefile.) 23 | 24 | # BLAS++ not in pkg-config. 25 | # Here's a hard-coded example using OpenBLAS. 26 | CXXFLAGS = -I/usr/local/blaspp/include -std=c++11 27 | LIBS = -L/usr/local/blaspp/lib$(LIB_SUFFIX) -lblaspp -lopenblas 28 | 29 | endif 30 | 31 | #------------------------------------------------------------------------------- 32 | # Rules 33 | 34 | exe = example_gemm example_util 35 | run = ${addsuffix .run, ${exe}} 36 | txt = ${addsuffix .txt, ${exe}} 37 | 38 | .DELETE_ON_ERROR: 39 | .SECONDARY: 40 | .SUFFIXES: 41 | .DEFAULT_GOAL := all 42 | .PRECIOUS: ${txt} 43 | 44 | # Serialize everything so that `make test` runs in serial. 45 | .NOTPARALLEL: 46 | 47 | all: ${exe} 48 | 49 | %: %.o 50 | $(CXX) -o $@ $^ $(LIBS) 51 | 52 | %.o: %.cc 53 | $(CXX) $(CXXFLAGS) -c -o $@ $< 54 | 55 | clean: 56 | -rm -f ${exe} ${txt} *.o *.d 57 | 58 | # CMake uses `make test`, GNU autotools uses `make check`; allow both. 59 | test: check 60 | check: ${run} 61 | 62 | # Run example, but don't save results; %.run is dummy filename. 
63 | %.run: % 64 | @echo "----------------------------------------------------------------------" 65 | ./$< ${test_args} 66 | 67 | # Run example and save result in .txt file. 68 | txt: ${txt} 69 | %.txt: % 70 | ./$< ${test_args} > $@ 71 | 72 | #------------------------------------------------------------------------------- 73 | # Debugging 74 | echo: 75 | @echo "PKG_CONFIG_PATH $(PKG_CONFIG_PATH)" 76 | @echo "pkg_exists $(pkg_exists)" 77 | @echo "CXX $(CXX)" 78 | @echo "CXXFLAGS $(CXXFLAGS)" 79 | @echo "LIBS $(LIBS)" 80 | @echo 81 | @echo "exe $(exe)" 82 | @echo "run $(run)" 83 | @echo "txt $(txt)" 84 | -------------------------------------------------------------------------------- /examples/README.md: -------------------------------------------------------------------------------- 1 | BLAS++ Example 2 | ================================================================================ 3 | 4 | This is designed as a minimal, standalone example to demonstrate 5 | how to include and link with BLAS++. This assumes that BLAS++ has 6 | been compiled and installed. There are two options: 7 | 8 | ## Option 1: Makefile 9 | 10 | The Makefile must know the compiler used to compile BLAS++, 11 | CXXFLAGS, and LIBS. Set CXX to the compiler, either in your environment 12 | or in the Makefile. For the flags, there are two more options: 13 | 14 | a. Using pkg-config to get CXXFLAGS and LIBS for BLAS++ (recommended). 15 | pkg-config must be able to locate the blaspp package. If it is installed 16 | outside the default search path (see `pkg-config --variable pc_path pkg-config`), 17 | it should be added to `$PKG_CONFIG_PATH`. For instance, if it is installed 18 | in /opt/slate: 19 | 20 | export PKG_CONFIG_PATH=/opt/slate/lib/pkgconfig # for sh 21 | setenv PKG_CONFIG_PATH /opt/slate/lib/pkgconfig # for csh 22 | 23 | b. Hard-code CXXFLAGS and LIBS for BLAS++ in the Makefile. 
24 | 25 | Then, to build and run `example_gemm` and `example_util` using the 26 | Makefile, run: 27 | 28 | make 29 | make test 30 | 31 | ## Option 2: CMake 32 | 33 | CMake must know the compiler used to compile BLAS++. Set CXX to the 34 | compiler, in your environment. 35 | 36 | Create a build directory: 37 | 38 | mkdir build && cd build 39 | 40 | If BLAS++ is installed outside the default search path, tell cmake 41 | where, for example, in /opt/slate: 42 | 43 | cmake -DCMAKE_PREFIX_PATH=/opt/slate .. 44 | 45 | Otherwise, simply run: 46 | 47 | cmake .. 48 | 49 | Then, to build and run `example_gemm` and `example_util` using the 50 | resulting Makefile, run: 51 | 52 | make 53 | make test 54 | -------------------------------------------------------------------------------- /examples/example_gemm.cc: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2017-2023, University of Tennessee. All rights reserved. 2 | // SPDX-License-Identifier: BSD-3-Clause 3 | // This program is free software: you can redistribute it and/or modify it under 4 | // the terms of the BSD 3-Clause license. See the accompanying LICENSE file. 5 | 6 | #include 7 | 8 | #include 9 | #include 10 | 11 | #include "util.hh" 12 | 13 | //------------------------------------------------------------------------------ 14 | template 15 | void test_gemm( int m, int n, int k ) 16 | { 17 | print_func(); 18 | 19 | int lda = m; 20 | int ldb = k; 21 | int ldc = m; 22 | std::vector A( lda*k, 1.0 ); // m-by-k 23 | std::vector B( ldb*n, 2.0 ); // k-by-n 24 | std::vector C( ldc*n, 3.0 ); // m-by-n 25 | 26 | // ... fill in application data into A, B, C ... 
27 | 28 | // C = -1.0*A*B + 1.0*C 29 | blas::gemm( blas::Layout::ColMajor, blas::Op::NoTrans, blas::Op::NoTrans, 30 | m, n, k, 31 | -1.0, A.data(), lda, 32 | B.data(), ldb, 33 | 1.0, C.data(), ldc ); 34 | } 35 | 36 | //------------------------------------------------------------------------------ 37 | template 38 | void test_device_gemm( int m, int n, int k ) 39 | { 40 | print_func(); 41 | if (blas::get_device_count() == 0) { 42 | printf( "no GPU devices\n" ); 43 | } 44 | else { 45 | int lda = m; 46 | int ldb = k; 47 | int ldc = m; 48 | std::vector A( lda*k, 1.0 ); // m-by-k 49 | std::vector B( ldb*n, 2.0 ); // k-by-n 50 | std::vector C( ldc*n, 3.0 ); // m-by-n 51 | 52 | // ... fill in application data into A, B, C ... 53 | 54 | int device = 0; 55 | blas::Queue queue( device ); 56 | 57 | T *dA = blas::device_malloc( lda*k, queue ); // m-by-k 58 | T *dB = blas::device_malloc( ldb*n, queue ); // k-by-n 59 | T *dC = blas::device_malloc( ldc*n, queue ); // m-by-n 60 | 61 | blas::device_copy_matrix( 62 | m, k, 63 | A.data(), lda, // src 64 | dA, lda, queue ); // dst 65 | 66 | blas::device_copy_matrix( 67 | k, n, 68 | B.data(), ldb, // src 69 | dB, ldb, queue ); // dst 70 | 71 | blas::device_copy_matrix( 72 | m, n, 73 | C.data(), ldc, // src 74 | dC, ldc, queue ); // dst 75 | 76 | // C = -1.0*A*B + 1.0*C 77 | blas::gemm( 78 | blas::Layout::ColMajor, blas::Op::NoTrans, blas::Op::NoTrans, 79 | m, n, k, 80 | -1.0, dA, lda, 81 | dB, ldb, 82 | 1.0, dC, ldc, 83 | queue ); 84 | 85 | blas::device_copy_matrix( 86 | m, n, 87 | dC, ldc, // src 88 | C.data(), ldc, queue ); // dst 89 | 90 | queue.sync(); 91 | 92 | blas::device_free( dA, queue ); dA = nullptr; 93 | blas::device_free( dB, queue ); dB = nullptr; 94 | blas::device_free( dC, queue ); dC = nullptr; 95 | } 96 | } 97 | 98 | //------------------------------------------------------------------------------ 99 | int main( int argc, char** argv ) 100 | { 101 | try { 102 | // Parse command line to set types for s, d, c, z 
precisions. 103 | bool types[ 4 ]; 104 | parse_args( argc, argv, types ); 105 | 106 | int m = 100, n = 200, k = 50; 107 | printf( "m %d, n %d, k %d\n", m, n, k ); 108 | 109 | // Run tests. 110 | if (types[ 0 ]) 111 | test_gemm< float >( m, n, k ); 112 | if (types[ 1 ]) 113 | test_gemm< double >( m, n, k ); 114 | if (types[ 2 ]) 115 | test_gemm< std::complex >( m, n, k ); 116 | if (types[ 3 ]) 117 | test_gemm< std::complex >( m, n, k ); 118 | 119 | if (types[ 0 ]) 120 | test_device_gemm< float >( m, n, k ); 121 | if (types[ 1 ]) 122 | test_device_gemm< double >( m, n, k ); 123 | if (types[ 2 ]) 124 | test_device_gemm< std::complex >( m, n, k ); 125 | if (types[ 3 ]) 126 | test_device_gemm< std::complex >( m, n, k ); 127 | } 128 | catch (std::exception const& ex) { 129 | fprintf( stderr, "%s\n", ex.what() ); 130 | return 1; 131 | } 132 | return 0; 133 | } 134 | -------------------------------------------------------------------------------- /examples/example_util.cc: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2017-2023, University of Tennessee. All rights reserved. 2 | // SPDX-License-Identifier: BSD-3-Clause 3 | // This program is free software: you can redistribute it and/or modify it under 4 | // the terms of the BSD 3-Clause license. See the accompanying LICENSE file. 
5 | 6 | // blas01_util.cc 7 | // BLAS++ utilities: blas::real_type, blas::is_complex, blas::conj 8 | #include 9 | 10 | #include "util.hh" 11 | 12 | //------------------------------------------------------------------------------ 13 | template 14 | void test_util( scalar_type alpha ) 15 | { 16 | print_func(); 17 | 18 | //-------------------- 19 | // demo blas::real_type 20 | int64_t n=100; 21 | std::vector x( n, 1.0 ); 22 | 23 | using real_type = blas::real_type< scalar_type >; 24 | real_type norm = blas::nrm2( n, x.data(), 1 ); 25 | printf( "norm %7.4f\n", norm ); 26 | 27 | //-------------------- 28 | // demo blas::conj 29 | scalar_type beta; 30 | 31 | // std::conj fails if alpha is real: 32 | // error: cannot convert 'std::complex' to 'double' in assignment 33 | //beta = std::conj( alpha ); 34 | 35 | // blas::conj works. Need `using`! 36 | using blas::conj; 37 | beta = conj( alpha ); 38 | 39 | //-------------------- 40 | // demo blas::is_complex 41 | using std::real; 42 | using std::imag; 43 | if (blas::is_complex_v) { 44 | printf( "alpha %7.4f + %7.4fi\n", real(alpha), imag(alpha) ); 45 | printf( "beta %7.4f + %7.4fi\n", real(beta), imag(beta) ); 46 | } 47 | else { 48 | printf( "alpha %7.4f\n", real(alpha) ); 49 | printf( "beta %7.4f\n", real(beta) ); 50 | } 51 | } 52 | 53 | //------------------------------------------------------------------------------ 54 | int main( int argc, char** argv ) 55 | { 56 | try { 57 | // Parse command line to set types for s, d, c, z precisions. 58 | bool types[ 4 ]; 59 | parse_args( argc, argv, types ); 60 | 61 | // Run tests. 
//------------------------------------------------------------------------------
/// Prints a blank line followed by the given function name.
/// Declared `inline` because it is defined in a header: without it, two
/// translation units that include this file produce duplicate definitions.
inline void print_func_( const char* func )
{
    printf( "\n%s\n", func );
}

// Prints the enclosing function's name; GNU-compatible compilers get the
// full signature via __PRETTY_FUNCTION__.
#ifdef __GNUC__
    #define print_func() print_func_( __PRETTY_FUNCTION__ )
#else
    #define print_func() print_func_( __func__ )
#endif

//------------------------------------------------------------------------------
/// Parse command line options:
/// s = single,         sets types[ 0 ]
/// d = double,         sets types[ 1 ]
/// c = complex,        sets types[ 2 ]
/// z = double-complex, sets types[ 3 ]
/// If no options, sets all types to true.
///
/// @param[in]  argc   Argument count, as passed to main.
/// @param[in]  argv   Argument vector, as passed to main.
/// @param[out] types  Flags for the s, d, c, z precisions; always fully set.
///
/// @throws std::runtime_error for unknown options; the message includes a
/// usage line.
///
/// Declared `inline` for the same header-definition reason as print_func_.
inline void parse_args( int argc, char** argv, bool types[ 4 ] )
{
    // With no options every precision is enabled; otherwise start from none
    // and enable only the ones listed.
    bool const run_all = (argc == 1);
    for (int t = 0; t < 4; ++t) {
        types[ t ] = run_all;
    }

    for (int i = 1; i < argc; ++i) {
        std::string arg = argv[ i ];
        if (arg == "s")
            types[ 0 ] = true;
        else if (arg == "d")
            types[ 1 ] = true;
        else if (arg == "c")
            types[ 2 ] = true;
        else if (arg == "z")
            types[ 3 ] = true;
        else {
            throw std::runtime_error(
                "unknown option: \"" + arg + "\"\n"
                + "Usage: " + argv[ 0 ] + " [s] [d] [c] [z]\n"
                + "for single, double, complex, double-complex.\n" );
        }
    }
}
14 | // Version 2025.05.28 15 | #define BLASPP_VERSION 20250528 16 | 17 | namespace blas { 18 | 19 | int blaspp_version(); 20 | const char* blaspp_id(); 21 | 22 | } // namespace blas 23 | 24 | #include "blas/wrappers.hh" 25 | 26 | // ============================================================================= 27 | // Level 1 BLAS template implementations 28 | 29 | #include "blas/asum.hh" 30 | #include "blas/axpy.hh" 31 | #include "blas/copy.hh" 32 | #include "blas/dot.hh" 33 | #include "blas/dotu.hh" 34 | #include "blas/iamax.hh" 35 | #include "blas/nrm2.hh" 36 | #include "blas/rot.hh" 37 | #include "blas/rotg.hh" 38 | #include "blas/rotm.hh" 39 | #include "blas/rotmg.hh" 40 | #include "blas/scal.hh" 41 | #include "blas/swap.hh" 42 | 43 | // ============================================================================= 44 | // Level 2 BLAS template implementations 45 | 46 | #include "blas/gemv.hh" 47 | #include "blas/ger.hh" 48 | #include "blas/geru.hh" 49 | #include "blas/hemv.hh" 50 | #include "blas/her.hh" 51 | #include "blas/her2.hh" 52 | #include "blas/symv.hh" 53 | #include "blas/syr.hh" 54 | #include "blas/syr2.hh" 55 | #include "blas/trmv.hh" 56 | #include "blas/trsv.hh" 57 | 58 | // ============================================================================= 59 | // Level 3 BLAS template implementations 60 | 61 | #include "blas/gemm.hh" 62 | #include "blas/hemm.hh" 63 | #include "blas/herk.hh" 64 | #include "blas/her2k.hh" 65 | #include "blas/symm.hh" 66 | #include "blas/syrk.hh" 67 | #include "blas/syr2k.hh" 68 | #include "blas/trmm.hh" 69 | #include "blas/trsm.hh" 70 | 71 | // ============================================================================= 72 | // Device BLAS 73 | 74 | #include "blas/device_blas.hh" 75 | 76 | #endif // #ifndef BLAS_HH 77 | -------------------------------------------------------------------------------- /include/blas/GNUmakefile: -------------------------------------------------------------------------------- 1 | top = 
../.. 2 | include ${top}/GNUmakefile.subdir 3 | -------------------------------------------------------------------------------- /include/blas/asum.hh: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2017-2023, University of Tennessee. All rights reserved. 2 | // SPDX-License-Identifier: BSD-3-Clause 3 | // This program is free software: you can redistribute it and/or modify it under 4 | // the terms of the BSD 3-Clause license. See the accompanying LICENSE file. 5 | 6 | #ifndef BLAS_ASUM_HH 7 | #define BLAS_ASUM_HH 8 | 9 | #include "blas/util.hh" 10 | 11 | #include 12 | 13 | namespace blas { 14 | 15 | // ============================================================================= 16 | /// @return 1-norm of vector, 17 | /// $|| Re(x) ||_1 + || Im(x) ||_1 18 | /// = \sum_{i=0}^{n-1} |Re(x_i)| + |Im(x_i)|$. 19 | /// 20 | /// Generic implementation for arbitrary data types. 21 | /// 22 | /// @param[in] n 23 | /// Number of elements in x. n >= 0. 24 | /// 25 | /// @param[in] x 26 | /// The n-element vector x, in an array of length (n-1)*incx + 1. 27 | /// 28 | /// @param[in] incx 29 | /// Stride between elements of x. incx > 0. 
30 | /// 31 | /// @ingroup asum 32 | 33 | template 34 | real_type 35 | asum( 36 | int64_t n, 37 | T const *x, int64_t incx ) 38 | { 39 | typedef real_type real_t; 40 | 41 | // check arguments 42 | blas_error_if( n < 0 ); // standard BLAS returns, doesn't fail 43 | blas_error_if( incx <= 0 ); // standard BLAS returns, doesn't fail 44 | 45 | real_t result = 0; 46 | if (incx == 1) { 47 | // unit stride 48 | for (int64_t i = 0; i < n; ++i) { 49 | result += abs1( x[i] ); 50 | } 51 | } 52 | else { 53 | // non-unit stride 54 | int64_t ix = 0; 55 | for (int64_t i = 0; i < n; ++i) { 56 | result += abs1( x[ix] ); 57 | ix += incx; 58 | } 59 | } 60 | return result; 61 | } 62 | 63 | } // namespace blas 64 | 65 | #endif // #ifndef BLAS_ASUM_HH 66 | -------------------------------------------------------------------------------- /include/blas/axpy.hh: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2017-2023, University of Tennessee. All rights reserved. 2 | // SPDX-License-Identifier: BSD-3-Clause 3 | // This program is free software: you can redistribute it and/or modify it under 4 | // the terms of the BSD 3-Clause license. See the accompanying LICENSE file. 5 | 6 | #ifndef BLAS_AXPY_HH 7 | #define BLAS_AXPY_HH 8 | 9 | #include "blas/util.hh" 10 | 11 | #include 12 | 13 | namespace blas { 14 | 15 | // ============================================================================= 16 | /// Add scaled vector, $y = \alpha x + y$. 17 | /// 18 | /// Generic implementation for arbitrary data types. 19 | /// 20 | /// @param[in] n 21 | /// Number of elements in x and y. n >= 0. 22 | /// 23 | /// @param[in] alpha 24 | /// Scalar alpha. If alpha is zero, y is not updated. 25 | /// 26 | /// @param[in] x 27 | /// The n-element vector x, in an array of length (n-1)*abs(incx) + 1. 28 | /// 29 | /// @param[in] incx 30 | /// Stride between elements of x. incx must not be zero. 
31 | /// If incx < 0, uses elements of x in reverse order: x(n-1), ..., x(0). 32 | /// 33 | /// @param[in, out] y 34 | /// The n-element vector y, in an array of length (n-1)*abs(incy) + 1. 35 | /// 36 | /// @param[in] incy 37 | /// Stride between elements of y. incy must not be zero. 38 | /// If incy < 0, uses elements of y in reverse order: y(n-1), ..., y(0). 39 | /// 40 | /// @ingroup axpy 41 | 42 | template 43 | void axpy( 44 | int64_t n, 45 | blas::scalar_type alpha, 46 | TX const *x, int64_t incx, 47 | TY *y, int64_t incy ) 48 | { 49 | typedef blas::scalar_type scalar_t; 50 | 51 | // check arguments 52 | blas_error_if( n < 0 ); 53 | blas_error_if( incx == 0 ); 54 | blas_error_if( incy == 0 ); 55 | 56 | // quick return 57 | if (alpha == scalar_t(0)) 58 | return; 59 | 60 | if (incx == 1 && incy == 1) { 61 | // unit stride 62 | for (int64_t i = 0; i < n; ++i) { 63 | y[i] += alpha*x[i]; 64 | } 65 | } 66 | else { 67 | // non-unit stride 68 | int64_t ix = (incx > 0 ? 0 : (-n + 1)*incx); 69 | int64_t iy = (incy > 0 ? 0 : (-n + 1)*incy); 70 | for (int64_t i = 0; i < n; ++i) { 71 | y[iy] += alpha * x[ix]; 72 | ix += incx; 73 | iy += incy; 74 | } 75 | } 76 | } 77 | 78 | } // namespace blas 79 | 80 | #endif // #ifndef BLAS_AXPY_HH 81 | -------------------------------------------------------------------------------- /include/blas/config.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2017-2023, University of Tennessee. All rights reserved. 2 | // SPDX-License-Identifier: BSD-3-Clause 3 | // This program is free software: you can redistribute it and/or modify it under 4 | // the terms of the BSD 3-Clause license. See the accompanying LICENSE file. 
5 | 6 | #ifndef BLAS_CONFIG_H 7 | #define BLAS_CONFIG_H 8 | 9 | #include 10 | 11 | #include "blas/defines.h" 12 | 13 | #ifndef blas_int 14 | #if defined(BLAS_ILP64) 15 | typedef int64_t blas_int; 16 | #else 17 | typedef int blas_int; 18 | #endif 19 | /* #define so that #ifdef works. */ 20 | #define blas_int blas_int 21 | #endif 22 | 23 | /* f2c, hence MacOS Accelerate, returns double instead of float 24 | * for sdot, slange, clange, etc. */ 25 | #if defined(BLAS_HAVE_F2C) 26 | typedef double blas_float_return; 27 | #else 28 | typedef float blas_float_return; 29 | #endif 30 | 31 | #if defined(BLAS_COMPLEX_CPP) || defined(LAPACK_COMPLEX_CPP) 32 | /* user has to specifically request std::complex, 33 | * as it isn't compatible as a return type from extern C functions. */ 34 | #include 35 | typedef std::complex blas_complex_float; 36 | typedef std::complex blas_complex_double; 37 | #elif defined(_MSC_VER) 38 | /* MSVC has no C99 _Complex */ 39 | typedef struct { float real, imag; } blas_complex_float; 40 | typedef struct { double real, imag; } blas_complex_double; 41 | #else 42 | /* otherwise, by default use C99 _Complex */ 43 | #include 44 | typedef float _Complex blas_complex_float; 45 | typedef double _Complex blas_complex_double; 46 | #endif 47 | 48 | /* define so we can check later with ifdef */ 49 | #define blas_complex_float blas_complex_float 50 | #define blas_complex_double blas_complex_double 51 | 52 | #endif // #ifndef BLAS_CONFIG_H 53 | -------------------------------------------------------------------------------- /include/blas/copy.hh: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2017-2023, University of Tennessee. All rights reserved. 2 | // SPDX-License-Identifier: BSD-3-Clause 3 | // This program is free software: you can redistribute it and/or modify it under 4 | // the terms of the BSD 3-Clause license. See the accompanying LICENSE file. 
5 | 6 | #ifndef BLAS_COPY_HH 7 | #define BLAS_COPY_HH 8 | 9 | #include "blas/util.hh" 10 | 11 | #include 12 | 13 | namespace blas { 14 | 15 | // ============================================================================= 16 | /// Copy vector, $y = x$. 17 | /// 18 | /// Generic implementation for arbitrary data types. 19 | /// 20 | /// @param[in] n 21 | /// Number of elements in x and y. n >= 0. 22 | /// 23 | /// @param[in] x 24 | /// The n-element vector x, in an array of length (n-1)*abs(incx) + 1. 25 | /// 26 | /// @param[in] incx 27 | /// Stride between elements of x. incx must not be zero. 28 | /// If incx < 0, uses elements of x in reverse order: x(n-1), ..., x(0). 29 | /// 30 | /// @param[out] y 31 | /// The n-element vector y, in an array of length (n-1)*abs(incy) + 1. 32 | /// 33 | /// @param[in] incy 34 | /// Stride between elements of y. incy must not be zero. 35 | /// If incy < 0, uses elements of y in reverse order: y(n-1), ..., y(0). 36 | /// 37 | /// @ingroup copy 38 | 39 | template 40 | void copy( 41 | int64_t n, 42 | TX const *x, int64_t incx, 43 | TY *y, int64_t incy ) 44 | { 45 | // check arguments 46 | blas_error_if( n < 0 ); 47 | blas_error_if( incx == 0 ); 48 | blas_error_if( incy == 0 ); 49 | 50 | if (incx == 1 && incy == 1) { 51 | // unit stride 52 | for (int64_t i = 0; i < n; ++i) { 53 | y[i] = x[i]; 54 | } 55 | } 56 | else { 57 | // non-unit stride 58 | int64_t ix = (incx > 0 ? 0 : (-n + 1)*incx); 59 | int64_t iy = (incy > 0 ? 0 : (-n + 1)*incy); 60 | for (int64_t i = 0; i < n; ++i) { 61 | y[iy] = x[ix]; 62 | ix += incx; 63 | iy += incy; 64 | } 65 | } 66 | } 67 | 68 | } // namespace blas 69 | 70 | #endif // #ifndef BLAS_COPY_HH 71 | -------------------------------------------------------------------------------- /include/blas/defines.h.in: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2017-2023, University of Tennessee. All rights reserved. 
2 | // SPDX-License-Identifier: BSD-3-Clause 3 | // This program is free software: you can redistribute it and/or modify it under 4 | // the terms of the BSD 3-Clause license. See the accompanying LICENSE file. 5 | 6 | #ifndef BLAS_DEFINES_H 7 | #define BLAS_DEFINES_H 8 | 9 | // auto-generated by: @argv@ 10 | // @CMAKE_COMMAND@ @CMAKE_VERSION@ 11 | // 12 | // Definitions for: 13 | // CXX = @CXX@ 14 | // @CMAKE_CXX_COMPILER@ 15 | // LIBS = @LIBS@ 16 | // @BLAS_LIBRARIES@ 17 | 18 | @blaspp_header_defines@ 19 | 20 | #endif // #ifndef BLAS_DEFINES_H 21 | -------------------------------------------------------------------------------- /include/blas/dot.hh: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2017-2023, University of Tennessee. All rights reserved. 2 | // SPDX-License-Identifier: BSD-3-Clause 3 | // This program is free software: you can redistribute it and/or modify it under 4 | // the terms of the BSD 3-Clause license. See the accompanying LICENSE file. 5 | 6 | #ifndef BLAS_DOT_HH 7 | #define BLAS_DOT_HH 8 | 9 | #include "blas/util.hh" 10 | 11 | #include 12 | 13 | namespace blas { 14 | 15 | // ============================================================================= 16 | /// @return dot product, $x^H y$. 17 | /// @see dotu for unconjugated version, $x^T y$. 18 | /// 19 | /// Generic implementation for arbitrary data types. 20 | /// 21 | /// @param[in] n 22 | /// Number of elements in x and y. n >= 0. 23 | /// 24 | /// @param[in] x 25 | /// The n-element vector x, in an array of length (n-1)*abs(incx) + 1. 26 | /// 27 | /// @param[in] incx 28 | /// Stride between elements of x. incx must not be zero. 29 | /// If incx < 0, uses elements of x in reverse order: x(n-1), ..., x(0). 30 | /// 31 | /// @param[in] y 32 | /// The n-element vector y, in an array of length (n-1)*abs(incy) + 1. 33 | /// 34 | /// @param[in] incy 35 | /// Stride between elements of y. incy must not be zero. 
36 | /// If incy < 0, uses elements of y in reverse order: y(n-1), ..., y(0). 37 | /// 38 | /// @ingroup dot 39 | 40 | template 41 | scalar_type dot( 42 | int64_t n, 43 | TX const *x, int64_t incx, 44 | TY const *y, int64_t incy ) 45 | { 46 | typedef scalar_type scalar_t; 47 | 48 | // check arguments 49 | blas_error_if( n < 0 ); 50 | blas_error_if( incx == 0 ); 51 | blas_error_if( incy == 0 ); 52 | 53 | scalar_t result = 0; 54 | if (incx == 1 && incy == 1) { 55 | // unit stride 56 | for (int64_t i = 0; i < n; ++i) { 57 | result += conj(x[i]) * y[i]; 58 | } 59 | } 60 | else { 61 | // non-unit stride 62 | int64_t ix = (incx > 0 ? 0 : (-n + 1)*incx); 63 | int64_t iy = (incy > 0 ? 0 : (-n + 1)*incy); 64 | for (int64_t i = 0; i < n; ++i) { 65 | result += conj(x[ix]) * y[iy]; 66 | ix += incx; 67 | iy += incy; 68 | } 69 | } 70 | return result; 71 | } 72 | 73 | } // namespace blas 74 | 75 | #endif // #ifndef BLAS_DOT_HH 76 | -------------------------------------------------------------------------------- /include/blas/dotu.hh: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2017-2023, University of Tennessee. All rights reserved. 2 | // SPDX-License-Identifier: BSD-3-Clause 3 | // This program is free software: you can redistribute it and/or modify it under 4 | // the terms of the BSD 3-Clause license. See the accompanying LICENSE file. 5 | 6 | #ifndef BLAS_DOTU_HH 7 | #define BLAS_DOTU_HH 8 | 9 | #include "blas/util.hh" 10 | 11 | #include 12 | 13 | namespace blas { 14 | 15 | // ============================================================================= 16 | /// @return unconjugated dot product, $x^T y$. 17 | /// @see dot for conjugated version, $x^H y$. 18 | /// 19 | /// Generic implementation for arbitrary data types. 20 | /// 21 | /// @param[in] n 22 | /// Number of elements in x and y. n >= 0. 23 | /// 24 | /// @param[in] x 25 | /// The n-element vector x, in an array of length (n-1)*abs(incx) + 1. 
26 | /// 27 | /// @param[in] incx 28 | /// Stride between elements of x. incx must not be zero. 29 | /// If incx < 0, uses elements of x in reverse order: x(n-1), ..., x(0). 30 | /// 31 | /// @param[in] y 32 | /// The n-element vector y, in an array of length (n-1)*abs(incy) + 1. 33 | /// 34 | /// @param[in] incy 35 | /// Stride between elements of y. incy must not be zero. 36 | /// If incy < 0, uses elements of y in reverse order: y(n-1), ..., y(0). 37 | /// 38 | /// @ingroup dotu 39 | 40 | template 41 | scalar_type dotu( 42 | int64_t n, 43 | TX const *x, int64_t incx, 44 | TY const *y, int64_t incy ) 45 | { 46 | typedef scalar_type scalar_t; 47 | 48 | // check arguments 49 | blas_error_if( n < 0 ); 50 | blas_error_if( incx == 0 ); 51 | blas_error_if( incy == 0 ); 52 | 53 | scalar_t result = 0; 54 | if (incx == 1 && incy == 1) { 55 | // unit stride 56 | for (int64_t i = 0; i < n; ++i) { 57 | result += x[i] * y[i]; 58 | } 59 | } 60 | else { 61 | // non-unit stride 62 | int64_t ix = (incx > 0 ? 0 : (-n + 1)*incx); 63 | int64_t iy = (incy > 0 ? 0 : (-n + 1)*incy); 64 | for (int64_t i = 0; i < n; ++i) { 65 | result += x[ix] * y[iy]; 66 | ix += incx; 67 | iy += incy; 68 | } 69 | } 70 | return result; 71 | } 72 | 73 | } // namespace blas 74 | 75 | #endif // #ifndef BLAS_DOTU_HH 76 | -------------------------------------------------------------------------------- /include/blas/geru.hh: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2017-2023, University of Tennessee. All rights reserved. 2 | // SPDX-License-Identifier: BSD-3-Clause 3 | // This program is free software: you can redistribute it and/or modify it under 4 | // the terms of the BSD 3-Clause license. See the accompanying LICENSE file. 
5 | 6 | #ifndef BLAS_GERU_HH 7 | #define BLAS_GERU_HH 8 | 9 | #include "blas/util.hh" 10 | #include "blas/ger.hh" 11 | 12 | #include 13 | 14 | namespace blas { 15 | 16 | // ============================================================================= 17 | /// General matrix rank-1 update: 18 | /// \[ 19 | /// A = \alpha x y^T + A, 20 | /// \] 21 | /// where alpha is a scalar, x and y are vectors, 22 | /// and A is an m-by-n matrix. 23 | /// 24 | /// Generic implementation for arbitrary data types. 25 | /// 26 | /// @param[in] layout 27 | /// Matrix storage, Layout::ColMajor or Layout::RowMajor. 28 | /// 29 | /// @param[in] m 30 | /// Number of rows of the matrix A. m >= 0. 31 | /// 32 | /// @param[in] n 33 | /// Number of columns of the matrix A. n >= 0. 34 | /// 35 | /// @param[in] alpha 36 | /// Scalar alpha. If alpha is zero, A is not updated. 37 | /// 38 | /// @param[in] x 39 | /// The m-element vector x, in an array of length (m-1)*abs(incx) + 1. 40 | /// 41 | /// @param[in] incx 42 | /// Stride between elements of x. incx must not be zero. 43 | /// If incx < 0, uses elements of x in reverse order: x(m-1), ..., x(0). 44 | /// 45 | /// @param[in] y 46 | /// The n-element vector y, in an array of length (n-1)*abs(incy) + 1. 47 | /// 48 | /// @param[in] incy 49 | /// Stride between elements of y. incy must not be zero. 50 | /// If incy < 0, uses elements of y in reverse order: y(n-1), ..., y(0). 51 | /// 52 | /// @param[in, out] A 53 | /// The m-by-n matrix A, stored in an lda-by-n array [RowMajor: m-by-lda]. 54 | /// 55 | /// @param[in] lda 56 | /// Leading dimension of A. lda >= max(1, m) [RowMajor: lda >= max(1, n)].
57 | /// 58 | /// @ingroup geru 59 | 60 | template 61 | void geru( 62 | blas::Layout layout, 63 | int64_t m, int64_t n, 64 | blas::scalar_type alpha, 65 | TX const *x, int64_t incx, 66 | TY const *y, int64_t incy, 67 | TA *A, int64_t lda ) 68 | { 69 | typedef blas::scalar_type scalar_t; 70 | 71 | #define A(i_, j_) A[ (i_) + (j_)*lda ] 72 | 73 | // constants 74 | const scalar_t zero = 0; 75 | 76 | // check arguments 77 | blas_error_if( layout != Layout::ColMajor && 78 | layout != Layout::RowMajor ); 79 | blas_error_if( m < 0 ); 80 | blas_error_if( n < 0 ); 81 | blas_error_if( incx == 0 ); 82 | blas_error_if( incy == 0 ); 83 | 84 | if (layout == Layout::ColMajor) 85 | blas_error_if( lda < m ); 86 | else 87 | blas_error_if( lda < n ); 88 | 89 | // quick return 90 | if (m == 0 || n == 0 || alpha == zero) 91 | return; 92 | 93 | // for row-major, simply swap dimensions and x <=> y 94 | // this doesn't work in the complex gerc case because y gets conj 95 | if (layout == Layout::RowMajor) { 96 | geru( Layout::ColMajor, n, m, alpha, y, incy, x, incx, A, lda ); 97 | return; 98 | } 99 | 100 | if (incx == 1 && incy == 1) { 101 | // unit stride 102 | for (int64_t j = 0; j < n; ++j) { 103 | // note: NOT skipping if y[j] is zero, for consistent NAN handling 104 | scalar_t tmp = alpha * y[j]; 105 | for (int64_t i = 0; i < m; ++i) { 106 | A(i, j) += x[i] * tmp; 107 | } 108 | } 109 | } 110 | else if (incx == 1) { 111 | // x unit stride, y non-unit stride 112 | int64_t jy = (incy > 0 ? 0 : (-n + 1)*incy); 113 | for (int64_t j = 0; j < n; ++j) { 114 | scalar_t tmp = alpha * y[jy]; 115 | for (int64_t i = 0; i < m; ++i) { 116 | A(i, j) += x[i] * tmp; 117 | } 118 | jy += incy; 119 | } 120 | } 121 | else { 122 | // x and y non-unit stride 123 | int64_t kx = (incx > 0 ? 0 : (-m + 1)*incx); 124 | int64_t jy = (incy > 0 ? 
0 : (-n + 1)*incy); 125 | for (int64_t j = 0; j < n; ++j) { 126 | scalar_t tmp = alpha * y[jy]; 127 | int64_t ix = kx; 128 | for (int64_t i = 0; i < m; ++i) { 129 | A(i, j) += x[ix] * tmp; 130 | ix += incx; 131 | } 132 | jy += incy; 133 | } 134 | } 135 | 136 | #undef A 137 | } 138 | 139 | } // namespace blas 140 | 141 | #endif // #ifndef BLAS_GERU_HH 142 | -------------------------------------------------------------------------------- /include/blas/iamax.hh: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2017-2023, University of Tennessee. All rights reserved. 2 | // SPDX-License-Identifier: BSD-3-Clause 3 | // This program is free software: you can redistribute it and/or modify it under 4 | // the terms of the BSD 3-Clause license. See the accompanying LICENSE file. 5 | 6 | #ifndef BLAS_IAMAX_HH 7 | #define BLAS_IAMAX_HH 8 | 9 | #include "blas/util.hh" 10 | 11 | #include 12 | 13 | namespace blas { 14 | 15 | // ============================================================================= 16 | /// @return Index of infinity-norm of vector, $|| x ||_{inf}$, 17 | /// $\text{argmax}_{i=0}^{n-1} |Re(x_i)| + |Im(x_i)|$. 18 | /// Returns -1 if n = 0. 19 | /// 20 | /// Generic implementation for arbitrary data types. 21 | /// 22 | /// @param[in] n 23 | /// Number of elements in x. n >= 0. 24 | /// 25 | /// @param[in] x 26 | /// The n-element vector x, in an array of length (n-1)*incx + 1. 27 | /// 28 | /// @param[in] incx 29 | /// Stride between elements of x. incx > 0.
30 | /// 31 | /// @ingroup iamax 32 | 33 | template 34 | int64_t iamax( 35 | int64_t n, 36 | T const *x, int64_t incx ) 37 | { 38 | typedef real_type real_t; 39 | 40 | // check arguments 41 | blas_error_if( n < 0 ); // standard BLAS returns, doesn't fail 42 | blas_error_if( incx <= 0 ); // standard BLAS returns, doesn't fail 43 | 44 | // todo: check NAN 45 | real_t result = -1; 46 | int64_t index = -1; 47 | if (incx == 1) { 48 | // unit stride 49 | for (int64_t i = 0; i < n; ++i) { 50 | real_t tmp = abs1( x[i] ); 51 | if (tmp > result) { 52 | result = tmp; 53 | index = i; 54 | } 55 | } 56 | } 57 | else { 58 | // non-unit stride 59 | int64_t ix = 0; 60 | for (int64_t i = 0; i < n; ++i) { 61 | real_t tmp = abs1( x[ix] ); 62 | if (tmp > result) { 63 | result = tmp; 64 | index = i; 65 | } 66 | ix += incx; 67 | } 68 | } 69 | return index; 70 | } 71 | 72 | } // namespace blas 73 | 74 | #endif // #ifndef BLAS_IAMAX_HH 75 | -------------------------------------------------------------------------------- /include/blas/mangling.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2017-2023, University of Tennessee. All rights reserved. 2 | // SPDX-License-Identifier: BSD-3-Clause 3 | // This program is free software: you can redistribute it and/or modify it under 4 | // the terms of the BSD 3-Clause license. See the accompanying LICENSE file. 5 | 6 | #ifndef BLAS_MANGLING_H 7 | #define BLAS_MANGLING_H 8 | 9 | #include "blas/defines.h" 10 | 11 | // ----------------------------------------------------------------------------- 12 | // Fortran name mangling depends on compiler. 13 | // Define BLAS_FORTRAN_UPPER for uppercase, 14 | // define BLAS_FORTRAN_LOWER for lowercase (IBM xlf), 15 | // else the default is lowercase with appended underscore 16 | // (GNU gcc, Intel icc, PGI pgfortran, Cray ftn).
17 | #ifndef BLAS_FORTRAN_NAME 18 | #if defined(BLAS_FORTRAN_UPPER) 19 | #define BLAS_FORTRAN_NAME( lower, UPPER ) UPPER 20 | #elif defined(BLAS_FORTRAN_LOWER) 21 | #define BLAS_FORTRAN_NAME( lower, UPPER ) lower 22 | #else 23 | #define BLAS_FORTRAN_NAME( lower, UPPER ) lower##_ 24 | #endif 25 | #endif 26 | 27 | #endif // #ifndef BLAS_MANGLING_H 28 | -------------------------------------------------------------------------------- /include/blas/nrm2.hh: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2017-2023, University of Tennessee. All rights reserved. 2 | // SPDX-License-Identifier: BSD-3-Clause 3 | // This program is free software: you can redistribute it and/or modify it under 4 | // the terms of the BSD 3-Clause license. See the accompanying LICENSE file. 5 | 6 | #ifndef BLAS_NRM2_HH 7 | #define BLAS_NRM2_HH 8 | 9 | #include "blas/util.hh" 10 | 11 | #include 12 | 13 | namespace blas { 14 | 15 | // ============================================================================= 16 | /// @return 2-norm of vector, 17 | /// $|| x ||_2 = (\sum_{i=0}^{n-1} |x_i|^2)^{1/2}$. 18 | /// 19 | /// Generic implementation for arbitrary data types. 20 | /// TODO: generic implementation does not currently scale to avoid over- or underflow. 21 | /// 22 | /// @param[in] n 23 | /// Number of elements in x. n >= 0. 24 | /// 25 | /// @param[in] x 26 | /// The n-element vector x, in an array of length (n-1)*incx + 1. 27 | /// 28 | /// @param[in] incx 29 | /// Stride between elements of x. incx > 0. 
30 | /// 31 | /// @ingroup nrm2 32 | 33 | template 34 | real_type 35 | nrm2( 36 | int64_t n, 37 | T const * x, int64_t incx ) 38 | { 39 | using std::sqrt; 40 | using real_t = real_type; 41 | 42 | // check arguments 43 | blas_error_if( n < 0 ); // standard BLAS returns, doesn't fail 44 | blas_error_if( incx <= 0 ); // standard BLAS returns, doesn't fail 45 | 46 | // todo: scale to avoid overflow & underflow 47 | real_t result = 0; 48 | if (incx == 1) { 49 | // unit stride 50 | for (int64_t i = 0; i < n; ++i) { 51 | result += real(x[i]) * real(x[i]) + imag(x[i]) * imag(x[i]); 52 | } 53 | } 54 | else { 55 | // non-unit stride 56 | int64_t ix = 0; 57 | for (int64_t i = 0; i < n; ++i) { 58 | result += real(x[ix]) * real(x[ix]) + imag(x[ix]) * imag(x[ix]); 59 | ix += incx; 60 | } 61 | } 62 | return sqrt( result ); 63 | } 64 | 65 | } // namespace blas 66 | 67 | #endif // #ifndef BLAS_NRM2_HH 68 | -------------------------------------------------------------------------------- /include/blas/rot.hh: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2017-2023, University of Tennessee. All rights reserved. 2 | // SPDX-License-Identifier: BSD-3-Clause 3 | // This program is free software: you can redistribute it and/or modify it under 4 | // the terms of the BSD 3-Clause license. See the accompanying LICENSE file. 5 | 6 | #ifndef BLAS_ROT_HH 7 | #define BLAS_ROT_HH 8 | 9 | #include "blas/util.hh" 10 | 11 | #include 12 | 13 | namespace blas { 14 | 15 | // ============================================================================= 16 | /// Apply plane rotation: 17 | /// \[ 18 | /// \begin{bmatrix} x^T \\ y^T \end{bmatrix} 19 | /// = \begin{bmatrix} c & s \\ -s & c \end{bmatrix} 20 | /// \begin{bmatrix} x^T \\ y^T \end{bmatrix}. 21 | /// \] 22 | /// 23 | /// @see rotg to generate the rotation. 24 | /// 25 | /// Generic implementation for arbitrary data types. 
26 | /// 27 | /// @param[in] n 28 | /// Number of elements in x and y. n >= 0. 29 | /// 30 | /// @param[in, out] x 31 | /// The n-element vector x, in an array of length (n-1)*abs(incx) + 1. 32 | /// 33 | /// @param[in] incx 34 | /// Stride between elements of x. incx must not be zero. 35 | /// If incx < 0, uses elements of x in reverse order: x(n-1), ..., x(0). 36 | /// 37 | /// @param[in, out] y 38 | /// The n-element vector y, in an array of length (n-1)*abs(incy) + 1. 39 | /// 40 | /// @param[in] incy 41 | /// Stride between elements of y. incy must not be zero. 42 | /// If incy < 0, uses elements of y in reverse order: y(n-1), ..., y(0). 43 | /// 44 | /// @param[in] c 45 | /// Cosine of rotation; real. 46 | /// 47 | /// @param[in] s 48 | /// Sine of rotation; complex. 49 | /// 50 | /// @ingroup rot 51 | 52 | template 53 | void rot( 54 | int64_t n, 55 | TX *x, int64_t incx, 56 | TY *y, int64_t incy, 57 | blas::real_type c, 58 | blas::scalar_type s ) 59 | { 60 | typedef scalar_type scalar_t; 61 | 62 | // check arguments 63 | blas_error_if( n < 0 ); // standard BLAS returns, doesn't fail 64 | blas_error_if( incx == 0 ); 65 | blas_error_if( incy == 0 ); 66 | 67 | scalar_t zero( 0 ); 68 | 69 | // quick return 70 | if (n == 0 || (c == 1 && s == zero)) 71 | return; 72 | 73 | if (incx == 1 && incy == 1) { 74 | // unit stride 75 | for (int64_t i = 0; i < n; ++i) { 76 | scalar_t stmp = c*x[i] + s*y[i]; 77 | y[i] = c*y[i] - conj(s)*x[i]; 78 | x[i] = stmp; 79 | } 80 | } 81 | else { 82 | // non-unit stride 83 | int64_t ix = (incx > 0 ? 0 : (-n + 1)*incx); 84 | int64_t iy = (incy > 0 ? 
0 : (-n + 1)*incy); 85 | for (int64_t i = 0; i < n; ++i) { 86 | scalar_t stmp = c*x[ix] + s*y[iy]; 87 | y[iy] = c*y[iy] - conj(s)*x[ix]; 88 | x[ix] = stmp; 89 | ix += incx; 90 | iy += incy; 91 | } 92 | } 93 | } 94 | 95 | } // namespace blas 96 | 97 | #endif // #ifndef BLAS_ROT_HH 98 | -------------------------------------------------------------------------------- /include/blas/scal.hh: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2017-2023, University of Tennessee. All rights reserved. 2 | // SPDX-License-Identifier: BSD-3-Clause 3 | // This program is free software: you can redistribute it and/or modify it under 4 | // the terms of the BSD 3-Clause license. See the accompanying LICENSE file. 5 | 6 | #ifndef BLAS_SCAL_HH 7 | #define BLAS_SCAL_HH 8 | 9 | #include "blas/util.hh" 10 | 11 | #include 12 | 13 | namespace blas { 14 | 15 | // ============================================================================= 16 | /// Scale vector by constant, $x = \alpha x$. 17 | /// 18 | /// Generic implementation for arbitrary data types. 19 | /// 20 | /// @param[in] n 21 | /// Number of elements in x. n >= 0. 22 | /// 23 | /// @param[in] alpha 24 | /// Scalar alpha. 25 | /// 26 | /// @param[in, out] x 27 | /// The n-element vector x, in an array of length (n-1)*incx + 1. 28 | /// 29 | /// @param[in] incx 30 | /// Stride between elements of x. incx > 0.
31 | /// 32 | /// @ingroup scal 33 | 34 | template 35 | void scal( 36 | int64_t n, 37 | T alpha, 38 | T* x, int64_t incx ) 39 | { 40 | // check arguments 41 | blas_error_if( n < 0 ); // standard BLAS returns, doesn't fail 42 | blas_error_if( incx <= 0 ); // standard BLAS returns, doesn't fail 43 | 44 | if (incx == 1) { 45 | // unit stride 46 | for (int64_t i = 0; i < n; ++i) { 47 | x[i] *= alpha; 48 | } 49 | } 50 | else { 51 | // non-unit stride 52 | for (int64_t i = 0; i < n*incx; i += incx) { 53 | x[i] *= alpha; 54 | } 55 | } 56 | } 57 | 58 | } // namespace blas 59 | 60 | #endif // #ifndef BLAS_SCAL_HH 61 | -------------------------------------------------------------------------------- /include/blas/swap.hh: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2017-2023, University of Tennessee. All rights reserved. 2 | // SPDX-License-Identifier: BSD-3-Clause 3 | // This program is free software: you can redistribute it and/or modify it under 4 | // the terms of the BSD 3-Clause license. See the accompanying LICENSE file. 5 | 6 | #ifndef BLAS_SWAP_HH 7 | #define BLAS_SWAP_HH 8 | 9 | #include "blas/util.hh" 10 | 11 | #include 12 | 13 | namespace blas { 14 | 15 | // ============================================================================= 16 | /// Swap vectors, $x <=> y$. 17 | /// 18 | /// Generic implementation for arbitrary data types. 19 | /// 20 | /// @param[in] n 21 | /// Number of elements in x and y. n >= 0. 22 | /// 23 | /// @param[in, out] x 24 | /// The n-element vector x, in an array of length (n-1)*abs(incx) + 1. 25 | /// 26 | /// @param[in] incx 27 | /// Stride between elements of x. incx must not be zero. 28 | /// If incx < 0, uses elements of x in reverse order: x(n-1), ..., x(0). 29 | /// 30 | /// @param[in, out] y 31 | /// The n-element vector y, in an array of length (n-1)*abs(incy) + 1. 32 | /// 33 | /// @param[in] incy 34 | /// Stride between elements of y. incy must not be zero.
35 | /// If incy < 0, uses elements of y in reverse order: y(n-1), ..., y(0). 36 | /// 37 | /// @ingroup swap 38 | 39 | template 40 | void swap( 41 | int64_t n, 42 | TX *x, int64_t incx, 43 | TY *y, int64_t incy ) 44 | { 45 | using std::swap; 46 | 47 | // check arguments 48 | blas_error_if( n < 0 ); // standard BLAS returns, doesn't fail 49 | blas_error_if( incx == 0 ); // standard BLAS doesn't detect inc[xy] == 0 50 | blas_error_if( incy == 0 ); 51 | 52 | if (incx == 1 && incy == 1) { 53 | // unit stride 54 | for (int64_t i = 0; i < n; ++i) { 55 | swap( x[i], y[i] ); 56 | } 57 | } 58 | else { 59 | // non-unit stride 60 | int64_t ix = (incx > 0 ? 0 : (-n + 1)*incx); 61 | int64_t iy = (incy > 0 ? 0 : (-n + 1)*incy); 62 | for (int64_t i = 0; i < n; ++i) { 63 | swap( x[ix], y[iy] ); 64 | ix += incx; 65 | iy += incy; 66 | } 67 | } 68 | } 69 | 70 | } // namespace blas 71 | 72 | #endif // #ifndef BLAS_SWAP_HH 73 | -------------------------------------------------------------------------------- /lib/pkgconfig/blaspp.pc.in: -------------------------------------------------------------------------------- 1 | prefix=#PREFIX 2 | exec_prefix=${prefix} 3 | libdir=${exec_prefix}/lib 4 | includedir=${prefix}/include 5 | CXX=#CXX 6 | 7 | Name: BLAS++ 8 | Description: C++ API to BLAS (basic linear algebra subroutines) 9 | Version: #VERSION 10 | Cflags: #CXXFLAGS -I${includedir} 11 | # Some platforms require CUDA LIBS to be public. 
12 | Libs: -L${libdir} -Wl,-rpath,${libdir} -lblaspp #LDFLAGS #LIBS 13 | Libs.private: 14 | Requires: 15 | Requires.private: 16 | -------------------------------------------------------------------------------- /make.inc.in: -------------------------------------------------------------------------------- 1 | #------------------------------------------------------------------------------- 2 | # make.inc file 3 | # auto-generated by: @argv@ 4 | # host: @HOSTNAME@ 5 | # CPATH: @CPATH@ 6 | # LIBRARY_PATH: @LIBRARY_PATH@ 7 | # 8 | CXX = @CXX@ 9 | 10 | CXXFLAGS = @CXXFLAGS@ 11 | 12 | # see include/blas/defines.h 13 | # @DEFINES@ 14 | 15 | LDFLAGS = @LDFLAGS@ 16 | 17 | LIBS = @LIBS@ 18 | 19 | prefix = @prefix@ 20 | 21 | static = @static@ 22 | -------------------------------------------------------------------------------- /src/asum.cc: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2017-2023, University of Tennessee. All rights reserved. 2 | // SPDX-License-Identifier: BSD-3-Clause 3 | // This program is free software: you can redistribute it and/or modify it under 4 | // the terms of the BSD 3-Clause license. See the accompanying LICENSE file. 5 | 6 | #include "blas/fortran.h" 7 | #include "blas.hh" 8 | #include "blas_internal.hh" 9 | #include "blas/counter.hh" 10 | 11 | #include 12 | #include 13 | 14 | namespace blas { 15 | 16 | //============================================================================== 17 | namespace internal { 18 | 19 | //------------------------------------------------------------------------------ 20 | /// Low-level overload wrapper calls Fortran, float version. 21 | /// @ingroup asum_internal 22 | inline float asum( 23 | blas_int n, 24 | float const* x, blas_int incx ) 25 | { 26 | return BLAS_sasum( &n, x, &incx ); 27 | } 28 | 29 | //------------------------------------------------------------------------------ 30 | /// Low-level overload wrapper calls Fortran, double version. 
31 | /// @ingroup asum_internal 32 | inline double asum( 33 | blas_int n, 34 | double const* x, blas_int incx ) 35 | { 36 | return BLAS_dasum( &n, x, &incx ); 37 | } 38 | 39 | //------------------------------------------------------------------------------ 40 | /// Low-level overload wrapper calls Fortran, complex version. 41 | /// @ingroup asum_internal 42 | inline float asum( 43 | blas_int n, 44 | std::complex const* x, blas_int incx ) 45 | { 46 | return BLAS_scasum( &n, (blas_complex_float*) x, &incx ); 47 | } 48 | 49 | //------------------------------------------------------------------------------ 50 | /// Low-level overload wrapper calls Fortran, complex version. 51 | /// @ingroup asum_internal 52 | inline double asum( 53 | blas_int n, 54 | std::complex const* x, blas_int incx ) 55 | { 56 | return BLAS_dzasum( &n, (blas_complex_double*) x, &incx ); 57 | } 58 | 59 | } // namespace internal 60 | 61 | //============================================================================== 62 | namespace impl { 63 | 64 | //------------------------------------------------------------------------------ 65 | /// Mid-level templated wrapper checks and converts arguments, 66 | /// then calls low-level wrapper. 
67 | /// @ingroup asum_internal 68 | /// 69 | template 70 | real_type asum( 71 | int64_t n, 72 | scalar_t const* x, int64_t incx ) 73 | { 74 | // check arguments 75 | blas_error_if( n < 0 ); // standard BLAS returns, doesn't fail 76 | blas_error_if( incx <= 0 ); // standard BLAS returns, doesn't fail 77 | 78 | #ifdef BLAS_HAVE_PAPI 79 | // PAPI instrumentation 80 | counter::asum_type element; 81 | memset( &element, 0, sizeof( element ) ); 82 | element = { n }; 83 | counter::insert( element, counter::Id::asum ); 84 | 85 | double gflops = 1e9 * blas::Gflop< scalar_t >::asum( n ); 86 | counter::inc_flop_count( (long long int)gflops ); 87 | #endif 88 | 89 | // convert arguments 90 | blas_int n_ = to_blas_int( n ); 91 | blas_int incx_ = to_blas_int( incx ); 92 | 93 | // call low-level wrapper 94 | return internal::asum( n_, x, incx_ ); 95 | } 96 | 97 | } // namespace impl 98 | 99 | //============================================================================== 100 | // High-level overloaded wrappers call mid-level templated wrapper. 101 | 102 | //------------------------------------------------------------------------------ 103 | /// CPU, float version. 104 | /// @ingroup asum 105 | float asum( 106 | int64_t n, 107 | float const* x, int64_t incx ) 108 | { 109 | return impl::asum( n, x, incx ); 110 | } 111 | 112 | //------------------------------------------------------------------------------ 113 | /// CPU, double version. 114 | /// @ingroup asum 115 | double asum( 116 | int64_t n, 117 | double const* x, int64_t incx ) 118 | { 119 | return impl::asum( n, x, incx ); 120 | } 121 | 122 | //------------------------------------------------------------------------------ 123 | /// CPU, complex version. 
124 | /// @ingroup asum 125 | float asum( 126 | int64_t n, 127 | std::complex const* x, int64_t incx ) 128 | { 129 | return impl::asum( n, x, incx ); 130 | } 131 | 132 | //------------------------------------------------------------------------------ 133 | /// CPU, complex version. 134 | /// @ingroup asum 135 | double asum( 136 | int64_t n, 137 | std::complex const* x, int64_t incx ) 138 | { 139 | return impl::asum( n, x, incx ); 140 | } 141 | 142 | } // namespace blas 143 | -------------------------------------------------------------------------------- /src/blas_internal.hh: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2017-2023, University of Tennessee. All rights reserved. 2 | // SPDX-License-Identifier: BSD-3-Clause 3 | // This program is free software: you can redistribute it and/or modify it under 4 | // the terms of the BSD 3-Clause license. See the accompanying LICENSE file. 5 | 6 | #ifndef BLAS_INTERNAL_HH 7 | #define BLAS_INTERNAL_HH 8 | 9 | #include "blas/util.hh" 10 | 11 | namespace blas { 12 | 13 | //------------------------------------------------------------------------------ 14 | /// @see to_blas_int 15 | /// 16 | inline blas_int to_blas_int_( int64_t x, const char* x_str ) 17 | { 18 | if (sizeof(int64_t) > sizeof(blas_int)) { 19 | blas_error_if_msg( x > std::numeric_limits::max(), "%s", x_str ); 20 | } 21 | return blas_int( x ); 22 | } 23 | 24 | //---------------------------------------- 25 | /// Convert int64_t to blas_int. 26 | /// If blas_int is 64-bit, this does nothing. 27 | /// If blas_int is 32-bit, throws if x > INT_MAX, so conversion would overflow. 28 | /// 29 | /// Note this is in src/blas_internal.hh, so this macro won't pollute 30 | /// the namespace when apps #include . 
31 | /// 32 | #define to_blas_int( x ) to_blas_int_( x, #x ) 33 | 34 | } // namespace blas 35 | 36 | #endif // BLAS_INTERNAL_HH 37 | -------------------------------------------------------------------------------- /src/cuda/device_conj.cu: -------------------------------------------------------------------------------- 1 | #include "blas/device.hh" 2 | #include "thrust/complex.h" 3 | 4 | #if defined(BLAS_HAVE_CUBLAS) 5 | 6 | namespace blas { 7 | 8 | __device__ std::complex conj_convert( 9 | std::complex z) 10 | { 11 | ((cuComplex*) &z)->y *= -1; 12 | return z; 13 | } 14 | 15 | __device__ std::complex conj_convert( 16 | std::complex z) 17 | { 18 | ((cuDoubleComplex*) &z)->y *= -1; 19 | return z; 20 | } 21 | 22 | // Each thread conjugates 1 item 23 | template 24 | __global__ void conj_kernel( 25 | int64_t n, 26 | TS const* src, int64_t inc_src, int64_t i_src, 27 | TD* dst, int64_t inc_dst, int64_t i_dst) 28 | { 29 | using thrust::conj; 30 | 31 | int i = blockIdx.x * blockDim.x + threadIdx.x; 32 | if (i < n) 33 | dst[ i*inc_dst + i_dst ] = conj_convert( src[ i*inc_src + i_src ] ); 34 | } 35 | 36 | //------------------------------------------------------------------------------ 37 | /// Conjugates each element of the vector src and stores in dst. 38 | /// 39 | /// @param[in] n 40 | /// Number of elements in the vector. n >= 0. 41 | /// 42 | /// @param[in] src 43 | /// Pointer to the input vector of length n. 44 | /// 45 | /// @param[in] inc_src 46 | /// Stride between elements of src. inc_src >= 1. 47 | /// 48 | /// @param[out] dst 49 | /// Pointer to output vector 50 | /// On exit, each element dst[i] is updated as dst[i] = conj( src[i] ). 51 | /// dst may be the same as src. 52 | /// 53 | /// @param[in] inc_dst 54 | /// Stride between elements of dst. inc_dst >= 1. 55 | /// 56 | /// @param[in] queue 57 | /// BLAS++ queue to execute in. 
58 | /// 59 | template 60 | void conj( 61 | int64_t n, 62 | TS const* src, int64_t inc_src, 63 | TD* dst, int64_t inc_dst, 64 | blas::Queue& queue ) 65 | { 66 | if (n <= 0) { 67 | return; 68 | } 69 | 70 | const int64_t BlockSize = 128; 71 | 72 | int64_t n_threads = min( BlockSize, n ); 73 | int64_t n_blocks = ceildiv(n, n_threads); 74 | 75 | int64_t i_src = (inc_src > 0 ? 0 : (1 - n) * inc_src); 76 | int64_t i_dst = (inc_dst > 0 ? 0 : (1 - n) * inc_dst); 77 | 78 | blas_dev_call( 79 | cudaSetDevice( queue.device() ) ); 80 | 81 | conj_kernel<<>>( 82 | n, src, inc_src, i_src, dst, inc_dst, i_dst ); 83 | 84 | blas_dev_call( 85 | cudaGetLastError() ); 86 | } 87 | 88 | //------------------------------------------------------------------------------ 89 | // Explicit instantiations. 90 | template void conj( 91 | int64_t n, 92 | std::complex const* src, int64_t inc_src, 93 | std::complex* dst, int64_t inc_dst, 94 | blas::Queue& queue); 95 | 96 | template void conj( 97 | int64_t n, 98 | std::complex const* src, int64_t inc_src, 99 | std::complex* dst, int64_t inc_dst, 100 | blas::Queue& queue); 101 | 102 | } // namespace blas 103 | 104 | #endif // BLAS_HAVE_CUBLAS 105 | -------------------------------------------------------------------------------- /src/cuda/device_shift_vec.cu: -------------------------------------------------------------------------------- 1 | #include "blas/device.hh" 2 | 3 | #if defined(BLAS_HAVE_CUBLAS) 4 | 5 | namespace blas { 6 | 7 | template 8 | __global__ void shift_vec_kernel( 9 | int64_t n, scalar_t* v, 10 | scalar_t c) 11 | { 12 | for (int i = threadIdx.x; i < n; i += blockDim.x) { 13 | v[ i ] += c; 14 | } 15 | } 16 | 17 | //------------------------------------------------------------------------------ 18 | /// Shifts each element of the vector v by a constant value c. 19 | /// 20 | /// @param[in] n 21 | /// Number of elements in the vector. n >= 0. 22 | /// 23 | /// @param[in,out] v 24 | /// Pointer to the vector of length n. 
25 | /// On exit, each element v[i] is updated as v[i] += c. 26 | /// 27 | /// @param[in] c 28 | /// Scalar value to be added to each element of v. 29 | /// 30 | /// @param[in] queue 31 | /// BLAS++ queue to execute in. 32 | /// 33 | template 34 | void shift_vec( 35 | int64_t n, scalar_t* v, 36 | scalar_t c, 37 | blas::Queue& queue) 38 | { 39 | if (n == 0) { 40 | return; 41 | } 42 | 43 | int64_t nthreads = std::min( int64_t( 1024 ), n ); 44 | 45 | blas_dev_call( 46 | cudaSetDevice( queue.device() ) ); 47 | 48 | shift_vec_kernel<<<1, nthreads, 0, queue.stream()>>>( n, v, c ); 49 | 50 | blas_dev_call( 51 | cudaGetLastError() ); 52 | } 53 | 54 | //------------------------------------------------------------------------------ 55 | // Explicit instantiations. 56 | template 57 | void shift_vec( 58 | int64_t n, int64_t* v, 59 | int64_t c, 60 | blas::Queue& queue); 61 | 62 | template 63 | void shift_vec( 64 | int64_t n, int* v, 65 | int c, 66 | blas::Queue& queue); 67 | 68 | } // namespace blas 69 | 70 | #endif // BLAS_HAVE_CUBLAS 71 | -------------------------------------------------------------------------------- /src/device_asum.cc: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2017-2023, University of Tennessee. All rights reserved. 2 | // SPDX-License-Identifier: BSD-3-Clause 3 | // This program is free software: you can redistribute it and/or modify it under 4 | // the terms of the BSD 3-Clause license. See the accompanying LICENSE file. 5 | 6 | #include "blas/device_blas.hh" 7 | #include "blas/counter.hh" 8 | 9 | #include "device_internal.hh" 10 | 11 | #include 12 | 13 | namespace blas { 14 | 15 | // ============================================================================= 16 | namespace impl { 17 | 18 | //------------------------------------------------------------------------------ 19 | /// Mid-level templated wrapper checks and converts arguments, 20 | /// then calls low-level wrapper. 
21 | /// @ingroup asum 22 | /// 23 | template 24 | void asum( 25 | int64_t n, 26 | scalar_t const* x, int64_t incx, 27 | real_type* result, 28 | blas::Queue& queue) 29 | { 30 | #ifndef BLAS_HAVE_DEVICE 31 | throw blas::Error( "device BLAS not available", __func__ ); 32 | #else 33 | // check arguments 34 | blas_error_if( n < 0 ); // standard BLAS returns, doesn't fail 35 | blas_error_if( incx <= 0 ); // standard BLAS returns, doesn't fail 36 | 37 | #ifdef BLAS_HAVE_PAPI 38 | // PAPI instrumentation 39 | counter::dev_asum_type element; 40 | memset( &element, 0, sizeof( element ) ); 41 | element = { n }; 42 | counter::insert( element, counter::Id::dev_asum ); 43 | 44 | double gflops = 1e9 * blas::Gflop< scalar_t >::asum( n ); 45 | counter::inc_flop_count( (long long int)gflops ); 46 | #endif 47 | 48 | // convert arguments 49 | device_blas_int n_ = to_device_blas_int( n ); 50 | device_blas_int incx_ = to_device_blas_int( incx ); 51 | 52 | blas::internal_set_device( queue.device() ); 53 | 54 | // call low-level wrapper 55 | #if defined( BLAS_HAVE_SYCL ) 56 | sycl::queue syclq = queue.stream(); 57 | // check how the result scalar was allocated 58 | auto result_ptr_type = sycl::get_pointer_type( result, syclq.get_context() ); 59 | // if result was outside SYCL/USM memory allocation, use device workspace 60 | if (result_ptr_type == sycl::usm::alloc::unknown) { 61 | // use preallocated device workspace (resizing if needed) 62 | queue.work_ensure_size< char >( sizeof(scalar_t) ); // syncs if needed 63 | real_type* dev_work = (real_type*)queue.work(); 64 | internal::asum( n_, x, incx_, dev_work, queue ); 65 | blas::device_memcpy( result, dev_work, 1, queue ); 66 | } 67 | else { 68 | internal::asum( n_, x, incx_, result, queue ); 69 | } 70 | #else // other devices (CUDA/HIP) 71 | internal::asum( n_, x, incx_, result, queue ); 72 | #endif 73 | #endif 74 | } 75 | 76 | } // namespace impl 77 | 78 | //============================================================================== 
79 | // High-level overloaded wrappers call mid-level templated wrapper. 80 | 81 | //------------------------------------------------------------------------------ 82 | /// GPU device, float version. 83 | /// Unlike CPU version, here `result` is an output parameter, 84 | /// to store the result when the asynchronous execution completes. 85 | /// @ingroup asum 86 | void asum( 87 | int64_t n, 88 | float const* x, int64_t incx, 89 | float* result, 90 | blas::Queue& queue) 91 | { 92 | impl::asum( n, x, incx, result, queue ); 93 | } 94 | 95 | //------------------------------------------------------------------------------ 96 | /// GPU device, double version. 97 | /// @ingroup asum 98 | void asum( 99 | int64_t n, 100 | double const* x, int64_t incx, 101 | double* result, 102 | blas::Queue& queue) 103 | { 104 | impl::asum( n, x, incx, result, queue ); 105 | } 106 | 107 | //------------------------------------------------------------------------------ 108 | /// GPU device, complex version. 109 | /// @ingroup asum 110 | void asum( 111 | int64_t n, 112 | std::complex const *x, int64_t incx, 113 | float* result, 114 | blas::Queue& queue) 115 | { 116 | impl::asum( n, x, incx, result, queue ); 117 | } 118 | 119 | //------------------------------------------------------------------------------ 120 | /// GPU device, complex version. 121 | /// @ingroup asum 122 | void asum( 123 | int64_t n, 124 | std::complex const* x, int64_t incx, 125 | double* result, 126 | blas::Queue& queue) 127 | { 128 | impl::asum( n, x, incx, result, queue ); 129 | } 130 | 131 | } // namespace blas -------------------------------------------------------------------------------- /src/device_axpy.cc: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2017-2023, University of Tennessee. All rights reserved. 
2 | // SPDX-License-Identifier: BSD-3-Clause 3 | // This program is free software: you can redistribute it and/or modify it under 4 | // the terms of the BSD 3-Clause license. See the accompanying LICENSE file. 5 | 6 | #include "blas/device_blas.hh" 7 | #include "blas/counter.hh" 8 | 9 | #include "device_internal.hh" 10 | 11 | #include 12 | 13 | namespace blas { 14 | 15 | //============================================================================== 16 | namespace impl { 17 | 18 | //------------------------------------------------------------------------------ 19 | /// Mid-level templated wrapper checks and converts arguments, 20 | /// then calls low-level wrapper. 21 | /// @ingroup axpy_internal 22 | /// 23 | template 24 | void axpy( 25 | int64_t n, 26 | scalar_t alpha, 27 | scalar_t const* x, int64_t incx, 28 | scalar_t* y, int64_t incy, 29 | blas::Queue& queue) 30 | { 31 | #ifndef BLAS_HAVE_DEVICE 32 | throw blas::Error( "device BLAS not available", __func__ ); 33 | #else 34 | // check arguments 35 | blas_error_if( n < 0 ); // standard BLAS returns, doesn't fail 36 | blas_error_if( incx == 0 ); // standard BLAS doesn't detect inc[xy] == 0 37 | blas_error_if( incy == 0 ); 38 | 39 | // convert arguments 40 | device_blas_int n_ = to_device_blas_int( n ); 41 | device_blas_int incx_ = to_device_blas_int( incx ); 42 | device_blas_int incy_ = to_device_blas_int( incy ); 43 | 44 | blas::internal_set_device( queue.device() ); 45 | 46 | // call low-level wrapper 47 | internal::axpy( n_, alpha, x, incx_, y, incy_, queue ); 48 | #endif 49 | } 50 | 51 | } // namespace impl 52 | 53 | //============================================================================== 54 | // High-level overloaded wrappers call mid-level templated wrapper. 55 | 56 | //------------------------------------------------------------------------------ 57 | /// GPU device, float version. 
58 | /// @ingroup axpy 59 | void axpy( 60 | int64_t n, 61 | float alpha, 62 | float const* x, int64_t incx, 63 | float* y, int64_t incy, 64 | blas::Queue& queue) 65 | { 66 | impl::axpy( n, alpha, x, incx, y, incy, queue ); 67 | } 68 | 69 | //------------------------------------------------------------------------------ 70 | /// GPU device, double version. 71 | /// @ingroup axpy 72 | void axpy( 73 | int64_t n, 74 | double alpha, 75 | double const* x, int64_t incx, 76 | double* y, int64_t incy, 77 | blas::Queue& queue) 78 | { 79 | impl::axpy( n, alpha, x, incx, y, incy, queue ); 80 | } 81 | 82 | //------------------------------------------------------------------------------ 83 | /// GPU device, complex version. 84 | /// @ingroup axpy 85 | void axpy( 86 | int64_t n, 87 | std::complex alpha, 88 | std::complex const* x, int64_t incx, 89 | std::complex* y, int64_t incy, 90 | blas::Queue& queue) 91 | { 92 | impl::axpy( n, alpha, x, incx, y, incy, queue ); 93 | } 94 | 95 | //------------------------------------------------------------------------------ 96 | /// GPU device, complex version. 97 | /// @ingroup axpy 98 | void axpy( 99 | int64_t n, 100 | std::complex alpha, 101 | std::complex const* x, int64_t incx, 102 | std::complex* y, int64_t incy, 103 | blas::Queue& queue) 104 | { 105 | impl::axpy( n, alpha, x, incx, y, incy, queue ); 106 | } 107 | 108 | } // namespace blas 109 | -------------------------------------------------------------------------------- /src/device_copy.cc: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2017-2023, University of Tennessee. All rights reserved. 2 | // SPDX-License-Identifier: BSD-3-Clause 3 | // This program is free software: you can redistribute it and/or modify it under 4 | // the terms of the BSD 3-Clause license. See the accompanying LICENSE file. 
5 | 6 | #include "blas/device_blas.hh" 7 | #include "blas/counter.hh" 8 | 9 | #include "device_internal.hh" 10 | 11 | #include 12 | #include 13 | 14 | namespace blas { 15 | 16 | //============================================================================== 17 | namespace impl { 18 | 19 | //------------------------------------------------------------------------------ 20 | /// Mid-level templated wrapper checks and converts arguments, 21 | /// then calls low-level wrapper. 22 | /// @ingroup copy_internal 23 | /// 24 | template 25 | void copy( 26 | int64_t n, 27 | scalar_t const* x, int64_t incx, 28 | scalar_t* y, int64_t incy, 29 | blas::Queue& queue ) 30 | { 31 | #ifndef BLAS_HAVE_DEVICE 32 | throw blas::Error( "device BLAS not available", __func__ ); 33 | #else 34 | // check arguments 35 | blas_error_if( n < 0 ); // standard BLAS returns, doesn't fail 36 | blas_error_if( incx == 0 ); // standard BLAS doesn't detect inc[xy] == 0 37 | blas_error_if( incy == 0 ); 38 | 39 | #ifdef BLAS_HAVE_PAPI 40 | // PAPI instrumentation 41 | counter::dev_copy_type element; 42 | memset( &element, 0, sizeof( element ) ); 43 | element = { n }; 44 | counter::insert( element, counter::Id::dev_copy ); 45 | 46 | double gflops = 1e9 * blas::Gflop< scalar_t >::copy( n ); 47 | counter::inc_flop_count( (long long int)gflops ); 48 | #endif 49 | 50 | // convert arguments 51 | device_blas_int n_ = to_device_blas_int( n ); 52 | device_blas_int incx_ = to_device_blas_int( incx ); 53 | device_blas_int incy_ = to_device_blas_int( incy ); 54 | 55 | blas::internal_set_device( queue.device() ); 56 | 57 | // call low-level wrapper 58 | internal::copy( n_, x, incx_, y, incy_, queue ); 59 | #endif 60 | } 61 | 62 | } // namespace impl 63 | 64 | //============================================================================== 65 | // High-level overloaded wrappers call mid-level templated wrapper. 
66 | 67 | //------------------------------------------------------------------------------ 68 | /// GPU device, float version. 69 | /// @ingroup copy 70 | void copy( 71 | int64_t n, 72 | float const* x, int64_t incx, 73 | float* y, int64_t incy, 74 | blas::Queue& queue ) 75 | { 76 | impl::copy( n, x, incx, y, incy, queue ); 77 | } 78 | 79 | //------------------------------------------------------------------------------ 80 | /// GPU device, double version. 81 | /// @ingroup copy 82 | void copy( 83 | int64_t n, 84 | double const* x, int64_t incx, 85 | double* y, int64_t incy, 86 | blas::Queue& queue ) 87 | { 88 | impl::copy( n, x, incx, y, incy, queue ); 89 | } 90 | 91 | //------------------------------------------------------------------------------ 92 | /// GPU device, complex version. 93 | /// @ingroup copy 94 | void copy( 95 | int64_t n, 96 | std::complex const* x, int64_t incx, 97 | std::complex* y, int64_t incy, 98 | blas::Queue& queue ) 99 | { 100 | impl::copy( n, x, incx, y, incy, queue ); 101 | } 102 | 103 | //------------------------------------------------------------------------------ 104 | /// GPU device, complex version. 105 | /// @ingroup copy 106 | void copy( 107 | int64_t n, 108 | std::complex const* x, int64_t incx, 109 | std::complex* y, int64_t incy, 110 | blas::Queue& queue ) 111 | { 112 | impl::copy( n, x, incx, y, incy, queue ); 113 | } 114 | 115 | } // namespace blas 116 | -------------------------------------------------------------------------------- /src/device_error.cc: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2017-2023, University of Tennessee. All rights reserved. 2 | // SPDX-License-Identifier: BSD-3-Clause 3 | // This program is free software: you can redistribute it and/or modify it under 4 | // the terms of the BSD 3-Clause license. See the accompanying LICENSE file. 
5 | 6 | #include "blas/device.hh" 7 | 8 | #include "device_internal.hh" 9 | 10 | #ifdef BLAS_HAVE_CUBLAS 11 | 12 | // ----------------------------------------------------------------------------- 13 | // return string of blas error 14 | const char* blas::device_error_string( cublasStatus_t error ) 15 | { 16 | switch (error) { 17 | case CUBLAS_STATUS_SUCCESS: 18 | return "cublas: success"; 19 | 20 | case CUBLAS_STATUS_NOT_INITIALIZED: 21 | return "cublas: not initialized"; 22 | 23 | case CUBLAS_STATUS_ALLOC_FAILED: 24 | return "cublas: out of memory"; 25 | 26 | case CUBLAS_STATUS_INVALID_VALUE: 27 | return "cublas: invalid value"; 28 | 29 | case CUBLAS_STATUS_ARCH_MISMATCH: 30 | return "cublas: architecture mismatch"; 31 | 32 | case CUBLAS_STATUS_MAPPING_ERROR: 33 | return "cublas: memory mapping error"; 34 | 35 | case CUBLAS_STATUS_EXECUTION_FAILED: 36 | return "cublas: execution failed"; 37 | 38 | case CUBLAS_STATUS_INTERNAL_ERROR: 39 | return "cublas: internal error"; 40 | 41 | case CUBLAS_STATUS_NOT_SUPPORTED: 42 | return "cublas: functionality not supported"; 43 | 44 | case CUBLAS_STATUS_LICENSE_ERROR: 45 | return "cublas: license error"; 46 | 47 | default: 48 | return "cublas: unknown error code"; 49 | } 50 | } 51 | 52 | #endif // HAVE_CUBLAS 53 | -------------------------------------------------------------------------------- /src/device_iamax.cc: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2017-2023, University of Tennessee. All rights reserved. 2 | // SPDX-License-Identifier: BSD-3-Clause 3 | // This program is free software: you can redistribute it and/or modify it under 4 | // the terms of the BSD 3-Clause license. See the accompanying LICENSE file. 
5 | 6 | #include "blas/device_blas.hh" 7 | #include "blas/counter.hh" 8 | 9 | #include "device_internal.hh" 10 | 11 | #include 12 | 13 | namespace blas { 14 | 15 | // ============================================================================= 16 | namespace impl { 17 | 18 | //------------------------------------------------------------------------------ 19 | /// Mid-level templated wrapper checks and converts arguments, 20 | /// then calls low-level wrapper. 21 | /// @ingroup iamax 22 | /// 23 | template 24 | void iamax( 25 | int64_t n, 26 | scalar_t const* x, int64_t incx, 27 | int64_t* result, 28 | blas::Queue& queue) 29 | { 30 | #ifndef BLAS_HAVE_DEVICE 31 | throw blas::Error( "device BLAS not available", __func__ ); 32 | #else 33 | // check arguments 34 | blas_error_if( n < 0 ); // standard BLAS returns, doesn't fail 35 | blas_error_if( incx <= 0 ); // standard BLAS returns, doesn't fail 36 | 37 | #ifdef BLAS_HAVE_PAPI 38 | // PAPI instrumentation 39 | counter::dev_iamax_type element; 40 | memset( &element, 0, sizeof( element ) ); 41 | element = { n }; 42 | counter::insert( element, counter::Id::dev_iamax ); 43 | 44 | double gflops = 1e9 * blas::Gflop< scalar_t >::iamax( n ); 45 | counter::inc_flop_count( (long long int)gflops ); 46 | #endif 47 | 48 | blas::internal_set_device( queue.device() ); 49 | 50 | // call low-level wrapper 51 | #if defined( BLAS_HAVE_SYCL ) 52 | sycl::queue syclq = queue.stream(); 53 | // check how the result scalar was allocated 54 | auto result_ptr_type = sycl::get_pointer_type( result, syclq.get_context() ); 55 | // if result was outside SYCL/USM memory allocation, use device workspace 56 | if (result_ptr_type == sycl::usm::alloc::unknown) { 57 | // use preallocated device workspace (resizing if needed) 58 | queue.work_ensure_size< char >( sizeof(int64_t) ); // syncs if needed 59 | int64_t* dev_work = (int64_t*)queue.work(); 60 | internal::iamax( n, x, incx, dev_work, queue ); 61 | blas::device_memcpy( result, dev_work, 1, queue ); 
62 | } 63 | else { 64 | internal::iamax( n, x, incx, result, queue ); 65 | } 66 | #else 67 | internal::iamax( n, x, incx, result, queue ); 68 | #endif 69 | #endif 70 | } 71 | 72 | } // namespace impl 73 | 74 | //============================================================================== 75 | // High-level overloaded wrappers call mid-level templated wrapper. 76 | 77 | //------------------------------------------------------------------------------ 78 | /// GPU device, float version. 79 | /// Unlike CPU version, here `result` is an output parameter, 80 | /// to store the result when the asynchronous execution completes. 81 | /// @ingroup iamax 82 | void iamax( 83 | int64_t n, 84 | float const* x, int64_t incx, 85 | int64_t* result, 86 | blas::Queue& queue) 87 | { 88 | impl::iamax( n, x, incx, result, queue ); 89 | } 90 | 91 | //------------------------------------------------------------------------------ 92 | /// GPU device, double version. 93 | /// @ingroup iamax 94 | void iamax( 95 | int64_t n, 96 | double const* x, int64_t incx, 97 | int64_t* result, 98 | blas::Queue& queue) 99 | { 100 | impl::iamax( n, x, incx, result, queue ); 101 | } 102 | 103 | //------------------------------------------------------------------------------ 104 | /// GPU device, complex version. 105 | /// @ingroup iamax 106 | void iamax( 107 | int64_t n, 108 | std::complex const *x, int64_t incx, 109 | int64_t* result, 110 | blas::Queue& queue) 111 | { 112 | impl::iamax( n, x, incx, result, queue ); 113 | } 114 | 115 | //------------------------------------------------------------------------------ 116 | /// GPU device, complex version. 
117 | /// @ingroup iamax 118 | void iamax( 119 | int64_t n, 120 | std::complex const* x, int64_t incx, 121 | int64_t* result, 122 | blas::Queue& queue) 123 | { 124 | impl::iamax( n, x, incx, result, queue ); 125 | } 126 | 127 | } // namespace blas 128 | -------------------------------------------------------------------------------- /src/device_nrm2.cc: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2017-2023, University of Tennessee. All rights reserved. 2 | // SPDX-License-Identifier: BSD-3-Clause 3 | // This program is free software: you can redistribute it and/or modify it under 4 | // the terms of the BSD 3-Clause license. See the accompanying LICENSE file. 5 | 6 | #include "blas/device_blas.hh" 7 | #include "blas/counter.hh" 8 | 9 | #include "device_internal.hh" 10 | 11 | #include 12 | #include 13 | 14 | namespace blas { 15 | 16 | //============================================================================== 17 | namespace impl { 18 | 19 | //------------------------------------------------------------------------------ 20 | /// Mid-level templated wrapper checks and converts arguments, 21 | /// then calls low-level wrapper. 
22 | /// @ingroup nrm2_internal 23 | /// 24 | template 25 | void nrm2( 26 | int64_t n, 27 | scalar_t const* x, int64_t incx, 28 | real_type* result, 29 | blas::Queue& queue ) 30 | { 31 | #ifndef BLAS_HAVE_DEVICE 32 | throw blas::Error( "device BLAS not available", __func__ ); 33 | #else 34 | // check arguments 35 | blas_error_if( n < 0 ); // standard BLAS returns, doesn't fail 36 | blas_error_if( incx <= 0 ); // standard BLAS returns, doesn't fail 37 | 38 | #ifdef BLAS_HAVE_PAPI 39 | // PAPI instrumentation 40 | counter::dev_nrm2_type element; 41 | memset( &element, 0, sizeof( element ) ); 42 | element = { n }; 43 | counter::insert( element, counter::Id::dev_nrm2 ); 44 | 45 | double gflops = 1e9 * blas::Gflop< scalar_t >::nrm2( n ); 46 | counter::inc_flop_count( (long long int)gflops ); 47 | #endif 48 | 49 | // convert arguments 50 | device_blas_int n_ = to_device_blas_int( n ); 51 | device_blas_int incx_ = to_device_blas_int( incx ); 52 | 53 | blas::internal_set_device( queue.device() ); 54 | 55 | // call low-level wrapper 56 | #if defined( BLAS_HAVE_SYCL ) 57 | sycl::queue syclq = queue.stream(); 58 | // check how the result scalar was allocated 59 | auto result_ptr_type = sycl::get_pointer_type( result, syclq.get_context() ); 60 | // if result was outside SYCL/USM memory allocation, use device workspace 61 | if (result_ptr_type == sycl::usm::alloc::unknown) { 62 | // use preallocated device workspace (resizing if needed) 63 | queue.work_ensure_size< char >( sizeof(scalar_t) ); // syncs if needed 64 | real_type* dev_work = (real_type*)queue.work(); 65 | internal::nrm2( n_, x, incx_, dev_work, queue ); 66 | blas::device_memcpy( result, dev_work, 1, queue ); 67 | } 68 | else { 69 | internal::nrm2( n_, x, incx_, result, queue ); 70 | } 71 | #else // other devices (CUDA/HIP) 72 | internal::nrm2( n_, x, incx_, result, queue ); 73 | #endif 74 | #endif 75 | } 76 | 77 | } // namespace impl 78 | 79 | 
//============================================================================== 80 | // High-level overloaded wrappers call mid-level templated wrapper. 81 | 82 | //------------------------------------------------------------------------------ 83 | /// GPU device, float version. 84 | /// Unlike CPU version, here `result` is an output parameter, 85 | /// to store the result when the asynchronous execution completes. 86 | /// @ingroup nrm2 87 | void nrm2( 88 | int64_t n, 89 | float const* x, int64_t incx, 90 | float* result, 91 | blas::Queue& queue ) 92 | { 93 | impl::nrm2( n, x, incx, result, queue ); 94 | } 95 | 96 | //------------------------------------------------------------------------------ 97 | /// GPU device, double version. 98 | /// @ingroup nrm2 99 | void nrm2( 100 | int64_t n, 101 | double const* x, int64_t incx, 102 | double* result, 103 | blas::Queue& queue ) 104 | { 105 | impl::nrm2( n, x, incx, result, queue ); 106 | } 107 | 108 | //------------------------------------------------------------------------------ 109 | /// GPU device, complex version. 110 | /// @ingroup nrm2 111 | void nrm2( 112 | int64_t n, 113 | std::complex const* x, int64_t incx, 114 | float* result, 115 | blas::Queue& queue ) 116 | { 117 | impl::nrm2( n, x, incx, result, queue ); 118 | } 119 | 120 | //------------------------------------------------------------------------------ 121 | /// GPU device, complex version. 122 | /// @ingroup nrm2 123 | void nrm2( 124 | int64_t n, 125 | std::complex const* x, int64_t incx, 126 | double* result, 127 | blas::Queue& queue ) 128 | { 129 | impl::nrm2( n, x, incx, result, queue ); 130 | } 131 | 132 | } // namespace blas 133 | -------------------------------------------------------------------------------- /src/device_rotg.cc: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2017-2023, University of Tennessee. All rights reserved. 
2 | // SPDX-License-Identifier: BSD-3-Clause 3 | // This program is free software: you can redistribute it and/or modify it under 4 | // the terms of the BSD 3-Clause license. See the accompanying LICENSE file. 5 | 6 | #include "blas/device_blas.hh" 7 | #include "blas/counter.hh" 8 | 9 | #include "device_internal.hh" 10 | 11 | #include 12 | #include 13 | 14 | namespace blas { 15 | 16 | //============================================================================== 17 | namespace impl { 18 | 19 | //------------------------------------------------------------------------------ 20 | /// Mid-level templated wrapper checks and converts arguments, 21 | /// then calls low-level wrapper. 22 | /// @ingroup rotg_internal 23 | /// 24 | template 25 | void rotg( 26 | scalar_t* a, 27 | scalar_t* b, 28 | real_type* c, 29 | scalar_t* s, 30 | blas::Queue& queue ) 31 | { 32 | #ifndef BLAS_HAVE_DEVICE 33 | throw blas::Error( "device BLAS not available", __func__ ); 34 | #else 35 | #ifdef BLAS_HAVE_PAPI 36 | // PAPI instrumentation 37 | counter::dev_rotg_type element; 38 | memset( &element, 0, sizeof( element ) ); 39 | element = { 1 }; 40 | counter::insert( element, counter::Id::dev_rotg ); 41 | 42 | // This operation does not incur significant FLOPs, so no 43 | // need to call counter::inc_flop_count() 44 | #endif 45 | 46 | blas::internal_set_device( queue.device() ); 47 | 48 | // call low-level wrapper 49 | internal::rotg( a, b, c, s, queue ); 50 | #endif 51 | } 52 | 53 | } // namespace impl 54 | 55 | //============================================================================== 56 | // High-level overloaded wrappers call mid-level templated wrapper. 57 | 58 | //------------------------------------------------------------------------------ 59 | /// GPU device, float version. 
60 | /// @ingroup rotg 61 | void rotg( 62 | float *a, 63 | float *b, 64 | float *c, 65 | float *s, 66 | blas::Queue& queue ) 67 | { 68 | impl::rotg( a, b, c, s, queue ); 69 | } 70 | 71 | //------------------------------------------------------------------------------ 72 | /// GPU device, double version. 73 | /// @ingroup rotg 74 | void rotg( 75 | double *a, 76 | double *b, 77 | double *c, 78 | double *s, 79 | blas::Queue& queue ) 80 | { 81 | impl::rotg( a, b, c, s, queue ); 82 | } 83 | 84 | //------------------------------------------------------------------------------ 85 | /// GPU device, complex version. 86 | /// @ingroup rotg 87 | void rotg( 88 | std::complex *a, 89 | std::complex *b, 90 | float *c, 91 | std::complex *s, 92 | blas::Queue& queue ) 93 | { 94 | impl::rotg( a, b, c, s, queue ); 95 | } 96 | 97 | //------------------------------------------------------------------------------ 98 | /// GPU device, complex version. 99 | /// @ingroup rotg 100 | void rotg( 101 | std::complex *a, 102 | std::complex *b, 103 | double *c, 104 | std::complex *s, 105 | blas::Queue& queue ) 106 | { 107 | impl::rotg( a, b, c, s, queue ); 108 | } 109 | 110 | } // namespace blas 111 | -------------------------------------------------------------------------------- /src/device_rotm.cc: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2017-2023, University of Tennessee. All rights reserved. 2 | // SPDX-License-Identifier: BSD-3-Clause 3 | // This program is free software: you can redistribute it and/or modify it under 4 | // the terms of the BSD 3-Clause license. See the accompanying LICENSE file. 
5 | 6 | #include "blas/device_blas.hh" 7 | #include "blas/counter.hh" 8 | 9 | #include "device_internal.hh" 10 | 11 | #include 12 | #include 13 | 14 | namespace blas { 15 | 16 | //============================================================================== 17 | namespace impl { 18 | 19 | //------------------------------------------------------------------------------ 20 | /// Mid-level templated wrapper checks and converts arguments, 21 | /// then calls low-level wrapper. 22 | /// @ingroup rotm_internal 23 | /// 24 | template 25 | void rotm( 26 | int64_t n, 27 | scalar_t* x, int64_t incx, 28 | scalar_t* y, int64_t incy, 29 | const scalar_t* param, 30 | blas::Queue& queue ) 31 | { 32 | #ifndef BLAS_HAVE_DEVICE 33 | throw blas::Error( "device BLAS not available", __func__ ); 34 | #else 35 | // check arguments 36 | blas_error_if( n < 0 ); // standard BLAS returns, doesn't fail 37 | blas_error_if( incx == 0 ); // standard BLAS doesn't detect inc[xy] == 0 38 | blas_error_if( incy == 0 ); 39 | 40 | #ifdef BLAS_HAVE_PAPI 41 | // PAPI instrumentation 42 | counter::dev_rotm_type element; 43 | memset( &element, 0, sizeof( element ) ); 44 | element = { n }; 45 | counter::insert( element, counter::Id::dev_rotm ); 46 | 47 | double gflops = 1e9 * blas::Gflop< scalar_t >::rotm( n ); 48 | counter::inc_flop_count( (long long int)gflops ); 49 | #endif 50 | 51 | // convert arguments 52 | device_blas_int n_ = to_device_blas_int( n ); 53 | device_blas_int incx_ = to_device_blas_int( incx ); 54 | device_blas_int incy_ = to_device_blas_int( incy ); 55 | 56 | blas::internal_set_device( queue.device() ); 57 | 58 | // call low-level wrapper 59 | internal::rotm( n_, x, incx_, y, incy_, param, queue ); 60 | #endif 61 | } 62 | 63 | } // namespace impl 64 | 65 | //============================================================================== 66 | // High-level overloaded wrappers call mid-level templated wrapper. 67 | // Not available for complex. 
68 | 69 | //------------------------------------------------------------------------------ 70 | /// GPU device, float version. 71 | /// @ingroup rotm 72 | void rotm( 73 | int64_t n, 74 | float* x, int64_t incx, 75 | float*y, int64_t incy, 76 | const float* param, 77 | blas::Queue& queue ) 78 | { 79 | impl::rotm( n, x, incx, y, incy, param, queue ); 80 | } 81 | 82 | //------------------------------------------------------------------------------ 83 | /// GPU device, double version. 84 | /// @ingroup rotm 85 | void rotm( 86 | int64_t n, 87 | double* x, int64_t incx, 88 | double* y, int64_t incy, 89 | const double* param, 90 | blas::Queue& queue ) 91 | { 92 | impl::rotm( n, x, incx, y, incy, param, queue ); 93 | } 94 | 95 | } // namespace blas 96 | -------------------------------------------------------------------------------- /src/device_rotmg.cc: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2017-2023, University of Tennessee. All rights reserved. 2 | // SPDX-License-Identifier: BSD-3-Clause 3 | // This program is free software: you can redistribute it and/or modify it under 4 | // the terms of the BSD 3-Clause license. See the accompanying LICENSE file. 5 | 6 | #include "blas/device_blas.hh" 7 | #include "blas/counter.hh" 8 | 9 | #include "device_internal.hh" 10 | 11 | #include 12 | #include 13 | 14 | namespace blas { 15 | 16 | //============================================================================== 17 | namespace impl { 18 | 19 | //------------------------------------------------------------------------------ 20 | /// Mid-level templated wrapper checks and converts arguments, 21 | /// then calls low-level wrapper. 
22 | /// @ingroup rotmg_internal 23 | /// 24 | template 25 | void rotmg( 26 | scalar_t* d1, 27 | scalar_t* d2, 28 | scalar_t* x1, 29 | scalar_t* y1, 30 | scalar_t* param, 31 | blas::Queue& queue ) 32 | { 33 | #ifndef BLAS_HAVE_DEVICE 34 | throw blas::Error( "device BLAS not available", __func__ ); 35 | #else 36 | #ifdef BLAS_HAVE_PAPI 37 | // PAPI instrumentation 38 | counter::dev_rotmg_type element; 39 | memset( &element, 0, sizeof( element ) ); 40 | element = { 1 }; 41 | counter::insert( element, counter::Id::dev_rotmg ); 42 | 43 | // This operation does not incur significant FLOPs, so no 44 | // need to call counter::inc_flop_count() 45 | #endif 46 | 47 | blas::internal_set_device( queue.device() ); 48 | 49 | // call low-level wrapper 50 | internal::rotmg( d1, d2, x1, y1, param, queue ); 51 | #endif 52 | } 53 | 54 | } // namespace impl 55 | 56 | //============================================================================== 57 | // High-level overloaded wrappers call mid-level templated wrapper. 58 | // Not available for complex. 59 | 60 | //------------------------------------------------------------------------------ 61 | /// GPU device, float version. 62 | /// @ingroup rotmg 63 | void rotmg( 64 | float* d1, 65 | float* d2, 66 | float* x1, 67 | float* y1, 68 | float* param, 69 | blas::Queue& queue ) 70 | { 71 | impl::rotmg( d1, d2, x1, y1, param, queue ); 72 | } 73 | 74 | //------------------------------------------------------------------------------ 75 | /// GPU device, double version. 
76 | /// @ingroup rotmg 77 | void rotmg( 78 | double* d1, 79 | double* d2, 80 | double* x1, 81 | double* y1, 82 | double* param, 83 | blas::Queue& queue ) 84 | { 85 | impl::rotmg( d1, d2, x1, y1, param, queue ); 86 | } 87 | 88 | } // namespace blas 89 | -------------------------------------------------------------------------------- /src/device_scal.cc: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2017-2023, University of Tennessee. All rights reserved. 2 | // SPDX-License-Identifier: BSD-3-Clause 3 | // This program is free software: you can redistribute it and/or modify it under 4 | // the terms of the BSD 3-Clause license. See the accompanying LICENSE file. 5 | 6 | #include "blas/device_blas.hh" 7 | #include "blas/counter.hh" 8 | 9 | #include "device_internal.hh" 10 | 11 | #include 12 | #include 13 | 14 | namespace blas { 15 | 16 | //============================================================================== 17 | namespace impl { 18 | 19 | //------------------------------------------------------------------------------ 20 | /// Mid-level templated wrapper checks and converts arguments, 21 | /// then calls low-level wrapper. 
22 | /// @ingroup scal_internal 23 | /// 24 | template 25 | void scal( 26 | int64_t n, 27 | scalar_t alpha, 28 | scalar_t* x, int64_t incx, 29 | blas::Queue& queue ) 30 | { 31 | #ifndef BLAS_HAVE_DEVICE 32 | throw blas::Error( "device BLAS not available", __func__ ); 33 | #else 34 | // check arguments 35 | blas_error_if( n < 0 ); // standard BLAS returns, doesn't fail 36 | blas_error_if( incx <= 0 ); // standard BLAS returns, doesn't fail 37 | 38 | #ifdef BLAS_HAVE_PAPI 39 | // PAPI instrumentation 40 | counter::dev_scal_type element; 41 | memset( &element, 0, sizeof( element ) ); 42 | element = { n }; 43 | counter::insert( element, counter::Id::dev_scal ); 44 | 45 | double gflops = 1e9 * blas::Gflop< scalar_t >::scal( n ); 46 | counter::inc_flop_count( (long long int)gflops ); 47 | #endif 48 | 49 | // convert arguments 50 | device_blas_int n_ = to_device_blas_int( n ); 51 | device_blas_int incx_ = to_device_blas_int( incx ); 52 | 53 | blas::internal_set_device( queue.device() ); 54 | 55 | // call low-level wrapper 56 | internal::scal( n_, alpha, x, incx_, queue ); 57 | #endif 58 | } 59 | 60 | } // namespace impl 61 | 62 | //============================================================================== 63 | // High-level overloaded wrappers call mid-level templated wrapper. 64 | 65 | //------------------------------------------------------------------------------ 66 | /// GPU device, float version. 67 | /// @ingroup scal 68 | void scal( 69 | int64_t n, 70 | float alpha, 71 | float* x, int64_t incx, 72 | blas::Queue& queue ) 73 | { 74 | impl::scal( n, alpha, x, incx, queue ); 75 | } 76 | 77 | //------------------------------------------------------------------------------ 78 | /// GPU device, double version. 
79 | /// @ingroup scal 80 | void scal( 81 | int64_t n, 82 | double alpha, 83 | double* x, int64_t incx, 84 | blas::Queue& queue ) 85 | { 86 | impl::scal( n, alpha, x, incx, queue ); 87 | } 88 | 89 | //------------------------------------------------------------------------------ 90 | /// GPU device, complex version. 91 | /// @ingroup scal 92 | void scal( 93 | int64_t n, 94 | std::complex alpha, 95 | std::complex* x, int64_t incx, 96 | blas::Queue& queue ) 97 | { 98 | impl::scal( n, alpha, x, incx, queue ); 99 | } 100 | 101 | //------------------------------------------------------------------------------ 102 | /// GPU device, complex version. 103 | /// @ingroup scal 104 | void scal( 105 | int64_t n, 106 | std::complex alpha, 107 | std::complex* x, int64_t incx, 108 | blas::Queue& queue ) 109 | { 110 | impl::scal( n, alpha, x, incx, queue ); 111 | } 112 | 113 | } // namespace blas 114 | -------------------------------------------------------------------------------- /src/device_swap.cc: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2017-2023, University of Tennessee. All rights reserved. 2 | // SPDX-License-Identifier: BSD-3-Clause 3 | // This program is free software: you can redistribute it and/or modify it under 4 | // the terms of the BSD 3-Clause license. See the accompanying LICENSE file. 5 | 6 | #include "blas/device_blas.hh" 7 | #include "blas/counter.hh" 8 | 9 | #include "device_internal.hh" 10 | 11 | #include 12 | #include 13 | 14 | namespace blas { 15 | 16 | //============================================================================== 17 | namespace impl { 18 | 19 | //------------------------------------------------------------------------------ 20 | /// Mid-level templated wrapper checks and converts arguments, 21 | /// then calls low-level wrapper. 
22 | /// @ingroup swap_internal 23 | /// 24 | template 25 | void swap( 26 | int64_t n, 27 | scalar_t* x, int64_t incx, 28 | scalar_t* y, int64_t incy, 29 | blas::Queue& queue ) 30 | { 31 | #ifndef BLAS_HAVE_DEVICE 32 | throw blas::Error( "device BLAS not available", __func__ ); 33 | #else 34 | // check arguments 35 | blas_error_if( n < 0 ); // standard BLAS returns, doesn't fail 36 | blas_error_if( incx == 0 ); // standard BLAS doesn't detect inc[xy] == 0 37 | blas_error_if( incy == 0 ); 38 | 39 | #ifdef BLAS_HAVE_PAPI 40 | // PAPI instrumentation 41 | counter::dev_swap_type element; 42 | memset( &element, 0, sizeof( element ) ); 43 | element = { n }; 44 | counter::insert( element, counter::Id::dev_swap ); 45 | 46 | double gflops = 1e9 * blas::Gflop< scalar_t >::swap( n ); 47 | counter::inc_flop_count( (long long int)gflops ); 48 | #endif 49 | 50 | // convert arguments 51 | device_blas_int n_ = to_device_blas_int( n ); 52 | device_blas_int incx_ = to_device_blas_int( incx ); 53 | device_blas_int incy_ = to_device_blas_int( incy ); 54 | 55 | blas::internal_set_device( queue.device() ); 56 | 57 | // call low-level wrapper 58 | internal::swap( n_, x, incx_, y, incy_, queue ); 59 | #endif 60 | } 61 | 62 | } // namespace impl 63 | 64 | //============================================================================== 65 | // High-level overloaded wrappers call mid-level templated wrapper. 66 | 67 | //------------------------------------------------------------------------------ 68 | /// GPU device, float version. 69 | /// @ingroup swap 70 | void swap( 71 | int64_t n, 72 | float* x, int64_t incx, 73 | float* y, int64_t incy, 74 | blas::Queue& queue ) 75 | { 76 | impl::swap( n, x, incx, y, incy, queue ); 77 | } 78 | 79 | //------------------------------------------------------------------------------ 80 | /// GPU device, double version. 
81 | /// @ingroup swap 82 | void swap( 83 | int64_t n, 84 | double* x, int64_t incx, 85 | double* y, int64_t incy, 86 | blas::Queue& queue ) 87 | { 88 | impl::swap( n, x, incx, y, incy, queue ); 89 | } 90 | 91 | //------------------------------------------------------------------------------ 92 | /// GPU device, complex version. 93 | /// @ingroup swap 94 | void swap( 95 | int64_t n, 96 | std::complex* x, int64_t incx, 97 | std::complex* y, int64_t incy, 98 | blas::Queue& queue ) 99 | { 100 | impl::swap( n, x, incx, y, incy, queue ); 101 | } 102 | 103 | //------------------------------------------------------------------------------ 104 | /// GPU device, complex version. 105 | /// @ingroup swap 106 | void swap( 107 | int64_t n, 108 | std::complex* x, int64_t incx, 109 | std::complex* y, int64_t incy, 110 | blas::Queue& queue ) 111 | { 112 | impl::swap( n, x, incx, y, incy, queue ); 113 | } 114 | 115 | } // namespace blas 116 | -------------------------------------------------------------------------------- /src/device_syr.cc: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2017-2023, University of Tennessee. All rights reserved. 2 | // SPDX-License-Identifier: BSD-3-Clause 3 | // This program is free software: you can redistribute it and/or modify it under 4 | // the terms of the BSD 3-Clause license. See the accompanying LICENSE file. 5 | 6 | #include "blas/device_blas.hh" 7 | #include "blas/counter.hh" 8 | 9 | #include "device_internal.hh" 10 | 11 | #include 12 | #include 13 | 14 | namespace blas { 15 | 16 | //============================================================================== 17 | namespace impl { 18 | 19 | //------------------------------------------------------------------------------ 20 | /// Mid-level templated wrapper checks and converts arguments, 21 | /// then calls low-level wrapper. 
22 | /// @ingroup syr_internal 23 | /// 24 | template 25 | void syr( 26 | blas::Layout layout, 27 | blas::Uplo uplo, 28 | int64_t n, 29 | scalar_t alpha, 30 | scalar_t const* x, int64_t incx, 31 | scalar_t* A, int64_t lda, 32 | blas::Queue& queue ) 33 | { 34 | #ifndef BLAS_HAVE_DEVICE 35 | throw blas::Error( "device BLAS not available", __func__ ); 36 | #else 37 | // check arguments 38 | blas_error_if( layout != Layout::ColMajor && 39 | layout != Layout::RowMajor ); 40 | blas_error_if( uplo != Uplo::Lower && 41 | uplo != Uplo::Upper ); 42 | blas_error_if( n < 0 ); 43 | blas_error_if( lda < n ); 44 | blas_error_if( incx == 0 ); 45 | 46 | #ifdef BLAS_HAVE_PAPI 47 | // PAPI instrumentation 48 | counter::dev_syr_type element; 49 | memset( &element, 0, sizeof( element ) ); 50 | element = { uplo, n }; 51 | counter::insert( element, counter::Id::dev_syr ); 52 | 53 | double gflops = 1e9 * blas::Gflop< scalar_t >::syr( n ); 54 | counter::inc_flop_count( (long long int)gflops ); 55 | #endif 56 | 57 | // convert arguments 58 | device_blas_int n_ = to_device_blas_int( n ); 59 | device_blas_int lda_ = to_device_blas_int( lda ); 60 | device_blas_int incx_ = to_device_blas_int( incx ); 61 | 62 | blas::internal_set_device( queue.device() ); 63 | 64 | if (layout == Layout::RowMajor) { 65 | // swap lower <=> upper 66 | uplo = (uplo == Uplo::Lower ? Uplo::Upper : Uplo::Lower); 67 | } 68 | queue.sync(); 69 | 70 | // call low-level wrapper 71 | internal::syr( uplo, n_, 72 | alpha, x, incx_, A, lda_, queue ); 73 | #endif 74 | } 75 | 76 | } // namespace impl 77 | 78 | //============================================================================== 79 | // High-level overloaded wrappers call mid-level templated wrapper. 80 | 81 | //------------------------------------------------------------------------------ 82 | /// GPU device, float version. 
83 | /// @ingroup syr 84 | void syr( 85 | blas::Layout layout, 86 | blas::Uplo uplo, 87 | int64_t n, 88 | float alpha, 89 | float const* x, int64_t incx, 90 | float* A, int64_t lda, 91 | blas::Queue& queue ) 92 | { 93 | impl::syr( layout, uplo, n, 94 | alpha, x, incx, A, lda, queue ); 95 | } 96 | 97 | //------------------------------------------------------------------------------ 98 | /// GPU device, double version. 99 | /// @ingroup syr 100 | void syr( 101 | blas::Layout layout, 102 | blas::Uplo uplo, 103 | int64_t n, 104 | double alpha, 105 | double const* x, int64_t incx, 106 | double* A, int64_t lda, 107 | blas::Queue& queue ) 108 | { 109 | impl::syr( layout, uplo, n, 110 | alpha, x, incx, A, lda, queue ); 111 | } 112 | 113 | //------------------------------------------------------------------------------ 114 | /// GPU device, complex version. 115 | /// @ingroup syr 116 | void syr( 117 | blas::Layout layout, 118 | blas::Uplo uplo, 119 | int64_t n, 120 | std::complex alpha, 121 | std::complex const* x, int64_t incx, 122 | std::complex* A, int64_t lda, 123 | blas::Queue& queue ) 124 | { 125 | impl::syr( layout, uplo, n, 126 | alpha, x, incx, A, lda, queue ); 127 | } 128 | 129 | //------------------------------------------------------------------------------ 130 | /// GPU device, complex version. 
131 | /// @ingroup syr 132 | void syr( 133 | blas::Layout layout, 134 | blas::Uplo uplo, 135 | int64_t n, 136 | std::complex alpha, 137 | std::complex const* x, int64_t incx, 138 | std::complex* A, int64_t lda, 139 | blas::Queue& queue ) 140 | { 141 | impl::syr( layout, uplo, n, 142 | alpha, x, incx, A, lda, queue ); 143 | } 144 | 145 | } // namespace blas 146 | -------------------------------------------------------------------------------- /src/hip/device_conj.hip: -------------------------------------------------------------------------------- 1 | #include "blas/device.hh" 2 | #include 3 | 4 | #if defined(BLAS_HAVE_ROCBLAS) 5 | 6 | namespace blas { 7 | 8 | __device__ std::complex conj_convert( 9 | std::complex z) 10 | { 11 | hipFloatComplex res = hipConjf(*(hipFloatComplex*) &z); 12 | return *(std::complex*) &res; 13 | } 14 | 15 | __device__ std::complex conj_convert( 16 | std::complex z) 17 | { 18 | hipDoubleComplex res = hipConj(*(hipDoubleComplex*) &z); 19 | return *(std::complex*) &res; 20 | } 21 | 22 | template 23 | __global__ void conj_kernel( 24 | int64_t n, 25 | TS const* src, int64_t inc_src, int64_t i_src, 26 | TD* dst, int64_t inc_dst, int64_t i_dst) 27 | { 28 | int i = blockIdx.x * blockDim.x + threadIdx.x; 29 | if (i < n) 30 | dst[ i*inc_dst + i_dst ] = conj_convert( src[ i*inc_src + i_src ] ); 31 | } 32 | 33 | template 34 | void conj( 35 | int64_t n, 36 | TS const* src, int64_t inc_src, 37 | TD* dst, int64_t inc_dst, 38 | blas::Queue& queue ) 39 | { 40 | if (n <= 0) { 41 | return; 42 | } 43 | 44 | const int BlockSize = 128; 45 | 46 | int64_t n_threads = std::min( int64_t( BlockSize ), n ); 47 | int64_t n_blocks = ceildiv(n, n_threads); 48 | 49 | int64_t i_src = (inc_src > 0 ? 0 : (1 - n) * inc_src); 50 | int64_t i_dst = (inc_dst > 0 ? 
0 : (1 - n) * inc_dst); 51 | 52 | blas_dev_call( 53 | hipSetDevice( queue.device() ) ); 54 | 55 | conj_kernel<<>>( 56 | n, src, inc_src, i_src, dst, inc_dst, i_dst ); 57 | 58 | blas_dev_call( 59 | hipGetLastError() ); 60 | } 61 | 62 | //------------------------------------------------------------------------------ 63 | // Explicit instantiations. 64 | template void conj( 65 | int64_t n, 66 | std::complex const* src, int64_t inc_src, 67 | std::complex* dst, int64_t inc_dst, 68 | blas::Queue& queue); 69 | 70 | template void conj( 71 | int64_t n, 72 | std::complex const* src, int64_t inc_src, 73 | std::complex* dst, int64_t inc_dst, 74 | blas::Queue& queue); 75 | 76 | } // namespace blas 77 | 78 | #endif // BLAS_HAVE_ROCBLAS 79 | -------------------------------------------------------------------------------- /src/hip/device_shift_vec.hip: -------------------------------------------------------------------------------- 1 | #include "hip/hip_runtime.h" 2 | #include "blas/device.hh" 3 | 4 | #if defined(BLAS_HAVE_ROCBLAS) 5 | 6 | namespace blas { 7 | 8 | template 9 | __device__ void shift_vec_func( 10 | int64_t n, scalar_t* v, 11 | scalar_t c) 12 | { 13 | for (int i = threadIdx.x; i < n; i += blockDim.x) { 14 | v[ i ] += c; 15 | } 16 | } 17 | 18 | template 19 | __global__ void shift_vec_kernel( 20 | int64_t n, scalar_t* v, 21 | scalar_t c) 22 | { 23 | shift_vec_func( n, v, c ); 24 | } 25 | 26 | template 27 | void shift_vec( 28 | int64_t n, scalar_t* v, 29 | scalar_t c, 30 | blas::Queue& queue) 31 | { 32 | if (n == 0) { 33 | return; 34 | } 35 | 36 | int64_t nthreads = std::min( int64_t( 1024 ), n ); 37 | 38 | blas_dev_call( 39 | hipSetDevice( queue.device() ) ); 40 | 41 | shift_vec_kernel<<<1, nthreads, 0, queue.stream()>>>( 42 | n, v, c); 43 | 44 | blas_dev_call( 45 | hipGetLastError() ); 46 | } 47 | 48 | //------------------------------------------------------------------------------ 49 | // Explicit instantiations. 
50 | template 51 | void shift_vec( 52 | int64_t n, int64_t* v, 53 | int64_t c, 54 | blas::Queue& queue); 55 | 56 | } // namespace blas 57 | 58 | #endif // BLAS_HAVE_ROCBLAS 59 | -------------------------------------------------------------------------------- /src/iamax.cc: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2017-2023, University of Tennessee. All rights reserved. 2 | // SPDX-License-Identifier: BSD-3-Clause 3 | // This program is free software: you can redistribute it and/or modify it under 4 | // the terms of the BSD 3-Clause license. See the accompanying LICENSE file. 5 | 6 | #include "blas/fortran.h" 7 | #include "blas.hh" 8 | #include "blas_internal.hh" 9 | #include "blas/counter.hh" 10 | 11 | #include 12 | #include 13 | 14 | namespace blas { 15 | 16 | //============================================================================== 17 | namespace internal { 18 | 19 | //------------------------------------------------------------------------------ 20 | /// Low-level overload wrapper calls Fortran, float version. 21 | /// @ingroup iamax_internal 22 | inline blas_int iamax( 23 | blas_int n, 24 | float const* x, blas_int incx ) 25 | { 26 | return BLAS_isamax( &n, x, &incx ); 27 | } 28 | 29 | //------------------------------------------------------------------------------ 30 | /// Low-level overload wrapper calls Fortran, double version. 31 | /// @ingroup iamax_internal 32 | inline blas_int iamax( 33 | blas_int n, 34 | double const* x, blas_int incx ) 35 | { 36 | return BLAS_idamax( &n, x, &incx ); 37 | } 38 | 39 | //------------------------------------------------------------------------------ 40 | /// Low-level overload wrapper calls Fortran, complex version. 
41 | /// @ingroup iamax_internal 42 | inline blas_int iamax( 43 | blas_int n, 44 | std::complex const* x, blas_int incx ) 45 | { 46 | return BLAS_icamax( &n, 47 | (blas_complex_float*) x, &incx ); 48 | } 49 | 50 | //------------------------------------------------------------------------------ 51 | /// Low-level overload wrapper calls Fortran, complex version. 52 | /// @ingroup iamax_internal 53 | inline blas_int iamax( 54 | blas_int n, 55 | std::complex const* x, blas_int incx ) 56 | { 57 | return BLAS_izamax( &n, 58 | (blas_complex_double*) x, &incx ); 59 | } 60 | 61 | } // namespace internal 62 | 63 | //============================================================================== 64 | namespace impl { 65 | 66 | //------------------------------------------------------------------------------ 67 | /// Mid-level templated wrapper checks and converts arguments, 68 | /// then calls low-level wrapper. 69 | /// @ingroup iamax_internal 70 | /// 71 | template 72 | int64_t iamax( 73 | int64_t n, 74 | scalar_t const* x, int64_t incx ) 75 | { 76 | // check arguments 77 | blas_error_if( n < 0 ); // standard BLAS returns, doesn't fail 78 | blas_error_if( incx <= 0 ); // standard BLAS returns, doesn't fail 79 | 80 | #ifdef BLAS_HAVE_PAPI 81 | // PAPI instrumentation 82 | counter::iamax_type element; 83 | memset( &element, 0, sizeof( element ) ); 84 | element = { n }; 85 | counter::insert( element, counter::Id::iamax ); 86 | 87 | double gflops = 1e9 * blas::Gflop< scalar_t >::iamax( n ); 88 | counter::inc_flop_count( (long long int)gflops ); 89 | #endif 90 | 91 | // convert arguments 92 | blas_int n_ = to_blas_int( n ); 93 | blas_int incx_ = to_blas_int( incx ); 94 | 95 | // call low-level wrapper 96 | return internal::iamax( n_, x, incx_ ) - 1; 97 | } 98 | 99 | } // namespace impl 100 | 101 | //============================================================================== 102 | // High-level overloaded wrappers call mid-level templated wrapper. 
103 | 104 | //------------------------------------------------------------------------------ 105 | /// CPU, float version. 106 | /// @ingroup iamax 107 | int64_t iamax( 108 | int64_t n, 109 | float const* x, int64_t incx ) 110 | { 111 | return impl::iamax( n, x, incx ); 112 | } 113 | 114 | //------------------------------------------------------------------------------ 115 | /// CPU, double version. 116 | /// @ingroup iamax 117 | int64_t iamax( 118 | int64_t n, 119 | double const* x, int64_t incx ) 120 | { 121 | return impl::iamax( n, x, incx ); 122 | } 123 | 124 | //------------------------------------------------------------------------------ 125 | /// CPU, complex version. 126 | /// @ingroup iamax 127 | int64_t iamax( 128 | int64_t n, 129 | std::complex const* x, int64_t incx ) 130 | { 131 | return impl::iamax( n, x, incx ); 132 | } 133 | 134 | //------------------------------------------------------------------------------ 135 | /// CPU, complex version. 136 | /// @ingroup iamax 137 | int64_t iamax( 138 | int64_t n, 139 | std::complex const* x, int64_t incx ) 140 | { 141 | return impl::iamax( n, x, incx ); 142 | } 143 | 144 | } // namespace blas 145 | -------------------------------------------------------------------------------- /src/nrm2.cc: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2017-2023, University of Tennessee. All rights reserved. 2 | // SPDX-License-Identifier: BSD-3-Clause 3 | // This program is free software: you can redistribute it and/or modify it under 4 | // the terms of the BSD 3-Clause license. See the accompanying LICENSE file. 
5 | 6 | #include "blas/fortran.h" 7 | #include "blas.hh" 8 | #include "blas_internal.hh" 9 | #include "blas/counter.hh" 10 | 11 | #include 12 | #include 13 | 14 | namespace blas { 15 | 16 | //============================================================================== 17 | namespace internal { 18 | 19 | //------------------------------------------------------------------------------ 20 | /// Low-level overload wrapper calls Fortran, float version. 21 | /// @ingroup nrm2_internal 22 | inline float nrm2( 23 | blas_int n, 24 | float const* x, blas_int incx ) 25 | { 26 | return BLAS_snrm2( &n, x, &incx ); 27 | } 28 | 29 | //------------------------------------------------------------------------------ 30 | /// Low-level overload wrapper calls Fortran, double version. 31 | /// @ingroup nrm2_internal 32 | inline double nrm2( 33 | blas_int n, 34 | double const* x, blas_int incx ) 35 | { 36 | return BLAS_dnrm2( &n, x, &incx ); 37 | } 38 | 39 | //------------------------------------------------------------------------------ 40 | /// Low-level overload wrapper calls Fortran, complex version. 41 | /// @ingroup nrm2_internal 42 | inline float nrm2( 43 | blas_int n, 44 | std::complex const* x, blas_int incx ) 45 | { 46 | return BLAS_scnrm2( &n, (blas_complex_float*) x, &incx ); 47 | } 48 | 49 | //------------------------------------------------------------------------------ 50 | /// Low-level overload wrapper calls Fortran, complex version. 51 | /// @ingroup nrm2_internal 52 | inline double nrm2( 53 | blas_int n, 54 | std::complex const* x, blas_int incx ) 55 | { 56 | return BLAS_dznrm2( &n, (blas_complex_double*) x, &incx ); 57 | } 58 | 59 | } // namespace internal 60 | 61 | //============================================================================== 62 | namespace impl { 63 | 64 | //------------------------------------------------------------------------------ 65 | /// Mid-level templated wrapper checks and converts arguments, 66 | /// then calls low-level wrapper. 
67 | /// @ingroup nrm2_internal 68 | /// 69 | template 70 | real_type nrm2( 71 | int64_t n, 72 | scalar_t const* x, int64_t incx ) 73 | { 74 | // check arguments 75 | blas_error_if( n < 0 ); // standard BLAS returns, doesn't fail 76 | blas_error_if( incx <= 0 ); // standard BLAS returns, doesn't fail 77 | 78 | #ifdef BLAS_HAVE_PAPI 79 | // PAPI instrumentation 80 | counter::nrm2_type element; 81 | memset( &element, 0, sizeof( element ) ); 82 | element = { n }; 83 | counter::insert( element, counter::Id::nrm2 ); 84 | 85 | double gflops = 1e9 * blas::Gflop< scalar_t >::nrm2( n ); 86 | counter::inc_flop_count( (long long int)gflops ); 87 | #endif 88 | 89 | // convert arguments 90 | blas_int n_ = to_blas_int( n ); 91 | blas_int incx_ = to_blas_int( incx ); 92 | 93 | // call low-level wrapper 94 | return internal::nrm2( n_, x, incx_ ); 95 | } 96 | 97 | } // namespace impl 98 | 99 | //============================================================================== 100 | // High-level overloaded wrappers call mid-level templated wrapper. 101 | 102 | //------------------------------------------------------------------------------ 103 | /// CPU, float version. 104 | /// @ingroup nrm2 105 | float nrm2( 106 | int64_t n, 107 | float const* x, int64_t incx ) 108 | { 109 | return impl::nrm2( n, x, incx ); 110 | } 111 | 112 | //------------------------------------------------------------------------------ 113 | /// CPU, double version. 114 | /// @ingroup nrm2 115 | double nrm2( 116 | int64_t n, 117 | double const* x, int64_t incx ) 118 | { 119 | return impl::nrm2( n, x, incx ); 120 | } 121 | 122 | //------------------------------------------------------------------------------ 123 | /// CPU, complex version. 
124 | /// @ingroup nrm2 125 | float nrm2( 126 | int64_t n, 127 | std::complex const* x, int64_t incx ) 128 | { 129 | return impl::nrm2( n, x, incx ); 130 | } 131 | 132 | //------------------------------------------------------------------------------ 133 | /// CPU, complex version. 134 | /// @ingroup nrm2 135 | double nrm2( 136 | int64_t n, 137 | std::complex const* x, int64_t incx ) 138 | { 139 | return impl::nrm2( n, x, incx ); 140 | } 141 | 142 | } // namespace blas 143 | -------------------------------------------------------------------------------- /src/rotg.cc: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2017-2023, University of Tennessee. All rights reserved. 2 | // SPDX-License-Identifier: BSD-3-Clause 3 | // This program is free software: you can redistribute it and/or modify it under 4 | // the terms of the BSD 3-Clause license. See the accompanying LICENSE file. 5 | 6 | #include "blas/fortran.h" 7 | #include "blas.hh" 8 | #include "blas/counter.hh" 9 | 10 | #include 11 | #include 12 | 13 | namespace blas { 14 | 15 | // ============================================================================= 16 | // Overloaded wrappers for s, d, c, z precisions. 
17 | 18 | // ----------------------------------------------------------------------------- 19 | /// @ingroup rotg 20 | void rotg( 21 | float *a, 22 | float *b, 23 | float *c, 24 | float *s ) 25 | { 26 | #ifdef BLAS_HAVE_PAPI 27 | // PAPI instrumentation 28 | counter::rotg_type element; 29 | memset( &element, 0, sizeof( element ) ); 30 | element = { 1 }; 31 | counter::insert( element, counter::Id::rotg ); 32 | 33 | // This operation does not incur significant FLOPs, so no 34 | // need to call counter::inc_flop_count() 35 | #endif 36 | 37 | BLAS_srotg( a, b, c, s ); 38 | } 39 | 40 | // ----------------------------------------------------------------------------- 41 | /// @ingroup rotg 42 | void rotg( 43 | double *a, 44 | double *b, 45 | double *c, 46 | double *s ) 47 | { 48 | #ifdef BLAS_HAVE_PAPI 49 | // PAPI instrumentation 50 | counter::rotg_type element; 51 | memset( &element, 0, sizeof( element ) ); 52 | element = { 1 }; 53 | counter::insert( element, counter::Id::rotg ); 54 | 55 | // This operation does not incur significant FLOPs, so no 56 | // need to call counter::inc_flop_count() 57 | #endif 58 | 59 | BLAS_drotg( a, b, c, s ); 60 | } 61 | 62 | // ----------------------------------------------------------------------------- 63 | /// @ingroup rotg 64 | void rotg( 65 | std::complex *a, 66 | std::complex *b, // const in BLAS implementation, oddly 67 | float *c, 68 | std::complex *s ) 69 | { 70 | #ifdef BLAS_HAVE_PAPI 71 | // PAPI instrumentation 72 | counter::rotg_type element; 73 | memset( &element, 0, sizeof( element ) ); 74 | element = { 1 }; 75 | counter::insert( element, counter::Id::rotg ); 76 | 77 | // This operation does not incur significant FLOPs, so no 78 | // need to call counter::inc_flop_count() 79 | #endif 80 | 81 | BLAS_crotg( (blas_complex_float*) a, 82 | (blas_complex_float*) b, 83 | c, 84 | (blas_complex_float*) s ); 85 | } 86 | 87 | // ----------------------------------------------------------------------------- 88 | /// @ingroup rotg 89 | 
void rotg( 90 | std::complex *a, 91 | std::complex *b, // const in BLAS implementation, oddly 92 | double *c, 93 | std::complex *s ) 94 | { 95 | #ifdef BLAS_HAVE_PAPI 96 | // PAPI instrumentation 97 | counter::rotg_type element; 98 | memset( &element, 0, sizeof( element ) ); 99 | element = { 1 }; 100 | counter::insert( element, counter::Id::rotg ); 101 | 102 | // This operation does not incur significant FLOPs, so no 103 | // need to call counter::inc_flop_count() 104 | #endif 105 | 106 | BLAS_zrotg( (blas_complex_double*) a, 107 | (blas_complex_double*) b, 108 | c, 109 | (blas_complex_double*) s ); 110 | } 111 | 112 | } // namespace blas 113 | -------------------------------------------------------------------------------- /src/rotm.cc: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2017-2023, University of Tennessee. All rights reserved. 2 | // SPDX-License-Identifier: BSD-3-Clause 3 | // This program is free software: you can redistribute it and/or modify it under 4 | // the terms of the BSD 3-Clause license. See the accompanying LICENSE file. 5 | 6 | #include "blas/fortran.h" 7 | #include "blas.hh" 8 | #include "blas_internal.hh" 9 | #include "blas/counter.hh" 10 | 11 | #include 12 | #include 13 | 14 | namespace blas { 15 | 16 | // ============================================================================= 17 | // Overloaded wrappers for s, d precisions. 18 | // Not available for complex. 
19 | 20 | // ----------------------------------------------------------------------------- 21 | /// @ingroup rotm 22 | void rotm( 23 | int64_t n, 24 | float *x, int64_t incx, 25 | float *y, int64_t incy, 26 | float const param[5] ) 27 | { 28 | // check arguments 29 | blas_error_if( n < 0 ); // standard BLAS returns, doesn't fail 30 | blas_error_if( incx == 0 ); // standard BLAS doesn't detect inc[xy] == 0 31 | blas_error_if( incy == 0 ); 32 | 33 | #ifdef BLAS_HAVE_PAPI 34 | // PAPI instrumentation 35 | counter::rotm_type element; 36 | memset( &element, 0, sizeof( element ) ); 37 | element = { n }; 38 | counter::insert( element, counter::Id::rotm ); 39 | 40 | double gflops = 1e9 * blas::Gflop< float >::rotm( n ); 41 | counter::inc_flop_count( (long long int)gflops ); 42 | #endif 43 | 44 | // convert arguments 45 | blas_int n_ = to_blas_int( n ); 46 | blas_int incx_ = to_blas_int( incx ); 47 | blas_int incy_ = to_blas_int( incy ); 48 | BLAS_srotm( &n_, x, &incx_, y, &incy_, param ); 49 | } 50 | 51 | // ----------------------------------------------------------------------------- 52 | /// @ingroup rotm 53 | void rotm( 54 | int64_t n, 55 | double *x, int64_t incx, 56 | double *y, int64_t incy, 57 | double const param[5] ) 58 | { 59 | // check arguments 60 | blas_error_if( n < 0 ); // standard BLAS returns, doesn't fail 61 | blas_error_if( incx == 0 ); // standard BLAS doesn't detect inc[xy] == 0 62 | blas_error_if( incy == 0 ); 63 | 64 | #ifdef BLAS_HAVE_PAPI 65 | // PAPI instrumentation 66 | counter::rotm_type element; 67 | memset( &element, 0, sizeof( element ) ); 68 | element = { n }; 69 | counter::insert( element, counter::Id::rotm ); 70 | 71 | double gflops = 1e9 * blas::Gflop< double >::rotm( n ); 72 | counter::inc_flop_count( (long long int)gflops ); 73 | #endif 74 | 75 | // convert arguments 76 | blas_int n_ = to_blas_int( n ); 77 | blas_int incx_ = to_blas_int( incx ); 78 | blas_int incy_ = to_blas_int( incy ); 79 | BLAS_drotm( &n_, x, &incx_, y, &incy_, param 
); 80 | } 81 | 82 | } // namespace blas 83 | -------------------------------------------------------------------------------- /src/rotmg.cc: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2017-2023, University of Tennessee. All rights reserved. 2 | // SPDX-License-Identifier: BSD-3-Clause 3 | // This program is free software: you can redistribute it and/or modify it under 4 | // the terms of the BSD 3-Clause license. See the accompanying LICENSE file. 5 | 6 | #include "blas/fortran.h" 7 | #include "blas.hh" 8 | #include "blas/counter.hh" 9 | 10 | #include 11 | #include 12 | 13 | namespace blas { 14 | 15 | // ============================================================================= 16 | // Overloaded wrappers for s, d, c, z precisions. 17 | // Not available for complex. 18 | 19 | // ----------------------------------------------------------------------------- 20 | /// @ingroup rotmg 21 | void rotmg( 22 | float *d1, 23 | float *d2, 24 | float *a, 25 | float b, 26 | float param[5] ) 27 | { 28 | #ifdef BLAS_HAVE_PAPI 29 | // PAPI instrumentation 30 | counter::rotmg_type element; 31 | memset( &element, 0, sizeof( element ) ); 32 | element = { 1 }; 33 | counter::insert( element, counter::Id::rotmg ); 34 | 35 | // This operation does not incur significant FLOPs, so no 36 | // need to call counter::inc_flop_count() 37 | #endif 38 | 39 | BLAS_srotmg( d1, d2, a, &b, param ); 40 | } 41 | 42 | // ----------------------------------------------------------------------------- 43 | /// @ingroup rotmg 44 | void rotmg( 45 | double *d1, 46 | double *d2, 47 | double *a, 48 | double b, 49 | double param[5] ) 50 | { 51 | #ifdef BLAS_HAVE_PAPI 52 | // PAPI instrumentation 53 | counter::rotmg_type element; 54 | memset( &element, 0, sizeof( element ) ); 55 | element = { 1 }; 56 | counter::insert( element, counter::Id::rotmg ); 57 | 58 | // This operation does not incur significant FLOPs, so no 59 | // need to call 
counter::inc_flop_count() 60 | #endif 61 | 62 | BLAS_drotmg( d1, d2, a, &b, param ); 63 | } 64 | 65 | } // namespace blas 66 | -------------------------------------------------------------------------------- /src/util.cc: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2017-2023, University of Tennessee. All rights reserved. 2 | // SPDX-License-Identifier: BSD-3-Clause 3 | // This program is free software: you can redistribute it and/or modify it under 4 | // the terms of the BSD 3-Clause license. See the accompanying LICENSE file. 5 | 6 | #include "blas/util.hh" 7 | 8 | namespace blas { 9 | 10 | const char* Layout_help = "one of: C or ColMajor; R or RowMajor"; 11 | const char* Op_help = "one of: N or NoTrans; T or Trans; C or ConjTrans"; 12 | const char* Uplo_help = "one of: L or Lower; U or Upper"; 13 | const char* Diag_help = "one of: N or NonUnit; U or Unit"; 14 | const char* Side_help = "one of: L or Left; R or Right"; 15 | 16 | } // namespace blas 17 | -------------------------------------------------------------------------------- /src/version.cc: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2017-2023, University of Tennessee. All rights reserved. 2 | // SPDX-License-Identifier: BSD-3-Clause 3 | // This program is free software: you can redistribute it and/or modify it under 4 | // the terms of the BSD 3-Clause license. See the accompanying LICENSE file. 5 | 6 | #include "blas.hh" 7 | 8 | namespace blas { 9 | 10 | //------------------------------------------------------------------------------ 11 | /// @return BLAS++ version. 12 | /// Version is integer of form yyyymmrr, where yyyy is year, mm is month, 13 | /// and rr is release counter within month, starting at 00. 
///
int blaspp_version()
{
    // BLASPP_VERSION is a macro supplied from outside this file.
    return BLASPP_VERSION;
}

// BLASPP_ID is the Mercurial or git commit hash ID, either
// defined by `git rev-parse --short HEAD` in Makefile,
// or defined here by make_release.py for release tar files. DO NOT EDIT.
#ifndef BLASPP_ID
#define BLASPP_ID "unknown"
#endif

//------------------------------------------------------------------------------
/// @return BLAS++ Mercurial or git commit hash ID.
/// Falls back to "unknown" when BLASPP_ID was not defined at build time.
///
const char* blaspp_id()
{
    return BLASPP_ID;
}

}  // namespace blas

--------------------------------------------------------------------------------
/test/GNUmakefile:
--------------------------------------------------------------------------------
top = ..
include ${top}/GNUmakefile.subdir

--------------------------------------------------------------------------------
/test/print_matrix.hh:
--------------------------------------------------------------------------------
// Copyright (c) 2017-2023, University of Tennessee. All rights reserved.
// SPDX-License-Identifier: BSD-3-Clause
// This program is free software: you can redistribute it and/or modify it under
// the terms of the BSD 3-Clause license. See the accompanying LICENSE file.
5 | 6 | #ifndef PRINT_HH 7 | #define PRINT_HH 8 | 9 | #include 10 | #include 11 | 12 | // ----------------------------------------------------------------------------- 13 | template 14 | void print_matrix( int64_t m, int64_t n, T *A, int64_t lda, 15 | const char* format="%9.4f" ) 16 | { 17 | #define A(i_, j_) A[ (i_) + size_t(lda)*(j_) ] 18 | 19 | assert( m >= 0 ); 20 | assert( n >= 0 ); 21 | assert( lda >= m ); 22 | char format2[32]; 23 | snprintf( format2, sizeof(format2), " %s", format ); 24 | 25 | printf( "[\n" ); 26 | for (int64_t i = 0; i < m; ++i) { 27 | for (int64_t j = 0; j < n; ++j) { 28 | printf( format2, A(i, j) ); 29 | } 30 | printf( "\n" ); 31 | } 32 | printf( "];\n" ); 33 | 34 | #undef A 35 | } 36 | 37 | // ----------------------------------------------------------------------------- 38 | template 39 | void print_matrix( int64_t m, int64_t n, std::complex* A, int64_t lda, 40 | const char* format="%9.4f" ) 41 | { 42 | #define A(i_, j_) A[ (i_) + size_t(lda)*(j_) ] 43 | 44 | assert( m >= 0 ); 45 | assert( n >= 0 ); 46 | assert( lda >= m ); 47 | char format2[32]; 48 | snprintf( format2, sizeof(format2), " %s + %si", format, format ); 49 | 50 | printf( "[\n" ); 51 | for (int64_t i = 0; i < m; ++i) { 52 | for (int64_t j = 0; j < n; ++j) { 53 | printf( format2, real(A(i, j)), imag(A(i, j)) ); 54 | } 55 | printf( "\n" ); 56 | } 57 | printf( "];\n" ); 58 | 59 | #undef A 60 | } 61 | 62 | // ----------------------------------------------------------------------------- 63 | template 64 | void print_vector( int64_t n, T *x, int64_t incx, 65 | const char* format="%9.4f" ) 66 | { 67 | assert( n >= 0 ); 68 | assert( incx != 0 ); 69 | char format2[32]; 70 | snprintf( format2, sizeof(format2), " %s", format ); 71 | 72 | printf( "[" ); 73 | int64_t ix = (incx > 0 ? 
0 : (-n + 1)*incx); 74 | for (int64_t i = 0; i < n; ++i) { 75 | printf( format2, x[ix] ); 76 | ix += incx; 77 | } 78 | printf( " ]';\n" ); 79 | } 80 | 81 | // ----------------------------------------------------------------------------- 82 | template 83 | void print_vector( int64_t n, std::complex* x, int64_t incx, 84 | const char* format="%9.4f" ) 85 | { 86 | assert( n >= 0 ); 87 | assert( incx != 0 ); 88 | char format2[32]; 89 | snprintf( format2, sizeof(format2), " %s + %si", format, format ); 90 | 91 | printf( "[" ); 92 | int64_t ix = (incx > 0 ? 0 : (-n + 1)*incx); 93 | for (int64_t i = 0; i < n; ++i) { 94 | printf( format2, real(x[ix]), imag(x[ix]) ); 95 | ix += incx; 96 | } 97 | printf( " ]';\n" ); 98 | } 99 | 100 | #endif // #ifndef PRINT_HH 101 | -------------------------------------------------------------------------------- /test/test_asum.cc: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2017-2023, University of Tennessee. All rights reserved. 2 | // SPDX-License-Identifier: BSD-3-Clause 3 | // This program is free software: you can redistribute it and/or modify it under 4 | // the terms of the BSD 3-Clause license. See the accompanying LICENSE file. 
5 | 6 | #include "test.hh" 7 | #include "cblas_wrappers.hh" 8 | #include "lapack_wrappers.hh" 9 | #include "blas/flops.hh" 10 | #include "print_matrix.hh" 11 | 12 | // ----------------------------------------------------------------------------- 13 | template 14 | void test_asum_work( Params& params, bool run ) 15 | { 16 | using namespace testsweeper; 17 | using std::abs; 18 | using blas::max; 19 | using real_t = blas::real_type< T >; 20 | 21 | // get & mark input values 22 | int64_t n = params.dim.n(); 23 | int64_t incx = params.incx(); 24 | int64_t verbose = params.verbose(); 25 | 26 | // mark non-standard output values 27 | params.gflops(); 28 | params.gbytes(); 29 | params.ref_time(); 30 | params.ref_gflops(); 31 | params.ref_gbytes(); 32 | 33 | // adjust header to msec 34 | params.time.name( "time (ms)" ); 35 | params.ref_time.name( "ref time (ms)" ); 36 | params.ref_time.width( 13 ); 37 | 38 | if (! run) 39 | return; 40 | 41 | // setup 42 | size_t size_x = max( (n - 1) * abs( incx ) + 1, 0 ); 43 | T* x = new T[ size_x ]; 44 | 45 | int64_t idist = 1; 46 | int iseed[4] = { 0, 0, 0, 1 }; 47 | lapack_larnv( idist, iseed, size_x, x ); 48 | 49 | // test error exits 50 | assert_throw( blas::asum( -1, x, incx ), blas::Error ); 51 | assert_throw( blas::asum( n, x, 0 ), blas::Error ); 52 | assert_throw( blas::asum( n, x, -1 ), blas::Error ); 53 | 54 | if (verbose >= 1) { 55 | printf( "\n" 56 | "x n=%5lld, inc=%5lld, size=%10lld\n", 57 | llong( n ), llong( incx ), llong( size_x ) ); 58 | } 59 | if (verbose >= 2) { 60 | printf( "x = " ); print_vector( n, x, incx ); 61 | } 62 | 63 | // run test 64 | testsweeper::flush_cache( params.cache() ); 65 | double time = get_wtime(); 66 | real_t result = blas::asum( n, x, incx ); 67 | time = get_wtime() - time; 68 | 69 | double gflop = blas::Gflop< T >::asum( n ); 70 | double gbyte = blas::Gbyte< T >::asum( n ); 71 | params.time() = time * 1000; // msec 72 | params.gflops() = gflop / time; 73 | params.gbytes() = gbyte / time; 74 | 
75 | if (verbose >= 1) { 76 | printf( "result = %.4e\n", result ); 77 | } 78 | 79 | if (params.check() == 'y') { 80 | // run reference 81 | testsweeper::flush_cache( params.cache() ); 82 | time = get_wtime(); 83 | real_t ref = cblas_asum( n, x, incx ); 84 | time = get_wtime() - time; 85 | 86 | params.ref_time() = time * 1000; // msec 87 | params.ref_gflops() = gflop / time; 88 | params.ref_gbytes() = gbyte / time; 89 | 90 | if (verbose >= 1) { 91 | printf( "ref = %.4e\n", ref ); 92 | } 93 | 94 | // relative forward error 95 | // note: using sqrt(n) here gives failures 96 | real_t error = abs( ref - result ); 97 | if (ref != 0) { 98 | error /= (n * ref); 99 | } 100 | 101 | // complex needs extra factor; see Higham, 2002, sec. 3.6. 102 | if (blas::is_complex_v) { 103 | error /= 2*sqrt(2); 104 | } 105 | 106 | real_t u = 0.5 * std::numeric_limits< real_t >::epsilon(); 107 | params.error() = error; 108 | params.okay() = (error < u); 109 | } 110 | 111 | delete[] x; 112 | } 113 | 114 | // ----------------------------------------------------------------------------- 115 | void test_asum( Params& params, bool run ) 116 | { 117 | switch (params.datatype()) { 118 | case testsweeper::DataType::Single: 119 | test_asum_work< float >( params, run ); 120 | break; 121 | 122 | case testsweeper::DataType::Double: 123 | test_asum_work< double >( params, run ); 124 | break; 125 | 126 | case testsweeper::DataType::SingleComplex: 127 | test_asum_work< std::complex >( params, run ); 128 | break; 129 | 130 | case testsweeper::DataType::DoubleComplex: 131 | test_asum_work< std::complex >( params, run ); 132 | break; 133 | 134 | default: 135 | throw std::exception(); 136 | break; 137 | } 138 | } 139 | -------------------------------------------------------------------------------- /test/test_error.cc: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2017-2023, University of Tennessee. All rights reserved. 
2 | // SPDX-License-Identifier: BSD-3-Clause 3 | // This program is free software: you can redistribute it and/or modify it under 4 | // the terms of the BSD 3-Clause license. See the accompanying LICENSE file. 5 | 6 | #include "test.hh" 7 | #include "blas/util.hh" 8 | 9 | void test_error( Params& params, bool run ) 10 | { 11 | int64_t m = params.dim.m(); 12 | int64_t n = params.dim.n(); 13 | 14 | if (! run) { 15 | printf( "test error checks the internal error routines:\n" 16 | "if m == 100, checks: blas_error_if( m == n );\n" 17 | "if m == 200, checks: blas_error_if_msg( m == n, \"m %%d == n %%d\", m, n );\n" 18 | "if m == 300, checks: assert( m != n );\n\n" ); 19 | return; 20 | } 21 | 22 | if (m == 100) { 23 | blas_error_if( m == n ); 24 | } 25 | else if (m == 200) { 26 | blas_error_if_msg( m == n, "m %lld == n %lld", llong( m ), llong( n ) ); 27 | } 28 | else if (m == 300) { 29 | assert( m != n ); 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /test/test_iamax.cc: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2017-2023, University of Tennessee. All rights reserved. 2 | // SPDX-License-Identifier: BSD-3-Clause 3 | // This program is free software: you can redistribute it and/or modify it under 4 | // the terms of the BSD 3-Clause license. See the accompanying LICENSE file. 
5 | 6 | #include "test.hh" 7 | #include "cblas_wrappers.hh" 8 | #include "lapack_wrappers.hh" 9 | #include "blas/flops.hh" 10 | #include "print_matrix.hh" 11 | 12 | // ----------------------------------------------------------------------------- 13 | template 14 | void test_iamax_work( Params& params, bool run ) 15 | { 16 | using namespace testsweeper; 17 | using real_t = blas::real_type< T >; 18 | using std::abs; 19 | using blas::max; 20 | 21 | // get & mark input values 22 | int64_t n = params.dim.n(); 23 | int64_t incx = params.incx(); 24 | int64_t verbose = params.verbose(); 25 | 26 | // mark non-standard output values 27 | params.gflops(); 28 | params.gbytes(); 29 | params.ref_time(); 30 | params.ref_gflops(); 31 | params.ref_gbytes(); 32 | 33 | // adjust header to msec 34 | params.time.name( "time (ms)" ); 35 | params.ref_time.name( "ref time (ms)" ); 36 | params.ref_time.width( 13 ); 37 | 38 | if (! run) 39 | return; 40 | 41 | // setup 42 | size_t size_x = max( (n - 1) * abs( incx ) + 1, 0 ); 43 | T* x = new T[ size_x ]; 44 | 45 | int64_t idist = 1; 46 | int iseed[4] = { 0, 0, 0, 1 }; 47 | lapack_larnv( idist, iseed, size_x, x ); 48 | 49 | // test error exits 50 | assert_throw( blas::iamax( -1, x, incx ), blas::Error ); 51 | assert_throw( blas::iamax( n, x, 0 ), blas::Error ); 52 | assert_throw( blas::iamax( n, x, -1 ), blas::Error ); 53 | 54 | if (verbose >= 1) { 55 | printf( "\n" 56 | "x n=%5lld, inc=%5lld, size=%10lld\n", 57 | llong( n ), llong( incx ), llong( size_x ) ); 58 | } 59 | if (verbose >= 2) { 60 | printf( "x = " ); print_vector( n, x, incx ); 61 | } 62 | 63 | // run test 64 | testsweeper::flush_cache( params.cache() ); 65 | double time = get_wtime(); 66 | int64_t result = blas::iamax( n, x, incx ); 67 | time = get_wtime() - time; 68 | 69 | double gflop = blas::Gflop< T >::iamax( n ); 70 | double gbyte = blas::Gbyte< T >::iamax( n ); 71 | params.time() = time * 1000; // msec 72 | params.gflops() = gflop / time; 73 | params.gbytes() = gbyte / 
time; 74 | 75 | if (verbose >= 1) { 76 | printf( "result = %5lld\n", llong( result ) ); 77 | } 78 | 79 | if (params.check() == 'y') { 80 | // run reference 81 | testsweeper::flush_cache( params.cache() ); 82 | time = get_wtime(); 83 | int64_t ref = cblas_iamax( n, x, incx ); 84 | if (n == 0) 85 | ref -= 1; 86 | time = get_wtime() - time; 87 | 88 | params.ref_time() = time * 1000; // msec 89 | params.ref_gflops() = gflop / time; 90 | params.ref_gbytes() = gbyte / time; 91 | 92 | if (verbose >= 1) { 93 | printf( "ref = %5lld\n", llong( ref ) ); 94 | } 95 | 96 | // error = |ref - result| 97 | real_t error = abs( ref - result ); 98 | params.error() = error; 99 | 100 | // iamax must be exact! 101 | params.okay() = (error == 0); 102 | } 103 | 104 | delete[] x; 105 | } 106 | 107 | // ----------------------------------------------------------------------------- 108 | void test_iamax( Params& params, bool run ) 109 | { 110 | switch (params.datatype()) { 111 | case testsweeper::DataType::Single: 112 | test_iamax_work< float >( params, run ); 113 | break; 114 | 115 | case testsweeper::DataType::Double: 116 | test_iamax_work< double >( params, run ); 117 | break; 118 | 119 | case testsweeper::DataType::SingleComplex: 120 | test_iamax_work< std::complex >( params, run ); 121 | break; 122 | 123 | case testsweeper::DataType::DoubleComplex: 124 | test_iamax_work< std::complex >( params, run ); 125 | break; 126 | 127 | default: 128 | throw std::exception(); 129 | break; 130 | } 131 | } 132 | -------------------------------------------------------------------------------- /test/test_nrm2.cc: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2017-2023, University of Tennessee. All rights reserved. 2 | // SPDX-License-Identifier: BSD-3-Clause 3 | // This program is free software: you can redistribute it and/or modify it under 4 | // the terms of the BSD 3-Clause license. See the accompanying LICENSE file. 
5 | 6 | #include "test.hh" 7 | #include "cblas_wrappers.hh" 8 | #include "lapack_wrappers.hh" 9 | #include "blas/flops.hh" 10 | #include "print_matrix.hh" 11 | 12 | // ----------------------------------------------------------------------------- 13 | template 14 | void test_nrm2_work( Params& params, bool run ) 15 | { 16 | using namespace testsweeper; 17 | using std::abs; 18 | using blas::max; 19 | using scalar_t = blas::scalar_type; 20 | using real_t = blas::real_type; 21 | 22 | // get & mark input values 23 | int64_t n = params.dim.n(); 24 | int64_t incx = params.incx(); 25 | int64_t verbose = params.verbose(); 26 | 27 | // mark non-standard output values 28 | params.gflops(); 29 | params.gbytes(); 30 | params.ref_time(); 31 | params.ref_gflops(); 32 | params.ref_gbytes(); 33 | 34 | // adjust header to msec 35 | params.time.name( "time (ms)" ); 36 | params.ref_time.name( "ref time (ms)" ); 37 | params.ref_time.width( 13 ); 38 | 39 | if (! run) 40 | return; 41 | 42 | // setup 43 | size_t size_x = max( (n - 1) * abs( incx ) + 1, 0 ); 44 | T* x = new T[ size_x ]; 45 | 46 | int64_t idist = 1; 47 | int iseed[4] = { 0, 0, 0, 1 }; 48 | lapack_larnv( idist, iseed, size_x, x ); 49 | 50 | // test error exits 51 | assert_throw( blas::nrm2( -1, x, incx ), blas::Error ); 52 | assert_throw( blas::nrm2( n, x, 0 ), blas::Error ); 53 | assert_throw( blas::nrm2( n, x, -1 ), blas::Error ); 54 | 55 | if (verbose >= 1) { 56 | printf( "\n" 57 | "x n=%5lld, inc=%5lld, size=%10lld\n", 58 | llong( n ), llong( incx ), llong( size_x ) ); 59 | } 60 | if (verbose >= 2) { 61 | printf( "x = " ); print_vector( n, x, incx ); 62 | } 63 | 64 | // run test 65 | testsweeper::flush_cache( params.cache() ); 66 | double time = get_wtime(); 67 | real_t result = blas::nrm2( n, x, incx ); 68 | time = get_wtime() - time; 69 | 70 | double gflop = blas::Gflop< T >::nrm2( n ); 71 | double gbyte = blas::Gbyte< T >::nrm2( n ); 72 | params.time() = time * 1000; // msec 73 | params.gflops() = gflop / time; 74 | 
params.gbytes() = gbyte / time; 75 | 76 | if (verbose >= 2) { 77 | printf( "result = %.4e\n", result ); 78 | } 79 | 80 | if (params.check() == 'y') { 81 | // run reference 82 | testsweeper::flush_cache( params.cache() ); 83 | time = get_wtime(); 84 | real_t ref = cblas_nrm2( n, x, std::abs(incx) ); 85 | time = get_wtime() - time; 86 | 87 | params.ref_time() = time * 1000; // msec 88 | params.ref_gflops() = gflop / time; 89 | params.ref_gbytes() = gbyte / time; 90 | 91 | if (verbose >= 2) { 92 | printf( "ref = %.4e\n", ref ); 93 | } 94 | 95 | // relative forward error 96 | real_t error = abs( ref - result ); 97 | if (ref != 0) { 98 | error /= sqrt(n+1) * ref; 99 | } 100 | 101 | // complex needs extra factor; see Higham, 2002, sec. 3.6. 102 | if (blas::is_complex_v) { 103 | error /= 2*sqrt(2); 104 | } 105 | 106 | real_t u = 0.5 * std::numeric_limits< real_t >::epsilon(); 107 | params.error() = error; 108 | params.okay() = (error < u); 109 | } 110 | 111 | delete[] x; 112 | } 113 | 114 | // ----------------------------------------------------------------------------- 115 | void test_nrm2( Params& params, bool run ) 116 | { 117 | switch (params.datatype()) { 118 | case testsweeper::DataType::Single: 119 | test_nrm2_work< float >( params, run ); 120 | break; 121 | 122 | case testsweeper::DataType::Double: 123 | test_nrm2_work< double >( params, run ); 124 | break; 125 | 126 | case testsweeper::DataType::SingleComplex: 127 | test_nrm2_work< std::complex >( params, run ); 128 | break; 129 | 130 | case testsweeper::DataType::DoubleComplex: 131 | test_nrm2_work< std::complex >( params, run ); 132 | break; 133 | 134 | default: 135 | throw std::exception(); 136 | break; 137 | } 138 | } 139 | -------------------------------------------------------------------------------- /test/test_rotmg.cc: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2017-2023, University of Tennessee. All rights reserved. 
2 | // SPDX-License-Identifier: BSD-3-Clause 3 | // This program is free software: you can redistribute it and/or modify it under 4 | // the terms of the BSD 3-Clause license. See the accompanying LICENSE file. 5 | 6 | #include "test.hh" 7 | #include "cblas_wrappers.hh" 8 | #include "lapack_wrappers.hh" 9 | #include "print_matrix.hh" 10 | 11 | // ----------------------------------------------------------------------------- 12 | template 13 | void test_rotmg_work( Params& params, bool run ) 14 | { 15 | using namespace testsweeper; 16 | using std::abs; 17 | using std::real; 18 | using std::imag; 19 | using real_t = blas::real_type< T >; 20 | 21 | // Constants 22 | const real_t epsilon = std::numeric_limits< real_t >::epsilon(); 23 | 24 | // get & mark input values 25 | int64_t n = params.dim.n(); 26 | double tol = params.tol() * epsilon; 27 | 28 | // mark non-standard output values 29 | params.ref_time(); 30 | 31 | // adjust header to msec 32 | params.time.name( "time (ms)" ); 33 | params.ref_time.name( "ref time (ms)" ); 34 | params.ref_time.width( 13 ); 35 | 36 | if (! 
run) 37 | return; 38 | 39 | // setup 40 | std::vector d1( n ), d1_ref( n ); 41 | std::vector d2( n ), d2_ref( n ); 42 | std::vector x1( n ), x1_ref( n ); 43 | std::vector y1( n ), y1_ref( n ); 44 | std::vector ps( 5*n ), ps_ref( 5*n ); 45 | 46 | int64_t idist = 3; 47 | int iseed[4] = { 0, 0, 0, 1 }; 48 | lapack_larnv( idist, iseed, n, &d1[0] ); 49 | lapack_larnv( idist, iseed, n, &d2[0] ); 50 | lapack_larnv( idist, iseed, n, &x1[0] ); 51 | lapack_larnv( idist, iseed, n, &y1[0] ); 52 | lapack_larnv( idist, iseed, 5*n, &ps[0] ); 53 | 54 | d1_ref = d1; 55 | d2_ref = d2; 56 | x1_ref = x1; 57 | y1_ref = y1; 58 | ps_ref = ps; 59 | 60 | // run test 61 | testsweeper::flush_cache( params.cache() ); 62 | double time = get_wtime(); 63 | for (int64_t i = 0; i < n; ++i) { 64 | blas::rotmg( &d1[i], &d2[i], &x1[i], y1[i], &ps[5*i] ); 65 | } 66 | time = get_wtime() - time; 67 | params.time() = time * 1000; // msec 68 | 69 | if (params.check() == 'y') { 70 | // run reference 71 | testsweeper::flush_cache( params.cache() ); 72 | time = get_wtime(); 73 | for (int64_t i = 0; i < n; ++i) { 74 | cblas_rotmg( &d1_ref[i], &d2_ref[i], &x1_ref[i], y1_ref[i], &ps_ref[5*i] ); 75 | } 76 | time = get_wtime() - time; 77 | params.ref_time() = time * 1000; // msec 78 | 79 | // get max error of all outputs 80 | cblas_axpy( n, -1.0, &d1[0], 1, &d1_ref[0], 1 ); 81 | cblas_axpy( n, -1.0, &d2[0], 1, &d2_ref[0], 1 ); 82 | cblas_axpy( n, -1.0, &x1[0], 1, &x1_ref[0], 1 ); 83 | cblas_axpy( 5*n, -1.0, &ps[0], 1, &ps_ref[0], 1 ); 84 | 85 | int64_t id1 = cblas_iamax( n, &d1_ref[0], 1 ); 86 | int64_t id2 = cblas_iamax( n, &d2_ref[0], 1 ); 87 | int64_t ix1 = cblas_iamax( n, &x1_ref[0], 1 ); 88 | int64_t ips = cblas_iamax( 5*n, &ps_ref[0], 1 ); 89 | 90 | real_t error = blas::max( 91 | abs( d1_ref[ id1 ] ), 92 | abs( d2_ref[ id2 ] ), 93 | abs( x1_ref[ ix1 ] ), 94 | abs( ps_ref[ ips ] ) 95 | ); 96 | 97 | // error is normally 0, but allow for some rounding just in case. 
98 | params.error() = error; 99 | params.okay() = (error < tol); 100 | } 101 | } 102 | 103 | // ----------------------------------------------------------------------------- 104 | void test_rotmg( Params& params, bool run ) 105 | { 106 | switch (params.datatype()) { 107 | case testsweeper::DataType::Single: 108 | test_rotmg_work< float >( params, run ); 109 | break; 110 | 111 | case testsweeper::DataType::Double: 112 | test_rotmg_work< double >( params, run ); 113 | break; 114 | 115 | // modified Givens not available for complex 116 | 117 | default: 118 | throw std::exception(); 119 | break; 120 | } 121 | } 122 | -------------------------------------------------------------------------------- /tools/copyright.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import release 4 | release.copyright() 5 | -------------------------------------------------------------------------------- /tools/doxygen-filter.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | 3 | use strict; 4 | 5 | sub dollar 6 | { 7 | my( $pre ) = @_; 8 | if ($pre eq '\\') { 9 | # change \$ to $ 10 | return '$'; 11 | } 12 | elsif ($pre eq '\\f') { 13 | # don't change \f$ 14 | return '\\f$'; 15 | } 16 | else { 17 | # change $ to \f$ 18 | return $pre . '\\f$'; 19 | } 20 | } 21 | 22 | while (<>) { 23 | # replace \[ and \] by \f[ and \f] 24 | s/\\([\[\]])/\\f$1/g; 25 | 26 | # replace $ by \f$ 27 | # replace \$ by $ 28 | # don't change \f$ 29 | s/(\\f|\\|)\$/dollar($1)/eg; 30 | 31 | print 32 | } 33 | -------------------------------------------------------------------------------- /tools/hooks/pre-commit: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | # 3 | # Copyright (c) 2017-2023, University of Tennessee. All rights reserved. 
# SPDX-License-Identifier: BSD-3-Clause
# This program is free software: you can redistribute it and/or modify it under
# the terms of the BSD 3-Clause license. See the accompanying LICENSE file.
#
# Does style checks before committing code.
# Unfortunately, at the moment astyle does not properly parse all the
# C++ 11, 14, 17 syntax, so it formats things weirdly. Instead, this
# hook uses a simple Perl script to check for a subset of errors to fix.

script="tools/slate-style-reject.pl"
return=0

files=$(git diff --cached --name-only --diff-filter=ACMR \
        | grep -E "\.(c|cc|cpp|cxx|h|hh|hpp|hxx|cu|cuh)$")
#echo "files ${files}"
$script $files
return=$?

# Comment out exit in .git/hooks/pre-commit to check files but allow the commit.
exit $return

--------------------------------------------------------------------------------
/tools/make_release.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3

import release
release.make( 'blaspp', 'include/blas.hh', 'src/version.cc' )

--------------------------------------------------------------------------------
/tools/slate-style-reject.pl:
--------------------------------------------------------------------------------
#!/usr/bin/perl
#
# Catches certain common style issues and rejects them, returning non-zero exit code.
# Can be used for pre-commit and pre-push hook to prevent errors from coming in.
#
# Usage: slate-style-reject.pl [-v] [-l] files...
#   -v  also print each offending source line
#   -l  print only the names of files with issues, not each issue

use strict;
use Getopt::Std;

my %opts = ();
getopts( 'vl', \%opts ) || exit(-1);

my $verbose = $opts{v};
my $list    = $opts{l};

# escape characters for ANSI colors
# see http://en.wikipedia.org/wiki/ANSI_escape_code
my $esc     = chr(0x1B) . "[";
my $red     = "${esc}31m";
my $green   = "${esc}32m";
my $yellow  = "${esc}33m";
my $blue    = "${esc}34m";
my $magenta = "${esc}35m";
my $cyan    = "${esc}36m";
my $white   = "${esc}37m";
my $black   = "${esc}0m";

# $result becomes 1 if any file has an issue; it is the exit code.
my $result = 0;
for my $ARG (@ARGV) {
    open( my $fileh, "<", $ARG ) or die( "Can't open $ARG: $!\n" );
    my $file_result = 0;
    while (<$fileh>) {
        chomp;
        my $orig = $_;
        s@//.*\S@//@;     # ignore C++ comments
        s/".*?"/"..."/g;  # ignore strings

        # $line becomes 1 if the current line has any issue.
        my $line = 0;
        if (m/\t/) {
            print( "$red$ARG:$.$black: tab\n" ) if (not $list);
            $line = 1;
        }

        if (m/ $/) {
            print( "$red$ARG:$.$black: trailing space\n" ) if (not $list);
            $line = 1;
        }

        if (m/^ *(if|for|while|switch|else if)\(/) {
            print( "$red$ARG:$.$black: missing space after `$1`\n" ) if (not $list);
            $line = 1;
        }

        if (m/^ *(if|for|while|switch|else if) \( +[^ ;]/) {
            print( "$red$ARG:$.$black: excess space inside parens after `$1`\n" ) if (not $list);
            $line = 1;
        }

        if (m/^ *\} *else/) {
            print( "$red$ARG:$.$black: don't cuddle } and else on same line\n" ) if (not $list);
            $line = 1;
        }

        if (m/\)\{/) {
            print( "$red$ARG:$.$black: missing space before { brace\n" ) if (not $list);
            $line = 1;
        }

        # NOTE(review): whitespace runs were collapsed in this copy of the
        # file; the original pattern may require two or more spaces before
        # the backslash -- confirm against upstream.
        if (m/ +\\$/) {
            print( "$red$ARG:$.$black: excess space before line continuation\n" ) if (not $list);
            $line = 1;
        }

        # This checks 2-character operators.
        # It's hard to check < > = w/o full parser.
        if (m/[^ =](&&|\|\||==|<=|>=|!=|\+=|-=|\*=|\/=|\|=|\&=)[^ =]/) {
            print( "$red$ARG:$.$black: missing space around boolean operator\n" ) if (not $list);
            $line = 1;
        }

        # Prohibit space before , or ; unless at the beginning of a line.
        # Sometimes with #if conditions, the comma has to start the line.
        if (m/\S +[,;]/) {
            print( "$red$ARG:$.$black: excess space before comma or semi-colon\n" ) if (not $list);
            $line = 1;
        }

        # semi-colon ; must be at end, followed by space,
        # or followed by \n string as in printf( "];\n" ).
        if (m/(;(?!$|\s|\\n))/) {
            print( "$red$ARG:$.$black: missing space after semi-colon: <<$1>>\n" ) if (not $list);
            $line = 1;
        }

        # It's hard to check indentation w/o full parser, but at least
        # preprocessor, comments, and control keywords should be
        # indented correctly.
        # The group is exactly 4 spaces: any number of full indent levels
        # followed by 1-3 stray spaces is a misaligned line.
        if (m@^(    )* {1,3}(#|//|if|else|for|while|switch|case|default|break|throw)@) {
            print( "$red$ARG:$.$black: not 4-space indent\n" ) if (not $list);
            $line = 1;
        }

        # if (m/.{85}/) {
        #     print( "$red$ARG:$.$black: longer than 85 char hard limit\n" ) if (not $list);
        #     $line = 1;
        # }

        $file_result |= $line;
        if ($line and $verbose) {
            print( "<$orig>\n\n" );
        }
    }
    close( $fileh );
    if ($file_result and $list) {
        print( "$ARG\n" );
    }
    $result |= $file_result;
}

#print( "result $result\n" );
exit( $result );

--------------------------------------------------------------------------------