├── .gitignore ├── .travis.yml ├── CMakeLists.txt ├── Doxyfile ├── LICENSE.txt ├── README.md ├── bin └── travisci.sh ├── cmake ├── FindSIMD.cmake ├── HAXXCompilerFlags.cmake └── HAXXHandleSIMD.cmake ├── include ├── haxx.hpp ├── haxx │ ├── haxx_complex_op.hpp │ ├── haxx_def.hpp │ ├── haxx_quaternion_op.hpp │ ├── haxx_quaternion_reduction.hpp │ └── haxx_scalar_op.hpp ├── haxx_config.hpp.in ├── hblas.hpp ├── hblas │ ├── config │ │ ├── hblas3 │ │ │ └── gemm.hpp │ │ └── types.hpp │ ├── generic │ │ ├── hblas1 │ │ │ ├── hblas_axpym_impl.hpp │ │ │ ├── hblas_axpyv_impl.hpp │ │ │ ├── hblas_copyv_impl.hpp │ │ │ ├── hblas_dotcv_impl.hpp │ │ │ ├── hblas_dotuv_impl.hpp │ │ │ ├── hblas_scalm_impl.hpp │ │ │ ├── hblas_scalv_impl.hpp │ │ │ ├── hblas_swapv_impl.hpp │ │ │ └── impl.hpp │ │ ├── hblas2 │ │ │ ├── hblas_gemv_impl.hpp │ │ │ ├── hblas_gerc_impl.hpp │ │ │ ├── hblas_geru_impl.hpp │ │ │ └── impl.hpp │ │ ├── hblas3 │ │ │ ├── hblas_gemm_impl.hpp │ │ │ └── impl.hpp │ │ └── util │ │ │ ├── contract_impl.hpp │ │ │ ├── expand_impl.hpp │ │ │ └── impl.hpp │ ├── hblas1.hpp │ ├── hblas2.hpp │ ├── hblas3.hpp │ ├── hblas_util.hpp │ └── pack │ │ ├── pack.hpp │ │ ├── packops.hpp │ │ └── typewrapper.hpp └── util │ ├── boilerplate.hpp │ ├── constants.hpp │ ├── macro.hpp │ ├── simd.hpp │ ├── simd │ ├── intrin_alias.hpp │ ├── misc.hpp │ └── qop.hpp │ └── types.hpp ├── src ├── CMakeLists.txt ├── benchmark │ ├── CMakeLists.txt │ ├── dot.cxx │ └── gemm.cxx ├── hblas │ ├── CMakeLists.txt │ ├── cxx │ │ ├── CMakeLists.txt │ │ ├── hblas1 │ │ │ ├── AXPYScal.cmake │ │ │ ├── CMakeLists.txt │ │ │ ├── CopySwap.cmake │ │ │ ├── Dot.cmake │ │ │ ├── axpy_scal.cxx │ │ │ ├── copy_swap.cxx │ │ │ └── dot.cxx │ │ └── hblas3 │ │ │ ├── CMakeLists.txt │ │ │ ├── Gemm.cmake │ │ │ ├── gemm.cxx │ │ │ └── kern.cxx │ ├── fortran │ │ ├── CMakeLists.txt │ │ ├── hblas1 │ │ │ ├── haxpych.f │ │ │ ├── haxpydh.f │ │ │ ├── haxpyhh.f │ │ │ ├── hdotc.f │ │ │ ├── hdotu.f │ │ │ ├── hscalc.f │ │ │ ├── hscald.f │ │ │ └── hscalh.f │ │ ├── hblas2 │ │ │ ├── hgemvdd.f │ │ │ ├── hgemvdh.f │ │ │ ├── hgemvdz.f │ │ │ ├── hgemvhd.f │ │ │ ├── hgemvhh.f │ │ │ ├── hgemvhz.f │ │ │ ├── hgemvzd.f │ │ │ ├── hgemvzh.f │ │ │ ├── hgemvzz.f │ │ │ ├── hgercd.f │ │ │ ├── hgerch.f │ │ │ ├── hgercz.f │ │ │ ├── hgerud.f │ │ │ ├── hgeruh.f │ │ │ └── hgeruz.f │ │ ├── hblas3 │ │ │ ├── hgemmdd.f │ │ │ ├── hgemmdh.f │ │ │ ├── hgemmdz.f │ │ │ ├── hgemmhd.f │ │ │ ├── hgemmhh.f │ │ │ ├── hgemmhz.f │ │ │ ├── hgemmzd.f │ │ │ ├── hgemmzh.f │ │ │ └── hgemmzz.f │ │ └── util │ │ │ ├── hdexp.f │ │ │ ├── hzcon.f │ │ │ ├── hzexp.f │ │ │ └── lsame.f │ ├── hblas1.cxx.in │ ├── hblas2.cxx.in │ ├── hblas3.cxx.in │ └── util.cxx └── tune │ ├── CMakeLists.txt │ └── gemm.cxx └── tests ├── CMakeLists.txt ├── haxx_algebra.cxx ├── haxx_binary_op.cxx ├── haxx_unary_op.cxx ├── haxx_ut.hpp ├── hblas1.cxx ├── hblas2.cxx ├── hblas3.cxx ├── hblas_util.cxx └── ut.cxx /.gitignore: -------------------------------------------------------------------------------- 1 | *build*/ 2 | docs/ 3 | .*.swp 4 | .*.swo 5 | *.o 6 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | os: 2 | - linux 3 | language: cpp 4 | compiler: 5 | - gcc 6 | # - clang 7 | env: 8 | - GCC_VERSION=4.8 9 | - GCC_VERSION=4.9 10 | - GCC_VERSION=5 11 | matrix: 12 | exclude: 13 | - compiler: clang 14 | env: GCC_VERSION=4.8 15 | - compiler: clang 16 | env: GCC_VERSION=4.9 17 | sudo: false 18 | addons: 19 | apt: 20 | sources: 21 | - george-edison55-precise-backports 22 | - ubuntu-toolchain-r-test 23 | - llvm-toolchain-precise-3.8 24 | - boost-latest 25 | packages: 26 | - gfortran 27 | - g++-4.8 28 | - g++-4.9 29 | - g++-5 30 | - clang-3.8 31 | - libboost1.55-all-dev 32 | install: true 33 | script: 34 | - ./bin/travisci.sh 35 | after_failure: 36 | - cat build/Testing/Temporary/LastTest.log 37 | -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # This file is a part of HAXX 2 | # 3 | # Copyright (c) 2017 David Williams-Young 4 | # All rights reserved. 5 | # 6 | # See LICENSE.txt 7 | # 8 | cmake_minimum_required(VERSION 3.0) 9 | project(HAXX CXX C Fortran) 10 | set(HAXX_VERSION_MAJOR ALPHA) 11 | set(HAXX_VERSION_MINOR 1) 12 | set(HAXX_VERSION_PATCH 0) 13 | 14 | 15 | include_directories("${PROJECT_SOURCE_DIR}/include") 16 | include_directories("${PROJECT_BINARY_DIR}/include") 17 | 18 | configure_file( 19 | "${PROJECT_SOURCE_DIR}/include/haxx_config.hpp.in" 20 | "${PROJECT_BINARY_DIR}/include/haxx_config.hpp" 21 | ) 22 | 23 | set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} ${PROJECT_SOURCE_DIR}/cmake) 24 | 25 | if(CMAKE_CXX_COMPILER_ID MATCHES "Clang") 26 | message(STATUS "${CMAKE_CXX_COMPILER_ID} Recognized as CLANG: Adding -D_HAXX_UT_BUTF_NINCLUDED") 27 | add_definitions("-D_HAXX_UT_BUTF_NINCLUDED") 28 | endif() 29 | 30 | 31 | # Options 32 | option( HAXX_ENABLE_BENCHMARK "Enable Build of HAXX / HBLAS Benchmark" OFF ) 33 | option( HAXX_ENABLE_TESTING "Enable Build of HAXX / HBLAS Unit Tests" ON ) 34 | option( HAXX_USE_HOST_SIMD "Use Host Native Flags for SIMD" ON ) 35 | 36 | include(HAXXCompilerFlags) 37 | include(HAXXHandleSIMD) 38 | 39 | add_subdirectory(src) 40 | 41 | if( HAXX_ENABLE_TESTING ) 42 | enable_testing() 43 | add_subdirectory(tests) 44 | endif() 45 | 46 | 47 | # Install Headers 48 | install(FILES ${PROJECT_SOURCE_DIR}/include/haxx.hpp DESTINATION include) 49 | install(FILES ${PROJECT_SOURCE_DIR}/include/hblas.hpp DESTINATION include) 50 | install(DIRECTORY ${PROJECT_SOURCE_DIR}/include/haxx DESTINATION include) 51 | install(DIRECTORY ${PROJECT_SOURCE_DIR}/include/hblas DESTINATION include) 52 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | Copyright (C) 2017 David Williams-Young 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted provided that the following conditions are met: 6 | 7 | 1. Redistributions of source code must retain the above copyright notice, this 8 | list of conditions and the following disclaimer. 9 | 10 | 2. Redistributions in binary form must reproduce the above copyright notice, 11 | this list of conditions and the following disclaimer in the documentation 12 | and/or other materials provided with the distribution. 13 | 14 | 3. Neither the name of the copyright holder nor the names of its contributors 15 | may be used to endorse or promote products derived from this software without 16 | specific prior written permission. 17 | 18 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 19 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 20 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 21 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 22 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 23 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 24 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 25 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 26 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 27 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 28 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![Build Status](https://travis-ci.org/wavefunction91/HAXX.svg?branch=master)](https://travis-ci.org/wavefunction91/HAXX) 2 | 3 | 4 | Synopsis 5 | ======== 6 | **HAXX** (**H**amilton's Quaternion **A**lgebra for C**XX**) is a C++ software 7 | infrastructure for the development of efficient scalar and tensorial quaternion 8 | algorithms. HAXX can be thought of as two interdependent C++ software 9 | libraries: 10 | 11 | * The HAXX scalar quaternion class which handles the scalar operations 12 | (+,-,etc) over the quaternion numbers 13 | * HBLAS for quaternion and mixed-type linear algebra 14 | 15 | 16 | **HBLAS** (**H**amilton's Quaternion **B**asic **L**inear **A**lgebra 17 | **S**ubroutines) provides a BLAS-like interface for matrices and vectors over 18 | the quaternion numbers. As HBLAS depends solely on the HAXX scalar quaternion 19 | infrastructure, there is no plan currently to release them separately. 20 | Currently, HBLAS provides an optimized (serial) software implementation of quaternion 21 | matrix operations for AVX and AVX2 microarchitectures (see [arXiv:1903.05575](http://arxiv.org/abs/1903.05575) for details). 22 | 23 | 24 | HAXX is currently a development code which has been hand tuned for a select few microarchitectures to demonstrate the 25 | efficacy of such operations on modern computing platforms. 26 | The default caching parameters shipped with HAXX are optimized for the Intel(R) Xeon(R) CPU E5-2660 (Sandy Bridge) processor. 27 | The API specification in HAXX is very flexible, but most of the flexibility is not directly user-facing. If there is interest 28 | in exposing such functionality, please open a GitHub issue. 29 | 30 | A primary goal of HBLAS is not only to provide a 31 | convenient and efficient interface for quaternion-quaternion linear algebra, 32 | but also to efficiently handle mixed-type (quaternion-real, quaternion-complex) 33 | linear algebra through their natural embeddings into the quaternion algebra. 34 | HAXX is actively being developed with little focus on backwards compatibility 35 | with previous versions. The HAXX and HBLAS interfaces are constantly evolving 36 | and can (will) change on a regular basis as new, exciting functionality is 37 | added. 38 | 39 | Design Goals 40 | ============ 41 | * A high-level, modern C++ API for scalar quaternion algebra (addition, 42 | subtraction, multiplication, division) 43 | * Access to low level optimization and vectorization of the real arithmetic 44 | underlying quaternion operations 45 | * Extension of BLAS functionality to quaternion algebra and mixed-type 46 | expressions (HBLAS) 47 | * A reusable software framework to enable future scalar and tensorial 48 | algorithmic development using the quaternion algebra 49 | 50 | Developers 51 | ========== 52 | David Williams-Young (Computational Research Division / Lawrence Berkeley National Laboratory)
53 | E-Mail: dbwy at lbl dot gov 54 | -------------------------------------------------------------------------------- /bin/travisci.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | set -e 4 | 5 | if [ "$CXX" = "g++" ]; then 6 | export CC=/usr/bin/gcc-$GCC_VERSION 7 | export CXX=/usr/bin/g++-$GCC_VERSION 8 | else 9 | # no OpenMP support in clang, will use C++11 threads 10 | export CC=/usr/bin/clang-3.8 11 | export CXX=/usr/bin/clang++-3.8 12 | fi 13 | export CXXFLAGS="-std=c++11 -O3" 14 | export FC=gfortran 15 | 16 | 17 | CMAKE_URL="https://cmake.org/files/v3.7/cmake-3.7.2-Linux-x86_64.tar.gz" 18 | 19 | wget --no-check-certificate --quiet -O - $CMAKE_URL | tar --strip-components=1 -xz -C cmake 20 | PATH=$PWD/cmake/bin":"$PATH 21 | 22 | 23 | mkdir build && cd build 24 | cmake --version 25 | cmake -DBOOST_LIBRARYDIR='/usr/lib' -DCMAKE_Fortran_FLAGS='-O3' .. 26 | make -j2 27 | make test 28 | -------------------------------------------------------------------------------- /cmake/FindSIMD.cmake: -------------------------------------------------------------------------------- 1 | # Determine SIMD instruction set if any 2 | 3 | 4 | message( STATUS "Attempting to determine SIMD instructions") 5 | 6 | # LINUX checks 7 | if( CMAKE_SYSTEM_NAME MATCHES "Linux" ) 8 | 9 | # Get proc info 10 | EXEC_PROGRAM( cat ARGS "/proc/cpuinfo" OUTPUT_VARIABLE CPUINFO ) 11 | 12 | 13 | # Check AVX and AVX2 14 | STRING( REGEX REPLACE "^.*(avx).*$" "\\1" AVX_THERE ${CPUINFO} ) 15 | STRING( REGEX REPLACE "^.*(avx2).*$" "\\1" AVX2_THERE ${CPUINFO} ) 16 | 17 | STRING( COMPARE EQUAL "avx" "${AVX_THERE}" AVX_TRUE ) 18 | STRING( COMPARE EQUAL "avx2" "${AVX2_THERE}" AVX2_TRUE ) 19 | 20 | 21 | # NON-LINUX defaults to generic code 22 | else() 23 | 24 | set( AVX_TRUE false) 25 | set( AVX2_TRUE false) 26 | set( AVX512_TRUE false) 27 | 28 | endif() 29 | 30 | 31 | set( AVX_FOUND ${AVX_TRUE} CACHE BOOL "AVX instructions available" ) 32 | set( AVX2_FOUND ${AVX2_TRUE} CACHE BOOL "AVX2 instructions available" ) 33 | set( AVX512_FOUND ${AVX512_TRUE} CACHE BOOL "AVX512 instructions available" ) 34 | 35 | 36 | if( AVX2_FOUND ) 37 | message( STATUS "-- AVX2 is largest available SIMD instruction set" ) 38 | elseif( AVX_FOUND ) 39 | message( STATUS "-- AVX is largest available SIMD instruction set" ) 40 | endif() 41 | 42 | -------------------------------------------------------------------------------- /cmake/HAXXCompilerFlags.cmake: -------------------------------------------------------------------------------- 1 | include(CheckCXXCompilerFlag) 2 | include(CheckFortranCompilerFlag) 3 | 4 | # Handle C++14 Flags 5 | set(CMAKE_CXX_STANDARD_REQUIRED ON) 6 | set(CMAKE_CXX_STANDARD 14) 7 | 8 | if(CMAKE_CXX_COMPILER_ID MATCHES "Intel") 9 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++14") 10 | endif() 11 | 12 | # Check for FOTRAN preprocessor 13 | check_fortran_compiler_flag("-fpp" FC_USES_FPP) 14 | check_fortran_compiler_flag("-cpp" FC_USES_CPP) 15 | 16 | if(FC_USES_FPP) 17 | set(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} -fpp") 18 | elseif(FC_USES_CPP) 19 | set(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} -cpp") 20 | else() 21 | message(FATAL "Unable to Determine a Suitable FORTRAN Preprocessor") 22 | endif() 23 | 24 | 25 | # Check IPO 26 | check_cxx_compiler_flag("-ipo" CXX_USES_IPO) 27 | 28 | if( CXX_USES_IPO AND NOT DISABLE_IPO ) 29 | set( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -ipo" ) 30 | endif() 31 | 32 | 33 | # Host SIMD flags 34 | if( HAXX_USE_HOST_SIMD ) 35 | 36 | # Determine CXX opt flags 37 | check_cxx_compiler_flag("-march=native" CXX_USES_MARCH_NATIVE) 38 | check_cxx_compiler_flag("-xHost" CXX_USES_XHOST ) 39 | 40 | check_fortran_compiler_flag("-march=native" FC_USES_MARCH_NATIVE) 41 | check_fortran_compiler_flag("-xHost" FC_USES_XHOST ) 42 | 43 | # Add Host flags 44 | if( CXX_USES_XHOST ) 45 | set( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -xHost" ) 46 | elseif( CXX_USES_MARCH_NATIVE ) 47 | set( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=native -O3" ) 48 | else() 49 | message( WARNING "Unable to determine proper HOST flags for CXX compiler" ) 50 | endif() 51 | 52 | if( FC_USES_XHOST ) 53 | set( CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} -xHost" ) 54 | elseif( FC_USES_MARCH_NATIVE ) 55 | set( CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} -march=native" ) 56 | else() 57 | message( WARNING "Unable to determine proper HOST flags for FC" ) 58 | endif() 59 | 60 | endif() 61 | 62 | # HAXX Types 63 | 64 | # Index Integer Type 65 | if( NOT HAXX_INT ) 66 | set( HAXX_INT int32_t ) 67 | endif() 68 | add_definitions("-DHAXX_INT=${HAXX_INT}") 69 | 70 | 71 | -------------------------------------------------------------------------------- /cmake/HAXXHandleSIMD.cmake: -------------------------------------------------------------------------------- 1 | if( HAXX_USE_HOST_SIMD ) 2 | 3 | # Try to determine SIMD 4 | include( FindSIMD ) 5 | 6 | if( NOT AVX_FOUND AND NOT AVX2_FOUND AND NOT AVX512_FOUND ) 7 | message( WARNING "HAXX only provided optimal implementations for AVX, AVX2 and AVX-512 -- Defaulting to Generic FORTRAN build" ) 8 | 9 | set( ENABLE_GENERIC_FORTRAN true CACHE BOOL "Enable generic FORTRAN code" ) 10 | set( ENABLE_GENERIC_CXX true CACHE BOOL "Enable generic CXX code" ) 11 | 12 | else() 13 | 14 | message( STATUS "HAXX Found a suitable SIMD instruction set -- Enabling Optimized code" ) 15 | 16 | set( ENABLE_GENERIC_FORTRAN false CACHE BOOL "Enable generic FORTRAN code" ) 17 | set( ENABLE_GENERIC_CXX false CACHE BOOL "Enable generic CXX code" ) 18 | 19 | endif() 20 | 21 | 22 | endif() 23 | -------------------------------------------------------------------------------- /include/haxx.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | * This file is a part of HAXX 3 | * 4 | * Copyright (c) 2017 David Williams-Young 5 | * All rights reserved. 6 | * 7 | * See LICENSE.txt 8 | */ 9 | 10 | #ifndef __INCLUDED_HAXX_HPP 11 | #define __INCLUDED_HAXX_HPP 12 | 13 | 14 | // HAXX quaternion definition 15 | #include "haxx/haxx_def.hpp" 16 | 17 | // Scalar Operators 18 | #include "haxx/haxx_scalar_op.hpp" 19 | 20 | // Complex Operators 21 | #include "haxx/haxx_complex_op.hpp" 22 | 23 | // Quaternion Operators 24 | #include "haxx/haxx_quaternion_op.hpp" 25 | 26 | // Quaternion Reduction 27 | #include "haxx/haxx_quaternion_reduction.hpp" 28 | 29 | 30 | #endif 31 | -------------------------------------------------------------------------------- /include/haxx/haxx_quaternion_op.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | * This file is a part of HAXX 3 | * 4 | * Copyright (c) 2017 David Williams-Young 5 | * All rights reserved. 6 | * 7 | * See LICENSE.txt 8 | */ 9 | 10 | #ifndef __INCLUDED_HAXX_QUATERNION_OP_HPP 11 | #define __INCLUDED_HAXX_QUATERNION_OP_HPP 12 | 13 | #include "haxx/haxx_def.hpp" 14 | 15 | namespace HAXX { 16 | 17 | /** 18 | * Assigns a quaternion of a different type to this quaternion 19 | */ 20 | template 21 | template 22 | quaternion<_F>& quaternion<_F>::operator=(const quaternion<_G> &__q) { 23 | 24 | _M_real = __q.real(); 25 | _M_imag_i = __q.imag_i(); 26 | _M_imag_j = __q.imag_j(); 27 | _M_imag_k = __q.imag_k(); 28 | 29 | return *this; 30 | 31 | }; 32 | 33 | /** 34 | * \f$ p = p + q \qquad p,q \in \mathbb{H} \f$ 35 | */ 36 | template 37 | template 38 | quaternion<_F>& quaternion<_F>::operator+=(const quaternion<_G> &__q) { 39 | 40 | _M_real += __q.real(); 41 | _M_imag_i += __q.imag_i(); 42 | _M_imag_j += __q.imag_j(); 43 | _M_imag_k += __q.imag_k(); 44 | 45 | return *this; 46 | 47 | }; 48 | 49 | /** 50 | * \f$ p = p - q \qquad p,q \in \mathbb{H} \f$ 51 | */ 52 | template 53 | template 54 | quaternion<_F>& quaternion<_F>::operator-=(const quaternion<_G> &__q) { 55 | 56 | _M_real -= __q.real(); 57 | _M_imag_i -= __q.imag_i(); 58 | _M_imag_j -= __q.imag_j(); 59 | _M_imag_k -= __q.imag_k(); 60 | 61 | return *this; 62 | 63 | }; 64 | 65 | 66 | 67 | 68 | /** 69 | * \f$ r = -q = (-q^R, -q^I, -q^J, -q^K) \f$ 70 | */ 71 | template 72 | inline quaternion<_F> operator-(const quaternion<_F>& __x) { 73 | 74 | return quaternion<_F>(-__x.real(),-__x.imag_i(),-__x.imag_j(),-__x.imag_k()); 75 | 76 | } 77 | 78 | 79 | /** 80 | * \f$r = p + q \qquad r,p,q\in\mathbb{H} \f$ 81 | */ 82 | template 83 | inline quaternion<_F> operator+(const quaternion<_F>& __x, 84 | const quaternion<_F>& __y) { 85 | 86 | quaternion<_F> __r = __x; 87 | __r += __y; 88 | return __r; 89 | 90 | }; 91 | 92 | /** 93 | * \f$r = p - q \qquad r,p,q\in\mathbb{H} \f$ 94 | */ 95 | template 96 | inline quaternion<_F> operator-(const quaternion<_F>& __x, 97 | const quaternion<_F>& __y) { 98 | 99 | quaternion<_F> __r = __x; 100 | __r -= __y; 101 | return __r; 102 | 103 | }; 104 | 105 | /** 106 | * \f$ r = pq \qquad r,p,q\in\mathbb{H} \f$ 107 | */ 108 | template 109 | inline quaternion<_F> operator*(const quaternion<_F>& __x, 110 | const quaternion<_F>& __y) { 111 | 112 | quaternion<_F> __r; 113 | 114 | // This is a really naive algorithm 115 | __r.real(__x.real() * __y.real() - __x.imag_i() * __y.imag_i() - 116 | __x.imag_j() * __y.imag_j() - __x.imag_k() * __y.imag_k()); 117 | 118 | __r.imag_i(__x.real() * __y.imag_i() + __x.imag_i() * __y.real() + 119 | __x.imag_j() * __y.imag_k() - __x.imag_k() * __y.imag_j()); 120 | 121 | __r.imag_j(__x.real() * __y.imag_j() - __x.imag_i() * __y.imag_k() + 122 | __x.imag_j() * __y.real() + __x.imag_k() * __y.imag_i()); 123 | 124 | __r.imag_k(__x.real() * __y.imag_k() + __x.imag_i() * __y.imag_j() - 125 | __x.imag_j() * __y.imag_i() + __x.imag_k() * __y.real()); 126 | 127 | return __r; 128 | 129 | }; 130 | 131 | #if defined(__AVX__) || defined(__AVX2__) 132 | 133 | // C++ wrapper around SIMD quaternion multiplication 134 | template<> 135 | inline quaternion operator*(const quaternion& __x, 136 | const quaternion& __y) { 137 | 138 | quaternion __r; 139 | 140 | // Load x,y into 256-bit vector lanes and perform the multiplication 141 | __m256d x = LOAD_256D_UNALIGNED_AS(double,&__x); 142 | __m256d y = LOAD_256D_UNALIGNED_AS(double,&__y); 143 | __m256d r = MULDQ_NN(x,y); 144 | 145 | STORE_256D_UNALIGNED_AS(double,&__r,r); 146 | 147 | return __r; 148 | }; 149 | 150 | #endif 151 | 152 | /** 153 | * Returns true iff all of the elements of quaternion \f$q\f$ are the 154 | * same as quaternoin \f$p\f$. 155 | */ 156 | template 157 | inline bool operator==(const quaternion<_F>& p, const quaternion<_F>& q) { 158 | 159 | return p.real() == q.real() and p.imag_i() == q.imag_i() 160 | and p.imag_j() == q.imag_j() and q.imag_k() == q.imag_k(); 161 | 162 | } 163 | 164 | /** 165 | * Returns true iff all of the elements of quaternion \f$q\f$ are the 166 | * same as quaternoin \f$p\f$. 167 | */ 168 | template 169 | inline bool operator!=(const quaternion<_F>& p, const quaternion<_F>& q) { 170 | 171 | return not (p == q); 172 | 173 | } 174 | 175 | }; // HAXX namespace 176 | 177 | #endif 178 | -------------------------------------------------------------------------------- /include/haxx/haxx_quaternion_reduction.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | * This file is a part of HAXX 3 | * 4 | * Copyright (c) 2017 David Williams-Young 5 | * All rights reserved. 6 | * 7 | * See LICENSE.txt 8 | */ 9 | 10 | #ifndef __INCLUDED_HAXX_QUATERNION_REDUCTION_HPP 11 | #define __INCLUDED_HAXX_QUATERNION_REDUCTION_HPP 12 | 13 | #include "haxx/haxx_def.hpp" 14 | 15 | namespace HAXX { 16 | 17 | /** 18 | * \f$ \vert\vert q \vert\vert = \sqrt{a^2 + b^2 + c^2 + d^2} 19 | * \qquad q = (a,b,c,d) \in \mathbb{H}\f$ 20 | * 21 | * Return type is the value_type 22 | */ 23 | template 24 | inline _F norm(const quaternion<_F>& __q) { 25 | _F nmsq = __q.real() * __q.real(); 26 | nmsq += __q.imag_i() * __q.imag_i(); 27 | nmsq += __q.imag_j() * __q.imag_j(); 28 | nmsq += __q.imag_k() * __q.imag_k(); 29 | 30 | return std::sqrt(nmsq); 31 | }; 32 | 33 | /** 34 | * \f$ q^* = (q^R,-q^I,-q^J,-q^K)\f$ 35 | */ 36 | template 37 | inline quaternion<_F> conj(const quaternion<_F>& __q) { 38 | 39 | return quaternion<_F>(__q.real(),-__q.imag_i(),-__q.imag_j(),-__q.imag_k()); 40 | 41 | }; 42 | 43 | // Attempt at SIMD conjugate, slows down the code 44 | #if 0 45 | //#if defined(__AVX__) || defined(__AVX2__) 46 | 47 | template<> 48 | inline quaternion conj(const quaternion &__q) { 49 | 50 | quaternion __r; 51 | 52 | __m256d r = LOAD_256D_UNALIGNED_AS(double,&__q); 53 | r = QCONJ_256D(r); 54 | 55 | STORE_256D_UNALIGNED_AS(double,&__r,r); 56 | 57 | return __r; 58 | 59 | } 60 | 61 | #endif 62 | 63 | /** 64 | * \f$ q^{-1} = \dfrac{q^*}{\vert\vert q \vert\vert^2} \f$ 65 | */ 66 | template 67 | inline quaternion<_F> inv(const quaternion<_F>& __q) { 68 | 69 | _F nrm = norm(__q); 70 | return conj(__q) / nrm / nrm; 71 | 72 | }; 73 | 74 | /** 75 | * \f$ r = \dfrac{q}{\vert\vert q \vert\vert} \f$ 76 | */ 77 | template 78 | inline quaternion<_F> versor(const quaternion<_F>& __q) { 79 | 80 | _F nrm = norm(__q); 81 | return __q / nrm; 82 | 83 | }; 84 | 85 | /** 86 | * \f$ [p,q] = pq - qp \f$ 87 | */ 88 | template 89 | inline quaternion<_F> comm(const quaternion<_F>& p, const quaternion<_F>& q) { 90 | 91 | return p * q - q * p; 92 | 93 | }; 94 | 95 | template 96 | inline quaternion<_F> comm(const quaternion<_F>& p, const std::complex<_F>& q) { 97 | /* 98 | std::complex jPt(p.imag_j(),p.imag_k()); 99 | return quaternion<_F>(std::complex(0.),2.*jPt*std::imag(q)); 100 | */ 101 | return p * q - q * p; 102 | }; 103 | 104 | template 105 | inline quaternion<_F> comm(const std::complex<_F>& p, const quaternion<_F>& q) { 106 | 107 | return -comm(q,p); 108 | 109 | } 110 | 111 | 112 | template 113 | inline quaternion<_F> comm(const _F& p, const quaternion<_F>& q) { 114 | 115 | return quaternion<_F>(0.); 116 | 117 | } 118 | 119 | template 120 | inline quaternion<_F> comm(const quaternion<_F>& q, const _F& p) { 121 | 122 | return quaternion<_F>(0.); 123 | 124 | } 125 | 126 | 127 | 128 | template<> inline double SmartConj( double &x ) { return x; } 129 | template<> 130 | inline std::complex SmartConj( std::complex &x ) { 131 | return std::conj(x); 132 | } 133 | template<> 134 | inline quaternion SmartConj( quaternion &x ) { 135 | return conj(x); 136 | } 137 | 138 | 139 | 140 | }; 141 | 142 | #endif 143 | -------------------------------------------------------------------------------- /include/haxx_config.hpp.in: -------------------------------------------------------------------------------- 1 | /* 2 | * This file is a part of HAXX 3 | * 4 | * Copyright (c) 2017 David Williams-Young 5 | * All rights reserved. 6 | * 7 | * See LICENSE.txt 8 | */ 9 | 10 | #ifndef __INCLUDED_HAXX_CONFIG_HPP 11 | #define __INCLUDED_HAXX_CONFIG_HPP 12 | 13 | // BLAS-3 Params 14 | #cmakedefine MC ${MC} 15 | #cmakedefine NC ${NC} 16 | #cmakedefine KC ${KC} 17 | 18 | 19 | #endif 20 | -------------------------------------------------------------------------------- /include/hblas.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | * This file is a part of HAXX 3 | * 4 | * Copyright (c) 2017 David Williams-Young 5 | * All rights reserved. 6 | * 7 | * See LICENSE.txt 8 | */ 9 | #ifndef __INCLUDED_HBLAS_HPP 10 | #define __INCLUDED_HBLAS_HPP 11 | 12 | /** 13 | * @defgroup HBLAS Quaternion BLAS (HBLAS) 14 | * BLAS-Like Routines over the quaternion numbers 15 | */ 16 | 17 | #include "hblas/hblas1.hpp" // Level 1 HBLAS 18 | #include "hblas/hblas2.hpp" // Level 2 HBLAS 19 | #include "hblas/hblas3.hpp" // Level 3 HBLAS 20 | 21 | #endif 22 | -------------------------------------------------------------------------------- /include/hblas/config/hblas3/gemm.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | * This file is a part of HAXX 3 | * 4 | * Copyright (c) 2017 David Williams-Young 5 | * All rights reserved. 6 | * 7 | * See LICENSE.txt 8 | */ 9 | 10 | #ifndef __INCLUDED_HBLAS_CONFIG_HBLAS3_GEMM_HPP 11 | #define __INCLUDED_HBLAS_CONFIG_HBLAS3_GEMM_HPP 12 | 13 | 14 | #include "hblas/config/types.hpp" 15 | 16 | // Caching dimensions 17 | 18 | #ifndef MC 19 | #define MC 64 20 | #endif 21 | #ifndef NC 22 | #define NC 512 23 | #endif 24 | #ifndef KC 25 | #define KC 64 26 | #endif 27 | 28 | // Register block size 29 | #define MR 2 30 | #define NR 2 31 | 32 | 33 | // Determine where to factor ALPHA scaling 34 | #define _FACTOR_ALPHA_IN_A_PACK 35 | //#define _FACTOR_ALPHA_IN_B_PACK 36 | 37 | #if defined(_FACTOR_ALPHA_IN_A_PACK) && defined(_FACTOR_ALPHA_IN_B_PACK) 38 | #error "Cannot factor ALPHA into both A and B packs" 39 | #endif 40 | 41 | 42 | // Determine where to factor the transpose operation 43 | // for GEMM kernel 44 | #define _FACTOR_TRANSPOSE_INTO_A_PACK 45 | #define _FACTOR_TRANSPOSE_INTO_B_PACK 46 | 47 | 48 | 49 | // Determine packing utility for GEMM 50 | 51 | #include "hblas/pack/pack.hpp" 52 | 53 | #ifdef _FACTOR_TRANSPOSE_INTO_B_PACK 54 | 55 | #define BPACKT NPACK< NR, _AMATF, GenericPackOps_T2<> > 56 | #define BPACKCT NPACK< NR, _AMATF, ConjPackOps_T2 <> > 57 | #define BPACKR TPACK< NR, _AMATF, ConjPackOps_T2 <> > 58 | #define BPACK TPACK< NR, _AMATF, GenericPackOps_T2<> > 59 | 60 | #else 61 | 62 | #define BPACKT NPACK< NR, _BMATF, GenericPackOps<_BMATF> > 63 | #define BPACKCT NPACK< NR, _BMATF, ConjPackOps <_BMATF> > 64 | #define BPACKR TPACK< NR, _BMATF, ConjPackOps <_BMATF> > 65 | #define BPACK TPACK< NR, _BMATF, GenericPackOps<_BMATF> > 66 | 67 | #endif 68 | 69 | #ifdef _FACTOR_TRANSPOSE_INTO_A_PACK 70 | 71 | #define APACKT TPACK< MR, _AMATF, GenericPackOps_T1<> > 72 | #define APACKCT TPACK< MR, _AMATF, ConjPackOps_T1 <> > 73 | #define APACKR NPACK< MR, _AMATF, ConjPackOps_T1 <> > 74 | #define APACK NPACK< MR, _AMATF, GenericPackOps_T1<> > 75 | 76 | #else 77 | 78 | #define APACKT TPACK< MR, _AMATF, GenericPackOps<_AMATF> > 79 | #define APACKCT TPACK< MR, _AMATF, ConjPackOps <_AMATF> > 80 | #define APACKR NPACK< MR, _AMATF, ConjPackOps <_AMATF> > 81 | #define APACK NPACK< MR, _AMATF, GenericPackOps<_AMATF> > 82 | 83 | #endif 84 | 85 | #endif 86 | -------------------------------------------------------------------------------- /include/hblas/config/types.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | * This file is a part of HAXX 3 | * 4 | * Copyright (c) 2017 David Williams-Young 5 | * All rights reserved. 6 | * 7 | * See LICENSE.txt 8 | */ 9 | 10 | #ifndef __INCLUDED_HBLAS_CONFIG_TYPES_HPP 11 | #define __INCLUDED_HBLAS_CONFIG_TYPES_HPP 12 | 13 | #include "util/constants.hpp" 14 | 15 | // Determine type of scaling parameter ALPHA 16 | #ifdef ALPHAF 17 | #if ALPHAF == DOUBLE 18 | #define _ALPHAF double 19 | #elif ALPHAF == DCOMPLEX 20 | #define _ALPHAF std::complex 21 | #elif ALPHAF == DQUATERNION 22 | #define _ALPHAF quaternion 23 | #endif 24 | #endif 25 | 26 | // Determine type of scaling parameter BETA 27 | #ifdef BETAF 28 | #if BETAF == DOUBLE 29 | #define _BETAF double 30 | #elif BETAF == DCOMPLEX 31 | #define _BETAF std::complex 32 | #elif BETAF == DQUATERNION 33 | #define _BETAF quaternion 34 | #endif 35 | #endif 36 | 37 | // Determine type of matrix type AMAT 38 | #ifdef AMATF 39 | #if AMATF == DOUBLE 40 | #define _AMATF double 41 | #elif AMATF == DCOMPLEX 42 | #define _AMATF std::complex 43 | #elif AMATF == DQUATERNION 44 | #define _AMATF quaternion 45 | #endif 46 | #endif 47 | 48 | // Determine type of matrix type BMAT 49 | #ifdef BMATF 50 | #if BMATF == DOUBLE 51 | #define _BMATF double 52 | #elif BMATF == DCOMPLEX 53 | #define _BMATF std::complex 54 | #elif BMATF == DQUATERNION 55 | #define _BMATF quaternion 56 | #endif 57 | #endif 58 | 59 | 60 | 61 | #endif 62 | -------------------------------------------------------------------------------- /include/hblas/generic/hblas1/hblas_axpym_impl.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | * This file is a part of HAXX 3 | * 4 | * Copyright (c) 2017 David Williams-Young 5 | * All rights reserved. 6 | * 7 | * See LICENSE.txt 8 | */ 9 | 10 | #pragma once 11 | 12 | #include 13 | #include "hblas/hblas1.hpp" 14 | 15 | namespace HAXX { 16 | 17 | /** 18 | * Written by DBWY (4/2017) 19 | */ 20 | template 21 | void HBLAS_AXPYM(const char SIDE, const char TRANSA, const HAXX_INT M, 22 | const HAXX_INT N, const _AlphaF ALPHA, _XF * const A, const HAXX_INT LDA, 23 | const HAXX_INT INCA, quaternion<_F> * const B, const HAXX_INT LDB, 24 | const HAXX_INT INCB) { 25 | 26 | 27 | if( N <= 0 or M <= 0) return; 28 | if( ALPHA == _AlphaF(0.) ) return; 29 | 30 | assert(TRANSA == 'N'); // Only supporting scaling for now 31 | 32 | HAXX_INT j; 33 | 34 | quaternion<_F> *locA = A, *locB = B; 35 | 36 | for( j = 0; j < N; j++ ) { 37 | HBLAS_AXPYV(SIDE,M,ALPHA,locA,INCA,locB,INCB); 38 | locA += LDA; 39 | locB += LDB; 40 | } 41 | 42 | }; 43 | 44 | 45 | 46 | 47 | }; // namespace HAXX 48 | -------------------------------------------------------------------------------- /include/hblas/generic/hblas1/hblas_axpyv_impl.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | * This file is a part of HAXX 3 | * 4 | * Copyright (c) 2017 David Williams-Young 5 | * All rights reserved. 6 | * 7 | * See LICENSE.txt 8 | */ 9 | 10 | #pragma once 11 | 12 | #include "hblas/hblas1.hpp" 13 | 14 | #define HAXX_AXPY_UNROLL 4 15 | 16 | namespace HAXX { 17 | 18 | /** 19 | * Written by DBWY (4/2017) 20 | * 21 | * Based on the BLAS implementaion of DCOPY by Jack Dongarra 22 | * http://www.netlib.org/lapack/explore-html/d9/dcd/daxpy_8f.html 23 | * 24 | * 25 | * \f$ y \in \mathbb{H} \qquad x,\alpha \in \mathbb{R},\mathbb{C},\mathbb{H} \f$ 26 | * 27 | * SIDE == 'L' 28 | * 29 | * \f$ y_i = \alpha x_i + y_i\f$ 30 | * 31 | * SIDE == 'R' 32 | * 33 | * \f$ y_i = x_i \alpha + y_i \f$ 34 | */ 35 | template 36 | void HBLAS_AXPYV(const char SIDE, const HAXX_INT N, const _AlphaF ALPHA, 37 | _XF * const X, const HAXX_INT INCX, quaternion<_F> * const Y, 38 | const HAXX_INT INCY) { 39 | 40 | 41 | if( N <= 0 ) return; 42 | if( ALPHA == _AlphaF(0.) ) return; 43 | 44 | // FIXME: See further comments on negative stride 45 | assert(INCX > 0); 46 | assert(INCY > 0); 47 | 48 | HAXX_INT i; 49 | 50 | // FIXME: Should write a specialization for real ALPHA where side 51 | // doesnt matter 52 | bool isR = SIDE == 'R'; 53 | bool isL = not isR; 54 | 55 | if( INCX == 1 and INCY == 1 ) { 56 | HAXX_INT m = N % HAXX_AXPY_UNROLL; 57 | 58 | if( m != 0) { 59 | if( isL ) for( i = 0; i < m; ++i ) Y[i] += ALPHA * X[i]; 60 | else for( i = 0; i < m; ++i ) Y[i] += X[i] * ALPHA; 61 | 62 | // XXX: DAXPY has this outside of the if-check? Unline COPY and SCAL 63 | if( N < HAXX_AXPY_UNROLL ) return; 64 | } 65 | 66 | // FIXME: This assumes HAXX_AXPY_UNROLL = 4 67 | if( isL ) 68 | for( i = m; i < N; i += HAXX_AXPY_UNROLL ) { 69 | Y[i] += ALPHA * X[i]; 70 | Y[i+1] += ALPHA * X[i+1]; 71 | Y[i+2] += ALPHA * X[i+2]; 72 | Y[i+3] += ALPHA * X[i+3]; 73 | } 74 | else 75 | for( i = m; i < N; i += HAXX_AXPY_UNROLL ) { 76 | Y[i] += X[i] * ALPHA; 77 | Y[i+1] += X[i+1] * ALPHA; 78 | Y[i+2] += X[i+2] * ALPHA; 79 | Y[i+3] += X[i+3] * ALPHA; 80 | } 81 | } else { 82 | 83 | HAXX_INT ix(0), iy(0); 84 | // FIXME: the original _AXPY function has code here to handle 85 | // negative increments. Unsure on what that accomplishes 86 | 87 | if( isL ) 88 | for( i = 0; i < N; ++i, ix += INCX, iy += INCY ) Y[iy] += ALPHA * X[ix]; 89 | else 90 | for( i = 0; i < N; ++i, ix += INCX, iy += INCY ) Y[iy] += X[ix] * ALPHA; 91 | 92 | } 93 | }; 94 | 95 | 96 | 97 | 98 | }; // namespace HAXX 99 | -------------------------------------------------------------------------------- /include/hblas/generic/hblas1/hblas_copyv_impl.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | * This file is a part of HAXX 3 | * 4 | * Copyright (c) 2017 David Williams-Young 5 | * All rights reserved. 6 | * 7 | * See LICENSE.txt 8 | */ 9 | 10 | #pragma once 11 | 12 | #include "hblas/hblas1.hpp" 13 | 14 | #define HAXX_COPY_UNROLL 7 15 | 16 | namespace HAXX { 17 | 18 | /** 19 | * Copies the elememts from one quaternion vector to another 20 | * 21 | * Written by DBWY (4/2017) 22 | * 23 | * Based on the BLAS implementaion of DCOPY by Jack Dongarra 24 | * http://www.netlib.org/lapack/explore-html/da/d6c/dcopy_8f.html 25 | */ 26 | template 27 | void HBLAS_COPYV(const HAXX_INT N, quaternion<_F> * const X, 28 | const HAXX_INT INCX, quaternion<_F> * const Y, const HAXX_INT INCY) { 29 | 30 | if( N <= 0 ) return; 31 | // FIXME: See further comments on negative stride 32 | assert(INCX > 0); 33 | assert(INCY > 0); 34 | 35 | HAXX_INT i; 36 | 37 | if( INCX == 1 and INCY == 1 ) { 38 | 39 | HAXX_INT m = N % HAXX_COPY_UNROLL; 40 | if( m != 0 ) { 41 | for( i = 0; i < m; i++ ) Y[i] = X[i]; 42 | if( N < HAXX_COPY_UNROLL ) return; 43 | } 44 | 45 | // FIXME: This assumes HAXX_COPY_UNROLL = 7 46 | for( i = m; i < N; i += HAXX_COPY_UNROLL ) { 47 | Y[i] = X[i]; 48 | Y[i+1] = X[i+1]; 49 | Y[i+2] = X[i+2]; 50 | Y[i+3] = X[i+3]; 51 | Y[i+4] = X[i+4]; 52 | Y[i+5] = X[i+5]; 53 | Y[i+6] = X[i+6]; 54 | } 55 | 56 | } else { 57 | 58 | HAXX_INT ix(0), iy(0); 59 | // FIXME: the original _COPY function has code here to handle 60 | // negative increments. Unsure on what that accomplishes 61 | 62 | for( i = 0; i < N; ++i, ix += INCX, iy += INCY ) Y[iy] = X[ix]; 63 | 64 | 65 | } 66 | }; 67 | 68 | }; // namespace HAXX 69 | 70 | 71 | -------------------------------------------------------------------------------- /include/hblas/generic/hblas1/hblas_dotcv_impl.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | * This file is a part of HAXX 3 | * 4 | * Copyright (c) 2017 David Williams-Young 5 | * All rights reserved. 6 | * 7 | * See LICENSE.txt 8 | */ 9 | 10 | #pragma once 11 | 12 | #include "hblas/hblas1.hpp" 13 | 14 | namespace HAXX { 15 | 16 | /** 17 | * Written by DBWY (4/2017) 18 | * 19 | * Based on the BLAS implementation of ZDOTU by Jack Dongarra 20 | * http://www.netlib.org/lapack/explore-html/d6/db8/zdotc_8f.html 21 | * 22 | * \f$ r,x,y \in \mathbb{H}, \qquad r = \sum_i x^*_i y_i \f$ 23 | */ 24 | template 25 | quaternion<_F> HBLAS_DOTCV(const HAXX_INT N, quaternion<_F> * const X, 26 | const HAXX_INT INCX, quaternion<_F> * const Y, const HAXX_INT INCY) { 27 | 28 | quaternion<_F> htemp(0.,0.,0.,0.); 29 | 30 | if( N <= 0 ) return htemp; 31 | 32 | // FIXME: See further comments on negative stride 33 | assert(INCX > 0); 34 | assert(INCY > 0); 35 | 36 | HAXX_INT i; 37 | 38 | if( INCX == 1 and INCY == 1 ) { 39 | 40 | for( i = 0; i < N; ++i ) htemp += conj(X[i])*Y[i]; 41 | 42 | } else { 43 | 44 | HAXX_INT ix(0), iy(0); 45 | // FIXME: the original _AXPY function has code here to handle 46 | // negative increments. Unsure on what that accomplishes 47 | 48 | for( i = 0; i < N; ++i, ix += INCX, iy += INCY ) 49 | htemp += conj(X[ix]) * Y[iy]; 50 | } 51 | 52 | return htemp; 53 | }; 54 | 55 | 56 | }; // namespace HAXX 57 | -------------------------------------------------------------------------------- /include/hblas/generic/hblas1/hblas_dotuv_impl.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | * This file is a part of HAXX 3 | * 4 | * Copyright (c) 2017 David Williams-Young 5 | * All rights reserved. 6 | * 7 | * See LICENSE.txt 8 | */ 9 | 10 | #pragma once 11 | 12 | #include "hblas/hblas1.hpp" 13 | 14 | namespace HAXX { 15 | 16 | /** 17 | * Written by DBWY (4/2017) 18 | * 19 | * Based on the BLAS implementation of ZDOTU by Jack Dongarra 20 | * http://www.netlib.org/lapack/explore-html/db/d2d/zdotu_8f.html 21 | * 22 | * \f$ r,x,y \in \mathbb{H}, \qquad r = \sum_i x_i y_i \f$ 23 | */ 24 | template 25 | quaternion<_F> HBLAS_DOTUV(const HAXX_INT N, quaternion<_F> * const X, 26 | const HAXX_INT INCX, quaternion<_F> * const Y, const HAXX_INT INCY) { 27 | 28 | quaternion<_F> htemp(0.,0.,0.,0.); 29 | if( N <= 0 ) return htemp; 30 | 31 | // FIXME: See further comments on negative stride 32 | assert(INCX > 0); 33 | assert(INCY > 0); 34 | 35 | HAXX_INT i; 36 | 37 | if( INCX == 1 and INCY == 1 ) { 38 | 39 | for( i = 0; i < N; ++i ) htemp += X[i]*Y[i]; 40 | 41 | } else { 42 | 43 | HAXX_INT ix(0), iy(0); 44 | // FIXME: the original _AXPY function has code here to handle 45 | // negative increments. Unsure on what that accomplishes 46 | 47 | for( i = 0; i < N; ++i, ix += INCX, iy += INCY ) { 48 | htemp += X[ix] * Y[iy]; 49 | } 50 | } 51 | 52 | return htemp; 53 | }; 54 | 55 | 56 | }; // namespace HAXX 57 | -------------------------------------------------------------------------------- /include/hblas/generic/hblas1/hblas_scalm_impl.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | * This file is a part of HAXX 3 | * 4 | * Copyright (c) 2017 David Williams-Young 5 | * All rights reserved. 6 | * 7 | * See LICENSE.txt 8 | */ 9 | 10 | #pragma once 11 | 12 | #include 13 | 14 | #include "hblas/hblas1.hpp" 15 | 16 | 17 | namespace HAXX { 18 | 19 | /** 20 | * Scale / Transpose / Conjugate a quaternion matrix in place 21 | * 22 | * Written by DBWY (9/2017) 23 | */ 24 | template 25 | void HBLAS_SCALM(const char SIDE, const char TRANSA, const HAXX_INT M, 26 | const HAXX_INT N, const _AlphaF ALPHA, quaternion<_F> * const A, 27 | const HAXX_INT LDA, const HAXX_INT INCA) { 28 | 29 | if( N <= 0 or M <= 0 or INCA <= 0 ) return; 30 | assert(TRANSA == 'N'); // Only supporting scaling for now 31 | 32 | HAXX_INT j; 33 | 34 | quaternion<_F> *locA = A; 35 | 36 | for( j = 0; j < N; j++ ) { 37 | HBLAS_SCALV(SIDE,M,ALPHA,locA,INCA); 38 | locA += LDA; 39 | } 40 | 41 | 42 | }; 43 | 44 | 45 | 46 | }; // namespace HAXX 47 | 48 | -------------------------------------------------------------------------------- /include/hblas/generic/hblas1/hblas_scalv_impl.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | * This file is a part of HAXX 3 | * 4 | * Copyright (c) 2017 David Williams-Young 5 | * All rights reserved. 6 | * 7 | * See LICENSE.txt 8 | */ 9 | 10 | #pragma once 11 | 12 | #include "hblas/hblas1.hpp" 13 | #include 14 | 15 | #define HAXX_SCAL_UNROLL 5 16 | 17 | namespace HAXX { 18 | 19 | /** 20 | * Scales a quaternion vector in place 21 | * 22 | * Written by DBWY (4/2017) 23 | * 24 | * Based on the BLAS implementaion of DSCAL by Jack Dongarra 25 | * http://www.netlib.org/lapack/explore-html/d4/dd0/dscal_8f.html 26 | * 27 | * SIDE == 'L' 28 | * 29 | * \f$ x_i = \alpha x_i \f$ 30 | * 31 | * SIDE == 'R' 32 | * 33 | * \f$ x_i = x_i \alpha \f$ 34 | */ 35 | template 36 | void HBLAS_SCALV(const char SIDE, const HAXX_INT N, const _AlphaF ALPHA, 37 | quaternion<_F> * const X, const HAXX_INT INCX) { 38 | 39 | if( N <= 0 or INCX <= 0 ) return; 40 | 41 | HAXX_INT i; 42 | 43 | // FIXME: Should write a specialization for real ALPHA where side 44 | // doesnt matter 45 | bool isR = SIDE == 'R'; 46 | bool isL = not isR; 47 | 48 | if( INCX == 1 ) { 49 | HAXX_INT m = N % HAXX_SCAL_UNROLL; 50 | 51 | if( m != 0 ) { 52 | if( isL ) for( i = 0; i < m; ++i ) X[i] = ALPHA * X[i]; 53 | else for( i = 0; i < m; ++i ) X[i] = X[i] * ALPHA; 54 | 55 | if( N < HAXX_SCAL_UNROLL ) return; 56 | } 57 | 58 | // FIXME: This assumes HAXX_SCAL_UNROLL = 5 59 | if( isL ) 60 | for( i = m; i < N; i += HAXX_SCAL_UNROLL ) { 61 | X[i] = ALPHA * X[i]; 62 | X[i+1] = ALPHA * X[i+1]; 63 | X[i+2] = ALPHA * X[i+2]; 64 | X[i+3] = ALPHA * X[i+3]; 65 | X[i+4] = ALPHA * X[i+4]; 66 | } 67 | else 68 | for( i = m; i < N; i += HAXX_SCAL_UNROLL ) { 69 | X[i] = X[i] * ALPHA; 70 | X[i+1] = X[i+1] * ALPHA; 71 | X[i+2] = X[i+2] * ALPHA; 72 | X[i+3] = X[i+3] * ALPHA; 73 | X[i+4] = X[i+4] * ALPHA; 74 | } 75 | } else { 76 | 77 | HAXX_INT NINCX = N*INCX; 78 | if( isL ) for( i = 0; i < NINCX; i += INCX ) X[i] = ALPHA * X[i]; 79 | else for( i = 0; i < NINCX; i += INCX ) X[i] = X[i] * ALPHA; 80 | 81 | } 82 | 83 | 84 | }; 85 | 86 | 87 | 88 | }; // namespace HAXX 89 | 90 | -------------------------------------------------------------------------------- /include/hblas/generic/hblas1/hblas_swapv_impl.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | * This file is a part of HAXX 3 | * 4 | * Copyright (c) 2017 David Williams-Young 5 | * All rights reserved. 6 | * 7 | * See LICENSE.txt 8 | */ 9 | 10 | #pragma once 11 | 12 | #include "hblas/hblas1.hpp" 13 | 14 | #define HAXX_SWAP_UNROLL 3 15 | 16 | namespace HAXX { 17 | 18 | /** 19 | * Swaps the elements of two strided quaternion arrays of length N 20 | * 21 | * Written by DBWY (4/2017) 22 | * 23 | * Based on the BLAS implementaion of DSWAP by Jack Dongarra 24 | * http://www.netlib.org/lapack/explore-html/db/dd4/dswap_8f.html 25 | */ 26 | template 27 | void HBLAS_SWAPV(const HAXX_INT N, quaternion<_F> * const X, 28 | const HAXX_INT INCX, quaternion<_F> * const Y, const HAXX_INT INCY) { 29 | 30 | if( N <= 0 ) return; 31 | // FIXME: See further comments on negative stride 32 | assert(INCX > 0); 33 | assert(INCY > 0); 34 | 35 | HAXX_INT i; 36 | quaternion<_F> qtmp; 37 | 38 | if( INCX == 1 and INCY == 1 ) { 39 | HAXX_INT m = N % HAXX_SWAP_UNROLL; 40 | 41 | // Use unrolled loops for both unit increments 42 | 43 | // XXX: For some reason Z/CSWAP does not do this. 44 | // Cache utilization? 45 | if( m != 0 ) { 46 | for( i = 0; i < m; ++i ) { 47 | qtmp = X[i]; 48 | X[i] = Y[i]; 49 | Y[i] = qtmp; 50 | } 51 | if( N < HAXX_SWAP_UNROLL ) return; 52 | } 53 | 54 | // FIXME: This assumes HAXX_SWAP_UNROLL = 3 55 | for( i = m; i < N; i += HAXX_SWAP_UNROLL ) { 56 | qtmp = X[i]; 57 | X[i] = Y[i]; 58 | Y[i] = qtmp; 59 | 60 | qtmp = X[i+1]; 61 | X[i+1] = Y[i+1]; 62 | Y[i+1] = qtmp; 63 | 64 | qtmp = X[i+2]; 65 | X[i+2] = Y[i+2]; 66 | Y[i+2] = qtmp; 67 | } 68 | 69 | } else { 70 | 71 | HAXX_INT ix(0), iy(0); 72 | // FIXME: the original _SWAP function has code here to handle 73 | // negative increments. Unsure on what that accomplishes 74 | 75 | for( i = 0; i < N; ++i, ix += INCX, iy += INCY ) { 76 | qtmp = X[ix]; 77 | X[ix] = Y[iy]; 78 | Y[iy] = qtmp; 79 | } 80 | } 81 | 82 | }; 83 | 84 | }; // namespace HAXX 85 | 86 | 87 | -------------------------------------------------------------------------------- /include/hblas/generic/hblas1/impl.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | * This file is a part of HAXX 3 | * 4 | * Copyright (c) 2017 David Williams-Young 5 | * All rights reserved. 6 | * 7 | * See LICENSE.txt 8 | */ 9 | 10 | #pragma once 11 | 12 | #include "hblas_swapv_impl.hpp" // HBLAS_SWAPV 13 | #include "hblas_scalv_impl.hpp" // HBLAS_SCALV 14 | #include "hblas_copyv_impl.hpp" // HBLAS_COPYV 15 | #include "hblas_axpyv_impl.hpp" // HBLAS_AXPYV 16 | #include "hblas_dotuv_impl.hpp" // HBLAS_DOTUV 17 | #include "hblas_dotcv_impl.hpp" // HBLAS_DOTCV 18 | 19 | #include "hblas_scalm_impl.hpp" // HBLAS_SCALM 20 | #include "hblas_axpym_impl.hpp" // HBLAS_AXPYM 21 | -------------------------------------------------------------------------------- /include/hblas/generic/hblas2/hblas_gerc_impl.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | * This file is a part of HAXX 3 | * 4 | * Copyright (c) 2017 David Williams-Young 5 | * All rights reserved. 6 | * 7 | * See LICENSE.txt 8 | */ 9 | 10 | #ifndef __INCLUDED_HBLAS_GERC_IMPL_HPP 11 | #define __INCLUDED_HBLAS_GERC_IMPL_HPP 12 | 13 | #include "hblas/hblas2.hpp" 14 | 15 | namespace HAXX { 16 | 17 | /** 18 | * Written by DBWY (4/2017) 19 | * 20 | * Based on the BLAS implementation of ZGERC by 21 | * Jack Dongarra (Argonne) 22 | * Jeremy Du Croz (NAG) 23 | * Sven Hammarling (NAG) 24 | * Richard Hanson (Sandia) 25 | * 26 | * Performs the rank 1 operation 27 | * 28 | * \f$ A_{ij} = A_{ij} + \alpha x_i y^*_j \f$ 29 | * 30 | * \f$ A \in \mathbb{H} \qquad x,y,\alpha \in \mathbb{R},\mathbb{C},\mathbb{H}\f$ 31 | */ 32 | // FIXME: In this implementaion, it has been implied that scalars 33 | // will always multiply from the left. Should generalize in such a 34 | // was to allow flexibility in ALPHA 35 | template 37 | void HBLAS_GERC(const HAXX_INT M, const HAXX_INT N, const _AlphaF ALPHA, 38 | _LeftVecF * const X, const HAXX_INT INCX, _RightVecF * const Y, 39 | const HAXX_INT INCY, quaternion<_F> * const A, const HAXX_INT LDA) { 40 | 41 | if( M == 0 or N == 0 or ALPHA == _AlphaF(0.)) return; 42 | 43 | // FIXME: The original BLAS implementaion has logic to handle 44 | // negative strides. See further comments. 45 | assert( INCX > 0 ); 46 | assert( INCY > 0 ); 47 | 48 | 49 | HAXX_INT i, j, ix; 50 | 51 | // FIXME: This parameter is effected in the orignal BLAS 52 | // implementaion by negative stride 53 | HAXX_INT JY = 0; 54 | 55 | quaternion<_F> htemp1; 56 | 57 | if( INCX == 1 ) { 58 | 59 | for( j = 0; j < N; ++j, JY += INCY ) 60 | if( Y[JY] != _RightVecF(0.) ) { 61 | // htemp1 = ALPHA * Y[JY]; 62 | for( i = 0; i < M; ++i ) { 63 | A[RANK2_INDX(i,j,LDA)] += ALPHA * X[i] * conj(Y[JY]); 64 | } 65 | } 66 | 67 | } else { // end INCX == 1 68 | 69 | // FIXME: This parameter is effected in the orignal BLAS 70 | // implementaion by negative stride 71 | HAXX_INT KX = 0; 72 | 73 | for( j = 0; j < N; ++j, JY += INCY ) 74 | if( Y[JY] != _RightVecF(0.) ) { 75 | // htemp1 = ALPHA * Y[JY]; 76 | for( i = 0, ix = KX; i < M; ++i, ix += INCX ) { 77 | A[RANK2_INDX(i,j,LDA)] += ALPHA * X[ix] * conj(Y[JY]); 78 | } 79 | } 80 | } // end INCX != 1 81 | 82 | }; // end GERC 83 | 84 | 85 | 86 | }; // namspace HAXX 87 | 88 | #endif 89 | -------------------------------------------------------------------------------- /include/hblas/generic/hblas2/hblas_geru_impl.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | * This file is a part of HAXX 3 | * 4 | * Copyright (c) 2017 David Williams-Young 5 | * All rights reserved. 6 | * 7 | * See LICENSE.txt 8 | */ 9 | 10 | #ifndef __INCLUDED_HBLAS_GERU_IMPL_HPP 11 | #define __INCLUDED_HBLAS_GERU_IMPL_HPP 12 | 13 | #include "hblas/hblas2.hpp" 14 | 15 | namespace HAXX { 16 | 17 | /** 18 | * Written by DBWY (4/2017) 19 | * 20 | * Based on the BLAS implementation of ZGERU by 21 | * Jack Dongarra (Argonne) 22 | * Jeremy Du Croz (NAG) 23 | * Sven Hammarling (NAG) 24 | * Richard Hanson (Sandia) 25 | * 26 | * Performs the rank 1 operation 27 | * 28 | * \f$ A_{ij} = A_{ij} + \alpha x_i y_j \f$ 29 | * 30 | * \f$ A \in \mathbb{H} \qquad x,y,\alpha \in \mathbb{R},\mathbb{C},\mathbb{H}\f$ 31 | */ 32 | // FIXME: In this implementaion, it has been implied that scalars 33 | // will always multiply from the left. Should generalize in such a 34 | // was to allow flexibility in ALPHA 35 | template 37 | void HBLAS_GERU(const HAXX_INT M, const HAXX_INT N, const _AlphaF ALPHA, 38 | _LeftVecF * const X, const HAXX_INT INCX, _RightVecF * const Y, 39 | const HAXX_INT INCY, quaternion<_F> * const A, const HAXX_INT LDA) { 40 | 41 | if( M == 0 or N == 0 or ALPHA == _AlphaF(0.)) return; 42 | 43 | // FIXME: The original BLAS implementaion has logic to handle 44 | // negative strides. See further comments. 45 | assert( INCX > 0 ); 46 | assert( INCY > 0 ); 47 | 48 | 49 | HAXX_INT i, j, ix; 50 | 51 | // FIXME: This parameter is effected in the orignal BLAS 52 | // implementaion by negative stride 53 | HAXX_INT JY = 0; 54 | 55 | quaternion<_F> htemp1; 56 | 57 | if( INCX == 1 ) { 58 | 59 | for( j = 0; j < N; ++j, JY += INCY ) 60 | if( Y[JY] != _RightVecF(0.) ) { 61 | // htemp1 = ALPHA * Y[JY]; 62 | for( i = 0; i < M; ++i ) { 63 | A[RANK2_INDX(i,j,LDA)] += ALPHA * X[i] * Y[JY]; 64 | } 65 | } 66 | 67 | } else { // end INCX == 1 68 | 69 | // FIXME: This parameter is effected in the orignal BLAS 70 | // implementaion by negative stride 71 | HAXX_INT KX = 0; 72 | 73 | for( j = 0; j < N; ++j, JY += INCY ) 74 | if( Y[JY] != _RightVecF(0.) ) { 75 | // htemp1 = ALPHA * Y[JY]; 76 | for( i = 0, ix = KX; i < M; ++i, ix += INCX ) { 77 | A[RANK2_INDX(i,j,LDA)] += ALPHA * X[ix] * Y[JY]; 78 | } 79 | } 80 | } // end INCX != 1 81 | 82 | }; // end GERU 83 | 84 | 85 | 86 | }; // namespace HAXX 87 | 88 | #endif 89 | -------------------------------------------------------------------------------- /include/hblas/generic/hblas2/impl.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | * This file is a part of HAXX 3 | * 4 | * Copyright (c) 2017 David Williams-Young 5 | * All rights reserved. 6 | * 7 | * See LICENSE.txt 8 | */ 9 | 10 | #ifndef __INCLUDED_HBLAS2_IMPL_HPP 11 | #define __INCLUDED_HBLAS2_IMPL_HPP 12 | 13 | #include "hblas_gemv_impl.hpp" 14 | #include "hblas_geru_impl.hpp" 15 | #include "hblas_gerc_impl.hpp" 16 | 17 | #endif 18 | -------------------------------------------------------------------------------- /include/hblas/generic/hblas3/impl.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | * This file is a part of HAXX 3 | * 4 | * Copyright (c) 2017 David Williams-Young 5 | * All rights reserved. 6 | * 7 | * See LICENSE.txt 8 | */ 9 | 10 | #ifndef __INCLUDED_HBLAS3_IMPL_HPP 11 | #define __INCLUDED_HBLAS3_IMPL_HPP 12 | 13 | #include "hblas_gemm_impl.hpp" 14 | 15 | #endif 16 | -------------------------------------------------------------------------------- /include/hblas/generic/util/contract_impl.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | * This file is a part of HAXX 3 | * 4 | * Copyright (c) 2017 David Williams-Young 5 | * All rights reserved. 6 | * 7 | * See LICENSE.txt 8 | */ 9 | 10 | #ifndef __INCLUDED_HBLAS_UTIL_CONTRACT_IMPL_HPP 11 | #define __INCLUDED_HBLAS_UTIL_CONTRACT_IMPL_HPP 12 | 13 | #include 14 | #include "hblas/hblas_util.hpp" 15 | 16 | namespace HAXX { 17 | 18 | template <> 19 | void HBLAS_COMPLEX_CONTRACT(char ORDER, char UPLO, HAXX_INT M, HAXX_INT N, 20 | quaternion *A, HAXX_INT LDA, std::complex *B, 21 | HAXX_INT LDB) { 22 | 23 | assert( ORDER == 'F' or ORDER == 'S' ); 24 | 25 | if( ORDER == 'F' ) 26 | hzcon1_(&UPLO,&M,&N,A,&LDA,B,&LDB); 27 | else 28 | hzcon2_(&UPLO,&M,&N,A,&LDA,B,&LDB); 29 | 30 | } 31 | 32 | 33 | }; // namespace HAXX 34 | 35 | #endif 36 | -------------------------------------------------------------------------------- /include/hblas/generic/util/expand_impl.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | * This file is a part of HAXX 3 | * 4 | * Copyright (c) 2017 David Williams-Young 5 | * All rights reserved. 6 | * 7 | * See LICENSE.txt 8 | */ 9 | 10 | #ifndef __INCLUDED_HBLAS_UTIL_EXPAND_IMPL_HPP 11 | #define __INCLUDED_HBLAS_UTIL_EXPAND_IMPL_HPP 12 | 13 | #include 14 | #include "hblas/hblas_util.hpp" 15 | 16 | namespace HAXX { 17 | 18 | template <> 19 | void HBLAS_COMPLEX_EXPAND(char ORDER, HAXX_INT M, HAXX_INT N, 20 | quaternion *A, HAXX_INT LDA, std::complex *B, 21 | HAXX_INT LDB) { 22 | 23 | assert( ORDER == 'F' or ORDER == 'S' ); 24 | 25 | if( ORDER == 'F' ) 26 | hzexp1_(&M,&N,A,&LDA,B,&LDB); 27 | else 28 | hzexp2_(&M,&N,A,&LDA,B,&LDB); 29 | 30 | } 31 | 32 | template <> 33 | void HBLAS_REAL_EXPAND(HAXX_INT M, HAXX_INT N, quaternion *A, 34 | HAXX_INT LDA, double *B, HAXX_INT LDB) { 35 | 36 | hdexp_(&M,&N,A,&LDA,B,&LDB); 37 | 38 | } 39 | 40 | }; // namespace HAXX 41 | 42 | #endif 43 | -------------------------------------------------------------------------------- /include/hblas/generic/util/impl.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | * This file is a part of HAXX 3 | * 4 | * Copyright (c) 2017 David Williams-Young 5 | * All rights reserved. 6 | * 7 | * See LICENSE.txt 8 | */ 9 | 10 | #ifndef __INCLUDED_HBLAS_UTIL_IMPL_HPP 11 | #define __INCLUDED_HBLAS_UTIL_IMPL_HPP 12 | 13 | #include "expand_impl.hpp" 14 | #include "contract_impl.hpp" 15 | 16 | #endif 17 | 18 | -------------------------------------------------------------------------------- /include/hblas/hblas1.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | * This file is a part of HAXX 3 | * 4 | * Copyright (c) 2017 David Williams-Young 5 | * All rights reserved. 6 | * 7 | * See LICENSE.txt 8 | */ 9 | 10 | #ifndef __INCLUDED_HBLAS1_HPP 11 | #define __INCLUDED_HBLAS1_HPP 12 | 13 | #include "haxx/haxx_def.hpp" 14 | 15 | // Preprocessor macros for quick FORTRAN declarations 16 | 17 | #define SCAL_FORTRAN_DECL(NAME,F,ALPHAF) \ 18 | void NAME##_(const char*, const HAXX_INT*, const ALPHAF*, \ 19 | const quaternion *, const HAXX_INT*); 20 | 21 | #define DOT_FORTRAN_DECL(NAME,F) \ 22 | void NAME##_(const quaternion *, const HAXX_INT*, const quaternion *,\ 23 | const HAXX_INT*, const quaternion *, const HAXX_INT*); 24 | 25 | #define AXPY_FORTRAN_DECL(NAME,F,XF,ALPHAF)\ 26 | void NAME##_(const char*, const HAXX_INT*, const ALPHAF*, const XF *, \ 27 | const HAXX_INT*, const quaternion *, const HAXX_INT*); 28 | 29 | 30 | 31 | 32 | 33 | namespace HAXX { 34 | 35 | 36 | // FORTRAN HBLAS1 functions 37 | extern "C" { 38 | 39 | // SCAL functions 40 | SCAL_FORTRAN_DECL(hscald,double,double); 41 | SCAL_FORTRAN_DECL(hscalc,double,std::complex); 42 | SCAL_FORTRAN_DECL(hscalh,double,quaternion); 43 | 44 | // DOT functions 45 | DOT_FORTRAN_DECL(hdotu,double); 46 | DOT_FORTRAN_DECL(hdotc,double); 47 | 48 | // AXPY functions 49 | AXPY_FORTRAN_DECL(haxpydh,double,quaternion,double); 50 | AXPY_FORTRAN_DECL(haxpych,double,quaternion,std::complex); 51 | AXPY_FORTRAN_DECL(haxpyhh,double,quaternion,quaternion); 52 | 53 | }; 54 | 55 | 56 | /** 57 | * \addtogroup HBLAS 58 | * @{ 59 | * 60 | * 61 | * @defgroup HBLAS1 Level 1 HBLAS 62 | * Level 1 BLAS operations over quaternion numbers 63 | * 64 | * @{ 65 | */ 66 | 67 | 68 | /** 69 | * @defgroup HBLAS1V Level 1 HBLASV 70 | * Level 1 BLAS operations on quaternion vectors 71 | * 72 | * @{ 73 | */ 74 | 75 | /// Swap the states of two quaternion arrays 76 | template 77 | void HBLAS_SWAPV(const HAXX_INT N, quaternion<_F> * const X, 78 | const HAXX_INT INCX, quaternion<_F> * const Y, const HAXX_INT INCY); 79 | 80 | /// Copy a quaternion array to another quaternion array 81 | template 82 | void HBLAS_COPYV(const HAXX_INT N, quaternion<_F> * const X, 83 | const HAXX_INT INCX, quaternion<_F> * const Y, const HAXX_INT INCY); 84 | 85 | /// Scale a quaternion array by a scalar 86 | template 87 | void HBLAS_SCALV(const char SIDE, const HAXX_INT N, const _AlphaF ALPHA, 88 | quaternion<_F> * const X, const HAXX_INT INCX); 89 | 90 | /// Scale a quaternion array and add it to another quaternion array 91 | template 92 | void HBLAS_AXPYV(const char SIDE, const HAXX_INT N, const _AlphaF ALPHA, 93 | _XF * const X, const HAXX_INT INCX, quaternion<_F> * const Y, 94 | const HAXX_INT INCY); 95 | 96 | /// Perform an unaltered dot product of two quaternion arrays 97 | template 98 | quaternion<_F> HBLAS_DOTUV(const HAXX_INT N, quaternion<_F> * const X, 99 | const HAXX_INT INCX, quaternion<_F> * const Y, const HAXX_INT INCY); 100 | 101 | /// Obtain the inner product of two quaternion arrays 102 | template 103 | quaternion<_F> HBLAS_DOTCV(const HAXX_INT N, quaternion<_F> * const X, 104 | const HAXX_INT INCX, quaternion<_F> * const Y, const HAXX_INT INCY); 105 | 106 | /* @} */ // HBLAS1V 107 | 108 | 109 | /** 110 | * @defgroup HBLAS1M Level 1 HBLASM 111 | * Level 1 BLAS operations on quaternion matricies (ala BLIS) 112 | * 113 | * @{ 114 | */ 115 | 116 | /// Scale / Transpose / Conjugate a quaternion matrix in place 117 | template 118 | void HBLAS_SCALM(const char SIDE, const char TRANSA, const HAXX_INT M, 119 | const HAXX_INT N, const _AlphaF ALPHA, quaternion<_F> * const A, 120 | const HAXX_INT LDA, const HAXX_INT INCA); 121 | 122 | /// Accumulate a quaternion matrix 123 | template 124 | void HBLAS_AXPYM(const char SIDE, const char TRANSA, const HAXX_INT M, 125 | const HAXX_INT N, const _AlphaF ALPHA, _XF * const A, const HAXX_INT LDA, 126 | const HAXX_INT INCA, quaternion<_F> * const B, const HAXX_INT LDB, 127 | const HAXX_INT INCB); 128 | 129 | /* @} */ // HBLAS1M 130 | 131 | /* @} */ // HBLAS1 132 | 133 | /* @} */ // HBLAS 134 | 135 | }; // namespace HAXX 136 | 137 | #endif 138 | -------------------------------------------------------------------------------- /include/hblas/hblas2.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | * This file is a part of HAXX 3 | * 4 | * Copyright (c) 2017 David Williams-Young 5 | * All rights reserved. 6 | * 7 | * See LICENSE.txt 8 | */ 9 | 10 | #ifndef __INCLUDED_HBLAS2_HPP 11 | #define __INCLUDED_HBLAS2_HPP 12 | 13 | #include "haxx/haxx_def.hpp" 14 | #include "util/macro.hpp" 15 | 16 | #define GEMV_FORTRAN_DECL(NAME,F,MATF,VECF,ALPHAF,BETAF) \ 17 | void NAME##_(const char*, const HAXX_INT*, const HAXX_INT*, const ALPHAF*,\ 18 | const MATF*, const HAXX_INT*, const VECF*, const HAXX_INT*, const BETAF*, \ 19 | const quaternion*, const HAXX_INT*); 20 | 21 | #define GER_FORTRAN_DECL(NAME,F,LEFTF,RIGHTF,ALPHAF) \ 22 | void NAME##_(const HAXX_INT*, const HAXX_INT*, const ALPHAF*,\ 23 | const LEFTF*, const HAXX_INT*, const RIGHTF*, const HAXX_INT*, \ 24 | const quaternion*, const HAXX_INT*);\ 25 | 26 | namespace HAXX { 27 | 28 | // FORTRAN HBLAS2 functions 29 | extern "C" { 30 | 31 | // GEMV functions 32 | 33 | GEMV_FORTRAN_DECL(hgemvdd,double,quaternion,quaternion, 34 | double, double); 35 | GEMV_FORTRAN_DECL(hgemvdz,double,quaternion,quaternion, 36 | double, std::complex); 37 | GEMV_FORTRAN_DECL(hgemvdh,double,quaternion,quaternion, 38 | double, quaternion); 39 | 40 | GEMV_FORTRAN_DECL(hgemvzd,double,quaternion,quaternion, 41 | std::complex,double); 42 | GEMV_FORTRAN_DECL(hgemvzz,double,quaternion,quaternion, 43 | std::complex,std::complex); 44 | GEMV_FORTRAN_DECL(hgemvzh,double,quaternion,quaternion, 45 | std::complex, quaternion); 46 | 47 | GEMV_FORTRAN_DECL(hgemvhd,double,quaternion,quaternion, 48 | quaternion,double); 49 | GEMV_FORTRAN_DECL(hgemvhz,double,quaternion,quaternion, 50 | quaternion,std::complex); 51 | GEMV_FORTRAN_DECL(hgemvhh,double,quaternion,quaternion, 52 | quaternion, quaternion); 53 | 54 | 55 | 56 | // GERU functions 57 | 58 | GER_FORTRAN_DECL(hgerud,double,quaternion,quaternion, 59 | double) 60 | GER_FORTRAN_DECL(hgeruz,double,quaternion,quaternion, 61 | std::complex) 62 | GER_FORTRAN_DECL(hgeruh,double,quaternion,quaternion, 63 | quaternion) 64 | 65 | // GERC functions 66 | 67 | GER_FORTRAN_DECL(hgercd,double,quaternion,quaternion, 68 | double) 69 | GER_FORTRAN_DECL(hgercz,double,quaternion,quaternion, 70 | std::complex) 71 | GER_FORTRAN_DECL(hgerch,double,quaternion,quaternion, 72 | quaternion) 73 | 74 | }; 75 | 76 | /** 77 | * \addtogroup HBLAS 78 | * @{ 79 | * 80 | * 81 | * @defgroup HBLAS2 Level 2 HBLAS 82 | * Level 2 BLAS operations over quaternion numbers 83 | * 84 | * @{ 85 | */ 86 | 87 | /// Multiply a general vector by a quaternion matrix 88 | template 90 | void HBLAS_GEMV(const char TRANS, const HAXX_INT M, const HAXX_INT N, 91 | const _AlphaF ALPHA, _MatF * const A, const HAXX_INT LDA, _VecF * const X, 92 | const HAXX_INT INCX, const _BetaF BETA, quaternion<_F> * const Y, 93 | const HAXX_INT INCY); 94 | 95 | /// Perform the quaternion rank 1 operation with two general vectors 96 | template 98 | void HBLAS_GERU(const HAXX_INT M, const HAXX_INT N, const _AlphaF ALPHA, 99 | _LeftVecF * const X, const HAXX_INT INCX, _RightVecF * const Y, 100 | const HAXX_INT INCY, quaternion<_F> * const A, const HAXX_INT LDA); 101 | 102 | /// Perform the quaternion rank 1 operation with two general vectors 103 | template 105 | void HBLAS_GERC(const HAXX_INT M, const HAXX_INT N, const _AlphaF ALPHA, 106 | _LeftVecF * const X, const HAXX_INT INCX, _RightVecF * const Y, 107 | const HAXX_INT INCY, quaternion<_F> * const A, const HAXX_INT LDA); 108 | 109 | 110 | /* @} */ // HBLAS2 111 | 112 | /* @} */ // HBLAS 113 | }; // namespace HAXX 114 | 115 | #endif 116 | -------------------------------------------------------------------------------- /include/hblas/hblas3.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | * This file is a part of HAXX 3 | * 4 | * Copyright (c) 2017 David Williams-Young 5 | * All rights reserved. 6 | * 7 | * See LICENSE.txt 8 | */ 9 | 10 | #ifndef __INCLUDED_HBLAS3_HPP 11 | #define __INCLUDED_HBLAS3_HPP 12 | 13 | #include "haxx/haxx_def.hpp" 14 | 15 | #define GEMM_FORTRAN_DECL(NAME,F,AMATF,BMATF,ALPHAF,BETAF) \ 16 | void NAME##_(const char*, const char*, const HAXX_INT*, const HAXX_INT*,\ 17 | const HAXX_INT*, const ALPHAF*, const AMATF*, const HAXX_INT*,\ 18 | const BMATF*, const HAXX_INT*, const BETAF*, const quaternion*,\ 19 | const HAXX_INT*); 20 | 21 | 22 | namespace HAXX { 23 | 24 | // FORTRAN HBLAS3 Functions 25 | extern "C" { 26 | 27 | 28 | // GEMM functions 29 | 30 | GEMM_FORTRAN_DECL(hgemmdd,double,quaternion,quaternion, 31 | double,double); 32 | GEMM_FORTRAN_DECL(hgemmdz,double,quaternion,quaternion, 33 | double,std::complex); 34 | GEMM_FORTRAN_DECL(hgemmdh,double,quaternion,quaternion, 35 | double,quaternion); 36 | 37 | GEMM_FORTRAN_DECL(hgemmzd,double,quaternion,quaternion, 38 | std::complex,double); 39 | GEMM_FORTRAN_DECL(hgemmzz,double,quaternion,quaternion, 40 | std::complex,std::complex); 41 | GEMM_FORTRAN_DECL(hgemmzh,double,quaternion,quaternion, 42 | std::complex,quaternion); 43 | 44 | GEMM_FORTRAN_DECL(hgemmhd,double,quaternion,quaternion, 45 | quaternion,double); 46 | GEMM_FORTRAN_DECL(hgemmhz,double,quaternion,quaternion, 47 | quaternion,std::complex); 48 | GEMM_FORTRAN_DECL(hgemmhh,double,quaternion,quaternion, 49 | quaternion,quaternion); 50 | 51 | }; 52 | 53 | /** 54 | * \addtogroup HBLAS 55 | * @{ 56 | * 57 | * 58 | * @defgroup HBLAS3 Level 3 HBLAS 59 | * Level 3 BLAS operations over quaternion numbers 60 | * 61 | * @{ 62 | */ 63 | 64 | /// Multiply a quaternion matrix by a quaternion matrix 65 | template 67 | void HBLAS_GEMM(const char TRANSA, const char TRANSB, const HAXX_INT M, 68 | const HAXX_INT N, const HAXX_INT K, const _AlphaF ALPHA, _AMatF * const A, 69 | const HAXX_INT LDA, _BMatF * const B, const HAXX_INT LDB, 70 | const _BetaF BETA, quaternion<_F> * const C, const HAXX_INT LDC); 71 | 72 | /* @} */ // HBLAS3 73 | 74 | /* @} */ // HBLAS 75 | 76 | }; // namespace HAXX 77 | 78 | #endif 79 | -------------------------------------------------------------------------------- /include/hblas/hblas_util.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | * This file is a part of HAXX 3 | * 4 | * Copyright (c) 2017 David Williams-Young 5 | * All rights reserved. 6 | * 7 | * See LICENSE.txt 8 | */ 9 | 10 | #ifndef __INCLUDED_HBLAS_UTIL_HPP 11 | #define __INCLUDED_HBLAS_UTIL_HPP 12 | 13 | #include "haxx/haxx_def.hpp" 14 | 15 | extern "C" { 16 | 17 | void hzexp1_(const int*, const int*, HAXX::quaternion*, const int*, 18 | std::complex*, const int*); 19 | void hzexp2_(const int*, const int*, HAXX::quaternion*, const int*, 20 | std::complex*, const int*); 21 | void hdexp_(const int*, const int*, HAXX::quaternion*, const int*, 22 | double*, const int*); 23 | 24 | void hzcon1_(const char*, const int*, const int*, HAXX::quaternion*, 25 | const int*, std::complex*, const int*); 26 | void hzcon2_(const char*, const int*, const int*, HAXX::quaternion*, 27 | const int*, std::complex*, const int*); 28 | } 29 | 30 | namespace HAXX { 31 | 32 | /** 33 | * \addtogroup HBLAS 34 | * @{ 35 | * 36 | * 37 | * @defgroup HBLAS_UTIL HBLAS Utilities 38 | * Utility functions for HBLAS 39 | * 40 | * @{ 41 | */ 42 | 43 | 44 | /// Expand a quaternion matrix to a complex matrix 45 | template 46 | void HBLAS_COMPLEX_EXPAND(char ORDER, HAXX_INT M, HAXX_INT N, 47 | quaternion<_F> *A, HAXX_INT LDA, std::complex<_F> *B, HAXX_INT LDB); 48 | 49 | /// Expans a quaternion matrix to a real matrix 50 | template 51 | void HBLAS_REAL_EXPAND(HAXX_INT M, HAXX_INT N, quaternion<_F> *A, 52 | HAXX_INT LDA, _F *B, HAXX_INT LDB); 53 | 54 | /// Contract a complex matrix (of proper symmetry) to a quaternion 55 | /// matrix 56 | template 57 | void HBLAS_COMPLEX_CONTRACT(char ORDER, char UPLO, HAXX_INT M, HAXX_INT N, 58 | quaternion<_F> *A, HAXX_INT LDA, std::complex<_F> *B, HAXX_INT LDB); 59 | 60 | 61 | /* @} */ // HBLAS_UTIL 62 | 63 | /* @} */ // HBLAS 64 | 65 | }; // namespace HAXX 66 | 67 | #endif 68 | -------------------------------------------------------------------------------- /include/hblas/pack/packops.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | * This file is a part of HAXX 3 | * 4 | * Copyright (c) 2017 David Williams-Young 5 | * All rights reserved. 6 | * 7 | * See LICENSE.txt 8 | */ 9 | 10 | #ifndef __INCLUDED_HBLAS_PACK_PACKOPS_HPP 11 | #define __INCLUDED_HBLAS_PACK_PACKOPS_HPP 12 | 13 | #include "haxx.hpp" 14 | #include "util/types.hpp" 15 | #include "util/simd.hpp" 16 | 17 | namespace HAXX { 18 | 19 | 20 | /** 21 | * \brief Generic implementation of packing operations used in 22 | * the packing utilities. 23 | * 24 | * Will work for an arbitrary data type 25 | */ 26 | template > 27 | struct GenericPackOps { 28 | 29 | typedef _TypeWrapper TypeWrapper; 30 | 31 | typedef typename _TypeWrapper::load_t load_t; 32 | typedef typename _TypeWrapper::noscalar_t noscalar_t; 33 | 34 | 35 | /** 36 | * Generic implementation of scaling operations used prior to internal 37 | * operations. 38 | */ 39 | template 40 | static load_t preOP(load_t &x, U &alpha){ return Mul(alpha,x); } 41 | 42 | 43 | template 44 | static load_t Load(U &alpha, Args... args) { 45 | 46 | auto x = TypeWrapper::Load(args...); 47 | return preOP(x,alpha); 48 | 49 | } 50 | 51 | /** 52 | * Generic implementation of scaling operations used prior to internal 53 | * operations. Case when no scaling is needed. 54 | */ 55 | 56 | /// Generic implementation of (no-op) internal packing operation. 57 | template 58 | static Tuple OP(Tuple &t){ return t; } 59 | 60 | }; 61 | 62 | 63 | /** 64 | * \brief Generic wrapper around a packing implementation which factors 65 | * a conjugation operation into the preOP function prior to scaling. 66 | * 67 | * The passed TypeWrapper must have conjOp defined. 68 | */ 69 | template < typename T, typename _TypeWrapper = GenericType, 70 | template class PackOps = GenericPackOps > 71 | struct ConjPackOps : public PackOps { 72 | 73 | typedef typename PackOps::load_t load_t; 74 | 75 | template 76 | static load_t preOP(load_t &x, U &alpha){ 77 | auto y = _TypeWrapper::Conj(x); 78 | return PackOps::preOP(y,alpha); 79 | } 80 | 81 | template 82 | static load_t Load(U &alpha, Args... args) { 83 | 84 | auto x = _TypeWrapper::Load(args...); 85 | return preOP(x,alpha); 86 | 87 | } 88 | 89 | }; 90 | 91 | 92 | 93 | /** 94 | * \brief Specialization of GenericPackOps for double precision 95 | * quaternions on AVX / AVX2 96 | */ 97 | template<> 98 | struct GenericPackOps< quaternion, AVXType > { 99 | 100 | typedef quaternion qd; 101 | typedef AVXType TypeWrapper; 102 | 103 | typedef typename TypeWrapper::noscalar_t noscalar_t; 104 | typedef typename TypeWrapper::real_t real_t; 105 | typedef typename TypeWrapper::complex_t complex_t; 106 | typedef typename TypeWrapper::quaternion_t quaternion_t; 107 | typedef typename TypeWrapper::load_t load_t; 108 | 109 | 110 | template 111 | static inline __m256d preOP(__m256d &x, U &z){ return TypeWrapper::Mul(z,x); } 112 | 113 | template 114 | static load_t Load(U &alpha, Args... args) { 115 | 116 | auto x = TypeWrapper::Load(args...); 117 | return preOP(x,alpha); 118 | 119 | } 120 | 121 | 122 | }; 123 | 124 | 125 | 126 | // Forward decl of specialized packing operations 127 | 128 | 129 | template < typename T = quaternion, 130 | typename _TypeWrapper = AVXType > struct GenericPackOps_T1; 131 | 132 | template < typename T = quaternion, 133 | typename _TypeWrapper = AVXType > struct GenericPackOps_T2; 134 | 135 | 136 | template , 137 | typename _TypeWrapper = AVXType> 138 | using ConjPackOps_T1 = ConjPackOps; 139 | 140 | template , 141 | typename _TypeWrapper = AVXType> 142 | using ConjPackOps_T2 = ConjPackOps; 143 | 144 | 145 | 146 | 147 | /** 148 | * Factor expensive SIMD permutation into packing for LHS 149 | * of quaternion--quaternion matrix product. 150 | * 151 | * Specializes the GenericPackOps for quaternion on 152 | * AVX / AVX2 153 | */ 154 | template<> 155 | struct GenericPackOps_T1< quaternion, AVXType >: 156 | public GenericPackOps< quaternion, AVXType > { 157 | 158 | using twoTuple = std::tuple<__m256d,__m256d>; 159 | 160 | static inline twoTuple OP( twoTuple &t ) { 161 | return std::make_tuple( 162 | _mm256_permute2f128_pd(std::get<0>(t),std::get<1>(t), 0x20), 163 | _mm256_permute2f128_pd(std::get<0>(t),std::get<1>(t), 0x31) 164 | ); 165 | } 166 | 167 | }; 168 | 169 | 170 | 171 | 172 | /** 173 | * Factor expensive SIMD unpacking into packing for RHS 174 | * of quaternion--quaternion matrix product. 175 | * 176 | * Specializes the GenericPackOps for quaternion on 177 | * AVX / AVX2 178 | */ 179 | template<> 180 | struct GenericPackOps_T2< quaternion, AVXType >: 181 | public GenericPackOps< quaternion, AVXType > { 182 | 183 | using twoTuple = std::tuple<__m256d,__m256d>; 184 | 185 | static inline twoTuple OP( twoTuple &t ) { 186 | return std::make_tuple( 187 | _mm256_unpacklo_pd(std::get<0>(t),std::get<1>(t)), 188 | _mm256_unpackhi_pd(std::get<0>(t),std::get<1>(t)) 189 | ); 190 | } 191 | 192 | }; 193 | 194 | 195 | }; // namespace HAXX 196 | 197 | #endif 198 | -------------------------------------------------------------------------------- /include/hblas/pack/typewrapper.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | * This file is a part of HAXX 3 | * 4 | * Copyright (c) 2017 David Williams-Young 5 | * All rights reserved. 6 | * 7 | * See LICENSE.txt 8 | */ 9 | 10 | #ifndef __INCLUDED_HBLAS_PACK_TYPEWRAPPER_HPP 11 | #define __INCLUDED_HBLAS_PACK_TYPEWRAPPER_HPP 12 | 13 | #include "haxx.hpp" 14 | #include "util/simd.hpp" 15 | 16 | namespace HAXX { 17 | 18 | 19 | /** 20 | * \brief Generic implementation of scalar operations used in 21 | * the packing utilities. 22 | * 23 | * Will work with any scalar data type. 24 | */ 25 | template 26 | struct GenericTypeWrapper { 27 | 28 | typedef T load_t; ///< Load variable typename 29 | 30 | /// Struct to handle the case when no scalar is passed to 31 | /// the packing utility 32 | struct noscalar_t {}; 33 | 34 | 35 | /// Load the passed scalar without modification 36 | template 37 | static const U cacheScalar( U &alpha ){ return alpha; } 38 | 39 | /// No scalar is passed 40 | static noscalar_t cacheScalar(){ return noscalar_t{}; } 41 | 42 | 43 | /// Generic, typesafe implementation of conjugation 44 | template 45 | static U conjOp( U &t ) { return SmartConj(t); } 46 | 47 | }; 48 | 49 | 50 | /** 51 | * \brief AVX / AVX2 (256-bit vector length) implementation 52 | * of scalar operations used in packing utilities. 53 | * 54 | * Currently only viable for quaternion packing (FIXME) 55 | */ 56 | struct AVX64BitTypeWrapper { 57 | 58 | typedef __m256d load_t; ///< Load variable typename 59 | 60 | struct noscalar_t {}; ///< No scalar passed 61 | struct real_t { __m256d x; }; ///< Real (double) 62 | struct complex_t { __m256d x; __m256d y; }; ///< Complex (double) 63 | struct quaternion_t { __m256d x; }; ///< Quaternion (double) 64 | 65 | 66 | 67 | /// No scalar passed to packing utility 68 | static inline noscalar_t cacheScalar(){ return noscalar_t(); } 69 | 70 | /// Real (double) scalar passed to packing utility (bcast to __m256d) 71 | static inline const real_t cacheScalar( double &alpha ) { 72 | return real_t{_mm256_broadcast_sd(&alpha)}; 73 | } 74 | 75 | /** 76 | * Complex (double) scalar passed to packing utility. 77 | * Load as 78 | * { 79 | * { x, x }, 80 | * { -x*, -x* } 81 | * } 82 | * 83 | */ 84 | static inline const complex_t 85 | cacheScalar(std::complex &ALPHA) { 86 | 87 | const __m256i maskConj = _mm256_set_epi64x( 88 | 0x8000000000000000, 0, 89 | 0x8000000000000000, 0 ); 90 | 91 | __m128d alphaC = LOAD_128D_UNALIGNED_AS(double,&ALPHA); 92 | __m256d alpha = SET_256D_FROM_128D(alphaC,alphaC); 93 | __m256d alpha_C = 94 | _mm256_xor_pd(_mm256_permute_pd(alpha, 0x5), 95 | _mm256_castsi256_pd(maskConj) 96 | ); 97 | 98 | return complex_t{ alpha, alpha_C }; 99 | } 100 | 101 | /// Quaternion (double) scalar passed to packing utility (load as __m256d) 102 | static inline const quaternion_t cacheScalar( quaternion& alpha ) { 103 | return quaternion_t{LOAD_256D_UNALIGNED_AS(double,&alpha)}; 104 | } 105 | 106 | /// AVX / AVX2 Conjugation operation (FIXME: only works for quaternions) 107 | static __m256d conjOp( __m256d &t ) { return QCONJ_256D(t); } 108 | 109 | }; 110 | 111 | }; 112 | 113 | #endif 114 | -------------------------------------------------------------------------------- /include/util/constants.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | * This file is a part of HAXX 3 | * 4 | * Copyright (c) 2017 David Williams-Young 5 | * All rights reserved. 6 | * 7 | * See LICENSE.txt 8 | */ 9 | 10 | #ifndef __INCLUDED_UTIL_CONSTANTS_HPP__ 11 | #define __INCLUDED_UTIL_CONSTANTS_HPP__ 12 | 13 | // Compile time type constants 14 | #define DOUBLE 0 15 | #define DCOMPLEX 1 16 | #define DQUATERNION 2 17 | 18 | #endif 19 | -------------------------------------------------------------------------------- /include/util/macro.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | * This file is a part of HAXX 3 | * 4 | * Copyright (c) 2017 David Williams-Young 5 | * All rights reserved. 6 | * 7 | * See LICENSE.txt 8 | */ 9 | 10 | #ifndef __INCLUDED_UTIL_MACRO_HPP 11 | #define __INCLUDED_UTIL_MACRO_HPP 12 | 13 | // Misc macros 14 | 15 | // Compute rank-2 index 16 | #define RANK2_INDX(i,j,N) ( (i) + (j)*(N) ) 17 | 18 | // Alignment checking 19 | #define IS_ALIGNED(X,B) ( ((unsigned long)(X) & (B-1)) == 0 ) 20 | 21 | // Fix a number for mod arithmitic 22 | #define FixMod(X,N) (( (X) % (N) ) ? (X) + (N) - ((X) % (N)) : (X)) 23 | 24 | #endif 25 | -------------------------------------------------------------------------------- /include/util/simd.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | * This file is a part of HAXX 3 | * 4 | * Copyright (c) 2017 David Williams-Young 5 | * All rights reserved. 6 | * 7 | * See LICENSE.txt 8 | */ 9 | 10 | #ifndef __INCLUDED_UTIL_SIMD_HPP__ 11 | #define __INCLUDED_UTIL_SIMD_HPP__ 12 | 13 | // SIMD intrinsics header 14 | 15 | #include 16 | 17 | // Required boundary alignment for aligned data 18 | #if defined(__AVX__) || defined(__AVX2__) 19 | #define REQ_ALIGN 32 20 | #endif 21 | 22 | 23 | 24 | #include "simd/intrin_alias.hpp" 25 | #include "simd/misc.hpp" 26 | #include "simd/qop.hpp" 27 | 28 | 29 | #endif 30 | -------------------------------------------------------------------------------- /include/util/simd/intrin_alias.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | * This file is a part of HAXX 3 | * 4 | * Copyright (c) 2017 David Williams-Young 5 | * All rights reserved. 6 | * 7 | * See LICENSE.txt 8 | */ 9 | 10 | #ifndef __INCLUDED_SIMD_INTRIN_ALIAS_HPP__ 11 | #define __INCLUDED_SIMD_INTRIN_ALIAS_HPP__ 12 | 13 | // Alias SIMD intrinsics 14 | 15 | 16 | // 256-bit vectors 17 | 18 | // Load operations 19 | #define LOAD_256D_ALIGNED(X) _mm256_load_pd(X) 20 | #define LOAD_256D_UNALIGNED(X) _mm256_loadu_pd(X) 21 | 22 | #define LOAD_128D_ALIGNED(X) _mm_load_pd(X) 23 | #define LOAD_128D_UNALIGNED(X) _mm_loadu_pd(X) 24 | 25 | // Load operations with proper cast 26 | #define LOAD_256D_ALIGNED_AS(T,X)\ 27 | LOAD_256D_ALIGNED(const_cast(reinterpret_cast(X))) 28 | #define LOAD_256D_UNALIGNED_AS(T,X)\ 29 | LOAD_256D_UNALIGNED(const_cast(reinterpret_cast(X))) 30 | 31 | #define LOAD_128D_ALIGNED_AS(T,X)\ 32 | LOAD_128D_ALIGNED(const_cast(reinterpret_cast(X))) 33 | #define LOAD_128D_UNALIGNED_AS(T,X)\ 34 | LOAD_128D_UNALIGNED(const_cast(reinterpret_cast(X))) 35 | 36 | 37 | // Store operations 38 | #define STORE_256D_ALIGNED(X,V) _mm256_store_pd(X,V) 39 | #define STORE_256D_UNALIGNED(X,V) _mm256_storeu_pd(X,V) 40 | 41 | // Store operations with proper cast 42 | #define STORE_256D_ALIGNED_AS(T,X,V)\ 43 | STORE_256D_ALIGNED(reinterpret_cast(X),V) 44 | #define STORE_256D_UNALIGNED_AS(T,X,V)\ 45 | STORE_256D_UNALIGNED(reinterpret_cast(X),V) 46 | 47 | 48 | // Getting / Assembling operations 49 | #define GET_LO_128D_256D(X) _mm256_castpd256_pd128(X) 50 | #define GET_HI_128D_256D(X) _mm256_extractf128_pd((X),1) 51 | 52 | #define SET_256D_FROM_128D(X,Y) \ 53 | _mm256_castps_pd(\ 54 | _mm256_insertf128_ps(\ 55 | _mm256_castps128_ps256(_mm_castpd_ps(X)),\ 56 | _mm_castpd_ps(Y),1)\ 57 | ); 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | // FMA / FMS operations (D = +-(A*B) +- C) 72 | 73 | // Emulate FMA with two instructions 74 | #ifndef __FMA__ 75 | 76 | // 256-bit vectors 77 | #define FMA_256D(A,B,C) _mm256_add_pd(_mm256_mul_pd(A,B),C) 78 | #define FMS_256D(A,B,C) _mm256_sub_pd(_mm256_mul_pd(A,B),C) 79 | #define FNMA_256D(A,B,C) _mm256_sub_pd(C,_mm256_mul_pd(A,B)) 80 | 81 | #else 82 | 83 | // 256-bit vectors 84 | #define FMA_256D(A,B,C) _mm256_fmadd_pd(A,B,C) 85 | #define FMS_256D(A,B,C) _mm256_fmsub_pd(A,B,C) 86 | #define FNMA_256D(A,B,C) _mm256_fnmadd_pd(A,B,C) 87 | 88 | #endif 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | #endif 102 | -------------------------------------------------------------------------------- /include/util/simd/misc.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | * This file is a part of HAXX 3 | * 4 | * Copyright (c) 2017 David Williams-Young 5 | * All rights reserved. 6 | * 7 | * See LICENSE.txt 8 | */ 9 | 10 | #ifndef __INCLUDED_SIMD_MISC_HPP__ 11 | #define __INCLUDED_SIMD_MISC_HPP__ 12 | 13 | 14 | // Load operations 15 | 16 | // Load aligned with cast 17 | #define LOADD_ALIGNED_AS(T,X) \ 18 | LOADD_ALIGNED(const_cast(reinterpret_cast(X))) 19 | 20 | // Load unaligned with cast 21 | #define LOADD_UNALIGNED_AS(T,X) \ 22 | LOADD_UNALIGNED(const_cast(reinterpret_cast(X))) 23 | 24 | // Store aligned with cast 25 | #define STORED_ALIGNED_AS(T,X,V) \ 26 | STORED_ALIGNED(reinterpret_cast(X),V) 27 | 28 | // Load unaligned with cast 29 | #define STORED_UNALIGNED_AS(T,X,V) \ 30 | STORED_UNALIGNED(reinterpret_cast(X),V) 31 | 32 | 33 | 34 | 35 | #if defined(__AVX__) || defined(__AVX2__) 36 | 37 | // Transpose 4x4 registers (with scratch space) 38 | #define _MM_TRANSPOSE_4x4_PD(w,x,y,z,t1,t2,t3,t4) \ 39 | t1 = _mm256_shuffle_pd(w, x, 0x0);\ 40 | t3 = _mm256_shuffle_pd(w, x, 0xF);\ 41 | t2 = _mm256_shuffle_pd(y, z, 0x0);\ 42 | t4 = _mm256_shuffle_pd(y, z, 0xF);\ 43 | \ 44 | w = _mm256_permute2f128_pd(t1, t2, 0x20);\ 45 | x = _mm256_permute2f128_pd(t3, t4, 0x20);\ 46 | y = _mm256_permute2f128_pd(t1, t2, 0x31);\ 47 | z = _mm256_permute2f128_pd(t3, t4, 0x31); 48 | 49 | 50 | #endif 51 | 52 | #endif 53 | -------------------------------------------------------------------------------- /src/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # 2 | # This file is a part of HAXX 3 | # 4 | # Copyright (c) 2017 David Williams-Young 5 | # All rights reserved. 6 | # 7 | # See LICENSE.txt 8 | # 9 | 10 | add_subdirectory(hblas) 11 | add_subdirectory(tune) 12 | 13 | if( HAXX_ENABLE_BENCHMARK ) 14 | add_subdirectory(benchmark) 15 | endif() 16 | -------------------------------------------------------------------------------- /src/benchmark/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # 2 | # This file is a part of HAXX 3 | # 4 | # Copyright (c) 2017 David Williams-Young 5 | # All rights reserved. 6 | # 7 | # See LICENSE.txt 8 | # 9 | #set(Boost_USE_STATIC_LIBS ON) 10 | 11 | 12 | message( STATUS "HAXX WILL BUILD BENCHMARK EXECUTABLES" ) 13 | 14 | add_executable(gemm gemm.cxx) 15 | 16 | set( DOT_SRC dot.cxx ) 17 | 18 | # Make sure that the fortran functions get compiled for 19 | # the benchmark 20 | if( NOT ENABLE_GENERIC_FORTRAN ) 21 | set( DOT_SRC ${DOT_SRC} 22 | ${PROJECT_SOURCE_DIR}/src/hblas/fortran/hblas1/hdotc.f 23 | ${PROJECT_SOURCE_DIR}/src/hblas/fortran/hblas1/hdotu.f 24 | ) 25 | endif() 26 | 27 | add_executable(dot ${DOT_SRC}) 28 | 29 | option( COMPARE_MKL_ZGEMM "Compare HGEMM to ZGEMM" OFF ) 30 | 31 | if( NOT COMPARE_MKL_ZGEMM ) 32 | # Reference BLAS / LAPACK 33 | include(ExternalProject) 34 | find_program(SED_EXEC sed) 35 | ExternalProject_Add(lapack 36 | PREFIX ${PROJECT_BINARY_DIR}/deps/lapack 37 | URL "http://www.netlib.org/lapack/lapack-3.5.0.tgz" 38 | PATCH_COMMAND ${SED_EXEC} -i -e "s/-fltconsistency -fp_port//g" ${PROJECT_BINARY_DIR}/deps/lapack/src/lapack/CMakeLists.txt 39 | CMAKE_ARGS -DCMAKE_Fortran_COMPILER=${CMAKE_Fortran_COMPILER} 40 | -DCMAKE_Fortran_FLAGS='${CMAKE_Fortran_FLAGS}' 41 | -DCMAKE_INSTALL_PREFIX=${PROJECT_BINARY_DIR}/deps 42 | -DBUILD_TESTING=OFF 43 | ) 44 | 45 | ExternalProject_Get_Property(lapack install_dir) 46 | add_library(blas STATIC IMPORTED) 47 | set_property(TARGET blas PROPERTY IMPORTED_LOCATION ${PROJECT_BINARY_DIR}/deps/lib/libblas.a) 48 | 49 | add_dependencies(blas lapack) 50 | target_link_libraries(gemm ${Boost_LIBRARIES} blas) 51 | endif() 52 | 53 | target_link_libraries(gemm ${Boost_LIBRARIES} hblas) 54 | target_link_libraries(dot ${Boost_LIBRARIES} hblas) 55 | 56 | #find_package(BLAS REQUIRED) 57 | #target_link_libraries(gemm ${Boost_LIBRARIES} haxx_fortran ${BLAS_LIBRARIES}) 58 | 59 | -------------------------------------------------------------------------------- /src/benchmark/dot.cxx: -------------------------------------------------------------------------------- 1 | /* 2 | * This file is a part of HAXX 3 | * 4 | * Copyright (c) 2017 David Williams-Young 5 | * All rights reserved. 6 | * 7 | * See LICENSE.txt 8 | */ 9 | 10 | #include "haxx.hpp" 11 | #include "hblas/hblas1.hpp" 12 | 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | 19 | 20 | #define HBLAS1_RAND_MIN -20 21 | #define HBLAS1_RAND_MAX 54 22 | 23 | // Setup Random Number generator 24 | std::random_device rd; 25 | std::mt19937 gen(rd()); 26 | std::uniform_real_distribution<> dis(HBLAS1_RAND_MIN,HBLAS1_RAND_MAX); 27 | 28 | #define DOT_LEN_MAX 10000000 29 | #define DOT_LEN_START 100 30 | #define NTEST 10 31 | #define NREP 10 32 | 33 | #define USE_ALIGNED_MEM 34 | 35 | using namespace HAXX; 36 | 37 | int main() { 38 | 39 | 40 | #ifdef USE_ALIGNED_MEM 41 | 42 | size_t alignment = 32; 43 | size_t nAlloc = (size_t(DOT_LEN_MAX) * sizeof(quaternion)); 44 | nAlloc = nAlloc + (nAlloc % 32); 45 | 46 | quaternion *X = 47 | (quaternion*)aligned_alloc(alignment, nAlloc); 48 | quaternion *Y = 49 | (quaternion*)aligned_alloc(alignment, nAlloc); 50 | 51 | for(auto i = 0ul; i < DOT_LEN_MAX; i++) { 52 | X[i] = quaternion(dis(gen),dis(gen),dis(gen),dis(gen)); 53 | Y[i] = quaternion(dis(gen),dis(gen),dis(gen),dis(gen)); 54 | } 55 | 56 | #else 57 | 58 | std::vector> X(DOT_LEN_MAX), Y(DOT_LEN_MAX); 59 | for(auto &x : X) x = quaternion(dis(gen),dis(gen),dis(gen),dis(gen)); 60 | for(auto &x : Y) x = quaternion(dis(gen),dis(gen),dis(gen),dis(gen)); 61 | 62 | #endif 63 | 64 | 65 | for(auto N = DOT_LEN_START; N <= DOT_LEN_MAX; 66 | N += (DOT_LEN_MAX - DOT_LEN_START)/NTEST ) { 67 | 68 | // Fix cache 69 | if( N == DOT_LEN_START ) HBLAS_DOTUV(N,&X[0],1,&Y[0],1); 70 | 71 | std::chrono::duration dur_opt(0.); 72 | std::chrono::duration dur_for(0.); 73 | 74 | quaternion res; 75 | 76 | 77 | int INCX = 1; 78 | for(auto rep = 0; rep < NREP; rep++) { 79 | auto start = std::chrono::high_resolution_clock::now(); 80 | hdotu_(&res,&N,&X[0],&INCX,&Y[0],&INCX); 81 | auto end = std::chrono::high_resolution_clock::now(); 82 | dur_for += end - start; 83 | } 84 | 85 | for(auto rep = 0; rep < NREP; rep++) { 86 | auto start = std::chrono::high_resolution_clock::now(); 87 | res = HBLAS_DOTUV(N,&X[0],1,&Y[0],1); 88 | auto end = std::chrono::high_resolution_clock::now(); 89 | dur_opt += end - start; 90 | } 91 | 92 | 93 | dur_opt /= NREP; 94 | dur_for /= NREP; 95 | 96 | std::cout << "N = " << N << ", SIMD = " << dur_opt.count() 97 | << ", FORTRAN = " << dur_for.count() 98 | << ", % = " << (dur_for.count() - dur_opt.count()) / dur_for.count() 99 | << std::endl; 100 | 101 | } 102 | 103 | #ifdef USE_ALIGNED_MEM 104 | free(X); free(Y); 105 | #endif 106 | 107 | }; 108 | -------------------------------------------------------------------------------- /src/benchmark/gemm.cxx: -------------------------------------------------------------------------------- 1 | /* 2 | * This file is a part of HAXX 3 | * 4 | * Copyright (c) 2017 David Williams-Young 5 | * All rights reserved. 6 | * 7 | * See LICENSE.txt 8 | */ 9 | 10 | 11 | #include "haxx.hpp" 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include "hblas/hblas3.hpp" 19 | #include "hblas/hblas_util.hpp" 20 | 21 | extern "C" { 22 | 23 | void zgemm_(const char*, const char*, const int*, const int*, 24 | const int*, const std::complex*, const std::complex*, 25 | const int*, const std::complex*, const int*, 26 | const std::complex*, const std::complex*, const int*); 27 | 28 | }; 29 | 30 | #define HBLAS1_RAND_MIN -20 31 | #define HBLAS1_RAND_MAX 54 32 | 33 | // Setup Random Number generator 34 | std::random_device rd; 35 | std::mt19937 gen(rd()); 36 | std::uniform_real_distribution<> dis(HBLAS1_RAND_MIN,HBLAS1_RAND_MAX); 37 | 38 | 39 | //#define _DO_COMPLEX 40 | //#define _DO_FORTRAN 41 | 42 | 43 | void outTime(std::string name, size_t LEN, size_t FLOPS, double count) { 44 | 45 | std::cout << std::setw(15) << std::left << name; 46 | std::cout << std::setw(15) << std::left << LEN; 47 | 48 | std::cout << std::setprecision(8); 49 | 50 | std::cout << std::setw(15) << std::right << count; 51 | std::cout << std::setw(15) << std::right << FLOPS/count/1.e9; 52 | 53 | std::cout << std::endl; 54 | 55 | 56 | 57 | }; 58 | 59 | int main() { 60 | 61 | const size_t GEMM_LEN_MIN = 500; 62 | const size_t GEMM_LEN_MAX = 5000; 63 | const size_t GEMM_INC = 500; 64 | 65 | for( auto GEMM_LEN = GEMM_LEN_MIN; 66 | GEMM_LEN <= GEMM_LEN_MAX; 67 | GEMM_LEN += GEMM_INC ) { 68 | 69 | 70 | std::vector> 71 | A(GEMM_LEN*GEMM_LEN), B(GEMM_LEN*GEMM_LEN), C(GEMM_LEN*GEMM_LEN); 72 | 73 | for(auto &x : A) 74 | x = HAXX::quaternion(dis(gen),dis(gen),dis(gen),dis(gen)); 75 | for(auto &x : B) 76 | x = HAXX::quaternion(dis(gen),dis(gen),dis(gen),dis(gen)); 77 | for(auto &x : C) 78 | x = HAXX::quaternion(dis(gen),dis(gen),dis(gen),dis(gen)); 79 | 80 | #ifdef _DO_COMPLEX 81 | std::vector> 82 | AC(2*GEMM_LEN*2*GEMM_LEN), BC(2*GEMM_LEN*2*GEMM_LEN), CC(2*GEMM_LEN*2*GEMM_LEN); 83 | 84 | int x2x = 2*GEMM_LEN; 85 | int gl = GEMM_LEN; 86 | HBLAS_COMPLEX_EXPAND('S',GEMM_LEN,GEMM_LEN,&A[0],GEMM_LEN,&AC[0],2*GEMM_LEN); 87 | HBLAS_COMPLEX_EXPAND('S',GEMM_LEN,GEMM_LEN,&B[0],GEMM_LEN,&BC[0],2*GEMM_LEN); 88 | HBLAS_COMPLEX_EXPAND('S',GEMM_LEN,GEMM_LEN,&C[0],GEMM_LEN,&CC[0],2*GEMM_LEN); 89 | #endif 90 | 91 | 92 | char TRANSA = 'N', TRANSB = 'N'; 93 | 94 | std::complex ALPHA(dis(gen),dis(gen)), BETA(dis(gen),dis(gen)); 95 | 96 | 97 | if(GEMM_LEN == 500) 98 | HBLAS_GEMM(TRANSA,TRANSB,GEMM_LEN,GEMM_LEN,GEMM_LEN,ALPHA,&A[0],GEMM_LEN, 99 | &B[0],GEMM_LEN,BETA,&C[0],GEMM_LEN); 100 | 101 | #ifdef _DO_FORTRAN 102 | auto fortranStart = std::chrono::high_resolution_clock::now(); 103 | hgemmzz_(&TRANSA,&TRANSB,&gl,&gl,&gl,&ALPHA,&A[0],&gl,&B[0],&gl,&BETA,&C[0],&gl); 104 | auto fortranEnd = std::chrono::high_resolution_clock::now(); 105 | std::chrono::duration fortranDur = fortranEnd - fortranStart; 106 | 107 | outTime("HGEMM_FORTRAN",GEMM_LEN,32.*GEMM_LEN*GEMM_LEN*GEMM_LEN, 108 | fortranDur.count()); 109 | #endif 110 | 111 | auto hgemmStart = std::chrono::high_resolution_clock::now(); 112 | HBLAS_GEMM(TRANSA,TRANSB,GEMM_LEN,GEMM_LEN,GEMM_LEN,ALPHA,&A[0],GEMM_LEN, 113 | &B[0],GEMM_LEN,BETA,&C[0],GEMM_LEN); 114 | auto hgemmEnd = std::chrono::high_resolution_clock::now(); 115 | std::chrono::duration hgemmDur = hgemmEnd - hgemmStart; 116 | 117 | outTime("HGEMM",GEMM_LEN,32.*GEMM_LEN*GEMM_LEN*GEMM_LEN,hgemmDur.count()); 118 | 119 | 120 | #ifdef _DO_COMPLEX 121 | auto zgemmStart = std::chrono::high_resolution_clock::now(); 122 | zgemm_(&TRANSA,&TRANSB,&x2x,&x2x,&x2x,&ALPHA,&AC[0],&x2x, 123 | &BC[0],&x2x,&BETA,&CC[0],&x2x); 124 | auto zgemmEnd = std::chrono::high_resolution_clock::now(); 125 | std::chrono::duration zgemmDur = zgemmEnd - zgemmStart; 126 | 127 | outTime("ZGEMM",GEMM_LEN,64.*GEMM_LEN*GEMM_LEN*GEMM_LEN,zgemmDur.count()); 128 | #endif 129 | 130 | 131 | } 132 | } 133 | -------------------------------------------------------------------------------- /src/hblas/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_subdirectory(fortran) 2 | 3 | if( NOT ENABLE_GENERIC_FORTRAN ) 4 | add_subdirectory(cxx) 5 | endif() 6 | 7 | configure_file( hblas1.cxx.in ${PROJECT_BINARY_DIR}/src/hblas/hblas1.cxx ) 8 | configure_file( hblas2.cxx.in ${PROJECT_BINARY_DIR}/src/hblas/hblas2.cxx ) 9 | configure_file( hblas3.cxx.in ${PROJECT_BINARY_DIR}/src/hblas/hblas3.cxx ) 10 | 11 | set( HBLAS_IMPL_SRC 12 | ${PROJECT_BINARY_DIR}/src/hblas/hblas1.cxx 13 | ${PROJECT_BINARY_DIR}/src/hblas/hblas2.cxx 14 | ${PROJECT_BINARY_DIR}/src/hblas/hblas3.cxx 15 | util.cxx 16 | ) 17 | 18 | add_library( hblas STATIC ${HBLAS_IMPL_SRC} ${HBLAS1_OBJ} ${HBLAS3_OBJ} ) 19 | 20 | target_link_libraries(hblas PUBLIC hblas_fortran) 21 | 22 | install(TARGETS hblas DESTINATION lib) 23 | -------------------------------------------------------------------------------- /src/hblas/cxx/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_subdirectory(hblas1) 2 | add_subdirectory(hblas3) 3 | -------------------------------------------------------------------------------- /src/hblas/cxx/hblas1/AXPYScal.cmake: -------------------------------------------------------------------------------- 1 | # AXPY/SCAL functions 2 | 3 | add_library( hblas_scalvd OBJECT axpy_scal.cxx ) 4 | add_library( hblas_scalvz OBJECT axpy_scal.cxx ) 5 | add_library( hblas_scalvh OBJECT axpy_scal.cxx ) 6 | add_library( hblas_axpyvdh OBJECT axpy_scal.cxx ) 7 | add_library( hblas_axpyvzh OBJECT axpy_scal.cxx ) 8 | add_library( hblas_axpyvhh OBJECT axpy_scal.cxx ) 9 | 10 | target_compile_definitions( hblas_scalvd PRIVATE "FNAME=HBLAS_SCALV" "_SCAL" "ALPHAF=DOUBLE" ) 11 | target_compile_definitions( hblas_scalvz PRIVATE "FNAME=HBLAS_SCALV" "_SCAL" "ALPHAF=DCOMPLEX" ) 12 | target_compile_definitions( hblas_scalvh PRIVATE "FNAME=HBLAS_SCALV" "_SCAL" "ALPHAF=DQUATERNION" ) 13 | target_compile_definitions( hblas_axpyvdh PRIVATE "FNAME=HBLAS_AXPYV" "_AXPY" "ALPHAF=DOUBLE" ) 14 | target_compile_definitions( hblas_axpyvzh PRIVATE "FNAME=HBLAS_AXPYV" "_AXPY" "ALPHAF=DCOMPLEX" ) 15 | target_compile_definitions( hblas_axpyvhh PRIVATE "FNAME=HBLAS_AXPYV" "_AXPY" "ALPHAF=DQUATERNION" ) 16 | 17 | set(AXPY_SCAL_OBJ 18 | $ 19 | $ 20 | $ 21 | $ 22 | $ 23 | $ 24 | CACHE LIST "Object Files for SCAL and AXPY functions" 25 | ) 26 | 27 | -------------------------------------------------------------------------------- /src/hblas/cxx/hblas1/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | ##### HBLAS1 COMPILATION ##### 2 | 3 | include(CopySwap.cmake) 4 | include(Dot.cmake) 5 | include(AXPYScal.cmake) 6 | 7 | 8 | set( HBLAS1_OBJ ${DOT_OBJ} ${AXPY_SCAL_OBJ} ${COPY_SWAP_OBJ} 9 | CACHE LIST "Object Files for HBLAS1 functions" ) 10 | -------------------------------------------------------------------------------- /src/hblas/cxx/hblas1/CopySwap.cmake: -------------------------------------------------------------------------------- 1 | # COPY / SWAP functions 2 | 3 | add_library( hblas_copyv OBJECT copy_swap.cxx ) 4 | add_library( hblas_swapv OBJECT copy_swap.cxx ) 5 | 6 | target_compile_definitions( hblas_copyv PRIVATE "FNAME=HBLAS_COPYV" "_COPY" ) 7 | target_compile_definitions( hblas_swapv PRIVATE "FNAME=HBLAS_SWAPV" "_SWAP" ) 8 | 9 | set( COPY_SWAP_OBJ 10 | $ 11 | $ 12 | CACHE LIST "Object Files for COPYV and SWAPV function" ) 13 | 14 | 15 | -------------------------------------------------------------------------------- /src/hblas/cxx/hblas1/Dot.cmake: -------------------------------------------------------------------------------- 1 | # DOT functions 2 | 3 | add_library( hblas_dotuv OBJECT dot.cxx ) 4 | add_library( hblas_dotcv OBJECT dot.cxx ) 5 | 6 | target_compile_definitions( hblas_dotuv PRIVATE "FNAME=HBLAS_DOTUV" ) 7 | target_compile_definitions( hblas_dotcv PRIVATE "FNAME=HBLAS_DOTCV" "_CONJ" ) 8 | 9 | set( DOT_OBJ 10 | $ 11 | $ 12 | CACHE LIST "Object Files for DOTV functions" 13 | ) 14 | 15 | -------------------------------------------------------------------------------- /src/hblas/cxx/hblas1/axpy_scal.cxx: -------------------------------------------------------------------------------- 1 | /* 2 | * This file is a part of HAXX 3 | * 4 | * Copyright (c) 2017 David Williams-Young 5 | * All rights reserved. 6 | * 7 | * See LICENSE.txt 8 | */ 9 | 10 | #include "haxx.hpp" 11 | #include "hblas/hblas1.hpp" 12 | 13 | #include 14 | 15 | #include 16 | #include 17 | 18 | #include "hblas/config/types.hpp" 19 | 20 | 21 | 22 | namespace HAXX { 23 | 24 | // Optimized AXPY and SCAL operations 25 | 26 | template <> 27 | #ifdef _AXPY 28 | 29 | void FNAME(const char SIDE, const HAXX_INT N, const _ALPHAF ALPHA, 30 | quaternion * const X, const HAXX_INT INCX, 31 | quaternion * const Y, const HAXX_INT INCY) 32 | 33 | #elif defined(_SCAL) 34 | 35 | void FNAME(const char SIDE, const HAXX_INT N, const _ALPHAF ALPHA, 36 | quaternion * const X, const HAXX_INT INCX) 37 | 38 | #endif 39 | { 40 | 41 | // Local vector pointers for increment 42 | quaternion * locX = X; 43 | 44 | #ifdef _AXPY 45 | quaternion * locY = Y; 46 | #endif 47 | 48 | #if defined(__AVX__) || defined(__AVX2__) 49 | 50 | // Load quaternions 51 | __m256d x1; // Load space for an element of X 52 | 53 | #ifdef _AXPY 54 | __m256d y1; // Load space for an element of Y 55 | #endif 56 | 57 | #endif 58 | 59 | const HAXX_INT BatchSize = 1; 60 | const HAXX_INT NBatch = N / BatchSize; 61 | const HAXX_INT nLeft = N % BatchSize; 62 | 63 | HAXX_INT i; 64 | 65 | // Determine side 66 | const bool MulLeft = (SIDE == 'L'); 67 | 68 | // Determine alignment 69 | const bool XIsAligned = IS_ALIGNED(X,REQ_ALIGN); 70 | 71 | #ifdef _AXPY 72 | const bool YIsAligned = IS_ALIGNED(Y,REQ_ALIGN); 73 | const bool isAligned = XIsAligned and YIsAligned; 74 | #else 75 | const bool isAligned = XIsAligned; 76 | #endif 77 | 78 | // Load scaling factor 79 | #if defined(__AVX__) || defined(__AVX2__) 80 | 81 | 82 | #if ALPHAF == DOUBLE 83 | 84 | // alpha = (ALPHA, ALPHA, ALPHA, ALPHA) 85 | const __m256d alpha = _mm256_broadcast_sd(&ALPHA); 86 | 87 | #elif ALPHAF == DCOMPLEX 88 | 89 | 90 | // Quaternion product with a complex number may be thought of as 91 | // two complex products via tha Cayley-Dickson construction of the 92 | // quaternions 93 | // 94 | // A * (B + Cj) = AB + ACj 95 | // (A + Bj) * C = AC + B*CONJ(C) j 96 | 97 | 98 | auto ALPHA_C = std::conj(ALPHA); 99 | 100 | double *ALPHA_PTR = const_cast( 101 | reinterpret_cast(&ALPHA)); 102 | 103 | __m128d alphaC = _mm_loadu_pd(ALPHA_PTR); 104 | 105 | 106 | // Precompute the second vector for complex multiplication 107 | __m128d alphaC_conj = 108 | MulLeft ? _mm_loadu_pd(ALPHA_PTR) : 109 | _mm_loadu_pd(reinterpret_cast(&ALPHA_C)); 110 | 111 | 112 | 113 | // SIDE == 'L' -> 114 | // alpha = (ALPHA_R, ALPHA_I, ALPHA_R, ALPHA_I) 115 | // alpha_conj = (ALPHA_I, -ALPHA_R, ALPHA_I, -ALPHA_R) 116 | // SIDE == 'R' -> 117 | // alpha = (ALPHA_R, ALPHA_I, ALPHA_R, -ALPHA_I) 118 | // alpha_conj = (ALPHA_I, -ALPHA_R, -ALPHA_I, -ALPHA_R) 119 | const __m256d alpha = SET_256D_FROM_128D(alphaC,alphaC_conj); 120 | 121 | // (0 -1 0 -1) mask 122 | const __m256i maskConj = _mm256_set_epi64x( 123 | 0x8000000000000000, 0, 124 | 0x8000000000000000, 0 ); 125 | 126 | const __m256d alpha_conj = _mm256_xor_pd( 127 | _mm256_permute_pd(alpha, 0x5), 128 | _mm256_castsi256_pd(maskConj) 129 | ); 130 | 131 | #elif ALPHAF == DQUATERNION 132 | 133 | // Load ALPHA as is 134 | const __m256d alpha = LOAD_256D_UNALIGNED_AS(double,&ALPHA); 135 | 136 | #endif 137 | 138 | #endif 139 | 140 | for( i = 0; i < NBatch; i++ ) { 141 | 142 | #if defined(__AVX__) || defined (__AVX2__) 143 | 144 | // Load X and possibly Y 145 | if( isAligned ) { 146 | 147 | x1 = LOAD_256D_ALIGNED_AS(double,locX); 148 | #ifdef _AXPY 149 | y1 = LOAD_256D_ALIGNED_AS(double,locY); 150 | #endif 151 | 152 | } else { 153 | 154 | x1 = LOAD_256D_UNALIGNED_AS(double,locX); 155 | #ifdef _AXPY 156 | y1 = LOAD_256D_UNALIGNED_AS(double,locY); 157 | #endif 158 | 159 | } 160 | 161 | #ifdef _SCAL 162 | 163 | // X = ALPHA * X or X * ALPHA 164 | #if ALPHAF == DOUBLE 165 | x1 = _mm256_mul_pd(alpha,x1); 166 | #elif ALPHAF == DCOMPLEX 167 | __m256d p1 = _mm256_mul_pd(x1,alpha); 168 | __m256d p2 = _mm256_mul_pd(x1,alpha_conj); 169 | x1 = _mm256_hsub_pd(p1,p2); 170 | #elif ALPHAF == DQUATERNION 171 | if( MulLeft ) x1 = MULDQ_NN(alpha,x1); 172 | else x1 = MULDQ_NN(x1,alpha); 173 | #endif 174 | 175 | // Store X in place 176 | if( isAligned ) STORE_256D_ALIGNED_AS( double,locX,x1); 177 | else STORE_256D_UNALIGNED_AS(double,locX,x1); 178 | 179 | #elif defined(_AXPY) 180 | 181 | // Y = Y + ALPHA * X or Y + X * ALPHA 182 | #if ALPHAF == DOUBLE 183 | y1 = FMA_256D(alpha,x1,y1); 184 | #elif ALPHAF == DCOMPLEX 185 | __m256d p1 = _mm256_mul_pd(x1,alpha); 186 | __m256d p2 = _mm256_mul_pd(x1,alpha_conj); 187 | y1 = _mm256_add_pd(y1,_mm256_hsub_pd(p1,p2)); 188 | #elif ALPHAF == DQUATERNION 189 | if( MulLeft ) y1 = _mm256_add_pd(y1,MULDQ_NN(alpha,x1)); 190 | else y1 = _mm256_add_pd(y1,MULDQ_NN(x1,alpha)); 191 | #endif 192 | 193 | // Store Y in place 194 | if( isAligned ) STORE_256D_ALIGNED_AS( double,locY,y1); 195 | else STORE_256D_UNALIGNED_AS(double,locY,y1); 196 | 197 | #endif 198 | 199 | #endif 200 | 201 | // Increment vectors 202 | locX += BatchSize * INCX; 203 | #ifdef _AXPY 204 | locY += BatchSize * INCY; 205 | #endif 206 | 207 | } 208 | 209 | // FIXME: this only works for a batch size of 1, need a 210 | // cleanup loop for AVX-512 211 | // 212 | }; 213 | 214 | }; // namespace HAXX 215 | -------------------------------------------------------------------------------- /src/hblas/cxx/hblas1/copy_swap.cxx: -------------------------------------------------------------------------------- 1 | /* 2 | * This file is a part of HAXX 3 | * 4 | * Copyright (c) 2017 David Williams-Young 5 | * All rights reserved. 6 | * 7 | * See LICENSE.txt 8 | */ 9 | 10 | #include "haxx.hpp" 11 | #include "hblas/hblas1.hpp" 12 | 13 | #include "util/simd.hpp" 14 | #include 15 | 16 | #include 17 | 18 | namespace HAXX { 19 | 20 | // Optimized Copy 21 | 22 | template <> 23 | void FNAME(const HAXX_INT N, quaternion * const X, 24 | const HAXX_INT INCX, quaternion * const Y, const HAXX_INT INCY) { 25 | 26 | // Local pointers for increment 27 | quaternion * locX = X, *locY = Y; 28 | 29 | #if defined(__AVX__) || defined(__AVX2__) 30 | 31 | __m256d x1; 32 | 33 | #ifdef _SWAP 34 | 35 | __m256d y1; 36 | 37 | #endif 38 | 39 | #endif 40 | 41 | // Batch size information 42 | const HAXX_INT BatchSize = 1; 43 | const HAXX_INT NBatch = N / BatchSize; 44 | const HAXX_INT nLeft = N % BatchSize; 45 | 46 | HAXX_INT i; 47 | 48 | assert(BatchSize == 1); // Need to address increment for AVX512 49 | 50 | // Determine alignment 51 | const bool XIsAligned = IS_ALIGNED(X,REQ_ALIGN); 52 | const bool YIsAligned = IS_ALIGNED(Y,REQ_ALIGN); 53 | 54 | const bool isAligned = XIsAligned and YIsAligned; 55 | 56 | 57 | if( isAligned ) 58 | for( i = 0; i < NBatch; i++ ) { 59 | 60 | #if defined(__AVX__) || defined(__AVX2__) 61 | 62 | x1 = LOAD_256D_ALIGNED_AS(double,locX); 63 | #ifdef _SWAP 64 | y1 = LOAD_256D_ALIGNED_AS(double,locY); 65 | #endif 66 | 67 | STORE_256D_ALIGNED_AS(double,locY,x1); 68 | #ifdef _SWAP 69 | STORE_256D_ALIGNED_AS(double,locX,y1); 70 | #endif 71 | 72 | #endif 73 | 74 | locX += INCX; 75 | locY += INCY; 76 | 77 | } 78 | else 79 | for( i = 0; i < NBatch; i++ ) { 80 | 81 | #if defined(__AVX__) || defined(__AVX2__) 82 | 83 | x1 = LOAD_256D_UNALIGNED_AS(double,locX); 84 | #ifdef _SWAP 85 | y1 = LOAD_256D_UNALIGNED_AS(double,locY); 86 | #endif 87 | 88 | 89 | STORE_256D_UNALIGNED_AS(double,locY,x1); 90 | #ifdef _SWAP 91 | STORE_256D_UNALIGNED_AS(double,locX,y1); 92 | #endif 93 | 94 | #endif 95 | 96 | locX += INCX; 97 | locY += INCY; 98 | 99 | } 100 | 101 | }; // HBLAS_COPY / HBLAS_SWAP 102 | 103 | }; // namspace HAXX 104 | -------------------------------------------------------------------------------- /src/hblas/cxx/hblas1/dot.cxx: -------------------------------------------------------------------------------- 1 | /* 2 | * This file is a part of HAXX 3 | * 4 | * Copyright (c) 2017 David Williams-Young 5 | * All rights reserved. 6 | * 7 | * See LICENSE.txt 8 | */ 9 | 10 | #include "haxx.hpp" 11 | #include "hblas/hblas1.hpp" 12 | 13 | #include "util/simd.hpp" 14 | #include 15 | 16 | namespace HAXX { 17 | 18 | // Optimized Dot Product 19 | 20 | template <> 21 | quaternion FNAME(const HAXX_INT N, quaternion * const X, 22 | const HAXX_INT INCX, quaternion * const Y, const HAXX_INT INCY) { 23 | 24 | 25 | // Result quaternion 26 | quaternion res(0.); 27 | 28 | // Local pointers for increment 29 | quaternion * locX = X, *locY = Y; 30 | 31 | #if defined(__AVX__) || defined(__AVX2__) 32 | 33 | // Load quaternions 34 | __m256d x1; 35 | __m256d x2; 36 | __m256d x3; 37 | __m256d x4; 38 | 39 | __m256d y1; 40 | __m256d y2; 41 | __m256d y3; 42 | __m256d y4; 43 | 44 | 45 | // Scratch space 46 | __m256d t1, t2, t3, t4; 47 | 48 | // Result buffer (zeroed out) 49 | __m256d r1 = _mm256_setzero_pd(); 50 | __m256d r2 = _mm256_setzero_pd(); 51 | __m256d r3 = _mm256_setzero_pd(); 52 | __m256d r4 = _mm256_setzero_pd(); 53 | 54 | #endif 55 | 56 | // Batch size information 57 | const HAXX_INT BatchSize = 4; 58 | const HAXX_INT NBatch = N / BatchSize; 59 | const HAXX_INT nLeft = N % BatchSize; 60 | 61 | HAXX_INT i; 62 | 63 | 64 | // Determine alignment 65 | const bool XIsAligned = IS_ALIGNED(X,REQ_ALIGN); 66 | const bool YIsAligned = IS_ALIGNED(Y,REQ_ALIGN); 67 | 68 | const bool isAligned = XIsAligned and YIsAligned; 69 | 70 | for( i = 0; i < NBatch; i++ ) { 71 | 72 | #if defined(__AVX__) || defined(__AVX2__) 73 | 74 | // Load 4 X and Y elements 75 | if( isAligned ) { 76 | 77 | x1 = LOAD_256D_ALIGNED_AS(double,locX ); 78 | x2 = LOAD_256D_ALIGNED_AS(double,locX + INCX); 79 | x3 = LOAD_256D_ALIGNED_AS(double,locX + 2*INCX); 80 | x4 = LOAD_256D_ALIGNED_AS(double,locX + 3*INCX); 81 | 82 | y1 = LOAD_256D_ALIGNED_AS(double,locY ); 83 | y2 = LOAD_256D_ALIGNED_AS(double,locY + INCY); 84 | y3 = LOAD_256D_ALIGNED_AS(double,locY + 2*INCY); 85 | y4 = LOAD_256D_ALIGNED_AS(double,locY + 3*INCY); 86 | 87 | } else { 88 | 89 | x1 = LOAD_256D_UNALIGNED_AS(double,locX ); 90 | x2 = LOAD_256D_UNALIGNED_AS(double,locX + INCX); 91 | x3 = LOAD_256D_UNALIGNED_AS(double,locX + 2*INCX); 92 | x4 = LOAD_256D_UNALIGNED_AS(double,locX + 3*INCX); 93 | 94 | y1 = LOAD_256D_UNALIGNED_AS(double,locY ); 95 | y2 = LOAD_256D_UNALIGNED_AS(double,locY + INCY); 96 | y3 = LOAD_256D_UNALIGNED_AS(double,locY + 2*INCY); 97 | y4 = LOAD_256D_UNALIGNED_AS(double,locY + 3*INCY); 98 | 99 | } 100 | 101 | 102 | // Transpose X batch 103 | _MM_TRANSPOSE_4x4_PD(x1,x2,x3,x4,t1,t2,t3,t4); 104 | 105 | // Transpose Y batch 106 | _MM_TRANSPOSE_4x4_PD(y1,y2,y3,y4,t1,t2,t3,t4); 107 | 108 | // R += CONJ(X) * Y 109 | #ifdef _CONJ 110 | INC_MULD4Q_CN(x1,x2,x3,x4,y1,y2,y3,y4,r1,r2,r3,r4); 111 | 112 | // R += X * Y 113 | #else 114 | INC_MULD4Q_NN(x1,x2,x3,x4,y1,y2,y3,y4,r1,r2,r3,r4); 115 | 116 | #endif 117 | 118 | 119 | 120 | #endif 121 | 122 | 123 | // Increment pointers 124 | locX += BatchSize * INCX; 125 | locY += BatchSize * INCY; 126 | 127 | } 128 | 129 | 130 | #if defined(__AVX__) || defined(__AVX2__) 131 | 132 | // Transpose R batch to proper quaternion storage 133 | _MM_TRANSPOSE_4x4_PD(r1,r2,r3,r4,t1,t2,t3,t4); 134 | 135 | // Add up the four quaternions 136 | // r1 = r1 + r2 + r3 + r4 137 | r1 = _mm256_add_pd(r1,r2); 138 | r1 = _mm256_add_pd(r1,r3); 139 | r1 = _mm256_add_pd(r1,r4); 140 | 141 | #endif 142 | 143 | // Cleanup loop, multiply one quaternion at a time 144 | for(i = 0; i < nLeft; i++) { 145 | 146 | // Load a single element of X and Y 147 | if( isAligned ) { 148 | x1 = LOAD_256D_ALIGNED_AS(double,locX + i*INCX); 149 | y1 = LOAD_256D_ALIGNED_AS(double,locY + i*INCY); 150 | } else { 151 | x1 = LOAD_256D_UNALIGNED_AS(double,locX + i*INCX); 152 | y1 = LOAD_256D_UNALIGNED_AS(double,locY + i*INCY); 153 | } 154 | 155 | // r1 += CONJ(X) * Y 156 | #ifdef _CONJ 157 | r1 = _mm256_add_pd(r1,MULDQ_CN(x1,y1)); 158 | 159 | // r1 += X * Y 160 | #else 161 | r1 = _mm256_add_pd(r1,MULDQ_NN(x1,y1)); 162 | 163 | #endif 164 | 165 | } 166 | 167 | // Store the result in a possibly unaligned 168 | // storage 169 | STORE_256D_UNALIGNED_AS(double,(&res),r1); 170 | 171 | 172 | // return 173 | return res; 174 | 175 | }; // HBLAS_DOT? 176 | 177 | }; // namespace HAXX 178 | -------------------------------------------------------------------------------- /src/hblas/cxx/hblas3/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | ##### HBLAS3 COMPILATION ##### 2 | 3 | include(Gemm.cmake) 4 | 5 | set(HBLAS3_OBJ ${GEMM_OBJ} CACHE LIST 6 | "Object Files for HBLAS3 functions" ) 7 | -------------------------------------------------------------------------------- /src/hblas/cxx/hblas3/Gemm.cmake: -------------------------------------------------------------------------------- 1 | # GEMM functions 2 | add_library( hblas_gemmdd OBJECT gemm.cxx kern.cxx) 3 | add_library( hblas_gemmdz OBJECT gemm.cxx kern.cxx) 4 | add_library( hblas_gemmzd OBJECT gemm.cxx kern.cxx) 5 | add_library( hblas_gemmzz OBJECT gemm.cxx kern.cxx) 6 | 7 | target_compile_definitions( hblas_gemmdd PRIVATE "AMATF=DQUATERNION" "BMATF=DQUATERNION" "ALPHAF=DOUBLE" "BETAF=DOUBLE" ) 8 | target_compile_definitions( hblas_gemmdz PRIVATE "AMATF=DQUATERNION" "BMATF=DQUATERNION" "ALPHAF=DOUBLE" "BETAF=DCOMPLEX" ) 9 | target_compile_definitions( hblas_gemmzd PRIVATE "AMATF=DQUATERNION" "BMATF=DQUATERNION" "ALPHAF=DCOMPLEX" "BETAF=DOUBLE" ) 10 | target_compile_definitions( hblas_gemmzz PRIVATE "AMATF=DQUATERNION" "BMATF=DQUATERNION" "ALPHAF=DCOMPLEX" "BETAF=DCOMPLEX" ) 11 | 12 | set(GEMM_OBJ 13 | $ 14 | $ 15 | $ 16 | $ 17 | CACHE LIST "Object Files for GEMM functions" 18 | ) 19 | 20 | -------------------------------------------------------------------------------- /src/hblas/cxx/hblas3/gemm.cxx: -------------------------------------------------------------------------------- 1 | /* 2 | * This file is a part of HAXX 3 | * 4 | * Copyright (c) 2017 David Williams-Young 5 | * All rights reserved. 6 | * 7 | * See LICENSE.txt 8 | */ 9 | 10 | #include "haxx.hpp" 11 | #include "hblas/hblas1.hpp" 12 | #include "hblas/hblas3.hpp" 13 | 14 | #include "util/simd.hpp" 15 | #include "util/macro.hpp" 16 | 17 | #include "hblas/config/types.hpp" 18 | #include "hblas/config/hblas3/gemm.hpp" 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | namespace HAXX { 29 | 30 | // Optimized GEMM 31 | 32 | template 33 | void Kern(HAXX_INT M, HAXX_INT N, HAXX_INT K, T* __restrict__ A, U* __restrict__ B, V* __restrict__ C, 34 | HAXX_INT LDC); 35 | 36 | 37 | 38 | 39 | 40 | 41 | template<> 42 | void HBLAS_GEMM(const char TRANSA, const char TRANSB, const HAXX_INT M, 43 | const HAXX_INT N, const HAXX_INT K, const _ALPHAF ALPHA, _AMATF * const A, 44 | const HAXX_INT LDA, _BMATF * const B, const HAXX_INT LDB, 45 | const _BETAF BETA, quaternion * const C, const HAXX_INT LDC) { 46 | 47 | const bool ATRAN = TRANSA == 'T'; 48 | const bool ACT = TRANSA == 'C'; 49 | const bool ACONJ = TRANSA == 'R'; 50 | 51 | const bool BTRAN = TRANSB == 'T'; 52 | const bool BCT = TRANSB == 'C'; 53 | const bool BCONJ = TRANSB == 'R'; 54 | 55 | // Packed matricies (aligned) 56 | _BMATF *bPack = 57 | (_BMATF*)aligned_alloc(REQ_ALIGN,FixMod(KC*NC,REQ_ALIGN)*sizeof(_BMATF)); 58 | _AMATF *aPack = 59 | (_AMATF*)aligned_alloc(REQ_ALIGN,FixMod(KC*MC,REQ_ALIGN)*sizeof(_AMATF)); 60 | 61 | 62 | // Scale C (on the left) by BETA 63 | HBLAS_SCALM('L','N',M,N,BETA,C,LDC,1); 64 | 65 | // Counter vars 66 | HAXX_INT j, nJ, jDo; 67 | HAXX_INT i, nI, iDo; 68 | HAXX_INT k, nK; 69 | HAXX_INT ii, jj, iii, jjj, nIII, nJJJ; 70 | 71 | // Initial N-cut panels of C and B 72 | quaternion *Cj = C; 73 | _BMATF *Bj = B; 74 | 75 | 76 | // Panel / block pointers 77 | _BMATF *Bp, *BL1; 78 | _AMATF *Ap, *Ai, *smallA; 79 | quaternion *Ci, *CBlk, *smallC; 80 | 81 | for( j = 0; j < N; j += NC ) { 82 | 83 | nJ = std::min(N-j,NC); 84 | jDo = FixMod(nJ,NR); 85 | 86 | Bp = Bj; 87 | Ap = A; 88 | 89 | for( k = 0; k < K; k += KC ) { 90 | 91 | nK = std::min(K-k,KC); 92 | 93 | Ai = Ap; 94 | Ci = Cj; 95 | 96 | // Turn off B Pack 97 | #if 1 98 | 99 | #ifdef _FACTOR_ALPHA_IN_B_PACK 100 | if( BTRAN ) BPACKT (nJ,nK,Bp,LDB,bPack,ALPHA); 101 | else if( BCT ) BPACKCT(nJ,nK,Bp,LDB,bPack,ALPHA); 102 | else if( BCONJ ) BPACKR (nK,nJ,Bp,LDB,bPack,ALPHA); 103 | else BPACK (nK,nJ,Bp,LDB,bPack,ALPHA); 104 | #else 105 | if( BTRAN ) BPACKT (nJ,nK,Bp,LDB,bPack); 106 | else if( BCT ) BPACKCT(nJ,nK,Bp,LDB,bPack); 107 | else if( BCONJ ) BPACKR (nK,nJ,Bp,LDB,bPack); 108 | else BPACK (nK,nJ,Bp,LDB,bPack); 109 | #endif 110 | 111 | #endif 112 | 113 | for( i = 0; i < M; i += MC ) { 114 | 115 | nI = std::min(M-i,MC); 116 | iDo = FixMod(nI,MC); 117 | 118 | BL1 = bPack; 119 | CBlk = Ci; 120 | 121 | // Turn off A Pack 122 | #if 1 123 | 124 | #ifdef _FACTOR_ALPHA_IN_A_PACK 125 | if( ATRAN ) APACKT (nK,nI,Ai,LDA,aPack,ALPHA); 126 | else if( ACT ) APACKCT(nK,nI,Ai,LDA,aPack,ALPHA); 127 | else if( ACONJ ) APACKR (nI,nK,Ai,LDA,aPack,ALPHA); 128 | else APACK (nI,nK,Ai,LDA,aPack,ALPHA); 129 | #else 130 | if( ATRAN ) APACKT (nK,nI,Ai,LDA,aPack); 131 | else if( ACT ) APACKCT(nK,nI,Ai,LDA,aPack); 132 | else if( ACONJ ) APACKR (nI,nK,Ai,LDA,aPack); 133 | else APACK (nI,nK,Ai,LDA,aPack); 134 | #endif 135 | 136 | #endif 137 | 138 | 139 | // Turn off Kernel evaluation 140 | #if 1 141 | for( jj = 0; jj < nJ; jj += NR ) { 142 | nJJJ = std::min(NR,nJ-jj); 143 | 144 | smallC = CBlk; 145 | smallA = aPack; 146 | 147 | for( ii = 0; ii < nI; ii += MR ) { 148 | nIII = std::min(MR,nI-ii); 149 | 150 | // Perform kernel operation 151 | Kern(nIII,nJJJ,nK,smallA,BL1,smallC,LDC); 152 | 153 | smallC += MR; 154 | smallA += MR*nK; 155 | } 156 | 157 | BL1 += NR*nK; 158 | CBlk += NR*LDC; 159 | } 160 | #endif 161 | 162 | if( ATRAN or ACT ) Ai += nI*LDA; 163 | else Ai += nI; 164 | 165 | Ci += nI; 166 | } 167 | 168 | if( ATRAN or ACT ) Ap += nK; 169 | else Ap += nK*LDA; 170 | 171 | if( BTRAN or BCT ) Bp += nK*LDB; 172 | else Bp += nK; 173 | } 174 | 175 | Cj += nJ*LDC; 176 | 177 | if( BTRAN or BCT ) Bj += nJ; 178 | else Bj += nJ*LDB; 179 | } 180 | 181 | // Free packed matricies 182 | free(bPack); free(aPack); 183 | 184 | }; // HBLAS_GEMM 185 | 186 | 187 | }; // namespace HAXX 188 | 189 | -------------------------------------------------------------------------------- /src/hblas/cxx/hblas3/kern.cxx: -------------------------------------------------------------------------------- 1 | /* 2 | * This file is a part of HAXX 3 | * 4 | * Copyright (c) 2017 David Williams-Young 5 | * All rights reserved. 6 | * 7 | * See LICENSE.txt 8 | */ 9 | 10 | #include "haxx.hpp" 11 | #include "hblas/hblas1.hpp" 12 | #include "hblas/hblas3.hpp" 13 | 14 | #include "util/simd.hpp" 15 | #include "util/macro.hpp" 16 | 17 | #include "hblas/config/types.hpp" 18 | #include "hblas/config/hblas3/gemm.hpp" 19 | 20 | 21 | namespace HAXX { 22 | 23 | template 24 | void Kern(HAXX_INT M, HAXX_INT N, HAXX_INT K, T* __restrict__ A, U* __restrict__ B, V* __restrict__ C, 25 | HAXX_INT LDC); 26 | 27 | template<> 28 | void Kern(HAXX_INT M, HAXX_INT N, HAXX_INT K, 29 | quaternion* __restrict__ A, quaternion* __restrict__ B, 30 | quaternion* __restrict__ C, HAXX_INT LDC) { 31 | 32 | __m256d t1,t2,t3,t4; 33 | 34 | const bool M2 = (M == 2); 35 | const bool N2 = (N == 2); 36 | 37 | // Load C 38 | __m256d c00 = LOAD_256D_UNALIGNED_AS(double,C ); 39 | 40 | __m256d c10 = M2 ? LOAD_256D_UNALIGNED_AS(double,C+1 ) : 41 | _mm256_setzero_pd(); 42 | 43 | __m256d c01 = N2 ? LOAD_256D_UNALIGNED_AS(double,C+LDC ) : 44 | _mm256_setzero_pd(); 45 | 46 | __m256d c11 = ( M2 and N2 ) ? LOAD_256D_UNALIGNED_AS(double,C+LDC+1) : 47 | _mm256_setzero_pd(); 48 | 49 | _MM_TRANSPOSE_4x4_PD(c00,c01,c10,c11,t1,t2,t3,t4); 50 | 51 | 52 | quaternion *locB = B, *locA = A; 53 | 54 | HAXX_INT k = K; 55 | 56 | if( k > 0 ) 57 | do { 58 | 59 | // Load A 60 | __m256d a00 = LOAD_256D_ALIGNED_AS(double,locA); 61 | __m256d a10 = LOAD_256D_ALIGNED_AS(double,locA+1); 62 | locA += 2; 63 | 64 | 65 | // Load B 66 | 67 | #ifndef _FACTOR_TRANSPOSE_INTO_B_PACK 68 | __m256d b00 = LOAD_256D_ALIGNED_AS(double,locB); 69 | __m256d b10 = LOAD_256D_ALIGNED_AS(double,locB+1); 70 | #else 71 | double *BasDouble = reinterpret_cast(locB); 72 | __m128d b00lo = LOAD_128D_ALIGNED(BasDouble); 73 | __m128d b00hi = LOAD_128D_ALIGNED(BasDouble+2); 74 | __m128d b10lo = LOAD_128D_ALIGNED(BasDouble+4); 75 | __m128d b10hi = LOAD_128D_ALIGNED(BasDouble+6); 76 | #endif 77 | locB += 2; 78 | 79 | 80 | #ifdef _FACTOR_TRANSPOSE_INTO_A_PACK 81 | 82 | __m256d a_IIII = _mm256_permute_pd(a00,0xF); 83 | a00 = _mm256_permute_pd(a00,0x0); // SSSS 84 | 85 | __m256d a_KKKK = _mm256_permute_pd(a10,0xF); 86 | a10 = _mm256_permute_pd(a10,0x0); // SSSS 87 | 88 | __m256d &a00c = a_IIII; 89 | __m256d &a10c = a_KKKK; 90 | 91 | #else 92 | 93 | __m256d a00c = a00; 94 | __m256d a10c = a10; 95 | _MM_TRANSPOSE_4x4_PD(a00,a00c,a10,a10c,t1,t2,t3,t4); 96 | 97 | #endif 98 | 99 | #ifdef _FACTOR_TRANSPOSE_INTO_B_PACK 100 | 101 | __m256d bSSSS = SET_256D_FROM_128D(b00lo,b00lo); 102 | __m256d bIIII = SET_256D_FROM_128D(b10lo,b10lo); 103 | __m256d bJJJJ = SET_256D_FROM_128D(b00hi,b00hi); 104 | __m256d bKKKK = SET_256D_FROM_128D(b10hi,b10hi); 105 | 106 | __m256d &b00 = bSSSS; 107 | __m256d &b10 = bIIII; 108 | __m256d &b00c = bJJJJ; 109 | __m256d &b10c = bKKKK; 110 | 111 | #else 112 | 113 | __m256d b00c = b00; 114 | __m256d b10c = b10; 115 | _MM_TRANSPOSE_4x4_PD(b00,b10,b00c,b10c,t1,t2,t3,t4); 116 | 117 | #endif 118 | 119 | INC_MULD4Q_NN(a00,a00c,a10,a10c,b00,b10,b00c,b10c,c00,c01,c10,c11); 120 | 121 | 122 | k--; 123 | } while( k > 0 ); 124 | 125 | _MM_TRANSPOSE_4x4_PD(c00,c01,c10,c11,t1,t2,t3,t4); 126 | 127 | STORE_256D_UNALIGNED_AS(double,C ,c00); 128 | if( M2 ) STORE_256D_UNALIGNED_AS(double,C+1 ,c10); 129 | if( N2 ) STORE_256D_UNALIGNED_AS(double,C+LDC ,c01); 130 | if( M2 and N2 ) STORE_256D_UNALIGNED_AS(double,C+LDC+1,c11); 131 | 132 | 133 | } 134 | }; 135 | -------------------------------------------------------------------------------- /src/hblas/fortran/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | 2 | if( ENABLE_GENERIC_FORTRAN ) 3 | set(HBLAS1_SRC 4 | hblas1/hdotu.f hblas1/hdotc.f 5 | hblas1/haxpydh.f hblas1/haxpyhh.f hblas1/haxpych.f 6 | hblas1/hscalh.f hblas1/hscalc.f hblas1/hscald.f) 7 | endif() 8 | 9 | set(HBLAS2_SRC 10 | hblas2/hgemvdd.f hblas2/hgemvzd.f hblas2/hgemvhd.f 11 | hblas2/hgemvdz.f hblas2/hgemvzz.f hblas2/hgemvhz.f 12 | hblas2/hgemvdh.f hblas2/hgemvzh.f hblas2/hgemvhh.f 13 | hblas2/hgerud.f hblas2/hgeruh.f hblas2/hgeruz.f 14 | hblas2/hgercd.f hblas2/hgerch.f hblas2/hgercz.f) 15 | set(HBLAS3_SRC 16 | hblas3/hgemmdd.f hblas3/hgemmzd.f hblas3/hgemmhd.f 17 | hblas3/hgemmdz.f hblas3/hgemmzz.f hblas3/hgemmhz.f 18 | hblas3/hgemmdh.f hblas3/hgemmzh.f hblas3/hgemmhh.f) 19 | 20 | set(UTIL_SRC util/lsame.f util/hzexp.f util/hdexp.f util/hzcon.f) 21 | 22 | add_library(hblas_fortran STATIC ${HBLAS1_SRC} ${HBLAS2_SRC} 23 | ${HBLAS3_SRC} ${UTIL_SRC}) 24 | 25 | 26 | # Install FORTRAN library 27 | install(TARGETS hblas_fortran DESTINATION lib) 28 | -------------------------------------------------------------------------------- /src/hblas/fortran/hblas1/haxpych.f: -------------------------------------------------------------------------------- 1 | C 2 | C This file is a part of HAXX 3 | C 4 | C Copyright (c) 2017 David Williams-Young 5 | C All rights reserved. 6 | C 7 | C See LICENSE.txt 8 | C 9 | Subroutine HAXPYCH(SIDE, N, ALPHA, X, INCX, Y, INCY) 10 | C 11 | LOGICAL LSAME 12 | EXTERNAL LSAME 13 | C 14 | LOGICAL MRIGHT 15 | CHARACTER SIDE 16 | INTEGER*4 N, INCX, INCY, I, IX, IY 17 | REAL*8 ALPHA(2), X(4,*), Y(4,*) 18 | C 19 | IF ( N.LE.0 ) RETURN 20 | IF ( ALPHA(1).EQ.0.0D0 .AND. ALPHA(2).EQ.0.0D0 ) RETURN 21 | C 22 | MRIGHT = LSAME('R',SIDE) 23 | C 24 | IF ( INCX.EQ.1 .AND. INCY.EQ.1 ) THEN 25 | IF ( .NOT. MRIGHT ) THEN 26 | DO 10 I = 1,N 27 | C Hamilton Product 28 | Y(1,I) = Y(1,I) + ALPHA(1)*X(1,I) - ALPHA(2)*X(2,I) 29 | Y(2,I) = Y(2,I) + ALPHA(1)*X(2,I) + ALPHA(2)*X(1,I) 30 | Y(3,I) = Y(3,I) + ALPHA(1)*X(3,I) - ALPHA(2)*X(4,I) 31 | Y(4,I) = Y(4,I) + ALPHA(1)*X(4,I) + ALPHA(2)*X(3,I) 32 | 10 CONTINUE 33 | ELSE 34 | DO 20 I = 1,N 35 | C Hamilton Product 36 | Y(1,I) = Y(1,I) + X(1,I)*ALPHA(1) - X(2,I)*ALPHA(2) 37 | Y(2,I) = Y(2,I) + X(1,I)*ALPHA(2) + X(2,I)*ALPHA(1) 38 | Y(3,I) = Y(3,I) + X(3,I)*ALPHA(1) + X(4,I)*ALPHA(2) 39 | Y(4,I) = Y(4,I) - X(3,I)*ALPHA(2) + X(4,I)*ALPHA(1) 40 | 20 CONTINUE 41 | ENDIF 42 | ELSE 43 | 44 | IX = 1 45 | IY = 1 46 | IF ( INCX.LT.0 ) IX = (-N + 1)*INCX + 1 47 | IF ( INCY.LT.0 ) IY = (-N + 1)*INCY + 1 48 | 49 | IF ( .NOT. MRIGHT ) THEN 50 | DO 30 I = 1,N 51 | C Hamilton Product 52 | Y(1,IY) = Y(1,IY) + ALPHA(1)*X(1,IX) - ALPHA(2)*X(2,IX) 53 | Y(2,IY) = Y(2,IY) + ALPHA(1)*X(2,IX) + ALPHA(2)*X(1,IX) 54 | Y(3,IY) = Y(3,IY) + ALPHA(1)*X(3,IX) - ALPHA(2)*X(4,IX) 55 | Y(4,IY) = Y(4,IY) + ALPHA(1)*X(4,IX) + ALPHA(2)*X(3,IX) 56 | 57 | IX = IX + INCX 58 | IY = IY + INCY 59 | 30 CONTINUE 60 | ELSE 61 | DO 40 I = 1,N 62 | C Hamilton Product 63 | Y(1,IY) = Y(1,IY) + X(1,IX)*ALPHA(1) - X(2,IX)*ALPHA(2) 64 | Y(2,IY) = Y(2,IY) + X(1,IX)*ALPHA(2) + X(2,IX)*ALPHA(1) 65 | Y(3,IY) = Y(3,IY) + X(3,IX)*ALPHA(1) + X(4,IX)*ALPHA(2) 66 | Y(4,IY) = Y(4,IY) - X(3,IX)*ALPHA(2) + X(4,IX)*ALPHA(1) 67 | 68 | IX = IX + INCX 69 | IY = IY + INCY 70 | 40 CONTINUE 71 | ENDIF 72 | ENDIF 73 | C 74 | END 75 | -------------------------------------------------------------------------------- /src/hblas/fortran/hblas1/haxpydh.f: -------------------------------------------------------------------------------- 1 | C 2 | C This file is a part of HAXX 3 | C 4 | C Copyright (c) 2017 David Williams-Young 5 | C All rights reserved. 6 | C 7 | C See LICENSE.txt 8 | C 9 | Subroutine HAXPYDH(SIDE, N, ALPHA, X, INCX, Y, INCY) 10 | C 11 | LOGICAL LSAME 12 | EXTERNAL LSAME 13 | C 14 | CHARACTER SIDE 15 | INTEGER*4 N, INCX, INCY, I, IX, IY 16 | REAL*8 ALPHA, X(4,*), Y(4,*) 17 | C 18 | IF ( N.LE.0 ) RETURN 19 | IF ( ALPHA.EQ.0.0D0 ) RETURN 20 | C 21 | IF ( INCX.EQ.1 .AND. INCY.EQ.1 ) THEN 22 | DO 10 I = 1,N 23 | Y(1,I) = Y(1,I) + ALPHA*X(1,I) 24 | Y(2,I) = Y(2,I) + ALPHA*X(2,I) 25 | Y(3,I) = Y(3,I) + ALPHA*X(3,I) 26 | Y(4,I) = Y(4,I) + ALPHA*X(4,I) 27 | 10 CONTINUE 28 | ELSE 29 | 30 | IX = 1 31 | IY = 1 32 | IF ( INCX.LT.0 ) IX = (-N + 1)*INCX + 1 33 | IF ( INCY.LT.0 ) IY = (-N + 1)*INCY + 1 34 | 35 | DO 20 I = 1,N 36 | Y(1,IY) = Y(1,IY) + ALPHA*X(1,IX) 37 | Y(2,IY) = Y(2,IY) + ALPHA*X(2,IX) 38 | Y(3,IY) = Y(3,IY) + ALPHA*X(3,IX) 39 | Y(4,IY) = Y(4,IY) + ALPHA*X(4,IX) 40 | IX = IX + INCX 41 | IY = IY + INCY 42 | 20 CONTINUE 43 | ENDIF 44 | C 45 | END 46 | -------------------------------------------------------------------------------- /src/hblas/fortran/hblas1/haxpyhh.f: -------------------------------------------------------------------------------- 1 | C 2 | C This file is a part of HAXX 3 | C 4 | C Copyright (c) 2017 David Williams-Young 5 | C All rights reserved. 6 | C 7 | C See LICENSE.txt 8 | C 9 | Subroutine HAXPYHH(SIDE, N, ALPHA, X, INCX, Y, INCY) 10 | C 11 | LOGICAL LSAME 12 | EXTERNAL LSAME 13 | C 14 | LOGICAL MRIGHT 15 | CHARACTER SIDE 16 | INTEGER*4 N, INCX, INCY, I, IX, IY 17 | REAL*8 ALPHA(4), X(4,*), Y(4,*) 18 | C 19 | IF ( N.LE.0 ) RETURN 20 | IF ( ALPHA(1).EQ.0.0D0 .AND. ALPHA(2).EQ.0.0D0 .AND. 21 | $ ALPHA(3).EQ.0.0D0 .AND. ALPHA(4).EQ.0.0D0 ) RETURN 22 | C 23 | MRIGHT = LSAME('R',SIDE) 24 | C 25 | IF ( INCX.EQ.1 .AND. INCY.EQ.1 ) THEN 26 | IF ( .NOT. MRIGHT ) THEN 27 | DO 10 I = 1,N 28 | C Hamilton Product 29 | Y(1,I) = Y(1,I) + ALPHA(1)*X(1,I) - ALPHA(2)*X(2,I) - 30 | $ ALPHA(3)*X(3,I) - ALPHA(4)*X(4,I) 31 | 32 | Y(2,I) = Y(2,I) + ALPHA(1)*X(2,I) + ALPHA(2)*X(1,I) + 33 | $ ALPHA(3)*X(4,I) - ALPHA(4)*X(3,I) 34 | 35 | Y(3,I) = Y(3,I) + ALPHA(1)*X(3,I) - ALPHA(2)*X(4,I) + 36 | $ ALPHA(3)*X(1,I) + ALPHA(4)*X(2,I) 37 | 38 | Y(4,I) = Y(4,I) + ALPHA(1)*X(4,I) + ALPHA(2)*X(3,I) - 39 | $ ALPHA(3)*X(2,I) + ALPHA(4)*X(1,I) 40 | 10 CONTINUE 41 | ELSE 42 | DO 20 I = 1,N 43 | C Hamilton Product 44 | Y(1,I) = Y(1,I) + X(1,I)*ALPHA(1) - X(2,I)*ALPHA(2) - 45 | $ X(3,I)*ALPHA(3) - X(4,I)*ALPHA(4) 46 | 47 | Y(2,I) = Y(2,I) + X(1,I)*ALPHA(2) + X(2,I)*ALPHA(1) + 48 | $ X(3,I)*ALPHA(4) - X(4,I)*ALPHA(3) 49 | 50 | Y(3,I) = Y(3,I) + X(1,I)*ALPHA(3) - X(2,I)*ALPHA(4) + 51 | $ X(3,I)*ALPHA(1) + X(4,I)*ALPHA(2) 52 | 53 | Y(4,I) = Y(4,I) + X(1,I)*ALPHA(4) + X(2,I)*ALPHA(3) - 54 | $ X(3,I)*ALPHA(2) + X(4,I)*ALPHA(1) 55 | 20 CONTINUE 56 | ENDIF 57 | ELSE 58 | 59 | IX = 1 60 | IY = 1 61 | IF ( INCX.LT.0 ) IX = (-N + 1)*INCX + 1 62 | IF ( INCY.LT.0 ) IY = (-N + 1)*INCY + 1 63 | 64 | IF ( .NOT. MRIGHT ) THEN 65 | DO 30 I = 1,N 66 | C Hamilton Product 67 | Y(1,IY) = Y(1,IY) + ALPHA(1)*X(1,IX) - ALPHA(2)*X(2,IX) - 68 | $ ALPHA(3)*X(3,IX) - ALPHA(4)*X(4,IX) 69 | 70 | Y(2,IY) = Y(2,IY) + ALPHA(1)*X(2,IX) + ALPHA(2)*X(1,IX) + 71 | $ ALPHA(3)*X(4,IX) - ALPHA(4)*X(3,IX) 72 | 73 | Y(3,IY) = Y(3,IY) + ALPHA(1)*X(3,IX) - ALPHA(2)*X(4,IX) + 74 | $ ALPHA(3)*X(1,IX) + ALPHA(4)*X(2,IX) 75 | 76 | Y(4,IY) = Y(4,IY) + ALPHA(1)*X(4,IX) + ALPHA(2)*X(3,IX) - 77 | $ ALPHA(3)*X(2,IX) + ALPHA(4)*X(1,IX) 78 | 79 | IX = IX + INCX 80 | IY = IY + INCY 81 | 30 CONTINUE 82 | ELSE 83 | DO 40 I = 1,N 84 | C Hamilton Product 85 | Y(1,IY) = Y(1,IY) + X(1,IX)*ALPHA(1) - X(2,IX)*ALPHA(2) - 86 | $ X(3,IX)*ALPHA(3) - X(4,IX)*ALPHA(4) 87 | 88 | Y(2,IY) = Y(2,IY) + X(1,IX)*ALPHA(2) + X(2,IX)*ALPHA(1) + 89 | $ X(3,IX)*ALPHA(4) - X(4,IX)*ALPHA(3) 90 | 91 | Y(3,IY) = Y(3,IY) + X(1,IX)*ALPHA(3) - X(2,IX)*ALPHA(4) + 92 | $ X(3,IX)*ALPHA(1) + X(4,IX)*ALPHA(2) 93 | 94 | Y(4,IY) = Y(4,IY) + X(1,IX)*ALPHA(4) + X(2,IX)*ALPHA(3) - 95 | $ X(3,IX)*ALPHA(2) + X(4,IX)*ALPHA(1) 96 | 97 | IX = IX + INCX 98 | IY = IY + INCY 99 | 40 CONTINUE 100 | ENDIF 101 | ENDIF 102 | C 103 | END 104 | -------------------------------------------------------------------------------- /src/hblas/fortran/hblas1/hdotc.f: -------------------------------------------------------------------------------- 1 | C 2 | C This file is a part of HAXX 3 | C 4 | C Copyright (c) 2017 David Williams-Young 5 | C All rights reserved. 6 | C 7 | C See LICENSE.txt 8 | C 9 | Subroutine HDOTC(R, N, X, INCX, Y, INCY) 10 | C 11 | INTEGER*4 N, INCX, INCY, I,IX,IY 12 | REAL*8 X(4,*), Y(4,*), R(4) 13 | C 14 | R(1) = 0.0D+0 15 | R(2) = 0.0D+0 16 | R(3) = 0.0D+0 17 | R(4) = 0.0D+0 18 | C 19 | IF ( N.LE.0 ) RETURN 20 | IF ( INCX.EQ.1 .AND. INCY.EQ.1 ) THEN 21 | DO 10 I = 1,N 22 | C Hamilton Product 23 | R(1) = R(1) + X(1,I)*Y(1,I) + X(2,I)*Y(2,I) + 24 | $ X(3,I)*Y(3,I) + X(4,I)*Y(4,I) 25 | 26 | R(2) = R(2) + X(1,I)*Y(2,I) - X(2,I)*Y(1,I) - 27 | $ X(3,I)*Y(4,I) + X(4,I)*Y(3,I) 28 | 29 | R(3) = R(3) + X(1,I)*Y(3,I) + X(2,I)*Y(4,I) - 30 | $ X(3,I)*Y(1,I) - X(4,I)*Y(2,I) 31 | 32 | R(4) = R(4) + X(1,I)*Y(4,I) - X(2,I)*Y(3,I) + 33 | $ X(3,I)*Y(2,I) - X(4,I)*Y(1,I) 34 | 10 CONTINUE 35 | ELSE 36 | 37 | IX = 1 38 | IY = 1 39 | IF ( INCX.LT.0 ) IX = (-N + 1)*INCX + 1 40 | IF ( INCY.LT.0 ) IY = (-N + 1)*INCY + 1 41 | 42 | DO 20 I = 1,N 43 | C Hamilton Product 44 | R(1) = R(1) + X(1,IX)*Y(1,IY) + X(2,IX)*Y(2,IY) + 45 | $ X(3,IX)*Y(3,IY) + X(4,IX)*Y(4,IY) 46 | 47 | R(2) = R(2) + X(1,IX)*Y(2,IY) - X(2,IX)*Y(1,IY) - 48 | $ X(3,IX)*Y(4,IY) + X(4,IX)*Y(3,IY) 49 | 50 | R(3) = R(3) + X(1,IX)*Y(3,IY) + X(2,IX)*Y(4,IY) - 51 | $ X(3,IX)*Y(1,IY) - X(4,IX)*Y(2,IY) 52 | 53 | R(4) = R(4) + X(1,IX)*Y(4,IY) - X(2,IX)*Y(3,IY) + 54 | $ X(3,IX)*Y(2,IY) - X(4,IX)*Y(1,IY) 55 | IX = IX + INCX 56 | IY = IY + INCY 57 | 20 CONTINUE 58 | ENDIF 59 | C 60 | END 61 | -------------------------------------------------------------------------------- /src/hblas/fortran/hblas1/hdotu.f: -------------------------------------------------------------------------------- 1 | C 2 | C This file is a part of HAXX 3 | C 4 | C Copyright (c) 2017 David Williams-Young 5 | C All rights reserved. 6 | C 7 | C See LICENSE.txt 8 | C 9 | Subroutine HDOTU(R, N, X, INCX, Y, INCY) 10 | C 11 | INTEGER*4 N, INCX, INCY, I,IX,IY 12 | REAL*8 X(4,*), Y(4,*), R(4) 13 | C 14 | R(1) = 0.0D+0 15 | R(2) = 0.0D+0 16 | R(3) = 0.0D+0 17 | R(4) = 0.0D+0 18 | C 19 | IF ( N.LE.0 ) RETURN 20 | IF ( INCX.EQ.1 .AND. INCY.EQ.1 ) THEN 21 | DO 10 I = 1,N 22 | C Hamilton Product 23 | R(1) = R(1) + X(1,I)*Y(1,I) - X(2,I)*Y(2,I) - 24 | $ X(3,I)*Y(3,I) - X(4,I)*Y(4,I) 25 | 26 | R(2) = R(2) + X(1,I)*Y(2,I) + X(2,I)*Y(1,I) + 27 | $ X(3,I)*Y(4,I) - X(4,I)*Y(3,I) 28 | 29 | R(3) = R(3) + X(1,I)*Y(3,I) - X(2,I)*Y(4,I) + 30 | $ X(3,I)*Y(1,I) + X(4,I)*Y(2,I) 31 | 32 | R(4) = R(4) + X(1,I)*Y(4,I) + X(2,I)*Y(3,I) - 33 | $ X(3,I)*Y(2,I) + X(4,I)*Y(1,I) 34 | 10 CONTINUE 35 | ELSE 36 | 37 | IX = 1 38 | IY = 1 39 | IF ( INCX.LT.0 ) IX = (-N + 1)*INCX + 1 40 | IF ( INCY.LT.0 ) IY = (-N + 1)*INCY + 1 41 | 42 | DO 20 I = 1,N 43 | C Hamilton Product 44 | R(1) = R(1) + X(1,IX)*Y(1,IY) - X(2,IX)*Y(2,IY) - 45 | $ X(3,IX)*Y(3,IY) - X(4,IX)*Y(4,IY) 46 | 47 | R(2) = R(2) + X(1,IX)*Y(2,IY) + X(2,IX)*Y(1,IY) + 48 | $ X(3,IX)*Y(4,IY) - X(4,IX)*Y(3,IY) 49 | 50 | R(3) = R(3) + X(1,IX)*Y(3,IY) - X(2,IX)*Y(4,IY) + 51 | $ X(3,IX)*Y(1,IY) + X(4,IX)*Y(2,IY) 52 | 53 | R(4) = R(4) + X(1,IX)*Y(4,IY) + X(2,IX)*Y(3,IY) - 54 | $ X(3,IX)*Y(2,IY) + X(4,IX)*Y(1,IY) 55 | IX = IX + INCX 56 | IY = IY + INCY 57 | 20 CONTINUE 58 | ENDIF 59 | C 60 | END 61 | -------------------------------------------------------------------------------- /src/hblas/fortran/hblas1/hscalc.f: -------------------------------------------------------------------------------- 1 | C 2 | C This file is a part of HAXX 3 | C 4 | C Copyright (c) 2017 David Williams-Young 5 | C All rights reserved. 6 | C 7 | C See LICENSE.txt 8 | C 9 | Subroutine HSCALC(SIDE, N, ALPHA, X, INCX) 10 | C 11 | LOGICAL LSAME 12 | EXTERNAL LSAME 13 | C 14 | LOGICAL MRIGHT 15 | CHARACTER SIDE 16 | INTEGER*4 N, INCX, I, IX 17 | COMPLEX*16 ALPHA, X(2,*) 18 | C 19 | COMPLEX*16 HTMPS, HTMPJ 20 | C 21 | IF ( N.LE.0 ) RETURN 22 | IF ( ABS(ALPHA).EQ.0.0D0 ) RETURN 23 | C 24 | MRIGHT = LSAME('R',SIDE) 25 | C 26 | IF ( INCX.EQ.1 ) THEN 27 | IF ( .NOT. MRIGHT ) THEN 28 | DO 10 I = 1,N 29 | C Hamilton Product 30 | X(1,I) = ALPHA * X(1,I) 31 | X(2,I) = ALPHA * X(2,I) 32 | 10 CONTINUE 33 | ELSE 34 | DO 20 I = 1,N 35 | C Hamilton Product 36 | X(1,I) = X(1,I) * ALPHA 37 | X(2,I) = X(2,I) * CONJG(ALPHA) 38 | 20 CONTINUE 39 | ENDIF 40 | ELSE 41 | 42 | IX = 1 43 | IF ( INCX.LT.0 ) IX = (-N + 1)*INCX + 1 44 | 45 | IF ( .NOT. MRIGHT ) THEN 46 | DO 30 I = 1,N 47 | C Hamilton Product 48 | X(1,IX) = ALPHA * X(1,IX) 49 | X(2,IX) = ALPHA * X(2,IX) 50 | 51 | IX = IX + INCX 52 | 30 CONTINUE 53 | ELSE 54 | DO 40 I = 1,N 55 | C Hamilton Product 56 | X(1,IX) = X(1,IX) * ALPHA 57 | X(2,IX) = X(2,IX) * CONJG(ALPHA) 58 | 59 | IX = IX + INCX 60 | 40 CONTINUE 61 | ENDIF 62 | ENDIF 63 | C 64 | END 65 | -------------------------------------------------------------------------------- /src/hblas/fortran/hblas1/hscald.f: -------------------------------------------------------------------------------- 1 | C 2 | C This file is a part of HAXX 3 | C 4 | C Copyright (c) 2017 David Williams-Young 5 | C All rights reserved. 6 | C 7 | C See LICENSE.txt 8 | C 9 | Subroutine HSCALD(SIDE, N, ALPHA, X, INCX) 10 | C 11 | LOGICAL LSAME 12 | EXTERNAL LSAME 13 | C 14 | CHARACTER SIDE 15 | INTEGER*4 N, INCX, I, IX 16 | REAL*8 ALPHA, X(4,*) 17 | C 18 | IF ( N.LE.0 ) RETURN 19 | IF ( ALPHA.EQ.0.0D0 ) RETURN 20 | C 21 | IF ( INCX.EQ.1 ) THEN 22 | DO 10 I = 1,N 23 | X(1,I) = ALPHA*X(1,I) 24 | X(2,I) = ALPHA*X(2,I) 25 | X(3,I) = ALPHA*X(3,I) 26 | X(4,I) = ALPHA*X(4,I) 27 | 10 CONTINUE 28 | ELSE 29 | 30 | IX = 1 31 | IF ( INCX.LT.0 ) IX = (-N + 1)*INCX + 1 32 | 33 | DO 20 I = 1,N 34 | X(1,IX) = ALPHA*X(1,IX) 35 | X(2,IX) = ALPHA*X(2,IX) 36 | X(3,IX) = ALPHA*X(3,IX) 37 | X(4,IX) = ALPHA*X(4,IX) 38 | 39 | IX = IX + INCX 40 | 20 CONTINUE 41 | ENDIF 42 | C 43 | END 44 | -------------------------------------------------------------------------------- /src/hblas/fortran/hblas1/hscalh.f: -------------------------------------------------------------------------------- 1 | C 2 | C This file is a part of HAXX 3 | C 4 | C Copyright (c) 2017 David Williams-Young 5 | C All rights reserved. 6 | C 7 | C See LICENSE.txt 8 | C 9 | Subroutine HSCALH(SIDE, N, ALPHA, X, INCX) 10 | C 11 | LOGICAL LSAME 12 | EXTERNAL LSAME 13 | C 14 | LOGICAL MRIGHT 15 | CHARACTER SIDE 16 | INTEGER*4 N, INCX, I, IX 17 | REAL*8 ALPHA(4), X(4,*) 18 | C 19 | REAL*8 HTMPS, HTMPI, HTMPJ 20 | C 21 | IF ( N.LE.0 ) RETURN 22 | IF ( ALPHA(1).EQ.0.0D0 .AND. ALPHA(2).EQ.0.0D0 .AND. 23 | $ ALPHA(3).EQ.0.0D0 .AND. ALPHA(4).EQ.0.0D0 ) RETURN 24 | C 25 | MRIGHT = LSAME('R',SIDE) 26 | C 27 | IF ( INCX.EQ.1 ) THEN 28 | IF ( .NOT. MRIGHT ) THEN 29 | DO 10 I = 1,N 30 | C Hamilton Product 31 | HTMPS = ALPHA(1)*X(1,I) - ALPHA(2)*X(2,I) - 32 | $ ALPHA(3)*X(3,I) - ALPHA(4)*X(4,I) 33 | 34 | HTMPI = ALPHA(1)*X(2,I) + ALPHA(2)*X(1,I) + 35 | $ ALPHA(3)*X(4,I) - ALPHA(4)*X(3,I) 36 | 37 | HTMPJ = ALPHA(1)*X(3,I) - ALPHA(2)*X(4,I) + 38 | $ ALPHA(3)*X(1,I) + ALPHA(4)*X(2,I) 39 | 40 | X(4,I) = ALPHA(1)*X(4,I) + ALPHA(2)*X(3,I) - 41 | $ ALPHA(3)*X(2,I) + ALPHA(4)*X(1,I) 42 | X(1,I) = HTMPS 43 | X(2,I) = HTMPI 44 | X(3,I) = HTMPJ 45 | 10 CONTINUE 46 | ELSE 47 | DO 20 I = 1,N 48 | C Hamilton Product 49 | HTMPS = X(1,I)*ALPHA(1) - X(2,I)*ALPHA(2) - 50 | $ X(3,I)*ALPHA(3) - X(4,I)*ALPHA(4) 51 | 52 | HTMPI = X(1,I)*ALPHA(2) + X(2,I)*ALPHA(1) + 53 | $ X(3,I)*ALPHA(4) - X(4,I)*ALPHA(3) 54 | 55 | HTMPJ = X(1,I)*ALPHA(3) - X(2,I)*ALPHA(4) + 56 | $ X(3,I)*ALPHA(1) + X(4,I)*ALPHA(2) 57 | 58 | X(4,I) = X(1,I)*ALPHA(4) + X(2,I)*ALPHA(3) - 59 | $ X(3,I)*ALPHA(2) + X(4,I)*ALPHA(1) 60 | X(1,I) = HTMPS 61 | X(2,I) = HTMPI 62 | X(3,I) = HTMPJ 63 | 20 CONTINUE 64 | ENDIF 65 | ELSE 66 | 67 | IX = 1 68 | IF ( INCX.LT.0 ) IX = (-N + 1)*INCX + 1 69 | 70 | IF ( .NOT. MRIGHT ) THEN 71 | DO 30 I = 1,N 72 | C Hamilton Product 73 | HTMPS = ALPHA(1)*X(1,IX) - ALPHA(2)*X(2,IX) - 74 | $ ALPHA(3)*X(3,IX) - ALPHA(4)*X(4,IX) 75 | 76 | HTMPI = ALPHA(1)*X(2,IX) + ALPHA(2)*X(1,IX) + 77 | $ ALPHA(3)*X(4,IX) - ALPHA(4)*X(3,IX) 78 | 79 | HTMPJ = ALPHA(1)*X(3,IX) - ALPHA(2)*X(4,IX) + 80 | $ ALPHA(3)*X(1,IX) + ALPHA(4)*X(2,IX) 81 | 82 | X(4,IX) = ALPHA(1)*X(4,IX) + ALPHA(2)*X(3,IX) - 83 | $ ALPHA(3)*X(2,IX) + ALPHA(4)*X(1,IX) 84 | X(1,IX) = HTMPS 85 | X(2,IX) = HTMPI 86 | X(3,IX) = HTMPJ 87 | 88 | IX = IX + INCX 89 | 30 CONTINUE 90 | ELSE 91 | DO 40 I = 1,N 92 | C Hamilton Product 93 | HTMPS = X(1,IX)*ALPHA(1) - X(2,IX)*ALPHA(2) - 94 | $ X(3,IX)*ALPHA(3) - X(4,IX)*ALPHA(4) 95 | 96 | HTMPI = X(1,IX)*ALPHA(2) + X(2,IX)*ALPHA(1) + 97 | $ X(3,IX)*ALPHA(4) - X(4,IX)*ALPHA(3) 98 | 99 | HTMPJ = X(1,IX)*ALPHA(3) - X(2,IX)*ALPHA(4) + 100 | $ X(3,IX)*ALPHA(1) + X(4,IX)*ALPHA(2) 101 | 102 | X(4,IX) = X(1,IX)*ALPHA(4) + X(2,IX)*ALPHA(3) - 103 | $ X(3,IX)*ALPHA(2) + X(4,IX)*ALPHA(1) 104 | X(1,IX) = HTMPS 105 | X(2,IX) = HTMPI 106 | X(3,IX) = HTMPJ 107 | 108 | IX = IX + INCX 109 | 40 CONTINUE 110 | ENDIF 111 | ENDIF 112 | C 113 | END 114 | -------------------------------------------------------------------------------- /src/hblas/fortran/hblas2/hgemvdd.f: -------------------------------------------------------------------------------- 1 | C 2 | C This file is a part of HAXX 3 | C 4 | C Copyright (c) 2017 David Williams-Young 5 | C All rights reserved. 6 | C 7 | C See LICENSE.txt 8 | C 9 | Subroutine HGEMVDD(TRANS, M, N, ALPHA, A, LDA, X, INCX, BETA, 10 | $ Y, INCY) 11 | C 12 | LOGICAL LSAME 13 | EXTERNAL LSAME 14 | C 15 | CHARACTER TRANS 16 | LOGICAL NOCONJ, AZERO,AONE,BZERO,BONE 17 | INTEGER*4 M,N, LDA,INCX,INCY, LENX,LENY, I,J,IY,JX,IX,JY, KX,KY 18 | REAL*8 ALPHA,BETA 19 | COMPLEX*16 A(2,LDA,*), X(2,*), Y(2,*) 20 | C 21 | COMPLEX*16 HTMPS, HTMPJ 22 | C 23 | COMPLEX*16 ONE, ZERO 24 | PARAMETER (ONE=(1.0D+0,0.0D+0), ZERO=(0.0D+0,0.0D+0)) 25 | C 26 | IF ( (M.EQ.0) .OR. (N.EQ.0) .OR. 27 | $ ( (ALPHA.EQ.ZERO) .AND. (BETA.EQ.ONE) ) ) RETURN 28 | C 29 | NOCONJ = LSAME('T',TRANS) 30 | IF ( LSAME('N',TRANS) ) THEN 31 | LENX = N 32 | LENY = M 33 | ELSE 34 | LENX = M 35 | LENY = N 36 | ENDIF 37 | 38 | AZERO = ALPHA.EQ.ZERO 39 | AONE = ALPHA.EQ.ONE 40 | BZERO = BETA.EQ.ZERO 41 | BONE = BETA.EQ.ONE 42 | C 43 | IF ( INCX.GT.0 ) THEN 44 | KX = 1 45 | ELSE 46 | KX = 1 - (LENX-1)*INCX 47 | ENDIF 48 | 49 | IF ( INCY.GT.0 ) THEN 50 | KY = 1 51 | ELSE 52 | KY = 1 - (LENY-1)*INCY 53 | ENDIF 54 | C 55 | 56 | IF ( .NOT.BONE ) THEN 57 | IF ( INCY.EQ.1 ) THEN 58 | IF ( BZERO ) THEN 59 | DO 10 I = 1,LENY 60 | Y(1,I) = ZERO 61 | Y(2,I) = ZERO 62 | 10 CONTINUE 63 | ELSE 64 | DO 20 I = 1,LENY 65 | Y(1,I) = BETA * Y(1,I) 66 | Y(2,I) = BETA * Y(2,I) 67 | 20 CONTINUE 68 | ENDIF 69 | ELSE 70 | IY = KY 71 | IF ( BZERO ) THEN 72 | DO 30 I = 1,LENY 73 | Y(1,IY) = ZERO 74 | Y(2,IY) = ZERO 75 | IY = IY + INCY 76 | 30 CONTINUE 77 | ELSE 78 | DO 40 I = 1,LENY 79 | Y(1,IY) = BETA * Y(1,IY) 80 | Y(2,IY) = BETA * Y(2,IY) 81 | IY = IY + INCY 82 | 40 CONTINUE 83 | ENDIF 84 | ENDIF 85 | ENDIF 86 | C 87 | IF ( AZERO ) RETURN ! Nothing left to do 88 | C 89 | IF ( LSAME(TRANS,'N') ) THEN 90 | JX = KX 91 | IF ( INCY.EQ.1 ) THEN 92 | DO 60 J = 1,N 93 | HTMPS = ALPHA * X(1,JX) 94 | HTMPJ = ALPHA * X(2,JX) 95 | Do 50 I = 1,M 96 | C Hamilton Product 97 | Y(1,I) = Y(1,I) + 98 | $ A(1,I,J)*HTMPS - A(2,I,J)*CONJG(HTMPJ) 99 | Y(2,I) = Y(2,I) + 100 | $ A(1,I,J)*HTMPJ + A(2,I,J)*CONJG(HTMPS) 101 | 50 CONTINUE 102 | JX = JX + INCX 103 | 60 CONTINUE 104 | ELSE 105 | DO 80 J = 1,N 106 | HTMPS = ALPHA * X(1,JX) 107 | HTMPJ = ALPHA * X(2,JX) 108 | IY = KY 109 | Do 70 I = 1,M 110 | C Hamilton Product 111 | Y(1,IY) = Y(1,IY) + 112 | $ A(1,I,J)*HTMPS - A(2,I,J)*CONJG(HTMPJ) 113 | Y(2,IY) = Y(2,IY) + 114 | $ A(1,I,J)*HTMPJ + A(2,I,J)*CONJG(HTMPS) 115 | IY = IY + INCY 116 | 70 CONTINUE 117 | JX = JX + INCX 118 | 80 CONTINUE 119 | ENDIF 120 | ELSE 121 | JY = KY 122 | IF ( INCX.EQ.1 ) THEN 123 | DO 110 J = 1,N 124 | HTMPS = ZERO 125 | HTMPJ = ZERO 126 | IF ( NOCONJ ) THEN 127 | DO 90 I = 1,M 128 | C Hamilton Product 129 | HTMPS = HTMPS + A(1,I,J)*X(1,I) - A(2,I,J)*CONJG(X(2,I)) 130 | HTMPJ = HTMPJ + A(1,I,J)*X(2,I) + A(2,I,J)*CONJG(X(1,I)) 131 | 90 CONTINUE 132 | ELSE 133 | DO 100 I = 1,M 134 | C Hamilton Product (IMPLIED CONJ(A)) 135 | HTMPS = HTMPS + 136 | $ CONJG(A(1,I,J))*X(1,I) + A(2,I,J)*CONJG(X(2,I)) 137 | HTMPJ = HTMPJ + 138 | $ CONJG(A(1,I,J))*X(2,I) - A(2,I,J)*CONJG(X(1,I)) 139 | 100 CONTINUE 140 | ENDIF 141 | Y(1,JY) = Y(1,JY) + ALPHA * HTMPS 142 | Y(2,JY) = Y(2,JY) + ALPHA * HTMPJ 143 | JY = JY + INCY 144 | 110 CONTINUE 145 | ELSE 146 | DO 140 J = 1,N 147 | HTMPS = ZERO 148 | HTMPJ = ZERO 149 | IX = KX 150 | IF ( NOCONJ ) THEN 151 | DO 120 I = 1,M 152 | C Hamilton Product 153 | HTMPS = HTMPS + 154 | $ A(1,I,J)*X(1,IX) - A(2,I,J)*CONJG(X(2,IX)) 155 | HTMPJ = HTMPJ + 156 | $ A(1,I,J)*X(2,IX) + A(2,I,J)*CONJG(X(1,IX)) 157 | IX = IX + INCX 158 | 120 CONTINUE 159 | ELSE 160 | DO 130 I = 1,M 161 | C Hamilton Product (IMPLIED CONJ(A)) 162 | HTMPS = HTMPS + 163 | $ CONJG(A(1,I,J))*X(1,IX) + A(2,I,J)*CONJG(X(2,IX)) 164 | HTMPJ = HTMPJ + 165 | $ CONJG(A(1,I,J))*X(2,IX) - A(2,I,J)*CONJG(X(1,IX)) 166 | IX = IX + INCX 167 | 130 CONTINUE 168 | ENDIF 169 | Y(1,JY) = Y(1,JY) + ALPHA * HTMPS 170 | Y(2,JY) = Y(2,JY) + ALPHA * HTMPJ 171 | JY = JY + INCY 172 | 140 CONTINUE 173 | ENDIF 174 | ENDIF 175 | C 176 | END 177 | -------------------------------------------------------------------------------- /src/hblas/fortran/hblas2/hgemvdh.f: -------------------------------------------------------------------------------- 1 | C 2 | C This file is a part of HAXX 3 | C 4 | C Copyright (c) 2017 David Williams-Young 5 | C All rights reserved. 6 | C 7 | C See LICENSE.txt 8 | C 9 | Subroutine HGEMVDH(TRANS, M, N, ALPHA, A, LDA, X, INCX, BETA, 10 | $ Y, INCY) 11 | C 12 | LOGICAL LSAME 13 | EXTERNAL LSAME 14 | C 15 | CHARACTER TRANS 16 | LOGICAL NOCONJ, AZERO,AONE,BZERO,BONE 17 | INTEGER*4 M,N, LDA,INCX,INCY, LENX,LENY, I,J,IY,JX,IX,JY, KX,KY 18 | REAL*8 ALPHA 19 | COMPLEX*16 A(2,LDA,*), X(2,*), Y(2,*), BETA(2) 20 | C 21 | COMPLEX*16 HTMPS, HTMPJ 22 | C 23 | COMPLEX*16 ONE, ZERO 24 | PARAMETER (ONE=(1.0D+0,0.0D+0), ZERO=(0.0D+0,0.0D+0)) 25 | C 26 | C 27 | NOCONJ = LSAME('T',TRANS) 28 | IF ( LSAME('T',TRANS) ) THEN 29 | LENX = N 30 | LENY = M 31 | ELSE 32 | LENX = M 33 | LENY = N 34 | ENDIF 35 | 36 | AZERO = ALPHA.EQ.ZERO 37 | AONE = ALPHA.EQ.ONE 38 | BZERO = (BETA(1).EQ.ZERO) .AND. (BETA(2).EQ.ZERO) 39 | BONE = (BETA(1).EQ.ONE) .AND. (BETA(2).EQ.ZERO) 40 | C 41 | IF ( (M.EQ.0) .OR. (N.EQ.0) .OR. 42 | $ ( (AZERO) .AND. (BONE) ) ) RETURN 43 | C 44 | IF ( INCX.GT.0 ) THEN 45 | KX = 1 46 | ELSE 47 | KX = 1 - (LENX-1)*INCX 48 | ENDIF 49 | 50 | IF ( INCY.GT.0 ) THEN 51 | KY = 1 52 | ELSE 53 | KY = 1 - (LENY-1)*INCY 54 | ENDIF 55 | C 56 | 57 | IF ( .NOT.BONE ) THEN 58 | IF ( INCY.EQ.1 ) THEN 59 | IF ( BZERO ) THEN 60 | DO 10 I = 1,LENY 61 | Y(1,I) = ZERO 62 | Y(2,I) = ZERO 63 | 10 CONTINUE 64 | ELSE 65 | DO 20 I = 1,LENY 66 | HTMPS = BETA(1) * Y(1,I) - BETA(2) * CONJG(Y(2,I)) 67 | Y(2,I) = BETA(1) * Y(2,I) + BETA(2) * CONJG(Y(1,I)) 68 | Y(1,I) = HTMPS 69 | 20 CONTINUE 70 | ENDIF 71 | ELSE 72 | IY = KY 73 | IF ( BZERO ) THEN 74 | DO 30 I = 1,LENY 75 | Y(1,IY) = ZERO 76 | Y(2,IY) = ZERO 77 | IY = IY + INCY 78 | 30 CONTINUE 79 | ELSE 80 | DO 40 I = 1,LENY 81 | HTMPS = BETA(1) * Y(1,IY) - BETA(2) * CONJG(Y(2,IY)) 82 | Y(2,IY) = BETA(1) * Y(2,IY) + BETA(2) * CONJG(Y(1,IY)) 83 | Y(1,IY) = HTMPS 84 | 40 CONTINUE 85 | ENDIF 86 | ENDIF 87 | ENDIF 88 | C 89 | IF ( AZERO ) RETURN ! Nothing left to do 90 | C 91 | IF ( LSAME(TRANS,'N') ) THEN 92 | JX = KX 93 | IF ( INCY.EQ.1 ) THEN 94 | DO 60 J = 1,N 95 | HTMPS = ALPHA * X(1,JX) 96 | HTMPJ = ALPHA * X(2,JX) 97 | Do 50 I = 1,M 98 | C Hamilton Product 99 | Y(1,I) = Y(1,I) + 100 | $ A(1,I,J)*HTMPS - A(2,I,J)*CONJG(HTMPJ) 101 | Y(2,I) = Y(2,I) + 102 | $ A(1,I,J)*HTMPJ + A(2,I,J)*CONJG(HTMPS) 103 | 50 CONTINUE 104 | JX = JX + INCX 105 | 60 CONTINUE 106 | ELSE 107 | DO 80 J = 1,N 108 | HTMPS = ALPHA * X(1,JX) 109 | HTMPJ = ALPHA * X(2,JX) 110 | IY = KY 111 | Do 70 I = 1,M 112 | C Hamilton Product 113 | Y(1,IY) = Y(1,IY) + 114 | $ A(1,I,J)*HTMPS - A(2,I,J)*CONJG(HTMPJ) 115 | Y(2,IY) = Y(2,IY) + 116 | $ A(1,I,J)*HTMPJ + A(2,I,J)*CONJG(HTMPS) 117 | IY = IY + INCY 118 | 70 CONTINUE 119 | JX = JX + INCX 120 | 80 CONTINUE 121 | ENDIF 122 | ELSE 123 | JY = KY 124 | IF ( INCX.EQ.1 ) THEN 125 | DO 110 J = 1,N 126 | HTMPS = ZERO 127 | HTMPJ = ZERO 128 | IF ( NOCONJ ) THEN 129 | DO 90 I = 1,M 130 | C Hamilton Product 131 | HTMPS = HTMPS + A(1,I,J)*X(1,I) - A(2,I,J)*CONJG(X(2,I)) 132 | HTMPJ = HTMPJ + A(1,I,J)*X(2,I) + A(2,I,J)*CONJG(X(1,I)) 133 | 90 CONTINUE 134 | ELSE 135 | DO 100 I = 1,M 136 | C Hamilton Product (IMPLIED CONJ(A)) 137 | HTMPS = HTMPS + 138 | $ CONJG(A(1,I,J))*X(1,I) + A(2,I,J)*CONJG(X(2,I)) 139 | HTMPJ = HTMPJ + 140 | $ CONJG(A(1,I,J))*X(2,I) - A(2,I,J)*CONJG(X(1,I)) 141 | 100 CONTINUE 142 | ENDIF 143 | Y(1,JY) = Y(1,JY) + ALPHA * HTMPS 144 | Y(2,JY) = Y(2,JY) + ALPHA * HTMPJ 145 | JY = JY + INCY 146 | 110 CONTINUE 147 | ELSE 148 | DO 140 J = 1,N 149 | HTMPS = ZERO 150 | HTMPJ = ZERO 151 | IX = KX 152 | IF ( NOCONJ ) THEN 153 | DO 120 I = 1,M 154 | C Hamilton Product 155 | HTMPS = HTMPS + 156 | $ A(1,I,J)*X(1,IX) - A(2,I,J)*CONJG(X(2,IX)) 157 | HTMPJ = HTMPJ + 158 | $ A(1,I,J)*X(2,IX) + A(2,I,J)*CONJG(X(1,IX)) 159 | IX = IX + INCX 160 | 120 CONTINUE 161 | ELSE 162 | DO 130 I = 1,M 163 | C Hamilton Product (IMPLIED CONJ(A)) 164 | HTMPS = HTMPS + 165 | $ CONJG(A(1,I,J))*X(1,IX) + A(2,I,J)*CONJG(X(2,IX)) 166 | HTMPJ = HTMPJ + 167 | $ CONJG(A(1,I,J))*X(2,IX) - A(2,I,J)*CONJG(X(1,IX)) 168 | IX = IX + INCX 169 | 130 CONTINUE 170 | ENDIF 171 | Y(1,JY) = Y(1,JY) + ALPHA * HTMPS 172 | Y(2,JY) = Y(2,JY) + ALPHA * HTMPJ 173 | JY = JY + INCY 174 | 140 CONTINUE 175 | ENDIF 176 | ENDIF 177 | C 178 | END 179 | -------------------------------------------------------------------------------- /src/hblas/fortran/hblas2/hgemvdz.f: -------------------------------------------------------------------------------- 1 | C 2 | C This file is a part of HAXX 3 | C 4 | C Copyright (c) 2017 David Williams-Young 5 | C All rights reserved. 6 | C 7 | C See LICENSE.txt 8 | C 9 | Subroutine HGEMVDZ(TRANS, M, N, ALPHA, A, LDA, X, INCX, BETA, 10 | $ Y, INCY) 11 | C 12 | LOGICAL LSAME 13 | EXTERNAL LSAME 14 | C 15 | CHARACTER TRANS 16 | LOGICAL NOCONJ, AZERO,AONE,BZERO,BONE 17 | INTEGER*4 M,N, LDA,INCX,INCY, LENX,LENY, I,J,IY,JX,IX,JY, KX,KY 18 | REAL*8 ALPHA 19 | COMPLEX*16 A(2,LDA,*), X(2,*), Y(2,*), BETA 20 | C 21 | COMPLEX*16 HTMPS, HTMPJ 22 | C 23 | COMPLEX*16 ONE, ZERO 24 | PARAMETER (ONE=(1.0D+0,0.0D+0), ZERO=(0.0D+0,0.0D+0)) 25 | C 26 | IF ( (M.EQ.0) .OR. (N.EQ.0) .OR. 27 | $ ( (ALPHA.EQ.ZERO) .AND. (BETA.EQ.ONE) ) ) RETURN 28 | C 29 | NOCONJ = LSAME('T',TRANS) 30 | IF ( LSAME('T',TRANS) ) THEN 31 | LENX = N 32 | LENY = M 33 | ELSE 34 | LENX = M 35 | LENY = N 36 | ENDIF 37 | 38 | AZERO = ALPHA.EQ.ZERO 39 | AONE = ALPHA.EQ.ONE 40 | BZERO = BETA.EQ.ZERO 41 | BONE = BETA.EQ.ONE 42 | C 43 | IF ( INCX.GT.0 ) THEN 44 | KX = 1 45 | ELSE 46 | KX = 1 - (LENX-1)*INCX 47 | ENDIF 48 | 49 | IF ( INCY.GT.0 ) THEN 50 | KY = 1 51 | ELSE 52 | KY = 1 - (LENY-1)*INCY 53 | ENDIF 54 | C 55 | 56 | IF ( .NOT.BONE ) THEN 57 | IF ( INCY.EQ.1 ) THEN 58 | IF ( BZERO ) THEN 59 | DO 10 I = 1,LENY 60 | Y(1,I) = ZERO 61 | Y(2,I) = ZERO 62 | 10 CONTINUE 63 | ELSE 64 | DO 20 I = 1,LENY 65 | Y(1,I) = BETA * Y(1,I) 66 | Y(2,I) = BETA * Y(2,I) 67 | 20 CONTINUE 68 | ENDIF 69 | ELSE 70 | IY = KY 71 | IF ( BZERO ) THEN 72 | DO 30 I = 1,LENY 73 | Y(1,IY) = ZERO 74 | Y(2,IY) = ZERO 75 | IY = IY + INCY 76 | 30 CONTINUE 77 | ELSE 78 | DO 40 I = 1,LENY 79 | Y(1,IY) = BETA * Y(1,IY) 80 | Y(2,IY) = BETA * Y(2,IY) 81 | IY = IY + INCY 82 | 40 CONTINUE 83 | ENDIF 84 | ENDIF 85 | ENDIF 86 | C 87 | IF ( AZERO ) RETURN ! Nothing left to do 88 | C 89 | IF ( LSAME(TRANS,'N') ) THEN 90 | JX = KX 91 | IF ( INCY.EQ.1 ) THEN 92 | DO 60 J = 1,N 93 | HTMPS = ALPHA * X(1,JX) 94 | HTMPJ = ALPHA * X(2,JX) 95 | Do 50 I = 1,M 96 | C Hamilton Product 97 | Y(1,I) = Y(1,I) + 98 | $ A(1,I,J)*HTMPS - A(2,I,J)*CONJG(HTMPJ) 99 | Y(2,I) = Y(2,I) + 100 | $ A(1,I,J)*HTMPJ + A(2,I,J)*CONJG(HTMPS) 101 | 50 CONTINUE 102 | JX = JX + INCX 103 | 60 CONTINUE 104 | ELSE 105 | DO 80 J = 1,N 106 | HTMPS = ALPHA * X(1,JX) 107 | HTMPJ = ALPHA * X(2,JX) 108 | IY = KY 109 | Do 70 I = 1,M 110 | C Hamilton Product 111 | Y(1,IY) = Y(1,IY) + 112 | $ A(1,I,J)*HTMPS - A(2,I,J)*CONJG(HTMPJ) 113 | Y(2,IY) = Y(2,IY) + 114 | $ A(1,I,J)*HTMPJ + A(2,I,J)*CONJG(HTMPS) 115 | IY = IY + INCY 116 | 70 CONTINUE 117 | JX = JX + INCX 118 | 80 CONTINUE 119 | ENDIF 120 | ELSE 121 | JY = KY 122 | IF ( INCX.EQ.1 ) THEN 123 | DO 110 J = 1,N 124 | HTMPS = ZERO 125 | HTMPJ = ZERO 126 | IF ( NOCONJ ) THEN 127 | DO 90 I = 1,M 128 | C Hamilton Product 129 | HTMPS = HTMPS + A(1,I,J)*X(1,I) - A(2,I,J)*CONJG(X(2,I)) 130 | HTMPJ = HTMPJ + A(1,I,J)*X(2,I) + A(2,I,J)*CONJG(X(1,I)) 131 | 90 CONTINUE 132 | ELSE 133 | DO 100 I = 1,M 134 | C Hamilton Product (IMPLIED CONJ(A)) 135 | HTMPS = HTMPS + 136 | $ CONJG(A(1,I,J))*X(1,I) + A(2,I,J)*CONJG(X(2,I)) 137 | HTMPJ = HTMPJ + 138 | $ CONJG(A(1,I,J))*X(2,I) - A(2,I,J)*CONJG(X(1,I)) 139 | 100 CONTINUE 140 | ENDIF 141 | Y(1,JY) = Y(1,JY) + ALPHA * HTMPS 142 | Y(2,JY) = Y(2,JY) + ALPHA * HTMPJ 143 | JY = JY + INCY 144 | 110 CONTINUE 145 | ELSE 146 | DO 140 J = 1,N 147 | HTMPS = ZERO 148 | HTMPJ = ZERO 149 | IX = KX 150 | IF ( NOCONJ ) THEN 151 | DO 120 I = 1,M 152 | C Hamilton Product 153 | HTMPS = HTMPS + 154 | $ A(1,I,J)*X(1,IX) - A(2,I,J)*CONJG(X(2,IX)) 155 | HTMPJ = HTMPJ + 156 | $ A(1,I,J)*X(2,IX) + A(2,I,J)*CONJG(X(1,IX)) 157 | IX = IX + INCX 158 | 120 CONTINUE 159 | ELSE 160 | DO 130 I = 1,M 161 | C Hamilton Product (IMPLIED CONJ(A)) 162 | HTMPS = HTMPS + 163 | $ CONJG(A(1,I,J))*X(1,IX) + A(2,I,J)*CONJG(X(2,IX)) 164 | HTMPJ = HTMPJ + 165 | $ CONJG(A(1,I,J))*X(2,IX) - A(2,I,J)*CONJG(X(1,IX)) 166 | IX = IX + INCX 167 | 130 CONTINUE 168 | ENDIF 169 | Y(1,JY) = Y(1,JY) + ALPHA * HTMPS 170 | Y(2,JY) = Y(2,JY) + ALPHA * HTMPJ 171 | JY = JY + INCY 172 | 140 CONTINUE 173 | ENDIF 174 | ENDIF 175 | C 176 | END 177 | -------------------------------------------------------------------------------- /src/hblas/fortran/hblas2/hgemvzd.f: -------------------------------------------------------------------------------- 1 | C 2 | C This file is a part of HAXX 3 | C 4 | C Copyright (c) 2017 David Williams-Young 5 | C All rights reserved. 6 | C 7 | C See LICENSE.txt 8 | C 9 | Subroutine HGEMVZD(TRANS, M, N, ALPHA, A, LDA, X, INCX, BETA, 10 | $ Y, INCY) 11 | C 12 | LOGICAL LSAME 13 | EXTERNAL LSAME 14 | C 15 | CHARACTER TRANS 16 | LOGICAL NOCONJ, AZERO,AONE,BZERO,BONE 17 | INTEGER*4 M,N, LDA,INCX,INCY, LENX,LENY, I,J,IY,JX,IX,JY, KX,KY 18 | REAL*8 BETA 19 | COMPLEX*16 A(2,LDA,*), X(2,*), Y(2,*), ALPHA 20 | C 21 | COMPLEX*16 HTMP1S, HTMP1J, HTMP2S, HTMP2J 22 | C 23 | COMPLEX*16 ONE, ZERO 24 | PARAMETER (ONE=(1.0D+0,0.0D+0), ZERO=(0.0D+0,0.0D+0)) 25 | C 26 | NOCONJ = LSAME('T',TRANS) 27 | IF ( LSAME('T',TRANS) ) THEN 28 | LENX = N 29 | LENY = M 30 | ELSE 31 | LENX = M 32 | LENY = N 33 | ENDIF 34 | 35 | AZERO = ALPHA.EQ.ZERO 36 | AONE = ALPHA.EQ.ONE 37 | BZERO = BETA.EQ.ZERO 38 | BONE = BETA.EQ.ONE 39 | C 40 | IF ( (M.EQ.0) .OR. (N.EQ.0) .OR. 41 | $ ( AZERO .AND. BONE ) ) RETURN 42 | C 43 | C 44 | IF ( INCX.GT.0 ) THEN 45 | KX = 1 46 | ELSE 47 | KX = 1 - (LENX-1)*INCX 48 | ENDIF 49 | 50 | IF ( INCY.GT.0 ) THEN 51 | KY = 1 52 | ELSE 53 | KY = 1 - (LENY-1)*INCY 54 | ENDIF 55 | C 56 | 57 | IF ( .NOT.BONE ) THEN 58 | IF ( INCY.EQ.1 ) THEN 59 | IF ( BZERO ) THEN 60 | DO 10 I = 1,LENY 61 | Y(1,I) = ZERO 62 | Y(2,I) = ZERO 63 | 10 CONTINUE 64 | ELSE 65 | DO 20 I = 1,LENY 66 | Y(1,I) = BETA * Y(1,I) 67 | Y(2,I) = BETA * Y(2,I) 68 | 20 CONTINUE 69 | ENDIF 70 | ELSE 71 | IY = KY 72 | IF ( BZERO ) THEN 73 | DO 30 I = 1,LENY 74 | Y(1,IY) = ZERO 75 | Y(2,IY) = ZERO 76 | IY = IY + INCY 77 | 30 CONTINUE 78 | ELSE 79 | DO 40 I = 1,LENY 80 | Y(1,IY) = BETA * Y(1,IY) 81 | Y(2,IY) = BETA * Y(2,IY) 82 | IY = IY + INCY 83 | 40 CONTINUE 84 | ENDIF 85 | ENDIF 86 | ENDIF 87 | C 88 | IF ( AZERO ) RETURN ! Nothing left to do 89 | C 90 | IF ( LSAME(TRANS,'N') ) THEN 91 | JX = KX 92 | IF ( INCY.EQ.1 ) THEN 93 | DO 60 J = 1,N 94 | HTMP1S = ALPHA * X(1,JX) 95 | HTMP1J = ALPHA * X(2,JX) 96 | HTMP2S = ALPHA * CONJG(X(1,JX)) 97 | HTMP2J = ALPHA * CONJG(X(2,JX)) 98 | Do 50 I = 1,M 99 | C Hamilton Product 100 | Y(1,I) = Y(1,I) + A(1,I,J)*HTMP1S - A(2,I,J)*HTMP2J 101 | Y(2,I) = Y(2,I) + A(1,I,J)*HTMP1J + A(2,I,J)*HTMP2S 102 | 50 CONTINUE 103 | JX = JX + INCX 104 | 60 CONTINUE 105 | ELSE 106 | DO 80 J = 1,N 107 | HTMP1S = ALPHA * X(1,JX) 108 | HTMP1J = ALPHA * X(2,JX) 109 | HTMP2S = ALPHA * CONJG(X(1,JX)) 110 | HTMP2J = ALPHA * CONJG(X(2,JX)) 111 | IY = KY 112 | Do 70 I = 1,M 113 | C Hamilton Product 114 | Y(1,IY) = Y(1,IY) + A(1,I,J)*HTMP1S - A(2,I,J)*HTMP2J 115 | Y(2,IY) = Y(2,IY) + A(1,I,J)*HTMP1J + A(2,I,J)*HTMP2S 116 | IY = IY + INCY 117 | 70 CONTINUE 118 | JX = JX + INCX 119 | 80 CONTINUE 120 | ENDIF 121 | ELSE 122 | JY = KY 123 | IF ( INCX.EQ.1 ) THEN 124 | DO 110 J = 1,N 125 | HTMP1S = ZERO 126 | HTMP1J = ZERO 127 | IF ( NOCONJ ) THEN 128 | DO 90 I = 1,M 129 | C Hamilton Product 130 | HTMP1S = HTMP1S + 131 | $ A(1,I,J)*X(1,I) - A(2,I,J)*CONJG(X(2,I)) 132 | HTMP1J = HTMP1J + 133 | $ A(1,I,J)*X(2,I) + A(2,I,J)*CONJG(X(1,I)) 134 | 90 CONTINUE 135 | ELSE 136 | DO 100 I = 1,M 137 | C Hamilton Product (IMPLIED CONJ(A)) 138 | HTMP1S = HTMP1S + 139 | $ CONJG(A(1,I,J))*X(1,I) + A(2,I,J)*CONJG(X(2,I)) 140 | HTMP1J = HTMP1J + 141 | $ CONJG(A(1,I,J))*X(2,I) - A(2,I,J)*CONJG(X(1,I)) 142 | 100 CONTINUE 143 | ENDIF 144 | Y(1,JY) = Y(1,JY) + ALPHA*HTMP1S 145 | Y(2,JY) = Y(2,JY) + ALPHA*HTMP1J 146 | JY = JY + INCY 147 | 110 CONTINUE 148 | ELSE 149 | DO 140 J = 1,N 150 | HTMP1S = ZERO 151 | HTMP1J = ZERO 152 | IX = KX 153 | IF ( NOCONJ ) THEN 154 | DO 120 I = 1,M 155 | C Hamilton Product 156 | HTMP1S = HTMP1S + 157 | $ A(1,I,J)*X(1,IX) - A(2,I,J)*CONJG(X(2,IX)) 158 | HTMP1J = HTMP1J + 159 | $ A(1,I,J)*X(2,IX) + A(2,I,J)*CONJG(X(1,IX)) 160 | IX = IX + INCX 161 | 120 CONTINUE 162 | ELSE 163 | DO 130 I = 1,M 164 | C Hamilton Product (IMPLIED CONJ(A)) 165 | HTMP1S = HTMP1S + 166 | $ CONJG(A(1,I,J))*X(1,IX) + A(2,I,J)*CONJG(X(2,IX)) 167 | HTMP1J = HTMP1J + 168 | $ CONJG(A(1,I,J))*X(2,IX) - A(2,I,J)*CONJG(X(1,IX)) 169 | IX = IX + INCX 170 | 130 CONTINUE 171 | ENDIF 172 | Y(1,JY) = Y(1,JY) + ALPHA*HTMP1S 173 | Y(2,JY) = Y(2,JY) + ALPHA*HTMP1J 174 | JY = JY + INCY 175 | 140 CONTINUE 176 | ENDIF 177 | ENDIF 178 | C 179 | END 180 | -------------------------------------------------------------------------------- /src/hblas/fortran/hblas2/hgemvzh.f: -------------------------------------------------------------------------------- 1 | C 2 | C This file is a part of HAXX 3 | C 4 | C Copyright (c) 2017 David Williams-Young 5 | C All rights reserved. 6 | C 7 | C See LICENSE.txt 8 | C 9 | Subroutine HGEMVZH(TRANS, M, N, ALPHA, A, LDA, X, INCX, BETA, 10 | $ Y, INCY) 11 | C 12 | LOGICAL LSAME 13 | EXTERNAL LSAME 14 | C 15 | CHARACTER TRANS 16 | LOGICAL NOCONJ, AZERO,AONE,BZERO,BONE 17 | INTEGER*4 M,N, LDA,INCX,INCY, LENX,LENY, I,J,IY,JX,IX,JY, KX,KY 18 | COMPLEX*16 A(2,LDA,*), X(2,*), Y(2,*), ALPHA, BETA(2) 19 | C 20 | COMPLEX*16 HTMP1S, HTMP1J, HTMP2S, HTMP2J 21 | C 22 | COMPLEX*16 ONE, ZERO 23 | PARAMETER (ONE=(1.0D+0,0.0D+0), ZERO=(0.0D+0,0.0D+0)) 24 | C 25 | NOCONJ = LSAME('T',TRANS) 26 | IF ( LSAME('T',TRANS) ) THEN 27 | LENX = N 28 | LENY = M 29 | ELSE 30 | LENX = M 31 | LENY = N 32 | ENDIF 33 | 34 | AZERO = ALPHA.EQ.ZERO 35 | AONE = ALPHA.EQ.ONE 36 | BZERO = (BETA(1).EQ.ZERO) .AND. (BETA(2).EQ.ZERO) 37 | BONE = (BETA(1).EQ.ONE) .AND. (BETA(2).EQ.ZERO) 38 | C 39 | IF ( (M.EQ.0) .OR. (N.EQ.0) .OR. 40 | $ ( AZERO .AND. BONE ) ) RETURN 41 | C 42 | C 43 | IF ( INCX.GT.0 ) THEN 44 | KX = 1 45 | ELSE 46 | KX = 1 - (LENX-1)*INCX 47 | ENDIF 48 | 49 | IF ( INCY.GT.0 ) THEN 50 | KY = 1 51 | ELSE 52 | KY = 1 - (LENY-1)*INCY 53 | ENDIF 54 | C 55 | 56 | IF ( .NOT.BONE ) THEN 57 | IF ( INCY.EQ.1 ) THEN 58 | IF ( BZERO ) THEN 59 | DO 10 I = 1,LENY 60 | Y(1,I) = ZERO 61 | Y(2,I) = ZERO 62 | 10 CONTINUE 63 | ELSE 64 | DO 20 I = 1,LENY 65 | HTMP1S = BETA(1) * Y(1,I) - BETA(2) * CONJG(Y(2,I)) 66 | Y(2,I) = BETA(1) * Y(2,I) + BETA(2) * CONJG(Y(1,I)) 67 | Y(1,I) = HTMP1S 68 | 20 CONTINUE 69 | ENDIF 70 | ELSE 71 | IY = KY 72 | IF ( BZERO ) THEN 73 | DO 30 I = 1,LENY 74 | Y(1,IY) = ZERO 75 | Y(2,IY) = ZERO 76 | IY = IY + INCY 77 | 30 CONTINUE 78 | ELSE 79 | DO 40 I = 1,LENY 80 | HTMP1S = BETA(1) * Y(1,IY) - BETA(2) * CONJG(Y(2,IY)) 81 | Y(2,IY) = BETA(1) * Y(2,IY) + BETA(2) * CONJG(Y(1,IY)) 82 | Y(1,IY) = HTMP1S 83 | IY = IY + INCY 84 | 40 CONTINUE 85 | ENDIF 86 | ENDIF 87 | ENDIF 88 | C 89 | IF ( AZERO ) RETURN ! Nothing left to do 90 | C 91 | IF ( LSAME(TRANS,'N') ) THEN 92 | JX = KX 93 | IF ( INCY.EQ.1 ) THEN 94 | DO 60 J = 1,N 95 | HTMP1S = ALPHA * X(1,JX) 96 | HTMP1J = ALPHA * X(2,JX) 97 | HTMP2S = ALPHA * CONJG(X(1,JX)) 98 | HTMP2J = ALPHA * CONJG(X(2,JX)) 99 | Do 50 I = 1,M 100 | C Hamilton Product 101 | Y(1,I) = Y(1,I) + A(1,I,J)*HTMP1S - A(2,I,J)*HTMP2J 102 | Y(2,I) = Y(2,I) + A(1,I,J)*HTMP1J + A(2,I,J)*HTMP2S 103 | 50 CONTINUE 104 | JX = JX + INCX 105 | 60 CONTINUE 106 | ELSE 107 | DO 80 J = 1,N 108 | HTMP1S = ALPHA * X(1,JX) 109 | HTMP1J = ALPHA * X(2,JX) 110 | HTMP2S = ALPHA * CONJG(X(1,JX)) 111 | HTMP2J = ALPHA * CONJG(X(2,JX)) 112 | IY = KY 113 | Do 70 I = 1,M 114 | C Hamilton Product 115 | Y(1,IY) = Y(1,IY) + A(1,I,J)*HTMP1S - A(2,I,J)*HTMP2J 116 | Y(2,IY) = Y(2,IY) + A(1,I,J)*HTMP1J + A(2,I,J)*HTMP2S 117 | IY = IY + INCY 118 | 70 CONTINUE 119 | JX = JX + INCX 120 | 80 CONTINUE 121 | ENDIF 122 | ELSE 123 | JY = KY 124 | IF ( INCX.EQ.1 ) THEN 125 | DO 110 J = 1,N 126 | HTMP1S = ZERO 127 | HTMP1J = ZERO 128 | IF ( NOCONJ ) THEN 129 | DO 90 I = 1,M 130 | C Hamilton Product 131 | HTMP1S = HTMP1S + 132 | $ A(1,I,J)*X(1,I) - A(2,I,J)*CONJG(X(2,I)) 133 | HTMP1J = HTMP1J + 134 | $ A(1,I,J)*X(2,I) + A(2,I,J)*CONJG(X(1,I)) 135 | 90 CONTINUE 136 | ELSE 137 | DO 100 I = 1,M 138 | C Hamilton Product (IMPLIED CONJ(A)) 139 | HTMP1S = HTMP1S + 140 | $ CONJG(A(1,I,J))*X(1,I) + A(2,I,J)*CONJG(X(2,I)) 141 | HTMP1J = HTMP1J + 142 | $ CONJG(A(1,I,J))*X(2,I) - A(2,I,J)*CONJG(X(1,I)) 143 | 100 CONTINUE 144 | ENDIF 145 | Y(1,JY) = Y(1,JY) + ALPHA*HTMP1S 146 | Y(2,JY) = Y(2,JY) + ALPHA*HTMP1J 147 | JY = JY + INCY 148 | 110 CONTINUE 149 | ELSE 150 | DO 140 J = 1,N 151 | HTMP1S = ZERO 152 | HTMP1J = ZERO 153 | IX = KX 154 | IF ( NOCONJ ) THEN 155 | DO 120 I = 1,M 156 | C Hamilton Product 157 | HTMP1S = HTMP1S + 158 | $ A(1,I,J)*X(1,IX) - A(2,I,J)*CONJG(X(2,IX)) 159 | HTMP1J = HTMP1J + 160 | $ A(1,I,J)*X(2,IX) + A(2,I,J)*CONJG(X(1,IX)) 161 | IX = IX + INCX 162 | 120 CONTINUE 163 | ELSE 164 | DO 130 I = 1,M 165 | C Hamilton Product (IMPLIED CONJ(A)) 166 | HTMP1S = HTMP1S + 167 | $ CONJG(A(1,I,J))*X(1,IX) + A(2,I,J)*CONJG(X(2,IX)) 168 | HTMP1J = HTMP1J + 169 | $ CONJG(A(1,I,J))*X(2,IX) - A(2,I,J)*CONJG(X(1,IX)) 170 | IX = IX + INCX 171 | 130 CONTINUE 172 | ENDIF 173 | Y(1,JY) = Y(1,JY) + ALPHA*HTMP1S 174 | Y(2,JY) = Y(2,JY) + ALPHA*HTMP1J 175 | JY = JY + INCY 176 | 140 CONTINUE 177 | ENDIF 178 | ENDIF 179 | C 180 | END 181 | -------------------------------------------------------------------------------- /src/hblas/fortran/hblas2/hgemvzz.f: -------------------------------------------------------------------------------- 1 | C 2 | C This file is a part of HAXX 3 | C 4 | C Copyright (c) 2017 David Williams-Young 5 | C All rights reserved. 6 | C 7 | C See LICENSE.txt 8 | C 9 | Subroutine HGEMVZZ(TRANS, M, N, ALPHA, A, LDA, X, INCX, BETA, 10 | $ Y, INCY) 11 | C 12 | LOGICAL LSAME 13 | EXTERNAL LSAME 14 | C 15 | CHARACTER TRANS 16 | LOGICAL NOCONJ, AZERO,AONE,BZERO,BONE 17 | INTEGER*4 M,N, LDA,INCX,INCY, LENX,LENY, I,J,IY,JX,IX,JY, KX,KY 18 | COMPLEX*16 A(2,LDA,*), X(2,*), Y(2,*), ALPHA, BETA 19 | C 20 | COMPLEX*16 HTMP1S, HTMP1J, HTMP2S, HTMP2J 21 | C 22 | COMPLEX*16 ONE, ZERO 23 | PARAMETER (ONE=(1.0D+0,0.0D+0), ZERO=(0.0D+0,0.0D+0)) 24 | C 25 | NOCONJ = LSAME('T',TRANS) 26 | IF ( LSAME('T',TRANS) ) THEN 27 | LENX = N 28 | LENY = M 29 | ELSE 30 | LENX = M 31 | LENY = N 32 | ENDIF 33 | 34 | AZERO = ALPHA.EQ.ZERO 35 | AONE = ALPHA.EQ.ONE 36 | BZERO = BETA.EQ.ZERO 37 | BONE = BETA.EQ.ONE 38 | C 39 | IF ( (M.EQ.0) .OR. (N.EQ.0) .OR. 40 | $ ( AZERO .AND. BONE ) ) RETURN 41 | C 42 | C 43 | IF ( INCX.GT.0 ) THEN 44 | KX = 1 45 | ELSE 46 | KX = 1 - (LENX-1)*INCX 47 | ENDIF 48 | 49 | IF ( INCY.GT.0 ) THEN 50 | KY = 1 51 | ELSE 52 | KY = 1 - (LENY-1)*INCY 53 | ENDIF 54 | C 55 | 56 | IF ( .NOT.BONE ) THEN 57 | IF ( INCY.EQ.1 ) THEN 58 | IF ( BZERO ) THEN 59 | DO 10 I = 1,LENY 60 | Y(1,I) = ZERO 61 | Y(2,I) = ZERO 62 | 10 CONTINUE 63 | ELSE 64 | DO 20 I = 1,LENY 65 | Y(1,I) = BETA * Y(1,I) 66 | Y(2,I) = BETA * Y(2,I) 67 | 20 CONTINUE 68 | ENDIF 69 | ELSE 70 | IY = KY 71 | IF ( BZERO ) THEN 72 | DO 30 I = 1,LENY 73 | Y(1,IY) = ZERO 74 | Y(2,IY) = ZERO 75 | IY = IY + INCY 76 | 30 CONTINUE 77 | ELSE 78 | DO 40 I = 1,LENY 79 | Y(1,IY) = BETA * Y(1,IY) 80 | Y(2,IY) = BETA * Y(2,IY) 81 | IY = IY + INCY 82 | 40 CONTINUE 83 | ENDIF 84 | ENDIF 85 | ENDIF 86 | C 87 | IF ( AZERO ) RETURN ! Nothing left to do 88 | C 89 | IF ( LSAME(TRANS,'N') ) THEN 90 | JX = KX 91 | IF ( INCY.EQ.1 ) THEN 92 | DO 60 J = 1,N 93 | HTMP1S = ALPHA * X(1,JX) 94 | HTMP1J = ALPHA * X(2,JX) 95 | HTMP2S = ALPHA * CONJG(X(1,JX)) 96 | HTMP2J = ALPHA * CONJG(X(2,JX)) 97 | Do 50 I = 1,M 98 | C Hamilton Product 99 | Y(1,I) = Y(1,I) + A(1,I,J)*HTMP1S - A(2,I,J)*HTMP2J 100 | Y(2,I) = Y(2,I) + A(1,I,J)*HTMP1J + A(2,I,J)*HTMP2S 101 | 50 CONTINUE 102 | JX = JX + INCX 103 | 60 CONTINUE 104 | ELSE 105 | DO 80 J = 1,N 106 | HTMP1S = ALPHA * X(1,JX) 107 | HTMP1J = ALPHA * X(2,JX) 108 | HTMP2S = ALPHA * CONJG(X(1,JX)) 109 | HTMP2J = ALPHA * CONJG(X(2,JX)) 110 | IY = KY 111 | Do 70 I = 1,M 112 | C Hamilton Product 113 | Y(1,IY) = Y(1,IY) + A(1,I,J)*HTMP1S - A(2,I,J)*HTMP2J 114 | Y(2,IY) = Y(2,IY) + A(1,I,J)*HTMP1J + A(2,I,J)*HTMP2S 115 | IY = IY + INCY 116 | 70 CONTINUE 117 | JX = JX + INCX 118 | 80 CONTINUE 119 | ENDIF 120 | ELSE 121 | JY = KY 122 | IF ( INCX.EQ.1 ) THEN 123 | DO 110 J = 1,N 124 | HTMP1S = ZERO 125 | HTMP1J = ZERO 126 | IF ( NOCONJ ) THEN 127 | DO 90 I = 1,M 128 | C Hamilton Product 129 | HTMP1S = HTMP1S + 130 | $ A(1,I,J)*X(1,I) - A(2,I,J)*CONJG(X(2,I)) 131 | HTMP1J = HTMP1J + 132 | $ A(1,I,J)*X(2,I) + A(2,I,J)*CONJG(X(1,I)) 133 | 90 CONTINUE 134 | ELSE 135 | DO 100 I = 1,M 136 | C Hamilton Product (IMPLIED CONJ(A)) 137 | HTMP1S = HTMP1S + 138 | $ CONJG(A(1,I,J))*X(1,I) + A(2,I,J)*CONJG(X(2,I)) 139 | HTMP1J = HTMP1J + 140 | $ CONJG(A(1,I,J))*X(2,I) - A(2,I,J)*CONJG(X(1,I)) 141 | 100 CONTINUE 142 | ENDIF 143 | Y(1,JY) = Y(1,JY) + ALPHA*HTMP1S 144 | Y(2,JY) = Y(2,JY) + ALPHA*HTMP1J 145 | JY = JY + INCY 146 | 110 CONTINUE 147 | ELSE 148 | DO 140 J = 1,N 149 | HTMP1S = ZERO 150 | HTMP1J = ZERO 151 | IX = KX 152 | IF ( NOCONJ ) THEN 153 | DO 120 I = 1,M 154 | C Hamilton Product 155 | HTMP1S = HTMP1S + 156 | $ A(1,I,J)*X(1,IX) - A(2,I,J)*CONJG(X(2,IX)) 157 | HTMP1J = HTMP1J + 158 | $ A(1,I,J)*X(2,IX) + A(2,I,J)*CONJG(X(1,IX)) 159 | IX = IX + INCX 160 | 120 CONTINUE 161 | ELSE 162 | DO 130 I = 1,M 163 | C Hamilton Product (IMPLIED CONJ(A)) 164 | HTMP1S = HTMP1S + 165 | $ CONJG(A(1,I,J))*X(1,IX) + A(2,I,J)*CONJG(X(2,IX)) 166 | HTMP1J = HTMP1J + 167 | $ CONJG(A(1,I,J))*X(2,IX) - A(2,I,J)*CONJG(X(1,IX)) 168 | IX = IX + INCX 169 | 130 CONTINUE 170 | ENDIF 171 | Y(1,JY) = Y(1,JY) + ALPHA*HTMP1S 172 | Y(2,JY) = Y(2,JY) + ALPHA*HTMP1J 173 | JY = JY + INCY 174 | 140 CONTINUE 175 | ENDIF 176 | ENDIF 177 | C 178 | END 179 | -------------------------------------------------------------------------------- /src/hblas/fortran/hblas2/hgercd.f: -------------------------------------------------------------------------------- 1 | C 2 | C This file is a part of HAXX 3 | C 4 | C Copyright (c) 2017 David Williams-Young 5 | C All rights reserved. 6 | C 7 | C See LICENSE.txt 8 | C 9 | Subroutine HGERCD(M, N, ALPHA, X, INCX, Y, INCY, A, LDA) 10 | C 11 | LOGICAL AZERO,AONE 12 | INTEGER*4 M,N, LDA,INCX,INCY, I,J,IY,JX,IX,JY, KX 13 | REAL*8 ALPHA 14 | COMPLEX*16 X(2,*), Y(2,*), A(2,LDA,*) 15 | COMPLEX*16 HTMPS, HTMPJ 16 | C 17 | REAL*8 ONED, ZEROD 18 | COMPLEX*16 ONEZ, ZEROZ 19 | PARAMETER (ONED=1.0D+0, ZEROD=0.0D+0) 20 | PARAMETER (ONEZ=(1.0D+0,0.0D+0), ZEROZ=(0.0D+0,0.0D+0)) 21 | C 22 | 23 | AZERO = ALPHA.EQ.ZEROD 24 | AONE = ALPHA.EQ.ONED 25 | 26 | IF ( M.EQ.0 .OR. N.EQ.0 .OR. AZERO ) RETURN 27 | C 28 | IF ( INCY.GT.0 ) THEN 29 | JY = 1 30 | ELSE 31 | JY = 1 - (N-1)*INCY 32 | ENDIF 33 | C 34 | IF ( INCX.EQ.1 ) THEN 35 | DO 20 J = 1,N 36 | IF ( Y(1,JY).NE.ZEROZ .OR. Y(2,JY).NE.ZEROZ ) THEN 37 | HTMPS = ALPHA * CONJG(Y(1,JY)) 38 | HTMPJ = -ALPHA * Y(2,JY) 39 | DO 10 I = 1,M 40 | A(1,I,J) = A(1,I,J) + X(1,I)*HTMPS - X(2,I)*CONJG(HTMPJ) 41 | A(2,I,J) = A(2,I,J) + X(1,I)*HTMPJ + X(2,I)*CONJG(HTMPS) 42 | 10 CONTINUE 43 | ENDIF 44 | JY = JY + INCY 45 | 20 CONTINUE 46 | ELSE 47 | IF ( INCX.GT.0 ) THEN 48 | KX = 1 49 | ELSE 50 | KX = 1 - (M-1)*INCX 51 | ENDIF 52 | DO 40 J = 1,N 53 | IF ( Y(1,JY).NE.ZEROZ .OR. Y(2,JY).NE.ZEROZ ) THEN 54 | HTMPI = ALPHA * CONJG(Y(1,JY)) 55 | HTMPJ = -ALPHA * Y(2,JY) 56 | IX = KX 57 | DO 30 I = 1,M 58 | A(1,I,J) = A(1,I,J) + X(1,IX)*HTMPS - X(2,IX)*CONJG(HTMPJ) 59 | A(2,I,J) = A(2,I,J) + X(1,IX)*HTMPJ + X(2,IX)*CONJG(HTMPS) 60 | IX = IX + INCX 61 | 30 CONTINUE 62 | ENDIF 63 | JY = JY + INCY 64 | 40 CONTINUE 65 | ENDIF 66 | C 67 | END 68 | -------------------------------------------------------------------------------- /src/hblas/fortran/hblas2/hgerch.f: -------------------------------------------------------------------------------- 1 | C 2 | C This file is a part of HAXX 3 | C 4 | C Copyright (c) 2017 David Williams-Young 5 | C All rights reserved. 6 | C 7 | C See LICENSE.txt 8 | C 9 | Subroutine HGERCH(M, N, ALPHA, X, INCX, Y, INCY, A, LDA) 10 | C 11 | LOGICAL AZERO,AONE 12 | INTEGER*4 M,N, LDA,INCX,INCY, I,J,IY,JX,IX,JY, KX 13 | COMPLEX*16 X(2,*), Y(2,*), A(2,LDA,*), ALPHA(2) 14 | COMPLEX*16 HTMP1S, HTMP1J, HTMP2S, HTMP2J 15 | COMPLEX*16 HTMP3S, HTMP3J, HTMP4S, HTMP4J 16 | C 17 | REAL*8 ONED, ZEROD 18 | COMPLEX*16 ONEZ, ZEROZ 19 | PARAMETER (ONED=1.0D+0, ZEROD=0.0D+0) 20 | PARAMETER (ONEZ=(1.0D+0,0.0D+0), ZEROZ=(0.0D+0,0.0D+0)) 21 | C 22 | 23 | AZERO = (ALPHA(1).EQ.ZEROZ).AND.(ALPHA(2).EQ.ZEROZ) 24 | AONE = (ALPHA(1).EQ.ONEZ).AND.(ALPHA(2).EQ.ZEROZ) 25 | 26 | IF ( M.EQ.0 .OR. N.EQ.0 .OR. AZERO ) RETURN 27 | C 28 | IF ( INCY.GT.0 ) THEN 29 | JY = 1 30 | ELSE 31 | JY = 1 - (N-1)*INCY 32 | ENDIF 33 | C 34 | IF ( INCX.EQ.1 ) THEN 35 | DO 20 J = 1,N 36 | IF ( Y(1,JY).NE.ZEROZ .OR. Y(2,JY).NE.ZEROZ ) THEN 37 | HTMP1S = ALPHA(1) * CONJG(Y(1,JY)) 38 | HTMP1J = -ALPHA(1) * Y(2,JY) 39 | HTMP2S = ALPHA(1) * Y(1,JY) 40 | HTMP2J = -ALPHA(1) * CONJG(Y(2,JY)) 41 | HTMP3S = ALPHA(2) * CONJG(Y(1,JY)) 42 | HTMP3J = -ALPHA(2) * Y(2,JY) 43 | HTMP4S = ALPHA(2) * Y(1,JY) 44 | HTMP4J = -ALPHA(2) * CONJG(Y(2,JY)) 45 | DO 10 I = 1,M 46 | A(1,I,J) = A(1,I,J) + 47 | $ X(1,I)*HTMP1S - X(2,I)*HTMP2J - 48 | $ CONJG(X(1,I))*HTMP4J - CONJG(X(2,I))*HTMP3S 49 | A(2,I,J) = A(2,I,J) + 50 | $ X(1,I)*HTMP1J + X(2,I)*HTMP2S + 51 | $ CONJG(X(1,I))*HTMP4S - CONJG(X(2,I))*HTMP3J 52 | 10 CONTINUE 53 | ENDIF 54 | JY = JY + INCY 55 | 20 CONTINUE 56 | ELSE 57 | IF ( INCX.GT.0 ) THEN 58 | KX = 1 59 | ELSE 60 | KX = 1 - (M-1)*INCX 61 | ENDIF 62 | DO 40 J = 1,N 63 | IF ( Y(1,JY).NE.ZEROZ .OR. Y(2,JY).NE.ZEROZ ) THEN 64 | HTMP1S = ALPHA(1) * CONJG(Y(1,JY)) 65 | HTMP1J = -ALPHA(1) * Y(2,JY) 66 | HTMP2S = ALPHA(1) * Y(1,JY) 67 | HTMP2J = -ALPHA(1) * CONJG(Y(2,JY)) 68 | HTMP3S = ALPHA(2) * CONJG(Y(1,JY)) 69 | HTMP3J = -ALPHA(2) * Y(2,JY) 70 | HTMP4S = ALPHA(2) * Y(1,JY) 71 | HTMP4J = -ALPHA(2) * CONJG(Y(2,JY)) 72 | IX = KX 73 | DO 30 I = 1,M 74 | A(1,I,J) = A(1,I,J) + 75 | $ X(1,IX)*HTMP1S - X(2,IX)*HTMP2J - 76 | $ CONJG(X(1,IX))*HTMP4J - CONJG(X(2,IX))*HTMP3S 77 | A(2,I,J) = A(2,I,J) + 78 | $ X(1,IX)*HTMP1J + X(2,IX)*HTMP2S + 79 | $ CONJG(X(1,IX))*HTMP4S - CONJG(X(2,IX))*HTMP3J 80 | IX = IX + INCX 81 | 30 CONTINUE 82 | ENDIF 83 | JY = JY + INCY 84 | 40 CONTINUE 85 | ENDIF 86 | C 87 | END 88 | -------------------------------------------------------------------------------- /src/hblas/fortran/hblas2/hgercz.f: -------------------------------------------------------------------------------- 1 | C 2 | C This file is a part of HAXX 3 | C 4 | C Copyright (c) 2017 David Williams-Young 5 | C All rights reserved. 6 | C 7 | C See LICENSE.txt 8 | C 9 | Subroutine HGERCZ(M, N, ALPHA, X, INCX, Y, INCY, A, LDA) 10 | C 11 | LOGICAL AZERO,AONE 12 | INTEGER*4 M,N, LDA,INCX,INCY, I,J,IY,JX,IX,JY, KX 13 | COMPLEX*16 X(2,*), Y(2,*), A(2,LDA,*), ALPHA 14 | COMPLEX*16 HTMP1S, HTMP1J, HTMP2S, HTMP2J 15 | C 16 | REAL*8 ONED, ZEROD 17 | COMPLEX*16 ONEZ, ZEROZ 18 | PARAMETER (ONED=1.0D+0, ZEROD=0.0D+0) 19 | PARAMETER (ONEZ=(1.0D+0,0.0D+0), ZEROZ=(0.0D+0,0.0D+0)) 20 | C 21 | 22 | AZERO = ALPHA.EQ.ZEROZ 23 | AONE = ALPHA.EQ.ONEZ 24 | 25 | IF ( M.EQ.0 .OR. N.EQ.0 .OR. AZERO ) RETURN 26 | C 27 | IF ( INCY.GT.0 ) THEN 28 | JY = 1 29 | ELSE 30 | JY = 1 - (N-1)*INCY 31 | ENDIF 32 | C 33 | IF ( INCX.EQ.1 ) THEN 34 | DO 20 J = 1,N 35 | IF ( Y(1,JY).NE.ZEROZ .OR. Y(2,JY).NE.ZEROZ ) THEN 36 | HTMP1S = ALPHA * CONJG(Y(1,JY)) 37 | HTMP1J = -ALPHA * Y(2,JY) 38 | HTMP2S = ALPHA * Y(1,JY) 39 | HTMP2J = -ALPHA * CONJG(Y(2,JY)) 40 | DO 10 I = 1,M 41 | A(1,I,J) = A(1,I,J) + X(1,I)*HTMP1S - X(2,I)*HTMP2J 42 | A(2,I,J) = A(2,I,J) + X(1,I)*HTMP1J + X(2,I)*HTMP2S 43 | 10 CONTINUE 44 | ENDIF 45 | JY = JY + INCY 46 | 20 CONTINUE 47 | ELSE 48 | IF ( INCX.GT.0 ) THEN 49 | KX = 1 50 | ELSE 51 | KX = 1 - (M-1)*INCX 52 | ENDIF 53 | DO 40 J = 1,N 54 | IF ( Y(1,JY).NE.ZEROZ .OR. Y(2,JY).NE.ZEROZ ) THEN 55 | HTMP1S = ALPHA * CONJG(Y(1,JY)) 56 | HTMP1J = -ALPHA * Y(2,JY) 57 | HTMP2S = ALPHA * Y(1,JY) 58 | HTMP2J = -ALPHA * CONJG(Y(2,JY)) 59 | IX = KX 60 | DO 30 I = 1,M 61 | A(1,I,J) = A(1,I,J) + X(1,IX)*HTMP1S - X(2,IX)*HTMP2J 62 | A(2,I,J) = A(2,I,J) + X(1,IX)*HTMP1J + X(2,IX)*HTMP2S 63 | IX = IX + INCX 64 | 30 CONTINUE 65 | ENDIF 66 | JY = JY + INCY 67 | 40 CONTINUE 68 | ENDIF 69 | C 70 | END 71 | -------------------------------------------------------------------------------- /src/hblas/fortran/hblas2/hgerud.f: -------------------------------------------------------------------------------- 1 | C 2 | C This file is a part of HAXX 3 | C 4 | C Copyright (c) 2017 David Williams-Young 5 | C All rights reserved. 6 | C 7 | C See LICENSE.txt 8 | C 9 | Subroutine HGERUD(M, N, ALPHA, X, INCX, Y, INCY, A, LDA) 10 | C 11 | LOGICAL AZERO,AONE 12 | INTEGER*4 M,N, LDA,INCX,INCY, I,J,IY,JX,IX,JY, KX 13 | REAL*8 ALPHA 14 | COMPLEX*16 X(2,*), Y(2,*), A(2,LDA,*) 15 | COMPLEX*16 HTMPS, HTMPJ 16 | C 17 | REAL*8 ONED, ZEROD 18 | COMPLEX*16 ONEZ, ZEROZ 19 | PARAMETER (ONED=1.0D+0, ZEROD=0.0D+0) 20 | PARAMETER (ONEZ=(1.0D+0,0.0D+0), ZEROZ=(0.0D+0,0.0D+0)) 21 | C 22 | 23 | AZERO = ALPHA.EQ.ZEROD 24 | AONE = ALPHA.EQ.ONED 25 | 26 | IF ( M.EQ.0 .OR. N.EQ.0 .OR. AZERO ) RETURN 27 | C 28 | IF ( INCY.GT.0 ) THEN 29 | JY = 1 30 | ELSE 31 | JY = 1 - (N-1)*INCY 32 | ENDIF 33 | C 34 | IF ( INCX.EQ.1 ) THEN 35 | DO 20 J = 1,N 36 | IF ( Y(1,JY).NE.ZEROZ .OR. Y(2,JY).NE.ZEROZ ) THEN 37 | HTMPS = ALPHA * Y(1,JY) 38 | HTMPJ = ALPHA * Y(2,JY) 39 | DO 10 I = 1,M 40 | A(1,I,J) = A(1,I,J) + X(1,I)*HTMPS - X(2,I)*CONJG(HTMPJ) 41 | A(2,I,J) = A(2,I,J) + X(1,I)*HTMPJ + X(2,I)*CONJG(HTMPS) 42 | 10 CONTINUE 43 | ENDIF 44 | JY = JY + INCY 45 | 20 CONTINUE 46 | ELSE 47 | IF ( INCX.GT.0 ) THEN 48 | KX = 1 49 | ELSE 50 | KX = 1 - (M-1)*INCX 51 | ENDIF 52 | DO 40 J = 1,N 53 | IF ( Y(1,JY).NE.ZEROZ .OR. Y(2,JY).NE.ZEROZ ) THEN 54 | HTMPI = ALPHA * Y(1,JY) 55 | HTMPJ = ALPHA * Y(2,JY) 56 | IX = KX 57 | DO 30 I = 1,M 58 | A(1,I,J) = A(1,I,J) + X(1,IX)*HTMPS - X(2,IX)*CONJG(HTMPJ) 59 | A(2,I,J) = A(2,I,J) + X(1,IX)*HTMPJ + X(2,IX)*CONJG(HTMPS) 60 | IX = IX + INCX 61 | 30 CONTINUE 62 | ENDIF 63 | JY = JY + INCY 64 | 40 CONTINUE 65 | ENDIF 66 | C 67 | END 68 | -------------------------------------------------------------------------------- /src/hblas/fortran/hblas2/hgeruh.f: -------------------------------------------------------------------------------- 1 | C 2 | C This file is a part of HAXX 3 | C 4 | C Copyright (c) 2017 David Williams-Young 5 | C All rights reserved. 6 | C 7 | C See LICENSE.txt 8 | C 9 | Subroutine HGERUH(M, N, ALPHA, X, INCX, Y, INCY, A, LDA) 10 | C 11 | LOGICAL AZERO,AONE 12 | INTEGER*4 M,N, LDA,INCX,INCY, I,J,IY,JX,IX,JY, KX 13 | COMPLEX*16 X(2,*), Y(2,*), A(2,LDA,*), ALPHA(2) 14 | COMPLEX*16 HTMP1S, HTMP1J, HTMP2S, HTMP2J 15 | COMPLEX*16 HTMP3S, HTMP3J, HTMP4S, HTMP4J 16 | C 17 | REAL*8 ONED, ZEROD 18 | COMPLEX*16 ONEZ, ZEROZ 19 | PARAMETER (ONED=1.0D+0, ZEROD=0.0D+0) 20 | PARAMETER (ONEZ=(1.0D+0,0.0D+0), ZEROZ=(0.0D+0,0.0D+0)) 21 | C 22 | 23 | AZERO = (ALPHA(1).EQ.ZEROZ).AND.(ALPHA(2).EQ.ZEROZ) 24 | AONE = (ALPHA(1).EQ.ONEZ).AND.(ALPHA(2).EQ.ZEROZ) 25 | 26 | IF ( M.EQ.0 .OR. N.EQ.0 .OR. AZERO ) RETURN 27 | C 28 | IF ( INCY.GT.0 ) THEN 29 | JY = 1 30 | ELSE 31 | JY = 1 - (N-1)*INCY 32 | ENDIF 33 | C 34 | IF ( INCX.EQ.1 ) THEN 35 | DO 20 J = 1,N 36 | IF ( Y(1,JY).NE.ZEROZ .OR. Y(2,JY).NE.ZEROZ ) THEN 37 | HTMP1S = ALPHA(1) * Y(1,JY) 38 | HTMP1J = ALPHA(1) * Y(2,JY) 39 | HTMP2S = ALPHA(1) * CONJG(Y(1,JY)) 40 | HTMP2J = ALPHA(1) * CONJG(Y(2,JY)) 41 | HTMP3S = ALPHA(2) * Y(1,JY) 42 | HTMP3J = ALPHA(2) * Y(2,JY) 43 | HTMP4S = ALPHA(2) * CONJG(Y(1,JY)) 44 | HTMP4J = ALPHA(2) * CONJG(Y(2,JY)) 45 | DO 10 I = 1,M 46 | A(1,I,J) = A(1,I,J) + 47 | $ X(1,I)*HTMP1S - X(2,I)*HTMP2J - 48 | $ CONJG(X(1,I))*HTMP4J - CONJG(X(2,I))*HTMP3S 49 | A(2,I,J) = A(2,I,J) + 50 | $ X(1,I)*HTMP1J + X(2,I)*HTMP2S + 51 | $ CONJG(X(1,I))*HTMP4S - CONJG(X(2,I))*HTMP3J 52 | 10 CONTINUE 53 | ENDIF 54 | JY = JY + INCY 55 | 20 CONTINUE 56 | ELSE 57 | IF ( INCX.GT.0 ) THEN 58 | KX = 1 59 | ELSE 60 | KX = 1 - (M-1)*INCX 61 | ENDIF 62 | DO 40 J = 1,N 63 | IF ( Y(1,JY).NE.ZEROZ .OR. Y(2,JY).NE.ZEROZ ) THEN 64 | HTMP1S = ALPHA(1) * Y(1,JY) 65 | HTMP1J = ALPHA(1) * Y(2,JY) 66 | HTMP2S = ALPHA(1) * CONJG(Y(1,JY)) 67 | HTMP2J = ALPHA(1) * CONJG(Y(2,JY)) 68 | HTMP3S = ALPHA(2) * Y(1,JY) 69 | HTMP3J = ALPHA(2) * Y(2,JY) 70 | HTMP4S = ALPHA(2) * CONJG(Y(1,JY)) 71 | HTMP4J = ALPHA(2) * CONJG(Y(2,JY)) 72 | IX = KX 73 | DO 30 I = 1,M 74 | A(1,I,J) = A(1,I,J) + 75 | $ X(1,IX)*HTMP1S - X(2,IX)*HTMP2J - 76 | $ CONJG(X(1,IX))*HTMP4J - CONJG(X(2,IX))*HTMP3S 77 | A(2,I,J) = A(2,I,J) + 78 | $ X(1,IX)*HTMP1J + X(2,IX)*HTMP2S + 79 | $ CONJG(X(1,IX))*HTMP4S - CONJG(X(2,IX))*HTMP3J 80 | IX = IX + INCX 81 | 30 CONTINUE 82 | ENDIF 83 | JY = JY + INCY 84 | 40 CONTINUE 85 | ENDIF 86 | C 87 | END 88 | -------------------------------------------------------------------------------- /src/hblas/fortran/hblas2/hgeruz.f: -------------------------------------------------------------------------------- 1 | C 2 | C This file is a part of HAXX 3 | C 4 | C Copyright (c) 2017 David Williams-Young 5 | C All rights reserved. 6 | C 7 | C See LICENSE.txt 8 | C 9 | Subroutine HGERUZ(M, N, ALPHA, X, INCX, Y, INCY, A, LDA) 10 | C 11 | LOGICAL AZERO,AONE 12 | INTEGER*4 M,N, LDA,INCX,INCY, I,J,IY,JX,IX,JY, KX 13 | COMPLEX*16 X(2,*), Y(2,*), A(2,LDA,*), ALPHA 14 | COMPLEX*16 HTMP1S, HTMP1J, HTMP2S, HTMP2J 15 | C 16 | REAL*8 ONED, ZEROD 17 | COMPLEX*16 ONEZ, ZEROZ 18 | PARAMETER (ONED=1.0D+0, ZEROD=0.0D+0) 19 | PARAMETER (ONEZ=(1.0D+0,0.0D+0), ZEROZ=(0.0D+0,0.0D+0)) 20 | C 21 | 22 | AZERO = ALPHA.EQ.ZEROZ 23 | AONE = ALPHA.EQ.ONEZ 24 | 25 | IF ( M.EQ.0 .OR. N.EQ.0 .OR. AZERO ) RETURN 26 | C 27 | IF ( INCY.GT.0 ) THEN 28 | JY = 1 29 | ELSE 30 | JY = 1 - (N-1)*INCY 31 | ENDIF 32 | C 33 | IF ( INCX.EQ.1 ) THEN 34 | DO 20 J = 1,N 35 | IF ( Y(1,JY).NE.ZEROZ .OR. Y(2,JY).NE.ZEROZ ) THEN 36 | HTMP1S = ALPHA * Y(1,JY) 37 | HTMP1J = ALPHA * Y(2,JY) 38 | HTMP2S = ALPHA * CONJG(Y(1,JY)) 39 | HTMP2J = ALPHA * CONJG(Y(2,JY)) 40 | DO 10 I = 1,M 41 | A(1,I,J) = A(1,I,J) + X(1,I)*HTMP1S - X(2,I)*HTMP2J 42 | A(2,I,J) = A(2,I,J) + X(1,I)*HTMP1J + X(2,I)*HTMP2S 43 | 10 CONTINUE 44 | ENDIF 45 | JY = JY + INCY 46 | 20 CONTINUE 47 | ELSE 48 | IF ( INCX.GT.0 ) THEN 49 | KX = 1 50 | ELSE 51 | KX = 1 - (M-1)*INCX 52 | ENDIF 53 | DO 40 J = 1,N 54 | IF ( Y(1,JY).NE.ZEROZ .OR. Y(2,JY).NE.ZEROZ ) THEN 55 | HTMP1S = ALPHA * Y(1,JY) 56 | HTMP1J = ALPHA * Y(2,JY) 57 | HTMP2S = ALPHA * CONJG(Y(1,JY)) 58 | HTMP2J = ALPHA * CONJG(Y(2,JY)) 59 | IX = KX 60 | DO 30 I = 1,M 61 | A(1,I,J) = A(1,I,J) + X(1,IX)*HTMP1S - X(2,IX)*HTMP2J 62 | A(2,I,J) = A(2,I,J) + X(1,IX)*HTMP1J + X(2,IX)*HTMP2S 63 | IX = IX + INCX 64 | 30 CONTINUE 65 | ENDIF 66 | JY = JY + INCY 67 | 40 CONTINUE 68 | ENDIF 69 | C 70 | END 71 | -------------------------------------------------------------------------------- /src/hblas/fortran/util/hdexp.f: -------------------------------------------------------------------------------- 1 | C 2 | C This file is a part of HAXX 3 | C 4 | C Copyright (c) 2017 David Williams-Young 5 | C All rights reserved. 6 | C 7 | C See LICENSE.txt 8 | C 9 | Subroutine HDEXP(M, N, A, LDA, B, LDB) 10 | C 11 | IMPLICIT REAL*8(A-H,O,Z) 12 | DIMENSION A(4,LDA,*), B(LDB,*) 13 | C 14 | JB = 1 15 | Do 20 J = 1,N 16 | IB = 1 17 | Do 10 I = 1,M 18 | B(IB,JB) = A(1,I,J) 19 | B(IB,JB+1) = A(2,I,J) 20 | B(IB,JB+2) = A(3,I,J) 21 | B(IB,JB+3) = A(4,I,J) 22 | 23 | B(IB+1,JB) = -A(2,I,J) 24 | B(IB+1,JB+1) = A(1,I,J) 25 | B(IB+1,JB+2) = -A(4,I,J) 26 | B(IB+1,JB+3) = A(3,I,J) 27 | 28 | B(IB+2,JB) = -A(3,I,J) 29 | B(IB+2,JB+1) = A(4,I,J) 30 | B(IB+2,JB+2) = A(1,I,J) 31 | B(IB+2,JB+3) = -A(2,I,J) 32 | 33 | B(IB+3,JB) = -A(4,I,J) 34 | B(IB+3,JB+1) = -A(3,I,J) 35 | B(IB+3,JB+2) = A(2,I,J) 36 | B(IB+3,JB+3) = A(1,I,J) 37 | 38 | IB = IB + 4 39 | 10 CONTINUE 40 | JB = JB + 4 41 | 20 CONTINUE 42 | C 43 | END 44 | -------------------------------------------------------------------------------- /src/hblas/fortran/util/hzcon.f: -------------------------------------------------------------------------------- 1 | C 2 | C This file is a part of HAXX 3 | C 4 | C Copyright (c) 2017 David Williams-Young 5 | C All rights reserved. 6 | C 7 | C See LICENSE.txt 8 | C 9 | Subroutine HZCON1(UPLO, M, N, A, LDA, B, LDB) 10 | C 11 | CHARACTER UPLO 12 | LOGICAL NOTL 13 | INTEGER*4 M,N,LDA,LDB 14 | COMPLEX*16 A(2,LDA,*), B(LDB,*) 15 | C 16 | LOGICAL LSAME 17 | EXTERNAL LSAME 18 | C 19 | C 20 | NOTL = LSAME(UPLO,'U') 21 | C 22 | JB = 1 23 | Do 20 J = 1,N 24 | IB = 1 25 | Do 10 I = 1,M 26 | A(1,I,J) = B(IB,JB) 27 | 28 | IF( NOTL ) THEN 29 | A(2,I,J) = B(IB,JB+1) 30 | ELSE 31 | A(2,I,J) = -CONJG(B(IB+1,JB)) 32 | ENDIF 33 | 34 | IB = IB + 2 35 | 10 CONTINUE 36 | JB = JB + 2 37 | 20 CONTINUE 38 | C 39 | END 40 | C 41 | C 42 | C 43 | Subroutine HZCON2(UPLO, M, N, A, LDA, B, LDB) 44 | C 45 | CHARACTER UPLO 46 | LOGICAL NOTL 47 | INTEGER*4 M,N,LDA,LDB 48 | C 49 | Complex*16 A(2,LDA,*), B(LDB,*) 50 | C 51 | C 52 | LOGICAL LSAME 53 | EXTERNAL LSAME 54 | C 55 | 56 | NOTL = LSAME(UPLO,'U') 57 | 58 | Do 10 J = 1,N 59 | Do 10 I = 1,M 60 | 61 | A(1,I,J) = B(I,J) 62 | IF( NOTL ) THEN 63 | A(2,I,J) = B(I,J+N) 64 | ELSE 65 | A(2,I,J) = -CONJG(B(I+M,J)) 66 | ENDIF 67 | 68 | 69 | 10 CONTINUE 70 | C 71 | END 72 | -------------------------------------------------------------------------------- /src/hblas/fortran/util/hzexp.f: -------------------------------------------------------------------------------- 1 | C 2 | C This file is a part of HAXX 3 | C 4 | C Copyright (c) 2017 David Williams-Young 5 | C All rights reserved. 6 | C 7 | C See LICENSE.txt 8 | C 9 | Subroutine HZEXP1(M, N, A, LDA, B, LDB) 10 | C 11 | IMPLICIT COMPLEX*16(A-H,O,Z) 12 | DIMENSION A(2,LDA,*), B(LDB,*) 13 | C 14 | JB = 1 15 | Do 20 J = 1,N 16 | IB = 1 17 | Do 10 I = 1,M 18 | B(IB,JB) = A(1,I,J) 19 | B(IB+1,JB+1) = CONJG(A(1,I,J)) 20 | B(IB,JB+1) = A(2,I,J) 21 | B(IB+1,JB) = -CONJG(A(2,I,J)) 22 | 23 | IB = IB + 2 24 | 10 CONTINUE 25 | JB = JB + 2 26 | 20 CONTINUE 27 | C 28 | END 29 | C 30 | C 31 | C 32 | Subroutine HZEXP2(M, N, A, LDA, B, LDB) 33 | C 34 | IMPLICIT COMPLEX*16(A-H,O,Z) 35 | DIMENSION A(2,LDA,*), B(LDB,*) 36 | C 37 | Do 10 J = 1,N 38 | Do 10 I = 1,M 39 | B(I,J) = A(1,I,J) 40 | B(I+M,J+N) = CONJG(A(1,I,J)) 41 | B(I,J+N) = A(2,I,J) 42 | B(I+M,J) = -CONJG(A(2,I,J)) 43 | 10 CONTINUE 44 | C 45 | END 46 | -------------------------------------------------------------------------------- /src/hblas/fortran/util/lsame.f: -------------------------------------------------------------------------------- 1 | LOGICAL FUNCTION LSAME( CA, CB ) 2 | * 3 | * -- LAPACK auxiliary routine (version 3.1) -- 4 | * Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd.. 5 | * November 2006 6 | * 7 | * .. Scalar Arguments .. 8 | CHARACTER CA, CB 9 | * .. 10 | * 11 | * Purpose 12 | * ======= 13 | * 14 | * LSAME returns .TRUE. if CA is the same letter as CB regardless of 15 | * case. 16 | * 17 | * Arguments 18 | * ========= 19 | * 20 | * CA (input) CHARACTER*1 21 | * CB (input) CHARACTER*1 22 | * CA and CB specify the single characters to be compared. 23 | * 24 | * ===================================================================== 25 | * 26 | * .. Intrinsic Functions .. 27 | INTRINSIC ICHAR 28 | * .. 29 | * .. Local Scalars .. 30 | INTEGER INTA, INTB, ZCODE 31 | * .. 32 | * .. Executable Statements .. 33 | * 34 | * Test if the characters are equal 35 | * 36 | LSAME = CA.EQ.CB 37 | IF( LSAME ) 38 | $ RETURN 39 | * 40 | * Now test for equivalence if both characters are alphabetic. 41 | * 42 | ZCODE = ICHAR( 'Z' ) 43 | * 44 | * Use 'Z' rather than 'A' so that ASCII can be detected on Prime 45 | * machines, on which ICHAR returns a value with bit 8 set. 46 | * ICHAR('A') on Prime machines returns 193 which is the same as 47 | * ICHAR('A') on an EBCDIC machine. 48 | * 49 | INTA = ICHAR( CA ) 50 | INTB = ICHAR( CB ) 51 | * 52 | IF( ZCODE.EQ.90 .OR. ZCODE.EQ.122 ) THEN 53 | * 54 | * ASCII is assumed - ZCODE is the ASCII code of either lower or 55 | * upper case 'Z'. 56 | * 57 | IF( INTA.GE.97 .AND. INTA.LE.122 ) INTA = INTA - 32 58 | IF( INTB.GE.97 .AND. INTB.LE.122 ) INTB = INTB - 32 59 | * 60 | ELSE IF( ZCODE.EQ.233 .OR. ZCODE.EQ.169 ) THEN 61 | * 62 | * EBCDIC is assumed - ZCODE is the EBCDIC code of either lower or 63 | * upper case 'Z'. 64 | * 65 | IF( INTA.GE.129 .AND. INTA.LE.137 .OR. 66 | $ INTA.GE.145 .AND. INTA.LE.153 .OR. 67 | $ INTA.GE.162 .AND. INTA.LE.169 ) INTA = INTA + 64 68 | IF( INTB.GE.129 .AND. INTB.LE.137 .OR. 69 | $ INTB.GE.145 .AND. INTB.LE.153 .OR. 70 | $ INTB.GE.162 .AND. INTB.LE.169 ) INTB = INTB + 64 71 | * 72 | ELSE IF( ZCODE.EQ.218 .OR. ZCODE.EQ.250 ) THEN 73 | * 74 | * ASCII is assumed, on Prime machines - ZCODE is the ASCII code 75 | * plus 128 of either lower or upper case 'Z'. 76 | * 77 | IF( INTA.GE.225 .AND. INTA.LE.250 ) INTA = INTA - 32 78 | IF( INTB.GE.225 .AND. INTB.LE.250 ) INTB = INTB - 32 79 | END IF 80 | LSAME = INTA.EQ.INTB 81 | * 82 | * RETURN 83 | * 84 | * End of LSAME 85 | * 86 | END 87 | 88 | -------------------------------------------------------------------------------- /src/hblas/hblas1.cxx.in: -------------------------------------------------------------------------------- 1 | /* 2 | * This file is a part of HAXX 3 | * 4 | * Copyright (c) 2017 David Williams-Young 5 | * All rights reserved. 6 | * 7 | * See LICENSE.txt 8 | */ 9 | 10 | #include "haxx.hpp" 11 | #include "hblas/generic/hblas1/hblas_scalm_impl.hpp" 12 | #include "hblas/generic/hblas1/hblas_axpym_impl.hpp" 13 | 14 | #cmakedefine ENABLE_GENERIC_FORTRAN 15 | #cmakedefine ENABLE_GENERIC_CXX 16 | 17 | 18 | // Preprocessor macros for quick CXX implementations 19 | 20 | #define SWAPV_CXX_IMPL(F) \ 21 | template \ 22 | void HBLAS_SWAPV(const HAXX_INT, quaternion * const , const HAXX_INT,\ 23 | quaternion * const , const HAXX_INT); 24 | 25 | #define COPYV_CXX_IMPL(F) \ 26 | template \ 27 | void HBLAS_COPYV(const HAXX_INT, quaternion * const , const HAXX_INT,\ 28 | quaternion * const , const HAXX_INT); 29 | 30 | 31 | #define SCALM_CXX_IMPL(F,ALPHAF) \ 32 | template \ 33 | void HBLAS_SCALM(const char SIDE, const char TRANSA, const HAXX_INT M, \ 34 | const HAXX_INT N, const ALPHAF ALPHA, quaternion * const A, \ 35 | const HAXX_INT LDA, const HAXX_INT INCA); 36 | 37 | #define AXPYM_CXX_IMPL(F,XF,ALPHAF) \ 38 | template \ 39 | void HBLAS_AXPYM(const char SIDE, const char TRANSA, const HAXX_INT M, \ 40 | const HAXX_INT N, const ALPHAF ALPHA, XF * const A, const HAXX_INT LDA, \ 41 | const HAXX_INT INCA, quaternion * const B, const HAXX_INT LDB, \ 42 | const HAXX_INT INCB); 43 | 44 | 45 | // Preprocessor macros for quick FORTRAN implementations 46 | 47 | #define DOT_FORTRAN_IMPL(FUNCNAME,NAME,F)\ 48 | template<>\ 49 | quaternion HBLAS_##FUNCNAME( HAXX_INT N, quaternion *X, \ 50 | HAXX_INT INCX, quaternion *Y, HAXX_INT INCY){\ 51 | \ 52 | quaternion htemp;\ 53 | NAME##_(&htemp, &N, X, &INCX, Y, &INCY);\ 54 | return htemp;\ 55 | \ 56 | } 57 | 58 | #define SCALV_FORTRAN_IMPL(NAME,F,ALPHAF)\ 59 | template<>\ 60 | void HBLAS_SCALV(const char SIDE, const HAXX_INT N, const ALPHAF ALPHA,\ 61 | quaternion * const X, const HAXX_INT INCX) {\ 62 | \ 63 | NAME##_(&SIDE,&N,&ALPHA,X,&INCX);\ 64 | \ 65 | }; 66 | 67 | 68 | #define AXPYV_FORTRAN_IMPL(NAME,F,XF,ALPHAF)\ 69 | template<> \ 70 | void HBLAS_AXPYV(const char SIDE, const HAXX_INT N, const ALPHAF ALPHA,\ 71 | XF * const X, const HAXX_INT INCX, quaternion * const Y, \ 72 | const HAXX_INT INCY) {\ 73 | \ 74 | NAME##_(&SIDE,&N,&ALPHA,X,&INCX,Y,&INCY);\ 75 | \ 76 | }; 77 | 78 | namespace HAXX { 79 | 80 | // HBLAS1V functions 81 | 82 | #ifdef ENABLE_GENERIC_CXX 83 | SWAPV_CXX_IMPL(double); 84 | #endif 85 | 86 | #ifdef ENABLE_GENERIC_FORTRAN 87 | SCALV_FORTRAN_IMPL(hscald,double,double); 88 | SCALV_FORTRAN_IMPL(hscalc,double,std::complex); 89 | SCALV_FORTRAN_IMPL(hscalh,double,quaternion); 90 | #endif 91 | 92 | #ifdef ENABLE_GENERIC_CXX 93 | COPYV_CXX_IMPL(double); 94 | #endif 95 | 96 | #ifdef ENABLE_GENERIC_FORTRAN 97 | AXPYV_FORTRAN_IMPL(haxpydh,double,quaternion,double); 98 | AXPYV_FORTRAN_IMPL(haxpych,double,quaternion,std::complex); 99 | AXPYV_FORTRAN_IMPL(haxpyhh,double,quaternion,quaternion); 100 | #endif 101 | 102 | #ifdef ENABLE_GENERIC_FORTRAN 103 | DOTV_FORTRAN_IMPL(DOTUV,hdotu,double); 104 | DOTV_FORTRAN_IMPL(DOTCV,hdotc,double); 105 | #endif 106 | 107 | // HBLAS1M functions 108 | 109 | SCALM_CXX_IMPL(double,double); 110 | SCALM_CXX_IMPL(double,std::complex); 111 | SCALM_CXX_IMPL(double,quaternion); 112 | 113 | AXPYM_CXX_IMPL(double,quaternion,double); 114 | AXPYM_CXX_IMPL(double,quaternion,std::complex); 115 | AXPYM_CXX_IMPL(double,quaternion,quaternion); 116 | 117 | }; 118 | 119 | -------------------------------------------------------------------------------- /src/hblas/hblas2.cxx.in: -------------------------------------------------------------------------------- 1 | /* 2 | * This file is a part of HAXX 3 | * 4 | * Copyright (c) 2017 David Williams-Young 5 | * All rights reserved. 6 | * 7 | * See LICENSE.txt 8 | */ 9 | 10 | #include "haxx.hpp" 11 | #include "hblas/generic/hblas2/impl.hpp" 12 | 13 | // Preprocessor macros for quick CXX implementations 14 | 15 | #define GEMV_CXX_IMPL(F,MATF,VECF,ALPHAF,BETAF) \ 16 | template \ 17 | void HBLAS_GEMV(const char TRANS, const HAXX_INT M, const HAXX_INT N, \ 18 | const ALPHAF ALPHA, MATF * const A, const HAXX_INT LDA, \ 19 | VECF * const X, const HAXX_INT INCX, const BETAF BETA, \ 20 | quaternion * const Y, const HAXX_INT INCY); 21 | 22 | 23 | 24 | 25 | // Preprocessor macros for quick FORTRAN implementations 26 | 27 | #define GEMV_FORTRAN_IMPL(NAME,F,MATF,VECF,ALPHAF,BETAF) \ 28 | template <>\ 29 | void HBLAS_GEMV(const char TRANS, const HAXX_INT M, const HAXX_INT N, \ 30 | const ALPHAF ALPHA, MATF * const A, const HAXX_INT LDA, \ 31 | VECF * const X, const HAXX_INT INCX, const BETAF BETA, \ 32 | quaternion * const Y, const HAXX_INT INCY) {\ 33 | \ 34 | NAME##_(&TRANS,&M,&N,&ALPHA,A,&LDA,X,&INCX,&BETA,Y,&INCY);\ 35 | \ 36 | }; 37 | 38 | #define GER_FORTRAN_IMPL(FUNCNAME,NAME,F,LEFTF,RIGHTF,ALPHAF) \ 39 | template <>\ 40 | void HBLAS_##FUNCNAME(const HAXX_INT M, const HAXX_INT N, const ALPHAF ALPHA,\ 41 | LEFTF * const X, const HAXX_INT INCX, RIGHTF * const Y, const HAXX_INT INCY,\ 42 | quaternion * const A, const HAXX_INT LDA) {\ 43 | \ 44 | NAME##_(&M,&N,&ALPHA,X,&INCX,Y,&INCY,A,&LDA);\ 45 | \ 46 | } 47 | 48 | 49 | namespace HAXX { 50 | 51 | // GEMV functions 52 | 53 | GEMV_FORTRAN_IMPL(hgemvdd,double,quaternion,quaternion, 54 | double, double); 55 | GEMV_FORTRAN_IMPL(hgemvdz,double,quaternion,quaternion, 56 | double, std::complex); 57 | GEMV_FORTRAN_IMPL(hgemvdh,double,quaternion,quaternion, 58 | double, quaternion); 59 | 60 | GEMV_FORTRAN_IMPL(hgemvzd,double,quaternion,quaternion, 61 | std::complex,double); 62 | GEMV_FORTRAN_IMPL(hgemvzz,double,quaternion,quaternion, 63 | std::complex,std::complex); 64 | GEMV_FORTRAN_IMPL(hgemvzh,double,quaternion,quaternion, 65 | std::complex, quaternion); 66 | 67 | GEMV_FORTRAN_IMPL(hgemvhd,double,quaternion,quaternion, 68 | quaternion,double); 69 | GEMV_FORTRAN_IMPL(hgemvhz,double,quaternion,quaternion, 70 | quaternion,std::complex); 71 | GEMV_FORTRAN_IMPL(hgemvhh,double,quaternion,quaternion, 72 | quaternion, quaternion); 73 | 74 | // Use CXX for REAL-QUATERNION multiplication 75 | GEMV_CXX_IMPL(double,double,quaternion,double,double); 76 | 77 | 78 | 79 | // GERU functions 80 | 81 | GER_FORTRAN_IMPL(GERU,hgerud,double,quaternion,quaternion, 82 | double) 83 | GER_FORTRAN_IMPL(GERU,hgeruz,double,quaternion,quaternion, 84 | std::complex) 85 | GER_FORTRAN_IMPL(GERU,hgeruh,double,quaternion,quaternion, 86 | quaternion) 87 | 88 | // GERC functions 89 | 90 | GER_FORTRAN_IMPL(GERC,hgercd,double,quaternion,quaternion, 91 | double) 92 | GER_FORTRAN_IMPL(GERC,hgercz,double,quaternion,quaternion, 93 | std::complex) 94 | GER_FORTRAN_IMPL(GERC,hgerch,double,quaternion,quaternion, 95 | quaternion) 96 | }; 97 | 98 | -------------------------------------------------------------------------------- /src/hblas/hblas3.cxx.in: -------------------------------------------------------------------------------- 1 | /* 2 | * This file is a part of HAXX 3 | * 4 | * Copyright (c) 2017 David Williams-Young 5 | * All rights reserved. 6 | * 7 | * See LICENSE.txt 8 | */ 9 | 10 | #include "haxx.hpp" 11 | #include "hblas/generic/hblas3/impl.hpp" 12 | 13 | // Preprocessor macros for quick CXX implementations 14 | 15 | #define GEMM_CXX_IMPL(F,AMATF,BMATF,ALPHAF,BETAF) \ 16 | template \ 17 | void HBLAS_GEMM(const char, const char, const HAXX_INT, const HAXX_INT, \ 18 | const HAXX_INT, const ALPHAF ALPHA, AMATF * const, const HAXX_INT, \ 19 | BMATF * const, const HAXX_INT, const BETAF BETA, \ 20 | quaternion * const, const HAXX_INT); 21 | 22 | 23 | // Preprocessor macros for quick FORTRAN implementations 24 | 25 | #define GEMM_FORTRAN_IMPL(NAME,F,AMATF,BMATF,ALPHAF,BETAF) \ 26 | template<>\ 27 | void HBLAS_GEMM(const char TRANSA, const char TRANSB, const HAXX_INT M,\ 28 | const HAXX_INT N, const HAXX_INT K, const ALPHAF ALPHA, \ 29 | AMATF * const A, const HAXX_INT LDA, BMATF * const B, const HAXX_INT LDB, \ 30 | const BETAF BETA, quaternion * const C, const HAXX_INT LDC){\ 31 | \ 32 | NAME##_(&TRANSA,&TRANSB,&M,&N,&K,&ALPHA,A,&LDA,B,&LDB,&BETA,\ 33 | C,&LDC);\ 34 | \ 35 | }; 36 | 37 | namespace HAXX { 38 | 39 | // GEMM functions 40 | 41 | //GEMM_FORTRAN_IMPL(hgemmdd,double,quaternion,quaternion, 42 | // double,double); 43 | //GEMM_FORTRAN_IMPL(hgemmdz,double,quaternion,quaternion, 44 | // double,std::complex); 45 | GEMM_FORTRAN_IMPL(hgemmdh,double,quaternion,quaternion, 46 | double,quaternion); 47 | 48 | //GEMM_FORTRAN_IMPL(hgemmzd,double,quaternion,quaternion, 49 | // std::complex,double); 50 | //GEMM_FORTRAN_IMPL(hgemmzz,double,quaternion,quaternion, 51 | // std::complex,std::complex); 52 | GEMM_FORTRAN_IMPL(hgemmzh,double,quaternion,quaternion, 53 | std::complex,quaternion); 54 | 55 | GEMM_FORTRAN_IMPL(hgemmhd,double,quaternion,quaternion, 56 | quaternion,double); 57 | GEMM_FORTRAN_IMPL(hgemmhz,double,quaternion,quaternion, 58 | quaternion,std::complex); 59 | GEMM_FORTRAN_IMPL(hgemmhh,double,quaternion,quaternion, 60 | quaternion,quaternion); 61 | 62 | 63 | // Instantiate REAL-QUATERION matrix multiplication from template 64 | GEMM_CXX_IMPL(double,double,quaternion,double,double); 65 | 66 | }; 67 | 68 | -------------------------------------------------------------------------------- /src/hblas/util.cxx: -------------------------------------------------------------------------------- 1 | /* 2 | * This file is a part of HAXX 3 | * 4 | * Copyright (c) 2017 David Williams-Young 5 | * All rights reserved. 6 | * 7 | * See LICENSE.txt 8 | */ 9 | 10 | #include "haxx.hpp" 11 | #include "hblas/generic/util/impl.hpp" 12 | 13 | namespace HAXX { 14 | 15 | 16 | }; 17 | 18 | -------------------------------------------------------------------------------- /src/tune/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # 2 | # This file is a part of HAXX 3 | # 4 | # Copyright (c) 2017 David Williams-Young 5 | # All rights reserved. 6 | # 7 | # See LICENSE.txt 8 | # 9 | #set(Boost_USE_STATIC_LIBS ON) 10 | 11 | 12 | 13 | add_executable(gemm_tune gemm.cxx) 14 | target_link_libraries(gemm_tune ${Boost_LIBRARIES} hblas) 15 | 16 | -------------------------------------------------------------------------------- /src/tune/gemm.cxx: -------------------------------------------------------------------------------- 1 | /* 2 | * This file is a part of HAXX 3 | * 4 | * Copyright (c) 2017 David Williams-Young 5 | * All rights reserved. 6 | * 7 | * See LICENSE.txt 8 | */ 9 | 10 | 11 | #include "haxx.hpp" 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include "hblas/hblas3.hpp" 19 | #include "hblas/hblas_util.hpp" 20 | 21 | extern "C" { 22 | 23 | void zgemm_(const char*, const char*, const int*, const int*, 24 | const int*, const std::complex*, const std::complex*, 25 | const int*, const std::complex*, const int*, 26 | const std::complex*, const std::complex*, const int*); 27 | 28 | }; 29 | 30 | #define HBLAS1_RAND_MIN -20 31 | #define HBLAS1_RAND_MAX 54 32 | 33 | // Setup Random Number generator 34 | std::random_device rd; 35 | std::mt19937 gen(rd()); 36 | std::uniform_real_distribution<> dis(HBLAS1_RAND_MIN,HBLAS1_RAND_MAX); 37 | 38 | 39 | //#define _DO_COMPLEX 40 | //#define _DO_FORTRAN 41 | 42 | 43 | void outTime(std::string name, size_t LEN, size_t FLOPS, double count) { 44 | 45 | std::cout << std::setw(15) << std::left << name; 46 | std::cout << std::setw(15) << std::left << LEN; 47 | 48 | std::cout << std::setprecision(8); 49 | 50 | std::cout << std::setw(15) << std::right << count; 51 | std::cout << std::setw(15) << std::right << FLOPS/count/1.e9; 52 | 53 | std::cout << std::endl; 54 | 55 | 56 | 57 | }; 58 | 59 | int main() { 60 | 61 | const size_t GEMM_LEN_MIN = 500; 62 | const size_t GEMM_LEN_MAX = 2000; 63 | const size_t GEMM_INC = 500; 64 | 65 | std::chrono::duration hgemmDur; 66 | for( auto GEMM_LEN = GEMM_LEN_MIN; 67 | GEMM_LEN <= GEMM_LEN_MAX; 68 | GEMM_LEN += GEMM_INC ) { 69 | 70 | 71 | std::vector> 72 | A(GEMM_LEN*GEMM_LEN), B(GEMM_LEN*GEMM_LEN), C(GEMM_LEN*GEMM_LEN); 73 | 74 | for(auto &x : A) 75 | x = HAXX::quaternion(dis(gen),dis(gen),dis(gen),dis(gen)); 76 | for(auto &x : B) 77 | x = HAXX::quaternion(dis(gen),dis(gen),dis(gen),dis(gen)); 78 | for(auto &x : C) 79 | x = HAXX::quaternion(dis(gen),dis(gen),dis(gen),dis(gen)); 80 | 81 | #ifdef _DO_COMPLEX 82 | std::vector> 83 | AC(2*GEMM_LEN*2*GEMM_LEN), BC(2*GEMM_LEN*2*GEMM_LEN), CC(2*GEMM_LEN*2*GEMM_LEN); 84 | 85 | int x2x = 2*GEMM_LEN; 86 | int gl = GEMM_LEN; 87 | HBLAS_COMPLEX_EXPAND('S',GEMM_LEN,GEMM_LEN,&A[0],GEMM_LEN,&AC[0],2*GEMM_LEN); 88 | HBLAS_COMPLEX_EXPAND('S',GEMM_LEN,GEMM_LEN,&B[0],GEMM_LEN,&BC[0],2*GEMM_LEN); 89 | HBLAS_COMPLEX_EXPAND('S',GEMM_LEN,GEMM_LEN,&C[0],GEMM_LEN,&CC[0],2*GEMM_LEN); 90 | #endif 91 | 92 | 93 | char TRANSA = 'N', TRANSB = 'N'; 94 | 95 | std::complex ALPHA(dis(gen),dis(gen)), BETA(dis(gen),dis(gen)); 96 | 97 | 98 | if(GEMM_LEN == 500) 99 | HBLAS_GEMM(TRANSA,TRANSB,GEMM_LEN,GEMM_LEN,GEMM_LEN,ALPHA,&A[0],GEMM_LEN, 100 | &B[0],GEMM_LEN,BETA,&C[0],GEMM_LEN); 101 | 102 | #ifdef _DO_FORTRAN 103 | auto fortranStart = std::chrono::high_resolution_clock::now(); 104 | hgemmzz_(&TRANSA,&TRANSB,&gl,&gl,&gl,&ALPHA,&A[0],&gl,&B[0],&gl,&BETA,&C[0],&gl); 105 | auto fortranEnd = std::chrono::high_resolution_clock::now(); 106 | std::chrono::duration fortranDur = fortranEnd - fortranStart; 107 | 108 | outTime("HGEMM_FORTRAN",GEMM_LEN,32.*GEMM_LEN*GEMM_LEN*GEMM_LEN, 109 | fortranDur.count()); 110 | #endif 111 | 112 | auto hgemmStart = std::chrono::high_resolution_clock::now(); 113 | HBLAS_GEMM(TRANSA,TRANSB,GEMM_LEN,GEMM_LEN,GEMM_LEN,ALPHA,&A[0],GEMM_LEN, 114 | &B[0],GEMM_LEN,BETA,&C[0],GEMM_LEN); 115 | auto hgemmEnd = std::chrono::high_resolution_clock::now(); 116 | hgemmDur += hgemmEnd - hgemmStart; 117 | 118 | //outTime("HGEMM",GEMM_LEN,32.*GEMM_LEN*GEMM_LEN*GEMM_LEN,hgemmDur.count()); 119 | 120 | 121 | #ifdef _DO_COMPLEX 122 | auto zgemmStart = std::chrono::high_resolution_clock::now(); 123 | zgemm_(&TRANSA,&TRANSB,&x2x,&x2x,&x2x,&ALPHA,&AC[0],&x2x, 124 | &BC[0],&x2x,&BETA,&CC[0],&x2x); 125 | auto zgemmEnd = std::chrono::high_resolution_clock::now(); 126 | std::chrono::duration zgemmDur = zgemmEnd - zgemmStart; 127 | 128 | outTime("ZGEMM",GEMM_LEN,64.*GEMM_LEN*GEMM_LEN*GEMM_LEN,zgemmDur.count()); 129 | #endif 130 | 131 | 132 | } 133 | 134 | std::cout << hgemmDur.count() << std::endl; 135 | 136 | 137 | } 138 | -------------------------------------------------------------------------------- /tests/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # 2 | # This file is a part of HAXX 3 | # 4 | # Copyright (c) 2017 David Williams-Young 5 | # All rights reserved. 6 | # 7 | # See LICENSE.txt 8 | # 9 | 10 | if(NOT CMAKE_CXX_COMPILER_ID MATCHES "Clang") 11 | set(Boost_USE_STATIC_LIBS ON) 12 | endif() 13 | 14 | message( STATUS "HAXX WILL BUILD TEST EXECUTABLES" ) 15 | find_package (Boost COMPONENTS unit_test_framework REQUIRED) 16 | 17 | # HAXX UTs 18 | add_executable(haxx_ut ut.cxx haxx_algebra.cxx haxx_binary_op.cxx 19 | haxx_unary_op.cxx) 20 | target_compile_definitions(haxx_ut PUBLIC BOOST_TEST_MODULE=HAXX_FUNCTIONALITY) 21 | target_link_libraries(haxx_ut ${Boost_LIBRARIES}) 22 | 23 | add_test(HAXX_ALGEBRA haxx_ut --report_level=detailed --run_test=HAXX_ALGEBRA ) 24 | add_test(HAXX_BINARY_OP haxx_ut --report_level=detailed --run_test=HAXX_BINARY_OP) 25 | add_test(HAXX_UNARY_OP haxx_ut --report_level=detailed --run_test=HAXX_UNARY_OP ) 26 | 27 | add_executable(hblas_ut ut.cxx hblas1.cxx hblas2.cxx hblas3.cxx hblas_util.cxx) 28 | target_compile_definitions(hblas_ut PUBLIC BOOST_TEST_MODULE=HBLAS_FUNCTIONALITY) 29 | target_link_libraries(hblas_ut ${Boost_LIBRARIES} hblas) 30 | 31 | add_test(HBLAS_HBLAS1 hblas_ut --report_level=detailed --run_test=HBLAS1) 32 | add_test(HBLAS_HBLAS2 hblas_ut --report_level=detailed --run_test=HBLAS2) 33 | add_test(HBLAS_HBLAS3 hblas_ut --report_level=detailed --run_test=HBLAS3) 34 | add_test(HBLAS_UTIL hblas_ut --report_level=detailed --run_test=UTIL ) 35 | -------------------------------------------------------------------------------- /tests/haxx_algebra.cxx: -------------------------------------------------------------------------------- 1 | /* 2 | * This file is a part of HAXX 3 | * 4 | * Copyright (c) 2017 David Williams-Young 5 | * All rights reserved. 6 | * 7 | * See LICENSE.txt 8 | */ 9 | 10 | #include "haxx_ut.hpp" 11 | 12 | BOOST_AUTO_TEST_SUITE(HAXX_ALGEBRA) 13 | 14 | BOOST_AUTO_TEST_CASE(conj) 15 | { 16 | HAXX::quaternion q(1.,2.,3.,4.); 17 | HAXX::quaternion p = HAXX::conj(q); 18 | 19 | 20 | BOOST_CHECK_EQUAL(p.real(), q.real() ); 21 | BOOST_CHECK_EQUAL(p.imag_i(),-q.imag_i()); 22 | BOOST_CHECK_EQUAL(p.imag_j(),-q.imag_j()); 23 | BOOST_CHECK_EQUAL(p.imag_k(),-q.imag_k()); 24 | 25 | BOOST_CHECK_EQUAL(q.real(), 1.); 26 | BOOST_CHECK_EQUAL(q.imag_i(),2.); 27 | BOOST_CHECK_EQUAL(q.imag_j(),3.); 28 | BOOST_CHECK_EQUAL(q.imag_k(),4.); 29 | }; 30 | 31 | 32 | 33 | 34 | BOOST_AUTO_TEST_CASE(norm) 35 | { 36 | HAXX::quaternion q(1.,2.,3.,4.); 37 | double nrm = HAXX::norm(q); 38 | double nrmStupid = 39 | q.real() * q.real() + 40 | q.imag_i() * q.imag_i() + 41 | q.imag_j() * q.imag_j() + 42 | q.imag_k() * q.imag_k(); 43 | 44 | nrmStupid = std::sqrt(nrmStupid); 45 | 46 | 47 | HAXX::quaternion p(8.43,-9.3829,-10.3,3.14); 48 | HAXX::quaternion q1(7.,2.43,3.65,7.006); 49 | HAXX::quaternion p1(4.,2.,3.,7.); 50 | double a = 0.543; 51 | 52 | // Distance function 53 | double distPQ = HAXX::norm(p - q); 54 | double distQP = HAXX::norm(q - p); 55 | 56 | // Scaled distance 57 | double distP1Q1 = HAXX::norm(p1 + q1); 58 | double distDiff = HAXX::norm((p + a*p1 + q + a*q1) - (p + q)); 59 | 60 | 61 | BOOST_CHECK_EQUAL(nrm,nrmStupid); 62 | BOOST_CHECK_EQUAL(distPQ,distQP); 63 | BOOST_CHECK_EQUAL(distDiff,a*distP1Q1); 64 | 65 | BOOST_CHECK_EQUAL(q.real(), 1.); 66 | BOOST_CHECK_EQUAL(q.imag_i(),2.); 67 | BOOST_CHECK_EQUAL(q.imag_j(),3.); 68 | BOOST_CHECK_EQUAL(q.imag_k(),4.); 69 | 70 | }; 71 | 72 | 73 | 74 | 75 | BOOST_AUTO_TEST_SUITE_END() 76 | -------------------------------------------------------------------------------- /tests/haxx_unary_op.cxx: -------------------------------------------------------------------------------- 1 | /* 2 | * This file is a part of HAXX 3 | * 4 | * Copyright (c) 2017 David Williams-Young 5 | * All rights reserved. 6 | * 7 | * See LICENSE.txt 8 | */ 9 | 10 | 11 | #include "haxx_ut.hpp" 12 | 13 | BOOST_AUTO_TEST_SUITE(HAXX_UNARY_OP) 14 | 15 | // Real Unary Operators 16 | BOOST_AUTO_TEST_CASE(real_unary_assign) 17 | { 18 | 19 | HAXX::quaternion q(1.,2.,3.,4.); 20 | double x = -4.59; 21 | q = x; 22 | 23 | // Check that q is unchanged 24 | BOOST_CHECK_EQUAL(q.real(),x); 25 | BOOST_CHECK_EQUAL(q.imag_i(),0.); 26 | BOOST_CHECK_EQUAL(q.imag_j(),0.); 27 | BOOST_CHECK_EQUAL(q.imag_k(),0.); 28 | 29 | } 30 | 31 | BOOST_AUTO_TEST_CASE(real_unary_add) 32 | { 33 | 34 | HAXX::quaternion q(1.,2.,3.,4.); 35 | double x = -4.59; 36 | q += x; 37 | 38 | // Check that q is unchanged 39 | BOOST_CHECK_EQUAL(q.real(),1. + (x)); 40 | BOOST_CHECK_EQUAL(q.imag_i(),2.); 41 | BOOST_CHECK_EQUAL(q.imag_j(),3.); 42 | BOOST_CHECK_EQUAL(q.imag_k(),4.); 43 | 44 | } 45 | 46 | 47 | BOOST_AUTO_TEST_CASE(real_unary_sub) 48 | { 49 | 50 | HAXX::quaternion q(1.,2.,3.,4.); 51 | double x = -4.59; 52 | q -= x; 53 | 54 | // Check that q is unchanged 55 | BOOST_CHECK_EQUAL(q.real(),1. - (x)); 56 | BOOST_CHECK_EQUAL(q.imag_i(),2.); 57 | BOOST_CHECK_EQUAL(q.imag_j(),3.); 58 | BOOST_CHECK_EQUAL(q.imag_k(),4.); 59 | 60 | } 61 | 62 | BOOST_AUTO_TEST_CASE(real_unary_mul) 63 | { 64 | 65 | HAXX::quaternion q(1.,2.,3.,4.); 66 | double x = -4.59; 67 | q *= x; 68 | 69 | // Check that q is unchanged 70 | BOOST_CHECK_EQUAL(q.real(), 1.*x); 71 | BOOST_CHECK_EQUAL(q.imag_i(),2.*x); 72 | BOOST_CHECK_EQUAL(q.imag_j(),3.*x); 73 | BOOST_CHECK_EQUAL(q.imag_k(),4.*x); 74 | 75 | } 76 | 77 | BOOST_AUTO_TEST_CASE(real_unary_div) 78 | { 79 | 80 | HAXX::quaternion q(1.,2.,3.,4.); 81 | double x = -4.59; 82 | q /= x; 83 | 84 | // Check that q is unchanged 85 | BOOST_CHECK_EQUAL(q.real(), 1./x); 86 | BOOST_CHECK_EQUAL(q.imag_i(),2./x); 87 | BOOST_CHECK_EQUAL(q.imag_j(),3./x); 88 | BOOST_CHECK_EQUAL(q.imag_k(),4./x); 89 | 90 | } 91 | 92 | 93 | 94 | // Complex Unary Operators 95 | BOOST_AUTO_TEST_CASE(complex_unary_assign) 96 | { 97 | 98 | HAXX::quaternion q(1.,2.,3.,4.); 99 | std::complex x(-4.76,5.6); 100 | q = x; 101 | 102 | // Check that q is unchanged 103 | BOOST_CHECK_EQUAL(q.real(),x.real()); 104 | BOOST_CHECK_EQUAL(q.imag_i(),x.imag()); 105 | BOOST_CHECK_EQUAL(q.imag_j(),0.); 106 | BOOST_CHECK_EQUAL(q.imag_k(),0.); 107 | 108 | } 109 | 110 | BOOST_AUTO_TEST_CASE(complex_unary_add) 111 | { 112 | 113 | HAXX::quaternion q(1.,2.,3.,4.); 114 | std::complex x(-4.76,5.6); 115 | q += x; 116 | 117 | // Check that q is unchanged 118 | BOOST_CHECK_EQUAL(q.real(),1. + x.real()); 119 | BOOST_CHECK_EQUAL(q.imag_i(),2. + x.imag()); 120 | BOOST_CHECK_EQUAL(q.imag_j(),3.); 121 | BOOST_CHECK_EQUAL(q.imag_k(),4.); 122 | 123 | } 124 | 125 | 126 | BOOST_AUTO_TEST_CASE(complex_unary_sub) 127 | { 128 | 129 | HAXX::quaternion q(1.,2.,3.,4.); 130 | std::complex x(-4.76,5.6); 131 | q -= x; 132 | 133 | // Check that q is unchanged 134 | BOOST_CHECK_EQUAL(q.real(),1. - x.real()); 135 | BOOST_CHECK_EQUAL(q.imag_i(),2. - x.imag()); 136 | BOOST_CHECK_EQUAL(q.imag_j(),3.); 137 | BOOST_CHECK_EQUAL(q.imag_k(),4.); 138 | 139 | } 140 | 141 | 142 | // Quaternion Unary Operations 143 | BOOST_AUTO_TEST_CASE(quaternion_unary_assign) 144 | { 145 | 146 | HAXX::quaternion q(1.,2.,3.,4.), p = q; 147 | 148 | // Check that q is unchanged 149 | BOOST_CHECK_EQUAL(q.real(),p.real()); 150 | BOOST_CHECK_EQUAL(q.imag_i(),p.imag_i()); 151 | BOOST_CHECK_EQUAL(q.imag_j(),p.imag_j()); 152 | BOOST_CHECK_EQUAL(q.imag_k(),p.imag_k()); 153 | 154 | } 155 | 156 | BOOST_AUTO_TEST_CASE(quaternion_unary_add) 157 | { 158 | 159 | HAXX::quaternion q(1.,2.,3.,4.), p(5.,6.,7.,8.); 160 | p += q; 161 | 162 | // Check that p has been changed properly 163 | BOOST_CHECK_EQUAL(p.real(),6.); 164 | BOOST_CHECK_EQUAL(p.imag_i(),8.); 165 | BOOST_CHECK_EQUAL(p.imag_j(),10.); 166 | BOOST_CHECK_EQUAL(p.imag_k(),12.); 167 | 168 | // Check that q is unchanged 169 | BOOST_CHECK_EQUAL(q.real(),1.); 170 | BOOST_CHECK_EQUAL(q.imag_i(),2.); 171 | BOOST_CHECK_EQUAL(q.imag_j(),3.); 172 | BOOST_CHECK_EQUAL(q.imag_k(),4.); 173 | } 174 | 175 | BOOST_AUTO_TEST_CASE(quaternion_unary_sub) 176 | { 177 | 178 | HAXX::quaternion q(1.,2.,3.,4.), p(5.,0.,7.,9.); 179 | p -= q; 180 | 181 | // Check that p has been changed properly 182 | BOOST_CHECK_EQUAL(p.real(),4.); 183 | BOOST_CHECK_EQUAL(p.imag_i(),-2.); 184 | BOOST_CHECK_EQUAL(p.imag_j(),4.); 185 | BOOST_CHECK_EQUAL(p.imag_k(),5.); 186 | 187 | // Check that q is unchanged 188 | BOOST_CHECK_EQUAL(q.real(),1.); 189 | BOOST_CHECK_EQUAL(q.imag_i(),2.); 190 | BOOST_CHECK_EQUAL(q.imag_j(),3.); 191 | BOOST_CHECK_EQUAL(q.imag_k(),4.); 192 | 193 | } 194 | 195 | BOOST_AUTO_TEST_SUITE_END() 196 | -------------------------------------------------------------------------------- /tests/haxx_ut.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | * This file is a part of HAXX 3 | * 4 | * Copyright (c) 2017 David Williams-Young 5 | * All rights reserved. 6 | * 7 | * See LICENSE.txt 8 | */ 9 | 10 | 11 | #ifdef BOOST_TEST_MODULE 12 | #undef BOOST_TEST_MODULE 13 | #endif 14 | 15 | #define BOOST_NO_MAIN 16 | #include 17 | 18 | #include 19 | 20 | #include "haxx.hpp" 21 | 22 | #include 23 | #include 24 | #include 25 | #include 26 | #include 27 | 28 | // Length constants 29 | #define HBLAS1_VECLEN 100 30 | #define HBLAS2_MATLEN (HBLAS1_VECLEN) * (HBLAS1_VECLEN) 31 | #define HBLAS1_RAND_MIN -20 32 | #define HBLAS1_RAND_MAX 54 33 | 34 | // Setup Random Number generator 35 | static std::random_device rd; 36 | static std::mt19937 gen(rd()); 37 | static std::uniform_real_distribution<> dis(HBLAS1_RAND_MIN,HBLAS1_RAND_MAX); 38 | 39 | template _F genRandom(); 40 | template<> inline double genRandom(){ return double(dis(gen)); } 41 | template<> inline std::complex genRandom>(){ 42 | return std::complex(dis(gen),dis(gen)); 43 | } 44 | template<> inline HAXX::quaternion genRandom>(){ 45 | return HAXX::quaternion(dis(gen),dis(gen),dis(gen),dis(gen)); 46 | } 47 | 48 | 49 | 50 | // Index list for HBLAS1 UT conformation 51 | static std::vector indx(boost::counting_iterator(0), 52 | boost::counting_iterator(HBLAS1_VECLEN)); 53 | 54 | // Strides to be tested 55 | static std::vector strides = {1,2,3,5,9}; 56 | 57 | #define COMPARE_TOL 1e-12 58 | #define CMP_Q(a,b) ( HAXX::norm(((a) * HAXX::inv(b))- 1.) < COMPARE_TOL ) 59 | -------------------------------------------------------------------------------- /tests/hblas_util.cxx: -------------------------------------------------------------------------------- 1 | /* 2 | * This file is a part of HAXX 3 | * 4 | * Copyright (c) 2017 David Williams-Young 5 | * All rights reserved. 6 | * 7 | * See LICENSE.txt 8 | */ 9 | 10 | #include "haxx_ut.hpp" 11 | #include "hblas/hblas_util.hpp" 12 | #include "hblas/hblas3.hpp" 13 | 14 | BOOST_AUTO_TEST_SUITE(UTIL) 15 | 16 | void ComplexExpandTest(char ORDER) { 17 | 18 | std::vector> A(HBLAS2_MATLEN); 19 | std::vector> B(HBLAS2_MATLEN); 20 | std::vector> C(HBLAS2_MATLEN); 21 | 22 | std::vector> AC(4*HBLAS2_MATLEN); 23 | std::vector> BC(4*HBLAS2_MATLEN); 24 | std::vector> CC(4*HBLAS2_MATLEN); 25 | std::vector> PC(4*HBLAS2_MATLEN); 26 | 27 | for(auto &x : A) x = genRandom>(); 28 | for(auto &x : B) x = genRandom>(); 29 | 30 | HBLAS_COMPLEX_EXPAND(ORDER,HBLAS1_VECLEN,HBLAS1_VECLEN,&A[0],HBLAS1_VECLEN, 31 | &AC[0],2*HBLAS1_VECLEN); 32 | HBLAS_COMPLEX_EXPAND(ORDER,HBLAS1_VECLEN,HBLAS1_VECLEN,&B[0],HBLAS1_VECLEN, 33 | &BC[0],2*HBLAS1_VECLEN); 34 | 35 | 36 | // Quaternion multiplication 37 | HBLAS_GEMM('N','N', HBLAS1_VECLEN, HBLAS1_VECLEN, HBLAS1_VECLEN, 38 | 1., &A[0], HBLAS1_VECLEN, &B[0], HBLAS1_VECLEN, 0., &C[0], 39 | HBLAS1_VECLEN); 40 | 41 | // Complex multiplication (FIXME: Should use ZGEMM here) 42 | for(auto i = 0; i < 2*HBLAS1_VECLEN; i++) 43 | for(auto j = 0; j < 2*HBLAS1_VECLEN; j++) { 44 | CC[i + j*2*HBLAS1_VECLEN] = 0.; 45 | for(auto k = 0; k < 2*HBLAS1_VECLEN; k++) 46 | CC[i + j*2*HBLAS1_VECLEN] += 47 | AC[i + k*2*HBLAS1_VECLEN] * BC[k + j*2*HBLAS1_VECLEN]; 48 | } 49 | 50 | // Expand product to complex 51 | HBLAS_COMPLEX_EXPAND(ORDER,HBLAS1_VECLEN,HBLAS1_VECLEN,&C[0],HBLAS1_VECLEN, 52 | &PC[0],2*HBLAS1_VECLEN); 53 | 54 | 55 | for(auto i = 0; i < 4*HBLAS2_MATLEN; i++) 56 | BOOST_CHECK( std::norm(CC[i] / PC[i] - 1.) < COMPARE_TOL ); 57 | 58 | }; 59 | 60 | void ComplexContractTest(char ORDER, char UPLO) { 61 | 62 | std::vector> A(HBLAS2_MATLEN); 63 | std::vector> B(HBLAS2_MATLEN); 64 | 65 | std::vector> AC(4*HBLAS2_MATLEN); 66 | 67 | for(auto &x : A) x = genRandom>(); 68 | 69 | // A -> AC 70 | HBLAS_COMPLEX_EXPAND(ORDER,HBLAS1_VECLEN,HBLAS1_VECLEN,&A[0],HBLAS1_VECLEN, 71 | &AC[0],2*HBLAS1_VECLEN); 72 | 73 | // AC -> B 74 | HBLAS_COMPLEX_CONTRACT(ORDER,UPLO,HBLAS1_VECLEN,HBLAS1_VECLEN,&B[0], 75 | HBLAS1_VECLEN,&AC[0],2*HBLAS1_VECLEN); 76 | 77 | 78 | // Compare A and B 79 | for(auto i = 0; i < HBLAS2_MATLEN; i++) 80 | BOOST_CHECK( CMP_Q(A[i],B[i]) ); 81 | } 82 | 83 | 84 | BOOST_AUTO_TEST_CASE(Complex_Expand1) { ComplexExpandTest('F'); }; 85 | BOOST_AUTO_TEST_CASE(Complex_Expand2) { ComplexExpandTest('S'); }; 86 | BOOST_AUTO_TEST_CASE(Complex_Contract1) { ComplexContractTest('F','U'); }; 87 | BOOST_AUTO_TEST_CASE(Complex_Contract2) { ComplexContractTest('S','U'); }; 88 | BOOST_AUTO_TEST_CASE(Complex_Contract3) { ComplexContractTest('F','L'); }; 89 | BOOST_AUTO_TEST_CASE(Complex_Contract4) { ComplexContractTest('S','L'); }; 90 | 91 | BOOST_AUTO_TEST_SUITE_END() 92 | -------------------------------------------------------------------------------- /tests/ut.cxx: -------------------------------------------------------------------------------- 1 | /* 2 | * This file is a part of HAXX 3 | * 4 | * Copyright (c) 2017 David Williams-Young 5 | * All rights reserved. 6 | * 7 | * See LICENSE.txt 8 | */ 9 | 10 | #include 11 | #include 12 | #include 13 | using namespace boost::unit_test; 14 | 15 | // Instantiate the Boost UTF 16 | --------------------------------------------------------------------------------