├── .gitignore
├── .travis.yml
├── CMakeLists.txt
├── Doxyfile
├── LICENSE.txt
├── README.md
├── bin
└── travisci.sh
├── cmake
├── FindSIMD.cmake
├── HAXXCompilerFlags.cmake
└── HAXXHandleSIMD.cmake
├── include
├── haxx.hpp
├── haxx
│ ├── haxx_complex_op.hpp
│ ├── haxx_def.hpp
│ ├── haxx_quaternion_op.hpp
│ ├── haxx_quaternion_reduction.hpp
│ └── haxx_scalar_op.hpp
├── haxx_config.hpp.in
├── hblas.hpp
├── hblas
│ ├── config
│ │ ├── hblas3
│ │ │ └── gemm.hpp
│ │ └── types.hpp
│ ├── generic
│ │ ├── hblas1
│ │ │ ├── hblas_axpym_impl.hpp
│ │ │ ├── hblas_axpyv_impl.hpp
│ │ │ ├── hblas_copyv_impl.hpp
│ │ │ ├── hblas_dotcv_impl.hpp
│ │ │ ├── hblas_dotuv_impl.hpp
│ │ │ ├── hblas_scalm_impl.hpp
│ │ │ ├── hblas_scalv_impl.hpp
│ │ │ ├── hblas_swapv_impl.hpp
│ │ │ └── impl.hpp
│ │ ├── hblas2
│ │ │ ├── hblas_gemv_impl.hpp
│ │ │ ├── hblas_gerc_impl.hpp
│ │ │ ├── hblas_geru_impl.hpp
│ │ │ └── impl.hpp
│ │ ├── hblas3
│ │ │ ├── hblas_gemm_impl.hpp
│ │ │ └── impl.hpp
│ │ └── util
│ │ │ ├── contract_impl.hpp
│ │ │ ├── expand_impl.hpp
│ │ │ └── impl.hpp
│ ├── hblas1.hpp
│ ├── hblas2.hpp
│ ├── hblas3.hpp
│ ├── hblas_util.hpp
│ └── pack
│ │ ├── pack.hpp
│ │ ├── packops.hpp
│ │ └── typewrapper.hpp
└── util
│ ├── boilerplate.hpp
│ ├── constants.hpp
│ ├── macro.hpp
│ ├── simd.hpp
│ ├── simd
│ ├── intrin_alias.hpp
│ ├── misc.hpp
│ └── qop.hpp
│ └── types.hpp
├── src
├── CMakeLists.txt
├── benchmark
│ ├── CMakeLists.txt
│ ├── dot.cxx
│ └── gemm.cxx
├── hblas
│ ├── CMakeLists.txt
│ ├── cxx
│ │ ├── CMakeLists.txt
│ │ ├── hblas1
│ │ │ ├── AXPYScal.cmake
│ │ │ ├── CMakeLists.txt
│ │ │ ├── CopySwap.cmake
│ │ │ ├── Dot.cmake
│ │ │ ├── axpy_scal.cxx
│ │ │ ├── copy_swap.cxx
│ │ │ └── dot.cxx
│ │ └── hblas3
│ │ │ ├── CMakeLists.txt
│ │ │ ├── Gemm.cmake
│ │ │ ├── gemm.cxx
│ │ │ └── kern.cxx
│ ├── fortran
│ │ ├── CMakeLists.txt
│ │ ├── hblas1
│ │ │ ├── haxpych.f
│ │ │ ├── haxpydh.f
│ │ │ ├── haxpyhh.f
│ │ │ ├── hdotc.f
│ │ │ ├── hdotu.f
│ │ │ ├── hscalc.f
│ │ │ ├── hscald.f
│ │ │ └── hscalh.f
│ │ ├── hblas2
│ │ │ ├── hgemvdd.f
│ │ │ ├── hgemvdh.f
│ │ │ ├── hgemvdz.f
│ │ │ ├── hgemvhd.f
│ │ │ ├── hgemvhh.f
│ │ │ ├── hgemvhz.f
│ │ │ ├── hgemvzd.f
│ │ │ ├── hgemvzh.f
│ │ │ ├── hgemvzz.f
│ │ │ ├── hgercd.f
│ │ │ ├── hgerch.f
│ │ │ ├── hgercz.f
│ │ │ ├── hgerud.f
│ │ │ ├── hgeruh.f
│ │ │ └── hgeruz.f
│ │ ├── hblas3
│ │ │ ├── hgemmdd.f
│ │ │ ├── hgemmdh.f
│ │ │ ├── hgemmdz.f
│ │ │ ├── hgemmhd.f
│ │ │ ├── hgemmhh.f
│ │ │ ├── hgemmhz.f
│ │ │ ├── hgemmzd.f
│ │ │ ├── hgemmzh.f
│ │ │ └── hgemmzz.f
│ │ └── util
│ │ │ ├── hdexp.f
│ │ │ ├── hzcon.f
│ │ │ ├── hzexp.f
│ │ │ └── lsame.f
│ ├── hblas1.cxx.in
│ ├── hblas2.cxx.in
│ ├── hblas3.cxx.in
│ └── util.cxx
└── tune
│ ├── CMakeLists.txt
│ └── gemm.cxx
└── tests
├── CMakeLists.txt
├── haxx_algebra.cxx
├── haxx_binary_op.cxx
├── haxx_unary_op.cxx
├── haxx_ut.hpp
├── hblas1.cxx
├── hblas2.cxx
├── hblas3.cxx
├── hblas_util.cxx
└── ut.cxx
/.gitignore:
--------------------------------------------------------------------------------
1 | *build*/
2 | docs/
3 | .*.swp
4 | .*.swo
5 | *.o
6 |
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | os:
2 | - linux
3 | language: cpp
4 | compiler:
5 | - gcc
6 | # - clang
7 | env:
8 | - GCC_VERSION=4.8
9 | - GCC_VERSION=4.9
10 | - GCC_VERSION=5
11 | matrix:
12 | exclude:
13 | - compiler: clang
14 | env: GCC_VERSION=4.8
15 | - compiler: clang
16 | env: GCC_VERSION=4.9
17 | sudo: false
18 | addons:
19 | apt:
20 | sources:
21 | - george-edison55-precise-backports
22 | - ubuntu-toolchain-r-test
23 | - llvm-toolchain-precise-3.8
24 | - boost-latest
25 | packages:
26 | - gfortran
27 | - g++-4.8
28 | - g++-4.9
29 | - g++-5
30 | - clang-3.8
31 | - libboost1.55-all-dev
32 | install: true
33 | script:
34 | - ./bin/travisci.sh
35 | after_failure:
36 | - cat build/Testing/Temporary/LastTest.log
37 |
--------------------------------------------------------------------------------
/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | # This file is a part of HAXX
2 | #
3 | # Copyright (c) 2017 David Williams-Young
4 | # All rights reserved.
5 | #
6 | # See LICENSE.txt
7 | #
8 | # CheckFortranCompilerFlag (included from cmake/HAXXCompilerFlags.cmake) was
9 | # introduced in CMake 3.3, so older releases cannot configure this project.
10 | cmake_minimum_required(VERSION 3.3)
11 | project(HAXX CXX C Fortran)
12 | set(HAXX_VERSION_MAJOR ALPHA)
13 | set(HAXX_VERSION_MINOR 1)
14 | set(HAXX_VERSION_PATCH 0)
15 |
16 | # Headers come from the source tree and from the build tree (configured header)
17 | include_directories("${PROJECT_SOURCE_DIR}/include")
18 | include_directories("${PROJECT_BINARY_DIR}/include")
19 |
20 | # Generate haxx_config.hpp from its template into the build tree
21 | configure_file(
22 |   "${PROJECT_SOURCE_DIR}/include/haxx_config.hpp.in"
23 |   "${PROJECT_BINARY_DIR}/include/haxx_config.hpp"
24 | )
25 |
26 | set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} ${PROJECT_SOURCE_DIR}/cmake)
27 |
28 | # Clang builds get an extra definition for the unit tests
29 | if(CMAKE_CXX_COMPILER_ID MATCHES "Clang")
30 |   message(STATUS "${CMAKE_CXX_COMPILER_ID} Recognized as CLANG: Adding -D_HAXX_UT_BUTF_NINCLUDED")
31 |   add_definitions("-D_HAXX_UT_BUTF_NINCLUDED")
32 | endif()
33 |
34 | # Options (declared before the modules below, which read HAXX_USE_HOST_SIMD)
35 | option( HAXX_ENABLE_BENCHMARK "Enable Build of HAXX / HBLAS Benchmark" OFF )
36 | option( HAXX_ENABLE_TESTING "Enable Build of HAXX / HBLAS Unit Tests" ON )
37 | option( HAXX_USE_HOST_SIMD "Use Host Native Flags for SIMD" ON )
38 |
39 | include(HAXXCompilerFlags)
40 | include(HAXXHandleSIMD)
41 |
42 | add_subdirectory(src)
43 |
44 | if( HAXX_ENABLE_TESTING )
45 |   enable_testing()
46 |   add_subdirectory(tests)
47 | endif()
48 |
49 | # Install Headers: public entry points, their sub-directories, and the generated config
50 | install(FILES ${PROJECT_SOURCE_DIR}/include/haxx.hpp DESTINATION include)
51 | install(FILES ${PROJECT_SOURCE_DIR}/include/hblas.hpp DESTINATION include)
52 | install(DIRECTORY ${PROJECT_SOURCE_DIR}/include/haxx DESTINATION include)
53 | install(DIRECTORY ${PROJECT_SOURCE_DIR}/include/hblas DESTINATION include)
54 | install(DIRECTORY ${PROJECT_SOURCE_DIR}/include/util DESTINATION include)
55 | install(FILES ${PROJECT_BINARY_DIR}/include/haxx_config.hpp DESTINATION include)
56 |
--------------------------------------------------------------------------------
/LICENSE.txt:
--------------------------------------------------------------------------------
1 | Copyright (C) 2017 David Williams-Young
2 | All rights reserved.
3 |
4 | Redistribution and use in source and binary forms, with or without
5 | modification, are permitted provided that the following conditions are met:
6 |
7 | 1. Redistributions of source code must retain the above copyright notice, this
8 | list of conditions and the following disclaimer.
9 |
10 | 2. Redistributions in binary form must reproduce the above copyright notice,
11 | this list of conditions and the following disclaimer in the documentation
12 | and/or other materials provided with the distribution.
13 |
14 | 3. Neither the name of the copyright holder nor the names of its contributors
15 | may be used to endorse or promote products derived from this software without
16 | specific prior written permission.
17 |
18 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
19 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
20 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
21 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
22 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
24 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
25 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
26 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | [![Build Status](https://travis-ci.org/wavefunction91/HAXX.svg?branch=master)](https://travis-ci.org/wavefunction91/HAXX)
2 |
3 |
4 | Synopsis
5 | ========
6 | **HAXX** (**H**amilton's Quaternion **A**lgebra for C**XX**) is a C++ software
7 | infrastructure for the development of efficient scalar and tensorial quaternion
8 | algorithms. HAXX can be thought of as two interdependent C++ software
9 | libraries:
10 |
11 | * The HAXX scalar quaternion class which handles the scalar operations
12 | (+,-,etc) over the quaternion numbers
13 | * HBLAS for quaternion and mixed-type linear algebra
14 |
15 |
16 | **HBLAS** (**H**amilton's Quaternion **B**asic **L**inear **A**lgebra
17 | **S**ubroutines) provides a BLAS-like interface for matrices and vectors over
18 | the quaternion numbers. As HBLAS depends solely on the HAXX scalar quaternion
19 | infrastructure, there is no plan currently to release them separately.
20 | Currently, HBLAS provides an optimized (serial) software implementation of quaternion
21 | matrix operations for AVX and AVX2 microarchitectures (see [arXiv:1903.05575](http://arxiv.org/abs/1903.05575) for details).
22 |
23 |
24 | HAXX is currently a development code which has been hand tuned for a select few microarchitectures to demonstrate the
25 | efficacy of such operations on modern computing platforms.
26 | The default caching parameters shipped with HAXX are optimized for the Intel(R) Xeon(R) CPU E5-2660 (Sandy Bridge) processor.
27 | The API specification in HAXX is very flexible, but most of the flexibility is not directly user-facing. If there is interest
28 | in exposing such functionality, please open a GitHub issue.
29 |
30 | A primary goal of HBLAS is not only to provide a
31 | convenient and efficient interface for quaternion-quaternion linear algebra,
32 | but also to efficiently handle mixed-type (quaternion-real, quaternion-complex)
33 | linear algebra through their natural embeddings into the quaternion algebra.
34 | HAXX is actively being developed with little focus on backwards compatibility
35 | with previous versions. The HAXX and HBLAS interfaces are constantly evolving
36 | and can (will) change on a regular basis as new, exciting functionality is
37 | added.
38 |
39 | Design Goals
40 | ============
41 | * A high-level, modern C++ API for scalar quaternion algebra (addition,
42 | subtraction, multiplication, division)
43 | * Access to low level optimization and vectorization of the real arithmetic
44 | underlying quaternion operations
45 | * Extension of BLAS functionality to quaternion algebra and mixed-type
46 | expressions (HBLAS)
47 | * A reusable software framework to enable future scalar and tensorial
48 | algorithmic development using the quaternion algebra
49 |
50 | Developers
51 | ==========
52 | David Williams-Young (Computational Research Division / Lawrence Berkeley National Laboratory)
53 | E-Mail: dbwy at lbl dot gov
54 |
--------------------------------------------------------------------------------
/bin/travisci.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 | # Travis CI driver: pick the compiler, fetch CMake 3.7.2, then configure, build and test.
3 | set -e
4 |
5 | if [ "$CXX" = "g++" ]; then
6 |   export CC="/usr/bin/gcc-$GCC_VERSION"
7 |   export CXX="/usr/bin/g++-$GCC_VERSION"
8 | else
9 |   # no OpenMP support in clang, will use C++11 threads
10 |   export CC=/usr/bin/clang-3.8
11 |   export CXX=/usr/bin/clang++-3.8
12 | fi
13 | export CXXFLAGS="-std=c++11 -O3"
14 | export FC=gfortran
15 |
16 | # Unpack a prebuilt CMake into the existing cmake/ directory and put it first on PATH
17 | CMAKE_URL="https://cmake.org/files/v3.7/cmake-3.7.2-Linux-x86_64.tar.gz"
18 | wget --no-check-certificate --quiet -O - "$CMAKE_URL" | tar --strip-components=1 -xz -C cmake
19 | PATH="$PWD/cmake/bin:$PATH"
20 |
21 | # Two separate commands: in `mkdir build && cd build` a failing mkdir is exempt from `set -e`
22 | mkdir build
23 | cd build
24 | cmake --version
25 | cmake -DBOOST_LIBRARYDIR='/usr/lib' -DCMAKE_Fortran_FLAGS='-O3' ..
26 | make -j2
27 | make test
28 |
--------------------------------------------------------------------------------
/cmake/FindSIMD.cmake:
--------------------------------------------------------------------------------
1 | # Determine SIMD instruction set if any
2 |
3 |
4 | message( STATUS "Attempting to determine SIMD instructions")
5 |
6 | # Default to generic code; only the Linux probe below can switch AVX / AVX2 on.
7 | # AVX-512 is not probed here, so AVX512_TRUE always stays false.
8 | set( AVX_TRUE false )
9 | set( AVX2_TRUE false )
10 | set( AVX512_TRUE false )
11 |
12 | # LINUX checks
13 | if( CMAKE_SYSTEM_NAME MATCHES "Linux" AND EXISTS "/proc/cpuinfo" )
14 |
15 |   # Get proc info
16 |   file( READ "/proc/cpuinfo" CPUINFO )
17 |
18 |   # Check AVX and AVX2 (plain substring search, so "avx" also hits "avx2")
19 |   if( CPUINFO MATCHES "avx" )
20 |     set( AVX_TRUE true )
21 |   endif()
22 |   if( CPUINFO MATCHES "avx2" )
23 |     set( AVX2_TRUE true )
24 |   endif()
25 |
26 | endif()
27 |
28 |
29 | # Cached without FORCE: the values stored by the first configure run are kept
30 | set( AVX_FOUND ${AVX_TRUE} CACHE BOOL "AVX instructions available" )
31 | set( AVX2_FOUND ${AVX2_TRUE} CACHE BOOL "AVX2 instructions available" )
32 | set( AVX512_FOUND ${AVX512_TRUE} CACHE BOOL "AVX512 instructions available" )
33 |
34 |
35 | if( AVX2_FOUND )
36 |   message( STATUS "-- AVX2 is largest available SIMD instruction set" )
37 | elseif( AVX_FOUND )
38 |   message( STATUS "-- AVX is largest available SIMD instruction set" )
39 | endif()
40 |
41 |
--------------------------------------------------------------------------------
/cmake/HAXXCompilerFlags.cmake:
--------------------------------------------------------------------------------
1 | include(CheckCXXCompilerFlag)
2 | include(CheckFortranCompilerFlag)
3 |
4 | # Handle C++14 Flags
5 | set(CMAKE_CXX_STANDARD_REQUIRED ON)
6 | set(CMAKE_CXX_STANDARD 14)
7 |
8 | if(CMAKE_CXX_COMPILER_ID MATCHES "Intel")
9 |   set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++14")
10 | endif()
11 |
12 | # Check for FORTRAN preprocessor: -fpp is preferred, -cpp is the fallback
13 | check_fortran_compiler_flag("-fpp" FC_USES_FPP)
14 | check_fortran_compiler_flag("-cpp" FC_USES_CPP)
15 |
16 | if(FC_USES_FPP)
17 |   set(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} -fpp")
18 | elseif(FC_USES_CPP)
19 |   set(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} -cpp")
20 | else()
21 |   message(FATAL_ERROR "Unable to Determine a Suitable FORTRAN Preprocessor")
22 | endif()
23 |
24 |
25 | # Check IPO (skipped when the user sets DISABLE_IPO)
26 | check_cxx_compiler_flag("-ipo" CXX_USES_IPO)
27 |
28 | if( CXX_USES_IPO AND NOT DISABLE_IPO )
29 |   set( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -ipo" )
30 | endif()
31 |
32 |
33 | # Host SIMD flags
34 | if( HAXX_USE_HOST_SIMD )
35 |
36 |   # Determine CXX opt flags
37 |   check_cxx_compiler_flag("-march=native" CXX_USES_MARCH_NATIVE)
38 |   check_cxx_compiler_flag("-xHost" CXX_USES_XHOST )
39 |
40 |   check_fortran_compiler_flag("-march=native" FC_USES_MARCH_NATIVE)
41 |   check_fortran_compiler_flag("-xHost" FC_USES_XHOST )
42 |
43 |   # Add Host flags (-xHost wins over -march=native when both are accepted)
44 |   if( CXX_USES_XHOST )
45 |     set( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -xHost" )
46 |   elseif( CXX_USES_MARCH_NATIVE )
47 |     set( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=native -O3" )
48 |   else()
49 |     message( WARNING "Unable to determine proper HOST flags for CXX compiler" )
50 |   endif()
51 |
52 |   if( FC_USES_XHOST )
53 |     set( CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} -xHost" )
54 |   elseif( FC_USES_MARCH_NATIVE )
55 |     set( CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} -march=native" )
56 |   else()
57 |     message( WARNING "Unable to determine proper HOST flags for FC" )
58 |   endif()
59 |
60 | endif()
61 |
62 | # HAXX Types
63 |
64 | # Index Integer Type (defaults to int32_t unless HAXX_INT is already set)
65 | if( NOT HAXX_INT )
66 |   set( HAXX_INT int32_t )
67 | endif()
68 | add_definitions("-DHAXX_INT=${HAXX_INT}")
69 |
70 |
71 |
--------------------------------------------------------------------------------
/cmake/HAXXHandleSIMD.cmake:
--------------------------------------------------------------------------------
1 | # Choose between the generic and the SIMD-optimized code; only done for host-SIMD builds
2 | if( HAXX_USE_HOST_SIMD )
3 |
4 |   # Probe the host for AVX / AVX2 / AVX-512
5 |   include( FindSIMD )
6 |
7 |   if( AVX_FOUND OR AVX2_FOUND OR AVX512_FOUND )
8 |
9 |     message( STATUS "HAXX Found a suitable SIMD instruction set -- Enabling Optimized code" )
10 |
11 |     set( ENABLE_GENERIC_FORTRAN false CACHE BOOL "Enable generic FORTRAN code" )
12 |     set( ENABLE_GENERIC_CXX false CACHE BOOL "Enable generic CXX code" )
13 |
14 |   else()
15 |
16 |     message( WARNING "HAXX only provided optimal implementations for AVX, AVX2 and AVX-512 -- Defaulting to Generic FORTRAN build" )
17 |
18 |     set( ENABLE_GENERIC_FORTRAN true CACHE BOOL "Enable generic FORTRAN code" )
19 |     set( ENABLE_GENERIC_CXX true CACHE BOOL "Enable generic CXX code" )
20 |
21 |   endif()
22 | endif()
23 |
--------------------------------------------------------------------------------
/include/haxx.hpp:
--------------------------------------------------------------------------------
1 | /*
2 | * This file is a part of HAXX
3 | *
4 | * Copyright (c) 2017 David Williams-Young
5 | * All rights reserved.
6 | *
7 | * See LICENSE.txt
8 | */
9 |
10 | #ifndef __INCLUDED_HAXX_HPP
11 | #define __INCLUDED_HAXX_HPP
12 |
13 |
14 | // HAXX quaternion definition
15 | #include "haxx/haxx_def.hpp"
16 |
17 | // Scalar Operators
18 | #include "haxx/haxx_scalar_op.hpp"
19 |
20 | // Complex Operators
21 | #include "haxx/haxx_complex_op.hpp"
22 |
23 | // Quaternion Operators
24 | #include "haxx/haxx_quaternion_op.hpp"
25 |
26 | // Quaternion Reduction
27 | #include "haxx/haxx_quaternion_reduction.hpp"
28 |
29 |
30 | #endif
31 |
--------------------------------------------------------------------------------
/include/haxx/haxx_quaternion_op.hpp:
--------------------------------------------------------------------------------
1 | /*
2 | * This file is a part of HAXX
3 | *
4 | * Copyright (c) 2017 David Williams-Young
5 | * All rights reserved.
6 | *
7 | * See LICENSE.txt
8 | */
9 |
10 | #ifndef __INCLUDED_HAXX_QUATERNION_OP_HPP
11 | #define __INCLUDED_HAXX_QUATERNION_OP_HPP
12 |
13 | #include "haxx/haxx_def.hpp"
14 |
15 | namespace HAXX {
16 |
17 | /**
18 | * Assigns a quaternion of a different type to this quaternion
19 | */
20 | template
21 | template
22 | quaternion<_F>& quaternion<_F>::operator=(const quaternion<_G> &__q) {
23 |
24 | _M_real = __q.real();
25 | _M_imag_i = __q.imag_i();
26 | _M_imag_j = __q.imag_j();
27 | _M_imag_k = __q.imag_k();
28 |
29 | return *this;
30 |
31 | };
32 |
33 | /**
34 | * \f$ p = p + q \qquad p,q \in \mathbb{H} \f$
35 | */
36 | template
37 | template
38 | quaternion<_F>& quaternion<_F>::operator+=(const quaternion<_G> &__q) {
39 |
40 | _M_real += __q.real();
41 | _M_imag_i += __q.imag_i();
42 | _M_imag_j += __q.imag_j();
43 | _M_imag_k += __q.imag_k();
44 |
45 | return *this;
46 |
47 | };
48 |
49 | /**
50 | * \f$ p = p - q \qquad p,q \in \mathbb{H} \f$
51 | */
52 | template
53 | template
54 | quaternion<_F>& quaternion<_F>::operator-=(const quaternion<_G> &__q) {
55 |
56 | _M_real -= __q.real();
57 | _M_imag_i -= __q.imag_i();
58 | _M_imag_j -= __q.imag_j();
59 | _M_imag_k -= __q.imag_k();
60 |
61 | return *this;
62 |
63 | };
64 |
65 |
66 |
67 |
68 | /**
69 | * \f$ r = -q = (-q^R, -q^I, -q^J, -q^K) \f$
70 | */
71 | template
72 | inline quaternion<_F> operator-(const quaternion<_F>& __x) {
73 |
74 | return quaternion<_F>(-__x.real(),-__x.imag_i(),-__x.imag_j(),-__x.imag_k());
75 |
76 | }
77 |
78 |
79 | /**
80 | * \f$r = p + q \qquad r,p,q\in\mathbb{H} \f$
81 | */
82 | template
83 | inline quaternion<_F> operator+(const quaternion<_F>& __x,
84 | const quaternion<_F>& __y) {
85 |
86 | quaternion<_F> __r = __x;
87 | __r += __y;
88 | return __r;
89 |
90 | };
91 |
92 | /**
93 | * \f$r = p - q \qquad r,p,q\in\mathbb{H} \f$
94 | */
95 | template
96 | inline quaternion<_F> operator-(const quaternion<_F>& __x,
97 | const quaternion<_F>& __y) {
98 |
99 | quaternion<_F> __r = __x;
100 | __r -= __y;
101 | return __r;
102 |
103 | };
104 |
105 | /**
106 | * \f$ r = pq \qquad r,p,q\in\mathbb{H} \f$
107 | */
108 | template
109 | inline quaternion<_F> operator*(const quaternion<_F>& __x,
110 | const quaternion<_F>& __y) {
111 |
112 | quaternion<_F> __r;
113 |
114 | // This is a really naive algorithm
115 | __r.real(__x.real() * __y.real() - __x.imag_i() * __y.imag_i() -
116 | __x.imag_j() * __y.imag_j() - __x.imag_k() * __y.imag_k());
117 |
118 | __r.imag_i(__x.real() * __y.imag_i() + __x.imag_i() * __y.real() +
119 | __x.imag_j() * __y.imag_k() - __x.imag_k() * __y.imag_j());
120 |
121 | __r.imag_j(__x.real() * __y.imag_j() - __x.imag_i() * __y.imag_k() +
122 | __x.imag_j() * __y.real() + __x.imag_k() * __y.imag_i());
123 |
124 | __r.imag_k(__x.real() * __y.imag_k() + __x.imag_i() * __y.imag_j() -
125 | __x.imag_j() * __y.imag_i() + __x.imag_k() * __y.real());
126 |
127 | return __r;
128 |
129 | };
130 |
131 | #if defined(__AVX__) || defined(__AVX2__)
132 |
133 | // C++ wrapper around SIMD quaternion multiplication
134 | template<>
135 | inline quaternion operator*(const quaternion& __x,
136 | const quaternion& __y) {
137 |
138 | quaternion __r;
139 |
140 | // Load x,y into 256-bit vector lanes and perform the multiplication
141 | __m256d x = LOAD_256D_UNALIGNED_AS(double,&__x);
142 | __m256d y = LOAD_256D_UNALIGNED_AS(double,&__y);
143 | __m256d r = MULDQ_NN(x,y);
144 |
145 | STORE_256D_UNALIGNED_AS(double,&__r,r);
146 |
147 | return __r;
148 | };
149 |
150 | #endif
151 |
152 | /**
153 | * Returns true iff all of the elements of quaternion \f$q\f$ are the
154 | * same as quaternion \f$p\f$.
155 | */
156 | template <typename _F>
157 | inline bool operator==(const quaternion<_F>& p, const quaternion<_F>& q) {
158 |   // Component-wise comparison of the real, i, j and k parts of p against q
159 |   return p.real() == q.real() and p.imag_i() == q.imag_i()
160 |     and p.imag_j() == q.imag_j() and p.imag_k() == q.imag_k();
161 |
162 | }
163 |
164 | /**
165 | * Returns true iff quaternions \f$p\f$ and \f$q\f$ do not compare equal,
166 | * i.e. the logical negation of operator==.
167 | */
168 | template
169 | inline bool operator!=(const quaternion<_F>& p, const quaternion<_F>& q) {
170 |
171 | return not (p == q);
172 |
173 | }
174 |
175 | }; // HAXX namespace
176 |
177 | #endif
178 |
--------------------------------------------------------------------------------
/include/haxx/haxx_quaternion_reduction.hpp:
--------------------------------------------------------------------------------
1 | /*
2 | * This file is a part of HAXX
3 | *
4 | * Copyright (c) 2017 David Williams-Young
5 | * All rights reserved.
6 | *
7 | * See LICENSE.txt
8 | */
9 |
10 | #ifndef __INCLUDED_HAXX_QUATERNION_REDUCTION_HPP
11 | #define __INCLUDED_HAXX_QUATERNION_REDUCTION_HPP
12 |
13 | #include "haxx/haxx_def.hpp"
14 |
15 | namespace HAXX {
16 |
17 | /**
18 | * \f$ \vert\vert q \vert\vert = \sqrt{a^2 + b^2 + c^2 + d^2}
19 | * \qquad q = (a,b,c,d) \in \mathbb{H}\f$
20 | *
21 | * Return type is the value_type
22 | */
23 | template
24 | inline _F norm(const quaternion<_F>& __q) {
25 | _F nmsq = __q.real() * __q.real();
26 | nmsq += __q.imag_i() * __q.imag_i();
27 | nmsq += __q.imag_j() * __q.imag_j();
28 | nmsq += __q.imag_k() * __q.imag_k();
29 |
30 | return std::sqrt(nmsq);
31 | };
32 |
33 | /**
34 | * \f$ q^* = (q^R,-q^I,-q^J,-q^K)\f$
35 | */
36 | template
37 | inline quaternion<_F> conj(const quaternion<_F>& __q) {
38 |
39 | return quaternion<_F>(__q.real(),-__q.imag_i(),-__q.imag_j(),-__q.imag_k());
40 |
41 | };
42 |
43 | // Attempt at SIMD conjugate, slows down the code
44 | #if 0
45 | //#if defined(__AVX__) || defined(__AVX2__)
46 |
47 | template<>
48 | inline quaternion conj(const quaternion &__q) {
49 |
50 | quaternion __r;
51 |
52 | __m256d r = LOAD_256D_UNALIGNED_AS(double,&__q);
53 | r = QCONJ_256D(r);
54 |
55 | STORE_256D_UNALIGNED_AS(double,&__r,r);
56 |
57 | return __r;
58 |
59 | }
60 |
61 | #endif
62 |
63 | /**
64 | * \f$ q^{-1} = \dfrac{q^*}{\vert\vert q \vert\vert^2} \f$
65 | */
66 | template
67 | inline quaternion<_F> inv(const quaternion<_F>& __q) {
68 |
69 | _F nrm = norm(__q);
70 | return conj(__q) / nrm / nrm;
71 |
72 | };
73 |
74 | /**
75 | * \f$ r = \dfrac{q}{\vert\vert q \vert\vert} \f$
76 | */
77 | template
78 | inline quaternion<_F> versor(const quaternion<_F>& __q) {
79 |
80 | _F nrm = norm(__q);
81 | return __q / nrm;
82 |
83 | };
84 |
85 | /**
86 | * \f$ [p,q] = pq - qp \f$
87 | */
88 | template
89 | inline quaternion<_F> comm(const quaternion<_F>& p, const quaternion<_F>& q) {
90 |
91 | return p * q - q * p;
92 |
93 | };
94 |
95 | template
96 | inline quaternion<_F> comm(const quaternion<_F>& p, const std::complex<_F>& q) {
97 | /*
98 | std::complex jPt(p.imag_j(),p.imag_k());
99 | return quaternion<_F>(std::complex(0.),2.*jPt*std::imag(q));
100 | */
101 | return p * q - q * p;
102 | };
103 |
104 | template
105 | inline quaternion<_F> comm(const std::complex<_F>& p, const quaternion<_F>& q) {
106 |
107 | return -comm(q,p);
108 |
109 | }
110 |
111 |
112 | template
113 | inline quaternion<_F> comm(const _F& p, const quaternion<_F>& q) {
114 |
115 | return quaternion<_F>(0.);
116 |
117 | }
118 |
119 | template
120 | inline quaternion<_F> comm(const quaternion<_F>& q, const _F& p) {
121 |
122 | return quaternion<_F>(0.);
123 |
124 | }
125 |
126 |
127 |
128 | template<> inline double SmartConj( double &x ) { return x; }
129 | template<>
130 | inline std::complex SmartConj( std::complex &x ) {
131 | return std::conj(x);
132 | }
133 | template<>
134 | inline quaternion SmartConj( quaternion &x ) {
135 | return conj(x);
136 | }
137 |
138 |
139 |
140 | };
141 |
142 | #endif
143 |
--------------------------------------------------------------------------------
/include/haxx_config.hpp.in:
--------------------------------------------------------------------------------
1 | /*
2 | * This file is a part of HAXX
3 | *
4 | * Copyright (c) 2017 David Williams-Young
5 | * All rights reserved.
6 | *
7 | * See LICENSE.txt
8 | */
9 |
10 | #ifndef __INCLUDED_HAXX_CONFIG_HPP
11 | #define __INCLUDED_HAXX_CONFIG_HPP
12 |
13 | // BLAS-3 Params
14 | #cmakedefine MC ${MC}
15 | #cmakedefine NC ${NC}
16 | #cmakedefine KC ${KC}
17 |
18 |
19 | #endif
20 |
--------------------------------------------------------------------------------
/include/hblas.hpp:
--------------------------------------------------------------------------------
1 | /*
2 | * This file is a part of HAXX
3 | *
4 | * Copyright (c) 2017 David Williams-Young
5 | * All rights reserved.
6 | *
7 | * See LICENSE.txt
8 | */
9 | #ifndef __INCLUDED_HBLAS_HPP
10 | #define __INCLUDED_HBLAS_HPP
11 |
12 | /**
13 | * @defgroup HBLAS Quaternion BLAS (HBLAS)
14 | * BLAS-Like Routines over the quaternion numbers
15 | */
16 |
17 | #include "hblas/hblas1.hpp" // Level 1 HBLAS
18 | #include "hblas/hblas2.hpp" // Level 2 HBLAS
19 | #include "hblas/hblas3.hpp" // Level 3 HBLAS
20 |
21 | #endif
22 |
--------------------------------------------------------------------------------
/include/hblas/config/hblas3/gemm.hpp:
--------------------------------------------------------------------------------
1 | /*
2 | * This file is a part of HAXX
3 | *
4 | * Copyright (c) 2017 David Williams-Young
5 | * All rights reserved.
6 | *
7 | * See LICENSE.txt
8 | */
9 |
10 | #ifndef __INCLUDED_HBLAS_CONFIG_HBLAS3_GEMM_HPP
11 | #define __INCLUDED_HBLAS_CONFIG_HBLAS3_GEMM_HPP
12 |
13 |
14 | #include "hblas/config/types.hpp"
15 |
16 | // Caching dimensions
17 |
18 | #ifndef MC
19 | #define MC 64
20 | #endif
21 | #ifndef NC
22 | #define NC 512
23 | #endif
24 | #ifndef KC
25 | #define KC 64
26 | #endif
27 |
28 | // Register block size
29 | #define MR 2
30 | #define NR 2
31 |
32 |
33 | // Determine where to factor ALPHA scaling
34 | #define _FACTOR_ALPHA_IN_A_PACK
35 | //#define _FACTOR_ALPHA_IN_B_PACK
36 |
37 | #if defined(_FACTOR_ALPHA_IN_A_PACK) && defined(_FACTOR_ALPHA_IN_B_PACK)
38 | #error "Cannot factor ALPHA into both A and B packs"
39 | #endif
40 |
41 |
42 | // Determine where to factor the transpose operation
43 | // for GEMM kernel
44 | #define _FACTOR_TRANSPOSE_INTO_A_PACK
45 | #define _FACTOR_TRANSPOSE_INTO_B_PACK
46 |
47 |
48 |
49 | // Determine packing utility for GEMM
50 |
51 | #include "hblas/pack/pack.hpp"
52 |
53 | #ifdef _FACTOR_TRANSPOSE_INTO_B_PACK
54 | // Packing routines for B are parameterized on B's own element type (_BMATF)
55 | #define BPACKT NPACK< NR, _BMATF, GenericPackOps_T2<> >
56 | #define BPACKCT NPACK< NR, _BMATF, ConjPackOps_T2 <> >
57 | #define BPACKR TPACK< NR, _BMATF, ConjPackOps_T2 <> >
58 | #define BPACK TPACK< NR, _BMATF, GenericPackOps_T2<> >
59 |
60 | #else
61 |
62 | #define BPACKT NPACK< NR, _BMATF, GenericPackOps<_BMATF> >
63 | #define BPACKCT NPACK< NR, _BMATF, ConjPackOps <_BMATF> >
64 | #define BPACKR TPACK< NR, _BMATF, ConjPackOps <_BMATF> >
65 | #define BPACK TPACK< NR, _BMATF, GenericPackOps<_BMATF> >
66 |
67 | #endif
68 |
69 | #ifdef _FACTOR_TRANSPOSE_INTO_A_PACK
70 |
71 | #define APACKT TPACK< MR, _AMATF, GenericPackOps_T1<> >
72 | #define APACKCT TPACK< MR, _AMATF, ConjPackOps_T1 <> >
73 | #define APACKR NPACK< MR, _AMATF, ConjPackOps_T1 <> >
74 | #define APACK NPACK< MR, _AMATF, GenericPackOps_T1<> >
75 |
76 | #else
77 |
78 | #define APACKT TPACK< MR, _AMATF, GenericPackOps<_AMATF> >
79 | #define APACKCT TPACK< MR, _AMATF, ConjPackOps <_AMATF> >
80 | #define APACKR NPACK< MR, _AMATF, ConjPackOps <_AMATF> >
81 | #define APACK NPACK< MR, _AMATF, GenericPackOps<_AMATF> >
82 |
83 | #endif
84 |
85 | #endif
86 |
--------------------------------------------------------------------------------
/include/hblas/config/types.hpp:
--------------------------------------------------------------------------------
1 | /*
2 | * This file is a part of HAXX
3 | *
4 | * Copyright (c) 2017 David Williams-Young
5 | * All rights reserved.
6 | *
7 | * See LICENSE.txt
8 | */
9 |
10 | #ifndef __INCLUDED_HBLAS_CONFIG_TYPES_HPP
11 | #define __INCLUDED_HBLAS_CONFIG_TYPES_HPP
12 |
13 | #include "util/constants.hpp"
14 |
15 | // Determine type of scaling parameter ALPHA
16 | #ifdef ALPHAF
17 | #if ALPHAF == DOUBLE
18 | #define _ALPHAF double
19 | #elif ALPHAF == DCOMPLEX
20 | #define _ALPHAF std::complex
21 | #elif ALPHAF == DQUATERNION
22 | #define _ALPHAF quaternion
23 | #endif
24 | #endif
25 |
26 | // Determine type of scaling parameter BETA
27 | #ifdef BETAF
28 | #if BETAF == DOUBLE
29 | #define _BETAF double
30 | #elif BETAF == DCOMPLEX
31 | #define _BETAF std::complex
32 | #elif BETAF == DQUATERNION
33 | #define _BETAF quaternion
34 | #endif
35 | #endif
36 |
37 | // Determine type of matrix type AMAT
38 | #ifdef AMATF
39 | #if AMATF == DOUBLE
40 | #define _AMATF double
41 | #elif AMATF == DCOMPLEX
42 | #define _AMATF std::complex
43 | #elif AMATF == DQUATERNION
44 | #define _AMATF quaternion
45 | #endif
46 | #endif
47 |
48 | // Determine type of matrix type BMAT
49 | #ifdef BMATF
50 | #if BMATF == DOUBLE
51 | #define _BMATF double
52 | #elif BMATF == DCOMPLEX
53 | #define _BMATF std::complex
54 | #elif BMATF == DQUATERNION
55 | #define _BMATF quaternion
56 | #endif
57 | #endif
58 |
59 |
60 |
61 | #endif
62 |
--------------------------------------------------------------------------------
/include/hblas/generic/hblas1/hblas_axpym_impl.hpp:
--------------------------------------------------------------------------------
1 | /*
2 | * This file is a part of HAXX
3 | *
4 | * Copyright (c) 2017 David Williams-Young
5 | * All rights reserved.
6 | *
7 | * See LICENSE.txt
8 | */
9 |
10 | #pragma once
11 |
12 | #include <cassert>
13 | #include "hblas/hblas1.hpp"
14 |
15 | namespace HAXX {
16 |
17 | /**
18 | * Written by DBWY (4/2017). Calls HBLAS_AXPYV once per column j < N, advancing A by LDA and B by LDB.
19 | */
20 | template
21 | void HBLAS_AXPYM(const char SIDE, const char TRANSA, const HAXX_INT M,
22 | const HAXX_INT N, const _AlphaF ALPHA, _XF * const A, const HAXX_INT LDA,
23 | const HAXX_INT INCA, quaternion<_F> * const B, const HAXX_INT LDB,
24 | const HAXX_INT INCB) {
25 |
26 | // Quick return: empty problem, or ALPHA == 0 which leaves B untouched
27 | if( N <= 0 or M <= 0) return;
28 | if( ALPHA == _AlphaF(0.) ) return;
29 |
30 | assert(TRANSA == 'N'); // Only supporting scaling for now
31 |
32 | HAXX_INT j;
33 | // A keeps its own element type (_XF); it need not be quaternion<_F>
34 | _XF *locA = A; quaternion<_F> *locB = B;
35 |
36 | for( j = 0; j < N; j++ ) {
37 | HBLAS_AXPYV(SIDE,M,ALPHA,locA,INCA,locB,INCB);
38 | locA += LDA;
39 | locB += LDB;
40 | }
41 |
42 | };
43 |
44 |
45 |
46 |
47 | }; // namespace HAXX
48 |
--------------------------------------------------------------------------------
/include/hblas/generic/hblas1/hblas_axpyv_impl.hpp:
--------------------------------------------------------------------------------
1 | /*
2 | * This file is a part of HAXX
3 | *
4 | * Copyright (c) 2017 David Williams-Young
5 | * All rights reserved.
6 | *
7 | * See LICENSE.txt
8 | */
9 |
10 | #pragma once
11 |
12 | #include "hblas/hblas1.hpp"
13 |
14 | #define HAXX_AXPY_UNROLL 4
15 |
16 | namespace HAXX {
17 |
18 | /**
19 | * Written by DBWY (4/2017)
20 | *
21 | * Based on the BLAS implementation of DAXPY by Jack Dongarra
22 | * http://www.netlib.org/lapack/explore-html/d9/dcd/daxpy_8f.html
23 | *
24 | * Accumulates the ALPHA-scaled vector X into the quaternion vector Y.
25 | * \f$ y \in \mathbb{H} \qquad x,\alpha \in \mathbb{R},\mathbb{C},\mathbb{H} \f$
26 | *
27 | * SIDE == 'L' (or any value other than 'R')
28 | *
29 | * \f$ y_i = \alpha x_i + y_i\f$
30 | *
31 | * SIDE == 'R'
32 | *
33 | * \f$ y_i = x_i \alpha + y_i \f$
34 | */
35 | template
36 | void HBLAS_AXPYV(const char SIDE, const HAXX_INT N, const _AlphaF ALPHA,
37 | _XF * const X, const HAXX_INT INCX, quaternion<_F> * const Y,
38 | const HAXX_INT INCY) {
39 |
40 | // Quick return: an empty vector or a zero ALPHA leaves Y untouched.
41 | if( N <= 0 ) return;
42 | if( ALPHA == _AlphaF(0.) ) return;
43 |
44 | // FIXME: See further comments on negative stride
45 | assert(INCX > 0);
46 | assert(INCY > 0);
47 |
48 | HAXX_INT i;
49 |
50 | // FIXME: Should write a specialization for real ALPHA where side
51 | // doesn't matter
52 | bool isR = SIDE == 'R';
53 | bool isL = not isR;
54 | // Unit strides: first the N % HAXX_AXPY_UNROLL leading elements, then groups of HAXX_AXPY_UNROLL.
55 | if( INCX == 1 and INCY == 1 ) {
56 | HAXX_INT m = N % HAXX_AXPY_UNROLL;
57 |
58 | if( m != 0) {
59 | if( isL ) for( i = 0; i < m; ++i ) Y[i] += ALPHA * X[i];
60 | else for( i = 0; i < m; ++i ) Y[i] += X[i] * ALPHA;
61 |
62 | // XXX: DAXPY has this outside of the if-check? Unlike COPY and SCAL
63 | if( N < HAXX_AXPY_UNROLL ) return;
64 | }
65 |
66 | // FIXME: This assumes HAXX_AXPY_UNROLL = 4
67 | if( isL )
68 | for( i = m; i < N; i += HAXX_AXPY_UNROLL ) {
69 | Y[i] += ALPHA * X[i];
70 | Y[i+1] += ALPHA * X[i+1];
71 | Y[i+2] += ALPHA * X[i+2];
72 | Y[i+3] += ALPHA * X[i+3];
73 | }
74 | else
75 | for( i = m; i < N; i += HAXX_AXPY_UNROLL ) {
76 | Y[i] += X[i] * ALPHA;
77 | Y[i+1] += X[i+1] * ALPHA;
78 | Y[i+2] += X[i+2] * ALPHA;
79 | Y[i+3] += X[i+3] * ALPHA;
80 | }
81 | } else {
82 | // General strides: ix and iy advance independently by INCX and INCY.
83 | HAXX_INT ix(0), iy(0);
84 | // FIXME: the original _AXPY function has code here to handle
85 | // negative increments. Unsure on what that accomplishes
86 |
87 | if( isL )
88 | for( i = 0; i < N; ++i, ix += INCX, iy += INCY ) Y[iy] += ALPHA * X[ix];
89 | else
90 | for( i = 0; i < N; ++i, ix += INCX, iy += INCY ) Y[iy] += X[ix] * ALPHA;
91 |
92 | }
93 | };
94 |
95 |
96 |
97 |
98 | }; // namespace HAXX
99 |
--------------------------------------------------------------------------------
/include/hblas/generic/hblas1/hblas_copyv_impl.hpp:
--------------------------------------------------------------------------------
1 | /*
2 | * This file is a part of HAXX
3 | *
4 | * Copyright (c) 2017 David Williams-Young
5 | * All rights reserved.
6 | *
7 | * See LICENSE.txt
8 | */
9 |
10 | #pragma once
11 |
12 | #include "hblas/hblas1.hpp"
13 |
14 | #define HAXX_COPY_UNROLL 7
15 |
16 | namespace HAXX {
17 |
18 | /**
19 | * Copies the elements from one quaternion vector to another
20 | *
21 | * Written by DBWY (4/2017)
22 | *
23 | * Based on the BLAS implementation of DCOPY by Jack Dongarra
24 | * http://www.netlib.org/lapack/explore-html/da/d6c/dcopy_8f.html
25 | */
26 | template
27 | void HBLAS_COPYV(const HAXX_INT N, quaternion<_F> * const X,
28 | const HAXX_INT INCX, quaternion<_F> * const Y, const HAXX_INT INCY) {
29 | // X is the source and Y the destination; N <= 0 is a no-op.
30 | if( N <= 0 ) return;
31 | // FIXME: See further comments on negative stride
32 | assert(INCX > 0);
33 | assert(INCY > 0);
34 |
35 | HAXX_INT i;
36 |
37 | if( INCX == 1 and INCY == 1 ) {
38 | // Unit strides: copy the N % HAXX_COPY_UNROLL leading elements, then unrolled groups.
39 | HAXX_INT m = N % HAXX_COPY_UNROLL;
40 | if( m != 0 ) {
41 | for( i = 0; i < m; i++ ) Y[i] = X[i];
42 | if( N < HAXX_COPY_UNROLL ) return;
43 | }
44 |
45 | // FIXME: This assumes HAXX_COPY_UNROLL = 7
46 | for( i = m; i < N; i += HAXX_COPY_UNROLL ) {
47 | Y[i] = X[i];
48 | Y[i+1] = X[i+1];
49 | Y[i+2] = X[i+2];
50 | Y[i+3] = X[i+3];
51 | Y[i+4] = X[i+4];
52 | Y[i+5] = X[i+5];
53 | Y[i+6] = X[i+6];
54 | }
55 |
56 | } else {
57 | // General strides: ix and iy advance independently by INCX and INCY.
58 | HAXX_INT ix(0), iy(0);
59 | // FIXME: the original _COPY function has code here to handle
60 | // negative increments. Unsure on what that accomplishes
61 |
62 | for( i = 0; i < N; ++i, ix += INCX, iy += INCY ) Y[iy] = X[ix];
63 |
64 |
65 | }
66 | };
67 |
68 | }; // namespace HAXX
69 |
70 |
71 |
--------------------------------------------------------------------------------
/include/hblas/generic/hblas1/hblas_dotcv_impl.hpp:
--------------------------------------------------------------------------------
1 | /*
2 | * This file is a part of HAXX
3 | *
4 | * Copyright (c) 2017 David Williams-Young
5 | * All rights reserved.
6 | *
7 | * See LICENSE.txt
8 | */
9 |
10 | #pragma once
11 |
12 | #include "hblas/hblas1.hpp"
13 |
14 | namespace HAXX {
15 |
16 | /**
17 | * Written by DBWY (4/2017)
18 | *
19 | * Based on the BLAS implementation of ZDOTC by Jack Dongarra
20 | * http://www.netlib.org/lapack/explore-html/d6/db8/zdotc_8f.html
21 | *
22 | * \f$ r,x,y \in \mathbb{H}, \qquad r = \sum_i x^*_i y_i \f$
23 | */
24 | template
25 | quaternion<_F> HBLAS_DOTCV(const HAXX_INT N, quaternion<_F> * const X,
26 | const HAXX_INT INCX, quaternion<_F> * const Y, const HAXX_INT INCY) {
27 | // htemp is the running sum; it is returned still zero when N <= 0.
28 | quaternion<_F> htemp(0.,0.,0.,0.);
29 |
30 | if( N <= 0 ) return htemp;
31 |
32 | // FIXME: See further comments on negative stride
33 | assert(INCX > 0);
34 | assert(INCY > 0);
35 |
36 | HAXX_INT i;
37 |
38 | if( INCX == 1 and INCY == 1 ) {
39 | // Unit strides: the conjugated X element multiplies Y from the left.
40 | for( i = 0; i < N; ++i ) htemp += conj(X[i])*Y[i];
41 |
42 | } else {
43 | // General strides: ix and iy advance independently by INCX and INCY.
44 | HAXX_INT ix(0), iy(0);
45 | // FIXME: the original _DOTC function has code here to handle
46 | // negative increments. Unsure on what that accomplishes
47 |
48 | for( i = 0; i < N; ++i, ix += INCX, iy += INCY )
49 | htemp += conj(X[ix]) * Y[iy];
50 | }
51 |
52 | return htemp;
53 | };
54 |
55 |
56 | }; // namespace HAXX
57 |
--------------------------------------------------------------------------------
/include/hblas/generic/hblas1/hblas_dotuv_impl.hpp:
--------------------------------------------------------------------------------
1 | /*
2 | * This file is a part of HAXX
3 | *
4 | * Copyright (c) 2017 David Williams-Young
5 | * All rights reserved.
6 | *
7 | * See LICENSE.txt
8 | */
9 |
10 | #pragma once
11 |
12 | #include "hblas/hblas1.hpp"
13 |
14 | namespace HAXX {
15 |
16 | /**
17 | * Written by DBWY (4/2017)
18 | *
19 | * Based on the BLAS implementation of ZDOTU by Jack Dongarra
20 | * http://www.netlib.org/lapack/explore-html/db/d2d/zdotu_8f.html
21 | *
22 | * \f$ r,x,y \in \mathbb{H}, \qquad r = \sum_i x_i y_i \f$
23 | */
24 | template
25 | quaternion<_F> HBLAS_DOTUV(const HAXX_INT N, quaternion<_F> * const X,
26 | const HAXX_INT INCX, quaternion<_F> * const Y, const HAXX_INT INCY) {
27 | // htemp is the running sum; it is returned still zero when N <= 0.
28 | quaternion<_F> htemp(0.,0.,0.,0.);
29 | if( N <= 0 ) return htemp;
30 |
31 | // FIXME: See further comments on negative stride
32 | assert(INCX > 0);
33 | assert(INCY > 0);
34 |
35 | HAXX_INT i;
36 |
37 | if( INCX == 1 and INCY == 1 ) {
38 | // Unit strides: the X element multiplies Y from the left, without conjugation.
39 | for( i = 0; i < N; ++i ) htemp += X[i]*Y[i];
40 |
41 | } else {
42 | // General strides: ix and iy advance independently by INCX and INCY.
43 | HAXX_INT ix(0), iy(0);
44 | // FIXME: the original _DOTU function has code here to handle
45 | // negative increments. Unsure on what that accomplishes
46 |
47 | for( i = 0; i < N; ++i, ix += INCX, iy += INCY ) {
48 | htemp += X[ix] * Y[iy];
49 | }
50 | }
51 |
52 | return htemp;
53 | };
54 |
55 |
56 | }; // namespace HAXX
57 |
--------------------------------------------------------------------------------
/include/hblas/generic/hblas1/hblas_scalm_impl.hpp:
--------------------------------------------------------------------------------
1 | /*
2 | * This file is a part of HAXX
3 | *
4 | * Copyright (c) 2017 David Williams-Young
5 | * All rights reserved.
6 | *
7 | * See LICENSE.txt
8 | */
9 |
10 | #pragma once
11 |
12 | #include
13 |
14 | #include "hblas/hblas1.hpp"
15 |
16 |
17 | namespace HAXX {
18 |
19 | /**
20 | * Scale / Transpose / Conjugate a quaternion matrix in place
21 | *
22 | * Written by DBWY (9/2017)
23 | */
24 | template
25 | void HBLAS_SCALM(const char SIDE, const char TRANSA, const HAXX_INT M,
26 | const HAXX_INT N, const _AlphaF ALPHA, quaternion<_F> * const A,
27 | const HAXX_INT LDA, const HAXX_INT INCA) {
28 | // Non-positive N, M or INCA is a silent no-op (checked before the TRANSA assert).
29 | if( N <= 0 or M <= 0 or INCA <= 0 ) return;
30 | assert(TRANSA == 'N'); // Only supporting scaling for now
31 |
32 | HAXX_INT j;
33 | // locA points at the start of the current column; columns are LDA elements apart.
34 | quaternion<_F> *locA = A;
35 | // One HBLAS_SCALV call over M elements for each of the N columns.
36 | for( j = 0; j < N; j++ ) {
37 | HBLAS_SCALV(SIDE,M,ALPHA,locA,INCA);
38 | locA += LDA;
39 | }
40 |
41 |
42 | };
43 |
44 |
45 |
46 | }; // namespace HAXX
47 |
48 |
--------------------------------------------------------------------------------
/include/hblas/generic/hblas1/hblas_scalv_impl.hpp:
--------------------------------------------------------------------------------
1 | /*
2 | * This file is a part of HAXX
3 | *
4 | * Copyright (c) 2017 David Williams-Young
5 | * All rights reserved.
6 | *
7 | * See LICENSE.txt
8 | */
9 |
10 | #pragma once
11 |
12 | #include "hblas/hblas1.hpp"
13 | #include
14 |
15 | #define HAXX_SCAL_UNROLL 5
16 |
17 | namespace HAXX {
18 |
19 | /**
20 | * Scales a quaternion vector in place
21 | *
22 | * Written by DBWY (4/2017)
23 | *
24 | * Based on the BLAS implementation of DSCAL by Jack Dongarra
25 | * http://www.netlib.org/lapack/explore-html/d4/dd0/dscal_8f.html
26 | *
27 | * SIDE == 'L' (or any value other than 'R')
28 | *
29 | * \f$ x_i = \alpha x_i \f$
30 | *
31 | * SIDE == 'R'
32 | *
33 | * \f$ x_i = x_i \alpha \f$
34 | */
35 | template
36 | void HBLAS_SCALV(const char SIDE, const HAXX_INT N, const _AlphaF ALPHA,
37 | quaternion<_F> * const X, const HAXX_INT INCX) {
38 | // Non-positive N or INCX is treated as a no-op.
39 | if( N <= 0 or INCX <= 0 ) return;
40 |
41 | HAXX_INT i;
42 |
43 | // FIXME: Should write a specialization for real ALPHA where side
44 | // doesn't matter
45 | bool isR = SIDE == 'R';
46 | bool isL = not isR;
47 | // Unit stride: first the N % HAXX_SCAL_UNROLL leading elements, then groups of HAXX_SCAL_UNROLL.
48 | if( INCX == 1 ) {
49 | HAXX_INT m = N % HAXX_SCAL_UNROLL;
50 |
51 | if( m != 0 ) {
52 | if( isL ) for( i = 0; i < m; ++i ) X[i] = ALPHA * X[i];
53 | else for( i = 0; i < m; ++i ) X[i] = X[i] * ALPHA;
54 |
55 | if( N < HAXX_SCAL_UNROLL ) return;
56 | }
57 |
58 | // FIXME: This assumes HAXX_SCAL_UNROLL = 5
59 | if( isL )
60 | for( i = m; i < N; i += HAXX_SCAL_UNROLL ) {
61 | X[i] = ALPHA * X[i];
62 | X[i+1] = ALPHA * X[i+1];
63 | X[i+2] = ALPHA * X[i+2];
64 | X[i+3] = ALPHA * X[i+3];
65 | X[i+4] = ALPHA * X[i+4];
66 | }
67 | else
68 | for( i = m; i < N; i += HAXX_SCAL_UNROLL ) {
69 | X[i] = X[i] * ALPHA;
70 | X[i+1] = X[i+1] * ALPHA;
71 | X[i+2] = X[i+2] * ALPHA;
72 | X[i+3] = X[i+3] * ALPHA;
73 | X[i+4] = X[i+4] * ALPHA;
74 | }
75 | } else {
76 | // Strided case: i runs over the element offsets 0, INCX, ..., (N-1)*INCX.
77 | HAXX_INT NINCX = N*INCX;
78 | if( isL ) for( i = 0; i < NINCX; i += INCX ) X[i] = ALPHA * X[i];
79 | else for( i = 0; i < NINCX; i += INCX ) X[i] = X[i] * ALPHA;
80 |
81 | }
82 |
83 |
84 | };
85 |
86 |
87 |
88 | }; // namespace HAXX
89 |
90 |
--------------------------------------------------------------------------------
/include/hblas/generic/hblas1/hblas_swapv_impl.hpp:
--------------------------------------------------------------------------------
1 | /*
2 | * This file is a part of HAXX
3 | *
4 | * Copyright (c) 2017 David Williams-Young
5 | * All rights reserved.
6 | *
7 | * See LICENSE.txt
8 | */
9 |
10 | #pragma once
11 |
12 | #include "hblas/hblas1.hpp"
13 |
14 | #define HAXX_SWAP_UNROLL 3
15 |
16 | namespace HAXX {
17 |
18 | /**
19 | * Swaps the elements of two strided quaternion arrays of length N
20 | *
21 | * Written by DBWY (4/2017)
22 | *
23 | * Based on the BLAS implementation of DSWAP by Jack Dongarra
24 | * http://www.netlib.org/lapack/explore-html/db/dd4/dswap_8f.html
25 | */
26 | template
27 | void HBLAS_SWAPV(const HAXX_INT N, quaternion<_F> * const X,
28 | const HAXX_INT INCX, quaternion<_F> * const Y, const HAXX_INT INCY) {
29 | // N <= 0 is a no-op; both X and Y are modified otherwise.
30 | if( N <= 0 ) return;
31 | // FIXME: See further comments on negative stride
32 | assert(INCX > 0);
33 | assert(INCY > 0);
34 |
35 | HAXX_INT i;
36 | quaternion<_F> qtmp;
37 |
38 | if( INCX == 1 and INCY == 1 ) {
39 | HAXX_INT m = N % HAXX_SWAP_UNROLL;
40 |
41 | // Use unrolled loops for both unit increments
42 |
43 | // XXX: For some reason Z/CSWAP does not do this.
44 | // Cache utilization?
45 | if( m != 0 ) {
46 | for( i = 0; i < m; ++i ) {
47 | qtmp = X[i];
48 | X[i] = Y[i];
49 | Y[i] = qtmp;
50 | }
51 | if( N < HAXX_SWAP_UNROLL ) return;
52 | }
53 |
54 | // FIXME: This assumes HAXX_SWAP_UNROLL = 3
55 | for( i = m; i < N; i += HAXX_SWAP_UNROLL ) {
56 | qtmp = X[i];
57 | X[i] = Y[i];
58 | Y[i] = qtmp;
59 |
60 | qtmp = X[i+1];
61 | X[i+1] = Y[i+1];
62 | Y[i+1] = qtmp;
63 |
64 | qtmp = X[i+2];
65 | X[i+2] = Y[i+2];
66 | Y[i+2] = qtmp;
67 | }
68 |
69 | } else {
70 | // General strides: ix and iy advance independently by INCX and INCY.
71 | HAXX_INT ix(0), iy(0);
72 | // FIXME: the original _SWAP function has code here to handle
73 | // negative increments. Unsure on what that accomplishes
74 |
75 | for( i = 0; i < N; ++i, ix += INCX, iy += INCY ) {
76 | qtmp = X[ix];
77 | X[ix] = Y[iy];
78 | Y[iy] = qtmp;
79 | }
80 | }
81 |
82 | };
83 |
84 | }; // namespace HAXX
85 |
86 |
87 |
--------------------------------------------------------------------------------
/include/hblas/generic/hblas1/impl.hpp:
--------------------------------------------------------------------------------
1 | /*
2 | * This file is a part of HAXX
3 | *
4 | * Copyright (c) 2017 David Williams-Young
5 | * All rights reserved.
6 | *
7 | * See LICENSE.txt
8 | */
9 |
10 | #pragma once
11 |
12 | #include "hblas_swapv_impl.hpp" // HBLAS_SWAPV
13 | #include "hblas_scalv_impl.hpp" // HBLAS_SCALV
14 | #include "hblas_copyv_impl.hpp" // HBLAS_COPYV
15 | #include "hblas_axpyv_impl.hpp" // HBLAS_AXPYV
16 | #include "hblas_dotuv_impl.hpp" // HBLAS_DOTUV
17 | #include "hblas_dotcv_impl.hpp" // HBLAS_DOTCV
18 |
19 | #include "hblas_scalm_impl.hpp" // HBLAS_SCALM
20 | #include "hblas_axpym_impl.hpp" // HBLAS_AXPYM
21 |
--------------------------------------------------------------------------------
/include/hblas/generic/hblas2/hblas_gerc_impl.hpp:
--------------------------------------------------------------------------------
1 | /*
2 | * This file is a part of HAXX
3 | *
4 | * Copyright (c) 2017 David Williams-Young
5 | * All rights reserved.
6 | *
7 | * See LICENSE.txt
8 | */
9 |
10 | #ifndef __INCLUDED_HBLAS_GERC_IMPL_HPP
11 | #define __INCLUDED_HBLAS_GERC_IMPL_HPP
12 |
13 | #include "hblas/hblas2.hpp"
14 |
15 | namespace HAXX {
16 |
17 | /**
18 | * Written by DBWY (4/2017)
19 | *
20 | * Based on the BLAS implementation of ZGERC by
21 | * Jack Dongarra (Argonne)
22 | * Jeremy Du Croz (NAG)
23 | * Sven Hammarling (NAG)
24 | * Richard Hanson (Sandia)
25 | *
26 | * Performs the rank 1 operation
27 | *
28 | * \f$ A_{ij} = A_{ij} + \alpha x_i y^*_j \f$
29 | *
30 | * \f$ A \in \mathbb{H} \qquad x,y,\alpha \in \mathbb{R},\mathbb{C},\mathbb{H}\f$
31 | */
32 | // FIXME: In this implementation, it has been implied that scalars
33 | // will always multiply from the left. Should generalize in such a
34 | // way to allow flexibility in ALPHA
35 | template
37 | void HBLAS_GERC(const HAXX_INT M, const HAXX_INT N, const _AlphaF ALPHA,
38 | _LeftVecF * const X, const HAXX_INT INCX, _RightVecF * const Y,
39 | const HAXX_INT INCY, quaternion<_F> * const A, const HAXX_INT LDA) {
40 | // Quick return: an empty matrix or a zero ALPHA leaves A untouched.
41 | if( M == 0 or N == 0 or ALPHA == _AlphaF(0.)) return;
42 |
43 | // FIXME: The original BLAS implementation has logic to handle
44 | // negative strides. See further comments.
45 | assert( INCX > 0 );
46 | assert( INCY > 0 );
47 |
48 |
49 | HAXX_INT i, j, ix;
50 |
51 | // FIXME: This parameter is affected in the original BLAS
52 | // implementation by negative stride
53 | HAXX_INT JY = 0;
54 |
55 | // Columns whose Y entry compares equal to zero are skipped entirely.
56 |
57 | if( INCX == 1 ) {
58 |
59 | for( j = 0; j < N; ++j, JY += INCY )
60 | if( Y[JY] != _RightVecF(0.) ) {
61 | // Evaluated left to right: (ALPHA * x_i) * conj(y_j).
62 | for( i = 0; i < M; ++i ) {
63 | A[RANK2_INDX(i,j,LDA)] += ALPHA * X[i] * conj(Y[JY]);
64 | }
65 | }
66 |
67 | } else { // end INCX == 1
68 |
69 | // FIXME: This parameter is affected in the original BLAS
70 | // implementation by negative stride
71 | HAXX_INT KX = 0;
72 |
73 | for( j = 0; j < N; ++j, JY += INCY )
74 | if( Y[JY] != _RightVecF(0.) ) {
75 | // ix restarts at KX for every column and steps through X by INCX.
76 | for( i = 0, ix = KX; i < M; ++i, ix += INCX ) {
77 | A[RANK2_INDX(i,j,LDA)] += ALPHA * X[ix] * conj(Y[JY]);
78 | }
79 | }
80 | } // end INCX != 1
81 |
82 | }; // end GERC
83 |
84 |
85 |
86 | }; // namspace HAXX
87 |
88 | #endif
89 |
--------------------------------------------------------------------------------
/include/hblas/generic/hblas2/hblas_geru_impl.hpp:
--------------------------------------------------------------------------------
1 | /*
2 | * This file is a part of HAXX
3 | *
4 | * Copyright (c) 2017 David Williams-Young
5 | * All rights reserved.
6 | *
7 | * See LICENSE.txt
8 | */
9 |
10 | #ifndef __INCLUDED_HBLAS_GERU_IMPL_HPP
11 | #define __INCLUDED_HBLAS_GERU_IMPL_HPP
12 |
13 | #include "hblas/hblas2.hpp"
14 |
15 | namespace HAXX {
16 |
17 | /**
18 | * Written by DBWY (4/2017)
19 | *
20 | * Based on the BLAS implementation of ZGERU by
21 | * Jack Dongarra (Argonne)
22 | * Jeremy Du Croz (NAG)
23 | * Sven Hammarling (NAG)
24 | * Richard Hanson (Sandia)
25 | *
26 | * Performs the rank 1 operation
27 | *
28 | * \f$ A_{ij} = A_{ij} + \alpha x_i y_j \f$
29 | *
30 | * \f$ A \in \mathbb{H} \qquad x,y,\alpha \in \mathbb{R},\mathbb{C},\mathbb{H}\f$
31 | */
32 | // FIXME: In this implementation, it has been implied that scalars
33 | // will always multiply from the left. Should generalize in such a
34 | // way to allow flexibility in ALPHA
35 | template
37 | void HBLAS_GERU(const HAXX_INT M, const HAXX_INT N, const _AlphaF ALPHA,
38 | _LeftVecF * const X, const HAXX_INT INCX, _RightVecF * const Y,
39 | const HAXX_INT INCY, quaternion<_F> * const A, const HAXX_INT LDA) {
40 | // Quick return: an empty matrix or a zero ALPHA leaves A untouched.
41 | if( M == 0 or N == 0 or ALPHA == _AlphaF(0.)) return;
42 |
43 | // FIXME: The original BLAS implementation has logic to handle
44 | // negative strides. See further comments.
45 | assert( INCX > 0 );
46 | assert( INCY > 0 );
47 |
48 |
49 | HAXX_INT i, j, ix;
50 |
51 | // FIXME: This parameter is affected in the original BLAS
52 | // implementation by negative stride
53 | HAXX_INT JY = 0;
54 |
55 | // Columns whose Y entry compares equal to zero are skipped entirely.
56 |
57 | if( INCX == 1 ) {
58 |
59 | for( j = 0; j < N; ++j, JY += INCY )
60 | if( Y[JY] != _RightVecF(0.) ) {
61 | // Evaluated left to right: (ALPHA * x_i) * y_j.
62 | for( i = 0; i < M; ++i ) {
63 | A[RANK2_INDX(i,j,LDA)] += ALPHA * X[i] * Y[JY];
64 | }
65 | }
66 |
67 | } else { // end INCX == 1
68 |
69 | // FIXME: This parameter is affected in the original BLAS
70 | // implementation by negative stride
71 | HAXX_INT KX = 0;
72 |
73 | for( j = 0; j < N; ++j, JY += INCY )
74 | if( Y[JY] != _RightVecF(0.) ) {
75 | // ix restarts at KX for every column and steps through X by INCX.
76 | for( i = 0, ix = KX; i < M; ++i, ix += INCX ) {
77 | A[RANK2_INDX(i,j,LDA)] += ALPHA * X[ix] * Y[JY];
78 | }
79 | }
80 | } // end INCX != 1
81 |
82 | }; // end GERU
83 |
84 |
85 |
86 | }; // namespace HAXX
87 |
88 | #endif
89 |
--------------------------------------------------------------------------------
/include/hblas/generic/hblas2/impl.hpp:
--------------------------------------------------------------------------------
1 | /*
2 | * This file is a part of HAXX
3 | *
4 | * Copyright (c) 2017 David Williams-Young
5 | * All rights reserved.
6 | *
7 | * See LICENSE.txt
8 | */
9 |
10 | #ifndef __INCLUDED_HBLAS2_IMPL_HPP
11 | #define __INCLUDED_HBLAS2_IMPL_HPP
12 |
13 | #include "hblas_gemv_impl.hpp"
14 | #include "hblas_geru_impl.hpp"
15 | #include "hblas_gerc_impl.hpp"
16 |
17 | #endif
18 |
--------------------------------------------------------------------------------
/include/hblas/generic/hblas3/impl.hpp:
--------------------------------------------------------------------------------
1 | /*
2 | * This file is a part of HAXX
3 | *
4 | * Copyright (c) 2017 David Williams-Young
5 | * All rights reserved.
6 | *
7 | * See LICENSE.txt
8 | */
9 |
10 | #ifndef __INCLUDED_HBLAS3_IMPL_HPP
11 | #define __INCLUDED_HBLAS3_IMPL_HPP
12 |
13 | #include "hblas_gemm_impl.hpp"
14 |
15 | #endif
16 |
--------------------------------------------------------------------------------
/include/hblas/generic/util/contract_impl.hpp:
--------------------------------------------------------------------------------
1 | /*
2 | * This file is a part of HAXX
3 | *
4 | * Copyright (c) 2017 David Williams-Young
5 | * All rights reserved.
6 | *
7 | * See LICENSE.txt
8 | */
9 |
10 | #ifndef __INCLUDED_HBLAS_UTIL_CONTRACT_IMPL_HPP
11 | #define __INCLUDED_HBLAS_UTIL_CONTRACT_IMPL_HPP
12 |
13 | #include
14 | #include "hblas/hblas_util.hpp"
15 |
16 | namespace HAXX {
17 |
18 | template <>
19 | void HBLAS_COMPLEX_CONTRACT(char ORDER, char UPLO, HAXX_INT M, HAXX_INT N,
20 | quaternion *A, HAXX_INT LDA, std::complex *B,
21 | HAXX_INT LDB) {
22 | // ORDER selects the FORTRAN routine: 'F' calls hzcon1_, 'S' calls hzcon2_.
23 | assert( ORDER == 'F' or ORDER == 'S' );
24 | // All arguments are forwarded unchanged; scalars are passed by address.
25 | if( ORDER == 'F' )
26 | hzcon1_(&UPLO,&M,&N,A,&LDA,B,&LDB);
27 | else
28 | hzcon2_(&UPLO,&M,&N,A,&LDA,B,&LDB);
29 |
30 | }
31 |
32 |
33 | }; // namespace HAXX
34 |
35 | #endif
36 |
--------------------------------------------------------------------------------
/include/hblas/generic/util/expand_impl.hpp:
--------------------------------------------------------------------------------
1 | /*
2 | * This file is a part of HAXX
3 | *
4 | * Copyright (c) 2017 David Williams-Young
5 | * All rights reserved.
6 | *
7 | * See LICENSE.txt
8 | */
9 |
10 | #ifndef __INCLUDED_HBLAS_UTIL_EXPAND_IMPL_HPP
11 | #define __INCLUDED_HBLAS_UTIL_EXPAND_IMPL_HPP
12 |
13 | #include
14 | #include "hblas/hblas_util.hpp"
15 |
16 | namespace HAXX {
17 |
18 | template <>
19 | void HBLAS_COMPLEX_EXPAND(char ORDER, HAXX_INT M, HAXX_INT N,
20 | quaternion *A, HAXX_INT LDA, std::complex *B,
21 | HAXX_INT LDB) {
22 | // ORDER selects the FORTRAN routine: 'F' calls hzexp1_, 'S' calls hzexp2_.
23 | assert( ORDER == 'F' or ORDER == 'S' );
24 | // All arguments are forwarded unchanged; scalars are passed by address.
25 | if( ORDER == 'F' )
26 | hzexp1_(&M,&N,A,&LDA,B,&LDB);
27 | else
28 | hzexp2_(&M,&N,A,&LDA,B,&LDB);
29 |
30 | }
31 |
32 | template <>
33 | void HBLAS_REAL_EXPAND(HAXX_INT M, HAXX_INT N, quaternion *A,
34 | HAXX_INT LDA, double *B, HAXX_INT LDB) {
35 | // Thin wrapper: forwards every argument to the FORTRAN routine hdexp_, scalars by address.
36 | hdexp_(&M,&N,A,&LDA,B,&LDB);
37 |
38 | }
39 |
40 | }; // namespace HAXX
41 |
42 | #endif
43 |
--------------------------------------------------------------------------------
/include/hblas/generic/util/impl.hpp:
--------------------------------------------------------------------------------
1 | /*
2 | * This file is a part of HAXX
3 | *
4 | * Copyright (c) 2017 David Williams-Young
5 | * All rights reserved.
6 | *
7 | * See LICENSE.txt
8 | */
9 |
10 | #ifndef __INCLUDED_HBLAS_UTIL_IMPL_HPP
11 | #define __INCLUDED_HBLAS_UTIL_IMPL_HPP
12 |
13 | #include "expand_impl.hpp"
14 | #include "contract_impl.hpp"
15 |
16 | #endif
17 |
18 |
--------------------------------------------------------------------------------
/include/hblas/hblas1.hpp:
--------------------------------------------------------------------------------
1 | /*
2 | * This file is a part of HAXX
3 | *
4 | * Copyright (c) 2017 David Williams-Young
5 | * All rights reserved.
6 | *
7 | * See LICENSE.txt
8 | */
9 |
10 | #ifndef __INCLUDED_HBLAS1_HPP
11 | #define __INCLUDED_HBLAS1_HPP
12 |
13 | #include "haxx/haxx_def.hpp"
14 |
15 | // Preprocessor macros for quick FORTRAN declarations
16 |
17 | #define SCAL_FORTRAN_DECL(NAME,F,ALPHAF) \
18 | void NAME##_(const char*, const HAXX_INT*, const ALPHAF*, \
19 | const quaternion *, const HAXX_INT*);
20 |
21 | #define DOT_FORTRAN_DECL(NAME,F) \
22 | void NAME##_(const quaternion *, const HAXX_INT*, const quaternion *,\
23 | const HAXX_INT*, const quaternion *, const HAXX_INT*);
24 |
25 | #define AXPY_FORTRAN_DECL(NAME,F,XF,ALPHAF)\
26 | void NAME##_(const char*, const HAXX_INT*, const ALPHAF*, const XF *, \
27 | const HAXX_INT*, const quaternion *, const HAXX_INT*);
28 |
29 |
30 |
31 |
32 |
33 | namespace HAXX {
34 |
35 |
36 | // FORTRAN HBLAS1 functions
37 | extern "C" {
38 |
39 | // SCAL functions
40 | SCAL_FORTRAN_DECL(hscald,double,double);
41 | SCAL_FORTRAN_DECL(hscalc,double,std::complex);
42 | SCAL_FORTRAN_DECL(hscalh,double,quaternion);
43 |
44 | // DOT functions
45 | DOT_FORTRAN_DECL(hdotu,double);
46 | DOT_FORTRAN_DECL(hdotc,double);
47 |
48 | // AXPY functions
49 | AXPY_FORTRAN_DECL(haxpydh,double,quaternion,double);
50 | AXPY_FORTRAN_DECL(haxpych,double,quaternion,std::complex);
51 | AXPY_FORTRAN_DECL(haxpyhh,double,quaternion,quaternion);
52 |
53 | };
54 |
55 |
56 | /**
57 | * \addtogroup HBLAS
58 | * @{
59 | *
60 | *
61 | * @defgroup HBLAS1 Level 1 HBLAS
62 | * Level 1 BLAS operations over quaternion numbers
63 | *
64 | * @{
65 | */
66 |
67 |
68 | /**
69 | * @defgroup HBLAS1V Level 1 HBLASV
70 | * Level 1 BLAS operations on quaternion vectors
71 | *
72 | * @{
73 | */
74 |
75 | /// Swap the states of two quaternion arrays
76 | template
77 | void HBLAS_SWAPV(const HAXX_INT N, quaternion<_F> * const X,
78 | const HAXX_INT INCX, quaternion<_F> * const Y, const HAXX_INT INCY);
79 |
80 | /// Copy a quaternion array to another quaternion array
81 | template
82 | void HBLAS_COPYV(const HAXX_INT N, quaternion<_F> * const X,
83 | const HAXX_INT INCX, quaternion<_F> * const Y, const HAXX_INT INCY);
84 |
85 | /// Scale a quaternion array by a scalar
86 | template
87 | void HBLAS_SCALV(const char SIDE, const HAXX_INT N, const _AlphaF ALPHA,
88 | quaternion<_F> * const X, const HAXX_INT INCX);
89 |
90 | /// Scale a quaternion array and add it to another quaternion array
91 | template
92 | void HBLAS_AXPYV(const char SIDE, const HAXX_INT N, const _AlphaF ALPHA,
93 | _XF * const X, const HAXX_INT INCX, quaternion<_F> * const Y,
94 | const HAXX_INT INCY);
95 |
96 | /// Perform an unaltered dot product of two quaternion arrays
97 | template
98 | quaternion<_F> HBLAS_DOTUV(const HAXX_INT N, quaternion<_F> * const X,
99 | const HAXX_INT INCX, quaternion<_F> * const Y, const HAXX_INT INCY);
100 |
101 | /// Obtain the inner product of two quaternion arrays
102 | template
103 | quaternion<_F> HBLAS_DOTCV(const HAXX_INT N, quaternion<_F> * const X,
104 | const HAXX_INT INCX, quaternion<_F> * const Y, const HAXX_INT INCY);
105 |
106 | /* @} */ // HBLAS1V
107 |
108 |
109 | /**
110 | * @defgroup HBLAS1M Level 1 HBLASM
111 | * Level 1 BLAS operations on quaternion matrices (ala BLIS)
112 | *
113 | * @{
114 | */
115 |
116 | /// Scale / Transpose / Conjugate a quaternion matrix in place
117 | template
118 | void HBLAS_SCALM(const char SIDE, const char TRANSA, const HAXX_INT M,
119 | const HAXX_INT N, const _AlphaF ALPHA, quaternion<_F> * const A,
120 | const HAXX_INT LDA, const HAXX_INT INCA);
121 |
122 | /// Accumulate a quaternion matrix
123 | template
124 | void HBLAS_AXPYM(const char SIDE, const char TRANSA, const HAXX_INT M,
125 | const HAXX_INT N, const _AlphaF ALPHA, _XF * const A, const HAXX_INT LDA,
126 | const HAXX_INT INCA, quaternion<_F> * const B, const HAXX_INT LDB,
127 | const HAXX_INT INCB);
128 |
129 | /* @} */ // HBLAS1M
130 |
131 | /* @} */ // HBLAS1
132 |
133 | /* @} */ // HBLAS
134 |
135 | }; // namespace HAXX
136 |
137 | #endif
138 |
--------------------------------------------------------------------------------
/include/hblas/hblas2.hpp:
--------------------------------------------------------------------------------
1 | /*
2 | * This file is a part of HAXX
3 | *
4 | * Copyright (c) 2017 David Williams-Young
5 | * All rights reserved.
6 | *
7 | * See LICENSE.txt
8 | */
9 |
10 | #ifndef __INCLUDED_HBLAS2_HPP
11 | #define __INCLUDED_HBLAS2_HPP
12 |
13 | #include "haxx/haxx_def.hpp"
14 | #include "util/macro.hpp"
15 |
16 | #define GEMV_FORTRAN_DECL(NAME,F,MATF,VECF,ALPHAF,BETAF) \
17 | void NAME##_(const char*, const HAXX_INT*, const HAXX_INT*, const ALPHAF*,\
18 | const MATF*, const HAXX_INT*, const VECF*, const HAXX_INT*, const BETAF*, \
19 | const quaternion*, const HAXX_INT*);
20 | // Declares the FORTRAN rank 1 update routine NAME##_ (used below for the hgeru*/hgerc* families).
21 | #define GER_FORTRAN_DECL(NAME,F,LEFTF,RIGHTF,ALPHAF) \
22 | void NAME##_(const HAXX_INT*, const HAXX_INT*, const ALPHAF*,\
23 | const LEFTF*, const HAXX_INT*, const RIGHTF*, const HAXX_INT*, \
24 | const quaternion*, const HAXX_INT*);
25 |
26 | namespace HAXX {
27 |
28 | // FORTRAN HBLAS2 functions
29 | extern "C" {
30 |
31 | // GEMV functions
32 |
33 | GEMV_FORTRAN_DECL(hgemvdd,double,quaternion,quaternion,
34 | double, double);
35 | GEMV_FORTRAN_DECL(hgemvdz,double,quaternion,quaternion,
36 | double, std::complex);
37 | GEMV_FORTRAN_DECL(hgemvdh,double,quaternion,quaternion,
38 | double, quaternion);
39 |
40 | GEMV_FORTRAN_DECL(hgemvzd,double,quaternion,quaternion,
41 | std::complex,double);
42 | GEMV_FORTRAN_DECL(hgemvzz,double,quaternion,quaternion,
43 | std::complex,std::complex);
44 | GEMV_FORTRAN_DECL(hgemvzh,double,quaternion,quaternion,
45 | std::complex, quaternion);
46 |
47 | GEMV_FORTRAN_DECL(hgemvhd,double,quaternion,quaternion,
48 | quaternion,double);
49 | GEMV_FORTRAN_DECL(hgemvhz,double,quaternion,quaternion,
50 | quaternion,std::complex);
51 | GEMV_FORTRAN_DECL(hgemvhh,double,quaternion,quaternion,
52 | quaternion, quaternion);
53 |
54 |
55 |
56 | // GERU functions
57 |
58 | GER_FORTRAN_DECL(hgerud,double,quaternion,quaternion,
59 | double)
60 | GER_FORTRAN_DECL(hgeruz,double,quaternion,quaternion,
61 | std::complex)
62 | GER_FORTRAN_DECL(hgeruh,double,quaternion,quaternion,
63 | quaternion)
64 |
65 | // GERC functions
66 |
67 | GER_FORTRAN_DECL(hgercd,double,quaternion,quaternion,
68 | double)
69 | GER_FORTRAN_DECL(hgercz,double,quaternion,quaternion,
70 | std::complex)
71 | GER_FORTRAN_DECL(hgerch,double,quaternion,quaternion,
72 | quaternion)
73 |
74 | };
75 |
76 | /**
77 | * \addtogroup HBLAS
78 | * @{
79 | *
80 | *
81 | * @defgroup HBLAS2 Level 2 HBLAS
82 | * Level 2 BLAS operations over quaternion numbers
83 | *
84 | * @{
85 | */
86 |
87 | /// Multiply a general vector by a quaternion matrix
88 | template
90 | void HBLAS_GEMV(const char TRANS, const HAXX_INT M, const HAXX_INT N,
91 | const _AlphaF ALPHA, _MatF * const A, const HAXX_INT LDA, _VecF * const X,
92 | const HAXX_INT INCX, const _BetaF BETA, quaternion<_F> * const Y,
93 | const HAXX_INT INCY);
94 |
95 | /// Perform the quaternion rank 1 operation with two general vectors
96 | template
98 | void HBLAS_GERU(const HAXX_INT M, const HAXX_INT N, const _AlphaF ALPHA,
99 | _LeftVecF * const X, const HAXX_INT INCX, _RightVecF * const Y,
100 | const HAXX_INT INCY, quaternion<_F> * const A, const HAXX_INT LDA);
101 |
102 | /// Perform the quaternion rank 1 operation with the second vector conjugated
103 | template
105 | void HBLAS_GERC(const HAXX_INT M, const HAXX_INT N, const _AlphaF ALPHA,
106 | _LeftVecF * const X, const HAXX_INT INCX, _RightVecF * const Y,
107 | const HAXX_INT INCY, quaternion<_F> * const A, const HAXX_INT LDA);
108 |
109 |
110 | /* @} */ // HBLAS2
111 |
112 | /* @} */ // HBLAS
113 | }; // namespace HAXX
114 |
115 | #endif
116 |
--------------------------------------------------------------------------------
/include/hblas/hblas3.hpp:
--------------------------------------------------------------------------------
1 | /*
2 | * This file is a part of HAXX
3 | *
4 | * Copyright (c) 2017 David Williams-Young
5 | * All rights reserved.
6 | *
7 | * See LICENSE.txt
8 | */
9 |
10 | #ifndef __INCLUDED_HBLAS3_HPP
11 | #define __INCLUDED_HBLAS3_HPP
12 |
13 | #include "haxx/haxx_def.hpp"
14 |
15 | #define GEMM_FORTRAN_DECL(NAME,F,AMATF,BMATF,ALPHAF,BETAF) \
16 | void NAME##_(const char*, const char*, const HAXX_INT*, const HAXX_INT*,\
17 | const HAXX_INT*, const ALPHAF*, const AMATF*, const HAXX_INT*,\
18 | const BMATF*, const HAXX_INT*, const BETAF*, const quaternion*,\
19 | const HAXX_INT*);
20 |
21 |
22 | namespace HAXX {
23 |
24 | // FORTRAN HBLAS3 Functions
25 | extern "C" {
26 | // Naming: hgemmXY, where X encodes the ALPHAF type and Y the BETAF type
27 | // (d = double, z = std::complex, h = quaternion), as the argument lists below show.
28 | // GEMM functions
29 |
30 | GEMM_FORTRAN_DECL(hgemmdd,double,quaternion,quaternion,
31 | double,double);
32 | GEMM_FORTRAN_DECL(hgemmdz,double,quaternion,quaternion,
33 | double,std::complex);
34 | GEMM_FORTRAN_DECL(hgemmdh,double,quaternion,quaternion,
35 | double,quaternion);
36 |
37 | GEMM_FORTRAN_DECL(hgemmzd,double,quaternion,quaternion,
38 | std::complex,double);
39 | GEMM_FORTRAN_DECL(hgemmzz,double,quaternion,quaternion,
40 | std::complex,std::complex);
41 | GEMM_FORTRAN_DECL(hgemmzh,double,quaternion,quaternion,
42 | std::complex,quaternion);
43 |
44 | GEMM_FORTRAN_DECL(hgemmhd,double,quaternion,quaternion,
45 | quaternion,double);
46 | GEMM_FORTRAN_DECL(hgemmhz,double,quaternion,quaternion,
47 | quaternion,std::complex);
48 | GEMM_FORTRAN_DECL(hgemmhh,double,quaternion,quaternion,
49 | quaternion,quaternion);
50 |
51 | };
52 |
53 | /**
54 | * \addtogroup HBLAS
55 | * @{
56 | *
57 | *
58 | * @defgroup HBLAS3 Level 3 HBLAS
59 | * Level 3 BLAS operations over quaternion numbers
60 | *
61 | * @{
62 | */
63 |
64 | /// Multiply a quaternion matrix by a quaternion matrix (GEMM); C is quaternion<_F>, while A, B, ALPHA and BETA are typed by _AMatF, _BMatF, _AlphaF and _BetaF
65 | template
67 | void HBLAS_GEMM(const char TRANSA, const char TRANSB, const HAXX_INT M,
68 | const HAXX_INT N, const HAXX_INT K, const _AlphaF ALPHA, _AMatF * const A,
69 | const HAXX_INT LDA, _BMatF * const B, const HAXX_INT LDB,
70 | const _BetaF BETA, quaternion<_F> * const C, const HAXX_INT LDC);
71 |
72 | /* @} */ // HBLAS3
73 |
74 | /* @} */ // HBLAS
75 |
76 | }; // namespace HAXX
77 |
78 | #endif
79 |
--------------------------------------------------------------------------------
/include/hblas/hblas_util.hpp:
--------------------------------------------------------------------------------
1 | /*
2 | * This file is a part of HAXX
3 | *
4 | * Copyright (c) 2017 David Williams-Young
5 | * All rights reserved.
6 | *
7 | * See LICENSE.txt
8 | */
9 |
10 | #ifndef __INCLUDED_HBLAS_UTIL_HPP
11 | #define __INCLUDED_HBLAS_UTIL_HPP
12 |
13 | #include "haxx/haxx_def.hpp"
14 |
15 | extern "C" {
16 | // Externally defined routines with C linkage; presumably the Fortran back ends of the HBLAS_*_EXPAND / HBLAS_COMPLEX_CONTRACT templates declared below -- TODO confirm
17 | void hzexp1_(const int*, const int*, HAXX::quaternion*, const int*,
18 | std::complex*, const int*);
19 | void hzexp2_(const int*, const int*, HAXX::quaternion*, const int*,
20 | std::complex*, const int*);
21 | void hdexp_(const int*, const int*, HAXX::quaternion*, const int*,
22 | double*, const int*);
23 | // NOTE(review): integer arguments are declared as int here, whereas hblas3.hpp declares its Fortran routines with HAXX_INT -- confirm the two agree
24 | void hzcon1_(const char*, const int*, const int*, HAXX::quaternion*,
25 | const int*, std::complex*, const int*);
26 | void hzcon2_(const char*, const int*, const int*, HAXX::quaternion*,
27 | const int*, std::complex*, const int*);
28 | }
29 |
30 | namespace HAXX {
31 |
32 | /**
33 | * \addtogroup HBLAS
34 | * @{
35 | *
36 | *
37 | * @defgroup HBLAS_UTIL HBLAS Utilities
38 | * Utility functions for HBLAS
39 | *
40 | * @{
41 | */
42 |
43 |
44 | /// Expand a quaternion matrix to a complex matrix
45 | template
46 | void HBLAS_COMPLEX_EXPAND(char ORDER, HAXX_INT M, HAXX_INT N,
47 | quaternion<_F> *A, HAXX_INT LDA, std::complex<_F> *B, HAXX_INT LDB);
48 |
49 | /// Expand a quaternion matrix to a real matrix
50 | template
51 | void HBLAS_REAL_EXPAND(HAXX_INT M, HAXX_INT N, quaternion<_F> *A,
52 | HAXX_INT LDA, _F *B, HAXX_INT LDB);
53 |
54 | /// Contract a complex matrix (of proper symmetry) to a quaternion
55 | /// matrix
56 | template
57 | void HBLAS_COMPLEX_CONTRACT(char ORDER, char UPLO, HAXX_INT M, HAXX_INT N,
58 | quaternion<_F> *A, HAXX_INT LDA, std::complex<_F> *B, HAXX_INT LDB);
59 |
60 |
61 | /* @} */ // HBLAS_UTIL
62 |
63 | /* @} */ // HBLAS
64 |
65 | }; // namespace HAXX
66 |
67 | #endif
68 |
--------------------------------------------------------------------------------
/include/hblas/pack/packops.hpp:
--------------------------------------------------------------------------------
1 | /*
2 | * This file is a part of HAXX
3 | *
4 | * Copyright (c) 2017 David Williams-Young
5 | * All rights reserved.
6 | *
7 | * See LICENSE.txt
8 | */
9 |
10 | #ifndef __INCLUDED_HBLAS_PACK_PACKOPS_HPP
11 | #define __INCLUDED_HBLAS_PACK_PACKOPS_HPP
12 |
13 | #include "haxx.hpp"
14 | #include "util/types.hpp"
15 | #include "util/simd.hpp"
16 |
17 | namespace HAXX {
18 |
19 |
20 | /**
21 | * \brief Generic implementation of packing operations used in
22 | * the packing utilities.
23 | *
24 | * Will work for an arbitrary data type
25 | */
26 | template >
27 | struct GenericPackOps {
28 |
29 | typedef _TypeWrapper TypeWrapper;
30 |
31 | typedef typename _TypeWrapper::load_t load_t;
32 | typedef typename _TypeWrapper::noscalar_t noscalar_t;
33 |
34 |
35 | /**
36 | * Generic implementation of scaling operations used prior to internal
37 | * operations.
38 | */
39 | template
40 | static load_t preOP(load_t &x, U &alpha){ return Mul(alpha,x); }
41 | // NOTE(review): Mul is called unqualified above, whereas the AVX specialization further down calls TypeWrapper::Mul -- confirm which Mul is intended
42 | /// Load through TypeWrapper::Load(args...) and immediately apply preOP with alpha.
43 | template
44 | static load_t Load(U &alpha, Args... args) {
45 |
46 | auto x = TypeWrapper::Load(args...);
47 | return preOP(x,alpha);
48 |
49 | }
50 |
51 | /**
52 | * Generic implementation of scaling operations used prior to internal
53 | * operations. Case when no scaling is needed.
54 | */
55 | // NOTE(review): no declaration follows the comment above; a preOP overload for the no-scalar case appears to be absent from this struct -- confirm
56 | /// Generic implementation of (no-op) internal packing operation: returns t unchanged.
57 | template
58 | static Tuple OP(Tuple &t){ return t; }
59 |
60 | };
61 |
62 |
63 | /**
64 | * \brief Generic wrapper around a packing implementation which factors
65 | * a conjugation operation into the preOP function prior to scaling.
66 | *
67 | * The passed TypeWrapper must provide static Conj and Load functions (both are called below).
68 | */
69 | template < typename T, typename _TypeWrapper = GenericType,
70 | template class PackOps = GenericPackOps >
71 | struct ConjPackOps : public PackOps {
72 |
73 | typedef typename PackOps::load_t load_t;
74 | /// Conjugate x with _TypeWrapper::Conj, then defer to PackOps::preOP for the scaling.
75 | template
76 | static load_t preOP(load_t &x, U &alpha){
77 | auto y = _TypeWrapper::Conj(x);
78 | return PackOps::preOP(y,alpha);
79 | }
80 | /// Load through _TypeWrapper::Load(args...), then apply the conjugating preOP defined above.
81 | template
82 | static load_t Load(U &alpha, Args... args) {
83 |
84 | auto x = _TypeWrapper::Load(args...);
85 | return preOP(x,alpha);
86 |
87 | }
88 |
89 | };
90 |
91 |
92 |
93 | /**
94 | * \brief Specialization of GenericPackOps for double precision
95 | * quaternions on AVX / AVX2
96 | */
97 | template<>
98 | struct GenericPackOps< quaternion, AVXType > {
99 |
100 | typedef quaternion qd;
101 | typedef AVXType TypeWrapper;
102 |
103 | typedef typename TypeWrapper::noscalar_t noscalar_t;
104 | typedef typename TypeWrapper::real_t real_t;
105 | typedef typename TypeWrapper::complex_t complex_t;
106 | typedef typename TypeWrapper::quaternion_t quaternion_t;
107 | typedef typename TypeWrapper::load_t load_t;
108 |
109 | /// Scale the loaded register x by z through TypeWrapper::Mul(z,x).
110 | template
111 | static inline __m256d preOP(__m256d &x, U &z){ return TypeWrapper::Mul(z,x); }
112 | /// Load through TypeWrapper::Load(args...), then apply preOP with alpha.
113 | template
114 | static load_t Load(U &alpha, Args... args) {
115 |
116 | auto x = TypeWrapper::Load(args...);
117 | return preOP(x,alpha);
118 |
119 | }
120 | // NOTE(review): unlike the primary template, no OP member is declared in this specialization.
121 |
122 | };
123 |
124 |
125 |
126 | // Forward decl of specialized packing operations
127 | // Both templates default to T = quaternion and _TypeWrapper = AVXType.
128 |
129 | template < typename T = quaternion,
130 | typename _TypeWrapper = AVXType > struct GenericPackOps_T1;
131 |
132 | template < typename T = quaternion,
133 | typename _TypeWrapper = AVXType > struct GenericPackOps_T2;
134 |
135 | // Alias templates built on ConjPackOps; presumably wrapping GenericPackOps_T1 / GenericPackOps_T2 respectively -- TODO confirm
136 | template ,
137 | typename _TypeWrapper = AVXType>
138 | using ConjPackOps_T1 = ConjPackOps;
139 |
140 | template ,
141 | typename _TypeWrapper = AVXType>
142 | using ConjPackOps_T2 = ConjPackOps;
143 |
144 |
145 |
146 |
147 | /**
148 | * Factor expensive SIMD permutation into packing for LHS
149 | * of quaternion--quaternion matrix product.
150 | *
151 | * Specializes the GenericPackOps for quaternion on
152 | * AVX / AVX2
153 | */
154 | template<>
155 | struct GenericPackOps_T1< quaternion, AVXType >:
156 | public GenericPackOps< quaternion, AVXType > {
157 | // Pair of 256-bit registers processed together by OP.
158 | using twoTuple = std::tuple<__m256d,__m256d>;
159 | // Regroup 128-bit halves: result 0 = { low half of t0, low half of t1 }, result 1 = { high half of t0, high half of t1 }.
160 | static inline twoTuple OP( twoTuple &t ) {
161 | return std::make_tuple(
162 | _mm256_permute2f128_pd(std::get<0>(t),std::get<1>(t), 0x20), // 0x20: low 128 bits of each input
163 | _mm256_permute2f128_pd(std::get<0>(t),std::get<1>(t), 0x31) // 0x31: high 128 bits of each input
164 | );
165 | }
166 |
167 | };
168 |
169 |
170 |
171 |
172 | /**
173 | * Factor expensive SIMD unpacking into packing for RHS
174 | * of quaternion--quaternion matrix product.
175 | *
176 | * Specializes the GenericPackOps for quaternion on
177 | * AVX / AVX2
178 | */
179 | template<>
180 | struct GenericPackOps_T2< quaternion, AVXType >:
181 | public GenericPackOps< quaternion, AVXType > {
182 | // Pair of 256-bit registers processed together by OP.
183 | using twoTuple = std::tuple<__m256d,__m256d>;
184 | // Interleave doubles: with a = t0 and b = t1, result 0 = { a0, b0, a2, b2 } and result 1 = { a1, b1, a3, b3 }.
185 | static inline twoTuple OP( twoTuple &t ) {
186 | return std::make_tuple(
187 | _mm256_unpacklo_pd(std::get<0>(t),std::get<1>(t)), // even-indexed elements of each input
188 | _mm256_unpackhi_pd(std::get<0>(t),std::get<1>(t)) // odd-indexed elements of each input
189 | );
190 | }
191 |
192 | };
193 |
194 |
195 | }; // namespace HAXX
196 |
197 | #endif
198 |
--------------------------------------------------------------------------------
/include/hblas/pack/typewrapper.hpp:
--------------------------------------------------------------------------------
1 | /*
2 | * This file is a part of HAXX
3 | *
4 | * Copyright (c) 2017 David Williams-Young
5 | * All rights reserved.
6 | *
7 | * See LICENSE.txt
8 | */
9 |
10 | #ifndef __INCLUDED_HBLAS_PACK_TYPEWRAPPER_HPP
11 | #define __INCLUDED_HBLAS_PACK_TYPEWRAPPER_HPP
12 |
13 | #include "haxx.hpp"
14 | #include "util/simd.hpp"
15 |
16 | namespace HAXX {
17 |
18 |
19 | /**
20 | * \brief Generic implementation of scalar operations used in
21 | * the packing utilities.
22 | *
23 | * Will work with any scalar data type.
24 | */
25 | template
26 | struct GenericTypeWrapper {
27 |
28 | typedef T load_t; ///< Load variable typename
29 |
30 | /// Struct to handle the case when no scalar is passed to
31 | /// the packing utility
32 | struct noscalar_t {};
33 |
34 |
35 | /// Return a copy of the passed scalar without modification
36 | template
37 | static const U cacheScalar( U &alpha ){ return alpha; }
38 |
39 | /// No scalar is passed: return an empty noscalar_t tag object
40 | static noscalar_t cacheScalar(){ return noscalar_t{}; }
41 |
42 |
43 | /// Generic, typesafe implementation of conjugation (forwards to SmartConj, which is defined outside this struct)
44 | template
45 | static U conjOp( U &t ) { return SmartConj(t); }
46 |
47 | };
48 |
49 |
50 | /**
51 | * \brief AVX / AVX2 (256-bit vector length) implementation
52 | * of scalar operations used in packing utilities.
53 | *
54 | * Currently only viable for quaternion packing (FIXME)
55 | */
56 | struct AVX64BitTypeWrapper {
57 |
58 | typedef __m256d load_t; ///< Load variable typename
59 |
60 | struct noscalar_t {}; ///< No scalar passed
61 | struct real_t { __m256d x; }; ///< Real (double)
62 | struct complex_t { __m256d x; __m256d y; }; ///< Complex (double)
63 | struct quaternion_t { __m256d x; }; ///< Quaternion (double)
64 |
65 |
66 |
67 | /// No scalar passed to packing utility
68 | static inline noscalar_t cacheScalar(){ return noscalar_t(); }
69 |
70 | /// Real (double) scalar passed to packing utility (bcast to __m256d)
71 | static inline const real_t cacheScalar( double &alpha ) {
72 | return real_t{_mm256_broadcast_sd(&alpha)};
73 | }
74 |
75 | /**
76 | * Complex (double) scalar passed to packing utility.
77 | * Load as
78 | * {
79 | * { x, x },
80 | * { -x*, -x* }
81 | * }
82 | *
83 | */
84 | static inline const complex_t
85 | cacheScalar(std::complex &ALPHA) {
86 | // Sign-bit mask covering the odd elements (1 and 3) of a 256-bit vector of doubles
87 | const __m256i maskConj = _mm256_set_epi64x(
88 | 0x8000000000000000, 0,
89 | 0x8000000000000000, 0 );
90 | // alpha = { Re, Im, Re, Im }; alpha_C swaps each pair and negates the odd elements: { Im, -Re, Im, -Re }. NOTE(review): confirm this is the layout meant by "-x*" above
91 | __m128d alphaC = LOAD_128D_UNALIGNED_AS(double,&ALPHA);
92 | __m256d alpha = SET_256D_FROM_128D(alphaC,alphaC);
93 | __m256d alpha_C =
94 | _mm256_xor_pd(_mm256_permute_pd(alpha, 0x5),
95 | _mm256_castsi256_pd(maskConj)
96 | );
97 |
98 | return complex_t{ alpha, alpha_C };
99 | }
100 |
101 | /// Quaternion (double) scalar passed to packing utility (load as __m256d)
102 | static inline const quaternion_t cacheScalar( quaternion& alpha ) {
103 | return quaternion_t{LOAD_256D_UNALIGNED_AS(double,&alpha)};
104 | }
105 |
106 | /// AVX / AVX2 Conjugation operation (FIXME: only works for quaternions)
107 | static __m256d conjOp( __m256d &t ) { return QCONJ_256D(t); }
108 |
109 | };
110 |
111 | };
112 |
113 | #endif
114 |
--------------------------------------------------------------------------------
/include/util/constants.hpp:
--------------------------------------------------------------------------------
1 | /*
2 | * This file is a part of HAXX
3 | *
4 | * Copyright (c) 2017 David Williams-Young
5 | * All rights reserved.
6 | *
7 | * See LICENSE.txt
8 | */
9 |
10 | #ifndef __INCLUDED_UTIL_CONSTANTS_HPP__
11 | #define __INCLUDED_UTIL_CONSTANTS_HPP__
12 |
13 | // Compile time type constants
14 | #define DOUBLE 0
15 | #define DCOMPLEX 1
16 | #define DQUATERNION 2
17 |
18 | #endif
19 |
--------------------------------------------------------------------------------
/include/util/macro.hpp:
--------------------------------------------------------------------------------
1 | /*
2 | * This file is a part of HAXX
3 | *
4 | * Copyright (c) 2017 David Williams-Young
5 | * All rights reserved.
6 | *
7 | * See LICENSE.txt
8 | */
9 |
10 | #ifndef __INCLUDED_UTIL_MACRO_HPP
11 | #define __INCLUDED_UTIL_MACRO_HPP
12 |
13 | // Misc macros
14 |
15 | // Compute rank-2 index: linear offset i + j*N of element (i,j), where N is the stride between consecutive j
16 | #define RANK2_INDX(i,j,N) ( (i) + (j)*(N) )
17 |
18 | // Alignment checking: true when X is a multiple of B (B must be a power of two); B is parenthesized so expression arguments such as 1 << 5 build the right mask
19 | #define IS_ALIGNED(X,B) ( ((unsigned long)(X) & ((B)-1)) == 0 )
20 |
21 | // Round X up to the next multiple of N (X is returned unchanged when it already is one); X and N are evaluated more than once
22 | #define FixMod(X,N) (( (X) % (N) ) ? (X) + (N) - ((X) % (N)) : (X))
23 |
24 | #endif
25 |
--------------------------------------------------------------------------------
/include/util/simd.hpp:
--------------------------------------------------------------------------------
1 | /*
2 | * This file is a part of HAXX
3 | *
4 | * Copyright (c) 2017 David Williams-Young
5 | * All rights reserved.
6 | *
7 | * See LICENSE.txt
8 | */
9 |
10 | #ifndef __INCLUDED_UTIL_SIMD_HPP__
11 | #define __INCLUDED_UTIL_SIMD_HPP__
12 |
13 | // SIMD intrinsics header
14 |
15 | #include
16 |
17 | // Required boundary alignment for aligned data
18 | #if defined(__AVX__) || defined(__AVX2__)
19 | #define REQ_ALIGN 32
20 | #endif
21 |
22 |
23 |
24 | #include "simd/intrin_alias.hpp"
25 | #include "simd/misc.hpp"
26 | #include "simd/qop.hpp"
27 |
28 |
29 | #endif
30 |
--------------------------------------------------------------------------------
/include/util/simd/intrin_alias.hpp:
--------------------------------------------------------------------------------
1 | /*
2 | * This file is a part of HAXX
3 | *
4 | * Copyright (c) 2017 David Williams-Young
5 | * All rights reserved.
6 | *
7 | * See LICENSE.txt
8 | */
9 |
10 | #ifndef __INCLUDED_SIMD_INTRIN_ALIAS_HPP__
11 | #define __INCLUDED_SIMD_INTRIN_ALIAS_HPP__
12 |
13 | // Alias SIMD intrinsics
14 |
15 |
16 | // 256-bit vectors
17 |
18 | // Load operations
19 | #define LOAD_256D_ALIGNED(X) _mm256_load_pd(X)
20 | #define LOAD_256D_UNALIGNED(X) _mm256_loadu_pd(X)
21 |
22 | #define LOAD_128D_ALIGNED(X) _mm_load_pd(X)
23 | #define LOAD_128D_UNALIGNED(X) _mm_loadu_pd(X)
24 |
25 | // Load operations with proper cast
26 | #define LOAD_256D_ALIGNED_AS(T,X)\
27 | LOAD_256D_ALIGNED(const_cast(reinterpret_cast(X)))
28 | #define LOAD_256D_UNALIGNED_AS(T,X)\
29 | LOAD_256D_UNALIGNED(const_cast(reinterpret_cast(X)))
30 |
31 | #define LOAD_128D_ALIGNED_AS(T,X)\
32 | LOAD_128D_ALIGNED(const_cast(reinterpret_cast(X)))
33 | #define LOAD_128D_UNALIGNED_AS(T,X)\
34 | LOAD_128D_UNALIGNED(const_cast(reinterpret_cast(X)))
35 |
36 |
37 | // Store operations
38 | #define STORE_256D_ALIGNED(X,V) _mm256_store_pd(X,V)
39 | #define STORE_256D_UNALIGNED(X,V) _mm256_storeu_pd(X,V)
40 |
41 | // Store operations with proper cast
42 | #define STORE_256D_ALIGNED_AS(T,X,V)\
43 | STORE_256D_ALIGNED(reinterpret_cast(X),V)
44 | #define STORE_256D_UNALIGNED_AS(T,X,V)\
45 | STORE_256D_UNALIGNED(reinterpret_cast(X),V)
46 |
47 |
48 | // Getting / Assembling operations
49 | #define GET_LO_128D_256D(X) _mm256_castpd256_pd128(X)
50 | #define GET_HI_128D_256D(X) _mm256_extractf128_pd((X),1)
51 | // Build a __m256d with X in the low 128 bits and Y in the high 128 bits. Expands to a plain expression (no trailing semicolon) so it can be nested inside other expressions.
52 | #define SET_256D_FROM_128D(X,Y) \
53 | _mm256_castps_pd(\
54 | _mm256_insertf128_ps(\
55 | _mm256_castps128_ps256(_mm_castpd_ps(X)),\
56 | _mm_castpd_ps(Y),1)\
57 | )
58 |
59 |
60 |
61 |
62 |
63 |
64 |
65 |
66 |
67 |
68 |
69 |
70 |
71 | // FMA / FMS operations (D = +-(A*B) +- C)
72 | // FMA_256D = A*B + C, FMS_256D = A*B - C, FNMA_256D = C - A*B
73 | // Emulate FMA with two instructions (multiply, then add or subtract) when __FMA__ is not defined
74 | #ifndef __FMA__
75 |
76 | // 256-bit vectors
77 | #define FMA_256D(A,B,C) _mm256_add_pd(_mm256_mul_pd(A,B),C)
78 | #define FMS_256D(A,B,C) _mm256_sub_pd(_mm256_mul_pd(A,B),C)
79 | #define FNMA_256D(A,B,C) _mm256_sub_pd(C,_mm256_mul_pd(A,B))
80 |
81 | #else
82 | // Map directly onto the fused intrinsics when __FMA__ is defined
83 | // 256-bit vectors
84 | #define FMA_256D(A,B,C) _mm256_fmadd_pd(A,B,C)
85 | #define FMS_256D(A,B,C) _mm256_fmsub_pd(A,B,C)
86 | #define FNMA_256D(A,B,C) _mm256_fnmadd_pd(A,B,C)
87 |
88 | #endif
89 |
90 |
91 |
92 |
93 |
94 |
95 |
96 |
97 |
98 |
99 |
100 |
101 | #endif
102 |
--------------------------------------------------------------------------------
/include/util/simd/misc.hpp:
--------------------------------------------------------------------------------
1 | /*
2 | * This file is a part of HAXX
3 | *
4 | * Copyright (c) 2017 David Williams-Young
5 | * All rights reserved.
6 | *
7 | * See LICENSE.txt
8 | */
9 |
10 | #ifndef __INCLUDED_SIMD_MISC_HPP__
11 | #define __INCLUDED_SIMD_MISC_HPP__
12 |
13 |
14 | // Load operations
15 |
16 | // Load aligned with cast
17 | #define LOADD_ALIGNED_AS(T,X) \
18 | LOADD_ALIGNED(const_cast(reinterpret_cast(X)))
19 |
20 | // Load unaligned with cast
21 | #define LOADD_UNALIGNED_AS(T,X) \
22 | LOADD_UNALIGNED(const_cast(reinterpret_cast(X)))
23 |
24 | // Store aligned with cast
25 | #define STORED_ALIGNED_AS(T,X,V) \
26 | STORED_ALIGNED(reinterpret_cast(X),V)
27 |
28 | // Store unaligned with cast
29 | #define STORED_UNALIGNED_AS(T,X,V) \
30 | STORED_UNALIGNED(reinterpret_cast(X),V)
31 |
32 |
33 |
34 |
35 | #if defined(__AVX__) || defined(__AVX2__)
36 |
37 | // Transpose 4x4 registers (with scratch space): w,x,y,z are overwritten with the transposed rows and t1..t4 are clobbered. Expands to several statements, so brace it when used under if/else/for.
38 | #define _MM_TRANSPOSE_4x4_PD(w,x,y,z,t1,t2,t3,t4) \
39 | t1 = _mm256_shuffle_pd(w, x, 0x0);\
40 | t3 = _mm256_shuffle_pd(w, x, 0xF);\
41 | t2 = _mm256_shuffle_pd(y, z, 0x0);\
42 | t4 = _mm256_shuffle_pd(y, z, 0xF);\
43 | \
44 | w = _mm256_permute2f128_pd(t1, t2, 0x20);\
45 | x = _mm256_permute2f128_pd(t3, t4, 0x20);\
46 | y = _mm256_permute2f128_pd(t1, t2, 0x31);\
47 | z = _mm256_permute2f128_pd(t3, t4, 0x31);
48 |
49 |
50 | #endif
51 |
52 | #endif
53 |
--------------------------------------------------------------------------------
/src/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | #
2 | # This file is a part of HAXX
3 | #
4 | # Copyright (c) 2017 David Williams-Young
5 | # All rights reserved.
6 | #
7 | # See LICENSE.txt
8 | #
9 |
10 | foreach( haxx_component hblas tune ) # components that are always built
11 | add_subdirectory( ${haxx_component} )
12 | endforeach()
13 | if( HAXX_ENABLE_BENCHMARK ) # benchmark executables are opt-in
14 | add_subdirectory( benchmark )
15 | endif()
16 |
--------------------------------------------------------------------------------
/src/benchmark/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | #
2 | # This file is a part of HAXX
3 | #
4 | # Copyright (c) 2017 David Williams-Young
5 | # All rights reserved.
6 | #
7 | # See LICENSE.txt
8 | #
9 | #set(Boost_USE_STATIC_LIBS ON)
10 |
11 |
12 | message( STATUS "HAXX WILL BUILD BENCHMARK EXECUTABLES" )
13 |
14 | add_executable(gemm gemm.cxx)
15 |
16 | set( DOT_SRC dot.cxx )
17 |
18 | # When the generic Fortran build is disabled, compile the Fortran dot
19 | # routines directly into the dot benchmark
20 | if( NOT ENABLE_GENERIC_FORTRAN )
21 | list( APPEND DOT_SRC
22 | ${PROJECT_SOURCE_DIR}/src/hblas/fortran/hblas1/hdotc.f
23 | ${PROJECT_SOURCE_DIR}/src/hblas/fortran/hblas1/hdotu.f
24 | )
25 | endif()
26 |
27 | add_executable(dot ${DOT_SRC})
28 |
29 | option( COMPARE_MKL_ZGEMM "Compare HGEMM to ZGEMM" OFF )
30 |
31 | if( NOT COMPARE_MKL_ZGEMM )
32 | # Reference BLAS / LAPACK, downloaded and built as an external project
33 | include(ExternalProject)
34 | find_program(SED_EXEC sed)
35 | ExternalProject_Add(lapack
36 | PREFIX ${PROJECT_BINARY_DIR}/deps/lapack
37 | URL "http://www.netlib.org/lapack/lapack-3.5.0.tgz"
38 | PATCH_COMMAND ${SED_EXEC} -i -e "s/-fltconsistency -fp_port//g" ${PROJECT_BINARY_DIR}/deps/lapack/src/lapack/CMakeLists.txt
39 | CMAKE_ARGS -DCMAKE_Fortran_COMPILER=${CMAKE_Fortran_COMPILER}
40 | "-DCMAKE_Fortran_FLAGS=${CMAKE_Fortran_FLAGS}"
41 | -DCMAKE_INSTALL_PREFIX=${PROJECT_BINARY_DIR}/deps
42 | -DBUILD_TESTING=OFF
43 | )
44 |
45 | # The external build installs libblas.a under ${PROJECT_BINARY_DIR}/deps/lib
46 | add_library(blas STATIC IMPORTED)
47 | set_property(TARGET blas PROPERTY IMPORTED_LOCATION ${PROJECT_BINARY_DIR}/deps/lib/libblas.a)
48 | # Order the imported library after the external build that produces it
49 | add_dependencies(blas lapack)
50 | target_link_libraries(gemm PRIVATE ${Boost_LIBRARIES} blas)
51 | endif()
52 |
53 | target_link_libraries(gemm PRIVATE ${Boost_LIBRARIES} hblas)
54 | target_link_libraries(dot PRIVATE ${Boost_LIBRARIES} hblas)
55 |
56 | #find_package(BLAS REQUIRED)
57 | #target_link_libraries(gemm ${Boost_LIBRARIES} haxx_fortran ${BLAS_LIBRARIES})
58 |
59 |
--------------------------------------------------------------------------------
/src/benchmark/dot.cxx:
--------------------------------------------------------------------------------
1 | /*
2 | * This file is a part of HAXX
3 | *
4 | * Copyright (c) 2017 David Williams-Young
5 | * All rights reserved.
6 | *
7 | * See LICENSE.txt
8 | */
9 |
10 | #include "haxx.hpp"
11 | #include "hblas/hblas1.hpp"
12 |
13 | #include
14 | #include
15 | #include
16 | #include
17 | #include
18 |
19 |
20 | #define HBLAS1_RAND_MIN -20
21 | #define HBLAS1_RAND_MAX 54
22 |
23 | // Setup Random Number generator
24 | std::random_device rd;
25 | std::mt19937 gen(rd());
26 | std::uniform_real_distribution<> dis(HBLAS1_RAND_MIN,HBLAS1_RAND_MAX);
27 |
28 | #define DOT_LEN_MAX 10000000
29 | #define DOT_LEN_START 100
30 | #define NTEST 10
31 | #define NREP 10
32 |
33 | #define USE_ALIGNED_MEM
34 |
35 | using namespace HAXX;
36 |
37 | int main() {
38 | // Benchmark driver: times the Fortran hdotu_ routine against HBLAS_DOTUV over a range of vector lengths
39 |
40 | #ifdef USE_ALIGNED_MEM
41 | // aligned_alloc requires the size to be an integral multiple of the alignment, so round nAlloc up
42 | size_t alignment = 32;
43 | size_t nAlloc = (size_t(DOT_LEN_MAX) * sizeof(quaternion));
44 | nAlloc = ((nAlloc + alignment - 1) / alignment) * alignment;
45 |
46 | quaternion *X =
47 | (quaternion*)aligned_alloc(alignment, nAlloc);
48 | quaternion *Y =
49 | (quaternion*)aligned_alloc(alignment, nAlloc);
50 | if( !X || !Y ) { std::cerr << "aligned_alloc failed" << std::endl; free(X); free(Y); return 1; }
51 | for(auto i = 0ul; i < DOT_LEN_MAX; i++) {
52 | X[i] = quaternion(dis(gen),dis(gen),dis(gen),dis(gen));
53 | Y[i] = quaternion(dis(gen),dis(gen),dis(gen),dis(gen));
54 | }
55 |
56 | #else
57 |
58 | std::vector> X(DOT_LEN_MAX), Y(DOT_LEN_MAX);
59 | for(auto &x : X) x = quaternion(dis(gen),dis(gen),dis(gen),dis(gen));
60 | for(auto &x : Y) x = quaternion(dis(gen),dis(gen),dis(gen),dis(gen));
61 |
62 | #endif
63 |
64 | // Sweep N from DOT_LEN_START to DOT_LEN_MAX in NTEST equal steps
65 | for(auto N = DOT_LEN_START; N <= DOT_LEN_MAX;
66 | N += (DOT_LEN_MAX - DOT_LEN_START)/NTEST ) {
67 |
68 | // Fix cache: one untimed call before the first measurement
69 | if( N == DOT_LEN_START ) HBLAS_DOTUV(N,&X[0],1,&Y[0],1);
70 |
71 | std::chrono::duration dur_opt(0.);
72 | std::chrono::duration dur_for(0.);
73 |
74 | quaternion res;
75 |
76 | // Time NREP calls of the Fortran routine, then NREP calls of HBLAS_DOTUV
77 | int INCX = 1;
78 | for(auto rep = 0; rep < NREP; rep++) {
79 | auto start = std::chrono::high_resolution_clock::now();
80 | hdotu_(&res,&N,&X[0],&INCX,&Y[0],&INCX);
81 | auto end = std::chrono::high_resolution_clock::now();
82 | dur_for += end - start;
83 | }
84 |
85 | for(auto rep = 0; rep < NREP; rep++) {
86 | auto start = std::chrono::high_resolution_clock::now();
87 | res = HBLAS_DOTUV(N,&X[0],1,&Y[0],1);
88 | auto end = std::chrono::high_resolution_clock::now();
89 | dur_opt += end - start;
90 | }
91 |
92 | // Average over the repetitions
93 | dur_opt /= NREP;
94 | dur_for /= NREP;
95 | // The value printed after "% =" is the fraction (FORTRAN - SIMD) / FORTRAN; it is not multiplied by 100
96 | std::cout << "N = " << N << ", SIMD = " << dur_opt.count()
97 | << ", FORTRAN = " << dur_for.count()
98 | << ", % = " << (dur_for.count() - dur_opt.count()) / dur_for.count()
99 | << std::endl;
100 |
101 | }
102 |
103 | #ifdef USE_ALIGNED_MEM
104 | free(X); free(Y);
105 | #endif
106 |
107 | }
108 |
--------------------------------------------------------------------------------
/src/benchmark/gemm.cxx:
--------------------------------------------------------------------------------
1 | /*
2 | * This file is a part of HAXX
3 | *
4 | * Copyright (c) 2017 David Williams-Young
5 | * All rights reserved.
6 | *
7 | * See LICENSE.txt
8 | */
9 |
10 |
11 | #include "haxx.hpp"
12 | #include