├── Copyright.txt ├── README.md ├── avx.hpp ├── avx512.hpp ├── cuda_warp.hpp ├── hip_wavefront.hpp ├── neon.hpp ├── pack.hpp ├── scalar.hpp ├── simd.hpp ├── simd_common.hpp ├── sse.hpp ├── test.cpp ├── vector_size.hpp └── vsx.hpp /Copyright.txt: -------------------------------------------------------------------------------- 1 | //@HEADER 2 | // ************************************************************************ 3 | // 4 | // Kokkos v. 2.0 5 | // Copyright (2014) Sandia Corporation 6 | // 7 | // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, 8 | // the U.S. Government retains certain rights in this software. 9 | // 10 | // Redistribution and use in source and binary forms, with or without 11 | // modification, are permitted provided that the following conditions are 12 | // met: 13 | // 14 | // 1. Redistributions of source code must retain the above copyright 15 | // notice, this list of conditions and the following disclaimer. 16 | // 17 | // 2. Redistributions in binary form must reproduce the above copyright 18 | // notice, this list of conditions and the following disclaimer in the 19 | // documentation and/or other materials provided with the distribution. 20 | // 21 | // 3. Neither the name of the Corporation nor the names of the 22 | // contributors may be used to endorse or promote products derived from 23 | // this software without specific prior written permission. 24 | // 25 | // THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY 26 | // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 27 | // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 28 | // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE 29 | // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 30 | // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 31 | // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 32 | // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 33 | // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 34 | // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 35 | // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 36 | // 37 | // Questions? Contact Christian R. Trott (crtrott@sandia.gov) 38 | // 39 | // ************************************************************************ 40 | //@HEADER 41 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## THIS IS A TEMPORARY DEVELOPMENT REPOSITORY. IT WILL BE MERGED INTO KOKKOS SOON. 2 | 3 | # simd-math 4 | Library for length agnostic SIMD intrinsic support and the corresponding math operations 5 | -------------------------------------------------------------------------------- /avx.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | //@HEADER 3 | // ************************************************************************ 4 | // 5 | // Kokkos v. 2.0 6 | // Copyright (2014) Sandia Corporation 7 | // 8 | // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, 9 | // the U.S. Government retains certain rights in this software. 10 | // 11 | // Redistribution and use in source and binary forms, with or without 12 | // modification, are permitted provided that the following conditions are 13 | // met: 14 | // 15 | // 1. 
Redistributions of source code must retain the above copyright 16 | // notice, this list of conditions and the following disclaimer. 17 | // 18 | // 2. Redistributions in binary form must reproduce the above copyright 19 | // notice, this list of conditions and the following disclaimer in the 20 | // documentation and/or other materials provided with the distribution. 21 | // 22 | // 3. Neither the name of the Corporation nor the names of the 23 | // contributors may be used to endorse or promote products derived from 24 | // this software without specific prior written permission. 25 | // 26 | // THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY 27 | // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 28 | // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 29 | // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE 30 | // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 31 | // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 32 | // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 33 | // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 34 | // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 35 | // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 36 | // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 37 | // 38 | // Questions? Contact Christian R. Trott (crtrott@sandia.gov) 39 | // 40 | // ************************************************************************ 41 | //@HEADER 42 | */ 43 | 44 | #pragma once 45 | 46 | #include "simd_common.hpp" 47 | 48 | #ifdef __AVX__ 49 | 50 | #include 51 | 52 | namespace SIMD_NAMESPACE { 53 | 54 | namespace simd_abi { 55 | 56 | class avx {}; 57 | 58 | } 59 | 60 | template <> 61 | class simd_mask { 62 | __m256 m_value; 63 | public: 64 | using value_type = bool; 65 | using simd_type = simd; 66 | using abi_type = simd_abi::avx; 67 | SIMD_ALWAYS_INLINE inline simd_mask() = default; 68 | SIMD_ALWAYS_INLINE inline simd_mask(bool value) { 69 | m_value = _mm256_castsi256_ps(_mm256_set1_epi32(-int(value))); 70 | } 71 | SIMD_ALWAYS_INLINE inline static constexpr int size() { return 8; } 72 | SIMD_ALWAYS_INLINE inline constexpr simd_mask(__m256 const& value_in) 73 | :m_value(value_in) 74 | {} 75 | SIMD_ALWAYS_INLINE inline constexpr __m256 get() const { return m_value; } 76 | SIMD_ALWAYS_INLINE inline simd_mask operator||(simd_mask const& other) const { 77 | return simd_mask(_mm256_or_ps(m_value, other.m_value)); 78 | } 79 | SIMD_ALWAYS_INLINE inline simd_mask operator&&(simd_mask const& other) const { 80 | return simd_mask(_mm256_and_ps(m_value, other.m_value)); 81 | } 82 | SIMD_ALWAYS_INLINE inline simd_mask operator!() const { 83 | return simd_mask(_mm256_andnot_ps(m_value, simd_mask(true).get())); 84 | } 85 | }; 86 | 87 | SIMD_ALWAYS_INLINE inline bool all_of(simd_mask const& a) { 88 | return _mm256_testc_ps(a.get(), simd_mask(true).get()); 89 | } 90 | 91 | SIMD_ALWAYS_INLINE inline bool any_of(simd_mask const& a) { 92 | return !_mm256_testc_ps(simd_mask(false).get(), a.get()); 93 | } 94 | 95 | template <> 96 | class simd { 97 | __m256 m_value; 98 | public: 99 | using value_type = float; 100 | using abi_type = simd_abi::avx; 101 | using mask_type = simd_mask; 102 | using storage_type = simd_storage; 103 | SIMD_ALWAYS_INLINE inline simd() = default; 104 | SIMD_ALWAYS_INLINE inline static constexpr int size() { return 8; } 105 | SIMD_ALWAYS_INLINE inline simd(float 
value) 106 | :m_value(_mm256_set1_ps(value)) 107 | {} 108 | SIMD_ALWAYS_INLINE inline simd( 109 | float a, float b, float c, float d, 110 | float e, float f, float g, float h) 111 | :m_value(_mm256_setr_ps(a, b, c, d, e, f, g, h)) 112 | {} 113 | SIMD_ALWAYS_INLINE inline 114 | simd(storage_type const& value) { 115 | copy_from(value.data(), element_aligned_tag()); 116 | } 117 | SIMD_ALWAYS_INLINE inline 118 | simd& operator=(storage_type const& value) { 119 | copy_from(value.data(), element_aligned_tag()); 120 | return *this; 121 | } 122 | template 123 | SIMD_ALWAYS_INLINE inline simd(float const* ptr, Flags /*flags*/) 124 | :m_value(_mm256_loadu_ps(ptr)) 125 | {} 126 | SIMD_ALWAYS_INLINE inline simd(float const* ptr, int stride) 127 | :simd(ptr[0], ptr[stride], ptr[2*stride], ptr[3*stride], 128 | ptr[4*stride], ptr[5*stride], ptr[6*stride], ptr[7*stride]) 129 | {} 130 | SIMD_ALWAYS_INLINE inline constexpr simd(__m256 const& value_in) 131 | :m_value(value_in) 132 | {} 133 | SIMD_ALWAYS_INLINE inline simd operator*(simd const& other) const { 134 | return simd(_mm256_mul_ps(m_value, other.m_value)); 135 | } 136 | SIMD_ALWAYS_INLINE inline simd operator/(simd const& other) const { 137 | return simd(_mm256_div_ps(m_value, other.m_value)); 138 | } 139 | SIMD_ALWAYS_INLINE inline simd operator+(simd const& other) const { 140 | return simd(_mm256_add_ps(m_value, other.m_value)); 141 | } 142 | SIMD_ALWAYS_INLINE inline simd operator-(simd const& other) const { 143 | return simd(_mm256_sub_ps(m_value, other.m_value)); 144 | } 145 | SIMD_ALWAYS_INLINE SIMD_HOST_DEVICE inline simd operator-() const { 146 | return simd(_mm256_sub_ps(_mm256_set1_ps(0.0), m_value)); 147 | } 148 | SIMD_ALWAYS_INLINE inline void copy_from(float const* ptr, element_aligned_tag) { 149 | m_value = _mm256_loadu_ps(ptr); 150 | } 151 | SIMD_ALWAYS_INLINE inline void copy_to(float* ptr, element_aligned_tag) const { 152 | _mm256_storeu_ps(ptr, m_value); 153 | } 154 | SIMD_ALWAYS_INLINE inline constexpr __m256 get() const { return m_value; } 155 | SIMD_ALWAYS_INLINE inline simd_mask operator<(simd const& other) const { 156 | return simd_mask(_mm256_cmp_ps(m_value, other.m_value, _CMP_LT_OS)); 157 | } 158 | SIMD_ALWAYS_INLINE inline simd_mask operator==(simd const& other) const { 159 | return simd_mask(_mm256_cmp_ps(m_value, other.m_value, _CMP_EQ_OS)); 160 | } 161 | }; 162 | 163 | SIMD_ALWAYS_INLINE inline simd multiplysign(simd const& a, simd const& b) { 164 | __m256 const sign_mask = _mm256_set1_ps(-0.f); 165 | return simd(_mm256_xor_ps(a.get(), _mm256_and_ps(sign_mask, b.get()))); 166 | } 167 | 168 | SIMD_ALWAYS_INLINE inline simd copysign(simd const& a, simd const& b) { 169 | __m256 const sign_mask = _mm256_set1_ps(-0.); 170 | return simd(_mm256_xor_ps(_mm256_andnot_ps(sign_mask, a.get()), _mm256_and_ps(sign_mask, b.get()))); 171 | } 172 | 173 | SIMD_ALWAYS_INLINE inline simd abs(simd const& a) { 174 | __m256 sign_mask = _mm256_set1_ps(-0.f); // -0.f = 1 << 31 175 | return simd(_mm256_andnot_ps(sign_mask, a.get())); 176 | } 177 | 178 | SIMD_ALWAYS_INLINE inline simd sqrt(simd const& a) { 179 | return simd(_mm256_sqrt_ps(a.get())); 180 | } 181 | 182 | #ifdef __INTEL_COMPILER 183 | SIMD_ALWAYS_INLINE inline simd cbrt(simd const& a) { 184 | return simd(_mm256_cbrt_ps(a.get())); 185 | } 186 | 187 | SIMD_ALWAYS_INLINE inline simd exp(simd const& a) { 188 | return simd(_mm256_exp_ps(a.get())); 189 | } 190 | 191 | SIMD_ALWAYS_INLINE inline simd log(simd const& a) { 192 | return simd(_mm256_log_ps(a.get())); 193 | } 194 | #endif 195 | 
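// multiplysign, copysign, and abs above all rely on the sign-bit idiom: -0.f is a float
// whose only set bit is the sign bit, so _mm256_and_ps(sign_mask, b) isolates the sign of b,
// _mm256_andnot_ps(sign_mask, a) clears the sign of a, and _mm256_xor_ps applies a sign.
// Concretely, copysign(-3.0f, 2.0f) yields 3.0f in every lane, while multiplysign(a, b)
// negates a exactly in the lanes where b is negative.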
196 | #if defined(__FMA__) || defined(__AVX2__) 197 | SIMD_ALWAYS_INLINE inline simd fma( 198 | simd const& a, 199 | simd const& b, 200 | simd const& c) { 201 | return simd(_mm256_fmadd_ps(a.get(), b.get(), c.get())); 202 | } 203 | #endif 204 | 205 | SIMD_ALWAYS_INLINE inline simd max( 206 | simd const& a, simd const& b) { 207 | return simd(_mm256_max_ps(a.get(), b.get())); 208 | } 209 | 210 | SIMD_ALWAYS_INLINE inline simd min( 211 | simd const& a, simd const& b) { 212 | return simd(_mm256_min_ps(a.get(), b.get())); 213 | } 214 | 215 | SIMD_ALWAYS_INLINE inline simd choose( 216 | simd_mask const& a, simd const& b, simd const& c) { 217 | return simd(_mm256_blendv_ps(c.get(), b.get(), a.get())); 218 | } 219 | 220 | template <> 221 | class simd_mask { 222 | __m256d m_value; 223 | public: 224 | using value_type = bool; 225 | using simd_type = simd; 226 | using abi_type = simd_abi::avx; 227 | SIMD_ALWAYS_INLINE inline simd_mask() = default; 228 | SIMD_ALWAYS_INLINE inline simd_mask(bool value) { 229 | m_value = _mm256_castsi256_pd(_mm256_set1_epi64x(-std::int64_t(value))); 230 | } 231 | SIMD_ALWAYS_INLINE inline static constexpr int size() { return 4; } 232 | SIMD_ALWAYS_INLINE inline constexpr simd_mask(__m256d const& value_in) 233 | :m_value(value_in) 234 | {} 235 | SIMD_ALWAYS_INLINE inline constexpr __m256d get() const { return m_value; } 236 | SIMD_ALWAYS_INLINE inline simd_mask operator||(simd_mask const& other) const { 237 | return simd_mask(_mm256_or_pd(m_value, other.m_value)); 238 | } 239 | SIMD_ALWAYS_INLINE inline simd_mask operator&&(simd_mask const& other) const { 240 | return simd_mask(_mm256_and_pd(m_value, other.m_value)); 241 | } 242 | SIMD_ALWAYS_INLINE inline simd_mask operator!() const { 243 | return simd_mask(_mm256_andnot_pd(m_value, simd_mask(true).get())); 244 | } 245 | }; 246 | 247 | SIMD_ALWAYS_INLINE inline bool all_of(simd_mask const& a) { 248 | return _mm256_testc_pd(a.get(), 249 | simd_mask(true).get()); 250 | } 251 | 252 | SIMD_ALWAYS_INLINE inline bool any_of(simd_mask const& a) { 253 | return !_mm256_testc_pd( 254 | simd_mask(false).get(), a.get()); 255 | } 256 | 257 | template <> 258 | class simd { 259 | __m256d m_value; 260 | public: 261 | using value_type = double; 262 | using abi_type = simd_abi::avx; 263 | using mask_type = simd_mask; 264 | using storage_type = simd_storage; 265 | SIMD_ALWAYS_INLINE inline simd() = default; 266 | SIMD_ALWAYS_INLINE inline simd(simd const&) = default; 267 | SIMD_ALWAYS_INLINE inline simd(simd&&) = default; 268 | SIMD_ALWAYS_INLINE inline simd& operator=(simd const&) = default; 269 | SIMD_ALWAYS_INLINE inline simd& operator=(simd&&) = default; 270 | SIMD_ALWAYS_INLINE inline static constexpr int size() { return 4; } 271 | SIMD_ALWAYS_INLINE inline simd(double value) 272 | :m_value(_mm256_set1_pd(value)) 273 | {} 274 | SIMD_ALWAYS_INLINE inline simd( 275 | double a, double b, double c, double d) 276 | :m_value(_mm256_setr_pd(a, b, c, d)) 277 | {} 278 | SIMD_ALWAYS_INLINE inline 279 | simd(storage_type const& value) { 280 | copy_from(value.data(), element_aligned_tag()); 281 | } 282 | #ifdef STK_VOLATILE_SIMD 283 | SIMD_ALWAYS_INLINE inline 284 | simd(simd const volatile& value) 285 | :m_value(value.m_value) 286 | {} 287 | #endif 288 | SIMD_ALWAYS_INLINE inline 289 | simd& operator=(storage_type const& value) { 290 | copy_from(value.data(), element_aligned_tag()); 291 | return *this; 292 | } 293 | template 294 | SIMD_ALWAYS_INLINE inline simd(double const* ptr, Flags flags) 295 | :m_value(_mm256_loadu_pd(ptr)) 296 | {} 297 | 
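// Note: the Flags tag accepted by the constructor above is not inspected; the load always
// goes through the unaligned intrinsic _mm256_loadu_pd, which is also valid for aligned pointers.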
SIMD_ALWAYS_INLINE inline simd(double const* ptr, int stride) 298 | :simd(ptr[0], ptr[stride], ptr[2*stride], ptr[3*stride]) 299 | {} 300 | SIMD_ALWAYS_INLINE inline constexpr simd(__m256d const& value_in) 301 | :m_value(value_in) 302 | {} 303 | SIMD_ALWAYS_INLINE inline simd operator*(simd const& other) const { 304 | return simd(_mm256_mul_pd(m_value, other.m_value)); 305 | } 306 | SIMD_ALWAYS_INLINE inline simd operator/(simd const& other) const { 307 | return simd(_mm256_div_pd(m_value, other.m_value)); 308 | } 309 | SIMD_ALWAYS_INLINE inline simd operator+(simd const& other) const { 310 | return simd(_mm256_add_pd(m_value, other.m_value)); 311 | } 312 | #ifdef STK_VOLATILE_SIMD 313 | SIMD_ALWAYS_INLINE inline void plus_equals(simd const volatile& other) volatile { 314 | m_value = _mm256_add_pd(m_value, other.m_value); 315 | } 316 | #endif 317 | SIMD_ALWAYS_INLINE inline simd operator-(simd const& other) const { 318 | return simd(_mm256_sub_pd(m_value, other.m_value)); 319 | } 320 | SIMD_ALWAYS_INLINE SIMD_HOST_DEVICE inline simd operator-() const { 321 | return simd(_mm256_sub_pd(_mm256_set1_pd(0.0), m_value)); 322 | } 323 | SIMD_ALWAYS_INLINE inline void copy_from(double const* ptr, element_aligned_tag) { 324 | m_value = _mm256_loadu_pd(ptr); 325 | } 326 | SIMD_ALWAYS_INLINE inline void copy_to(double* ptr, element_aligned_tag) const { 327 | _mm256_storeu_pd(ptr, m_value); 328 | } 329 | SIMD_ALWAYS_INLINE inline constexpr __m256d get() const { return m_value; } 330 | SIMD_ALWAYS_INLINE inline simd_mask operator<(simd const& other) const { 331 | return simd_mask(_mm256_cmp_pd(m_value, other.m_value, _CMP_LT_OS)); 332 | } 333 | SIMD_ALWAYS_INLINE inline simd_mask operator==(simd const& other) const { 334 | return simd_mask(_mm256_cmp_pd(m_value, other.m_value, _CMP_EQ_OS)); 335 | } 336 | }; 337 | 338 | SIMD_ALWAYS_INLINE inline simd multiplysign(simd const& a, simd const& b) { 339 | __m256d const sign_mask = _mm256_set1_pd(-0.f); 340 | return simd(_mm256_xor_pd(a.get(), _mm256_and_pd(sign_mask, b.get()))); 341 | } 342 | 343 | SIMD_ALWAYS_INLINE inline simd copysign(simd const& a, simd const& b) { 344 | __m256d const sign_mask = _mm256_set1_pd(-0.f); 345 | return simd(_mm256_xor_pd(_mm256_andnot_pd(sign_mask, a.get()), _mm256_and_pd(sign_mask, b.get()))); 346 | } 347 | 348 | SIMD_ALWAYS_INLINE inline simd abs(simd const& a) { 349 | __m256d const sign_mask = _mm256_set1_pd(-0.f); // -0.f = 1 << 31 350 | return simd(_mm256_andnot_pd(sign_mask, a.get())); 351 | } 352 | 353 | SIMD_ALWAYS_INLINE inline simd sqrt(simd const& a) { 354 | return simd(_mm256_sqrt_pd(a.get())); 355 | } 356 | 357 | #ifdef __INTEL_COMPILER 358 | SIMD_ALWAYS_INLINE inline simd cbrt(simd const& a) { 359 | return simd(_mm256_cbrt_pd(a.get())); 360 | } 361 | 362 | SIMD_ALWAYS_INLINE inline simd exp(simd const& a) { 363 | return simd(_mm256_exp_pd(a.get())); 364 | } 365 | 366 | SIMD_ALWAYS_INLINE inline simd log(simd const& a) { 367 | return simd(_mm256_log_pd(a.get())); 368 | } 369 | #endif 370 | 371 | #if defined(__FMA__) || defined(__AVX2__) 372 | SIMD_ALWAYS_INLINE inline simd fma( 373 | simd const& a, 374 | simd const& b, 375 | simd const& c) { 376 | return simd(_mm256_fmadd_pd(a.get(), b.get(), c.get())); 377 | } 378 | #endif 379 | 380 | SIMD_ALWAYS_INLINE inline simd max( 381 | simd const& a, simd const& b) { 382 | return simd(_mm256_max_pd(a.get(), b.get())); 383 | } 384 | 385 | SIMD_ALWAYS_INLINE inline simd min( 386 | simd const& a, simd const& b) { 387 | return simd(_mm256_min_pd(a.get(), b.get())); 388 | } 389 
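// choose(mask, b, c) below returns b in the lanes where mask is true and c elsewhere:
// _mm256_blendv_pd selects its second source operand wherever the high (sign) bit of the
// corresponding mask lane is set, and a true mask lane has all of its bits set.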
| 390 | SIMD_ALWAYS_INLINE inline simd choose( 391 | simd_mask const& a, simd const& b, simd const& c) { 392 | return simd(_mm256_blendv_pd(c.get(), b.get(), a.get())); 393 | } 394 | 395 | } 396 | 397 | #endif 398 | -------------------------------------------------------------------------------- /avx512.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | //@HEADER 3 | // ************************************************************************ 4 | // 5 | // Kokkos v. 2.0 6 | // Copyright (2014) Sandia Corporation 7 | // 8 | // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, 9 | // the U.S. Government retains certain rights in this software. 10 | // 11 | // Redistribution and use in source and binary forms, with or without 12 | // modification, are permitted provided that the following conditions are 13 | // met: 14 | // 15 | // 1. Redistributions of source code must retain the above copyright 16 | // notice, this list of conditions and the following disclaimer. 17 | // 18 | // 2. Redistributions in binary form must reproduce the above copyright 19 | // notice, this list of conditions and the following disclaimer in the 20 | // documentation and/or other materials provided with the distribution. 21 | // 22 | // 3. Neither the name of the Corporation nor the names of the 23 | // contributors may be used to endorse or promote products derived from 24 | // this software without specific prior written permission. 25 | // 26 | // THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY 27 | // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 28 | // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 29 | // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE 30 | // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 31 | // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 32 | // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 33 | // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 34 | // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 35 | // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 36 | // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 37 | // 38 | // Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) 39 | // 40 | // ************************************************************************ 41 | //@HEADER 42 | */ 43 | 44 | #pragma once 45 | 46 | #include "simd_common.hpp" 47 | 48 | #ifdef __AVX512F__ 49 | 50 | #include 51 | 52 | namespace SIMD_NAMESPACE { 53 | 54 | namespace simd_abi { 55 | 56 | class avx512 {}; 57 | 58 | } 59 | 60 | template <> 61 | class simd_mask { 62 | __mmask16 m_value; 63 | public: 64 | using value_type = bool; 65 | using simd_type = simd; 66 | using abi_type = simd_abi::avx512; 67 | SIMD_ALWAYS_INLINE inline simd_mask() = default; 68 | SIMD_ALWAYS_INLINE inline simd_mask(bool value) 69 | :m_value(-std::int16_t(value)) 70 | {} 71 | SIMD_ALWAYS_INLINE inline static constexpr int size() { return 16; } 72 | SIMD_ALWAYS_INLINE inline constexpr simd_mask(__mmask16 const& value_in) 73 | :m_value(value_in) 74 | {} 75 | SIMD_ALWAYS_INLINE inline constexpr __mmask16 get() const { return m_value; } 76 | SIMD_ALWAYS_INLINE inline simd_mask operator||(simd_mask const& other) const { 77 | return simd_mask(_kor_mask16(m_value, other.m_value)); 78 | } 79 | SIMD_ALWAYS_INLINE inline simd_mask operator&&(simd_mask const& other) const { 80 | return simd_mask(_kand_mask16(m_value, other.m_value)); 81 | } 82 | SIMD_ALWAYS_INLINE inline simd_mask operator!() const { 83 | return simd_mask(_knot_mask16(m_value)); 84 | } 85 | }; 86 | 87 | SIMD_ALWAYS_INLINE inline bool all_of(simd_mask const& a) { 88 | static const __mmask16 false_value(-std::int16_t(false)); 89 | return _kortestc_mask16_u8(a.get(), false_value); 90 | } 91 | 92 | SIMD_ALWAYS_INLINE inline bool any_of(simd_mask const& a) { 93 | static const __mmask16 false_value(-std::int16_t(false)); 94 | return !_kortestc_mask16_u8(~a.get(), false_value); 95 | } 96 | 97 | template <> 98 | class simd { 99 | __m512 m_value; 100 | public: 101 | SIMD_ALWAYS_INLINE simd() = default; 102 | using value_type = float; 103 | using abi_type = simd_abi::avx512; 104 | using mask_type = simd_mask; 105 | using storage_type = simd_storage; 106 | SIMD_ALWAYS_INLINE inline static constexpr int size() { return 16; } 107 | SIMD_ALWAYS_INLINE inline simd(float value) 108 | :m_value(_mm512_set1_ps(value)) 109 | {} 110 | SIMD_ALWAYS_INLINE inline simd( 111 | float a, float b, float c, float d, 112 | float e, float f, float g, float h, 113 | float i, float j, float k, float l, 114 | float m, float n, float o, float p) 115 | :m_value(_mm512_setr_ps( 116 | a, b, c, d, e, f, g, h, 117 | i, j, k, l, m, n, o, p)) 118 | {} 119 | SIMD_ALWAYS_INLINE inline 120 | simd(storage_type const& value) { 121 | copy_from(value.data(), element_aligned_tag()); 122 | } 123 | SIMD_ALWAYS_INLINE inline 124 | simd& operator=(storage_type const& value) { 125 | copy_from(value.data(), element_aligned_tag()); 126 | return *this; 127 | } 128 | template 129 | SIMD_ALWAYS_INLINE inline simd(float const* ptr, Flags /*flags*/) 130 | :m_value(_mm512_loadu_ps(ptr)) 131 | {} 132 | SIMD_ALWAYS_INLINE inline simd(float const* ptr, int stride) 133 | :simd(ptr[0], ptr[stride], ptr[2*stride], ptr[3*stride], 134 | ptr[4*stride], ptr[5*stride], ptr[6*stride], ptr[7*stride], 135 | ptr[8*stride], ptr[9*stride], ptr[10*stride], ptr[11*stride], 136 | ptr[12*stride], ptr[13*stride], ptr[14*stride], ptr[15*stride]) 137 | {} 138 | SIMD_ALWAYS_INLINE inline constexpr simd(__m512 const& value_in) 139 | :m_value(value_in) 140 | {} 141 | SIMD_ALWAYS_INLINE inline simd operator*(simd const& other) const { 142 | return simd(_mm512_mul_ps(m_value, other.m_value)); 143 | } 144 | 
SIMD_ALWAYS_INLINE inline simd operator/(simd const& other) const { 145 | return simd(_mm512_div_ps(m_value, other.m_value)); 146 | } 147 | SIMD_ALWAYS_INLINE inline simd operator+(simd const& other) const { 148 | return simd(_mm512_add_ps(m_value, other.m_value)); 149 | } 150 | SIMD_ALWAYS_INLINE inline simd operator-(simd const& other) const { 151 | return simd(_mm512_sub_ps(m_value, other.m_value)); 152 | } 153 | SIMD_ALWAYS_INLINE SIMD_HOST_DEVICE inline simd operator-() const { 154 | return simd(_mm512_sub_ps(_mm512_set1_ps(0.0), m_value)); 155 | } 156 | SIMD_ALWAYS_INLINE inline void copy_from(float const* ptr, element_aligned_tag) { 157 | m_value = _mm512_loadu_ps(ptr); 158 | } 159 | SIMD_ALWAYS_INLINE inline void copy_to(float* ptr, element_aligned_tag) const { 160 | _mm512_storeu_ps(ptr, m_value); 161 | } 162 | SIMD_ALWAYS_INLINE inline constexpr __m512 get() const { return m_value; } 163 | SIMD_ALWAYS_INLINE inline simd_mask operator<(simd const& other) const { 164 | return simd_mask(_mm512_cmplt_ps_mask(m_value, other.m_value)); 165 | } 166 | SIMD_ALWAYS_INLINE inline simd_mask operator==(simd const& other) const { 167 | return simd_mask(_mm512_cmpeq_ps_mask(m_value, other.m_value)); 168 | } 169 | }; 170 | 171 | SIMD_ALWAYS_INLINE inline simd multiplysign(simd const& a, simd const& b) { 172 | static const __m512i sign_mask = reinterpret_cast<__m512i>(simd(-0.0).get()); 173 | return simd( 174 | reinterpret_cast<__m512>(_mm512_xor_epi32( 175 | reinterpret_cast<__m512i>(a.get()), 176 | _mm512_and_epi32(sign_mask, reinterpret_cast<__m512i>(b.get())) 177 | )) 178 | ); 179 | } 180 | 181 | SIMD_ALWAYS_INLINE inline simd copysign(simd const& a, simd const& b) { 182 | static const __m512i sign_mask = reinterpret_cast<__m512i>(simd(-0.0).get()); 183 | return simd( 184 | reinterpret_cast<__m512>(_mm512_xor_epi32( 185 | _mm512_andnot_epi32(sign_mask, reinterpret_cast<__m512i>(a.get())), 186 | _mm512_and_epi32(sign_mask, reinterpret_cast<__m512i>(b.get())) 187 | )) 188 | ); 189 | } 190 | 191 | SIMD_ALWAYS_INLINE inline simd abs(simd const& a) { 192 | __m512 const rhs = a.get(); 193 | return reinterpret_cast<__m512>(_mm512_and_epi32(reinterpret_cast<__m512i>(rhs), _mm512_set1_epi32(0x7fffffff))); 194 | } 195 | 196 | SIMD_ALWAYS_INLINE inline simd sqrt(simd const& a) { 197 | return simd(_mm512_sqrt_ps(a.get())); 198 | } 199 | 200 | #ifdef __INTEL_COMPILER 201 | SIMD_ALWAYS_INLINE inline simd cbrt(simd const& a) { 202 | return simd(_mm512_cbrt_ps(a.get())); 203 | } 204 | 205 | SIMD_ALWAYS_INLINE inline simd exp(simd const& a) { 206 | return simd(_mm512_exp_ps(a.get())); 207 | } 208 | 209 | SIMD_ALWAYS_INLINE inline simd log(simd const& a) { 210 | return simd(_mm512_log_ps(a.get())); 211 | } 212 | #endif 213 | 214 | SIMD_ALWAYS_INLINE inline simd fma( 215 | simd const& a, 216 | simd const& b, 217 | simd const& c) { 218 | return simd(_mm512_fmadd_ps(a.get(), b.get(), c.get())); 219 | } 220 | 221 | SIMD_ALWAYS_INLINE inline simd max( 222 | simd const& a, simd const& b) { 223 | return simd(_mm512_max_ps(a.get(), b.get())); 224 | } 225 | 226 | SIMD_ALWAYS_INLINE inline simd min( 227 | simd const& a, simd const& b) { 228 | return simd(_mm512_min_ps(a.get(), b.get())); 229 | } 230 | 231 | SIMD_ALWAYS_INLINE inline simd choose( 232 | simd_mask const& a, simd const& b, simd const& c) { 233 | return simd(_mm512_mask_blend_ps(a.get(), c.get(), b.get())); 234 | } 235 | 236 | template <> 237 | class simd_mask { 238 | __mmask8 m_value; 239 | public: 240 | using value_type = bool; 241 | SIMD_ALWAYS_INLINE 
inline simd_mask() = default; 242 | SIMD_ALWAYS_INLINE inline simd_mask(bool value) 243 | :m_value(-std::int16_t(value)) 244 | {} 245 | SIMD_ALWAYS_INLINE inline static constexpr int size() { return 8; } 246 | SIMD_ALWAYS_INLINE inline constexpr simd_mask(__mmask8 const& value_in) 247 | :m_value(value_in) 248 | {} 249 | SIMD_ALWAYS_INLINE inline constexpr __mmask8 get() const { return m_value; } 250 | SIMD_ALWAYS_INLINE simd_mask operator||(simd_mask const& other) const { 251 | return simd_mask(static_cast<__mmask8>(_mm512_kor(m_value, other.m_value))); 252 | } 253 | SIMD_ALWAYS_INLINE simd_mask operator&&(simd_mask const& other) const { 254 | return simd_mask(static_cast<__mmask8>(_mm512_kand(m_value, other.m_value))); 255 | } 256 | SIMD_ALWAYS_INLINE simd_mask operator!() const { 257 | static const __mmask8 true_value(simd_mask(true).get()); 258 | return simd_mask(static_cast<__mmask8>(_mm512_kxor(true_value, m_value))); 259 | } 260 | }; 261 | 262 | SIMD_ALWAYS_INLINE inline bool all_of(simd_mask const& a) { 263 | static const __mmask16 false_value(-std::int16_t(false)); 264 | const __mmask16 a_value(0xFF00 | a.get()); 265 | return _kortestc_mask16_u8(a_value, false_value); 266 | } 267 | 268 | SIMD_ALWAYS_INLINE inline bool any_of(simd_mask const& a) { 269 | static const __mmask16 false_value(-std::int16_t(false)); 270 | const __mmask16 a_value(0x0000 | a.get()); 271 | return !_kortestc_mask16_u8(~a_value, false_value); 272 | } 273 | 274 | template <> 275 | class simd { 276 | __m512d m_value; 277 | public: 278 | using value_type = double; 279 | using abi_type = simd_abi::avx512; 280 | using mask_type = simd_mask; 281 | using storage_type = simd_storage; 282 | SIMD_ALWAYS_INLINE inline simd() = default; 283 | SIMD_ALWAYS_INLINE inline simd(simd const&) = default; 284 | SIMD_ALWAYS_INLINE inline simd(simd&&) = default; 285 | SIMD_ALWAYS_INLINE inline simd& operator=(simd const&) = default; 286 | SIMD_ALWAYS_INLINE inline simd& operator=(simd&&) = default; 287 | SIMD_ALWAYS_INLINE inline static constexpr int size() { return 8; } 288 | SIMD_ALWAYS_INLINE inline simd(double value) 289 | :m_value(_mm512_set1_pd(value)) 290 | {} 291 | SIMD_ALWAYS_INLINE inline simd( 292 | double a, double b, double c, double d, 293 | double e, double f, double g, double h) 294 | :m_value(_mm512_setr_pd(a, b, c, d, e, f, g, h)) 295 | {} 296 | SIMD_ALWAYS_INLINE inline 297 | simd(storage_type const& value) { 298 | copy_from(value.data(), element_aligned_tag()); 299 | } 300 | #ifdef STK_VOLATILE_SIMD 301 | SIMD_ALWAYS_INLINE inline 302 | simd(simd const volatile& value) 303 | :m_value(value.m_value) 304 | {} 305 | #endif 306 | SIMD_ALWAYS_INLINE inline 307 | simd& operator=(storage_type const& value) { 308 | copy_from(value.data(), element_aligned_tag()); 309 | return *this; 310 | } 311 | template 312 | SIMD_ALWAYS_INLINE inline simd(double const* ptr, Flags /*flags*/) 313 | :m_value(_mm512_loadu_pd(ptr)) 314 | {} 315 | SIMD_ALWAYS_INLINE inline simd(double const* ptr, int stride) 316 | :simd(ptr[0], ptr[stride], ptr[2*stride], ptr[3*stride], 317 | ptr[4*stride], ptr[5*stride], ptr[6*stride], ptr[7*stride]) 318 | {} 319 | SIMD_ALWAYS_INLINE inline constexpr simd(__m512d const& value_in) 320 | :m_value(value_in) 321 | {} 322 | SIMD_ALWAYS_INLINE inline simd operator*(simd const& other) const { 323 | return simd(_mm512_mul_pd(m_value, other.m_value)); 324 | } 325 | SIMD_ALWAYS_INLINE inline simd operator/(simd const& other) const { 326 | return simd(_mm512_div_pd(m_value, other.m_value)); 327 | } 328 | 
SIMD_ALWAYS_INLINE inline simd operator+(simd const& other) const { 329 | return simd(_mm512_add_pd(m_value, other.m_value)); 330 | } 331 | #ifdef STK_VOLATILE_SIMD 332 | SIMD_ALWAYS_INLINE inline void plus_equals(simd const volatile& other) volatile { 333 | m_value = _mm512_add_pd(m_value, other.m_value); 334 | } 335 | #endif 336 | SIMD_ALWAYS_INLINE inline simd operator-(simd const& other) const { 337 | return simd(_mm512_sub_pd(m_value, other.m_value)); 338 | } 339 | SIMD_ALWAYS_INLINE SIMD_HOST_DEVICE inline simd operator-() const { 340 | return simd(_mm512_sub_pd(_mm512_set1_pd(0.0), m_value)); 341 | } 342 | SIMD_ALWAYS_INLINE inline void copy_from(double const* ptr, element_aligned_tag) { 343 | m_value = _mm512_loadu_pd(ptr); 344 | } 345 | SIMD_ALWAYS_INLINE inline void copy_to(double* ptr, element_aligned_tag) const { 346 | _mm512_storeu_pd(ptr, m_value); 347 | } 348 | SIMD_ALWAYS_INLINE inline constexpr __m512d get() const { return m_value; } 349 | SIMD_ALWAYS_INLINE inline simd_mask operator<(simd const& other) const { 350 | return simd_mask(_mm512_cmplt_pd_mask(m_value, other.m_value)); 351 | } 352 | SIMD_ALWAYS_INLINE inline simd_mask operator==(simd const& other) const { 353 | return simd_mask(_mm512_cmpeq_pd_mask(m_value, other.m_value)); 354 | } 355 | }; 356 | 357 | SIMD_ALWAYS_INLINE inline simd multiplysign(simd const& a, simd const& b) { 358 | static const __m512i sign_mask = reinterpret_cast<__m512i>(simd(-0.0).get()); 359 | return simd( 360 | reinterpret_cast<__m512d>(_mm512_xor_epi64( 361 | reinterpret_cast<__m512i>(a.get()), 362 | _mm512_and_epi64(sign_mask, reinterpret_cast<__m512i>(b.get())) 363 | )) 364 | ); 365 | } 366 | 367 | SIMD_ALWAYS_INLINE inline simd copysign(simd const& a, simd const& b) { 368 | static const __m512i sign_mask = reinterpret_cast<__m512i>(simd(-0.0).get()); 369 | return simd( 370 | reinterpret_cast<__m512d>(_mm512_xor_epi64( 371 | _mm512_andnot_epi64(sign_mask, reinterpret_cast<__m512i>(a.get())), 372 | _mm512_and_epi64(sign_mask, reinterpret_cast<__m512i>(b.get())) 373 | )) 374 | ); 375 | } 376 | 377 | SIMD_ALWAYS_INLINE inline simd abs(simd const& a) { 378 | __m512d const rhs = a.get(); 379 | return reinterpret_cast<__m512d>(_mm512_and_epi64(_mm512_set1_epi64(0x7FFFFFFFFFFFFFFF), 380 | reinterpret_cast<__m512i>(rhs))); 381 | } 382 | 383 | SIMD_ALWAYS_INLINE inline simd sqrt(simd const& a) { 384 | return simd(_mm512_sqrt_pd(a.get())); 385 | } 386 | 387 | #ifdef __INTEL_COMPILER 388 | SIMD_ALWAYS_INLINE inline simd cbrt(simd const& a) { 389 | return simd(_mm512_cbrt_pd(a.get())); 390 | } 391 | 392 | SIMD_ALWAYS_INLINE inline simd exp(simd const& a) { 393 | return simd(_mm512_exp_pd(a.get())); 394 | } 395 | 396 | SIMD_ALWAYS_INLINE inline simd log(simd const& a) { 397 | return simd(_mm512_log_pd(a.get())); 398 | } 399 | #endif 400 | 401 | SIMD_ALWAYS_INLINE inline simd fma( 402 | simd const& a, 403 | simd const& b, 404 | simd const& c) { 405 | return simd(_mm512_fmadd_pd(a.get(), b.get(), c.get())); 406 | } 407 | 408 | SIMD_ALWAYS_INLINE inline simd max( 409 | simd const& a, simd const& b) { 410 | return simd(_mm512_max_pd(a.get(), b.get())); 411 | } 412 | 413 | SIMD_ALWAYS_INLINE inline simd min( 414 | simd const& a, simd const& b) { 415 | return simd(_mm512_min_pd(a.get(), b.get())); 416 | } 417 | 418 | SIMD_ALWAYS_INLINE inline simd choose( 419 | simd_mask const& a, simd const& b, simd const& c) { 420 | return simd(_mm512_mask_blend_pd(a.get(), c.get(), b.get())); 421 | } 422 | 423 | } 424 | 425 | #endif 426 | 427 | 428 | 429 | 
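A minimal usage sketch for the AVX-512 wrappers above (an illustration, not part of the library). It assumes that including simd.hpp makes the active backend available, that SIMD_NAMESPACE expands to the library namespace, that element_aligned_tag lives in simd_common.hpp, and that n is a multiple of the vector width.

#include "simd.hpp"

void sqrt_or_zero(float const* x, float* y, int n) {
  using vfloat = SIMD_NAMESPACE::simd<float, SIMD_NAMESPACE::simd_abi::avx512>;
  for (int i = 0; i < n; i += vfloat::size()) {
    vfloat v(x + i, SIMD_NAMESPACE::element_aligned_tag());
    // Square root where v is non-negative, zero in the remaining lanes.
    vfloat r = choose(v < vfloat(0.0f), vfloat(0.0f), sqrt(v));
    r.copy_to(y + i, SIMD_NAMESPACE::element_aligned_tag());
  }
}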
-------------------------------------------------------------------------------- /cuda_warp.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | //@HEADER 3 | // ************************************************************************ 4 | // 5 | // Kokkos v. 2.0 6 | // Copyright (2014) Sandia Corporation 7 | // 8 | // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, 9 | // the U.S. Government retains certain rights in this software. 10 | // 11 | // Redistribution and use in source and binary forms, with or without 12 | // modification, are permitted provided that the following conditions are 13 | // met: 14 | // 15 | // 1. Redistributions of source code must retain the above copyright 16 | // notice, this list of conditions and the following disclaimer. 17 | // 18 | // 2. Redistributions in binary form must reproduce the above copyright 19 | // notice, this list of conditions and the following disclaimer in the 20 | // documentation and/or other materials provided with the distribution. 21 | // 22 | // 3. Neither the name of the Corporation nor the names of the 23 | // contributors may be used to endorse or promote products derived from 24 | // this software without specific prior written permission. 25 | // 26 | // THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY 27 | // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 28 | // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 29 | // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE 30 | // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 31 | // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 32 | // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 33 | // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 34 | // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 35 | // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 36 | // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 37 | // 38 | // Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) 39 | // 40 | // ************************************************************************ 41 | //@HEADER 42 | */ 43 | 44 | #pragma once 45 | 46 | #include "simd_common.hpp" 47 | 48 | #ifdef __CUDACC__ 49 | #define SIMD_CUDA_ALWAYS_INLINE __forceinline__ 50 | #endif 51 | 52 | #ifdef __CUDACC__ 53 | #define SIMD_HOST_DEVICE __host__ __device__ 54 | #else 55 | #define SIMD_HOST_DEVICE 56 | #endif 57 | 58 | #ifdef __CUDACC__ 59 | #define SIMD_DEVICE __device__ 60 | #else 61 | #define SIMD_DEVICE 62 | #endif 63 | 64 | #ifdef __CUDACC__ 65 | 66 | namespace SIMD_NAMESPACE { 67 | 68 | namespace simd_abi { 69 | 70 | template 71 | class cuda_warp { 72 | static_assert(N <= 32, "CUDA warps can't be more than 32 threads"); 73 | public: 74 | SIMD_HOST_DEVICE static unsigned mask() { 75 | return (unsigned(1) << N) - unsigned(1); 76 | } 77 | }; 78 | 79 | } 80 | 81 | template 82 | class simd_storage> { 83 | T m_value[simd>::size()]; 84 | public: 85 | using value_type = T; 86 | using abi_type = simd_abi::cuda_warp; 87 | using simd_type = simd; 88 | SIMD_ALWAYS_INLINE inline simd_storage() = default; 89 | SIMD_ALWAYS_INLINE SIMD_HOST_DEVICE inline static constexpr 90 | int size() { return simd::size(); } 91 | SIMD_ALWAYS_INLINE SIMD_HOST_DEVICE inline 92 | simd_storage(simd const& value) { 93 | value.copy_to(m_value, element_aligned_tag()); 94 | } 95 | SIMD_ALWAYS_INLINE SIMD_HOST_DEVICE explicit inline 96 | simd_storage(T value) 97 | :simd_storage(simd(value)) 98 | {} 99 | SIMD_ALWAYS_INLINE SIMD_HOST_DEVICE inline 100 | simd_storage& operator=(simd const& value) { 101 | value.copy_to(m_value, element_aligned_tag()); 102 | return *this; 103 | } 104 | SIMD_ALWAYS_INLINE SIMD_HOST_DEVICE 105 | T const* data() const { return m_value; } 106 | SIMD_ALWAYS_INLINE SIMD_HOST_DEVICE 107 | T* data() { return m_value; } 108 | SIMD_ALWAYS_INLINE SIMD_HOST_DEVICE 109 | T const& operator[](int i) const { return m_value[i]; } 110 | SIMD_ALWAYS_INLINE SIMD_HOST_DEVICE 111 | T& operator[](int i) { return m_value[i]; } 112 | }; 113 | 114 | template 115 | class simd_mask> { 116 | bool m_value; 117 | public: 118 | using value_type = bool; 119 | using abi_type = simd_abi::cuda_warp; 120 | using simd_type = simd; 121 | SIMD_CUDA_ALWAYS_INLINE simd_mask() = default; 122 | SIMD_ALWAYS_INLINE SIMD_HOST_DEVICE static constexpr 123 | int size() { return N; } 124 | SIMD_CUDA_ALWAYS_INLINE SIMD_HOST_DEVICE 125 | simd_mask(bool value) 126 | :m_value(value) 127 | {} 128 | SIMD_CUDA_ALWAYS_INLINE SIMD_HOST_DEVICE constexpr 129 | bool get() const { 130 | return m_value; 131 | } 132 | SIMD_CUDA_ALWAYS_INLINE SIMD_HOST_DEVICE 133 | simd_mask operator||(simd_mask const& other) const { 134 | return m_value || other.m_value; 135 | } 136 | SIMD_CUDA_ALWAYS_INLINE SIMD_HOST_DEVICE 137 | simd_mask operator&&(simd_mask const& other) const { 138 | return m_value && other.m_value; 139 | } 140 | SIMD_CUDA_ALWAYS_INLINE SIMD_HOST_DEVICE 141 | simd_mask operator!() const { 142 | return !m_value; 143 | } 144 | }; 145 | 146 | template 147 | SIMD_CUDA_ALWAYS_INLINE SIMD_HOST_DEVICE 148 | bool all_of(simd_mask> const& a) { 149 | return bool(__all_sync(simd_abi::cuda_warp::mask(), int(a.get()))); 150 | } 151 | 152 | template 153 | SIMD_CUDA_ALWAYS_INLINE SIMD_HOST_DEVICE 154 | bool any_of(simd_mask> const& a) { 155 | return bool(__any_sync(simd_abi::cuda_warp::mask(), int(a.get()))); 156 | } 157 | 158 | template 159 | class simd> { 160 | T m_value; 161 | public: 162 | using value_type = T; 163 | using abi_type = simd_abi::cuda_warp; 
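// In the cuda_warp ABI a logical vector is spread across the N threads of a warp: each
// thread's simd object holds a single scalar m_value, and copy_from / copy_to below address
// memory with threadIdx.x so that lane i of the vector lives in thread i.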
164 | using mask_type = simd_mask; 165 | using storage_type = simd_storage; 166 | SIMD_CUDA_ALWAYS_INLINE simd() = default; 167 | SIMD_CUDA_ALWAYS_INLINE SIMD_HOST_DEVICE static constexpr int size() { return N; } 168 | SIMD_CUDA_ALWAYS_INLINE SIMD_HOST_DEVICE simd(T value) 169 | :m_value(value) 170 | {} 171 | SIMD_CUDA_ALWAYS_INLINE SIMD_HOST_DEVICE 172 | simd(storage_type const& value) { 173 | copy_from(value.data(), element_aligned_tag()); 174 | } 175 | SIMD_CUDA_ALWAYS_INLINE SIMD_HOST_DEVICE 176 | simd& operator=(storage_type const& value) { 177 | copy_from(value.data(), element_aligned_tag()); 178 | return *this; 179 | } 180 | template 181 | SIMD_CUDA_ALWAYS_INLINE SIMD_HOST_DEVICE simd(T const* ptr, Flags flags) { 182 | copy_from(ptr, flags); 183 | } 184 | SIMD_CUDA_ALWAYS_INLINE SIMD_HOST_DEVICE simd operator*(simd const& other) const { 185 | return simd(m_value * other.m_value); 186 | } 187 | SIMD_CUDA_ALWAYS_INLINE SIMD_HOST_DEVICE simd operator/(simd const& other) const { 188 | return simd(m_value / other.m_value); 189 | } 190 | SIMD_CUDA_ALWAYS_INLINE SIMD_HOST_DEVICE simd operator+(simd const& other) const { 191 | return simd(m_value + other.m_value); 192 | } 193 | SIMD_CUDA_ALWAYS_INLINE SIMD_HOST_DEVICE simd operator-(simd const& other) const { 194 | return simd(m_value - other.m_value); 195 | } 196 | SIMD_CUDA_ALWAYS_INLINE SIMD_HOST_DEVICE simd operator-() const { 197 | return simd(-m_value); 198 | } 199 | SIMD_CUDA_ALWAYS_INLINE SIMD_HOST_DEVICE void copy_from(T const* ptr, element_aligned_tag) { 200 | #ifdef __CUDA_ARCH__ 201 | m_value = ptr[threadIdx.x]; 202 | #endif 203 | } 204 | SIMD_CUDA_ALWAYS_INLINE SIMD_HOST_DEVICE void copy_to(T* ptr, element_aligned_tag) const { 205 | #ifdef __CUDA_ARCH__ 206 | ptr[threadIdx.x] = m_value; 207 | #endif 208 | } 209 | SIMD_CUDA_ALWAYS_INLINE SIMD_HOST_DEVICE T get() const { 210 | return m_value; 211 | } 212 | SIMD_CUDA_ALWAYS_INLINE SIMD_HOST_DEVICE 213 | mask_type operator<(simd const& other) const { 214 | return mask_type(m_value < other.m_value); 215 | } 216 | SIMD_CUDA_ALWAYS_INLINE SIMD_HOST_DEVICE 217 | mask_type operator==(simd const& other) const { 218 | return mask_type(m_value == other.m_value); 219 | } 220 | }; 221 | 222 | template 223 | SIMD_CUDA_ALWAYS_INLINE SIMD_HOST_DEVICE simd> abs(simd> const& a) { 224 | return simd>(std::abs(a.get())); 225 | } 226 | 227 | template 228 | SIMD_CUDA_ALWAYS_INLINE SIMD_HOST_DEVICE simd> sqrt(simd> const& a) { 229 | return simd>(std::sqrt(a.get())); 230 | } 231 | 232 | template 233 | SIMD_CUDA_ALWAYS_INLINE SIMD_HOST_DEVICE simd> cbrt(simd> const& a) { 234 | return simd>(std::cbrt(a.get())); 235 | } 236 | 237 | template 238 | SIMD_CUDA_ALWAYS_INLINE SIMD_HOST_DEVICE simd> exp(simd> const& a) { 239 | return simd>(std::exp(a.get())); 240 | } 241 | 242 | template 243 | SIMD_CUDA_ALWAYS_INLINE SIMD_HOST_DEVICE simd> fma( 244 | simd> const& a, 245 | simd> const& b, 246 | simd> const& c) { 247 | return simd>((a.get() * b.get()) + c.get()); 248 | } 249 | 250 | template 251 | SIMD_CUDA_ALWAYS_INLINE SIMD_HOST_DEVICE simd> max( 252 | simd> const& a, simd> const& b) { 253 | return simd>((a.get() < b.get()) ? b.get() : a.get()); 254 | } 255 | 256 | template 257 | SIMD_CUDA_ALWAYS_INLINE SIMD_HOST_DEVICE simd> min( 258 | simd> const& a, simd> const& b) { 259 | return simd>((b.get() < a.get()) ? 
b.get() : a.get()); 260 | } 261 | 262 | template 263 | SIMD_CUDA_ALWAYS_INLINE SIMD_HOST_DEVICE simd> choose( 264 | simd_mask> const& a, 265 | simd> const& b, 266 | simd> const& c) { 267 | return simd>(a.get() ? b.get() : c.get()); 268 | } 269 | 270 | } 271 | 272 | #endif 273 | -------------------------------------------------------------------------------- /hip_wavefront.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | //@HEADER 3 | // ************************************************************************ 4 | // 5 | // Kokkos v. 2.0 6 | // Copyright (2014) Sandia Corporation 7 | // 8 | // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, 9 | // the U.S. Government retains certain rights in this software. 10 | // 11 | // Redistribution and use in source and binary forms, with or without 12 | // modification, are permitted provided that the following conditions are 13 | // met: 14 | // 15 | // 1. Redistributions of source code must retain the above copyright 16 | // notice, this list of conditions and the following disclaimer. 17 | // 18 | // 2. Redistributions in binary form must reproduce the above copyright 19 | // notice, this list of conditions and the following disclaimer in the 20 | // documentation and/or other materials provided with the distribution. 21 | // 22 | // 3. Neither the name of the Corporation nor the names of the 23 | // contributors may be used to endorse or promote products derived from 24 | // this software without specific prior written permission. 25 | // 26 | // THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY 27 | // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 28 | // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 29 | // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE 30 | // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 31 | // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 32 | // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 33 | // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 34 | // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 35 | // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 36 | // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 37 | // 38 | // Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) 39 | // 40 | // ************************************************************************ 41 | //@HEADER 42 | */ 43 | 44 | #pragma once 45 | 46 | #include "simd_common.hpp" 47 | 48 | #ifdef __HIPCC__ 49 | #define SIMD_HIP_ALWAYS_INLINE __forceinline__ 50 | #endif 51 | 52 | #ifdef __HIPCC__ 53 | #define SIMD_HOST_DEVICE __host__ __device__ 54 | #else 55 | #define SIMD_HOST_DEVICE 56 | #endif 57 | 58 | #ifdef __HIPCC__ 59 | #define SIMD_DEVICE __device__ 60 | #else 61 | #define SIMD_DEVICE 62 | #endif 63 | 64 | #ifdef __HIPCC__ 65 | #include 66 | 67 | namespace SIMD_NAMESPACE { 68 | 69 | namespace simd_abi { 70 | 71 | template 72 | class hip_wavefront { 73 | static_assert(N <= 64, "HIP wavefronts can't be more than 64 threads"); 74 | public: 75 | SIMD_HOST_DEVICE static unsigned mask() { 76 | return (unsigned(1) << N) - unsigned(1); 77 | } 78 | }; 79 | 80 | } // SIMD ABI 81 | 82 | template 83 | class simd_storage> { 84 | T m_value[simd>::size()]; 85 | public: 86 | using value_type = T; 87 | using abi_type = simd_abi::hip_wavefront; 88 | using simd_type = simd; 89 | SIMD_ALWAYS_INLINE inline simd_storage() = default; 90 | SIMD_ALWAYS_INLINE SIMD_HOST_DEVICE inline static constexpr 91 | int size() { return simd::size(); } 92 | SIMD_ALWAYS_INLINE SIMD_HOST_DEVICE inline 93 | simd_storage(simd const& value) { 94 | value.copy_to(m_value, element_aligned_tag()); 95 | } 96 | SIMD_ALWAYS_INLINE SIMD_HOST_DEVICE explicit inline 97 | simd_storage(T value) 98 | :simd_storage(simd(value)) 99 | {} 100 | SIMD_ALWAYS_INLINE SIMD_HOST_DEVICE inline 101 | simd_storage& operator=(simd const& value) { 102 | value.copy_to(m_value, element_aligned_tag()); 103 | return *this; 104 | } 105 | SIMD_ALWAYS_INLINE SIMD_HOST_DEVICE 106 | T const* data() const { return m_value; } 107 | SIMD_ALWAYS_INLINE SIMD_HOST_DEVICE 108 | T* data() { return m_value; } 109 | SIMD_ALWAYS_INLINE SIMD_HOST_DEVICE 110 | T const& operator[](int i) const { return m_value[i]; } 111 | SIMD_ALWAYS_INLINE SIMD_HOST_DEVICE 112 | T& operator[](int i) { return m_value[i]; } 113 | }; 114 | 115 | template 116 | class simd_mask> { 117 | bool m_value; 118 | public: 119 | using value_type = bool; 120 | using abi_type = simd_abi::hip_wavefront; 121 | using simd_type = simd; 122 | SIMD_HIP_ALWAYS_INLINE simd_mask() = default; 123 | SIMD_ALWAYS_INLINE SIMD_HOST_DEVICE static constexpr 124 | int size() { return N; } 125 | SIMD_HIP_ALWAYS_INLINE SIMD_DEVICE 126 | simd_mask(bool value) 127 | :m_value(value) 128 | {} 129 | SIMD_HIP_ALWAYS_INLINE SIMD_DEVICE constexpr 130 | bool get() const { 131 | return m_value; 132 | } 133 | SIMD_HIP_ALWAYS_INLINE SIMD_DEVICE 134 | simd_mask operator||(simd_mask const& other) const { 135 | return m_value || other.m_value; 136 | } 137 | SIMD_HIP_ALWAYS_INLINE SIMD_DEVICE 138 | simd_mask operator&&(simd_mask const& other) const { 139 | return m_value && other.m_value; 140 | } 141 | SIMD_HIP_ALWAYS_INLINE SIMD_DEVICE 142 | simd_mask operator!() const { 143 | return !m_value; 144 | } 145 | }; 146 | 147 | template 148 | SIMD_HIP_ALWAYS_INLINE SIMD_DEVICE 149 | bool all_of(simd_mask> const& a) { 150 | return bool(__all_sync(simd_abi::hip_wavefront::mask(), int(a.get()))); 151 | } 152 | 153 | template 154 | SIMD_HIP_ALWAYS_INLINE SIMD_DEVICE 155 | bool any_of(simd_mask> const& a) { 156 | return bool(__any_sync(simd_abi::hip_wavefront::mask(), int(a.get()))); 157 | } 158 | 159 | template 160 | class simd> { 161 | T m_value; 162 | public: 163 | using value_type = T; 164 | using abi_type = 
simd_abi::hip_wavefront; 165 | using mask_type = simd_mask; 166 | using storage_type = simd_storage; 167 | SIMD_HIP_ALWAYS_INLINE simd() = default; 168 | SIMD_HIP_ALWAYS_INLINE SIMD_HOST_DEVICE static constexpr int size() { return N; } 169 | SIMD_HIP_ALWAYS_INLINE SIMD_DEVICE simd(T value) 170 | :m_value(value) 171 | {} 172 | SIMD_HIP_ALWAYS_INLINE SIMD_DEVICE 173 | simd(storage_type const& value) { 174 | copy_from(value.data(), element_aligned_tag()); 175 | } 176 | SIMD_HIP_ALWAYS_INLINE SIMD_DEVICE 177 | simd& operator=(storage_type const& value) { 178 | copy_from(value.data(), element_aligned_tag()); 179 | return *this; 180 | } 181 | template 182 | SIMD_HIP_ALWAYS_INLINE SIMD_DEVICE simd(T const* ptr, Flags flags) { 183 | copy_from(ptr, flags); 184 | } 185 | SIMD_HIP_ALWAYS_INLINE SIMD_DEVICE simd operator*(simd const& other) const { 186 | return simd(m_value * other.m_value); 187 | } 188 | SIMD_HIP_ALWAYS_INLINE SIMD_DEVICE simd operator/(simd const& other) const { 189 | return simd(m_value / other.m_value); 190 | } 191 | SIMD_HIP_ALWAYS_INLINE SIMD_DEVICE simd operator+(simd const& other) const { 192 | return simd(m_value + other.m_value); 193 | } 194 | SIMD_HIP_ALWAYS_INLINE SIMD_DEVICE simd operator-(simd const& other) const { 195 | return simd(m_value - other.m_value); 196 | } 197 | SIMD_HIP_ALWAYS_INLINE SIMD_DEVICE simd operator-() const { 198 | return simd(-m_value); 199 | } 200 | SIMD_HIP_ALWAYS_INLINE SIMD_DEVICE void copy_from(T const* ptr, element_aligned_tag) { 201 | m_value = ptr[hipThreadIdx_x]; 202 | } 203 | SIMD_HIP_ALWAYS_INLINE SIMD_DEVICE void copy_to(T* ptr, element_aligned_tag) const { 204 | ptr[hipThreadIdx_x] = m_value; 205 | } 206 | SIMD_HIP_ALWAYS_INLINE SIMD_DEVICE T get() const { 207 | return m_value; 208 | } 209 | SIMD_HIP_ALWAYS_INLINE SIMD_DEVICE 210 | mask_type operator<(simd const& other) const { 211 | return mask_type(m_value < other.m_value); 212 | } 213 | SIMD_HIP_ALWAYS_INLINE SIMD_DEVICE 214 | mask_type operator==(simd const& other) const { 215 | return mask_type(m_value == other.m_value); 216 | } 217 | }; 218 | 219 | // ABS 220 | template 221 | SIMD_HIP_ALWAYS_INLINE SIMD_DEVICE simd> abs(simd> const& a) { 222 | return simd>(::fabsf(a.get())); 223 | } 224 | 225 | template 226 | SIMD_HIP_ALWAYS_INLINE SIMD_DEVICE simd> abs(simd> const& a) { 227 | return simd>(::fabs(a.get())); 228 | } 229 | 230 | // SQRT 231 | template 232 | SIMD_HIP_ALWAYS_INLINE SIMD_DEVICE simd> sqrt(simd> const& a) { 233 | return simd>(::sqrtf(a.get())); 234 | } 235 | 236 | template 237 | SIMD_HIP_ALWAYS_INLINE SIMD_DEVICE simd> sqrt(simd> const& a) { 238 | return simd>(::sqrt(a.get())); 239 | } 240 | 241 | // CBRT 242 | template 243 | SIMD_HIP_ALWAYS_INLINE SIMD_DEVICE simd> cbrt(simd> const& a) { 244 | return simd>(::cbrtf(a.get())); 245 | } 246 | 247 | template 248 | SIMD_HIP_ALWAYS_INLINE SIMD_DEVICE simd> cbrt(simd> const& a) { 249 | return simd>(::cbrt(a.get())); 250 | } 251 | 252 | // EXP 253 | template 254 | SIMD_HIP_ALWAYS_INLINE SIMD_DEVICE simd> exp(simd> const& a) { 255 | return simd>(::expf(a.get())); 256 | } 257 | 258 | 259 | template 260 | SIMD_HIP_ALWAYS_INLINE SIMD_DEVICE simd> exp(simd> const& a) { 261 | return simd>(::exp(a.get())); 262 | } 263 | 264 | template 265 | SIMD_HIP_ALWAYS_INLINE SIMD_HOST_DEVICE simd> fma( 266 | simd> const& a, 267 | simd> const& b, 268 | simd> const& c) { 269 | return simd>((a.get() * b.get()) + c.get()); 270 | } 271 | 272 | template 273 | SIMD_HIP_ALWAYS_INLINE SIMD_HOST_DEVICE simd> max( 274 | simd> const& a, simd> const& b) { 
275 | return simd>((a.get() < b.get()) ? b.get() : a.get()); 276 | } 277 | 278 | template 279 | SIMD_HIP_ALWAYS_INLINE SIMD_HOST_DEVICE simd> min( 280 | simd> const& a, simd> const& b) { 281 | return simd>((b.get() < a.get()) ? b.get() : a.get()); 282 | } 283 | 284 | template 285 | SIMD_HIP_ALWAYS_INLINE SIMD_HOST_DEVICE simd> choose( 286 | simd_mask> const& a, 287 | simd> const& b, 288 | simd> const& c) { 289 | return simd>(a.get() ? b.get() : c.get()); 290 | } 291 | 292 | } // SIMD_NAMESPACE 293 | 294 | #endif 295 | -------------------------------------------------------------------------------- /neon.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | //@HEADER 3 | // ************************************************************************ 4 | // 5 | // Kokkos v. 2.0 6 | // Copyright (2014) Sandia Corporation 7 | // 8 | // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, 9 | // the U.S. Government retains certain rights in this software. 10 | // 11 | // Redistribution and use in source and binary forms, with or without 12 | // modification, are permitted provided that the following conditions are 13 | // met: 14 | // 15 | // 1. Redistributions of source code must retain the above copyright 16 | // notice, this list of conditions and the following disclaimer. 17 | // 18 | // 2. Redistributions in binary form must reproduce the above copyright 19 | // notice, this list of conditions and the following disclaimer in the 20 | // documentation and/or other materials provided with the distribution. 21 | // 22 | // 3. Neither the name of the Corporation nor the names of the 23 | // contributors may be used to endorse or promote products derived from 24 | // this software without specific prior written permission. 25 | // 26 | // THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY 27 | // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 28 | // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 29 | // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE 30 | // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 31 | // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 32 | // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 33 | // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 34 | // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 35 | // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 36 | // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 37 | // 38 | // Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) 39 | // 40 | // ************************************************************************ 41 | //@HEADER 42 | */ 43 | 44 | #pragma once 45 | 46 | #include "simd_common.hpp" 47 | 48 | #ifdef __ARM_NEON 49 | 50 | #include 51 | 52 | namespace SIMD_NAMESPACE { 53 | 54 | namespace simd_abi { 55 | 56 | class neon {}; 57 | 58 | } 59 | 60 | template <> 61 | class simd_mask { 62 | uint32x4_t m_value; 63 | public: 64 | using value_type = bool; 65 | using simd_type = simd_mask; 66 | using abi_type = simd_abi::neon; 67 | SIMD_ALWAYS_INLINE inline simd_mask() = default; 68 | SIMD_ALWAYS_INLINE inline simd_mask(bool value) 69 | :m_value(vreinterpretq_u32_s32(vdupq_n_s32(-int(value)))) 70 | {} 71 | SIMD_ALWAYS_INLINE inline static constexpr int size() { return 4; } 72 | SIMD_ALWAYS_INLINE inline constexpr simd_mask(uint32x4_t const& value_in) 73 | :m_value(value_in) 74 | {} 75 | SIMD_ALWAYS_INLINE inline constexpr uint32x4_t get() const { return m_value; } 76 | SIMD_ALWAYS_INLINE inline simd_mask operator||(simd_mask const& other) const { 77 | return simd_mask(vorrq_u32(m_value, other.m_value)); 78 | } 79 | SIMD_ALWAYS_INLINE inline simd_mask operator&&(simd_mask const& other) const { 80 | return simd_mask(vandq_u32(m_value, other.m_value)); 81 | } 82 | SIMD_ALWAYS_INLINE inline simd_mask operator!() const { 83 | return simd_mask(vmvnq_u32(m_value)); 84 | } 85 | }; 86 | 87 | SIMD_ALWAYS_INLINE inline bool all_of(simd_mask const& a) { 88 | return vminvq_u32(a.get()) == std::uint32_t(-std::int32_t(1)); 89 | } 90 | 91 | SIMD_ALWAYS_INLINE inline bool any_of(simd_mask const& a) { 92 | return vmaxvq_u32(a.get()) == std::uint32_t(-std::int32_t(1)); 93 | } 94 | 95 | template <> 96 | class simd { 97 | float32x4_t m_value; 98 | public: 99 | using value_type = float; 100 | using abi_type = simd_abi::neon; 101 | using mask_type = simd_mask; 102 | using storage_type = simd_storage; 103 | SIMD_ALWAYS_INLINE inline simd() = default; 104 | SIMD_ALWAYS_INLINE inline static constexpr int size() { return 4; } 105 | SIMD_ALWAYS_INLINE inline simd(float value) 106 | :m_value(vdupq_n_f32(value)) 107 | {} 108 | SIMD_ALWAYS_INLINE inline simd(float a, float b, float c, float d) 109 | :m_value((float32x4_t){a, b, c, d}) 110 | {} 111 | SIMD_ALWAYS_INLINE inline 112 | simd(storage_type const& value) { 113 | copy_from(value.data(), element_aligned_tag()); 114 | } 115 | SIMD_ALWAYS_INLINE inline 116 | simd& operator=(storage_type const& value) { 117 | copy_from(value.data(), element_aligned_tag()); 118 | return *this; 119 | } 120 | template 121 | SIMD_ALWAYS_INLINE inline simd(float const* ptr, Flags flags) { 122 | copy_from(ptr, flags); 123 | } 124 | SIMD_ALWAYS_INLINE inline simd(float const* ptr, int stride) 125 | :simd(ptr[0], ptr[stride], ptr[2*stride], ptr[3*stride]) 126 | {} 127 | SIMD_ALWAYS_INLINE inline constexpr simd(float32x4_t const& value_in) 128 | :m_value(value_in) 129 | {} 130 | SIMD_ALWAYS_INLINE inline simd operator*(simd const& other) const { 131 | return simd(vmulq_f32(m_value, other.m_value)); 132 | } 133 | SIMD_ALWAYS_INLINE inline simd operator/(simd const& other) const { 134 | return simd(vdivq_f32(m_value, other.m_value)); 135 | } 136 | SIMD_ALWAYS_INLINE inline simd operator+(simd const& other) const { 137 | return simd(vaddq_f32(m_value, other.m_value)); 138 | } 139 | SIMD_ALWAYS_INLINE inline simd operator-(simd const& other) const { 140 | return simd(vsubq_f32(m_value, other.m_value)); 141 | } 142 | SIMD_ALWAYS_INLINE inline simd operator-() const { 143 | return 
simd(vnegq_f32(m_value)); 144 | } 145 | SIMD_ALWAYS_INLINE inline void copy_from(float const* ptr, element_aligned_tag) { 146 | m_value = vld1q_f32(ptr); 147 | } 148 | SIMD_ALWAYS_INLINE inline void copy_to(float* ptr, element_aligned_tag) const { 149 | vst1q_f32(ptr, m_value); 150 | } 151 | SIMD_ALWAYS_INLINE inline constexpr float32x4_t get() const { return m_value; } 152 | SIMD_ALWAYS_INLINE simd_mask operator<(simd const& other) const { 153 | return simd_mask(vcltq_f32(m_value, other.m_value)); 154 | } 155 | SIMD_ALWAYS_INLINE inline simd_mask operator==(simd const& other) const { 156 | return simd_mask(vceqq_f32(m_value, other.m_value)); 157 | } 158 | }; 159 | 160 | SIMD_ALWAYS_INLINE inline simd abs(simd const& a) { 161 | return simd(vabsq_f32(a.get())); 162 | } 163 | 164 | SIMD_ALWAYS_INLINE inline simd sqrt(simd const& a) { 165 | return simd(vsqrtq_f32(a.get())); 166 | } 167 | 168 | SIMD_ALWAYS_INLINE inline simd fma( 169 | simd const& a, 170 | simd const& b, 171 | simd const& c) { 172 | return simd(vfmaq_f32(c.get(), b.get(), a.get())); 173 | } 174 | 175 | SIMD_ALWAYS_INLINE inline simd max( 176 | simd const& a, simd const& b) { 177 | return simd(vmaxq_f32(a.get(), b.get())); 178 | } 179 | 180 | SIMD_ALWAYS_INLINE inline simd min( 181 | simd const& a, simd const& b) { 182 | return simd(vminq_f32(a.get(), b.get())); 183 | } 184 | 185 | SIMD_ALWAYS_INLINE inline simd choose( 186 | simd_mask const& a, simd const& b, simd const& c) { 187 | return simd( 188 | vreinterpretq_f32_u32( 189 | vbslq_u32( 190 | a.get(), 191 | vreinterpretq_u32_f32(b.get()), 192 | vreinterpretq_u32_f32(c.get())))); 193 | } 194 | 195 | template <> 196 | class simd_mask { 197 | uint64x2_t m_value; 198 | public: 199 | using value_type = bool; 200 | using simd_type = simd; 201 | using abi_type = simd_abi::neon; 202 | SIMD_ALWAYS_INLINE inline simd_mask() = default; 203 | SIMD_ALWAYS_INLINE inline simd_mask(bool value) 204 | :m_value(vreinterpretq_u64_s64(vdupq_n_s64(-std::int64_t(value)))) 205 | {} 206 | SIMD_ALWAYS_INLINE inline static constexpr int size() { return 4; } 207 | SIMD_ALWAYS_INLINE inline constexpr simd_mask(uint64x2_t const& value_in) 208 | :m_value(value_in) 209 | {} 210 | SIMD_ALWAYS_INLINE inline constexpr uint64x2_t get() const { return m_value; } 211 | SIMD_ALWAYS_INLINE inline simd_mask operator||(simd_mask const& other) const { 212 | return simd_mask(vorrq_u64(m_value, other.m_value)); 213 | } 214 | SIMD_ALWAYS_INLINE inline simd_mask operator&&(simd_mask const& other) const { 215 | return simd_mask(vandq_u64(m_value, other.m_value)); 216 | } 217 | SIMD_ALWAYS_INLINE inline simd_mask operator!() const { 218 | return simd_mask(vreinterpretq_u64_u32(vmvnq_u32(vreinterpretq_u32_u64(m_value)))); 219 | } 220 | }; 221 | 222 | SIMD_ALWAYS_INLINE inline bool all_of(simd_mask const& a) { 223 | return all_of(simd_mask(vreinterpretq_u32_u64(a.get()))); 224 | } 225 | 226 | SIMD_ALWAYS_INLINE inline bool any_of(simd_mask const& a) { 227 | return any_of(simd_mask(vreinterpretq_u32_u64(a.get()))); 228 | } 229 | 230 | template <> 231 | class simd { 232 | float64x2_t m_value; 233 | public: 234 | using value_type = double; 235 | using abi_type = simd_abi::neon; 236 | using mask_type = simd_mask; 237 | using storage_type = simd_storage; 238 | SIMD_ALWAYS_INLINE inline simd() = default; 239 | SIMD_ALWAYS_INLINE inline simd(simd const&) = default; 240 | SIMD_ALWAYS_INLINE inline simd(simd&&) = default; 241 | SIMD_ALWAYS_INLINE inline simd& operator=(simd const&) = default; 242 | SIMD_ALWAYS_INLINE inline simd& 
operator=(simd&&) = default; 243 | SIMD_ALWAYS_INLINE inline static constexpr int size() { return 2; } 244 | SIMD_ALWAYS_INLINE inline simd(double value) 245 | :m_value(vdupq_n_f64(value)) 246 | {} 247 | SIMD_ALWAYS_INLINE inline simd(double a, double b) 248 | :m_value((float64x2_t){a, b}) 249 | {} 250 | SIMD_ALWAYS_INLINE inline 251 | simd(storage_type const& value) { 252 | copy_from(value.data(), element_aligned_tag()); 253 | } 254 | #ifdef STK_VOLATILE_SIMD 255 | SIMD_ALWAYS_INLINE inline 256 | simd(simd const volatile& value) 257 | :m_value(value.m_value) 258 | {} 259 | #endif 260 | SIMD_ALWAYS_INLINE inline 261 | simd& operator=(storage_type const& value) { 262 | copy_from(value.data(), element_aligned_tag()); 263 | return *this; 264 | } 265 | template 266 | SIMD_ALWAYS_INLINE inline simd(double const* ptr, Flags flags) { 267 | copy_from(ptr, flags); 268 | } 269 | SIMD_ALWAYS_INLINE inline simd(double const* ptr, int stride) 270 | :simd(ptr[0], ptr[stride]) 271 | {} 272 | SIMD_ALWAYS_INLINE inline constexpr simd(float64x2_t const& value_in) 273 | :m_value(value_in) 274 | {} 275 | SIMD_ALWAYS_INLINE inline simd operator*(simd const& other) const { 276 | return simd(vmulq_f64(m_value, other.m_value)); 277 | } 278 | SIMD_ALWAYS_INLINE inline simd operator/(simd const& other) const { 279 | return simd(vdivq_f64(m_value, other.m_value)); 280 | } 281 | SIMD_ALWAYS_INLINE inline simd operator+(simd const& other) const { 282 | return simd(vaddq_f64(m_value, other.m_value)); 283 | } 284 | #ifdef STK_VOLATILE_SIMD 285 | SIMD_ALWAYS_INLINE inline void plus_equals(simd const volatile& other) volatile { 286 | m_value = vaddq_f64(m_value, other.m_value); 287 | } 288 | #endif 289 | SIMD_ALWAYS_INLINE inline simd operator-(simd const& other) const { 290 | return simd(vsubq_f64(m_value, other.m_value)); 291 | } 292 | SIMD_ALWAYS_INLINE inline simd operator-() const { 293 | return simd(vnegq_f64(m_value)); 294 | } 295 | SIMD_ALWAYS_INLINE inline void copy_from(double const* ptr, element_aligned_tag) { 296 | m_value = vld1q_f64(ptr); 297 | } 298 | SIMD_ALWAYS_INLINE inline void copy_to(double* ptr, element_aligned_tag) const { 299 | vst1q_f64(ptr, m_value); 300 | } 301 | SIMD_ALWAYS_INLINE inline constexpr float64x2_t get() const { return m_value; } 302 | SIMD_ALWAYS_INLINE inline simd_mask operator<(simd const& other) const { 303 | return simd_mask(vcltq_f64(m_value, other.m_value)); 304 | } 305 | SIMD_ALWAYS_INLINE inline simd_mask operator==(simd const& other) const { 306 | return simd_mask(vceqq_f64(m_value, other.m_value)); 307 | } 308 | }; 309 | 310 | SIMD_ALWAYS_INLINE inline simd abs(simd const& a) { 311 | return simd(vabsq_f64(a.get())); 312 | } 313 | 314 | SIMD_ALWAYS_INLINE inline simd sqrt(simd const& a) { 315 | return simd(vsqrtq_f64(a.get())); 316 | } 317 | 318 | SIMD_ALWAYS_INLINE inline simd fma( 319 | simd const& a, 320 | simd const& b, 321 | simd const& c) { 322 | return simd(vfmaq_f64(c.get(), b.get(), a.get())); 323 | } 324 | 325 | SIMD_ALWAYS_INLINE inline simd max( 326 | simd const& a, simd const& b) { 327 | return simd(vmaxq_f64(a.get(), b.get())); 328 | } 329 | 330 | SIMD_ALWAYS_INLINE inline simd min( 331 | simd const& a, simd const& b) { 332 | return simd(vminq_f64(a.get(), b.get())); 333 | } 334 | 335 | SIMD_ALWAYS_INLINE inline simd choose( 336 | simd_mask const& a, simd const& b, simd const& c) { 337 | return simd( 338 | vreinterpretq_f64_u64( 339 | vbslq_u64( 340 | a.get(), 341 | vreinterpretq_u64_f64(b.get()), 342 | vreinterpretq_u64_f64(c.get())))); 343 | } 344 | 345 | } 
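// Annotation (not part of the original header): a minimal usage sketch for the NEON
// double-precision type defined above, assuming the reconstructed template arguments
// simd<double, simd_abi::neon>. The function name neon_clamp_example is hypothetical and
// only illustrates the branch-free pattern that the max()/min()/choose() overloads provide.
SIMD_ALWAYS_INLINE inline SIMD_NAMESPACE::simd<double, SIMD_NAMESPACE::simd_abi::neon>
neon_clamp_example(
    SIMD_NAMESPACE::simd<double, SIMD_NAMESPACE::simd_abi::neon> const& x,
    SIMD_NAMESPACE::simd<double, SIMD_NAMESPACE::simd_abi::neon> const& lo,
    SIMD_NAMESPACE::simd<double, SIMD_NAMESPACE::simd_abi::neon> const& hi) {
  // min(max(x, lo), hi): each call lowers to a single vmaxq_f64 / vminq_f64 above,
  // so the clamp is evaluated without branches across both lanes.
  return SIMD_NAMESPACE::min(SIMD_NAMESPACE::max(x, lo), hi);
}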
346 | 347 | #endif 348 | -------------------------------------------------------------------------------- /pack.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | //@HEADER 3 | // ************************************************************************ 4 | // 5 | // Kokkos v. 2.0 6 | // Copyright (2014) Sandia Corporation 7 | // 8 | // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, 9 | // the U.S. Government retains certain rights in this software. 10 | // 11 | // Redistribution and use in source and binary forms, with or without 12 | // modification, are permitted provided that the following conditions are 13 | // met: 14 | // 15 | // 1. Redistributions of source code must retain the above copyright 16 | // notice, this list of conditions and the following disclaimer. 17 | // 18 | // 2. Redistributions in binary form must reproduce the above copyright 19 | // notice, this list of conditions and the following disclaimer in the 20 | // documentation and/or other materials provided with the distribution. 21 | // 22 | // 3. Neither the name of the Corporation nor the names of the 23 | // contributors may be used to endorse or promote products derived from 24 | // this software without specific prior written permission. 25 | // 26 | // THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY 27 | // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 28 | // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 29 | // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE 30 | // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 31 | // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 32 | // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 33 | // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 34 | // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 35 | // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 36 | // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 37 | // 38 | // Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) 39 | // 40 | // ************************************************************************ 41 | //@HEADER 42 | */ 43 | 44 | #pragma once 45 | 46 | #include "simd_common.hpp" 47 | 48 | namespace SIMD_NAMESPACE { 49 | 50 | namespace simd_abi { 51 | 52 | template 53 | class pack; 54 | 55 | } 56 | 57 | template 58 | class simd_mask> { 59 | int m_value[N]; 60 | public: 61 | using value_type = bool; 62 | using simd_type = simd>; 63 | using abi_type = simd_abi::pack; 64 | SIMD_ALWAYS_INLINE inline simd_mask() = default; 65 | SIMD_ALWAYS_INLINE inline static constexpr int size() { return N; } 66 | SIMD_ALWAYS_INLINE inline simd_mask(bool value) { 67 | SIMD_PRAGMA for (int i = 0; i < size(); ++i) m_value[i] = value; 68 | } 69 | SIMD_ALWAYS_INLINE inline constexpr bool operator[](int i) const { return m_value[i]; } 70 | SIMD_ALWAYS_INLINE inline int& operator[](int i) { return m_value[i]; } 71 | SIMD_ALWAYS_INLINE inline simd_mask operator||(simd_mask const& other) const { 72 | simd_mask result; 73 | SIMD_PRAGMA for (int i = 0; i < size(); ++i) result.m_value[i] = m_value[i] || other.m_value[i]; 74 | return result; 75 | } 76 | SIMD_ALWAYS_INLINE inline simd_mask operator&&(simd_mask const& other) const { 77 | simd_mask result; 78 | SIMD_PRAGMA for (int i = 0; i < size(); ++i) result.m_value[i] = m_value[i] && other.m_value[i]; 79 | return result; 80 | } 81 | SIMD_ALWAYS_INLINE inline simd_mask operator!() const { 82 | simd_mask result; 83 | SIMD_PRAGMA for (int i = 0; i < size(); ++i) result.m_value[i] = !m_value[i]; 84 | return result; 85 | } 86 | }; 87 | 88 | template 89 | class simd_mask> { 90 | std::int64_t m_value[N]; 91 | public: 92 | using value_type = bool; 93 | using simd_type = simd>; 94 | using abi_type = simd_abi::pack; 95 | SIMD_ALWAYS_INLINE inline simd_mask() = default; 96 | SIMD_ALWAYS_INLINE inline static constexpr int size() { return N; } 97 | SIMD_ALWAYS_INLINE inline simd_mask(bool value) { 98 | SIMD_PRAGMA for (int i = 0; i < size(); ++i) m_value[i] = value; 99 | } 100 | SIMD_ALWAYS_INLINE inline constexpr bool operator[](int i) const { return m_value[i]; } 101 | SIMD_ALWAYS_INLINE inline std::int64_t& operator[](int i) { return m_value[i]; } 102 | SIMD_ALWAYS_INLINE inline simd_mask operator||(simd_mask const& other) const { 103 | simd_mask result; 104 | SIMD_PRAGMA for (int i = 0; i < size(); ++i) result.m_value[i] = m_value[i] || other.m_value[i]; 105 | return result; 106 | } 107 | SIMD_ALWAYS_INLINE inline simd_mask operator&&(simd_mask const& other) const { 108 | simd_mask result; 109 | SIMD_PRAGMA for (int i = 0; i < size(); ++i) result.m_value[i] = m_value[i] && other.m_value[i]; 110 | return result; 111 | } 112 | SIMD_ALWAYS_INLINE inline simd_mask operator!() const { 113 | simd_mask result; 114 | SIMD_PRAGMA for (int i = 0; i < size(); ++i) result.m_value[i] = !m_value[i]; 115 | return result; 116 | } 117 | }; 118 | 119 | template 120 | SIMD_ALWAYS_INLINE SIMD_HOST_DEVICE inline bool all_of(simd_mask> const& a) { 121 | bool result = true; 122 | SIMD_PRAGMA for (int i = 0; i < a.size(); ++i) result = result && a[i]; 123 | return result; 124 | } 125 | 126 | template 127 | SIMD_ALWAYS_INLINE SIMD_HOST_DEVICE inline bool any_of(simd_mask> const& a) { 128 | bool result = false; 129 | SIMD_PRAGMA for (int i = 0; i < a.size(); ++i) result = result || a[i]; 130 | return result; 131 | } 132 | 133 | template 134 | class simd> { 135 | T m_value[N]; 136 | public: 137 | using value_type = T; 138 | using abi_type = simd_abi::pack; 139 | using mask_type = 
simd_mask; 140 | using storage_type = simd_storage; 141 | SIMD_ALWAYS_INLINE inline simd() = default; 142 | SIMD_ALWAYS_INLINE inline static constexpr int size() { return N; } 143 | SIMD_ALWAYS_INLINE inline simd(T value) 144 | { 145 | SIMD_PRAGMA for (int i = 0; i < size(); ++i) m_value[i] = value; 146 | } 147 | SIMD_ALWAYS_INLINE SIMD_HOST_DEVICE inline 148 | simd(storage_type const& value) { 149 | copy_from(value.data(), element_aligned_tag()); 150 | } 151 | SIMD_ALWAYS_INLINE SIMD_HOST_DEVICE inline 152 | simd& operator=(storage_type const& value) { 153 | copy_from(value.data(), element_aligned_tag()); 154 | return *this; 155 | } 156 | template 157 | SIMD_ALWAYS_INLINE simd(T const* ptr, Flags flags) { 158 | copy_from(ptr, flags); 159 | } 160 | SIMD_ALWAYS_INLINE simd operator*(simd const& other) const { 161 | simd result; 162 | SIMD_PRAGMA for (int i = 0; i < size(); ++i) result[i] = m_value[i] * other.m_value[i]; 163 | return result; 164 | } 165 | SIMD_ALWAYS_INLINE simd operator/(simd const& other) const { 166 | simd result; 167 | SIMD_PRAGMA for (int i = 0; i < size(); ++i) result[i] = m_value[i] / other.m_value[i]; 168 | return result; 169 | } 170 | SIMD_ALWAYS_INLINE simd operator+(simd const& other) const { 171 | simd result; 172 | SIMD_PRAGMA for (int i = 0; i < size(); ++i) result[i] = m_value[i] + other.m_value[i]; 173 | return result; 174 | } 175 | SIMD_ALWAYS_INLINE simd operator-(simd const& other) const { 176 | simd result; 177 | SIMD_PRAGMA for (int i = 0; i < size(); ++i) result[i] = m_value[i] - other.m_value[i]; 178 | return result; 179 | } 180 | SIMD_ALWAYS_INLINE SIMD_HOST_DEVICE inline simd operator-() const { 181 | simd result; 182 | SIMD_PRAGMA for (int i = 0; i < size(); ++i) result[i] = -m_value[i]; 183 | return result; 184 | } 185 | SIMD_ALWAYS_INLINE void copy_from(T const* ptr, element_aligned_tag) { 186 | SIMD_PRAGMA for (int i = 0; i < size(); ++i) m_value[i] = ptr[i]; 187 | } 188 | SIMD_ALWAYS_INLINE void copy_to(T* ptr, element_aligned_tag) const { 189 | SIMD_PRAGMA for (int i = 0; i < size(); ++i) ptr[i] = m_value[i]; 190 | } 191 | SIMD_ALWAYS_INLINE constexpr T operator[](int i) const { return m_value[i]; } 192 | SIMD_ALWAYS_INLINE T& operator[](int i) { return m_value[i]; } 193 | SIMD_ALWAYS_INLINE simd_mask> operator<(simd const& other) const { 194 | simd_mask> result; 195 | SIMD_PRAGMA for (int i = 0; i < size(); ++i) result[i] = m_value[i] < other.m_value[i]; 196 | return result; 197 | } 198 | SIMD_ALWAYS_INLINE simd_mask> operator==(simd const& other) const { 199 | simd_mask> result; 200 | SIMD_PRAGMA for (int i = 0; i < size(); ++i) result[i] = m_value[i] == other.m_value[i]; 201 | return result; 202 | } 203 | }; 204 | 205 | template 206 | SIMD_ALWAYS_INLINE SIMD_HOST_DEVICE inline simd> abs(simd> const& a) { 207 | simd> result; 208 | using std::abs; 209 | SIMD_PRAGMA for (int i = 0; i < a.size(); ++i) result[i] = abs(a[i]); 210 | return result; 211 | } 212 | 213 | template 214 | SIMD_ALWAYS_INLINE SIMD_HOST_DEVICE inline simd> sqrt(simd> const& a) { 215 | simd> result; 216 | using std::sqrt; 217 | SIMD_PRAGMA for (int i = 0; i < a.size(); ++i) result[i] = sqrt(a[i]); 218 | return result; 219 | } 220 | 221 | template 222 | SIMD_ALWAYS_INLINE SIMD_HOST_DEVICE inline simd> cbrt(simd> const& a) { 223 | simd> result; 224 | using std::cbrt; 225 | SIMD_PRAGMA for (int i = 0; i < a.size(); ++i) result[i] = cbrt(a[i]); 226 | return result; 227 | } 228 | 229 | template 230 | SIMD_ALWAYS_INLINE SIMD_HOST_DEVICE inline simd> exp(simd> const& a) { 231 | 
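// per-lane fallback: each element goes through std::exp; the SIMD_PRAGMA on the loop
// below only asks the compiler to vectorize it, it does not select a vector math library.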
simd> result; 232 | using std::exp; 233 | SIMD_PRAGMA for (int i = 0; i < a.size(); ++i) result[i] = exp(a[i]); 234 | return result; 235 | } 236 | 237 | template 238 | SIMD_ALWAYS_INLINE SIMD_HOST_DEVICE inline simd> fma( 239 | simd> const& a, 240 | simd> const& b, 241 | simd> const& c) { 242 | simd> result; 243 | SIMD_PRAGMA for (int i = 0; i < a.size(); ++i) result[i] = (a[i] * b[i]) + c[i]; 244 | return result; 245 | } 246 | 247 | template 248 | SIMD_ALWAYS_INLINE SIMD_HOST_DEVICE inline simd> max( 249 | simd> const& a, simd> const& b) { 250 | simd> result; 251 | SIMD_PRAGMA 252 | for (int i = 0; i < a.size(); ++i) { 253 | result[i] = choose((a[i] < b[i]), b[i], a[i]); 254 | } 255 | return result; 256 | } 257 | 258 | template 259 | SIMD_ALWAYS_INLINE SIMD_HOST_DEVICE inline simd> min( 260 | simd> const& a, simd> const& b) { 261 | simd> result; 262 | SIMD_PRAGMA 263 | for (int i = 0; i < a.size(); ++i) { 264 | result[i] = choose((b[i] < a[i]), b[i], a[i]); 265 | } 266 | return result; 267 | } 268 | 269 | template 270 | SIMD_ALWAYS_INLINE SIMD_HOST_DEVICE inline simd> choose( 271 | simd_mask> const& a, simd> const& b, simd> const& c) { 272 | simd> result; 273 | SIMD_PRAGMA for (int i = 0; i < a.size(); ++i) result[i] = a[i] ? b[i] : c[i]; 274 | return result; 275 | } 276 | 277 | } 278 | -------------------------------------------------------------------------------- /scalar.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | //@HEADER 3 | // ************************************************************************ 4 | // 5 | // Kokkos v. 2.0 6 | // Copyright (2014) Sandia Corporation 7 | // 8 | // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, 9 | // the U.S. Government retains certain rights in this software. 10 | // 11 | // Redistribution and use in source and binary forms, with or without 12 | // modification, are permitted provided that the following conditions are 13 | // met: 14 | // 15 | // 1. Redistributions of source code must retain the above copyright 16 | // notice, this list of conditions and the following disclaimer. 17 | // 18 | // 2. Redistributions in binary form must reproduce the above copyright 19 | // notice, this list of conditions and the following disclaimer in the 20 | // documentation and/or other materials provided with the distribution. 21 | // 22 | // 3. Neither the name of the Corporation nor the names of the 23 | // contributors may be used to endorse or promote products derived from 24 | // this software without specific prior written permission. 25 | // 26 | // THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY 27 | // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 28 | // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 29 | // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE 30 | // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 31 | // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 32 | // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 33 | // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 34 | // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 35 | // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 36 | // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 37 | // 38 | // Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) 39 | // 40 | // ************************************************************************ 41 | //@HEADER 42 | */ 43 | 44 | #pragma once 45 | 46 | #include "simd_common.hpp" 47 | 48 | namespace SIMD_NAMESPACE { 49 | 50 | namespace simd_abi { 51 | 52 | class scalar {}; 53 | 54 | } 55 | 56 | template 57 | class simd_mask { 58 | bool m_value; 59 | public: 60 | using value_type = bool; 61 | using simd_type = simd; 62 | using abi_type = simd_abi::scalar; 63 | SIMD_ALWAYS_INLINE inline simd_mask() = default; 64 | SIMD_ALWAYS_INLINE SIMD_HOST_DEVICE static constexpr int size() { return 1; } 65 | SIMD_ALWAYS_INLINE SIMD_HOST_DEVICE inline simd_mask(bool value) 66 | :m_value(value) 67 | {} 68 | SIMD_ALWAYS_INLINE SIMD_HOST_DEVICE inline constexpr bool get() const { return m_value; } 69 | SIMD_ALWAYS_INLINE SIMD_HOST_DEVICE inline simd_mask operator||(simd_mask const& other) const { 70 | return m_value || other.m_value; 71 | } 72 | SIMD_ALWAYS_INLINE SIMD_HOST_DEVICE inline simd_mask operator&&(simd_mask const& other) const { 73 | return m_value && other.m_value; 74 | } 75 | SIMD_ALWAYS_INLINE SIMD_HOST_DEVICE inline simd_mask operator!() const { 76 | return !m_value; 77 | } 78 | }; 79 | 80 | template 81 | class simd_storage { 82 | using Abi = simd_abi::scalar; 83 | T m_value; 84 | public: 85 | using value_type = T; 86 | using simd_type = simd; 87 | SIMD_ALWAYS_INLINE inline simd_storage() = default; 88 | SIMD_ALWAYS_INLINE inline static constexpr 89 | int size() { return simd::size(); } 90 | SIMD_ALWAYS_INLINE explicit SIMD_HOST_DEVICE 91 | simd_storage(simd const& value) SIMD_HOST_DEVICE { 92 | value.copy_to(&m_value, element_aligned_tag()); 93 | } 94 | SIMD_ALWAYS_INLINE explicit SIMD_HOST_DEVICE 95 | simd_storage(T value) 96 | :simd_storage(simd(value)) 97 | {} 98 | SIMD_ALWAYS_INLINE SIMD_HOST_DEVICE 99 | simd_storage& operator=(simd const& value) { 100 | value.copy_to(&m_value, element_aligned_tag()); 101 | return *this; 102 | } 103 | SIMD_ALWAYS_INLINE SIMD_HOST_DEVICE 104 | T const* data() const { return &m_value; } 105 | SIMD_ALWAYS_INLINE SIMD_HOST_DEVICE 106 | T* data() { return &m_value; } 107 | SIMD_ALWAYS_INLINE SIMD_HOST_DEVICE 108 | T const& operator[](int) const { return m_value; } 109 | SIMD_ALWAYS_INLINE SIMD_HOST_DEVICE 110 | T& operator[](int) { return m_value; } 111 | }; 112 | 113 | template 114 | SIMD_ALWAYS_INLINE SIMD_HOST_DEVICE inline 115 | bool all_of(simd_mask const& a) { return a.get(); } 116 | 117 | template 118 | SIMD_ALWAYS_INLINE SIMD_HOST_DEVICE inline 119 | bool any_of(simd_mask const& a) { return a.get(); } 120 | 121 | template 122 | class simd { 123 | T m_value; 124 | public: 125 | using value_type = T; 126 | using abi_type = simd_abi::scalar; 127 | using mask_type = simd_mask; 128 | using storage_type = simd_storage; 129 | SIMD_ALWAYS_INLINE inline simd() = default; 130 | SIMD_ALWAYS_INLINE inline simd(simd const&) = default; 131 | SIMD_ALWAYS_INLINE inline simd(simd&&) = default; 132 | SIMD_ALWAYS_INLINE inline simd& operator=(simd const&) = default; 133 | SIMD_ALWAYS_INLINE inline simd& operator=(simd&&) = default; 134 | SIMD_ALWAYS_INLINE SIMD_HOST_DEVICE static constexpr int size() { return 1; } 135 | SIMD_ALWAYS_INLINE SIMD_HOST_DEVICE inline simd(T value) 136 | :m_value(value) 137 | {} 138 | SIMD_ALWAYS_INLINE SIMD_HOST_DEVICE inline 139 | simd(storage_type const& value) { 140 | copy_from(value.data(), element_aligned_tag()); 141 | } 142 | #ifdef STK_VOLATILE_SIMD 143 | SIMD_ALWAYS_INLINE SIMD_HOST_DEVICE inline 144 | simd(simd 
const volatile& value) 145 | :m_value(value.m_value) 146 | {} 147 | #endif 148 | SIMD_ALWAYS_INLINE SIMD_HOST_DEVICE inline 149 | simd& operator=(storage_type const& value) { 150 | copy_from(value.data(), element_aligned_tag()); 151 | return *this; 152 | } 153 | template 154 | SIMD_ALWAYS_INLINE SIMD_HOST_DEVICE inline simd(T const* ptr, Flags flags) { 155 | copy_from(ptr, flags); 156 | } 157 | SIMD_ALWAYS_INLINE SIMD_HOST_DEVICE inline simd(T const* ptr, int /*stride*/) 158 | : m_value(ptr[0]) 159 | {} 160 | SIMD_ALWAYS_INLINE SIMD_HOST_DEVICE inline simd operator*(simd const& other) const { 161 | return simd(m_value * other.m_value); 162 | } 163 | SIMD_ALWAYS_INLINE SIMD_HOST_DEVICE inline simd operator/(simd const& other) const { 164 | return simd(m_value / other.m_value); 165 | } 166 | SIMD_ALWAYS_INLINE SIMD_HOST_DEVICE inline simd operator+(simd const& other) const { 167 | return simd(m_value + other.m_value); 168 | } 169 | #ifdef STK_VOLATILE_SIMD 170 | SIMD_ALWAYS_INLINE SIMD_HOST_DEVICE inline void plus_equals(simd const volatile& other) volatile { 171 | m_value = m_value + other.m_value; 172 | } 173 | #endif 174 | SIMD_ALWAYS_INLINE SIMD_HOST_DEVICE inline simd operator-(simd const& other) const { 175 | return simd(m_value - other.m_value); 176 | } 177 | SIMD_ALWAYS_INLINE SIMD_HOST_DEVICE inline simd operator-() const { 178 | return simd(-m_value); 179 | } 180 | SIMD_ALWAYS_INLINE SIMD_HOST_DEVICE void copy_from(T const* ptr, element_aligned_tag) { 181 | m_value = *ptr; 182 | } 183 | SIMD_ALWAYS_INLINE SIMD_HOST_DEVICE void copy_to(T* ptr, element_aligned_tag) const { 184 | *ptr = m_value; 185 | } 186 | SIMD_ALWAYS_INLINE SIMD_HOST_DEVICE constexpr T get() const { return m_value; } 187 | SIMD_ALWAYS_INLINE SIMD_HOST_DEVICE inline simd_mask operator<(simd const& other) const { 188 | return simd_mask(m_value < other.m_value); 189 | } 190 | SIMD_ALWAYS_INLINE SIMD_HOST_DEVICE inline simd_mask operator==(simd const& other) const { 191 | return simd_mask(m_value == other.m_value); 192 | } 193 | }; 194 | 195 | template 196 | SIMD_ALWAYS_INLINE SIMD_HOST_DEVICE inline simd abs(simd const& a) { 197 | return simd(std::abs(a.get())); 198 | } 199 | 200 | template 201 | SIMD_ALWAYS_INLINE SIMD_HOST_DEVICE inline simd sqrt(simd const& a) { 202 | return simd(std::sqrt(a.get())); 203 | } 204 | 205 | template 206 | SIMD_ALWAYS_INLINE SIMD_HOST_DEVICE inline simd cbrt(simd const& a) { 207 | return simd(std::cbrt(a.get())); 208 | } 209 | 210 | template 211 | SIMD_ALWAYS_INLINE SIMD_HOST_DEVICE inline simd exp(simd const& a) { 212 | return simd(std::exp(a.get())); 213 | } 214 | 215 | template 216 | SIMD_ALWAYS_INLINE SIMD_HOST_DEVICE inline simd fma( 217 | simd const& a, 218 | simd const& b, 219 | simd const& c) { 220 | return simd((a.get() * b.get()) + c.get()); 221 | } 222 | 223 | template 224 | SIMD_ALWAYS_INLINE SIMD_HOST_DEVICE inline simd max( 225 | simd const& a, simd const& b) { 226 | return simd(choose((a.get() < b.get()), b.get(), a.get())); 227 | } 228 | 229 | template 230 | SIMD_ALWAYS_INLINE SIMD_HOST_DEVICE inline simd min( 231 | simd const& a, simd const& b) { 232 | return simd(choose((b.get() < a.get()), b.get(), a.get())); 233 | } 234 | 235 | template 236 | SIMD_ALWAYS_INLINE SIMD_HOST_DEVICE inline simd choose( 237 | simd_mask const& a, simd const& b, simd const& c) { 238 | return simd(choose(a.get(), b.get(), c.get())); 239 | } 240 | 241 | } 242 | -------------------------------------------------------------------------------- /simd.hpp: 
-------------------------------------------------------------------------------- 1 | /* 2 | //@HEADER 3 | // ************************************************************************ 4 | // 5 | // Kokkos v. 2.0 6 | // Copyright (2014) Sandia Corporation 7 | // 8 | // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, 9 | // the U.S. Government retains certain rights in this software. 10 | // 11 | // Redistribution and use in source and binary forms, with or without 12 | // modification, are permitted provided that the following conditions are 13 | // met: 14 | // 15 | // 1. Redistributions of source code must retain the above copyright 16 | // notice, this list of conditions and the following disclaimer. 17 | // 18 | // 2. Redistributions in binary form must reproduce the above copyright 19 | // notice, this list of conditions and the following disclaimer in the 20 | // documentation and/or other materials provided with the distribution. 21 | // 22 | // 3. Neither the name of the Corporation nor the names of the 23 | // contributors may be used to endorse or promote products derived from 24 | // this software without specific prior written permission. 25 | // 26 | // THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY 27 | // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 28 | // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 29 | // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE 30 | // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 31 | // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 32 | // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 33 | // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 34 | // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 35 | // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 36 | // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 37 | // 38 | // Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) 39 | // 40 | // ************************************************************************ 41 | //@HEADER 42 | */ 43 | 44 | #pragma once 45 | 46 | #include "simd_common.hpp" 47 | 48 | #include "scalar.hpp" 49 | 50 | #include "pack.hpp" 51 | 52 | #include "vector_size.hpp" 53 | 54 | #ifndef SIMD_FORCE_SCALAR 55 | #if defined( __CUDACC__ ) 56 | #include "cuda_warp.hpp" 57 | 58 | #elif defined( __HIPCC__ ) 59 | #include "hip_wavefront.hpp" 60 | 61 | #else 62 | 63 | #ifdef __SSE__ 64 | #include "sse.hpp" 65 | #endif 66 | 67 | #ifdef __AVX__ 68 | #include "avx.hpp" 69 | #endif 70 | 71 | #ifdef __AVX512F__ 72 | #include "avx512.hpp" 73 | #endif 74 | 75 | #ifdef __ARM_NEON 76 | #include "neon.hpp" 77 | #endif 78 | 79 | #ifdef __VSX__ 80 | #include "vsx.hpp" 81 | #endif 82 | 83 | #endif 84 | #endif 85 | 86 | namespace SIMD_NAMESPACE { 87 | 88 | namespace simd_abi { 89 | 90 | #if defined(SIMD_FORCE_SCALAR) 91 | using native = scalar; 92 | #elif defined(__CUDACC__) 93 | using native = scalar; 94 | #elif defined(__HIPCC__) 95 | using native = scalar; 96 | #elif defined(__AVX512F__) 97 | using native = avx512; 98 | #elif defined(__AVX__) 99 | using native = avx; 100 | #elif defined(__SSE2__) 101 | using native = sse; 102 | #elif defined(__ARM_NEON) && !defined(__ARM_FEATURE_SVE_BITS) && !defined(__ARM_FEATURE_SVE) 103 | using native = neon; 104 | #elif defined(__VSX__) 105 | using native = vsx; 106 | #elif defined(SIMD_ENABLE_VECTOR_SIZE) 107 | #if defined(__ARM_FEATURE_SVE_BITS) 108 | using native = vector_size<__ARM_FEATURE_SVE_BITS/8>; 109 | #else 110 | #if defined(__ARM_FEATURE_SVE) 111 | using native = vector_size<64>; 112 | #else 113 | using native = vector_size<32>; 114 | #endif 115 | #endif 116 | #else 117 | using native = pack<8>; 118 | #endif 119 | 120 | } 121 | 122 | template 123 | using native_simd = simd; 124 | 125 | } 126 | -------------------------------------------------------------------------------- /simd_common.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | //@HEADER 3 | // ************************************************************************ 4 | // 5 | // Kokkos v. 2.0 6 | // Copyright (2014) Sandia Corporation 7 | // 8 | // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, 9 | // the U.S. Government retains certain rights in this software. 10 | // 11 | // Redistribution and use in source and binary forms, with or without 12 | // modification, are permitted provided that the following conditions are 13 | // met: 14 | // 15 | // 1. Redistributions of source code must retain the above copyright 16 | // notice, this list of conditions and the following disclaimer. 17 | // 18 | // 2. Redistributions in binary form must reproduce the above copyright 19 | // notice, this list of conditions and the following disclaimer in the 20 | // documentation and/or other materials provided with the distribution. 21 | // 22 | // 3. Neither the name of the Corporation nor the names of the 23 | // contributors may be used to endorse or promote products derived from 24 | // this software without specific prior written permission. 25 | // 26 | // THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY 27 | // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 28 | // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 29 | // PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL SANDIA CORPORATION OR THE 30 | // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 31 | // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 32 | // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 33 | // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 34 | // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 35 | // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 36 | // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 37 | // 38 | // Questions? Contact Christian R. Trott (crtrott@sandia.gov) 39 | // 40 | // ************************************************************************ 41 | //@HEADER 42 | */ 43 | 44 | #pragma once 45 | 46 | #include 47 | #include 48 | 49 | #ifndef SIMD_ALWAYS_INLINE 50 | #if (defined(__clang__) && (__clang_major__ >= 12)) || \ 51 | (defined(__GNUC__) && !defined(__clang__)) 52 | #define SIMD_ALWAYS_INLINE [[gnu::always_inline]] 53 | #else 54 | #define SIMD_ALWAYS_INLINE 55 | #endif 56 | #endif 57 | 58 | #if defined( __CUDACC__ ) 59 | #define SIMD_CUDA_ALWAYS_INLINE __forceinline__ 60 | #endif 61 | 62 | #if defined( __HIPCC__ ) 63 | #define SIMD_HIP_ALWAYS_INLINE __forceinline__ 64 | #endif 65 | 66 | 67 | #if defined( __CUDACC__) || defined( __HIPCC__ ) 68 | #define SIMD_HOST_DEVICE __host__ __device__ 69 | #else 70 | #define SIMD_HOST_DEVICE 71 | #endif 72 | 73 | #if defined (__CUDACC__) || defined( __HIPCC__ ) 74 | #define SIMD_DEVICE __device__ 75 | #else 76 | #define SIMD_DEVICE 77 | #endif 78 | 79 | #ifndef SIMD_PRAGMA 80 | #if defined(_OPENMP) 81 | #define SIMD_PRAGMA _Pragma("omp simd") 82 | #elif defined(__clang__) 83 | #define SIMD_PRAGMA _Pragma("clang loop vectorize(enable)") 84 | #elif defined(__GNUC__) && !defined(__FUJITSU) 85 | #define SIMD_PRAGMA _Pragma("GCC ivdep") 86 | #else 87 | #define SIMD_PRAGMA 88 | #endif 89 | #endif 90 | 91 | #ifndef SIMD_NAMESPACE 92 | #define SIMD_NAMESPACE simd 93 | #endif 94 | 95 | namespace SIMD_NAMESPACE { 96 | 97 | template 98 | class simd; 99 | 100 | template 101 | class simd_mask; 102 | 103 | class element_aligned_tag {}; 104 | 105 | #ifndef SIMD_SCALAR_CHOOSE_DEFINED 106 | template 107 | SIMD_ALWAYS_INLINE SIMD_HOST_DEVICE constexpr T const& 108 | choose(bool a, T const& b, T const& c) { 109 | return a ? 
b : c; 110 | } 111 | #endif 112 | 113 | template 114 | SIMD_ALWAYS_INLINE SIMD_HOST_DEVICE inline simd& operator+=(simd& a, simd const& b) { 115 | a = a + b; 116 | return a; 117 | } 118 | 119 | template 120 | SIMD_ALWAYS_INLINE SIMD_HOST_DEVICE inline simd& operator-=(simd& a, simd const& b) { 121 | a = a - b; 122 | return a; 123 | } 124 | 125 | template 126 | SIMD_ALWAYS_INLINE SIMD_HOST_DEVICE inline simd& operator*=(simd& a, simd const& b) { 127 | a = a * b; 128 | return a; 129 | } 130 | 131 | template 132 | SIMD_ALWAYS_INLINE SIMD_HOST_DEVICE inline simd& operator/=(simd& a, simd const& b) { 133 | a = a / b; 134 | return a; 135 | } 136 | 137 | template 138 | SIMD_ALWAYS_INLINE SIMD_HOST_DEVICE inline simd operator+(T const& a, simd const& b) { 139 | return simd(a) + b; 140 | } 141 | 142 | template 143 | SIMD_ALWAYS_INLINE SIMD_HOST_DEVICE inline simd operator+(simd const& a, T const& b) { 144 | return a + simd(b); 145 | } 146 | 147 | template 148 | SIMD_ALWAYS_INLINE SIMD_HOST_DEVICE inline simd operator-(T const& a, simd const& b) { 149 | return simd(a) - b; 150 | } 151 | 152 | template 153 | SIMD_ALWAYS_INLINE SIMD_HOST_DEVICE inline simd operator-(simd const& a, T const& b) { 154 | return a - simd(b); 155 | } 156 | 157 | template 158 | SIMD_ALWAYS_INLINE SIMD_HOST_DEVICE inline simd operator*(T const& a, simd const& b) { 159 | return simd(a) * b; 160 | } 161 | 162 | template 163 | SIMD_ALWAYS_INLINE SIMD_HOST_DEVICE inline simd operator*(simd const& a, T const& b) { 164 | return a * simd(b); 165 | } 166 | 167 | template 168 | SIMD_ALWAYS_INLINE SIMD_HOST_DEVICE inline simd operator/(T const& a, simd const& b) { 169 | return simd(a) / b; 170 | } 171 | 172 | template 173 | SIMD_ALWAYS_INLINE SIMD_HOST_DEVICE inline simd operator/(simd const& a, T const& b) { 174 | return a / simd(b); 175 | } 176 | 177 | SIMD_ALWAYS_INLINE SIMD_HOST_DEVICE inline double copysign(double a, double b) { 178 | return std::copysign(a, b); 179 | } 180 | 181 | SIMD_ALWAYS_INLINE SIMD_HOST_DEVICE inline float copysign(float a, float b) { 182 | return std::copysignf(a, b); 183 | } 184 | 185 | template 186 | SIMD_ALWAYS_INLINE SIMD_HOST_DEVICE inline simd multiplysign(simd a, simd b) { 187 | T tmp_a[simd::size()]; 188 | T tmp_b[simd::size()]; 189 | a.copy_to(tmp_a, element_aligned_tag()); 190 | b.copy_to(tmp_b, element_aligned_tag()); 191 | for (int i = 0; i < simd::size(); ++i) tmp_a[i] = tmp_a[i]*::SIMD_NAMESPACE::copysign(static_cast(1.0), tmp_b[i]); 192 | a.copy_from(tmp_a, element_aligned_tag()); 193 | return a; 194 | } 195 | 196 | template 197 | SIMD_ALWAYS_INLINE SIMD_HOST_DEVICE inline simd copysign(simd a, simd b) { 198 | T tmp_a[simd::size()]; 199 | T tmp_b[simd::size()]; 200 | a.copy_to(tmp_a, element_aligned_tag()); 201 | b.copy_to(tmp_b, element_aligned_tag()); 202 | for (int i = 0; i < simd::size(); ++i) tmp_a[i] = ::SIMD_NAMESPACE::copysign(tmp_a[i], tmp_b[i]); 203 | a.copy_from(tmp_a, element_aligned_tag()); 204 | return a; 205 | } 206 | 207 | template 208 | SIMD_ALWAYS_INLINE SIMD_HOST_DEVICE inline simd abs(simd a) { 209 | T tmp[simd::size()]; 210 | a.copy_to(tmp, element_aligned_tag()); 211 | for (int i = 0; i < simd::size(); ++i) tmp[i] = std::abs(tmp[i]); 212 | a.copy_from(tmp, element_aligned_tag()); 213 | return a; 214 | } 215 | 216 | template 217 | SIMD_ALWAYS_INLINE SIMD_HOST_DEVICE inline simd cbrt(simd a) { 218 | T tmp[simd::size()]; 219 | a.copy_to(tmp, element_aligned_tag()); 220 | for (int i = 0; i < simd::size(); ++i) tmp[i] = std::cbrt(tmp[i]); 221 | a.copy_from(tmp, 
element_aligned_tag()); 222 | return a; 223 | } 224 | 225 | template 226 | SIMD_ALWAYS_INLINE SIMD_HOST_DEVICE inline simd exp(simd a) { 227 | T tmp[simd::size()]; 228 | a.copy_to(tmp, element_aligned_tag()); 229 | for (int i = 0; i < simd::size(); ++i) tmp[i] = std::exp(tmp[i]); 230 | a.copy_from(tmp, element_aligned_tag()); 231 | return a; 232 | } 233 | 234 | template 235 | SIMD_ALWAYS_INLINE SIMD_HOST_DEVICE inline simd fma(simd a, simd const& b, simd const& c) { 236 | T stack_a[simd::size()]; 237 | T stack_b[simd::size()]; 238 | a.copy_to(stack_a, element_aligned_tag()); 239 | b.copy_to(stack_b, element_aligned_tag()); 240 | for (int i = 0; i < simd::size(); ++i) stack_a[i] *= stack_b[i]; 241 | c.copy_to(stack_b, element_aligned_tag()); 242 | for (int i = 0; i < simd::size(); ++i) stack_a[i] += stack_b[i]; 243 | a.copy_from(stack_a, element_aligned_tag()); 244 | return a; 245 | } 246 | 247 | SIMD_ALWAYS_INLINE SIMD_HOST_DEVICE inline bool all_of(bool a) { return a; } 248 | SIMD_ALWAYS_INLINE SIMD_HOST_DEVICE inline bool any_of(bool a) { return a; } 249 | 250 | template 251 | class simd_storage { 252 | T m_value[simd::size()]; 253 | public: 254 | using value_type = T; 255 | using simd_type = simd; 256 | SIMD_ALWAYS_INLINE inline simd_storage() = default; 257 | SIMD_ALWAYS_INLINE inline static constexpr 258 | int size() { return simd::size(); } 259 | SIMD_ALWAYS_INLINE explicit inline 260 | simd_storage(simd const& value) { 261 | value.copy_to(m_value, element_aligned_tag()); 262 | } 263 | SIMD_ALWAYS_INLINE explicit inline 264 | simd_storage(T value) 265 | :simd_storage(simd(value)) 266 | {} 267 | SIMD_ALWAYS_INLINE inline 268 | simd_storage& operator=(simd const& value) { 269 | value.copy_to(m_value, element_aligned_tag()); 270 | return *this; 271 | } 272 | SIMD_ALWAYS_INLINE inline 273 | T const* data() const { return m_value; } 274 | SIMD_ALWAYS_INLINE inline 275 | T* data() { return m_value; } 276 | SIMD_ALWAYS_INLINE inline 277 | T const& operator[](int i) const { return m_value[i]; } 278 | SIMD_ALWAYS_INLINE inline 279 | T& operator[](int i) { return m_value[i]; } 280 | }; 281 | 282 | template 283 | class simd_size { 284 | public: 285 | static constexpr int value = 1; 286 | }; 287 | 288 | template 289 | class simd_size> { 290 | public: 291 | static constexpr int value = simd::size(); 292 | }; 293 | 294 | } 295 | -------------------------------------------------------------------------------- /sse.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | //@HEADER 3 | // ************************************************************************ 4 | // 5 | // Kokkos v. 2.0 6 | // Copyright (2014) Sandia Corporation 7 | // 8 | // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, 9 | // the U.S. Government retains certain rights in this software. 10 | // 11 | // Redistribution and use in source and binary forms, with or without 12 | // modification, are permitted provided that the following conditions are 13 | // met: 14 | // 15 | // 1. Redistributions of source code must retain the above copyright 16 | // notice, this list of conditions and the following disclaimer. 17 | // 18 | // 2. Redistributions in binary form must reproduce the above copyright 19 | // notice, this list of conditions and the following disclaimer in the 20 | // documentation and/or other materials provided with the distribution. 21 | // 22 | // 3. 
Neither the name of the Corporation nor the names of the 23 | // contributors may be used to endorse or promote products derived from 24 | // this software without specific prior written permission. 25 | // 26 | // THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY 27 | // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 28 | // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 29 | // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE 30 | // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 31 | // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 32 | // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 33 | // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 34 | // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 35 | // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 36 | // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 37 | // 38 | // Questions? Contact Christian R. Trott (crtrott@sandia.gov) 39 | // 40 | // ************************************************************************ 41 | //@HEADER 42 | */ 43 | 44 | #pragma once 45 | 46 | #include "simd_common.hpp" 47 | 48 | #ifdef __SSE__ 49 | #include 50 | #endif 51 | 52 | #ifdef __SSE2__ 53 | #include 54 | #endif 55 | 56 | #if defined(__FMA__) || defined(__AVX2__) 57 | #include 58 | #endif 59 | 60 | /* Intel SVML disclaimer: cbrt, exp, etc. are not intrinsics, they are Intel-proprietary library functions 61 | https://stackoverflow.com/questions/36636159/where-is-clangs-mm256-pow-ps-intrinsic 62 | This is why the specializations that call these functions are protected with __INTEL_COMPILER. 63 | */ 64 | 65 | /* Intel FMA disclaimer: it is hard to detect FMA across compilers 66 | https://stackoverflow.com/questions/16348909/how-do-i-know-if-i-can-compile-with-fma-instruction-sets 67 | it seems like the best we can do is __FMA__ or __AVX2__, since MSVC doesn't define __FMA__ 68 | */ 69 | 70 | #ifdef __SSE__ 71 | 72 | namespace SIMD_NAMESPACE { 73 | 74 | namespace simd_abi { 75 | 76 | class sse {}; 77 | 78 | } 79 | 80 | template <> 81 | class simd_mask { 82 | __m128 m_value; 83 | public: 84 | using value_type = bool; 85 | using simd_type = simd; 86 | using abi_type = simd_abi::sse; 87 | SIMD_ALWAYS_INLINE inline simd_mask() = default; 88 | SIMD_ALWAYS_INLINE inline simd_mask(bool value) 89 | :m_value(_mm_castsi128_ps(_mm_set1_epi32(-int(value)))) 90 | {} 91 | SIMD_ALWAYS_INLINE inline static constexpr int size() { return 4; } 92 | SIMD_ALWAYS_INLINE inline constexpr simd_mask(__m128 const& value_in) 93 | :m_value(value_in) 94 | {} 95 | SIMD_ALWAYS_INLINE constexpr __m128 get() const { return m_value; } 96 | SIMD_ALWAYS_INLINE simd_mask operator||(simd_mask const& other) const { 97 | return simd_mask(_mm_or_ps(m_value, other.m_value)); 98 | } 99 | SIMD_ALWAYS_INLINE simd_mask operator&&(simd_mask const& other) const { 100 | return simd_mask(_mm_and_ps(m_value, other.m_value)); 101 | } 102 | SIMD_ALWAYS_INLINE simd_mask operator!() const { 103 | return simd_mask(_mm_andnot_ps(m_value, simd_mask(true).get())); 104 | } 105 | }; 106 | 107 | SIMD_ALWAYS_INLINE inline bool all_of(simd_mask const& a) { 108 | return _mm_movemask_ps(a.get()) == 0xF; 109 | } 110 | 111 | SIMD_ALWAYS_INLINE inline bool any_of(simd_mask const& a) { 112 | return _mm_movemask_ps(a.get()) != 0x0; 113 | } 114 | 115 | template <> 116 | class simd { 117 | __m128 m_value; 118 | 
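  // one __m128 register: four packed single-precision lanes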
public: 119 | using value_type = float; 120 | using abi_type = simd_abi::sse; 121 | using mask_type = simd_mask; 122 | using storage_type = simd_storage; 123 | SIMD_ALWAYS_INLINE inline simd() = default; 124 | SIMD_ALWAYS_INLINE inline static constexpr int size() { return 4; } 125 | SIMD_ALWAYS_INLINE inline simd(float value) 126 | :m_value(_mm_set1_ps(value)) 127 | {} 128 | SIMD_ALWAYS_INLINE inline simd( 129 | float a, float b, float c, float d) 130 | :m_value(_mm_setr_ps(a, b, c, d)) 131 | {} 132 | SIMD_ALWAYS_INLINE inline 133 | simd(storage_type const& value) { 134 | copy_from(value.data(), element_aligned_tag()); 135 | } 136 | SIMD_ALWAYS_INLINE inline 137 | simd& operator=(storage_type const& value) { 138 | copy_from(value.data(), element_aligned_tag()); 139 | return *this; 140 | } 141 | template 142 | SIMD_ALWAYS_INLINE inline simd(float const* ptr, Flags /*flags*/) 143 | :m_value(_mm_loadu_ps(ptr)) 144 | {} 145 | SIMD_ALWAYS_INLINE inline simd(float const* ptr, int stride) 146 | :simd(ptr[0], ptr[stride], ptr[2*stride], ptr[3*stride]) 147 | {} 148 | SIMD_ALWAYS_INLINE inline constexpr simd(__m128 const& value_in) 149 | :m_value(value_in) 150 | {} 151 | SIMD_ALWAYS_INLINE inline simd operator*(simd const& other) const { 152 | return simd(_mm_mul_ps(m_value, other.m_value)); 153 | } 154 | SIMD_ALWAYS_INLINE inline simd operator/(simd const& other) const { 155 | return simd(_mm_div_ps(m_value, other.m_value)); 156 | } 157 | SIMD_ALWAYS_INLINE inline simd operator+(simd const& other) const { 158 | return simd(_mm_add_ps(m_value, other.m_value)); 159 | } 160 | SIMD_ALWAYS_INLINE inline simd operator-(simd const& other) const { 161 | return simd(_mm_sub_ps(m_value, other.m_value)); 162 | } 163 | SIMD_ALWAYS_INLINE inline simd operator-() const { 164 | return simd(_mm_sub_ps(_mm_set1_ps(0.0), m_value)); 165 | } 166 | SIMD_ALWAYS_INLINE void copy_from(float const* ptr, element_aligned_tag) { 167 | m_value = _mm_loadu_ps(ptr); 168 | } 169 | SIMD_ALWAYS_INLINE void copy_to(float* ptr, element_aligned_tag) const { 170 | _mm_storeu_ps(ptr, m_value); 171 | } 172 | SIMD_ALWAYS_INLINE constexpr __m128 get() const { return m_value; } 173 | SIMD_ALWAYS_INLINE simd_mask operator<(simd const& other) const { 174 | return simd_mask(_mm_cmplt_ps(m_value, other.m_value)); 175 | } 176 | SIMD_ALWAYS_INLINE simd_mask operator==(simd const& other) const { 177 | return simd_mask(_mm_cmpeq_ps(m_value, other.m_value)); 178 | } 179 | }; 180 | 181 | SIMD_ALWAYS_INLINE inline simd multiplysign(simd const& a, simd const& b) { 182 | __m128 const sign_mask = _mm_set1_ps(-0.); 183 | return simd(_mm_xor_ps(a.get(), _mm_and_ps(sign_mask, b.get()))); 184 | } 185 | 186 | SIMD_ALWAYS_INLINE inline simd copysign(simd const& a, simd const& b) { 187 | __m128 const sign_mask = _mm_set1_ps(-0.); 188 | return simd(_mm_xor_ps(_mm_andnot_ps(sign_mask, a.get()), _mm_and_ps(sign_mask, b.get()))); 189 | } 190 | 191 | SIMD_ALWAYS_INLINE inline simd abs(simd const& a) { 192 | __m128 const sign_mask = _mm_set1_ps(-0.f); // -0.f = 1 << 31 193 | return simd(_mm_andnot_ps(sign_mask, a.get())); 194 | } 195 | 196 | SIMD_ALWAYS_INLINE inline simd sqrt(simd const& a) { 197 | return simd(_mm_sqrt_ps(a.get())); 198 | } 199 | 200 | #ifdef __INTEL_COMPILER 201 | SIMD_ALWAYS_INLINE inline simd cbrt(simd const& a) { 202 | return simd(_mm_cbrt_ps(a.get())); 203 | } 204 | 205 | SIMD_ALWAYS_INLINE inline simd exp(simd const& a) { 206 | return simd(_mm_exp_ps(a.get())); 207 | } 208 | 209 | SIMD_ALWAYS_INLINE inline simd log(simd const& a) { 210 | 
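  // _mm_log_ps (like _mm_cbrt_ps / _mm_exp_ps above) is an Intel SVML library routine,
  // not a hardware intrinsic, which is why these overloads sit behind __INTEL_COMPILER.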
return simd(_mm_log_ps(a.get())); 211 | } 212 | #endif 213 | 214 | #if defined(__FMA__) || defined(__AVX2__) 215 | SIMD_ALWAYS_INLINE inline simd fma( 216 | simd const& a, 217 | simd const& b, 218 | simd const& c) { 219 | return simd(_mm_fmadd_ps(a.get(), b.get(), c.get())); 220 | } 221 | #endif 222 | 223 | SIMD_ALWAYS_INLINE inline simd max( 224 | simd const& a, simd const& b) { 225 | return simd(_mm_max_ps(a.get(), b.get())); 226 | } 227 | 228 | SIMD_ALWAYS_INLINE inline simd min( 229 | simd const& a, simd const& b) { 230 | return simd(_mm_min_ps(a.get(), b.get())); 231 | } 232 | 233 | SIMD_ALWAYS_INLINE inline simd choose( 234 | simd_mask const& a, simd const& b, simd const& c) { 235 | return simd(_mm_add_ps(_mm_and_ps(a.get(), b.get()), _mm_andnot_ps(a.get(), c.get()))); 236 | } 237 | 238 | #endif 239 | 240 | #ifdef __SSE2__ 241 | 242 | template <> 243 | class simd_mask { 244 | __m128d m_value; 245 | public: 246 | using value_type = bool; 247 | using simd_type = simd; 248 | using abi_type = simd_abi::sse; 249 | SIMD_ALWAYS_INLINE inline simd_mask() = default; 250 | SIMD_ALWAYS_INLINE inline simd_mask(bool value) 251 | :m_value(_mm_castsi128_pd(_mm_set1_epi64x(-std::int64_t(value)))) 252 | {} 253 | SIMD_ALWAYS_INLINE inline static constexpr int size() { return 4; } 254 | SIMD_ALWAYS_INLINE inline constexpr simd_mask(__m128d const& value_in) 255 | :m_value(value_in) 256 | {} 257 | SIMD_ALWAYS_INLINE inline constexpr __m128d get() const { return m_value; } 258 | SIMD_ALWAYS_INLINE inline simd_mask operator||(simd_mask const& other) const { 259 | return simd_mask(_mm_or_pd(m_value, other.m_value)); 260 | } 261 | SIMD_ALWAYS_INLINE inline simd_mask operator&&(simd_mask const& other) const { 262 | return simd_mask(_mm_and_pd(m_value, other.m_value)); 263 | } 264 | SIMD_ALWAYS_INLINE inline simd_mask operator!() const { 265 | return simd_mask(_mm_andnot_pd(m_value, simd_mask(true).get())); 266 | } 267 | }; 268 | 269 | SIMD_ALWAYS_INLINE inline bool all_of(simd_mask const& a) { 270 | return _mm_movemask_pd(a.get()) == 0x3; 271 | } 272 | 273 | SIMD_ALWAYS_INLINE inline bool any_of(simd_mask const& a) { 274 | return _mm_movemask_pd(a.get()) != 0x0; 275 | } 276 | 277 | template <> 278 | class simd { 279 | __m128d m_value; 280 | public: 281 | using value_type = double; 282 | using abi_type = simd_abi::sse; 283 | using mask_type = simd_mask; 284 | using storage_type = simd_storage; 285 | SIMD_ALWAYS_INLINE inline simd() = default; 286 | SIMD_ALWAYS_INLINE inline simd(simd const&) = default; 287 | SIMD_ALWAYS_INLINE inline simd(simd&&) = default; 288 | SIMD_ALWAYS_INLINE inline simd& operator=(simd const&) = default; 289 | SIMD_ALWAYS_INLINE inline simd& operator=(simd&&) = default; 290 | SIMD_ALWAYS_INLINE inline static constexpr int size() { return 2; } 291 | SIMD_ALWAYS_INLINE inline simd(double value) 292 | :m_value(_mm_set1_pd(value)) 293 | {} 294 | SIMD_ALWAYS_INLINE inline simd(double a, double b) 295 | :m_value(_mm_setr_pd(a, b)) 296 | {} 297 | SIMD_ALWAYS_INLINE inline 298 | simd(storage_type const& value) { 299 | copy_from(value.data(), element_aligned_tag()); 300 | } 301 | #ifdef STK_VOLATILE_SIMD 302 | SIMD_ALWAYS_INLINE inline 303 | simd(simd const volatile& value) 304 | :m_value(value.m_value) 305 | {} 306 | #endif 307 | SIMD_ALWAYS_INLINE inline 308 | simd& operator=(storage_type const& value) { 309 | copy_from(value.data(), element_aligned_tag()); 310 | return *this; 311 | } 312 | template 313 | SIMD_ALWAYS_INLINE inline simd(double const* ptr, Flags /*flags*/) 314 | 
:m_value(_mm_loadu_pd(ptr)) 315 | {} 316 | SIMD_ALWAYS_INLINE inline simd(double const* ptr, int stride) 317 | :simd(ptr[0], ptr[stride]) 318 | {} 319 | SIMD_ALWAYS_INLINE inline constexpr simd(__m128d const& value_in) 320 | :m_value(value_in) 321 | {} 322 | SIMD_ALWAYS_INLINE inline simd operator*(simd const& other) const { 323 | return simd(_mm_mul_pd(m_value, other.m_value)); 324 | } 325 | SIMD_ALWAYS_INLINE inline simd operator/(simd const& other) const { 326 | return simd(_mm_div_pd(m_value, other.m_value)); 327 | } 328 | SIMD_ALWAYS_INLINE inline simd operator+(simd const& other) const { 329 | return simd(_mm_add_pd(m_value, other.m_value)); 330 | } 331 | #ifdef STK_VOLATILE_SIMD 332 | SIMD_ALWAYS_INLINE inline void plus_equals(simd const volatile& other) volatile { 333 | m_value = _mm_add_pd(m_value, other.m_value); 334 | } 335 | #endif 336 | SIMD_ALWAYS_INLINE inline simd operator-(simd const& other) const { 337 | return simd(_mm_sub_pd(m_value, other.m_value)); 338 | } 339 | SIMD_ALWAYS_INLINE inline simd operator-() const { 340 | return simd(_mm_sub_pd(_mm_set1_pd(0.0), m_value)); 341 | } 342 | SIMD_ALWAYS_INLINE inline void copy_from(double const* ptr, element_aligned_tag) { 343 | m_value = _mm_loadu_pd(ptr); 344 | } 345 | SIMD_ALWAYS_INLINE inline void copy_to(double* ptr, element_aligned_tag) const { 346 | _mm_storeu_pd(ptr, m_value); 347 | } 348 | SIMD_ALWAYS_INLINE inline constexpr __m128d get() const { return m_value; } 349 | SIMD_ALWAYS_INLINE inline simd_mask operator<(simd const& other) const { 350 | return simd_mask(_mm_cmplt_pd(m_value, other.m_value)); 351 | } 352 | SIMD_ALWAYS_INLINE inline simd_mask operator==(simd const& other) const { 353 | return simd_mask(_mm_cmpeq_pd(m_value, other.m_value)); 354 | } 355 | }; 356 | 357 | SIMD_ALWAYS_INLINE inline simd multiplysign(simd const& a, simd const& b) { 358 | __m128d const sign_mask = _mm_set1_pd(-0.); 359 | return simd(_mm_xor_pd(a.get(), _mm_and_pd(sign_mask, b.get()))); 360 | } 361 | 362 | SIMD_ALWAYS_INLINE inline simd copysign(simd const& a, simd const& b) { 363 | __m128d const sign_mask = _mm_set1_pd(-0.); 364 | return simd(_mm_xor_pd(_mm_andnot_pd(sign_mask, a.get()), _mm_and_pd(sign_mask, b.get()))); 365 | } 366 | 367 | SIMD_ALWAYS_INLINE inline simd abs(simd const& a) { 368 | __m128d const sign_mask = _mm_set1_pd(-0.); // -0. 
= 1 << 63 369 | return simd(_mm_andnot_pd(sign_mask, a.get())); 370 | } 371 | 372 | SIMD_ALWAYS_INLINE inline simd sqrt(simd const& a) { 373 | return simd(_mm_sqrt_pd(a.get())); 374 | } 375 | 376 | #ifdef __INTEL_COMPILER 377 | SIMD_ALWAYS_INLINE inline simd cbrt(simd const& a) { 378 | return simd(_mm_cbrt_pd(a.get())); 379 | } 380 | 381 | SIMD_ALWAYS_INLINE inline simd exp(simd const& a) { 382 | return simd(_mm_exp_pd(a.get())); 383 | } 384 | 385 | SIMD_ALWAYS_INLINE inline simd log(simd const& a) { 386 | return simd(_mm_log_pd(a.get())); 387 | } 388 | #endif 389 | 390 | #if defined(__FMA__) || defined(__AVX2__) 391 | SIMD_ALWAYS_INLINE inline simd fma( 392 | simd const& a, 393 | simd const& b, 394 | simd const& c) { 395 | return simd(_mm_fmadd_pd(a.get(), b.get(), c.get())); 396 | } 397 | #endif 398 | 399 | SIMD_ALWAYS_INLINE inline simd max( 400 | simd const& a, simd const& b) { 401 | return simd(_mm_max_pd(a.get(), b.get())); 402 | } 403 | 404 | SIMD_ALWAYS_INLINE inline simd min( 405 | simd const& a, simd const& b) { 406 | return simd(_mm_min_pd(a.get(), b.get())); 407 | } 408 | 409 | SIMD_ALWAYS_INLINE inline simd choose( 410 | simd_mask const& a, simd const& b, simd const& c) { 411 | return simd( 412 | _mm_add_pd( 413 | _mm_and_pd(a.get(), b.get()), 414 | _mm_andnot_pd(a.get(), c.get()))); 415 | } 416 | 417 | } 418 | 419 | #endif 420 | -------------------------------------------------------------------------------- /test.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | //@HEADER 3 | // ************************************************************************ 4 | // 5 | // Kokkos v. 2.0 6 | // Copyright (2014) Sandia Corporation 7 | // 8 | // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, 9 | // the U.S. Government retains certain rights in this software. 10 | // 11 | // Redistribution and use in source and binary forms, with or without 12 | // modification, are permitted provided that the following conditions are 13 | // met: 14 | // 15 | // 1. Redistributions of source code must retain the above copyright 16 | // notice, this list of conditions and the following disclaimer. 17 | // 18 | // 2. Redistributions in binary form must reproduce the above copyright 19 | // notice, this list of conditions and the following disclaimer in the 20 | // documentation and/or other materials provided with the distribution. 21 | // 22 | // 3. Neither the name of the Corporation nor the names of the 23 | // contributors may be used to endorse or promote products derived from 24 | // this software without specific prior written permission. 25 | // 26 | // THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY 27 | // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 28 | // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 29 | // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE 30 | // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 31 | // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 32 | // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 33 | // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 34 | // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 35 | // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 36 | // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 37 | // 38 | // Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) 39 | // 40 | // ************************************************************************ 41 | //@HEADER 42 | */ 43 | 44 | #include 45 | #include 46 | 47 | #include "simd.hpp" 48 | 49 | #define ASSERT_EQ(a, b) \ 50 | if ((a) != (b)) { \ 51 | std::abort(); \ 52 | } 53 | 54 | template 55 | void test_binary_op( 56 | T const* a, 57 | T const* b, 58 | BinaryOp const& binary_op) { 59 | simd::simd native_a(a, simd::element_aligned_tag()); 60 | simd::simd native_b(b, simd::element_aligned_tag()); 61 | simd::simd native_answer = binary_op(native_a, native_b); 62 | constexpr int size = simd::simd::size(); 63 | using pack_abi = simd::simd_abi::pack; 64 | simd::simd pack_a(a, simd::element_aligned_tag()); 65 | simd::simd pack_b(b, simd::element_aligned_tag()); 66 | simd::simd pack_answer = binary_op(pack_a, pack_b); 67 | simd::simd_storage stored_native_answer(native_answer); 68 | simd::simd_storage stored_pack_answer(pack_answer); 69 | for (int i = 0; i < size; ++i) { 70 | ASSERT_EQ(stored_native_answer[i], stored_pack_answer[i]); 71 | } 72 | } 73 | 74 | struct plus { 75 | template 76 | T operator()(T const& a, T const& b) const { 77 | return a + b; 78 | } 79 | }; 80 | 81 | struct minus { 82 | template 83 | T operator()(T const& a, T const& b) const { 84 | return a - b; 85 | } 86 | }; 87 | 88 | struct multiplies { 89 | template 90 | T operator()(T const& a, T const& b) const { 91 | return a * b; 92 | } 93 | }; 94 | 95 | struct divides { 96 | template 97 | T operator()(T const& a, T const& b) const { 98 | return a / b; 99 | } 100 | }; 101 | 102 | int main() { 103 | double const a[] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0}; 104 | double const b[] = {1.1, 2.2, 3.3, 4.4, 5.5, 6.6, 7.7, 8.8}; 105 | test_binary_op(a, b, plus()); 106 | test_binary_op(a, b, minus()); 107 | test_binary_op(a, b, multiplies()); 108 | test_binary_op(a, b, divides()); 109 | } 110 | -------------------------------------------------------------------------------- /vector_size.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | //@HEADER 3 | // ************************************************************************ 4 | // 5 | // Kokkos v. 2.0 6 | // Copyright (2014) Sandia Corporation 7 | // 8 | // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, 9 | // the U.S. Government retains certain rights in this software. 10 | // 11 | // Redistribution and use in source and binary forms, with or without 12 | // modification, are permitted provided that the following conditions are 13 | // met: 14 | // 15 | // 1. Redistributions of source code must retain the above copyright 16 | // notice, this list of conditions and the following disclaimer. 17 | // 18 | // 2. Redistributions in binary form must reproduce the above copyright 19 | // notice, this list of conditions and the following disclaimer in the 20 | // documentation and/or other materials provided with the distribution. 21 | // 22 | // 3. Neither the name of the Corporation nor the names of the 23 | // contributors may be used to endorse or promote products derived from 24 | // this software without specific prior written permission. 25 | // 26 | // THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY 27 | // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 28 | // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 29 | // PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL SANDIA CORPORATION OR THE 30 | // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 31 | // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 32 | // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 33 | // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 34 | // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 35 | // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 36 | // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 37 | // 38 | // Questions? Contact Christian R. Trott (crtrott@sandia.gov) 39 | // 40 | // ************************************************************************ 41 | //@HEADER 42 | */ 43 | 44 | #pragma once 45 | 46 | #include "simd_common.hpp" 47 | 48 | #if (defined(__clang__) && (__clang_major__ >= 11)) || \ 49 | (defined(__GNUC__) && (__GNUC__ >= 10) && (__GNUC_MINOR__ >= 2)) || \ 50 | (defined(SIMD_ENABLE_VECTOR_SIZE)) 51 | 52 | #ifndef SIMD_ENABLE_VECTOR_SIZE 53 | #define SIMD_ENABLE_VECTOR_SIZE 54 | #endif 55 | 56 | namespace SIMD_NAMESPACE { 57 | 58 | namespace simd_abi { 59 | 60 | template 61 | class vector_size {}; 62 | 63 | } 64 | 65 | template 66 | class simd_mask> { 67 | typedef int native_type __attribute__((vector_size(N/(sizeof(T)/sizeof(int))))); 68 | native_type m_value; 69 | public: 70 | using value_type = bool; 71 | using simd_type = simd>; 72 | using abi_type = simd_abi::vector_size; 73 | SIMD_ALWAYS_INLINE inline simd_mask() = default; 74 | SIMD_ALWAYS_INLINE inline static constexpr int size() { return N / sizeof(T); } 75 | SIMD_ALWAYS_INLINE inline simd_mask(bool value) 76 | :m_value(static_cast(value)) 77 | {} 78 | SIMD_ALWAYS_INLINE inline simd_mask(native_type value) 79 | :m_value(value) 80 | {} 81 | SIMD_ALWAYS_INLINE inline int operator[](int i) { return reinterpret_cast(&m_value)[i]; } 82 | SIMD_ALWAYS_INLINE inline native_type const& get() const { return m_value; } 83 | SIMD_ALWAYS_INLINE inline simd_mask operator||(simd_mask const& other) const { 84 | return simd_mask(m_value || other.m_value); 85 | } 86 | SIMD_ALWAYS_INLINE inline simd_mask operator&&(simd_mask const& other) const { 87 | return simd_mask(m_value && other.m_value); 88 | } 89 | SIMD_ALWAYS_INLINE inline simd_mask operator!() const { 90 | return simd_mask(!m_value); 91 | } 92 | }; 93 | 94 | template 95 | class simd_mask> { 96 | typedef long long native_type __attribute__((vector_size(N))); 97 | native_type m_value; 98 | public: 99 | using value_type = bool; 100 | using simd_type = simd>; 101 | using abi_type = simd_abi::vector_size; 102 | SIMD_ALWAYS_INLINE inline simd_mask() = default; 103 | SIMD_ALWAYS_INLINE inline static constexpr int size() { return N / sizeof(long long); } 104 | SIMD_ALWAYS_INLINE inline simd_mask(bool value) 105 | :m_value(static_cast(value)) 106 | {} 107 | SIMD_ALWAYS_INLINE inline simd_mask(native_type value) 108 | :m_value(value) 109 | {} 110 | SIMD_ALWAYS_INLINE inline long long operator[](int i) { return reinterpret_cast(&m_value)[i]; } 111 | SIMD_ALWAYS_INLINE inline native_type const& get() const { return m_value; } 112 | SIMD_ALWAYS_INLINE inline simd_mask operator||(simd_mask const& other) const { 113 | return simd_mask(m_value || other.m_value); 114 | } 115 | SIMD_ALWAYS_INLINE inline simd_mask operator&&(simd_mask const& other) const { 116 | return simd_mask(m_value && other.m_value); 117 | } 118 | SIMD_ALWAYS_INLINE inline simd_mask operator!() const { 119 | return simd_mask(!m_value); 120 | } 
121 | }; 122 | 123 | template 124 | SIMD_ALWAYS_INLINE SIMD_HOST_DEVICE inline bool all_of(simd_mask> const& a) { 125 | bool result = true; 126 | SIMD_PRAGMA for (int i = 0; i < a.size(); ++i) result = result && a.get()[i]; 127 | return result; 128 | } 129 | 130 | template 131 | SIMD_ALWAYS_INLINE SIMD_HOST_DEVICE inline bool any_of(simd_mask> const& a) { 132 | bool result = false; 133 | SIMD_PRAGMA for (int i = 0; i < a.size(); ++i) result = result || a.get()[i]; 134 | return result; 135 | } 136 | 137 | template 138 | class simd> { 139 | typedef T native_type __attribute__((vector_size(N))); 140 | native_type m_value; 141 | public: 142 | using value_type = T; 143 | using abi_type = simd_abi::vector_size; 144 | using mask_type = simd_mask; 145 | using storage_type = simd_storage; 146 | SIMD_ALWAYS_INLINE inline simd() = default; 147 | SIMD_ALWAYS_INLINE inline static constexpr int size() { return N / sizeof(T); } 148 | SIMD_ALWAYS_INLINE inline simd(T value) { for(int i=0; i(&m_value)[i] = value; } 149 | explicit SIMD_ALWAYS_INLINE inline simd(const native_type& value):m_value(value) {} 150 | SIMD_ALWAYS_INLINE inline 151 | simd(storage_type const& value) { 152 | copy_from(value.data(), element_aligned_tag()); 153 | } 154 | SIMD_ALWAYS_INLINE inline 155 | simd& operator=(storage_type const& value) { 156 | copy_from(value.data(), element_aligned_tag()); 157 | return *this; 158 | } 159 | template 160 | SIMD_ALWAYS_INLINE simd(T const* ptr, Flags flags) { 161 | copy_from(ptr, flags); 162 | } 163 | SIMD_ALWAYS_INLINE simd operator*(simd const& other) const { 164 | return simd(m_value * other.m_value); 165 | } 166 | SIMD_ALWAYS_INLINE simd operator/(simd const& other) const { 167 | return simd(m_value / other.m_value); 168 | } 169 | SIMD_ALWAYS_INLINE simd operator+(simd const& other) const { 170 | return simd(m_value + other.m_value); 171 | } 172 | SIMD_ALWAYS_INLINE simd operator+=(simd const& other) const { 173 | return m_value += other.m_value; 174 | } 175 | SIMD_ALWAYS_INLINE simd operator-(simd const& other) const { 176 | return simd(m_value - other.m_value); 177 | } 178 | SIMD_ALWAYS_INLINE SIMD_HOST_DEVICE inline simd operator-() const { 179 | return simd(-m_value); 180 | } 181 | SIMD_ALWAYS_INLINE void copy_from(T const* ptr, element_aligned_tag) { 182 | SIMD_PRAGMA for (int i = 0; i < size(); ++i) reinterpret_cast(&m_value)[i] = ptr[i]; 183 | } 184 | SIMD_ALWAYS_INLINE void copy_to(T* ptr, element_aligned_tag) const { 185 | SIMD_PRAGMA for (int i = 0; i < size(); ++i) ptr[i] = reinterpret_cast(&m_value)[i]; 186 | } 187 | SIMD_ALWAYS_INLINE constexpr T operator[](int i) const { return m_value[i]; } 188 | SIMD_ALWAYS_INLINE native_type const& get() const { return m_value; } 189 | SIMD_ALWAYS_INLINE native_type& get() { return m_value; } 190 | SIMD_ALWAYS_INLINE simd_mask> operator<(simd const& other) const { 191 | return simd_mask>(m_value < other.m_value); 192 | } 193 | SIMD_ALWAYS_INLINE simd_mask> operator==(simd const& other) const { 194 | return simd_mask>(m_value == other.m_value); 195 | } 196 | }; 197 | 198 | template 199 | SIMD_ALWAYS_INLINE SIMD_HOST_DEVICE inline simd> abs(simd> const& a) { 200 | simd> result; 201 | using std::sqrt; 202 | SIMD_PRAGMA for (int i = 0; i < a.size(); ++i) result.get()[i] = abs(a[i]); 203 | return result; 204 | } 205 | 206 | template 207 | SIMD_ALWAYS_INLINE SIMD_HOST_DEVICE inline simd> sqrt(simd> const& a) { 208 | simd> result; 209 | using std::sqrt; 210 | SIMD_PRAGMA for (int i = 0; i < a.size(); ++i) result.get()[i] = sqrt(a[i]); 211 | return 
result; 212 | } 213 | 214 | template 215 | SIMD_ALWAYS_INLINE SIMD_HOST_DEVICE inline simd> cbrt(simd> const& a) { 216 | simd> result; 217 | using std::cbrt; 218 | SIMD_PRAGMA for (int i = 0; i < a.size(); ++i) result.get()[i] = cbrt(a[i]); 219 | return result; 220 | } 221 | 222 | template 223 | SIMD_ALWAYS_INLINE SIMD_HOST_DEVICE inline simd> exp(simd> const& a) { 224 | simd> result; 225 | using std::exp; 226 | SIMD_PRAGMA for (int i = 0; i < a.size(); ++i) result.get()[i] = exp(a[i]); 227 | return result; 228 | } 229 | 230 | template 231 | SIMD_ALWAYS_INLINE SIMD_HOST_DEVICE inline simd> fma( 232 | simd> const& a, 233 | simd> const& b, 234 | simd> const& c) { 235 | simd> result; 236 | SIMD_PRAGMA for (int i = 0; i < a.size(); ++i) result.get()[i] = (a[i] * b[i]) + c[i]; 237 | return result; 238 | } 239 | 240 | template 241 | SIMD_ALWAYS_INLINE SIMD_HOST_DEVICE inline simd> choose( 242 | simd_mask> const& a, 243 | simd> const& b, 244 | simd> const& c) { 245 | simd> result; 246 | SIMD_PRAGMA for (int i = 0; i < a.size(); ++i) result.get()[i] = a.get()[i] ? b.get()[i] : c.get()[i]; 247 | return result; 248 | } 249 | 250 | template 251 | SIMD_ALWAYS_INLINE SIMD_HOST_DEVICE inline simd> max( 252 | simd> const& a, 253 | simd> const& b) { 254 | return choose(b < a, a, b); 255 | } 256 | 257 | template 258 | SIMD_ALWAYS_INLINE SIMD_HOST_DEVICE inline simd> min( 259 | simd> const& a, 260 | simd> const& b) { 261 | return choose(a < b, a, b); 262 | } 263 | 264 | } 265 | 266 | #endif 267 | -------------------------------------------------------------------------------- /vsx.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | //@HEADER 3 | // ************************************************************************ 4 | // 5 | // Kokkos v. 2.0 6 | // Copyright (2014) Sandia Corporation 7 | // 8 | // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, 9 | // the U.S. Government retains certain rights in this software. 10 | // 11 | // Redistribution and use in source and binary forms, with or without 12 | // modification, are permitted provided that the following conditions are 13 | // met: 14 | // 15 | // 1. Redistributions of source code must retain the above copyright 16 | // notice, this list of conditions and the following disclaimer. 17 | // 18 | // 2. Redistributions in binary form must reproduce the above copyright 19 | // notice, this list of conditions and the following disclaimer in the 20 | // documentation and/or other materials provided with the distribution. 21 | // 22 | // 3. Neither the name of the Corporation nor the names of the 23 | // contributors may be used to endorse or promote products derived from 24 | // this software without specific prior written permission. 25 | // 26 | // THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY 27 | // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 28 | // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 29 | // PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL SANDIA CORPORATION OR THE 30 | // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 31 | // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 32 | // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 33 | // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 34 | // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 35 | // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 36 | // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 37 | // 38 | // Questions? Contact Christian R. Trott (crtrott@sandia.gov) 39 | // 40 | // ************************************************************************ 41 | //@HEADER 42 | */ 43 | 44 | #pragma once 45 | 46 | #include "simd_common.hpp" 47 | 48 | #ifdef __VSX__ 49 | #include 50 | // undefine the really dangerous macros from this file 51 | #undef vector 52 | #undef pixel 53 | #undef bool 54 | #endif 55 | 56 | 57 | #if defined(__VSX__) && (!defined(__CUDACC__)) 58 | 59 | namespace SIMD_NAMESPACE { 60 | 61 | namespace simd_abi { 62 | 63 | class vsx {}; 64 | 65 | } 66 | 67 | template <> 68 | class simd_mask { 69 | __vector __bool int m_value; 70 | public: 71 | using value_type = bool; 72 | using simd_type = simd; 73 | using abi_type = simd_abi::vsx; 74 | SIMD_ALWAYS_INLINE inline simd_mask() = default; 75 | SIMD_ALWAYS_INLINE inline simd_mask(bool value) 76 | :m_value{value, value, value, value} 77 | {} 78 | SIMD_ALWAYS_INLINE inline static constexpr int size() { return 4; } 79 | SIMD_ALWAYS_INLINE inline constexpr simd_mask(__vector __bool int const& value_in) 80 | :m_value(value_in) 81 | {} 82 | SIMD_ALWAYS_INLINE inline constexpr __vector __bool int get() const { return m_value; } 83 | SIMD_ALWAYS_INLINE inline simd_mask operator||(simd_mask const& other) const { 84 | return simd_mask(vec_or(m_value, other.m_value)); 85 | } 86 | SIMD_ALWAYS_INLINE inline simd_mask operator&&(simd_mask const& other) const { 87 | return simd_mask(vec_and(m_value, other.m_value)); 88 | } 89 | SIMD_ALWAYS_INLINE inline simd_mask operator!() const { 90 | return simd_mask(vec_nand(m_value, simd_mask(true).get())); 91 | } 92 | }; 93 | 94 | SIMD_ALWAYS_INLINE inline bool all_of(simd_mask const& a) { 95 | auto const true_value = simd_mask(true).get(); 96 | return vec_all_eq(a.get(), true_value); 97 | } 98 | 99 | SIMD_ALWAYS_INLINE inline bool any_of(simd_mask const& a) { 100 | auto const true_value = simd_mask(true).get(); 101 | return vec_any_eq(a.get(), true_value); 102 | } 103 | 104 | template <> 105 | class simd { 106 | __vector float m_value; 107 | public: 108 | using value_type = float; 109 | using abi_type = simd_abi::vsx; 110 | using mask_type = simd_mask; 111 | using storage_type = simd_storage; 112 | SIMD_ALWAYS_INLINE inline simd() = default; 113 | SIMD_ALWAYS_INLINE inline static constexpr int size() { return 4; } 114 | SIMD_ALWAYS_INLINE inline simd(float value) 115 | :m_value(vec_splats(value)) 116 | {} 117 | SIMD_ALWAYS_INLINE inline simd(float a, float b, float c, float d) 118 | :m_value((__vector float){a, b, c, d}) 119 | {} 120 | SIMD_ALWAYS_INLINE inline 121 | simd(storage_type const& value) { 122 | copy_from(value.data(), element_aligned_tag()); 123 | } 124 | SIMD_ALWAYS_INLINE inline 125 | simd& operator=(storage_type const& value) { 126 | copy_from(value.data(), element_aligned_tag()); 127 | return *this; 128 | } 129 | template 130 | SIMD_ALWAYS_INLINE inline simd(float const* ptr, Flags flags) { 131 | copy_from(ptr, 
flags); 132 | } 133 | SIMD_ALWAYS_INLINE inline simd(float const* ptr, int stride) 134 | :simd(ptr[0], ptr[stride], ptr[2*stride], ptr[3*stride]) 135 | {} 136 | SIMD_ALWAYS_INLINE inline constexpr simd(__vector float const& value_in) 137 | :m_value(value_in) 138 | {} 139 | SIMD_ALWAYS_INLINE simd operator*(simd const& other) const { 140 | return simd(vec_mul(m_value, other.m_value)); 141 | } 142 | SIMD_ALWAYS_INLINE simd operator/(simd const& other) const { 143 | return simd(vec_div(m_value, other.m_value)); 144 | } 145 | SIMD_ALWAYS_INLINE simd operator+(simd const& other) const { 146 | return simd(vec_add(m_value, other.m_value)); 147 | } 148 | SIMD_ALWAYS_INLINE simd operator-(simd const& other) const { 149 | return simd(vec_sub(m_value, other.m_value)); 150 | } 151 | SIMD_ALWAYS_INLINE simd operator-() const { 152 | // return simd(vec_neg(m_value)); some GCC versions dont have this 153 | return simd(0.0) - (*this); 154 | } 155 | SIMD_ALWAYS_INLINE void copy_from(float const* ptr, element_aligned_tag) { 156 | m_value = vec_vsx_ld(0, ptr); 157 | } 158 | SIMD_ALWAYS_INLINE void copy_to(float* ptr, element_aligned_tag) const { 159 | vec_vsx_st(m_value, 0, ptr); 160 | } 161 | SIMD_ALWAYS_INLINE constexpr __vector float get() const { return m_value; } 162 | SIMD_ALWAYS_INLINE simd_mask operator<(simd const& other) const { 163 | return simd_mask(vec_cmplt(m_value, other.m_value)); 164 | } 165 | SIMD_ALWAYS_INLINE simd_mask operator==(simd const& other) const { 166 | return simd_mask(vec_cmpeq(m_value, other.m_value)); 167 | } 168 | }; 169 | 170 | SIMD_ALWAYS_INLINE inline simd abs(simd const& a) { 171 | return simd(vec_abs(a.get())); 172 | } 173 | 174 | SIMD_ALWAYS_INLINE inline simd sqrt(simd const& a) { 175 | return simd(vec_sqrt(a.get())); 176 | } 177 | 178 | SIMD_ALWAYS_INLINE inline simd fma( 179 | simd const& a, 180 | simd const& b, 181 | simd const& c) { 182 | return simd(vec_madd(a.get(), b.get(), c.get())); 183 | } 184 | 185 | SIMD_ALWAYS_INLINE inline simd max( 186 | simd const& a, simd const& b) { 187 | return simd(vec_max(a.get(), b.get())); 188 | } 189 | 190 | SIMD_ALWAYS_INLINE inline simd min( 191 | simd const& a, simd const& b) { 192 | return simd(vec_min(a.get(), b.get())); 193 | } 194 | 195 | SIMD_ALWAYS_INLINE inline simd choose( 196 | simd_mask const& a, simd const& b, simd const& c) { 197 | return simd(vec_sel(c.get(), b.get(), a.get())); 198 | } 199 | 200 | template <> 201 | class simd_mask { 202 | __vector __bool long long m_value; 203 | using ll_t = long long; 204 | using ull_t = unsigned long long; 205 | public: 206 | using value_type = bool; 207 | using simd_type = simd_mask; 208 | using abi_type = simd_abi::vsx; 209 | SIMD_ALWAYS_INLINE inline simd_mask() = default; 210 | SIMD_ALWAYS_INLINE inline simd_mask(bool value) 211 | :m_value{ull_t(-ll_t(value)), ull_t(-ll_t(value))} 212 | {} 213 | SIMD_ALWAYS_INLINE inline static constexpr int size() { return 2; } 214 | SIMD_ALWAYS_INLINE inline constexpr simd_mask(__vector __bool long long const& value_in) 215 | :m_value(value_in) 216 | {} 217 | SIMD_ALWAYS_INLINE inline constexpr __vector __bool long long get() const { return m_value; } 218 | SIMD_ALWAYS_INLINE inline simd_mask operator||(simd_mask const& other) const { 219 | return simd_mask(vec_or(m_value, other.m_value)); 220 | } 221 | SIMD_ALWAYS_INLINE inline simd_mask operator&&(simd_mask const& other) const { 222 | return simd_mask(vec_and(m_value, other.m_value)); 223 | } 224 | SIMD_ALWAYS_INLINE inline simd_mask operator!() const { 225 | return 
simd_mask(vec_nand(m_value, simd_mask(true).get())); 226 | } 227 | }; 228 | 229 | SIMD_ALWAYS_INLINE inline bool all_of(simd_mask const& a) { 230 | auto const true_value = simd_mask(true).get(); 231 | return vec_all_eq(a.get(), true_value); 232 | } 233 | 234 | SIMD_ALWAYS_INLINE inline bool any_of(simd_mask const& a) { 235 | auto const true_value = simd_mask(true).get(); 236 | return vec_any_eq(a.get(), true_value); 237 | } 238 | 239 | template <> 240 | class simd { 241 | __vector double m_value; 242 | public: 243 | using value_type = double; 244 | using abi_type = simd_abi::vsx; 245 | using mask_type = simd_mask; 246 | using storage_type = simd_storage; 247 | SIMD_ALWAYS_INLINE inline simd() = default; 248 | SIMD_ALWAYS_INLINE inline simd(simd const&) = default; 249 | SIMD_ALWAYS_INLINE inline simd(simd&&) = default; 250 | SIMD_ALWAYS_INLINE inline simd& operator=(simd const&) = default; 251 | SIMD_ALWAYS_INLINE inline simd& operator=(simd&&) = default; 252 | SIMD_ALWAYS_INLINE inline static constexpr int size() { return 2; } 253 | SIMD_ALWAYS_INLINE inline simd(double value) 254 | :m_value(vec_splats(value)) 255 | {} 256 | SIMD_ALWAYS_INLINE inline simd(double a, double b) 257 | :m_value((__vector double){a, b}) 258 | {} 259 | SIMD_ALWAYS_INLINE inline 260 | simd(storage_type const& value) { 261 | copy_from(value.data(), element_aligned_tag()); 262 | } 263 | #ifdef STK_VOLATILE_SIMD 264 | SIMD_ALWAYS_INLINE inline 265 | simd(simd const volatile& value) 266 | :m_value(value.m_value) 267 | {} 268 | #endif 269 | SIMD_ALWAYS_INLINE inline 270 | simd& operator=(storage_type const& value) { 271 | copy_from(value.data(), element_aligned_tag()); 272 | return *this; 273 | } 274 | template 275 | SIMD_ALWAYS_INLINE inline simd(double const* ptr, Flags flags) { 276 | copy_from(ptr, flags); 277 | } 278 | SIMD_ALWAYS_INLINE inline simd(double const* ptr, int stride) 279 | :simd(ptr[0], ptr[stride]) 280 | {} 281 | SIMD_ALWAYS_INLINE inline constexpr simd(__vector double const& value_in) 282 | :m_value(value_in) 283 | {} 284 | SIMD_ALWAYS_INLINE inline simd operator*(simd const& other) const { 285 | return simd(vec_mul(m_value, other.m_value)); 286 | } 287 | SIMD_ALWAYS_INLINE inline simd operator/(simd const& other) const { 288 | return simd(vec_div(m_value, other.m_value)); 289 | } 290 | SIMD_ALWAYS_INLINE inline simd operator+(simd const& other) const { 291 | return simd(vec_add(m_value, other.m_value)); 292 | } 293 | #ifdef STK_VOLATILE_SIMD 294 | SIMD_ALWAYS_INLINE inline void plus_equals(simd const volatile& other) volatile { 295 | m_value = vec_add(m_value, other.m_value); 296 | } 297 | #endif 298 | SIMD_ALWAYS_INLINE inline simd operator-(simd const& other) const { 299 | return simd(vec_sub(m_value, other.m_value)); 300 | } 301 | SIMD_ALWAYS_INLINE inline simd operator-() const { 302 | // return simd(vec_neg(m_value)); some GCC versions dont have this 303 | return simd(0.0) - (*this); 304 | } 305 | SIMD_ALWAYS_INLINE inline void copy_from(double const* ptr, element_aligned_tag) { 306 | m_value = vec_vsx_ld(0, ptr); 307 | } 308 | SIMD_ALWAYS_INLINE inline void copy_to(double* ptr, element_aligned_tag) const { 309 | vec_vsx_st(m_value, 0, ptr); 310 | } 311 | SIMD_ALWAYS_INLINE inline constexpr __vector double get() const { return m_value; } 312 | SIMD_ALWAYS_INLINE inline simd_mask operator<(simd const& other) const { 313 | return simd_mask(vec_cmplt(m_value, other.m_value)); 314 | } 315 | SIMD_ALWAYS_INLINE inline simd_mask operator==(simd const& other) const { 316 | return 
simd_mask<double, simd_abi::vsx>(vec_cmpeq(m_value, other.m_value)); 317 | } 318 | }; 319 | 320 | SIMD_ALWAYS_INLINE inline simd<double, simd_abi::vsx> abs(simd<double, simd_abi::vsx> const& a) { 321 | return simd<double, simd_abi::vsx>(vec_abs(a.get())); 322 | } 323 | 324 | SIMD_ALWAYS_INLINE inline simd<double, simd_abi::vsx> sqrt(simd<double, simd_abi::vsx> const& a) { 325 | return simd<double, simd_abi::vsx>(vec_sqrt(a.get())); 326 | } 327 | 328 | SIMD_ALWAYS_INLINE inline simd<double, simd_abi::vsx> fma( 329 | simd<double, simd_abi::vsx> const& a, 330 | simd<double, simd_abi::vsx> const& b, 331 | simd<double, simd_abi::vsx> const& c) { 332 | return simd<double, simd_abi::vsx>(vec_madd(a.get(), b.get(), c.get())); 333 | } 334 | 335 | SIMD_ALWAYS_INLINE inline simd<double, simd_abi::vsx> max( 336 | simd<double, simd_abi::vsx> const& a, simd<double, simd_abi::vsx> const& b) { 337 | return simd<double, simd_abi::vsx>(vec_max(a.get(), b.get())); 338 | } 339 | 340 | SIMD_ALWAYS_INLINE inline simd<double, simd_abi::vsx> min( 341 | simd<double, simd_abi::vsx> const& a, simd<double, simd_abi::vsx> const& b) { 342 | return simd<double, simd_abi::vsx>(vec_min(a.get(), b.get())); 343 | } 344 | 345 | SIMD_ALWAYS_INLINE inline simd<double, simd_abi::vsx> choose( 346 | simd_mask<double, simd_abi::vsx> const& a, simd<double, simd_abi::vsx> const& b, simd<double, simd_abi::vsx> const& c) { 347 | return simd<double, simd_abi::vsx>(vec_sel(c.get(), b.get(), a.get())); 348 | } 349 | 350 | } 351 | 352 | #endif 353 | --------------------------------------------------------------------------------
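Caller-side sketch (illustrative only, not part of the repository): the snippet below shows how the length-agnostic pieces dumped above fit together, using only operations the headers define — the pointer-plus-element_aligned_tag constructors, copy_to, the arithmetic and comparison operators, and the free functions sqrt and choose. It assumes SIMD_NAMESPACE expands to simd (as the qualified names in test.cpp suggest), and it instantiates the simd_abi::vector_size<32> ABI from vector_size.hpp, which per that header's guard requires Clang 11+, GCC 10.2+, or -DSIMD_ENABLE_VECTOR_SIZE. The helper name clamped_hypot and its bound parameter are made up for the example.

#include <cstdio>
#include "simd.hpp"

// Compute sqrt(a*a + b*b) per lane and clamp it to an upper bound,
// for any ABI that provides the operations used here.
template <class Abi>
void clamped_hypot(double const* a, double const* b, double* out,
                   double bound, int n) {
  using simd_t = simd::simd<double, Abi>;
  constexpr int width = simd_t::size();
  simd_t const limit(bound);                       // broadcast constructor
  for (int i = 0; i + width <= n; i += width) {
    simd_t x(a + i, simd::element_aligned_tag());  // unaligned lane-wise load
    simd_t y(b + i, simd::element_aligned_tag());
    simd_t r = simd::sqrt(x * x + y * y);
    r = simd::choose(r < limit, r, limit);         // keep r where r < limit, else limit
    r.copy_to(out + i, simd::element_aligned_tag());
  }
}

int main() {
  double a[8] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0};
  double b[8] = {8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0};
  double out[8];
  // 32-byte lanes of double -> 4 lanes per simd value on a supporting compiler.
  clamped_hypot<simd::simd_abi::vector_size<32> >(a, b, out, 7.5, 8);
  for (double v : out) std::printf("%g\n", v);
}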