├── .clang-format ├── .github ├── FUNDING.yml └── workflows │ └── ci.yml ├── .gitignore ├── CMakeLists.txt ├── CMakeSettings.json ├── LICENSE ├── README.md ├── cmake ├── ArchDetect.cmake └── ArchDetect.cpp ├── dispatch ├── CMakeLists.txt ├── cmake │ ├── ClassSIMD.cmake │ ├── feature_set_source.cpp.in │ └── simd_lib_config.h.in └── impl │ └── DispatchClassImpl.h ├── examples ├── CMakeLists.txt ├── dispatch_library │ ├── CMakeLists.txt │ ├── example.h │ ├── example.inl │ └── main.cpp └── header_only │ ├── CMakeLists.txt │ └── main.cpp ├── include └── FastSIMD │ ├── DispatchClass.h │ ├── ToolSet.h │ ├── ToolSet │ ├── ARM │ │ ├── 128 │ │ │ ├── f32x4.h │ │ │ ├── i32x4.h │ │ │ └── m32x4.h │ │ ├── ARM.h │ │ └── NEON.h │ ├── Generic │ │ ├── Functions.h │ │ ├── Register.h │ │ ├── Scalar.h │ │ └── Scalar │ │ │ ├── f32x1.h │ │ │ ├── i32x1.h │ │ │ └── mNx1.h │ ├── WASM │ │ ├── 128 │ │ │ ├── f32x4.h │ │ │ ├── i32x4.h │ │ │ └── m32x4.h │ │ └── WASM.h │ └── x86 │ │ ├── 128 │ │ ├── f32x4.h │ │ ├── i32x4.h │ │ └── m32x4.h │ │ ├── 256 │ │ ├── f32x8.h │ │ ├── i32x8.h │ │ └── m32x8.h │ │ ├── 512 │ │ ├── f32x16.h │ │ ├── i32x16.h │ │ └── mNx16.h │ │ ├── AVX.h │ │ ├── AVX512.h │ │ ├── SSE.h │ │ └── x86.h │ └── Utility │ ├── ArchDetect.h │ ├── Export.h │ ├── FeatureEnums.h │ └── FeatureSetList.h ├── src └── FastSIMD.cpp └── tests ├── CMakeLists.txt ├── test.cpp ├── test.h └── test.inl /.clang-format: -------------------------------------------------------------------------------- 1 | --- 2 | Language: Cpp 3 | BasedOnStyle: Microsoft 4 | AccessModifierOffset: -4 5 | AlignOperands: false 6 | AlignTrailingComments: false 7 | AlwaysBreakTemplateDeclarations: Yes 8 | BraceWrapping: 9 | AfterCaseLabel: true 10 | AfterClass: true 11 | AfterControlStatement: true 12 | AfterEnum: true 13 | AfterFunction: true 14 | AfterNamespace: true 15 | AfterStruct: true 16 | AfterUnion: true 17 | AfterExternBlock: false 18 | BeforeCatch: true 19 | BeforeElse: true 20 | SplitEmptyFunction: true 21 | 
SplitEmptyRecord: true 22 | SplitEmptyNamespace: true 23 | BreakConstructorInitializers: AfterColon 24 | ColumnLimit: 0 25 | Cpp11BracedListStyle: false 26 | IncludeCategories: 27 | - Regex: '^<.*' 28 | Priority: 1 29 | - Regex: '^".*' 30 | Priority: 2 31 | - Regex: '.*' 32 | Priority: 3 33 | IncludeIsMainRegex: '([-_](test|unittest))?$' 34 | MaxEmptyLinesToKeep: 2 35 | NamespaceIndentation: All 36 | PointerAlignment: Left 37 | SpaceAfterTemplateKeyword: 'false' 38 | SpaceBeforeCpp11BracedList: 'true' 39 | SpaceBeforeParens: Never 40 | SpaceBeforeRangeBasedForLoopColon: 'false' 41 | SpaceInEmptyParentheses: 'false' 42 | SpacesInCStyleCastParentheses: 'false' 43 | SpacesInContainerLiterals: 'false' 44 | SpacesInParentheses: 'true' 45 | SpacesInSquareBrackets: 'false' 46 | UseTab: Never 47 | ... 48 | -------------------------------------------------------------------------------- /.github/FUNDING.yml: -------------------------------------------------------------------------------- 1 | # These are supported funding model platforms 2 | 3 | github: Auburn 4 | patreon: # Replace with a single Patreon username 5 | open_collective: # Replace with a single Open Collective username 6 | ko_fi: # Replace with a single Ko-fi username 7 | tidelift: # Replace with a single Tidelift platform-name/package-name e.g., npm/babel 8 | community_bridge: # Replace with a single Community Bridge project-name e.g., cloud-foundry 9 | liberapay: # Replace with a single Liberapay username 10 | issuehunt: # Replace with a single IssueHunt username 11 | otechie: # Replace with a single Otechie username 12 | custom: # Replace with up to 4 custom sponsorship URLs e.g., ['link1', 'link2'] 13 | -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: CI 2 | 3 | # Controls when the action will run. 
Triggers the workflow on push or pull request 4 | # events but only for the master branch 5 | on: 6 | push: 7 | branches: [master] 8 | pull_request: 9 | branches: [master] 10 | 11 | # A workflow run is made up of one or more jobs that can run sequentially or in parallel 12 | jobs: 13 | ci-matrix: 14 | name: ${{ matrix.name }} 15 | runs-on: ${{ matrix.os }} 16 | strategy: 17 | fail-fast: false 18 | matrix: 19 | include: 20 | - os: windows-latest 21 | name: Win32-MSVC 22 | cmake_options: -A Win32 23 | - os: windows-latest 24 | name: Win64-MSVC 25 | cmake_options: -A x64 26 | - os: windows-latest 27 | name: Win64-ClangCL 28 | cmake_options: -A x64 -T ClangCL 29 | - os: ubuntu-latest 30 | name: Linux64-GCC 31 | cmake_options: -DCMAKE_C_COMPILER=gcc -DCMAKE_CXX_COMPILER=g++ 32 | - os: ubuntu-latest 33 | name: Linux64-Clang 34 | cmake_options: -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ 35 | - os: macos-13 36 | name: MacOS64-Clang 37 | cmake_options: -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ 38 | - os: macos-latest 39 | name: MacOSARM64-Clang 40 | cmake_options: -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_OSX_ARCHITECTURES="x86_64;arm64" 41 | - os: ubuntu-latest 42 | name: Emscripten 43 | cmake_options: -DCMAKE_TOOLCHAIN_FILE=$EMSDK/upstream/emscripten/cmake/Modules/Platform/Emscripten.cmake 44 | 45 | steps: 46 | - name: 'Setup Emscripten' 47 | if: matrix.name == 'Emscripten' 48 | uses: mymindstorm/setup-emsdk@v14 49 | with: 50 | version: 3.1.56 51 | no-cache: true 52 | 53 | - name: 'Checkout' 54 | uses: actions/checkout@v4 55 | 56 | - name: 'CMake Configure Debug' 57 | run: cmake -S ${{ github.workspace }} -B ${{ github.workspace }}/debug -DCMAKE_BUILD_TYPE=Debug -DCMAKE_INSTALL_PREFIX="${{ github.workspace }}/install/FastSIMD" ${{ matrix.cmake_options }} 58 | 59 | - name: 'CMake Build Debug' 60 | run: cmake --build ${{ github.workspace }}/debug --config Debug --parallel 4 61 | 62 | - name: 'CMake Configure Release' 63 | run: 
cmake -S ${{ github.workspace }} -B ${{ github.workspace }}/release -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX="${{ github.workspace }}/install/FastSIMD" ${{ matrix.cmake_options }} 64 | 65 | - name: 'CMake Build Release' 66 | run: cmake --build ${{ github.workspace }}/release --config Release --parallel 4 67 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Prerequisites 2 | *.d 3 | 4 | # Compiled Object files 5 | *.slo 6 | *.lo 7 | *.o 8 | *.obj 9 | 10 | # Precompiled Headers 11 | *.gch 12 | *.pch 13 | 14 | # Compiled Dynamic libraries 15 | *.so 16 | *.dylib 17 | *.dll 18 | 19 | # Fortran module files 20 | *.mod 21 | *.smod 22 | 23 | # Compiled Static libraries 24 | *.lai 25 | *.la 26 | *.a 27 | *.lib 28 | 29 | # Executables 30 | *.exe 31 | *.out 32 | *.app 33 | /.vs* 34 | /out 35 | /build 36 | /enc_temp_folder 37 | /cpm-cache 38 | /old 39 | 40 | emsdk 41 | -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # CMakeList.txt : CMake project for FastSIMD 2 | cmake_minimum_required(VERSION 3.7.1) 3 | 4 | project(FastSIMD VERSION 1.0.0) 5 | set(CMAKE_CXX_STANDARD 17) 6 | 7 | # determine whether this is a standalone project or included by other projects 8 | if (NOT DEFINED FASTSIMD_STANDALONE_PROJECT) 9 | if (CMAKE_CURRENT_SOURCE_DIR STREQUAL CMAKE_SOURCE_DIR) 10 | set(FASTSIMD_STANDALONE_PROJECT ON) 11 | else() 12 | set(FASTSIMD_STANDALONE_PROJECT OFF) 13 | endif() 14 | endif() 15 | 16 | option(FASTSIMD_DISPATCH_CLASS "Enable FastSIMD Dispatch Classes" ON) 17 | option(FASTSIMD_EXAMPLES "Build FastSIMD examples" ${FASTSIMD_STANDALONE_PROJECT}) 18 | option(FASTSIMD_TESTS "Build FastSIMD tests" ${FASTSIMD_STANDALONE_PROJECT}) 19 | 20 | include(cmake/ArchDetect.cmake) 21 | 22 | 
target_architecture(FASTSIMD_ARCH_DETECT FASTSIMD_ARCHVER_DETECT) 23 | 24 | add_library(FastSIMD OBJECT "src/FastSIMD.cpp") 25 | target_compile_definitions(FastSIMD PRIVATE FASTSIMD_EXPORT) 26 | 27 | if(BUILD_SHARED_LIBS) 28 | set_property(TARGET FastSIMD PROPERTY POSITION_INDEPENDENT_CODE ON) 29 | else() 30 | target_compile_definitions(FastSIMD PUBLIC FASTSIMD_STATIC_LIB) 31 | endif() 32 | 33 | if(CMAKE_SYSTEM_NAME STREQUAL "Emscripten") 34 | target_compile_options(FastSIMD PUBLIC -msimd128) 35 | endif() 36 | 37 | target_include_directories(FastSIMD PUBLIC 38 | $ 39 | $ 40 | ) 41 | 42 | if(FASTSIMD_DISPATCH_CLASS) 43 | add_subdirectory(dispatch) 44 | endif() 45 | 46 | if(FASTSIMD_TESTS) 47 | add_subdirectory(tests) 48 | endif() 49 | 50 | if(FASTSIMD_EXAMPLES) 51 | add_subdirectory(examples) 52 | endif() 53 | -------------------------------------------------------------------------------- /CMakeSettings.json: -------------------------------------------------------------------------------- 1 | { 2 | "configurations": [ 3 | { 4 | "name": "x64-Debug", 5 | "generator": "Ninja", 6 | "configurationType": "Debug", 7 | "inheritEnvironments": [ "msvc_x64_x64" ], 8 | "buildRoot": "${projectDir}\\out\\build\\${name}", 9 | "installRoot": "${projectDir}\\out\\install\\${name}", 10 | "cmakeCommandArgs": "", 11 | "buildCommandArgs": "-v", 12 | "ctestCommandArgs": "" 13 | }, 14 | { 15 | "name": "x64-Clang-Debug", 16 | "generator": "Ninja", 17 | "configurationType": "Debug", 18 | "buildRoot": "${projectDir}\\out\\build\\${name}", 19 | "installRoot": "${projectDir}\\out\\install\\${name}", 20 | "buildCommandArgs": "-v", 21 | "ctestCommandArgs": "", 22 | "inheritEnvironments": [ "clang_cl_x64_x64" ] 23 | }, 24 | { 25 | "name": "WSL-GCC-Debug", 26 | "generator": "Ninja", 27 | "configurationType": "Debug", 28 | "buildRoot": "${projectDir}\\out\\build\\${name}", 29 | "installRoot": "${projectDir}\\out\\install\\${name}", 30 | "cmakeExecutable": "cmake", 31 | "cmakeCommandArgs": "", 32 | 
"buildCommandArgs": "-v", 33 | "ctestCommandArgs": "", 34 | "inheritEnvironments": [ "linux_x64" ], 35 | "wslPath": "${defaultWSLPath}" 36 | }, 37 | { 38 | "name": "WSL-Clang-Debug", 39 | "generator": "Ninja", 40 | "configurationType": "Debug", 41 | "buildRoot": "${projectDir}\\out\\build\\${name}", 42 | "installRoot": "${projectDir}\\out\\install\\${name}", 43 | "cmakeExecutable": "cmake", 44 | "cmakeCommandArgs": "", 45 | "buildCommandArgs": "-v", 46 | "ctestCommandArgs": "", 47 | "inheritEnvironments": [ "linux_clang_x64" ], 48 | "wslPath": "${defaultWSLPath}" 49 | }, 50 | { 51 | "name": "x64-Release", 52 | "generator": "Ninja", 53 | "configurationType": "RelWithDebInfo", 54 | "buildRoot": "${projectDir}\\out\\build\\${name}", 55 | "installRoot": "${projectDir}\\out\\install\\${name}", 56 | "cmakeCommandArgs": "", 57 | "buildCommandArgs": "-v", 58 | "ctestCommandArgs": "", 59 | "inheritEnvironments": [ "msvc_x64_x64" ], 60 | "variables": [] 61 | }, 62 | { 63 | "name": "x64-Clang-Release", 64 | "generator": "Ninja", 65 | "configurationType": "RelWithDebInfo", 66 | "buildRoot": "${projectDir}\\out\\build\\${name}", 67 | "installRoot": "${projectDir}\\out\\install\\${name}", 68 | "buildCommandArgs": "-v", 69 | "ctestCommandArgs": "", 70 | "inheritEnvironments": [ "clang_cl_x64_x64" ], 71 | "variables": [] 72 | } 73 | ] 74 | } -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 Jordan Peck 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, 
subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # FastSIMD 2 | 3 | FastSIMD is an SIMD abstraction layer that allows easy development of SIMD code. Using the generic SIMD register types included with FastSIMD provides access to standard C++ operators and various mathematic functions. Using template parameters or a define the generic types can be compiled to various SIMD feature sets (SSE, AVX, NEON...) 4 | 5 | FastSIMD also provides an easy to use class template for runtime SIMD feature set detection and appropriate function dispatch. The generic register types mentioned above make it easy to write code that always makes use of the highest SIMD feature set available. 6 | 7 | The original version of FastSIMD was developed along side FastNoise2. This new "1.0" version of FastSIMD is being developed as a working standalone library, although the development is still primarily driven to support FastNoise2. Compared to the "old" version of FastSIMD supplied with FastNoise2 this "1.0" version of FastSIMD has these improvements: 8 | 9 | - Removed all uses of C++ macros in favour of templated types/functions 10 | - Variable sized generic register types. 
For example when using operators on a register of 8xInt32 when targeting SSE, the intrinsics output will get doubled up transparently. 11 | - Moved from SIMD levels to FeatureFlags to allow more verbose specialisation of templated types and more readable code. 12 | -------------------------------------------------------------------------------- /cmake/ArchDetect.cmake: -------------------------------------------------------------------------------- 1 | function(target_architecture output_arch output_arch_ver) 2 | if(APPLE AND CMAKE_OSX_ARCHITECTURES) 3 | set(ARCH "${CMAKE_OSX_ARCHITECTURES}") 4 | set(ARCH_VER unknown) 5 | else() 6 | 7 | # Detect the architecture in a rather creative way... 8 | # This compiles a small C program which is a series of ifdefs that selects a 9 | # particular #error preprocessor directive whose message string contains the 10 | # target architecture. The program will always fail to compile (both because 11 | # file is not a valid C program, and obviously because of the presence of the 12 | # #error preprocessor directives... 
but by exploiting the preprocessor in this 13 | # way, we can detect the correct target architecture even when cross-compiling, 14 | # since the program itself never needs to be run (only the compiler/preprocessor) 15 | try_compile( 16 | compile_result_unused 17 | "${CMAKE_BINARY_DIR}" 18 | "${FastSIMD_SOURCE_DIR}/cmake/ArchDetect.cpp" 19 | OUTPUT_VARIABLE COMPILE_OUTPUT 20 | ) 21 | #message(STATUS ${COMPILE_OUTPUT}) 22 | 23 | # Parse the architecture name from the compiler output 24 | if ("${COMPILE_OUTPUT}" MATCHES "FASTSIMD_ARCH<([^\"=]+)=([^>]+)") 25 | set(ARCH "${CMAKE_MATCH_1}") 26 | set(ARCH_VER "${CMAKE_MATCH_2}") 27 | 28 | else() 29 | set(ARCH unknown) 30 | set(ARCH_VER SCALAR) 31 | endif() 32 | endif() 33 | 34 | message(STATUS "FastSIMD: Detected arch \"${ARCH}\" \"${ARCH_VER}\"") 35 | set(${output_arch} "${ARCH}" PARENT_SCOPE) 36 | set(${output_arch_ver} "${ARCH_VER}" PARENT_SCOPE) 37 | endfunction() -------------------------------------------------------------------------------- /cmake/ArchDetect.cpp: -------------------------------------------------------------------------------- 1 | #include "../include/FastSIMD/Utility/ArchDetect.h" 2 | 3 | #define TO_LITERAL_( string ) #string 4 | #define TO_LITERAL( string ) TO_LITERAL_( string ) 5 | 6 | #if !defined( TEST_FEATURE_SET_ACTIVE ) 7 | #define FASTSIMD_DETECT_SUCCESS 8 | #else 9 | #if FASTSIMD_FEATURE_VALUE( TEST_FEATURE_SET_ACTIVE ) > 0 10 | #define FASTSIMD_DETECT_SUCCESS 11 | #endif 12 | #endif 13 | 14 | #ifdef FASTSIMD_DETECT_SUCCESS 15 | static_assert( 0, "FASTSIMD_ARCH<" TO_LITERAL( FASTSIMD_ARCH_NAME() ) ">" ); 16 | 17 | // Needed for MacOS clang, it doesn't evaluate macros in static assert errors 18 | TO_LITERAL( FASTSIMD_ARCH ); 19 | 20 | #endif -------------------------------------------------------------------------------- /dispatch/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | include(cmake/ClassSIMD.cmake) 2 | 
-------------------------------------------------------------------------------- /dispatch/cmake/ClassSIMD.cmake: -------------------------------------------------------------------------------- 1 | 2 | function(fastsimd_add_feature_set_source fastsimd_library_sources feature_set is_relaxed) 3 | foreach(simd_inl ${fastsimd_library_sources}) 4 | set(feature_set_source "${simd_library_source_dir}/${simd_library_name}_${feature_set}.cpp") 5 | set(simd_inl_full "${CMAKE_CURRENT_LIST_DIR}/${simd_inl}") 6 | 7 | configure_file("${FastSIMD_SOURCE_DIR}/dispatch/cmake/feature_set_source.cpp.in" ${feature_set_source}) 8 | target_sources(${simd_library_name} PRIVATE ${feature_set_source}) 9 | 10 | if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "MSVC") 11 | # MSVC 32bit needs SSE2 flag for all SSE levels 12 | if(${feature_set} MATCHES "SSE[^(0-9)]" AND CMAKE_SIZEOF_VOID_P EQUAL 4) 13 | set_property(SOURCE ${feature_set_source} APPEND PROPERTY COMPILE_OPTIONS /arch:SSE2) 14 | 15 | elseif(${feature_set} MATCHES "AVX[^(0-9)]") 16 | set_property(SOURCE ${feature_set_source} APPEND PROPERTY COMPILE_OPTIONS /arch:AVX) 17 | 18 | elseif(${feature_set} MATCHES AVX2) 19 | set_property(SOURCE ${feature_set_source} APPEND PROPERTY COMPILE_OPTIONS /arch:AVX2) 20 | 21 | elseif(${feature_set} MATCHES AVX512) 22 | set_property(SOURCE ${feature_set_source} APPEND PROPERTY COMPILE_OPTIONS /arch:AVX512) 23 | endif() 24 | else() 25 | if(${feature_set} MATCHES SSE2 AND CMAKE_SIZEOF_VOID_P EQUAL 4) 26 | set_property(SOURCE ${feature_set_source} APPEND PROPERTY COMPILE_OPTIONS -msse2) 27 | 28 | elseif(${feature_set} MATCHES SSE3) 29 | set_property(SOURCE ${feature_set_source} APPEND PROPERTY COMPILE_OPTIONS -msse3) 30 | 31 | elseif(${feature_set} MATCHES SSSE3) 32 | set_property(SOURCE ${feature_set_source} APPEND PROPERTY COMPILE_OPTIONS -mssse3) 33 | 34 | elseif(${feature_set} MATCHES SSE41) 35 | set_property(SOURCE ${feature_set_source} APPEND PROPERTY COMPILE_OPTIONS -msse4.1) 36 | 37 | 
elseif(${feature_set} MATCHES SSE42) 38 | set_property(SOURCE ${feature_set_source} APPEND PROPERTY COMPILE_OPTIONS -msse4.2) 39 | 40 | elseif(${feature_set} MATCHES "AVX[^(0-9)]") 41 | set_property(SOURCE ${feature_set_source} APPEND PROPERTY COMPILE_OPTIONS -mavx) 42 | 43 | elseif(${feature_set} MATCHES AVX2) 44 | if(is_relaxed) 45 | set_property(SOURCE ${feature_set_source} APPEND PROPERTY COMPILE_OPTIONS -mfma) 46 | else() 47 | set_property(SOURCE ${feature_set_source} APPEND PROPERTY COMPILE_OPTIONS -mno-fma) 48 | endif() 49 | set_property(SOURCE ${feature_set_source} APPEND PROPERTY COMPILE_OPTIONS -mavx2) 50 | 51 | elseif(${feature_set} MATCHES AVX512) 52 | if(is_relaxed) 53 | set_property(SOURCE ${feature_set_source} APPEND PROPERTY COMPILE_OPTIONS -mfma) 54 | else() 55 | set_property(SOURCE ${feature_set_source} APPEND PROPERTY COMPILE_OPTIONS -mno-fma) 56 | endif() 57 | set_property(SOURCE ${feature_set_source} APPEND PROPERTY COMPILE_OPTIONS -mavx512f -mavx512dq -mavx512vl -mavx512bw) 58 | 59 | elseif(${feature_set} MATCHES WASM) 60 | if(is_relaxed) 61 | set_property(SOURCE ${feature_set_source} APPEND PROPERTY COMPILE_OPTIONS -mrelaxed-simd) 62 | endif() 63 | endif() 64 | endif() 65 | endforeach() 66 | endfunction() 67 | 68 | function(fastsimd_create_dispatch_library simd_library_name) 69 | 70 | cmake_parse_arguments(PARSE_ARGV 0 fastsimd_create_dispatch_library "RELAXED" "" "SOURCES;FEATURE_SETS") 71 | 72 | list(LENGTH fastsimd_create_dispatch_library_FEATURE_SETS FEATURE_SET_COUNT) 73 | list(LENGTH fastsimd_create_dispatch_library_SOURCES SOURCES_COUNT) 74 | 75 | if(SOURCES_COUNT EQUAL 0) 76 | message(FATAL_ERROR "FastSIMD: \"${simd_library_name}\" No SOURCES specified, example usage: fastsimd_create_dispatch_library(example_simd SOURCES \"example.inl\")") 77 | endif() 78 | 79 | if(FEATURE_SET_COUNT EQUAL 0) 80 | message("FastSIMD: \"${simd_library_name}\" No FEATURE_SETS specified, using default feature sets") 81 | 
set(fastsimd_create_dispatch_library_FEATURE_SETS 82 | SSE2 83 | SSE41 84 | AVX2 85 | AVX512 86 | NEON 87 | AARCH64 88 | WASM) 89 | endif() 90 | 91 | add_library(${simd_library_name} OBJECT) 92 | 93 | set(simd_library_source_dir "${CMAKE_CURRENT_BINARY_DIR}/fastsimd/${simd_library_name}") 94 | 95 | target_compile_definitions(${simd_library_name} PRIVATE FASTSIMD_EXPORT FASTSIMD_LIBRARY_NAME=${simd_library_name}) 96 | target_link_libraries(${simd_library_name} PRIVATE FastSIMD) 97 | 98 | target_include_directories(${simd_library_name} PUBLIC 99 | $ 100 | $) 101 | 102 | if(BUILD_SHARED_LIBS) 103 | set_target_properties(${simd_library_name} PROPERTIES POSITION_INDEPENDENT_CODE ON) 104 | endif() 105 | 106 | if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU") 107 | set_target_properties(${simd_library_name} PROPERTIES COMPILE_FLAGS "-Wno-ignored-attributes") 108 | endif() 109 | 110 | if(MINGW) 111 | target_compile_options(${simd_library_name} PRIVATE -Wa,-muse-unaligned-vector-move) 112 | endif() 113 | 114 | if(fastsimd_create_dispatch_library_RELAXED) 115 | target_compile_definitions(${simd_library_name} PUBLIC FASTSIMD_IS_RELAXED=1) 116 | set(relaxed_log_msg " (RELAXED)") 117 | endif() 118 | 119 | set(feature_set_list "") 120 | set(feature_set_list_debug "") 121 | 122 | foreach(feature_set ${fastsimd_create_dispatch_library_FEATURE_SETS}) 123 | if(APPLE AND (NOT CMAKE_OSX_ARCHITECTURES STREQUAL "") AND (NOT feature_set STREQUAL "SCALAR")) 124 | # Loop through OSX arches and test compile on each separately 125 | foreach(CMAKE_OSX_ARCHITECTURES ${CMAKE_OSX_ARCHITECTURES}) 126 | #message(STATUS "${CMAKE_OSX_ARCHITECTURES} ${feature_set}") 127 | try_compile( 128 | compile_result_unused 129 | "${CMAKE_BINARY_DIR}" 130 | "${FastSIMD_SOURCE_DIR}/cmake/ArchDetect.cpp" 131 | OUTPUT_VARIABLE COMPILE_OUTPUT 132 | COMPILE_DEFINITIONS -DTEST_FEATURE_SET_ACTIVE=${feature_set} 133 | ) 134 | 135 | #message(STATUS ${COMPILE_OUTPUT}) 136 | if (COMPILE_OUTPUT MATCHES 
"FASTSIMD_ARCH<([^\"=]+)=([^>]+)") 137 | set(feature_arch_detect "FASTSIMD_CURRENT_ARCH_IS( ${CMAKE_MATCH_1} )") 138 | fastsimd_add_feature_set_source(${fastsimd_create_dispatch_library_SOURCES} ${feature_set} ${fastsimd_create_dispatch_library_RELAXED}) 139 | string(APPEND feature_set_list "#if ${feature_arch_detect}\n,FastSIMD::FeatureSet::${feature_set}\n#endif\n" ) 140 | list(APPEND feature_set_list_debug "${feature_set}") 141 | break() 142 | endif() 143 | endforeach() 144 | else() 145 | try_compile( 146 | compile_result_unused 147 | "${CMAKE_BINARY_DIR}" 148 | "${FastSIMD_SOURCE_DIR}/cmake/ArchDetect.cpp" 149 | OUTPUT_VARIABLE COMPILE_OUTPUT 150 | COMPILE_DEFINITIONS -DTEST_FEATURE_SET_ACTIVE=${feature_set} 151 | ) 152 | 153 | #message(STATUS ${COMPILE_OUTPUT}) 154 | if (COMPILE_OUTPUT MATCHES "FASTSIMD_ARCH<([^\">=]+)=([^\">]+)>") 155 | set(feature_arch_detect "1") 156 | fastsimd_add_feature_set_source(${fastsimd_create_dispatch_library_SOURCES} ${feature_set} ${fastsimd_create_dispatch_library_RELAXED}) 157 | string(APPEND feature_set_list ",FastSIMD::FeatureSet::${feature_set}\n" ) 158 | list(APPEND feature_set_list_debug "${feature_set}") 159 | endif() 160 | endif() 161 | endforeach() 162 | 163 | # Create array of compiled feature sets for lookup in FastSIMD::New() 164 | configure_file("${FastSIMD_SOURCE_DIR}/dispatch/cmake/simd_lib_config.h.in" "${simd_library_source_dir}/include/FastSIMD/${simd_library_name}_config.h") 165 | 166 | message(STATUS "FastSIMD: Created dispatch library \"${simd_library_name}\" with Feature Sets${relaxed_log_msg}: ${feature_set_list_debug}") 167 | 168 | endfunction() 169 | -------------------------------------------------------------------------------- /dispatch/cmake/feature_set_source.cpp.in: -------------------------------------------------------------------------------- 1 | #define FASTSIMD_MAX_FEATURE_SET ${feature_set} 2 | #include 3 | 4 | #if ${feature_arch_detect} 5 | #include 6 | 7 | #include 
"${FastSIMD_SOURCE_DIR}/dispatch/impl/DispatchClassImpl.h" 8 | #include "${simd_inl_full}" 9 | #endif 10 | -------------------------------------------------------------------------------- /dispatch/cmake/simd_lib_config.h.in: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | 6 | namespace FastSIMD 7 | { 8 | namespace ${simd_library_name} 9 | { 10 | using CompiledFeatureSets = FeatureSetList<0 11 | ${feature_set_list}>; 12 | } 13 | } 14 | -------------------------------------------------------------------------------- /dispatch/impl/DispatchClassImpl.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | 5 | #include 6 | 7 | namespace FastSIMD 8 | { 9 | template 10 | class DispatchClass; 11 | 12 | template 13 | struct DispatchClassFactory 14 | { 15 | template 16 | FS_NEVERINLINE static T* New( FastSIMD::MemoryAllocator allocator ); 17 | }; 18 | 19 | // Make sure we only instantiate DispatchClass for the current feature set 20 | template<> 21 | template 22 | FS_NEVERINLINE T* DispatchClassFactory::New( FastSIMD::MemoryAllocator allocator ) 23 | { 24 | constexpr auto SIMD = FeatureSetDefault(); 25 | 26 | if( allocator ) 27 | { 28 | void* alloc = allocator( sizeof( DispatchClass ), alignof( DispatchClass ) ); 29 | 30 | return new( alloc ) DispatchClass; 31 | } 32 | 33 | return new DispatchClass; 34 | } 35 | 36 | 37 | template 38 | class RegisterDispatchClass 39 | { 40 | static_assert( SIMD == FeatureSetDefault() ); 41 | 42 | // Never called, used to instantiate DispatchClassFactory::New() 43 | static auto Instantiate() 44 | { 45 | return &FastSIMD::DispatchClassFactory::template New; 46 | } 47 | }; 48 | 49 | // Compile FastSIMD::NewDispatchClass in minimum feature set compilation unit to avoid illegal instructions 50 | template 51 | class RegisterDispatchClass 52 | { 53 | // Never called, used to instantiate 
NewDispatchClass() 54 | static auto Instantiate() 55 | { 56 | return &FastSIMD::NewDispatchClass; 57 | } 58 | }; 59 | 60 | 61 | template 62 | FS_FORCEINLINE static T* DispatchClassFactoryIterator( FeatureSet maxFeatureSet, MemoryAllocator allocator ) 63 | { 64 | if( maxFeatureSet < SIMD ) 65 | { 66 | return nullptr; 67 | } 68 | 69 | constexpr auto NextCompiled = FastSIMD::FASTSIMD_LIBRARY_NAME::CompiledFeatureSets::NextAfter; 70 | 71 | if constexpr( NextCompiled != FeatureSet::Max ) 72 | { 73 | if( maxFeatureSet >= NextCompiled ) 74 | { 75 | return DispatchClassFactoryIterator( maxFeatureSet, allocator ); 76 | } 77 | } 78 | 79 | return DispatchClassFactory::template New( allocator ); 80 | } 81 | 82 | template 83 | FASTSIMD_API T* NewDispatchClass( FeatureSet maxFeatureSet, MemoryAllocator allocator ) 84 | { 85 | if( maxFeatureSet == FeatureSet::Max ) 86 | { 87 | maxFeatureSet = DetectCpuMaxFeatureSet(); 88 | } 89 | 90 | return DispatchClassFactoryIterator( maxFeatureSet, allocator ); 91 | } 92 | 93 | 94 | } // namespace FastSIMD 95 | -------------------------------------------------------------------------------- /examples/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_subdirectory(dispatch_library) 2 | add_subdirectory(header_only) 3 | -------------------------------------------------------------------------------- /examples/dispatch_library/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | 2 | fastsimd_create_dispatch_library(simd_example_dispatch_library SOURCES "example.inl") 3 | 4 | add_executable(example_dispatch_library "main.cpp") 5 | target_link_libraries(example_dispatch_library PRIVATE FastSIMD simd_example_dispatch_library) -------------------------------------------------------------------------------- /examples/dispatch_library/example.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 
#include 3 | #include 4 | 5 | class ExampleSIMD 6 | { 7 | public: 8 | virtual ~ExampleSIMD() = default; 9 | 10 | virtual void SimpleData( const float* in, float* out, std::size_t dataSize, float multiplier, float cutoff ) = 0; 11 | }; -------------------------------------------------------------------------------- /examples/dispatch_library/example.inl: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "example.h" 3 | 4 | template 5 | class FastSIMD::DispatchClass : public ExampleSIMD 6 | { 7 | void SimpleData( const float* in, float* out, std::size_t dataSize, float multiplier, float cutoff ) override 8 | { 9 | constexpr std::size_t N = 32; 10 | 11 | if constexpr( (SIMD & FastSIMD::FeatureFlag::AVX512_F) ) 12 | { 13 | auto vMultiplier = FS::f32( multiplier ); 14 | auto test = FS::NativeExec>( FS_BIND_INTRINSIC( _mm512_mul_ps ), vMultiplier, FS::LoadIncremented() ); 15 | 16 | FS::Store( out, test ); 17 | } 18 | 19 | //auto vInt = FS::i32( 1 ) + 2_i32; 20 | 21 | auto vMultiplier = FS::f32( multiplier ); 22 | auto vCutoff = FS::f32( cutoff ); 23 | 24 | for( std::size_t i = 0; i < dataSize; i += N ) 25 | { 26 | FS::f32 data = FS::Load( in + i ); 27 | 28 | data = FS::Select( data < vCutoff, data * vMultiplier, data ); 29 | 30 | FS::Store( out + i, data ); 31 | } 32 | } 33 | }; 34 | 35 | template class FastSIMD::RegisterDispatchClass; -------------------------------------------------------------------------------- /examples/dispatch_library/main.cpp: -------------------------------------------------------------------------------- 1 | #include "example.h" 2 | #include 3 | #include 4 | 5 | int main() 6 | { 7 | FastSIMD::FeatureSet featureSet = FastSIMD::DetectCpuMaxFeatureSet(); 8 | std::cout << FastSIMD::GetFeatureSetString( featureSet ) << std::endl; 9 | 10 | std::vector data; 11 | for( int i = 0; i < 32; i++ ) 12 | { 13 | data.push_back( (float)i ); 14 | } 15 | std::vector out( data.size() ); 16 | 17 | 
ExampleSIMD* simd = FastSIMD::NewDispatchClass(); 18 | 19 | simd->SimpleData( data.data(), out.data(), data.size(), 10, 17 ); 20 | 21 | for( std::size_t i = 0; i < data.size(); i++ ) 22 | { 23 | std::cout << data[i] << "\t: " << out[i] << std::endl; 24 | } 25 | 26 | return 0; 27 | } 28 | -------------------------------------------------------------------------------- /examples/header_only/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | 2 | add_executable(example_header_only "main.cpp") 3 | target_link_libraries(example_header_only PRIVATE FastSIMD) 4 | -------------------------------------------------------------------------------- /examples/header_only/main.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | int main() 4 | { 5 | auto distSq = FS::LoadIncremented>(); 6 | 7 | auto invSqrt = FS::InvSqrt( distSq ); 8 | 9 | auto dist = invSqrt * distSq; 10 | 11 | auto out = FS::Masked( invSqrt != FS::f32<4>( INFINITY ), dist ); 12 | 13 | return FS::Extract0( FS::Convert( out ) ); 14 | } 15 | -------------------------------------------------------------------------------- /include/FastSIMD/DispatchClass.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "Utility/FeatureEnums.h" 3 | 4 | #include 5 | 6 | namespace FastSIMD 7 | { 8 | using MemoryAllocator = void* (*)( std::size_t size, std::size_t align ); 9 | 10 | template 11 | FASTSIMD_API T* NewDispatchClass( FeatureSet maxFeatureSet = FeatureSet::Max, MemoryAllocator allocator = nullptr ); 12 | } 13 | -------------------------------------------------------------------------------- /include/FastSIMD/ToolSet.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "Utility/ArchDetect.h" 3 | #include "Utility/FeatureEnums.h" 4 | 5 | namespace FastSIMD 6 | { 7 | static_assert( 
FASTSIMD_DEFAULT_FEATURE_VALUE() <= FASTSIMD_MAX_FEATURE_VALUE(), 8 | "Default feature set must be <= to max feature set" ); 9 | 10 | template 11 | static constexpr FeatureSet FeatureSetDefault() 12 | { 13 | return SIMD; 14 | } 15 | 16 | template 17 | static constexpr bool IsRelaxed() 18 | { 19 | return RELAXED; 20 | } 21 | } // namespace FastSIMD 22 | 23 | #include "ToolSet/Generic/Functions.h" 24 | 25 | #include "ToolSet/Generic/Scalar.h" 26 | 27 | #if FASTSIMD_CURRENT_ARCH_IS( X86 ) 28 | #include "ToolSet/x86/x86.h" 29 | #elif FASTSIMD_CURRENT_ARCH_IS( ARM ) 30 | #include "ToolSet/ARM/ARM.h" 31 | #elif FASTSIMD_CURRENT_ARCH_IS( WASM ) 32 | #include "ToolSet/WASM/WASM.h" 33 | #endif 34 | 35 | -------------------------------------------------------------------------------- /include/FastSIMD/ToolSet/ARM/128/f32x4.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace FS 6 | { 7 | template 8 | struct Register> 9 | { 10 | static constexpr size_t ElementCount = 4; 11 | static constexpr auto FeatureFlags = SIMD; 12 | 13 | using NativeType = float32x4_t; 14 | using ElementType = float; 15 | using MaskType = m32; 16 | using MaskTypeArg = m32; 17 | 18 | FS_FORCEINLINE Register() = default; 19 | FS_FORCEINLINE Register( NativeType v ) : native( v ) { } 20 | FS_FORCEINLINE Register( float v ) : native( vdupq_n_f32( v ) ) { } 21 | 22 | FS_FORCEINLINE NativeType GetNative() const 23 | { 24 | return native; 25 | } 26 | 27 | FS_FORCEINLINE Register& operator +=( const Register& rhs ) 28 | { 29 | native = vaddq_f32( native, rhs.native ); 30 | return *this; 31 | } 32 | 33 | FS_FORCEINLINE Register& operator -=( const Register& rhs ) 34 | { 35 | native = vsubq_f32( native, rhs.native ); 36 | return *this; 37 | } 38 | 39 | FS_FORCEINLINE Register& operator *=( const Register& rhs ) 40 | { 41 | native = vmulq_f32( native, rhs.native ); 42 | return *this; 43 | } 44 | 45 | FS_FORCEINLINE Register& 
operator /=( const Register& rhs ) 46 | { 47 | if constexpr( SIMD & FastSIMD::FeatureFlag::AARCH64 ) 48 | { 49 | native = vdivq_f32( native, rhs.native ); 50 | } 51 | else 52 | { 53 | float32x4_t reciprocal = vrecpeq_f32( rhs.native ); 54 | // Additional Newton-Raphson iteration for accuracy 55 | reciprocal = vmulq_f32( vrecpsq_f32( rhs.native, reciprocal ), reciprocal ); 56 | reciprocal = vmulq_f32( vrecpsq_f32( rhs.native, reciprocal ), reciprocal ); 57 | 58 | native = vmulq_f32( native, reciprocal ); 59 | } 60 | 61 | return *this; 62 | } 63 | 64 | FS_FORCEINLINE Register& operator &=( const Register& rhs ) 65 | { 66 | native = vreinterpretq_f32_u32( vandq_u32( vreinterpretq_u32_f32( native ), vreinterpretq_u32_f32( rhs.native ) ) ); 67 | return *this; 68 | } 69 | 70 | FS_FORCEINLINE Register& operator |=( const Register& rhs ) 71 | { 72 | native = vreinterpretq_f32_u32( vorrq_u32( vreinterpretq_u32_f32( native ), vreinterpretq_u32_f32( rhs.native ) ) ); 73 | return *this; 74 | } 75 | 76 | FS_FORCEINLINE Register& operator ^=( const Register& rhs ) 77 | { 78 | native = vreinterpretq_f32_u32( veorq_u32( vreinterpretq_u32_f32( native ), vreinterpretq_u32_f32( rhs.native ) ) ); 79 | return *this; 80 | } 81 | 82 | FS_FORCEINLINE Register operator~() const 83 | { 84 | return vreinterpretq_f32_u32( vmvnq_u32( vreinterpretq_u32_f32( native ) ) ); 85 | } 86 | 87 | FS_FORCEINLINE Register operator-() const 88 | { 89 | return vnegq_f32( native ); 90 | } 91 | 92 | 93 | FS_FORCEINLINE MaskType operator ==( const Register& rhs ) const 94 | { 95 | return vceqq_f32( native, rhs.native ); 96 | } 97 | 98 | FS_FORCEINLINE MaskType operator !=( const Register& rhs ) const 99 | { 100 | return ~( *this == rhs ); 101 | } 102 | 103 | FS_FORCEINLINE MaskType operator >=( const Register& rhs ) const 104 | { 105 | return vcgeq_f32( native, rhs.native ); 106 | } 107 | 108 | FS_FORCEINLINE MaskType operator <=( const Register& rhs ) const 109 | { 110 | return vcleq_f32( native, rhs.native
); 111 | } 112 | 113 | FS_FORCEINLINE MaskType operator >( const Register& rhs ) const 114 | { 115 | return vcgtq_f32( native, rhs.native ); 116 | } 117 | 118 | FS_FORCEINLINE MaskType operator <( const Register& rhs ) const 119 | { 120 | return vcltq_f32( native, rhs.native ); 121 | } 122 | 123 | NativeType native; 124 | }; 125 | 126 | 127 | template>> 128 | FS_FORCEINLINE f32<4, SIMD> Load( TypeWrapper ptr ) 129 | { 130 | return vld1q_f32( ptr.value ); 131 | } 132 | 133 | template>> 134 | FS_FORCEINLINE void Store( typename f32<4, SIMD>::ElementType* ptr, const f32<4, SIMD>& a ) 135 | { 136 | vst1q_f32( ptr, a.native ); 137 | } 138 | 139 | template>> 140 | FS_FORCEINLINE float Extract0( const f32<4, SIMD>& a ) 141 | { 142 | return vgetq_lane_f32( a.native, 0 ); 143 | } 144 | 145 | template>> 146 | FS_FORCEINLINE f32<4, SIMD> Abs( const f32<4, SIMD>& a ) 147 | { 148 | return vabsq_f32( a.native ); 149 | } 150 | 151 | template>> 152 | FS_FORCEINLINE f32<4, SIMD> Round( const f32<4, SIMD>& a ) 153 | { 154 | /*if constexpr( SIMD & FastSIMD::FeatureFlag::SSE41 ) 155 | { 156 | return _mm_round_ps( a.native, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC ); 157 | } 158 | else 159 | { 160 | __m128i aInt = _mm_cvtps_epi32( a.native ); 161 | __m128 aIntF = _mm_cvtepi32_ps( aInt ); 162 | 163 | return _mm_xor_ps( aIntF, _mm_and_ps( _mm_castsi128_ps( _mm_cmpeq_epi32( aInt, _mm_set1_epi32( (-2147483647 - 1) ) ) ), _mm_xor_ps( a.native, aIntF ) ) ); 164 | }*/ 165 | return vrndnq_f32( a.native ); 166 | } 167 | 168 | template>> 169 | FS_FORCEINLINE f32<4, SIMD> Floor( const f32<4, SIMD>& a ) 170 | { 171 | return vrndmq_f32( a.native ); 172 | } 173 | 174 | template>> 175 | FS_FORCEINLINE f32<4, SIMD> Ceil( const f32<4, SIMD>& a ) 176 | { 177 | return vrndpq_f32( a.native ); 178 | } 179 | 180 | template>> 181 | FS_FORCEINLINE f32<4, SIMD> Trunc( const f32<4, SIMD>& a ) 182 | { 183 | return vrndq_f32( a.native ); 184 | } 185 | 186 | template>> 187 | FS_FORCEINLINE f32<4, SIMD> Min( 
const f32<4, SIMD>& a, const f32<4, SIMD>& b ) 188 | { 189 | return vminq_f32( a.native, b.native ); 190 | } 191 | 192 | template>> 193 | FS_FORCEINLINE f32<4, SIMD> Max( const f32<4, SIMD>& a, const f32<4, SIMD>& b ) 194 | { 195 | return vmaxq_f32( a.native, b.native ); 196 | } 197 | 198 | template>> 199 | FS_FORCEINLINE f32<4, SIMD> Select( const typename f32<4, SIMD>::MaskTypeArg& mask, const f32<4, SIMD>& ifTrue, const f32<4, SIMD>& ifFalse ) 200 | { 201 | return vbslq_f32( mask.native, ifTrue.native, ifFalse.native ); 202 | } 203 | 204 | 205 | template>> 206 | FS_FORCEINLINE f32<4, SIMD> Masked( const typename f32<4, SIMD>::MaskTypeArg& mask, const f32<4, SIMD>& a ) 207 | { 208 | return vreinterpretq_f32_u32( vandq_u32( vreinterpretq_u32_f32( a.native ), mask.native ) ); 209 | } 210 | 211 | template>> 212 | FS_FORCEINLINE f32<4, SIMD> MaskedIncrement( const typename f32<4, SIMD>::MaskTypeArg& mask, const f32<4, SIMD>& a ) 213 | { 214 | return vsubq_f32( a.native, vcvtq_f32_s32( vreinterpretq_s32_u32( mask.native ) ) ); 215 | } 216 | 217 | template>> 218 | FS_FORCEINLINE f32<4, SIMD> MaskedDecrement( const typename f32<4, SIMD>::MaskTypeArg& mask, const f32<4, SIMD>& a ) 219 | { 220 | return vaddq_f32( a.native, vcvtq_f32_s32( vreinterpretq_s32_u32( mask.native ) ) ); 221 | } 222 | 223 | 224 | template>, typename = EnableIfRelaxed> 225 | FS_FORCEINLINE f32<4, SIMD> Reciprocal( const f32<4, SIMD>& a ) 226 | { 227 | float32x4_t recip = vrecpeq_f32( a.native ); 228 | return vmulq_f32( recip, vrecpsq_f32( recip, a.native ) ); 229 | } 230 | 231 | template>, typename = EnableIfRelaxed> 232 | FS_FORCEINLINE f32<4, SIMD> InvSqrt( const f32<4, SIMD>& a ) 233 | { 234 | float32x4_t rsqrt = vrsqrteq_f32( a.native ); 235 | return vmulq_f32( rsqrt, vrsqrtsq_f32( vmulq_f32( a.native, rsqrt ), rsqrt ) ); 236 | } 237 | 238 | template>> 239 | FS_FORCEINLINE f32<4, SIMD> Sqrt( const f32<4, SIMD>& a ) 240 | { 241 | return vsqrtq_f32( a.native ); 242 | } 243 | 244 | template>, 
typename = EnableIfRelaxed> 245 | FS_FORCEINLINE f32<4, SIMD> FMulAdd( const f32<4, SIMD>& a, const f32<4, SIMD>& b, const f32<4, SIMD>& c ) 246 | { 247 | return vmlaq_f32( b.native, c.native, a.native ); 248 | } 249 | 250 | template>, typename = EnableIfRelaxed> 251 | FS_FORCEINLINE f32<4, SIMD> FNMulAdd( const f32<4, SIMD>& a, const f32<4, SIMD>& b, const f32<4, SIMD>& c ) 252 | { 253 | // Negated fused multiply-add: must use the multiply-SUBTRACT intrinsic, 254 | // otherwise this is byte-identical to FMulAdd above and the negation is lost. 255 | return vmlsq_f32( b.native, c.native, a.native ); 256 | } 257 | } 258 | -------------------------------------------------------------------------------- /include/FastSIMD/ToolSet/ARM/128/i32x4.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace FS 6 | { 7 | template 8 | struct Register> 9 | { 10 | static constexpr size_t ElementCount = 4; 11 | static constexpr auto FeatureFlags = SIMD; 12 | 13 | using NativeType = int32x4_t; 14 | using ElementType = std::int32_t; 15 | using MaskType = m32; 16 | using MaskTypeArg = m32; 17 | 18 | FS_FORCEINLINE Register() = default; 19 | FS_FORCEINLINE Register( NativeType v ) : native( v ) { } 20 | FS_FORCEINLINE Register( std::int32_t v ) : native( vdupq_n_s32( v ) ) { } 21 | 22 | FS_FORCEINLINE NativeType GetNative() const 23 | { 24 | return native; 25 | } 26 | 27 | FS_FORCEINLINE Register& operator +=( const Register& rhs ) 28 | { 29 | native = vaddq_s32( native, rhs.native ); 30 | return *this; 31 | } 32 | 33 | FS_FORCEINLINE Register& operator -=( const Register& rhs ) 34 | { 35 | native = vsubq_s32( native, rhs.native ); 36 | return *this; 37 | } 38 | 39 | FS_FORCEINLINE Register& operator *=( const Register& rhs ) 40 | { 41 | native = vmulq_s32( native, rhs.native ); 42 | return *this; 43 | } 44 | 45 | FS_FORCEINLINE Register& operator &=( const Register& rhs ) 46 | { 47 | native = vandq_s32( native, rhs.native ); 48 | return *this; 49 | } 50 | 51 | FS_FORCEINLINE Register& operator |=( const Register& rhs ) 52 | { 53 | native = vorrq_s32( native, rhs.native
); 54 | return *this; 55 | } 56 | 57 | FS_FORCEINLINE Register& operator ^=( const Register& rhs ) 58 | { 59 | native = veorq_s32( native, rhs.native ); 60 | return *this; 61 | } 62 | 63 | FS_FORCEINLINE Register& operator >>=( int rhs ) 64 | { 65 | native = vshlq_s32( native, vdupq_n_s32( -rhs ) ); 66 | return *this; 67 | } 68 | 69 | FS_FORCEINLINE Register& operator <<=( int rhs ) 70 | { 71 | native = vshlq_s32( native, vdupq_n_s32( rhs ) ); 72 | return *this; 73 | } 74 | 75 | FS_FORCEINLINE Register operator ~() const 76 | { 77 | return vmvnq_s32( native ); 78 | } 79 | 80 | FS_FORCEINLINE Register operator -() const 81 | { 82 | return vnegq_s32( native ); 83 | } 84 | 85 | 86 | FS_FORCEINLINE MaskType operator ==( const Register& rhs ) const 87 | { 88 | return vceqq_s32( native, rhs.native ); 89 | } 90 | 91 | FS_FORCEINLINE MaskType operator !=( const Register& rhs ) const 92 | { 93 | return ~(*this == rhs); 94 | } 95 | 96 | FS_FORCEINLINE MaskType operator >=( const Register& rhs ) const 97 | { 98 | return vcgeq_s32( native, rhs.native ); 99 | } 100 | 101 | FS_FORCEINLINE MaskType operator <=( const Register& rhs ) const 102 | { 103 | return vcleq_s32( native, rhs.native ); 104 | } 105 | 106 | FS_FORCEINLINE MaskType operator >( const Register& rhs ) const 107 | { 108 | return vcgtq_s32( native, rhs.native ); 109 | } 110 | 111 | FS_FORCEINLINE MaskType operator <( const Register& rhs ) const 112 | { 113 | return vcltq_s32( native, rhs.native ); 114 | } 115 | 116 | NativeType native; 117 | }; 118 | 119 | 120 | template>> 121 | FS_FORCEINLINE i32<4, SIMD> Load( TypeWrapper ptr ) 122 | { 123 | return vld1q_s32( ptr.value ); 124 | } 125 | 126 | template>> 127 | FS_FORCEINLINE void Store( typename i32<4, SIMD>::ElementType* ptr, const i32<4, SIMD>& a ) 128 | { 129 | vst1q_s32( ptr, a.native ); 130 | } 131 | 132 | template>> 133 | FS_FORCEINLINE int32_t Extract0( const i32<4, SIMD>& a ) 134 | { 135 | return vgetq_lane_s32( a.native, 0 ); 136 | } 137 | 138 | template>> 
139 | FS_FORCEINLINE i32<4, SIMD> Abs( const i32<4, SIMD>& a ) 140 | { 141 | return vabsq_s32( a.native ); 142 | } 143 | 144 | template>> 145 | FS_FORCEINLINE i32<4, SIMD> Min( const i32<4, SIMD>& a, const i32<4, SIMD>& b ) 146 | { 147 | return vminq_s32( a.native, b.native ); 148 | } 149 | 150 | template>> 151 | FS_FORCEINLINE i32<4, SIMD> Max( const i32<4, SIMD>& a, const i32<4, SIMD>& b ) 152 | { 153 | return vmaxq_s32( a.native, b.native ); 154 | } 155 | 156 | template>> 157 | FS_FORCEINLINE i32<4, SIMD> Select( const typename i32<4, SIMD>::MaskTypeArg& mask, const i32<4, SIMD>& ifTrue, const i32<4, SIMD>& ifFalse ) 158 | { 159 | return vbslq_s32( mask.native, ifTrue.native, ifFalse.native ); 160 | } 161 | 162 | template>> 163 | FS_FORCEINLINE i32<4, SIMD> BitShiftRightZeroExtend( const i32<4, SIMD>& a, int b ) 164 | { 165 | return vreinterpretq_s32_u32( vshlq_u32( vreinterpretq_u32_s32( a.native ), vdupq_n_s32( -b ) ) ); 166 | } 167 | 168 | 169 | template>> 170 | FS_FORCEINLINE i32<4, SIMD> Masked( const typename i32<4, SIMD>::MaskTypeArg& mask, const i32<4, SIMD>& a ) 171 | { 172 | return vandq_s32( vreinterpretq_s32_u32( mask.native ), a.native ); 173 | } 174 | 175 | template>> 176 | FS_FORCEINLINE i32<4, SIMD> MaskedIncrement( const typename i32<4, SIMD>::MaskTypeArg& mask, const i32<4, SIMD>& a ) 177 | { 178 | return vsubq_s32( a.native, vreinterpretq_s32_u32( mask.native ) ); 179 | } 180 | 181 | template>> 182 | FS_FORCEINLINE i32<4, SIMD> MaskedDecrement( const typename i32<4, SIMD>::MaskTypeArg& mask, const i32<4, SIMD>& a ) 183 | { 184 | return vaddq_s32( a.native, vreinterpretq_s32_u32( mask.native ) ); 185 | } 186 | } 187 | -------------------------------------------------------------------------------- /include/FastSIMD/ToolSet/ARM/128/m32x4.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace FS 6 | { 7 | namespace impl 8 | { 9 | struct ArmMaskBase32x4 10 | { 11 | 
uint32x4_t native; 12 | }; 13 | } 14 | 15 | template 16 | struct Register, 4, SIMD, std::enable_if_t> 17 | : std::conditional_t, 4, SIMD>> 18 | { 19 | static constexpr size_t ElementCount = 4; 20 | static constexpr auto FeatureFlags = SIMD; 21 | 22 | using NativeType = decltype(ArmMaskBase32x4::native); 23 | using ElementType = Mask<32, OPTIMISE_FLOAT>; 24 | using MaskType = Register; 25 | using MaskTypeArg = Register; 26 | 27 | FS_FORCEINLINE Register() = default; 28 | FS_FORCEINLINE Register( NativeType v ) { this->native = v; } 29 | 30 | FS_FORCEINLINE NativeType GetNative() const 31 | { 32 | return this->native; 33 | } 34 | 35 | FS_FORCEINLINE Register& operator &=( const Register& rhs ) 36 | { 37 | this->native = vandq_u32( this->native, rhs.native ); 38 | return *this; 39 | } 40 | 41 | FS_FORCEINLINE Register& operator |=( const Register& rhs ) 42 | { 43 | this->native = vorrq_u32( this->native, rhs.native ); 44 | return *this; 45 | } 46 | 47 | FS_FORCEINLINE Register& operator ^=( const Register& rhs ) 48 | { 49 | this->native = veorq_u32( this->native, rhs.native ); 50 | return *this; 51 | } 52 | 53 | FS_FORCEINLINE Register operator ~() const 54 | { 55 | return vmvnq_u32( this->native ); 56 | } 57 | }; 58 | 59 | template, 4, SIMD>>> 60 | FS_FORCEINLINE bool AnyMask( const Register, 4, SIMD>& a ) 61 | { 62 | if constexpr( SIMD & FastSIMD::FeatureFlag::AARCH64 ) 63 | { 64 | return vmaxvq_u32( a.native ); 65 | } 66 | else 67 | { 68 | uint32x2_t tmp = vorr_u32( vget_low_u32( a.native ), vget_high_u32( a.native ) ); 69 | return (bool)vget_lane_u32( vpmax_u32( tmp, tmp ), 0 ); 70 | } 71 | } 72 | 73 | template, 4, SIMD>>> 74 | FS_FORCEINLINE BitStorage<4> BitMask( const Register, 4, SIMD>& a ) 75 | { 76 | if constexpr( SIMD & FastSIMD::FeatureFlag::AARCH64 ) 77 | { 78 | static const int32_t shift[4] = { 0, 1, 2, 3 }; 79 | uint32x4_t tmp = vshrq_n_u32( a.native, 31 ); 80 | return vaddvq_u32( vshlq_u32( tmp, vld1q_s32( shift ) ) ); 81 | } 82 | else 83 | { 84 | // 
Shift out everything but the sign bits with a 32-bit unsigned shift 85 | // right. 86 | uint64x2_t high_bits = vreinterpretq_u64_u32( vshrq_n_u32( a.native, 31 ) ); 87 | // Merge the two pairs together with a 64-bit unsigned shift right + add. 88 | uint8x16_t paired = 89 | vreinterpretq_u8_u64( vsraq_n_u64( high_bits, high_bits, 31 ) ); 90 | // Extract the result. 91 | return vgetq_lane_u8( paired, 0 ) | ( vgetq_lane_u8( paired, 8 ) << 2 ); 92 | } 93 | } 94 | 95 | } -------------------------------------------------------------------------------- /include/FastSIMD/ToolSet/ARM/ARM.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #if FASTSIMD_MAX_FEATURE_VALUE() >= FASTSIMD_FEATURE_VALUE( NEON ) 4 | #include "NEON.h" 5 | #endif 6 | -------------------------------------------------------------------------------- /include/FastSIMD/ToolSet/ARM/NEON.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | 4 | #include "128/f32x4.h" 5 | #include "128/i32x4.h" 6 | #include "128/m32x4.h" 7 | 8 | namespace FS 9 | { 10 | template>> 11 | FS_FORCEINLINE i32<4, SIMD> Convert( const f32<4, SIMD>& a, TypeDummy ) 12 | { 13 | return vcvtq_s32_f32( Round( a ).native ); 14 | } 15 | 16 | template>> 17 | FS_FORCEINLINE f32<4, SIMD> Convert( const i32<4, SIMD>& a, TypeDummy ) 18 | { 19 | return vcvtq_f32_s32( a.native ); 20 | } 21 | 22 | template>> 23 | FS_FORCEINLINE Register Cast( const Register& a, TypeDummy ) 24 | { 25 | if constexpr( 26 | std::is_same_v::NativeType, float32x4_t> && 27 | std::is_same_v::NativeType, int32x4_t> ) 28 | { 29 | return vreinterpretq_s32_f32( a.GetNative() ); 30 | } 31 | else if constexpr( 32 | std::is_same_v::NativeType, int32x4_t> && 33 | std::is_same_v::NativeType, float32x4_t> ) 34 | { 35 | return vreinterpretq_f32_s32( a.GetNative() ); 36 | } 37 | else 38 | { 39 | return a.GetNative(); 40 | } 41 | } 42 | } 43 | 
-------------------------------------------------------------------------------- /include/FastSIMD/ToolSet/Generic/Register.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | #include 9 | #include 10 | 11 | #ifdef _MSC_VER 12 | #define FS_FORCEINLINE __forceinline 13 | #define FS_NEVERINLINE __declspec(noinline) 14 | #else 15 | #define FS_FORCEINLINE __attribute__( ( always_inline ) ) inline 16 | #define FS_NEVERINLINE __attribute__( ( noinline ) ) 17 | #endif 18 | 19 | #if FASTSIMD_CURRENT_ARCH_IS( WASM ) 20 | #define FS_VECTORCALL 21 | #elif defined( __clang__ ) 22 | #define FS_VECTORCALL __regcall 23 | #elif defined( _MSC_VER ) 24 | #define FS_VECTORCALL __vectorcall 25 | #else 26 | #define FS_VECTORCALL 27 | #endif 28 | 29 | namespace FS 30 | { 31 | template 32 | struct Register 33 | { 34 | static_assert( SIMD != FastSIMD::FeatureSet::Invalid, "Invalid FeatureSet" ); 35 | static_assert( N > 1, "Unknown Vector Type" ); 36 | static_assert( ( N & ( N - 1 ) ) == 0, "Vector size must be power of 2" ); 37 | 38 | static constexpr std::size_t ElementCount = N; 39 | static constexpr auto FeatureFlags = SIMD; 40 | 41 | using DoubledType = Register; 42 | using ElementType = T; 43 | using MaskType = Register; 44 | using MaskTypeArg = Register; 45 | 46 | Register() = default; 47 | Register( T v ) : v0( v ), v1( v ) { } 48 | Register( const DoubledType& v, const DoubledType& u ) : v0( v ), v1( u ) { } 49 | 50 | // Conversion for Mask -> Mask 51 | template 52 | FS_FORCEINLINE operator Register() const 53 | { 54 | return Register{ v0, v1 }; 55 | } 56 | 57 | FS_FORCEINLINE Register& operator +=( const Register& rhs ) 58 | { 59 | v0 += rhs.v0; 60 | v1 += rhs.v1; 61 | return *this; 62 | } 63 | 64 | FS_FORCEINLINE Register& operator -=( const Register& rhs ) 65 | { 66 | v0 -= rhs.v0; 67 | v1 -= rhs.v1; 68 | return *this; 69 | } 70 | 71 | FS_FORCEINLINE Register& 
operator *=( const Register& rhs ) 72 | { 73 | v0 *= rhs.v0; 74 | v1 *= rhs.v1; 75 | return *this; 76 | } 77 | 78 | FS_FORCEINLINE Register& operator /=( const Register& rhs ) 79 | { 80 | v0 /= rhs.v0; 81 | v1 /= rhs.v1; 82 | return *this; 83 | } 84 | 85 | FS_FORCEINLINE Register& operator &=( const Register& rhs ) 86 | { 87 | v0 &= rhs.v0; 88 | v1 &= rhs.v1; 89 | return *this; 90 | } 91 | 92 | FS_FORCEINLINE Register& operator |=( const Register& rhs ) 93 | { 94 | v0 |= rhs.v0; 95 | v1 |= rhs.v1; 96 | return *this; 97 | } 98 | 99 | FS_FORCEINLINE Register& operator ^=( const Register& rhs ) 100 | { 101 | v0 ^= rhs.v0; 102 | v1 ^= rhs.v1; 103 | return *this; 104 | } 105 | 106 | FS_FORCEINLINE Register& operator >>=( const Register& rhs ) 107 | { 108 | v0 >>= rhs.v0; 109 | v1 >>= rhs.v1; 110 | return *this; 111 | } 112 | 113 | FS_FORCEINLINE Register& operator <<=( const Register& rhs ) 114 | { 115 | v0 <<= rhs.v0; 116 | v1 <<= rhs.v1; 117 | return *this; 118 | } 119 | 120 | FS_FORCEINLINE Register& operator >>=( int rhs ) 121 | { 122 | v0 >>= rhs; 123 | v1 >>= rhs; 124 | return *this; 125 | } 126 | 127 | FS_FORCEINLINE Register& operator <<=( int rhs ) 128 | { 129 | v0 <<= rhs; 130 | v1 <<= rhs; 131 | return *this; 132 | } 133 | 134 | FS_FORCEINLINE Register operator -() const 135 | { 136 | return Register{ -this->v0, -this->v1 }; 137 | } 138 | 139 | FS_FORCEINLINE Register operator ~() const 140 | { 141 | return Register{ ~this->v0, ~this->v1 }; 142 | } 143 | 144 | 145 | FS_FORCEINLINE MaskType operator ==( const Register& rhs ) const 146 | { 147 | return MaskType{ v0 == rhs.v0, v1 == rhs.v1 }; 148 | } 149 | 150 | FS_FORCEINLINE MaskType operator !=( const Register& rhs ) const 151 | { 152 | return MaskType{ v0 != rhs.v0, v1 != rhs.v1 }; 153 | } 154 | 155 | FS_FORCEINLINE MaskType operator >=( const Register& rhs ) const 156 | { 157 | return MaskType{ v0 >= rhs.v0, v1 >= rhs.v1 }; 158 | } 159 | 160 | FS_FORCEINLINE MaskType operator <=( const Register& rhs ) const 
161 | { 162 | return MaskType{ v0 <= rhs.v0, v1 <= rhs.v1 }; 163 | } 164 | 165 | FS_FORCEINLINE MaskType operator >( const Register& rhs ) const 166 | { 167 | return MaskType{ v0 > rhs.v0, v1 > rhs.v1 }; 168 | } 169 | 170 | FS_FORCEINLINE MaskType operator <( const Register& rhs ) const 171 | { 172 | return MaskType{ v0 < rhs.v0, v1 < rhs.v1 }; 173 | } 174 | 175 | DoubledType v0, v1; 176 | }; 177 | 178 | template 179 | FS_FORCEINLINE static Register operator +( Register lhs, const Register& rhs ) 180 | { 181 | return lhs += rhs; 182 | } 183 | 184 | template 185 | FS_FORCEINLINE static Register operator -( Register lhs, const Register& rhs ) 186 | { 187 | return lhs -= rhs; 188 | } 189 | 190 | template 191 | FS_FORCEINLINE static Register operator *( Register lhs, const Register& rhs ) 192 | { 193 | return lhs *= rhs; 194 | } 195 | 196 | template 197 | FS_FORCEINLINE static Register operator /( Register lhs, const Register& rhs ) 198 | { 199 | return lhs /= rhs; 200 | } 201 | 202 | template 203 | FS_FORCEINLINE static Register operator &( Register lhs, const Register& rhs ) 204 | { 205 | return lhs &= rhs; 206 | } 207 | 208 | template 209 | FS_FORCEINLINE static Register operator |( Register lhs, const Register& rhs ) 210 | { 211 | return lhs |= rhs; 212 | } 213 | 214 | template 215 | FS_FORCEINLINE static Register operator ^( Register lhs, const Register& rhs ) 216 | { 217 | return lhs ^= rhs; 218 | } 219 | 220 | template 221 | FS_FORCEINLINE static Register operator <<( Register lhs, int rhs ) 222 | { 223 | return lhs <<= rhs; 224 | } 225 | 226 | template 227 | FS_FORCEINLINE static Register operator >>( Register lhs, int rhs ) 228 | { 229 | return lhs >>= rhs; 230 | } 231 | 232 | template 233 | FS_FORCEINLINE static Register operator<<( Register lhs, Register rhs ) 234 | { 235 | return lhs <<= rhs; 236 | } 237 | 238 | template 239 | FS_FORCEINLINE static Register operator>>( Register lhs, Register rhs ) 240 | { 241 | return lhs >>= rhs; 242 | } 243 | 244 | 
template 245 | struct TypeWrapper 246 | { 247 | using Type = T; 248 | using Half = TypeWrapper; 249 | 250 | FS_FORCEINLINE constexpr explicit TypeWrapper( T v ) : value( v ) { } 251 | 252 | FS_FORCEINLINE constexpr Half AsHalf() const 253 | { 254 | return Half( value ); 255 | } 256 | 257 | template 258 | FS_FORCEINLINE constexpr Half AsHalf( U offset ) const 259 | { 260 | return Half( value + offset ); 261 | } 262 | 263 | T value; 264 | }; 265 | 266 | template 267 | struct TypeDummy 268 | { 269 | using Type = T; 270 | }; 271 | 272 | template 273 | struct Mask 274 | { 275 | Mask() = delete; 276 | }; 277 | 278 | template 279 | struct IsNative : std::false_type { }; 280 | 281 | template 282 | struct IsNative> : std::true_type { }; 283 | 284 | template 285 | using EnableIfNative = typename T::NativeType; 286 | 287 | template 288 | using EnableIfNotNative = decltype( T::v0 ); 289 | 290 | template 291 | constexpr bool IsNativeV = IsNative::value; 292 | 293 | template 294 | using EnableIfRelaxed = std::enable_if_t()>; 295 | 296 | template 297 | using EnableIfNotRelaxed = std::enable_if_t()>; 298 | 299 | 300 | template 301 | using i32 = Register; 302 | 303 | template 304 | using f32 = Register; 305 | 306 | template 307 | using m32 = Register, N, SIMD>; 308 | 309 | template 310 | using BitStorage = std::tuple_element_t<( N > 8 ) + ( N > 16 ) + ( N > 32 ), 311 | std::tuple>; 312 | 313 | template 314 | static constexpr std::size_t NativeRegisterCount( FastSIMD::FeatureSet featureSet = FastSIMD::FeatureSetDefault() ); 315 | 316 | template<> 317 | constexpr std::size_t NativeRegisterCount( FastSIMD::FeatureSet featureSet ) 318 | { 319 | if( featureSet & FastSIMD::FeatureFlag::AVX512_F ) 320 | { 321 | return 16; 322 | } 323 | if( featureSet & FastSIMD::FeatureFlag::AVX ) 324 | { 325 | return 8; 326 | } 327 | if( featureSet & (FastSIMD::FeatureFlag::SSE | 328 | FastSIMD::FeatureFlag::NEON | FastSIMD::FeatureFlag::WASM) ) 329 | { 330 | return 4; 331 | } 332 | 333 | return 1; 334 | 
} 335 | 336 | template<> 337 | constexpr std::size_t NativeRegisterCount( FastSIMD::FeatureSet featureSet ) 338 | { 339 | if( featureSet & FastSIMD::FeatureFlag::AVX512_F ) 340 | { 341 | return 16; 342 | } 343 | if( featureSet & FastSIMD::FeatureFlag::AVX2 ) 344 | { 345 | return 8; 346 | } 347 | if( featureSet & (FastSIMD::FeatureFlag::SSE2 | 348 | FastSIMD::FeatureFlag::NEON | FastSIMD::FeatureFlag::WASM) ) 349 | { 350 | return 4; 351 | } 352 | 353 | return 1; 354 | } 355 | 356 | template<> 357 | constexpr std::size_t NativeRegisterCount>( FastSIMD::FeatureSet featureSet ) 358 | { 359 | if( featureSet & FastSIMD::FeatureFlag::AVX512_F ) 360 | { 361 | return 16; 362 | } 363 | if( featureSet & FastSIMD::FeatureFlag::AVX2 ) 364 | { 365 | return 8; 366 | } 367 | if( featureSet & (FastSIMD::FeatureFlag::SSE2 | 368 | FastSIMD::FeatureFlag::NEON | FastSIMD::FeatureFlag::WASM) ) 369 | { 370 | return 4; 371 | } 372 | 373 | return 1; 374 | } 375 | 376 | template 377 | using NativeRegister = Register( SIMD ), SIMD>; 378 | 379 | } // namespace FS 380 | -------------------------------------------------------------------------------- /include/FastSIMD/ToolSet/Generic/Scalar.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "Scalar/i32x1.h" 3 | #include "Scalar/f32x1.h" 4 | #include "Scalar/mNx1.h" 5 | 6 | namespace FS 7 | { 8 | template>> 9 | FS_FORCEINLINE i32<1, SIMD> Convert( const f32<1, SIMD>& a, TypeDummy ) 10 | { 11 | return static_cast( std::rintf( a.GetNative() ) ); 12 | } 13 | 14 | template>> 15 | FS_FORCEINLINE f32<1, SIMD> Convert( const i32<1, SIMD>& a, TypeDummy ) 16 | { 17 | return static_cast( a.GetNative() ); 18 | } 19 | 20 | template>> 21 | FS_FORCEINLINE Register Cast( const Register& a, TypeDummy ) 22 | { 23 | if constexpr( !std::is_same_v::NativeType, typename Register::NativeType> ) 24 | { 25 | union 26 | { 27 | typename Register::NativeType a; 28 | typename Register::NativeType b; 29 | } u; 
30 | 31 | u.a = a.GetNative(); 32 | 33 | return u.b; 34 | } 35 | else 36 | { 37 | return a.GetNative(); 38 | } 39 | } 40 | 41 | 42 | template>> 43 | FS_FORCEINLINE Register Load( TypeWrapper ptr ) 44 | { 45 | return *ptr.value; 46 | } 47 | 48 | template>> 49 | FS_FORCEINLINE void Store( T* ptr, const Register& a ) 50 | { 51 | *ptr = a.GetNative(); 52 | } 53 | 54 | template>> 55 | FS_FORCEINLINE T Extract0( const Register& a ) 56 | { 57 | return a.GetNative(); 58 | } 59 | 60 | template>> 61 | FS_FORCEINLINE Register Abs( const Register& a ) 62 | { 63 | return std::abs( a.GetNative() ); 64 | } 65 | 66 | template>> 67 | FS_FORCEINLINE Register Round( const Register& a ) 68 | { 69 | return std::rint( a.GetNative() ); 70 | } 71 | 72 | template>> 73 | FS_FORCEINLINE Register Ceil( const Register& a ) 74 | { 75 | return std::ceil( a.GetNative() ); 76 | } 77 | 78 | template>> 79 | FS_FORCEINLINE Register Floor( const Register& a ) 80 | { 81 | return std::floor( a.GetNative() ); 82 | } 83 | 84 | template>> 85 | FS_FORCEINLINE Register Trunc( const Register& a ) 86 | { 87 | return std::trunc( a.GetNative() ); 88 | } 89 | 90 | template>, typename = EnableIfNotRelaxed> 91 | FS_FORCEINLINE f32<1, SIMD> Modulus( const Register& a, const Register& b ) 92 | { 93 | return std::fmod( a.GetNative(), b.GetNative() ); 94 | } 95 | 96 | template>> 97 | FS_FORCEINLINE Register Min( const Register& a, const Register& b ) 98 | { 99 | return std::min( a.GetNative(), b.GetNative() ); 100 | } 101 | 102 | template>> 103 | FS_FORCEINLINE Register Max( const Register& a, const Register& b ) 104 | { 105 | return std::max( a.GetNative(), b.GetNative() ); 106 | } 107 | 108 | template>> 109 | FS_FORCEINLINE Register Select( const typename Register::MaskTypeArg& mask, const Register& ifTrue, const Register& ifFalse ) 110 | { 111 | return mask.GetNative() ? 
ifTrue : ifFalse; 112 | } 113 | 114 | 115 | 116 | template>> 117 | FS_FORCEINLINE Register BitwiseAndNot( const Register& a, const Register& b ) 118 | { 119 | return a & ~b; 120 | } 121 | 122 | template>> 123 | FS_FORCEINLINE Register Masked( const typename Register::MaskTypeArg& mask, const Register& a ) 124 | { 125 | return mask.native ? a : 0; 126 | } 127 | } 128 | -------------------------------------------------------------------------------- /include/FastSIMD/ToolSet/Generic/Scalar/f32x1.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | #include 6 | #include 7 | 8 | namespace FS 9 | { 10 | template 11 | struct Register> 12 | { 13 | static constexpr size_t ElementCount = 1; 14 | static constexpr auto FeatureFlags = SIMD; 15 | 16 | using NativeType = float; 17 | using ElementType = float; 18 | using MaskType = m32; 19 | using MaskTypeArg = m32; 20 | 21 | 22 | FS_FORCEINLINE Register() = default; 23 | FS_FORCEINLINE Register( NativeType v ) : native{ v } { } 24 | 25 | FS_FORCEINLINE NativeType GetNative() const 26 | { 27 | return native.f; 28 | } 29 | 30 | FS_FORCEINLINE Register& operator +=( const Register& rhs ) 31 | { 32 | native.f = native.f + rhs.native.f; 33 | return *this; 34 | } 35 | 36 | FS_FORCEINLINE Register& operator -=( const Register& rhs ) 37 | { 38 | native.f = native.f - rhs.native.f; 39 | return *this; 40 | } 41 | 42 | FS_FORCEINLINE Register& operator *=( const Register& rhs ) 43 | { 44 | native.f = native.f * rhs.native.f; 45 | return *this; 46 | } 47 | 48 | FS_FORCEINLINE Register& operator /=( const Register& rhs ) 49 | { 50 | native.f = native.f / rhs.native.f; 51 | return *this; 52 | } 53 | 54 | 55 | FS_FORCEINLINE Register& operator &=( const Register& rhs ) 56 | { 57 | native.i = native.i & rhs.native.i; 58 | return *this; 59 | } 60 | 61 | FS_FORCEINLINE Register& operator |=( const Register& rhs ) 62 | { 63 | native.i = native.i | rhs.native.i; 64 | return 
*this; 65 | } 66 | 67 | FS_FORCEINLINE Register& operator ^=( const Register& rhs ) 68 | { 69 | native.i = native.i ^ rhs.native.i; 70 | return *this; 71 | } 72 | 73 | FS_FORCEINLINE Register operator ~() const 74 | { 75 | Register reg; 76 | reg.native.i = ~native.i; 77 | return reg; 78 | } 79 | 80 | FS_FORCEINLINE Register operator -() const 81 | { 82 | return -native.f; 83 | } 84 | 85 | 86 | FS_FORCEINLINE MaskType operator ==( const Register& rhs ) const 87 | { 88 | return native.f == rhs.native.f; 89 | } 90 | 91 | FS_FORCEINLINE MaskType operator !=( const Register& rhs ) const 92 | { 93 | return native.f != rhs.native.f; 94 | } 95 | 96 | FS_FORCEINLINE MaskType operator >=( const Register& rhs ) const 97 | { 98 | return native.f >= rhs.native.f; 99 | } 100 | 101 | FS_FORCEINLINE MaskType operator <=( const Register& rhs ) const 102 | { 103 | return native.f <= rhs.native.f; 104 | } 105 | 106 | FS_FORCEINLINE MaskType operator >( const Register& rhs ) const 107 | { 108 | return native.f > rhs.native.f; 109 | } 110 | 111 | FS_FORCEINLINE MaskType operator <( const Register& rhs ) const 112 | { 113 | return native.f < rhs.native.f; 114 | } 115 | 116 | union 117 | { 118 | float f; 119 | std::int32_t i; 120 | } 121 | native; 122 | }; 123 | 124 | 125 | template>> 126 | FS_FORCEINLINE f32<1, SIMD> Sqrt( const f32<1, SIMD>& a ) 127 | { 128 | return std::sqrt( a.native.f ); 129 | } 130 | } 131 | -------------------------------------------------------------------------------- /include/FastSIMD/ToolSet/Generic/Scalar/i32x1.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | #include 6 | #include 7 | 8 | namespace FS 9 | { 10 | template 11 | struct Register> 12 | { 13 | static constexpr size_t ElementCount = 1; 14 | static constexpr auto FeatureFlags = SIMD; 15 | 16 | using NativeType = std::int32_t; 17 | using ElementType = std::int32_t; 18 | using MaskType = m32; 19 | using MaskTypeArg = m32; 20 | 
21 | 22 | FS_FORCEINLINE Register() = default; 23 | FS_FORCEINLINE Register( NativeType v ) : native( v ) { } 24 | 25 | FS_FORCEINLINE NativeType GetNative() const 26 | { 27 | return native; 28 | } 29 | 30 | FS_FORCEINLINE Register& operator +=( const Register& rhs ) 31 | { 32 | native = native + rhs.native; 33 | return *this; 34 | } 35 | 36 | FS_FORCEINLINE Register& operator -=( const Register& rhs ) 37 | { 38 | native = native - rhs.native; 39 | return *this; 40 | } 41 | 42 | FS_FORCEINLINE Register& operator *=( const Register& rhs ) 43 | { 44 | native = native * rhs.native; 45 | return *this; 46 | } 47 | 48 | FS_FORCEINLINE Register& operator &=( const Register& rhs ) 49 | { 50 | native = native & rhs.native; 51 | return *this; 52 | } 53 | 54 | FS_FORCEINLINE Register& operator |=( const Register& rhs ) 55 | { 56 | native = native | rhs.native; 57 | return *this; 58 | } 59 | 60 | FS_FORCEINLINE Register& operator ^=( const Register& rhs ) 61 | { 62 | native = native ^ rhs.native; 63 | return *this; 64 | } 65 | 66 | FS_FORCEINLINE Register& operator >>=( int rhs ) 67 | { 68 | native = native >> rhs; 69 | return *this; 70 | } 71 | 72 | FS_FORCEINLINE Register& operator <<=( int rhs ) 73 | { 74 | native = native << rhs; 75 | return *this; 76 | } 77 | 78 | FS_FORCEINLINE Register operator ~() const 79 | { 80 | return ~native; 81 | } 82 | 83 | FS_FORCEINLINE Register operator -() const 84 | { 85 | return -native; 86 | } 87 | 88 | 89 | FS_FORCEINLINE MaskType operator ==( const Register& rhs ) const 90 | { 91 | return native == rhs.native; 92 | } 93 | 94 | FS_FORCEINLINE MaskType operator !=( const Register& rhs ) const 95 | { 96 | return native != rhs.native; 97 | } 98 | 99 | FS_FORCEINLINE MaskType operator >=( const Register& rhs ) const 100 | { 101 | return native >= rhs.native; 102 | } 103 | 104 | FS_FORCEINLINE MaskType operator <=( const Register& rhs ) const 105 | { 106 | return native <= rhs.native; 107 | } 108 | 109 | FS_FORCEINLINE MaskType operator >( 
const Register& rhs ) const 110 | { 111 | return native > rhs.native; 112 | } 113 | 114 | FS_FORCEINLINE MaskType operator <( const Register& rhs ) const 115 | { 116 | return native < rhs.native; 117 | } 118 | 119 | NativeType native; 120 | }; 121 | 122 | 123 | template>> 124 | FS_FORCEINLINE i32<1, SIMD> BitShiftRightZeroExtend( const i32<1, SIMD>& a, int b ) 125 | { 126 | return static_cast( static_cast( a.native ) >> b ); 127 | } 128 | } 129 | -------------------------------------------------------------------------------- /include/FastSIMD/ToolSet/Generic/Scalar/mNx1.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace FS 6 | { 7 | namespace impl 8 | { 9 | struct GenericMaskBase 10 | { 11 | bool native; 12 | }; 13 | } 14 | 15 | template 16 | struct Register, 1, SIMD> 17 | : std::conditional_t, 1, SIMD>> 18 | { 19 | static constexpr size_t ElementCount = 1; 20 | static constexpr auto FeatureFlags = SIMD; 21 | 22 | using NativeType = bool; 23 | using ElementType = Mask; 24 | using MaskType = Register; 25 | using MaskTypeArg = Register; 26 | 27 | FS_FORCEINLINE Register() = default; 28 | FS_FORCEINLINE Register( NativeType v ) { this->native = v; } 29 | 30 | FS_FORCEINLINE NativeType GetNative() const 31 | { 32 | return this->native; 33 | } 34 | 35 | FS_FORCEINLINE Register& operator &=( const Register& rhs ) 36 | { 37 | this->native = this->native && rhs.native; 38 | return *this; 39 | } 40 | 41 | FS_FORCEINLINE Register& operator |=( const Register& rhs ) 42 | { 43 | this->native = this->native || rhs.native; 44 | return *this; 45 | } 46 | 47 | FS_FORCEINLINE Register& operator ^=( const Register& rhs ) 48 | { 49 | this->native = this->native ^ rhs.native; 50 | return *this; 51 | } 52 | 53 | FS_FORCEINLINE Register operator ~() const 54 | { 55 | return !this->native; 56 | } 57 | }; 58 | 59 | template, 1, SIMD>>> 60 | FS_FORCEINLINE Register, 1, SIMD> BitwiseAndNot( const Register, 
1, SIMD>& a, const Register, 1, SIMD>& b ) 61 | { 62 | return a.native && !b.native; 63 | } 64 | 65 | template, 1, SIMD>>> 66 | FS_FORCEINLINE bool AnyMask( const Register, 1, SIMD>& a ) 67 | { 68 | return a.native; 69 | } 70 | 71 | template, 1, SIMD>>> 72 | FS_FORCEINLINE BitStorage<1> BitMask( const Register, 1, SIMD>& a ) 73 | { 74 | return static_cast>( a.native ); 75 | } 76 | } 77 | -------------------------------------------------------------------------------- /include/FastSIMD/ToolSet/WASM/128/f32x4.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace FS 6 | { 7 | template 8 | struct Register> 9 | { 10 | static constexpr size_t ElementCount = 4; 11 | static constexpr auto FeatureFlags = SIMD; 12 | 13 | using NativeType = __f32x4; 14 | using ElementType = float; 15 | using MaskType = m32; 16 | using MaskTypeArg = m32; 17 | 18 | FS_FORCEINLINE Register() = default; 19 | FS_FORCEINLINE Register( NativeType v ) : native( v ) { } 20 | FS_FORCEINLINE Register( float v ) : native( wasm_f32x4_splat( v ) ) { } 21 | 22 | FS_FORCEINLINE NativeType GetNative() const 23 | { 24 | return native; 25 | } 26 | 27 | FS_FORCEINLINE Register& operator +=( const Register& rhs ) 28 | { 29 | native = wasm_f32x4_add( native, rhs.native ); 30 | return *this; 31 | } 32 | 33 | FS_FORCEINLINE Register& operator -=( const Register& rhs ) 34 | { 35 | native = wasm_f32x4_sub( native, rhs.native ); 36 | return *this; 37 | } 38 | 39 | FS_FORCEINLINE Register& operator *=( const Register& rhs ) 40 | { 41 | native = wasm_f32x4_mul( native, rhs.native ); 42 | return *this; 43 | } 44 | 45 | FS_FORCEINLINE Register& operator /=( const Register& rhs ) 46 | { 47 | native = wasm_f32x4_div( native, rhs.native ); 48 | return *this; 49 | } 50 | 51 | FS_FORCEINLINE Register& operator &=( const Register& rhs ) 52 | { 53 | native = wasm_v128_and( native, rhs.native ); 54 | return *this; 55 | } 56 | 57 | FS_FORCEINLINE 
Register& operator |=( const Register& rhs ) 58 | { 59 | native = wasm_v128_or( native, rhs.native ); 60 | return *this; 61 | } 62 | 63 | FS_FORCEINLINE Register& operator ^=( const Register& rhs ) 64 | { 65 | native = wasm_v128_xor( native, rhs.native ); 66 | return *this; 67 | } 68 | 69 | FS_FORCEINLINE Register operator~() const 70 | { 71 | return wasm_v128_not( native ); 72 | } 73 | 74 | FS_FORCEINLINE Register operator-() const 75 | { 76 | return wasm_f32x4_neg( native ); 77 | } 78 | 79 | 80 | FS_FORCEINLINE MaskType operator ==( const Register& rhs ) const 81 | { 82 | return wasm_f32x4_eq( native, rhs.native ); 83 | } 84 | 85 | FS_FORCEINLINE MaskType operator !=( const Register& rhs ) const 86 | { 87 | return ~( *this == rhs ); 88 | } 89 | 90 | FS_FORCEINLINE MaskType operator >=( const Register& rhs ) const 91 | { 92 | return wasm_f32x4_ge( native, rhs.native ); 93 | } 94 | 95 | FS_FORCEINLINE MaskType operator <=( const Register& rhs ) const 96 | { 97 | return wasm_f32x4_le( native, rhs.native ); 98 | } 99 | 100 | FS_FORCEINLINE MaskType operator >( const Register& rhs ) const 101 | { 102 | return wasm_f32x4_gt( native, rhs.native ); 103 | } 104 | 105 | FS_FORCEINLINE MaskType operator <( const Register& rhs ) const 106 | { 107 | return wasm_f32x4_lt( native, rhs.native ); 108 | } 109 | 110 | NativeType native; 111 | }; 112 | 113 | 114 | template>> 115 | FS_FORCEINLINE f32<4, SIMD> Load( TypeWrapper ptr ) 116 | { 117 | return wasm_v128_load( ptr.value ); 118 | } 119 | 120 | template>> 121 | FS_FORCEINLINE void Store( typename f32<4, SIMD>::ElementType* ptr, const f32<4, SIMD>& a ) 122 | { 123 | wasm_v128_store( ptr, a.native ); 124 | } 125 | 126 | template>> 127 | FS_FORCEINLINE float Extract0( const f32<4, SIMD>& a ) 128 | { 129 | return wasm_f32x4_extract_lane( a.native, 0 ); 130 | } 131 | 132 | template>> 133 | FS_FORCEINLINE f32<4, SIMD> Abs( const f32<4, SIMD>& a ) 134 | { 135 | return wasm_f32x4_abs( a.native ); 136 | } 137 | 138 | template>> 139 | 
FS_FORCEINLINE f32<4, SIMD> Round( const f32<4, SIMD>& a ) 140 | { 141 | return wasm_f32x4_nearest( a.native ); 142 | } 143 | 144 | template>> 145 | FS_FORCEINLINE f32<4, SIMD> Floor( const f32<4, SIMD>& a ) 146 | { 147 | return wasm_f32x4_floor( a.native ); 148 | } 149 | 150 | template>> 151 | FS_FORCEINLINE f32<4, SIMD> Trunc( const f32<4, SIMD>& a ) 152 | { 153 | return wasm_f32x4_trunc( a.native ); 154 | } 155 | 156 | template>> 157 | FS_FORCEINLINE f32<4, SIMD> Ceil( const f32<4, SIMD>& a ) 158 | { 159 | return wasm_f32x4_ceil( a.native ); 160 | } 161 | 162 | template>> 163 | FS_FORCEINLINE f32<4, SIMD> Min( const f32<4, SIMD>& a, const f32<4, SIMD>& b ) 164 | { 165 | if constexpr( FastSIMD::IsRelaxed() ) 166 | { 167 | return wasm_f32x4_relaxed_min( a.native, b.native ); 168 | } 169 | else 170 | { 171 | return wasm_f32x4_min( a.native, b.native ); 172 | } 173 | } 174 | 175 | template>> 176 | FS_FORCEINLINE f32<4, SIMD> Max( const f32<4, SIMD>& a, const f32<4, SIMD>& b ) 177 | { 178 | if constexpr( FastSIMD::IsRelaxed() ) 179 | { 180 | return wasm_f32x4_relaxed_max( a.native, b.native ); 181 | } 182 | else 183 | { 184 | return wasm_f32x4_max( a.native, b.native ); 185 | } 186 | } 187 | 188 | template>> 189 | FS_FORCEINLINE f32<4, SIMD> Select( const typename f32<4, SIMD>::MaskTypeArg& mask, const f32<4, SIMD>& ifTrue, const f32<4, SIMD>& ifFalse ) 190 | { 191 | return wasm_v128_bitselect( ifTrue.native, ifFalse.native, mask.native ); 192 | } 193 | 194 | 195 | template>> 196 | FS_FORCEINLINE f32<4, SIMD> Masked( const typename f32<4, SIMD>::MaskTypeArg& mask, const f32<4, SIMD>& a ) 197 | { 198 | return wasm_v128_and( mask.native, a.native ); 199 | } 200 | 201 | template>> 202 | FS_FORCEINLINE f32<4, SIMD> MaskedIncrement( const typename f32<4, SIMD>::MaskTypeArg& mask, const f32<4, SIMD>& a ) 203 | { 204 | return wasm_f32x4_sub( a.native, 205 | wasm_f32x4_convert_i32x4( static_cast( mask.native ) ) ); 206 | } 207 | 208 | template>> 209 | FS_FORCEINLINE f32<4, 
SIMD> MaskedDecrement( const typename f32<4, SIMD>::MaskTypeArg& mask, const f32<4, SIMD>& a ) 210 | { 211 | return wasm_f32x4_add( a.native, 212 | wasm_f32x4_convert_i32x4( static_cast( mask.native ) ) ); 213 | } 214 | 215 | 216 | template>> 217 | FS_FORCEINLINE f32<4, SIMD> Sqrt( const f32<4, SIMD>& a ) 218 | { 219 | return wasm_f32x4_sqrt( a.native ); 220 | } 221 | 222 | 223 | template>, typename = EnableIfRelaxed()> 224 | FS_FORCEINLINE f32<4, SIMD> FMulAdd( const f32<4, SIMD>& a, const f32<4, SIMD>& b, const f32<4, SIMD>& c ) 225 | { 226 | return wasm_f32x4_relaxed_madd( a.native, b.native, c.native ); 227 | } 228 | 229 | template>, typename = EnableIfRelaxed()> 230 | FS_FORCEINLINE f32<4, SIMD> FNMulAdd( const f32<4, SIMD>& a, const f32<4, SIMD>& b, const f32<4, SIMD>& c ) 231 | { 232 | return wasm_f32x4_relaxed_nmadd( a.native, b.native, c.native ); 233 | } 234 | } 235 | -------------------------------------------------------------------------------- /include/FastSIMD/ToolSet/WASM/128/i32x4.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace FS 6 | { 7 | template 8 | struct Register> 9 | { 10 | static constexpr size_t ElementCount = 4; 11 | static constexpr auto FeatureFlags = SIMD; 12 | 13 | using NativeType = v128_t; 14 | using ElementType = std::int32_t; 15 | using MaskType = m32; 16 | using MaskTypeArg = m32; 17 | 18 | FS_FORCEINLINE Register() = default; 19 | FS_FORCEINLINE Register( NativeType v ) : native( v ) { } 20 | FS_FORCEINLINE Register( std::int32_t v ) : native( wasm_i32x4_splat( v ) ) { } 21 | 22 | FS_FORCEINLINE NativeType GetNative() const 23 | { 24 | return native; 25 | } 26 | 27 | FS_FORCEINLINE Register& operator +=( const Register& rhs ) 28 | { 29 | native = wasm_i32x4_add( native, rhs.native ); 30 | return *this; 31 | } 32 | 33 | FS_FORCEINLINE Register& operator -=( const Register& rhs ) 34 | { 35 | native = wasm_i32x4_sub( native, rhs.native ); 36 | 
return *this; 37 | } 38 | 39 | FS_FORCEINLINE Register& operator *=( const Register& rhs ) 40 | { 41 | native = wasm_i32x4_mul( native, rhs.native ); 42 | return *this; 43 | } 44 | 45 | FS_FORCEINLINE Register& operator &=( const Register& rhs ) 46 | { 47 | native = wasm_v128_and( native, rhs.native ); 48 | return *this; 49 | } 50 | 51 | FS_FORCEINLINE Register& operator |=( const Register& rhs ) 52 | { 53 | native = wasm_v128_or( native, rhs.native ); 54 | return *this; 55 | } 56 | 57 | FS_FORCEINLINE Register& operator ^=( const Register& rhs ) 58 | { 59 | native = wasm_v128_xor( native, rhs.native ); 60 | return *this; 61 | } 62 | 63 | FS_FORCEINLINE Register& operator >>=( int rhs ) 64 | { 65 | native = wasm_i32x4_shr( native, rhs ); 66 | return *this; 67 | } 68 | 69 | FS_FORCEINLINE Register& operator <<=( int rhs ) 70 | { 71 | native = wasm_i32x4_shl( native, rhs ); 72 | return *this; 73 | } 74 | 75 | FS_FORCEINLINE Register operator ~() const 76 | { 77 | return wasm_v128_not( native ); 78 | } 79 | 80 | FS_FORCEINLINE Register operator -() const 81 | { 82 | return wasm_i32x4_neg( native ); 83 | } 84 | 85 | 86 | FS_FORCEINLINE MaskType operator ==( const Register& rhs ) const 87 | { 88 | return wasm_i32x4_eq( native, rhs.native ); 89 | } 90 | 91 | FS_FORCEINLINE MaskType operator !=( const Register& rhs ) const 92 | { 93 | return ~(*this == rhs); 94 | } 95 | 96 | FS_FORCEINLINE MaskType operator >=( const Register& rhs ) const 97 | { 98 | return wasm_i32x4_ge( native, rhs.native ); 99 | } 100 | 101 | FS_FORCEINLINE MaskType operator <=( const Register& rhs ) const 102 | { 103 | return wasm_i32x4_le( native, rhs.native ); 104 | } 105 | 106 | FS_FORCEINLINE MaskType operator >( const Register& rhs ) const 107 | { 108 | return wasm_i32x4_gt( native, rhs.native ); 109 | } 110 | 111 | FS_FORCEINLINE MaskType operator <( const Register& rhs ) const 112 | { 113 | return wasm_i32x4_lt( native, rhs.native ); 114 | } 115 | 116 | NativeType native; 117 | }; 118 | 119 | 
120 | template>> 121 | FS_FORCEINLINE i32<4, SIMD> Load( TypeWrapper ptr ) 122 | { 123 | return wasm_v128_load( ptr.value ); 124 | } 125 | 126 | template>> 127 | FS_FORCEINLINE void Store( typename i32<4, SIMD>::ElementType* ptr, const i32<4, SIMD>& a ) 128 | { 129 | wasm_v128_store( ptr, a.native ); 130 | } 131 | 132 | template>> 133 | FS_FORCEINLINE int32_t Extract0( const i32<4, SIMD>& a ) 134 | { 135 | return wasm_i32x4_extract_lane( a.native, 0 ); 136 | } 137 | 138 | template>> 139 | FS_FORCEINLINE i32<4, SIMD> Abs( const i32<4, SIMD>& a ) 140 | { 141 | return wasm_i32x4_abs( a.native ); 142 | } 143 | 144 | template>> 145 | FS_FORCEINLINE i32<4, SIMD> Min( const i32<4, SIMD>& a, const i32<4, SIMD>& b ) 146 | { 147 | return wasm_i32x4_min( a.native, b.native ); 148 | } 149 | 150 | template>> 151 | FS_FORCEINLINE i32<4, SIMD> Max( const i32<4, SIMD>& a, const i32<4, SIMD>& b ) 152 | { 153 | return wasm_i32x4_max( a.native, b.native ); 154 | } 155 | 156 | template>> 157 | FS_FORCEINLINE i32<4, SIMD> Select( const typename i32<4, SIMD>::MaskTypeArg& mask, const i32<4, SIMD>& ifTrue, const i32<4, SIMD>& ifFalse ) 158 | { 159 | return wasm_v128_bitselect( ifTrue.native, ifFalse.native, mask.native ); 160 | } 161 | 162 | template>> 163 | FS_FORCEINLINE i32<4, SIMD> BitShiftRightZeroExtend( const i32<4, SIMD>& a, int b ) 164 | { 165 | return wasm_u32x4_shr( a.native, b ); 166 | } 167 | 168 | 169 | template>> 170 | FS_FORCEINLINE i32<4, SIMD> Masked( const typename i32<4, SIMD>::MaskTypeArg& mask, const i32<4, SIMD>& a ) 171 | { 172 | return wasm_v128_and( mask.native, a.native ); 173 | } 174 | 175 | template>> 176 | FS_FORCEINLINE i32<4, SIMD> MaskedIncrement( const typename i32<4, SIMD>::MaskTypeArg& mask, const i32<4, SIMD>& a ) 177 | { 178 | return wasm_i32x4_sub( a.native, mask.native ); 179 | } 180 | 181 | template>> 182 | FS_FORCEINLINE i32<4, SIMD> MaskedDecrement( const typename i32<4, SIMD>::MaskTypeArg& mask, const i32<4, SIMD>& a ) 183 | { 184 | return 
wasm_i32x4_add( a.native, mask.native ); 185 | } 186 | } 187 | -------------------------------------------------------------------------------- /include/FastSIMD/ToolSet/WASM/128/m32x4.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace FS 6 | { 7 | namespace impl 8 | { 9 | struct WasmMaskBase32x4 10 | { 11 | v128_t native; 12 | }; 13 | } 14 | 15 | template 16 | struct Register, 4, SIMD, std::enable_if_t> 17 | : std::conditional_t, 4, SIMD>> 18 | { 19 | static constexpr size_t ElementCount = 4; 20 | static constexpr auto FeatureFlags = SIMD; 21 | 22 | using NativeType = decltype(WasmMaskBase32x4::native); 23 | using ElementType = Mask<32, OPTIMISE_FLOAT>; 24 | using MaskType = Register; 25 | using MaskTypeArg = Register; 26 | 27 | FS_FORCEINLINE Register() = default; 28 | FS_FORCEINLINE Register( NativeType v ) { this->native = v; } 29 | 30 | FS_FORCEINLINE NativeType GetNative() const 31 | { 32 | return this->native; 33 | } 34 | 35 | FS_FORCEINLINE Register& operator &=( const Register& rhs ) 36 | { 37 | this->native = wasm_v128_and( this->native, rhs.native ); 38 | return *this; 39 | } 40 | 41 | FS_FORCEINLINE Register& operator |=( const Register& rhs ) 42 | { 43 | this->native = wasm_v128_or( this->native, rhs.native ); 44 | return *this; 45 | } 46 | 47 | FS_FORCEINLINE Register& operator ^=( const Register& rhs ) 48 | { 49 | this->native = wasm_v128_xor( this->native, rhs.native ); 50 | return *this; 51 | } 52 | 53 | FS_FORCEINLINE Register operator ~() const 54 | { 55 | return wasm_v128_not( this->native ); 56 | } 57 | }; 58 | 59 | template, 4, SIMD>>> 60 | FS_FORCEINLINE bool AnyMask( const Register, 4, SIMD>& a ) 61 | { 62 | return wasm_v128_any_true(a.native); 63 | } 64 | 65 | template, 4, SIMD>>> 66 | FS_FORCEINLINE BitStorage<4> BitMask( const Register, 4, SIMD>& a ) 67 | { 68 | return wasm_i32x4_bitmask(a.native); 69 | } 70 | 71 | } 72 | 
-------------------------------------------------------------------------------- /include/FastSIMD/ToolSet/WASM/WASM.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #if FASTSIMD_MAX_FEATURE_VALUE() >= FASTSIMD_FEATURE_VALUE( WASM ) 4 | #include 5 | #include 6 | 7 | #include "128/f32x4.h" 8 | #include "128/i32x4.h" 9 | #include "128/m32x4.h" 10 | 11 | namespace FS 12 | { 13 | template>> 14 | FS_FORCEINLINE i32<4, SIMD> Convert( const f32<4, SIMD>& a, TypeDummy ) 15 | { 16 | if constexpr( FastSIMD::IsRelaxed() ) 17 | { 18 | return wasm_i32x4_relaxed_trunc_f32x4( Round( a ).native ); 19 | } 20 | else 21 | { 22 | return wasm_i32x4_trunc_sat_f32x4( Round( a ).native ); 23 | } 24 | } 25 | 26 | template>> 27 | FS_FORCEINLINE f32<4, SIMD> Convert( const i32<4, SIMD>& a, TypeDummy ) 28 | { 29 | return wasm_f32x4_convert_i32x4( a.native ); 30 | } 31 | 32 | template>> 33 | FS_FORCEINLINE Register Cast( const Register& a, TypeDummy ) 34 | { 35 | if constexpr( 36 | std::is_same_v::NativeType, __f32x4> && 37 | std::is_same_v::NativeType, v128_t> ) 38 | { 39 | return static_cast<__f32x4>( a.GetNative() ); 40 | } 41 | else if constexpr( 42 | std::is_same_v::NativeType, v128_t> && 43 | std::is_same_v::NativeType, __f32x4> ) 44 | { 45 | return static_cast( a.GetNative() ); 46 | } 47 | else 48 | { 49 | return a.GetNative(); 50 | } 51 | } 52 | } 53 | #endif 54 | -------------------------------------------------------------------------------- /include/FastSIMD/ToolSet/x86/128/f32x4.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace FS 6 | { 7 | template 8 | struct Register> 9 | { 10 | static constexpr size_t ElementCount = 4; 11 | static constexpr auto FeatureFlags = SIMD; 12 | 13 | using NativeType = __m128; 14 | using ElementType = float; 15 | using MaskType = m32; 16 | using MaskTypeArg = m32; 17 | 18 | FS_FORCEINLINE Register() = default; 
19 | FS_FORCEINLINE Register( NativeType v ) : native( v ) { } 20 | FS_FORCEINLINE Register( float v ) : native( _mm_set1_ps( v ) ) { } 21 | 22 | FS_FORCEINLINE NativeType GetNative() const 23 | { 24 | return native; 25 | } 26 | 27 | FS_FORCEINLINE Register& operator +=( const Register& rhs ) 28 | { 29 | native = _mm_add_ps( native, rhs.native ); 30 | return *this; 31 | } 32 | 33 | FS_FORCEINLINE Register& operator -=( const Register& rhs ) 34 | { 35 | native = _mm_sub_ps( native, rhs.native ); 36 | return *this; 37 | } 38 | 39 | FS_FORCEINLINE Register& operator *=( const Register& rhs ) 40 | { 41 | native = _mm_mul_ps( native, rhs.native ); 42 | return *this; 43 | } 44 | 45 | FS_FORCEINLINE Register& operator /=( const Register& rhs ) 46 | { 47 | native = _mm_div_ps( native, rhs.native ); 48 | return *this; 49 | } 50 | 51 | FS_FORCEINLINE Register& operator &=( const Register& rhs ) 52 | { 53 | native = _mm_and_ps( native, rhs.native ); 54 | return *this; 55 | } 56 | 57 | FS_FORCEINLINE Register& operator |=( const Register& rhs ) 58 | { 59 | native = _mm_or_ps( native, rhs.native ); 60 | return *this; 61 | } 62 | 63 | FS_FORCEINLINE Register& operator ^=( const Register& rhs ) 64 | { 65 | native = _mm_xor_ps( native, rhs.native ); 66 | return *this; 67 | } 68 | 69 | FS_FORCEINLINE Register operator~() const 70 | { 71 | const __m128i neg1 = _mm_set1_epi32( -1 ); 72 | return _mm_xor_ps( native, _mm_castsi128_ps( neg1 ) ); 73 | } 74 | 75 | FS_FORCEINLINE Register operator-() const 76 | { 77 | const __m128i minInt = _mm_set1_epi32( 0x80000000 ); 78 | return _mm_xor_ps( native, _mm_castsi128_ps( minInt ) ); 79 | } 80 | 81 | 82 | FS_FORCEINLINE MaskType operator ==( const Register& rhs ) const 83 | { 84 | return _mm_cmpeq_ps( native, rhs.native ); 85 | } 86 | 87 | FS_FORCEINLINE MaskType operator !=( const Register& rhs ) const 88 | { 89 | return _mm_cmpneq_ps( native, rhs.native ); 90 | } 91 | 92 | FS_FORCEINLINE MaskType operator >=( const Register& rhs ) const 93 | 
{ 94 | return _mm_cmpge_ps( native, rhs.native ); 95 | } 96 | 97 | FS_FORCEINLINE MaskType operator <=( const Register& rhs ) const 98 | { 99 | return _mm_cmple_ps( native, rhs.native ); 100 | } 101 | 102 | FS_FORCEINLINE MaskType operator >( const Register& rhs ) const 103 | { 104 | return _mm_cmpgt_ps( native, rhs.native ); 105 | } 106 | 107 | FS_FORCEINLINE MaskType operator <( const Register& rhs ) const 108 | { 109 | return _mm_cmplt_ps( native, rhs.native ); 110 | } 111 | 112 | NativeType native; 113 | }; 114 | 115 | 116 | template>> 117 | FS_FORCEINLINE f32<4, SIMD> Load( TypeWrapper ptr ) 118 | { 119 | return _mm_loadu_ps( ptr.value ); 120 | } 121 | 122 | template>> 123 | FS_FORCEINLINE void Store( typename f32<4, SIMD>::ElementType* ptr, const f32<4, SIMD>& a ) 124 | { 125 | _mm_storeu_ps( ptr, a.native ); 126 | } 127 | 128 | template>> 129 | FS_FORCEINLINE float Extract0( const f32<4, SIMD>& a ) 130 | { 131 | return _mm_cvtss_f32( a.native ); 132 | } 133 | 134 | template>> 135 | FS_FORCEINLINE f32<4, SIMD> Abs( const f32<4, SIMD>& a ) 136 | { 137 | const __m128i intMax = _mm_set1_epi32( 0x7FFFFFFF ); 138 | return _mm_and_ps( a.native, _mm_castsi128_ps( intMax ) ); 139 | } 140 | 141 | template>> 142 | FS_FORCEINLINE f32<4, SIMD> Round( const f32<4, SIMD>& a ) 143 | { 144 | if constexpr( SIMD & FastSIMD::FeatureFlag::SSE41 ) 145 | { 146 | return _mm_round_ps( a.native, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC ); 147 | } 148 | else 149 | { 150 | __m128i aInt = _mm_cvtps_epi32( a.native ); 151 | __m128 aIntF = _mm_cvtepi32_ps( aInt ); 152 | 153 | return _mm_xor_ps( aIntF, _mm_and_ps( _mm_castsi128_ps( _mm_cmpeq_epi32( aInt, _mm_set1_epi32( (-2147483647 - 1) ) ) ), _mm_xor_ps( a.native, aIntF ) ) ); 154 | } 155 | } 156 | 157 | template>> 158 | FS_FORCEINLINE f32<4, SIMD> Floor( const f32<4, SIMD>& a ) 159 | { 160 | if constexpr( SIMD & FastSIMD::FeatureFlag::SSE41 ) 161 | { 162 | return _mm_round_ps( a.native, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC ); 
163 | } 164 | else 165 | { 166 | f32<4, SIMD> aRound = Round( a ); 167 | 168 | return MaskedDecrement( aRound > a, aRound ); 169 | } 170 | } 171 | 172 | template>> 173 | FS_FORCEINLINE f32<4, SIMD> Ceil( const f32<4, SIMD>& a ) 174 | { 175 | if constexpr( SIMD & FastSIMD::FeatureFlag::SSE41 ) 176 | { 177 | return _mm_round_ps( a.native, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC ); 178 | } 179 | else 180 | { 181 | f32<4, SIMD> aRound = Round( a ); 182 | 183 | return MaskedIncrement( aRound < a, aRound ); 184 | } 185 | } 186 | 187 | template>> 188 | FS_FORCEINLINE f32<4, SIMD> Trunc( const f32<4, SIMD>& a ) 189 | { 190 | if constexpr( SIMD & FastSIMD::FeatureFlag::SSE41 ) 191 | { 192 | return _mm_round_ps( a.native, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC ); 193 | } 194 | else 195 | { 196 | __m128i aInt = _mm_cvttps_epi32( a.native ); 197 | __m128 aIntF = _mm_cvtepi32_ps( aInt ); 198 | 199 | return _mm_xor_ps( aIntF, _mm_and_ps( _mm_castsi128_ps( _mm_cmpeq_epi32( aInt, _mm_set1_epi32( (-2147483647 - 1) ) ) ), _mm_xor_ps( a.native, aIntF ) ) ); 200 | 201 | } 202 | } 203 | 204 | template>> 205 | FS_FORCEINLINE f32<4, SIMD> Min( const f32<4, SIMD>& a, const f32<4, SIMD>& b ) 206 | { 207 | return _mm_min_ps( a.native, b.native ); 208 | } 209 | 210 | template>> 211 | FS_FORCEINLINE f32<4, SIMD> Max( const f32<4, SIMD>& a, const f32<4, SIMD>& b ) 212 | { 213 | return _mm_max_ps( a.native, b.native ); 214 | } 215 | 216 | template>> 217 | FS_FORCEINLINE f32<4, SIMD> Select( const typename f32<4, SIMD>::MaskTypeArg& mask, const f32<4, SIMD>& ifTrue, const f32<4, SIMD>& ifFalse ) 218 | { 219 | if constexpr( SIMD & FastSIMD::FeatureFlag::SSE41 ) 220 | { 221 | return _mm_blendv_ps( ifFalse.native, ifTrue.native, mask.native ); 222 | } 223 | else 224 | { 225 | return _mm_xor_ps( ifFalse.native, _mm_and_ps( mask.native, _mm_xor_ps( ifTrue.native, ifFalse.native ) ) ); 226 | } 227 | } 228 | 229 | template>, typename = std::enable_if_t> 230 | FS_FORCEINLINE f32<4, SIMD> 
SelectHighBit( const Register& mask, const f32<4, SIMD>& ifTrue, const f32<4, SIMD>& ifFalse ) 231 | { 232 | return _mm_blendv_ps( ifFalse.native, ifTrue.native, FS::Cast( mask ).native ); 233 | } 234 | 235 | template>> 236 | FS_FORCEINLINE f32<4, SIMD> BitwiseAndNot( const f32<4, SIMD>& a, const f32<4, SIMD>& b ) 237 | { 238 | return _mm_andnot_ps( b.native, a.native ); 239 | } 240 | 241 | template>> 242 | FS_FORCEINLINE f32<4, SIMD> Masked( const typename f32<4, SIMD>::MaskTypeArg& mask, const f32<4, SIMD>& a ) 243 | { 244 | return _mm_and_ps( mask.native, a.native ); 245 | } 246 | 247 | template>> 248 | FS_FORCEINLINE f32<4, SIMD> InvMasked( const typename f32<4, SIMD>::MaskTypeArg& mask, const f32<4, SIMD>& a ) 249 | { 250 | return _mm_andnot_ps( mask.native, a.native ); 251 | } 252 | 253 | 254 | template>> 255 | FS_FORCEINLINE f32<4, SIMD> MaskedIncrement( const typename f32<4, SIMD>::MaskTypeArg& mask, const f32<4, SIMD>& a ) 256 | { 257 | return _mm_sub_ps( a.native, _mm_cvtepi32_ps( _mm_castps_si128( mask.native ) ) ); 258 | } 259 | 260 | template>> 261 | FS_FORCEINLINE f32<4, SIMD> MaskedDecrement( const typename f32<4, SIMD>::MaskTypeArg& mask, const f32<4, SIMD>& a ) 262 | { 263 | return _mm_add_ps( a.native, _mm_cvtepi32_ps( _mm_castps_si128( mask.native ) ) ); 264 | } 265 | 266 | 267 | template>, typename = EnableIfRelaxed()> 268 | FS_FORCEINLINE f32<4, SIMD> Reciprocal( const f32<4, SIMD>& a ) 269 | { 270 | return _mm_rcp_ps( a.native ); 271 | } 272 | 273 | template>, typename = EnableIfRelaxed()> 274 | FS_FORCEINLINE f32<4, SIMD> InvSqrt( const f32<4, SIMD>& a ) 275 | { 276 | return _mm_rsqrt_ps( a.native ); 277 | } 278 | 279 | template>> 280 | FS_FORCEINLINE f32<4, SIMD> Sqrt( const f32<4, SIMD>& a ) 281 | { 282 | return _mm_sqrt_ps( a.native ); 283 | } 284 | } 285 | -------------------------------------------------------------------------------- /include/FastSIMD/ToolSet/x86/128/i32x4.h: 
-------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace FS 6 | { 7 | template 8 | struct Register> 9 | { 10 | static constexpr size_t ElementCount = 4; 11 | static constexpr auto FeatureFlags = SIMD; 12 | 13 | using NativeType = __m128i; 14 | using ElementType = std::int32_t; 15 | using MaskType = m32; 16 | using MaskTypeArg = m32; 17 | 18 | FS_FORCEINLINE Register() = default; 19 | FS_FORCEINLINE Register( NativeType v ) : native( v ) { } 20 | FS_FORCEINLINE Register( std::int32_t v ) : native( _mm_set1_epi32( v ) ) { } 21 | 22 | FS_FORCEINLINE NativeType GetNative() const 23 | { 24 | return native; 25 | } 26 | 27 | FS_FORCEINLINE Register& operator +=( const Register& rhs ) 28 | { 29 | native = _mm_add_epi32( native, rhs.native ); 30 | return *this; 31 | } 32 | 33 | FS_FORCEINLINE Register& operator -=( const Register& rhs ) 34 | { 35 | native = _mm_sub_epi32( native, rhs.native ); 36 | return *this; 37 | } 38 | 39 | FS_FORCEINLINE Register& operator *=( const Register& rhs ) 40 | { 41 | if constexpr( SIMD & FastSIMD::FeatureFlag::SSE41 ) 42 | { 43 | native = _mm_mullo_epi32( native, rhs.native ); 44 | } 45 | else 46 | { 47 | __m128i mul20 = _mm_mul_epu32( native, rhs.native ); /* mul 2,0*/ 48 | __m128i mul31 = _mm_mul_epu32( _mm_srli_si128( native, 4 ), _mm_srli_si128( rhs.native, 4 ) ); /* mul 3,1 */ 49 | native = _mm_unpacklo_epi32( _mm_shuffle_epi32( mul20, _MM_SHUFFLE( 0, 0, 2, 0 ) ), _mm_shuffle_epi32( mul31, _MM_SHUFFLE( 0, 0, 2, 0 ) ) ); /* shuffle results to [63..0] and pack */ 50 | } 51 | return *this; 52 | } 53 | 54 | FS_FORCEINLINE Register& operator &=( const Register& rhs ) 55 | { 56 | native = _mm_and_si128( native, rhs.native ); 57 | return *this; 58 | } 59 | 60 | FS_FORCEINLINE Register& operator |=( const Register& rhs ) 61 | { 62 | native = _mm_or_si128( native, rhs.native ); 63 | return *this; 64 | } 65 | 66 | FS_FORCEINLINE Register& operator ^=( const Register& rhs 
) 67 | { 68 | native = _mm_xor_si128( native, rhs.native ); 69 | return *this; 70 | } 71 | 72 | FS_FORCEINLINE Register& operator >>=( int rhs ) 73 | { 74 | native = _mm_srai_epi32( native, rhs ); 75 | return *this; 76 | } 77 | 78 | FS_FORCEINLINE Register& operator <<=( int rhs ) 79 | { 80 | native = _mm_slli_epi32( native, rhs ); 81 | return *this; 82 | } 83 | 84 | FS_FORCEINLINE Register operator ~() const 85 | { 86 | const __m128i neg1 = _mm_set1_epi32( -1 ); 87 | return _mm_xor_si128( native, neg1 ); 88 | } 89 | 90 | FS_FORCEINLINE Register operator -() const 91 | { 92 | return _mm_sub_epi32( _mm_setzero_si128(), native ); 93 | } 94 | 95 | 96 | FS_FORCEINLINE MaskType operator ==( const Register& rhs ) const 97 | { 98 | return _mm_cmpeq_epi32( native, rhs.native ); 99 | } 100 | 101 | FS_FORCEINLINE MaskType operator !=( const Register& rhs ) const 102 | { 103 | return ~(*this == rhs); 104 | } 105 | 106 | FS_FORCEINLINE MaskType operator >=( const Register& rhs ) const 107 | { 108 | return ~(*this < rhs); 109 | } 110 | 111 | FS_FORCEINLINE MaskType operator <=( const Register& rhs ) const 112 | { 113 | return ~(*this > rhs); 114 | } 115 | 116 | FS_FORCEINLINE MaskType operator >( const Register& rhs ) const 117 | { 118 | return _mm_cmpgt_epi32( native, rhs.native ); 119 | } 120 | 121 | FS_FORCEINLINE MaskType operator <( const Register& rhs ) const 122 | { 123 | return _mm_cmplt_epi32( native, rhs.native ); 124 | } 125 | 126 | NativeType native; 127 | }; 128 | 129 | 130 | template>> 131 | FS_FORCEINLINE i32<4, SIMD> Load( TypeWrapper ptr ) 132 | { 133 | return _mm_loadu_si128( (__m128i*)ptr.value ); 134 | } 135 | 136 | template>> 137 | FS_FORCEINLINE void Store( typename i32<4, SIMD>::ElementType* ptr, const i32<4, SIMD>& a ) 138 | { 139 | _mm_storeu_si128( (__m128i*)ptr, a.native ); 140 | } 141 | 142 | template>> 143 | FS_FORCEINLINE int32_t Extract0( const i32<4, SIMD>& a ) 144 | { 145 | return _mm_cvtsi128_si32( a.native ); 146 | } 147 | 148 | template>> 149 
| FS_FORCEINLINE i32<4, SIMD> Abs( const i32<4, SIMD>& a ) 150 | { 151 | if constexpr( SIMD & FastSIMD::FeatureFlag::SSSE3 ) 152 | { 153 | return _mm_abs_epi32( a.native ); 154 | } 155 | else 156 | { 157 | __m128i signMask = _mm_srai_epi32( a.native, 31 ); 158 | return _mm_sub_epi32( _mm_xor_si128( a.native, signMask ), signMask ); 159 | } 160 | } 161 | 162 | template>> 163 | FS_FORCEINLINE i32<4, SIMD> Min( const i32<4, SIMD>& a, const i32<4, SIMD>& b ) 164 | { 165 | if constexpr( SIMD & FastSIMD::FeatureFlag::SSE41 ) 166 | { 167 | return _mm_min_epi32( a.native, b.native ); 168 | } 169 | else 170 | { 171 | return Select( a < b, a, b ); 172 | } 173 | } 174 | 175 | template>> 176 | FS_FORCEINLINE i32<4, SIMD> Max( const i32<4, SIMD>& a, const i32<4, SIMD>& b ) 177 | { 178 | if constexpr( SIMD & FastSIMD::FeatureFlag::SSE41 ) 179 | { 180 | return _mm_max_epi32( a.native, b.native ); 181 | } 182 | else 183 | { 184 | return Select( a > b, a, b ); 185 | } 186 | } 187 | 188 | template>> 189 | FS_FORCEINLINE i32<4, SIMD> Select( const typename i32<4, SIMD>::MaskTypeArg& mask, const i32<4, SIMD>& ifTrue, const i32<4, SIMD>& ifFalse ) 190 | { 191 | if constexpr( SIMD & FastSIMD::FeatureFlag::SSE41 ) 192 | { 193 | return _mm_blendv_epi8( ifFalse.native, ifTrue.native, _mm_castps_si128( mask.native ) ); 194 | } 195 | else 196 | { 197 | return _mm_xor_si128( ifFalse.native, _mm_and_si128( _mm_castps_si128( mask.native ), _mm_xor_si128( ifTrue.native, ifFalse.native ) ) ); 198 | } 199 | } 200 | 201 | template>, typename = std::enable_if_t> 202 | FS_FORCEINLINE i32<4, SIMD> SelectHighBit( const Register& mask, const i32<4, SIMD>& ifTrue, const i32<4, SIMD>& ifFalse ) 203 | { 204 | return _mm_castps_si128( _mm_blendv_ps( _mm_castsi128_ps( ifFalse.native ), _mm_castsi128_ps( ifTrue.native ), FS::Cast( mask ).native ) ); 205 | } 206 | 207 | template>> 208 | FS_FORCEINLINE i32<4, SIMD> BitwiseAndNot( const i32<4, SIMD>& a, const i32<4, SIMD>& b ) 209 | { 210 | return 
_mm_andnot_si128( b.native, a.native ); 211 | } 212 | 213 | template>> 214 | FS_FORCEINLINE i32<4, SIMD> BitShiftRightZeroExtend( const i32<4, SIMD>& a, int b ) 215 | { 216 | return _mm_srli_epi32( a.native, b ); 217 | } 218 | 219 | 220 | template>> 221 | FS_FORCEINLINE i32<4, SIMD> Masked( const typename i32<4, SIMD>::MaskTypeArg& mask, const i32<4, SIMD>& a ) 222 | { 223 | return _mm_and_si128( _mm_castps_si128( mask.native ), a.native ); 224 | } 225 | 226 | template>> 227 | FS_FORCEINLINE i32<4, SIMD> InvMasked( const typename i32<4, SIMD>::MaskTypeArg& mask, const i32<4, SIMD>& a ) 228 | { 229 | return _mm_andnot_si128( _mm_castps_si128( mask.native ), a.native ); 230 | } 231 | 232 | template>> 233 | FS_FORCEINLINE i32<4, SIMD> MaskedIncrement( const typename i32<4, SIMD>::MaskTypeArg& mask, const i32<4, SIMD>& a ) 234 | { 235 | return _mm_sub_epi32( a.native, _mm_castps_si128( mask.native ) ); 236 | } 237 | 238 | template>> 239 | FS_FORCEINLINE i32<4, SIMD> MaskedDecrement( const typename i32<4, SIMD>::MaskTypeArg& mask, const i32<4, SIMD>& a ) 240 | { 241 | return _mm_add_epi32( a.native, _mm_castps_si128( mask.native ) ); 242 | } 243 | } 244 | -------------------------------------------------------------------------------- /include/FastSIMD/ToolSet/x86/128/m32x4.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace FS 6 | { 7 | template 8 | struct Register, 4, SIMD, std::enable_if_t> 9 | { 10 | static constexpr size_t ElementCount = 4; 11 | static constexpr auto FeatureFlags = SIMD; 12 | 13 | using NativeType = __m128; 14 | using ElementType = Mask<32, true>; 15 | using MaskType = Register; 16 | using MaskTypeArg = Register; 17 | 18 | FS_FORCEINLINE Register() = default; 19 | FS_FORCEINLINE Register( NativeType v ) : native( v ) { } 20 | 21 | FS_FORCEINLINE NativeType GetNative() const 22 | { 23 | return native; 24 | } 25 | 26 | FS_FORCEINLINE Register& operator &=( const 
Register& rhs ) 27 | { 28 | native = _mm_and_ps( native, rhs.native ); 29 | return *this; 30 | } 31 | 32 | FS_FORCEINLINE Register& operator |=( const Register& rhs ) 33 | { 34 | native = _mm_or_ps( native, rhs.native ); 35 | return *this; 36 | } 37 | 38 | FS_FORCEINLINE Register& operator ^=( const Register& rhs ) 39 | { 40 | native = _mm_xor_ps( native, rhs.native ); 41 | return *this; 42 | } 43 | 44 | FS_FORCEINLINE Register operator ~() const 45 | { 46 | const __m128i neg1 = _mm_set1_epi32( -1 ); 47 | return _mm_xor_ps( native, _mm_castsi128_ps( neg1 ) ); 48 | } 49 | 50 | NativeType native; 51 | }; 52 | 53 | template>> 54 | FS_FORCEINLINE m32<4, true, SIMD> BitwiseAndNot( const m32<4, true, SIMD>& a, const m32<4, true, SIMD>& b ) 55 | { 56 | return _mm_andnot_ps( b.native, a.native ); 57 | } 58 | 59 | template>> 60 | FS_FORCEINLINE bool AnyMask( const m32<4, true, SIMD>& a ) 61 | { 62 | return _mm_movemask_ps( a.native ); 63 | } 64 | 65 | template>> 66 | FS_FORCEINLINE BitStorage<4> BitMask( const m32<4, B, SIMD>& a ) 67 | { 68 | return static_cast>( _mm_movemask_ps( a.native ) ); 69 | } 70 | 71 | 72 | template 73 | struct Register, 4, SIMD, std::enable_if_t> : Register, 4, SIMD> 74 | { 75 | static constexpr size_t ElementCount = 4; 76 | static constexpr auto FeatureFlags = SIMD; 77 | 78 | using NativeType = __m128i; 79 | using ElementType = Mask<32, false>; 80 | using MaskType = Register; 81 | using MaskTypeArg = Register, 4, SIMD>; 82 | 83 | FS_FORCEINLINE Register() = default; 84 | FS_FORCEINLINE Register( NativeType v ) : Register, 4, SIMD>( _mm_castsi128_ps( v ) ) { } 85 | 86 | FS_FORCEINLINE NativeType GetNative() const 87 | { 88 | return _mm_castps_si128( this->native ); 89 | } 90 | 91 | FS_FORCEINLINE Register& operator &=( const Register& rhs ) 92 | { 93 | this->native = _mm_castsi128_ps( _mm_and_si128( _mm_castps_si128( this->native ), _mm_castps_si128( rhs.native ) ) ); 94 | return *this; 95 | } 96 | 97 | FS_FORCEINLINE Register& operator |=( const 
Register& rhs ) 98 | { 99 | this->native = _mm_castsi128_ps( _mm_or_si128( _mm_castps_si128( this->native ), _mm_castps_si128( rhs.native ) ) ); 100 | return *this; 101 | } 102 | 103 | FS_FORCEINLINE Register& operator ^=( const Register& rhs ) 104 | { 105 | this->native = _mm_castsi128_ps( _mm_xor_si128( _mm_castps_si128( this->native ), _mm_castps_si128( rhs.native ) ) ); 106 | return *this; 107 | } 108 | 109 | FS_FORCEINLINE Register operator ~() const 110 | { 111 | const __m128i neg1 = _mm_set1_epi32( -1 ); 112 | return _mm_xor_si128( _mm_castps_si128( this->native ), neg1 ); 113 | } 114 | }; 115 | 116 | template>> 117 | FS_FORCEINLINE m32<4, false, SIMD> BitwiseAndNot( const m32<4, false, SIMD>& a, const m32<4, false, SIMD>& b ) 118 | { 119 | return _mm_andnot_si128( _mm_castps_si128( b.native ), _mm_castps_si128( a.native ) ); 120 | } 121 | 122 | template>> 123 | FS_FORCEINLINE bool AnyMask( const m32<4, false, SIMD>& a ) 124 | { 125 | return _mm_movemask_epi8( _mm_castps_si128( a.native ) ); 126 | } 127 | } 128 | -------------------------------------------------------------------------------- /include/FastSIMD/ToolSet/x86/256/f32x8.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace FS 6 | { 7 | template 8 | struct Register> 9 | { 10 | static constexpr size_t ElementCount = 8; 11 | static constexpr auto FeatureFlags = SIMD; 12 | 13 | using NativeType = __m256; 14 | using ElementType = float; 15 | using MaskType = m32; 16 | using MaskTypeArg = m32; 17 | 18 | FS_FORCEINLINE Register() = default; 19 | FS_FORCEINLINE Register( NativeType v ) : native( v ) { } 20 | FS_FORCEINLINE Register( float v ) : native( _mm256_set1_ps( v ) ) { } 21 | 22 | FS_FORCEINLINE NativeType GetNative() const 23 | { 24 | return native; 25 | } 26 | 27 | FS_FORCEINLINE Register& operator +=( const Register& rhs ) 28 | { 29 | native = _mm256_add_ps( native, rhs.native ); 30 | return *this; 31 | } 32 | 33 
| FS_FORCEINLINE Register& operator -=( const Register& rhs ) 34 | { 35 | native = _mm256_sub_ps( native, rhs.native ); 36 | return *this; 37 | } 38 | 39 | FS_FORCEINLINE Register& operator *=( const Register& rhs ) 40 | { 41 | native = _mm256_mul_ps( native, rhs.native ); 42 | return *this; 43 | } 44 | 45 | FS_FORCEINLINE Register& operator /=( const Register& rhs ) 46 | { 47 | native = _mm256_div_ps( native, rhs.native ); 48 | return *this; 49 | } 50 | 51 | FS_FORCEINLINE Register& operator &=( const Register& rhs ) 52 | { 53 | native = _mm256_and_ps( native, rhs.native ); 54 | return *this; 55 | } 56 | 57 | FS_FORCEINLINE Register& operator |=( const Register& rhs ) 58 | { 59 | native = _mm256_or_ps( native, rhs.native ); 60 | return *this; 61 | } 62 | 63 | FS_FORCEINLINE Register& operator ^=( const Register& rhs ) 64 | { 65 | native = _mm256_xor_ps( native, rhs.native ); 66 | return *this; 67 | } 68 | 69 | FS_FORCEINLINE Register operator~() const 70 | { 71 | const __m256i neg1 = _mm256_set1_epi32( -1 ); 72 | return _mm256_xor_ps( native, _mm256_castsi256_ps( neg1 ) ); 73 | } 74 | 75 | FS_FORCEINLINE Register operator-() const 76 | { 77 | const __m256i minInt = _mm256_set1_epi32( 0x80000000 ); 78 | return _mm256_xor_ps( native, _mm256_castsi256_ps( minInt ) ); 79 | } 80 | 81 | 82 | FS_FORCEINLINE MaskType operator ==( const Register& rhs ) const 83 | { 84 | return _mm256_cmp_ps( native, rhs.native, _CMP_EQ_OQ ); 85 | } 86 | 87 | FS_FORCEINLINE MaskType operator !=( const Register& rhs ) const 88 | { 89 | return _mm256_cmp_ps( native, rhs.native, _CMP_NEQ_OQ ); 90 | } 91 | 92 | FS_FORCEINLINE MaskType operator >=( const Register& rhs ) const 93 | { 94 | return _mm256_cmp_ps( native, rhs.native, _CMP_GE_OQ ); 95 | } 96 | 97 | FS_FORCEINLINE MaskType operator <=( const Register& rhs ) const 98 | { 99 | return _mm256_cmp_ps( native, rhs.native, _CMP_LE_OQ ); 100 | } 101 | 102 | FS_FORCEINLINE MaskType operator >( const Register& rhs ) const 103 | { 104 | return 
_mm256_cmp_ps( native, rhs.native, _CMP_GT_OQ ); 105 | } 106 | 107 | FS_FORCEINLINE MaskType operator <( const Register& rhs ) const 108 | { 109 | return _mm256_cmp_ps( native, rhs.native, _CMP_LT_OQ ); 110 | } 111 | 112 | NativeType native; 113 | }; 114 | 115 | 116 | template>> 117 | FS_FORCEINLINE f32<8, SIMD> Load( TypeWrapper ptr ) 118 | { 119 | return _mm256_loadu_ps( ptr.value ); 120 | } 121 | 122 | template>> 123 | FS_FORCEINLINE void Store( typename f32<8, SIMD>::ElementType* ptr, const f32<8, SIMD>& a ) 124 | { 125 | _mm256_storeu_ps( ptr, a.native ); 126 | } 127 | 128 | template>> 129 | FS_FORCEINLINE float Extract0( const f32<8, SIMD>& a ) 130 | { 131 | return _mm256_cvtss_f32( a.native ); 132 | } 133 | 134 | template>> 135 | FS_FORCEINLINE f32<8, SIMD> Abs( const f32<8, SIMD>& a ) 136 | { 137 | const __m256i intMax = _mm256_set1_epi32( 0x7FFFFFFF ); 138 | return _mm256_and_ps( a.native, _mm256_castsi256_ps( intMax ) ); 139 | } 140 | 141 | template>> 142 | FS_FORCEINLINE f32<8, SIMD> Round( const f32<8, SIMD>& a ) 143 | { 144 | return _mm256_round_ps( a.native, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC ); 145 | } 146 | 147 | template>> 148 | FS_FORCEINLINE f32<8, SIMD> Floor( const f32<8, SIMD>& a ) 149 | { 150 | return _mm256_round_ps( a.native, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC ); 151 | } 152 | 153 | template>> 154 | FS_FORCEINLINE f32<8, SIMD> Ceil( const f32<8, SIMD>& a ) 155 | { 156 | return _mm256_round_ps( a.native, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC ); 157 | } 158 | 159 | template>> 160 | FS_FORCEINLINE f32<8, SIMD> Trunc( const f32<8, SIMD>& a ) 161 | { 162 | return _mm256_round_ps( a.native, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC ); 163 | } 164 | 165 | template>> 166 | FS_FORCEINLINE f32<8, SIMD> Min( const f32<8, SIMD>& a, const f32<8, SIMD>& b ) 167 | { 168 | return _mm256_min_ps( a.native, b.native ); 169 | } 170 | 171 | template>> 172 | FS_FORCEINLINE f32<8, SIMD> Max( const f32<8, SIMD>& a, const f32<8, SIMD>& b ) 173 | 
{ 174 | return _mm256_max_ps( a.native, b.native ); 175 | } 176 | 177 | template>> 178 | FS_FORCEINLINE f32<8, SIMD> Select( const typename f32<8, SIMD>::MaskTypeArg& mask, const f32<8, SIMD>& ifTrue, const f32<8, SIMD>& ifFalse ) 179 | { 180 | return _mm256_blendv_ps( ifFalse.native, ifTrue.native, mask.native ); 181 | } 182 | 183 | template>> 184 | FS_FORCEINLINE f32<8, SIMD> SelectHighBit( const Register& mask, const f32<8, SIMD>& ifTrue, const f32<8, SIMD>& ifFalse ) 185 | { 186 | return _mm256_blendv_ps( ifFalse.native, ifTrue.native, FS::Cast( mask ).native ); 187 | } 188 | 189 | template>> 190 | FS_FORCEINLINE f32<8, SIMD> BitwiseAndNot( const f32<8, SIMD>& a, const f32<8, SIMD>& b ) 191 | { 192 | return _mm256_andnot_ps( b.native, a.native ); 193 | } 194 | 195 | template>> 196 | FS_FORCEINLINE f32<8, SIMD> Masked( const typename f32<8, SIMD>::MaskTypeArg& mask, const f32<8, SIMD>& a ) 197 | { 198 | return _mm256_and_ps( mask.native, a.native ); 199 | } 200 | 201 | template>> 202 | FS_FORCEINLINE f32<8, SIMD> InvMasked( const typename f32<8, SIMD>::MaskTypeArg& mask, const f32<8, SIMD>& a ) 203 | { 204 | return _mm256_andnot_ps( mask.native, a.native ); 205 | } 206 | 207 | 208 | template>> 209 | FS_FORCEINLINE f32<8, SIMD> MaskedIncrement( const typename f32<8, SIMD>::MaskTypeArg& mask, const f32<8, SIMD>& a ) 210 | { 211 | return _mm256_sub_ps( a.native, _mm256_cvtepi32_ps( _mm256_castps_si256( mask.native ) ) ); 212 | } 213 | 214 | template>> 215 | FS_FORCEINLINE f32<8, SIMD> MaskedDecrement( const typename f32<8, SIMD>::MaskTypeArg& mask, const f32<8, SIMD>& a ) 216 | { 217 | return _mm256_add_ps( a.native, _mm256_cvtepi32_ps( _mm256_castps_si256( mask.native ) ) ); 218 | } 219 | 220 | 221 | template>, typename = EnableIfRelaxed> 222 | FS_FORCEINLINE f32<8, SIMD> Reciprocal( const f32<8, SIMD>& a ) 223 | { 224 | return _mm256_rcp_ps( a.native ); 225 | } 226 | 227 | template>, typename = EnableIfRelaxed> 228 | FS_FORCEINLINE f32<8, SIMD> InvSqrt( const 
f32<8, SIMD>& a ) 229 | { 230 | return _mm256_rsqrt_ps( a.native ); 231 | } 232 | 233 | template>> 234 | FS_FORCEINLINE f32<8, SIMD> Sqrt( const f32<8, SIMD>& a ) 235 | { 236 | return _mm256_sqrt_ps( a.native ); 237 | } 238 | 239 | template>, typename = EnableIfRelaxed, typename = std::enable_if_t> 240 | FS_FORCEINLINE f32<8, SIMD> FMulAdd( const f32<8, SIMD>& a, const f32<8, SIMD>& b, const f32<8, SIMD>& c ) 241 | { 242 | return _mm256_fmadd_ps( a.native, b.native, c.native ); 243 | } 244 | 245 | template>, typename = EnableIfRelaxed, typename = std::enable_if_t> 246 | FS_FORCEINLINE f32<8, SIMD> FMulSub( const f32<8, SIMD>& a, const f32<8, SIMD>& b, const f32<8, SIMD>& c ) 247 | { 248 | return _mm256_fmsub_ps( a.native, b.native, c.native ); 249 | } 250 | 251 | template>, typename = EnableIfRelaxed, typename = std::enable_if_t> 252 | FS_FORCEINLINE f32<8, SIMD> FNMulAdd( const f32<8, SIMD>& a, const f32<8, SIMD>& b, const f32<8, SIMD>& c ) 253 | { 254 | return _mm256_fnmadd_ps( a.native, b.native, c.native ); 255 | } 256 | 257 | template>, typename = EnableIfRelaxed, typename = std::enable_if_t> 258 | FS_FORCEINLINE f32<8, SIMD> FNMulSub( const f32<8, SIMD>& a, const f32<8, SIMD>& b, const f32<8, SIMD>& c ) 259 | { 260 | return _mm256_fnmsub_ps( a.native, b.native, c.native ); 261 | } 262 | } 263 | -------------------------------------------------------------------------------- /include/FastSIMD/ToolSet/x86/256/i32x8.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace FS 6 | { 7 | template 8 | struct Register> 9 | { 10 | static constexpr size_t ElementCount = 8; 11 | static constexpr auto FeatureFlags = SIMD; 12 | 13 | using NativeType = __m256i; 14 | using ElementType = std::int32_t; 15 | using MaskType = m32; 16 | using MaskTypeArg = m32; 17 | 18 | FS_FORCEINLINE Register() = default; 19 | FS_FORCEINLINE Register( NativeType v ) : native( v ) { } 20 | FS_FORCEINLINE Register( 
std::int32_t v ) : native( _mm256_set1_epi32( v ) ) { } 21 | 22 | FS_FORCEINLINE NativeType GetNative() const 23 | { 24 | return native; 25 | } 26 | 27 | FS_FORCEINLINE Register& operator +=( const Register& rhs ) 28 | { 29 | native = _mm256_add_epi32( native, rhs.native ); 30 | return *this; 31 | } 32 | 33 | FS_FORCEINLINE Register& operator -=( const Register& rhs ) 34 | { 35 | native = _mm256_sub_epi32( native, rhs.native ); 36 | return *this; 37 | } 38 | 39 | FS_FORCEINLINE Register& operator *=( const Register& rhs ) 40 | { 41 | native = _mm256_mullo_epi32( native, rhs.native ); 42 | return *this; 43 | } 44 | 45 | FS_FORCEINLINE Register& operator &=( const Register& rhs ) 46 | { 47 | native = _mm256_and_si256( native, rhs.native ); 48 | return *this; 49 | } 50 | 51 | FS_FORCEINLINE Register& operator |=( const Register& rhs ) 52 | { 53 | native = _mm256_or_si256( native, rhs.native ); 54 | return *this; 55 | } 56 | 57 | FS_FORCEINLINE Register& operator ^=( const Register& rhs ) 58 | { 59 | native = _mm256_xor_si256( native, rhs.native ); 60 | return *this; 61 | } 62 | 63 | FS_FORCEINLINE Register& operator >>=( int rhs ) 64 | { 65 | native = _mm256_srai_epi32( native, rhs ); 66 | return *this; 67 | } 68 | 69 | FS_FORCEINLINE Register& operator <<=( int rhs ) 70 | { 71 | native = _mm256_slli_epi32( native, rhs ); 72 | return *this; 73 | } 74 | 75 | FS_FORCEINLINE Register operator ~() const 76 | { 77 | const __m256i neg1 = _mm256_set1_epi32( -1 ); 78 | return _mm256_xor_si256( native, neg1 ); 79 | } 80 | 81 | FS_FORCEINLINE Register operator -() const 82 | { 83 | return _mm256_sub_epi32( _mm256_setzero_si256(), native ); 84 | } 85 | 86 | 87 | FS_FORCEINLINE MaskType operator ==( const Register& rhs ) const 88 | { 89 | return _mm256_cmpeq_epi32( native, rhs.native ); 90 | } 91 | 92 | FS_FORCEINLINE MaskType operator !=( const Register& rhs ) const 93 | { 94 | return ~(*this == rhs); 95 | } 96 | 97 | FS_FORCEINLINE MaskType operator >=( const Register& rhs ) 
const 98 | { 99 | return ~(*this < rhs); 100 | } 101 | 102 | FS_FORCEINLINE MaskType operator <=( const Register& rhs ) const 103 | { 104 | return ~(*this > rhs); 105 | } 106 | 107 | FS_FORCEINLINE MaskType operator >( const Register& rhs ) const 108 | { 109 | return _mm256_cmpgt_epi32( native, rhs.native ); 110 | } 111 | 112 | FS_FORCEINLINE MaskType operator <( const Register& rhs ) const 113 | { 114 | return _mm256_cmpgt_epi32( rhs.native, native ); 115 | } 116 | 117 | NativeType native; 118 | }; 119 | 120 | 121 | template>> 122 | FS_FORCEINLINE i32<8, SIMD> Load( TypeWrapper ptr ) 123 | { 124 | return _mm256_loadu_si256( (const __m256i*)ptr.value ); 125 | } 126 | 127 | template>> 128 | FS_FORCEINLINE void Store( typename i32<8, SIMD>::ElementType* ptr, const i32<8, SIMD>& a ) 129 | { 130 | _mm256_storeu_si256( (__m256i*)ptr, a.native ); 131 | } 132 | 133 | template>> 134 | FS_FORCEINLINE int32_t Extract0( const i32<8, SIMD>& a ) 135 | { 136 | return _mm256_cvtsi256_si32( a.native ); 137 | } 138 | 139 | template>> 140 | FS_FORCEINLINE i32<8, SIMD> Abs( const i32<8, SIMD>& a ) 141 | { 142 | return _mm256_abs_epi32( a.native ); 143 | } 144 | 145 | template>> 146 | FS_FORCEINLINE i32<8, SIMD> Min( const i32<8, SIMD>& a, const i32<8, SIMD>& b ) 147 | { 148 | return _mm256_min_epi32( a.native, b.native ); 149 | } 150 | 151 | template>> 152 | FS_FORCEINLINE i32<8, SIMD> Max( const i32<8, SIMD>& a, const i32<8, SIMD>& b ) 153 | { 154 | return _mm256_max_epi32( a.native, b.native ); 155 | } 156 | 157 | template>> 158 | FS_FORCEINLINE i32<8, SIMD> Select( const typename i32<8, SIMD>::MaskTypeArg& mask, const i32<8, SIMD>& ifTrue, const i32<8, SIMD>& ifFalse ) 159 | { 160 | return _mm256_blendv_epi8( ifFalse.native, ifTrue.native, _mm256_castps_si256( mask.native ) ); 161 | } 162 | 163 | template>> 164 | FS_FORCEINLINE i32<8, SIMD> SelectHighBit( const Register& mask, const i32<8, SIMD>& ifTrue, const i32<8, SIMD>& ifFalse ) 165 | { 166 | return _mm256_castps_si256( 
_mm256_blendv_ps( _mm256_castsi256_ps( ifFalse.native ), _mm256_castsi256_ps( ifTrue.native ), FS::Cast( mask ).native ) ); 167 | } 168 | 169 | template>> 170 | FS_FORCEINLINE i32<8, SIMD> BitwiseAndNot( const i32<8, SIMD>& a, const i32<8, SIMD>& b ) 171 | { 172 | return _mm256_andnot_si256( b.native, a.native ); 173 | } 174 | 175 | template>> 176 | FS_FORCEINLINE i32<8, SIMD> BitShiftRightZeroExtend( const i32<8, SIMD>& a, int b ) 177 | { 178 | return _mm256_srli_epi32( a.native, b ); 179 | } 180 | 181 | 182 | template>> 183 | FS_FORCEINLINE i32<8, SIMD> Masked( const typename i32<8, SIMD>::MaskTypeArg& mask, const i32<8, SIMD>& a ) 184 | { 185 | return _mm256_and_si256( _mm256_castps_si256( mask.native ), a.native ); 186 | } 187 | 188 | template>> 189 | FS_FORCEINLINE i32<8, SIMD> InvMasked( const typename i32<8, SIMD>::MaskTypeArg& mask, const i32<8, SIMD>& a ) 190 | { 191 | return _mm256_andnot_si256( _mm256_castps_si256( mask.native ), a.native ); 192 | } 193 | 194 | template>> 195 | FS_FORCEINLINE i32<8, SIMD> MaskedIncrement( const typename i32<8, SIMD>::MaskTypeArg& mask, const i32<8, SIMD>& a ) 196 | { 197 | return _mm256_sub_epi32( a.native, _mm256_castps_si256( mask.native ) ); 198 | } 199 | 200 | template>> 201 | FS_FORCEINLINE i32<8, SIMD> MaskedDecrement( const typename i32<8, SIMD>::MaskTypeArg& mask, const i32<8, SIMD>& a ) 202 | { 203 | return _mm256_add_epi32( a.native, _mm256_castps_si256( mask.native ) ); 204 | } 205 | } 206 | -------------------------------------------------------------------------------- /include/FastSIMD/ToolSet/x86/256/m32x8.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace FS 6 | { 7 | template 8 | struct Register, 8, SIMD, std::enable_if_t> 9 | { 10 | static constexpr size_t ElementCount = 8; 11 | static constexpr auto FeatureFlags = SIMD; 12 | 13 | using NativeType = __m256; 14 | using ElementType = Mask<32, true>; 15 | using MaskType = 
Register; 16 | using MaskTypeArg = Register; 17 | 18 | FS_FORCEINLINE Register() = default; 19 | FS_FORCEINLINE Register( NativeType v ) : native( v ) { } 20 | 21 | FS_FORCEINLINE NativeType GetNative() const 22 | { 23 | return native; 24 | } 25 | 26 | FS_FORCEINLINE Register& operator &=( const Register& rhs ) 27 | { 28 | native = _mm256_and_ps( native, rhs.native ); 29 | return *this; 30 | } 31 | 32 | FS_FORCEINLINE Register& operator |=( const Register& rhs ) 33 | { 34 | native = _mm256_or_ps( native, rhs.native ); 35 | return *this; 36 | } 37 | 38 | FS_FORCEINLINE Register& operator ^=( const Register& rhs ) 39 | { 40 | native = _mm256_xor_ps( native, rhs.native ); 41 | return *this; 42 | } 43 | 44 | FS_FORCEINLINE Register operator ~() const 45 | { 46 | const __m256i neg1 = _mm256_set1_epi32( -1 ); 47 | return _mm256_xor_ps( native, _mm256_castsi256_ps( neg1 ) ); 48 | } 49 | 50 | NativeType native; 51 | }; 52 | 53 | template>> 54 | FS_FORCEINLINE m32<8, true, SIMD> BitwiseAndNot( const m32<8, true, SIMD>& a, const m32<8, true, SIMD>& b ) 55 | { 56 | return _mm256_andnot_ps( b.native, a.native ); 57 | } 58 | 59 | template>> 60 | FS_FORCEINLINE bool AnyMask( const m32<8, true, SIMD>& a ) 61 | { 62 | return _mm256_movemask_ps( a.native ); 63 | } 64 | 65 | template>> 66 | FS_FORCEINLINE BitStorage<8> BitMask( const m32<8, B, SIMD>& a ) 67 | { 68 | return static_cast>( _mm256_movemask_ps( a.native ) ); 69 | } 70 | 71 | 72 | template 73 | struct Register, 8, SIMD, std::enable_if_t> : Register, 8, SIMD> 74 | { 75 | static constexpr size_t ElementCount = 8; 76 | static constexpr auto FeatureFlags = SIMD; 77 | 78 | using NativeType = __m256i; 79 | using ElementType = Mask<32, false>; 80 | using MaskType = Register; 81 | using MaskTypeArg = Register, 8, SIMD>; 82 | 83 | FS_FORCEINLINE Register() = default; 84 | FS_FORCEINLINE Register( NativeType v ) : Register, 8, SIMD>( _mm256_castsi256_ps( v ) ) { } 85 | 86 | FS_FORCEINLINE NativeType GetNative() const 87 | { 88 | 
return _mm256_castps_si256( this->native ); 89 | } 90 | 91 | FS_FORCEINLINE Register& operator &=( const Register& rhs ) 92 | { 93 | this->native = _mm256_castsi256_ps( _mm256_and_si256( _mm256_castps_si256( this->native ), _mm256_castps_si256( rhs.native ) ) ); 94 | return *this; 95 | } 96 | 97 | FS_FORCEINLINE Register& operator |=( const Register& rhs ) 98 | { 99 | this->native = _mm256_castsi256_ps( _mm256_or_si256( _mm256_castps_si256( this->native ), _mm256_castps_si256( rhs.native ) ) ); 100 | return *this; 101 | } 102 | 103 | FS_FORCEINLINE Register& operator ^=( const Register& rhs ) 104 | { 105 | this->native = _mm256_castsi256_ps( _mm256_xor_si256( _mm256_castps_si256( this->native ), _mm256_castps_si256( rhs.native ) ) ); 106 | return *this; 107 | } 108 | 109 | FS_FORCEINLINE Register operator ~() const 110 | { 111 | const __m256i neg1 = _mm256_set1_epi32( -1 ); 112 | return _mm256_xor_si256( _mm256_castps_si256( this->native ), neg1 ); 113 | } 114 | }; 115 | 116 | template>> 117 | FS_FORCEINLINE m32<8, false, SIMD> BitwiseAndNot( const m32<8, false, SIMD>& a, const m32<8, false, SIMD>& b ) 118 | { 119 | return _mm256_andnot_si256( _mm256_castps_si256( b.native ), _mm256_castps_si256( a.native ) ); 120 | } 121 | 122 | template>> 123 | FS_FORCEINLINE bool AnyMask( const m32<8, false, SIMD>& a ) 124 | { 125 | return _mm256_movemask_epi8( _mm256_castps_si256( a.native ) ); 126 | } 127 | } 128 | -------------------------------------------------------------------------------- /include/FastSIMD/ToolSet/x86/512/f32x16.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace FS 6 | { 7 | template 8 | struct Register> 9 | { 10 | static constexpr size_t ElementCount = 16; 11 | static constexpr auto FeatureFlags = SIMD; 12 | 13 | using NativeType = __m512; 14 | using ElementType = float; 15 | using MaskType = m32; 16 | using MaskTypeArg = m32; 17 | 18 | FS_FORCEINLINE Register() = 
default; 19 | FS_FORCEINLINE Register( NativeType v ) : native( v ) { } 20 | FS_FORCEINLINE Register( float v ) : native( _mm512_set1_ps( v ) ) { } 21 | 22 | FS_FORCEINLINE NativeType GetNative() const 23 | { 24 | return native; 25 | } 26 | 27 | FS_FORCEINLINE Register& operator +=( const Register& rhs ) 28 | { 29 | native = _mm512_add_ps( native, rhs.native ); 30 | return *this; 31 | } 32 | 33 | FS_FORCEINLINE Register& operator -=( const Register& rhs ) 34 | { 35 | native = _mm512_sub_ps( native, rhs.native ); 36 | return *this; 37 | } 38 | 39 | FS_FORCEINLINE Register& operator *=( const Register& rhs ) 40 | { 41 | native = _mm512_mul_ps( native, rhs.native ); 42 | return *this; 43 | } 44 | 45 | FS_FORCEINLINE Register& operator /=( const Register& rhs ) 46 | { 47 | native = _mm512_div_ps( native, rhs.native ); 48 | return *this; 49 | } 50 | 51 | FS_FORCEINLINE Register& operator &=( const Register& rhs ) 52 | { 53 | native = _mm512_and_ps( native, rhs.native ); 54 | return *this; 55 | } 56 | 57 | FS_FORCEINLINE Register& operator |=( const Register& rhs ) 58 | { 59 | native = _mm512_or_ps( native, rhs.native ); 60 | return *this; 61 | } 62 | 63 | FS_FORCEINLINE Register& operator ^=( const Register& rhs ) 64 | { 65 | native = _mm512_xor_ps( native, rhs.native ); 66 | return *this; 67 | } 68 | 69 | FS_FORCEINLINE Register operator~() const 70 | { 71 | const __m512i neg1 = _mm512_set1_epi32( -1 ); 72 | return _mm512_xor_ps( native, _mm512_castsi512_ps( neg1 ) ); 73 | } 74 | 75 | FS_FORCEINLINE Register operator-() const 76 | { 77 | const __m512i minInt = _mm512_set1_epi32( 0x80000000 ); 78 | return _mm512_xor_ps( native, _mm512_castsi512_ps( minInt ) ); 79 | } 80 | 81 | 82 | FS_FORCEINLINE MaskType operator ==( const Register& rhs ) const 83 | { 84 | return _mm512_cmp_ps_mask( native, rhs.native, _CMP_EQ_OQ ); 85 | } 86 | 87 | FS_FORCEINLINE MaskType operator !=( const Register& rhs ) const 88 | { 89 | return _mm512_cmp_ps_mask( native, rhs.native, _CMP_NEQ_OQ ); 
90 | } 91 | 92 | FS_FORCEINLINE MaskType operator >=( const Register& rhs ) const 93 | { 94 | return _mm512_cmp_ps_mask( native, rhs.native, _CMP_GE_OQ ); 95 | } 96 | 97 | FS_FORCEINLINE MaskType operator <=( const Register& rhs ) const 98 | { 99 | return _mm512_cmp_ps_mask( native, rhs.native, _CMP_LE_OQ ); 100 | } 101 | 102 | FS_FORCEINLINE MaskType operator >( const Register& rhs ) const 103 | { 104 | return _mm512_cmp_ps_mask( native, rhs.native, _CMP_GT_OQ ); 105 | } 106 | 107 | FS_FORCEINLINE MaskType operator <( const Register& rhs ) const 108 | { 109 | return _mm512_cmp_ps_mask( native, rhs.native, _CMP_LT_OQ ); 110 | } 111 | 112 | NativeType native; 113 | }; 114 | 115 | 116 | template>> 117 | FS_FORCEINLINE f32<16, SIMD> Load( TypeWrapper ptr ) 118 | { 119 | return _mm512_loadu_ps( ptr.value ); 120 | } 121 | 122 | template>> 123 | FS_FORCEINLINE void Store( typename f32<16, SIMD>::ElementType* ptr, const f32<16, SIMD>& a ) 124 | { 125 | _mm512_storeu_ps( ptr, a.native ); 126 | } 127 | 128 | template>> 129 | FS_FORCEINLINE float Extract0( const f32<16, SIMD>& a ) 130 | { 131 | return _mm512_cvtss_f32( a.native ); 132 | } 133 | 134 | template>> 135 | FS_FORCEINLINE f32<16, SIMD> Abs( const f32<16, SIMD>& a ) 136 | { 137 | return _mm512_abs_ps( a.native ); 138 | } 139 | 140 | template>> 141 | FS_FORCEINLINE f32<16, SIMD> Round( const f32<16, SIMD>& a ) 142 | { 143 | return _mm512_roundscale_ps( a.native, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC ); 144 | } 145 | 146 | template>> 147 | FS_FORCEINLINE f32<16, SIMD> Floor( const f32<16, SIMD>& a ) 148 | { 149 | return _mm512_roundscale_ps( a.native, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC ); 150 | } 151 | 152 | template>> 153 | FS_FORCEINLINE f32<16, SIMD> Ceil( const f32<16, SIMD>& a ) 154 | { 155 | return _mm512_roundscale_ps( a.native, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC ); 156 | } 157 | 158 | template>> 159 | FS_FORCEINLINE f32<16, SIMD> Trunc( const f32<16, SIMD>& a ) 160 | { 161 | return 
_mm512_roundscale_ps( a.native, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC ); 162 | } 163 | 164 | template>> 165 | FS_FORCEINLINE f32<16, SIMD> Min( const f32<16, SIMD>& a, const f32<16, SIMD>& b ) 166 | { 167 | return _mm512_min_ps( a.native, b.native ); 168 | } 169 | 170 | template>> 171 | FS_FORCEINLINE f32<16, SIMD> Max( const f32<16, SIMD>& a, const f32<16, SIMD>& b ) 172 | { 173 | return _mm512_max_ps( a.native, b.native ); 174 | } 175 | 176 | template>> 177 | FS_FORCEINLINE f32<16, SIMD> Select( const typename f32<16, SIMD>::MaskTypeArg& mask, const f32<16, SIMD>& ifTrue, const f32<16, SIMD>& ifFalse ) 178 | { 179 | return _mm512_mask_blend_ps( mask.native, ifFalse.native, ifTrue.native ); 180 | } 181 | 182 | template>> 183 | FS_FORCEINLINE f32<16, SIMD> BitwiseAndNot( const f32<16, SIMD>& a, const f32<16, SIMD>& b ) 184 | { 185 | return _mm512_andnot_ps( b.native, a.native ); 186 | } 187 | 188 | template>> 189 | FS_FORCEINLINE f32<16, SIMD> Masked( const typename f32<16, SIMD>::MaskTypeArg& mask, const f32<16, SIMD>& a ) 190 | { 191 | return _mm512_maskz_mov_ps( mask.native, a.native ); 192 | } 193 | 194 | template>> 195 | FS_FORCEINLINE f32<16, SIMD> InvMasked( const typename f32<16, SIMD>::MaskTypeArg& mask, const f32<16, SIMD>& a ) 196 | { 197 | return _mm512_maskz_mov_ps( ~mask.native, a.native ); 198 | } 199 | 200 | template>> 201 | FS_FORCEINLINE f32<16, SIMD> MaskedAdd( const typename f32<16, SIMD>::MaskTypeArg& mask, const f32<16, SIMD>& a, const f32<16, SIMD>& b ) 202 | { 203 | return _mm512_mask_add_ps( a.native, mask.native, a.native, b.native ); 204 | } 205 | 206 | template>> 207 | FS_FORCEINLINE f32<16, SIMD> MaskedSub( const typename f32<16, SIMD>::MaskTypeArg& mask, const f32<16, SIMD>& a, const f32<16, SIMD>& b ) 208 | { 209 | return _mm512_mask_sub_ps( a.native, mask.native, a.native, b.native ); 210 | } 211 | 212 | template>> 213 | FS_FORCEINLINE f32<16, SIMD> MaskedMul( const typename f32<16, SIMD>::MaskTypeArg& mask, const f32<16, SIMD>& a, 
const f32<16, SIMD>& b ) 214 | { 215 | return _mm512_mask_mul_ps( a.native, mask.native, a.native, b.native ); 216 | } 217 | 218 | template>> 219 | FS_FORCEINLINE f32<16, SIMD> InvMaskedAdd( const typename f32<16, SIMD>::MaskTypeArg& mask, const f32<16, SIMD>& a, const f32<16, SIMD>& b ) 220 | { 221 | return _mm512_mask_add_ps( a.native, ~mask.native, a.native, b.native ); 222 | } 223 | 224 | template>> 225 | FS_FORCEINLINE f32<16, SIMD> InvMaskedSub( const typename f32<16, SIMD>::MaskTypeArg& mask, const f32<16, SIMD>& a, const f32<16, SIMD>& b ) 226 | { 227 | return _mm512_mask_sub_ps( a.native, ~mask.native, a.native, b.native ); 228 | } 229 | 230 | template>> 231 | FS_FORCEINLINE f32<16, SIMD> InvMaskedMul( const typename f32<16, SIMD>::MaskTypeArg& mask, const f32<16, SIMD>& a, const f32<16, SIMD>& b ) 232 | { 233 | return _mm512_mask_mul_ps( a.native, ~mask.native, a.native, b.native ); 234 | } 235 | 236 | template>, typename = EnableIfRelaxed()> 237 | FS_FORCEINLINE f32<16, SIMD> Reciprocal( const f32<16, SIMD>& a ) 238 | { 239 | return _mm512_rcp14_ps( a.native ); 240 | } 241 | 242 | template>, typename = EnableIfRelaxed()> 243 | FS_FORCEINLINE f32<16, SIMD> InvSqrt( const f32<16, SIMD>& a ) 244 | { 245 | return _mm512_rsqrt14_ps( a.native ); 246 | } 247 | 248 | template>> 249 | FS_FORCEINLINE f32<16, SIMD> Sqrt( const f32<16, SIMD>& a ) 250 | { 251 | return _mm512_sqrt_ps( a.native ); 252 | } 253 | 254 | template>, typename = EnableIfRelaxed()> 255 | FS_FORCEINLINE f32<16, SIMD> FMulAdd( const f32<16, SIMD>& a, const f32<16, SIMD>& b, const f32<16, SIMD>& c ) 256 | { 257 | return _mm512_fmadd_ps( a.native, b.native, c.native ); 258 | } 259 | 260 | template>, typename = EnableIfRelaxed()> 261 | FS_FORCEINLINE f32<16, SIMD> FMulSub( const f32<16, SIMD>& a, const f32<16, SIMD>& b, const f32<16, SIMD>& c ) 262 | { 263 | return _mm512_fmsub_ps( a.native, b.native, c.native ); 264 | } 265 | 266 | template>, typename = EnableIfRelaxed()> 267 | FS_FORCEINLINE 
f32<16, SIMD> FNMulAdd( const f32<16, SIMD>& a, const f32<16, SIMD>& b, const f32<16, SIMD>& c ) 268 | { 269 | return _mm512_fnmadd_ps( a.native, b.native, c.native ); 270 | } 271 | 272 | template>, typename = EnableIfRelaxed()> 273 | FS_FORCEINLINE f32<16, SIMD> FNMulSub( const f32<16, SIMD>& a, const f32<16, SIMD>& b, const f32<16, SIMD>& c ) 274 | { 275 | return _mm512_fnmsub_ps( a.native, b.native, c.native ); 276 | } 277 | } -------------------------------------------------------------------------------- /include/FastSIMD/ToolSet/x86/512/i32x16.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace FS 6 | { 7 | template 8 | struct Register> 9 | { 10 | static constexpr size_t ElementCount = 16; 11 | static constexpr auto FeatureFlags = SIMD; 12 | 13 | using NativeType = __m512i; 14 | using ElementType = std::int32_t; 15 | using MaskType = m32; 16 | using MaskTypeArg = m32; 17 | 18 | FS_FORCEINLINE Register() = default; 19 | FS_FORCEINLINE Register( NativeType v ) : native( v ) { } 20 | FS_FORCEINLINE Register( std::int32_t v ) : native( _mm512_set1_epi32( v ) ) { } 21 | 22 | FS_FORCEINLINE NativeType GetNative() const 23 | { 24 | return native; 25 | } 26 | 27 | FS_FORCEINLINE Register& operator +=( const Register& rhs ) 28 | { 29 | native = _mm512_add_epi32( native, rhs.native ); 30 | return *this; 31 | } 32 | 33 | FS_FORCEINLINE Register& operator -=( const Register& rhs ) 34 | { 35 | native = _mm512_sub_epi32( native, rhs.native ); 36 | return *this; 37 | } 38 | 39 | FS_FORCEINLINE Register& operator *=( const Register& rhs ) 40 | { 41 | native = _mm512_mullo_epi32( native, rhs.native ); 42 | return *this; 43 | } 44 | 45 | FS_FORCEINLINE Register& operator &=( const Register& rhs ) 46 | { 47 | native = _mm512_and_si512( native, rhs.native ); 48 | return *this; 49 | } 50 | 51 | FS_FORCEINLINE Register& operator |=( const Register& rhs ) 52 | { 53 | native = _mm512_or_si512( 
native, rhs.native ); 54 | return *this; 55 | } 56 | 57 | FS_FORCEINLINE Register& operator ^=( const Register& rhs ) 58 | { 59 | native = _mm512_xor_si512( native, rhs.native ); 60 | return *this; 61 | } 62 | 63 | FS_FORCEINLINE Register& operator >>=( int rhs ) 64 | { 65 | native = _mm512_srai_epi32( native, rhs ); 66 | return *this; 67 | } 68 | 69 | FS_FORCEINLINE Register& operator <<=( int rhs ) 70 | { 71 | native = _mm512_slli_epi32( native, rhs ); 72 | return *this; 73 | } 74 | 75 | FS_FORCEINLINE Register operator ~() const 76 | { 77 | const __m512i neg1 = _mm512_set1_epi32( -1 ); 78 | return _mm512_xor_si512( native, neg1 ); 79 | } 80 | 81 | FS_FORCEINLINE Register operator -() const 82 | { 83 | return _mm512_sub_epi32( _mm512_setzero_si512(), native ); 84 | } 85 | 86 | 87 | FS_FORCEINLINE MaskType operator ==( const Register& rhs ) const 88 | { 89 | return _mm512_cmp_epi32_mask( native, rhs.native, _MM_CMPINT_EQ ); 90 | } 91 | 92 | FS_FORCEINLINE MaskType operator !=( const Register& rhs ) const 93 | { 94 | return _mm512_cmp_epi32_mask( native, rhs.native, _MM_CMPINT_NE ); 95 | } 96 | 97 | FS_FORCEINLINE MaskType operator >=( const Register& rhs ) const 98 | { 99 | return _mm512_cmp_epi32_mask( native, rhs.native, _MM_CMPINT_NLT ); 100 | } 101 | 102 | FS_FORCEINLINE MaskType operator <=( const Register& rhs ) const 103 | { 104 | return _mm512_cmp_epi32_mask( native, rhs.native, _MM_CMPINT_LE ); 105 | } 106 | 107 | FS_FORCEINLINE MaskType operator >( const Register& rhs ) const 108 | { 109 | return _mm512_cmp_epi32_mask( native, rhs.native, _MM_CMPINT_NLE ); 110 | } 111 | 112 | FS_FORCEINLINE MaskType operator <( const Register& rhs ) const 113 | { 114 | return _mm512_cmp_epi32_mask( native, rhs.native, _MM_CMPINT_LT ); 115 | } 116 | 117 | NativeType native; 118 | }; 119 | 120 | 121 | template>> 122 | FS_FORCEINLINE i32<16, SIMD> Load( TypeWrapper ptr ) 123 | { 124 | return _mm512_loadu_si512( (const __m512i*)ptr.value ); 125 | } 126 | 127 | template>> 128 
| FS_FORCEINLINE void Store( typename i32<16, SIMD>::ElementType* ptr, const i32<16, SIMD>& a ) 129 | { 130 | _mm512_storeu_si512( (__m512i*)ptr, a.native ); 131 | } 132 | 133 | template>> 134 | FS_FORCEINLINE int32_t Extract0( const i32<16, SIMD>& a ) 135 | { 136 | return _mm512_cvtsi512_si32( a.native ); 137 | } 138 | 139 | template>> 140 | FS_FORCEINLINE i32<16, SIMD> Abs( const i32<16, SIMD>& a ) 141 | { 142 | return _mm512_abs_epi32( a.native ); 143 | } 144 | 145 | template>> 146 | FS_FORCEINLINE i32<16, SIMD> Min( const i32<16, SIMD>& a, const i32<16, SIMD>& b ) 147 | { 148 | return _mm512_min_epi32( a.native, b.native ); 149 | } 150 | 151 | template>> 152 | FS_FORCEINLINE i32<16, SIMD> Max( const i32<16, SIMD>& a, const i32<16, SIMD>& b ) 153 | { 154 | return _mm512_max_epi32( a.native, b.native ); 155 | } 156 | 157 | template>> 158 | FS_FORCEINLINE i32<16, SIMD> Select( const typename i32<16, SIMD>::MaskTypeArg& mask, const i32<16, SIMD>& ifTrue, const i32<16, SIMD>& ifFalse ) 159 | { 160 | return _mm512_mask_blend_epi32( mask.native, ifFalse.native, ifTrue.native ); 161 | } 162 | 163 | template>> 164 | FS_FORCEINLINE i32<16, SIMD> BitwiseAndNot( const i32<16, SIMD>& a, const i32<16, SIMD>& b ) 165 | { 166 | return _mm512_andnot_si512( b.native, a.native ); 167 | } 168 | 169 | template>> 170 | FS_FORCEINLINE i32<16, SIMD> BitShiftRightZeroExtend( const i32<16, SIMD>& a, int b ) 171 | { 172 | return _mm512_srli_epi32( a.native, b ); 173 | } 174 | 175 | 176 | template>> 177 | FS_FORCEINLINE i32<16, SIMD> Masked( const typename i32<16, SIMD>::MaskTypeArg& mask, const i32<16, SIMD>& a ) 178 | { 179 | return _mm512_maskz_mov_epi32( mask.native, a.native ); 180 | } 181 | 182 | template>> 183 | FS_FORCEINLINE i32<16, SIMD> InvMasked( const typename i32<16, SIMD>::MaskTypeArg& mask, const i32<16, SIMD>& a ) 184 | { 185 | return _mm512_maskz_mov_epi32( ~mask.native, a.native ); 186 | } 187 | 188 | 189 | template>> 190 | FS_FORCEINLINE i32<16, SIMD> MaskedAdd( const 
typename i32<16, SIMD>::MaskTypeArg& mask, const i32<16, SIMD>& a, const i32<16, SIMD>& b ) 191 | { 192 | return _mm512_mask_add_epi32( a.native, mask.native, a.native, b.native ); 193 | } 194 | 195 | template>> 196 | FS_FORCEINLINE i32<16, SIMD> MaskedSub( const typename i32<16, SIMD>::MaskTypeArg& mask, const i32<16, SIMD>& a, const i32<16, SIMD>& b ) 197 | { 198 | return _mm512_mask_sub_epi32( a.native, mask.native, a.native, b.native ); 199 | } 200 | 201 | template>> 202 | FS_FORCEINLINE i32<16, SIMD> MaskedMul( const typename i32<16, SIMD>::MaskTypeArg& mask, const i32<16, SIMD>& a, const i32<16, SIMD>& b ) 203 | { 204 | return _mm512_mask_mullo_epi32( a.native, mask.native, a.native, b.native ); 205 | } 206 | 207 | template>> 208 | FS_FORCEINLINE i32<16, SIMD> InvMaskedAdd( const typename i32<16, SIMD>::MaskTypeArg& mask, const i32<16, SIMD>& a, const i32<16, SIMD>& b ) 209 | { 210 | return _mm512_mask_add_epi32( a.native, ~mask.native, a.native, b.native ); 211 | } 212 | 213 | template>> 214 | FS_FORCEINLINE i32<16, SIMD> InvMaskedSub( const typename i32<16, SIMD>::MaskTypeArg& mask, const i32<16, SIMD>& a, const i32<16, SIMD>& b ) 215 | { 216 | return _mm512_mask_sub_epi32( a.native, ~mask.native, a.native, b.native ); 217 | } 218 | 219 | template>> 220 | FS_FORCEINLINE i32<16, SIMD> InvMaskedMul( const typename i32<16, SIMD>::MaskTypeArg& mask, const i32<16, SIMD>& a, const i32<16, SIMD>& b ) 221 | { 222 | return _mm512_mask_mullo_epi32( a.native, ~mask.native, a.native, b.native ); 223 | } 224 | } -------------------------------------------------------------------------------- /include/FastSIMD/ToolSet/x86/512/mNx16.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace FS 6 | { 7 | namespace impl 8 | { 9 | struct AVX512MaskBase 10 | { 11 | __mmask16 native; 12 | }; 13 | } 14 | 15 | template 16 | struct Register, 16, SIMD, std::enable_if_t> 17 | : std::conditional_t, 16, 
SIMD>> 18 | { 19 | static constexpr size_t ElementCount = 16; 20 | static constexpr auto FeatureFlags = SIMD; 21 | 22 | using NativeType = decltype(AVX512MaskBase::native); 23 | using ElementType = Mask; 24 | using MaskType = Register; 25 | using MaskTypeArg = Register; 26 | 27 | FS_FORCEINLINE Register() = default; 28 | FS_FORCEINLINE Register( NativeType v ) { this->native = v; } 29 | 30 | FS_FORCEINLINE NativeType GetNative() const 31 | { 32 | return this->native; 33 | } 34 | 35 | FS_FORCEINLINE Register& operator &=( const Register& rhs ) 36 | { 37 | this->native = ( this->native & rhs.native ); 38 | return *this; 39 | } 40 | 41 | FS_FORCEINLINE Register& operator |=( const Register& rhs ) 42 | { 43 | this->native = ( this->native | rhs.native ); 44 | return *this; 45 | } 46 | 47 | FS_FORCEINLINE Register& operator ^=( const Register& rhs ) 48 | { 49 | this->native = ( this->native ^ rhs.native ); 50 | return *this; 51 | } 52 | 53 | FS_FORCEINLINE Register operator ~() const 54 | { 55 | return ~this->native; 56 | } 57 | }; 58 | 59 | template, 16, SIMD>>> 60 | FS_FORCEINLINE bool AnyMask( const Register, 16, SIMD>& a ) 61 | { 62 | return (bool)a.native; 63 | } 64 | 65 | template, 16, SIMD>>> 66 | FS_FORCEINLINE BitStorage<16> BitMask( const Register, 16, SIMD>& a ) 67 | { 68 | return static_cast>( a.native ); 69 | } 70 | } -------------------------------------------------------------------------------- /include/FastSIMD/ToolSet/x86/AVX.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | 4 | #if FASTSIMD_MAX_FEATURE_VALUE() >= FASTSIMD_FEATURE_VALUE( AVX512 ) 5 | #include "AVX512.h" 6 | #endif 7 | 8 | #include "256/f32x8.h" 9 | #include "256/i32x8.h" 10 | #include "256/m32x8.h" 11 | 12 | namespace FS 13 | { 14 | template>> 15 | FS_FORCEINLINE i32<8, SIMD> Convert( const f32<8, SIMD>& a, TypeDummy ) 16 | { 17 | return _mm256_cvtps_epi32( a.native ); 18 | } 19 | 20 | template>> 21 | FS_FORCEINLINE 
f32<8, SIMD> Convert( const i32<8, SIMD>& a, TypeDummy ) 22 | { 23 | return _mm256_cvtepi32_ps( a.native ); 24 | } 25 | 26 | template>> 27 | FS_FORCEINLINE Register Cast( const Register& a, TypeDummy ) 28 | { 29 | if constexpr( 30 | std::is_same_v::NativeType, __m256> && 31 | std::is_same_v::NativeType, __m256i> ) 32 | { 33 | return _mm256_castps_si256( a.GetNative() ); 34 | } 35 | else if constexpr( 36 | std::is_same_v::NativeType, __m256i> && 37 | std::is_same_v::NativeType, __m256> ) 38 | { 39 | return _mm256_castsi256_ps( a.GetNative() ); 40 | } 41 | else 42 | { 43 | return a.GetNative(); 44 | } 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /include/FastSIMD/ToolSet/x86/AVX512.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | 4 | #include "512/f32x16.h" 5 | #include "512/i32x16.h" 6 | #include "512/mNx16.h" 7 | 8 | namespace FS 9 | { 10 | template>> 11 | FS_FORCEINLINE i32<16, SIMD> Convert( const f32<16, SIMD>& a, TypeDummy ) 12 | { 13 | return _mm512_cvtps_epi32( a.native ); 14 | } 15 | 16 | template>> 17 | FS_FORCEINLINE f32<16, SIMD> Convert( const i32<16, SIMD>& a, TypeDummy ) 18 | { 19 | return _mm512_cvtepi32_ps( a.native ); 20 | } 21 | 22 | template>> 23 | FS_FORCEINLINE Register Cast( const Register& a, TypeDummy ) 24 | { 25 | if constexpr( 26 | std::is_same_v::NativeType, __m512> && 27 | std::is_same_v::NativeType, __m512i> ) 28 | { 29 | return _mm512_castps_si512( a.GetNative() ); 30 | } 31 | else if constexpr( 32 | std::is_same_v::NativeType, __m512i> && 33 | std::is_same_v::NativeType, __m512> ) 34 | { 35 | return _mm512_castsi512_ps( a.GetNative() ); 36 | } 37 | else 38 | { 39 | return a.GetNative(); 40 | } 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /include/FastSIMD/ToolSet/x86/SSE.h: -------------------------------------------------------------------------------- 1 | 
#pragma once 2 | #include 3 | 4 | #if FASTSIMD_MAX_FEATURE_VALUE() >= FASTSIMD_FEATURE_VALUE( AVX ) 5 | #include "AVX.h" 6 | #endif 7 | 8 | #include "128/f32x4.h" 9 | #include "128/i32x4.h" 10 | #include "128/m32x4.h" 11 | 12 | namespace FS 13 | { 14 | template>> 15 | FS_FORCEINLINE i32<4, SIMD> Convert( const f32<4, SIMD>& a, TypeDummy ) 16 | { 17 | return _mm_cvtps_epi32( a.native ); 18 | } 19 | 20 | template>> 21 | FS_FORCEINLINE f32<4, SIMD> Convert( const i32<4, SIMD>& a, TypeDummy ) 22 | { 23 | return _mm_cvtepi32_ps( a.native ); 24 | } 25 | 26 | template>> 27 | FS_FORCEINLINE Register Cast( const Register& a, TypeDummy ) 28 | { 29 | if constexpr( 30 | std::is_same_v::NativeType, __m128> && 31 | std::is_same_v::NativeType, __m128i> ) 32 | { 33 | return _mm_castps_si128( a.GetNative() ); 34 | } 35 | else if constexpr( 36 | std::is_same_v::NativeType, __m128i> && 37 | std::is_same_v::NativeType, __m128> ) 38 | { 39 | return _mm_castsi128_ps( a.GetNative() ); 40 | } 41 | else 42 | { 43 | return a.GetNative(); 44 | } 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /include/FastSIMD/ToolSet/x86/x86.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #if FASTSIMD_MAX_FEATURE_VALUE() >= FASTSIMD_FEATURE_VALUE( SSE ) 4 | #include "SSE.h" 5 | #endif 6 | -------------------------------------------------------------------------------- /include/FastSIMD/Utility/ArchDetect.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | //Based on the Qt processor detection code, so should be very accurate 3 | //https: code.qt.io/cgit/qt/qtbase.git/tree/src/corelib/global/qprocessordetection.h 4 | 5 | #define FASTSIMD_ARCH_VALUE_X86() 1 6 | #define FASTSIMD_ARCH_VALUE_ARM() 2 7 | #define FASTSIMD_ARCH_VALUE_WASM() 3 8 | 9 | #define FASTSIMD_FEATURE_VALUE_SCALAR() 1 10 | 11 | // -- Web Assembly -- 12 | #if defined( 
__EMSCRIPTEN__ ) || defined( EMSCRIPTEN ) 13 | 14 | #define FASTSIMD_FEATURE_VALUE_WASM() 2 15 | 16 | #define FASTSIMD_FEATURE_DETECT() WASM 17 | #define FASTSIMD_ARCH_DETECT() WASM 18 | 19 | // -- ARM -- 20 | #elif defined( __arm__ ) || defined( __TARGET_ARCH_ARM ) || defined( _M_ARM ) || defined( _M_ARM64 ) || defined( __aarch64__ ) || defined( __ARM64__ ) 21 | 22 | #define FASTSIMD_FEATURE_VALUE_NEON() 2 23 | #define FASTSIMD_FEATURE_VALUE_AARCH64() 3 24 | 25 | #if defined( __ARM64_ARCH_8__ ) || defined( __aarch64__ ) || defined( __ARMv8__ ) || defined( __ARMv8_A__ ) || defined( _M_ARM64 ) || defined( __ARM_NEON__ ) 26 | #define FASTSIMD_FEATURE_DETECT() AARCH64 27 | //#elif defined( __ARM_ARCH_7__ ) || defined( __ARM_ARCH_7A__ ) || defined( __ARM_ARCH_7R__ ) || defined( __ARM_ARCH_7M__ ) || defined( __ARM_ARCH_7S__ ) || defined( _ARM_ARCH_7 ) || defined( __CORE_CORTEXA__ ) 28 | //#define FASTSIMD_ARCH_ARM() 7 29 | //#elif defined( __ARM_ARCH_6__ ) || defined( __ARM_ARCH_6J__ ) || defined( __ARM_ARCH_6T2__ ) || defined( __ARM_ARCH_6Z__ ) || defined( __ARM_ARCH_6K__ ) || defined( __ARM_ARCH_6ZK__ ) || defined( __ARM_ARCH_6M__ ) 30 | //#define FASTSIMD_ARCH_ARM() 6 31 | //#elif defined( __ARM_ARCH_5TEJ__ ) || defined( __ARM_ARCH_5TE__ ) 32 | //#define FASTSIMD_ARCH_ARM() 5 33 | #else 34 | #define FASTSIMD_FEATURE_DETECT() NEON 35 | #endif 36 | 37 | #define FASTSIMD_ARCH_DETECT() ARM 38 | 39 | // -- x86 -- 40 | #elif defined( __i386 ) || defined( __i386__ ) || defined( _M_IX86 ) || defined( __x86_64 ) || defined( __x86_64__ ) || defined( __amd64 ) || defined( _M_X64 ) 41 | 42 | #define FASTSIMD_FEATURE_VALUE_SSE() 2 43 | #define FASTSIMD_FEATURE_VALUE_SSE2() 3 44 | #define FASTSIMD_FEATURE_VALUE_SSE3() 4 45 | #define FASTSIMD_FEATURE_VALUE_SSSE3() 5 46 | #define FASTSIMD_FEATURE_VALUE_SSE41() 6 47 | #define FASTSIMD_FEATURE_VALUE_SSE42() 7 48 | #define FASTSIMD_FEATURE_VALUE_AVX() 8 49 | #define FASTSIMD_FEATURE_VALUE_AVX2() 9 50 | #define 
FASTSIMD_FEATURE_VALUE_AVX512() 10 51 | 52 | #if defined( __AVX512F__ ) && defined( __AVX512VL__ ) && defined( __AVX512BW__ ) && defined( __AVX512DQ__ ) 53 | #define FASTSIMD_FEATURE_DETECT() AVX512 54 | #elif defined( __AVX2__ ) 55 | #define FASTSIMD_FEATURE_DETECT() AVX2 56 | #elif defined( __AVX__ ) 57 | #define FASTSIMD_FEATURE_DETECT() AVX 58 | #elif defined( __SSE4_2__ ) 59 | #define FASTSIMD_FEATURE_DETECT() SSE42 60 | #elif defined( __SSE4_1__ ) 61 | #define FASTSIMD_FEATURE_DETECT() SSE41 62 | #elif defined( __SSSE3__ ) 63 | #define FASTSIMD_FEATURE_DETECT() SSSE3 64 | #elif defined( __SSE3__ ) 65 | #define FASTSIMD_FEATURE_DETECT() SSE3 66 | #elif defined( __SSE2__ ) || defined( __x86_64 ) || defined( __x86_64__ ) || defined( __amd64 ) || defined( _M_X64 ) || ( defined( _M_IX86_FP ) && _M_IX86_FP == 2 ) 67 | #define FASTSIMD_FEATURE_DETECT() SSE2 68 | #elif defined( __SSE__ ) || defined( __i686__ ) || defined( __athlon__ ) || defined( __pentiumpro__ ) || ( defined( _M_IX86_FP ) && _M_IX86_FP == 1 ) 69 | #define FASTSIMD_FEATURE_DETECT() SSE2 70 | #else 71 | #define FASTSIMD_FEATURE_DETECT() SCALAR 72 | #endif 73 | 74 | #define FASTSIMD_ARCH_DETECT() X86 75 | #endif 76 | 77 | 78 | #define FASTSIMD_ARCH_VALUE( arch ) FASTSIMD_ARCH_VALUE_IMPL( arch ) 79 | #define FASTSIMD_ARCH_VALUE_IMPL( arch ) FASTSIMD_ARCH_VALUE_##arch() 80 | 81 | #define FASTSIMD_CURRENT_ARCH_IS( arch ) FASTSIMD_ARCH_VALUE( FASTSIMD_ARCH_DETECT() ) == FASTSIMD_ARCH_VALUE( arch ) 82 | 83 | 84 | #ifndef FASTSIMD_MAX_FEATURE_SET 85 | #define FASTSIMD_MAX_FEATURE_SET FASTSIMD_FEATURE_DETECT() 86 | #endif 87 | #ifndef FASTSIMD_DEFAULT_FEATURE_SET 88 | #define FASTSIMD_DEFAULT_FEATURE_SET FASTSIMD_MAX_FEATURE_SET 89 | #endif 90 | 91 | #define FASTSIMD_FEATURE_VALUE( feature ) FASTSIMD_FEATURE_VALUE_IMPL( feature ) 92 | #define FASTSIMD_FEATURE_VALUE_IMPL( feature ) FASTSIMD_FEATURE_VALUE_##feature() 93 | 94 | #define FASTSIMD_DEFAULT_FEATURE_VALUE() FASTSIMD_FEATURE_VALUE( 
FASTSIMD_DEFAULT_FEATURE_SET ) 95 | #define FASTSIMD_MAX_FEATURE_VALUE() FASTSIMD_FEATURE_VALUE( FASTSIMD_MAX_FEATURE_SET ) 96 | #define FASTSIMD_ARCH_NAME() FASTSIMD_ARCH_DETECT()=FASTSIMD_MAX_FEATURE_SET 97 | 98 | #ifndef FASTSIMD_IS_RELAXED 99 | #define FASTSIMD_IS_RELAXED 0 100 | #endif 101 | -------------------------------------------------------------------------------- /include/FastSIMD/Utility/Export.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #if !defined( FASTSIMD_STATIC_LIB ) && ( defined( _WIN32 ) || defined( __CYGWIN__ ) ) 4 | #ifdef FASTSIMD_EXPORT 5 | #define FASTSIMD_API __declspec( dllexport ) 6 | #else 7 | #define FASTSIMD_API __declspec( dllimport ) 8 | #endif 9 | #else 10 | #define FASTSIMD_API 11 | #endif 12 | -------------------------------------------------------------------------------- /include/FastSIMD/Utility/FeatureEnums.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "Export.h" 3 | 4 | #include 5 | 6 | namespace FastSIMD 7 | { 8 | enum class FeatureFlag 9 | { 10 | Scalar, 11 | 12 | x86, 13 | SSE, 14 | SSE2, 15 | SSE3, 16 | SSSE3, 17 | SSE41, 18 | SSE42, 19 | AVX, 20 | AVX2, 21 | AVX512_F, 22 | AVX512_VL, 23 | AVX512_DQ, 24 | AVX512_BW, 25 | 26 | ARM, 27 | NEON, 28 | AARCH64, 29 | 30 | WASM, 31 | }; 32 | 33 | constexpr std::uint32_t operator |( FeatureFlag a, FeatureFlag b ) 34 | { 35 | return 1U << static_cast(a) | 1U << static_cast(b); 36 | } 37 | 38 | constexpr std::uint32_t operator |( std::uint32_t a, FeatureFlag b ) 39 | { 40 | return a | 1U << static_cast(b); 41 | } 42 | 43 | enum class FeatureSet : std::uint32_t 44 | { 45 | Invalid, 46 | 47 | SCALAR = Invalid | FeatureFlag::Scalar, 48 | 49 | SSE = FeatureFlag::x86 | FeatureFlag::SSE, 50 | SSE2 = SSE | FeatureFlag::SSE2, 51 | SSE3 = SSE2 | FeatureFlag::SSE3, 52 | SSSE3 = SSE3 | FeatureFlag::SSSE3, 53 | SSE41 = SSSE3 | FeatureFlag::SSE41, 54 | SSE42 = 
SSE41 | FeatureFlag::SSE42, 55 | AVX = SSE42 | FeatureFlag::AVX, 56 | AVX2 = AVX | FeatureFlag::AVX2, 57 | AVX512 = AVX2 | FeatureFlag::AVX512_F | FeatureFlag::AVX512_VL | FeatureFlag::AVX512_DQ | FeatureFlag::AVX512_BW, 58 | 59 | NEON = FeatureFlag::ARM | FeatureFlag::NEON, 60 | AARCH64 = NEON | FeatureFlag::AARCH64, 61 | 62 | WASM = Invalid | FeatureFlag::WASM, 63 | 64 | Max = ~0U 65 | }; 66 | 67 | constexpr bool operator &( FeatureSet a, FeatureFlag b ) 68 | { 69 | return static_cast(a) & 1U << static_cast(b); 70 | } 71 | 72 | constexpr bool operator &( FeatureSet a, std::uint32_t b ) 73 | { 74 | return static_cast(a) & b; 75 | } 76 | 77 | FASTSIMD_API FeatureSet DetectCpuMaxFeatureSet(); 78 | 79 | FASTSIMD_API const char* GetFeatureSetString( FeatureSet ); 80 | } 81 | -------------------------------------------------------------------------------- /include/FastSIMD/Utility/FeatureSetList.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | 4 | namespace FastSIMD 5 | { 6 | template 7 | struct FeatureSetList; 8 | 9 | template 10 | struct FeatureSetList<0, HEAD> 11 | { 12 | static constexpr FeatureSet AsArray[] = { HEAD }; 13 | static constexpr FeatureSet Minimum = HEAD; 14 | static constexpr FeatureSet Maximum = HEAD; 15 | 16 | template 17 | static constexpr FeatureSet NextAfter = FeatureSet::Max; 18 | }; 19 | 20 | template 21 | struct FeatureSetList<0, HEAD, TAIL...> 22 | { 23 | static constexpr FeatureSet AsArray[] = { HEAD, TAIL... }; 24 | static constexpr FeatureSet Minimum = HEAD; 25 | static constexpr FeatureSet Maximum = FeatureSetList<0, TAIL...>::Maximum; 26 | 27 | template 28 | static constexpr FeatureSet NextAfter = (L == HEAD) ? 
FeatureSetList<0, TAIL...>::Minimum : FeatureSetList<0, TAIL...>::template NextAfter; 29 | }; 30 | 31 | } 32 | -------------------------------------------------------------------------------- /src/FastSIMD.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #if FASTSIMD_CURRENT_ARCH_IS( X86 ) 4 | #if defined( _MSC_VER ) 5 | #include 6 | #endif 7 | 8 | // Define interface to cpuid instruction. 9 | // input: eax = functionnumber, ecx = 0 10 | // output: eax = output[0], ebx = output[1], ecx = output[2], edx = output[3] 11 | static void cpuid( int output[4], int functionnumber ) 12 | { 13 | #if defined( __GNUC__ ) || defined( __clang__ ) // use inline assembly, Gnu/AT&T syntax 14 | 15 | int a, b, c, d; 16 | __asm( "cpuid" 17 | : "=a"( a ), "=b"( b ), "=c"( c ), "=d"( d ) 18 | : "a"( functionnumber ), "c"( 0 ) 19 | : ); 20 | output[0] = a; 21 | output[1] = b; 22 | output[2] = c; 23 | output[3] = d; 24 | 25 | #elif defined( _MSC_VER ) || defined( __INTEL_COMPILER ) // Microsoft or Intel compiler, intrin.h included 26 | 27 | __cpuidex( output, functionnumber, 0 ); // intrinsic function for CPUID 28 | 29 | #else // unknown platform. 
try inline assembly with masm/intel syntax 30 | 31 | __asm 32 | { 33 | mov eax, functionnumber 34 | xor ecx, ecx 35 | cpuid; 36 | mov esi, output 37 | mov[esi], eax 38 | mov[esi + 4], ebx 39 | mov[esi + 8], ecx 40 | mov[esi + 12], edx 41 | } 42 | 43 | #endif 44 | } 45 | 46 | // Define interface to xgetbv instruction 47 | static uint64_t xgetbv( uint32_t ctr ) 48 | { 49 | #if( defined( _MSC_FULL_VER ) && _MSC_FULL_VER >= 160040000 ) || ( defined( __INTEL_COMPILER ) && __INTEL_COMPILER >= 1200 ) // Microsoft or Intel compiler supporting _xgetbv intrinsic 50 | 51 | return _xgetbv( ctr ); // intrinsic function for XGETBV 52 | 53 | #elif defined( __GNUC__ ) // use inline assembly, Gnu/AT&T syntax 54 | 55 | uint32_t a, d; 56 | __asm( "xgetbv" 57 | : "=a"( a ), "=d"( d ) 58 | : "c"( ctr ) 59 | : ); 60 | return a | ( uint64_t( d ) << 32 ); 61 | 62 | #else // #elif defined (_WIN32) // other compiler. try inline assembly with masm/intel/MS syntax 63 | 64 | uint32_t a, d; 65 | __asm { 66 | mov ecx, ctr 67 | _emit 0x0f 68 | _emit 0x01 69 | _emit 0xd0; // xgetbv 70 | mov a, eax 71 | mov d, edx 72 | } 73 | return a | ( uint64_t( d ) << 32 ); 74 | 75 | #endif 76 | } 77 | #endif 78 | 79 | namespace FastSIMD 80 | { 81 | #if FASTSIMD_CURRENT_ARCH_IS( X86 ) 82 | static std::uint32_t DetectCpuSupportedFlags() 83 | { 84 | std::uint32_t supportedFlags = FeatureFlag::x86 | FeatureFlag::Scalar; 85 | 86 | //#if FASTSIMD_x86 87 | int abcd[4] = { 0, 0, 0, 0 }; // cpuid results 88 | 89 | //#if !FASTSIMD_64BIT 90 | 91 | cpuid( abcd, 0 ); // call cpuid function 0 92 | if( abcd[0] == 0 ) 93 | return supportedFlags; // no further cpuid function supported 94 | 95 | cpuid( abcd, 1 ); // call cpuid function 1 for feature flags 96 | if( ( abcd[3] >> 0 & 1 ) == 0 ) 97 | return supportedFlags; // no floating point 98 | if( ( abcd[3] >> 23 & 1 ) == 0 ) 99 | return supportedFlags; // no MMX 100 | if( ( abcd[3] >> 15 & 1 ) == 0 ) 101 | return supportedFlags; // no conditional move 102 | if( ( abcd[3] >> 
24 & 1 ) == 0 ) 103 | return supportedFlags; // no FXSAVE 104 | if( ( abcd[3] >> 25 & 1 ) == 0 ) 105 | return supportedFlags; // no SSE 106 | supportedFlags = supportedFlags | FeatureFlag::SSE; 107 | // SSE supported 108 | 109 | if( ( abcd[3] >> 26 & 1 ) == 0 ) 110 | return supportedFlags; // no SSE2 111 | //#else 112 | cpuid( abcd, 1 ); // call cpuid function 1 for feature flags 113 | //#endif 114 | 115 | supportedFlags = supportedFlags | FeatureFlag::SSE2; // default value for 64bit 116 | // SSE2 supported 117 | 118 | if( ( abcd[2] >> 0 & 1 ) == 0 ) 119 | return supportedFlags; // no SSE3 120 | supportedFlags = supportedFlags | FeatureFlag::SSE3; 121 | // SSE3 supported 122 | 123 | if( ( abcd[2] >> 9 & 1 ) == 0 ) 124 | return supportedFlags; // no SSSE3 125 | supportedFlags = supportedFlags | FeatureFlag::SSSE3; 126 | // SSSE3 supported 127 | 128 | if( ( abcd[2] >> 19 & 1 ) == 0 ) 129 | return supportedFlags; // no SSE4.1 130 | supportedFlags = supportedFlags | FeatureFlag::SSE41; 131 | // SSE4.1 supported 132 | 133 | if( ( abcd[2] >> 23 & 1 ) == 0 ) 134 | return supportedFlags; // no POPCNT 135 | if( ( abcd[2] >> 20 & 1 ) == 0 ) 136 | return supportedFlags; // no SSE4.2 137 | supportedFlags = supportedFlags | FeatureFlag::SSE42; 138 | // SSE4.2 supported 139 | 140 | if( ( abcd[2] >> 26 & 1 ) == 0 ) 141 | return supportedFlags; // no XSAVE 142 | if( ( abcd[2] >> 27 & 1 ) == 0 ) 143 | return supportedFlags; // no OSXSAVE 144 | if( ( abcd[2] >> 28 & 1 ) == 0 ) 145 | return supportedFlags; // no AVX 146 | 147 | uint64_t osbv = xgetbv( 0 ); 148 | if( ( osbv & 6 ) != 6 ) 149 | return supportedFlags; // AVX not enabled in O.S. 
150 | supportedFlags = supportedFlags | FeatureFlag::AVX; 151 | // AVX supported 152 | 153 | if constexpr( IsRelaxed() ) 154 | { 155 | if( ( abcd[2] >> 12 & 1 ) == 0 ) 156 | return supportedFlags; // no FMA3 157 | } 158 | 159 | cpuid( abcd, 7 ); // call cpuid leaf 7 for feature flags 160 | if( ( abcd[1] >> 5 & 1 ) == 0 ) 161 | return supportedFlags; // no AVX2 162 | supportedFlags = supportedFlags | FeatureFlag::AVX2; 163 | // AVX2 supported 164 | 165 | if( ( osbv & ( 0xE0 ) ) != 0xE0 ) 166 | return supportedFlags; // AVX512 not enabled in O.S. 167 | if( ( abcd[1] >> 16 & 1 ) == 0 ) 168 | return supportedFlags; // no AVX512 169 | cpuid( abcd, 0xD ); // call cpuid leaf 0xD for feature flags 170 | if( ( abcd[0] & 0x60 ) != 0x60 ) 171 | return supportedFlags; // no AVX512 172 | supportedFlags = supportedFlags | FeatureFlag::AVX512_F; 173 | // AVX512 supported 174 | 175 | cpuid( abcd, 7 ); // call cpuid leaf 7 for feature flags 176 | 177 | if( ( abcd[1] >> 31 & 1 ) == 1 ) 178 | supportedFlags = supportedFlags | FeatureFlag::AVX512_VL; 179 | // AVX512VL supported 180 | 181 | if( ( abcd[1] >> 17 & 1 ) == 1 ) 182 | supportedFlags = supportedFlags | FeatureFlag::AVX512_DQ; 183 | // AVX512DQ supported 184 | 185 | if( ( abcd[1] >> 30 & 1 ) == 1 ) 186 | supportedFlags = supportedFlags | FeatureFlag::AVX512_BW; 187 | // AVX512BW supported 188 | 189 | return supportedFlags; 190 | } 191 | 192 | #elif FASTSIMD_CURRENT_ARCH_IS( ARM ) 193 | static std::uint32_t DetectCpuSupportedFlags() 194 | { 195 | std::uint32_t supportedFlags = 196 | FastSIMD::FeatureFlag::ARM | 197 | FastSIMD::FeatureFlag::Scalar | 198 | FastSIMD::FeatureFlag::NEON | 199 | FastSIMD::FeatureFlag::AARCH64; 200 | 201 | return supportedFlags; 202 | } 203 | 204 | #elif FASTSIMD_CURRENT_ARCH_IS( WASM ) 205 | static std::uint32_t DetectCpuSupportedFlags() 206 | { 207 | std::uint32_t supportedFlags = 208 | FastSIMD::FeatureFlag::WASM | 209 | FastSIMD::FeatureFlag::Scalar; 210 | 211 | return supportedFlags; 212 | } 213 
| #endif 214 | 215 | static FeatureSet FeatureSetValues[] = 216 | { 217 | FeatureSet::SCALAR, 218 | 219 | #if FASTSIMD_CURRENT_ARCH_IS( X86 ) 220 | FeatureSet::SSE, 221 | FeatureSet::SSE2, 222 | FeatureSet::SSE3, 223 | FeatureSet::SSSE3, 224 | FeatureSet::SSE41, 225 | FeatureSet::SSE42, 226 | FeatureSet::AVX, 227 | FeatureSet::AVX2, 228 | FeatureSet::AVX512, 229 | 230 | #elif FASTSIMD_CURRENT_ARCH_IS( ARM ) 231 | FeatureSet::NEON, 232 | FeatureSet::AARCH64, 233 | 234 | #elif FASTSIMD_CURRENT_ARCH_IS( WASM ) 235 | FeatureSet::WASM, 236 | #endif 237 | }; 238 | 239 | FASTSIMD_API FeatureSet DetectCpuMaxFeatureSet() 240 | { 241 | static FeatureSet cache = [] 242 | { 243 | std::uint32_t supportedFlags = DetectCpuSupportedFlags(); 244 | 245 | FeatureSet maxSupported = FeatureSet::Invalid; 246 | 247 | for( FeatureSet featureSet : FeatureSetValues ) 248 | { 249 | // Check if feature set contains unsupported flags 250 | if( ( static_cast( featureSet ) ^ supportedFlags ) & ~supportedFlags ) 251 | { 252 | break; 253 | } 254 | 255 | maxSupported = featureSet; 256 | } 257 | 258 | return maxSupported; 259 | }(); 260 | 261 | return cache; 262 | } 263 | 264 | FASTSIMD_API const char* GetFeatureSetString( FeatureSet featureSet ) 265 | { 266 | switch( featureSet ) 267 | { 268 | case FeatureSet::Invalid: return "Invalid"; 269 | case FeatureSet::SCALAR: return "SCALAR"; 270 | case FeatureSet::SSE: return "SSE"; 271 | case FeatureSet::SSE2: return "SSE2"; 272 | case FeatureSet::SSE3: return "SSE3"; 273 | case FeatureSet::SSSE3: return "SSSE3"; 274 | case FeatureSet::SSE41: return "SSE4.1"; 275 | case FeatureSet::SSE42: return "SSE4.2"; 276 | case FeatureSet::AVX: return "AVX"; 277 | case FeatureSet::AVX2: return "AVX2"; 278 | case FeatureSet::AVX512: return "AVX512"; 279 | case FeatureSet::NEON: return "NEON"; 280 | case FeatureSet::AARCH64: return "AARCH64"; 281 | case FeatureSet::WASM: return "WASM"; 282 | case FeatureSet::Max: return "Max"; 283 | } 284 | 285 | return "NAN"; 286 | } 
287 | } // namespace FastSIMD 288 | -------------------------------------------------------------------------------- /tests/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | 2 | fastsimd_create_dispatch_library(simd_test SOURCES "test.inl" FEATURE_SETS SCALAR SSE2 SSE41 AVX2 AVX512 NEON AARCH64 WASM) 3 | fastsimd_create_dispatch_library(simd_test_relaxed RELAXED SOURCES "test.inl" FEATURE_SETS SCALAR SSE2 SSE41 AVX2 AVX512 NEON AARCH64 WASM) 4 | 5 | add_executable(test "test.cpp") 6 | target_link_libraries(test PRIVATE FastSIMD simd_test simd_test_relaxed) 7 | 8 | if(CMAKE_SYSTEM_NAME STREQUAL "Emscripten") 9 | set(CMAKE_EXECUTABLE_SUFFIX ".html") 10 | target_link_options(test PRIVATE -sALLOW_MEMORY_GROWTH=1 -sSINGLE_FILE) 11 | endif() 12 | 13 | # Enable Warnings 14 | if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "MSVC") 15 | #target_compile_options(test_simd PUBLIC /W4 /WX) 16 | else() 17 | #target_compile_options(test_simd PUBLIC -Wall -Wextra -Wpedantic -Werror -Wno-c++98-compat -Wno-c++98-compat-pedantic -Wno-old-style-cast -Wno-undefined-func-template) 18 | endif() 19 | -------------------------------------------------------------------------------- /tests/test.cpp: -------------------------------------------------------------------------------- 1 | #include "test.h" 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | #include 13 | 14 | static constexpr size_t TestCount = 4096 * 4096; 15 | 16 | static int * rndInts; 17 | static float* rndFloats; 18 | 19 | static float GenFiniteFloat( std::mt19937& gen ) 20 | { 21 | union 22 | { 23 | float f; 24 | int32_t i; 25 | } u; 26 | 27 | do 28 | { 29 | u.i = static_cast( gen() ); 30 | 31 | } while( !std::isfinite( u.f ) ); 32 | 33 | return u.f; 34 | } 35 | 36 | static void GenerateRandomValues() 37 | { 38 | std::cout << "Generating random values..." 
<< std::endl; 39 | 40 | rndInts = new int[TestCount + 1024]; 41 | rndFloats = new float[TestCount + 1024]; 42 | 43 | std::random_device rd; //Will be used to obtain a seed for the random number engine 44 | std::mt19937 gen( rd() ); //Standard mersenne_twister_engine seeded with rd() 45 | 46 | for ( std::size_t i = 0; i < TestCount; i++ ) 47 | { 48 | rndInts[i] = (int)gen(); 49 | rndFloats[i] = GenFiniteFloat( gen ); 50 | } 51 | 52 | //std::sort( rndFloats, rndFloats + TestCount + 1024, std::less() ); 53 | } 54 | 55 | template 56 | struct TestRunner 57 | { 58 | using TestSet = std::vector>>; 59 | 60 | template 61 | struct TestOrganiser 62 | { 63 | static TestCollection GetCollections() 64 | { 65 | return {}; 66 | } 67 | }; 68 | 69 | template 70 | struct TestOrganiser> 71 | { 72 | static TestCollection GetCollections() 73 | { 74 | TestCollection collections = TestOrganiser>::GetCollections(); 75 | 76 | if( HEAD <= FastSIMD::DetectCpuMaxFeatureSet() ) 77 | { 78 | std::cout << "Generating Tests: " << FastSIMD::GetFeatureSetString( HEAD ) << std::endl; 79 | { 80 | std::unique_ptr> testSimd( FastSIMD::NewDispatchClass>( HEAD ) ); 81 | 82 | TestCollection simdCollection = testSimd->RegisterTests(); 83 | 84 | collections.insert( collections.begin(), simdCollection.begin(), simdCollection.end() ); 85 | } 86 | { 87 | std::unique_ptr> testSimd( FastSIMD::NewDispatchClass>( HEAD ) ); 88 | 89 | TestCollection simdCollection = testSimd->RegisterTests(); 90 | 91 | collections.insert( collections.begin(), simdCollection.begin(), simdCollection.end() ); 92 | } 93 | } 94 | 95 | return collections; 96 | } 97 | 98 | static TestSet GetSet() 99 | { 100 | TestCollection collections = GetCollections(); 101 | TestSet set; 102 | 103 | for( auto& collection : collections ) 104 | { 105 | std::string_view& testName = collection.first; 106 | 107 | auto find = std::find_if( set.begin(), set.end(), [testName]( const auto& pair ){ return pair.first == testName; } ); 108 | 109 | if( find == 
set.end() ) 110 | { 111 | if( collection.second.featureSet != FastSIMD::FeatureSet::SCALAR ) 112 | { 113 | throw std::runtime_error( "Scalar must be base test set" ); 114 | } 115 | 116 | find = set.emplace( set.end(), testName, std::vector{} ); 117 | } 118 | 119 | find->second.emplace_back( collection.second ); 120 | } 121 | 122 | return set; 123 | } 124 | }; 125 | 126 | // get value of least significant bit 127 | static float DeltaUnit( float x ) 128 | { 129 | union 130 | { 131 | float f; 132 | uint32_t i; 133 | } u; 134 | x = std::fabs( x ); 135 | 136 | if( !std::isfinite( x ) ) 137 | { 138 | return 1.f; 139 | } 140 | if( x == 0.f || !std::isnormal( x ) ) 141 | { 142 | u.i = 0x00800000; // smallest positive normal number 143 | return u.f; 144 | } 145 | float x1 = x; 146 | u.f = x; 147 | u.i++; 148 | return u.f - x1; 149 | } 150 | 151 | template 152 | static bool CompareTyped( std::string_view testName, FastSIMD::FeatureSet featureSet, float accuracy, size_t outputCount, void* scalarResults, void* simdResults ) 153 | { 154 | bool success = true; 155 | 156 | const T* typedScalar = reinterpret_cast( scalarResults ); 157 | const T* typedSimd = reinterpret_cast( simdResults ); 158 | 159 | for( size_t idx = 0; idx < outputCount; idx++ ) 160 | { 161 | if( typedScalar[idx] != typedSimd[idx] ) 162 | { 163 | float relativeDif = 0; 164 | 165 | if constexpr( std::is_floating_point_v ) 166 | { 167 | if( std::isnan( typedScalar[idx] ) && std::isnan( typedSimd[idx] ) ) 168 | { 169 | continue; 170 | } 171 | 172 | relativeDif = std::abs( typedScalar[idx] - typedSimd[idx] ) / DeltaUnit( typedScalar[idx] ); 173 | 174 | if( relativeDif <= accuracy ) 175 | { 176 | continue; 177 | } 178 | } 179 | if( success ) 180 | { 181 | std::cerr << std::setprecision( 16 ) << std::boolalpha; 182 | std::cerr << "--- " << FastSIMD::GetFeatureSetString( featureSet ) << " FAILED ---" << std::endl; 183 | } 184 | std::cerr << "idx " << idx << ": " << testName 185 | << " Expected \"" << typedScalar[idx] 
186 | << "\" Actual \"" << typedSimd[idx] 187 | << "\" Diff \"" << std::abs( typedScalar[idx] - typedSimd[idx] ) << "\""; 188 | 189 | if( relativeDif != 0.0f ) 190 | { 191 | std::cerr << " (" << relativeDif << ")"; 192 | } 193 | std::cerr << std::endl; 194 | success = false; 195 | } 196 | } 197 | 198 | return success; 199 | } 200 | 201 | static bool CompareOutputs( std::string_view testName, FastSIMD::FeatureSet featureSet, TestData::ReturnType returnType, float accuracy, size_t outputCount, void* scalarResults, void* simdResults ) 202 | { 203 | switch( returnType ) 204 | { 205 | case TestData::ReturnType::boolean: 206 | return CompareTyped( testName, featureSet, accuracy, outputCount, scalarResults, simdResults ); 207 | 208 | case TestData::ReturnType::f32: 209 | return CompareTyped( testName, featureSet, accuracy, outputCount, scalarResults, simdResults ); 210 | 211 | case TestData::ReturnType::i32: 212 | return CompareTyped( testName, featureSet, accuracy, outputCount, scalarResults, simdResults ); 213 | } 214 | 215 | return false; 216 | } 217 | 218 | static void DoTest( std::string_view testName, std::vector& tests ) 219 | { 220 | std::cout << "Testing: " << testName << std::endl; 221 | 222 | char* scalarResults = new char[RegisterBytes]; 223 | char* simdResults = new char[RegisterBytes]; 224 | 225 | for( size_t idx = 0; idx < TestCount; idx += RegisterBytes / sizeof( int ) ) 226 | { 227 | int failed = 0; 228 | 229 | for( size_t testIdx = 0; testIdx < tests.size(); testIdx++ ) 230 | { 231 | TestData& test = tests[testIdx]; 232 | 233 | char* resultsOut = testIdx ? 
simdResults : scalarResults; 234 | std::memset( resultsOut, (int)testIdx, RegisterBytes ); 235 | 236 | size_t outputCount = test.testFunc( resultsOut, idx, rndInts, rndFloats ); 237 | 238 | if( testIdx ) 239 | { 240 | if( test.returnType != tests[0].returnType ) 241 | { 242 | std::cerr << "Tests do not match: " << testName; 243 | throw std::exception(); 244 | } 245 | if( test.featureSet == FastSIMD::FeatureSet::SCALAR && !test.relaxed ) 246 | { 247 | std::cerr << "Multiple tests with same name: " << testName; 248 | throw std::exception(); 249 | } 250 | 251 | std::string testNameRelaxed = testName.data(); 252 | float accuracy = 0; 253 | 254 | if( test.relaxed ) 255 | { 256 | testNameRelaxed += " RELAXED"; 257 | accuracy = test.relaxedAccuracy; 258 | } 259 | 260 | if( !CompareOutputs( testNameRelaxed, test.featureSet, test.returnType, accuracy, outputCount, scalarResults, simdResults ) ) 261 | { 262 | std::cerr << "Inputs: " << tests[0].inputsFunc( idx, rndInts, rndFloats ) << std::endl; 263 | failed++; 264 | } 265 | } 266 | } 267 | 268 | if( failed >= 3 ) 269 | { 270 | std::cerr << "Skipping test, fail limit reached" << std::endl; 271 | break; 272 | } 273 | } 274 | 275 | delete[] scalarResults; 276 | delete[] simdResults; 277 | } 278 | 279 | static void Run() 280 | { 281 | std::cout << "Starting Tests - Register Size: " << RegisterBytes * 8 << " (" << RegisterBytes << "b)" << std::endl; 282 | 283 | TestSet testSet = TestOrganiser::GetSet(); 284 | 285 | for( auto& test : testSet ) 286 | { 287 | //if( test.first.find( "sqrt" ) != std::string_view::npos ) 288 | { 289 | DoTest( test.first, test.second ); 290 | } 291 | } 292 | 293 | std::cout << "Testing Complete!" 
<< std::endl; 294 | } 295 | }; 296 | 297 | int main() 298 | { 299 | GenerateRandomValues(); 300 | 301 | TestRunner::Run(); 302 | 303 | return 0; 304 | } 305 | -------------------------------------------------------------------------------- /tests/test.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | constexpr size_t kTestBytes = 512 / 8; 11 | 12 | struct TestResult 13 | { 14 | uint8_t returnCount; 15 | }; 16 | 17 | using TestFunction = size_t ( void*, size_t, int32_t*, float* ); 18 | using InputsFunction = std::string ( size_t, int32_t*, float* ); 19 | 20 | struct TestData 21 | { 22 | enum class ReturnType 23 | { 24 | boolean, f32, i32 25 | }; 26 | 27 | FastSIMD::FeatureSet featureSet; 28 | bool relaxed; 29 | ReturnType returnType; 30 | float relaxedAccuracy = 0; 31 | std::function testFunc; 32 | std::function inputsFunc; 33 | }; 34 | 35 | using TestCollection = std::vector>; 36 | 37 | template 38 | class TestFastSIMD 39 | { 40 | public: 41 | virtual ~TestFastSIMD() = default; 42 | 43 | virtual TestCollection RegisterTests() = 0; 44 | }; 45 | --------------------------------------------------------------------------------