├── .clang-format ├── .github ├── FUNDING.yml └── workflows │ └── ci.yml ├── .gitignore ├── CMakeLists.txt ├── CMakeSettings.json ├── LICENSE ├── README.md ├── cmake ├── ArchDetect.cmake └── ArchDetect.cpp ├── dispatch ├── CMakeLists.txt ├── cmake │ ├── ClassSIMD.cmake │ ├── feature_set_source.cpp.in │ └── simd_lib_config.h.in └── impl │ └── DispatchClassImpl.h ├── examples ├── CMakeLists.txt ├── dispatch_library │ ├── CMakeLists.txt │ ├── example.h │ ├── example.inl │ └── main.cpp └── header_only │ ├── CMakeLists.txt │ └── main.cpp ├── include └── FastSIMD │ ├── DispatchClass.h │ ├── ToolSet.h │ ├── ToolSet │ ├── ARM │ │ ├── 128 │ │ │ ├── f32x4.h │ │ │ ├── i32x4.h │ │ │ └── m32x4.h │ │ ├── ARM.h │ │ └── NEON.h │ ├── Generic │ │ ├── Functions.h │ │ ├── Register.h │ │ ├── Scalar.h │ │ └── Scalar │ │ │ ├── f32x1.h │ │ │ ├── i32x1.h │ │ │ └── mNx1.h │ ├── WASM │ │ ├── 128 │ │ │ ├── f32x4.h │ │ │ ├── i32x4.h │ │ │ └── m32x4.h │ │ └── WASM.h │ └── x86 │ │ ├── 128 │ │ ├── f32x4.h │ │ ├── i32x4.h │ │ └── m32x4.h │ │ ├── 256 │ │ ├── f32x8.h │ │ ├── i32x8.h │ │ └── m32x8.h │ │ ├── 512 │ │ ├── f32x16.h │ │ ├── i32x16.h │ │ └── mNx16.h │ │ ├── AVX.h │ │ ├── AVX512.h │ │ ├── SSE.h │ │ └── x86.h │ └── Utility │ ├── ArchDetect.h │ ├── Export.h │ ├── FeatureEnums.h │ └── FeatureSetList.h ├── src └── FastSIMD.cpp └── tests ├── CMakeLists.txt ├── test.cpp ├── test.h └── test.inl /.clang-format: -------------------------------------------------------------------------------- 1 | --- 2 | Language: Cpp 3 | BasedOnStyle: Microsoft 4 | AccessModifierOffset: -4 5 | AlignOperands: false 6 | AlignTrailingComments: false 7 | AlwaysBreakTemplateDeclarations: Yes 8 | BraceWrapping: 9 | AfterCaseLabel: true 10 | AfterClass: true 11 | AfterControlStatement: true 12 | AfterEnum: true 13 | AfterFunction: true 14 | AfterNamespace: true 15 | AfterStruct: true 16 | AfterUnion: true 17 | AfterExternBlock: false 18 | BeforeCatch: true 19 | BeforeElse: true 20 | SplitEmptyFunction: true 21 | 
SplitEmptyRecord: true 22 | SplitEmptyNamespace: true 23 | BreakConstructorInitializers: AfterColon 24 | ColumnLimit: 0 25 | Cpp11BracedListStyle: false 26 | IncludeCategories: 27 | - Regex: '^<.*' 28 | Priority: 1 29 | - Regex: '^".*' 30 | Priority: 2 31 | - Regex: '.*' 32 | Priority: 3 33 | IncludeIsMainRegex: '([-_](test|unittest))?$' 34 | MaxEmptyLinesToKeep: 2 35 | NamespaceIndentation: All 36 | PointerAlignment: Left 37 | SpaceAfterTemplateKeyword: 'false' 38 | SpaceBeforeCpp11BracedList: 'true' 39 | SpaceBeforeParens: Never 40 | SpaceBeforeRangeBasedForLoopColon: 'false' 41 | SpaceInEmptyParentheses: 'false' 42 | SpacesInCStyleCastParentheses: 'false' 43 | SpacesInContainerLiterals: 'false' 44 | SpacesInParentheses: 'true' 45 | SpacesInSquareBrackets: 'false' 46 | UseTab: Never 47 | ... 48 | -------------------------------------------------------------------------------- /.github/FUNDING.yml: -------------------------------------------------------------------------------- 1 | # These are supported funding model platforms 2 | 3 | github: Auburn 4 | patreon: # Replace with a single Patreon username 5 | open_collective: # Replace with a single Open Collective username 6 | ko_fi: # Replace with a single Ko-fi username 7 | tidelift: # Replace with a single Tidelift platform-name/package-name e.g., npm/babel 8 | community_bridge: # Replace with a single Community Bridge project-name e.g., cloud-foundry 9 | liberapay: # Replace with a single Liberapay username 10 | issuehunt: # Replace with a single IssueHunt username 11 | otechie: # Replace with a single Otechie username 12 | custom: # Replace with up to 4 custom sponsorship URLs e.g., ['link1', 'link2'] 13 | -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: CI 2 | 3 | # Controls when the action will run. 
Triggers the workflow on push or pull request 4 | # events but only for the master branch 5 | on: 6 | push: 7 | branches: [master] 8 | pull_request: 9 | branches: [master] 10 | 11 | # A workflow run is made up of one or more jobs that can run sequentially or in parallel 12 | jobs: 13 | ci-matrix: 14 | name: ${{ matrix.name }} 15 | runs-on: ${{ matrix.os }} 16 | strategy: 17 | fail-fast: false 18 | matrix: 19 | include: 20 | - os: windows-latest 21 | name: Win32-MSVC 22 | cmake_options: -A Win32 23 | - os: windows-latest 24 | name: Win64-MSVC 25 | cmake_options: -A x64 26 | - os: windows-latest 27 | name: Win64-ClangCL 28 | cmake_options: -A x64 -T ClangCL 29 | - os: ubuntu-latest 30 | name: Linux64-GCC 31 | cmake_options: -DCMAKE_C_COMPILER=gcc -DCMAKE_CXX_COMPILER=g++ 32 | - os: ubuntu-latest 33 | name: Linux64-Clang 34 | cmake_options: -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ 35 | - os: macos-13 36 | name: MacOS64-Clang 37 | cmake_options: -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ 38 | - os: macos-latest 39 | name: MacOSARM64-Clang 40 | cmake_options: -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_OSX_ARCHITECTURES="x86_64;arm64" 41 | - os: ubuntu-latest 42 | name: Emscripten 43 | cmake_options: -DCMAKE_TOOLCHAIN_FILE=$EMSDK/upstream/emscripten/cmake/Modules/Platform/Emscripten.cmake 44 | 45 | steps: 46 | - name: 'Setup Emscripten' 47 | if: matrix.name == 'Emscripten' 48 | uses: mymindstorm/setup-emsdk@v14 49 | with: 50 | version: 3.1.56 51 | no-cache: true 52 | 53 | - name: 'Checkout' 54 | uses: actions/checkout@v4 55 | 56 | - name: 'CMake Configure Debug' 57 | run: cmake -S ${{ github.workspace }} -B ${{ github.workspace }}/debug -DCMAKE_BUILD_TYPE=Debug -DCMAKE_INSTALL_PREFIX="${{ github.workspace }}/install/FastSIMD" ${{ matrix.cmake_options }} 58 | 59 | - name: 'CMake Build Debug' 60 | run: cmake --build ${{ github.workspace }}/debug --config Debug --parallel 4 61 | 62 | - name: 'CMake Configure Release' 63 | run: 
cmake -S ${{ github.workspace }} -B ${{ github.workspace }}/release -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX="${{ github.workspace }}/install/FastSIMD" ${{ matrix.cmake_options }} 64 | 65 | - name: 'CMake Build Release' 66 | run: cmake --build ${{ github.workspace }}/release --config Release --parallel 4 67 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Prerequisites 2 | *.d 3 | 4 | # Compiled Object files 5 | *.slo 6 | *.lo 7 | *.o 8 | *.obj 9 | 10 | # Precompiled Headers 11 | *.gch 12 | *.pch 13 | 14 | # Compiled Dynamic libraries 15 | *.so 16 | *.dylib 17 | *.dll 18 | 19 | # Fortran module files 20 | *.mod 21 | *.smod 22 | 23 | # Compiled Static libraries 24 | *.lai 25 | *.la 26 | *.a 27 | *.lib 28 | 29 | # Executables 30 | *.exe 31 | *.out 32 | *.app 33 | /.vs* 34 | /out 35 | /build 36 | /enc_temp_folder 37 | /cpm-cache 38 | /old 39 | 40 | emsdk 41 | -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # CMakeList.txt : CMake project for FastSIMD 2 | cmake_minimum_required(VERSION 3.7.1) 3 | 4 | project(FastSIMD VERSION 1.0.0) 5 | set(CMAKE_CXX_STANDARD 17) 6 | 7 | # determine whether this is a standalone project or included by other projects 8 | if (NOT DEFINED FASTSIMD_STANDALONE_PROJECT) 9 | if (CMAKE_CURRENT_SOURCE_DIR STREQUAL CMAKE_SOURCE_DIR) 10 | set(FASTSIMD_STANDALONE_PROJECT ON) 11 | else() 12 | set(FASTSIMD_STANDALONE_PROJECT OFF) 13 | endif() 14 | endif() 15 | 16 | option(FASTSIMD_DISPATCH_CLASS "Enable FastSIMD Dispatch Classes" ON) 17 | option(FASTSIMD_EXAMPLES "Build FastSIMD examples" ${FASTSIMD_STANDALONE_PROJECT}) 18 | option(FASTSIMD_TESTS "Build FastSIMD tests" ${FASTSIMD_STANDALONE_PROJECT}) 19 | 20 | include(cmake/ArchDetect.cmake) 21 | 22 | 
target_architecture(FASTSIMD_ARCH_DETECT FASTSIMD_ARCHVER_DETECT) 23 | 24 | add_library(FastSIMD OBJECT "src/FastSIMD.cpp") 25 | target_compile_definitions(FastSIMD PRIVATE FASTSIMD_EXPORT) 26 | 27 | if(BUILD_SHARED_LIBS) 28 | set_property(TARGET FastSIMD PROPERTY POSITION_INDEPENDENT_CODE ON) 29 | else() 30 | target_compile_definitions(FastSIMD PUBLIC FASTSIMD_STATIC_LIB) 31 | endif() 32 | 33 | if(CMAKE_SYSTEM_NAME STREQUAL "Emscripten") 34 | target_compile_options(FastSIMD PUBLIC -msimd128) 35 | endif() 36 | 37 | target_include_directories(FastSIMD PUBLIC 38 | $ 39 | $ 40 | ) 41 | 42 | if(FASTSIMD_DISPATCH_CLASS) 43 | add_subdirectory(dispatch) 44 | endif() 45 | 46 | if(FASTSIMD_TESTS) 47 | add_subdirectory(tests) 48 | endif() 49 | 50 | if(FASTSIMD_EXAMPLES) 51 | add_subdirectory(examples) 52 | endif() 53 | -------------------------------------------------------------------------------- /CMakeSettings.json: -------------------------------------------------------------------------------- 1 | { 2 | "configurations": [ 3 | { 4 | "name": "x64-Debug", 5 | "generator": "Ninja", 6 | "configurationType": "Debug", 7 | "inheritEnvironments": [ "msvc_x64_x64" ], 8 | "buildRoot": "${projectDir}\\out\\build\\${name}", 9 | "installRoot": "${projectDir}\\out\\install\\${name}", 10 | "cmakeCommandArgs": "", 11 | "buildCommandArgs": "-v", 12 | "ctestCommandArgs": "" 13 | }, 14 | { 15 | "name": "x64-Clang-Debug", 16 | "generator": "Ninja", 17 | "configurationType": "Debug", 18 | "buildRoot": "${projectDir}\\out\\build\\${name}", 19 | "installRoot": "${projectDir}\\out\\install\\${name}", 20 | "buildCommandArgs": "-v", 21 | "ctestCommandArgs": "", 22 | "inheritEnvironments": [ "clang_cl_x64_x64" ] 23 | }, 24 | { 25 | "name": "WSL-GCC-Debug", 26 | "generator": "Ninja", 27 | "configurationType": "Debug", 28 | "buildRoot": "${projectDir}\\out\\build\\${name}", 29 | "installRoot": "${projectDir}\\out\\install\\${name}", 30 | "cmakeExecutable": "cmake", 31 | "cmakeCommandArgs": "", 32 | 
"buildCommandArgs": "-v", 33 | "ctestCommandArgs": "", 34 | "inheritEnvironments": [ "linux_x64" ], 35 | "wslPath": "${defaultWSLPath}" 36 | }, 37 | { 38 | "name": "WSL-Clang-Debug", 39 | "generator": "Ninja", 40 | "configurationType": "Debug", 41 | "buildRoot": "${projectDir}\\out\\build\\${name}", 42 | "installRoot": "${projectDir}\\out\\install\\${name}", 43 | "cmakeExecutable": "cmake", 44 | "cmakeCommandArgs": "", 45 | "buildCommandArgs": "-v", 46 | "ctestCommandArgs": "", 47 | "inheritEnvironments": [ "linux_clang_x64" ], 48 | "wslPath": "${defaultWSLPath}" 49 | }, 50 | { 51 | "name": "x64-Release", 52 | "generator": "Ninja", 53 | "configurationType": "RelWithDebInfo", 54 | "buildRoot": "${projectDir}\\out\\build\\${name}", 55 | "installRoot": "${projectDir}\\out\\install\\${name}", 56 | "cmakeCommandArgs": "", 57 | "buildCommandArgs": "-v", 58 | "ctestCommandArgs": "", 59 | "inheritEnvironments": [ "msvc_x64_x64" ], 60 | "variables": [] 61 | }, 62 | { 63 | "name": "x64-Clang-Release", 64 | "generator": "Ninja", 65 | "configurationType": "RelWithDebInfo", 66 | "buildRoot": "${projectDir}\\out\\build\\${name}", 67 | "installRoot": "${projectDir}\\out\\install\\${name}", 68 | "buildCommandArgs": "-v", 69 | "ctestCommandArgs": "", 70 | "inheritEnvironments": [ "clang_cl_x64_x64" ], 71 | "variables": [] 72 | } 73 | ] 74 | } -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 Jordan Peck 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, 
subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # FastSIMD 2 | 3 | FastSIMD is an SIMD abstraction layer that allows easy development of SIMD code. Using the generic SIMD register types included with FastSIMD provides access to standard C++ operators and various mathematic functions. Using template parameters or a define the generic types can be compiled to various SIMD feature sets (SSE, AVX, NEON...) 4 | 5 | FastSIMD also provides an easy to use class template for runtime SIMD feature set detection and appropriate function dispatch. The generic register types mentioned above make it easy to write code that always makes use of the highest SIMD feature set available. 6 | 7 | The original version of FastSIMD was developed along side FastNoise2. This new "1.0" version of FastSIMD is being developed as a working standalone library, although the development is still primarily driven to support FastNoise2. Compared to the "old" version of FastSIMD supplied with FastNoise2 this "1.0" version of FastSIMD has these improvements: 8 | 9 | - Removed all uses of C++ macros in favour of templated types/functions 10 | - Variable sized generic register types. 
For example when using operators on a register of 8xInt32 when targeting SSE, the intrinsics output will get doubled up transparently. 11 | - Moved from SIMD levels to FeatureFlags to allow more verbose specialisation of templated types and more readable code. 12 | -------------------------------------------------------------------------------- /cmake/ArchDetect.cmake: -------------------------------------------------------------------------------- 1 | function(target_architecture output_arch output_arch_ver) 2 | if(APPLE AND CMAKE_OSX_ARCHITECTURES) 3 | set(ARCH "${CMAKE_OSX_ARCHITECTURES}") 4 | set(ARCH_VER unknown) 5 | else() 6 | 7 | # Detect the architecture in a rather creative way... 8 | # This compiles a small C program which is a series of ifdefs that selects a 9 | # particular #error preprocessor directive whose message string contains the 10 | # target architecture. The program will always fail to compile (both because 11 | # file is not a valid C program, and obviously because of the presence of the 12 | # #error preprocessor directives... 
but by exploiting the preprocessor in this 13 | # way, we can detect the correct target architecture even when cross-compiling, 14 | # since the program itself never needs to be run (only the compiler/preprocessor) 15 | try_compile( 16 | compile_result_unused 17 | "${CMAKE_BINARY_DIR}" 18 | "${FastSIMD_SOURCE_DIR}/cmake/ArchDetect.cpp" 19 | OUTPUT_VARIABLE COMPILE_OUTPUT 20 | ) 21 | #message(STATUS ${COMPILE_OUTPUT}) 22 | 23 | # Parse the architecture name from the compiler output 24 | if ("${COMPILE_OUTPUT}" MATCHES "FASTSIMD_ARCH<([^\"=]+)=([^>]+)") 25 | set(ARCH "${CMAKE_MATCH_1}") 26 | set(ARCH_VER "${CMAKE_MATCH_2}") 27 | 28 | else() 29 | set(ARCH unknown) 30 | set(ARCH_VER SCALAR) 31 | endif() 32 | endif() 33 | 34 | message(STATUS "FastSIMD: Detected arch \"${ARCH}\" \"${ARCH_VER}\"") 35 | set(${output_arch} "${ARCH}" PARENT_SCOPE) 36 | set(${output_arch_ver} "${ARCH_VER}" PARENT_SCOPE) 37 | endfunction() -------------------------------------------------------------------------------- /cmake/ArchDetect.cpp: -------------------------------------------------------------------------------- 1 | #include "../include/FastSIMD/Utility/ArchDetect.h" 2 | 3 | #define TO_LITERAL_( string ) #string 4 | #define TO_LITERAL( string ) TO_LITERAL_( string ) 5 | 6 | #if !defined( TEST_FEATURE_SET_ACTIVE ) 7 | #define FASTSIMD_DETECT_SUCCESS 8 | #else 9 | #if FASTSIMD_FEATURE_VALUE( TEST_FEATURE_SET_ACTIVE ) > 0 10 | #define FASTSIMD_DETECT_SUCCESS 11 | #endif 12 | #endif 13 | 14 | #ifdef FASTSIMD_DETECT_SUCCESS 15 | static_assert( 0, "FASTSIMD_ARCH<" TO_LITERAL( FASTSIMD_ARCH_NAME() ) ">" ); 16 | 17 | // Needed for MacOS clang, it doesn't evaluate macros in static assert errors 18 | TO_LITERAL( FASTSIMD_ARCH ); 19 | 20 | #endif -------------------------------------------------------------------------------- /dispatch/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | include(cmake/ClassSIMD.cmake) 2 | 
-------------------------------------------------------------------------------- /dispatch/cmake/ClassSIMD.cmake: -------------------------------------------------------------------------------- 1 | 2 | function(fastsimd_add_feature_set_source fastsimd_library_sources feature_set is_relaxed) 3 | foreach(simd_inl ${fastsimd_library_sources}) 4 | set(feature_set_source "${simd_library_source_dir}/${simd_library_name}_${feature_set}.cpp") 5 | set(simd_inl_full "${CMAKE_CURRENT_LIST_DIR}/${simd_inl}") 6 | 7 | configure_file("${FastSIMD_SOURCE_DIR}/dispatch/cmake/feature_set_source.cpp.in" ${feature_set_source}) 8 | target_sources(${simd_library_name} PRIVATE ${feature_set_source}) 9 | 10 | if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "MSVC") 11 | # MSVC 32bit needs SSE2 flag for all SSE levels 12 | if(${feature_set} MATCHES "SSE[^(0-9)]" AND CMAKE_SIZEOF_VOID_P EQUAL 4) 13 | set_property(SOURCE ${feature_set_source} APPEND PROPERTY COMPILE_OPTIONS /arch:SSE2) 14 | 15 | elseif(${feature_set} MATCHES "AVX[^(0-9)]") 16 | set_property(SOURCE ${feature_set_source} APPEND PROPERTY COMPILE_OPTIONS /arch:AVX) 17 | 18 | elseif(${feature_set} MATCHES AVX2) 19 | set_property(SOURCE ${feature_set_source} APPEND PROPERTY COMPILE_OPTIONS /arch:AVX2) 20 | 21 | elseif(${feature_set} MATCHES AVX512) 22 | set_property(SOURCE ${feature_set_source} APPEND PROPERTY COMPILE_OPTIONS /arch:AVX512) 23 | endif() 24 | else() 25 | if(${feature_set} MATCHES SSE2 AND CMAKE_SIZEOF_VOID_P EQUAL 4) 26 | set_property(SOURCE ${feature_set_source} APPEND PROPERTY COMPILE_OPTIONS -msse2) 27 | 28 | elseif(${feature_set} MATCHES SSE3) 29 | set_property(SOURCE ${feature_set_source} APPEND PROPERTY COMPILE_OPTIONS -msse3) 30 | 31 | elseif(${feature_set} MATCHES SSSE3) 32 | set_property(SOURCE ${feature_set_source} APPEND PROPERTY COMPILE_OPTIONS -mssse3) 33 | 34 | elseif(${feature_set} MATCHES SSE41) 35 | set_property(SOURCE ${feature_set_source} APPEND PROPERTY COMPILE_OPTIONS -msse4.1) 36 | 37 | 
elseif(${feature_set} MATCHES SSE42) 38 | set_property(SOURCE ${feature_set_source} APPEND PROPERTY COMPILE_OPTIONS -msse4.2) 39 | 40 | elseif(${feature_set} MATCHES "AVX[^(0-9)]") 41 | set_property(SOURCE ${feature_set_source} APPEND PROPERTY COMPILE_OPTIONS -mavx) 42 | 43 | elseif(${feature_set} MATCHES AVX2) 44 | if(is_relaxed) 45 | set_property(SOURCE ${feature_set_source} APPEND PROPERTY COMPILE_OPTIONS -mfma) 46 | else() 47 | set_property(SOURCE ${feature_set_source} APPEND PROPERTY COMPILE_OPTIONS -mno-fma) 48 | endif() 49 | set_property(SOURCE ${feature_set_source} APPEND PROPERTY COMPILE_OPTIONS -mavx2) 50 | 51 | elseif(${feature_set} MATCHES AVX512) 52 | if(is_relaxed) 53 | set_property(SOURCE ${feature_set_source} APPEND PROPERTY COMPILE_OPTIONS -mfma) 54 | else() 55 | set_property(SOURCE ${feature_set_source} APPEND PROPERTY COMPILE_OPTIONS -mno-fma) 56 | endif() 57 | set_property(SOURCE ${feature_set_source} APPEND PROPERTY COMPILE_OPTIONS -mavx512f -mavx512dq -mavx512vl -mavx512bw) 58 | 59 | elseif(${feature_set} MATCHES WASM) 60 | if(is_relaxed) 61 | set_property(SOURCE ${feature_set_source} APPEND PROPERTY COMPILE_OPTIONS -mrelaxed-simd) 62 | endif() 63 | endif() 64 | endif() 65 | endforeach() 66 | endfunction() 67 | 68 | function(fastsimd_create_dispatch_library simd_library_name) 69 | 70 | cmake_parse_arguments(PARSE_ARGV 0 fastsimd_create_dispatch_library "RELAXED" "" "SOURCES;FEATURE_SETS") 71 | 72 | list(LENGTH fastsimd_create_dispatch_library_FEATURE_SETS FEATURE_SET_COUNT) 73 | list(LENGTH fastsimd_create_dispatch_library_SOURCES SOURCES_COUNT) 74 | 75 | if(SOURCES_COUNT EQUAL 0) 76 | message(FATAL_ERROR "FastSIMD: \"${simd_library_name}\" No SOURCES specified, example usage: fastsimd_create_dispatch_library(example_simd SOURCES \"example.inl\")") 77 | endif() 78 | 79 | if(FEATURE_SET_COUNT EQUAL 0) 80 | message("FastSIMD: \"${simd_library_name}\" No FEATURE_SETS specified, using default feature sets") 81 | 
set(fastsimd_create_dispatch_library_FEATURE_SETS 82 | SSE2 83 | SSE41 84 | AVX2 85 | AVX512 86 | NEON 87 | AARCH64 88 | WASM) 89 | endif() 90 | 91 | add_library(${simd_library_name} OBJECT) 92 | 93 | set(simd_library_source_dir "${CMAKE_CURRENT_BINARY_DIR}/fastsimd/${simd_library_name}") 94 | 95 | target_compile_definitions(${simd_library_name} PRIVATE FASTSIMD_EXPORT FASTSIMD_LIBRARY_NAME=${simd_library_name}) 96 | target_link_libraries(${simd_library_name} PRIVATE FastSIMD) 97 | 98 | target_include_directories(${simd_library_name} PUBLIC 99 | $ 100 | $) 101 | 102 | if(BUILD_SHARED_LIBS) 103 | set_target_properties(${simd_library_name} PROPERTIES POSITION_INDEPENDENT_CODE ON) 104 | endif() 105 | 106 | if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU") 107 | set_target_properties(${simd_library_name} PROPERTIES COMPILE_FLAGS "-Wno-ignored-attributes") 108 | endif() 109 | 110 | if(MINGW) 111 | target_compile_options(${simd_library_name} PRIVATE -Wa,-muse-unaligned-vector-move) 112 | endif() 113 | 114 | if(fastsimd_create_dispatch_library_RELAXED) 115 | target_compile_definitions(${simd_library_name} PUBLIC FASTSIMD_IS_RELAXED=1) 116 | set(relaxed_log_msg " (RELAXED)") 117 | endif() 118 | 119 | set(feature_set_list "") 120 | set(feature_set_list_debug "") 121 | 122 | foreach(feature_set ${fastsimd_create_dispatch_library_FEATURE_SETS}) 123 | if(APPLE AND (NOT CMAKE_OSX_ARCHITECTURES STREQUAL "") AND (NOT feature_set STREQUAL "SCALAR")) 124 | # Loop through OSX arches and test compile on each separately 125 | foreach(CMAKE_OSX_ARCHITECTURES ${CMAKE_OSX_ARCHITECTURES}) 126 | #message(STATUS "${CMAKE_OSX_ARCHITECTURES} ${feature_set}") 127 | try_compile( 128 | compile_result_unused 129 | "${CMAKE_BINARY_DIR}" 130 | "${FastSIMD_SOURCE_DIR}/cmake/ArchDetect.cpp" 131 | OUTPUT_VARIABLE COMPILE_OUTPUT 132 | COMPILE_DEFINITIONS -DTEST_FEATURE_SET_ACTIVE=${feature_set} 133 | ) 134 | 135 | #message(STATUS ${COMPILE_OUTPUT}) 136 | if (COMPILE_OUTPUT MATCHES 
"FASTSIMD_ARCH<([^\"=]+)=([^>]+)") 137 | set(feature_arch_detect "FASTSIMD_CURRENT_ARCH_IS( ${CMAKE_MATCH_1} )") 138 | fastsimd_add_feature_set_source(${fastsimd_create_dispatch_library_SOURCES} ${feature_set} ${fastsimd_create_dispatch_library_RELAXED}) 139 | string(APPEND feature_set_list "#if ${feature_arch_detect}\n,FastSIMD::FeatureSet::${feature_set}\n#endif\n" ) 140 | list(APPEND feature_set_list_debug "${feature_set}") 141 | break() 142 | endif() 143 | endforeach() 144 | else() 145 | try_compile( 146 | compile_result_unused 147 | "${CMAKE_BINARY_DIR}" 148 | "${FastSIMD_SOURCE_DIR}/cmake/ArchDetect.cpp" 149 | OUTPUT_VARIABLE COMPILE_OUTPUT 150 | COMPILE_DEFINITIONS -DTEST_FEATURE_SET_ACTIVE=${feature_set} 151 | ) 152 | 153 | #message(STATUS ${COMPILE_OUTPUT}) 154 | if (COMPILE_OUTPUT MATCHES "FASTSIMD_ARCH<([^\">=]+)=([^\">]+)>") 155 | set(feature_arch_detect "1") 156 | fastsimd_add_feature_set_source(${fastsimd_create_dispatch_library_SOURCES} ${feature_set} ${fastsimd_create_dispatch_library_RELAXED}) 157 | string(APPEND feature_set_list ",FastSIMD::FeatureSet::${feature_set}\n" ) 158 | list(APPEND feature_set_list_debug "${feature_set}") 159 | endif() 160 | endif() 161 | endforeach() 162 | 163 | # Create array of compiled feature sets for lookup in FastSIMD::New() 164 | configure_file("${FastSIMD_SOURCE_DIR}/dispatch/cmake/simd_lib_config.h.in" "${simd_library_source_dir}/include/FastSIMD/${simd_library_name}_config.h") 165 | 166 | message(STATUS "FastSIMD: Created dispatch library \"${simd_library_name}\" with Feature Sets${relaxed_log_msg}: ${feature_set_list_debug}") 167 | 168 | endfunction() 169 | -------------------------------------------------------------------------------- /dispatch/cmake/feature_set_source.cpp.in: -------------------------------------------------------------------------------- 1 | #define FASTSIMD_MAX_FEATURE_SET ${feature_set} 2 | #include 3 | 4 | #if ${feature_arch_detect} 5 | #include 6 | 7 | #include 
"${FastSIMD_SOURCE_DIR}/dispatch/impl/DispatchClassImpl.h" 8 | #include "${simd_inl_full}" 9 | #endif 10 | -------------------------------------------------------------------------------- /dispatch/cmake/simd_lib_config.h.in: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | 6 | namespace FastSIMD 7 | { 8 | namespace ${simd_library_name} 9 | { 10 | using CompiledFeatureSets = FeatureSetList<0 11 | ${feature_set_list}>; 12 | } 13 | } 14 | -------------------------------------------------------------------------------- /dispatch/impl/DispatchClassImpl.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | 5 | #include 6 | 7 | namespace FastSIMD 8 | { 9 | template 10 | class DispatchClass; 11 | 12 | template 13 | struct DispatchClassFactory 14 | { 15 | template 16 | FS_NEVERINLINE static T* New( FastSIMD::MemoryAllocator allocator ); 17 | }; 18 | 19 | // Make sure we only instantiate DispatchClass for the current feature set 20 | template<> 21 | template 22 | FS_NEVERINLINE T* DispatchClassFactory::New( FastSIMD::MemoryAllocator allocator ) 23 | { 24 | constexpr auto SIMD = FeatureSetDefault(); 25 | 26 | if( allocator ) 27 | { 28 | void* alloc = allocator( sizeof( DispatchClass ), alignof( DispatchClass ) ); 29 | 30 | return new( alloc ) DispatchClass; 31 | } 32 | 33 | return new DispatchClass; 34 | } 35 | 36 | 37 | template 38 | class RegisterDispatchClass 39 | { 40 | static_assert( SIMD == FeatureSetDefault() ); 41 | 42 | // Never called, used to instantiate DispatchClassFactory::New() 43 | static auto Instantiate() 44 | { 45 | return &FastSIMD::DispatchClassFactory::template New; 46 | } 47 | }; 48 | 49 | // Compile FastSIMD::NewDispatchClass in minimum feature set compilation unit to avoid illegal instructions 50 | template 51 | class RegisterDispatchClass 52 | { 53 | // Never called, used to instantiate 
NewDispatchClass() 54 | static auto Instantiate() 55 | { 56 | return &FastSIMD::NewDispatchClass; 57 | } 58 | }; 59 | 60 | 61 | template 62 | FS_FORCEINLINE static T* DispatchClassFactoryIterator( FeatureSet maxFeatureSet, MemoryAllocator allocator ) 63 | { 64 | if( maxFeatureSet < SIMD ) 65 | { 66 | return nullptr; 67 | } 68 | 69 | constexpr auto NextCompiled = FastSIMD::FASTSIMD_LIBRARY_NAME::CompiledFeatureSets::NextAfter; 70 | 71 | if constexpr( NextCompiled != FeatureSet::Max ) 72 | { 73 | if( maxFeatureSet >= NextCompiled ) 74 | { 75 | return DispatchClassFactoryIterator( maxFeatureSet, allocator ); 76 | } 77 | } 78 | 79 | return DispatchClassFactory::template New( allocator ); 80 | } 81 | 82 | template 83 | FASTSIMD_API T* NewDispatchClass( FeatureSet maxFeatureSet, MemoryAllocator allocator ) 84 | { 85 | if( maxFeatureSet == FeatureSet::Max ) 86 | { 87 | maxFeatureSet = DetectCpuMaxFeatureSet(); 88 | } 89 | 90 | return DispatchClassFactoryIterator( maxFeatureSet, allocator ); 91 | } 92 | 93 | 94 | } // namespace FastSIMD 95 | -------------------------------------------------------------------------------- /examples/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_subdirectory(dispatch_library) 2 | add_subdirectory(header_only) 3 | -------------------------------------------------------------------------------- /examples/dispatch_library/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | 2 | fastsimd_create_dispatch_library(simd_example_dispatch_library SOURCES "example.inl") 3 | 4 | add_executable(example_dispatch_library "main.cpp") 5 | target_link_libraries(example_dispatch_library PRIVATE FastSIMD simd_example_dispatch_library) -------------------------------------------------------------------------------- /examples/dispatch_library/example.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 
#include 3 | #include 4 | 5 | class ExampleSIMD 6 | { 7 | public: 8 | virtual ~ExampleSIMD() = default; 9 | 10 | virtual void SimpleData( const float* in, float* out, std::size_t dataSize, float multiplier, float cutoff ) = 0; 11 | }; -------------------------------------------------------------------------------- /examples/dispatch_library/example.inl: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "example.h" 3 | 4 | template 5 | class FastSIMD::DispatchClass : public ExampleSIMD 6 | { 7 | void SimpleData( const float* in, float* out, std::size_t dataSize, float multiplier, float cutoff ) override 8 | { 9 | constexpr std::size_t N = 32; 10 | 11 | if constexpr( (SIMD & FastSIMD::FeatureFlag::AVX512_F) ) 12 | { 13 | auto vMultiplier = FS::f32( multiplier ); 14 | auto test = FS::NativeExec>( FS_BIND_INTRINSIC( _mm512_mul_ps ), vMultiplier, FS::LoadIncremented() ); 15 | 16 | FS::Store( out, test ); 17 | } 18 | 19 | //auto vInt = FS::i32( 1 ) + 2_i32; 20 | 21 | auto vMultiplier = FS::f32( multiplier ); 22 | auto vCutoff = FS::f32( cutoff ); 23 | 24 | for( std::size_t i = 0; i < dataSize; i += N ) 25 | { 26 | FS::f32 data = FS::Load( in + i ); 27 | 28 | data = FS::Select( data < vCutoff, data * vMultiplier, data ); 29 | 30 | FS::Store( out + i, data ); 31 | } 32 | } 33 | }; 34 | 35 | template class FastSIMD::RegisterDispatchClass; -------------------------------------------------------------------------------- /examples/dispatch_library/main.cpp: -------------------------------------------------------------------------------- 1 | #include "example.h" 2 | #include 3 | #include 4 | 5 | int main() 6 | { 7 | FastSIMD::FeatureSet featureSet = FastSIMD::DetectCpuMaxFeatureSet(); 8 | std::cout << FastSIMD::GetFeatureSetString( featureSet ) << std::endl; 9 | 10 | std::vector data; 11 | for( int i = 0; i < 32; i++ ) 12 | { 13 | data.push_back( (float)i ); 14 | } 15 | std::vector out( data.size() ); 16 | 17 | 
ExampleSIMD* simd = FastSIMD::NewDispatchClass(); 18 | 19 | simd->SimpleData( data.data(), out.data(), data.size(), 10, 17 ); 20 | 21 | for( std::size_t i = 0; i < data.size(); i++ ) 22 | { 23 | std::cout << data[i] << "\t: " << out[i] << std::endl; 24 | } 25 | 26 | return 0; 27 | } 28 | -------------------------------------------------------------------------------- /examples/header_only/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | 2 | add_executable(example_header_only "main.cpp") 3 | target_link_libraries(example_header_only PRIVATE FastSIMD) 4 | -------------------------------------------------------------------------------- /examples/header_only/main.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | int main() 4 | { 5 | auto distSq = FS::LoadIncremented>(); 6 | 7 | auto invSqrt = FS::InvSqrt( distSq ); 8 | 9 | auto dist = invSqrt * distSq; 10 | 11 | auto out = FS::Masked( invSqrt != FS::f32<4>( INFINITY ), dist ); 12 | 13 | return FS::Extract0( FS::Convert( out ) ); 14 | } 15 | -------------------------------------------------------------------------------- /include/FastSIMD/DispatchClass.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "Utility/FeatureEnums.h" 3 | 4 | #include 5 | 6 | namespace FastSIMD 7 | { 8 | using MemoryAllocator = void* (*)( std::size_t size, std::size_t align ); 9 | 10 | template 11 | FASTSIMD_API T* NewDispatchClass( FeatureSet maxFeatureSet = FeatureSet::Max, MemoryAllocator allocator = nullptr ); 12 | } 13 | -------------------------------------------------------------------------------- /include/FastSIMD/ToolSet.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "Utility/ArchDetect.h" 3 | #include "Utility/FeatureEnums.h" 4 | 5 | namespace FastSIMD 6 | { 7 | static_assert( 
FASTSIMD_DEFAULT_FEATURE_VALUE() <= FASTSIMD_MAX_FEATURE_VALUE(), 8 | "Default feature set must be <= to max feature set" ); 9 | 10 | template 11 | static constexpr FeatureSet FeatureSetDefault() 12 | { 13 | return SIMD; 14 | } 15 | 16 | template 17 | static constexpr bool IsRelaxed() 18 | { 19 | return RELAXED; 20 | } 21 | } // namespace FastSIMD 22 | 23 | #include "ToolSet/Generic/Functions.h" 24 | 25 | #include "ToolSet/Generic/Scalar.h" 26 | 27 | #if FASTSIMD_CURRENT_ARCH_IS( X86 ) 28 | #include "ToolSet/x86/x86.h" 29 | #elif FASTSIMD_CURRENT_ARCH_IS( ARM ) 30 | #include "ToolSet/ARM/ARM.h" 31 | #elif FASTSIMD_CURRENT_ARCH_IS( WASM ) 32 | #include "ToolSet/WASM/WASM.h" 33 | #endif 34 | 35 | -------------------------------------------------------------------------------- /include/FastSIMD/ToolSet/ARM/128/f32x4.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace FS 6 | { 7 | template 8 | struct Register> 9 | { 10 | static constexpr size_t ElementCount = 4; 11 | static constexpr auto FeatureFlags = SIMD; 12 | 13 | using NativeType = float32x4_t; 14 | using ElementType = float; 15 | using MaskType = m32; 16 | using MaskTypeArg = m32; 17 | 18 | FS_FORCEINLINE Register() = default; 19 | FS_FORCEINLINE Register( NativeType v ) : native( v ) { } 20 | FS_FORCEINLINE Register( float v ) : native( vdupq_n_f32( v ) ) { } 21 | 22 | FS_FORCEINLINE NativeType GetNative() const 23 | { 24 | return native; 25 | } 26 | 27 | FS_FORCEINLINE Register& operator +=( const Register& rhs ) 28 | { 29 | native = vaddq_f32( native, rhs.native ); 30 | return *this; 31 | } 32 | 33 | FS_FORCEINLINE Register& operator -=( const Register& rhs ) 34 | { 35 | native = vsubq_f32( native, rhs.native ); 36 | return *this; 37 | } 38 | 39 | FS_FORCEINLINE Register& operator *=( const Register& rhs ) 40 | { 41 | native = vmulq_f32( native, rhs.native ); 42 | return *this; 43 | } 44 | 45 | FS_FORCEINLINE Register& 
operator /=( const Register& rhs ) 46 | { 47 | if constexpr( SIMD & FastSIMD::FeatureFlag::AARCH64 ) 48 | { 49 | native = vdivq_f32( native, rhs.native ); 50 | } 51 | else 52 | { 53 | float32x4_t reciprocal = vrecpeq_f32( rhs.native ); 54 | // Additional Newton-Raphson iteration for accuracy 55 | reciprocal = vmulq_f32( vrecpsq_f32( rhs.native, reciprocal ), reciprocal ); 56 | reciprocal = vmulq_f32( vrecpsq_f32( rhs.native, reciprocal ), reciprocal ); 57 | 58 | native = vmulq_f32( native, reciprocal ); 59 | } 60 | 61 | return *this; 62 | } 63 | 64 | FS_FORCEINLINE Register& operator &=( const Register& rhs ) 65 | { 66 | native = vreinterpretq_f32_u32( vandq_u32( vreinterpretq_u32_f32( native ), vreinterpretq_u32_f32( rhs.native ) ) ); 67 | return *this; 68 | } 69 | 70 | FS_FORCEINLINE Register& operator |=( const Register& rhs ) 71 | { 72 | native = vreinterpretq_f32_u32( vorrq_u32( vreinterpretq_u32_f32( native ), vreinterpretq_u32_f32( rhs.native ) ) ); 73 | return *this; 74 | } 75 | 76 | FS_FORCEINLINE Register& operator ^=( const Register& rhs ) 77 | { 78 | native = vreinterpretq_f32_u32( veorq_u32( vreinterpretq_u32_f32( native ), vreinterpretq_u32_f32( rhs.native ) ) ); 79 | return *this; 80 | } 81 | 82 | FS_FORCEINLINE Register operator~() const 83 | { 84 | return vreinterpretq_f32_u32( vmvnq_u32( vreinterpretq_u32_f32( native ) ) ); 85 | } 86 | 87 | FS_FORCEINLINE Register operator-() const 88 | { 89 | return vnegq_f32( native ); 90 | } 91 | 92 | 93 | FS_FORCEINLINE MaskType operator ==( const Register& rhs ) const 94 | { 95 | return vceqq_f32( native, rhs.native ); 96 | } 97 | 98 | FS_FORCEINLINE MaskType operator !=( const Register& rhs ) const 99 | { 100 | return ~( *this == rhs ); 101 | } 102 | 103 | FS_FORCEINLINE MaskType operator >=( const Register& rhs ) const 104 | { 105 | return vcgeq_f32( native, rhs.native ); 106 | } 107 | 108 | FS_FORCEINLINE MaskType operator <=( const Register& rhs ) const 109 | { 110 | return vcleq_f32( native, rhs.native
); 111 | } 112 | 113 | FS_FORCEINLINE MaskType operator >( const Register& rhs ) const 114 | { 115 | return vcgtq_f32( native, rhs.native ); 116 | } 117 | 118 | FS_FORCEINLINE MaskType operator <( const Register& rhs ) const 119 | { 120 | return vcltq_f32( native, rhs.native ); 121 | } 122 | 123 | NativeType native; 124 | }; 125 | 126 | 127 | template>> 128 | FS_FORCEINLINE f32<4, SIMD> Load( TypeWrapper ptr ) 129 | { 130 | return vld1q_f32( ptr.value ); 131 | } 132 | 133 | template>> 134 | FS_FORCEINLINE void Store( typename f32<4, SIMD>::ElementType* ptr, const f32<4, SIMD>& a ) 135 | { 136 | vst1q_f32( ptr, a.native ); 137 | } 138 | 139 | template>> 140 | FS_FORCEINLINE float Extract0( const f32<4, SIMD>& a ) 141 | { 142 | return vgetq_lane_f32( a.native, 0 ); 143 | } 144 | 145 | template>> 146 | FS_FORCEINLINE f32<4, SIMD> Abs( const f32<4, SIMD>& a ) 147 | { 148 | return vabsq_f32( a.native ); 149 | } 150 | 151 | template>> 152 | FS_FORCEINLINE f32<4, SIMD> Round( const f32<4, SIMD>& a ) 153 | { 154 | /*if constexpr( SIMD & FastSIMD::FeatureFlag::SSE41 ) 155 | { 156 | return _mm_round_ps( a.native, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC ); 157 | } 158 | else 159 | { 160 | __m128i aInt = _mm_cvtps_epi32( a.native ); 161 | __m128 aIntF = _mm_cvtepi32_ps( aInt ); 162 | 163 | return _mm_xor_ps( aIntF, _mm_and_ps( _mm_castsi128_ps( _mm_cmpeq_epi32( aInt, _mm_set1_epi32( (-2147483647 - 1) ) ) ), _mm_xor_ps( a.native, aIntF ) ) ); 164 | }*/ 165 | return vrndnq_f32( a.native ); 166 | } 167 | 168 | template>> 169 | FS_FORCEINLINE f32<4, SIMD> Floor( const f32<4, SIMD>& a ) 170 | { 171 | return vrndmq_f32( a.native ); 172 | } 173 | 174 | template>> 175 | FS_FORCEINLINE f32<4, SIMD> Ceil( const f32<4, SIMD>& a ) 176 | { 177 | return vrndpq_f32( a.native ); 178 | } 179 | 180 | template>> 181 | FS_FORCEINLINE f32<4, SIMD> Trunc( const f32<4, SIMD>& a ) 182 | { 183 | return vrndq_f32( a.native ); 184 | } 185 | 186 | template>> 187 | FS_FORCEINLINE f32<4, SIMD> Min( 
const f32<4, SIMD>& a, const f32<4, SIMD>& b ) 188 | { 189 | return vminq_f32( a.native, b.native ); 190 | } 191 | 192 | template>> 193 | FS_FORCEINLINE f32<4, SIMD> Max( const f32<4, SIMD>& a, const f32<4, SIMD>& b ) 194 | { 195 | return vmaxq_f32( a.native, b.native ); 196 | } 197 | 198 | template>> 199 | FS_FORCEINLINE f32<4, SIMD> Select( const typename f32<4, SIMD>::MaskTypeArg& mask, const f32<4, SIMD>& ifTrue, const f32<4, SIMD>& ifFalse ) 200 | { 201 | return vbslq_f32( mask.native, ifTrue.native, ifFalse.native ); 202 | } 203 | 204 | 205 | template>> 206 | FS_FORCEINLINE f32<4, SIMD> Masked( const typename f32<4, SIMD>::MaskTypeArg& mask, const f32<4, SIMD>& a ) 207 | { 208 | return vreinterpretq_f32_u32( vandq_u32( vreinterpretq_u32_f32( a.native ), mask.native ) ); 209 | } 210 | 211 | template>> 212 | FS_FORCEINLINE f32<4, SIMD> MaskedIncrement( const typename f32<4, SIMD>::MaskTypeArg& mask, const f32<4, SIMD>& a ) 213 | { 214 | return vsubq_f32( a.native, vcvtq_f32_s32( vreinterpretq_s32_u32( mask.native ) ) ); 215 | } 216 | 217 | template>> 218 | FS_FORCEINLINE f32<4, SIMD> MaskedDecrement( const typename f32<4, SIMD>::MaskTypeArg& mask, const f32<4, SIMD>& a ) 219 | { 220 | return vaddq_f32( a.native, vcvtq_f32_s32( vreinterpretq_s32_u32( mask.native ) ) ); 221 | } 222 | 223 | 224 | template>, typename = EnableIfRelaxed> 225 | FS_FORCEINLINE f32<4, SIMD> Reciprocal( const f32<4, SIMD>& a ) 226 | { 227 | float32x4_t recip = vrecpeq_f32( a.native ); 228 | return vmulq_f32( recip, vrecpsq_f32( recip, a.native ) ); 229 | } 230 | 231 | template>, typename = EnableIfRelaxed> 232 | FS_FORCEINLINE f32<4, SIMD> InvSqrt( const f32<4, SIMD>& a ) 233 | { 234 | float32x4_t rsqrt = vrsqrteq_f32( a.native ); 235 | return vmulq_f32( rsqrt, vrsqrtsq_f32( vmulq_f32( a.native, rsqrt ), rsqrt ) ); 236 | } 237 | 238 | template>> 239 | FS_FORCEINLINE f32<4, SIMD> Sqrt( const f32<4, SIMD>& a ) 240 | { 241 | return vsqrtq_f32( a.native ); 242 | } 243 | 244 | template>, 
typename = EnableIfRelaxed> 245 | FS_FORCEINLINE f32<4, SIMD> FMulAdd( const f32<4, SIMD>& a, const f32<4, SIMD>& b, const f32<4, SIMD>& c ) 246 | { 247 | return vmlaq_f32( b.native, c.native, a.native ); 248 | } 249 | 250 | template>, typename = EnableIfRelaxed> 251 | FS_FORCEINLINE f32<4, SIMD> FNMulAdd( const f32<4, SIMD>& a, const f32<4, SIMD>& b, const f32<4, SIMD>& c ) 252 | { 253 | // Negated fused multiply-add: must use the multiply-SUBTRACT intrinsic, 254 | // otherwise this is byte-identical to FMulAdd above and the negation is lost. 255 | return vmlsq_f32( b.native, c.native, a.native ); 256 | } 257 | } 258 | -------------------------------------------------------------------------------- /include/FastSIMD/ToolSet/ARM/128/i32x4.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace FS 6 | { 7 | template 8 | struct Register> 9 | { 10 | static constexpr size_t ElementCount = 4; 11 | static constexpr auto FeatureFlags = SIMD; 12 | 13 | using NativeType = int32x4_t; 14 | using ElementType = std::int32_t; 15 | using MaskType = m32; 16 | using MaskTypeArg = m32; 17 | 18 | FS_FORCEINLINE Register() = default; 19 | FS_FORCEINLINE Register( NativeType v ) : native( v ) { } 20 | FS_FORCEINLINE Register( std::int32_t v ) : native( vdupq_n_s32( v ) ) { } 21 | 22 | FS_FORCEINLINE NativeType GetNative() const 23 | { 24 | return native; 25 | } 26 | 27 | FS_FORCEINLINE Register& operator +=( const Register& rhs ) 28 | { 29 | native = vaddq_s32( native, rhs.native ); 30 | return *this; 31 | } 32 | 33 | FS_FORCEINLINE Register& operator -=( const Register& rhs ) 34 | { 35 | native = vsubq_s32( native, rhs.native ); 36 | return *this; 37 | } 38 | 39 | FS_FORCEINLINE Register& operator *=( const Register& rhs ) 40 | { 41 | native = vmulq_s32( native, rhs.native ); 42 | return *this; 43 | } 44 | 45 | FS_FORCEINLINE Register& operator &=( const Register& rhs ) 46 | { 47 | native = vandq_s32( native, rhs.native ); 48 | return *this; 49 | } 50 | 51 | FS_FORCEINLINE Register& operator |=( const Register& rhs ) 52 | { 53 | native = vorrq_s32( native, rhs.native
); 54 | return *this; 55 | } 56 | 57 | FS_FORCEINLINE Register& operator ^=( const Register& rhs ) 58 | { 59 | native = veorq_s32( native, rhs.native ); 60 | return *this; 61 | } 62 | 63 | FS_FORCEINLINE Register& operator >>=( int rhs ) 64 | { 65 | native = vshlq_s32( native, vdupq_n_s32( -rhs ) ); 66 | return *this; 67 | } 68 | 69 | FS_FORCEINLINE Register& operator <<=( int rhs ) 70 | { 71 | native = vshlq_s32( native, vdupq_n_s32( rhs ) ); 72 | return *this; 73 | } 74 | 75 | FS_FORCEINLINE Register operator ~() const 76 | { 77 | return vmvnq_s32( native ); 78 | } 79 | 80 | FS_FORCEINLINE Register operator -() const 81 | { 82 | return vnegq_s32( native ); 83 | } 84 | 85 | 86 | FS_FORCEINLINE MaskType operator ==( const Register& rhs ) const 87 | { 88 | return vceqq_s32( native, rhs.native ); 89 | } 90 | 91 | FS_FORCEINLINE MaskType operator !=( const Register& rhs ) const 92 | { 93 | return ~(*this == rhs); 94 | } 95 | 96 | FS_FORCEINLINE MaskType operator >=( const Register& rhs ) const 97 | { 98 | return vcgeq_s32( native, rhs.native ); 99 | } 100 | 101 | FS_FORCEINLINE MaskType operator <=( const Register& rhs ) const 102 | { 103 | return vcleq_s32( native, rhs.native ); 104 | } 105 | 106 | FS_FORCEINLINE MaskType operator >( const Register& rhs ) const 107 | { 108 | return vcgtq_s32( native, rhs.native ); 109 | } 110 | 111 | FS_FORCEINLINE MaskType operator <( const Register& rhs ) const 112 | { 113 | return vcltq_s32( native, rhs.native ); 114 | } 115 | 116 | NativeType native; 117 | }; 118 | 119 | 120 | template>> 121 | FS_FORCEINLINE i32<4, SIMD> Load( TypeWrapper ptr ) 122 | { 123 | return vld1q_s32( ptr.value ); 124 | } 125 | 126 | template>> 127 | FS_FORCEINLINE void Store( typename i32<4, SIMD>::ElementType* ptr, const i32<4, SIMD>& a ) 128 | { 129 | vst1q_s32( ptr, a.native ); 130 | } 131 | 132 | template>> 133 | FS_FORCEINLINE int32_t Extract0( const i32<4, SIMD>& a ) 134 | { 135 | return vgetq_lane_s32( a.native, 0 ); 136 | } 137 | 138 | template>> 
139 | FS_FORCEINLINE i32<4, SIMD> Abs( const i32<4, SIMD>& a ) 140 | { 141 | return vabsq_s32( a.native ); 142 | } 143 | 144 | template>> 145 | FS_FORCEINLINE i32<4, SIMD> Min( const i32<4, SIMD>& a, const i32<4, SIMD>& b ) 146 | { 147 | return vminq_s32( a.native, b.native ); 148 | } 149 | 150 | template>> 151 | FS_FORCEINLINE i32<4, SIMD> Max( const i32<4, SIMD>& a, const i32<4, SIMD>& b ) 152 | { 153 | return vmaxq_s32( a.native, b.native ); 154 | } 155 | 156 | template>> 157 | FS_FORCEINLINE i32<4, SIMD> Select( const typename i32<4, SIMD>::MaskTypeArg& mask, const i32<4, SIMD>& ifTrue, const i32<4, SIMD>& ifFalse ) 158 | { 159 | return vbslq_s32( mask.native, ifTrue.native, ifFalse.native ); 160 | } 161 | 162 | template>> 163 | FS_FORCEINLINE i32<4, SIMD> BitShiftRightZeroExtend( const i32<4, SIMD>& a, int b ) 164 | { 165 | return vreinterpretq_s32_u32( vshlq_u32( vreinterpretq_u32_s32( a.native ), vdupq_n_s32( -b ) ) ); 166 | } 167 | 168 | 169 | template>> 170 | FS_FORCEINLINE i32<4, SIMD> Masked( const typename i32<4, SIMD>::MaskTypeArg& mask, const i32<4, SIMD>& a ) 171 | { 172 | return vandq_s32( vreinterpretq_s32_u32( mask.native ), a.native ); 173 | } 174 | 175 | template>> 176 | FS_FORCEINLINE i32<4, SIMD> MaskedIncrement( const typename i32<4, SIMD>::MaskTypeArg& mask, const i32<4, SIMD>& a ) 177 | { 178 | return vsubq_s32( a.native, vreinterpretq_s32_u32( mask.native ) ); 179 | } 180 | 181 | template>> 182 | FS_FORCEINLINE i32<4, SIMD> MaskedDecrement( const typename i32<4, SIMD>::MaskTypeArg& mask, const i32<4, SIMD>& a ) 183 | { 184 | return vaddq_s32( a.native, vreinterpretq_s32_u32( mask.native ) ); 185 | } 186 | } 187 | -------------------------------------------------------------------------------- /include/FastSIMD/ToolSet/ARM/128/m32x4.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace FS 6 | { 7 | namespace impl 8 | { 9 | struct ArmMaskBase32x4 10 | { 11 | 
uint32x4_t native; 12 | }; 13 | } 14 | 15 | template 16 | struct Register, 4, SIMD, std::enable_if_t> 17 | : std::conditional_t, 4, SIMD>> 18 | { 19 | static constexpr size_t ElementCount = 4; 20 | static constexpr auto FeatureFlags = SIMD; 21 | 22 | using NativeType = decltype(ArmMaskBase32x4::native); 23 | using ElementType = Mask<32, OPTIMISE_FLOAT>; 24 | using MaskType = Register; 25 | using MaskTypeArg = Register; 26 | 27 | FS_FORCEINLINE Register() = default; 28 | FS_FORCEINLINE Register( NativeType v ) { this->native = v; } 29 | 30 | FS_FORCEINLINE NativeType GetNative() const 31 | { 32 | return this->native; 33 | } 34 | 35 | FS_FORCEINLINE Register& operator &=( const Register& rhs ) 36 | { 37 | this->native = vandq_u32( this->native, rhs.native ); 38 | return *this; 39 | } 40 | 41 | FS_FORCEINLINE Register& operator |=( const Register& rhs ) 42 | { 43 | this->native = vorrq_u32( this->native, rhs.native ); 44 | return *this; 45 | } 46 | 47 | FS_FORCEINLINE Register& operator ^=( const Register& rhs ) 48 | { 49 | this->native = veorq_u32( this->native, rhs.native ); 50 | return *this; 51 | } 52 | 53 | FS_FORCEINLINE Register operator ~() const 54 | { 55 | return vmvnq_u32( this->native ); 56 | } 57 | }; 58 | 59 | template, 4, SIMD>>> 60 | FS_FORCEINLINE bool AnyMask( const Register, 4, SIMD>& a ) 61 | { 62 | if constexpr( SIMD & FastSIMD::FeatureFlag::AARCH64 ) 63 | { 64 | return vmaxvq_u32( a.native ); 65 | } 66 | else 67 | { 68 | uint32x2_t tmp = vorr_u32( vget_low_u32( a.native ), vget_high_u32( a.native ) ); 69 | return (bool)vget_lane_u32( vpmax_u32( tmp, tmp ), 0 ); 70 | } 71 | } 72 | 73 | template, 4, SIMD>>> 74 | FS_FORCEINLINE BitStorage<4> BitMask( const Register, 4, SIMD>& a ) 75 | { 76 | if constexpr( SIMD & FastSIMD::FeatureFlag::AARCH64 ) 77 | { 78 | static const int32_t shift[4] = { 0, 1, 2, 3 }; 79 | uint32x4_t tmp = vshrq_n_u32( a.native, 31 ); 80 | return vaddvq_u32( vshlq_u32( tmp, vld1q_s32( shift ) ) ); 81 | } 82 | else 83 | { 84 | // 
Shift out everything but the sign bits with a 32-bit unsigned shift 85 | // right. 86 | uint64x2_t high_bits = vreinterpretq_u64_u32( vshrq_n_u32( a.native, 31 ) ); 87 | // Merge the two pairs together with a 64-bit unsigned shift right + add. 88 | uint8x16_t paired = 89 | vreinterpretq_u8_u64( vsraq_n_u64( high_bits, high_bits, 31 ) ); 90 | // Extract the result. 91 | return vgetq_lane_u8( paired, 0 ) | ( vgetq_lane_u8( paired, 8 ) << 2 ); 92 | } 93 | } 94 | 95 | } -------------------------------------------------------------------------------- /include/FastSIMD/ToolSet/ARM/ARM.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #if FASTSIMD_MAX_FEATURE_VALUE() >= FASTSIMD_FEATURE_VALUE( NEON ) 4 | #include "NEON.h" 5 | #endif 6 | -------------------------------------------------------------------------------- /include/FastSIMD/ToolSet/ARM/NEON.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | 4 | #include "128/f32x4.h" 5 | #include "128/i32x4.h" 6 | #include "128/m32x4.h" 7 | 8 | namespace FS 9 | { 10 | template>> 11 | FS_FORCEINLINE i32<4, SIMD> Convert( const f32<4, SIMD>& a, TypeDummy ) 12 | { 13 | return vcvtq_s32_f32( Round( a ).native ); 14 | } 15 | 16 | template>> 17 | FS_FORCEINLINE f32<4, SIMD> Convert( const i32<4, SIMD>& a, TypeDummy ) 18 | { 19 | return vcvtq_f32_s32( a.native ); 20 | } 21 | 22 | template>> 23 | FS_FORCEINLINE Register Cast( const Register& a, TypeDummy ) 24 | { 25 | if constexpr( 26 | std::is_same_v::NativeType, float32x4_t> && 27 | std::is_same_v::NativeType, int32x4_t> ) 28 | { 29 | return vreinterpretq_s32_f32( a.GetNative() ); 30 | } 31 | else if constexpr( 32 | std::is_same_v::NativeType, int32x4_t> && 33 | std::is_same_v::NativeType, float32x4_t> ) 34 | { 35 | return vreinterpretq_f32_s32( a.GetNative() ); 36 | } 37 | else 38 | { 39 | return a.GetNative(); 40 | } 41 | } 42 | } 43 | 
-------------------------------------------------------------------------------- /include/FastSIMD/ToolSet/Generic/Register.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | #include 9 | #include 10 | 11 | #ifdef _MSC_VER 12 | #define FS_FORCEINLINE __forceinline 13 | #define FS_NEVERINLINE __declspec(noinline) 14 | #else 15 | #define FS_FORCEINLINE __attribute__( ( always_inline ) ) inline 16 | #define FS_NEVERINLINE __attribute__( ( noinline ) ) 17 | #endif 18 | 19 | #if FASTSIMD_CURRENT_ARCH_IS( WASM ) 20 | #define FS_VECTORCALL 21 | #elif defined( __clang__ ) 22 | #define FS_VECTORCALL __regcall 23 | #elif defined( _MSC_VER ) 24 | #define FS_VECTORCALL __vectorcall 25 | #else 26 | #define FS_VECTORCALL 27 | #endif 28 | 29 | namespace FS 30 | { 31 | template 32 | struct Register 33 | { 34 | static_assert( SIMD != FastSIMD::FeatureSet::Invalid, "Invalid FeatureSet" ); 35 | static_assert( N > 1, "Unknown Vector Type" ); 36 | static_assert( ( N & ( N - 1 ) ) == 0, "Vector size must be power of 2" ); 37 | 38 | static constexpr std::size_t ElementCount = N; 39 | static constexpr auto FeatureFlags = SIMD; 40 | 41 | using DoubledType = Register; 42 | using ElementType = T; 43 | using MaskType = Register; 44 | using MaskTypeArg = Register; 45 | 46 | Register() = default; 47 | Register( T v ) : v0( v ), v1( v ) { } 48 | Register( const DoubledType& v, const DoubledType& u ) : v0( v ), v1( u ) { } 49 | 50 | // Conversion for Mask -> Mask 51 | template 52 | FS_FORCEINLINE operator Register() const 53 | { 54 | return Register{ v0, v1 }; 55 | } 56 | 57 | FS_FORCEINLINE Register& operator +=( const Register& rhs ) 58 | { 59 | v0 += rhs.v0; 60 | v1 += rhs.v1; 61 | return *this; 62 | } 63 | 64 | FS_FORCEINLINE Register& operator -=( const Register& rhs ) 65 | { 66 | v0 -= rhs.v0; 67 | v1 -= rhs.v1; 68 | return *this; 69 | } 70 | 71 | FS_FORCEINLINE Register& 
operator *=( const Register& rhs ) 72 | { 73 | v0 *= rhs.v0; 74 | v1 *= rhs.v1; 75 | return *this; 76 | } 77 | 78 | FS_FORCEINLINE Register& operator /=( const Register& rhs ) 79 | { 80 | v0 /= rhs.v0; 81 | v1 /= rhs.v1; 82 | return *this; 83 | } 84 | 85 | FS_FORCEINLINE Register& operator &=( const Register& rhs ) 86 | { 87 | v0 &= rhs.v0; 88 | v1 &= rhs.v1; 89 | return *this; 90 | } 91 | 92 | FS_FORCEINLINE Register& operator |=( const Register& rhs ) 93 | { 94 | v0 |= rhs.v0; 95 | v1 |= rhs.v1; 96 | return *this; 97 | } 98 | 99 | FS_FORCEINLINE Register& operator ^=( const Register& rhs ) 100 | { 101 | v0 ^= rhs.v0; 102 | v1 ^= rhs.v1; 103 | return *this; 104 | } 105 | 106 | FS_FORCEINLINE Register& operator >>=( const Register& rhs ) 107 | { 108 | v0 >>= rhs.v0; 109 | v1 >>= rhs.v1; 110 | return *this; 111 | } 112 | 113 | FS_FORCEINLINE Register& operator <<=( const Register& rhs ) 114 | { 115 | v0 <<= rhs.v0; 116 | v1 <<= rhs.v1; 117 | return *this; 118 | } 119 | 120 | FS_FORCEINLINE Register& operator >>=( int rhs ) 121 | { 122 | v0 >>= rhs; 123 | v1 >>= rhs; 124 | return *this; 125 | } 126 | 127 | FS_FORCEINLINE Register& operator <<=( int rhs ) 128 | { 129 | v0 <<= rhs; 130 | v1 <<= rhs; 131 | return *this; 132 | } 133 | 134 | FS_FORCEINLINE Register operator -() const 135 | { 136 | return Register{ -this->v0, -this->v1 }; 137 | } 138 | 139 | FS_FORCEINLINE Register operator ~() const 140 | { 141 | return Register{ ~this->v0, ~this->v1 }; 142 | } 143 | 144 | 145 | FS_FORCEINLINE MaskType operator ==( const Register& rhs ) const 146 | { 147 | return MaskType{ v0 == rhs.v0, v1 == rhs.v1 }; 148 | } 149 | 150 | FS_FORCEINLINE MaskType operator !=( const Register& rhs ) const 151 | { 152 | return MaskType{ v0 != rhs.v0, v1 != rhs.v1 }; 153 | } 154 | 155 | FS_FORCEINLINE MaskType operator >=( const Register& rhs ) const 156 | { 157 | return MaskType{ v0 >= rhs.v0, v1 >= rhs.v1 }; 158 | } 159 | 160 | FS_FORCEINLINE MaskType operator <=( const Register& rhs ) const 
161 | { 162 | return MaskType{ v0 <= rhs.v0, v1 <= rhs.v1 }; 163 | } 164 | 165 | FS_FORCEINLINE MaskType operator >( const Register& rhs ) const 166 | { 167 | return MaskType{ v0 > rhs.v0, v1 > rhs.v1 }; 168 | } 169 | 170 | FS_FORCEINLINE MaskType operator <( const Register& rhs ) const 171 | { 172 | return MaskType{ v0 < rhs.v0, v1 < rhs.v1 }; 173 | } 174 | 175 | DoubledType v0, v1; 176 | }; 177 | 178 | template 179 | FS_FORCEINLINE static Register operator +( Register lhs, const Register& rhs ) 180 | { 181 | return lhs += rhs; 182 | } 183 | 184 | template 185 | FS_FORCEINLINE static Register operator -( Register lhs, const Register& rhs ) 186 | { 187 | return lhs -= rhs; 188 | } 189 | 190 | template 191 | FS_FORCEINLINE static Register operator *( Register lhs, const Register& rhs ) 192 | { 193 | return lhs *= rhs; 194 | } 195 | 196 | template 197 | FS_FORCEINLINE static Register operator /( Register lhs, const Register& rhs ) 198 | { 199 | return lhs /= rhs; 200 | } 201 | 202 | template 203 | FS_FORCEINLINE static Register operator &( Register lhs, const Register& rhs ) 204 | { 205 | return lhs &= rhs; 206 | } 207 | 208 | template 209 | FS_FORCEINLINE static Register operator |( Register lhs, const Register& rhs ) 210 | { 211 | return lhs |= rhs; 212 | } 213 | 214 | template 215 | FS_FORCEINLINE static Register operator ^( Register lhs, const Register& rhs ) 216 | { 217 | return lhs ^= rhs; 218 | } 219 | 220 | template 221 | FS_FORCEINLINE static Register operator <<( Register lhs, int rhs ) 222 | { 223 | return lhs <<= rhs; 224 | } 225 | 226 | template 227 | FS_FORCEINLINE static Register operator >>( Register lhs, int rhs ) 228 | { 229 | return lhs >>= rhs; 230 | } 231 | 232 | template 233 | FS_FORCEINLINE static Register operator<<( Register lhs, Register rhs ) 234 | { 235 | return lhs <<= rhs; 236 | } 237 | 238 | template 239 | FS_FORCEINLINE static Register operator>>( Register lhs, Register rhs ) 240 | { 241 | return lhs >>= rhs; 242 | } 243 | 244 | 
template 245 | struct TypeWrapper 246 | { 247 | using Type = T; 248 | using Half = TypeWrapper; 249 | 250 | FS_FORCEINLINE constexpr explicit TypeWrapper( T v ) : value( v ) { } 251 | 252 | FS_FORCEINLINE constexpr Half AsHalf() const 253 | { 254 | return Half( value ); 255 | } 256 | 257 | template 258 | FS_FORCEINLINE constexpr Half AsHalf( U offset ) const 259 | { 260 | return Half( value + offset ); 261 | } 262 | 263 | T value; 264 | }; 265 | 266 | template 267 | struct TypeDummy 268 | { 269 | using Type = T; 270 | }; 271 | 272 | template 273 | struct Mask 274 | { 275 | Mask() = delete; 276 | }; 277 | 278 | template 279 | struct IsNative : std::false_type { }; 280 | 281 | template 282 | struct IsNative> : std::true_type { }; 283 | 284 | template 285 | using EnableIfNative = typename T::NativeType; 286 | 287 | template 288 | using EnableIfNotNative = decltype( T::v0 ); 289 | 290 | template 291 | constexpr bool IsNativeV = IsNative::value; 292 | 293 | template 294 | using EnableIfRelaxed = std::enable_if_t()>; 295 | 296 | template 297 | using EnableIfNotRelaxed = std::enable_if_t()>; 298 | 299 | 300 | template 301 | using i32 = Register; 302 | 303 | template 304 | using f32 = Register; 305 | 306 | template 307 | using m32 = Register, N, SIMD>; 308 | 309 | template 310 | using BitStorage = std::tuple_element_t<( N > 8 ) + ( N > 16 ) + ( N > 32 ), 311 | std::tuple>; 312 | 313 | template 314 | static constexpr std::size_t NativeRegisterCount( FastSIMD::FeatureSet featureSet = FastSIMD::FeatureSetDefault() ); 315 | 316 | template<> 317 | constexpr std::size_t NativeRegisterCount( FastSIMD::FeatureSet featureSet ) 318 | { 319 | if( featureSet & FastSIMD::FeatureFlag::AVX512_F ) 320 | { 321 | return 16; 322 | } 323 | if( featureSet & FastSIMD::FeatureFlag::AVX ) 324 | { 325 | return 8; 326 | } 327 | if( featureSet & (FastSIMD::FeatureFlag::SSE | 328 | FastSIMD::FeatureFlag::NEON | FastSIMD::FeatureFlag::WASM) ) 329 | { 330 | return 4; 331 | } 332 | 333 | return 1; 334 | 
} 335 | 336 | template<> 337 | constexpr std::size_t NativeRegisterCount( FastSIMD::FeatureSet featureSet ) 338 | { 339 | if( featureSet & FastSIMD::FeatureFlag::AVX512_F ) 340 | { 341 | return 16; 342 | } 343 | if( featureSet & FastSIMD::FeatureFlag::AVX2 ) 344 | { 345 | return 8; 346 | } 347 | if( featureSet & (FastSIMD::FeatureFlag::SSE2 | 348 | FastSIMD::FeatureFlag::NEON | FastSIMD::FeatureFlag::WASM) ) 349 | { 350 | return 4; 351 | } 352 | 353 | return 1; 354 | } 355 | 356 | template<> 357 | constexpr std::size_t NativeRegisterCount>( FastSIMD::FeatureSet featureSet ) 358 | { 359 | if( featureSet & FastSIMD::FeatureFlag::AVX512_F ) 360 | { 361 | return 16; 362 | } 363 | if( featureSet & FastSIMD::FeatureFlag::AVX2 ) 364 | { 365 | return 8; 366 | } 367 | if( featureSet & (FastSIMD::FeatureFlag::SSE2 | 368 | FastSIMD::FeatureFlag::NEON | FastSIMD::FeatureFlag::WASM) ) 369 | { 370 | return 4; 371 | } 372 | 373 | return 1; 374 | } 375 | 376 | template 377 | using NativeRegister = Register( SIMD ), SIMD>; 378 | 379 | } // namespace FS 380 | -------------------------------------------------------------------------------- /include/FastSIMD/ToolSet/Generic/Scalar.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "Scalar/i32x1.h" 3 | #include "Scalar/f32x1.h" 4 | #include "Scalar/mNx1.h" 5 | 6 | namespace FS 7 | { 8 | template>> 9 | FS_FORCEINLINE i32<1, SIMD> Convert( const f32<1, SIMD>& a, TypeDummy ) 10 | { 11 | return static_cast( std::rintf( a.GetNative() ) ); 12 | } 13 | 14 | template>> 15 | FS_FORCEINLINE f32<1, SIMD> Convert( const i32<1, SIMD>& a, TypeDummy ) 16 | { 17 | return static_cast( a.GetNative() ); 18 | } 19 | 20 | template>> 21 | FS_FORCEINLINE Register Cast( const Register& a, TypeDummy ) 22 | { 23 | if constexpr( !std::is_same_v::NativeType, typename Register::NativeType> ) 24 | { 25 | union 26 | { 27 | typename Register::NativeType a; 28 | typename Register::NativeType b; 29 | } u; 
30 | 31 | u.a = a.GetNative(); 32 | 33 | return u.b; 34 | } 35 | else 36 | { 37 | return a.GetNative(); 38 | } 39 | } 40 | 41 | 42 | template>> 43 | FS_FORCEINLINE Register Load( TypeWrapper ptr ) 44 | { 45 | return *ptr.value; 46 | } 47 | 48 | template>> 49 | FS_FORCEINLINE void Store( T* ptr, const Register& a ) 50 | { 51 | *ptr = a.GetNative(); 52 | } 53 | 54 | template>> 55 | FS_FORCEINLINE T Extract0( const Register& a ) 56 | { 57 | return a.GetNative(); 58 | } 59 | 60 | template>> 61 | FS_FORCEINLINE Register Abs( const Register& a ) 62 | { 63 | return std::abs( a.GetNative() ); 64 | } 65 | 66 | template>> 67 | FS_FORCEINLINE Register Round( const Register& a ) 68 | { 69 | return std::rint( a.GetNative() ); 70 | } 71 | 72 | template>> 73 | FS_FORCEINLINE Register Ceil( const Register& a ) 74 | { 75 | return std::ceil( a.GetNative() ); 76 | } 77 | 78 | template>> 79 | FS_FORCEINLINE Register Floor( const Register& a ) 80 | { 81 | return std::floor( a.GetNative() ); 82 | } 83 | 84 | template>> 85 | FS_FORCEINLINE Register Trunc( const Register& a ) 86 | { 87 | return std::trunc( a.GetNative() ); 88 | } 89 | 90 | template>, typename = EnableIfNotRelaxed> 91 | FS_FORCEINLINE f32<1, SIMD> Modulus( const Register& a, const Register& b ) 92 | { 93 | return std::fmod( a.GetNative(), b.GetNative() ); 94 | } 95 | 96 | template>> 97 | FS_FORCEINLINE Register Min( const Register& a, const Register& b ) 98 | { 99 | return std::min( a.GetNative(), b.GetNative() ); 100 | } 101 | 102 | template>> 103 | FS_FORCEINLINE Register Max( const Register& a, const Register& b ) 104 | { 105 | return std::max( a.GetNative(), b.GetNative() ); 106 | } 107 | 108 | template>> 109 | FS_FORCEINLINE Register Select( const typename Register::MaskTypeArg& mask, const Register& ifTrue, const Register& ifFalse ) 110 | { 111 | return mask.GetNative() ? 
ifTrue : ifFalse; 112 | } 113 | 114 | 115 | 116 | template>> 117 | FS_FORCEINLINE Register BitwiseAndNot( const Register& a, const Register& b ) 118 | { 119 | return a & ~b; 120 | } 121 | 122 | template>> 123 | FS_FORCEINLINE Register Masked( const typename Register::MaskTypeArg& mask, const Register& a ) 124 | { 125 | return mask.native ? a : 0; 126 | } 127 | } 128 | -------------------------------------------------------------------------------- /include/FastSIMD/ToolSet/Generic/Scalar/f32x1.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | #include 6 | #include 7 | 8 | namespace FS 9 | { 10 | template 11 | struct Register> 12 | { 13 | static constexpr size_t ElementCount = 1; 14 | static constexpr auto FeatureFlags = SIMD; 15 | 16 | using NativeType = float; 17 | using ElementType = float; 18 | using MaskType = m32; 19 | using MaskTypeArg = m32; 20 | 21 | 22 | FS_FORCEINLINE Register() = default; 23 | FS_FORCEINLINE Register( NativeType v ) : native{ v } { } 24 | 25 | FS_FORCEINLINE NativeType GetNative() const 26 | { 27 | return native.f; 28 | } 29 | 30 | FS_FORCEINLINE Register& operator +=( const Register& rhs ) 31 | { 32 | native.f = native.f + rhs.native.f; 33 | return *this; 34 | } 35 | 36 | FS_FORCEINLINE Register& operator -=( const Register& rhs ) 37 | { 38 | native.f = native.f - rhs.native.f; 39 | return *this; 40 | } 41 | 42 | FS_FORCEINLINE Register& operator *=( const Register& rhs ) 43 | { 44 | native.f = native.f * rhs.native.f; 45 | return *this; 46 | } 47 | 48 | FS_FORCEINLINE Register& operator /=( const Register& rhs ) 49 | { 50 | native.f = native.f / rhs.native.f; 51 | return *this; 52 | } 53 | 54 | 55 | FS_FORCEINLINE Register& operator &=( const Register& rhs ) 56 | { 57 | native.i = native.i & rhs.native.i; 58 | return *this; 59 | } 60 | 61 | FS_FORCEINLINE Register& operator |=( const Register& rhs ) 62 | { 63 | native.i = native.i | rhs.native.i; 64 | return 
*this; 65 | } 66 | 67 | FS_FORCEINLINE Register& operator ^=( const Register& rhs ) 68 | { 69 | native.i = native.i ^ rhs.native.i; 70 | return *this; 71 | } 72 | 73 | FS_FORCEINLINE Register operator ~() const 74 | { 75 | Register reg; 76 | reg.native.i = ~native.i; 77 | return reg; 78 | } 79 | 80 | FS_FORCEINLINE Register operator -() const 81 | { 82 | return -native.f; 83 | } 84 | 85 | 86 | FS_FORCEINLINE MaskType operator ==( const Register& rhs ) const 87 | { 88 | return native.f == rhs.native.f; 89 | } 90 | 91 | FS_FORCEINLINE MaskType operator !=( const Register& rhs ) const 92 | { 93 | return native.f != rhs.native.f; 94 | } 95 | 96 | FS_FORCEINLINE MaskType operator >=( const Register& rhs ) const 97 | { 98 | return native.f >= rhs.native.f; 99 | } 100 | 101 | FS_FORCEINLINE MaskType operator <=( const Register& rhs ) const 102 | { 103 | return native.f <= rhs.native.f; 104 | } 105 | 106 | FS_FORCEINLINE MaskType operator >( const Register& rhs ) const 107 | { 108 | return native.f > rhs.native.f; 109 | } 110 | 111 | FS_FORCEINLINE MaskType operator <( const Register& rhs ) const 112 | { 113 | return native.f < rhs.native.f; 114 | } 115 | 116 | union 117 | { 118 | float f; 119 | std::int32_t i; 120 | } 121 | native; 122 | }; 123 | 124 | 125 | template>> 126 | FS_FORCEINLINE f32<1, SIMD> Sqrt( const f32<1, SIMD>& a ) 127 | { 128 | return std::sqrt( a.native.f ); 129 | } 130 | } 131 | -------------------------------------------------------------------------------- /include/FastSIMD/ToolSet/Generic/Scalar/i32x1.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | #include 6 | #include 7 | 8 | namespace FS 9 | { 10 | template 11 | struct Register> 12 | { 13 | static constexpr size_t ElementCount = 1; 14 | static constexpr auto FeatureFlags = SIMD; 15 | 16 | using NativeType = std::int32_t; 17 | using ElementType = std::int32_t; 18 | using MaskType = m32; 19 | using MaskTypeArg = m32; 20 | 
21 | 22 | FS_FORCEINLINE Register() = default; 23 | FS_FORCEINLINE Register( NativeType v ) : native( v ) { } 24 | 25 | FS_FORCEINLINE NativeType GetNative() const 26 | { 27 | return native; 28 | } 29 | 30 | FS_FORCEINLINE Register& operator +=( const Register& rhs ) 31 | { 32 | native = native + rhs.native; 33 | return *this; 34 | } 35 | 36 | FS_FORCEINLINE Register& operator -=( const Register& rhs ) 37 | { 38 | native = native - rhs.native; 39 | return *this; 40 | } 41 | 42 | FS_FORCEINLINE Register& operator *=( const Register& rhs ) 43 | { 44 | native = native * rhs.native; 45 | return *this; 46 | } 47 | 48 | FS_FORCEINLINE Register& operator &=( const Register& rhs ) 49 | { 50 | native = native & rhs.native; 51 | return *this; 52 | } 53 | 54 | FS_FORCEINLINE Register& operator |=( const Register& rhs ) 55 | { 56 | native = native | rhs.native; 57 | return *this; 58 | } 59 | 60 | FS_FORCEINLINE Register& operator ^=( const Register& rhs ) 61 | { 62 | native = native ^ rhs.native; 63 | return *this; 64 | } 65 | 66 | FS_FORCEINLINE Register& operator >>=( int rhs ) 67 | { 68 | native = native >> rhs; 69 | return *this; 70 | } 71 | 72 | FS_FORCEINLINE Register& operator <<=( int rhs ) 73 | { 74 | native = native << rhs; 75 | return *this; 76 | } 77 | 78 | FS_FORCEINLINE Register operator ~() const 79 | { 80 | return ~native; 81 | } 82 | 83 | FS_FORCEINLINE Register operator -() const 84 | { 85 | return -native; 86 | } 87 | 88 | 89 | FS_FORCEINLINE MaskType operator ==( const Register& rhs ) const 90 | { 91 | return native == rhs.native; 92 | } 93 | 94 | FS_FORCEINLINE MaskType operator !=( const Register& rhs ) const 95 | { 96 | return native != rhs.native; 97 | } 98 | 99 | FS_FORCEINLINE MaskType operator >=( const Register& rhs ) const 100 | { 101 | return native >= rhs.native; 102 | } 103 | 104 | FS_FORCEINLINE MaskType operator <=( const Register& rhs ) const 105 | { 106 | return native <= rhs.native; 107 | } 108 | 109 | FS_FORCEINLINE MaskType operator >( 
const Register& rhs ) const 110 | { 111 | return native > rhs.native; 112 | } 113 | 114 | FS_FORCEINLINE MaskType operator <( const Register& rhs ) const 115 | { 116 | return native < rhs.native; 117 | } 118 | 119 | NativeType native; 120 | }; 121 | 122 | 123 | template>> 124 | FS_FORCEINLINE i32<1, SIMD> BitShiftRightZeroExtend( const i32<1, SIMD>& a, int b ) 125 | { 126 | return static_cast( static_cast( a.native ) >> b ); 127 | } 128 | } 129 | -------------------------------------------------------------------------------- /include/FastSIMD/ToolSet/Generic/Scalar/mNx1.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace FS 6 | { 7 | namespace impl 8 | { 9 | struct GenericMaskBase 10 | { 11 | bool native; 12 | }; 13 | } 14 | 15 | template 16 | struct Register, 1, SIMD> 17 | : std::conditional_t, 1, SIMD>> 18 | { 19 | static constexpr size_t ElementCount = 1; 20 | static constexpr auto FeatureFlags = SIMD; 21 | 22 | using NativeType = bool; 23 | using ElementType = Mask; 24 | using MaskType = Register; 25 | using MaskTypeArg = Register; 26 | 27 | FS_FORCEINLINE Register() = default; 28 | FS_FORCEINLINE Register( NativeType v ) { this->native = v; } 29 | 30 | FS_FORCEINLINE NativeType GetNative() const 31 | { 32 | return this->native; 33 | } 34 | 35 | FS_FORCEINLINE Register& operator &=( const Register& rhs ) 36 | { 37 | this->native = this->native && rhs.native; 38 | return *this; 39 | } 40 | 41 | FS_FORCEINLINE Register& operator |=( const Register& rhs ) 42 | { 43 | this->native = this->native || rhs.native; 44 | return *this; 45 | } 46 | 47 | FS_FORCEINLINE Register& operator ^=( const Register& rhs ) 48 | { 49 | this->native = this->native ^ rhs.native; 50 | return *this; 51 | } 52 | 53 | FS_FORCEINLINE Register operator ~() const 54 | { 55 | return !this->native; 56 | } 57 | }; 58 | 59 | template, 1, SIMD>>> 60 | FS_FORCEINLINE Register, 1, SIMD> BitwiseAndNot( const Register, 
1, SIMD>& a, const Register, 1, SIMD>& b ) 61 | { 62 | return a.native && !b.native; 63 | } 64 | 65 | template, 1, SIMD>>> 66 | FS_FORCEINLINE bool AnyMask( const Register, 1, SIMD>& a ) 67 | { 68 | return a.native; 69 | } 70 | 71 | template, 1, SIMD>>> 72 | FS_FORCEINLINE BitStorage<1> BitMask( const Register, 1, SIMD>& a ) 73 | { 74 | return static_cast>( a.native ); 75 | } 76 | } 77 | -------------------------------------------------------------------------------- /include/FastSIMD/ToolSet/WASM/128/f32x4.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace FS 6 | { 7 | template 8 | struct Register> 9 | { 10 | static constexpr size_t ElementCount = 4; 11 | static constexpr auto FeatureFlags = SIMD; 12 | 13 | using NativeType = __f32x4; 14 | using ElementType = float; 15 | using MaskType = m32; 16 | using MaskTypeArg = m32; 17 | 18 | FS_FORCEINLINE Register() = default; 19 | FS_FORCEINLINE Register( NativeType v ) : native( v ) { } 20 | FS_FORCEINLINE Register( float v ) : native( wasm_f32x4_splat( v ) ) { } 21 | 22 | FS_FORCEINLINE NativeType GetNative() const 23 | { 24 | return native; 25 | } 26 | 27 | FS_FORCEINLINE Register& operator +=( const Register& rhs ) 28 | { 29 | native = wasm_f32x4_add( native, rhs.native ); 30 | return *this; 31 | } 32 | 33 | FS_FORCEINLINE Register& operator -=( const Register& rhs ) 34 | { 35 | native = wasm_f32x4_sub( native, rhs.native ); 36 | return *this; 37 | } 38 | 39 | FS_FORCEINLINE Register& operator *=( const Register& rhs ) 40 | { 41 | native = wasm_f32x4_mul( native, rhs.native ); 42 | return *this; 43 | } 44 | 45 | FS_FORCEINLINE Register& operator /=( const Register& rhs ) 46 | { 47 | native = wasm_f32x4_div( native, rhs.native ); 48 | return *this; 49 | } 50 | 51 | FS_FORCEINLINE Register& operator &=( const Register& rhs ) 52 | { 53 | native = wasm_v128_and( native, rhs.native ); 54 | return *this; 55 | } 56 | 57 | FS_FORCEINLINE 
Register& operator |=( const Register& rhs ) 58 | { 59 | native = wasm_v128_or( native, rhs.native ); 60 | return *this; 61 | } 62 | 63 | FS_FORCEINLINE Register& operator ^=( const Register& rhs ) 64 | { 65 | native = wasm_v128_xor( native, rhs.native ); 66 | return *this; 67 | } 68 | 69 | FS_FORCEINLINE Register operator~() const 70 | { 71 | return wasm_v128_not( native ); 72 | } 73 | 74 | FS_FORCEINLINE Register operator-() const 75 | { 76 | return wasm_f32x4_neg( native ); 77 | } 78 | 79 | 80 | FS_FORCEINLINE MaskType operator ==( const Register& rhs ) const 81 | { 82 | return wasm_f32x4_eq( native, rhs.native ); 83 | } 84 | 85 | FS_FORCEINLINE MaskType operator !=( const Register& rhs ) const 86 | { 87 | return ~( *this == rhs ); 88 | } 89 | 90 | FS_FORCEINLINE MaskType operator >=( const Register& rhs ) const 91 | { 92 | return wasm_f32x4_ge( native, rhs.native ); 93 | } 94 | 95 | FS_FORCEINLINE MaskType operator <=( const Register& rhs ) const 96 | { 97 | return wasm_f32x4_le( native, rhs.native ); 98 | } 99 | 100 | FS_FORCEINLINE MaskType operator >( const Register& rhs ) const 101 | { 102 | return wasm_f32x4_gt( native, rhs.native ); 103 | } 104 | 105 | FS_FORCEINLINE MaskType operator <( const Register& rhs ) const 106 | { 107 | return wasm_f32x4_lt( native, rhs.native ); 108 | } 109 | 110 | NativeType native; 111 | }; 112 | 113 | 114 | template>> 115 | FS_FORCEINLINE f32<4, SIMD> Load( TypeWrapper ptr ) 116 | { 117 | return wasm_v128_load( ptr.value ); 118 | } 119 | 120 | template>> 121 | FS_FORCEINLINE void Store( typename f32<4, SIMD>::ElementType* ptr, const f32<4, SIMD>& a ) 122 | { 123 | wasm_v128_store( ptr, a.native ); 124 | } 125 | 126 | template>> 127 | FS_FORCEINLINE float Extract0( const f32<4, SIMD>& a ) 128 | { 129 | return wasm_f32x4_extract_lane( a.native, 0 ); 130 | } 131 | 132 | template>> 133 | FS_FORCEINLINE f32<4, SIMD> Abs( const f32<4, SIMD>& a ) 134 | { 135 | return wasm_f32x4_abs( a.native ); 136 | } 137 | 138 | template>> 139 | 
FS_FORCEINLINE f32<4, SIMD> Round( const f32<4, SIMD>& a ) 140 | { 141 | return wasm_f32x4_nearest( a.native ); 142 | } 143 | 144 | template>> 145 | FS_FORCEINLINE f32<4, SIMD> Floor( const f32<4, SIMD>& a ) 146 | { 147 | return wasm_f32x4_floor( a.native ); 148 | } 149 | 150 | template>> 151 | FS_FORCEINLINE f32<4, SIMD> Trunc( const f32<4, SIMD>& a ) 152 | { 153 | return wasm_f32x4_trunc( a.native ); 154 | } 155 | 156 | template>> 157 | FS_FORCEINLINE f32<4, SIMD> Ceil( const f32<4, SIMD>& a ) 158 | { 159 | return wasm_f32x4_ceil( a.native ); 160 | } 161 | 162 | template>> 163 | FS_FORCEINLINE f32<4, SIMD> Min( const f32<4, SIMD>& a, const f32<4, SIMD>& b ) 164 | { 165 | if constexpr( FastSIMD::IsRelaxed() ) 166 | { 167 | return wasm_f32x4_relaxed_min( a.native, b.native ); 168 | } 169 | else 170 | { 171 | return wasm_f32x4_min( a.native, b.native ); 172 | } 173 | } 174 | 175 | template>> 176 | FS_FORCEINLINE f32<4, SIMD> Max( const f32<4, SIMD>& a, const f32<4, SIMD>& b ) 177 | { 178 | if constexpr( FastSIMD::IsRelaxed() ) 179 | { 180 | return wasm_f32x4_relaxed_max( a.native, b.native ); 181 | } 182 | else 183 | { 184 | return wasm_f32x4_max( a.native, b.native ); 185 | } 186 | } 187 | 188 | template>> 189 | FS_FORCEINLINE f32<4, SIMD> Select( const typename f32<4, SIMD>::MaskTypeArg& mask, const f32<4, SIMD>& ifTrue, const f32<4, SIMD>& ifFalse ) 190 | { 191 | return wasm_v128_bitselect( ifTrue.native, ifFalse.native, mask.native ); 192 | } 193 | 194 | 195 | template>> 196 | FS_FORCEINLINE f32<4, SIMD> Masked( const typename f32<4, SIMD>::MaskTypeArg& mask, const f32<4, SIMD>& a ) 197 | { 198 | return wasm_v128_and( mask.native, a.native ); 199 | } 200 | 201 | template>> 202 | FS_FORCEINLINE f32<4, SIMD> MaskedIncrement( const typename f32<4, SIMD>::MaskTypeArg& mask, const f32<4, SIMD>& a ) 203 | { 204 | return wasm_f32x4_sub( a.native, 205 | wasm_f32x4_convert_i32x4( static_cast( mask.native ) ) ); 206 | } 207 | 208 | template>> 209 | FS_FORCEINLINE f32<4, 
SIMD> MaskedDecrement( const typename f32<4, SIMD>::MaskTypeArg& mask, const f32<4, SIMD>& a ) 210 | { 211 | return wasm_f32x4_add( a.native, 212 | wasm_f32x4_convert_i32x4( static_cast( mask.native ) ) ); 213 | } 214 | 215 | 216 | template>> 217 | FS_FORCEINLINE f32<4, SIMD> Sqrt( const f32<4, SIMD>& a ) 218 | { 219 | return wasm_f32x4_sqrt( a.native ); 220 | } 221 | 222 | 223 | template>, typename = EnableIfRelaxed()> 224 | FS_FORCEINLINE f32<4, SIMD> FMulAdd( const f32<4, SIMD>& a, const f32<4, SIMD>& b, const f32<4, SIMD>& c ) 225 | { 226 | return wasm_f32x4_relaxed_madd( a.native, b.native, c.native ); 227 | } 228 | 229 | template>, typename = EnableIfRelaxed()> 230 | FS_FORCEINLINE f32<4, SIMD> FNMulAdd( const f32<4, SIMD>& a, const f32<4, SIMD>& b, const f32<4, SIMD>& c ) 231 | { 232 | return wasm_f32x4_relaxed_nmadd( a.native, b.native, c.native ); 233 | } 234 | } 235 | -------------------------------------------------------------------------------- /include/FastSIMD/ToolSet/WASM/128/i32x4.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace FS 6 | { 7 | template 8 | struct Register> 9 | { 10 | static constexpr size_t ElementCount = 4; 11 | static constexpr auto FeatureFlags = SIMD; 12 | 13 | using NativeType = v128_t; 14 | using ElementType = std::int32_t; 15 | using MaskType = m32; 16 | using MaskTypeArg = m32; 17 | 18 | FS_FORCEINLINE Register() = default; 19 | FS_FORCEINLINE Register( NativeType v ) : native( v ) { } 20 | FS_FORCEINLINE Register( std::int32_t v ) : native( wasm_i32x4_splat( v ) ) { } 21 | 22 | FS_FORCEINLINE NativeType GetNative() const 23 | { 24 | return native; 25 | } 26 | 27 | FS_FORCEINLINE Register& operator +=( const Register& rhs ) 28 | { 29 | native = wasm_i32x4_add( native, rhs.native ); 30 | return *this; 31 | } 32 | 33 | FS_FORCEINLINE Register& operator -=( const Register& rhs ) 34 | { 35 | native = wasm_i32x4_sub( native, rhs.native ); 36 | 
return *this; 37 | } 38 | 39 | FS_FORCEINLINE Register& operator *=( const Register& rhs ) 40 | { 41 | native = wasm_i32x4_mul( native, rhs.native ); 42 | return *this; 43 | } 44 | 45 | FS_FORCEINLINE Register& operator &=( const Register& rhs ) 46 | { 47 | native = wasm_v128_and( native, rhs.native ); 48 | return *this; 49 | } 50 | 51 | FS_FORCEINLINE Register& operator |=( const Register& rhs ) 52 | { 53 | native = wasm_v128_or( native, rhs.native ); 54 | return *this; 55 | } 56 | 57 | FS_FORCEINLINE Register& operator ^=( const Register& rhs ) 58 | { 59 | native = wasm_v128_xor( native, rhs.native ); 60 | return *this; 61 | } 62 | 63 | FS_FORCEINLINE Register& operator >>=( int rhs ) 64 | { 65 | native = wasm_i32x4_shr( native, rhs ); 66 | return *this; 67 | } 68 | 69 | FS_FORCEINLINE Register& operator <<=( int rhs ) 70 | { 71 | native = wasm_i32x4_shl( native, rhs ); 72 | return *this; 73 | } 74 | 75 | FS_FORCEINLINE Register operator ~() const 76 | { 77 | return wasm_v128_not( native ); 78 | } 79 | 80 | FS_FORCEINLINE Register operator -() const 81 | { 82 | return wasm_i32x4_neg( native ); 83 | } 84 | 85 | 86 | FS_FORCEINLINE MaskType operator ==( const Register& rhs ) const 87 | { 88 | return wasm_i32x4_eq( native, rhs.native ); 89 | } 90 | 91 | FS_FORCEINLINE MaskType operator !=( const Register& rhs ) const 92 | { 93 | return ~(*this == rhs); 94 | } 95 | 96 | FS_FORCEINLINE MaskType operator >=( const Register& rhs ) const 97 | { 98 | return wasm_i32x4_ge( native, rhs.native ); 99 | } 100 | 101 | FS_FORCEINLINE MaskType operator <=( const Register& rhs ) const 102 | { 103 | return wasm_i32x4_le( native, rhs.native ); 104 | } 105 | 106 | FS_FORCEINLINE MaskType operator >( const Register& rhs ) const 107 | { 108 | return wasm_i32x4_gt( native, rhs.native ); 109 | } 110 | 111 | FS_FORCEINLINE MaskType operator <( const Register& rhs ) const 112 | { 113 | return wasm_i32x4_lt( native, rhs.native ); 114 | } 115 | 116 | NativeType native; 117 | }; 118 | 119 | 
120 | template>> 121 | FS_FORCEINLINE i32<4, SIMD> Load( TypeWrapper ptr ) 122 | { 123 | return wasm_v128_load( ptr.value ); 124 | } 125 | 126 | template>> 127 | FS_FORCEINLINE void Store( typename i32<4, SIMD>::ElementType* ptr, const i32<4, SIMD>& a ) 128 | { 129 | wasm_v128_store( ptr, a.native ); 130 | } 131 | 132 | template>> 133 | FS_FORCEINLINE int32_t Extract0( const i32<4, SIMD>& a ) 134 | { 135 | return wasm_i32x4_extract_lane( a.native, 0 ); 136 | } 137 | 138 | template>> 139 | FS_FORCEINLINE i32<4, SIMD> Abs( const i32<4, SIMD>& a ) 140 | { 141 | return wasm_i32x4_abs( a.native ); 142 | } 143 | 144 | template>> 145 | FS_FORCEINLINE i32<4, SIMD> Min( const i32<4, SIMD>& a, const i32<4, SIMD>& b ) 146 | { 147 | return wasm_i32x4_min( a.native, b.native ); 148 | } 149 | 150 | template>> 151 | FS_FORCEINLINE i32<4, SIMD> Max( const i32<4, SIMD>& a, const i32<4, SIMD>& b ) 152 | { 153 | return wasm_i32x4_max( a.native, b.native ); 154 | } 155 | 156 | template>> 157 | FS_FORCEINLINE i32<4, SIMD> Select( const typename i32<4, SIMD>::MaskTypeArg& mask, const i32<4, SIMD>& ifTrue, const i32<4, SIMD>& ifFalse ) 158 | { 159 | return wasm_v128_bitselect( ifTrue.native, ifFalse.native, mask.native ); 160 | } 161 | 162 | template>> 163 | FS_FORCEINLINE i32<4, SIMD> BitShiftRightZeroExtend( const i32<4, SIMD>& a, int b ) 164 | { 165 | return wasm_u32x4_shr( a.native, b ); 166 | } 167 | 168 | 169 | template>> 170 | FS_FORCEINLINE i32<4, SIMD> Masked( const typename i32<4, SIMD>::MaskTypeArg& mask, const i32<4, SIMD>& a ) 171 | { 172 | return wasm_v128_and( mask.native, a.native ); 173 | } 174 | 175 | template>> 176 | FS_FORCEINLINE i32<4, SIMD> MaskedIncrement( const typename i32<4, SIMD>::MaskTypeArg& mask, const i32<4, SIMD>& a ) 177 | { 178 | return wasm_i32x4_sub( a.native, mask.native ); 179 | } 180 | 181 | template>> 182 | FS_FORCEINLINE i32<4, SIMD> MaskedDecrement( const typename i32<4, SIMD>::MaskTypeArg& mask, const i32<4, SIMD>& a ) 183 | { 184 | return 
wasm_i32x4_add( a.native, mask.native ); 185 | } 186 | } 187 | -------------------------------------------------------------------------------- /include/FastSIMD/ToolSet/WASM/128/m32x4.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace FS 6 | { 7 | namespace impl 8 | { 9 | struct WasmMaskBase32x4 10 | { 11 | v128_t native; 12 | }; 13 | } 14 | 15 | template 16 | struct Register, 4, SIMD, std::enable_if_t> 17 | : std::conditional_t, 4, SIMD>> 18 | { 19 | static constexpr size_t ElementCount = 4; 20 | static constexpr auto FeatureFlags = SIMD; 21 | 22 | using NativeType = decltype(WasmMaskBase32x4::native); 23 | using ElementType = Mask<32, OPTIMISE_FLOAT>; 24 | using MaskType = Register; 25 | using MaskTypeArg = Register; 26 | 27 | FS_FORCEINLINE Register() = default; 28 | FS_FORCEINLINE Register( NativeType v ) { this->native = v; } 29 | 30 | FS_FORCEINLINE NativeType GetNative() const 31 | { 32 | return this->native; 33 | } 34 | 35 | FS_FORCEINLINE Register& operator &=( const Register& rhs ) 36 | { 37 | this->native = wasm_v128_and( this->native, rhs.native ); 38 | return *this; 39 | } 40 | 41 | FS_FORCEINLINE Register& operator |=( const Register& rhs ) 42 | { 43 | this->native = wasm_v128_or( this->native, rhs.native ); 44 | return *this; 45 | } 46 | 47 | FS_FORCEINLINE Register& operator ^=( const Register& rhs ) 48 | { 49 | this->native = wasm_v128_xor( this->native, rhs.native ); 50 | return *this; 51 | } 52 | 53 | FS_FORCEINLINE Register operator ~() const 54 | { 55 | return wasm_v128_not( this->native ); 56 | } 57 | }; 58 | 59 | template, 4, SIMD>>> 60 | FS_FORCEINLINE bool AnyMask( const Register, 4, SIMD>& a ) 61 | { 62 | return wasm_v128_any_true(a.native); 63 | } 64 | 65 | template, 4, SIMD>>> 66 | FS_FORCEINLINE BitStorage<4> BitMask( const Register, 4, SIMD>& a ) 67 | { 68 | return wasm_i32x4_bitmask(a.native); 69 | } 70 | 71 | } 72 | 
-------------------------------------------------------------------------------- /include/FastSIMD/ToolSet/WASM/WASM.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #if FASTSIMD_MAX_FEATURE_VALUE() >= FASTSIMD_FEATURE_VALUE( WASM ) 4 | #include 5 | #include 6 | 7 | #include "128/f32x4.h" 8 | #include "128/i32x4.h" 9 | #include "128/m32x4.h" 10 | 11 | namespace FS 12 | { 13 | template>> 14 | FS_FORCEINLINE i32<4, SIMD> Convert( const f32<4, SIMD>& a, TypeDummy ) 15 | { 16 | if constexpr( FastSIMD::IsRelaxed() ) 17 | { 18 | return wasm_i32x4_relaxed_trunc_f32x4( Round( a ).native ); 19 | } 20 | else 21 | { 22 | return wasm_i32x4_trunc_sat_f32x4( Round( a ).native ); 23 | } 24 | } 25 | 26 | template>> 27 | FS_FORCEINLINE f32<4, SIMD> Convert( const i32<4, SIMD>& a, TypeDummy ) 28 | { 29 | return wasm_f32x4_convert_i32x4( a.native ); 30 | } 31 | 32 | template>> 33 | FS_FORCEINLINE Register Cast( const Register& a, TypeDummy ) 34 | { 35 | if constexpr( 36 | std::is_same_v::NativeType, __f32x4> && 37 | std::is_same_v::NativeType, v128_t> ) 38 | { 39 | return static_cast<__f32x4>( a.GetNative() ); 40 | } 41 | else if constexpr( 42 | std::is_same_v::NativeType, v128_t> && 43 | std::is_same_v::NativeType, __f32x4> ) 44 | { 45 | return static_cast( a.GetNative() ); 46 | } 47 | else 48 | { 49 | return a.GetNative(); 50 | } 51 | } 52 | } 53 | #endif 54 | -------------------------------------------------------------------------------- /include/FastSIMD/ToolSet/x86/128/f32x4.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace FS 6 | { 7 | template 8 | struct Register> 9 | { 10 | static constexpr size_t ElementCount = 4; 11 | static constexpr auto FeatureFlags = SIMD; 12 | 13 | using NativeType = __m128; 14 | using ElementType = float; 15 | using MaskType = m32; 16 | using MaskTypeArg = m32; 17 | 18 | FS_FORCEINLINE Register() = default; 
19 | FS_FORCEINLINE Register( NativeType v ) : native( v ) { } 20 | FS_FORCEINLINE Register( float v ) : native( _mm_set1_ps( v ) ) { } 21 | 22 | FS_FORCEINLINE NativeType GetNative() const 23 | { 24 | return native; 25 | } 26 | 27 | FS_FORCEINLINE Register& operator +=( const Register& rhs ) 28 | { 29 | native = _mm_add_ps( native, rhs.native ); 30 | return *this; 31 | } 32 | 33 | FS_FORCEINLINE Register& operator -=( const Register& rhs ) 34 | { 35 | native = _mm_sub_ps( native, rhs.native ); 36 | return *this; 37 | } 38 | 39 | FS_FORCEINLINE Register& operator *=( const Register& rhs ) 40 | { 41 | native = _mm_mul_ps( native, rhs.native ); 42 | return *this; 43 | } 44 | 45 | FS_FORCEINLINE Register& operator /=( const Register& rhs ) 46 | { 47 | native = _mm_div_ps( native, rhs.native ); 48 | return *this; 49 | } 50 | 51 | FS_FORCEINLINE Register& operator &=( const Register& rhs ) 52 | { 53 | native = _mm_and_ps( native, rhs.native ); 54 | return *this; 55 | } 56 | 57 | FS_FORCEINLINE Register& operator |=( const Register& rhs ) 58 | { 59 | native = _mm_or_ps( native, rhs.native ); 60 | return *this; 61 | } 62 | 63 | FS_FORCEINLINE Register& operator ^=( const Register& rhs ) 64 | { 65 | native = _mm_xor_ps( native, rhs.native ); 66 | return *this; 67 | } 68 | 69 | FS_FORCEINLINE Register operator~() const 70 | { 71 | const __m128i neg1 = _mm_set1_epi32( -1 ); 72 | return _mm_xor_ps( native, _mm_castsi128_ps( neg1 ) ); 73 | } 74 | 75 | FS_FORCEINLINE Register operator-() const 76 | { 77 | const __m128i minInt = _mm_set1_epi32( 0x80000000 ); 78 | return _mm_xor_ps( native, _mm_castsi128_ps( minInt ) ); 79 | } 80 | 81 | 82 | FS_FORCEINLINE MaskType operator ==( const Register& rhs ) const 83 | { 84 | return _mm_cmpeq_ps( native, rhs.native ); 85 | } 86 | 87 | FS_FORCEINLINE MaskType operator !=( const Register& rhs ) const 88 | { 89 | return _mm_cmpneq_ps( native, rhs.native ); 90 | } 91 | 92 | FS_FORCEINLINE MaskType operator >=( const Register& rhs ) const 93 | 
{ 94 | return _mm_cmpge_ps( native, rhs.native ); 95 | } 96 | 97 | FS_FORCEINLINE MaskType operator <=( const Register& rhs ) const 98 | { 99 | return _mm_cmple_ps( native, rhs.native ); 100 | } 101 | 102 | FS_FORCEINLINE MaskType operator >( const Register& rhs ) const 103 | { 104 | return _mm_cmpgt_ps( native, rhs.native ); 105 | } 106 | 107 | FS_FORCEINLINE MaskType operator <( const Register& rhs ) const 108 | { 109 | return _mm_cmplt_ps( native, rhs.native ); 110 | } 111 | 112 | NativeType native; 113 | }; 114 | 115 | 116 | template>> 117 | FS_FORCEINLINE f32<4, SIMD> Load( TypeWrapper ptr ) 118 | { 119 | return _mm_loadu_ps( ptr.value ); 120 | } 121 | 122 | template>> 123 | FS_FORCEINLINE void Store( typename f32<4, SIMD>::ElementType* ptr, const f32<4, SIMD>& a ) 124 | { 125 | _mm_storeu_ps( ptr, a.native ); 126 | } 127 | 128 | template>> 129 | FS_FORCEINLINE float Extract0( const f32<4, SIMD>& a ) 130 | { 131 | return _mm_cvtss_f32( a.native ); 132 | } 133 | 134 | template>> 135 | FS_FORCEINLINE f32<4, SIMD> Abs( const f32<4, SIMD>& a ) 136 | { 137 | const __m128i intMax = _mm_set1_epi32( 0x7FFFFFFF ); 138 | return _mm_and_ps( a.native, _mm_castsi128_ps( intMax ) ); 139 | } 140 | 141 | template>> 142 | FS_FORCEINLINE f32<4, SIMD> Round( const f32<4, SIMD>& a ) 143 | { 144 | if constexpr( SIMD & FastSIMD::FeatureFlag::SSE41 ) 145 | { 146 | return _mm_round_ps( a.native, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC ); 147 | } 148 | else 149 | { 150 | __m128i aInt = _mm_cvtps_epi32( a.native ); 151 | __m128 aIntF = _mm_cvtepi32_ps( aInt ); 152 | 153 | return _mm_xor_ps( aIntF, _mm_and_ps( _mm_castsi128_ps( _mm_cmpeq_epi32( aInt, _mm_set1_epi32( (-2147483647 - 1) ) ) ), _mm_xor_ps( a.native, aIntF ) ) ); 154 | } 155 | } 156 | 157 | template>> 158 | FS_FORCEINLINE f32<4, SIMD> Floor( const f32<4, SIMD>& a ) 159 | { 160 | if constexpr( SIMD & FastSIMD::FeatureFlag::SSE41 ) 161 | { 162 | return _mm_round_ps( a.native, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC ); 
163 | } 164 | else 165 | { 166 | f32<4, SIMD> aRound = Round( a ); 167 | 168 | return MaskedDecrement( aRound > a, aRound ); 169 | } 170 | } 171 | 172 | template>> 173 | FS_FORCEINLINE f32<4, SIMD> Ceil( const f32<4, SIMD>& a ) 174 | { 175 | if constexpr( SIMD & FastSIMD::FeatureFlag::SSE41 ) 176 | { 177 | return _mm_round_ps( a.native, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC ); 178 | } 179 | else 180 | { 181 | f32<4, SIMD> aRound = Round( a ); 182 | 183 | return MaskedIncrement( aRound < a, aRound ); 184 | } 185 | } 186 | 187 | template>> 188 | FS_FORCEINLINE f32<4, SIMD> Trunc( const f32<4, SIMD>& a ) 189 | { 190 | if constexpr( SIMD & FastSIMD::FeatureFlag::SSE41 ) 191 | { 192 | return _mm_round_ps( a.native, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC ); 193 | } 194 | else 195 | { 196 | __m128i aInt = _mm_cvttps_epi32( a.native ); 197 | __m128 aIntF = _mm_cvtepi32_ps( aInt ); 198 | 199 | return _mm_xor_ps( aIntF, _mm_and_ps( _mm_castsi128_ps( _mm_cmpeq_epi32( aInt, _mm_set1_epi32( (-2147483647 - 1) ) ) ), _mm_xor_ps( a.native, aIntF ) ) ); 200 | 201 | } 202 | } 203 | 204 | template>> 205 | FS_FORCEINLINE f32<4, SIMD> Min( const f32<4, SIMD>& a, const f32<4, SIMD>& b ) 206 | { 207 | return _mm_min_ps( a.native, b.native ); 208 | } 209 | 210 | template>> 211 | FS_FORCEINLINE f32<4, SIMD> Max( const f32<4, SIMD>& a, const f32<4, SIMD>& b ) 212 | { 213 | return _mm_max_ps( a.native, b.native ); 214 | } 215 | 216 | template>> 217 | FS_FORCEINLINE f32<4, SIMD> Select( const typename f32<4, SIMD>::MaskTypeArg& mask, const f32<4, SIMD>& ifTrue, const f32<4, SIMD>& ifFalse ) 218 | { 219 | if constexpr( SIMD & FastSIMD::FeatureFlag::SSE41 ) 220 | { 221 | return _mm_blendv_ps( ifFalse.native, ifTrue.native, mask.native ); 222 | } 223 | else 224 | { 225 | return _mm_xor_ps( ifFalse.native, _mm_and_ps( mask.native, _mm_xor_ps( ifTrue.native, ifFalse.native ) ) ); 226 | } 227 | } 228 | 229 | template>, typename = std::enable_if_t> 230 | FS_FORCEINLINE f32<4, SIMD> 
SelectHighBit( const Register& mask, const f32<4, SIMD>& ifTrue, const f32<4, SIMD>& ifFalse ) 231 | { 232 | return _mm_blendv_ps( ifFalse.native, ifTrue.native, FS::Cast( mask ).native ); 233 | } 234 | 235 | template>> 236 | FS_FORCEINLINE f32<4, SIMD> BitwiseAndNot( const f32<4, SIMD>& a, const f32<4, SIMD>& b ) 237 | { 238 | return _mm_andnot_ps( b.native, a.native ); 239 | } 240 | 241 | template>> 242 | FS_FORCEINLINE f32<4, SIMD> Masked( const typename f32<4, SIMD>::MaskTypeArg& mask, const f32<4, SIMD>& a ) 243 | { 244 | return _mm_and_ps( mask.native, a.native ); 245 | } 246 | 247 | template>> 248 | FS_FORCEINLINE f32<4, SIMD> InvMasked( const typename f32<4, SIMD>::MaskTypeArg& mask, const f32<4, SIMD>& a ) 249 | { 250 | return _mm_andnot_ps( mask.native, a.native ); 251 | } 252 | 253 | 254 | template>> 255 | FS_FORCEINLINE f32<4, SIMD> MaskedIncrement( const typename f32<4, SIMD>::MaskTypeArg& mask, const f32<4, SIMD>& a ) 256 | { 257 | return _mm_sub_ps( a.native, _mm_cvtepi32_ps( _mm_castps_si128( mask.native ) ) ); 258 | } 259 | 260 | template>> 261 | FS_FORCEINLINE f32<4, SIMD> MaskedDecrement( const typename f32<4, SIMD>::MaskTypeArg& mask, const f32<4, SIMD>& a ) 262 | { 263 | return _mm_add_ps( a.native, _mm_cvtepi32_ps( _mm_castps_si128( mask.native ) ) ); 264 | } 265 | 266 | 267 | template>, typename = EnableIfRelaxed()> 268 | FS_FORCEINLINE f32<4, SIMD> Reciprocal( const f32<4, SIMD>& a ) 269 | { 270 | return _mm_rcp_ps( a.native ); 271 | } 272 | 273 | template>, typename = EnableIfRelaxed()> 274 | FS_FORCEINLINE f32<4, SIMD> InvSqrt( const f32<4, SIMD>& a ) 275 | { 276 | return _mm_rsqrt_ps( a.native ); 277 | } 278 | 279 | template>> 280 | FS_FORCEINLINE f32<4, SIMD> Sqrt( const f32<4, SIMD>& a ) 281 | { 282 | return _mm_sqrt_ps( a.native ); 283 | } 284 | } 285 | -------------------------------------------------------------------------------- /include/FastSIMD/ToolSet/x86/128/i32x4.h: 
-------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace FS 6 | { 7 | template 8 | struct Register> 9 | { 10 | static constexpr size_t ElementCount = 4; 11 | static constexpr auto FeatureFlags = SIMD; 12 | 13 | using NativeType = __m128i; 14 | using ElementType = std::int32_t; 15 | using MaskType = m32; 16 | using MaskTypeArg = m32; 17 | 18 | FS_FORCEINLINE Register() = default; 19 | FS_FORCEINLINE Register( NativeType v ) : native( v ) { } 20 | FS_FORCEINLINE Register( std::int32_t v ) : native( _mm_set1_epi32( v ) ) { } 21 | 22 | FS_FORCEINLINE NativeType GetNative() const 23 | { 24 | return native; 25 | } 26 | 27 | FS_FORCEINLINE Register& operator +=( const Register& rhs ) 28 | { 29 | native = _mm_add_epi32( native, rhs.native ); 30 | return *this; 31 | } 32 | 33 | FS_FORCEINLINE Register& operator -=( const Register& rhs ) 34 | { 35 | native = _mm_sub_epi32( native, rhs.native ); 36 | return *this; 37 | } 38 | 39 | FS_FORCEINLINE Register& operator *=( const Register& rhs ) 40 | { 41 | if constexpr( SIMD & FastSIMD::FeatureFlag::SSE41 ) 42 | { 43 | native = _mm_mullo_epi32( native, rhs.native ); 44 | } 45 | else 46 | { 47 | __m128i mul20 = _mm_mul_epu32( native, rhs.native ); /* mul 2,0*/ 48 | __m128i mul31 = _mm_mul_epu32( _mm_srli_si128( native, 4 ), _mm_srli_si128( rhs.native, 4 ) ); /* mul 3,1 */ 49 | native = _mm_unpacklo_epi32( _mm_shuffle_epi32( mul20, _MM_SHUFFLE( 0, 0, 2, 0 ) ), _mm_shuffle_epi32( mul31, _MM_SHUFFLE( 0, 0, 2, 0 ) ) ); /* shuffle results to [63..0] and pack */ 50 | } 51 | return *this; 52 | } 53 | 54 | FS_FORCEINLINE Register& operator &=( const Register& rhs ) 55 | { 56 | native = _mm_and_si128( native, rhs.native ); 57 | return *this; 58 | } 59 | 60 | FS_FORCEINLINE Register& operator |=( const Register& rhs ) 61 | { 62 | native = _mm_or_si128( native, rhs.native ); 63 | return *this; 64 | } 65 | 66 | FS_FORCEINLINE Register& operator ^=( const Register& rhs 
) 67 | { 68 | native = _mm_xor_si128( native, rhs.native ); 69 | return *this; 70 | } 71 | 72 | FS_FORCEINLINE Register& operator >>=( int rhs ) 73 | { 74 | native = _mm_srai_epi32( native, rhs ); 75 | return *this; 76 | } 77 | 78 | FS_FORCEINLINE Register& operator <<=( int rhs ) 79 | { 80 | native = _mm_slli_epi32( native, rhs ); 81 | return *this; 82 | } 83 | 84 | FS_FORCEINLINE Register operator ~() const 85 | { 86 | const __m128i neg1 = _mm_set1_epi32( -1 ); 87 | return _mm_xor_si128( native, neg1 ); 88 | } 89 | 90 | FS_FORCEINLINE Register operator -() const 91 | { 92 | return _mm_sub_epi32( _mm_setzero_si128(), native ); 93 | } 94 | 95 | 96 | FS_FORCEINLINE MaskType operator ==( const Register& rhs ) const 97 | { 98 | return _mm_cmpeq_epi32( native, rhs.native ); 99 | } 100 | 101 | FS_FORCEINLINE MaskType operator !=( const Register& rhs ) const 102 | { 103 | return ~(*this == rhs); 104 | } 105 | 106 | FS_FORCEINLINE MaskType operator >=( const Register& rhs ) const 107 | { 108 | return ~(*this < rhs); 109 | } 110 | 111 | FS_FORCEINLINE MaskType operator <=( const Register& rhs ) const 112 | { 113 | return ~(*this > rhs); 114 | } 115 | 116 | FS_FORCEINLINE MaskType operator >( const Register& rhs ) const 117 | { 118 | return _mm_cmpgt_epi32( native, rhs.native ); 119 | } 120 | 121 | FS_FORCEINLINE MaskType operator <( const Register& rhs ) const 122 | { 123 | return _mm_cmplt_epi32( native, rhs.native ); 124 | } 125 | 126 | NativeType native; 127 | }; 128 | 129 | 130 | template>> 131 | FS_FORCEINLINE i32<4, SIMD> Load( TypeWrapper ptr ) 132 | { 133 | return _mm_loadu_si128( (__m128i*)ptr.value ); 134 | } 135 | 136 | template>> 137 | FS_FORCEINLINE void Store( typename i32<4, SIMD>::ElementType* ptr, const i32<4, SIMD>& a ) 138 | { 139 | _mm_storeu_si128( (__m128i*)ptr, a.native ); 140 | } 141 | 142 | template>> 143 | FS_FORCEINLINE int32_t Extract0( const i32<4, SIMD>& a ) 144 | { 145 | return _mm_cvtsi128_si32( a.native ); 146 | } 147 | 148 | template>> 149 
| FS_FORCEINLINE i32<4, SIMD> Abs( const i32<4, SIMD>& a ) 150 | { 151 | if constexpr( SIMD & FastSIMD::FeatureFlag::SSSE3 ) 152 | { 153 | return _mm_abs_epi32( a.native ); 154 | } 155 | else 156 | { 157 | __m128i signMask = _mm_srai_epi32( a.native, 31 ); 158 | return _mm_sub_epi32( _mm_xor_si128( a.native, signMask ), signMask ); 159 | } 160 | } 161 | 162 | template>> 163 | FS_FORCEINLINE i32<4, SIMD> Min( const i32<4, SIMD>& a, const i32<4, SIMD>& b ) 164 | { 165 | if constexpr( SIMD & FastSIMD::FeatureFlag::SSE41 ) 166 | { 167 | return _mm_min_epi32( a.native, b.native ); 168 | } 169 | else 170 | { 171 | return Select( a < b, a, b ); 172 | } 173 | } 174 | 175 | template>> 176 | FS_FORCEINLINE i32<4, SIMD> Max( const i32<4, SIMD>& a, const i32<4, SIMD>& b ) 177 | { 178 | if constexpr( SIMD & FastSIMD::FeatureFlag::SSE41 ) 179 | { 180 | return _mm_max_epi32( a.native, b.native ); 181 | } 182 | else 183 | { 184 | return Select( a > b, a, b ); 185 | } 186 | } 187 | 188 | template>> 189 | FS_FORCEINLINE i32<4, SIMD> Select( const typename i32<4, SIMD>::MaskTypeArg& mask, const i32<4, SIMD>& ifTrue, const i32<4, SIMD>& ifFalse ) 190 | { 191 | if constexpr( SIMD & FastSIMD::FeatureFlag::SSE41 ) 192 | { 193 | return _mm_blendv_epi8( ifFalse.native, ifTrue.native, _mm_castps_si128( mask.native ) ); 194 | } 195 | else 196 | { 197 | return _mm_xor_si128( ifFalse.native, _mm_and_si128( _mm_castps_si128( mask.native ), _mm_xor_si128( ifTrue.native, ifFalse.native ) ) ); 198 | } 199 | } 200 | 201 | template>, typename = std::enable_if_t> 202 | FS_FORCEINLINE i32<4, SIMD> SelectHighBit( const Register& mask, const i32<4, SIMD>& ifTrue, const i32<4, SIMD>& ifFalse ) 203 | { 204 | return _mm_castps_si128( _mm_blendv_ps( _mm_castsi128_ps( ifFalse.native ), _mm_castsi128_ps( ifTrue.native ), FS::Cast( mask ).native ) ); 205 | } 206 | 207 | template>> 208 | FS_FORCEINLINE i32<4, SIMD> BitwiseAndNot( const i32<4, SIMD>& a, const i32<4, SIMD>& b ) 209 | { 210 | return 
_mm_andnot_si128( b.native, a.native ); 211 | } 212 | 213 | template>> 214 | FS_FORCEINLINE i32<4, SIMD> BitShiftRightZeroExtend( const i32<4, SIMD>& a, int b ) 215 | { 216 | return _mm_srli_epi32( a.native, b ); 217 | } 218 | 219 | 220 | template>> 221 | FS_FORCEINLINE i32<4, SIMD> Masked( const typename i32<4, SIMD>::MaskTypeArg& mask, const i32<4, SIMD>& a ) 222 | { 223 | return _mm_and_si128( _mm_castps_si128( mask.native ), a.native ); 224 | } 225 | 226 | template>> 227 | FS_FORCEINLINE i32<4, SIMD> InvMasked( const typename i32<4, SIMD>::MaskTypeArg& mask, const i32<4, SIMD>& a ) 228 | { 229 | return _mm_andnot_si128( _mm_castps_si128( mask.native ), a.native ); 230 | } 231 | 232 | template>> 233 | FS_FORCEINLINE i32<4, SIMD> MaskedIncrement( const typename i32<4, SIMD>::MaskTypeArg& mask, const i32<4, SIMD>& a ) 234 | { 235 | return _mm_sub_epi32( a.native, _mm_castps_si128( mask.native ) ); 236 | } 237 | 238 | template>> 239 | FS_FORCEINLINE i32<4, SIMD> MaskedDecrement( const typename i32<4, SIMD>::MaskTypeArg& mask, const i32<4, SIMD>& a ) 240 | { 241 | return _mm_add_epi32( a.native, _mm_castps_si128( mask.native ) ); 242 | } 243 | } 244 | -------------------------------------------------------------------------------- /include/FastSIMD/ToolSet/x86/128/m32x4.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace FS 6 | { 7 | template 8 | struct Register, 4, SIMD, std::enable_if_t> 9 | { 10 | static constexpr size_t ElementCount = 4; 11 | static constexpr auto FeatureFlags = SIMD; 12 | 13 | using NativeType = __m128; 14 | using ElementType = Mask<32, true>; 15 | using MaskType = Register; 16 | using MaskTypeArg = Register; 17 | 18 | FS_FORCEINLINE Register() = default; 19 | FS_FORCEINLINE Register( NativeType v ) : native( v ) { } 20 | 21 | FS_FORCEINLINE NativeType GetNative() const 22 | { 23 | return native; 24 | } 25 | 26 | FS_FORCEINLINE Register& operator &=( const 
Register& rhs ) 27 | { 28 | native = _mm_and_ps( native, rhs.native ); 29 | return *this; 30 | } 31 | 32 | FS_FORCEINLINE Register& operator |=( const Register& rhs ) 33 | { 34 | native = _mm_or_ps( native, rhs.native ); 35 | return *this; 36 | } 37 | 38 | FS_FORCEINLINE Register& operator ^=( const Register& rhs ) 39 | { 40 | native = _mm_xor_ps( native, rhs.native ); 41 | return *this; 42 | } 43 | 44 | FS_FORCEINLINE Register operator ~() const 45 | { 46 | const __m128i neg1 = _mm_set1_epi32( -1 ); 47 | return _mm_xor_ps( native, _mm_castsi128_ps( neg1 ) ); 48 | } 49 | 50 | NativeType native; 51 | }; 52 | 53 | template>> 54 | FS_FORCEINLINE m32<4, true, SIMD> BitwiseAndNot( const m32<4, true, SIMD>& a, const m32<4, true, SIMD>& b ) 55 | { 56 | return _mm_andnot_ps( b.native, a.native ); 57 | } 58 | 59 | template>> 60 | FS_FORCEINLINE bool AnyMask( const m32<4, true, SIMD>& a ) 61 | { 62 | return _mm_movemask_ps( a.native ); 63 | } 64 | 65 | template>> 66 | FS_FORCEINLINE BitStorage<4> BitMask( const m32<4, B, SIMD>& a ) 67 | { 68 | return static_cast>( _mm_movemask_ps( a.native ) ); 69 | } 70 | 71 | 72 | template 73 | struct Register, 4, SIMD, std::enable_if_t> : Register, 4, SIMD> 74 | { 75 | static constexpr size_t ElementCount = 4; 76 | static constexpr auto FeatureFlags = SIMD; 77 | 78 | using NativeType = __m128i; 79 | using ElementType = Mask<32, false>; 80 | using MaskType = Register; 81 | using MaskTypeArg = Register, 4, SIMD>; 82 | 83 | FS_FORCEINLINE Register() = default; 84 | FS_FORCEINLINE Register( NativeType v ) : Register, 4, SIMD>( _mm_castsi128_ps( v ) ) { } 85 | 86 | FS_FORCEINLINE NativeType GetNative() const 87 | { 88 | return _mm_castps_si128( this->native ); 89 | } 90 | 91 | FS_FORCEINLINE Register& operator &=( const Register& rhs ) 92 | { 93 | this->native = _mm_castsi128_ps( _mm_and_si128( _mm_castps_si128( this->native ), _mm_castps_si128( rhs.native ) ) ); 94 | return *this; 95 | } 96 | 97 | FS_FORCEINLINE Register& operator |=( const 
Register& rhs ) 98 | { 99 | this->native = _mm_castsi128_ps( _mm_or_si128( _mm_castps_si128( this->native ), _mm_castps_si128( rhs.native ) ) ); 100 | return *this; 101 | } 102 | 103 | FS_FORCEINLINE Register& operator ^=( const Register& rhs ) 104 | { 105 | this->native = _mm_castsi128_ps( _mm_xor_si128( _mm_castps_si128( this->native ), _mm_castps_si128( rhs.native ) ) ); 106 | return *this; 107 | } 108 | 109 | FS_FORCEINLINE Register operator ~() const 110 | { 111 | const __m128i neg1 = _mm_set1_epi32( -1 ); 112 | return _mm_xor_si128( _mm_castps_si128( this->native ), neg1 ); 113 | } 114 | }; 115 | 116 | template>> 117 | FS_FORCEINLINE m32<4, false, SIMD> BitwiseAndNot( const m32<4, false, SIMD>& a, const m32<4, false, SIMD>& b ) 118 | { 119 | return _mm_andnot_si128( _mm_castps_si128( b.native ), _mm_castps_si128( a.native ) ); 120 | } 121 | 122 | template>> 123 | FS_FORCEINLINE bool AnyMask( const m32<4, false, SIMD>& a ) 124 | { 125 | return _mm_movemask_epi8( _mm_castps_si128( a.native ) ); 126 | } 127 | } 128 | -------------------------------------------------------------------------------- /include/FastSIMD/ToolSet/x86/256/f32x8.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace FS 6 | { 7 | template 8 | struct Register> 9 | { 10 | static constexpr size_t ElementCount = 8; 11 | static constexpr auto FeatureFlags = SIMD; 12 | 13 | using NativeType = __m256; 14 | using ElementType = float; 15 | using MaskType = m32; 16 | using MaskTypeArg = m32; 17 | 18 | FS_FORCEINLINE Register() = default; 19 | FS_FORCEINLINE Register( NativeType v ) : native( v ) { } 20 | FS_FORCEINLINE Register( float v ) : native( _mm256_set1_ps( v ) ) { } 21 | 22 | FS_FORCEINLINE NativeType GetNative() const 23 | { 24 | return native; 25 | } 26 | 27 | FS_FORCEINLINE Register& operator +=( const Register& rhs ) 28 | { 29 | native = _mm256_add_ps( native, rhs.native ); 30 | return *this; 31 | } 32 | 33 
| FS_FORCEINLINE Register& operator -=( const Register& rhs ) 34 | { 35 | native = _mm256_sub_ps( native, rhs.native ); 36 | return *this; 37 | } 38 | 39 | FS_FORCEINLINE Register& operator *=( const Register& rhs ) 40 | { 41 | native = _mm256_mul_ps( native, rhs.native ); 42 | return *this; 43 | } 44 | 45 | FS_FORCEINLINE Register& operator /=( const Register& rhs ) 46 | { 47 | native = _mm256_div_ps( native, rhs.native ); 48 | return *this; 49 | } 50 | 51 | FS_FORCEINLINE Register& operator &=( const Register& rhs ) 52 | { 53 | native = _mm256_and_ps( native, rhs.native ); 54 | return *this; 55 | } 56 | 57 | FS_FORCEINLINE Register& operator |=( const Register& rhs ) 58 | { 59 | native = _mm256_or_ps( native, rhs.native ); 60 | return *this; 61 | } 62 | 63 | FS_FORCEINLINE Register& operator ^=( const Register& rhs ) 64 | { 65 | native = _mm256_xor_ps( native, rhs.native ); 66 | return *this; 67 | } 68 | 69 | FS_FORCEINLINE Register operator~() const 70 | { 71 | const __m256i neg1 = _mm256_set1_epi32( -1 ); 72 | return _mm256_xor_ps( native, _mm256_castsi256_ps( neg1 ) ); 73 | } 74 | 75 | FS_FORCEINLINE Register operator-() const 76 | { 77 | const __m256i minInt = _mm256_set1_epi32( 0x80000000 ); 78 | return _mm256_xor_ps( native, _mm256_castsi256_ps( minInt ) ); 79 | } 80 | 81 | 82 | FS_FORCEINLINE MaskType operator ==( const Register& rhs ) const 83 | { 84 | return _mm256_cmp_ps( native, rhs.native, _CMP_EQ_OQ ); 85 | } 86 | 87 | FS_FORCEINLINE MaskType operator !=( const Register& rhs ) const 88 | { 89 | return _mm256_cmp_ps( native, rhs.native, _CMP_NEQ_OQ ); 90 | } 91 | 92 | FS_FORCEINLINE MaskType operator >=( const Register& rhs ) const 93 | { 94 | return _mm256_cmp_ps( native, rhs.native, _CMP_GE_OQ ); 95 | } 96 | 97 | FS_FORCEINLINE MaskType operator <=( const Register& rhs ) const 98 | { 99 | return _mm256_cmp_ps( native, rhs.native, _CMP_LE_OQ ); 100 | } 101 | 102 | FS_FORCEINLINE MaskType operator >( const Register& rhs ) const 103 | { 104 | return 
_mm256_cmp_ps( native, rhs.native, _CMP_GT_OQ ); 105 | } 106 | 107 | FS_FORCEINLINE MaskType operator <( const Register& rhs ) const 108 | { 109 | return _mm256_cmp_ps( native, rhs.native, _CMP_LT_OQ ); 110 | } 111 | 112 | NativeType native; 113 | }; 114 | 115 | 116 | template>> 117 | FS_FORCEINLINE f32<8, SIMD> Load( TypeWrapper ptr ) 118 | { 119 | return _mm256_loadu_ps( ptr.value ); 120 | } 121 | 122 | template>> 123 | FS_FORCEINLINE void Store( typename f32<8, SIMD>::ElementType* ptr, const f32<8, SIMD>& a ) 124 | { 125 | _mm256_storeu_ps( ptr, a.native ); 126 | } 127 | 128 | template>> 129 | FS_FORCEINLINE float Extract0( const f32<8, SIMD>& a ) 130 | { 131 | return _mm256_cvtss_f32( a.native ); 132 | } 133 | 134 | template>> 135 | FS_FORCEINLINE f32<8, SIMD> Abs( const f32<8, SIMD>& a ) 136 | { 137 | const __m256i intMax = _mm256_set1_epi32( 0x7FFFFFFF ); 138 | return _mm256_and_ps( a.native, _mm256_castsi256_ps( intMax ) ); 139 | } 140 | 141 | template>> 142 | FS_FORCEINLINE f32<8, SIMD> Round( const f32<8, SIMD>& a ) 143 | { 144 | return _mm256_round_ps( a.native, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC ); 145 | } 146 | 147 | template>> 148 | FS_FORCEINLINE f32<8, SIMD> Floor( const f32<8, SIMD>& a ) 149 | { 150 | return _mm256_round_ps( a.native, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC ); 151 | } 152 | 153 | template>> 154 | FS_FORCEINLINE f32<8, SIMD> Ceil( const f32<8, SIMD>& a ) 155 | { 156 | return _mm256_round_ps( a.native, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC ); 157 | } 158 | 159 | template>> 160 | FS_FORCEINLINE f32<8, SIMD> Trunc( const f32<8, SIMD>& a ) 161 | { 162 | return _mm256_round_ps( a.native, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC ); 163 | } 164 | 165 | template>> 166 | FS_FORCEINLINE f32<8, SIMD> Min( const f32<8, SIMD>& a, const f32<8, SIMD>& b ) 167 | { 168 | return _mm256_min_ps( a.native, b.native ); 169 | } 170 | 171 | template>> 172 | FS_FORCEINLINE f32<8, SIMD> Max( const f32<8, SIMD>& a, const f32<8, SIMD>& b ) 173 | 
{ 174 | return _mm256_max_ps( a.native, b.native ); 175 | } 176 | 177 | template>> 178 | FS_FORCEINLINE f32<8, SIMD> Select( const typename f32<8, SIMD>::MaskTypeArg& mask, const f32<8, SIMD>& ifTrue, const f32<8, SIMD>& ifFalse ) 179 | { 180 | return _mm256_blendv_ps( ifFalse.native, ifTrue.native, mask.native ); 181 | } 182 | 183 | template>> 184 | FS_FORCEINLINE f32<8, SIMD> SelectHighBit( const Register& mask, const f32<8, SIMD>& ifTrue, const f32<8, SIMD>& ifFalse ) 185 | { 186 | return _mm256_blendv_ps( ifFalse.native, ifTrue.native, FS::Cast( mask ).native ); 187 | } 188 | 189 | template>> 190 | FS_FORCEINLINE f32<8, SIMD> BitwiseAndNot( const f32<8, SIMD>& a, const f32<8, SIMD>& b ) 191 | { 192 | return _mm256_andnot_ps( b.native, a.native ); 193 | } 194 | 195 | template>> 196 | FS_FORCEINLINE f32<8, SIMD> Masked( const typename f32<8, SIMD>::MaskTypeArg& mask, const f32<8, SIMD>& a ) 197 | { 198 | return _mm256_and_ps( mask.native, a.native ); 199 | } 200 | 201 | template>> 202 | FS_FORCEINLINE f32<8, SIMD> InvMasked( const typename f32<8, SIMD>::MaskTypeArg& mask, const f32<8, SIMD>& a ) 203 | { 204 | return _mm256_andnot_ps( mask.native, a.native ); 205 | } 206 | 207 | 208 | template>> 209 | FS_FORCEINLINE f32<8, SIMD> MaskedIncrement( const typename f32<8, SIMD>::MaskTypeArg& mask, const f32<8, SIMD>& a ) 210 | { 211 | return _mm256_sub_ps( a.native, _mm256_cvtepi32_ps( _mm256_castps_si256( mask.native ) ) ); 212 | } 213 | 214 | template>> 215 | FS_FORCEINLINE f32<8, SIMD> MaskedDecrement( const typename f32<8, SIMD>::MaskTypeArg& mask, const f32<8, SIMD>& a ) 216 | { 217 | return _mm256_add_ps( a.native, _mm256_cvtepi32_ps( _mm256_castps_si256( mask.native ) ) ); 218 | } 219 | 220 | 221 | template>, typename = EnableIfRelaxed> 222 | FS_FORCEINLINE f32<8, SIMD> Reciprocal( const f32<8, SIMD>& a ) 223 | { 224 | return _mm256_rcp_ps( a.native ); 225 | } 226 | 227 | template>, typename = EnableIfRelaxed> 228 | FS_FORCEINLINE f32<8, SIMD> InvSqrt( const 
f32<8, SIMD>& a ) 229 | { 230 | return _mm256_rsqrt_ps( a.native ); 231 | } 232 | 233 | template>> 234 | FS_FORCEINLINE f32<8, SIMD> Sqrt( const f32<8, SIMD>& a ) 235 | { 236 | return _mm256_sqrt_ps( a.native ); 237 | } 238 | 239 | template>, typename = EnableIfRelaxed, typename = std::enable_if_t> 240 | FS_FORCEINLINE f32<8, SIMD> FMulAdd( const f32<8, SIMD>& a, const f32<8, SIMD>& b, const f32<8, SIMD>& c ) 241 | { 242 | return _mm256_fmadd_ps( a.native, b.native, c.native ); 243 | } 244 | 245 | template>, typename = EnableIfRelaxed, typename = std::enable_if_t> 246 | FS_FORCEINLINE f32<8, SIMD> FMulSub( const f32<8, SIMD>& a, const f32<8, SIMD>& b, const f32<8, SIMD>& c ) 247 | { 248 | return _mm256_fmsub_ps( a.native, b.native, c.native ); 249 | } 250 | 251 | template>, typename = EnableIfRelaxed, typename = std::enable_if_t> 252 | FS_FORCEINLINE f32<8, SIMD> FNMulAdd( const f32<8, SIMD>& a, const f32<8, SIMD>& b, const f32<8, SIMD>& c ) 253 | { 254 | return _mm256_fnmadd_ps( a.native, b.native, c.native ); 255 | } 256 | 257 | template>, typename = EnableIfRelaxed, typename = std::enable_if_t> 258 | FS_FORCEINLINE f32<8, SIMD> FNMulSub( const f32<8, SIMD>& a, const f32<8, SIMD>& b, const f32<8, SIMD>& c ) 259 | { 260 | return _mm256_fnmsub_ps( a.native, b.native, c.native ); 261 | } 262 | } 263 | -------------------------------------------------------------------------------- /include/FastSIMD/ToolSet/x86/256/i32x8.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace FS 6 | { 7 | template 8 | struct Register> 9 | { 10 | static constexpr size_t ElementCount = 8; 11 | static constexpr auto FeatureFlags = SIMD; 12 | 13 | using NativeType = __m256i; 14 | using ElementType = std::int32_t; 15 | using MaskType = m32; 16 | using MaskTypeArg = m32; 17 | 18 | FS_FORCEINLINE Register() = default; 19 | FS_FORCEINLINE Register( NativeType v ) : native( v ) { } 20 | FS_FORCEINLINE Register( 
std::int32_t v ) : native( _mm256_set1_epi32( v ) ) { } 21 | 22 | FS_FORCEINLINE NativeType GetNative() const 23 | { 24 | return native; 25 | } 26 | 27 | FS_FORCEINLINE Register& operator +=( const Register& rhs ) 28 | { 29 | native = _mm256_add_epi32( native, rhs.native ); 30 | return *this; 31 | } 32 | 33 | FS_FORCEINLINE Register& operator -=( const Register& rhs ) 34 | { 35 | native = _mm256_sub_epi32( native, rhs.native ); 36 | return *this; 37 | } 38 | 39 | FS_FORCEINLINE Register& operator *=( const Register& rhs ) 40 | { 41 | native = _mm256_mullo_epi32( native, rhs.native ); 42 | return *this; 43 | } 44 | 45 | FS_FORCEINLINE Register& operator &=( const Register& rhs ) 46 | { 47 | native = _mm256_and_si256( native, rhs.native ); 48 | return *this; 49 | } 50 | 51 | FS_FORCEINLINE Register& operator |=( const Register& rhs ) 52 | { 53 | native = _mm256_or_si256( native, rhs.native ); 54 | return *this; 55 | } 56 | 57 | FS_FORCEINLINE Register& operator ^=( const Register& rhs ) 58 | { 59 | native = _mm256_xor_si256( native, rhs.native ); 60 | return *this; 61 | } 62 | 63 | FS_FORCEINLINE Register& operator >>=( int rhs ) 64 | { 65 | native = _mm256_srai_epi32( native, rhs ); 66 | return *this; 67 | } 68 | 69 | FS_FORCEINLINE Register& operator <<=( int rhs ) 70 | { 71 | native = _mm256_slli_epi32( native, rhs ); 72 | return *this; 73 | } 74 | 75 | FS_FORCEINLINE Register operator ~() const 76 | { 77 | const __m256i neg1 = _mm256_set1_epi32( -1 ); 78 | return _mm256_xor_si256( native, neg1 ); 79 | } 80 | 81 | FS_FORCEINLINE Register operator -() const 82 | { 83 | return _mm256_sub_epi32( _mm256_setzero_si256(), native ); 84 | } 85 | 86 | 87 | FS_FORCEINLINE MaskType operator ==( const Register& rhs ) const 88 | { 89 | return _mm256_cmpeq_epi32( native, rhs.native ); 90 | } 91 | 92 | FS_FORCEINLINE MaskType operator !=( const Register& rhs ) const 93 | { 94 | return ~(*this == rhs); 95 | } 96 | 97 | FS_FORCEINLINE MaskType operator >=( const Register& rhs ) 
const 98 | { 99 | return ~(*this < rhs); 100 | } 101 | 102 | FS_FORCEINLINE MaskType operator <=( const Register& rhs ) const 103 | { 104 | return ~(*this > rhs); 105 | } 106 | 107 | FS_FORCEINLINE MaskType operator >( const Register& rhs ) const 108 | { 109 | return _mm256_cmpgt_epi32( native, rhs.native ); 110 | } 111 | 112 | FS_FORCEINLINE MaskType operator <( const Register& rhs ) const 113 | { 114 | return _mm256_cmpgt_epi32( rhs.native, native ); 115 | } 116 | 117 | NativeType native; 118 | }; 119 | 120 | 121 | template>> 122 | FS_FORCEINLINE i32<8, SIMD> Load( TypeWrapper ptr ) 123 | { 124 | return _mm256_loadu_si256( (const __m256i*)ptr.value ); 125 | } 126 | 127 | template>> 128 | FS_FORCEINLINE void Store( typename i32<8, SIMD>::ElementType* ptr, const i32<8, SIMD>& a ) 129 | { 130 | _mm256_storeu_si256( (__m256i*)ptr, a.native ); 131 | } 132 | 133 | template>> 134 | FS_FORCEINLINE int32_t Extract0( const i32<8, SIMD>& a ) 135 | { 136 | return _mm256_cvtsi256_si32( a.native ); 137 | } 138 | 139 | template>> 140 | FS_FORCEINLINE i32<8, SIMD> Abs( const i32<8, SIMD>& a ) 141 | { 142 | return _mm256_abs_epi32( a.native ); 143 | } 144 | 145 | template>> 146 | FS_FORCEINLINE i32<8, SIMD> Min( const i32<8, SIMD>& a, const i32<8, SIMD>& b ) 147 | { 148 | return _mm256_min_epi32( a.native, b.native ); 149 | } 150 | 151 | template>> 152 | FS_FORCEINLINE i32<8, SIMD> Max( const i32<8, SIMD>& a, const i32<8, SIMD>& b ) 153 | { 154 | return _mm256_max_epi32( a.native, b.native ); 155 | } 156 | 157 | template>> 158 | FS_FORCEINLINE i32<8, SIMD> Select( const typename i32<8, SIMD>::MaskTypeArg& mask, const i32<8, SIMD>& ifTrue, const i32<8, SIMD>& ifFalse ) 159 | { 160 | return _mm256_blendv_epi8( ifFalse.native, ifTrue.native, _mm256_castps_si256( mask.native ) ); 161 | } 162 | 163 | template>> 164 | FS_FORCEINLINE i32<8, SIMD> SelectHighBit( const Register& mask, const i32<8, SIMD>& ifTrue, const i32<8, SIMD>& ifFalse ) 165 | { 166 | return _mm256_castps_si256( 
_mm256_blendv_ps( _mm256_castsi256_ps( ifFalse.native ), _mm256_castsi256_ps( ifTrue.native ), FS::Cast( mask ).native ) ); 167 | } 168 | 169 | template>> 170 | FS_FORCEINLINE i32<8, SIMD> BitwiseAndNot( const i32<8, SIMD>& a, const i32<8, SIMD>& b ) 171 | { 172 | return _mm256_andnot_si256( b.native, a.native ); 173 | } 174 | 175 | template>> 176 | FS_FORCEINLINE i32<8, SIMD> BitShiftRightZeroExtend( const i32<8, SIMD>& a, int b ) 177 | { 178 | return _mm256_srli_epi32( a.native, b ); 179 | } 180 | 181 | 182 | template>> 183 | FS_FORCEINLINE i32<8, SIMD> Masked( const typename i32<8, SIMD>::MaskTypeArg& mask, const i32<8, SIMD>& a ) 184 | { 185 | return _mm256_and_si256( _mm256_castps_si256( mask.native ), a.native ); 186 | } 187 | 188 | template>> 189 | FS_FORCEINLINE i32<8, SIMD> InvMasked( const typename i32<8, SIMD>::MaskTypeArg& mask, const i32<8, SIMD>& a ) 190 | { 191 | return _mm256_andnot_si256( _mm256_castps_si256( mask.native ), a.native ); 192 | } 193 | 194 | template>> 195 | FS_FORCEINLINE i32<8, SIMD> MaskedIncrement( const typename i32<8, SIMD>::MaskTypeArg& mask, const i32<8, SIMD>& a ) 196 | { 197 | return _mm256_sub_epi32( a.native, _mm256_castps_si256( mask.native ) ); 198 | } 199 | 200 | template>> 201 | FS_FORCEINLINE i32<8, SIMD> MaskedDecrement( const typename i32<8, SIMD>::MaskTypeArg& mask, const i32<8, SIMD>& a ) 202 | { 203 | return _mm256_add_epi32( a.native, _mm256_castps_si256( mask.native ) ); 204 | } 205 | } 206 | -------------------------------------------------------------------------------- /include/FastSIMD/ToolSet/x86/256/m32x8.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace FS 6 | { 7 | template 8 | struct Register, 8, SIMD, std::enable_if_t> 9 | { 10 | static constexpr size_t ElementCount = 8; 11 | static constexpr auto FeatureFlags = SIMD; 12 | 13 | using NativeType = __m256; 14 | using ElementType = Mask<32, true>; 15 | using MaskType = 
Register; 16 | using MaskTypeArg = Register; 17 | 18 | FS_FORCEINLINE Register() = default; 19 | FS_FORCEINLINE Register( NativeType v ) : native( v ) { } 20 | 21 | FS_FORCEINLINE NativeType GetNative() const 22 | { 23 | return native; 24 | } 25 | 26 | FS_FORCEINLINE Register& operator &=( const Register& rhs ) 27 | { 28 | native = _mm256_and_ps( native, rhs.native ); 29 | return *this; 30 | } 31 | 32 | FS_FORCEINLINE Register& operator |=( const Register& rhs ) 33 | { 34 | native = _mm256_or_ps( native, rhs.native ); 35 | return *this; 36 | } 37 | 38 | FS_FORCEINLINE Register& operator ^=( const Register& rhs ) 39 | { 40 | native = _mm256_xor_ps( native, rhs.native ); 41 | return *this; 42 | } 43 | 44 | FS_FORCEINLINE Register operator ~() const 45 | { 46 | const __m256i neg1 = _mm256_set1_epi32( -1 ); 47 | return _mm256_xor_ps( native, _mm256_castsi256_ps( neg1 ) ); 48 | } 49 | 50 | NativeType native; 51 | }; 52 | 53 | template>> 54 | FS_FORCEINLINE m32<8, true, SIMD> BitwiseAndNot( const m32<8, true, SIMD>& a, const m32<8, true, SIMD>& b ) 55 | { 56 | return _mm256_andnot_ps( b.native, a.native ); 57 | } 58 | 59 | template>> 60 | FS_FORCEINLINE bool AnyMask( const m32<8, true, SIMD>& a ) 61 | { 62 | return _mm256_movemask_ps( a.native ); 63 | } 64 | 65 | template>> 66 | FS_FORCEINLINE BitStorage<8> BitMask( const m32<8, B, SIMD>& a ) 67 | { 68 | return static_cast>( _mm256_movemask_ps( a.native ) ); 69 | } 70 | 71 | 72 | template 73 | struct Register, 8, SIMD, std::enable_if_t> : Register, 8, SIMD> 74 | { 75 | static constexpr size_t ElementCount = 8; 76 | static constexpr auto FeatureFlags = SIMD; 77 | 78 | using NativeType = __m256i; 79 | using ElementType = Mask<32, false>; 80 | using MaskType = Register; 81 | using MaskTypeArg = Register, 8, SIMD>; 82 | 83 | FS_FORCEINLINE Register() = default; 84 | FS_FORCEINLINE Register( NativeType v ) : Register, 8, SIMD>( _mm256_castsi256_ps( v ) ) { } 85 | 86 | FS_FORCEINLINE NativeType GetNative() const 87 | { 88 | 
return _mm256_castps_si256( this->native ); 89 | } 90 | 91 | FS_FORCEINLINE Register& operator &=( const Register& rhs ) 92 | { 93 | this->native = _mm256_castsi256_ps( _mm256_and_si256( _mm256_castps_si256( this->native ), _mm256_castps_si256( rhs.native ) ) ); 94 | return *this; 95 | } 96 | 97 | FS_FORCEINLINE Register& operator |=( const Register& rhs ) 98 | { 99 | this->native = _mm256_castsi256_ps( _mm256_or_si256( _mm256_castps_si256( this->native ), _mm256_castps_si256( rhs.native ) ) ); 100 | return *this; 101 | } 102 | 103 | FS_FORCEINLINE Register& operator ^=( const Register& rhs ) 104 | { 105 | this->native = _mm256_castsi256_ps( _mm256_xor_si256( _mm256_castps_si256( this->native ), _mm256_castps_si256( rhs.native ) ) ); 106 | return *this; 107 | } 108 | 109 | FS_FORCEINLINE Register operator ~() const 110 | { 111 | const __m256i neg1 = _mm256_set1_epi32( -1 ); 112 | return _mm256_xor_si256( _mm256_castps_si256( this->native ), neg1 ); 113 | } 114 | }; 115 | 116 | template>> 117 | FS_FORCEINLINE m32<8, false, SIMD> BitwiseAndNot( const m32<8, false, SIMD>& a, const m32<8, false, SIMD>& b ) 118 | { 119 | return _mm256_andnot_si256( _mm256_castps_si256( b.native ), _mm256_castps_si256( a.native ) ); 120 | } 121 | 122 | template>> 123 | FS_FORCEINLINE bool AnyMask( const m32<8, false, SIMD>& a ) 124 | { 125 | return _mm256_movemask_epi8( _mm256_castps_si256( a.native ) ); 126 | } 127 | } 128 | -------------------------------------------------------------------------------- /include/FastSIMD/ToolSet/x86/512/f32x16.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace FS 6 | { 7 | template 8 | struct Register> 9 | { 10 | static constexpr size_t ElementCount = 16; 11 | static constexpr auto FeatureFlags = SIMD; 12 | 13 | using NativeType = __m512; 14 | using ElementType = float; 15 | using MaskType = m32; 16 | using MaskTypeArg = m32; 17 | 18 | FS_FORCEINLINE Register() = 
default; 19 | FS_FORCEINLINE Register( NativeType v ) : native( v ) { } 20 | FS_FORCEINLINE Register( float v ) : native( _mm512_set1_ps( v ) ) { } 21 | 22 | FS_FORCEINLINE NativeType GetNative() const 23 | { 24 | return native; 25 | } 26 | 27 | FS_FORCEINLINE Register& operator +=( const Register& rhs ) 28 | { 29 | native = _mm512_add_ps( native, rhs.native ); 30 | return *this; 31 | } 32 | 33 | FS_FORCEINLINE Register& operator -=( const Register& rhs ) 34 | { 35 | native = _mm512_sub_ps( native, rhs.native ); 36 | return *this; 37 | } 38 | 39 | FS_FORCEINLINE Register& operator *=( const Register& rhs ) 40 | { 41 | native = _mm512_mul_ps( native, rhs.native ); 42 | return *this; 43 | } 44 | 45 | FS_FORCEINLINE Register& operator /=( const Register& rhs ) 46 | { 47 | native = _mm512_div_ps( native, rhs.native ); 48 | return *this; 49 | } 50 | 51 | FS_FORCEINLINE Register& operator &=( const Register& rhs ) 52 | { 53 | native = _mm512_and_ps( native, rhs.native ); 54 | return *this; 55 | } 56 | 57 | FS_FORCEINLINE Register& operator |=( const Register& rhs ) 58 | { 59 | native = _mm512_or_ps( native, rhs.native ); 60 | return *this; 61 | } 62 | 63 | FS_FORCEINLINE Register& operator ^=( const Register& rhs ) 64 | { 65 | native = _mm512_xor_ps( native, rhs.native ); 66 | return *this; 67 | } 68 | 69 | FS_FORCEINLINE Register operator~() const 70 | { 71 | const __m512i neg1 = _mm512_set1_epi32( -1 ); 72 | return _mm512_xor_ps( native, _mm512_castsi512_ps( neg1 ) ); 73 | } 74 | 75 | FS_FORCEINLINE Register operator-() const 76 | { 77 | const __m512i minInt = _mm512_set1_epi32( 0x80000000 ); 78 | return _mm512_xor_ps( native, _mm512_castsi512_ps( minInt ) ); 79 | } 80 | 81 | 82 | FS_FORCEINLINE MaskType operator ==( const Register& rhs ) const 83 | { 84 | return _mm512_cmp_ps_mask( native, rhs.native, _CMP_EQ_OQ ); 85 | } 86 | 87 | FS_FORCEINLINE MaskType operator !=( const Register& rhs ) const 88 | { 89 | return _mm512_cmp_ps_mask( native, rhs.native, _CMP_NEQ_OQ ); 
90 | } 91 | 92 | FS_FORCEINLINE MaskType operator >=( const Register& rhs ) const 93 | { 94 | return _mm512_cmp_ps_mask( native, rhs.native, _CMP_GE_OQ ); 95 | } 96 | 97 | FS_FORCEINLINE MaskType operator <=( const Register& rhs ) const 98 | { 99 | return _mm512_cmp_ps_mask( native, rhs.native, _CMP_LE_OQ ); 100 | } 101 | 102 | FS_FORCEINLINE MaskType operator >( const Register& rhs ) const 103 | { 104 | return _mm512_cmp_ps_mask( native, rhs.native, _CMP_GT_OQ ); 105 | } 106 | 107 | FS_FORCEINLINE MaskType operator <( const Register& rhs ) const 108 | { 109 | return _mm512_cmp_ps_mask( native, rhs.native, _CMP_LT_OQ ); 110 | } 111 | 112 | NativeType native; 113 | }; 114 | 115 | 116 | template>> 117 | FS_FORCEINLINE f32<16, SIMD> Load( TypeWrapper ptr ) 118 | { 119 | return _mm512_loadu_ps( ptr.value ); 120 | } 121 | 122 | template>> 123 | FS_FORCEINLINE void Store( typename f32<16, SIMD>::ElementType* ptr, const f32<16, SIMD>& a ) 124 | { 125 | _mm512_storeu_ps( ptr, a.native ); 126 | } 127 | 128 | template>> 129 | FS_FORCEINLINE float Extract0( const f32<16, SIMD>& a ) 130 | { 131 | return _mm512_cvtss_f32( a.native ); 132 | } 133 | 134 | template>> 135 | FS_FORCEINLINE f32<16, SIMD> Abs( const f32<16, SIMD>& a ) 136 | { 137 | return _mm512_abs_ps( a.native ); 138 | } 139 | 140 | template>> 141 | FS_FORCEINLINE f32<16, SIMD> Round( const f32<16, SIMD>& a ) 142 | { 143 | return _mm512_roundscale_ps( a.native, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC ); 144 | } 145 | 146 | template>> 147 | FS_FORCEINLINE f32<16, SIMD> Floor( const f32<16, SIMD>& a ) 148 | { 149 | return _mm512_roundscale_ps( a.native, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC ); 150 | } 151 | 152 | template>> 153 | FS_FORCEINLINE f32<16, SIMD> Ceil( const f32<16, SIMD>& a ) 154 | { 155 | return _mm512_roundscale_ps( a.native, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC ); 156 | } 157 | 158 | template>> 159 | FS_FORCEINLINE f32<16, SIMD> Trunc( const f32<16, SIMD>& a ) 160 | { 161 | return 
_mm512_roundscale_ps( a.native, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC ); 162 | } 163 | 164 | template>> 165 | FS_FORCEINLINE f32<16, SIMD> Min( const f32<16, SIMD>& a, const f32<16, SIMD>& b ) 166 | { 167 | return _mm512_min_ps( a.native, b.native ); 168 | } 169 | 170 | template>> 171 | FS_FORCEINLINE f32<16, SIMD> Max( const f32<16, SIMD>& a, const f32<16, SIMD>& b ) 172 | { 173 | return _mm512_max_ps( a.native, b.native ); 174 | } 175 | 176 | template>> 177 | FS_FORCEINLINE f32<16, SIMD> Select( const typename f32<16, SIMD>::MaskTypeArg& mask, const f32<16, SIMD>& ifTrue, const f32<16, SIMD>& ifFalse ) 178 | { 179 | return _mm512_mask_blend_ps( mask.native, ifFalse.native, ifTrue.native ); 180 | } 181 | 182 | template>> 183 | FS_FORCEINLINE f32<16, SIMD> BitwiseAndNot( const f32<16, SIMD>& a, const f32<16, SIMD>& b ) 184 | { 185 | return _mm512_andnot_ps( b.native, a.native ); 186 | } 187 | 188 | template>> 189 | FS_FORCEINLINE f32<16, SIMD> Masked( const typename f32<16, SIMD>::MaskTypeArg& mask, const f32<16, SIMD>& a ) 190 | { 191 | return _mm512_maskz_mov_ps( mask.native, a.native ); 192 | } 193 | 194 | template>> 195 | FS_FORCEINLINE f32<16, SIMD> InvMasked( const typename f32<16, SIMD>::MaskTypeArg& mask, const f32<16, SIMD>& a ) 196 | { 197 | return _mm512_maskz_mov_ps( ~mask.native, a.native ); 198 | } 199 | 200 | template>> 201 | FS_FORCEINLINE f32<16, SIMD> MaskedAdd( const typename f32<16, SIMD>::MaskTypeArg& mask, const f32<16, SIMD>& a, const f32<16, SIMD>& b ) 202 | { 203 | return _mm512_mask_add_ps( a.native, mask.native, a.native, b.native ); 204 | } 205 | 206 | template>> 207 | FS_FORCEINLINE f32<16, SIMD> MaskedSub( const typename f32<16, SIMD>::MaskTypeArg& mask, const f32<16, SIMD>& a, const f32<16, SIMD>& b ) 208 | { 209 | return _mm512_mask_sub_ps( a.native, mask.native, a.native, b.native ); 210 | } 211 | 212 | template>> 213 | FS_FORCEINLINE f32<16, SIMD> MaskedMul( const typename f32<16, SIMD>::MaskTypeArg& mask, const f32<16, SIMD>& a, 
const f32<16, SIMD>& b ) 214 | { 215 | return _mm512_mask_mul_ps( a.native, mask.native, a.native, b.native ); 216 | } 217 | 218 | template>> 219 | FS_FORCEINLINE f32<16, SIMD> InvMaskedAdd( const typename f32<16, SIMD>::MaskTypeArg& mask, const f32<16, SIMD>& a, const f32<16, SIMD>& b ) 220 | { 221 | return _mm512_mask_add_ps( a.native, ~mask.native, a.native, b.native ); 222 | } 223 | 224 | template>> 225 | FS_FORCEINLINE f32<16, SIMD> InvMaskedSub( const typename f32<16, SIMD>::MaskTypeArg& mask, const f32<16, SIMD>& a, const f32<16, SIMD>& b ) 226 | { 227 | return _mm512_mask_sub_ps( a.native, ~mask.native, a.native, b.native ); 228 | } 229 | 230 | template>> 231 | FS_FORCEINLINE f32<16, SIMD> InvMaskedMul( const typename f32<16, SIMD>::MaskTypeArg& mask, const f32<16, SIMD>& a, const f32<16, SIMD>& b ) 232 | { 233 | return _mm512_mask_mul_ps( a.native, ~mask.native, a.native, b.native ); 234 | } 235 | 236 | template>, typename = EnableIfRelaxed()> 237 | FS_FORCEINLINE f32<16, SIMD> Reciprocal( const f32<16, SIMD>& a ) 238 | { 239 | return _mm512_rcp14_ps( a.native ); 240 | } 241 | 242 | template>, typename = EnableIfRelaxed()> 243 | FS_FORCEINLINE f32<16, SIMD> InvSqrt( const f32<16, SIMD>& a ) 244 | { 245 | return _mm512_rsqrt14_ps( a.native ); 246 | } 247 | 248 | template>> 249 | FS_FORCEINLINE f32<16, SIMD> Sqrt( const f32<16, SIMD>& a ) 250 | { 251 | return _mm512_sqrt_ps( a.native ); 252 | } 253 | 254 | template>, typename = EnableIfRelaxed()> 255 | FS_FORCEINLINE f32<16, SIMD> FMulAdd( const f32<16, SIMD>& a, const f32<16, SIMD>& b, const f32<16, SIMD>& c ) 256 | { 257 | return _mm512_fmadd_ps( a.native, b.native, c.native ); 258 | } 259 | 260 | template>, typename = EnableIfRelaxed()> 261 | FS_FORCEINLINE f32<16, SIMD> FMulSub( const f32<16, SIMD>& a, const f32<16, SIMD>& b, const f32<16, SIMD>& c ) 262 | { 263 | return _mm512_fmsub_ps( a.native, b.native, c.native ); 264 | } 265 | 266 | template>, typename = EnableIfRelaxed()> 267 | FS_FORCEINLINE 
f32<16, SIMD> FNMulAdd( const f32<16, SIMD>& a, const f32<16, SIMD>& b, const f32<16, SIMD>& c ) 268 | { 269 | return _mm512_fnmadd_ps( a.native, b.native, c.native ); 270 | } 271 | 272 | template>, typename = EnableIfRelaxed()> 273 | FS_FORCEINLINE f32<16, SIMD> FNMulSub( const f32<16, SIMD>& a, const f32<16, SIMD>& b, const f32<16, SIMD>& c ) 274 | { 275 | return _mm512_fnmsub_ps( a.native, b.native, c.native ); 276 | } 277 | } -------------------------------------------------------------------------------- /include/FastSIMD/ToolSet/x86/512/i32x16.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace FS 6 | { 7 | template 8 | struct Register> 9 | { 10 | static constexpr size_t ElementCount = 16; 11 | static constexpr auto FeatureFlags = SIMD; 12 | 13 | using NativeType = __m512i; 14 | using ElementType = std::int32_t; 15 | using MaskType = m32; 16 | using MaskTypeArg = m32; 17 | 18 | FS_FORCEINLINE Register() = default; 19 | FS_FORCEINLINE Register( NativeType v ) : native( v ) { } 20 | FS_FORCEINLINE Register( std::int32_t v ) : native( _mm512_set1_epi32( v ) ) { } 21 | 22 | FS_FORCEINLINE NativeType GetNative() const 23 | { 24 | return native; 25 | } 26 | 27 | FS_FORCEINLINE Register& operator +=( const Register& rhs ) 28 | { 29 | native = _mm512_add_epi32( native, rhs.native ); 30 | return *this; 31 | } 32 | 33 | FS_FORCEINLINE Register& operator -=( const Register& rhs ) 34 | { 35 | native = _mm512_sub_epi32( native, rhs.native ); 36 | return *this; 37 | } 38 | 39 | FS_FORCEINLINE Register& operator *=( const Register& rhs ) 40 | { 41 | native = _mm512_mullo_epi32( native, rhs.native ); 42 | return *this; 43 | } 44 | 45 | FS_FORCEINLINE Register& operator &=( const Register& rhs ) 46 | { 47 | native = _mm512_and_si512( native, rhs.native ); 48 | return *this; 49 | } 50 | 51 | FS_FORCEINLINE Register& operator |=( const Register& rhs ) 52 | { 53 | native = _mm512_or_si512( 
native, rhs.native ); 54 | return *this; 55 | } 56 | 57 | FS_FORCEINLINE Register& operator ^=( const Register& rhs ) 58 | { 59 | native = _mm512_xor_si512( native, rhs.native ); 60 | return *this; 61 | } 62 | 63 | FS_FORCEINLINE Register& operator >>=( int rhs ) 64 | { 65 | native = _mm512_srai_epi32( native, rhs ); 66 | return *this; 67 | } 68 | 69 | FS_FORCEINLINE Register& operator <<=( int rhs ) 70 | { 71 | native = _mm512_slli_epi32( native, rhs ); 72 | return *this; 73 | } 74 | 75 | FS_FORCEINLINE Register operator ~() const 76 | { 77 | const __m512i neg1 = _mm512_set1_epi32( -1 ); 78 | return _mm512_xor_si512( native, neg1 ); 79 | } 80 | 81 | FS_FORCEINLINE Register operator -() const 82 | { 83 | return _mm512_sub_epi32( _mm512_setzero_si512(), native ); 84 | } 85 | 86 | 87 | FS_FORCEINLINE MaskType operator ==( const Register& rhs ) const 88 | { 89 | return _mm512_cmp_epi32_mask( native, rhs.native, _MM_CMPINT_EQ ); 90 | } 91 | 92 | FS_FORCEINLINE MaskType operator !=( const Register& rhs ) const 93 | { 94 | return _mm512_cmp_epi32_mask( native, rhs.native, _MM_CMPINT_NE ); 95 | } 96 | 97 | FS_FORCEINLINE MaskType operator >=( const Register& rhs ) const 98 | { 99 | return _mm512_cmp_epi32_mask( native, rhs.native, _MM_CMPINT_NLT ); 100 | } 101 | 102 | FS_FORCEINLINE MaskType operator <=( const Register& rhs ) const 103 | { 104 | return _mm512_cmp_epi32_mask( native, rhs.native, _MM_CMPINT_LE ); 105 | } 106 | 107 | FS_FORCEINLINE MaskType operator >( const Register& rhs ) const 108 | { 109 | return _mm512_cmp_epi32_mask( native, rhs.native, _MM_CMPINT_NLE ); 110 | } 111 | 112 | FS_FORCEINLINE MaskType operator <( const Register& rhs ) const 113 | { 114 | return _mm512_cmp_epi32_mask( native, rhs.native, _MM_CMPINT_LT ); 115 | } 116 | 117 | NativeType native; 118 | }; 119 | 120 | 121 | template>> 122 | FS_FORCEINLINE i32<16, SIMD> Load( TypeWrapper ptr ) 123 | { 124 | return _mm512_loadu_si512( (const __m512i*)ptr.value ); 125 | } 126 | 127 | template>> 128 
| FS_FORCEINLINE void Store( typename i32<16, SIMD>::ElementType* ptr, const i32<16, SIMD>& a ) 129 | { 130 | _mm512_storeu_si512( (__m512i*)ptr, a.native ); 131 | } 132 | 133 | template>> 134 | FS_FORCEINLINE int32_t Extract0( const i32<16, SIMD>& a ) 135 | { 136 | return _mm512_cvtsi512_si32( a.native ); 137 | } 138 | 139 | template>> 140 | FS_FORCEINLINE i32<16, SIMD> Abs( const i32<16, SIMD>& a ) 141 | { 142 | return _mm512_abs_epi32( a.native ); 143 | } 144 | 145 | template>> 146 | FS_FORCEINLINE i32<16, SIMD> Min( const i32<16, SIMD>& a, const i32<16, SIMD>& b ) 147 | { 148 | return _mm512_min_epi32( a.native, b.native ); 149 | } 150 | 151 | template>> 152 | FS_FORCEINLINE i32<16, SIMD> Max( const i32<16, SIMD>& a, const i32<16, SIMD>& b ) 153 | { 154 | return _mm512_max_epi32( a.native, b.native ); 155 | } 156 | 157 | template>> 158 | FS_FORCEINLINE i32<16, SIMD> Select( const typename i32<16, SIMD>::MaskTypeArg& mask, const i32<16, SIMD>& ifTrue, const i32<16, SIMD>& ifFalse ) 159 | { 160 | return _mm512_mask_blend_epi32( mask.native, ifFalse.native, ifTrue.native ); 161 | } 162 | 163 | template>> 164 | FS_FORCEINLINE i32<16, SIMD> BitwiseAndNot( const i32<16, SIMD>& a, const i32<16, SIMD>& b ) 165 | { 166 | return _mm512_andnot_si512( b.native, a.native ); 167 | } 168 | 169 | template>> 170 | FS_FORCEINLINE i32<16, SIMD> BitShiftRightZeroExtend( const i32<16, SIMD>& a, int b ) 171 | { 172 | return _mm512_srli_epi32( a.native, b ); 173 | } 174 | 175 | 176 | template>> 177 | FS_FORCEINLINE i32<16, SIMD> Masked( const typename i32<16, SIMD>::MaskTypeArg& mask, const i32<16, SIMD>& a ) 178 | { 179 | return _mm512_maskz_mov_epi32( mask.native, a.native ); 180 | } 181 | 182 | template>> 183 | FS_FORCEINLINE i32<16, SIMD> InvMasked( const typename i32<16, SIMD>::MaskTypeArg& mask, const i32<16, SIMD>& a ) 184 | { 185 | return _mm512_maskz_mov_epi32( ~mask.native, a.native ); 186 | } 187 | 188 | 189 | template>> 190 | FS_FORCEINLINE i32<16, SIMD> MaskedAdd( const 
typename i32<16, SIMD>::MaskTypeArg& mask, const i32<16, SIMD>& a, const i32<16, SIMD>& b ) 191 | { 192 | return _mm512_mask_add_epi32( a.native, mask.native, a.native, b.native ); 193 | } 194 | 195 | template>> 196 | FS_FORCEINLINE i32<16, SIMD> MaskedSub( const typename i32<16, SIMD>::MaskTypeArg& mask, const i32<16, SIMD>& a, const i32<16, SIMD>& b ) 197 | { 198 | return _mm512_mask_sub_epi32( a.native, mask.native, a.native, b.native ); 199 | } 200 | 201 | template>> 202 | FS_FORCEINLINE i32<16, SIMD> MaskedMul( const typename i32<16, SIMD>::MaskTypeArg& mask, const i32<16, SIMD>& a, const i32<16, SIMD>& b ) 203 | { 204 | return _mm512_mask_mullo_epi32( a.native, mask.native, a.native, b.native ); 205 | } 206 | 207 | template>> 208 | FS_FORCEINLINE i32<16, SIMD> InvMaskedAdd( const typename i32<16, SIMD>::MaskTypeArg& mask, const i32<16, SIMD>& a, const i32<16, SIMD>& b ) 209 | { 210 | return _mm512_mask_add_epi32( a.native, ~mask.native, a.native, b.native ); 211 | } 212 | 213 | template>> 214 | FS_FORCEINLINE i32<16, SIMD> InvMaskedSub( const typename i32<16, SIMD>::MaskTypeArg& mask, const i32<16, SIMD>& a, const i32<16, SIMD>& b ) 215 | { 216 | return _mm512_mask_sub_epi32( a.native, ~mask.native, a.native, b.native ); 217 | } 218 | 219 | template>> 220 | FS_FORCEINLINE i32<16, SIMD> InvMaskedMul( const typename i32<16, SIMD>::MaskTypeArg& mask, const i32<16, SIMD>& a, const i32<16, SIMD>& b ) 221 | { 222 | return _mm512_mask_mullo_epi32( a.native, ~mask.native, a.native, b.native ); 223 | } 224 | } -------------------------------------------------------------------------------- /include/FastSIMD/ToolSet/x86/512/mNx16.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace FS 6 | { 7 | namespace impl 8 | { 9 | struct AVX512MaskBase 10 | { 11 | __mmask16 native; 12 | }; 13 | } 14 | 15 | template 16 | struct Register, 16, SIMD, std::enable_if_t> 17 | : std::conditional_t, 16, 
SIMD>> 18 | { 19 | static constexpr size_t ElementCount = 16; 20 | static constexpr auto FeatureFlags = SIMD; 21 | 22 | using NativeType = decltype(AVX512MaskBase::native); 23 | using ElementType = Mask; 24 | using MaskType = Register; 25 | using MaskTypeArg = Register; 26 | 27 | FS_FORCEINLINE Register() = default; 28 | FS_FORCEINLINE Register( NativeType v ) { this->native = v; } 29 | 30 | FS_FORCEINLINE NativeType GetNative() const 31 | { 32 | return this->native; 33 | } 34 | 35 | FS_FORCEINLINE Register& operator &=( const Register& rhs ) 36 | { 37 | this->native = ( this->native & rhs.native ); 38 | return *this; 39 | } 40 | 41 | FS_FORCEINLINE Register& operator |=( const Register& rhs ) 42 | { 43 | this->native = ( this->native | rhs.native ); 44 | return *this; 45 | } 46 | 47 | FS_FORCEINLINE Register& operator ^=( const Register& rhs ) 48 | { 49 | this->native = ( this->native ^ rhs.native ); 50 | return *this; 51 | } 52 | 53 | FS_FORCEINLINE Register operator ~() const 54 | { 55 | return ~this->native; 56 | } 57 | }; 58 | 59 | template, 16, SIMD>>> 60 | FS_FORCEINLINE bool AnyMask( const Register, 16, SIMD>& a ) 61 | { 62 | return (bool)a.native; 63 | } 64 | 65 | template, 16, SIMD>>> 66 | FS_FORCEINLINE BitStorage<16> BitMask( const Register, 16, SIMD>& a ) 67 | { 68 | return static_cast>( a.native ); 69 | } 70 | } -------------------------------------------------------------------------------- /include/FastSIMD/ToolSet/x86/AVX.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | 4 | #if FASTSIMD_MAX_FEATURE_VALUE() >= FASTSIMD_FEATURE_VALUE( AVX512 ) 5 | #include "AVX512.h" 6 | #endif 7 | 8 | #include "256/f32x8.h" 9 | #include "256/i32x8.h" 10 | #include "256/m32x8.h" 11 | 12 | namespace FS 13 | { 14 | template>> 15 | FS_FORCEINLINE i32<8, SIMD> Convert( const f32<8, SIMD>& a, TypeDummy ) 16 | { 17 | return _mm256_cvtps_epi32( a.native ); 18 | } 19 | 20 | template>> 21 | FS_FORCEINLINE 
f32<8, SIMD> Convert( const i32<8, SIMD>& a, TypeDummy ) 22 | { 23 | return _mm256_cvtepi32_ps( a.native ); 24 | } 25 | 26 | template>> 27 | FS_FORCEINLINE Register Cast( const Register& a, TypeDummy ) 28 | { 29 | if constexpr( 30 | std::is_same_v::NativeType, __m256> && 31 | std::is_same_v::NativeType, __m256i> ) 32 | { 33 | return _mm256_castps_si256( a.GetNative() ); 34 | } 35 | else if constexpr( 36 | std::is_same_v::NativeType, __m256i> && 37 | std::is_same_v::NativeType, __m256> ) 38 | { 39 | return _mm256_castsi256_ps( a.GetNative() ); 40 | } 41 | else 42 | { 43 | return a.GetNative(); 44 | } 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /include/FastSIMD/ToolSet/x86/AVX512.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | 4 | #include "512/f32x16.h" 5 | #include "512/i32x16.h" 6 | #include "512/mNx16.h" 7 | 8 | namespace FS 9 | { 10 | template>> 11 | FS_FORCEINLINE i32<16, SIMD> Convert( const f32<16, SIMD>& a, TypeDummy ) 12 | { 13 | return _mm512_cvtps_epi32( a.native ); 14 | } 15 | 16 | template>> 17 | FS_FORCEINLINE f32<16, SIMD> Convert( const i32<16, SIMD>& a, TypeDummy ) 18 | { 19 | return _mm512_cvtepi32_ps( a.native ); 20 | } 21 | 22 | template>> 23 | FS_FORCEINLINE Register Cast( const Register& a, TypeDummy ) 24 | { 25 | if constexpr( 26 | std::is_same_v::NativeType, __m512> && 27 | std::is_same_v::NativeType, __m512i> ) 28 | { 29 | return _mm512_castps_si512( a.GetNative() ); 30 | } 31 | else if constexpr( 32 | std::is_same_v::NativeType, __m512i> && 33 | std::is_same_v::NativeType, __m512> ) 34 | { 35 | return _mm512_castsi512_ps( a.GetNative() ); 36 | } 37 | else 38 | { 39 | return a.GetNative(); 40 | } 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /include/FastSIMD/ToolSet/x86/SSE.h: -------------------------------------------------------------------------------- 1 | 
#pragma once 2 | #include 3 | 4 | #if FASTSIMD_MAX_FEATURE_VALUE() >= FASTSIMD_FEATURE_VALUE( AVX ) 5 | #include "AVX.h" 6 | #endif 7 | 8 | #include "128/f32x4.h" 9 | #include "128/i32x4.h" 10 | #include "128/m32x4.h" 11 | 12 | namespace FS 13 | { 14 | template>> 15 | FS_FORCEINLINE i32<4, SIMD> Convert( const f32<4, SIMD>& a, TypeDummy ) 16 | { 17 | return _mm_cvtps_epi32( a.native ); 18 | } 19 | 20 | template>> 21 | FS_FORCEINLINE f32<4, SIMD> Convert( const i32<4, SIMD>& a, TypeDummy ) 22 | { 23 | return _mm_cvtepi32_ps( a.native ); 24 | } 25 | 26 | template>> 27 | FS_FORCEINLINE Register Cast( const Register& a, TypeDummy ) 28 | { 29 | if constexpr( 30 | std::is_same_v::NativeType, __m128> && 31 | std::is_same_v::NativeType, __m128i> ) 32 | { 33 | return _mm_castps_si128( a.GetNative() ); 34 | } 35 | else if constexpr( 36 | std::is_same_v::NativeType, __m128i> && 37 | std::is_same_v::NativeType, __m128> ) 38 | { 39 | return _mm_castsi128_ps( a.GetNative() ); 40 | } 41 | else 42 | { 43 | return a.GetNative(); 44 | } 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /include/FastSIMD/ToolSet/x86/x86.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #if FASTSIMD_MAX_FEATURE_VALUE() >= FASTSIMD_FEATURE_VALUE( SSE ) 4 | #include "SSE.h" 5 | #endif 6 | -------------------------------------------------------------------------------- /include/FastSIMD/Utility/ArchDetect.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | //Based on the Qt processor detection code, so should be very accurate 3 | //https: code.qt.io/cgit/qt/qtbase.git/tree/src/corelib/global/qprocessordetection.h 4 | 5 | #define FASTSIMD_ARCH_VALUE_X86() 1 6 | #define FASTSIMD_ARCH_VALUE_ARM() 2 7 | #define FASTSIMD_ARCH_VALUE_WASM() 3 8 | 9 | #define FASTSIMD_FEATURE_VALUE_SCALAR() 1 10 | 11 | // -- Web Assembly -- 12 | #if defined( 
__EMSCRIPTEN__ ) || defined( EMSCRIPTEN ) 13 | 14 | #define FASTSIMD_FEATURE_VALUE_WASM() 2 15 | 16 | #define FASTSIMD_FEATURE_DETECT() WASM 17 | #define FASTSIMD_ARCH_DETECT() WASM 18 | 19 | // -- ARM -- 20 | #elif defined( __arm__ ) || defined( __TARGET_ARCH_ARM ) || defined( _M_ARM ) || defined( _M_ARM64 ) || defined( __aarch64__ ) || defined( __ARM64__ ) 21 | 22 | #define FASTSIMD_FEATURE_VALUE_NEON() 2 23 | #define FASTSIMD_FEATURE_VALUE_AARCH64() 3 24 | 25 | #if defined( __ARM64_ARCH_8__ ) || defined( __aarch64__ ) || defined( __ARMv8__ ) || defined( __ARMv8_A__ ) || defined( _M_ARM64 ) || defined( __ARM_NEON__ ) 26 | #define FASTSIMD_FEATURE_DETECT() AARCH64 27 | //#elif defined( __ARM_ARCH_7__ ) || defined( __ARM_ARCH_7A__ ) || defined( __ARM_ARCH_7R__ ) || defined( __ARM_ARCH_7M__ ) || defined( __ARM_ARCH_7S__ ) || defined( _ARM_ARCH_7 ) || defined( __CORE_CORTEXA__ ) 28 | //#define FASTSIMD_ARCH_ARM() 7 29 | //#elif defined( __ARM_ARCH_6__ ) || defined( __ARM_ARCH_6J__ ) || defined( __ARM_ARCH_6T2__ ) || defined( __ARM_ARCH_6Z__ ) || defined( __ARM_ARCH_6K__ ) || defined( __ARM_ARCH_6ZK__ ) || defined( __ARM_ARCH_6M__ ) 30 | //#define FASTSIMD_ARCH_ARM() 6 31 | //#elif defined( __ARM_ARCH_5TEJ__ ) || defined( __ARM_ARCH_5TE__ ) 32 | //#define FASTSIMD_ARCH_ARM() 5 33 | #else 34 | #define FASTSIMD_FEATURE_DETECT() NEON 35 | #endif 36 | 37 | #define FASTSIMD_ARCH_DETECT() ARM 38 | 39 | // -- x86 -- 40 | #elif defined( __i386 ) || defined( __i386__ ) || defined( _M_IX86 ) || defined( __x86_64 ) || defined( __x86_64__ ) || defined( __amd64 ) || defined( _M_X64 ) 41 | 42 | #define FASTSIMD_FEATURE_VALUE_SSE() 2 43 | #define FASTSIMD_FEATURE_VALUE_SSE2() 3 44 | #define FASTSIMD_FEATURE_VALUE_SSE3() 4 45 | #define FASTSIMD_FEATURE_VALUE_SSSE3() 5 46 | #define FASTSIMD_FEATURE_VALUE_SSE41() 6 47 | #define FASTSIMD_FEATURE_VALUE_SSE42() 7 48 | #define FASTSIMD_FEATURE_VALUE_AVX() 8 49 | #define FASTSIMD_FEATURE_VALUE_AVX2() 9 50 | #define 
FASTSIMD_FEATURE_VALUE_AVX512() 10 51 | 52 | #if defined( __AVX512F__ ) && defined( __AVX512VL__ ) && defined( __AVX512BW__ ) && defined( __AVX512DQ__ ) 53 | #define FASTSIMD_FEATURE_DETECT() AVX512 54 | #elif defined( __AVX2__ ) 55 | #define FASTSIMD_FEATURE_DETECT() AVX2 56 | #elif defined( __AVX__ ) 57 | #define FASTSIMD_FEATURE_DETECT() AVX 58 | #elif defined( __SSE4_2__ ) 59 | #define FASTSIMD_FEATURE_DETECT() SSE42 60 | #elif defined( __SSE4_1__ ) 61 | #define FASTSIMD_FEATURE_DETECT() SSE41 62 | #elif defined( __SSSE3__ ) 63 | #define FASTSIMD_FEATURE_DETECT() SSSE3 64 | #elif defined( __SSE3__ ) 65 | #define FASTSIMD_FEATURE_DETECT() SSE3 66 | #elif defined( __SSE2__ ) || defined( __x86_64 ) || defined( __x86_64__ ) || defined( __amd64 ) || defined( _M_X64 ) || ( defined( _M_IX86_FP ) && _M_IX86_FP == 2 ) 67 | #define FASTSIMD_FEATURE_DETECT() SSE2 68 | #elif defined( __SSE__ ) || defined( __i686__ ) || defined( __athlon__ ) || defined( __pentiumpro__ ) || ( defined( _M_IX86_FP ) && _M_IX86_FP == 1 ) 69 | #define FASTSIMD_FEATURE_DETECT() SSE2 70 | #else 71 | #define FASTSIMD_FEATURE_DETECT() SCALAR 72 | #endif 73 | 74 | #define FASTSIMD_ARCH_DETECT() X86 75 | #endif 76 | 77 | 78 | #define FASTSIMD_ARCH_VALUE( arch ) FASTSIMD_ARCH_VALUE_IMPL( arch ) 79 | #define FASTSIMD_ARCH_VALUE_IMPL( arch ) FASTSIMD_ARCH_VALUE_##arch() 80 | 81 | #define FASTSIMD_CURRENT_ARCH_IS( arch ) FASTSIMD_ARCH_VALUE( FASTSIMD_ARCH_DETECT() ) == FASTSIMD_ARCH_VALUE( arch ) 82 | 83 | 84 | #ifndef FASTSIMD_MAX_FEATURE_SET 85 | #define FASTSIMD_MAX_FEATURE_SET FASTSIMD_FEATURE_DETECT() 86 | #endif 87 | #ifndef FASTSIMD_DEFAULT_FEATURE_SET 88 | #define FASTSIMD_DEFAULT_FEATURE_SET FASTSIMD_MAX_FEATURE_SET 89 | #endif 90 | 91 | #define FASTSIMD_FEATURE_VALUE( feature ) FASTSIMD_FEATURE_VALUE_IMPL( feature ) 92 | #define FASTSIMD_FEATURE_VALUE_IMPL( feature ) FASTSIMD_FEATURE_VALUE_##feature() 93 | 94 | #define FASTSIMD_DEFAULT_FEATURE_VALUE() FASTSIMD_FEATURE_VALUE( 
FASTSIMD_DEFAULT_FEATURE_SET ) 95 | #define FASTSIMD_MAX_FEATURE_VALUE() FASTSIMD_FEATURE_VALUE( FASTSIMD_MAX_FEATURE_SET ) 96 | #define FASTSIMD_ARCH_NAME() FASTSIMD_ARCH_DETECT()=FASTSIMD_MAX_FEATURE_SET 97 | 98 | #ifndef FASTSIMD_IS_RELAXED 99 | #define FASTSIMD_IS_RELAXED 0 100 | #endif 101 | -------------------------------------------------------------------------------- /include/FastSIMD/Utility/Export.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #if !defined( FASTSIMD_STATIC_LIB ) && ( defined( _WIN32 ) || defined( __CYGWIN__ ) ) 4 | #ifdef FASTSIMD_EXPORT 5 | #define FASTSIMD_API __declspec( dllexport ) 6 | #else 7 | #define FASTSIMD_API __declspec( dllimport ) 8 | #endif 9 | #else 10 | #define FASTSIMD_API 11 | #endif 12 | -------------------------------------------------------------------------------- /include/FastSIMD/Utility/FeatureEnums.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "Export.h" 3 | 4 | #include 5 | 6 | namespace FastSIMD 7 | { 8 | enum class FeatureFlag 9 | { 10 | Scalar, 11 | 12 | x86, 13 | SSE, 14 | SSE2, 15 | SSE3, 16 | SSSE3, 17 | SSE41, 18 | SSE42, 19 | AVX, 20 | AVX2, 21 | AVX512_F, 22 | AVX512_VL, 23 | AVX512_DQ, 24 | AVX512_BW, 25 | 26 | ARM, 27 | NEON, 28 | AARCH64, 29 | 30 | WASM, 31 | }; 32 | 33 | constexpr std::uint32_t operator |( FeatureFlag a, FeatureFlag b ) 34 | { 35 | return 1U << static_cast(a) | 1U << static_cast(b); 36 | } 37 | 38 | constexpr std::uint32_t operator |( std::uint32_t a, FeatureFlag b ) 39 | { 40 | return a | 1U << static_cast(b); 41 | } 42 | 43 | enum class FeatureSet : std::uint32_t 44 | { 45 | Invalid, 46 | 47 | SCALAR = Invalid | FeatureFlag::Scalar, 48 | 49 | SSE = FeatureFlag::x86 | FeatureFlag::SSE, 50 | SSE2 = SSE | FeatureFlag::SSE2, 51 | SSE3 = SSE2 | FeatureFlag::SSE3, 52 | SSSE3 = SSE3 | FeatureFlag::SSSE3, 53 | SSE41 = SSSE3 | FeatureFlag::SSE41, 54 | SSE42 = 
SSE41 | FeatureFlag::SSE42, 55 | AVX = SSE42 | FeatureFlag::AVX, 56 | AVX2 = AVX | FeatureFlag::AVX2, 57 | AVX512 = AVX2 | FeatureFlag::AVX512_F | FeatureFlag::AVX512_VL | FeatureFlag::AVX512_DQ | FeatureFlag::AVX512_BW, 58 | 59 | NEON = FeatureFlag::ARM | FeatureFlag::NEON, 60 | AARCH64 = NEON | FeatureFlag::AARCH64, 61 | 62 | WASM = Invalid | FeatureFlag::WASM, 63 | 64 | Max = ~0U 65 | }; 66 | 67 | constexpr bool operator &( FeatureSet a, FeatureFlag b ) 68 | { 69 | return static_cast(a) & 1U << static_cast(b); 70 | } 71 | 72 | constexpr bool operator &( FeatureSet a, std::uint32_t b ) 73 | { 74 | return static_cast(a) & b; 75 | } 76 | 77 | FASTSIMD_API FeatureSet DetectCpuMaxFeatureSet(); 78 | 79 | FASTSIMD_API const char* GetFeatureSetString( FeatureSet ); 80 | } 81 | -------------------------------------------------------------------------------- /include/FastSIMD/Utility/FeatureSetList.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | 4 | namespace FastSIMD 5 | { 6 | template 7 | struct FeatureSetList; 8 | 9 | template 10 | struct FeatureSetList<0, HEAD> 11 | { 12 | static constexpr FeatureSet AsArray[] = { HEAD }; 13 | static constexpr FeatureSet Minimum = HEAD; 14 | static constexpr FeatureSet Maximum = HEAD; 15 | 16 | template 17 | static constexpr FeatureSet NextAfter = FeatureSet::Max; 18 | }; 19 | 20 | template 21 | struct FeatureSetList<0, HEAD, TAIL...> 22 | { 23 | static constexpr FeatureSet AsArray[] = { HEAD, TAIL... }; 24 | static constexpr FeatureSet Minimum = HEAD; 25 | static constexpr FeatureSet Maximum = FeatureSetList<0, TAIL...>::Maximum; 26 | 27 | template 28 | static constexpr FeatureSet NextAfter = (L == HEAD) ? 
FeatureSetList<0, TAIL...>::Minimum : FeatureSetList<0, TAIL...>::template NextAfter; 29 | }; 30 | 31 | } 32 | -------------------------------------------------------------------------------- /src/FastSIMD.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #if FASTSIMD_CURRENT_ARCH_IS( X86 ) 4 | #if defined( _MSC_VER ) 5 | #include 6 | #endif 7 | 8 | // Define interface to cpuid instruction. 9 | // input: eax = functionnumber, ecx = 0 10 | // output: eax = output[0], ebx = output[1], ecx = output[2], edx = output[3] 11 | static void cpuid( int output[4], int functionnumber ) 12 | { 13 | #if defined( __GNUC__ ) || defined( __clang__ ) // use inline assembly, Gnu/AT&T syntax 14 | 15 | int a, b, c, d; 16 | __asm( "cpuid" 17 | : "=a"( a ), "=b"( b ), "=c"( c ), "=d"( d ) 18 | : "a"( functionnumber ), "c"( 0 ) 19 | : ); 20 | output[0] = a; 21 | output[1] = b; 22 | output[2] = c; 23 | output[3] = d; 24 | 25 | #elif defined( _MSC_VER ) || defined( __INTEL_COMPILER ) // Microsoft or Intel compiler, intrin.h included 26 | 27 | __cpuidex( output, functionnumber, 0 ); // intrinsic function for CPUID 28 | 29 | #else // unknown platform. 
try inline assembly with masm/intel syntax 30 | 31 | __asm 32 | { 33 | mov eax, functionnumber 34 | xor ecx, ecx 35 | cpuid; 36 | mov esi, output 37 | mov[esi], eax 38 | mov[esi + 4], ebx 39 | mov[esi + 8], ecx 40 | mov[esi + 12], edx 41 | } 42 | 43 | #endif 44 | } 45 | 46 | // Define interface to xgetbv instruction 47 | static uint64_t xgetbv( uint32_t ctr ) 48 | { 49 | #if( defined( _MSC_FULL_VER ) && _MSC_FULL_VER >= 160040000 ) || ( defined( __INTEL_COMPILER ) && __INTEL_COMPILER >= 1200 ) // Microsoft or Intel compiler supporting _xgetbv intrinsic 50 | 51 | return _xgetbv( ctr ); // intrinsic function for XGETBV 52 | 53 | #elif defined( __GNUC__ ) // use inline assembly, Gnu/AT&T syntax 54 | 55 | uint32_t a, d; 56 | __asm( "xgetbv" 57 | : "=a"( a ), "=d"( d ) 58 | : "c"( ctr ) 59 | : ); 60 | return a | ( uint64_t( d ) << 32 ); 61 | 62 | #else // #elif defined (_WIN32) // other compiler. try inline assembly with masm/intel/MS syntax 63 | 64 | uint32_t a, d; 65 | __asm { 66 | mov ecx, ctr 67 | _emit 0x0f 68 | _emit 0x01 69 | _emit 0xd0; // xgetbv 70 | mov a, eax 71 | mov d, edx 72 | } 73 | return a | ( uint64_t( d ) << 32 ); 74 | 75 | #endif 76 | } 77 | #endif 78 | 79 | namespace FastSIMD 80 | { 81 | #if FASTSIMD_CURRENT_ARCH_IS( X86 ) 82 | static std::uint32_t DetectCpuSupportedFlags() 83 | { 84 | std::uint32_t supportedFlags = FeatureFlag::x86 | FeatureFlag::Scalar; 85 | 86 | //#if FASTSIMD_x86 87 | int abcd[4] = { 0, 0, 0, 0 }; // cpuid results 88 | 89 | //#if !FASTSIMD_64BIT 90 | 91 | cpuid( abcd, 0 ); // call cpuid function 0 92 | if( abcd[0] == 0 ) 93 | return supportedFlags; // no further cpuid function supported 94 | 95 | cpuid( abcd, 1 ); // call cpuid function 1 for feature flags 96 | if( ( abcd[3] >> 0 & 1 ) == 0 ) 97 | return supportedFlags; // no floating point 98 | if( ( abcd[3] >> 23 & 1 ) == 0 ) 99 | return supportedFlags; // no MMX 100 | if( ( abcd[3] >> 15 & 1 ) == 0 ) 101 | return supportedFlags; // no conditional move 102 | if( ( abcd[3] >> 
24 & 1 ) == 0 ) 103 | return supportedFlags; // no FXSAVE 104 | if( ( abcd[3] >> 25 & 1 ) == 0 ) 105 | return supportedFlags; // no SSE 106 | supportedFlags = supportedFlags | FeatureFlag::SSE; 107 | // SSE supported 108 | 109 | if( ( abcd[3] >> 26 & 1 ) == 0 ) 110 | return supportedFlags; // no SSE2 111 | //#else 112 | cpuid( abcd, 1 ); // call cpuid function 1 for feature flags 113 | //#endif 114 | 115 | supportedFlags = supportedFlags | FeatureFlag::SSE2; // default value for 64bit 116 | // SSE2 supported 117 | 118 | if( ( abcd[2] >> 0 & 1 ) == 0 ) 119 | return supportedFlags; // no SSE3 120 | supportedFlags = supportedFlags | FeatureFlag::SSE3; 121 | // SSE3 supported 122 | 123 | if( ( abcd[2] >> 9 & 1 ) == 0 ) 124 | return supportedFlags; // no SSSE3 125 | supportedFlags = supportedFlags | FeatureFlag::SSSE3; 126 | // SSSE3 supported 127 | 128 | if( ( abcd[2] >> 19 & 1 ) == 0 ) 129 | return supportedFlags; // no SSE4.1 130 | supportedFlags = supportedFlags | FeatureFlag::SSE41; 131 | // SSE4.1 supported 132 | 133 | if( ( abcd[2] >> 23 & 1 ) == 0 ) 134 | return supportedFlags; // no POPCNT 135 | if( ( abcd[2] >> 20 & 1 ) == 0 ) 136 | return supportedFlags; // no SSE4.2 137 | supportedFlags = supportedFlags | FeatureFlag::SSE42; 138 | // SSE4.2 supported 139 | 140 | if( ( abcd[2] >> 26 & 1 ) == 0 ) 141 | return supportedFlags; // no XSAVE 142 | if( ( abcd[2] >> 27 & 1 ) == 0 ) 143 | return supportedFlags; // no OSXSAVE 144 | if( ( abcd[2] >> 28 & 1 ) == 0 ) 145 | return supportedFlags; // no AVX 146 | 147 | uint64_t osbv = xgetbv( 0 ); 148 | if( ( osbv & 6 ) != 6 ) 149 | return supportedFlags; // AVX not enabled in O.S. 
150 | supportedFlags = supportedFlags | FeatureFlag::AVX; 151 | // AVX supported 152 | 153 | if constexpr( IsRelaxed() ) 154 | { 155 | if( ( abcd[2] >> 12 & 1 ) == 0 ) 156 | return supportedFlags; // no FMA3 157 | } 158 | 159 | cpuid( abcd, 7 ); // call cpuid leaf 7 for feature flags 160 | if( ( abcd[1] >> 5 & 1 ) == 0 ) 161 | return supportedFlags; // no AVX2 162 | supportedFlags = supportedFlags | FeatureFlag::AVX2; 163 | // AVX2 supported 164 | 165 | if( ( osbv & ( 0xE0 ) ) != 0xE0 ) 166 | return supportedFlags; // AVX512 not enabled in O.S. 167 | if( ( abcd[1] >> 16 & 1 ) == 0 ) 168 | return supportedFlags; // no AVX512 169 | cpuid( abcd, 0xD ); // call cpuid leaf 0xD for feature flags 170 | if( ( abcd[0] & 0x60 ) != 0x60 ) 171 | return supportedFlags; // no AVX512 172 | supportedFlags = supportedFlags | FeatureFlag::AVX512_F; 173 | // AVX512 supported 174 | 175 | cpuid( abcd, 7 ); // call cpuid leaf 7 for feature flags 176 | 177 | if( ( abcd[1] >> 31 & 1 ) == 1 ) 178 | supportedFlags = supportedFlags | FeatureFlag::AVX512_VL; 179 | // AVX512VL supported 180 | 181 | if( ( abcd[1] >> 17 & 1 ) == 1 ) 182 | supportedFlags = supportedFlags | FeatureFlag::AVX512_DQ; 183 | // AVX512DQ supported 184 | 185 | if( ( abcd[1] >> 30 & 1 ) == 1 ) 186 | supportedFlags = supportedFlags | FeatureFlag::AVX512_BW; 187 | // AVX512BW supported 188 | 189 | return supportedFlags; 190 | } 191 | 192 | #elif FASTSIMD_CURRENT_ARCH_IS( ARM ) 193 | static std::uint32_t DetectCpuSupportedFlags() 194 | { 195 | std::uint32_t supportedFlags = 196 | FastSIMD::FeatureFlag::ARM | 197 | FastSIMD::FeatureFlag::Scalar | 198 | FastSIMD::FeatureFlag::NEON | 199 | FastSIMD::FeatureFlag::AARCH64; 200 | 201 | return supportedFlags; 202 | } 203 | 204 | #elif FASTSIMD_CURRENT_ARCH_IS( WASM ) 205 | static std::uint32_t DetectCpuSupportedFlags() 206 | { 207 | std::uint32_t supportedFlags = 208 | FastSIMD::FeatureFlag::WASM | 209 | FastSIMD::FeatureFlag::Scalar; 210 | 211 | return supportedFlags; 212 | } 213 
| #endif 214 | 215 | static FeatureSet FeatureSetValues[] = 216 | { 217 | FeatureSet::SCALAR, 218 | 219 | #if FASTSIMD_CURRENT_ARCH_IS( X86 ) 220 | FeatureSet::SSE, 221 | FeatureSet::SSE2, 222 | FeatureSet::SSE3, 223 | FeatureSet::SSSE3, 224 | FeatureSet::SSE41, 225 | FeatureSet::SSE42, 226 | FeatureSet::AVX, 227 | FeatureSet::AVX2, 228 | FeatureSet::AVX512, 229 | 230 | #elif FASTSIMD_CURRENT_ARCH_IS( ARM ) 231 | FeatureSet::NEON, 232 | FeatureSet::AARCH64, 233 | 234 | #elif FASTSIMD_CURRENT_ARCH_IS( WASM ) 235 | FeatureSet::WASM, 236 | #endif 237 | }; 238 | 239 | FASTSIMD_API FeatureSet DetectCpuMaxFeatureSet() 240 | { 241 | static FeatureSet cache = [] 242 | { 243 | std::uint32_t supportedFlags = DetectCpuSupportedFlags(); 244 | 245 | FeatureSet maxSupported = FeatureSet::Invalid; 246 | 247 | for( FeatureSet featureSet : FeatureSetValues ) 248 | { 249 | // Check if feature set contains unsupported flags 250 | if( ( static_cast( featureSet ) ^ supportedFlags ) & ~supportedFlags ) 251 | { 252 | break; 253 | } 254 | 255 | maxSupported = featureSet; 256 | } 257 | 258 | return maxSupported; 259 | }(); 260 | 261 | return cache; 262 | } 263 | 264 | FASTSIMD_API const char* GetFeatureSetString( FeatureSet featureSet ) 265 | { 266 | switch( featureSet ) 267 | { 268 | case FeatureSet::Invalid: return "Invalid"; 269 | case FeatureSet::SCALAR: return "SCALAR"; 270 | case FeatureSet::SSE: return "SSE"; 271 | case FeatureSet::SSE2: return "SSE2"; 272 | case FeatureSet::SSE3: return "SSE3"; 273 | case FeatureSet::SSSE3: return "SSSE3"; 274 | case FeatureSet::SSE41: return "SSE4.1"; 275 | case FeatureSet::SSE42: return "SSE4.2"; 276 | case FeatureSet::AVX: return "AVX"; 277 | case FeatureSet::AVX2: return "AVX2"; 278 | case FeatureSet::AVX512: return "AVX512"; 279 | case FeatureSet::NEON: return "NEON"; 280 | case FeatureSet::AARCH64: return "AARCH64"; 281 | case FeatureSet::WASM: return "WASM"; 282 | case FeatureSet::Max: return "Max"; 283 | } 284 | 285 | return "NAN"; 286 | } 
287 | } // namespace FastSIMD 288 | -------------------------------------------------------------------------------- /tests/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | 2 | fastsimd_create_dispatch_library(simd_test SOURCES "test.inl" FEATURE_SETS SCALAR SSE2 SSE41 AVX2 AVX512 NEON AARCH64 WASM) 3 | fastsimd_create_dispatch_library(simd_test_relaxed RELAXED SOURCES "test.inl" FEATURE_SETS SCALAR SSE2 SSE41 AVX2 AVX512 NEON AARCH64 WASM) 4 | 5 | add_executable(test "test.cpp") 6 | target_link_libraries(test PRIVATE FastSIMD simd_test simd_test_relaxed) 7 | 8 | if(CMAKE_SYSTEM_NAME STREQUAL "Emscripten") 9 | set(CMAKE_EXECUTABLE_SUFFIX ".html") 10 | target_link_options(test PRIVATE -sALLOW_MEMORY_GROWTH=1 -sSINGLE_FILE) 11 | endif() 12 | 13 | # Enable Warnings 14 | if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "MSVC") 15 | #target_compile_options(test_simd PUBLIC /W4 /WX) 16 | else() 17 | #target_compile_options(test_simd PUBLIC -Wall -Wextra -Wpedantic -Werror -Wno-c++98-compat -Wno-c++98-compat-pedantic -Wno-old-style-cast -Wno-undefined-func-template) 18 | endif() 19 | -------------------------------------------------------------------------------- /tests/test.cpp: -------------------------------------------------------------------------------- 1 | #include "test.h" 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | #include 13 | 14 | static constexpr size_t TestCount = 4096 * 4096; 15 | 16 | static int * rndInts; 17 | static float* rndFloats; 18 | 19 | static float GenFiniteFloat( std::mt19937& gen ) 20 | { 21 | union 22 | { 23 | float f; 24 | int32_t i; 25 | } u; 26 | 27 | do 28 | { 29 | u.i = static_cast( gen() ); 30 | 31 | } while( !std::isfinite( u.f ) ); 32 | 33 | return u.f; 34 | } 35 | 36 | static void GenerateRandomValues() 37 | { 38 | std::cout << "Generating random values..." 
<< std::endl; 39 | 40 | rndInts = new int[TestCount + 1024]; 41 | rndFloats = new float[TestCount + 1024]; 42 | 43 | std::random_device rd; //Will be used to obtain a seed for the random number engine 44 | std::mt19937 gen( rd() ); //Standard mersenne_twister_engine seeded with rd() 45 | 46 | for ( std::size_t i = 0; i < TestCount; i++ ) 47 | { 48 | rndInts[i] = (int)gen(); 49 | rndFloats[i] = GenFiniteFloat( gen ); 50 | } 51 | 52 | //std::sort( rndFloats, rndFloats + TestCount + 1024, std::less() ); 53 | } 54 | 55 | template 56 | struct TestRunner 57 | { 58 | using TestSet = std::vector>>; 59 | 60 | template 61 | struct TestOrganiser 62 | { 63 | static TestCollection GetCollections() 64 | { 65 | return {}; 66 | } 67 | }; 68 | 69 | template 70 | struct TestOrganiser> 71 | { 72 | static TestCollection GetCollections() 73 | { 74 | TestCollection collections = TestOrganiser>::GetCollections(); 75 | 76 | if( HEAD <= FastSIMD::DetectCpuMaxFeatureSet() ) 77 | { 78 | std::cout << "Generating Tests: " << FastSIMD::GetFeatureSetString( HEAD ) << std::endl; 79 | { 80 | std::unique_ptr> testSimd( FastSIMD::NewDispatchClass>( HEAD ) ); 81 | 82 | TestCollection simdCollection = testSimd->RegisterTests(); 83 | 84 | collections.insert( collections.begin(), simdCollection.begin(), simdCollection.end() ); 85 | } 86 | { 87 | std::unique_ptr> testSimd( FastSIMD::NewDispatchClass>( HEAD ) ); 88 | 89 | TestCollection simdCollection = testSimd->RegisterTests(); 90 | 91 | collections.insert( collections.begin(), simdCollection.begin(), simdCollection.end() ); 92 | } 93 | } 94 | 95 | return collections; 96 | } 97 | 98 | static TestSet GetSet() 99 | { 100 | TestCollection collections = GetCollections(); 101 | TestSet set; 102 | 103 | for( auto& collection : collections ) 104 | { 105 | std::string_view& testName = collection.first; 106 | 107 | auto find = std::find_if( set.begin(), set.end(), [testName]( const auto& pair ){ return pair.first == testName; } ); 108 | 109 | if( find == 
set.end() ) 110 | { 111 | if( collection.second.featureSet != FastSIMD::FeatureSet::SCALAR ) 112 | { 113 | throw std::runtime_error( "Scalar must be base test set" ); 114 | } 115 | 116 | find = set.emplace( set.end(), testName, std::vector{} ); 117 | } 118 | 119 | find->second.emplace_back( collection.second ); 120 | } 121 | 122 | return set; 123 | } 124 | }; 125 | 126 | // get value of least significant bit 127 | static float DeltaUnit( float x ) 128 | { 129 | union 130 | { 131 | float f; 132 | uint32_t i; 133 | } u; 134 | x = std::fabs( x ); 135 | 136 | if( !std::isfinite( x ) ) 137 | { 138 | return 1.f; 139 | } 140 | if( x == 0.f || !std::isnormal( x ) ) 141 | { 142 | u.i = 0x00800000; // smallest positive normal number 143 | return u.f; 144 | } 145 | float x1 = x; 146 | u.f = x; 147 | u.i++; 148 | return u.f - x1; 149 | } 150 | 151 | template 152 | static bool CompareTyped( std::string_view testName, FastSIMD::FeatureSet featureSet, float accuracy, size_t outputCount, void* scalarResults, void* simdResults ) 153 | { 154 | bool success = true; 155 | 156 | const T* typedScalar = reinterpret_cast( scalarResults ); 157 | const T* typedSimd = reinterpret_cast( simdResults ); 158 | 159 | for( size_t idx = 0; idx < outputCount; idx++ ) 160 | { 161 | if( typedScalar[idx] != typedSimd[idx] ) 162 | { 163 | float relativeDif = 0; 164 | 165 | if constexpr( std::is_floating_point_v ) 166 | { 167 | if( std::isnan( typedScalar[idx] ) && std::isnan( typedSimd[idx] ) ) 168 | { 169 | continue; 170 | } 171 | 172 | relativeDif = std::abs( typedScalar[idx] - typedSimd[idx] ) / DeltaUnit( typedScalar[idx] ); 173 | 174 | if( relativeDif <= accuracy ) 175 | { 176 | continue; 177 | } 178 | } 179 | if( success ) 180 | { 181 | std::cerr << std::setprecision( 16 ) << std::boolalpha; 182 | std::cerr << "--- " << FastSIMD::GetFeatureSetString( featureSet ) << " FAILED ---" << std::endl; 183 | } 184 | std::cerr << "idx " << idx << ": " << testName 185 | << " Expected \"" << typedScalar[idx] 
186 | << "\" Actual \"" << typedSimd[idx] 187 | << "\" Diff \"" << std::abs( typedScalar[idx] - typedSimd[idx] ) << "\""; 188 | 189 | if( relativeDif != 0.0f ) 190 | { 191 | std::cerr << " (" << relativeDif << ")"; 192 | } 193 | std::cerr << std::endl; 194 | success = false; 195 | } 196 | } 197 | 198 | return success; 199 | } 200 | 201 | static bool CompareOutputs( std::string_view testName, FastSIMD::FeatureSet featureSet, TestData::ReturnType returnType, float accuracy, size_t outputCount, void* scalarResults, void* simdResults ) 202 | { 203 | switch( returnType ) 204 | { 205 | case TestData::ReturnType::boolean: 206 | return CompareTyped( testName, featureSet, accuracy, outputCount, scalarResults, simdResults ); 207 | 208 | case TestData::ReturnType::f32: 209 | return CompareTyped( testName, featureSet, accuracy, outputCount, scalarResults, simdResults ); 210 | 211 | case TestData::ReturnType::i32: 212 | return CompareTyped( testName, featureSet, accuracy, outputCount, scalarResults, simdResults ); 213 | } 214 | 215 | return false; 216 | } 217 | 218 | static void DoTest( std::string_view testName, std::vector& tests ) 219 | { 220 | std::cout << "Testing: " << testName << std::endl; 221 | 222 | char* scalarResults = new char[RegisterBytes]; 223 | char* simdResults = new char[RegisterBytes]; 224 | 225 | for( size_t idx = 0; idx < TestCount; idx += RegisterBytes / sizeof( int ) ) 226 | { 227 | int failed = 0; 228 | 229 | for( size_t testIdx = 0; testIdx < tests.size(); testIdx++ ) 230 | { 231 | TestData& test = tests[testIdx]; 232 | 233 | char* resultsOut = testIdx ? 
simdResults : scalarResults; 234 | std::memset( resultsOut, (int)testIdx, RegisterBytes ); 235 | 236 | size_t outputCount = test.testFunc( resultsOut, idx, rndInts, rndFloats ); 237 | 238 | if( testIdx ) 239 | { 240 | if( test.returnType != tests[0].returnType ) 241 | { 242 | std::cerr << "Tests do not match: " << testName; 243 | throw std::exception(); 244 | } 245 | if( test.featureSet == FastSIMD::FeatureSet::SCALAR && !test.relaxed ) 246 | { 247 | std::cerr << "Multiple tests with same name: " << testName; 248 | throw std::exception(); 249 | } 250 | 251 | std::string testNameRelaxed = testName.data(); 252 | float accuracy = 0; 253 | 254 | if( test.relaxed ) 255 | { 256 | testNameRelaxed += " RELAXED"; 257 | accuracy = test.relaxedAccuracy; 258 | } 259 | 260 | if( !CompareOutputs( testNameRelaxed, test.featureSet, test.returnType, accuracy, outputCount, scalarResults, simdResults ) ) 261 | { 262 | std::cerr << "Inputs: " << tests[0].inputsFunc( idx, rndInts, rndFloats ) << std::endl; 263 | failed++; 264 | } 265 | } 266 | } 267 | 268 | if( failed >= 3 ) 269 | { 270 | std::cerr << "Skipping test, fail limit reached" << std::endl; 271 | break; 272 | } 273 | } 274 | 275 | delete[] scalarResults; 276 | delete[] simdResults; 277 | } 278 | 279 | static void Run() 280 | { 281 | std::cout << "Starting Tests - Register Size: " << RegisterBytes * 8 << " (" << RegisterBytes << "b)" << std::endl; 282 | 283 | TestSet testSet = TestOrganiser::GetSet(); 284 | 285 | for( auto& test : testSet ) 286 | { 287 | //if( test.first.find( "sqrt" ) != std::string_view::npos ) 288 | { 289 | DoTest( test.first, test.second ); 290 | } 291 | } 292 | 293 | std::cout << "Testing Complete!" 
<< std::endl; 294 | } 295 | }; 296 | 297 | int main() 298 | { 299 | GenerateRandomValues(); 300 | 301 | TestRunner::Run(); 302 | 303 | return 0; 304 | } 305 | -------------------------------------------------------------------------------- /tests/test.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | constexpr size_t kTestBytes = 512 / 8; 11 | 12 | struct TestResult 13 | { 14 | uint8_t returnCount; 15 | }; 16 | 17 | using TestFunction = size_t ( void*, size_t, int32_t*, float* ); 18 | using InputsFunction = std::string ( size_t, int32_t*, float* ); 19 | 20 | struct TestData 21 | { 22 | enum class ReturnType 23 | { 24 | boolean, f32, i32 25 | }; 26 | 27 | FastSIMD::FeatureSet featureSet; 28 | bool relaxed; 29 | ReturnType returnType; 30 | float relaxedAccuracy = 0; 31 | std::function testFunc; 32 | std::function inputsFunc; 33 | }; 34 | 35 | using TestCollection = std::vector>; 36 | 37 | template 38 | class TestFastSIMD 39 | { 40 | public: 41 | virtual ~TestFastSIMD() = default; 42 | 43 | virtual TestCollection RegisterTests() = 0; 44 | }; 45 | --------------------------------------------------------------------------------