├── .ci ├── check-format.sh ├── common.sh ├── cross-check.sh └── cross-tool.sh ├── .clang-format ├── .gitattributes ├── .github ├── CODEOWNERS └── workflows │ └── main.yml ├── .gitignore ├── CONTRIBUTING.md ├── LICENSE ├── Makefile ├── README.md ├── sse2neon.h ├── sse2neon.sln ├── sse2neon.vcxproj ├── sse2neon.vcxproj.filters └── tests ├── README.md ├── binding.cpp ├── binding.h ├── common.cpp ├── common.h ├── impl.cpp ├── impl.h └── main.cpp /.ci/check-format.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | . .ci/common.sh 4 | 5 | set -x 6 | 7 | for file in ${SOURCES}; 8 | do 9 | clang-format-18 ${file} > expected-format 10 | diff -u -p --label="${file}" --label="expected coding style" ${file} expected-format 11 | done 12 | exit $(clang-format-18 --output-replacements-xml ${SOURCES} | egrep -c "") 13 | -------------------------------------------------------------------------------- /.ci/common.sh: -------------------------------------------------------------------------------- 1 | GCC_REL=14.2.rel1 2 | ARM_MIRROR=https://github.com/DLTcollab/toolchain-arm/raw/main 3 | 4 | SOURCES=$(find $(git rev-parse --show-toplevel) | egrep "\.(cpp|h)\$" | egrep -v "arm-gnu-toolchain-${GCC_REL}-x86_64-aarch64-none-linux-gnu|arm-gnu-toolchain-${GCC_REL}-x86_64-arm-none-linux-gnueabihf") 5 | 6 | # Expect host is Linux/x86_64 7 | check_platform() 8 | { 9 | MACHINE_TYPE=`uname -m` 10 | if [ ${MACHINE_TYPE} != 'x86_64' ]; then 11 | exit 12 | fi 13 | 14 | OS_TYPE=`uname -s` 15 | if [ ${OS_TYPE} != 'Linux' ]; then 16 | exit 17 | fi 18 | } 19 | -------------------------------------------------------------------------------- /.ci/cross-check.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | . .ci/common.sh 4 | 5 | check_platform 6 | 7 | # Clang/LLVM is natively a cross-compiler. 
8 | # TODO: Do cross-compilation using Clang 9 | # https://clang.llvm.org/docs/CrossCompilation.html 10 | if [ $(printenv CXX | grep clang) ]; then 11 | exit 12 | fi 13 | 14 | set -x 15 | 16 | make clean 17 | export PATH=arm-gnu-toolchain-${GCC_REL}-x86_64-aarch64-none-linux-gnu/bin:$PATH 18 | make CROSS_COMPILE=aarch64-none-linux-gnu- check || exit 1 # ARMv8-A 19 | 20 | make clean 21 | export PATH=arm-gnu-toolchain-${GCC_REL}-x86_64-arm-none-linux-gnueabihf/bin:$PATH 22 | make CROSS_COMPILE=arm-none-linux-gnueabihf- check || exit 1 # ARMv7-A 23 | -------------------------------------------------------------------------------- /.ci/cross-tool.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | . .ci/common.sh 4 | 5 | check_platform 6 | 7 | sudo apt-get update -q -y 8 | sudo apt-get install -q -y qemu-user 9 | 10 | # Clang/LLVM is natively a cross-compiler, meaning that one set of programs 11 | # can compile to all targets by setting the -target option. 
12 | if [ $(printenv CXX | grep clang) ]; then 13 | exit 14 | fi 15 | 16 | set -x 17 | 18 | sudo apt-get install -y curl xz-utils 19 | 20 | curl -L \ 21 | ${ARM_MIRROR}/arm-gnu-toolchain-${GCC_REL}-x86_64-arm-none-linux-gnueabihf.tar.xz \ 22 | | tar -Jx || exit 1 23 | 24 | curl -L \ 25 | ${ARM_MIRROR}/arm-gnu-toolchain-${GCC_REL}-x86_64-aarch64-none-linux-gnu.tar.xz \ 26 | | tar -Jx || exit 1 27 | -------------------------------------------------------------------------------- /.clang-format: -------------------------------------------------------------------------------- 1 | BasedOnStyle: Chromium 2 | Language: Cpp 3 | MaxEmptyLinesToKeep: 3 4 | IndentCaseLabels: false 5 | AllowShortIfStatementsOnASingleLine: false 6 | AllowShortCaseLabelsOnASingleLine: false 7 | AllowShortLoopsOnASingleLine: false 8 | DerivePointerAlignment: false 9 | PointerAlignment: Right 10 | SpaceAfterCStyleCast: true 11 | TabWidth: 4 12 | UseTab: Never 13 | IndentWidth: 4 14 | BreakBeforeBraces: Linux 15 | AccessModifierOffset: -4 16 | ForEachMacros: 17 | - SET_FOREACH 18 | - RB_FOREACH 19 | AlignEscapedNewlines: Left 20 | AttributeMacros: 21 | - FORCE_INLINE 22 | - ALIGN_STRUCT 23 | -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | *.md text=auto 2 | LICENSE text=auto 3 | 4 | sse2neon.h -text linguist-language=c 5 | -------------------------------------------------------------------------------- /.github/CODEOWNERS: -------------------------------------------------------------------------------- 1 | # Lines starting with '#' are comments. 
2 | # More details are here: https://help.github.com/articles/about-codeowners/ 3 | 4 | # Global codeowners: 5 | * @jserv @howjmay 6 | -------------------------------------------------------------------------------- /.github/workflows/main.yml: -------------------------------------------------------------------------------- 1 | name: GitHub Actions 2 | 3 | on: [push, pull_request] 4 | 5 | jobs: 6 | host-x86: 7 | runs-on: ubuntu-24.04 8 | strategy: 9 | matrix: 10 | arch: [x86_64] 11 | cxx_compiler: [g++, clang++] 12 | steps: 13 | - name: checkout code 14 | uses: actions/checkout@v4 15 | - name: build artifact 16 | env: 17 | CXX: ${{ matrix.cxx_compiler }} 18 | run: | 19 | sh .ci/cross-tool.sh 20 | make check 21 | sh .ci/cross-check.sh 22 | 23 | host-win: 24 | runs-on: windows-2022 25 | strategy: 26 | matrix: 27 | arch: 28 | - x86_64 29 | - armv7 30 | - aarch64 31 | env: 32 | LLVM_MINGW_URL: https://github.com/mstorsjo/llvm-mingw/releases/download/20241217/llvm-mingw-20241217-msvcrt-x86_64.zip 33 | defaults: 34 | run: 35 | shell: bash 36 | steps: 37 | - name: unpack llvm-mingw 38 | run: | 39 | curl -L -O $LLVM_MINGW_URL 40 | unzip -q llvm-mingw-*.zip 41 | rm llvm-mingw-*.zip 42 | mv llvm-mingw-* "$HOME/llvm-mingw" 43 | echo "$HOME/llvm-mingw/bin" >> $GITHUB_PATH 44 | - name: checkout code 45 | uses: actions/checkout@v4 46 | - name: build artifact 47 | env: 48 | CXX: ${{ matrix.arch }}-w64-mingw32-clang++ 49 | run: mingw32-make processor=${{ matrix.arch }} 50 | - name: run tests 51 | if: matrix.arch == 'x86_64' 52 | run: mingw32-make check 53 | 54 | host-arm: 55 | runs-on: ubuntu-24.04 56 | strategy: 57 | matrix: 58 | arch_with_features: [ 59 | {arch: armv7, feature: none, arch_cflags: none}, 60 | {arch: aarch64, feature: none, arch_cflags: none}, 61 | {arch: aarch64, feature: crypto+crc, arch_cflags: none}, 62 | {arch: armv7, feature: none, arch_cflags: '-mcpu=cortex-a32 -mfpu=neon-fp-armv8'} 63 | ] 64 | cxx_compiler: [g++, clang++-15] 65 | steps: 66 | - name: 
checkout code 67 | uses: actions/checkout@v4 68 | - name: build artifact 69 | # The Github Action for non-x86 CPU 70 | # https://github.com/uraimo/run-on-arch-action 71 | uses: uraimo/run-on-arch-action@v2 72 | with: 73 | arch: ${{ matrix.arch_with_features.arch }} 74 | distro: ubuntu22.04 75 | # Speed up builds by storing container images in a GitHub package registry. 76 | githubToken: ${{ github.token }} 77 | env: | 78 | CXX: ${{ matrix.cxx_compiler }} 79 | ARCH_CFLAGS: ${{ matrix.arch_with_features.arch_cflags }} 80 | install: | 81 | apt-get update -q -y 82 | apt-get install -q -y gcc "${{ matrix.cxx_compiler }}" make 83 | run: | 84 | make FEATURE=${{ matrix.arch_with_features.feature }} check 85 | 86 | host-win-msvc: 87 | runs-on: windows-2022 88 | steps: 89 | - name: checkout code 90 | uses: actions/checkout@v4 91 | 92 | - name: add msbuild to PATH 93 | uses: microsoft/setup-msbuild@v2 94 | 95 | - name: build artifact 96 | run: msbuild sse2neon.vcxproj -t:rebuild -property:Configuration=Release -property:Platform=ARM64 97 | 98 | - name: upload artifact 99 | uses: actions/upload-artifact@master 100 | with: 101 | name: msvc-arm64-artifact 102 | path: ARM64 103 | 104 | test-win-msvc: 105 | runs-on: ubuntu-24.04 106 | container: linaro/wine-arm64 107 | needs: host-win-msvc 108 | steps: 109 | - name: download artifact 110 | uses: actions/download-artifact@master 111 | with: 112 | name: msvc-arm64-artifact 113 | 114 | - name: Run tests 115 | run: wine-arm64 cmd.exe /c 'Release\sse2neon.exe' 116 | 117 | 118 | coding-style: 119 | runs-on: ubuntu-24.04 120 | steps: 121 | - name: checkout code 122 | uses: actions/checkout@v4 123 | - name: style check 124 | run: | 125 | sudo apt-get install -q -y clang-format-18 126 | sh .ci/check-format.sh 127 | shell: bash 128 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.exe 2 | *.o 3 | *.gch 4 | 
tests/*.d 5 | tests/main 6 | gcc-arm-* 7 | .vs/ 8 | Debug/ 9 | Release/ 10 | *.vcxproj.user 11 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing to SSE2NEON 2 | 3 | :+1::tada: First off, thanks for taking the time to contribute! :tada::+1: 4 | 5 | The following is a set of guidelines for contributing to [SSE2NEON](https://github.com/DLTcollab/sse2neon), 6 | hosted on GitHub. These are mostly guidelines, not rules. Use your best 7 | judgment, and feel free to propose changes to this document in a pull request. 8 | 9 | ## Issues 10 | 11 | This project uses GitHub Issues to track ongoing development, discuss project plans, and keep track of bugs. Be sure to search for existing issues before you create another one. 12 | 13 | Visit our [Issues page on GitHub](https://github.com/DLTcollab/sse2neon/issues) to search and submit. 14 | 15 | ## Add New Intrinsic 16 | 17 | The new intrinsic conversion should be added in the `sse2neon.h` file, 18 | and it should be placed in the correct classification with the alphabetical order. 19 | The classification can be referenced from [Intel Intrinsics Guide](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html). 20 | 21 | Classification: `SSE`, `SSE2`, `SSE3`, `SSSE3`, `SSE4.1`, `SSE4.2` 22 | 23 | ## Coding Convention 24 | 25 | We welcome all contributions from corporate, academic and individual developers. However, there are a number of fundamental ground rules that you must adhere to in order to participate. These rules are outlined as follows: 26 | * All code must adhere to the existing C coding style (see below). While we are somewhat flexible in basic style, you will adhere to what is currently in place. Uncommented, complicated algorithmic constructs will be rejected. 
27 | * All external pull requests must contain sufficient documentation in the pull request comments in order to be accepted. 28 | 29 | Software requirement: [clang-format](https://clang.llvm.org/docs/ClangFormat.html) version 18 or later. 30 | 31 | Use the command `$ clang-format -i *.[ch]` to enforce a consistent coding style. 32 | 33 | ## Naming Conventions 34 | 35 | There are some general rules. 36 | * Names with leading and trailing underscores are reserved for system purposes, and most systems use them for names that the user should not have to know. 37 | * Function, typedef, and variable names, as well as struct, union, and enum tag names should be in lower case. 38 | * Many function-like macros are in all CAPS. 39 | * Avoid names that differ only in case, like `foo` and `Foo`. Similarly, avoid `foobar` and `foo_bar`. The potential for confusion is considerable. 40 | * Similarly, avoid names that look like each other. On many terminals and printers, `l`, `1` and `I` look quite similar. A variable named `l` is particularly bad because it looks so much like the constant `1`. 41 | 42 | In general, global names (including enums) should have a common prefix (`SSE2NEON_` for macros and enum constants; `_sse2neon_` for functions) identifying the module that they belong with. Globals may alternatively be grouped in a global structure. Typedeffed names often have `_t` appended to their name. 43 | 44 | Avoid using names that might conflict with other names used in standard libraries. There may be more library code included in some systems than you need. Your program could also be extended in the future. 45 | 46 | ## Coding Style for Modern C 47 | 48 | This coding style is a variation of the K&R style. Some general principles: honor tradition, but accept progress; be consistent; 49 | embrace the latest C standards; embrace modern compilers, their static analysis 50 | capabilities and sanitizers. 51 | 52 | ### Indentation 53 | 54 | Use 4 spaces rather than tabs. 
55 | 56 | ### Line length 57 | 58 | All lines should generally be within 80 characters. Wrap long lines. 59 | There are some good reasons behind this: 60 | * It forces the developer to write more succinct code; 61 | * Humans are better at processing information in smaller quantity portions; 62 | * It helps users of vi/vim (and potentially other editors) who use vertical splits. 63 | 64 | ### Comments 65 | 66 | Multi-line comments shall have the opening and closing characters 67 | in a separate line, with the lines containing the content prefixed by a space 68 | and the `*` characters for alignment, e.g., 69 | ```c 70 | /* 71 | * This is a multi-line comment. 72 | */ 73 | 74 | /* One line comment. */ 75 | ``` 76 | 77 | Use multi-line comments for more elaborative descriptions or before more 78 | significant logical block of code. 79 | 80 | Single-line comments shall be written in C89 style: 81 | ```c 82 | return (uintptr_t) val; /* return a bitfield */ 83 | ``` 84 | 85 | Leave two spaces between the statement and the inline comment. 86 | 87 | ### Spacing and brackets 88 | 89 | Use one space after the conditional or loop keyword, no spaces around 90 | their brackets, and one space before the opening curly bracket. 91 | 92 | Functions (their declarations or calls), `sizeof` operator or similar 93 | macros shall not have a space after their name/keyword or around the 94 | brackets, e.g., 95 | ```c 96 | unsigned total_len = offsetof(obj_t, items[n]); 97 | unsigned obj_len = sizeof(obj_t); 98 | ``` 99 | 100 | Use brackets to avoid ambiguity and with operators such as `sizeof`, 101 | but otherwise avoid redundant or excessive brackets. 102 | 103 | ### Variable names and declarations 104 | 105 | - Use descriptive names for global variables and short names for locals. 106 | Find the right balance between descriptive and succinct. 107 | 108 | - Use [snakecase](https://en.wikipedia.org/wiki/Snake_case). 109 | Do not use "camelcase". 
110 | 111 | - Do not use Hungarian notation or other unnecessary prefixing or suffixing. 112 | 113 | - Use the following spacing for pointers: 114 | ```c 115 | const char *name; /* const pointer; '*' with the name and space before it */ 116 | conf_t * const cfg; /* pointer to a const data; spaces around 'const' */ 117 | const uint8_t * const charmap; /* const pointer and const data */ 118 | const void * restrict key; /* const pointer which does not alias */ 119 | ``` 120 | 121 | ### Type definitions 122 | 123 | Declarations shall be on the same line, e.g., 124 | ```c 125 | typedef void (*dir_iter_t)(void *, const char *, struct dirent *); 126 | ``` 127 | 128 | _Typedef_ structures rather than pointers. Note that structures can be kept 129 | opaque if they are not dereferenced outside the translation unit where they 130 | are defined. Pointers can be _typedefed_ only if there is a very compelling 131 | reason. 132 | 133 | New types may be suffixed with `_t`. Structure name, when used within the 134 | translation unit, may be omitted, e.g.: 135 | 136 | ```c 137 | typedef struct { 138 | unsigned if_index; 139 | unsigned addr_len; 140 | addr_t next_hop; 141 | } route_info_t; 142 | ``` 143 | 144 | ### Initialization 145 | 146 | Embrace C99 structure initialization where reasonable, e.g., 147 | ```c 148 | static const crypto_ops_t openssl_ops = { 149 | .create = openssl_crypto_create, 150 | .destroy = openssl_crypto_destroy, 151 | .encrypt = openssl_crypto_encrypt, 152 | .decrypt = openssl_crypto_decrypt, 153 | .hmac = openssl_crypto_hmac, 154 | }; 155 | ``` 156 | 157 | Embrace C99 array initialization, especially for the state machines, e.g., 158 | ```c 159 | static const uint8_t tcp_fsm[TCP_NSTATES][2][TCPFC_COUNT] = { 160 | [TCPS_CLOSED] = { 161 | [FLOW_FORW] = { 162 | /* Handshake (1): initial SYN. */ 163 | [TCPFC_SYN] = TCPS_SYN_SENT, 164 | }, 165 | }, 166 | ... 
167 | } 168 | ``` 169 | 170 | ### Control structures 171 | 172 | Try to make the control flow easy to follow. Avoid long convoluted logic 173 | expressions; try to split them where possible (into inline functions, 174 | separate if-statements, etc). 175 | 176 | The control structure keyword and the expression in the brackets should be 177 | separated by a single space. The opening curly bracket shall be in the 178 | same line, also separated by a single space. Example: 179 | 180 | ```c 181 | for (;;) { 182 | obj = get_first(); 183 | while ((obj = get_next(obj))) { 184 | ... 185 | } 186 | if (done) 187 | break; 188 | } 189 | ``` 190 | 191 | Do not add inner spaces around the brackets. There should be one space after 192 | the semicolon when `for` has expressions: 193 | ```c 194 | for (unsigned i = 0; i < __arraycount(items); i++) { 195 | ... 196 | } 197 | ``` 198 | 199 | #### Avoid unnecessary nesting levels 200 | 201 | Avoid: 202 | ```c 203 | int inspect(obj_t *obj) 204 | { 205 | if (cond) { 206 | ... 207 | /* long code block */ 208 | ... 209 | return 0; 210 | } 211 | return -1; 212 | } 213 | ``` 214 | 215 | Consider: 216 | ```c 217 | int inspect(obj_t *obj) 218 | { 219 | if (!cond) 220 | return -1; 221 | 222 | ... 223 | return 0; 224 | } 225 | ``` 226 | 227 | However, do not make logic more convoluted. 228 | 229 | ### `if` statements 230 | 231 | Curly brackets and spacing follow the K&R style: 232 | ```c 233 | if (a == b) { 234 | .. 235 | } else if (a < b) { 236 | ... 237 | } else { 238 | ... 239 | } 240 | ``` 241 | 242 | Simple and succinct one-line if-statements may omit curly brackets: 243 | ```c 244 | if (!valid) 245 | return -1; 246 | ``` 247 | 248 | However, do prefer curly brackets with multi-line or more complex statements. 249 | If one branch uses curly brackets, then all other branches shall use the 250 | curly brackets too. 
251 | 252 | Wrap long conditions to the if-statement indentation adding extra 4 spaces: 253 | ```c 254 | if (some_long_expression && 255 | another_expression) { 256 | ... 257 | } 258 | ``` 259 | 260 | #### Avoid redundant `else` 261 | 262 | Avoid: 263 | ```c 264 | if (flag & F_FEATURE_X) { 265 | ... 266 | return 0; 267 | } else { 268 | return -1; 269 | } 270 | ``` 271 | 272 | Consider: 273 | ```c 274 | if (flag & F_FEATURE_X) { 275 | ... 276 | return 0; 277 | } 278 | return -1; 279 | ``` 280 | 281 | ### `switch` statements 282 | 283 | Switch statements should have the `case` blocks at the same indentation 284 | level, e.g.: 285 | ```c 286 | switch (expr) { 287 | case A: 288 | ... 289 | break; 290 | case B: 291 | /* fallthrough */ 292 | case C: 293 | ... 294 | break; 295 | } 296 | ``` 297 | 298 | If the case block does not break, then it is strongly recommended to add a 299 | comment containing "fallthrough" to indicate it. Modern compilers can also 300 | be configured to require such comment (see gcc `-Wimplicit-fallthrough`). 301 | 302 | ### Function definitions 303 | 304 | The opening and closing curly brackets shall also be in the separate lines (K&R style). 305 | 306 | ```c 307 | ssize_t hex_write(FILE *stream, const void *buf, size_t len) 308 | { 309 | ... 310 | } 311 | ``` 312 | 313 | Do not use old style K&R style C definitions. 314 | 315 | ### Object abstraction 316 | 317 | Objects are often "simulated" by the C programmers with a `struct` and 318 | its "public API". To enforce the information hiding principle, it is a 319 | good idea to define the structure in the source file (translation unit) 320 | and provide only the _declaration_ in the header. 
For example, `obj.c`: 321 | 322 | ```c 323 | #include "obj.h" 324 | 325 | struct obj { 326 | int value; 327 | }; 328 | 329 | obj_t *obj_create(void) 330 | { 331 | return calloc(1, sizeof(obj_t)); 332 | } 333 | 334 | void obj_destroy(obj_t *obj) 335 | { 336 | free(obj); 337 | } 338 | ``` 339 | 340 | With an example `obj.h`: 341 | ```c 342 | #ifndef _OBJ_H_ 343 | #define _OBJ_H_ 344 | 345 | typedef struct obj obj_t; 346 | 347 | obj_t *obj_create(void); 348 | void obj_destroy(obj_t *); 349 | 350 | #endif 351 | ``` 352 | 353 | Such structuring will prevent direct access of the `obj_t` members outside 354 | the `obj.c` source file. The implementation (of such "class" or "module") 355 | may be large and abstracted within separate source files. In such case, 356 | consider separating structures and "methods" into separate headers (think of 357 | different visibility), for example `obj_impl.h` (private) and `obj.h` (public). 358 | 359 | Consider `crypto_impl.h`: 360 | ```c 361 | #ifndef _CRYPTO_IMPL_H_ 362 | #define _CRYPTO_IMPL_H_ 363 | 364 | #if !defined(__CRYPTO_PRIVATE) 365 | #error "only to be used by the crypto modules" 366 | #endif 367 | 368 | #include "crypto.h" 369 | 370 | typedef struct crypto { 371 | crypto_cipher_t cipher; 372 | void *key; 373 | size_t key_len; 374 | ... 375 | } 376 | ... 377 | 378 | #endif 379 | ``` 380 | 381 | And `crypto.h` (public API): 382 | 383 | ```c 384 | #ifndef _CRYPTO_H_ 385 | #define _CRYPTO_H_ 386 | 387 | typedef struct crypto crypto_t; 388 | 389 | crypto_t *crypto_create(crypto_cipher_t); 390 | void crypto_destroy(crypto_t *); 391 | ... 392 | 393 | #endif 394 | ``` 395 | 396 | ### Use reasonable types 397 | 398 | Use `unsigned` for general iterators; use `size_t` for general sizes; use 399 | `ssize_t` to return a size which may include an error. Of course, consider 400 | possible overflows. 
401 | 402 | Avoid using `uint8_t` or `uint16_t` or other sub-word types for general 403 | iterators and similar cases, unless programming for micro-controllers or 404 | other constrained environments. 405 | 406 | C has rather peculiar _type promotion rules_ and unnecessary use of sub-word 407 | types might contribute to a bug once in a while. 408 | 409 | ### Embrace portability 410 | 411 | #### Byte-order 412 | 413 | Do not assume x86 or little-endian architecture. Use endian conversion 414 | functions for operating the on-disk and on-the-wire structures or other 415 | cases where it is appropriate. 416 | 417 | #### Types 418 | 419 | - Do not assume a particular 32-bit vs 64-bit architecture, e.g., do not 420 | assume the size of `long` or `unsigned long`. Use `int64_t` or `uint64_t` 421 | for the 8-byte integers. 422 | 423 | - Do not assume `char` is signed; for example, on Arm it is unsigned. 424 | 425 | - Use C99 macros for constant prefixes or formatting of the fixed-width 426 | types. 427 | 428 | Use: 429 | ```c 430 | #define SOME_CONSTANT (UINT64_C(1) << 48) 431 | printf("val %" PRIu64 "\n", SOME_CONSTANT); 432 | ``` 433 | 434 | Do not use: 435 | ```c 436 | #define SOME_CONSTANT (1ULL << 48) 437 | printf("val %lld\n", SOME_CONSTANT); 438 | ``` 439 | 440 | #### Avoid unaligned access 441 | 442 | Do not assume unaligned access is safe. It is not safe on Arm, POWER, 443 | and various other architectures. Moreover, even on x86 unaligned access 444 | is slower. 445 | 446 | #### Avoid extreme portability 447 | 448 | Unless programming for micro-controllers or exotic CPU architectures, 449 | focus on the common denominator of the modern CPU architectures, avoiding 450 | the very maximum portability which can make the code unnecessarily cumbersome. 451 | 452 | Some examples: 453 | - It is fair to assume `sizeof(int) == 4` since it is the case on all modern 454 | mainstream architectures. PDP-11 era is long gone. 
455 | - Using `1U` instead of `UINT32_C(1)` or `(uint32_t) 1` is also fine. 456 | - It is fair to assume that `NULL` is matching `(uintptr_t) 0` and it is fair 457 | to `memset()` structures with zero. Non-zero `NULL` is for retro computing. 458 | 459 | ## References 460 | - [Linux kernel coding style](https://www.kernel.org/doc/html/latest/process/coding-style.html) 461 | - 1999, Brian W. Kernighan and Rob Pike, The Practice of Programming, Addison–Wesley. 462 | - 1993, Bill Shannon, [C Style and Coding Standards for SunOS](https://devnull-cz.github.io/unix-linux-prog-in-c/cstyle.ms.pdf) 463 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2015-2025 SSE2NEON Contributors 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | ifndef CC 2 | override CC = gcc 3 | endif 4 | 5 | ifndef CXX 6 | override CXX = g++ 7 | endif 8 | 9 | ifndef CROSS_COMPILE 10 | processor := $(shell uname -m) 11 | else # CROSS_COMPILE was set 12 | CC = $(CROSS_COMPILE)gcc 13 | CXX = $(CROSS_COMPILE)g++ 14 | CXXFLAGS += -static 15 | LDFLAGS += -static 16 | check_arm := $(shell echo | $(CROSS_COMPILE)cpp -dM - | grep " __ARM_ARCH " | cut -c20-) 17 | ifeq ($(check_arm),8) 18 | processor = aarch64 19 | else ifeq ($(check_arm),7) # detect ARMv7-A only 20 | processor = arm 21 | else 22 | $(error Unsupported cross-compiler) 23 | endif 24 | endif 25 | 26 | EXEC_WRAPPER = 27 | ifdef CROSS_COMPILE 28 | EXEC_WRAPPER = qemu-$(processor) 29 | endif 30 | 31 | # Follow platform-specific configurations 32 | ARCH_CFLAGS ?= 33 | ARCH_CFLAGS_IS_SET = 34 | ifeq ($(ARCH_CFLAGS),) 35 | ARCH_CFLAGS_IS_SET = true 36 | endif 37 | ifeq ($(ARCH_CFLAGS),none) 38 | ARCH_CFLAGS_IS_SET = true 39 | endif 40 | ifdef ARCH_CFLAGS_IS_SET 41 | ifeq ($(processor),$(filter $(processor),aarch64 arm64)) 42 | override ARCH_CFLAGS := -march=armv8-a+fp+simd 43 | else ifeq ($(processor),$(filter $(processor),i386 x86_64)) 44 | override ARCH_CFLAGS := -maes -mpclmul -mssse3 -msse4.2 45 | else ifeq ($(processor),$(filter $(processor),arm armv7 armv7l)) 46 | override ARCH_CFLAGS := -mfpu=neon 47 | else 48 | $(error Unsupported architecture) 49 | endif 50 | endif 51 | 52 | FEATURE ?= 53 | ifneq ($(FEATURE),) 54 | ifneq ($(FEATURE),none) 55 | COMMA:= , 56 | ARCH_CFLAGS := $(ARCH_CFLAGS)+$(subst $(COMMA),+,$(FEATURE)) 57 | endif 58 | endif 59 | 60 | CXXFLAGS += -Wall -Wcast-qual -Wconversion -I. 
$(ARCH_CFLAGS) -std=gnu++14 61 | LDFLAGS += -lm 62 | OBJS = \ 63 | tests/binding.o \ 64 | tests/common.o \ 65 | tests/impl.o \ 66 | tests/main.o 67 | deps := $(OBJS:%.o=%.o.d) 68 | 69 | .SUFFIXES: .o .cpp 70 | .cpp.o: 71 | $(CXX) -o $@ $(CXXFLAGS) -c -MMD -MF $@.d $< 72 | 73 | EXEC = tests/main 74 | 75 | $(EXEC): $(OBJS) 76 | $(CXX) $(LDFLAGS) -o $@ $^ 77 | 78 | check: tests/main 79 | ifeq ($(processor),$(filter $(processor),aarch64 arm64 arm armv7l)) 80 | $(CC) $(ARCH_CFLAGS) -c sse2neon.h 81 | endif 82 | $(EXEC_WRAPPER) $^ 83 | 84 | indent: 85 | @echo "Formatting files with clang-format.." 86 | @if ! hash clang-format-18; then echo "clang-format-18 is required to indent"; fi 87 | clang-format-18 -i sse2neon.h tests/*.cpp tests/*.h 88 | 89 | .PHONY: clean check format 90 | clean: 91 | $(RM) $(OBJS) $(EXEC) $(deps) sse2neon.h.gch 92 | 93 | -include $(deps) 94 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # sse2neon 2 | ![GitHub Actions](https://github.com/DLTcollab/sse2neon/workflows/GitHub%20Actions/badge.svg) 3 | 4 | A C/C++ header file that converts Intel SSE intrinsics to Arm/Aarch64 NEON intrinsics. 5 | 6 | ## Introduction 7 | 8 | `sse2neon` is a translator of Intel SSE (Streaming SIMD Extensions) intrinsics 9 | to [Arm NEON](https://developer.arm.com/architectures/instruction-sets/simd-isas/neon), 10 | shortening the time needed to get an Arm working program that then can be used to 11 | extract profiles and to identify hot paths in the code. 12 | The header file `sse2neon.h` contains several of the functions provided by Intel 13 | intrinsic headers such as `<xmmintrin.h>`, only implemented with NEON-based counterparts 14 | to produce the exact semantics of the intrinsics. 
15 | 16 | ## Mapping and Coverage 17 | 18 | Header file | Extension | 19 | ---|---| 20 | `<mmintrin.h>` | MMX | 21 | `<xmmintrin.h>` | SSE | 22 | `<emmintrin.h>` | SSE2 | 23 | `<pmmintrin.h>` | SSE3 | 24 | `<tmmintrin.h>` | SSSE3 | 25 | `<smmintrin.h>` | SSE4.1 | 26 | `<nmmintrin.h>` | SSE4.2 | 27 | `<wmmintrin.h>` | AES | 28 | 29 | `sse2neon` aims to support the SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2 and AES extensions. 30 | 31 | In order to deliver NEON-equivalent intrinsics for all SSE intrinsics used widely, 32 | please be aware that some SSE intrinsics have a direct mapping to a concrete 33 | NEON-equivalent intrinsic. Others, unfortunately, lack a 1:1 mapping, meaning that 34 | their equivalents are built utilizing a number of NEON intrinsics. 35 | 36 | For example, SSE intrinsic `_mm_loadu_si128` has a direct NEON mapping (`vld1q_s32`), 37 | but SSE intrinsic `_mm_maddubs_epi16` has to be implemented with 13+ NEON instructions. 38 | 39 | ### Floating-point compatibility 40 | 41 | Some conversions require several NEON intrinsics, which may produce inconsistent results 42 | compared to their SSE counterparts due to differences in the arithmetic rules of IEEE-754. 43 | 44 | Taking a possible conversion of `_mm_rsqrt_ps` as example: 45 | 46 | ```c 47 | __m128 _mm_rsqrt_ps(__m128 in) 48 | { 49 | float32x4_t out = vrsqrteq_f32(vreinterpretq_f32_m128(in)); 50 | 51 | out = vmulq_f32( 52 | out, vrsqrtsq_f32(vmulq_f32(vreinterpretq_f32_m128(in), out), out)); 53 | 54 | return vreinterpretq_m128_f32(out); 55 | } 56 | ``` 57 | 58 | The `_mm_rsqrt_ps` conversion will produce NaN if a source value is `0.0` (first INF for the 59 | reciprocal square root of `0.0`, then INF * `0.0` using `vmulq_f32`). In contrast, 60 | the SSE counterpart produces INF if a source value is `0.0`. 61 | As a result, additional treatments should be applied to ensure consistency between the conversion and its SSE counterpart. 62 | 63 | ## Requirement 64 | 65 | Developers are advised to utilize sse2neon.h with GCC version 10 or higher, or Clang version 11 or higher. 
While sse2neon.h might be compatible with earlier versions, certain vector operation errors have been identified in those versions. For further details, refer to the discussion in issue [#622](https://github.com/DLTcollab/sse2neon/issues/622). 66 | 67 | ## Usage 68 | 69 | - Put the file `sse2neon.h` into your source code directory. 70 | 71 | - Locate the following SSE header files included in the code: 72 | ```C 73 | #include <xmmintrin.h> 74 | #include <emmintrin.h> 75 | ``` 76 | {p,t,s,n,w}mmintrin.h can be replaced as well. 77 | 78 | - Replace them with: 79 | ```C 80 | #include "sse2neon.h" 81 | ``` 82 | - If you target Windows Arm64EC, pass `/D_DISABLE_SOFTINTRIN_=1` to MSVC or add `#define _DISABLE_SOFTINTRIN_ 1` before you `#include` any Windows header files to disable implicit inclusion of SSE header files. 83 | - Explicitly specify platform-specific options to gcc/clang compilers. 84 | * On ARMv8-A 64-bit targets, you should specify the following compiler option: (Remove `crypto` and/or `crc` if your architecture does not support cryptographic and/or CRC32 extensions) 85 | ```shell 86 | -march=armv8-a+fp+simd+crypto+crc 87 | ``` 88 | * On ARMv8-A 32-bit targets, you should specify the following compiler option: 89 | ```shell 90 | -mfpu=neon-fp-armv8 91 | ``` 92 | * On ARMv7-A targets, you need to append the following compiler option: 93 | ```shell 94 | -mfpu=neon 95 | ``` 96 | 97 | ## Compile-time Configurations 98 | 99 | Though floating-point operations in NEON use the IEEE single-precision format, NEON does not fully comply with the IEEE standard when inputs or results are denormal or NaN values, in order to minimize power consumption and maximize performance. 100 | Considering the balance between correctness and performance, `sse2neon` recognizes the following compile-time configurations: 101 | * `SSE2NEON_PRECISE_MINMAX`: Enable precise implementation of `_mm_min_{ps,pd}` and `_mm_max_{ps,pd}`. If you need consistent results, such as in the handling of NaN values, enable it. 
102 | * `SSE2NEON_PRECISE_DIV`: Enable precise implementation of `_mm_rcp_ps` and `_mm_div_ps` by additional Newton-Raphson iteration for accuracy. 103 | * `SSE2NEON_PRECISE_SQRT`: Enable precise implementation of `_mm_sqrt_ps` and `_mm_rsqrt_ps` by additional Newton-Raphson iteration for accuracy. 104 | * `SSE2NEON_PRECISE_DP`: Enable precise implementation of `_mm_dp_pd`. When the conditional bit is not set, the corresponding multiplication would not be executed. 105 | * `SSE2NEON_SUPPRESS_WARNINGS`: Set this macro to disable the warning which is emitted by default when optimizations are enabled. 106 | 107 | The above are turned off by default, and you should define the corresponding macro(s) as `1` before including `sse2neon.h` if you need the precise implementations. 108 | 109 | ## Run Built-in Test Suite 110 | 111 | `sse2neon` provides a unified interface for developing test cases. These test 112 | cases are located in the `tests` directory, and the input data is specified at 113 | runtime. Use the following commands to perform test cases: 114 | ```shell 115 | $ make check 116 | ``` 117 | 118 | For running check with enabling features, you can assign the features with the `FEATURE` variable. 119 | If `none` is assigned, then the command will be the same as simply calling `make check`. 120 | The following command enables `crypto` and `crc` features in the tests. 121 | ``` 122 | $ make FEATURE=crypto+crc check 123 | ``` 124 | 125 | For running check on certain CPU, setting the mode of FPU, etc., 126 | you can also assign the desired options with the `ARCH_CFLAGS` variable. 127 | If `none` is assigned, the command acts the same as calling `make check`. 
128 | For instance, to run tests on Cortex-A53 with enabling ARM VFPv4 extension and NEON: 129 | ``` 130 | $ make ARCH_CFLAGS="-mcpu=cortex-a53 -mfpu=neon-vfpv4" check 131 | ``` 132 | 133 | ### Running tests on hosts other than ARM platform 134 | 135 | For running tests on hosts other than ARM platform, 136 | you can specify GNU toolchain for cross compilation with `CROSS_COMPILE` command. 137 | [QEMU](https://www.qemu.org/) should be installed in advance. 138 | 139 | For ARMv8-A running in 64-bit mode type: 140 | ```shell 141 | $ make CROSS_COMPILE=aarch64-linux-gnu- check # ARMv8-A 142 | ``` 143 | 144 | For ARMv7-A type: 145 | ```shell 146 | $ make CROSS_COMPILE=arm-linux-gnueabihf- check # ARMv7-A 147 | ``` 148 | 149 | For ARMv8-A running in 32-bit mode (A32 instruction set) type: 150 | ```shell 151 | $ make \ 152 | CROSS_COMPILE=arm-linux-gnueabihf- \ 153 | ARCH_CFLAGS="-mcpu=cortex-a32 -mfpu=neon-fp-armv8" \ 154 | check 155 | ``` 156 | 157 | Check the details via [Test Suite for SSE2NEON](tests/README.md). 158 | 159 | ### Optimization 160 | 161 | The SSE2NEON project is designed with performance-sensitive scenarios in mind, and as such, optimization options (e.g. `O1`, `O2`) can lead to misbehavior under specific circumstances. For example, frequent changes to the rounding mode or repeated calls to `_MM_SET_DENORMALS_ZERO_MODE()` may introduce unintended behavior. 162 | 163 | Enforcing no optimizations for specific intrinsics could solve these boundary cases but may negatively impact general performance. Therefore, we have decided to prioritize performance and shift the responsibility for handling such edge cases to developers. 164 | 165 | It is important to be aware of these potential pitfalls when enabling optimizations and ensure that your code accounts for these scenarios if necessary. 166 | 167 | 168 | ## Adoptions 169 | Here is a partial list of open source projects that have adopted `sse2neon` for Arm/Aarch64 support. 
170 | * [Aaru Data Preservation Suite](https://www.aaru.app/) is a fully-featured software package to preserve all storage media from the very old to the cutting edge, as well as to give detailed information about any supported image file (whether from Aaru or not) and to extract the files from those images. 171 | * [aether-game-utils](https://github.com/johnhues/aether-game-utils) is a collection of cross platform utilities for quickly creating small game prototypes in C++. 172 | * [ALE](https://github.com/sc932/ALE), aka Assembly Likelihood Evaluation, is a tool for evaluating accuracy of assemblies without the need of a reference genome. 173 | * [AnchorWave](https://github.com/baoxingsong/AnchorWave), Anchored Wavefront Alignment, identifies collinear regions via conserved anchors (full-length CDS and full-length exon have been implemented currently) and breaks collinear regions into shorter fragments, i.e., anchor and inter-anchor intervals. 174 | * [ATAK-CIV](https://github.com/deptofdefense/AndroidTacticalAssaultKit-CIV), Android Tactical Assault Kit for Civilian Use, is the official geospatial-temporal and situational awareness tool used by the US Government. 175 | * [Apache Doris](https://doris.apache.org/) is a Massively Parallel Processing (MPP) based interactive SQL data warehousing for reporting and analysis. 176 | * [Apache Impala](https://impala.apache.org/) is a lightning-fast, distributed SQL queries for petabytes of data stored in Apache Hadoop clusters. 177 | * [Apache Kudu](https://kudu.apache.org/) completes Hadoop's storage layer to enable fast analytics on fast data. 178 | * [apollo](https://github.com/ApolloAuto/apollo) is a high performance, flexible architecture which accelerates the development of Autonomous Vehicles. 179 | * [ares](https://github.com/ares-emulator/ares) is a cross-platform, open source, multi-system emulator, focusing on accuracy and preservation. 
180 | * [ART](https://github.com/dinosaure/art) is an implementation in OCaml of [Adaptive Radix Tree](https://db.in.tum.de/~leis/papers/ART.pdf) (ART). 181 | * [Async](https://github.com/romange/async) is a set of C++ primitives that allows efficient and rapid development in C++17 on GNU/Linux systems. 182 | * [avec](https://github.com/unevens/avec) is a little library for using SIMD instructions on both x86 and Arm. 183 | * [BEAGLE](https://github.com/beagle-dev/beagle-lib) is a high-performance library that can perform the core calculations at the heart of most Bayesian and Maximum Likelihood phylogenetics packages. 184 | * [BitMagic](https://github.com/tlk00/BitMagic) implements compressed bit-vectors and containers (vectors) based on ideas of bit-slicing transform and Rank-Select compression, offering sets of methods to architect your applications to use HPC techniques to save memory (thus be able to fit more data in one compute unit) and improve storage and traffic patterns when storing data vectors and models in files or object stores. 185 | * [bipartite\_motif\_finder](https://github.com/soedinglab/bipartite_motif_finder), also known as BMF (Bipartite Motif Finder), is an open source tool for finding co-occurrences of sequence motifs in genomic sequences. 186 | * [Blender](https://www.blender.org/) is the free and open source 3D creation suite, supporting the entirety of the 3D pipeline. 187 | * [Boo](https://github.com/AxioDL/boo) is a cross-platform windowing and event manager similar to SDL or SFML, with additional 3D rendering functionality. 188 | * [Brickworks](https://github.com/sdangelo/brickworks) is a music DSP toolkit that supplies the fundamental building blocks for creating and enhancing audio engines on any platform. 189 | * [CARTA](https://github.com/CARTAvis/carta-backend) is a new visualization tool designed for viewing radio astronomy images in CASA, FITS, MIRIAD, and HDF5 formats (using the IDIA custom schema for HDF5). 
190 | * [Catcoon](https://github.com/i-evi/catcoon) is a [feedforward neural network](https://en.wikipedia.org/wiki/Feedforward_neural_network) implementation in C. 191 | * [compute-runtime](https://github.com/intel/compute-runtime), the Intel Graphics Compute Runtime for oneAPI Level Zero and OpenCL Driver, provides compute API support (Level Zero, OpenCL) for Intel graphics hardware architectures (HD Graphics, Xe). 192 | * [contour](https://github.com/contour-terminal/contour) is a modern and actually fast virtual terminal emulator. 193 | * [Cog](https://github.com/losnoco/Cog) is a free and open source audio player for macOS. 194 | * [dab-cmdline](https://github.com/JvanKatwijk/dab-cmdline) provides entries for the functionality to handle Digital audio broadcasting (DAB)/DAB+ through some simple calls. 195 | * [DISTRHO](https://distrho.sourceforge.io/) is an open-source project for Cross-Platform Audio Plugins. 196 | * [Dragonfly](https://github.com/dragonflydb/dragonfly) is a modern in-memory datastore, fully compatible with Redis and Memcached APIs. 197 | * [EDGE](https://github.com/3dfxdev/EDGE) is an advanced OpenGL source port spawned from the DOOM engine, with focus on easy development and expansion for modders and end-users. 198 | * [Embree](https://github.com/embree/embree) is a collection of high-performance ray tracing kernels. Its target users are graphics application engineers who want to improve the performance of their photo-realistic rendering application by leveraging Embree's performance-optimized ray tracing kernels. 199 | * [emp-tool](https://github.com/emp-toolkit/emp-tool) aims to provide a benchmark for secure computation and allowing other researchers to experiment and extend. 200 | * [Exudyn](https://github.com/jgerstmayr/EXUDYN) is a C++ based Python library for efficient simulation of flexible multibody dynamics systems. 
201 | * [FoundationDB](https://www.foundationdb.org) is a distributed database designed to handle large volumes of structured data across clusters of commodity servers. 202 | * [fsrc](https://github.com/elsamuko/fsrc) is capable of searching large codebases for text snippets. 203 | * [GDAL](https://gdal.org) is a translator library for raster and vector geospatial data formats that comes with a variety of useful command line utilities for data translation and processing. 204 | * [gmmlib](https://github.com/intel/gmmlib) is the Intel Graphics Memory Management Library that provides device specific and buffer management for the Intel Graphics Compute Runtime for OpenCL and the Intel Media Driver for VAAPI. 205 | * [HISE](https://github.com/christophhart/HISE) is a cross-platform open source audio application for building virtual instruments, emphasizing on sampling, but includes some basic synthesis features for making hybrid instruments as well as audio effects. 206 | * [iqtree2](https://github.com/iqtree/iqtree2) is an efficient and versatile stochastic implementation to infer phylogenetic trees by maximum likelihood. 207 | * [indelPost](https://github.com/stjude/indelPost) is a Python library for indel processing via realignment and read-based phasing to resolve alignment ambiguities. 208 | * [IResearch](https://github.com/iresearch-toolkit/iresearch) is a cross-platform, high-performance document oriented search engine library written entirely in C++ with the focus on a pluggability of different ranking/similarity models. 209 | * [Kraken](https://github.com/Wabi-Studios/Kraken) is a 3D animation platform redefining animation composition, collaborative workflows, simulation engines, skeletal rigging systems, and look development from storyboard to final render. 
210 | * [kram](https://github.com/alecazam/kram) is a wrapper to several popular encoders to and from PNG/[KTX](https://www.khronos.org/opengles/sdk/tools/KTX/file_format_spec/) files with [LDR/HDR and BC/ASTC/ETC2](https://developer.arm.com/solutions/graphics-and-gaming/developer-guides/learn-the-basics/adaptive-scalable-texture-compression/single-page). 211 | * [Krita](https://invent.kde.org/graphics/krita) is a cross-platform application that offers an end-to-end solution for creating digital art files from scratch built on the KDE and Qt frameworks. 212 | * [libCML](https://github.com/belosthomas/libCML) is a SLAM library and scientific tool, which include a novel fast thread-safe graph map implementation. 213 | * [libhdfs3](https://github.com/ClickHouse/libhdfs3) is implemented based on native Hadoop RPC protocol and Hadoop Distributed File System (HDFS), a highly fault-tolerant distributed fs, data transfer protocol. 214 | * [libpostal](https://github.com/openvenues/libpostal) is a C library for parsing/normalizing street addresses around the world using statistical NLP and open data. 215 | * [libscapi](https://github.com/cryptobiu/libscapi) stands for the "Secure Computation API", providing reliable, efficient, and highly flexible cryptographic infrastructure. 216 | * [libstreamvbyte](https://github.com/wst24365888/libstreamvbyte) is a C++ implementation of [StreamVByte](https://arxiv.org/abs/1709.08990). 217 | * [libmatoya](https://github.com/matoya/libmatoya) is a cross-platform application development library, providing various features such as common cryptography tasks. 218 | * [Loosejaw](https://github.com/TheHolyDiver/Loosejaw) provides deep hybrid CPU/GPU digital signal processing. 219 | * [Madronalib](https://github.com/madronalabs/madronalib) enables efficient audio DSP on SIMD processors with readable and brief C++ code. 
220 | * [minimap2](https://github.com/lh3/minimap2) is a versatile sequence alignment program that aligns DNA or mRNA sequences against a large reference database. 221 | * [mixed-fem](https://github.com/tytrusty/mixed-fem) is an open source reference implementation of Mixed Variational Finite Elements for Implicit Simulation of Deformables. 222 | * [MMseqs2](https://github.com/soedinglab/MMseqs2) (Many-against-Many sequence searching) is a software suite to search and cluster huge protein and nucleotide sequence sets. 223 | * [MRIcroGL](https://github.com/rordenlab/MRIcroGL) is a cross-platform tool for viewing NIfTI, DICOM, MGH, MHD, NRRD, AFNI format medical images. 224 | * [N2](https://github.com/oddconcepts/n2o) is an approximate nearest neighborhoods algorithm library written in C++, providing a much faster search speed than other implementations when modeling large dataset. 225 | * [nanors](https://github.com/sleepybishop/nanors) is a tiny, performant implementation of [Reed-Solomon codes](https://en.wikipedia.org/wiki/Reed%E2%80%93Solomon_error_correction), capable of reaching multi-gigabit speeds on a single core. 226 | * [niimath](https://github.com/rordenlab/niimath) is a general image calculator with superior performance. 227 | * [NVIDIA GameWorks](https://developer.nvidia.com/gameworks-source-github) has been already used in a lot of games. These repositories are public on GitHub. 228 | * [Nx Meta Platform Open Source Components](https://github.com/networkoptix/nx_open) are used to build all Powered-by-Nx products including Nx Witness Video Management System (VMS). 229 | * [ofxNDI](https://github.com/leadedge/ofxNDI) is an [openFrameworks](https://openframeworks.cc/) addon to allow sending and receiving images over a network using the [NewTek](https://en.wikipedia.org/wiki/NewTek) Network Device Protocol. 
230 | * [OGRE](https://github.com/OGRECave/ogre) is a scene-oriented, flexible 3D engine written in C++ designed to make it easier and more intuitive for developers to produce games and demos utilising 3D hardware. 231 | * [Olive](https://github.com/olive-editor/olive) is a free non-linear video editor for Windows, macOS, and Linux. 232 | * [OpenColorIO](https://github.com/AcademySoftwareFoundation/OpenColorIO) a complete color management solution geared towards motion picture production with an emphasis on visual effects and computer animation. 233 | * [OpenXRay](https://github.com/OpenXRay/xray-16) is an improved version of the X-Ray engine, used in world famous S.T.A.L.K.E.R. game series by GSC Game World. 234 | * [parallel-n64](https://github.com/libretro/parallel-n64) is an optimized/rewritten Nintendo 64 emulator made specifically for [Libretro](https://www.libretro.com/). 235 | * [Pathfinder C++](https://github.com/floppyhammer/pathfinder-cpp) is a fast, practical, GPU-based rasterizer for fonts and vector graphics using Vulkan and C++. 236 | * [PFFFT](https://github.com/marton78/pffft) does 1D Fast Fourier Transforms, of single precision real and complex vectors. 237 | * [pixaccess](https://github.com/oliverue/pixaccess) provides the abstractions for integer and float bitmaps, pixels, and aliased (nearest neighbor) and anti-aliased (bi-linearly interpolated) pixel access. 238 | * [PlutoSDR Firmware](https://github.com/seanstone/plutosdr-fw) is the customized firmware for the [PlutoSDR](https://wiki.analog.com/university/tools/pluto) that can be used to introduce fundamentals of Software Defined Radio (SDR) or Radio Frequency (RF) or Communications as advanced topics in electrical engineering in a self or instructor lead setting. 239 | * [PowerToys](https://github.com/microsoft/PowerToys) is a set of utilities for power users to tune and streamline their Windows experience for greater productivity. 
240 | * [Pygame](https://www.pygame.org) is cross-platform and designed to make it easy to write multimedia software, such as games, in Python. 241 | * [R:RandomFieldsUtils](https://cran.r-project.org/web/packages/RandomFieldsUtils) provides various utilities that might be used in spatial statistics and elsewhere. (CRAN) 242 | * [RAxML](https://github.com/stamatak/standard-RAxML) is a tool for Phylogenetic Analysis and Post-Analysis of Large Phylogenies. 243 | * [ReHLDS](https://github.com/gennadykataev/rehlds) is fully compatible with the latest Half-Life Dedicated Server (HLDS) with a lot of defects and (potential) bugs fixed. 244 | * [rkcommon](https://github.com/ospray/rkcommon) represents a common set of C++ infrastructure and CMake utilities used by various components of [Intel oneAPI Rendering Toolkit](https://www.intel.com/content/www/us/en/developer/tools/oneapi/rendering-toolkit.html). 245 | * [RPCS3](https://github.com/RPCS3/rpcs3) is the world's first free and open-source PlayStation 3 emulator/debugger, written in C++. 246 | * [simd\_utils](https://github.com/JishinMaster/simd_utils) is a header-only library implementing common mathematical functions using SIMD intrinsics. 247 | * [Sire](https://github.com/OpenBioSim/sire) is a molecular modelling framework that provides extensive functionality to manipulate representations of biomolecular systems. 248 | * [SMhasher](https://github.com/rurban/smhasher) provides comprehensive Hash function quality and speed tests. 249 | * [SNN++](https://github.com/ianmkim/snnpp) implements a single-layer non-linear Spiking Neural Network for image classification and generation. 250 | * [Spack](https://github.com/spack/spack) is a multi-platform package manager that builds and installs multiple versions and configurations of software. 251 | * [SRA](https://github.com/ncbi/sra-tools) is a collection of tools and libraries for using data in the [INSDC Sequence Read Archives](https://www.ncbi.nlm.nih.gov/sra/docs/). 
252 | * [srsLTE](https://github.com/srsLTE/srsLTE) is an open source SDR LTE software suite. 253 | * [SSW](https://github.com/mengyao/Complete-Striped-Smith-Waterman-Library) is a fast implementation of the [Smith-Waterman algorithm](https://en.wikipedia.org/wiki/Smith%E2%80%93Waterman_algorithm), which uses the SIMD instructions to parallelize the algorithm at the instruction level. 254 | * [Surge](https://github.com/surge-synthesizer/surge) is an open source digital synthesizer. 255 | * [The Forge](https://github.com/ConfettiFX/The-Forge) is a cross-platform rendering framework, providing building blocks to write your own game engine. 256 | * [Typesense](https://github.com/typesense/typesense) is a fast, typo-tolerant search engine for building delightful search experiences. 257 | * [Vcpkg](https://github.com/microsoft/vcpkg) is a C++ Library Manager for Windows, Linux, and macOS. 258 | * [VelocyPack](https://github.com/arangodb/velocypack) is a fast and compact format for serialization and storage. 259 | * [VOLK](https://github.com/gnuradio/volk), Vector-Optimized Library of Kernel, is a sub-project of [GNU Radio](https://www.gnuradio.org/). 260 | * [Vowpal Wabbit](https://github.com/VowpalWabbit/vowpal_wabbit) is a machine learning system which pushes the frontier of machine learning with techniques such as online, hashing, allreduce, reductions, learning2search, active, and interactive learning. 261 | * [Winter](https://github.com/rosenthj/Winter) is the top rated chess engine from Switzerland and has competed at top invite only computer chess events. 262 | * [XEVE](https://github.com/mpeg5/xeve) (eXtra-fast Essential Video Encoder) is an open sourced and fast MPEG-5 EVC encoder. 263 | * [XMRig](https://github.com/xmrig/xmrig) is an open source CPU miner for [Monero](https://web.getmonero.org/) cryptocurrency. 
264 | * [xsimd](https://github.com/xtensor-stack/xsimd) provides a unified means for using SIMD intrinsics and parallelized, optimized mathematical functions. 265 | * [YACL](https://github.com/secretflow/yasl) is a C++ library that contains modules and utilities which [SecretFlow](https://github.com/secretflow) code depends on. 266 | 267 | ## Related Projects 268 | * [SIMDe](https://github.com/simd-everywhere/simde): fast and portable implementations of SIMD 269 | intrinsics on hardware which doesn't natively support them, such as calling SSE functions on ARM. 270 | * [CatBoost's sse2neon](https://github.com/catboost/catboost/blob/master/library/cpp/sse/sse2neon.h) 271 | * [ARM\_NEON\_2\_x86\_SSE](https://github.com/intel/ARM_NEON_2_x86_SSE) 272 | * [AvxToNeon](https://github.com/kunpengcompute/AvxToNeon) 273 | * [sse2rvv](https://github.com/FeddrickAquino/sse2rvv): C header file that converts Intel SSE intrinsics to RISC-V Vector intrinsics. 274 | * [sse2msa](https://github.com/i-evi/sse2msa): A C/C++ header file that converts Intel SSE intrinsics to MIPS/MIPS64 MSA intrinsics. 275 | * [sse2zig](https://github.com/aqrit/sse2zig): Intel SSE intrinsics mapped to [Zig](https://ziglang.org/) vector extensions. 276 | * [POWER/PowerPC support for GCC](https://github.com/gcc-mirror/gcc/blob/master/gcc/config/rs6000) contains a series of headers simplifying porting x86\_64 code that makes explicit use of Intel intrinsics to powerpc64le (pure little-endian mode that has been introduced with the [POWER8](https://en.wikipedia.org/wiki/POWER8)). 
277 | - implementation: [xmmintrin.h](https://github.com/gcc-mirror/gcc/blob/master/gcc/config/rs6000/xmmintrin.h), [emmintrin.h](https://github.com/gcc-mirror/gcc/blob/master/gcc/config/rs6000/emmintrin.h), [pmmintrin.h](https://github.com/gcc-mirror/gcc/blob/master/gcc/config/rs6000/pmmintrin.h), [tmmintrin.h](https://github.com/gcc-mirror/gcc/blob/master/gcc/config/rs6000/tmmintrin.h), [smmintrin.h](https://github.com/gcc-mirror/gcc/blob/master/gcc/config/rs6000/smmintrin.h) 278 | 279 | ## Reference 280 | * [Intel Intrinsics Guide](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html) 281 | * [Microsoft: x86 intrinsics list](https://learn.microsoft.com/en-us/cpp/intrinsics/x86-intrinsics-list) 282 | * [Arm Neon Intrinsics Reference](https://developer.arm.com/architectures/instruction-sets/simd-isas/neon/intrinsics) 283 | * [Neon Programmer's Guide for Armv8-A](https://developer.arm.com/architectures/instruction-sets/simd-isas/neon/neon-programmers-guide-for-armv8-a) 284 | * [NEON Programmer's Guide](https://static.docs.arm.com/den0018/a/DEN0018A_neon_programmers_guide_en.pdf) 285 | * [qemu/target/i386/ops\_sse.h](https://github.com/qemu/qemu/blob/master/target/i386/ops_sse.h): Comprehensive SSE instruction emulation in C. Ideal for semantic checks. 
286 | * [Porting Takua Renderer to 64-bit ARM- Part 1](https://blog.yiningkarlli.com/2021/05/porting-takua-to-arm-pt1.html) 287 | * [Porting Takua Renderer to 64-bit ARM- Part 2](https://blog.yiningkarlli.com/2021/07/porting-takua-to-arm-pt2.html) 288 | * [Comparing SIMD on x86-64 and arm64](https://blog.yiningkarlli.com/2021/09/neon-vs-sse.html) 289 | * [Port with SSE2Neon and SIMDe](https://developer.arm.com/documentation/102581/0200/Port-with-SSE2Neon-and-SIMDe) 290 | * [Genomics: Optimizing the BWA aligner for Arm Servers](https://community.arm.com/arm-community-blogs/b/high-performance-computing-blog/posts/optimizing-genomics-and-the-bwa-aligner-for-arm-servers) 291 | * [Bit twiddling with Arm Neon: beating SSE movemasks, counting bits and more](https://community.arm.com/arm-community-blogs/b/infrastructure-solutions-blog/posts/porting-x86-vector-bitmask-optimizations-to-arm-neon) 292 | * [C/C++ on Graviton](https://github.com/aws/aws-graviton-getting-started/blob/main/c-c%2B%2B.md) 293 | * [C/C++ on NVIDIA Grace](https://nvidia.github.io/grace-cpu-benchmarking-guide/developer/languages/c-c++.html) 294 | * [Tune graphics-intensive games for Apple silicon](https://developer.apple.com/games/pathway/) 295 | * [Benchmarking and Testing of Qualcomm Snapdragon System-on-Chip for JPL Space Applications and Missions](https://ieeexplore.ieee.org/abstract/document/9843518) 296 | * [Spotlight: Petrobras Speeds Up Linear Solvers for Reservoir Simulation Using NVIDIA Grace CPU](https://developer.nvidia.com/blog/spotlight-petrobras-accelerates-linear-solvers-for-reservoir-simulation-using-nvidia-grace-cpu/) 297 | 298 | ## Licensing 299 | 300 | `sse2neon` is freely redistributable under the MIT License. 
301 | -------------------------------------------------------------------------------- /sse2neon.sln: -------------------------------------------------------------------------------- 1 | 2 | Microsoft Visual Studio Solution File, Format Version 12.00 3 | # Visual Studio Version 17 4 | VisualStudioVersion = 17.3.32901.215 5 | MinimumVisualStudioVersion = 10.0.40219.1 6 | Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "sse2neon", "sse2neon.vcxproj", "{341BF194-865B-4DA6-8120-93173498E774}" 7 | EndProject 8 | Global 9 | GlobalSection(SolutionConfigurationPlatforms) = preSolution 10 | Debug|ARM = Debug|ARM 11 | Debug|ARM64 = Debug|ARM64 12 | Debug|ARM64EC = Debug|ARM64EC 13 | Release|ARM = Release|ARM 14 | Release|ARM64 = Release|ARM64 15 | Release|ARM64EC = Release|ARM64EC 16 | EndGlobalSection 17 | GlobalSection(ProjectConfigurationPlatforms) = postSolution 18 | {341BF194-865B-4DA6-8120-93173498E774}.Debug|ARM.ActiveCfg = Debug|ARM 19 | {341BF194-865B-4DA6-8120-93173498E774}.Debug|ARM.Build.0 = Debug|ARM 20 | {341BF194-865B-4DA6-8120-93173498E774}.Debug|ARM64.ActiveCfg = Debug|ARM64 21 | {341BF194-865B-4DA6-8120-93173498E774}.Debug|ARM64.Build.0 = Debug|ARM64 22 | {341BF194-865B-4DA6-8120-93173498E774}.Debug|ARM64EC.ActiveCfg = Debug|ARM64EC 23 | {341BF194-865B-4DA6-8120-93173498E774}.Debug|ARM64EC.Build.0 = Debug|ARM64EC 24 | {341BF194-865B-4DA6-8120-93173498E774}.Release|ARM.ActiveCfg = Release|ARM 25 | {341BF194-865B-4DA6-8120-93173498E774}.Release|ARM.Build.0 = Release|ARM 26 | {341BF194-865B-4DA6-8120-93173498E774}.Release|ARM64.ActiveCfg = Release|ARM64 27 | {341BF194-865B-4DA6-8120-93173498E774}.Release|ARM64.Build.0 = Release|ARM64 28 | {341BF194-865B-4DA6-8120-93173498E774}.Release|ARM64EC.ActiveCfg = Release|ARM64EC 29 | {341BF194-865B-4DA6-8120-93173498E774}.Release|ARM64EC.Build.0 = Release|ARM64EC 30 | EndGlobalSection 31 | GlobalSection(SolutionProperties) = preSolution 32 | HideSolutionNode = FALSE 33 | EndGlobalSection 34 | 
GlobalSection(ExtensibilityGlobals) = postSolution 35 | SolutionGuid = {D503B299-AA05-4E05-A8D9-37C8D229ACB1} 36 | EndGlobalSection 37 | EndGlobal 38 | -------------------------------------------------------------------------------- /sse2neon.vcxproj: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Debug 6 | ARM 7 | 8 | 9 | Debug 10 | ARM64 11 | 12 | 13 | Debug 14 | ARM64EC 15 | 16 | 17 | Release 18 | ARM 19 | 20 | 21 | Release 22 | ARM64 23 | 24 | 25 | Release 26 | ARM64EC 27 | 28 | 29 | 30 | 16.0 31 | Win32Proj 32 | {341bf194-865b-4da6-8120-93173498e774} 33 | sse2neon 34 | 10.0 35 | 36 | 37 | 38 | Application 39 | true 40 | v143 41 | Unicode 42 | 43 | 44 | Application 45 | false 46 | v143 47 | true 48 | Unicode 49 | 50 | 51 | Application 52 | true 53 | v143 54 | Unicode 55 | 56 | 57 | Application 58 | true 59 | v143 60 | Unicode 61 | 62 | 63 | Application 64 | false 65 | v143 66 | true 67 | Unicode 68 | 69 | 70 | Application 71 | false 72 | v143 73 | true 74 | Unicode 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | Level3 103 | true 104 | __i386__;WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) 105 | true 106 | .;%(AdditionalIncludeDirectories) 107 | 108 | 109 | Console 110 | true 111 | 112 | 113 | 114 | 115 | Level3 116 | true 117 | true 118 | true 119 | __i386__;WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) 120 | true 121 | .;%(AdditionalIncludeDirectories) 122 | 123 | 124 | Console 125 | true 126 | true 127 | true 128 | 129 | 130 | 131 | 132 | Level3 133 | true 134 | _DEBUG;_CONSOLE;%(PreprocessorDefinitions) 135 | true 136 | .;%(AdditionalIncludeDirectories) 137 | /Zc:preprocessor 138 | true 139 | 140 | 141 | Console 142 | true 143 | 144 | 145 | 146 | 147 | Level3 148 | true 149 | _DEBUG;_CONSOLE;%(PreprocessorDefinitions) 150 | true 151 | .;%(AdditionalIncludeDirectories) 152 | /Zc:preprocessor 153 | true 154 
| stdcpp20 155 | false 156 | 157 | 158 | Console 159 | true 160 | 161 | 162 | 163 | 164 | Level3 165 | true 166 | true 167 | true 168 | NDEBUG;_CONSOLE;%(PreprocessorDefinitions) 169 | true 170 | .;%(AdditionalIncludeDirectories) 171 | /Zc:preprocessor %(AdditionalOptions) 172 | 173 | 174 | Console 175 | true 176 | true 177 | true 178 | 179 | 180 | 181 | 182 | Level3 183 | true 184 | true 185 | true 186 | NDEBUG;_CONSOLE;%(PreprocessorDefinitions) 187 | true 188 | .;%(AdditionalIncludeDirectories) 189 | /Zc:preprocessor %(AdditionalOptions) 190 | false 191 | 192 | 193 | Console 194 | true 195 | true 196 | true 197 | 198 | 199 | 200 | 201 | 202 | 203 | 204 | 205 | 206 | 207 | 208 | 209 | 210 | 211 | 212 | 213 | 214 | 215 | 216 | 217 | -------------------------------------------------------------------------------- /sse2neon.vcxproj.filters: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | {4FC737F1-C7A5-4376-A066-2A32D752A2FF} 6 | cpp;c;cc;cxx;c++;cppm;ixx;def;odl;idl;hpj;bat;asm;asmx 7 | 8 | 9 | {93995380-89BD-4b04-88EB-625FBE52EBFB} 10 | h;hh;hpp;hxx;h++;hm;inl;inc;ipp;xsd 11 | 12 | 13 | {67DA6AB6-F800-4c08-8B7A-83BB121AAD01} 14 | rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav;mfcribbon-ms 15 | 16 | 17 | 18 | 19 | Source Files 20 | 21 | 22 | Source Files 23 | 24 | 25 | Source Files 26 | 27 | 28 | Source Files 29 | 30 | 31 | 32 | 33 | Header Files 34 | 35 | 36 | Header Files 37 | 38 | 39 | Header Files 40 | 41 | 42 | Header Files 43 | 44 | 45 | -------------------------------------------------------------------------------- /tests/README.md: -------------------------------------------------------------------------------- 1 | # Test Suite for SSE2NEON 2 | 3 | :warning: **Warning: The test suite is based on the little-endian architecture.** 4 | 5 | ## Add More Test Items 6 | Once the conversion is implemented, the test can be added with the following steps: 7 | 8 | * File `tests/impl.h` 9 
| 10 | Add the intrinsic under `INTRIN_LIST` macro. The naming convention 11 | should be `mm_xxx`. 12 | Place it in the correct classification with the alphabetical order. 13 | The classification can be referenced from [Intel Intrinsics Guide](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html). 14 | 15 | * File `tests/impl.cpp` 16 | ```c 17 | result_t test_mm_xxx() 18 | { 19 | // The C implementation 20 | ... 21 | 22 | // The Neon implementation 23 | ret = _mm_xxx(); 24 | 25 | // Compare the result of two implementations and return either 26 | // TEST_SUCCESS, TEST_FAIL, or TEST_UNIMPL 27 | ... 28 | } 29 | ``` 30 | -------------------------------------------------------------------------------- /tests/binding.cpp: -------------------------------------------------------------------------------- 1 | #include "binding.h" 2 | 3 | #include 4 | #include 5 | 6 | namespace SSE2NEON 7 | { 8 | /* Allocates `size` bytes with 16-byte alignment: _aligned_malloc when _WIN32 is defined, posix_memalign otherwise. If the allocation fails, prints the source file and line to stderr and terminates the process with EXIT_FAILURE instead of returning. Release the result with platformAlignedFree. */ void *platformAlignedAlloc(size_t size) 9 | { 10 | void *address; 11 | #if defined(_WIN32) 12 | address = _aligned_malloc(size, 16); 13 | if (!address) { 14 | #else 15 | int ret = posix_memalign(&address, 16, size); 16 | if (ret != 0) { 17 | #endif 18 | /* Both preprocessor branches open the same failure block, which is closed once below. */ fprintf(stderr, "Error at File %s line number %d\n", __FILE__, 19 | __LINE__); 20 | exit(EXIT_FAILURE); 21 | } 22 | return address; 23 | } 24 | 25 | /* Frees memory with the deallocator that matches platformAlignedAlloc: _aligned_free when _WIN32 is defined, free otherwise. */ void platformAlignedFree(void *ptr) 26 | { 27 | #if defined(_WIN32) 28 | _aligned_free(ptr); 29 | #else 30 | free(ptr); 31 | #endif 32 | } 33 | 34 | 35 | } // namespace SSE2NEON 36 | -------------------------------------------------------------------------------- /tests/binding.h: -------------------------------------------------------------------------------- 1 | #ifndef SSE2NEONBINDING_H 2 | #define SSE2NEONBINDING_H 3 | 4 | #include 5 | 6 | // The SSE2NEON unit tests run both within our own internal project 7 | // as well as within the open source framework. 8 | // This header file is used to abstract any distinctions between 9 | // those two build environments. 
10 | // 11 | // Initially, this is for how 16 byte aligned memory is allocated 12 | namespace SSE2NEON 13 | { 14 | void *platformAlignedAlloc(size_t size); 15 | void platformAlignedFree(void *ptr); 16 | 17 | } // namespace SSE2NEON 18 | 19 | #endif 20 | -------------------------------------------------------------------------------- /tests/common.cpp: -------------------------------------------------------------------------------- 1 | #include "common.h" 2 | #include 3 | #include 4 | 5 | namespace SSE2NEON 6 | { 7 | int32_t NaN = ~0; 8 | int64_t NaN64 = ~0; 9 | 10 | result_t validateInt64(__m128i a, int64_t i0, int64_t i1) 11 | { 12 | const int64_t *t = (const int64_t *) &a; 13 | ASSERT_RETURN(t[0] == i0); 14 | ASSERT_RETURN(t[1] == i1); 15 | return TEST_SUCCESS; 16 | } 17 | 18 | result_t validateInt64(__m64 a, int64_t i0) 19 | { 20 | const int64_t *t = (const int64_t *) &a; 21 | ASSERT_RETURN(t[0] == i0); 22 | return TEST_SUCCESS; 23 | } 24 | 25 | result_t validateUInt64(__m128i a, uint64_t u0, uint64_t u1) 26 | { 27 | const uint64_t *t = (const uint64_t *) &a; 28 | ASSERT_RETURN(t[0] == u0); 29 | ASSERT_RETURN(t[1] == u1); 30 | return TEST_SUCCESS; 31 | } 32 | 33 | result_t validateUInt64(__m64 a, uint64_t u0) 34 | { 35 | const uint64_t *t = (const uint64_t *) &a; 36 | ASSERT_RETURN(t[0] == u0); 37 | return TEST_SUCCESS; 38 | } 39 | 40 | result_t validateInt32(__m128i a, 41 | int32_t i0, 42 | int32_t i1, 43 | int32_t i2, 44 | int32_t i3) 45 | { 46 | const int32_t *t = (const int32_t *) &a; 47 | ASSERT_RETURN(t[0] == i0); 48 | ASSERT_RETURN(t[1] == i1); 49 | ASSERT_RETURN(t[2] == i2); 50 | ASSERT_RETURN(t[3] == i3); 51 | return TEST_SUCCESS; 52 | } 53 | 54 | result_t validateUInt32(__m128i a, 55 | uint32_t u0, 56 | uint32_t u1, 57 | uint32_t u2, 58 | uint32_t u3) 59 | { 60 | const uint32_t *t = (const uint32_t *) &a; 61 | ASSERT_RETURN(t[0] == u0); 62 | ASSERT_RETURN(t[1] == u1); 63 | ASSERT_RETURN(t[2] == u2); 64 | ASSERT_RETURN(t[3] == u3); 65 | return 
TEST_SUCCESS; 66 | } 67 | 68 | result_t validateUInt32(__m64 a, uint32_t u0, uint32_t u1) 69 | { 70 | const uint32_t *t = (const uint32_t *) &a; 71 | ASSERT_RETURN(t[0] == u0); 72 | ASSERT_RETURN(t[1] == u1); 73 | return TEST_SUCCESS; 74 | } 75 | 76 | result_t validateInt16(__m128i a, 77 | int16_t i0, 78 | int16_t i1, 79 | int16_t i2, 80 | int16_t i3, 81 | int16_t i4, 82 | int16_t i5, 83 | int16_t i6, 84 | int16_t i7) 85 | { 86 | const int16_t *t = (const int16_t *) &a; 87 | ASSERT_RETURN(t[0] == i0); 88 | ASSERT_RETURN(t[1] == i1); 89 | ASSERT_RETURN(t[2] == i2); 90 | ASSERT_RETURN(t[3] == i3); 91 | ASSERT_RETURN(t[4] == i4); 92 | ASSERT_RETURN(t[5] == i5); 93 | ASSERT_RETURN(t[6] == i6); 94 | ASSERT_RETURN(t[7] == i7); 95 | return TEST_SUCCESS; 96 | } 97 | 98 | result_t validateInt16(__m64 a, int16_t i0, int16_t i1, int16_t i2, int16_t i3) 99 | { 100 | const int16_t *t = (const int16_t *) &a; 101 | ASSERT_RETURN(t[0] == i0); 102 | ASSERT_RETURN(t[1] == i1); 103 | ASSERT_RETURN(t[2] == i2); 104 | ASSERT_RETURN(t[3] == i3); 105 | return TEST_SUCCESS; 106 | } 107 | 108 | result_t validateUInt16(__m128i a, 109 | uint16_t u0, 110 | uint16_t u1, 111 | uint16_t u2, 112 | uint16_t u3, 113 | uint16_t u4, 114 | uint16_t u5, 115 | uint16_t u6, 116 | uint16_t u7) 117 | { 118 | const uint16_t *t = (const uint16_t *) &a; 119 | ASSERT_RETURN(t[0] == u0); 120 | ASSERT_RETURN(t[1] == u1); 121 | ASSERT_RETURN(t[2] == u2); 122 | ASSERT_RETURN(t[3] == u3); 123 | ASSERT_RETURN(t[4] == u4); 124 | ASSERT_RETURN(t[5] == u5); 125 | ASSERT_RETURN(t[6] == u6); 126 | ASSERT_RETURN(t[7] == u7); 127 | return TEST_SUCCESS; 128 | } 129 | 130 | result_t validateInt32(__m64 a, int32_t u0, int32_t u1) 131 | { 132 | const int32_t *t = (const int32_t *) &a; 133 | ASSERT_RETURN(t[0] == u0); 134 | ASSERT_RETURN(t[1] == u1); 135 | return TEST_SUCCESS; 136 | } 137 | 138 | result_t validateUInt16(__m64 a, 139 | uint16_t u0, 140 | uint16_t u1, 141 | uint16_t u2, 142 | uint16_t u3) 143 | { 144 | const 
uint16_t *t = (const uint16_t *) &a; 145 | ASSERT_RETURN(t[0] == u0); 146 | ASSERT_RETURN(t[1] == u1); 147 | ASSERT_RETURN(t[2] == u2); 148 | ASSERT_RETURN(t[3] == u3); 149 | return TEST_SUCCESS; 150 | } 151 | 152 | result_t validateInt8(__m128i a, 153 | int8_t i0, 154 | int8_t i1, 155 | int8_t i2, 156 | int8_t i3, 157 | int8_t i4, 158 | int8_t i5, 159 | int8_t i6, 160 | int8_t i7, 161 | int8_t i8, 162 | int8_t i9, 163 | int8_t i10, 164 | int8_t i11, 165 | int8_t i12, 166 | int8_t i13, 167 | int8_t i14, 168 | int8_t i15) 169 | { 170 | const int8_t *t = (const int8_t *) &a; 171 | ASSERT_RETURN(t[0] == i0); 172 | ASSERT_RETURN(t[1] == i1); 173 | ASSERT_RETURN(t[2] == i2); 174 | ASSERT_RETURN(t[3] == i3); 175 | ASSERT_RETURN(t[4] == i4); 176 | ASSERT_RETURN(t[5] == i5); 177 | ASSERT_RETURN(t[6] == i6); 178 | ASSERT_RETURN(t[7] == i7); 179 | ASSERT_RETURN(t[8] == i8); 180 | ASSERT_RETURN(t[9] == i9); 181 | ASSERT_RETURN(t[10] == i10); 182 | ASSERT_RETURN(t[11] == i11); 183 | ASSERT_RETURN(t[12] == i12); 184 | ASSERT_RETURN(t[13] == i13); 185 | ASSERT_RETURN(t[14] == i14); 186 | ASSERT_RETURN(t[15] == i15); 187 | return TEST_SUCCESS; 188 | } 189 | 190 | result_t validateInt8(__m64 a, 191 | int8_t i0, 192 | int8_t i1, 193 | int8_t i2, 194 | int8_t i3, 195 | int8_t i4, 196 | int8_t i5, 197 | int8_t i6, 198 | int8_t i7) 199 | { 200 | const int8_t *t = (const int8_t *) &a; 201 | ASSERT_RETURN(t[0] == i0); 202 | ASSERT_RETURN(t[1] == i1); 203 | ASSERT_RETURN(t[2] == i2); 204 | ASSERT_RETURN(t[3] == i3); 205 | ASSERT_RETURN(t[4] == i4); 206 | ASSERT_RETURN(t[5] == i5); 207 | ASSERT_RETURN(t[6] == i6); 208 | ASSERT_RETURN(t[7] == i7); 209 | return TEST_SUCCESS; 210 | } 211 | 212 | result_t validateUInt8(__m128i a, 213 | uint8_t u0, 214 | uint8_t u1, 215 | uint8_t u2, 216 | uint8_t u3, 217 | uint8_t u4, 218 | uint8_t u5, 219 | uint8_t u6, 220 | uint8_t u7, 221 | uint8_t u8, 222 | uint8_t u9, 223 | uint8_t u10, 224 | uint8_t u11, 225 | uint8_t u12, 226 | uint8_t u13, 227 | 
uint8_t u14, 228 | uint8_t u15) 229 | { 230 | const uint8_t *t = (const uint8_t *) &a; 231 | ASSERT_RETURN(t[0] == u0); 232 | ASSERT_RETURN(t[1] == u1); 233 | ASSERT_RETURN(t[2] == u2); 234 | ASSERT_RETURN(t[3] == u3); 235 | ASSERT_RETURN(t[4] == u4); 236 | ASSERT_RETURN(t[5] == u5); 237 | ASSERT_RETURN(t[6] == u6); 238 | ASSERT_RETURN(t[7] == u7); 239 | ASSERT_RETURN(t[8] == u8); 240 | ASSERT_RETURN(t[9] == u9); 241 | ASSERT_RETURN(t[10] == u10); 242 | ASSERT_RETURN(t[11] == u11); 243 | ASSERT_RETURN(t[12] == u12); 244 | ASSERT_RETURN(t[13] == u13); 245 | ASSERT_RETURN(t[14] == u14); 246 | ASSERT_RETURN(t[15] == u15); 247 | return TEST_SUCCESS; 248 | } 249 | 250 | result_t validateUInt8(__m64 a, 251 | uint8_t u0, 252 | uint8_t u1, 253 | uint8_t u2, 254 | uint8_t u3, 255 | uint8_t u4, 256 | uint8_t u5, 257 | uint8_t u6, 258 | uint8_t u7) 259 | { 260 | const uint8_t *t = (const uint8_t *) &a; 261 | ASSERT_RETURN(t[0] == u0); 262 | ASSERT_RETURN(t[1] == u1); 263 | ASSERT_RETURN(t[2] == u2); 264 | ASSERT_RETURN(t[3] == u3); 265 | ASSERT_RETURN(t[4] == u4); 266 | ASSERT_RETURN(t[5] == u5); 267 | ASSERT_RETURN(t[6] == u6); 268 | ASSERT_RETURN(t[7] == u7); 269 | return TEST_SUCCESS; 270 | } 271 | 272 | result_t validateSingleFloatPair(float a, float b) 273 | { 274 | const uint32_t *ua = (const uint32_t *) &a; 275 | const uint32_t *ub = (const uint32_t *) &b; 276 | // We do an integer (binary) compare rather than a 277 | // floating point compare to take NaNs and infinities 278 | // into account as well. 279 | return (*ua) == (*ub) ? TEST_SUCCESS : TEST_FAIL; 280 | } 281 | 282 | result_t validateSingleDoublePair(double a, double b) 283 | { 284 | const uint64_t *ua = (const uint64_t *) &a; 285 | const uint64_t *ub = (const uint64_t *) &b; 286 | // We do an integer (binary) compare rather than a 287 | // floating point compare to take NaNs and infinities 288 | // into account as well. 
289 | 290 | if (std::isnan(a) && std::isnan(b)) { 291 | return TEST_SUCCESS; 292 | } 293 | 294 | return (*ua) == (*ub) ? TEST_SUCCESS : TEST_FAIL; 295 | } 296 | 297 | result_t validateFloat(__m128 a, float f0, float f1, float f2, float f3) 298 | { 299 | const float *t = (const float *) &a; 300 | ASSERT_RETURN(validateSingleFloatPair(t[0], f0)); 301 | ASSERT_RETURN(validateSingleFloatPair(t[1], f1)); 302 | ASSERT_RETURN(validateSingleFloatPair(t[2], f2)); 303 | ASSERT_RETURN(validateSingleFloatPair(t[3], f3)); 304 | return TEST_SUCCESS; 305 | } 306 | 307 | result_t validateFloatEpsilon(__m128 a, 308 | float f0, 309 | float f1, 310 | float f2, 311 | float f3, 312 | float epsilon) 313 | { 314 | const float *t = (const float *) &a; 315 | float df0 = fabsf(t[0] - f0); 316 | float df1 = fabsf(t[1] - f1); 317 | float df2 = fabsf(t[2] - f2); 318 | float df3 = fabsf(t[3] - f3); 319 | 320 | // Due to floating-point error, subtracting floating-point number with NaN 321 | // and zero value usually produces erroneous result. Therefore, we directly 322 | // define the difference of two floating-point numbers to zero if both 323 | // numbers are NaN or zero. 
324 | if ((std::isnan(t[0]) && std::isnan(f0)) || (t[0] == 0 && f0 == 0)) { 325 | df0 = 0; 326 | } 327 | 328 | if ((std::isnan(t[1]) && std::isnan(f1)) || (t[1] == 0 && f1 == 0)) { 329 | df1 = 0; 330 | } 331 | 332 | if ((std::isnan(t[2]) && std::isnan(f2)) || (t[2] == 0 && f2 == 0)) { 333 | df2 = 0; 334 | } 335 | 336 | if ((std::isnan(t[3]) && std::isnan(f3)) || (t[3] == 0 && f3 == 0)) { 337 | df3 = 0; 338 | } 339 | 340 | ASSERT_RETURN(df0 < epsilon); 341 | ASSERT_RETURN(df1 < epsilon); 342 | ASSERT_RETURN(df2 < epsilon); 343 | ASSERT_RETURN(df3 < epsilon); 344 | return TEST_SUCCESS; 345 | } 346 | 347 | result_t validateFloatError(__m128 a, 348 | float f0, 349 | float f1, 350 | float f2, 351 | float f3, 352 | float err) 353 | { 354 | const float *t = (const float *) &a; 355 | float df0 = fabsf((t[0] - f0) / f0); 356 | float df1 = fabsf((t[1] - f1) / f1); 357 | float df2 = fabsf((t[2] - f2) / f2); 358 | float df3 = fabsf((t[3] - f3) / f3); 359 | 360 | if ((std::isnan(t[0]) && std::isnan(f0)) || (t[0] == 0 && f0 == 0) || 361 | (std::isinf(t[0]) && std::isinf(f0))) { 362 | df0 = 0; 363 | } 364 | 365 | if ((std::isnan(t[1]) && std::isnan(f1)) || (t[1] == 0 && f1 == 0) || 366 | (std::isinf(t[1]) && std::isinf(f1))) { 367 | df1 = 0; 368 | } 369 | 370 | if ((std::isnan(t[2]) && std::isnan(f2)) || (t[2] == 0 && f2 == 0) || 371 | (std::isinf(t[2]) && std::isinf(f2))) { 372 | df2 = 0; 373 | } 374 | 375 | if ((std::isnan(t[3]) && std::isnan(f3)) || (t[3] == 0 && f3 == 0) || 376 | (std::isinf(t[3]) && std::isinf(f3))) { 377 | df3 = 0; 378 | } 379 | 380 | ASSERT_RETURN(df0 < err); 381 | ASSERT_RETURN(df1 < err); 382 | ASSERT_RETURN(df2 < err); 383 | ASSERT_RETURN(df3 < err); 384 | return TEST_SUCCESS; 385 | } 386 | 387 | result_t validateDouble(__m128d a, double d0, double d1) 388 | { 389 | const double *t = (const double *) &a; 390 | ASSERT_RETURN(validateSingleDoublePair(t[0], d0)); 391 | ASSERT_RETURN(validateSingleDoublePair(t[1], d1)); 392 | return TEST_SUCCESS; 393 | } 
394 | 395 | result_t validateFloatError(__m128d a, double d0, double d1, double err) 396 | { 397 | const double *t = (const double *) &a; 398 | double td0 = fabs((t[0] - d0) / d0); 399 | double td1 = fabs((t[1] - d1) / d1); 400 | 401 | if (std::isnan(t[0]) && std::isnan(d0)) { 402 | td0 = 0; 403 | } 404 | 405 | if (std::isnan(t[1]) && std::isnan(d1)) { 406 | td1 = 0; 407 | } 408 | 409 | ASSERT_RETURN(td0 < err); 410 | ASSERT_RETURN(td1 < err); 411 | return TEST_SUCCESS; 412 | } 413 | 414 | } // namespace SSE2NEON 415 | -------------------------------------------------------------------------------- /tests/common.h: -------------------------------------------------------------------------------- 1 | #ifndef SSE2NEONCOMMON_H 2 | #define SSE2NEONCOMMON_H 3 | #include 4 | #if (defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)) || \ 5 | defined(__arm__) 6 | #include "sse2neon.h" 7 | #elif defined(__x86_64__) || defined(__i386__) 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | 15 | // __int64 is defined in the Intrinsics Guide which maps to different datatype 16 | // in different data model 17 | #if !(defined(_WIN32) || defined(_WIN64) || defined(__int64)) 18 | #if (defined(__x86_64__) || defined(__i386__)) 19 | #define __int64 long long 20 | #else 21 | #define __int64 int64_t 22 | #endif 23 | #endif 24 | 25 | #if defined(__GNUC__) || defined(__clang__) 26 | #pragma push_macro("ALIGN_STRUCT") 27 | #define ALIGN_STRUCT(x) __attribute__((aligned(x))) 28 | #else 29 | #define ALIGN_STRUCT(x) __declspec(align(x)) 30 | #endif 31 | 32 | typedef union ALIGN_STRUCT(16) SIMDVec { 33 | float m128_f32[4]; // as floats - DON'T USE. Added for convenience. 34 | int8_t m128_i8[16]; // as signed 8-bit integers. 35 | int16_t m128_i16[8]; // as signed 16-bit integers. 36 | int32_t m128_i32[4]; // as signed 32-bit integers. 37 | int64_t m128_i64[2]; // as signed 64-bit integers. 38 | uint8_t m128_u8[16]; // as unsigned 8-bit integers. 
// Bit-level reinterpretation helpers. Each one copies the object
// representation of its argument into a value of the target type with
// memcpy, so no numeric conversion takes place.

// Reinterpret the bits of a 64-bit unsigned integer as a double.
static inline double sse2neon_tool_recast_f64(uint64_t u64)
{
    double out;
    memcpy(&out, &u64, sizeof(u64));
    return out;
}

// Reinterpret the bits of a double as a signed 64-bit integer.
static inline int64_t sse2neon_tool_recast_i64(double f64)
{
    int64_t out;
    memcpy(&out, &f64, sizeof(out));
    return out;
}

// Reinterpret the bits of a 32-bit unsigned integer as a float.
static inline float sse2neon_tool_recast_f32(uint32_t u32)
{
    float out;
    memcpy(&out, &u32, sizeof(u32));
    return out;
}

// Reinterpret the bits of a 32-bit signed integer as a float.
static inline float sse2neon_tool_recast_f32(int32_t i32)
{
    float out;
    memcpy(&out, &i32, sizeof(i32));
    return out;
}
| ASSERT_RETURN(t1[0] == t2[0]); 114 | ASSERT_RETURN(t1[1] == t2[1]); 115 | ASSERT_RETURN(t1[2] == t2[2]); 116 | ASSERT_RETURN(t1[3] == t2[3]); 117 | return TEST_SUCCESS; 118 | } 119 | result_t validateInt64(__m128i a, int64_t i0, int64_t i1); 120 | result_t validateInt64(__m64 a, int64_t i0); 121 | result_t validateUInt64(__m128i a, uint64_t u0, uint64_t u1); 122 | result_t validateUInt64(__m64 a, uint64_t u0); 123 | result_t validateInt32(__m128i a, 124 | int32_t i0, 125 | int32_t i1, 126 | int32_t i2, 127 | int32_t i3); 128 | result_t validateUInt32(__m128i a, 129 | uint32_t u0, 130 | uint32_t u1, 131 | uint32_t u2, 132 | uint32_t u3); 133 | result_t validateUInt32(__m64 a, uint32_t u0, uint32_t u1); 134 | result_t validateInt32(__m64 a, int32_t u0, int32_t u1); 135 | result_t validateInt16(__m128i a, 136 | int16_t i0, 137 | int16_t i1, 138 | int16_t i2, 139 | int16_t i3, 140 | int16_t i4, 141 | int16_t i5, 142 | int16_t i6, 143 | int16_t i7); 144 | result_t validateInt16(__m64 a, int16_t i0, int16_t i1, int16_t i2, int16_t i3); 145 | result_t validateUInt16(__m128i a, 146 | uint16_t u0, 147 | uint16_t u1, 148 | uint16_t u2, 149 | uint16_t u3, 150 | uint16_t u4, 151 | uint16_t u5, 152 | uint16_t u6, 153 | uint16_t u7); 154 | result_t validateUInt16(__m64 a, 155 | uint16_t u0, 156 | uint16_t u1, 157 | uint16_t u2, 158 | uint16_t u3); 159 | result_t validateInt8(__m128i a, 160 | int8_t i0, 161 | int8_t i1, 162 | int8_t i2, 163 | int8_t i3, 164 | int8_t i4, 165 | int8_t i5, 166 | int8_t i6, 167 | int8_t i7, 168 | int8_t i8, 169 | int8_t i9, 170 | int8_t i10, 171 | int8_t i11, 172 | int8_t i12, 173 | int8_t i13, 174 | int8_t i14, 175 | int8_t i15); 176 | result_t validateInt8(__m64 a, 177 | int8_t i0, 178 | int8_t i1, 179 | int8_t i2, 180 | int8_t i3, 181 | int8_t i4, 182 | int8_t i5, 183 | int8_t i6, 184 | int8_t i7); 185 | result_t validateUInt8(__m128i a, 186 | uint8_t u0, 187 | uint8_t u1, 188 | uint8_t u2, 189 | uint8_t u3, 190 | uint8_t u4, 191 | uint8_t u5, 
// Convenience wrappers that validate a 128-bit vector A against an
// indexable sequence B of expected elements, by expanding B into the
// individual arguments of the matching validate*() function.
// Both arguments are parenthesized so that expression arguments (for
// example a pointer offset such as `p + 4`) are indexed as a whole.
#define VALIDATE_INT8_M128(A, B)                                          \
    validateInt8((A), (B)[0], (B)[1], (B)[2], (B)[3], (B)[4], (B)[5],     \
                 (B)[6], (B)[7], (B)[8], (B)[9], (B)[10], (B)[11],        \
                 (B)[12], (B)[13], (B)[14], (B)[15])
#define VALIDATE_UINT8_M128(A, B)                                         \
    validateUInt8((A), (B)[0], (B)[1], (B)[2], (B)[3], (B)[4], (B)[5],    \
                  (B)[6], (B)[7], (B)[8], (B)[9], (B)[10], (B)[11],       \
                  (B)[12], (B)[13], (B)[14], (B)[15])
#define VALIDATE_INT16_M128(A, B)                                         \
    validateInt16((A), (B)[0], (B)[1], (B)[2], (B)[3], (B)[4], (B)[5],    \
                  (B)[6], (B)[7])
#define VALIDATE_UINT16_M128(A, B)                                        \
    validateUInt16((A), (B)[0], (B)[1], (B)[2], (B)[3], (B)[4], (B)[5],   \
                   (B)[6], (B)[7])
#define VALIDATE_INT32_M128(A, B) \
    validateInt32((A), (B)[0], (B)[1], (B)[2], (B)[3])
#define VALIDATE_UINT32_M128(A, B) \
    validateUInt32((A), (B)[0], (B)[1], (B)[2], (B)[3])
// Early-out helper for test bodies: evaluate EXP once and, unless it equals
// TEST_SUCCESS, return TEST_FAIL from the enclosing function.
// EXP is parenthesized so that arguments containing lower-precedence
// operators (e.g. `cond ? r1 : r2` or `r1 & r2`) are compared as a whole
// instead of having `!= TEST_SUCCESS` bind to their last operand.
#define CHECK_RESULT(EXP)          \
    if ((EXP) != TEST_SUCCESS) {   \
        return TEST_FAIL;          \
    }
// Expands to IMM_64_ITER followed by TEST_IMPL(64) ... TEST_IMPL(127), i.e.
// one TEST_IMPL(i) invocation for every immediate value i in [0, 127].
// TEST_IMPL is not defined in this header; presumably each test defines it
// before expanding this macro (confirm against the users in impl.cpp).
#define IMM_128_ITER \
    IMM_64_ITER \
    TEST_IMPL(64) \
    TEST_IMPL(65) \
    TEST_IMPL(66) \
    TEST_IMPL(67) \
    TEST_IMPL(68) \
    TEST_IMPL(69) \
    TEST_IMPL(70) \
    TEST_IMPL(71) \
    TEST_IMPL(72) \
    TEST_IMPL(73) \
    TEST_IMPL(74) \
    TEST_IMPL(75) \
    TEST_IMPL(76) \
    TEST_IMPL(77) \
    TEST_IMPL(78) \
    TEST_IMPL(79) \
    TEST_IMPL(80) \
    TEST_IMPL(81) \
    TEST_IMPL(82) \
    TEST_IMPL(83) \
    TEST_IMPL(84) \
    TEST_IMPL(85) \
    TEST_IMPL(86) \
    TEST_IMPL(87) \
    TEST_IMPL(88) \
    TEST_IMPL(89) \
    TEST_IMPL(90) \
    TEST_IMPL(91) \
    TEST_IMPL(92) \
    TEST_IMPL(93) \
    TEST_IMPL(94) \
    TEST_IMPL(95) \
    TEST_IMPL(96) \
    TEST_IMPL(97) \
    TEST_IMPL(98) \
    TEST_IMPL(99) \
    TEST_IMPL(100) \
    TEST_IMPL(101) \
    TEST_IMPL(102) \
    TEST_IMPL(103) \
    TEST_IMPL(104) \
    TEST_IMPL(105) \
    TEST_IMPL(106) \
    TEST_IMPL(107) \
    TEST_IMPL(108) \
    TEST_IMPL(109) \
    TEST_IMPL(110) \
    TEST_IMPL(111) \
    TEST_IMPL(112) \
    TEST_IMPL(113) \
    TEST_IMPL(114) \
    TEST_IMPL(115) \
    TEST_IMPL(116) \
    TEST_IMPL(117) \
    TEST_IMPL(118) \
    TEST_IMPL(119) \
    TEST_IMPL(120) \
    TEST_IMPL(121) \
    TEST_IMPL(122) \
    TEST_IMPL(123) \
    TEST_IMPL(124) \
    TEST_IMPL(125) \
    TEST_IMPL(126) \
    TEST_IMPL(127)
TEST_IMPL(147) \ 417 | TEST_IMPL(148) \ 418 | TEST_IMPL(149) \ 419 | TEST_IMPL(150) \ 420 | TEST_IMPL(151) \ 421 | TEST_IMPL(152) \ 422 | TEST_IMPL(153) \ 423 | TEST_IMPL(154) \ 424 | TEST_IMPL(155) \ 425 | TEST_IMPL(156) \ 426 | TEST_IMPL(157) \ 427 | TEST_IMPL(158) \ 428 | TEST_IMPL(159) \ 429 | TEST_IMPL(160) \ 430 | TEST_IMPL(161) \ 431 | TEST_IMPL(162) \ 432 | TEST_IMPL(163) \ 433 | TEST_IMPL(164) \ 434 | TEST_IMPL(165) \ 435 | TEST_IMPL(166) \ 436 | TEST_IMPL(167) \ 437 | TEST_IMPL(168) \ 438 | TEST_IMPL(169) \ 439 | TEST_IMPL(170) \ 440 | TEST_IMPL(171) \ 441 | TEST_IMPL(172) \ 442 | TEST_IMPL(173) \ 443 | TEST_IMPL(174) \ 444 | TEST_IMPL(175) \ 445 | TEST_IMPL(176) \ 446 | TEST_IMPL(177) \ 447 | TEST_IMPL(178) \ 448 | TEST_IMPL(179) \ 449 | TEST_IMPL(180) \ 450 | TEST_IMPL(181) \ 451 | TEST_IMPL(182) \ 452 | TEST_IMPL(183) \ 453 | TEST_IMPL(184) \ 454 | TEST_IMPL(185) \ 455 | TEST_IMPL(186) \ 456 | TEST_IMPL(187) \ 457 | TEST_IMPL(188) \ 458 | TEST_IMPL(189) \ 459 | TEST_IMPL(190) \ 460 | TEST_IMPL(191) \ 461 | TEST_IMPL(192) \ 462 | TEST_IMPL(193) \ 463 | TEST_IMPL(194) \ 464 | TEST_IMPL(195) \ 465 | TEST_IMPL(196) \ 466 | TEST_IMPL(197) \ 467 | TEST_IMPL(198) \ 468 | TEST_IMPL(199) \ 469 | TEST_IMPL(200) \ 470 | TEST_IMPL(201) \ 471 | TEST_IMPL(202) \ 472 | TEST_IMPL(203) \ 473 | TEST_IMPL(204) \ 474 | TEST_IMPL(205) \ 475 | TEST_IMPL(206) \ 476 | TEST_IMPL(207) \ 477 | TEST_IMPL(208) \ 478 | TEST_IMPL(209) \ 479 | TEST_IMPL(210) \ 480 | TEST_IMPL(211) \ 481 | TEST_IMPL(212) \ 482 | TEST_IMPL(213) \ 483 | TEST_IMPL(214) \ 484 | TEST_IMPL(215) \ 485 | TEST_IMPL(216) \ 486 | TEST_IMPL(217) \ 487 | TEST_IMPL(218) \ 488 | TEST_IMPL(219) \ 489 | TEST_IMPL(220) \ 490 | TEST_IMPL(221) \ 491 | TEST_IMPL(222) \ 492 | TEST_IMPL(223) \ 493 | TEST_IMPL(224) \ 494 | TEST_IMPL(225) \ 495 | TEST_IMPL(226) \ 496 | TEST_IMPL(227) \ 497 | TEST_IMPL(228) \ 498 | TEST_IMPL(229) \ 499 | TEST_IMPL(230) \ 500 | TEST_IMPL(231) \ 501 | TEST_IMPL(232) \ 502 | TEST_IMPL(233) \ 503 
| TEST_IMPL(234) \ 504 | TEST_IMPL(235) \ 505 | TEST_IMPL(236) \ 506 | TEST_IMPL(237) \ 507 | TEST_IMPL(238) \ 508 | TEST_IMPL(239) \ 509 | TEST_IMPL(240) \ 510 | TEST_IMPL(241) \ 511 | TEST_IMPL(242) \ 512 | TEST_IMPL(243) \ 513 | TEST_IMPL(244) \ 514 | TEST_IMPL(245) \ 515 | TEST_IMPL(246) \ 516 | TEST_IMPL(247) \ 517 | TEST_IMPL(248) \ 518 | TEST_IMPL(249) \ 519 | TEST_IMPL(250) \ 520 | TEST_IMPL(251) \ 521 | TEST_IMPL(252) \ 522 | TEST_IMPL(253) \ 523 | TEST_IMPL(254) \ 524 | TEST_IMPL(255) 525 | } // namespace SSE2NEON 526 | 527 | #endif 528 | -------------------------------------------------------------------------------- /tests/impl.h: -------------------------------------------------------------------------------- 1 | #ifndef SSE2NEONTEST_H 2 | #define SSE2NEONTEST_H 3 | 4 | #include "common.h" 5 | 6 | #define INTRIN_LIST \ 7 | /* MMX */ \ 8 | _(mm_empty) \ 9 | /* SSE */ \ 10 | _(mm_add_ps) \ 11 | _(mm_add_ss) \ 12 | _(mm_and_ps) \ 13 | _(mm_andnot_ps) \ 14 | _(mm_avg_pu16) \ 15 | _(mm_avg_pu8) \ 16 | _(mm_cmpeq_ps) \ 17 | _(mm_cmpeq_ss) \ 18 | _(mm_cmpge_ps) \ 19 | _(mm_cmpge_ss) \ 20 | _(mm_cmpgt_ps) \ 21 | _(mm_cmpgt_ss) \ 22 | _(mm_cmple_ps) \ 23 | _(mm_cmple_ss) \ 24 | _(mm_cmplt_ps) \ 25 | _(mm_cmplt_ss) \ 26 | _(mm_cmpneq_ps) \ 27 | _(mm_cmpneq_ss) \ 28 | _(mm_cmpnge_ps) \ 29 | _(mm_cmpnge_ss) \ 30 | _(mm_cmpngt_ps) \ 31 | _(mm_cmpngt_ss) \ 32 | _(mm_cmpnle_ps) \ 33 | _(mm_cmpnle_ss) \ 34 | _(mm_cmpnlt_ps) \ 35 | _(mm_cmpnlt_ss) \ 36 | _(mm_cmpord_ps) \ 37 | _(mm_cmpord_ss) \ 38 | _(mm_cmpunord_ps) \ 39 | _(mm_cmpunord_ss) \ 40 | _(mm_comieq_ss) \ 41 | _(mm_comige_ss) \ 42 | _(mm_comigt_ss) \ 43 | _(mm_comile_ss) \ 44 | _(mm_comilt_ss) \ 45 | _(mm_comineq_ss) \ 46 | _(mm_cvt_pi2ps) \ 47 | _(mm_cvt_ps2pi) \ 48 | _(mm_cvt_si2ss) \ 49 | _(mm_cvt_ss2si) \ 50 | _(mm_cvtpi16_ps) \ 51 | _(mm_cvtpi32_ps) \ 52 | _(mm_cvtpi32x2_ps) \ 53 | _(mm_cvtpi8_ps) \ 54 | _(mm_cvtps_pi16) \ 55 | _(mm_cvtps_pi32) \ 56 | _(mm_cvtps_pi8) \ 57 | _(mm_cvtpu16_ps) \ 58 | 
_(mm_cvtpu8_ps) \ 59 | _(mm_cvtsi32_ss) \ 60 | _(mm_cvtsi64_ss) \ 61 | _(mm_cvtss_f32) \ 62 | _(mm_cvtss_si32) \ 63 | _(mm_cvtss_si64) \ 64 | _(mm_cvtt_ps2pi) \ 65 | _(mm_cvtt_ss2si) \ 66 | _(mm_cvttps_pi32) \ 67 | _(mm_cvttss_si32) \ 68 | _(mm_cvttss_si64) \ 69 | _(mm_div_ps) \ 70 | _(mm_div_ss) \ 71 | _(mm_extract_pi16) \ 72 | _(mm_free) \ 73 | _(mm_get_flush_zero_mode) \ 74 | _(mm_get_rounding_mode) \ 75 | _(mm_getcsr) \ 76 | _(mm_insert_pi16) \ 77 | _(mm_load_ps) \ 78 | _(mm_load_ps1) \ 79 | _(mm_load_ss) \ 80 | _(mm_load1_ps) \ 81 | _(mm_loadh_pi) \ 82 | _(mm_loadl_pi) \ 83 | _(mm_loadr_ps) \ 84 | _(mm_loadu_ps) \ 85 | _(mm_loadu_si16) \ 86 | _(mm_loadu_si64) \ 87 | _(mm_malloc) \ 88 | _(mm_maskmove_si64) \ 89 | _(m_maskmovq) \ 90 | _(mm_max_pi16) \ 91 | _(mm_max_ps) \ 92 | _(mm_max_pu8) \ 93 | _(mm_max_ss) \ 94 | _(mm_min_pi16) \ 95 | _(mm_min_ps) \ 96 | _(mm_min_pu8) \ 97 | _(mm_min_ss) \ 98 | _(mm_move_ss) \ 99 | _(mm_movehl_ps) \ 100 | _(mm_movelh_ps) \ 101 | _(mm_movemask_pi8) \ 102 | _(mm_movemask_ps) \ 103 | _(mm_mul_ps) \ 104 | _(mm_mul_ss) \ 105 | _(mm_mulhi_pu16) \ 106 | _(mm_or_ps) \ 107 | _(m_pavgb) \ 108 | _(m_pavgw) \ 109 | _(m_pextrw) \ 110 | _(m_pinsrw) \ 111 | _(m_pmaxsw) \ 112 | _(m_pmaxub) \ 113 | _(m_pminsw) \ 114 | _(m_pminub) \ 115 | _(m_pmovmskb) \ 116 | _(m_pmulhuw) \ 117 | _(mm_prefetch) \ 118 | _(m_psadbw) \ 119 | _(m_pshufw) \ 120 | _(mm_rcp_ps) \ 121 | _(mm_rcp_ss) \ 122 | _(mm_rsqrt_ps) \ 123 | _(mm_rsqrt_ss) \ 124 | _(mm_sad_pu8) \ 125 | _(mm_set_flush_zero_mode) \ 126 | _(mm_set_ps) \ 127 | _(mm_set_ps1) \ 128 | _(mm_set_rounding_mode) \ 129 | _(mm_set_ss) \ 130 | _(mm_set1_ps) \ 131 | _(mm_setcsr) \ 132 | _(mm_setr_ps) \ 133 | _(mm_setzero_ps) \ 134 | _(mm_sfence) \ 135 | _(mm_shuffle_pi16) \ 136 | _(mm_shuffle_ps) \ 137 | _(mm_sqrt_ps) \ 138 | _(mm_sqrt_ss) \ 139 | _(mm_store_ps) \ 140 | _(mm_store_ps1) \ 141 | _(mm_store_ss) \ 142 | _(mm_store1_ps) \ 143 | _(mm_storeh_pi) \ 144 | _(mm_storel_pi) \ 145 | _(mm_storer_ps) \ 146 | 
_(mm_storeu_ps) \ 147 | _(mm_storeu_si16) \ 148 | _(mm_storeu_si64) \ 149 | _(mm_stream_pi) \ 150 | _(mm_stream_ps) \ 151 | _(mm_sub_ps) \ 152 | _(mm_sub_ss) \ 153 | _(mm_ucomieq_ss) \ 154 | _(mm_ucomige_ss) \ 155 | _(mm_ucomigt_ss) \ 156 | _(mm_ucomile_ss) \ 157 | _(mm_ucomilt_ss) \ 158 | _(mm_ucomineq_ss) \ 159 | _(mm_undefined_ps) \ 160 | _(mm_unpackhi_ps) \ 161 | _(mm_unpacklo_ps) \ 162 | _(mm_xor_ps) \ 163 | /* SSE2 */ \ 164 | _(mm_add_epi16) \ 165 | _(mm_add_epi32) \ 166 | _(mm_add_epi64) \ 167 | _(mm_add_epi8) \ 168 | _(mm_add_pd) \ 169 | _(mm_add_sd) \ 170 | _(mm_add_si64) \ 171 | _(mm_adds_epi16) \ 172 | _(mm_adds_epi8) \ 173 | _(mm_adds_epu16) \ 174 | _(mm_adds_epu8) \ 175 | _(mm_and_pd) \ 176 | _(mm_and_si128) \ 177 | _(mm_andnot_pd) \ 178 | _(mm_andnot_si128) \ 179 | _(mm_avg_epu16) \ 180 | _(mm_avg_epu8) \ 181 | _(mm_bslli_si128) \ 182 | _(mm_bsrli_si128) \ 183 | _(mm_castpd_ps) \ 184 | _(mm_castpd_si128) \ 185 | _(mm_castps_pd) \ 186 | _(mm_castps_si128) \ 187 | _(mm_castsi128_pd) \ 188 | _(mm_castsi128_ps) \ 189 | _(mm_clflush) \ 190 | _(mm_cmpeq_epi16) \ 191 | _(mm_cmpeq_epi32) \ 192 | _(mm_cmpeq_epi8) \ 193 | _(mm_cmpeq_pd) \ 194 | _(mm_cmpeq_sd) \ 195 | _(mm_cmpge_pd) \ 196 | _(mm_cmpge_sd) \ 197 | _(mm_cmpgt_epi16) \ 198 | _(mm_cmpgt_epi32) \ 199 | _(mm_cmpgt_epi8) \ 200 | _(mm_cmpgt_pd) \ 201 | _(mm_cmpgt_sd) \ 202 | _(mm_cmple_pd) \ 203 | _(mm_cmple_sd) \ 204 | _(mm_cmplt_epi16) \ 205 | _(mm_cmplt_epi32) \ 206 | _(mm_cmplt_epi8) \ 207 | _(mm_cmplt_pd) \ 208 | _(mm_cmplt_sd) \ 209 | _(mm_cmpneq_pd) \ 210 | _(mm_cmpneq_sd) \ 211 | _(mm_cmpnge_pd) \ 212 | _(mm_cmpnge_sd) \ 213 | _(mm_cmpngt_pd) \ 214 | _(mm_cmpngt_sd) \ 215 | _(mm_cmpnle_pd) \ 216 | _(mm_cmpnle_sd) \ 217 | _(mm_cmpnlt_pd) \ 218 | _(mm_cmpnlt_sd) \ 219 | _(mm_cmpord_pd) \ 220 | _(mm_cmpord_sd) \ 221 | _(mm_cmpunord_pd) \ 222 | _(mm_cmpunord_sd) \ 223 | _(mm_comieq_sd) \ 224 | _(mm_comige_sd) \ 225 | _(mm_comigt_sd) \ 226 | _(mm_comile_sd) \ 227 | _(mm_comilt_sd) \ 228 | 
_(mm_comineq_sd) \ 229 | _(mm_cvtepi32_pd) \ 230 | _(mm_cvtepi32_ps) \ 231 | _(mm_cvtpd_epi32) \ 232 | _(mm_cvtpd_pi32) \ 233 | _(mm_cvtpd_ps) \ 234 | _(mm_cvtpi32_pd) \ 235 | _(mm_cvtps_epi32) \ 236 | _(mm_cvtps_pd) \ 237 | _(mm_cvtsd_f64) \ 238 | _(mm_cvtsd_si32) \ 239 | _(mm_cvtsd_si64) \ 240 | _(mm_cvtsd_si64x) \ 241 | _(mm_cvtsd_ss) \ 242 | _(mm_cvtsi128_si32) \ 243 | _(mm_cvtsi128_si64) \ 244 | _(mm_cvtsi128_si64x) \ 245 | _(mm_cvtsi32_sd) \ 246 | _(mm_cvtsi32_si128) \ 247 | _(mm_cvtsi64_sd) \ 248 | _(mm_cvtsi64_si128) \ 249 | _(mm_cvtsi64x_sd) \ 250 | _(mm_cvtsi64x_si128) \ 251 | _(mm_cvtss_sd) \ 252 | _(mm_cvttpd_epi32) \ 253 | _(mm_cvttpd_pi32) \ 254 | _(mm_cvttps_epi32) \ 255 | _(mm_cvttsd_si32) \ 256 | _(mm_cvttsd_si64) \ 257 | _(mm_cvttsd_si64x) \ 258 | _(mm_div_pd) \ 259 | _(mm_div_sd) \ 260 | _(mm_extract_epi16) \ 261 | _(mm_insert_epi16) \ 262 | _(mm_lfence) \ 263 | _(mm_load_pd) \ 264 | _(mm_load_pd1) \ 265 | _(mm_load_sd) \ 266 | _(mm_load_si128) \ 267 | _(mm_load1_pd) \ 268 | _(mm_loadh_pd) \ 269 | _(mm_loadl_epi64) \ 270 | _(mm_loadl_pd) \ 271 | _(mm_loadr_pd) \ 272 | _(mm_loadu_pd) \ 273 | _(mm_loadu_si128) \ 274 | _(mm_loadu_si32) \ 275 | _(mm_madd_epi16) \ 276 | _(mm_maskmoveu_si128) \ 277 | _(mm_max_epi16) \ 278 | _(mm_max_epu8) \ 279 | _(mm_max_pd) \ 280 | _(mm_max_sd) \ 281 | _(mm_mfence) \ 282 | _(mm_min_epi16) \ 283 | _(mm_min_epu8) \ 284 | _(mm_min_pd) \ 285 | _(mm_min_sd) \ 286 | _(mm_move_epi64) \ 287 | _(mm_move_sd) \ 288 | _(mm_movemask_epi8) \ 289 | _(mm_movemask_pd) \ 290 | _(mm_movepi64_pi64) \ 291 | _(mm_movpi64_epi64) \ 292 | _(mm_mul_epu32) \ 293 | _(mm_mul_pd) \ 294 | _(mm_mul_sd) \ 295 | _(mm_mul_su32) \ 296 | _(mm_mulhi_epi16) \ 297 | _(mm_mulhi_epu16) \ 298 | _(mm_mullo_epi16) \ 299 | _(mm_or_pd) \ 300 | _(mm_or_si128) \ 301 | _(mm_packs_epi16) \ 302 | _(mm_packs_epi32) \ 303 | _(mm_packus_epi16) \ 304 | _(mm_pause) \ 305 | _(mm_sad_epu8) \ 306 | _(mm_set_epi16) \ 307 | _(mm_set_epi32) \ 308 | _(mm_set_epi64) \ 309 | 
_(mm_set_epi64x) \ 310 | _(mm_set_epi8) \ 311 | _(mm_set_pd) \ 312 | _(mm_set_pd1) \ 313 | _(mm_set_sd) \ 314 | _(mm_set1_epi16) \ 315 | _(mm_set1_epi32) \ 316 | _(mm_set1_epi64) \ 317 | _(mm_set1_epi64x) \ 318 | _(mm_set1_epi8) \ 319 | _(mm_set1_pd) \ 320 | _(mm_setr_epi16) \ 321 | _(mm_setr_epi32) \ 322 | _(mm_setr_epi64) \ 323 | _(mm_setr_epi8) \ 324 | _(mm_setr_pd) \ 325 | _(mm_setzero_pd) \ 326 | _(mm_setzero_si128) \ 327 | _(mm_shuffle_epi32) \ 328 | _(mm_shuffle_pd) \ 329 | _(mm_shufflehi_epi16) \ 330 | _(mm_shufflelo_epi16) \ 331 | _(mm_sll_epi16) \ 332 | _(mm_sll_epi32) \ 333 | _(mm_sll_epi64) \ 334 | _(mm_slli_epi16) \ 335 | _(mm_slli_epi32) \ 336 | _(mm_slli_epi64) \ 337 | _(mm_slli_si128) \ 338 | _(mm_sqrt_pd) \ 339 | _(mm_sqrt_sd) \ 340 | _(mm_sra_epi16) \ 341 | _(mm_sra_epi32) \ 342 | _(mm_srai_epi16) \ 343 | _(mm_srai_epi32) \ 344 | _(mm_srl_epi16) \ 345 | _(mm_srl_epi32) \ 346 | _(mm_srl_epi64) \ 347 | _(mm_srli_epi16) \ 348 | _(mm_srli_epi32) \ 349 | _(mm_srli_epi64) \ 350 | _(mm_srli_si128) \ 351 | _(mm_store_pd) \ 352 | _(mm_store_pd1) \ 353 | _(mm_store_sd) \ 354 | _(mm_store_si128) \ 355 | _(mm_store1_pd) \ 356 | _(mm_storeh_pd) \ 357 | _(mm_storel_epi64) \ 358 | _(mm_storel_pd) \ 359 | _(mm_storer_pd) \ 360 | _(mm_storeu_pd) \ 361 | _(mm_storeu_si128) \ 362 | _(mm_storeu_si32) \ 363 | _(mm_stream_pd) \ 364 | _(mm_stream_si128) \ 365 | _(mm_stream_si32) \ 366 | _(mm_stream_si64) \ 367 | _(mm_sub_epi16) \ 368 | _(mm_sub_epi32) \ 369 | _(mm_sub_epi64) \ 370 | _(mm_sub_epi8) \ 371 | _(mm_sub_pd) \ 372 | _(mm_sub_sd) \ 373 | _(mm_sub_si64) \ 374 | _(mm_subs_epi16) \ 375 | _(mm_subs_epi8) \ 376 | _(mm_subs_epu16) \ 377 | _(mm_subs_epu8) \ 378 | _(mm_ucomieq_sd) \ 379 | _(mm_ucomige_sd) \ 380 | _(mm_ucomigt_sd) \ 381 | _(mm_ucomile_sd) \ 382 | _(mm_ucomilt_sd) \ 383 | _(mm_ucomineq_sd) \ 384 | _(mm_undefined_pd) \ 385 | _(mm_undefined_si128) \ 386 | _(mm_unpackhi_epi16) \ 387 | _(mm_unpackhi_epi32) \ 388 | _(mm_unpackhi_epi64) \ 389 | 
_(mm_unpackhi_epi8) \ 390 | _(mm_unpackhi_pd) \ 391 | _(mm_unpacklo_epi16) \ 392 | _(mm_unpacklo_epi32) \ 393 | _(mm_unpacklo_epi64) \ 394 | _(mm_unpacklo_epi8) \ 395 | _(mm_unpacklo_pd) \ 396 | _(mm_xor_pd) \ 397 | _(mm_xor_si128) \ 398 | /* SSE3 */ \ 399 | _(mm_addsub_pd) \ 400 | _(mm_addsub_ps) \ 401 | _(mm_hadd_pd) \ 402 | _(mm_hadd_ps) \ 403 | _(mm_hsub_pd) \ 404 | _(mm_hsub_ps) \ 405 | _(mm_lddqu_si128) \ 406 | _(mm_loaddup_pd) \ 407 | _(mm_movedup_pd) \ 408 | _(mm_movehdup_ps) \ 409 | _(mm_moveldup_ps) \ 410 | /* SSSE3 */ \ 411 | _(mm_abs_epi16) \ 412 | _(mm_abs_epi32) \ 413 | _(mm_abs_epi8) \ 414 | _(mm_abs_pi16) \ 415 | _(mm_abs_pi32) \ 416 | _(mm_abs_pi8) \ 417 | _(mm_alignr_epi8) \ 418 | _(mm_alignr_pi8) \ 419 | _(mm_hadd_epi16) \ 420 | _(mm_hadd_epi32) \ 421 | _(mm_hadd_pi16) \ 422 | _(mm_hadd_pi32) \ 423 | _(mm_hadds_epi16) \ 424 | _(mm_hadds_pi16) \ 425 | _(mm_hsub_epi16) \ 426 | _(mm_hsub_epi32) \ 427 | _(mm_hsub_pi16) \ 428 | _(mm_hsub_pi32) \ 429 | _(mm_hsubs_epi16) \ 430 | _(mm_hsubs_pi16) \ 431 | _(mm_maddubs_epi16) \ 432 | _(mm_maddubs_pi16) \ 433 | _(mm_mulhrs_epi16) \ 434 | _(mm_mulhrs_pi16) \ 435 | _(mm_shuffle_epi8) \ 436 | _(mm_shuffle_pi8) \ 437 | _(mm_sign_epi16) \ 438 | _(mm_sign_epi32) \ 439 | _(mm_sign_epi8) \ 440 | _(mm_sign_pi16) \ 441 | _(mm_sign_pi32) \ 442 | _(mm_sign_pi8) \ 443 | /* SSE4.1 */ \ 444 | _(mm_blend_epi16) \ 445 | _(mm_blend_pd) \ 446 | _(mm_blend_ps) \ 447 | _(mm_blendv_epi8) \ 448 | _(mm_blendv_pd) \ 449 | _(mm_blendv_ps) \ 450 | _(mm_ceil_pd) \ 451 | _(mm_ceil_ps) \ 452 | _(mm_ceil_sd) \ 453 | _(mm_ceil_ss) \ 454 | _(mm_cmpeq_epi64) \ 455 | _(mm_cvtepi16_epi32) \ 456 | _(mm_cvtepi16_epi64) \ 457 | _(mm_cvtepi32_epi64) \ 458 | _(mm_cvtepi8_epi16) \ 459 | _(mm_cvtepi8_epi32) \ 460 | _(mm_cvtepi8_epi64) \ 461 | _(mm_cvtepu16_epi32) \ 462 | _(mm_cvtepu16_epi64) \ 463 | _(mm_cvtepu32_epi64) \ 464 | _(mm_cvtepu8_epi16) \ 465 | _(mm_cvtepu8_epi32) \ 466 | _(mm_cvtepu8_epi64) \ 467 | _(mm_dp_pd) \ 468 | _(mm_dp_ps) \ 469 | 
_(mm_extract_epi32) \ 470 | _(mm_extract_epi64) \ 471 | _(mm_extract_epi8) \ 472 | _(mm_extract_ps) \ 473 | _(mm_floor_pd) \ 474 | _(mm_floor_ps) \ 475 | _(mm_floor_sd) \ 476 | _(mm_floor_ss) \ 477 | _(mm_insert_epi32) \ 478 | _(mm_insert_epi64) \ 479 | _(mm_insert_epi8) \ 480 | _(mm_insert_ps) \ 481 | _(mm_max_epi32) \ 482 | _(mm_max_epi8) \ 483 | _(mm_max_epu16) \ 484 | _(mm_max_epu32) \ 485 | _(mm_min_epi32) \ 486 | _(mm_min_epi8) \ 487 | _(mm_min_epu16) \ 488 | _(mm_min_epu32) \ 489 | _(mm_minpos_epu16) \ 490 | _(mm_mpsadbw_epu8) \ 491 | _(mm_mul_epi32) \ 492 | _(mm_mullo_epi32) \ 493 | _(mm_packus_epi32) \ 494 | _(mm_round_pd) \ 495 | _(mm_round_ps) \ 496 | _(mm_round_sd) \ 497 | _(mm_round_ss) \ 498 | _(mm_stream_load_si128) \ 499 | _(mm_test_all_ones) \ 500 | _(mm_test_all_zeros) \ 501 | _(mm_test_mix_ones_zeros) \ 502 | _(mm_testc_si128) \ 503 | _(mm_testnzc_si128) \ 504 | _(mm_testz_si128) \ 505 | /* SSE4.2 */ \ 506 | _(mm_cmpestra) \ 507 | _(mm_cmpestrc) \ 508 | _(mm_cmpestri) \ 509 | _(mm_cmpestrm) \ 510 | _(mm_cmpestro) \ 511 | _(mm_cmpestrs) \ 512 | _(mm_cmpestrz) \ 513 | _(mm_cmpgt_epi64) \ 514 | _(mm_cmpistra) \ 515 | _(mm_cmpistrc) \ 516 | _(mm_cmpistri) \ 517 | _(mm_cmpistrm) \ 518 | _(mm_cmpistro) \ 519 | _(mm_cmpistrs) \ 520 | _(mm_cmpistrz) \ 521 | _(mm_crc32_u16) \ 522 | _(mm_crc32_u32) \ 523 | _(mm_crc32_u64) \ 524 | _(mm_crc32_u8) \ 525 | /* AES */ \ 526 | _(mm_aesenc_si128) \ 527 | _(mm_aesdec_si128) \ 528 | _(mm_aesenclast_si128) \ 529 | _(mm_aesdeclast_si128) \ 530 | _(mm_aesimc_si128) \ 531 | _(mm_aeskeygenassist_si128) \ 532 | /* Others */ \ 533 | _(mm_clmulepi64_si128) \ 534 | _(mm_get_denormals_zero_mode) \ 535 | _(mm_popcnt_u32) \ 536 | _(mm_popcnt_u64) \ 537 | _(mm_set_denormals_zero_mode) \ 538 | _(rdtsc) \ 539 | _(last) /* This indicates the end of macros */ 540 | 541 | namespace SSE2NEON 542 | { 543 | // The way unit tests are implemented is that 10,000 random floating point and 544 | // integer vec4 numbers are generated as 
sample data. 545 | // 546 | // A short C implementation of every intrinsic is implemented and compared to 547 | // the actual expected results from the corresponding SSE intrinsic against all 548 | // of the 10,000 randomized input vectors. When running on ARM, then the results 549 | // are compared to the NEON approximate version. 550 | extern const char *instructionString[]; 551 | enum InstructionTest { 552 | #define _(x) it_##x, 553 | INTRIN_LIST 554 | #undef _ 555 | }; 556 | 557 | class SSE2NEONTest 558 | { 559 | public: 560 | static SSE2NEONTest *create(void); // create the test. 561 | 562 | // Run test of this instruction; 563 | // Passed: TEST_SUCCESS (1) 564 | // Failed: TEST_FAIL (0) 565 | // Unimplemented: TEST_UNIMPL (-1) 566 | virtual result_t runTest(InstructionTest test) = 0; 567 | virtual void release(void) = 0; 568 | }; 569 | 570 | } // namespace SSE2NEON 571 | 572 | #endif 573 | -------------------------------------------------------------------------------- /tests/main.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include "impl.h" 4 | 5 | int main(int /*argc*/, const char ** /*argv*/) 6 | { 7 | SSE2NEON::SSE2NEONTest *test = SSE2NEON::SSE2NEONTest::create(); 8 | uint32_t passCount = 0; 9 | uint32_t failedCount = 0; 10 | uint32_t ignoreCount = 0; 11 | for (uint32_t i = 0; i < SSE2NEON::it_last; i++) { 12 | SSE2NEON::InstructionTest it = SSE2NEON::InstructionTest(i); 13 | SSE2NEON::result_t ret = test->runTest(it); 14 | // If the test fails, we will run it again so we can step into the 15 | // debugger and figure out why! 
16 | if (ret == SSE2NEON::TEST_FAIL) { 17 | printf("Test %-30s failed\n", SSE2NEON::instructionString[it]); 18 | failedCount++; 19 | } else if (ret == SSE2NEON::TEST_UNIMPL) { 20 | printf("Test %-30s skipped\n", SSE2NEON::instructionString[it]); 21 | ignoreCount++; 22 | } else { 23 | printf("Test %-30s passed\n", SSE2NEON::instructionString[it]); 24 | passCount++; 25 | } 26 | } 27 | test->release(); 28 | printf( 29 | "SSE2NEONTest Complete!\n" 30 | "Passed: %d\n" 31 | "Failed: %d\n" 32 | "Ignored: %d\n" 33 | "Coverage rate: %.2f%%\n", 34 | passCount, failedCount, ignoreCount, 35 | (float) passCount / (float) (passCount + failedCount + ignoreCount) * 36 | 100); 37 | 38 | return failedCount ? -1 : 0; 39 | } 40 | --------------------------------------------------------------------------------