├── scripts ├── neon_scrapper │ ├── venv │ │ └── requirements.txt │ ├── coverage.py │ ├── scrapper.py │ └── header_gen.py ├── check-format.sh ├── cross-build.sh ├── cross-test.sh └── intrinsics_coverage.py ├── .gitignore ├── tests ├── binding.h ├── binding.cpp ├── main.cpp ├── debug_tools.cpp ├── debug_tools.h ├── common.h └── common.cpp ├── .github └── workflows │ └── github_actions.yml ├── Makefile ├── .clang-format ├── README.md └── LICENSE /scripts/neon_scrapper/venv/requirements.txt: -------------------------------------------------------------------------------- 1 | pandas 2 | bs4 3 | selenium 4 | webdriver_manager -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Prerequisites 2 | *.d 3 | 4 | # Object files 5 | *.o 6 | *.obj 7 | *.elf 8 | 9 | # Precompiled Headers 10 | *.gch 11 | 12 | # Libraries 13 | *.a 14 | 15 | # Log file 16 | *.log 17 | 18 | # python venv 19 | scripts/neon_scrapper/venv 20 | 21 | main 22 | -------------------------------------------------------------------------------- /scripts/check-format.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -x 4 | 5 | for file in ${SOURCES}; 6 | do 7 | clang-format-17 ${file} > expected-format 8 | diff -u -p --label="${file}" --label="expected coding style" ${file} expected-format 9 | done 10 | exit $(clang-format-17 --output-replacements-xml ${SOURCES} | egrep -c "") 11 | -------------------------------------------------------------------------------- /scripts/cross-build.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Clang/LLVM is natively a cross-compiler. 
4 | # TODO: Do cross-compilation using Clang 5 | # https://clang.llvm.org/docs/CrossCompilation.html 6 | if [ $(printenv CXX | grep clang) ]; then 7 | exit 8 | fi 9 | 10 | set -x 11 | 12 | make clean 13 | make CROSS_COMPILE=riscv64-unknown-elf- build-test || exit 1 # riscv64 14 | 15 | # make clean 16 | # make CROSS_COMPILE=riscv32-unknown-elf- build-test || exit 1 # riscv32 17 | -------------------------------------------------------------------------------- /scripts/cross-test.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Clang/LLVM is natively a cross-compiler. 4 | # TODO: Do cross-compilation using Clang 5 | # https://clang.llvm.org/docs/CrossCompilation.html 6 | if [ $(printenv CXX | grep clang) ]; then 7 | exit 8 | fi 9 | 10 | set -x 11 | 12 | make clean 13 | make CROSS_COMPILE=riscv64-unknown-elf- SIMULATOR_TYPE=${SIMULATOR_TYPE} ENABLE_TEST_ALL=${ENABLE_TEST_ALL} test || exit 1 # riscv64 14 | 15 | # make clean 16 | # make CROSS_COMPILE=riscv32-unknown-elf- test || exit 1 # riscv32 17 | -------------------------------------------------------------------------------- /tests/binding.h: -------------------------------------------------------------------------------- 1 | #ifndef NEON2RVV_BINDING_H 2 | #define NEON2RVV_BINDING_H 3 | 4 | #include 5 | 6 | // The NEON2RVV unit tests run both within our own internal project 7 | // as well as within the open source framework. 8 | // This header file is used to abstract any distinctions between 9 | // those two build environments. 
10 | // 11 | // Initially, this is for how 16 byte aligned memory is allocated 12 | namespace NEON2RVV { 13 | void *platform_aligned_alloc(size_t size); 14 | void platform_aligned_free(void *ptr); 15 | 16 | } // namespace NEON2RVV 17 | 18 | #endif // NEON2RVV_BINDING_H 19 | -------------------------------------------------------------------------------- /tests/binding.cpp: -------------------------------------------------------------------------------- 1 | #include "binding.h" 2 | 3 | #include 4 | #include 5 | 6 | namespace NEON2RVV { 7 | void *platform_aligned_alloc(size_t size) { 8 | void *address; 9 | #if defined(_WIN32) 10 | address = _aligned_malloc(size, 16); 11 | #else 12 | // FIXME 13 | // int ret = posix_memalign(&address, 16, size); 14 | address = malloc(size); 15 | #endif 16 | if (!address) { 17 | fprintf(stderr, "Error at File %s line number %d\n", __FILE__, __LINE__); 18 | exit(EXIT_FAILURE); 19 | } 20 | return address; 21 | } 22 | 23 | void platform_aligned_free(void *ptr) { 24 | #if defined(_WIN32) 25 | _aligned_free(ptr); 26 | #else 27 | free(ptr); 28 | #endif 29 | } 30 | 31 | } // namespace NEON2RVV 32 | -------------------------------------------------------------------------------- /tests/main.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include "impl.h" 4 | 5 | int main(int /*argc*/, const char ** /*argv*/) { 6 | NEON2RVV::NEON2RVV_TEST *test = NEON2RVV::NEON2RVV_TEST::create(); 7 | uint32_t pass_count = 0; 8 | uint32_t failed_count = 0; 9 | uint32_t ignore_count = 0; 10 | for (uint32_t i = 0; i < NEON2RVV::it_last; i++) { 11 | NEON2RVV::INSTRUCTION_TEST test_iter = NEON2RVV::INSTRUCTION_TEST(i); 12 | NEON2RVV::result_t ret = test->run_test(test_iter); 13 | // If the test fails, we will run it again so we can step into the 14 | // debugger and figure out why! 
15 | if (ret == NEON2RVV::TEST_FAIL) { 16 | printf("Test %-30s failed\n", NEON2RVV::instruction_string[test_iter]); 17 | failed_count++; 18 | } else if (ret == NEON2RVV::TEST_UNIMPL) { 19 | printf("Test %-30s skipped\n", NEON2RVV::instruction_string[test_iter]); 20 | ignore_count++; 21 | } else { 22 | printf("Test %-30s passed\n", NEON2RVV::instruction_string[test_iter]); 23 | pass_count++; 24 | } 25 | } 26 | test->release(); 27 | printf( 28 | "NEON2RVV_TEST Complete!\n" 29 | "Passed: %d\n" 30 | "Failed: %d\n" 31 | "Ignored: %d\n" 32 | "Coverage rate: %.2f%%\n", 33 | pass_count, failed_count, ignore_count, (float)pass_count / (pass_count + failed_count + ignore_count) * 100); 34 | 35 | return failed_count ? -1 : 0; 36 | } 37 | -------------------------------------------------------------------------------- /scripts/intrinsics_coverage.py: -------------------------------------------------------------------------------- 1 | import os 2 | import subprocess 3 | 4 | def get_git_root(): 5 | return subprocess.run(["git", "rev-parse", "--show-toplevel"], capture_output=True, text=True).stdout.strip() 6 | 7 | def read_file(file_path): 8 | with open(file_path, 'r') as file: 9 | return file.readlines() 10 | 11 | def expect_impl(s): 12 | not_impl_list = ["p8", "p16", "p32", "p64", "p128", "f16"] 13 | 14 | # print the intrinsics that haven't implemented by expected to 15 | # if not any(not_impl in s for not_impl in not_impl_list): 16 | # if ("// FORCE_INLINE" in s): 17 | # print(s) 18 | return not any(not_impl in s for not_impl in not_impl_list) 19 | 20 | def is_impl(s): 21 | return not("// FORCE_INLINE" in s) 22 | 23 | def main(): 24 | git_root = get_git_root() 25 | file_path = os.path.join(git_root, "neon2rvv.h") 26 | 27 | try: 28 | data = read_file(file_path) 29 | except IOError as e: 30 | print(f"Error reading file: {e}") 31 | return 32 | 33 | mother = [line for line in data if "FORCE_INLINE" in line] 34 | 35 | expected_impl_cnt = sum(1 for line in mother if 
expect_impl(line)) 36 | is_impl_cnt = sum(1 for line in mother if is_impl(line)) 37 | print("expected_impl_cnt: ", expected_impl_cnt) 38 | print("is_impl_cnt: ", is_impl_cnt) 39 | print("ratio: ", float(is_impl_cnt) / float(expected_impl_cnt)) 40 | 41 | if __name__ == "__main__": 42 | main() 43 | -------------------------------------------------------------------------------- /scripts/neon_scrapper/coverage.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import re 3 | import os 4 | 5 | current_dir = os.getcwd() 6 | with open(current_dir+"/../../neon2rvv.h", 'r') as file: 7 | data = file.read() 8 | result = re.findall(r"^FORCE_INLINE .+? (v.+?)\(.*\)", data, flags=re.MULTILINE) 9 | intrinsics = set(result) 10 | 11 | df = pd.read_csv(current_dir+"/neon_intrinsics.csv") 12 | 13 | # for data_type in ["float16_t", "float16x4_t", "float16x8_t", "poly8_t", "poly8x8_t", "poly8x16_t", "poly16_t", "poly16x4_t", "poly16x8_t", "poly64_t", "poly64x1_t", "poly64x2_t", "poly128_t"]: 14 | # df = df[~df["ReturnType"].str.contains(data_type)] 15 | # df = df[~df["Arguments"].str.contains(data_type)] 16 | # df.reset_index() 17 | # df.to_csv("neon_filtered.csv", index=False) 18 | # df_unimplemented = df[~df["Name"].isin(intrinsics)] 19 | # df_unimplemented.to_csv("neon_unimplemented.csv", index=False) 20 | 21 | primary_group_list = [] 22 | secondary_group_list = sorted(list(set(df["Group"].to_list()))) 23 | 24 | for group in secondary_group_list: 25 | primary_group_list.append(group.split(" / ")[0]) 26 | primary_group_list = sorted(list(set(primary_group_list))) 27 | 28 | print("Neon2RVV coverage:") 29 | print("Total", len(intrinsics), "/", len(set(df["Name"].to_list()))) 30 | 31 | for primary_group in primary_group_list: 32 | df_primary = df[df["Group"].str.contains(primary_group)] 33 | primary_set = set(df_primary["Name"].to_list()) 34 | intrinsics_count = len(primary_set) 35 | intersection = 
len(intrinsics.intersection(primary_set)) 36 | print(primary_group, "\t", intersection, "/", intrinsics_count) 37 | 38 | for secondary_group in [group for group in secondary_group_list if primary_group in group]: 39 | df_secondary = df_primary[df_primary["Group"] == secondary_group] 40 | secondary_set = set(df_secondary["Name"].to_list()) 41 | intrinsics_count = len(secondary_set) 42 | intersection = len(intrinsics.intersection(secondary_set)) 43 | print("\t", secondary_group, "\t", intersection, "/", intrinsics_count) 44 | print() 45 | -------------------------------------------------------------------------------- /.github/workflows/github_actions.yml: -------------------------------------------------------------------------------- 1 | name: Github Actions 2 | 3 | on: 4 | push: 5 | branches: [main] 6 | pull_request: 7 | branches: [main] 8 | 9 | jobs: 10 | # run riscv tests 11 | cross_compile_tests: 12 | runs-on: ubuntu-22.04 13 | steps: 14 | - name: checkout code 15 | uses: actions/checkout@v3.2.0 16 | - name: setup riscv toolchain 17 | run: | 18 | mkdir /opt/riscv 19 | export PATH=$PATH:/opt/riscv/bin 20 | wget https://github.com/howjmay/prebuilt-riscv-spike/releases/download/latest/riscv.tar.gz 21 | sudo tar -xzf riscv.tar.gz -C /opt/ 22 | 23 | - name: run tests 24 | run: | 25 | export PATH=$PATH:/opt/riscv/bin 26 | export SIMULATOR_TYPE=qemu 27 | export ENABLE_TEST_ALL=true 28 | sh scripts/cross-test.sh 29 | 30 | # for validate test cases only 31 | check_test_cases: 32 | runs-on: ubuntu-22.04 33 | strategy: 34 | matrix: 35 | arch: [aarch64] 36 | cxx_compiler: [g++-10, clang++-11] 37 | steps: 38 | - name: checkout code 39 | uses: actions/checkout@v3.2.0 40 | - name: build artifact 41 | # The Github Action for non-x86 CPU 42 | # https://github.com/uraimo/run-on-arch-action 43 | uses: uraimo/run-on-arch-action@v2.5.0 44 | with: 45 | arch: ${{ matrix.arch }} 46 | distro: ubuntu20.04 47 | env: | 48 | CXX: ${{ matrix.cxx_compiler }} 49 | install: | 50 | apt-get 
update -q -y 51 | apt-get install -q -y "${{ matrix.cxx_compiler }}" make 52 | apt-get install -q -y gcc 53 | run: | 54 | export ENABLE_TEST_ALL=true 55 | make test 56 | 57 | coding_style: 58 | runs-on: ubuntu-22.04 59 | steps: 60 | - name: checkout code 61 | uses: actions/checkout@v3.2.0 62 | - name: style check 63 | # clang-format version should be set 64 | run: | 65 | sudo apt-get install -q -y clang-format 66 | sh scripts/check-format.sh 67 | shell: bash 68 | -------------------------------------------------------------------------------- /scripts/neon_scrapper/scrapper.py: -------------------------------------------------------------------------------- 1 | from bs4 import BeautifulSoup 2 | from selenium import webdriver 3 | from webdriver_manager.chrome import ChromeDriverManager 4 | from selenium.webdriver.chrome.options import Options 5 | from selenium.webdriver.chrome.service import Service 6 | from selenium.webdriver.common.by import By 7 | from selenium.webdriver.support.wait import WebDriverWait 8 | from selenium.webdriver.support import expected_conditions as EC 9 | 10 | with open("neon_intrinsics.csv", 'a') as file: 11 | file.write("ReturnType,Name,Arguments,Group\n") 12 | options = Options() 13 | options.add_argument('--headless') 14 | options.add_argument('--no-sandbox') 15 | options.add_argument('--disable-dev-shm-usage') 16 | driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options) 17 | 18 | driver.get("https://developer.arm.com/architectures/instruction-sets/intrinsics/#f:@navigationhierarchiessimdisa=[Neon]") 19 | driver.maximize_window() 20 | driver.find_element(By.XPATH, "//button[text()='Accept and hide this message ']").click() 21 | wait = WebDriverWait(driver, 5) 22 | wait.until(EC.presence_of_element_located((By.CLASS_NAME, 'c-table'))) 23 | 24 | sum = 0 25 | for i in range(0, 218): 26 | data = driver.page_source 27 | soup = BeautifulSoup(data, 'html.parser') 28 | table = soup.find_all(lambda tag: 
tag.name == "table" and tag.has_attr("class") and ("c-table" in tag.get("class")))[0] 29 | all_tr = table.find('tbody').find_all('tr') 30 | sum += len(all_tr) 31 | print(i, sum) 32 | for tr in all_tr: 33 | td = tr.find_all('td') 34 | file.write(f"{td[2].string},{td[3].string},\"{td[4].string}\",{td[5].string}\n") 35 | 36 | element = driver.find_element(By.TAG_NAME, "ads-pagination").shadow_root.find_element(By.CLASS_NAME, "c-pagination-action--next") 37 | # element.click() 38 | driver.execute_script("arguments[0].click();", element) 39 | wait = WebDriverWait(driver, 10) 40 | wait.until(EC.presence_of_element_located((By.CLASS_NAME, 'c-table'))) 41 | driver.close() 42 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | ifndef CC 2 | override CC = gcc 3 | endif 4 | 5 | ifndef CXX 6 | override CXX = g++ 7 | endif 8 | 9 | ifndef ENABLE_TEST_ALL 10 | DEFINED_FLAGS = 11 | else 12 | DEFINED_FLAGS = -DENABLE_TEST_ALL 13 | endif 14 | 15 | ifndef CROSS_COMPILE 16 | processor := $(shell uname -m) 17 | ARCH_CFLAGS = -march=armv8.4-a+simd+i8mm+dotprod+sha3 18 | else # CROSS_COMPILE was set 19 | CC = $(CROSS_COMPILE)gcc 20 | CXX = $(CROSS_COMPILE)g++ 21 | CXXFLAGS += -static 22 | LDFLAGS += -static 23 | 24 | check_riscv := $(shell echo | $(CROSS_COMPILE)cpp -dM - | grep " __riscv_xlen " | cut -c22-) 25 | uname_result := $(shell uname -m) 26 | ifeq ($(check_riscv),64) 27 | processor = rv64 28 | else ifeq ($(uname_result),rv64imafdc) 29 | processor = rv64 30 | else ifeq ($(check_riscv),32) 31 | processor = rv32 32 | else ifeq ($(uname_result),rv32i) 33 | processor = rv32 34 | else 35 | $(error Unsupported cross-compiler) 36 | endif 37 | 38 | ARCH_CFLAGS = -march=$(processor)gcv_zba 39 | 40 | ifeq ($(SIMULATOR_TYPE), qemu) 41 | SIMULATOR += qemu-riscv64 42 | SIMULATOR_FLAGS = -cpu $(processor),v=true,zba=true,vlen=128 43 | else 44 | SIMULATOR = spike 45 | 
SIMULATOR_FLAGS = --isa=$(processor)gcv_zba 46 | PROXY_KERNEL = pk 47 | endif 48 | endif 49 | 50 | CXXFLAGS += -Wall -Wcast-qual -I. $(ARCH_CFLAGS) 51 | LDFLAGS += -lm 52 | OBJS = \ 53 | tests/binding.o \ 54 | tests/common.o \ 55 | tests/debug_tools.o \ 56 | tests/impl.o \ 57 | tests/main.o 58 | deps := $(OBJS:%.o=%.o.d) 59 | 60 | .SUFFIXES: .o .cpp 61 | .cpp.o: 62 | $(CXX) -o $@ $(CXXFLAGS) $(DEFINED_FLAGS) -c -MMD -MF $@.d $< 63 | 64 | EXEC = tests/main 65 | 66 | $(EXEC): $(OBJS) 67 | $(CXX) $(LDFLAGS) -o $@ $^ 68 | 69 | test: tests/main 70 | ifeq ($(processor),$(filter $(processor),rv32 rv64)) 71 | $(CC) $(ARCH_CFLAGS) -c neon2rvv.h 72 | endif 73 | $(SIMULATOR) $(SIMULATOR_FLAGS) $(PROXY_KERNEL) $^ 74 | 75 | build-test: tests/main 76 | ifeq ($(processor),$(filter $(processor),rv32 rv64)) 77 | $(CC) $(ARCH_CFLAGS) -c neon2rvv.h 78 | endif 79 | 80 | format: 81 | @echo "Formatting files with clang-format.." 82 | @if ! hash clang-format; then echo "clang-format is required to indent"; fi 83 | clang-format -i neon2rvv.h tests/*.cpp tests/*.h 84 | 85 | .PHONY: clean check format 86 | 87 | clean: 88 | $(RM) $(OBJS) $(EXEC) $(deps) neon2rvv.h.gch 89 | 90 | clean-all: clean 91 | $(RM) *.log 92 | 93 | -include $(deps) 94 | -------------------------------------------------------------------------------- /.clang-format: -------------------------------------------------------------------------------- 1 | --- 2 | Language: Cpp 3 | AccessModifierOffset: -1 4 | AlignAfterOpenBracket: Align 5 | AlignConsecutiveAssignments: false 6 | AlignConsecutiveDeclarations: false 7 | AlignEscapedNewlines: Left 8 | AlignOperands: true 9 | AlignTrailingComments: true 10 | AllowAllParametersOfDeclarationOnNextLine: true 11 | AllowShortBlocksOnASingleLine: false 12 | AllowShortCaseLabelsOnASingleLine: false 13 | AllowShortFunctionsOnASingleLine: All 14 | AllowShortIfStatementsOnASingleLine: false 15 | AllowShortLoopsOnASingleLine: true 16 | AlwaysBreakAfterDefinitionReturnType: None 17 | 
AlwaysBreakAfterReturnType: None 18 | AlwaysBreakBeforeMultilineStrings: true 19 | AlwaysBreakTemplateDeclarations: true 20 | BinPackArguments: true 21 | BinPackParameters: true 22 | BraceWrapping: 23 | AfterClass: false 24 | AfterControlStatement: false 25 | AfterEnum: false 26 | AfterFunction: false 27 | AfterNamespace: false 28 | AfterObjCDeclaration: false 29 | AfterStruct: false 30 | AfterUnion: false 31 | BeforeCatch: false 32 | BeforeElse: false 33 | IndentBraces: false 34 | SplitEmptyFunction: true 35 | SplitEmptyRecord: true 36 | SplitEmptyNamespace: true 37 | BreakBeforeBinaryOperators: None 38 | BreakBeforeBraces: Attach 39 | BreakBeforeInheritanceComma: false 40 | BreakBeforeTernaryOperators: true 41 | BreakConstructorInitializersBeforeComma: false 42 | BreakConstructorInitializers: BeforeColon 43 | BreakAfterJavaFieldAnnotations: false 44 | BreakStringLiterals: true 45 | ColumnLimit: 120 46 | CommentPragmas: '^ IWYU pragma:' 47 | CompactNamespaces: false 48 | ConstructorInitializerAllOnOneLineOrOnePerLine: true 49 | ConstructorInitializerIndentWidth: 4 50 | ContinuationIndentWidth: 4 51 | Cpp11BracedListStyle: true 52 | DerivePointerAlignment: true 53 | DisableFormat: false 54 | ExperimentalAutoDetectBinPacking: false 55 | FixNamespaceComments: true 56 | IncludeCategories: 57 | - Regex: '^<.*\.h>' 58 | Priority: 1 59 | - Regex: '^<.*' 60 | Priority: 2 61 | - Regex: '.*' 62 | Priority: 3 63 | IncludeIsMainRegex: '([-_](tests))?$' 64 | IndentCaseLabels: true 65 | IndentWidth: 2 66 | IndentWrappedFunctionNames: false 67 | KeepEmptyLinesAtTheStartOfBlocks: false 68 | MacroBlockBegin: '' 69 | MacroBlockEnd: '' 70 | MaxEmptyLinesToKeep: 1 71 | NamespaceIndentation: None 72 | PenaltyBreakAssignment: 2 73 | PenaltyBreakBeforeFirstCallParameter: 1 74 | PenaltyBreakComment: 300 75 | PenaltyBreakFirstLessLess: 120 76 | PenaltyBreakString: 1000 77 | PenaltyExcessCharacter: 1000000 78 | PenaltyReturnTypeOnItsOwnLine: 200 79 | PointerAlignment: Left 80 | 
ReflowComments: true 81 | SortIncludes: true 82 | SortUsingDeclarations: true 83 | SpaceAfterCStyleCast: false 84 | SpaceAfterTemplateKeyword: true 85 | SpaceBeforeAssignmentOperators: true 86 | SpaceBeforeParens: ControlStatements 87 | SpaceInEmptyParentheses: false 88 | SpacesBeforeTrailingComments: 2 89 | SpacesInAngles: false 90 | SpacesInContainerLiterals: true 91 | SpacesInCStyleCastParentheses: false 92 | SpacesInParentheses: false 93 | SpacesInSquareBrackets: false 94 | Standard: Auto 95 | TabWidth: 8 96 | UseTab: Never 97 | -------------------------------------------------------------------------------- /scripts/neon_scrapper/header_gen.py: -------------------------------------------------------------------------------- 1 | import os 2 | import csv 3 | 4 | # Declare global variables to hold file contents 5 | file_content = None 6 | neon2rvv_content = None 7 | 8 | # First file processing 9 | with open("neon_intrinsics.csv", newline='') as csvfile: 10 | csv_reader = csv.reader(csvfile) 11 | header = next(csv_reader) # Skip the header line if there is one 12 | # Read the CSV data into a list of rows 13 | file_content = [row for row in csv_reader] 14 | 15 | 16 | def mod_neon2rvv(): 17 | current_dir = os.getcwd() 18 | with open(current_dir + "/../../neon2rvv.h", 'r') as file: 19 | data = file.read() 20 | neon2rvv_content = data 21 | 22 | prev_func_name = "" 23 | for line in file_content: 24 | func_partial = "FORCE_INLINE "+line[0]+" "+line[1]+"(" 25 | if func_partial not in neon2rvv_content: 26 | func = "// FORCE_INLINE "+line[0]+" "+line[1]+line[2]+";\n " 27 | print("func: ", func) 28 | start_idx = neon2rvv_content.find(prev_func_name) 29 | insert_idx = neon2rvv_content.find("FORCE_INLINE", start_idx+1) 30 | neon2rvv_content = neon2rvv_content[:insert_idx] +func+ neon2rvv_content[insert_idx:] 31 | 32 | prev_func_name = func_partial 33 | 34 | with open("modified_neon2rvv.h", 'w') as file: 35 | file.write(neon2rvv_content) 36 | 37 | def mod_impl_h(): 38 | 
current_dir = os.getcwd() 39 | with open(current_dir + "/../../tests/impl.h", 'r') as file: 40 | data = file.read() 41 | neon2rvv_content = data 42 | 43 | required_length = 79 44 | prev_func_name = "" 45 | for line in file_content: 46 | func_partial = "_("+line[1]+")" 47 | if func_partial not in neon2rvv_content: 48 | template = "/*_("+line[1]+")%s*/\\\n" 49 | spaces_needed = required_length - len(template % "") + 1 50 | func = template % (" " * spaces_needed) 51 | print("func: ", func) 52 | start_idx = neon2rvv_content.find(prev_func_name) 53 | insert_idx = neon2rvv_content.find("\n", start_idx+1) 54 | neon2rvv_content = neon2rvv_content[:insert_idx+1] +func+ neon2rvv_content[insert_idx+1:] 55 | 56 | prev_func_name = func_partial 57 | 58 | with open("modified_impl.h", 'w') as file: 59 | file.write(neon2rvv_content) 60 | 61 | def mod_impl_c(): 62 | current_dir = os.getcwd() 63 | with open(current_dir + "/../../tests/impl.cpp", 'r') as file: 64 | data = file.read() 65 | neon2rvv_content = data 66 | 67 | prev_func_name = "" 68 | for line in file_content: 69 | func_partial = "result_t test_"+line[1]+"(" 70 | if func_partial not in neon2rvv_content: 71 | func = "result_t test_"+line[1]+"(const NEON2RVV_TEST_IMPL &impl, uint32_t iter) {\nreturn TEST_UNIMPL;}\n" 72 | print("func: ", func) 73 | start_idx = neon2rvv_content.find(prev_func_name) 74 | insert_idx = neon2rvv_content.find("result_t test_", start_idx+1) 75 | neon2rvv_content = neon2rvv_content[:insert_idx] +func+ neon2rvv_content[insert_idx:] 76 | 77 | prev_func_name = func_partial 78 | 79 | with open("modified_impl.c", 'w') as file: 80 | file.write(neon2rvv_content) 81 | 82 | # mod_neon2rvv() 83 | # mod_impl_h() 84 | # mod_impl_c() 85 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ![coverage badge](https://img.shields.io/badge/coverage-80.1%25-brightgreen) 2 | # neon2rvv 3 | 4 | A 
C/C++ header file that converts Arm/Aarch64 NEON intrinsics to RISC-V Vector (RVV) Extension. 5 | 6 | ## Introduction 7 | 8 | neon2rvv is a translator of Arm/Aarch64 NEON intrinsics to RISC-V Vector (RVV) Extension, shortening the time needed to get a working RISC-V program that then can be used to extract profiles and to identify hot paths in the code. The header file `neon2rvv.h` contains several of the functions provided by the NEON intrinsic header, `arm_neon.h`, only implemented with RISC-V-based counterparts to produce the exact semantics of the intrinsics. 9 | 10 | ## Usage 11 | 12 | * Put the file `neon2rvv.h` into your source code directory. 13 | * Replace the header file `arm_neon.h` with `neon2rvv.h`. 14 | 15 | ```c 16 | #include "neon2rvv.h" 17 | ``` 18 | 19 | * Explicitly specify platform-specific options to gcc/clang compilers 20 | 21 | ```shell 22 | -march=rv64gcv_zba 23 | ``` 24 | 25 | ### Targets and Limitations 26 | 27 | The preliminary stage development goal of neon2rvv is targeting the RV64 architecture with a `128-bit vector register size (vlen == 128)`, which means the implementation is compiled with the `-march=rv64gcv_zba` flag. 28 | 29 | We are using the [RISC-V GNU Compiler Toolchain](https://github.com/riscv-collab/riscv-gnu-toolchain) for development. 30 | 31 | ## Development 32 | 33 | ### Run Tests 34 | 35 | `neon2rvv` provides a unified interface for developing test cases. These test cases are located in the `tests` directory, and the input data is specified at runtime. Use the following commands to run the test cases: 36 | 37 | You can run the tests under the architecture of your current machine. This can help you verify the implementation of test cases if you run the tests on ARM machines. Currently the tests support AArch64 only. 38 | 39 | ```shell 40 | $ make test 41 | ``` 42 | 43 | You can specify a GNU toolchain for cross compilation as well. As the simulator, the RISC-V ISA Simulator [Spike](https://github.com/riscv-software-src/riscv-isa-sim) is used by default. 
44 | 45 | ```shell 46 | $ make CROSS_COMPILE=riscv64-unknown-elf- test # rv64 47 | ``` 48 | 49 | For developers who want to run the cross-compiled tests with [qemu-riscv64](https://www.qemu.org/), you can specify QEMU with the following command. 50 | 51 | ```shell 52 | $ make CROSS_COMPILE=riscv64-unknown-elf- SIMULATOR_TYPE=qemu test 53 | ``` 54 | 55 | ## Official Documents 56 | 57 | * [riscv-v-spec](https://github.com/riscv/riscv-v-spec) 58 | * [rvv-intrinsic-doc](https://github.com/riscv-non-isa/rvv-intrinsic-doc) 59 | * [riscv-c-api](https://github.com/riscv-non-isa/riscv-c-api-doc/blob/master/riscv-c-api.md) 60 | * [NEON Intrinsics](https://developer.arm.com/architectures/instruction-sets/intrinsics) 61 | * Coding for Neon: 62 | * [Part 1: Load and Stores](https://community.arm.com/arm-community-blogs/b/architectures-and-processors-blog/posts/coding-for-neon---part-1-load-and-stores) 63 | * [Part 2: Dealing With Leftovers](https://community.arm.com/arm-community-blogs/b/architectures-and-processors-blog/posts/coding-for-neon---part-2-dealing-with-leftovers) 64 | * [Part 3: Matrix Multiplication](https://community.arm.com/arm-community-blogs/b/architectures-and-processors-blog/posts/coding-for-neon---part-3-matrix-multiplication) 65 | * [Part 4: Shifting Left and Right](https://community.arm.com/arm-community-blogs/b/architectures-and-processors-blog/posts/coding-for-neon---part-4-shifting-left-and-right) 66 | * [Part 5: Rearranging Vectors](https://community.arm.com/arm-community-blogs/b/architectures-and-processors-blog/posts/coding-for-neon---part-5-rearranging-vectors) 67 | 68 | ## References 69 | 70 | * [sse2neon](https://github.com/DLTcollab/sse2neon) 71 | * [rvv_example](https://github.com/brucehoult/rvv_example) 72 | 73 | ## Related Projects 74 | 75 | * [sse2rvv](https://github.com/pattonkan/sse2rvv) 76 | -------------------------------------------------------------------------------- /tests/debug_tools.cpp: 
-------------------------------------------------------------------------------- 1 | #include "debug_tools.h" 2 | #include 3 | #include 4 | 5 | namespace NEON2RVV { 6 | 7 | #if defined(__aarch64__) 8 | #define INT64_ESCAPE "%20lld" 9 | #define UINT64_ESCAPE "%20llu" 10 | #else 11 | #define INT64_ESCAPE "%20ld" 12 | #define UINT64_ESCAPE "%20lu" 13 | #endif 14 | 15 | void print_64_bits_u8_arr(const char *var_name, const uint8_t *u) { 16 | printf("%s0: %3u, %s1: %3u, %s2: %3u, %s3: %3u, %s4: %3u, %s5: %3u, %s6: %3u, %s7: %3u\n", var_name, u[0], var_name, 17 | u[1], var_name, u[2], var_name, u[3], var_name, u[4], var_name, u[5], var_name, u[6], var_name, u[7]); 18 | } 19 | void print_64_bits_s8_arr(const char *var_name, const int8_t *u) { 20 | printf("%s0: %3d, %s1: %3d, %s2: %3d, %s3: %3d, %s4: %3d, %s5: %3d, %s6: %3d, %s7: %3d\n", var_name, u[0], var_name, 21 | u[1], var_name, u[2], var_name, u[3], var_name, u[4], var_name, u[5], var_name, u[6], var_name, u[7]); 22 | } 23 | void print_64_bits_u16_arr(const char *var_name, const uint16_t *u) { 24 | printf("%s0: %5u, %s1: %5u, %s2: %5u, %s3: %5u\n", var_name, u[0], var_name, u[1], var_name, u[2], var_name, u[3]); 25 | } 26 | void print_64_bits_s16_arr(const char *var_name, const int16_t *u) { 27 | printf("%s0: %5d, %s1: %5d, %s2: %5d, %s3: %5d\n", var_name, u[0], var_name, u[1], var_name, u[2], var_name, u[3]); 28 | } 29 | void print_64_bits_u32_arr(const char *var_name, const uint32_t *u) { 30 | printf("%s0: %10u, %s1: %10u\n", var_name, u[0], var_name, u[1]); 31 | } 32 | void print_64_bits_s32_arr(const char *var_name, const int32_t *u) { 33 | printf("%s0: %10d, %s1: %10d\n", var_name, u[0], var_name, u[1]); 34 | } 35 | void print_64_bits_u64_arr(const char *var_name, const uint64_t *u) { 36 | printf("%s0: " UINT64_ESCAPE "\n", var_name, u[0]); 37 | } 38 | void print_64_bits_s64_arr(const char *var_name, const int64_t *u) { 39 | printf("%s0: " INT64_ESCAPE "\n", var_name, u[0]); 40 | } 41 | void 
print_64_bits_f32_arr(const char *var_name, const float *f) { 42 | printf("%s0: %.3f, %s1: %.3f\n", var_name, f[0], var_name, f[1]); 43 | } 44 | void print_64_bits_f64_arr(const char *var_name, const float *f) { printf("%s0: %.3f\n", var_name, f[0]); } 45 | void print_128_bits_u8_arr(const char *var_name, const uint8_t *u) { 46 | printf( 47 | "%s0: %3u, %s1: %3u, %s2: %3u, %s3: %3u, %s4: %3u, %s5: %3u, " 48 | "%s6: %3u, %s7: %3u, %s8: %3u, %s9: %3u, %s10: %3u, %s11: %3u, " 49 | "%s12: %3u, %s13: %3u, %s14: %3u, %s15: %3u\n", 50 | var_name, u[0], var_name, u[1], var_name, u[2], var_name, u[3], var_name, u[4], var_name, u[5], var_name, u[6], 51 | var_name, u[7], var_name, u[8], var_name, u[9], var_name, u[10], var_name, u[11], var_name, u[12], var_name, 52 | u[13], var_name, u[14], var_name, u[15]); 53 | } 54 | void print_128_bits_s8_arr(const char *var_name, const int8_t *u) { 55 | printf( 56 | "%s0: %3d, %s1: %3d, %s2: %3d, %s3: %3d, %s4: %3d, %s5: %3d, " 57 | "%s6: %3d, %s7: %3d, %s8: %3d, %s9: %3d, %s10: %3d, %s11: %3d, " 58 | "%s12: %3d, %s13: %3d, %s14: %3d, %s15: %3d\n", 59 | var_name, u[0], var_name, u[1], var_name, u[2], var_name, u[3], var_name, u[4], var_name, u[5], var_name, u[6], 60 | var_name, u[7], var_name, u[8], var_name, u[9], var_name, u[10], var_name, u[11], var_name, u[12], var_name, 61 | u[13], var_name, u[14], var_name, u[15]); 62 | } 63 | void print_128_bits_u16_arr(const char *var_name, const uint16_t *u) { 64 | printf("%s0: %5u, %s1: %5u, %s2: %5u, %s3: %5u, %s4: %5u, %s5: %5u, %s6: %5u, %s7: %5u\n", var_name, u[0], var_name, 65 | u[1], var_name, u[2], var_name, u[3], var_name, u[4], var_name, u[5], var_name, u[6], var_name, u[7]); 66 | } 67 | void print_128_bits_s16_arr(const char *var_name, const int16_t *u) { 68 | printf("%s0: %5d, %s1: %5d, %s2: %5d, %s3: %5d, %s4: %5d, %s5: %5d, %s6: %5d, %s7: %5d\n", var_name, u[0], var_name, 69 | u[1], var_name, u[2], var_name, u[3], var_name, u[4], var_name, u[5], var_name, u[6], var_name, u[7]); 70 
| } 71 | void print_128_bits_u32_arr(const char *var_name, const uint32_t *u) { 72 | printf("%s0: %10u, %s1: %10u, %s2: %10u, %s3: %10u\n", var_name, u[0], var_name, u[1], var_name, u[2], var_name, 73 | u[3]); 74 | } 75 | void print_128_bits_s32_arr(const char *var_name, const int32_t *u) { 76 | printf("%s0: %10d, %s1: %10d, %s2: %10d, %s3: %10d\n", var_name, u[0], var_name, u[1], var_name, u[2], var_name, 77 | u[3]); 78 | } 79 | void print_128_bits_u64_arr(const char *var_name, const uint64_t *u) { 80 | printf("%s0: " UINT64_ESCAPE ", %s1: " UINT64_ESCAPE "\n", var_name, u[0], var_name, u[1]); 81 | } 82 | void print_128_bits_s64_arr(const char *var_name, const int64_t *u) { 83 | printf("%s0: " INT64_ESCAPE ", %s1: " INT64_ESCAPE "\n", var_name, u[0], var_name, u[1]); 84 | } 85 | void print_128_bits_f32_arr(const char *var_name, const float *f) { 86 | printf("%s0: %.3f, %s1: %.3f, %s2: %.3f, %s3: %.3f\n", var_name, f[0], var_name, f[1], var_name, f[2], var_name, 87 | f[3]); 88 | } 89 | void print_128_bits_f64_arr(const char *var_name, const double *f) { 90 | printf("%s0: %.3f, %s1: %.3f\n", var_name, f[0], var_name, f[1]); 91 | } 92 | 93 | void print_u8_64(const char *var_name, uint8_t u0, uint8_t u1, uint8_t u2, uint8_t u3, uint8_t u4, uint8_t u5, 94 | uint8_t u6, uint8_t u7) { 95 | uint8_t a[] = {u0, u1, u2, u3, u4, u5, u6, u7}; 96 | const uint8_t *u = (const uint8_t *)&a; 97 | print_64_bits_u8_arr(var_name, u); 98 | } 99 | void print_u8_64(const char *var_name, int8_t i0, int8_t i1, int8_t i2, int8_t i3, int8_t i4, int8_t i5, int8_t i6, 100 | int8_t i7) { 101 | int8_t a[] = {i0, i1, i2, i3, i4, i5, i6, i7}; 102 | const uint8_t *u = (const uint8_t *)&a; 103 | print_64_bits_u8_arr(var_name, u); 104 | } 105 | void print_u8_64(const char *var_name, uint16_t u0, uint16_t u1, uint16_t u2, uint16_t u3) { 106 | uint16_t a[] = {u0, u1, u2, u3}; 107 | const uint8_t *u = (const uint8_t *)&a; 108 | print_64_bits_u8_arr(var_name, u); 109 | } 110 | void print_u8_64(const char 
*var_name, int16_t i0, int16_t i1, int16_t i2, int16_t i3) { 111 | int16_t a[] = {i0, i1, i2, i3}; 112 | const uint8_t *u = (const uint8_t *)&a; 113 | print_64_bits_u8_arr(var_name, u); 114 | } 115 | void print_u8_64(const char *var_name, uint32_t u0, uint32_t u1) { 116 | uint32_t a[] = {u0, u1}; 117 | const uint8_t *u = (const uint8_t *)&a; 118 | print_64_bits_u8_arr(var_name, u); 119 | } 120 | void print_u8_64(const char *var_name, int32_t i0, int32_t i1) { 121 | int32_t a[] = {i0, i1}; 122 | const uint8_t *u = (const uint8_t *)&a; 123 | print_64_bits_u8_arr(var_name, u); 124 | } 125 | void print_u8_64(const char *var_name, uint64_t u0) { 126 | uint64_t a[] = {u0}; 127 | const uint8_t *u = (const uint8_t *)&a; 128 | print_64_bits_u8_arr(var_name, u); 129 | } 130 | void print_u8_64(const char *var_name, int64_t i0) { 131 | int64_t a[] = {i0}; 132 | const uint8_t *u = (const uint8_t *)&a; 133 | print_64_bits_u8_arr(var_name, u); 134 | } 135 | 136 | void print_u8_128(const char *var_name, uint8_t u0, uint8_t u1, uint8_t u2, uint8_t u3, uint8_t u4, uint8_t u5, 137 | uint8_t u6, uint8_t u7, uint8_t u8, uint8_t u9, uint8_t u10, uint8_t u11, uint8_t u12, uint8_t u13, 138 | uint8_t u14, uint8_t u15) { 139 | uint8_t a[] = {u0, u1, u2, u3, u4, u5, u6, u7, u8, u9, u10, u11, u12, u13, u14, u15}; 140 | const uint8_t *u = (const uint8_t *)&a; 141 | print_128_bits_u8_arr(var_name, u); 142 | } 143 | void print_u8_128(const char *var_name, int8_t i0, int8_t i1, int8_t i2, int8_t i3, int8_t i4, int8_t i5, int8_t i6, 144 | int8_t i7, int8_t i8, int8_t i9, int8_t i10, int8_t i11, int8_t i12, int8_t i13, int8_t i14, 145 | int8_t i15) { 146 | int8_t a[] = {i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11, i12, i13, i14, i15}; 147 | const uint8_t *u = (const uint8_t *)&a; 148 | print_128_bits_u8_arr(var_name, u); 149 | } 150 | void print_u8_128(const char *var_name, uint16_t u0, uint16_t u1, uint16_t u2, uint16_t u3, uint16_t u4, uint16_t u5, 151 | uint16_t u6, uint16_t u7) { 152 | 
uint16_t a[] = {u0, u1, u2, u3, u4, u5, u6, u7}; 153 | const uint8_t *u = (const uint8_t *)&a; 154 | print_128_bits_u8_arr(var_name, u); 155 | } 156 | void print_u8_128(const char *var_name, int16_t i0, int16_t i1, int16_t i2, int16_t i3, int16_t i4, int16_t i5, 157 | int16_t i6, int16_t i7) { 158 | int16_t a[] = {i0, i1, i2, i3, i4, i5, i6, i7}; 159 | const uint8_t *u = (const uint8_t *)&a; 160 | print_128_bits_u8_arr(var_name, u); 161 | } 162 | void print_u8_128(const char *var_name, uint32_t u0, uint32_t u1, uint32_t u2, uint32_t u3) { 163 | uint32_t a[] = {u0, u1, u2, u3}; 164 | const uint8_t *u = (const uint8_t *)&a; 165 | print_128_bits_u8_arr(var_name, u); 166 | } 167 | void print_u8_128(const char *var_name, int32_t i0, int32_t i1, int32_t i2, int32_t i3) { 168 | int32_t a[] = {i0, i1, i2, i3}; 169 | const uint8_t *u = (const uint8_t *)&a; 170 | print_128_bits_u8_arr(var_name, u); 171 | } 172 | void print_u8_128(const char *var_name, uint64_t u0, uint64_t u1) { 173 | uint64_t a[] = {u0, u1}; 174 | const uint8_t *u = (const uint8_t *)&a; 175 | print_128_bits_u8_arr(var_name, u); 176 | } 177 | void print_u8_128(const char *var_name, int64_t i0, int64_t i1) { 178 | int64_t a[] = {i0, i1}; 179 | const uint8_t *u = (const uint8_t *)&a; 180 | print_128_bits_u8_arr(var_name, u); 181 | } 182 | 183 | } // namespace NEON2RVV 184 | -------------------------------------------------------------------------------- /tests/debug_tools.h: -------------------------------------------------------------------------------- 1 | #ifndef NEON2RVV_DEBUG_TOOLS_H 2 | #define NEON2RVV_DEBUG_TOOLS_H 3 | 4 | #include 5 | #include 6 | 7 | #if defined(__riscv) || defined(__riscv__) 8 | #include "neon2rvv.h" 9 | #elif (defined(__aarch64__) || defined(_M_ARM64)) || defined(__arm__) 10 | #include 11 | #endif 12 | 13 | namespace NEON2RVV { 14 | 15 | void print_64_bits_u8_arr(const char *var_name, const uint8_t *u); 16 | void print_64_bits_s8_arr(const char *var_name, const int8_t *u); 17 | 
void print_64_bits_u16_arr(const char *var_name, const uint16_t *u); 18 | void print_64_bits_s16_arr(const char *var_name, const int16_t *u); 19 | void print_64_bits_u32_arr(const char *var_name, const uint32_t *u); 20 | void print_64_bits_s32_arr(const char *var_name, const int32_t *u); 21 | void print_64_bits_u64_arr(const char *var_name, const uint64_t *u); 22 | void print_64_bits_s64_arr(const char *var_name, const int64_t *u); 23 | void print_64_bits_f32_arr(const char *var_name, const float *f); 24 | void print_64_bits_f64_arr(const char *var_name, const double *f); 25 | void print_128_bits_u8_arr(const char *var_name, const uint8_t *u); 26 | void print_128_bits_s8_arr(const char *var_name, const int8_t *u); 27 | void print_128_bits_u16_arr(const char *var_name, const uint16_t *u); 28 | void print_128_bits_s16_arr(const char *var_name, const int16_t *u); 29 | void print_128_bits_u32_arr(const char *var_name, const uint32_t *u); 30 | void print_128_bits_s32_arr(const char *var_name, const int32_t *u); 31 | void print_128_bits_u64_arr(const char *var_name, const uint64_t *u); 32 | void print_128_bits_s64_arr(const char *var_name, const int64_t *u); 33 | void print_128_bits_f32_arr(const char *var_name, const float *f); 34 | void print_128_bits_f64_arr(const char *var_name, const double *f); 35 | 36 | void print_u8_64(const char *var_name, uint8_t u0, uint8_t u1, uint8_t u2, uint8_t u3, uint8_t u4, uint8_t u5, 37 | uint8_t u6, uint8_t u7); 38 | void print_u8_64(const char *var_name, int8_t i0, int8_t i1, int8_t i2, int8_t i3, int8_t i4, int8_t i5, int8_t i6, 39 | int8_t i7); 40 | void print_u8_64(const char *var_name, uint16_t u0, uint16_t u1, uint16_t u2, uint16_t u3); 41 | void print_u8_64(const char *var_name, int16_t i0, int16_t i1, int16_t i2, int16_t i3); 42 | void print_u8_64(const char *var_name, uint32_t u0, uint32_t u1); 43 | void print_u8_64(const char *var_name, int32_t i0, int32_t i1); 44 | void print_u8_64(const char *var_name, uint64_t u0); 45 | 
void print_u8_64(const char *var_name, int64_t i0); 46 | 47 | void print_u8_128(const char *var_name, uint8_t u0, uint8_t u1, uint8_t u2, uint8_t u3, uint8_t u4, uint8_t u5, 48 | uint8_t u6, uint8_t u7, uint8_t u8, uint8_t u9, uint8_t u10, uint8_t u11, uint8_t u12, uint8_t u13, 49 | uint8_t u14, uint8_t u15); 50 | void print_u8_128(const char *var_name, int8_t i0, int8_t i1, int8_t i2, int8_t i3, int8_t i4, int8_t i5, int8_t i6, 51 | int8_t i7, int8_t i8, int8_t i9, int8_t i10, int8_t i11, int8_t i12, int8_t i13, int8_t i14, 52 | int8_t i15); 53 | void print_u8_128(const char *var_name, uint16_t u0, uint16_t u1, uint16_t u2, uint16_t u3, uint16_t u4, uint16_t u5, 54 | uint16_t u6, uint16_t u7); 55 | void print_u8_128(const char *var_name, int16_t i0, int16_t i1, int16_t i2, int16_t i3, int16_t i4, int16_t i5, 56 | int16_t i6, int16_t i7); 57 | void print_u8_128(const char *var_name, uint32_t u0, uint32_t u1, uint32_t u2, uint32_t u3); 58 | void print_u8_128(const char *var_name, int32_t i0, int32_t i1, int32_t i2, int32_t i3); 59 | void print_u8_128(const char *var_name, uint64_t u0, uint64_t u1); 60 | void print_u8_128(const char *var_name, int64_t i0, int64_t i1); 61 | 62 | template 63 | void print_u8_64(const char *var_name, T a) { 64 | const uint8_t *u = (const uint8_t *)&a; 65 | print_64_bits_u8_arr(var_name, u); 66 | } 67 | template 68 | void print_u8_64(const char *var_name, T *a) { 69 | const uint8_t *u = (const uint8_t *)a; 70 | print_64_bits_u8_arr(var_name, u); 71 | } 72 | template 73 | void print_s8_64(const char *var_name, T a) { 74 | const int8_t *s = (const int8_t *)&a; 75 | print_64_bits_s8_arr(var_name, s); 76 | } 77 | template 78 | void print_s8_64(const char *var_name, T *a) { 79 | const int8_t *s = (const int8_t *)a; 80 | print_64_bits_s8_arr(var_name, s); 81 | } 82 | template 83 | void print_u16_64(const char *var_name, T a) { 84 | const uint16_t *u = (const uint16_t *)&a; 85 | print_64_bits_u16_arr(var_name, u); 86 | } 87 | template 88 | void 
print_u16_64(const char *var_name, T *a) { 89 | const uint16_t *u = (const uint16_t *)a; 90 | print_64_bits_u16_arr(var_name, u); 91 | } 92 | template 93 | void print_s16_64(const char *var_name, T a) { 94 | const int16_t *s = (const int16_t *)&a; 95 | print_64_bits_s16_arr(var_name, s); 96 | } 97 | template 98 | void print_s16_64(const char *var_name, T *a) { 99 | const int16_t *s = (const int16_t *)a; 100 | print_64_bits_s16_arr(var_name, s); 101 | } 102 | template 103 | void print_u32_64(const char *var_name, T a) { 104 | const uint32_t *u = (const uint32_t *)&a; 105 | print_64_bits_u32_arr(var_name, u); 106 | } 107 | template 108 | void print_u32_64(const char *var_name, T *a) { 109 | const uint32_t *u = (const uint32_t *)a; 110 | print_64_bits_u32_arr(var_name, u); 111 | } 112 | template 113 | void print_s32_64(const char *var_name, T a) { 114 | const int32_t *s = (const int32_t *)&a; 115 | print_64_bits_s32_arr(var_name, s); 116 | } 117 | template 118 | void print_s32_64(const char *var_name, T *a) { 119 | const int32_t *s = (const int32_t *)a; 120 | print_64_bits_s32_arr(var_name, s); 121 | } 122 | template 123 | void print_u64_64(const char *var_name, T a) { 124 | const uint64_t *u = (const uint64_t *)&a; 125 | print_64_bits_u64_arr(var_name, u); 126 | } 127 | template 128 | void print_u64_64(const char *var_name, T *a) { 129 | const uint64_t *u = (const uint64_t *)a; 130 | print_64_bits_u64_arr(var_name, u); 131 | } 132 | template 133 | void print_s64_64(const char *var_name, T a) { 134 | const int64_t *s = (const int64_t *)&a; 135 | print_64_bits_s64_arr(var_name, s); 136 | } 137 | template 138 | void print_s64_64(const char *var_name, T *a) { 139 | const int64_t *s = (const int64_t *)a; 140 | print_64_bits_s64_arr(var_name, s); 141 | } 142 | template 143 | void print_f32_64(const char *var_name, T a) { 144 | const float *f = (const float *)&a; 145 | print_64_bits_f32_arr(var_name, f); 146 | } 147 | template 148 | void print_f32_64(const char *var_name, T 
*a) { 149 | const float *f = (const float *)a; 150 | print_64_bits_f32_arr(var_name, f); 151 | } 152 | template 153 | void print_f64_64(const char *var_name, T a) { 154 | const double *f = (const double *)&a; 155 | print_64_bits_f64_arr(var_name, f); 156 | } 157 | template 158 | void print_f64_64(const char *var_name, T *a) { 159 | const double *f = (const double *)a; 160 | print_64_bits_f64_arr(var_name, f); 161 | } 162 | template 163 | void print_u8_128(const char *var_name, T a) { 164 | const uint8_t *u = (const uint8_t *)&a; 165 | print_128_bits_u8_arr(var_name, u); 166 | } 167 | template 168 | void print_u8_128(const char *var_name, T *a) { 169 | const uint8_t *u = (const uint8_t *)a; 170 | print_128_bits_u8_arr(var_name, u); 171 | } 172 | template 173 | void print_s8_128(const char *var_name, T a) { 174 | const int8_t *u = (const int8_t *)&a; 175 | print_128_bits_s8_arr(var_name, u); 176 | } 177 | template 178 | void print_s8_128(const char *var_name, T *a) { 179 | const int8_t *u = (const int8_t *)a; 180 | print_128_bits_s8_arr(var_name, u); 181 | } 182 | template 183 | void print_u16_128(const char *var_name, T a) { 184 | const uint16_t *u = (const uint16_t *)&a; 185 | print_128_bits_u16_arr(var_name, u); 186 | } 187 | template 188 | void print_u16_128(const char *var_name, T *a) { 189 | const uint16_t *u = (const uint16_t *)a; 190 | print_128_bits_u16_arr(var_name, u); 191 | } 192 | template 193 | void print_s16_128(const char *var_name, T a) { 194 | const int16_t *u = (const int16_t *)&a; 195 | print_128_bits_s16_arr(var_name, u); 196 | } 197 | template 198 | void print_s16_128(const char *var_name, T *a) { 199 | const int16_t *u = (const int16_t *)a; 200 | print_128_bits_s16_arr(var_name, u); 201 | } 202 | template 203 | void print_u32_128(const char *var_name, T a) { 204 | const uint32_t *u = (const uint32_t *)&a; 205 | print_128_bits_u32_arr(var_name, u); 206 | } 207 | template 208 | void print_u32_128(const char *var_name, T *a) { 209 | const uint32_t 
*u = (const uint32_t *)a; 210 | print_128_bits_u32_arr(var_name, u); 211 | } 212 | template 213 | void print_s32_128(const char *var_name, T a) { 214 | const int32_t *u = (const int32_t *)&a; 215 | print_128_bits_s32_arr(var_name, u); 216 | } 217 | template 218 | void print_s32_128(const char *var_name, T *a) { 219 | const int32_t *u = (const int32_t *)a; 220 | print_128_bits_s32_arr(var_name, u); 221 | } 222 | template 223 | void print_u64_128(const char *var_name, T a) { 224 | const uint64_t *u = (const uint64_t *)&a; 225 | print_128_bits_u64_arr(var_name, u); 226 | } 227 | template 228 | void print_u64_128(const char *var_name, T *a) { 229 | const uint64_t *u = (const uint64_t *)a; 230 | print_128_bits_u64_arr(var_name, u); 231 | } 232 | template 233 | void print_s64_128(const char *var_name, T a) { 234 | const int64_t *u = (const int64_t *)&a; 235 | print_128_bits_s64_arr(var_name, u); 236 | } 237 | template 238 | void print_s64_128(const char *var_name, T *a) { 239 | const int64_t *u = (const int64_t *)a; 240 | print_128_bits_s64_arr(var_name, u); 241 | } 242 | template 243 | void print_f32_128(const char *var_name, T a) { 244 | const float *f = (const float *)&a; 245 | print_128_bits_f32_arr(var_name, f); 246 | } 247 | template 248 | void print_f32_128(const char *var_name, T *a) { 249 | const float *f = (const float *)a; 250 | print_128_bits_f32_arr(var_name, f); 251 | } 252 | template 253 | void print_f64_128(const char *var_name, T a) { 254 | const double *f = (const double *)&a; 255 | print_128_bits_f64_arr(var_name, f); 256 | } 257 | template 258 | void print_f64_128(const char *var_name, T *a) { 259 | const double *f = (const double *)a; 260 | print_128_bits_f64_arr(var_name, f); 261 | } 262 | 263 | } // namespace NEON2RVV 264 | 265 | #endif // NEON2RVV_DEBUG_TOOLS_H 266 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache 
License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 
39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. 
Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /tests/common.h: -------------------------------------------------------------------------------- 1 | #ifndef NEON2RVV_COMMON_H 2 | #define NEON2RVV_COMMON_H 3 | 4 | #include 5 | #include 6 | 7 | #if defined(__riscv) || defined(__riscv__) 8 | #include "neon2rvv.h" 9 | #elif (defined(__aarch64__) || defined(_M_ARM64)) || defined(__arm__) 10 | #include 11 | 12 | #if defined(__GNUC__) || defined(__clang__) 13 | #pragma push_macro("ALIGN_STRUCT") 14 | #define ALIGN_STRUCT(x) __attribute__((aligned(x))) 15 | #else 16 | #define ALIGN_STRUCT(x) __declspec(align(x)) 17 | #endif // defined(__GNUC__) || defined(__clang__) 18 | 19 | typedef union ALIGN_STRUCT(16) SIMDVec { 20 | float m128_f32[4]; // as floats - DON'T USE. Added for convenience. 21 | int8_t m128_i8[16]; // as signed 8-bit integers. 22 | int16_t m128_i16[8]; // as signed 16-bit integers. 23 | int32_t m128_i32[4]; // as signed 32-bit integers. 24 | int64_t m128_i64[2]; // as signed 64-bit integers. 25 | uint8_t m128_u8[16]; // as unsigned 8-bit integers. 
26 | uint16_t m128_u16[8]; // as unsigned 16-bit integers. 27 | uint32_t m128_u32[4]; // as unsigned 32-bit integers. 28 | uint64_t m128_u64[2]; // as unsigned 64-bit integers. 29 | } SIMDVec; 30 | 31 | #if defined(__GNUC__) || defined(__clang__) 32 | #pragma pop_macro("ALIGN_STRUCT") 33 | #endif // defined(__GNUC__) || defined(__clang__) 34 | #endif // defined(__riscv) || defined(__riscv__) 35 | 36 | #define ASSERT_RETURN(x) \ 37 | if (!(x)) \ 38 | return TEST_FAIL; 39 | 40 | namespace NEON2RVV { 41 | 42 | enum result_t { 43 | TEST_SUCCESS = 1, 44 | TEST_FAIL = 0, 45 | TEST_UNIMPL = -1, 46 | }; 47 | extern int32_t NaN; 48 | extern int64_t NaN64; 49 | #define ALL_BIT_1_32 (*(float *)&NaN) 50 | #define ALL_BIT_1_64 (*(double *)&NaN64) 51 | 52 | template 53 | result_t validate_64_bits(T a, U b) { 54 | const int32_t *t1 = (const int32_t *)&a; 55 | const int32_t *t2 = (const int32_t *)&b; 56 | 57 | ASSERT_RETURN(t1[0] == t2[0]); 58 | ASSERT_RETURN(t1[1] == t2[1]); 59 | return TEST_SUCCESS; 60 | } 61 | template 62 | result_t validate_128_bits(T a, U b) { 63 | const int32_t *t1 = (const int32_t *)&a; 64 | const int32_t *t2 = (const int32_t *)&b; 65 | 66 | ASSERT_RETURN(t1[0] == t2[0]); 67 | ASSERT_RETURN(t1[1] == t2[1]); 68 | ASSERT_RETURN(t1[2] == t2[2]); 69 | ASSERT_RETURN(t1[3] == t2[3]); 70 | return TEST_SUCCESS; 71 | } 72 | result_t validate_int64(int64x2_t a, int64_t i0, int64_t i1); 73 | result_t validate_uint64(uint64x2_t a, uint64_t u0, uint64_t u1); 74 | result_t validate_int64(int64x2x2_t a, int64_t i0, int64_t i1, int64_t i2, int64_t i3); 75 | result_t validate_uint64(uint64x2x2_t a, uint64_t u0, uint64_t u1, uint64_t u2, uint64_t u3); 76 | result_t validate_int64(int64x1_t a, int64_t i0); 77 | result_t validate_uint64(uint64x1_t a, uint64_t u0); 78 | result_t validate_int64(int64x1x2_t a, int64_t i0, int64_t i1); 79 | result_t validate_uint64(uint64x1x2_t a, uint64_t u0, uint64_t u1); 80 | result_t validate_int32(int32x4_t a, int32_t i0, int32_t i1, int32_t 
i2, int32_t i3); 81 | result_t validate_uint32(uint32x4_t a, uint32_t u0, uint32_t u1, uint32_t u2, uint32_t u3); 82 | result_t validate_int32(int32x4x2_t a, int32_t i0, int32_t i1, int32_t i2, int32_t i3, int32_t i4, int32_t i5, 83 | int32_t i6, int32_t i7); 84 | result_t validate_uint32(uint32x4x2_t a, uint32_t u0, uint32_t u1, uint32_t u2, uint32_t u3, uint32_t u4, uint32_t u5, 85 | uint32_t u6, uint32_t u7); 86 | result_t validate_int32(int32x2_t a, int32_t i0, int32_t i1); 87 | result_t validate_uint32(uint32x2_t a, uint32_t u0, uint32_t u1); 88 | result_t validate_int32(int32x2x2_t a, int32_t i0, int32_t i1, int32_t i2, int32_t i3); 89 | result_t validate_uint32(uint32x2x2_t a, uint32_t u0, uint32_t u1, uint32_t u2, uint32_t u3); 90 | result_t validate_int16(int16x8_t a, int16_t i0, int16_t i1, int16_t i2, int16_t i3, int16_t i4, int16_t i5, int16_t i6, 91 | int16_t i7); 92 | result_t validate_uint16(uint16x8_t a, uint16_t u0, uint16_t u1, uint16_t u2, uint16_t u3, uint16_t u4, uint16_t u5, 93 | uint16_t u6, uint16_t u7); 94 | result_t validate_int16(int16x8x2_t a, int16_t i0, int16_t i1, int16_t i2, int16_t i3, int16_t i4, int16_t i5, 95 | int16_t i6, int16_t i7, int16_t i8, int16_t i9, int16_t i10, int16_t i11, int16_t i12, 96 | int16_t i13, int16_t i14, int16_t i15); 97 | result_t validate_uint16(uint16x8x2_t a, uint16_t u0, uint16_t u1, uint16_t u2, uint16_t u3, uint16_t u4, uint16_t u5, 98 | uint16_t u6, uint16_t u7, uint16_t u8, uint16_t u9, uint16_t u10, uint16_t u11, uint16_t u12, 99 | uint16_t u13, uint16_t u14, uint16_t u15); 100 | result_t validate_int16(int16x4_t a, int16_t i0, int16_t i1, int16_t i2, int16_t i3); 101 | result_t validate_uint16(uint16x4_t a, uint16_t u0, uint16_t u1, uint16_t u2, uint16_t u3); 102 | result_t validate_int16(int16x4x2_t a, int16_t i0, int16_t i1, int16_t i2, int16_t i3, int16_t i4, int16_t i5, 103 | int16_t i6, int16_t i7); 104 | result_t validate_uint16(uint16x4x2_t a, uint16_t u0, uint16_t u1, uint16_t u2, 
uint16_t u3, uint16_t u4, uint16_t u5, 105 | uint16_t u6, uint16_t u7); 106 | result_t validate_int8(int8x16_t a, int8_t i0, int8_t i1, int8_t i2, int8_t i3, int8_t i4, int8_t i5, int8_t i6, 107 | int8_t i7, int8_t i8, int8_t i9, int8_t i10, int8_t i11, int8_t i12, int8_t i13, int8_t i14, 108 | int8_t i15); 109 | result_t validate_uint8(uint8x16_t a, uint8_t u0, uint8_t u1, uint8_t u2, uint8_t u3, uint8_t u4, uint8_t u5, 110 | uint8_t u6, uint8_t u7, uint8_t u8, uint8_t u9, uint8_t u10, uint8_t u11, uint8_t u12, 111 | uint8_t u13, uint8_t u14, uint8_t u15); 112 | result_t validate_int8(int8x16x2_t a, int8_t i0, int8_t i1, int8_t i2, int8_t i3, int8_t i4, int8_t i5, int8_t i6, 113 | int8_t i7, int8_t i8, int8_t i9, int8_t i10, int8_t i11, int8_t i12, int8_t i13, int8_t i14, 114 | int8_t i15, int8_t i16, int8_t i17, int8_t i18, int8_t i19, int8_t i20, int8_t i21, int8_t i22, 115 | int8_t i23, int8_t i24, int8_t i25, int8_t i26, int8_t i27, int8_t i28, int8_t i29, int8_t i30, 116 | int8_t i31); 117 | result_t validate_uint8(uint8x16x2_t u, uint8_t u0, uint8_t u1, uint8_t u2, uint8_t u3, uint8_t u4, uint8_t u5, 118 | uint8_t u6, uint8_t u7, uint8_t u8, uint8_t u9, uint8_t u10, uint8_t u11, uint8_t u12, 119 | uint8_t u13, uint8_t u14, uint8_t u15, uint8_t u16, uint8_t u17, uint8_t u18, uint8_t u19, 120 | uint8_t u20, uint8_t u21, uint8_t u22, uint8_t u23, uint8_t u24, uint8_t u25, uint8_t u26, 121 | uint8_t u27, uint8_t u28, uint8_t u29, uint8_t u30, uint8_t u31); 122 | result_t validate_int8(int8x8_t a, int8_t i0, int8_t i1, int8_t i2, int8_t i3, int8_t i4, int8_t i5, int8_t i6, 123 | int8_t i7); 124 | result_t validate_uint8(uint8x8_t a, uint8_t u0, uint8_t u1, uint8_t u2, uint8_t u3, uint8_t u4, uint8_t u5, uint8_t u6, 125 | uint8_t u7); 126 | result_t validate_int8(int8x8x2_t a, int8_t i0, int8_t i1, int8_t i2, int8_t i3, int8_t i4, int8_t i5, int8_t i6, 127 | int8_t i7, int8_t i8, int8_t i9, int8_t i10, int8_t i11, int8_t i12, int8_t i13, int8_t i14, 128 | int8_t 
i15); 129 | result_t validate_uint8(uint8x8x2_t a, uint8_t u0, uint8_t u1, uint8_t u2, uint8_t u3, uint8_t u4, uint8_t u5, 130 | uint8_t u6, uint8_t u7, uint8_t u8, uint8_t u9, uint8_t u10, uint8_t u11, uint8_t u12, 131 | uint8_t u13, uint8_t u14, uint8_t u15); 132 | result_t validate_float(float32x4_t a, float f0, float f1, float f2, float f3); 133 | result_t validate_float(float32x4x2_t a, float f0, float f1, float f2, float f3, float f4, float f5, float f6, 134 | float f7); 135 | result_t validate_float(float32x2_t a, float f0, float f1); 136 | result_t validate_float(float32x2x2_t a, float f0, float f1, float f2, float f3); 137 | result_t validate_float_error(float32x4_t a, float f0, float f1, float f2, float f3, float err); 138 | result_t validate_float_error(float32x2_t a, float f0, float f1, float err); 139 | result_t validate_float_error(float32_t a, float f0, float err); 140 | result_t validate_double(float64x2_t a, double d0, double d1); 141 | result_t validate_double(float64x2x2_t a, double d0, double d1, double d2, double d3); 142 | result_t validate_double(float64x1_t a, double d0); 143 | result_t validate_double(float64x1x2_t a, double d0, double d1); 144 | result_t validate_double_error(float64x2_t a, double d0, double d1, double err); 145 | result_t validate_double_error(float64x1_t a, double d0, double err); 146 | result_t validate_double_error(double a, double d0, double err); 147 | result_t validate_float_pair(float a, float b); 148 | 149 | int8_t saturate_int8(int a); 150 | uint8_t saturate_uint8(unsigned int a); 151 | int16_t saturate_int16(int a); 152 | uint16_t saturate_uint16(unsigned int a); 153 | int32_t saturate_int32(int64_t a); 154 | uint32_t saturate_uint32(uint64_t a); 155 | 156 | #define TEST_SATURATE_ADD_SUB(BIT) \ 157 | static inline int##BIT##_t sat_add(int##BIT##_t a, int##BIT##_t b) { \ 158 | if (a > 0 && b > INT##BIT##_MAX - a) { \ 159 | return INT##BIT##_MAX; \ 160 | } else if (a < 0 && b < INT##BIT##_MIN - a) { \ 161 | return 
INT##BIT##_MIN; \ 162 | } else \ 163 | return a + b; \ 164 | } \ 165 | static inline uint##BIT##_t sat_add(uint##BIT##_t a, uint##BIT##_t b) { \ 166 | uint##BIT##_t r = a + b; \ 167 | r |= -(r < a); \ 168 | return r; \ 169 | } \ 170 | static inline int##BIT##_t sat_sub(int##BIT##_t a, int##BIT##_t b) { \ 171 | if (b > 0 && a < INT##BIT##_MIN + b) { \ 172 | return INT##BIT##_MIN; \ 173 | } else if (b < 0 && a > INT##BIT##_MAX + b) { \ 174 | return INT##BIT##_MAX; \ 175 | } else \ 176 | return a - b; \ 177 | } \ 178 | static inline uint##BIT##_t sat_sub(uint##BIT##_t a, uint##BIT##_t b) { \ 179 | uint##BIT##_t r = a - b; \ 180 | r &= -(r <= a); \ 181 | return r; \ 182 | } 183 | TEST_SATURATE_ADD_SUB(8) 184 | TEST_SATURATE_ADD_SUB(16) 185 | TEST_SATURATE_ADD_SUB(32) 186 | TEST_SATURATE_ADD_SUB(64) 187 | 188 | // CBIT stands for current bit size, and HBIT stands for higher bit size 189 | #define TEST_SATURATE_DMUL(CBIT, HBIT) \ 190 | static inline int##HBIT##_t sat_dmull(int##CBIT##_t a, int##CBIT##_t b) { \ 191 | int##HBIT##_t tmp = (int##HBIT##_t)a * (int##HBIT##_t)b; \ 192 | return tmp > INT##HBIT##_MAX / 2 ? INT##HBIT##_MAX : tmp < INT##HBIT##_MIN / 2 ? INT##HBIT##_MIN : tmp * 2; \ 193 | } \ 194 | static inline uint##HBIT##_t sat_dmull(uint##CBIT##_t a, uint##CBIT##_t b) { \ 195 | uint##HBIT##_t tmp = a * b; \ 196 | return tmp > UINT##HBIT##_MAX / 2 ? 
UINT##HBIT##_MAX : tmp * 2; \ 197 | } \ 198 | static inline int##HBIT##_t sat_rdmulh(int##CBIT##_t a, int##CBIT##_t b) { \ 199 | int##HBIT##_t tmp = sat_dmull(a, b); \ 200 | tmp = (tmp >> (CBIT - 1)) + 1; \ 201 | return saturate_int##CBIT(tmp >> 1); \ 202 | } \ 203 | static inline int##CBIT##_t sat_rdmlah(int##CBIT##_t a, int##CBIT##_t b, int##CBIT##_t c) { \ 204 | int##HBIT##_t tmp = sat_dmull(b, c); \ 205 | tmp = sat_add(tmp, (int##HBIT##_t)(1 << (CBIT - 1))); \ 206 | return sat_add(a, (int##CBIT##_t)(tmp >> CBIT)); \ 207 | } \ 208 | static inline int##CBIT##_t sat_rdmlsh(int##CBIT##_t a, int##CBIT##_t b, int##CBIT##_t c) { \ 209 | int##HBIT##_t tmp = sat_dmull(b, c); \ 210 | tmp = sat_sub(tmp, (int##HBIT##_t)(1 << (CBIT - 1))); \ 211 | return sat_sub(a, (int##CBIT##_t)(tmp >> CBIT)); \ 212 | } 213 | TEST_SATURATE_DMUL(8, 16) 214 | TEST_SATURATE_DMUL(16, 32) 215 | TEST_SATURATE_DMUL(32, 64) 216 | 217 | #define TEST_SATURATE_SHIFT(CBIT, HBIT) \ 218 | static inline int##CBIT##_t sat_rshr(int##CBIT##_t a, int##CBIT##_t b) { \ 219 | return ((int##HBIT##_t)a + (1 << (-b - 1))) >> (-b); \ 220 | } \ 221 | static inline uint##CBIT##_t sat_rshr(uint##CBIT##_t a, int##CBIT##_t b) { \ 222 | return ((uint##HBIT##_t)a + (1 << (-b - 1))) >> (-b); \ 223 | } 224 | TEST_SATURATE_SHIFT(8, 16) 225 | TEST_SATURATE_SHIFT(16, 32) 226 | TEST_SATURATE_SHIFT(32, 64) 227 | 228 | #if defined(__riscv) || defined(__riscv__) 229 | #define DEFINE_TUPLEx2_GET(TYPE, SIGN, BIT, ELT_NUM) \ 230 | __attribute__((unused)) static void TYPE##x2_get_##TYPE(TYPE##x##ELT_NUM##x2_t a, TYPE##x##ELT_NUM##_t *a0, \ 231 | TYPE##x##ELT_NUM##_t *a1) { \ 232 | *a0 = __riscv_vget_v_##SIGN##BIT##m1x2_##SIGN##BIT##m1(a, 0); \ 233 | *a1 = __riscv_vget_v_##SIGN##BIT##m1x2_##SIGN##BIT##m1(a, 1); \ 234 | } 235 | #define DEFINE_TUPLEx3_GET(TYPE, SIGN, BIT, ELT_NUM) \ 236 | __attribute__((unused)) static void TYPE##x3_get_##TYPE(TYPE##x##ELT_NUM##x3_t a, TYPE##x##ELT_NUM##_t *a0, \ 237 | TYPE##x##ELT_NUM##_t *a1, 
TYPE##x##ELT_NUM##_t *a2) { \ 238 | *a0 = __riscv_vget_v_##SIGN##BIT##m1x3_##SIGN##BIT##m1(a, 0); \ 239 | *a1 = __riscv_vget_v_##SIGN##BIT##m1x3_##SIGN##BIT##m1(a, 1); \ 240 | *a2 = __riscv_vget_v_##SIGN##BIT##m1x3_##SIGN##BIT##m1(a, 2); \ 241 | } 242 | #define DEFINE_TUPLEx4_GET(TYPE, SIGN, BIT, ELT_NUM) \ 243 | __attribute__((unused)) static void TYPE##x4_get_##TYPE(TYPE##x##ELT_NUM##x4_t a, TYPE##x##ELT_NUM##_t *a0, \ 244 | TYPE##x##ELT_NUM##_t *a1, TYPE##x##ELT_NUM##_t *a2, \ 245 | TYPE##x##ELT_NUM##_t *a3) { \ 246 | *a0 = __riscv_vget_v_##SIGN##BIT##m1x4_##SIGN##BIT##m1(a, 0); \ 247 | *a1 = __riscv_vget_v_##SIGN##BIT##m1x4_##SIGN##BIT##m1(a, 1); \ 248 | *a2 = __riscv_vget_v_##SIGN##BIT##m1x4_##SIGN##BIT##m1(a, 2); \ 249 | *a3 = __riscv_vget_v_##SIGN##BIT##m1x4_##SIGN##BIT##m1(a, 3); \ 250 | } 251 | #elif defined(__aarch64__) || defined(_M_ARM64) 252 | #define DEFINE_TUPLEx2_GET(TYPE, SIGN, BIT, ELT_NUM) \ 253 | __attribute__((unused)) static void TYPE##x2_get_##TYPE(TYPE##x##ELT_NUM##x2_t a, TYPE##x##ELT_NUM##_t *a0, \ 254 | TYPE##x##ELT_NUM##_t *a1) { \ 255 | *a0 = a.val[0]; \ 256 | *a1 = a.val[1]; \ 257 | } 258 | #define DEFINE_TUPLEx3_GET(TYPE, SIGN, BIT, ELT_NUM) \ 259 | __attribute__((unused)) static void TYPE##x3_get_##TYPE(TYPE##x##ELT_NUM##x3_t a, TYPE##x##ELT_NUM##_t *a0, \ 260 | TYPE##x##ELT_NUM##_t *a1, TYPE##x##ELT_NUM##_t *a2) { \ 261 | *a0 = a.val[0]; \ 262 | *a1 = a.val[1]; \ 263 | *a2 = a.val[2]; \ 264 | } 265 | #define DEFINE_TUPLEx4_GET(TYPE, SIGN, BIT, ELT_NUM) \ 266 | __attribute__((unused)) static void TYPE##x4_get_##TYPE(TYPE##x##ELT_NUM##x4_t a, TYPE##x##ELT_NUM##_t *a0, \ 267 | TYPE##x##ELT_NUM##_t *a1, TYPE##x##ELT_NUM##_t *a2, \ 268 | TYPE##x##ELT_NUM##_t *a3) { \ 269 | *a0 = a.val[0]; \ 270 | *a1 = a.val[1]; \ 271 | *a2 = a.val[2]; \ 272 | *a3 = a.val[3]; \ 273 | } 274 | #endif 275 | 276 | DEFINE_TUPLEx2_GET(int8, i, 8, 8); 277 | DEFINE_TUPLEx2_GET(uint8, u, 8, 8); 278 | DEFINE_TUPLEx2_GET(int16, i, 16, 4); 279 | 
DEFINE_TUPLEx2_GET(uint16, u, 16, 4);
DEFINE_TUPLEx2_GET(int32, i, 32, 2);
DEFINE_TUPLEx2_GET(uint32, u, 32, 2);
DEFINE_TUPLEx2_GET(float32, f, 32, 2);
DEFINE_TUPLEx2_GET(int64, i, 64, 1);
DEFINE_TUPLEx2_GET(uint64, u, 64, 1);
DEFINE_TUPLEx3_GET(int8, i, 8, 8);
DEFINE_TUPLEx3_GET(uint8, u, 8, 8);
DEFINE_TUPLEx3_GET(int16, i, 16, 4);
DEFINE_TUPLEx3_GET(uint16, u, 16, 4);
DEFINE_TUPLEx3_GET(int32, i, 32, 2);
DEFINE_TUPLEx3_GET(uint32, u, 32, 2);
DEFINE_TUPLEx3_GET(float32, f, 32, 2);
DEFINE_TUPLEx3_GET(int64, i, 64, 1);
DEFINE_TUPLEx3_GET(uint64, u, 64, 1);
DEFINE_TUPLEx4_GET(int8, i, 8, 8);
DEFINE_TUPLEx4_GET(uint8, u, 8, 8);
DEFINE_TUPLEx4_GET(int16, i, 16, 4);
DEFINE_TUPLEx4_GET(uint16, u, 16, 4);
DEFINE_TUPLEx4_GET(int32, i, 32, 2);
DEFINE_TUPLEx4_GET(uint32, u, 32, 2);
DEFINE_TUPLEx4_GET(float32, f, 32, 2);
DEFINE_TUPLEx4_GET(int64, i, 64, 1);
DEFINE_TUPLEx4_GET(uint64, u, 64, 1);
#if defined(__aarch64__) || defined(_M_ARM64)
// 128-bit (Q-register) tuple types exist only on AArch64.
DEFINE_TUPLEx2_GET(int16, i, 16, 8);
DEFINE_TUPLEx2_GET(uint16, u, 16, 8);
DEFINE_TUPLEx2_GET(int32, i, 32, 4);
DEFINE_TUPLEx2_GET(uint32, u, 32, 4);
DEFINE_TUPLEx2_GET(float32, f, 32, 4);
DEFINE_TUPLEx2_GET(int64, i, 64, 2);
DEFINE_TUPLEx2_GET(uint64, u, 64, 2);
DEFINE_TUPLEx3_GET(int16, i, 16, 8);
DEFINE_TUPLEx3_GET(uint16, u, 16, 8);
DEFINE_TUPLEx3_GET(int32, i, 32, 4);
DEFINE_TUPLEx3_GET(uint32, u, 32, 4);
DEFINE_TUPLEx3_GET(float32, f, 32, 4);
DEFINE_TUPLEx3_GET(int64, i, 64, 2);
DEFINE_TUPLEx3_GET(uint64, u, 64, 2);
DEFINE_TUPLEx4_GET(int16, i, 16, 8);
DEFINE_TUPLEx4_GET(uint16, u, 16, 8);
DEFINE_TUPLEx4_GET(int32, i, 32, 4);
DEFINE_TUPLEx4_GET(uint32, u, 32, 4);
DEFINE_TUPLEx4_GET(float32, f, 32, 4);
DEFINE_TUPLEx4_GET(int64, i, 64, 2);
DEFINE_TUPLEx4_GET(uint64, u, 64, 2);
#endif

// Random float in [low, high); implemented in common.cpp.
float ranf(float low, float high);

// Concatenates 2/3/4 source arrays (viewed as U-typed lanes) into `out`.
// T is typically a vector type and U its lane type; `single_arr_size` is the
// number of U lanes per source array.
// NOTE(review): the `template <typename T, typename U>` parameter lists were
// stripped by the extraction (bare `template`); restored from the usage.
template <typename T, typename U>
static void merge_arrays(const T *arr1, const T *arr2, U *out, size_t single_arr_size) {
  const U *_arr1 = (const U *)arr1;
  const U *_arr2 = (const U *)arr2;
  for (size_t i = 0; i < single_arr_size; i++) {
    out[i] = _arr1[i];
    out[i + single_arr_size] = _arr2[i];
  }
}
template <typename T, typename U>
static void merge_arrays(const T *arr1, const T *arr2, const T *arr3, U *out, size_t single_arr_size) {
  const U *_arr1 = (const U *)arr1;
  const U *_arr2 = (const U *)arr2;
  const U *_arr3 = (const U *)arr3;
  for (size_t i = 0; i < single_arr_size; i++) {
    out[i] = _arr1[i];
    out[i + single_arr_size] = _arr2[i];
    out[i + single_arr_size * 2] = _arr3[i];
  }
}
template <typename T, typename U>
static void merge_arrays(const T *arr1, const T *arr2, const T *arr3, const T *arr4, U *out, size_t single_arr_size) {
  const U *_arr1 = (const U *)arr1;
  const U *_arr2 = (const U *)arr2;
  const U *_arr3 = (const U *)arr3;
  const U *_arr4 = (const U *)arr4;
  for (size_t i = 0; i < single_arr_size; i++) {
    out[i] = _arr1[i];
    out[i + single_arr_size] = _arr2[i];
    out[i + single_arr_size * 2] = _arr3[i];
    out[i + single_arr_size * 3] = _arr4[i];
  }
}
// Convenience overload for 128-bit sources: lane count derived from U.
template <typename T, typename U>
static void merge_arrays(const T *arr1, const T *arr2, const T *arr3, const T *arr4, U *out) {
  size_t len = 128 / (sizeof(U) * 8);  // lanes per 128-bit vector
  const U *_arr1 = (const U *)arr1;
  const U *_arr2 = (const U *)arr2;
  const U *_arr3 = (const U *)arr3;
  const U *_arr4 = (const U *)arr4;
  for (size_t i = 0; i < len; i++) {
    out[i] = _arr1[i];
    out[i + len] = _arr2[i];
    out[i + len * 2] = _arr3[i];
    out[i + len * 3] = _arr4[i];
  }
}

// Round-half-to-even reference (matches NEON frintn behavior); common.cpp.
float bankers_rounding(float val);
double bankers_rounding(double val);

float maxnm(float a, float b);
// IEEE-754 maxNum/minNum-style helpers (NaN-handling max/min); common.cpp.
float minnm(float a, float b);
double maxnm(double a, double b);
double minnm(double a, double b);

// Propagate the failure of a sub-check out of the calling test function.
#define CHECK_RESULT(EXP)      \
  if ((EXP) != TEST_SUCCESS) { \
    return TEST_FAIL;          \
  }
// IMM_<N>_ITER expands TEST_IMPL(i) for every i in [0, N-1].  Each macro
// builds on the previous one, so a test defines TEST_IMPL(c) and picks the
// iteration count matching an intrinsic's legal immediate range.
#define IMM_1_ITER TEST_IMPL(0)
#define IMM_2_ITER \
  IMM_1_ITER       \
  TEST_IMPL(1)
#define IMM_3_ITER \
  IMM_2_ITER       \
  TEST_IMPL(2)
#define IMM_4_ITER \
  IMM_3_ITER       \
  TEST_IMPL(3)
#define IMM_7_ITER \
  IMM_4_ITER       \
  TEST_IMPL(4)     \
  TEST_IMPL(5)     \
  TEST_IMPL(6)
#define IMM_8_ITER \
  IMM_7_ITER       \
  TEST_IMPL(7)
#define IMM_15_ITER \
  IMM_8_ITER        \
  TEST_IMPL(8)      \
  TEST_IMPL(9)      \
  TEST_IMPL(10)     \
  TEST_IMPL(11)     \
  TEST_IMPL(12)     \
  TEST_IMPL(13)     \
  TEST_IMPL(14)
#define IMM_16_ITER \
  IMM_15_ITER       \
  TEST_IMPL(15)
#define IMM_31_ITER \
  IMM_16_ITER       \
  TEST_IMPL(16)     \
  TEST_IMPL(17)     \
  TEST_IMPL(18)     \
  TEST_IMPL(19)     \
  TEST_IMPL(20)     \
  TEST_IMPL(21)     \
  TEST_IMPL(22)     \
  TEST_IMPL(23)     \
  TEST_IMPL(24)     \
  TEST_IMPL(25)     \
  TEST_IMPL(26)     \
  TEST_IMPL(27)     \
  TEST_IMPL(28)     \
  TEST_IMPL(29)     \
  TEST_IMPL(30)
#define IMM_32_ITER \
  IMM_31_ITER       \
  TEST_IMPL(31)
#define IMM_63_ITER \
  IMM_32_ITER       \
  TEST_IMPL(32)     \
  TEST_IMPL(33)     \
  TEST_IMPL(34)     \
  TEST_IMPL(35)     \
  TEST_IMPL(36)     \
  TEST_IMPL(37)     \
  TEST_IMPL(38)     \
  TEST_IMPL(39)     \
  TEST_IMPL(40)     \
  TEST_IMPL(41)     \
  TEST_IMPL(42)     \
  TEST_IMPL(43)     \
  TEST_IMPL(44)     \
  TEST_IMPL(45)     \
  TEST_IMPL(46)     \
  TEST_IMPL(47)     \
  TEST_IMPL(48)     \
  TEST_IMPL(49)     \
  TEST_IMPL(50)     \
  TEST_IMPL(51)     \
  TEST_IMPL(52)     \
  TEST_IMPL(53)     \
  TEST_IMPL(54)     \
  TEST_IMPL(55)     \
  TEST_IMPL(56)     \
  TEST_IMPL(57)     \
  TEST_IMPL(58)     \
  TEST_IMPL(59)     \
  TEST_IMPL(60)     \
  TEST_IMPL(61)     \
  TEST_IMPL(62)
#define IMM_64_ITER \
  IMM_63_ITER       \
  TEST_IMPL(63)
#define IMM_127_ITER \
  IMM_64_ITER        \
  TEST_IMPL(64)      \
  TEST_IMPL(65)      \
  TEST_IMPL(66)      \
  TEST_IMPL(67)      \
  TEST_IMPL(68)      \
  TEST_IMPL(69)      \
  TEST_IMPL(70)      \
  TEST_IMPL(71)      \
  TEST_IMPL(72)      \
  TEST_IMPL(73)      \
  TEST_IMPL(74)      \
  TEST_IMPL(75)      \
  TEST_IMPL(76)      \
  TEST_IMPL(77)      \
  TEST_IMPL(78)      \
  TEST_IMPL(79)      \
  TEST_IMPL(80)      \
  TEST_IMPL(81)      \
  TEST_IMPL(82)      \
  TEST_IMPL(83)      \
  TEST_IMPL(84)      \
  TEST_IMPL(85)      \
  TEST_IMPL(86)      \
  TEST_IMPL(87)      \
  TEST_IMPL(88)      \
  TEST_IMPL(89)      \
  TEST_IMPL(90)      \
  TEST_IMPL(91)      \
  TEST_IMPL(92)      \
  TEST_IMPL(93)      \
  TEST_IMPL(94)      \
  TEST_IMPL(95)      \
  TEST_IMPL(96)      \
  TEST_IMPL(97)      \
  TEST_IMPL(98)      \
  TEST_IMPL(99)      \
  TEST_IMPL(100)     \
  TEST_IMPL(101)     \
  TEST_IMPL(102)     \
  TEST_IMPL(103)     \
  TEST_IMPL(104)     \
  TEST_IMPL(105)     \
  TEST_IMPL(106)     \
  TEST_IMPL(107)     \
  TEST_IMPL(108)     \
  TEST_IMPL(109)     \
  TEST_IMPL(110)     \
  TEST_IMPL(111)     \
  TEST_IMPL(112)     \
  TEST_IMPL(113)     \
  TEST_IMPL(114)     \
  TEST_IMPL(115)     \
  TEST_IMPL(116)     \
  TEST_IMPL(117)     \
  TEST_IMPL(118)     \
  TEST_IMPL(119)     \
  TEST_IMPL(120)     \
  TEST_IMPL(121)     \
  TEST_IMPL(122)     \
  TEST_IMPL(123)     \
  TEST_IMPL(124)     \
  TEST_IMPL(125)     \
  TEST_IMPL(126)
#define IMM_128_ITER \
  IMM_127_ITER       \
  TEST_IMPL(127)
#define IMM_255_ITER \
  IMM_128_ITER       \
  TEST_IMPL(128)     \
  TEST_IMPL(129)     \
  TEST_IMPL(130)     \
  TEST_IMPL(131)     \
  TEST_IMPL(132)     \
  TEST_IMPL(133)     \
  TEST_IMPL(134)     \
  TEST_IMPL(135)     \
  TEST_IMPL(136)     \
  TEST_IMPL(137)     \
  TEST_IMPL(138)     \
  TEST_IMPL(139)     \
  TEST_IMPL(140)     \
  TEST_IMPL(141)     \
  TEST_IMPL(142)     \
  TEST_IMPL(143)     \
  TEST_IMPL(144)     \
  TEST_IMPL(145)     \
  TEST_IMPL(146)     \
  TEST_IMPL(147)     \
  TEST_IMPL(148)     \
  TEST_IMPL(149)     \
  TEST_IMPL(150)     \
  TEST_IMPL(151)     \
  TEST_IMPL(152)     \
  TEST_IMPL(153)     \
  TEST_IMPL(154)     \
  TEST_IMPL(155)     \
  TEST_IMPL(156)     \
  TEST_IMPL(157)     \
  TEST_IMPL(158)     \
  TEST_IMPL(159)     \
  TEST_IMPL(160)     \
  TEST_IMPL(161)     \
  TEST_IMPL(162)     \
  TEST_IMPL(163)     \
  TEST_IMPL(164)     \
  TEST_IMPL(165)     \
  TEST_IMPL(166)     \
  TEST_IMPL(167)     \
  TEST_IMPL(168)     \
  TEST_IMPL(169)     \
  TEST_IMPL(170)     \
  TEST_IMPL(171)     \
  TEST_IMPL(172)     \
  TEST_IMPL(173)     \
  TEST_IMPL(174)     \
  TEST_IMPL(175)     \
  TEST_IMPL(176)     \
  TEST_IMPL(177)     \
  TEST_IMPL(178)     \
  TEST_IMPL(179)     \
  TEST_IMPL(180)     \
  TEST_IMPL(181)     \
  TEST_IMPL(182)     \
  TEST_IMPL(183)     \
  TEST_IMPL(184)     \
  TEST_IMPL(185)     \
  TEST_IMPL(186)     \
  TEST_IMPL(187)     \
  TEST_IMPL(188)     \
  TEST_IMPL(189)     \
  TEST_IMPL(190)     \
  TEST_IMPL(191)     \
  TEST_IMPL(192)     \
  TEST_IMPL(193)     \
  TEST_IMPL(194)     \
  TEST_IMPL(195)     \
  TEST_IMPL(196)     \
  TEST_IMPL(197)     \
  TEST_IMPL(198)     \
  TEST_IMPL(199)     \
  TEST_IMPL(200)     \
  TEST_IMPL(201)     \
  TEST_IMPL(202)     \
  TEST_IMPL(203)     \
  TEST_IMPL(204)     \
  TEST_IMPL(205)     \
  TEST_IMPL(206)     \
  TEST_IMPL(207)     \
  TEST_IMPL(208)     \
  TEST_IMPL(209)     \
  TEST_IMPL(210)     \
  TEST_IMPL(211)     \
  TEST_IMPL(212)     \
  TEST_IMPL(213)     \
  TEST_IMPL(214)     \
  TEST_IMPL(215)     \
  TEST_IMPL(216)     \
  TEST_IMPL(217)     \
  TEST_IMPL(218)     \
  TEST_IMPL(219)     \
  TEST_IMPL(220)     \
  TEST_IMPL(221)     \
  TEST_IMPL(222)     \
  TEST_IMPL(223)     \
  TEST_IMPL(224)     \
  TEST_IMPL(225)     \
  TEST_IMPL(226)     \
  TEST_IMPL(227)     \
  TEST_IMPL(228)     \
  TEST_IMPL(229)     \
  TEST_IMPL(230)     \
  TEST_IMPL(231)     \
  TEST_IMPL(232)     \
  TEST_IMPL(233)     \
  TEST_IMPL(234)     \
  TEST_IMPL(235)     \
  TEST_IMPL(236)     \
  TEST_IMPL(237)     \
  TEST_IMPL(238)     \
  TEST_IMPL(239)     \
  TEST_IMPL(240)     \
  TEST_IMPL(241)     \
  TEST_IMPL(242)     \
  TEST_IMPL(243)     \
  TEST_IMPL(244)     \
  TEST_IMPL(245)     \
  TEST_IMPL(246)     \
  TEST_IMPL(247)     \
  TEST_IMPL(248)     \
  TEST_IMPL(249)     \
  TEST_IMPL(250)     \
  TEST_IMPL(251)     \
  TEST_IMPL(252)     \
  TEST_IMPL(253)     \
  TEST_IMPL(254)
#define IMM_256_ITER \
  IMM_255_ITER       \
  TEST_IMPL(255)

}  // namespace NEON2RVV

#endif  // NEON2RVV_COMMON_H
-------------------------------------------------------------------------------- /tests/common.cpp: --------------------------------------------------------------------------------
#include "common.h"
#include  // NOTE(review): include target lost in extraction (angle-bracketed header stripped by the HTML scrape)

namespace NEON2RVV {
// All-ones bit patterns (~0); presumably reinterpreted as float/double NaN
// payloads by the tests — TODO confirm against callers.
int32_t NaN = ~0;
int64_t NaN64 = ~0;

// Compare each 64-bit lane of `a` against the expected values by
// reinterpreting the vector's storage as a scalar array.
result_t validate_int64(int64x2_t a, int64_t i0, int64_t i1) {
  const int64_t *t = (const int64_t *)&a;
  ASSERT_RETURN(t[0] == i0);
  ASSERT_RETURN(t[1] == i1);
  return TEST_SUCCESS;
}
result_t validate_uint64(uint64x2_t a, uint64_t i0, uint64_t i1) {
  const uint64_t *t = (const uint64_t *)&a;
  ASSERT_RETURN(t[0] == i0);
  ASSERT_RETURN(t[1] == i1);
  return TEST_SUCCESS;
}
// Tuple variant: unpack the two member vectors (RVV via __riscv_vget, NEON
// via .val[]) and compare lane by lane.
// NOTE(review): no #else branch — on other targets t0/t1 are undeclared.
result_t validate_int64(int64x2x2_t a, int64_t i0, int64_t i1, int64_t i2, int64_t i3) {
#if defined(__riscv) || defined(__riscv__)
  vint64m1_t a0 = __riscv_vget_v_i64m1x2_i64m1(a, 0);
  vint64m1_t a1 = __riscv_vget_v_i64m1x2_i64m1(a, 1);
  const int64_t *t0 = (const int64_t *)&a0;
  const int64_t *t1 = (const int64_t *)&a1;
#elif defined(__aarch64__) || defined(_M_ARM64)
  const int64_t *t0 = (const int64_t *)&a.val[0];
  const int64_t *t1 = (const
  int64_t *)&a.val[1];
#endif
  ASSERT_RETURN(t0[0] == i0);
  ASSERT_RETURN(t0[1] == i1);
  ASSERT_RETURN(t1[0] == i2);
  ASSERT_RETURN(t1[1] == i3);
  return TEST_SUCCESS;
}
result_t validate_uint64(uint64x2x2_t a, uint64_t u0, uint64_t u1, uint64_t u2, uint64_t u3) {
#if defined(__riscv) || defined(__riscv__)
  vuint64m1_t a0 = __riscv_vget_v_u64m1x2_u64m1(a, 0);
  vuint64m1_t a1 = __riscv_vget_v_u64m1x2_u64m1(a, 1);
  const uint64_t *t0 = (const uint64_t *)&a0;
  const uint64_t *t1 = (const uint64_t *)&a1;
#elif defined(__aarch64__) || defined(_M_ARM64)
  const uint64_t *t0 = (const uint64_t *)&a.val[0];
  const uint64_t *t1 = (const uint64_t *)&a.val[1];
#endif
  ASSERT_RETURN(t0[0] == u0);
  ASSERT_RETURN(t0[1] == u1);
  ASSERT_RETURN(t1[0] == u2);
  ASSERT_RETURN(t1[1] == u3);
  return TEST_SUCCESS;
}

// Single-lane 64-bit variants.
result_t validate_int64(int64x1_t a, int64_t i0) {
  const int64_t *t = (const int64_t *)&a;
  ASSERT_RETURN(t[0] == i0);
  return TEST_SUCCESS;
}
result_t validate_uint64(uint64x1_t a, uint64_t u0) {
  const uint64_t *t = (const uint64_t *)&a;
  ASSERT_RETURN(t[0] == u0);
  return TEST_SUCCESS;
}
result_t validate_int64(int64x1x2_t a, int64_t i0, int64_t i1) {
#if defined(__riscv) || defined(__riscv__)
  vint64m1_t a0 = __riscv_vget_v_i64m1x2_i64m1(a, 0);
  vint64m1_t a1 = __riscv_vget_v_i64m1x2_i64m1(a, 1);
  const int64_t *t0 = (const int64_t *)&a0;
  const int64_t *t1 = (const int64_t *)&a1;
#elif defined(__aarch64__) || defined(_M_ARM64)
  const int64_t *t0 = (const int64_t *)&a.val[0];
  const int64_t *t1 = (const int64_t *)&a.val[1];
#endif
  ASSERT_RETURN(t0[0] == i0);
  ASSERT_RETURN(t1[0] == i1);
  return TEST_SUCCESS;
}
result_t validate_uint64(uint64x1x2_t a, uint64_t u0, uint64_t u1) {
#if defined(__riscv) || defined(__riscv__)
  vuint64m1_t a0 = __riscv_vget_v_u64m1x2_u64m1(a, 0);
  vuint64m1_t a1 = __riscv_vget_v_u64m1x2_u64m1(a, 1);
  const uint64_t *t0 = (const uint64_t *)&a0;
  const uint64_t *t1 = (const uint64_t *)&a1;
#elif defined(__aarch64__) || defined(_M_ARM64)
  const uint64_t *t0 = (const uint64_t *)&a.val[0];
  const uint64_t *t1 = (const uint64_t *)&a.val[1];
#endif
  ASSERT_RETURN(t0[0] == u0);
  ASSERT_RETURN(t1[0] == u1);
  return TEST_SUCCESS;
}

// 32-bit lane oracles: same storage-reinterpretation pattern as above.
result_t validate_int32(int32x4_t a, int32_t i0, int32_t i1, int32_t i2, int32_t i3) {
  const int32_t *t = (const int32_t *)&a;
  ASSERT_RETURN(t[0] == i0);
  ASSERT_RETURN(t[1] == i1);
  ASSERT_RETURN(t[2] == i2);
  ASSERT_RETURN(t[3] == i3);
  return TEST_SUCCESS;
}

result_t validate_uint32(uint32x4_t a, uint32_t u0, uint32_t u1, uint32_t u2, uint32_t u3) {
  const uint32_t *t = (const uint32_t *)&a;
  ASSERT_RETURN(t[0] == u0);
  ASSERT_RETURN(t[1] == u1);
  ASSERT_RETURN(t[2] == u2);
  ASSERT_RETURN(t[3] == u3);
  return TEST_SUCCESS;
}

result_t validate_int32(int32x4x2_t a, int32_t i0, int32_t i1, int32_t i2, int32_t i3, int32_t i4, int32_t i5,
                        int32_t i6, int32_t i7) {
#if defined(__riscv) || defined(__riscv__)
  vint32m1_t a0 = __riscv_vget_v_i32m1x2_i32m1(a, 0);
  vint32m1_t a1 = __riscv_vget_v_i32m1x2_i32m1(a, 1);
  const int32_t *t0 = (const int32_t *)&a0;
  const int32_t *t1 = (const int32_t *)&a1;
#elif defined(__aarch64__) || defined(_M_ARM64)
  const int32_t *t0 = (const int32_t *)&a.val[0];
  const int32_t *t1 = (const int32_t *)&a.val[1];
#endif
  ASSERT_RETURN(t0[0] == i0);
  ASSERT_RETURN(t0[1] == i1);
  ASSERT_RETURN(t0[2] == i2);
  ASSERT_RETURN(t0[3] == i3);
  ASSERT_RETURN(t1[0] == i4);
  ASSERT_RETURN(t1[1] == i5);
  ASSERT_RETURN(t1[2] == i6);
  ASSERT_RETURN(t1[3] == i7);
  return TEST_SUCCESS;
}

result_t validate_uint32(uint32x4x2_t a, uint32_t u0, uint32_t u1, uint32_t u2, uint32_t u3, uint32_t u4, uint32_t u5,
                         uint32_t u6, uint32_t u7) {
#if defined(__riscv) || defined(__riscv__)
  vuint32m1_t a0 = __riscv_vget_v_u32m1x2_u32m1(a, 0);
  vuint32m1_t a1 = __riscv_vget_v_u32m1x2_u32m1(a, 1);
  const uint32_t *t0 = (const uint32_t *)&a0;
  const uint32_t *t1 = (const uint32_t *)&a1;
#elif defined(__aarch64__) || defined(_M_ARM64)
  const uint32_t *t0 = (const uint32_t *)&a.val[0];
  const uint32_t *t1 = (const uint32_t *)&a.val[1];
#endif
  ASSERT_RETURN(t0[0] == u0);
  ASSERT_RETURN(t0[1] == u1);
  ASSERT_RETURN(t0[2] == u2);
  ASSERT_RETURN(t0[3] == u3);
  ASSERT_RETURN(t1[0] == u4);
  ASSERT_RETURN(t1[1] == u5);
  ASSERT_RETURN(t1[2] == u6);
  ASSERT_RETURN(t1[3] == u7);
  return TEST_SUCCESS;
}

// NOTE(review): parameters are named u0/u1 here although they are signed;
// the header declares them as i0/i1.
result_t validate_int32(int32x2_t a, int32_t u0, int32_t u1) {
  const int32_t *t = (const int32_t *)&a;
  ASSERT_RETURN(t[0] == u0);
  ASSERT_RETURN(t[1] == u1);
  return TEST_SUCCESS;
}

result_t validate_uint32(uint32x2_t a, uint32_t u0, uint32_t u1) {
  const uint32_t *t = (const uint32_t *)&a;
  ASSERT_RETURN(t[0] == u0);
  ASSERT_RETURN(t[1] == u1);
  return TEST_SUCCESS;
}

result_t validate_int32(int32x2x2_t a, int32_t i0, int32_t i1, int32_t i2, int32_t i3) {
#if defined(__riscv) || defined(__riscv__)
  vint32m1_t a0 = __riscv_vget_v_i32m1x2_i32m1(a, 0);
  vint32m1_t a1 = __riscv_vget_v_i32m1x2_i32m1(a, 1);
  const int32_t *t0 = (const int32_t *)&a0;
  const int32_t *t1 = (const int32_t *)&a1;
#elif defined(__aarch64__) || defined(_M_ARM64)
  const int32_t *t0 = (const int32_t *)&a.val[0];
  const int32_t *t1 = (const int32_t *)&a.val[1];
#endif
  ASSERT_RETURN(t0[0] == i0);
  ASSERT_RETURN(t0[1] == i1);
  ASSERT_RETURN(t1[0] == i2);
  ASSERT_RETURN(t1[1] == i3);
  return TEST_SUCCESS;
}

result_t validate_uint32(uint32x2x2_t a, uint32_t u0, uint32_t u1, uint32_t u2, uint32_t u3) {
#if defined(__riscv) || defined(__riscv__)
  vuint32m1_t a0 = __riscv_vget_v_u32m1x2_u32m1(a, 0);
  vuint32m1_t a1 = __riscv_vget_v_u32m1x2_u32m1(a, 1);
  const uint32_t *t0 = (const uint32_t *)&a0;
  const uint32_t *t1 = (const uint32_t *)&a1;
#elif defined(__aarch64__) || defined(_M_ARM64)
  const uint32_t *t0 = (const uint32_t *)&a.val[0];
  const uint32_t *t1 = (const uint32_t *)&a.val[1];
#endif
  ASSERT_RETURN(t0[0] == u0);
  ASSERT_RETURN(t0[1] == u1);
  ASSERT_RETURN(t1[0] == u2);
  ASSERT_RETURN(t1[1] == u3);
  return TEST_SUCCESS;
}

// 16-bit lane oracles.
result_t validate_int16(int16x8_t a, int16_t i0, int16_t i1, int16_t i2, int16_t i3, int16_t i4, int16_t i5, int16_t i6,
                        int16_t i7) {
  const int16_t *t = (const int16_t *)&a;
  ASSERT_RETURN(t[0] == i0);
  ASSERT_RETURN(t[1] == i1);
  ASSERT_RETURN(t[2] == i2);
  ASSERT_RETURN(t[3] == i3);
  ASSERT_RETURN(t[4] == i4);
  ASSERT_RETURN(t[5] == i5);
  ASSERT_RETURN(t[6] == i6);
  ASSERT_RETURN(t[7] == i7);
  return TEST_SUCCESS;
}

result_t validate_uint16(uint16x8_t a, uint16_t u0, uint16_t u1, uint16_t u2, uint16_t u3, uint16_t u4, uint16_t u5,
                         uint16_t u6, uint16_t u7) {
  const uint16_t *t = (const uint16_t *)&a;
  ASSERT_RETURN(t[0] == u0);
  ASSERT_RETURN(t[1] == u1);
  ASSERT_RETURN(t[2] == u2);
  ASSERT_RETURN(t[3] == u3);
  ASSERT_RETURN(t[4] == u4);
  ASSERT_RETURN(t[5] == u5);
  ASSERT_RETURN(t[6] == u6);
  ASSERT_RETURN(t[7] == u7);
  return TEST_SUCCESS;
}

result_t validate_int16(int16x8x2_t a, int16_t i0, int16_t i1, int16_t i2, int16_t i3, int16_t i4, int16_t i5,
                        int16_t i6, int16_t i7, int16_t i8, int16_t i9, int16_t i10, int16_t i11, int16_t i12,
                        int16_t i13, int16_t i14, int16_t i15) {
#if defined(__riscv) || defined(__riscv__)
  vint16m1_t a0 = __riscv_vget_v_i16m1x2_i16m1(a, 0);
  vint16m1_t a1 = __riscv_vget_v_i16m1x2_i16m1(a, 1);
  const int16_t *t0 = (const int16_t *)&a0;
  const int16_t *t1 = (const int16_t *)&a1;
#elif defined(__aarch64__) || defined(_M_ARM64)
  const int16_t *t0 = (const int16_t *)&a.val[0];
  const int16_t *t1 = (const int16_t *)&a.val[1];
#endif
  ASSERT_RETURN(t0[0] == i0);
  ASSERT_RETURN(t0[1] == i1);
  ASSERT_RETURN(t0[2] == i2);
  ASSERT_RETURN(t0[3] == i3);
  ASSERT_RETURN(t0[4] == i4);
  ASSERT_RETURN(t0[5] == i5);
  ASSERT_RETURN(t0[6] == i6);
  ASSERT_RETURN(t0[7] == i7);
  ASSERT_RETURN(t1[0] == i8);
  ASSERT_RETURN(t1[1] == i9);
  ASSERT_RETURN(t1[2] == i10);
  ASSERT_RETURN(t1[3] == i11);
  ASSERT_RETURN(t1[4] == i12);
  ASSERT_RETURN(t1[5] == i13);
  ASSERT_RETURN(t1[6] == i14);
  ASSERT_RETURN(t1[7] == i15);
  return TEST_SUCCESS;
}

result_t validate_uint16(uint16x8x2_t a, uint16_t u0, uint16_t u1, uint16_t u2, uint16_t u3, uint16_t u4, uint16_t u5,
                         uint16_t u6, uint16_t u7, uint16_t u8, uint16_t u9, uint16_t u10, uint16_t u11, uint16_t u12,
                         uint16_t u13, uint16_t u14, uint16_t u15) {
#if defined(__riscv) || defined(__riscv__)
  vuint16m1_t a0 = __riscv_vget_v_u16m1x2_u16m1(a, 0);
  vuint16m1_t a1 = __riscv_vget_v_u16m1x2_u16m1(a, 1);
  const uint16_t *t0 = (const uint16_t *)&a0;
  const uint16_t *t1 = (const uint16_t *)&a1;
#elif defined(__aarch64__) || defined(_M_ARM64)
  const uint16_t *t0 = (const uint16_t *)&a.val[0];
  const uint16_t *t1 = (const uint16_t *)&a.val[1];
#endif
  ASSERT_RETURN(t0[0] == u0);
  ASSERT_RETURN(t0[1] == u1);
  ASSERT_RETURN(t0[2] == u2);
  ASSERT_RETURN(t0[3] == u3);
  ASSERT_RETURN(t0[4] == u4);
  ASSERT_RETURN(t0[5] == u5);
  ASSERT_RETURN(t0[6] == u6);
  ASSERT_RETURN(t0[7] == u7);
  ASSERT_RETURN(t1[0] == u8);
  ASSERT_RETURN(t1[1] == u9);
  ASSERT_RETURN(t1[2] == u10);
  ASSERT_RETURN(t1[3] == u11);
  ASSERT_RETURN(t1[4] == u12);
  ASSERT_RETURN(t1[5] == u13);
  ASSERT_RETURN(t1[6] == u14);
  ASSERT_RETURN(t1[7] == u15);
  return TEST_SUCCESS;
}

result_t validate_int16(int16x4_t a, int16_t i0, int16_t i1, int16_t i2, int16_t i3) {
  const int16_t *t = (const int16_t *)&a;
  ASSERT_RETURN(t[0] == i0);
  ASSERT_RETURN(t[1] == i1);
  ASSERT_RETURN(t[2] == i2);
  ASSERT_RETURN(t[3] == i3);
  return TEST_SUCCESS;
}

result_t validate_uint16(uint16x4_t a, uint16_t u0, uint16_t u1, uint16_t u2, uint16_t u3) {
  const uint16_t *t = (const uint16_t *)&a;
  ASSERT_RETURN(t[0] == u0);
  ASSERT_RETURN(t[1] == u1);
  ASSERT_RETURN(t[2] == u2);
  ASSERT_RETURN(t[3] == u3);
  return TEST_SUCCESS;
}

result_t validate_int16(int16x4x2_t a, int16_t i0, int16_t i1, int16_t i2, int16_t i3, int16_t i4, int16_t i5,
                        int16_t i6, int16_t i7) {
#if defined(__riscv) || defined(__riscv__)
  vint16m1_t a0 = __riscv_vget_v_i16m1x2_i16m1(a, 0);
  vint16m1_t a1 = __riscv_vget_v_i16m1x2_i16m1(a, 1);
  const int16_t *t0 = (const int16_t *)&a0;
  const int16_t *t1 = (const int16_t *)&a1;
#elif defined(__aarch64__) || defined(_M_ARM64)
  const int16_t *t0 = (const int16_t *)&a.val[0];
  const int16_t *t1 = (const int16_t *)&a.val[1];
#endif
  ASSERT_RETURN(t0[0] == i0);
  ASSERT_RETURN(t0[1] == i1);
  ASSERT_RETURN(t0[2] == i2);
  ASSERT_RETURN(t0[3] == i3);
  ASSERT_RETURN(t1[0] == i4);
  ASSERT_RETURN(t1[1] == i5);
  ASSERT_RETURN(t1[2] == i6);
  ASSERT_RETURN(t1[3] == i7);
  return TEST_SUCCESS;
}

result_t validate_uint16(uint16x4x2_t a, uint16_t u0, uint16_t u1, uint16_t u2, uint16_t u3, uint16_t u4, uint16_t u5,
                         uint16_t u6, uint16_t u7) {
#if defined(__riscv) || defined(__riscv__)
  vuint16m1_t a0 = __riscv_vget_v_u16m1x2_u16m1(a, 0);
  vuint16m1_t a1 = __riscv_vget_v_u16m1x2_u16m1(a, 1);
  const uint16_t *t0 = (const uint16_t *)&a0;
  const uint16_t *t1 = (const uint16_t *)&a1;
#elif defined(__aarch64__) || defined(_M_ARM64)
  const uint16_t *t0 = (const uint16_t *)&a.val[0];
  const uint16_t *t1 = (const uint16_t *)&a.val[1];
#endif
  ASSERT_RETURN(t0[0] == u0);
  ASSERT_RETURN(t0[1] == u1);
  ASSERT_RETURN(t0[2] == u2);
  ASSERT_RETURN(t0[3] == u3);
  ASSERT_RETURN(t1[0] == u4);
  ASSERT_RETURN(t1[1] == u5);
  ASSERT_RETURN(t1[2] == u6);
  ASSERT_RETURN(t1[3] == u7);
  return TEST_SUCCESS;
}

// 8-bit lane oracles.
result_t validate_int8(int8x16_t a, int8_t i0, int8_t i1, int8_t i2, int8_t i3, int8_t i4, int8_t i5, int8_t i6,
                       int8_t i7, int8_t i8, int8_t i9, int8_t i10, int8_t i11, int8_t i12, int8_t i13, int8_t i14,
                       int8_t i15) {
  const int8_t *t = (const int8_t *)&a;
  ASSERT_RETURN(t[0] == i0);
  ASSERT_RETURN(t[1] == i1);
  ASSERT_RETURN(t[2] == i2);
  ASSERT_RETURN(t[3] == i3);
  ASSERT_RETURN(t[4] == i4);
  ASSERT_RETURN(t[5] == i5);
  ASSERT_RETURN(t[6] == i6);
  ASSERT_RETURN(t[7] == i7);
  ASSERT_RETURN(t[8] == i8);
  ASSERT_RETURN(t[9] == i9);
  ASSERT_RETURN(t[10] == i10);
  ASSERT_RETURN(t[11] == i11);
  ASSERT_RETURN(t[12] == i12);
  ASSERT_RETURN(t[13] == i13);
  ASSERT_RETURN(t[14] == i14);
  ASSERT_RETURN(t[15] == i15);
  return TEST_SUCCESS;
}

result_t validate_uint8(uint8x16_t a, uint8_t u0, uint8_t u1, uint8_t u2, uint8_t u3, uint8_t u4, uint8_t u5,
                        uint8_t u6, uint8_t u7, uint8_t u8, uint8_t u9, uint8_t u10, uint8_t u11, uint8_t u12,
                        uint8_t u13, uint8_t u14, uint8_t u15) {
  const uint8_t *t = (const uint8_t *)&a;
  ASSERT_RETURN(t[0] == u0);
  ASSERT_RETURN(t[1] == u1);
  ASSERT_RETURN(t[2] == u2);
  ASSERT_RETURN(t[3] == u3);
  ASSERT_RETURN(t[4] == u4);
  ASSERT_RETURN(t[5] == u5);
  ASSERT_RETURN(t[6] == u6);
  ASSERT_RETURN(t[7] == u7);
  ASSERT_RETURN(t[8] == u8);
  ASSERT_RETURN(t[9] == u9);
  ASSERT_RETURN(t[10] == u10);
  ASSERT_RETURN(t[11] == u11);
  ASSERT_RETURN(t[12] == u12);
  ASSERT_RETURN(t[13] == u13);
  ASSERT_RETURN(t[14] == u14);
  ASSERT_RETURN(t[15] == u15);
  return TEST_SUCCESS;
}

result_t validate_int8(int8x8_t a, int8_t i0, int8_t i1, int8_t i2, int8_t i3, int8_t i4, int8_t i5, int8_t i6,
                       int8_t i7) {
  const int8_t *t = (const int8_t *)&a;
  ASSERT_RETURN(t[0] == i0);
  ASSERT_RETURN(t[1] == i1);
  ASSERT_RETURN(t[2] == i2);
  ASSERT_RETURN(t[3] == i3);
  ASSERT_RETURN(t[4] == i4);
  ASSERT_RETURN(t[5] == i5);
  ASSERT_RETURN(t[6] == i6);
  ASSERT_RETURN(t[7] == i7);
  return TEST_SUCCESS;
}

result_t validate_uint8(uint8x8_t a, uint8_t u0, uint8_t u1, uint8_t u2, uint8_t u3, uint8_t u4, uint8_t u5, uint8_t u6,
                        uint8_t u7) {
  const uint8_t *t = (const uint8_t *)&a;
  ASSERT_RETURN(t[0] == u0);
  ASSERT_RETURN(t[1] == u1);
  ASSERT_RETURN(t[2] == u2);
  ASSERT_RETURN(t[3] == u3);
  ASSERT_RETURN(t[4] == u4);
  ASSERT_RETURN(t[5] == u5);
  ASSERT_RETURN(t[6] == u6);
  ASSERT_RETURN(t[7] == u7);
  return TEST_SUCCESS;
}
result_t validate_int8(int8x8x2_t a, int8_t i0, int8_t i1, int8_t i2, int8_t i3, int8_t i4, int8_t i5, int8_t i6,
                       int8_t i7, int8_t i8, int8_t i9, int8_t i10, int8_t i11, int8_t i12, int8_t i13, int8_t i14,
                       int8_t i15) {
#if defined(__riscv) || defined(__riscv__)
  vint8m1_t a0 = __riscv_vget_v_i8m1x2_i8m1(a, 0);
  vint8m1_t a1 = __riscv_vget_v_i8m1x2_i8m1(a, 1);
  const int8_t *t0 = (const int8_t *)&a0;
  const int8_t *t1 = (const int8_t *)&a1;
#elif defined(__aarch64__) || defined(_M_ARM64)
  const int8_t *t0 = (const int8_t *)&a.val[0];
  const int8_t *t1 = (const int8_t *)&a.val[1];
#endif
  ASSERT_RETURN(t0[0] == i0);
  ASSERT_RETURN(t0[1] == i1);
  ASSERT_RETURN(t0[2] == i2);
  ASSERT_RETURN(t0[3] == i3);
  ASSERT_RETURN(t0[4] == i4);
  ASSERT_RETURN(t0[5] == i5);
  ASSERT_RETURN(t0[6] == i6);
  ASSERT_RETURN(t0[7] == i7);
  ASSERT_RETURN(t1[0] == i8);
  ASSERT_RETURN(t1[1] == i9);
  ASSERT_RETURN(t1[2] == i10);
  ASSERT_RETURN(t1[3] == i11);
  ASSERT_RETURN(t1[4] == i12);
  ASSERT_RETURN(t1[5] == i13);
  ASSERT_RETURN(t1[6] == i14);
  ASSERT_RETURN(t1[7] == i15);
  return TEST_SUCCESS;
}
result_t validate_uint8(uint8x8x2_t a, uint8_t u0, uint8_t u1, uint8_t u2, uint8_t u3, uint8_t u4, uint8_t u5,
                        uint8_t u6, uint8_t u7, uint8_t u8, uint8_t u9, uint8_t u10, uint8_t u11, uint8_t u12,
                        uint8_t u13, uint8_t u14, uint8_t u15) {
#if defined(__riscv) || defined(__riscv__)
  vuint8m1_t a0 = __riscv_vget_v_u8m1x2_u8m1(a, 0);
  vuint8m1_t a1 = __riscv_vget_v_u8m1x2_u8m1(a, 1);
  const uint8_t *t0 = (const uint8_t *)&a0;
  const uint8_t *t1 = (const uint8_t *)&a1;
#elif defined(__aarch64__) || defined(_M_ARM64)
  const uint8_t *t0 = (const uint8_t *)&a.val[0];
  const uint8_t *t1 = (const uint8_t *)&a.val[1];
#endif
  ASSERT_RETURN(t0[0] == u0);
  ASSERT_RETURN(t0[1] == u1);
  ASSERT_RETURN(t0[2] == u2);
  ASSERT_RETURN(t0[3] == u3);
  ASSERT_RETURN(t0[4] == u4);
  ASSERT_RETURN(t0[5] == u5);
  ASSERT_RETURN(t0[6] == u6);
  ASSERT_RETURN(t0[7] == u7);
  ASSERT_RETURN(t1[0] == u8);
  ASSERT_RETURN(t1[1] == u9);
  ASSERT_RETURN(t1[2] == u10);
  ASSERT_RETURN(t1[3] == u11);
  ASSERT_RETURN(t1[4] == u12);
ASSERT_RETURN(t1[5] == u13); 483 | ASSERT_RETURN(t1[6] == u14); 484 | ASSERT_RETURN(t1[7] == u15); 485 | return TEST_SUCCESS; 486 | } 487 | result_t validate_int8(int8x16x2_t a, int8_t i0, int8_t i1, int8_t i2, int8_t i3, int8_t i4, int8_t i5, int8_t i6, 488 | int8_t i7, int8_t i8, int8_t i9, int8_t i10, int8_t i11, int8_t i12, int8_t i13, int8_t i14, 489 | int8_t i15, int8_t i16, int8_t i17, int8_t i18, int8_t i19, int8_t i20, int8_t i21, int8_t i22, 490 | int8_t i23, int8_t i24, int8_t i25, int8_t i26, int8_t i27, int8_t i28, int8_t i29, int8_t i30, 491 | int8_t i31) { 492 | #if defined(__riscv) || defined(__riscv__) 493 | vint8m1_t a0 = __riscv_vget_v_i8m1x2_i8m1(a, 0); 494 | vint8m1_t a1 = __riscv_vget_v_i8m1x2_i8m1(a, 1); 495 | const int8_t *t0 = (const int8_t *)&a0; 496 | const int8_t *t1 = (const int8_t *)&a1; 497 | #elif defined(__aarch64__) || defined(_M_ARM64) 498 | const int8_t *t0 = (const int8_t *)&a.val[0]; 499 | const int8_t *t1 = (const int8_t *)&a.val[1]; 500 | #endif 501 | ASSERT_RETURN(t0[0] == i0); 502 | ASSERT_RETURN(t0[1] == i1); 503 | ASSERT_RETURN(t0[2] == i2); 504 | ASSERT_RETURN(t0[3] == i3); 505 | ASSERT_RETURN(t0[4] == i4); 506 | ASSERT_RETURN(t0[5] == i5); 507 | ASSERT_RETURN(t0[6] == i6); 508 | ASSERT_RETURN(t0[7] == i7); 509 | ASSERT_RETURN(t0[8] == i8); 510 | ASSERT_RETURN(t0[9] == i9); 511 | ASSERT_RETURN(t0[10] == i10); 512 | ASSERT_RETURN(t0[11] == i11); 513 | ASSERT_RETURN(t0[12] == i12); 514 | ASSERT_RETURN(t0[13] == i13); 515 | ASSERT_RETURN(t0[14] == i14); 516 | ASSERT_RETURN(t0[15] == i15); 517 | ASSERT_RETURN(t1[0] == i16); 518 | ASSERT_RETURN(t1[1] == i17); 519 | ASSERT_RETURN(t1[2] == i18); 520 | ASSERT_RETURN(t1[3] == i19); 521 | ASSERT_RETURN(t1[4] == i20); 522 | ASSERT_RETURN(t1[5] == i21); 523 | ASSERT_RETURN(t1[6] == i22); 524 | ASSERT_RETURN(t1[7] == i23); 525 | ASSERT_RETURN(t1[8] == i24); 526 | ASSERT_RETURN(t1[9] == i25); 527 | ASSERT_RETURN(t1[10] == i26); 528 | ASSERT_RETURN(t1[11] == i27); 529 | 
ASSERT_RETURN(t1[12] == i28); 530 | ASSERT_RETURN(t1[13] == i29); 531 | ASSERT_RETURN(t1[14] == i30); 532 | ASSERT_RETURN(t1[15] == i31); 533 | return TEST_SUCCESS; 534 | } 535 | result_t validate_uint8(uint8x16x2_t a, uint8_t u0, uint8_t u1, uint8_t u2, uint8_t u3, uint8_t u4, uint8_t u5, 536 | uint8_t u6, uint8_t u7, uint8_t u8, uint8_t u9, uint8_t u10, uint8_t u11, uint8_t u12, 537 | uint8_t u13, uint8_t u14, uint8_t u15, uint8_t u16, uint8_t u17, uint8_t u18, uint8_t u19, 538 | uint8_t u20, uint8_t u21, uint8_t u22, uint8_t u23, uint8_t u24, uint8_t u25, uint8_t u26, 539 | uint8_t u27, uint8_t u28, uint8_t u29, uint8_t u30, uint8_t u31) { 540 | #if defined(__riscv) || defined(__riscv__) 541 | vuint8m1_t a0 = __riscv_vget_v_u8m1x2_u8m1(a, 0); 542 | vuint8m1_t a1 = __riscv_vget_v_u8m1x2_u8m1(a, 1); 543 | const uint8_t *t0 = (const uint8_t *)&a0; 544 | const uint8_t *t1 = (const uint8_t *)&a1; 545 | #elif defined(__aarch64__) || defined(_M_ARM64) 546 | const uint8_t *t0 = (const uint8_t *)&a.val[0]; 547 | const uint8_t *t1 = (const uint8_t *)&a.val[1]; 548 | #endif 549 | ASSERT_RETURN(t0[0] == u0); 550 | ASSERT_RETURN(t0[1] == u1); 551 | ASSERT_RETURN(t0[2] == u2); 552 | ASSERT_RETURN(t0[3] == u3); 553 | ASSERT_RETURN(t0[4] == u4); 554 | ASSERT_RETURN(t0[5] == u5); 555 | ASSERT_RETURN(t0[6] == u6); 556 | ASSERT_RETURN(t0[7] == u7); 557 | ASSERT_RETURN(t0[8] == u8); 558 | ASSERT_RETURN(t0[9] == u9); 559 | ASSERT_RETURN(t0[10] == u10); 560 | ASSERT_RETURN(t0[11] == u11); 561 | ASSERT_RETURN(t0[12] == u12); 562 | ASSERT_RETURN(t0[13] == u13); 563 | ASSERT_RETURN(t0[14] == u14); 564 | ASSERT_RETURN(t0[15] == u15); 565 | ASSERT_RETURN(t1[0] == u16); 566 | ASSERT_RETURN(t1[1] == u17); 567 | ASSERT_RETURN(t1[2] == u18); 568 | ASSERT_RETURN(t1[3] == u19); 569 | ASSERT_RETURN(t1[4] == u20); 570 | ASSERT_RETURN(t1[5] == u21); 571 | ASSERT_RETURN(t1[6] == u22); 572 | ASSERT_RETURN(t1[7] == u23); 573 | ASSERT_RETURN(t1[8] == u24); 574 | ASSERT_RETURN(t1[9] == u25); 575 | 
ASSERT_RETURN(t1[10] == u26); 576 | ASSERT_RETURN(t1[11] == u27); 577 | ASSERT_RETURN(t1[12] == u28); 578 | ASSERT_RETURN(t1[13] == u29); 579 | ASSERT_RETURN(t1[14] == u30); 580 | ASSERT_RETURN(t1[15] == u31); 581 | return TEST_SUCCESS; 582 | } 583 | 584 | result_t validate_float_pair(float a, float b) { 585 | const uint32_t *ua = (const uint32_t *)&a; 586 | const uint32_t *ub = (const uint32_t *)&b; 587 | // We do an integer (binary) compare rather than a 588 | // floating point compare to take NaNs and infinities 589 | // into account as well. 590 | return (*ua) == (*ub) ? TEST_SUCCESS : TEST_FAIL; 591 | } 592 | 593 | result_t validate_double_pair(double a, double b) { 594 | const uint64_t *ua = (const uint64_t *)&a; 595 | const uint64_t *ub = (const uint64_t *)&b; 596 | // We do an integer (binary) compare rather than a 597 | // floating point compare to take NaNs and infinities 598 | // into account as well. 599 | if (std::isnan(a) && std::isnan(b)) { 600 | return TEST_SUCCESS; 601 | } 602 | return (*ua) == (*ub) ? 
TEST_SUCCESS : TEST_FAIL; 603 | } 604 | 605 | result_t validate_float(float32x4_t a, float f0, float f1, float f2, float f3) { 606 | const float *t = (const float *)&a; 607 | ASSERT_RETURN(validate_float_pair(t[0], f0)); 608 | ASSERT_RETURN(validate_float_pair(t[1], f1)); 609 | ASSERT_RETURN(validate_float_pair(t[2], f2)); 610 | ASSERT_RETURN(validate_float_pair(t[3], f3)); 611 | return TEST_SUCCESS; 612 | } 613 | 614 | result_t validate_float(float32x4x2_t a, float f0, float f1, float f2, float f3, float f4, float f5, float f6, 615 | float f7) { 616 | #if defined(__riscv) || defined(__riscv__) 617 | vfloat32m1_t a0 = __riscv_vget_v_f32m1x2_f32m1(a, 0); 618 | vfloat32m1_t a1 = __riscv_vget_v_f32m1x2_f32m1(a, 1); 619 | const float *t0 = (const float *)&a0; 620 | const float *t1 = (const float *)&a1; 621 | #elif defined(__aarch64__) || defined(_M_ARM64) 622 | const float *t0 = (const float *)&a.val[0]; 623 | const float *t1 = (const float *)&a.val[1]; 624 | #endif 625 | ASSERT_RETURN(validate_float_pair(t0[0], f0)); 626 | ASSERT_RETURN(validate_float_pair(t0[1], f1)); 627 | ASSERT_RETURN(validate_float_pair(t0[2], f2)); 628 | ASSERT_RETURN(validate_float_pair(t0[3], f3)); 629 | ASSERT_RETURN(validate_float_pair(t1[0], f4)); 630 | ASSERT_RETURN(validate_float_pair(t1[1], f5)); 631 | ASSERT_RETURN(validate_float_pair(t1[2], f6)); 632 | ASSERT_RETURN(validate_float_pair(t1[3], f7)); 633 | return TEST_SUCCESS; 634 | } 635 | 636 | result_t validate_float(float32x2_t a, float f0, float f1) { 637 | const float *t = (const float *)&a; 638 | ASSERT_RETURN(validate_float_pair(t[0], f0)); 639 | ASSERT_RETURN(validate_float_pair(t[1], f1)); 640 | return TEST_SUCCESS; 641 | } 642 | 643 | result_t validate_float(float32x2x2_t a, float f0, float f1, float f2, float f3) { 644 | #if defined(__riscv) || defined(__riscv__) 645 | vfloat32m1_t a0 = __riscv_vget_v_f32m1x2_f32m1(a, 0); 646 | vfloat32m1_t a1 = __riscv_vget_v_f32m1x2_f32m1(a, 1); 647 | const float *t0 = (const float *)&a0; 
648 | const float *t1 = (const float *)&a1; 649 | #elif defined(__aarch64__) || defined(_M_ARM64) 650 | const float *t0 = (const float *)&a.val[0]; 651 | const float *t1 = (const float *)&a.val[1]; 652 | #endif 653 | ASSERT_RETURN(validate_float_pair(t0[0], f0)); 654 | ASSERT_RETURN(validate_float_pair(t0[1], f1)); 655 | ASSERT_RETURN(validate_float_pair(t1[0], f2)); 656 | ASSERT_RETURN(validate_float_pair(t1[1], f3)); 657 | return TEST_SUCCESS; 658 | } 659 | 660 | result_t validate_float_error(float32x4_t a, float f0, float f1, float f2, float f3, float err) { 661 | const float *t = (const float *)&a; 662 | float df0 = fabsf((t[0] - f0) / f0); 663 | float df1 = fabsf((t[1] - f1) / f1); 664 | float df2 = fabsf((t[2] - f2) / f2); 665 | float df3 = fabsf((t[3] - f3) / f3); 666 | if ((std::isnan(t[0]) && std::isnan(f0)) || (t[0] == 0 && f0 == 0) || (std::isinf(t[0]) && std::isinf(f0))) { 667 | df0 = 0; 668 | } 669 | if ((std::isnan(t[1]) && std::isnan(f1)) || (t[1] == 0 && f1 == 0) || (std::isinf(t[1]) && std::isinf(f1))) { 670 | df1 = 0; 671 | } 672 | if ((std::isnan(t[2]) && std::isnan(f2)) || (t[2] == 0 && f2 == 0) || (std::isinf(t[2]) && std::isinf(f2))) { 673 | df2 = 0; 674 | } 675 | if ((std::isnan(t[3]) && std::isnan(f3)) || (t[3] == 0 && f3 == 0) || (std::isinf(t[3]) && std::isinf(f3))) { 676 | df3 = 0; 677 | } 678 | ASSERT_RETURN(df0 < err); 679 | ASSERT_RETURN(df1 < err); 680 | ASSERT_RETURN(df2 < err); 681 | ASSERT_RETURN(df3 < err); 682 | return TEST_SUCCESS; 683 | } 684 | 685 | result_t validate_float_error(float32x2_t a, float f0, float f1, float err) { 686 | const float *t = (const float *)&a; 687 | float df0 = fabsf((t[0] - f0) / f0); 688 | float df1 = fabsf((t[1] - f1) / f1); 689 | if ((std::isnan(t[0]) && std::isnan(f0)) || (t[0] == 0 && f0 == 0) || (std::isinf(t[0]) && std::isinf(f0))) { 690 | df0 = 0; 691 | } 692 | if ((std::isnan(t[1]) && std::isnan(f1)) || (t[1] == 0 && f1 == 0) || (std::isinf(t[1]) && std::isinf(f1))) { 693 | df1 = 0; 694 | } 
695 | ASSERT_RETURN(df0 < err); 696 | ASSERT_RETURN(df1 < err); 697 | return TEST_SUCCESS; 698 | } 699 | 700 | result_t validate_float_error(float32_t a, float f0, float err) { 701 | float df0 = fabsf((a - f0) / f0); 702 | if ((std::isnan(a) && std::isnan(f0)) || (a == 0 && f0 == 0) || (std::isinf(a) && std::isinf(f0))) { 703 | df0 = 0; 704 | } 705 | ASSERT_RETURN(df0 < err); 706 | return TEST_SUCCESS; 707 | } 708 | 709 | result_t validate_double(float64x2_t a, double d0, double d1) { 710 | const double *t = (const double *)&a; 711 | ASSERT_RETURN(validate_float_pair(t[0], d0)); 712 | ASSERT_RETURN(validate_float_pair(t[1], d1)); 713 | return TEST_SUCCESS; 714 | } 715 | 716 | result_t validate_double(float64x2x2_t a, double d0, double d1, double d2, double d3) { 717 | #if defined(__riscv) || defined(__riscv__) 718 | vfloat64m1_t a0 = __riscv_vget_v_f64m1x2_f64m1(a, 0); 719 | vfloat64m1_t a1 = __riscv_vget_v_f64m1x2_f64m1(a, 1); 720 | const double *t0 = (const double *)&a0; 721 | const double *t1 = (const double *)&a1; 722 | #elif defined(__aarch64__) || defined(_M_ARM64) 723 | const double *t0 = (const double *)&a.val[0]; 724 | const double *t1 = (const double *)&a.val[1]; 725 | #endif 726 | ASSERT_RETURN(validate_float_pair(t0[0], d0)); 727 | ASSERT_RETURN(validate_float_pair(t0[1], d1)); 728 | ASSERT_RETURN(validate_float_pair(t1[0], d2)); 729 | ASSERT_RETURN(validate_float_pair(t1[1], d3)); 730 | return TEST_SUCCESS; 731 | } 732 | 733 | result_t validate_double(float64x1_t a, double d0) { 734 | const double *t = (const double *)&a; 735 | ASSERT_RETURN(validate_float_pair(t[0], d0)); 736 | return TEST_SUCCESS; 737 | } 738 | 739 | result_t validate_double(float64x1x2_t a, double d0, double d1) { 740 | #if defined(__riscv) || defined(__riscv__) 741 | vfloat64m1_t a0 = __riscv_vget_v_f64m1x2_f64m1(a, 0); 742 | vfloat64m1_t a1 = __riscv_vget_v_f64m1x2_f64m1(a, 1); 743 | const double *t0 = (const double *)&a0; 744 | const double *t1 = (const double *)&a1; 745 | #elif 
defined(__aarch64__) || defined(_M_ARM64) 746 | const double *t0 = (const double *)&a.val[0]; 747 | const double *t1 = (const double *)&a.val[1]; 748 | #endif 749 | ASSERT_RETURN(validate_float_pair(t0[0], d0)); 750 | ASSERT_RETURN(validate_float_pair(t1[0], d1)); 751 | return TEST_SUCCESS; 752 | } 753 | 754 | result_t validate_double_error(float64x2_t a, double d0, double d1, double err) { 755 | const double *t = (const double *)&a; 756 | double td0 = fabs((t[0] - d0) / d0); 757 | double td1 = fabs((t[1] - d1) / d1); 758 | if (std::isnan(t[0]) && std::isnan(d0)) { 759 | td0 = 0; 760 | } 761 | if (std::isnan(t[1]) && std::isnan(d1)) { 762 | td1 = 0; 763 | } 764 | ASSERT_RETURN(td0 < err); 765 | ASSERT_RETURN(td1 < err); 766 | return TEST_SUCCESS; 767 | } 768 | 769 | result_t validate_double_error(float64x1_t a, double d0, double err) { 770 | const double *t = (const double *)&a; 771 | double td0 = fabs((t[0] - d0) / d0); 772 | if (std::isnan(t[0]) && std::isnan(d0)) { 773 | td0 = 0; 774 | } 775 | ASSERT_RETURN(td0 < err); 776 | return TEST_SUCCESS; 777 | } 778 | 779 | result_t validate_double_error(double a, double d0, double err) { 780 | double df0 = abs((a - d0) / d0); 781 | if ((std::isnan(a) && std::isnan(d0)) || (a == 0 && d0 == 0) || (std::isinf(a) && std::isinf(d0))) { 782 | df0 = 0; 783 | } 784 | ASSERT_RETURN(df0 < err); 785 | return TEST_SUCCESS; 786 | } 787 | 788 | int8_t saturate_int8(int a) { 789 | if (a > INT8_MAX) 790 | return (int8_t)INT8_MAX; 791 | if (a < INT8_MIN) 792 | return (int8_t)INT8_MIN; 793 | return (int8_t)a; 794 | } 795 | uint8_t saturate_uint8(unsigned int a) { 796 | if (a > UINT8_MAX) { 797 | return (uint8_t)UINT8_MAX; 798 | } else if (a < 0) { 799 | return 0; 800 | } 801 | return (uint8_t)a; 802 | } 803 | int16_t saturate_int16(int a) { 804 | if (a > INT16_MAX) 805 | return (int16_t)INT16_MAX; 806 | if (a < INT16_MIN) 807 | return (int16_t)INT16_MIN; 808 | return (int16_t)a; 809 | } 810 | uint16_t saturate_uint16(unsigned int a) { 
811 | if (a > UINT16_MAX) { 812 | return (uint16_t)UINT16_MAX; 813 | } else if (a < 0) { 814 | return 0; 815 | } 816 | return a; 817 | } 818 | int32_t saturate_int32(int64_t a) { 819 | if (a > INT32_MAX) 820 | return (int32_t)INT32_MAX; 821 | if (a < INT32_MIN) 822 | return (int32_t)INT32_MIN; 823 | return (int32_t)a; 824 | } 825 | uint32_t saturate_uint32(uint64_t a) { 826 | if (a > UINT32_MAX) { 827 | return (uint32_t)UINT32_MAX; 828 | } else if (a < 0) { 829 | return 0; 830 | } 831 | return a; 832 | } 833 | 834 | float ranf(float low, float high) { 835 | float rand_float = (float)rand() / (float)RAND_MAX; 836 | return rand_float * (high - low) + low; 837 | } 838 | 839 | // rounding mode 840 | float bankers_rounding(float val) { 841 | if (val < 0) 842 | return -bankers_rounding(-val); 843 | float ret; 844 | float round_down = floorf(val); // Round down value 845 | float round_up = ceilf(val); // Round up value 846 | float diff_down = val - round_down; 847 | float diff_up = round_up - val; 848 | if (diff_down < diff_up) { 849 | /* If it's closer to the round down value, then use it */ 850 | ret = round_down; 851 | } else if (diff_down > diff_up) { 852 | /* If it's closer to the round up value, then use it */ 853 | ret = round_up; 854 | } else { 855 | /* If it's equidistant between round up and round down value, pick the 856 | * one which is an even number */ 857 | float half = round_down / 2; 858 | if (half != floorf(half)) { 859 | /* If the round down value is odd, return the round up value */ 860 | ret = round_up; 861 | } else { 862 | /* If the round up value is odd, return the round down value */ 863 | ret = round_down; 864 | } 865 | } 866 | return ret; 867 | } 868 | 869 | double bankers_rounding(double val) { 870 | if (val < 0) 871 | return -bankers_rounding(-val); 872 | double ret; 873 | double round_down = floor(val); // Round down value 874 | double round_up = ceil(val); // Round up value 875 | double diff_down = val - round_down; 876 | double diff_up = 
round_up - val; 877 | if (diff_down < diff_up) { 878 | /* If it's closer to the round down value, then use it */ 879 | ret = round_down; 880 | } else if (diff_down > diff_up) { 881 | /* If it's closer to the round up value, then use it */ 882 | ret = round_up; 883 | } else { 884 | /* If it's equidistant between round up and round down value, pick the 885 | * one which is an even number */ 886 | double half = round_down / 2; 887 | if (half != floor(half)) { 888 | /* If the round down value is odd, return the round up value */ 889 | ret = round_up; 890 | } else { 891 | /* If the round up value is odd, return the round down value */ 892 | ret = round_down; 893 | } 894 | } 895 | return ret; 896 | } 897 | 898 | float maxnm(float a, float b) { 899 | if (std::isnan(a) && !std::isnan(b)) { 900 | a = b; 901 | } else if (!std::isnan(a) && std::isnan(b)) { 902 | b = a; 903 | } 904 | return a > b ? a : b; 905 | } 906 | float minnm(float a, float b) { 907 | if (std::isnan(a) && !std::isnan(b)) { 908 | a = b; 909 | } else if (!std::isnan(a) && std::isnan(b)) { 910 | b = a; 911 | } 912 | return a < b ? a : b; 913 | } 914 | double maxnm(double a, double b) { 915 | if (std::isnan(a) && !std::isnan(b)) { 916 | a = b; 917 | } else if (!std::isnan(a) && std::isnan(b)) { 918 | b = a; 919 | } 920 | return a > b ? a : b; 921 | } 922 | double minnm(double a, double b) { 923 | if (std::isnan(a) && !std::isnan(b)) { 924 | a = b; 925 | } else if (!std::isnan(a) && std::isnan(b)) { 926 | b = a; 927 | } 928 | return a < b ? a : b; 929 | } 930 | 931 | } // namespace NEON2RVV 932 | --------------------------------------------------------------------------------