├── misc └── object_qsort-perf.jpg ├── benchmarks ├── bench-all.cpp ├── meson.build ├── bench-vqsort.cpp ├── bench-keyvalue.hpp ├── bench-qselect.hpp ├── bench-partial-qsort.hpp ├── bench.h ├── bench-qsort.hpp ├── bench-ipp.cpp ├── bench-argsort.hpp └── bench-objsort.hpp ├── examples ├── icl-16bit.cpp ├── spr-16bit.cpp ├── skx-avx2.cpp ├── Makefile └── avx512-kv.cpp ├── SECURITY.md ├── src ├── avx512-64bit-qsort.hpp ├── avx512-64bit-argsort.hpp ├── xss-custom-float.h ├── xss-common-comparators.hpp ├── xss-common-includes.h ├── avx512-16bit-common.h ├── avx512fp16-16bit-qsort.hpp ├── README.md ├── x86simdsort-static-incl.h ├── xss-network-qsort.hpp ├── xss-pivot-selection.hpp └── xss-optimal-networks.hpp ├── .gitignore ├── .github └── workflows │ ├── linting.yml │ ├── build-test-on-32bit.sh │ ├── scorecard.yml │ ├── build-numpy.yml │ └── c-cpp.yml ├── scripts ├── bench-compare.sh └── branch-compare.sh ├── tests ├── meson.build ├── test-objqsort.cpp ├── test-qsort-common.h └── test-qsort.cpp ├── lib ├── x86simdsort-spr.cpp ├── meson.build ├── x86simdsort-icl.cpp ├── x86simdsort-internal.h ├── x86simdsort-skx.cpp ├── x86simdsort-avx2.cpp ├── list-of-exported-symbols.txt ├── x86simdsort.h └── x86simdsort-scalar.h ├── meson_options.txt ├── Makefile ├── utils ├── custom-compare.h └── rand_array.h ├── LICENSE.md ├── CONTRIBUTING.md ├── example.c ├── run-bench.py ├── .clang-format ├── meson.build ├── CODE_OF_CONDUCT.md └── README.md /misc/object_qsort-perf.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/intel/x86-simd-sort/HEAD/misc/object_qsort-perf.jpg -------------------------------------------------------------------------------- /benchmarks/bench-all.cpp: -------------------------------------------------------------------------------- 1 | #include "bench.h" 2 | #include "bench-argsort.hpp" 3 | #include "bench-partial-qsort.hpp" 4 | #include "bench-qselect.hpp" 5 | #include "bench-qsort.hpp" 6 | 
#include "bench-keyvalue.hpp" 7 | #include "bench-objsort.hpp" 8 | -------------------------------------------------------------------------------- /examples/icl-16bit.cpp: -------------------------------------------------------------------------------- 1 | #include "x86simdsort-static-incl.h" 2 | 3 | int main() 4 | { 5 | const int size = 1000; 6 | short arr[size]; 7 | x86simdsortStatic::qsort(arr, size); 8 | x86simdsortStatic::qselect(arr, 10, size); 9 | x86simdsortStatic::partial_qsort(arr, 10, size); 10 | return 0; 11 | } 12 | -------------------------------------------------------------------------------- /examples/spr-16bit.cpp: -------------------------------------------------------------------------------- 1 | #include "x86simdsort-static-incl.h" 2 | 3 | int main() 4 | { 5 | const int size = 1000; 6 | _Float16 arr[size]; 7 | x86simdsortStatic::qsort(arr, size); 8 | x86simdsortStatic::qselect(arr, 10, size); 9 | x86simdsortStatic::partial_qsort(arr, 10, size); 10 | return 0; 11 | } 12 | -------------------------------------------------------------------------------- /SECURITY.md: -------------------------------------------------------------------------------- 1 | # Security Policy 2 | 3 | ## Supported Versions 4 | 5 | | Version | Supported | 6 | | ------- | ------------------ | 7 | | 5.0 | :white_check_mark: | 8 | | 4.0 | :white_check_mark: | 9 | | < 4.0 | :x: | 10 | 11 | ## Reporting a Vulnerability 12 | 13 | Report any vulnerability to raghuveer.devulapalli@intel.com 14 | -------------------------------------------------------------------------------- /src/avx512-64bit-qsort.hpp: -------------------------------------------------------------------------------- 1 | /******************************************************************* 2 | * Copyright (C) 2022 Intel Corporation 3 | * SPDX-License-Identifier: BSD-3-Clause 4 | * Authors: Raghuveer Devulapalli 5 | * ****************************************************************/ 6 | 7 | #ifndef AVX512_QSORT_64BIT 
8 | #define AVX512_QSORT_64BIT 9 | 10 | #include "avx512-64bit-common.h" 11 | 12 | #endif // AVX512_QSORT_64BIT 13 | -------------------------------------------------------------------------------- /src/avx512-64bit-argsort.hpp: -------------------------------------------------------------------------------- 1 | /******************************************************************* 2 | * Copyright (C) 2022 Intel Corporation 3 | * SPDX-License-Identifier: BSD-3-Clause 4 | * Authors: Raghuveer Devulapalli 5 | * ****************************************************************/ 6 | 7 | #ifndef AVX512_ARGSORT_64BIT 8 | #define AVX512_ARGSORT_64BIT 9 | 10 | #include "avx512-64bit-common.h" 11 | #include "xss-common-argsort.h" 12 | 13 | #endif // AVX512_ARGSORT_64BIT 14 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # bench-compare 2 | .bench-compare 3 | .bench 4 | # Prerequisites 5 | *.d 6 | 7 | # Compiled Object files 8 | *.slo 9 | *.lo 10 | *.o 11 | *.obj 12 | 13 | # Precompiled Headers 14 | *.gch 15 | *.pch 16 | 17 | # Compiled Dynamic libraries 18 | *.so 19 | *.dylib 20 | *.dll 21 | 22 | # Fortran module files 23 | *.mod 24 | *.smod 25 | 26 | # Compiled Static libraries 27 | *.lai 28 | *.la 29 | *.a 30 | *.lib 31 | 32 | # Executables 33 | *.exe 34 | *.out 35 | *.app 36 | 37 | # Build or IDE artifacts 38 | **/.vscode 39 | /builddir/ 40 | /testexe 41 | /benchexe 42 | -------------------------------------------------------------------------------- /.github/workflows/linting.yml: -------------------------------------------------------------------------------- 1 | name: Lint 2 | 3 | on: 4 | push: 5 | branches: [ "main" ] 6 | pull_request: 7 | branches: [ "main" ] 8 | 9 | permissions: read-all 10 | 11 | jobs: 12 | clang-format: 13 | 14 | runs-on: intel-ubuntu-24.04 15 | 16 | steps: 17 | - uses: 
actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 18 | 19 | - name: Install dependencies 20 | run: | 21 | sudo apt update 22 | sudo apt -y install clang-format 23 | 24 | - name: Lint 25 | run: | 26 | find . -type f | grep -P ".*\.(c|cpp|h|hpp)\b" | xargs clang-format -style=file --dry-run -Werror 27 | -------------------------------------------------------------------------------- /scripts/bench-compare.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | branch=$(git rev-parse --abbrev-ref HEAD) 4 | SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) 5 | cd $SCRIPT_DIR/.. 6 | 7 | ## Get google-benchmark 8 | mkdir -p .bench 9 | if [ ! -d .bench/google-benchmark ]; then 10 | git clone https://github.com/google/benchmark .bench/google-benchmark 11 | fi 12 | compare=$(realpath .bench/google-benchmark/tools/compare.py) 13 | 14 | meson setup -Dbuild_benchmarks=true -Dbuild_vqsortbench=true --warnlevel 0 --buildtype release builddir-${branch} 15 | cd builddir-${branch} 16 | ninja 17 | $compare filters ./benchexe $1 $2 --benchmark_repetitions=$3 18 | -------------------------------------------------------------------------------- /examples/skx-avx2.cpp: -------------------------------------------------------------------------------- 1 | #include "x86simdsort-static-incl.h" 2 | 3 | int main() 4 | { 5 | const int size = 1000; 6 | double arrd[size]; 7 | float arrf[size]; 8 | x86simdsortStatic::qsort(arrf, size); 9 | x86simdsortStatic::qsort(arrd, size); 10 | x86simdsortStatic::qselect(arrf, 10, size); 11 | x86simdsortStatic::qselect(arrd, 10, size); 12 | x86simdsortStatic::partial_qsort(arrf, 10, size); 13 | x86simdsortStatic::partial_qsort(arrd, 10, size); 14 | auto arg1 = x86simdsortStatic::argsort(arrf, size); 15 | auto arg2 = x86simdsortStatic::argselect(arrf, 10, size); 16 | auto arg3 = x86simdsortStatic::argsort(arrd, size); 17 | auto arg4 = 
x86simdsortStatic::argselect(arrd, 10, size); 18 | return 0; 19 | } 20 | -------------------------------------------------------------------------------- /examples/Makefile: -------------------------------------------------------------------------------- 1 | CXX ?= g++-13 2 | CFLAGS = -I../src -std=c++17 -O3 3 | EXE = kvsort qsortavx2 qsortavx512 qsortspr qsorticl 4 | 5 | default: all 6 | all : $(EXE) 7 | 8 | kvsort: avx512-kv.cpp 9 | $(CXX) -o kvsort -mavx512vl -mavx512dq $(CFLAGS) avx512-kv.cpp 10 | 11 | qsortavx512: skx-avx2.cpp 12 | $(CXX) -o qsortavx512 -mavx512vl -mavx512dq $(CFLAGS) skx-avx2.cpp 13 | 14 | qsortavx2: skx-avx2.cpp 15 | $(CXX) -o qsortavx2 -mavx2 $(CFLAGS) skx-avx2.cpp 16 | 17 | qsorticl: icl-16bit.cpp 18 | $(CXX) -o qsorticl -mavx512vl -mavx512bw -mavx512dq -mavx512vbmi2 $(CFLAGS) icl-16bit.cpp 19 | 20 | qsortspr: spr-16bit.cpp 21 | $(CXX) -o qsortspr -mavx512vl -mavx512dq -mavx512vbmi2 -mavx512fp16 $(CFLAGS) spr-16bit.cpp 22 | 23 | clean: 24 | $(RM) $(EXE) 25 | -------------------------------------------------------------------------------- /benchmarks/meson.build: -------------------------------------------------------------------------------- 1 | libbench = [] 2 | 3 | libbench += static_library('bench_qsort', 4 | files( 5 | 'bench-all.cpp', 6 | ), 7 | dependencies: gbench_dep, 8 | include_directories : [src, lib, utils], 9 | cpp_args : ['-O3'], 10 | ) 11 | 12 | if benchvq and fs.is_file('../highway/hwy/contrib/sort/vqsort-inl.h') 13 | hwy = include_directories('../highway') 14 | libbench += static_library('bench_vqsort', 15 | files( 16 | 'bench-vqsort.cpp', 17 | ), 18 | dependencies: gbench_dep, 19 | include_directories : [src, lib, utils, hwy], 20 | cpp_args : ['-O3', '-march=native'], 21 | ) 22 | endif 23 | 24 | if benchipp 25 | libbench += static_library('bench_ippsort', 26 | files( 27 | 'bench-ipp.cpp', 28 | ), 29 | dependencies: gbench_dep, 30 | include_directories : [src, lib, utils], 31 | cpp_args : ['-O3', '-march=native'], 32 | ) 
33 | endif 34 | -------------------------------------------------------------------------------- /.github/workflows/build-test-on-32bit.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ## Set up environment 4 | /opt/python/cp39-cp39/bin/python -mvenv venv 5 | source venv/bin/activate 6 | python3 -m pip install meson ninja 7 | export CXX=g++ 8 | 9 | ## Install google test from source 10 | git clone https://github.com/google/googletest.git -b v1.14.0 11 | cd googletest 12 | mkdir build 13 | cd build 14 | cmake .. -DBUILD_GMOCK=OFF 15 | make install 16 | 17 | ## Install Intel SDE 18 | curl -o /tmp/sde.tar.xz https://downloadmirror.intel.com/859732/sde-external-9.58.0-2025-06-16-lin.tar.xz 19 | mkdir /tmp/sde && tar -xvf /tmp/sde.tar.xz -C /tmp/sde/ 20 | mv /tmp/sde/* /opt/sde && ln -s /opt/sde/sde /usr/bin/sde 21 | 22 | ## Build x86-simd-sort 23 | cd /xss 24 | meson setup -Dbuild_tests=true --warnlevel 2 --werror --buildtype release builddir 25 | cd builddir 26 | ninja 27 | 28 | ## Run tests 29 | sde -tgl -- ./testexe 30 | sde -skl -- ./testexe 31 | -------------------------------------------------------------------------------- /examples/avx512-kv.cpp: -------------------------------------------------------------------------------- 1 | #include "x86simdsort-static-incl.h" 2 | 3 | int main() 4 | { 5 | const int size = 1000; 6 | int64_t arr1[size]; 7 | uint64_t arr2[size]; 8 | double arr3[size]; 9 | float arr4[size]; 10 | x86simdsortStatic::keyvalue_qsort(arr1, arr1, size); 11 | x86simdsortStatic::keyvalue_qsort(arr1, arr2, size); 12 | x86simdsortStatic::keyvalue_qsort(arr1, arr3, size); 13 | x86simdsortStatic::keyvalue_qsort(arr2, arr1, size); 14 | x86simdsortStatic::keyvalue_qsort(arr2, arr2, size); 15 | x86simdsortStatic::keyvalue_qsort(arr2, arr3, size); 16 | x86simdsortStatic::keyvalue_qsort(arr3, arr1, size); 17 | x86simdsortStatic::keyvalue_qsort(arr3, arr2, size); 18 | 
x86simdsortStatic::keyvalue_qsort(arr1, arr4, size); 19 | x86simdsortStatic::keyvalue_qsort(arr2, arr4, size); 20 | x86simdsortStatic::keyvalue_qsort(arr3, arr4, size); 21 | return 0; 22 | return 0; 23 | } 24 | -------------------------------------------------------------------------------- /tests/meson.build: -------------------------------------------------------------------------------- 1 | libtests = [] 2 | 3 | # Add compile flags when needed for the ASAN CI run 4 | testargs = [] 5 | if get_option('asan_ci_dont_validate') 6 | if get_option('fatal_sanitizers') 7 | testargs = ['-DXSS_ASAN_CI_NOCHECK=true'] 8 | else 9 | error('asan_ci_dont_validate is only for the ASAN CI, should be false otherwise!') 10 | endif 11 | endif 12 | 13 | libtests += static_library('tests_qsort', 14 | files('test-qsort.cpp', ), 15 | dependencies: [omp_dep, gtest_dep], 16 | include_directories : [src, lib, utils], 17 | cpp_args : [testargs], 18 | ) 19 | 20 | libtests += static_library('tests_kvsort', 21 | files('test-keyvalue.cpp', ), 22 | dependencies: [omp_dep, gtest_dep], 23 | include_directories : [src, lib, utils], 24 | cpp_args : [testargs], 25 | ) 26 | 27 | libtests += static_library('tests_objsort', 28 | files('test-objqsort.cpp', ), 29 | dependencies: [omp_dep, gtest_dep], 30 | include_directories : [src, lib, utils], 31 | cpp_args : [testargs], 32 | ) 33 | -------------------------------------------------------------------------------- /lib/x86simdsort-spr.cpp: -------------------------------------------------------------------------------- 1 | // SPR specific routines: 2 | #include "x86simdsort-static-incl.h" 3 | #include "x86simdsort-internal.h" 4 | 5 | namespace xss { 6 | namespace fp16_spr { 7 | template <> 8 | void qsort(_Float16 *arr, size_t size, bool hasnan, bool descending) 9 | { 10 | x86simdsortStatic::qsort(arr, size, hasnan, descending); 11 | } 12 | template <> 13 | void qselect(_Float16 *arr, 14 | size_t k, 15 | size_t arrsize, 16 | bool hasnan, 17 | bool 
descending) 18 | { 19 | x86simdsortStatic::qselect(arr, k, arrsize, hasnan, descending); 20 | } 21 | template <> 22 | void partial_qsort(_Float16 *arr, 23 | size_t k, 24 | size_t arrsize, 25 | bool hasnan, 26 | bool descending) 27 | { 28 | x86simdsortStatic::partial_qsort(arr, k, arrsize, hasnan, descending); 29 | } 30 | } // namespace fp16_spr 31 | } // namespace xss 32 | -------------------------------------------------------------------------------- /benchmarks/bench-vqsort.cpp: -------------------------------------------------------------------------------- 1 | #include "bench.h" 2 | #define VQSORT_ONLY_STATIC 1 3 | #include "hwy/contrib/sort/vqsort-inl.h" 4 | 5 | template 6 | static void vqsort(benchmark::State &state, Args &&...args) 7 | { 8 | // Get args 9 | auto args_tuple = std::make_tuple(std::move(args)...); 10 | size_t arrsize = std::get<0>(args_tuple); 11 | std::string arrtype = std::get<1>(args_tuple); 12 | // set up array 13 | std::vector arr = get_array(arrtype, arrsize); 14 | std::vector arr_bkp = arr; 15 | // benchmark 16 | for (auto _ : state) { 17 | hwy::HWY_NAMESPACE::VQSortStatic( 18 | arr.data(), arrsize, hwy::SortAscending()); 19 | state.PauseTiming(); 20 | arr = arr_bkp; 21 | state.ResumeTiming(); 22 | } 23 | } 24 | 25 | BENCH_SORT(vqsort, uint64_t) 26 | BENCH_SORT(vqsort, int64_t) 27 | BENCH_SORT(vqsort, uint32_t) 28 | BENCH_SORT(vqsort, int32_t) 29 | BENCH_SORT(vqsort, uint16_t) 30 | BENCH_SORT(vqsort, int16_t) 31 | BENCH_SORT(vqsort, float) 32 | BENCH_SORT(vqsort, double) 33 | -------------------------------------------------------------------------------- /meson_options.txt: -------------------------------------------------------------------------------- 1 | option('build_tests', type : 'boolean', value : false, 2 | description : 'Build test suite (default: "false").') 3 | option('build_benchmarks', type : 'boolean', value : false, 4 | description : 'Build benchmarking suite (default: "false").') 5 | option('build_ippbench', type : 
'boolean', value : false, 6 | description : 'Add IPP sort to benchmarks (default: "false").') 7 | option('build_vqsortbench', type : 'boolean', value : true, 8 | description : 'Add google vqsort to benchmarks (default: "true").') 9 | option('use_openmp', type : 'boolean', value : false, 10 | description : 'Use OpenMP to accelerate key-value sort (default: "false").') 11 | option('lib_type', type : 'string', value : 'shared', 12 | description : 'Library type: shared or static (default: "shared").') 13 | option('fatal_sanitizers', type : 'boolean', value : false, 14 | description : 'If sanitizers are enabled, should all issues be considered fatal? (default: "false").') 15 | option('asan_ci_dont_validate', type : 'boolean', value : false, 16 | description : 'Only for speeding up ASAN CI, do not turn on otherwise') 17 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | test: 2 | meson setup -Dbuild_tests=true -Duse_openmp=false --warnlevel 2 --werror --buildtype release builddir 3 | cd builddir && ninja 4 | 5 | test_openmp: 6 | meson setup -Dbuild_tests=true -Duse_openmp=true --warnlevel 2 --werror --buildtype release builddir 7 | cd builddir && ninja 8 | 9 | test_asan: 10 | meson setup -Dbuild_tests=true -Duse_openmp=true -Db_sanitize=address,undefined -Dfatal_sanitizers=true -Db_lundef=false -Dasan_ci_dont_validate=true --warnlevel 0 --buildtype debugoptimized builddir 11 | cd builddir && ninja 12 | 13 | bench: 14 | meson setup -Dbuild_benchmarks=true --warnlevel 2 --werror --buildtype release builddir 15 | cd builddir && ninja 16 | 17 | debug: 18 | meson setup -Dbuild_tests=true --warnlevel 2 --werror --buildtype debug debug 19 | cd debug && ninja 20 | 21 | sharedlib: 22 | meson setup --warnlevel 2 --werror --buildtype release builddir 23 | cd builddir && ninja 24 | 25 | staticlib: 26 | meson setup -Dlib_type=static --warnlevel 2 
--werror --buildtype release builddir 27 | cd builddir && ninja 28 | 29 | install: 30 | meson setup --warnlevel 2 --werror --buildtype release builddir 31 | cd builddir && meson install 32 | 33 | clean: 34 | $(RM) -rf $(TESTOBJS) $(BENCHOBJS) $(UTILOBJS) testexe benchexe builddir debug 35 | -------------------------------------------------------------------------------- /lib/meson.build: -------------------------------------------------------------------------------- 1 | libtargets = [] 2 | 3 | if cpp.has_argument('-march=haswell') 4 | libtargets += static_library('libavx', 5 | files( 6 | 'x86simdsort-avx2.cpp', 7 | ), 8 | include_directories : [src], 9 | cpp_args : ['-march=haswell'], 10 | gnu_symbol_visibility : 'inlineshidden', 11 | dependencies: [omp_dep], 12 | ) 13 | endif 14 | 15 | if cpp.has_argument('-march=skylake-avx512') 16 | libtargets += static_library('libskx', 17 | files( 18 | 'x86simdsort-skx.cpp', 19 | ), 20 | include_directories : [src], 21 | cpp_args : ['-march=skylake-avx512'], 22 | gnu_symbol_visibility : 'inlineshidden', 23 | dependencies: [omp_dep], 24 | ) 25 | endif 26 | 27 | if cpp.has_argument('-march=icelake-client') 28 | libtargets += static_library('libicl', 29 | files( 30 | 'x86simdsort-icl.cpp', 31 | ), 32 | include_directories : [src], 33 | cpp_args : ['-march=icelake-client'], 34 | gnu_symbol_visibility : 'inlineshidden', 35 | dependencies: [omp_dep], 36 | ) 37 | endif 38 | 39 | if cancompilefp16 40 | libtargets += static_library('libspr', 41 | files( 42 | 'x86simdsort-spr.cpp', 43 | ), 44 | include_directories : [src], 45 | cpp_args : ['-march=sapphirerapids'], 46 | gnu_symbol_visibility : 'inlineshidden', 47 | dependencies: [omp_dep], 48 | ) 49 | endif 50 | 51 | install_headers('x86simdsort.h') 52 | -------------------------------------------------------------------------------- /utils/custom-compare.h: -------------------------------------------------------------------------------- 1 | #ifndef UTILS_CUSTOM_COMPARE 2 | #define 
UTILS_CUSTOM_COMPARE 3 | 4 | #include 5 | #include 6 | #include "xss-custom-float.h" 7 | 8 | /* 9 | * Custom comparator class to handle NAN's: treats NAN > INF 10 | */ 11 | template 12 | struct compare { 13 | static constexpr auto op = Comparator {}; 14 | bool operator()(const T a, const T b) 15 | { 16 | if constexpr (xss::fp::is_floating_point_v) { 17 | T inf = xss::fp::infinity(); 18 | T one = (T)1.0; 19 | if (!xss::fp::isunordered(a, b)) { return op(a, b); } 20 | else if ((xss::fp::isnan(a)) && (!xss::fp::isnan(b))) { 21 | return b == inf ? op(inf, one) : op(inf, b); 22 | } 23 | else if ((!xss::fp::isnan(a)) && (xss::fp::isnan(b))) { 24 | return a == inf ? op(one, inf) : op(a, inf); 25 | } 26 | else { 27 | return op(one, one); 28 | } 29 | } 30 | else { 31 | return op(a, b); 32 | } 33 | } 34 | }; 35 | 36 | template 37 | struct compare_arg { 38 | compare_arg(const T *arr) 39 | { 40 | this->arr = arr; 41 | } 42 | bool operator()(const int64_t a, const int64_t b) 43 | { 44 | return compare()(arr[a], arr[b]); 45 | } 46 | const T *arr; 47 | }; 48 | 49 | #endif // UTILS_CUSTOM_COMPARE -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | BSD 3-Clause License 2 | 3 | Copyright (c) 2022, Intel. All rights reserved. 4 | 5 | Redistribution and use in source and binary forms, with or without 6 | modification, are permitted provided that the following conditions are met: 7 | 8 | 1. Redistributions of source code must retain the above copyright notice, this 9 | list of conditions and the following disclaimer. 10 | 11 | 2. Redistributions in binary form must reproduce the above copyright notice, 12 | this list of conditions and the following disclaimer in the documentation 13 | and/or other materials provided with the distribution. 14 | 15 | 3. 
Neither the name of the copyright holder nor the names of its 16 | contributors may be used to endorse or promote products derived from 17 | this software without specific prior written permission. 18 | 19 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 20 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 21 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 22 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 23 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 24 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 25 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 26 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 27 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 28 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 29 | -------------------------------------------------------------------------------- /scripts/branch-compare.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) 4 | BASE_DIR=$(dirname $SCRIPT_DIR) 5 | branch=$(git rev-parse --abbrev-ref HEAD) 6 | #br_commit=$(git rev-parse $branch) 7 | #main_commit=$(git rev-parse main) 8 | basebranch=$1 9 | echo "Comparing $basebranch branch with $branch" 10 | 11 | build_branch() { 12 | dir_name=$1 13 | if [ ! -d $dir_name ]; then 14 | git clone -b $dir_name ${BASE_DIR} $dir_name 15 | else 16 | # if it exists, just update it 17 | cd $dir_name 18 | git fetch origin 19 | # rebase fails with conflict, delete and start over 20 | # (the rebase runs inside the condition: under `set -e` a bare failing command exits the script before its status can be checked) 21 | if ! git rebase origin/$dir_name; then 22 | cd .. 23 | rm -rf $dir_name 24 | git clone -b $dir_name ${BASE_DIR} $dir_name 25 | else 26 | cd .. 
27 | fi 28 | fi 29 | cd $dir_name 30 | meson setup -Dbuild_benchmarks=true -Duse_openmp=true --warnlevel 0 --buildtype release builddir 31 | cd builddir 32 | ninja 33 | cd ../../ 34 | } 35 | 36 | mkdir -p .bench 37 | cd .bench 38 | if [ ! -d google-benchmark ]; then 39 | git clone https://github.com/google/benchmark google-benchmark 40 | fi 41 | compare=$(realpath google-benchmark/tools/compare.py) 42 | build_branch $branch 43 | build_branch $basebranch 44 | contender=$(realpath ${branch}/builddir/benchexe) 45 | baseline=$(realpath ${basebranch}/builddir/benchexe) 46 | 47 | if [ -z "$3" ]; then 48 | echo "Comparing all benchmarks .." 49 | $compare benchmarks $baseline $contender --benchmark_repetitions=$2 50 | else 51 | echo "Comparing benchmark $2 .." 52 | $compare benchmarksfiltered $baseline $2 $contender $2 --benchmark_repetitions=$3 53 | fi 54 | -------------------------------------------------------------------------------- /benchmarks/bench-keyvalue.hpp: -------------------------------------------------------------------------------- 1 | #include "x86simdsort-scalar.h" 2 | 3 | template 4 | static void scalarkvsort(benchmark::State &state, Args &&...args) 5 | { 6 | // Get args 7 | auto args_tuple = std::make_tuple(std::move(args)...); 8 | size_t arrsize = std::get<0>(args_tuple); 9 | std::string arrtype = std::get<1>(args_tuple); 10 | // set up array 11 | std::vector key = get_array(arrtype, arrsize); 12 | std::vector val = get_array("random", arrsize); 13 | std::vector key_bkp = key; 14 | // benchmark 15 | for (auto _ : state) { 16 | xss::scalar::keyvalue_qsort( 17 | key.data(), val.data(), arrsize, false, false); 18 | state.PauseTiming(); 19 | key = key_bkp; 20 | state.ResumeTiming(); 21 | } 22 | } 23 | 24 | template 25 | static void simdkvsort(benchmark::State &state, Args &&...args) 26 | { 27 | auto args_tuple = std::make_tuple(std::move(args)...); 28 | size_t arrsize = std::get<0>(args_tuple); 29 | std::string arrtype = std::get<1>(args_tuple); 30 | // 
set up array 31 | std::vector key = get_array(arrtype, arrsize); 32 | std::vector val = get_array("random", arrsize); 33 | std::vector key_bkp = key; 34 | // benchmark 35 | for (auto _ : state) { 36 | x86simdsort::keyvalue_qsort(key.data(), val.data(), arrsize); 37 | state.PauseTiming(); 38 | key = key_bkp; 39 | state.ResumeTiming(); 40 | } 41 | } 42 | 43 | #define BENCH_BOTH_KVSORT(type) \ 44 | BENCH_SORT(simdkvsort, type) \ 45 | BENCH_SORT(scalarkvsort, type) 46 | 47 | BENCH_BOTH_KVSORT(uint64_t) 48 | BENCH_BOTH_KVSORT(int64_t) 49 | BENCH_BOTH_KVSORT(double) 50 | BENCH_BOTH_KVSORT(uint32_t) 51 | BENCH_BOTH_KVSORT(int32_t) 52 | BENCH_BOTH_KVSORT(float) 53 | -------------------------------------------------------------------------------- /benchmarks/bench-qselect.hpp: -------------------------------------------------------------------------------- 1 | template 2 | static void simdqselect(benchmark::State &state, Args &&...args) 3 | { 4 | // Perform setup here 5 | auto args_tuple = std::make_tuple(std::move(args)...); 6 | int64_t ARRSIZE = std::get<0>(args_tuple); 7 | int64_t k = std::get<1>(args_tuple); 8 | std::vector arr; 9 | std::vector arr_bkp; 10 | 11 | /* Initialize elements */ 12 | arr = get_uniform_rand_array(ARRSIZE); 13 | arr_bkp = arr; 14 | 15 | /* call avx512 quickselect */ 16 | for (auto _ : state) { 17 | x86simdsort::qselect(arr.data(), k, ARRSIZE); 18 | 19 | state.PauseTiming(); 20 | arr = arr_bkp; 21 | state.ResumeTiming(); 22 | } 23 | } 24 | 25 | template 26 | static void scalarqselect(benchmark::State &state, Args &&...args) 27 | { 28 | // Perform setup here 29 | auto args_tuple = std::make_tuple(std::move(args)...); 30 | int64_t ARRSIZE = std::get<0>(args_tuple); 31 | int64_t k = std::get<1>(args_tuple); 32 | std::vector arr; 33 | std::vector arr_bkp; 34 | 35 | /* Initialize elements */ 36 | arr = get_uniform_rand_array(ARRSIZE); 37 | arr_bkp = arr; 38 | 39 | /* call std::nth_element */ 40 | for (auto _ : state) { 41 | 
std::nth_element(arr.begin(), arr.begin() + k, arr.end()); 42 | 43 | state.PauseTiming(); 44 | arr = arr_bkp; 45 | state.ResumeTiming(); 46 | } 47 | } 48 | 49 | #define BENCH_BOTH_QSELECT(type) \ 50 | BENCH_PARTIAL(simdqselect, type) \ 51 | BENCH_PARTIAL(scalarqselect, type) 52 | 53 | BENCH_BOTH_QSELECT(uint64_t) 54 | BENCH_BOTH_QSELECT(int64_t) 55 | BENCH_BOTH_QSELECT(uint32_t) 56 | BENCH_BOTH_QSELECT(int32_t) 57 | BENCH_BOTH_QSELECT(uint16_t) 58 | BENCH_BOTH_QSELECT(int16_t) 59 | BENCH_BOTH_QSELECT(float) 60 | BENCH_BOTH_QSELECT(double) 61 | #ifdef __FLT16_MAX__ 62 | BENCH_BOTH_QSELECT(_Float16) 63 | #endif 64 | -------------------------------------------------------------------------------- /benchmarks/bench-partial-qsort.hpp: -------------------------------------------------------------------------------- 1 | template 2 | static void simdpartialsort(benchmark::State &state, Args &&...args) 3 | { 4 | // Perform setup here 5 | auto args_tuple = std::make_tuple(std::move(args)...); 6 | int64_t ARRSIZE = std::get<0>(args_tuple); 7 | int64_t k = std::get<1>(args_tuple); 8 | std::vector arr; 9 | std::vector arr_bkp; 10 | 11 | /* Initialize elements */ 12 | arr = get_uniform_rand_array(ARRSIZE); 13 | arr_bkp = arr; 14 | 15 | /* call simdpartialsort */ 16 | for (auto _ : state) { 17 | x86simdsort::partial_qsort(arr.data(), k, ARRSIZE); 18 | 19 | state.PauseTiming(); 20 | arr = arr_bkp; 21 | state.ResumeTiming(); 22 | } 23 | } 24 | 25 | template 26 | static void scalarpartialsort(benchmark::State &state, Args &&...args) 27 | { 28 | // Perform setup here 29 | auto args_tuple = std::make_tuple(std::move(args)...); 30 | int64_t ARRSIZE = std::get<0>(args_tuple); 31 | int64_t k = std::get<1>(args_tuple); 32 | std::vector arr; 33 | std::vector arr_bkp; 34 | 35 | /* Initialize elements */ 36 | arr = get_uniform_rand_array(ARRSIZE); 37 | arr_bkp = arr; 38 | 39 | /* call std::partial_sort */ 40 | for (auto _ : state) { 41 | std::partial_sort(arr.begin(), arr.begin() + k, 
arr.end()); 42 | 43 | state.PauseTiming(); 44 | arr = arr_bkp; 45 | state.ResumeTiming(); 46 | } 47 | } 48 | 49 | #define BENCH_BOTH_PARTIAL(type) \ 50 | BENCH_PARTIAL(simdpartialsort, type) \ 51 | BENCH_PARTIAL(scalarpartialsort, type) 52 | 53 | BENCH_BOTH_PARTIAL(uint64_t) 54 | BENCH_BOTH_PARTIAL(int64_t) 55 | BENCH_BOTH_PARTIAL(uint32_t) 56 | BENCH_BOTH_PARTIAL(int32_t) 57 | BENCH_BOTH_PARTIAL(uint16_t) 58 | BENCH_BOTH_PARTIAL(int16_t) 59 | BENCH_BOTH_PARTIAL(float) 60 | BENCH_BOTH_PARTIAL(double) 61 | #ifdef __FLT16_MAX__ 62 | BENCH_BOTH_PARTIAL(_Float16) 63 | #endif 64 | -------------------------------------------------------------------------------- /src/xss-custom-float.h: -------------------------------------------------------------------------------- 1 | #ifndef XSS_CUSTOM_FLOAT 2 | #define XSS_CUSTOM_FLOAT 3 | #include 4 | namespace xss { 5 | namespace fp { 6 | template 7 | inline constexpr bool is_floating_point_v = std::is_floating_point_v; 8 | 9 | template 10 | static bool isnan(T elem) 11 | { 12 | return std::isnan(elem); 13 | } 14 | template 15 | static bool isunordered(T a, T b) 16 | { 17 | return std::isunordered(a, b); 18 | } 19 | template 20 | static T max() 21 | { 22 | return std::numeric_limits::max(); 23 | } 24 | template 25 | static T min() 26 | { 27 | return std::numeric_limits::min(); 28 | } 29 | template 30 | static T infinity() 31 | { 32 | return std::numeric_limits::infinity(); 33 | } 34 | template 35 | static T quiet_NaN() 36 | { 37 | return std::numeric_limits::quiet_NaN(); 38 | } 39 | 40 | #ifdef __FLT16_MAX__ 41 | typedef union { 42 | _Float16 f_; 43 | uint16_t i_; 44 | } Fp16Bits; 45 | 46 | static _Float16 convert_bits(uint16_t val) 47 | { 48 | Fp16Bits temp; 49 | temp.i_ = val; 50 | return temp.f_; 51 | } 52 | 53 | template <> 54 | [[maybe_unused]] inline constexpr bool is_floating_point_v<_Float16> = true; 55 | 56 | template <> 57 | [[maybe_unused]] bool isnan<_Float16>(_Float16 elem) 58 | { 59 | return elem != elem; 60 | } 61 | 
template <> 62 | [[maybe_unused]] bool isunordered<_Float16>(_Float16 a, _Float16 b) 63 | { 64 | return isnan(a) || isnan(b); // unordered when either operand is NaN 65 | } 66 | template <> 67 | [[maybe_unused]] _Float16 max<_Float16>() 68 | { 69 | return convert_bits(0x7bff); // largest finite binary16 value (65504) 70 | } 71 | template <> 72 | [[maybe_unused]] _Float16 min<_Float16>() 73 | { 74 | return convert_bits(0x0400); // smallest positive normal binary16 value (2^-14) 75 | } 76 | template <> 77 | [[maybe_unused]] _Float16 infinity<_Float16>() 78 | { 79 | return convert_bits(0x7c00); // +inf: exponent all ones, zero mantissa 80 | } 81 | template <> 82 | [[maybe_unused]] _Float16 quiet_NaN<_Float16>() 83 | { 84 | return convert_bits(0x7e00); // exponent all ones plus the quiet bit (0x0200); without that bit (e.g. 0x7c01) the pattern is a signaling NaN 85 | } 86 | #endif 87 | 88 | } // namespace fp 89 | } // namespace xss 90 | #endif // XSS_CUSTOM_FLOAT 91 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing 2 | 3 | ### License 4 | 5 | x86-simd-sort is licensed under the terms in 6 | [LICENSE]. By 7 | contributing to the project, you agree to the license and copyright terms 8 | therein and release your contribution under these terms. 9 | 10 | ### Sign your work 11 | 12 | Please use the sign-off line at the end of the patch. Your signature certifies 13 | that you wrote the patch or otherwise have the right to pass it on as an 14 | open-source patch. The rules are pretty simple: if you can certify the below 15 | (from [developercertificate.org](http://developercertificate.org/)): 16 | 17 | ``` 18 | Developer Certificate of Origin 19 | Version 1.1 20 | 21 | Copyright (C) 2004, 2006 The Linux Foundation and its contributors. 22 | 660 York Street, Suite 102, 23 | San Francisco, CA 94110 USA 24 | 25 | Everyone is permitted to copy and distribute verbatim copies of this 26 | license document, but changing it is not allowed. 
27 | 28 | Developer's Certificate of Origin 1.1 29 | 30 | By making a contribution to this project, I certify that: 31 | 32 | (a) The contribution was created in whole or in part by me and I 33 | have the right to submit it under the open source license 34 | indicated in the file; or 35 | 36 | (b) The contribution is based upon previous work that, to the best 37 | of my knowledge, is covered under an appropriate open source 38 | license and I have the right under that license to submit that 39 | work with modifications, whether created in whole or in part 40 | by me, under the same open source license (unless I am 41 | permitted to submit under a different license), as indicated 42 | in the file; or 43 | 44 | (c) The contribution was provided directly to me by some other 45 | person who certified (a), (b) or (c) and I have not modified 46 | it. 47 | 48 | (d) I understand and agree that this project and the contribution 49 | are public and that a record of the contribution (including all 50 | personal information I submit with it, including my sign-off) is 51 | maintained indefinitely and may be redistributed consistent with 52 | this project or the open source license(s) involved. 53 | ``` 54 | 55 | Then you just add a line to every git commit message: 56 | 57 | Signed-off-by: Joe Smith 58 | 59 | Use your real name (sorry, no pseudonyms or anonymous contributions.) 60 | 61 | If you set your `user.name` and `user.email` git configs, you can sign your 62 | commit automatically with `git commit -s`. 
63 | -------------------------------------------------------------------------------- /tests/test-objqsort.cpp: -------------------------------------------------------------------------------- 1 | /******************************************* 2 | * * Copyright (C) 2022-2023 Intel Corporation 3 | * * SPDX-License-Identifier: BSD-3-Clause 4 | * *******************************************/ 5 | 6 | #include "rand_array.h" 7 | #include "x86simdsort.h" 8 | #include 9 | 10 | template 11 | struct P { 12 | T x, y; 13 | T metric() const 14 | { 15 | return x; 16 | } 17 | bool operator==(const P &a) const 18 | { 19 | return a.x == x; // && a.y == y; 20 | } 21 | }; 22 | 23 | template 24 | class simdobjsort : public ::testing::Test { 25 | public: 26 | simdobjsort() 27 | { 28 | std::iota(arrsize.begin(), arrsize.end(), 0); 29 | arrtype = {"random", 30 | "constant", 31 | "sorted", 32 | "reverse", 33 | "smallrange", 34 | "max_at_the_end", 35 | "random_5d", 36 | "rand_max"}; 37 | } 38 | std::vector arrtype; 39 | std::vector arrsize = std::vector(1024); 40 | }; 41 | 42 | TYPED_TEST_SUITE_P(simdobjsort); 43 | 44 | TYPED_TEST_P(simdobjsort, test_objsort) 45 | { 46 | for (auto type : this->arrtype) { 47 | for (auto size : this->arrsize) { 48 | std::vector x = get_array(type, size); 49 | std::vector y = get_array("random", size); 50 | std::vector> arr(size); 51 | for (size_t ii = 0; ii < size; ++ii) { 52 | arr[ii].x = x[ii]; 53 | arr[ii].y = y[ii]; 54 | } 55 | std::vector> arr_bckp; 56 | for (size_t ii = 0; ii < size; ++ii) { 57 | arr_bckp.push_back(arr[ii]); 58 | } 59 | 60 | x86simdsort::object_qsort(arr.data(), size, [](P p) { 61 | return p.metric(); 62 | }); 63 | std::sort(arr_bckp.begin(), 64 | arr_bckp.end(), 65 | [](const P &a, const P &b) { 66 | return a.metric() < b.metric(); 67 | }); 68 | ASSERT_EQ(arr, arr_bckp); 69 | arr.clear(); 70 | arr_bckp.clear(); 71 | } 72 | } 73 | } 74 | 75 | REGISTER_TYPED_TEST_SUITE_P(simdobjsort, test_objsort); 76 | 77 | using QObjSortTestTypes 78 | = 
testing::Types; 79 | 80 | INSTANTIATE_TYPED_TEST_SUITE_P(xss, simdobjsort, QObjSortTestTypes); 81 | -------------------------------------------------------------------------------- /example.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | // declare function here, linker will find this when linked to 7 | // libx86simdsortcpp.so 8 | void keyvalue_qsort_float_sizet(float *, size_t *, size_t); 9 | void keyvalue_qsort_float_uint32(float *, uint32_t *, uint32_t); 10 | void keyvalue_qsort_sizet_sizet(size_t *, size_t *, size_t); 11 | void keyvalue_qsort_sizet_uint32(size_t *, uint32_t *, uint32_t); 12 | void keyvalue_qsort_uint32_sizet(uint32_t *, size_t *, size_t); 13 | void keyvalue_qsort_uint32_uint32(uint32_t *, uint32_t *, uint32_t); 14 | void keyvalue_qsort_int32_sizet(int32_t *, size_t *, size_t); 15 | void keyvalue_qsort_int32_uint32(int32_t *, uint32_t *, uint32_t); 16 | 17 | // struct definition, we will sort an array of these: 18 | struct Point { 19 | int x; 20 | int y; 21 | float distance; 22 | size_t metric; 23 | }; 24 | 25 | #define SWAP(a, b, type) \ 26 | { \ 27 | type temp = a; \ 28 | a = b; \ 29 | b = temp; \ 30 | } 31 | 32 | // Function to sort an array of objects: 33 | void object_qsort(struct Point *arr, size_t size) 34 | { 35 | /* (1) Create and initialize arrays of key and value */ 36 | size_t *key = malloc(size * sizeof(size_t)); 37 | size_t *arg = malloc(size * sizeof(size_t)); 38 | bool *done = malloc(size * sizeof(bool)); 39 | for (size_t ii = 0; ii < size; ++ii) { 40 | key[ii] = arr[ii].metric; 41 | arg[ii] = ii; 42 | done[ii] = false; 43 | } 44 | 45 | /* (2) IndexSort using the keyvalue_qsort */ 46 | keyvalue_qsort_sizet_sizet(key, arg, size); 47 | 48 | /* (3) Permute obj array in-place */ 49 | for (size_t ii = 0; ii < size; ++ii) { 50 | if (done[ii]) { continue; } 51 | done[ii] = true; 52 | size_t prev_j = ii; 53 | size_t jj = arg[ii]; 54 | while 
(ii != jj) { 55 | SWAP(arr[prev_j], arr[jj], struct Point); 56 | done[jj] = true; 57 | prev_j = jj; 58 | jj = arg[jj]; 59 | } 60 | } 61 | free(key); 62 | free(arg); 63 | free(done); 64 | } 65 | 66 | int main() 67 | { 68 | const size_t size = 10; 69 | struct Point arr[size]; 70 | 71 | // Initialize: 72 | for (size_t ii = 0; ii < size; ++ii) { 73 | arr[ii].distance = (float)rand() / RAND_MAX; 74 | arr[ii].metric = rand() % 100; 75 | } 76 | 77 | // sort: 78 | object_qsort(arr, size); 79 | 80 | // check if it is sorted: 81 | printf("arr = "); 82 | for (size_t ii = 0; ii < size; ++ii) { 83 | printf("%zu, ", arr[ii].metric); // metric is size_t, so the matching conversion is %zu, not %ld 84 | } 85 | printf("\n"); 86 | return 0; 87 | } 88 | -------------------------------------------------------------------------------- /lib/x86simdsort-icl.cpp: -------------------------------------------------------------------------------- 1 | // ICL specific routines: 2 | #include "x86simdsort-static-incl.h" 3 | #include "x86simdsort-internal.h" 4 | 5 | namespace xss { 6 | namespace avx512 { 7 | template <> 8 | void qsort(uint16_t *arr, size_t size, bool hasnan, bool descending) 9 | { 10 | x86simdsortStatic::qsort(arr, size, hasnan, descending); 11 | } 12 | template <> 13 | void qselect(uint16_t *arr, 14 | size_t k, 15 | size_t arrsize, 16 | bool hasnan, 17 | bool descending) 18 | { 19 | x86simdsortStatic::qselect(arr, k, arrsize, hasnan, descending); 20 | } 21 | template <> 22 | void partial_qsort(uint16_t *arr, 23 | size_t k, 24 | size_t arrsize, 25 | bool hasnan, 26 | bool descending) 27 | { 28 | x86simdsortStatic::partial_qsort(arr, k, arrsize, hasnan, descending); 29 | } 30 | template <> 31 | void qsort(int16_t *arr, size_t size, bool hasnan, bool descending) 32 | { 33 | x86simdsortStatic::qsort(arr, size, hasnan, descending); 34 | } 35 | template <> 36 | void qselect(int16_t *arr, 37 | size_t k, 38 | size_t arrsize, 39 | bool hasnan, 40 | bool descending) 41 | { 42 | x86simdsortStatic::qselect(arr, k, arrsize, hasnan, descending); 43 | } 44 | 
template <> 45 | void partial_qsort(int16_t *arr, 46 | size_t k, 47 | size_t arrsize, 48 | bool hasnan, 49 | bool descending) 50 | { 51 | x86simdsortStatic::partial_qsort(arr, k, arrsize, hasnan, descending); 52 | } 53 | } // namespace avx512 54 | namespace fp16_icl { 55 | #ifdef __FLT16_MAX__ 56 | template <> 57 | void qsort(_Float16 *arr, size_t size, bool hasnan, bool descending) 58 | { 59 | x86simdsortStatic::qsort(arr, size, hasnan, descending); 60 | } 61 | template <> 62 | void qselect(_Float16 *arr, 63 | size_t k, 64 | size_t arrsize, 65 | bool hasnan, 66 | bool descending) 67 | { 68 | x86simdsortStatic::qselect(arr, k, arrsize, hasnan, descending); 69 | } 70 | template <> 71 | void partial_qsort(_Float16 *arr, 72 | size_t k, 73 | size_t arrsize, 74 | bool hasnan, 75 | bool descending) 76 | { 77 | x86simdsortStatic::partial_qsort(arr, k, arrsize, hasnan, descending); 78 | } 79 | #endif 80 | } // namespace fp16_icl 81 | } // namespace xss 82 | -------------------------------------------------------------------------------- /run-bench.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import argparse 3 | import subprocess 4 | 5 | parser = argparse.ArgumentParser() 6 | parser.add_argument('--branchcompare', action='store_true', help='Compare benchmarks of current branch with main. Provide an optional --filter') 7 | parser.add_argument("-b", '--branch', type=str, default="main", required=False) 8 | parser.add_argument('--benchcompare', type=str, help='Compare simd bench with stdsort methods. 
Requires one of qsort, qselect, partialsort, argsort or argselect') 9 | parser.add_argument("-f", '--filter', type=str, required=False) 10 | parser.add_argument("-r", '--repeat', type=int, required=False) 11 | args = parser.parse_args() 12 | 13 | if len(sys.argv) == 1: 14 | parser.error("requires one of --benchcompare or --branchcompare") 15 | 16 | filterb = "" 17 | if args.filter is not None: 18 | filterb = args.filter 19 | repeatnum = 1 20 | if args.repeat is not None: 21 | repeatnum = args.repeat 22 | 23 | if args.benchcompare: 24 | baseline = "" 25 | contender = "" 26 | if "ippsort" in args.benchcompare: 27 | baseline = "ippsort.*" + filterb 28 | contender = "simdsort.*" + filterb 29 | elif "ippargsort" in args.benchcompare: 30 | baseline = "ippargsort.*" + filterb 31 | contender = "simd_ordern_argsort.*" + filterb 32 | elif "vqsort" in args.benchcompare: 33 | baseline = "vqsort.*" + filterb 34 | contender = "simdsort.*" + filterb 35 | elif "qsort" in args.benchcompare: 36 | baseline = "scalarsort.*" + filterb 37 | contender = "simdsort.*" + filterb 38 | elif "select" in args.benchcompare: 39 | baseline = "scalarqselect.*" + filterb 40 | contender = "simdqselect.*" + filterb 41 | elif "partial" in args.benchcompare: 42 | baseline = "scalarpartialsort.*" + filterb 43 | contender = "simdpartialsort.*" + filterb 44 | elif "argsort" in args.benchcompare: 45 | baseline = "scalarargsort.*" + filterb 46 | contender = "simdargsort.*" + filterb 47 | elif "keyvalue" in args.benchcompare: 48 | baseline = "scalarkvsort.*" + filterb 49 | contender = "simdkvsort.*" + filterb 50 | elif "objsort" in args.benchcompare: 51 | baseline = "scalarobjsort.*" + filterb 52 | contender = "simdobjsort.*" + filterb 53 | else: 54 | parser.print_help(sys.stderr) 55 | parser.error("ERROR: Unknown argument '%s'" % args.benchcompare) 56 | rc = subprocess.check_call("./scripts/bench-compare.sh '%s' '%s' '%d'" % (baseline, contender, repeatnum), shell=True) 57 | 58 | if args.branchcompare: 59 | 
branch = args.branch 60 | if args.filter is None: 61 | rc = subprocess.check_call("./scripts/branch-compare.sh '%s' '%d'" % (branch, repeatnum), shell=True) 62 | else: 63 | rc = subprocess.check_call("./scripts/branch-compare.sh '%s' '%s' '%d'" % (branch, args.filter, repeatnum), shell=True) 64 | -------------------------------------------------------------------------------- /lib/x86simdsort-internal.h: -------------------------------------------------------------------------------- 1 | #ifndef XSS_INTERNAL_METHODS 2 | #define XSS_INTERNAL_METHODS 3 | #include "x86simdsort.h" 4 | #include 5 | #include 6 | 7 | #define DECLAREALLFUNCS(name) \ 8 | namespace name { \ 9 | template \ 10 | XSS_HIDE_SYMBOL void qsort(T *arr, \ 11 | size_t arrsize, \ 12 | bool hasnan = false, \ 13 | bool descending = false); \ 14 | template \ 15 | XSS_HIDE_SYMBOL void keyvalue_qsort(T1 *key, \ 16 | T2 *val, \ 17 | size_t arrsize, \ 18 | bool hasnan = false, \ 19 | bool descending = false); \ 20 | template \ 21 | XSS_HIDE_SYMBOL void qselect(T *arr, \ 22 | size_t k, \ 23 | size_t arrsize, \ 24 | bool hasnan = false, \ 25 | bool descending = false); \ 26 | template \ 27 | XSS_HIDE_SYMBOL void keyvalue_select(T1 *key, \ 28 | T2 *val, \ 29 | size_t k, \ 30 | size_t arrsize, \ 31 | bool hasnan = false, \ 32 | bool descending = false); \ 33 | template \ 34 | XSS_HIDE_SYMBOL void partial_qsort(T *arr, \ 35 | size_t k, \ 36 | size_t arrsize, \ 37 | bool hasnan = false, \ 38 | bool descending = false); \ 39 | template \ 40 | XSS_HIDE_SYMBOL void keyvalue_partial_sort(T1 *key, \ 41 | T2 *val, \ 42 | size_t k, \ 43 | size_t arrsize, \ 44 | bool hasnan = false, \ 45 | bool descending = false); \ 46 | template \ 47 | XSS_HIDE_SYMBOL std::vector argsort(T *arr, \ 48 | size_t arrsize, \ 49 | bool hasnan = false, \ 50 | bool descending = false); \ 51 | template \ 52 | XSS_HIDE_SYMBOL std::vector \ 53 | argselect(T *arr, size_t k, size_t arrsize, bool hasnan = false); \ 54 | } 55 | 56 | namespace xss { 57 
| DECLAREALLFUNCS(avx512) 58 | DECLAREALLFUNCS(avx2) 59 | DECLAREALLFUNCS(scalar) 60 | DECLAREALLFUNCS(fp16_spr) 61 | DECLAREALLFUNCS(fp16_icl) 62 | } // namespace xss 63 | #endif 64 | -------------------------------------------------------------------------------- /benchmarks/bench.h: -------------------------------------------------------------------------------- 1 | #include "rand_array.h" 2 | #include "x86simdsort.h" 3 | #include 4 | 5 | #define MY_BENCHMARK_CAPTURE(func, T, test_case_name, ...) \ 6 | BENCHMARK_PRIVATE_DECLARE(func) \ 7 | = (::benchmark::internal::RegisterBenchmarkInternal( \ 8 | std::unique_ptr( \ 9 | new ::benchmark::internal::FunctionBenchmark( \ 10 | #func "/" #test_case_name "/" #T, \ 11 | [](::benchmark::State &st) { \ 12 | func(st, __VA_ARGS__); \ 13 | })))) 14 | 15 | #define BENCH_SORT(func, type) \ 16 | MY_BENCHMARK_CAPTURE(func, type, random_128, 128, std::string("random")); \ 17 | MY_BENCHMARK_CAPTURE(func, type, random_256, 256, std::string("random")); \ 18 | MY_BENCHMARK_CAPTURE(func, type, random_512, 512, std::string("random")); \ 19 | MY_BENCHMARK_CAPTURE(func, type, random_1k, 1024, std::string("random")); \ 20 | MY_BENCHMARK_CAPTURE(func, type, random_5k, 5000, std::string("random")); \ 21 | MY_BENCHMARK_CAPTURE( \ 22 | func, type, random_100k, 100000, std::string("random")); \ 23 | MY_BENCHMARK_CAPTURE( \ 24 | func, type, random_1m, 1000000, std::string("random")); \ 25 | MY_BENCHMARK_CAPTURE( \ 26 | func, type, random_10m, 10000000, std::string("random")); \ 27 | MY_BENCHMARK_CAPTURE( \ 28 | func, type, random_100m, 100000000, std::string("random")); \ 29 | MY_BENCHMARK_CAPTURE( \ 30 | func, type, smallrange_128, 128, std::string("smallrange")); \ 31 | MY_BENCHMARK_CAPTURE( \ 32 | func, type, smallrange_256, 256, std::string("smallrange")); \ 33 | MY_BENCHMARK_CAPTURE( \ 34 | func, type, smallrange_512, 512, std::string("smallrange")); \ 35 | MY_BENCHMARK_CAPTURE( \ 36 | func, type, smallrange_1k, 1024, 
std::string("smallrange")); \ 37 | MY_BENCHMARK_CAPTURE( \ 38 | func, type, smallrange_5k, 5000, std::string("smallrange")); \ 39 | MY_BENCHMARK_CAPTURE( \ 40 | func, type, smallrange_100k, 100000, std::string("smallrange")); \ 41 | MY_BENCHMARK_CAPTURE( \ 42 | func, type, smallrange_1m, 1000000, std::string("smallrange")); \ 43 | MY_BENCHMARK_CAPTURE( \ 44 | func, type, smallrange_10m, 10000000, std::string("smallrange")); \ 45 | MY_BENCHMARK_CAPTURE( \ 46 | func, type, sorted_10k, 10000, std::string("sorted")); \ 47 | MY_BENCHMARK_CAPTURE( \ 48 | func, type, constant_10k, 10000, std::string("constant")); \ 49 | MY_BENCHMARK_CAPTURE( \ 50 | func, type, reverse_10k, 10000, std::string("reverse")); 51 | 52 | #define BENCH_PARTIAL(func, type) \ 53 | MY_BENCHMARK_CAPTURE(func, type, k10, 10000, 10); \ 54 | MY_BENCHMARK_CAPTURE(func, type, k100, 10000, 100); \ 55 | MY_BENCHMARK_CAPTURE(func, type, k1000, 10000, 1000); \ 56 | MY_BENCHMARK_CAPTURE(func, type, k5000, 10000, 5000); 57 | -------------------------------------------------------------------------------- /benchmarks/bench-qsort.hpp: -------------------------------------------------------------------------------- 1 | template 2 | static void scalarsort(benchmark::State &state, Args &&...args) 3 | { 4 | // Get args 5 | auto args_tuple = std::make_tuple(std::move(args)...); 6 | size_t arrsize = std::get<0>(args_tuple); 7 | std::string arrtype = std::get<1>(args_tuple); 8 | // set up array 9 | std::vector arr = get_array(arrtype, arrsize); 10 | std::vector arr_bkp = arr; 11 | // benchmark 12 | for (auto _ : state) { 13 | std::sort(arr.begin(), arr.end()); 14 | state.PauseTiming(); 15 | arr = arr_bkp; 16 | state.ResumeTiming(); 17 | } 18 | } 19 | 20 | template 21 | static void simdsort(benchmark::State &state, Args &&...args) 22 | { 23 | // Get args 24 | auto args_tuple = std::make_tuple(std::move(args)...); 25 | size_t arrsize = std::get<0>(args_tuple); 26 | std::string arrtype = std::get<1>(args_tuple); 27 | // 
set up array 28 | std::vector arr = get_array(arrtype, arrsize); 29 | std::vector arr_bkp = arr; 30 | // benchmark 31 | for (auto _ : state) { 32 | x86simdsort::qsort(arr.data(), arrsize); 33 | state.PauseTiming(); 34 | arr = arr_bkp; 35 | state.ResumeTiming(); 36 | } 37 | } 38 | 39 | template 40 | static void scalar_revsort(benchmark::State &state, Args &&...args) 41 | { 42 | // Get args 43 | auto args_tuple = std::make_tuple(std::move(args)...); 44 | size_t arrsize = std::get<0>(args_tuple); 45 | std::string arrtype = std::get<1>(args_tuple); 46 | // set up array 47 | std::vector arr = get_array(arrtype, arrsize); 48 | std::vector arr_bkp = arr; 49 | // benchmark 50 | for (auto _ : state) { 51 | std::sort(arr.rbegin(), arr.rend()); 52 | state.PauseTiming(); 53 | arr = arr_bkp; 54 | state.ResumeTiming(); 55 | } 56 | } 57 | 58 | template 59 | static void simd_revsort(benchmark::State &state, Args &&...args) 60 | { 61 | // Get args 62 | auto args_tuple = std::make_tuple(std::move(args)...); 63 | size_t arrsize = std::get<0>(args_tuple); 64 | std::string arrtype = std::get<1>(args_tuple); 65 | // set up array 66 | std::vector arr = get_array(arrtype, arrsize); 67 | std::vector arr_bkp = arr; 68 | // benchmark 69 | for (auto _ : state) { 70 | x86simdsort::qsort(arr.data(), arrsize, false, true); 71 | state.PauseTiming(); 72 | arr = arr_bkp; 73 | state.ResumeTiming(); 74 | } 75 | } 76 | 77 | #define BENCH_BOTH_QSORT(type) \ 78 | BENCH_SORT(simdsort, type) \ 79 | BENCH_SORT(scalarsort, type) \ 80 | BENCH_SORT(simd_revsort, type) \ 81 | BENCH_SORT(scalar_revsort, type) 82 | 83 | BENCH_BOTH_QSORT(uint64_t) 84 | BENCH_BOTH_QSORT(int64_t) 85 | BENCH_BOTH_QSORT(uint32_t) 86 | BENCH_BOTH_QSORT(int32_t) 87 | BENCH_BOTH_QSORT(uint16_t) 88 | BENCH_BOTH_QSORT(int16_t) 89 | BENCH_BOTH_QSORT(float) 90 | BENCH_BOTH_QSORT(double) 91 | #ifdef __FLT16_MAX__ 92 | BENCH_BOTH_QSORT(_Float16) 93 | #endif 94 | -------------------------------------------------------------------------------- 
/benchmarks/bench-ipp.cpp: -------------------------------------------------------------------------------- 1 | #include "bench.h" 2 | #include "ipp.h" 3 | 4 | template 5 | static void ippsort(benchmark::State &state, Args &&...args) 6 | { 7 | auto args_tuple = std::make_tuple(std::move(args)...); 8 | size_t arrsize = std::get<0>(args_tuple); 9 | /* IPP set up */ 10 | int bufsize = 10; 11 | if constexpr (std::is_same_v) { 12 | ippsSortRadixGetBufferSize(arrsize, ipp32f, &bufsize); 13 | } 14 | else if constexpr (std::is_same_v) { 15 | ippsSortRadixGetBufferSize(arrsize, ipp64f, &bufsize); 16 | } 17 | std::unique_ptr<unsigned char[]> temp(new unsigned char[bufsize]); // scratch buffer owned here so it is released on return instead of leaking per benchmark run 18 | 19 | // Get args 20 | std::string arrtype = std::get<1>(args_tuple); 21 | // set up array 22 | std::vector arr = get_array(arrtype, arrsize); 23 | std::vector arr_bkp = arr; 24 | // benchmark 25 | for (auto _ : state) { 26 | if constexpr (std::is_same_v) { 27 | ippsSortRadixAscend_32f_I(arr.data(), arrsize, temp.get()); 28 | } 29 | else if constexpr (std::is_same_v) { 30 | ippsSortRadixAscend_64f_I(arr.data(), arrsize, temp.get()); 31 | } 32 | state.PauseTiming(); 33 | arr = arr_bkp; 34 | state.ResumeTiming(); 35 | } 36 | } 37 | 38 | template 39 | static void ippargsort(benchmark::State &state, Args &&...args) 40 | { 41 | auto args_tuple = std::make_tuple(std::move(args)...); 42 | size_t arrsize = std::get<0>(args_tuple); 43 | /* IPP set up */ 44 | int bufsize = 10; 45 | if constexpr (std::is_same_v) { 46 | ippsSortRadixIndexGetBufferSize(arrsize, ipp32f, &bufsize); 47 | } 48 | else if constexpr (std::is_same_v) { 49 | ippsSortRadixIndexGetBufferSize(arrsize, ipp64f, &bufsize); 50 | } 51 | else if constexpr (std::is_same_v) { 52 | ippsSortRadixIndexGetBufferSize(arrsize, ipp32s, &bufsize); 53 | } 54 | std::unique_ptr<unsigned char[]> temp(new unsigned char[bufsize]); // scratch buffer owned here so it is released on return instead of leaking per benchmark run 55 | 56 | // set up array 57 | std::string arrtype = std::get<1>(args_tuple); 58 | std::vector arr = get_array(arrtype, arrsize); 59 | std::vector arr_bkp = arr; 60 | std::vector 
arg(arrsize); 61 | std::iota(arg.begin(), arg.end(), 0); 62 | 63 | // benchmark 64 | for (auto _ : state) { 65 | if constexpr (std::is_same_v) { 66 | ippsSortRadixIndexAscend_32f( 67 | arr.data(), 4, arg.data(), arrsize, temp.get()); 68 | } 69 | else if constexpr (std::is_same_v) { 70 | ippsSortRadixIndexAscend_64f( 71 | arr.data(), 8, arg.data(), arrsize, temp.get()); 72 | } 73 | else if constexpr (std::is_same_v) { 74 | ippsSortRadixIndexAscend_32s( 75 | arr.data(), 4, arg.data(), arrsize, temp.get()); 76 | } 77 | state.PauseTiming(); 78 | arr = arr_bkp; 79 | std::iota(arg.begin(), arg.end(), 0); 80 | state.ResumeTiming(); 81 | } 82 | } 83 | 84 | BENCH_SORT(ippsort, double) 85 | BENCH_SORT(ippsort, float) 86 | BENCH_SORT(ippargsort, double) 87 | BENCH_SORT(ippargsort, float) 88 | BENCH_SORT(ippargsort, int32_t) 89 | -------------------------------------------------------------------------------- /.clang-format: -------------------------------------------------------------------------------- 1 | --- 2 | Language: Cpp 3 | AccessModifierOffset: -4 4 | AlignAfterOpenBracket: Align 5 | AlignConsecutiveAssignments: false 6 | AlignConsecutiveDeclarations: false 7 | AlignEscapedNewlines: DontAlign 8 | AlignOperands: false 9 | AlignTrailingComments: false 10 | AllowAllParametersOfDeclarationOnNextLine: true 11 | AllowShortBlocksOnASingleLine: true 12 | AllowShortCaseLabelsOnASingleLine: true 13 | AllowShortFunctionsOnASingleLine: Empty 14 | AllowShortIfStatementsOnASingleLine: true 15 | AllowShortLoopsOnASingleLine: false 16 | AlwaysBreakAfterDefinitionReturnType: None 17 | AlwaysBreakAfterReturnType: None 18 | AlwaysBreakBeforeMultilineStrings: true 19 | AlwaysBreakTemplateDeclarations: Yes 20 | BinPackArguments: false 21 | BinPackParameters: false 22 | BraceWrapping: 23 | AfterClass: false 24 | AfterControlStatement: false 25 | AfterEnum: false 26 | AfterFunction: true 27 | AfterNamespace: false 28 | AfterObjCDeclaration: false 29 | AfterStruct: false 30 | AfterUnion: false 31 | 
AfterExternBlock: false 32 | BeforeCatch: false 33 | BeforeElse: true 34 | IndentBraces: false 35 | SplitEmptyFunction: true 36 | SplitEmptyRecord: true 37 | SplitEmptyNamespace: true 38 | BreakBeforeBinaryOperators: All 39 | BreakBeforeBraces: Custom 40 | BreakBeforeInheritanceComma: false 41 | BreakInheritanceList: BeforeColon 42 | BreakBeforeTernaryOperators: true 43 | BreakConstructorInitializers: BeforeComma 44 | BreakAfterJavaFieldAnnotations: false 45 | BreakStringLiterals: true 46 | ColumnLimit: 80 47 | CommentPragmas: '^ IWYU pragma:' 48 | CompactNamespaces: false 49 | ConstructorInitializerAllOnOneLineOrOnePerLine: true 50 | ConstructorInitializerIndentWidth: 4 51 | ContinuationIndentWidth: 8 52 | Cpp11BracedListStyle: true 53 | DerivePointerAlignment: false 54 | FixNamespaceComments: true 55 | ForEachMacros: 56 | IncludeBlocks: Preserve 57 | IndentCaseLabels: true 58 | # IndentPPDirectives: AfterHash 59 | IndentPPDirectives: None 60 | IndentWidth: 4 61 | IndentWrappedFunctionNames: false 62 | KeepEmptyLinesAtTheStartOfBlocks: true 63 | MacroBlockBegin: '' 64 | MacroBlockEnd: '' 65 | MaxEmptyLinesToKeep: 1 66 | NamespaceIndentation: Inner 67 | PenaltyBreakAssignment: 2 68 | PenaltyBreakBeforeFirstCallParameter: 19 69 | PenaltyBreakComment: 300 70 | PenaltyBreakFirstLessLess: 120 71 | PenaltyBreakString: 1000 72 | PenaltyBreakTemplateDeclaration: 10 73 | PenaltyExcessCharacter: 1000000 74 | PenaltyReturnTypeOnItsOwnLine: 60 75 | PointerAlignment: Right 76 | ReflowComments: false 77 | SortIncludes: false 78 | SortUsingDeclarations: true 79 | SpaceAfterCStyleCast: false 80 | SpaceAfterTemplateKeyword: true 81 | SpaceBeforeAssignmentOperators: true 82 | SpaceBeforeCpp11BracedList: true 83 | SpaceBeforeCtorInitializerColon: true 84 | SpaceBeforeInheritanceColon: true 85 | SpaceBeforeParens: ControlStatements 86 | SpaceBeforeRangeBasedForLoopColon: true 87 | SpaceInEmptyParentheses: false 88 | SpacesBeforeTrailingComments: 1 89 | SpacesInAngles: false 90 | 
SpacesInContainerLiterals: false 91 | SpacesInCStyleCastParentheses: false 92 | SpacesInParentheses: false 93 | SpacesInSquareBrackets: false 94 | Standard: Cpp11 95 | TabWidth: 4 96 | UseTab: Never 97 | ... 98 | # vim:ft=conf et ts=2 sw=2 99 | -------------------------------------------------------------------------------- /.github/workflows/scorecard.yml: -------------------------------------------------------------------------------- 1 | # This workflow uses actions that are not certified by GitHub. They are provided 2 | # by a third-party and are governed by separate terms of service, privacy 3 | # policy, and support documentation. 4 | 5 | name: Scorecard supply-chain security 6 | on: 7 | # For Branch-Protection check. Only the default branch is supported. See 8 | # https://github.com/ossf/scorecard/blob/main/docs/checks.md#branch-protection 9 | branch_protection_rule: 10 | # To guarantee Maintained check is occasionally updated. See 11 | # https://github.com/ossf/scorecard/blob/main/docs/checks.md#maintained 12 | schedule: 13 | - cron: '00 12 * * 0-6' 14 | push: 15 | branches: [ "main" ] 16 | 17 | # Declare default permissions as read only. 18 | permissions: read-all 19 | 20 | jobs: 21 | analysis: 22 | 23 | name: Scorecard analysis 24 | if: github.repository == 'intel/x86-simd-sort' 25 | runs-on: ubuntu-latest 26 | permissions: 27 | # Needed to upload the results to code-scanning dashboard. 28 | security-events: write 29 | # Needed to publish results and get a badge (see publish_results below). 30 | id-token: write 31 | # Uncomment the permissions below if installing in a private repository. 
32 | # contents: read 33 | # actions: read 34 | 35 | steps: 36 | - name: "Checkout code" 37 | uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 38 | with: 39 | persist-credentials: false 40 | 41 | - name: "Run analysis" 42 | uses: ossf/scorecard-action@62b2cac7ed8198b15735ed49ab1e5cf35480ba46 # v2.4.0 43 | with: 44 | results_file: results.sarif 45 | results_format: sarif 46 | # (Optional) "write" PAT token. Uncomment the `repo_token` line below if: 47 | # - you want to enable the Branch-Protection check on a *public* repository, or 48 | # - you are installing Scorecard on a *private* repository 49 | # To create the PAT, follow the steps in https://github.com/ossf/scorecard-action#authentication-with-pat. 50 | # repo_token: ${{ secrets.SCORECARD_TOKEN }} 51 | 52 | # Public repositories: 53 | # - Publish results to OpenSSF REST API for easy access by consumers 54 | # - Allows the repository to include the Scorecard badge. 55 | # - See https://github.com/ossf/scorecard-action#publishing-results. 56 | # For private repositories: 57 | # - `publish_results` will always be set to `false`, regardless 58 | # of the value entered here. 59 | publish_results: true 60 | 61 | # Upload the results as artifacts (optional). Commenting out will disable uploads of run results in SARIF 62 | # format to the repository Actions tab. 63 | - name: "Upload artifact" 64 | uses: actions/upload-artifact@65c4c4a1ddee5b72f698fdd19549f0f0fb45cf08 # v4.6.0 65 | with: 66 | name: SARIF file 67 | path: results.sarif 68 | retention-days: 5 69 | 70 | # Upload the results to GitHub's code scanning dashboard. 
71 | - name: "Upload to code-scanning" 72 | uses: github/codeql-action/upload-sarif@dd746615b3b9d728a6a37ca2045b68ca76d4841a # v3.28.8 73 | with: 74 | sarif_file: results.sarif 75 | -------------------------------------------------------------------------------- /benchmarks/bench-argsort.hpp: -------------------------------------------------------------------------------- 1 | template 2 | std::vector stdargsort(const std::vector &array) 3 | { 4 | std::vector indices(array.size()); 5 | std::iota(indices.begin(), indices.end(), 0); 6 | std::sort(indices.begin(), 7 | indices.end(), 8 | [&array](size_t left, size_t right) -> bool { 9 | // sort indices according to corresponding array element 10 | return array[left] < array[right]; 11 | }); 12 | 13 | return indices; 14 | } 15 | 16 | template 17 | static void scalarargsort(benchmark::State &state, Args &&...args) 18 | { 19 | // get args 20 | auto args_tuple = std::make_tuple(std::move(args)...); 21 | size_t arrsize = std::get<0>(args_tuple); 22 | std::string arrtype = std::get<1>(args_tuple); 23 | // set up array 24 | std::vector arr = get_array(arrtype, arrsize); 25 | std::vector inx; 26 | // benchmark 27 | for (auto _ : state) { 28 | inx = stdargsort(arr); 29 | } 30 | } 31 | 32 | template 33 | static void simdargsort(benchmark::State &state, Args &&...args) 34 | { 35 | // get args 36 | auto args_tuple = std::make_tuple(std::move(args)...); 37 | size_t arrsize = std::get<0>(args_tuple); 38 | std::string arrtype = std::get<1>(args_tuple); 39 | // set up array 40 | std::vector arr = get_array(arrtype, arrsize); 41 | std::vector inx; 42 | // benchmark 43 | for (auto _ : state) { 44 | inx = x86simdsort::argsort(arr.data(), arrsize); 45 | } 46 | } 47 | 48 | template 49 | static void simd_revargsort(benchmark::State &state, Args &&...args) 50 | { 51 | // get args 52 | auto args_tuple = std::make_tuple(std::move(args)...); 53 | size_t arrsize = std::get<0>(args_tuple); 54 | std::string arrtype = std::get<1>(args_tuple); 55 | 
// set up array 56 | std::vector arr = get_array(arrtype, arrsize); 57 | std::vector inx; 58 | // benchmark 59 | for (auto _ : state) { 60 | inx = x86simdsort::argsort(arr.data(), arrsize, false, true); 61 | } 62 | } 63 | 64 | template 65 | static void simd_ordern_argsort(benchmark::State &state, Args &&...args) 66 | { 67 | // get args 68 | auto args_tuple = std::make_tuple(std::move(args)...); 69 | size_t arrsize = std::get<0>(args_tuple); 70 | std::string arrtype = std::get<1>(args_tuple); 71 | // set up array 72 | std::vector arr = get_array(arrtype, arrsize); 73 | std::vector arg(arrsize); 74 | std::iota(arg.begin(), arg.end(), 0); 75 | // benchmark 76 | for (auto _ : state) { 77 | std::vector arr_bkp = arr; 78 | x86simdsort::keyvalue_qsort(arr_bkp.data(), arg.data(), arrsize); 79 | state.PauseTiming(); 80 | std::iota(arg.begin(), arg.end(), 0); 81 | state.ResumeTiming(); 82 | } 83 | } 84 | 85 | #define BENCH_BOTH(type) \ 86 | BENCH_SORT(simdargsort, type) \ 87 | BENCH_SORT(simd_revargsort, type) \ 88 | BENCH_SORT(simd_ordern_argsort, type) \ 89 | BENCH_SORT(scalarargsort, type) 90 | 91 | BENCH_BOTH(int64_t) 92 | BENCH_BOTH(uint64_t) 93 | BENCH_BOTH(double) 94 | BENCH_BOTH(int32_t) 95 | BENCH_BOTH(uint32_t) 96 | BENCH_BOTH(float) 97 | -------------------------------------------------------------------------------- /lib/x86simdsort-skx.cpp: -------------------------------------------------------------------------------- 1 | // SKX specific routines: 2 | 3 | #include "x86simdsort-static-incl.h" 4 | #include "x86simdsort-internal.h" 5 | 6 | #define DEFINE_ALL_METHODS(type) \ 7 | template <> \ 8 | void qsort(type *arr, size_t arrsize, bool hasnan, bool descending) \ 9 | { \ 10 | x86simdsortStatic::qsort(arr, arrsize, hasnan, descending); \ 11 | } \ 12 | template <> \ 13 | void qselect( \ 14 | type *arr, size_t k, size_t arrsize, bool hasnan, bool descending) \ 15 | { \ 16 | x86simdsortStatic::qselect(arr, k, arrsize, hasnan, descending); \ 17 | } \ 18 | template <> 
\ 19 | void partial_qsort( \ 20 | type *arr, size_t k, size_t arrsize, bool hasnan, bool descending) \ 21 | { \ 22 | x86simdsortStatic::partial_qsort(arr, k, arrsize, hasnan, descending); \ 23 | } \ 24 | template <> \ 25 | std::vector argsort( \ 26 | type *arr, size_t arrsize, bool hasnan, bool descending) \ 27 | { \ 28 | return x86simdsortStatic::argsort(arr, arrsize, hasnan, descending); \ 29 | } \ 30 | template <> \ 31 | std::vector argselect( \ 32 | type *arr, size_t k, size_t arrsize, bool hasnan) \ 33 | { \ 34 | return x86simdsortStatic::argselect(arr, k, arrsize, hasnan); \ 35 | } 36 | 37 | #define DEFINE_KEYVALUE_METHODS_BASE(type1, type2) \ 38 | template <> \ 39 | void keyvalue_qsort(type1 *key, \ 40 | type2 *val, \ 41 | size_t arrsize, \ 42 | bool hasnan, \ 43 | bool descending) \ 44 | { \ 45 | x86simdsortStatic::keyvalue_qsort( \ 46 | key, val, arrsize, hasnan, descending); \ 47 | } \ 48 | template <> \ 49 | void keyvalue_select(type1 *key, \ 50 | type2 *val, \ 51 | size_t k, \ 52 | size_t arrsize, \ 53 | bool hasnan, \ 54 | bool descending) \ 55 | { \ 56 | x86simdsortStatic::keyvalue_select( \ 57 | key, val, k, arrsize, hasnan, descending); \ 58 | } \ 59 | template <> \ 60 | void keyvalue_partial_sort(type1 *key, \ 61 | type2 *val, \ 62 | size_t k, \ 63 | size_t arrsize, \ 64 | bool hasnan, \ 65 | bool descending) \ 66 | { \ 67 | x86simdsortStatic::keyvalue_partial_sort( \ 68 | key, val, k, arrsize, hasnan, descending); \ 69 | } 70 | 71 | #define DEFINE_KEYVALUE_METHODS(type) \ 72 | DEFINE_KEYVALUE_METHODS_BASE(type, uint64_t) \ 73 | DEFINE_KEYVALUE_METHODS_BASE(type, int64_t) \ 74 | DEFINE_KEYVALUE_METHODS_BASE(type, double) \ 75 | DEFINE_KEYVALUE_METHODS_BASE(type, uint32_t) \ 76 | DEFINE_KEYVALUE_METHODS_BASE(type, int32_t) \ 77 | DEFINE_KEYVALUE_METHODS_BASE(type, float) 78 | 79 | namespace xss { 80 | namespace avx512 { 81 | DEFINE_ALL_METHODS(uint32_t) 82 | DEFINE_ALL_METHODS(int32_t) 83 | DEFINE_ALL_METHODS(float) 84 | DEFINE_ALL_METHODS(uint64_t) 
85 | DEFINE_ALL_METHODS(int64_t) 86 | DEFINE_ALL_METHODS(double) 87 | DEFINE_KEYVALUE_METHODS(uint64_t) 88 | DEFINE_KEYVALUE_METHODS(int64_t) 89 | DEFINE_KEYVALUE_METHODS(double) 90 | DEFINE_KEYVALUE_METHODS(uint32_t) 91 | DEFINE_KEYVALUE_METHODS(int32_t) 92 | DEFINE_KEYVALUE_METHODS(float) 93 | } // namespace avx512 94 | } // namespace xss 95 | -------------------------------------------------------------------------------- /benchmarks/bench-objsort.hpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | static constexpr char x[] = "x"; 4 | static constexpr char euclidean[] = "euclidean"; 5 | static constexpr char taxicab[] = "taxicab"; 6 | static constexpr char chebyshev[] = "chebyshev"; 7 | 8 | template 9 | struct Point3D { 10 | T x; 11 | T y; 12 | T z; 13 | static constexpr std::string_view name {val}; 14 | Point3D() 15 | { 16 | x = (T)rand() / (T)RAND_MAX; 17 | y = (T)rand() / (T)RAND_MAX; 18 | z = (T)rand() / (T)RAND_MAX; 19 | } 20 | T distance() 21 | { 22 | if constexpr (name == "x") { return x; } 23 | else if constexpr (name == "euclidean") { 24 | return std::sqrt(x * x + y * y + z * z); 25 | } 26 | else if constexpr (name == "taxicab") { 27 | return std::abs(x) + std::abs(y) + std::abs(z); 28 | } 29 | else if constexpr (name == "chebyshev") { 30 | return std::max(std::max(std::abs(x), std::abs(y)), std::abs(z)); 31 | } 32 | } 33 | }; 34 | 35 | template 36 | std::vector init_data(const int size) 37 | { 38 | srand(42); 39 | std::vector arr; 40 | for (auto ii = 0; ii < size; ++ii) { 41 | T temp; 42 | arr.push_back(temp); 43 | } 44 | return arr; 45 | } 46 | 47 | template 48 | struct less_than_key { 49 | inline bool operator()(T &p1, T &p2) 50 | { 51 | return (p1.distance() < p2.distance()); 52 | } 53 | }; 54 | 55 | template 56 | static void scalarobjsort(benchmark::State &state) 57 | { 58 | // set up array 59 | std::vector arr = init_data(state.range(0)); 60 | std::vector arr_bkp = arr; 61 | // benchmark 62 | 
for (auto _ : state) { 63 | std::sort(arr.begin(), arr.end(), less_than_key()); 64 | state.PauseTiming(); 65 | arr = arr_bkp; 66 | state.ResumeTiming(); 67 | } 68 | } 69 | 70 | template 71 | static void simdobjsort(benchmark::State &state) 72 | { 73 | // set up array 74 | std::vector arr = init_data(state.range(0)); 75 | std::vector arr_bkp = arr; 76 | // benchmark 77 | for (auto _ : state) { 78 | x86simdsort::object_qsort( 79 | arr.data(), arr.size(), [](T p) { return p.distance(); }); 80 | state.PauseTiming(); 81 | if (!std::is_sorted(arr.begin(), arr.end(), less_than_key())) { 82 | std::cout << "sorting failed \n"; 83 | } 84 | arr = arr_bkp; 85 | state.ResumeTiming(); 86 | } 87 | } 88 | 89 | #define BENCHMARK_OBJSORT(func, T, type, dist) \ 90 | BENCHMARK_TEMPLATE(func, T) \ 91 | ->Arg(10e1) \ 92 | ->Arg(10e2) \ 93 | ->Arg(10e3) \ 94 | ->Arg(10e4) \ 95 | ->Arg(10e5) \ 96 | ->Arg(10e6); 97 | 98 | #define BENCH_ALL(dtype) \ 99 | BENCHMARK_OBJSORT(simdobjsort, Point3D, dtype, x) \ 100 | BENCHMARK_OBJSORT(scalarobjsort, Point3D, dtype, x) \ 101 | BENCHMARK_OBJSORT(simdobjsort, Point3D, dtype, taxicab) \ 102 | BENCHMARK_OBJSORT(scalarobjsort, Point3D, dtype, taxicab) \ 103 | BENCHMARK_OBJSORT(simdobjsort, Point3D, dtype, euclidean) \ 104 | BENCHMARK_OBJSORT(scalarobjsort, Point3D, dtype, euclidean) \ 105 | BENCHMARK_OBJSORT(simdobjsort, Point3D, dtype, chebyshev) \ 106 | BENCHMARK_OBJSORT(scalarobjsort, Point3D, dtype, chebyshev) 107 | 108 | BENCH_ALL(double) 109 | BENCH_ALL(float) 110 | -------------------------------------------------------------------------------- /lib/x86simdsort-avx2.cpp: -------------------------------------------------------------------------------- 1 | // AVX2 specific routines: 2 | 3 | #include "x86simdsort-static-incl.h" 4 | #include "x86simdsort-internal.h" 5 | 6 | #define DEFINE_ALL_METHODS(type) \ 7 | template <> \ 8 | void qsort(type *arr, size_t arrsize, bool hasnan, bool descending) \ 9 | { \ 10 | x86simdsortStatic::qsort(arr, arrsize, 
hasnan, descending); \ 11 | } \ 12 | template <> \ 13 | void qselect( \ 14 | type *arr, size_t k, size_t arrsize, bool hasnan, bool descending) \ 15 | { \ 16 | x86simdsortStatic::qselect(arr, k, arrsize, hasnan, descending); \ 17 | } \ 18 | template <> \ 19 | void partial_qsort( \ 20 | type *arr, size_t k, size_t arrsize, bool hasnan, bool descending) \ 21 | { \ 22 | x86simdsortStatic::partial_qsort(arr, k, arrsize, hasnan, descending); \ 23 | } \ 24 | template <> \ 25 | std::vector argsort( \ 26 | type *arr, size_t arrsize, bool hasnan, bool descending) \ 27 | { \ 28 | return x86simdsortStatic::argsort(arr, arrsize, hasnan, descending); \ 29 | } \ 30 | template <> \ 31 | std::vector argselect( \ 32 | type *arr, size_t k, size_t arrsize, bool hasnan) \ 33 | { \ 34 | return x86simdsortStatic::argselect(arr, k, arrsize, hasnan); \ 35 | } 36 | 37 | #define DEFINE_KEYVALUE_METHODS_BASE(type1, type2) \ 38 | template <> \ 39 | void keyvalue_qsort(type1 *key, \ 40 | type2 *val, \ 41 | size_t arrsize, \ 42 | bool hasnan, \ 43 | bool descending) \ 44 | { \ 45 | x86simdsortStatic::keyvalue_qsort( \ 46 | key, val, arrsize, hasnan, descending); \ 47 | } \ 48 | template <> \ 49 | void keyvalue_select(type1 *key, \ 50 | type2 *val, \ 51 | size_t k, \ 52 | size_t arrsize, \ 53 | bool hasnan, \ 54 | bool descending) \ 55 | { \ 56 | x86simdsortStatic::keyvalue_select( \ 57 | key, val, k, arrsize, hasnan, descending); \ 58 | } \ 59 | template <> \ 60 | void keyvalue_partial_sort(type1 *key, \ 61 | type2 *val, \ 62 | size_t k, \ 63 | size_t arrsize, \ 64 | bool hasnan, \ 65 | bool descending) \ 66 | { \ 67 | x86simdsortStatic::keyvalue_partial_sort( \ 68 | key, val, k, arrsize, hasnan, descending); \ 69 | } 70 | 71 | #define DEFINE_KEYVALUE_METHODS(type) \ 72 | DEFINE_KEYVALUE_METHODS_BASE(type, uint64_t) \ 73 | DEFINE_KEYVALUE_METHODS_BASE(type, int64_t) \ 74 | DEFINE_KEYVALUE_METHODS_BASE(type, double) \ 75 | DEFINE_KEYVALUE_METHODS_BASE(type, uint32_t) \ 76 | 
DEFINE_KEYVALUE_METHODS_BASE(type, int32_t) \ 77 | DEFINE_KEYVALUE_METHODS_BASE(type, float) 78 | 79 | namespace xss { 80 | namespace avx2 { 81 | DEFINE_ALL_METHODS(uint32_t) 82 | DEFINE_ALL_METHODS(int32_t) 83 | DEFINE_ALL_METHODS(float) 84 | DEFINE_ALL_METHODS(uint64_t) 85 | DEFINE_ALL_METHODS(int64_t) 86 | DEFINE_ALL_METHODS(double) 87 | DEFINE_KEYVALUE_METHODS(uint64_t) 88 | DEFINE_KEYVALUE_METHODS(int64_t) 89 | DEFINE_KEYVALUE_METHODS(double) 90 | DEFINE_KEYVALUE_METHODS(uint32_t) 91 | DEFINE_KEYVALUE_METHODS(int32_t) 92 | DEFINE_KEYVALUE_METHODS(float) 93 | } // namespace avx2 94 | } // namespace xss -------------------------------------------------------------------------------- /lib/list-of-exported-symbols.txt: -------------------------------------------------------------------------------- 1 | std::vector > x86simdsort::argselect(double*, unsigned long, unsigned long, bool) 2 | std::vector > x86simdsort::argselect(float*, unsigned long, unsigned long, bool) 3 | std::vector > x86simdsort::argselect(int*, unsigned long, unsigned long, bool) 4 | std::vector > x86simdsort::argselect(long*, unsigned long, unsigned long, bool) 5 | std::vector > x86simdsort::argselect(short*, unsigned long, unsigned long, bool) 6 | std::vector > x86simdsort::argselect(unsigned int*, unsigned long, unsigned long, bool) 7 | std::vector > x86simdsort::argselect(unsigned long*, unsigned long, unsigned long, bool) 8 | std::vector > x86simdsort::argselect(unsigned short*, unsigned long, unsigned long, bool) 9 | std::vector > x86simdsort::argsort(double*, unsigned long, bool) 10 | std::vector > x86simdsort::argsort(float*, unsigned long, bool) 11 | std::vector > x86simdsort::argsort(int*, unsigned long, bool) 12 | std::vector > x86simdsort::argsort(long*, unsigned long, bool) 13 | std::vector > x86simdsort::argsort(short*, unsigned long, bool) 14 | std::vector > x86simdsort::argsort(unsigned int*, unsigned long, bool) 15 | std::vector > x86simdsort::argsort(unsigned long*, unsigned 
long, bool) 16 | std::vector > x86simdsort::argsort(unsigned short*, unsigned long, bool) 17 | void x86simdsort::partial_qsort(double*, unsigned long, unsigned long, bool) 18 | void x86simdsort::partial_qsort(float*, unsigned long, unsigned long, bool) 19 | void x86simdsort::partial_qsort(int*, unsigned long, unsigned long, bool) 20 | void x86simdsort::partial_qsort(long*, unsigned long, unsigned long, bool) 21 | void x86simdsort::partial_qsort(short*, unsigned long, unsigned long, bool) 22 | void x86simdsort::partial_qsort(unsigned int*, unsigned long, unsigned long, bool) 23 | void x86simdsort::partial_qsort(unsigned long*, unsigned long, unsigned long, bool) 24 | void x86simdsort::partial_qsort(unsigned short*, unsigned long, unsigned long, bool) 25 | void x86simdsort::qselect(double*, unsigned long, unsigned long, bool) 26 | void x86simdsort::qselect(float*, unsigned long, unsigned long, bool) 27 | void x86simdsort::qselect(int*, unsigned long, unsigned long, bool) 28 | void x86simdsort::qselect(long*, unsigned long, unsigned long, bool) 29 | void x86simdsort::qselect(short*, unsigned long, unsigned long, bool) 30 | void x86simdsort::qselect(unsigned int*, unsigned long, unsigned long, bool) 31 | void x86simdsort::qselect(unsigned long*, unsigned long, unsigned long, bool) 32 | void x86simdsort::qselect(unsigned short*, unsigned long, unsigned long, bool) 33 | void x86simdsort::qsort(double*, unsigned long, bool) 34 | void x86simdsort::qsort(float*, unsigned long, bool) 35 | void x86simdsort::qsort(int*, unsigned long, bool) 36 | void x86simdsort::qsort(long*, unsigned long, bool) 37 | void x86simdsort::qsort(short*, unsigned long, bool) 38 | void x86simdsort::qsort(unsigned int*, unsigned long, bool) 39 | void x86simdsort::qsort(unsigned long*, unsigned long, bool) 40 | void x86simdsort::qsort(unsigned short*, unsigned long, bool) 41 | _ZN11x86simdsort13partial_qsortIDF16_EEvPT_mmb 42 | _ZN11x86simdsort5qsortIDF16_EEvPT_mb 43 | 
_ZN11x86simdsort7argsortIDF16_EESt6vectorImSaImEEPT_mb 44 | _ZN11x86simdsort7qselectIDF16_EEvPT_mmb 45 | _ZN11x86simdsort9argselectIDF16_EESt6vectorImSaImEEPT_mmb 46 | -------------------------------------------------------------------------------- /lib/x86simdsort.h: -------------------------------------------------------------------------------- 1 | #ifndef X86_SIMD_SORT 2 | #define X86_SIMD_SORT 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | #define XSS_EXPORT_SYMBOL __attribute__((visibility("default"))) 10 | #define XSS_HIDE_SYMBOL __attribute__((visibility("hidden"))) 11 | #define UNUSED(x) (void)(x) 12 | 13 | namespace x86simdsort { 14 | 15 | // quicksort 16 | template 17 | XSS_EXPORT_SYMBOL void 18 | qsort(T *arr, size_t arrsize, bool hasnan = false, bool descending = false); 19 | 20 | // quickselect 21 | template 22 | XSS_EXPORT_SYMBOL void qselect(T *arr, 23 | size_t k, 24 | size_t arrsize, 25 | bool hasnan = false, 26 | bool descending = false); 27 | 28 | // partial sort 29 | template 30 | XSS_EXPORT_SYMBOL void partial_qsort(T *arr, 31 | size_t k, 32 | size_t arrsize, 33 | bool hasnan = false, 34 | bool descending = false); 35 | 36 | // argsort 37 | template 38 | XSS_EXPORT_SYMBOL std::vector 39 | argsort(T *arr, size_t arrsize, bool hasnan = false, bool descending = false); 40 | 41 | // argselect 42 | template 43 | XSS_EXPORT_SYMBOL std::vector 44 | argselect(T *arr, size_t k, size_t arrsize, bool hasnan = false); 45 | 46 | // keyvalue sort 47 | template 48 | XSS_EXPORT_SYMBOL void keyvalue_qsort(T1 *key, 49 | T2 *val, 50 | size_t arrsize, 51 | bool hasnan = false, 52 | bool descending = false); 53 | 54 | // keyvalue select 55 | template 56 | XSS_EXPORT_SYMBOL void keyvalue_select(T1 *key, 57 | T2 *val, 58 | size_t k, 59 | size_t arrsize, 60 | bool hasnan = false, 61 | bool descending = false); 62 | 63 | // keyvalue partial sort 64 | template 65 | XSS_EXPORT_SYMBOL void keyvalue_partial_sort(T1 *key, 66 | T2 *val, 67 | size_t 
k, 68 | size_t arrsize, 69 | bool hasnan = false, 70 | bool descending = false); 71 | 72 | // sort an object 73 | template 74 | XSS_EXPORT_SYMBOL void object_qsort(T *arr, U arrsize, Func key_func) 75 | { 76 | static_assert(std::is_integral::value, "arrsize must be an integral type"); 77 | static_assert(sizeof(U) == sizeof(int32_t) || sizeof(U) == sizeof(int64_t), 78 | "arrsize must be 32 or 64 bits"); 79 | using return_type_of = typename decltype(std::function{key_func})::result_type; 80 | static_assert(sizeof(return_type_of) == sizeof(int32_t) || sizeof(return_type_of) == sizeof(int64_t), 81 | "key_func return type must be 32 or 64 bits"); 82 | std::vector keys(arrsize); 83 | for (U ii = 0; ii < arrsize; ++ii) { 84 | keys[ii] = key_func(arr[ii]); 85 | } 86 | 87 | /* (2) Call arg based on keys using the keyvalue sort */ 88 | std::vector arg(arrsize); 89 | std::iota(arg.begin(), arg.end(), 0); 90 | x86simdsort::keyvalue_qsort(keys.data(), arg.data(), arrsize); 91 | 92 | /* (3) Permute obj array in-place */ 93 | std::vector done(arrsize); 94 | for (U i = 0; i < arrsize; ++i) { 95 | if (done[i]) { continue; } 96 | done[i] = true; 97 | U prev_j = i; 98 | U j = arg[i]; 99 | while (i != j) { 100 | std::swap(arr[prev_j], arr[j]); 101 | done[j] = true; 102 | prev_j = j; 103 | j = arg[j]; 104 | } 105 | } 106 | } 107 | 108 | } // namespace x86simdsort 109 | #endif 110 | -------------------------------------------------------------------------------- /tests/test-qsort-common.h: -------------------------------------------------------------------------------- 1 | #ifndef AVX512_TEST_COMMON 2 | #define AVX512_TEST_COMMON 3 | 4 | #define XSS_DO_NOT_SET_SEED 5 | 6 | #include "custom-compare.h" 7 | #include "rand_array.h" 8 | #include "x86simdsort.h" 9 | #include 10 | 11 | #define EXPECT_UNIQUE(arg) \ 12 | auto sorted_arg = arg; \ 13 | std::sort(sorted_arg.begin(), sorted_arg.end()); \ 14 | std::vector expected_arg(sorted_arg.size()); \ 15 | std::iota(expected_arg.begin(), 
expected_arg.end(), 0); \ 16 | EXPECT_EQ(sorted_arg, expected_arg) \ 17 | << "Indices aren't unique. Array size = " << sorted_arg.size(); 18 | 19 | #define REPORT_FAIL(msg, size, type, k) \ 20 | ASSERT_TRUE(false) << msg << ". arr size = " << size \ 21 | << ", type = " << type << ", k = " << k; 22 | 23 | inline bool is_nan_test(std::string type) 24 | { 25 | // Currently, determine whether the test uses nan just by checking if nan is in its name 26 | return type.find("nan") != std::string::npos; 27 | } 28 | 29 | template 30 | void IS_SORTED(std::vector sorted, std::vector arr, std::string type) 31 | { 32 | if (arr.size() == 0) return; 33 | if (memcmp(arr.data(), sorted.data(), arr.size() * sizeof(T)) != 0) { 34 | REPORT_FAIL("Array not sorted", arr.size(), type, -1); 35 | } 36 | } 37 | 38 | template 39 | void IS_ARG_SORTED(std::vector sortedarr, 40 | std::vector arr, 41 | std::vector arg, 42 | std::string type) 43 | { 44 | EXPECT_UNIQUE(arg) 45 | std::vector arr_backup; 46 | for (auto ii : arg) { 47 | arr_backup.push_back(arr[ii]); 48 | } 49 | IS_SORTED(sortedarr, arr_backup, type); 50 | } 51 | 52 | template 53 | void IS_ARR_PARTITIONED(std::vector arr, 54 | size_t k, 55 | T true_kth, 56 | std::string type, 57 | bool descending = false) 58 | { 59 | std::function cmp_eq, cmp_less, cmp_leq, cmp_geq; 60 | cmp_eq = compare>(); 61 | 62 | if (!descending) { 63 | cmp_less = compare>(); 64 | cmp_leq = compare>(); 65 | cmp_geq = compare>(); 66 | } 67 | else { 68 | cmp_less = compare>(); 69 | cmp_leq = compare>(); 70 | cmp_geq = compare>(); 71 | } 72 | 73 | // 1) arr[k] == sorted[k]; use memcmp to handle nan 74 | if (!cmp_eq(arr[k], true_kth)) { 75 | REPORT_FAIL("kth element is incorrect", arr.size(), type, k); 76 | } 77 | // 2) Elements to the left of k should be at most arr[k]; the end iterator is exclusive, so [begin, begin + k) covers indices 0..k-1 78 | if (k >= 1) { 79 | T max_left 80 | = *std::max_element(arr.begin(), arr.begin() + k, cmp_less); 81 | if (!cmp_geq(arr[k], max_left)) { 82 | REPORT_FAIL("incorrect left partition", 
arr.size(), type, k); 83 | } 84 | } 85 | // 3) Elements to the right of k should be atleast arr[k] 86 | if (k != (size_t)(arr.size() - 1)) { 87 | T min_right 88 | = *std::min_element(arr.begin() + k + 1, arr.end(), cmp_less); 89 | if (!cmp_leq(arr[k], min_right)) { 90 | REPORT_FAIL("incorrect right partition", arr.size(), type, k); 91 | } 92 | } 93 | } 94 | 95 | template 96 | void IS_ARR_PARTIALSORTED(std::vector arr, 97 | size_t k, 98 | std::vector sorted, 99 | std::string type) 100 | { 101 | if (memcmp(arr.data(), sorted.data(), k * sizeof(T)) != 0) { 102 | REPORT_FAIL("Partial array not sorted", arr.size(), type, k); 103 | } 104 | } 105 | 106 | template 107 | void IS_ARG_PARTITIONED(std::vector arr, 108 | std::vector arg, 109 | T true_kth, 110 | size_t k, 111 | std::string type) 112 | { 113 | EXPECT_UNIQUE(arg) 114 | std::vector part_arr; 115 | for (auto ii : arg) { 116 | part_arr.push_back(arr[ii]); 117 | } 118 | IS_ARR_PARTITIONED(part_arr, k, true_kth, type); 119 | } 120 | #endif 121 | -------------------------------------------------------------------------------- /meson.build: -------------------------------------------------------------------------------- 1 | project('x86-simd-sort', 'cpp', 2 | version : '7.0.x', 3 | license : 'BSD 3-clause', 4 | default_options : ['cpp_std=c++17']) 5 | fs = import('fs') 6 | cpp = meson.get_compiler('cpp') 7 | src = include_directories('src') 8 | lib = include_directories('lib') 9 | bench = include_directories('benchmarks') 10 | utils = include_directories('utils') 11 | tests = include_directories('tests') 12 | 13 | # Add IPP sort to benchmarks: 14 | benchipp = false 15 | ipplink = [] 16 | if get_option('build_ippbench') 17 | benchipp = true 18 | ipplink = ['-lipps', '-lippcore'] 19 | endif 20 | 21 | # Essentially '-Werror' for the sanitizers; all problems become fatal with this set 22 | if get_option('fatal_sanitizers') 23 | add_project_arguments([ '-fno-sanitize-recover=all' ], language: 'cpp') 24 | endif 25 | 26 | # Add 
google vqsort to benchmarks: 27 | benchvq = false 28 | if get_option('build_vqsortbench') 29 | benchvq = true 30 | endif 31 | 32 | # openMP: 33 | omp = [] 34 | omp_dep = [] 35 | if get_option('use_openmp') 36 | omp = dependency('openmp', required : true) 37 | omp_dep = declare_dependency(dependencies: omp, compile_args: ['-DXSS_USE_OPENMP']) 38 | endif 39 | 40 | fp16code = '''#include 41 | int main() { 42 | __m512h temp = _mm512_set1_ph(1.0f); 43 | __m512h var2 = _mm512_min_ph(temp, temp); 44 | return 0; 45 | } 46 | ''' 47 | cancompilefp16 = cpp.compiles(fp16code, args:'-march=sapphirerapids') 48 | 49 | subdir('lib') 50 | if get_option('lib_type') == 'shared' 51 | libsimdsort = shared_library('x86simdsortcpp', 52 | 'lib/x86simdsort.cpp', 53 | include_directories : [src, utils, lib], 54 | link_with : [libtargets], 55 | dependencies: [omp_dep], 56 | gnu_symbol_visibility : 'inlineshidden', 57 | install : true, 58 | soversion : 1, 59 | ) 60 | else 61 | libsimdsort = static_library('x86simdsortcpp', 62 | 'lib/x86simdsort.cpp', 63 | include_directories : [src, utils, lib], 64 | link_with : [libtargets], 65 | dependencies: [omp_dep], 66 | gnu_symbol_visibility : 'inlineshidden', 67 | install : true, 68 | pic: true, 69 | ) 70 | endif 71 | 72 | pkg_mod = import('pkgconfig') 73 | pkg_mod.generate(libraries : libsimdsort, 74 | version : '7.0', 75 | name : 'libx86simdsortcpp', 76 | filebase : 'x86simdsortcpp', 77 | description : 'C++ template library for high performance SIMD based sorting routines.') 78 | 79 | # Create a new dependency variable making it easy to use this as a subproject: 80 | x86simdsortcpp_dep = declare_dependency( 81 | include_directories: include_directories('lib'), 82 | link_with: libsimdsort, 83 | ) 84 | 85 | # Build test suite if option build_tests set to true 86 | if get_option('build_tests') 87 | gtest_dep = dependency('gtest_main', required : true, static: false) 88 | subdir('tests') 89 | testexe = executable('testexe', 90 | include_directories : 
[lib, utils], 91 | dependencies : [gtest_dep, x86simdsortcpp_dep], 92 | link_whole : [libtests], 93 | ) 94 | test('x86 simd sort tests', testexe) 95 | endif 96 | 97 | # Build benchmarking suite if option build_benchmarks is set to true 98 | 99 | if get_option('build_benchmarks') 100 | gbench_dep = dependency('benchmark', required : true, static: false) 101 | thread_dep = dependency('threads') # libbenchmark could need pthread_create 102 | subdir('benchmarks') 103 | benchexe = executable('benchexe', 104 | include_directories : [src, lib, utils, bench], 105 | dependencies : [gbench_dep, thread_dep, x86simdsortcpp_dep], 106 | link_args: ['-lbenchmark_main', ipplink], 107 | link_whole : [libbench], 108 | ) 109 | endif 110 | 111 | summary({ 112 | 'Can compile AVX-512 FP16 ISA': cancompilefp16, 113 | 'Build test content': get_option('build_tests'), 114 | 'Build benchmarks': get_option('build_benchmarks'), 115 | }, 116 | section: 'Configuration', 117 | bool_yn: true 118 | ) 119 | 120 | -------------------------------------------------------------------------------- /src/xss-common-comparators.hpp: -------------------------------------------------------------------------------- 1 | #ifndef XSS_COMMON_COMPARATORS 2 | #define XSS_COMMON_COMPARATORS 3 | 4 | template 5 | type_t prev_value(type_t value) 6 | { 7 | // TODO this probably handles non-native float16 wrong 8 | if constexpr (std::is_floating_point::value) { 9 | return std::nextafter(value, -std::numeric_limits::infinity()); 10 | } 11 | else { 12 | if (value > std::numeric_limits::min()) { return value - 1; } 13 | else { 14 | return value; 15 | } 16 | } 17 | } 18 | 19 | template 20 | type_t next_value(type_t value) 21 | { 22 | // TODO this probably handles non-native float16 wrong 23 | if constexpr (std::is_floating_point::value) { 24 | return std::nextafter(value, std::numeric_limits::infinity()); 25 | } 26 | else { 27 | if (value < std::numeric_limits::max()) { return value + 1; } 28 | else { 29 | return value; 30 | 
} 31 | } 32 | } 33 | 34 | template 35 | X86_SIMD_SORT_INLINE void COEX(mm_t &a, mm_t &b); 36 | 37 | template 38 | struct Comparator { 39 | using reg_t = typename vtype::reg_t; 40 | using opmask_t = typename vtype::opmask_t; 41 | using type_t = typename vtype::type_t; 42 | 43 | X86_SIMD_SORT_FINLINE bool STDSortComparator(const type_t &a, 44 | const type_t &b) 45 | { 46 | if constexpr (descend) { return comparison_func(b, a); } 47 | else { 48 | return comparison_func(a, b); 49 | } 50 | } 51 | 52 | X86_SIMD_SORT_FINLINE opmask_t PartitionComparator(reg_t a, reg_t b) 53 | { 54 | if constexpr (descend) { return vtype::ge(b, a); } 55 | else { 56 | return vtype::ge(a, b); 57 | } 58 | } 59 | 60 | X86_SIMD_SORT_FINLINE void COEX(reg_t &a, reg_t &b) 61 | { 62 | if constexpr (descend) { ::COEX(b, a); } 63 | else { 64 | ::COEX(a, b); 65 | } 66 | } 67 | 68 | // Returns a vector of values that would be sorted as far right as possible 69 | // For ascending order, this is the maximum possible value 70 | X86_SIMD_SORT_FINLINE reg_t rightmostPossibleVec() 71 | { 72 | if constexpr (descend) { return vtype::zmm_min(); } 73 | else { 74 | return vtype::zmm_max(); 75 | } 76 | } 77 | 78 | // Returns the value that would be leftmost of the two when sorted 79 | // For ascending order, that is the smaller value 80 | X86_SIMD_SORT_FINLINE type_t leftmost(type_t smaller, type_t larger) 81 | { 82 | if constexpr (descend) { 83 | UNUSED(smaller); 84 | return larger; 85 | } 86 | else { 87 | UNUSED(larger); 88 | return smaller; 89 | } 90 | } 91 | 92 | // Returns the value that would be rightmost of the two when sorted 93 | // For ascending order, that is the larger value 94 | X86_SIMD_SORT_FINLINE type_t rightmost(type_t smaller, type_t larger) 95 | { 96 | if constexpr (descend) { 97 | UNUSED(larger); 98 | return smaller; 99 | } 100 | else { 101 | UNUSED(smaller); 102 | return larger; 103 | } 104 | } 105 | 106 | // If median == smallest, that implies approximately half the array is equal to 
smallest, unless we were very unlucky with our sample 107 | // Try just doing the next largest value greater than this seemingly very common value to separate them out 108 | X86_SIMD_SORT_FINLINE type_t choosePivotMedianIsSmallest(type_t median) 109 | { 110 | if constexpr (descend) { return median; } 111 | else { 112 | return next_value(median); 113 | } 114 | } 115 | 116 | // If median == largest, that implies approximately half the array is equal to largest, unless we were very unlucky with our sample 117 | // Thus, median probably is a fine pivot, since it will move all of this common value into its own partition 118 | X86_SIMD_SORT_FINLINE type_t choosePivotMedianIsLargest(type_t median) 119 | { 120 | if constexpr (descend) { return prev_value(median); } 121 | else { 122 | return median; 123 | } 124 | } 125 | }; 126 | 127 | #endif // XSS_COMMON_COMPARATORS 128 | -------------------------------------------------------------------------------- /lib/x86simdsort-scalar.h: -------------------------------------------------------------------------------- 1 | #include "custom-compare.h" 2 | #include 3 | #include 4 | 5 | namespace xss { 6 | namespace utils { 7 | /* 8 | * O(1) permute array in place: stolen from 9 | * http://www.davidespataro.it/apply-a-permutation-to-a-vector 10 | */ 11 | template 12 | void apply_permutation_in_place(T *arr, std::vector arg) 13 | { 14 | for (size_t i = 0; i < arg.size(); i++) { 15 | size_t curr = i; 16 | size_t next = arg[curr]; 17 | while (next != i) { 18 | std::swap(arr[curr], arr[next]); 19 | arg[curr] = curr; 20 | curr = next; 21 | next = arg[next]; 22 | } 23 | arg[curr] = curr; 24 | } 25 | } 26 | template 27 | decltype(auto) get_cmp_func(bool hasnan, bool reverse) 28 | { 29 | std::function cmp; 30 | if (hasnan) { 31 | if (reverse == true) { cmp = compare>(); } 32 | else { 33 | cmp = compare>(); 34 | } 35 | } 36 | else { 37 | if (reverse == true) { cmp = std::greater(); } 38 | else { 39 | cmp = std::less(); 40 | } 41 | } 42 | return 
cmp; 43 | } 44 | } // namespace utils 45 | 46 | namespace scalar { 47 | template 48 | void qsort(T *arr, size_t arrsize, bool hasnan, bool reversed) 49 | { 50 | std::sort(arr, 51 | arr + arrsize, 52 | xss::utils::get_cmp_func(hasnan, reversed)); 53 | } 54 | 55 | template 56 | void qselect(T *arr, size_t k, size_t arrsize, bool hasnan, bool reversed) 57 | { 58 | std::nth_element(arr, 59 | arr + k, 60 | arr + arrsize, 61 | xss::utils::get_cmp_func(hasnan, reversed)); 62 | } 63 | template 64 | void 65 | partial_qsort(T *arr, size_t k, size_t arrsize, bool hasnan, bool reversed) 66 | { 67 | std::partial_sort(arr, 68 | arr + k, 69 | arr + arrsize, 70 | xss::utils::get_cmp_func(hasnan, reversed)); 71 | } 72 | template 73 | std::vector 74 | argsort(T *arr, size_t arrsize, bool hasnan, bool reversed) 75 | { 76 | UNUSED(hasnan); 77 | std::vector arg(arrsize); 78 | std::iota(arg.begin(), arg.end(), 0); 79 | if (reversed) { 80 | std::sort(arg.begin(), 81 | arg.end(), 82 | compare_arg>(arr)); 83 | } 84 | else { 85 | std::sort( 86 | arg.begin(), arg.end(), compare_arg>(arr)); 87 | } 88 | return arg; 89 | } 90 | template 91 | std::vector argselect(T *arr, size_t k, size_t arrsize, bool hasnan) 92 | { 93 | UNUSED(hasnan); 94 | std::vector arg(arrsize); 95 | std::iota(arg.begin(), arg.end(), 0); 96 | std::nth_element(arg.begin(), 97 | arg.begin() + k, 98 | arg.end(), 99 | compare_arg>(arr)); 100 | return arg; 101 | } 102 | template 103 | void keyvalue_qsort( 104 | T1 *key, T2 *val, size_t arrsize, bool hasnan, bool descending) 105 | { 106 | std::vector arg = argsort(key, arrsize, hasnan, descending); 107 | utils::apply_permutation_in_place(key, arg); 108 | utils::apply_permutation_in_place(val, arg); 109 | } 110 | template 111 | void keyvalue_select(T1 *key, 112 | T2 *val, 113 | size_t k, 114 | size_t arrsize, 115 | bool hasnan, 116 | bool descending) 117 | { 118 | // Note that this does a full kv-sort 119 | UNUSED(k); 120 | keyvalue_qsort(key, val, arrsize, hasnan, descending); 
121 | } 122 | template 123 | void keyvalue_partial_sort(T1 *key, 124 | T2 *val, 125 | size_t k, 126 | size_t arrsize, 127 | bool hasnan, 128 | bool descending) 129 | { 130 | // Note that this does a full kv-sort 131 | UNUSED(k); 132 | keyvalue_qsort(key, val, arrsize, hasnan, descending); 133 | } 134 | 135 | } // namespace scalar 136 | } // namespace xss 137 | -------------------------------------------------------------------------------- /src/xss-common-includes.h: -------------------------------------------------------------------------------- 1 | #ifndef XSS_COMMON_INCLUDES 2 | #define XSS_COMMON_INCLUDES 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include "xss-custom-float.h" 11 | 12 | #define X86_SIMD_SORT_INFINITY std::numeric_limits::infinity() 13 | #define X86_SIMD_SORT_INFINITYF std::numeric_limits::infinity() 14 | #define X86_SIMD_SORT_INFINITYH 0x7c00 15 | #define X86_SIMD_SORT_NEGINFINITYH 0xfc00 16 | #define X86_SIMD_SORT_MAX_UINT16 std::numeric_limits::max() 17 | #define X86_SIMD_SORT_MAX_INT16 std::numeric_limits::max() 18 | #define X86_SIMD_SORT_MIN_INT16 std::numeric_limits::min() 19 | #define X86_SIMD_SORT_MAX_UINT32 std::numeric_limits::max() 20 | #define X86_SIMD_SORT_MAX_INT32 std::numeric_limits::max() 21 | #define X86_SIMD_SORT_MIN_INT32 std::numeric_limits::min() 22 | #define X86_SIMD_SORT_MAX_UINT64 std::numeric_limits::max() 23 | #define X86_SIMD_SORT_MAX_INT64 std::numeric_limits::max() 24 | #define X86_SIMD_SORT_MIN_INT64 std::numeric_limits::min() 25 | #define ZMM_MAX_DOUBLE _mm512_set1_pd(X86_SIMD_SORT_INFINITY) 26 | #define ZMM_MAX_UINT64 _mm512_set1_epi64(X86_SIMD_SORT_MAX_UINT64) 27 | #define ZMM_MAX_INT64 _mm512_set1_epi64(X86_SIMD_SORT_MAX_INT64) 28 | #define ZMM_MAX_FLOAT _mm512_set1_ps(X86_SIMD_SORT_INFINITYF) 29 | #define ZMM_MAX_UINT _mm512_set1_epi32(X86_SIMD_SORT_MAX_UINT32) 30 | #define ZMM_MAX_INT _mm512_set1_epi32(X86_SIMD_SORT_MAX_INT32) 31 | #define ZMM_MAX_HALF 
_mm512_set1_epi16(X86_SIMD_SORT_INFINITYH) 32 | #define YMM_MAX_HALF _mm256_set1_epi16(X86_SIMD_SORT_INFINITYH) 33 | #define ZMM_MAX_UINT16 _mm512_set1_epi16(X86_SIMD_SORT_MAX_UINT16) 34 | #define ZMM_MAX_INT16 _mm512_set1_epi16(X86_SIMD_SORT_MAX_INT16) 35 | #define SHUFFLE_MASK(a, b, c, d) (a << 6) | (b << 4) | (c << 2) | d 36 | 37 | #define PRAGMA(x) _Pragma(#x) 38 | #define UNUSED(x) (void)(x) 39 | 40 | /* Compiler specific macros specific */ 41 | #ifdef _MSC_VER 42 | #define X86_SIMD_SORT_INLINE_ONLY inline 43 | #define X86_SIMD_SORT_INLINE static inline 44 | #define X86_SIMD_SORT_FINLINE static __forceinline 45 | #define LIKELY(x) (x) 46 | #define UNLIKELY(x) (x) 47 | #elif defined(__CYGWIN__) 48 | /* 49 | * Force inline in cygwin to work around a compiler bug. See 50 | * https://github.com/numpy/numpy/pull/22315#issuecomment-1267757584 51 | */ 52 | #define X86_SIMD_SORT_INLINE_ONLY inline 53 | #define X86_SIMD_SORT_INLINE static __attribute__((always_inline)) 54 | #define X86_SIMD_SORT_FINLINE static __attribute__((always_inline)) 55 | #elif defined(__GNUC__) 56 | #define X86_SIMD_SORT_INLINE_ONLY inline 57 | #define X86_SIMD_SORT_INLINE static inline 58 | #define X86_SIMD_SORT_FINLINE static inline __attribute__((always_inline)) 59 | #define LIKELY(x) __builtin_expect((x), 1) 60 | #define UNLIKELY(x) __builtin_expect((x), 0) 61 | #else 62 | #define X86_SIMD_SORT_INLINE_ONLY 63 | #define X86_SIMD_SORT_INLINE static 64 | #define X86_SIMD_SORT_FINLINE static 65 | #define LIKELY(x) (x) 66 | #define UNLIKELY(x) (x) 67 | #endif 68 | 69 | #if defined(__INTEL_COMPILER) and !defined(__SANITIZE_ADDRESS__) 70 | #define X86_SIMD_SORT_UNROLL_LOOP(num) PRAGMA(unroll(num)) 71 | #elif __GNUC__ >= 8 and !defined(__SANITIZE_ADDRESS__) 72 | #define X86_SIMD_SORT_UNROLL_LOOP(num) PRAGMA(GCC unroll num) 73 | #else 74 | #define X86_SIMD_SORT_UNROLL_LOOP(num) 75 | #endif 76 | 77 | #define NETWORK_REVERSE_4LANES 0, 1, 2, 3 78 | #define NETWORK_REVERSE_8LANES 0, 1, 2, 3, 4, 5, 6, 7 
79 | #define NETWORK_REVERSE_16LANES \ 80 | 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 81 | #define NETWORK_REVERSE_32LANES \ 82 | 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, \ 83 | 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 84 | 85 | #if defined(XSS_USE_OPENMP) && defined(_OPENMP) 86 | #define XSS_COMPILE_OPENMP 87 | #include 88 | 89 | // Limit the number of threads to 16: empirically determined, can probably be 90 | // better tuned at a later stage 91 | X86_SIMD_SORT_INLINE int xss_get_num_threads() 92 | { 93 | return std::min(16, (int)omp_get_max_threads()); 94 | } 95 | #endif 96 | 97 | template 98 | constexpr bool always_false = false; 99 | 100 | typedef size_t arrsize_t; 101 | 102 | template 103 | struct zmm_vector; 104 | 105 | template 106 | struct ymm_vector; 107 | 108 | template 109 | struct avx2_vector; 110 | 111 | template 112 | struct avx2_half_vector; 113 | 114 | enum class simd_type : int { AVX2, AVX512 }; 115 | 116 | template 117 | X86_SIMD_SORT_INLINE bool comparison_func(const T &a, const T &b); 118 | 119 | struct float16 { 120 | uint16_t val; 121 | }; 122 | 123 | #endif // XSS_COMMON_INCLUDES 124 | -------------------------------------------------------------------------------- /src/avx512-16bit-common.h: -------------------------------------------------------------------------------- 1 | /******************************************************************* 2 | * Copyright (C) 2022 Intel Corporation 3 | * SPDX-License-Identifier: BSD-3-Clause 4 | * Authors: Raghuveer Devulapalli 5 | * ****************************************************************/ 6 | 7 | #ifndef AVX512_16BIT_COMMON 8 | #define AVX512_16BIT_COMMON 9 | 10 | struct avx512_16bit_swizzle_ops { 11 | template 12 | X86_SIMD_SORT_INLINE typename vtype::reg_t swap_n(typename vtype::reg_t reg) 13 | { 14 | __m512i v = vtype::cast_to(reg); 15 | 16 | if constexpr (scale == 2) { 17 | constexpr static uint16_t arr[] 18 | = {1, 0, 3, 2, 5, 4, 7, 6, 9, 8,
11, 19 | 10, 13, 12, 15, 14, 17, 16, 19, 18, 21, 20, 20 | 23, 22, 25, 24, 27, 26, 29, 28, 31, 30}; 21 | __m512i mask = _mm512_loadu_si512(arr); 22 | v = _mm512_permutexvar_epi16(mask, v); 23 | } 24 | else if constexpr (scale == 4) { 25 | v = _mm512_shuffle_epi32(v, (_MM_PERM_ENUM)0b10110001); 26 | } 27 | else if constexpr (scale == 8) { 28 | v = _mm512_shuffle_epi32(v, (_MM_PERM_ENUM)0b01001110); 29 | } 30 | else if constexpr (scale == 16) { 31 | v = _mm512_shuffle_i64x2(v, v, 0b10110001); 32 | } 33 | else if constexpr (scale == 32) { 34 | v = _mm512_shuffle_i64x2(v, v, 0b01001110); 35 | } 36 | else { 37 | static_assert(scale == -1, "should not be reached"); 38 | } 39 | 40 | return vtype::cast_from(v); 41 | } 42 | 43 | template 44 | X86_SIMD_SORT_INLINE typename vtype::reg_t 45 | reverse_n(typename vtype::reg_t reg) 46 | { 47 | __m512i v = vtype::cast_to(reg); 48 | 49 | if constexpr (scale == 2) { return swap_n(reg); } 50 | else if constexpr (scale == 4) { 51 | constexpr static uint16_t arr[] 52 | = {3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 53 | 8, 15, 14, 13, 12, 19, 18, 17, 16, 23, 22, 54 | 21, 20, 27, 26, 25, 24, 31, 30, 29, 28}; 55 | __m512i mask = _mm512_loadu_si512(arr); 56 | v = _mm512_permutexvar_epi16(mask, v); 57 | } 58 | else if constexpr (scale == 8) { 59 | constexpr static int16_t arr[] 60 | = {7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 61 | 12, 11, 10, 9, 8, 23, 22, 21, 20, 19, 18, 62 | 17, 16, 31, 30, 29, 28, 27, 26, 25, 24}; 63 | __m512i mask = _mm512_loadu_si512(arr); 64 | v = _mm512_permutexvar_epi16(mask, v); 65 | } 66 | else if constexpr (scale == 16) { 67 | constexpr static uint16_t arr[] 68 | = {15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 69 | 4, 3, 2, 1, 0, 31, 30, 29, 28, 27, 26, 70 | 25, 24, 23, 22, 21, 20, 19, 18, 17, 16}; 71 | __m512i mask = _mm512_loadu_si512(arr); 72 | v = _mm512_permutexvar_epi16(mask, v); 73 | } 74 | else if constexpr (scale == 32) { 75 | return vtype::reverse(reg); 76 | } 77 | else { 78 | static_assert(scale == -1, "should not be 
reached"); 79 | } 80 | 81 | return vtype::cast_from(v); 82 | } 83 | 84 | template 85 | X86_SIMD_SORT_INLINE typename vtype::reg_t 86 | merge_n(typename vtype::reg_t reg, typename vtype::reg_t other) 87 | { 88 | __m512i v1 = vtype::cast_to(reg); 89 | __m512i v2 = vtype::cast_to(other); 90 | 91 | if constexpr (scale == 2) { 92 | v1 = _mm512_mask_blend_epi16( 93 | 0b01010101010101010101010101010101, v1, v2); 94 | } 95 | else if constexpr (scale == 4) { 96 | v1 = _mm512_mask_blend_epi16( 97 | 0b00110011001100110011001100110011, v1, v2); 98 | } 99 | else if constexpr (scale == 8) { 100 | v1 = _mm512_mask_blend_epi16( 101 | 0b00001111000011110000111100001111, v1, v2); 102 | } 103 | else if constexpr (scale == 16) { 104 | v1 = _mm512_mask_blend_epi16( 105 | 0b00000000111111110000000011111111, v1, v2); 106 | } 107 | else if constexpr (scale == 32) { 108 | v1 = _mm512_mask_blend_epi16( 109 | 0b00000000000000001111111111111111, v1, v2); 110 | } 111 | else { 112 | static_assert(scale == -1, "should not be reached"); 113 | } 114 | 115 | return vtype::cast_from(v1); 116 | } 117 | }; 118 | 119 | #endif // AVX512_16BIT_COMMON 120 | -------------------------------------------------------------------------------- /.github/workflows/build-numpy.yml: -------------------------------------------------------------------------------- 1 | name: NumPy 2 | 3 | on: 4 | push: 5 | branches: [ "main" ] 6 | pull_request: 7 | branches: [ "main" ] 8 | schedule: 9 | - cron: '0 5 * * *' 10 | 11 | permissions: read-all 12 | 13 | jobs: 14 | np-multiarray-tgl: 15 | 16 | if: github.repository == 'intel/x86-simd-sort' 17 | runs-on: intel-ubuntu-24.04 18 | 19 | steps: 20 | - name: Checkout x86-simd-sort 21 | uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 22 | with: 23 | fetch-depth: 0 24 | path: x86-simd-sort 25 | 26 | - name: Specify branch name 27 | working-directory: ${{ github.workspace }}/x86-simd-sort 28 | run: git switch -c pr-branch 29 | 30 | - name: Install build 
dependencies 31 | run: | 32 | sudo apt update 33 | sudo apt -y install g++-12 gcc-12 git 34 | 35 | - name: Checkout NumPy main 36 | uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 37 | with: 38 | repository: numpy/numpy 39 | submodules: recursive 40 | fetch-depth: 0 41 | ref: main 42 | path: numpy 43 | 44 | - uses: actions/setup-python@0a5c61591373683505ea898e09a3ea4f39ef2b9c # v5.0.0 45 | with: 46 | python-version: '3.11' 47 | 48 | - name: Install Intel SDE 49 | run: | 50 | curl -o /tmp/sde.tar.xz https://downloadmirror.intel.com/859732/sde-external-9.58.0-2025-06-16-lin.tar.xz 51 | mkdir /tmp/sde && tar -xvf /tmp/sde.tar.xz -C /tmp/sde/ 52 | sudo mv /tmp/sde/* /opt/sde && sudo ln -s /opt/sde/sde64 /usr/bin/sde 53 | 54 | - name: Install NumPy dependencies 55 | working-directory: ${{ github.workspace }}/numpy 56 | run: | 57 | pip install -r requirements/build_requirements.txt 58 | pip install -r requirements/test_requirements.txt 59 | 60 | - name: Update x86-simd-sort 61 | working-directory: ${{ github.workspace }}/numpy 62 | run: | 63 | cd numpy/_core/src/npysort/x86-simd-sort 64 | git remote add temp ${{ github.workspace }}/x86-simd-sort 65 | git fetch temp 66 | git checkout temp/pr-branch 67 | 68 | - name: Build and run NumPy tests 69 | working-directory: ${{ github.workspace }}/numpy 70 | env: 71 | CXX: g++-12 72 | CC: gcc-12 73 | run: | 74 | spin build -- -Dallow-noblas=true 75 | export NUMPY_SITE=$(realpath build-install/usr/lib/python*/site-packages/) 76 | export PYTHONPATH="$PYTHONPATH:$NUMPY_SITE" 77 | cd build-install && 78 | sde -tgl -- python -m pytest $NUMPY_SITE/numpy/_core/tests/test_multiarray.py 79 | 80 | np-multiarray-spr: 81 | 82 | if: github.repository == 'intel/x86-simd-sort' 83 | runs-on: intel-ubuntu-24.04 84 | 85 | steps: 86 | - name: Checkout x86-simd-sort 87 | uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 88 | with: 89 | fetch-depth: 0 90 | path: x86-simd-sort 91 | 92 | - name: Specify 
branch name 93 | working-directory: ${{ github.workspace }}/x86-simd-sort 94 | run: git switch -c pr-branch 95 | 96 | - name: Install build dependencies 97 | run: | 98 | sudo apt update 99 | sudo apt -y install g++-12 gcc-12 git 100 | 101 | - name: Install Intel SDE 102 | run: | 103 | curl -o /tmp/sde.tar.xz https://downloadmirror.intel.com/859732/sde-external-9.58.0-2025-06-16-lin.tar.xz 104 | mkdir /tmp/sde && tar -xvf /tmp/sde.tar.xz -C /tmp/sde/ 105 | sudo mv /tmp/sde/* /opt/sde && sudo ln -s /opt/sde/sde64 /usr/bin/sde 106 | 107 | - name: Checkout NumPy main 108 | uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 109 | with: 110 | repository: numpy/numpy 111 | submodules: recursive 112 | fetch-depth: 0 113 | ref: main 114 | path: numpy 115 | 116 | - uses: actions/setup-python@0a5c61591373683505ea898e09a3ea4f39ef2b9c # v5.0.0 117 | with: 118 | python-version: '3.11' 119 | 120 | - name: Install NumPy dependencies 121 | working-directory: ${{ github.workspace }}/numpy 122 | run: | 123 | pip install -r requirements/build_requirements.txt 124 | pip install -r requirements/test_requirements.txt 125 | 126 | - name: Update x86-simd-sort 127 | working-directory: ${{ github.workspace }}/numpy 128 | run: | 129 | cd numpy/_core/src/npysort/x86-simd-sort 130 | git remote add temp ${{ github.workspace }}/x86-simd-sort 131 | git fetch temp 132 | git checkout temp/pr-branch 133 | 134 | - name: Build NumPy with cpu basline SPR 135 | working-directory: ${{ github.workspace }}/numpy 136 | env: 137 | CXX: g++-12 138 | CC: gcc-12 139 | run: | 140 | spin build -- -Dallow-noblas=true -Dcpu-baseline=avx512_spr 141 | 142 | - name: Run tests on SPR 143 | working-directory: ${{ github.workspace }}/numpy 144 | run: | 145 | export NUMPY_SITE=$(realpath build-install/usr/lib/python*/site-packages/) 146 | export PYTHONPATH="$PYTHONPATH:$NUMPY_SITE" 147 | cd build-install && 148 | sde -spr -- python -c "import numpy; numpy.show_config()" && 149 | sde -spr -- python -m 
pytest $NUMPY_SITE/numpy/_core/tests/test_multiarray.py 150 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Contributor Covenant Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | We as members, contributors, and leaders pledge to make participation in our 6 | community a harassment-free experience for everyone, regardless of age, body 7 | size, visible or invisible disability, ethnicity, sex characteristics, gender 8 | identity and expression, level of experience, education, socio-economic status, 9 | nationality, personal appearance, race, caste, color, religion, or sexual 10 | identity and orientation. 11 | 12 | We pledge to act and interact in ways that contribute to an open, welcoming, 13 | diverse, inclusive, and healthy community. 14 | 15 | ## Our Standards 16 | 17 | Examples of behavior that contributes to a positive environment for our 18 | community include: 19 | 20 | * Demonstrating empathy and kindness toward other people 21 | * Being respectful of differing opinions, viewpoints, and experiences 22 | * Giving and gracefully accepting constructive feedback 23 | * Accepting responsibility and apologizing to those affected by our mistakes, 24 | and learning from the experience 25 | * Focusing on what is best not just for us as individuals, but for the overall 26 | community 27 | 28 | Examples of unacceptable behavior include: 29 | 30 | * The use of sexualized language or imagery, and sexual attention or advances of 31 | any kind 32 | * Trolling, insulting or derogatory comments, and personal or political attacks 33 | * Public or private harassment 34 | * Publishing others' private information, such as a physical or email address, 35 | without their explicit permission 36 | * Other conduct which could reasonably be considered inappropriate in a 37 | professional setting 38 | 39 | ## Enforcement Responsibilities 40 | 41 | 
Community leaders are responsible for clarifying and enforcing our standards of 42 | acceptable behavior and will take appropriate and fair corrective action in 43 | response to any behavior that they deem inappropriate, threatening, offensive, 44 | or harmful. 45 | 46 | Community leaders have the right and responsibility to remove, edit, or reject 47 | comments, commits, code, wiki edits, issues, and other contributions that are 48 | not aligned to this Code of Conduct, and will communicate reasons for moderation 49 | decisions when appropriate. 50 | 51 | ## Scope 52 | 53 | This Code of Conduct applies within all community spaces, and also applies when 54 | an individual is officially representing the community in public spaces. 55 | Examples of representing our community include using an official e-mail address, 56 | posting via an official social media account, or acting as an appointed 57 | representative at an online or offline event. 58 | 59 | ## Enforcement 60 | 61 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 62 | reported to the community leaders responsible for enforcement at 63 | CommunityCodeOfConduct AT intel DOT com. 64 | All complaints will be reviewed and investigated promptly and fairly. 65 | 66 | All community leaders are obligated to respect the privacy and security of the 67 | reporter of any incident. 68 | 69 | ## Enforcement Guidelines 70 | 71 | Community leaders will follow these Community Impact Guidelines in determining 72 | the consequences for any action they deem in violation of this Code of Conduct: 73 | 74 | ### 1. Correction 75 | 76 | **Community Impact**: Use of inappropriate language or other behavior deemed 77 | unprofessional or unwelcome in the community. 78 | 79 | **Consequence**: A private, written warning from community leaders, providing 80 | clarity around the nature of the violation and an explanation of why the 81 | behavior was inappropriate. A public apology may be requested. 
82 | 83 | ### 2. Warning 84 | 85 | **Community Impact**: A violation through a single incident or series of 86 | actions. 87 | 88 | **Consequence**: A warning with consequences for continued behavior. No 89 | interaction with the people involved, including unsolicited interaction with 90 | those enforcing the Code of Conduct, for a specified period of time. This 91 | includes avoiding interactions in community spaces as well as external channels 92 | like social media. Violating these terms may lead to a temporary or permanent 93 | ban. 94 | 95 | ### 3. Temporary Ban 96 | 97 | **Community Impact**: A serious violation of community standards, including 98 | sustained inappropriate behavior. 99 | 100 | **Consequence**: A temporary ban from any sort of interaction or public 101 | communication with the community for a specified period of time. No public or 102 | private interaction with the people involved, including unsolicited interaction 103 | with those enforcing the Code of Conduct, is allowed during this period. 104 | Violating these terms may lead to a permanent ban. 105 | 106 | ### 4. Permanent Ban 107 | 108 | **Community Impact**: Demonstrating a pattern of violation of community 109 | standards, including sustained inappropriate behavior, harassment of an 110 | individual, or aggression toward or disparagement of classes of individuals. 111 | 112 | **Consequence**: A permanent ban from any sort of public interaction within the 113 | community. 114 | 115 | ## Attribution 116 | 117 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], 118 | version 2.1, available at 119 | [https://www.contributor-covenant.org/version/2/1/code_of_conduct.html][v2.1]. 120 | 121 | Community Impact Guidelines were inspired by 122 | [Mozilla's code of conduct enforcement ladder][Mozilla CoC]. 123 | 124 | For answers to common questions about this code of conduct, see the FAQ at 125 | [https://www.contributor-covenant.org/faq][FAQ]. 
Translations are available at 126 | [https://www.contributor-covenant.org/translations][translations]. 127 | 128 | [homepage]: https://www.contributor-covenant.org 129 | [v2.1]: https://www.contributor-covenant.org/version/2/1/code_of_conduct.html 130 | [Mozilla CoC]: https://github.com/mozilla/diversity 131 | [FAQ]: https://www.contributor-covenant.org/faq 132 | -------------------------------------------------------------------------------- /utils/rand_array.h: -------------------------------------------------------------------------------- 1 | /******************************************* 2 | * * Copyright (C) 2022 Intel Corporation 3 | * * SPDX-License-Identifier: BSD-3-Clause 4 | * *******************************************/ 5 | #ifndef UTILS_RAND_ARRAY 6 | #define UTILS_RAND_ARRAY 7 | 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include "xss-custom-float.h" 14 | 15 | template 16 | static std::vector get_uniform_rand_array(int64_t arrsize, 17 | T max = xss::fp::max(), 18 | T min = xss::fp::min()) 19 | { 20 | std::vector arr; 21 | std::random_device rd; 22 | if constexpr (std::is_floating_point_v) { 23 | std::mt19937 gen(rd()); 24 | #ifndef XSS_DO_NOT_SET_SEED 25 | gen.seed(42); 26 | #endif 27 | std::uniform_real_distribution dis(min, max); 28 | for (int64_t ii = 0; ii < arrsize; ++ii) { 29 | arr.emplace_back(dis(gen)); 30 | } 31 | } 32 | #ifdef __FLT16_MAX__ 33 | else if constexpr (std::is_same_v) { 34 | (void)(max); 35 | (void)(min); 36 | for (auto jj = 0; jj < arrsize; ++jj) { 37 | float temp = (float)rand() / (float)(RAND_MAX); 38 | arr.push_back((_Float16)temp); 39 | } 40 | } 41 | #endif 42 | else if constexpr (std::is_integral_v) { 43 | std::default_random_engine e1(rd()); 44 | #ifndef XSS_DO_NOT_SET_SEED 45 | e1.seed(42); 46 | #endif 47 | std::uniform_int_distribution uniform_dist(min, max); 48 | for (int64_t ii = 0; ii < arrsize; ++ii) { 49 | arr.emplace_back(uniform_dist(e1)); 50 | } 51 | } 52 | return arr; 53 | } 54 | 55 | 
template 56 | static std::vector get_uniform_rand_array_with_uniquevalues( 57 | int64_t arrsize, T max = xss::fp::max(), T min = xss::fp::min()) 58 | { 59 | std::vector arr = get_uniform_rand_array(arrsize, max, min); 60 | typename std::vector::iterator ip 61 | = std::unique(arr.begin(), arr.begin() + arrsize); 62 | arr.resize(std::distance(arr.begin(), ip)); 63 | return arr; 64 | } 65 | 66 | template 67 | static std::vector get_array(std::string arrtype, 68 | size_t arrsize, 69 | T min = xss::fp::min(), 70 | T max = xss::fp::max()) 71 | { 72 | std::vector arr; 73 | if (arrsize == 0) return arr; 74 | if (arrtype == "random") { 75 | arr = get_uniform_rand_array(arrsize, max, min); 76 | } 77 | else if (arrtype == "sorted") { 78 | arr = get_uniform_rand_array(arrsize, max, min); 79 | std::sort(arr.begin(), arr.end()); 80 | } 81 | else if (arrtype == "constant") { 82 | T temp = get_uniform_rand_array(1, max, min)[0]; 83 | for (size_t ii = 0; ii < arrsize; ++ii) { 84 | arr.push_back(temp); 85 | } 86 | } 87 | else if (arrtype == "reverse") { 88 | arr = get_uniform_rand_array(arrsize, max, min); 89 | std::sort(arr.begin(), arr.end()); 90 | std::reverse(arr.begin(), arr.end()); 91 | } 92 | else if (arrtype == "smallrange") { 93 | arr = get_uniform_rand_array(arrsize, 20, 1); 94 | } 95 | else if (arrtype == "random_5d") { 96 | size_t temp = std::max((size_t)1, (size_t)(0.5 * arrsize)); 97 | std::vector temparr = get_uniform_rand_array(temp); 98 | for (size_t ii = 0; ii < arrsize; ++ii) { 99 | if (ii < temp) { arr.push_back(temparr[ii]); } 100 | else { 101 | arr.push_back((T)0); 102 | } 103 | } 104 | std::shuffle(arr.begin(), arr.end(), std::default_random_engine(42)); 105 | } 106 | else if (arrtype == "max_at_the_end") { 107 | arr = get_uniform_rand_array(arrsize, max, min); 108 | if (xss::fp::is_floating_point_v) { 109 | arr[arrsize - 1] = xss::fp::infinity(); 110 | } 111 | else { 112 | arr[arrsize - 1] = std::numeric_limits::max(); 113 | } 114 | } 115 | else if (arrtype 
== "rand_with_nan") { 116 | arr = get_uniform_rand_array(arrsize, max, min); 117 | int64_t num_nans = 10 % arrsize; 118 | std::vector rand_indx 119 | = get_uniform_rand_array(num_nans, arrsize - 1, 0); 120 | T val; 121 | if constexpr (xss::fp::is_floating_point_v) { 122 | val = xss::fp::quiet_NaN(); 123 | } 124 | else { 125 | val = std::numeric_limits::max(); 126 | } 127 | for (auto ind : rand_indx) { 128 | arr[ind] = val; 129 | } 130 | } 131 | else if (arrtype == "rand_max") { 132 | arr = get_uniform_rand_array(arrsize, max, min); 133 | T val; 134 | if constexpr (xss::fp::is_floating_point_v) { 135 | val = xss::fp::infinity(); 136 | } 137 | else { 138 | val = std::numeric_limits::max(); 139 | } 140 | for (size_t ii = 0; ii < arrsize; ++ii) { 141 | if (rand() & 0x1) { arr[ii] = val; } 142 | } 143 | } 144 | else if (arrtype == "rand_with_max_and_nan") { 145 | arr = get_uniform_rand_array(arrsize, max, min); 146 | T max_val; 147 | T nan_val; 148 | if constexpr (xss::fp::is_floating_point_v) { 149 | max_val = xss::fp::infinity(); 150 | nan_val = xss::fp::quiet_NaN(); 151 | } 152 | else { 153 | max_val = std::numeric_limits::max(); 154 | nan_val = std::numeric_limits::max(); 155 | } 156 | for (size_t ii = 0; ii < arrsize; ++ii) { 157 | int res = rand() % 4; 158 | if (res == 2) { arr[ii] = max_val; } 159 | else if (res == 3) { 160 | arr[ii] = nan_val; 161 | } 162 | } 163 | } 164 | else { 165 | std::cout << "Warning: unrecognized array type " << arrtype 166 | << std::endl; 167 | } 168 | return arr; 169 | } 170 | 171 | #endif // UTILS_RAND_ARRAY 172 | -------------------------------------------------------------------------------- /src/avx512fp16-16bit-qsort.hpp: -------------------------------------------------------------------------------- 1 | /******************************************************************* 2 | * Copyright (C) 2022 Intel Corporation 3 | * SPDX-License-Identifier: BSD-3-Clause 4 | * Authors: Raghuveer Devulapalli 5 | * 
****************************************************************/ 6 | 7 | #ifndef AVX512FP16_QSORT_16BIT 8 | #define AVX512FP16_QSORT_16BIT 9 | 10 | #include "avx512-16bit-common.h" 11 | 12 | typedef union { 13 | _Float16 f_; 14 | uint16_t i_; 15 | } Fp16Bits; 16 | 17 | template <> 18 | struct zmm_vector<_Float16> { 19 | using type_t = _Float16; 20 | using reg_t = __m512h; 21 | using halfreg_t = __m256h; 22 | using opmask_t = __mmask32; 23 | static const uint8_t numlanes = 32; 24 | static constexpr int network_sort_threshold = 128; 25 | static constexpr int partition_unroll_factor = 8; 26 | static constexpr simd_type vec_type = simd_type::AVX512; 27 | 28 | using swizzle_ops = avx512_16bit_swizzle_ops; 29 | 30 | static type_t type_max() 31 | { 32 | Fp16Bits val; 33 | val.i_ = X86_SIMD_SORT_INFINITYH; 34 | return val.f_; 35 | } 36 | static type_t type_min() 37 | { 38 | Fp16Bits val; 39 | val.i_ = X86_SIMD_SORT_NEGINFINITYH; 40 | return val.f_; 41 | } 42 | static reg_t zmm_max() 43 | { 44 | return _mm512_set1_ph(type_max()); 45 | } 46 | static reg_t zmm_min() 47 | { 48 | return _mm512_set1_ph(type_min()); 49 | } 50 | static opmask_t knot_opmask(opmask_t x) 51 | { 52 | return _knot_mask32(x); 53 | } 54 | static opmask_t ge(reg_t x, reg_t y) 55 | { 56 | return _mm512_cmp_ph_mask(x, y, _CMP_GE_OQ); 57 | } 58 | static opmask_t eq(reg_t x, reg_t y) 59 | { 60 | return _mm512_cmp_ph_mask(x, y, _CMP_EQ_OQ); 61 | } 62 | static opmask_t get_partial_loadmask(uint64_t num_to_read) 63 | { 64 | return ((0x1ull << num_to_read) - 0x1ull); 65 | } 66 | static int32_t convert_mask_to_int(opmask_t mask) 67 | { 68 | return mask; 69 | } 70 | template 71 | static opmask_t fpclass(reg_t x) 72 | { 73 | return _mm512_fpclass_ph_mask(x, type); 74 | } 75 | static reg_t loadu(void const *mem) 76 | { 77 | return _mm512_loadu_ph(mem); 78 | } 79 | static reg_t max(reg_t x, reg_t y) 80 | { 81 | return _mm512_max_ph(x, y); 82 | } 83 | static void mask_compressstoreu(void *mem, opmask_t mask, reg_t x) 
84 | { 85 | __m512i temp = _mm512_castph_si512(x); 86 | // AVX512_VBMI2 87 | return _mm512_mask_compressstoreu_epi16(mem, mask, temp); 88 | } 89 | static reg_t maskz_loadu(opmask_t mask, void const *mem) 90 | { 91 | return _mm512_castsi512_ph(_mm512_maskz_loadu_epi16(mask, mem)); 92 | } 93 | static reg_t mask_loadu(reg_t x, opmask_t mask, void const *mem) 94 | { 95 | // AVX512BW 96 | return _mm512_castsi512_ph( 97 | _mm512_mask_loadu_epi16(_mm512_castph_si512(x), mask, mem)); 98 | } 99 | static reg_t mask_mov(reg_t x, opmask_t mask, reg_t y) 100 | { 101 | return _mm512_castsi512_ph(_mm512_mask_mov_epi16( 102 | _mm512_castph_si512(x), mask, _mm512_castph_si512(y))); 103 | } 104 | static void mask_storeu(void *mem, opmask_t mask, reg_t x) 105 | { 106 | return _mm512_mask_storeu_epi16(mem, mask, _mm512_castph_si512(x)); 107 | } 108 | static reg_t min(reg_t x, reg_t y) 109 | { 110 | return _mm512_min_ph(x, y); 111 | } 112 | static reg_t permutexvar(__m512i idx, reg_t zmm) 113 | { 114 | return _mm512_permutexvar_ph(idx, zmm); 115 | } 116 | static type_t reducemax(reg_t v) 117 | { 118 | return _mm512_reduce_max_ph(v); 119 | } 120 | static type_t reducemin(reg_t v) 121 | { 122 | return _mm512_reduce_min_ph(v); 123 | } 124 | static reg_t set1(type_t v) 125 | { 126 | return _mm512_set1_ph(v); 127 | } 128 | template 129 | static reg_t shuffle(reg_t zmm) 130 | { 131 | __m512i temp = _mm512_shufflehi_epi16(_mm512_castph_si512(zmm), 132 | (_MM_PERM_ENUM)mask); 133 | return _mm512_castsi512_ph( 134 | _mm512_shufflelo_epi16(temp, (_MM_PERM_ENUM)mask)); 135 | } 136 | static void storeu(void *mem, reg_t x) 137 | { 138 | return _mm512_storeu_ph(mem, x); 139 | } 140 | static reg_t reverse(reg_t zmm) 141 | { 142 | constexpr static uint16_t arr[] = {NETWORK_REVERSE_32LANES}; 143 | const auto rev_index = _mm512_loadu_si512(arr); 144 | return permutexvar(rev_index, zmm); 145 | } 146 | static reg_t sort_vec(reg_t x) 147 | { 148 | return sort_reg_32lanes>(x); 149 | } 150 | static reg_t 
cast_from(__m512i v) 151 | { 152 | return _mm512_castsi512_ph(v); 153 | } 154 | static __m512i cast_to(reg_t v) 155 | { 156 | return _mm512_castph_si512(v); 157 | } 158 | static bool all_false(opmask_t k) 159 | { 160 | return k == 0; 161 | } 162 | static int double_compressstore(type_t *left_addr, 163 | type_t *right_addr, 164 | opmask_t k, 165 | reg_t reg) 166 | { 167 | return avx512_double_compressstore>( 168 | left_addr, right_addr, k, reg); 169 | } 170 | }; 171 | 172 | template <> 173 | X86_SIMD_SORT_INLINE_ONLY bool is_a_nan<_Float16>(_Float16 elem) 174 | { 175 | return elem != elem; 176 | } 177 | 178 | template <> 179 | X86_SIMD_SORT_INLINE_ONLY void replace_inf_with_nan(_Float16 *arr, 180 | arrsize_t size, 181 | arrsize_t nan_count, 182 | bool descending) 183 | { 184 | Fp16Bits val; 185 | val.i_ = 0x7c01; 186 | 187 | if (descending) { 188 | for (arrsize_t ii = 0; nan_count > 0; ++ii) { 189 | arr[ii] = val.f_; 190 | nan_count -= 1; 191 | } 192 | } 193 | else { 194 | for (arrsize_t ii = size - 1; nan_count > 0; --ii) { 195 | arr[ii] = val.f_; 196 | nan_count -= 1; 197 | } 198 | } 199 | } 200 | #endif // AVX512FP16_QSORT_16BIT 201 | -------------------------------------------------------------------------------- /src/README.md: -------------------------------------------------------------------------------- 1 | # x86-simd-sort 2 | 3 | C++ header file library for SIMD based 16-bit, 32-bit and 64-bit data type 4 | sorting algorithms on x86 processors. We currently have AVX-512 and AVX2 based 5 | implementation of quicksort, quickselect, partialsort, argsort, argselect & 6 | key-value sort. The static methods can be used by including 7 | `src/x86simdsort-static-incl.h` file. Compiling them with the appropriate 8 | compiler flags will choose either the AVX-512 or AVX2 versions. For AVX-512, we 9 | recommend using -march=skylake-avx512 for 32-bit and 64-bit datatypes, 10 | -march=icelake-client for 16-bit datatype and -march=sapphirerapids for 11 | _Float16. 
For AVX2 just using -mavx2 will suffice. The following APIs are 12 | currently supported: 13 | 14 | #### Quicksort 15 | 16 | Equivalent to `qsort` in 17 | [C](https://www.tutorialspoint.com/c_standard_library/c_function_qsort.htm) or 18 | `std::sort` in [C++](https://en.cppreference.com/w/cpp/algorithm/sort). 19 | 20 | ```cpp 21 | void x86simdsortStatic::qsort(T* arr, size_t arrsize, bool hasnan = false, bool descending = false); 22 | ``` 23 | Supported datatypes: `uint16_t`, `int16_t`, `_Float16`, `uint32_t`, `int32_t`, 24 | `float`, `uint64_t`, `int64_t` and `double`. AVX2 versions currently support 25 | 32-bit and 64-bit dtypes only. For floating-point types, if `arr` contains 26 | NaNs, they are moved to the end and replaced with a quiet NaN. That is, the 27 | original, bit-exact NaNs in the input are not preserved. 28 | 29 | #### Quickselect 30 | Equivalent to `std::nth_element` in 31 | [C++](https://en.cppreference.com/w/cpp/algorithm/nth_element) or 32 | `np.partition` in 33 | [NumPy](https://numpy.org/doc/stable/reference/generated/numpy.partition.html). 34 | 35 | 36 | ```cpp 37 | void x86simdsortStatic::qselect(T* arr, size_t k, size_t arrsize, bool hasnan = false, bool descending = false); 38 | ``` 39 | Supported datatypes: `uint16_t`, `int16_t`, `_Float16`, `uint32_t`, `int32_t`, 40 | `float`, `uint64_t`, `int64_t` and `double`. AVX2 versions currently support 41 | 32-bit and 64-bit dtypes only. For floating-point types, if `bool hasnan` is 42 | set, NaNs are moved to the end of the array, preserving the bit-exact NaNs in 43 | the input. If NaNs are present but `hasnan` is `false`, the behavior is 44 | undefined. 45 | 46 | #### Partialsort 47 | Equivalent to `std::partial_sort` in 48 | [C++](https://en.cppreference.com/w/cpp/algorithm/partial_sort).
49 | 50 | 51 | ```cpp 52 | void x86simdsortStatic::partial_qsort(T* arr, size_t k, size_t arrsize, bool hasnan = false, bool descending = false); 53 | ``` 54 | Supported datatypes: `uint16_t`, `int16_t`, `_Float16`, `uint32_t`, `int32_t`, 55 | `float`, `uint64_t`, `int64_t` and `double`. AVX2 versions currently support 56 | 32-bit and 64-bit dtypes only. For floating-point types, if `bool hasnan` is 57 | set, NaNs are moved to the end of the array, preserving the bit-exact NaNs in 58 | the input. If NaNs are present but `hasnan` is `false`, the behavior is 59 | undefined. 60 | 61 | #### Argsort 62 | Equivalent to `np.argsort` in 63 | [NumPy](https://numpy.org/doc/stable/reference/generated/numpy.argsort.html). 64 | 65 | ```cpp 66 | void x86simdsortStatic::argsort(T* arr, size_t *arg, size_t arrsize, bool hasnan = false, bool descending = false); 67 | ``` 68 | Supported datatypes: `uint32_t`, `int32_t`, `float`, `uint64_t`, `int64_t` and 69 | `double`. 70 | 71 | The algorithm resorts to scalar `std::sort` if the array contains NaNs. 72 | 73 | #### Argselect 74 | Equivalent to `np.argpartition` in 75 | [NumPy](https://numpy.org/doc/stable/reference/generated/numpy.argpartition.html). 76 | 77 | ```cpp 78 | void x86simdsortStatic::argselect(T* arr, size_t *arg, size_t k, size_t arrsize, bool hasnan = false); 79 | ``` 80 | Supported datatypes: `uint32_t`, `int32_t`, `float`, `uint64_t`, `int64_t` and 81 | `double`. 82 | 83 | The algorithm resorts to scalar `std::sort` if the array contains NaNs. 84 | 85 | #### Key-value sort 86 | ```cpp 87 | void x86simdsortStatic::keyvalue_qsort(T1* key, T2* value, size_t arrsize, bool hasnan = false, bool descending = false); 88 | ``` 89 | Supported datatypes: `uint32_t`, `int32_t`, `float`, `uint64_t`, `int64_t` and 90 | `double`. 91 | 92 | ## Algorithm details 93 | 94 | The ideas and code are based on these two research papers [1] and [2].
On a 95 | high level, the idea is to vectorize quicksort partitioning using AVX-512 96 | compressstore instructions. If the array size is less than a certain threshold 97 | (typically 512, 256, 128 or 64), then we use sorting networks [4,5] implemented 98 | on AVX512/AVX registers. Article [4] is a good resource for bitonic sorting 99 | networks. Article [5] lists optimal sorting networks for various array sizes. 100 | The core implementations of the vectorized qsort functions `avx*_qsort(T*, 101 | size_t)` are modified versions of avx2 quicksort presented in the paper [2] and 102 | source code associated with that paper [3]. 103 | 104 | ## Example to include and build this in a C++ code 105 | 106 | ### Sample code `main.cpp` 107 | 108 | ```cpp 109 | #include "src/x86simdsort-static-incl.h" 110 | 111 | int main() { 112 | const int ARRSIZE = 1000; 113 | std::vector arr; 114 | 115 | /* Initialize elements in reverse order */ 116 | for (int ii = 0; ii < ARRSIZE; ++ii) { 117 | arr.push_back(ARRSIZE - ii); 118 | } 119 | 120 | /* call avx512 quicksort */ 121 | x86simdsortStatic::qsort(arr.data(), ARRSIZE); 122 | return 0; 123 | } 124 | 125 | ``` 126 | 127 | ### Build using g++ 128 | 129 | ``` 130 | g++ main.cpp -mavx512f -mavx512dq -mavx512vl -O3 /* for AVX-512 */ 131 | g++ main.cpp -mavx2 -O3 /* for AVX2 */ 132 | ``` 133 | 134 | If you are using src files directly, then it is a header file only and we do 135 | not provide any compile time and run time checks which are recommended while 136 | including this in your source code. The header files are integrated into 137 | [NumPy](https://github.com/numpy/numpy) code base and this [pull 138 | request](https://github.com/numpy/numpy/pull/22315) is a good reference for how 139 | to include and build this library with your source code. 140 | 141 | ## Build requirements 142 | 143 | The sorting routines rely only on the C++ Standard Library and require a 144 | relatively modern compiler to build (ex: gcc 8.x and above).
145 | 146 | ## Instruction set requirements 147 | 148 | The `avx512_*` routines can only run on processors that have AVX-512. 149 | Specifically, the 32-bit and 64-bit routines require AVX-512F and AVX-512DQ instruction 150 | set. The 16-bit sorting requires the AVX-512F, AVX-512BW and AVX-512 VBMI2 151 | instruction set. Sorting `_Float16` will require AVX-512FP16. 152 | 153 | The `avx2_*` routines require AVX/AVX2 instruction set. We currently only 154 | support 32-bit and 64-bit data for AVX2 based methods with plans to extend that 155 | to all the other routines and data types. 156 | 157 | ## References 158 | 159 | * [1] Fast and Robust Vectorized In-Place Sorting of Primitive Types 160 | https://drops.dagstuhl.de/opus/volltexte/2021/13775/ 161 | 162 | * [2] A Novel Hybrid Quicksort Algorithm Vectorized using AVX-512 on Intel 163 | Skylake https://arxiv.org/pdf/1704.08579.pdf 164 | 165 | * [3] https://github.com/simd-sorting/fast-and-robust: SPDX-License-Identifier: MIT 166 | 167 | * [4] http://mitp-content-server.mit.edu:18180/books/content/sectbyfn?collid=books_pres_0&fn=Chapter%2027.pdf&id=8030 168 | 169 | * [5] https://bertdobbelaere.github.io/sorting_networks.html 170 | -------------------------------------------------------------------------------- /src/x86simdsort-static-incl.h: -------------------------------------------------------------------------------- 1 | #ifndef X86_SIMD_SORT_STATIC_METHODS 2 | #define X86_SIMD_SORT_STATIC_METHODS 3 | #include 4 | #include 5 | #include "xss-common-includes.h" 6 | 7 | // Supported methods declared here for a quick reference: 8 | namespace x86simdsortStatic { 9 | template 10 | X86_SIMD_SORT_FINLINE void 11 | qsort(T *arr, size_t size, bool hasnan = false, bool descending = false); 12 | 13 | template 14 | X86_SIMD_SORT_FINLINE void qselect(T *arr, 15 | size_t k, 16 | size_t size, 17 | bool hasnan = false, 18 | bool descending = false); 19 | 20 | template 21 | X86_SIMD_SORT_FINLINE void partial_qsort(T *arr, 22 | size_t k, 23
| size_t size, 24 | bool hasnan = false, 25 | bool descending = false); 26 | 27 | template 28 | X86_SIMD_SORT_FINLINE std::vector 29 | argsort(T *arr, size_t size, bool hasnan = false, bool descending = false); 30 | 31 | /* argsort API required by NumPy: */ 32 | template 33 | X86_SIMD_SORT_FINLINE void argsort(T *arr, 34 | size_t *arg, 35 | size_t size, 36 | bool hasnan = false, 37 | bool descending = false); 38 | 39 | template 40 | X86_SIMD_SORT_FINLINE std::vector 41 | argselect(T *arr, size_t k, size_t size, bool hasnan = false); 42 | 43 | /* argselect API required by NumPy: */ 44 | template 45 | void X86_SIMD_SORT_FINLINE 46 | argselect(T *arr, size_t *arg, size_t k, size_t size, bool hasnan = false); 47 | 48 | template 49 | X86_SIMD_SORT_FINLINE void keyvalue_qsort(T1 *key, 50 | T2 *val, 51 | size_t size, 52 | bool hasnan = false, 53 | bool descending = false); 54 | 55 | template 56 | X86_SIMD_SORT_FINLINE void keyvalue_select(T1 *key, 57 | T2 *val, 58 | size_t k, 59 | size_t size, 60 | bool hasnan = false, 61 | bool descending = false); 62 | 63 | template 64 | X86_SIMD_SORT_FINLINE void keyvalue_partial_sort(T1 *key, 65 | T2 *val, 66 | size_t k, 67 | size_t size, 68 | bool hasnan = false, 69 | bool descending = false); 70 | 71 | } // namespace x86simdsortStatic 72 | 73 | #define XSS_METHODS(ISA) \ 74 | template \ 75 | X86_SIMD_SORT_FINLINE void x86simdsortStatic::qsort( \ 76 | T *arr, size_t size, bool hasnan, bool descending) \ 77 | { \ 78 | ISA##_qsort(arr, size, hasnan, descending); \ 79 | } \ 80 | template \ 81 | X86_SIMD_SORT_FINLINE void x86simdsortStatic::qselect( \ 82 | T *arr, size_t k, size_t size, bool hasnan, bool descending) \ 83 | { \ 84 | ISA##_qselect(arr, k, size, hasnan, descending); \ 85 | } \ 86 | template \ 87 | X86_SIMD_SORT_FINLINE void x86simdsortStatic::partial_qsort( \ 88 | T *arr, size_t k, size_t size, bool hasnan, bool descending) \ 89 | { \ 90 | ISA##_partial_qsort(arr, k, size, hasnan, descending); \ 91 | } \ 92 | template \ 93 
| X86_SIMD_SORT_FINLINE void x86simdsortStatic::argsort( \ 94 | T *arr, size_t *arg, size_t size, bool hasnan, bool descending) \ 95 | { \ 96 | ISA##_argsort(arr, arg, size, hasnan, descending); \ 97 | } \ 98 | template \ 99 | X86_SIMD_SORT_FINLINE std::vector x86simdsortStatic::argsort( \ 100 | T *arr, size_t size, bool hasnan, bool descending) \ 101 | { \ 102 | std::vector indices(size); \ 103 | std::iota(indices.begin(), indices.end(), 0); \ 104 | x86simdsortStatic::argsort( \ 105 | arr, indices.data(), size, hasnan, descending); \ 106 | return indices; \ 107 | } \ 108 | template \ 109 | X86_SIMD_SORT_FINLINE void x86simdsortStatic::argselect( \ 110 | T *arr, size_t *arg, size_t k, size_t size, bool hasnan) \ 111 | { \ 112 | ISA##_argselect(arr, arg, k, size, hasnan); \ 113 | } \ 114 | template \ 115 | X86_SIMD_SORT_FINLINE std::vector x86simdsortStatic::argselect( \ 116 | T *arr, size_t k, size_t size, bool hasnan) \ 117 | { \ 118 | std::vector indices(size); \ 119 | std::iota(indices.begin(), indices.end(), 0); \ 120 | x86simdsortStatic::argselect(arr, indices.data(), k, size, hasnan); \ 121 | return indices; \ 122 | } \ 123 | template \ 124 | X86_SIMD_SORT_FINLINE void x86simdsortStatic::keyvalue_qsort( \ 125 | T1 *key, T2 *val, size_t size, bool hasnan, bool descending) \ 126 | { \ 127 | ISA##_qsort_kv(key, val, size, hasnan, descending); \ 128 | } \ 129 | template \ 130 | X86_SIMD_SORT_FINLINE void x86simdsortStatic::keyvalue_select( \ 131 | T1 *key, \ 132 | T2 *val, \ 133 | size_t k, \ 134 | size_t size, \ 135 | bool hasnan, \ 136 | bool descending) \ 137 | { \ 138 | ISA##_select_kv(key, val, k, size, hasnan, descending); \ 139 | } \ 140 | template \ 141 | X86_SIMD_SORT_FINLINE void x86simdsortStatic::keyvalue_partial_sort( \ 142 | T1 *key, \ 143 | T2 *val, \ 144 | size_t k, \ 145 | size_t size, \ 146 | bool hasnan, \ 147 | bool descending) \ 148 | { \ 149 | ISA##_partial_sort_kv(key, val, k, size, hasnan, descending); \ 150 | } 151 | 152 | /* 153 | * 
qsort, qselect, partial, argsort key-value sort template functions. 154 | */ 155 | #include "xss-common-qsort.h" 156 | #include "xss-common-argsort.h" 157 | #include "xss-common-keyvaluesort.hpp" 158 | 159 | #if defined(__AVX512DQ__) && defined(__AVX512VL__) 160 | /* 32-bit and 64-bit dtypes vector definitions on SKX */ 161 | #include "avx512-32bit-qsort.hpp" 162 | #include "avx512-64bit-qsort.hpp" 163 | #include "avx512-64bit-argsort.hpp" 164 | 165 | /* 16-bit dtypes vector definitions on ICL */ 166 | #if defined(__AVX512BW__) && defined(__AVX512VBMI2__) 167 | #include "avx512-16bit-qsort.hpp" 168 | /* _Float16 vector definition on SPR*/ 169 | #if defined(__FLT16_MAX__) && defined(__AVX512BW__) && defined(__AVX512FP16__) 170 | #include "avx512fp16-16bit-qsort.hpp" 171 | #endif // __FLT16_MAX__ 172 | #endif // __AVX512VBMI2__ 173 | 174 | XSS_METHODS(avx512) 175 | 176 | #if defined(__FLT16_MAX__) && defined(__AVX512BW__) \ 177 | && defined(__AVX512VBMI2__) && !defined(__AVX512FP16__) 178 | template <> 179 | [[maybe_unused]] 180 | void x86simdsortStatic::qsort<_Float16>(_Float16 *arr, 181 | size_t size, 182 | bool hasnan, 183 | bool descending) 184 | { 185 | avx512_qsort_fp16((uint16_t *)arr, size, hasnan, descending); 186 | } 187 | template <> 188 | [[maybe_unused]] 189 | void x86simdsortStatic::qselect<_Float16>( 190 | _Float16 *arr, size_t k, size_t size, bool hasnan, bool descending) 191 | { 192 | avx512_qselect_fp16((uint16_t *)arr, k, size, hasnan, descending); 193 | } 194 | template <> 195 | [[maybe_unused]] 196 | void x86simdsortStatic::partial_qsort<_Float16>( 197 | _Float16 *arr, size_t k, size_t size, bool hasnan, bool descending) 198 | { 199 | avx512_partial_qsort_fp16((uint16_t *)arr, k, size, hasnan, descending); 200 | } 201 | #endif 202 | 203 | #elif defined(__AVX2__) 204 | /* 32-bit and 64-bit dtypes vector definitions on AVX2 */ 205 | #include "avx2-32bit-half.hpp" 206 | #include "avx2-32bit-qsort.hpp" 207 | #include "avx2-64bit-qsort.hpp" 208 | 
XSS_METHODS(avx2) 209 | 210 | #else 211 | #error "x86simdsortStatic methods needs to be compiled with avx512/avx2 specific flags" 212 | #endif // (__AVX512VL__ && __AVX512DQ__) || AVX2 213 | 214 | #endif // X86_SIMD_SORT_STATIC_METHODS 215 | -------------------------------------------------------------------------------- /src/xss-network-qsort.hpp: -------------------------------------------------------------------------------- 1 | #ifndef XSS_NETWORK_QSORT 2 | #define XSS_NETWORK_QSORT 3 | 4 | #include "xss-optimal-networks.hpp" 5 | 6 | template 7 | X86_SIMD_SORT_INLINE void COEX(mm_t &a, mm_t &b); 8 | 9 | template 13 | X86_SIMD_SORT_FINLINE void bitonic_sort_n_vec(reg_t *regs) 14 | { 15 | if constexpr (numVecs == 1) { 16 | UNUSED(regs); 17 | return; 18 | } 19 | else if constexpr (numVecs == 2) { 20 | comparator::COEX(regs[0], regs[1]); 21 | } 22 | else if constexpr (numVecs == 4) { 23 | optimal_sort_4(regs); 24 | } 25 | else if constexpr (numVecs == 8) { 26 | optimal_sort_8(regs); 27 | } 28 | else if constexpr (numVecs == 16) { 29 | optimal_sort_16(regs); 30 | } 31 | else if constexpr (numVecs == 32) { 32 | optimal_sort_32(regs); 33 | } 34 | else { 35 | static_assert(numVecs == -1, "should not reach here"); 36 | } 37 | } 38 | 39 | /* 40 | * Swizzle ops explained: 41 | * swap_n: swap neighbouring blocks of size within block of size 42 | * reg i = [7,6,5,4,3,2,1,0] 43 | * swap_n<2>: = [[6,7],[4,5],[2,3],[0,1]] 44 | * swap_n<4>: = [[5,4,7,6],[1,0,3,2]] 45 | * swap_n<8>: = [[3,2,1,0,7,6,5,4]] 46 | * reverse_n: reverse elements within block of size 47 | * reg i = [7,6,5,4,3,2,1,0] 48 | * rev_n<2>: = [[6,7],[4,5],[2,3],[0,1]] 49 | * rev_n<4>: = [[4,5,6,7],[0,1,2,3]] 50 | * rev_n<8>: = [[0,1,2,3,4,5,6,7]] 51 | * merge_n: merge blocks of elements from two regs 52 | * reg b,a = [a,a,a,a,a,a,a,a], [b,b,b,b,b,b,b,b] 53 | * merge_n<2> = [a,b,a,b,a,b,a,b] 54 | * merge_n<4> = [a,a,b,b,a,a,b,b] 55 | * merge_n<8> = [a,a,a,a,b,b,b,b] 56 | */ 57 | 58 | template 63 | 
X86_SIMD_SORT_FINLINE void internal_merge_n_vec(typename vtype::reg_t *reg) 64 | { 65 | using reg_t = typename vtype::reg_t; 66 | using swizzle = typename vtype::swizzle_ops; 67 | if constexpr (scale <= 1) { 68 | UNUSED(reg); 69 | return; 70 | } 71 | else { 72 | if constexpr (first) { 73 | // Use reverse then merge 74 | X86_SIMD_SORT_UNROLL_LOOP(64) 75 | for (int i = 0; i < numVecs; i++) { 76 | reg_t &v = reg[i]; 77 | reg_t rev = swizzle::template reverse_n(v); 78 | comparator::COEX(rev, v); 79 | v = swizzle::template merge_n(v, rev); 80 | } 81 | } 82 | else { 83 | // Use swap then merge 84 | X86_SIMD_SORT_UNROLL_LOOP(64) 85 | for (int i = 0; i < numVecs; i++) { 86 | reg_t &v = reg[i]; 87 | reg_t swap = swizzle::template swap_n(v); 88 | comparator::COEX(swap, v); 89 | v = swizzle::template merge_n(v, swap); 90 | } 91 | } 92 | internal_merge_n_vec(reg); 93 | } 94 | } 95 | 96 | template 101 | X86_SIMD_SORT_FINLINE void merge_substep_n_vec(reg_t *regs) 102 | { 103 | using swizzle = typename vtype::swizzle_ops; 104 | if constexpr (numVecs <= 1) { 105 | UNUSED(regs); 106 | return; 107 | } 108 | 109 | // Reverse upper half of vectors 110 | X86_SIMD_SORT_UNROLL_LOOP(64) 111 | for (int i = numVecs / 2; i < numVecs; i++) { 112 | regs[i] = swizzle::template reverse_n(regs[i]); 113 | } 114 | // Do compare exchanges 115 | X86_SIMD_SORT_UNROLL_LOOP(64) 116 | for (int i = 0; i < numVecs / 2; i++) { 117 | comparator::COEX(regs[i], regs[numVecs - 1 - i]); 118 | } 119 | 120 | merge_substep_n_vec(regs); 121 | merge_substep_n_vec(regs 122 | + numVecs / 2); 123 | } 124 | 125 | template 130 | X86_SIMD_SORT_FINLINE void merge_step_n_vec(reg_t *regs) 131 | { 132 | // Do cross vector merges 133 | merge_substep_n_vec(regs); 134 | 135 | // Do internal vector merges 136 | internal_merge_n_vec(regs); 137 | } 138 | 139 | template 144 | X86_SIMD_SORT_FINLINE void merge_n_vec(reg_t *regs) 145 | { 146 | if constexpr (numPer > vtype::numlanes) { 147 | UNUSED(regs); 148 | return; 149 | } 150 | else 
{ 151 | merge_step_n_vec(regs); 152 | merge_n_vec(regs); 153 | } 154 | } 155 | 156 | template 160 | X86_SIMD_SORT_FINLINE void sort_vectors(reg_t *vecs) 161 | { 162 | /* Run the initial sorting network to sort the columns of the [numVecs x 163 | * num_lanes] matrix 164 | */ 165 | bitonic_sort_n_vec(vecs); 166 | 167 | // Merge the vectors using bitonic merging networks 168 | merge_n_vec(vecs); 169 | } 170 | 171 | template 175 | X86_SIMD_SORT_INLINE void sort_n_vec(typename vtype::type_t *arr, int N) 176 | { 177 | static_assert(numVecs > 0, "numVecs should be > 0"); 178 | if constexpr (numVecs > 1) { 179 | if (N * 2 <= numVecs * vtype::numlanes) { 180 | sort_n_vec(arr, N); 181 | return; 182 | } 183 | } 184 | 185 | reg_t vecs[numVecs]; 186 | 187 | // Generate masks for loading and storing 188 | typename vtype::opmask_t ioMasks[numVecs - numVecs / 2]; 189 | X86_SIMD_SORT_UNROLL_LOOP(64) 190 | for (int i = numVecs / 2, j = 0; i < numVecs; i++, j++) { 191 | uint64_t num_to_read 192 | = std::min((uint64_t)std::max(0, N - i * vtype::numlanes), 193 | (uint64_t)vtype::numlanes); 194 | ioMasks[j] = vtype::get_partial_loadmask(num_to_read); 195 | } 196 | 197 | // Unmasked part of the load 198 | X86_SIMD_SORT_UNROLL_LOOP(64) 199 | for (int i = 0; i < numVecs / 2; i++) { 200 | vecs[i] = vtype::loadu(arr + i * vtype::numlanes); 201 | } 202 | // Masked part of the load 203 | X86_SIMD_SORT_UNROLL_LOOP(64) 204 | for (int i = numVecs / 2, j = 0; i < numVecs; i++, j++) { 205 | vecs[i] = vtype::mask_loadu(comparator::rightmostPossibleVec(), 206 | ioMasks[j], 207 | arr + i * vtype::numlanes); 208 | } 209 | 210 | sort_vectors(vecs); 211 | 212 | // Unmasked part of the store 213 | X86_SIMD_SORT_UNROLL_LOOP(64) 214 | for (int i = 0; i < numVecs / 2; i++) { 215 | vtype::storeu(arr + i * vtype::numlanes, vecs[i]); 216 | } 217 | // Masked part of the store 218 | X86_SIMD_SORT_UNROLL_LOOP(64) 219 | for (int i = numVecs / 2, j = 0; i < numVecs; i++, j++) { 220 | vtype::mask_storeu(arr + i * 
vtype::numlanes, ioMasks[j], vecs[i]); 221 | } 222 | } 223 | 224 | template 225 | X86_SIMD_SORT_INLINE void sort_n(typename vtype::type_t *arr, int N) 226 | { 227 | constexpr int numVecs = maxN / vtype::numlanes; 228 | constexpr bool isMultiple = (maxN == (vtype::numlanes * numVecs)); 229 | constexpr bool powerOfTwo = (numVecs != 0 && !(numVecs & (numVecs - 1))); 230 | static_assert(powerOfTwo == true && isMultiple == true, 231 | "maxN must be vtype::numlanes times a power of 2"); 232 | 233 | sort_n_vec(arr, N); 234 | } 235 | #endif 236 | -------------------------------------------------------------------------------- /src/xss-pivot-selection.hpp: -------------------------------------------------------------------------------- 1 | #ifndef XSS_PIVOT_SELECTION 2 | #define XSS_PIVOT_SELECTION 3 | 4 | #include "xss-network-qsort.hpp" 5 | #include "xss-common-comparators.hpp" 6 | 7 | enum class pivot_result_t : int { Normal, Sorted, Only2Values }; 8 | 9 | template 10 | struct pivot_results { 11 | 12 | pivot_result_t result = pivot_result_t::Normal; 13 | type_t pivot = 0; 14 | 15 | pivot_results(type_t _pivot, 16 | pivot_result_t _result = pivot_result_t::Normal) 17 | { 18 | pivot = _pivot; 19 | result = _result; 20 | } 21 | }; 22 | 23 | template 24 | X86_SIMD_SORT_INLINE void COEX(mm_t &a, mm_t &b); 25 | 26 | template 27 | X86_SIMD_SORT_INLINE type_t get_pivot(type_t *arr, 28 | const arrsize_t left, 29 | const arrsize_t right) 30 | { 31 | using reg_t = typename vtype::reg_t; 32 | type_t samples[vtype::numlanes]; 33 | arrsize_t delta = (right - left) / vtype::numlanes; 34 | for (int i = 0; i < vtype::numlanes; i++) { 35 | samples[i] = arr[left + i * delta]; 36 | } 37 | reg_t rand_vec = vtype::loadu(samples); 38 | reg_t sort = vtype::sort_vec(rand_vec); 39 | 40 | return ((type_t *)&sort)[vtype::numlanes / 2]; 41 | } 42 | 43 | template 44 | X86_SIMD_SORT_INLINE type_t get_pivot_blocks(type_t *arr, 45 | const arrsize_t left, 46 | const arrsize_t right) 47 | { 48 | 49 | if 
(right - left <= 1024) { return get_pivot(arr, left, right); } 50 | 51 | using reg_t = typename vtype::reg_t; 52 | constexpr int numVecs = 5; 53 | 54 | arrsize_t width = (right - vtype::numlanes) - left; 55 | arrsize_t delta = width / numVecs; 56 | 57 | reg_t vecs[numVecs]; 58 | // Load data 59 | for (int i = 0; i < numVecs; i++) { 60 | vecs[i] = vtype::loadu(arr + left + delta * i); 61 | } 62 | 63 | // Implement sorting network (from https://bertdobbelaere.github.io/sorting_networks.html) 64 | COEX(vecs[0], vecs[3]); 65 | COEX(vecs[1], vecs[4]); 66 | 67 | COEX(vecs[0], vecs[2]); 68 | COEX(vecs[1], vecs[3]); 69 | 70 | COEX(vecs[0], vecs[1]); 71 | COEX(vecs[2], vecs[4]); 72 | 73 | COEX(vecs[1], vecs[2]); 74 | COEX(vecs[3], vecs[4]); 75 | 76 | COEX(vecs[2], vecs[3]); 77 | 78 | // Calculate median of the middle vector 79 | reg_t &vec = vecs[numVecs / 2]; 80 | vec = vtype::sort_vec(vec); 81 | 82 | type_t data[vtype::numlanes]; 83 | vtype::storeu(data, vec); 84 | return data[vtype::numlanes / 2]; 85 | } 86 | 87 | template 88 | X86_SIMD_SORT_INLINE pivot_results 89 | get_pivot_near_constant(type_t *arr, 90 | type_t commonValue, 91 | const arrsize_t left, 92 | const arrsize_t right); 93 | 94 | template 95 | X86_SIMD_SORT_INLINE pivot_results 96 | get_pivot_smart(type_t *arr, const arrsize_t left, const arrsize_t right) 97 | { 98 | using reg_t = typename vtype::reg_t; 99 | constexpr int numVecs = 4; 100 | 101 | if (right - left + 1 <= 4 * numVecs * vtype::numlanes) { 102 | return pivot_results(get_pivot(arr, left, right)); 103 | } 104 | 105 | constexpr int N = numVecs * vtype::numlanes; 106 | 107 | arrsize_t width = (right - vtype::numlanes) - left; 108 | arrsize_t delta = width / numVecs; 109 | 110 | reg_t vecs[numVecs]; 111 | for (int i = 0; i < numVecs; i++) { 112 | vecs[i] = vtype::loadu(arr + left + delta * i); 113 | } 114 | 115 | // Sort the samples 116 | // Note that this intentionally uses the AscendingComparator 117 | // instead of the provided comparator 118 | 
sort_vectors, numVecs>(vecs); 119 | 120 | type_t samples[N]; 121 | for (int i = 0; i < numVecs; i++) { 122 | vtype::storeu(samples + vtype::numlanes * i, vecs[i]); 123 | } 124 | 125 | type_t smallest = samples[0]; 126 | type_t largest = samples[N - 1]; 127 | type_t median = samples[N / 2]; 128 | 129 | if (smallest == largest) { 130 | // We have a very unlucky sample, or the array is constant / near constant 131 | // Run a special function meant to deal with this situation 132 | return get_pivot_near_constant( 133 | arr, median, left, right); 134 | } 135 | else if (median != smallest && median != largest) { 136 | // We have a normal sample; use it's median 137 | return pivot_results(median); 138 | } 139 | else if (median == smallest) { 140 | // We will either return the median or the next value larger than the median, 141 | // depending on the comparator (see xss-common-comparators.hpp for more details) 142 | return pivot_results( 143 | comparator::choosePivotMedianIsSmallest(median)); 144 | } 145 | else if (median == largest) { 146 | // We will either return the median or the next value smaller than the median, 147 | // depending on the comparator (see xss-common-comparators.hpp for more details) 148 | return pivot_results( 149 | comparator::choosePivotMedianIsLargest(median)); 150 | } 151 | 152 | return pivot_results(median); 153 | } 154 | 155 | // Handles the case where we seem to have a near-constant array, since our sample of the array was constant 156 | template 157 | X86_SIMD_SORT_INLINE pivot_results 158 | get_pivot_near_constant(type_t *arr, 159 | type_t commonValue, 160 | const arrsize_t left, 161 | const arrsize_t right) 162 | { 163 | using reg_t = typename vtype::reg_t; 164 | 165 | arrsize_t index = left; 166 | 167 | type_t value1 = 0; 168 | type_t value2 = 0; 169 | 170 | // First, search for any value not equal to the common value 171 | // First vectorized 172 | reg_t commonVec = vtype::set1(commonValue); 173 | for (; index <= right - vtype::numlanes; 
index += vtype::numlanes) { 174 | reg_t data = vtype::loadu(arr + index); 175 | if (!vtype::all_false(vtype::knot_opmask(vtype::eq(data, commonVec)))) { 176 | break; 177 | } 178 | } 179 | 180 | // Than scalar at the end 181 | for (; index <= right; index++) { 182 | if (arr[index] != commonValue) { 183 | value1 = arr[index]; 184 | break; 185 | } 186 | } 187 | 188 | if (index == right + 1) { 189 | // The array is completely constant 190 | // Setting the second flag to true skips partitioning, as the array is constant and thus sorted 191 | return pivot_results(commonValue, pivot_result_t::Sorted); 192 | } 193 | 194 | // Secondly, search for a second value not equal to either of the previous two 195 | // First vectorized 196 | reg_t value1Vec = vtype::set1(value1); 197 | for (; index <= right - vtype::numlanes; index += vtype::numlanes) { 198 | reg_t data = vtype::loadu(arr + index); 199 | if (!vtype::all_false(vtype::knot_opmask(vtype::eq(data, commonVec))) 200 | && !vtype::all_false( 201 | vtype::knot_opmask(vtype::eq(data, value1Vec)))) { 202 | break; 203 | } 204 | } 205 | 206 | // Then scalar 207 | for (; index <= right; index++) { 208 | if (arr[index] != commonValue && arr[index] != value1) { 209 | value2 = arr[index]; 210 | break; 211 | } 212 | } 213 | 214 | if (index == right + 1) { 215 | // The array contains only 2 values 216 | // We must pick the larger one, else the right partition is empty 217 | // (note that larger is determined using the provided comparator, so it might actually be the smaller one) 218 | // We can also skip recursing, as it is guaranteed both partitions are constant after partitioning with the chosen value 219 | // TODO this logic now assumes we use greater than or equal to specifically when partitioning, might be worth noting that somewhere 220 | type_t pivot 221 | = std::max(value1, commonValue, comparator::STDSortComparator); 222 | return pivot_results(pivot, pivot_result_t::Only2Values); 223 | } 224 | 225 | // The array has at least 3 
distinct values. Use the middle one as the pivot 226 | type_t median = std::max( 227 | std::min(value1, value2, comparison_func), 228 | std::min(std::max(value1, value2, comparison_func), 229 | commonValue, 230 | comparison_func), 231 | comparison_func); 232 | return pivot_results(median); 233 | } 234 | 235 | #endif 236 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # x86-simd-sort 2 | 3 | C++ template library for high performance SIMD based sorting routines for 4 | built-in integers and floats (16-bit, 32-bit and 64-bit data types) and custom 5 | defined C++ objects. The sorting routines are accelerated using AVX-512/AVX2 6 | when available. The library auto picks the best version depending on the 7 | processor it is run on. If you are looking for the AVX-512 or AVX2 specific 8 | implementations, please see 9 | [README](https://github.com/intel/x86-simd-sort/blob/main/src/README.md) file 10 | under `src/` directory. The following routines are currently supported: 11 | 12 | ## Sort an array of custom defined class objects (uses `O(N)` space) 13 | ``` cpp 14 | template 15 | void x86simdsort::object_qsort(T *arr, U arrsize, Func key_func) 16 | ``` 17 | `T` is any user defined struct or class and `arr` is a pointer to the first 18 | element in the array of objects of type `T`. The `arrsize` parameter can be any 19 | 32-bit or 64-bit integer type. `Func` is a lambda function that computes the 20 | `key` value for each object which is the metric used to sort the objects. 21 | `Func` needs to have the following signature: 22 | 23 | ```cpp 24 | [] (T obj) -> key_t { key_t key; /* compute key for obj */ return key; } 25 | ``` 26 | 27 | Note that the return type of the key `key_t` needs to be one of the following : 28 | `[float, uint32_t, int32_t, double, uint64_t, int64_t]`. `object_qsort` has a 29 | space complexity of `O(N)`. 
Specifically, it requires `arrsize * sizeof(key_t)` 30 | bytes to store a vector with all the keys and an additional `arrsize * 31 | sizeof(uint32_t)` bytes to store the indexes of the object array. For 32 | performance reasons, we recommend using `object_qsort` when the array size 33 | is less than or equal to `UINT32_MAX`. An example usage of `object_qsort` is 34 | provided in the [examples](#Sort-an-array-of-Points-using-object_qsort) 35 | section. Refer to [section](#Performance-of-object_qsort) to get a sense of 36 | how fast this is relative to `std::sort`. 37 | 38 | ## Sort an array of built-in integers and floats 39 | ```cpp 40 | void x86simdsort::qsort(T* arr, size_t size, bool hasnan, bool descending); 41 | void x86simdsort::qselect(T* arr, size_t k, size_t size, bool hasnan, bool descending); 42 | void x86simdsort::partial_qsort(T* arr, size_t k, size_t size, bool hasnan, bool descending); 43 | ``` 44 | Supported datatypes: `T` $\in$ `[_Float16, uint16_t, int16_t, float, uint32_t, 45 | int32_t, double, uint64_t, int64_t]` 46 | 47 | ## Key-value sort routines on pairs of arrays 48 | ```cpp 49 | void x86simdsort::keyvalue_qsort(T1* key, T2* val, size_t size, bool hasnan, bool descending); 50 | void x86simdsort::keyvalue_select(T1* key, T2* val, size_t k, size_t size, bool hasnan, bool descending); 51 | void x86simdsort::keyvalue_partial_sort(T1* key, T2* val, size_t k, size_t size, bool hasnan, bool descending); 52 | ``` 53 | Supported datatypes: `T1`, `T2` $\in$ `[float, uint32_t, int32_t, double, 54 | uint64_t, int64_t]` Note that keyvalue sort is not yet supported for 16-bit 55 | data types. 
56 | 57 | ## Arg sort routines on arrays 58 | ```cpp 59 | std::vector arg = x86simdsort::argsort(T* arr, size_t size, bool hasnan, bool descending); 60 | std::vector arg = x86simdsort::argselect(T* arr, size_t k, size_t size, bool hasnan); 61 | ``` 62 | Supported datatypes: `T` $\in$ `[_Float16, uint16_t, int16_t, float, uint32_t, int32_t, double, 63 | uint64_t, int64_t]` Note that argsort and argselect are not accelerated with SIMD when using 16-bit 64 | data types. 65 | 66 | ## Build/Install 67 | 68 | [meson](https://github.com/mesonbuild/meson) is the used build system. Command 69 | to build and install the library: 70 | 71 | ``` 72 | meson setup --buildtype release builddir && cd builddir 73 | meson compile 74 | sudo meson install 75 | ``` 76 | 77 | Once installed, you can use `pkg-config --cflags --libs x86simdsortcpp` to 78 | populate the right cflags and ldflags to compile and link your C++ program. 79 | This repository also contains a test suite and benchmarking suite which are 80 | written using [googletest](https://github.com/google/googletest) and [google 81 | benchmark](https://github.com/google/benchmark) (>= v1.9.2) frameworks 82 | respectively. You can configure meson to build them both by using 83 | `-Dbuild_tests=true` and `-Dbuild_benchmarks=true`. 84 | 85 | ## Build using OpenMP 86 | 87 | `qsort`, `argsort`, and `keyvalue_qsort` can achieve even greater performance 88 | (up-to 3x speedup) through parallelization with 89 | [OpenMP](https://www.openmp.org/). By default, OpenMP support is disabled; to 90 | enable it, set the `-Duse_openmp=true` flag when configuring Meson. If you are 91 | using only the static SIMD implementations, compile with `-fopenmp 92 | -DXSS_USE_OPENMP`. 93 | 94 | OpenMP-based parallel sorting routines are used for arrays larger than a 95 | specific threshold where threading makes sense. The number of threads is 96 | limited to a maximum of 16. 
You can control the number of threads by setting 97 | the `OMP_NUM_THREADS` environment variable. 98 | 99 | ## Using x86-simd-sort as a Meson subproject 100 | 101 | If you would like to use this as a Meson subproject, then create `subprojects` 102 | directory and copy `x86-simd-sort` into it. Add these two lines 103 | in your meson.build. 104 | ``` 105 | xss = subproject('x86-simd-sort') 106 | xss_dep = xss.get_variable('x86simdsortcpp_dep') 107 | ``` 108 | 109 | For more detailed instructions please refer to Meson 110 | [documentation](https://mesonbuild.com/Subprojects.html#using-a-subproject). 111 | 112 | ## Example usage 113 | 114 | #### Sort an array of floats 115 | 116 | ```cpp 117 | #include "x86simdsort.h" 118 | 119 | int main() { 120 | std::vector arr{1000}; 121 | x86simdsort::qsort(arr.data(), 1000, true); 122 | return 0; 123 | } 124 | ``` 125 | 126 | #### Sort an array of Points using object_qsort 127 | ```cpp 128 | #include "x86simdsort.h" 129 | #include 130 | 131 | struct Point { 132 | double x, y, z; 133 | }; 134 | 135 | int main() { 136 | std::vector arr{1000}; 137 | // Sort an array of Points by its x value: 138 | x86simdsort::object_qsort(arr.data(), 1000, [](Point p) { return p.x; }); 139 | // Sort an array of Points by its distance from origin: 140 | x86simdsort::object_qsort(arr.data(), 1000, [](Point p) { 141 | return sqrt(p.x*p.x+p.y*p.y+p.z*p.z); 142 | }); 143 | return 0; 144 | } 145 | ``` 146 | 147 | ## Details 148 | 149 | - `x86simdsort::qsort` is equivalent to `qsort` in 150 | [C](https://www.tutorialspoint.com/c_standard_library/c_function_qsort.htm) 151 | or `std::sort` in [C++](https://en.cppreference.com/w/cpp/algorithm/sort). 152 | - `x86simdsort::qselect` is equivalent to `std::nth_element` in 153 | [C++](https://en.cppreference.com/w/cpp/algorithm/nth_element) or 154 | `np.partition` in 155 | [NumPy](https://numpy.org/doc/stable/reference/generated/numpy.partition.html). 
156 | - `x86simdsort::partial_qsort` is equivalent to `std::partial_sort` in 157 | [C++](https://en.cppreference.com/w/cpp/algorithm/partial_sort). 158 | - `x86simdsort::argsort` is equivalent to `np.argsort` in 159 | [NumPy](https://numpy.org/doc/stable/reference/generated/numpy.argsort.html). 160 | - `x86simdsort::argselect` is equivalent to `np.argpartition` in 161 | [NumPy](https://numpy.org/doc/stable/reference/generated/numpy.argpartition.html). 162 | 163 | Supported datatypes: `uint16_t, int16_t, _Float16, uint32_t, int32_t, float, 164 | uint64_t, int64_t, double`. Note that `_Float16` will require building this 165 | library with g++ >= 12.x. All the functions have an optional argument `bool 166 | hasnan` set to `false` by default (this is relevant to floating point data 167 | types only). If your array has NAN's and `hasnan` is `false`, the behaviour of the sorting routine 168 | is undefined. If `hasnan` is set to true, NAN's are always sorted to the end of 169 | the array. In addition to that, qsort will replace all your NAN's with 170 | `std::numeric_limits<T>::quiet_NaN`. The original bit-exact NaNs in 171 | the input are not preserved. Also note that the arg methods (argsort and 172 | argselect) will not use the SIMD based algorithms if they detect NAN's in the 173 | array. You can read details of all the implementations 174 | [here](https://github.com/intel/x86-simd-sort/blob/main/src/README.md). 175 | 176 | ## Performance comparison on AVX-512: `object_qsort` v/s `std::sort` 177 | Performance of `object_qsort` can vary significantly depending on the definition 178 | of the custom class and we highly recommend benchmarking before using it.
For 179 | the sake of illustration, we provide a few examples in 180 | [./benchmarks/bench-objsort.hpp](./benchmarks/bench-objsort.hpp) which measure 181 | performance of `object_qsort` relative to `std::sort` when sorting an array of 182 | 3D points represented by the class: `struct Point {double x, y, z;}` and 183 | `struct Point {float x, y, z;}`. We sort these points based on several 184 | different metrics: 185 | 186 | + sort by coordinate `x` 187 | + sort by Manhattan distance (relative to origin): `abs(x) + abs(y) + abs(z)` 188 | + sort by Euclidean distance (relative to origin): `sqrt(x*x + y*y + z*z)` 189 | + sort by Chebyshev distance (relative to origin): `max(abs(x), abs(y), abs(z))` 190 | 191 | The performance data (shown in the plot below) can be collected by building the 192 | benchmarks suite and running `./builddir/benchexe --benchmark_filter==*obj*`. 193 | The data plot shown below was collected on a processor with AVX-512. For the 194 | simplest of cases where we want to sort an array of structs by one of its 195 | members, `object_qsort` can be up to 5x faster for 32-bit data type and about 196 | 4x for 64-bit data type. It tends to do even better when the metric to sort by 197 | gets more complicated. Sorting by Euclidean distance can be up to 10x faster. 198 | 199 | ![alt text](./misc/object_qsort-perf.jpg?raw=true) 200 | 201 | ## Downstream projects using x86-simd-sort 202 | 203 | - NumPy uses this as a [submodule](https://github.com/numpy/numpy/pull/22315) to accelerate `np.sort, np.argsort, np.partition and np.argpartition`. 204 | - PyTorch uses this as a [submodule](https://github.com/pytorch/pytorch/pull/127936) to accelerate `torch.sort, torch.argsort`. 205 | - A slightly modified version of this library has been integrated into [openJDK](https://github.com/openjdk/jdk/pull/14227). 206 | - [GRAPE](https://github.com/alibaba/libgrape-lite.git): C++ library for parallel graph processing. 
207 | - AVX-512 version of the key-value sort has been submitted to [Oceanbase](https://github.com/oceanbase/oceanbase/pull/1325). 208 | -------------------------------------------------------------------------------- /.github/workflows/c-cpp.yml: -------------------------------------------------------------------------------- 1 | name: Build and run tests 2 | 3 | on: 4 | push: 5 | branches: [ "main" ] 6 | pull_request: 7 | branches: [ "main" ] 8 | 9 | permissions: read-all 10 | 11 | jobs: 12 | SKL-gcc9: 13 | 14 | runs-on: intel-ubuntu-24.04 15 | 16 | steps: 17 | - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 18 | 19 | - name: Install dependencies 20 | run: | 21 | sudo apt update 22 | sudo apt -y install g++-9 libgtest-dev meson curl git 23 | 24 | - name: Install Intel SDE 25 | run: | 26 | curl -o /tmp/sde.tar.xz https://downloadmirror.intel.com/859732/sde-external-9.58.0-2025-06-16-lin.tar.xz 27 | mkdir /tmp/sde && tar -xvf /tmp/sde.tar.xz -C /tmp/sde/ 28 | sudo mv /tmp/sde/* /opt/sde && sudo ln -s /opt/sde/sde64 /usr/bin/sde 29 | 30 | - name: Build 31 | env: 32 | CXX: g++-9 33 | run: | 34 | make clean 35 | meson setup -Dbuild_tests=true --warnlevel 2 --werror --buildtype release builddir 36 | cd builddir 37 | ninja 38 | 39 | - name: Run test suite on SKL 40 | run: sde -skl -- ./builddir/testexe 41 | 42 | SKX-gcc10: 43 | 44 | runs-on: intel-ubuntu-24.04 45 | 46 | steps: 47 | - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 48 | 49 | - name: Install dependencies 50 | run: | 51 | sudo apt update 52 | sudo apt -y install g++-10 libgtest-dev meson curl git 53 | 54 | - name: Install Intel SDE 55 | run: | 56 | curl -o /tmp/sde.tar.xz https://downloadmirror.intel.com/859732/sde-external-9.58.0-2025-06-16-lin.tar.xz 57 | mkdir /tmp/sde && tar -xvf /tmp/sde.tar.xz -C /tmp/sde/ 58 | sudo mv /tmp/sde/* /opt/sde && sudo ln -s /opt/sde/sde64 /usr/bin/sde 59 | 60 | - name: Build 61 | env: 62 | CXX: g++-10 63 | run: | 64 | 
make clean 65 | meson setup -Dbuild_tests=true --warnlevel 2 --werror --buildtype release builddir 66 | cd builddir 67 | ninja 68 | 69 | - name: Run test suite on SKX 70 | run: sde -skx -- ./builddir/testexe 71 | 72 | TGL-gcc11: 73 | 74 | runs-on: intel-ubuntu-24.04 75 | 76 | steps: 77 | - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 78 | 79 | - name: Install dependencies 80 | run: | 81 | sudo apt update 82 | sudo apt -y install g++-11 libgtest-dev meson curl git 83 | 84 | - name: Install Intel SDE 85 | run: | 86 | curl -o /tmp/sde.tar.xz https://downloadmirror.intel.com/859732/sde-external-9.58.0-2025-06-16-lin.tar.xz 87 | mkdir /tmp/sde && tar -xvf /tmp/sde.tar.xz -C /tmp/sde/ 88 | sudo mv /tmp/sde/* /opt/sde && sudo ln -s /opt/sde/sde64 /usr/bin/sde 89 | 90 | - name: Build 91 | env: 92 | CXX: g++-11 93 | run: | 94 | make clean 95 | meson setup -Dbuild_tests=true --warnlevel 2 --werror --buildtype release builddir 96 | cd builddir 97 | ninja 98 | - name: Run test suite on TGL 99 | run: sde -tgl -- ./builddir/testexe 100 | 101 | SPR-gcc13: 102 | 103 | runs-on: intel-ubuntu-24.04 104 | 105 | steps: 106 | - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 107 | 108 | - name: Install dependencies 109 | run: | 110 | sudo apt update 111 | sudo apt -y install g++-13 libgtest-dev meson curl git 112 | 113 | - name: Install Intel SDE 114 | run: | 115 | curl -o /tmp/sde.tar.xz https://downloadmirror.intel.com/859732/sde-external-9.58.0-2025-06-16-lin.tar.xz 116 | mkdir /tmp/sde && tar -xvf /tmp/sde.tar.xz -C /tmp/sde/ 117 | sudo mv /tmp/sde/* /opt/sde && sudo ln -s /opt/sde/sde64 /usr/bin/sde 118 | 119 | - name: Build examples 120 | env: 121 | CXX: g++-13 122 | run: | 123 | cd examples 124 | make all 125 | 126 | - name: Build 127 | env: 128 | CXX: g++-13 129 | run: | 130 | make clean 131 | meson setup -Dbuild_tests=true --warnlevel 2 --werror --buildtype release builddir 132 | cd builddir 133 | ninja 134 | 135 | - name: 
Run test suite on SPR 136 | run: sde -spr -- ./builddir/testexe 137 | 138 | ADL-ASAN-clang18: 139 | 140 | runs-on: intel-ubuntu-24.04 141 | 142 | steps: 143 | - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 144 | 145 | - name: Install dependencies 146 | run: | 147 | sudo apt update 148 | sudo apt -y install clang-18 libomp-18-dev libgtest-dev meson curl git 149 | 150 | - name: Install Intel SDE 151 | run: | 152 | curl -o /tmp/sde.tar.xz https://downloadmirror.intel.com/859732/sde-external-9.58.0-2025-06-16-lin.tar.xz 153 | mkdir /tmp/sde && tar -xvf /tmp/sde.tar.xz -C /tmp/sde/ 154 | sudo mv /tmp/sde/* /opt/sde && sudo ln -s /opt/sde/sde64 /usr/bin/sde 155 | 156 | - name: Build examples 157 | env: 158 | CXX: clang++-18 159 | run: | 160 | cd examples 161 | make all 162 | 163 | - name: Build 164 | env: 165 | CXX: clang++-18 166 | run: | 167 | make clean 168 | meson setup -Dbuild_tests=true -Duse_openmp=true -Db_sanitize=address,undefined -Dfatal_sanitizers=true -Dasan_ci_dont_validate=true -Db_lundef=false --warnlevel 0 --buildtype release builddir 169 | cd builddir 170 | ninja 171 | 172 | - name: Run test suite on ADL 173 | run: sde -adl -- ./builddir/testexe 174 | 175 | SPR-ASAN-clang18: 176 | 177 | runs-on: intel-ubuntu-24.04 178 | 179 | steps: 180 | - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 181 | 182 | - name: Install dependencies 183 | run: | 184 | sudo apt update 185 | sudo apt -y install clang-18 libomp-18-dev libgtest-dev meson curl git 186 | 187 | - name: Install Intel SDE 188 | run: | 189 | curl -o /tmp/sde.tar.xz https://downloadmirror.intel.com/859732/sde-external-9.58.0-2025-06-16-lin.tar.xz 190 | mkdir /tmp/sde && tar -xvf /tmp/sde.tar.xz -C /tmp/sde/ 191 | sudo mv /tmp/sde/* /opt/sde && sudo ln -s /opt/sde/sde64 /usr/bin/sde 192 | 193 | - name: Build examples 194 | env: 195 | CXX: clang++-18 196 | run: | 197 | cd examples 198 | make all 199 | 200 | - name: Build 201 | env: 202 | CXX: 
clang++-18 203 | run: | 204 | make clean 205 | meson setup -Dbuild_tests=true -Duse_openmp=true -Db_sanitize=address,undefined -Dfatal_sanitizers=true -Dasan_ci_dont_validate=true -Db_lundef=false --warnlevel 0 --buildtype release builddir 206 | cd builddir 207 | ninja 208 | 209 | - name: Run test suite on SPR 210 | run: sde -spr -- ./builddir/testexe 211 | - name: Run ICL fp16 tests 212 | # Note: This filters for the _Float16 tests based on the number assigned to it, which could change in the future 213 | run: sde -icx -- ./builddir/testexe --gtest_filter="*/simdsort/2*" 214 | 215 | SKX-SKL-openmp: 216 | 217 | runs-on: intel-ubuntu-24.04 218 | 219 | steps: 220 | - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 221 | 222 | - name: Install dependencies 223 | run: | 224 | sudo apt update 225 | sudo apt -y install g++-10 libgtest-dev meson curl git 226 | 227 | - name: Install Intel SDE 228 | run: | 229 | curl -o /tmp/sde.tar.xz https://downloadmirror.intel.com/859732/sde-external-9.58.0-2025-06-16-lin.tar.xz 230 | mkdir /tmp/sde && tar -xvf /tmp/sde.tar.xz -C /tmp/sde/ 231 | sudo mv /tmp/sde/* /opt/sde && sudo ln -s /opt/sde/sde64 /usr/bin/sde 232 | 233 | - name: Build 234 | env: 235 | CXX: g++-10 236 | run: | 237 | make clean 238 | meson setup -Dbuild_tests=true -Duse_openmp=true --warnlevel 2 --werror --buildtype release builddir 239 | cd builddir 240 | ninja 241 | 242 | - name: Run test suite on SKX and SKL 243 | run: | 244 | sde -skx -- ./builddir/testexe 245 | sde -skl -- ./builddir/testexe 246 | 247 | SPR-gcc13-special-cases: 248 | 249 | runs-on: intel-ubuntu-24.04 250 | 251 | steps: 252 | - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 253 | 254 | - name: Install dependencies 255 | run: | 256 | sudo apt update 257 | sudo apt -y install g++-13 libgtest-dev meson curl git 258 | 259 | - name: Install Intel SDE 260 | run: | 261 | curl -o /tmp/sde.tar.xz 
https://downloadmirror.intel.com/859732/sde-external-9.58.0-2025-06-16-lin.tar.xz 262 | mkdir /tmp/sde && tar -xvf /tmp/sde.tar.xz -C /tmp/sde/ 263 | sudo mv /tmp/sde/* /opt/sde && sudo ln -s /opt/sde/sde64 /usr/bin/sde 264 | 265 | - name: Build 266 | env: 267 | CXX: g++-13 268 | CXXFLAGS: "-DXSS_MINIMAL_NETWORK_SORT -DXSS_TEST_KEYVALUE_BASE_CASE" 269 | run: | 270 | make clean 271 | meson setup -Dbuild_tests=true --warnlevel 2 --werror --buildtype release builddir 272 | cd builddir 273 | ninja 274 | 275 | - name: List exported symbols 276 | run: | 277 | nm --demangle --dynamic --defined-only --extern-only builddir/libx86simdsortcpp.so 278 | 279 | - name: Run test suite on SPR 280 | run: sde -spr -- ./builddir/testexe 281 | 282 | manylinux-32bit: 283 | 284 | runs-on: intel-ubuntu-24.04 285 | 286 | steps: 287 | - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 288 | 289 | - name: Build and test on 32-bit manylinux2014 290 | run: | 291 | docker run -v $(pwd):/xss quay.io/pypa/manylinux2014_i686 \ 292 | /bin/bash -xc "source /xss/.github/workflows/build-test-on-32bit.sh" 293 | 294 | SPR-icpx: 295 | 296 | runs-on: intel-ubuntu-24.04 297 | 298 | steps: 299 | - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 300 | 301 | - name: Install dependencies 302 | run: | 303 | echo "deb https://apt.repos.intel.com/oneapi all main" | sudo tee /etc/apt/sources.list.d/oneAPI.list 304 | sudo add-apt-repository -y "deb https://apt.repos.intel.com/oneapi all main" 305 | sudo apt update --allow-insecure-repositories 306 | sudo apt --allow-unauthenticated -y install intel-oneapi-compiler-dpcpp-cpp libgtest-dev curl git python3-pip meson 307 | 308 | - name: Install Intel SDE 309 | run: | 310 | #INTEL_SDE_URL=$(curl -s https://www.intel.com/content/www/us/en/download/684897/813591/intel-software-development-emulator.html | grep -Po 'https://downloadmirror.intel.com/.*lin.tar.xz(?=")') 311 | #curl -o /tmp/sde.tar.xz $INTEL_SDE_URL 312 | curl 
-o /tmp/sde.tar.xz https://downloadmirror.intel.com/859732/sde-external-9.58.0-2025-06-16-lin.tar.xz 313 | mkdir /tmp/sde && tar -xvf /tmp/sde.tar.xz -C /tmp/sde/ 314 | sudo mv /tmp/sde/* /opt/sde && sudo ln -s /opt/sde/sde64 /usr/bin/sde 315 | 316 | - name: Build examples 317 | env: 318 | CXX: icpx 319 | CXXFLAGS: -fp-model=precise 320 | run: | 321 | source /opt/intel/oneapi/setvars.sh 322 | cd examples 323 | make all 324 | 325 | - name: Build 326 | env: 327 | CXX: icpx 328 | CXXFLAGS: -fp-model=precise 329 | run: | 330 | make clean 331 | source /opt/intel/oneapi/setvars.sh 332 | icpx --version 333 | meson setup -Dbuild_tests=true --warnlevel 2 --werror --buildtype release builddir 334 | cd builddir 335 | ninja 336 | 337 | - name: Run test suite on SPR 338 | run: | 339 | source /opt/intel/oneapi/setvars.sh 340 | ./builddir/testexe 341 | -------------------------------------------------------------------------------- /tests/test-qsort.cpp: -------------------------------------------------------------------------------- 1 | /******************************************* 2 | * * Copyright (C) 2022 Intel Corporation 3 | * * SPDX-License-Identifier: BSD-3-Clause 4 | * *******************************************/ 5 | 6 | #include "test-qsort-common.h" 7 | 8 | template 9 | class simdsort : public ::testing::Test { 10 | public: 11 | simdsort() 12 | { 13 | std::iota(arrsize.begin(), arrsize.end(), 0); 14 | std::iota(arrsize_long.begin(), arrsize_long.end(), 0); 15 | #ifdef XSS_USE_OPENMP 16 | // These extended tests are only needed for the OpenMP logic 17 | arrsize_long.push_back(10'000); 18 | arrsize_long.push_back(100'000); 19 | arrsize_long.push_back(1'000'000); 20 | #endif 21 | 22 | arrtype = {"random", 23 | "constant", 24 | "sorted", 25 | "reverse", 26 | "smallrange", 27 | "max_at_the_end", 28 | "random_5d", 29 | "rand_max", 30 | "rand_with_nan", 31 | "rand_with_max_and_nan"}; 32 | } 33 | std::vector arrtype; 34 | std::vector arrsize = std::vector(1024); 35 | 
std::vector arrsize_long = std::vector(1024); 36 | }; 37 | 38 | TYPED_TEST_SUITE_P(simdsort); 39 | 40 | TYPED_TEST_P(simdsort, test_qsort_ascending) 41 | { 42 | for (auto type : this->arrtype) { 43 | bool hasnan = is_nan_test(type); 44 | for (auto size : this->arrsize_long) { 45 | std::vector basearr = get_array(type, size); 46 | 47 | // Ascending order 48 | std::vector arr = basearr; 49 | std::vector sortedarr = arr; 50 | 51 | x86simdsort::qsort(arr.data(), arr.size(), hasnan); 52 | #ifndef XSS_ASAN_CI_NOCHECK 53 | std::sort(sortedarr.begin(), 54 | sortedarr.end(), 55 | compare>()); 56 | IS_SORTED(sortedarr, arr, type); 57 | #endif 58 | arr.clear(); 59 | sortedarr.clear(); 60 | } 61 | } 62 | } 63 | 64 | TYPED_TEST_P(simdsort, test_qsort_descending) 65 | { 66 | for (auto type : this->arrtype) { 67 | bool hasnan = is_nan_test(type); 68 | for (auto size : this->arrsize_long) { 69 | std::vector basearr = get_array(type, size); 70 | 71 | // Descending order 72 | std::vector arr = basearr; 73 | std::vector sortedarr = arr; 74 | 75 | x86simdsort::qsort(arr.data(), arr.size(), hasnan, true); 76 | #ifndef XSS_ASAN_CI_NOCHECK 77 | std::sort(sortedarr.begin(), 78 | sortedarr.end(), 79 | compare>()); 80 | IS_SORTED(sortedarr, arr, type); 81 | #endif 82 | arr.clear(); 83 | sortedarr.clear(); 84 | } 85 | } 86 | } 87 | 88 | TYPED_TEST_P(simdsort, test_argsort_ascending) 89 | { 90 | for (auto type : this->arrtype) { 91 | bool hasnan = is_nan_test(type); 92 | for (auto size : this->arrsize_long) { 93 | std::vector arr = get_array(type, size); 94 | std::vector sortedarr = arr; 95 | 96 | auto arg = x86simdsort::argsort(arr.data(), arr.size(), hasnan); 97 | #ifndef XSS_ASAN_CI_NOCHECK 98 | std::sort(sortedarr.begin(), 99 | sortedarr.end(), 100 | compare>()); 101 | IS_ARG_SORTED(sortedarr, arr, arg, type); 102 | #endif 103 | arr.clear(); 104 | arg.clear(); 105 | } 106 | } 107 | } 108 | 109 | TYPED_TEST_P(simdsort, test_argsort_descending) 110 | { 111 | for (auto type : this->arrtype) 
{ 112 | bool hasnan = is_nan_test(type); 113 | for (auto size : this->arrsize_long) { 114 | std::vector arr = get_array(type, size); 115 | std::vector sortedarr = arr; 116 | 117 | auto arg = x86simdsort::argsort( 118 | arr.data(), arr.size(), hasnan, true); 119 | #ifndef XSS_ASAN_CI_NOCHECK 120 | std::sort(sortedarr.begin(), 121 | sortedarr.end(), 122 | compare>()); 123 | IS_ARG_SORTED(sortedarr, arr, arg, type); 124 | #endif 125 | arr.clear(); 126 | arg.clear(); 127 | } 128 | } 129 | } 130 | 131 | TYPED_TEST_P(simdsort, test_qselect_ascending) 132 | { 133 | for (auto type : this->arrtype) { 134 | bool hasnan = is_nan_test(type); 135 | for (auto size : this->arrsize) { 136 | size_t k = size != 0 ? rand() % size : 0; 137 | std::vector basearr = get_array(type, size); 138 | 139 | // Ascending order 140 | std::vector arr = basearr; 141 | std::vector sortedarr = arr; 142 | 143 | x86simdsort::qselect(arr.data(), k, arr.size(), hasnan); 144 | #ifndef XSS_ASAN_CI_NOCHECK 145 | std::nth_element(sortedarr.begin(), 146 | sortedarr.begin() + k, 147 | sortedarr.end(), 148 | compare>()); 149 | if (size == 0) continue; 150 | IS_ARR_PARTITIONED(arr, k, sortedarr[k], type); 151 | #endif 152 | arr.clear(); 153 | sortedarr.clear(); 154 | } 155 | } 156 | } 157 | 158 | TYPED_TEST_P(simdsort, test_qselect_descending) 159 | { 160 | for (auto type : this->arrtype) { 161 | bool hasnan = is_nan_test(type); 162 | for (auto size : this->arrsize) { 163 | size_t k = size != 0 ? 
rand() % size : 0; 164 | std::vector basearr = get_array(type, size); 165 | 166 | // Descending order 167 | std::vector arr = basearr; 168 | std::vector sortedarr = arr; 169 | 170 | x86simdsort::qselect(arr.data(), k, arr.size(), hasnan, true); 171 | #ifndef XSS_ASAN_CI_NOCHECK 172 | std::nth_element(sortedarr.begin(), 173 | sortedarr.begin() + k, 174 | sortedarr.end(), 175 | compare>()); 176 | if (size == 0) continue; 177 | IS_ARR_PARTITIONED(arr, k, sortedarr[k], type, true); 178 | #endif 179 | arr.clear(); 180 | sortedarr.clear(); 181 | } 182 | } 183 | } 184 | 185 | TYPED_TEST_P(simdsort, test_argselect) 186 | { 187 | for (auto type : this->arrtype) { 188 | bool hasnan = is_nan_test(type); 189 | for (auto size : this->arrsize) { 190 | size_t k = size != 0 ? rand() % size : 0; 191 | std::vector arr = get_array(type, size); 192 | std::vector sortedarr = arr; 193 | 194 | auto arg 195 | = x86simdsort::argselect(arr.data(), k, arr.size(), hasnan); 196 | #ifndef XSS_ASAN_CI_NOCHECK 197 | std::sort(sortedarr.begin(), 198 | sortedarr.end(), 199 | compare>()); 200 | if (size == 0) continue; 201 | IS_ARG_PARTITIONED(arr, arg, sortedarr[k], k, type); 202 | #endif 203 | arr.clear(); 204 | sortedarr.clear(); 205 | } 206 | } 207 | } 208 | 209 | TYPED_TEST_P(simdsort, test_partial_qsort_ascending) 210 | { 211 | for (auto type : this->arrtype) { 212 | bool hasnan = is_nan_test(type); 213 | for (auto size : this->arrsize) { 214 | size_t k = size != 0 ? 
rand() % size : 0; 215 | std::vector basearr = get_array(type, size); 216 | 217 | // Ascending order 218 | std::vector arr = basearr; 219 | std::vector sortedarr = arr; 220 | 221 | x86simdsort::partial_qsort(arr.data(), k, arr.size(), hasnan); 222 | #ifndef XSS_ASAN_CI_NOCHECK 223 | std::sort(sortedarr.begin(), 224 | sortedarr.end(), 225 | compare>()); 226 | if (size == 0) continue; 227 | IS_ARR_PARTIALSORTED(arr, k, sortedarr, type); 228 | #endif 229 | arr.clear(); 230 | sortedarr.clear(); 231 | } 232 | } 233 | } 234 | 235 | TYPED_TEST_P(simdsort, test_partial_qsort_descending) 236 | { 237 | for (auto type : this->arrtype) { 238 | bool hasnan = is_nan_test(type); 239 | for (auto size : this->arrsize) { 240 | size_t k = size != 0 ? rand() % size : 0; 241 | std::vector basearr = get_array(type, size); 242 | 243 | // Descending order 244 | std::vector arr = basearr; 245 | std::vector sortedarr = arr; 246 | 247 | x86simdsort::partial_qsort(arr.data(), k, arr.size(), hasnan, true); 248 | #ifndef XSS_ASAN_CI_NOCHECK 249 | std::sort(sortedarr.begin(), 250 | sortedarr.end(), 251 | compare>()); 252 | if (size == 0) continue; 253 | IS_ARR_PARTIALSORTED(arr, k, sortedarr, type); 254 | #endif 255 | arr.clear(); 256 | sortedarr.clear(); 257 | } 258 | } 259 | } 260 | 261 | TYPED_TEST_P(simdsort, test_comparator) 262 | { 263 | if constexpr (xss::fp::is_floating_point_v) { 264 | auto less = compare>(); 265 | auto leq = compare>(); 266 | auto greater = compare>(); 267 | auto geq = compare>(); 268 | auto equal = compare>(); 269 | TypeParam nan = xss::fp::quiet_NaN(); 270 | TypeParam inf = xss::fp::infinity(); 271 | ASSERT_EQ(less(nan, inf), false); 272 | ASSERT_EQ(less(nan, nan), false); 273 | ASSERT_EQ(less(inf, nan), true); 274 | ASSERT_EQ(less(inf, inf), false); 275 | ASSERT_EQ(leq(nan, inf), false); 276 | ASSERT_EQ(leq(nan, nan), true); 277 | ASSERT_EQ(leq(inf, nan), true); 278 | ASSERT_EQ(leq(inf, inf), true); 279 | ASSERT_EQ(geq(nan, inf), true); 280 | ASSERT_EQ(geq(nan, 
nan), true); 281 | ASSERT_EQ(geq(inf, nan), false); 282 | ASSERT_EQ(geq(inf, inf), true); 283 | ASSERT_EQ(greater(nan, inf), true); 284 | ASSERT_EQ(greater(nan, nan), false); 285 | ASSERT_EQ(greater(inf, nan), false); 286 | ASSERT_EQ(greater(inf, inf), false); 287 | ASSERT_EQ(equal(nan, inf), false); 288 | ASSERT_EQ(equal(nan, nan), true); 289 | ASSERT_EQ(equal(inf, nan), false); 290 | ASSERT_EQ(equal(inf, inf), true); 291 | } 292 | } 293 | 294 | REGISTER_TYPED_TEST_SUITE_P(simdsort, 295 | test_qsort_ascending, 296 | test_qsort_descending, 297 | test_argsort_ascending, 298 | test_argsort_descending, 299 | test_argselect, 300 | test_qselect_ascending, 301 | test_qselect_descending, 302 | test_partial_qsort_ascending, 303 | test_partial_qsort_descending, 304 | test_comparator); 305 | 306 | using QSortTestTypes = testing::Types= 13 || __clang_major__ >= 6 310 | _Float16, 311 | #endif 312 | float, 313 | double, 314 | uint32_t, 315 | int32_t, 316 | uint64_t, 317 | int64_t>; 318 | 319 | INSTANTIATE_TYPED_TEST_SUITE_P(xss, simdsort, QSortTestTypes); 320 | -------------------------------------------------------------------------------- /src/xss-optimal-networks.hpp: -------------------------------------------------------------------------------- 1 | // All of these sources files are generated from the optimal networks described in 2 | // https://bertdobbelaere.github.io/sorting_networks.html 3 | 4 | template 7 | X86_SIMD_SORT_FINLINE void optimal_sort_4(reg_t *vecs) 8 | { 9 | comparator::COEX(vecs[0], vecs[2]); 10 | comparator::COEX(vecs[1], vecs[3]); 11 | 12 | comparator::COEX(vecs[0], vecs[1]); 13 | comparator::COEX(vecs[2], vecs[3]); 14 | 15 | comparator::COEX(vecs[1], vecs[2]); 16 | } 17 | 18 | template 21 | X86_SIMD_SORT_FINLINE void optimal_sort_8(reg_t *vecs) 22 | { 23 | comparator::COEX(vecs[0], vecs[2]); 24 | comparator::COEX(vecs[1], vecs[3]); 25 | comparator::COEX(vecs[4], vecs[6]); 26 | comparator::COEX(vecs[5], vecs[7]); 27 | 28 | comparator::COEX(vecs[0], 
vecs[4]); 29 | comparator::COEX(vecs[1], vecs[5]); 30 | comparator::COEX(vecs[2], vecs[6]); 31 | comparator::COEX(vecs[3], vecs[7]); 32 | 33 | comparator::COEX(vecs[0], vecs[1]); 34 | comparator::COEX(vecs[2], vecs[3]); 35 | comparator::COEX(vecs[4], vecs[5]); 36 | comparator::COEX(vecs[6], vecs[7]); 37 | 38 | comparator::COEX(vecs[2], vecs[4]); 39 | comparator::COEX(vecs[3], vecs[5]); 40 | 41 | comparator::COEX(vecs[1], vecs[4]); 42 | comparator::COEX(vecs[3], vecs[6]); 43 | 44 | comparator::COEX(vecs[1], vecs[2]); 45 | comparator::COEX(vecs[3], vecs[4]); 46 | comparator::COEX(vecs[5], vecs[6]); 47 | } 48 | 49 | template 52 | X86_SIMD_SORT_FINLINE void optimal_sort_16(reg_t *vecs) 53 | { 54 | comparator::COEX(vecs[0], vecs[13]); 55 | comparator::COEX(vecs[1], vecs[12]); 56 | comparator::COEX(vecs[2], vecs[15]); 57 | comparator::COEX(vecs[3], vecs[14]); 58 | comparator::COEX(vecs[4], vecs[8]); 59 | comparator::COEX(vecs[5], vecs[6]); 60 | comparator::COEX(vecs[7], vecs[11]); 61 | comparator::COEX(vecs[9], vecs[10]); 62 | 63 | comparator::COEX(vecs[0], vecs[5]); 64 | comparator::COEX(vecs[1], vecs[7]); 65 | comparator::COEX(vecs[2], vecs[9]); 66 | comparator::COEX(vecs[3], vecs[4]); 67 | comparator::COEX(vecs[6], vecs[13]); 68 | comparator::COEX(vecs[8], vecs[14]); 69 | comparator::COEX(vecs[10], vecs[15]); 70 | comparator::COEX(vecs[11], vecs[12]); 71 | 72 | comparator::COEX(vecs[0], vecs[1]); 73 | comparator::COEX(vecs[2], vecs[3]); 74 | comparator::COEX(vecs[4], vecs[5]); 75 | comparator::COEX(vecs[6], vecs[8]); 76 | comparator::COEX(vecs[7], vecs[9]); 77 | comparator::COEX(vecs[10], vecs[11]); 78 | comparator::COEX(vecs[12], vecs[13]); 79 | comparator::COEX(vecs[14], vecs[15]); 80 | 81 | comparator::COEX(vecs[0], vecs[2]); 82 | comparator::COEX(vecs[1], vecs[3]); 83 | comparator::COEX(vecs[4], vecs[10]); 84 | comparator::COEX(vecs[5], vecs[11]); 85 | comparator::COEX(vecs[6], vecs[7]); 86 | comparator::COEX(vecs[8], vecs[9]); 87 | comparator::COEX(vecs[12], 
vecs[14]); 88 | comparator::COEX(vecs[13], vecs[15]); 89 | 90 | comparator::COEX(vecs[1], vecs[2]); 91 | comparator::COEX(vecs[3], vecs[12]); 92 | comparator::COEX(vecs[4], vecs[6]); 93 | comparator::COEX(vecs[5], vecs[7]); 94 | comparator::COEX(vecs[8], vecs[10]); 95 | comparator::COEX(vecs[9], vecs[11]); 96 | comparator::COEX(vecs[13], vecs[14]); 97 | 98 | comparator::COEX(vecs[1], vecs[4]); 99 | comparator::COEX(vecs[2], vecs[6]); 100 | comparator::COEX(vecs[5], vecs[8]); 101 | comparator::COEX(vecs[7], vecs[10]); 102 | comparator::COEX(vecs[9], vecs[13]); 103 | comparator::COEX(vecs[11], vecs[14]); 104 | 105 | comparator::COEX(vecs[2], vecs[4]); 106 | comparator::COEX(vecs[3], vecs[6]); 107 | comparator::COEX(vecs[9], vecs[12]); 108 | comparator::COEX(vecs[11], vecs[13]); 109 | 110 | comparator::COEX(vecs[3], vecs[5]); 111 | comparator::COEX(vecs[6], vecs[8]); 112 | comparator::COEX(vecs[7], vecs[9]); 113 | comparator::COEX(vecs[10], vecs[12]); 114 | 115 | comparator::COEX(vecs[3], vecs[4]); 116 | comparator::COEX(vecs[5], vecs[6]); 117 | comparator::COEX(vecs[7], vecs[8]); 118 | comparator::COEX(vecs[9], vecs[10]); 119 | comparator::COEX(vecs[11], vecs[12]); 120 | 121 | comparator::COEX(vecs[6], vecs[7]); 122 | comparator::COEX(vecs[8], vecs[9]); 123 | } 124 | 125 | template 128 | X86_SIMD_SORT_FINLINE void optimal_sort_32(reg_t *vecs) 129 | { 130 | comparator::COEX(vecs[0], vecs[1]); 131 | comparator::COEX(vecs[2], vecs[3]); 132 | comparator::COEX(vecs[4], vecs[5]); 133 | comparator::COEX(vecs[6], vecs[7]); 134 | comparator::COEX(vecs[8], vecs[9]); 135 | comparator::COEX(vecs[10], vecs[11]); 136 | comparator::COEX(vecs[12], vecs[13]); 137 | comparator::COEX(vecs[14], vecs[15]); 138 | comparator::COEX(vecs[16], vecs[17]); 139 | comparator::COEX(vecs[18], vecs[19]); 140 | comparator::COEX(vecs[20], vecs[21]); 141 | comparator::COEX(vecs[22], vecs[23]); 142 | comparator::COEX(vecs[24], vecs[25]); 143 | comparator::COEX(vecs[26], vecs[27]); 144 | 
comparator::COEX(vecs[28], vecs[29]); 145 | comparator::COEX(vecs[30], vecs[31]); 146 | 147 | comparator::COEX(vecs[0], vecs[2]); 148 | comparator::COEX(vecs[1], vecs[3]); 149 | comparator::COEX(vecs[4], vecs[6]); 150 | comparator::COEX(vecs[5], vecs[7]); 151 | comparator::COEX(vecs[8], vecs[10]); 152 | comparator::COEX(vecs[9], vecs[11]); 153 | comparator::COEX(vecs[12], vecs[14]); 154 | comparator::COEX(vecs[13], vecs[15]); 155 | comparator::COEX(vecs[16], vecs[18]); 156 | comparator::COEX(vecs[17], vecs[19]); 157 | comparator::COEX(vecs[20], vecs[22]); 158 | comparator::COEX(vecs[21], vecs[23]); 159 | comparator::COEX(vecs[24], vecs[26]); 160 | comparator::COEX(vecs[25], vecs[27]); 161 | comparator::COEX(vecs[28], vecs[30]); 162 | comparator::COEX(vecs[29], vecs[31]); 163 | 164 | comparator::COEX(vecs[0], vecs[4]); 165 | comparator::COEX(vecs[1], vecs[5]); 166 | comparator::COEX(vecs[2], vecs[6]); 167 | comparator::COEX(vecs[3], vecs[7]); 168 | comparator::COEX(vecs[8], vecs[12]); 169 | comparator::COEX(vecs[9], vecs[13]); 170 | comparator::COEX(vecs[10], vecs[14]); 171 | comparator::COEX(vecs[11], vecs[15]); 172 | comparator::COEX(vecs[16], vecs[20]); 173 | comparator::COEX(vecs[17], vecs[21]); 174 | comparator::COEX(vecs[18], vecs[22]); 175 | comparator::COEX(vecs[19], vecs[23]); 176 | comparator::COEX(vecs[24], vecs[28]); 177 | comparator::COEX(vecs[25], vecs[29]); 178 | comparator::COEX(vecs[26], vecs[30]); 179 | comparator::COEX(vecs[27], vecs[31]); 180 | 181 | comparator::COEX(vecs[0], vecs[8]); 182 | comparator::COEX(vecs[1], vecs[9]); 183 | comparator::COEX(vecs[2], vecs[10]); 184 | comparator::COEX(vecs[3], vecs[11]); 185 | comparator::COEX(vecs[4], vecs[12]); 186 | comparator::COEX(vecs[5], vecs[13]); 187 | comparator::COEX(vecs[6], vecs[14]); 188 | comparator::COEX(vecs[7], vecs[15]); 189 | comparator::COEX(vecs[16], vecs[24]); 190 | comparator::COEX(vecs[17], vecs[25]); 191 | comparator::COEX(vecs[18], vecs[26]); 192 | comparator::COEX(vecs[19], 
vecs[27]); 193 | comparator::COEX(vecs[20], vecs[28]); 194 | comparator::COEX(vecs[21], vecs[29]); 195 | comparator::COEX(vecs[22], vecs[30]); 196 | comparator::COEX(vecs[23], vecs[31]); 197 | 198 | comparator::COEX(vecs[0], vecs[16]); 199 | comparator::COEX(vecs[1], vecs[8]); 200 | comparator::COEX(vecs[2], vecs[4]); 201 | comparator::COEX(vecs[3], vecs[12]); 202 | comparator::COEX(vecs[5], vecs[10]); 203 | comparator::COEX(vecs[6], vecs[9]); 204 | comparator::COEX(vecs[7], vecs[14]); 205 | comparator::COEX(vecs[11], vecs[13]); 206 | comparator::COEX(vecs[15], vecs[31]); 207 | comparator::COEX(vecs[17], vecs[24]); 208 | comparator::COEX(vecs[18], vecs[20]); 209 | comparator::COEX(vecs[19], vecs[28]); 210 | comparator::COEX(vecs[21], vecs[26]); 211 | comparator::COEX(vecs[22], vecs[25]); 212 | comparator::COEX(vecs[23], vecs[30]); 213 | comparator::COEX(vecs[27], vecs[29]); 214 | 215 | comparator::COEX(vecs[1], vecs[2]); 216 | comparator::COEX(vecs[3], vecs[5]); 217 | comparator::COEX(vecs[4], vecs[8]); 218 | comparator::COEX(vecs[6], vecs[22]); 219 | comparator::COEX(vecs[7], vecs[11]); 220 | comparator::COEX(vecs[9], vecs[25]); 221 | comparator::COEX(vecs[10], vecs[12]); 222 | comparator::COEX(vecs[13], vecs[14]); 223 | comparator::COEX(vecs[17], vecs[18]); 224 | comparator::COEX(vecs[19], vecs[21]); 225 | comparator::COEX(vecs[20], vecs[24]); 226 | comparator::COEX(vecs[23], vecs[27]); 227 | comparator::COEX(vecs[26], vecs[28]); 228 | comparator::COEX(vecs[29], vecs[30]); 229 | 230 | comparator::COEX(vecs[1], vecs[17]); 231 | comparator::COEX(vecs[2], vecs[18]); 232 | comparator::COEX(vecs[3], vecs[19]); 233 | comparator::COEX(vecs[4], vecs[20]); 234 | comparator::COEX(vecs[5], vecs[10]); 235 | comparator::COEX(vecs[7], vecs[23]); 236 | comparator::COEX(vecs[8], vecs[24]); 237 | comparator::COEX(vecs[11], vecs[27]); 238 | comparator::COEX(vecs[12], vecs[28]); 239 | comparator::COEX(vecs[13], vecs[29]); 240 | comparator::COEX(vecs[14], vecs[30]); 241 | 
comparator::COEX(vecs[21], vecs[26]); 242 | 243 | comparator::COEX(vecs[3], vecs[17]); 244 | comparator::COEX(vecs[4], vecs[16]); 245 | comparator::COEX(vecs[5], vecs[21]); 246 | comparator::COEX(vecs[6], vecs[18]); 247 | comparator::COEX(vecs[7], vecs[9]); 248 | comparator::COEX(vecs[8], vecs[20]); 249 | comparator::COEX(vecs[10], vecs[26]); 250 | comparator::COEX(vecs[11], vecs[23]); 251 | comparator::COEX(vecs[13], vecs[25]); 252 | comparator::COEX(vecs[14], vecs[28]); 253 | comparator::COEX(vecs[15], vecs[27]); 254 | comparator::COEX(vecs[22], vecs[24]); 255 | 256 | comparator::COEX(vecs[1], vecs[4]); 257 | comparator::COEX(vecs[3], vecs[8]); 258 | comparator::COEX(vecs[5], vecs[16]); 259 | comparator::COEX(vecs[7], vecs[17]); 260 | comparator::COEX(vecs[9], vecs[21]); 261 | comparator::COEX(vecs[10], vecs[22]); 262 | comparator::COEX(vecs[11], vecs[19]); 263 | comparator::COEX(vecs[12], vecs[20]); 264 | comparator::COEX(vecs[14], vecs[24]); 265 | comparator::COEX(vecs[15], vecs[26]); 266 | comparator::COEX(vecs[23], vecs[28]); 267 | comparator::COEX(vecs[27], vecs[30]); 268 | 269 | comparator::COEX(vecs[2], vecs[5]); 270 | comparator::COEX(vecs[7], vecs[8]); 271 | comparator::COEX(vecs[9], vecs[18]); 272 | comparator::COEX(vecs[11], vecs[17]); 273 | comparator::COEX(vecs[12], vecs[16]); 274 | comparator::COEX(vecs[13], vecs[22]); 275 | comparator::COEX(vecs[14], vecs[20]); 276 | comparator::COEX(vecs[15], vecs[19]); 277 | comparator::COEX(vecs[23], vecs[24]); 278 | comparator::COEX(vecs[26], vecs[29]); 279 | 280 | comparator::COEX(vecs[2], vecs[4]); 281 | comparator::COEX(vecs[6], vecs[12]); 282 | comparator::COEX(vecs[9], vecs[16]); 283 | comparator::COEX(vecs[10], vecs[11]); 284 | comparator::COEX(vecs[13], vecs[17]); 285 | comparator::COEX(vecs[14], vecs[18]); 286 | comparator::COEX(vecs[15], vecs[22]); 287 | comparator::COEX(vecs[19], vecs[25]); 288 | comparator::COEX(vecs[20], vecs[21]); 289 | comparator::COEX(vecs[27], vecs[29]); 290 | 291 | 
comparator::COEX(vecs[5], vecs[6]); 292 | comparator::COEX(vecs[8], vecs[12]); 293 | comparator::COEX(vecs[9], vecs[10]); 294 | comparator::COEX(vecs[11], vecs[13]); 295 | comparator::COEX(vecs[14], vecs[16]); 296 | comparator::COEX(vecs[15], vecs[17]); 297 | comparator::COEX(vecs[18], vecs[20]); 298 | comparator::COEX(vecs[19], vecs[23]); 299 | comparator::COEX(vecs[21], vecs[22]); 300 | comparator::COEX(vecs[25], vecs[26]); 301 | 302 | comparator::COEX(vecs[3], vecs[5]); 303 | comparator::COEX(vecs[6], vecs[7]); 304 | comparator::COEX(vecs[8], vecs[9]); 305 | comparator::COEX(vecs[10], vecs[12]); 306 | comparator::COEX(vecs[11], vecs[14]); 307 | comparator::COEX(vecs[13], vecs[16]); 308 | comparator::COEX(vecs[15], vecs[18]); 309 | comparator::COEX(vecs[17], vecs[20]); 310 | comparator::COEX(vecs[19], vecs[21]); 311 | comparator::COEX(vecs[22], vecs[23]); 312 | comparator::COEX(vecs[24], vecs[25]); 313 | comparator::COEX(vecs[26], vecs[28]); 314 | 315 | comparator::COEX(vecs[3], vecs[4]); 316 | comparator::COEX(vecs[5], vecs[6]); 317 | comparator::COEX(vecs[7], vecs[8]); 318 | comparator::COEX(vecs[9], vecs[10]); 319 | comparator::COEX(vecs[11], vecs[12]); 320 | comparator::COEX(vecs[13], vecs[14]); 321 | comparator::COEX(vecs[15], vecs[16]); 322 | comparator::COEX(vecs[17], vecs[18]); 323 | comparator::COEX(vecs[19], vecs[20]); 324 | comparator::COEX(vecs[21], vecs[22]); 325 | comparator::COEX(vecs[23], vecs[24]); 326 | comparator::COEX(vecs[25], vecs[26]); 327 | comparator::COEX(vecs[27], vecs[28]); 328 | } 329 | --------------------------------------------------------------------------------