├── misc
    └── object_qsort-perf.jpg
├── benchmarks
    ├── bench-all.cpp
    ├── meson.build
    ├── bench-vqsort.cpp
    ├── bench-keyvalue.hpp
    ├── bench-qselect.hpp
    ├── bench-partial-qsort.hpp
    ├── bench.h
    ├── bench-qsort.hpp
    ├── bench-ipp.cpp
    ├── bench-argsort.hpp
    └── bench-objsort.hpp
├── examples
    ├── icl-16bit.cpp
    ├── spr-16bit.cpp
    ├── skx-avx2.cpp
    ├── Makefile
    └── avx512-kv.cpp
├── SECURITY.md
├── src
    ├── avx512-64bit-qsort.hpp
    ├── avx512-64bit-argsort.hpp
    ├── xss-custom-float.h
    ├── xss-common-comparators.hpp
    ├── xss-common-includes.h
    ├── avx512-16bit-common.h
    ├── avx512fp16-16bit-qsort.hpp
    ├── README.md
    ├── x86simdsort-static-incl.h
    ├── xss-network-qsort.hpp
    ├── xss-pivot-selection.hpp
    └── xss-optimal-networks.hpp
├── .gitignore
├── .github
    └── workflows
    │   ├── linting.yml
    │   ├── build-test-on-32bit.sh
    │   ├── scorecard.yml
    │   ├── build-numpy.yml
    │   └── c-cpp.yml
├── scripts
    ├── bench-compare.sh
    └── branch-compare.sh
├── tests
    ├── meson.build
    ├── test-objqsort.cpp
    ├── test-qsort-common.h
    └── test-qsort.cpp
├── lib
    ├── x86simdsort-spr.cpp
    ├── meson.build
    ├── x86simdsort-icl.cpp
    ├── x86simdsort-internal.h
    ├── x86simdsort-skx.cpp
    ├── x86simdsort-avx2.cpp
    ├── list-of-exported-symbols.txt
    ├── x86simdsort.h
    └── x86simdsort-scalar.h
├── meson_options.txt
├── Makefile
├── utils
    ├── custom-compare.h
    └── rand_array.h
├── LICENSE.md
├── CONTRIBUTING.md
├── example.c
├── run-bench.py
├── .clang-format
├── meson.build
├── CODE_OF_CONDUCT.md
└── README.md


/misc/object_qsort-perf.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/intel/x86-simd-sort/HEAD/misc/object_qsort-perf.jpg


--------------------------------------------------------------------------------
/benchmarks/bench-all.cpp:
--------------------------------------------------------------------------------
1 | #include "bench.h"
2 | #include "bench-argsort.hpp"
3 | #include "bench-partial-qsort.hpp"
4 | #include "bench-qselect.hpp"
5 | #include "bench-qsort.hpp"
6 | #include "bench-keyvalue.hpp"
7 | #include "bench-objsort.hpp"
8 | 


--------------------------------------------------------------------------------
/examples/icl-16bit.cpp:
--------------------------------------------------------------------------------
 1 | #include "x86simdsort-static-incl.h"
 2 | 
 3 | int main() // calls qsort, qselect and partial_qsort once each on a short array
 4 | {
 5 |     const int size = 1000;
 6 |     short arr[size] = {}; // zero-init: sorting indeterminate values is UB
 7 |     x86simdsortStatic::qsort(arr, size);
 8 |     x86simdsortStatic::qselect(arr, 10, size);
 9 |     x86simdsortStatic::partial_qsort(arr, 10, size);
10 |     return 0;
11 | }
12 | 


--------------------------------------------------------------------------------
/examples/spr-16bit.cpp:
--------------------------------------------------------------------------------
 1 | #include "x86simdsort-static-incl.h"
 2 | 
 3 | int main() // calls qsort, qselect and partial_qsort once each on a _Float16 array
 4 | {
 5 |     const int size = 1000;
 6 |     _Float16 arr[size] = {}; // zero-init: sorting indeterminate values is UB
 7 |     x86simdsortStatic::qsort(arr, size);
 8 |     x86simdsortStatic::qselect(arr, 10, size);
 9 |     x86simdsortStatic::partial_qsort(arr, 10, size);
10 |     return 0;
11 | }
12 | 


--------------------------------------------------------------------------------
/SECURITY.md:
--------------------------------------------------------------------------------
 1 | # Security Policy
 2 | 
 3 | ## Supported Versions
 4 | 
 5 | | Version | Supported          |
 6 | | ------- | ------------------ |
 7 | | 5.0     | :white_check_mark: |
 8 | | 4.0     | :white_check_mark: |
 9 | | < 4.0   | :x:                |
10 | 
11 | ## Reporting a Vulnerability
12 | 
13 | Report any vulnerability to raghuveer.devulapalli@intel.com
14 | 


--------------------------------------------------------------------------------
/src/avx512-64bit-qsort.hpp:
--------------------------------------------------------------------------------
 1 | /*******************************************************************
 2 |  * Copyright (C) 2022 Intel Corporation
 3 |  * SPDX-License-Identifier: BSD-3-Clause
 4 |  * Authors: Raghuveer Devulapalli <raghuveer.devulapalli@intel.com>
 5 |  * ****************************************************************/
 6 | 
 7 | #ifndef AVX512_QSORT_64BIT
 8 | #define AVX512_QSORT_64BIT
 9 | 
10 | #include "avx512-64bit-common.h"
11 | 
12 | #endif // AVX512_QSORT_64BIT
13 | 


--------------------------------------------------------------------------------
/src/avx512-64bit-argsort.hpp:
--------------------------------------------------------------------------------
 1 | /*******************************************************************
 2 |  * Copyright (C) 2022 Intel Corporation
 3 |  * SPDX-License-Identifier: BSD-3-Clause
 4 |  * Authors: Raghuveer Devulapalli <raghuveer.devulapalli@intel.com>
 5 |  * ****************************************************************/
 6 | 
 7 | #ifndef AVX512_ARGSORT_64BIT
 8 | #define AVX512_ARGSORT_64BIT
 9 | 
10 | #include "avx512-64bit-common.h"
11 | #include "xss-common-argsort.h"
12 | 
13 | #endif // AVX512_ARGSORT_64BIT
14 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | # bench-compare
 2 | .bench-compare
 3 | .bench
 4 | # Prerequisites
 5 | *.d
 6 | 
 7 | # Compiled Object files
 8 | *.slo
 9 | *.lo
10 | *.o
11 | *.obj
12 | 
13 | # Precompiled Headers
14 | *.gch
15 | *.pch
16 | 
17 | # Compiled Dynamic libraries
18 | *.so
19 | *.dylib
20 | *.dll
21 | 
22 | # Fortran module files
23 | *.mod
24 | *.smod
25 | 
26 | # Compiled Static libraries
27 | *.lai
28 | *.la
29 | *.a
30 | *.lib
31 | 
32 | # Executables
33 | *.exe
34 | *.out
35 | *.app
36 | 
37 | # Build or IDE artifacts
38 | **/.vscode
39 | /builddir/
40 | /testexe
41 | /benchexe
42 | 


--------------------------------------------------------------------------------
/.github/workflows/linting.yml:
--------------------------------------------------------------------------------
 1 | name: Lint
 2 | 
 3 | on:
 4 |   push:
 5 |     branches: [ "main" ]
 6 |   pull_request:
 7 |     branches: [ "main" ]
 8 | 
 9 | permissions: read-all
10 | 
11 | jobs:
12 |   clang-format:
13 | 
14 |     runs-on: intel-ubuntu-24.04
15 | 
16 |     steps:
17 |     - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
18 | 
19 |     - name: Install dependencies
20 |       run: |
21 |         sudo apt update
22 |         sudo apt -y install clang-format
23 | 
24 |     - name: Lint
25 |       run: |
26 |         find . -type f | grep -P ".*\.(c|cpp|h|hpp)\b" | xargs clang-format -style=file --dry-run -Werror
27 | 


--------------------------------------------------------------------------------
/scripts/bench-compare.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | set -e
 3 | branch=$(git rev-parse --abbrev-ref HEAD)
 4 | SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
 5 | cd "$SCRIPT_DIR/.."
 6 | # Usage: bench-compare.sh <filter-1> <filter-2> <repetitions>
 7 | ## Get google-benchmark (cloned once into .bench/ and reused afterwards)
 8 | mkdir -p .bench
 9 | if [ ! -d .bench/google-benchmark ]; then
10 |     git clone https://github.com/google/benchmark .bench/google-benchmark
11 | fi
12 | compare=$(realpath .bench/google-benchmark/tools/compare.py)
13 | 
14 | meson setup -Dbuild_benchmarks=true -Dbuild_vqsortbench=true --warnlevel 0 --buildtype release "builddir-${branch}"
15 | cd "builddir-${branch}"
16 | ninja
17 | "$compare" filters ./benchexe "$1" "$2" --benchmark_repetitions="$3" # quoted so filter patterns are not glob-expanded or word-split
18 | 


--------------------------------------------------------------------------------
/examples/skx-avx2.cpp:
--------------------------------------------------------------------------------
 1 | #include "x86simdsort-static-incl.h"
 2 | 
 3 | int main() // calls every sort/select/argsort entry point on float and double arrays
 4 | {
 5 |     const int size = 1000;
 6 |     double arrd[size] = {}; // zero-init: sorting indeterminate values is UB
 7 |     float arrf[size] = {}; // zero-init, same reason
 8 |     x86simdsortStatic::qsort(arrf, size);
 9 |     x86simdsortStatic::qsort(arrd, size);
10 |     x86simdsortStatic::qselect(arrf, 10, size);
11 |     x86simdsortStatic::qselect(arrd, 10, size);
12 |     x86simdsortStatic::partial_qsort(arrf, 10, size);
13 |     x86simdsortStatic::partial_qsort(arrd, 10, size);
14 |     auto arg1 = x86simdsortStatic::argsort(arrf, size);
15 |     auto arg2 = x86simdsortStatic::argselect(arrf, 10, size);
16 |     auto arg3 = x86simdsortStatic::argsort(arrd, size);
17 |     auto arg4 = x86simdsortStatic::argselect(arrd, 10, size);
18 |     return 0;
19 | }
20 | 


--------------------------------------------------------------------------------
/examples/Makefile:
--------------------------------------------------------------------------------
 1 | CXX ?= g++-13
 2 | CFLAGS = -I../src -std=c++17 -O3
 3 | EXE = kvsort qsortavx2 qsortavx512 qsortspr qsorticl
 4 | 
 5 | # These targets are commands, not files.
 6 | .PHONY: default all clean
 7 | default: all
 8 | all : $(EXE)
 9 | 
10 | kvsort: avx512-kv.cpp
11 | 	     $(CXX) -o kvsort -mavx512vl -mavx512dq $(CFLAGS) avx512-kv.cpp
12 | 
13 | qsortavx512: skx-avx2.cpp
14 | 	     $(CXX) -o qsortavx512 -mavx512vl -mavx512dq $(CFLAGS) skx-avx2.cpp
15 | 
16 | qsortavx2: skx-avx2.cpp
17 | 	     $(CXX) -o qsortavx2 -mavx2 $(CFLAGS) skx-avx2.cpp
18 | 
19 | qsorticl: icl-16bit.cpp
20 | 	     $(CXX) -o qsorticl -mavx512vl -mavx512bw -mavx512dq -mavx512vbmi2 $(CFLAGS) icl-16bit.cpp
21 | 
22 | qsortspr: spr-16bit.cpp
23 | 	     $(CXX) -o qsortspr -mavx512vl -mavx512dq -mavx512vbmi2 -mavx512fp16  $(CFLAGS) spr-16bit.cpp
24 | 
25 | clean:
26 | 	$(RM) $(EXE)
27 | 


--------------------------------------------------------------------------------
/benchmarks/meson.build:
--------------------------------------------------------------------------------
 1 | libbench = [] # accumulates the benchmark static libraries declared below
 2 | 
 3 | libbench += static_library('bench_qsort',
 4 |   files(
 5 |     'bench-all.cpp',
 6 |     ),
 7 |   dependencies: gbench_dep,
 8 |   include_directories : [src, lib, utils],
 9 |   cpp_args : ['-O3'],
10 |   )
11 | 
12 | if benchvq and fs.is_file('../highway/hwy/contrib/sort/vqsort-inl.h') # vqsort bench only when requested and the highway header exists
13 |   hwy = include_directories('../highway')
14 |   libbench += static_library('bench_vqsort',
15 |     files(
16 |       'bench-vqsort.cpp',
17 |       ),
18 |     dependencies: gbench_dep,
19 |     include_directories : [src, lib, utils, hwy],
20 |     cpp_args : ['-O3', '-march=native'],
21 |     )
22 | endif
23 | 
24 | if benchipp # IPP bench only when requested
25 |   libbench += static_library('bench_ippsort',
26 |     files(
27 |       'bench-ipp.cpp',
28 |       ),
29 |     dependencies: gbench_dep,
30 |     include_directories : [src, lib, utils],
31 |     cpp_args : ['-O3', '-march=native'],
32 |     )
33 | endif
34 | 


--------------------------------------------------------------------------------
/.github/workflows/build-test-on-32bit.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | set -e # stop at the first failing step; otherwise only the last command's status is reported
 3 | ## Set up environment
 4 | /opt/python/cp39-cp39/bin/python -mvenv venv
 5 | source venv/bin/activate
 6 | python3 -m pip install meson ninja
 7 | export CXX=g++
 8 | 
 9 | ## Install google test from source
10 | git clone https://github.com/google/googletest.git -b v1.14.0
11 | cd googletest
12 | mkdir build
13 | cd build
14 | cmake .. -DBUILD_GMOCK=OFF
15 | make install
16 | 
17 | ## Install Intel SDE
18 | curl -o /tmp/sde.tar.xz https://downloadmirror.intel.com/859732/sde-external-9.58.0-2025-06-16-lin.tar.xz
19 | mkdir /tmp/sde && tar -xvf /tmp/sde.tar.xz -C /tmp/sde/
20 | mv /tmp/sde/* /opt/sde && ln -s /opt/sde/sde /usr/bin/sde
21 | 
22 | ## Build x86-simd-sort
23 | cd /xss
24 | meson setup -Dbuild_tests=true --warnlevel 2 --werror --buildtype release builddir
25 | cd builddir
26 | ninja
27 | 
28 | ## Run tests (both runs must pass; a failure in the first now aborts the script)
29 | sde -tgl -- ./testexe
30 | sde -skl -- ./testexe
31 | 


--------------------------------------------------------------------------------
/examples/avx512-kv.cpp:
--------------------------------------------------------------------------------
 1 | #include "x86simdsort-static-incl.h"
 2 | 
 3 | int main()
 4 | {
 5 |     const int size = 1000;
 6 |     int64_t arr1[size] = {}; // zero-init: sorting indeterminate values is UB
 7 |     uint64_t arr2[size] = {};
 8 |     double arr3[size] = {};
 9 |     float arr4[size] = {};
10 |     // Call keyvalue_qsort once for each key/value type pairing listed below.
11 |     x86simdsortStatic::keyvalue_qsort(arr1, arr1, size);
12 |     x86simdsortStatic::keyvalue_qsort(arr1, arr2, size);
13 |     x86simdsortStatic::keyvalue_qsort(arr1, arr3, size);
14 |     x86simdsortStatic::keyvalue_qsort(arr2, arr1, size);
15 |     x86simdsortStatic::keyvalue_qsort(arr2, arr2, size);
16 |     x86simdsortStatic::keyvalue_qsort(arr2, arr3, size);
17 |     x86simdsortStatic::keyvalue_qsort(arr3, arr1, size);
18 |     x86simdsortStatic::keyvalue_qsort(arr3, arr2, size);
19 |     x86simdsortStatic::keyvalue_qsort(arr1, arr4, size);
20 |     x86simdsortStatic::keyvalue_qsort(arr2, arr4, size);
21 |     x86simdsortStatic::keyvalue_qsort(arr3, arr4, size);
22 |     return 0;
23 | }
24 | 


--------------------------------------------------------------------------------
/tests/meson.build:
--------------------------------------------------------------------------------
 1 | libtests = [] # accumulates the test static libraries declared below
 2 | 
 3 | # Add compile flags when needed for the ASAN CI run
 4 | testargs = []
 5 | if get_option('asan_ci_dont_validate')
 6 |   if get_option('fatal_sanitizers')
 7 |     testargs = ['-DXSS_ASAN_CI_NOCHECK=true']
 8 |   else
 9 |     error('asan_ci_dont_validate is only for the ASAN CI, should be false otherwise!') # rejected unless fatal_sanitizers is also set
10 |   endif
11 | endif
12 | 
13 | libtests += static_library('tests_qsort',
14 |   files('test-qsort.cpp', ),
15 |   dependencies: [omp_dep, gtest_dep],
16 |   include_directories : [src, lib, utils],
17 |   cpp_args : [testargs],
18 |   )
19 | 
20 | libtests += static_library('tests_kvsort',
21 |   files('test-keyvalue.cpp', ),
22 |   dependencies: [omp_dep, gtest_dep],
23 |   include_directories : [src, lib, utils],
24 |   cpp_args : [testargs],
25 |   )
26 | 
27 | libtests += static_library('tests_objsort',
28 |   files('test-objqsort.cpp', ),
29 |   dependencies: [omp_dep, gtest_dep],
30 |   include_directories : [src, lib, utils],
31 |   cpp_args : [testargs],
32 |   )
33 | 


--------------------------------------------------------------------------------
/lib/x86simdsort-spr.cpp:
--------------------------------------------------------------------------------
 1 | // SPR specific routines:
 2 | #include "x86simdsort-static-incl.h"
 3 | #include "x86simdsort-internal.h"
 4 | 
 5 | namespace xss {
 6 | namespace fp16_spr {
 7 |     template <> // explicit specialization for _Float16
 8 |     void qsort(_Float16 *arr, size_t size, bool hasnan, bool descending)
 9 |     {
10 |         x86simdsortStatic::qsort(arr, size, hasnan, descending); // arguments forwarded unchanged
11 |     }
12 |     template <> // explicit specialization for _Float16
13 |     void qselect(_Float16 *arr,
14 |                  size_t k,
15 |                  size_t arrsize,
16 |                  bool hasnan,
17 |                  bool descending)
18 |     {
19 |         x86simdsortStatic::qselect(arr, k, arrsize, hasnan, descending); // arguments forwarded unchanged
20 |     }
21 |     template <> // explicit specialization for _Float16
22 |     void partial_qsort(_Float16 *arr,
23 |                        size_t k,
24 |                        size_t arrsize,
25 |                        bool hasnan,
26 |                        bool descending)
27 |     {
28 |         x86simdsortStatic::partial_qsort(arr, k, arrsize, hasnan, descending); // arguments forwarded unchanged
29 |     }
30 | } // namespace fp16_spr
31 | } // namespace xss
32 | 


--------------------------------------------------------------------------------
/benchmarks/bench-vqsort.cpp:
--------------------------------------------------------------------------------
 1 | #include "bench.h"
 2 | #define VQSORT_ONLY_STATIC 1
 3 | #include "hwy/contrib/sort/vqsort-inl.h"
 4 | 
 5 | template <typename T, class... Args>
 6 | static void vqsort(benchmark::State &state, Args &&...args)
 7 | {
 8 |     // Unpack the benchmark arguments: (array size, array type string)
 9 |     auto args_tuple = std::make_tuple(std::forward<Args>(args)...);
10 |     size_t arrsize = std::get<0>(args_tuple);
11 |     std::string arrtype = std::get<1>(args_tuple);
12 |     // Build the input once and keep a pristine copy to restore from
13 |     std::vector<T> arr = get_array<T>(arrtype, arrsize);
14 |     std::vector<T> arr_bkp = arr;
15 |     // Time only the sort; restoring the input happens while timing is paused
16 |     for (auto _ : state) {
17 |         hwy::HWY_NAMESPACE::VQSortStatic(
18 |                 arr.data(), arrsize, hwy::SortAscending());
19 |         state.PauseTiming();
20 |         arr = arr_bkp;
21 |         state.ResumeTiming();
22 |     }
23 | }
24 | 
25 | BENCH_SORT(vqsort, uint64_t)
26 | BENCH_SORT(vqsort, int64_t)
27 | BENCH_SORT(vqsort, uint32_t)
28 | BENCH_SORT(vqsort, int32_t)
29 | BENCH_SORT(vqsort, uint16_t)
30 | BENCH_SORT(vqsort, int16_t)
31 | BENCH_SORT(vqsort, float)
32 | BENCH_SORT(vqsort, double)
33 | 


--------------------------------------------------------------------------------
/meson_options.txt:
--------------------------------------------------------------------------------
 1 | option('build_tests', type : 'boolean', value : false,
 2 |   description : 'Build test suite (default: "false").')
 3 | option('build_benchmarks', type : 'boolean', value : false,
 4 |   description : 'Build benchmarking suite (default: "false").')
 5 | option('build_ippbench', type : 'boolean', value : false,
 6 |   description : 'Add IPP sort to benchmarks (default: "false").')
 7 | option('build_vqsortbench', type : 'boolean', value : true,
 8 |   description : 'Add google vqsort to benchmarks (default: "true").')
 9 | option('use_openmp', type : 'boolean', value : false,
10 |   description : 'Use OpenMP to accelerate key-value sort (default: "false").')
11 | option('lib_type', type : 'string', value : 'shared',
12 |   description : 'Library type: shared or static (default: "shared").')
13 | option('fatal_sanitizers', type : 'boolean', value : false,
14 |   description : 'If sanitizers are enabled, should all issues be considered fatal? (default: "false").')
15 | option('asan_ci_dont_validate', type : 'boolean', value : false,
16 |   description : 'Only for speeding up ASAN CI, do not turn on otherwise')
17 | 


--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
 1 | # None of these targets produce a file of the same name; `debug` in particular
 2 | # shares its name with the build directory it creates.
 3 | .PHONY: test test_openmp test_asan bench debug sharedlib staticlib install clean
 4 | 
 5 | test:
 6 | 	meson setup -Dbuild_tests=true -Duse_openmp=false --warnlevel 2 --werror --buildtype release builddir
 7 | 	cd builddir && ninja
 8 | 
 9 | test_openmp:
10 | 	meson setup -Dbuild_tests=true -Duse_openmp=true --warnlevel 2 --werror --buildtype release builddir
11 | 	cd builddir && ninja
12 | 
13 | test_asan:
14 | 	meson setup -Dbuild_tests=true -Duse_openmp=true -Db_sanitize=address,undefined -Dfatal_sanitizers=true -Db_lundef=false -Dasan_ci_dont_validate=true --warnlevel 0 --buildtype debugoptimized builddir
15 | 	cd builddir && ninja
16 | 
17 | bench:
18 | 	meson setup -Dbuild_benchmarks=true --warnlevel 2 --werror --buildtype release builddir
19 | 	cd builddir && ninja
20 | 
21 | debug:
22 | 	meson setup -Dbuild_tests=true --warnlevel 2 --werror --buildtype debug debug
23 | 	cd debug && ninja
24 | 
25 | sharedlib:
26 | 	meson setup --warnlevel 2 --werror --buildtype release builddir
27 | 	cd builddir && ninja
28 | 
29 | staticlib:
30 | 	meson setup -Dlib_type=static --warnlevel 2 --werror --buildtype release builddir
31 | 	cd builddir && ninja
32 | 
33 | install:
34 | 	meson setup --warnlevel 2 --werror --buildtype release builddir
35 | 	cd builddir && meson install
36 | 
37 | clean:
38 | 	$(RM) -rf $(TESTOBJS) $(BENCHOBJS) $(UTILOBJS) testexe benchexe builddir debug
39 | 


--------------------------------------------------------------------------------
/lib/meson.build:
--------------------------------------------------------------------------------
 1 | libtargets = [] # accumulates one static library per -march target declared below
 2 | 
 3 | if cpp.has_argument('-march=haswell') # target skipped when the compiler rejects the flag
 4 |   libtargets += static_library('libavx',
 5 |     files(
 6 |       'x86simdsort-avx2.cpp',
 7 |       ),
 8 |     include_directories : [src],
 9 |     cpp_args : ['-march=haswell'],
10 |     gnu_symbol_visibility : 'inlineshidden',
11 |     dependencies: [omp_dep],
12 |     )
13 | endif
14 | 
15 | if cpp.has_argument('-march=skylake-avx512') # target skipped when the compiler rejects the flag
16 |   libtargets += static_library('libskx',
17 |     files(
18 |       'x86simdsort-skx.cpp',
19 |       ),
20 |     include_directories : [src],
21 |     cpp_args : ['-march=skylake-avx512'],
22 |     gnu_symbol_visibility : 'inlineshidden',
23 |     dependencies: [omp_dep],
24 |     )
25 | endif
26 | 
27 | if cpp.has_argument('-march=icelake-client') # target skipped when the compiler rejects the flag
28 |   libtargets += static_library('libicl',
29 |     files(
30 |       'x86simdsort-icl.cpp',
31 |       ),
32 |     include_directories : [src],
33 |     cpp_args : ['-march=icelake-client'],
34 |     gnu_symbol_visibility : 'inlineshidden',
35 |     dependencies: [omp_dep],
36 |     )
37 | endif
38 | 
39 | if cancompilefp16 # set outside this file
40 |   libtargets += static_library('libspr',
41 |     files(
42 |       'x86simdsort-spr.cpp',
43 |       ),
44 |     include_directories : [src],
45 |     cpp_args : ['-march=sapphirerapids'],
46 |     gnu_symbol_visibility : 'inlineshidden',
47 |     dependencies: [omp_dep],
48 |     )
49 | endif
50 | 
51 | install_headers('x86simdsort.h')
52 | 


--------------------------------------------------------------------------------
/utils/custom-compare.h:
--------------------------------------------------------------------------------
 1 | #ifndef UTILS_CUSTOM_COMPARE
 2 | #define UTILS_CUSTOM_COMPARE
 3 | 
 4 | #include <limits>
 5 | #include <cmath>
 6 | #include "xss-custom-float.h"
 7 | 
 8 | /*
 9 |  * Comparator wrapper that gives NAN a fixed place in the order: NAN > INF
10 |  */
11 | template <typename T, typename Comparator>
12 | struct compare {
13 |     static constexpr auto op = Comparator {};
14 |     bool operator()(const T a, const T b) const
15 |     {
16 |         if constexpr (xss::fp::is_floating_point_v<T>) {
17 |             T inf = xss::fp::infinity<T>();
18 |             T one = static_cast<T>(1.0);
19 |             if (!xss::fp::isunordered(a, b)) { return op(a, b); }
20 |             else if ((xss::fp::isnan(a)) && (!xss::fp::isnan(b))) {
21 |                 return b == inf ? op(inf, one) : op(inf, b); // a is NAN: stand in INF for it, and 1 for b when b is INF
22 |             }
23 |             else if ((!xss::fp::isnan(a)) && (xss::fp::isnan(b))) {
24 |                 return a == inf ? op(one, inf) : op(a, inf); // b is NAN: stand in INF for it, and 1 for a when a is INF
25 |             }
26 |             else {
27 |                 return op(one, one); // both NAN: compared as two equal values
28 |             }
29 |         }
30 |         else {
31 |             return op(a, b);
32 |         }
33 |     }
34 | };
35 | 
36 | // Index comparator: orders two positions by the values stored at them in arr
37 | template <typename T, typename Comparator>
38 | struct compare_arg {
39 |     // Keeps the pointer only; the array itself is not copied
40 |     compare_arg(const T *arr) : arr(arr) {}
41 |     // Applies compare<T, Comparator> to arr[a] and arr[b]
42 |     bool operator()(const int64_t a, const int64_t b) const
43 |     {
44 |         return compare<T, Comparator>()(arr[a], arr[b]);
45 |     }
46 |     const T *arr;
47 | };
48 | 
49 | #endif // UTILS_CUSTOM_COMPARE


--------------------------------------------------------------------------------
/LICENSE.md:
--------------------------------------------------------------------------------
 1 | BSD 3-Clause License
 2 | 
 3 | Copyright (c) 2022, Intel. All rights reserved.
 4 | 
 5 | Redistribution and use in source and binary forms, with or without
 6 | modification, are permitted provided that the following conditions are met:
 7 | 
 8 | 1. Redistributions of source code must retain the above copyright notice, this
 9 |    list of conditions and the following disclaimer.
10 | 
11 | 2. Redistributions in binary form must reproduce the above copyright notice,
12 |    this list of conditions and the following disclaimer in the documentation
13 |    and/or other materials provided with the distribution.
14 | 
15 | 3. Neither the name of the copyright holder nor the names of its
16 |    contributors may be used to endorse or promote products derived from
17 |    this software without specific prior written permission.
18 | 
19 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
23 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 | 


--------------------------------------------------------------------------------
/scripts/branch-compare.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | set -e
 3 | SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
 4 | BASE_DIR=$(dirname "$SCRIPT_DIR")
 5 | branch=$(git rev-parse --abbrev-ref HEAD)
 6 | # Usage: branch-compare.sh <base-branch> <repetitions>
 7 | #        branch-compare.sh <base-branch> <filter> <repetitions>
 8 | basebranch=$1
 9 | echo "Comparing $basebranch branch with $branch"
10 | 
11 | build_branch() {
12 |     dir_name=$1
13 |     if [ ! -d "$dir_name" ]; then
14 |         git clone -b "$dir_name" "${BASE_DIR}" "$dir_name"
15 |     else
16 |         # if it exists, just update it
17 |         cd "$dir_name"
18 |         git fetch origin
19 |         # Run the rebase as the `if` condition: under `set -e` a bare failing
20 |         # command would abort the script before the fallback below could run.
21 |         if ! git rebase "origin/$dir_name"; then
22 |             cd ..
23 |             rm -rf "$dir_name"
24 |             git clone -b "$dir_name" "${BASE_DIR}" "$dir_name"
25 |         else
26 |             cd ..
27 |         fi
28 |     fi
29 |     cd "$dir_name"
30 |     meson setup -Dbuild_benchmarks=true -Duse_openmp=true --warnlevel 0 --buildtype release builddir
31 |     cd builddir
32 |     ninja
33 |     cd ../../
34 | }
35 | 
36 | mkdir -p .bench
37 | cd .bench
38 | if [ ! -d google-benchmark ]; then
39 |     git clone https://github.com/google/benchmark google-benchmark
40 | fi
41 | compare=$(realpath google-benchmark/tools/compare.py)
42 | build_branch "$branch"
43 | build_branch "$basebranch"
44 | contender=$(realpath "${branch}/builddir/benchexe")
45 | baseline=$(realpath "${basebranch}/builddir/benchexe")
46 | 
47 | if [ -z "$3" ]; then
48 |     echo "Comparing all benchmarks .."
49 |     "$compare" benchmarks "$baseline" "$contender" --benchmark_repetitions="$2"
50 | else
51 |     echo "Comparing benchmark $2 .."
52 |     "$compare" benchmarksfiltered "$baseline" "$2" "$contender" "$2" --benchmark_repetitions="$3"
53 | fi
54 | 


--------------------------------------------------------------------------------
/benchmarks/bench-keyvalue.hpp:
--------------------------------------------------------------------------------
 1 | #include "x86simdsort-scalar.h"
 2 | 
 3 | template <typename T, class... Args>
 4 | static void scalarkvsort(benchmark::State &state, Args &&...args)
 5 | {
 6 |     // Unpack the benchmark arguments: (array size, array type string)
 7 |     auto args_tuple = std::make_tuple(std::forward<Args>(args)...);
 8 |     size_t arrsize = std::get<0>(args_tuple);
 9 |     std::string arrtype = std::get<1>(args_tuple);
10 |     // Keys use the requested array type; values always use "random"
11 |     std::vector<T> key = get_array<T>(arrtype, arrsize);
12 |     std::vector<T> val = get_array<T>("random", arrsize);
13 |     std::vector<T> key_bkp = key;
14 |     // Time only the sort; restoring the keys happens while timing is paused
15 |     for (auto _ : state) {
16 |         xss::scalar::keyvalue_qsort(
17 |                 key.data(), val.data(), arrsize, false, false);
18 |         state.PauseTiming();
19 |         key = key_bkp; // only the keys are reset; val is not restored
20 |         state.ResumeTiming();
21 |     }
22 | }
23 | 
24 | template <typename T, class... Args>
25 | static void simdkvsort(benchmark::State &state, Args &&...args)
26 | {
27 |     auto args_tuple = std::make_tuple(std::forward<Args>(args)...);
28 |     size_t arrsize = std::get<0>(args_tuple); // first argument: array size
29 |     std::string arrtype = std::get<1>(args_tuple); // second argument: array type string
30 |     // Keys use the requested array type; values always use "random"
31 |     std::vector<T> key = get_array<T>(arrtype, arrsize);
32 |     std::vector<T> val = get_array<T>("random", arrsize);
33 |     std::vector<T> key_bkp = key;
34 |     // Time only the sort; restoring the keys happens while timing is paused
35 |     for (auto _ : state) {
36 |         x86simdsort::keyvalue_qsort(key.data(), val.data(), arrsize);
37 |         state.PauseTiming();
38 |         key = key_bkp; // only the keys are reset; val is not restored
39 |         state.ResumeTiming();
40 |     }
41 | }
42 | 
43 | #define BENCH_BOTH_KVSORT(type) \
44 |     BENCH_SORT(simdkvsort, type) \
45 |     BENCH_SORT(scalarkvsort, type)
46 | 
47 | BENCH_BOTH_KVSORT(uint64_t)
48 | BENCH_BOTH_KVSORT(int64_t)
49 | BENCH_BOTH_KVSORT(double)
50 | BENCH_BOTH_KVSORT(uint32_t)
51 | BENCH_BOTH_KVSORT(int32_t)
52 | BENCH_BOTH_KVSORT(float)
53 | 


--------------------------------------------------------------------------------
/benchmarks/bench-qselect.hpp:
--------------------------------------------------------------------------------
 1 | template <typename T, class... Args>
 2 | static void simdqselect(benchmark::State &state, Args &&...args)
 3 | {
 4 |     // Unpack the benchmark arguments: (array size, k)
 5 |     auto args_tuple = std::make_tuple(std::forward<Args>(args)...);
 6 |     int64_t ARRSIZE = std::get<0>(args_tuple);
 7 |     int64_t k = std::get<1>(args_tuple);
 8 |     std::vector<T> arr;
 9 |     std::vector<T> arr_bkp;
10 | 
11 |     /* Build the input once and keep a pristine copy to restore from */
12 |     arr = get_uniform_rand_array<T>(ARRSIZE);
13 |     arr_bkp = arr;
14 | 
15 |     /* Time x86simdsort::qselect only; the restore runs with timing paused */
16 |     for (auto _ : state) {
17 |         x86simdsort::qselect<T>(arr.data(), k, ARRSIZE);
18 | 
19 |         state.PauseTiming();
20 |         arr = arr_bkp;
21 |         state.ResumeTiming();
22 |     }
23 | }
24 | 
25 | template <typename T, class... Args>
26 | static void scalarqselect(benchmark::State &state, Args &&...args)
27 | {
28 |     // Unpack the benchmark arguments: (array size, k)
29 |     auto args_tuple = std::make_tuple(std::forward<Args>(args)...);
30 |     int64_t ARRSIZE = std::get<0>(args_tuple);
31 |     int64_t k = std::get<1>(args_tuple);
32 |     std::vector<T> arr;
33 |     std::vector<T> arr_bkp;
34 | 
35 |     /* Build the input once and keep a pristine copy to restore from */
36 |     arr = get_uniform_rand_array<T>(ARRSIZE);
37 |     arr_bkp = arr;
38 | 
39 |     /* Time std::nth_element only; the restore runs with timing paused */
40 |     for (auto _ : state) {
41 |         std::nth_element(arr.begin(), arr.begin() + k, arr.end());
42 | 
43 |         state.PauseTiming();
44 |         arr = arr_bkp;
45 |         state.ResumeTiming();
46 |     }
47 | }
48 | 
49 | #define BENCH_BOTH_QSELECT(type) \
50 |     BENCH_PARTIAL(simdqselect, type) \
51 |     BENCH_PARTIAL(scalarqselect, type)
52 | 
53 | BENCH_BOTH_QSELECT(uint64_t)
54 | BENCH_BOTH_QSELECT(int64_t)
55 | BENCH_BOTH_QSELECT(uint32_t)
56 | BENCH_BOTH_QSELECT(int32_t)
57 | BENCH_BOTH_QSELECT(uint16_t)
58 | BENCH_BOTH_QSELECT(int16_t)
59 | BENCH_BOTH_QSELECT(float)
60 | BENCH_BOTH_QSELECT(double)
61 | #ifdef __FLT16_MAX__
62 | BENCH_BOTH_QSELECT(_Float16)
63 | #endif
64 | 


--------------------------------------------------------------------------------
/benchmarks/bench-partial-qsort.hpp:
--------------------------------------------------------------------------------
 1 | template <typename T, class... Args>
 2 | static void simdpartialsort(benchmark::State &state, Args &&...args)
 3 | {
 4 |     // Unpack the benchmark arguments: (array size, k)
 5 |     auto args_tuple = std::make_tuple(std::forward<Args>(args)...);
 6 |     int64_t ARRSIZE = std::get<0>(args_tuple);
 7 |     int64_t k = std::get<1>(args_tuple);
 8 |     std::vector<T> arr;
 9 |     std::vector<T> arr_bkp;
10 | 
11 |     /* Build the input once and keep a pristine copy to restore from */
12 |     arr = get_uniform_rand_array<T>(ARRSIZE);
13 |     arr_bkp = arr;
14 | 
15 |     /* Time x86simdsort::partial_qsort only; the restore runs with timing paused */
16 |     for (auto _ : state) {
17 |         x86simdsort::partial_qsort<T>(arr.data(), k, ARRSIZE);
18 | 
19 |         state.PauseTiming();
20 |         arr = arr_bkp;
21 |         state.ResumeTiming();
22 |     }
23 | }
24 | 
25 | /* Baseline for simdpartialsort: std::partial_sort of the first k elements;
26 |  * args are (array size, k). The input is restored with the timer paused. */
27 | template <typename T, class... Args>
28 | static void scalarpartialsort(benchmark::State &state, Args &&...args)
29 | {
30 |     // forward (not move) so lvalue arguments are never left moved-from
31 |     const auto params = std::make_tuple(std::forward<Args>(args)...);
32 |     const int64_t ARRSIZE = std::get<0>(params);
33 |     const int64_t k = std::get<1>(params);
34 | 
35 |     /* Initialize elements */
36 |     const std::vector<T> arr_bkp = get_uniform_rand_array<T>(ARRSIZE);
37 |     std::vector<T> arr = arr_bkp;
38 | 
39 |     /* call std::partial_sort */
40 |     for (auto _ : state) {
41 |         std::partial_sort(arr.begin(), arr.begin() + k, arr.end());
42 | 
43 |         state.PauseTiming();
44 |         arr = arr_bkp;
45 |         state.ResumeTiming();
46 |     }
47 | }
48 | /* Register simdpartialsort and scalarpartialsort for one key type. */
49 | #define BENCH_BOTH_PARTIAL(type) \
50 |     BENCH_PARTIAL(simdpartialsort, type) \
51 |     BENCH_PARTIAL(scalarpartialsort, type)
52 | 
53 | BENCH_BOTH_PARTIAL(uint64_t)
54 | BENCH_BOTH_PARTIAL(int64_t)
55 | BENCH_BOTH_PARTIAL(uint32_t)
56 | BENCH_BOTH_PARTIAL(int32_t)
57 | BENCH_BOTH_PARTIAL(uint16_t)
58 | BENCH_BOTH_PARTIAL(int16_t)
59 | BENCH_BOTH_PARTIAL(float)
60 | BENCH_BOTH_PARTIAL(double)
61 | #ifdef __FLT16_MAX__
62 | BENCH_BOTH_PARTIAL(_Float16)
63 | #endif // __FLT16_MAX__
64 | 


--------------------------------------------------------------------------------
/src/xss-custom-float.h:
--------------------------------------------------------------------------------
 1 | #ifndef XSS_CUSTOM_FLOAT
 2 | #define XSS_CUSTOM_FLOAT
 3 | #include <cmath>
 4 | #include <cstdint>
 5 | #include <limits>
 6 | #include <type_traits>
 7 | namespace xss {
 8 | /* Floating-point helpers: wrappers over <cmath>/<limits>/<type_traits> for
 9 |  * standard types, specialized for _Float16 if __FLT16_MAX__ is defined. */
10 | namespace fp {
11 |     template <typename T>
12 |     inline constexpr bool is_floating_point_v = std::is_floating_point_v<T>;
13 | 
14 |     template <typename T>
15 |     static bool isnan(T elem)
16 |     {
17 |         return std::isnan(elem);
18 |     }
19 |     template <typename T>
20 |     static bool isunordered(T a, T b)
21 |     {
22 |         return std::isunordered(a, b);
23 |     }
24 |     template <typename T>
25 |     static T max()
26 |     {
27 |         return std::numeric_limits<T>::max();
28 |     }
29 |     /* Note: for floating-point T this is the smallest positive normal
30 |      * value (std::numeric_limits semantics), not the most negative one. */
31 |     template <typename T>
32 |     static T min()
33 |     {
34 |         return std::numeric_limits<T>::min();
35 |     }
36 |     template <typename T>
37 |     static T infinity()
38 |     {
39 |         return std::numeric_limits<T>::infinity();
40 |     }
41 |     template <typename T>
42 |     static T quiet_NaN()
43 |     {
44 |         return std::numeric_limits<T>::quiet_NaN();
45 |     }
46 | 
47 | #ifdef __FLT16_MAX__
48 |     typedef union {
49 |         _Float16 f_;
50 |         uint16_t i_;
51 |     } Fp16Bits;
52 | 
53 |     static _Float16 convert_bits(uint16_t val)
54 |     {
55 |         Fp16Bits temp;
56 |         temp.i_ = val;
57 |         return temp.f_;
58 |     }
59 | 
60 |     template <>
61 |     [[maybe_unused]] inline constexpr bool is_floating_point_v<_Float16> = true;
62 | 
63 |     template <>
64 |     [[maybe_unused]] bool isnan<_Float16>(_Float16 elem)
65 |     {
66 |         return elem != elem; // only NaN compares unequal to itself
67 |     }
68 |     template <>
69 |     [[maybe_unused]] bool isunordered<_Float16>(_Float16 a, _Float16 b)
70 |     {
71 |         return isnan(a) || isnan(b);
72 |     }
73 |     /* Constants below are IEEE 754 binary16 bit patterns:
74 |      * 1 sign bit, 5 exponent bits, 10 mantissa bits. */
75 |     template <>
76 |     [[maybe_unused]] _Float16 max<_Float16>()
77 |     {
78 |         return convert_bits(0x7bff); // largest finite value (65504)
79 |     }
80 |     template <>
81 |     [[maybe_unused]] _Float16 min<_Float16>()
82 |     {
83 |         return convert_bits(0x0400); // smallest positive normal value
84 |     }
85 |     template <>
86 |     [[maybe_unused]] _Float16 infinity<_Float16>()
87 |     {
88 |         return convert_bits(0x7c00); // +infinity
89 |     }
90 |     template <>
91 |     [[maybe_unused]] _Float16 quiet_NaN<_Float16>()
92 |     {
93 |         // Quiet NaN: exponent all ones with the top mantissa bit set.
94 |         return convert_bits(0x7e00);
95 |     }
96 | #endif // __FLT16_MAX__
97 | } // namespace fp
98 | } // namespace xss
99 | #endif // XSS_CUSTOM_FLOAT


--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
 1 | # Contributing
 2 | 
 3 | ### License
 4 | 
 5 | x86-simd-sort is licensed under the terms in
 6 | [LICENSE](https://github.com/intel/x86-simd-sort/blob/main/LICENSE.md). By
 7 | contributing to the project, you agree to the license and copyright terms
 8 | therein and release your contribution under these terms.
 9 | 
10 | ### Sign your work
11 | 
12 | Please use the sign-off line at the end of the patch. Your signature certifies
13 | that you wrote the patch or otherwise have the right to pass it on as an
14 | open-source patch. The rules are pretty simple: if you can certify the below
15 | (from [developercertificate.org](http://developercertificate.org/)):
16 | 
17 | ```
18 | Developer Certificate of Origin
19 | Version 1.1
20 | 
21 | Copyright (C) 2004, 2006 The Linux Foundation and its contributors.
22 | 660 York Street, Suite 102,
23 | San Francisco, CA 94110 USA
24 | 
25 | Everyone is permitted to copy and distribute verbatim copies of this
26 | license document, but changing it is not allowed.
27 | 
28 | Developer's Certificate of Origin 1.1
29 | 
30 | By making a contribution to this project, I certify that:
31 | 
32 | (a) The contribution was created in whole or in part by me and I
33 |     have the right to submit it under the open source license
34 |     indicated in the file; or
35 | 
36 | (b) The contribution is based upon previous work that, to the best
37 |     of my knowledge, is covered under an appropriate open source
38 |     license and I have the right under that license to submit that
39 |     work with modifications, whether created in whole or in part
40 |     by me, under the same open source license (unless I am
41 |     permitted to submit under a different license), as indicated
42 |     in the file; or
43 | 
44 | (c) The contribution was provided directly to me by some other
45 |     person who certified (a), (b) or (c) and I have not modified
46 |     it.
47 | 
48 | (d) I understand and agree that this project and the contribution
49 |     are public and that a record of the contribution (including all
50 |     personal information I submit with it, including my sign-off) is
51 |     maintained indefinitely and may be redistributed consistent with
52 |     this project or the open source license(s) involved.
53 | ```
54 | 
55 | Then you just add a line to every git commit message:
56 | 
57 |     Signed-off-by: Joe Smith <joe.smith@email.com>
58 | 
59 | Use your real name (sorry, no pseudonyms or anonymous contributions.)
60 | 
61 | If you set your `user.name` and `user.email` git configs, you can sign your
62 | commit automatically with `git commit -s`.
63 | 


--------------------------------------------------------------------------------
/tests/test-objqsort.cpp:
--------------------------------------------------------------------------------
 1 | /*******************************************
 2 |  * * Copyright (C) 2022-2023 Intel Corporation
 3 |  * * SPDX-License-Identifier: BSD-3-Clause
 4 |  * *******************************************/
 5 | 
 6 | #include "rand_array.h"
 7 | #include "x86simdsort.h"
 8 | #include <gtest/gtest.h>
 9 | 
10 | /* Test object: `x` is the sort key returned by metric(); `y` is payload. */
11 | template <typename T>
12 | struct P {
13 |     T x;
14 |     T y;
15 |     T metric() const { return x; }
16 |     /* Equality looks at the key only; `y` is not compared. */
17 |     bool operator==(const P &rhs) const
18 |     {
19 |         return rhs.x == x;
20 |     }
21 | };
22 | 
23 | /* Fixture for the object-sort tests: provides the array flavours and the
24 |  * array sizes 0..1023 used by the typed tests. */
25 | template <typename T>
26 | class simdobjsort : public ::testing::Test {
27 | public:
28 |     simdobjsort()
29 |         : arrtype {"random", "constant", "sorted", "reverse", "smallrange",
30 |                    "max_at_the_end", "random_5d", "rand_max"}
31 |         , arrsize(1024)
32 |     {
33 |         // Sizes are simply 0, 1, ..., 1023.
34 |         std::iota(arrsize.begin(), arrsize.end(), 0);
35 |     }
36 | 
37 |     // Array-type names handed to get_array() by the tests.
38 |     std::vector<std::string> arrtype;
39 |     std::vector<size_t> arrsize;
40 | };
41 | 
42 | TYPED_TEST_SUITE_P(simdobjsort);
43 | 
44 | /* For every array flavour and size, sorting objects by metric() with
45 |  * object_qsort must match std::sort on the same key. */
46 | TYPED_TEST_P(simdobjsort, test_objsort)
47 | {
48 |     using Obj = P<TypeParam>;
49 |     for (const auto &type : this->arrtype) {
50 |         for (const size_t size : this->arrsize) {
51 |             const std::vector<TypeParam> x = get_array<TypeParam>(type, size);
52 |             const std::vector<TypeParam> y
53 |                     = get_array<TypeParam>("random", size);
54 |             std::vector<Obj> arr(size);
55 |             for (size_t ii = 0; ii < size; ++ii) {
56 |                 arr[ii].x = x[ii];
57 |                 arr[ii].y = y[ii];
58 |             }
59 |             // Reference copy, ordered with the scalar std::sort.
60 |             std::vector<Obj> arr_bckp = arr;
61 | 
62 |             x86simdsort::object_qsort(
63 |                     arr.data(), size, [](Obj p) { return p.metric(); });
64 |             std::sort(arr_bckp.begin(),
65 |                       arr_bckp.end(),
66 |                       [](const Obj &a, const Obj &b) {
67 |                           return a.metric() < b.metric();
68 |                       });
69 |             // P::operator== compares the key only, so ties may differ in y.
70 |             ASSERT_EQ(arr, arr_bckp);
71 |         }
72 |     }
73 | }
74 | 
75 | REGISTER_TYPED_TEST_SUITE_P(simdobjsort, test_objsort);
76 | // Key types for which the typed suite is instantiated.
77 | using QObjSortTestTypes
78 |         = testing::Types<double, uint64_t, int64_t, uint32_t, int32_t, float>;
79 | // Instantiate the suite for each of those types under the "xss" prefix.
80 | INSTANTIATE_TYPED_TEST_SUITE_P(xss, simdobjsort, QObjSortTestTypes);
81 | 


--------------------------------------------------------------------------------
/example.c:
--------------------------------------------------------------------------------
 1 | #include <stdio.h>
 2 | #include <stdint.h>
 3 | #include <stdlib.h>
 4 | #include <stdbool.h>
 5 | 
 6 | // declare function here, linker will find this when linked to
 7 | // libx86simdsortcpp.so
 8 | void keyvalue_qsort_float_sizet(float *, size_t *, size_t);
 9 | void keyvalue_qsort_float_uint32(float *, uint32_t *, uint32_t);
10 | void keyvalue_qsort_sizet_sizet(size_t *, size_t *, size_t); // used below
11 | void keyvalue_qsort_sizet_uint32(size_t *, uint32_t *, uint32_t);
12 | void keyvalue_qsort_uint32_sizet(uint32_t *, size_t *, size_t);
13 | void keyvalue_qsort_uint32_uint32(uint32_t *, uint32_t *, uint32_t);
14 | void keyvalue_qsort_int32_sizet(int32_t *, size_t *, size_t);
15 | void keyvalue_qsort_int32_uint32(int32_t *, uint32_t *, uint32_t);
16 | 
17 | // struct definition, we will sort an array of these:
18 | struct Point {
19 |     int x;
20 |     int y;
21 |     float distance;
22 |     size_t metric; // key that object_qsort() copies out and sorts on
23 | };
24 | // Exchange two lvalues `a` and `b` of the given `type` via a temporary.
25 | #define SWAP(a, b, type) \
26 |     { \
27 |         type temp = a; \
28 |         a = b; \
29 |         b = temp; \
30 |     }
31 | 
32 | // Reorder arr by metric; arr is left untouched if allocation fails.
33 | void object_qsort(struct Point *arr, size_t size)
34 | {
35 |     /* (1) Create and initialize arrays of key and value  */
36 |     size_t *key = malloc(size * sizeof(size_t));
37 |     size_t *arg = malloc(size * sizeof(size_t));
38 |     bool *done = calloc(size, sizeof(bool)); /* zero-filled: all false */
39 |     if (key == NULL || arg == NULL || done == NULL) { goto cleanup; }
40 |     for (size_t ii = 0; ii < size; ++ii) {
41 |         key[ii] = arr[ii].metric;
42 |         arg[ii] = ii;
43 |     }
44 |     /* (2) IndexSort using the keyvalue_qsort */
45 |     keyvalue_qsort_sizet_sizet(key, arg, size);
46 | 
47 |     /* (3) Permute obj array in-place, one permutation cycle at a time */
48 |     for (size_t ii = 0; ii < size; ++ii) {
49 |         if (done[ii]) { continue; }
50 |         done[ii] = true;
51 |         size_t prev_j = ii;
52 |         size_t jj = arg[ii];
53 |         while (ii != jj) {
54 |             SWAP(arr[prev_j], arr[jj], struct Point);
55 |             done[jj] = true;
56 |             prev_j = jj;
57 |             jj = arg[jj];
58 |         }
59 |     }
60 | cleanup:
61 |     free(key);
62 |     free(arg);
63 |     free(done);
64 | }
65 | 
66 | // Demo: sort 10 pseudo-random points by `metric` and print the metrics.
67 | int main()
68 | {
69 |     const size_t size = 10;
70 |     struct Point arr[size];
71 | 
72 |     // Initialize every field: whole structs get copied while sorting.
73 |     for (size_t ii = 0; ii < size; ++ii) {
74 |         arr[ii] = (struct Point) {.distance = (float)rand() / RAND_MAX};
75 |         arr[ii].metric = rand() % 100;
76 |     }
77 | 
78 |     object_qsort(arr, size);
79 | 
80 |     // print the metrics; %zu is the printf conversion for size_t
81 |     printf("arr = ");
82 |     for (size_t ii = 0; ii < size; ++ii) {
83 |         printf("%zu, ", arr[ii].metric);
84 |     }
85 |     printf("\n");
86 |     return 0;
87 | }
88 | 


--------------------------------------------------------------------------------
/lib/x86simdsort-icl.cpp:
--------------------------------------------------------------------------------
 1 | // ICL specific routines:
 2 | #include "x86simdsort-static-incl.h"
 3 | #include "x86simdsort-internal.h"
 4 | 
 5 | namespace xss { // explicit specializations forwarding to x86simdsortStatic
 6 | namespace avx512 { // uint16_t and int16_t
 7 |     template <>
 8 |     void qsort(uint16_t *arr, size_t size, bool hasnan, bool descending)
 9 |     {
10 |         x86simdsortStatic::qsort(arr, size, hasnan, descending);
11 |     }
12 |     template <>
13 |     void qselect(uint16_t *arr,
14 |                  size_t k,
15 |                  size_t arrsize,
16 |                  bool hasnan,
17 |                  bool descending)
18 |     {
19 |         x86simdsortStatic::qselect(arr, k, arrsize, hasnan, descending);
20 |     }
21 |     template <>
22 |     void partial_qsort(uint16_t *arr,
23 |                        size_t k,
24 |                        size_t arrsize,
25 |                        bool hasnan,
26 |                        bool descending)
27 |     {
28 |         x86simdsortStatic::partial_qsort(arr, k, arrsize, hasnan, descending);
29 |     }
30 |     template <>
31 |     void qsort(int16_t *arr, size_t size, bool hasnan, bool descending)
32 |     {
33 |         x86simdsortStatic::qsort(arr, size, hasnan, descending);
34 |     }
35 |     template <>
36 |     void qselect(int16_t *arr,
37 |                  size_t k,
38 |                  size_t arrsize,
39 |                  bool hasnan,
40 |                  bool descending)
41 |     {
42 |         x86simdsortStatic::qselect(arr, k, arrsize, hasnan, descending);
43 |     }
44 |     template <>
45 |     void partial_qsort(int16_t *arr,
46 |                        size_t k,
47 |                        size_t arrsize,
48 |                        bool hasnan,
49 |                        bool descending)
50 |     {
51 |         x86simdsortStatic::partial_qsort(arr, k, arrsize, hasnan, descending);
52 |     }
53 | } // namespace avx512
54 | namespace fp16_icl { // _Float16, compiled only if __FLT16_MAX__ is defined
55 | #ifdef __FLT16_MAX__
56 |     template <>
57 |     void qsort(_Float16 *arr, size_t size, bool hasnan, bool descending)
58 |     {
59 |         x86simdsortStatic::qsort(arr, size, hasnan, descending);
60 |     }
61 |     template <>
62 |     void qselect(_Float16 *arr,
63 |                  size_t k,
64 |                  size_t arrsize,
65 |                  bool hasnan,
66 |                  bool descending)
67 |     {
68 |         x86simdsortStatic::qselect(arr, k, arrsize, hasnan, descending);
69 |     }
70 |     template <>
71 |     void partial_qsort(_Float16 *arr,
72 |                        size_t k,
73 |                        size_t arrsize,
74 |                        bool hasnan,
75 |                        bool descending)
76 |     {
77 |         x86simdsortStatic::partial_qsort(arr, k, arrsize, hasnan, descending);
78 |     }
79 | #endif // __FLT16_MAX__
80 | } // namespace fp16_icl
81 | } // namespace xss
82 | 


--------------------------------------------------------------------------------
/run-bench.py:
--------------------------------------------------------------------------------
 1 | import sys
 2 | import argparse
 3 | import subprocess
 4 | 
 5 | parser = argparse.ArgumentParser()
 6 | parser.add_argument('--branchcompare', action='store_true', help='Compare benchmarks of current branch with main. Provide an optional --filter')
 7 | parser.add_argument("-b", '--branch', type=str, default="main", required=False)
 8 | parser.add_argument('--benchcompare', type=str, help='Compare simd bench with stdsort methods. Requires one of qsort, qselect, partialsort, argsort or argselect')
 9 | parser.add_argument("-f", '--filter', type=str, required=False)
10 | parser.add_argument("-r", '--repeat', type=int, required=False)
11 | args = parser.parse_args()
12 | 
13 | if len(sys.argv) == 1:
14 |         parser.error("requires one of --benchcompare or --branchcompare")
15 | 
16 | filterb = ""
17 | if args.filter is not None:
18 |     filterb = args.filter
19 | repeatnum = 1
20 | if args.repeat is not None:
21 |     repeatnum = args.repeat
22 | 
23 | if args.benchcompare:
24 |     baseline = ""
25 |     contender = ""
26 |     if "ippsort" in args.benchcompare:
27 |         baseline = "ippsort.*" + filterb
28 |         contender = "simdsort.*" + filterb
29 |     elif "ippargsort" in args.benchcompare:
30 |         baseline = "ippargsort.*" + filterb
31 |         contender = "simd_ordern_argsort.*" + filterb
32 |     elif "vqsort" in args.benchcompare:
33 |         baseline = "vqsort.*" + filterb
34 |         contender = "simdsort.*" + filterb
35 |     elif "qsort" in args.benchcompare:
36 |         baseline = "scalarsort.*" + filterb
37 |         contender = "simdsort.*" + filterb
38 |     elif "select" in args.benchcompare:
39 |         baseline = "scalarqselect.*" + filterb
40 |         contender = "simdqselect.*" + filterb
41 |     elif "partial" in args.benchcompare:
42 |         baseline = "scalarpartialsort.*" + filterb
43 |         contender = "simdpartialsort.*" + filterb
44 |     elif "argsort" in args.benchcompare:
45 |         baseline = "scalarargsort.*" + filterb
46 |         contender = "simdargsort.*" + filterb
47 |     elif "keyvalue" in args.benchcompare:
48 |         baseline = "scalarkvsort.*" + filterb
49 |         contender = "simdkvsort.*" + filterb
50 |     elif "objsort" in args.benchcompare:
51 |         baseline = "scalarobjsort.*" + filterb
52 |         contender = "simdobjsort.*" + filterb
53 |     else:
54 |         parser.print_help(sys.stderr)
55 |         parser.error("ERROR: Unknown argument '%s'" % args.benchcompare)
56 |     rc = subprocess.check_call("./scripts/bench-compare.sh '%s' '%s' '%d'" % (baseline, contender, repeatnum), shell=True)
57 | 
58 | if args.branchcompare:
59 |     branch = args.branch
60 |     if args.filter is None:
61 |         rc = subprocess.check_call("./scripts/branch-compare.sh '%s' '%d'" % (branch, repeatnum), shell=True)
62 |     else:
63 |         rc = subprocess.check_call("./scripts/branch-compare.sh '%s' '%s' '%d'" % (branch, args.filter, repeatnum), shell=True)
64 | 


--------------------------------------------------------------------------------
/lib/x86simdsort-internal.h:
--------------------------------------------------------------------------------
 1 | #ifndef XSS_INTERNAL_METHODS
 2 | #define XSS_INTERNAL_METHODS
 3 | #include "x86simdsort.h"
 4 | #include <stdint.h>
 5 | #include <vector>
 6 | /* Declare, in namespace `name`, the XSS_HIDE_SYMBOL sort/select templates. */
 7 | #define DECLAREALLFUNCS(name) \
 8 |     namespace name { \
 9 |     template <typename T> \
10 |     XSS_HIDE_SYMBOL void qsort(T *arr, \
11 |                                size_t arrsize, \
12 |                                bool hasnan = false, \
13 |                                bool descending = false); \
14 |     template <typename T1, typename T2> \
15 |     XSS_HIDE_SYMBOL void keyvalue_qsort(T1 *key, \
16 |                                         T2 *val, \
17 |                                         size_t arrsize, \
18 |                                         bool hasnan = false, \
19 |                                         bool descending = false); \
20 |     template <typename T> \
21 |     XSS_HIDE_SYMBOL void qselect(T *arr, \
22 |                                  size_t k, \
23 |                                  size_t arrsize, \
24 |                                  bool hasnan = false, \
25 |                                  bool descending = false); \
26 |     template <typename T1, typename T2> \
27 |     XSS_HIDE_SYMBOL void keyvalue_select(T1 *key, \
28 |                                          T2 *val, \
29 |                                          size_t k, \
30 |                                          size_t arrsize, \
31 |                                          bool hasnan = false, \
32 |                                          bool descending = false); \
33 |     template <typename T> \
34 |     XSS_HIDE_SYMBOL void partial_qsort(T *arr, \
35 |                                        size_t k, \
36 |                                        size_t arrsize, \
37 |                                        bool hasnan = false, \
38 |                                        bool descending = false); \
39 |     template <typename T1, typename T2> \
40 |     XSS_HIDE_SYMBOL void keyvalue_partial_sort(T1 *key, \
41 |                                                T2 *val, \
42 |                                                size_t k, \
43 |                                                size_t arrsize, \
44 |                                                bool hasnan = false, \
45 |                                                bool descending = false); \
46 |     template <typename T> \
47 |     XSS_HIDE_SYMBOL std::vector<size_t> argsort(T *arr, \
48 |                                                 size_t arrsize, \
49 |                                                 bool hasnan = false, \
50 |                                                 bool descending = false); \
51 |     template <typename T> \
52 |     XSS_HIDE_SYMBOL std::vector<size_t> \
53 |     argselect(T *arr, size_t k, size_t arrsize, bool hasnan = false); \
54 |     }
55 | /* Emit the declarations once for each implementation namespace. */
56 | namespace xss {
57 | DECLAREALLFUNCS(avx512)
58 | DECLAREALLFUNCS(avx2)
59 | DECLAREALLFUNCS(scalar)
60 | DECLAREALLFUNCS(fp16_spr)
61 | DECLAREALLFUNCS(fp16_icl)
62 | } // namespace xss
63 | #endif // XSS_INTERNAL_METHODS
64 | 


--------------------------------------------------------------------------------
/benchmarks/bench.h:
--------------------------------------------------------------------------------
 1 | #include "rand_array.h"
 2 | #include "x86simdsort.h"
 3 | #include <benchmark/benchmark.h>
 4 | /* Register func<T>(state, args...) under the name "func/test_case_name/T". */
 5 | #define MY_BENCHMARK_CAPTURE(func, T, test_case_name, ...) \
 6 |     BENCHMARK_PRIVATE_DECLARE(func) \
 7 |             = (::benchmark::internal::RegisterBenchmarkInternal( \
 8 |                 std::unique_ptr<benchmark::internal::Benchmark>( \
 9 |                     new ::benchmark::internal::FunctionBenchmark( \
10 |                             #func "/" #test_case_name "/" #T, \
11 |                             [](::benchmark::State &st) { \
12 |                                 func<T>(st, __VA_ARGS__); \
13 |                             }))))
14 | /* Sort benchmarks: one (array size, array type) case per registration. */
15 | #define BENCH_SORT(func, type) \
16 |     MY_BENCHMARK_CAPTURE(func, type, random_128, 128, std::string("random")); \
17 |     MY_BENCHMARK_CAPTURE(func, type, random_256, 256, std::string("random")); \
18 |     MY_BENCHMARK_CAPTURE(func, type, random_512, 512, std::string("random")); \
19 |     MY_BENCHMARK_CAPTURE(func, type, random_1k, 1024, std::string("random")); \
20 |     MY_BENCHMARK_CAPTURE(func, type, random_5k, 5000, std::string("random")); \
21 |     MY_BENCHMARK_CAPTURE( \
22 |             func, type, random_100k, 100000, std::string("random")); \
23 |     MY_BENCHMARK_CAPTURE( \
24 |             func, type, random_1m, 1000000, std::string("random")); \
25 |     MY_BENCHMARK_CAPTURE( \
26 |             func, type, random_10m, 10000000, std::string("random")); \
27 |     MY_BENCHMARK_CAPTURE( \
28 |             func, type, random_100m, 100000000, std::string("random")); \
29 |     MY_BENCHMARK_CAPTURE( \
30 |             func, type, smallrange_128, 128, std::string("smallrange")); \
31 |     MY_BENCHMARK_CAPTURE( \
32 |             func, type, smallrange_256, 256, std::string("smallrange")); \
33 |     MY_BENCHMARK_CAPTURE( \
34 |             func, type, smallrange_512, 512, std::string("smallrange")); \
35 |     MY_BENCHMARK_CAPTURE( \
36 |             func, type, smallrange_1k, 1024, std::string("smallrange")); \
37 |     MY_BENCHMARK_CAPTURE( \
38 |             func, type, smallrange_5k, 5000, std::string("smallrange")); \
39 |     MY_BENCHMARK_CAPTURE( \
40 |             func, type, smallrange_100k, 100000, std::string("smallrange")); \
41 |     MY_BENCHMARK_CAPTURE( \
42 |             func, type, smallrange_1m, 1000000, std::string("smallrange")); \
43 |     MY_BENCHMARK_CAPTURE( \
44 |             func, type, smallrange_10m, 10000000, std::string("smallrange")); \
45 |     MY_BENCHMARK_CAPTURE( \
46 |             func, type, sorted_10k, 10000, std::string("sorted")); \
47 |     MY_BENCHMARK_CAPTURE( \
48 |             func, type, constant_10k, 10000, std::string("constant")); \
49 |     MY_BENCHMARK_CAPTURE( \
50 |             func, type, reverse_10k, 10000, std::string("reverse"));
51 | /* Partial-sort/select benchmarks: (array size 10000, k) cases. */
52 | #define BENCH_PARTIAL(func, type) \
53 |     MY_BENCHMARK_CAPTURE(func, type, k10, 10000, 10); \
54 |     MY_BENCHMARK_CAPTURE(func, type, k100, 10000, 100); \
55 |     MY_BENCHMARK_CAPTURE(func, type, k1000, 10000, 1000); \
56 |     MY_BENCHMARK_CAPTURE(func, type, k5000, 10000, 5000);
57 | 


--------------------------------------------------------------------------------
/benchmarks/bench-qsort.hpp:
--------------------------------------------------------------------------------
 1 | /* Baseline benchmark: std::sort on an array built by get_array<T>; args are
 2 |  * (array size, array type). The input is restored with timing paused. */
 3 | template <typename T, class... Args>
 4 | static void scalarsort(benchmark::State &state, Args &&...args)
 5 | {
 6 |     // forward (not move) so lvalue arguments are left intact
 7 |     const auto params = std::make_tuple(std::forward<Args>(args)...);
 8 |     const size_t arrsize = std::get<0>(params);
 9 |     const std::string arrtype = std::get<1>(params);
10 |     const std::vector<T> arr_bkp = get_array<T>(arrtype, arrsize);
11 |     std::vector<T> arr = arr_bkp;
12 |     for (auto _ : state) {
13 |         std::sort(arr.begin(), arr.end());
14 |         state.PauseTiming();
15 |         arr = arr_bkp;
16 |         state.ResumeTiming();
17 |     }
18 | }
19 | 
20 | /* Benchmark x86simdsort::qsort on an array built by get_array<T>; args are
21 |  * (array size, array type). The input is restored with timing paused. */
22 | template <typename T, class... Args>
23 | static void simdsort(benchmark::State &state, Args &&...args)
24 | {
25 |     // forward (not move) so lvalue arguments are left intact
26 |     const auto params = std::make_tuple(std::forward<Args>(args)...);
27 |     const size_t arrsize = std::get<0>(params);
28 |     const std::string arrtype = std::get<1>(params);
29 |     const std::vector<T> arr_bkp = get_array<T>(arrtype, arrsize);
30 |     std::vector<T> arr = arr_bkp;
31 |     for (auto _ : state) {
32 |         x86simdsort::qsort(arr.data(), arrsize);
33 |         state.PauseTiming();
34 |         arr = arr_bkp;
35 |         state.ResumeTiming();
36 |     }
37 | }
38 | 
39 | /* Baseline benchmark: std::sort over reverse iterators (descending result);
40 |  * args are (array size, array type). Input is restored with timing paused. */
41 | template <typename T, class... Args>
42 | static void scalar_revsort(benchmark::State &state, Args &&...args)
43 | {
44 |     // forward (not move) so lvalue arguments are left intact
45 |     const auto params = std::make_tuple(std::forward<Args>(args)...);
46 |     const size_t arrsize = std::get<0>(params);
47 |     const std::string arrtype = std::get<1>(params);
48 |     const std::vector<T> arr_bkp = get_array<T>(arrtype, arrsize);
49 |     std::vector<T> arr = arr_bkp;
50 |     for (auto _ : state) {
51 |         std::sort(arr.rbegin(), arr.rend());
52 |         state.PauseTiming();
53 |         arr = arr_bkp;
54 |         state.ResumeTiming();
55 |     }
56 | }
57 | 
58 | /* Benchmark x86simdsort::qsort with its fourth argument set to true; args
59 |  * are (array size, array type). Input is restored with timing paused. */
60 | template <typename T, class... Args>
61 | static void simd_revsort(benchmark::State &state, Args &&...args)
62 | {
63 |     // forward (not move) so lvalue arguments are left intact
64 |     const auto params = std::make_tuple(std::forward<Args>(args)...);
65 |     const size_t arrsize = std::get<0>(params);
66 |     const std::string arrtype = std::get<1>(params);
67 |     const std::vector<T> arr_bkp = get_array<T>(arrtype, arrsize);
68 |     std::vector<T> arr = arr_bkp;
69 |     for (auto _ : state) {
70 |         x86simdsort::qsort(arr.data(), arrsize, false, true);
71 |         state.PauseTiming();
72 |         arr = arr_bkp;
73 |         state.ResumeTiming();
74 |     }
75 | }
76 | /* Register the simd/scalar sort and reverse-sort benchmarks for one type. */
77 | #define BENCH_BOTH_QSORT(type) \
78 |     BENCH_SORT(simdsort, type) \
79 |     BENCH_SORT(scalarsort, type) \
80 |     BENCH_SORT(simd_revsort, type) \
81 |     BENCH_SORT(scalar_revsort, type)
82 | 
83 | BENCH_BOTH_QSORT(uint64_t)
84 | BENCH_BOTH_QSORT(int64_t)
85 | BENCH_BOTH_QSORT(uint32_t)
86 | BENCH_BOTH_QSORT(int32_t)
87 | BENCH_BOTH_QSORT(uint16_t)
88 | BENCH_BOTH_QSORT(int16_t)
89 | BENCH_BOTH_QSORT(float)
90 | BENCH_BOTH_QSORT(double)
91 | #ifdef __FLT16_MAX__
92 | BENCH_BOTH_QSORT(_Float16)
93 | #endif // __FLT16_MAX__
94 | 


--------------------------------------------------------------------------------
/benchmarks/bench-ipp.cpp:
--------------------------------------------------------------------------------
 1 | #include "bench.h"
 2 | #include "ipp.h"
 3 | 
 4 | /* Benchmark IPP's in-place radix sort (float/double only); args are
 5 |  * (array size, array type). The scratch buffer lives in a std::vector so
 6 |  * it is released when the benchmark returns. */
 7 | template <typename T, class... Args>
 8 | static void ippsort(benchmark::State &state, Args &&...args)
 9 | {
10 |     const auto params = std::make_tuple(std::forward<Args>(args)...);
11 |     const size_t arrsize = std::get<0>(params);
12 |     const std::string arrtype = std::get<1>(params);
13 |     /* Ask IPP how much scratch space the radix sort needs */
14 |     int bufsize = 10;
15 |     if constexpr (std::is_same_v<T, float>) {
16 |         ippsSortRadixGetBufferSize(arrsize, ipp32f, &bufsize);
17 |     }
18 |     else if constexpr (std::is_same_v<T, double>) {
19 |         ippsSortRadixGetBufferSize(arrsize, ipp64f, &bufsize);
20 |     }
21 |     std::vector<unsigned char> temp(bufsize);
22 | 
23 |     const std::vector<T> arr_bkp = get_array<T>(arrtype, arrsize);
24 |     std::vector<T> arr = arr_bkp;
25 |     for (auto _ : state) {
26 |         if constexpr (std::is_same_v<T, float>) {
27 |             ippsSortRadixAscend_32f_I(arr.data(), arrsize, temp.data());
28 |         }
29 |         else if constexpr (std::is_same_v<T, double>) {
30 |             ippsSortRadixAscend_64f_I(arr.data(), arrsize, temp.data());
31 |         }
32 |         state.PauseTiming();
33 |         arr = arr_bkp;
34 |         state.ResumeTiming();
35 |     }
36 | }
37 | 
38 | /* Benchmark IPP's radix index sort (float, double, int32_t); args are
39 |  * (array size, array type). Data and index arrays are both reset with
40 |  * timing paused; the scratch buffer is freed automatically on return. */
41 | template <typename T, class... Args>
42 | static void ippargsort(benchmark::State &state, Args &&...args)
43 | {
44 |     const auto params = std::make_tuple(std::forward<Args>(args)...);
45 |     const size_t arrsize = std::get<0>(params);
46 |     const std::string arrtype = std::get<1>(params);
47 |     /* Ask IPP how much scratch space the index sort needs */
48 |     int bufsize = 10;
49 |     if constexpr (std::is_same_v<T, float>) {
50 |         ippsSortRadixIndexGetBufferSize(arrsize, ipp32f, &bufsize);
51 |     }
52 |     else if constexpr (std::is_same_v<T, double>) {
53 |         ippsSortRadixIndexGetBufferSize(arrsize, ipp64f, &bufsize);
54 |     }
55 |     else if constexpr (std::is_same_v<T, int32_t>) {
56 |         ippsSortRadixIndexGetBufferSize(arrsize, ipp32s, &bufsize);
57 |     }
58 |     std::vector<unsigned char> temp(bufsize);
59 | 
60 |     const std::vector<T> arr_bkp = get_array<T>(arrtype, arrsize);
61 |     std::vector<T> arr = arr_bkp;
62 |     std::vector<int32_t> arg(arrsize);
63 |     std::iota(arg.begin(), arg.end(), 0);
64 |     for (auto _ : state) {
65 |         if constexpr (std::is_same_v<T, float>) {
66 |             ippsSortRadixIndexAscend_32f(
67 |                     arr.data(), 4, arg.data(), arrsize, temp.data());
68 |         }
69 |         else if constexpr (std::is_same_v<T, double>) {
70 |             ippsSortRadixIndexAscend_64f(
71 |                     arr.data(), 8, arg.data(), arrsize, temp.data());
72 |         }
73 |         else if constexpr (std::is_same_v<T, int32_t>) {
74 |             ippsSortRadixIndexAscend_32s(
75 |                     arr.data(), 4, arg.data(), arrsize, temp.data());
76 |         }
77 |         state.PauseTiming();
78 |         arr = arr_bkp;
79 |         std::iota(arg.begin(), arg.end(), 0);
80 |         state.ResumeTiming();
81 |     }
82 | }
83 | 
// Register the IPP benchmarks through BENCH_SORT (the macro is not defined in
// this file): ippsort for double and float, ippargsort for double, float and
// int32_t.
BENCH_SORT(ippsort, double)
BENCH_SORT(ippsort, float)
BENCH_SORT(ippargsort, double)
BENCH_SORT(ippargsort, float)
BENCH_SORT(ippargsort, int32_t)
89 | 


--------------------------------------------------------------------------------
/.clang-format:
--------------------------------------------------------------------------------
 1 | ---
 2 | Language:        Cpp
 3 | AccessModifierOffset: -4
 4 | AlignAfterOpenBracket: Align
 5 | AlignConsecutiveAssignments: false
 6 | AlignConsecutiveDeclarations: false
 7 | AlignEscapedNewlines: DontAlign
 8 | AlignOperands:   false
 9 | AlignTrailingComments: false
10 | AllowAllParametersOfDeclarationOnNextLine: true
11 | AllowShortBlocksOnASingleLine: true
12 | AllowShortCaseLabelsOnASingleLine: true
13 | AllowShortFunctionsOnASingleLine: Empty
14 | AllowShortIfStatementsOnASingleLine: true
15 | AllowShortLoopsOnASingleLine: false
16 | AlwaysBreakAfterDefinitionReturnType: None
17 | AlwaysBreakAfterReturnType: None
18 | AlwaysBreakBeforeMultilineStrings: true
19 | AlwaysBreakTemplateDeclarations: Yes
20 | BinPackArguments: false
21 | BinPackParameters: false
22 | BraceWrapping:
23 |   AfterClass:      false
24 |   AfterControlStatement: false
25 |   AfterEnum:       false
26 |   AfterFunction:   true
27 |   AfterNamespace:  false
28 |   AfterObjCDeclaration: false
29 |   AfterStruct:     false
30 |   AfterUnion:      false
31 |   AfterExternBlock: false
32 |   BeforeCatch:     false
33 |   BeforeElse:      true
34 |   IndentBraces:    false
35 |   SplitEmptyFunction: true
36 |   SplitEmptyRecord: true
37 |   SplitEmptyNamespace: true
38 | BreakBeforeBinaryOperators: All
39 | BreakBeforeBraces: Custom
40 | BreakBeforeInheritanceComma: false
41 | BreakInheritanceList: BeforeColon
42 | BreakBeforeTernaryOperators: true
43 | BreakConstructorInitializers: BeforeComma
44 | BreakAfterJavaFieldAnnotations: false
45 | BreakStringLiterals: true
46 | ColumnLimit:     80
47 | CommentPragmas:  '^ IWYU pragma:'
48 | CompactNamespaces: false
49 | ConstructorInitializerAllOnOneLineOrOnePerLine: true
50 | ConstructorInitializerIndentWidth: 4
51 | ContinuationIndentWidth: 8
52 | Cpp11BracedListStyle: true
53 | DerivePointerAlignment: false
54 | FixNamespaceComments: true
55 | ForEachMacros:
56 | IncludeBlocks:   Preserve
57 | IndentCaseLabels: true
58 | # IndentPPDirectives: AfterHash
59 | IndentPPDirectives: None
60 | IndentWidth:     4
61 | IndentWrappedFunctionNames: false
62 | KeepEmptyLinesAtTheStartOfBlocks: true
63 | MacroBlockBegin: ''
64 | MacroBlockEnd:   ''
65 | MaxEmptyLinesToKeep: 1
66 | NamespaceIndentation: Inner
67 | PenaltyBreakAssignment: 2
68 | PenaltyBreakBeforeFirstCallParameter: 19
69 | PenaltyBreakComment: 300
70 | PenaltyBreakFirstLessLess: 120
71 | PenaltyBreakString: 1000
72 | PenaltyBreakTemplateDeclaration: 10
73 | PenaltyExcessCharacter: 1000000
74 | PenaltyReturnTypeOnItsOwnLine: 60
75 | PointerAlignment: Right
76 | ReflowComments:  false
77 | SortIncludes:    false
78 | SortUsingDeclarations: true
79 | SpaceAfterCStyleCast: false
80 | SpaceAfterTemplateKeyword: true
81 | SpaceBeforeAssignmentOperators: true
82 | SpaceBeforeCpp11BracedList: true
83 | SpaceBeforeCtorInitializerColon: true
84 | SpaceBeforeInheritanceColon: true
85 | SpaceBeforeParens: ControlStatements
86 | SpaceBeforeRangeBasedForLoopColon: true
87 | SpaceInEmptyParentheses: false
88 | SpacesBeforeTrailingComments: 1
89 | SpacesInAngles:  false
90 | SpacesInContainerLiterals: false
91 | SpacesInCStyleCastParentheses: false
92 | SpacesInParentheses: false
93 | SpacesInSquareBrackets: false
94 | Standard:        Cpp11
95 | TabWidth:        4
96 | UseTab:          Never
97 | ...
98 | # vim:ft=conf et ts=2 sw=2
99 | 


--------------------------------------------------------------------------------
/.github/workflows/scorecard.yml:
--------------------------------------------------------------------------------
 1 | # This workflow uses actions that are not certified by GitHub. They are provided
 2 | # by a third-party and are governed by separate terms of service, privacy
 3 | # policy, and support documentation.
 4 | 
 5 | name: Scorecard supply-chain security
 6 | on:
 7 |   # For Branch-Protection check. Only the default branch is supported. See
 8 |   # https://github.com/ossf/scorecard/blob/main/docs/checks.md#branch-protection
 9 |   branch_protection_rule:
10 |   # To guarantee Maintained check is occasionally updated. See
11 |   # https://github.com/ossf/scorecard/blob/main/docs/checks.md#maintained
12 |   schedule:
13 |     - cron: '00 12 * * 0-6'
14 |   push:
15 |     branches: [ "main" ]
16 | 
17 | # Declare default permissions as read only.
18 | permissions: read-all
19 | 
20 | jobs:
21 |   analysis:
22 |   
23 |     name: Scorecard analysis
24 |     if: github.repository == 'intel/x86-simd-sort'
25 |     runs-on: ubuntu-latest
26 |     permissions:
27 |       # Needed to upload the results to code-scanning dashboard.
28 |       security-events: write
29 |       # Needed to publish results and get a badge (see publish_results below).
30 |       id-token: write
31 |       # Uncomment the permissions below if installing in a private repository.
32 |       # contents: read
33 |       # actions: read
34 | 
35 |     steps:
36 |       - name: "Checkout code"
37 |         uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
38 |         with:
39 |           persist-credentials: false
40 | 
41 |       - name: "Run analysis"
42 |         uses: ossf/scorecard-action@62b2cac7ed8198b15735ed49ab1e5cf35480ba46 # v2.4.0
43 |         with:
44 |           results_file: results.sarif
45 |           results_format: sarif
46 |           # (Optional) "write" PAT token. Uncomment the `repo_token` line below if:
47 |           # - you want to enable the Branch-Protection check on a *public* repository, or
48 |           # - you are installing Scorecard on a *private* repository
49 |           # To create the PAT, follow the steps in https://github.com/ossf/scorecard-action#authentication-with-pat.
50 |           # repo_token: ${{ secrets.SCORECARD_TOKEN }}
51 | 
52 |           # Public repositories:
53 |           #   - Publish results to OpenSSF REST API for easy access by consumers
54 |           #   - Allows the repository to include the Scorecard badge.
55 |           #   - See https://github.com/ossf/scorecard-action#publishing-results.
56 |           # For private repositories:
57 |           #   - `publish_results` will always be set to `false`, regardless
58 |           #     of the value entered here.
59 |           publish_results: true
60 | 
61 |       # Upload the results as artifacts (optional). Commenting out will disable uploads of run results in SARIF
62 |       # format to the repository Actions tab.
63 |       - name: "Upload artifact"
64 |         uses: actions/upload-artifact@65c4c4a1ddee5b72f698fdd19549f0f0fb45cf08 # v4.6.0
65 |         with:
66 |           name: SARIF file
67 |           path: results.sarif
68 |           retention-days: 5
69 | 
70 |       # Upload the results to GitHub's code scanning dashboard.
71 |       - name: "Upload to code-scanning"
72 |         uses: github/codeql-action/upload-sarif@dd746615b3b9d728a6a37ca2045b68ca76d4841a # v3.28.8
73 |         with:
74 |           sarif_file: results.sarif
75 | 


--------------------------------------------------------------------------------
/benchmarks/bench-argsort.hpp:
--------------------------------------------------------------------------------
 1 | template <typename T>
 2 | std::vector<size_t> stdargsort(const std::vector<T> &array)
 3 | {
 4 |     std::vector<size_t> indices(array.size());
 5 |     std::iota(indices.begin(), indices.end(), 0);
 6 |     std::sort(indices.begin(),
 7 |               indices.end(),
 8 |               [&array](size_t left, size_t right) -> bool {
 9 |                   // sort indices according to corresponding array element
10 |                   return array[left] < array[right];
11 |               });
12 | 
13 |     return indices;
14 | }
15 | 
16 | template <typename T, class... Args>
17 | static void scalarargsort(benchmark::State &state, Args &&...args)
18 | {
19 |     // get args
20 |     auto args_tuple = std::make_tuple(std::move(args)...);
21 |     size_t arrsize = std::get<0>(args_tuple);
22 |     std::string arrtype = std::get<1>(args_tuple);
23 |     // set up array
24 |     std::vector<T> arr = get_array<T>(arrtype, arrsize);
25 |     std::vector<size_t> inx;
26 |     // benchmark
27 |     for (auto _ : state) {
28 |         inx = stdargsort(arr);
29 |     }
30 | }
31 | 
32 | template <typename T, class... Args>
33 | static void simdargsort(benchmark::State &state, Args &&...args)
34 | {
35 |     // get args
36 |     auto args_tuple = std::make_tuple(std::move(args)...);
37 |     size_t arrsize = std::get<0>(args_tuple);
38 |     std::string arrtype = std::get<1>(args_tuple);
39 |     // set up array
40 |     std::vector<T> arr = get_array<T>(arrtype, arrsize);
41 |     std::vector<size_t> inx;
42 |     // benchmark
43 |     for (auto _ : state) {
44 |         inx = x86simdsort::argsort(arr.data(), arrsize);
45 |     }
46 | }
47 | 
48 | template <typename T, class... Args>
49 | static void simd_revargsort(benchmark::State &state, Args &&...args)
50 | {
51 |     // get args
52 |     auto args_tuple = std::make_tuple(std::move(args)...);
53 |     size_t arrsize = std::get<0>(args_tuple);
54 |     std::string arrtype = std::get<1>(args_tuple);
55 |     // set up array
56 |     std::vector<T> arr = get_array<T>(arrtype, arrsize);
57 |     std::vector<size_t> inx;
58 |     // benchmark
59 |     for (auto _ : state) {
60 |         inx = x86simdsort::argsort(arr.data(), arrsize, false, true);
61 |     }
62 | }
63 | 
64 | template <typename T, class... Args>
65 | static void simd_ordern_argsort(benchmark::State &state, Args &&...args)
66 | {
67 |     // get args
68 |     auto args_tuple = std::make_tuple(std::move(args)...);
69 |     size_t arrsize = std::get<0>(args_tuple);
70 |     std::string arrtype = std::get<1>(args_tuple);
71 |     // set up array
72 |     std::vector<T> arr = get_array<T>(arrtype, arrsize);
73 |     std::vector<int32_t> arg(arrsize);
74 |     std::iota(arg.begin(), arg.end(), 0);
75 |     // benchmark
76 |     for (auto _ : state) {
77 |         std::vector<T> arr_bkp = arr;
78 |         x86simdsort::keyvalue_qsort(arr_bkp.data(), arg.data(), arrsize);
79 |         state.PauseTiming();
80 |         std::iota(arg.begin(), arg.end(), 0);
81 |         state.ResumeTiming();
82 |     }
83 | }
84 | 
// BENCH_BOTH(type) registers, through BENCH_SORT, all four argsort benchmarks
// defined above (simdargsort, simd_revargsort, simd_ordern_argsort and
// scalarargsort) for one element type.
#define BENCH_BOTH(type) \
    BENCH_SORT(simdargsort, type) \
    BENCH_SORT(simd_revargsort, type) \
    BENCH_SORT(simd_ordern_argsort, type) \
    BENCH_SORT(scalarargsort, type)

// Element types covered: 64-bit and 32-bit signed/unsigned integers and
// floating point.
BENCH_BOTH(int64_t)
BENCH_BOTH(uint64_t)
BENCH_BOTH(double)
BENCH_BOTH(int32_t)
BENCH_BOTH(uint32_t)
BENCH_BOTH(float)
97 | 


--------------------------------------------------------------------------------
/lib/x86simdsort-skx.cpp:
--------------------------------------------------------------------------------
 1 | // SKX specific routines:
 2 | 
 3 | #include "x86simdsort-static-incl.h"
 4 | #include "x86simdsort-internal.h"
 5 | 
// DEFINE_ALL_METHODS(type) emits explicit specializations of qsort, qselect,
// partial_qsort, argsort and argselect for `type`. Each specialization simply
// forwards all of its arguments to the x86simdsortStatic function of the same
// name. Note that argselect has no `descending` parameter. The macro is
// expanded inside namespace xss::avx512 further down in this file.
#define DEFINE_ALL_METHODS(type) \
    template <> \
    void qsort(type *arr, size_t arrsize, bool hasnan, bool descending) \
    { \
        x86simdsortStatic::qsort(arr, arrsize, hasnan, descending); \
    } \
    template <> \
    void qselect( \
            type *arr, size_t k, size_t arrsize, bool hasnan, bool descending) \
    { \
        x86simdsortStatic::qselect(arr, k, arrsize, hasnan, descending); \
    } \
    template <> \
    void partial_qsort( \
            type *arr, size_t k, size_t arrsize, bool hasnan, bool descending) \
    { \
        x86simdsortStatic::partial_qsort(arr, k, arrsize, hasnan, descending); \
    } \
    template <> \
    std::vector<size_t> argsort( \
            type *arr, size_t arrsize, bool hasnan, bool descending) \
    { \
        return x86simdsortStatic::argsort(arr, arrsize, hasnan, descending); \
    } \
    template <> \
    std::vector<size_t> argselect( \
            type *arr, size_t k, size_t arrsize, bool hasnan) \
    { \
        return x86simdsortStatic::argselect(arr, k, arrsize, hasnan); \
    }
36 | 
// DEFINE_KEYVALUE_METHODS_BASE(type1, type2) emits explicit specializations of
// keyvalue_qsort, keyvalue_select and keyvalue_partial_sort for a key array of
// type1 and a value array of type2. Each specialization forwards all of its
// arguments to the x86simdsortStatic function of the same name.
#define DEFINE_KEYVALUE_METHODS_BASE(type1, type2) \
    template <> \
    void keyvalue_qsort(type1 *key, \
                        type2 *val, \
                        size_t arrsize, \
                        bool hasnan, \
                        bool descending) \
    { \
        x86simdsortStatic::keyvalue_qsort( \
                key, val, arrsize, hasnan, descending); \
    } \
    template <> \
    void keyvalue_select(type1 *key, \
                         type2 *val, \
                         size_t k, \
                         size_t arrsize, \
                         bool hasnan, \
                         bool descending) \
    { \
        x86simdsortStatic::keyvalue_select( \
                key, val, k, arrsize, hasnan, descending); \
    } \
    template <> \
    void keyvalue_partial_sort(type1 *key, \
                               type2 *val, \
                               size_t k, \
                               size_t arrsize, \
                               bool hasnan, \
                               bool descending) \
    { \
        x86simdsortStatic::keyvalue_partial_sort( \
                key, val, k, arrsize, hasnan, descending); \
    }
70 | 
// DEFINE_KEYVALUE_METHODS(type) expands DEFINE_KEYVALUE_METHODS_BASE once per
// supported value type, pairing the key type `type` with uint64_t, int64_t,
// double, uint32_t, int32_t and float values.
#define DEFINE_KEYVALUE_METHODS(type) \
    DEFINE_KEYVALUE_METHODS_BASE(type, uint64_t) \
    DEFINE_KEYVALUE_METHODS_BASE(type, int64_t) \
    DEFINE_KEYVALUE_METHODS_BASE(type, double) \
    DEFINE_KEYVALUE_METHODS_BASE(type, uint32_t) \
    DEFINE_KEYVALUE_METHODS_BASE(type, int32_t) \
    DEFINE_KEYVALUE_METHODS_BASE(type, float)
78 | 
// Emit the forwarding specializations in xss::avx512: the single-array
// routines for the 32-bit and 64-bit integer and floating point types, then
// the key-value routines for each of those six key types (each paired with
// all six value types by DEFINE_KEYVALUE_METHODS).
namespace xss {
namespace avx512 {
    DEFINE_ALL_METHODS(uint32_t)
    DEFINE_ALL_METHODS(int32_t)
    DEFINE_ALL_METHODS(float)
    DEFINE_ALL_METHODS(uint64_t)
    DEFINE_ALL_METHODS(int64_t)
    DEFINE_ALL_METHODS(double)
    DEFINE_KEYVALUE_METHODS(uint64_t)
    DEFINE_KEYVALUE_METHODS(int64_t)
    DEFINE_KEYVALUE_METHODS(double)
    DEFINE_KEYVALUE_METHODS(uint32_t)
    DEFINE_KEYVALUE_METHODS(int32_t)
    DEFINE_KEYVALUE_METHODS(float)
} // namespace avx512
} // namespace xss
95 | 


--------------------------------------------------------------------------------
/benchmarks/bench-objsort.hpp:
--------------------------------------------------------------------------------
  1 | #include <cmath>
  2 | 
// Names of the distance metrics. Each array is passed as the `val` non-type
// template argument of Point3D, whose distance() compares the name against
// these same strings to select the computation.
static constexpr char x[] = "x";
static constexpr char euclidean[] = "euclidean";
static constexpr char taxicab[] = "taxicab";
static constexpr char chebyshev[] = "chebyshev";
  7 | 
  8 | template <typename T, const char *val>
  9 | struct Point3D {
 10 |     T x;
 11 |     T y;
 12 |     T z;
 13 |     static constexpr std::string_view name {val};
 14 |     Point3D()
 15 |     {
 16 |         x = (T)rand() / (T)RAND_MAX;
 17 |         y = (T)rand() / (T)RAND_MAX;
 18 |         z = (T)rand() / (T)RAND_MAX;
 19 |     }
 20 |     T distance()
 21 |     {
 22 |         if constexpr (name == "x") { return x; }
 23 |         else if constexpr (name == "euclidean") {
 24 |             return std::sqrt(x * x + y * y + z * z);
 25 |         }
 26 |         else if constexpr (name == "taxicab") {
 27 |             return std::abs(x) + std::abs(y) + std::abs(z);
 28 |         }
 29 |         else if constexpr (name == "chebyshev") {
 30 |             return std::max(std::max(std::abs(x), std::abs(y)), std::abs(z));
 31 |         }
 32 |     }
 33 | };
 34 | 
 35 | template <typename T>
 36 | std::vector<T> init_data(const int size)
 37 | {
 38 |     srand(42);
 39 |     std::vector<T> arr;
 40 |     for (auto ii = 0; ii < size; ++ii) {
 41 |         T temp;
 42 |         arr.push_back(temp);
 43 |     }
 44 |     return arr;
 45 | }
 46 | 
 47 | template <typename T>
 48 | struct less_than_key {
 49 |     inline bool operator()(T &p1, T &p2)
 50 |     {
 51 |         return (p1.distance() < p2.distance());
 52 |     }
 53 | };
 54 | 
 55 | template <typename T>
 56 | static void scalarobjsort(benchmark::State &state)
 57 | {
 58 |     // set up array
 59 |     std::vector<T> arr = init_data<T>(state.range(0));
 60 |     std::vector<T> arr_bkp = arr;
 61 |     // benchmark
 62 |     for (auto _ : state) {
 63 |         std::sort(arr.begin(), arr.end(), less_than_key<T>());
 64 |         state.PauseTiming();
 65 |         arr = arr_bkp;
 66 |         state.ResumeTiming();
 67 |     }
 68 | }
 69 | 
 70 | template <typename T>
 71 | static void simdobjsort(benchmark::State &state)
 72 | {
 73 |     // set up array
 74 |     std::vector<T> arr = init_data<T>(state.range(0));
 75 |     std::vector<T> arr_bkp = arr;
 76 |     // benchmark
 77 |     for (auto _ : state) {
 78 |         x86simdsort::object_qsort(
 79 |                 arr.data(), arr.size(), [](T p) { return p.distance(); });
 80 |         state.PauseTiming();
 81 |         if (!std::is_sorted(arr.begin(), arr.end(), less_than_key<T>())) {
 82 |             std::cout << "sorting failed \n";
 83 |         }
 84 |         arr = arr_bkp;
 85 |         state.ResumeTiming();
 86 |     }
 87 | }
 88 | 
// BENCHMARK_OBJSORT(func, T, type, dist) registers func<T<type, dist>> via
// BENCHMARK_TEMPLATE with six Arg() values. The literals are written as 10eN,
// i.e. 10 * 10^N, so the values run from 100 up to 10,000,000.
#define BENCHMARK_OBJSORT(func, T, type, dist) \
    BENCHMARK_TEMPLATE(func, T<type, dist>) \
            ->Arg(10e1) \
            ->Arg(10e2) \
            ->Arg(10e3) \
            ->Arg(10e4) \
            ->Arg(10e5) \
            ->Arg(10e6);
 97 | 
// BENCH_ALL(dtype) registers simdobjsort and scalarobjsort on
// Point3D<dtype, metric> for each of the four metrics: x, taxicab, euclidean
// and chebyshev.
#define BENCH_ALL(dtype) \
    BENCHMARK_OBJSORT(simdobjsort, Point3D, dtype, x) \
    BENCHMARK_OBJSORT(scalarobjsort, Point3D, dtype, x) \
    BENCHMARK_OBJSORT(simdobjsort, Point3D, dtype, taxicab) \
    BENCHMARK_OBJSORT(scalarobjsort, Point3D, dtype, taxicab) \
    BENCHMARK_OBJSORT(simdobjsort, Point3D, dtype, euclidean) \
    BENCHMARK_OBJSORT(scalarobjsort, Point3D, dtype, euclidean) \
    BENCHMARK_OBJSORT(simdobjsort, Point3D, dtype, chebyshev) \
    BENCHMARK_OBJSORT(scalarobjsort, Point3D, dtype, chebyshev)

// Run the whole matrix for double and float coordinates.
BENCH_ALL(double)
BENCH_ALL(float)
110 | 


--------------------------------------------------------------------------------
/lib/x86simdsort-avx2.cpp:
--------------------------------------------------------------------------------
 1 | // AVX2 specific routines:
 2 | 
 3 | #include "x86simdsort-static-incl.h"
 4 | #include "x86simdsort-internal.h"
 5 | 
// DEFINE_ALL_METHODS(type) emits explicit specializations of qsort, qselect,
// partial_qsort, argsort and argselect for `type`. Each specialization simply
// forwards all of its arguments to the x86simdsortStatic function of the same
// name. Note that argselect has no `descending` parameter. The macro is
// expanded inside namespace xss::avx2 further down in this file.
#define DEFINE_ALL_METHODS(type) \
    template <> \
    void qsort(type *arr, size_t arrsize, bool hasnan, bool descending) \
    { \
        x86simdsortStatic::qsort(arr, arrsize, hasnan, descending); \
    } \
    template <> \
    void qselect( \
            type *arr, size_t k, size_t arrsize, bool hasnan, bool descending) \
    { \
        x86simdsortStatic::qselect(arr, k, arrsize, hasnan, descending); \
    } \
    template <> \
    void partial_qsort( \
            type *arr, size_t k, size_t arrsize, bool hasnan, bool descending) \
    { \
        x86simdsortStatic::partial_qsort(arr, k, arrsize, hasnan, descending); \
    } \
    template <> \
    std::vector<size_t> argsort( \
            type *arr, size_t arrsize, bool hasnan, bool descending) \
    { \
        return x86simdsortStatic::argsort(arr, arrsize, hasnan, descending); \
    } \
    template <> \
    std::vector<size_t> argselect( \
            type *arr, size_t k, size_t arrsize, bool hasnan) \
    { \
        return x86simdsortStatic::argselect(arr, k, arrsize, hasnan); \
    }
36 | 
// DEFINE_KEYVALUE_METHODS_BASE(type1, type2) emits explicit specializations of
// keyvalue_qsort, keyvalue_select and keyvalue_partial_sort for a key array of
// type1 and a value array of type2. Each specialization forwards all of its
// arguments to the x86simdsortStatic function of the same name.
#define DEFINE_KEYVALUE_METHODS_BASE(type1, type2) \
    template <> \
    void keyvalue_qsort(type1 *key, \
                        type2 *val, \
                        size_t arrsize, \
                        bool hasnan, \
                        bool descending) \
    { \
        x86simdsortStatic::keyvalue_qsort( \
                key, val, arrsize, hasnan, descending); \
    } \
    template <> \
    void keyvalue_select(type1 *key, \
                         type2 *val, \
                         size_t k, \
                         size_t arrsize, \
                         bool hasnan, \
                         bool descending) \
    { \
        x86simdsortStatic::keyvalue_select( \
                key, val, k, arrsize, hasnan, descending); \
    } \
    template <> \
    void keyvalue_partial_sort(type1 *key, \
                               type2 *val, \
                               size_t k, \
                               size_t arrsize, \
                               bool hasnan, \
                               bool descending) \
    { \
        x86simdsortStatic::keyvalue_partial_sort( \
                key, val, k, arrsize, hasnan, descending); \
    }
70 | 
// DEFINE_KEYVALUE_METHODS(type) expands DEFINE_KEYVALUE_METHODS_BASE once per
// supported value type, pairing the key type `type` with uint64_t, int64_t,
// double, uint32_t, int32_t and float values.
#define DEFINE_KEYVALUE_METHODS(type) \
    DEFINE_KEYVALUE_METHODS_BASE(type, uint64_t) \
    DEFINE_KEYVALUE_METHODS_BASE(type, int64_t) \
    DEFINE_KEYVALUE_METHODS_BASE(type, double) \
    DEFINE_KEYVALUE_METHODS_BASE(type, uint32_t) \
    DEFINE_KEYVALUE_METHODS_BASE(type, int32_t) \
    DEFINE_KEYVALUE_METHODS_BASE(type, float)
78 | 
// Emit the forwarding specializations in xss::avx2: the single-array routines
// for the 32-bit and 64-bit integer and floating point types, then the
// key-value routines for each of those six key types (each paired with all
// six value types by DEFINE_KEYVALUE_METHODS).
namespace xss {
namespace avx2 {
    DEFINE_ALL_METHODS(uint32_t)
    DEFINE_ALL_METHODS(int32_t)
    DEFINE_ALL_METHODS(float)
    DEFINE_ALL_METHODS(uint64_t)
    DEFINE_ALL_METHODS(int64_t)
    DEFINE_ALL_METHODS(double)
    DEFINE_KEYVALUE_METHODS(uint64_t)
    DEFINE_KEYVALUE_METHODS(int64_t)
    DEFINE_KEYVALUE_METHODS(double)
    DEFINE_KEYVALUE_METHODS(uint32_t)
    DEFINE_KEYVALUE_METHODS(int32_t)
    DEFINE_KEYVALUE_METHODS(float)
} // namespace avx2
} // namespace xss


--------------------------------------------------------------------------------
/lib/list-of-exported-symbols.txt:
--------------------------------------------------------------------------------
 1 | std::vector<unsigned long, std::allocator<unsigned long> > x86simdsort::argselect<double>(double*, unsigned long, unsigned long, bool)
 2 | std::vector<unsigned long, std::allocator<unsigned long> > x86simdsort::argselect<float>(float*, unsigned long, unsigned long, bool)
 3 | std::vector<unsigned long, std::allocator<unsigned long> > x86simdsort::argselect<int>(int*, unsigned long, unsigned long, bool)
 4 | std::vector<unsigned long, std::allocator<unsigned long> > x86simdsort::argselect<long>(long*, unsigned long, unsigned long, bool)
 5 | std::vector<unsigned long, std::allocator<unsigned long> > x86simdsort::argselect<short>(short*, unsigned long, unsigned long, bool)
 6 | std::vector<unsigned long, std::allocator<unsigned long> > x86simdsort::argselect<unsigned int>(unsigned int*, unsigned long, unsigned long, bool)
 7 | std::vector<unsigned long, std::allocator<unsigned long> > x86simdsort::argselect<unsigned long>(unsigned long*, unsigned long, unsigned long, bool)
 8 | std::vector<unsigned long, std::allocator<unsigned long> > x86simdsort::argselect<unsigned short>(unsigned short*, unsigned long, unsigned long, bool)
 9 | std::vector<unsigned long, std::allocator<unsigned long> > x86simdsort::argsort<double>(double*, unsigned long, bool)
10 | std::vector<unsigned long, std::allocator<unsigned long> > x86simdsort::argsort<float>(float*, unsigned long, bool)
11 | std::vector<unsigned long, std::allocator<unsigned long> > x86simdsort::argsort<int>(int*, unsigned long, bool)
12 | std::vector<unsigned long, std::allocator<unsigned long> > x86simdsort::argsort<long>(long*, unsigned long, bool)
13 | std::vector<unsigned long, std::allocator<unsigned long> > x86simdsort::argsort<short>(short*, unsigned long, bool)
14 | std::vector<unsigned long, std::allocator<unsigned long> > x86simdsort::argsort<unsigned int>(unsigned int*, unsigned long, bool)
15 | std::vector<unsigned long, std::allocator<unsigned long> > x86simdsort::argsort<unsigned long>(unsigned long*, unsigned long, bool)
16 | std::vector<unsigned long, std::allocator<unsigned long> > x86simdsort::argsort<unsigned short>(unsigned short*, unsigned long, bool)
17 | void x86simdsort::partial_qsort<double>(double*, unsigned long, unsigned long, bool)
18 | void x86simdsort::partial_qsort<float>(float*, unsigned long, unsigned long, bool)
19 | void x86simdsort::partial_qsort<int>(int*, unsigned long, unsigned long, bool)
20 | void x86simdsort::partial_qsort<long>(long*, unsigned long, unsigned long, bool)
21 | void x86simdsort::partial_qsort<short>(short*, unsigned long, unsigned long, bool)
22 | void x86simdsort::partial_qsort<unsigned int>(unsigned int*, unsigned long, unsigned long, bool)
23 | void x86simdsort::partial_qsort<unsigned long>(unsigned long*, unsigned long, unsigned long, bool)
24 | void x86simdsort::partial_qsort<unsigned short>(unsigned short*, unsigned long, unsigned long, bool)
25 | void x86simdsort::qselect<double>(double*, unsigned long, unsigned long, bool)
26 | void x86simdsort::qselect<float>(float*, unsigned long, unsigned long, bool)
27 | void x86simdsort::qselect<int>(int*, unsigned long, unsigned long, bool)
28 | void x86simdsort::qselect<long>(long*, unsigned long, unsigned long, bool)
29 | void x86simdsort::qselect<short>(short*, unsigned long, unsigned long, bool)
30 | void x86simdsort::qselect<unsigned int>(unsigned int*, unsigned long, unsigned long, bool)
31 | void x86simdsort::qselect<unsigned long>(unsigned long*, unsigned long, unsigned long, bool)
32 | void x86simdsort::qselect<unsigned short>(unsigned short*, unsigned long, unsigned long, bool)
33 | void x86simdsort::qsort<double>(double*, unsigned long, bool)
34 | void x86simdsort::qsort<float>(float*, unsigned long, bool)
35 | void x86simdsort::qsort<int>(int*, unsigned long, bool)
36 | void x86simdsort::qsort<long>(long*, unsigned long, bool)
37 | void x86simdsort::qsort<short>(short*, unsigned long, bool)
38 | void x86simdsort::qsort<unsigned int>(unsigned int*, unsigned long, bool)
39 | void x86simdsort::qsort<unsigned long>(unsigned long*, unsigned long, bool)
40 | void x86simdsort::qsort<unsigned short>(unsigned short*, unsigned long, bool)
41 | _ZN11x86simdsort13partial_qsortIDF16_EEvPT_mmb
42 | _ZN11x86simdsort5qsortIDF16_EEvPT_mb
43 | _ZN11x86simdsort7argsortIDF16_EESt6vectorImSaImEEPT_mb
44 | _ZN11x86simdsort7qselectIDF16_EEvPT_mmb
45 | _ZN11x86simdsort9argselectIDF16_EESt6vectorImSaImEEPT_mmb
46 | 


--------------------------------------------------------------------------------
/lib/x86simdsort.h:
--------------------------------------------------------------------------------
  1 | #ifndef X86_SIMD_SORT
  2 | #define X86_SIMD_SORT
  3 | #include <stdint.h>
  4 | #include <vector>
  5 | #include <cstddef>
  6 | #include <functional>
  7 | #include <numeric>
  8 | 
// Symbol visibility attributes (GCC/Clang __attribute__ syntax):
// XSS_EXPORT_SYMBOL marks a symbol with "default" visibility, XSS_HIDE_SYMBOL
// with "hidden" visibility.
#define XSS_EXPORT_SYMBOL __attribute__((visibility("default")))
#define XSS_HIDE_SYMBOL __attribute__((visibility("hidden")))
// Evaluates x and discards the result by casting it to void.
#define UNUSED(x) (void)(x)
 12 | 
 13 | namespace x86simdsort {
 14 | 
// quicksort
// Declaration only; this header provides no generic body. hasnan and
// descending both default to false.
// NOTE(review): judging by the names, hasnan tells the routine that arr may
// contain NaNs and descending reverses the order -- confirm against the
// implementation.
template <typename T>
XSS_EXPORT_SYMBOL void
qsort(T *arr, size_t arrsize, bool hasnan = false, bool descending = false);
 19 | 
// quickselect
// Declaration only; this header provides no generic body. Takes the array,
// an index k and the array size; hasnan and descending default to false.
// NOTE(review): the exact meaning of k is not visible here -- confirm against
// the implementation.
template <typename T>
XSS_EXPORT_SYMBOL void qselect(T *arr,
                               size_t k,
                               size_t arrsize,
                               bool hasnan = false,
                               bool descending = false);
 27 | 
// partial sort
// Declaration only; this header provides no generic body. Same parameter
// list as qselect: array, k, array size, then hasnan and descending (both
// default to false).
template <typename T>
XSS_EXPORT_SYMBOL void partial_qsort(T *arr,
                                     size_t k,
                                     size_t arrsize,
                                     bool hasnan = false,
                                     bool descending = false);
 35 | 
// argsort
// Declaration only; this header provides no generic body. Returns a vector
// of size_t indices by value; hasnan and descending default to false.
template <typename T>
XSS_EXPORT_SYMBOL std::vector<size_t>
argsort(T *arr, size_t arrsize, bool hasnan = false, bool descending = false);
 40 | 
// argselect
// Declaration only; this header provides no generic body. Returns a vector
// of size_t indices by value. Unlike the other routines in this header it
// has no descending parameter; hasnan defaults to false.
template <typename T>
XSS_EXPORT_SYMBOL std::vector<size_t>
argselect(T *arr, size_t k, size_t arrsize, bool hasnan = false);
 45 | 
// keyvalue sort
// Declaration only; this header provides no generic body. Takes a key array
// (T1) and a value array (T2) together with a single arrsize; hasnan and
// descending default to false. object_qsort below relies on the value array
// being permuted along with the keys.
template <typename T1, typename T2>
XSS_EXPORT_SYMBOL void keyvalue_qsort(T1 *key,
                                      T2 *val,
                                      size_t arrsize,
                                      bool hasnan = false,
                                      bool descending = false);
 53 | 
// keyvalue select
// Declaration only; this header provides no generic body. Key/value
// counterpart of qselect: key array, value array, k and array size, then
// hasnan and descending (both default to false).
template <typename T1, typename T2>
XSS_EXPORT_SYMBOL void keyvalue_select(T1 *key,
                                       T2 *val,
                                       size_t k,
                                       size_t arrsize,
                                       bool hasnan = false,
                                       bool descending = false);
 62 | 
// keyvalue partial sort
// Declaration only; this header provides no generic body. Key/value
// counterpart of partial_qsort: key array, value array, k and array size,
// then hasnan and descending (both default to false).
template <typename T1, typename T2>
XSS_EXPORT_SYMBOL void keyvalue_partial_sort(T1 *key,
                                             T2 *val,
                                             size_t k,
                                             size_t arrsize,
                                             bool hasnan = false,
                                             bool descending = false);
 71 | 
 72 | // sort an object
 73 | template <typename T, typename U, typename Func>
 74 | XSS_EXPORT_SYMBOL void object_qsort(T *arr, U arrsize, Func key_func)
 75 | {
 76 |     static_assert(std::is_integral<U>::value, "arrsize must be an integral type");
 77 |     static_assert(sizeof(U) == sizeof(int32_t) || sizeof(U) == sizeof(int64_t),
 78 |                   "arrsize must be 32 or 64 bits");
 79 |     using return_type_of = typename decltype(std::function{key_func})::result_type;
 80 |     static_assert(sizeof(return_type_of) == sizeof(int32_t) || sizeof(return_type_of) == sizeof(int64_t),
 81 |                   "key_func return type must be 32 or 64 bits");
 82 |     std::vector<return_type_of> keys(arrsize);
 83 |     for (U ii = 0; ii < arrsize; ++ii) {
 84 |         keys[ii] = key_func(arr[ii]);
 85 |     }
 86 | 
 87 |     /* (2) Call arg based on keys using the keyvalue sort */
 88 |     std::vector<U> arg(arrsize);
 89 |     std::iota(arg.begin(), arg.end(), 0);
 90 |     x86simdsort::keyvalue_qsort(keys.data(), arg.data(), arrsize);
 91 | 
 92 |     /* (3) Permute obj array in-place */
 93 |     std::vector<bool> done(arrsize);
 94 |     for (U i = 0; i < arrsize; ++i) {
 95 |         if (done[i]) { continue; }
 96 |         done[i] = true;
 97 |         U prev_j = i;
 98 |         U j = arg[i];
 99 |         while (i != j) {
100 |             std::swap(arr[prev_j], arr[j]);
101 |             done[j] = true;
102 |             prev_j = j;
103 |             j = arg[j];
104 |         }
105 |     }
106 | }
107 | 
108 | } // namespace x86simdsort
109 | #endif
110 | 


--------------------------------------------------------------------------------
/tests/test-qsort-common.h:
--------------------------------------------------------------------------------
  1 | #ifndef AVX512_TEST_COMMON
  2 | #define AVX512_TEST_COMMON
  3 | 
  4 | #define XSS_DO_NOT_SET_SEED
  5 | 
  6 | #include "custom-compare.h"
  7 | #include "rand_array.h"
  8 | #include "x86simdsort.h"
  9 | #include <gtest/gtest.h>
 10 | 
// Expects `arg` to be a permutation of 0 .. arg.size()-1: a sorted copy of it
// must equal the sequence produced by std::iota starting at 0.
// NOTE: expands to several statements and declares the locals `sorted_arg`
// and `expected_arg`, so it can appear at most once per scope.
#define EXPECT_UNIQUE(arg) \
    auto sorted_arg = arg; \
    std::sort(sorted_arg.begin(), sorted_arg.end()); \
    std::vector<size_t> expected_arg(sorted_arg.size()); \
    std::iota(expected_arg.begin(), expected_arg.end(), 0); \
    EXPECT_EQ(sorted_arg, expected_arg) \
            << "Indices aren't unique. Array size = " << sorted_arg.size();

// Records an unconditional failure through ASSERT_TRUE(false), attaching
// `msg` together with the array size, the test type string and k.
// NOTE: the expansion already ends with a semicolon.
#define REPORT_FAIL(msg, size, type, k) \
    ASSERT_TRUE(false) << msg << ". arr size = " << size \
                       << ", type = " << type << ", k = " << k;
 22 | 
 23 | inline bool is_nan_test(std::string type)
 24 | {
 25 |     // Currently, determine whether the test uses nan just be checking if nan is in its name
 26 |     return type.find("nan") != std::string::npos;
 27 | }
 28 | 
 29 | template <typename T>
 30 | void IS_SORTED(std::vector<T> sorted, std::vector<T> arr, std::string type)
 31 | {
 32 |     if (arr.size() == 0) return;
 33 |     if (memcmp(arr.data(), sorted.data(), arr.size() * sizeof(T)) != 0) {
 34 |         REPORT_FAIL("Array not sorted", arr.size(), type, -1);
 35 |     }
 36 | }
 37 | 
 38 | template <typename T>
 39 | void IS_ARG_SORTED(std::vector<T> sortedarr,
 40 |                    std::vector<T> arr,
 41 |                    std::vector<size_t> arg,
 42 |                    std::string type)
 43 | {
 44 |     EXPECT_UNIQUE(arg)
 45 |     std::vector<T> arr_backup;
 46 |     for (auto ii : arg) {
 47 |         arr_backup.push_back(arr[ii]);
 48 |     }
 49 |     IS_SORTED(sortedarr, arr_backup, type);
 50 | }
 51 | 
 52 | template <typename T>
 53 | void IS_ARR_PARTITIONED(std::vector<T> arr,
 54 |                         size_t k,
 55 |                         T true_kth,
 56 |                         std::string type,
 57 |                         bool descending = false)
 58 | {
 59 |     std::function<bool(T, T)> cmp_eq, cmp_less, cmp_leq, cmp_geq;
 60 |     cmp_eq = compare<T, std::equal_to<T>>();
 61 | 
 62 |     if (!descending) {
 63 |         cmp_less = compare<T, std::less<T>>();
 64 |         cmp_leq = compare<T, std::less_equal<T>>();
 65 |         cmp_geq = compare<T, std::greater_equal<T>>();
 66 |     }
 67 |     else {
 68 |         cmp_less = compare<T, std::greater<T>>();
 69 |         cmp_leq = compare<T, std::greater_equal<T>>();
 70 |         cmp_geq = compare<T, std::less_equal<T>>();
 71 |     }
 72 | 
 73 |     // 1) arr[k] == sorted[k]; use memcmp to handle nan
 74 |     if (!cmp_eq(arr[k], true_kth)) {
 75 |         REPORT_FAIL("kth element is incorrect", arr.size(), type, k);
 76 |     }
 77 |     // ( 2) Elements to the left of k should be atmost arr[k]
 78 |     if (k >= 1) {
 79 |         T max_left
 80 |                 = *std::max_element(arr.begin(), arr.begin() + k - 1, cmp_less);
 81 |         if (!cmp_geq(arr[k], max_left)) {
 82 |             REPORT_FAIL("incorrect left partition", arr.size(), type, k);
 83 |         }
 84 |     }
 85 |     // 3) Elements to the right of k should be atleast arr[k]
 86 |     if (k != (size_t)(arr.size() - 1)) {
 87 |         T min_right
 88 |                 = *std::min_element(arr.begin() + k + 1, arr.end(), cmp_less);
 89 |         if (!cmp_leq(arr[k], min_right)) {
 90 |             REPORT_FAIL("incorrect right partition", arr.size(), type, k);
 91 |         }
 92 |     }
 93 | }
 94 | 
 95 | template <typename T>
 96 | void IS_ARR_PARTIALSORTED(std::vector<T> arr,
 97 |                           size_t k,
 98 |                           std::vector<T> sorted,
 99 |                           std::string type)
100 | {
101 |     if (memcmp(arr.data(), sorted.data(), k * sizeof(T)) != 0) {
102 |         REPORT_FAIL("Partial array not sorted", arr.size(), type, k);
103 |     }
104 | }
105 | 
106 | template <typename T>
107 | void IS_ARG_PARTITIONED(std::vector<T> arr,
108 |                         std::vector<size_t> arg,
109 |                         T true_kth,
110 |                         size_t k,
111 |                         std::string type)
112 | {
113 |     EXPECT_UNIQUE(arg)
114 |     std::vector<T> part_arr;
115 |     for (auto ii : arg) {
116 |         part_arr.push_back(arr[ii]);
117 |     }
118 |     IS_ARR_PARTITIONED(part_arr, k, true_kth, type);
119 | }
120 | #endif
121 | 


--------------------------------------------------------------------------------
/meson.build:
--------------------------------------------------------------------------------
  1 | project('x86-simd-sort', 'cpp',
  2 |         version : '7.0.x',
  3 |         license : 'BSD 3-clause',
  4 |         default_options : ['cpp_std=c++17'])
  5 | fs = import('fs')
  6 | cpp = meson.get_compiler('cpp')
  7 | src = include_directories('src')
  8 | lib = include_directories('lib')
  9 | bench = include_directories('benchmarks')
 10 | utils = include_directories('utils')
 11 | tests = include_directories('tests')
 12 | 
 13 | # Add IPP sort to benchmarks:
 14 | benchipp = false
 15 | ipplink = []
 16 | if get_option('build_ippbench')
 17 |   benchipp = true
 18 |   ipplink = ['-lipps', '-lippcore']
 19 | endif
 20 | 
 21 | # Essentially '-Werror' for the sanitizers; all problems become fatal with this set
 22 | if get_option('fatal_sanitizers')
 23 |   add_project_arguments([ '-fno-sanitize-recover=all' ], language: 'cpp')
 24 | endif
 25 | 
 26 | # Add google vqsort to benchmarks:
 27 | benchvq = false
 28 | if get_option('build_vqsortbench')
 29 |   benchvq = true
 30 | endif
 31 | 
 32 | # openMP:
 33 | omp = []
 34 | omp_dep = []
 35 | if get_option('use_openmp')
 36 |   omp = dependency('openmp', required : true)
 37 |   omp_dep = declare_dependency(dependencies: omp, compile_args: ['-DXSS_USE_OPENMP'])
 38 | endif
 39 | 
# Probe: can the C++ compiler build a snippet that uses the __m512h type and
# the _mm512_set1_ph / _mm512_min_ph intrinsics when invoked with
# -march=sapphirerapids? The result is shown in the configuration summary
# (presumably it also gates the FP16 targets in lib/ -- verify there).
fp16code = '''#include<immintrin.h>
int main() {
  __m512h temp = _mm512_set1_ph(1.0f);
  __m512h var2 = _mm512_min_ph(temp, temp);
  return 0;
}
'''
cancompilefp16 = cpp.compiles(fp16code, args:'-march=sapphirerapids')
 48 | 
 49 | subdir('lib')
 50 | if get_option('lib_type') == 'shared'
 51 |   libsimdsort = shared_library('x86simdsortcpp',
 52 |                              'lib/x86simdsort.cpp',
 53 |                              include_directories : [src, utils, lib],
 54 |                              link_with : [libtargets],
 55 |                              dependencies: [omp_dep],
 56 |                              gnu_symbol_visibility : 'inlineshidden',
 57 |                              install : true,
 58 |                              soversion : 1,
 59 |                             )
 60 | else
 61 |   libsimdsort = static_library('x86simdsortcpp',
 62 |                              'lib/x86simdsort.cpp',
 63 |                              include_directories : [src, utils, lib],
 64 |                              link_with : [libtargets],
 65 |                              dependencies: [omp_dep],
 66 |                              gnu_symbol_visibility : 'inlineshidden',
 67 |                              install : true,
 68 |                              pic: true,
 69 |                             )
 70 | endif
 71 | 
 72 | pkg_mod = import('pkgconfig')
 73 | pkg_mod.generate(libraries : libsimdsort,
 74 |                  version : '7.0',
 75 |                  name : 'libx86simdsortcpp',
 76 |                  filebase : 'x86simdsortcpp',
 77 |                  description : 'C++ template library for high performance SIMD based sorting routines.')
 78 | 
 79 | # Create a new dependency variable making it easy to use this as a subproject:
 80 | x86simdsortcpp_dep = declare_dependency(
 81 |   include_directories: include_directories('lib'),
 82 |   link_with: libsimdsort,
 83 | )
 84 | 
 85 | # Build test suite if option build_tests set to true
 86 | if get_option('build_tests')
 87 |   gtest_dep = dependency('gtest_main', required : true, static: false)
 88 |   subdir('tests')
 89 |   testexe = executable('testexe',
 90 |                      include_directories : [lib, utils],
 91 |                      dependencies : [gtest_dep, x86simdsortcpp_dep],
 92 |                      link_whole : [libtests],
 93 |                     )
 94 |   test('x86 simd sort tests', testexe)
 95 | endif
 96 | 
 97 | # Build benchmarking suite if option build_benchmarks is set to true
 98 | 
 99 | if get_option('build_benchmarks')
100 |   gbench_dep = dependency('benchmark', required : true, static: false)
101 |   thread_dep = dependency('threads') # libbenchmark could need pthread_create
102 |   subdir('benchmarks')
103 |   benchexe = executable('benchexe',
104 |                       include_directories : [src, lib, utils, bench],
105 |                       dependencies : [gbench_dep, thread_dep, x86simdsortcpp_dep],
106 |                       link_args: ['-lbenchmark_main', ipplink],
107 |                       link_whole : [libbench],
108 |                      )
109 | endif
110 | 
111 | summary({
112 |   'Can compile AVX-512 FP16 ISA': cancompilefp16,
113 |   'Build test content': get_option('build_tests'),
114 |   'Build benchmarks': get_option('build_benchmarks'),
115 |   },
116 |   section: 'Configuration',
117 |   bool_yn: true
118 |   )
119 | 
120 | 


--------------------------------------------------------------------------------
/src/xss-common-comparators.hpp:
--------------------------------------------------------------------------------
  1 | #ifndef XSS_COMMON_COMPARATORS
  2 | #define XSS_COMMON_COMPARATORS
  3 | 
  4 | template <typename type_t>
  5 | type_t prev_value(type_t value)
  6 | {
  7 |     // TODO this probably handles non-native float16 wrong
  8 |     if constexpr (std::is_floating_point<type_t>::value) {
  9 |         return std::nextafter(value, -std::numeric_limits<type_t>::infinity());
 10 |     }
 11 |     else {
 12 |         if (value > std::numeric_limits<type_t>::min()) { return value - 1; }
 13 |         else {
 14 |             return value;
 15 |         }
 16 |     }
 17 | }
 18 | 
 19 | template <typename type_t>
 20 | type_t next_value(type_t value)
 21 | {
 22 |     // TODO this probably handles non-native float16 wrong
 23 |     if constexpr (std::is_floating_point<type_t>::value) {
 24 |         return std::nextafter(value, std::numeric_limits<type_t>::infinity());
 25 |     }
 26 |     else {
 27 |         if (value < std::numeric_limits<type_t>::max()) { return value + 1; }
 28 |         else {
 29 |             return value;
 30 |         }
 31 |     }
 32 | }
 33 | 
 34 | template <typename vtype, typename mm_t>
 35 | X86_SIMD_SORT_INLINE void COEX(mm_t &a, mm_t &b);
 36 | 
 37 | template <typename vtype, bool descend>
 38 | struct Comparator {
 39 |     using reg_t = typename vtype::reg_t;
 40 |     using opmask_t = typename vtype::opmask_t;
 41 |     using type_t = typename vtype::type_t;
 42 | 
 43 |     X86_SIMD_SORT_FINLINE bool STDSortComparator(const type_t &a,
 44 |                                                  const type_t &b)
 45 |     {
 46 |         if constexpr (descend) { return comparison_func<vtype>(b, a); }
 47 |         else {
 48 |             return comparison_func<vtype>(a, b);
 49 |         }
 50 |     }
 51 | 
 52 |     X86_SIMD_SORT_FINLINE opmask_t PartitionComparator(reg_t a, reg_t b)
 53 |     {
 54 |         if constexpr (descend) { return vtype::ge(b, a); }
 55 |         else {
 56 |             return vtype::ge(a, b);
 57 |         }
 58 |     }
 59 | 
 60 |     X86_SIMD_SORT_FINLINE void COEX(reg_t &a, reg_t &b)
 61 |     {
 62 |         if constexpr (descend) { ::COEX<vtype, reg_t>(b, a); }
 63 |         else {
 64 |             ::COEX<vtype, reg_t>(a, b);
 65 |         }
 66 |     }
 67 | 
 68 |     // Returns a vector of values that would be sorted as far right as possible
 69 |     // For ascending order, this is the maximum possible value
 70 |     X86_SIMD_SORT_FINLINE reg_t rightmostPossibleVec()
 71 |     {
 72 |         if constexpr (descend) { return vtype::zmm_min(); }
 73 |         else {
 74 |             return vtype::zmm_max();
 75 |         }
 76 |     }
 77 | 
 78 |     // Returns the value that would be leftmost of the two when sorted
 79 |     // For ascending order, that is the smaller value
 80 |     X86_SIMD_SORT_FINLINE type_t leftmost(type_t smaller, type_t larger)
 81 |     {
 82 |         if constexpr (descend) {
 83 |             UNUSED(smaller);
 84 |             return larger;
 85 |         }
 86 |         else {
 87 |             UNUSED(larger);
 88 |             return smaller;
 89 |         }
 90 |     }
 91 | 
 92 |     // Returns the value that would be rightmost of the two when sorted
 93 |     // For ascending order, that is the larger value
 94 |     X86_SIMD_SORT_FINLINE type_t rightmost(type_t smaller, type_t larger)
 95 |     {
 96 |         if constexpr (descend) {
 97 |             UNUSED(larger);
 98 |             return smaller;
 99 |         }
100 |         else {
101 |             UNUSED(smaller);
102 |             return larger;
103 |         }
104 |     }
105 | 
106 |     // If median == smallest, that implies approximately half the array is equal to smallest, unless we were very unlucky with our sample
107 |     // Try just doing the next largest value greater than this seemingly very common value to seperate them out
108 |     X86_SIMD_SORT_FINLINE type_t choosePivotMedianIsSmallest(type_t median)
109 |     {
110 |         if constexpr (descend) { return median; }
111 |         else {
112 |             return next_value<type_t>(median);
113 |         }
114 |     }
115 | 
116 |     // If median == largest, that implies approximately half the array is equal to largest, unless we were very unlucky with our sample
117 |     // Thus, median probably is a fine pivot, since it will move all of this common value into its own partition
118 |     X86_SIMD_SORT_FINLINE type_t choosePivotMedianIsLargest(type_t median)
119 |     {
120 |         if constexpr (descend) { return prev_value<type_t>(median); }
121 |         else {
122 |             return median;
123 |         }
124 |     }
125 | };
126 | 
127 | #endif // XSS_COMMON_COMPARATORS
128 | 


--------------------------------------------------------------------------------
/lib/x86simdsort-scalar.h:
--------------------------------------------------------------------------------
  1 | #include "custom-compare.h"
  2 | #include <algorithm>
  3 | #include <numeric>
  4 | 
  5 | namespace xss {
  6 | namespace utils {
  7 |     /*
  8 |      * O(1) permute array in place: stolen from
  9 |      * http://www.davidespataro.it/apply-a-permutation-to-a-vector
 10 |      */
 11 |     template <typename T>
 12 |     void apply_permutation_in_place(T *arr, std::vector<size_t> arg)
 13 |     {
 14 |         for (size_t i = 0; i < arg.size(); i++) {
 15 |             size_t curr = i;
 16 |             size_t next = arg[curr];
 17 |             while (next != i) {
 18 |                 std::swap(arr[curr], arr[next]);
 19 |                 arg[curr] = curr;
 20 |                 curr = next;
 21 |                 next = arg[next];
 22 |             }
 23 |             arg[curr] = curr;
 24 |         }
 25 |     }
 26 |     template <typename T>
 27 |     decltype(auto) get_cmp_func(bool hasnan, bool reverse)
 28 |     {
 29 |         std::function<bool(T, T)> cmp;
 30 |         if (hasnan) {
 31 |             if (reverse == true) { cmp = compare<T, std::greater<T>>(); }
 32 |             else {
 33 |                 cmp = compare<T, std::less<T>>();
 34 |             }
 35 |         }
 36 |         else {
 37 |             if (reverse == true) { cmp = std::greater<T>(); }
 38 |             else {
 39 |                 cmp = std::less<T>();
 40 |             }
 41 |         }
 42 |         return cmp;
 43 |     }
 44 | } // namespace utils
 45 | 
 46 | namespace scalar {
 47 |     template <typename T>
 48 |     void qsort(T *arr, size_t arrsize, bool hasnan, bool reversed)
 49 |     {
 50 |         std::sort(arr,
 51 |                   arr + arrsize,
 52 |                   xss::utils::get_cmp_func<T>(hasnan, reversed));
 53 |     }
 54 | 
 55 |     template <typename T>
 56 |     void qselect(T *arr, size_t k, size_t arrsize, bool hasnan, bool reversed)
 57 |     {
 58 |         std::nth_element(arr,
 59 |                          arr + k,
 60 |                          arr + arrsize,
 61 |                          xss::utils::get_cmp_func<T>(hasnan, reversed));
 62 |     }
 63 |     template <typename T>
 64 |     void
 65 |     partial_qsort(T *arr, size_t k, size_t arrsize, bool hasnan, bool reversed)
 66 |     {
 67 |         std::partial_sort(arr,
 68 |                           arr + k,
 69 |                           arr + arrsize,
 70 |                           xss::utils::get_cmp_func<T>(hasnan, reversed));
 71 |     }
 72 |     template <typename T>
 73 |     std::vector<size_t>
 74 |     argsort(T *arr, size_t arrsize, bool hasnan, bool reversed)
 75 |     {
 76 |         UNUSED(hasnan);
 77 |         std::vector<size_t> arg(arrsize);
 78 |         std::iota(arg.begin(), arg.end(), 0);
 79 |         if (reversed) {
 80 |             std::sort(arg.begin(),
 81 |                       arg.end(),
 82 |                       compare_arg<T, std::greater<T>>(arr));
 83 |         }
 84 |         else {
 85 |             std::sort(
 86 |                     arg.begin(), arg.end(), compare_arg<T, std::less<T>>(arr));
 87 |         }
 88 |         return arg;
 89 |     }
 90 |     template <typename T>
 91 |     std::vector<size_t> argselect(T *arr, size_t k, size_t arrsize, bool hasnan)
 92 |     {
 93 |         UNUSED(hasnan);
 94 |         std::vector<size_t> arg(arrsize);
 95 |         std::iota(arg.begin(), arg.end(), 0);
 96 |         std::nth_element(arg.begin(),
 97 |                          arg.begin() + k,
 98 |                          arg.end(),
 99 |                          compare_arg<T, std::less<T>>(arr));
100 |         return arg;
101 |     }
102 |     template <typename T1, typename T2>
103 |     void keyvalue_qsort(
104 |             T1 *key, T2 *val, size_t arrsize, bool hasnan, bool descending)
105 |     {
106 |         std::vector<size_t> arg = argsort(key, arrsize, hasnan, descending);
107 |         utils::apply_permutation_in_place(key, arg);
108 |         utils::apply_permutation_in_place(val, arg);
109 |     }
110 |     template <typename T1, typename T2>
111 |     void keyvalue_select(T1 *key,
112 |                          T2 *val,
113 |                          size_t k,
114 |                          size_t arrsize,
115 |                          bool hasnan,
116 |                          bool descending)
117 |     {
118 |         // Note that this does a full kv-sort
119 |         UNUSED(k);
120 |         keyvalue_qsort(key, val, arrsize, hasnan, descending);
121 |     }
122 |     template <typename T1, typename T2>
123 |     void keyvalue_partial_sort(T1 *key,
124 |                                T2 *val,
125 |                                size_t k,
126 |                                size_t arrsize,
127 |                                bool hasnan,
128 |                                bool descending)
129 |     {
130 |         // Note that this does a full kv-sort
131 |         UNUSED(k);
132 |         keyvalue_qsort(key, val, arrsize, hasnan, descending);
133 |     }
134 | 
135 | } // namespace scalar
136 | } // namespace xss
137 | 


--------------------------------------------------------------------------------
/src/xss-common-includes.h:
--------------------------------------------------------------------------------
  1 | #ifndef XSS_COMMON_INCLUDES
  2 | #define XSS_COMMON_INCLUDES
  3 | #include <algorithm>
  4 | #include <cmath>
  5 | #include <cstdint>
  6 | #include <cstring>
  7 | #include <immintrin.h>
  8 | #include <limits>
  9 | #include <vector>
 10 | #include "xss-custom-float.h"
 11 | 
 12 | #define X86_SIMD_SORT_INFINITY std::numeric_limits<double>::infinity()
 13 | #define X86_SIMD_SORT_INFINITYF std::numeric_limits<float>::infinity()
 14 | #define X86_SIMD_SORT_INFINITYH 0x7c00
 15 | #define X86_SIMD_SORT_NEGINFINITYH 0xfc00
 16 | #define X86_SIMD_SORT_MAX_UINT16 std::numeric_limits<uint16_t>::max()
 17 | #define X86_SIMD_SORT_MAX_INT16 std::numeric_limits<int16_t>::max()
 18 | #define X86_SIMD_SORT_MIN_INT16 std::numeric_limits<int16_t>::min()
 19 | #define X86_SIMD_SORT_MAX_UINT32 std::numeric_limits<uint32_t>::max()
 20 | #define X86_SIMD_SORT_MAX_INT32 std::numeric_limits<int32_t>::max()
 21 | #define X86_SIMD_SORT_MIN_INT32 std::numeric_limits<int32_t>::min()
 22 | #define X86_SIMD_SORT_MAX_UINT64 std::numeric_limits<uint64_t>::max()
 23 | #define X86_SIMD_SORT_MAX_INT64 std::numeric_limits<int64_t>::max()
 24 | #define X86_SIMD_SORT_MIN_INT64 std::numeric_limits<int64_t>::min()
 25 | #define ZMM_MAX_DOUBLE _mm512_set1_pd(X86_SIMD_SORT_INFINITY)
 26 | #define ZMM_MAX_UINT64 _mm512_set1_epi64(X86_SIMD_SORT_MAX_UINT64)
 27 | #define ZMM_MAX_INT64 _mm512_set1_epi64(X86_SIMD_SORT_MAX_INT64)
 28 | #define ZMM_MAX_FLOAT _mm512_set1_ps(X86_SIMD_SORT_INFINITYF)
 29 | #define ZMM_MAX_UINT _mm512_set1_epi32(X86_SIMD_SORT_MAX_UINT32)
 30 | #define ZMM_MAX_INT _mm512_set1_epi32(X86_SIMD_SORT_MAX_INT32)
 31 | #define ZMM_MAX_HALF _mm512_set1_epi16(X86_SIMD_SORT_INFINITYH)
 32 | #define YMM_MAX_HALF _mm256_set1_epi16(X86_SIMD_SORT_INFINITYH)
 33 | #define ZMM_MAX_UINT16 _mm512_set1_epi16(X86_SIMD_SORT_MAX_UINT16)
 34 | #define ZMM_MAX_INT16 _mm512_set1_epi16(X86_SIMD_SORT_MAX_INT16)
 35 | #define SHUFFLE_MASK(a, b, c, d) (a << 6) | (b << 4) | (c << 2) | d
 36 | 
 37 | #define PRAGMA(x) _Pragma(#x)
 38 | #define UNUSED(x) (void)(x)
 39 | 
 40 | /* Compiler specific macros specific */
 41 | #ifdef _MSC_VER
 42 | #define X86_SIMD_SORT_INLINE_ONLY inline
 43 | #define X86_SIMD_SORT_INLINE static inline
 44 | #define X86_SIMD_SORT_FINLINE static __forceinline
 45 | #define LIKELY(x) (x)
 46 | #define UNLIKELY(x) (x)
 47 | #elif defined(__CYGWIN__)
 48 | /*
 49 |  * Force inline in cygwin to work around a compiler bug. See
 50 |  * https://github.com/numpy/numpy/pull/22315#issuecomment-1267757584
 51 |  */
 52 | #define X86_SIMD_SORT_INLINE_ONLY inline
 53 | #define X86_SIMD_SORT_INLINE static __attribute__((always_inline))
 54 | #define X86_SIMD_SORT_FINLINE static __attribute__((always_inline))
 55 | #elif defined(__GNUC__)
 56 | #define X86_SIMD_SORT_INLINE_ONLY inline
 57 | #define X86_SIMD_SORT_INLINE static inline
 58 | #define X86_SIMD_SORT_FINLINE static inline __attribute__((always_inline))
 59 | #define LIKELY(x) __builtin_expect((x), 1)
 60 | #define UNLIKELY(x) __builtin_expect((x), 0)
 61 | #else
 62 | #define X86_SIMD_SORT_INLINE_ONLY
 63 | #define X86_SIMD_SORT_INLINE static
 64 | #define X86_SIMD_SORT_FINLINE static
 65 | #define LIKELY(x) (x)
 66 | #define UNLIKELY(x) (x)
 67 | #endif
 68 | 
// Loop-unrolling hint placed in front of a loop. It expands to the Intel
// compiler's or GCC's (version 8 and later) unroll pragma, and to nothing for
// other compilers or whenever __SANITIZE_ADDRESS__ is defined.
#if defined(__INTEL_COMPILER) and !defined(__SANITIZE_ADDRESS__)
#define X86_SIMD_SORT_UNROLL_LOOP(num) PRAGMA(unroll(num))
#elif __GNUC__ >= 8 and !defined(__SANITIZE_ADDRESS__)
#define X86_SIMD_SORT_UNROLL_LOOP(num) PRAGMA(GCC unroll num)
#else
#define X86_SIMD_SORT_UNROLL_LOOP(num)
#endif
 76 | 
 77 | #define NETWORK_REVERSE_4LANES 0, 1, 2, 3
 78 | #define NETWORK_REVERSE_8LANES 0, 1, 2, 3, 4, 5, 6, 7
 79 | #define NETWORK_REVERSE_16LANES \
 80 |     0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
 81 | #define NETWORK_REVERSE_32LANES \
 82 |     31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, \
 83 |             13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
 84 | 
 85 | #if defined(XSS_USE_OPENMP) && defined(_OPENMP)
 86 | #define XSS_COMPILE_OPENMP
 87 | #include <omp.h>
 88 | 
 89 | // Limit the number of threads to 16: emperically determined, can be probably
 90 | // better tuned at a later stage
 91 | X86_SIMD_SORT_INLINE int xss_get_num_threads()
 92 | {
 93 |     return std::min(16, (int)omp_get_max_threads());
 94 | }
 95 | #endif
 96 | 
 97 | template <class... T>
 98 | constexpr bool always_false = false;
 99 | 
100 | typedef size_t arrsize_t;
101 | 
102 | template <typename type>
103 | struct zmm_vector;
104 | 
105 | template <typename type>
106 | struct ymm_vector;
107 | 
108 | template <typename type>
109 | struct avx2_vector;
110 | 
111 | template <typename type>
112 | struct avx2_half_vector;
113 | 
114 | enum class simd_type : int { AVX2, AVX512 };
115 | 
116 | template <typename vtype, typename T = typename vtype::type_t>
117 | X86_SIMD_SORT_INLINE bool comparison_func(const T &a, const T &b);
118 | 
119 | struct float16 {
120 |     uint16_t val;
121 | };
122 | 
123 | #endif // XSS_COMMON_INCLUDES
124 | 


--------------------------------------------------------------------------------
/src/avx512-16bit-common.h:
--------------------------------------------------------------------------------
  1 | /*******************************************************************
  2 |  * Copyright (C) 2022 Intel Corporation
  3 |  * SPDX-License-Identifier: BSD-3-Clause
  4 |  * Authors: Raghuveer Devulapalli <raghuveer.devulapalli@intel.com>
  5 |  * ****************************************************************/
  6 | 
  7 | #ifndef AVX512_16BIT_COMMON
  8 | #define AVX512_16BIT_COMMON
  9 | 
// Element-rearrangement helpers for 512-bit registers holding 32 elements of
// 16 bits each. `scale` is a group size counted in 16-bit elements; any value
// other than 2, 4, 8, 16 or 32 trips the static_assert in the final branch.
// Each helper converts through vtype::cast_to / vtype::cast_from so the work
// happens on __m512i.
struct avx512_16bit_swizzle_ops {
    // Exchanges the two halves of every group of `scale` elements.
    template <typename vtype, int scale>
    X86_SIMD_SORT_INLINE typename vtype::reg_t swap_n(typename vtype::reg_t reg)
    {
        __m512i v = vtype::cast_to(reg);

        if constexpr (scale == 2) {
            // Swap neighbouring 16-bit elements through an index table.
            constexpr static uint16_t arr[]
                    = {1,  0,  3,  2,  5,  4,  7,  6,  9,  8,  11,
                       10, 13, 12, 15, 14, 17, 16, 19, 18, 21, 20,
                       23, 22, 25, 24, 27, 26, 29, 28, 31, 30};
            __m512i mask = _mm512_loadu_si512(arr);
            v = _mm512_permutexvar_epi16(mask, v);
        }
        else if constexpr (scale == 4) {
            // Swap neighbouring 32-bit elements (pairs of 16-bit elements).
            v = _mm512_shuffle_epi32(v, (_MM_PERM_ENUM)0b10110001);
        }
        else if constexpr (scale == 8) {
            // Swap the two 64-bit halves of every 128-bit lane.
            v = _mm512_shuffle_epi32(v, (_MM_PERM_ENUM)0b01001110);
        }
        else if constexpr (scale == 16) {
            // Swap neighbouring 128-bit lanes.
            v = _mm512_shuffle_i64x2(v, v, 0b10110001);
        }
        else if constexpr (scale == 32) {
            // Swap the two 256-bit halves of the register.
            v = _mm512_shuffle_i64x2(v, v, 0b01001110);
        }
        else {
            static_assert(scale == -1, "should not be reached");
        }

        return vtype::cast_from(v);
    }

    // Reverses the element order inside every group of `scale` elements.
    template <typename vtype, int scale>
    X86_SIMD_SORT_INLINE typename vtype::reg_t
    reverse_n(typename vtype::reg_t reg)
    {
        __m512i v = vtype::cast_to(reg);

        // Reversing a group of two is the same as swapping its elements.
        if constexpr (scale == 2) { return swap_n<vtype, 2>(reg); }
        else if constexpr (scale == 4) {
            constexpr static uint16_t arr[]
                    = {3,  2,  1,  0,  7,  6,  5,  4,  11, 10, 9,
                       8,  15, 14, 13, 12, 19, 18, 17, 16, 23, 22,
                       21, 20, 27, 26, 25, 24, 31, 30, 29, 28};
            __m512i mask = _mm512_loadu_si512(arr);
            v = _mm512_permutexvar_epi16(mask, v);
        }
        else if constexpr (scale == 8) {
            // NOTE(review): this table is int16_t while the others are
            // uint16_t; the values are identical either way.
            constexpr static int16_t arr[]
                    = {7,  6,  5,  4,  3,  2,  1,  0,  15, 14, 13,
                       12, 11, 10, 9,  8,  23, 22, 21, 20, 19, 18,
                       17, 16, 31, 30, 29, 28, 27, 26, 25, 24};
            __m512i mask = _mm512_loadu_si512(arr);
            v = _mm512_permutexvar_epi16(mask, v);
        }
        else if constexpr (scale == 16) {
            constexpr static uint16_t arr[]
                    = {15, 14, 13, 12, 11, 10, 9,  8,  7,  6,  5,
                       4,  3,  2,  1,  0,  31, 30, 29, 28, 27, 26,
                       25, 24, 23, 22, 21, 20, 19, 18, 17, 16};
            __m512i mask = _mm512_loadu_si512(arr);
            v = _mm512_permutexvar_epi16(mask, v);
        }
        else if constexpr (scale == 32) {
            // Whole-register reversal is delegated to the vector type.
            return vtype::reverse(reg);
        }
        else {
            static_assert(scale == -1, "should not be reached");
        }

        return vtype::cast_from(v);
    }

    // Blends two registers: inside every group of `scale` elements the lower
    // half (mask bits set) is taken from `other` and the upper half from
    // `reg`.
    template <typename vtype, int scale>
    X86_SIMD_SORT_INLINE typename vtype::reg_t
    merge_n(typename vtype::reg_t reg, typename vtype::reg_t other)
    {
        __m512i v1 = vtype::cast_to(reg);
        __m512i v2 = vtype::cast_to(other);

        if constexpr (scale == 2) {
            v1 = _mm512_mask_blend_epi16(
                    0b01010101010101010101010101010101, v1, v2);
        }
        else if constexpr (scale == 4) {
            v1 = _mm512_mask_blend_epi16(
                    0b00110011001100110011001100110011, v1, v2);
        }
        else if constexpr (scale == 8) {
            v1 = _mm512_mask_blend_epi16(
                    0b00001111000011110000111100001111, v1, v2);
        }
        else if constexpr (scale == 16) {
            v1 = _mm512_mask_blend_epi16(
                    0b00000000111111110000000011111111, v1, v2);
        }
        else if constexpr (scale == 32) {
            v1 = _mm512_mask_blend_epi16(
                    0b00000000000000001111111111111111, v1, v2);
        }
        else {
            static_assert(scale == -1, "should not be reached");
        }

        return vtype::cast_from(v1);
    }
};
118 | 
119 | #endif // AVX512_16BIT_COMMON
120 | 


--------------------------------------------------------------------------------
/.github/workflows/build-numpy.yml:
--------------------------------------------------------------------------------
  1 | name: NumPy
  2 | 
  3 | on:
  4 |   push:
  5 |     branches: [ "main" ]
  6 |   pull_request:
  7 |     branches: [ "main" ]
  8 |   schedule:
  9 |   - cron: '0 5 * * *'
 10 | 
 11 | permissions: read-all
 12 | 
 13 | jobs:
 14 |   np-multiarray-tgl:
 15 | 
 16 |     if: github.repository == 'intel/x86-simd-sort'
 17 |     runs-on: intel-ubuntu-24.04
 18 | 
 19 |     steps:
 20 |     - name: Checkout x86-simd-sort
 21 |       uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
 22 |       with:
 23 |         fetch-depth: 0
 24 |         path: x86-simd-sort
 25 | 
 26 |     - name: Specify branch name
 27 |       working-directory: ${{ github.workspace }}/x86-simd-sort
 28 |       run: git switch -c pr-branch
 29 | 
 30 |     - name: Install build dependencies
 31 |       run: |
 32 |         sudo apt update
 33 |         sudo apt -y install g++-12 gcc-12 git
 34 | 
 35 |     - name: Checkout NumPy main
 36 |       uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
 37 |       with:
 38 |         repository: numpy/numpy
 39 |         submodules: recursive
 40 |         fetch-depth: 0
 41 |         ref: main
 42 |         path: numpy
 43 | 
 44 |     - uses: actions/setup-python@0a5c61591373683505ea898e09a3ea4f39ef2b9c # v5.0.0
 45 |       with:
 46 |         python-version: '3.11'
 47 | 
 48 |     - name: Install Intel SDE
 49 |       run: |
 50 |         curl -o /tmp/sde.tar.xz https://downloadmirror.intel.com/859732/sde-external-9.58.0-2025-06-16-lin.tar.xz
 51 |         mkdir /tmp/sde && tar -xvf /tmp/sde.tar.xz -C /tmp/sde/
 52 |         sudo mv /tmp/sde/* /opt/sde && sudo ln -s /opt/sde/sde64 /usr/bin/sde
 53 | 
 54 |     - name: Install NumPy dependencies
 55 |       working-directory: ${{ github.workspace }}/numpy
 56 |       run: |
 57 |         pip install -r requirements/build_requirements.txt
 58 |         pip install -r requirements/test_requirements.txt
 59 | 
 60 |     - name: Update x86-simd-sort
 61 |       working-directory: ${{ github.workspace }}/numpy
 62 |       run: |
 63 |         cd numpy/_core/src/npysort/x86-simd-sort
 64 |         git remote add temp ${{ github.workspace }}/x86-simd-sort
 65 |         git fetch temp
 66 |         git checkout temp/pr-branch
 67 | 
 68 |     - name: Build and run NumPy tests
 69 |       working-directory: ${{ github.workspace }}/numpy
 70 |       env:
 71 |         CXX: g++-12
 72 |         CC: gcc-12
 73 |       run: |
 74 |         spin build -- -Dallow-noblas=true
 75 |         export NUMPY_SITE=$(realpath build-install/usr/lib/python*/site-packages/)
 76 |         export PYTHONPATH="$PYTHONPATH:$NUMPY_SITE"
 77 |         cd build-install &&
 78 |         sde -tgl -- python -m pytest $NUMPY_SITE/numpy/_core/tests/test_multiarray.py
 79 | 
 80 |   np-multiarray-spr:
 81 | 
 82 |     if: github.repository == 'intel/x86-simd-sort'
 83 |     runs-on: intel-ubuntu-24.04
 84 | 
 85 |     steps:
 86 |     - name: Checkout x86-simd-sort
 87 |       uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
 88 |       with:
 89 |         fetch-depth: 0
 90 |         path: x86-simd-sort
 91 | 
 92 |     - name: Specify branch name
 93 |       working-directory: ${{ github.workspace }}/x86-simd-sort
 94 |       run: git switch -c pr-branch
 95 | 
 96 |     - name: Install build dependencies
 97 |       run: |
 98 |         sudo apt update
 99 |         sudo apt -y install g++-12 gcc-12 git
100 | 
101 |     - name: Install Intel SDE
102 |       run: |
103 |         curl -o /tmp/sde.tar.xz https://downloadmirror.intel.com/859732/sde-external-9.58.0-2025-06-16-lin.tar.xz
104 |         mkdir /tmp/sde && tar -xvf /tmp/sde.tar.xz -C /tmp/sde/
105 |         sudo mv /tmp/sde/* /opt/sde && sudo ln -s /opt/sde/sde64 /usr/bin/sde
106 | 
107 |     - name: Checkout NumPy main
108 |       uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
109 |       with:
110 |         repository: numpy/numpy
111 |         submodules: recursive
112 |         fetch-depth: 0
113 |         ref: main
114 |         path: numpy
115 | 
116 |     - uses: actions/setup-python@0a5c61591373683505ea898e09a3ea4f39ef2b9c # v5.0.0
117 |       with:
118 |         python-version: '3.11'
119 | 
120 |     - name: Install NumPy dependencies
121 |       working-directory: ${{ github.workspace }}/numpy
122 |       run: |
123 |         pip install -r requirements/build_requirements.txt
124 |         pip install -r requirements/test_requirements.txt
125 | 
126 |     - name: Update x86-simd-sort
127 |       working-directory: ${{ github.workspace }}/numpy
128 |       run: |
129 |         cd numpy/_core/src/npysort/x86-simd-sort
130 |         git remote add temp ${{ github.workspace }}/x86-simd-sort
131 |         git fetch temp
132 |         git checkout temp/pr-branch
133 | 
134 |     - name: Build NumPy with cpu baseline SPR
135 |       working-directory: ${{ github.workspace }}/numpy
136 |       env:
137 |         CXX: g++-12
138 |         CC: gcc-12
139 |       run: |
140 |         spin build -- -Dallow-noblas=true -Dcpu-baseline=avx512_spr
141 | 
142 |     - name: Run tests on SPR
143 |       working-directory: ${{ github.workspace }}/numpy
144 |       run: |
145 |         export NUMPY_SITE=$(realpath build-install/usr/lib/python*/site-packages/)
146 |         export PYTHONPATH="$PYTHONPATH:$NUMPY_SITE"
147 |         cd build-install &&
148 |         sde -spr -- python -c "import numpy; numpy.show_config()" &&
149 |         sde -spr -- python -m pytest $NUMPY_SITE/numpy/_core/tests/test_multiarray.py
150 | 


--------------------------------------------------------------------------------
/CODE_OF_CONDUCT.md:
--------------------------------------------------------------------------------
  1 | # Contributor Covenant Code of Conduct
  2 | 
  3 | ## Our Pledge
  4 | 
  5 | We as members, contributors, and leaders pledge to make participation in our
  6 | community a harassment-free experience for everyone, regardless of age, body
  7 | size, visible or invisible disability, ethnicity, sex characteristics, gender
  8 | identity and expression, level of experience, education, socio-economic status,
  9 | nationality, personal appearance, race, caste, color, religion, or sexual
 10 | identity and orientation.
 11 | 
 12 | We pledge to act and interact in ways that contribute to an open, welcoming,
 13 | diverse, inclusive, and healthy community.
 14 | 
 15 | ## Our Standards
 16 | 
 17 | Examples of behavior that contributes to a positive environment for our
 18 | community include:
 19 | 
 20 | * Demonstrating empathy and kindness toward other people
 21 | * Being respectful of differing opinions, viewpoints, and experiences
 22 | * Giving and gracefully accepting constructive feedback
 23 | * Accepting responsibility and apologizing to those affected by our mistakes,
 24 |   and learning from the experience
 25 | * Focusing on what is best not just for us as individuals, but for the overall
 26 |   community
 27 | 
 28 | Examples of unacceptable behavior include:
 29 | 
 30 | * The use of sexualized language or imagery, and sexual attention or advances of
 31 |   any kind
 32 | * Trolling, insulting or derogatory comments, and personal or political attacks
 33 | * Public or private harassment
 34 | * Publishing others' private information, such as a physical or email address,
 35 |   without their explicit permission
 36 | * Other conduct which could reasonably be considered inappropriate in a
 37 |   professional setting
 38 | 
 39 | ## Enforcement Responsibilities
 40 | 
 41 | Community leaders are responsible for clarifying and enforcing our standards of
 42 | acceptable behavior and will take appropriate and fair corrective action in
 43 | response to any behavior that they deem inappropriate, threatening, offensive,
 44 | or harmful.
 45 | 
 46 | Community leaders have the right and responsibility to remove, edit, or reject
 47 | comments, commits, code, wiki edits, issues, and other contributions that are
 48 | not aligned to this Code of Conduct, and will communicate reasons for moderation
 49 | decisions when appropriate.
 50 | 
 51 | ## Scope
 52 | 
 53 | This Code of Conduct applies within all community spaces, and also applies when
 54 | an individual is officially representing the community in public spaces.
 55 | Examples of representing our community include using an official e-mail address,
 56 | posting via an official social media account, or acting as an appointed
 57 | representative at an online or offline event.
 58 | 
 59 | ## Enforcement
 60 | 
 61 | Instances of abusive, harassing, or otherwise unacceptable behavior may be
 62 | reported to the community leaders responsible for enforcement at
 63 | CommunityCodeOfConduct AT intel DOT com.
 64 | All complaints will be reviewed and investigated promptly and fairly.
 65 | 
 66 | All community leaders are obligated to respect the privacy and security of the
 67 | reporter of any incident.
 68 | 
 69 | ## Enforcement Guidelines
 70 | 
 71 | Community leaders will follow these Community Impact Guidelines in determining
 72 | the consequences for any action they deem in violation of this Code of Conduct:
 73 | 
 74 | ### 1. Correction
 75 | 
 76 | **Community Impact**: Use of inappropriate language or other behavior deemed
 77 | unprofessional or unwelcome in the community.
 78 | 
 79 | **Consequence**: A private, written warning from community leaders, providing
 80 | clarity around the nature of the violation and an explanation of why the
 81 | behavior was inappropriate. A public apology may be requested.
 82 | 
 83 | ### 2. Warning
 84 | 
 85 | **Community Impact**: A violation through a single incident or series of
 86 | actions.
 87 | 
 88 | **Consequence**: A warning with consequences for continued behavior. No
 89 | interaction with the people involved, including unsolicited interaction with
 90 | those enforcing the Code of Conduct, for a specified period of time. This
 91 | includes avoiding interactions in community spaces as well as external channels
 92 | like social media. Violating these terms may lead to a temporary or permanent
 93 | ban.
 94 | 
 95 | ### 3. Temporary Ban
 96 | 
 97 | **Community Impact**: A serious violation of community standards, including
 98 | sustained inappropriate behavior.
 99 | 
100 | **Consequence**: A temporary ban from any sort of interaction or public
101 | communication with the community for a specified period of time. No public or
102 | private interaction with the people involved, including unsolicited interaction
103 | with those enforcing the Code of Conduct, is allowed during this period.
104 | Violating these terms may lead to a permanent ban.
105 | 
106 | ### 4. Permanent Ban
107 | 
108 | **Community Impact**: Demonstrating a pattern of violation of community
109 | standards, including sustained inappropriate behavior, harassment of an
110 | individual, or aggression toward or disparagement of classes of individuals.
111 | 
112 | **Consequence**: A permanent ban from any sort of public interaction within the
113 | community.
114 | 
115 | ## Attribution
116 | 
117 | This Code of Conduct is adapted from the [Contributor Covenant][homepage],
118 | version 2.1, available at
119 | [https://www.contributor-covenant.org/version/2/1/code_of_conduct.html][v2.1].
120 | 
121 | Community Impact Guidelines were inspired by
122 | [Mozilla's code of conduct enforcement ladder][Mozilla CoC].
123 | 
124 | For answers to common questions about this code of conduct, see the FAQ at
125 | [https://www.contributor-covenant.org/faq][FAQ]. Translations are available at
126 | [https://www.contributor-covenant.org/translations][translations].
127 | 
128 | [homepage]: https://www.contributor-covenant.org
129 | [v2.1]: https://www.contributor-covenant.org/version/2/1/code_of_conduct.html
130 | [Mozilla CoC]: https://github.com/mozilla/diversity
131 | [FAQ]: https://www.contributor-covenant.org/faq
132 | 


--------------------------------------------------------------------------------
/utils/rand_array.h:
--------------------------------------------------------------------------------
  1 | /*******************************************
  2 |  * * Copyright (C) 2022 Intel Corporation
  3 |  * * SPDX-License-Identifier: BSD-3-Clause
  4 |  * *******************************************/
  5 | #ifndef UTILS_RAND_ARRAY
  6 | #define UTILS_RAND_ARRAY
  7 | 
  8 | #include <iostream>
  9 | #include <random>
 10 | #include <type_traits>
 11 | #include <vector>
 12 | #include <algorithm>
 13 | #include "xss-custom-float.h"
 14 | 
 15 | // Returns `arrsize` pseudo-random values of type T drawn between min and max.
 16 | template <typename T>
 17 | static std::vector<T> get_uniform_rand_array(int64_t arrsize,
 18 |                                              T max = xss::fp::max<T>(),
 19 |                                              T min = xss::fp::min<T>())
 20 | {
 21 |     std::vector<T> arr;
 22 |     // One allocation up front; a non-positive size yields an empty vector.
 23 |     if (arrsize > 0) { arr.reserve(static_cast<size_t>(arrsize)); }
 24 |     std::random_device rd;
 25 |     if constexpr (std::is_floating_point_v<T>) {
 26 |         std::mt19937 gen(rd());
 27 | #ifndef XSS_DO_NOT_SET_SEED
 28 |         gen.seed(42); // fixed seed keeps the generated data reproducible
 29 | #endif
 30 |         std::uniform_real_distribution<T> dis(min, max);
 31 |         for (int64_t ii = 0; ii < arrsize; ++ii)
 32 |             arr.emplace_back(dis(gen));
 33 |     }
 34 | #ifdef __FLT16_MAX__
 35 |     else if constexpr (std::is_same_v<T, _Float16>) {
 36 |         // _Float16 ignores min/max: values are rand() scaled into [0, 1].
 37 |         (void)(max);
 38 |         (void)(min);
 39 |         // 64-bit counter: an `int` index would overflow for huge arrsize.
 40 |         for (int64_t jj = 0; jj < arrsize; ++jj)
 41 |             arr.push_back((_Float16)((float)rand() / (float)(RAND_MAX)));
 42 |     }
 43 | #endif
 44 |     else if constexpr (std::is_integral_v<T>) {
 45 |         std::default_random_engine e1(rd());
 46 | #ifndef XSS_DO_NOT_SET_SEED
 47 |         e1.seed(42); // fixed seed keeps the generated data reproducible
 48 | #endif
 49 |         std::uniform_int_distribution<T> uniform_dist(min, max);
 50 |         for (int64_t ii = 0; ii < arrsize; ++ii)
 51 |             arr.emplace_back(uniform_dist(e1));
 52 |     }
 53 |     return arr;
 54 | }
 54 | 
 55 | // Variant of get_uniform_rand_array whose result holds no repeated value, so
 56 | // it may be shorter than `arrsize`. The surviving values are returned in a
 57 | // fixed-seed shuffled order (reproducible, not sorted).
 58 | template <typename T>
 59 | static std::vector<T> get_uniform_rand_array_with_uniquevalues(
 60 |         int64_t arrsize, T max = xss::fp::max<T>(), T min = xss::fp::min<T>())
 61 | {
 62 |     std::vector<T> arr = get_uniform_rand_array<T>(arrsize, max, min);
 63 |     // std::unique only collapses *adjacent* repeats, so equal values must be
 64 |     // made neighbours first; otherwise scattered duplicates survive.
 65 |     std::sort(arr.begin(), arr.end());
 66 |     arr.erase(std::unique(arr.begin(), arr.end()), arr.end());
 67 |     // Undo the ordering introduced by the sort so callers still get
 68 |     // unsorted data.
 69 |     std::shuffle(arr.begin(), arr.end(), std::default_random_engine(42));
 70 |     return arr;
 71 | }
 65 | 
 66 | // Builds a test array of `arrsize` elements whose layout is chosen by
 67 | // `arrtype`: "random", "sorted", "constant", "reverse", "smallrange",
 68 | // "random_5d", "max_at_the_end", "rand_with_nan", "rand_max" or
 69 | // "rand_with_max_and_nan". Random data is drawn between `min` and `max`
 70 | // unless a branch below says otherwise. For non-floating-point T,
 71 | // std::numeric_limits<T>::max() stands in wherever a floating-point T would
 72 | // receive infinity or NaN.
 73 | // Returns an empty vector when arrsize is 0 or when `arrtype` is not
 74 | // recognized (the latter also prints a warning to std::cout).
 75 | // Note: the parameters are ordered (min, max) here but (max, min) in
 76 | // get_uniform_rand_array.
 77 | template <typename T>
 78 | static std::vector<T> get_array(std::string arrtype,
 79 |                                 size_t arrsize,
 80 |                                 T min = xss::fp::min<T>(),
 81 |                                 T max = xss::fp::max<T>())
 82 | {
 83 |     std::vector<T> arr;
 84 |     if (arrsize == 0) return arr;
 85 |     if (arrtype == "random") {
 86 |         arr = get_uniform_rand_array<T>(arrsize, max, min);
 87 |     }
 88 |     else if (arrtype == "sorted") {
 89 |         arr = get_uniform_rand_array<T>(arrsize, max, min);
 90 |         std::sort(arr.begin(), arr.end());
 91 |     }
 92 |     else if (arrtype == "constant") {
 93 |         // A single random value repeated arrsize times.
 94 |         T temp = get_uniform_rand_array<T>(1, max, min)[0];
 95 |         arr.assign(arrsize, temp);
 96 |     }
 97 |     else if (arrtype == "reverse") {
 98 |         arr = get_uniform_rand_array<T>(arrsize, max, min);
 99 |         std::sort(arr.begin(), arr.end());
100 |         std::reverse(arr.begin(), arr.end());
101 |     }
102 |     else if (arrtype == "smallrange") {
103 |         // Values between 1 and 20; min/max are not used here.
104 |         arr = get_uniform_rand_array<T>(arrsize, 20, 1);
105 |     }
106 |     else if (arrtype == "random_5d") {
107 |         // Half of the elements (at least one) are random values drawn with
108 |         // get_uniform_rand_array's default range (min/max are not forwarded),
109 |         // the rest are zero; the result is shuffled with a fixed seed.
110 |         size_t temp = std::max((size_t)1, (size_t)(0.5 * arrsize));
111 |         std::vector<T> temparr = get_uniform_rand_array<T>(temp);
112 |         for (size_t ii = 0; ii < arrsize; ++ii) {
113 |             if (ii < temp) { arr.push_back(temparr[ii]); }
114 |             else {
115 |                 arr.push_back((T)0);
116 |             }
117 |         }
118 |         std::shuffle(arr.begin(), arr.end(), std::default_random_engine(42));
119 |     }
120 |     else if (arrtype == "max_at_the_end") {
121 |         arr = get_uniform_rand_array<T>(arrsize, max, min);
122 |         // `if constexpr`, like the sibling branches below, so only the
123 |         // assignment that applies to T is instantiated.
124 |         if constexpr (xss::fp::is_floating_point_v<T>) {
125 |             arr[arrsize - 1] = xss::fp::infinity<T>();
126 |         }
127 |         else {
128 |             arr[arrsize - 1] = std::numeric_limits<T>::max();
129 |         }
130 |     }
131 |     else if (arrtype == "rand_with_nan") {
132 |         arr = get_uniform_rand_array<T>(arrsize, max, min);
133 |         // (10 % arrsize) randomly chosen positions are overwritten; the
134 |         // random positions may repeat.
135 |         int64_t num_nans = 10 % arrsize;
136 |         std::vector<int64_t> rand_indx
137 |                 = get_uniform_rand_array<int64_t>(num_nans, arrsize - 1, 0);
138 |         T val;
139 |         if constexpr (xss::fp::is_floating_point_v<T>) {
140 |             val = xss::fp::quiet_NaN<T>();
141 |         }
142 |         else {
143 |             val = std::numeric_limits<T>::max();
144 |         }
145 |         for (auto ind : rand_indx) {
146 |             arr[ind] = val;
147 |         }
148 |     }
149 |     else if (arrtype == "rand_max") {
150 |         arr = get_uniform_rand_array<T>(arrsize, max, min);
151 |         T val;
152 |         if constexpr (xss::fp::is_floating_point_v<T>) {
153 |             val = xss::fp::infinity<T>();
154 |         }
155 |         else {
156 |             val = std::numeric_limits<T>::max();
157 |         }
158 |         // An element is replaced whenever rand() returns an odd number.
159 |         for (size_t ii = 0; ii < arrsize; ++ii) {
160 |             if (rand() & 0x1) { arr[ii] = val; }
161 |         }
162 |     }
163 |     else if (arrtype == "rand_with_max_and_nan") {
164 |         arr = get_uniform_rand_array<T>(arrsize, max, min);
165 |         T max_val;
166 |         T nan_val;
167 |         if constexpr (xss::fp::is_floating_point_v<T>) {
168 |             max_val = xss::fp::infinity<T>();
169 |             nan_val = xss::fp::quiet_NaN<T>();
170 |         }
171 |         else {
172 |             max_val = std::numeric_limits<T>::max();
173 |             nan_val = std::numeric_limits<T>::max();
174 |         }
175 |         // rand() % 4: 2 -> max_val, 3 -> nan_val, 0 and 1 keep the random value.
176 |         for (size_t ii = 0; ii < arrsize; ++ii) {
177 |             int res = rand() % 4;
178 |             if (res == 2) { arr[ii] = max_val; }
179 |             else if (res == 3) {
180 |                 arr[ii] = nan_val;
181 |             }
182 |         }
183 |     }
184 |     else {
185 |         std::cout << "Warning: unrecognized array type " << arrtype
186 |                   << std::endl;
187 |     }
188 |     return arr;
189 | }
170 | 
171 | #endif // UTILS_RAND_ARRAY
172 | 


--------------------------------------------------------------------------------
/src/avx512fp16-16bit-qsort.hpp:
--------------------------------------------------------------------------------
  1 | /*******************************************************************
  2 |  * Copyright (C) 2022 Intel Corporation
  3 |  * SPDX-License-Identifier: BSD-3-Clause
  4 |  * Authors: Raghuveer Devulapalli <raghuveer.devulapalli@intel.com>
  5 |  * ****************************************************************/
  6 | 
  7 | #ifndef AVX512FP16_QSORT_16BIT
  8 | #define AVX512FP16_QSORT_16BIT
  9 | 
 10 | #include "avx512-16bit-common.h"
 11 | 
 12 | typedef union { // one 16-bit storage cell, viewable as a half float or as raw bits
 13 |     _Float16 f_; // the storage read as a _Float16 value
 14 |     uint16_t i_; // the same storage read/written as an unsigned 16-bit pattern
 15 | } Fp16Bits;
 16 | 
 17 | template <>
 18 | struct zmm_vector<_Float16> { // _Float16 operations on 512-bit registers, built on AVX-512 intrinsics
 19 |     using type_t = _Float16; // element type
 20 |     using reg_t = __m512h; // full-width register
 21 |     using halfreg_t = __m256h; // half-width register
 22 |     using opmask_t = __mmask32; // 32-bit lane mask
 23 |     static const uint8_t numlanes = 32; // elements per register
 24 |     static constexpr int network_sort_threshold = 128; // presumably the size below which sorting networks are used (see src/README.md) - confirm
 25 |     static constexpr int partition_unroll_factor = 8; // consumed outside this struct; not used here
 26 |     static constexpr simd_type vec_type = simd_type::AVX512;
 27 | 
 28 |     using swizzle_ops = avx512_16bit_swizzle_ops;
 29 | 
 30 |     static type_t type_max() // value whose bit pattern is X86_SIMD_SORT_INFINITYH (macro defined elsewhere)
 31 |     {
 32 |         Fp16Bits val;
 33 |         val.i_ = X86_SIMD_SORT_INFINITYH;
 34 |         return val.f_;
 35 |     }
 36 |     static type_t type_min() // value whose bit pattern is X86_SIMD_SORT_NEGINFINITYH (macro defined elsewhere)
 37 |     {
 38 |         Fp16Bits val;
 39 |         val.i_ = X86_SIMD_SORT_NEGINFINITYH;
 40 |         return val.f_;
 41 |     }
 42 |     static reg_t zmm_max() // register with every lane set to type_max()
 43 |     {
 44 |         return _mm512_set1_ph(type_max());
 45 |     }
 46 |     static reg_t zmm_min() // register with every lane set to type_min()
 47 |     {
 48 |         return _mm512_set1_ph(type_min());
 49 |     }
 50 |     static opmask_t knot_opmask(opmask_t x) // bitwise NOT of the mask
 51 |     {
 52 |         return _knot_mask32(x);
 53 |     }
 54 |     static opmask_t ge(reg_t x, reg_t y) // per-lane x >= y (ordered, quiet predicate)
 55 |     {
 56 |         return _mm512_cmp_ph_mask(x, y, _CMP_GE_OQ);
 57 |     }
 58 |     static opmask_t eq(reg_t x, reg_t y) // per-lane x == y (ordered, quiet predicate)
 59 |     {
 60 |         return _mm512_cmp_ph_mask(x, y, _CMP_EQ_OQ);
 61 |     }
 62 |     static opmask_t get_partial_loadmask(uint64_t num_to_read) // mask with the low num_to_read bits set
 63 |     {
 64 |         return ((0x1ull << num_to_read) - 0x1ull);
 65 |     }
 66 |     static int32_t convert_mask_to_int(opmask_t mask) // mask bits returned as an int32_t
 67 |     {
 68 |         return mask;
 69 |     }
 70 |     template <int type>
 71 |     static opmask_t fpclass(reg_t x) // per-lane class test; `type` is the category immediate
 72 |     {
 73 |         return _mm512_fpclass_ph_mask(x, type);
 74 |     }
 75 |     static reg_t loadu(void const *mem) // unaligned load of a full register
 76 |     {
 77 |         return _mm512_loadu_ph(mem);
 78 |     }
 79 |     static reg_t max(reg_t x, reg_t y) // per-lane maximum
 80 |     {
 81 |         return _mm512_max_ph(x, y);
 82 |     }
 83 |     static void mask_compressstoreu(void *mem, opmask_t mask, reg_t x) // stores the mask-selected lanes contiguously at mem
 84 |     {
 85 |         __m512i temp = _mm512_castph_si512(x);
 86 |         // AVX512_VBMI2
 87 |         return _mm512_mask_compressstoreu_epi16(mem, mask, temp);
 88 |     }
 89 |     static reg_t maskz_loadu(opmask_t mask, void const *mem) // masked load; unselected lanes are zeroed
 90 |     {
 91 |         return _mm512_castsi512_ph(_mm512_maskz_loadu_epi16(mask, mem));
 92 |     }
 93 |     static reg_t mask_loadu(reg_t x, opmask_t mask, void const *mem) // masked load; unselected lanes come from x
 94 |     {
 95 |         // AVX512BW
 96 |         return _mm512_castsi512_ph(
 97 |                 _mm512_mask_loadu_epi16(_mm512_castph_si512(x), mask, mem));
 98 |     }
 99 |     static reg_t mask_mov(reg_t x, opmask_t mask, reg_t y) // selected lanes from y, the rest from x
100 |     {
101 |         return _mm512_castsi512_ph(_mm512_mask_mov_epi16(
102 |                 _mm512_castph_si512(x), mask, _mm512_castph_si512(y)));
103 |     }
104 |     static void mask_storeu(void *mem, opmask_t mask, reg_t x) // writes only the mask-selected lanes
105 |     {
106 |         return _mm512_mask_storeu_epi16(mem, mask, _mm512_castph_si512(x));
107 |     }
108 |     static reg_t min(reg_t x, reg_t y) // per-lane minimum
109 |     {
110 |         return _mm512_min_ph(x, y);
111 |     }
112 |     static reg_t permutexvar(__m512i idx, reg_t zmm) // rearranges the lanes of zmm according to idx
113 |     {
114 |         return _mm512_permutexvar_ph(idx, zmm);
115 |     }
116 |     static type_t reducemax(reg_t v) // maximum across all lanes
117 |     {
118 |         return _mm512_reduce_max_ph(v);
119 |     }
120 |     static type_t reducemin(reg_t v) // minimum across all lanes
121 |     {
122 |         return _mm512_reduce_min_ph(v);
123 |     }
124 |     static reg_t set1(type_t v) // broadcasts v to every lane
125 |     {
126 |         return _mm512_set1_ph(v);
127 |     }
128 |     template <uint8_t mask>
129 |     static reg_t shuffle(reg_t zmm) // same immediate applied by the "hi" and then the "lo" 16-bit shuffle
130 |     {
131 |         __m512i temp = _mm512_shufflehi_epi16(_mm512_castph_si512(zmm),
132 |                                               (_MM_PERM_ENUM)mask);
133 |         return _mm512_castsi512_ph(
134 |                 _mm512_shufflelo_epi16(temp, (_MM_PERM_ENUM)mask));
135 |     }
136 |     static void storeu(void *mem, reg_t x) // unaligned store of a full register
137 |     {
138 |         return _mm512_storeu_ph(mem, x);
139 |     }
140 |     static reg_t reverse(reg_t zmm) // permutes by NETWORK_REVERSE_32LANES (defined elsewhere; presumably reverses lane order - confirm)
141 |     {
142 |         constexpr static uint16_t arr[] = {NETWORK_REVERSE_32LANES};
143 |         const auto rev_index = _mm512_loadu_si512(arr);
144 |         return permutexvar(rev_index, zmm);
145 |     }
146 |     static reg_t sort_vec(reg_t x) // delegates to sort_reg_32lanes for this vector type
147 |     {
148 |         return sort_reg_32lanes<zmm_vector<type_t>>(x);
149 |     }
150 |     static reg_t cast_from(__m512i v) // reinterprets integer register bits as halves (no value conversion)
151 |     {
152 |         return _mm512_castsi512_ph(v);
153 |     }
154 |     static __m512i cast_to(reg_t v) // reinterprets half register bits as integers (no value conversion)
155 |     {
156 |         return _mm512_castph_si512(v);
157 |     }
158 |     static bool all_false(opmask_t k) // true when no mask bit is set
159 |     {
160 |         return k == 0;
161 |     }
162 |     static int double_compressstore(type_t *left_addr, // forwards to avx512_double_compressstore
163 |                                     type_t *right_addr,
164 |                                     opmask_t k,
165 |                                     reg_t reg)
166 |     {
167 |         return avx512_double_compressstore<zmm_vector<type_t>>(
168 |                 left_addr, right_addr, k, reg);
169 |     }
170 | };
171 | 
172 | template <>
173 | X86_SIMD_SORT_INLINE_ONLY bool is_a_nan<_Float16>(_Float16 elem)
174 | {
175 |     return !(elem == elem); // only a NaN fails to compare equal to itself
176 | }
177 | 
178 | // Overwrites nan_count elements at one end of arr with a NaN: the front of
179 | // the array when `descending` is set, the back of it otherwise.
180 | template <>
181 | X86_SIMD_SORT_INLINE_ONLY void replace_inf_with_nan(_Float16 *arr,
182 |                                                     arrsize_t size,
183 |                                                     arrsize_t nan_count,
184 |                                                     bool descending)
185 | {
186 |     Fp16Bits nan_bits;
187 |     nan_bits.i_ = 0x7c01; // exponent all ones, mantissa non-zero => NaN
188 |     const _Float16 nan_value = nan_bits.f_;
189 | 
190 |     // `cursor` walks inwards from whichever end receives the NaNs.
191 |     arrsize_t cursor = descending ? 0 : size;
192 |     while (nan_count > 0) {
193 |         if (descending) { arr[cursor++] = nan_value; }
194 |         else {
195 |             arr[--cursor] = nan_value;
196 |         }
197 |         --nan_count;
198 |     }
199 | }
200 | #endif // AVX512FP16_QSORT_16BIT
201 | 


--------------------------------------------------------------------------------
/src/README.md:
--------------------------------------------------------------------------------
  1 | # x86-simd-sort
  2 | 
  3 | C++ header file library for SIMD based 16-bit, 32-bit and 64-bit data type
  4 | sorting algorithms on x86 processors. We currently have AVX-512 and AVX2 based
  5 | implementation of quicksort, quickselect, partialsort, argsort, argselect &
  6 | key-value sort. The static methods can be used by including
  7 | `src/x86simdsort-static-incl.h` file. Compiling them with the appropriate
  8 | compiler flags will choose either the AVX-512 or AVX2 versions. For AVX-512, we
  9 | recommend using -march=skylake-avx512 for 32-bit and 64-bit datatypes,
 10 | -march=icelake-client for 16-bit datatype and -march=sapphirerapids for
 11 | _Float16. For AVX2 just using -mavx2 will suffice. The following APIs are
 12 | currently supported:
 13 | 
 14 | #### Quicksort
 15 | 
 16 | Equivalent to `qsort` in
 17 | [C](https://www.tutorialspoint.com/c_standard_library/c_function_qsort.htm) or
 18 | `std::sort` in [C++](https://en.cppreference.com/w/cpp/algorithm/sort).
 19 | 
 20 | ```cpp
 21 | void x86simdsortStatic::qsort<T>(T* arr, size_t arrsize, bool hasnan = false, bool descending = false);
 22 | ```
 23 | Supported datatypes: `uint16_t`, `int16_t`, `_Float16`, `uint32_t`, `int32_t`,
 24 | `float`, `uint64_t`, `int64_t` and `double`. AVX2 versions currently support
 25 | 32-bit and 64-bit dtypes only. For floating-point types, if `arr` contains
 26 | NaNs, they are moved to the end and replaced with a quiet NaN. That is, the
 27 | original, bit-exact NaNs in the input are not preserved.
 28 | 
 29 | #### Quickselect
 30 | Equivalent to `std::nth_element` in
 31 | [C++](https://en.cppreference.com/w/cpp/algorithm/nth_element) or
 32 | `np.partition` in
 33 | [NumPy](https://numpy.org/doc/stable/reference/generated/numpy.partition.html).
 34 | 
 35 | 
 36 | ```cpp
 37 | void x86simdsortStatic::qselect<T>(T* arr, size_t k, size_t arrsize, bool hasnan = false, bool descending = false);
 38 | ```
 39 | Supported datatypes: `uint16_t`, `int16_t`, `_Float16`, `uint32_t`, `int32_t`,
 40 | `float`, `uint64_t`, `int64_t` and `double`. AVX2 versions currently support
 41 | 32-bit and 64-bit dtypes only. For floating-point types, if `bool hasnan` is
 42 | set, NaNs are moved to the end of the array, preserving the bit-exact NaNs in
 43 | the input. If NaNs are present but `hasnan` is `false`, the behavior is
 44 | undefined.
 45 | 
 46 | #### Partialsort
 47 | Equivalent to `std::partial_sort` in
 48 | [C++](https://en.cppreference.com/w/cpp/algorithm/partial_sort).
 49 | 
 50 | 
 51 | ```cpp
 52 | void x86simdsortStatic::partial_qsort<T>(T* arr, size_t k, size_t arrsize, bool hasnan = false, bool descending = false);
 53 | ```
 54 | Supported datatypes: `uint16_t`, `int16_t`, `_Float16`, `uint32_t`, `int32_t`,
 55 | `float`, `uint64_t`, `int64_t` and `double`. AVX2 versions currently support
 56 | 32-bit and 64-bit dtypes only. For floating-point types, if `bool hasnan` is
 57 | set, NaNs are moved to the end of the array, preserving the bit-exact NaNs in
 58 | the input. If NaNs are present but `hasnan` is `false`, the behavior is
 59 | undefined.
 60 | 
 61 | #### Argsort
 62 | Equivalent to `np.argsort` in
 63 | [NumPy](https://numpy.org/doc/stable/reference/generated/numpy.argsort.html).
 64 | 
 65 | ```cpp
 66 | void x86simdsortStatic::argsort<T>(T* arr, size_t *arg, size_t arrsize, bool hasnan = false, bool descending = false);
 67 | ```
 68 | Supported datatypes: `uint32_t`, `int32_t`, `float`, `uint64_t`, `int64_t` and
 69 | `double`.
 70 | 
 71 | The algorithm resorts to scalar `std::sort` if the array contains NaNs.
 72 | 
 73 | #### Argselect
 74 | Equivalent to `np.argpartition` in
 75 | [NumPy](https://numpy.org/doc/stable/reference/generated/numpy.argpartition.html).
 76 | 
 77 | ```cpp
 78 | void x86simdsortStatic::argselect<T>(T* arr, size_t *arg, size_t k, size_t arrsize, bool hasnan = false);
 79 | ```
 80 | Supported datatypes: `uint32_t`, `int32_t`, `float`, `uint64_t`, `int64_t` and
 81 | `double`.
 82 | 
 83 | The algorithm resorts to scalar `std::sort` if the array contains NaNs.
 84 | 
 85 | #### Key-value sort
 86 | ```cpp
 87 | void x86simdsortStatic::keyvalue_qsort<T1, T2>(T1* key, T2* value, size_t arrsize, bool hasnan = false, bool descending = false);
 88 | ```
 89 | Supported datatypes: `uint32_t`, `int32_t`, `float`, `uint64_t`, `int64_t` and
 90 | `double`.
 91 | 
 92 | ## Algorithm details
 93 | 
 94 | The ideas and code are based on these two research papers [1] and [2]. On a
 95 | high level, the idea is to vectorize quicksort partitioning using AVX-512
 96 | compressstore instructions. If the array size is less than a certain threshold
 97 | (typically 512, 256, 128 or 64), then we use sorting networks [4,5] implemented
 98 | on AVX512/AVX registers. Article [4] is a good resource for bitonic sorting
 99 | network. Article [5] lists optimal sorting networks for various array sizes.
100 | The core implementations of the vectorized qsort functions `avx*_qsort<T>(T*,
101 | size_t)` are modified versions of avx2 quicksort presented in the paper [2] and
102 | source code associated with that paper [3].
103 | 
104 | ## Example to include and build this in a C++ code
105 | 
106 | ### Sample code `main.cpp`
107 | 
108 | ```cpp
109 | #include "src/x86simdsort-static-incl.h"
110 | 
111 | int main() {
112 |     const int ARRSIZE = 1000;
113 |     std::vector<float> arr;
114 | 
115 |     /* Initialize elements in reverse order */
116 |     for (int ii = 0; ii < ARRSIZE; ++ii) {
117 |         arr.push_back(ARRSIZE - ii);
118 |     }
119 | 
120 |     /* call avx512 quicksort */
121 |     x86simdsortStatic::qsort(arr.data(), ARRSIZE);
122 |     return 0;
123 | }
124 | 
125 | ```
126 | 
127 | ### Build using g++
128 | 
129 | ```
130 | g++ main.cpp -mavx512f -mavx512dq -mavx512vl -O3 /* for AVX-512 */
131 | g++ main.cpp -mavx2 -O3 /* for AVX2 */
132 | ```
133 | 
134 | If you are using src files directly, then it is a header file only and we do
135 | not provide any compile-time or run-time checks, which are recommended when
136 | including this in your source code. The header files are integrated into
137 | [NumPy](https://github.com/numpy/numpy) code base and this [pull
138 | request](https://github.com/numpy/numpy/pull/22315) is a good reference for how
139 | to include and build this library with your source code.
140 | 
141 | ## Build requirements
142 | 
143 | The sorting routines rely only on the C++ Standard Library and require a
144 | relatively modern compiler to build (ex: gcc 8.x and above).
145 | 
146 | ## Instruction set requirements
147 | 
148 | The `avx512_*` routines can only run on processors that have AVX-512.
149 | Specifically, the 32-bit and 64-bit require AVX-512F and AVX-512DQ instruction
150 | set. The 16-bit sorting requires the AVX-512F, AVX-512BW and AVX-512 VBMI2
151 | instruction set. Sorting `_Float16` will require AVX-512FP16.
152 | 
153 | The `avx2_*` routines require AVX/AVX2 instruction set. We currently only
154 | support 32-bit and 64-bit data for AVX2 based methods with plans to extend that
155 | to all the other routines and data types.
156 | 
157 | ## References
158 | 
159 | * [1] Fast and Robust Vectorized In-Place Sorting of Primitive Types
160 |     https://drops.dagstuhl.de/opus/volltexte/2021/13775/
161 | 
162 | * [2] A Novel Hybrid Quicksort Algorithm Vectorized using AVX-512 on Intel
163 | Skylake https://arxiv.org/pdf/1704.08579.pdf
164 | 
165 | * [3] https://github.com/simd-sorting/fast-and-robust: SPDX-License-Identifier: MIT
166 | 
167 | * [4] http://mitp-content-server.mit.edu:18180/books/content/sectbyfn?collid=books_pres_0&fn=Chapter%2027.pdf&id=8030
168 | 
169 | * [5] https://bertdobbelaere.github.io/sorting_networks.html
170 | 


--------------------------------------------------------------------------------
/src/x86simdsort-static-incl.h:
--------------------------------------------------------------------------------
  1 | #ifndef X86_SIMD_SORT_STATIC_METHODS
  2 | #define X86_SIMD_SORT_STATIC_METHODS
  3 | #include <vector>
  4 | #include <stdlib.h>
  5 | #include "xss-common-includes.h"
  6 | 
  7 | // Entry points of the static interface, declared here for quick reference.
  8 | // XSS_METHODS(ISA) below defines each one as a thin forwarder to the
  9 | // corresponding ISA##_* routine; hasnan and descending are passed through
 10 | // unchanged.
 11 | namespace x86simdsortStatic {
 12 | 
 13 | // qsort -> ISA##_qsort(arr, size, hasnan, descending)
 14 | template <typename T>
 15 | X86_SIMD_SORT_FINLINE void
 16 | qsort(T *arr, size_t size, bool hasnan = false, bool descending = false);
 17 | 
 18 | // qselect -> ISA##_qselect(arr, k, size, hasnan, descending)
 19 | template <typename T>
 20 | X86_SIMD_SORT_FINLINE void qselect(
 21 |         T *arr, size_t k, size_t size, bool hasnan = false, bool descending = false);
 22 | 
 23 | // partial_qsort -> ISA##_partial_qsort(arr, k, size, hasnan, descending)
 24 | template <typename T>
 25 | X86_SIMD_SORT_FINLINE void partial_qsort(
 26 |         T *arr, size_t k, size_t size, bool hasnan = false, bool descending = false);
 27 | 
 28 | // Convenience overload: fills a vector with 0..size-1, passes it to the
 29 | // pointer-based argsort below and returns it.
 30 | template <typename T>
 31 | X86_SIMD_SORT_FINLINE std::vector<size_t>
 32 | argsort(T *arr, size_t size, bool hasnan = false, bool descending = false);
 33 | 
 34 | /* argsort API required by NumPy; arg is caller-provided and forwarded to
 35 |  * ISA##_argsort(arr, arg, size, hasnan, descending). */
 36 | template <typename T>
 37 | X86_SIMD_SORT_FINLINE void argsort(
 38 |         T *arr, size_t *arg, size_t size, bool hasnan = false, bool descending = false);
 39 | 
 40 | // Convenience overload: fills a vector with 0..size-1, passes it to the
 41 | // pointer-based argselect below and returns it.
 42 | template <typename T>
 43 | X86_SIMD_SORT_FINLINE std::vector<size_t>
 44 | argselect(T *arr, size_t k, size_t size, bool hasnan = false);
 45 | 
 46 | /* argselect API required by NumPy; arg is caller-provided and forwarded to
 47 |  * ISA##_argselect(arr, arg, k, size, hasnan). */
 48 | template <typename T>
 49 | X86_SIMD_SORT_FINLINE void
 50 | argselect(T *arr, size_t *arg, size_t k, size_t size, bool hasnan = false);
 51 | 
 52 | // keyvalue_qsort -> ISA##_qsort_kv(key, val, size, hasnan, descending)
 53 | template <typename T1, typename T2>
 54 | X86_SIMD_SORT_FINLINE void keyvalue_qsort(
 55 |         T1 *key, T2 *val, size_t size, bool hasnan = false, bool descending = false);
 56 | 
 57 | // keyvalue_select -> ISA##_select_kv(key, val, k, size, hasnan, descending)
 58 | template <typename T1, typename T2>
 59 | X86_SIMD_SORT_FINLINE void keyvalue_select(T1 *key,
 60 |                                            T2 *val,
 61 |                                            size_t k,
 62 |                                            size_t size,
 63 |                                            bool hasnan = false,
 64 |                                            bool descending = false);
 65 | 
 66 | // keyvalue_partial_sort -> ISA##_partial_sort_kv(key, val, k, size, hasnan, descending)
 67 | template <typename T1, typename T2>
 68 | X86_SIMD_SORT_FINLINE void keyvalue_partial_sort(
 69 |         T1 *key, T2 *val, size_t k, size_t size, bool hasnan = false, bool descending = false);
 70 | 
 71 | } // namespace x86simdsortStatic
 72 | 
 73 | /* XSS_METHODS(ISA): defines each x86simdsortStatic entry point declared above
 74 |  * as a forwarder to the ISA-prefixed routine (used below with avx512 and avx2). */
 75 | #define XSS_METHODS(ISA) \
 76 |     /* qsort -> ISA##_qsort */ \
 77 |     template <typename T> \
 78 |     X86_SIMD_SORT_FINLINE void x86simdsortStatic::qsort( \
 79 |             T *arr, size_t size, bool hasnan, bool descending) \
 80 |     { \
 81 |         ISA##_qsort(arr, size, hasnan, descending); \
 82 |     } \
 83 |     /* qselect -> ISA##_qselect */ \
 84 |     template <typename T> \
 85 |     X86_SIMD_SORT_FINLINE void x86simdsortStatic::qselect( \
 86 |             T *arr, size_t k, size_t size, bool hasnan, bool descending) \
 87 |     { \
 88 |         ISA##_qselect(arr, k, size, hasnan, descending); \
 89 |     } \
 90 |     /* partial_qsort -> ISA##_partial_qsort */ \
 91 |     template <typename T> \
 92 |     X86_SIMD_SORT_FINLINE void x86simdsortStatic::partial_qsort( \
 93 |             T *arr, size_t k, size_t size, bool hasnan, bool descending) \
 94 |     { \
 95 |         ISA##_partial_qsort(arr, k, size, hasnan, descending); \
 96 |     } \
 97 |     /* argsort into a caller-provided index buffer -> ISA##_argsort */ \
 98 |     template <typename T> \
 99 |     X86_SIMD_SORT_FINLINE void x86simdsortStatic::argsort( \
100 |             T *arr, size_t *arg, size_t size, bool hasnan, bool descending) \
101 |     { \
102 |         ISA##_argsort(arr, arg, size, hasnan, descending); \
103 |     } \
104 |     /* argsort returning a vector: starts from the indices 0..size-1 */ \
105 |     template <typename T> \
106 |     X86_SIMD_SORT_FINLINE std::vector<size_t> x86simdsortStatic::argsort( \
107 |             T *arr, size_t size, bool hasnan, bool descending) \
108 |     { \
109 |         std::vector<size_t> order(size); \
110 |         std::iota(order.begin(), order.end(), 0); \
111 |         x86simdsortStatic::argsort(arr, order.data(), size, hasnan, descending); \
112 |         return order; \
113 |     } \
114 |     /* argselect into a caller-provided index buffer -> ISA##_argselect */ \
115 |     template <typename T> \
116 |     X86_SIMD_SORT_FINLINE void x86simdsortStatic::argselect( \
117 |             T *arr, size_t *arg, size_t k, size_t size, bool hasnan) \
118 |     { \
119 |         ISA##_argselect(arr, arg, k, size, hasnan); \
120 |     } \
121 |     template <typename T> /* argselect returning a vector of indices */ \
122 |     X86_SIMD_SORT_FINLINE std::vector<size_t> x86simdsortStatic::argselect( \
123 |             T *arr, size_t k, size_t size, bool hasnan) \
124 |     { \
125 |         std::vector<size_t> order(size); \
126 |         std::iota(order.begin(), order.end(), 0); \
127 |         x86simdsortStatic::argselect(arr, order.data(), k, size, hasnan); \
128 |         return order; \
129 |     } \
130 |     /* keyvalue_qsort -> ISA##_qsort_kv */ \
131 |     template <typename T1, typename T2> \
132 |     X86_SIMD_SORT_FINLINE void x86simdsortStatic::keyvalue_qsort( \
133 |             T1 *key, T2 *val, size_t size, bool hasnan, bool descending) \
134 |     { \
135 |         ISA##_qsort_kv(key, val, size, hasnan, descending); \
136 |     } \
137 |     /* keyvalue_select -> ISA##_select_kv */ \
138 |     template <typename T1, typename T2> \
139 |     X86_SIMD_SORT_FINLINE void x86simdsortStatic::keyvalue_select( \
140 |             T1 *key, T2 *val, size_t k, size_t size, bool hasnan, bool descending) \
141 |     { \
142 |         ISA##_select_kv(key, val, k, size, hasnan, descending); \
143 |     } \
144 |     /* keyvalue_partial_sort -> ISA##_partial_sort_kv */ \
145 |     template <typename T1, typename T2> \
146 |     X86_SIMD_SORT_FINLINE void x86simdsortStatic::keyvalue_partial_sort( \
147 |             T1 *key, T2 *val, size_t k, size_t size, bool hasnan, bool descending) \
148 |     { \
149 |         ISA##_partial_sort_kv(key, val, k, size, hasnan, descending); \
150 |     }
151 | 
152 | /*
153 |  * qsort, qselect, partial_qsort, argsort and key-value sort template functions.
154 |  */
155 | #include "xss-common-qsort.h"
156 | #include "xss-common-argsort.h"
157 | #include "xss-common-keyvaluesort.hpp"
158 | 
159 | #if defined(__AVX512DQ__) && defined(__AVX512VL__)
160 | /* 32-bit and 64-bit dtypes vector definitions on SKX */
161 | #include "avx512-32bit-qsort.hpp"
162 | #include "avx512-64bit-qsort.hpp"
163 | #include "avx512-64bit-argsort.hpp"
164 | 
165 | /* 16-bit dtypes vector definitions on ICL */
166 | #if defined(__AVX512BW__) && defined(__AVX512VBMI2__)
167 | #include "avx512-16bit-qsort.hpp"
168 | /* _Float16 vector definition on SPR*/
169 | #if defined(__FLT16_MAX__) && defined(__AVX512BW__) && defined(__AVX512FP16__)
170 | #include "avx512fp16-16bit-qsort.hpp"
171 | #endif // __FLT16_MAX__
172 | #endif // __AVX512VBMI2__
173 | 
174 | XSS_METHODS(avx512)
175 | 
176 | #if defined(__FLT16_MAX__) && defined(__AVX512BW__) \
177 |         && defined(__AVX512VBMI2__) && !defined(__AVX512FP16__)
178 | // _Float16 qsort, compiled only when __AVX512FP16__ is not defined (see the
179 | // enclosing #if): the buffer is handed to avx512_qsort_fp16 as uint16_t.
180 | template <>
181 | [[maybe_unused]] void x86simdsortStatic::qsort<_Float16>(
182 |         _Float16 *arr, size_t size, bool hasnan, bool descending)
183 | {
184 |     auto *bits = reinterpret_cast<uint16_t *>(arr);
185 |     avx512_qsort_fp16(bits, size, hasnan, descending);
186 | }
187 | // _Float16 qselect without AVX512-FP16: forwards the buffer, viewed as uint16_t, to avx512_qselect_fp16.
188 | template <>
189 | [[maybe_unused]] void x86simdsortStatic::qselect<_Float16>(
190 |         _Float16 *arr, size_t k, size_t size, bool hasnan, bool descending)
191 | {
192 |     avx512_qselect_fp16(reinterpret_cast<uint16_t *>(arr), k, size, hasnan, descending);
193 | }
194 | // _Float16 partial_qsort without AVX512-FP16: forwards the buffer, viewed as uint16_t, to avx512_partial_qsort_fp16.
195 | template <>
196 | [[maybe_unused]] void x86simdsortStatic::partial_qsort<_Float16>(
197 |         _Float16 *arr, size_t k, size_t size, bool hasnan, bool descending)
198 | {
199 |     avx512_partial_qsort_fp16(reinterpret_cast<uint16_t *>(arr), k, size, hasnan, descending);
200 | }
201 | #endif
202 | 
203 | #elif defined(__AVX2__)
204 | /* 32-bit and 64-bit dtypes vector definitions on AVX2 */
205 | #include "avx2-32bit-half.hpp"
206 | #include "avx2-32bit-qsort.hpp"
207 | #include "avx2-64bit-qsort.hpp"
208 | XSS_METHODS(avx2)
209 | 
210 | #else
211 | #error "x86simdsortStatic methods needs to be compiled with avx512/avx2 specific flags"
212 | #endif // (__AVX512VL__ && __AVX512DQ__) || AVX2
213 | 
214 | #endif // X86_SIMD_SORT_STATIC_METHODS
215 | 


--------------------------------------------------------------------------------
/src/xss-network-qsort.hpp:
--------------------------------------------------------------------------------
  1 | #ifndef XSS_NETWORK_QSORT
  2 | #define XSS_NETWORK_QSORT
  3 | 
  4 | #include "xss-optimal-networks.hpp"
  5 | 
  6 | template <typename vtype, typename mm_t> // forward declaration only; this header contains no definition of it
  7 | X86_SIMD_SORT_INLINE void COEX(mm_t &a, mm_t &b); // both registers are taken by non-const reference
  8 | 
  9 | /* Dispatches to the fixed-size network for numVecs registers: nothing for 1,
 10 |  * a single comparator::COEX for 2, optimal_sort_<N> for 4/8/16/32. Any other
 11 |  * numVecs is rejected at compile time. */
 12 | template <typename vtype, typename comparator, int numVecs,
 13 |           typename reg_t = typename vtype::reg_t>
 14 | X86_SIMD_SORT_FINLINE void bitonic_sort_n_vec(reg_t *regs)
 15 | {
 16 |     if constexpr (numVecs == 32) {
 17 |         optimal_sort_32<vtype, comparator>(regs);
 18 |     }
 19 |     else if constexpr (numVecs == 16) {
 20 |         optimal_sort_16<vtype, comparator>(regs);
 21 |     }
 22 |     else if constexpr (numVecs == 8) {
 23 |         optimal_sort_8<vtype, comparator>(regs);
 24 |     }
 25 |     else if constexpr (numVecs == 4) {
 26 |         optimal_sort_4<vtype, comparator>(regs);
 27 |     }
 28 |     else if constexpr (numVecs == 2) {
 29 |         comparator::COEX(regs[0], regs[1]);
 30 |     }
 31 |     else if constexpr (numVecs == 1) {
 32 |         UNUSED(regs); // one register: nothing to compare it against
 33 |     }
 34 |     else {
 35 |         static_assert(numVecs == -1, "should not reach here");
 36 |     }
 37 | }
 38 | 
 39 | /*
 40 |  * Swizzle ops explained:
 41 |  * swap_n<scale>: swap neighbouring blocks of size <scale/2> within block of size <scale>
 42 |  * reg i        = [7,6,5,4,3,2,1,0]
 43 |  * swap_n<2>:   = [[6,7],[4,5],[2,3],[0,1]]
 44 |  * swap_n<4>:   = [[5,4,7,6],[1,0,3,2]]
 45 |  * swap_n<8>:   = [[3,2,1,0,7,6,5,4]]
 46 |  * reverse_n<scale>: reverse elements within block of size <scale>
 47 |  * reg i         = [7,6,5,4,3,2,1,0]
 48 |  * reverse_n<2>: = [[6,7],[4,5],[2,3],[0,1]]
 49 |  * reverse_n<4>: = [[4,5,6,7],[0,1,2,3]]
 50 |  * reverse_n<8>: = [[0,1,2,3,4,5,6,7]]
 51 |  * merge_n<scale>: merge blocks of <scale/2> elements from two regs
 52 |  * reg b,a      = [a,a,a,a,a,a,a,a], [b,b,b,b,b,b,b,b]
 53 |  * merge_n<2>   = [a,b,a,b,a,b,a,b]
 54 |  * merge_n<4>   = [a,a,b,b,a,a,b,b]
 55 |  * merge_n<8>   = [a,a,a,a,b,b,b,b]
 56 |  */
 57 | 
 58 | /* Merges within each of the numVecs registers. For every register a permuted
 59 |  * copy is made (reverse_n on the first level, swap_n on all later levels),
 60 |  * compare-exchanged against the original, and the two are recombined with
 61 |  * merge_n. The function then recurses with scale / 2 until scale <= 1. */
 62 | template <typename vtype,
 63 |           typename comparator,
 64 |           int numVecs,
 65 |           int scale,
 66 |           bool first = true>
 67 | X86_SIMD_SORT_FINLINE void internal_merge_n_vec(typename vtype::reg_t *reg)
 68 | {
 69 |     if constexpr (scale > 1) {
 70 |         using reg_t = typename vtype::reg_t;
 71 |         using swizzle = typename vtype::swizzle_ops;
 72 |         X86_SIMD_SORT_UNROLL_LOOP(64)
 73 |         for (int idx = 0; idx < numVecs; idx++) {
 74 |             reg_t &cur = reg[idx];
 75 |             reg_t permuted;
 76 |             if constexpr (first) {
 77 |                 // First level: reverse then merge
 78 |                 permuted = swizzle::template reverse_n<vtype, scale>(cur);
 79 |             }
 80 |             else {
 81 |                 // Later levels: swap then merge
 82 |                 permuted = swizzle::template swap_n<vtype, scale>(cur);
 83 |             }
 84 |             comparator::COEX(permuted, cur);
 85 |             cur = swizzle::template merge_n<vtype, scale>(cur, permuted);
 86 |         }
 87 | 
 88 |         // Only the outermost level uses reverse_n; deeper levels pass first = false
 89 |         internal_merge_n_vec<vtype, comparator, numVecs, scale / 2, false>(reg);
 90 |     }
 91 |     else {
 92 |         UNUSED(reg); // scale <= 1: recursion ends, nothing left to merge
 93 |     }
 94 | }
 95 | 
 96 | /* Cross-register merge step: reverses the upper half of the numVecs registers
 97 |  * with reverse_n<scale>, compare-exchanges regs[i] against regs[numVecs-1-i],
 98 |  * then recurses on both halves. All work sits in the else branch so that for
 99 |  * numVecs <= 1 neither the loops nor the recursive calls are instantiated. */
100 | template <typename vtype, typename comparator, int numVecs, int scale,
101 |           typename reg_t = typename vtype::reg_t>
102 | X86_SIMD_SORT_FINLINE void merge_substep_n_vec(reg_t *regs)
103 | {
104 |     if constexpr (numVecs <= 1) {
105 |         UNUSED(regs);
106 |     }
107 |     else {
108 |         using swizzle = typename vtype::swizzle_ops;
109 |         constexpr int half = numVecs / 2;
110 |         // Reverse upper half of vectors
111 |         X86_SIMD_SORT_UNROLL_LOOP(64)
112 |         for (int i = half; i < numVecs; i++) {
113 |             regs[i] = swizzle::template reverse_n<vtype, scale>(regs[i]);
114 |         }
115 |         // Do compare exchanges between mirrored pairs (first with last, ...)
116 |         X86_SIMD_SORT_UNROLL_LOOP(64)
117 |         for (int i = 0; i < half; i++) {
118 |             comparator::COEX(regs[i], regs[numVecs - 1 - i]);
119 |         }
120 |         merge_substep_n_vec<vtype, comparator, half, scale>(regs);
121 |         merge_substep_n_vec<vtype, comparator, half, scale>(regs + half);
122 |     }
123 | }
124 | 
125 | /* One full merge level at the given scale.
126 |  * The cross-register substep runs first, followed by the merges inside each
127 |  * register. */
128 | template <typename vtype, typename comparator, int numVecs, int scale,
129 |           typename reg_t = typename vtype::reg_t>
130 | X86_SIMD_SORT_FINLINE void merge_step_n_vec(reg_t *regs)
131 | {
132 |     // Stage 1: merge across registers
133 |     merge_substep_n_vec<vtype, comparator, numVecs, scale>(regs);
134 | 
135 |     // Stage 2: merge within each register
136 |     internal_merge_n_vec<vtype, comparator, numVecs, scale>(regs);
137 | }
138 | 
139 | /* Runs merge_step_n_vec at width numPer (default 2), then recurses with the
140 |  * width doubled; stops once numPer exceeds vtype::numlanes. */
141 | template <typename vtype, typename comparator, int numVecs, int numPer = 2,
142 |           typename reg_t = typename vtype::reg_t>
143 | X86_SIMD_SORT_FINLINE void merge_n_vec(reg_t *regs)
144 | {
145 |     if constexpr (numPer <= vtype::numlanes) {
146 |         // Merge at the current width, then continue with twice the width
147 |         merge_step_n_vec<vtype, comparator, numVecs, numPer>(regs);
148 |         merge_n_vec<vtype, comparator, numVecs, numPer * 2>(regs);
149 |     }
150 |     else {
151 |         // Width exceeds the lane count: recursion ends here
152 |         UNUSED(regs);
153 |     }
154 | }
155 | 
156 | /* Sorts the numVecs registers held in vecs: a sorting network across the
157 |  * registers first, then the merge phase. */
158 | template <typename vtype, typename comparator, int numVecs,
159 |           typename reg_t = typename vtype::reg_t>
160 | X86_SIMD_SORT_FINLINE void sort_vectors(reg_t *vecs)
161 | {
162 |     // Step 1: the initial sorting network sorts the columns of the
163 |     // [numVecs x num_lanes] matrix formed by the registers
164 |     bitonic_sort_n_vec<vtype, comparator, numVecs>(vecs);
165 | 
166 |     // Step 2: merge the vectors using bitonic merging networks
167 |     // (merge_n_vec starts at its default numPer of 2)
168 |     merge_n_vec<vtype, comparator, numVecs>(vecs);
169 | }
170 | 
171 | /* Sorts the first N elements of arr in place using numVecs registers, or
172 |  * delegates to numVecs / 2 registers when N fits in half as many lanes. */
173 | template <typename vtype, typename comparator, int numVecs,
174 |           typename reg_t = typename vtype::reg_t>
175 | X86_SIMD_SORT_INLINE void sort_n_vec(typename vtype::type_t *arr, int N)
176 | {
177 |     static_assert(numVecs > 0, "numVecs should be > 0");
178 |     constexpr int firstMasked = numVecs / 2; // registers below this index need no mask
179 |     if constexpr (numVecs > 1) {
180 |         if (N * 2 <= numVecs * vtype::numlanes) {
181 |             sort_n_vec<vtype, comparator, firstMasked>(arr, N);
182 |             return;
183 |         }
184 |     }
185 |     reg_t vecs[numVecs];
186 | 
187 |     // Build one mask per upper-half register; reused for the load and the store
188 |     typename vtype::opmask_t tailMasks[numVecs - firstMasked];
189 |     X86_SIMD_SORT_UNROLL_LOOP(64)
190 |     for (int v = firstMasked, m = 0; v < numVecs; v++, m++) {
191 |         uint64_t num_to_read
192 |                 = std::min((uint64_t)std::max(0, N - v * vtype::numlanes),
193 |                            (uint64_t)vtype::numlanes);
194 |         tailMasks[m] = vtype::get_partial_loadmask(num_to_read);
195 |     }
196 | 
197 |     // Lower half of the registers: unmasked loads
198 |     X86_SIMD_SORT_UNROLL_LOOP(64)
199 |     for (int v = 0; v < firstMasked; v++) {
200 |         vecs[v] = vtype::loadu(arr + v * vtype::numlanes);
201 |     }
202 |     // Upper half: masked loads (rightmostPossibleVec() presumably fills the masked-off lanes)
203 |     X86_SIMD_SORT_UNROLL_LOOP(64)
204 |     for (int v = firstMasked, m = 0; v < numVecs; v++, m++) {
205 |         vecs[v] = vtype::mask_loadu(comparator::rightmostPossibleVec(),
206 |                                     tailMasks[m],
207 |                                     arr + v * vtype::numlanes);
208 |     }
209 | 
210 |     sort_vectors<vtype, comparator, numVecs>(vecs);
211 | 
212 |     // Lower half: unmasked stores
213 |     X86_SIMD_SORT_UNROLL_LOOP(64)
214 |     for (int v = 0; v < firstMasked; v++) {
215 |         vtype::storeu(arr + v * vtype::numlanes, vecs[v]);
216 |     }
217 |     // Upper half: masked stores through the same masks
218 |     X86_SIMD_SORT_UNROLL_LOOP(64)
219 |     for (int v = firstMasked, m = 0; v < numVecs; v++, m++) {
220 |         vtype::mask_storeu(arr + v * vtype::numlanes, tailMasks[m], vecs[v]);
221 |     }
222 | }
223 | 
224 | // Entry point sized by a maximum element count: maxN must be vtype::numlanes
225 | // times a power of two; the matching register count is handed to sort_n_vec.
226 | template <typename vtype, typename comparator, int maxN>
227 | X86_SIMD_SORT_INLINE void sort_n(typename vtype::type_t *arr, int N)
228 | {
229 |     constexpr int numVecs = maxN / vtype::numlanes;
230 |     static_assert((numVecs != 0) && ((numVecs & (numVecs - 1)) == 0)
231 |                           && (maxN == vtype::numlanes * numVecs),
232 |                   "maxN must be vtype::numlanes times a power of 2");
233 |     sort_n_vec<vtype, comparator, numVecs>(arr, N);
234 | }
235 | #endif
236 | 


--------------------------------------------------------------------------------
/src/xss-pivot-selection.hpp:
--------------------------------------------------------------------------------
  1 | #ifndef XSS_PIVOT_SELECTION
  2 | #define XSS_PIVOT_SELECTION
  3 | 
  4 | #include "xss-network-qsort.hpp"
  5 | #include "xss-common-comparators.hpp"
  6 | 
  7 | // Outcome tag stored in pivot_results next to the chosen pivot value
  8 | enum class pivot_result_t : int { Normal, Sorted, Only2Values };
  9 | 
 10 | // Pairs a pivot value with a pivot_result_t tag (Normal unless stated)
 11 | template <typename type_t>
 12 | struct pivot_results {
 13 |     pivot_result_t result = pivot_result_t::Normal;
 14 |     type_t pivot = 0;
 15 | 
 16 |     pivot_results(type_t _pivot,
 17 |                   pivot_result_t _result = pivot_result_t::Normal)
 18 |         : result(_result), pivot(_pivot)
 19 |     {
 20 |     }
 21 | };
 22 | 
 23 | template <typename vtype, typename mm_t> // forward declaration; called as COEX<vtype>(a, b) in get_pivot_blocks below
 24 | X86_SIMD_SORT_INLINE void COEX(mm_t &a, mm_t &b);
 25 | 
 26 | /* Samples vtype::numlanes elements of arr[left..right] at a fixed stride, sorts
 27 |  * them in one register and returns the element at index numlanes / 2. */
 28 | template <typename vtype, typename type_t>
 29 | X86_SIMD_SORT_INLINE type_t
 30 | get_pivot(type_t *arr, const arrsize_t left, const arrsize_t right)
 31 | {
 32 |     type_t samples[vtype::numlanes];
 33 |     const arrsize_t delta = (right - left) / vtype::numlanes; // 0 for ranges shorter than numlanes
 34 |     for (int i = 0; i < vtype::numlanes; i++) {
 35 |         samples[i] = arr[left + i * delta];
 36 |     }
 37 |     typename vtype::reg_t sampleVec = vtype::loadu(samples);
 38 |     sampleVec = vtype::sort_vec(sampleVec);
 39 |     vtype::storeu(samples, sampleVec); // read back via storeu, as get_pivot_blocks does, instead of a pointer cast
 40 |     return samples[vtype::numlanes / 2];
 41 | }
 42 | 
 43 | /* Pivot selection for larger ranges. Ranges with right - left <= 1024 fall
 44 |  * back to get_pivot. Otherwise five registers are loaded at evenly spaced
 45 |  * offsets, run through a 5-input sorting network of COEX calls, and the
 46 |  * middle lane of the middle register (after sort_vec) is returned. */
 47 | template <typename vtype, typename type_t>
 48 | X86_SIMD_SORT_INLINE type_t
 49 | get_pivot_blocks(type_t *arr, const arrsize_t left, const arrsize_t right)
 50 | {
 51 |     using reg_t = typename vtype::reg_t;
 52 |     constexpr int numVecs = 5;
 53 | 
 54 |     // Small ranges use the single-register sampler instead
 55 |     if (right - left <= 1024) { return get_pivot<vtype>(arr, left, right); }
 56 | 
 57 |     // Spread the numVecs loads between left and right - numlanes
 58 |     const arrsize_t stride = ((right - vtype::numlanes) - left) / numVecs;
 59 |     reg_t block[numVecs];
 60 |     for (int b = 0; b < numVecs; b++) {
 61 |         block[b] = vtype::loadu(arr + left + stride * b);
 62 |     }
 63 | 
 64 |     // 5-input sorting network (from https://bertdobbelaere.github.io/sorting_networks.html)
 65 |     COEX<vtype>(block[0], block[3]);
 66 |     COEX<vtype>(block[1], block[4]);
 67 | 
 68 |     COEX<vtype>(block[0], block[2]);
 69 |     COEX<vtype>(block[1], block[3]);
 70 | 
 71 |     COEX<vtype>(block[0], block[1]);
 72 |     COEX<vtype>(block[2], block[4]);
 73 | 
 74 |     COEX<vtype>(block[1], block[2]);
 75 |     COEX<vtype>(block[3], block[4]);
 76 | 
 77 |     COEX<vtype>(block[2], block[3]);
 78 | 
 79 |     // Sort the middle register and take its middle lane
 80 |     reg_t middle = vtype::sort_vec(block[numVecs / 2]);
 81 | 
 82 |     type_t lanes[vtype::numlanes];
 83 |     vtype::storeu(lanes, middle);
 84 |     return lanes[vtype::numlanes / 2];
 85 | }
 86 | 
 87 | template <typename vtype, typename comparator, typename type_t> // forward declaration; the definition follows get_pivot_smart
 88 | X86_SIMD_SORT_INLINE pivot_results<type_t>
 89 | get_pivot_near_constant(type_t *arr,
 90 |                         type_t commonValue,
 91 |                         const arrsize_t left,
 92 |                         const arrsize_t right);
 93 | 
 94 | /* Chooses a pivot for arr[left..right] from numVecs * numlanes samples.
 95 |  * Ranges of at most 4 * numVecs * numlanes elements fall back to get_pivot.
 96 |  * Otherwise numVecs registers are loaded at evenly spaced offsets, sorted
 97 |  * ascending, and the smallest, median and largest samples decide the result:
 98 |  *   - all samples equal       -> get_pivot_near_constant
 99 |  *   - median == smallest      -> comparator::choosePivotMedianIsSmallest
100 |  *   - median == largest       -> comparator::choosePivotMedianIsLargest
101 |  *   - median strictly between -> the median itself
102 |  */
103 | template <typename vtype, typename comparator, typename type_t>
104 | X86_SIMD_SORT_INLINE pivot_results<type_t>
105 | get_pivot_smart(type_t *arr, const arrsize_t left, const arrsize_t right)
106 | {
107 |     using reg_t = typename vtype::reg_t;
108 |     using result_t = pivot_results<type_t>;
109 |     constexpr int numVecs = 4;
110 |     constexpr int N = numVecs * vtype::numlanes;
111 | 
112 |     if (right - left + 1 <= 4 * numVecs * vtype::numlanes) {
113 |         return result_t(get_pivot<vtype>(arr, left, right));
114 |     }
115 | 
116 |     const arrsize_t delta = ((right - vtype::numlanes) - left) / numVecs;
117 |     reg_t vecs[numVecs];
118 |     for (int i = 0; i < numVecs; i++) {
119 |         vecs[i] = vtype::loadu(arr + left + delta * i);
120 |     }
121 | 
122 |     // Sort the samples
123 |     // Note that this intentionally uses the AscendingComparator
124 |     // instead of the provided comparator
125 |     sort_vectors<vtype, Comparator<vtype, false>, numVecs>(vecs);
126 | 
127 |     type_t samples[N];
128 |     for (int i = 0; i < numVecs; i++) {
129 |         vtype::storeu(samples + vtype::numlanes * i, vecs[i]);
130 |     }
131 |     const type_t smallest = samples[0];
132 |     const type_t median = samples[N / 2];
133 |     const type_t largest = samples[N - 1];
134 | 
135 |     if (smallest == largest) {
136 |         // We have a very unlucky sample, or the array is constant / near constant
137 |         // Run a special function meant to deal with this situation
138 |         return get_pivot_near_constant<vtype, comparator, type_t>(
139 |                 arr, median, left, right);
140 |     }
141 |     if (median == smallest) {
142 |         // Either the median or the next value larger than the median is returned,
143 |         // depending on the comparator (see xss-common-comparators.hpp for more details)
144 |         return result_t(comparator::choosePivotMedianIsSmallest(median));
145 |     }
146 |     if (median == largest) {
147 |         // Either the median or the next value smaller than the median is returned,
148 |         // depending on the comparator (see xss-common-comparators.hpp for more details)
149 |         return result_t(comparator::choosePivotMedianIsLargest(median));
150 |     }
151 |     // We have a normal sample; use its median
152 |     return result_t(median);
153 | }
154 | 
155 | // Handles the case where we seem to have a near-constant array, since our sample of the array was constant. Scans arr[left..right] for up to two values other than commonValue and derives the pivot from what it finds.
156 | template <typename vtype, typename comparator, typename type_t>
157 | X86_SIMD_SORT_INLINE pivot_results<type_t>
158 | get_pivot_near_constant(type_t *arr,
159 |                         type_t commonValue, // get_pivot_smart passes the median of its all-equal sample here
160 |                         const arrsize_t left,
161 |                         const arrsize_t right)
162 | {
163 |     using reg_t = typename vtype::reg_t;
164 | 
165 |     arrsize_t index = left; // scan position, shared by all four search loops below
166 | 
167 |     type_t value1 = 0; // first element found that differs from commonValue
168 |     type_t value2 = 0; // first element found that differs from both commonValue and value1
169 | 
170 |     // First, search for any value not equal to the common value
171 |     // Vectorized part first: stop at the first block that holds a lane != commonValue
172 |     reg_t commonVec = vtype::set1(commonValue);
173 |     for (; index <= right - vtype::numlanes; index += vtype::numlanes) { // NOTE(review): if arrsize_t is unsigned this needs right >= numlanes; get_pivot_smart only calls with more than 16 * numlanes elements
174 |         reg_t data = vtype::loadu(arr + index);
175 |         if (!vtype::all_false(vtype::knot_opmask(vtype::eq(data, commonVec)))) {
176 |             break;
177 |         }
178 |     }
179 | 
180 |     // Then scalar: resumes at index, so it locates the element inside that block or scans the tail
181 |     for (; index <= right; index++) {
182 |         if (arr[index] != commonValue) {
183 |             value1 = arr[index];
184 |             break;
185 |         }
186 |     }
187 | 
188 |     if (index == right + 1) {
189 |         // The array is completely constant
190 |         // Tag the result as Sorted: the range is constant and thus already sorted (presumably this lets the caller skip partitioning)
191 |         return pivot_results<type_t>(commonValue, pivot_result_t::Sorted);
192 |     }
193 | 
194 |     // Secondly, search for a second value not equal to either of the previous two
195 |     // Vectorized part first. NOTE(review): the two tests below may be satisfied by different lanes, so a block mixing only commonValue and value1 also breaks out and the scalar loop takes over from there - confirm this is intended
196 |     reg_t value1Vec = vtype::set1(value1);
197 |     for (; index <= right - vtype::numlanes; index += vtype::numlanes) {
198 |         reg_t data = vtype::loadu(arr + index);
199 |         if (!vtype::all_false(vtype::knot_opmask(vtype::eq(data, commonVec)))
200 |             && !vtype::all_false(
201 |                     vtype::knot_opmask(vtype::eq(data, value1Vec)))) {
202 |             break;
203 |         }
204 |     }
205 | 
206 |     // Then scalar: exact per-element check against both known values
207 |     for (; index <= right; index++) {
208 |         if (arr[index] != commonValue && arr[index] != value1) {
209 |             value2 = arr[index];
210 |             break;
211 |         }
212 |     }
213 | 
214 |     if (index == right + 1) {
215 |         // The array contains only 2 values
216 |         // We must pick the larger one, else the right partition is empty
217 |         // (note that "larger" is determined using the provided comparator, so it might actually be the smaller one)
218 |         // We can also skip recursing, as it is guaranteed both partitions are constant after partitioning with the chosen value
219 |         // TODO this logic assumes partitioning uses greater-than-or-equal-to specifically; might be worth noting that somewhere
220 |         type_t pivot
221 |                 = std::max(value1, commonValue, comparator::STDSortComparator);
222 |         return pivot_results<type_t>(pivot, pivot_result_t::Only2Values);
223 |     }
224 | 
225 |     // The array has at least 3 distinct values. Use the middle one as the pivot: max(min(v1, v2), min(max(v1, v2), common))
226 |     type_t median = std::max(
227 |             std::min(value1, value2, comparison_func<vtype>),
228 |             std::min(std::max(value1, value2, comparison_func<vtype>),
229 |                      commonValue,
230 |                      comparison_func<vtype>),
231 |             comparison_func<vtype>);
232 |     return pivot_results<type_t>(median);
233 | }
234 | 
235 | #endif
236 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # x86-simd-sort
  2 | 
  3 | C++ template library for high performance SIMD based sorting routines for
  4 | built-in integers and floats (16-bit, 32-bit and 64-bit data types) and custom
  5 | defined C++ objects. The sorting routines are accelerated using AVX-512/AVX2
  6 | when available. The library auto picks the best version depending on the
  7 | processor it is run on. If you are looking for the AVX-512 or AVX2 specific
  8 | implementations, please see
  9 | [README](https://github.com/intel/x86-simd-sort/blob/main/src/README.md) file
 10 | under `src/` directory. The following routines are currently supported:
 11 | 
 12 | ## Sort an array of custom defined class objects (uses `O(N)` space)
 13 | ``` cpp
 14 | template <typename T, typename U, typename Func>
 15 | void x86simdsort::object_qsort(T *arr, U arrsize, Func key_func)
 16 | ```
 17 | `T` is any user defined struct or class and `arr` is a pointer to the first
 18 | element in the array of objects of type `T`. The `arrsize` parameter can be any
 19 | 32-bit or 64-bit integer type. `Func` is a lambda function that computes the
 20 | `key` value for each object which is the metric used to sort the objects.
 21 | `Func` needs to have the following signature:
 22 | 
 23 | ```cpp
 24 | [] (T obj) -> key_t { key_t key; /* compute key for obj */ return key; }
 25 | ```
 26 | 
 27 | Note that the return type of the key `key_t` needs to be one of the following:
 28 | `[float, uint32_t, int32_t, double, uint64_t, int64_t]`. `object_qsort` has a
 29 | space complexity of `O(N)`. Specifically, it requires `arrsize * sizeof(key_t)`
 30 | bytes to store a vector with all the keys and an additional `arrsize *
 31 | sizeof(uint32_t)` bytes to store the indexes of the object array.  For
 32 | performance reasons, we recommend using `object_qsort` when the array size
 33 | is less than or equal to `UINT32_MAX`. An example usage of `object_qsort` is
 34 | provided in the [examples](#Sort-an-array-of-Points-using-object_qsort)
 35 | section.  Refer to [section](#Performance-of-object_qsort) to get a sense of
 36 | how fast this is relative to `std::sort`.
 37 | 
 38 | ## Sort an array of built-in integers and floats
 39 | ```cpp
 40 | void x86simdsort::qsort(T* arr, size_t size, bool hasnan, bool descending);
 41 | void x86simdsort::qselect(T* arr, size_t k, size_t size, bool hasnan, bool descending);
 42 | void x86simdsort::partial_qsort(T* arr, size_t k, size_t size, bool hasnan, bool descending);
 43 | ```
 44 | Supported datatypes: `T` $\in$ `[_Float16, uint16_t, int16_t, float, uint32_t,
 45 | int32_t, double, uint64_t, int64_t]`
 46 | 
 47 | ## Key-value sort routines on pairs of arrays
 48 | ```cpp
 49 | void x86simdsort::keyvalue_qsort(T1* key, T2* val, size_t size, bool hasnan, bool descending);
 50 | void x86simdsort::keyvalue_select(T1* key, T2* val, size_t k, size_t size, bool hasnan, bool descending);
 51 | void x86simdsort::keyvalue_partial_sort(T1* key, T2* val, size_t k, size_t size, bool hasnan, bool descending);
 52 | ```
 53 | Supported datatypes: `T1`, `T2` $\in$ `[float, uint32_t, int32_t, double,
 54 | uint64_t, int64_t]` Note that keyvalue sort is not yet supported for 16-bit
 55 | data types.
 56 | 
 57 | ## Arg sort routines on arrays
 58 | ```cpp
 59 | std::vector<size_t> arg = x86simdsort::argsort(T* arr, size_t size, bool hasnan, bool descending);
 60 | std::vector<size_t> arg = x86simdsort::argselect(T* arr, size_t k, size_t size, bool hasnan);
 61 | ```
 62 | Supported datatypes: `T` $\in$ `[_Float16, uint16_t, int16_t, float, uint32_t, int32_t, double,
 63 | uint64_t, int64_t]` Note that argsort and argselect are not accelerated with SIMD when using 16-bit
 64 | data types.
 65 | 
 66 | ## Build/Install
 67 | 
 68 | [meson](https://github.com/mesonbuild/meson) is the build system used. Commands
 69 | to build and install the library:
 70 | 
 71 | ```
 72 | meson setup --buildtype release builddir && cd builddir
 73 | meson compile
 74 | sudo meson install
 75 | ```
 76 | 
 77 | Once installed, you can use `pkg-config --cflags --libs x86simdsortcpp` to
 78 | populate the right cflags and ldflags to compile and link your C++ program.
 79 | This repository also contains a test suite and benchmarking suite which are
 80 | written using [googletest](https://github.com/google/googletest) and [google
 81 | benchmark](https://github.com/google/benchmark) (>= v1.9.2) frameworks
 82 | respectively. You can configure meson to build them both by using
 83 | `-Dbuild_tests=true` and `-Dbuild_benchmarks=true`.
 84 | 
 85 | ## Build using OpenMP
 86 | 
 87 | `qsort`, `argsort`, and `keyvalue_qsort` can achieve even greater performance
 88 | (up-to 3x speedup) through parallelization with
 89 | [OpenMP](https://www.openmp.org/). By default, OpenMP support is disabled; to
 90 | enable it, set the `-Duse_openmp=true` flag when configuring Meson. If you are
 91 | using only the static SIMD implementations, compile with `-fopenmp
 92 | -DXSS_USE_OPENMP`.
 93 | 
 94 | OpenMP-based parallel sorting routines are used for arrays larger than a
 95 | specific threshold where threading makes sense. The number of threads is
 96 | limited to a maximum of 16.  You can control the number of threads by setting
 97 | the `OMP_NUM_THREADS` environment variable.
 98 | 
 99 | ## Using x86-simd-sort as a Meson subproject
100 | 
101 | If you would like to use this as a Meson subproject, then create `subprojects`
102 | directory and copy `x86-simd-sort` into it. Add these two lines
103 | in your meson.build.
104 | ```
105 | xss = subproject('x86-simd-sort')
106 | xss_dep = xss.get_variable('x86simdsortcpp_dep')
107 | ```
108 | 
109 | For more detailed instructions please refer to Meson
110 | [documentation](https://mesonbuild.com/Subprojects.html#using-a-subproject).
111 | 
112 | ## Example usage
113 | 
114 | #### Sort an array of floats
115 | 
116 | ```cpp
117 | #include "x86simdsort.h"
118 | 
119 | int main() {
120 |     std::vector<float> arr(1000);
121 |     x86simdsort::qsort(arr.data(), 1000, true);
122 |     return 0;
123 | }
124 | ```
125 | 
126 | #### Sort an array of Points using object_qsort
127 | ```cpp
128 | #include "x86simdsort.h"
129 | #include <cmath>
130 | 
131 | struct Point {
132 |     double x, y, z;
133 | };
134 | 
135 | int main() {
136 |     std::vector<Point> arr{1000};
137 |     // Sort an array of Points by its x value:
138 |     x86simdsort::object_qsort(arr.data(), 1000, [](Point p) { return p.x; });
139 |     // Sort an array of Points by its distance from origin:
140 |     x86simdsort::object_qsort(arr.data(), 1000, [](Point p) {
141 |         return sqrt(p.x*p.x+p.y*p.y+p.z*p.z);
142 |         });
143 |     return 0;
144 | }
145 | ```
146 | 
147 | ## Details
148 | 
149 | - `x86simdsort::qsort` is equivalent to `qsort` in
150 |   [C](https://www.tutorialspoint.com/c_standard_library/c_function_qsort.htm)
151 |   or `std::sort` in [C++](https://en.cppreference.com/w/cpp/algorithm/sort).
152 | - `x86simdsort::qselect` is equivalent to `std::nth_element` in
153 |   [C++](https://en.cppreference.com/w/cpp/algorithm/nth_element) or
154 |   `np.partition` in
155 |   [NumPy](https://numpy.org/doc/stable/reference/generated/numpy.partition.html).
156 | - `x86simdsort::partial_qsort` is equivalent to `std::partial_sort` in
157 |   [C++](https://en.cppreference.com/w/cpp/algorithm/partial_sort).
158 | - `x86simdsort::argsort` is equivalent to `np.argsort` in
159 |   [NumPy](https://numpy.org/doc/stable/reference/generated/numpy.argsort.html).
160 | - `x86simdsort::argselect` is equivalent to `np.argpartition` in
161 |   [NumPy](https://numpy.org/doc/stable/reference/generated/numpy.argpartition.html).
162 | 
163 | Supported datatypes: `uint16_t, int16_t, _Float16, uint32_t, int32_t, float,
164 | uint64_t, int64_t, double`. Note that `_Float16` will require building this
165 | library with g++ >= 12.x. All the functions have an optional argument `bool
166 | hasnan` set to `false` by default (these are relevant to floating point data
167 | types only).  If your array has NAN's, the behaviour of the sorting routine
168 | is undefined. If `hasnan` is set to true, NAN's are always sorted to the end of
169 | the array. In addition to that, qsort will replace all your NAN's with
170 | `std::numeric_limits<T>::quiet_NaN`. The original bit-exact NaNs in
171 | the input are not preserved. Also note that the arg methods (argsort and
172 | argselect) will not use the SIMD based algorithms if they detect NAN's in the
173 | array. You can read details of all the implementations
174 | [here](https://github.com/intel/x86-simd-sort/blob/main/src/README.md).
175 | 
176 | ## Performance comparison on AVX-512: `object_qsort` v/s `std::sort`
177 | Performance of `object_qsort` can vary significantly depending on the definition
178 | of the custom class and we highly recommend benchmarking before using it. For
179 | the sake of illustration, we provide a few examples in
180 | [./benchmarks/bench-objsort.hpp](./benchmarks/bench-objsort.hpp) which measures
181 | performance of `object_qsort` relative to `std::sort` when sorting an array of
182 | 3D points represented by the class: `struct Point {double x, y, z;}` and
183 | `struct Point {float x, y, z;}`. We sort these points based on several
184 | different metrics:
185 | 
186 | + sort by coordinate `x`
187 | + sort by Manhattan distance (relative to origin): `abs(x) + abs(y) + abs(z)`
188 | + sort by Euclidean distance (relative to origin): `sqrt(x*x + y*y + z*z)`
189 | + sort by Chebyshev distance (relative to origin): `max(abs(x), abs(y), abs(z))`
190 | 
191 | The performance data (shown in the plot below) can be collected by building the
192 | benchmarks suite and running `./builddir/benchexe --benchmark_filter==*obj*`.
193 | The data plot shown below was collected on a processor with AVX-512. For the
194 | simplest of cases where we want to sort an array of struct by one of its
195 | members, `object_qsort` can be up-to 5x faster for 32-bit data type and about
196 | 4x for 64-bit data type.  It tends to do even better when the metric to sort by
197 | gets more complicated. Sorting by Euclidean distance can be up-to 10x faster.
198 | 
199 | ![alt text](./misc/object_qsort-perf.jpg?raw=true)
200 | 
201 | ## Downstream projects using x86-simd-sort
202 | 
203 | - NumPy uses this as a [submodule](https://github.com/numpy/numpy/pull/22315) to accelerate `np.sort, np.argsort, np.partition and np.argpartition`.
204 | - PyTorch uses this as a [submodule](https://github.com/pytorch/pytorch/pull/127936) to accelerate `torch.sort, torch.argsort`.
205 | - A slightly modified version of this library has been integrated into [openJDK](https://github.com/openjdk/jdk/pull/14227).
206 | - [GRAPE](https://github.com/alibaba/libgrape-lite.git): C++ library for parallel graph processing.
207 | - AVX-512 version of the key-value sort has been submitted to [Oceanbase](https://github.com/oceanbase/oceanbase/pull/1325).
208 | 


--------------------------------------------------------------------------------
/.github/workflows/c-cpp.yml:
--------------------------------------------------------------------------------
  1 | name: Build and run tests
  2 | 
  3 | on:
  4 |   push:
  5 |     branches: [ "main" ]
  6 |   pull_request:
  7 |     branches: [ "main" ]
  8 | 
  9 | permissions: read-all
 10 | 
 11 | jobs:
 12 |   SKL-gcc9:
 13 |     # g++-9 release build with warnings as errors; tests run under `sde -skl`.
 14 |     runs-on: intel-ubuntu-24.04
 15 | 
 16 |     steps:
 17 |     - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
 18 | 
 19 |     - name: Install dependencies
 20 |       run: |
 21 |         sudo apt update
 22 |         sudo apt -y install g++-9 libgtest-dev meson curl git
 23 | 
 24 |     - name: Install Intel SDE
 25 |       run: |
 26 |         curl -o /tmp/sde.tar.xz https://downloadmirror.intel.com/859732/sde-external-9.58.0-2025-06-16-lin.tar.xz
 27 |         mkdir /tmp/sde && tar -xvf /tmp/sde.tar.xz -C /tmp/sde/
 28 |         sudo mv /tmp/sde/* /opt/sde && sudo ln -s /opt/sde/sde64 /usr/bin/sde
 29 | 
 30 |     - name: Build
 31 |       env:
 32 |         CXX: g++-9
 33 |       run: |
 34 |         make clean
 35 |         meson setup -Dbuild_tests=true --warnlevel 2 --werror --buildtype release builddir
 36 |         cd builddir
 37 |         ninja
 38 | 
 39 |     - name: Run test suite on SKL
 40 |       run: sde -skl -- ./builddir/testexe
 41 | 
 42 |   SKX-gcc10:
 43 |     # g++-10 release build with warnings as errors; tests run under `sde -skx`.
 44 |     runs-on: intel-ubuntu-24.04
 45 | 
 46 |     steps:
 47 |     - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
 48 | 
 49 |     - name: Install dependencies
 50 |       run: |
 51 |         sudo apt update
 52 |         sudo apt -y install g++-10 libgtest-dev meson curl git
 53 | 
 54 |     - name: Install Intel SDE
 55 |       run: |
 56 |         curl -o /tmp/sde.tar.xz https://downloadmirror.intel.com/859732/sde-external-9.58.0-2025-06-16-lin.tar.xz
 57 |         mkdir /tmp/sde && tar -xvf /tmp/sde.tar.xz -C /tmp/sde/
 58 |         sudo mv /tmp/sde/* /opt/sde && sudo ln -s /opt/sde/sde64 /usr/bin/sde
 59 | 
 60 |     - name: Build
 61 |       env:
 62 |         CXX: g++-10
 63 |       run: |
 64 |         make clean
 65 |         meson setup -Dbuild_tests=true --warnlevel 2 --werror --buildtype release builddir
 66 |         cd builddir
 67 |         ninja
 68 | 
 69 |     - name: Run test suite on SKX
 70 |       run: sde -skx -- ./builddir/testexe
 71 | 
 72 |   TGL-gcc11:
 73 |     # g++-11 release build with warnings as errors; tests run under `sde -tgl`.
 74 |     runs-on: intel-ubuntu-24.04
 75 | 
 76 |     steps:
 77 |     - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
 78 | 
 79 |     - name: Install dependencies
 80 |       run: |
 81 |         sudo apt update
 82 |         sudo apt -y install g++-11 libgtest-dev meson curl git
 83 | 
 84 |     - name: Install Intel SDE
 85 |       run: |
 86 |         curl -o /tmp/sde.tar.xz https://downloadmirror.intel.com/859732/sde-external-9.58.0-2025-06-16-lin.tar.xz
 87 |         mkdir /tmp/sde && tar -xvf /tmp/sde.tar.xz -C /tmp/sde/
 88 |         sudo mv /tmp/sde/* /opt/sde && sudo ln -s /opt/sde/sde64 /usr/bin/sde
 89 | 
 90 |     - name: Build
 91 |       env:
 92 |         CXX: g++-11
 93 |       run: |
 94 |         make clean
 95 |         meson setup -Dbuild_tests=true --warnlevel 2 --werror --buildtype release builddir
 96 |         cd builddir
 97 |         ninja
 98 |     - name: Run test suite on TGL
 99 |       run: sde -tgl -- ./builddir/testexe
100 | 
101 |   SPR-gcc13:
102 |     # g++-13: builds examples/ via make, then the meson test build; runs `sde -spr`.
103 |     runs-on: intel-ubuntu-24.04
104 | 
105 |     steps:
106 |     - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
107 | 
108 |     - name: Install dependencies
109 |       run: |
110 |         sudo apt update
111 |         sudo apt -y install g++-13 libgtest-dev meson curl git
112 | 
113 |     - name: Install Intel SDE
114 |       run: |
115 |         curl -o /tmp/sde.tar.xz https://downloadmirror.intel.com/859732/sde-external-9.58.0-2025-06-16-lin.tar.xz
116 |         mkdir /tmp/sde && tar -xvf /tmp/sde.tar.xz -C /tmp/sde/
117 |         sudo mv /tmp/sde/* /opt/sde && sudo ln -s /opt/sde/sde64 /usr/bin/sde
118 | 
119 |     - name: Build examples
120 |       env:
121 |         CXX: g++-13
122 |       run: |
123 |         cd examples
124 |         make all
125 | 
126 |     - name: Build
127 |       env:
128 |         CXX: g++-13
129 |       run: |
130 |         make clean
131 |         meson setup -Dbuild_tests=true --warnlevel 2 --werror --buildtype release builddir
132 |         cd builddir
133 |         ninja
134 | 
135 |     - name: Run test suite on SPR
136 |       run: sde -spr -- ./builddir/testexe
137 | 
138 |   ADL-ASAN-clang18:
139 |     # clang-18 build with OpenMP and address+undefined sanitizers; runs `sde -adl`.
140 |     runs-on: intel-ubuntu-24.04
141 |     # -Dasan_ci_dont_validate=true is passed to meson below; warnings are off (--warnlevel 0).
142 |     steps:
143 |     - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
144 | 
145 |     - name: Install dependencies
146 |       run: |
147 |         sudo apt update
148 |         sudo apt -y install clang-18 libomp-18-dev libgtest-dev meson curl git
149 | 
150 |     - name: Install Intel SDE
151 |       run: |
152 |         curl -o /tmp/sde.tar.xz https://downloadmirror.intel.com/859732/sde-external-9.58.0-2025-06-16-lin.tar.xz
153 |         mkdir /tmp/sde && tar -xvf /tmp/sde.tar.xz -C /tmp/sde/
154 |         sudo mv /tmp/sde/* /opt/sde && sudo ln -s /opt/sde/sde64 /usr/bin/sde
155 | 
156 |     - name: Build examples
157 |       env:
158 |         CXX: clang++-18
159 |       run: |
160 |         cd examples
161 |         make all
162 | 
163 |     - name: Build
164 |       env:
165 |         CXX: clang++-18
166 |       run: |
167 |         make clean
168 |         meson setup -Dbuild_tests=true -Duse_openmp=true -Db_sanitize=address,undefined -Dfatal_sanitizers=true  -Dasan_ci_dont_validate=true -Db_lundef=false --warnlevel 0 --buildtype release builddir
169 |         cd builddir
170 |         ninja
171 | 
172 |     - name: Run test suite on ADL
173 |       run: sde -adl -- ./builddir/testexe
174 | 
175 |   SPR-ASAN-clang18:
176 |     # clang-18 OpenMP + sanitizer build; full run on `sde -spr`, filtered run on `sde -icx`.
177 |     runs-on: intel-ubuntu-24.04
178 | 
179 |     steps:
180 |     - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
181 | 
182 |     - name: Install dependencies
183 |       run: |
184 |         sudo apt update
185 |         sudo apt -y install clang-18 libomp-18-dev libgtest-dev meson curl git
186 | 
187 |     - name: Install Intel SDE
188 |       run: |
189 |         curl -o /tmp/sde.tar.xz https://downloadmirror.intel.com/859732/sde-external-9.58.0-2025-06-16-lin.tar.xz
190 |         mkdir /tmp/sde && tar -xvf /tmp/sde.tar.xz -C /tmp/sde/
191 |         sudo mv /tmp/sde/* /opt/sde && sudo ln -s /opt/sde/sde64 /usr/bin/sde
192 | 
193 |     - name: Build examples
194 |       env:
195 |         CXX: clang++-18
196 |       run: |
197 |         cd examples
198 |         make all
199 | 
200 |     - name: Build
201 |       env:
202 |         CXX: clang++-18
203 |       run: |
204 |         make clean
205 |         meson setup -Dbuild_tests=true -Duse_openmp=true -Db_sanitize=address,undefined -Dfatal_sanitizers=true  -Dasan_ci_dont_validate=true -Db_lundef=false --warnlevel 0 --buildtype release builddir
206 |         cd builddir
207 |         ninja
208 | 
209 |     - name: Run test suite on SPR
210 |       run: sde -spr -- ./builddir/testexe
211 |     - name: Run ICL fp16 tests
212 |       # Note: This filters for the _Float16 tests based on the number assigned to it, which could change in the future
213 |       run: sde -icx -- ./builddir/testexe --gtest_filter="*/simdsort/2*"
214 | 
215 |   SKX-SKL-openmp:
216 |     # g++-10 build with -Duse_openmp=true; one binary is run under both `-skx` and `-skl`.
217 |     runs-on: intel-ubuntu-24.04
218 | 
219 |     steps:
220 |     - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
221 | 
222 |     - name: Install dependencies
223 |       run: |
224 |         sudo apt update
225 |         sudo apt -y install g++-10 libgtest-dev meson curl git
226 | 
227 |     - name: Install Intel SDE
228 |       run: |
229 |         curl -o /tmp/sde.tar.xz https://downloadmirror.intel.com/859732/sde-external-9.58.0-2025-06-16-lin.tar.xz
230 |         mkdir /tmp/sde && tar -xvf /tmp/sde.tar.xz -C /tmp/sde/
231 |         sudo mv /tmp/sde/* /opt/sde && sudo ln -s /opt/sde/sde64 /usr/bin/sde
232 | 
233 |     - name: Build
234 |       env:
235 |         CXX: g++-10
236 |       run: |
237 |         make clean
238 |         meson setup -Dbuild_tests=true -Duse_openmp=true --warnlevel 2 --werror --buildtype release builddir
239 |         cd builddir
240 |         ninja
241 | 
242 |     - name: Run test suite on SKX and SKL
243 |       run: |
244 |         sde -skx -- ./builddir/testexe
245 |         sde -skl -- ./builddir/testexe
246 | 
247 |   SPR-gcc13-special-cases:
248 |     # g++-13 build with extra -D macros in CXXFLAGS; also prints the .so's exported symbols.
249 |     runs-on: intel-ubuntu-24.04
250 | 
251 |     steps:
252 |     - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
253 | 
254 |     - name: Install dependencies
255 |       run: |
256 |         sudo apt update
257 |         sudo apt -y install g++-13 libgtest-dev meson curl git
258 | 
259 |     - name: Install Intel SDE
260 |       run: |
261 |         curl -o /tmp/sde.tar.xz https://downloadmirror.intel.com/859732/sde-external-9.58.0-2025-06-16-lin.tar.xz
262 |         mkdir /tmp/sde && tar -xvf /tmp/sde.tar.xz -C /tmp/sde/
263 |         sudo mv /tmp/sde/* /opt/sde && sudo ln -s /opt/sde/sde64 /usr/bin/sde
264 | 
265 |     - name: Build
266 |       env:
267 |         CXX: g++-13
268 |         CXXFLAGS: "-DXSS_MINIMAL_NETWORK_SORT -DXSS_TEST_KEYVALUE_BASE_CASE"
269 |       run: |
270 |         make clean
271 |         meson setup -Dbuild_tests=true --warnlevel 2 --werror --buildtype release builddir
272 |         cd builddir
273 |         ninja
274 | 
275 |     - name: List exported symbols
276 |       run: |
277 |         nm --demangle --dynamic --defined-only --extern-only builddir/libx86simdsortcpp.so
278 | 
279 |     - name: Run test suite on SPR
280 |       run: sde -spr -- ./builddir/testexe
281 | 
282 |   manylinux-32bit:
283 |     # Runs build-test-on-32bit.sh inside the manylinux2014_i686 container (repo mounted at /xss).
284 |     runs-on: intel-ubuntu-24.04
285 | 
286 |     steps:
287 |     - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
288 | 
289 |     - name: Build and test on 32-bit manylinux2014
290 |       run: |
291 |         docker run -v $(pwd):/xss quay.io/pypa/manylinux2014_i686 \
292 |         /bin/bash -xc "source /xss/.github/workflows/build-test-on-32bit.sh"
293 | 
294 |   SPR-icpx:
295 |     # Intel icpx build (-fp-model=precise). NOTE(review): apt below skips repository authentication.
296 |     runs-on: intel-ubuntu-24.04
297 |     # NOTE(review): the last step is named "on SPR" but runs testexe directly, without sde - confirm intent.
298 |     steps:
299 |     - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
300 | 
301 |     - name: Install dependencies
302 |       run: |
303 |         echo "deb https://apt.repos.intel.com/oneapi all main" | sudo tee /etc/apt/sources.list.d/oneAPI.list
304 |         sudo add-apt-repository -y "deb https://apt.repos.intel.com/oneapi all main"
305 |         sudo apt update --allow-insecure-repositories
306 |         sudo apt --allow-unauthenticated -y install intel-oneapi-compiler-dpcpp-cpp libgtest-dev curl git python3-pip meson
307 | 
308 |     - name: Install Intel SDE
309 |       run: |
310 |         #INTEL_SDE_URL=$(curl -s https://www.intel.com/content/www/us/en/download/684897/813591/intel-software-development-emulator.html | grep -Po 'https://downloadmirror.intel.com/.*lin.tar.xz(?=")')
311 |         #curl -o /tmp/sde.tar.xz $INTEL_SDE_URL
312 |         curl -o /tmp/sde.tar.xz https://downloadmirror.intel.com/859732/sde-external-9.58.0-2025-06-16-lin.tar.xz
313 |         mkdir /tmp/sde && tar -xvf /tmp/sde.tar.xz -C /tmp/sde/
314 |         sudo mv /tmp/sde/* /opt/sde && sudo ln -s /opt/sde/sde64 /usr/bin/sde
315 | 
316 |     - name: Build examples
317 |       env:
318 |         CXX: icpx
319 |         CXXFLAGS: -fp-model=precise
320 |       run: |
321 |         source /opt/intel/oneapi/setvars.sh
322 |         cd examples
323 |         make all
324 | 
325 |     - name: Build
326 |       env:
327 |         CXX: icpx
328 |         CXXFLAGS: -fp-model=precise
329 |       run: |
330 |         make clean
331 |         source /opt/intel/oneapi/setvars.sh
332 |         icpx --version
333 |         meson setup -Dbuild_tests=true --warnlevel 2 --werror --buildtype release builddir
334 |         cd builddir
335 |         ninja
336 | 
337 |     - name: Run test suite on SPR
338 |       run: |
339 |         source /opt/intel/oneapi/setvars.sh
340 |         ./builddir/testexe
341 | 


--------------------------------------------------------------------------------
/tests/test-qsort.cpp:
--------------------------------------------------------------------------------
  1 | /*******************************************
  2 |  * * Copyright (C) 2022 Intel Corporation
  3 |  * * SPDX-License-Identifier: BSD-3-Clause
  4 |  * *******************************************/
  5 | 
  6 | #include "test-qsort-common.h"
  7 | 
  8 | template <typename T>
  9 | class simdsort : public ::testing::Test {
 10 | public:
 11 |     // Names the array kinds under test and fills both size lists with 0..1023.
 12 |     simdsort()
 13 |         : arrtype {"random",
 14 |                    "constant",
 15 |                    "sorted",
 16 |                    "reverse",
 17 |                    "smallrange",
 18 |                    "max_at_the_end",
 19 |                    "random_5d",
 20 |                    "rand_max",
 21 |                    "rand_with_nan",
 22 |                    "rand_with_max_and_nan"}
 23 |     {
 24 |         std::iota(arrsize.begin(), arrsize.end(), 0);
 25 |         std::iota(arrsize_long.begin(), arrsize_long.end(), 0);
 26 | #ifdef XSS_USE_OPENMP
 27 |         // Larger inputs are appended only to exercise the OpenMP logic.
 28 |         for (size_t extra : {10'000u, 100'000u, 1'000'000u})
 29 |             arrsize_long.push_back(extra);
 30 | #endif
 31 |     }
 32 |     // arrsize_long feeds the sort tests; arrsize feeds select/partial tests.
 33 |     std::vector<std::string> arrtype;
 34 |     std::vector<size_t> arrsize = std::vector<size_t>(1024);
 35 |     std::vector<size_t> arrsize_long = std::vector<size_t>(1024);
 36 | };
 37 | 
 38 | TYPED_TEST_SUITE_P(simdsort);
 39 | 
 40 | TYPED_TEST_P(simdsort, test_qsort_ascending)
 41 | {
 42 |     // x86simdsort::qsort must agree with std::sort for every kind and size.
 43 |     for (auto kind : this->arrtype) {
 44 |         const bool nans = is_nan_test(kind);
 45 |         for (auto len : this->arrsize_long) {
 46 |             std::vector<TypeParam> seed = get_array<TypeParam>(kind, len);
 47 |             std::vector<TypeParam> vec(seed);
 48 |             std::vector<TypeParam> ref(vec);
 49 | 
 50 |             // No `descending` argument is passed: ascending order.
 51 |             x86simdsort::qsort(vec.data(), vec.size(), nans);
 52 | #ifndef XSS_ASAN_CI_NOCHECK
 53 |             // Reference result; skipped when XSS_ASAN_CI_NOCHECK is defined.
 54 |             auto ascending = compare<TypeParam, std::less<TypeParam>>();
 55 |             std::sort(ref.begin(), ref.end(), ascending);
 56 |             IS_SORTED(ref, vec, kind);
 57 | #endif
 58 |             vec.clear();
 59 |             ref.clear();
 60 |         }
 61 |     }
 62 | }
 63 | 
 64 | TYPED_TEST_P(simdsort, test_qsort_descending)
 65 | {
 66 |     // Descending qsort must agree with std::sort using a greater-than order.
 67 |     for (auto kind : this->arrtype) {
 68 |         const bool nans = is_nan_test(kind);
 69 |         for (auto len : this->arrsize_long) {
 70 |             std::vector<TypeParam> seed = get_array<TypeParam>(kind, len);
 71 |             std::vector<TypeParam> vec(seed);
 72 |             std::vector<TypeParam> ref(vec);
 73 | 
 74 |             // The trailing `true` requests descending order.
 75 |             x86simdsort::qsort(vec.data(), vec.size(), nans, true);
 76 | #ifndef XSS_ASAN_CI_NOCHECK
 77 |             // Reference result; skipped when XSS_ASAN_CI_NOCHECK is defined.
 78 |             auto descending = compare<TypeParam, std::greater<TypeParam>>();
 79 |             std::sort(ref.begin(), ref.end(), descending);
 80 |             IS_SORTED(ref, vec, kind);
 81 | #endif
 82 |             vec.clear();
 83 |             ref.clear();
 84 |         }
 85 |     }
 86 | }
 87 | 
 88 | TYPED_TEST_P(simdsort, test_argsort_ascending)
 89 | {
 90 |     // Indices from argsort must order the input like std::sort orders a copy.
 91 |     for (auto kind : this->arrtype) {
 92 |         const bool nans = is_nan_test(kind);
 93 |         for (auto len : this->arrsize_long) {
 94 |             std::vector<TypeParam> vec = get_array<TypeParam>(kind, len);
 95 |             std::vector<TypeParam> ref(vec);
 96 | 
 97 |             auto idx = x86simdsort::argsort(vec.data(), vec.size(), nans);
 98 | #ifndef XSS_ASAN_CI_NOCHECK
 99 |             auto ascending = compare<TypeParam, std::less<TypeParam>>();
100 |             std::sort(ref.begin(), ref.end(), ascending);
101 |             IS_ARG_SORTED(ref, vec, idx, kind);
102 | #endif
103 |             vec.clear();
104 |             idx.clear();
105 |         }
106 |     }
107 | }
108 | 
109 | TYPED_TEST_P(simdsort, test_argsort_descending)
110 | {
111 |     // Same check as the ascending argsort test, with a greater-than order.
112 |     for (auto kind : this->arrtype) {
113 |         const bool nans = is_nan_test(kind);
114 |         for (auto len : this->arrsize_long) {
115 |             std::vector<TypeParam> vec = get_array<TypeParam>(kind, len);
116 |             std::vector<TypeParam> ref(vec);
117 | 
118 |             // The trailing `true` requests descending order.
119 |             auto idx = x86simdsort::argsort(vec.data(), vec.size(), nans, true);
120 | #ifndef XSS_ASAN_CI_NOCHECK
121 |             auto descending = compare<TypeParam, std::greater<TypeParam>>();
122 |             std::sort(ref.begin(), ref.end(), descending);
123 |             IS_ARG_SORTED(ref, vec, idx, kind);
124 | #endif
125 |             vec.clear();
126 |             idx.clear();
127 |         }
128 |     }
129 | }
130 | 
131 | TYPED_TEST_P(simdsort, test_qselect_ascending)
132 | {
133 |     // qselect must partition around the k-th value found by std::nth_element.
134 |     for (auto kind : this->arrtype) {
135 |         const bool nans = is_nan_test(kind);
136 |         for (auto len : this->arrsize) {
137 |             // Pick k in [0, len); it stays 0 for an empty array.
138 |             size_t k = 0;
139 |             if (len != 0) k = rand() % len;
140 |             std::vector<TypeParam> seed = get_array<TypeParam>(kind, len);
141 |             std::vector<TypeParam> vec(seed);
142 |             std::vector<TypeParam> ref(vec);
143 | 
144 |             x86simdsort::qselect(vec.data(), k, vec.size(), nans);
145 | #ifndef XSS_ASAN_CI_NOCHECK
146 |             auto ascending = compare<TypeParam, std::less<TypeParam>>();
147 |             std::nth_element(
148 |                     ref.begin(), ref.begin() + k, ref.end(), ascending);
149 |             if (len == 0) continue;
150 |             IS_ARR_PARTITIONED(vec, k, ref[k], kind);
151 | #endif
152 |             vec.clear();
153 |             ref.clear();
154 |         }
155 |     }
156 | }
157 | 
158 | TYPED_TEST_P(simdsort, test_qselect_descending)
159 | {
160 |     // Descending qselect, checked against std::nth_element with greater-than.
161 |     for (auto kind : this->arrtype) {
162 |         const bool nans = is_nan_test(kind);
163 |         for (auto len : this->arrsize) {
164 |             // Pick k in [0, len); it stays 0 for an empty array.
165 |             size_t k = 0;
166 |             if (len != 0) k = rand() % len;
167 |             std::vector<TypeParam> seed = get_array<TypeParam>(kind, len);
168 |             std::vector<TypeParam> vec(seed);
169 |             std::vector<TypeParam> ref(vec);
170 | 
171 |             x86simdsort::qselect(vec.data(), k, vec.size(), nans, true);
172 | #ifndef XSS_ASAN_CI_NOCHECK
173 |             auto descending = compare<TypeParam, std::greater<TypeParam>>();
174 |             std::nth_element(
175 |                     ref.begin(), ref.begin() + k, ref.end(), descending);
176 |             if (len == 0) continue;
177 |             IS_ARR_PARTITIONED(vec, k, ref[k], kind, true);
178 | #endif
179 |             vec.clear();
180 |             ref.clear();
181 |         }
182 |     }
183 | }
184 | 
185 | TYPED_TEST_P(simdsort, test_argselect)
186 | {
187 |     // argselect indices must partition the input around the k-th sorted value.
188 |     for (auto kind : this->arrtype) {
189 |         const bool nans = is_nan_test(kind);
190 |         for (auto len : this->arrsize) {
191 |             size_t k = 0;
192 |             if (len != 0) k = rand() % len;
193 |             std::vector<TypeParam> vec = get_array<TypeParam>(kind, len);
194 |             std::vector<TypeParam> ref(vec);
195 | 
196 |             auto idx = x86simdsort::argselect(vec.data(), k, vec.size(), nans);
197 | #ifndef XSS_ASAN_CI_NOCHECK
198 |             auto ascending = compare<TypeParam, std::less<TypeParam>>();
199 |             std::sort(ref.begin(), ref.end(), ascending);
200 |             if (len == 0) continue;
201 |             IS_ARG_PARTITIONED(vec, idx, ref[k], k, kind);
202 | #endif
203 |             vec.clear();
204 |             ref.clear();
205 |         }
206 |     }
207 | }
208 | 
209 | TYPED_TEST_P(simdsort, test_partial_qsort_ascending)
210 | {
211 |     // partial_qsort is checked against a fully std::sort-ed copy of the input.
212 |     for (auto kind : this->arrtype) {
213 |         const bool nans = is_nan_test(kind);
214 |         for (auto len : this->arrsize) {
215 |             // Pick k in [0, len); it stays 0 for an empty array.
216 |             size_t k = 0;
217 |             if (len != 0) k = rand() % len;
218 |             std::vector<TypeParam> seed = get_array<TypeParam>(kind, len);
219 |             std::vector<TypeParam> vec(seed);
220 |             std::vector<TypeParam> ref(vec);
221 | 
222 |             x86simdsort::partial_qsort(vec.data(), k, vec.size(), nans);
223 | #ifndef XSS_ASAN_CI_NOCHECK
224 |             auto ascending = compare<TypeParam, std::less<TypeParam>>();
225 |             std::sort(ref.begin(), ref.end(), ascending);
226 |             if (len == 0) continue;
227 |             IS_ARR_PARTIALSORTED(vec, k, ref, kind);
228 | #endif
229 |             vec.clear();
230 |             ref.clear();
231 |         }
232 |     }
233 | }
234 | 
235 | TYPED_TEST_P(simdsort, test_partial_qsort_descending)
236 | {
237 |     // Descending partial_qsort, checked against a greater-than sorted copy.
238 |     for (auto kind : this->arrtype) {
239 |         const bool nans = is_nan_test(kind);
240 |         for (auto len : this->arrsize) {
241 |             // Pick k in [0, len); it stays 0 for an empty array.
242 |             size_t k = 0;
243 |             if (len != 0) k = rand() % len;
244 |             std::vector<TypeParam> seed = get_array<TypeParam>(kind, len);
245 |             std::vector<TypeParam> vec(seed);
246 |             std::vector<TypeParam> ref(vec);
247 | 
248 |             x86simdsort::partial_qsort(vec.data(), k, vec.size(), nans, true);
249 | #ifndef XSS_ASAN_CI_NOCHECK
250 |             auto descending = compare<TypeParam, std::greater<TypeParam>>();
251 |             std::sort(ref.begin(), ref.end(), descending);
252 |             if (len == 0) continue;
253 |             IS_ARR_PARTIALSORTED(vec, k, ref, kind);
254 | #endif
255 |             vec.clear();
256 |             ref.clear();
257 |         }
258 |     }
259 | }
260 | // Pins compare<> for floating-point types: NaN ranks above infinity, NaN == NaN.
261 | TYPED_TEST_P(simdsort, test_comparator)
262 | {
263 |     if constexpr (xss::fp::is_floating_point_v<TypeParam>) {
264 |         auto less = compare<TypeParam, std::less<TypeParam>>();
265 |         auto leq = compare<TypeParam, std::less_equal<TypeParam>>();
266 |         auto greater = compare<TypeParam, std::greater<TypeParam>>();
267 |         auto geq = compare<TypeParam, std::greater_equal<TypeParam>>();
268 |         auto equal = compare<TypeParam, std::equal_to<TypeParam>>();
269 |         TypeParam nan = xss::fp::quiet_NaN<TypeParam>();
270 |         TypeParam inf = xss::fp::infinity<TypeParam>();
271 |         ASSERT_EQ(less(nan, inf), false);
272 |         ASSERT_EQ(less(nan, nan), false);
273 |         ASSERT_EQ(less(inf, nan), true);
274 |         ASSERT_EQ(less(inf, inf), false);
275 |         ASSERT_EQ(leq(nan, inf), false);
276 |         ASSERT_EQ(leq(nan, nan), true);
277 |         ASSERT_EQ(leq(inf, nan), true);
278 |         ASSERT_EQ(leq(inf, inf), true);
279 |         ASSERT_EQ(geq(nan, inf), true);
280 |         ASSERT_EQ(geq(nan, nan), true);
281 |         ASSERT_EQ(geq(inf, nan), false);
282 |         ASSERT_EQ(geq(inf, inf), true);
283 |         ASSERT_EQ(greater(nan, inf), true);
284 |         ASSERT_EQ(greater(nan, nan), false);
285 |         ASSERT_EQ(greater(inf, nan), false);
286 |         ASSERT_EQ(greater(inf, inf), false);
287 |         ASSERT_EQ(equal(nan, inf), false);
288 |         ASSERT_EQ(equal(nan, nan), true);
289 |         ASSERT_EQ(equal(inf, nan), false);
290 |         ASSERT_EQ(equal(inf, inf), true);
291 |     }
292 | }
293 | // Registers the simdsort test cases by name for the typed-test instantiation below.
294 | REGISTER_TYPED_TEST_SUITE_P(simdsort,
295 |                             test_qsort_ascending,
296 |                             test_qsort_descending,
297 |                             test_argsort_ascending,
298 |                             test_argsort_descending,
299 |                             test_argselect,
300 |                             test_qselect_ascending,
301 |                             test_qselect_descending,
302 |                             test_partial_qsort_ascending,
303 |                             test_partial_qsort_descending,
304 |                             test_comparator);
305 | // Element types the suite is instantiated with; _Float16 is listed only if the compiler check passes.
306 | using QSortTestTypes = testing::Types<uint16_t,
307 |                                       int16_t,
308 | // support for _Float16 is incomplete in gcc-12, clang < 6
309 | #if __GNUC__ >= 13 || __clang_major__ >= 6
310 |                                       _Float16,
311 | #endif
312 |                                       float,
313 |                                       double,
314 |                                       uint32_t,
315 |                                       int32_t,
316 |                                       uint64_t,
317 |                                       int64_t>;
318 | // Instantiates the registered simdsort tests for each type in QSortTestTypes, under the prefix "xss".
319 | INSTANTIATE_TYPED_TEST_SUITE_P(xss, simdsort, QSortTestTypes);
320 | 


--------------------------------------------------------------------------------
/src/xss-optimal-networks.hpp:
--------------------------------------------------------------------------------
  1 | // The sorting networks in this file are generated from the optimal networks
  2 | // listed at https://bertdobbelaere.github.io/sorting_networks.html
  3 | // 4-input network: 5 comparator::COEX calls in 3 layers over vecs[0..3].
  4 | template <typename vtype,      // only used to supply the default reg_t
  5 |           typename comparator, // must provide a static COEX(a, b); defined elsewhere
  6 |           typename reg_t = typename vtype::reg_t>
  7 | X86_SIMD_SORT_FINLINE void optimal_sort_4(reg_t *vecs) // vecs: indexed 0..3
  8 | { // NOTE(review): resulting order presumably follows comparator::COEX -- confirm there
  9 |     comparator::COEX(vecs[0], vecs[2]); // layer 1 of 3; calls in a layer touch disjoint registers
 10 |     comparator::COEX(vecs[1], vecs[3]);
 11 |     // layer 2 of 3; layers reuse registers, so the layer order must be kept
 12 |     comparator::COEX(vecs[0], vecs[1]);
 13 |     comparator::COEX(vecs[2], vecs[3]);
 14 |     // layer 3 of 3
 15 |     comparator::COEX(vecs[1], vecs[2]);
 16 | }
 17 | // 8-input network: 19 comparator::COEX calls in 6 layers over vecs[0..7].
 18 | template <typename vtype,      // only used to supply the default reg_t
 19 |           typename comparator, // must provide a static COEX(a, b); defined elsewhere
 20 |           typename reg_t = typename vtype::reg_t>
 21 | X86_SIMD_SORT_FINLINE void optimal_sort_8(reg_t *vecs) // vecs: indexed 0..7
 22 | { // NOTE(review): resulting order presumably follows comparator::COEX -- confirm there
 23 |     comparator::COEX(vecs[0], vecs[2]); // layer 1 of 6; calls in a layer touch disjoint registers
 24 |     comparator::COEX(vecs[1], vecs[3]);
 25 |     comparator::COEX(vecs[4], vecs[6]);
 26 |     comparator::COEX(vecs[5], vecs[7]);
 27 |     // layer 2 of 6; layers reuse registers, so the layer order must be kept
 28 |     comparator::COEX(vecs[0], vecs[4]);
 29 |     comparator::COEX(vecs[1], vecs[5]);
 30 |     comparator::COEX(vecs[2], vecs[6]);
 31 |     comparator::COEX(vecs[3], vecs[7]);
 32 |     // layer 3 of 6
 33 |     comparator::COEX(vecs[0], vecs[1]);
 34 |     comparator::COEX(vecs[2], vecs[3]);
 35 |     comparator::COEX(vecs[4], vecs[5]);
 36 |     comparator::COEX(vecs[6], vecs[7]);
 37 |     // layer 4 of 6
 38 |     comparator::COEX(vecs[2], vecs[4]);
 39 |     comparator::COEX(vecs[3], vecs[5]);
 40 |     // layer 5 of 6
 41 |     comparator::COEX(vecs[1], vecs[4]);
 42 |     comparator::COEX(vecs[3], vecs[6]);
 43 |     // layer 6 of 6
 44 |     comparator::COEX(vecs[1], vecs[2]);
 45 |     comparator::COEX(vecs[3], vecs[4]);
 46 |     comparator::COEX(vecs[5], vecs[6]);
 47 | }
 48 | // 16-input network: 60 comparator::COEX calls in 10 layers over vecs[0..15].
 49 | template <typename vtype,      // only used to supply the default reg_t
 50 |           typename comparator, // must provide a static COEX(a, b); defined elsewhere
 51 |           typename reg_t = typename vtype::reg_t>
 52 | X86_SIMD_SORT_FINLINE void optimal_sort_16(reg_t *vecs) // vecs: indexed 0..15
 53 | { // NOTE(review): resulting order presumably follows comparator::COEX -- confirm there
 54 |     comparator::COEX(vecs[0], vecs[13]); // layer 1 of 10; calls in a layer touch disjoint registers
 55 |     comparator::COEX(vecs[1], vecs[12]);
 56 |     comparator::COEX(vecs[2], vecs[15]);
 57 |     comparator::COEX(vecs[3], vecs[14]);
 58 |     comparator::COEX(vecs[4], vecs[8]);
 59 |     comparator::COEX(vecs[5], vecs[6]);
 60 |     comparator::COEX(vecs[7], vecs[11]);
 61 |     comparator::COEX(vecs[9], vecs[10]);
 62 |     // layer 2 of 10; layers reuse registers, so the layer order must be kept
 63 |     comparator::COEX(vecs[0], vecs[5]);
 64 |     comparator::COEX(vecs[1], vecs[7]);
 65 |     comparator::COEX(vecs[2], vecs[9]);
 66 |     comparator::COEX(vecs[3], vecs[4]);
 67 |     comparator::COEX(vecs[6], vecs[13]);
 68 |     comparator::COEX(vecs[8], vecs[14]);
 69 |     comparator::COEX(vecs[10], vecs[15]);
 70 |     comparator::COEX(vecs[11], vecs[12]);
 71 |     // layer 3 of 10
 72 |     comparator::COEX(vecs[0], vecs[1]);
 73 |     comparator::COEX(vecs[2], vecs[3]);
 74 |     comparator::COEX(vecs[4], vecs[5]);
 75 |     comparator::COEX(vecs[6], vecs[8]);
 76 |     comparator::COEX(vecs[7], vecs[9]);
 77 |     comparator::COEX(vecs[10], vecs[11]);
 78 |     comparator::COEX(vecs[12], vecs[13]);
 79 |     comparator::COEX(vecs[14], vecs[15]);
 80 |     // layer 4 of 10
 81 |     comparator::COEX(vecs[0], vecs[2]);
 82 |     comparator::COEX(vecs[1], vecs[3]);
 83 |     comparator::COEX(vecs[4], vecs[10]);
 84 |     comparator::COEX(vecs[5], vecs[11]);
 85 |     comparator::COEX(vecs[6], vecs[7]);
 86 |     comparator::COEX(vecs[8], vecs[9]);
 87 |     comparator::COEX(vecs[12], vecs[14]);
 88 |     comparator::COEX(vecs[13], vecs[15]);
 89 |     // layer 5 of 10
 90 |     comparator::COEX(vecs[1], vecs[2]);
 91 |     comparator::COEX(vecs[3], vecs[12]);
 92 |     comparator::COEX(vecs[4], vecs[6]);
 93 |     comparator::COEX(vecs[5], vecs[7]);
 94 |     comparator::COEX(vecs[8], vecs[10]);
 95 |     comparator::COEX(vecs[9], vecs[11]);
 96 |     comparator::COEX(vecs[13], vecs[14]);
 97 |     // layer 6 of 10
 98 |     comparator::COEX(vecs[1], vecs[4]);
 99 |     comparator::COEX(vecs[2], vecs[6]);
100 |     comparator::COEX(vecs[5], vecs[8]);
101 |     comparator::COEX(vecs[7], vecs[10]);
102 |     comparator::COEX(vecs[9], vecs[13]);
103 |     comparator::COEX(vecs[11], vecs[14]);
104 |     // layer 7 of 10
105 |     comparator::COEX(vecs[2], vecs[4]);
106 |     comparator::COEX(vecs[3], vecs[6]);
107 |     comparator::COEX(vecs[9], vecs[12]);
108 |     comparator::COEX(vecs[11], vecs[13]);
109 |     // layer 8 of 10
110 |     comparator::COEX(vecs[3], vecs[5]);
111 |     comparator::COEX(vecs[6], vecs[8]);
112 |     comparator::COEX(vecs[7], vecs[9]);
113 |     comparator::COEX(vecs[10], vecs[12]);
114 |     // layer 9 of 10
115 |     comparator::COEX(vecs[3], vecs[4]);
116 |     comparator::COEX(vecs[5], vecs[6]);
117 |     comparator::COEX(vecs[7], vecs[8]);
118 |     comparator::COEX(vecs[9], vecs[10]);
119 |     comparator::COEX(vecs[11], vecs[12]);
120 |     // layer 10 of 10
121 |     comparator::COEX(vecs[6], vecs[7]);
122 |     comparator::COEX(vecs[8], vecs[9]);
123 | }
124 | // 32-input network: 185 comparator::COEX calls in 14 layers over vecs[0..31].
125 | template <typename vtype,      // only used to supply the default reg_t
126 |           typename comparator, // must provide a static COEX(a, b); defined elsewhere
127 |           typename reg_t = typename vtype::reg_t>
128 | X86_SIMD_SORT_FINLINE void optimal_sort_32(reg_t *vecs) // vecs: indexed 0..31
129 | { // NOTE(review): resulting order presumably follows comparator::COEX -- confirm there
130 |     comparator::COEX(vecs[0], vecs[1]); // layer 1 of 14; calls in a layer touch disjoint registers
131 |     comparator::COEX(vecs[2], vecs[3]);
132 |     comparator::COEX(vecs[4], vecs[5]);
133 |     comparator::COEX(vecs[6], vecs[7]);
134 |     comparator::COEX(vecs[8], vecs[9]);
135 |     comparator::COEX(vecs[10], vecs[11]);
136 |     comparator::COEX(vecs[12], vecs[13]);
137 |     comparator::COEX(vecs[14], vecs[15]);
138 |     comparator::COEX(vecs[16], vecs[17]);
139 |     comparator::COEX(vecs[18], vecs[19]);
140 |     comparator::COEX(vecs[20], vecs[21]);
141 |     comparator::COEX(vecs[22], vecs[23]);
142 |     comparator::COEX(vecs[24], vecs[25]);
143 |     comparator::COEX(vecs[26], vecs[27]);
144 |     comparator::COEX(vecs[28], vecs[29]);
145 |     comparator::COEX(vecs[30], vecs[31]);
146 |     // layer 2 of 14; layers reuse registers, so the layer order must be kept
147 |     comparator::COEX(vecs[0], vecs[2]);
148 |     comparator::COEX(vecs[1], vecs[3]);
149 |     comparator::COEX(vecs[4], vecs[6]);
150 |     comparator::COEX(vecs[5], vecs[7]);
151 |     comparator::COEX(vecs[8], vecs[10]);
152 |     comparator::COEX(vecs[9], vecs[11]);
153 |     comparator::COEX(vecs[12], vecs[14]);
154 |     comparator::COEX(vecs[13], vecs[15]);
155 |     comparator::COEX(vecs[16], vecs[18]);
156 |     comparator::COEX(vecs[17], vecs[19]);
157 |     comparator::COEX(vecs[20], vecs[22]);
158 |     comparator::COEX(vecs[21], vecs[23]);
159 |     comparator::COEX(vecs[24], vecs[26]);
160 |     comparator::COEX(vecs[25], vecs[27]);
161 |     comparator::COEX(vecs[28], vecs[30]);
162 |     comparator::COEX(vecs[29], vecs[31]);
163 |     // layer 3 of 14
164 |     comparator::COEX(vecs[0], vecs[4]);
165 |     comparator::COEX(vecs[1], vecs[5]);
166 |     comparator::COEX(vecs[2], vecs[6]);
167 |     comparator::COEX(vecs[3], vecs[7]);
168 |     comparator::COEX(vecs[8], vecs[12]);
169 |     comparator::COEX(vecs[9], vecs[13]);
170 |     comparator::COEX(vecs[10], vecs[14]);
171 |     comparator::COEX(vecs[11], vecs[15]);
172 |     comparator::COEX(vecs[16], vecs[20]);
173 |     comparator::COEX(vecs[17], vecs[21]);
174 |     comparator::COEX(vecs[18], vecs[22]);
175 |     comparator::COEX(vecs[19], vecs[23]);
176 |     comparator::COEX(vecs[24], vecs[28]);
177 |     comparator::COEX(vecs[25], vecs[29]);
178 |     comparator::COEX(vecs[26], vecs[30]);
179 |     comparator::COEX(vecs[27], vecs[31]);
180 |     // layer 4 of 14
181 |     comparator::COEX(vecs[0], vecs[8]);
182 |     comparator::COEX(vecs[1], vecs[9]);
183 |     comparator::COEX(vecs[2], vecs[10]);
184 |     comparator::COEX(vecs[3], vecs[11]);
185 |     comparator::COEX(vecs[4], vecs[12]);
186 |     comparator::COEX(vecs[5], vecs[13]);
187 |     comparator::COEX(vecs[6], vecs[14]);
188 |     comparator::COEX(vecs[7], vecs[15]);
189 |     comparator::COEX(vecs[16], vecs[24]);
190 |     comparator::COEX(vecs[17], vecs[25]);
191 |     comparator::COEX(vecs[18], vecs[26]);
192 |     comparator::COEX(vecs[19], vecs[27]);
193 |     comparator::COEX(vecs[20], vecs[28]);
194 |     comparator::COEX(vecs[21], vecs[29]);
195 |     comparator::COEX(vecs[22], vecs[30]);
196 |     comparator::COEX(vecs[23], vecs[31]);
197 |     // layer 5 of 14
198 |     comparator::COEX(vecs[0], vecs[16]);
199 |     comparator::COEX(vecs[1], vecs[8]);
200 |     comparator::COEX(vecs[2], vecs[4]);
201 |     comparator::COEX(vecs[3], vecs[12]);
202 |     comparator::COEX(vecs[5], vecs[10]);
203 |     comparator::COEX(vecs[6], vecs[9]);
204 |     comparator::COEX(vecs[7], vecs[14]);
205 |     comparator::COEX(vecs[11], vecs[13]);
206 |     comparator::COEX(vecs[15], vecs[31]);
207 |     comparator::COEX(vecs[17], vecs[24]);
208 |     comparator::COEX(vecs[18], vecs[20]);
209 |     comparator::COEX(vecs[19], vecs[28]);
210 |     comparator::COEX(vecs[21], vecs[26]);
211 |     comparator::COEX(vecs[22], vecs[25]);
212 |     comparator::COEX(vecs[23], vecs[30]);
213 |     comparator::COEX(vecs[27], vecs[29]);
214 |     // layer 6 of 14
215 |     comparator::COEX(vecs[1], vecs[2]);
216 |     comparator::COEX(vecs[3], vecs[5]);
217 |     comparator::COEX(vecs[4], vecs[8]);
218 |     comparator::COEX(vecs[6], vecs[22]);
219 |     comparator::COEX(vecs[7], vecs[11]);
220 |     comparator::COEX(vecs[9], vecs[25]);
221 |     comparator::COEX(vecs[10], vecs[12]);
222 |     comparator::COEX(vecs[13], vecs[14]);
223 |     comparator::COEX(vecs[17], vecs[18]);
224 |     comparator::COEX(vecs[19], vecs[21]);
225 |     comparator::COEX(vecs[20], vecs[24]);
226 |     comparator::COEX(vecs[23], vecs[27]);
227 |     comparator::COEX(vecs[26], vecs[28]);
228 |     comparator::COEX(vecs[29], vecs[30]);
229 |     // layer 7 of 14
230 |     comparator::COEX(vecs[1], vecs[17]);
231 |     comparator::COEX(vecs[2], vecs[18]);
232 |     comparator::COEX(vecs[3], vecs[19]);
233 |     comparator::COEX(vecs[4], vecs[20]);
234 |     comparator::COEX(vecs[5], vecs[10]);
235 |     comparator::COEX(vecs[7], vecs[23]);
236 |     comparator::COEX(vecs[8], vecs[24]);
237 |     comparator::COEX(vecs[11], vecs[27]);
238 |     comparator::COEX(vecs[12], vecs[28]);
239 |     comparator::COEX(vecs[13], vecs[29]);
240 |     comparator::COEX(vecs[14], vecs[30]);
241 |     comparator::COEX(vecs[21], vecs[26]);
242 |     // layer 8 of 14
243 |     comparator::COEX(vecs[3], vecs[17]);
244 |     comparator::COEX(vecs[4], vecs[16]);
245 |     comparator::COEX(vecs[5], vecs[21]);
246 |     comparator::COEX(vecs[6], vecs[18]);
247 |     comparator::COEX(vecs[7], vecs[9]);
248 |     comparator::COEX(vecs[8], vecs[20]);
249 |     comparator::COEX(vecs[10], vecs[26]);
250 |     comparator::COEX(vecs[11], vecs[23]);
251 |     comparator::COEX(vecs[13], vecs[25]);
252 |     comparator::COEX(vecs[14], vecs[28]);
253 |     comparator::COEX(vecs[15], vecs[27]);
254 |     comparator::COEX(vecs[22], vecs[24]);
255 |     // layer 9 of 14
256 |     comparator::COEX(vecs[1], vecs[4]);
257 |     comparator::COEX(vecs[3], vecs[8]);
258 |     comparator::COEX(vecs[5], vecs[16]);
259 |     comparator::COEX(vecs[7], vecs[17]);
260 |     comparator::COEX(vecs[9], vecs[21]);
261 |     comparator::COEX(vecs[10], vecs[22]);
262 |     comparator::COEX(vecs[11], vecs[19]);
263 |     comparator::COEX(vecs[12], vecs[20]);
264 |     comparator::COEX(vecs[14], vecs[24]);
265 |     comparator::COEX(vecs[15], vecs[26]);
266 |     comparator::COEX(vecs[23], vecs[28]);
267 |     comparator::COEX(vecs[27], vecs[30]);
268 |     // layer 10 of 14
269 |     comparator::COEX(vecs[2], vecs[5]);
270 |     comparator::COEX(vecs[7], vecs[8]);
271 |     comparator::COEX(vecs[9], vecs[18]);
272 |     comparator::COEX(vecs[11], vecs[17]);
273 |     comparator::COEX(vecs[12], vecs[16]);
274 |     comparator::COEX(vecs[13], vecs[22]);
275 |     comparator::COEX(vecs[14], vecs[20]);
276 |     comparator::COEX(vecs[15], vecs[19]);
277 |     comparator::COEX(vecs[23], vecs[24]);
278 |     comparator::COEX(vecs[26], vecs[29]);
279 |     // layer 11 of 14
280 |     comparator::COEX(vecs[2], vecs[4]);
281 |     comparator::COEX(vecs[6], vecs[12]);
282 |     comparator::COEX(vecs[9], vecs[16]);
283 |     comparator::COEX(vecs[10], vecs[11]);
284 |     comparator::COEX(vecs[13], vecs[17]);
285 |     comparator::COEX(vecs[14], vecs[18]);
286 |     comparator::COEX(vecs[15], vecs[22]);
287 |     comparator::COEX(vecs[19], vecs[25]);
288 |     comparator::COEX(vecs[20], vecs[21]);
289 |     comparator::COEX(vecs[27], vecs[29]);
290 |     // layer 12 of 14
291 |     comparator::COEX(vecs[5], vecs[6]);
292 |     comparator::COEX(vecs[8], vecs[12]);
293 |     comparator::COEX(vecs[9], vecs[10]);
294 |     comparator::COEX(vecs[11], vecs[13]);
295 |     comparator::COEX(vecs[14], vecs[16]);
296 |     comparator::COEX(vecs[15], vecs[17]);
297 |     comparator::COEX(vecs[18], vecs[20]);
298 |     comparator::COEX(vecs[19], vecs[23]);
299 |     comparator::COEX(vecs[21], vecs[22]);
300 |     comparator::COEX(vecs[25], vecs[26]);
301 |     // layer 13 of 14
302 |     comparator::COEX(vecs[3], vecs[5]);
303 |     comparator::COEX(vecs[6], vecs[7]);
304 |     comparator::COEX(vecs[8], vecs[9]);
305 |     comparator::COEX(vecs[10], vecs[12]);
306 |     comparator::COEX(vecs[11], vecs[14]);
307 |     comparator::COEX(vecs[13], vecs[16]);
308 |     comparator::COEX(vecs[15], vecs[18]);
309 |     comparator::COEX(vecs[17], vecs[20]);
310 |     comparator::COEX(vecs[19], vecs[21]);
311 |     comparator::COEX(vecs[22], vecs[23]);
312 |     comparator::COEX(vecs[24], vecs[25]);
313 |     comparator::COEX(vecs[26], vecs[28]);
314 |     // layer 14 of 14
315 |     comparator::COEX(vecs[3], vecs[4]);
316 |     comparator::COEX(vecs[5], vecs[6]);
317 |     comparator::COEX(vecs[7], vecs[8]);
318 |     comparator::COEX(vecs[9], vecs[10]);
319 |     comparator::COEX(vecs[11], vecs[12]);
320 |     comparator::COEX(vecs[13], vecs[14]);
321 |     comparator::COEX(vecs[15], vecs[16]);
322 |     comparator::COEX(vecs[17], vecs[18]);
323 |     comparator::COEX(vecs[19], vecs[20]);
324 |     comparator::COEX(vecs[21], vecs[22]);
325 |     comparator::COEX(vecs[23], vecs[24]);
326 |     comparator::COEX(vecs[25], vecs[26]);
327 |     comparator::COEX(vecs[27], vecs[28]);
328 | }
329 | 


--------------------------------------------------------------------------------