├── .clang-format ├── .clang-tidy.yml ├── .editorconfig ├── .github └── workflows │ └── build-and-test.yml ├── .gitignore ├── CMakeLists.txt ├── CMakeSettings.json ├── Folder.DotSettings ├── LICENSE ├── README.md ├── bench ├── CMakeLists.txt ├── bench.cpp ├── bench_isa.h ├── fullsort │ ├── BM_fullsort.pdqsort.cpp │ ├── BM_fullsort.stdsort.cpp │ ├── BM_fullsort.vxsort.avx2.f.cpp │ ├── BM_fullsort.vxsort.avx2.i.cpp │ ├── BM_fullsort.vxsort.avx2.u.cpp │ ├── BM_fullsort.vxsort.avx512.f.cpp │ ├── BM_fullsort.vxsort.avx512.i.cpp │ ├── BM_fullsort.vxsort.avx512.u.cpp │ ├── BM_fullsort.vxsort.h │ ├── BM_fullsort_strided.avx2.cpp │ ├── BM_fullsort_strided.avx512.cpp │ └── fullsort_params.h ├── internal_macros.h ├── make-figure.py ├── prep.sh ├── reference │ └── pdqsort.h ├── requirements.txt ├── run.cmd ├── run.sh ├── smallsort │ ├── BM_blacher.avx2.cpp │ ├── BM_smallsort.avx2.cpp │ ├── BM_smallsort.avx512.cpp │ └── BM_smallsort.h ├── stolen-cycleclock.h ├── util.cpp └── util.h ├── build.sh ├── clang-tidy.sh ├── cmake ├── CPM.cmake ├── ConfigSafeGuards.cmake ├── EnableLocalGtestDiscovery.cmake ├── GetHostType.cmake └── Modules │ ├── FindLLVMAr.cmake │ ├── FindLLVMNm.cmake │ └── FindLLVMRanLib.cmake ├── demo ├── CMakeLists.txt ├── demo.cpp ├── do_avx2.cpp └── do_avx512.cpp ├── tests ├── CMakeLists.txt ├── fullsort │ ├── fullsort.avx2.cpp │ ├── fullsort.avx512.cpp │ └── fullsort_test.h ├── gtest_main.cpp ├── mini_tests │ ├── masked_load_store.avx2.cpp │ ├── masked_load_store.avx512.cpp │ ├── masked_load_store.sanity.cpp │ ├── masked_load_store_test.h │ ├── mini_fixtures.h │ ├── pack_machine.avx2.cpp │ ├── pack_machine.avx512.cpp │ ├── pack_machine_test.h │ ├── partition_machine.avx2.cpp │ ├── partition_machine.avx512.cpp │ └── partition_machine_test.h ├── smallsort │ ├── smallsort.avx2.cpp │ ├── smallsort.avx512.cpp │ └── smallsort_test.h ├── sort_fixtures.h ├── test_isa.h └── util.h └── vxsort ├── CMakeLists.txt ├── alignment.h ├── compiler.h ├── defs.h ├── isa_detection.cpp ├── isa_detection.h ├── isa_detection_sane.cpp ├── pack_machine.h ├── partition_machine.avx2.h ├── partition_machine.avx512.h ├── partition_machine.h ├── smallsort ├── avx2 │ ├── bitonic_machine.avx2.f32.generated.h │ ├── bitonic_machine.avx2.f64.generated.h │ ├── bitonic_machine.avx2.h │ ├── bitonic_machine.avx2.i16.generated.h │ ├── bitonic_machine.avx2.i32.generated.h │ ├── bitonic_machine.avx2.i64.generated.h │ ├── bitonic_machine.avx2.u16.generated.h │ ├── bitonic_machine.avx2.u32.generated.h │ └── bitonic_machine.avx2.u64.generated.h ├── avx512 │ ├── bitonic_machine.avx512.f32.generated.h │ ├── bitonic_machine.avx512.f64.generated.h │ ├── bitonic_machine.avx512.h │ ├── bitonic_machine.avx512.i16.generated.h │ ├── bitonic_machine.avx512.i32.generated.h │ ├── bitonic_machine.avx512.i64.generated.h │ ├── bitonic_machine.avx512.u16.generated.h │ ├── bitonic_machine.avx512.u32.generated.h │ └── bitonic_machine.avx512.u64.generated.h ├── bitonic_machine.h ├── bitonic_sort.avx2.h ├── bitonic_sort.avx512.h ├── bitonic_sort.h └── codegen │ ├── avx2.py │ ├── avx512.py │ ├── bitonic_gen.py │ ├── bitonic_isa.py │ └── utils.py ├── stats ├── vxsort_stats.cpp └── vxsort_stats.h ├── vector_machine ├── avx2 │ ├── avx2_masks.cpp │ ├── f32.h │ ├── f64.h │ ├── i16.h │ ├── i32.h │ ├── i64.h │ ├── u16.h │ ├── u32.h │ └── u64.h ├── avx512 │ ├── f32.h │ ├── f64.h │ ├── i16.h │ ├── i32.h │ ├── i64.h │ ├── u16.h │ ├── u32.h │ └── u64.h ├── machine_traits.avx2.h ├── machine_traits.avx512.h └── machine_traits.h ├── vxsort.avx2.h ├── vxsort.avx512.h 
├── vxsort.h ├── vxsort_targets_disable.h ├── vxsort_targets_enable_avx2.h └── vxsort_targets_enable_avx512.h /.clang-format: -------------------------------------------------------------------------------- 1 | --- 2 | BasedOnStyle: Chromium 3 | 4 | --- 5 | Language: Cpp 6 | ColumnLimit: 160 7 | IndentWidth: 4 8 | 9 | ... 10 |
-------------------------------------------------------------------------------- /.clang-tidy.yml: -------------------------------------------------------------------------------- 1 | --- 2 | Checks: >- 3 | clang-diagnostic-*, 4 | clang-analyzer-*, 5 | performance-*, 6 | portability-*, 7 | -portability-simd-intrinsics, 8 | bugprone-*, 9 | WarningsAsErrors: '' 10 | HeaderFilterRegex: '' 11 | AnalyzeTemporaryDtors: false 12 | FormatStyle: none 13 | User: dmg 14 | CheckOptions: 15 | - key: llvm-else-after-return.WarnOnConditionVariables 16 | value: '0' 17 | - key: modernize-loop-convert.MinConfidence 18 | value: reasonable 19 | - key: modernize-replace-auto-ptr.IncludeStyle 20 | value: llvm 21 | - key: cert-str34-c.DiagnoseSignedUnsignedCharComparisons 22 | value: '0' 23 | - key: google-readability-namespace-comments.ShortNamespaceLines 24 | value: '10' 25 | - key: cert-oop54-cpp.WarnOnlyIfThisHasSuspiciousField 26 | value: '0' 27 | - key: cppcoreguidelines-non-private-member-variables-in-classes.IgnoreClassesWithAllMemberVariablesBeingPublic 28 | value: '1' 29 | - key: cert-dcl16-c.NewSuffixes 30 | value: 'L;LL;LU;LLU' 31 | - key: google-readability-braces-around-statements.ShortStatementLines 32 | value: '1' 33 | - key: modernize-pass-by-value.IncludeStyle 34 | value: llvm 35 | - key: google-readability-namespace-comments.SpacesBeforeComments 36 | value: '2' 37 | - key: modernize-loop-convert.MaxCopySize 38 | value: '16' 39 | - key: cppcoreguidelines-explicit-virtual-functions.IgnoreDestructors 40 | value: '1' 41 | - key: modernize-use-nullptr.NullMacros 42 | value: 'NULL' 43 | - key: llvm-qualified-auto.AddConstToQualified 44 | value: '0' 45 | - key: modernize-loop-convert.NamingStyle 46 | value: CamelCase 47 | - key: llvm-else-after-return.WarnOnUnfixable 48 | value: '0' 49 | - key: google-readability-function-size.StatementThreshold 50 | value: '800' 51 |
-------------------------------------------------------------------------------- /.editorconfig: -------------------------------------------------------------------------------- 1 | # top-most EditorConfig file 2 | root = true 3 | 4 | # Unix-style newlines with a newline ending every file 5 | [*] 6 | charset = utf-8 7 | trim_trailing_whitespace = true 8 | end_of_line = lf 9 | insert_final_newline = true 10 | 11 | # Tab indentation (no size specified) 12 | [Makefile] 13 | indent_style = tab 14 | 15 | [*.{c,h,cpp,hpp}] 16 | indent_size = 4
-------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .idea 2 | build/ 3 | __pycache__ 4 | .vs 5 |
-------------------------------------------------------------------------------- /CMakeSettings.json: -------------------------------------------------------------------------------- 1 | { 2 | "configurations": [ 3 | { 4 | "name": "vs2022-clang-debug", 5 | "generator": "Ninja", 6 | "configurationType": "Debug", 7 | "inheritEnvironments": [ "clang_cl_x64" ], 8 | "buildRoot": "${projectDir}\\build\\${name}", 9 | "installRoot": "${projectDir}\\install\\vs2022-${name}", 10 | "cmakeCommandArgs": "", 11 | "buildCommandArgs": "", 12 | "ctestCommandArgs": "" 13 | }, 14
| { 15 | "name": "vs2022-clang-release", 16 | "generator": "Ninja", 17 | "configurationType": "Release", 18 | "buildRoot": "${projectDir}\\build\\${name}", 19 | "installRoot": "${projectDir}\\install\\vs2022-${name}", 20 | "cmakeCommandArgs": "", 21 | "buildCommandArgs": "", 22 | "ctestCommandArgs": "", 23 | "inheritEnvironments": [ "clang_cl_x64" ] 24 | }, 25 | { 26 | "name": "vs2022-msvc-debug", 27 | "generator": "Ninja", 28 | "configurationType": "Debug", 29 | "buildRoot": "${projectDir}\\build\\${name}", 30 | "installRoot": "${projectDir}\\install\\vs2022-${name}", 31 | "cmakeCommandArgs": "", 32 | "buildCommandArgs": "", 33 | "ctestCommandArgs": "", 34 | "inheritEnvironments": [ "msvc_x64_x64" ] 35 | }, 36 | { 37 | "name": "vs2022-msvc-release", 38 | "generator": "Ninja", 39 | "configurationType": "Release", 40 | "buildRoot": "${projectDir}\\build\\${name}", 41 | "installRoot": "${projectDir}\\install\\vs2022-${name}", 42 | "cmakeCommandArgs": "", 43 | "buildCommandArgs": "", 44 | "ctestCommandArgs": "", 45 | "inheritEnvironments": [ "msvc_x64_x64" ] 46 | }, 47 | { 48 | "name": "wsl-clang-debug", 49 | "generator": "Ninja", 50 | "configurationType": "Debug", 51 | "buildRoot": "${projectDir}\\build\\${name}", 52 | "installRoot": "${projectDir}\\out\\install\\${name}", 53 | "cmakeExecutable": "cmake", 54 | "cmakeCommandArgs": "", 55 | "buildCommandArgs": "", 56 | "ctestCommandArgs": "", 57 | "inheritEnvironments": [ "linux_clang_x64" ], 58 | "wslPath": "${defaultWSLPath}" 59 | }, 60 | { 61 | "name": "wsl-clang-release", 62 | "generator": "Ninja", 63 | "configurationType": "Release", 64 | "buildRoot": "${projectDir}\\build\\${name}", 65 | "installRoot": "${projectDir}\\out\\install\\${name}", 66 | "cmakeExecutable": "cmake", 67 | "cmakeCommandArgs": "", 68 | "buildCommandArgs": "", 69 | "ctestCommandArgs": "", 70 | "inheritEnvironments": [ "linux_clang_x64" ], 71 | "variables": [], 72 | "wslPath": "${defaultWSLPath}" 73 | } 74 | ] 75 | } -------------------------------------------------------------------------------- /Folder.DotSettings: -------------------------------------------------------------------------------- 1 |  2 | ExplicitlyExcluded 3 | ExplicitlyExcluded 4 | ExplicitlyExcluded 5 | ExplicitlyExcluded -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Dan Shechter 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 |
-------------------------------------------------------------------------------- /bench/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set(TARGET_NAME ${CMAKE_PROJECT_NAME}_bench) 2 | 3 | 4 | find_package(Threads REQUIRED) 5 | 6 | file(GLOB_RECURSE bench_sources *.cpp) 7 | file(GLOB_RECURSE bench_headers *.h) 8 | add_executable(${CMAKE_PROJECT_NAME}_bench ${bench_sources} ${bench_headers}) 9 | 10 | target_link_libraries(${TARGET_NAME} 11 | ${CMAKE_PROJECT_NAME}_lib 12 | benchmark 13 | picosha2 14 | ${CMAKE_THREAD_LIBS_INIT}) 15 | 16 | configure_file(run.sh run.sh COPYONLY) 17 | configure_file(run.cmd run.cmd COPYONLY) 18 | configure_file(make-figure.py make-figure.py COPYONLY) 19 |
-------------------------------------------------------------------------------- /bench/bench.cpp: -------------------------------------------------------------------------------- 1 | #include "benchmark/benchmark.h" 2 | 3 | using namespace std; 4 | 5 | int main(int argc, char** argv) 6 | { 7 | ::benchmark::Initialize(&argc, argv); 8 | ::benchmark::RunSpecifiedBenchmarks(); 9 | }
-------------------------------------------------------------------------------- /bench/bench_isa.h: -------------------------------------------------------------------------------- 1 | #ifndef VXSORT_BENCH_ISA_H 2 | #define VXSORT_BENCH_ISA_H 3 | 4 | #include <isa_detection.h> 5 | 6 | #define VXSORT_BENCH_ISA() \ 7 | if (!::vxsort::supports_vector_machine(sizeof(Q))) { \ 8 | state.SkipWithError("Current CPU does not support the minimal features for this benchmark"); \ 9 | return; \ 10 | } 11 | 12 | #endif //VXSORT_BENCH_ISA_H 13 |
-------------------------------------------------------------------------------- /bench/fullsort/BM_fullsort.pdqsort.cpp: -------------------------------------------------------------------------------- 1 | #include "vxsort_targets_enable_avx2.h" 2 | 3 | #include <benchmark/benchmark.h> 4 | #include <vector> 5 | 6 | #include "fullsort_params.h" 7 | #include "../util.h" 8 | #include "../reference/pdqsort.h" 9 | 10 | namespace vxsort_bench { 11 | using namespace vxsort::types; 12 | 13 | template <typename Q> 14 | static void BM_pdqsort_branchless(benchmark::State& state) { 15 | auto n = state.range(0); 16 | auto v = std::vector<Q>((i32)n); 17 | const auto ITERATIONS = 10; 18 | 19 | generate_unique_values_vec(v, (Q)0x1000, (Q)8); 20 | auto copies = generate_copies(ITERATIONS, n, v); 21 | auto begins = generate_array_beginnings(copies); 22 | auto ends = generate_array_beginnings(copies); 23 | for (usize i = 0; i < copies.size(); i++) 24 | ends[i] = begins[i] + n - 1; 25 | 26 | vxsort::u64 total_cycles = 0; 27 | for (auto _ : state) { 28 | state.PauseTiming(); 29 | refresh_copies(copies, v); 30 | state.ResumeTiming(); 31 | auto start = cycleclock::Now(); 32 | for (auto i = 0; i < ITERATIONS; i++) { 33 | pdqsort_branchless(begins[i], ends[i]); 34 | } 35 | total_cycles += (cycleclock::Now() - start); 36 | } 37 | 38 | state.SetLabel(get_crypto_hash(begins[0], ends[0])); 39 | state.counters["Time/N"] = make_time_per_n_counter(n * ITERATIONS); 40 | state.SetBytesProcessed(state.iterations() * n * ITERATIONS * sizeof(Q)); 41 | process_perf_counters(state.counters, n * ITERATIONS); 42 | if (!state.counters.contains("cycles/N")) 43 |
state.counters["rdtsc-cycles/N"] = make_cycle_per_n_counter((f64)total_cycles / (f64)(n * ITERATIONS * state.iterations())); 44 | } 45 | 46 | BENCHMARK_TEMPLATE(BM_pdqsort_branchless, i16)->RangeMultiplier(2)->Range(MIN_SORT, MAX_SORT)->Unit(benchmark::kMillisecond)->ThreadRange(1, processor_count); 47 | BENCHMARK_TEMPLATE(BM_pdqsort_branchless, u16)->RangeMultiplier(2)->Range(MIN_SORT, MAX_SORT)->Unit(benchmark::kMillisecond)->ThreadRange(1, processor_count); 48 | BENCHMARK_TEMPLATE(BM_pdqsort_branchless, i32)->RangeMultiplier(2)->Range(MIN_SORT, MAX_SORT)->Unit(benchmark::kMillisecond)->ThreadRange(1, processor_count); 49 | BENCHMARK_TEMPLATE(BM_pdqsort_branchless, u32)->RangeMultiplier(2)->Range(MIN_SORT, MAX_SORT)->Unit(benchmark::kMillisecond)->ThreadRange(1, processor_count); 50 | BENCHMARK_TEMPLATE(BM_pdqsort_branchless, f32)->RangeMultiplier(2)->Range(MIN_SORT, MAX_SORT)->Unit(benchmark::kMillisecond)->ThreadRange(1, processor_count); 51 | BENCHMARK_TEMPLATE(BM_pdqsort_branchless, i64)->RangeMultiplier(2)->Range(MIN_SORT, MAX_SORT)->Unit(benchmark::kMillisecond)->ThreadRange(1, processor_count); 52 | BENCHMARK_TEMPLATE(BM_pdqsort_branchless, u64)->RangeMultiplier(2)->Range(MIN_SORT, MAX_SORT)->Unit(benchmark::kMillisecond)->ThreadRange(1, processor_count); 53 | BENCHMARK_TEMPLATE(BM_pdqsort_branchless, f64)->RangeMultiplier(2)->Range(MIN_SORT, MAX_SORT)->Unit(benchmark::kMillisecond)->ThreadRange(1, processor_count); 54 | } 55 | 56 | #include "vxsort_targets_disable.h" 57 |
-------------------------------------------------------------------------------- /bench/fullsort/BM_fullsort.stdsort.cpp: -------------------------------------------------------------------------------- 1 | #include "vxsort_targets_enable_avx2.h" 2 | 3 | #include <benchmark/benchmark.h> 4 | #include <algorithm> 5 | 6 | #include "fullsort_params.h" 7 | #include "../util.h" 8 | 9 | namespace vxsort_bench { 10 | using namespace vxsort::types; 11 | 12 | 13 | template <typename Q> 14 | static void BM_stdsort(benchmark::State& state) { 15 | auto n = state.range(0); 16 | auto v = std::vector<Q>((i32)n); 17 | const auto ITERATIONS = 10; 18 | 19 | generate_unique_values_vec(v, (Q)0x1000, (Q)8); 20 | auto copies = generate_copies(ITERATIONS, n, v); 21 | auto begins = generate_array_beginnings(copies); 22 | auto ends = generate_array_beginnings(copies); 23 | for (usize i = 0; i < copies.size(); i++) 24 | ends[i] = begins[i] + n - 1; 25 | 26 | vxsort::u64 total_cycles = 0; 27 | for (auto _ : state) { 28 | state.PauseTiming(); 29 | refresh_copies(copies, v); 30 | state.ResumeTiming(); 31 | auto start = cycleclock::Now(); 32 | for (auto i = 0; i < ITERATIONS; i++) { 33 | std::sort(begins[i], ends[i]); 34 | } 35 | total_cycles += (cycleclock::Now() - start); 36 | } 37 | 38 | state.SetLabel(get_crypto_hash(begins[0], ends[0])); 39 | state.counters["Time/N"] = make_time_per_n_counter(n * ITERATIONS); 40 | state.SetBytesProcessed(state.iterations() * n * ITERATIONS * sizeof(Q)); 41 | process_perf_counters(state.counters, n * ITERATIONS); 42 | if (!state.counters.contains("cycles/N")) 43 | state.counters["rdtsc-cycles/N"] = make_cycle_per_n_counter((f64)total_cycles / (f64)(n * ITERATIONS * state.iterations())); 44 | } 45 | 46 | BENCHMARK_TEMPLATE(BM_stdsort, i16)->RangeMultiplier(2)->Range(MIN_SORT, MAX_SORT)->Unit(benchmark::kMillisecond)->ThreadRange(1, processor_count); 47 | BENCHMARK_TEMPLATE(BM_stdsort, u16)->RangeMultiplier(2)->Range(MIN_SORT, MAX_SORT)->Unit(benchmark::kMillisecond)->ThreadRange(1, processor_count); 48 | BENCHMARK_TEMPLATE(BM_stdsort,
i32)->RangeMultiplier(2)->Range(MIN_SORT, MAX_SORT)->Unit(benchmark::kMillisecond)->ThreadRange(1, processor_count); 49 | BENCHMARK_TEMPLATE(BM_stdsort, u32)->RangeMultiplier(2)->Range(MIN_SORT, MAX_SORT)->Unit(benchmark::kMillisecond)->ThreadRange(1, processor_count); 50 | BENCHMARK_TEMPLATE(BM_stdsort, f32)->RangeMultiplier(2)->Range(MIN_SORT, MAX_SORT)->Unit(benchmark::kMillisecond)->ThreadRange(1, processor_count); 51 | BENCHMARK_TEMPLATE(BM_stdsort, i64)->RangeMultiplier(2)->Range(MIN_SORT, MAX_SORT)->Unit(benchmark::kMillisecond)->ThreadRange(1, processor_count); 52 | BENCHMARK_TEMPLATE(BM_stdsort, u64)->RangeMultiplier(2)->Range(MIN_SORT, MAX_SORT)->Unit(benchmark::kMillisecond)->ThreadRange(1, processor_count); 53 | BENCHMARK_TEMPLATE(BM_stdsort, f64)->RangeMultiplier(2)->Range(MIN_SORT, MAX_SORT)->Unit(benchmark::kMillisecond)->ThreadRange(1, processor_count); 54 | } 55 | 56 | #include "vxsort_targets_disable.h" 57 |
-------------------------------------------------------------------------------- /bench/fullsort/BM_fullsort.vxsort.avx2.f.cpp: -------------------------------------------------------------------------------- 1 | #include "vxsort_targets_enable_avx2.h" 2 | 3 | #include <benchmark/benchmark.h> 4 | #include <random> 5 | 6 | #include <vxsort.avx2.h> 7 | 8 | #include "BM_fullsort.vxsort.h" 9 | 10 | namespace vxsort_bench { 11 | using namespace vxsort::types; 12 | using benchmark::TimeUnit; 13 | using vm = vxsort::vector_machine; 14 | 15 | BENCHMARK_TEMPLATE(BM_vxsort, f32, vm::AVX2, 1)->RangeMultiplier(2)->Range(MIN_SORT, MAX_SORT)->Unit(kMillisecond)->ThreadRange(1, processor_count); 16 | BENCHMARK_TEMPLATE(BM_vxsort, f32, vm::AVX2, 2)->RangeMultiplier(2)->Range(MIN_SORT, MAX_SORT)->Unit(kMillisecond)->ThreadRange(1, processor_count); 17 | BENCHMARK_TEMPLATE(BM_vxsort, f32, vm::AVX2, 4)->RangeMultiplier(2)->Range(MIN_SORT, MAX_SORT)->Unit(kMillisecond)->ThreadRange(1, processor_count); 18 | BENCHMARK_TEMPLATE(BM_vxsort, f32, vm::AVX2, 8)->RangeMultiplier(2)->Range(MIN_SORT, MAX_SORT)->Unit(kMillisecond)->ThreadRange(1, processor_count); 19 | 20 | BENCHMARK_TEMPLATE(BM_vxsort, f64, vm::AVX2, 1)->RangeMultiplier(2)->Range(MIN_SORT, MAX_SORT)->Unit(kMillisecond)->ThreadRange(1, processor_count); 21 | BENCHMARK_TEMPLATE(BM_vxsort, f64, vm::AVX2, 2)->RangeMultiplier(2)->Range(MIN_SORT, MAX_SORT)->Unit(kMillisecond)->ThreadRange(1, processor_count); 22 | BENCHMARK_TEMPLATE(BM_vxsort, f64, vm::AVX2, 4)->RangeMultiplier(2)->Range(MIN_SORT, MAX_SORT)->Unit(kMillisecond)->ThreadRange(1, processor_count); 23 | BENCHMARK_TEMPLATE(BM_vxsort, f64, vm::AVX2, 8)->RangeMultiplier(2)->Range(MIN_SORT, MAX_SORT)->Unit(kMillisecond)->ThreadRange(1, processor_count); 24 | 25 | } 26 | 27 | #include "vxsort_targets_disable.h" 28 |
-------------------------------------------------------------------------------- /bench/fullsort/BM_fullsort.vxsort.avx2.i.cpp: -------------------------------------------------------------------------------- 1 | #include "vxsort_targets_enable_avx2.h" 2 | 3 | #include <benchmark/benchmark.h> 4 | #include <random> 5 | 6 | #include <vxsort.avx2.h> 7 | 8 | #include "BM_fullsort.vxsort.h" 9 | 10 | namespace vxsort_bench { 11 | using namespace vxsort::types; 12 | using benchmark::TimeUnit; 13 | using vm = vxsort::vector_machine; 14 | 15 | BENCHMARK_TEMPLATE(BM_vxsort, i16, vm::AVX2, 1)->RangeMultiplier(2)->Range(MIN_SORT, MAX_SORT)->Unit(kMillisecond)->ThreadRange(1, processor_count); 16 | BENCHMARK_TEMPLATE(BM_vxsort, i16, vm::AVX2, 2)->RangeMultiplier(2)->Range(MIN_SORT, MAX_SORT)->Unit(kMillisecond)->ThreadRange(1, processor_count); 17 |
BENCHMARK_TEMPLATE(BM_vxsort, i16, vm::AVX2, 4)->RangeMultiplier(2)->Range(MIN_SORT, MAX_SORT)->Unit(kMillisecond)->ThreadRange(1, processor_count); 18 | BENCHMARK_TEMPLATE(BM_vxsort, i16, vm::AVX2, 8)->RangeMultiplier(2)->Range(MIN_SORT, MAX_SORT)->Unit(kMillisecond)->ThreadRange(1, processor_count); 19 | 20 | BENCHMARK_TEMPLATE(BM_vxsort, i32, vm::AVX2, 1)->RangeMultiplier(2)->Range(MIN_SORT, MAX_SORT)->Unit(kMillisecond)->ThreadRange(1, processor_count); 21 | BENCHMARK_TEMPLATE(BM_vxsort, i32, vm::AVX2, 2)->RangeMultiplier(2)->Range(MIN_SORT, MAX_SORT)->Unit(kMillisecond)->ThreadRange(1, processor_count); 22 | BENCHMARK_TEMPLATE(BM_vxsort, i32, vm::AVX2, 4)->RangeMultiplier(2)->Range(MIN_SORT, MAX_SORT)->Unit(kMillisecond)->ThreadRange(1, processor_count); 23 | BENCHMARK_TEMPLATE(BM_vxsort, i32, vm::AVX2, 8)->RangeMultiplier(2)->Range(MIN_SORT, MAX_SORT)->Unit(kMillisecond)->ThreadRange(1, processor_count); 24 | 25 | BENCHMARK_TEMPLATE(BM_vxsort, i64, vm::AVX2, 1)->RangeMultiplier(2)->Range(MIN_SORT, MAX_SORT)->Unit(kMillisecond)->ThreadRange(1, processor_count); 26 | BENCHMARK_TEMPLATE(BM_vxsort, i64, vm::AVX2, 2)->RangeMultiplier(2)->Range(MIN_SORT, MAX_SORT)->Unit(kMillisecond)->ThreadRange(1, processor_count); 27 | BENCHMARK_TEMPLATE(BM_vxsort, i64, vm::AVX2, 4)->RangeMultiplier(2)->Range(MIN_SORT, MAX_SORT)->Unit(kMillisecond)->ThreadRange(1, processor_count); 28 | BENCHMARK_TEMPLATE(BM_vxsort, i64, vm::AVX2, 8)->RangeMultiplier(2)->Range(MIN_SORT, MAX_SORT)->Unit(kMillisecond)->ThreadRange(1, processor_count); 29 | } 30 | 31 | #include "vxsort_targets_disable.h" 32 |
-------------------------------------------------------------------------------- /bench/fullsort/BM_fullsort.vxsort.avx2.u.cpp: -------------------------------------------------------------------------------- 1 | #include "vxsort_targets_enable_avx2.h" 2 | 3 | #include <benchmark/benchmark.h> 4 | #include <random> 5 | 6 | #include <vxsort.avx2.h> 7 | 8 | #include "BM_fullsort.vxsort.h" 9 | 10 | namespace vxsort_bench { 11 | using namespace vxsort::types; 12 | using benchmark::TimeUnit; 13 | using vm = vxsort::vector_machine; 14 | 15 | BENCHMARK_TEMPLATE(BM_vxsort, u32, vm::AVX2, 1)->RangeMultiplier(2)->Range(MIN_SORT, MAX_SORT)->Unit(kMillisecond)->ThreadRange(1, processor_count); 16 | BENCHMARK_TEMPLATE(BM_vxsort, u32, vm::AVX2, 2)->RangeMultiplier(2)->Range(MIN_SORT, MAX_SORT)->Unit(kMillisecond)->ThreadRange(1, processor_count); 17 | BENCHMARK_TEMPLATE(BM_vxsort, u32, vm::AVX2, 4)->RangeMultiplier(2)->Range(MIN_SORT, MAX_SORT)->Unit(kMillisecond)->ThreadRange(1, processor_count); 18 | BENCHMARK_TEMPLATE(BM_vxsort, u32, vm::AVX2, 8)->RangeMultiplier(2)->Range(MIN_SORT, MAX_SORT)->Unit(kMillisecond)->ThreadRange(1, processor_count); 19 | 20 | BENCHMARK_TEMPLATE(BM_vxsort, u64, vm::AVX2, 1)->RangeMultiplier(2)->Range(MIN_SORT, MAX_SORT)->Unit(kMillisecond)->ThreadRange(1, processor_count); 21 | BENCHMARK_TEMPLATE(BM_vxsort, u64, vm::AVX2, 2)->RangeMultiplier(2)->Range(MIN_SORT, MAX_SORT)->Unit(kMillisecond)->ThreadRange(1, processor_count); 22 | BENCHMARK_TEMPLATE(BM_vxsort, u64, vm::AVX2, 4)->RangeMultiplier(2)->Range(MIN_SORT, MAX_SORT)->Unit(kMillisecond)->ThreadRange(1, processor_count); 23 | BENCHMARK_TEMPLATE(BM_vxsort, u64, vm::AVX2, 8)->RangeMultiplier(2)->Range(MIN_SORT, MAX_SORT)->Unit(kMillisecond)->ThreadRange(1, processor_count); 24 | 25 | } 26 | 27 | #include "vxsort_targets_disable.h" 28 |
-------------------------------------------------------------------------------- /bench/fullsort/BM_fullsort.vxsort.avx512.f.cpp:
-------------------------------------------------------------------------------- 1 | #include "vxsort_targets_enable_avx512.h" 2 | 3 | #include <benchmark/benchmark.h> 4 | #include <random> 5 | 6 | #include <vxsort.avx512.h> 7 | 8 | #include "BM_fullsort.vxsort.h" 9 | 10 | namespace vxsort_bench { 11 | using namespace vxsort::types; 12 | using benchmark::TimeUnit; 13 | using vm = vxsort::vector_machine; 14 | 15 | 16 | BENCHMARK_TEMPLATE(BM_vxsort, f32, vm::AVX512, 1)->RangeMultiplier(2)->Range(MIN_SORT, MAX_SORT)->Unit(kMillisecond)->ThreadRange(1, processor_count); 17 | BENCHMARK_TEMPLATE(BM_vxsort, f32, vm::AVX512, 2)->RangeMultiplier(2)->Range(MIN_SORT, MAX_SORT)->Unit(kMillisecond)->ThreadRange(1, processor_count); 18 | BENCHMARK_TEMPLATE(BM_vxsort, f32, vm::AVX512, 4)->RangeMultiplier(2)->Range(MIN_SORT, MAX_SORT)->Unit(kMillisecond)->ThreadRange(1, processor_count); 19 | BENCHMARK_TEMPLATE(BM_vxsort, f32, vm::AVX512, 8)->RangeMultiplier(2)->Range(MIN_SORT, MAX_SORT)->Unit(kMillisecond)->ThreadRange(1, processor_count); 20 | 21 | BENCHMARK_TEMPLATE(BM_vxsort, f64, vm::AVX512, 1)->RangeMultiplier(2)->Range(MIN_SORT, MAX_SORT)->Unit(kMillisecond)->ThreadRange(1, processor_count); 22 | BENCHMARK_TEMPLATE(BM_vxsort, f64, vm::AVX512, 2)->RangeMultiplier(2)->Range(MIN_SORT, MAX_SORT)->Unit(kMillisecond)->ThreadRange(1, processor_count); 23 | BENCHMARK_TEMPLATE(BM_vxsort, f64, vm::AVX512, 4)->RangeMultiplier(2)->Range(MIN_SORT, MAX_SORT)->Unit(kMillisecond)->ThreadRange(1, processor_count); 24 | BENCHMARK_TEMPLATE(BM_vxsort, f64, vm::AVX512, 8)->RangeMultiplier(2)->Range(MIN_SORT, MAX_SORT)->Unit(kMillisecond)->ThreadRange(1, processor_count); 25 | 26 | } 27 | 28 | #include "vxsort_targets_disable.h" 29 |
-------------------------------------------------------------------------------- /bench/fullsort/BM_fullsort.vxsort.avx512.i.cpp: -------------------------------------------------------------------------------- 1 | #include "vxsort_targets_enable_avx512.h" 2 | 3 | #include <benchmark/benchmark.h> 4 | #include <random> 5 | 6 | #include <vxsort.avx512.h> 7 | 8 | #include "BM_fullsort.vxsort.h" 9 | 10 | namespace vxsort_bench { 11 | using namespace vxsort::types; 12 | using benchmark::TimeUnit; 13 | using vm = vxsort::vector_machine; 14 | 15 | BENCHMARK_TEMPLATE(BM_vxsort, i16, vm::AVX512, 1)->RangeMultiplier(2)->Range(MIN_SORT, MAX_SORT)->Unit(kMillisecond)->ThreadRange(1, processor_count); 16 | BENCHMARK_TEMPLATE(BM_vxsort, i16, vm::AVX512, 2)->RangeMultiplier(2)->Range(MIN_SORT, MAX_SORT)->Unit(kMillisecond)->ThreadRange(1, processor_count); 17 | BENCHMARK_TEMPLATE(BM_vxsort, i16, vm::AVX512, 4)->RangeMultiplier(2)->Range(MIN_SORT, MAX_SORT)->Unit(kMillisecond)->ThreadRange(1, processor_count); 18 | BENCHMARK_TEMPLATE(BM_vxsort, i16, vm::AVX512, 8)->RangeMultiplier(2)->Range(MIN_SORT, MAX_SORT)->Unit(kMillisecond)->ThreadRange(1, processor_count); 19 | 20 | BENCHMARK_TEMPLATE(BM_vxsort, i32, vm::AVX512, 1)->RangeMultiplier(2)->Range(MIN_SORT, MAX_SORT)->Unit(kMillisecond)->ThreadRange(1, processor_count); 21 | BENCHMARK_TEMPLATE(BM_vxsort, i32, vm::AVX512, 2)->RangeMultiplier(2)->Range(MIN_SORT, MAX_SORT)->Unit(kMillisecond)->ThreadRange(1, processor_count); 22 | BENCHMARK_TEMPLATE(BM_vxsort, i32, vm::AVX512, 4)->RangeMultiplier(2)->Range(MIN_SORT, MAX_SORT)->Unit(kMillisecond)->ThreadRange(1, processor_count); 23 | BENCHMARK_TEMPLATE(BM_vxsort, i32, vm::AVX512, 8)->RangeMultiplier(2)->Range(MIN_SORT,
MAX_SORT)->Unit(kMillisecond)->ThreadRange(1, processor_count); 24 | 25 | BENCHMARK_TEMPLATE(BM_vxsort, i64, vm::AVX512, 1)->RangeMultiplier(2)->Range(MIN_SORT, MAX_SORT)->Unit(kMillisecond)->ThreadRange(1, processor_count); 26 | BENCHMARK_TEMPLATE(BM_vxsort, i64, vm::AVX512, 2)->RangeMultiplier(2)->Range(MIN_SORT, MAX_SORT)->Unit(kMillisecond)->ThreadRange(1, processor_count); 27 | BENCHMARK_TEMPLATE(BM_vxsort, i64, vm::AVX512, 4)->RangeMultiplier(2)->Range(MIN_SORT, MAX_SORT)->Unit(kMillisecond)->ThreadRange(1, processor_count); 28 | BENCHMARK_TEMPLATE(BM_vxsort, i64, vm::AVX512, 8)->RangeMultiplier(2)->Range(MIN_SORT, MAX_SORT)->Unit(kMillisecond)->ThreadRange(1, processor_count); 29 | 30 | } 31 | 32 | #include "vxsort_targets_disable.h" 33 |
-------------------------------------------------------------------------------- /bench/fullsort/BM_fullsort.vxsort.avx512.u.cpp: -------------------------------------------------------------------------------- 1 | #include "vxsort_targets_enable_avx512.h" 2 | 3 | #include <benchmark/benchmark.h> 4 | #include <random> 5 | 6 | #include <vxsort.avx512.h> 7 | 8 | #include "BM_fullsort.vxsort.h" 9 | 10 | namespace vxsort_bench { 11 | using namespace vxsort::types; 12 | using benchmark::TimeUnit; 13 | using vm = vxsort::vector_machine; 14 | 15 | BENCHMARK_TEMPLATE(BM_vxsort, u32, vm::AVX512, 1)->RangeMultiplier(2)->Range(MIN_SORT, MAX_SORT)->Unit(kMillisecond)->ThreadRange(1, processor_count); 16 | BENCHMARK_TEMPLATE(BM_vxsort, u32, vm::AVX512, 2)->RangeMultiplier(2)->Range(MIN_SORT, MAX_SORT)->Unit(kMillisecond)->ThreadRange(1, processor_count); 17 | BENCHMARK_TEMPLATE(BM_vxsort, u32, vm::AVX512, 4)->RangeMultiplier(2)->Range(MIN_SORT, MAX_SORT)->Unit(kMillisecond)->ThreadRange(1, processor_count); 18 | BENCHMARK_TEMPLATE(BM_vxsort, u32, vm::AVX512, 8)->RangeMultiplier(2)->Range(MIN_SORT, MAX_SORT)->Unit(kMillisecond)->ThreadRange(1, processor_count); 19 | 20 | BENCHMARK_TEMPLATE(BM_vxsort, u64, vm::AVX512, 1)->RangeMultiplier(2)->Range(MIN_SORT, MAX_SORT)->Unit(kMillisecond)->ThreadRange(1, processor_count); 21 | BENCHMARK_TEMPLATE(BM_vxsort, u64, vm::AVX512, 2)->RangeMultiplier(2)->Range(MIN_SORT, MAX_SORT)->Unit(kMillisecond)->ThreadRange(1, processor_count); 22 | BENCHMARK_TEMPLATE(BM_vxsort, u64, vm::AVX512, 4)->RangeMultiplier(2)->Range(MIN_SORT, MAX_SORT)->Unit(kMillisecond)->ThreadRange(1, processor_count); 23 | BENCHMARK_TEMPLATE(BM_vxsort, u64, vm::AVX512, 8)->RangeMultiplier(2)->Range(MIN_SORT, MAX_SORT)->Unit(kMillisecond)->ThreadRange(1, processor_count); 24 | 25 | } 26 | 27 | #include "vxsort_targets_disable.h" 28 |
-------------------------------------------------------------------------------- /bench/fullsort/BM_fullsort.vxsort.h: -------------------------------------------------------------------------------- 1 | #ifndef VXSORT_BM_FULLSORT_VXSORT_H 2 | #define VXSORT_BM_FULLSORT_VXSORT_H 3 | 4 | #include <algorithm> 5 | #include <benchmark/benchmark.h> 6 | #include <random> 7 | #include <vector> 8 | #include "../util.h" 9 | #include "../bench_isa.h" 10 | 11 | #include <vxsort.h> 12 | 13 | #include "fullsort_params.h" 14 | 15 | namespace vxsort_bench { 16 | using namespace vxsort::types; 17 | using vxsort::vector_machine; 18 | 19 | template <typename Q, vector_machine M, i32 U> 20 | static void BM_vxsort(benchmark::State& state) { 21 | VXSORT_BENCH_ISA(); 22 | 23 | auto n = state.range(0); 24 | auto v = std::vector<Q>((i32)n); 25 | const auto ITERATIONS = 10; 26 | 27 | generate_unique_values_vec(v, (Q)0x1000, (Q)0x8); 28 | auto copies = generate_copies(ITERATIONS, n, v); 29 | auto begins = generate_array_beginnings(copies); 30 | auto ends = generate_array_beginnings(copies); 31 | for (usize
i = 0; i < copies.size(); i++) 32 | ends[i] = begins[i] + n - 1; 33 | 34 | auto sorter = ::vxsort::vxsort<Q, M, U>(); 35 | 36 | u64 total_cycles = 0; 37 | for (auto _ : state) { 38 | state.PauseTiming(); 39 | refresh_copies(copies, v); 40 | state.ResumeTiming(); 41 | auto start = cycleclock::Now(); 42 | for (auto i = 0; i < ITERATIONS; i++) { 43 | sorter.sort(begins[i], ends[i]); 44 | } 45 | total_cycles += (cycleclock::Now() - start); 46 | } 47 | 48 | state.SetLabel(get_crypto_hash(begins[0], ends[0])); 49 | state.counters["Time/N"] = make_time_per_n_counter(n * ITERATIONS); 50 | state.SetBytesProcessed(state.iterations() * n * ITERATIONS * sizeof(Q)); 51 | process_perf_counters(state.counters, n * ITERATIONS); 52 | if (!state.counters.contains("cycles/N")) 53 | state.counters["rdtsc-cycles/N"] = make_cycle_per_n_counter((f64)total_cycles / (f64)(n * ITERATIONS * state.iterations())); 54 | } 55 | 56 | const i32 StridedSortSize = 1000000; 57 | const i64 StridedSortMinValue = 0x80000000LL; 58 | 59 | template <typename Q, vector_machine M, i32 U> 60 | static void BM_vxsort_strided(benchmark::State& state) { 61 | VXSORT_BENCH_ISA(); 62 | 63 | auto n = StridedSortSize; 64 | auto stride = state.range(0); 65 | auto v = std::vector<Q>(n); 66 | const auto ITERATIONS = 10; 67 | 68 | const auto min_value = StridedSortMinValue; 69 | const auto max_value = min_value + StridedSortSize * stride; 70 | 71 | generate_unique_values_vec(v, (Q) 0x80000000, (Q) stride); 72 | auto copies = generate_copies(ITERATIONS, n, v); 73 | auto begins = generate_array_beginnings(copies); 74 | auto ends = generate_array_beginnings(copies); 75 | for (size_t i = 0; i < copies.size(); i++) 76 | ends[i] = begins[i] + n - 1; 77 | 78 | auto sorter = ::vxsort::vxsort<Q, M, U>(); 79 | 80 | u64 total_cycles = 0; 81 | for (auto _ : state) { 82 | state.PauseTiming(); 83 | refresh_copies(copies, v); 84 | state.ResumeTiming(); 85 | auto start = cycleclock::Now(); 86 | for (auto i = 0; i < ITERATIONS; i++) { 87 | sorter.sort(begins[i], ends[i], min_value, max_value); 88 | } 89 | total_cycles += (cycleclock::Now() - start); 90 | } 91 | 92 | state.counters["Time/N"] = make_time_per_n_counter(n * ITERATIONS); 93 | process_perf_counters(state.counters, n * ITERATIONS); 94 | if (!state.counters.contains("cycles/N")) 95 | state.counters["rdtsc-cycles/N"] = make_cycle_per_n_counter((f64)total_cycles / (f64)(n * ITERATIONS * state.iterations())); 96 | } 97 | } 98 | 99 | #endif // VXSORT_BM_FULLSORT_VXSORT_H 100 |
-------------------------------------------------------------------------------- /bench/fullsort/BM_fullsort_strided.avx2.cpp: -------------------------------------------------------------------------------- 1 | #include "vxsort_targets_enable_avx2.h" 2 | 3 | #include <benchmark/benchmark.h> 4 | 5 | #include <vxsort.avx2.h> 6 | 7 | #include "BM_fullsort.vxsort.h" 8 | 9 | namespace vxsort_bench { 10 | using namespace vxsort::types; 11 | using benchmark::TimeUnit; 12 | using vm = vxsort::vector_machine; 13 | 14 | BENCHMARK_TEMPLATE(BM_vxsort_strided, i64, vm::AVX2, 1)->RangeMultiplier(2)->Range(MIN_STRIDE, MAX_STRIDE)->Unit(kMillisecond)->ThreadRange(1, processor_count); 15 | BENCHMARK_TEMPLATE(BM_vxsort_strided, i64, vm::AVX2, 4)->RangeMultiplier(2)->Range(MIN_STRIDE, MAX_STRIDE)->Unit(kMillisecond)->ThreadRange(1, processor_count); 16 | BENCHMARK_TEMPLATE(BM_vxsort_strided, i64, vm::AVX2, 8)->RangeMultiplier(2)->Range(MIN_STRIDE, MAX_STRIDE)->Unit(kMillisecond)->ThreadRange(1, processor_count); 17 | BENCHMARK_TEMPLATE(BM_vxsort_strided, i64, vm::AVX2, 12)->RangeMultiplier(2)->Range(MIN_STRIDE,
MAX_STRIDE)->Unit(kMillisecond)->ThreadRange(1, processor_count); 18 | 19 | } 20 | 21 | #include "vxsort_targets_disable.h" 22 |
-------------------------------------------------------------------------------- /bench/fullsort/BM_fullsort_strided.avx512.cpp: -------------------------------------------------------------------------------- 1 | #include "vxsort_targets_enable_avx512.h" 2 | 3 | #include <benchmark/benchmark.h> 4 | #include <random> 5 | 6 | #include <vxsort.avx512.h> 7 | 8 | #include "BM_fullsort.vxsort.h" 9 | 10 | namespace vxsort_bench { 11 | using namespace vxsort::types; 12 | using benchmark::TimeUnit; 13 | using vm = vxsort::vector_machine; 14 | 15 | BENCHMARK_TEMPLATE(BM_vxsort_strided, i64, vm::AVX512, 1)->RangeMultiplier(2)->Range(MIN_STRIDE, MAX_STRIDE)->Unit(kMillisecond)->ThreadRange(1, processor_count); 16 | BENCHMARK_TEMPLATE(BM_vxsort_strided, i64, vm::AVX512, 4)->RangeMultiplier(2)->Range(MIN_STRIDE, MAX_STRIDE)->Unit(kMillisecond)->ThreadRange(1, processor_count); 17 | BENCHMARK_TEMPLATE(BM_vxsort_strided, i64, vm::AVX512, 8)->RangeMultiplier(2)->Range(MIN_STRIDE, MAX_STRIDE)->Unit(kMillisecond)->ThreadRange(1, processor_count); 18 | 19 | } 20 | 21 | #include "vxsort_targets_disable.h" 22 |
-------------------------------------------------------------------------------- /bench/fullsort/fullsort_params.h: -------------------------------------------------------------------------------- 1 | #ifndef VXSORT_FULLSORT_PARAMS_H 2 | #define VXSORT_FULLSORT_PARAMS_H 3 | 4 | #include <vxsort.h> 5 | 6 | namespace vxsort_bench { 7 | 8 | using namespace vxsort::types; 9 | using vxsort::vector_machine; 10 | 11 | const auto processor_count = 1; 12 | 13 | static const i32 MIN_SORT = 256; 14 | static const i32 MAX_SORT = 1 << 24; 15 | 16 | static const i32 MIN_STRIDE = 1 << 3; 17 | static const i32 MAX_STRIDE = 1 << 27; 18 | } 19 | 20 | #endif //VXSORT_FULLSORT_PARAMS_H 21 |
-------------------------------------------------------------------------------- /bench/internal_macros.h: -------------------------------------------------------------------------------- 1 | #ifndef VXSORT_BENCH_INTERNAL_MACROS_H_ 2 | #define VXSORT_BENCH_INTERNAL_MACROS_H_ 3 | 4 | #include "benchmark/benchmark.h" 5 | 6 | /* Needed to detect STL */ 7 | #include <cstdlib> 8 | 9 | // clang-format off 10 | 11 | #ifndef __has_feature 12 | #define __has_feature(x) 0 13 | #endif 14 | 15 | #if defined(__clang__) 16 | #if !defined(COMPILER_CLANG) 17 | #define COMPILER_CLANG 18 | #endif 19 | #elif defined(_MSC_VER) 20 | #if !defined(COMPILER_MSVC) 21 | #define COMPILER_MSVC 22 | #endif 23 | #elif defined(__GNUC__) 24 | #if !defined(COMPILER_GCC) 25 | #define COMPILER_GCC 26 | #endif 27 | #endif 28 | 29 | #if __has_feature(cxx_attributes) 30 | #define VXSORT_BENCH_NORETURN [[noreturn]] 31 | #elif defined(__GNUC__) 32 | #define VXSORT_BENCH_NORETURN __attribute__((noreturn)) 33 | #elif defined(COMPILER_MSVC) 34 | #define VXSORT_BENCH_NORETURN __declspec(noreturn) 35 | #else 36 | #define VXSORT_BENCH_NORETURN 37 | #endif 38 | 39 | #if defined(__CYGWIN__) 40 | #define VXSORT_BENCH_OS_CYGWIN 1 41 | #elif defined(_WIN32) 42 | #define VXSORT_BENCH_OS_WINDOWS 1 43 | #if defined(__MINGW32__) 44 | #define VXSORT_BENCH_OS_MINGW 1 45 | #endif 46 | #elif defined(__APPLE__) 47 | #define VXSORT_BENCH_OS_APPLE 1 48 | #include "TargetConditionals.h" 49 | #if defined(TARGET_OS_MAC) 50 | #define VXSORT_BENCH_OS_MACOSX 1 51 | #if defined(TARGET_OS_IPHONE) 52 | #define VXSORT_BENCH_OS_IOS 1 53 | #endif 54 | #endif 55 | #elif defined(__FreeBSD__) 56 | #define VXSORT_BENCH_OS_FREEBSD 1 57 | #elif
defined(__NetBSD__) 58 | #define VXSORT_BENCH_OS_NETBSD 1 59 | #elif defined(__OpenBSD__) 60 | #define VXSORT_BENCH_OS_OPENBSD 1 61 | #elif defined(__linux__) 62 | #define VXSORT_BENCH_OS_LINUX 1 63 | #elif defined(__native_client__) 64 | #define VXSORT_BENCH_OS_NACL 1 65 | #elif defined(__EMSCRIPTEN__) 66 | #define VXSORT_BENCH_OS_EMSCRIPTEN 1 67 | #elif defined(__rtems__) 68 | #define VXSORT_BENCH_OS_RTEMS 1 69 | #elif defined(__Fuchsia__) 70 | #define VXSORT_BENCH_OS_FUCHSIA 1 71 | #elif defined (__SVR4) && defined (__sun) 72 | #define VXSORT_BENCH_OS_SOLARIS 1 73 | #elif defined(__QNX__) 74 | #define VXSORT_BENCH_OS_QNX 1 75 | #endif 76 | 77 | #if defined(__ANDROID__) && defined(__GLIBCXX__) 78 | #define VXSORT_BENCH_STL_ANDROID_GNUSTL 1 79 | #endif 80 | 81 | #if !__has_feature(cxx_exceptions) && !defined(__cpp_exceptions) \ 82 | && !defined(__EXCEPTIONS) 83 | #define VXSORT_BENCH_HAS_NO_EXCEPTIONS 84 | #endif 85 | 86 | #if defined(COMPILER_CLANG) || defined(COMPILER_GCC) 87 | #define VXSORT_BENCH_MAYBE_UNUSED __attribute__((unused)) 88 | #else 89 | #define VXSORT_BENCH_MAYBE_UNUSED 90 | #endif 91 | 92 | // clang-format on 93 | 94 | #endif // VXSORT_BENCH_INTERNAL_MACROS_H_ 95 | -------------------------------------------------------------------------------- /bench/prep.sh: -------------------------------------------------------------------------------- 1 | # from https://www.alexgallego.org/perf/compiler/explorer/flatbuffers/smf/2018/06/30/effects-cpu-turbo.html 2 | 3 | function cpu_disable_performance_cpupower_state(){ 4 | for c in /sys/devices/system/cpu/cpu[0-9]*/cpufreq/scaling_governor; do echo powersave > $c; done 5 | } 6 | function cpu_enable_performance_cpupower_state(){ 7 | for c in /sys/devices/system/cpu/cpu[0-9]*/cpufreq/scaling_governor; do echo performance > $c; done 8 | } 9 | function cpu_available_frequencies() { 10 | local cpuspec=${1:-[0-9]} 11 | 12 | for i in /sys/devices/system/cpu/cpu$cpuspec*; do 13 | echo "$i:" 14 | echo " cpufreq/scaling_min_freq: $(cat $i/cpufreq/scaling_min_freq)"; 15 | echo " cpufreq/scaling_max_freq: $(cat $i/cpufreq/scaling_max_freq)"; 16 | done 17 | } 18 | 19 | function cpu_set_min_frequencies() { 20 | local freq=$1; 21 | local cpuspec=${2:-[0-9]} 22 | if [[ $freq == "" ]]; then exit 1; fi 23 | for i in /sys/devices/system/cpu/cpu$cpuspec*; do 24 | echo "$i:" 25 | echo "$i/cpufreq/scaling_min_freq: $(cat $i/cpufreq/scaling_min_freq)"; 26 | echo "$freq" | sudo tee "$i/cpufreq/scaling_min_freq" 27 | echo "$i/cpufreq/scaling_min_freq: $(cat $i/cpufreq/scaling_min_freq)"; 28 | done 29 | } 30 | 31 | function cpu_set_max_frequencies() { 32 | local freq=$1; 33 | local cpuspec=${2:-[0-9]} 34 | if [[ $freq == "" ]]; then exit 1; fi 35 | for i in /sys/devices/system/cpu/cpu$cpuspec*; do 36 | echo "$i:" 37 | echo "$i/cpufreq/scaling_max_freq: $(cat $i/cpufreq/scaling_max_freq)"; 38 | echo "$freq" | sudo tee "$i/cpufreq/scaling_max_freq" 39 | echo "$i/cpufreq/scaling_max_freq: $(cat $i/cpufreq/scaling_max_freq)"; 40 | done 41 | } 42 | -------------------------------------------------------------------------------- /bench/requirements.txt: -------------------------------------------------------------------------------- 1 | kaleido 2 | plotly 3 | pandas 4 | humanize 5 | ipython 6 | -------------------------------------------------------------------------------- /bench/run.cmd: -------------------------------------------------------------------------------- 1 | @echo off 2 | vxsort_bench --benchmark_counters_tabular %1 %2 %3 %4 %5 %6 %7 %8 %9 3 | 
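4 | rem Usage sketch: run.cmd simply forwards its arguments (%1..%9 above) to the benchmark binary,
5 | rem so any standard Google Benchmark flag can be appended, e.g.:
6 | rem   run.cmd --benchmark_filter=BM_vxsort --benchmark_out=results.json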
-------------------------------------------------------------------------------- /bench/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | hogs=$(pgrep -if "(typora|firefox|chrome|chromium-browser|vivaldi-bin|rider|pycharm|resharper|msbuild|telegram|clion|clangd|discord|slack)") 3 | 4 | resume() { 5 | echo Resuming "$(echo "$hogs" | wc -w)" procs after running bench 6 | [[ -z "$hogs" ]] || echo "$hogs" | xargs kill -CONT 7 | } 8 | 9 | trap 'resume' SIGINT 10 | 11 | echo Suspending "$(echo "$hogs" | wc -w)" procs before running bench 12 | [[ -z "$hogs" ]] || echo "$hogs" | xargs kill -STOP 13 | 14 | SCRIPT_DIR="$(cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" 15 | 16 | "$SCRIPT_DIR"/vxsort_bench --benchmark_counters_tabular "$@" 17 | trap '' SIGINT 18 | resume 19 |
-------------------------------------------------------------------------------- /bench/smallsort/BM_blacher.avx2.cpp: -------------------------------------------------------------------------------- 1 | #include "vxsort_targets_enable_avx2.h" 2 | 3 | #include "BM_smallsort.h" 4 | 5 | #include <immintrin.h> 6 | 7 | namespace vxsort_bench { 8 | using namespace vxsort::types; 9 | using benchmark::TimeUnit; 10 | using vm = vxsort::vector_machine; 11 | 12 | #define COEX(a, b){ \ 13 | auto vec_tmp = a; \ 14 | a = _mm256_min_epi32(a, b); \ 15 | b = _mm256_max_epi32(vec_tmp, b);} 16 | 17 | /* shuffle 2 vectors, instruction for int is missing, 18 | * therefore shuffle with float */ 19 | #define SHUFFLE_2_VECS(a, b, mask) \ 20 | _mm256_castps_si256 (_mm256_shuffle_ps( \ 21 | _mm256_castsi256_ps (a), _mm256_castsi256_ps (b), mask)); 22 | 23 | /* optimized sorting network for two vectors, that is 16 ints */ 24 | inline void sort_02v_ascending(__m256i &v1, __m256i &v2) { 25 | COEX(v1, v2); /* step 1 */ 26 | 27 | v2 = _mm256_shuffle_epi32(v2, _MM_SHUFFLE(2, 3, 0, 1)); /* step 2 */ 28 | COEX(v1, v2); 29 | 30 | auto tmp = v1; /* step 3 */ 31 | v1 = SHUFFLE_2_VECS(v1, v2, 0b10001000); 32 | v2 = SHUFFLE_2_VECS(tmp, v2, 0b11011101); 33 | COEX(v1, v2); 34 | 35 | v2 = _mm256_shuffle_epi32(v2, _MM_SHUFFLE(0, 1, 2, 3)); /* step 4 */ 36 | COEX(v1, v2); 37 | 38 | tmp = v1; /* step 5 */ 39 | v1 = SHUFFLE_2_VECS(v1, v2, 0b01000100); 40 | v2 = SHUFFLE_2_VECS(tmp, v2, 0b11101110); 41 | COEX(v1, v2); 42 | 43 | tmp = v1; /* step 6 */ 44 | v1 = SHUFFLE_2_VECS(v1, v2, 0b11011000); 45 | v2 = SHUFFLE_2_VECS(tmp, v2, 0b10001101); 46 | COEX(v1, v2); 47 | 48 | v2 = _mm256_permutevar8x32_epi32(v2, _mm256_setr_epi32(7, 6, 5, 4, 3, 2, 1, 0)); 49 | COEX(v1, v2); /* step 7 */ 50 | 51 | tmp = v1; /* step 8 */ 52 | v1 = SHUFFLE_2_VECS(v1, v2, 0b11011000); 53 | v2 = SHUFFLE_2_VECS(tmp, v2, 0b10001101); 54 | COEX(v1, v2); 55 | 56 | tmp = v1; /* step 9 */ 57 | v1 = SHUFFLE_2_VECS(v1, v2, 0b11011000); 58 | v2 = SHUFFLE_2_VECS(tmp, v2, 0b10001101); 59 | COEX(v1, v2); 60 | 61 | /* permute to make it easier to restore order */ 62 | v1 = _mm256_permutevar8x32_epi32(v1, _mm256_setr_epi32(0, 4, 1, 5, 6, 2, 7, 3)); 63 | v2 = _mm256_permutevar8x32_epi32(v2, _mm256_setr_epi32(0, 4, 1, 5, 6, 2, 7, 3)); 64 | 65 | tmp = v1; /* step 10 */ 66 | v1 = SHUFFLE_2_VECS(v1, v2, 0b10001000); 67 | v2 = SHUFFLE_2_VECS(tmp, v2, 0b11011101); 68 | COEX(v1, v2); 69 | 70 | /* restore order */ 71 | auto b2 = _mm256_shuffle_epi32(v2, 0b10110001); 72 | auto b1 = _mm256_shuffle_epi32(v1, 0b10110001); 73 | v1 = _mm256_blend_epi32(v1, b2, 0b10101010); 74 | v2 = _mm256_blend_epi32(b1, v2, 0b10101010); 75 | } 76 | 77 | // This is generated for testing
purposes only 78 | void bitonic_blacher_16_i32(i32 *ptr) { 79 | auto d01 = _mm256_lddqu_si256((__m256i const *) ptr + 0); 80 | auto d02 = _mm256_lddqu_si256((__m256i const *) ptr + 1); 81 | sort_02v_ascending(d01, d02); 82 | _mm256_storeu_si256((__m256i *) ptr + 0, d01); 83 | _mm256_storeu_si256((__m256i *) ptr + 1, d02); 84 | } 85 | 86 | 87 | void BM_blacher(benchmark::State& state) 88 | { 89 | if (!vxsort::supports_vector_machine(vector_machine::AVX2)) { 90 | state.SkipWithError("Current CPU does not support the minimal features for this test"); 91 | return; 92 | } 93 | 94 | static const i32 ITERATIONS = 1024; 95 | auto n = 16; 96 | auto v = std::vector<i32>(n); 97 | generate_unique_values_vec(v, (i32)0x1000, (i32)0x8); 98 | 99 | auto copies = generate_copies(ITERATIONS, n, v); 100 | auto begins = generate_array_beginnings(copies); 101 | 102 | uint64_t total_cycles = 0; 103 | for (auto _ : state) { 104 | state.PauseTiming(); 105 | refresh_copies(copies, v); 106 | state.ResumeTiming(); 107 | auto start = cycleclock::Now(); 108 | for (auto i = 0; i < ITERATIONS; i++) { 109 | bitonic_blacher_16_i32(begins[i]); 110 | } 111 | total_cycles += cycleclock::Now() - start; 112 | } 113 | 114 | state.SetBytesProcessed(state.iterations() * n * ITERATIONS * sizeof(i32)); 115 | 116 | state.counters["Time/N"] = make_time_per_n_counter(n * ITERATIONS); 117 | process_perf_counters(state.counters, n * ITERATIONS); 118 | if (!state.counters.contains("cycles/N")) 119 | state.counters["rdtsc-cycles/N"] = make_cycle_per_n_counter((f64)total_cycles / (f64)(n * ITERATIONS * state.iterations())); 120 | } 121 | 122 | BENCHMARK(BM_blacher)->Unit(kNanosecond)->MinTime(0.1); 123 | 124 | } 125 | 126 | #include "vxsort_targets_disable.h" 127 |
-------------------------------------------------------------------------------- /bench/smallsort/BM_smallsort.avx2.cpp: -------------------------------------------------------------------------------- 1 | #include "vxsort_targets_enable_avx2.h" 2 | 3 | #include "BM_smallsort.h" 4 | 5 | #include <smallsort/bitonic_sort.avx2.h> 6 | 7 | namespace vxsort_bench { 8 | using namespace vxsort::types; 9 | using benchmark::TimeUnit; 10 | using vm = vxsort::vector_machine; 11 | 12 | BENCHMARK_TEMPLATE(BM_bitonic_sort, i16, vm::AVX2)->DenseRange(16, 4096, 8)->Unit(kNanosecond)->MinTime(0.1); 13 | BENCHMARK_TEMPLATE(BM_bitonic_sort, u16, vm::AVX2)->DenseRange(16, 4096, 8)->Unit(kNanosecond)->MinTime(0.1); 14 | BENCHMARK_TEMPLATE(BM_bitonic_sort, i32, vm::AVX2)->DenseRange( 4, 2048, 4)->Unit(kNanosecond)->MinTime(0.1); 15 | BENCHMARK_TEMPLATE(BM_bitonic_sort, u32, vm::AVX2)->DenseRange( 4, 2048, 4)->Unit(kNanosecond)->MinTime(0.1); 16 | BENCHMARK_TEMPLATE(BM_bitonic_sort, f32, vm::AVX2)->DenseRange( 4, 2048, 4)->Unit(kNanosecond)->MinTime(0.1); 17 | BENCHMARK_TEMPLATE(BM_bitonic_sort, i64, vm::AVX2)->DenseRange( 2, 1024, 2)->Unit(kNanosecond)->MinTime(0.1); 18 | BENCHMARK_TEMPLATE(BM_bitonic_sort, u64, vm::AVX2)->DenseRange( 2, 1024, 2)->Unit(kNanosecond)->MinTime(0.1); 19 | BENCHMARK_TEMPLATE(BM_bitonic_sort, f64, vm::AVX2)->DenseRange( 2, 1024, 2)->Unit(kNanosecond)->MinTime(0.1); 20 | 21 | BENCHMARK_TEMPLATE(BM_bitonic_machine, i16, vm::AVX2, 2)->Unit(kNanosecond)->MinTime(0.1); 22 | BENCHMARK_TEMPLATE(BM_bitonic_machine, u16, vm::AVX2, 2)->Unit(kNanosecond)->MinTime(0.1); 23 | BENCHMARK_TEMPLATE(BM_bitonic_machine, i32, vm::AVX2, 2)->Unit(kNanosecond)->MinTime(0.1); 24 | BENCHMARK_TEMPLATE(BM_bitonic_machine, u32, vm::AVX2, 2)->Unit(kNanosecond)->MinTime(0.1); 25 | BENCHMARK_TEMPLATE(BM_bitonic_machine, i64, vm::AVX2,
2)->Unit(kNanosecond)->MinTime(0.1); 26 | BENCHMARK_TEMPLATE(BM_bitonic_machine, u64, vm::AVX2, 2)->Unit(kNanosecond)->MinTime(0.1); 27 | BENCHMARK_TEMPLATE(BM_bitonic_machine, f32, vm::AVX2, 2)->Unit(kNanosecond)->MinTime(0.1); 28 | BENCHMARK_TEMPLATE(BM_bitonic_machine, f64, vm::AVX2, 2)->Unit(kNanosecond)->MinTime(0.1); 29 | 30 | } 31 | 32 | #include "vxsort_targets_disable.h" 33 |
-------------------------------------------------------------------------------- /bench/smallsort/BM_smallsort.avx512.cpp: -------------------------------------------------------------------------------- 1 | #include "vxsort_targets_enable_avx512.h" 2 | 3 | #include "BM_smallsort.h" 4 | 5 | #include <smallsort/bitonic_sort.avx512.h> 6 | 7 | namespace vxsort_bench { 8 | using namespace vxsort::types; 9 | using benchmark::TimeUnit; 10 | using vm = vxsort::vector_machine; 11 | 12 | BENCHMARK_TEMPLATE(BM_bitonic_sort, i16, vm::AVX512)->DenseRange(8, 4096, 8)->Unit(kNanosecond)->MinTime(0.1); 13 | BENCHMARK_TEMPLATE(BM_bitonic_sort, u16, vm::AVX512)->DenseRange(8, 4096, 8)->Unit(kNanosecond)->MinTime(0.1); 14 | BENCHMARK_TEMPLATE(BM_bitonic_sort, i32, vm::AVX512)->DenseRange(4, 2048, 4)->Unit(kNanosecond)->MinTime(0.1); 15 | BENCHMARK_TEMPLATE(BM_bitonic_sort, u32, vm::AVX512)->DenseRange(4, 2048, 4)->Unit(kNanosecond)->MinTime(0.1); 16 | BENCHMARK_TEMPLATE(BM_bitonic_sort, f32, vm::AVX512)->DenseRange(4, 2048, 4)->Unit(kNanosecond)->MinTime(0.1); 17 | BENCHMARK_TEMPLATE(BM_bitonic_sort, i64, vm::AVX512)->DenseRange(2, 1024, 2)->Unit(kNanosecond)->MinTime(0.1); 18 | BENCHMARK_TEMPLATE(BM_bitonic_sort, u64, vm::AVX512)->DenseRange(2, 1024, 2)->Unit(kNanosecond)->MinTime(0.1); 19 | BENCHMARK_TEMPLATE(BM_bitonic_sort, f64, vm::AVX512)->DenseRange(2, 1024, 2)->Unit(kNanosecond)->MinTime(0.1); 20 | 21 | BENCHMARK_TEMPLATE(BM_bitonic_machine, i16, vm::AVX512, 2)->Unit(kNanosecond)->MinTime(0.1); 22 | BENCHMARK_TEMPLATE(BM_bitonic_machine, u16, vm::AVX512, 2)->Unit(kNanosecond)->MinTime(0.1); 23 | BENCHMARK_TEMPLATE(BM_bitonic_machine, i32, vm::AVX512, 2)->Unit(kNanosecond)->MinTime(0.1); 24 | BENCHMARK_TEMPLATE(BM_bitonic_machine, u32, vm::AVX512, 2)->Unit(kNanosecond)->MinTime(0.1); 25 | BENCHMARK_TEMPLATE(BM_bitonic_machine, i64, vm::AVX512, 2)->Unit(kNanosecond)->MinTime(0.1); 26 | BENCHMARK_TEMPLATE(BM_bitonic_machine, u64, vm::AVX512, 2)->Unit(kNanosecond)->MinTime(0.1); 27 | BENCHMARK_TEMPLATE(BM_bitonic_machine, f32, vm::AVX512, 2)->Unit(kNanosecond)->MinTime(0.1); 28 | BENCHMARK_TEMPLATE(BM_bitonic_machine, f64, vm::AVX512, 2)->Unit(kNanosecond)->MinTime(0.1); 29 | 30 | } 31 | 32 | #include "vxsort_targets_disable.h" 33 |
-------------------------------------------------------------------------------- /bench/smallsort/BM_smallsort.h: -------------------------------------------------------------------------------- 1 | #include <benchmark/benchmark.h> 2 | #include <random> 3 | #include <thread> 4 | #include <vector> 5 | 6 | #include "../bench_isa.h" 7 | #include <smallsort/bitonic_sort.h> 8 | 9 | #include "../stolen-cycleclock.h" 10 | #include "../util.h" 11 | 12 | namespace vxsort_bench { 13 | using namespace vxsort::types; 14 | using vxsort::vector_machine; 15 | 16 | const auto processor_count = std::thread::hardware_concurrency(); 17 | 18 | template <typename Q, vector_machine M> 19 | static void BM_bitonic_sort(benchmark::State& state) { 20 | VXSORT_BENCH_ISA(); 21 | 22 | using BM = vxsort::smallsort::bitonic<Q, M>; 23 | 24 | static const i32 ITERATIONS = 1024; 25 | auto n = state.range(0); 26 | auto v = std::vector<Q>(n); 27 | generate_unique_values_vec(v, (Q)0x1000, (Q)0x8); 28 | 29 | auto copies = generate_copies(ITERATIONS, n, v); 30 | auto begins =
generate_array_beginnings(copies); 31 | 32 | uint64_t total_cycles = 0; 33 | for (auto _ : state) { 34 | state.PauseTiming(); 35 | refresh_copies(copies, v); 36 | state.ResumeTiming(); 37 | auto start = cycleclock::Now(); 38 | for (auto i = 0; i < ITERATIONS; i++) 39 | BM::sort(begins[i], n); 40 | total_cycles += cycleclock::Now() - start; 41 | } 42 | 43 | state.SetBytesProcessed(state.iterations() * n * ITERATIONS * sizeof(Q)); 44 | 45 | state.counters["Time/N"] = make_time_per_n_counter(n * ITERATIONS); 46 | process_perf_counters(state.counters, n * ITERATIONS); 47 | if (!state.counters.contains("cycles/N")) 48 | state.counters["rdtsc-cycles/N"] = make_cycle_per_n_counter((f64)total_cycles / (f64)(n * ITERATIONS * state.iterations())); 49 | } 50 | 51 | template <typename Q, vector_machine M, i32 N> 52 | static void BM_bitonic_machine(benchmark::State& state) { 53 | VXSORT_BENCH_ISA(); 54 | 55 | static_assert(N > 0, "N must be greater than 0"); 56 | static_assert(N <= 4, "N cannot exceed 4"); 57 | 58 | using BM = vxsort::smallsort::bitonic_machine<Q, M>; 59 | 60 | static const i32 ITERATIONS = 1024; 61 | auto n = N * BM::N; 62 | auto v = std::vector<Q>(n); 63 | generate_unique_values_vec(v, (Q)0x1000, (Q)0x8); 64 | 65 | auto copies = generate_copies(ITERATIONS, n, v); 66 | auto begins = generate_array_beginnings(copies); 67 | 68 | uint64_t total_cycles = 0; 69 | for (auto _ : state) { 70 | state.PauseTiming(); 71 | refresh_copies(copies, v); 72 | state.ResumeTiming(); 73 | auto start = cycleclock::Now(); 74 | for (auto i = 0; i < ITERATIONS; i++) { 75 | if (N == 1) 76 | BM::sort_01v_full_ascending(begins[i]); 77 | else if (N == 2) 78 | BM::sort_02v_full_ascending(begins[i]); 79 | else if (N == 3) 80 | BM::sort_03v_full_ascending(begins[i]); 81 | else if (N == 4) 82 | BM::sort_04v_full_ascending(begins[i]); 83 | 84 | } 85 | total_cycles += cycleclock::Now() - start; 86 | } 87 | 88 | state.SetBytesProcessed(state.iterations() * n * ITERATIONS * sizeof(Q)); 89 | 90 | state.counters["Time/N"] = make_time_per_n_counter(n * ITERATIONS); 91 | process_perf_counters(state.counters, n * ITERATIONS); 92 | if (!state.counters.contains("cycles/N")) 93 | state.counters["rdtsc-cycles/N"] = make_cycle_per_n_counter((f64)total_cycles / (f64)(n * ITERATIONS * state.iterations())); 94 | } 95 | 96 | } 97 |
-------------------------------------------------------------------------------- /bench/util.h: -------------------------------------------------------------------------------- 1 | #ifndef VXSORT_BENCH_UTIL_H 2 | #define VXSORT_BENCH_UTIL_H 3 | 4 | #include <benchmark/benchmark.h> 5 | 6 | #include <algorithm> 7 | #include <random> 8 | #include <string> 9 | #include <vector> 10 | 11 | #include <defs.h> 12 | 13 | #include "stolen-cycleclock.h" 14 | 15 | using namespace benchmark; 16 | 17 | namespace vxsort_bench { 18 | 19 | using namespace vxsort::types; 20 | 21 | Counter make_time_per_n_counter(i64 n); 22 | 23 | Counter make_cycle_per_n_counter(f64 n); 24 | 25 | std::string get_crypto_hash(void *start, void *end); 26 | 27 | void process_perf_counters(UserCounters &counters, i64 num_elements); 28 | 29 | extern std::random_device::result_type global_bench_random_seed; 30 | 31 | template <typename T> 32 | void generate_unique_values_vec(std::vector<T>& vec, T start, T stride) { 33 | for (usize i = 0; i < vec.size(); i++, start += stride) 34 | vec[i] = start; 35 | 36 | std::mt19937_64 g(global_bench_random_seed); 37 | 38 | std::shuffle(vec.begin(), vec.end(), g); 39 | } 40 | 41 | template <typename T, typename U = T> 42 | std::vector<U*> generate_array_beginnings(std::vector<std::vector<T>> &copies) { 43 | const auto num_copies = copies.size(); 44 | std::vector<U*> begins(num_copies); 45 | for
(usize i = 0; i < num_copies; i++) 46 | begins[i] = (U*)copies[i].data(); 47 | return begins; 48 | } 49 | 50 | template <typename T> 51 | void refresh_copies(std::vector<std::vector<T>> &copies, std::vector<T>& orig) { 52 | const auto begin = orig.begin(); 53 | const auto end = orig.end(); 54 | const auto num_copies = copies.size(); 55 | for (usize i = 0; i < num_copies; i++) 56 | copies[i].assign(begin, end); 57 | } 58 | 59 | template <typename T> 60 | std::vector<std::vector<T>> generate_copies(usize num_copies, i64 n, std::vector<T>& orig) { 61 | std::vector<std::vector<T>> copies(num_copies); 62 | for (usize i = 0; i < num_copies; i++) 63 | copies[i] = std::vector<T>(n); 64 | refresh_copies(copies, orig); 65 | return copies; 66 | } 67 | 68 | template <typename T> 69 | std::vector<T> shuffled_seq(usize size, T start, T stride, std::mt19937_64& rng) { 70 | std::vector<T> v; v.reserve(size); 71 | for (usize i = 0; i < size; ++i) 72 | v.push_back(start + stride * i); 73 | std::shuffle(v.begin(), v.end(), rng); 74 | return v; 75 | } 76 | 77 | template <typename T> 78 | std::vector<T> shuffled_16_values(usize size, T start, T stride, std::mt19937_64& rng) { 79 | std::vector<T> v; v.reserve(size); 80 | for (usize i = 0; i < size; ++i) 81 | v.push_back(start + stride * (i % 16)); 82 | std::shuffle(v.begin(), v.end(), rng); 83 | return v; 84 | } 85 | 86 | template <typename T> 87 | std::vector<T> all_equal(isize size, T start) { 88 | std::vector<T> v; v.reserve(size); 89 | for (i32 i = 0; i < size; ++i) 90 | v.push_back(start); 91 | return v; 92 | } 93 | 94 | template <typename T> 95 | std::vector<T> ascending_int(isize size, T start, T stride) { 96 | std::vector<T> v; v.reserve(size); 97 | for (isize i = 0; i < size; ++i) 98 | v.push_back(start + stride * i); 99 | return v; 100 | } 101 | 102 | template <typename T> 103 | std::vector<T> descending_int(isize size, T start, T stride) { 104 | std::vector<T> v; v.reserve(size); 105 | for (isize i = size - 1; i >= 0; --i) 106 | v.push_back(start + stride * i); 107 | return v; 108 | } 109 | 110 | template <typename T> 111 | std::vector<T> pipe_organ(isize size, T start, T stride, std::mt19937_64&) { 112 | std::vector<T> v; v.reserve(size); 113 | for (isize i = 0; i < size/2; ++i) 114 | v.push_back(start + stride * i); 115 | for (isize i = size/2; i < size; ++i) 116 | v.push_back(start + (size - i) * stride); 117 | return v; 118 | } 119 | 120 | template <typename T> 121 | std::vector<T> push_front(isize size, T start, T stride, std::mt19937_64&) { 122 | std::vector<T> v; v.reserve(size); 123 | for (isize i = 1; i < size; ++i) 124 | v.push_back(start + stride * i); 125 | v.push_back(start); 126 | return v; 127 | } 128 | 129 | template <typename T> 130 | std::vector<T> push_middle(isize size, T start, T stride, std::mt19937_64&) { 131 | std::vector<T> v; v.reserve(size); 132 | for (isize i = 0; i < size; ++i) { 133 | if (i != size/2) 134 | v.push_back(start + stride * i); 135 | } 136 | v.push_back(start + stride * (size/2)); 137 | return v; 138 | } 139 | 140 | } 141 | 142 | #endif //VXSORT_BENCH_UTIL_H 143 |
-------------------------------------------------------------------------------- /build.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | __BuildOS="" 4 | __CleanBuild=0 5 | 6 | case $OSTYPE in 7 | msys|cygwin) 8 | __BuildOS=win 9 | ;; 10 | *) 11 | __BuildOS=linux 12 | ;; 13 | esac 14 | 15 | __BuildArch=x64 16 | __BuildType=Debug 17 | __COMPILER=gcc 18 | __CC=gcc 19 | __CXX=g++ 20 | 21 | for i in "$@" 22 | do 23 | lowerI=${i,,} 24 | case $lowerI in 25 | -?|-h|--help) 26 | echo "Usage: $0 [x64|x86] [debug|release|iaca] [clang|gcc] [test] [clean]" >&2 27 | exit 1 28 | ;; 29 | x64) 30 | __BuildArch=x64 31 | ;; 32 | x86) 33 | __BuildArch=x86 34 | ;; 35 | debug) 36 | __BuildType=Debug 37 | ;; 38
| release) 39 | __BuildType=Release 40 | ;; 41 | iaca) 42 | __BuildType=IACA 43 | ;; 44 | clang) 45 | __COMPILER=clang 46 | __CC=clang-6.0 47 | __CXX=clang++-6.0 48 | ;; 49 | gcc) 50 | __COMPILER=gcc 51 | __CC=gcc 52 | __CXX=g++ 53 | ;; 54 | test) 55 | __RunTests=1 56 | ;; 57 | clean) 58 | __CleanBuild=1 59 | ;; 60 | *) 61 | __UnprocessedBuildArgs="$__UnprocessedBuildArgs $i" 62 | esac 63 | done 64 | 65 | if [ "$__CleanBuild" == "1" ]; then 66 | rm -rf dist 67 | rm -rf build-{debug,release} 68 | exit 0 69 | fi 70 | 71 | __DistDir=build-${__BuildType,,}-${__COMPILER} 72 | 73 | mkdir -p ${__DistDir} 74 | pushd ${__DistDir} 75 | 76 | if [ "$__BuildOS" == "win" ]; then 77 | cmake -G "Visual Studio 15 2017 Win64" -DCMAKE_BUILD_TYPE=${__BuildType^^} .. 78 | 79 | vs=$(/c/Program\ Files\ \(x86\)/Microsoft\ Visual\ Studio/Installer/vswhere.exe -latest | grep installationPath | cut -f 2- -d : -d " ") 80 | __MSBuildExePath="$vs/MSBuild/15.0/Bin/MSBuild.exe" 81 | if [ ! -f "$__MSBuildExePath" ]; then 82 | echo "Error: Could not find MSBuild.exe" 83 | exit 1 84 | fi 85 | "${__MSBuildExePath}" -p:Platform=${__BuildArch} -p:Configuration=${__BuildType} vxsort.sln 86 | build_result=$? 87 | fi 88 | 89 | if [ "$__BuildOS" == "linux" ]; then 90 | CC=${__CC} CXX=${__CXX} cmake -DCMAKE_BUILD_TYPE=${__BuildType^^} .. 91 | make -j4 92 | build_result=$? 93 | fi 94 | 95 | 96 | 97 | 98 | if [ "$__RunTests" == "1" ]; then 99 | ctest --output-on-failure 100 | fi 101 | 102 | # Build complete 103 | if [ ${build_result} == 0 ]; then 104 | echo "vxsort successfully built. ✔" 105 | echo "binaries are available at ${__DistDir}" 106 | else 107 | echo "build failed (${build_result}) 💩" 108 | exit $build_result 109 | fi 110 | popd 111 | 112 | -------------------------------------------------------------------------------- /clang-tidy.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ $# -ne 1 ] ; then 4 | echo "Usage: $0 <build-dir>" >&2 5 | exit 1 6 | fi 7 | 8 | BUILD_DIR=$1 9 | NPROC=${NPROC_CI:-$(nproc)} 10 | 11 | for candidate in run-clang-tidy-12 run-clang-tidy-11 run-clang-tidy-10 run-clang-tidy ; do 12 | if command -v $candidate >/dev/null ; then 13 | echo "Using '$candidate' to execute clang-tidy in parallel" 14 | _RUN_CLANG_TIDY=$candidate 15 | break 16 | fi 17 | done 18 | 19 | if [ -z "${_RUN_CLANG_TIDY}" ] ; then 20 | echo "run-clang-tidy not found in PATH" >&2 21 | exit 1 22 | fi 23 | 24 | ORIGINAL_COMPILE_COMMANDS="$BUILD_DIR"/compile_commands.json 25 | 26 | CXX_PROJECT_DIR="$(mktemp -d --suffix='-clang-tidy-vxsort')" 27 | jq 'map(select( (.["file"] | contains("/googletest") | not) and (.["file"] | contains("/googlebenchmark") | not) and (.["file"] | contains("/cpu_features") | not) ))' \ 28 | < "$ORIGINAL_COMPILE_COMMANDS" \ 29 | > "$CXX_PROJECT_DIR"/compile_commands.json || exit 1 30 | exec "${_RUN_CLANG_TIDY}" \ 31 | -j "${NPROC}" \ 32 | -p "$CXX_PROJECT_DIR" \ 33 | -config="$(cat .clang-tidy.yml)" 34 | 35 | -------------------------------------------------------------------------------- /cmake/CPM.cmake: -------------------------------------------------------------------------------- 1 | set(CPM_DOWNLOAD_VERSION 0.35.0) 2 | 3 | if(CPM_SOURCE_CACHE) 4 | # Expand relative path.
This is important if the provided path contains a tilde (~) 5 | get_filename_component(CPM_SOURCE_CACHE ${CPM_SOURCE_CACHE} ABSOLUTE) 6 | set(CPM_DOWNLOAD_LOCATION "${CPM_SOURCE_CACHE}/cpm/CPM_${CPM_DOWNLOAD_VERSION}.cmake") 7 | elseif(DEFINED ENV{CPM_SOURCE_CACHE}) 8 | set(CPM_DOWNLOAD_LOCATION "$ENV{CPM_SOURCE_CACHE}/cpm/CPM_${CPM_DOWNLOAD_VERSION}.cmake") 9 | else() 10 | set(CPM_DOWNLOAD_LOCATION "${CMAKE_BINARY_DIR}/cmake/CPM_${CPM_DOWNLOAD_VERSION}.cmake") 11 | endif() 12 | 13 | if(NOT (EXISTS ${CPM_DOWNLOAD_LOCATION})) 14 | message(STATUS "Downloading CPM.cmake to ${CPM_DOWNLOAD_LOCATION}") 15 | file(DOWNLOAD 16 | https://github.com/cpm-cmake/CPM.cmake/releases/download/v${CPM_DOWNLOAD_VERSION}/CPM.cmake 17 | ${CPM_DOWNLOAD_LOCATION} 18 | ) 19 | endif() 20 | 21 | include(${CPM_DOWNLOAD_LOCATION}) 22 | -------------------------------------------------------------------------------- /cmake/ConfigSafeGuards.cmake: -------------------------------------------------------------------------------- 1 | # Adapted from: https://github.com/bast/cmake-example/tree/master/cmake 2 | #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 3 | 4 | # guard against in-source builds 5 | if(${CMAKE_CURRENT_SOURCE_DIR} STREQUAL ${CMAKE_CURRENT_BINARY_DIR}) 6 | message(FATAL_ERROR "In-source builds not allowed. Please make a new directory (called a build directory) and run CMake from there.") 7 | endif() 8 | 9 | # guard against bad build-type strings 10 | if(NOT CMAKE_BUILD_TYPE) 11 | set(CMAKE_BUILD_TYPE "Debug") 12 | endif() 13 | 14 | string(TOLOWER "${CMAKE_BUILD_TYPE}" cmake_build_type_tolower) 15 | string(TOUPPER "${CMAKE_BUILD_TYPE}" cmake_build_type_toupper) 16 | if( NOT cmake_build_type_tolower STREQUAL "debug" 17 | AND NOT cmake_build_type_tolower STREQUAL "release" 18 | AND NOT cmake_build_type_tolower STREQUAL "relwithdebinfo") 19 | message(FATAL_ERROR "Unknown build type \"${CMAKE_BUILD_TYPE}\". 
Allowed values are Debug, Release, RelWithDebInfo (case-insensitive).") 20 | endif() 21 | -------------------------------------------------------------------------------- /cmake/EnableLocalGtestDiscovery.cmake: -------------------------------------------------------------------------------- 1 | if(NOT EXISTS ${CMAKE_FIND_PACKAGE_REDIRECTS_DIR}/gtest-config.cmake 2 | AND NOT EXISTS ${CMAKE_FIND_PACKAGE_REDIRECTS_DIR}/GTestConfig.cmake) 3 | file( 4 | WRITE ${CMAKE_FIND_PACKAGE_REDIRECTS_DIR}/gtest-config.cmake 5 | [=[ 6 | include(CMakeFindDependencyMacro) 7 | find_dependency(googletest) 8 | if(NOT TARGET GTest::GTest) 9 | add_library(GTest::GTest INTERFACE IMPORTED) 10 | target_link_libraries(GTest::GTest INTERFACE GTest::gtest) 11 | endif() 12 | if(NOT TARGET GTest::Main) 13 | add_library(GTest::Main INTERFACE IMPORTED) 14 | target_link_libraries(GTest::Main INTERFACE GTest::gtest_main) 15 | endif() 16 | ]=]) 17 | endif() 18 | 19 | if(NOT EXISTS ${CMAKE_FIND_PACKAGE_REDIRECTS_DIR}/gtest-config-version.cmake 20 | AND NOT EXISTS ${CMAKE_FIND_PACKAGE_REDIRECTS_DIR}/GTestConfigVersion.cmake) 21 | file( 22 | WRITE ${CMAKE_FIND_PACKAGE_REDIRECTS_DIR}/gtest-config-version.cmake 23 | [=[ 24 | include(${CMAKE_FIND_PACKAGE_REDIRECTS_DIR}/googletest-config-version.cmake OPTIONAL) 25 | if(NOT PACKAGE_VERSION_COMPATIBLE) 26 | include(${CMAKE_FIND_PACKAGE_REDIRECTS_DIR}/googletestConfigVersion.cmake OPTIONAL) 27 | endif() 28 | ]=]) 29 | endif() 30 | -------------------------------------------------------------------------------- /cmake/GetHostType.cmake: -------------------------------------------------------------------------------- 1 | set(PROCESSOR_IS_MIPS FALSE) 2 | set(PROCESSOR_IS_ARM FALSE) 3 | set(PROCESSOR_IS_AARCH64 FALSE) 4 | set(PROCESSOR_IS_X86 FALSE) 5 | set(PROCESSOR_IS_POWER FALSE) 6 | 7 | if(CMAKE_SYSTEM_PROCESSOR MATCHES "^mips") 8 | set(PROCESSOR_IS_MIPS TRUE) 9 | elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^arm") 10 | set(PROCESSOR_IS_ARM TRUE) 11 | elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^aarch64") 12 | set(PROCESSOR_IS_AARCH64 TRUE) 13 | elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "(x86_64)|(AMD64|amd64)|(^i.86$)") 14 | set(PROCESSOR_IS_X86 TRUE) 15 | elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(powerpc|ppc)") 16 | set(PROCESSOR_IS_POWER TRUE) 17 | endif() -------------------------------------------------------------------------------- /cmake/Modules/FindLLVMAr.cmake: -------------------------------------------------------------------------------- 1 | include(FeatureSummary) 2 | 3 | find_program(LLVMAR_EXECUTABLE 4 | NAMES llvm-ar 5 | DOC "The llvm-ar executable" 6 | ) 7 | 8 | include(FindPackageHandleStandardArgs) 9 | find_package_handle_standard_args(LLVMAr 10 | DEFAULT_MSG 11 | LLVMAR_EXECUTABLE) 12 | 13 | SET_PACKAGE_PROPERTIES(LLVMAr PROPERTIES 14 | URL https://llvm.org/docs/CommandGuide/llvm-ar.html 15 | DESCRIPTION "create, modify, and extract from archives" 16 | ) 17 | -------------------------------------------------------------------------------- /cmake/Modules/FindLLVMNm.cmake: -------------------------------------------------------------------------------- 1 | include(FeatureSummary) 2 | 3 | find_program(LLVMNM_EXECUTABLE 4 | NAMES llvm-nm 5 | DOC "The llvm-nm executable" 6 | ) 7 | 8 | include(FindPackageHandleStandardArgs) 9 | find_package_handle_standard_args(LLVMNm 10 | DEFAULT_MSG 11 | LLVMNM_EXECUTABLE) 12 | 13 | SET_PACKAGE_PROPERTIES(LLVMNm PROPERTIES 14 | URL https://llvm.org/docs/CommandGuide/llvm-nm.html 15 | DESCRIPTION "list LLVM bitcode and object file's symbol
table" 16 | ) 17 | -------------------------------------------------------------------------------- /cmake/Modules/FindLLVMRanLib.cmake: -------------------------------------------------------------------------------- 1 | include(FeatureSummary) 2 | 3 | find_program(LLVMRANLIB_EXECUTABLE 4 | NAMES llvm-ranlib 5 | DOC "The llvm-ranlib executable" 6 | ) 7 | 8 | include(FindPackageHandleStandardArgs) 9 | find_package_handle_standard_args(LLVMRanLib 10 | DEFAULT_MSG 11 | LLVMRANLIB_EXECUTABLE) 12 | 13 | SET_PACKAGE_PROPERTIES(LLVMRanLib PROPERTIES 14 | DESCRIPTION "generate index for LLVM archive" 15 | ) 16 | -------------------------------------------------------------------------------- /demo/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set(TARGET_NAME ${CMAKE_PROJECT_NAME}_demo) 2 | 3 | set(demo_SOURCES 4 | demo.cpp) 5 | 6 | if (${PROCESSOR_IS_X86}) 7 | list(APPEND demo_SOURCES 8 | do_avx2.cpp 9 | do_avx512.cpp) 10 | endif() 11 | 12 | add_executable(${CMAKE_PROJECT_NAME}_demo ${demo_SOURCES}) 13 | 14 | target_link_libraries(${TARGET_NAME} ${CMAKE_PROJECT_NAME}_lib) 15 | -------------------------------------------------------------------------------- /demo/demo.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | using namespace std; 6 | 7 | #include "isa_detection.h" 8 | 9 | using vxsort::vector_machine; 10 | using namespace vxsort::types; 11 | 12 | extern void do_avx2(i64 *begin, i64 *end); 13 | extern void do_avx512(i64 *begin, i64 *end); 14 | 15 | std::vector generate_random_garbage(const usize size) { 16 | 17 | auto vec = std::vector(size); 18 | std::iota(vec.begin(), vec.end(), 666); 19 | 20 | std::random_device rd; 21 | std::mt19937 g(rd()); 22 | 23 | std::shuffle(vec.begin(), vec.end(), g); 24 | return vec; 25 | } 26 | 27 | int main(int argc, char** argv) { 28 | if (argc != 2) { 29 | fprintf(stderr, "demo array size must be specified\n"); 30 | return -1; 31 | } 32 | 33 | const size_t vector_size = atoi(argv[1]); 34 | auto v = generate_random_garbage(vector_size); 35 | 36 | const auto begin = v.data(); 37 | const auto end = begin + vector_size - 1; 38 | 39 | #if defined(CPU_FEATURES_ARCH_X86) 40 | if (vxsort::supports_vector_machine(vxsort::vector_machine::AVX512)) { 41 | fprintf(stderr, "Sorting with AVX512..."); 42 | do_avx512(begin, end); 43 | fprintf(stderr, "...done!\n"); 44 | } else if (vxsort::supports_vector_machine(vxsort::vector_machine::AVX2)) { 45 | fprintf(stderr, "Sorting with AVX2..."); 46 | do_avx2(begin, end); 47 | fprintf(stderr, "...done!\n"); 48 | } else 49 | #endif 50 | #if defined(CPU_FEATURES_ARCH_AARCH64) 51 | if (vxsort::supports_vector_machine(vxsort::vector_machine::NEON)) { 52 | } else 53 | #endif 54 | 55 | { 56 | fprintf(stderr, "CPU doesn't seem to support any vectorized ISA, bye-bye\n"); 57 | return -2; 58 | } 59 | 60 | return 0; 61 | } 62 | -------------------------------------------------------------------------------- /demo/do_avx2.cpp: -------------------------------------------------------------------------------- 1 | #include "vxsort_targets_enable_avx2.h" 2 | 3 | #include "vxsort.avx2.h" 4 | 5 | using namespace vxsort::types; 6 | 7 | void do_avx2(i64 *begin, i64 *end) { 8 | auto sorter = vxsort::vxsort(); 9 | sorter.sort(begin, end); 10 | } 11 | #include "vxsort_targets_disable.h" 12 | -------------------------------------------------------------------------------- /demo/do_avx512.cpp: 
-------------------------------------------------------------------------------- 1 | #include "vxsort_targets_enable_avx512.h" 2 | 3 | #include "vxsort.avx512.h" 4 | 5 | using namespace vxsort::types; 6 | 7 | void do_avx512(i64 *begin, i64 *end) { 8 | auto sorter = vxsort::vxsort<i64, vxsort::vector_machine::AVX512>(); 9 | sorter.sort(begin, end); 10 | } 11 | #include "vxsort_targets_disable.h" 12 | -------------------------------------------------------------------------------- /tests/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set(TARGET_NAME ${CMAKE_PROJECT_NAME}_tests) 2 | 3 | include(GoogleTest) 4 | 5 | set(test_HEADERS 6 | smallsort/smallsort_test.h 7 | fullsort/fullsort_test.h 8 | mini_tests/pack_machine_test.h 9 | mini_tests/partition_machine_test.h 10 | mini_tests/masked_load_store_test.h 11 | test_isa.h) 12 | 13 | list(APPEND i_sort_types 14 | i16 15 | i32 16 | i64 17 | ) 18 | 19 | list(APPEND u_sort_types 20 | u16 21 | u32 22 | u64 23 | ) 24 | 25 | list(APPEND f_sort_types 26 | f32 27 | f64 28 | ) 29 | 30 | list(APPEND sort_types 31 | i 32 | u 33 | f 34 | ) 35 | 36 | list(APPEND x86_isas 37 | avx2 38 | avx512 39 | ) 40 | 41 | list(APPEND test_SOURCES 42 | gtest_main.cpp 43 | mini_tests/masked_load_store.sanity.cpp 44 | ) 45 | 46 | if (${PROCESSOR_IS_X86}) 47 | set(test_avx2_SOURCES ${test_SOURCES}) 48 | list(APPEND test_avx2_SOURCES 49 | smallsort/smallsort.avx2.cpp 50 | fullsort/fullsort.avx2.cpp 51 | mini_tests/masked_load_store.avx2.cpp 52 | mini_tests/partition_machine.avx2.cpp 53 | mini_tests/pack_machine.avx2.cpp 54 | ) 55 | 56 | set(test_avx512_SOURCES ${test_SOURCES}) 57 | list(APPEND test_avx512_SOURCES 58 | smallsort/smallsort.avx512.cpp 59 | fullsort/fullsort.avx512.cpp 60 | mini_tests/masked_load_store.avx512.cpp 61 | mini_tests/partition_machine.avx512.cpp 62 | mini_tests/pack_machine.avx512.cpp 63 | ) 64 | 65 | 66 | 67 | foreach(v ${x86_isas}) 68 | foreach(tf ${sort_types}) 69 | string(TOUPPER ${v} vu) 70 | add_executable(${TARGET_NAME}_${v}_${tf} ${test_${v}_SOURCES} ${test_HEADERS}) 71 | 72 | foreach(t ${${tf}_sort_types}) 73 | string(TOUPPER ${t} tu) 74 | target_compile_definitions(${TARGET_NAME}_${v}_${tf} PRIVATE VXSORT_TEST_${vu}_${tu}) 75 | endforeach () 76 | 77 | target_link_libraries(${TARGET_NAME}_${v}_${tf} 78 | ${CMAKE_PROJECT_NAME}_lib 79 | Backward::Backward 80 | GTest::gtest 81 | ) 82 | 83 | add_test(${TARGET_NAME}_${v}_${tf} ${TARGET_NAME}_${v}_${tf}) 84 | endforeach() 85 | endforeach() 86 | 87 | endif() 88 | 89 | -------------------------------------------------------------------------------- /tests/fullsort/fullsort_test.h: -------------------------------------------------------------------------------- 1 | #ifndef VXSORT_FULLSORT_TEST_H 2 | #define VXSORT_FULLSORT_TEST_H 3 | 4 | #include <algorithm> 5 | #include <random> 6 | #include <vector> 7 | #include <fmt/format.h> 8 | 9 | #include "../test_isa.h" 10 | #include "vxsort.h" 11 | 12 | namespace vxsort_tests { 13 | using namespace vxsort::types; 14 | using ::vxsort::vector_machine; 15 | 16 | template <typename T, vector_machine M> 17 | void vxsort_test(std::vector<T>& V) { 18 | VXSORT_TEST_ISA(); 19 | 20 | auto v_copy = std::vector<T>(V); 21 | auto begin = V.data(); 22 | auto end = V.data() + V.size() - 1; 23 | 24 | auto sorter = ::vxsort::vxsort<T, M>(); 25 | sorter.sort(begin, end); 26 | 27 | std::sort(v_copy.begin(), v_copy.end()); 28 | usize size = v_copy.size(); 29 | for (usize i = 0; i < size; ++i) { 30 | if (v_copy[i] != V[i]) { 31 | GTEST_FAIL() << fmt::format("value at idx #{} {} != {}", i, v_copy[i], V[i]); 32 | } 33 | } 34 | } 35 | 36 | template <typename T, vector_machine M> 37 |
void vxsort_hinted_test(std::vector<T>& V, T min_value, T max_value) { 38 | VXSORT_TEST_ISA(); 39 | 40 | auto v_copy = std::vector<T>(V); 41 | auto begin = V.data(); 42 | auto end = V.data() + V.size() - 1; 43 | 44 | auto sorter = ::vxsort::vxsort<T, M>(); 45 | sorter.sort(begin, end, min_value, max_value); 46 | 47 | std::sort(v_copy.begin(), v_copy.end()); 48 | usize size = v_copy.size(); 49 | for (usize i = 0; i < size; ++i) { 50 | if (v_copy[i] != V[i]) { 51 | GTEST_FAIL() << fmt::format("value at idx #{} {} != {}", i, v_copy[i], V[i]); 52 | } 53 | } 54 | 55 | } 56 | 57 | } 58 | 59 | #endif // VXSORT_FULLSORT_TEST_H 60 | -------------------------------------------------------------------------------- /tests/gtest_main.cpp: -------------------------------------------------------------------------------- 1 | #include <cstdio> 2 | #include <backward.hpp> 3 | 4 | #include "gtest/gtest.h" 5 | 6 | #if defined(GTEST_OS_ESP8266) || defined(GTEST_OS_ESP32) 7 | // Arduino-like platforms: program entry points are setup/loop instead of main. 8 | 9 | #ifdef GTEST_OS_ESP8266 10 | extern "C" { 11 | #endif 12 | 13 | void setup() { testing::InitGoogleTest(); } 14 | 15 | void loop() { RUN_ALL_TESTS(); } 16 | 17 | #ifdef GTEST_OS_ESP8266 18 | } 19 | #endif 20 | 21 | #elif defined(GTEST_OS_QURT) 22 | // QuRT: program entry point is main, but argc/argv are unusable. 23 | 24 | GTEST_API_ int main() { 25 | printf("Running main() from %s\n", __FILE__); 26 | testing::InitGoogleTest(); 27 | return RUN_ALL_TESTS(); 28 | } 29 | #else 30 | // Normal platforms: program entry point is main, argc/argv are initialized. 31 | 32 | GTEST_API_ int main(int argc, char **argv) { 33 | backward::SignalHandling sh; 34 | 35 | testing::InitGoogleTest(&argc, argv); 36 | return RUN_ALL_TESTS(); 37 | } 38 | #endif -------------------------------------------------------------------------------- /tests/mini_tests/masked_load_store.avx2.cpp: -------------------------------------------------------------------------------- 1 | #include "vxsort_targets_enable_avx2.h" 2 | 3 | #include "masked_load_store_test.h" 4 | #include 5 | 6 | namespace vxsort_tests { 7 | using namespace vxsort::types; 8 | using VM = vxsort::vector_machine; 9 | 10 | template <typename T> 11 | using AVX2MaskedLoadStoreTest = PageWithLavaBoundariesFixture<T, VM::AVX2>; 12 | 13 | using TestTypes = ::testing::Types< 14 | #ifdef VXSORT_TEST_AVX2_I16 15 | i16, i32, i64 16 | #endif 17 | #ifdef VXSORT_TEST_AVX2_U16 18 | u16, u32, u64 19 | #endif 20 | #ifdef VXSORT_TEST_AVX2_F32 21 | f32, f64 22 | #endif 23 | >; 24 | TYPED_TEST_SUITE(AVX2MaskedLoadStoreTest, TestTypes); 25 | 26 | TYPED_TEST(AVX2MaskedLoadStoreTest, PrefixLoadOnPageBoundaryWorks) { 27 | test_prefix_mask_load_on_page_boundary<TypeParam, VM::AVX2>(this); 28 | } 29 | 30 | TYPED_TEST(AVX2MaskedLoadStoreTest, SuffixLoadOnPageBoundaryWorks) { 31 | test_suffix_mask_load_on_page_boundary<TypeParam, VM::AVX2>(this); 32 | } 33 | 34 | TYPED_TEST(AVX2MaskedLoadStoreTest, LeftAlignmentWorks) { 35 | test_left_alignment_and_masked_loads<TypeParam, VM::AVX2>(this); 36 | } 37 | 38 | TYPED_TEST(AVX2MaskedLoadStoreTest, RightAlignmentWorks) { 39 | test_right_alignment_and_masked_loads<TypeParam, VM::AVX2>(this); 40 | } 41 | 42 | 43 | }; 44 | 45 | #include "vxsort_targets_disable.h" 46 | -------------------------------------------------------------------------------- /tests/mini_tests/masked_load_store.avx512.cpp: -------------------------------------------------------------------------------- 1 | #include "vxsort_targets_enable_avx512.h" 2 | 3 | #include "masked_load_store_test.h" 4 | #include 5 | 6 | namespace vxsort_tests { 7 | using namespace vxsort::types; 8 | using VM
= vxsort::vector_machine; 9 | 10 | template 11 | using AVX512MaskedLoadStoreTest = PageWithLavaBoundariesFixture; 12 | 13 | using TestTypes = ::testing::Types< 14 | #ifdef VXSORT_TEST_AVX512_I16 15 | i16, i32, i64 16 | #endif 17 | #ifdef VXSORT_TEST_AVX512_U16 18 | u16, u32, u64 19 | #endif 20 | #ifdef VXSORT_TEST_AVX512_F32 21 | f32, f64 22 | #endif 23 | >; 24 | 25 | TYPED_TEST_SUITE(AVX512MaskedLoadStoreTest, TestTypes); 26 | 27 | TYPED_TEST(AVX512MaskedLoadStoreTest, PrefixLoadOnPageBoundaryWorks) { 28 | test_prefix_mask_load_on_page_boundary(this); 29 | } 30 | 31 | TYPED_TEST(AVX512MaskedLoadStoreTest, SuffixLoadOnPageBoundaryWorks) { 32 | test_suffix_mask_load_on_page_boundary(this); 33 | } 34 | 35 | TYPED_TEST(AVX512MaskedLoadStoreTest, LeftAlignmentWorks) { 36 | test_left_alignment_and_masked_loads(this); 37 | } 38 | 39 | TYPED_TEST(AVX512MaskedLoadStoreTest, RightAlignmentWorks) { 40 | test_right_alignment_and_masked_loads(this); 41 | } 42 | 43 | }; 44 | 45 | #include "vxsort_targets_disable.h" 46 | -------------------------------------------------------------------------------- /tests/mini_tests/masked_load_store.sanity.cpp: -------------------------------------------------------------------------------- 1 | #include "vxsort_targets_enable_avx2.h" 2 | 3 | #include "masked_load_store_test.h" 4 | 5 | namespace vxsort_tests { 6 | using namespace vxsort::types; 7 | using VM = vxsort::vector_machine; 8 | 9 | template 10 | using MaskedLoadStoreDeathTest = PageWithLavaBoundariesFixture; 11 | 12 | using TestTypes = ::testing::Types; 13 | TYPED_TEST_SUITE(MaskedLoadStoreDeathTest, TestTypes); 14 | 15 | TYPED_TEST(MaskedLoadStoreDeathTest, IsSane) { 16 | EXPECT_EQ(*this->page_with_data, this->get_expected_value(this->page_with_data)); 17 | } 18 | 19 | 20 | TYPED_TEST(MaskedLoadStoreDeathTest, WhatKillsMeMakesMeStronger1) { 21 | ASSERT_DEATH(*((volatile i32 *) this->page_with_data - 1), ""); 22 | } 23 | 24 | TYPED_TEST(MaskedLoadStoreDeathTest, WhatKillsMeMakesMeStronger2) { 25 | ASSERT_DEATH(*((volatile i32 *) this->page_with_data + this->num_elements), ""); 26 | } 27 | 28 | }; 29 | 30 | #include "vxsort_targets_disable.h" 31 | -------------------------------------------------------------------------------- /tests/mini_tests/masked_load_store_test.h: -------------------------------------------------------------------------------- 1 | #ifndef VXSORT_MASKED_LOAD_STORE_TEST_H 2 | #define VXSORT_MASKED_LOAD_STORE_TEST_H 3 | 4 | #include 5 | #include "mini_fixtures.h" 6 | 7 | #include "defs.h" 8 | #include "vector_machine/machine_traits.h" 9 | #include "../test_isa.h" 10 | #include "alignment.h" 11 | 12 | namespace vxsort_tests { 13 | using namespace vxsort::types; 14 | using VM = vxsort::vector_machine; 15 | 16 | template 17 | void test_prefix_mask_load_on_page_boundary(PageWithLavaBoundariesFixture *fixture) 18 | { 19 | VXSORT_TEST_ISA(); 20 | 21 | using VMT = vxsort::vxsort_machine_traits; 22 | static constexpr auto MAX = std::numeric_limits::max(); 23 | const auto MAXV = VMT::broadcast(MAX); 24 | 25 | for (auto w = 1; w < VMT::N; w++) { 26 | auto mask = VMT::generate_prefix_mask(w); 27 | auto *load_addr = fixture->page_with_data + fixture->num_elements - w; 28 | auto result = VMT::load_partial_vec((typename VMT::TV *) load_addr, MAXV, mask); 29 | auto &res_array = reinterpret_cast(result); 30 | 31 | for (auto i = 0; i < w; ++i) 32 | ASSERT_EQ(res_array[i], fixture->get_expected_value(load_addr + i)); 33 | for (auto i = w; i < VMT::N; ++i) 34 | ASSERT_EQ(res_array[i], MAX); 35 | } 36 | } 37 | 
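// An illustrative, standalone sketch of the partial-load pattern the tests in
// this header exercise (not part of the test suite). It assumes an AVX2 i32
// machine, so VMT::N == 8; the buffer contents and the width of 3 are made up
// for the example, but every helper named here (broadcast,
// generate_prefix_mask, load_partial_vec) is the same traits API used above:
//
//   using VMT = vxsort::vxsort_machine_traits<i32, vxsort::vector_machine::AVX2>;
//   i32 buf[VMT::N] = {10, 20, 30, 40, 50, 60, 70, 80};
//   const auto MAXV = VMT::broadcast(std::numeric_limits<i32>::max());
//   auto mask = VMT::generate_prefix_mask(3);
//   // Reads only lanes 0..2 from memory; lanes 3..7 are taken from MAXV, so
//   // nothing past buf[2] is ever touched -- which is exactly why these loads
//   // are safe right up against a protected ("lava") page.
//   auto v = VMT::load_partial_vec((typename VMT::TV *) buf, MAXV, mask);
//   // v now holds {10, 20, 30, MAX, MAX, MAX, MAX, MAX}.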
38 | template 39 | void test_suffix_mask_load_on_page_boundary(PageWithLavaBoundariesFixture *fixture) 40 | { 41 | VXSORT_TEST_ISA() 42 | 43 | using VMT = vxsort::vxsort_machine_traits; 44 | static constexpr auto MAX = std::numeric_limits::max(); 45 | const auto MAXV = VMT::broadcast(MAX); 46 | 47 | for (auto w = 1; w < VMT::N; w++) { 48 | auto mask = VMT::generate_suffix_mask(w); 49 | auto *load_addr = fixture->page_with_data - w; 50 | auto result = VMT::load_partial_vec((typename VMT::TV *) load_addr, MAXV, mask); 51 | auto &res_array = reinterpret_cast(result); 52 | 53 | for (auto i = 0; i < w; ++i) 54 | ASSERT_EQ(res_array[i], MAX); 55 | for (auto i = w; i < VMT::N; ++i) 56 | ASSERT_EQ(res_array[i], fixture->get_expected_value(load_addr + i)); 57 | } 58 | } 59 | 60 | template 61 | void test_left_alignment_and_masked_loads(PageWithLavaBoundariesFixture *fixture) 62 | { 63 | VXSORT_TEST_ISA(); 64 | 65 | using VMT = vxsort::vxsort_machine_traits; 66 | using AH = vxsort::alignment_hint; 67 | 68 | static constexpr auto MAX = std::numeric_limits::max(); 69 | const auto MAXV = VMT::broadcast(MAX); 70 | 71 | for (auto w = 0; w < VMT::N; w++) { 72 | auto *load_addr = fixture->page_with_data + w; 73 | 74 | AH align; 75 | align.calc_left_alignment(load_addr); 76 | auto mask = VMT::generate_suffix_mask(align.left_masked_amount); 77 | 78 | load_addr -= align.left_masked_amount; 79 | 80 | ASSERT_TRUE(AH::is_aligned(load_addr)); 81 | 82 | auto result = VMT::load_partial_vec((typename VMT::TV *) load_addr, MAXV, mask); 83 | auto &res_array = reinterpret_cast(result); 84 | 85 | for (auto i = 0; i < align.left_masked_amount; ++i) 86 | ASSERT_EQ(res_array[i], MAX); 87 | for (auto i = align.left_masked_amount; i < VMT::N; ++i) 88 | ASSERT_EQ(res_array[i], fixture->get_expected_value(load_addr + i)); 89 | } 90 | } 91 | 92 | template 93 | void test_right_alignment_and_masked_loads(PageWithLavaBoundariesFixture *fixture) 94 | { 95 | VXSORT_TEST_ISA(); 96 | 97 | using VMT = vxsort::vxsort_machine_traits; 98 | using AH = vxsort::alignment_hint; 99 | 100 | static constexpr auto MAX = std::numeric_limits::max(); 101 | const auto MAXV = VMT::broadcast(MAX); 102 | 103 | for (auto w = 0; w < VMT::N; w++) { 104 | auto *load_addr = fixture->page_with_data + fixture->num_elements - w; 105 | 106 | AH align; 107 | align.calc_right_alignment(load_addr); 108 | 109 | load_addr -= align.right_unmasked_amount; 110 | 111 | ASSERT_TRUE(AH::is_aligned(load_addr)); 112 | 113 | auto mask = VMT::generate_prefix_mask(align.right_unmasked_amount); 114 | auto result = VMT::load_partial_vec((typename VMT::TV *) load_addr, MAXV, mask); 115 | auto &res_array = reinterpret_cast(result); 116 | 117 | for (auto i = 0; i < align.right_unmasked_amount; ++i) 118 | ASSERT_EQ(res_array[i], fixture->get_expected_value(load_addr + i)); 119 | for (auto i = align.right_unmasked_amount; i < VMT::N; ++i) 120 | ASSERT_EQ(res_array[i], MAX); 121 | } 122 | } 123 | 124 | 125 | }; 126 | 127 | #endif //VXSORT_MASKED_LOAD_STORE_TEST_H 128 | -------------------------------------------------------------------------------- /tests/mini_tests/mini_fixtures.h: -------------------------------------------------------------------------------- 1 | #ifndef VXSORT_MINI_FIXTURES_H 2 | #define VXSORT_MINI_FIXTURES_H 3 | 4 | #include 5 | #ifndef _WIN32 6 | #include 7 | #else 8 | #ifndef NOMINMAX 9 | # define NOMINMAX 10 | #endif 11 | #define WIN32_LEAN_AND_MEAN 1 12 | #include 13 | #endif 14 | 15 | #include "defs.h" 16 | #include "vector_machine/machine_traits.h" 17 | 
#include "isa_detection.h" 18 | #include "alignment.h" 19 | 20 | namespace vxsort_tests { 21 | using namespace vxsort::types; 22 | using VM = vxsort::vector_machine; 23 | 24 | static inline usize get_page_size() 25 | { 26 | usize page_size; 27 | #ifdef WIN32 28 | SYSTEM_INFO sys_info; 29 | GetSystemInfo(&sys_info); 30 | page_size = sys_info.dwPageSize; 31 | #else 32 | page_size = sysconf(_SC_PAGESIZE); 33 | #endif 34 | return page_size; 35 | } 36 | 37 | static const i32 page_size = get_page_size(); 38 | 39 | template 40 | class PageWithLavaBoundariesFixture : public ::testing::Test { 41 | using VMT = vxsort::vxsort_machine_traits; 42 | static constexpr i32 N = VMT::N; 43 | static_assert(N < 256, "N must be < 256"); 44 | 45 | protected: 46 | 47 | u8 *create_mapping_with_boundary_pages() { 48 | #ifdef WIN32 49 | auto *mem = (u8 *) VirtualAlloc(nullptr, 3*page_size, MEM_COMMIT, PAGE_READWRITE); 50 | DWORD old_protect; 51 | VirtualProtect(mem, page_size, PAGE_NOACCESS, &old_protect); 52 | VirtualProtect(mem + 2*page_size, page_size, PAGE_NOACCESS, &old_protect); 53 | return mem; 54 | #else 55 | auto *mem = (u8 *) mmap(nullptr, 3*page_size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); 56 | // Make the first and last inaccessible 57 | mprotect(mem, page_size, PROT_NONE); 58 | mprotect(mem + 2*page_size, page_size, PROT_NONE); 59 | return mem; 60 | #endif 61 | } 62 | 63 | void SetUp() override { 64 | // Map 3 pages 65 | mem = create_mapping_with_boundary_pages(); 66 | generate_expected_values(); 67 | num_elements = page_size / sizeof(T); 68 | 69 | page_with_data = reinterpret_cast(mem + page_size); 70 | for (usize i = 0; i < num_elements; i++) { 71 | auto *p = &page_with_data[i]; 72 | *p = get_expected_value(p); 73 | } 74 | } 75 | 76 | void destroy_mapping() { 77 | #ifdef WIN32 78 | VirtualFree(mem, 3*page_size, MEM_DECOMMIT); 79 | #else 80 | munmap(mem, 3*page_size); 81 | #endif 82 | 83 | } 84 | 85 | void TearDown() override { 86 | destroy_mapping(); 87 | } 88 | 89 | void generate_expected_values() 90 | { 91 | static constexpr T max_value = for_packing ? 
(T)std::numeric_limits::max() : (T)std::numeric_limits::max(); 92 | for (auto n = 0; n < N; n++) { 93 | expected_values[n] = (T)n+1; 94 | ASSERT_LE(expected_values[n], max_value); 95 | } 96 | } 97 | 98 | T expected_values[N]; 99 | u8 *mem; 100 | 101 | public: 102 | T get_expected_value(const T *p) 103 | { 104 | const auto offset_in_elements = (((usize) p) / sizeof(T)) & (N-1); 105 | 106 | return expected_values[offset_in_elements]; 107 | } 108 | 109 | 110 | T *page_with_data; 111 | usize num_elements; 112 | 113 | }; 114 | 115 | }; 116 | 117 | #endif //VXSORT_MINI_FIXTURES_H 118 | -------------------------------------------------------------------------------- /tests/mini_tests/pack_machine.avx2.cpp: -------------------------------------------------------------------------------- 1 | #include "vxsort_targets_enable_avx2.h" 2 | 3 | #include 4 | 5 | #include 6 | #include 7 | #include "pack_machine_test.h" 8 | 9 | namespace vxsort_tests { 10 | using namespace vxsort::types; 11 | using VM = vxsort::vector_machine; 12 | 13 | template 14 | using PackMachineAVX2Test = PackMachineTest; 15 | 16 | using TestTypes = ::testing::Types< 17 | #ifdef VXSORT_TEST_AVX2_I16 18 | i16, i32, i64 19 | #endif 20 | #ifdef VXSORT_TEST_AVX2_U16 21 | u16, u32, u64 22 | #endif 23 | #ifdef VXSORT_TEST_AVX2_F32 24 | f32, f64 25 | #endif 26 | >; 27 | TYPED_TEST_SUITE(PackMachineAVX2Test, TestTypes); 28 | 29 | TYPED_TEST(PackMachineAVX2Test, PackingWorks) { 30 | test_packunpack(this); 31 | } 32 | 33 | }; 34 | 35 | #include "vxsort_targets_disable.h" 36 | -------------------------------------------------------------------------------- /tests/mini_tests/pack_machine.avx512.cpp: -------------------------------------------------------------------------------- 1 | #include "vxsort_targets_enable_avx512.h" 2 | 3 | #include 4 | 5 | #include 6 | #include 7 | 8 | #include "pack_machine_test.h" 9 | 10 | namespace vxsort_tests { 11 | using namespace vxsort::types; 12 | using VM = vxsort::vector_machine; 13 | 14 | template 15 | using PackMachineAVX512Test = PackMachineTest; 16 | 17 | using TestTypes = ::testing::Types< 18 | #ifdef VXSORT_TEST_AVX512_I16 19 | i16, i32, i64 20 | #endif 21 | #ifdef VXSORT_TEST_AVX512_U16 22 | u16, u32, u64 23 | #endif 24 | #ifdef VXSORT_TEST_AVX512_F32 25 | f32, f64 26 | #endif 27 | >; 28 | TYPED_TEST_SUITE(PackMachineAVX512Test, TestTypes); 29 | 30 | TYPED_TEST(PackMachineAVX512Test, PackingWorks) { 31 | test_packunpack(this); 32 | } 33 | 34 | }; 35 | 36 | #include "vxsort_targets_disable.h" 37 | -------------------------------------------------------------------------------- /tests/mini_tests/pack_machine_test.h: -------------------------------------------------------------------------------- 1 | #ifndef VXSORT_PACK_MACHINE_TEST_H 2 | #define VXSORT_PACK_MACHINE_TEST_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include "mini_fixtures.h" 9 | 10 | #include "defs.h" 11 | #include "vector_machine/machine_traits.h" 12 | #include "../test_isa.h" 13 | #include "alignment.h" 14 | 15 | namespace vxsort_tests { 16 | using namespace vxsort::types; 17 | using VM = vxsort::vector_machine; 18 | 19 | template 20 | using PackMachineTest = PageWithLavaBoundariesFixture; 21 | 22 | template 23 | void test_packunpack(PackMachineTest *fixture) 24 | { 25 | VXSORT_TEST_ISA(); 26 | 27 | if (!::vxsort::supports_vector_machine(sizeof(T)/2)) { 28 | GTEST_SKIP_("Current CPU does not support the minimal features for this test"); 29 | return; 30 | } 31 | 32 | using VMT = vxsort::vxsort_machine_traits; 33 | 34 | if 
(!VMT::supports_packing()) { 35 | GTEST_SKIP_("primitive type does not support packing"); 36 | return; 37 | } 38 | 39 | using PM = vxsort::pack_machine; 40 | static constexpr auto N = VMT::N; 41 | 42 | auto *load_addr = fixture->page_with_data; 43 | auto s = std::span(load_addr, N*2); 44 | const auto [min, max] = std::minmax_element(s.begin(), s.end()); 45 | 46 | ASSERT_TRUE(VMT::template can_pack<0>(*max - *min)); 47 | 48 | auto d1 = VMT::load_vec((typename VMT::TV *) load_addr); 49 | auto d2 = VMT::load_vec((typename VMT::TV *) load_addr + 1); 50 | 51 | auto constexpr MIN = T(std::numeric_limits::min()); 52 | auto offset = VMT::template shift_n_sub<0>(*min, MIN); 53 | const auto offset_v = VMT::broadcast(offset); 54 | 55 | auto packed_v = PM::pack_vectors(d1, d2, offset_v); 56 | 57 | typename VMT::TV u1, u2; 58 | 59 | PM::unpack_vectors(offset_v, packed_v, u1, u2); 60 | 61 | T spill[N*2]; 62 | VMT::store_vec((typename VMT::TV *) spill, u1); 63 | VMT::store_vec((typename VMT::TV *) spill+1, u2); 64 | 65 | std::vector orig(s.begin(), s.end()); 66 | for (auto u : spill) { 67 | auto it = std::find(orig.begin(), orig.end(), u); 68 | if (it == orig.end()) { 69 | GTEST_FAIL() << fmt::format("Expected to find unpacked value {} in {}", u, fmt::join(s, ", ")); 70 | } 71 | orig.erase(it); 72 | } 73 | ASSERT_EQ(orig.size(), 0); 74 | } 75 | 76 | }; 77 | 78 | #endif //VXSORT_PACK_MACHINE_TEST_H 79 | -------------------------------------------------------------------------------- /tests/mini_tests/partition_machine.avx2.cpp: -------------------------------------------------------------------------------- 1 | #include "vxsort_targets_enable_avx2.h" 2 | 3 | #include 4 | 5 | #include 6 | #include "partition_machine_test.h" 7 | 8 | namespace vxsort_tests { 9 | using namespace vxsort::types; 10 | using VM = vxsort::vector_machine; 11 | 12 | template 13 | using PartitionMachineAVX2Test = PageWithLavaBoundariesFixture; 14 | 15 | using TestTypes = ::testing::Types< 16 | #ifdef VXSORT_TEST_AVX2_I16 17 | i16, i32, i64 18 | #endif 19 | #ifdef VXSORT_TEST_AVX2_U16 20 | u16, u32, u64 21 | #endif 22 | #ifdef VXSORT_TEST_AVX2_F32 23 | f32, f64 24 | #endif 25 | >; 26 | TYPED_TEST_SUITE(PartitionMachineAVX2Test, TestTypes); 27 | 28 | TYPED_TEST(PartitionMachineAVX2Test, PartitioningWorks) { 29 | test_partition(this); 30 | } 31 | 32 | TYPED_TEST(PartitionMachineAVX2Test, PartitioningIsStable) { 33 | test_partition_stability(this); 34 | } 35 | 36 | 37 | TYPED_TEST(PartitionMachineAVX2Test, PartitionAlignmentWorks) { 38 | test_partition_alignment(this); 39 | } 40 | 41 | }; 42 | 43 | #include "vxsort_targets_disable.h" 44 | -------------------------------------------------------------------------------- /tests/mini_tests/partition_machine.avx512.cpp: -------------------------------------------------------------------------------- 1 | #include "vxsort_targets_enable_avx512.h" 2 | 3 | #include 4 | 5 | #include 6 | #include "partition_machine_test.h" 7 | 8 | namespace vxsort_tests { 9 | using namespace vxsort::types; 10 | using VM = vxsort::vector_machine; 11 | 12 | template 13 | using PartitionMachineAVX512Test = PageWithLavaBoundariesFixture; 14 | 15 | using TestTypes = ::testing::Types< 16 | #ifdef VXSORT_TEST_AVX512_I16 17 | i16, i32, i64 18 | #endif 19 | #ifdef VXSORT_TEST_AVX512_U16 20 | u16, u32, u64 21 | #endif 22 | #ifdef VXSORT_TEST_AVX512_F32 23 | f32, f64 24 | #endif 25 | >; 26 | TYPED_TEST_SUITE(PartitionMachineAVX512Test, TestTypes); 27 | 28 | TYPED_TEST(PartitionMachineAVX512Test, PartitioningWorks) { 29 | 
test_partition(this); 30 | } 31 | 32 | TYPED_TEST(PartitionMachineAVX512Test, PartitioningIsStable) { 33 | test_partition_stability(this); 34 | } 35 | 36 | 37 | TYPED_TEST(PartitionMachineAVX512Test, PartitionAlignmentWorks) { 38 | test_partition_alignment(this); 39 | } 40 | 41 | }; 42 | 43 | #include "vxsort_targets_disable.h" 44 | -------------------------------------------------------------------------------- /tests/mini_tests/partition_machine_test.h: -------------------------------------------------------------------------------- 1 | #ifndef VXSORT_PARTITION_MACHINE_TEST_H 2 | #define VXSORT_PARTITION_MACHINE_TEST_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include "mini_fixtures.h" 9 | 10 | #include "defs.h" 11 | #include "vector_machine/machine_traits.h" 12 | #include "../test_isa.h" 13 | #include "alignment.h" 14 | 15 | namespace vxsort_tests { 16 | using namespace vxsort::types; 17 | using VM = vxsort::vector_machine; 18 | 19 | template 20 | void test_partition(PageWithLavaBoundariesFixture *fixture) 21 | { 22 | VXSORT_TEST_ISA(); 23 | 24 | using VMT = vxsort::vxsort_machine_traits; 25 | using PM = vxsort::partition_machine; 26 | static constexpr auto N = VMT::N; 27 | 28 | for (auto p = 0; p < VMT::N; p++) { 29 | auto *load_addr = fixture->page_with_data; 30 | auto pivot = fixture->get_expected_value(load_addr + p) - 1; 31 | 32 | auto s = std::span(load_addr, N); 33 | std::random_device rd; 34 | std::mt19937 gen{rd()}; 35 | std::shuffle(s.begin(), s.end(), gen); 36 | 37 | auto PV = VMT::broadcast(pivot); 38 | 39 | auto data = VMT::load_vec((typename VMT::TV *) load_addr); 40 | 41 | T spill_left[N*2]; 42 | T spill_right[N*2]; 43 | 44 | T* RESTRICT spill_left_end = spill_left; 45 | // partition_block expects the left/right *write* pointers to point 46 | // to the next vector write position, for right write pointer 47 | // this means N elements BEFORE the end of the spill buffer 48 | T* RESTRICT spill_right_start = spill_right + N; 49 | T* RESTRICT spill_right_end = spill_right_start; 50 | 51 | memset(spill_left, 0x66, sizeof(spill_left)); 52 | memset(spill_right, 0x66, sizeof(spill_right)); 53 | 54 | PM::partition_block(data, PV, spill_left_end, spill_right_end); 55 | 56 | ASSERT_EQ(spill_left_end - spill_left, p); 57 | ASSERT_EQ(spill_right_start - spill_right_end, N - p); 58 | 59 | for (auto i = 0; i < p; ++i) { 60 | ASSERT_TRUE(spill_left[i] <= pivot); 61 | } 62 | 63 | for (auto i = VMT::N - 1; i >= p; --i) { 64 | ASSERT_TRUE(spill_right_start[i] > pivot); 65 | 66 | } 67 | } 68 | } 69 | 70 | 71 | template 72 | void test_partition_stability(PageWithLavaBoundariesFixture *fixture) 73 | { 74 | VXSORT_TEST_ISA(); 75 | 76 | using VMT = vxsort::vxsort_machine_traits; 77 | using PM = vxsort::partition_machine; 78 | static constexpr auto N = VMT::N; 79 | 80 | for (auto p = 0; p < VMT::N; p++) { 81 | auto *load_addr = fixture->page_with_data; 82 | auto pivot = fixture->get_expected_value(load_addr + p) - 1; 83 | 84 | auto PV = VMT::broadcast(pivot); 85 | 86 | auto data = VMT::load_vec((typename VMT::TV *) load_addr); 87 | 88 | T spill_left[N*2]; 89 | T spill_right[N*2]; 90 | 91 | T* RESTRICT spill_left_end = spill_left; 92 | // partition_block expects the left/right *write* pointers to point 93 | // to the next vector write position, for right write pointer 94 | // this means N elements BEFORE the end of the spill buffer 95 | T* RESTRICT spill_right_start = spill_right + N; 96 | T* RESTRICT spill_right_end = spill_right_start; 97 | 98 | memset(spill_left, 0x66, 
sizeof(spill_left)); 99 | memset(spill_right, 0x66, sizeof(spill_right)); 100 | 101 | PM::partition_block(data, PV, spill_left_end, spill_right_end); 102 | 103 | ASSERT_EQ(spill_left_end - spill_left, p); 104 | ASSERT_EQ(spill_right_start - spill_right_end, N - p); 105 | 106 | for (auto i = 0; i < p; ++i) { 107 | auto expected_value = fixture->get_expected_value(load_addr + i); 108 | ASSERT_EQ(spill_left[i], expected_value); 109 | } 110 | 111 | for (auto i = VMT::N - 1; i >= p; --i) { 112 | auto expected_value = fixture->get_expected_value(load_addr + i); 113 | ASSERT_EQ(spill_right_start[i], expected_value); 114 | 115 | } 116 | } 117 | } 118 | 119 | template 120 | void test_partition_alignment(PageWithLavaBoundariesFixture *fixture) 121 | { 122 | VXSORT_TEST_ISA(); 123 | 124 | using VMT = vxsort::vxsort_machine_traits; 125 | using PM = vxsort::partition_machine; 126 | using AH = vxsort::alignment_hint; 127 | static constexpr auto N = VMT::N; 128 | 129 | for (auto p = 0; p < VMT::N; p++) { 130 | auto * const left = fixture->page_with_data + p; 131 | auto * const right = fixture->page_with_data + fixture->num_elements - p; 132 | const auto pivot = fixture->get_expected_value(left + p) - 1; 133 | 134 | const auto PV = VMT::broadcast(pivot); 135 | 136 | AH align; 137 | align.calc_left_alignment(left); 138 | align.calc_right_alignment(right); 139 | 140 | T spill_left[N*2]; 141 | T spill_right[N*2]; 142 | 143 | T* RESTRICT spill_left_start = spill_left; 144 | T* RESTRICT spill_left_end = spill_left; 145 | 146 | // aligne_vectorized expects the left/right *write* pointers to point 147 | // to the boundary of the spill buffer, for right write pointer 148 | // this means the first element PAST the end of the spill buffer 149 | T* RESTRICT spill_right_start = spill_right + 2*N; 150 | T* RESTRICT spill_right_end = spill_right_start; 151 | 152 | memset(spill_left, 0x66, sizeof(spill_left)); 153 | memset(spill_right, 0x66, sizeof(spill_right)); 154 | 155 | auto left_masked_amount = align.left_masked_amount; 156 | auto right_unmasked_amount = align.right_unmasked_amount; 157 | 158 | T * RESTRICT left_next = left; 159 | T * RESTRICT right_next = right; 160 | 161 | //fmt::print("left_masked_amount: {}, right_unmasked_amount: {}\n", left_masked_amount, right_unmasked_amount); 162 | 163 | PM::align_vectorized(left_masked_amount, right_unmasked_amount, 164 | PV, 165 | left_next, right_next, 166 | spill_left_start, spill_left_end, 167 | spill_right_start, spill_right_end); 168 | 169 | // align vectorized API is build for continued 170 | // partitioning, so we need to update the right-pointing pointers 171 | // when vectorized partitioning is done by bumping them up by N elements 172 | right_next += N; 173 | spill_right_end += N; 174 | 175 | auto amount_read_left = left_next - left; 176 | auto amount_read_right = right - right_next; 177 | 178 | auto amount_partitioned_left = spill_left_end - spill_left_start; 179 | auto amount_partitioned_right = spill_right_start - spill_right_end; 180 | 181 | ASSERT_EQ(amount_partitioned_left + amount_partitioned_right, 182 | amount_read_left + amount_read_right); 183 | 184 | ASSERT_EQ(spill_left_start - spill_left, align.left_masked_amount); 185 | 186 | for (auto i = 0; i < amount_partitioned_left; ++i) { 187 | ASSERT_LE(spill_left_start[i], pivot); 188 | } 189 | 190 | for (auto i = 0; i < amount_partitioned_right; ++i) { 191 | ASSERT_GT(spill_right_end[i], pivot); 192 | } 193 | } 194 | } 195 | 196 | }; 197 | 198 | #endif //VXSORT_PARTITION_MACHINE_TEST_H 199 | 
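A minimal sketch of the spill-buffer convention that the partition tests above keep re-stating in comments, pulled out of the test harness. It assumes AVX2 and i32 (so N == 8), a translation unit that already includes the partition-machine headers with the AVX2 target enabled, and made-up input values and pivot. partition_block() writes the <= pivot lanes at the left write pointer and the > pivot lanes into the top of the right spill area, advancing left_end forward and right_end backward by the respective lane counts:

    using VMT = vxsort::vxsort_machine_traits<i32, vxsort::vector_machine::AVX2>;
    using PM = vxsort::partition_machine<i32, vxsort::vector_machine::AVX2>;
    constexpr auto N = VMT::N;

    i32 src[N] = {5, 1, 8, 2, 9, 3, 7, 4};
    i32 spill_left[N * 2], spill_right[N * 2];
    i32 *left_end = spill_left;              // next left write position
    i32 *right_start = spill_right + N;      // N elements BEFORE the buffer end
    i32 *right_end = right_start;            // next right write position

    auto data = VMT::load_vec((typename VMT::TV *) src);
    auto PV = VMT::broadcast(4);             // pivot
    PM::partition_block(data, PV, left_end, right_end);
    // left_end advanced by 4: spill_left now begins {1, 2, 3, 4}, all <= pivot.
    // right_end retreated by 4: the > pivot values {5, 8, 9, 7} occupy the top
    // lanes of the right spill buffer (right_start[4..7]), and the next block's
    // > pivot lanes will be packed directly beneath them.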
-------------------------------------------------------------------------------- /tests/smallsort/smallsort.avx2.cpp: -------------------------------------------------------------------------------- 1 | #include "vxsort_targets_enable_avx2.h" 2 | 3 | #include "gtest/gtest.h" 4 | 5 | #include 6 | 7 | #include "smallsort_test.h" 8 | #include "../sort_fixtures.h" 9 | 10 | namespace vxsort_tests { 11 | using namespace vxsort::types; 12 | using VM = vxsort::vector_machine; 13 | 14 | auto bitonic_machine_allvalues_avx2_16 = ValuesIn(range(16, 64, 16)); 15 | auto bitonic_machine_allvalues_avx2_32 = ValuesIn(range(8, 32, 8)); 16 | auto bitonic_machine_allvalues_avx2_64 = ValuesIn(range(4, 16, 4)); 17 | 18 | auto bitonic_allvalues_avx2_16 = ValuesIn(range(1, 8192, 1)); 19 | auto bitonic_allvalues_avx2_32 = ValuesIn(range(1, 4096, 1)); 20 | auto bitonic_allvalues_avx2_64 = ValuesIn(range(1, 2048, 1)); 21 | 22 | #ifdef VXSORT_TEST_AVX2_I16 23 | struct BitonicMachineAVX2_i16 : public SortFixture {}; 24 | struct BitonicAVX2_i16 : public SortFixture {}; 25 | INSTANTIATE_TEST_SUITE_P(BitonicMachineAVX2, BitonicMachineAVX2_i16, bitonic_machine_allvalues_avx2_16, PrintValue()); 26 | INSTANTIATE_TEST_SUITE_P(BitonicAVX2, BitonicAVX2_i16, bitonic_allvalues_avx2_16, PrintValue()); 27 | #endif 28 | 29 | #ifdef VXSORT_TEST_AVX2_I32 30 | struct BitonicMachineAVX2_i32 : public SortFixture {}; 31 | struct BitonicAVX2_i32 : public SortFixture {}; 32 | INSTANTIATE_TEST_SUITE_P(BitonicMachineAVX2, BitonicMachineAVX2_i32, bitonic_machine_allvalues_avx2_32, PrintValue()); 33 | INSTANTIATE_TEST_SUITE_P(BitonicAVX2, BitonicAVX2_i32, bitonic_allvalues_avx2_32, PrintValue()); 34 | #endif 35 | 36 | #ifdef VXSORT_TEST_AVX2_I64 37 | struct BitonicMachineAVX2_i64 : public SortFixture {}; 38 | struct BitonicAVX2_i64 : public SortFixture {}; 39 | INSTANTIATE_TEST_SUITE_P(BitonicMachineAVX2, BitonicMachineAVX2_i64, bitonic_machine_allvalues_avx2_64, PrintValue()); 40 | INSTANTIATE_TEST_SUITE_P(BitonicAVX2, BitonicAVX2_i64, bitonic_allvalues_avx2_64, PrintValue()); 41 | #endif 42 | 43 | #ifdef VXSORT_TEST_AVX2_U16 44 | struct BitonicMachineAVX2_u16 : public SortFixture {}; 45 | struct BitonicAVX2_u16 : public SortFixture {}; 46 | INSTANTIATE_TEST_SUITE_P(BitonicMachineAVX2, BitonicMachineAVX2_u16, bitonic_machine_allvalues_avx2_16, PrintValue()); 47 | INSTANTIATE_TEST_SUITE_P(BitonicAVX2, BitonicAVX2_u16, bitonic_allvalues_avx2_16, PrintValue()); 48 | #endif 49 | 50 | #ifdef VXSORT_TEST_AVX2_U32 51 | struct BitonicMachineAVX2_u32 : public SortFixture {}; 52 | struct BitonicAVX2_u32 : public SortFixture {}; 53 | INSTANTIATE_TEST_SUITE_P(BitonicMachineAVX2, BitonicMachineAVX2_u32, bitonic_machine_allvalues_avx2_32, PrintValue()); 54 | INSTANTIATE_TEST_SUITE_P(BitonicAVX2, BitonicAVX2_u32, bitonic_allvalues_avx2_32, PrintValue()); 55 | #endif 56 | 57 | #ifdef VXSORT_TEST_AVX2_U64 58 | struct BitonicMachineAVX2_u64 : public SortFixture {}; 59 | struct BitonicAVX2_u64 : public SortFixture {}; 60 | INSTANTIATE_TEST_SUITE_P(BitonicMachineAVX2, BitonicMachineAVX2_u64, bitonic_machine_allvalues_avx2_64, PrintValue()); 61 | INSTANTIATE_TEST_SUITE_P(BitonicAVX2, BitonicAVX2_u64, bitonic_allvalues_avx2_64, PrintValue()); 62 | #endif 63 | 64 | #ifdef VXSORT_TEST_AVX2_F32 65 | struct BitonicMachineAVX2_f32 : public SortFixture {}; 66 | struct BitonicAVX2_f32 : public SortFixture {}; 67 | INSTANTIATE_TEST_SUITE_P(BitonicMachineAVX2, BitonicMachineAVX2_f32, bitonic_machine_allvalues_avx2_32, PrintValue()); 68 | 
INSTANTIATE_TEST_SUITE_P(BitonicAVX2, BitonicAVX2_f32, bitonic_allvalues_avx2_32, PrintValue()); 69 | #endif 70 | 71 | #ifdef VXSORT_TEST_AVX2_F64 72 | struct BitonicMachineAVX2_f64 : public SortFixture {}; 73 | struct BitonicAVX2_f64 : public SortFixture {}; 74 | INSTANTIATE_TEST_SUITE_P(BitonicMachineAVX2, BitonicMachineAVX2_f64, bitonic_machine_allvalues_avx2_64, PrintValue()); 75 | INSTANTIATE_TEST_SUITE_P(BitonicAVX2, BitonicAVX2_f64, bitonic_allvalues_avx2_64, PrintValue()); 76 | #endif 77 | 78 | #ifdef VXSORT_TEST_AVX2_I16 79 | TEST_P(BitonicMachineAVX2_i16, BitonicSortAVX2Asc) { bitonic_machine_sort_test(V); } 80 | TEST_P(BitonicAVX2_i16, BitonicSortAVX2) { bitonic_sort_test(V); } 81 | #endif 82 | 83 | #ifdef VXSORT_TEST_AVX2_I32 84 | TEST_P(BitonicMachineAVX2_i32, BitonicSortAVX2Asc) { bitonic_machine_sort_test(V); } 85 | TEST_P(BitonicAVX2_i32, BitonicSortAVX2) { bitonic_sort_test(V); } 86 | #endif 87 | 88 | #ifdef VXSORT_TEST_AVX2_I64 89 | TEST_P(BitonicMachineAVX2_i64, BitonicSortAVX2Asc) { bitonic_machine_sort_test(V); } 90 | TEST_P(BitonicAVX2_i64, BitonicSortAVX2) { bitonic_sort_test(V); } 91 | #endif 92 | #ifdef VXSORT_TEST_AVX2_U16 93 | TEST_P(BitonicMachineAVX2_u16, BitonicSortAVX2Asc) { bitonic_machine_sort_test(V); } 94 | TEST_P(BitonicAVX2_u16, BitonicSortAVX2) { bitonic_sort_test(V); } 95 | #endif 96 | 97 | #ifdef VXSORT_TEST_AVX2_U32 98 | TEST_P(BitonicMachineAVX2_u32, BitonicSortAVX2Asc) { bitonic_machine_sort_test(V); } 99 | TEST_P(BitonicAVX2_u32, BitonicSortAVX2) { bitonic_sort_test(V); } 100 | #endif 101 | 102 | #ifdef VXSORT_TEST_AVX2_U64 103 | TEST_P(BitonicMachineAVX2_u64, BitonicSortAVX2Asc) { bitonic_machine_sort_test(V); } 104 | TEST_P(BitonicAVX2_u64, BitonicSortAVX2) { bitonic_sort_test(V); } 105 | #endif 106 | 107 | #ifdef VXSORT_TEST_AVX2_F32 108 | TEST_P(BitonicMachineAVX2_f32, BitonicSortAVX2Asc) { bitonic_machine_sort_test(V); } 109 | TEST_P(BitonicAVX2_f32, BitonicSortAVX2) { bitonic_sort_test(V); } 110 | #endif 111 | 112 | #ifdef VXSORT_TEST_AVX2_F64 113 | TEST_P(BitonicMachineAVX2_f64, BitonicSortAVX2Asc) { bitonic_machine_sort_test(V); } 114 | TEST_P(BitonicAVX2_f64, BitonicSortAVX2) { bitonic_sort_test(V); } 115 | #endif 116 | 117 | //TEST_P(BitonicMachineAVX2_i32, BitonicSortAVX2Desc) { bitonic_machine_sort_test(V); } 118 | //TEST_P(BitonicMachineAVX2_u32, BitonicSortAVX2Desc) { bitonic_machine_sort_test(V); } 119 | //TEST_P(BitonicMachineAVX2_i64, BitonicSortAVX2Desc) { bitonic_machine_sort_test(V); } 120 | //TEST_P(BitonicMachineAVX2_u64, BitonicSortAVX2Desc) { bitonic_machine_sort_test(V); } 121 | //TEST_P(BitonicMachineAVX2_f32, BitonicSortAVX2Desc) { bitonic_machine_sort_test(V); } 122 | //TEST_P(BitonicMachineAVX2_f64, BitonicSortAVX2Desc) { bitonic_machine_sort_test(V); } 123 | 124 | } 125 | #include "vxsort_targets_disable.h" 126 | -------------------------------------------------------------------------------- /tests/smallsort/smallsort.avx512.cpp: -------------------------------------------------------------------------------- 1 | #include "vxsort_targets_enable_avx512.h" 2 | 3 | #include "gtest/gtest.h" 4 | 5 | #include 6 | 7 | #include "smallsort_test.h" 8 | #include "../sort_fixtures.h" 9 | 10 | namespace vxsort_tests { 11 | using namespace vxsort::types; 12 | using testing::Types; 13 | 14 | using VM = vxsort::vector_machine; 15 | 16 | auto bitonic_machine_allvalues_avx512_16 = ValuesIn(range(32, 128, 32)); 17 | auto bitonic_machine_allvalues_avx512_32 = ValuesIn(range(16, 64, 16)); 18 | auto bitonic_machine_allvalues_avx512_64 = 
ValuesIn(range(8, 32, 8)); 19 | auto bitonic_allvalues_avx512_16 = ValuesIn(range(1, 8192, 1)); 20 | auto bitonic_allvalues_avx512_32 = ValuesIn(range(1, 4096, 1)); 21 | auto bitonic_allvalues_avx512_64 = ValuesIn(range(1, 2048, 1)); 22 | 23 | #ifdef VXSORT_TEST_AVX512_I16 24 | struct BitonicMachineAVX512_i16 : public SortFixture {}; 25 | struct BitonicAVX512_i16 : public SortFixture {}; 26 | INSTANTIATE_TEST_SUITE_P(BitonicMachineAVX512, BitonicMachineAVX512_i16, bitonic_machine_allvalues_avx512_16, PrintValue()); 27 | INSTANTIATE_TEST_SUITE_P(BitonicAVX512, BitonicAVX512_i16, bitonic_allvalues_avx512_16, PrintValue()); 28 | #endif 29 | 30 | #ifdef VXSORT_TEST_AVX512_I32 31 | struct BitonicMachineAVX512_i32 : public SortFixture {}; 32 | struct BitonicAVX512_i32 : public SortFixture {}; 33 | INSTANTIATE_TEST_SUITE_P(BitonicMachineAVX512, BitonicMachineAVX512_i32, bitonic_machine_allvalues_avx512_32, PrintValue()); 34 | INSTANTIATE_TEST_SUITE_P(BitonicAVX512, BitonicAVX512_i32, bitonic_allvalues_avx512_32, PrintValue()); 35 | #endif 36 | 37 | #ifdef VXSORT_TEST_AVX512_I64 38 | struct BitonicMachineAVX512_i64 : public SortFixture {}; 39 | struct BitonicAVX512_i64 : public SortFixture {}; 40 | INSTANTIATE_TEST_SUITE_P(BitonicMachineAVX512, BitonicMachineAVX512_i64, bitonic_machine_allvalues_avx512_64, PrintValue()); 41 | INSTANTIATE_TEST_SUITE_P(BitonicAVX512, BitonicAVX512_i64, bitonic_allvalues_avx512_64, PrintValue()); 42 | #endif 43 | 44 | #ifdef VXSORT_TEST_AVX512_U16 45 | struct BitonicMachineAVX512_u16 : public SortFixture {}; 46 | struct BitonicAVX512_u16 : public SortFixture {}; 47 | INSTANTIATE_TEST_SUITE_P(BitonicMachineAVX512, BitonicMachineAVX512_u16, bitonic_machine_allvalues_avx512_16, PrintValue()); 48 | INSTANTIATE_TEST_SUITE_P(BitonicAVX512, BitonicAVX512_u16, bitonic_allvalues_avx512_16, PrintValue()); 49 | #endif 50 | 51 | #ifdef VXSORT_TEST_AVX512_U32 52 | struct BitonicMachineAVX512_u32 : public SortFixture {}; 53 | struct BitonicAVX512_u32 : public SortFixture {}; 54 | INSTANTIATE_TEST_SUITE_P(BitonicMachineAVX512, BitonicMachineAVX512_u32, bitonic_machine_allvalues_avx512_32, PrintValue()); 55 | INSTANTIATE_TEST_SUITE_P(BitonicAVX512, BitonicAVX512_u32, bitonic_allvalues_avx512_32, PrintValue()); 56 | #endif 57 | 58 | #ifdef VXSORT_TEST_AVX512_U64 59 | struct BitonicMachineAVX512_u64 : public SortFixture {}; 60 | struct BitonicAVX512_u64 : public SortFixture {}; 61 | INSTANTIATE_TEST_SUITE_P(BitonicMachineAVX512, BitonicMachineAVX512_u64, bitonic_machine_allvalues_avx512_64, PrintValue()); 62 | INSTANTIATE_TEST_SUITE_P(BitonicAVX512, BitonicAVX512_u64, bitonic_allvalues_avx512_64, PrintValue()); 63 | #endif 64 | 65 | #ifdef VXSORT_TEST_AVX512_F32 66 | struct BitonicMachineAVX512_f32 : public SortFixture {}; 67 | struct BitonicAVX512_f32 : public SortFixture {}; 68 | INSTANTIATE_TEST_SUITE_P(BitonicMachineAVX512, BitonicMachineAVX512_f32, bitonic_machine_allvalues_avx512_32, PrintValue()); 69 | INSTANTIATE_TEST_SUITE_P(BitonicAVX512, BitonicAVX512_f32, bitonic_allvalues_avx512_32, PrintValue()); 70 | #endif 71 | 72 | #ifdef VXSORT_TEST_AVX512_F64 73 | struct BitonicMachineAVX512_f64 : public SortFixture {}; 74 | struct BitonicAVX512_f64 : public SortFixture {}; 75 | INSTANTIATE_TEST_SUITE_P(BitonicMachineAVX512, BitonicMachineAVX512_f64, bitonic_machine_allvalues_avx512_64, PrintValue()); 76 | INSTANTIATE_TEST_SUITE_P(BitonicAVX512, BitonicAVX512_f64, bitonic_allvalues_avx512_64, PrintValue()); 77 | #endif 78 | 79 | 80 | #ifdef VXSORT_TEST_AVX512_I16 81 | 
TEST_P(BitonicMachineAVX512_i16, BitonicSortAVX512Asc) { bitonic_machine_sort_test(V); } 82 | TEST_P(BitonicAVX512_i16, BitonicSortAVX512) { bitonic_sort_test(V); } 83 | #endif 84 | 85 | #ifdef VXSORT_TEST_AVX512_I32 86 | TEST_P(BitonicMachineAVX512_i32, BitonicSortAVX512Asc) { bitonic_machine_sort_test(V); } 87 | TEST_P(BitonicAVX512_i32, BitonicSortAVX512) { bitonic_sort_test(V); } 88 | #endif 89 | 90 | #ifdef VXSORT_TEST_AVX512_I64 91 | TEST_P(BitonicMachineAVX512_i64, BitonicSortAVX512Asc) { bitonic_machine_sort_test(V); } 92 | TEST_P(BitonicAVX512_i64, BitonicSortAVX512) { bitonic_sort_test(V); } 93 | #endif 94 | 95 | #ifdef VXSORT_TEST_AVX512_U16 96 | TEST_P(BitonicMachineAVX512_u16, BitonicSortAVX512Asc) { bitonic_machine_sort_test(V); } 97 | TEST_P(BitonicAVX512_u16, BitonicSortAVX512) { bitonic_sort_test(V); } 98 | #endif 99 | 100 | #ifdef VXSORT_TEST_AVX512_U32 101 | TEST_P(BitonicMachineAVX512_u32, BitonicSortAVX512Asc) { bitonic_machine_sort_test(V); } 102 | TEST_P(BitonicAVX512_u32, BitonicSortAVX512) { bitonic_sort_test(V); } 103 | #endif 104 | 105 | #ifdef VXSORT_TEST_AVX512_U64 106 | TEST_P(BitonicMachineAVX512_u64, BitonicSortAVX512Asc) { bitonic_machine_sort_test(V); } 107 | TEST_P(BitonicAVX512_u64, BitonicSortAVX512) { bitonic_sort_test(V); } 108 | #endif 109 | 110 | #ifdef VXSORT_TEST_AVX512_F32 111 | TEST_P(BitonicMachineAVX512_f32, BitonicSortAVX512Asc) { bitonic_machine_sort_test(V); } 112 | TEST_P(BitonicAVX512_f32, BitonicSortAVX512) { bitonic_sort_test(V); } 113 | #endif 114 | 115 | #ifdef VXSORT_TEST_AVX512_F64 116 | TEST_P(BitonicMachineAVX512_f64, BitonicSortAVX512Asc) { bitonic_machine_sort_test(V); } 117 | TEST_P(BitonicAVX512_f64, BitonicSortAVX512) { bitonic_sort_test(V); } 118 | #endif 119 | 120 | //TEST_P(BitonicMachineAVX512_i32, BitonicSortAVX2Desc) { bitonic_machine_sort_test(V); } 121 | //TEST_P(BitonicMachineAVX512_u32, BitonicSortAVX2Desc) { bitonic_machine_sort_test(V); } 122 | //TEST_P(BitonicMachineAVX512_f32, BitonicSortAVX2Desc) { bitonic_machine_sort_test(V); } 123 | //TEST_P(BitonicMachineAVX512_i64, BitonicSortAVX2Desc) { bitonic_machine_sort_test(V); } 124 | //TEST_P(BitonicMachineAVX512_u64, BitonicSortAVX2Desc) { bitonic_machine_sort_test(V); } 125 | //TEST_P(BitonicMachineAVX512_f64, BitonicSortAVX2Desc) { bitonic_machine_sort_test(V); } 126 | } 127 | 128 | #include "vxsort_targets_disable.h" 129 | -------------------------------------------------------------------------------- /tests/smallsort/smallsort_test.h: -------------------------------------------------------------------------------- 1 | #ifndef VXSORT_SMALLSORT_TEST_H 2 | #define VXSORT_SMALLSORT_TEST_H 3 | 4 | #include 5 | 6 | #include "gtest/gtest.h" 7 | #include "../sort_fixtures.h" 8 | 9 | #include "../test_isa.h" 10 | #include "smallsort/bitonic_sort.h" 11 | #include "fmt/format.h" 12 | 13 | namespace vxsort_tests { 14 | 15 | using vxsort::vector_machine; 16 | 17 | template 18 | void bitonic_machine_sort_test(std::vector& V) { 19 | VXSORT_TEST_ISA(); 20 | 21 | using BM = vxsort::smallsort::bitonic_machine; 22 | 23 | auto v_copy = std::vector(V); 24 | auto begin = V.data(); 25 | auto size = V.size(); 26 | 27 | if (ascending) 28 | BM::sort_full_vectors_ascending(begin, size); 29 | else 30 | BM::sort_full_vectors_descending(begin, size); 31 | 32 | std::sort(v_copy.begin(), v_copy.end()); 33 | for (usize i = 0; i < size; ++i) { 34 | if (v_copy[i] != V[i]) { 35 | GTEST_FAIL() << fmt::format("value at idx #{} {} != {}", i, v_copy[i], V[i]); 36 | } 37 | } 38 | } 39 | 40 | 
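// The two helpers in this header cover different scopes: bitonic_machine above
// only sorts buffers that are an exact multiple of the vector width (hence its
// "full vectors" entry points), while bitonic below handles any small size.
// A minimal usage sketch for the latter, assuming AVX2 and i64; the 12-element
// array is made up for illustration:
//
//   i64 data[12] = {9, 3, 7, 1, 12, 5, 11, 2, 10, 4, 8, 6};
//   vxsort::smallsort::bitonic<i64, vxsort::vector_machine::AVX2>::sort(data, 12);
//   // data is now 1..12, ascending.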
template 41 | void bitonic_sort_test(std::vector& V) { 42 | VXSORT_TEST_ISA(); 43 | 44 | auto v_copy = std::vector(V); 45 | auto begin = V.data(); 46 | auto size = V.size(); 47 | 48 | vxsort::smallsort::bitonic::sort(begin, size); 49 | std::sort(v_copy.begin(), v_copy.end()); 50 | for (usize i = 0; i < size; ++i) { 51 | if (v_copy[i] != V[i]) { 52 | GTEST_FAIL() << fmt::format("value at idx #{} {} != {}", i, v_copy[i], V[i]); 53 | } 54 | } 55 | } 56 | } 57 | 58 | #endif // VXSORT_SMALLSORT_TEST_H 59 | -------------------------------------------------------------------------------- /tests/sort_fixtures.h: -------------------------------------------------------------------------------- 1 | #ifndef VXSORT_SORT_FIXTURES_H 2 | #define VXSORT_SORT_FIXTURES_H 3 | 4 | #include "gtest/gtest.h" 5 | #include "stats/vxsort_stats.h" 6 | #include "util.h" 7 | 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | 14 | namespace vxsort_tests { 15 | using namespace vxsort::types; 16 | using testing::ValuesIn; 17 | using testing::Types; 18 | 19 | template 20 | struct SortFixture : public testing::TestWithParam { 21 | protected: 22 | std::vector V; 23 | 24 | public: 25 | virtual void SetUp() { 26 | V = std::vector(GetParam()); 27 | generate_unique_values_vec(V, (T)0x1000, (T)0x1); 28 | } 29 | virtual void TearDown() { 30 | } 31 | }; 32 | 33 | struct PrintValue { 34 | template 35 | std::string operator()(const testing::TestParamInfo& info) const { 36 | auto v = static_cast(info.param); 37 | return std::to_string(v); 38 | } 39 | }; 40 | 41 | template 42 | struct SizeAndSlack { 43 | public: 44 | usize Size; 45 | i32 Slack; 46 | T FirstValue; 47 | T ValueStride; 48 | bool Randomize; 49 | 50 | SizeAndSlack(size_t size, int slack, T first_value, T value_stride, bool randomize) 51 | : Size(size), Slack(slack), FirstValue(first_value), ValueStride(value_stride), Randomize(randomize) {} 52 | 53 | /** 54 | * Generate sorting problems "descriptions" 55 | * @param start 56 | * @param stop 57 | * @param step 58 | * @param slack 59 | * @param first_value - the smallest value in each test array 60 | * @param value_stride - the minimal jump between array elements 61 | * @param randomize - should the problem array contents be randomized, defaults to true 62 | * @return 63 | */ 64 | static std::vector generate(size_t start, size_t stop, size_t step, int slack, T first_value, T value_stride, bool randomize = true) { 65 | if (step == 0) { 66 | throw std::invalid_argument("step for range must be non-zero"); 67 | } 68 | 69 | std::vector result; 70 | size_t i = start; 71 | while ((step > 0) ? 
(i <= stop) : (i > stop)) { 72 | for (auto j : range(-slack, slack, 1)) { 73 | if ((i64)i + j <= 0) 74 | continue; 75 | result.push_back(SizeAndSlack(i, j, first_value, value_stride, randomize)); 76 | } 77 | i *= step; 78 | } 79 | return result; 80 | } 81 | }; 82 | 83 | template 84 | struct SortWithSlackFixture : public testing::TestWithParam> { 85 | protected: 86 | std::vector V; 87 | 88 | public: 89 | virtual void SetUp() { 90 | testing::TestWithParam>::SetUp(); 91 | auto p = this->GetParam(); 92 | V = std::vector(p.Size + p.Slack); 93 | generate_unique_values_vec(V, p.FirstValue, p.ValueStride, p.Randomize); 94 | } 95 | virtual void TearDown() { 96 | #ifdef VXSORT_STATS 97 | vxsort::print_all_stats(); 98 | vxsort::reset_all_stats(); 99 | #endif 100 | } 101 | }; 102 | 103 | template 104 | struct PrintSizeAndSlack { 105 | std::string operator()(const testing::TestParamInfo>& info) const { 106 | return std::to_string(info.param.Size + info.param.Slack); 107 | } 108 | }; 109 | 110 | template 111 | struct SizeAndStride { 112 | public: 113 | usize Size; 114 | T FirstValue; 115 | T ValueStride; 116 | bool Randomize; 117 | 118 | SizeAndStride(size_t size, T first_value, T value_stride, bool randomize) 119 | : Size(size), FirstValue(first_value), ValueStride(value_stride), Randomize(randomize) {} 120 | 121 | static std::vector generate(size_t size, T stride_start, T stride_stop, T first_value, bool randomize = true) { 122 | std::vector result; 123 | for (auto j : multiply_range(stride_start, stride_stop, 2)) { 124 | result.push_back(SizeAndStride(size, first_value, j, randomize)); 125 | } 126 | return result; 127 | } 128 | }; 129 | 130 | template 131 | struct SortWithStrideFixture : public testing::TestWithParam> { 132 | protected: 133 | std::vector V; 134 | T MinValue; 135 | T MaxValue; 136 | 137 | public: 138 | virtual void SetUp() { 139 | testing::TestWithParam>::SetUp(); 140 | auto p = this->GetParam(); 141 | V = std::vector(p.Size); 142 | generate_unique_values_vec(V, p.FirstValue, p.ValueStride, p.Randomize); 143 | MinValue = p.FirstValue; 144 | MaxValue = MinValue + p.Size * p.ValueStride; 145 | if (MinValue > MaxValue) 146 | throw std::invalid_argument("stride is generating an overflow"); 147 | } 148 | virtual void TearDown() { 149 | #ifdef VXSORT_STATS 150 | vxsort::print_all_stats(); 151 | vxsort::reset_all_stats(); 152 | #endif 153 | } 154 | }; 155 | 156 | template 157 | struct PrintSizeAndStride { 158 | std::string operator()(const testing::TestParamInfo>& info) const { 159 | return std::to_string(info.param.ValueStride); 160 | } 161 | }; 162 | } 163 | 164 | #endif // VXSORT_SORT_FIXTURES_H 165 | -------------------------------------------------------------------------------- /tests/test_isa.h: -------------------------------------------------------------------------------- 1 | #ifndef VXSORT_TEST_ISA_H 2 | #define VXSORT_TEST_ISA_H 3 | 4 | #include "isa_detection.h" 5 | 6 | #define VXSORT_TEST_ISA() \ 7 | if (!::vxsort::supports_vector_machine(sizeof(T))) { \ 8 | GTEST_SKIP_("Current CPU does not support the minimal features for this test"); \ 9 | return; \ 10 | } 11 | 12 | #endif //VXSORT_TEST_ISA_H 13 | -------------------------------------------------------------------------------- /tests/util.h: -------------------------------------------------------------------------------- 1 | #ifndef VXSORT_TEST_UTIL_H 2 | #define VXSORT_TEST_UTIL_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | template 10 | void generate_unique_values_vec(std::vector& vec, T start, T 
stride = 0x1, bool randomize = true) { 11 | for (size_t i = 0; i < vec.size(); i++) { 12 | vec[i] = start; 13 | start += stride; 14 | } 15 | 16 | if (!randomize) 17 | return; 18 | 19 | std::random_device rd; 20 | // std::mt19937 g(rd()); 21 | std::mt19937 g(666); 22 | 23 | std::shuffle(vec.begin(), vec.end(), g); 24 | } 25 | 26 | template <typename IntType> 27 | std::vector<IntType> range(IntType start, IntType stop, IntType step) { 28 | if (step == IntType(0)) { 29 | throw std::invalid_argument("step for range must be non-zero"); 30 | } 31 | 32 | std::vector<IntType> result; 33 | IntType i = start; 34 | while ((step > 0) ? (i <= stop) : (i > stop)) { 35 | result.push_back(i); 36 | i += step; 37 | } 38 | 39 | return result; 40 | } 41 | 42 | template <typename IntType> 43 | std::vector<IntType> multiply_range(IntType start, IntType stop, IntType step) { 44 | if (step <= IntType(1)) { 45 | throw std::invalid_argument("step for multiply_range must be greater than one"); 46 | } 47 | 48 | std::vector<IntType> result; 49 | IntType i = start; 50 | while (i <= stop) { 51 | result.push_back(i); 52 | i *= step; 53 | } 54 | 55 | return result; 56 | } 57 | 58 | #endif 59 | -------------------------------------------------------------------------------- /vxsort/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set(TARGET_NAME ${CMAKE_PROJECT_NAME}_lib) 2 | 3 | set(lib_SRC 4 | isa_detection_sane.cpp 5 | isa_detection.cpp 6 | stats/vxsort_stats.cpp 7 | ) 8 | 9 | set(lib_HEADERS 10 | isa_detection.h 11 | alignment.h 12 | compiler.h 13 | defs.h 14 | pack_machine.h 15 | vxsort.h 16 | stats/vxsort_stats.h 17 | vector_machine/machine_traits.h 18 | vxsort_targets_disable.h 19 | partition_machine.h 20 | vxsort.avx2.h 21 | vxsort.avx512.h 22 | partition_machine.avx2.h 23 | partition_machine.avx512.h 24 | smallsort/bitonic_sort.avx2.h 25 | smallsort/bitonic_sort.avx512.h 26 | ) 27 | 28 | if (${PROCESSOR_IS_X86}) 29 | file(GLOB_RECURSE avx2_SRC vector_machine/avx2/*.cpp smallsort/avx2/*.cpp) 30 | file(GLOB_RECURSE avx2_HEADERS vector_machine/avx2/*.h smallsort/avx2/*.h) 31 | file(GLOB_RECURSE avx512_SRC vector_machine/avx512/*.cpp smallsort/avx512/*.cpp) 32 | file(GLOB_RECURSE avx512_HEADERS vector_machine/avx512/*.h smallsort/avx512/*.h) 33 | 34 | list(APPEND lib_HEADERS 35 | vector_machine/machine_traits.avx2.h 36 | vector_machine/machine_traits.avx512.h 37 | ${avx2_HEADERS} 38 | ${avx512_HEADERS} 39 | ) 40 | 41 | list(APPEND lib_SRC 42 | ${avx2_SRC} 43 | ${avx512_SRC} 44 | ) 45 | 46 | endif() 47 | 48 | if (${PROCESSOR_IS_ARM} OR ${PROCESSOR_IS_AARCH64}) 49 | file(GLOB_RECURSE neon_SRC vector_machine/neon/*.cpp smallsort/neon/*.cpp) 50 | file(GLOB_RECURSE neon_HEADERS vector_machine/neon/*.h smallsort/neon/*.h) 51 | 52 | list(APPEND lib_HEADERS ${neon_HEADERS}) 53 | list(APPEND lib_SRC ${neon_SRC}) 54 | endif() 55 | 56 | add_library(${TARGET_NAME} STATIC ${lib_SRC} ${lib_HEADERS}) 57 | set_target_properties(${TARGET_NAME} PROPERTIES PREFIX lib) 58 | set_target_properties(${TARGET_NAME} PROPERTIES OUTPUT_NAME ${CMAKE_PROJECT_NAME}) 59 | target_link_libraries(${TARGET_NAME} 60 | cpu_features 61 | fmt 62 | ) 63 | 64 | target_include_directories(${TARGET_NAME} PUBLIC .)
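# Illustrative consumer usage (hypothetical target name, not part of this file's
# build logic): a tool in this project would link the static library defined above
# and inherit its PUBLIC include directory, so #include "vxsort.h" resolves without
# extra include paths:
#
#   add_executable(my_tool my_tool.cpp)
#   target_link_libraries(my_tool ${CMAKE_PROJECT_NAME}_lib)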
65 | #set_property(TARGET ${TARGET_NAME} PROPERTY INTERPROCEDURAL_OPTIMIZATION True) 66 | 67 | -------------------------------------------------------------------------------- /vxsort/alignment.h: -------------------------------------------------------------------------------- 1 | #ifndef VXSORT_ALIGNNMENT_H 2 | #define VXSORT_ALIGNNMENT_H 3 | 4 | #include <cassert> 5 | #include "vector_machine/machine_traits.h" 6 | #include "defs.h" 7 | 8 | namespace vxsort { 9 | using namespace vxsort::types; 10 | 11 | using namespace std; 12 | 13 | /// Perform vector-sized alignment of array boundary reads (beginning/end) 14 | /// \tparam T the primitive type being aligned 15 | /// \tparam M the vector_machine being used (e.g. determines the vector width in bytes) 16 | template <typename T, vector_machine M> 17 | struct alignment_hint { 18 | using VMT = vxsort_machine_traits<T, M>; 19 | static constexpr i32 N = VMT::N; 20 | static constexpr usize ALIGN = sizeof(typename VMT::TV); 21 | public: 22 | static const size_t ALIGN_MASK = ALIGN - 1; 23 | static const i8 REALIGN = 0x66; 24 | static_assert(REALIGN > ALIGN, "REALIGN must be larger than ALIGN"); 25 | 26 | alignment_hint() : left_masked_amount(REALIGN), right_unmasked_amount(REALIGN) {} 27 | alignment_hint clear_left() { 28 | alignment_hint copy = *this; 29 | copy.left_masked_amount = REALIGN; 30 | return copy; 31 | } 32 | 33 | alignment_hint clear_right() { 34 | alignment_hint copy = *this; 35 | copy.right_unmasked_amount = REALIGN; 36 | return copy; 37 | } 38 | 39 | static bool is_aligned(const void* p) { return (usize)p % ALIGN == 0; } 40 | 41 | /// Perform "left-side"/beginning-of-partition alignment. 42 | /// Given an inclusive pointer to the left-most element/beginning of an array, 43 | /// alignment to the nearest whole vector-sized pointer is performed, updating the 44 | /// internal `left_masked_amount` member with the number of elements to be masked 45 | /// off during a vector read. 46 | /// @param[in] p a pointer to the first element that is desired to be read 47 | void calc_left_alignment(const T *p) { 48 | // Alignment flow: 49 | // * Calculate pre-alignment position on the left 50 | // * convert to a valid input to be used with `generate_suffix_mask` 51 | const auto* pre_aligned_left = reinterpret_cast<const T*>(reinterpret_cast<usize>(p) & ~ALIGN_MASK); 52 | left_masked_amount = p - pre_aligned_left; 53 | assert(left_masked_amount >= 0 && left_masked_amount < N); 54 | assert(is_aligned(pre_aligned_left)); 55 | } 56 | 57 | /// Perform "right-side"/end-of-partition alignment.
Given an exclusive pointer just past 58 | /// the right-most/end-of an array, alignment to the nearest vector sized read 59 | /// is performed, updating the internal `right_unmasked_amount` with the number of unmasked 60 | /// elements to be read 61 | /// @param[in] p a pointer past the last element that is desired to be read 62 | void calc_right_alignment(const T *p) { 63 | // │01234567│01234567 64 | // (1)│xxxxxxxx│xxxp•••• 65 | // (2)│xxxxxxxx│xp•••••• 66 | // (3)│xxxxxxxx│p•••••• 67 | // p -> the parameter 68 | // x -> data to be read 69 | // • -> Masked elements 70 | // right_unmasked_amount should be: 71 | // (1) -> 3 72 | // (2) -> 1 73 | // (3) -> 8 (8/0 are the same in terms of masking) 74 | const auto* pre_aligned_right = reinterpret_cast(reinterpret_cast(p-1) & ~ALIGN_MASK); 75 | right_unmasked_amount = p - pre_aligned_right; 76 | assert(right_unmasked_amount >= 0 && right_unmasked_amount <= N); 77 | assert(is_aligned(pre_aligned_right)); 78 | } 79 | 80 | i32 left_masked_amount : 8; 81 | i32 right_unmasked_amount : 8; 82 | }; 83 | 84 | } // namespace vxsort 85 | #endif // VXSORT_ALIGNNMENT_H 86 | -------------------------------------------------------------------------------- /vxsort/compiler.h: -------------------------------------------------------------------------------- 1 | #ifndef VXSORT_COMPILER_H 2 | #define VXSORT_COMPILER_H 3 | 4 | #ifdef _MSC_VER 5 | #ifdef __clang__ 6 | #define VXSORT_COMPILER_CLANGCL _MSC_VER 7 | #else // real MSVC 8 | #define VXSORT_COMPILER_MSVC _MSC_VER 9 | #endif 10 | #else 11 | #ifdef __GNUC__ 12 | #ifdef __clang__ 13 | #define VXSORT_COMPILER_CLANG (__clang_major__ * 100 + __clang_minor__) 14 | #else 15 | #define VXSORT_COMPILER_GCC (__GNUC__ * 100 + __GNUC_MINOR__) 16 | #endif 17 | #endif 18 | #endif 19 | 20 | #endif // VXSORT_COMPILER_H 21 | -------------------------------------------------------------------------------- /vxsort/defs.h: -------------------------------------------------------------------------------- 1 | #ifndef VXSORT_DEFS_H 2 | #define VXSORT_DEFS_H 3 | 4 | #include "compiler.h" 5 | 6 | #if _MSC_VER 7 | #ifdef _M_X86 8 | #define ARCH_X86 9 | #endif 10 | #ifdef _M_X64 11 | #define ARCH_X64 12 | #endif 13 | #ifdef _M_ARM64 14 | #define ARCH_ARM 15 | #endif 16 | #else 17 | #ifdef __i386__ 18 | #define ARCH_X86 19 | #endif 20 | #ifdef __amd64__ 21 | #define ARCH_X64 22 | #endif 23 | #ifdef __arm__ 24 | #define ARCH_ARM 25 | #endif 26 | #endif 27 | 28 | #ifdef VXSORT_COMPILER_MSVC 29 | #include 30 | #define mess_up_cmov() _ReadBarrier(); 31 | #define INLINE __forceinline 32 | #define NOINLINE __declspec(noinline) 33 | #else 34 | #define mess_up_cmov() 35 | #define INLINE __attribute__((always_inline)) 36 | #define NOINLINE __attribute__((noinline)) 37 | #endif 38 | 39 | #include 40 | #include 41 | 42 | #ifdef _MSC_VER 43 | #include 44 | typedef SSIZE_T ssize_t; 45 | #endif 46 | 47 | #define RESTRICT __restrict 48 | 49 | namespace vxsort { 50 | 51 | template 52 | constexpr bool always_false = false; 53 | constexpr bool is_powerof2(int v) { 54 | return v && ((v & (v - 1)) == 0); 55 | } 56 | 57 | namespace types { 58 | using i8 = int8_t; 59 | using u8 = uint8_t; 60 | using i16 = int16_t; 61 | using i32 = int32_t; 62 | using i64 = int64_t; 63 | using u16 = uint16_t; 64 | using u32 = uint32_t; 65 | using u64 = uint64_t; 66 | using f32 = float; 67 | using f64 = double; 68 | using isize = ssize_t; 69 | using usize = size_t; 70 | } 71 | 72 | } // namespace vxsort 73 | 74 | #endif // VXSORT_DEFS_H 75 | 
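// Illustrative usage of the type aliases and helpers above (a sketch, not part of
// the library):
//
//   using namespace vxsort::types;
//   static_assert(vxsort::is_powerof2(sizeof(u64)), "8 is a power of 2");
//   constexpr usize VEC_BYTES = 32;                 // e.g. one AVX2 vector
//   constexpr i32 LANES = VEC_BYTES / sizeof(i32);  // 8 x 32-bit lanes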
-------------------------------------------------------------------------------- /vxsort/isa_detection.cpp: -------------------------------------------------------------------------------- 1 | #include "isa_detection.h" 2 | 3 | bool __isa_detection_performed = vxsort::init_isa_detection(); 4 | -------------------------------------------------------------------------------- /vxsort/isa_detection.h: -------------------------------------------------------------------------------- 1 | #ifndef VXSORT_ISA_DETECTION_H 2 | #define VXSORT_ISA_DETECTION_H 3 | 4 | #include "vector_machine/machine_traits.h" 5 | #include "cpu_features_macros.h" 6 | 7 | namespace vxsort { 8 | 9 | extern bool init_isa_detection(); 10 | extern bool supports_vector_machine(vector_machine m); 11 | 12 | template <vector_machine M> 13 | bool supports_vector_machine(usize width); 14 | 15 | } // namespace vxsort 16 | 17 | #endif // VXSORT_ISA_DETECTION_H 18 | -------------------------------------------------------------------------------- /vxsort/isa_detection_sane.cpp: -------------------------------------------------------------------------------- 1 | #include "isa_detection.h" 2 | 3 | #if defined(CPU_FEATURES_ARCH_X86) 4 | #include "cpuinfo_x86.h" 5 | using namespace cpu_features; 6 | static const X86Features features = GetX86Info().features; 7 | static const bool has_avx2 = CPU_FEATURES_COMPILED_X86_AVX2 || (features.avx2 && features.avx && features.popcnt && features.bmi2); 8 | static const bool has_avx512_32_64 = features.avx512f && features.avx512dq && features.avx512bw && features.popcnt; 9 | static const bool has_avx512_16 = has_avx512_32_64 && features.avx512vbmi2; 10 | //static const bool has_avx512_16_fp16 = has_avx512_16 && features.avx512_fp16; 11 | #elif defined(CPU_FEATURES_ARCH_ARM) 12 | #include "cpuinfo_arm.h" 13 | using namespace cpu_features; 14 | static const ArmFeatures features = GetArmInfo().features; 15 | static const bool has_neon = CPU_FEATURES_COMPILED_ANY_ARM_NEON || features.neon; 16 | #elif defined(CPU_FEATURES_ARCH_AARCH64) 17 | #include "cpuinfo_aarch64.h" 18 | using namespace cpu_features; 19 | static const Aarch64Features features = GetAarch64Info().features; 20 | static const bool has_neon = CPU_FEATURES_COMPILED_ANY_ARM_NEON || features.asimd; 21 | static const bool has_sve = features.sve; 22 | 23 | #elif defined(CPU_FEATURES_ARCH_MIPS) 24 | #include "cpuinfo_mips.h" 25 | #elif defined(CPU_FEATURES_ARCH_PPC) 26 | #include "cpuinfo_ppc.h" 27 | #endif 28 | 29 | namespace vxsort { 30 | 31 | bool init_isa_detection() { 32 | return true; 33 | } 34 | 35 | extern bool supports_vector_machine(vector_machine m) 36 | { 37 | switch (m) { 38 | case NONE: 39 | return true; 40 | #if defined(CPU_FEATURES_ARCH_X86) 41 | case AVX2: 42 | return has_avx2; 43 | case AVX512: 44 | return has_avx512_32_64; 45 | #endif 46 | #if defined(CPU_FEATURES_ARCH_ANY_ARM) 47 | case NEON: 48 | return has_neon; 49 | #endif 50 | #if defined(CPU_FEATURES_ARCH_AARCH64) 51 | case SVE: 52 | return has_sve; 53 | #endif 54 | default: 55 | break; 56 | } 57 | return false; 58 | } 59 | 60 | template<> 61 | bool supports_vector_machine<vector_machine::AVX2>(usize) { 62 | return has_avx2; 63 | } 64 | 65 | template<> 66 | bool supports_vector_machine<vector_machine::AVX512>(usize width) { 67 | switch (width) { 68 | case 2: 69 | // We require AVX512VBMI2 for 16-bit partitioning 70 | // since we use the _mm512_mask_compressstoreu_epi16 intrinsic 71 | return has_avx512_16; 72 | case 4: 73 | case 8: 74 | return has_avx512_32_64; 75 |
default: 76 | break; 77 | } 78 | return false; 79 | } 80 | } // namespace vxsort 81 | -------------------------------------------------------------------------------- /vxsort/pack_machine.h: -------------------------------------------------------------------------------- 1 | #ifndef VXSORT_PACK_MACHINE_H 2 | #define VXSORT_PACK_MACHINE_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include "defs.h" 9 | #include "alignment.h" 10 | #include "vector_machine/machine_traits.h" 11 | 12 | #include 13 | #include 14 | 15 | namespace vxsort { 16 | 17 | template 18 | class pack_machine { 19 | static_assert(Shift <= 31, "Shift must be in the range 0..31"); 20 | 21 | using VMT = vxsort_machine_traits; 22 | typedef typename VMT::TV TV; 23 | static const i32 N = sizeof(TV) / sizeof(T); 24 | typedef alignment_hint AH; 25 | 26 | public: 27 | 28 | /// pack the provided vectors into a lower bit-width type after offestting them by a known base value 29 | /// \param[in] u1 a vector containing the first half of the input 30 | /// \param[in] u2 a vector containing the second half of the input 31 | /// \param[in] offset_v a vector containing the base value to use for offsetting each element before packing 32 | /// \return a vector containing the packed values after readjusting them to the supplied base value 33 | static INLINE TV prepare_offset(T min_value) 34 | { 35 | // Create a vectorized version of the offset by which we need to 36 | // correct the data before packing it 37 | auto constexpr MIN = T(std::numeric_limits::min()); 38 | auto offset = VMT::template shift_n_sub(min_value, MIN); 39 | return VMT::broadcast(offset); 40 | } 41 | 42 | static INLINE TV pack_vectors(TV u1, TV u2, const TV offset_v) { 43 | // This is statically compiled in/out 44 | if (Shift > 0) { 45 | u1 = VMT::shift_right(u1, Shift); 46 | u2 = VMT::shift_right(u2, Shift); 47 | } 48 | u1 = VMT::sub(u1, offset_v); 49 | u2 = VMT::sub(u2, offset_v); 50 | 51 | return VMT::pack_unordered(u1, u2); 52 | } 53 | 54 | /// unpack the provided vector into two higher bit-width vectors, then offset them by a known base value 55 | /// \param[in] offset_v a vector containing the base value to use for offsetting each element after unpacking 56 | /// \param[in] p a vector containing the packed input values 57 | /// \param[out] u1 a vector containing the first half of the output 58 | /// \param[out] u2 a vector containing the second half of the output 59 | static INLINE void unpack_vectors(const TV offset_v, TV p, TV& u1, TV& u2) { 60 | VMT::unpack_ordered(p, u1, u2); 61 | 62 | u1 = VMT::add(u1, offset_v); 63 | u2 = VMT::add(u2, offset_v); 64 | 65 | if (Shift > 0) { // This is statically compiled in/out 66 | u1 = VMT::shift_left(u1, Shift); 67 | u2 = VMT::shift_left(u2, Shift); 68 | } 69 | } 70 | }; 71 | 72 | } // namespace vxsort 73 | 74 | #endif // VXSORT_PACK_MACHINE_H 75 | -------------------------------------------------------------------------------- /vxsort/partition_machine.h: -------------------------------------------------------------------------------- 1 | #ifndef VXSORT_PARTITION_MACHINE_H 2 | #define VXSORT_PARTITION_MACHINE_H 3 | 4 | #include 5 | #include 6 | #include 7 | 8 | #include "defs.h" 9 | #include "vector_machine/machine_traits.h" 10 | 11 | 12 | #ifdef VXSORT_STATS 13 | #include "stats/vxsort_stats.h" 14 | #endif 15 | 16 | 17 | namespace vxsort { 18 | using namespace std; 19 | using namespace vxsort::types; 20 | 21 | template 22 | struct partition_machine { 23 | using VMT = vxsort_machine_traits; 24 | typedef typename 
VMT::TV TV; 25 | public: 26 | 27 | static INLINE void partition_block(TV& data_vec, const TV P, 28 | T* RESTRICT &left, T* RESTRICT &right) { 29 | static_assert(always_false<T>, "must be specialized!"); 30 | } 31 | 32 | 33 | /// Prime the partition "pump" by reading and aligning up to one vector's worth 34 | /// of elements from each side of the input partition. The actual amount of data 35 | /// to be partitioned depends on the next alignment point for future vector reads: 36 | /// by reading an exact amount from each side, by the end of this function, future 37 | /// reads can perform 100% aligned loads, thereby reducing the internal resources 38 | /// consumed by modern HW when dealing with un-aligned, or worse-yet cache-line 39 | /// striped loads. 40 | /// @param[in] left_masked_amount the number of elements, prior to 41 | /// @p read_left, that are to be discarded; 42 | /// a zero (0) value is a special value that denotes that all values 43 | /// are to be used (e.g. 0 discarded) 44 | /// @param[in] right_unmasked_amount the number of elements, prior to 45 | /// @p read_right, that are to be partitioned; 46 | /// a zero (0) value is a special value that denotes that all values 47 | /// are to be used (e.g. 0 discarded) 48 | /// @param[in] P The vector pivot value 49 | /// @param[inout] read_left A reference to the current left-side read-position, 50 | /// modified to the next read-position by the end of this function 51 | /// @param[inout] read_right A reference to the current right-side read-position, 52 | /// modified to the next read-position by the end of this function 53 | /// @param[inout] spill_read_left A reference to the spill-buffer's copy-from left-side 54 | /// read-position. This will reflect the discarded elements 55 | /// by the end of this function 56 | /// @param[inout] spill_write_left A reference to the spill-buffer's left-side write-position. 57 | /// This will reflect the next valid vector-write position by 58 | /// the end of this function. 59 | /// @param[inout] spill_read_right A reference to the spill-buffer's copy-from right-side 60 | /// read-position. This will reflect the discarded elements 61 | /// by the end of this function 62 | /// @param[inout] spill_write_right A reference to the spill-buffer's right-side write-position. 63 | /// This will reflect the next valid vector-write position by 64 | /// the end of this function. 65 | static inline void align_vectorized(const i32 left_masked_amount, const i32 right_unmasked_amount, 66 | const TV P, 67 | T* RESTRICT &read_left, T* RESTRICT &read_right, 68 | T* RESTRICT &spill_read_left, T* RESTRICT &spill_write_left, 69 | T* RESTRICT &spill_read_right, T* RESTRICT &spill_write_right) { 70 | static_assert(always_false<T>, "must be specialized!"); 71 | } 72 | }; 73 | 74 | } // namespace vxsort 75 | 76 | #endif //VXSORT_PARTITION_MACHINE_H 77 | -------------------------------------------------------------------------------- /vxsort/smallsort/avx2/bitonic_machine.avx2.h: -------------------------------------------------------------------------------- 1 | ///////////////////////////////////////////////////////////////////////////// 2 | //// 3 | // This file was auto-generated by a tool at 2022-10-30 08:12:25 4 | // 5 | // It is recommended you DO NOT directly edit this file but instead edit 6 | // the code-generator that generated this source file instead.
7 | ///////////////////////////////////////////////////////////////////////////// 8 | 9 | #include "bitonic_machine.avx2.i16.generated.h" 10 | #include "bitonic_machine.avx2.u16.generated.h" 11 | #include "bitonic_machine.avx2.i32.generated.h" 12 | #include "bitonic_machine.avx2.u32.generated.h" 13 | #include "bitonic_machine.avx2.f32.generated.h" 14 | #include "bitonic_machine.avx2.i64.generated.h" 15 | #include "bitonic_machine.avx2.u64.generated.h" 16 | #include "bitonic_machine.avx2.f64.generated.h" 17 | -------------------------------------------------------------------------------- /vxsort/smallsort/avx512/bitonic_machine.avx512.h: -------------------------------------------------------------------------------- 1 | ///////////////////////////////////////////////////////////////////////////// 2 | //// 3 | // This file was auto-generated by a tool at 2022-10-30 08:12:25 4 | // 5 | // It is recommended you DO NOT directly edit this file but instead edit 6 | // the code-generator that generated this source file instead. 7 | ///////////////////////////////////////////////////////////////////////////// 8 | 9 | #include "bitonic_machine.avx512.i16.generated.h" 10 | #include "bitonic_machine.avx512.u16.generated.h" 11 | #include "bitonic_machine.avx512.i32.generated.h" 12 | #include "bitonic_machine.avx512.u32.generated.h" 13 | #include "bitonic_machine.avx512.f32.generated.h" 14 | #include "bitonic_machine.avx512.i64.generated.h" 15 | #include "bitonic_machine.avx512.u64.generated.h" 16 | #include "bitonic_machine.avx512.f64.generated.h" 17 | -------------------------------------------------------------------------------- /vxsort/smallsort/bitonic_machine.h: -------------------------------------------------------------------------------- 1 | #ifndef VXSORT_BITONIC_MACHINE_H 2 | #define VXSORT_BITONIC_MACHINE_H 3 | 4 | #include 5 | #include "../defs.h" 6 | #include "vector_machine/machine_traits.h" 7 | 8 | namespace vxsort::smallsort { 9 | using namespace std; 10 | 11 | template 12 | struct bitonic_machine { 13 | public: 14 | typedef T TV; 15 | typedef T TMASK; 16 | 17 | static INLINE void sort_04v_ascending(TV& d01, TV& d02, TV& d03, TV& d04); 18 | static INLINE void merge_04v_ascending(TV& d01, TV& d02, TV& d03, TV& d04); 19 | static INLINE void cross_min_max(TV& d01, TV& d02); 20 | static INLINE void strided_min_max(TV& d01, TV& d02); 21 | 22 | static NOINLINE void sort_01v_full_ascending(T *ptr); 23 | static NOINLINE void sort_02v_full_ascending(T *ptr); 24 | static NOINLINE void sort_03v_full_ascending(T *ptr); 25 | static NOINLINE void sort_04v_full_ascending(T *ptr); 26 | static void sort_full_vectors_ascending(T *ptr, usize length); 27 | static void sort_full_vectors_descending(T *ptr, usize length); 28 | }; 29 | } // namespace vxsort 30 | #endif 31 | -------------------------------------------------------------------------------- /vxsort/smallsort/bitonic_sort.avx2.h: -------------------------------------------------------------------------------- 1 | #ifndef VXSORT_BITONIC_SORT_AVX2_H 2 | #define VXSORT_BITONIC_SORT_AVX2_H 3 | 4 | #include "../vector_machine/machine_traits.avx2.h" 5 | #include "avx2/bitonic_machine.avx2.h" 6 | 7 | #include "bitonic_sort.h" 8 | 9 | #endif //VXSORT_BITONIC_SORT_AVX2_H 10 | -------------------------------------------------------------------------------- /vxsort/smallsort/bitonic_sort.avx512.h: -------------------------------------------------------------------------------- 1 | #ifndef VXSORT_BITONIC_SORT_AVX512_H 2 | #define 
VXSORT_BITONIC_SORT_AVX512_H 3 | 4 | #include "../vector_machine/machine_traits.avx512.h" 5 | #include "avx512/bitonic_machine.avx512.h" 6 | 7 | #include "bitonic_sort.h" 8 | 9 | #endif //VXSORT_BITONIC_SORT_AVX512_H 10 | -------------------------------------------------------------------------------- /vxsort/smallsort/codegen/bitonic_gen.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import argparse 3 | import os 4 | from datetime import datetime 5 | from enum import Enum 6 | 7 | from typing import IO 8 | 9 | from avx2 import AVX2BitonicISA 10 | from avx512 import AVX512BitonicISA 11 | from bitonic_isa import BitonicISA 12 | 13 | BitonicISA.register(AVX2BitonicISA) 14 | BitonicISA.register(AVX512BitonicISA) 15 | # NOTE: NeonBitonicISA is referenced below but is not imported or implemented yet; 16 | # requesting the NEON ISA will raise a NameError at runtime. 17 | def get_generator_supported_types(vector_isa): 18 | if isinstance(vector_isa, str): 19 | vector_isa = VectorISA[vector_isa] 20 | if vector_isa == VectorISA.AVX2: 21 | return AVX2BitonicISA.supported_types() 22 | elif vector_isa == VectorISA.AVX512: 23 | return AVX512BitonicISA.supported_types() 24 | elif vector_isa == VectorISA.NEON: 25 | return NeonBitonicISA.supported_types() 26 | else: 27 | raise Exception(f"Non-supported vector machine-type: {vector_isa}") 28 | 29 | 30 | def get_generator(vector_isa, type, f_header: IO): 31 | if isinstance(vector_isa, str): 32 | vector_isa = VectorISA[vector_isa] 33 | if vector_isa == VectorISA.AVX2: 34 | return AVX2BitonicISA(type, f_header) 35 | elif vector_isa == VectorISA.AVX512: 36 | return AVX512BitonicISA(type, f_header) 37 | elif vector_isa == VectorISA.NEON: 38 | return NeonBitonicISA(type, f_header) 39 | else: 40 | raise Exception(f"Non-supported vector machine-type: {vector_isa}") 41 | 42 | 43 | def generate_per_type(f_header: IO, type, vector_isa, break_inline): 44 | g = get_generator(vector_isa, type, f_header) 45 | g.generate_prologue() 46 | g.generate_1v_sorters(ascending=True) 47 | g.generate_1v_sorters(ascending=False) 48 | for width in range(2, g.max_bitonic_sort_vectors() + 1): 49 | # Allow breaking the inline chain once in a while (configurable) 50 | if break_inline == 0 or width % break_inline != 0: 51 | inline = True 52 | else: 53 | inline = False 54 | g.generate_compounded_sorter(width, asc=True, inline=inline) 55 | g.generate_compounded_sorter(width, asc=False, inline=inline) 56 | if width <= g.largest_merge_variant_needed(): 57 | g.generate_compounded_merger(width, asc=True, inline=inline) 58 | g.generate_compounded_merger(width, asc=False, inline=inline) 59 | 60 | 61 | g.generate_cross_min_max() 62 | g.generate_strided_min_max() 63 | 64 | g.generate_entry_points_full_vectors(asc=True) 65 | g.generate_entry_points_full_vectors(asc=False) 66 | g.generate_master_entry_point_full(asc=True) 67 | g.generate_master_entry_point_full(asc=False) 68 | g.generate_epilogue() 69 | 70 | 71 | class Language(Enum): 72 | csharp = 'csharp' 73 | cpp = 'cpp' 74 | rust = 'rust' 75 | 76 | def __str__(self): 77 | return self.value 78 | 79 | 80 | class VectorISA(Enum): 81 | AVX2 = 'avx2' 82 | AVX512 = 'avx512' 83 | NEON = 'neon' 84 | SVE = 'sve' 85 | 86 | def __str__(self): 87 | return self.value 88 | 89 | def autogenerated_blabber(): 90 | return f"""///////////////////////////////////////////////////////////////////////////// 91 | //// 92 | // This file was auto-generated by a tool at {datetime.now().strftime("%F %H:%M:%S")} 93 | // 94 | // It is recommended you DO NOT directly edit this file but instead edit 95 | // the code-generator that generated this source file instead. 96 | /////////////////////////////////////////////////////////////////////////////""" 97 | 98 | def generate_all_types(): 99 | parser = argparse.ArgumentParser() 100 | #parser.add_argument("--language", type=Language, choices=list(Language), 101 | # help="select output language: csharp/cpp/rust") 102 | parser.add_argument("--vector-isa", 103 | nargs='+', 104 | default=['all'], 105 | help='list of vector ISA to generate', 106 | choices=[str(v) for v in VectorISA] + ["all"]) 107 | parser.add_argument("--break-inline", type=int, default=0, help="break inlining every N levels") 108 | 109 | parser.add_argument("--output-dir", type=str, 110 | help="output directory") 111 | 112 | opts = parser.parse_args() 113 | 114 | if 'all' in opts.vector_isa: 115 | opts.vector_isa = [str(v) for v in VectorISA] 116 | 117 | for isa in opts.vector_isa: 118 | headers = [] 119 | for t in get_generator_supported_types(isa): 120 | filename = f"bitonic_machine.{isa.lower()}.{t}.generated" 121 | print(f"Generating {filename}.h") 122 | dirname = os.path.join(opts.output_dir, isa.lower()) 123 | os.makedirs(dirname, exist_ok=True) 124 | headers.append(filename + ".h") 125 | h_filename = os.path.join(dirname, filename + ".h") 126 | with open(h_filename, "w") as f_header: 127 | generate_per_type(f_header, t, isa, opts.break_inline) 128 | 129 | h_master_header = os.path.join(dirname, f"bitonic_machine.{isa.lower()}.h") 130 | with open(h_master_header, "w") as f_header: 131 | print(autogenerated_blabber(), file=f_header) 132 | print("", file=f_header) 133 | f_header.writelines([f"""#include \"{h}\"\n""" for h in headers]) 134 | 135 | if __name__ == '__main__': 136 | generate_all_types() 137 | -------------------------------------------------------------------------------- /vxsort/smallsort/codegen/bitonic_isa.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, ABCMeta, abstractmethod 2 | 3 | from utils import next_power_of_2 4 | 5 | 6 | class BitonicISA(ABC, metaclass=ABCMeta): 7 | 8 | @abstractmethod 9 | def vector_size(self): 10 | pass 11 | 12 | @abstractmethod 13 | def max_bitonic_sort_vectors(self): 14 | pass 15 | 16 | def largest_merge_variant_needed(self): 17 | return next_power_of_2(self.max_bitonic_sort_vectors()) 18 | 19 | 20 | 21 | 22 | 23 | @abstractmethod 24 | def vector_type(self): 25 | pass 26 | 27 | @classmethod 28 | @abstractmethod 29 | def supported_types(cls): 30 | pass 31 | 32 | @abstractmethod 33 | def generate_prologue(self): 34 | pass 35 | 36 | @abstractmethod 37 | def generate_epilogue(self): 38 | pass 39 | 40 | 41 | @abstractmethod 42 | def generate_1v_basic_sorters(self, ascending: bool): 43 | pass 44 | 45 | @abstractmethod 46 | def generate_1v_merge_sorters(self, ascending: bool): 47 | pass 48 | 49 | def generate_1v_sorters(self, ascending: bool): 50 | self.generate_1v_basic_sorters(ascending) 51 | self.generate_1v_merge_sorters(ascending) 52 | 53 | @abstractmethod 54 | def generate_compounded_sorter(self, width: int, asc: bool, inline: bool): 55 | pass 56 | 57 | @abstractmethod 58 | def generate_compounded_merger(self, width: int, asc: bool, inline: bool): 59 | pass 60 | 61 | @abstractmethod 62 | def generate_entry_points_full_vectors(self, asc: bool): 63 | pass 64 | 65 | @abstractmethod 66 | def generate_master_entry_point_full(self, asc: bool): 67 | pass 68 | 69 | @abstractmethod 70 | def generate_cross_min_max(self): 71 | pass 72 | 73 | @abstractmethod 74 | def
generate_strided_min_max(self): 75 | pass -------------------------------------------------------------------------------- /vxsort/smallsort/codegen/utils.py: -------------------------------------------------------------------------------- 1 | native_size_map = { 2 | "i16": 2, 3 | "u16": 2, 4 | "i32": 4, 5 | "u32": 4, 6 | "f32": 4, 7 | "i64": 8, 8 | "u64": 8, 9 | "f64": 8, 10 | } 11 | 12 | 13 | def next_power_of_2(v): 14 | v = v - 1 15 | v |= v >> 1 16 | v |= v >> 2 17 | v |= v >> 4 18 | v |= v >> 8 19 | v |= v >> 16 20 | v = v + 1 21 | return int(v) 22 | -------------------------------------------------------------------------------- /vxsort/stats/vxsort_stats.cpp: -------------------------------------------------------------------------------- 1 | #include "vxsort_stats.h" 2 | #ifdef VXSORT_STATS 3 | 4 | #include 5 | 6 | namespace vxsort { 7 | using namespace vxsort::types; 8 | 9 | i32 vxsort_stats_base::last_type = 0; 10 | std::array vxsort_stats_base::registered_types; 11 | 12 | template 13 | void vxsort_stats::print_stats() 14 | { 15 | fmt::print("{:9} | {:7} | {:8} | {:7} | {:11} | {:12} | {:10} | {:9.2}% | {:>9.2f}\n", 16 | vxsort_type_to_str(typeid_to_vxsort_type()), 17 | _num_sorts, 18 | _total_sort_size, 19 | _num_partitions, 20 | _num_vec_loads, 21 | _num_vec_stores, 22 | _num_small_sorts, 23 | (f64) _small_sorts_size * 100 / (f64) _total_partitioned_size, 24 | (f64) _small_sorts_size / (f64) _num_small_sorts); 25 | } 26 | 27 | extern void print_all_stats() { 28 | fmt::print("type | # sorts | # sorted | # parts | # vec loads | # vec stores | # sm. sort | % sm. sort | avg. sm. sort\n"); 29 | fmt::print("----------|---------|----------|---------|-------------|--------------|------------|------------|--------------\n"); 30 | for (auto i = 0; i < vxsort_stats_base::last_type; ++i) { 31 | switch (vxsort_stats_base::registered_types[i]) { 32 | case vxsort_type::I16: vxsort_stats::print_stats(); break; 33 | case vxsort_type::U16: vxsort_stats::print_stats(); break; 34 | case vxsort_type::I32: vxsort_stats::print_stats(); break; 35 | case vxsort_type::U32: vxsort_stats::print_stats(); break; 36 | case vxsort_type::I64: vxsort_stats::print_stats(); break; 37 | case vxsort_type::U64: vxsort_stats::print_stats(); break; 38 | case vxsort_type::F32: vxsort_stats::print_stats(); break; 39 | case vxsort_type::F64: vxsort_stats::print_stats(); break; 40 | case vxsort_type::NONE: break; 41 | } 42 | } 43 | } 44 | 45 | extern void reset_all_stats() { 46 | for (auto i = 0; i < vxsort_stats_base::last_type; i++) { 47 | switch (vxsort_stats_base::registered_types[i]) { 48 | case vxsort_type::I16: vxsort_stats::reset(); break; 49 | case vxsort_type::U16: vxsort_stats::reset(); break; 50 | case vxsort_type::I32: vxsort_stats::reset(); break; 51 | case vxsort_type::U32: vxsort_stats::reset(); break; 52 | case vxsort_type::I64: vxsort_stats::reset(); break; 53 | case vxsort_type::U64: vxsort_stats::reset(); break; 54 | case vxsort_type::F32: vxsort_stats::reset(); break; 55 | case vxsort_type::F64: vxsort_stats::reset(); break; 56 | case vxsort_type::NONE: break; 57 | } 58 | } 59 | } 60 | 61 | } // namespace vxsort 62 | 63 | #endif -------------------------------------------------------------------------------- /vxsort/stats/vxsort_stats.h: -------------------------------------------------------------------------------- 1 | #ifndef VXSORT_VXSORT_STATS_H 2 | #define VXSORT_VXSORT_STATS_H 3 | 4 | #ifdef VXSORT_STATS 5 | 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include "defs.h" 11 | 12 | 13 
| using namespace std; 14 | namespace vxsort { 15 | using namespace vxsort::types; 16 | 17 | enum class vxsort_type { 18 | I16, 19 | U16, 20 | I32, 21 | U32, 22 | I64, 23 | U64, 24 | F32, 25 | F64, 26 | NONE 27 | }; 28 | 29 | class vxsort_stats_base 30 | { 31 | public: 32 | static std::array registered_types; 33 | static i32 last_type; 34 | protected: 35 | static void reset() 36 | { 37 | #ifdef VXSORT_STATS 38 | last_type = 0; 39 | #endif 40 | } 41 | 42 | template 43 | static vxsort_type typeid_to_vxsort_type() { 44 | if (typeid(T) == typeid(i16)) 45 | return vxsort_type::I16; 46 | else if (typeid(T) == typeid(i32)) 47 | return vxsort_type::I32; 48 | else if (typeid(T) == typeid(i64)) 49 | return vxsort_type::I64; 50 | 51 | if (typeid(T) == typeid(u16)) 52 | return vxsort_type::U16; 53 | else if (typeid(T) == typeid(u32)) 54 | return vxsort_type::U32; 55 | else if (typeid(T) == typeid(u64)) 56 | return vxsort_type::U64; 57 | 58 | if (typeid(T) == typeid(f32)) 59 | return vxsort_type::F32; 60 | if (typeid(T) == typeid(f64)) 61 | return vxsort_type::F64; 62 | return vxsort_type::NONE; 63 | } 64 | 65 | static const char *vxsort_type_to_str(const vxsort_type type) { 66 | switch (type) { 67 | case vxsort_type::I16: return "i16"; 68 | case vxsort_type::U16: return "u16"; 69 | case vxsort_type::I32: return "i32"; 70 | case vxsort_type::U32: return "u32"; 71 | case vxsort_type::I64: return "i64"; 72 | case vxsort_type::U64: return "u64"; 73 | case vxsort_type::F32: return "f32"; 74 | case vxsort_type::F64: return "f64"; 75 | case vxsort_type::NONE: return "none"; 76 | } 77 | } 78 | 79 | static void register_stat(const vxsort_type type) 80 | { 81 | for (auto i = 0; i < last_type; i++) 82 | if (registered_types[i] == type) 83 | return; 84 | 85 | registered_types[last_type++] = type; 86 | } 87 | }; 88 | 89 | template 90 | class vxsort_stats : vxsort_stats_base 91 | { 92 | private: 93 | static u64 _num_sorts; 94 | static u64 _total_sort_size; 95 | static u64 _num_partitions; 96 | static u64 _total_partitioned_size; 97 | static u64 _num_small_sorts; 98 | static u64 _small_sorts_size; 99 | static u64 _packed_elements; 100 | static u64 _unpacked_elements; 101 | static u64 _num_perms; 102 | static u64 _num_vec_loads; 103 | static u64 _num_vec_stores; 104 | 105 | public: 106 | static void reset() 107 | { 108 | _num_sorts = 0; 109 | _total_sort_size = 0; 110 | _num_partitions = 0; 111 | _total_partitioned_size = 0; 112 | _num_small_sorts = 0; 113 | _small_sorts_size = 0; 114 | _packed_elements = 0; 115 | _unpacked_elements = 0; 116 | _num_perms = 0; 117 | _num_vec_loads = 0; 118 | _num_vec_stores = 0; 119 | 120 | } 121 | static void bump_sorts(size_t n) { 122 | _num_sorts++; 123 | _total_sort_size += n; 124 | vxsort_stats_base::register_stat(vxsort_stats_base::typeid_to_vxsort_type()); 125 | } 126 | static void bump_partitions(size_t n) { 127 | _num_partitions++; 128 | _total_partitioned_size += n; 129 | } 130 | static void bump_small_sorts(i32 n = 1) { _num_small_sorts++; } 131 | static void bump_perms(usize perms = 1) { _num_perms += perms; } 132 | static void bump_vec_loads(usize loads = 1) { _num_vec_loads += loads; } 133 | static void bump_vec_stores(usize stores = 1) { _num_vec_stores += stores; } 134 | static void record_small_sort_size(usize sort_size) { _small_sorts_size += sort_size; } 135 | static void bump_packs(usize size) { _packed_elements += size; } 136 | static void bump_unpacks(usize size) { _unpacked_elements += size; } 137 | 138 | static void print_stats(); 139 | }; 140 | 141 | extern 
void reset_all_stats(); 142 | extern void print_all_stats(); 143 | 144 | template 145 | u64 vxsort_stats::_num_sorts = 0; 146 | template 147 | u64 vxsort_stats::_total_sort_size = 0; 148 | template 149 | u64 vxsort_stats::_num_partitions = 0; 150 | template 151 | u64 vxsort_stats::_total_partitioned_size = 0; 152 | template 153 | u64 vxsort_stats::_num_small_sorts = 0; 154 | template 155 | u64 vxsort_stats::_small_sorts_size = 0; 156 | template 157 | u64 vxsort_stats::_packed_elements = 0; 158 | template 159 | u64 vxsort_stats::_unpacked_elements = 0; 160 | template 161 | u64 vxsort_stats::_num_perms = 0; 162 | template 163 | u64 vxsort_stats::_num_vec_loads = 0; 164 | template 165 | u64 vxsort_stats::_num_vec_stores = 0; 166 | 167 | } 168 | 169 | #endif 170 | #endif // VXSORT_VXSORT_STATS_H 171 | -------------------------------------------------------------------------------- /vxsort/vector_machine/avx2/f32.h: -------------------------------------------------------------------------------- 1 | template <> 2 | class vxsort_machine_traits { 3 | public: 4 | typedef f32 T; 5 | typedef __m256 TV; 6 | typedef __m256i TLOADSTOREMASK; 7 | typedef u32 TCMPMASK; 8 | typedef f32 TPACK; 9 | 10 | static constexpr i32 N = sizeof(TV) / sizeof(T); 11 | static_assert(is_powerof2(N), "vector-size / element-size must be a power of 2"); 12 | 13 | static constexpr bool supports_compress_writes() { return false; } 14 | static constexpr bool supports_packing() { return false; } 15 | 16 | template 17 | static bool can_pack(T) { return false; } 18 | 19 | static INLINE TLOADSTOREMASK generate_prefix_mask(i32 amount) { 20 | assert(amount >= 0); 21 | assert(amount <= N); 22 | return _mm256_cvtepi8_epi32(_mm_loadu_si128((__m128i*)(prefix_mask_table_32b + amount * N))); 23 | } 24 | 25 | static INLINE TLOADSTOREMASK generate_suffix_mask(i32 amount) { 26 | assert(amount >= 0); 27 | assert(amount <= N); 28 | return _mm256_cvtepi8_epi32(_mm_loadu_si128((__m128i*)(suffix_mask_table_32b + amount * N))); 29 | } 30 | 31 | static INLINE TV load_vec(TV* p) { return _mm256_loadu_ps((T *)p); } 32 | 33 | static INLINE void store_vec(TV* ptr, TV v) { _mm256_storeu_ps((T *)ptr, v); } 34 | 35 | static void store_compress_vec(TV*, TV, TCMPMASK) { throw std::runtime_error("operation is unsupported"); } 36 | 37 | static INLINE TV load_partial_vec(TV *p, TV base, TLOADSTOREMASK mask) { 38 | return i2s(_mm256_or_si256(s2i(_mm256_maskload_ps((T *) p, mask)), 39 | _mm256_andnot_si256(mask, s2i(base)))); 40 | } 41 | 42 | static INLINE void store_masked_vec(TV *p, TV v, TLOADSTOREMASK mask) { 43 | _mm256_maskstore_ps((T *) p, mask, v); 44 | } 45 | 46 | static INLINE TV partition_vector(TV v, i32 mask) { 47 | assert(mask >= 0); 48 | assert(mask <= 255); 49 | return _mm256_permutevar8x32_ps(v, _mm256_cvtepu8_epi32(_mm_loadu_si128((__m128i*)(perm_table_32 + mask * 8)))); 50 | } 51 | 52 | static INLINE TV broadcast(T pivot) { return _mm256_set1_ps(pivot); } 53 | 54 | static INLINE TCMPMASK get_cmpgt_mask(TV a, TV b) { 55 | /// 0x0E: Greater-than (ordered, signaling) \n 56 | /// 0x1E: Greater-than (ordered, non-signaling) 57 | return _mm256_movemask_ps(_mm256_cmp_ps(a, b, _CMP_GT_OS)); 58 | } 59 | 60 | static INLINE TV shift_right(TV v, i32 i) { return v; } 61 | static INLINE TV shift_left(TV v, i32 i) { return v; } 62 | 63 | static INLINE TV add(TV a, TV b) { return _mm256_add_ps(a, b); } 64 | static INLINE TV sub(TV a, TV b) { return _mm256_sub_ps(a, b); }; 65 | 66 | static INLINE TV pack_unordered(TV, TV) { throw std::runtime_error("operation 
is unsupported"); } 67 | static INLINE void unpack_ordered(TV, TV&, TV&) { } 68 | 69 | template 70 | static INLINE T shift_n_sub(T v, T sub) { return v; } 71 | 72 | template 73 | static T unshift_and_add(TPACK from, T add) { return add; } 74 | }; 75 | -------------------------------------------------------------------------------- /vxsort/vector_machine/avx2/f64.h: -------------------------------------------------------------------------------- 1 | template <> 2 | class vxsort_machine_traits { 3 | public: 4 | typedef f64 T; 5 | typedef __m256d TV; 6 | typedef __m256i TLOADSTOREMASK; 7 | typedef u32 TCMPMASK; 8 | typedef f64 TPACK; 9 | 10 | static constexpr i32 N = sizeof(TV) / sizeof(T); 11 | static_assert(is_powerof2(N), "vector-size / element-size must be a power of 2"); 12 | 13 | static constexpr bool supports_compress_writes() { return false; } 14 | static constexpr bool supports_packing() { return false; } 15 | 16 | template 17 | static bool can_pack(T) { return false; } 18 | 19 | static INLINE TLOADSTOREMASK generate_prefix_mask(i32 amount) { 20 | assert(amount >= 0); 21 | assert(amount <= N); 22 | return _mm256_cvtepi8_epi64(_mm_loadu_si128((__m128i*)(prefix_mask_table_64b + amount * N))); 23 | } 24 | 25 | static INLINE TLOADSTOREMASK generate_suffix_mask(i32 amount) { 26 | assert(amount >= 0); 27 | assert(amount <= N); 28 | return _mm256_cvtepi8_epi64(_mm_loadu_si128((__m128i*)(suffix_mask_table_64b + amount * N))); 29 | } 30 | 31 | static INLINE TV load_vec(TV* p) { return _mm256_loadu_pd((T *)p); } 32 | 33 | static INLINE void store_vec(TV* ptr, TV v) { _mm256_storeu_pd((T *)ptr, v); } 34 | 35 | static void store_compress_vec(TV* ptr, TV v, TCMPMASK mask) { throw std::runtime_error("operation is unsupported"); } 36 | 37 | static INLINE TV load_partial_vec(TV *p, TV base, TLOADSTOREMASK mask) { 38 | return i2d(_mm256_or_si256(d2i(_mm256_maskload_pd((T *) p, mask)), 39 | _mm256_andnot_si256(mask, d2i(base)))); 40 | } 41 | 42 | static INLINE void store_masked_vec(TV *p, TV v, TLOADSTOREMASK mask) { 43 | _mm256_maskstore_pd((double *) p, mask, v); 44 | } 45 | 46 | static INLINE TV partition_vector(TV v, i32 mask) { 47 | assert(mask >= 0); 48 | assert(mask <= 15); 49 | return s2d(_mm256_permutevar8x32_ps(d2s(v), _mm256_cvtepu8_epi32(_mm_loadu_si128((__m128i*)(perm_table_64 + mask * 8))))); 50 | } 51 | 52 | static INLINE TV broadcast(T pivot) { return _mm256_set1_pd(pivot); } 53 | static INLINE TCMPMASK get_cmpgt_mask(TV a, TV b) { 54 | /// 0x0E: Greater-than (ordered, signaling) \n 55 | /// 0x1E: Greater-than (ordered, non-signaling) 56 | return _mm256_movemask_pd(_mm256_cmp_pd(a, b, _CMP_GT_OS)); 57 | } 58 | 59 | static INLINE TV shift_right(TV v, i32 i) { return v; } 60 | static INLINE TV shift_left(TV v, i32 i) { return v; } 61 | 62 | static INLINE TV add(TV a, TV b) { return _mm256_add_pd(a, b); } 63 | static INLINE TV sub(TV a, TV b) { return _mm256_sub_pd(a, b); }; 64 | 65 | static INLINE TV pack_unordered(TV, TV) { TV tmp = _mm256_set1_pd(0); return tmp; } 66 | static INLINE void unpack_ordered(TV, TV&, TV&) { } 67 | 68 | template 69 | static INLINE T shift_n_sub(T v, T sub) { return v; } 70 | 71 | template 72 | static INLINE T unshift_and_add(TPACK from, T add) { return add; } 73 | }; 74 | -------------------------------------------------------------------------------- /vxsort/vector_machine/avx2/i16.h: -------------------------------------------------------------------------------- 1 | template <> 2 | class vxsort_machine_traits { 3 | public: 4 | typedef i16 T; 5 | typedef 
__m256i TV; 6 | typedef i32 TLOADSTOREMASK; 7 | typedef u32 TCMPMASK; 8 | typedef i16 TPACK; 9 | typedef typename std::make_unsigned::type TU; 10 | 11 | static constexpr i32 N = sizeof(TV) / sizeof(T); 12 | static_assert(is_powerof2(N), "vector-size / element-size must be a power of 2"); 13 | 14 | static constexpr bool supports_compress_writes() { return false; } 15 | static constexpr bool supports_packing() { return false; } 16 | 17 | template 18 | static bool can_pack(T) { return false; } 19 | 20 | static INLINE TLOADSTOREMASK generate_prefix_mask(i32 amount) { 21 | assert(amount >= 0); 22 | assert(amount <= N); 23 | 24 | return amount ? amount : N; 25 | } 26 | 27 | static INLINE TLOADSTOREMASK generate_suffix_mask(i32 amount) { 28 | assert(amount >= 0); 29 | assert(amount <= N); 30 | 31 | return amount ? -N + amount : -N; 32 | } 33 | 34 | static INLINE TV load_vec(TV* p) { return _mm256_lddqu_si256(p); } 35 | 36 | static INLINE void store_vec(TV* ptr, TV v) { _mm256_storeu_si256(ptr, v); } 37 | 38 | static void store_compress_vec(TV*, TV, TCMPMASK) { throw std::runtime_error("operation is unsupported"); } 39 | 40 | static INLINE TV load_partial_vec(TV *p, TV base, TLOADSTOREMASK mask) { 41 | // FML: There is only so much AVX2 stupidity one person can 42 | // take in their entire lifetime, I'm personally over this crap 43 | std::array base_vec; 44 | _mm256_storeu_si256((TV *)base_vec.data(), base); 45 | auto pt = (T *)p; 46 | auto psrc = mask > 0 ? pt : pt + N + mask; 47 | auto pdest = mask > 0 ? base_vec.begin() : base_vec.end() + mask; 48 | auto amount = abs(mask); 49 | std::copy_n(psrc, amount, pdest); 50 | return _mm256_lddqu_si256((TV *)base_vec.data()); 51 | } 52 | 53 | static INLINE void store_masked_vec(TV *p, TV v, TLOADSTOREMASK mask) { 54 | memcpy(p, &v, sizeof(T) * mask); 55 | } 56 | 57 | static INLINE TV partition_vector(TV, i32) { 58 | // Should never be called, since we "hijack" 16b/avx2 partitioning with template 59 | // specializtion with partition_machine 60 | throw std::runtime_error("operation is unsupported"); 61 | } 62 | 63 | static INLINE TV broadcast(T pivot) { return _mm256_set1_epi16(pivot); } 64 | 65 | static INLINE TCMPMASK get_cmpgt_mask(TV a, TV b) { 66 | return _pext_u32( 67 | _mm256_movemask_epi8(_mm256_cmpgt_epi16(a, b)), 68 | 0x55555555); 69 | } 70 | 71 | static INLINE TV shift_right(TV v, i32 i) { return _mm256_srli_epi16(v, i); } 72 | static INLINE TV shift_left(TV v, i32 i) { return _mm256_slli_epi16(v, i); } 73 | 74 | static INLINE TV add(TV a, TV b) { return _mm256_add_epi16(a, b); } 75 | static INLINE TV sub(TV a, TV b) { return _mm256_sub_epi16(a, b); }; 76 | 77 | static INLINE TV pack_unordered(TV, TV) { throw std::runtime_error("operation is unsupported"); } 78 | static INLINE void unpack_ordered(TV, TV&, TV&) { } 79 | 80 | template 81 | static INLINE T shift_n_sub(T v, T sub) { 82 | if (Shift > 0) 83 | v >>= Shift; 84 | v -= sub; 85 | return v; 86 | } 87 | 88 | template 89 | static INLINE T unshift_and_add(TPACK from, T add) { 90 | add += from; 91 | if (Shift > 0) 92 | add = (T) (((TU) add) << Shift); 93 | return add; 94 | } 95 | }; 96 | -------------------------------------------------------------------------------- /vxsort/vector_machine/avx2/i32.h: -------------------------------------------------------------------------------- 1 | template <> 2 | class vxsort_machine_traits { 3 | public: 4 | typedef i32 T; 5 | typedef __m256i TV; 6 | typedef __m256i TLOADSTOREMASK; 7 | typedef u32 TCMPMASK; 8 | typedef i16 TPACK; 9 | typedef typename 
std::make_unsigned::type TUPACK; 10 | typedef typename std::make_unsigned::type TU; 11 | static_assert(sizeof(TPACK)*2 == sizeof(T), "TPACK must be half-width of T"); 12 | 13 | static constexpr i32 N = sizeof(TV) / sizeof(T); 14 | static_assert(is_powerof2(N), "vector-size / element-size must be a power of 2"); 15 | 16 | static constexpr bool supports_compress_writes() { return false; } 17 | static constexpr bool supports_packing() { return true; } 18 | 19 | template 20 | static bool can_pack(T span) { 21 | constexpr auto PACK_LIMIT = (((TU)std::numeric_limits::max() + 1)) << Shift; 22 | return ((TU)span) < PACK_LIMIT; 23 | } 24 | 25 | static INLINE TLOADSTOREMASK generate_prefix_mask(i32 amount) { 26 | assert(amount >= 0); 27 | assert(amount <= N); 28 | return _mm256_cvtepi8_epi32(_mm_loadu_si128((__m128i*)(prefix_mask_table_32b + N * amount))); 29 | } 30 | 31 | static INLINE TLOADSTOREMASK generate_suffix_mask(i32 amount) { 32 | assert(amount >= 0); 33 | assert(amount <= N); 34 | return _mm256_cvtepi8_epi32(_mm_loadu_si128((__m128i*)(suffix_mask_table_32b + N * amount))); 35 | } 36 | 37 | static INLINE TV load_vec(TV* p) { return _mm256_lddqu_si256(p); } 38 | 39 | static INLINE void store_vec(TV* ptr, TV v) { _mm256_storeu_si256(ptr, v); } 40 | 41 | static INLINE void store_compress_vec(TV*, TV, TCMPMASK) { throw std::runtime_error("operation is unsupported"); } 42 | 43 | static INLINE TV load_partial_vec(TV *p, TV base, TLOADSTOREMASK mask) { 44 | return _mm256_or_si256(_mm256_maskload_epi32((i32 *) p, mask), 45 | _mm256_andnot_si256(mask, base)); 46 | } 47 | 48 | static INLINE void store_masked_vec(TV *p, TV v, TLOADSTOREMASK mask) { _mm256_maskstore_epi32((i32 *) p, mask, v); } 49 | 50 | static INLINE TV partition_vector(TV v, i32 mask) { 51 | assert(mask >= 0); 52 | assert(mask <= 255); 53 | return s2i(_mm256_permutevar8x32_ps(i2s(v), _mm256_cvtepi8_epi32(_mm_loadu_si128((__m128i*)(perm_table_32 + mask * 8))))); 54 | } 55 | 56 | static INLINE TV broadcast(T pivot) { return _mm256_set1_epi32(pivot); } 57 | 58 | static INLINE TCMPMASK get_cmpgt_mask(TV a, TV b) { return _mm256_movemask_ps(i2s(_mm256_cmpgt_epi32(a, b))); } 59 | 60 | static INLINE TV shift_right(TV v, i32 i) { return _mm256_srli_epi32(v, i); } 61 | static INLINE TV shift_left(TV v, i32 i) { return _mm256_slli_epi32(v, i); } 62 | 63 | static INLINE TV add(TV a, TV b) { return _mm256_add_epi32(a, b); } 64 | static INLINE TV sub(TV a, TV b) { return _mm256_sub_epi32(a, b); }; 65 | 66 | static INLINE TV pack_unordered(TV a, TV b) { return _mm256_packs_epi32(a, b); } 67 | 68 | static INLINE void unpack_ordered(TV p, TV& u1, TV& u2) { 69 | u1 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(p, 0)); 70 | u2 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(p, 1)); 71 | } 72 | 73 | template 74 | static INLINE T shift_n_sub(T v, T sub) { 75 | if (Shift > 0) 76 | v >>= Shift; 77 | v -= sub; 78 | return v; 79 | } 80 | 81 | template 82 | static INLINE T unshift_and_add(TPACK from, T add) { 83 | add += from; 84 | if (Shift > 0) 85 | add = (T) (((TU) add) << Shift); 86 | return add; 87 | } 88 | }; 89 | -------------------------------------------------------------------------------- /vxsort/vector_machine/avx2/i64.h: -------------------------------------------------------------------------------- 1 | template <> 2 | class vxsort_machine_traits { 3 | public: 4 | typedef i64 T; 5 | typedef __m256i TV; 6 | typedef __m256i TLOADSTOREMASK; 7 | typedef u32 TCMPMASK; 8 | typedef i32 TPACK; 9 | typedef typename std::make_unsigned::type TUPACK; 
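// Packing rationale (descriptive, see can_pack() below): when the value span of the
// range being sorted fits in 32 bits (after the compile-time Shift), pairs of i64
// vectors are packed into a single vector of half-width TPACK (i32) lanes mid-sort,
// halving the data volume; TUPACK/TU are the unsigned counterparts used for
// overflow-safe span and shift arithmetic.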
--------------------------------------------------------------------------------
/vxsort/vector_machine/avx2/u16.h:
--------------------------------------------------------------------------------
1 | template <>
2 | class vxsort_machine_traits<u16, AVX2> {
3 |  public:
4 |     typedef u16 T;
5 |     typedef __m256i TV;
6 |     typedef i32 TLOADSTOREMASK;
7 |     typedef u32 TCMPMASK;
8 |     typedef u16 TPACK;
9 |     typedef typename std::make_unsigned<T>::type TU;
10 |
11 |     static constexpr i32 N = sizeof(TV) / sizeof(T);
12 |     static_assert(is_powerof2(N), "vector-size / element-size must be a power of 2");
13 |
14 |     static constexpr bool supports_compress_writes() { return false; }
15 |     static constexpr bool supports_packing() { return false; }
16 |
17 |     template <i32 Shift>
18 |     static bool can_pack(T) { return false; }
19 |
20 |     static INLINE TLOADSTOREMASK generate_prefix_mask(i32 amount) {
21 |         assert(amount >= 0);
22 |         assert(amount <= N);
23 |
24 |         return amount ? amount : N;
25 |     }
26 |
27 |     static INLINE TLOADSTOREMASK generate_suffix_mask(i32 amount) {
28 |         assert(amount >= 0);
29 |         assert(amount <= N);
30 |
31 |         return amount ? -N + amount : -N;
32 |     }
33 |
34 |     static INLINE TV load_vec(TV* p) { return _mm256_lddqu_si256(p); }
35 |
36 |     static INLINE void store_vec(TV* ptr, TV v) { _mm256_storeu_si256(ptr, v); }
37 |
38 |     static void store_compress_vec(TV*, TV, TCMPMASK) { throw std::runtime_error("operation is unsupported"); }
39 |
40 |     static INLINE TV load_partial_vec(TV *p, TV base, TLOADSTOREMASK mask) {
41 |         // FML: There is only so much AVX2 stupidity one person can
42 |         // take in their entire lifetime, I'm personally over this crap
43 |         std::array<T, N> base_vec;
44 |         _mm256_storeu_si256((TV *)base_vec.data(), base);
45 |         auto pt = (T *)p;
46 |         auto psrc = mask > 0 ? pt : pt + N + mask;
47 |         auto pdest = mask > 0 ? base_vec.begin() : base_vec.end() + mask;
48 |         auto amount = abs(mask);
49 |         std::copy_n(psrc, amount, pdest);
50 |         return _mm256_lddqu_si256((TV *)base_vec.data());
51 |     }
52 |
53 |     static INLINE void store_masked_vec(TV *p, TV v, TLOADSTOREMASK mask) {
54 |         memcpy(p, &v, sizeof(T) * mask);
55 |     }
56 |
57 |     static INLINE TV partition_vector(TV, i32) {
58 |         // Should never be called, since we "hijack" 16b/avx2 partitioning with a template
59 |         // specialization of partition_machine
60 |         throw std::runtime_error("operation is unsupported");
61 |     }
62 |
63 |     static INLINE TV broadcast(T pivot) { return _mm256_set1_epi16(pivot); }
64 |
65 |     static INLINE TCMPMASK get_cmpgt_mask(TV a, TV b) {
66 |         __m256i top_bit = _mm256_set1_epi16(1U << 15);
67 |         return _pext_u32(
68 |             _mm256_movemask_epi8(_mm256_cmpgt_epi16(_mm256_xor_si256(top_bit, a),
69 |                                                     _mm256_xor_si256(top_bit, b))),
70 |             0x55555555);
71 |     }
72 |
73 |     static INLINE TV shift_right(TV v, i32 i) { return _mm256_srli_epi16(v, i); }
74 |     static INLINE TV shift_left(TV v, i32 i) { return _mm256_slli_epi16(v, i); }
75 |
76 |     static INLINE TV add(TV a, TV b) { return _mm256_add_epi16(a, b); }
77 |     static INLINE TV sub(TV a, TV b) { return _mm256_sub_epi16(a, b); }
78 |
79 |     static INLINE TV pack_unordered(TV, TV) { throw std::runtime_error("operation is unsupported"); }
80 |     static INLINE void unpack_ordered(TV, TV&, TV&) { }
81 |
82 |     template <i32 Shift>
83 |     static INLINE T shift_n_sub(T v, T sub) {
84 |         if (Shift > 0)
85 |             v >>= Shift;
86 |         v -= sub;
87 |         return v;
88 |     }
89 |
90 |     template <i32 Shift>
91 |     static INLINE T unshift_and_add(TPACK from, T add) {
92 |         add += from;
93 |         if (Shift > 0)
94 |             add = (T) (((TU) add) << Shift);
95 |         return add;
96 |     }
97 | };
98 |
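// --- Editorial sketch, not part of the repository sources ---
// Two tricks from get_cmpgt_mask above, in isolation. AVX2 has no
// unsigned 16-bit compare, so the sign bit of both operands is flipped
// first: (a ^ 0x8000) > (b ^ 0x8000) as *signed* i16 matches a > b as
// unsigned u16. Afterwards _mm256_movemask_epi8 yields two identical
// bits per 16-bit lane, and _pext_u32(m, 0x55555555) keeps every even
// bit, compacting the result to one bit per lane. Scalar check of the
// compare transform:
#include <cassert>
#include <cstdint>

static bool cmpgt_u16_via_signed(uint16_t a, uint16_t b) {
    auto as = (int16_t)(a ^ 0x8000u);
    auto bs = (int16_t)(b ^ 0x8000u);
    return as > bs;
}

int main() {
    assert(cmpgt_u16_via_signed(0xFFFF, 0x0001) == true);   // max > 1
    assert(cmpgt_u16_via_signed(0x0000, 0x8000) == false);  // 0 < 2^15
    assert(cmpgt_u16_via_signed(0x8001, 0x7FFF) == true);   // crosses the sign bit
    return 0;
}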
--------------------------------------------------------------------------------
/vxsort/vector_machine/avx2/u32.h:
--------------------------------------------------------------------------------
1 | template <>
2 | class vxsort_machine_traits<u32, AVX2> {
3 |  public:
4 |     typedef u32 T;
5 |     typedef __m256i TV;
6 |     typedef __m256i TLOADSTOREMASK;
7 |     typedef u32 TCMPMASK;
8 |     typedef u16 TPACK;
9 |     typedef typename std::make_unsigned<TPACK>::type TUPACK;
10 |     typedef typename std::make_unsigned<T>::type TU;
11 |     static_assert(sizeof(TPACK)*2 == sizeof(T), "TPACK must be half-width of T");
12 |
13 |     static constexpr i32 N = sizeof(TV) / sizeof(T);
14 |     static_assert(is_powerof2(N), "vector-size / element-size must be a power of 2");
15 |
16 |     static constexpr bool supports_compress_writes() { return false; }
17 |     static constexpr bool supports_packing() { return true; }
18 |
19 |     template <i32 Shift>
20 |     static bool can_pack(T span) {
21 |         constexpr auto PACK_LIMIT = (((TU)std::numeric_limits<TPACK>::max() + 1)) << Shift;
22 |         return ((TU)span) < PACK_LIMIT;
23 |     }
24 |
25 |     static INLINE TLOADSTOREMASK generate_prefix_mask(i32 amount) {
26 |         assert(amount >= 0);
27 |         assert(amount <= N);
28 |         return _mm256_cvtepi8_epi32(_mm_loadu_si128((__m128i*)(prefix_mask_table_32b + N * amount)));
29 |     }
30 |
31 |     static INLINE TLOADSTOREMASK generate_suffix_mask(i32 amount) {
32 |         assert(amount >= 0);
33 |         assert(amount <= N);
34 |         return _mm256_cvtepi8_epi32(_mm_loadu_si128((__m128i*)(suffix_mask_table_32b + N * amount)));
35 |     }
36 |
37 |     static INLINE TV load_vec(TV* p) { return _mm256_lddqu_si256(p); }
38 |
39 |     static INLINE void store_vec(TV* ptr, TV v) { _mm256_storeu_si256(ptr, v); }
40 |
41 |     static INLINE void store_compress_vec(TV*, TV, TCMPMASK) { throw std::runtime_error("operation is unsupported"); }
42 |
43 |     static INLINE TV load_partial_vec(TV *p, TV base, TLOADSTOREMASK mask) {
44 |         return _mm256_or_si256(_mm256_maskload_epi32((i32 *) p, mask),
45 |                                _mm256_andnot_si256(mask, base));
46 |     }
47 |
48 |     static INLINE void store_masked_vec(TV *p, TV v, TLOADSTOREMASK mask) { _mm256_maskstore_epi32((i32 *) p, mask, v); }
49 |
50 |     static INLINE TV partition_vector(TV v, i32 mask) {
51 |         assert(mask >= 0);
52 |         assert(mask <= 255);
53 |         return s2i(_mm256_permutevar8x32_ps(i2s(v), _mm256_cvtepi8_epi32(_mm_loadu_si128((__m128i*)(perm_table_32 + mask * 8)))));
54 |     }
55 |
56 |     static INLINE TV broadcast(T pivot) { return _mm256_set1_epi32(pivot); }
57 |
58 |     static INLINE TCMPMASK get_cmpgt_mask(TV a, TV b) {
59 |         __m256i top_bit = _mm256_set1_epi32(1U << 31);
60 |         return _mm256_movemask_ps(i2s(_mm256_cmpgt_epi32(_mm256_xor_si256(top_bit, a), _mm256_xor_si256(top_bit, b))));
61 |     }
62 |
63 |     static INLINE TV shift_right(TV v, i32 i) { return _mm256_srli_epi32(v, i); }
64 |     static INLINE TV shift_left(TV v, i32 i) { return _mm256_slli_epi32(v, i); }
65 |
66 |     static INLINE TV add(TV a, TV b) { return _mm256_add_epi32(a, b); }
67 |     static INLINE TV sub(TV a, TV b) { return _mm256_sub_epi32(a, b); }
68 |
69 |     static INLINE TV pack_unordered(TV a, TV b) { return _mm256_packus_epi32(a, b); }
70 |
71 |     static INLINE void unpack_ordered(TV p, TV& u1, TV& u2) {
72 |         u1 = _mm256_cvtepu16_epi32(_mm256_extracti128_si256(p, 0));
73 |         u2 = _mm256_cvtepu16_epi32(_mm256_extracti128_si256(p, 1));
74 |     }
75 |
76 |     template <i32 Shift>
77 |     static INLINE T shift_n_sub(T v, T sub) {
78 |         if (Shift > 0)
79 |             v >>= Shift;
80 |         v -= sub;
81 |         return v;
82 |     }
83 |
84 |     template <i32 Shift>
85 |     static INLINE T unshift_and_add(TPACK from, T add) {
86 |         add += from;
87 |         if (Shift > 0)
88 |             add = (T) (((TU) add) << Shift);
89 |         return add;
90 |     }
91 | };
92 |
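// --- Editorial sketch, not part of the repository sources ---
// What can_pack above evaluates, as plain scalar arithmetic: a u32
// partition span is packable into u16 lanes when it is strictly below
// (max(u16) + 1) << Shift, i.e. 65536 for Shift == 0.
#include <cassert>
#include <cstdint>
#include <limits>

template <int Shift>
static bool can_pack_u32(uint32_t span) {
    constexpr auto PACK_LIMIT =
        ((uint32_t)std::numeric_limits<uint16_t>::max() + 1) << Shift;
    return span < PACK_LIMIT;
}

int main() {
    assert(can_pack_u32<0>(65535));   // 65535 < 65536: fits
    assert(!can_pack_u32<0>(65536));  // one past the Shift == 0 limit
    assert(can_pack_u32<2>(262143));  // Shift == 2 widens the limit to 2^18
    return 0;
}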
--------------------------------------------------------------------------------
/vxsort/vector_machine/avx2/u64.h:
--------------------------------------------------------------------------------
1 | template <>
2 | class vxsort_machine_traits<u64, AVX2> {
3 |  public:
4 |     typedef u64 T;
5 |     typedef __m256i TV;
6 |     typedef __m256i TLOADSTOREMASK;
7 |     typedef u32 TCMPMASK;
8 |     typedef u32 TPACK;
9 |     typedef typename std::make_unsigned<TPACK>::type TUPACK;
10 |     typedef typename std::make_unsigned<T>::type TU;
11 |     static_assert(sizeof(TPACK)*2 == sizeof(T), "TPACK must be half-width of T");
12 |
13 |     static constexpr i32 N = sizeof(TV) / sizeof(T);
14 |     static_assert(is_powerof2(N), "vector-size / element-size must be a power of 2");
15 |
16 |     static constexpr bool supports_compress_writes() { return false; }
17 |     static constexpr bool supports_packing() { return true; }
18 |
19 |     template <i32 Shift>
20 |     static bool can_pack(T span) {
21 |         constexpr auto PACK_LIMIT = (((TU) std::numeric_limits<TPACK>::max() + 1)) << Shift;
22 |         return ((TU)span) < PACK_LIMIT;
23 |     }
24 |
25 |     static INLINE TLOADSTOREMASK generate_prefix_mask(i32 amount) {
26 |         assert(amount >= 0);
27 |         assert(amount <= N);
28 |         return _mm256_cvtepi8_epi64(_mm_loadu_si128((__m128i*)(prefix_mask_table_64b + N * amount)));
29 |     }
30 |
31 |     static INLINE TLOADSTOREMASK generate_suffix_mask(i32 amount) {
32 |         assert(amount >= 0);
33 |         assert(amount <= N);
34 |         return _mm256_cvtepi8_epi64(_mm_loadu_si128((__m128i*)(suffix_mask_table_64b + N * amount)));
35 |     }
36 |
37 |     static INLINE TV load_vec(TV* p) { return _mm256_lddqu_si256(p); }
38 |
39 |     static INLINE void store_vec(TV* ptr, TV v) { _mm256_storeu_si256(ptr, v); }
40 |
41 |     static void store_compress_vec(TV*, TV, TCMPMASK) { throw std::runtime_error("operation is unsupported"); }
42 |
43 |     static INLINE TV load_partial_vec(TV *p, TV base, TLOADSTOREMASK mask) {
44 |         return _mm256_or_si256(_mm256_maskload_epi64((const long long *) p, mask),
45 |                                _mm256_andnot_si256(mask, base));
46 |     }
47 |
48 |     static INLINE void store_masked_vec(TV *p, TV v, TLOADSTOREMASK mask) {
49 |         _mm256_maskstore_epi64((long long *) p, mask, v);
50 |     }
51 |
52 |     static INLINE TV partition_vector(TV v, i32 mask) {
53 |         assert(mask >= 0);
54 |         assert(mask <= 15);
55 |         return s2i(_mm256_permutevar8x32_ps(i2s(v), _mm256_cvtepu8_epi32(_mm_loadu_si128((__m128i*)(perm_table_64 + mask * 8)))));
56 |     }
57 |
58 |     static INLINE TV broadcast(T pivot) { return _mm256_set1_epi64x(pivot); }
59 |
60 |     static INLINE TCMPMASK get_cmpgt_mask(TV a, TV b) {
61 |         __m256i top_bit = _mm256_set1_epi64x(1LLU << 63);
62 |         return _mm256_movemask_pd(i2d(_mm256_cmpgt_epi64(_mm256_xor_si256(top_bit, a), _mm256_xor_si256(top_bit, b))));
63 |     }
64 |
65 |     static INLINE TV shift_right(TV v, i32 i) { return _mm256_srli_epi64(v, i); }
66 |     static INLINE TV shift_left(TV v, i32 i) { return _mm256_slli_epi64(v, i); }
67 |
68 |     static INLINE TV add(TV a, TV b) { return _mm256_add_epi64(a, b); }
69 |     static INLINE TV sub(TV a, TV b) { return _mm256_sub_epi64(a, b); }
70 |
71 |     static INLINE TV pack_unordered(TV a, TV b) {
72 |         b = _mm256_shuffle_epi32(b, _MM_PERM_CDAB);
73 |         return _mm256_blend_epi32(a, b, 0b10101010);
74 |     }
75 |
76 |     static INLINE void unpack_ordered(TV p, TV& u1, TV& u2) {
77 |         auto p01 = _mm256_extracti128_si256(p, 0);
78 |         auto p02 = _mm256_extracti128_si256(p, 1);
79 |
80 |         u1 = _mm256_cvtepu32_epi64(p01);
81 |         u2 = _mm256_cvtepu32_epi64(p02);
82 |     }
83 |
84 |     template <i32 Shift>
85 |     static T shift_n_sub(T v, T sub) {
86 |         if (Shift > 0)
87 |             v >>= Shift;
88 |         v -= sub;
89 |         return v;
90 |     }
91 |
92 |     template <i32 Shift>
93 |     static T unshift_and_add(TPACK from, T add) {
94 |         add += from;
95 |         if (Shift > 0)
96 |             add = (T) (((TU) add) << Shift);
97 |         return add;
98 |     }
99 | };
100 |
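// --- Editorial sketch, not part of the repository sources ---
// A scalar model of the effect of partition_vector above: the compare
// mask (bit i set means lane i is greater than the pivot) indexes a
// precomputed permutation that gathers the <= lanes to the front and
// the > lanes to the back in a single shuffle. The exact ordering
// inside each group comes from the generated perm_table_* entries;
// this model only illustrates the front/back split, for 4 u64 lanes:
#include <array>
#include <cassert>
#include <cstdint>

static std::array<uint64_t, 4> partition_model(std::array<uint64_t, 4> v, int mask) {
    std::array<uint64_t, 4> out{};
    int lo = 0, hi = 3;
    for (int i = 0; i < 4; i++) {
        if (mask & (1 << i))
            out[hi--] = v[i];  // lane i > pivot: goes to the back
        else
            out[lo++] = v[i];  // lane i <= pivot: goes to the front
    }
    return out;
}

int main() {
    // mask 0b0101: lanes 0 and 2 compared greater than the pivot
    auto r = partition_model({9, 1, 8, 2}, 0b0101);
    assert(r[0] == 1 && r[1] == 2);  // the two <= values lead
    assert(r[2] + r[3] == 9 + 8);    // the two > values trail
    return 0;
}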
--------------------------------------------------------------------------------
/vxsort/vector_machine/avx512/f32.h:
--------------------------------------------------------------------------------
1 | template <>
2 | class vxsort_machine_traits<f32, AVX512> {
3 |  public:
4 |     typedef f32 T;
5 |     typedef __m512 TV;
6 |     typedef __mmask16 TLOADSTOREMASK;
7 |     typedef __mmask16 TCMPMASK;
8 |     typedef f32 TPACK;
9 |
10 |     static constexpr i32 N = sizeof(TV) / sizeof(T);
11 |     static_assert(is_powerof2(N), "vector-size / element-size must be a power of 2");
12 |
13 |     static constexpr bool supports_compress_writes() { return true; }
14 |     static constexpr bool supports_packing() { return false; }
15 |
16 |     template <i32 Shift>
17 |     static bool can_pack(T) { return false; }
18 |
19 |     static INLINE TLOADSTOREMASK generate_prefix_mask(i32 amount) {
20 |         assert(amount >= 0);
21 |         assert(amount <= N);
22 |         return 0xFFFF >> ((N - amount) & (N-1));
23 |     }
24 |
25 |     static INLINE TLOADSTOREMASK generate_suffix_mask(i32 amount) {
26 |         assert(amount >= 0);
27 |         assert(amount <= N);
28 |         return 0xFFFF << (amount & (N-1));
29 |     }
30 |
31 |     static INLINE TV load_vec(TV* p) { return _mm512_loadu_ps(p); }
32 |
33 |     static INLINE void store_vec(TV* ptr, TV v) { _mm512_storeu_ps(ptr, v); }
34 |
35 |     static TV load_partial_vec(TV *ptr, TV base, TLOADSTOREMASK mask) {
36 |         return _mm512_mask_loadu_ps(base, mask, (T const *) ptr);
37 |     }
38 |
39 |     static INLINE void store_masked_vec(TV * p, TV v, TLOADSTOREMASK mask) {
40 |         _mm512_mask_storeu_ps(p, mask, v);
41 |     }
42 |
43 |     // Will never be called
44 |     static INLINE TV partition_vector(TV v, i32 mask) { return v; }
45 |
46 |     static void store_compress_vec(TV* ptr, TV v, TCMPMASK mask) { _mm512_mask_compressstoreu_ps(ptr, mask, v); }
47 |
48 |     static INLINE TV broadcast(T pivot) { return _mm512_set1_ps(pivot); }
49 |
50 |     static INLINE TCMPMASK get_cmpgt_mask(TV a, TV b) { return _mm512_cmp_ps_mask(a, b, _CMP_GT_OS); }
51 |
52 |     static INLINE TV shift_right(TV v, i32 i) { return v; }
53 |     static INLINE TV shift_left(TV v, i32 i) { return v; }
54 |
55 |     static INLINE TV add(TV a, TV b) { return _mm512_add_ps(a, b); }
56 |     static INLINE TV sub(TV a, TV b) { return _mm512_sub_ps(a, b); }
57 |
58 |     static INLINE TV pack_unordered(TV a, TV b) { return a; }
59 |     static INLINE void unpack_ordered(TV p, TV& u1, TV& u2) { }
60 |     template <i32 Shift>
61 |     static INLINE T shift_n_sub(T v, T sub) { return v; }
62 |
63 |     template <i32 Shift>
64 |     static INLINE T unshift_and_add(TPACK from, T add) { return add; }
65 | };
66 |
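// --- Editorial sketch, not part of the repository sources ---
// Why supports_compress_writes() is true on AVX-512: one compress-store
// per side partitions a whole vector branchlessly. A minimal,
// hypothetical single-vector kernel (each output buffer must have room
// for a full vector; compile for avx512f):
#include <immintrin.h>

void partition_one_vec_f32(const float* src, float pivot,
                           float* left_out, float* right_out) {
    __m512 v = _mm512_loadu_ps(src);
    __mmask16 gt = _mm512_cmp_ps_mask(v, _mm512_set1_ps(pivot), _CMP_GT_OS);
    // lanes <= pivot, written contiguously to the left side
    _mm512_mask_compressstoreu_ps(left_out, (__mmask16)~gt, v);
    // lanes > pivot, written contiguously to the right side
    _mm512_mask_compressstoreu_ps(right_out, gt, v);
}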
--------------------------------------------------------------------------------
/vxsort/vector_machine/avx512/f64.h:
--------------------------------------------------------------------------------
1 | template <>
2 | class vxsort_machine_traits<f64, AVX512> {
3 |  public:
4 |     typedef f64 T;
5 |     typedef __m512d TV;
6 |     typedef __mmask8 TLOADSTOREMASK;
7 |     typedef __mmask8 TCMPMASK;
8 |     typedef f64 TPACK;
9 |
10 |     static constexpr i32 N = sizeof(TV) / sizeof(T);
11 |     static_assert(is_powerof2(N), "vector-size / element-size must be a power of 2");
12 |
13 |     static constexpr bool supports_compress_writes() { return true; }
14 |     static constexpr bool supports_packing() { return false; }
15 |
16 |     template <i32 Shift>
17 |     static bool can_pack(T) { return false; }
18 |
19 |     static INLINE TLOADSTOREMASK generate_prefix_mask(i32 amount) {
20 |         assert(amount >= 0);
21 |         assert(amount <= N);
22 |         return 0xFF >> ((N - amount) & (N-1));
23 |     }
24 |
25 |     static INLINE TLOADSTOREMASK generate_suffix_mask(i32 amount) {
26 |         assert(amount >= 0);
27 |         assert(amount <= N);
28 |         return 0xFF << (amount & (N-1));
29 |     }
30 |
31 |     static INLINE TV load_vec(TV* p) { return _mm512_loadu_pd(p); }
32 |
33 |     static INLINE void store_vec(TV* ptr, TV v) { _mm512_storeu_pd(ptr, v); }
34 |
35 |     static INLINE TV load_partial_vec(TV *ptr, TV base, TLOADSTOREMASK mask) {
36 |         return _mm512_mask_loadu_pd(base, mask, (T const *) ptr);
37 |     }
38 |
39 |     static INLINE void store_masked_vec(TV * p, TV v, TLOADSTOREMASK mask) {
40 |         _mm512_mask_storeu_pd(p, mask, v);
41 |     }
42 |
43 |     // Will never be called
44 |     static INLINE TV partition_vector(TV v, i32 mask) { return v; }
45 |
46 |     static void store_compress_vec(TV* ptr, TV v, TCMPMASK mask) { _mm512_mask_compressstoreu_pd(ptr, mask, v); }
47 |     static INLINE TV broadcast(T pivot) { return _mm512_set1_pd(pivot); }
48 |     static INLINE TCMPMASK get_cmpgt_mask(TV a, TV b) { return _mm512_cmp_pd_mask(a, b, _CMP_GT_OS); }
49 |
50 |     static INLINE TV shift_right(TV v, i32 i) { return v; }
51 |     static INLINE TV shift_left(TV v, i32 i) { return v; }
52 |
53 |     static INLINE TV add(TV a, TV b) { return _mm512_add_pd(a, b); }
54 |     static INLINE TV sub(TV a, TV b) { return _mm512_sub_pd(a, b); }
55 |
56 |     static INLINE TV pack_unordered(TV a, TV b) { return a; }
57 |     static INLINE void unpack_ordered(TV p, TV& u1, TV& u2) { }
58 |     template <i32 Shift>
59 |     static T shift_n_sub(T v, T sub) { return v; }
60 |
61 |     template <i32 Shift>
62 |     static T unshift_and_add(TPACK from, T add) { return add; }
63 | };
64 |
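// --- Editorial sketch, not part of the repository sources ---
// The AVX-512 mask arithmetic above, evaluated for N == 8 (f64 lanes).
// prefix(amount) evidently enables the first `amount` lanes and
// suffix(amount) the lanes from `amount` upward; note that the & (N-1)
// wrap maps amount == 0 and amount == N to the same shift, so
// prefix(0) and suffix(0) both come out as the full mask.
#include <cassert>
#include <cstdint>

int main() {
    const int N = 8;
    auto prefix = [&](int amount) { return (uint8_t)(0xFF >> ((N - amount) & (N - 1))); };
    auto suffix = [&](int amount) { return (uint8_t)(0xFF << (amount & (N - 1))); };
    assert(prefix(3) == 0b00000111);  // first 3 lanes
    assert(prefix(8) == 0xFF);        // whole vector
    assert(suffix(3) == 0b11111000);  // lanes 3..7
    assert(suffix(0) == 0xFF);        // wraps to the full mask
    return 0;
}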
--------------------------------------------------------------------------------
/vxsort/vector_machine/avx512/i16.h:
--------------------------------------------------------------------------------
1 | template <>
2 | class vxsort_machine_traits<i16, AVX512> {
3 |  public:
4 |     typedef i16 T;
5 |     typedef __m512i TV;
6 |     typedef __mmask32 TLOADSTOREMASK;
7 |     typedef __mmask32 TCMPMASK;
8 |     typedef i16 TPACK;
9 |     typedef typename std::make_unsigned<T>::type TU;
10 |
11 |     static constexpr i32 N = sizeof(TV) / sizeof(T);
12 |     static_assert(is_powerof2(N), "vector-size / element-size must be a power of 2");
13 |
14 |     static constexpr bool supports_compress_writes() { return true; }
15 |     static constexpr bool supports_packing() { return false; }
16 |
17 |     template <i32 Shift>
18 |     static bool can_pack(T) { return false; }
19 |
20 |     static INLINE TLOADSTOREMASK generate_prefix_mask(i32 amount) {
21 |         assert(amount >= 0);
22 |         assert(amount <= N);
23 |         return 0xFFFFFFFF >> ((N - amount) & (N-1));
24 |     }
25 |
26 |     static INLINE TLOADSTOREMASK generate_suffix_mask(i32 amount) {
27 |         assert(amount >= 0);
28 |         assert(amount <= N);
29 |         return 0xFFFFFFFF << (amount & (N-1));
30 |     }
31 |
32 |     static INLINE TV load_vec(TV* p) { return _mm512_loadu_si512(p); }
33 |
34 |     static INLINE void store_vec(TV* ptr, TV v) { _mm512_storeu_si512(ptr, v); }
35 |
36 |     static INLINE TV load_partial_vec(TV *ptr, TV base, TLOADSTOREMASK mask) {
37 |         return _mm512_mask_loadu_epi16(base, mask, (T const *) ptr);
38 |     }
39 |
40 |     static INLINE void store_masked_vec(TV * p, TV v, TLOADSTOREMASK mask) {
41 |         _mm512_mask_storeu_epi16(p, mask, v);
42 |     }
43 |
44 |     // Will never be called
45 |     static INLINE TV partition_vector(TV v, i32 mask) { return v; }
46 |
47 |     static INLINE void store_compress_vec(TV* ptr, TV v, TCMPMASK mask) { _mm512_mask_compressstoreu_epi16(ptr, mask, v); }
48 |
49 |     static INLINE TV broadcast(T pivot) { return _mm512_set1_epi16(pivot); }
50 |
51 |     static INLINE TCMPMASK get_cmpgt_mask(TV a, TV b) { return _mm512_cmp_epi16_mask(a, b, _MM_CMPINT_GT); }
52 |
53 |     static INLINE TV shift_right(TV v, i32 i) { return _mm512_srli_epi16(v, i); }
54 |     static INLINE TV shift_left(TV v, i32 i) { return _mm512_slli_epi16(v, i); }
55 |
56 |     static INLINE TV add(TV a, TV b) { return _mm512_add_epi16(a, b); }
57 |     static INLINE TV sub(TV a, TV b) { return _mm512_sub_epi16(a, b); }
58 |
59 |     static INLINE TV pack_unordered(TV a, TV b) { return a; }
60 |     static INLINE void unpack_ordered(TV p, TV& u1, TV& u2) { }
61 |
62 |     template <i32 Shift>
63 |     static INLINE T shift_n_sub(T v, T sub) {
64 |         if (Shift > 0)
65 |             v >>= Shift;
66 |         v -= sub;
67 |         return v;
68 |     }
69 |
70 |     template <i32 Shift>
71 |     static INLINE T unshift_and_add(TPACK from, T add) {
72 |         add += from;
73 |         if (Shift > 0)
74 |             add = (T) (((TU) add) << Shift);
75 |         return add;
76 |     }
77 | };
78 |
--------------------------------------------------------------------------------
/vxsort/vector_machine/avx512/i32.h:
--------------------------------------------------------------------------------
1 | template <>
2 | class vxsort_machine_traits<i32, AVX512> {
3 |  public:
4 |     typedef i32 T;
5 |     typedef __m512i TV;
6 |     typedef __mmask16 TLOADSTOREMASK;
7 |     typedef __mmask16 TCMPMASK;
8 |     typedef i16 TPACK;
9 |     typedef typename std::make_unsigned<TPACK>::type TUPACK;
10 |     typedef typename std::make_unsigned<T>::type TU;
11 |     static_assert(sizeof(TPACK)*2 == sizeof(T), "TPACK must be half-width of T");
12 |
13 |     static constexpr i32 N = sizeof(TV) / sizeof(T);
14 |     static_assert(is_powerof2(N), "vector-size / element-size must be a power of 2");
15 |
16 |     static constexpr bool supports_compress_writes() { return true; }
17 |     static constexpr bool supports_packing() { return true; }
18 |
19 |     template <i32 Shift>
20 |     static bool can_pack(T span) {
21 |         if (!supports_vector_machine<AVX512>(sizeof(TPACK))) {
22 |             return false;
23 |         }
24 |         constexpr auto PACK_LIMIT = (((TU)std::numeric_limits<TPACK>::max() + 1)) << Shift;
25 |         return ((TU)span) < PACK_LIMIT;
26 |     }
27 |
28 |     static INLINE TLOADSTOREMASK generate_prefix_mask(i32 amount) {
29 |         assert(amount >= 0);
30 |         assert(amount <= N);
31 |         return 0xFFFF >> ((N - amount) & (N-1));
32 |     }
33 |
34 |     static INLINE TLOADSTOREMASK generate_suffix_mask(i32 amount) {
35 |         assert(amount >= 0);
36 |         assert(amount <= N);
37 |         return 0xFFFF << (amount & (N-1));
38 |     }
39 |
40 |     static INLINE TV load_vec(TV* p) { return _mm512_loadu_si512(p); }
41 |
42 |     static INLINE void store_vec(TV* ptr, TV v) { _mm512_storeu_si512(ptr, v); }
43 |
44 |     static INLINE TV load_partial_vec(TV *p, TV base, TLOADSTOREMASK mask) {
45 |         return _mm512_mask_loadu_epi32(base, mask, (i32 const *) p);
46 |     }
47 |
48 |     static INLINE void store_masked_vec(TV *p, TV v, TLOADSTOREMASK mask) {
49 |         _mm512_mask_storeu_epi32(p, mask, v);
50 |     }
51 |
52 |     // Will never be called
53 |     static INLINE TV partition_vector(TV v, i32 mask) { return v; }
54 |
55 |     static INLINE void store_compress_vec(TV* ptr, TV v, TCMPMASK mask) { _mm512_mask_compressstoreu_epi32(ptr, mask, v); }
56 |
57 |     static INLINE TV broadcast(T pivot) { return _mm512_set1_epi32(pivot); }
58 |
59 |     static INLINE TCMPMASK get_cmpgt_mask(TV a, TV b) { return _mm512_cmp_epi32_mask(a, b, _MM_CMPINT_GT); }
60 |
61 |     static INLINE TV shift_right(TV v, i32 i) { return _mm512_srli_epi32(v, i); }
62 |     static INLINE TV shift_left(TV v, i32 i) { return _mm512_slli_epi32(v, i); }
63 |
64 |     static INLINE TV add(TV a, TV b) { return _mm512_add_epi32(a, b); }
65 |     static INLINE TV sub(TV a, TV b) { return _mm512_sub_epi32(a, b); }
66 |
67 |     static INLINE TV pack_unordered(TV a, TV b) { return _mm512_packs_epi32(a, b); }
68 |     static INLINE void unpack_ordered(TV p, TV& u1, TV& u2) {
69 |         u1 = _mm512_cvtepi16_epi32(_mm512_extracti32x8_epi32(p, 0));
70 |         u2 = _mm512_cvtepi16_epi32(_mm512_extracti32x8_epi32(p, 1));
71 |     }
72 |
73 |     template <i32 Shift>
74 |     static INLINE T shift_n_sub(T v, T sub) {
75 |         if (Shift > 0)
76 |             v >>= Shift;
77 |         v -= sub;
78 |         return v;
79 |     }
80 |
81 |     template <i32 Shift>
82 |     static INLINE T unshift_and_add(TPACK from, T add) {
83 |         add += from;
84 |         if (Shift > 0)
85 |             add = (T) (((TU) add) << Shift);
86 |         return add;
87 |     }
88 | };
89 |
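// --- Editorial sketch, not part of the repository sources ---
// Why the helper above is called pack_unordered: the packs family packs
// per 128-bit lane, so the halves of a and b come out interleaved, not
// concatenated. The 512-bit version behaves the same way across its
// four lanes; here is the 2-lane AVX2 demonstration (compile with -mavx2):
#include <immintrin.h>
#include <cassert>
#include <cstdint>

int main() {
    __m256i a = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
    __m256i b = _mm256_setr_epi32(8, 9, 10, 11, 12, 13, 14, 15);
    alignas(32) int16_t out[16];
    _mm256_store_si256((__m256i*)out, _mm256_packs_epi32(a, b));
    // low lane: a0..a3, b0..b3; high lane: a4..a7, b4..b7
    const int16_t expected[16] = {0, 1, 2, 3, 8,  9,  10, 11,
                                  4, 5, 6, 7, 12, 13, 14, 15};
    for (int i = 0; i < 16; i++)
        assert(out[i] == expected[i]);
    return 0;
}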
--------------------------------------------------------------------------------
/vxsort/vector_machine/avx512/i64.h:
--------------------------------------------------------------------------------
1 | template <>
2 | class vxsort_machine_traits<i64, AVX512> {
3 |  public:
4 |     typedef i64 T;
5 |     typedef __m512i TV;
6 |     typedef __mmask8 TLOADSTOREMASK;
7 |     typedef __mmask8 TCMPMASK;
8 |     typedef i32 TPACK;
9 |     typedef typename std::make_unsigned<TPACK>::type TUPACK;
10 |     typedef typename std::make_unsigned<T>::type TU;
11 |     static_assert(sizeof(TPACK)*2 == sizeof(T), "TPACK must be half-width of T");
12 |
13 |     static constexpr i32 N = sizeof(TV) / sizeof(T);
14 |     static_assert(is_powerof2(N), "vector-size / element-size must be a power of 2");
15 |
16 |     static constexpr bool supports_compress_writes() { return true; }
17 |     static constexpr bool supports_packing() { return true; }
18 |
19 |     template <i32 Shift>
20 |     static bool can_pack(T span) {
21 |         constexpr auto PACK_LIMIT = (((TU) std::numeric_limits<TPACK>::max() + 1)) << Shift;
22 |         return ((TU) span) < PACK_LIMIT;
23 |     }
24 |
25 |     static INLINE TLOADSTOREMASK generate_prefix_mask(i32 amount) {
26 |         assert(amount >= 0);
27 |         assert(amount <= N);
28 |         return 0xFF >> ((N - amount) & (N-1));
29 |     }
30 |
31 |     static INLINE TLOADSTOREMASK generate_suffix_mask(i32 amount) {
32 |         assert(amount >= 0);
33 |         assert(amount <= N);
34 |         return 0xFF << (amount & (N-1));
35 |     }
36 |
37 |     static INLINE TV load_vec(TV* p) { return _mm512_loadu_si512(p); }
38 |
39 |     static INLINE void store_vec(TV* ptr, TV v) { _mm512_storeu_si512(ptr, v); }
40 |
41 |     static INLINE TV load_partial_vec(TV *p, TV base, TLOADSTOREMASK mask) {
42 |         return _mm512_mask_loadu_epi64(base, mask, (i64 const *) p);
43 |     }
44 |
45 |     static INLINE void store_masked_vec(TV * p, TV v, TLOADSTOREMASK mask) {
46 |         _mm512_mask_storeu_epi64(p, mask, v);
47 |     }
48 |
49 |     // Will never be called
50 |     static INLINE TV partition_vector(TV v, i32 mask) { return v; }
51 |
52 |     static INLINE void store_compress_vec(TV* ptr, TV v, TCMPMASK mask) { _mm512_mask_compressstoreu_epi64(ptr, mask, v); }
53 |
54 |     static INLINE TV broadcast(T pivot) { return _mm512_set1_epi64(pivot); }
55 |
56 |     static INLINE TCMPMASK get_cmpgt_mask(TV a, TV b) { return _mm512_cmp_epi64_mask(a, b, _MM_CMPINT_GT); }
57 |
58 |     static INLINE TV shift_right(TV v, i32 i) { return _mm512_srli_epi64(v, i); }
59 |     static INLINE TV shift_left(TV v, i32 i) { return _mm512_slli_epi64(v, i); }
60 |
61 |     static INLINE TV add(TV a, TV b) { return _mm512_add_epi64(a, b); }
62 |     static INLINE TV sub(TV a, TV b) { return _mm512_sub_epi64(a, b); }
63 |
64 |     static INLINE TV pack_unordered(TV a, TV b) { return _mm512_mask_shuffle_epi32(a, 0b1010101010101010, b, _MM_PERM_CDAB); }
65 |
66 |     static INLINE void unpack_ordered(TV p, TV& u1, TV& u2) {
67 |         auto p01 = _mm512_extracti32x8_epi32(p, 0);
68 |         auto p02 = _mm512_extracti32x8_epi32(p, 1);
69 |
70 |         u1 = _mm512_cvtepi32_epi64(p01);
71 |         u2 = _mm512_cvtepi32_epi64(p02);
72 |     }
73 |
74 |     template <i32 Shift>
75 |     static INLINE T shift_n_sub(T v, T sub) {
76 |         if (Shift > 0)
77 |             v >>= Shift;
78 |         v -= sub;
79 |         return v;
80 |     }
81 |
82 |     template <i32 Shift>
83 |     static INLINE T unshift_and_add(TPACK from, T add) {
84 |         add += from;
85 |
86 |         if (Shift > 0)
87 |             add = (T) (((TU) add) << Shift);
88 |
89 |         return add;
90 |     }
91 |
92 | };
93 |
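// --- Editorial sketch, not part of the repository sources ---
// The 64->32 pack above: after shift_n_sub, each qword lane holds a
// value that fits in its low dword. Swapping adjacent dwords of b
// (_MM_PERM_CDAB, i.e. selector 2,3,0,1) moves b's payload dwords into
// the odd slots, and the alternating mask/blend merges them with a's
// even slots. AVX2 equivalent of the same dance (compile with -mavx2):
#include <immintrin.h>
#include <cassert>
#include <cstdint>

int main() {
    __m256i a  = _mm256_setr_epi64x(10, 11, 12, 13);  // packed values in low dwords
    __m256i b  = _mm256_setr_epi64x(20, 21, 22, 23);
    __m256i bs = _mm256_shuffle_epi32(b, _MM_SHUFFLE(2, 3, 0, 1));  // CDAB swap
    __m256i packed = _mm256_blend_epi32(a, bs, 0b10101010);
    alignas(32) int32_t out[8];
    _mm256_store_si256((__m256i*)out, packed);
    const int32_t expected[8] = {10, 20, 11, 21, 12, 22, 13, 23};
    for (int i = 0; i < 8; i++)
        assert(out[i] == expected[i]);
    return 0;
}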
--------------------------------------------------------------------------------
/vxsort/vector_machine/avx512/u16.h:
--------------------------------------------------------------------------------
1 | template <>
2 | class vxsort_machine_traits<u16, AVX512> {
3 |  public:
4 |     typedef u16 T;
5 |     typedef __m512i TV;
6 |     typedef __mmask32 TLOADSTOREMASK;
7 |     typedef __mmask32 TCMPMASK;
8 |     typedef u16 TPACK;
9 |     typedef typename std::make_unsigned<T>::type TU;
10 |
11 |     static constexpr i32 N = sizeof(TV) / sizeof(T);
12 |     static_assert(is_powerof2(N), "vector-size / element-size must be a power of 2");
13 |
14 |     static constexpr bool supports_compress_writes() { return true; }
15 |     static constexpr bool supports_packing() { return false; }
16 |
17 |     template <i32 Shift>
18 |     static bool can_pack(T) { return false; }
19 |
20 |     static INLINE TLOADSTOREMASK generate_prefix_mask(i32 amount) {
21 |         assert(amount >= 0);
22 |         assert(amount <= N);
23 |         return 0xFFFFFFFF >> ((N - amount) & (N-1));
24 |     }
25 |
26 |     static INLINE TLOADSTOREMASK generate_suffix_mask(i32 amount) {
27 |         assert(amount >= 0);
28 |         assert(amount <= N);
29 |         return 0xFFFFFFFF << (amount & (N-1));
30 |     }
31 |
32 |     static INLINE TV load_vec(TV* p) { return _mm512_loadu_si512(p); }
33 |
34 |     static INLINE void store_vec(TV* ptr, TV v) { _mm512_storeu_si512(ptr, v); }
35 |
36 |     static INLINE TV load_partial_vec(TV *ptr, TV base, TLOADSTOREMASK mask) {
37 |         return _mm512_mask_loadu_epi16(base, mask, (T const *) ptr);
38 |     }
39 |
40 |     static INLINE void store_masked_vec(TV * p, TV v, TLOADSTOREMASK mask) {
41 |         _mm512_mask_storeu_epi16(p, mask, v);
42 |     }
43 |
44 |     // Will never be called
45 |     static INLINE TV partition_vector(TV v, i32 mask) { return v; }
46 |
47 |     static INLINE void store_compress_vec(TV* ptr, TV v, TCMPMASK mask) { _mm512_mask_compressstoreu_epi16(ptr, mask, v); }
48 |
49 |     static INLINE TV broadcast(T pivot) { return _mm512_set1_epi16(pivot); }
50 |
51 |     static INLINE TCMPMASK get_cmpgt_mask(TV a, TV b) { return _mm512_cmp_epu16_mask(a, b, _MM_CMPINT_GT); }
52 |
53 |     static INLINE TV shift_right(TV v, i32 i) { return _mm512_srli_epi16(v, i); }
54 |     static INLINE TV shift_left(TV v, i32 i) { return _mm512_slli_epi16(v, i); }
55 |
56 |     static INLINE TV add(TV a, TV b) { return _mm512_add_epi16(a, b); }
57 |     static INLINE TV sub(TV a, TV b) { return _mm512_sub_epi16(a, b); }
58 |
59 |     static INLINE TV pack_unordered(TV a, TV b) { return a; }
60 |     static INLINE void unpack_ordered(TV p, TV& u1, TV& u2) { }
61 |
62 |     template <i32 Shift>
63 |     static INLINE T shift_n_sub(T v, T sub) {
64 |         if (Shift > 0)
65 |             v >>= Shift;
66 |         v -= sub;
67 |         return v;
68 |     }
69 |
70 |     template <i32 Shift>
71 |     static INLINE T unshift_and_add(TPACK from, T add) {
72 |         add += from;
73 |         if (Shift > 0)
74 |             add = (T) (((TU) add) << Shift);
75 |         return add;
76 |     }
77 | };
78 |
--------------------------------------------------------------------------------
/vxsort/vector_machine/avx512/u32.h:
--------------------------------------------------------------------------------
1 | template <>
2 | class vxsort_machine_traits<u32, AVX512> {
3 |  public:
4 |     typedef u32 T;
5 |     typedef __m512i TV;
6 |     typedef __mmask16 TLOADSTOREMASK;
7 |     typedef __mmask16 TCMPMASK;
8 |     typedef u16 TPACK;
9 |     typedef typename std::make_unsigned<TPACK>::type TUPACK;
10 |     typedef typename std::make_unsigned<T>::type TU;
11 |     static_assert(sizeof(TPACK)*2 == sizeof(T), "TPACK must be half-width of T");
12 |
13 |     static constexpr i32 N = sizeof(TV) / sizeof(T);
14 |     static_assert(is_powerof2(N), "vector-size / element-size must be a power of 2");
15 |
16 |     static constexpr bool supports_compress_writes() { return true; }
17 |     static constexpr bool supports_packing() { return true; }
18 |
19 |     template <i32 Shift>
20 |     static bool can_pack(T span) {
21 |         if (!supports_vector_machine<AVX512>(sizeof(TPACK))) {
22 |             return false;
23 |         }
24 |         constexpr auto PACK_LIMIT = (((TU)std::numeric_limits<TPACK>::max() + 1)) << Shift;
25 |         return ((TU)span) < PACK_LIMIT;
26 |     }
27 |
28 |     static INLINE TLOADSTOREMASK generate_prefix_mask(i32 amount) {
29 |         assert(amount >= 0);
30 |         assert(amount <= N);
31 |         return 0xFFFF >> ((N - amount) & (N-1));
32 |     }
33 |
34 |     static INLINE TLOADSTOREMASK generate_suffix_mask(i32 amount) {
35 |         assert(amount >= 0);
36 |         assert(amount <= N);
37 |         return 0xFFFF << (amount & (N-1));
38 |     }
39 |
40 |     static INLINE TV load_vec(TV* p) { return _mm512_loadu_si512(p); }
41 |
42 |     static INLINE void store_vec(TV* ptr, TV v) { _mm512_storeu_si512(ptr, v); }
43 |
44 |     static INLINE TV load_partial_vec(TV *p, TV base, TLOADSTOREMASK mask) {
45 |         return _mm512_mask_loadu_epi32(base, mask, (i32 const *) p);
46 |     }
47 |
48 |     static INLINE void store_masked_vec(TV *p, TV v, TLOADSTOREMASK mask) {
49 |         _mm512_mask_storeu_epi32(p, mask, v);
50 |     }
51 |
52 |     // Will never be called
53 |     static INLINE TV partition_vector(TV v, i32 mask) { return v; }
54 |
55 |     static INLINE void store_compress_vec(TV* ptr, TV v, TCMPMASK mask) { _mm512_mask_compressstoreu_epi32(ptr, mask, v); }
56 |
57 |     static INLINE TV broadcast(T pivot) { return _mm512_set1_epi32(pivot); }
58 |
59 |     static INLINE TCMPMASK get_cmpgt_mask(TV a, TV b) { return _mm512_cmp_epu32_mask(a, b, _MM_CMPINT_GT); }
60 |
61 |     static INLINE TV shift_right(TV v, i32 i) { return _mm512_srli_epi32(v, i); }
62 |     static INLINE TV shift_left(TV v, i32 i) { return _mm512_slli_epi32(v, i); }
63 |
64 |     static INLINE TV add(TV a, TV b) { return _mm512_add_epi32(a, b); }
65 |     static INLINE TV sub(TV a, TV b) { return _mm512_sub_epi32(a, b); }
66 |
67 |     static INLINE TV pack_unordered(TV a, TV b) { return _mm512_packus_epi32(a, b); }
68 |     static INLINE void unpack_ordered(TV p, TV& u1, TV& u2) {
69 |         u1 = _mm512_cvtepu16_epi32(_mm512_extracti32x8_epi32(p, 0));
70 |         u2 = _mm512_cvtepu16_epi32(_mm512_extracti32x8_epi32(p, 1));
71 |     }
72 |
73 |     template <i32 Shift>
74 |     static INLINE T shift_n_sub(T v, T sub) {
75 |         if (Shift > 0)
76 |             v >>= Shift;
77 |         v -= sub;
78 |         return v;
79 |     }
80 |
81 |     template <i32 Shift>
82 |     static INLINE T unshift_and_add(TPACK from, T add) {
83 |         add += from;
84 |         if (Shift > 0)
85 |             add = (T) (((TU) add) << Shift);
86 |         return add;
87 |     }
88 | };
89 |
--------------------------------------------------------------------------------
/vxsort/vector_machine/avx512/u64.h:
--------------------------------------------------------------------------------
1 | template <>
2 | class vxsort_machine_traits<u64, AVX512> {
3 |  public:
4 |     typedef u64 T;
5 |     typedef __m512i TV;
6 |     typedef __mmask8 TLOADSTOREMASK;
7 |     typedef __mmask8 TCMPMASK;
8 |     typedef u32 TPACK;
9 |     typedef typename std::make_unsigned<TPACK>::type TUPACK;
10 |     typedef typename std::make_unsigned<T>::type TU;
11 |     static_assert(sizeof(TPACK)*2 == sizeof(T), "TPACK must be half-width of T");
12 |
13 |     static constexpr i32 N = sizeof(TV) / sizeof(T);
14 |     static_assert(is_powerof2(N), "vector-size / element-size must be a power of 2");
15 |
16 |     static constexpr bool supports_compress_writes() { return true; }
17 |     static constexpr bool supports_packing() { return true; }
18 |
19 |     template <i32 Shift>
20 |     static bool can_pack(T span) {
21 |         constexpr auto PACK_LIMIT = (((TU) std::numeric_limits<TPACK>::max() + 1)) << Shift;
22 |         return ((TU) span) < PACK_LIMIT;
23 |     }
24 |
25 |     static INLINE TLOADSTOREMASK generate_prefix_mask(i32 amount) {
26 |         assert(amount >= 0);
27 |         assert(amount <= N);
28 |         return 0xFF >> ((N - amount) & (N-1));
29 |     }
30 |
31 |     static INLINE TLOADSTOREMASK generate_suffix_mask(i32 amount) {
32 |         assert(amount >= 0);
33 |         assert(amount <= N);
34 |         return 0xFF << (amount & (N-1));
35 |     }
36 |
37 |     static INLINE TV load_vec(TV* p) { return _mm512_loadu_si512(p); }
38 |
39 |     static INLINE void store_vec(TV* ptr, TV v) { _mm512_storeu_si512(ptr, v); }
40 |
41 |     static INLINE TV load_partial_vec(TV *p, TV base, TLOADSTOREMASK mask) {
42 |         return _mm512_mask_loadu_epi64(base, mask, (i64 const *) p);
43 |     }
44 |
45 |     static INLINE void store_masked_vec(TV * p, TV v, TLOADSTOREMASK mask) {
46 |         _mm512_mask_storeu_epi64(p, mask, v);
47 |     }
48 |
49 |     // Will never be called
50 |     static INLINE TV partition_vector(TV v, i32 mask) { return v; }
51 |
52 |     static INLINE void store_compress_vec(TV* ptr, TV v, TCMPMASK mask) { _mm512_mask_compressstoreu_epi64(ptr, mask, v); }
53 |
54 |     static INLINE TV broadcast(T pivot) { return _mm512_set1_epi64(pivot); }
55 |
56 |     static INLINE TCMPMASK get_cmpgt_mask(TV a, TV b) { return _mm512_cmp_epu64_mask(a, b, _MM_CMPINT_GT); }
57 |
58 |     static INLINE TV shift_right(TV v, i32 i) { return _mm512_srli_epi64(v, i); }
59 |     static INLINE TV shift_left(TV v, i32 i) { return _mm512_slli_epi64(v, i); }
60 |
61 |     static INLINE TV add(TV a, TV b) { return _mm512_add_epi64(a, b); }
62 |     static INLINE TV sub(TV a, TV b) { return _mm512_sub_epi64(a, b); }
63 |
64 |     static INLINE TV pack_unordered(TV a, TV b) { return _mm512_mask_shuffle_epi32(a, 0b1010101010101010, b, _MM_PERM_CDAB); }
65 |
66 |     static INLINE void unpack_ordered(TV p, TV& u1, TV& u2) {
67 |         auto p01 = _mm512_extracti32x8_epi32(p, 0);
68 |         auto p02 = _mm512_extracti32x8_epi32(p, 1);
69 |
70 |         u1 = _mm512_cvtepu32_epi64(p01);
71 |         u2 = _mm512_cvtepu32_epi64(p02);
72 |     }
73 |
74 |     template <i32 Shift>
75 |     static INLINE T shift_n_sub(T v, T sub) {
76 |         if (Shift > 0)
77 |             v >>= Shift;
78 |         v -= sub;
79 |         return v;
80 |     }
81 |
82 |     template <i32 Shift>
83 |     static INLINE T unshift_and_add(TPACK from, T add) {
84 |         add += from;
85 |
86 |         if (Shift > 0)
87 |             add = (T) (((TU) add) << Shift);
88 |
89 |         return add;
90 |     }
91 |
92 | };
93 |
--------------------------------------------------------------------------------
/vxsort/vector_machine/machine_traits.avx2.h:
--------------------------------------------------------------------------------
1 | #ifndef VXSORT_MACHINE_TRAITS_AVX2_H
2 | #define VXSORT_MACHINE_TRAITS_AVX2_H
3 |
4 | #include "vxsort_targets_enable_avx2.h"
5 |
6 | #include <immintrin.h>
7 | #include <stdexcept>
8 | #include <cassert>
9 | #include <cstdlib>
10 | #include <cstring>
11 | #include <array>
12 | #include <algorithm>
13 | #include <limits>
14 | #include <type_traits>
15 |
16 | #include "defs.h"
17 | #include "machine_traits.h"
18 |
19 | #define i2d _mm256_castsi256_pd
20 | #define d2i _mm256_castpd_si256
21 | #define i2s _mm256_castsi256_ps
22 | #define s2i _mm256_castps_si256
23 | #define s2d _mm256_castps_pd
24 | #define d2s _mm256_castpd_ps
25 |
26 | namespace vxsort {
27 | using namespace vxsort::types;
28 |
29 | // * We might read the last 4 bytes into a 128-bit vector for 64-bit element masking
30 | // * We might read the last 8 bytes into a 128-bit vector for 32-bit element masking
31 | // This mostly applies to debug mode, since without optimizations, most compilers
32 | // actually execute the instruction stream _mm256_cvtepi8_epiNN + _mm_loadu_si128 as they are given.
33 | // In contrast, release/optimizing compilers turn that very specific intrinsic pair into
34 | // a more reasonable vpmovsxbq ymm0, dword [rax*4 + prefix_mask_table_64b], eliminating the 128-bit
35 | // load completely and effectively reading exactly 4/8 bytes (depending on whether the instruction is
36 | // vpmovsxb[q,d]) without generating an out-of-bounds read at all.
37 | // But, life is harsh, and we can't trust the compiler to do the right thing if it is not
38 | // contractual, hence this flustercuck
39 | const i32 M4_SIZE = 16 + 4 + 12;
40 | const i32 M8_SIZE = 64 + 8 + 8;
41 |
42 | extern const u8 prefix_mask_table_64b[M4_SIZE];
43 | extern const u8 prefix_mask_table_32b[M8_SIZE];
44 |
45 | extern const u8 suffix_mask_table_64b[M4_SIZE];
46 | extern const u8 suffix_mask_table_32b[M8_SIZE];
47 |
48 | extern const i8 perm_table_64[128];
49 | extern const i8 perm_table_32[2048];
50 |
51 | #include "avx2/f64.h"
52 | #include "avx2/f32.h"
53 | #include "avx2/i16.h"
54 | #include "avx2/i32.h"
55 | #include "avx2/i64.h"
56 | #include "avx2/u16.h"
57 | #include "avx2/u32.h"
58 | #include "avx2/u64.h"
59 | }
60 |
61 | #undef i2d
62 | #undef d2i
63 | #undef i2s
64 | #undef s2i
65 | #undef s2d
66 | #undef d2s
67 |
68 | #include "vxsort_targets_disable.h"
69 | #endif // VXSORT_MACHINE_TRAITS_AVX2_H
70 |
--------------------------------------------------------------------------------
/vxsort/vector_machine/machine_traits.avx512.h:
--------------------------------------------------------------------------------
1 |
2 | #ifndef VXSORT_MACHINE_TRAITS_AVX512_H
3 | #define VXSORT_MACHINE_TRAITS_AVX512_H
4 |
5 | #include "vxsort_targets_enable_avx512.h"
6 |
7 | #include <immintrin.h>
8 | #include <cassert>
9 | #include <limits>
10 | #include <type_traits>
11 | #include "defs.h"
12 | #include "isa_detection.h"
13 | #include "machine_traits.h"
14 |
15 | namespace vxsort {
16 | using namespace vxsort::types;
17 |
18 | #include "avx512/f64.h"
19 | #include "avx512/f32.h"
20 | #include "avx512/i16.h"
21 | #include "avx512/i32.h"
22 | #include "avx512/i64.h"
23 | #include "avx512/u16.h"
24 | #include "avx512/u32.h"
25 | #include "avx512/u64.h"
26 | }
27 |
28 | #include "vxsort_targets_disable.h"
29 | #endif // VXSORT_MACHINE_TRAITS_AVX512_H
30 |
--------------------------------------------------------------------------------
/vxsort/vxsort.avx2.h:
--------------------------------------------------------------------------------
1 | #ifndef VXSORT_VXSORT_AVX2_H
2 | #define VXSORT_VXSORT_AVX2_H
3 |
4 | #include "vector_machine/machine_traits.avx2.h"
5 | #include "smallsort/avx2/bitonic_machine.avx2.h"
6 | #include "partition_machine.avx2.h"
7 |
8 |
9 | #include "vxsort.h"
10 |
11 | #endif //VXSORT_VXSORT_AVX2_H
12 |
--------------------------------------------------------------------------------
/vxsort/vxsort.avx512.h:
--------------------------------------------------------------------------------
1 | #ifndef VXSORT_VXSORT_AVX512_H
2 | #define VXSORT_VXSORT_AVX512_H
3 |
4 | #include "vector_machine/machine_traits.avx512.h"
5 | #include "smallsort/avx512/bitonic_machine.avx512.h"
6 | #include "partition_machine.avx512.h"
7 |
8 | #include "vxsort.h"
9 |
10 | #endif //VXSORT_VXSORT_AVX512_H
11 |
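// --- Editorial sketch, not part of the repository sources ---
// How generic code consumes these traits: one template picks up the
// right vector width, mask type and intrinsics from the specialization
// selected by (T, vector_machine). A minimal, hypothetical kernel that
// counts lanes above a pivot in one vector; it assumes the
// vector_machine enumerators (AVX2/AVX512) live directly in namespace
// vxsort, as the specializations above suggest. Compile with the
// matching -m flags or inside a targets_enable/disable bracket.
#include "vxsort.avx2.h"

template <typename T, vxsort::vector_machine M>
int count_gt_pivot_one_vec(T* p, T pivot) {
    using MT = vxsort::vxsort_machine_traits<T, M>;
    auto v = MT::load_vec((typename MT::TV*)p);
    auto mask = MT::get_cmpgt_mask(v, MT::broadcast(pivot));
    return __builtin_popcount(mask);  // one mask bit per lane
}
// e.g. count_gt_pivot_one_vec<vxsort::types::i32, vxsort::AVX2>(data, 42);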
--------------------------------------------------------------------------------
/vxsort/vxsort_targets_disable.h:
--------------------------------------------------------------------------------
1 | #include "compiler.h"
2 |
3 | #ifdef VXSORT_TARGET_PUSHED
4 |
5 | #if defined(VXSORT_COMPILER_CLANG) || defined(VXSORT_COMPILER_CLANGCL)
6 | #pragma clang attribute pop
7 | #endif
8 |
9 | #if defined(VXSORT_COMPILER_GCC)
10 | #pragma GCC pop_options
11 | #endif
12 |
13 |
14 | #endif
15 |
--------------------------------------------------------------------------------
/vxsort/vxsort_targets_enable_avx2.h:
--------------------------------------------------------------------------------
1 | #include "compiler.h"
2 |
3 | #if defined(VXSORT_COMPILER_CLANG) || defined(VXSORT_COMPILER_CLANGCL)
4 | #define VXSORT_TARGET_PUSHED 1
5 | #pragma clang attribute push (__attribute__((target("avx2,popcnt,bmi2"))), apply_to = any(function))
6 | #endif
7 |
8 | #if defined(VXSORT_COMPILER_GCC)
9 | #define VXSORT_TARGET_PUSHED 1
10 | #pragma GCC push_options
11 | #pragma GCC target("avx2,popcnt,bmi2")
12 | #endif
13 |
--------------------------------------------------------------------------------
/vxsort/vxsort_targets_enable_avx512.h:
--------------------------------------------------------------------------------
1 | #include "compiler.h"
2 |
3 | #if defined(VXSORT_COMPILER_CLANG) || defined(VXSORT_COMPILER_CLANGCL)
4 | #define VXSORT_TARGET_PUSHED 1
5 | #pragma clang attribute push (__attribute__((target("avx512f,avx512dq,avx512bw,avx512vbmi2,popcnt"))), apply_to = any(function))
6 | #endif
7 |
8 | #if defined(VXSORT_COMPILER_GCC)
9 | #define VXSORT_TARGET_PUSHED 1
10 | #pragma GCC push_options
11 | #pragma GCC target("avx512f,avx512dq,avx512bw,avx512vbmi2,popcnt")
12 | #endif
13 |
--------------------------------------------------------------------------------
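// --- Editorial sketch, not part of the repository sources ---
// The intended usage of the enable/disable pair above: bracket a
// translation unit's AVX2 definitions so clang/gcc compile only those
// functions with the wider target (avx2,popcnt,bmi2), while the rest of
// the binary keeps the baseline ISA. A minimal, hypothetical consumer:
#include "vxsort_targets_enable_avx2.h"

#include <immintrin.h>

// Compiled under the pushed avx2 target from the header above.
int popcount_256(__m256i v) {
    alignas(32) unsigned long long q[4];
    _mm256_store_si256((__m256i*)q, v);
    return __builtin_popcountll(q[0]) + __builtin_popcountll(q[1]) +
           __builtin_popcountll(q[2]) + __builtin_popcountll(q[3]);
}

#include "vxsort_targets_disable.h"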