├── .clang-format ├── .clang-tidy.yml ├── .editorconfig ├── .github └── workflows │ └── build-and-test.yml ├── .gitignore ├── CMakeLists.txt ├── CMakeSettings.json ├── Folder.DotSettings ├── LICENSE ├── README.md ├── bench ├── CMakeLists.txt ├── bench.cpp ├── bench_isa.h ├── fullsort │ ├── BM_fullsort.pdqsort.cpp │ ├── BM_fullsort.stdsort.cpp │ ├── BM_fullsort.vxsort.avx2.f.cpp │ ├── BM_fullsort.vxsort.avx2.i.cpp │ ├── BM_fullsort.vxsort.avx2.u.cpp │ ├── BM_fullsort.vxsort.avx512.f.cpp │ ├── BM_fullsort.vxsort.avx512.i.cpp │ ├── BM_fullsort.vxsort.avx512.u.cpp │ ├── BM_fullsort.vxsort.h │ ├── BM_fullsort_strided.avx2.cpp │ ├── BM_fullsort_strided.avx512.cpp │ └── fullsort_params.h ├── internal_macros.h ├── make-figure.py ├── prep.sh ├── reference │ └── pdqsort.h ├── requirements.txt ├── run.cmd ├── run.sh ├── smallsort │ ├── BM_blacher.avx2.cpp │ ├── BM_smallsort.avx2.cpp │ ├── BM_smallsort.avx512.cpp │ └── BM_smallsort.h ├── stolen-cycleclock.h ├── util.cpp └── util.h ├── build.sh ├── clang-tidy.sh ├── cmake ├── CPM.cmake ├── ConfigSafeGuards.cmake ├── EnableLocalGtestDiscovery.cmake ├── GetHostType.cmake └── Modules │ ├── FindLLVMAr.cmake │ ├── FindLLVMNm.cmake │ └── FindLLVMRanLib.cmake ├── demo ├── CMakeLists.txt ├── demo.cpp ├── do_avx2.cpp └── do_avx512.cpp ├── tests ├── CMakeLists.txt ├── fullsort │ ├── fullsort.avx2.cpp │ ├── fullsort.avx512.cpp │ └── fullsort_test.h ├── gtest_main.cpp ├── mini_tests │ ├── masked_load_store.avx2.cpp │ ├── masked_load_store.avx512.cpp │ ├── masked_load_store.sanity.cpp │ ├── masked_load_store_test.h │ ├── mini_fixtures.h │ ├── pack_machine.avx2.cpp │ ├── pack_machine.avx512.cpp │ ├── pack_machine_test.h │ ├── partition_machine.avx2.cpp │ ├── partition_machine.avx512.cpp │ └── partition_machine_test.h ├── smallsort │ ├── smallsort.avx2.cpp │ ├── smallsort.avx512.cpp │ └── smallsort_test.h ├── sort_fixtures.h ├── test_isa.h └── util.h └── vxsort ├── CMakeLists.txt ├── alignment.h ├── compiler.h ├── defs.h ├── isa_detection.cpp ├── isa_detection.h ├── isa_detection_sane.cpp ├── pack_machine.h ├── partition_machine.avx2.h ├── partition_machine.avx512.h ├── partition_machine.h ├── smallsort ├── avx2 │ ├── bitonic_machine.avx2.f32.generated.h │ ├── bitonic_machine.avx2.f64.generated.h │ ├── bitonic_machine.avx2.h │ ├── bitonic_machine.avx2.i16.generated.h │ ├── bitonic_machine.avx2.i32.generated.h │ ├── bitonic_machine.avx2.i64.generated.h │ ├── bitonic_machine.avx2.u16.generated.h │ ├── bitonic_machine.avx2.u32.generated.h │ └── bitonic_machine.avx2.u64.generated.h ├── avx512 │ ├── bitonic_machine.avx512.f32.generated.h │ ├── bitonic_machine.avx512.f64.generated.h │ ├── bitonic_machine.avx512.h │ ├── bitonic_machine.avx512.i16.generated.h │ ├── bitonic_machine.avx512.i32.generated.h │ ├── bitonic_machine.avx512.i64.generated.h │ ├── bitonic_machine.avx512.u16.generated.h │ ├── bitonic_machine.avx512.u32.generated.h │ └── bitonic_machine.avx512.u64.generated.h ├── bitonic_machine.h ├── bitonic_sort.avx2.h ├── bitonic_sort.avx512.h ├── bitonic_sort.h └── codegen │ ├── avx2.py │ ├── avx512.py │ ├── bitonic_gen.py │ ├── bitonic_isa.py │ └── utils.py ├── stats ├── vxsort_stats.cpp └── vxsort_stats.h ├── vector_machine ├── avx2 │ ├── avx2_masks.cpp │ ├── f32.h │ ├── f64.h │ ├── i16.h │ ├── i32.h │ ├── i64.h │ ├── u16.h │ ├── u32.h │ └── u64.h ├── avx512 │ ├── f32.h │ ├── f64.h │ ├── i16.h │ ├── i32.h │ ├── i64.h │ ├── u16.h │ ├── u32.h │ └── u64.h ├── machine_traits.avx2.h ├── machine_traits.avx512.h └── machine_traits.h ├── vxsort.avx2.h ├── vxsort.avx512.h 
├── vxsort.h ├── vxsort_targets_disable.h ├── vxsort_targets_enable_avx2.h └── vxsort_targets_enable_avx512.h /.clang-format: -------------------------------------------------------------------------------- 1 | --- 2 | BasedOnStyle: Chromium 3 | 4 | --- 5 | Language: Cpp 6 | ColumnLimit: 160 7 | IndentWidth: 4 8 | 9 | ... 10 |
-------------------------------------------------------------------------------- /.clang-tidy.yml: -------------------------------------------------------------------------------- 1 | --- 2 | Checks: >- 3 | clang-diagnostic-*, 4 | clang-analyzer-*, 5 | performance-*, 6 | portability-*, 7 | -portability-simd-intrinsics, 8 | bugprone-*, 9 | WarningsAsErrors: '' 10 | HeaderFilterRegex: '' 11 | AnalyzeTemporaryDtors: false 12 | FormatStyle: none 13 | User: dmg 14 | CheckOptions: 15 | - key: llvm-else-after-return.WarnOnConditionVariables 16 | value: '0' 17 | - key: modernize-loop-convert.MinConfidence 18 | value: reasonable 19 | - key: modernize-replace-auto-ptr.IncludeStyle 20 | value: llvm 21 | - key: cert-str34-c.DiagnoseSignedUnsignedCharComparisons 22 | value: '0' 23 | - key: google-readability-namespace-comments.ShortNamespaceLines 24 | value: '10' 25 | - key: cert-oop54-cpp.WarnOnlyIfThisHasSuspiciousField 26 | value: '0' 27 | - key: cppcoreguidelines-non-private-member-variables-in-classes.IgnoreClassesWithAllMemberVariablesBeingPublic 28 | value: '1' 29 | - key: cert-dcl16-c.NewSuffixes 30 | value: 'L;LL;LU;LLU' 31 | - key: google-readability-braces-around-statements.ShortStatementLines 32 | value: '1' 33 | - key: modernize-pass-by-value.IncludeStyle 34 | value: llvm 35 | - key: google-readability-namespace-comments.SpacesBeforeComments 36 | value: '2' 37 | - key: modernize-loop-convert.MaxCopySize 38 | value: '16' 39 | - key: cppcoreguidelines-explicit-virtual-functions.IgnoreDestructors 40 | value: '1' 41 | - key: modernize-use-nullptr.NullMacros 42 | value: 'NULL' 43 | - key: llvm-qualified-auto.AddConstToQualified 44 | value: '0' 45 | - key: modernize-loop-convert.NamingStyle 46 | value: CamelCase 47 | - key: llvm-else-after-return.WarnOnUnfixable 48 | value: '0' 49 | - key: google-readability-function-size.StatementThreshold 50 | value: '800' 51 |
-------------------------------------------------------------------------------- /.editorconfig: -------------------------------------------------------------------------------- 1 | # top-most EditorConfig file 2 | root = true 3 | 4 | # Unix-style newlines with a newline ending every file 5 | [*] 6 | charset = utf-8 7 | trim_trailing_whitespace = true 8 | end_of_line = lf 9 | insert_final_newline = true 10 | 11 | # Tab indentation (no size specified) 12 | [Makefile] 13 | indent_style = tab 14 | 15 | [*.{c,h,cpp,hpp}] 16 | indent_size = 4
-------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .idea 2 | build/ 3 | __pycache__ 4 | .vs 5 |
-------------------------------------------------------------------------------- /CMakeSettings.json: -------------------------------------------------------------------------------- 1 | { 2 | "configurations": [ 3 | { 4 | "name": "vs2022-clang-debug", 5 | "generator": "Ninja", 6 | "configurationType": "Debug", 7 | "inheritEnvironments": [ "clang_cl_x64" ], 8 | "buildRoot": "${projectDir}\\build\\${name}", 9 | "installRoot": "${projectDir}\\install\\vs2022-${name}", 10 | "cmakeCommandArgs": "", 11 | "buildCommandArgs": "", 12 | "ctestCommandArgs": "" 13 | }, 14
| { 15 | "name": "vs2022-clang-release", 16 | "generator": "Ninja", 17 | "configurationType": "Release", 18 | "buildRoot": "${projectDir}\\build\\${name}", 19 | "installRoot": "${projectDir}\\install\\vs2022-${name}", 20 | "cmakeCommandArgs": "", 21 | "buildCommandArgs": "", 22 | "ctestCommandArgs": "", 23 | "inheritEnvironments": [ "clang_cl_x64" ] 24 | }, 25 | { 26 | "name": "vs2022-msvc-debug", 27 | "generator": "Ninja", 28 | "configurationType": "Debug", 29 | "buildRoot": "${projectDir}\\build\\${name}", 30 | "installRoot": "${projectDir}\\install\\vs2022-${name}", 31 | "cmakeCommandArgs": "", 32 | "buildCommandArgs": "", 33 | "ctestCommandArgs": "", 34 | "inheritEnvironments": [ "msvc_x64_x64" ] 35 | }, 36 | { 37 | "name": "vs2022-msvc-release", 38 | "generator": "Ninja", 39 | "configurationType": "Release", 40 | "buildRoot": "${projectDir}\\build\\${name}", 41 | "installRoot": "${projectDir}\\install\\vs2022-${name}", 42 | "cmakeCommandArgs": "", 43 | "buildCommandArgs": "", 44 | "ctestCommandArgs": "", 45 | "inheritEnvironments": [ "msvc_x64_x64" ] 46 | }, 47 | { 48 | "name": "wsl-clang-debug", 49 | "generator": "Ninja", 50 | "configurationType": "Debug", 51 | "buildRoot": "${projectDir}\\build\\${name}", 52 | "installRoot": "${projectDir}\\out\\install\\${name}", 53 | "cmakeExecutable": "cmake", 54 | "cmakeCommandArgs": "", 55 | "buildCommandArgs": "", 56 | "ctestCommandArgs": "", 57 | "inheritEnvironments": [ "linux_clang_x64" ], 58 | "wslPath": "${defaultWSLPath}" 59 | }, 60 | { 61 | "name": "wsl-clang-release", 62 | "generator": "Ninja", 63 | "configurationType": "Release", 64 | "buildRoot": "${projectDir}\\build\\${name}", 65 | "installRoot": "${projectDir}\\out\\install\\${name}", 66 | "cmakeExecutable": "cmake", 67 | "cmakeCommandArgs": "", 68 | "buildCommandArgs": "", 69 | "ctestCommandArgs": "", 70 | "inheritEnvironments": [ "linux_clang_x64" ], 71 | "variables": [], 72 | "wslPath": "${defaultWSLPath}" 73 | } 74 | ] 75 | } -------------------------------------------------------------------------------- /Folder.DotSettings: -------------------------------------------------------------------------------- 1 |  2 | ExplicitlyExcluded 3 | ExplicitlyExcluded 4 | ExplicitlyExcluded 5 | ExplicitlyExcluded -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Dan Shechter 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 |
-------------------------------------------------------------------------------- /bench/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set(TARGET_NAME ${CMAKE_PROJECT_NAME}_bench) 2 | 3 | 4 | find_package(Threads REQUIRED) 5 | 6 | file(GLOB_RECURSE bench_sources *.cpp) 7 | file(GLOB_RECURSE bench_headers *.h) 8 | add_executable(${CMAKE_PROJECT_NAME}_bench ${bench_sources} ${bench_headers}) 9 | 10 | target_link_libraries(${TARGET_NAME} 11 | ${CMAKE_PROJECT_NAME}_lib 12 | benchmark 13 | picosha2 14 | ${CMAKE_THREAD_LIBS_INIT}) 15 | 16 | configure_file(run.sh run.sh COPYONLY) 17 | configure_file(run.cmd run.cmd COPYONLY) 18 | configure_file(make-figure.py make-figure.py COPYONLY) 19 |
-------------------------------------------------------------------------------- /bench/bench.cpp: -------------------------------------------------------------------------------- 1 | #include "benchmark/benchmark.h" 2 | 3 | using namespace std; 4 | 5 | int main(int argc, char** argv) 6 | { 7 | ::benchmark::Initialize(&argc, argv); 8 | ::benchmark::RunSpecifiedBenchmarks(); 9 | }
-------------------------------------------------------------------------------- /bench/bench_isa.h: -------------------------------------------------------------------------------- 1 | #ifndef VXSORT_BENCH_ISA_H 2 | #define VXSORT_BENCH_ISA_H 3 | 4 | #include <isa_detection.h> 5 | 6 | #define VXSORT_BENCH_ISA() \ 7 | if (!::vxsort::supports_vector_machine(sizeof(Q))) { \ 8 | state.SkipWithError("Current CPU does not support the minimal features for this benchmark"); \ 9 | return; \ 10 | } 11 | 12 | #endif //VXSORT_BENCH_ISA_H 13 |
-------------------------------------------------------------------------------- /bench/fullsort/BM_fullsort.pdqsort.cpp: -------------------------------------------------------------------------------- 1 | #include "vxsort_targets_enable_avx2.h" 2 | 3 | #include <benchmark/benchmark.h> 4 | #include <vector> 5 | 6 | #include "fullsort_params.h" 7 | #include "../util.h" 8 | #include "../reference/pdqsort.h" 9 | 10 | namespace vxsort_bench { 11 | using namespace vxsort::types; 12 | 13 | template <typename Q> 14 | static void BM_pdqsort_branchless(benchmark::State& state) { 15 | auto n = state.range(0); 16 | auto v = std::vector<Q>((i32)n); 17 | const auto ITERATIONS = 10; 18 | 19 | generate_unique_values_vec(v, (Q)0x1000, (Q)8); 20 | auto copies = generate_copies(ITERATIONS, n, v); 21 | auto begins = generate_array_beginnings(copies); 22 | auto ends = generate_array_beginnings(copies); 23 | for (usize i = 0; i < copies.size(); i++) 24 | ends[i] = begins[i] + n - 1; 25 | 26 | vxsort::u64 total_cycles = 0; 27 | for (auto _ : state) { 28 | state.PauseTiming(); 29 | refresh_copies(copies, v); 30 | state.ResumeTiming(); 31 | auto start = cycleclock::Now(); 32 | for (auto i = 0; i < ITERATIONS; i++) { 33 | pdqsort_branchless(begins[i], ends[i]); 34 | } 35 | total_cycles += (cycleclock::Now() - start); 36 | } 37 | 38 | state.SetLabel(get_crypto_hash(begins[0], ends[0])); 39 | state.counters["Time/N"] = make_time_per_n_counter(n * ITERATIONS); 40 | state.SetBytesProcessed(state.iterations() * n * ITERATIONS * sizeof(Q)); 41 | process_perf_counters(state.counters, n * ITERATIONS); 42 | if (!state.counters.contains("cycles/N")) 43 |
state.counters["rdtsc-cycles/N"] = make_cycle_per_n_counter((f64)total_cycles / (f64)(n * ITERATIONS * state.iterations())); 44 | } 45 | 46 | BENCHMARK_TEMPLATE(BM_pdqsort_branchless, i16)->RangeMultiplier(2)->Range(MIN_SORT, MAX_SORT)->Unit(benchmark::kMillisecond)->ThreadRange(1, processor_count); 47 | BENCHMARK_TEMPLATE(BM_pdqsort_branchless, u16)->RangeMultiplier(2)->Range(MIN_SORT, MAX_SORT)->Unit(benchmark::kMillisecond)->ThreadRange(1, processor_count); 48 | BENCHMARK_TEMPLATE(BM_pdqsort_branchless, i32)->RangeMultiplier(2)->Range(MIN_SORT, MAX_SORT)->Unit(benchmark::kMillisecond)->ThreadRange(1, processor_count); 49 | BENCHMARK_TEMPLATE(BM_pdqsort_branchless, u32)->RangeMultiplier(2)->Range(MIN_SORT, MAX_SORT)->Unit(benchmark::kMillisecond)->ThreadRange(1, processor_count); 50 | BENCHMARK_TEMPLATE(BM_pdqsort_branchless, f32)->RangeMultiplier(2)->Range(MIN_SORT, MAX_SORT)->Unit(benchmark::kMillisecond)->ThreadRange(1, processor_count); 51 | BENCHMARK_TEMPLATE(BM_pdqsort_branchless, i64)->RangeMultiplier(2)->Range(MIN_SORT, MAX_SORT)->Unit(benchmark::kMillisecond)->ThreadRange(1, processor_count); 52 | BENCHMARK_TEMPLATE(BM_pdqsort_branchless, u64)->RangeMultiplier(2)->Range(MIN_SORT, MAX_SORT)->Unit(benchmark::kMillisecond)->ThreadRange(1, processor_count); 53 | BENCHMARK_TEMPLATE(BM_pdqsort_branchless, f64)->RangeMultiplier(2)->Range(MIN_SORT, MAX_SORT)->Unit(benchmark::kMillisecond)->ThreadRange(1, processor_count); 54 | } 55 | 56 | #include "vxsort_targets_disable.h" 57 |
-------------------------------------------------------------------------------- /bench/fullsort/BM_fullsort.stdsort.cpp: -------------------------------------------------------------------------------- 1 | #include "vxsort_targets_enable_avx2.h" 2 | 3 | #include <benchmark/benchmark.h> 4 | #include <algorithm> 5 | 6 | #include "fullsort_params.h" 7 | #include "../util.h" 8 | 9 | namespace vxsort_bench { 10 | using namespace vxsort::types; 11 | 12 | 13 | template <typename Q> 14 | static void BM_stdsort(benchmark::State& state) { 15 | auto n = state.range(0); 16 | auto v = std::vector<Q>((i32)n); 17 | const auto ITERATIONS = 10; 18 | 19 | generate_unique_values_vec(v, (Q)0x1000, (Q)8); 20 | auto copies = generate_copies(ITERATIONS, n, v); 21 | auto begins = generate_array_beginnings(copies); 22 | auto ends = generate_array_beginnings(copies); 23 | for (usize i = 0; i < copies.size(); i++) 24 | ends[i] = begins[i] + n - 1; 25 | 26 | vxsort::u64 total_cycles = 0; 27 | for (auto _ : state) { 28 | state.PauseTiming(); 29 | refresh_copies(copies, v); 30 | state.ResumeTiming(); 31 | auto start = cycleclock::Now(); 32 | for (auto i = 0; i < ITERATIONS; i++) { 33 | std::sort(begins[i], ends[i]); 34 | } 35 | total_cycles += (cycleclock::Now() - start); 36 | } 37 | 38 | state.SetLabel(get_crypto_hash(begins[0], ends[0])); 39 | state.counters["Time/N"] = make_time_per_n_counter(n * ITERATIONS); 40 | state.SetBytesProcessed(state.iterations() * n * ITERATIONS * sizeof(Q)); 41 | process_perf_counters(state.counters, n * ITERATIONS); 42 | if (!state.counters.contains("cycles/N")) 43 | state.counters["rdtsc-cycles/N"] = make_cycle_per_n_counter((f64)total_cycles / (f64)(n * ITERATIONS * state.iterations())); 44 | } 45 | 46 | BENCHMARK_TEMPLATE(BM_stdsort, i16)->RangeMultiplier(2)->Range(MIN_SORT, MAX_SORT)->Unit(benchmark::kMillisecond)->ThreadRange(1, processor_count); 47 | BENCHMARK_TEMPLATE(BM_stdsort, u16)->RangeMultiplier(2)->Range(MIN_SORT, MAX_SORT)->Unit(benchmark::kMillisecond)->ThreadRange(1, processor_count); 48 | BENCHMARK_TEMPLATE(BM_stdsort,
i32)->RangeMultiplier(2)->Range(MIN_SORT, MAX_SORT)->Unit(benchmark::kMillisecond)->ThreadRange(1, processor_count); 49 | BENCHMARK_TEMPLATE(BM_stdsort, u32)->RangeMultiplier(2)->Range(MIN_SORT, MAX_SORT)->Unit(benchmark::kMillisecond)->ThreadRange(1, processor_count); 50 | BENCHMARK_TEMPLATE(BM_stdsort, f32)->RangeMultiplier(2)->Range(MIN_SORT, MAX_SORT)->Unit(benchmark::kMillisecond)->ThreadRange(1, processor_count); 51 | BENCHMARK_TEMPLATE(BM_stdsort, i64)->RangeMultiplier(2)->Range(MIN_SORT, MAX_SORT)->Unit(benchmark::kMillisecond)->ThreadRange(1, processor_count); 52 | BENCHMARK_TEMPLATE(BM_stdsort, u64)->RangeMultiplier(2)->Range(MIN_SORT, MAX_SORT)->Unit(benchmark::kMillisecond)->ThreadRange(1, processor_count); 53 | BENCHMARK_TEMPLATE(BM_stdsort, f64)->RangeMultiplier(2)->Range(MIN_SORT, MAX_SORT)->Unit(benchmark::kMillisecond)->ThreadRange(1, processor_count); 54 | } 55 | 56 | #include "vxsort_targets_disable.h" 57 |
-------------------------------------------------------------------------------- /bench/fullsort/BM_fullsort.vxsort.avx2.f.cpp: -------------------------------------------------------------------------------- 1 | #include "vxsort_targets_enable_avx2.h" 2 | 3 | #include <benchmark/benchmark.h> 4 | #include <random> 5 | 6 | #include <vxsort.avx2.h> 7 | 8 | #include "BM_fullsort.vxsort.h" 9 | 10 | namespace vxsort_bench { 11 | using namespace vxsort::types; 12 | using benchmark::TimeUnit; 13 | using vm = vxsort::vector_machine; 14 | 15 | BENCHMARK_TEMPLATE(BM_vxsort, f32, vm::AVX2, 1)->RangeMultiplier(2)->Range(MIN_SORT, MAX_SORT)->Unit(kMillisecond)->ThreadRange(1, processor_count); 16 | BENCHMARK_TEMPLATE(BM_vxsort, f32, vm::AVX2, 2)->RangeMultiplier(2)->Range(MIN_SORT, MAX_SORT)->Unit(kMillisecond)->ThreadRange(1, processor_count); 17 | BENCHMARK_TEMPLATE(BM_vxsort, f32, vm::AVX2, 4)->RangeMultiplier(2)->Range(MIN_SORT, MAX_SORT)->Unit(kMillisecond)->ThreadRange(1, processor_count); 18 | BENCHMARK_TEMPLATE(BM_vxsort, f32, vm::AVX2, 8)->RangeMultiplier(2)->Range(MIN_SORT, MAX_SORT)->Unit(kMillisecond)->ThreadRange(1, processor_count); 19 | 20 | BENCHMARK_TEMPLATE(BM_vxsort, f64, vm::AVX2, 1)->RangeMultiplier(2)->Range(MIN_SORT, MAX_SORT)->Unit(kMillisecond)->ThreadRange(1, processor_count); 21 | BENCHMARK_TEMPLATE(BM_vxsort, f64, vm::AVX2, 2)->RangeMultiplier(2)->Range(MIN_SORT, MAX_SORT)->Unit(kMillisecond)->ThreadRange(1, processor_count); 22 | BENCHMARK_TEMPLATE(BM_vxsort, f64, vm::AVX2, 4)->RangeMultiplier(2)->Range(MIN_SORT, MAX_SORT)->Unit(kMillisecond)->ThreadRange(1, processor_count); 23 | BENCHMARK_TEMPLATE(BM_vxsort, f64, vm::AVX2, 8)->RangeMultiplier(2)->Range(MIN_SORT, MAX_SORT)->Unit(kMillisecond)->ThreadRange(1, processor_count); 24 | 25 | } 26 | 27 | #include "vxsort_targets_disable.h" 28 |
-------------------------------------------------------------------------------- /bench/fullsort/BM_fullsort.vxsort.avx2.i.cpp: -------------------------------------------------------------------------------- 1 | #include "vxsort_targets_enable_avx2.h" 2 | 3 | #include <benchmark/benchmark.h> 4 | #include <random> 5 | 6 | #include <vxsort.avx2.h> 7 | 8 | #include "BM_fullsort.vxsort.h" 9 | 10 | namespace vxsort_bench { 11 | using namespace vxsort::types; 12 | using benchmark::TimeUnit; 13 | using vm = vxsort::vector_machine; 14 | 15 | BENCHMARK_TEMPLATE(BM_vxsort, i16, vm::AVX2, 1)->RangeMultiplier(2)->Range(MIN_SORT, MAX_SORT)->Unit(kMillisecond)->ThreadRange(1, processor_count); 16 | BENCHMARK_TEMPLATE(BM_vxsort, i16, vm::AVX2, 2)->RangeMultiplier(2)->Range(MIN_SORT, MAX_SORT)->Unit(kMillisecond)->ThreadRange(1, processor_count); 17 |
BENCHMARK_TEMPLATE(BM_vxsort, i16, vm::AVX2, 4)->RangeMultiplier(2)->Range(MIN_SORT, MAX_SORT)->Unit(kMillisecond)->ThreadRange(1, processor_count); 18 | BENCHMARK_TEMPLATE(BM_vxsort, i16, vm::AVX2, 8)->RangeMultiplier(2)->Range(MIN_SORT, MAX_SORT)->Unit(kMillisecond)->ThreadRange(1, processor_count); 19 | 20 | BENCHMARK_TEMPLATE(BM_vxsort, i32, vm::AVX2, 1)->RangeMultiplier(2)->Range(MIN_SORT, MAX_SORT)->Unit(kMillisecond)->ThreadRange(1, processor_count); 21 | BENCHMARK_TEMPLATE(BM_vxsort, i32, vm::AVX2, 2)->RangeMultiplier(2)->Range(MIN_SORT, MAX_SORT)->Unit(kMillisecond)->ThreadRange(1, processor_count); 22 | BENCHMARK_TEMPLATE(BM_vxsort, i32, vm::AVX2, 4)->RangeMultiplier(2)->Range(MIN_SORT, MAX_SORT)->Unit(kMillisecond)->ThreadRange(1, processor_count); 23 | BENCHMARK_TEMPLATE(BM_vxsort, i32, vm::AVX2, 8)->RangeMultiplier(2)->Range(MIN_SORT, MAX_SORT)->Unit(kMillisecond)->ThreadRange(1, processor_count); 24 | 25 | BENCHMARK_TEMPLATE(BM_vxsort, i64, vm::AVX2, 1)->RangeMultiplier(2)->Range(MIN_SORT, MAX_SORT)->Unit(kMillisecond)->ThreadRange(1, processor_count); 26 | BENCHMARK_TEMPLATE(BM_vxsort, i64, vm::AVX2, 2)->RangeMultiplier(2)->Range(MIN_SORT, MAX_SORT)->Unit(kMillisecond)->ThreadRange(1, processor_count); 27 | BENCHMARK_TEMPLATE(BM_vxsort, i64, vm::AVX2, 4)->RangeMultiplier(2)->Range(MIN_SORT, MAX_SORT)->Unit(kMillisecond)->ThreadRange(1, processor_count); 28 | BENCHMARK_TEMPLATE(BM_vxsort, i64, vm::AVX2, 8)->RangeMultiplier(2)->Range(MIN_SORT, MAX_SORT)->Unit(kMillisecond)->ThreadRange(1, processor_count); 29 | } 30 | 31 | #include "vxsort_targets_disable.h" 32 |
-------------------------------------------------------------------------------- /bench/fullsort/BM_fullsort.vxsort.avx2.u.cpp: -------------------------------------------------------------------------------- 1 | #include "vxsort_targets_enable_avx2.h" 2 | 3 | #include <benchmark/benchmark.h> 4 | #include <random> 5 | 6 | #include <vxsort.avx2.h> 7 | 8 | #include "BM_fullsort.vxsort.h" 9 | 10 | namespace vxsort_bench { 11 | using namespace vxsort::types; 12 | using benchmark::TimeUnit; 13 | using vm = vxsort::vector_machine; 14 | 15 | BENCHMARK_TEMPLATE(BM_vxsort, u32, vm::AVX2, 1)->RangeMultiplier(2)->Range(MIN_SORT, MAX_SORT)->Unit(kMillisecond)->ThreadRange(1, processor_count); 16 | BENCHMARK_TEMPLATE(BM_vxsort, u32, vm::AVX2, 2)->RangeMultiplier(2)->Range(MIN_SORT, MAX_SORT)->Unit(kMillisecond)->ThreadRange(1, processor_count); 17 | BENCHMARK_TEMPLATE(BM_vxsort, u32, vm::AVX2, 4)->RangeMultiplier(2)->Range(MIN_SORT, MAX_SORT)->Unit(kMillisecond)->ThreadRange(1, processor_count); 18 | BENCHMARK_TEMPLATE(BM_vxsort, u32, vm::AVX2, 8)->RangeMultiplier(2)->Range(MIN_SORT, MAX_SORT)->Unit(kMillisecond)->ThreadRange(1, processor_count); 19 | 20 | BENCHMARK_TEMPLATE(BM_vxsort, u64, vm::AVX2, 1)->RangeMultiplier(2)->Range(MIN_SORT, MAX_SORT)->Unit(kMillisecond)->ThreadRange(1, processor_count); 21 | BENCHMARK_TEMPLATE(BM_vxsort, u64, vm::AVX2, 2)->RangeMultiplier(2)->Range(MIN_SORT, MAX_SORT)->Unit(kMillisecond)->ThreadRange(1, processor_count); 22 | BENCHMARK_TEMPLATE(BM_vxsort, u64, vm::AVX2, 4)->RangeMultiplier(2)->Range(MIN_SORT, MAX_SORT)->Unit(kMillisecond)->ThreadRange(1, processor_count); 23 | BENCHMARK_TEMPLATE(BM_vxsort, u64, vm::AVX2, 8)->RangeMultiplier(2)->Range(MIN_SORT, MAX_SORT)->Unit(kMillisecond)->ThreadRange(1, processor_count); 24 | 25 | } 26 | 27 | #include "vxsort_targets_disable.h" 28 |
-------------------------------------------------------------------------------- /bench/fullsort/BM_fullsort.vxsort.avx512.f.cpp:
-------------------------------------------------------------------------------- 1 | #include "vxsort_targets_enable_avx512.h" 2 | 3 | #include <benchmark/benchmark.h> 4 | #include <random> 5 | 6 | #include <vxsort.avx512.h> 7 | 8 | #include "BM_fullsort.vxsort.h" 9 | 10 | namespace vxsort_bench { 11 | using namespace vxsort::types; 12 | using benchmark::TimeUnit; 13 | using vm = vxsort::vector_machine; 14 | 15 | 16 | BENCHMARK_TEMPLATE(BM_vxsort, f32, vm::AVX512, 1)->RangeMultiplier(2)->Range(MIN_SORT, MAX_SORT)->Unit(kMillisecond)->ThreadRange(1, processor_count); 17 | BENCHMARK_TEMPLATE(BM_vxsort, f32, vm::AVX512, 2)->RangeMultiplier(2)->Range(MIN_SORT, MAX_SORT)->Unit(kMillisecond)->ThreadRange(1, processor_count); 18 | BENCHMARK_TEMPLATE(BM_vxsort, f32, vm::AVX512, 4)->RangeMultiplier(2)->Range(MIN_SORT, MAX_SORT)->Unit(kMillisecond)->ThreadRange(1, processor_count); 19 | BENCHMARK_TEMPLATE(BM_vxsort, f32, vm::AVX512, 8)->RangeMultiplier(2)->Range(MIN_SORT, MAX_SORT)->Unit(kMillisecond)->ThreadRange(1, processor_count); 20 | 21 | BENCHMARK_TEMPLATE(BM_vxsort, f64, vm::AVX512, 1)->RangeMultiplier(2)->Range(MIN_SORT, MAX_SORT)->Unit(kMillisecond)->ThreadRange(1, processor_count); 22 | BENCHMARK_TEMPLATE(BM_vxsort, f64, vm::AVX512, 2)->RangeMultiplier(2)->Range(MIN_SORT, MAX_SORT)->Unit(kMillisecond)->ThreadRange(1, processor_count); 23 | BENCHMARK_TEMPLATE(BM_vxsort, f64, vm::AVX512, 4)->RangeMultiplier(2)->Range(MIN_SORT, MAX_SORT)->Unit(kMillisecond)->ThreadRange(1, processor_count); 24 | BENCHMARK_TEMPLATE(BM_vxsort, f64, vm::AVX512, 8)->RangeMultiplier(2)->Range(MIN_SORT, MAX_SORT)->Unit(kMillisecond)->ThreadRange(1, processor_count); 25 | 26 | } 27 | 28 | #include "vxsort_targets_disable.h" 29 |
-------------------------------------------------------------------------------- /bench/fullsort/BM_fullsort.vxsort.avx512.i.cpp: -------------------------------------------------------------------------------- 1 | #include "vxsort_targets_enable_avx512.h" 2 | 3 | #include <benchmark/benchmark.h> 4 | #include <random> 5 | 6 | #include <vxsort.avx512.h> 7 | 8 | #include "BM_fullsort.vxsort.h" 9 | 10 | namespace vxsort_bench { 11 | using namespace vxsort::types; 12 | using benchmark::TimeUnit; 13 | using vm = vxsort::vector_machine; 14 | 15 | BENCHMARK_TEMPLATE(BM_vxsort, i16, vm::AVX512, 1)->RangeMultiplier(2)->Range(MIN_SORT, MAX_SORT)->Unit(kMillisecond)->ThreadRange(1, processor_count); 16 | BENCHMARK_TEMPLATE(BM_vxsort, i16, vm::AVX512, 2)->RangeMultiplier(2)->Range(MIN_SORT, MAX_SORT)->Unit(kMillisecond)->ThreadRange(1, processor_count); 17 | BENCHMARK_TEMPLATE(BM_vxsort, i16, vm::AVX512, 4)->RangeMultiplier(2)->Range(MIN_SORT, MAX_SORT)->Unit(kMillisecond)->ThreadRange(1, processor_count); 18 | BENCHMARK_TEMPLATE(BM_vxsort, i16, vm::AVX512, 8)->RangeMultiplier(2)->Range(MIN_SORT, MAX_SORT)->Unit(kMillisecond)->ThreadRange(1, processor_count); 19 | 20 | BENCHMARK_TEMPLATE(BM_vxsort, i32, vm::AVX512, 1)->RangeMultiplier(2)->Range(MIN_SORT, MAX_SORT)->Unit(kMillisecond)->ThreadRange(1, processor_count); 21 | BENCHMARK_TEMPLATE(BM_vxsort, i32, vm::AVX512, 2)->RangeMultiplier(2)->Range(MIN_SORT, MAX_SORT)->Unit(kMillisecond)->ThreadRange(1, processor_count); 22 | BENCHMARK_TEMPLATE(BM_vxsort, i32, vm::AVX512, 4)->RangeMultiplier(2)->Range(MIN_SORT, MAX_SORT)->Unit(kMillisecond)->ThreadRange(1, processor_count); 23 | BENCHMARK_TEMPLATE(BM_vxsort, i32, vm::AVX512, 8)->RangeMultiplier(2)->Range(MIN_SORT,
MAX_SORT)->Unit(kMillisecond)->ThreadRange(1, processor_count); 24 | 25 | BENCHMARK_TEMPLATE(BM_vxsort, i64, vm::AVX512, 1)->RangeMultiplier(2)->Range(MIN_SORT, MAX_SORT)->Unit(kMillisecond)->ThreadRange(1, processor_count); 26 | BENCHMARK_TEMPLATE(BM_vxsort, i64, vm::AVX512, 2)->RangeMultiplier(2)->Range(MIN_SORT, MAX_SORT)->Unit(kMillisecond)->ThreadRange(1, processor_count); 27 | BENCHMARK_TEMPLATE(BM_vxsort, i64, vm::AVX512, 4)->RangeMultiplier(2)->Range(MIN_SORT, MAX_SORT)->Unit(kMillisecond)->ThreadRange(1, processor_count); 28 | BENCHMARK_TEMPLATE(BM_vxsort, i64, vm::AVX512, 8)->RangeMultiplier(2)->Range(MIN_SORT, MAX_SORT)->Unit(kMillisecond)->ThreadRange(1, processor_count); 29 | 30 | } 31 | 32 | #include "vxsort_targets_disable.h" 33 |
-------------------------------------------------------------------------------- /bench/fullsort/BM_fullsort.vxsort.avx512.u.cpp: -------------------------------------------------------------------------------- 1 | #include "vxsort_targets_enable_avx512.h" 2 | 3 | #include <benchmark/benchmark.h> 4 | #include <random> 5 | 6 | #include <vxsort.avx512.h> 7 | 8 | #include "BM_fullsort.vxsort.h" 9 | 10 | namespace vxsort_bench { 11 | using namespace vxsort::types; 12 | using benchmark::TimeUnit; 13 | using vm = vxsort::vector_machine; 14 | 15 | BENCHMARK_TEMPLATE(BM_vxsort, u32, vm::AVX512, 1)->RangeMultiplier(2)->Range(MIN_SORT, MAX_SORT)->Unit(kMillisecond)->ThreadRange(1, processor_count); 16 | BENCHMARK_TEMPLATE(BM_vxsort, u32, vm::AVX512, 2)->RangeMultiplier(2)->Range(MIN_SORT, MAX_SORT)->Unit(kMillisecond)->ThreadRange(1, processor_count); 17 | BENCHMARK_TEMPLATE(BM_vxsort, u32, vm::AVX512, 4)->RangeMultiplier(2)->Range(MIN_SORT, MAX_SORT)->Unit(kMillisecond)->ThreadRange(1, processor_count); 18 | BENCHMARK_TEMPLATE(BM_vxsort, u32, vm::AVX512, 8)->RangeMultiplier(2)->Range(MIN_SORT, MAX_SORT)->Unit(kMillisecond)->ThreadRange(1, processor_count); 19 | 20 | BENCHMARK_TEMPLATE(BM_vxsort, u64, vm::AVX512, 1)->RangeMultiplier(2)->Range(MIN_SORT, MAX_SORT)->Unit(kMillisecond)->ThreadRange(1, processor_count); 21 | BENCHMARK_TEMPLATE(BM_vxsort, u64, vm::AVX512, 2)->RangeMultiplier(2)->Range(MIN_SORT, MAX_SORT)->Unit(kMillisecond)->ThreadRange(1, processor_count); 22 | BENCHMARK_TEMPLATE(BM_vxsort, u64, vm::AVX512, 4)->RangeMultiplier(2)->Range(MIN_SORT, MAX_SORT)->Unit(kMillisecond)->ThreadRange(1, processor_count); 23 | BENCHMARK_TEMPLATE(BM_vxsort, u64, vm::AVX512, 8)->RangeMultiplier(2)->Range(MIN_SORT, MAX_SORT)->Unit(kMillisecond)->ThreadRange(1, processor_count); 24 | 25 | } 26 | 27 | #include "vxsort_targets_disable.h" 28 |
-------------------------------------------------------------------------------- /bench/fullsort/BM_fullsort.vxsort.h: -------------------------------------------------------------------------------- 1 | #ifndef VXSORT_BM_FULLSORT_VXSORT_H 2 | #define VXSORT_BM_FULLSORT_VXSORT_H 3 | 4 | #include <algorithm> 5 | #include <benchmark/benchmark.h> 6 | #include <random> 7 | #include <vector> 8 | #include "../util.h" 9 | #include "../bench_isa.h" 10 | 11 | #include <vxsort.h> 12 | 13 | #include "fullsort_params.h" 14 | 15 | namespace vxsort_bench { 16 | using namespace vxsort::types; 17 | using vxsort::vector_machine; 18 | 19 | template <typename Q, vector_machine M, i32 U> 20 | static void BM_vxsort(benchmark::State& state) { 21 | VXSORT_BENCH_ISA(); 22 | 23 | auto n = state.range(0); 24 | auto v = std::vector<Q>((i32)n); 25 | const auto ITERATIONS = 10; 26 | 27 | generate_unique_values_vec(v, (Q)0x1000, (Q)0x8); 28 | auto copies = generate_copies(ITERATIONS, n, v); 29 | auto begins = generate_array_beginnings(copies); 30 | auto ends = generate_array_beginnings(copies); 31 | for (usize
i = 0; i < copies.size(); i++) 32 | ends[i] = begins[i] + n - 1; 33 | 34 | auto sorter = ::vxsort::vxsort<Q, M, U>(); 35 | 36 | u64 total_cycles = 0; 37 | for (auto _ : state) { 38 | state.PauseTiming(); 39 | refresh_copies(copies, v); 40 | state.ResumeTiming(); 41 | auto start = cycleclock::Now(); 42 | for (auto i = 0; i < ITERATIONS; i++) { 43 | sorter.sort(begins[i], ends[i]); 44 | } 45 | total_cycles += (cycleclock::Now() - start); 46 | } 47 | 48 | state.SetLabel(get_crypto_hash(begins[0], ends[0])); 49 | state.counters["Time/N"] = make_time_per_n_counter(n * ITERATIONS); 50 | state.SetBytesProcessed(state.iterations() * n * ITERATIONS * sizeof(Q)); 51 | process_perf_counters(state.counters, n * ITERATIONS); 52 | if (!state.counters.contains("cycles/N")) 53 | state.counters["rdtsc-cycles/N"] = make_cycle_per_n_counter((f64)total_cycles / (f64)(n * ITERATIONS * state.iterations())); 54 | } 55 | 56 | const i32 StridedSortSize = 1000000; 57 | const i64 StridedSortMinValue = 0x80000000LL; 58 | 59 | template <typename Q, vector_machine M, i32 U> 60 | static void BM_vxsort_strided(benchmark::State& state) { 61 | VXSORT_BENCH_ISA(); 62 | 63 | auto n = StridedSortSize; 64 | auto stride = state.range(0); 65 | auto v = std::vector<Q>(n); 66 | const auto ITERATIONS = 10; 67 | 68 | const auto min_value = StridedSortMinValue; 69 | const auto max_value = min_value + StridedSortSize * stride; 70 | 71 | generate_unique_values_vec(v, (Q) 0x80000000, (Q) stride); 72 | auto copies = generate_copies(ITERATIONS, n, v); 73 | auto begins = generate_array_beginnings(copies); 74 | auto ends = generate_array_beginnings(copies); 75 | for (size_t i = 0; i < copies.size(); i++) 76 | ends[i] = begins[i] + n - 1; 77 | 78 | auto sorter = ::vxsort::vxsort<Q, M, U>(); 79 | 80 | u64 total_cycles = 0; 81 | for (auto _ : state) { 82 | state.PauseTiming(); 83 | refresh_copies(copies, v); 84 | state.ResumeTiming(); 85 | auto start = cycleclock::Now(); 86 | for (auto i = 0; i < ITERATIONS; i++) { 87 | sorter.sort(begins[i], ends[i], min_value, max_value); 88 | } 89 | total_cycles += (cycleclock::Now() - start); 90 | } 91 | 92 | state.counters["Time/N"] = make_time_per_n_counter(n * ITERATIONS); 93 | process_perf_counters(state.counters, n * ITERATIONS); 94 | if (!state.counters.contains("cycles/N")) 95 | state.counters["rdtsc-cycles/N"] = make_cycle_per_n_counter((f64)total_cycles / (f64)(n * ITERATIONS * state.iterations())); 96 | } 97 | } 98 | 99 | #endif // VXSORT_BM_FULLSORT_VXSORT_H 100 |
-------------------------------------------------------------------------------- /bench/fullsort/BM_fullsort_strided.avx2.cpp: -------------------------------------------------------------------------------- 1 | #include "vxsort_targets_enable_avx2.h" 2 | 3 | #include <benchmark/benchmark.h> 4 | 5 | #include <vxsort.avx2.h> 6 | 7 | #include "BM_fullsort.vxsort.h" 8 | 9 | namespace vxsort_bench { 10 | using namespace vxsort::types; 11 | using benchmark::TimeUnit; 12 | using vm = vxsort::vector_machine; 13 | 14 | BENCHMARK_TEMPLATE(BM_vxsort_strided, i64, vm::AVX2, 1)->RangeMultiplier(2)->Range(MIN_STRIDE, MAX_STRIDE)->Unit(kMillisecond)->ThreadRange(1, processor_count); 15 | BENCHMARK_TEMPLATE(BM_vxsort_strided, i64, vm::AVX2, 4)->RangeMultiplier(2)->Range(MIN_STRIDE, MAX_STRIDE)->Unit(kMillisecond)->ThreadRange(1, processor_count); 16 | BENCHMARK_TEMPLATE(BM_vxsort_strided, i64, vm::AVX2, 8)->RangeMultiplier(2)->Range(MIN_STRIDE, MAX_STRIDE)->Unit(kMillisecond)->ThreadRange(1, processor_count); 17 | BENCHMARK_TEMPLATE(BM_vxsort_strided, i64, vm::AVX2, 12)->RangeMultiplier(2)->Range(MIN_STRIDE,
MAX_STRIDE)->Unit(kMillisecond)->ThreadRange(1, processor_count); 18 | 19 | } 20 | 21 | #include "vxsort_targets_disable.h" 22 |
-------------------------------------------------------------------------------- /bench/fullsort/BM_fullsort_strided.avx512.cpp: -------------------------------------------------------------------------------- 1 | #include "vxsort_targets_enable_avx512.h" 2 | 3 | #include <benchmark/benchmark.h> 4 | #include <random> 5 | 6 | #include <vxsort.avx512.h> 7 | 8 | #include "BM_fullsort.vxsort.h" 9 | 10 | namespace vxsort_bench { 11 | using namespace vxsort::types; 12 | using benchmark::TimeUnit; 13 | using vm = vxsort::vector_machine; 14 | 15 | BENCHMARK_TEMPLATE(BM_vxsort_strided, i64, vm::AVX512, 1)->RangeMultiplier(2)->Range(MIN_STRIDE, MAX_STRIDE)->Unit(kMillisecond)->ThreadRange(1, processor_count); 16 | BENCHMARK_TEMPLATE(BM_vxsort_strided, i64, vm::AVX512, 4)->RangeMultiplier(2)->Range(MIN_STRIDE, MAX_STRIDE)->Unit(kMillisecond)->ThreadRange(1, processor_count); 17 | BENCHMARK_TEMPLATE(BM_vxsort_strided, i64, vm::AVX512, 8)->RangeMultiplier(2)->Range(MIN_STRIDE, MAX_STRIDE)->Unit(kMillisecond)->ThreadRange(1, processor_count); 18 | 19 | } 20 | 21 | #include "vxsort_targets_disable.h" 22 |
-------------------------------------------------------------------------------- /bench/fullsort/fullsort_params.h: -------------------------------------------------------------------------------- 1 | #ifndef VXSORT_FULLSORT_PARAMS_H 2 | #define VXSORT_FULLSORT_PARAMS_H 3 | 4 | #include <vxsort.h> 5 | 6 | namespace vxsort_bench { 7 | 8 | using namespace vxsort::types; 9 | using vxsort::vector_machine; 10 | 11 | const auto processor_count = 1; 12 | 13 | static const i32 MIN_SORT = 256; 14 | static const i32 MAX_SORT = 1 << 24; 15 | 16 | static const i32 MIN_STRIDE = 1 << 3; 17 | static const i32 MAX_STRIDE = 1 << 27; 18 | } 19 | 20 | #endif //VXSORT_FULLSORT_PARAMS_H 21 |
-------------------------------------------------------------------------------- /bench/internal_macros.h: -------------------------------------------------------------------------------- 1 | #ifndef VXSORT_BENCH_INTERNAL_MACROS_H_ 2 | #define VXSORT_BENCH_INTERNAL_MACROS_H_ 3 | 4 | #include "benchmark/benchmark.h" 5 | 6 | /* Needed to detect STL */ 7 | #include <cstdlib> 8 | 9 | // clang-format off 10 | 11 | #ifndef __has_feature 12 | #define __has_feature(x) 0 13 | #endif 14 | 15 | #if defined(__clang__) 16 | #if !defined(COMPILER_CLANG) 17 | #define COMPILER_CLANG 18 | #endif 19 | #elif defined(_MSC_VER) 20 | #if !defined(COMPILER_MSVC) 21 | #define COMPILER_MSVC 22 | #endif 23 | #elif defined(__GNUC__) 24 | #if !defined(COMPILER_GCC) 25 | #define COMPILER_GCC 26 | #endif 27 | #endif 28 | 29 | #if __has_feature(cxx_attributes) 30 | #define VXSORT_BENCH_NORETURN [[noreturn]] 31 | #elif defined(__GNUC__) 32 | #define VXSORT_BENCH_NORETURN __attribute__((noreturn)) 33 | #elif defined(COMPILER_MSVC) 34 | #define VXSORT_BENCH_NORETURN __declspec(noreturn) 35 | #else 36 | #define VXSORT_BENCH_NORETURN 37 | #endif 38 | 39 | #if defined(__CYGWIN__) 40 | #define VXSORT_BENCH_OS_CYGWIN 1 41 | #elif defined(_WIN32) 42 | #define VXSORT_BENCH_OS_WINDOWS 1 43 | #if defined(__MINGW32__) 44 | #define VXSORT_BENCH_OS_MINGW 1 45 | #endif 46 | #elif defined(__APPLE__) 47 | #define VXSORT_BENCH_OS_APPLE 1 48 | #include "TargetConditionals.h" 49 | #if defined(TARGET_OS_MAC) 50 | #define VXSORT_BENCH_OS_MACOSX 1 51 | #if defined(TARGET_OS_IPHONE) 52 | #define VXSORT_BENCH_OS_IOS 1 53 | #endif 54 | #endif 55 | #elif defined(__FreeBSD__) 56 | #define VXSORT_BENCH_OS_FREEBSD 1 57 | #elif
defined(__NetBSD__) 58 | #define VXSORT_BENCH_OS_NETBSD 1 59 | #elif defined(__OpenBSD__) 60 | #define VXSORT_BENCH_OS_OPENBSD 1 61 | #elif defined(__linux__) 62 | #define VXSORT_BENCH_OS_LINUX 1 63 | #elif defined(__native_client__) 64 | #define VXSORT_BENCH_OS_NACL 1 65 | #elif defined(__EMSCRIPTEN__) 66 | #define VXSORT_BENCH_OS_EMSCRIPTEN 1 67 | #elif defined(__rtems__) 68 | #define VXSORT_BENCH_OS_RTEMS 1 69 | #elif defined(__Fuchsia__) 70 | #define VXSORT_BENCH_OS_FUCHSIA 1 71 | #elif defined (__SVR4) && defined (__sun) 72 | #define VXSORT_BENCH_OS_SOLARIS 1 73 | #elif defined(__QNX__) 74 | #define VXSORT_BENCH_OS_QNX 1 75 | #endif 76 | 77 | #if defined(__ANDROID__) && defined(__GLIBCXX__) 78 | #define VXSORT_BENCH_STL_ANDROID_GNUSTL 1 79 | #endif 80 | 81 | #if !__has_feature(cxx_exceptions) && !defined(__cpp_exceptions) \ 82 | && !defined(__EXCEPTIONS) 83 | #define VXSORT_BENCH_HAS_NO_EXCEPTIONS 84 | #endif 85 | 86 | #if defined(COMPILER_CLANG) || defined(COMPILER_GCC) 87 | #define VXSORT_BENCH_MAYBE_UNUSED __attribute__((unused)) 88 | #else 89 | #define VXSORT_BENCH_MAYBE_UNUSED 90 | #endif 91 | 92 | // clang-format on 93 | 94 | #endif // VXSORT_BENCH_INTERNAL_MACROS_H_ 95 | -------------------------------------------------------------------------------- /bench/prep.sh: -------------------------------------------------------------------------------- 1 | # from https://www.alexgallego.org/perf/compiler/explorer/flatbuffers/smf/2018/06/30/effects-cpu-turbo.html 2 | 3 | function cpu_disable_performance_cpupower_state(){ 4 | for c in /sys/devices/system/cpu/cpu[0-9]*/cpufreq/scaling_governor; do echo powersave > $c; done 5 | } 6 | function cpu_enable_performance_cpupower_state(){ 7 | for c in /sys/devices/system/cpu/cpu[0-9]*/cpufreq/scaling_governor; do echo performance > $c; done 8 | } 9 | function cpu_available_frequencies() { 10 | local cpuspec=${1:-[0-9]} 11 | 12 | for i in /sys/devices/system/cpu/cpu$cpuspec*; do 13 | echo "$i:" 14 | echo " cpufreq/scaling_min_freq: $(cat $i/cpufreq/scaling_min_freq)"; 15 | echo " cpufreq/scaling_max_freq: $(cat $i/cpufreq/scaling_max_freq)"; 16 | done 17 | } 18 | 19 | function cpu_set_min_frequencies() { 20 | local freq=$1; 21 | local cpuspec=${2:-[0-9]} 22 | if [[ $freq == "" ]]; then exit 1; fi 23 | for i in /sys/devices/system/cpu/cpu$cpuspec*; do 24 | echo "$i:" 25 | echo "$i/cpufreq/scaling_min_freq: $(cat $i/cpufreq/scaling_min_freq)"; 26 | echo "$freq" | sudo tee "$i/cpufreq/scaling_min_freq" 27 | echo "$i/cpufreq/scaling_min_freq: $(cat $i/cpufreq/scaling_min_freq)"; 28 | done 29 | } 30 | 31 | function cpu_set_max_frequencies() { 32 | local freq=$1; 33 | local cpuspec=${2:-[0-9]} 34 | if [[ $freq == "" ]]; then exit 1; fi 35 | for i in /sys/devices/system/cpu/cpu$cpuspec*; do 36 | echo "$i:" 37 | echo "$i/cpufreq/scaling_max_freq: $(cat $i/cpufreq/scaling_max_freq)"; 38 | echo "$freq" | sudo tee "$i/cpufreq/scaling_max_freq" 39 | echo "$i/cpufreq/scaling_max_freq: $(cat $i/cpufreq/scaling_max_freq)"; 40 | done 41 | } 42 | -------------------------------------------------------------------------------- /bench/requirements.txt: -------------------------------------------------------------------------------- 1 | kaleido 2 | plotly 3 | pandas 4 | humanize 5 | ipython 6 | -------------------------------------------------------------------------------- /bench/run.cmd: -------------------------------------------------------------------------------- 1 | @echo off 2 | vxsort_bench --benchmark_counters_tabular %1 %2 %3 %4 %5 %6 %7 %8 %9 3 | 
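4 | rem Usage sketch: run.cmd simply forwards its arguments (%1..%9 above) to the benchmark binary,
5 | rem so any standard Google Benchmark flag can be appended, e.g.:
6 | rem   run.cmd --benchmark_filter=BM_vxsort --benchmark_out=results.json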
-------------------------------------------------------------------------------- /bench/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | hogs=$(pgrep -if "(typora|firefox|chrome|chromium-browser|vivaldi-bin|rider|pycharm|resharper|msbuild|telegram|clion|clangd|discord|slack)") 3 | 4 | resume() { 5 | echo Resuming "$(echo "$hogs" | wc -w)" procs after running bench 6 | [[ -z "$hogs" ]] || echo "$hogs" | xargs kill -CONT 7 | } 8 | 9 | trap 'resume' SIGINT 10 | 11 | echo Suspending "$(echo "$hogs" | wc -w)" procs before running bench 12 | [[ -z "$hogs" ]] || echo "$hogs" | xargs kill -STOP 13 | 14 | SCRIPT_DIR="$(cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" 15 | 16 | "$SCRIPT_DIR"/vxsort_bench --benchmark_counters_tabular "$@" 17 | trap '' SIGINT 18 | resume 19 |
-------------------------------------------------------------------------------- /bench/smallsort/BM_blacher.avx2.cpp: -------------------------------------------------------------------------------- 1 | #include "vxsort_targets_enable_avx2.h" 2 | 3 | #include "BM_smallsort.h" 4 | 5 | #include <immintrin.h> 6 | 7 | namespace vxsort_bench { 8 | using namespace vxsort::types; 9 | using benchmark::TimeUnit; 10 | using vm = vxsort::vector_machine; 11 | 12 | #define COEX(a, b){ \ 13 | auto vec_tmp = a; \ 14 | a = _mm256_min_epi32(a, b); \ 15 | b = _mm256_max_epi32(vec_tmp, b);} 16 | 17 | /* shuffle 2 vectors, instruction for int is missing, 18 | * therefore shuffle with float */ 19 | #define SHUFFLE_2_VECS(a, b, mask) \ 20 | _mm256_castps_si256 (_mm256_shuffle_ps( \ 21 | _mm256_castsi256_ps (a), _mm256_castsi256_ps (b), mask)); 22 | 23 | /* optimized sorting network for two vectors, that is 16 ints */ 24 | inline void sort_02v_ascending(__m256i &v1, __m256i &v2) { 25 | COEX(v1, v2); /* step 1 */ 26 | 27 | v2 = _mm256_shuffle_epi32(v2, _MM_SHUFFLE(2, 3, 0, 1)); /* step 2 */ 28 | COEX(v1, v2); 29 | 30 | auto tmp = v1; /* step 3 */ 31 | v1 = SHUFFLE_2_VECS(v1, v2, 0b10001000); 32 | v2 = SHUFFLE_2_VECS(tmp, v2, 0b11011101); 33 | COEX(v1, v2); 34 | 35 | v2 = _mm256_shuffle_epi32(v2, _MM_SHUFFLE(0, 1, 2, 3)); /* step 4 */ 36 | COEX(v1, v2); 37 | 38 | tmp = v1; /* step 5 */ 39 | v1 = SHUFFLE_2_VECS(v1, v2, 0b01000100); 40 | v2 = SHUFFLE_2_VECS(tmp, v2, 0b11101110); 41 | COEX(v1, v2); 42 | 43 | tmp = v1; /* step 6 */ 44 | v1 = SHUFFLE_2_VECS(v1, v2, 0b11011000); 45 | v2 = SHUFFLE_2_VECS(tmp, v2, 0b10001101); 46 | COEX(v1, v2); 47 | 48 | v2 = _mm256_permutevar8x32_epi32(v2, _mm256_setr_epi32(7, 6, 5, 4, 3, 2, 1, 0)); 49 | COEX(v1, v2); /* step 7 */ 50 | 51 | tmp = v1; /* step 8 */ 52 | v1 = SHUFFLE_2_VECS(v1, v2, 0b11011000); 53 | v2 = SHUFFLE_2_VECS(tmp, v2, 0b10001101); 54 | COEX(v1, v2); 55 | 56 | tmp = v1; /* step 9 */ 57 | v1 = SHUFFLE_2_VECS(v1, v2, 0b11011000); 58 | v2 = SHUFFLE_2_VECS(tmp, v2, 0b10001101); 59 | COEX(v1, v2); 60 | 61 | /* permute to make it easier to restore order */ 62 | v1 = _mm256_permutevar8x32_epi32(v1, _mm256_setr_epi32(0, 4, 1, 5, 6, 2, 7, 3)); 63 | v2 = _mm256_permutevar8x32_epi32(v2, _mm256_setr_epi32(0, 4, 1, 5, 6, 2, 7, 3)); 64 | 65 | tmp = v1; /* step 10 */ 66 | v1 = SHUFFLE_2_VECS(v1, v2, 0b10001000); 67 | v2 = SHUFFLE_2_VECS(tmp, v2, 0b11011101); 68 | COEX(v1, v2); 69 | 70 | /* restore order */ 71 | auto b2 = _mm256_shuffle_epi32(v2, 0b10110001); 72 | auto b1 = _mm256_shuffle_epi32(v1, 0b10110001); 73 | v1 = _mm256_blend_epi32(v1, b2, 0b10101010); 74 | v2 = _mm256_blend_epi32(b1, v2, 0b10101010); 75 | } 76 | 77 | // This is generated for testing
purposes only 78 | void bitonic_blacher_16_i32(i32 *ptr) { 79 | auto d01 = _mm256_lddqu_si256((__m256i const *) ptr + 0); 80 | auto d02 = _mm256_lddqu_si256((__m256i const *) ptr + 1); 81 | sort_02v_ascending(d01, d02); 82 | _mm256_storeu_si256((__m256i *) ptr + 0, d01); 83 | _mm256_storeu_si256((__m256i *) ptr + 1, d02); 84 | } 85 | 86 | 87 | void BM_blacher(benchmark::State& state) 88 | { 89 | if (!vxsort::supports_vector_machine(vector_machine::AVX2)) { 90 | state.SkipWithError("Current CPU does not support the minimal features for this test"); 91 | return; 92 | } 93 | 94 | static const i32 ITERATIONS = 1024; 95 | auto n = 16; 96 | auto v = std::vector<i32>(n); 97 | generate_unique_values_vec(v, (i32)0x1000, (i32)0x8); 98 | 99 | auto copies = generate_copies(ITERATIONS, n, v); 100 | auto begins = generate_array_beginnings(copies); 101 | 102 | uint64_t total_cycles = 0; 103 | for (auto _ : state) { 104 | state.PauseTiming(); 105 | refresh_copies(copies, v); 106 | state.ResumeTiming(); 107 | auto start = cycleclock::Now(); 108 | for (auto i = 0; i < ITERATIONS; i++) { 109 | bitonic_blacher_16_i32(begins[i]); 110 | } 111 | total_cycles += cycleclock::Now() - start; 112 | } 113 | 114 | state.SetBytesProcessed(state.iterations() * n * ITERATIONS * sizeof(i32)); 115 | 116 | state.counters["Time/N"] = make_time_per_n_counter(n * ITERATIONS); 117 | process_perf_counters(state.counters, n * ITERATIONS); 118 | if (!state.counters.contains("cycles/N")) 119 | state.counters["rdtsc-cycles/N"] = make_cycle_per_n_counter((f64)total_cycles / (f64)(n * ITERATIONS * state.iterations())); 120 | } 121 | 122 | BENCHMARK(BM_blacher)->Unit(kNanosecond)->MinTime(0.1); 123 | 124 | } 125 | 126 | #include "vxsort_targets_disable.h" 127 |
-------------------------------------------------------------------------------- /bench/smallsort/BM_smallsort.avx2.cpp: -------------------------------------------------------------------------------- 1 | #include "vxsort_targets_enable_avx2.h" 2 | 3 | #include "BM_smallsort.h" 4 | 5 | #include <smallsort/bitonic_sort.avx2.h> 6 | 7 | namespace vxsort_bench { 8 | using namespace vxsort::types; 9 | using benchmark::TimeUnit; 10 | using vm = vxsort::vector_machine; 11 | 12 | BENCHMARK_TEMPLATE(BM_bitonic_sort, i16, vm::AVX2)->DenseRange(16, 4096, 8)->Unit(kNanosecond)->MinTime(0.1); 13 | BENCHMARK_TEMPLATE(BM_bitonic_sort, u16, vm::AVX2)->DenseRange(16, 4096, 8)->Unit(kNanosecond)->MinTime(0.1); 14 | BENCHMARK_TEMPLATE(BM_bitonic_sort, i32, vm::AVX2)->DenseRange( 4, 2048, 4)->Unit(kNanosecond)->MinTime(0.1); 15 | BENCHMARK_TEMPLATE(BM_bitonic_sort, u32, vm::AVX2)->DenseRange( 4, 2048, 4)->Unit(kNanosecond)->MinTime(0.1); 16 | BENCHMARK_TEMPLATE(BM_bitonic_sort, f32, vm::AVX2)->DenseRange( 4, 2048, 4)->Unit(kNanosecond)->MinTime(0.1); 17 | BENCHMARK_TEMPLATE(BM_bitonic_sort, i64, vm::AVX2)->DenseRange( 2, 1024, 2)->Unit(kNanosecond)->MinTime(0.1); 18 | BENCHMARK_TEMPLATE(BM_bitonic_sort, u64, vm::AVX2)->DenseRange( 2, 1024, 2)->Unit(kNanosecond)->MinTime(0.1); 19 | BENCHMARK_TEMPLATE(BM_bitonic_sort, f64, vm::AVX2)->DenseRange( 2, 1024, 2)->Unit(kNanosecond)->MinTime(0.1); 20 | 21 | BENCHMARK_TEMPLATE(BM_bitonic_machine, i16, vm::AVX2, 2)->Unit(kNanosecond)->MinTime(0.1); 22 | BENCHMARK_TEMPLATE(BM_bitonic_machine, u16, vm::AVX2, 2)->Unit(kNanosecond)->MinTime(0.1); 23 | BENCHMARK_TEMPLATE(BM_bitonic_machine, i32, vm::AVX2, 2)->Unit(kNanosecond)->MinTime(0.1); 24 | BENCHMARK_TEMPLATE(BM_bitonic_machine, u32, vm::AVX2, 2)->Unit(kNanosecond)->MinTime(0.1); 25 | BENCHMARK_TEMPLATE(BM_bitonic_machine, i64, vm::AVX2,
2)->Unit(kNanosecond)->MinTime(0.1); 26 | BENCHMARK_TEMPLATE(BM_bitonic_machine, u64, vm::AVX2, 2)->Unit(kNanosecond)->MinTime(0.1); 27 | BENCHMARK_TEMPLATE(BM_bitonic_machine, f32, vm::AVX2, 2)->Unit(kNanosecond)->MinTime(0.1); 28 | BENCHMARK_TEMPLATE(BM_bitonic_machine, f64, vm::AVX2, 2)->Unit(kNanosecond)->MinTime(0.1); 29 | 30 | } 31 | 32 | #include "vxsort_targets_disable.h" 33 |
-------------------------------------------------------------------------------- /bench/smallsort/BM_smallsort.avx512.cpp: -------------------------------------------------------------------------------- 1 | #include "vxsort_targets_enable_avx512.h" 2 | 3 | #include "BM_smallsort.h" 4 | 5 | #include <smallsort/bitonic_sort.avx512.h> 6 | 7 | namespace vxsort_bench { 8 | using namespace vxsort::types; 9 | using benchmark::TimeUnit; 10 | using vm = vxsort::vector_machine; 11 | 12 | BENCHMARK_TEMPLATE(BM_bitonic_sort, i16, vm::AVX512)->DenseRange(8, 4096, 8)->Unit(kNanosecond)->MinTime(0.1); 13 | BENCHMARK_TEMPLATE(BM_bitonic_sort, u16, vm::AVX512)->DenseRange(8, 4096, 8)->Unit(kNanosecond)->MinTime(0.1); 14 | BENCHMARK_TEMPLATE(BM_bitonic_sort, i32, vm::AVX512)->DenseRange(4, 2048, 4)->Unit(kNanosecond)->MinTime(0.1); 15 | BENCHMARK_TEMPLATE(BM_bitonic_sort, u32, vm::AVX512)->DenseRange(4, 2048, 4)->Unit(kNanosecond)->MinTime(0.1); 16 | BENCHMARK_TEMPLATE(BM_bitonic_sort, f32, vm::AVX512)->DenseRange(4, 2048, 4)->Unit(kNanosecond)->MinTime(0.1); 17 | BENCHMARK_TEMPLATE(BM_bitonic_sort, i64, vm::AVX512)->DenseRange(2, 1024, 2)->Unit(kNanosecond)->MinTime(0.1); 18 | BENCHMARK_TEMPLATE(BM_bitonic_sort, u64, vm::AVX512)->DenseRange(2, 1024, 2)->Unit(kNanosecond)->MinTime(0.1); 19 | BENCHMARK_TEMPLATE(BM_bitonic_sort, f64, vm::AVX512)->DenseRange(2, 1024, 2)->Unit(kNanosecond)->MinTime(0.1); 20 | 21 | BENCHMARK_TEMPLATE(BM_bitonic_machine, i16, vm::AVX512, 2)->Unit(kNanosecond)->MinTime(0.1); 22 | BENCHMARK_TEMPLATE(BM_bitonic_machine, u16, vm::AVX512, 2)->Unit(kNanosecond)->MinTime(0.1); 23 | BENCHMARK_TEMPLATE(BM_bitonic_machine, i32, vm::AVX512, 2)->Unit(kNanosecond)->MinTime(0.1); 24 | BENCHMARK_TEMPLATE(BM_bitonic_machine, u32, vm::AVX512, 2)->Unit(kNanosecond)->MinTime(0.1); 25 | BENCHMARK_TEMPLATE(BM_bitonic_machine, i64, vm::AVX512, 2)->Unit(kNanosecond)->MinTime(0.1); 26 | BENCHMARK_TEMPLATE(BM_bitonic_machine, u64, vm::AVX512, 2)->Unit(kNanosecond)->MinTime(0.1); 27 | BENCHMARK_TEMPLATE(BM_bitonic_machine, f32, vm::AVX512, 2)->Unit(kNanosecond)->MinTime(0.1); 28 | BENCHMARK_TEMPLATE(BM_bitonic_machine, f64, vm::AVX512, 2)->Unit(kNanosecond)->MinTime(0.1); 29 | 30 | } 31 | 32 | #include "vxsort_targets_disable.h" 33 |
-------------------------------------------------------------------------------- /bench/smallsort/BM_smallsort.h: -------------------------------------------------------------------------------- 1 | #include <benchmark/benchmark.h> 2 | #include <random> 3 | #include <thread> 4 | #include <vector> 5 | 6 | #include "../bench_isa.h" 7 | #include <smallsort/bitonic_sort.h> 8 | 9 | #include "../stolen-cycleclock.h" 10 | #include "../util.h" 11 | 12 | namespace vxsort_bench { 13 | using namespace vxsort::types; 14 | using vxsort::vector_machine; 15 | 16 | const auto processor_count = std::thread::hardware_concurrency(); 17 | 18 | template <typename Q, vector_machine M> 19 | static void BM_bitonic_sort(benchmark::State& state) { 20 | VXSORT_BENCH_ISA(); 21 | 22 | using BM = vxsort::smallsort::bitonic<Q, M>; 23 | 24 | static const i32 ITERATIONS = 1024; 25 | auto n = state.range(0); 26 | auto v = std::vector<Q>(n); 27 | generate_unique_values_vec(v, (Q)0x1000, (Q)0x8); 28 | 29 | auto copies = generate_copies(ITERATIONS, n, v); 30 | auto begins =
generate_array_beginnings(copies); 31 | 32 | uint64_t total_cycles = 0; 33 | for (auto _ : state) { 34 | state.PauseTiming(); 35 | refresh_copies(copies, v); 36 | state.ResumeTiming(); 37 | auto start = cycleclock::Now(); 38 | for (auto i = 0; i < ITERATIONS; i++) 39 | BM::sort(begins[i], n); 40 | total_cycles += cycleclock::Now() - start; 41 | } 42 | 43 | state.SetBytesProcessed(state.iterations() * n * ITERATIONS * sizeof(Q)); 44 | 45 | state.counters["Time/N"] = make_time_per_n_counter(n * ITERATIONS); 46 | process_perf_counters(state.counters, n * ITERATIONS); 47 | if (!state.counters.contains("cycles/N")) 48 | state.counters["rdtsc-cycles/N"] = make_cycle_per_n_counter((f64)total_cycles / (f64)(n * ITERATIONS * state.iterations())); 49 | } 50 | 51 | template <typename Q, vector_machine M, i32 N> 52 | static void BM_bitonic_machine(benchmark::State& state) { 53 | VXSORT_BENCH_ISA(); 54 | 55 | static_assert(N > 0, "N must be greater than 0"); 56 | static_assert(N <= 4, "N cannot exceed 4"); 57 | 58 | using BM = vxsort::smallsort::bitonic_machine<Q, M>; 59 | 60 | static const i32 ITERATIONS = 1024; 61 | auto n = N * BM::N; 62 | auto v = std::vector<Q>(n); 63 | generate_unique_values_vec(v, (Q)0x1000, (Q)0x8); 64 | 65 | auto copies = generate_copies(ITERATIONS, n, v); 66 | auto begins = generate_array_beginnings(copies); 67 | 68 | uint64_t total_cycles = 0; 69 | for (auto _ : state) { 70 | state.PauseTiming(); 71 | refresh_copies(copies, v); 72 | state.ResumeTiming(); 73 | auto start = cycleclock::Now(); 74 | for (auto i = 0; i < ITERATIONS; i++) { 75 | if (N == 1) 76 | BM::sort_01v_full_ascending(begins[i]); 77 | else if (N == 2) 78 | BM::sort_02v_full_ascending(begins[i]); 79 | else if (N == 3) 80 | BM::sort_03v_full_ascending(begins[i]); 81 | else if (N == 4) 82 | BM::sort_04v_full_ascending(begins[i]); 83 | 84 | } 85 | total_cycles += cycleclock::Now() - start; 86 | } 87 | 88 | state.SetBytesProcessed(state.iterations() * n * ITERATIONS * sizeof(Q)); 89 | 90 | state.counters["Time/N"] = make_time_per_n_counter(n * ITERATIONS); 91 | process_perf_counters(state.counters, n * ITERATIONS); 92 | if (!state.counters.contains("cycles/N")) 93 | state.counters["rdtsc-cycles/N"] = make_cycle_per_n_counter((f64)total_cycles / (f64)(n * ITERATIONS * state.iterations())); 94 | } 95 | 96 | } 97 |
-------------------------------------------------------------------------------- /bench/util.h: -------------------------------------------------------------------------------- 1 | #ifndef VXSORT_BENCH_UTIL_H 2 | #define VXSORT_BENCH_UTIL_H 3 | 4 | #include <benchmark/benchmark.h> 5 | 6 | #include <algorithm> 7 | #include <random> 8 | #include <string> 9 | #include <vector> 10 | 11 | #include <defs.h> 12 | 13 | #include "stolen-cycleclock.h" 14 | 15 | using namespace benchmark; 16 | 17 | namespace vxsort_bench { 18 | 19 | using namespace vxsort::types; 20 | 21 | Counter make_time_per_n_counter(i64 n); 22 | 23 | Counter make_cycle_per_n_counter(f64 n); 24 | 25 | std::string get_crypto_hash(void *start, void *end); 26 | 27 | void process_perf_counters(UserCounters &counters, i64 num_elements); 28 | 29 | extern std::random_device::result_type global_bench_random_seed; 30 | 31 | template <typename T> 32 | void generate_unique_values_vec(std::vector<T>& vec, T start, T stride) { 33 | for (usize i = 0; i < vec.size(); i++, start += stride) 34 | vec[i] = start; 35 | 36 | std::mt19937_64 g(global_bench_random_seed); 37 | 38 | std::shuffle(vec.begin(), vec.end(), g); 39 | } 40 | 41 | template <typename T, typename U = T> 42 | std::vector<U*> generate_array_beginnings(std::vector<std::vector<T>> &copies) { 43 | const auto num_copies = copies.size(); 44 | std::vector<U*> begins(num_copies); 45 | for
(usize i = 0; i < num_copies; i++) 46 | begins[i] = (U*)copies[i].data(); 47 | return begins; 48 | } 49 | 50 | template <typename T> 51 | void refresh_copies(std::vector<std::vector<T>> &copies, std::vector<T>& orig) { 52 | const auto begin = orig.begin(); 53 | const auto end = orig.end(); 54 | const auto num_copies = copies.size(); 55 | for (usize i = 0; i < num_copies; i++) 56 | copies[i].assign(begin, end); 57 | } 58 | 59 | template <typename T> 60 | std::vector<std::vector<T>> generate_copies(usize num_copies, i64 n, std::vector<T>& orig) { 61 | std::vector<std::vector<T>> copies(num_copies); 62 | for (usize i = 0; i < num_copies; i++) 63 | copies[i] = std::vector<T>(n); 64 | refresh_copies(copies, orig); 65 | return copies; 66 | } 67 | 68 | template <typename T> 69 | std::vector<T> shuffled_seq(usize size, T start, T stride, std::mt19937_64& rng) { 70 | std::vector<T> v; v.reserve(size); 71 | for (usize i = 0; i < size; ++i) 72 | v.push_back(start + stride * i); 73 | std::shuffle(v.begin(), v.end(), rng); 74 | return v; 75 | } 76 | 77 | template <typename T> 78 | std::vector<T> shuffled_16_values(usize size, T start, T stride, std::mt19937_64& rng) { 79 | std::vector<T> v; v.reserve(size); 80 | for (usize i = 0; i < size; ++i) 81 | v.push_back(start + stride * (i % 16)); 82 | std::shuffle(v.begin(), v.end(), rng); 83 | return v; 84 | } 85 | 86 | template <typename T> 87 | std::vector<T> all_equal(isize size, T start) { 88 | std::vector<T> v; v.reserve(size); 89 | for (i32 i = 0; i < size; ++i) 90 | v.push_back(start); 91 | return v; 92 | } 93 | 94 | template <typename T> 95 | std::vector<T> ascending_int(isize size, T start, T stride) { 96 | std::vector<T> v; v.reserve(size); 97 | for (isize i = 0; i < size; ++i) 98 | v.push_back(start + stride * i); 99 | return v; 100 | } 101 | 102 | template <typename T> 103 | std::vector<T> descending_int(isize size, T start, T stride) { 104 | std::vector<T> v; v.reserve(size); 105 | for (isize i = size - 1; i >= 0; --i) 106 | v.push_back(start + stride * i); 107 | return v; 108 | } 109 | 110 | template <typename T> 111 | std::vector<T> pipe_organ(isize size, T start, T stride, std::mt19937_64&) { 112 | std::vector<T> v; v.reserve(size); 113 | for (isize i = 0; i < size/2; ++i) 114 | v.push_back(start + stride * i); 115 | for (isize i = size/2; i < size; ++i) 116 | v.push_back(start + (size - i) * stride); 117 | return v; 118 | } 119 | 120 | template <typename T> 121 | std::vector<T> push_front(isize size, T start, T stride, std::mt19937_64&) { 122 | std::vector<T> v; v.reserve(size); 123 | for (isize i = 1; i < size; ++i) 124 | v.push_back(start + stride * i); 125 | v.push_back(start); 126 | return v; 127 | } 128 | 129 | template <typename T> 130 | std::vector<T> push_middle(isize size, T start, T stride, std::mt19937_64&) { 131 | std::vector<T> v; v.reserve(size); 132 | for (isize i = 0; i < size; ++i) { 133 | if (i != size/2) 134 | v.push_back(start + stride * i); 135 | } 136 | v.push_back(start + stride * (size/2)); 137 | return v; 138 | } 139 | 140 | } 141 | 142 | #endif //VXSORT_BENCH_UTIL_H 143 |
-------------------------------------------------------------------------------- /build.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | __BuildOS="" 4 | __CleanBuild=0 5 | 6 | case $OSTYPE in 7 | msys|cygwin) 8 | __BuildOS=win 9 | ;; 10 | *) 11 | __BuildOS=linux 12 | ;; 13 | esac 14 | 15 | __BuildArch=x64 16 | __BuildType=Debug 17 | __COMPILER=gcc 18 | __CC=gcc 19 | __CXX=g++ 20 | 21 | for i in "$@" 22 | do 23 | lowerI=${i,,} 24 | case $lowerI in 25 | -?|-h|--help) 26 | echo "Usage: $0 [x64|x86] [debug|release|iaca] [clang|gcc] [test] [clean]" >&2 27 | exit 1 28 | ;; 29 | x64) 30 | __BuildArch=x64 31 | ;; 32 | x86) 33 | __BuildArch=x86 34 | ;; 35 | debug) 36 | __BuildType=Debug 37 | ;; 38
| release) 39 | __BuildType=Release 40 | ;; 41 | iaca) 42 | __BuildType=IACA 43 | ;; 44 | clang) 45 | __COMPILER=clang 46 | __CC=clang-6.0 47 | __CXX=clang++-6.0 48 | ;; 49 | gcc) 50 | __COMPILER=gcc 51 | __CC=gcc 52 | __CXX=g++ 53 | ;; 54 | test) 55 | __RunTests=1 56 | ;; 57 | clean) 58 | __CleanBuild=1 59 | ;; 60 | *) 61 | __UnprocessedBuildArgs="$__UnprocessedBuildArgs $i" 62 | esac 63 | done 64 | 65 | if [ "$__CleanBuild" == "1" ]; then 66 | rm -rf dist 67 | rm -rf build-{debug,release} 68 | exit 0 69 | fi 70 | 71 | __DistDir=build-${__BuildType,,}-${__COMPILER} 72 | 73 | mkdir -p ${__DistDir} 74 | pushd ${__DistDir} 75 | 76 | if [ "$__BuildOS" == "win" ]; then 77 | cmake -G "Visual Studio 15 2017 Win64" -DCMAKE_BUILD_TYPE=${__BuildType^^} .. 78 | 79 | vs=$(/c/Program\ Files\ \(x86\)/Microsoft\ Visual\ Studio/Installer/vswhere.exe -latest | grep installationPath | cut -f 2- -d : -d " ") 80 | __MSBuildExePath="$vs/MSBuild/15.0/Bin/MSBuild.exe" 81 | if [ ! -f "$__MSBuildExePath" ]; then 82 | echo "Error: Could not find MSBuild.exe" 83 | exit 1 84 | fi 85 | "${__MSBuildExePath}" -p:Platform=${__BuildArch} -p:Configuration=${__BuildType} vxsort.sln 86 | build_result=$? 87 | fi 88 | 89 | if [ "$__BuildOS" == "linux" ]; then 90 | CC=${__CC} CXX=${__CXX} cmake -DCMAKE_BUILD_TYPE=${__BuildType^^} .. 91 | make -j4 92 | build_result=$? 93 | fi 94 | 95 | 96 | 97 | 98 | if [ "$__RunTests" == "1" ]; then 99 | ctest --output-on-failure 100 | fi 101 | 102 | # Build complete 103 | if [ ${build_result} == 0 ]; then 104 | echo "vxsort successfully built. ✔" 105 | echo "binaries are available at ${__DistDir}" 106 | else 107 | echo "build failed (${build_result}) 💩" 108 | exit $build_result 109 | fi 110 | popd 111 | 112 | -------------------------------------------------------------------------------- /clang-tidy.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ $# -ne 1 ] ; then 4 | echo "Usage: $0 <build-dir>" >&2 5 | exit 1 6 | fi 7 | 8 | BUILD_DIR=$1 9 | NPROC=${NPROC_CI:-$(nproc)} 10 | 11 | for candidate in run-clang-tidy-12 run-clang-tidy-11 run-clang-tidy-10 run-clang-tidy ; do 12 | if command -v $candidate >/dev/null ; then 13 | echo "Using '$candidate' to execute clang-tidy in parallel" 14 | _RUN_CLANG_TIDY=$candidate 15 | break 16 | fi 17 | done 18 | 19 | if [ -z "${_RUN_CLANG_TIDY}" ] ; then 20 | echo "run-clang-tidy not found in PATH" >&2 21 | exit 1 22 | fi 23 | 24 | ORIGINAL_COMPILE_COMMANDS="$BUILD_DIR"/compile_commands.json 25 | 26 | CXX_PROJECT_DIR="$(mktemp -d --suffix='-clang-tidy-vxsort')" 27 | jq 'map(select( (.["file"] | contains("/googletest") | not) and (.["file"] | contains("/googlebenchmark") | not) and (.["file"] | contains("/cpu_features") | not) ))' \ 28 | < "$ORIGINAL_COMPILE_COMMANDS" \ 29 | > "$CXX_PROJECT_DIR"/compile_commands.json || exit 1 30 | exec "${_RUN_CLANG_TIDY}" \ 31 | -j "${NPROC}" \ 32 | -p "$CXX_PROJECT_DIR" \ 33 | -config="$(cat .clang-tidy.yml)" 34 | 35 | -------------------------------------------------------------------------------- /cmake/CPM.cmake: -------------------------------------------------------------------------------- 1 | set(CPM_DOWNLOAD_VERSION 0.35.0) 2 | 3 | if(CPM_SOURCE_CACHE) 4 | # Expand relative path.
This is important if the provided path contains a tilde (~) 5 | get_filename_component(CPM_SOURCE_CACHE ${CPM_SOURCE_CACHE} ABSOLUTE) 6 | set(CPM_DOWNLOAD_LOCATION "${CPM_SOURCE_CACHE}/cpm/CPM_${CPM_DOWNLOAD_VERSION}.cmake") 7 | elseif(DEFINED ENV{CPM_SOURCE_CACHE}) 8 | set(CPM_DOWNLOAD_LOCATION "$ENV{CPM_SOURCE_CACHE}/cpm/CPM_${CPM_DOWNLOAD_VERSION}.cmake") 9 | else() 10 | set(CPM_DOWNLOAD_LOCATION "${CMAKE_BINARY_DIR}/cmake/CPM_${CPM_DOWNLOAD_VERSION}.cmake") 11 | endif() 12 | 13 | if(NOT (EXISTS ${CPM_DOWNLOAD_LOCATION})) 14 | message(STATUS "Downloading CPM.cmake to ${CPM_DOWNLOAD_LOCATION}") 15 | file(DOWNLOAD 16 | https://github.com/cpm-cmake/CPM.cmake/releases/download/v${CPM_DOWNLOAD_VERSION}/CPM.cmake 17 | ${CPM_DOWNLOAD_LOCATION} 18 | ) 19 | endif() 20 | 21 | include(${CPM_DOWNLOAD_LOCATION}) 22 | -------------------------------------------------------------------------------- /cmake/ConfigSafeGuards.cmake: -------------------------------------------------------------------------------- 1 | # Adapted from: https://github.com/bast/cmake-example/tree/master/cmake 2 | #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 3 | 4 | # guard against in-source builds 5 | if(${CMAKE_CURRENT_SOURCE_DIR} STREQUAL ${CMAKE_CURRENT_BINARY_DIR}) 6 | message(FATAL_ERROR "In-source builds not allowed. Please make a new directory (called a build directory) and run CMake from there.") 7 | endif() 8 | 9 | # guard against bad build-type strings 10 | if(NOT CMAKE_BUILD_TYPE) 11 | set(CMAKE_BUILD_TYPE "Debug") 12 | endif() 13 | 14 | string(TOLOWER "${CMAKE_BUILD_TYPE}" cmake_build_type_tolower) 15 | string(TOUPPER "${CMAKE_BUILD_TYPE}" cmake_build_type_toupper) 16 | if( NOT cmake_build_type_tolower STREQUAL "debug" 17 | AND NOT cmake_build_type_tolower STREQUAL "release" 18 | AND NOT cmake_build_type_tolower STREQUAL "relwithdebinfo") 19 | message(FATAL_ERROR "Unknown build type \"${CMAKE_BUILD_TYPE}\". 
Allowed values are Debug, Release, RelWithDebInfo (case-insensitive).") 20 | endif() 21 | -------------------------------------------------------------------------------- /cmake/EnableLocalGtestDiscovery.cmake: -------------------------------------------------------------------------------- 1 | if(NOT EXISTS ${CMAKE_FIND_PACKAGE_REDIRECTS_DIR}/gtest-config.cmake 2 | AND NOT EXISTS ${CMAKE_FIND_PACKAGE_REDIRECTS_DIR}/GTestConfig.cmake) 3 | file( 4 | WRITE ${CMAKE_FIND_PACKAGE_REDIRECTS_DIR}/gtest-config.cmake 5 | [=[ 6 | include(CMakeFindDependencyMacro) 7 | find_dependency(googletest) 8 | if(NOT TARGET GTest::GTest) 9 | add_library(GTest::GTest INTERFACE IMPORTED) 10 | target_link_libraries(GTest::GTest INTERFACE GTest::gtest) 11 | endif() 12 | if(NOT TARGET GTest::Main) 13 | add_library(GTest::Main INTERFACE IMPORTED) 14 | target_link_libraries(GTest::Main INTERFACE GTest::gtest_main) 15 | endif() 16 | ]=]) 17 | endif() 18 | 19 | if(NOT EXISTS ${CMAKE_FIND_PACKAGE_REDIRECTS_DIR}/gtest-config-version.cmake 20 | AND NOT EXISTS ${CMAKE_FIND_PACKAGE_REDIRECTS_DIR}/GTestConfigVersion.cmake) 21 | file( 22 | WRITE ${CMAKE_FIND_PACKAGE_REDIRECTS_DIR}/gtest-config-version.cmake 23 | [=[ 24 | include(${CMAKE_FIND_PACKAGE_REDIRECTS_DIR}/googletest-config-version.cmake OPTIONAL) 25 | if(NOT PACKAGE_VERSION_COMPATIBLE) 26 | include(${CMAKE_FIND_PACKAGE_REDIRECTS_DIR}/googletestConfigVersion.cmake OPTIONAL) 27 | endif() 28 | ]=]) 29 | endif() 30 | -------------------------------------------------------------------------------- /cmake/GetHostType.cmake: -------------------------------------------------------------------------------- 1 | set(PROCESSOR_IS_MIPS FALSE) 2 | set(PROCESSOR_IS_ARM FALSE) 3 | set(PROCESSOR_IS_AARCH64 FALSE) 4 | set(PROCESSOR_IS_X86 FALSE) 5 | set(PROCESSOR_IS_POWER FALSE) 6 | 7 | if(CMAKE_SYSTEM_PROCESSOR MATCHES "^mips") 8 | set(PROCESSOR_IS_MIPS TRUE) 9 | elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^arm") 10 | set(PROCESSOR_IS_ARM TRUE) 11 | elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^aarch64") 12 | set(PROCESSOR_IS_AARCH64 TRUE) 13 | elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "(x86_64)|(AMD64|amd64)|(^i.86$)") 14 | set(PROCESSOR_IS_X86 TRUE) 15 | elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(powerpc|ppc)") 16 | set(PROCESSOR_IS_POWER TRUE) 17 | endif() -------------------------------------------------------------------------------- /cmake/Modules/FindLLVMAr.cmake: -------------------------------------------------------------------------------- 1 | include(FeatureSummary) 2 | 3 | find_program(LLVMAR_EXECUTABLE 4 | NAMES llvm-ar 5 | DOC "The llvm-ar executable" 6 | ) 7 | 8 | include(FindPackageHandleStandardArgs) 9 | find_package_handle_standard_args(LLVMAr 10 | DEFAULT_MSG 11 | LLVMAR_EXECUTABLE) 12 | 13 | SET_PACKAGE_PROPERTIES(LLVMAr PROPERTIES 14 | URL https://llvm.org/docs/CommandGuide/llvm-ar.html 15 | DESCRIPTION "create, modify, and extract from archives" 16 | ) 17 | -------------------------------------------------------------------------------- /cmake/Modules/FindLLVMNm.cmake: -------------------------------------------------------------------------------- 1 | include(FeatureSummary) 2 | 3 | find_program(LLVMNM_EXECUTABLE 4 | NAMES llvm-nm 5 | DOC "The llvm-nm executable" 6 | ) 7 | 8 | include(FindPackageHandleStandardArgs) 9 | find_package_handle_standard_args(LLVMNm 10 | DEFAULT_MSG 11 | LLVMNM_EXECUTABLE) 12 | 13 | SET_PACKAGE_PROPERTIES(LLVMNm PROPERTIES 14 | URL https://llvm.org/docs/CommandGuide/llvm-nm.html 15 | DESCRIPTION "list LLVM bitcode and object file's symbol
table" 16 | ) 17 | -------------------------------------------------------------------------------- /cmake/Modules/FindLLVMRanLib.cmake: -------------------------------------------------------------------------------- 1 | include(FeatureSummary) 2 | 3 | find_program(LLVMRANLIB_EXECUTABLE 4 | NAMES llvm-ranlib 5 | DOC "The llvm-ranlib executable" 6 | ) 7 | 8 | include(FindPackageHandleStandardArgs) 9 | find_package_handle_standard_args(LLVMRanLib 10 | DEFAULT_MSG 11 | LLVMRANLIB_EXECUTABLE) 12 | 13 | SET_PACKAGE_PROPERTIES(LLVMRanLib PROPERTIES 14 | DESCRIPTION "generate index for LLVM archive" 15 | ) 16 | -------------------------------------------------------------------------------- /demo/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set(TARGET_NAME ${CMAKE_PROJECT_NAME}_demo) 2 | 3 | set(demo_SOURCES 4 | demo.cpp) 5 | 6 | if (${PROCESSOR_IS_X86}) 7 | list(APPEND demo_SOURCES 8 | do_avx2.cpp 9 | do_avx512.cpp) 10 | endif() 11 | 12 | add_executable(${CMAKE_PROJECT_NAME}_demo ${demo_SOURCES}) 13 | 14 | target_link_libraries(${TARGET_NAME} ${CMAKE_PROJECT_NAME}_lib) 15 | -------------------------------------------------------------------------------- /demo/demo.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | using namespace std; 6 | 7 | #include "isa_detection.h" 8 | 9 | using vxsort::vector_machine; 10 | using namespace vxsort::types; 11 | 12 | extern void do_avx2(i64 *begin, i64 *end); 13 | extern void do_avx512(i64 *begin, i64 *end); 14 | 15 | std::vector generate_random_garbage(const usize size) { 16 | 17 | auto vec = std::vector(size); 18 | std::iota(vec.begin(), vec.end(), 666); 19 | 20 | std::random_device rd; 21 | std::mt19937 g(rd()); 22 | 23 | std::shuffle(vec.begin(), vec.end(), g); 24 | return vec; 25 | } 26 | 27 | int main(int argc, char** argv) { 28 | if (argc != 2) { 29 | fprintf(stderr, "demo array size must be specified\n"); 30 | return -1; 31 | } 32 | 33 | const size_t vector_size = atoi(argv[1]); 34 | auto v = generate_random_garbage(vector_size); 35 | 36 | const auto begin = v.data(); 37 | const auto end = begin + vector_size - 1; 38 | 39 | #if defined(CPU_FEATURES_ARCH_X86) 40 | if (vxsort::supports_vector_machine(vxsort::vector_machine::AVX512)) { 41 | fprintf(stderr, "Sorting with AVX512..."); 42 | do_avx512(begin, end); 43 | fprintf(stderr, "...done!\n"); 44 | } else if (vxsort::supports_vector_machine(vxsort::vector_machine::AVX2)) { 45 | fprintf(stderr, "Sorting with AVX2..."); 46 | do_avx2(begin, end); 47 | fprintf(stderr, "...done!\n"); 48 | } else 49 | #endif 50 | #if defined(CPU_FEATURES_ARCH_AARCH64) 51 | if (vxsort::supports_vector_machine(vxsort::vector_machine::NEON)) { 52 | } else 53 | #endif 54 | 55 | { 56 | fprintf(stderr, "CPU doesn't seem to support any vectorized ISA, bye-bye\n"); 57 | return -2; 58 | } 59 | 60 | return 0; 61 | } 62 | -------------------------------------------------------------------------------- /demo/do_avx2.cpp: -------------------------------------------------------------------------------- 1 | #include "vxsort_targets_enable_avx2.h" 2 | 3 | #include "vxsort.avx2.h" 4 | 5 | using namespace vxsort::types; 6 | 7 | void do_avx2(i64 *begin, i64 *end) { 8 | auto sorter = vxsort::vxsort(); 9 | sorter.sort(begin, end); 10 | } 11 | #include "vxsort_targets_disable.h" 12 | -------------------------------------------------------------------------------- /demo/do_avx512.cpp: 
-------------------------------------------------------------------------------- 1 | #include "vxsort_targets_enable_avx512.h" 2 | 3 | #include "vxsort.avx512.h" 4 | 5 | using namespace vxsort::types; 6 | 7 | void do_avx512(i64 *begin, i64 *end) { 8 | auto sorter = vxsort::vxsort<i64, vxsort::vector_machine::AVX512>(); 9 | sorter.sort(begin, end); 10 | } 11 | #include "vxsort_targets_disable.h" 12 | -------------------------------------------------------------------------------- /tests/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set(TARGET_NAME ${CMAKE_PROJECT_NAME}_tests) 2 | 3 | include(GoogleTest) 4 | 5 | set(test_HEADERS 6 | smallsort/smallsort_test.h 7 | fullsort/fullsort_test.h 8 | mini_tests/pack_machine_test.h 9 | mini_tests/partition_machine_test.h 10 | mini_tests/masked_load_store_test.h 11 | test_isa.h) 12 | 13 | list(APPEND i_sort_types 14 | i16 15 | i32 16 | i64 17 | ) 18 | 19 | list(APPEND u_sort_types 20 | u16 21 | u32 22 | u64 23 | ) 24 | 25 | list(APPEND f_sort_types 26 | f32 27 | f64 28 | ) 29 | 30 | list(APPEND sort_types 31 | i 32 | u 33 | f 34 | ) 35 | 36 | list(APPEND x86_isas 37 | avx2 38 | avx512 39 | ) 40 | 41 | list(APPEND test_SOURCES 42 | gtest_main.cpp 43 | mini_tests/masked_load_store.sanity.cpp 44 | ) 45 | 46 | if (${PROCESSOR_IS_X86}) 47 | set(test_avx2_SOURCES ${test_SOURCES}) 48 | list(APPEND test_avx2_SOURCES 49 | smallsort/smallsort.avx2.cpp 50 | fullsort/fullsort.avx2.cpp 51 | mini_tests/masked_load_store.avx2.cpp 52 | mini_tests/partition_machine.avx2.cpp 53 | mini_tests/pack_machine.avx2.cpp 54 | ) 55 | 56 | set(test_avx512_SOURCES ${test_SOURCES}) 57 | list(APPEND test_avx512_SOURCES 58 | smallsort/smallsort.avx512.cpp 59 | fullsort/fullsort.avx512.cpp 60 | mini_tests/masked_load_store.avx512.cpp 61 | mini_tests/partition_machine.avx512.cpp 62 | mini_tests/pack_machine.avx512.cpp 63 | ) 64 | 65 | 66 | 67 | foreach(v ${x86_isas}) 68 | foreach(tf ${sort_types}) 69 | string(TOUPPER ${v} vu) 70 | add_executable(${TARGET_NAME}_${v}_${tf} ${test_${v}_SOURCES} ${test_HEADERS}) 71 | 72 | foreach(t ${${tf}_sort_types}) 73 | string(TOUPPER ${t} tu) 74 | target_compile_definitions(${TARGET_NAME}_${v}_${tf} PRIVATE VXSORT_TEST_${vu}_${tu}) 75 | endforeach () 76 | 77 | target_link_libraries(${TARGET_NAME}_${v}_${tf} 78 | ${CMAKE_PROJECT_NAME}_lib 79 | Backward::Backward 80 | GTest::gtest 81 | ) 82 | 83 | add_test(${TARGET_NAME}_${v}_${tf} ${TARGET_NAME}_${v}_${tf}) 84 | endforeach() 85 | endforeach() 86 | 87 | endif() 88 | 89 | -------------------------------------------------------------------------------- /tests/fullsort/fullsort_test.h: -------------------------------------------------------------------------------- 1 | #ifndef VXSORT_FULLSORT_TEST_H 2 | #define VXSORT_FULLSORT_TEST_H 3 | 4 | #include <algorithm> 5 | #include <random> 6 | #include <vector> 7 | #include <fmt/format.h> 8 | 9 | #include "../test_isa.h" 10 | #include "vxsort.h" 11 | 12 | namespace vxsort_tests { 13 | using namespace vxsort::types; 14 | using ::vxsort::vector_machine; 15 | 16 | template <typename T, vector_machine M> 17 | void vxsort_test(std::vector<T>& V) { 18 | VXSORT_TEST_ISA(); 19 | 20 | auto v_copy = std::vector<T>(V); 21 | auto begin = V.data(); 22 | auto end = V.data() + V.size() - 1; 23 | 24 | auto sorter = ::vxsort::vxsort<T, M>(); 25 | sorter.sort(begin, end); 26 | 27 | std::sort(v_copy.begin(), v_copy.end()); 28 | usize size = v_copy.size(); 29 | for (usize i = 0; i < size; ++i) { 30 | if (v_copy[i] != V[i]) { 31 | GTEST_FAIL() << fmt::format("value at idx #{} {} != {}", i, v_copy[i], V[i]); 32 | } 33 | } 34 | } 35 | 36 | template <typename T, vector_machine M> 37 |
void vxsort_hinted_test(std::vector<T>& V, T min_value, T max_value) { 38 | VXSORT_TEST_ISA(); 39 | 40 | auto v_copy = std::vector<T>(V); 41 | auto begin = V.data(); 42 | auto end = V.data() + V.size() - 1; 43 | 44 | auto sorter = ::vxsort::vxsort<T, M>(); 45 | sorter.sort(begin, end, min_value, max_value); 46 | 47 | std::sort(v_copy.begin(), v_copy.end()); 48 | usize size = v_copy.size(); 49 | for (usize i = 0; i < size; ++i) { 50 | if (v_copy[i] != V[i]) { 51 | GTEST_FAIL() << fmt::format("value at idx #{} {} != {}", i, v_copy[i], V[i]); 52 | } 53 | } 54 | 55 | } 56 | 57 | } 58 | 59 | #endif // VXSORT_FULLSORT_TEST_H 60 | -------------------------------------------------------------------------------- /tests/gtest_main.cpp: -------------------------------------------------------------------------------- 1 | #include <cstdio> 2 | #include <backward.hpp> 3 | 4 | #include "gtest/gtest.h" 5 | 6 | #if defined(GTEST_OS_ESP8266) || defined(GTEST_OS_ESP32) 7 | // Arduino-like platforms: program entry points are setup/loop instead of main. 8 | 9 | #ifdef GTEST_OS_ESP8266 10 | extern "C" { 11 | #endif 12 | 13 | void setup() { testing::InitGoogleTest(); } 14 | 15 | void loop() { RUN_ALL_TESTS(); } 16 | 17 | #ifdef GTEST_OS_ESP8266 18 | } 19 | #endif 20 | 21 | #elif defined(GTEST_OS_QURT) 22 | // QuRT: program entry point is main, but argc/argv are unusable. 23 | 24 | GTEST_API_ int main() { 25 | printf("Running main() from %s\n", __FILE__); 26 | testing::InitGoogleTest(); 27 | return RUN_ALL_TESTS(); 28 | } 29 | #else 30 | // Normal platforms: program entry point is main, argc/argv are initialized. 31 | 32 | GTEST_API_ int main(int argc, char **argv) { 33 | backward::SignalHandling sh; 34 | 35 | testing::InitGoogleTest(&argc, argv); 36 | return RUN_ALL_TESTS(); 37 | } 38 | #endif -------------------------------------------------------------------------------- /tests/mini_tests/masked_load_store.avx2.cpp: -------------------------------------------------------------------------------- 1 | #include "vxsort_targets_enable_avx2.h" 2 | 3 | #include "masked_load_store_test.h" 4 | #include 5 | 6 | namespace vxsort_tests { 7 | using namespace vxsort::types; 8 | using VM = vxsort::vector_machine; 9 | 10 | template <typename T> 11 | using AVX2MaskedLoadStoreTest = PageWithLavaBoundariesFixture<T, VM::AVX2>; 12 | 13 | using TestTypes = ::testing::Types< 14 | #ifdef VXSORT_TEST_AVX2_I16 15 | i16, i32, i64 16 | #endif 17 | #ifdef VXSORT_TEST_AVX2_U16 18 | u16, u32, u64 19 | #endif 20 | #ifdef VXSORT_TEST_AVX2_F32 21 | f32, f64 22 | #endif 23 | >; 24 | TYPED_TEST_SUITE(AVX2MaskedLoadStoreTest, TestTypes); 25 | 26 | TYPED_TEST(AVX2MaskedLoadStoreTest, PrefixLoadOnPageBoundaryWorks) { 27 | test_prefix_mask_load_on_page_boundary<TypeParam, VM::AVX2>(this); 28 | } 29 | 30 | TYPED_TEST(AVX2MaskedLoadStoreTest, SuffixLoadOnPageBoundaryWorks) { 31 | test_suffix_mask_load_on_page_boundary<TypeParam, VM::AVX2>(this); 32 | } 33 | 34 | TYPED_TEST(AVX2MaskedLoadStoreTest, LeftAlignmentWorks) { 35 | test_left_alignment_and_masked_loads<TypeParam, VM::AVX2>(this); 36 | } 37 | 38 | TYPED_TEST(AVX2MaskedLoadStoreTest, RightAlignmentWorks) { 39 | test_right_alignment_and_masked_loads<TypeParam, VM::AVX2>(this); 40 | } 41 | 42 | 43 | }; 44 | 45 | #include "vxsort_targets_disable.h" 46 | -------------------------------------------------------------------------------- /tests/mini_tests/masked_load_store.avx512.cpp: -------------------------------------------------------------------------------- 1 | #include "vxsort_targets_enable_avx512.h" 2 | 3 | #include "masked_load_store_test.h" 4 | #include 5 | 6 | namespace vxsort_tests { 7 | using namespace vxsort::types; 8 | using VM
= vxsort::vector_machine; 9 | 10 | template 11 | using AVX512MaskedLoadStoreTest = PageWithLavaBoundariesFixture; 12 | 13 | using TestTypes = ::testing::Types< 14 | #ifdef VXSORT_TEST_AVX512_I16 15 | i16, i32, i64 16 | #endif 17 | #ifdef VXSORT_TEST_AVX512_U16 18 | u16, u32, u64 19 | #endif 20 | #ifdef VXSORT_TEST_AVX512_F32 21 | f32, f64 22 | #endif 23 | >; 24 | 25 | TYPED_TEST_SUITE(AVX512MaskedLoadStoreTest, TestTypes); 26 | 27 | TYPED_TEST(AVX512MaskedLoadStoreTest, PrefixLoadOnPageBoundaryWorks) { 28 | test_prefix_mask_load_on_page_boundary(this); 29 | } 30 | 31 | TYPED_TEST(AVX512MaskedLoadStoreTest, SuffixLoadOnPageBoundaryWorks) { 32 | test_suffix_mask_load_on_page_boundary(this); 33 | } 34 | 35 | TYPED_TEST(AVX512MaskedLoadStoreTest, LeftAlignmentWorks) { 36 | test_left_alignment_and_masked_loads(this); 37 | } 38 | 39 | TYPED_TEST(AVX512MaskedLoadStoreTest, RightAlignmentWorks) { 40 | test_right_alignment_and_masked_loads(this); 41 | } 42 | 43 | }; 44 | 45 | #include "vxsort_targets_disable.h" 46 | -------------------------------------------------------------------------------- /tests/mini_tests/masked_load_store.sanity.cpp: -------------------------------------------------------------------------------- 1 | #include "vxsort_targets_enable_avx2.h" 2 | 3 | #include "masked_load_store_test.h" 4 | 5 | namespace vxsort_tests { 6 | using namespace vxsort::types; 7 | using VM = vxsort::vector_machine; 8 | 9 | template 10 | using MaskedLoadStoreDeathTest = PageWithLavaBoundariesFixture; 11 | 12 | using TestTypes = ::testing::Types; 13 | TYPED_TEST_SUITE(MaskedLoadStoreDeathTest, TestTypes); 14 | 15 | TYPED_TEST(MaskedLoadStoreDeathTest, IsSane) { 16 | EXPECT_EQ(*this->page_with_data, this->get_expected_value(this->page_with_data)); 17 | } 18 | 19 | 20 | TYPED_TEST(MaskedLoadStoreDeathTest, WhatKillsMeMakesMeStronger1) { 21 | ASSERT_DEATH(*((volatile i32 *) this->page_with_data - 1), ""); 22 | } 23 | 24 | TYPED_TEST(MaskedLoadStoreDeathTest, WhatKillsMeMakesMeStronger2) { 25 | ASSERT_DEATH(*((volatile i32 *) this->page_with_data + this->num_elements), ""); 26 | } 27 | 28 | }; 29 | 30 | #include "vxsort_targets_disable.h" 31 | -------------------------------------------------------------------------------- /tests/mini_tests/masked_load_store_test.h: -------------------------------------------------------------------------------- 1 | #ifndef VXSORT_MASKED_LOAD_STORE_TEST_H 2 | #define VXSORT_MASKED_LOAD_STORE_TEST_H 3 | 4 | #include 5 | #include "mini_fixtures.h" 6 | 7 | #include "defs.h" 8 | #include "vector_machine/machine_traits.h" 9 | #include "../test_isa.h" 10 | #include "alignment.h" 11 | 12 | namespace vxsort_tests { 13 | using namespace vxsort::types; 14 | using VM = vxsort::vector_machine; 15 | 16 | template 17 | void test_prefix_mask_load_on_page_boundary(PageWithLavaBoundariesFixture *fixture) 18 | { 19 | VXSORT_TEST_ISA(); 20 | 21 | using VMT = vxsort::vxsort_machine_traits; 22 | static constexpr auto MAX = std::numeric_limits::max(); 23 | const auto MAXV = VMT::broadcast(MAX); 24 | 25 | for (auto w = 1; w < VMT::N; w++) { 26 | auto mask = VMT::generate_prefix_mask(w); 27 | auto *load_addr = fixture->page_with_data + fixture->num_elements - w; 28 | auto result = VMT::load_partial_vec((typename VMT::TV *) load_addr, MAXV, mask); 29 | auto &res_array = reinterpret_cast(result); 30 | 31 | for (auto i = 0; i < w; ++i) 32 | ASSERT_EQ(res_array[i], fixture->get_expected_value(load_addr + i)); 33 | for (auto i = w; i < VMT::N; ++i) 34 | ASSERT_EQ(res_array[i], MAX); 35 | } 36 | } 37 | 
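// An illustrative, standalone sketch of the partial-load pattern the tests in
// this header exercise (not part of the test suite). It assumes an AVX2 i32
// machine, so VMT::N == 8; the buffer contents and the width of 3 are made up
// for the example, but every helper named here (broadcast,
// generate_prefix_mask, load_partial_vec) is the same traits API used above:
//
//   using VMT = vxsort::vxsort_machine_traits<i32, vxsort::vector_machine::AVX2>;
//   i32 buf[VMT::N] = {10, 20, 30, 40, 50, 60, 70, 80};
//   const auto MAXV = VMT::broadcast(std::numeric_limits<i32>::max());
//   auto mask = VMT::generate_prefix_mask(3);
//   // Reads only lanes 0..2 from memory; lanes 3..7 are taken from MAXV, so
//   // nothing past buf[2] is ever touched -- which is exactly why these loads
//   // are safe right up against a protected ("lava") page.
//   auto v = VMT::load_partial_vec((typename VMT::TV *) buf, MAXV, mask);
//   // v now holds {10, 20, 30, MAX, MAX, MAX, MAX, MAX}.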
38 | template 39 | void test_suffix_mask_load_on_page_boundary(PageWithLavaBoundariesFixture *fixture) 40 | { 41 | VXSORT_TEST_ISA() 42 | 43 | using VMT = vxsort::vxsort_machine_traits; 44 | static constexpr auto MAX = std::numeric_limits::max(); 45 | const auto MAXV = VMT::broadcast(MAX); 46 | 47 | for (auto w = 1; w < VMT::N; w++) { 48 | auto mask = VMT::generate_suffix_mask(w); 49 | auto *load_addr = fixture->page_with_data - w; 50 | auto result = VMT::load_partial_vec((typename VMT::TV *) load_addr, MAXV, mask); 51 | auto &res_array = reinterpret_cast(result); 52 | 53 | for (auto i = 0; i < w; ++i) 54 | ASSERT_EQ(res_array[i], MAX); 55 | for (auto i = w; i < VMT::N; ++i) 56 | ASSERT_EQ(res_array[i], fixture->get_expected_value(load_addr + i)); 57 | } 58 | } 59 | 60 | template 61 | void test_left_alignment_and_masked_loads(PageWithLavaBoundariesFixture *fixture) 62 | { 63 | VXSORT_TEST_ISA(); 64 | 65 | using VMT = vxsort::vxsort_machine_traits; 66 | using AH = vxsort::alignment_hint; 67 | 68 | static constexpr auto MAX = std::numeric_limits::max(); 69 | const auto MAXV = VMT::broadcast(MAX); 70 | 71 | for (auto w = 0; w < VMT::N; w++) { 72 | auto *load_addr = fixture->page_with_data + w; 73 | 74 | AH align; 75 | align.calc_left_alignment(load_addr); 76 | auto mask = VMT::generate_suffix_mask(align.left_masked_amount); 77 | 78 | load_addr -= align.left_masked_amount; 79 | 80 | ASSERT_TRUE(AH::is_aligned(load_addr)); 81 | 82 | auto result = VMT::load_partial_vec((typename VMT::TV *) load_addr, MAXV, mask); 83 | auto &res_array = reinterpret_cast(result); 84 | 85 | for (auto i = 0; i < align.left_masked_amount; ++i) 86 | ASSERT_EQ(res_array[i], MAX); 87 | for (auto i = align.left_masked_amount; i < VMT::N; ++i) 88 | ASSERT_EQ(res_array[i], fixture->get_expected_value(load_addr + i)); 89 | } 90 | } 91 | 92 | template 93 | void test_right_alignment_and_masked_loads(PageWithLavaBoundariesFixture *fixture) 94 | { 95 | VXSORT_TEST_ISA(); 96 | 97 | using VMT = vxsort::vxsort_machine_traits; 98 | using AH = vxsort::alignment_hint; 99 | 100 | static constexpr auto MAX = std::numeric_limits::max(); 101 | const auto MAXV = VMT::broadcast(MAX); 102 | 103 | for (auto w = 0; w < VMT::N; w++) { 104 | auto *load_addr = fixture->page_with_data + fixture->num_elements - w; 105 | 106 | AH align; 107 | align.calc_right_alignment(load_addr); 108 | 109 | load_addr -= align.right_unmasked_amount; 110 | 111 | ASSERT_TRUE(AH::is_aligned(load_addr)); 112 | 113 | auto mask = VMT::generate_prefix_mask(align.right_unmasked_amount); 114 | auto result = VMT::load_partial_vec((typename VMT::TV *) load_addr, MAXV, mask); 115 | auto &res_array = reinterpret_cast(result); 116 | 117 | for (auto i = 0; i < align.right_unmasked_amount; ++i) 118 | ASSERT_EQ(res_array[i], fixture->get_expected_value(load_addr + i)); 119 | for (auto i = align.right_unmasked_amount; i < VMT::N; ++i) 120 | ASSERT_EQ(res_array[i], MAX); 121 | } 122 | } 123 | 124 | 125 | }; 126 | 127 | #endif //VXSORT_MASKED_LOAD_STORE_TEST_H 128 | -------------------------------------------------------------------------------- /tests/mini_tests/mini_fixtures.h: -------------------------------------------------------------------------------- 1 | #ifndef VXSORT_MINI_FIXTURES_H 2 | #define VXSORT_MINI_FIXTURES_H 3 | 4 | #include 5 | #ifndef _WIN32 6 | #include 7 | #else 8 | #ifndef NOMINMAX 9 | # define NOMINMAX 10 | #endif 11 | #define WIN32_LEAN_AND_MEAN 1 12 | #include 13 | #endif 14 | 15 | #include "defs.h" 16 | #include "vector_machine/machine_traits.h" 17 | 
#include "isa_detection.h" 18 | #include "alignment.h" 19 | 20 | namespace vxsort_tests { 21 | using namespace vxsort::types; 22 | using VM = vxsort::vector_machine; 23 | 24 | static inline usize get_page_size() 25 | { 26 | usize page_size; 27 | #ifdef WIN32 28 | SYSTEM_INFO sys_info; 29 | GetSystemInfo(&sys_info); 30 | page_size = sys_info.dwPageSize; 31 | #else 32 | page_size = sysconf(_SC_PAGESIZE); 33 | #endif 34 | return page_size; 35 | } 36 | 37 | static const i32 page_size = get_page_size(); 38 | 39 | template 40 | class PageWithLavaBoundariesFixture : public ::testing::Test { 41 | using VMT = vxsort::vxsort_machine_traits; 42 | static constexpr i32 N = VMT::N; 43 | static_assert(N < 256, "N must be < 256"); 44 | 45 | protected: 46 | 47 | u8 *create_mapping_with_boundary_pages() { 48 | #ifdef WIN32 49 | auto *mem = (u8 *) VirtualAlloc(nullptr, 3*page_size, MEM_COMMIT, PAGE_READWRITE); 50 | DWORD old_protect; 51 | VirtualProtect(mem, page_size, PAGE_NOACCESS, &old_protect); 52 | VirtualProtect(mem + 2*page_size, page_size, PAGE_NOACCESS, &old_protect); 53 | return mem; 54 | #else 55 | auto *mem = (u8 *) mmap(nullptr, 3*page_size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); 56 | // Make the first and last inaccessible 57 | mprotect(mem, page_size, PROT_NONE); 58 | mprotect(mem + 2*page_size, page_size, PROT_NONE); 59 | return mem; 60 | #endif 61 | } 62 | 63 | void SetUp() override { 64 | // Map 3 pages 65 | mem = create_mapping_with_boundary_pages(); 66 | generate_expected_values(); 67 | num_elements = page_size / sizeof(T); 68 | 69 | page_with_data = reinterpret_cast(mem + page_size); 70 | for (usize i = 0; i < num_elements; i++) { 71 | auto *p = &page_with_data[i]; 72 | *p = get_expected_value(p); 73 | } 74 | } 75 | 76 | void destroy_mapping() { 77 | #ifdef WIN32 78 | VirtualFree(mem, 3*page_size, MEM_DECOMMIT); 79 | #else 80 | munmap(mem, 3*page_size); 81 | #endif 82 | 83 | } 84 | 85 | void TearDown() override { 86 | destroy_mapping(); 87 | } 88 | 89 | void generate_expected_values() 90 | { 91 | static constexpr T max_value = for_packing ? 
(T)std::numeric_limits::max() : (T)std::numeric_limits::max(); 92 | for (auto n = 0; n < N; n++) { 93 | expected_values[n] = (T)n+1; 94 | ASSERT_LE(expected_values[n], max_value); 95 | } 96 | } 97 | 98 | T expected_values[N]; 99 | u8 *mem; 100 | 101 | public: 102 | T get_expected_value(const T *p) 103 | { 104 | const auto offset_in_elements = (((usize) p) / sizeof(T)) & (N-1); 105 | 106 | return expected_values[offset_in_elements]; 107 | } 108 | 109 | 110 | T *page_with_data; 111 | usize num_elements; 112 | 113 | }; 114 | 115 | }; 116 | 117 | #endif //VXSORT_MINI_FIXTURES_H 118 | -------------------------------------------------------------------------------- /tests/mini_tests/pack_machine.avx2.cpp: -------------------------------------------------------------------------------- 1 | #include "vxsort_targets_enable_avx2.h" 2 | 3 | #include 4 | 5 | #include 6 | #include 7 | #include "pack_machine_test.h" 8 | 9 | namespace vxsort_tests { 10 | using namespace vxsort::types; 11 | using VM = vxsort::vector_machine; 12 | 13 | template 14 | using PackMachineAVX2Test = PackMachineTest; 15 | 16 | using TestTypes = ::testing::Types< 17 | #ifdef VXSORT_TEST_AVX2_I16 18 | i16, i32, i64 19 | #endif 20 | #ifdef VXSORT_TEST_AVX2_U16 21 | u16, u32, u64 22 | #endif 23 | #ifdef VXSORT_TEST_AVX2_F32 24 | f32, f64 25 | #endif 26 | >; 27 | TYPED_TEST_SUITE(PackMachineAVX2Test, TestTypes); 28 | 29 | TYPED_TEST(PackMachineAVX2Test, PackingWorks) { 30 | test_packunpack(this); 31 | } 32 | 33 | }; 34 | 35 | #include "vxsort_targets_disable.h" 36 | -------------------------------------------------------------------------------- /tests/mini_tests/pack_machine.avx512.cpp: -------------------------------------------------------------------------------- 1 | #include "vxsort_targets_enable_avx512.h" 2 | 3 | #include 4 | 5 | #include 6 | #include 7 | 8 | #include "pack_machine_test.h" 9 | 10 | namespace vxsort_tests { 11 | using namespace vxsort::types; 12 | using VM = vxsort::vector_machine; 13 | 14 | template 15 | using PackMachineAVX512Test = PackMachineTest; 16 | 17 | using TestTypes = ::testing::Types< 18 | #ifdef VXSORT_TEST_AVX512_I16 19 | i16, i32, i64 20 | #endif 21 | #ifdef VXSORT_TEST_AVX512_U16 22 | u16, u32, u64 23 | #endif 24 | #ifdef VXSORT_TEST_AVX512_F32 25 | f32, f64 26 | #endif 27 | >; 28 | TYPED_TEST_SUITE(PackMachineAVX512Test, TestTypes); 29 | 30 | TYPED_TEST(PackMachineAVX512Test, PackingWorks) { 31 | test_packunpack(this); 32 | } 33 | 34 | }; 35 | 36 | #include "vxsort_targets_disable.h" 37 | -------------------------------------------------------------------------------- /tests/mini_tests/pack_machine_test.h: -------------------------------------------------------------------------------- 1 | #ifndef VXSORT_PACK_MACHINE_TEST_H 2 | #define VXSORT_PACK_MACHINE_TEST_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include "mini_fixtures.h" 9 | 10 | #include "defs.h" 11 | #include "vector_machine/machine_traits.h" 12 | #include "../test_isa.h" 13 | #include "alignment.h" 14 | 15 | namespace vxsort_tests { 16 | using namespace vxsort::types; 17 | using VM = vxsort::vector_machine; 18 | 19 | template 20 | using PackMachineTest = PageWithLavaBoundariesFixture; 21 | 22 | template 23 | void test_packunpack(PackMachineTest *fixture) 24 | { 25 | VXSORT_TEST_ISA(); 26 | 27 | if (!::vxsort::supports_vector_machine(sizeof(T)/2)) { 28 | GTEST_SKIP_("Current CPU does not support the minimal features for this test"); 29 | return; 30 | } 31 | 32 | using VMT = vxsort::vxsort_machine_traits; 33 | 34 | if 
(!VMT::supports_packing()) { 35 | GTEST_SKIP_("primitive type does not support packing"); 36 | return; 37 | } 38 | 39 | using PM = vxsort::pack_machine; 40 | static constexpr auto N = VMT::N; 41 | 42 | auto *load_addr = fixture->page_with_data; 43 | auto s = std::span(load_addr, N*2); 44 | const auto [min, max] = std::minmax_element(s.begin(), s.end()); 45 | 46 | ASSERT_TRUE(VMT::template can_pack<0>(*max - *min)); 47 | 48 | auto d1 = VMT::load_vec((typename VMT::TV *) load_addr); 49 | auto d2 = VMT::load_vec((typename VMT::TV *) load_addr + 1); 50 | 51 | auto constexpr MIN = T(std::numeric_limits::min()); 52 | auto offset = VMT::template shift_n_sub<0>(*min, MIN); 53 | const auto offset_v = VMT::broadcast(offset); 54 | 55 | auto packed_v = PM::pack_vectors(d1, d2, offset_v); 56 | 57 | typename VMT::TV u1, u2; 58 | 59 | PM::unpack_vectors(offset_v, packed_v, u1, u2); 60 | 61 | T spill[N*2]; 62 | VMT::store_vec((typename VMT::TV *) spill, u1); 63 | VMT::store_vec((typename VMT::TV *) spill+1, u2); 64 | 65 | std::vector orig(s.begin(), s.end()); 66 | for (auto u : spill) { 67 | auto it = std::find(orig.begin(), orig.end(), u); 68 | if (it == orig.end()) { 69 | GTEST_FAIL() << fmt::format("Expected to find unpacked value {} in {}", u, fmt::join(s, ", ")); 70 | } 71 | orig.erase(it); 72 | } 73 | ASSERT_EQ(orig.size(), 0); 74 | } 75 | 76 | }; 77 | 78 | #endif //VXSORT_PACK_MACHINE_TEST_H 79 | -------------------------------------------------------------------------------- /tests/mini_tests/partition_machine.avx2.cpp: -------------------------------------------------------------------------------- 1 | #include "vxsort_targets_enable_avx2.h" 2 | 3 | #include 4 | 5 | #include 6 | #include "partition_machine_test.h" 7 | 8 | namespace vxsort_tests { 9 | using namespace vxsort::types; 10 | using VM = vxsort::vector_machine; 11 | 12 | template 13 | using PartitionMachineAVX2Test = PageWithLavaBoundariesFixture; 14 | 15 | using TestTypes = ::testing::Types< 16 | #ifdef VXSORT_TEST_AVX2_I16 17 | i16, i32, i64 18 | #endif 19 | #ifdef VXSORT_TEST_AVX2_U16 20 | u16, u32, u64 21 | #endif 22 | #ifdef VXSORT_TEST_AVX2_F32 23 | f32, f64 24 | #endif 25 | >; 26 | TYPED_TEST_SUITE(PartitionMachineAVX2Test, TestTypes); 27 | 28 | TYPED_TEST(PartitionMachineAVX2Test, PartitioningWorks) { 29 | test_partition(this); 30 | } 31 | 32 | TYPED_TEST(PartitionMachineAVX2Test, PartitioningIsStable) { 33 | test_partition_stability(this); 34 | } 35 | 36 | 37 | TYPED_TEST(PartitionMachineAVX2Test, PartitionAlignmentWorks) { 38 | test_partition_alignment(this); 39 | } 40 | 41 | }; 42 | 43 | #include "vxsort_targets_disable.h" 44 | -------------------------------------------------------------------------------- /tests/mini_tests/partition_machine.avx512.cpp: -------------------------------------------------------------------------------- 1 | #include "vxsort_targets_enable_avx512.h" 2 | 3 | #include 4 | 5 | #include 6 | #include "partition_machine_test.h" 7 | 8 | namespace vxsort_tests { 9 | using namespace vxsort::types; 10 | using VM = vxsort::vector_machine; 11 | 12 | template 13 | using PartitionMachineAVX512Test = PageWithLavaBoundariesFixture; 14 | 15 | using TestTypes = ::testing::Types< 16 | #ifdef VXSORT_TEST_AVX512_I16 17 | i16, i32, i64 18 | #endif 19 | #ifdef VXSORT_TEST_AVX512_U16 20 | u16, u32, u64 21 | #endif 22 | #ifdef VXSORT_TEST_AVX512_F32 23 | f32, f64 24 | #endif 25 | >; 26 | TYPED_TEST_SUITE(PartitionMachineAVX512Test, TestTypes); 27 | 28 | TYPED_TEST(PartitionMachineAVX512Test, PartitioningWorks) { 29 | 
test_partition(this); 30 | } 31 | 32 | TYPED_TEST(PartitionMachineAVX512Test, PartitioningIsStable) { 33 | test_partition_stability(this); 34 | } 35 | 36 | 37 | TYPED_TEST(PartitionMachineAVX512Test, PartitionAlignmentWorks) { 38 | test_partition_alignment(this); 39 | } 40 | 41 | }; 42 | 43 | #include "vxsort_targets_disable.h" 44 | -------------------------------------------------------------------------------- /tests/mini_tests/partition_machine_test.h: -------------------------------------------------------------------------------- 1 | #ifndef VXSORT_PARTITION_MACHINE_TEST_H 2 | #define VXSORT_PARTITION_MACHINE_TEST_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include "mini_fixtures.h" 9 | 10 | #include "defs.h" 11 | #include "vector_machine/machine_traits.h" 12 | #include "../test_isa.h" 13 | #include "alignment.h" 14 | 15 | namespace vxsort_tests { 16 | using namespace vxsort::types; 17 | using VM = vxsort::vector_machine; 18 | 19 | template 20 | void test_partition(PageWithLavaBoundariesFixture *fixture) 21 | { 22 | VXSORT_TEST_ISA(); 23 | 24 | using VMT = vxsort::vxsort_machine_traits; 25 | using PM = vxsort::partition_machine; 26 | static constexpr auto N = VMT::N; 27 | 28 | for (auto p = 0; p < VMT::N; p++) { 29 | auto *load_addr = fixture->page_with_data; 30 | auto pivot = fixture->get_expected_value(load_addr + p) - 1; 31 | 32 | auto s = std::span(load_addr, N); 33 | std::random_device rd; 34 | std::mt19937 gen{rd()}; 35 | std::shuffle(s.begin(), s.end(), gen); 36 | 37 | auto PV = VMT::broadcast(pivot); 38 | 39 | auto data = VMT::load_vec((typename VMT::TV *) load_addr); 40 | 41 | T spill_left[N*2]; 42 | T spill_right[N*2]; 43 | 44 | T* RESTRICT spill_left_end = spill_left; 45 | // partition_block expects the left/right *write* pointers to point 46 | // to the next vector write position, for right write pointer 47 | // this means N elements BEFORE the end of the spill buffer 48 | T* RESTRICT spill_right_start = spill_right + N; 49 | T* RESTRICT spill_right_end = spill_right_start; 50 | 51 | memset(spill_left, 0x66, sizeof(spill_left)); 52 | memset(spill_right, 0x66, sizeof(spill_right)); 53 | 54 | PM::partition_block(data, PV, spill_left_end, spill_right_end); 55 | 56 | ASSERT_EQ(spill_left_end - spill_left, p); 57 | ASSERT_EQ(spill_right_start - spill_right_end, N - p); 58 | 59 | for (auto i = 0; i < p; ++i) { 60 | ASSERT_TRUE(spill_left[i] <= pivot); 61 | } 62 | 63 | for (auto i = VMT::N - 1; i >= p; --i) { 64 | ASSERT_TRUE(spill_right_start[i] > pivot); 65 | 66 | } 67 | } 68 | } 69 | 70 | 71 | template 72 | void test_partition_stability(PageWithLavaBoundariesFixture *fixture) 73 | { 74 | VXSORT_TEST_ISA(); 75 | 76 | using VMT = vxsort::vxsort_machine_traits; 77 | using PM = vxsort::partition_machine; 78 | static constexpr auto N = VMT::N; 79 | 80 | for (auto p = 0; p < VMT::N; p++) { 81 | auto *load_addr = fixture->page_with_data; 82 | auto pivot = fixture->get_expected_value(load_addr + p) - 1; 83 | 84 | auto PV = VMT::broadcast(pivot); 85 | 86 | auto data = VMT::load_vec((typename VMT::TV *) load_addr); 87 | 88 | T spill_left[N*2]; 89 | T spill_right[N*2]; 90 | 91 | T* RESTRICT spill_left_end = spill_left; 92 | // partition_block expects the left/right *write* pointers to point 93 | // to the next vector write position, for right write pointer 94 | // this means N elements BEFORE the end of the spill buffer 95 | T* RESTRICT spill_right_start = spill_right + N; 96 | T* RESTRICT spill_right_end = spill_right_start; 97 | 98 | memset(spill_left, 0x66, 
sizeof(spill_left)); 99 | memset(spill_right, 0x66, sizeof(spill_right)); 100 | 101 | PM::partition_block(data, PV, spill_left_end, spill_right_end); 102 | 103 | ASSERT_EQ(spill_left_end - spill_left, p); 104 | ASSERT_EQ(spill_right_start - spill_right_end, N - p); 105 | 106 | for (auto i = 0; i < p; ++i) { 107 | auto expected_value = fixture->get_expected_value(load_addr + i); 108 | ASSERT_EQ(spill_left[i], expected_value); 109 | } 110 | 111 | for (auto i = VMT::N - 1; i >= p; --i) { 112 | auto expected_value = fixture->get_expected_value(load_addr + i); 113 | ASSERT_EQ(spill_right_start[i], expected_value); 114 | 115 | } 116 | } 117 | } 118 | 119 | template 120 | void test_partition_alignment(PageWithLavaBoundariesFixture *fixture) 121 | { 122 | VXSORT_TEST_ISA(); 123 | 124 | using VMT = vxsort::vxsort_machine_traits; 125 | using PM = vxsort::partition_machine; 126 | using AH = vxsort::alignment_hint; 127 | static constexpr auto N = VMT::N; 128 | 129 | for (auto p = 0; p < VMT::N; p++) { 130 | auto * const left = fixture->page_with_data + p; 131 | auto * const right = fixture->page_with_data + fixture->num_elements - p; 132 | const auto pivot = fixture->get_expected_value(left + p) - 1; 133 | 134 | const auto PV = VMT::broadcast(pivot); 135 | 136 | AH align; 137 | align.calc_left_alignment(left); 138 | align.calc_right_alignment(right); 139 | 140 | T spill_left[N*2]; 141 | T spill_right[N*2]; 142 | 143 | T* RESTRICT spill_left_start = spill_left; 144 | T* RESTRICT spill_left_end = spill_left; 145 | 146 | // aligne_vectorized expects the left/right *write* pointers to point 147 | // to the boundary of the spill buffer, for right write pointer 148 | // this means the first element PAST the end of the spill buffer 149 | T* RESTRICT spill_right_start = spill_right + 2*N; 150 | T* RESTRICT spill_right_end = spill_right_start; 151 | 152 | memset(spill_left, 0x66, sizeof(spill_left)); 153 | memset(spill_right, 0x66, sizeof(spill_right)); 154 | 155 | auto left_masked_amount = align.left_masked_amount; 156 | auto right_unmasked_amount = align.right_unmasked_amount; 157 | 158 | T * RESTRICT left_next = left; 159 | T * RESTRICT right_next = right; 160 | 161 | //fmt::print("left_masked_amount: {}, right_unmasked_amount: {}\n", left_masked_amount, right_unmasked_amount); 162 | 163 | PM::align_vectorized(left_masked_amount, right_unmasked_amount, 164 | PV, 165 | left_next, right_next, 166 | spill_left_start, spill_left_end, 167 | spill_right_start, spill_right_end); 168 | 169 | // align vectorized API is build for continued 170 | // partitioning, so we need to update the right-pointing pointers 171 | // when vectorized partitioning is done by bumping them up by N elements 172 | right_next += N; 173 | spill_right_end += N; 174 | 175 | auto amount_read_left = left_next - left; 176 | auto amount_read_right = right - right_next; 177 | 178 | auto amount_partitioned_left = spill_left_end - spill_left_start; 179 | auto amount_partitioned_right = spill_right_start - spill_right_end; 180 | 181 | ASSERT_EQ(amount_partitioned_left + amount_partitioned_right, 182 | amount_read_left + amount_read_right); 183 | 184 | ASSERT_EQ(spill_left_start - spill_left, align.left_masked_amount); 185 | 186 | for (auto i = 0; i < amount_partitioned_left; ++i) { 187 | ASSERT_LE(spill_left_start[i], pivot); 188 | } 189 | 190 | for (auto i = 0; i < amount_partitioned_right; ++i) { 191 | ASSERT_GT(spill_right_end[i], pivot); 192 | } 193 | } 194 | } 195 | 196 | }; 197 | 198 | #endif //VXSORT_PARTITION_MACHINE_TEST_H 199 | 
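A minimal sketch of the spill-buffer convention that the partition tests above keep re-stating in comments, pulled out of the test harness. It assumes AVX2 and i32 (so N == 8), a translation unit that already includes the partition-machine headers with the AVX2 target enabled, and made-up input values and pivot. partition_block() writes the <= pivot lanes at the left write pointer and the > pivot lanes into the top of the right spill area, advancing left_end forward and right_end backward by the respective lane counts:

    using VMT = vxsort::vxsort_machine_traits<i32, vxsort::vector_machine::AVX2>;
    using PM = vxsort::partition_machine<i32, vxsort::vector_machine::AVX2>;
    constexpr auto N = VMT::N;

    i32 src[N] = {5, 1, 8, 2, 9, 3, 7, 4};
    i32 spill_left[N * 2], spill_right[N * 2];
    i32 *left_end = spill_left;              // next left write position
    i32 *right_start = spill_right + N;      // N elements BEFORE the buffer end
    i32 *right_end = right_start;            // next right write position

    auto data = VMT::load_vec((typename VMT::TV *) src);
    auto PV = VMT::broadcast(4);             // pivot
    PM::partition_block(data, PV, left_end, right_end);
    // left_end advanced by 4: spill_left now begins {1, 2, 3, 4}, all <= pivot.
    // right_end retreated by 4: the > pivot values {5, 8, 9, 7} occupy the top
    // lanes of the right spill buffer (right_start[4..7]), and the next block's
    // > pivot lanes will be packed directly beneath them.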
-------------------------------------------------------------------------------- /tests/smallsort/smallsort.avx2.cpp: -------------------------------------------------------------------------------- 1 | #include "vxsort_targets_enable_avx2.h" 2 | 3 | #include "gtest/gtest.h" 4 | 5 | #include 6 | 7 | #include "smallsort_test.h" 8 | #include "../sort_fixtures.h" 9 | 10 | namespace vxsort_tests { 11 | using namespace vxsort::types; 12 | using VM = vxsort::vector_machine; 13 | 14 | auto bitonic_machine_allvalues_avx2_16 = ValuesIn(range(16, 64, 16)); 15 | auto bitonic_machine_allvalues_avx2_32 = ValuesIn(range(8, 32, 8)); 16 | auto bitonic_machine_allvalues_avx2_64 = ValuesIn(range(4, 16, 4)); 17 | 18 | auto bitonic_allvalues_avx2_16 = ValuesIn(range(1, 8192, 1)); 19 | auto bitonic_allvalues_avx2_32 = ValuesIn(range(1, 4096, 1)); 20 | auto bitonic_allvalues_avx2_64 = ValuesIn(range(1, 2048, 1)); 21 | 22 | #ifdef VXSORT_TEST_AVX2_I16 23 | struct BitonicMachineAVX2_i16 : public SortFixture {}; 24 | struct BitonicAVX2_i16 : public SortFixture {}; 25 | INSTANTIATE_TEST_SUITE_P(BitonicMachineAVX2, BitonicMachineAVX2_i16, bitonic_machine_allvalues_avx2_16, PrintValue()); 26 | INSTANTIATE_TEST_SUITE_P(BitonicAVX2, BitonicAVX2_i16, bitonic_allvalues_avx2_16, PrintValue()); 27 | #endif 28 | 29 | #ifdef VXSORT_TEST_AVX2_I32 30 | struct BitonicMachineAVX2_i32 : public SortFixture {}; 31 | struct BitonicAVX2_i32 : public SortFixture {}; 32 | INSTANTIATE_TEST_SUITE_P(BitonicMachineAVX2, BitonicMachineAVX2_i32, bitonic_machine_allvalues_avx2_32, PrintValue()); 33 | INSTANTIATE_TEST_SUITE_P(BitonicAVX2, BitonicAVX2_i32, bitonic_allvalues_avx2_32, PrintValue()); 34 | #endif 35 | 36 | #ifdef VXSORT_TEST_AVX2_I64 37 | struct BitonicMachineAVX2_i64 : public SortFixture {}; 38 | struct BitonicAVX2_i64 : public SortFixture {}; 39 | INSTANTIATE_TEST_SUITE_P(BitonicMachineAVX2, BitonicMachineAVX2_i64, bitonic_machine_allvalues_avx2_64, PrintValue()); 40 | INSTANTIATE_TEST_SUITE_P(BitonicAVX2, BitonicAVX2_i64, bitonic_allvalues_avx2_64, PrintValue()); 41 | #endif 42 | 43 | #ifdef VXSORT_TEST_AVX2_U16 44 | struct BitonicMachineAVX2_u16 : public SortFixture {}; 45 | struct BitonicAVX2_u16 : public SortFixture {}; 46 | INSTANTIATE_TEST_SUITE_P(BitonicMachineAVX2, BitonicMachineAVX2_u16, bitonic_machine_allvalues_avx2_16, PrintValue()); 47 | INSTANTIATE_TEST_SUITE_P(BitonicAVX2, BitonicAVX2_u16, bitonic_allvalues_avx2_16, PrintValue()); 48 | #endif 49 | 50 | #ifdef VXSORT_TEST_AVX2_U32 51 | struct BitonicMachineAVX2_u32 : public SortFixture {}; 52 | struct BitonicAVX2_u32 : public SortFixture {}; 53 | INSTANTIATE_TEST_SUITE_P(BitonicMachineAVX2, BitonicMachineAVX2_u32, bitonic_machine_allvalues_avx2_32, PrintValue()); 54 | INSTANTIATE_TEST_SUITE_P(BitonicAVX2, BitonicAVX2_u32, bitonic_allvalues_avx2_32, PrintValue()); 55 | #endif 56 | 57 | #ifdef VXSORT_TEST_AVX2_U64 58 | struct BitonicMachineAVX2_u64 : public SortFixture {}; 59 | struct BitonicAVX2_u64 : public SortFixture {}; 60 | INSTANTIATE_TEST_SUITE_P(BitonicMachineAVX2, BitonicMachineAVX2_u64, bitonic_machine_allvalues_avx2_64, PrintValue()); 61 | INSTANTIATE_TEST_SUITE_P(BitonicAVX2, BitonicAVX2_u64, bitonic_allvalues_avx2_64, PrintValue()); 62 | #endif 63 | 64 | #ifdef VXSORT_TEST_AVX2_F32 65 | struct BitonicMachineAVX2_f32 : public SortFixture {}; 66 | struct BitonicAVX2_f32 : public SortFixture {}; 67 | INSTANTIATE_TEST_SUITE_P(BitonicMachineAVX2, BitonicMachineAVX2_f32, bitonic_machine_allvalues_avx2_32, PrintValue()); 68 | 
INSTANTIATE_TEST_SUITE_P(BitonicAVX2, BitonicAVX2_f32, bitonic_allvalues_avx2_32, PrintValue()); 69 | #endif 70 | 71 | #ifdef VXSORT_TEST_AVX2_F64 72 | struct BitonicMachineAVX2_f64 : public SortFixture {}; 73 | struct BitonicAVX2_f64 : public SortFixture {}; 74 | INSTANTIATE_TEST_SUITE_P(BitonicMachineAVX2, BitonicMachineAVX2_f64, bitonic_machine_allvalues_avx2_64, PrintValue()); 75 | INSTANTIATE_TEST_SUITE_P(BitonicAVX2, BitonicAVX2_f64, bitonic_allvalues_avx2_64, PrintValue()); 76 | #endif 77 | 78 | #ifdef VXSORT_TEST_AVX2_I16 79 | TEST_P(BitonicMachineAVX2_i16, BitonicSortAVX2Asc) { bitonic_machine_sort_test(V); } 80 | TEST_P(BitonicAVX2_i16, BitonicSortAVX2) { bitonic_sort_test(V); } 81 | #endif 82 | 83 | #ifdef VXSORT_TEST_AVX2_I32 84 | TEST_P(BitonicMachineAVX2_i32, BitonicSortAVX2Asc) { bitonic_machine_sort_test(V); } 85 | TEST_P(BitonicAVX2_i32, BitonicSortAVX2) { bitonic_sort_test(V); } 86 | #endif 87 | 88 | #ifdef VXSORT_TEST_AVX2_I64 89 | TEST_P(BitonicMachineAVX2_i64, BitonicSortAVX2Asc) { bitonic_machine_sort_test(V); } 90 | TEST_P(BitonicAVX2_i64, BitonicSortAVX2) { bitonic_sort_test(V); } 91 | #endif 92 | #ifdef VXSORT_TEST_AVX2_U16 93 | TEST_P(BitonicMachineAVX2_u16, BitonicSortAVX2Asc) { bitonic_machine_sort_test(V); } 94 | TEST_P(BitonicAVX2_u16, BitonicSortAVX2) { bitonic_sort_test(V); } 95 | #endif 96 | 97 | #ifdef VXSORT_TEST_AVX2_U32 98 | TEST_P(BitonicMachineAVX2_u32, BitonicSortAVX2Asc) { bitonic_machine_sort_test(V); } 99 | TEST_P(BitonicAVX2_u32, BitonicSortAVX2) { bitonic_sort_test(V); } 100 | #endif 101 | 102 | #ifdef VXSORT_TEST_AVX2_U64 103 | TEST_P(BitonicMachineAVX2_u64, BitonicSortAVX2Asc) { bitonic_machine_sort_test(V); } 104 | TEST_P(BitonicAVX2_u64, BitonicSortAVX2) { bitonic_sort_test(V); } 105 | #endif 106 | 107 | #ifdef VXSORT_TEST_AVX2_F32 108 | TEST_P(BitonicMachineAVX2_f32, BitonicSortAVX2Asc) { bitonic_machine_sort_test(V); } 109 | TEST_P(BitonicAVX2_f32, BitonicSortAVX2) { bitonic_sort_test(V); } 110 | #endif 111 | 112 | #ifdef VXSORT_TEST_AVX2_F64 113 | TEST_P(BitonicMachineAVX2_f64, BitonicSortAVX2Asc) { bitonic_machine_sort_test(V); } 114 | TEST_P(BitonicAVX2_f64, BitonicSortAVX2) { bitonic_sort_test(V); } 115 | #endif 116 | 117 | //TEST_P(BitonicMachineAVX2_i32, BitonicSortAVX2Desc) { bitonic_machine_sort_test(V); } 118 | //TEST_P(BitonicMachineAVX2_u32, BitonicSortAVX2Desc) { bitonic_machine_sort_test(V); } 119 | //TEST_P(BitonicMachineAVX2_i64, BitonicSortAVX2Desc) { bitonic_machine_sort_test(V); } 120 | //TEST_P(BitonicMachineAVX2_u64, BitonicSortAVX2Desc) { bitonic_machine_sort_test(V); } 121 | //TEST_P(BitonicMachineAVX2_f32, BitonicSortAVX2Desc) { bitonic_machine_sort_test(V); } 122 | //TEST_P(BitonicMachineAVX2_f64, BitonicSortAVX2Desc) { bitonic_machine_sort_test(V); } 123 | 124 | } 125 | #include "vxsort_targets_disable.h" 126 | -------------------------------------------------------------------------------- /tests/smallsort/smallsort.avx512.cpp: -------------------------------------------------------------------------------- 1 | #include "vxsort_targets_enable_avx512.h" 2 | 3 | #include "gtest/gtest.h" 4 | 5 | #include 6 | 7 | #include "smallsort_test.h" 8 | #include "../sort_fixtures.h" 9 | 10 | namespace vxsort_tests { 11 | using namespace vxsort::types; 12 | using testing::Types; 13 | 14 | using VM = vxsort::vector_machine; 15 | 16 | auto bitonic_machine_allvalues_avx512_16 = ValuesIn(range(32, 128, 32)); 17 | auto bitonic_machine_allvalues_avx512_32 = ValuesIn(range(16, 64, 16)); 18 | auto bitonic_machine_allvalues_avx512_64 = 
ValuesIn(range(8, 32, 8)); 19 | auto bitonic_allvalues_avx512_16 = ValuesIn(range(1, 8192, 1)); 20 | auto bitonic_allvalues_avx512_32 = ValuesIn(range(1, 4096, 1)); 21 | auto bitonic_allvalues_avx512_64 = ValuesIn(range(1, 2048, 1)); 22 | 23 | #ifdef VXSORT_TEST_AVX512_I16 24 | struct BitonicMachineAVX512_i16 : public SortFixture {}; 25 | struct BitonicAVX512_i16 : public SortFixture {}; 26 | INSTANTIATE_TEST_SUITE_P(BitonicMachineAVX512, BitonicMachineAVX512_i16, bitonic_machine_allvalues_avx512_16, PrintValue()); 27 | INSTANTIATE_TEST_SUITE_P(BitonicAVX512, BitonicAVX512_i16, bitonic_allvalues_avx512_16, PrintValue()); 28 | #endif 29 | 30 | #ifdef VXSORT_TEST_AVX512_I32 31 | struct BitonicMachineAVX512_i32 : public SortFixture {}; 32 | struct BitonicAVX512_i32 : public SortFixture {}; 33 | INSTANTIATE_TEST_SUITE_P(BitonicMachineAVX512, BitonicMachineAVX512_i32, bitonic_machine_allvalues_avx512_32, PrintValue()); 34 | INSTANTIATE_TEST_SUITE_P(BitonicAVX512, BitonicAVX512_i32, bitonic_allvalues_avx512_32, PrintValue()); 35 | #endif 36 | 37 | #ifdef VXSORT_TEST_AVX512_I64 38 | struct BitonicMachineAVX512_i64 : public SortFixture {}; 39 | struct BitonicAVX512_i64 : public SortFixture {}; 40 | INSTANTIATE_TEST_SUITE_P(BitonicMachineAVX512, BitonicMachineAVX512_i64, bitonic_machine_allvalues_avx512_64, PrintValue()); 41 | INSTANTIATE_TEST_SUITE_P(BitonicAVX512, BitonicAVX512_i64, bitonic_allvalues_avx512_64, PrintValue()); 42 | #endif 43 | 44 | #ifdef VXSORT_TEST_AVX512_U16 45 | struct BitonicMachineAVX512_u16 : public SortFixture {}; 46 | struct BitonicAVX512_u16 : public SortFixture {}; 47 | INSTANTIATE_TEST_SUITE_P(BitonicMachineAVX512, BitonicMachineAVX512_u16, bitonic_machine_allvalues_avx512_16, PrintValue()); 48 | INSTANTIATE_TEST_SUITE_P(BitonicAVX512, BitonicAVX512_u16, bitonic_allvalues_avx512_16, PrintValue()); 49 | #endif 50 | 51 | #ifdef VXSORT_TEST_AVX512_U32 52 | struct BitonicMachineAVX512_u32 : public SortFixture {}; 53 | struct BitonicAVX512_u32 : public SortFixture {}; 54 | INSTANTIATE_TEST_SUITE_P(BitonicMachineAVX512, BitonicMachineAVX512_u32, bitonic_machine_allvalues_avx512_32, PrintValue()); 55 | INSTANTIATE_TEST_SUITE_P(BitonicAVX512, BitonicAVX512_u32, bitonic_allvalues_avx512_32, PrintValue()); 56 | #endif 57 | 58 | #ifdef VXSORT_TEST_AVX512_U64 59 | struct BitonicMachineAVX512_u64 : public SortFixture {}; 60 | struct BitonicAVX512_u64 : public SortFixture {}; 61 | INSTANTIATE_TEST_SUITE_P(BitonicMachineAVX512, BitonicMachineAVX512_u64, bitonic_machine_allvalues_avx512_64, PrintValue()); 62 | INSTANTIATE_TEST_SUITE_P(BitonicAVX512, BitonicAVX512_u64, bitonic_allvalues_avx512_64, PrintValue()); 63 | #endif 64 | 65 | #ifdef VXSORT_TEST_AVX512_F32 66 | struct BitonicMachineAVX512_f32 : public SortFixture {}; 67 | struct BitonicAVX512_f32 : public SortFixture {}; 68 | INSTANTIATE_TEST_SUITE_P(BitonicMachineAVX512, BitonicMachineAVX512_f32, bitonic_machine_allvalues_avx512_32, PrintValue()); 69 | INSTANTIATE_TEST_SUITE_P(BitonicAVX512, BitonicAVX512_f32, bitonic_allvalues_avx512_32, PrintValue()); 70 | #endif 71 | 72 | #ifdef VXSORT_TEST_AVX512_F64 73 | struct BitonicMachineAVX512_f64 : public SortFixture {}; 74 | struct BitonicAVX512_f64 : public SortFixture {}; 75 | INSTANTIATE_TEST_SUITE_P(BitonicMachineAVX512, BitonicMachineAVX512_f64, bitonic_machine_allvalues_avx512_64, PrintValue()); 76 | INSTANTIATE_TEST_SUITE_P(BitonicAVX512, BitonicAVX512_f64, bitonic_allvalues_avx512_64, PrintValue()); 77 | #endif 78 | 79 | 80 | #ifdef VXSORT_TEST_AVX512_I16 81 | 
TEST_P(BitonicMachineAVX512_i16, BitonicSortAVX512Asc) { bitonic_machine_sort_test(V); } 82 | TEST_P(BitonicAVX512_i16, BitonicSortAVX512) { bitonic_sort_test(V); } 83 | #endif 84 | 85 | #ifdef VXSORT_TEST_AVX512_I32 86 | TEST_P(BitonicMachineAVX512_i32, BitonicSortAVX512Asc) { bitonic_machine_sort_test(V); } 87 | TEST_P(BitonicAVX512_i32, BitonicSortAVX512) { bitonic_sort_test(V); } 88 | #endif 89 | 90 | #ifdef VXSORT_TEST_AVX512_I64 91 | TEST_P(BitonicMachineAVX512_i64, BitonicSortAVX512Asc) { bitonic_machine_sort_test(V); } 92 | TEST_P(BitonicAVX512_i64, BitonicSortAVX512) { bitonic_sort_test(V); } 93 | #endif 94 | 95 | #ifdef VXSORT_TEST_AVX512_U16 96 | TEST_P(BitonicMachineAVX512_u16, BitonicSortAVX512Asc) { bitonic_machine_sort_test(V); } 97 | TEST_P(BitonicAVX512_u16, BitonicSortAVX512) { bitonic_sort_test(V); } 98 | #endif 99 | 100 | #ifdef VXSORT_TEST_AVX512_U32 101 | TEST_P(BitonicMachineAVX512_u32, BitonicSortAVX512Asc) { bitonic_machine_sort_test(V); } 102 | TEST_P(BitonicAVX512_u32, BitonicSortAVX512) { bitonic_sort_test(V); } 103 | #endif 104 | 105 | #ifdef VXSORT_TEST_AVX512_U64 106 | TEST_P(BitonicMachineAVX512_u64, BitonicSortAVX512Asc) { bitonic_machine_sort_test(V); } 107 | TEST_P(BitonicAVX512_u64, BitonicSortAVX512) { bitonic_sort_test(V); } 108 | #endif 109 | 110 | #ifdef VXSORT_TEST_AVX512_F32 111 | TEST_P(BitonicMachineAVX512_f32, BitonicSortAVX512Asc) { bitonic_machine_sort_test(V); } 112 | TEST_P(BitonicAVX512_f32, BitonicSortAVX512) { bitonic_sort_test(V); } 113 | #endif 114 | 115 | #ifdef VXSORT_TEST_AVX512_F64 116 | TEST_P(BitonicMachineAVX512_f64, BitonicSortAVX512Asc) { bitonic_machine_sort_test(V); } 117 | TEST_P(BitonicAVX512_f64, BitonicSortAVX512) { bitonic_sort_test(V); } 118 | #endif 119 | 120 | //TEST_P(BitonicMachineAVX512_i32, BitonicSortAVX2Desc) { bitonic_machine_sort_test(V); } 121 | //TEST_P(BitonicMachineAVX512_u32, BitonicSortAVX2Desc) { bitonic_machine_sort_test(V); } 122 | //TEST_P(BitonicMachineAVX512_f32, BitonicSortAVX2Desc) { bitonic_machine_sort_test(V); } 123 | //TEST_P(BitonicMachineAVX512_i64, BitonicSortAVX2Desc) { bitonic_machine_sort_test(V); } 124 | //TEST_P(BitonicMachineAVX512_u64, BitonicSortAVX2Desc) { bitonic_machine_sort_test(V); } 125 | //TEST_P(BitonicMachineAVX512_f64, BitonicSortAVX2Desc) { bitonic_machine_sort_test(V); } 126 | } 127 | 128 | #include "vxsort_targets_disable.h" 129 | -------------------------------------------------------------------------------- /tests/smallsort/smallsort_test.h: -------------------------------------------------------------------------------- 1 | #ifndef VXSORT_SMALLSORT_TEST_H 2 | #define VXSORT_SMALLSORT_TEST_H 3 | 4 | #include 5 | 6 | #include "gtest/gtest.h" 7 | #include "../sort_fixtures.h" 8 | 9 | #include "../test_isa.h" 10 | #include "smallsort/bitonic_sort.h" 11 | #include "fmt/format.h" 12 | 13 | namespace vxsort_tests { 14 | 15 | using vxsort::vector_machine; 16 | 17 | template 18 | void bitonic_machine_sort_test(std::vector& V) { 19 | VXSORT_TEST_ISA(); 20 | 21 | using BM = vxsort::smallsort::bitonic_machine; 22 | 23 | auto v_copy = std::vector(V); 24 | auto begin = V.data(); 25 | auto size = V.size(); 26 | 27 | if (ascending) 28 | BM::sort_full_vectors_ascending(begin, size); 29 | else 30 | BM::sort_full_vectors_descending(begin, size); 31 | 32 | std::sort(v_copy.begin(), v_copy.end()); 33 | for (usize i = 0; i < size; ++i) { 34 | if (v_copy[i] != V[i]) { 35 | GTEST_FAIL() << fmt::format("value at idx #{} {} != {}", i, v_copy[i], V[i]); 36 | } 37 | } 38 | } 39 | 40 | 
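// The two helpers in this header cover different scopes: bitonic_machine above
// only sorts buffers that are an exact multiple of the vector width (hence its
// "full vectors" entry points), while bitonic below handles any small size.
// A minimal usage sketch for the latter, assuming AVX2 and i64; the 12-element
// array is made up for illustration:
//
//   i64 data[12] = {9, 3, 7, 1, 12, 5, 11, 2, 10, 4, 8, 6};
//   vxsort::smallsort::bitonic<i64, vxsort::vector_machine::AVX2>::sort(data, 12);
//   // data is now 1..12, ascending.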
template 41 | void bitonic_sort_test(std::vector& V) { 42 | VXSORT_TEST_ISA(); 43 | 44 | auto v_copy = std::vector(V); 45 | auto begin = V.data(); 46 | auto size = V.size(); 47 | 48 | vxsort::smallsort::bitonic::sort(begin, size); 49 | std::sort(v_copy.begin(), v_copy.end()); 50 | for (usize i = 0; i < size; ++i) { 51 | if (v_copy[i] != V[i]) { 52 | GTEST_FAIL() << fmt::format("value at idx #{} {} != {}", i, v_copy[i], V[i]); 53 | } 54 | } 55 | } 56 | } 57 | 58 | #endif // VXSORT_SMALLSORT_TEST_H 59 | -------------------------------------------------------------------------------- /tests/sort_fixtures.h: -------------------------------------------------------------------------------- 1 | #ifndef VXSORT_SORT_FIXTURES_H 2 | #define VXSORT_SORT_FIXTURES_H 3 | 4 | #include "gtest/gtest.h" 5 | #include "stats/vxsort_stats.h" 6 | #include "util.h" 7 | 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | 14 | namespace vxsort_tests { 15 | using namespace vxsort::types; 16 | using testing::ValuesIn; 17 | using testing::Types; 18 | 19 | template 20 | struct SortFixture : public testing::TestWithParam { 21 | protected: 22 | std::vector V; 23 | 24 | public: 25 | virtual void SetUp() { 26 | V = std::vector(GetParam()); 27 | generate_unique_values_vec(V, (T)0x1000, (T)0x1); 28 | } 29 | virtual void TearDown() { 30 | } 31 | }; 32 | 33 | struct PrintValue { 34 | template 35 | std::string operator()(const testing::TestParamInfo& info) const { 36 | auto v = static_cast(info.param); 37 | return std::to_string(v); 38 | } 39 | }; 40 | 41 | template 42 | struct SizeAndSlack { 43 | public: 44 | usize Size; 45 | i32 Slack; 46 | T FirstValue; 47 | T ValueStride; 48 | bool Randomize; 49 | 50 | SizeAndSlack(size_t size, int slack, T first_value, T value_stride, bool randomize) 51 | : Size(size), Slack(slack), FirstValue(first_value), ValueStride(value_stride), Randomize(randomize) {} 52 | 53 | /** 54 | * Generate sorting problems "descriptions" 55 | * @param start 56 | * @param stop 57 | * @param step 58 | * @param slack 59 | * @param first_value - the smallest value in each test array 60 | * @param value_stride - the minimal jump between array elements 61 | * @param randomize - should the problem array contents be randomized, defaults to true 62 | * @return 63 | */ 64 | static std::vector generate(size_t start, size_t stop, size_t step, int slack, T first_value, T value_stride, bool randomize = true) { 65 | if (step == 0) { 66 | throw std::invalid_argument("step for range must be non-zero"); 67 | } 68 | 69 | std::vector result; 70 | size_t i = start; 71 | while ((step > 0) ? 
(i <= stop) : (i > stop)) { 72 | for (auto j : range(-slack, slack, 1)) { 73 | if ((i64)i + j <= 0) 74 | continue; 75 | result.push_back(SizeAndSlack(i, j, first_value, value_stride, randomize)); 76 | } 77 | i *= step; 78 | } 79 | return result; 80 | } 81 | }; 82 | 83 | template 84 | struct SortWithSlackFixture : public testing::TestWithParam> { 85 | protected: 86 | std::vector V; 87 | 88 | public: 89 | virtual void SetUp() { 90 | testing::TestWithParam>::SetUp(); 91 | auto p = this->GetParam(); 92 | V = std::vector(p.Size + p.Slack); 93 | generate_unique_values_vec(V, p.FirstValue, p.ValueStride, p.Randomize); 94 | } 95 | virtual void TearDown() { 96 | #ifdef VXSORT_STATS 97 | vxsort::print_all_stats(); 98 | vxsort::reset_all_stats(); 99 | #endif 100 | } 101 | }; 102 | 103 | template 104 | struct PrintSizeAndSlack { 105 | std::string operator()(const testing::TestParamInfo>& info) const { 106 | return std::to_string(info.param.Size + info.param.Slack); 107 | } 108 | }; 109 | 110 | template 111 | struct SizeAndStride { 112 | public: 113 | usize Size; 114 | T FirstValue; 115 | T ValueStride; 116 | bool Randomize; 117 | 118 | SizeAndStride(size_t size, T first_value, T value_stride, bool randomize) 119 | : Size(size), FirstValue(first_value), ValueStride(value_stride), Randomize(randomize) {} 120 | 121 | static std::vector generate(size_t size, T stride_start, T stride_stop, T first_value, bool randomize = true) { 122 | std::vector result; 123 | for (auto j : multiply_range(stride_start, stride_stop, 2)) { 124 | result.push_back(SizeAndStride(size, first_value, j, randomize)); 125 | } 126 | return result; 127 | } 128 | }; 129 | 130 | template 131 | struct SortWithStrideFixture : public testing::TestWithParam> { 132 | protected: 133 | std::vector V; 134 | T MinValue; 135 | T MaxValue; 136 | 137 | public: 138 | virtual void SetUp() { 139 | testing::TestWithParam>::SetUp(); 140 | auto p = this->GetParam(); 141 | V = std::vector(p.Size); 142 | generate_unique_values_vec(V, p.FirstValue, p.ValueStride, p.Randomize); 143 | MinValue = p.FirstValue; 144 | MaxValue = MinValue + p.Size * p.ValueStride; 145 | if (MinValue > MaxValue) 146 | throw std::invalid_argument("stride is generating an overflow"); 147 | } 148 | virtual void TearDown() { 149 | #ifdef VXSORT_STATS 150 | vxsort::print_all_stats(); 151 | vxsort::reset_all_stats(); 152 | #endif 153 | } 154 | }; 155 | 156 | template 157 | struct PrintSizeAndStride { 158 | std::string operator()(const testing::TestParamInfo>& info) const { 159 | return std::to_string(info.param.ValueStride); 160 | } 161 | }; 162 | } 163 | 164 | #endif // VXSORT_SORT_FIXTURES_H 165 | -------------------------------------------------------------------------------- /tests/test_isa.h: -------------------------------------------------------------------------------- 1 | #ifndef VXSORT_TEST_ISA_H 2 | #define VXSORT_TEST_ISA_H 3 | 4 | #include "isa_detection.h" 5 | 6 | #define VXSORT_TEST_ISA() \ 7 | if (!::vxsort::supports_vector_machine(sizeof(T))) { \ 8 | GTEST_SKIP_("Current CPU does not support the minimal features for this test"); \ 9 | return; \ 10 | } 11 | 12 | #endif //VXSORT_TEST_ISA_H 13 | -------------------------------------------------------------------------------- /tests/util.h: -------------------------------------------------------------------------------- 1 | #ifndef VXSORT_TEST_UTIL_H 2 | #define VXSORT_TEST_UTIL_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | template 10 | void generate_unique_values_vec(std::vector& vec, T start, T 
stride = 0x1, bool randomize = true) { 11 | for (size_t i = 0; i < vec.size(); i++) { 12 | vec[i] = start; 13 | start += stride; 14 | } 15 | 16 | if (!randomize) 17 | return; 18 | 19 | std::random_device rd; 20 | // std::mt19937 g(rd()); 21 | std::mt19937 g(666); 22 | 23 | std::shuffle(vec.begin(), vec.end(), g); 24 | } 25 | 26 | template <typename IntType> 27 | std::vector<IntType> range(IntType start, IntType stop, IntType step) { 28 | if (step == IntType(0)) { 29 | throw std::invalid_argument("step for range must be non-zero"); 30 | } 31 | 32 | std::vector<IntType> result; 33 | IntType i = start; 34 | while ((step > 0) ? (i <= stop) : (i > stop)) { 35 | result.push_back(i); 36 | i += step; 37 | } 38 | 39 | return result; 40 | } 41 | 42 | template <typename IntType> 43 | std::vector<IntType> multiply_range(IntType start, IntType stop, IntType step) { 44 | if (step <= IntType(1)) { 45 | throw std::invalid_argument("step for multiply_range must be greater than one"); 46 | } 47 | 48 | std::vector<IntType> result; 49 | IntType i = start; 50 | while (i <= stop) { 51 | result.push_back(i); 52 | i *= step; 53 | } 54 | 55 | return result; 56 | } 57 | 58 | #endif 59 | -------------------------------------------------------------------------------- /vxsort/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set(TARGET_NAME ${CMAKE_PROJECT_NAME}_lib) 2 | 3 | set(lib_SRC 4 | isa_detection_sane.cpp 5 | isa_detection.cpp 6 | stats/vxsort_stats.cpp 7 | ) 8 | 9 | set(lib_HEADERS 10 | isa_detection.h 11 | alignment.h 12 | compiler.h 13 | defs.h 14 | pack_machine.h 15 | vxsort.h 16 | stats/vxsort_stats.h 17 | vector_machine/machine_traits.h 18 | vxsort_targets_disable.h 19 | partition_machine.h 20 | vxsort.avx2.h 21 | vxsort.avx512.h 22 | partition_machine.avx2.h 23 | partition_machine.avx512.h 24 | smallsort/bitonic_sort.avx2.h 25 | smallsort/bitonic_sort.avx512.h 26 | ) 27 | 28 | if (${PROCESSOR_IS_X86}) 29 | file(GLOB_RECURSE avx2_SRC vector_machine/avx2/*.cpp smallsort/avx2/*.cpp) 30 | file(GLOB_RECURSE avx2_HEADERS vector_machine/avx2/*.h smallsort/avx2/*.h) 31 | file(GLOB_RECURSE avx512_SRC vector_machine/avx512/*.cpp smallsort/avx512/*.cpp) 32 | file(GLOB_RECURSE avx512_HEADERS vector_machine/avx512/*.h smallsort/avx512/*.h) 33 | 34 | list(APPEND lib_HEADERS 35 | vector_machine/machine_traits.avx2.h 36 | vector_machine/machine_traits.avx512.h 37 | ${avx2_HEADERS} 38 | ${avx512_HEADERS} 39 | ) 40 | 41 | list(APPEND lib_SRC 42 | ${avx2_SRC} 43 | ${avx512_SRC} 44 | ) 45 | 46 | endif() 47 | 48 | if (${PROCESSOR_IS_ARM} OR ${PROCESSOR_IS_AARCH64}) 49 | file(GLOB_RECURSE neon_SRC vector_machine/neon/*.cpp smallsort/neon/*.cpp) 50 | file(GLOB_RECURSE neon_HEADERS vector_machine/neon/*.h smallsort/neon/*.h) 51 | 52 | list(APPEND lib_HEADERS ${neon_HEADERS}) 53 | list(APPEND lib_SRC ${neon_SRC}) 54 | endif() 55 | 56 | add_library(${TARGET_NAME} STATIC ${lib_SRC} ${lib_HEADERS}) 57 | set_target_properties(${TARGET_NAME} PROPERTIES PREFIX lib) 58 | set_target_properties(${TARGET_NAME} PROPERTIES OUTPUT_NAME ${CMAKE_PROJECT_NAME}) 59 | target_link_libraries(${TARGET_NAME} 60 | cpu_features 61 | fmt 62 | ) 63 | 64 | target_include_directories(${TARGET_NAME} PUBLIC .)
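# Illustrative consumer usage (hypothetical target name, not part of this file's
# build logic): a tool in this project would link the static library defined above
# and inherit its PUBLIC include directory, so #include "vxsort.h" resolves without
# extra include paths:
#
#   add_executable(my_tool my_tool.cpp)
#   target_link_libraries(my_tool ${CMAKE_PROJECT_NAME}_lib)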
65 | #set_property(TARGET ${TARGET_NAME} PROPERTY INTERPROCEDURAL_OPTIMIZATION True) 66 | 67 | -------------------------------------------------------------------------------- /vxsort/alignment.h: -------------------------------------------------------------------------------- 1 | #ifndef VXSORT_ALIGNNMENT_H 2 | #define VXSORT_ALIGNNMENT_H 3 | 4 | #include <cassert> 5 | #include "vector_machine/machine_traits.h" 6 | #include "defs.h" 7 | 8 | namespace vxsort { 9 | using namespace vxsort::types; 10 | 11 | using namespace std; 12 | 13 | /// Perform vector-sized alignment of array boundary reads (beginning/end) 14 | /// \tparam T the primitive type being aligned 15 | /// \tparam M the vector_machine being used (e.g. determines the vector width in bytes) 16 | template <typename T, vector_machine M> 17 | struct alignment_hint { 18 | using VMT = vxsort_machine_traits<T, M>; 19 | static constexpr i32 N = VMT::N; 20 | static constexpr usize ALIGN = sizeof(typename VMT::TV); 21 | public: 22 | static const size_t ALIGN_MASK = ALIGN - 1; 23 | static const i8 REALIGN = 0x66; 24 | static_assert(REALIGN > ALIGN, "REALIGN must be larger than ALIGN"); 25 | 26 | alignment_hint() : left_masked_amount(REALIGN), right_unmasked_amount(REALIGN) {} 27 | alignment_hint clear_left() { 28 | alignment_hint copy = *this; 29 | copy.left_masked_amount = REALIGN; 30 | return copy; 31 | } 32 | 33 | alignment_hint clear_right() { 34 | alignment_hint copy = *this; 35 | copy.right_unmasked_amount = REALIGN; 36 | return copy; 37 | } 38 | 39 | static bool is_aligned(const void* p) { return (usize)p % ALIGN == 0; } 40 | 41 | /// Perform "left-side"/beginning-of-partition alignment. 42 | /// Given an inclusive pointer to the left-most element/beginning of an array, 43 | /// alignment to the nearest whole vector-sized pointer is performed, updating the 44 | /// internal `left_masked_amount` member with the number of elements to be masked 45 | /// off during a vector read. 46 | /// @param[in] p a pointer to the first element that is desired to be read 47 | void calc_left_alignment(const T *p) { 48 | // Alignment flow: 49 | // * Calculate pre-alignment position on the left 50 | // * convert to a valid input to be used with `generate_suffix_mask` 51 | const auto* pre_aligned_left = reinterpret_cast<const T*>(reinterpret_cast<usize>(p) & ~ALIGN_MASK); 52 | left_masked_amount = p - pre_aligned_left; 53 | assert(left_masked_amount >= 0 && left_masked_amount < N); 54 | assert(is_aligned(pre_aligned_left)); 55 | } 56 | 57 | /// Perform "right-side"/end-of-partition alignment.
Given an exclusive pointer just past 58 | /// the right-most/end-of an array, alignment to the nearest vector sized read 59 | /// is performed, updating the internal `right_unmasked_amount` with the number of unmasked 60 | /// elements to be read 61 | /// @param[in] p a pointer past the last element that is desired to be read 62 | void calc_right_alignment(const T *p) { 63 | // │01234567│01234567 64 | // (1)│xxxxxxxx│xxxp•••• 65 | // (2)│xxxxxxxx│xp•••••• 66 | // (3)│xxxxxxxx│p•••••• 67 | // p -> the parameter 68 | // x -> data to be read 69 | // • -> Masked elements 70 | // right_unmasked_amount should be: 71 | // (1) -> 3 72 | // (2) -> 1 73 | // (3) -> 8 (8/0 are the same in terms of masking) 74 | const auto* pre_aligned_right = reinterpret_cast(reinterpret_cast(p-1) & ~ALIGN_MASK); 75 | right_unmasked_amount = p - pre_aligned_right; 76 | assert(right_unmasked_amount >= 0 && right_unmasked_amount <= N); 77 | assert(is_aligned(pre_aligned_right)); 78 | } 79 | 80 | i32 left_masked_amount : 8; 81 | i32 right_unmasked_amount : 8; 82 | }; 83 | 84 | } // namespace vxsort 85 | #endif // VXSORT_ALIGNNMENT_H 86 | -------------------------------------------------------------------------------- /vxsort/compiler.h: -------------------------------------------------------------------------------- 1 | #ifndef VXSORT_COMPILER_H 2 | #define VXSORT_COMPILER_H 3 | 4 | #ifdef _MSC_VER 5 | #ifdef __clang__ 6 | #define VXSORT_COMPILER_CLANGCL _MSC_VER 7 | #else // real MSVC 8 | #define VXSORT_COMPILER_MSVC _MSC_VER 9 | #endif 10 | #else 11 | #ifdef __GNUC__ 12 | #ifdef __clang__ 13 | #define VXSORT_COMPILER_CLANG (__clang_major__ * 100 + __clang_minor__) 14 | #else 15 | #define VXSORT_COMPILER_GCC (__GNUC__ * 100 + __GNUC_MINOR__) 16 | #endif 17 | #endif 18 | #endif 19 | 20 | #endif // VXSORT_COMPILER_H 21 | -------------------------------------------------------------------------------- /vxsort/defs.h: -------------------------------------------------------------------------------- 1 | #ifndef VXSORT_DEFS_H 2 | #define VXSORT_DEFS_H 3 | 4 | #include "compiler.h" 5 | 6 | #if _MSC_VER 7 | #ifdef _M_X86 8 | #define ARCH_X86 9 | #endif 10 | #ifdef _M_X64 11 | #define ARCH_X64 12 | #endif 13 | #ifdef _M_ARM64 14 | #define ARCH_ARM 15 | #endif 16 | #else 17 | #ifdef __i386__ 18 | #define ARCH_X86 19 | #endif 20 | #ifdef __amd64__ 21 | #define ARCH_X64 22 | #endif 23 | #ifdef __arm__ 24 | #define ARCH_ARM 25 | #endif 26 | #endif 27 | 28 | #ifdef VXSORT_COMPILER_MSVC 29 | #include 30 | #define mess_up_cmov() _ReadBarrier(); 31 | #define INLINE __forceinline 32 | #define NOINLINE __declspec(noinline) 33 | #else 34 | #define mess_up_cmov() 35 | #define INLINE __attribute__((always_inline)) 36 | #define NOINLINE __attribute__((noinline)) 37 | #endif 38 | 39 | #include 40 | #include 41 | 42 | #ifdef _MSC_VER 43 | #include 44 | typedef SSIZE_T ssize_t; 45 | #endif 46 | 47 | #define RESTRICT __restrict 48 | 49 | namespace vxsort { 50 | 51 | template 52 | constexpr bool always_false = false; 53 | constexpr bool is_powerof2(int v) { 54 | return v && ((v & (v - 1)) == 0); 55 | } 56 | 57 | namespace types { 58 | using i8 = int8_t; 59 | using u8 = uint8_t; 60 | using i16 = int16_t; 61 | using i32 = int32_t; 62 | using i64 = int64_t; 63 | using u16 = uint16_t; 64 | using u32 = uint32_t; 65 | using u64 = uint64_t; 66 | using f32 = float; 67 | using f64 = double; 68 | using isize = ssize_t; 69 | using usize = size_t; 70 | } 71 | 72 | } // namespace vxsort 73 | 74 | #endif // VXSORT_DEFS_H 75 | 
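// Illustrative usage of the type aliases and helpers above (a sketch, not part of
// the library):
//
//   using namespace vxsort::types;
//   static_assert(vxsort::is_powerof2(sizeof(u64)), "8 is a power of 2");
//   constexpr usize VEC_BYTES = 32;                 // e.g. one AVX2 vector
//   constexpr i32 LANES = VEC_BYTES / sizeof(i32);  // 8 x 32-bit lanes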
-------------------------------------------------------------------------------- /vxsort/isa_detection.cpp: -------------------------------------------------------------------------------- 1 | #include "isa_detection.h" 2 | 3 | bool __isa_detection_performed = vxsort::init_isa_detection(); 4 | -------------------------------------------------------------------------------- /vxsort/isa_detection.h: -------------------------------------------------------------------------------- 1 | #ifndef VXSORT_ISA_DETECTION_H 2 | #define VXSORT_ISA_DETECTION_H 3 | 4 | #include "vector_machine/machine_traits.h" 5 | #include "cpu_features_macros.h" 6 | 7 | namespace vxsort { 8 | 9 | extern bool init_isa_detection(); 10 | extern bool supports_vector_machine(vector_machine m); 11 | 12 | template <vector_machine M> 13 | bool supports_vector_machine(usize width); 14 | 15 | } // namespace vxsort 16 | 17 | #endif // VXSORT_ISA_DETECTION_H 18 | -------------------------------------------------------------------------------- /vxsort/isa_detection_sane.cpp: -------------------------------------------------------------------------------- 1 | #include "isa_detection.h" 2 | 3 | #if defined(CPU_FEATURES_ARCH_X86) 4 | #include "cpuinfo_x86.h" 5 | using namespace cpu_features; 6 | static const X86Features features = GetX86Info().features; 7 | static const bool has_avx2 = CPU_FEATURES_COMPILED_X86_AVX2 || (features.avx2 && features.avx && features.popcnt && features.bmi2); 8 | static const bool has_avx512_32_64 = features.avx512f && features.avx512dq && features.avx512bw && features.popcnt; 9 | static const bool has_avx512_16 = has_avx512_32_64 && features.avx512vbmi2; 10 | //static const bool has_avx512_16_fp16 = has_avx512_16 && features.avx512_fp16; 11 | #elif defined(CPU_FEATURES_ARCH_ARM) 12 | #include "cpuinfo_arm.h" 13 | using namespace cpu_features; 14 | static const ArmFeatures features = GetArmInfo().features; 15 | static const bool has_neon = CPU_FEATURES_COMPILED_ANY_ARM_NEON || features.neon; 16 | #elif defined(CPU_FEATURES_ARCH_AARCH64) 17 | #include "cpuinfo_aarch64.h" 18 | using namespace cpu_features; 19 | static const Aarch64Features features = GetAarch64Info().features; 20 | static const bool has_neon = CPU_FEATURES_COMPILED_ANY_ARM_NEON || features.asimd; 21 | static const bool has_sve = features.sve; 22 | 23 | #elif defined(CPU_FEATURES_ARCH_MIPS) 24 | #include "cpuinfo_mips.h" 25 | #elif defined(CPU_FEATURES_ARCH_PPC) 26 | #include "cpuinfo_ppc.h" 27 | #endif 28 | 29 | namespace vxsort { 30 | 31 | bool init_isa_detection() { 32 | return true; 33 | } 34 | 35 | extern bool supports_vector_machine(vector_machine m) 36 | { 37 | switch (m) { 38 | case NONE: 39 | return true; 40 | #if defined(CPU_FEATURES_ARCH_X86) 41 | case AVX2: 42 | return has_avx2; 43 | case AVX512: 44 | return has_avx512_32_64; 45 | #endif 46 | #if defined(CPU_FEATURES_ARCH_ANY_ARM) 47 | case NEON: 48 | return has_neon; 49 | #endif 50 | #if defined(CPU_FEATURES_ARCH_AARCH64) 51 | case SVE: 52 | return has_sve; 53 | #endif 54 | default: 55 | break; 56 | } 57 | return false; 58 | } 59 | 60 | template<> 61 | bool supports_vector_machine<vector_machine::AVX2>(usize) { 62 | return has_avx2; 63 | } 64 | 65 | template<> 66 | bool supports_vector_machine<vector_machine::AVX512>(usize width) { 67 | switch (width) { 68 | case 2: 69 | // We require AVX512VBMI2 for 16-bit partitioning 70 | // since we use the _mm512_mask_compressstoreu_epi16 intrinsic 71 | return has_avx512_16; 72 | case 4: 73 | case 8: 74 | return has_avx512_32_64; 75 |
default: 76 | break; 77 | } 78 | return false; 79 | } 80 | } // namespace vxsort 81 | -------------------------------------------------------------------------------- /vxsort/pack_machine.h: -------------------------------------------------------------------------------- 1 | #ifndef VXSORT_PACK_MACHINE_H 2 | #define VXSORT_PACK_MACHINE_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include "defs.h" 9 | #include "alignment.h" 10 | #include "vector_machine/machine_traits.h" 11 | 12 | #include 13 | #include 14 | 15 | namespace vxsort { 16 | 17 | template 18 | class pack_machine { 19 | static_assert(Shift <= 31, "Shift must be in the range 0..31"); 20 | 21 | using VMT = vxsort_machine_traits; 22 | typedef typename VMT::TV TV; 23 | static const i32 N = sizeof(TV) / sizeof(T); 24 | typedef alignment_hint AH; 25 | 26 | public: 27 | 28 | /// pack the provided vectors into a lower bit-width type after offestting them by a known base value 29 | /// \param[in] u1 a vector containing the first half of the input 30 | /// \param[in] u2 a vector containing the second half of the input 31 | /// \param[in] offset_v a vector containing the base value to use for offsetting each element before packing 32 | /// \return a vector containing the packed values after readjusting them to the supplied base value 33 | static INLINE TV prepare_offset(T min_value) 34 | { 35 | // Create a vectorized version of the offset by which we need to 36 | // correct the data before packing it 37 | auto constexpr MIN = T(std::numeric_limits::min()); 38 | auto offset = VMT::template shift_n_sub(min_value, MIN); 39 | return VMT::broadcast(offset); 40 | } 41 | 42 | static INLINE TV pack_vectors(TV u1, TV u2, const TV offset_v) { 43 | // This is statically compiled in/out 44 | if (Shift > 0) { 45 | u1 = VMT::shift_right(u1, Shift); 46 | u2 = VMT::shift_right(u2, Shift); 47 | } 48 | u1 = VMT::sub(u1, offset_v); 49 | u2 = VMT::sub(u2, offset_v); 50 | 51 | return VMT::pack_unordered(u1, u2); 52 | } 53 | 54 | /// unpack the provided vector into two higher bit-width vectors, then offset them by a known base value 55 | /// \param[in] offset_v a vector containing the base value to use for offsetting each element after unpacking 56 | /// \param[in] p a vector containing the packed input values 57 | /// \param[out] u1 a vector containing the first half of the output 58 | /// \param[out] u2 a vector containing the second half of the output 59 | static INLINE void unpack_vectors(const TV offset_v, TV p, TV& u1, TV& u2) { 60 | VMT::unpack_ordered(p, u1, u2); 61 | 62 | u1 = VMT::add(u1, offset_v); 63 | u2 = VMT::add(u2, offset_v); 64 | 65 | if (Shift > 0) { // This is statically compiled in/out 66 | u1 = VMT::shift_left(u1, Shift); 67 | u2 = VMT::shift_left(u2, Shift); 68 | } 69 | } 70 | }; 71 | 72 | } // namespace vxsort 73 | 74 | #endif // VXSORT_PACK_MACHINE_H 75 | -------------------------------------------------------------------------------- /vxsort/partition_machine.h: -------------------------------------------------------------------------------- 1 | #ifndef VXSORT_PARTITION_MACHINE_H 2 | #define VXSORT_PARTITION_MACHINE_H 3 | 4 | #include 5 | #include 6 | #include 7 | 8 | #include "defs.h" 9 | #include "vector_machine/machine_traits.h" 10 | 11 | 12 | #ifdef VXSORT_STATS 13 | #include "stats/vxsort_stats.h" 14 | #endif 15 | 16 | 17 | namespace vxsort { 18 | using namespace std; 19 | using namespace vxsort::types; 20 | 21 | template 22 | struct partition_machine { 23 | using VMT = vxsort_machine_traits; 24 | typedef typename 
VMT::TV TV; 25 | public: 26 | 27 | static INLINE void partition_block(TV& data_vec, const TV P, 28 | T* RESTRICT &left, T* RESTRICT &right) { 29 | static_assert(always_false<T>, "must be specialized!"); 30 | } 31 | 32 | 33 | /// Prime the partition "pump" by reading and aligning up to one vector's worth 34 | /// of elements from each side of the input partition. The actual amount of data 35 | /// to be partitioned depends on the next alignment point for future vector reads: 36 | /// by reading an exact amount from each side, by the end of this function, future 37 | /// reads can perform 100% aligned loads, thereby reducing the internal resources 38 | /// consumed by modern HW when dealing with un-aligned, or worse-yet cache-line 39 | /// striped loads. 40 | /// @param[in] left_masked_amount the number of elements, prior to 41 | /// @p read_left, that are to be discarded; 42 | /// a zero (0) value is a special value that denotes that all values 43 | /// are to be used (e.g. 0 discarded) 44 | /// @param[in] right_unmasked_amount the number of elements, prior to 45 | /// @p read_right, that are to be partitioned; 46 | /// a zero (0) value is a special value that denotes that all values 47 | /// are to be used (e.g. 0 discarded) 48 | /// @param[in] P The vector pivot value 49 | /// @param[inout] read_left A reference to the current left-side read-position, 50 | /// modified to the next read-position by the end of this function 51 | /// @param[inout] read_right A reference to the current right-side read-position, 52 | /// modified to the next read-position by the end of this function 53 | /// @param[inout] spill_read_left A reference to the spill-buffer's copy-from left-side 54 | /// read-position. This will reflect the discarded elements 55 | /// by the end of this function 56 | /// @param[inout] spill_write_left A reference to the spill-buffer's left-side write-position. 57 | /// This will reflect the next valid vector-write position by 58 | /// the end of this function. 59 | /// @param[inout] spill_read_right A reference to the spill-buffer's copy-from right-side 60 | /// read-position. This will reflect the discarded elements 61 | /// by the end of this function 62 | /// @param[inout] spill_write_right A reference to the spill-buffer's right-side write-position. 63 | /// This will reflect the next valid vector-write position by 64 | /// the end of this function. 65 | static inline void align_vectorized(const i32 left_masked_amount, const i32 right_unmasked_amount, 66 | const TV P, 67 | T* RESTRICT &read_left, T* RESTRICT &read_right, 68 | T* RESTRICT &spill_read_left, T* RESTRICT &spill_write_left, 69 | T* RESTRICT &spill_read_right, T* RESTRICT &spill_write_right) { 70 | static_assert(always_false<T>, "must be specialized!"); 71 | } 72 | }; 73 | 74 | } // namespace vxsort 75 | 76 | #endif //VXSORT_PARTITION_MACHINE_H 77 | -------------------------------------------------------------------------------- /vxsort/smallsort/avx2/bitonic_machine.avx2.h: -------------------------------------------------------------------------------- 1 | ///////////////////////////////////////////////////////////////////////////// 2 | //// 3 | // This file was auto-generated by a tool at 2022-10-30 08:12:25 4 | // 5 | // It is recommended you DO NOT directly edit this file but instead edit 6 | // the code-generator that generated this source file instead.
7 | ///////////////////////////////////////////////////////////////////////////// 8 | 9 | #include "bitonic_machine.avx2.i16.generated.h" 10 | #include "bitonic_machine.avx2.u16.generated.h" 11 | #include "bitonic_machine.avx2.i32.generated.h" 12 | #include "bitonic_machine.avx2.u32.generated.h" 13 | #include "bitonic_machine.avx2.f32.generated.h" 14 | #include "bitonic_machine.avx2.i64.generated.h" 15 | #include "bitonic_machine.avx2.u64.generated.h" 16 | #include "bitonic_machine.avx2.f64.generated.h" 17 | -------------------------------------------------------------------------------- /vxsort/smallsort/avx512/bitonic_machine.avx512.h: -------------------------------------------------------------------------------- 1 | ///////////////////////////////////////////////////////////////////////////// 2 | //// 3 | // This file was auto-generated by a tool at 2022-10-30 08:12:25 4 | // 5 | // It is recommended you DO NOT directly edit this file but instead edit 6 | // the code-generator that generated this source file instead. 7 | ///////////////////////////////////////////////////////////////////////////// 8 | 9 | #include "bitonic_machine.avx512.i16.generated.h" 10 | #include "bitonic_machine.avx512.u16.generated.h" 11 | #include "bitonic_machine.avx512.i32.generated.h" 12 | #include "bitonic_machine.avx512.u32.generated.h" 13 | #include "bitonic_machine.avx512.f32.generated.h" 14 | #include "bitonic_machine.avx512.i64.generated.h" 15 | #include "bitonic_machine.avx512.u64.generated.h" 16 | #include "bitonic_machine.avx512.f64.generated.h" 17 | -------------------------------------------------------------------------------- /vxsort/smallsort/bitonic_machine.h: -------------------------------------------------------------------------------- 1 | #ifndef VXSORT_BITONIC_MACHINE_H 2 | #define VXSORT_BITONIC_MACHINE_H 3 | 4 | #include 5 | #include "../defs.h" 6 | #include "vector_machine/machine_traits.h" 7 | 8 | namespace vxsort::smallsort { 9 | using namespace std; 10 | 11 | template 12 | struct bitonic_machine { 13 | public: 14 | typedef T TV; 15 | typedef T TMASK; 16 | 17 | static INLINE void sort_04v_ascending(TV& d01, TV& d02, TV& d03, TV& d04); 18 | static INLINE void merge_04v_ascending(TV& d01, TV& d02, TV& d03, TV& d04); 19 | static INLINE void cross_min_max(TV& d01, TV& d02); 20 | static INLINE void strided_min_max(TV& d01, TV& d02); 21 | 22 | static NOINLINE void sort_01v_full_ascending(T *ptr); 23 | static NOINLINE void sort_02v_full_ascending(T *ptr); 24 | static NOINLINE void sort_03v_full_ascending(T *ptr); 25 | static NOINLINE void sort_04v_full_ascending(T *ptr); 26 | static void sort_full_vectors_ascending(T *ptr, usize length); 27 | static void sort_full_vectors_descending(T *ptr, usize length); 28 | }; 29 | } // namespace vxsort 30 | #endif 31 | -------------------------------------------------------------------------------- /vxsort/smallsort/bitonic_sort.avx2.h: -------------------------------------------------------------------------------- 1 | #ifndef VXSORT_BITONIC_SORT_AVX2_H 2 | #define VXSORT_BITONIC_SORT_AVX2_H 3 | 4 | #include "../vector_machine/machine_traits.avx2.h" 5 | #include "avx2/bitonic_machine.avx2.h" 6 | 7 | #include "bitonic_sort.h" 8 | 9 | #endif //VXSORT_BITONIC_SORT_AVX2_H 10 | -------------------------------------------------------------------------------- /vxsort/smallsort/bitonic_sort.avx512.h: -------------------------------------------------------------------------------- 1 | #ifndef VXSORT_BITONIC_SORT_AVX512_H 2 | #define 
VXSORT_BITONIC_SORT_AVX512_H 3 | 4 | #include "../vector_machine/machine_traits.avx512.h" 5 | #include "avx512/bitonic_machine.avx512.h" 6 | 7 | #include "bitonic_sort.h" 8 | 9 | #endif //VXSORT_BITONIC_SORT_AVX512_H 10 | -------------------------------------------------------------------------------- /vxsort/smallsort/codegen/bitonic_gen.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import argparse 3 | import os 4 | from datetime import datetime 5 | from enum import Enum 6 | 7 | from typing import IO 8 | 9 | from avx2 import AVX2BitonicISA 10 | from avx512 import AVX512BitonicISA 11 | from bitonic_isa import BitonicISA 12 | 13 | BitonicISA.register(AVX2BitonicISA) 14 | BitonicISA.register(AVX512BitonicISA) 15 | # NOTE: NeonBitonicISA is referenced below but is not imported or implemented yet; 16 | # requesting the NEON ISA will raise a NameError at runtime. 17 | def get_generator_supported_types(vector_isa): 18 | if isinstance(vector_isa, str): 19 | vector_isa = VectorISA[vector_isa] 20 | if vector_isa == VectorISA.AVX2: 21 | return AVX2BitonicISA.supported_types() 22 | elif vector_isa == VectorISA.AVX512: 23 | return AVX512BitonicISA.supported_types() 24 | elif vector_isa == VectorISA.NEON: 25 | return NeonBitonicISA.supported_types() 26 | else: 27 | raise Exception(f"Non-supported vector machine-type: {vector_isa}") 28 | 29 | 30 | def get_generator(vector_isa, type, f_header: IO): 31 | if isinstance(vector_isa, str): 32 | vector_isa = VectorISA[vector_isa] 33 | if vector_isa == VectorISA.AVX2: 34 | return AVX2BitonicISA(type, f_header) 35 | elif vector_isa == VectorISA.AVX512: 36 | return AVX512BitonicISA(type, f_header) 37 | elif vector_isa == VectorISA.NEON: 38 | return NeonBitonicISA(type, f_header) 39 | else: 40 | raise Exception(f"Non-supported vector machine-type: {vector_isa}") 41 | 42 | 43 | def generate_per_type(f_header: IO, type, vector_isa, break_inline): 44 | g = get_generator(vector_isa, type, f_header) 45 | g.generate_prologue() 46 | g.generate_1v_sorters(ascending=True) 47 | g.generate_1v_sorters(ascending=False) 48 | for width in range(2, g.max_bitonic_sort_vectors() + 1): 49 | # Allow breaking the inline chain once in a while (configurable) 50 | if break_inline == 0 or width % break_inline != 0: 51 | inline = True 52 | else: 53 | inline = False 54 | g.generate_compounded_sorter(width, asc=True, inline=inline) 55 | g.generate_compounded_sorter(width, asc=False, inline=inline) 56 | if width <= g.largest_merge_variant_needed(): 57 | g.generate_compounded_merger(width, asc=True, inline=inline) 58 | g.generate_compounded_merger(width, asc=False, inline=inline) 59 | 60 | 61 | g.generate_cross_min_max() 62 | g.generate_strided_min_max() 63 | 64 | g.generate_entry_points_full_vectors(asc=True) 65 | g.generate_entry_points_full_vectors(asc=False) 66 | g.generate_master_entry_point_full(asc=True) 67 | g.generate_master_entry_point_full(asc=False) 68 | g.generate_epilogue() 69 | 70 | 71 | class Language(Enum): 72 | csharp = 'csharp' 73 | cpp = 'cpp' 74 | rust = 'rust' 75 | 76 | def __str__(self): 77 | return self.value 78 | 79 | 80 | class VectorISA(Enum): 81 | AVX2 = 'avx2' 82 | AVX512 = 'avx512' 83 | NEON = 'neon' 84 | SVE = 'sve' 85 | 86 | def __str__(self): 87 | return self.value 88 | 89 | def autogenerated_blabber(): 90 | return f"""///////////////////////////////////////////////////////////////////////////// 91 | //// 92 | // This file was auto-generated by a tool at {datetime.now().strftime("%F %H:%M:%S")} 93 | // 94 | // It is recommended you DO NOT directly edit this file but instead edit 95 | // the code-generator that generated this source file instead. 96 | /////////////////////////////////////////////////////////////////////////////""" 97 | 98 | def generate_all_types(): 99 | parser = argparse.ArgumentParser() 100 | #parser.add_argument("--language", type=Language, choices=list(Language), 101 | # help="select output language: csharp/cpp/rust") 102 | parser.add_argument("--vector-isa", 103 | nargs='+', 104 | default=['all'], 105 | help='list of vector ISA to generate', 106 | choices=[str(v) for v in VectorISA] + ["all"]) 107 | parser.add_argument("--break-inline", type=int, default=0, help="break inlining every N levels") 108 | 109 | parser.add_argument("--output-dir", type=str, 110 | help="output directory") 111 | 112 | opts = parser.parse_args() 113 | 114 | if 'all' in opts.vector_isa: 115 | opts.vector_isa = [str(v) for v in VectorISA] 116 | 117 | for isa in opts.vector_isa: 118 | headers = [] 119 | for t in get_generator_supported_types(isa): 120 | filename = f"bitonic_machine.{isa.lower()}.{t}.generated" 121 | print(f"Generating {filename}.h") 122 | dirname = os.path.join(opts.output_dir, isa.lower()) 123 | os.makedirs(dirname, exist_ok=True) 124 | headers.append(filename + ".h") 125 | h_filename = os.path.join(dirname, filename + ".h") 126 | with open(h_filename, "w") as f_header: 127 | generate_per_type(f_header, t, isa, opts.break_inline) 128 | 129 | h_master_header = os.path.join(dirname, f"bitonic_machine.{isa.lower()}.h") 130 | with open(h_master_header, "w") as f_header: 131 | print(autogenerated_blabber(), file=f_header) 132 | print("", file=f_header) 133 | f_header.writelines([f"""#include \"{h}\"\n""" for h in headers]) 134 | 135 | if __name__ == '__main__': 136 | generate_all_types() 137 | -------------------------------------------------------------------------------- /vxsort/smallsort/codegen/bitonic_isa.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, ABCMeta, abstractmethod 2 | 3 | from utils import next_power_of_2 4 | 5 | 6 | class BitonicISA(ABC, metaclass=ABCMeta): 7 | 8 | @abstractmethod 9 | def vector_size(self): 10 | pass 11 | 12 | @abstractmethod 13 | def max_bitonic_sort_vectors(self): 14 | pass 15 | 16 | def largest_merge_variant_needed(self): 17 | return next_power_of_2(self.max_bitonic_sort_vectors()) 18 | 19 | 20 | 21 | 22 | 23 | @abstractmethod 24 | def vector_type(self): 25 | pass 26 | 27 | @classmethod 28 | @abstractmethod 29 | def supported_types(cls): 30 | pass 31 | 32 | @abstractmethod 33 | def generate_prologue(self): 34 | pass 35 | 36 | @abstractmethod 37 | def generate_epilogue(self): 38 | pass 39 | 40 | 41 | @abstractmethod 42 | def generate_1v_basic_sorters(self, ascending: bool): 43 | pass 44 | 45 | @abstractmethod 46 | def generate_1v_merge_sorters(self, ascending: bool): 47 | pass 48 | 49 | def generate_1v_sorters(self, ascending: bool): 50 | self.generate_1v_basic_sorters(ascending) 51 | self.generate_1v_merge_sorters(ascending) 52 | 53 | @abstractmethod 54 | def generate_compounded_sorter(self, width: int, asc: bool, inline: bool): 55 | pass 56 | 57 | @abstractmethod 58 | def generate_compounded_merger(self, width: int, asc: bool, inline: bool): 59 | pass 60 | 61 | @abstractmethod 62 | def generate_entry_points_full_vectors(self, asc: bool): 63 | pass 64 | 65 | @abstractmethod 66 | def generate_master_entry_point_full(self, asc: bool): 67 | pass 68 | 69 | @abstractmethod 70 | def generate_cross_min_max(self): 71 | pass 72 | 73 | @abstractmethod 74 | def
generate_strided_min_max(self): 75 | pass -------------------------------------------------------------------------------- /vxsort/smallsort/codegen/utils.py: -------------------------------------------------------------------------------- 1 | native_size_map = { 2 | "i16": 2, 3 | "u16": 2, 4 | "i32": 4, 5 | "u32": 4, 6 | "f32": 4, 7 | "i64": 8, 8 | "u64": 8, 9 | "f64": 8, 10 | } 11 | 12 | 13 | def next_power_of_2(v): 14 | v = v - 1 15 | v |= v >> 1 16 | v |= v >> 2 17 | v |= v >> 4 18 | v |= v >> 8 19 | v |= v >> 16 20 | v = v + 1 21 | return int(v) 22 | -------------------------------------------------------------------------------- /vxsort/stats/vxsort_stats.cpp: -------------------------------------------------------------------------------- 1 | #include "vxsort_stats.h" 2 | #ifdef VXSORT_STATS 3 | 4 | #include 5 | 6 | namespace vxsort { 7 | using namespace vxsort::types; 8 | 9 | i32 vxsort_stats_base::last_type = 0; 10 | std::array vxsort_stats_base::registered_types; 11 | 12 | template 13 | void vxsort_stats::print_stats() 14 | { 15 | fmt::print("{:9} | {:7} | {:8} | {:7} | {:11} | {:12} | {:10} | {:9.2}% | {:>9.2f}\n", 16 | vxsort_type_to_str(typeid_to_vxsort_type()), 17 | _num_sorts, 18 | _total_sort_size, 19 | _num_partitions, 20 | _num_vec_loads, 21 | _num_vec_stores, 22 | _num_small_sorts, 23 | (f64) _small_sorts_size * 100 / (f64) _total_partitioned_size, 24 | (f64) _small_sorts_size / (f64) _num_small_sorts); 25 | } 26 | 27 | extern void print_all_stats() { 28 | fmt::print("type | # sorts | # sorted | # parts | # vec loads | # vec stores | # sm. sort | % sm. sort | avg. sm. sort\n"); 29 | fmt::print("----------|---------|----------|---------|-------------|--------------|------------|------------|--------------\n"); 30 | for (auto i = 0; i < vxsort_stats_base::last_type; ++i) { 31 | switch (vxsort_stats_base::registered_types[i]) { 32 | case vxsort_type::I16: vxsort_stats::print_stats(); break; 33 | case vxsort_type::U16: vxsort_stats::print_stats(); break; 34 | case vxsort_type::I32: vxsort_stats::print_stats(); break; 35 | case vxsort_type::U32: vxsort_stats::print_stats(); break; 36 | case vxsort_type::I64: vxsort_stats::print_stats(); break; 37 | case vxsort_type::U64: vxsort_stats::print_stats(); break; 38 | case vxsort_type::F32: vxsort_stats::print_stats(); break; 39 | case vxsort_type::F64: vxsort_stats::print_stats(); break; 40 | case vxsort_type::NONE: break; 41 | } 42 | } 43 | } 44 | 45 | extern void reset_all_stats() { 46 | for (auto i = 0; i < vxsort_stats_base::last_type; i++) { 47 | switch (vxsort_stats_base::registered_types[i]) { 48 | case vxsort_type::I16: vxsort_stats::reset(); break; 49 | case vxsort_type::U16: vxsort_stats::reset(); break; 50 | case vxsort_type::I32: vxsort_stats::reset(); break; 51 | case vxsort_type::U32: vxsort_stats::reset(); break; 52 | case vxsort_type::I64: vxsort_stats::reset(); break; 53 | case vxsort_type::U64: vxsort_stats::reset(); break; 54 | case vxsort_type::F32: vxsort_stats::reset(); break; 55 | case vxsort_type::F64: vxsort_stats::reset(); break; 56 | case vxsort_type::NONE: break; 57 | } 58 | } 59 | } 60 | 61 | } // namespace vxsort 62 | 63 | #endif -------------------------------------------------------------------------------- /vxsort/stats/vxsort_stats.h: -------------------------------------------------------------------------------- 1 | #ifndef VXSORT_VXSORT_STATS_H 2 | #define VXSORT_VXSORT_STATS_H 3 | 4 | #ifdef VXSORT_STATS 5 | 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include "defs.h" 11 | 12 | 13 
| using namespace std; 14 | namespace vxsort { 15 | using namespace vxsort::types; 16 | 17 | enum class vxsort_type { 18 | I16, 19 | U16, 20 | I32, 21 | U32, 22 | I64, 23 | U64, 24 | F32, 25 | F64, 26 | NONE 27 | }; 28 | 29 | class vxsort_stats_base 30 | { 31 | public: 32 | static std::array registered_types; 33 | static i32 last_type; 34 | protected: 35 | static void reset() 36 | { 37 | #ifdef VXSORT_STATS 38 | last_type = 0; 39 | #endif 40 | } 41 | 42 | template 43 | static vxsort_type typeid_to_vxsort_type() { 44 | if (typeid(T) == typeid(i16)) 45 | return vxsort_type::I16; 46 | else if (typeid(T) == typeid(i32)) 47 | return vxsort_type::I32; 48 | else if (typeid(T) == typeid(i64)) 49 | return vxsort_type::I64; 50 | 51 | if (typeid(T) == typeid(u16)) 52 | return vxsort_type::U16; 53 | else if (typeid(T) == typeid(u32)) 54 | return vxsort_type::U32; 55 | else if (typeid(T) == typeid(u64)) 56 | return vxsort_type::U64; 57 | 58 | if (typeid(T) == typeid(f32)) 59 | return vxsort_type::F32; 60 | if (typeid(T) == typeid(f64)) 61 | return vxsort_type::F64; 62 | return vxsort_type::NONE; 63 | } 64 | 65 | static const char *vxsort_type_to_str(const vxsort_type type) { 66 | switch (type) { 67 | case vxsort_type::I16: return "i16"; 68 | case vxsort_type::U16: return "u16"; 69 | case vxsort_type::I32: return "i32"; 70 | case vxsort_type::U32: return "u32"; 71 | case vxsort_type::I64: return "i64"; 72 | case vxsort_type::U64: return "u64"; 73 | case vxsort_type::F32: return "f32"; 74 | case vxsort_type::F64: return "f64"; 75 | case vxsort_type::NONE: return "none"; 76 | } 77 | } 78 | 79 | static void register_stat(const vxsort_type type) 80 | { 81 | for (auto i = 0; i < last_type; i++) 82 | if (registered_types[i] == type) 83 | return; 84 | 85 | registered_types[last_type++] = type; 86 | } 87 | }; 88 | 89 | template 90 | class vxsort_stats : vxsort_stats_base 91 | { 92 | private: 93 | static u64 _num_sorts; 94 | static u64 _total_sort_size; 95 | static u64 _num_partitions; 96 | static u64 _total_partitioned_size; 97 | static u64 _num_small_sorts; 98 | static u64 _small_sorts_size; 99 | static u64 _packed_elements; 100 | static u64 _unpacked_elements; 101 | static u64 _num_perms; 102 | static u64 _num_vec_loads; 103 | static u64 _num_vec_stores; 104 | 105 | public: 106 | static void reset() 107 | { 108 | _num_sorts = 0; 109 | _total_sort_size = 0; 110 | _num_partitions = 0; 111 | _total_partitioned_size = 0; 112 | _num_small_sorts = 0; 113 | _small_sorts_size = 0; 114 | _packed_elements = 0; 115 | _unpacked_elements = 0; 116 | _num_perms = 0; 117 | _num_vec_loads = 0; 118 | _num_vec_stores = 0; 119 | 120 | } 121 | static void bump_sorts(size_t n) { 122 | _num_sorts++; 123 | _total_sort_size += n; 124 | vxsort_stats_base::register_stat(vxsort_stats_base::typeid_to_vxsort_type()); 125 | } 126 | static void bump_partitions(size_t n) { 127 | _num_partitions++; 128 | _total_partitioned_size += n; 129 | } 130 | static void bump_small_sorts(i32 n = 1) { _num_small_sorts++; } 131 | static void bump_perms(usize perms = 1) { _num_perms += perms; } 132 | static void bump_vec_loads(usize loads = 1) { _num_vec_loads += loads; } 133 | static void bump_vec_stores(usize stores = 1) { _num_vec_stores += stores; } 134 | static void record_small_sort_size(usize sort_size) { _small_sorts_size += sort_size; } 135 | static void bump_packs(usize size) { _packed_elements += size; } 136 | static void bump_unpacks(usize size) { _unpacked_elements += size; } 137 | 138 | static void print_stats(); 139 | }; 140 | 141 | extern 
void reset_all_stats(); 142 | extern void print_all_stats(); 143 | 144 | template 145 | u64 vxsort_stats::_num_sorts = 0; 146 | template 147 | u64 vxsort_stats::_total_sort_size = 0; 148 | template 149 | u64 vxsort_stats::_num_partitions = 0; 150 | template 151 | u64 vxsort_stats::_total_partitioned_size = 0; 152 | template 153 | u64 vxsort_stats::_num_small_sorts = 0; 154 | template 155 | u64 vxsort_stats::_small_sorts_size = 0; 156 | template 157 | u64 vxsort_stats::_packed_elements = 0; 158 | template 159 | u64 vxsort_stats::_unpacked_elements = 0; 160 | template 161 | u64 vxsort_stats::_num_perms = 0; 162 | template 163 | u64 vxsort_stats::_num_vec_loads = 0; 164 | template 165 | u64 vxsort_stats::_num_vec_stores = 0; 166 | 167 | } 168 | 169 | #endif 170 | #endif // VXSORT_VXSORT_STATS_H 171 | -------------------------------------------------------------------------------- /vxsort/vector_machine/avx2/f32.h: -------------------------------------------------------------------------------- 1 | template <> 2 | class vxsort_machine_traits { 3 | public: 4 | typedef f32 T; 5 | typedef __m256 TV; 6 | typedef __m256i TLOADSTOREMASK; 7 | typedef u32 TCMPMASK; 8 | typedef f32 TPACK; 9 | 10 | static constexpr i32 N = sizeof(TV) / sizeof(T); 11 | static_assert(is_powerof2(N), "vector-size / element-size must be a power of 2"); 12 | 13 | static constexpr bool supports_compress_writes() { return false; } 14 | static constexpr bool supports_packing() { return false; } 15 | 16 | template 17 | static bool can_pack(T) { return false; } 18 | 19 | static INLINE TLOADSTOREMASK generate_prefix_mask(i32 amount) { 20 | assert(amount >= 0); 21 | assert(amount <= N); 22 | return _mm256_cvtepi8_epi32(_mm_loadu_si128((__m128i*)(prefix_mask_table_32b + amount * N))); 23 | } 24 | 25 | static INLINE TLOADSTOREMASK generate_suffix_mask(i32 amount) { 26 | assert(amount >= 0); 27 | assert(amount <= N); 28 | return _mm256_cvtepi8_epi32(_mm_loadu_si128((__m128i*)(suffix_mask_table_32b + amount * N))); 29 | } 30 | 31 | static INLINE TV load_vec(TV* p) { return _mm256_loadu_ps((T *)p); } 32 | 33 | static INLINE void store_vec(TV* ptr, TV v) { _mm256_storeu_ps((T *)ptr, v); } 34 | 35 | static void store_compress_vec(TV*, TV, TCMPMASK) { throw std::runtime_error("operation is unsupported"); } 36 | 37 | static INLINE TV load_partial_vec(TV *p, TV base, TLOADSTOREMASK mask) { 38 | return i2s(_mm256_or_si256(s2i(_mm256_maskload_ps((T *) p, mask)), 39 | _mm256_andnot_si256(mask, s2i(base)))); 40 | } 41 | 42 | static INLINE void store_masked_vec(TV *p, TV v, TLOADSTOREMASK mask) { 43 | _mm256_maskstore_ps((T *) p, mask, v); 44 | } 45 | 46 | static INLINE TV partition_vector(TV v, i32 mask) { 47 | assert(mask >= 0); 48 | assert(mask <= 255); 49 | return _mm256_permutevar8x32_ps(v, _mm256_cvtepu8_epi32(_mm_loadu_si128((__m128i*)(perm_table_32 + mask * 8)))); 50 | } 51 | 52 | static INLINE TV broadcast(T pivot) { return _mm256_set1_ps(pivot); } 53 | 54 | static INLINE TCMPMASK get_cmpgt_mask(TV a, TV b) { 55 | /// 0x0E: Greater-than (ordered, signaling) \n 56 | /// 0x1E: Greater-than (ordered, non-signaling) 57 | return _mm256_movemask_ps(_mm256_cmp_ps(a, b, _CMP_GT_OS)); 58 | } 59 | 60 | static INLINE TV shift_right(TV v, i32 i) { return v; } 61 | static INLINE TV shift_left(TV v, i32 i) { return v; } 62 | 63 | static INLINE TV add(TV a, TV b) { return _mm256_add_ps(a, b); } 64 | static INLINE TV sub(TV a, TV b) { return _mm256_sub_ps(a, b); }; 65 | 66 | static INLINE TV pack_unordered(TV, TV) { throw std::runtime_error("operation 
is unsupported"); } 67 | static INLINE void unpack_ordered(TV, TV&, TV&) { } 68 | 69 | template 70 | static INLINE T shift_n_sub(T v, T sub) { return v; } 71 | 72 | template 73 | static T unshift_and_add(TPACK from, T add) { return add; } 74 | }; 75 | -------------------------------------------------------------------------------- /vxsort/vector_machine/avx2/f64.h: -------------------------------------------------------------------------------- 1 | template <> 2 | class vxsort_machine_traits { 3 | public: 4 | typedef f64 T; 5 | typedef __m256d TV; 6 | typedef __m256i TLOADSTOREMASK; 7 | typedef u32 TCMPMASK; 8 | typedef f64 TPACK; 9 | 10 | static constexpr i32 N = sizeof(TV) / sizeof(T); 11 | static_assert(is_powerof2(N), "vector-size / element-size must be a power of 2"); 12 | 13 | static constexpr bool supports_compress_writes() { return false; } 14 | static constexpr bool supports_packing() { return false; } 15 | 16 | template 17 | static bool can_pack(T) { return false; } 18 | 19 | static INLINE TLOADSTOREMASK generate_prefix_mask(i32 amount) { 20 | assert(amount >= 0); 21 | assert(amount <= N); 22 | return _mm256_cvtepi8_epi64(_mm_loadu_si128((__m128i*)(prefix_mask_table_64b + amount * N))); 23 | } 24 | 25 | static INLINE TLOADSTOREMASK generate_suffix_mask(i32 amount) { 26 | assert(amount >= 0); 27 | assert(amount <= N); 28 | return _mm256_cvtepi8_epi64(_mm_loadu_si128((__m128i*)(suffix_mask_table_64b + amount * N))); 29 | } 30 | 31 | static INLINE TV load_vec(TV* p) { return _mm256_loadu_pd((T *)p); } 32 | 33 | static INLINE void store_vec(TV* ptr, TV v) { _mm256_storeu_pd((T *)ptr, v); } 34 | 35 | static void store_compress_vec(TV* ptr, TV v, TCMPMASK mask) { throw std::runtime_error("operation is unsupported"); } 36 | 37 | static INLINE TV load_partial_vec(TV *p, TV base, TLOADSTOREMASK mask) { 38 | return i2d(_mm256_or_si256(d2i(_mm256_maskload_pd((T *) p, mask)), 39 | _mm256_andnot_si256(mask, d2i(base)))); 40 | } 41 | 42 | static INLINE void store_masked_vec(TV *p, TV v, TLOADSTOREMASK mask) { 43 | _mm256_maskstore_pd((double *) p, mask, v); 44 | } 45 | 46 | static INLINE TV partition_vector(TV v, i32 mask) { 47 | assert(mask >= 0); 48 | assert(mask <= 15); 49 | return s2d(_mm256_permutevar8x32_ps(d2s(v), _mm256_cvtepu8_epi32(_mm_loadu_si128((__m128i*)(perm_table_64 + mask * 8))))); 50 | } 51 | 52 | static INLINE TV broadcast(T pivot) { return _mm256_set1_pd(pivot); } 53 | static INLINE TCMPMASK get_cmpgt_mask(TV a, TV b) { 54 | /// 0x0E: Greater-than (ordered, signaling) \n 55 | /// 0x1E: Greater-than (ordered, non-signaling) 56 | return _mm256_movemask_pd(_mm256_cmp_pd(a, b, _CMP_GT_OS)); 57 | } 58 | 59 | static INLINE TV shift_right(TV v, i32 i) { return v; } 60 | static INLINE TV shift_left(TV v, i32 i) { return v; } 61 | 62 | static INLINE TV add(TV a, TV b) { return _mm256_add_pd(a, b); } 63 | static INLINE TV sub(TV a, TV b) { return _mm256_sub_pd(a, b); }; 64 | 65 | static INLINE TV pack_unordered(TV, TV) { TV tmp = _mm256_set1_pd(0); return tmp; } 66 | static INLINE void unpack_ordered(TV, TV&, TV&) { } 67 | 68 | template 69 | static INLINE T shift_n_sub(T v, T sub) { return v; } 70 | 71 | template 72 | static INLINE T unshift_and_add(TPACK from, T add) { return add; } 73 | }; 74 | -------------------------------------------------------------------------------- /vxsort/vector_machine/avx2/i16.h: -------------------------------------------------------------------------------- 1 | template <> 2 | class vxsort_machine_traits { 3 | public: 4 | typedef i16 T; 5 | typedef 
__m256i TV; 6 | typedef i32 TLOADSTOREMASK; 7 | typedef u32 TCMPMASK; 8 | typedef i16 TPACK; 9 | typedef typename std::make_unsigned::type TU; 10 | 11 | static constexpr i32 N = sizeof(TV) / sizeof(T); 12 | static_assert(is_powerof2(N), "vector-size / element-size must be a power of 2"); 13 | 14 | static constexpr bool supports_compress_writes() { return false; } 15 | static constexpr bool supports_packing() { return false; } 16 | 17 | template 18 | static bool can_pack(T) { return false; } 19 | 20 | static INLINE TLOADSTOREMASK generate_prefix_mask(i32 amount) { 21 | assert(amount >= 0); 22 | assert(amount <= N); 23 | 24 | return amount ? amount : N; 25 | } 26 | 27 | static INLINE TLOADSTOREMASK generate_suffix_mask(i32 amount) { 28 | assert(amount >= 0); 29 | assert(amount <= N); 30 | 31 | return amount ? -N + amount : -N; 32 | } 33 | 34 | static INLINE TV load_vec(TV* p) { return _mm256_lddqu_si256(p); } 35 | 36 | static INLINE void store_vec(TV* ptr, TV v) { _mm256_storeu_si256(ptr, v); } 37 | 38 | static void store_compress_vec(TV*, TV, TCMPMASK) { throw std::runtime_error("operation is unsupported"); } 39 | 40 | static INLINE TV load_partial_vec(TV *p, TV base, TLOADSTOREMASK mask) { 41 | // FML: There is only so much AVX2 stupidity one person can 42 | // take in their entire lifetime, I'm personally over this crap 43 | std::array base_vec; 44 | _mm256_storeu_si256((TV *)base_vec.data(), base); 45 | auto pt = (T *)p; 46 | auto psrc = mask > 0 ? pt : pt + N + mask; 47 | auto pdest = mask > 0 ? base_vec.begin() : base_vec.end() + mask; 48 | auto amount = abs(mask); 49 | std::copy_n(psrc, amount, pdest); 50 | return _mm256_lddqu_si256((TV *)base_vec.data()); 51 | } 52 | 53 | static INLINE void store_masked_vec(TV *p, TV v, TLOADSTOREMASK mask) { 54 | memcpy(p, &v, sizeof(T) * mask); 55 | } 56 | 57 | static INLINE TV partition_vector(TV, i32) { 58 | // Should never be called, since we "hijack" 16b/avx2 partitioning with template 59 | // specializtion with partition_machine 60 | throw std::runtime_error("operation is unsupported"); 61 | } 62 | 63 | static INLINE TV broadcast(T pivot) { return _mm256_set1_epi16(pivot); } 64 | 65 | static INLINE TCMPMASK get_cmpgt_mask(TV a, TV b) { 66 | return _pext_u32( 67 | _mm256_movemask_epi8(_mm256_cmpgt_epi16(a, b)), 68 | 0x55555555); 69 | } 70 | 71 | static INLINE TV shift_right(TV v, i32 i) { return _mm256_srli_epi16(v, i); } 72 | static INLINE TV shift_left(TV v, i32 i) { return _mm256_slli_epi16(v, i); } 73 | 74 | static INLINE TV add(TV a, TV b) { return _mm256_add_epi16(a, b); } 75 | static INLINE TV sub(TV a, TV b) { return _mm256_sub_epi16(a, b); }; 76 | 77 | static INLINE TV pack_unordered(TV, TV) { throw std::runtime_error("operation is unsupported"); } 78 | static INLINE void unpack_ordered(TV, TV&, TV&) { } 79 | 80 | template 81 | static INLINE T shift_n_sub(T v, T sub) { 82 | if (Shift > 0) 83 | v >>= Shift; 84 | v -= sub; 85 | return v; 86 | } 87 | 88 | template 89 | static INLINE T unshift_and_add(TPACK from, T add) { 90 | add += from; 91 | if (Shift > 0) 92 | add = (T) (((TU) add) << Shift); 93 | return add; 94 | } 95 | }; 96 | -------------------------------------------------------------------------------- /vxsort/vector_machine/avx2/i32.h: -------------------------------------------------------------------------------- 1 | template <> 2 | class vxsort_machine_traits { 3 | public: 4 | typedef i32 T; 5 | typedef __m256i TV; 6 | typedef __m256i TLOADSTOREMASK; 7 | typedef u32 TCMPMASK; 8 | typedef i16 TPACK; 9 | typedef typename 
std::make_unsigned::type TUPACK; 10 | typedef typename std::make_unsigned::type TU; 11 | static_assert(sizeof(TPACK)*2 == sizeof(T), "TPACK must be half-width of T"); 12 | 13 | static constexpr i32 N = sizeof(TV) / sizeof(T); 14 | static_assert(is_powerof2(N), "vector-size / element-size must be a power of 2"); 15 | 16 | static constexpr bool supports_compress_writes() { return false; } 17 | static constexpr bool supports_packing() { return true; } 18 | 19 | template 20 | static bool can_pack(T span) { 21 | constexpr auto PACK_LIMIT = (((TU)std::numeric_limits::max() + 1)) << Shift; 22 | return ((TU)span) < PACK_LIMIT; 23 | } 24 | 25 | static INLINE TLOADSTOREMASK generate_prefix_mask(i32 amount) { 26 | assert(amount >= 0); 27 | assert(amount <= N); 28 | return _mm256_cvtepi8_epi32(_mm_loadu_si128((__m128i*)(prefix_mask_table_32b + N * amount))); 29 | } 30 | 31 | static INLINE TLOADSTOREMASK generate_suffix_mask(i32 amount) { 32 | assert(amount >= 0); 33 | assert(amount <= N); 34 | return _mm256_cvtepi8_epi32(_mm_loadu_si128((__m128i*)(suffix_mask_table_32b + N * amount))); 35 | } 36 | 37 | static INLINE TV load_vec(TV* p) { return _mm256_lddqu_si256(p); } 38 | 39 | static INLINE void store_vec(TV* ptr, TV v) { _mm256_storeu_si256(ptr, v); } 40 | 41 | static INLINE void store_compress_vec(TV*, TV, TCMPMASK) { throw std::runtime_error("operation is unsupported"); } 42 | 43 | static INLINE TV load_partial_vec(TV *p, TV base, TLOADSTOREMASK mask) { 44 | return _mm256_or_si256(_mm256_maskload_epi32((i32 *) p, mask), 45 | _mm256_andnot_si256(mask, base)); 46 | } 47 | 48 | static INLINE void store_masked_vec(TV *p, TV v, TLOADSTOREMASK mask) { _mm256_maskstore_epi32((i32 *) p, mask, v); } 49 | 50 | static INLINE TV partition_vector(TV v, i32 mask) { 51 | assert(mask >= 0); 52 | assert(mask <= 255); 53 | return s2i(_mm256_permutevar8x32_ps(i2s(v), _mm256_cvtepi8_epi32(_mm_loadu_si128((__m128i*)(perm_table_32 + mask * 8))))); 54 | } 55 | 56 | static INLINE TV broadcast(T pivot) { return _mm256_set1_epi32(pivot); } 57 | 58 | static INLINE TCMPMASK get_cmpgt_mask(TV a, TV b) { return _mm256_movemask_ps(i2s(_mm256_cmpgt_epi32(a, b))); } 59 | 60 | static INLINE TV shift_right(TV v, i32 i) { return _mm256_srli_epi32(v, i); } 61 | static INLINE TV shift_left(TV v, i32 i) { return _mm256_slli_epi32(v, i); } 62 | 63 | static INLINE TV add(TV a, TV b) { return _mm256_add_epi32(a, b); } 64 | static INLINE TV sub(TV a, TV b) { return _mm256_sub_epi32(a, b); }; 65 | 66 | static INLINE TV pack_unordered(TV a, TV b) { return _mm256_packs_epi32(a, b); } 67 | 68 | static INLINE void unpack_ordered(TV p, TV& u1, TV& u2) { 69 | u1 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(p, 0)); 70 | u2 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(p, 1)); 71 | } 72 | 73 | template 74 | static INLINE T shift_n_sub(T v, T sub) { 75 | if (Shift > 0) 76 | v >>= Shift; 77 | v -= sub; 78 | return v; 79 | } 80 | 81 | template 82 | static INLINE T unshift_and_add(TPACK from, T add) { 83 | add += from; 84 | if (Shift > 0) 85 | add = (T) (((TU) add) << Shift); 86 | return add; 87 | } 88 | }; 89 | -------------------------------------------------------------------------------- /vxsort/vector_machine/avx2/i64.h: -------------------------------------------------------------------------------- 1 | template <> 2 | class vxsort_machine_traits { 3 | public: 4 | typedef i64 T; 5 | typedef __m256i TV; 6 | typedef __m256i TLOADSTOREMASK; 7 | typedef u32 TCMPMASK; 8 | typedef i32 TPACK; 9 | typedef typename std::make_unsigned::type TUPACK; 
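// Packing rationale (descriptive, see can_pack() below): when the value span of the
// range being sorted fits in 32 bits (after the compile-time Shift), pairs of i64
// vectors are packed into a single vector of half-width TPACK (i32) lanes mid-sort,
// halving the data volume; TUPACK/TU are the unsigned counterparts used for
// overflow-safe span and shift arithmetic.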
--------------------------------------------------------------------------------
/vxsort/vector_machine/avx2/u16.h:
--------------------------------------------------------------------------------
1 | template <>
2 | class vxsort_machine_traits<u16, AVX2> {
3 |  public:
4 |     typedef u16 T;
5 |     typedef __m256i TV;
6 |     typedef i32 TLOADSTOREMASK;
7 |     typedef u32 TCMPMASK;
8 |     typedef u16 TPACK;
9 |     typedef typename std::make_unsigned<T>::type TU;
10 |
11 |     static constexpr i32 N = sizeof(TV) / sizeof(T);
12 |     static_assert(is_powerof2(N), "vector-size / element-size must be a power of 2");
13 |
14 |     static constexpr bool supports_compress_writes() { return false; }
15 |     static constexpr bool supports_packing() { return false; }
16 |
17 |     template <i32 Shift>
18 |     static bool can_pack(T) { return false; }
19 |
20 |     static INLINE TLOADSTOREMASK generate_prefix_mask(i32 amount) {
21 |         assert(amount >= 0);
22 |         assert(amount <= N);
23 |
24 |         return amount ? amount : N;
25 |     }
26 |
27 |     static INLINE TLOADSTOREMASK generate_suffix_mask(i32 amount) {
28 |         assert(amount >= 0);
29 |         assert(amount <= N);
30 |
31 |         return amount ? -N + amount : -N;
32 |     }
33 |
34 |     static INLINE TV load_vec(TV* p) { return _mm256_lddqu_si256(p); }
35 |
36 |     static INLINE void store_vec(TV* ptr, TV v) { _mm256_storeu_si256(ptr, v); }
37 |
38 |     static void store_compress_vec(TV*, TV, TCMPMASK) { throw std::runtime_error("operation is unsupported"); }
39 |
40 |     static INLINE TV load_partial_vec(TV *p, TV base, TLOADSTOREMASK mask) {
41 |         // FML: There is only so much AVX2 stupidity one person can
42 |         // take in their entire lifetime, I'm personally over this crap
43 |         std::array<T, N> base_vec;
44 |         _mm256_storeu_si256((TV *)base_vec.data(), base);
45 |         auto pt = (T *)p;
46 |         auto psrc = mask > 0 ? pt : pt + N + mask;
47 |         auto pdest = mask > 0 ? base_vec.begin() : base_vec.end() + mask;
48 |         auto amount = abs(mask);
49 |         std::copy_n(psrc, amount, pdest);
50 |         return _mm256_lddqu_si256((TV *)base_vec.data());
51 |     }
52 |
53 |     static INLINE void store_masked_vec(TV *p, TV v, TLOADSTOREMASK mask) {
54 |         memcpy(p, &v, sizeof(T) * mask);
55 |     }
56 |
57 |     static INLINE TV partition_vector(TV, i32) {
58 |         // Should never be called, since we "hijack" 16b/avx2 partitioning with a template
59 |         // specialization of partition_machine
60 |         throw std::runtime_error("operation is unsupported");
61 |     }
62 |
63 |     static INLINE TV broadcast(T pivot) { return _mm256_set1_epi16(pivot); }
64 |
65 |     static INLINE TCMPMASK get_cmpgt_mask(TV a, TV b) {
66 |         __m256i top_bit = _mm256_set1_epi16(1U << 15);
67 |         return _pext_u32(
68 |             _mm256_movemask_epi8(_mm256_cmpgt_epi16(_mm256_xor_si256(top_bit, a),
69 |                                                     _mm256_xor_si256(top_bit, b))),
70 |             0x55555555);
71 |     }
72 |
73 |     static INLINE TV shift_right(TV v, i32 i) { return _mm256_srli_epi16(v, i); }
74 |     static INLINE TV shift_left(TV v, i32 i) { return _mm256_slli_epi16(v, i); }
75 |
76 |     static INLINE TV add(TV a, TV b) { return _mm256_add_epi16(a, b); }
77 |     static INLINE TV sub(TV a, TV b) { return _mm256_sub_epi16(a, b); }
78 |
79 |     static INLINE TV pack_unordered(TV, TV) { throw std::runtime_error("operation is unsupported"); }
80 |     static INLINE void unpack_ordered(TV, TV&, TV&) { }
81 |
82 |     template <i32 Shift>
83 |     static INLINE T shift_n_sub(T v, T sub) {
84 |         if (Shift > 0)
85 |             v >>= Shift;
86 |         v -= sub;
87 |         return v;
88 |     }
89 |
90 |     template <i32 Shift>
91 |     static INLINE T unshift_and_add(TPACK from, T add) {
92 |         add += from;
93 |         if (Shift > 0)
94 |             add = (T) (((TU) add) << Shift);
95 |         return add;
96 |     }
97 | };
98 |
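// --- Editorial sketch, not part of the repository sources ---
// Two tricks from get_cmpgt_mask above, in isolation. AVX2 has no
// unsigned 16-bit compare, so the sign bit of both operands is flipped
// first: (a ^ 0x8000) > (b ^ 0x8000) as *signed* i16 matches a > b as
// unsigned u16. Afterwards _mm256_movemask_epi8 yields two identical
// bits per 16-bit lane, and _pext_u32(m, 0x55555555) keeps every even
// bit, compacting the result to one bit per lane. Scalar check of the
// compare transform:
#include <cassert>
#include <cstdint>

static bool cmpgt_u16_via_signed(uint16_t a, uint16_t b) {
    auto as = (int16_t)(a ^ 0x8000u);
    auto bs = (int16_t)(b ^ 0x8000u);
    return as > bs;
}

int main() {
    assert(cmpgt_u16_via_signed(0xFFFF, 0x0001) == true);   // max > 1
    assert(cmpgt_u16_via_signed(0x0000, 0x8000) == false);  // 0 < 2^15
    assert(cmpgt_u16_via_signed(0x8001, 0x7FFF) == true);   // crosses the sign bit
    return 0;
}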
--------------------------------------------------------------------------------
/vxsort/vector_machine/avx2/u32.h:
--------------------------------------------------------------------------------
1 | template <>
2 | class vxsort_machine_traits<u32, AVX2> {
3 |  public:
4 |     typedef u32 T;
5 |     typedef __m256i TV;
6 |     typedef __m256i TLOADSTOREMASK;
7 |     typedef u32 TCMPMASK;
8 |     typedef u16 TPACK;
9 |     typedef typename std::make_unsigned<TPACK>::type TUPACK;
10 |     typedef typename std::make_unsigned<T>::type TU;
11 |     static_assert(sizeof(TPACK)*2 == sizeof(T), "TPACK must be half-width of T");
12 |
13 |     static constexpr i32 N = sizeof(TV) / sizeof(T);
14 |     static_assert(is_powerof2(N), "vector-size / element-size must be a power of 2");
15 |
16 |     static constexpr bool supports_compress_writes() { return false; }
17 |     static constexpr bool supports_packing() { return true; }
18 |
19 |     template <i32 Shift>
20 |     static bool can_pack(T span) {
21 |         constexpr auto PACK_LIMIT = (((TU)std::numeric_limits<TPACK>::max() + 1)) << Shift;
22 |         return ((TU)span) < PACK_LIMIT;
23 |     }
24 |
25 |     static INLINE TLOADSTOREMASK generate_prefix_mask(i32 amount) {
26 |         assert(amount >= 0);
27 |         assert(amount <= N);
28 |         return _mm256_cvtepi8_epi32(_mm_loadu_si128((__m128i*)(prefix_mask_table_32b + N * amount)));
29 |     }
30 |
31 |     static INLINE TLOADSTOREMASK generate_suffix_mask(i32 amount) {
32 |         assert(amount >= 0);
33 |         assert(amount <= N);
34 |         return _mm256_cvtepi8_epi32(_mm_loadu_si128((__m128i*)(suffix_mask_table_32b + N * amount)));
35 |     }
36 |
37 |     static INLINE TV load_vec(TV* p) { return _mm256_lddqu_si256(p); }
38 |
39 |     static INLINE void store_vec(TV* ptr, TV v) { _mm256_storeu_si256(ptr, v); }
40 |
41 |     static INLINE void store_compress_vec(TV*, TV, TCMPMASK) { throw std::runtime_error("operation is unsupported"); }
42 |
43 |     static INLINE TV load_partial_vec(TV *p, TV base, TLOADSTOREMASK mask) {
44 |         return _mm256_or_si256(_mm256_maskload_epi32((i32 *) p, mask),
45 |                                _mm256_andnot_si256(mask, base));
46 |     }
47 |
48 |     static INLINE void store_masked_vec(TV *p, TV v, TLOADSTOREMASK mask) { _mm256_maskstore_epi32((i32 *) p, mask, v); }
49 |
50 |     static INLINE TV partition_vector(TV v, i32 mask) {
51 |         assert(mask >= 0);
52 |         assert(mask <= 255);
53 |         return s2i(_mm256_permutevar8x32_ps(i2s(v), _mm256_cvtepi8_epi32(_mm_loadu_si128((__m128i*)(perm_table_32 + mask * 8)))));
54 |     }
55 |
56 |     static INLINE TV broadcast(T pivot) { return _mm256_set1_epi32(pivot); }
57 |
58 |     static INLINE TCMPMASK get_cmpgt_mask(TV a, TV b) {
59 |         __m256i top_bit = _mm256_set1_epi32(1U << 31);
60 |         return _mm256_movemask_ps(i2s(_mm256_cmpgt_epi32(_mm256_xor_si256(top_bit, a), _mm256_xor_si256(top_bit, b))));
61 |     }
62 |
63 |     static INLINE TV shift_right(TV v, i32 i) { return _mm256_srli_epi32(v, i); }
64 |     static INLINE TV shift_left(TV v, i32 i) { return _mm256_slli_epi32(v, i); }
65 |
66 |     static INLINE TV add(TV a, TV b) { return _mm256_add_epi32(a, b); }
67 |     static INLINE TV sub(TV a, TV b) { return _mm256_sub_epi32(a, b); }
68 |
69 |     static INLINE TV pack_unordered(TV a, TV b) { return _mm256_packus_epi32(a, b); }
70 |
71 |     static INLINE void unpack_ordered(TV p, TV& u1, TV& u2) {
72 |         u1 = _mm256_cvtepu16_epi32(_mm256_extracti128_si256(p, 0));
73 |         u2 = _mm256_cvtepu16_epi32(_mm256_extracti128_si256(p, 1));
74 |     }
75 |
76 |     template <i32 Shift>
77 |     static INLINE T shift_n_sub(T v, T sub) {
78 |         if (Shift > 0)
79 |             v >>= Shift;
80 |         v -= sub;
81 |         return v;
82 |     }
83 |
84 |     template <i32 Shift>
85 |     static INLINE T unshift_and_add(TPACK from, T add) {
86 |         add += from;
87 |         if (Shift > 0)
88 |             add = (T) (((TU) add) << Shift);
89 |         return add;
90 |     }
91 | };
92 |
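// --- Editorial sketch, not part of the repository sources ---
// What can_pack above evaluates, as plain scalar arithmetic: a u32
// partition span is packable into u16 lanes when it is strictly below
// (max(u16) + 1) << Shift, i.e. 65536 for Shift == 0.
#include <cassert>
#include <cstdint>
#include <limits>

template <int Shift>
static bool can_pack_u32(uint32_t span) {
    constexpr auto PACK_LIMIT =
        ((uint32_t)std::numeric_limits<uint16_t>::max() + 1) << Shift;
    return span < PACK_LIMIT;
}

int main() {
    assert(can_pack_u32<0>(65535));   // 65535 < 65536: fits
    assert(!can_pack_u32<0>(65536));  // one past the Shift == 0 limit
    assert(can_pack_u32<2>(262143));  // Shift == 2 widens the limit to 2^18
    return 0;
}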
--------------------------------------------------------------------------------
/vxsort/vector_machine/avx2/u64.h:
--------------------------------------------------------------------------------
1 | template <>
2 | class vxsort_machine_traits<u64, AVX2> {
3 |  public:
4 |     typedef u64 T;
5 |     typedef __m256i TV;
6 |     typedef __m256i TLOADSTOREMASK;
7 |     typedef u32 TCMPMASK;
8 |     typedef u32 TPACK;
9 |     typedef typename std::make_unsigned<TPACK>::type TUPACK;
10 |     typedef typename std::make_unsigned<T>::type TU;
11 |     static_assert(sizeof(TPACK)*2 == sizeof(T), "TPACK must be half-width of T");
12 |
13 |     static constexpr i32 N = sizeof(TV) / sizeof(T);
14 |     static_assert(is_powerof2(N), "vector-size / element-size must be a power of 2");
15 |
16 |     static constexpr bool supports_compress_writes() { return false; }
17 |     static constexpr bool supports_packing() { return true; }
18 |
19 |     template <i32 Shift>
20 |     static bool can_pack(T span) {
21 |         constexpr auto PACK_LIMIT = (((TU) std::numeric_limits<TPACK>::max() + 1)) << Shift;
22 |         return ((TU)span) < PACK_LIMIT;
23 |     }
24 |
25 |     static INLINE TLOADSTOREMASK generate_prefix_mask(i32 amount) {
26 |         assert(amount >= 0);
27 |         assert(amount <= N);
28 |         return _mm256_cvtepi8_epi64(_mm_loadu_si128((__m128i*)(prefix_mask_table_64b + N * amount)));
29 |     }
30 |
31 |     static INLINE TLOADSTOREMASK generate_suffix_mask(i32 amount) {
32 |         assert(amount >= 0);
33 |         assert(amount <= N);
34 |         return _mm256_cvtepi8_epi64(_mm_loadu_si128((__m128i*)(suffix_mask_table_64b + N * amount)));
35 |     }
36 |
37 |     static INLINE TV load_vec(TV* p) { return _mm256_lddqu_si256(p); }
38 |
39 |     static INLINE void store_vec(TV* ptr, TV v) { _mm256_storeu_si256(ptr, v); }
40 |
41 |     static void store_compress_vec(TV*, TV, TCMPMASK) { throw std::runtime_error("operation is unsupported"); }
42 |
43 |     static INLINE TV load_partial_vec(TV *p, TV base, TLOADSTOREMASK mask) {
44 |         return _mm256_or_si256(_mm256_maskload_epi64((const long long *) p, mask),
45 |                                _mm256_andnot_si256(mask, base));
46 |     }
47 |
48 |     static INLINE void store_masked_vec(TV *p, TV v, TLOADSTOREMASK mask) {
49 |         _mm256_maskstore_epi64((long long *) p, mask, v);
50 |     }
51 |
52 |     static INLINE TV partition_vector(TV v, i32 mask) {
53 |         assert(mask >= 0);
54 |         assert(mask <= 15);
55 |         return s2i(_mm256_permutevar8x32_ps(i2s(v), _mm256_cvtepu8_epi32(_mm_loadu_si128((__m128i*)(perm_table_64 + mask * 8)))));
56 |     }
57 |
58 |     static INLINE TV broadcast(T pivot) { return _mm256_set1_epi64x(pivot); }
59 |
60 |     static INLINE TCMPMASK get_cmpgt_mask(TV a, TV b) {
61 |         __m256i top_bit = _mm256_set1_epi64x(1LLU << 63);
62 |         return _mm256_movemask_pd(i2d(_mm256_cmpgt_epi64(_mm256_xor_si256(top_bit, a), _mm256_xor_si256(top_bit, b))));
63 |     }
64 |
65 |     static INLINE TV shift_right(TV v, i32 i) { return _mm256_srli_epi64(v, i); }
66 |     static INLINE TV shift_left(TV v, i32 i) { return _mm256_slli_epi64(v, i); }
67 |
68 |     static INLINE TV add(TV a, TV b) { return _mm256_add_epi64(a, b); }
69 |     static INLINE TV sub(TV a, TV b) { return _mm256_sub_epi64(a, b); }
70 |
71 |     static INLINE TV pack_unordered(TV a, TV b) {
72 |         b = _mm256_shuffle_epi32(b, _MM_PERM_CDAB);
73 |         return _mm256_blend_epi32(a, b, 0b10101010);
74 |     }
75 |
76 |     static INLINE void unpack_ordered(TV p, TV& u1, TV& u2) {
77 |         auto p01 = _mm256_extracti128_si256(p, 0);
78 |         auto p02 = _mm256_extracti128_si256(p, 1);
79 |
80 |         u1 = _mm256_cvtepu32_epi64(p01);
81 |         u2 = _mm256_cvtepu32_epi64(p02);
82 |     }
83 |
84 |     template <i32 Shift>
85 |     static T shift_n_sub(T v, T sub) {
86 |         if (Shift > 0)
87 |             v >>= Shift;
88 |         v -= sub;
89 |         return v;
90 |     }
91 |
92 |     template <i32 Shift>
93 |     static T unshift_and_add(TPACK from, T add) {
94 |         add += from;
95 |         if (Shift > 0)
96 |             add = (T) (((TU) add) << Shift);
97 |         return add;
98 |     }
99 | };
100 |
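// --- Editorial sketch, not part of the repository sources ---
// A scalar model of the effect of partition_vector above: the compare
// mask (bit i set means lane i is greater than the pivot) indexes a
// precomputed permutation that gathers the <= lanes to the front and
// the > lanes to the back in a single shuffle. The exact ordering
// inside each group comes from the generated perm_table_* entries;
// this model only illustrates the front/back split, for 4 u64 lanes:
#include <array>
#include <cassert>
#include <cstdint>

static std::array<uint64_t, 4> partition_model(std::array<uint64_t, 4> v, int mask) {
    std::array<uint64_t, 4> out{};
    int lo = 0, hi = 3;
    for (int i = 0; i < 4; i++) {
        if (mask & (1 << i))
            out[hi--] = v[i];  // lane i > pivot: goes to the back
        else
            out[lo++] = v[i];  // lane i <= pivot: goes to the front
    }
    return out;
}

int main() {
    // mask 0b0101: lanes 0 and 2 compared greater than the pivot
    auto r = partition_model({9, 1, 8, 2}, 0b0101);
    assert(r[0] == 1 && r[1] == 2);  // the two <= values lead
    assert(r[2] + r[3] == 9 + 8);    // the two > values trail
    return 0;
}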
--------------------------------------------------------------------------------
/vxsort/vector_machine/avx512/f32.h:
--------------------------------------------------------------------------------
1 | template <>
2 | class vxsort_machine_traits<f32, AVX512> {
3 |  public:
4 |     typedef f32 T;
5 |     typedef __m512 TV;
6 |     typedef __mmask16 TLOADSTOREMASK;
7 |     typedef __mmask16 TCMPMASK;
8 |     typedef f32 TPACK;
9 |
10 |     static constexpr i32 N = sizeof(TV) / sizeof(T);
11 |     static_assert(is_powerof2(N), "vector-size / element-size must be a power of 2");
12 |
13 |     static constexpr bool supports_compress_writes() { return true; }
14 |     static constexpr bool supports_packing() { return false; }
15 |
16 |     template <i32 Shift>
17 |     static bool can_pack(T) { return false; }
18 |
19 |     static INLINE TLOADSTOREMASK generate_prefix_mask(i32 amount) {
20 |         assert(amount >= 0);
21 |         assert(amount <= N);
22 |         return 0xFFFF >> ((N - amount) & (N-1));
23 |     }
24 |
25 |     static INLINE TLOADSTOREMASK generate_suffix_mask(i32 amount) {
26 |         assert(amount >= 0);
27 |         assert(amount <= N);
28 |         return 0xFFFF << (amount & (N-1));
29 |     }
30 |
31 |     static INLINE TV load_vec(TV* p) { return _mm512_loadu_ps(p); }
32 |
33 |     static INLINE void store_vec(TV* ptr, TV v) { _mm512_storeu_ps(ptr, v); }
34 |
35 |     static TV load_partial_vec(TV *ptr, TV base, TLOADSTOREMASK mask) {
36 |         return _mm512_mask_loadu_ps(base, mask, (T const *) ptr);
37 |     }
38 |
39 |     static INLINE void store_masked_vec(TV * p, TV v, TLOADSTOREMASK mask) {
40 |         _mm512_mask_storeu_ps(p, mask, v);
41 |     }
42 |
43 |     // Will never be called
44 |     static INLINE TV partition_vector(TV v, i32 mask) { return v; }
45 |
46 |     static void store_compress_vec(TV* ptr, TV v, TCMPMASK mask) { _mm512_mask_compressstoreu_ps(ptr, mask, v); }
47 |
48 |     static INLINE TV broadcast(T pivot) { return _mm512_set1_ps(pivot); }
49 |
50 |     static INLINE TCMPMASK get_cmpgt_mask(TV a, TV b) { return _mm512_cmp_ps_mask(a, b, _CMP_GT_OS); }
51 |
52 |     static INLINE TV shift_right(TV v, i32 i) { return v; }
53 |     static INLINE TV shift_left(TV v, i32 i) { return v; }
54 |
55 |     static INLINE TV add(TV a, TV b) { return _mm512_add_ps(a, b); }
56 |     static INLINE TV sub(TV a, TV b) { return _mm512_sub_ps(a, b); }
57 |
58 |     static INLINE TV pack_unordered(TV a, TV b) { return a; }
59 |     static INLINE void unpack_ordered(TV p, TV& u1, TV& u2) { }
60 |     template <i32 Shift>
61 |     static INLINE T shift_n_sub(T v, T sub) { return v; }
62 |
63 |     template <i32 Shift>
64 |     static INLINE T unshift_and_add(TPACK from, T add) { return add; }
65 | };
66 |
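// --- Editorial sketch, not part of the repository sources ---
// Why supports_compress_writes() is true on AVX-512: one compress-store
// per side partitions a whole vector branchlessly. A minimal,
// hypothetical single-vector kernel (each output buffer must have room
// for a full vector; compile for avx512f):
#include <immintrin.h>

void partition_one_vec_f32(const float* src, float pivot,
                           float* left_out, float* right_out) {
    __m512 v = _mm512_loadu_ps(src);
    __mmask16 gt = _mm512_cmp_ps_mask(v, _mm512_set1_ps(pivot), _CMP_GT_OS);
    // lanes <= pivot, written contiguously to the left side
    _mm512_mask_compressstoreu_ps(left_out, (__mmask16)~gt, v);
    // lanes > pivot, written contiguously to the right side
    _mm512_mask_compressstoreu_ps(right_out, gt, v);
}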
--------------------------------------------------------------------------------
/vxsort/vector_machine/avx512/f64.h:
--------------------------------------------------------------------------------
1 | template <>
2 | class vxsort_machine_traits<f64, AVX512> {
3 |  public:
4 |     typedef f64 T;
5 |     typedef __m512d TV;
6 |     typedef __mmask8 TLOADSTOREMASK;
7 |     typedef __mmask8 TCMPMASK;
8 |     typedef f64 TPACK;
9 |
10 |     static constexpr i32 N = sizeof(TV) / sizeof(T);
11 |     static_assert(is_powerof2(N), "vector-size / element-size must be a power of 2");
12 |
13 |     static constexpr bool supports_compress_writes() { return true; }
14 |     static constexpr bool supports_packing() { return false; }
15 |
16 |     template <i32 Shift>
17 |     static bool can_pack(T) { return false; }
18 |
19 |     static INLINE TLOADSTOREMASK generate_prefix_mask(i32 amount) {
20 |         assert(amount >= 0);
21 |         assert(amount <= N);
22 |         return 0xFF >> ((N - amount) & (N-1));
23 |     }
24 |
25 |     static INLINE TLOADSTOREMASK generate_suffix_mask(i32 amount) {
26 |         assert(amount >= 0);
27 |         assert(amount <= N);
28 |         return 0xFF << (amount & (N-1));
29 |     }
30 |
31 |     static INLINE TV load_vec(TV* p) { return _mm512_loadu_pd(p); }
32 |
33 |     static INLINE void store_vec(TV* ptr, TV v) { _mm512_storeu_pd(ptr, v); }
34 |
35 |     static INLINE TV load_partial_vec(TV *ptr, TV base, TLOADSTOREMASK mask) {
36 |         return _mm512_mask_loadu_pd(base, mask, (T const *) ptr);
37 |     }
38 |
39 |     static INLINE void store_masked_vec(TV * p, TV v, TLOADSTOREMASK mask) {
40 |         _mm512_mask_storeu_pd(p, mask, v);
41 |     }
42 |
43 |     // Will never be called
44 |     static INLINE TV partition_vector(TV v, i32 mask) { return v; }
45 |
46 |     static void store_compress_vec(TV* ptr, TV v, TCMPMASK mask) { _mm512_mask_compressstoreu_pd(ptr, mask, v); }
47 |     static INLINE TV broadcast(T pivot) { return _mm512_set1_pd(pivot); }
48 |     static INLINE TCMPMASK get_cmpgt_mask(TV a, TV b) { return _mm512_cmp_pd_mask(a, b, _CMP_GT_OS); }
49 |
50 |     static INLINE TV shift_right(TV v, i32 i) { return v; }
51 |     static INLINE TV shift_left(TV v, i32 i) { return v; }
52 |
53 |     static INLINE TV add(TV a, TV b) { return _mm512_add_pd(a, b); }
54 |     static INLINE TV sub(TV a, TV b) { return _mm512_sub_pd(a, b); }
55 |
56 |     static INLINE TV pack_unordered(TV a, TV b) { return a; }
57 |     static INLINE void unpack_ordered(TV p, TV& u1, TV& u2) { }
58 |     template <i32 Shift>
59 |     static T shift_n_sub(T v, T sub) { return v; }
60 |
61 |     template <i32 Shift>
62 |     static T unshift_and_add(TPACK from, T add) { return add; }
63 | };
64 |
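// --- Editorial sketch, not part of the repository sources ---
// The AVX-512 mask arithmetic above, evaluated for N == 8 (f64 lanes).
// prefix(amount) evidently enables the first `amount` lanes and
// suffix(amount) the lanes from `amount` upward; note that the & (N-1)
// wrap maps amount == 0 and amount == N to the same shift, so
// prefix(0) and suffix(0) both come out as the full mask.
#include <cassert>
#include <cstdint>

int main() {
    const int N = 8;
    auto prefix = [&](int amount) { return (uint8_t)(0xFF >> ((N - amount) & (N - 1))); };
    auto suffix = [&](int amount) { return (uint8_t)(0xFF << (amount & (N - 1))); };
    assert(prefix(3) == 0b00000111);  // first 3 lanes
    assert(prefix(8) == 0xFF);        // whole vector
    assert(suffix(3) == 0b11111000);  // lanes 3..7
    assert(suffix(0) == 0xFF);        // wraps to the full mask
    return 0;
}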
--------------------------------------------------------------------------------
/vxsort/vector_machine/avx512/i16.h:
--------------------------------------------------------------------------------
1 | template <>
2 | class vxsort_machine_traits<i16, AVX512> {
3 |  public:
4 |     typedef i16 T;
5 |     typedef __m512i TV;
6 |     typedef __mmask32 TLOADSTOREMASK;
7 |     typedef __mmask32 TCMPMASK;
8 |     typedef i16 TPACK;
9 |     typedef typename std::make_unsigned<T>::type TU;
10 |
11 |     static constexpr i32 N = sizeof(TV) / sizeof(T);
12 |     static_assert(is_powerof2(N), "vector-size / element-size must be a power of 2");
13 |
14 |     static constexpr bool supports_compress_writes() { return true; }
15 |     static constexpr bool supports_packing() { return false; }
16 |
17 |     template <i32 Shift>
18 |     static bool can_pack(T) { return false; }
19 |
20 |     static INLINE TLOADSTOREMASK generate_prefix_mask(i32 amount) {
21 |         assert(amount >= 0);
22 |         assert(amount <= N);
23 |         return 0xFFFFFFFF >> ((N - amount) & (N-1));
24 |     }
25 |
26 |     static INLINE TLOADSTOREMASK generate_suffix_mask(i32 amount) {
27 |         assert(amount >= 0);
28 |         assert(amount <= N);
29 |         return 0xFFFFFFFF << (amount & (N-1));
30 |     }
31 |
32 |     static INLINE TV load_vec(TV* p) { return _mm512_loadu_si512(p); }
33 |
34 |     static INLINE void store_vec(TV* ptr, TV v) { _mm512_storeu_si512(ptr, v); }
35 |
36 |     static INLINE TV load_partial_vec(TV *ptr, TV base, TLOADSTOREMASK mask) {
37 |         return _mm512_mask_loadu_epi16(base, mask, (T const *) ptr);
38 |     }
39 |
40 |     static INLINE void store_masked_vec(TV * p, TV v, TLOADSTOREMASK mask) {
41 |         _mm512_mask_storeu_epi16(p, mask, v);
42 |     }
43 |
44 |     // Will never be called
45 |     static INLINE TV partition_vector(TV v, i32 mask) { return v; }
46 |
47 |     static INLINE void store_compress_vec(TV* ptr, TV v, TCMPMASK mask) { _mm512_mask_compressstoreu_epi16(ptr, mask, v); }
48 |
49 |     static INLINE TV broadcast(T pivot) { return _mm512_set1_epi16(pivot); }
50 |
51 |     static INLINE TCMPMASK get_cmpgt_mask(TV a, TV b) { return _mm512_cmp_epi16_mask(a, b, _MM_CMPINT_GT); }
52 |
53 |     static INLINE TV shift_right(TV v, i32 i) { return _mm512_srli_epi16(v, i); }
54 |     static INLINE TV shift_left(TV v, i32 i) { return _mm512_slli_epi16(v, i); }
55 |
56 |     static INLINE TV add(TV a, TV b) { return _mm512_add_epi16(a, b); }
57 |     static INLINE TV sub(TV a, TV b) { return _mm512_sub_epi16(a, b); }
58 |
59 |     static INLINE TV pack_unordered(TV a, TV b) { return a; }
60 |     static INLINE void unpack_ordered(TV p, TV& u1, TV& u2) { }
61 |
62 |     template <i32 Shift>
63 |     static INLINE T shift_n_sub(T v, T sub) {
64 |         if (Shift > 0)
65 |             v >>= Shift;
66 |         v -= sub;
67 |         return v;
68 |     }
69 |
70 |     template <i32 Shift>
71 |     static INLINE T unshift_and_add(TPACK from, T add) {
72 |         add += from;
73 |         if (Shift > 0)
74 |             add = (T) (((TU) add) << Shift);
75 |         return add;
76 |     }
77 | };
78 |
--------------------------------------------------------------------------------
/vxsort/vector_machine/avx512/i32.h:
--------------------------------------------------------------------------------
1 | template <>
2 | class vxsort_machine_traits<i32, AVX512> {
3 |  public:
4 |     typedef i32 T;
5 |     typedef __m512i TV;
6 |     typedef __mmask16 TLOADSTOREMASK;
7 |     typedef __mmask16 TCMPMASK;
8 |     typedef i16 TPACK;
9 |     typedef typename std::make_unsigned<TPACK>::type TUPACK;
10 |     typedef typename std::make_unsigned<T>::type TU;
11 |     static_assert(sizeof(TPACK)*2 == sizeof(T), "TPACK must be half-width of T");
12 |
13 |     static constexpr i32 N = sizeof(TV) / sizeof(T);
14 |     static_assert(is_powerof2(N), "vector-size / element-size must be a power of 2");
15 |
16 |     static constexpr bool supports_compress_writes() { return true; }
17 |     static constexpr bool supports_packing() { return true; }
18 |
19 |     template <i32 Shift>
20 |     static bool can_pack(T span) {
21 |         if (!supports_vector_machine<AVX512>(sizeof(TPACK))) {
22 |             return false;
23 |         }
24 |         constexpr auto PACK_LIMIT = (((TU)std::numeric_limits<TPACK>::max() + 1)) << Shift;
25 |         return ((TU)span) < PACK_LIMIT;
26 |     }
27 |
28 |     static INLINE TLOADSTOREMASK generate_prefix_mask(i32 amount) {
29 |         assert(amount >= 0);
30 |         assert(amount <= N);
31 |         return 0xFFFF >> ((N - amount) & (N-1));
32 |     }
33 |
34 |     static INLINE TLOADSTOREMASK generate_suffix_mask(i32 amount) {
35 |         assert(amount >= 0);
36 |         assert(amount <= N);
37 |         return 0xFFFF << (amount & (N-1));
38 |     }
39 |
40 |     static INLINE TV load_vec(TV* p) { return _mm512_loadu_si512(p); }
41 |
42 |     static INLINE void store_vec(TV* ptr, TV v) { _mm512_storeu_si512(ptr, v); }
43 |
44 |     static INLINE TV load_partial_vec(TV *p, TV base, TLOADSTOREMASK mask) {
45 |         return _mm512_mask_loadu_epi32(base, mask, (i32 const *) p);
46 |     }
47 |
48 |     static INLINE void store_masked_vec(TV *p, TV v, TLOADSTOREMASK mask) {
49 |         _mm512_mask_storeu_epi32(p, mask, v);
50 |     }
51 |
52 |     // Will never be called
53 |     static INLINE TV partition_vector(TV v, i32 mask) { return v; }
54 |
55 |     static INLINE void store_compress_vec(TV* ptr, TV v, TCMPMASK mask) { _mm512_mask_compressstoreu_epi32(ptr, mask, v); }
56 |
57 |     static INLINE TV broadcast(T pivot) { return _mm512_set1_epi32(pivot); }
58 |
59 |     static INLINE TCMPMASK get_cmpgt_mask(TV a, TV b) { return _mm512_cmp_epi32_mask(a, b, _MM_CMPINT_GT); }
60 |
61 |     static INLINE TV shift_right(TV v, i32 i) { return _mm512_srli_epi32(v, i); }
62 |     static INLINE TV shift_left(TV v, i32 i) { return _mm512_slli_epi32(v, i); }
63 |
64 |     static INLINE TV add(TV a, TV b) { return _mm512_add_epi32(a, b); }
65 |     static INLINE TV sub(TV a, TV b) { return _mm512_sub_epi32(a, b); }
66 |
67 |     static INLINE TV pack_unordered(TV a, TV b) { return _mm512_packs_epi32(a, b); }
68 |     static INLINE void unpack_ordered(TV p, TV& u1, TV& u2) {
69 |         u1 = _mm512_cvtepi16_epi32(_mm512_extracti32x8_epi32(p, 0));
70 |         u2 = _mm512_cvtepi16_epi32(_mm512_extracti32x8_epi32(p, 1));
71 |     }
72 |
73 |     template <i32 Shift>
74 |     static INLINE T shift_n_sub(T v, T sub) {
75 |         if (Shift > 0)
76 |             v >>= Shift;
77 |         v -= sub;
78 |         return v;
79 |     }
80 |
81 |     template <i32 Shift>
82 |     static INLINE T unshift_and_add(TPACK from, T add) {
83 |         add += from;
84 |         if (Shift > 0)
85 |             add = (T) (((TU) add) << Shift);
86 |         return add;
87 |     }
88 | };
89 |
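// --- Editorial sketch, not part of the repository sources ---
// Why the helper above is called pack_unordered: the packs family packs
// per 128-bit lane, so the halves of a and b come out interleaved, not
// concatenated. The 512-bit version behaves the same way across its
// four lanes; here is the 2-lane AVX2 demonstration (compile with -mavx2):
#include <immintrin.h>
#include <cassert>
#include <cstdint>

int main() {
    __m256i a = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
    __m256i b = _mm256_setr_epi32(8, 9, 10, 11, 12, 13, 14, 15);
    alignas(32) int16_t out[16];
    _mm256_store_si256((__m256i*)out, _mm256_packs_epi32(a, b));
    // low lane: a0..a3, b0..b3; high lane: a4..a7, b4..b7
    const int16_t expected[16] = {0, 1, 2, 3, 8,  9,  10, 11,
                                  4, 5, 6, 7, 12, 13, 14, 15};
    for (int i = 0; i < 16; i++)
        assert(out[i] == expected[i]);
    return 0;
}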
--------------------------------------------------------------------------------
/vxsort/vector_machine/avx512/i64.h:
--------------------------------------------------------------------------------
1 | template <>
2 | class vxsort_machine_traits<i64, AVX512> {
3 |  public:
4 |     typedef i64 T;
5 |     typedef __m512i TV;
6 |     typedef __mmask8 TLOADSTOREMASK;
7 |     typedef __mmask8 TCMPMASK;
8 |     typedef i32 TPACK;
9 |     typedef typename std::make_unsigned<TPACK>::type TUPACK;
10 |     typedef typename std::make_unsigned<T>::type TU;
11 |     static_assert(sizeof(TPACK)*2 == sizeof(T), "TPACK must be half-width of T");
12 |
13 |     static constexpr i32 N = sizeof(TV) / sizeof(T);
14 |     static_assert(is_powerof2(N), "vector-size / element-size must be a power of 2");
15 |
16 |     static constexpr bool supports_compress_writes() { return true; }
17 |     static constexpr bool supports_packing() { return true; }
18 |
19 |     template <i32 Shift>
20 |     static bool can_pack(T span) {
21 |         constexpr auto PACK_LIMIT = (((TU) std::numeric_limits<TPACK>::max() + 1)) << Shift;
22 |         return ((TU) span) < PACK_LIMIT;
23 |     }
24 |
25 |     static INLINE TLOADSTOREMASK generate_prefix_mask(i32 amount) {
26 |         assert(amount >= 0);
27 |         assert(amount <= N);
28 |         return 0xFF >> ((N - amount) & (N-1));
29 |     }
30 |
31 |     static INLINE TLOADSTOREMASK generate_suffix_mask(i32 amount) {
32 |         assert(amount >= 0);
33 |         assert(amount <= N);
34 |         return 0xFF << (amount & (N-1));
35 |     }
36 |
37 |     static INLINE TV load_vec(TV* p) { return _mm512_loadu_si512(p); }
38 |
39 |     static INLINE void store_vec(TV* ptr, TV v) { _mm512_storeu_si512(ptr, v); }
40 |
41 |     static INLINE TV load_partial_vec(TV *p, TV base, TLOADSTOREMASK mask) {
42 |         return _mm512_mask_loadu_epi64(base, mask, (i64 const *) p);
43 |     }
44 |
45 |     static INLINE void store_masked_vec(TV * p, TV v, TLOADSTOREMASK mask) {
46 |         _mm512_mask_storeu_epi64(p, mask, v);
47 |     }
48 |
49 |     // Will never be called
50 |     static INLINE TV partition_vector(TV v, i32 mask) { return v; }
51 |
52 |     static INLINE void store_compress_vec(TV* ptr, TV v, TCMPMASK mask) { _mm512_mask_compressstoreu_epi64(ptr, mask, v); }
53 |
54 |     static INLINE TV broadcast(T pivot) { return _mm512_set1_epi64(pivot); }
55 |
56 |     static INLINE TCMPMASK get_cmpgt_mask(TV a, TV b) { return _mm512_cmp_epi64_mask(a, b, _MM_CMPINT_GT); }
57 |
58 |     static INLINE TV shift_right(TV v, i32 i) { return _mm512_srli_epi64(v, i); }
59 |     static INLINE TV shift_left(TV v, i32 i) { return _mm512_slli_epi64(v, i); }
60 |
61 |     static INLINE TV add(TV a, TV b) { return _mm512_add_epi64(a, b); }
62 |     static INLINE TV sub(TV a, TV b) { return _mm512_sub_epi64(a, b); }
63 |
64 |     static INLINE TV pack_unordered(TV a, TV b) { return _mm512_mask_shuffle_epi32(a, 0b1010101010101010, b, _MM_PERM_CDAB); }
65 |
66 |     static INLINE void unpack_ordered(TV p, TV& u1, TV& u2) {
67 |         auto p01 = _mm512_extracti32x8_epi32(p, 0);
68 |         auto p02 = _mm512_extracti32x8_epi32(p, 1);
69 |
70 |         u1 = _mm512_cvtepi32_epi64(p01);
71 |         u2 = _mm512_cvtepi32_epi64(p02);
72 |     }
73 |
74 |     template <i32 Shift>
75 |     static INLINE T shift_n_sub(T v, T sub) {
76 |         if (Shift > 0)
77 |             v >>= Shift;
78 |         v -= sub;
79 |         return v;
80 |     }
81 |
82 |     template <i32 Shift>
83 |     static INLINE T unshift_and_add(TPACK from, T add) {
84 |         add += from;
85 |
86 |         if (Shift > 0)
87 |             add = (T) (((TU) add) << Shift);
88 |
89 |         return add;
90 |     }
91 |
92 | };
93 |
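// --- Editorial sketch, not part of the repository sources ---
// The 64->32 pack above: after shift_n_sub, each qword lane holds a
// value that fits in its low dword. Swapping adjacent dwords of b
// (_MM_PERM_CDAB, i.e. selector 2,3,0,1) moves b's payload dwords into
// the odd slots, and the alternating mask/blend merges them with a's
// even slots. AVX2 equivalent of the same dance (compile with -mavx2):
#include <immintrin.h>
#include <cassert>
#include <cstdint>

int main() {
    __m256i a  = _mm256_setr_epi64x(10, 11, 12, 13);  // packed values in low dwords
    __m256i b  = _mm256_setr_epi64x(20, 21, 22, 23);
    __m256i bs = _mm256_shuffle_epi32(b, _MM_SHUFFLE(2, 3, 0, 1));  // CDAB swap
    __m256i packed = _mm256_blend_epi32(a, bs, 0b10101010);
    alignas(32) int32_t out[8];
    _mm256_store_si256((__m256i*)out, packed);
    const int32_t expected[8] = {10, 20, 11, 21, 12, 22, 13, 23};
    for (int i = 0; i < 8; i++)
        assert(out[i] == expected[i]);
    return 0;
}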
--------------------------------------------------------------------------------
/vxsort/vector_machine/avx512/u16.h:
--------------------------------------------------------------------------------
1 | template <>
2 | class vxsort_machine_traits<u16, AVX512> {
3 |  public:
4 |     typedef u16 T;
5 |     typedef __m512i TV;
6 |     typedef __mmask32 TLOADSTOREMASK;
7 |     typedef __mmask32 TCMPMASK;
8 |     typedef u16 TPACK;
9 |     typedef typename std::make_unsigned<T>::type TU;
10 |
11 |     static constexpr i32 N = sizeof(TV) / sizeof(T);
12 |     static_assert(is_powerof2(N), "vector-size / element-size must be a power of 2");
13 |
14 |     static constexpr bool supports_compress_writes() { return true; }
15 |     static constexpr bool supports_packing() { return false; }
16 |
17 |     template <i32 Shift>
18 |     static bool can_pack(T) { return false; }
19 |
20 |     static INLINE TLOADSTOREMASK generate_prefix_mask(i32 amount) {
21 |         assert(amount >= 0);
22 |         assert(amount <= N);
23 |         return 0xFFFFFFFF >> ((N - amount) & (N-1));
24 |     }
25 |
26 |     static INLINE TLOADSTOREMASK generate_suffix_mask(i32 amount) {
27 |         assert(amount >= 0);
28 |         assert(amount <= N);
29 |         return 0xFFFFFFFF << (amount & (N-1));
30 |     }
31 |
32 |     static INLINE TV load_vec(TV* p) { return _mm512_loadu_si512(p); }
33 |
34 |     static INLINE void store_vec(TV* ptr, TV v) { _mm512_storeu_si512(ptr, v); }
35 |
36 |     static INLINE TV load_partial_vec(TV *ptr, TV base, TLOADSTOREMASK mask) {
37 |         return _mm512_mask_loadu_epi16(base, mask, (T const *) ptr);
38 |     }
39 |
40 |     static INLINE void store_masked_vec(TV * p, TV v, TLOADSTOREMASK mask) {
41 |         _mm512_mask_storeu_epi16(p, mask, v);
42 |     }
43 |
44 |     // Will never be called
45 |     static INLINE TV partition_vector(TV v, i32 mask) { return v; }
46 |
47 |     static INLINE void store_compress_vec(TV* ptr, TV v, TCMPMASK mask) { _mm512_mask_compressstoreu_epi16(ptr, mask, v); }
48 |
49 |     static INLINE TV broadcast(T pivot) { return _mm512_set1_epi16(pivot); }
50 |
51 |     static INLINE TCMPMASK get_cmpgt_mask(TV a, TV b) { return _mm512_cmp_epu16_mask(a, b, _MM_CMPINT_GT); }
52 |
53 |     static INLINE TV shift_right(TV v, i32 i) { return _mm512_srli_epi16(v, i); }
54 |     static INLINE TV shift_left(TV v, i32 i) { return _mm512_slli_epi16(v, i); }
55 |
56 |     static INLINE TV add(TV a, TV b) { return _mm512_add_epi16(a, b); }
57 |     static INLINE TV sub(TV a, TV b) { return _mm512_sub_epi16(a, b); }
58 |
59 |     static INLINE TV pack_unordered(TV a, TV b) { return a; }
60 |     static INLINE void unpack_ordered(TV p, TV& u1, TV& u2) { }
61 |
62 |     template <i32 Shift>
63 |     static INLINE T shift_n_sub(T v, T sub) {
64 |         if (Shift > 0)
65 |             v >>= Shift;
66 |         v -= sub;
67 |         return v;
68 |     }
69 |
70 |     template <i32 Shift>
71 |     static INLINE T unshift_and_add(TPACK from, T add) {
72 |         add += from;
73 |         if (Shift > 0)
74 |             add = (T) (((TU) add) << Shift);
75 |         return add;
76 |     }
77 | };
78 |
--------------------------------------------------------------------------------
/vxsort/vector_machine/avx512/u32.h:
--------------------------------------------------------------------------------
1 | template <>
2 | class vxsort_machine_traits<u32, AVX512> {
3 |  public:
4 |     typedef u32 T;
5 |     typedef __m512i TV;
6 |     typedef __mmask16 TLOADSTOREMASK;
7 |     typedef __mmask16 TCMPMASK;
8 |     typedef u16 TPACK;
9 |     typedef typename std::make_unsigned<TPACK>::type TUPACK;
10 |     typedef typename std::make_unsigned<T>::type TU;
11 |     static_assert(sizeof(TPACK)*2 == sizeof(T), "TPACK must be half-width of T");
12 |
13 |     static constexpr i32 N = sizeof(TV) / sizeof(T);
14 |     static_assert(is_powerof2(N), "vector-size / element-size must be a power of 2");
15 |
16 |     static constexpr bool supports_compress_writes() { return true; }
17 |     static constexpr bool supports_packing() { return true; }
18 |
19 |     template <i32 Shift>
20 |     static bool can_pack(T span) {
21 |         if (!supports_vector_machine<AVX512>(sizeof(TPACK))) {
22 |             return false;
23 |         }
24 |         constexpr auto PACK_LIMIT = (((TU)std::numeric_limits<TPACK>::max() + 1)) << Shift;
25 |         return ((TU)span) < PACK_LIMIT;
26 |     }
27 |
28 |     static INLINE TLOADSTOREMASK generate_prefix_mask(i32 amount) {
29 |         assert(amount >= 0);
30 |         assert(amount <= N);
31 |         return 0xFFFF >> ((N - amount) & (N-1));
32 |     }
33 |
34 |     static INLINE TLOADSTOREMASK generate_suffix_mask(i32 amount) {
35 |         assert(amount >= 0);
36 |         assert(amount <= N);
37 |         return 0xFFFF << (amount & (N-1));
38 |     }
39 |
40 |     static INLINE TV load_vec(TV* p) { return _mm512_loadu_si512(p); }
41 |
42 |     static INLINE void store_vec(TV* ptr, TV v) { _mm512_storeu_si512(ptr, v); }
43 |
44 |     static INLINE TV load_partial_vec(TV *p, TV base, TLOADSTOREMASK mask) {
45 |         return _mm512_mask_loadu_epi32(base, mask, (i32 const *) p);
46 |     }
47 |
48 |     static INLINE void store_masked_vec(TV *p, TV v, TLOADSTOREMASK mask) {
49 |         _mm512_mask_storeu_epi32(p, mask, v);
50 |     }
51 |
52 |     // Will never be called
53 |     static INLINE TV partition_vector(TV v, i32 mask) { return v; }
54 |
55 |     static INLINE void store_compress_vec(TV* ptr, TV v, TCMPMASK mask) { _mm512_mask_compressstoreu_epi32(ptr, mask, v); }
56 |
57 |     static INLINE TV broadcast(T pivot) { return _mm512_set1_epi32(pivot); }
58 |
59 |     static INLINE TCMPMASK get_cmpgt_mask(TV a, TV b) { return _mm512_cmp_epu32_mask(a, b, _MM_CMPINT_GT); }
60 |
61 |     static INLINE TV shift_right(TV v, i32 i) { return _mm512_srli_epi32(v, i); }
62 |     static INLINE TV shift_left(TV v, i32 i) { return _mm512_slli_epi32(v, i); }
63 |
64 |     static INLINE TV add(TV a, TV b) { return _mm512_add_epi32(a, b); }
65 |     static INLINE TV sub(TV a, TV b) { return _mm512_sub_epi32(a, b); }
66 |
67 |     static INLINE TV pack_unordered(TV a, TV b) { return _mm512_packus_epi32(a, b); }
68 |     static INLINE void unpack_ordered(TV p, TV& u1, TV& u2) {
69 |         u1 = _mm512_cvtepu16_epi32(_mm512_extracti32x8_epi32(p, 0));
70 |         u2 = _mm512_cvtepu16_epi32(_mm512_extracti32x8_epi32(p, 1));
71 |     }
72 |
73 |     template <i32 Shift>
74 |     static INLINE T shift_n_sub(T v, T sub) {
75 |         if (Shift > 0)
76 |             v >>= Shift;
77 |         v -= sub;
78 |         return v;
79 |     }
80 |
81 |     template <i32 Shift>
82 |     static INLINE T unshift_and_add(TPACK from, T add) {
83 |         add += from;
84 |         if (Shift > 0)
85 |             add = (T) (((TU) add) << Shift);
86 |         return add;
87 |     }
88 | };
89 |
--------------------------------------------------------------------------------
/vxsort/vector_machine/avx512/u64.h:
--------------------------------------------------------------------------------
1 | template <>
2 | class vxsort_machine_traits<u64, AVX512> {
3 |  public:
4 |     typedef u64 T;
5 |     typedef __m512i TV;
6 |     typedef __mmask8 TLOADSTOREMASK;
7 |     typedef __mmask8 TCMPMASK;
8 |     typedef u32 TPACK;
9 |     typedef typename std::make_unsigned<TPACK>::type TUPACK;
10 |     typedef typename std::make_unsigned<T>::type TU;
11 |     static_assert(sizeof(TPACK)*2 == sizeof(T), "TPACK must be half-width of T");
12 |
13 |     static constexpr i32 N = sizeof(TV) / sizeof(T);
14 |     static_assert(is_powerof2(N), "vector-size / element-size must be a power of 2");
15 |
16 |     static constexpr bool supports_compress_writes() { return true; }
17 |     static constexpr bool supports_packing() { return true; }
18 |
19 |     template <i32 Shift>
20 |     static bool can_pack(T span) {
21 |         constexpr auto PACK_LIMIT = (((TU) std::numeric_limits<TPACK>::max() + 1)) << Shift;
22 |         return ((TU) span) < PACK_LIMIT;
23 |     }
24 |
25 |     static INLINE TLOADSTOREMASK generate_prefix_mask(i32 amount) {
26 |         assert(amount >= 0);
27 |         assert(amount <= N);
28 |         return 0xFF >> ((N - amount) & (N-1));
29 |     }
30 |
31 |     static INLINE TLOADSTOREMASK generate_suffix_mask(i32 amount) {
32 |         assert(amount >= 0);
33 |         assert(amount <= N);
34 |         return 0xFF << (amount & (N-1));
35 |     }
36 |
37 |     static INLINE TV load_vec(TV* p) { return _mm512_loadu_si512(p); }
38 |
39 |     static INLINE void store_vec(TV* ptr, TV v) { _mm512_storeu_si512(ptr, v); }
40 |
41 |     static INLINE TV load_partial_vec(TV *p, TV base, TLOADSTOREMASK mask) {
42 |         return _mm512_mask_loadu_epi64(base, mask, (i64 const *) p);
43 |     }
44 |
45 |     static INLINE void store_masked_vec(TV * p, TV v, TLOADSTOREMASK mask) {
46 |         _mm512_mask_storeu_epi64(p, mask, v);
47 |     }
48 |
49 |     // Will never be called
50 |     static INLINE TV partition_vector(TV v, i32 mask) { return v; }
51 |
52 |     static INLINE void store_compress_vec(TV* ptr, TV v, TCMPMASK mask) { _mm512_mask_compressstoreu_epi64(ptr, mask, v); }
53 |
54 |     static INLINE TV broadcast(T pivot) { return _mm512_set1_epi64(pivot); }
55 |
56 |     static INLINE TCMPMASK get_cmpgt_mask(TV a, TV b) { return _mm512_cmp_epu64_mask(a, b, _MM_CMPINT_GT); }
57 |
58 |     static INLINE TV shift_right(TV v, i32 i) { return _mm512_srli_epi64(v, i); }
59 |     static INLINE TV shift_left(TV v, i32 i) { return _mm512_slli_epi64(v, i); }
60 |
61 |     static INLINE TV add(TV a, TV b) { return _mm512_add_epi64(a, b); }
62 |     static INLINE TV sub(TV a, TV b) { return _mm512_sub_epi64(a, b); }
63 |
64 |     static INLINE TV pack_unordered(TV a, TV b) { return _mm512_mask_shuffle_epi32(a, 0b1010101010101010, b, _MM_PERM_CDAB); }
65 |
66 |     static INLINE void unpack_ordered(TV p, TV& u1, TV& u2) {
67 |         auto p01 = _mm512_extracti32x8_epi32(p, 0);
68 |         auto p02 = _mm512_extracti32x8_epi32(p, 1);
69 |
70 |         u1 = _mm512_cvtepu32_epi64(p01);
71 |         u2 = _mm512_cvtepu32_epi64(p02);
72 |     }
73 |
74 |     template <i32 Shift>
75 |     static INLINE T shift_n_sub(T v, T sub) {
76 |         if (Shift > 0)
77 |             v >>= Shift;
78 |         v -= sub;
79 |         return v;
80 |     }
81 |
82 |     template <i32 Shift>
83 |     static INLINE T unshift_and_add(TPACK from, T add) {
84 |         add += from;
85 |
86 |         if (Shift > 0)
87 |             add = (T) (((TU) add) << Shift);
88 |
89 |         return add;
90 |     }
91 |
92 | };
93 |
--------------------------------------------------------------------------------
/vxsort/vector_machine/machine_traits.avx2.h:
--------------------------------------------------------------------------------
1 | #ifndef VXSORT_MACHINE_TRAITS_AVX2_H
2 | #define VXSORT_MACHINE_TRAITS_AVX2_H
3 |
4 | #include "vxsort_targets_enable_avx2.h"
5 |
6 | #include <immintrin.h>
7 | #include <stdexcept>
8 | #include <cassert>
9 | #include <cstdlib>
10 | #include <cstring>
11 | #include <array>
12 | #include <algorithm>
13 | #include <limits>
14 | #include <type_traits>
15 |
16 | #include "defs.h"
17 | #include "machine_traits.h"
18 |
19 | #define i2d _mm256_castsi256_pd
20 | #define d2i _mm256_castpd_si256
21 | #define i2s _mm256_castsi256_ps
22 | #define s2i _mm256_castps_si256
23 | #define s2d _mm256_castps_pd
24 | #define d2s _mm256_castpd_ps
25 |
26 | namespace vxsort {
27 | using namespace vxsort::types;
28 |
29 | // * We might read the last 4 bytes into a 128-bit vector for 64-bit element masking
30 | // * We might read the last 8 bytes into a 128-bit vector for 32-bit element masking
31 | // This mostly applies to debug mode, since without optimizations, most compilers
32 | // actually execute the instruction stream _mm256_cvtepi8_epiNN + _mm_loadu_si128 as they are given.
33 | // In contrast, release/optimizing compilers turn that very specific intrinsic pair into
34 | // a more reasonable vpmovsxbq ymm0, dword [rax*4 + prefix_mask_table_64b], eliminating the 128-bit
35 | // load completely and effectively reading exactly 4/8 bytes (depending on whether the instruction is
36 | // vpmovsxb[q,d]) without generating an out-of-bounds read at all.
37 | // But, life is harsh, and we can't trust the compiler to do the right thing if it is not
38 | // contractual, hence this flustercuck
39 | const i32 M4_SIZE = 16 + 4 + 12;
40 | const i32 M8_SIZE = 64 + 8 + 8;
41 |
42 | extern const u8 prefix_mask_table_64b[M4_SIZE];
43 | extern const u8 prefix_mask_table_32b[M8_SIZE];
44 |
45 | extern const u8 suffix_mask_table_64b[M4_SIZE];
46 | extern const u8 suffix_mask_table_32b[M8_SIZE];
47 |
48 | extern const i8 perm_table_64[128];
49 | extern const i8 perm_table_32[2048];
50 |
51 | #include "avx2/f64.h"
52 | #include "avx2/f32.h"
53 | #include "avx2/i16.h"
54 | #include "avx2/i32.h"
55 | #include "avx2/i64.h"
56 | #include "avx2/u16.h"
57 | #include "avx2/u32.h"
58 | #include "avx2/u64.h"
59 | }
60 |
61 | #undef i2d
62 | #undef d2i
63 | #undef i2s
64 | #undef s2i
65 | #undef s2d
66 | #undef d2s
67 |
68 | #include "vxsort_targets_disable.h"
69 | #endif // VXSORT_MACHINE_TRAITS_AVX2_H
70 |
--------------------------------------------------------------------------------
/vxsort/vector_machine/machine_traits.avx512.h:
--------------------------------------------------------------------------------
1 |
2 | #ifndef VXSORT_MACHINE_TRAITS_AVX512_H
3 | #define VXSORT_MACHINE_TRAITS_AVX512_H
4 |
5 | #include "vxsort_targets_enable_avx512.h"
6 |
7 | #include <immintrin.h>
8 | #include <cassert>
9 | #include <limits>
10 | #include <type_traits>
11 | #include "defs.h"
12 | #include "isa_detection.h"
13 | #include "machine_traits.h"
14 |
15 | namespace vxsort {
16 | using namespace vxsort::types;
17 |
18 | #include "avx512/f64.h"
19 | #include "avx512/f32.h"
20 | #include "avx512/i16.h"
21 | #include "avx512/i32.h"
22 | #include "avx512/i64.h"
23 | #include "avx512/u16.h"
24 | #include "avx512/u32.h"
25 | #include "avx512/u64.h"
26 | }
27 |
28 | #include "vxsort_targets_disable.h"
29 | #endif // VXSORT_MACHINE_TRAITS_AVX512_H
30 |
--------------------------------------------------------------------------------
/vxsort/vxsort.avx2.h:
--------------------------------------------------------------------------------
1 | #ifndef VXSORT_VXSORT_AVX2_H
2 | #define VXSORT_VXSORT_AVX2_H
3 |
4 | #include "vector_machine/machine_traits.avx2.h"
5 | #include "smallsort/avx2/bitonic_machine.avx2.h"
6 | #include "partition_machine.avx2.h"
7 |
8 |
9 | #include "vxsort.h"
10 |
11 | #endif //VXSORT_VXSORT_AVX2_H
12 |
--------------------------------------------------------------------------------
/vxsort/vxsort.avx512.h:
--------------------------------------------------------------------------------
1 | #ifndef VXSORT_VXSORT_AVX512_H
2 | #define VXSORT_VXSORT_AVX512_H
3 |
4 | #include "vector_machine/machine_traits.avx512.h"
5 | #include "smallsort/avx512/bitonic_machine.avx512.h"
6 | #include "partition_machine.avx512.h"
7 |
8 | #include "vxsort.h"
9 |
10 | #endif //VXSORT_VXSORT_AVX512_H
11 |
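// --- Editorial sketch, not part of the repository sources ---
// How generic code consumes these traits: one template picks up the
// right vector width, mask type and intrinsics from the specialization
// selected by (T, vector_machine). A minimal, hypothetical kernel that
// counts lanes above a pivot in one vector; it assumes the
// vector_machine enumerators (AVX2/AVX512) live directly in namespace
// vxsort, as the specializations above suggest. Compile with the
// matching -m flags or inside a targets_enable/disable bracket.
#include "vxsort.avx2.h"

template <typename T, vxsort::vector_machine M>
int count_gt_pivot_one_vec(T* p, T pivot) {
    using MT = vxsort::vxsort_machine_traits<T, M>;
    auto v = MT::load_vec((typename MT::TV*)p);
    auto mask = MT::get_cmpgt_mask(v, MT::broadcast(pivot));
    return __builtin_popcount(mask);  // one mask bit per lane
}
// e.g. count_gt_pivot_one_vec<vxsort::types::i32, vxsort::AVX2>(data, 42);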
--------------------------------------------------------------------------------
/vxsort/vxsort_targets_disable.h:
--------------------------------------------------------------------------------
1 | #include "compiler.h"
2 |
3 | #ifdef VXSORT_TARGET_PUSHED
4 |
5 | #if defined(VXSORT_COMPILER_CLANG) || defined(VXSORT_COMPILER_CLANGCL)
6 | #pragma clang attribute pop
7 | #endif
8 |
9 | #if defined(VXSORT_COMPILER_GCC)
10 | #pragma GCC pop_options
11 | #endif
12 |
13 |
14 | #endif
15 |
--------------------------------------------------------------------------------
/vxsort/vxsort_targets_enable_avx2.h:
--------------------------------------------------------------------------------
1 | #include "compiler.h"
2 |
3 | #if defined(VXSORT_COMPILER_CLANG) || defined(VXSORT_COMPILER_CLANGCL)
4 | #define VXSORT_TARGET_PUSHED 1
5 | #pragma clang attribute push (__attribute__((target("avx2,popcnt,bmi2"))), apply_to = any(function))
6 | #endif
7 |
8 | #if defined(VXSORT_COMPILER_GCC)
9 | #define VXSORT_TARGET_PUSHED 1
10 | #pragma GCC push_options
11 | #pragma GCC target("avx2,popcnt,bmi2")
12 | #endif
13 |
--------------------------------------------------------------------------------
/vxsort/vxsort_targets_enable_avx512.h:
--------------------------------------------------------------------------------
1 | #include "compiler.h"
2 |
3 | #if defined(VXSORT_COMPILER_CLANG) || defined(VXSORT_COMPILER_CLANGCL)
4 | #define VXSORT_TARGET_PUSHED 1
5 | #pragma clang attribute push (__attribute__((target("avx512f,avx512dq,avx512bw,avx512vbmi2,popcnt"))), apply_to = any(function))
6 | #endif
7 |
8 | #if defined(VXSORT_COMPILER_GCC)
9 | #define VXSORT_TARGET_PUSHED 1
10 | #pragma GCC push_options
11 | #pragma GCC target("avx512f,avx512dq,avx512bw,avx512vbmi2,popcnt")
12 | #endif
13 |
--------------------------------------------------------------------------------
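// --- Editorial sketch, not part of the repository sources ---
// The intended usage of the enable/disable pair above: bracket a
// translation unit's AVX2 definitions so clang/gcc compile only those
// functions with the wider target (avx2,popcnt,bmi2), while the rest of
// the binary keeps the baseline ISA. A minimal, hypothetical consumer:
#include "vxsort_targets_enable_avx2.h"

#include <immintrin.h>

// Compiled under the pushed avx2 target from the header above.
int popcount_256(__m256i v) {
    alignas(32) unsigned long long q[4];
    _mm256_store_si256((__m256i*)q, v);
    return __builtin_popcountll(q[0]) + __builtin_popcountll(q[1]) +
           __builtin_popcountll(q[2]) + __builtin_popcountll(q[3]);
}

#include "vxsort_targets_disable.h"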