├── .gitignore
├── CMakeLists.txt
├── LICENSE
├── LICENSE-catch2
├── LICENSE-cub
├── README
├── app
│   ├── CMakeLists.txt
│   ├── benchmark.cu
│   ├── benchmark_main.cu
│   ├── benchmark_sort.cu
│   ├── catch.hpp
│   ├── test_basecase.cu
│   ├── test_fixture.cuh
│   ├── test_helpers.cu
│   ├── test_main.cu
│   ├── test_qs.cu
│   └── test_ssss.cu
├── include
│   ├── cpu_reference.hpp
│   ├── cuda_definitions.cuh
│   ├── cuda_error.cuh
│   ├── cuda_memory.cuh
│   ├── cuda_timer.cuh
│   ├── kernel_config.cuh
│   ├── launcher_fwd.cuh
│   └── verification.hpp
└── lib
    ├── CMakeLists.txt
    ├── cpu_reference.cpp
    ├── gen_instantiations.py
    ├── generated
    │   ├── gen-full.cu
    │   ├── gen0.cu
    │   ├── gen1.cu
    │   ├── gen10.cu
    │   ├── gen11.cu
    │   ├── gen12.cu
    │   ├── gen13.cu
    │   ├── gen14.cu
    │   ├── gen15.cu
    │   ├── gen16.cu
    │   ├── gen17.cu
    │   ├── gen18.cu
    │   ├── gen19.cu
    │   ├── gen2.cu
    │   ├── gen20.cu
    │   ├── gen21.cu
    │   ├── gen22.cu
    │   ├── gen23.cu
    │   ├── gen24.cu
    │   ├── gen25.cu
    │   ├── gen26.cu
    │   ├── gen27.cu
    │   ├── gen28.cu
    │   ├── gen29.cu
    │   ├── gen3.cu
    │   ├── gen30.cu
    │   ├── gen31.cu
    │   ├── gen32.cu
    │   ├── gen33.cu
    │   ├── gen34.cu
    │   ├── gen35.cu
    │   ├── gen36.cu
    │   ├── gen37.cu
    │   ├── gen38.cu
    │   ├── gen39.cu
    │   ├── gen4.cu
    │   ├── gen5.cu
    │   ├── gen6.cu
    │   ├── gen7.cu
    │   ├── gen8.cu
    │   └── gen9.cu
    ├── qs_launchers.cuh
    ├── qs_recursion.cuh
    ├── qs_recursion_multi.cuh
    ├── qs_reduce.cuh
    ├── qs_scan.cuh
    ├── ssss_build_searchtree.cuh
    ├── ssss_collect.cuh
    ├── ssss_collect_multi.cuh
    ├── ssss_count.cuh
    ├── ssss_launchers.cuh
    ├── ssss_merged.cuh
    ├── ssss_merged_memory.cuh
    ├── ssss_recursion.cuh
    ├── ssss_recursion_multi.cuh
    ├── ssss_reduce.cuh
    ├── utils.cuh
    ├── utils_basecase.cuh
    ├── utils_bytestorage.cuh
    ├── utils_mask.cuh
    ├── utils_prefixsum.cuh
    ├── utils_sampling.cuh
    ├── utils_search.cuh
    ├── utils_sort.cuh
    ├── utils_warpaggr.cuh
    ├── utils_work.cuh
    └── verification.cpp
/.gitignore:
--------------------------------------------------------------------------------
1 | build
2 |
--------------------------------------------------------------------------------
/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | cmake_minimum_required(VERSION 3.8 FATAL_ERROR)
2 | project(gpu_selection LANGUAGES CXX CUDA)
3 |
4 | list(APPEND CMAKE_CUDA_FLAGS "-arch=sm_35 -rdc=true --maxrregcount 64 -gencode=arch=compute_35,code=sm_35 -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_52,code=sm_52 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_70,code=compute_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_80,code=compute_80")
5 | add_subdirectory(lib)
6 | add_subdirectory(app)
7 |
--------------------------------------------------------------------------------
/LICENSE-catch2:
--------------------------------------------------------------------------------
1 | Boost Software License - Version 1.0 - August 17th, 2003
2 |
3 | Permission is hereby granted, free of charge, to any person or organization
4 | obtaining a copy of the software and accompanying documentation covered by
5 | this license (the "Software") to use, reproduce, display, distribute,
6 | execute, and transmit the Software, and to prepare derivative works of the
7 | Software, and to permit third-parties to whom the Software is furnished to
8 | do so, all subject to the following:
9 |
10 | The copyright notices in the Software and this entire statement, including
11 | the above license grant, this restriction and the following disclaimer,
12 | must be included in all copies of the Software, in whole or in part, and
13 | all derivative works of the Software, unless such copies or derivative
14 | works are solely in the form of machine-executable object code generated by
15 | a source language processor.
16 |
17 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 | FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
20 | SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
21 | FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
22 | ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
23 | DEALINGS IN THE SOFTWARE.
24 |
--------------------------------------------------------------------------------
/LICENSE-cub:
--------------------------------------------------------------------------------
1 | Copyright (c) 2010-2011, Duane Merrill. All rights reserved.
2 | Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
3 |
4 | Redistribution and use in source and binary forms, with or without
5 | modification, are permitted provided that the following conditions are met:
6 | * Redistributions of source code must retain the above copyright
7 | notice, this list of conditions and the following disclaimer.
8 | * Redistributions in binary form must reproduce the above copyright
9 | notice, this list of conditions and the following disclaimer in the
10 | documentation and/or other materials provided with the distribution.
11 | * Neither the name of the NVIDIA CORPORATION nor the
12 | names of its contributors may be used to endorse or promote products
13 | derived from this software without specific prior written permission.
14 |
15 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
16 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
17 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
18 | DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
19 | DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
20 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
21 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
22 | ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
24 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
--------------------------------------------------------------------------------
/README:
--------------------------------------------------------------------------------
1 | This library implements a bucket-based selection algorithm on GPUs.
2 |
3 | More details can be found in
4 |
5 | * T. Ribizel and H. Anzt, "Approximate and Exact Selection on GPUs," 2019 IEEE International Parallel and Distributed Processing Symposium Workshops (IPDPSW), Rio de Janeiro, Brazil, 2019, pp. 471-478.
6 | doi: 10.1109/IPDPSW.2019.00088
7 | * T. Ribizel, H. Anzt, "Parallel selection on GPUs," Parallel Computing, Volume 91, 2020, doi: 10.1016/j.parco.2019.102588
8 |
9 | It uses Catch2 as a test framework and the CUB library as a reference implementation for sorting.
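The project is built with CMake (3.8 or newer) and the CUDA toolkit. A typical out-of-source build (the usual CMake workflow, assumed here rather than prescribed by the project; adjust the generator to your platform) looks like:

    mkdir build && cd build
    cmake ..
    make

The `build` directory is already covered by the top-level .gitignore, so an out-of-source build in `build/` keeps the working tree clean.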
10 |
11 | The tests can be run by simply executing `app/unittest`; the benchmarks can be run by executing `app/benchmark` with one of the following parameters:
12 |
13 | [full] The full benchmark for exact single and multiple selection and the individual kernels (sample, count, reduce, filter)
14 | [full-multionly] The full benchmark for multiple selection only
15 | [approx] The full benchmark for approximate selection with shared-memory atomics
16 | [approx-g] The full benchmark for approximate selection with global-memory atomics
17 | [multi] The full benchmark for multiple selection with different numbers of ranks
18 | [test] A small benchmark that only executes a single benchmark with a small input size
19 |
20 | The output of these benchmarks is the following:
21 | On stdout, they print error messages in case the algorithm execution produces invalid results. For the approx benchmarks, the exact and approximate ranks are additionally output in CSV format.
22 | On stderr, they print the individual timings of the kernels in CSV format, with the input size given by the first CSV field. Runtime breakdowns are listed within parentheses ().
23 |
24 | `app/benchmark-sort` contains a benchmark for the CUB radix sort implementation as a performance baseline for the multiple selection.
25 |
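For example, a typical session might look like this (the timing values below are illustrative, not measured):

    ./app/benchmark [multi] 2> timings.csv
    ./app/benchmark-sort 2> sort_timings.csv

Each stderr line starts with the benchmark name, which encodes the input size and value type, followed by one parenthesized timing group per run, e.g.

    cub_sort-65536-1073741824-f,(1.84),(1.79),(1.81),...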
26 | Structure of the project
27 |
28 | include/cpu_reference.hpp - Reference implementations for testing
29 | include/verification.hpp - Validation functions for testing
30 | include/cuda_definitions.cuh - Type definitions and hardware limits
31 | include/cuda_error.cuh - Wrapper for CUDA error handling
32 | include/cuda_memory.cuh - Wrapper for CUDA memory allocations
33 | include/cuda_timer.cuh - Wrapper for CUDA timing measurements
34 | include/kernel_config.cuh - Configuration struct for kernel templates
35 | include/launcher_fwd.cuh - Forward-declarations of launcher and kernel templates
36 |
37 | lib/generated/* - Explicit template instantiations to parallelize compilation
38 | lib/cpu_reference.cpp - Reference implementations for testing
39 | lib/verification.cpp - Validation functions for testing
40 | lib/qs_launchers.cuh - Wrappers for quickselect kernels
41 | lib/qs_recursion.cuh - Kernels for quickselect single-selection
42 | lib/qs_recursion_multi.cuh - Kernels for quickselect multi-selection
43 | lib/qs_reduce.cuh - Kernels for reducing quickselect partial sums
44 | lib/qs_scan.cuh - Kernels for quickselect bipartitioning
45 | lib/ssss_build_searchtree.cuh - Kernels for sampleselect sampling
46 | lib/ssss_collect.cuh - Kernels for sampleselect single-selection filtering
47 | lib/ssss_collect_multi.cuh - Kernels for sampleselect multi-selection filtering
48 | lib/ssss_count.cuh - Kernels for sampleselect counting
49 | lib/ssss_launchers.cuh - Wrappers for sampleselect kernels
50 | lib/ssss_merged.cuh - Kernels for multiple simultaneous sampleselects
51 | lib/ssss_merged_memory.cuh - Auxiliary data structure for sampleselect multi-selection
52 | lib/ssss_recursion.cuh - Kernels for sampleselect single-selection
53 | lib/ssss_recursion_multi.cuh - Kernels for sampleselect multi-selection
54 | lib/ssss_reduce.cuh - Kernels for reducing sampleselect partial sums
55 | lib/utils_basecase.cuh - Kernels for recursion basecase
56 | lib/utils_bytestorage.cuh - Auxiliary functions for reading/writing unaligned bytes
57 | lib/utils_mask.cuh - Auxiliary functions for bitmasks
58 | lib/utils_prefixsum.cuh - Auxiliary functions for tree-based partial sums
59 | lib/utils_sampling.cuh - Auxiliary functions for sampling
60 | lib/utils_search.cuh - Auxiliary functions for binary and warp-ary searches
61 | lib/utils_sort.cuh - Auxiliary functions for bitonic sorting
62 | lib/utils_warpaggr.cuh - Auxiliary functions for warp-aggregation
63 | lib/utils_work.cuh - Auxiliary functions for work-distribution
64 | lib/utils.cuh - Auxiliary wrappers for basic operations
65 |
--------------------------------------------------------------------------------
/app/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | add_executable(benchmark benchmark_main.cu benchmark.cu)
2 | add_executable(benchmark-sort benchmark_main.cu benchmark_sort.cu)
3 | target_include_directories(benchmark-sort PRIVATE ../include ../lib)
4 |
5 | add_executable(unittest test_main.cu test_qs.cu test_ssss.cu test_helpers.cu
6 | # test_basecase.cu
7 | )
8 |
9 | target_link_libraries(benchmark gpu_selection)
10 |
11 | target_link_libraries(unittest gpu_selection)
12 |
13 | set_target_properties(unittest PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
14 | set_target_properties(benchmark PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
15 | set_target_properties(benchmark-sort PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
16 |
--------------------------------------------------------------------------------
/app/benchmark_main.cu:
--------------------------------------------------------------------------------
1 | #define CATCH_CONFIG_MAIN
2 | #include "catch.hpp"
--------------------------------------------------------------------------------
/app/benchmark_sort.cu:
--------------------------------------------------------------------------------
1 | /*
2 | * Parallel selection algorithm on GPUs
3 | * Copyright (c) 2018-2019 Tobias Ribizel (oss@ribizel.de)
4 | *
5 | * This program is free software: you can redistribute it and/or modify
6 | * it under the terms of the GNU General Public License as published by
7 | * the Free Software Foundation, version 3.
8 | *
9 | * This program is distributed in the hope that it will be useful, but
10 | * WITHOUT ANY WARRANTY; without even the implied warranty of
11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 | * General Public License for more details.
13 | *
14 | * You should have received a copy of the GNU General Public License
15 | * along with this program. If not, see <https://www.gnu.org/licenses/>.
16 | */
17 | #include "catch.hpp"
18 | #include "test_fixture.cuh"
19 | #include <algorithm>
20 | #include <cub/cub.cuh>
21 | #include <iostream>
22 |
23 | namespace gpu {
24 |
25 | constexpr auto num_runs = 10;
26 |
27 | template <typename T>
28 | void cub_sort(std::string name, index n, index d, basic_test_data>& data, cuda_timer& timer) {
29 |     cub::DoubleBuffer<T> keys{static_cast<T*>(data.gpu_data), static_cast<T*>(data.gpu_data_out)};
30 |     timer.timed(name, num_runs, [&](auto event) {
31 |         data.reset();
32 |         auto tmp_size = sizeof(T) * n;
33 |         event(0);
34 |         cub::DeviceRadixSort::SortKeys(static_cast<void*>(data.gpu_data_tmp), tmp_size, keys, n);
35 |         event(1);
36 |     });
37 |     auto sorted = data.data;
38 |     auto ref = sorted;
39 |     cudaCheckError(cudaMemcpy(ref.data(), keys.Current(), n * sizeof(T), cudaMemcpyDeviceToHost));
40 |     std::sort(sorted.begin(), sorted.end());
41 |     bool is_sorted = sorted == ref;
42 |     CHECK(is_sorted);
43 | }
44 |
45 | TEMPLATE_TEST_CASE("sort", "", float, double) {
46 |     using T = TestType;
47 |     auto n = GENERATE(as<index>{}, 65536, 262144, 524288, 1048576, 2097152, 4194304, 8388608,
48 |                       16777216, 33554432, 67108864, 134217728);
49 |     auto d = GENERATE(as<index>{}, 1 << 30);
50 |     auto seed = GENERATE(take(10, Catch::Generators::random(0, 1000000)));
51 |     basic_test_data> data{n, d, index(seed)};
52 |     CAPTURE(n);
53 |     CAPTURE(d);
54 |     CAPTURE(seed);
55 |     cuda_timer timer{std::cerr};
56 |     auto suffix = "-" + std::to_string(n) + "-" + std::to_string(d) + "-" + typeid(T).name();
57 |     // thrust_sort("thrust_sort" + suffix, n, d, data, timer);
58 |     cub_sort("cub_sort" + suffix, n, d, data, timer);
59 | }
60 |
61 | } // namespace gpu
62 |
--------------------------------------------------------------------------------
/app/test_basecase.cu:
--------------------------------------------------------------------------------
1 | /*
2 | * Parallel selection algorithm on GPUs
3 | * Copyright (c) 2018-2019 Tobias Ribizel (oss@ribizel.de)
4 | *
5 | * This program is free software: you can redistribute it and/or modify
6 | * it under the terms of the GNU General Public License as published by
7 | * the Free Software Foundation, version 3.
8 | *
9 | * This program is distributed in the hope that it will be useful, but
10 | * WITHOUT ANY WARRANTY; without even the implied warranty of
11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 | * General Public License for more details.
13 | *
14 | * You should have received a copy of the GNU General Public License
15 | * along with this program. If not, see <https://www.gnu.org/licenses/>.
16 | */ 17 | #include "catch.hpp" 18 | #include "../lib/utils_basecase.cuh" 19 | #include "test_fixture.cuh" 20 | #include 21 | #include 22 | #include 23 | 24 | namespace gpu { 25 | 26 | template 27 | struct extended_pair { 28 | constexpr static int size = Size; 29 | constexpr static int replsize = Replsize; 30 | using first_type = A; 31 | using second_type = B; 32 | }; 33 | 34 | template 35 | using test_data = basic_test_data; 36 | 37 | template 38 | using float_pair = extended_pair; 39 | 40 | TEMPLATE_PRODUCT_TEST_CASE_METHOD(test_data, "basecase", "[basecase]", 41 | (float_pair), 42 | ((select_config<10, 10, 8, true, true, false, 8, 10, 10, false, 8, 0>), 43 | (select_config<10, 10, 8, true, true, false, 8, 10, 10, false, 8, 1>), 44 | (select_config<10, 10, 8, true, true, false, 8, 10, 10, false, 8, 2>), 45 | (select_config<12, 10, 8, true, true, false, 8, 10, 10, false, 8, 2>))) { 46 | using T = typename TestType::first_type; 47 | using Config = typename TestType::second_type; 48 | std::vector ranks(1); 49 | constexpr auto basecase_size = Config::basecase::size; 50 | constexpr auto local_size = Config::basecase::local_size; 51 | constexpr auto cur_launch_size = Config::basecase::launch_size; 52 | auto size = GENERATE(as{}, basecase_size, basecase_size / 5, warp_size * local_size, warp_size * local_size / 5); 53 | auto launch_size = GENERATE(as{}, cur_launch_size, max_block_size); 54 | std::string mode; 55 | SECTION("some ranks") { 56 | mode = "some ranks"; 57 | ranks.resize(std::min(100, size / 2)); 58 | for (auto i = 0; i < ranks.size(); ++i) { 59 | ranks[i] = i * size / ranks.size(); 60 | } 61 | } 62 | SECTION("all ranks") { 63 | mode = "all ranks"; 64 | ranks.resize(size); 65 | std::iota(ranks.begin(), ranks.end(), 0); 66 | } 67 | CAPTURE(size); 68 | CAPTURE(launch_size); 69 | CAPTURE(mode); 70 | this->gpu_ranks.copy_from(ranks); 71 | this->run([&]() { kernels::select_bitonic_basecase<<<1, launch_size>>>(this->gpu_data, size, ranks.back(), this->gpu_data_out); }); 72 | std::vector result; 73 | this->gpu_data_out.copy_to(result); 74 | auto data = this->data; 75 | data.resize(size); 76 | std::sort(data.begin(), data.end()); 77 | CHECK(data[ranks.back()] == result[0]); 78 | this->run([&]() { kernels::select_bitonic_multiple_basecase<<<1, launch_size>>>(this->gpu_data, size, this->gpu_ranks, ranks.size(), 0, this->gpu_data_out); }); 79 | this->gpu_data_out.copy_to(result); 80 | index count{}; 81 | for (auto i = 0; i < ranks.size(); ++i) { 82 | count += result[i] != data[ranks[i]]; 83 | } 84 | CHECK(count == 0); 85 | } 86 | 87 | } // namespace gpu -------------------------------------------------------------------------------- /app/test_fixture.cuh: -------------------------------------------------------------------------------- 1 | /* 2 | * Parallel selection algorithm on GPUs 3 | * Copyright (c) 2018-2019 Tobias Ribizel (oss@ribizel.de) 4 | * 5 | * This program is free software: you can redistribute it and/or modify 6 | * it under the terms of the GNU General Public License as published by 7 | * the Free Software Foundation, version 3. 8 | * 9 | * This program is distributed in the hope that it will be useful, but 10 | * WITHOUT ANY WARRANTY; without even the implied warranty of 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 12 | * General Public License for more details. 13 | * 14 | * You should have received a copy of the GNU General Public License 15 | * along with this program. If not, see . 
16 | */
17 | #include <algorithm>
18 | #include <numeric>
19 | #include <random>
20 | #include <vector>
21 | #include <cuda_definitions.cuh>
22 | #include <cuda_error.cuh>
23 | #include <cuda_memory.cuh>
24 | #include <kernel_config.cuh>
25 |
26 | namespace gpu {
27 |
28 | constexpr auto max_tree_width = 4096;
29 | constexpr auto max_tree_size = 2 * max_tree_width * 2;
30 | constexpr auto max_block_count = 1024;
31 |
32 | template <typename Pair, index Size, index Valsize>
33 | struct basic_test_data {
34 |     using T = typename Pair::first_type;
35 |     using Config = typename Pair::second_type;
36 |     index size;
37 |     std::vector<T> data;
38 |     std::vector<T> tree;
39 |     std::vector<T> data_out;
40 |     std::vector<poracle> oracles;
41 |     std::vector<index> count_out;
42 |     std::vector<index> atomic;
43 |     std::vector<index> zeros;
44 |     std::vector<index> ranks;
45 |     std::vector<mask> bucket_mask;
46 |     cuda_resettable_array<T> gpu_data;
47 |     cuda_array<T> gpu_data_tmp;
48 |     cuda_array<T> gpu_tree;
49 |     cuda_array<T> gpu_data_out;
50 |     cuda_array<poracle> gpu_oracles;
51 |     cuda_array<index> gpu_aux;
52 |     cuda_resettable_array<index> gpu_atomic;
53 |     cuda_array<index> gpu_count_tmp;
54 |     cuda_resettable_array<index> gpu_count_out;
55 |     cuda_array<index> gpu_bucket_ranges;
56 |     cuda_array<index> gpu_rank_ranges;
57 |     cuda_array<index> gpu_ranks;
58 |     cuda_array<mask> gpu_bucket_mask;
59 |     index rank;
60 |     T pivot;
61 |
62 |     basic_test_data(index size = Size, index valsize = Valsize, index seed = 0)
63 |         : size{size}, data(size), tree(max_tree_size), oracles(size), count_out(max_block_count * 2 + 2),
64 |           zeros(size + max_block_count * max_tree_width * 16), ranks(287), bucket_mask(max_tree_size / (sizeof(mask) * 8)),
65 |           atomic(max_tree_width) {
66 |         std::default_random_engine random(seed);
67 |         std::uniform_int_distribution<index> dist(0, valsize - 1);
68 |         std::uniform_int_distribution<index> idist(0, size - 1);
69 |         std::uniform_int_distribution<mask> maskdist(mask(0), ~mask(0));
70 |         std::vector<index> smallzeros(max_tree_size);
71 |         for (auto& el : data) {
72 |             el = dist(random);
73 |         }
74 |         for (auto& el : ranks) {
75 |             el = idist(random);
76 |         }
77 |         ranks.back() = size - 1;
78 |         std::sort(ranks.begin(), ranks.end());
79 |         rank = idist(random);
80 |         pivot = data[rank];
81 |         gpu_data.copy_from(data);
82 |         gpu_tree.copy_from(tree);
83 |         gpu_data_tmp.copy_from(data);
84 |         gpu_data_out.copy_from(data);
85 |         gpu_atomic.copy_from(atomic);
86 |         gpu_count_tmp.copy_from(zeros);
87 |         gpu_aux.copy_from(zeros);
88 |         gpu_count_out.copy_from(count_out);
89 |         gpu_oracles.copy_from(oracles);
90 |         gpu_bucket_ranges.copy_from(smallzeros);
91 |         gpu_rank_ranges.copy_from(smallzeros);
92 |         gpu_ranks.copy_from(ranks);
93 |         gpu_bucket_mask.copy_from(bucket_mask);
94 |     }
95 |
96 |     void reset() {
97 |         gpu_data.reset();
98 |         gpu_atomic.reset();
99 |         gpu_count_out.reset();
100 |     }
101 |
102 |     void copy_from_gpu() {
103 |         gpu_data_out.copy_to(data_out);
104 |         gpu_count_out.copy_to(count_out);
105 |         gpu_tree.copy_to(tree);
106 |         gpu_oracles.copy_to(oracles);
107 |         gpu_atomic.copy_to(atomic);
108 |     }
109 |
110 |     template <typename F>
111 |     void run(F f) {
112 |         cudaChecked(f);
113 |         copy_from_gpu();
114 |     }
115 | };
116 |
117 | inline std::vector<unsigned char> unpack(const std::vector<poracle>& in, int size) {
118 |     using uc = unsigned char;
119 |     std::vector<uc> result;
120 |     result.reserve(in.size() * 4);
121 |     for (auto el : in) {
122 |         result.insert(result.end(), {uc(el), uc(el >> 8), uc(el >> 16), uc(el >> 24)});
123 |     }
124 |     result.resize(size);
125 |     return result;
126 | }
127 |
128 | inline std::vector<index> build_ranks_uniform(index size, index count) {
129 |     std::vector<index> result;
130 |     for (index i = 0; i < count; ++i) {
131 |         result.push_back(int(double(i) * size / count));
132 |     }
133 |     return result;
134 | }
135 |
136 | inline std::vector<index> build_ranks_clustered(index size)
{ 137 | std::vector result; 138 | auto step = size / 2; 139 | while (step >= 1) { 140 | result.push_back(step); 141 | step = step / 2; 142 | } 143 | std::reverse(result.begin(), result.end()); 144 | return result; 145 | } 146 | 147 | } // namespace gpu -------------------------------------------------------------------------------- /app/test_main.cu: -------------------------------------------------------------------------------- 1 | #define CATCH_CONFIG_MAIN 2 | #include "catch.hpp" 3 | -------------------------------------------------------------------------------- /app/test_qs.cu: -------------------------------------------------------------------------------- 1 | /* 2 | * Parallel selection algorithm on GPUs 3 | * Copyright (c) 2018-2019 Tobias Ribizel (oss@ribizel.de) 4 | * 5 | * This program is free software: you can redistribute it and/or modify 6 | * it under the terms of the GNU General Public License as published by 7 | * the Free Software Foundation, version 3. 8 | * 9 | * This program is distributed in the hope that it will be useful, but 10 | * WITHOUT ANY WARRANTY; without even the implied warranty of 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 12 | * General Public License for more details. 13 | * 14 | * You should have received a copy of the GNU General Public License 15 | * along with this program. If not, see . 16 | */ 17 | #include "catch.hpp" 18 | #include "test_fixture.cuh" 19 | 20 | #include 21 | #include 22 | #include 23 | #include 24 | 25 | namespace gpu { 26 | 27 | template 28 | using test_data = basic_test_data; 29 | 30 | template 31 | using float_pair = typename std::pair; 32 | 33 | template 34 | using double_pair = typename std::pair; 35 | 36 | TEMPLATE_PRODUCT_TEST_CASE_METHOD(test_data, "bipartition", "[quickselect]", 37 | (float_pair, double_pair), 38 | ((select_config<10, 5, 8, true, true, true, 8, 10, 10>), 39 | (select_config<10, 5, 8, false, true, true, 8, 10, 10>))) { 40 | using T = typename TestType::first_type; 41 | using Config = typename TestType::second_type; 42 | this->run([&]() { 43 | partition(this->gpu_data, this->gpu_data_out, this->gpu_count_out, this->size, 44 | this->pivot); 45 | }); 46 | auto lsize = this->count_out[0]; 47 | auto rsize = this->count_out[1]; 48 | CHECK(lsize + rsize == this->size); 49 | auto counts = verification::count_mispartitioned(this->data_out, lsize, this->pivot); 50 | auto lcount = counts.first; 51 | auto rcount = counts.second; 52 | CHECK(lcount == 0); 53 | CHECK(rcount == 0); 54 | } 55 | 56 | TEMPLATE_PRODUCT_TEST_CASE_METHOD(test_data, "quickselect", "[quickselect]", 57 | (float_pair, double_pair), 58 | ((select_config<10, 5, 8, true, true, true, 8, 10, 10>), 59 | (select_config<10, 5, 8, false, true, true, 8, 10, 10>))) { 60 | using T = typename TestType::first_type; 61 | using Config = typename TestType::second_type; 62 | this->run([&]() { 63 | quickselect(this->gpu_data, this->gpu_data_tmp, this->gpu_count_tmp, this->size, this->rank, 64 | this->gpu_data_out); 65 | }); 66 | auto ref = verification::nth_element(this->data, this->rank); 67 | CHECK(ref == this->data_out[0]); 68 | } 69 | 70 | TEMPLATE_PRODUCT_TEST_CASE_METHOD(test_data, "quickselect_multi", "[quickselect]", 71 | (float_pair, double_pair), 72 | ((select_config<10, 5, 8, true, true, true, 8, 10, 10>), 73 | (select_config<10, 5, 8, false, true, true, 8, 10, 10>))) { 74 | using T = typename TestType::first_type; 75 | using Config = typename TestType::second_type; 76 | std::vector ranks; 77 | SECTION("some ranks") { 78 | for 
(int i = 0; i < 100; ++i) { 79 | ranks.push_back(this->size * i / 120); 80 | ranks.push_back(this->size * i / 120 + 1); 81 | ranks.push_back(this->size * i / 120 + 2); 82 | ranks.push_back(this->size * i / 120 + 10); 83 | } 84 | for (int i = 0; i < 6000; ++i) { 85 | ranks.push_back(i + 4000); 86 | } 87 | std::sort(ranks.begin(), ranks.end()); 88 | ranks.erase(std::unique(ranks.begin(), ranks.end()), ranks.end()); 89 | } 90 | SECTION("all ranks") { 91 | ranks.resize(this->size); 92 | std::iota(ranks.begin(), ranks.end(), 0); 93 | } 94 | std::vector result(ranks.size()); 95 | this->gpu_ranks.copy_from(ranks); 96 | this->gpu_data_out.copy_from(result); 97 | this->run([&]() { 98 | quickselect_multi(this->gpu_data, this->gpu_data_tmp, this->gpu_count_tmp, this->size, this->gpu_ranks, ranks.size(), 99 | this->gpu_data_out); 100 | }); 101 | auto ref = this->data; 102 | std::sort(ref.begin(), ref.end()); 103 | this->gpu_data_out.copy_to(result); 104 | std::vector reference; 105 | for (auto rank : ranks) { 106 | reference.push_back(ref[rank]); 107 | } 108 | int count{}; 109 | for (index i = 0; i < reference.size(); ++i) { 110 | count += reference[i] != result[i]; 111 | } 112 | CAPTURE(reference.size()); 113 | CHECK(count == 0); 114 | } 115 | 116 | } // namespace gpu -------------------------------------------------------------------------------- /include/cpu_reference.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Parallel selection algorithm on GPUs 3 | * Copyright (c) 2018-2019 Tobias Ribizel (oss@ribizel.de) 4 | * 5 | * This program is free software: you can redistribute it and/or modify 6 | * it under the terms of the GNU General Public License as published by 7 | * the Free Software Foundation, version 3. 8 | * 9 | * This program is distributed in the hope that it will be useful, but 10 | * WITHOUT ANY WARRANTY; without even the implied warranty of 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 12 | * General Public License for more details. 13 | * 14 | * You should have received a copy of the GNU General Public License 15 | * along with this program. If not, see . 
16 | */
17 | #ifndef GPU_SELECTION_CPU_REFERENCE_HPP
18 | #define GPU_SELECTION_CPU_REFERENCE_HPP
19 |
20 | #include <vector>
21 | #include <cuda_definitions.cuh>
22 |
23 | namespace cpu {
24 |
25 | using gpu::index;
26 | using gpu::mask;
27 |
28 | template <typename T>
29 | std::pair<int, int> partition(const std::vector<T>& data, int begin, int end, std::vector<T>& out,
30 |                               int pivot_idx);
31 |
32 | template <typename T>
33 | T quickselect(std::vector<T>& in, std::vector<T>& out, int rank);
34 |
35 | template <typename T>
36 | std::vector<T> build_searchtree(const std::vector<T>& in, int sample_size, int searchtree_size);
37 |
38 | template <typename T>
39 | std::pair<std::vector<index>, std::vector<gpu::oracle>> ssss(const std::vector<T>& data,
40 |                                                              const std::vector<T>& tree, bool write);
41 |
42 | std::vector<index> grouped_reduce(const std::vector<index>& data, int searchtree_size);
43 | std::vector<index> grouped_prefix_sum(const std::vector<index>& data, int searchtree_size);
44 |
45 | std::vector<index> compute_rank_ranges(std::vector<index> counts, const std::vector<index>& ranks);
46 | std::vector<mask> compute_bucket_mask(const std::vector<index>& rank_ranges);
47 |
48 | std::pair<std::vector<index>, index> masked_prefix_sum(const std::vector<index>& counts, const std::vector<mask>& m);
49 |
50 | } // namespace cpu
51 |
52 | #endif // GPU_SELECTION_CPU_REFERENCE_HPP
53 |
--------------------------------------------------------------------------------
/include/cuda_definitions.cuh:
--------------------------------------------------------------------------------
1 | /*
2 | * Parallel selection algorithm on GPUs
3 | * Copyright (c) 2018-2019 Tobias Ribizel (oss@ribizel.de)
4 | *
5 | * This program is free software: you can redistribute it and/or modify
6 | * it under the terms of the GNU General Public License as published by
7 | * the Free Software Foundation, version 3.
8 | *
9 | * This program is distributed in the hope that it will be useful, but
10 | * WITHOUT ANY WARRANTY; without even the implied warranty of
11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 | * General Public License for more details.
13 | *
14 | * You should have received a copy of the GNU General Public License
15 | * along with this program. If not, see <https://www.gnu.org/licenses/>.
16 | */
17 | #ifndef GPU_SELECTION_CUDA_DEFINITIONS_CUH
18 | #define GPU_SELECTION_CUDA_DEFINITIONS_CUH
19 |
20 | #include <cstdint>
21 |
22 | namespace gpu {
23 |
24 | using index = std::uint32_t;
25 | using poracle = std::uint32_t;
26 | using oracle = std::uint32_t;
27 | using mask = std::uint32_t;
28 |
29 | constexpr index warp_size_log2 = 5;
30 | constexpr index warp_size = 1 << warp_size_log2;
31 | constexpr index max_block_size_log2 = 10;
32 | constexpr index max_block_size = 1 << max_block_size_log2;
33 |
34 | } // namespace gpu
35 |
36 | #endif // GPU_SELECTION_CUDA_DEFINITIONS_CUH
37 |
--------------------------------------------------------------------------------
/include/cuda_error.cuh:
--------------------------------------------------------------------------------
1 | /*
2 | * Parallel selection algorithm on GPUs
3 | * Copyright (c) 2018-2019 Tobias Ribizel (oss@ribizel.de)
4 | *
5 | * This program is free software: you can redistribute it and/or modify
6 | * it under the terms of the GNU General Public License as published by
7 | * the Free Software Foundation, version 3.
8 | *
9 | * This program is distributed in the hope that it will be useful, but
10 | * WITHOUT ANY WARRANTY; without even the implied warranty of
11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 | * General Public License for more details.
13 | *
14 | * You should have received a copy of the GNU General Public License
15 | * along with this program. If not, see <https://www.gnu.org/licenses/>.
16 | */
17 | #ifndef CUDA_CHECK_ERROR_CUH
18 | #define CUDA_CHECK_ERROR_CUH
19 |
20 | #include <stdexcept>
21 |
22 | inline void cudaCheckError(cudaError_t error) {
23 |     if (error != cudaSuccess) {
24 |         std::string msg{"CUDA error "};
25 |         msg += cudaGetErrorName(error);
26 |         msg += ": ";
27 |         msg += cudaGetErrorString(error);
28 |         throw std::runtime_error{msg};
29 |     }
30 | }
31 |
32 | template <typename F>
33 | void cudaChecked(F func) {
34 |     func();
35 |     cudaDeviceSynchronize();
36 |     cudaCheckError(cudaGetLastError());
37 | }
38 |
39 | #endif // CUDA_CHECK_ERROR_CUH
40 |
--------------------------------------------------------------------------------
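A minimal usage sketch for these two wrappers (the `scale` kernel and `scale_checked` function are hypothetical, not part of this repository):

    __global__ void scale(float* data, float factor) { data[threadIdx.x] *= factor; }

    void scale_checked(float* device_data) {
        // cudaChecked runs the launch, synchronizes, and turns any pending
        // launch or runtime error into a std::runtime_error:
        cudaChecked([&]() { scale<<<1, 32>>>(device_data, 2.0f); });
        // cudaCheckError converts an explicit cudaError_t return value:
        cudaCheckError(cudaMemset(device_data, 0, 32 * sizeof(float)));
    }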
/include/cuda_memory.cuh:
--------------------------------------------------------------------------------
1 | /*
2 | * Parallel selection algorithm on GPUs
3 | * Copyright (c) 2018-2019 Tobias Ribizel (oss@ribizel.de)
4 | *
5 | * This program is free software: you can redistribute it and/or modify
6 | * it under the terms of the GNU General Public License as published by
7 | * the Free Software Foundation, version 3.
8 | *
9 | * This program is distributed in the hope that it will be useful, but
10 | * WITHOUT ANY WARRANTY; without even the implied warranty of
11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 | * General Public License for more details.
13 | *
14 | * You should have received a copy of the GNU General Public License
15 | * along with this program. If not, see <https://www.gnu.org/licenses/>.
16 | */
17 | #ifndef CUDA_MEMORY_CUH
18 | #define CUDA_MEMORY_CUH
19 |
20 | #include <iostream>
21 | #include <vector>
22 |
23 | #include "cuda_error.cuh"
24 |
25 | template <typename T>
26 | class cuda_resettable_array;
27 |
28 | template <typename T>
29 | class cuda_array {
30 |     friend class cuda_resettable_array<T>;
31 | public:
32 |     cuda_array() : size{}, storage{nullptr} {}
33 |     cuda_array(std::size_t size) : size{size}, storage{nullptr} {
34 |         cudaCheckError(cudaMalloc(&storage, sizeof(T) * size));
35 |     }
36 |     ~cuda_array() {
37 |         if (storage) {
38 |             try {
39 |                 cudaCheckError(cudaFree(storage));
40 |             } catch (std::runtime_error& err) {
41 |                 std::cerr << err.what() << std::endl;
42 |             }
43 |         }
44 |     }
45 |     cuda_array(const cuda_array&) = delete;
46 |     cuda_array(cuda_array&& other) {
47 |         storage = other.storage;
48 |         size = other.size;
49 |         other.storage = nullptr;
50 |         other.size = 0;
51 |     }
52 |     cuda_array& operator=(cuda_array&& other) {
53 |         this->~cuda_array();
54 |         storage = other.storage;
55 |         size = other.size;
56 |         other.storage = nullptr;
57 |         other.size = 0;
58 |         return *this;
59 |     }
60 |
61 |     operator T*() { return storage; }
62 |
63 |     void copy_from_raw(const T* src) {
64 |         cudaCheckError(cudaMemcpy(storage, src, size * sizeof(T), cudaMemcpyHostToDevice));
65 |     }
66 |
67 |     void copy_to_raw(T* dst) const {
68 |         cudaCheckError(cudaMemcpy(dst, storage, size * sizeof(T), cudaMemcpyDeviceToHost));
69 |     }
70 |
71 |     void copy_from(const std::vector<T>& vec) {
72 |         if (size != vec.size()) {
73 |             *this = cuda_array{vec.size()};
74 |         }
75 |         copy_from_raw(vec.data());
76 |     }
77 |
78 |     void copy_to(std::vector<T>& vec) const {
79 |         vec.resize(size);
80 |         copy_to_raw(vec.data());
81 |     }
82 |
83 | private:
84 |     std::size_t size;
85 |     T* storage;
86 | };
87 |
88 | template <typename T>
89 | class cuda_resettable_array {
90 | public:
91 |     void copy_from_raw(const T* src) {
92 |         storage.copy_from_raw(src);
93 |         refstorage.copy_from_raw(src);
94 |     }
95 |
96 |     void copy_to_raw(T* dst) const {
97 |         storage.copy_to_raw(dst);
98 |     }
99 |
100 |     void copy_from(const std::vector<T>& vec) {
101 |         storage.copy_from(vec);
102 |         refstorage.copy_from(vec);
103 |     }
104 |
105 |     void copy_to(std::vector<T>& vec) const {
106 |         storage.copy_to(vec);
107 |     }
108 |
109 |     void reset() {
110 |         cudaCheckError(cudaMemcpy(storage, refstorage, storage.size * sizeof(T), cudaMemcpyDeviceToDevice));
111 |     }
112 |
113 |     operator T*() { return storage; }
114 |
115 | private:
116 |     cuda_array<T> storage;
117 |     cuda_array<T> refstorage;
118 | };
119 |
120 | #endif // CUDA_MEMORY_CUH
121 |
--------------------------------------------------------------------------------
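A usage sketch for these wrappers (hypothetical host code; the error handling comes from cuda_error.cuh above):

    std::vector<float> host{1.0f, 2.0f, 3.0f};
    cuda_array<float> device;
    device.copy_from(host);   // allocates (or reallocates) and copies host -> device
    // cuda_array<T> converts implicitly to T*, e.g. for kernel arguments
    std::vector<float> result;
    device.copy_to(result);   // resizes the vector and copies device -> host

cuda_resettable_array additionally keeps an untouched reference copy on the device, so reset() can restore the original contents between benchmark runs without another host transfer.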
/include/cuda_timer.cuh:
--------------------------------------------------------------------------------
1 | /*
2 | * Parallel selection algorithm on GPUs
3 | * Copyright (c) 2018-2019 Tobias Ribizel (oss@ribizel.de)
4 | *
5 | * This program is free software: you can redistribute it and/or modify
6 | * it under the terms of the GNU General Public License as published by
7 | * the Free Software Foundation, version 3.
8 | *
9 | * This program is distributed in the hope that it will be useful, but
10 | * WITHOUT ANY WARRANTY; without even the implied warranty of
11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 | * General Public License for more details.
13 | *
14 | * You should have received a copy of the GNU General Public License
15 | * along with this program. If not, see <https://www.gnu.org/licenses/>.
16 | */
17 | #ifndef CUDA_TIMER_CUH
18 | #define CUDA_TIMER_CUH
19 |
20 | #include "cuda_error.cuh"
21 |
22 | #include <algorithm>
23 | #include <chrono>
24 | #include <iostream>
25 | #include <iterator>
26 | #include <vector>
27 |
28 | class cuda_timer {
29 | public:
30 |     cuda_timer(std::ostream& output) : m_events(6), m_output{&output} {
31 |         for (auto& event : m_events) {
32 |             cudaCheckError(cudaEventCreate(&event));
33 |         }
34 |     }
35 |
36 |     ~cuda_timer() {
37 |         for (auto& event : m_events) {
38 |             cudaEventDestroy(event);
39 |         }
40 |     }
41 |
42 |     template <typename Kernel>
43 |     void timed(std::string name, int num_runs, Kernel kernel) {
44 |         std::vector<std::vector<float>> results(num_runs, std::vector<float>(m_events.size() - 1));
45 |         int max_event = -1;
46 |         auto event = [&](int idx_event) {
47 |             cudaCheckError(cudaEventRecord(m_events[idx_event]));
48 |             max_event = std::max(idx_event, max_event);
49 |         };
50 |         for (int i = 0; i < num_runs; ++i) {
51 |             cudaChecked([&]() { kernel(event); });
52 |             cudaCheckError(cudaEventSynchronize(m_events[max_event]));
53 |             for (int j = 0; j < max_event; ++j) {
54 |                 cudaCheckError(cudaEventElapsedTime(&results[i][j], m_events[j], m_events[j + 1]));
55 |             }
56 |         }
57 |         auto& out = *m_output;
58 |         out << name;
59 |         for (const auto& run : results) {
60 |             out << ",(";
61 |             std::copy(run.begin(), run.begin() + max_event - 1, std::ostream_iterator<float>(out, ";"));
62 |             out << run[max_event - 1] << ')';
63 |         }
64 |         out << std::endl; // flush output (in case of errors!)
65 |     }
66 |
67 | private:
68 |     std::vector<cudaEvent_t> m_events;
69 |     std::ostream* m_output;
70 | };
71 |
72 | class cpu_timer {
73 | public:
74 |     void start() { m_start = std::chrono::high_resolution_clock::now(); }
75 |     void stop() { m_end = std::chrono::high_resolution_clock::now(); }
76 |     template <typename F>
77 |     void timed(F f) {
78 |         start();
79 |         f();
80 |         stop();
81 |     }
82 |     double elapsed_us(int repetitions = 1) {
83 |         return std::chrono::duration<double, std::micro>(m_end - m_start).count() / repetitions;
84 |     }
85 |
86 | private:
87 |     std::chrono::high_resolution_clock::time_point m_start;
88 |     std::chrono::high_resolution_clock::time_point m_end;
89 | };
90 |
91 | #endif // CUDA_TIMER_CUH
92 |
--------------------------------------------------------------------------------
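A usage sketch for cuda_timer (the `fill` kernel and `time_fill` function are hypothetical; each event index marks a phase boundary, and the timer measures the elapsed time between consecutive events):

    __global__ void fill(float* data, float value) { data[threadIdx.x] = value; }

    void time_fill(float* device_data) {
        cuda_timer timer{std::cerr};
        timer.timed("fill-demo", 10, [&](auto event) {
            event(0);
            fill<<<1, 256>>>(device_data, 1.0f);
            event(1);  // elapsed(0 -> 1): first phase
            fill<<<1, 256>>>(device_data, 2.0f);
            event(2);  // elapsed(1 -> 2): second phase
        });
        // stderr now holds: fill-demo,(t1;t2),(t1;t2),... with one group per run
    }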
/include/kernel_config.cuh:
--------------------------------------------------------------------------------
1 | /*
2 | * Parallel selection algorithm on GPUs
3 | * Copyright (c) 2018-2019 Tobias Ribizel (oss@ribizel.de)
4 | *
5 | * This program is free software: you can redistribute it and/or modify
6 | * it under the terms of the GNU General Public License as published by
7 | * the Free Software Foundation, version 3.
8 | *
9 | * This program is distributed in the hope that it will be useful, but
10 | * WITHOUT ANY WARRANTY; without even the implied warranty of
11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 | * General Public License for more details.
13 | *
14 | * You should have received a copy of the GNU General Public License
15 | * along with this program. If not, see <https://www.gnu.org/licenses/>.
16 | */
17 | #include "cuda_definitions.cuh"
18 | #include <algorithm>
19 |
20 | namespace gpu {
21 |
22 | template <index Size_log2, index Local_size_log2>
23 | struct bitonic_basecase_config {
24 |     constexpr static index size_log2 = Size_log2;
25 |     constexpr static index size = 1 << size_log2;
26 |     constexpr static index local_size_log2 = Local_size_log2;
27 |     constexpr static index local_size = 1 << local_size_log2;
28 |     constexpr static index launch_size = size / local_size;
29 | };
30 |
31 | template <index Size_log2>
32 | struct sample_config {
33 |     constexpr static index size_log2 = Size_log2;
34 |     constexpr static index size = 1 << size_log2;
35 |     constexpr static index local_size_log2 = size_log2 > max_block_size_log2 ? size_log2 - max_block_size_log2 : 0;
36 |     constexpr static index local_size = 1 << local_size_log2;
37 | };
38 |
39 | template <index Height>
40 | struct searchtree_config {
41 |     constexpr static index height = Height;
42 |     constexpr static index width = 1 << height;
43 |     constexpr static index size = 2 * width - 1;
44 | };
45 |
46 | template <bool Shared_memory, bool Warp_aggr, bool Write, index Unroll, index Max_block_size_log2,
47 |           index Max_block_count_log2, index Merged_limit, bool Bucket_select>
48 | struct algorithm_config {
49 |     constexpr static bool shared_memory = Shared_memory;
50 |     constexpr static bool warp_aggr = Warp_aggr;
51 |     constexpr static bool write = Write;
52 |     constexpr static index unroll = Unroll;
53 |     constexpr static index max_block_size_log2 = Max_block_size_log2;
54 |     constexpr static index max_block_size = 1 << max_block_size_log2;
55 |     constexpr static index max_block_count_log2 = Max_block_count_log2;
56 |     constexpr static index max_block_count = 1 << max_block_count_log2;
57 |     constexpr static index merged_limit = Merged_limit;
58 |     constexpr static bool bucket_select = Bucket_select;
59 | };
60 |
61 | template
64 | struct select_config {
65 |     using basecase = bitonic_basecase_config;
66 |     using sample = sample_config;
67 |     using searchtree = searchtree_config;
68 |     using algorithm = algorithm_config;
70 |     constexpr static auto searchtree_kernel_size = std::max(std::min(max_block_size, sample::size), searchtree::width);
71 | };
72 |
73 | } // namespace gpu
--------------------------------------------------------------------------------
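For orientation, the tests instantiate these configuration templates with explicit parameter packs, for example in app/test_qs.cu (trailing parameters keep their defaults):

    using Config = gpu::select_config<10, 5, 8, true, true, true, 8, 10, 10>;
    // The nested configs are then available as compile-time constants, e.g.
    // Config::basecase::size, Config::searchtree::width,
    // Config::algorithm::max_block_size and Config::searchtree_kernel_size.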
/include/launcher_fwd.cuh:
--------------------------------------------------------------------------------
1 | /*
2 | * Parallel selection algorithm on GPUs
3 | * Copyright (c) 2018-2019 Tobias Ribizel (oss@ribizel.de)
4 | *
5 | * This program is free software: you can redistribute it and/or modify
6 | * it under the terms of the GNU General Public License as published by
7 | * the Free Software Foundation, version 3.
8 | *
9 | * This program is distributed in the hope that it will be useful, but
10 | * WITHOUT ANY WARRANTY; without even the implied warranty of
11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 | * General Public License for more details.
13 | *
14 | * You should have received a copy of the GNU General Public License
15 | * along with this program. If not, see <https://www.gnu.org/licenses/>.
16 | */
17 | #ifndef LAUNCHER_FWD_CUH
18 | #define LAUNCHER_FWD_CUH
19 |
20 | #include "cuda_definitions.cuh"
21 | #include "ssss_merged_memory.cuh"
22 |
23 | namespace gpu {
24 |
25 | namespace kernels {
26 | template <typename T, typename Config>
27 | struct ssss_multi_aux;
28 |
29 | template <typename T, typename Config>
30 | __global__ void partition(const T* in, T* out, index* atomic, index size, T pivot, index workcount);
31 |
32 | template <typename T, typename Config>
33 | __global__ void partition_count(const T* in, index* counts, index size, T pivot, index workcount);
34 |
35 | template <typename T, typename Config>
36 | __global__ void partition_distr(const T* in, T* out, const index* counts, index size, T pivot, index workcount);
37 |
38 | template <typename Config>
39 | __global__ void reduce_counts(const index* in, index* out, index num_blocks);
40 |
41 | template <typename Config>
42 | __global__ void prefix_sum_counts(index* in, index* out, index num_blocks);
43 |
44 | template <typename Config>
45 | __global__ void partition_prefixsum(index* counts, index block_count);
46 |
47 | template <typename T, typename Config>
48 | __global__ void count_buckets(const T* in, const T* tree, index* counts, poracle* oracles, index size, index workcount);
49 |
50 | template
51 | __device__ void masked_prefix_sum(index* counts, const mask* m);
52 | }
53 |
54 | template <typename T, typename Config>
55 | __host__ __device__ void build_searchtree(const T* in, T* out, index size);
56 |
57 | template <typename T, typename Config>
58 | __host__ __device__ void count_buckets(const T* in, const T* tree, index* localcounts,
59 |                                        index* counts, poracle* oracles, index size);
60 |
61 | template <typename T, typename Config>
62 | __host__ __device__ void collect_bucket(const T* data, const poracle* oracles_packed,
63 |                                         const index* prefix_sum, T* out, index size, oracle bucket,
64 |                                         index* atomic);
65 |
66 | template <typename T, typename Config>
67 | __host__ __device__ void collect_bucket_indirect(const T* data, const poracle* oracles_packed,
68 |                                                  const index* prefix_sum, T* out, index size,
69 |                                                  const oracle* bucket, index* atomic);
70 |
71 | template <typename T, typename Config>
72 | __host__ __device__ void collect_buckets(const T* data, const poracle* oracles_packed,
73 |                                          const index* block_prefix_sum, const index* bucket_out_ranges,
74 |                                          T* out, index size, mask* buckets, index* atomic);
75 |
76 | template <typename T, typename Config>
77 | __host__ __device__ void ssss_merged(
78 |     const T* in,
79 |     T* out,
80 |     poracle* oracles,
81 |     index offset,
82 |     const index* ranks,
83 |     index rank_offset,
84 |     index rank_base,
85 |     const kernels::ssss_multi_aux<T, Config>* aux_in,
86 |     kernels::ssss_multi_aux<T, Config>* aux_outs,
87 |     T* out_trees);
88 |
89 | template <typename T, typename Config>
90 | void sampleselect(T* in, T* tmp, T* tree, index* count_tmp, index size, index rank, T* out);
91 |
92 | template <typename T, typename Config>
93 | void sampleselect_host(T* in, T* tmp, T* tree, index* count_tmp, index size, index rank, T* out);
94 |
95 | template <typename T, typename Config>
96 | void sampleselect_multi(T* in, T* tmp, index size, const index* ranks, index rank_count, index* tmp_storage, index* aux_storage, index* aux_atomic, T* out);
97 |
98 | template <typename T, typename Config>
99 | __device__ __host__ void partition(const T* in, T* out, index* counts, index size, T pivot);
100 |
101 | template <typename T, typename Config>
102 | void quickselect_multi(T* in, T* tmp, index* count_tmp, index size, const index* ranks, index rank_count, T* out);
103 |
104 | template <typename T, typename Config>
105 | void quickselect(T* in, T* tmp, index* count_tmp, index size, index rank, T* out);
106 |
107 | template <typename Config>
108 | __host__ __device__ launch_parameters get_launch_parameters(index size);
109 |
110 | } // namespace gpu
111 |
112 | #endif // LAUNCHER_FWD_CUH
--------------------------------------------------------------------------------
/include/verification.hpp:
--------------------------------------------------------------------------------
1 | /*
2 | * Parallel
selection algorithm on GPUs 3 | * Copyright (c) 2018-2019 Tobias Ribizel (oss@ribizel.de) 4 | * 5 | * This program is free software: you can redistribute it and/or modify 6 | * it under the terms of the GNU General Public License as published by 7 | * the Free Software Foundation, version 3. 8 | * 9 | * This program is distributed in the hope that it will be useful, but 10 | * WITHOUT ANY WARRANTY; without even the implied warranty of 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 12 | * General Public License for more details. 13 | * 14 | * You should have received a copy of the GNU General Public License 15 | * along with this program. If not, see . 16 | */ 17 | #ifndef GPU_SELECTION_VERIFICATION_HPP 18 | #define GPU_SELECTION_VERIFICATION_HPP 19 | 20 | #include 21 | #include 22 | #include 23 | 24 | namespace verification { 25 | 26 | using gpu::index; 27 | using gpu::mask; 28 | 29 | template 30 | std::pair count_mispartitioned(const std::vector& data, int pivot_rank, T pivot); 31 | 32 | template 33 | T nth_element(const std::vector& data, int rank); 34 | 35 | template 36 | std::vector nth_elements(const std::vector& data, std::vector ranks); 37 | 38 | template 39 | int count_not_in_bucket(const std::vector& data, T lower, T upper); 40 | 41 | template 42 | std::vector count_not_in_buckets(const std::vector& data, std::vector prefix_sum, const std::vector& searchtree); 43 | 44 | bool verify_rank_ranges(const std::vector& ranks, const std::vector& index_ranges, const std::vector& rank_ranges); 45 | 46 | } // namespace verification 47 | 48 | #endif // GPU_SELECTION_VERIFICATION_HPP 49 | -------------------------------------------------------------------------------- /lib/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_library(gpu_selection 2 | cpu_reference.cpp 3 | verification.cpp 4 | generated/gen0.cu 5 | generated/gen1.cu 6 | generated/gen2.cu 7 | generated/gen3.cu 8 | generated/gen4.cu 9 | generated/gen5.cu 10 | generated/gen6.cu 11 | generated/gen7.cu 12 | generated/gen8.cu 13 | generated/gen9.cu 14 | generated/gen10.cu 15 | generated/gen11.cu 16 | generated/gen12.cu 17 | generated/gen13.cu 18 | generated/gen14.cu 19 | generated/gen15.cu 20 | generated/gen16.cu 21 | generated/gen17.cu 22 | generated/gen18.cu 23 | generated/gen19.cu 24 | generated/gen20.cu 25 | generated/gen21.cu 26 | generated/gen22.cu 27 | generated/gen23.cu 28 | generated/gen24.cu 29 | generated/gen25.cu 30 | generated/gen26.cu 31 | generated/gen27.cu 32 | generated/gen28.cu 33 | generated/gen29.cu 34 | generated/gen30.cu 35 | generated/gen31.cu 36 | generated/gen32.cu 37 | generated/gen33.cu 38 | generated/gen34.cu 39 | generated/gen35.cu 40 | generated/gen36.cu 41 | generated/gen37.cu 42 | generated/gen38.cu 43 | generated/gen39.cu 44 | ) 45 | 46 | target_compile_features(gpu_selection PUBLIC cxx_std_14) 47 | set_target_properties(gpu_selection PROPERTIES CUDA_SEPARABLE_COMPILATION ON) 48 | 49 | target_include_directories(gpu_selection PUBLIC "${CMAKE_CURRENT_SOURCE_DIR}") 50 | target_include_directories(gpu_selection PUBLIC "${CMAKE_CURRENT_SOURCE_DIR}/../include") 51 | -------------------------------------------------------------------------------- /lib/generated/gen0.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | namespace gpu { 9 | 10 | template void sampleselect>(double* in, double* tmp, double* 
tree, index* count_tmp, index size, index rank, double* out); 11 | template __host__ __device__ void collect_bucket_indirect>(const float* data, const poracle* oracles_packed, const index* prefix_sum, float* out, index size, const oracle* bucket, index* atomic); 12 | template void sampleselect>(double* in, double* tmp, double* tree, index* count_tmp, index size, index rank, double* out); 13 | template void sampleselect>(float* in, float* tmp, float* tree, index* count_tmp, index size, index rank, float* out); 14 | template __host__ __device__ void count_buckets>(const float* in, const float* tree, index* localcounts, index* counts, poracle* oracles, index size); 15 | template __host__ __device__ void collect_bucket>(const double* data, const poracle* oracles_packed, const index* prefix_sum, double* out, index size, oracle bucket, index* atomic); 16 | template __global__ void kernels::partition_prefixsum>(index* counts, index block_count); 17 | template void sampleselect>(float* in, float* tmp, float* tree, index* count_tmp, index size, index rank, float* out); 18 | template __host__ __device__ void collect_bucket>(const double* data, const poracle* oracles_packed, const index* prefix_sum, double* out, index size, oracle bucket, index* atomic); 19 | template void quickselect_multi>(float* in, float* tmp, index* count_tmp, index size, const index* ranks, index rank_count, float* out); 20 | template __global__ void kernels::partition_prefixsum>(index* counts, index block_count); 21 | } -------------------------------------------------------------------------------- /lib/generated/gen1.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | namespace gpu { 9 | template __host__ __device__ launch_parameters get_launch_parameters>(index size); 10 | template void quickselect>(float* in, float* tmp, index* count_tmp, index size, index rank, float* out); 11 | template __device__ __host__ void partition>(const float* in, float* out, index* counts, index size, float pivot); 12 | template __global__ void kernels::count_buckets>(const double* in, const double* tree, index* counts, poracle* oracles, index size, index workcount); 13 | template __host__ __device__ launch_parameters get_launch_parameters>(index size); 14 | template __global__ void kernels::partition>(const double* in, double* out, index* atomic, index size, double pivot, index workcount); 15 | template __global__ void kernels::count_buckets>(const float* in, const float* tree, index* counts, poracle* oracles, index size, index workcount); 16 | template __global__ void kernels::count_buckets>(const float* in, const float* tree, index* counts, poracle* oracles, index size, index workcount); 17 | template void sampleselect_multi>(float* in, float* tmp, index size, const index* ranks, index rank_count, index* tmp_storage, index* aux_storage, index* aux_atomic, float* out); 18 | template __host__ __device__ launch_parameters get_launch_parameters>(index size); 19 | template void sampleselect>(float* in, float* tmp, float* tree, index* count_tmp, index size, index rank, float* out); 20 | template void sampleselect_multi>(float* in, float* tmp, index size, const index* ranks, index rank_count, index* tmp_storage, index* aux_storage, index* aux_atomic, float* out); 21 | } -------------------------------------------------------------------------------- /lib/generated/gen10.cu: 
-------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | namespace gpu { 9 | template void sampleselect_multi>(double* in, double* tmp, index size, const index* ranks, index rank_count, index* tmp_storage, index* aux_storage, index* aux_atomic, double* out); 10 | template __host__ __device__ void collect_bucket_indirect>(const float* data, const poracle* oracles_packed, const index* prefix_sum, float* out, index size, const oracle* bucket, index* atomic); 11 | template void sampleselect_multi>(double* in, double* tmp, index size, const index* ranks, index rank_count, index* tmp_storage, index* aux_storage, index* aux_atomic, double* out); 12 | template void sampleselect_host>(double* in, double* tmp, double* tree, index* count_tmp, index size, index rank, double* out); 13 | template __host__ __device__ void build_searchtree>(const float* in, float* out, index size); 14 | template __host__ __device__ void ssss_merged>(const float* in, float* out, poracle* oracles, index offset, const index* ranks, index rank_offset, index rank_base, const kernels::ssss_multi_aux>* aux_in, kernels::ssss_multi_aux>* aux_outs, float* out_tree); 15 | template void quickselect>(double* in, double* tmp, index* count_tmp, index size, index rank, double* out); 16 | template void sampleselect>(float* in, float* tmp, float* tree, index* count_tmp, index size, index rank, float* out); 17 | template __host__ __device__ void collect_bucket_indirect>(const double* data, const poracle* oracles_packed, const index* prefix_sum, double* out, index size, const oracle* bucket, index* atomic); 18 | template void sampleselect>(double* in, double* tmp, double* tree, index* count_tmp, index size, index rank, double* out); 19 | template __host__ __device__ void count_buckets>(const double* in, const double* tree, index* localcounts, index* counts, poracle* oracles, index size); 20 | template __host__ __device__ void collect_bucket>(const float* data, const poracle* oracles_packed, const index* prefix_sum, float* out, index size, oracle bucket, index* atomic); 21 | } -------------------------------------------------------------------------------- /lib/generated/gen11.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | namespace gpu { 9 | template __global__ void kernels::reduce_counts>(const index* in, index* out, index); 10 | template void quickselect>(double* in, double* tmp, index* count_tmp, index size, index rank, double* out); 11 | template void sampleselect>(double* in, double* tmp, double* tree, index* count_tmp, index size, index rank, double* out); 12 | template __host__ __device__ launch_parameters get_launch_parameters>(index size); 13 | template void quickselect>(float* in, float* tmp, index* count_tmp, index size, index rank, float* out); 14 | template __host__ __device__ void build_searchtree>(const float* in, float* out, index size); 15 | template void sampleselect_multi>(float* in, float* tmp, index size, const index* ranks, index rank_count, index* tmp_storage, index* aux_storage, index* aux_atomic, float* out); 16 | template __device__ void kernels::masked_prefix_sum<8>(index* counts, const mask* m); 17 | template void sampleselect_host>(float* in, float* tmp, float* tree, index* count_tmp, index size, index rank, float* out); 18 | template __global__ void 
kernels::count_buckets>(const float* in, const float* tree, index* counts, poracle* oracles, index size, index workcount); 19 | template void sampleselect_host>(double* in, double* tmp, double* tree, index* count_tmp, index size, index rank, double* out); 20 | } -------------------------------------------------------------------------------- /lib/generated/gen12.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | namespace gpu { 9 | template __global__ void kernels::count_buckets>(const float* in, const float* tree, index* counts, poracle* oracles, index size, index workcount); 10 | template void sampleselect>(double* in, double* tmp, double* tree, index* count_tmp, index size, index rank, double* out); 11 | template __host__ __device__ void collect_bucket>(const double* data, const poracle* oracles_packed, const index* prefix_sum, double* out, index size, oracle bucket, index* atomic); 12 | template __global__ void kernels::reduce_counts>(const index* in, index* out, index); 13 | template void quickselect>(float* in, float* tmp, index* count_tmp, index size, index rank, float* out); 14 | template void sampleselect_host>(float* in, float* tmp, float* tree, index* count_tmp, index size, index rank, float* out); 15 | template void sampleselect_multi>(double* in, double* tmp, index size, const index* ranks, index rank_count, index* tmp_storage, index* aux_storage, index* aux_atomic, double* out); 16 | template void sampleselect_multi>(float* in, float* tmp, index size, const index* ranks, index rank_count, index* tmp_storage, index* aux_storage, index* aux_atomic, float* out); 17 | template __host__ __device__ void build_searchtree>(const float* in, float* out, index size); 18 | template void quickselect_multi>(double* in, double* tmp, index* count_tmp, index size, const index* ranks, index rank_count, double* out); 19 | template void sampleselect>(float* in, float* tmp, float* tree, index* count_tmp, index size, index rank, float* out); 20 | } -------------------------------------------------------------------------------- /lib/generated/gen13.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | namespace gpu { 9 | template void sampleselect_host>(double* in, double* tmp, double* tree, index* count_tmp, index size, index rank, double* out); 10 | template __global__ void kernels::reduce_counts>(const index* in, index* out, index); 11 | template void sampleselect>(double* in, double* tmp, double* tree, index* count_tmp, index size, index rank, double* out); 12 | template __host__ __device__ void collect_buckets>(const double* data, const poracle* oracles_packed, const index* block_prefix_sum, const index* bucket_out_ranges, double* out, index size, mask* buckets, index* atomic); 13 | template void sampleselect>(float* in, float* tmp, float* tree, index* count_tmp, index size, index rank, float* out); 14 | template void sampleselect_host>(float* in, float* tmp, float* tree, index* count_tmp, index size, index rank, float* out); 15 | template __host__ __device__ void count_buckets>(const float* in, const float* tree, index* localcounts, index* counts, poracle* oracles, index size); 16 | template void sampleselect_multi>(float* in, float* tmp, index size, const index* ranks, index rank_count, index* tmp_storage, index* aux_storage, index* 
aux_atomic, float* out); 17 | template __host__ __device__ launch_parameters get_launch_parameters>(index size); 18 | template void quickselect_multi>(float* in, float* tmp, index* count_tmp, index size, const index* ranks, index rank_count, float* out); 19 | template __host__ __device__ void build_searchtree>(const float* in, float* out, index size); 20 | } -------------------------------------------------------------------------------- /lib/generated/gen14.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | namespace gpu { 9 | template void sampleselect_host>(float* in, float* tmp, float* tree, index* count_tmp, index size, index rank, float* out); 10 | template void sampleselect_multi>(double* in, double* tmp, index size, const index* ranks, index rank_count, index* tmp_storage, index* aux_storage, index* aux_atomic, double* out); 11 | template __host__ __device__ launch_parameters get_launch_parameters>(index size); 12 | template __global__ void kernels::prefix_sum_counts>(index* in, index* out, index); 13 | template __global__ void kernels::count_buckets>(const double* in, const double* tree, index* counts, poracle* oracles, index size, index workcount); 14 | template void sampleselect_host>(float* in, float* tmp, float* tree, index* count_tmp, index size, index rank, float* out); 15 | template void sampleselect_multi>(double* in, double* tmp, index size, const index* ranks, index rank_count, index* tmp_storage, index* aux_storage, index* aux_atomic, double* out); 16 | template void sampleselect>(float* in, float* tmp, float* tree, index* count_tmp, index size, index rank, float* out); 17 | template __global__ void partition_count>(const double* in, index* counts, index size, double pivot, index workcount); 18 | template __host__ __device__ void build_searchtree>(const double* in, double* out, index size); 19 | template void sampleselect_host>(float* in, float* tmp, float* tree, index* count_tmp, index size, index rank, float* out); 20 | } -------------------------------------------------------------------------------- /lib/generated/gen15.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | namespace gpu { 9 | template __global__ void kernels::partition_distr>(const float* in, float* out, const index* counts, index size, float pivot, index workcount); 10 | template void quickselect_multi>(float* in, float* tmp, index* count_tmp, index size, const index* ranks, index rank_count, float* out); 11 | template __global__ void kernels::partition>(const double* in, double* out, index* atomic, index size, double pivot, index workcount); 12 | template void sampleselect_multi>(double* in, double* tmp, index size, const index* ranks, index rank_count, index* tmp_storage, index* aux_storage, index* aux_atomic, double* out); 13 | template __host__ __device__ void collect_bucket>(const float* data, const poracle* oracles_packed, const index* prefix_sum, float* out, index size, oracle bucket, index* atomic); 14 | template __host__ __device__ void collect_bucket_indirect>(const float* data, const poracle* oracles_packed, const index* prefix_sum, float* out, index size, const oracle* bucket, index* atomic); 15 | template __global__ void kernels::reduce_counts>(const index* in, index* out, index); 16 | template void sampleselect_multi>(double* in, 
double* tmp, index size, const index* ranks, index rank_count, index* tmp_storage, index* aux_storage, index* aux_atomic, double* out); 17 | template void sampleselect_host>(float* in, float* tmp, float* tree, index* count_tmp, index size, index rank, float* out); 18 | template void quickselect>(float* in, float* tmp, index* count_tmp, index size, index rank, float* out); 19 | template __global__ void kernels::count_buckets>(const double* in, const double* tree, index* counts, poracle* oracles, index size, index workcount); 20 | } -------------------------------------------------------------------------------- /lib/generated/gen16.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | namespace gpu { 9 | template void sampleselect>(double* in, double* tmp, double* tree, index* count_tmp, index size, index rank, double* out); 10 | template void sampleselect>(float* in, float* tmp, float* tree, index* count_tmp, index size, index rank, float* out); 11 | template __host__ __device__ void build_searchtree>(const double* in, double* out, index size); 12 | template __global__ void kernels::count_buckets>(const float* in, const float* tree, index* counts, poracle* oracles, index size, index workcount); 13 | template void sampleselect_host>(float* in, float* tmp, float* tree, index* count_tmp, index size, index rank, float* out); 14 | template __host__ __device__ void build_searchtree>(const double* in, double* out, index size); 15 | template __host__ __device__ launch_parameters get_launch_parameters>(index size); 16 | template __host__ __device__ void collect_bucket>(const float* data, const poracle* oracles_packed, const index* prefix_sum, float* out, index size, oracle bucket, index* atomic); 17 | template __host__ __device__ void build_searchtree>(const float* in, float* out, index size); 18 | template __host__ __device__ void ssss_merged>(const float* in, float* out, poracle* oracles, index offset, const index* ranks, index rank_offset, index rank_base, const kernels::ssss_multi_aux>* aux_in, kernels::ssss_multi_aux>* aux_outs, float* out_tree); 19 | template void quickselect>(double* in, double* tmp, index* count_tmp, index size, index rank, double* out); 20 | } -------------------------------------------------------------------------------- /lib/generated/gen17.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | namespace gpu { 9 | template __host__ __device__ void count_buckets>(const float* in, const float* tree, index* localcounts, index* counts, poracle* oracles, index size); 10 | template __global__ void kernels::partition>(const float* in, float* out, index* atomic, index size, float pivot, index workcount); 11 | template void sampleselect>(double* in, double* tmp, double* tree, index* count_tmp, index size, index rank, double* out); 12 | template void sampleselect>(float* in, float* tmp, float* tree, index* count_tmp, index size, index rank, float* out); 13 | template void sampleselect_multi>(float* in, float* tmp, index size, const index* ranks, index rank_count, index* tmp_storage, index* aux_storage, index* aux_atomic, float* out); 14 | template void sampleselect_multi>(float* in, float* tmp, index size, const index* ranks, index rank_count, index* tmp_storage, index* aux_storage, index* aux_atomic, float* out); 15 | template void 
sampleselect_multi>(double* in, double* tmp, index size, const index* ranks, index rank_count, index* tmp_storage, index* aux_storage, index* aux_atomic, double* out); 16 | template __host__ __device__ void collect_bucket>(const float* data, const poracle* oracles_packed, const index* prefix_sum, float* out, index size, oracle bucket, index* atomic); 17 | template void sampleselect>(double* in, double* tmp, double* tree, index* count_tmp, index size, index rank, double* out); 18 | template __host__ __device__ void collect_bucket>(const double* data, const poracle* oracles_packed, const index* prefix_sum, double* out, index size, oracle bucket, index* atomic); 19 | template void sampleselect_host>(float* in, float* tmp, float* tree, index* count_tmp, index size, index rank, float* out); 20 | } -------------------------------------------------------------------------------- /lib/generated/gen18.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | namespace gpu { 9 | template __host__ __device__ void ssss_merged>(const float* in, float* out, poracle* oracles, index offset, const index* ranks, index rank_offset, index rank_base, const kernels::ssss_multi_aux>* aux_in, kernels::ssss_multi_aux>* aux_outs, float* out_tree); 10 | template void quickselect>(float* in, float* tmp, index* count_tmp, index size, index rank, float* out); 11 | template __global__ void kernels::reduce_counts>(const index* in, index* out, index); 12 | template __global__ void kernels::count_buckets>(const float* in, const float* tree, index* counts, poracle* oracles, index size, index workcount); 13 | template void sampleselect_multi>(float* in, float* tmp, index size, const index* ranks, index rank_count, index* tmp_storage, index* aux_storage, index* aux_atomic, float* out); 14 | template __global__ void kernels::count_buckets>(const double* in, const double* tree, index* counts, poracle* oracles, index size, index workcount); 15 | template void sampleselect_host>(double* in, double* tmp, double* tree, index* count_tmp, index size, index rank, double* out); 16 | template __global__ void partition_count>(const float* in, index* counts, index size, float pivot, index workcount); 17 | template __global__ void kernels::prefix_sum_counts>(index* in, index* out, index); 18 | template __host__ __device__ void collect_bucket>(const double* data, const poracle* oracles_packed, const index* prefix_sum, double* out, index size, oracle bucket, index* atomic); 19 | template void sampleselect_host>(double* in, double* tmp, double* tree, index* count_tmp, index size, index rank, double* out); 20 | } -------------------------------------------------------------------------------- /lib/generated/gen19.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | namespace gpu { 9 | template void quickselect>(double* in, double* tmp, index* count_tmp, index size, index rank, double* out); 10 | template void sampleselect_multi>(double* in, double* tmp, index size, const index* ranks, index rank_count, index* tmp_storage, index* aux_storage, index* aux_atomic, double* out); 11 | template void sampleselect_multi>(float* in, float* tmp, index size, const index* ranks, index rank_count, index* tmp_storage, index* aux_storage, index* aux_atomic, float* out); 12 | template __host__ __device__ void 
count_buckets>(const double* in, const double* tree, index* localcounts, index* counts, poracle* oracles, index size); 13 | template void sampleselect_host>(float* in, float* tmp, float* tree, index* count_tmp, index size, index rank, float* out); 14 | template __host__ __device__ void build_searchtree>(const float* in, float* out, index size); 15 | template void sampleselect>(float* in, float* tmp, float* tree, index* count_tmp, index size, index rank, float* out); 16 | template void sampleselect_multi>(double* in, double* tmp, index size, const index* ranks, index rank_count, index* tmp_storage, index* aux_storage, index* aux_atomic, double* out); 17 | template __global__ void kernels::partition_distr>(const double* in, double* out, const index* counts, index size, double pivot, index workcount); 18 | template void sampleselect>(double* in, double* tmp, double* tree, index* count_tmp, index size, index rank, double* out); 19 | template __device__ __host__ void partition>(const double* in, double* out, index* counts, index size, double pivot); 20 | } -------------------------------------------------------------------------------- /lib/generated/gen2.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | namespace gpu { 9 | template __host__ __device__ launch_parameters get_launch_parameters>(index size); 10 | template void sampleselect>(double* in, double* tmp, double* tree, index* count_tmp, index size, index rank, double* out); 11 | template void quickselect>(double* in, double* tmp, index* count_tmp, index size, index rank, double* out); 12 | template __host__ __device__ void collect_bucket>(const double* data, const poracle* oracles_packed, const index* prefix_sum, double* out, index size, oracle bucket, index* atomic); 13 | template void quickselect>(double* in, double* tmp, index* count_tmp, index size, index rank, double* out); 14 | template __host__ __device__ void build_searchtree>(const float* in, float* out, index size); 15 | template void sampleselect>(double* in, double* tmp, double* tree, index* count_tmp, index size, index rank, double* out); 16 | template void sampleselect_multi>(float* in, float* tmp, index size, const index* ranks, index rank_count, index* tmp_storage, index* aux_storage, index* aux_atomic, float* out); 17 | template void sampleselect_host>(double* in, double* tmp, double* tree, index* count_tmp, index size, index rank, double* out); 18 | template void sampleselect_host>(float* in, float* tmp, float* tree, index* count_tmp, index size, index rank, float* out); 19 | template __host__ __device__ launch_parameters get_launch_parameters>(index size); 20 | template void quickselect>(double* in, double* tmp, index* count_tmp, index size, index rank, double* out); 21 | } -------------------------------------------------------------------------------- /lib/generated/gen20.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | namespace gpu { 9 | template void quickselect_multi>(double* in, double* tmp, index* count_tmp, index size, const index* ranks, index rank_count, double* out); 10 | template __host__ __device__ void build_searchtree>(const double* in, double* out, index size); 11 | template void sampleselect>(float* in, float* tmp, float* tree, index* count_tmp, index size, index rank, float* out); 12 | template 
void sampleselect_host>(double* in, double* tmp, double* tree, index* count_tmp, index size, index rank, double* out); 13 | template void sampleselect_host>(float* in, float* tmp, float* tree, index* count_tmp, index size, index rank, float* out); 14 | template __global__ void kernels::partition_distr>(const double* in, double* out, const index* counts, index size, double pivot, index workcount); 15 | template __host__ __device__ void collect_buckets>(const double* data, const poracle* oracles_packed, const index* block_prefix_sum, const index* bucket_out_ranges, double* out, index size, mask* buckets, index* atomic); 16 | template void sampleselect_multi>(float* in, float* tmp, index size, const index* ranks, index rank_count, index* tmp_storage, index* aux_storage, index* aux_atomic, float* out); 17 | template __host__ __device__ void collect_bucket>(const float* data, const poracle* oracles_packed, const index* prefix_sum, float* out, index size, oracle bucket, index* atomic); 18 | template void sampleselect_multi>(double* in, double* tmp, index size, const index* ranks, index rank_count, index* tmp_storage, index* aux_storage, index* aux_atomic, double* out); 19 | template __host__ __device__ void collect_bucket>(const float* data, const poracle* oracles_packed, const index* prefix_sum, float* out, index size, oracle bucket, index* atomic); 20 | } -------------------------------------------------------------------------------- /lib/generated/gen21.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | namespace gpu { 9 | template __global__ void kernels::reduce_counts>(const index* in, index* out, index); 10 | template __host__ __device__ launch_parameters get_launch_parameters>(index size); 11 | template __host__ __device__ void collect_bucket>(const double* data, const poracle* oracles_packed, const index* prefix_sum, double* out, index size, oracle bucket, index* atomic); 12 | template __host__ __device__ void build_searchtree>(const float* in, float* out, index size); 13 | template void sampleselect_host>(float* in, float* tmp, float* tree, index* count_tmp, index size, index rank, float* out); 14 | template __global__ void kernels::count_buckets>(const float* in, const float* tree, index* counts, poracle* oracles, index size, index workcount); 15 | template __host__ __device__ void count_buckets>(const double* in, const double* tree, index* localcounts, index* counts, poracle* oracles, index size); 16 | template __global__ void kernels::reduce_counts>(const index* in, index* out, index); 17 | template __host__ __device__ void build_searchtree>(const double* in, double* out, index size); 18 | template void sampleselect_multi>(double* in, double* tmp, index size, const index* ranks, index rank_count, index* tmp_storage, index* aux_storage, index* aux_atomic, double* out); 19 | template __global__ void kernels::prefix_sum_counts>(index* in, index* out, index); 20 | } -------------------------------------------------------------------------------- /lib/generated/gen22.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | namespace gpu { 9 | template void sampleselect>(float* in, float* tmp, float* tree, index* count_tmp, index size, index rank, float* out); 10 | template __host__ __device__ void count_buckets>(const double* in, const 
double* tree, index* localcounts, index* counts, poracle* oracles, index size); 11 | template void sampleselect_host>(float* in, float* tmp, float* tree, index* count_tmp, index size, index rank, float* out); 12 | template __host__ __device__ void build_searchtree>(const double* in, double* out, index size); 13 | template void sampleselect_multi>(double* in, double* tmp, index size, const index* ranks, index rank_count, index* tmp_storage, index* aux_storage, index* aux_atomic, double* out); 14 | template __host__ __device__ void collect_buckets>(const double* data, const poracle* oracles_packed, const index* block_prefix_sum, const index* bucket_out_ranges, double* out, index size, mask* buckets, index* atomic); 15 | template __device__ __host__ void partition>(const double* in, double* out, index* counts, index size, double pivot); 16 | template __global__ void kernels::count_buckets>(const float* in, const float* tree, index* counts, poracle* oracles, index size, index workcount); 17 | template void sampleselect_multi>(double* in, double* tmp, index size, const index* ranks, index rank_count, index* tmp_storage, index* aux_storage, index* aux_atomic, double* out); 18 | template void sampleselect_host>(float* in, float* tmp, float* tree, index* count_tmp, index size, index rank, float* out); 19 | template __global__ void kernels::prefix_sum_counts>(index* in, index* out, index); 20 | } -------------------------------------------------------------------------------- /lib/generated/gen23.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | namespace gpu { 9 | template void sampleselect_host>(double* in, double* tmp, double* tree, index* count_tmp, index size, index rank, double* out); 10 | template __global__ void kernels::prefix_sum_counts>(index* in, index* out, index); 11 | template void sampleselect>(double* in, double* tmp, double* tree, index* count_tmp, index size, index rank, double* out); 12 | template void sampleselect_multi>(float* in, float* tmp, index size, const index* ranks, index rank_count, index* tmp_storage, index* aux_storage, index* aux_atomic, float* out); 13 | template void sampleselect>(double* in, double* tmp, double* tree, index* count_tmp, index size, index rank, double* out); 14 | template __host__ __device__ void collect_bucket>(const double* data, const poracle* oracles_packed, const index* prefix_sum, double* out, index size, oracle bucket, index* atomic); 15 | template __host__ __device__ launch_parameters get_launch_parameters>(index size); 16 | template __host__ __device__ void build_searchtree>(const double* in, double* out, index size); 17 | template __global__ void kernels::partition_distr>(const double* in, double* out, const index* counts, index size, double pivot, index workcount); 18 | template void sampleselect>(float* in, float* tmp, float* tree, index* count_tmp, index size, index rank, float* out); 19 | template __global__ void kernels::prefix_sum_counts>(index* in, index* out, index); 20 | } -------------------------------------------------------------------------------- /lib/generated/gen24.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | namespace gpu { 9 | template void sampleselect>(double* in, double* tmp, double* tree, index* count_tmp, index size, index rank, double* out); 10 | 
template void sampleselect_multi>(double* in, double* tmp, index size, const index* ranks, index rank_count, index* tmp_storage, index* aux_storage, index* aux_atomic, double* out); 11 | template __host__ __device__ void build_searchtree>(const float* in, float* out, index size); 12 | template void sampleselect_host>(double* in, double* tmp, double* tree, index* count_tmp, index size, index rank, double* out); 13 | template __device__ __host__ void partition>(const float* in, float* out, index* counts, index size, float pivot); 14 | template void sampleselect_host>(double* in, double* tmp, double* tree, index* count_tmp, index size, index rank, double* out); 15 | template __host__ __device__ void collect_buckets>(const float* data, const poracle* oracles_packed, const index* block_prefix_sum, const index* bucket_out_ranges, float* out, index size, mask* buckets, index* atomic); 16 | template __host__ __device__ launch_parameters get_launch_parameters>(index size); 17 | template void sampleselect_multi>(double* in, double* tmp, index size, const index* ranks, index rank_count, index* tmp_storage, index* aux_storage, index* aux_atomic, double* out); 18 | template __host__ __device__ void count_buckets>(const float* in, const float* tree, index* localcounts, index* counts, poracle* oracles, index size); 19 | template __device__ __host__ void partition>(const float* in, float* out, index* counts, index size, float pivot); 20 | } -------------------------------------------------------------------------------- /lib/generated/gen25.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | namespace gpu { 9 | template void sampleselect_host>(float* in, float* tmp, float* tree, index* count_tmp, index size, index rank, float* out); 10 | template void sampleselect>(double* in, double* tmp, double* tree, index* count_tmp, index size, index rank, double* out); 11 | template __device__ void kernels::masked_prefix_sum<9>(index* counts, const mask* m); 12 | template void quickselect_multi>(double* in, double* tmp, index* count_tmp, index size, const index* ranks, index rank_count, double* out); 13 | template __host__ __device__ void collect_bucket>(const float* data, const poracle* oracles_packed, const index* prefix_sum, float* out, index size, oracle bucket, index* atomic); 14 | template __device__ void kernels::masked_prefix_sum<10>(index* counts, const mask* m); 15 | template void sampleselect_multi>(double* in, double* tmp, index size, const index* ranks, index rank_count, index* tmp_storage, index* aux_storage, index* aux_atomic, double* out); 16 | template void sampleselect_multi>(float* in, float* tmp, index size, const index* ranks, index rank_count, index* tmp_storage, index* aux_storage, index* aux_atomic, float* out); 17 | template __host__ __device__ void build_searchtree>(const float* in, float* out, index size); 18 | template __host__ __device__ launch_parameters get_launch_parameters>(index size); 19 | template void sampleselect_host>(double* in, double* tmp, double* tree, index* count_tmp, index size, index rank, double* out); 20 | } -------------------------------------------------------------------------------- /lib/generated/gen26.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | namespace gpu { 9 | template void sampleselect>(double* 
in, double* tmp, double* tree, index* count_tmp, index size, index rank, double* out); 10 | template void sampleselect_host>(double* in, double* tmp, double* tree, index* count_tmp, index size, index rank, double* out); 11 | template void sampleselect_host>(float* in, float* tmp, float* tree, index* count_tmp, index size, index rank, float* out); 12 | template void sampleselect_multi>(float* in, float* tmp, index size, const index* ranks, index rank_count, index* tmp_storage, index* aux_storage, index* aux_atomic, float* out); 13 | template __global__ void partition_count>(const double* in, index* counts, index size, double pivot, index workcount); 14 | template __global__ void kernels::reduce_counts>(const index* in, index* out, index); 15 | template void sampleselect>(double* in, double* tmp, double* tree, index* count_tmp, index size, index rank, double* out); 16 | template void sampleselect_host>(float* in, float* tmp, float* tree, index* count_tmp, index size, index rank, float* out); 17 | template void sampleselect>(float* in, float* tmp, float* tree, index* count_tmp, index size, index rank, float* out); 18 | template void quickselect>(double* in, double* tmp, index* count_tmp, index size, index rank, double* out); 19 | template __host__ __device__ void build_searchtree>(const float* in, float* out, index size); 20 | } -------------------------------------------------------------------------------- /lib/generated/gen27.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | namespace gpu { 9 | template __host__ __device__ void build_searchtree>(const double* in, double* out, index size); 10 | template __global__ void kernels::partition_prefixsum>(index* counts, index block_count); 11 | template void sampleselect>(double* in, double* tmp, double* tree, index* count_tmp, index size, index rank, double* out); 12 | template void sampleselect_host>(double* in, double* tmp, double* tree, index* count_tmp, index size, index rank, double* out); 13 | template void quickselect_multi>(float* in, float* tmp, index* count_tmp, index size, const index* ranks, index rank_count, float* out); 14 | template void sampleselect_host>(double* in, double* tmp, double* tree, index* count_tmp, index size, index rank, double* out); 15 | template void sampleselect_host>(float* in, float* tmp, float* tree, index* count_tmp, index size, index rank, float* out); 16 | template __host__ __device__ void build_searchtree>(const double* in, double* out, index size); 17 | template __global__ void kernels::count_buckets>(const double* in, const double* tree, index* counts, poracle* oracles, index size, index workcount); 18 | template void sampleselect_multi>(float* in, float* tmp, index size, const index* ranks, index rank_count, index* tmp_storage, index* aux_storage, index* aux_atomic, float* out); 19 | template void sampleselect>(double* in, double* tmp, double* tree, index* count_tmp, index size, index rank, double* out); 20 | } -------------------------------------------------------------------------------- /lib/generated/gen28.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | namespace gpu { 9 | template __host__ __device__ void collect_bucket>(const double* data, const poracle* oracles_packed, const index* prefix_sum, double* out, index size, oracle bucket, 
index* atomic); 10 | template void sampleselect_host>(float* in, float* tmp, float* tree, index* count_tmp, index size, index rank, float* out); 11 | template void sampleselect_host>(double* in, double* tmp, double* tree, index* count_tmp, index size, index rank, double* out); 12 | template __host__ __device__ void collect_buckets>(const float* data, const poracle* oracles_packed, const index* block_prefix_sum, const index* bucket_out_ranges, float* out, index size, mask* buckets, index* atomic); 13 | template __device__ void kernels::masked_prefix_sum<7>(index* counts, const mask* m); 14 | template void quickselect>(double* in, double* tmp, index* count_tmp, index size, index rank, double* out); 15 | template void sampleselect_multi>(double* in, double* tmp, index size, const index* ranks, index rank_count, index* tmp_storage, index* aux_storage, index* aux_atomic, double* out); 16 | template __global__ void kernels::prefix_sum_counts>(index* in, index* out, index); 17 | template void sampleselect>(float* in, float* tmp, float* tree, index* count_tmp, index size, index rank, float* out); 18 | template void quickselect>(float* in, float* tmp, index* count_tmp, index size, index rank, float* out); 19 | template __host__ __device__ void collect_bucket_indirect>(const double* data, const poracle* oracles_packed, const index* prefix_sum, double* out, index size, const oracle* bucket, index* atomic); 20 | } -------------------------------------------------------------------------------- /lib/generated/gen29.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | namespace gpu { 9 | template __host__ __device__ void collect_bucket>(const float* data, const poracle* oracles_packed, const index* prefix_sum, float* out, index size, oracle bucket, index* atomic); 10 | template void sampleselect_host>(double* in, double* tmp, double* tree, index* count_tmp, index size, index rank, double* out); 11 | template __global__ void kernels::count_buckets>(const double* in, const double* tree, index* counts, poracle* oracles, index size, index workcount); 12 | template __host__ __device__ launch_parameters get_launch_parameters>(index size); 13 | template void sampleselect_host>(double* in, double* tmp, double* tree, index* count_tmp, index size, index rank, double* out); 14 | template void sampleselect>(float* in, float* tmp, float* tree, index* count_tmp, index size, index rank, float* out); 15 | template void sampleselect_host>(double* in, double* tmp, double* tree, index* count_tmp, index size, index rank, double* out); 16 | template void quickselect>(float* in, float* tmp, index* count_tmp, index size, index rank, float* out); 17 | template void sampleselect_multi>(double* in, double* tmp, index size, const index* ranks, index rank_count, index* tmp_storage, index* aux_storage, index* aux_atomic, double* out); 18 | template void sampleselect>(double* in, double* tmp, double* tree, index* count_tmp, index size, index rank, double* out); 19 | template __host__ __device__ void build_searchtree>(const double* in, double* out, index size); 20 | } -------------------------------------------------------------------------------- /lib/generated/gen3.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | namespace gpu { 9 | template void quickselect>(float* in, float* 
tmp, index* count_tmp, index size, index rank, float* out); 10 | template __host__ __device__ void collect_bucket_indirect>(const double* data, const poracle* oracles_packed, const index* prefix_sum, double* out, index size, const oracle* bucket, index* atomic); 11 | template __device__ __host__ void partition>(const double* in, double* out, index* counts, index size, double pivot); 12 | template void sampleselect_multi>(float* in, float* tmp, index size, const index* ranks, index rank_count, index* tmp_storage, index* aux_storage, index* aux_atomic, float* out); 13 | template void sampleselect_host>(double* in, double* tmp, double* tree, index* count_tmp, index size, index rank, double* out); 14 | template __global__ void kernels::reduce_counts>(const index* in, index* out, index); 15 | template void sampleselect_host>(float* in, float* tmp, float* tree, index* count_tmp, index size, index rank, float* out); 16 | template __host__ __device__ void collect_bucket>(const float* data, const poracle* oracles_packed, const index* prefix_sum, float* out, index size, oracle bucket, index* atomic); 17 | template void sampleselect_multi>(float* in, float* tmp, index size, const index* ranks, index rank_count, index* tmp_storage, index* aux_storage, index* aux_atomic, float* out); 18 | template void sampleselect_host>(double* in, double* tmp, double* tree, index* count_tmp, index size, index rank, double* out); 19 | template void sampleselect_multi>(float* in, float* tmp, index size, const index* ranks, index rank_count, index* tmp_storage, index* aux_storage, index* aux_atomic, float* out); 20 | template __global__ void kernels::count_buckets>(const double* in, const double* tree, index* counts, poracle* oracles, index size, index workcount); 21 | } -------------------------------------------------------------------------------- /lib/generated/gen30.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | namespace gpu { 9 | template __host__ __device__ launch_parameters get_launch_parameters>(index size); 10 | template void sampleselect>(float* in, float* tmp, float* tree, index* count_tmp, index size, index rank, float* out); 11 | template __global__ void kernels::count_buckets>(const double* in, const double* tree, index* counts, poracle* oracles, index size, index workcount); 12 | template void quickselect>(float* in, float* tmp, index* count_tmp, index size, index rank, float* out); 13 | template void sampleselect_host>(double* in, double* tmp, double* tree, index* count_tmp, index size, index rank, double* out); 14 | template __host__ __device__ void collect_buckets>(const float* data, const poracle* oracles_packed, const index* block_prefix_sum, const index* bucket_out_ranges, float* out, index size, mask* buckets, index* atomic); 15 | template __host__ __device__ launch_parameters get_launch_parameters>(index size); 16 | template void sampleselect>(float* in, float* tmp, float* tree, index* count_tmp, index size, index rank, float* out); 17 | template __host__ __device__ launch_parameters get_launch_parameters>(index size); 18 | template void sampleselect_multi>(double* in, double* tmp, index size, const index* ranks, index rank_count, index* tmp_storage, index* aux_storage, index* aux_atomic, double* out); 19 | template void sampleselect_multi>(float* in, float* tmp, index size, const index* ranks, index rank_count, index* tmp_storage, index* aux_storage, index* 
aux_atomic, float* out); 20 | } -------------------------------------------------------------------------------- /lib/generated/gen31.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | namespace gpu { 9 | template void sampleselect>(double* in, double* tmp, double* tree, index* count_tmp, index size, index rank, double* out); 10 | template void sampleselect_multi>(float* in, float* tmp, index size, const index* ranks, index rank_count, index* tmp_storage, index* aux_storage, index* aux_atomic, float* out); 11 | template void quickselect>(double* in, double* tmp, index* count_tmp, index size, index rank, double* out); 12 | template __host__ __device__ void collect_bucket>(const float* data, const poracle* oracles_packed, const index* prefix_sum, float* out, index size, oracle bucket, index* atomic); 13 | template __device__ __host__ void partition>(const double* in, double* out, index* counts, index size, double pivot); 14 | template void sampleselect>(float* in, float* tmp, float* tree, index* count_tmp, index size, index rank, float* out); 15 | template __host__ __device__ void collect_bucket>(const double* data, const poracle* oracles_packed, const index* prefix_sum, double* out, index size, oracle bucket, index* atomic); 16 | template void sampleselect_multi>(double* in, double* tmp, index size, const index* ranks, index rank_count, index* tmp_storage, index* aux_storage, index* aux_atomic, double* out); 17 | template __host__ __device__ void ssss_merged>(const double* in, double* out, poracle* oracles, index offset, const index* ranks, index rank_offset, index rank_base, const kernels::ssss_multi_aux>* aux_in, kernels::ssss_multi_aux>* aux_outs, double* out_tree); 18 | template __host__ __device__ launch_parameters get_launch_parameters>(index size); 19 | template void sampleselect>(float* in, float* tmp, float* tree, index* count_tmp, index size, index rank, float* out); 20 | } -------------------------------------------------------------------------------- /lib/generated/gen32.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | namespace gpu { 9 | template __host__ __device__ void ssss_merged>(const double* in, double* out, poracle* oracles, index offset, const index* ranks, index rank_offset, index rank_base, const kernels::ssss_multi_aux>* aux_in, kernels::ssss_multi_aux>* aux_outs, double* out_tree); 10 | template __host__ __device__ launch_parameters get_launch_parameters>(index size); 11 | template void sampleselect_multi>(double* in, double* tmp, index size, const index* ranks, index rank_count, index* tmp_storage, index* aux_storage, index* aux_atomic, double* out); 12 | template __global__ void kernels::prefix_sum_counts>(index* in, index* out, index); 13 | template void sampleselect_multi>(float* in, float* tmp, index size, const index* ranks, index rank_count, index* tmp_storage, index* aux_storage, index* aux_atomic, float* out); 14 | template void sampleselect>(double* in, double* tmp, double* tree, index* count_tmp, index size, index rank, double* out); 15 | template void sampleselect_multi>(float* in, float* tmp, index size, const index* ranks, index rank_count, index* tmp_storage, index* aux_storage, index* aux_atomic, float* out); 16 | template void sampleselect>(float* in, float* tmp, float* tree, index* count_tmp, 
index size, index rank, float* out); 17 | template __host__ __device__ launch_parameters get_launch_parameters>(index size); 18 | template __host__ __device__ void build_searchtree>(const double* in, double* out, index size); 19 | template void sampleselect>(float* in, float* tmp, float* tree, index* count_tmp, index size, index rank, float* out); 20 | } -------------------------------------------------------------------------------- /lib/generated/gen33.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | namespace gpu { 9 | template __global__ void kernels::partition>(const float* in, float* out, index* atomic, index size, float pivot, index workcount); 10 | template void sampleselect_host>(float* in, float* tmp, float* tree, index* count_tmp, index size, index rank, float* out); 11 | template void sampleselect>(float* in, float* tmp, float* tree, index* count_tmp, index size, index rank, float* out); 12 | template __device__ void kernels::masked_prefix_sum<6>(index* counts, const mask* m); 13 | template __host__ __device__ void collect_bucket>(const float* data, const poracle* oracles_packed, const index* prefix_sum, float* out, index size, oracle bucket, index* atomic); 14 | template __global__ void kernels::prefix_sum_counts>(index* in, index* out, index); 15 | template __global__ void kernels::partition>(const float* in, float* out, index* atomic, index size, float pivot, index workcount); 16 | template void sampleselect_multi>(float* in, float* tmp, index size, const index* ranks, index rank_count, index* tmp_storage, index* aux_storage, index* aux_atomic, float* out); 17 | template __global__ void kernels::partition_prefixsum>(index* counts, index block_count); 18 | template void sampleselect_multi>(float* in, float* tmp, index size, const index* ranks, index rank_count, index* tmp_storage, index* aux_storage, index* aux_atomic, float* out); 19 | template void sampleselect_host>(double* in, double* tmp, double* tree, index* count_tmp, index size, index rank, double* out); 20 | } -------------------------------------------------------------------------------- /lib/generated/gen34.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | namespace gpu { 9 | template void sampleselect_host>(float* in, float* tmp, float* tree, index* count_tmp, index size, index rank, float* out); 10 | template __host__ __device__ void collect_bucket>(const float* data, const poracle* oracles_packed, const index* prefix_sum, float* out, index size, oracle bucket, index* atomic); 11 | template __host__ __device__ void build_searchtree>(const double* in, double* out, index size); 12 | template void sampleselect_host>(double* in, double* tmp, double* tree, index* count_tmp, index size, index rank, double* out); 13 | template void sampleselect_host>(double* in, double* tmp, double* tree, index* count_tmp, index size, index rank, double* out); 14 | template __host__ __device__ void collect_bucket>(const float* data, const poracle* oracles_packed, const index* prefix_sum, float* out, index size, oracle bucket, index* atomic); 15 | template void quickselect>(double* in, double* tmp, index* count_tmp, index size, index rank, double* out); 16 | template __host__ __device__ void build_searchtree>(const double* in, double* out, index size); 17 | template __global__ 
void partition_count>(const double* in, index* counts, index size, double pivot, index workcount); 18 | template void sampleselect>(float* in, float* tmp, float* tree, index* count_tmp, index size, index rank, float* out); 19 | template __device__ __host__ void partition>(const float* in, float* out, index* counts, index size, float pivot); 20 | } -------------------------------------------------------------------------------- /lib/generated/gen35.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | namespace gpu { 9 | template void sampleselect_multi>(float* in, float* tmp, index size, const index* ranks, index rank_count, index* tmp_storage, index* aux_storage, index* aux_atomic, float* out); 10 | template void sampleselect_multi>(double* in, double* tmp, index size, const index* ranks, index rank_count, index* tmp_storage, index* aux_storage, index* aux_atomic, double* out); 11 | template __host__ __device__ launch_parameters get_launch_parameters>(index size); 12 | template __host__ __device__ void build_searchtree>(const float* in, float* out, index size); 13 | template __global__ void kernels::partition>(const double* in, double* out, index* atomic, index size, double pivot, index workcount); 14 | template void sampleselect>(float* in, float* tmp, float* tree, index* count_tmp, index size, index rank, float* out); 15 | template void sampleselect>(double* in, double* tmp, double* tree, index* count_tmp, index size, index rank, double* out); 16 | template void quickselect>(double* in, double* tmp, index* count_tmp, index size, index rank, double* out); 17 | template void quickselect>(double* in, double* tmp, index* count_tmp, index size, index rank, double* out); 18 | template __global__ void kernels::prefix_sum_counts>(index* in, index* out, index); 19 | template void sampleselect_multi>(float* in, float* tmp, index size, const index* ranks, index rank_count, index* tmp_storage, index* aux_storage, index* aux_atomic, float* out); 20 | } -------------------------------------------------------------------------------- /lib/generated/gen36.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | namespace gpu { 9 | template void quickselect>(float* in, float* tmp, index* count_tmp, index size, index rank, float* out); 10 | template void sampleselect>(double* in, double* tmp, double* tree, index* count_tmp, index size, index rank, double* out); 11 | template __host__ __device__ void count_buckets>(const float* in, const float* tree, index* localcounts, index* counts, poracle* oracles, index size); 12 | template __host__ __device__ void ssss_merged>(const double* in, double* out, poracle* oracles, index offset, const index* ranks, index rank_offset, index rank_base, const kernels::ssss_multi_aux>* aux_in, kernels::ssss_multi_aux>* aux_outs, double* out_tree); 13 | template void quickselect>(double* in, double* tmp, index* count_tmp, index size, index rank, double* out); 14 | template __host__ __device__ void collect_bucket>(const float* data, const poracle* oracles_packed, const index* prefix_sum, float* out, index size, oracle bucket, index* atomic); 15 | template __global__ void kernels::count_buckets>(const double* in, const double* tree, index* counts, poracle* oracles, index size, index workcount); 16 | template void 
sampleselect_host>(float* in, float* tmp, float* tree, index* count_tmp, index size, index rank, float* out); 17 | template void sampleselect_multi>(double* in, double* tmp, index size, const index* ranks, index rank_count, index* tmp_storage, index* aux_storage, index* aux_atomic, double* out); 18 | template __global__ void kernels::reduce_counts>(const index* in, index* out, index); 19 | template void sampleselect_host>(double* in, double* tmp, double* tree, index* count_tmp, index size, index rank, double* out); 20 | } -------------------------------------------------------------------------------- /lib/generated/gen37.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | namespace gpu { 9 | template __global__ void kernels::partition_distr>(const float* in, float* out, const index* counts, index size, float pivot, index workcount); 10 | template void sampleselect_host>(double* in, double* tmp, double* tree, index* count_tmp, index size, index rank, double* out); 11 | template void quickselect>(float* in, float* tmp, index* count_tmp, index size, index rank, float* out); 12 | template void sampleselect_multi>(double* in, double* tmp, index size, const index* ranks, index rank_count, index* tmp_storage, index* aux_storage, index* aux_atomic, double* out); 13 | template __global__ void partition_count>(const float* in, index* counts, index size, float pivot, index workcount); 14 | template __global__ void kernels::reduce_counts>(const index* in, index* out, index); 15 | template void sampleselect_multi>(double* in, double* tmp, index size, const index* ranks, index rank_count, index* tmp_storage, index* aux_storage, index* aux_atomic, double* out); 16 | template __host__ __device__ void ssss_merged>(const float* in, float* out, poracle* oracles, index offset, const index* ranks, index rank_offset, index rank_base, const kernels::ssss_multi_aux>* aux_in, kernels::ssss_multi_aux>* aux_outs, float* out_tree); 17 | template __global__ void partition_count>(const float* in, index* counts, index size, float pivot, index workcount); 18 | template __global__ void kernels::reduce_counts>(const index* in, index* out, index); 19 | template __host__ __device__ void count_buckets>(const float* in, const float* tree, index* localcounts, index* counts, poracle* oracles, index size); 20 | } -------------------------------------------------------------------------------- /lib/generated/gen38.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | namespace gpu { 9 | template __host__ __device__ launch_parameters get_launch_parameters>(index size); 10 | template void sampleselect_multi>(float* in, float* tmp, index size, const index* ranks, index rank_count, index* tmp_storage, index* aux_storage, index* aux_atomic, float* out); 11 | template __host__ __device__ void collect_bucket>(const double* data, const poracle* oracles_packed, const index* prefix_sum, double* out, index size, oracle bucket, index* atomic); 12 | template void quickselect>(float* in, float* tmp, index* count_tmp, index size, index rank, float* out); 13 | template __global__ void kernels::reduce_counts>(const index* in, index* out, index); 14 | template __host__ __device__ void collect_buckets>(const double* data, const poracle* oracles_packed, const index* block_prefix_sum, const index* 
bucket_out_ranges, double* out, index size, mask* buckets, index* atomic); 15 | template void sampleselect_multi>(float* in, float* tmp, index size, const index* ranks, index rank_count, index* tmp_storage, index* aux_storage, index* aux_atomic, float* out); 16 | template void sampleselect_multi>(double* in, double* tmp, index size, const index* ranks, index rank_count, index* tmp_storage, index* aux_storage, index* aux_atomic, double* out); 17 | template void sampleselect_multi>(float* in, float* tmp, index size, const index* ranks, index rank_count, index* tmp_storage, index* aux_storage, index* aux_atomic, float* out); 18 | template void sampleselect_host>(double* in, double* tmp, double* tree, index* count_tmp, index size, index rank, double* out); 19 | template __host__ __device__ void collect_bucket>(const double* data, const poracle* oracles_packed, const index* prefix_sum, double* out, index size, oracle bucket, index* atomic); 20 | } -------------------------------------------------------------------------------- /lib/generated/gen39.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | namespace gpu { 9 | template __global__ void kernels::count_buckets>(const double* in, const double* tree, index* counts, poracle* oracles, index size, index workcount); 10 | template void sampleselect>(float* in, float* tmp, float* tree, index* count_tmp, index size, index rank, float* out); 11 | template void sampleselect>(double* in, double* tmp, double* tree, index* count_tmp, index size, index rank, double* out); 12 | template __host__ __device__ void collect_bucket_indirect>(const float* data, const poracle* oracles_packed, const index* prefix_sum, float* out, index size, const oracle* bucket, index* atomic); 13 | template __global__ void kernels::count_buckets>(const float* in, const float* tree, index* counts, poracle* oracles, index size, index workcount); 14 | template __global__ void kernels::partition_distr>(const float* in, float* out, const index* counts, index size, float pivot, index workcount); 15 | template __host__ __device__ launch_parameters get_launch_parameters>(index size); 16 | template void sampleselect_host>(float* in, float* tmp, float* tree, index* count_tmp, index size, index rank, float* out); 17 | template __host__ __device__ launch_parameters get_launch_parameters>(index size); 18 | template __global__ void kernels::partition_distr>(const double* in, double* out, const index* counts, index size, double pivot, index workcount); 19 | template __global__ void kernels::prefix_sum_counts>(index* in, index* out, index); 20 | } -------------------------------------------------------------------------------- /lib/generated/gen4.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | namespace gpu { 9 | template __global__ void kernels::count_buckets>(const float* in, const float* tree, index* counts, poracle* oracles, index size, index workcount); 10 | template void sampleselect_host>(float* in, float* tmp, float* tree, index* count_tmp, index size, index rank, float* out); 11 | template void sampleselect_host>(float* in, float* tmp, float* tree, index* count_tmp, index size, index rank, float* out); 12 | template void sampleselect_multi>(float* in, float* tmp, index size, const index* ranks, index rank_count, index* 
tmp_storage, index* aux_storage, index* aux_atomic, float* out); 13 | template void sampleselect>(float* in, float* tmp, float* tree, index* count_tmp, index size, index rank, float* out); 14 | template void quickselect>(float* in, float* tmp, index* count_tmp, index size, index rank, float* out); 15 | template __global__ void kernels::count_buckets>(const float* in, const float* tree, index* counts, poracle* oracles, index size, index workcount); 16 | template void sampleselect>(double* in, double* tmp, double* tree, index* count_tmp, index size, index rank, double* out); 17 | template void sampleselect_host>(double* in, double* tmp, double* tree, index* count_tmp, index size, index rank, double* out); 18 | template __host__ __device__ void collect_buckets>(const float* data, const poracle* oracles_packed, const index* block_prefix_sum, const index* bucket_out_ranges, float* out, index size, mask* buckets, index* atomic); 19 | template __host__ __device__ void collect_bucket>(const double* data, const poracle* oracles_packed, const index* prefix_sum, double* out, index size, oracle bucket, index* atomic); 20 | template __host__ __device__ void build_searchtree>(const float* in, float* out, index size); 21 | } -------------------------------------------------------------------------------- /lib/generated/gen5.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | namespace gpu { 9 | template __host__ __device__ void collect_bucket>(const float* data, const poracle* oracles_packed, const index* prefix_sum, float* out, index size, oracle bucket, index* atomic); 10 | template __global__ void kernels::prefix_sum_counts>(index* in, index* out, index); 11 | template __global__ void kernels::count_buckets>(const double* in, const double* tree, index* counts, poracle* oracles, index size, index workcount); 12 | template __host__ __device__ launch_parameters get_launch_parameters>(index size); 13 | template __global__ void partition_count>(const float* in, index* counts, index size, float pivot, index workcount); 14 | template void sampleselect_multi>(double* in, double* tmp, index size, const index* ranks, index rank_count, index* tmp_storage, index* aux_storage, index* aux_atomic, double* out); 15 | template __host__ __device__ void collect_bucket>(const double* data, const poracle* oracles_packed, const index* prefix_sum, double* out, index size, oracle bucket, index* atomic); 16 | template __global__ void kernels::partition_distr>(const float* in, float* out, const index* counts, index size, float pivot, index workcount); 17 | template __global__ void kernels::count_buckets>(const float* in, const float* tree, index* counts, poracle* oracles, index size, index workcount); 18 | template void sampleselect>(double* in, double* tmp, double* tree, index* count_tmp, index size, index rank, double* out); 19 | template __host__ __device__ void collect_bucket>(const float* data, const poracle* oracles_packed, const index* prefix_sum, float* out, index size, oracle bucket, index* atomic); 20 | template __global__ void kernels::partition>(const double* in, double* out, index* atomic, index size, double pivot, index workcount); 21 | } -------------------------------------------------------------------------------- /lib/generated/gen6.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 
| #include 7 | #include 8 | namespace gpu { 9 | template __host__ __device__ void build_searchtree>(const float* in, float* out, index size); 10 | template void sampleselect>(double* in, double* tmp, double* tree, index* count_tmp, index size, index rank, double* out); 11 | template __global__ void kernels::prefix_sum_counts>(index* in, index* out, index); 12 | template __host__ __device__ void count_buckets>(const float* in, const float* tree, index* localcounts, index* counts, poracle* oracles, index size); 13 | template void sampleselect_multi>(float* in, float* tmp, index size, const index* ranks, index rank_count, index* tmp_storage, index* aux_storage, index* aux_atomic, float* out); 14 | template void quickselect_multi>(double* in, double* tmp, index* count_tmp, index size, const index* ranks, index rank_count, double* out); 15 | template __host__ __device__ void build_searchtree>(const float* in, float* out, index size); 16 | template void sampleselect>(float* in, float* tmp, float* tree, index* count_tmp, index size, index rank, float* out); 17 | template void sampleselect>(double* in, double* tmp, double* tree, index* count_tmp, index size, index rank, double* out); 18 | template void sampleselect_host>(float* in, float* tmp, float* tree, index* count_tmp, index size, index rank, float* out); 19 | template void sampleselect_multi>(double* in, double* tmp, index size, const index* ranks, index rank_count, index* tmp_storage, index* aux_storage, index* aux_atomic, double* out); 20 | template void sampleselect_host>(float* in, float* tmp, float* tree, index* count_tmp, index size, index rank, float* out); 21 | } -------------------------------------------------------------------------------- /lib/generated/gen7.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | namespace gpu { 9 | template void sampleselect_multi>(double* in, double* tmp, index size, const index* ranks, index rank_count, index* tmp_storage, index* aux_storage, index* aux_atomic, double* out); 10 | template void sampleselect>(float* in, float* tmp, float* tree, index* count_tmp, index size, index rank, float* out); 11 | template void sampleselect_host>(float* in, float* tmp, float* tree, index* count_tmp, index size, index rank, float* out); 12 | template __host__ __device__ void ssss_merged>(const double* in, double* out, poracle* oracles, index offset, const index* ranks, index rank_offset, index rank_base, const kernels::ssss_multi_aux>* aux_in, kernels::ssss_multi_aux>* aux_outs, double* out_tree); 13 | template __global__ void kernels::prefix_sum_counts>(index* in, index* out, index); 14 | template __host__ __device__ void collect_bucket_indirect>(const double* data, const poracle* oracles_packed, const index* prefix_sum, double* out, index size, const oracle* bucket, index* atomic); 15 | template void quickselect>(float* in, float* tmp, index* count_tmp, index size, index rank, float* out); 16 | template void sampleselect_host>(double* in, double* tmp, double* tree, index* count_tmp, index size, index rank, double* out); 17 | template __global__ void kernels::count_buckets>(const float* in, const float* tree, index* counts, poracle* oracles, index size, index workcount); 18 | template void sampleselect>(float* in, float* tmp, float* tree, index* count_tmp, index size, index rank, float* out); 19 | template __host__ __device__ void count_buckets>(const double* in, const double* tree, 
index* localcounts, index* counts, poracle* oracles, index size); 20 | template __global__ void kernels::count_buckets>(const double* in, const double* tree, index* counts, poracle* oracles, index size, index workcount); 21 | } -------------------------------------------------------------------------------- /lib/generated/gen8.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | namespace gpu { 9 | template void quickselect>(double* in, double* tmp, index* count_tmp, index size, index rank, double* out); 10 | template void sampleselect>(float* in, float* tmp, float* tree, index* count_tmp, index size, index rank, float* out); 11 | template __global__ void partition_count>(const double* in, index* counts, index size, double pivot, index workcount); 12 | template void sampleselect_host>(double* in, double* tmp, double* tree, index* count_tmp, index size, index rank, double* out); 13 | template void sampleselect_multi>(float* in, float* tmp, index size, const index* ranks, index rank_count, index* tmp_storage, index* aux_storage, index* aux_atomic, float* out); 14 | template void sampleselect_multi>(double* in, double* tmp, index size, const index* ranks, index rank_count, index* tmp_storage, index* aux_storage, index* aux_atomic, double* out); 15 | template void sampleselect>(float* in, float* tmp, float* tree, index* count_tmp, index size, index rank, float* out); 16 | template __global__ void kernels::reduce_counts>(const index* in, index* out, index); 17 | template void quickselect>(double* in, double* tmp, index* count_tmp, index size, index rank, double* out); 18 | template __global__ void kernels::count_buckets>(const float* in, const float* tree, index* counts, poracle* oracles, index size, index workcount); 19 | template __host__ __device__ launch_parameters get_launch_parameters>(index size); 20 | template __host__ __device__ void count_buckets>(const double* in, const double* tree, index* localcounts, index* counts, poracle* oracles, index size); 21 | } -------------------------------------------------------------------------------- /lib/generated/gen9.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | namespace gpu { 9 | template __global__ void kernels::partition>(const float* in, float* out, index* atomic, index size, float pivot, index workcount); 10 | template void sampleselect_host>(float* in, float* tmp, float* tree, index* count_tmp, index size, index rank, float* out); 11 | template void sampleselect_host>(float* in, float* tmp, float* tree, index* count_tmp, index size, index rank, float* out); 12 | template __host__ __device__ void build_searchtree>(const float* in, float* out, index size); 13 | template void sampleselect_host>(double* in, double* tmp, double* tree, index* count_tmp, index size, index rank, double* out); 14 | template __host__ __device__ void build_searchtree>(const double* in, double* out, index size); 15 | template void sampleselect>(double* in, double* tmp, double* tree, index* count_tmp, index size, index rank, double* out); 16 | template void sampleselect>(double* in, double* tmp, double* tree, index* count_tmp, index size, index rank, double* out); 17 | template void sampleselect_multi>(double* in, double* tmp, index size, const index* ranks, index rank_count, index* tmp_storage, index* 
aux_storage, index* aux_atomic, double* out); 18 | template void quickselect>(float* in, float* tmp, index* count_tmp, index size, index rank, float* out); 19 | template void sampleselect_host>(double* in, double* tmp, double* tree, index* count_tmp, index size, index rank, double* out); 20 | template void quickselect>(float* in, float* tmp, index* count_tmp, index size, index rank, float* out); 21 | } -------------------------------------------------------------------------------- /lib/qs_launchers.cuh: -------------------------------------------------------------------------------- 1 | /* 2 | * Parallel selection algorithm on GPUs 3 | * Copyright (c) 2018-2019 Tobias Ribizel (oss@ribizel.de) 4 | * 5 | * This program is free software: you can redistribute it and/or modify 6 | * it under the terms of the GNU General Public License as published by 7 | * the Free Software Foundation, version 3. 8 | * 9 | * This program is distributed in the hope that it will be useful, but 10 | * WITHOUT ANY WARRANTY; without even the implied warranty of 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 12 | * General Public License for more details. 13 | * 14 | * You should have received a copy of the GNU General Public License 15 | * along with this program. If not, see . 16 | */ 17 | #ifndef QS_LAUNCHERS_CUH 18 | #define QS_LAUNCHERS_CUH 19 | 20 | #include "qs_reduce.cuh" 21 | #include "qs_scan.cuh" 22 | 23 | namespace gpu { 24 | 25 | using kernels::partition; 26 | using kernels::partition_count; 27 | using kernels::partition_distr; 28 | using kernels::partition_prefixsum; 29 | 30 | template 31 | __device__ __host__ void partition(const T* in, T* out, index* counts, index size, T pivot) { 32 | auto bsize = Config::algorithm::max_block_size; 33 | auto nblocks = min(ceil_div(size, bsize), Config::algorithm::max_block_count); 34 | auto per_thread = ceil_div(size, nblocks * bsize); 35 | if (Config::algorithm::shared_memory) { 36 | partition_count<<>>(in, counts, size, pivot, per_thread); 37 | partition_prefixsum<<<1, Config::algorithm::max_block_count>>>(counts, nblocks); 38 | partition_distr<<>>(in, out, counts, size, pivot, per_thread); 39 | } else { 40 | partition<<>>(in, out, counts, size, pivot, per_thread); 41 | } 42 | } 43 | 44 | } // namespace gpu 45 | 46 | #endif // QS_LAUNCHERS_CUH -------------------------------------------------------------------------------- /lib/qs_recursion.cuh: -------------------------------------------------------------------------------- 1 | /* 2 | * Parallel selection algorithm on GPUs 3 | * Copyright (c) 2018-2019 Tobias Ribizel (oss@ribizel.de) 4 | * 5 | * This program is free software: you can redistribute it and/or modify 6 | * it under the terms of the GNU General Public License as published by 7 | * the Free Software Foundation, version 3. 8 | * 9 | * This program is distributed in the hope that it will be useful, but 10 | * WITHOUT ANY WARRANTY; without even the implied warranty of 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 12 | * General Public License for more details. 13 | * 14 | * You should have received a copy of the GNU General Public License 15 | * along with this program. If not, see . 
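The partition launcher above chooses between two strategies: with shared memory, a deterministic three-kernel pipeline (partition_count, then partition_prefixsum, then partition_distr); without it, a single partition kernel that orders its writes through two global atomic counters. Both produce the same output layout, sketched below as a sequential host-side reference. This is an illustration only, not part of the library; partition_reference is a hypothetical name, and note that neither GPU path is a stable partition.

#include <vector>

// Sequential sketch of the layout both partition paths produce: elements
// smaller than the pivot are packed at the front, everything else is packed
// backwards from the end (target_idx = size - 1 - rofs in the kernels).
void partition_reference(const std::vector<float>& in, std::vector<float>& out,
                         float pivot, int& lcount, int& rcount) {
    int n = static_cast<int>(in.size());
    out.resize(n);
    lcount = 0;
    rcount = 0;
    for (float el : in) {
        if (el < pivot) {
            out[lcount++] = el;          // left part grows from the front
        } else {
            out[n - 1 - rcount++] = el;  // right part grows from the back
        }
    }
}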
16 | */ 17 | #ifndef QS_RECURSION_CUH 18 | #define QS_RECURSION_CUH 19 | 20 | #include "qs_reduce.cuh" 21 | #include "qs_scan.cuh" 22 | #include "utils_prefixsum.cuh" 23 | #include "utils_sampling.cuh" 24 | #include "utils_basecase.cuh" 25 | #include "utils_search.cuh" 26 | 27 | namespace gpu { 28 | namespace kernels { 29 | 30 | template 31 | __global__ void quickselect_tailcall(T* in, T* tmp, 32 | index* count_tmp, index size, index rank, T pivot, 33 | T* out); 34 | 35 | template 36 | __device__ __forceinline__ void launch_quickselect(T* in, T* tmp, 37 | index* count_tmp, index size, 38 | index rank, T* out) { 39 | auto idx = threadIdx.x; 40 | // assert blockDim.x == warp_size 41 | 42 | if (size <= Config::basecase::size) { 43 | if (idx == 0) { 44 | select_bitonic_basecase<<<1, Config::basecase::launch_size>>>(in, size, rank, out); 45 | } 46 | } else { 47 | // find sample median 48 | auto pick_idx = random_pick_idx(idx, warp_size, size); 49 | auto pick = in[pick_idx]; 50 | auto local = pick; 51 | bitonic_helper_warp::sort(&local, false); 52 | auto pivot = shfl(full_mask, local, warp_size / 2); 53 | 54 | // determine the index of the sample median 55 | auto mask = ballot(full_mask, pick == pivot); 56 | auto pivot_idx = shfl(full_mask, pick_idx, __ffs(mask) - 1); 57 | if (idx > 0) { 58 | return; 59 | } 60 | // swap the sample median to the first position 61 | swap(in[pivot_idx], in[0]); 62 | // reset atomic counters 63 | if (!Config::algorithm::shared_memory) { 64 | count_tmp[0] = 0; 65 | count_tmp[1] = 0; 66 | } 67 | gpu::partition(in + 1, tmp, count_tmp, size - 1, pivot); 68 | quickselect_tailcall 69 | <<<1, warp_size>>>(in + 1, tmp, count_tmp, size - 1, rank, pivot, out); 70 | } 71 | } 72 | 73 | template 74 | __global__ void quickselect_tailcall(T* in, T* tmp, 75 | index* count_tmp, index size, index rank, T pivot, 76 | T* out) { 77 | if (threadIdx.x >= warp_size) { 78 | return; 79 | } 80 | 81 | auto lcount = count_tmp[0]; 82 | auto rcount = count_tmp[1]; 83 | if (rank == lcount) { 84 | if (threadIdx.x == 0) { 85 | *out = pivot; 86 | } 87 | } else if (rank < lcount) { 88 | launch_quickselect(tmp, in, count_tmp, lcount, rank, out); 89 | } else { 90 | launch_quickselect(tmp + lcount, in, count_tmp, rcount, rank - lcount - 1, out); 91 | } 92 | } 93 | 94 | template 95 | __global__ void quickselect(T* in, T* tmp, index* count_tmp, 96 | index size, index rank, T* out) { 97 | launch_quickselect(in, tmp, count_tmp, size, rank, out); 98 | } 99 | 100 | } // namespace kernels 101 | 102 | template 103 | void quickselect(T* in, T* tmp, index* count_tmp, index size, index rank, T* out) { 104 | kernels::quickselect<<<1, warp_size>>>(in, tmp, count_tmp, size, rank, out); 105 | } 106 | 107 | template 108 | index quickselect_alloc_size(index size) { 109 | return sizeof(index) * (Config::algorithm::max_block_count * 2 + 2); 110 | } 111 | 112 | } // namespace gpu 113 | 114 | #endif // QS_RECURSION_CUH 115 | -------------------------------------------------------------------------------- /lib/qs_recursion_multi.cuh: -------------------------------------------------------------------------------- 1 | /* 2 | * Parallel selection algorithm on GPUs 3 | * Copyright (c) 2018-2019 Tobias Ribizel (oss@ribizel.de) 4 | * 5 | * This program is free software: you can redistribute it and/or modify 6 | * it under the terms of the GNU General Public License as published by 7 | * the Free Software Foundation, version 3. 
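The recursion above can be summarized as follows: a warp picks a sample, sorts it with a bitonic network, and takes its median as pivot; the pivot is swapped to the front, the remaining size - 1 elements are partitioned, and quickselect_tailcall descends into the side containing the rank, adjusting the rank by lcount + 1 when going right. Below is a sequential host-side sketch of that control flow, with a simplified pivot (the middle element) standing in for the warp-level sample median; quickselect_reference is an illustrative name, not a library function.

#include <cstddef>
#include <utility>
#include <vector>

float quickselect_reference(std::vector<float> v, int rank) {
    while (v.size() > 1) {
        // simplified pivot choice; the kernels use a sorted warp-sized sample
        std::swap(v[v.size() / 2], v[0]);
        float pivot = v[0];
        std::vector<float> left, right;
        for (std::size_t i = 1; i < v.size(); ++i) {
            (v[i] < pivot ? left : right).push_back(v[i]);
        }
        int lcount = static_cast<int>(left.size());
        if (rank == lcount) {
            return pivot;  // the pivot itself is the sought element
        } else if (rank < lcount) {
            v = std::move(left);
        } else {
            rank -= lcount + 1;  // skip the left part and the pivot
            v = std::move(right);
        }
    }
    return v[0];
}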
8 | * 9 | * This program is distributed in the hope that it will be useful, but 10 | * WITHOUT ANY WARRANTY; without even the implied warranty of 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 12 | * General Public License for more details. 13 | * 14 | * You should have received a copy of the GNU General Public License 15 | * along with this program. If not, see . 16 | */ 17 | #ifndef QS_RECURSION_MULTI_CUH 18 | #define QS_RECURSION_MULTI_CUH 19 | 20 | #include "qs_recursion.cuh" 21 | 22 | namespace gpu { 23 | namespace kernels { 24 | 25 | template 26 | __global__ void quickselect_tailcall_multi(T* in, T* tmp, 27 | index* count_tmp, index size, const index* ranks, index rank_count, index rank_base, T pivot, 28 | T* out); 29 | 30 | template 31 | __device__ __forceinline__ void launch_quickselect_multi(T* in, T* tmp, 32 | index* count_tmp, index size, 33 | const index* ranks, index rank_count, index rank_base, T* out) { 34 | if (rank_count == 0) { 35 | return; 36 | } 37 | auto idx = threadIdx.x; 38 | // assert blockDim.x == warp_size 39 | 40 | if (size <= Config::basecase::size) { 41 | if (idx == 0) { 42 | select_bitonic_multiple_basecase<<<1, Config::basecase::launch_size>>>(in, size, ranks, rank_count, rank_base, out); 43 | } 44 | } else { 45 | // find sample median 46 | auto pick_idx = random_pick_idx(idx, warp_size, size); 47 | auto pick = in[pick_idx]; 48 | auto local = pick; 49 | bitonic_helper_warp::sort(&local, false); 50 | auto pivot = shfl(full_mask, local, warp_size / 2); 51 | 52 | // determine the index of the sample median 53 | auto mask = ballot(full_mask, pick == pivot); 54 | auto pivot_idx = shfl(full_mask, pick_idx, __ffs(mask) - 1); 55 | if (idx > 0) { 56 | return; 57 | } 58 | // swap the sample median to the first position 59 | swap(in[pivot_idx], in[0]); 60 | // reset atomic counters 61 | if (!Config::algorithm::shared_memory) { 62 | count_tmp[0] = 0; 63 | count_tmp[1] = 0; 64 | } 65 | gpu::partition(in + 1, tmp, count_tmp, size - 1, pivot); 66 | quickselect_tailcall_multi 67 | <<<2, warp_size>>>(in + 1, tmp, count_tmp, size - 1, ranks, rank_count, rank_base, pivot, out); 68 | } 69 | } 70 | 71 | template 72 | __global__ void quickselect_tailcall_multi(T* in, T* tmp, 73 | index* count_tmp, index size, const index* ranks, index rank_count, index rank_base, T pivot, 74 | T* out) { 75 | // assert blockDim.x == warp_size 76 | 77 | auto lcount = count_tmp[0]; 78 | auto rcount = count_tmp[1]; 79 | auto middle = binary_search(ranks, rank_count, lcount + rank_base); 80 | if (blockIdx.x == 0) { 81 | if (middle < rank_count && ranks[middle] == lcount + rank_base) { 82 | if (threadIdx.x == 0) { 83 | out[middle] = pivot; 84 | } 85 | if (middle < rank_count - 1) { 86 | launch_quickselect_multi(tmp + lcount, in + lcount, count_tmp + lcount, rcount, 87 | ranks + middle + 1, rank_count - middle - 1, rank_base + (lcount + 1), out + middle + 1); 88 | } 89 | } else { 90 | if (middle < rank_count) { 91 | launch_quickselect_multi(tmp + lcount, in + lcount, count_tmp + lcount, rcount, 92 | ranks + middle, rank_count - middle, rank_base + (lcount + 1), out + middle); 93 | } 94 | } 95 | } else { 96 | if (middle > 0) { 97 | launch_quickselect_multi(tmp, in, count_tmp, lcount, ranks, middle, rank_base, out); 98 | } 99 | } 100 | } 101 | 102 | template 103 | __global__ void quickselect_multi(T* in, T* tmp, index* count_tmp, 104 | index size, const index* ranks, index rank_count, T* out) { 105 | launch_quickselect_multi(in, tmp, count_tmp, size, ranks, rank_count, 0, out); 106 
| } 107 | 108 | } // namespace kernels 109 | 110 | template 111 | void quickselect_multi(T* in, T* tmp, index* count_tmp, index size, const index* ranks, index rank_count, T* out) { 112 | kernels::quickselect_multi<<<1, warp_size>>>(in, tmp, count_tmp, size, ranks, rank_count, out); 113 | } 114 | 115 | template 116 | index quickselect_alloc_size_multi(index size) { 117 | return sizeof(index) * (std::max(Config::algorithm::max_block_count * 2 + 2, size)); 118 | } 119 | 120 | } // namespace gpu 121 | 122 | #endif // QS_RECURSION_MULTI_CUH 123 | -------------------------------------------------------------------------------- /lib/qs_reduce.cuh: -------------------------------------------------------------------------------- 1 | /* 2 | * Parallel selection algorithm on GPUs 3 | * Copyright (c) 2018-2019 Tobias Ribizel (oss@ribizel.de) 4 | * 5 | * This program is free software: you can redistribute it and/or modify 6 | * it under the terms of the GNU General Public License as published by 7 | * the Free Software Foundation, version 3. 8 | * 9 | * This program is distributed in the hope that it will be useful, but 10 | * WITHOUT ANY WARRANTY; without even the implied warranty of 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 12 | * General Public License for more details. 13 | * 14 | * You should have received a copy of the GNU General Public License 15 | * along with this program. If not, see . 16 | */ 17 | #ifndef QS_REDUCE_CUH 18 | #define QS_REDUCE_CUH 19 | 20 | #include "utils_prefixsum.cuh" 21 | 22 | namespace gpu { 23 | namespace kernels { 24 | 25 | template 26 | __global__ void partition_prefixsum(index* counts, index block_count) { 27 | __shared__ index local_lcounts[Config::algorithm::max_block_count]; 28 | __shared__ index local_rcounts[Config::algorithm::max_block_count]; 29 | auto i = threadIdx.x; 30 | auto l = i >= block_count ? 0 : counts[2 * i]; 31 | auto r = i >= block_count ? 0 : counts[2 * i + 1]; 32 | local_lcounts[i] = l; 33 | local_rcounts[i] = r; 34 | small_prefix_sum(local_lcounts); 35 | small_prefix_sum(local_rcounts); 36 | __syncthreads(); 37 | if (i < block_count) { 38 | counts[2 * i + 2] = local_lcounts[i]; 39 | counts[2 * i + 3] = local_rcounts[i]; 40 | } 41 | // store the total sum at the beginning 42 | if (i == block_count - 1) { 43 | counts[0] = l + local_lcounts[i]; 44 | counts[1] = r + local_rcounts[i]; 45 | } 46 | } 47 | 48 | } // namespace kernels 49 | } // namespace gpu 50 | 51 | #endif // QS_REDUCE_CUH -------------------------------------------------------------------------------- /lib/qs_scan.cuh: -------------------------------------------------------------------------------- 1 | /* 2 | * Parallel selection algorithm on GPUs 3 | * Copyright (c) 2018-2019 Tobias Ribizel (oss@ribizel.de) 4 | * 5 | * This program is free software: you can redistribute it and/or modify 6 | * it under the terms of the GNU General Public License as published by 7 | * the Free Software Foundation, version 3. 8 | * 9 | * This program is distributed in the hope that it will be useful, but 10 | * WITHOUT ANY WARRANTY; without even the implied warranty of 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 12 | * General Public License for more details. 13 | * 14 | * You should have received a copy of the GNU General Public License 15 | * along with this program. If not, see . 
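partition_prefixsum packs everything the tail call needs into the single counts array: the per-block pairs written by partition_count at counts[2b] and counts[2b + 1] are replaced by their exclusive prefix sums, shifted forward by one pair (so block b reads its output offsets at counts[2b + 2] and counts[2b + 3] in partition_distr), and the grand totals end up at counts[0] and counts[1]. A sequential reference of that layout, assuming the same conventions; the helper name is illustrative.

#include <vector>

void partition_prefixsum_reference(std::vector<int>& counts, int block_count) {
    // read the per-block pairs first, since the shifted writes below overlap them
    std::vector<int> l(block_count), r(block_count);
    for (int b = 0; b < block_count; ++b) {
        l[b] = counts[2 * b];
        r[b] = counts[2 * b + 1];
    }
    int lsum = 0, rsum = 0;
    for (int b = 0; b < block_count; ++b) {
        counts[2 * b + 2] = lsum;  // exclusive prefix sums, shifted by one pair
        counts[2 * b + 3] = rsum;
        lsum += l[b];
        rsum += r[b];
    }
    counts[0] = lsum;  // grand totals, read by quickselect_tailcall
    counts[1] = rsum;
}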
16 | */ 17 | #ifndef QS_SCAN_CUH 18 | #define QS_SCAN_CUH 19 | 20 | #include "utils_warpaggr.cuh" 21 | #include "utils_work.cuh" 22 | 23 | namespace gpu { 24 | namespace kernels { 25 | 26 | template 27 | __device__ void partition_impl(const T* in, index size, T pivot, index workcount, 28 | Callback callback) { 29 | blockwise_work(workcount, size, [&](index idx, mask amask) { 30 | auto el = in[idx]; 31 | bool left = el < pivot; 32 | auto lmask = ballot(amask, left); 33 | auto rmask = lmask ^ amask; 34 | 35 | callback(el, left, amask, lmask, rmask); 36 | }); 37 | } 38 | 39 | template 40 | __global__ void partition(const T* in, T* out, index* atomic, 41 | index size, T pivot, index workcount) { 42 | partition_impl(in, size, pivot, workcount, 43 | [&](T el, bool l, mask amask, mask lm, mask rm) { 44 | auto lofs = warp_aggr_atomic_count_mask(atomic, amask, lm); 45 | auto rofs = warp_aggr_atomic_count_mask(atomic + 1, amask, rm); 46 | auto target_idx = l ? lofs : size - 1 - rofs; 47 | out[target_idx] = el; 48 | }); 49 | } 50 | 51 | template 52 | __global__ void partition_count(const T* in, index* counts, index size, 53 | T pivot, index workcount) { 54 | __shared__ index lcount, rcount; 55 | if (threadIdx.x == 0) { 56 | lcount = 0; 57 | rcount = 0; 58 | } 59 | __syncthreads(); 60 | partition_impl(in, size, pivot, workcount, 61 | [&](T el, bool l, mask amask, mask lm, mask rm) { 62 | if (threadIdx.x % warp_size == 0) { 63 | atomicAdd(&lcount, __popc(lm)); 64 | atomicAdd(&rcount, __popc(rm)); 65 | } 66 | }); 67 | __syncthreads(); 68 | if (threadIdx.x == 0) { 69 | counts[2 * blockIdx.x] = lcount; 70 | counts[2 * blockIdx.x + 1] = rcount; 71 | } 72 | } 73 | 74 | template 75 | __global__ void partition_distr(const T* in, T* out, 76 | const index* counts, index size, T pivot, 77 | index workcount) { 78 | __shared__ index lcount, rcount; 79 | if (threadIdx.x == 0) { 80 | lcount = counts[2 * blockIdx.x + 2]; 81 | rcount = counts[2 * blockIdx.x + 3]; 82 | } 83 | __syncthreads(); 84 | partition_impl(in, size, pivot, workcount, 85 | [&](T el, bool l, mask amask, mask lm, mask rm) { 86 | auto lofs = warp_aggr_atomic_count_mask(&lcount, amask, lm); 87 | auto rofs = warp_aggr_atomic_count_mask(&rcount, amask, rm); 88 | auto target_idx = l ? lofs : size - 1 - rofs; 89 | out[target_idx] = el; 90 | }); 91 | } 92 | 93 | } // namespace kernels 94 | } // namespace gpu 95 | 96 | #endif // QS_SCAN_CUH -------------------------------------------------------------------------------- /lib/ssss_build_searchtree.cuh: -------------------------------------------------------------------------------- 1 | /* 2 | * Parallel selection algorithm on GPUs 3 | * Copyright (c) 2018-2019 Tobias Ribizel (oss@ribizel.de) 4 | * 5 | * This program is free software: you can redistribute it and/or modify 6 | * it under the terms of the GNU General Public License as published by 7 | * the Free Software Foundation, version 3. 8 | * 9 | * This program is distributed in the hope that it will be useful, but 10 | * WITHOUT ANY WARRANTY; without even the implied warranty of 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 12 | * General Public License for more details. 13 | * 14 | * You should have received a copy of the GNU General Public License 15 | * along with this program. If not, see . 
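The kernels above avoid one atomicAdd per element: each warp ballots its left/right decisions into lmask and rmask, the group leader performs a single atomicAdd per side, and every other lane derives its slot from the popcount of the mask bits below its lane index (prefix_popc in utils_warpaggr.cuh). A host-side emulation of the per-lane offset computation, assuming C++20 <bit>; lane_offset is an illustrative name, and base stands for the value the leader obtained from its atomicAdd.

#include <bit>
#include <cstdint>

int lane_offset(std::uint32_t lmask, int lane, int base) {
    // count the lanes below `lane` whose elements also go to the same side
    std::uint32_t below = lmask & ((1u << lane) - 1);
    return base + std::popcount(below);
}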
16 | */ 17 | #ifndef SSSS_BUILD_SEARCHTREE_CUH 18 | #define SSSS_BUILD_SEARCHTREE_CUH 19 | 20 | #include "utils_sampling.cuh" 21 | #include "utils_sort.cuh" 22 | #include "utils_work.cuh" 23 | 24 | namespace gpu { 25 | namespace kernels { 26 | 27 | template 28 | __device__ __forceinline__ index searchtree_entry(index idx) { 29 | // determine the level by the node index 30 | // rationale: a complete binary tree with 2^k leaves has 2^k - 1 inner nodes 31 | // lvl == log2(idx + 1) 32 | auto lvl = 31 - __clz(idx + 1); 33 | // step == n / 2^lvl 34 | auto step = Config::searchtree::width >> lvl; 35 | // index within the level 36 | auto lvl_idx = idx - (1 << lvl) + 1; 37 | return lvl_idx * step + step / 2; 38 | } 39 | 40 | template 41 | __host__ __device__ bool is_equality_bucket(const T* leaves, index bucket_idx) { 42 | // first and last bucket can't definitely be checked to be equality buckets 43 | return bucket_idx > 0 && bucket_idx < Config::searchtree::width - 1 && leaves[bucket_idx + 1] == add_epsilon(leaves[bucket_idx]); 44 | } 45 | 46 | template 47 | __device__ void equality_bucket(T* leaves) { 48 | auto idx = threadIdx.x; 49 | if (idx < Config::searchtree::width && idx > 0) { 50 | // If we are the last in a sequence of equal elements, we add a small epsilon 51 | bool equality = leaves[idx] == leaves[idx - 1] && 52 | (idx == Config::searchtree::width - 1 || leaves[idx] < leaves[idx + 1]); 53 | if (equality) { 54 | leaves[idx] = add_epsilon(leaves[idx]); 55 | } 56 | } 57 | } 58 | 59 | template 60 | __device__ void build_searchtree_shared(const T* in, index size, T* tree) { 61 | __shared__ T sample_buffer[Config::sample::size]; 62 | static_assert(Config::sample::size >= Config::searchtree::width, "sample too small"); 63 | auto idx = threadIdx.x; 64 | 65 | // pick sample 66 | T local_buffer[Config::sample::local_size]; 67 | if (threadIdx.x * Config::sample::local_size < Config::sample::size) { 68 | for (auto i = 0; i < Config::sample::local_size; ++i) { 69 | local_buffer[i] = in[random_pick_idx(threadIdx.x * Config::sample::local_size + i, Config::sample::size, size)]; 70 | } 71 | } 72 | // sort sample 73 | using sorter = bitonic_helper_global; 74 | sorter::sort(local_buffer, sample_buffer, false); 75 | __syncthreads(); 76 | // pick splitters from sorted sample 77 | if (idx < Config::searchtree::width) { 78 | tree[idx + Config::searchtree::width - 1] = sample_buffer[uniform_pick_idx(idx, 79 | Config::searchtree::width, Config::sample::size)]; 80 | } 81 | __syncthreads(); 82 | // create equality bucket if necessary 83 | equality_bucket(tree + (Config::searchtree::width - 1)); 84 | __syncthreads(); 85 | // inner nodes 86 | if (idx < Config::searchtree::width - 1) { 87 | tree[idx] = tree[searchtree_entry(idx) + Config::searchtree::width - 1]; 88 | } 89 | } 90 | 91 | template 92 | __global__ void build_searchtree(const T* in, T* out, index size) { 93 | build_searchtree_shared(in, size, out); 94 | } 95 | 96 | } // namespace kernels 97 | } // namespace gpu 98 | 99 | #endif // SSSS_BUILD_SEARCHTREE_CU -------------------------------------------------------------------------------- /lib/ssss_collect.cuh: -------------------------------------------------------------------------------- 1 | /* 2 | * Parallel selection algorithm on GPUs 3 | * Copyright (c) 2018-2019 Tobias Ribizel (oss@ribizel.de) 4 | * 5 | * This program is free software: you can redistribute it and/or modify 6 | * it under the terms of the GNU General Public License as published by 7 | * the Free Software Foundation, version 3. 
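searchtree_entry linearizes a complete binary tree stored in breadth-first order: node idx sits on level floor(log2(idx + 1)), each node on that level covers step = width / 2^lvl leaves, and the node's splitter is the middle leaf of its range. Below is a host-side copy with std::bit_width (C++20) in place of the device-only __clz, and the tree width made a parameter for illustration; for width = 8 it yields 4, 2, 6, 1, 3, 5, 7 for inner nodes 0..6, i.e. the root gets the median splitter and its children the quartiles.

#include <bit>

int searchtree_entry_reference(int idx, int width) {
    int lvl = std::bit_width(static_cast<unsigned>(idx + 1)) - 1;  // 31 - __clz(idx + 1)
    int step = width >> lvl;             // leaves covered per node on this level
    int lvl_idx = idx - (1 << lvl) + 1;  // position within the level
    return lvl_idx * step + step / 2;    // middle leaf of the node's range
}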
8 | * 9 | * This program is distributed in the hope that it will be useful, but 10 | * WITHOUT ANY WARRANTY; without even the implied warranty of 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 12 | * General Public License for more details. 13 | * 14 | * You should have received a copy of the GNU General Public License 15 | * along with this program. If not, see . 16 | */ 17 | #ifndef SSSS_COLLECT_CUH 18 | #define SSSS_COLLECT_CUH 19 | 20 | #include "ssss_reduce.cuh" 21 | #include "utils_bytestorage.cuh" 22 | #include "utils_warpaggr.cuh" 23 | #include "utils_work.cuh" 24 | 25 | namespace gpu { 26 | namespace kernels { 27 | 28 | template 29 | __device__ void 30 | collect_bucket_impl(const T* data, const poracle* oracles_packed, 31 | const index* prefix_sum, T* out, index size, 32 | oracle bucket, index* atomic, index workcount) { 33 | __shared__ index count; 34 | // initialize block-local count from prefix sum 35 | if (Config::algorithm::shared_memory && threadIdx.x == 0) { 36 | auto idx = partial_sum_idx(blockIdx.x, bucket, gridDim.x, Config::searchtree::width); 37 | count = prefix_sum[idx]; 38 | } 39 | __syncthreads(); 40 | // extract elements from the specified bucket 41 | blockwise_work(workcount, size, [&](index idx, mask amask) { 42 | // load bucket index 43 | auto packed = load_packed_bytes(oracles_packed, amask, idx); 44 | // determine target location 45 | index ofs{}; 46 | if (Config::algorithm::shared_memory) { 47 | ofs = warp_aggr_atomic_count_predicate(&count, amask, packed == bucket); 48 | } else { 49 | ofs = warp_aggr_atomic_count_predicate(atomic, amask, packed == bucket); 50 | } 51 | // store element 52 | if (packed == bucket) { 53 | out[ofs] = data[idx]; 54 | } 55 | }); 56 | } 57 | 58 | template 59 | __global__ void 60 | collect_bucket(const T* data, const poracle* oracles_packed, 61 | const index* prefix_sum, T* out, index size, oracle bucket, 62 | index* atomic, index workcount) { 63 | collect_bucket_impl(data, oracles_packed, prefix_sum, out, size, bucket, atomic, 64 | workcount); 65 | } 66 | 67 | template 68 | __global__ void 69 | collect_bucket_indirect(const T* data, const poracle* oracles_packed, 70 | const index* prefix_sum, T* out, index size, 71 | const oracle* bucket_ptr, index* atomic, index workcount) { 72 | collect_bucket_impl(data, oracles_packed, prefix_sum, out, size, *bucket_ptr, atomic, 73 | workcount); 74 | } 75 | 76 | } // namespace kernels 77 | } // namespace gpu 78 | 79 | #endif // SSSS_COLLECT_CUH -------------------------------------------------------------------------------- /lib/ssss_collect_multi.cuh: -------------------------------------------------------------------------------- 1 | /* 2 | * Parallel selection algorithm on GPUs 3 | * Copyright (c) 2018-2019 Tobias Ribizel (oss@ribizel.de) 4 | * 5 | * This program is free software: you can redistribute it and/or modify 6 | * it under the terms of the GNU General Public License as published by 7 | * the Free Software Foundation, version 3. 8 | * 9 | * This program is distributed in the hope that it will be useful, but 10 | * WITHOUT ANY WARRANTY; without even the implied warranty of 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 12 | * General Public License for more details. 13 | * 14 | * You should have received a copy of the GNU General Public License 15 | * along with this program. If not, see . 
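Stripped of the parallel bookkeeping, collect_bucket extracts exactly those elements whose precomputed bucket index (oracle) matches the chosen bucket; the per-block prefix sums (shared-memory path) or the global atomic merely parallelize the output offset. A sequential reference of what the kernel computes, with the caveat that the GPU version does not preserve element order; the helper name is illustrative.

#include <cstddef>
#include <cstdint>
#include <vector>

std::vector<float> collect_bucket_reference(const std::vector<float>& data,
                                            const std::vector<std::uint8_t>& oracles,
                                            std::uint8_t bucket) {
    std::vector<float> out;
    for (std::size_t i = 0; i < data.size(); ++i) {
        if (oracles[i] == bucket) {
            out.push_back(data[i]);
        }
    }
    return out;
}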
16 | */ 17 | #ifndef SSSS_COLLECT_MULTI_CUH 18 | #define SSSS_COLLECT_MULTI_CUH 19 | 20 | #include "ssss_reduce.cuh" 21 | #include "utils_bytestorage.cuh" 22 | #include "utils_warpaggr.cuh" 23 | #include "utils_work.cuh" 24 | #include "utils_mask.cuh" 25 | #include "utils_search.cuh" 26 | 27 | namespace gpu { 28 | namespace kernels { 29 | 30 | template 31 | __global__ void 32 | collect_buckets(const T* data, const poracle* oracles_packed, 33 | const index* block_prefix_sum, const index* bucket_out_ranges, 34 | T* out, index size, const mask* buckets, 35 | index* atomic, index workcount) { 36 | // initialize mask cache in shared memory 37 | constexpr auto mask_size = ceil_div(Config::searchtree::width, sizeof(mask) * 8); 38 | __shared__ mask shared_mask[mask_size]; 39 | static_assert(mask_size < 32, "mask too big, just a misconfiguration failsafe"); 40 | if (threadIdx.x < mask_size) { 41 | shared_mask[threadIdx.x] = buckets[threadIdx.x]; 42 | } 43 | 44 | // initialize block-local count from prefix sum 45 | __shared__ index count[Config::searchtree::width]; 46 | if (Config::algorithm::shared_memory) { 47 | blockwise_work_local(Config::searchtree::width, [&](index bucket) { 48 | auto base_idx = partial_sum_idx(blockIdx.x, bucket, gridDim.x, Config::searchtree::width); 49 | count[bucket] = bucket_out_ranges[bucket] + block_prefix_sum[base_idx]; 50 | }); 51 | } else { 52 | blockwise_work_local(Config::searchtree::width, [&](index bucket) { 53 | count[bucket] = bucket_out_ranges[bucket]; 54 | }); 55 | } 56 | 57 | __syncthreads(); 58 | 59 | // extract elements from the specified bucket 60 | blockwise_work(workcount, size, [&](index idx, mask amask) { 61 | // load bucket index 62 | auto bucket = load_packed_bytes(oracles_packed, amask, idx); 63 | // determine target location 64 | index ofs{}; 65 | if (check_mask(bucket, shared_mask)) { 66 | if (Config::algorithm::shared_memory) { 67 | ofs = atomicAdd(&count[bucket], 1); 68 | } else { 69 | ofs = atomicAdd(&atomic[bucket], 1) + count[bucket]; 70 | } 71 | // store element 72 | out[ofs] = data[idx]; 73 | } 74 | }); 75 | } 76 | 77 | } // namespace kernels 78 | } // namespace gpu 79 | 80 | #endif // SSSS_COLLECT_MULTI_CUH -------------------------------------------------------------------------------- /lib/ssss_count.cuh: -------------------------------------------------------------------------------- 1 | /* 2 | * Parallel selection algorithm on GPUs 3 | * Copyright (c) 2018-2019 Tobias Ribizel (oss@ribizel.de) 4 | * 5 | * This program is free software: you can redistribute it and/or modify 6 | * it under the terms of the GNU General Public License as published by 7 | * the Free Software Foundation, version 3. 8 | * 9 | * This program is distributed in the hope that it will be useful, but 10 | * WITHOUT ANY WARRANTY; without even the implied warranty of 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 12 | * General Public License for more details. 13 | * 14 | * You should have received a copy of the GNU General Public License 15 | * along with this program. If not, see . 
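The multi-rank variant above scatters all surviving buckets in one pass: a bucket whose bit is set in the mask gets its own output range starting at bucket_out_ranges[bucket], and elements are appended to their bucket's range as they are encountered. A sequential reference under the same conventions (out must be large enough to hold all selected buckets, order within a range is not preserved on the GPU, and the helper name is illustrative):

#include <cstddef>
#include <cstdint>
#include <vector>

void collect_buckets_reference(const std::vector<float>& data,
                               const std::vector<std::uint8_t>& oracles,
                               const std::vector<bool>& bucket_selected,
                               std::vector<int> next_out,  // copy of bucket_out_ranges
                               std::vector<float>& out) {
    for (std::size_t i = 0; i < data.size(); ++i) {
        auto b = oracles[i];
        if (bucket_selected[b]) {
            out[next_out[b]++] = data[i];  // append to this bucket's output range
        }
    }
}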
16 | */ 17 | #ifndef SSSS_COUNT_CUH 18 | #define SSSS_COUNT_CUH 19 | 20 | #include "ssss_reduce.cuh" 21 | #include "utils_bytestorage.cuh" 22 | #include "utils_warpaggr.cuh" 23 | #include "utils_work.cuh" 24 | 25 | namespace gpu { 26 | namespace kernels { 27 | 28 | template 29 | __device__ oracle searchtree_traversal(const T* searchtree, T el, mask amask, mask& equal_mask, T min_split, T max_split) { 30 | index i = 0; 31 | equal_mask = amask; 32 | if (Config::algorithm::bucket_select) { 33 | auto maxbucket = Config::searchtree::width - 1; 34 | auto floatbucket = (el - min_split) / (max_split - min_split) * Config::searchtree::width - T(0.5); 35 | floatbucket = floatbucket > maxbucket ? maxbucket : floatbucket; 36 | floatbucket = floatbucket < 0 ? 0 : floatbucket; 37 | auto bucket = oracle(floatbucket); 38 | for (index lvl = 0; lvl < Config::searchtree::height; ++lvl) { 39 | auto bit = (bucket >> lvl) & 1; 40 | equal_mask &= ballot(amask, bit) ^ (bit - 1); 41 | } 42 | return oracle(floatbucket); 43 | } else { 44 | auto root_splitter = searchtree[0]; 45 | bool next_smaller = el < root_splitter; 46 | for (index lvl = 0; lvl < Config::searchtree::height; ++lvl) { 47 | // compute next node index 48 | bool smaller = next_smaller; 49 | i = 2 * i + 2 - smaller; 50 | next_smaller = el < searchtree[i]; 51 | // update equality mask 52 | auto local_mask = ballot(amask, smaller) ^ (smaller - 1); 53 | equal_mask &= local_mask; 54 | } 55 | // return leaf rank 56 | return i - (Config::searchtree::width - 1); 57 | } 58 | } 59 | 60 | template 61 | __device__ __forceinline__ void ssss_impl(const T* in, const T* tree, 62 | index size, index workcount, BucketCallback bucket_cb) { 63 | __shared__ T local_tree[Config::algorithm::bucket_select ? 2 : Config::searchtree::size]; 64 | // Load searchtree into shared memory 65 | if (Config::algorithm::bucket_select) { 66 | if (threadIdx.x == 0) { 67 | local_tree[0] = tree[Config::searchtree::width]; 68 | local_tree[1] = tree[Config::searchtree::size - 1]; 69 | } 70 | } else { 71 | blockwise_work_local(Config::searchtree::size, [&](index i) { local_tree[i] = tree[i]; }); 72 | } 73 | __syncthreads(); 74 | // only for bucket select 75 | auto min_split = local_tree[0]; 76 | auto max_split = local_tree[1]; 77 | 78 | // Determine the bucket and equality mask for every entry 79 | blockwise_work(workcount, size, [&](index idx, mask amask) { 80 | mask equal_mask{}; 81 | auto bucket_idx = searchtree_traversal(local_tree, in[idx], amask, equal_mask, min_split, max_split); 82 | bucket_cb(idx, bucket_idx, amask, equal_mask); 83 | }); 84 | } 85 | 86 | template 87 | __global__ void count_buckets(const T* in, const T* tree, 88 | index* counts, poracle* oracles, index size, 89 | index workcount) { 90 | __shared__ index local_counts[Config::searchtree::width]; 91 | // Initialize shared-memory counts 92 | if (Config::algorithm::shared_memory) { 93 | blockwise_work_local(Config::searchtree::width, [&](index i) { local_counts[i] = 0; }); 94 | __syncthreads(); 95 | } 96 | // Traverse searchtree for every entry 97 | ssss_impl( 98 | in, tree, size, workcount, [&](index idx, oracle bucket, mask amask, mask mask) { 99 | static_assert(!Config::algorithm::write || Config::searchtree::height <= 8, 100 | "can't pack bucket idx into byte"); 101 | // Store oracles 102 | if (Config::algorithm::write) { 103 | store_packed_bytes(oracles, amask, bucket, idx); 104 | } 105 | // Increment bucket count 106 | index add = Config::algorithm::warp_aggr ? 
__popc(mask) : 1; 107 | if (!Config::algorithm::warp_aggr || is_group_leader(mask)) { 108 | if (Config::algorithm::shared_memory) { 109 | atomicAdd(&local_counts[bucket], add); 110 | } else { 111 | atomicAdd(&counts[bucket], add); 112 | } 113 | } 114 | }); 115 | // Write shared-memory counts to global memory 116 | if (Config::algorithm::shared_memory) { 117 | __syncthreads(); 118 | // store the local counts grouped by block idx 119 | blockwise_work_local(Config::searchtree::width, [&](oracle bucket) { 120 | auto idx = partial_sum_idx(blockIdx.x, bucket, gridDim.x, Config::searchtree::width); 121 | counts[idx] = local_counts[bucket]; 122 | }); 123 | } 124 | } 125 | 126 | } // namespace kernels 127 | } // namespace gpu 128 | 129 | #endif // SSSS_COUNT_CUH 130 | -------------------------------------------------------------------------------- /lib/ssss_launchers.cuh: -------------------------------------------------------------------------------- 1 | /* 2 | * Parallel selection algorithm on GPUs 3 | * Copyright (c) 2018-2019 Tobias Ribizel (oss@ribizel.de) 4 | * 5 | * This program is free software: you can redistribute it and/or modify 6 | * it under the terms of the GNU General Public License as published by 7 | * the Free Software Foundation, version 3. 8 | * 9 | * This program is distributed in the hope that it will be useful, but 10 | * WITHOUT ANY WARRANTY; without even the implied warranty of 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 12 | * General Public License for more details. 13 | * 14 | * You should have received a copy of the GNU General Public License 15 | * along with this program. If not, see . 16 | */ 17 | #ifndef SSSS_LAUNCHERS_CUH 18 | #define SSSS_LAUNCHERS_CUH 19 | 20 | #include "ssss_build_searchtree.cuh" 21 | #include "ssss_collect.cuh" 22 | #include "ssss_collect_multi.cuh" 23 | #include "ssss_count.cuh" 24 | #include "ssss_reduce.cuh" 25 | #include "ssss_merged.cuh" 26 | 27 | namespace gpu { 28 | 29 | template 30 | __host__ __device__ launch_parameters get_launch_parameters(index size) { 31 | launch_parameters result{}; 32 | result.block_size = Config::algorithm::max_block_size; 33 | result.block_count = min(ceil_div(size, result.block_size), Config::algorithm::max_block_count); 34 | auto threads = result.block_size * result.block_count; 35 | result.work_per_thread = ceil_div(size, threads); 36 | return result; 37 | } 38 | 39 | template 40 | __host__ __device__ void build_searchtree(const T* in, T* out, index size) { 41 | constexpr auto threads = Config::searchtree_kernel_size; 42 | static_assert(threads <= max_block_size, "Work won't fit into a single thread block"); 43 | kernels::build_searchtree<<<1, threads>>>(in, out, size); 44 | } 45 | 46 | template 47 | __host__ __device__ void count_buckets(const T* in, const T* tree, index* localcounts, 48 | index* counts, poracle* oracles, index size) { 49 | auto params = get_launch_parameters(size); 50 | if (Config::algorithm::shared_memory) { 51 | kernels::count_buckets<<>>( 52 | in, tree, localcounts, oracles, size, params.work_per_thread); 53 | constexpr auto reduce_bsize = 54 | min(Config::searchtree::width, Config::algorithm::max_block_count); 55 | constexpr auto reduce_blocks = ceil_div(Config::searchtree::width, reduce_bsize); 56 | if (Config::algorithm::write) { 57 | kernels::prefix_sum_counts 58 | <<>>(localcounts, counts, params.block_count); 59 | } else { 60 | kernels::reduce_counts 61 | <<>>(localcounts, counts, params.block_count); 62 | } 63 | } else { 64 | 
kernels::count_buckets<<>>( 65 | in, tree, counts, oracles, size, params.work_per_thread); 66 | } 67 | } 68 | 69 | template 70 | __host__ __device__ void collect_bucket(const T* data, const poracle* oracles_packed, 71 | const index* prefix_sum, T* out, index size, oracle bucket, 72 | index* atomic) { 73 | auto params = get_launch_parameters(size); 74 | kernels::collect_bucket<<>>( 75 | data, oracles_packed, prefix_sum, out, size, bucket, atomic, params.work_per_thread); 76 | } 77 | 78 | template 79 | __host__ __device__ void collect_bucket_indirect(const T* data, const poracle* oracles_packed, 80 | const index* prefix_sum, T* out, index size, 81 | const oracle* bucket, index* atomic) { 82 | auto params = get_launch_parameters(size); 83 | kernels::collect_bucket_indirect<<>>( 84 | data, oracles_packed, prefix_sum, out, size, bucket, atomic, params.work_per_thread); 85 | } 86 | 87 | template 88 | __host__ __device__ void collect_buckets(const T* data, const poracle* oracles_packed, 89 | const index* block_prefix_sum, const index* bucket_out_ranges, 90 | T* out, index size, mask* buckets, index* atomic) { 91 | auto params = get_launch_parameters(size); 92 | kernels::collect_buckets<<>>( 93 | data, oracles_packed, block_prefix_sum, bucket_out_ranges, out, size, buckets, atomic, params.work_per_thread); 94 | } 95 | 96 | template 97 | __host__ __device__ void ssss_merged( 98 | const T* in, 99 | T* out, 100 | poracle* oracles, 101 | index offset, 102 | const index* ranks, 103 | index rank_offset, 104 | index rank_base, 105 | const kernels::ssss_multi_aux* aux_in, 106 | kernels::ssss_multi_aux* aux_outs, 107 | T* out_trees) { 108 | kernels::ssss_merged_kernel<<>>( 109 | in, out, oracles, offset, ranks, rank_offset, rank_base, aux_in, aux_outs, out_trees); 110 | } 111 | 112 | } // namespace gpu 113 | 114 | #endif // SSSS_LAUNCHERS_CUH -------------------------------------------------------------------------------- /lib/ssss_merged_memory.cuh: -------------------------------------------------------------------------------- 1 | /* 2 | * Parallel selection algorithm on GPUs 3 | * Copyright (c) 2018-2019 Tobias Ribizel (oss@ribizel.de) 4 | * 5 | * This program is free software: you can redistribute it and/or modify 6 | * it under the terms of the GNU General Public License as published by 7 | * the Free Software Foundation, version 3. 8 | * 9 | * This program is distributed in the hope that it will be useful, but 10 | * WITHOUT ANY WARRANTY; without even the implied warranty of 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 12 | * General Public License for more details. 13 | * 14 | * You should have received a copy of the GNU General Public License 15 | * along with this program. If not, see . 16 | */ 17 | #ifndef SSSS_MERGED_MEMORY_CUH 18 | #define SSSS_MERGED_MEMORY_CUH 19 | 20 | #include 21 | #include "utils.cuh" 22 | 23 | namespace gpu { 24 | namespace kernels { 25 | 26 | template 27 | struct ssss_multi_aux { 28 | constexpr static auto mask_size = ceil_div(Config::searchtree::width, sizeof(mask) * 8); 29 | union { 30 | struct { 31 | T tree[Config::searchtree::size]; 32 | index bucket_counts[Config::searchtree::width + 1]; 33 | } stage1; 34 | struct { 35 | mask bucket_mask[mask_size]; 36 | index bucket_prefixsum[Config::searchtree::width + 1]; 37 | index bucket_masked_prefixsum[Config::searchtree::width + 1]; 38 | index rank_ranges[Config::searchtree::width + 1]; 39 | index atomic[Config::algorithm::shared_memory ? 
1 : Config::searchtree::width]; 40 | } stage2; 41 | }; 42 | }; 43 | 44 | } // namespace kernels 45 | } // namespace gpu 46 | 47 | #endif -------------------------------------------------------------------------------- /lib/ssss_reduce.cuh: -------------------------------------------------------------------------------- 1 | /* 2 | * Parallel selection algorithm on GPUs 3 | * Copyright (c) 2018-2019 Tobias Ribizel (oss@ribizel.de) 4 | * 5 | * This program is free software: you can redistribute it and/or modify 6 | * it under the terms of the GNU General Public License as published by 7 | * the Free Software Foundation, version 3. 8 | * 9 | * This program is distributed in the hope that it will be useful, but 10 | * WITHOUT ANY WARRANTY; without even the implied warranty of 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 12 | * General Public License for more details. 13 | * 14 | * You should have received a copy of the GNU General Public License 15 | * along with this program. If not, see . 16 | */ 17 | #ifndef SSSS_REDUCE_CUH 18 | #define SSSS_REDUCE_CUH 19 | 20 | #include "utils.cuh" 21 | 22 | namespace gpu { 23 | namespace kernels { 24 | 25 | __device__ inline index partial_sum_idx(index block, oracle bucket, int num_blocks, 26 | int num_buckets) { 27 | return bucket + block * num_buckets; 28 | } 29 | 30 | template 31 | __global__ void reduce_counts(const index* in, index* out, 32 | index num_blocks) { 33 | index bucket = blockIdx.x * blockDim.x + threadIdx.x; 34 | if (bucket < Config::searchtree::width) { 35 | index sum{}; 36 | for (index block = 0; block < num_blocks; ++block) { 37 | sum += in[partial_sum_idx(block, bucket, num_blocks, Config::searchtree::width)]; 38 | } 39 | out[bucket] = sum; 40 | } 41 | } 42 | 43 | template 44 | __global__ void prefix_sum_counts(index* in, index* out, 45 | index num_blocks) { 46 | index bucket = blockIdx.x * blockDim.x + threadIdx.x; 47 | if (bucket < Config::searchtree::width) { 48 | index sum{}; 49 | for (index block = 0; block < num_blocks; ++block) { 50 | auto idx = partial_sum_idx(block, bucket, num_blocks, Config::searchtree::width); 51 | auto tmp = in[idx]; 52 | in[idx] = sum; 53 | sum += tmp; 54 | } 55 | out[bucket] = sum; 56 | } 57 | } 58 | 59 | } // namespace kernels 60 | } // namespace gpu 61 | 62 | #endif // SSSS_REDUCE_CUH 63 | -------------------------------------------------------------------------------- /lib/utils.cuh: -------------------------------------------------------------------------------- 1 | /* 2 | * Parallel selection algorithm on GPUs 3 | * Copyright (c) 2018-2019 Tobias Ribizel (oss@ribizel.de) 4 | * 5 | * This program is free software: you can redistribute it and/or modify 6 | * it under the terms of the GNU General Public License as published by 7 | * the Free Software Foundation, version 3. 8 | * 9 | * This program is distributed in the hope that it will be useful, but 10 | * WITHOUT ANY WARRANTY; without even the implied warranty of 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 12 | * General Public License for more details. 13 | * 14 | * You should have received a copy of the GNU General Public License 15 | * along with this program. If not, see . 
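In ssss_reduce.cuh, the per-block counts form a row-major matrix with one row per block and searchtree::width columns (partial_sum_idx = bucket + block * num_buckets). reduce_counts sums each column; prefix_sum_counts additionally rewrites each column in place into its exclusive prefix sum over blocks, so every block knows where its elements of a given bucket start. A sequential reference of the latter; the helper name is illustrative.

#include <vector>

void prefix_sum_counts_reference(std::vector<int>& in, std::vector<int>& out,
                                 int num_blocks, int num_buckets) {
    for (int bucket = 0; bucket < num_buckets; ++bucket) {
        int sum = 0;
        for (int block = 0; block < num_blocks; ++block) {
            int idx = bucket + block * num_buckets;  // partial_sum_idx
            int tmp = in[idx];
            in[idx] = sum;  // this block's bucket elements start `sum` positions in
            sum += tmp;
        }
        out[bucket] = sum;  // total count of this bucket across all blocks
    }
}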
16 | */ 17 | #ifndef UTILS_CUH 18 | #define UTILS_CUH 19 | 20 | #include 21 | #include 22 | #include 23 | 24 | namespace gpu { 25 | 26 | struct launch_parameters { 27 | index block_count; 28 | index block_size; 29 | index work_per_thread; 30 | }; 31 | 32 | constexpr mask full_mask = 0xffffffffu; 33 | 34 | template 35 | struct max_helper { 36 | // workaround for ::max being a __host__ function 37 | constexpr static T value = std::numeric_limits::max(); 38 | }; 39 | 40 | __host__ __device__ inline float add_epsilon(float f) { 41 | return nextafterf(f, max_helper::value); 42 | } 43 | 44 | __host__ __device__ inline double add_epsilon(double f) { 45 | return nextafter(f, max_helper::value); 46 | } 47 | 48 | __host__ __device__ inline constexpr index min(index a, index b) { return a > b ? b : a; } 49 | 50 | __host__ __device__ inline constexpr index max(index a, index b) { return a < b ? b : a; } 51 | 52 | __host__ __device__ inline constexpr index ceil_div(index a, index b) { return (a + b - 1) / b; } 53 | 54 | __device__ inline index ceil_log2(index i) { 55 | auto high_bit = 31 - __clz(i); 56 | return __popc(i) <= 1 ? high_bit : high_bit + 1; 57 | } 58 | 59 | namespace kernels { 60 | 61 | template 62 | __device__ void swap(T& a, T& b) { 63 | auto tmp = b; 64 | b = a; 65 | a = tmp; 66 | } 67 | 68 | template 69 | __device__ T shfl(mask amask, T el, index source, index width = warp_size) { 70 | #if (__CUDACC_VER_MAJOR__ >= 9) 71 | return __shfl_sync(amask, el, source, width); 72 | #else 73 | return __shfl(el, source); 74 | #endif 75 | } 76 | 77 | template 78 | __device__ T shfl_xor(mask amask, T el, index lanemask, index width = warp_size) { 79 | #if (__CUDACC_VER_MAJOR__ >= 9) 80 | return __shfl_xor_sync(amask, el, lanemask, width); 81 | #else 82 | return __shfl_xor(el, lanemask); 83 | #endif 84 | } 85 | 86 | __device__ inline mask ballot(mask amask, bool predicate) { 87 | #if (__CUDACC_VER_MAJOR__ >= 9) 88 | return __ballot_sync(amask, predicate); 89 | #else 90 | return __ballot(predicate) & amask; 91 | #endif 92 | } 93 | 94 | __device__ inline void sync_dist(int dist) { 95 | #if (__CUDACC_VER_MAJOR__ >= 9) 96 | if (dist >= warp_size) { 97 | __syncthreads(); 98 | } else { 99 | __syncwarp(); 100 | } 101 | #else 102 | __syncthreads(); 103 | #endif 104 | } 105 | 106 | } // namespace kernels 107 | } // namespace gpu 108 | 109 | #endif // UTILS_CUH 110 | -------------------------------------------------------------------------------- /lib/utils_basecase.cuh: -------------------------------------------------------------------------------- 1 | /* 2 | * Parallel selection algorithm on GPUs 3 | * Copyright (c) 2018-2019 Tobias Ribizel (oss@ribizel.de) 4 | * 5 | * This program is free software: you can redistribute it and/or modify 6 | * it under the terms of the GNU General Public License as published by 7 | * the Free Software Foundation, version 3. 8 | * 9 | * This program is distributed in the hope that it will be useful, but 10 | * WITHOUT ANY WARRANTY; without even the implied warranty of 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 12 | * General Public License for more details. 13 | * 14 | * You should have received a copy of the GNU General Public License 15 | * along with this program. If not, see . 
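ceil_log2 above relies on the device intrinsics __clz and __popc; the identity it implements is ceil(log2(i)): the highest set bit position when i is a power of two, and one more otherwise. A host-side equivalent using C++20 <bit>, useful e.g. for sizing the prefix-sum and bitonic networks on the host; the function name is illustrative.

#include <bit>

int ceil_log2_host(unsigned i) {
    int high_bit = std::bit_width(i) - 1;  // equals 31 - __clz(i) for i > 0
    // powers of two are exact: ceil_log2_host(8) == 3; otherwise round up:
    // ceil_log2_host(9) == 4 (i == 0 yields -1, as in the device code)
    return std::popcount(i) <= 1 ? high_bit : high_bit + 1;
}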
16 | */ 17 | #ifndef UTILS_BASECASE_CUH 18 | #define UTILS_BASECASE_CUH 19 | #include "utils_sort.cuh" 20 | 21 | namespace gpu { 22 | namespace kernels { 23 | 24 | template 25 | __device__ void load_local(const T* in, T* local, index size) { 26 | index idx = threadIdx.x; 27 | for (index i = 0; i < Config::basecase::local_size; ++i) { 28 | auto lidx = idx * Config::basecase::local_size + i; 29 | local[i] = lidx < size ? in[lidx] : max_helper::value; 30 | } 31 | } 32 | 33 | template 34 | __device__ void small_sort_warp(const T* in, T* sorted, index size) { 35 | index idx = threadIdx.x; 36 | // load data padded with sentinels 37 | T local[Config::basecase::local_size]; 38 | load_local(in, local, size); 39 | using sorter = bitonic_helper_warp; 40 | sorter::sort(local, false); 41 | for (index i = 0; i < Config::basecase::local_size; ++i) { 42 | auto lidx = idx * Config::basecase::local_size + i; 43 | sorted[lidx] = local[i]; 44 | } 45 | } 46 | 47 | template 48 | __device__ void small_sort(const T* in, T* sorted, index size) { 49 | // load data padded with sentinels 50 | T local[Config::basecase::local_size]; 51 | load_local(in, local, size); 52 | using sorter = bitonic_helper_global; 53 | sorter::sort(local, sorted, false); 54 | } 55 | 56 | template 57 | __global__ void select_bitonic_basecase(const T* in, index size, index rank, T* out) { 58 | __shared__ T data[Config::basecase::size]; 59 | index idx = threadIdx.x; 60 | if (size <= Config::basecase::local_size * warp_size) { 61 | if (idx >= warp_size) { 62 | return; 63 | } 64 | small_sort_warp(in, data, size); 65 | } else { 66 | small_sort(in, data, size); 67 | } 68 | __syncthreads(); 69 | // store result 70 | if (idx == 0) { 71 | *out = data[rank]; 72 | } 73 | } 74 | 75 | template 76 | __device__ void select_bitonic_multiple_basecase_impl(const T* in, index size, 77 | const index* ranks, index ranks_size, 78 | index rank_base, T* out) { 79 | __shared__ T data[Config::basecase::size]; 80 | index idx = threadIdx.x; 81 | if (size <= Config::basecase::local_size * warp_size) { 82 | if (idx >= warp_size) { 83 | return; 84 | } 85 | small_sort_warp(in, data, size); 86 | } else { 87 | small_sort(in, data, size); 88 | } 89 | __syncthreads(); 90 | for (index i = 0; i < Config::basecase::local_size; i++) { 91 | auto gi = idx * Config::basecase::local_size + i; 92 | if (gi < ranks_size) { 93 | auto pos = ranks[gi] - rank_base; 94 | out[gi] = data[pos]; 95 | } 96 | } 97 | } 98 | 99 | template 100 | __global__ void select_bitonic_multiple_basecase(const T* in, index size, 101 | const index* ranks, index ranks_size, 102 | index rank_base, T* out) { 103 | select_bitonic_multiple_basecase_impl(in, size, ranks, ranks_size, rank_base, out); 104 | } 105 | 106 | } // namespace kernels 107 | } // namespace gpu 108 | 109 | #endif // UTILS_BASECASE_CUH -------------------------------------------------------------------------------- /lib/utils_bytestorage.cuh: -------------------------------------------------------------------------------- 1 | /* 2 | * Parallel selection algorithm on GPUs 3 | * Copyright (c) 2018-2019 Tobias Ribizel (oss@ribizel.de) 4 | * 5 | * This program is free software: you can redistribute it and/or modify 6 | * it under the terms of the GNU General Public License as published by 7 | * the Free Software Foundation, version 3. 8 | * 9 | * This program is distributed in the hope that it will be useful, but 10 | * WITHOUT ANY WARRANTY; without even the implied warranty of 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU 12 | * General Public License for more details. 13 | * 14 | * You should have received a copy of the GNU General Public License 15 | * along with this program. If not, see . 16 | */ 17 | #ifndef UTILS_BYTESTORAGE_CUH 18 | #define UTILS_BYTESTORAGE_CUH 19 | 20 | #include "utils.cuh" 21 | 22 | namespace gpu { 23 | namespace kernels { 24 | 25 | __device__ inline void store_packed_bytes(poracle* output, mask amask, oracle byte, index idx) { 26 | // pack 4 consecutive bytes into a dword 27 | poracle result = byte; 28 | // ------00 -> ----1100 29 | result |= shfl_xor(amask, result, 1, 4) << 8; 30 | // ----1100 -> 33221100 31 | result |= shfl_xor(amask, result, 2, 4) << 16; 32 | if (idx % 4 == 0) { 33 | output[idx / 4] = result; 34 | } 35 | } 36 | 37 | __device__ inline oracle load_packed_bytes(const poracle* input, mask amask, index idx) { 38 | auto char_idx = idx % 4; 39 | auto pack_idx = idx / 4; 40 | poracle packed{}; 41 | // first thread in quartet loads the data 42 | if (char_idx == 0) { 43 | packed = input[pack_idx]; 44 | } 45 | // distribute the data onto all threads 46 | packed = shfl(amask, packed, (pack_idx * 4) % warp_size, 4); 47 | packed >>= char_idx * 8; 48 | packed &= 0xff; 49 | return packed; 50 | } 51 | 52 | } // namespace kernels 53 | } // namespace gpu 54 | 55 | #endif // UTILS_BYTESTORAGE_CUH -------------------------------------------------------------------------------- /lib/utils_mask.cuh: -------------------------------------------------------------------------------- 1 | /* 2 | * Parallel selection algorithm on GPUs 3 | * Copyright (c) 2018-2019 Tobias Ribizel (oss@ribizel.de) 4 | * 5 | * This program is free software: you can redistribute it and/or modify 6 | * it under the terms of the GNU General Public License as published by 7 | * the Free Software Foundation, version 3. 8 | * 9 | * This program is distributed in the hope that it will be useful, but 10 | * WITHOUT ANY WARRANTY; without even the implied warranty of 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 12 | * General Public License for more details. 13 | * 14 | * You should have received a copy of the GNU General Public License 15 | * along with this program. If not, see . 16 | */ 17 | #ifndef UTILS_MASK_CUH 18 | #define UTILS_MASK_CUH 19 | 20 | #include 21 | #include "utils_search.cuh" 22 | 23 | namespace gpu { 24 | namespace kernels { 25 | 26 | template 27 | __device__ int select_mask_local(index rank, mask m) { 28 | static_assert(mask_width <= 32, "mask too wide"); 29 | constexpr auto amask = ~mask{0} >> (32 - mask_width); 30 | auto masked_m = m << (31 - threadIdx.x); 31 | auto result = ballot(amask, __popc(masked_m) >= rank + 1); 32 | return __ffs(result) - 1; 33 | } 34 | 35 | template 36 | __device__ int select_mask(index rank, const mask* m) { 37 | constexpr auto mask_blocks = ceil_div(mask_width, 32); 38 | constexpr auto local_mask_size = mask_width >= 32 ? 
32 : mask_width; 39 | static_assert(mask_blocks <= 32, "mask too wide"); 40 | // we have few enough blocks so we can do this naively 41 | index count{}; 42 | index block{}; 43 | for (; block < mask_blocks; ++block) { 44 | auto partial = __popc(m[block]); 45 | if (rank >= count && rank < count + partial) { 46 | return select_mask_local(rank - count, m[block]) + block * 32; 47 | } 48 | count += partial; 49 | } 50 | // should never be reached 51 | return 0xDEADBEEF; 52 | } 53 | 54 | inline __device__ bool check_mask(index idx, const mask* m) { 55 | static_assert(sizeof(mask) * 8 == warp_size, "Mask and warp size inconsistent"); 56 | auto mask_block = idx / (sizeof(mask) * 8); 57 | auto mask_bit = mask(idx % (sizeof(mask) * 8)); 58 | auto masked_bit = mask(1) << mask_bit; 59 | return bool(m[mask_block] & masked_bit); 60 | } 61 | 62 | inline __device__ void compute_bucket_mask_impl(const index* ranks, index rank_count, index rank_base, const index* bucket_prefixsum, mask* bucket_mask, index* range_begins) { 63 | auto bucket = threadIdx.x; 64 | auto lb = bucket_prefixsum[bucket] + rank_base; 65 | auto ub = bucket_prefixsum[bucket + 1] + rank_base; 66 | auto lb_start = binary_search(ranks, rank_count, lb); 67 | auto ub_start = binary_search(ranks, rank_count, ub); 68 | auto local_mask = ballot(full_mask, lb_start != ub_start); 69 | if (bucket % warp_size == 0) { 70 | bucket_mask[bucket / warp_size] = local_mask; 71 | } 72 | range_begins[bucket] = lb_start; 73 | // this is a deliberate race condition, as both threads compute the same result :) 74 | range_begins[bucket + 1] = ub_start; 75 | } 76 | 77 | } // namespace kernels 78 | } // namespace gpu 79 | #endif // UTILS_MASK_CUH -------------------------------------------------------------------------------- /lib/utils_prefixsum.cuh: -------------------------------------------------------------------------------- 1 | /* 2 | * Parallel selection algorithm on GPUs 3 | * Copyright (c) 2018-2019 Tobias Ribizel (oss@ribizel.de) 4 | * 5 | * This program is free software: you can redistribute it and/or modify 6 | * it under the terms of the GNU General Public License as published by 7 | * the Free Software Foundation, version 3. 8 | * 9 | * This program is distributed in the hope that it will be useful, but 10 | * WITHOUT ANY WARRANTY; without even the implied warranty of 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 12 | * General Public License for more details. 13 | * 14 | * You should have received a copy of the GNU General Public License 15 | * along with this program. If not, see . 
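select_mask answers "which bucket holds the (rank + 1)-th set bit of the bucket mask": it scans the 32-bit mask words, and once the cumulative popcount passes the rank, resolves the final word — on the device via a warp ballot over inclusive prefix popcounts (select_mask_local), here via a plain bit loop. A sequential reference assuming C++20 <bit>; the helper name is illustrative.

#include <bit>
#include <cstdint>

int select_mask_reference(int rank, const std::uint32_t* m, int mask_blocks) {
    int count = 0;
    for (int block = 0; block < mask_blocks; ++block) {
        std::uint32_t w = m[block];
        int partial = std::popcount(w);
        if (rank < count + partial) {
            // the sought bit lies in this word: walk to the (rank + 1)-th set bit
            for (int bit = 0; bit < 32; ++bit) {
                if ((w >> bit) & 1u) {
                    if (count == rank) return block * 32 + bit;
                    ++count;
                }
            }
        }
        count += partial;
    }
    return -1;  // rank exceeds the number of set bits (0xDEADBEEF in the kernel)
}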
16 | */ 17 | #ifndef UTILS_PREFIXSUM_CUH 18 | #define UTILS_PREFIXSUM_CUH 19 | 20 | #include "utils.cuh" 21 | #include "utils_mask.cuh" 22 | 23 | namespace gpu { 24 | namespace kernels { 25 | 26 | template 27 | __device__ void small_prefix_sum_upward(index* data) { 28 | constexpr auto size = 1 << size_log2; 29 | auto idx = threadIdx.x; 30 | // upward phase: reduce 31 | // here we build an implicit reduction tree, overwriting values 32 | // the entry at the end of a power-of-two block stores the sum of this block 33 | // the block sizes are increased stepwise 34 | for (index blocksize = 2; blocksize <= size; blocksize *= 2) { 35 | index base_idx = idx * blocksize; 36 | __syncthreads(); 37 | if (base_idx < size) { 38 | data[base_idx + blocksize - 1] += data[base_idx + blocksize / 2 - 1]; 39 | } 40 | } 41 | } 42 | 43 | template 44 | __device__ void small_prefix_sum_downward(index* data) { 45 | constexpr auto size = 1 << size_log2; 46 | auto idx = threadIdx.x; 47 | // downward phase: build prefix sum 48 | // every right child stores the sum of its left sibling 49 | // every left child stores its own sum 50 | // thus we store zero at the root 51 | if (idx == 0) { 52 | data[size - 1] = 0; 53 | } 54 | for (auto blocksize = size; blocksize != 1; blocksize /= 2) { 55 | auto base_idx = idx * blocksize; 56 | __syncthreads(); 57 | if (base_idx < size) { 58 | // we preserve the invariant for the next level 59 | auto r = data[base_idx + blocksize - 1]; 60 | auto l = data[base_idx + blocksize / 2 - 1]; 61 | data[base_idx + blocksize / 2 - 1] = r; 62 | data[base_idx + blocksize - 1] = l + r; 63 | } 64 | } 65 | } 66 | 67 | template 68 | __device__ void small_prefix_sum(index* data) { 69 | small_prefix_sum_upward(data); 70 | __syncthreads(); 71 | small_prefix_sum_downward(data); 72 | } 73 | 74 | template 75 | __device__ void small_prefix_sum_sentinel(index* data) { 76 | auto size = 1 << size_log2; 77 | gpu::index tmp{}; 78 | if (threadIdx.x == size - 1) tmp = data[size - 1]; 79 | __syncthreads(); 80 | small_prefix_sum(data); 81 | __syncthreads(); 82 | // append sentinel 83 | if (threadIdx.x == size - 1) data[size] = data[size - 1] + tmp; 84 | } 85 | 86 | template 87 | __device__ void masked_prefix_sum(index* counts, const mask* m) { 88 | index bucket = threadIdx.x; 89 | constexpr auto size = 1 << size_log2; 90 | if (bucket < size && !check_mask(bucket, m)) { 91 | counts[bucket] = 0; 92 | } 93 | __syncthreads(); 94 | small_prefix_sum(counts); 95 | } 96 | 97 | template 98 | __device__ void masked_prefix_sum_sentinel(index* counts, const mask* m) { 99 | index bucket = threadIdx.x; 100 | constexpr auto size = 1 << size_log2; 101 | if (bucket < size && !check_mask(bucket, m)) { 102 | counts[bucket] = 0; 103 | } 104 | __syncthreads(); 105 | small_prefix_sum_sentinel(counts); 106 | } 107 | 108 | /* 109 | * Prefix sum selection 110 | */ 111 | template 112 | __device__ void prefix_sum_select(const index* counts, index rank, poracle* out_bucket, 113 | index* out_rank) { 114 | constexpr auto size = 1 << size_log2; 115 | // first compute prefix sum of counts 116 | auto idx = threadIdx.x; 117 | __shared__ index sums[size]; 118 | sums[2 * idx] = counts[2 * idx]; 119 | sums[2 * idx + 1] = counts[2 * idx + 1]; 120 | small_prefix_sum(sums); 121 | __syncthreads(); 122 | if (idx >= warp_size) { 123 | return; 124 | } 125 | // then determine which group of size step the element belongs to 126 | constexpr auto step = size / warp_size; 127 | static_assert(step <= warp_size, "need a third selection level"); 128 | auto mask = 
108 | /* 109 | * Prefix sum selection 110 | */ 111 | template <int size_log2> 112 | __device__ void prefix_sum_select(const index* counts, index rank, poracle* out_bucket, 113 | index* out_rank) { 114 | constexpr auto size = 1 << size_log2; 115 | // first compute prefix sum of counts 116 | auto idx = threadIdx.x; 117 | __shared__ index sums[size]; 118 | sums[2 * idx] = counts[2 * idx]; 119 | sums[2 * idx + 1] = counts[2 * idx + 1]; 120 | small_prefix_sum<size_log2>(sums); 121 | __syncthreads(); 122 | if (idx >= warp_size) { 123 | return; 124 | } 125 | // then determine which group of size step the element belongs to 126 | constexpr auto step = size / warp_size; 127 | static_assert(step <= warp_size, "need a third selection level"); 128 | auto mask = ballot(full_mask, sums[(warp_size - idx - 1) * step] > rank); 129 | if (idx >= step) { 130 | return; 131 | } 132 | auto group = __clz(mask) - 1; 133 | // finally determine which bucket within the group the element belongs to 134 | auto base_idx = step * group; 135 | constexpr auto cur_mask = ((1u << (step - 1)) << 1) - 1; 136 | mask = ballot(cur_mask, sums[base_idx + (step - idx - 1)] > rank); 137 | // here we need to subtract warp_size - step since we only use a subset of the warp 138 | if (idx == 0) { 139 | *out_bucket = __clz(mask) - 1 - (warp_size - step) + base_idx; 140 | *out_rank = rank - sums[*out_bucket]; 141 | } 142 | } 143 | 144 | } // namespace kernels 145 | } // namespace gpu 146 | 147 | #endif // UTILS_PREFIXSUM_CUH 148 | -------------------------------------------------------------------------------- /lib/utils_sampling.cuh: -------------------------------------------------------------------------------- 1 | /* 2 | * Parallel selection algorithm on GPUs 3 | * Copyright (c) 2018-2019 Tobias Ribizel (oss@ribizel.de) 4 | * 5 | * This program is free software: you can redistribute it and/or modify 6 | * it under the terms of the GNU General Public License as published by 7 | * the Free Software Foundation, version 3. 8 | * 9 | * This program is distributed in the hope that it will be useful, but 10 | * WITHOUT ANY WARRANTY; without even the implied warranty of 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 12 | * General Public License for more details. 13 | * 14 | * You should have received a copy of the GNU General Public License 15 | * along with this program. If not, see <https://www.gnu.org/licenses/>. 16 | */ 17 | #ifndef UTILS_SAMPLING_CUH 18 | #define UTILS_SAMPLING_CUH 19 | 20 | #include "utils.cuh" 21 | 22 | namespace gpu { 23 | namespace kernels { 24 | 25 | __device__ inline index uniform_pick_idx(index idx, index samplesize, index size) { 26 | auto stride = size / samplesize; 27 | if (stride == 0) { 28 | return idx * size / samplesize; 29 | } else { 30 | return idx * stride + stride / 2; 31 | } 32 | } 33 | 34 | __device__ inline index random_pick_idx(index idx, index samplesize, index size) { 35 | // TODO 36 | return uniform_pick_idx(idx, samplesize, size); 37 | } 38 | 39 | } // namespace kernels 40 | } // namespace gpu 41 | 42 | #endif // UTILS_SAMPLING_CUH --------------------------------------------------------------------------------
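For illustration: uniform_pick_idx spreads samplesize sample positions evenly over [0, size), centering each pick within its stride. A minimal host-side sketch (the name uniform_pick is hypothetical):

#include <cstdio>

int uniform_pick(int idx, int samplesize, int size) {
    int stride = size / samplesize;
    // for size < samplesize, fall back to a proportional index
    return stride == 0 ? idx * size / samplesize : idx * stride + stride / 2;
}

int main() {
    // size = 1000, samplesize = 10: stride = 100, picks 50 150 250 ... 950
    for (int i = 0; i < 10; ++i) std::printf("%d ", uniform_pick(i, 10, 1000));
}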
/lib/utils_search.cuh: -------------------------------------------------------------------------------- 1 | /* 2 | * Parallel selection algorithm on GPUs 3 | * Copyright (c) 2018-2019 Tobias Ribizel (oss@ribizel.de) 4 | * 5 | * This program is free software: you can redistribute it and/or modify 6 | * it under the terms of the GNU General Public License as published by 7 | * the Free Software Foundation, version 3. 8 | * 9 | * This program is distributed in the hope that it will be useful, but 10 | * WITHOUT ANY WARRANTY; without even the implied warranty of 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 12 | * General Public License for more details. 13 | * 14 | * You should have received a copy of the GNU General Public License 15 | * along with this program. If not, see <https://www.gnu.org/licenses/>. 16 | */ 17 | #ifndef UTILS_SEARCH_CUH 18 | #define UTILS_SEARCH_CUH 19 | 20 | #include "utils.cuh" 21 | 22 | namespace gpu { 23 | namespace kernels { 24 | 25 | // finds the position of the first element >= needle in the sorted haystack (lower bound) 26 | inline __device__ index binary_search(const index* haystack, index haystack_size, index needle) { 27 | auto range_begin = 0; 28 | auto range_size = haystack_size; 29 | while (range_size > 0) { 30 | auto half_size = range_size / 2; 31 | auto middle = range_begin + half_size; 32 | // if the middle is already a candidate: discard everything right of it 33 | auto go_left = haystack[middle] >= needle; 34 | range_begin = go_left ? range_begin : middle + 1; 35 | range_size = go_left ? half_size : (range_size - half_size - 1); 36 | } 37 | return range_begin; 38 | } 39 | 40 | } // namespace kernels 41 | } // namespace gpu 42 | 43 | #endif // UTILS_SEARCH_CUH -------------------------------------------------------------------------------- /lib/utils_warpaggr.cuh: -------------------------------------------------------------------------------- 1 | /* 2 | * Parallel selection algorithm on GPUs 3 | * Copyright (c) 2018-2019 Tobias Ribizel (oss@ribizel.de) 4 | * 5 | * This program is free software: you can redistribute it and/or modify 6 | * it under the terms of the GNU General Public License as published by 7 | * the Free Software Foundation, version 3. 8 | * 9 | * This program is distributed in the hope that it will be useful, but 10 | * WITHOUT ANY WARRANTY; without even the implied warranty of 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 12 | * General Public License for more details. 13 | * 14 | * You should have received a copy of the GNU General Public License 15 | * along with this program. If not, see <https://www.gnu.org/licenses/>. 16 | */ 17 | #ifndef UTILS_WARPAGGR_CUH 18 | #define UTILS_WARPAGGR_CUH 19 | 20 | #include "utils.cuh" 21 | 22 | namespace gpu { 23 | namespace kernels { 24 | 25 | __device__ inline bool is_group_leader(mask amask) { 26 | return (__ffs(amask) - 1) == (threadIdx.x % warp_size); 27 | } 28 | 29 | __device__ inline index prefix_popc(mask amask, index shift) { 30 | mask prefix_mask = (1u << shift) - 1; 31 | return __popc(amask & prefix_mask); 32 | } 33 | 34 | __device__ inline index warp_aggr_atomic_count_mask(index* atomic, mask amask, mask cmask) { 35 | auto lane_idx = threadIdx.x % warp_size; 36 | index ofs{}; 37 | if (lane_idx == 0) { 38 | ofs = atomicAdd(atomic, __popc(cmask)); 39 | } 40 | ofs = shfl(amask, ofs, 0); 41 | auto local_ofs = prefix_popc(cmask, lane_idx); 42 | return ofs + local_ofs; 43 | } 44 | 45 | __device__ inline index warp_aggr_atomic_count_predicate(index* atomic, mask amask, 46 | bool predicate) { 47 | auto mask = ballot(amask, predicate); 48 | return warp_aggr_atomic_count_mask(atomic, amask, mask); 49 | } 50 | 51 | } // namespace kernels 52 | } // namespace gpu 53 | 54 | #endif // UTILS_WARPAGGR_CUH --------------------------------------------------------------------------------
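The helpers above implement warp-aggregated atomics: instead of one atomicAdd per matching thread, only lane 0 of the warp reserves space for all matches at once, and each matching lane derives its slot from the prefix popcount of the ballot. A hypothetical stream-compaction kernel built on them (illustration only; it assumes placement inside namespace gpu, with index, mask, full_mask and the ballot wrapper from the headers above, and a grid made of complete warps):

// compact all elements satisfying pred into out, one atomicAdd per warp
template <typename T, typename Predicate>
__global__ void compact_if(const T* in, T* out, index* counter, index size, Predicate pred) {
    index idx = threadIdx.x + blockDim.x * blockIdx.x;
    bool keep = idx < size && pred(in[idx]);
    // every lane participates in the ballot; matching lanes get consecutive slots
    auto ofs = kernels::warp_aggr_atomic_count_predicate(counter, full_mask, keep);
    if (keep) {
        out[ofs] = in[idx];
    }
}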
/lib/utils_work.cuh: -------------------------------------------------------------------------------- 1 | /* 2 | * Parallel selection algorithm on GPUs 3 | * Copyright (c) 2018-2019 Tobias Ribizel (oss@ribizel.de) 4 | * 5 | * This program is free software: you can redistribute it and/or modify 6 | * it under the terms of the GNU General Public License as published by 7 | * the Free Software Foundation, version 3. 8 | * 9 | * This program is distributed in the hope that it will be useful, but 10 | * WITHOUT ANY WARRANTY; without even the implied warranty of 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 12 | * General Public License for more details. 13 | * 14 | * You should have received a copy of the GNU General Public License 15 | * along with this program. If not, see <https://www.gnu.org/licenses/>. 16 | */ 17 | #ifndef UTILS_WORK_CUH 18 | #define UTILS_WORK_CUH 19 | 20 | #include "utils.cuh" 21 | 22 | namespace gpu { 23 | namespace kernels { 24 | 25 | template <typename Config, typename F> 26 | __device__ void blockwise_work(index local_work, index size, F function) { 27 | auto stride = gridDim.x * blockDim.x; 28 | auto base_idx = threadIdx.x + blockDim.x * blockIdx.x; 29 | constexpr auto unroll = Config::algorithm::unroll; 30 | for (index i = 0; i < local_work; i += unroll) { 31 | // will any thread in the current iteration be without work? 32 | auto warp_last_idx = (base_idx / warp_size) * warp_size + warp_size - 1; 33 | if (warp_last_idx + (i + unroll - 1) * stride >= size) { 34 | // then the compiler cannot benefit from unrolling 35 | for (auto j = 0; j < unroll; ++j) { 36 | auto idx = base_idx + (i + j) * stride; 37 | auto amask = ballot(full_mask, idx < size); 38 | if (idx < size) { 39 | function(idx, amask); 40 | } 41 | } 42 | } else { 43 | // otherwise all predicates above will be true, so we can unroll 44 | #pragma unroll 45 | for (auto j = 0; j < unroll; ++j) { 46 | auto idx = base_idx + (i + j) * stride; 47 | function(idx, full_mask); 48 | } 49 | } 50 | } 51 | } 52 | template <typename Config, typename F> 53 | __device__ void blockwise_work_local_large(index local_work, index size, F function) { 54 | auto stride = blockDim.x; 55 | auto base_idx = threadIdx.x; 56 | constexpr auto unroll = Config::algorithm::unroll; 57 | for (index i = 0; i < local_work; i += unroll) { 58 | // will any thread in the current iteration be without work? 59 | auto warp_last_idx = (base_idx / warp_size) * warp_size + warp_size - 1; 60 | if (warp_last_idx + (i + unroll - 1) * stride >= size) { 61 | // then the compiler cannot benefit from unrolling 62 | for (auto j = 0; j < unroll; ++j) { 63 | auto idx = base_idx + (i + j) * stride; 64 | auto amask = ballot(full_mask, idx < size); 65 | if (idx < size) { 66 | function(idx, amask); 67 | } 68 | } 69 | } else { 70 | // otherwise all predicates above will be true, so we can unroll 71 | #pragma unroll 72 | for (auto j = 0; j < unroll; ++j) { 73 | auto idx = base_idx + (i + j) * stride; 74 | function(idx, full_mask); 75 | } 76 | } 77 | } 78 | } 79 | 80 | template <typename F> 81 | __device__ void blockwise_work_local(index size, F function) { 82 | for (index i = threadIdx.x; i < size; i += blockDim.x) { 83 | function(i); 84 | } 85 | } 86 | 87 | } // namespace kernels 88 | } // namespace gpu 89 | 90 | #endif // UTILS_WORK_CUH --------------------------------------------------------------------------------
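For illustration: blockwise_work distributes size elements over the whole grid in unrolled, grid-strided steps, handing each element index the warp's activity mask, which can be forwarded directly to the warp-aggregation helpers. A hypothetical kernel sketch (illustration only; Config is assumed to provide algorithm::unroll as in kernel_config.cuh, and local_work is the per-thread iteration count, roughly ceil(size / (grid size * unroll)), computed by the caller):

// count the negative entries of in[0..size)
template <typename Config>
__global__ void count_negative(const float* in, index* counter, index size, index local_work) {
    kernels::blockwise_work<Config>(local_work, size, [&](index idx, mask amask) {
        // amask marks the lanes of this warp that received a valid idx
        kernels::warp_aggr_atomic_count_predicate(counter, amask, in[idx] < 0);
    });
}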
/lib/verification.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Parallel selection algorithm on GPUs 3 | * Copyright (c) 2018-2019 Tobias Ribizel (oss@ribizel.de) 4 | * 5 | * This program is free software: you can redistribute it and/or modify 6 | * it under the terms of the GNU General Public License as published by 7 | * the Free Software Foundation, version 3. 8 | * 9 | * This program is distributed in the hope that it will be useful, but 10 | * WITHOUT ANY WARRANTY; without even the implied warranty of 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 12 | * General Public License for more details. 13 | * 14 | * You should have received a copy of the GNU General Public License 15 | * along with this program. If not, see <https://www.gnu.org/licenses/>. 16 | */ 17 | #include <verification.hpp> 18 | 19 | #include <algorithm> 20 | #include <limits> 21 | 22 | namespace verification { 23 | 24 | template <typename T> 25 | std::pair<int, int> count_mispartitioned(const std::vector<T>& data, int pivot_rank, T pivot) { 26 | int lcount{}; 27 | int rcount{}; 28 | for (int i = 0; i < pivot_rank; ++i) { 29 | if (data[i] >= pivot) { 30 | ++lcount; 31 | } 32 | } 33 | for (int i = pivot_rank; i < data.size(); ++i) { 34 | if (data[i] < pivot) { 35 | ++rcount; 36 | } 37 | } 38 | return {lcount, rcount}; 39 | } 40 | 41 | template std::pair<int, int> count_mispartitioned(const std::vector<float>&, int, float); 42 | template std::pair<int, int> count_mispartitioned(const std::vector<double>&, int, double); 43 | 44 | template <typename T> 45 | std::vector<T> nth_elements(const std::vector<T>& data, std::vector<int> ranks) { 46 | auto tmp = data; 47 | std::sort(tmp.begin(), tmp.end()); 48 | std::vector<T> result; 49 | for (auto el : ranks) { 50 | result.push_back(tmp[el]); 51 | } 52 | return result; 53 | } 54 | 55 | template <typename T> 56 | T nth_element(const std::vector<T>& data, int rank) { 57 | auto tmp = data; 58 | std::sort(tmp.begin(), tmp.end()); 59 | return tmp[rank]; 60 | } 61 | 62 | template float nth_element(const std::vector<float>&, int); 63 | template double nth_element(const std::vector<double>&, int); 64 | 65 | template std::vector<float> nth_elements(const std::vector<float>&, std::vector<int>); 66 | template std::vector<double> nth_elements(const std::vector<double>&, std::vector<int>); 67 | 68 | template <typename T> 69 | int count_not_in_bucket(const std::vector<T>& data, T lower, T upper) { 70 | return std::count_if(data.begin(), data.end(), 71 | [&](T val) { return val < lower || val >= upper; }); 72 | } 73 | 74 | template int count_not_in_bucket(const std::vector<float>&, float, float); 75 | template int count_not_in_bucket(const std::vector<double>&, double, double); 76 |
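/*
 * Worked example (illustration, not part of the original file):
 * count_mispartitioned checks the outcome of a partition step. For
 * data = [1 4 2 8 5 7], pivot_rank = 3 and pivot = 5, the left part [1 4 2]
 * contains no element >= 5 and the right part [8 5 7] contains no element < 5,
 * so the result is {0, 0}; any nonzero count flags a mispartitioned element.
 */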
77 | template <typename T> 78 | std::vector<int> count_not_in_buckets(const std::vector<T>& data, std::vector<index> prefix_sum, const std::vector<T>& searchtree) { 79 | auto splitter_count = prefix_sum.size() - 1; 80 | std::vector<int> result(splitter_count); 81 | for (index bucket = 0; bucket < splitter_count; ++bucket) { 82 | // we don't use the smallest splitter 83 | auto lower = bucket == 0 ? 0 : searchtree[bucket + splitter_count - 1]; 84 | // we don't store the sentinels 85 | auto upper = bucket == splitter_count - 1 ? std::numeric_limits<T>::max() : searchtree[bucket + splitter_count]; 86 | result[bucket] = std::count_if(data.begin() + prefix_sum[bucket], data.begin() + prefix_sum[bucket + 1], [&](T val) { 87 | return val < lower || val >= upper; 88 | }); 89 | } 90 | return result; 91 | } 92 | 93 | template std::vector<int> count_not_in_buckets(const std::vector<float>& data, std::vector<index> prefix_sum, const std::vector<float>& searchtree); 94 | template std::vector<int> count_not_in_buckets(const std::vector<double>& data, std::vector<index> prefix_sum, const std::vector<double>& searchtree); 95 | 96 | bool verify_rank_ranges(const std::vector<gpu::index>& ranks, const std::vector<gpu::index>& index_ranges, const std::vector<gpu::index>& rank_ranges) { 97 | auto searchtree_width = rank_ranges.size() - 1; 98 | if (!std::is_sorted(rank_ranges.begin(), rank_ranges.end())) return false; 99 | for (gpu::index i = 0; i < searchtree_width; ++i) { 100 | auto lb = index_ranges[i]; 101 | auto ub = index_ranges[i + 1]; 102 | for (auto j = rank_ranges[i]; j < rank_ranges[i + 1]; ++j) { 103 | if (ranks[j] < lb || ranks[j] >= ub) return false; 104 | } 105 | } 106 | return rank_ranges[0] == 0 && rank_ranges.back() == ranks.size(); 107 | } 108 | 109 | } // namespace verification 110 | --------------------------------------------------------------------------------
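For illustration: verify_rank_ranges accepts the bookkeeping exactly when the rank ranges are sorted, cover all queried ranks, and route every rank to the bucket whose index range contains it. A hypothetical host-side usage sketch (assuming verification.hpp and the gpu::index definition are available on the include path):

#include <cassert>
#include <vector>
#include <verification.hpp>

int main() {
    std::vector<gpu::index> ranks{2, 5, 7, 11};     // sorted query ranks
    std::vector<gpu::index> index_ranges{0, 6, 12}; // bucket 0 covers [0, 6), bucket 1 covers [6, 12)
    std::vector<gpu::index> rank_ranges{0, 2, 4};   // ranks[0..2) -> bucket 0, ranks[2..4) -> bucket 1
    assert(verification::verify_rank_ranges(ranks, index_ranges, rank_ranges));
}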