├── .gitignore
├── CMakeLists.txt
├── LICENSE
├── LICENSE-catch2
├── LICENSE-cub
├── README
├── app
│   ├── CMakeLists.txt
│   ├── benchmark.cu
│   ├── benchmark_main.cu
│   ├── benchmark_sort.cu
│   ├── catch.hpp
│   ├── test_basecase.cu
│   ├── test_fixture.cuh
│   ├── test_helpers.cu
│   ├── test_main.cu
│   ├── test_qs.cu
│   └── test_ssss.cu
├── include
│   ├── cpu_reference.hpp
│   ├── cuda_definitions.cuh
│   ├── cuda_error.cuh
│   ├── cuda_memory.cuh
│   ├── cuda_timer.cuh
│   ├── kernel_config.cuh
│   ├── launcher_fwd.cuh
│   └── verification.hpp
└── lib
    ├── CMakeLists.txt
    ├── cpu_reference.cpp
    ├── gen_instantiations.py
    ├── generated
    │   ├── gen-full.cu
    │   ├── gen0.cu
    │   ├── gen1.cu
    │   ├── gen10.cu
    │   ├── gen11.cu
    │   ├── gen12.cu
    │   ├── gen13.cu
    │   ├── gen14.cu
    │   ├── gen15.cu
    │   ├── gen16.cu
    │   ├── gen17.cu
    │   ├── gen18.cu
    │   ├── gen19.cu
    │   ├── gen2.cu
    │   ├── gen20.cu
    │   ├── gen21.cu
    │   ├── gen22.cu
    │   ├── gen23.cu
    │   ├── gen24.cu
    │   ├── gen25.cu
    │   ├── gen26.cu
    │   ├── gen27.cu
    │   ├── gen28.cu
    │   ├── gen29.cu
    │   ├── gen3.cu
    │   ├── gen30.cu
    │   ├── gen31.cu
    │   ├── gen32.cu
    │   ├── gen33.cu
    │   ├── gen34.cu
    │   ├── gen35.cu
    │   ├── gen36.cu
    │   ├── gen37.cu
    │   ├── gen38.cu
    │   ├── gen39.cu
    │   ├── gen4.cu
    │   ├── gen5.cu
    │   ├── gen6.cu
    │   ├── gen7.cu
    │   ├── gen8.cu
    │   └── gen9.cu
    ├── qs_launchers.cuh
    ├── qs_recursion.cuh
    ├── qs_recursion_multi.cuh
    ├── qs_reduce.cuh
    ├── qs_scan.cuh
    ├── ssss_build_searchtree.cuh
    ├── ssss_collect.cuh
    ├── ssss_collect_multi.cuh
    ├── ssss_count.cuh
    ├── ssss_launchers.cuh
    ├── ssss_merged.cuh
    ├── ssss_merged_memory.cuh
    ├── ssss_recursion.cuh
    ├── ssss_recursion_multi.cuh
    ├── ssss_reduce.cuh
    ├── utils.cuh
    ├── utils_basecase.cuh
    ├── utils_bytestorage.cuh
    ├── utils_mask.cuh
    ├── utils_prefixsum.cuh
    ├── utils_sampling.cuh
    ├── utils_search.cuh
    ├── utils_sort.cuh
    ├── utils_warpaggr.cuh
    ├── utils_work.cuh
    └── verification.cpp
/.gitignore:
--------------------------------------------------------------------------------
1 | build
2 |
--------------------------------------------------------------------------------
/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | cmake_minimum_required(VERSION 3.8 FATAL_ERROR)
2 | project(gpu_selection LANGUAGES CXX CUDA)
3 |
4 | list(APPEND CMAKE_CUDA_FLAGS "-arch=sm_35 -rdc=true --maxrregcount 64 -gencode=arch=compute_35,code=sm_35 -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_52,code=sm_52 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_70,code=compute_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_80,code=compute_80")
5 | add_subdirectory(lib)
6 | add_subdirectory(app)
7 |
--------------------------------------------------------------------------------
/LICENSE-catch2:
--------------------------------------------------------------------------------
1 | Boost Software License - Version 1.0 - August 17th, 2003
2 |
3 | Permission is hereby granted, free of charge, to any person or organization
4 | obtaining a copy of the software and accompanying documentation covered by
5 | this license (the "Software") to use, reproduce, display, distribute,
6 | execute, and transmit the Software, and to prepare derivative works of the
7 | Software, and to permit third-parties to whom the Software is furnished to
8 | do so, all subject to the following:
9 |
10 | The copyright notices in the Software and this entire statement, including
11 | the above license grant, this restriction and the following disclaimer,
12 | must be included in all copies of the Software, in whole or in part, and
13 | all derivative works of the Software, unless such copies or derivative
14 | works are solely in the form of machine-executable object code generated by
15 | a source language processor.
16 |
17 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 | FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
20 | SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
21 | FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
22 | ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
23 | DEALINGS IN THE SOFTWARE.
24 |
--------------------------------------------------------------------------------
/LICENSE-cub:
--------------------------------------------------------------------------------
1 | Copyright (c) 2010-2011, Duane Merrill. All rights reserved.
2 | Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
3 |
4 | Redistribution and use in source and binary forms, with or without
5 | modification, are permitted provided that the following conditions are met:
6 | * Redistributions of source code must retain the above copyright
7 | notice, this list of conditions and the following disclaimer.
8 | * Redistributions in binary form must reproduce the above copyright
9 | notice, this list of conditions and the following disclaimer in the
10 | documentation and/or other materials provided with the distribution.
11 | * Neither the name of the NVIDIA CORPORATION nor the
12 | names of its contributors may be used to endorse or promote products
13 | derived from this software without specific prior written permission.
14 |
15 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
16 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
17 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
18 | DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
19 | DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
20 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
21 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
22 | ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
24 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
--------------------------------------------------------------------------------
/README:
--------------------------------------------------------------------------------
1 | This library implements a bucket-based selection algorithm on GPUs.
2 |
3 | More details can be found in
4 |
5 | * T. Ribizel and H. Anzt, "Approximate and Exact Selection on GPUs," 2019 IEEE International Parallel and Distributed Processing Symposium Workshops (IPDPSW), Rio de Janeiro, Brazil, 2019, pp. 471-478.
6 | doi: 10.1109/IPDPSW.2019.00088
7 | * T. Ribizel, H. Anzt, "Parallel selection on GPUs," Parallel Computing, Volume 91, 2020, doi: 10.1016/j.parco.2019.102588
8 |
9 | It uses Catch2 as a test framework and the CUB library as a reference implementation for sorting.
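The project is built with CMake (3.8 or newer) and the CUDA toolkit. A typical out-of-source build (the usual CMake workflow, assumed here rather than prescribed by the project; adjust the generator to your platform) looks like:

    mkdir build && cd build
    cmake ..
    make

The `build` directory is already covered by the top-level .gitignore, so an out-of-source build in `build/` keeps the working tree clean.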
10 |
11 | The tests can be run by simply executing `app/unittest`; the benchmarks can be run by executing `app/benchmark` with one of the following parameters:
12 |
13 | [full] The full benchmark for exact single and multiple selection and the individual kernels (sample, count, reduce, filter)
14 | [full-multionly] The full benchmark for multiple selection only
15 | [approx] The full benchmark for approximate selection with shared-memory atomics
16 | [approx-g] The full benchmark for approximate selection with global-memory atomics
17 | [multi] The full benchmark for multiple selection with different numbers of ranks
18 | [test] A small benchmark that only executes a single benchmark with a small input size
19 |
20 | The output of these benchmarks is the following:
21 | On stdout, they print error messages in case the algorithm execution produces invalid results. For the approx benchmarks, the exact and approximate ranks are additionally output in CSV format.
22 | On stderr, they print the individual timings of the kernels in CSV format, with the input size given by the first CSV field. Runtime breakdowns are listed within parentheses ().
23 |
24 | `app/benchmark-sort` contains a benchmark for the CUB radix sort implementation as a performance baseline for the multiple selection.
25 |
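For example, a typical session might look like this (the timing values below are illustrative, not measured):

    ./app/benchmark [multi] 2> timings.csv
    ./app/benchmark-sort 2> sort_timings.csv

Each stderr line starts with the benchmark name, which encodes the input size and value type, followed by one parenthesized timing group per run, e.g.

    cub_sort-65536-1073741824-f,(1.84),(1.79),(1.81),...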
26 | Structure of the project
27 |
28 | include/cpu_reference.hpp - Reference implementations for testing
29 | include/verification.hpp - Validation functions for testing
30 | include/cuda_definitions.cuh - Type definitions and hardware limits
31 | include/cuda_error.cuh - Wrapper for CUDA error handling
32 | include/cuda_memory.cuh - Wrapper for CUDA memory allocations
33 | include/cuda_timer.cuh - Wrapper for CUDA timing measurements
34 | include/kernel_config.cuh - Configuration struct for kernel templates
35 | include/launcher_fwd.cuh - Forward-declarations of launcher and kernel templates
36 |
37 | lib/generated/* - Explicit template instantiations to parallelize compilation
38 | lib/cpu_reference.cpp - Reference implementations for testing
39 | lib/verification.cpp - Validation functions for testing
40 | lib/qs_launchers.cuh - Wrappers for quickselect kernels
41 | lib/qs_recursion.cuh - Kernels for quickselect single-selection
42 | lib/qs_recursion_multi.cuh - Kernels for quickselect multi-selection
43 | lib/qs_reduce.cuh - Kernels for reducing quickselect partial sums
44 | lib/qs_scan.cuh - Kernels for quickselect bipartitioning
45 | lib/ssss_build_searchtree.cuh - Kernels for sampleselect sampling
46 | lib/ssss_collect.cuh - Kernels for sampleselect single-selection filtering
47 | lib/ssss_collect_multi.cuh - Kernels for sampleselect multi-selection filtering
48 | lib/ssss_count.cuh - Kernels for sampleselect counting
49 | lib/ssss_launchers.cuh - Wrappers for sampleselect kernels
50 | lib/ssss_merged.cuh - Kernels for multiple simultaneous sampleselects
51 | lib/ssss_merged_memory.cuh - Auxiliary data structure for sampleselect multi-selection
52 | lib/ssss_recursion.cuh - Kernels for sampleselect single-selection
53 | lib/ssss_recursion_multi.cuh - Kernels for sampleselect multi-selection
54 | lib/ssss_reduce.cuh - Kernels for reducing sampleselect partial sums
55 | lib/utils_basecase.cuh - Kernels for recursion basecase
56 | lib/utils_bytestorage.cuh - Auxiliary functions for reading/writing unaligned bytes
57 | lib/utils_mask.cuh - Auxiliary functions for bitmasks
58 | lib/utils_prefixsum.cuh - Auxiliary functions for tree-based partial sums
59 | lib/utils_sampling.cuh - Auxiliary functions for sampling
60 | lib/utils_search.cuh - Auxiliary functions for binary and warp-ary searches
61 | lib/utils_sort.cuh - Auxiliary functions for bitonic sorting
62 | lib/utils_warpaggr.cuh - Auxiliary functions for warp-aggregation
63 | lib/utils_work.cuh - Auxiliary functions for work-distribution
64 | lib/utils.cuh - Auxiliary wrappers for basic operations
65 |
--------------------------------------------------------------------------------
/app/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | add_executable(benchmark benchmark_main.cu benchmark.cu)
2 | add_executable(benchmark-sort benchmark_main.cu benchmark_sort.cu)
3 | target_include_directories(benchmark-sort PRIVATE ../include ../lib)
4 |
5 | add_executable(unittest test_main.cu test_qs.cu test_ssss.cu test_helpers.cu
6 | # test_basecase.cu
7 | )
8 |
9 | target_link_libraries(benchmark gpu_selection)
10 |
11 | target_link_libraries(unittest gpu_selection)
12 |
13 | set_target_properties(unittest PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
14 | set_target_properties(benchmark PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
15 | set_target_properties(benchmark-sort PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
16 |
--------------------------------------------------------------------------------
/app/benchmark_main.cu:
--------------------------------------------------------------------------------
1 | #define CATCH_CONFIG_MAIN
2 | #include "catch.hpp"
--------------------------------------------------------------------------------
/app/benchmark_sort.cu:
--------------------------------------------------------------------------------
1 | /*
2 | * Parallel selection algorithm on GPUs
3 | * Copyright (c) 2018-2019 Tobias Ribizel (oss@ribizel.de)
4 | *
5 | * This program is free software: you can redistribute it and/or modify
6 | * it under the terms of the GNU General Public License as published by
7 | * the Free Software Foundation, version 3.
8 | *
9 | * This program is distributed in the hope that it will be useful, but
10 | * WITHOUT ANY WARRANTY; without even the implied warranty of
11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 | * General Public License for more details.
13 | *
14 | * You should have received a copy of the GNU General Public License
15 | * along with this program. If not, see <https://www.gnu.org/licenses/>.
16 | */
17 | #include "catch.hpp"
18 | #include "test_fixture.cuh"
19 | #include <algorithm>
20 | #include <cub/cub.cuh>
21 | #include <iostream>
22 |
23 | namespace gpu {
24 |
25 | constexpr auto num_runs = 10;
26 |
27 | template <typename T>
28 | void cub_sort(std::string name, index n, index d, basic_test_data>& data, cuda_timer& timer) {
29 |     cub::DoubleBuffer<T> keys{static_cast<T*>(data.gpu_data), static_cast<T*>(data.gpu_data_out)};
30 |     timer.timed(name, num_runs, [&](auto event) {
31 |         data.reset();
32 |         auto tmp_size = sizeof(T) * n;
33 |         event(0);
34 |         cub::DeviceRadixSort::SortKeys(static_cast<void*>(data.gpu_data_tmp), tmp_size, keys, n);
35 |         event(1);
36 |     });
37 |     auto sorted = data.data;
38 |     auto ref = sorted;
39 |     cudaCheckError(cudaMemcpy(ref.data(), keys.Current(), n * sizeof(T), cudaMemcpyDeviceToHost));
40 |     std::sort(sorted.begin(), sorted.end());
41 |     bool is_sorted = sorted == ref;
42 |     CHECK(is_sorted);
43 | }
44 |
45 | TEMPLATE_TEST_CASE("sort", "", float, double) {
46 |     using T = TestType;
47 |     auto n = GENERATE(as<index>{}, 65536, 262144, 524288, 1048576, 2097152, 4194304, 8388608,
48 |                       16777216, 33554432, 67108864, 134217728);
49 |     auto d = GENERATE(as<index>{}, 1 << 30);
50 |     auto seed = GENERATE(take(10, Catch::Generators::random(0, 1000000)));
51 |     basic_test_data> data{n, d, index(seed)};
52 |     CAPTURE(n);
53 |     CAPTURE(d);
54 |     CAPTURE(seed);
55 |     cuda_timer timer{std::cerr};
56 |     auto suffix = "-" + std::to_string(n) + "-" + std::to_string(d) + "-" + typeid(T).name();
57 |     // thrust_sort("thrust_sort" + suffix, n, d, data, timer);
58 |     cub_sort("cub_sort" + suffix, n, d, data, timer);
59 | }
60 |
61 | } // namespace gpu
62 |
--------------------------------------------------------------------------------
/app/test_basecase.cu:
--------------------------------------------------------------------------------
1 | /*
2 | * Parallel selection algorithm on GPUs
3 | * Copyright (c) 2018-2019 Tobias Ribizel (oss@ribizel.de)
4 | *
5 | * This program is free software: you can redistribute it and/or modify
6 | * it under the terms of the GNU General Public License as published by
7 | * the Free Software Foundation, version 3.
8 | *
9 | * This program is distributed in the hope that it will be useful, but
10 | * WITHOUT ANY WARRANTY; without even the implied warranty of
11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 | * General Public License for more details.
13 | *
14 | * You should have received a copy of the GNU General Public License
15 | * along with this program. If not, see <https://www.gnu.org/licenses/>.
16 | */ 17 | #include "catch.hpp" 18 | #include "../lib/utils_basecase.cuh" 19 | #include "test_fixture.cuh" 20 | #include 21 | #include 22 | #include 23 | 24 | namespace gpu { 25 | 26 | template 27 | struct extended_pair { 28 | constexpr static int size = Size; 29 | constexpr static int replsize = Replsize; 30 | using first_type = A; 31 | using second_type = B; 32 | }; 33 | 34 | template 35 | using test_data = basic_test_data; 36 | 37 | template 38 | using float_pair = extended_pair; 39 | 40 | TEMPLATE_PRODUCT_TEST_CASE_METHOD(test_data, "basecase", "[basecase]", 41 | (float_pair), 42 | ((select_config<10, 10, 8, true, true, false, 8, 10, 10, false, 8, 0>), 43 | (select_config<10, 10, 8, true, true, false, 8, 10, 10, false, 8, 1>), 44 | (select_config<10, 10, 8, true, true, false, 8, 10, 10, false, 8, 2>), 45 | (select_config<12, 10, 8, true, true, false, 8, 10, 10, false, 8, 2>))) { 46 | using T = typename TestType::first_type; 47 | using Config = typename TestType::second_type; 48 | std::vector ranks(1); 49 | constexpr auto basecase_size = Config::basecase::size; 50 | constexpr auto local_size = Config::basecase::local_size; 51 | constexpr auto cur_launch_size = Config::basecase::launch_size; 52 | auto size = GENERATE(as{}, basecase_size, basecase_size / 5, warp_size * local_size, warp_size * local_size / 5); 53 | auto launch_size = GENERATE(as{}, cur_launch_size, max_block_size); 54 | std::string mode; 55 | SECTION("some ranks") { 56 | mode = "some ranks"; 57 | ranks.resize(std::min(100, size / 2)); 58 | for (auto i = 0; i < ranks.size(); ++i) { 59 | ranks[i] = i * size / ranks.size(); 60 | } 61 | } 62 | SECTION("all ranks") { 63 | mode = "all ranks"; 64 | ranks.resize(size); 65 | std::iota(ranks.begin(), ranks.end(), 0); 66 | } 67 | CAPTURE(size); 68 | CAPTURE(launch_size); 69 | CAPTURE(mode); 70 | this->gpu_ranks.copy_from(ranks); 71 | this->run([&]() { kernels::select_bitonic_basecase<<<1, launch_size>>>(this->gpu_data, size, ranks.back(), this->gpu_data_out); }); 72 | std::vector result; 73 | this->gpu_data_out.copy_to(result); 74 | auto data = this->data; 75 | data.resize(size); 76 | std::sort(data.begin(), data.end()); 77 | CHECK(data[ranks.back()] == result[0]); 78 | this->run([&]() { kernels::select_bitonic_multiple_basecase<<<1, launch_size>>>(this->gpu_data, size, this->gpu_ranks, ranks.size(), 0, this->gpu_data_out); }); 79 | this->gpu_data_out.copy_to(result); 80 | index count{}; 81 | for (auto i = 0; i < ranks.size(); ++i) { 82 | count += result[i] != data[ranks[i]]; 83 | } 84 | CHECK(count == 0); 85 | } 86 | 87 | } // namespace gpu -------------------------------------------------------------------------------- /app/test_fixture.cuh: -------------------------------------------------------------------------------- 1 | /* 2 | * Parallel selection algorithm on GPUs 3 | * Copyright (c) 2018-2019 Tobias Ribizel (oss@ribizel.de) 4 | * 5 | * This program is free software: you can redistribute it and/or modify 6 | * it under the terms of the GNU General Public License as published by 7 | * the Free Software Foundation, version 3. 8 | * 9 | * This program is distributed in the hope that it will be useful, but 10 | * WITHOUT ANY WARRANTY; without even the implied warranty of 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 12 | * General Public License for more details. 13 | * 14 | * You should have received a copy of the GNU General Public License 15 | * along with this program. If not, see . 
16 | */
17 | #include <algorithm>
18 | #include <numeric>
19 | #include <random>
20 | #include <vector>
21 | #include <cuda_definitions.cuh>
22 | #include <cuda_error.cuh>
23 | #include <cuda_memory.cuh>
24 | #include <kernel_config.cuh>
25 |
26 | namespace gpu {
27 |
28 | constexpr auto max_tree_width = 4096;
29 | constexpr auto max_tree_size = 2 * max_tree_width * 2;
30 | constexpr auto max_block_count = 1024;
31 |
32 | template <typename Pair, index Size, index Valsize>
33 | struct basic_test_data {
34 |     using T = typename Pair::first_type;
35 |     using Config = typename Pair::second_type;
36 |     index size;
37 |     std::vector<T> data;
38 |     std::vector<T> tree;
39 |     std::vector<T> data_out;
40 |     std::vector<poracle> oracles;
41 |     std::vector<index> count_out;
42 |     std::vector<index> atomic;
43 |     std::vector<index> zeros;
44 |     std::vector<index> ranks;
45 |     std::vector<mask> bucket_mask;
46 |     cuda_resettable_array<T> gpu_data;
47 |     cuda_array<T> gpu_data_tmp;
48 |     cuda_array<T> gpu_tree;
49 |     cuda_array<T> gpu_data_out;
50 |     cuda_array<poracle> gpu_oracles;
51 |     cuda_array<index> gpu_aux;
52 |     cuda_resettable_array<index> gpu_atomic;
53 |     cuda_array<index> gpu_count_tmp;
54 |     cuda_resettable_array<index> gpu_count_out;
55 |     cuda_array<index> gpu_bucket_ranges;
56 |     cuda_array<index> gpu_rank_ranges;
57 |     cuda_array<index> gpu_ranks;
58 |     cuda_array<mask> gpu_bucket_mask;
59 |     index rank;
60 |     T pivot;
61 |
62 |     basic_test_data(index size = Size, index valsize = Valsize, index seed = 0)
63 |         : size{size}, data(size), tree(max_tree_size), oracles(size), count_out(max_block_count * 2 + 2),
64 |           zeros(size + max_block_count * max_tree_width * 16), ranks(287), bucket_mask(max_tree_size / (sizeof(mask) * 8)),
65 |           atomic(max_tree_width) {
66 |         std::default_random_engine random(seed);
67 |         std::uniform_int_distribution<index> dist(0, valsize - 1);
68 |         std::uniform_int_distribution<index> idist(0, size - 1);
69 |         std::uniform_int_distribution<mask> maskdist(mask(0), ~mask(0));
70 |         std::vector<index> smallzeros(max_tree_size);
71 |         for (auto& el : data) {
72 |             el = dist(random);
73 |         }
74 |         for (auto& el : ranks) {
75 |             el = idist(random);
76 |         }
77 |         ranks.back() = size - 1;
78 |         std::sort(ranks.begin(), ranks.end());
79 |         rank = idist(random);
80 |         pivot = data[rank];
81 |         gpu_data.copy_from(data);
82 |         gpu_tree.copy_from(tree);
83 |         gpu_data_tmp.copy_from(data);
84 |         gpu_data_out.copy_from(data);
85 |         gpu_atomic.copy_from(atomic);
86 |         gpu_count_tmp.copy_from(zeros);
87 |         gpu_aux.copy_from(zeros);
88 |         gpu_count_out.copy_from(count_out);
89 |         gpu_oracles.copy_from(oracles);
90 |         gpu_bucket_ranges.copy_from(smallzeros);
91 |         gpu_rank_ranges.copy_from(smallzeros);
92 |         gpu_ranks.copy_from(ranks);
93 |         gpu_bucket_mask.copy_from(bucket_mask);
94 |     }
95 |
96 |     void reset() {
97 |         gpu_data.reset();
98 |         gpu_atomic.reset();
99 |         gpu_count_out.reset();
100 |     }
101 |
102 |     void copy_from_gpu() {
103 |         gpu_data_out.copy_to(data_out);
104 |         gpu_count_out.copy_to(count_out);
105 |         gpu_tree.copy_to(tree);
106 |         gpu_oracles.copy_to(oracles);
107 |         gpu_atomic.copy_to(atomic);
108 |     }
109 |
110 |     template <typename F>
111 |     void run(F f) {
112 |         cudaChecked(f);
113 |         copy_from_gpu();
114 |     }
115 | };
116 |
117 | inline std::vector<unsigned char> unpack(const std::vector<poracle>& in, int size) {
118 |     using uc = unsigned char;
119 |     std::vector<uc> result;
120 |     result.reserve(in.size() * 4);
121 |     for (auto el : in) {
122 |         result.insert(result.end(), {uc(el), uc(el >> 8), uc(el >> 16), uc(el >> 24)});
123 |     }
124 |     result.resize(size);
125 |     return result;
126 | }
127 |
128 | inline std::vector<index> build_ranks_uniform(index size, index count) {
129 |     std::vector<index> result;
130 |     for (index i = 0; i < count; ++i) {
131 |         result.push_back(int(double(i) * size / count));
132 |     }
133 |     return result;
134 | }
135 |
136 | inline std::vector<index> build_ranks_clustered(index size)
{ 137 | std::vector result; 138 | auto step = size / 2; 139 | while (step >= 1) { 140 | result.push_back(step); 141 | step = step / 2; 142 | } 143 | std::reverse(result.begin(), result.end()); 144 | return result; 145 | } 146 | 147 | } // namespace gpu -------------------------------------------------------------------------------- /app/test_main.cu: -------------------------------------------------------------------------------- 1 | #define CATCH_CONFIG_MAIN 2 | #include "catch.hpp" 3 | -------------------------------------------------------------------------------- /app/test_qs.cu: -------------------------------------------------------------------------------- 1 | /* 2 | * Parallel selection algorithm on GPUs 3 | * Copyright (c) 2018-2019 Tobias Ribizel (oss@ribizel.de) 4 | * 5 | * This program is free software: you can redistribute it and/or modify 6 | * it under the terms of the GNU General Public License as published by 7 | * the Free Software Foundation, version 3. 8 | * 9 | * This program is distributed in the hope that it will be useful, but 10 | * WITHOUT ANY WARRANTY; without even the implied warranty of 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 12 | * General Public License for more details. 13 | * 14 | * You should have received a copy of the GNU General Public License 15 | * along with this program. If not, see . 16 | */ 17 | #include "catch.hpp" 18 | #include "test_fixture.cuh" 19 | 20 | #include 21 | #include 22 | #include 23 | #include 24 | 25 | namespace gpu { 26 | 27 | template 28 | using test_data = basic_test_data; 29 | 30 | template 31 | using float_pair = typename std::pair; 32 | 33 | template 34 | using double_pair = typename std::pair; 35 | 36 | TEMPLATE_PRODUCT_TEST_CASE_METHOD(test_data, "bipartition", "[quickselect]", 37 | (float_pair, double_pair), 38 | ((select_config<10, 5, 8, true, true, true, 8, 10, 10>), 39 | (select_config<10, 5, 8, false, true, true, 8, 10, 10>))) { 40 | using T = typename TestType::first_type; 41 | using Config = typename TestType::second_type; 42 | this->run([&]() { 43 | partition(this->gpu_data, this->gpu_data_out, this->gpu_count_out, this->size, 44 | this->pivot); 45 | }); 46 | auto lsize = this->count_out[0]; 47 | auto rsize = this->count_out[1]; 48 | CHECK(lsize + rsize == this->size); 49 | auto counts = verification::count_mispartitioned(this->data_out, lsize, this->pivot); 50 | auto lcount = counts.first; 51 | auto rcount = counts.second; 52 | CHECK(lcount == 0); 53 | CHECK(rcount == 0); 54 | } 55 | 56 | TEMPLATE_PRODUCT_TEST_CASE_METHOD(test_data, "quickselect", "[quickselect]", 57 | (float_pair, double_pair), 58 | ((select_config<10, 5, 8, true, true, true, 8, 10, 10>), 59 | (select_config<10, 5, 8, false, true, true, 8, 10, 10>))) { 60 | using T = typename TestType::first_type; 61 | using Config = typename TestType::second_type; 62 | this->run([&]() { 63 | quickselect(this->gpu_data, this->gpu_data_tmp, this->gpu_count_tmp, this->size, this->rank, 64 | this->gpu_data_out); 65 | }); 66 | auto ref = verification::nth_element(this->data, this->rank); 67 | CHECK(ref == this->data_out[0]); 68 | } 69 | 70 | TEMPLATE_PRODUCT_TEST_CASE_METHOD(test_data, "quickselect_multi", "[quickselect]", 71 | (float_pair, double_pair), 72 | ((select_config<10, 5, 8, true, true, true, 8, 10, 10>), 73 | (select_config<10, 5, 8, false, true, true, 8, 10, 10>))) { 74 | using T = typename TestType::first_type; 75 | using Config = typename TestType::second_type; 76 | std::vector ranks; 77 | SECTION("some ranks") { 78 | for 
(int i = 0; i < 100; ++i) { 79 | ranks.push_back(this->size * i / 120); 80 | ranks.push_back(this->size * i / 120 + 1); 81 | ranks.push_back(this->size * i / 120 + 2); 82 | ranks.push_back(this->size * i / 120 + 10); 83 | } 84 | for (int i = 0; i < 6000; ++i) { 85 | ranks.push_back(i + 4000); 86 | } 87 | std::sort(ranks.begin(), ranks.end()); 88 | ranks.erase(std::unique(ranks.begin(), ranks.end()), ranks.end()); 89 | } 90 | SECTION("all ranks") { 91 | ranks.resize(this->size); 92 | std::iota(ranks.begin(), ranks.end(), 0); 93 | } 94 | std::vector result(ranks.size()); 95 | this->gpu_ranks.copy_from(ranks); 96 | this->gpu_data_out.copy_from(result); 97 | this->run([&]() { 98 | quickselect_multi(this->gpu_data, this->gpu_data_tmp, this->gpu_count_tmp, this->size, this->gpu_ranks, ranks.size(), 99 | this->gpu_data_out); 100 | }); 101 | auto ref = this->data; 102 | std::sort(ref.begin(), ref.end()); 103 | this->gpu_data_out.copy_to(result); 104 | std::vector reference; 105 | for (auto rank : ranks) { 106 | reference.push_back(ref[rank]); 107 | } 108 | int count{}; 109 | for (index i = 0; i < reference.size(); ++i) { 110 | count += reference[i] != result[i]; 111 | } 112 | CAPTURE(reference.size()); 113 | CHECK(count == 0); 114 | } 115 | 116 | } // namespace gpu -------------------------------------------------------------------------------- /include/cpu_reference.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Parallel selection algorithm on GPUs 3 | * Copyright (c) 2018-2019 Tobias Ribizel (oss@ribizel.de) 4 | * 5 | * This program is free software: you can redistribute it and/or modify 6 | * it under the terms of the GNU General Public License as published by 7 | * the Free Software Foundation, version 3. 8 | * 9 | * This program is distributed in the hope that it will be useful, but 10 | * WITHOUT ANY WARRANTY; without even the implied warranty of 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 12 | * General Public License for more details. 13 | * 14 | * You should have received a copy of the GNU General Public License 15 | * along with this program. If not, see . 
16 | */
17 | #ifndef GPU_SELECTION_CPU_REFERENCE_HPP
18 | #define GPU_SELECTION_CPU_REFERENCE_HPP
19 |
20 | #include <vector>
21 | #include <cuda_definitions.cuh>
22 |
23 | namespace cpu {
24 |
25 | using gpu::index;
26 | using gpu::mask;
27 |
28 | template <typename T>
29 | std::pair<int, int> partition(const std::vector<T>& data, int begin, int end, std::vector<T>& out,
30 |                               int pivot_idx);
31 |
32 | template <typename T>
33 | T quickselect(std::vector<T>& in, std::vector<T>& out, int rank);
34 |
35 | template <typename T>
36 | std::vector<T> build_searchtree(const std::vector<T>& in, int sample_size, int searchtree_size);
37 |
38 | template <typename T>
39 | std::pair<std::vector<index>, std::vector<gpu::oracle>> ssss(const std::vector<T>& data,
40 |                                                              const std::vector<T>& tree, bool write);
41 |
42 | std::vector<index> grouped_reduce(const std::vector<index>& data, int searchtree_size);
43 | std::vector<index> grouped_prefix_sum(const std::vector<index>& data, int searchtree_size);
44 |
45 | std::vector<index> compute_rank_ranges(std::vector<index> counts, const std::vector<index>& ranks);
46 | std::vector<mask> compute_bucket_mask(const std::vector<index>& rank_ranges);
47 |
48 | std::pair<std::vector<index>, index> masked_prefix_sum(const std::vector<index>& counts, const std::vector<mask>& m);
49 |
50 | } // namespace cpu
51 |
52 | #endif // GPU_SELECTION_CPU_REFERENCE_HPP
53 |
--------------------------------------------------------------------------------
/include/cuda_definitions.cuh:
--------------------------------------------------------------------------------
1 | /*
2 | * Parallel selection algorithm on GPUs
3 | * Copyright (c) 2018-2019 Tobias Ribizel (oss@ribizel.de)
4 | *
5 | * This program is free software: you can redistribute it and/or modify
6 | * it under the terms of the GNU General Public License as published by
7 | * the Free Software Foundation, version 3.
8 | *
9 | * This program is distributed in the hope that it will be useful, but
10 | * WITHOUT ANY WARRANTY; without even the implied warranty of
11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 | * General Public License for more details.
13 | *
14 | * You should have received a copy of the GNU General Public License
15 | * along with this program. If not, see <https://www.gnu.org/licenses/>.
16 | */
17 | #ifndef GPU_SELECTION_CUDA_DEFINITIONS_CUH
18 | #define GPU_SELECTION_CUDA_DEFINITIONS_CUH
19 |
20 | #include <cstdint>
21 |
22 | namespace gpu {
23 |
24 | using index = std::uint32_t;
25 | using poracle = std::uint32_t;
26 | using oracle = std::uint32_t;
27 | using mask = std::uint32_t;
28 |
29 | constexpr index warp_size_log2 = 5;
30 | constexpr index warp_size = 1 << warp_size_log2;
31 | constexpr index max_block_size_log2 = 10;
32 | constexpr index max_block_size = 1 << max_block_size_log2;
33 |
34 | } // namespace gpu
35 |
36 | #endif // GPU_SELECTION_CUDA_DEFINITIONS_CUH
37 |
--------------------------------------------------------------------------------
/include/cuda_error.cuh:
--------------------------------------------------------------------------------
1 | /*
2 | * Parallel selection algorithm on GPUs
3 | * Copyright (c) 2018-2019 Tobias Ribizel (oss@ribizel.de)
4 | *
5 | * This program is free software: you can redistribute it and/or modify
6 | * it under the terms of the GNU General Public License as published by
7 | * the Free Software Foundation, version 3.
8 | *
9 | * This program is distributed in the hope that it will be useful, but
10 | * WITHOUT ANY WARRANTY; without even the implied warranty of
11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 | * General Public License for more details.
13 | *
14 | * You should have received a copy of the GNU General Public License
15 | * along with this program. If not, see <https://www.gnu.org/licenses/>.
16 | */
17 | #ifndef CUDA_CHECK_ERROR_CUH
18 | #define CUDA_CHECK_ERROR_CUH
19 |
20 | #include <stdexcept>
21 |
22 | inline void cudaCheckError(cudaError_t error) {
23 |     if (error != cudaSuccess) {
24 |         std::string msg{"CUDA error "};
25 |         msg += cudaGetErrorName(error);
26 |         msg += ": ";
27 |         msg += cudaGetErrorString(error);
28 |         throw std::runtime_error{msg};
29 |     }
30 | }
31 |
32 | template <typename F>
33 | void cudaChecked(F func) {
34 |     func();
35 |     cudaDeviceSynchronize();
36 |     cudaCheckError(cudaGetLastError());
37 | }
38 |
39 | #endif // CUDA_CHECK_ERROR_CUH
40 |
--------------------------------------------------------------------------------
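A minimal usage sketch for these two wrappers (the `scale` kernel and `scale_checked` function are hypothetical, not part of this repository):

    __global__ void scale(float* data, float factor) { data[threadIdx.x] *= factor; }

    void scale_checked(float* device_data) {
        // cudaChecked runs the launch, synchronizes, and turns any pending
        // launch or runtime error into a std::runtime_error:
        cudaChecked([&]() { scale<<<1, 32>>>(device_data, 2.0f); });
        // cudaCheckError converts an explicit cudaError_t return value:
        cudaCheckError(cudaMemset(device_data, 0, 32 * sizeof(float)));
    }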
/include/cuda_memory.cuh:
--------------------------------------------------------------------------------
1 | /*
2 | * Parallel selection algorithm on GPUs
3 | * Copyright (c) 2018-2019 Tobias Ribizel (oss@ribizel.de)
4 | *
5 | * This program is free software: you can redistribute it and/or modify
6 | * it under the terms of the GNU General Public License as published by
7 | * the Free Software Foundation, version 3.
8 | *
9 | * This program is distributed in the hope that it will be useful, but
10 | * WITHOUT ANY WARRANTY; without even the implied warranty of
11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 | * General Public License for more details.
13 | *
14 | * You should have received a copy of the GNU General Public License
15 | * along with this program. If not, see <https://www.gnu.org/licenses/>.
16 | */
17 | #ifndef CUDA_MEMORY_CUH
18 | #define CUDA_MEMORY_CUH
19 |
20 | #include <iostream>
21 | #include <vector>
22 |
23 | #include "cuda_error.cuh"
24 |
25 | template <typename T>
26 | class cuda_resettable_array;
27 |
28 | template <typename T>
29 | class cuda_array {
30 |     friend class cuda_resettable_array<T>;
31 | public:
32 |     cuda_array() : size{}, storage{nullptr} {}
33 |     cuda_array(std::size_t size) : size{size}, storage{nullptr} {
34 |         cudaCheckError(cudaMalloc(&storage, sizeof(T) * size));
35 |     }
36 |     ~cuda_array() {
37 |         if (storage) {
38 |             try {
39 |                 cudaCheckError(cudaFree(storage));
40 |             } catch (std::runtime_error& err) {
41 |                 std::cerr << err.what() << std::endl;
42 |             }
43 |         }
44 |     }
45 |     cuda_array(const cuda_array&) = delete;
46 |     cuda_array(cuda_array&& other) {
47 |         storage = other.storage;
48 |         size = other.size;
49 |         other.storage = nullptr;
50 |         other.size = 0;
51 |     }
52 |     cuda_array& operator=(cuda_array&& other) {
53 |         this->~cuda_array();
54 |         storage = other.storage;
55 |         size = other.size;
56 |         other.storage = nullptr;
57 |         other.size = 0;
58 |         return *this;
59 |     }
60 |
61 |     operator T*() { return storage; }
62 |
63 |     void copy_from_raw(const T* src) {
64 |         cudaCheckError(cudaMemcpy(storage, src, size * sizeof(T), cudaMemcpyHostToDevice));
65 |     }
66 |
67 |     void copy_to_raw(T* dst) const {
68 |         cudaCheckError(cudaMemcpy(dst, storage, size * sizeof(T), cudaMemcpyDeviceToHost));
69 |     }
70 |
71 |     void copy_from(const std::vector<T>& vec) {
72 |         if (size != vec.size()) {
73 |             *this = cuda_array{vec.size()};
74 |         }
75 |         copy_from_raw(vec.data());
76 |     }
77 |
78 |     void copy_to(std::vector<T>& vec) const {
79 |         vec.resize(size);
80 |         copy_to_raw(vec.data());
81 |     }
82 |
83 | private:
84 |     std::size_t size;
85 |     T* storage;
86 | };
87 |
88 | template <typename T>
89 | class cuda_resettable_array {
90 | public:
91 |     void copy_from_raw(const T* src) {
92 |         storage.copy_from_raw(src);
93 |         refstorage.copy_from_raw(src);
94 |     }
95 |
96 |     void copy_to_raw(T* dst) const {
97 |         storage.copy_to_raw(dst);
98 |     }
99 |
100 |     void copy_from(const std::vector<T>& vec) {
101 |         storage.copy_from(vec);
102 |         refstorage.copy_from(vec);
103 |     }
104 |
105 |     void copy_to(std::vector<T>& vec) const {
106 |         storage.copy_to(vec);
107 |     }
108 |
109 |     void reset() {
110 |         cudaCheckError(cudaMemcpy(storage, refstorage, storage.size * sizeof(T), cudaMemcpyDeviceToDevice));
111 |     }
112 |
113 |     operator T*() { return storage; }
114 |
115 | private:
116 |     cuda_array<T> storage;
117 |     cuda_array<T> refstorage;
118 | };
119 |
120 | #endif // CUDA_MEMORY_CUH
121 |
--------------------------------------------------------------------------------
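A usage sketch for these wrappers (hypothetical host code; the error handling comes from cuda_error.cuh above):

    std::vector<float> host{1.0f, 2.0f, 3.0f};
    cuda_array<float> device;
    device.copy_from(host);   // allocates (or reallocates) and copies host -> device
    // cuda_array<T> converts implicitly to T*, e.g. for kernel arguments
    std::vector<float> result;
    device.copy_to(result);   // resizes the vector and copies device -> host

cuda_resettable_array additionally keeps an untouched reference copy on the device, so reset() can restore the original contents between benchmark runs without another host transfer.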
/include/cuda_timer.cuh:
--------------------------------------------------------------------------------
1 | /*
2 | * Parallel selection algorithm on GPUs
3 | * Copyright (c) 2018-2019 Tobias Ribizel (oss@ribizel.de)
4 | *
5 | * This program is free software: you can redistribute it and/or modify
6 | * it under the terms of the GNU General Public License as published by
7 | * the Free Software Foundation, version 3.
8 | *
9 | * This program is distributed in the hope that it will be useful, but
10 | * WITHOUT ANY WARRANTY; without even the implied warranty of
11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 | * General Public License for more details.
13 | *
14 | * You should have received a copy of the GNU General Public License
15 | * along with this program. If not, see <https://www.gnu.org/licenses/>.
16 | */
17 | #ifndef CUDA_TIMER_CUH
18 | #define CUDA_TIMER_CUH
19 |
20 | #include "cuda_error.cuh"
21 |
22 | #include <algorithm>
23 | #include <chrono>
24 | #include <iostream>
25 | #include <iterator>
26 | #include <vector>
27 |
28 | class cuda_timer {
29 | public:
30 |     cuda_timer(std::ostream& output) : m_events(6), m_output{&output} {
31 |         for (auto& event : m_events) {
32 |             cudaCheckError(cudaEventCreate(&event));
33 |         }
34 |     }
35 |
36 |     ~cuda_timer() {
37 |         for (auto& event : m_events) {
38 |             cudaEventDestroy(event);
39 |         }
40 |     }
41 |
42 |     template <typename Kernel>
43 |     void timed(std::string name, int num_runs, Kernel kernel) {
44 |         std::vector<std::vector<float>> results(num_runs, std::vector<float>(m_events.size() - 1));
45 |         int max_event = -1;
46 |         auto event = [&](int idx_event) {
47 |             cudaCheckError(cudaEventRecord(m_events[idx_event]));
48 |             max_event = std::max(idx_event, max_event);
49 |         };
50 |         for (int i = 0; i < num_runs; ++i) {
51 |             cudaChecked([&]() { kernel(event); });
52 |             cudaCheckError(cudaEventSynchronize(m_events[max_event]));
53 |             for (int j = 0; j < max_event; ++j) {
54 |                 cudaCheckError(cudaEventElapsedTime(&results[i][j], m_events[j], m_events[j + 1]));
55 |             }
56 |         }
57 |         auto& out = *m_output;
58 |         out << name;
59 |         for (const auto& run : results) {
60 |             out << ",(";
61 |             std::copy(run.begin(), run.begin() + max_event - 1, std::ostream_iterator<float>(out, ";"));
62 |             out << run[max_event - 1] << ')';
63 |         }
64 |         out << std::endl; // flush output (in case of errors!)
65 |     }
66 |
67 | private:
68 |     std::vector<cudaEvent_t> m_events;
69 |     std::ostream* m_output;
70 | };
71 |
72 | class cpu_timer {
73 | public:
74 |     void start() { m_start = std::chrono::high_resolution_clock::now(); }
75 |     void stop() { m_end = std::chrono::high_resolution_clock::now(); }
76 |     template <typename F>
77 |     void timed(F f) {
78 |         start();
79 |         f();
80 |         stop();
81 |     }
82 |     double elapsed_us(int repetitions = 1) {
83 |         return std::chrono::duration<double, std::micro>(m_end - m_start).count() / repetitions;
84 |     }
85 |
86 | private:
87 |     std::chrono::high_resolution_clock::time_point m_start;
88 |     std::chrono::high_resolution_clock::time_point m_end;
89 | };
90 |
91 | #endif // CUDA_TIMER_CUH
92 |
--------------------------------------------------------------------------------
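A usage sketch for cuda_timer (the `fill` kernel and `time_fill` function are hypothetical; each event index marks a phase boundary, and the timer measures the elapsed time between consecutive events):

    __global__ void fill(float* data, float value) { data[threadIdx.x] = value; }

    void time_fill(float* device_data) {
        cuda_timer timer{std::cerr};
        timer.timed("fill-demo", 10, [&](auto event) {
            event(0);
            fill<<<1, 256>>>(device_data, 1.0f);
            event(1);  // elapsed(0 -> 1): first phase
            fill<<<1, 256>>>(device_data, 2.0f);
            event(2);  // elapsed(1 -> 2): second phase
        });
        // stderr now holds: fill-demo,(t1;t2),(t1;t2),... with one group per run
    }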
/include/kernel_config.cuh:
--------------------------------------------------------------------------------
1 | /*
2 | * Parallel selection algorithm on GPUs
3 | * Copyright (c) 2018-2019 Tobias Ribizel (oss@ribizel.de)
4 | *
5 | * This program is free software: you can redistribute it and/or modify
6 | * it under the terms of the GNU General Public License as published by
7 | * the Free Software Foundation, version 3.
8 | *
9 | * This program is distributed in the hope that it will be useful, but
10 | * WITHOUT ANY WARRANTY; without even the implied warranty of
11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 | * General Public License for more details.
13 | *
14 | * You should have received a copy of the GNU General Public License
15 | * along with this program. If not, see <https://www.gnu.org/licenses/>.
16 | */
17 | #include "cuda_definitions.cuh"
18 | #include <algorithm>
19 |
20 | namespace gpu {
21 |
22 | template <index Size_log2, index Local_size_log2>
23 | struct bitonic_basecase_config {
24 |     constexpr static index size_log2 = Size_log2;
25 |     constexpr static index size = 1 << size_log2;
26 |     constexpr static index local_size_log2 = Local_size_log2;
27 |     constexpr static index local_size = 1 << local_size_log2;
28 |     constexpr static index launch_size = size / local_size;
29 | };
30 |
31 | template <index Size_log2>
32 | struct sample_config {
33 |     constexpr static index size_log2 = Size_log2;
34 |     constexpr static index size = 1 << size_log2;
35 |     constexpr static index local_size_log2 = size_log2 > max_block_size_log2 ? size_log2 - max_block_size_log2 : 0;
36 |     constexpr static index local_size = 1 << local_size_log2;
37 | };
38 |
39 | template <index Height>
40 | struct searchtree_config {
41 |     constexpr static index height = Height;
42 |     constexpr static index width = 1 << height;
43 |     constexpr static index size = 2 * width - 1;
44 | };
45 |
46 | template <bool Shared_memory, bool Warp_aggr, bool Write, index Unroll, index Max_block_size_log2,
47 |           index Max_block_count_log2, index Merged_limit, bool Bucket_select>
48 | struct algorithm_config {
49 |     constexpr static bool shared_memory = Shared_memory;
50 |     constexpr static bool warp_aggr = Warp_aggr;
51 |     constexpr static bool write = Write;
52 |     constexpr static index unroll = Unroll;
53 |     constexpr static index max_block_size_log2 = Max_block_size_log2;
54 |     constexpr static index max_block_size = 1 << max_block_size_log2;
55 |     constexpr static index max_block_count_log2 = Max_block_count_log2;
56 |     constexpr static index max_block_count = 1 << max_block_count_log2;
57 |     constexpr static index merged_limit = Merged_limit;
58 |     constexpr static bool bucket_select = Bucket_select;
59 | };
60 |
61 | template
64 | struct select_config {
65 |     using basecase = bitonic_basecase_config;
66 |     using sample = sample_config;
67 |     using searchtree = searchtree_config;
68 |     using algorithm = algorithm_config;
70 |     constexpr static auto searchtree_kernel_size = std::max(std::min(max_block_size, sample::size), searchtree::width);
71 | };
72 |
73 | } // namespace gpu
--------------------------------------------------------------------------------
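For orientation, the tests instantiate these configuration templates with explicit parameter packs, for example in app/test_qs.cu (trailing parameters keep their defaults):

    using Config = gpu::select_config<10, 5, 8, true, true, true, 8, 10, 10>;
    // The nested configs are then available as compile-time constants, e.g.
    // Config::basecase::size, Config::searchtree::width,
    // Config::algorithm::max_block_size and Config::searchtree_kernel_size.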
/include/launcher_fwd.cuh:
--------------------------------------------------------------------------------
1 | /*
2 | * Parallel selection algorithm on GPUs
3 | * Copyright (c) 2018-2019 Tobias Ribizel (oss@ribizel.de)
4 | *
5 | * This program is free software: you can redistribute it and/or modify
6 | * it under the terms of the GNU General Public License as published by
7 | * the Free Software Foundation, version 3.
8 | *
9 | * This program is distributed in the hope that it will be useful, but
10 | * WITHOUT ANY WARRANTY; without even the implied warranty of
11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 | * General Public License for more details.
13 | *
14 | * You should have received a copy of the GNU General Public License
15 | * along with this program. If not, see <https://www.gnu.org/licenses/>.
16 | */
17 | #ifndef LAUNCHER_FWD_CUH
18 | #define LAUNCHER_FWD_CUH
19 |
20 | #include "cuda_definitions.cuh"
21 | #include "ssss_merged_memory.cuh"
22 |
23 | namespace gpu {
24 |
25 | namespace kernels {
26 | template <typename T, typename Config>
27 | struct ssss_multi_aux;
28 |
29 | template <typename T, typename Config>
30 | __global__ void partition(const T* in, T* out, index* atomic, index size, T pivot, index workcount);
31 |
32 | template <typename T, typename Config>
33 | __global__ void partition_count(const T* in, index* counts, index size, T pivot, index workcount);
34 |
35 | template <typename T, typename Config>
36 | __global__ void partition_distr(const T* in, T* out, const index* counts, index size, T pivot, index workcount);
37 |
38 | template <typename Config>
39 | __global__ void reduce_counts(const index* in, index* out, index num_blocks);
40 |
41 | template <typename Config>
42 | __global__ void prefix_sum_counts(index* in, index* out, index num_blocks);
43 |
44 | template <typename Config>
45 | __global__ void partition_prefixsum(index* counts, index block_count);
46 |
47 | template <typename T, typename Config>
48 | __global__ void count_buckets(const T* in, const T* tree, index* counts, poracle* oracles, index size, index workcount);
49 |
50 | template
51 | __device__ void masked_prefix_sum(index* counts, const mask* m);
52 | }
53 |
54 | template <typename T, typename Config>
55 | __host__ __device__ void build_searchtree(const T* in, T* out, index size);
56 |
57 | template <typename T, typename Config>
58 | __host__ __device__ void count_buckets(const T* in, const T* tree, index* localcounts,
59 |                                        index* counts, poracle* oracles, index size);
60 |
61 | template <typename T, typename Config>
62 | __host__ __device__ void collect_bucket(const T* data, const poracle* oracles_packed,
63 |                                         const index* prefix_sum, T* out, index size, oracle bucket,
64 |                                         index* atomic);
65 |
66 | template <typename T, typename Config>
67 | __host__ __device__ void collect_bucket_indirect(const T* data, const poracle* oracles_packed,
68 |                                                  const index* prefix_sum, T* out, index size,
69 |                                                  const oracle* bucket, index* atomic);
70 |
71 | template <typename T, typename Config>
72 | __host__ __device__ void collect_buckets(const T* data, const poracle* oracles_packed,
73 |                                          const index* block_prefix_sum, const index* bucket_out_ranges,
74 |                                          T* out, index size, mask* buckets, index* atomic);
75 |
76 | template <typename T, typename Config>
77 | __host__ __device__ void ssss_merged(
78 |     const T* in,
79 |     T* out,
80 |     poracle* oracles,
81 |     index offset,
82 |     const index* ranks,
83 |     index rank_offset,
84 |     index rank_base,
85 |     const kernels::ssss_multi_aux<T, Config>* aux_in,
86 |     kernels::ssss_multi_aux<T, Config>* aux_outs,
87 |     T* out_trees);
88 |
89 | template <typename T, typename Config>
90 | void sampleselect(T* in, T* tmp, T* tree, index* count_tmp, index size, index rank, T* out);
91 |
92 | template <typename T, typename Config>
93 | void sampleselect_host(T* in, T* tmp, T* tree, index* count_tmp, index size, index rank, T* out);
94 |
95 | template <typename T, typename Config>
96 | void sampleselect_multi(T* in, T* tmp, index size, const index* ranks, index rank_count, index* tmp_storage, index* aux_storage, index* aux_atomic, T* out);
97 |
98 | template <typename T, typename Config>
99 | __device__ __host__ void partition(const T* in, T* out, index* counts, index size, T pivot);
100 |
101 | template <typename T, typename Config>
102 | void quickselect_multi(T* in, T* tmp, index* count_tmp, index size, const index* ranks, index rank_count, T* out);
103 |
104 | template <typename T, typename Config>
105 | void quickselect(T* in, T* tmp, index* count_tmp, index size, index rank, T* out);
106 |
107 | template <typename Config>
108 | __host__ __device__ launch_parameters get_launch_parameters(index size);
109 |
110 | } // namespace gpu
111 |
112 | #endif // LAUNCHER_FWD_CUH
--------------------------------------------------------------------------------
/include/verification.hpp:
--------------------------------------------------------------------------------
1 | /*
2 | * Parallel
selection algorithm on GPUs 3 | * Copyright (c) 2018-2019 Tobias Ribizel (oss@ribizel.de) 4 | * 5 | * This program is free software: you can redistribute it and/or modify 6 | * it under the terms of the GNU General Public License as published by 7 | * the Free Software Foundation, version 3. 8 | * 9 | * This program is distributed in the hope that it will be useful, but 10 | * WITHOUT ANY WARRANTY; without even the implied warranty of 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 12 | * General Public License for more details. 13 | * 14 | * You should have received a copy of the GNU General Public License 15 | * along with this program. If not, see . 16 | */ 17 | #ifndef GPU_SELECTION_VERIFICATION_HPP 18 | #define GPU_SELECTION_VERIFICATION_HPP 19 | 20 | #include 21 | #include 22 | #include 23 | 24 | namespace verification { 25 | 26 | using gpu::index; 27 | using gpu::mask; 28 | 29 | template 30 | std::pair count_mispartitioned(const std::vector& data, int pivot_rank, T pivot); 31 | 32 | template 33 | T nth_element(const std::vector& data, int rank); 34 | 35 | template 36 | std::vector nth_elements(const std::vector& data, std::vector ranks); 37 | 38 | template 39 | int count_not_in_bucket(const std::vector& data, T lower, T upper); 40 | 41 | template 42 | std::vector count_not_in_buckets(const std::vector& data, std::vector prefix_sum, const std::vector& searchtree); 43 | 44 | bool verify_rank_ranges(const std::vector& ranks, const std::vector& index_ranges, const std::vector& rank_ranges); 45 | 46 | } // namespace verification 47 | 48 | #endif // GPU_SELECTION_VERIFICATION_HPP 49 | -------------------------------------------------------------------------------- /lib/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_library(gpu_selection 2 | cpu_reference.cpp 3 | verification.cpp 4 | generated/gen0.cu 5 | generated/gen1.cu 6 | generated/gen2.cu 7 | generated/gen3.cu 8 | generated/gen4.cu 9 | generated/gen5.cu 10 | generated/gen6.cu 11 | generated/gen7.cu 12 | generated/gen8.cu 13 | generated/gen9.cu 14 | generated/gen10.cu 15 | generated/gen11.cu 16 | generated/gen12.cu 17 | generated/gen13.cu 18 | generated/gen14.cu 19 | generated/gen15.cu 20 | generated/gen16.cu 21 | generated/gen17.cu 22 | generated/gen18.cu 23 | generated/gen19.cu 24 | generated/gen20.cu 25 | generated/gen21.cu 26 | generated/gen22.cu 27 | generated/gen23.cu 28 | generated/gen24.cu 29 | generated/gen25.cu 30 | generated/gen26.cu 31 | generated/gen27.cu 32 | generated/gen28.cu 33 | generated/gen29.cu 34 | generated/gen30.cu 35 | generated/gen31.cu 36 | generated/gen32.cu 37 | generated/gen33.cu 38 | generated/gen34.cu 39 | generated/gen35.cu 40 | generated/gen36.cu 41 | generated/gen37.cu 42 | generated/gen38.cu 43 | generated/gen39.cu 44 | ) 45 | 46 | target_compile_features(gpu_selection PUBLIC cxx_std_14) 47 | set_target_properties(gpu_selection PROPERTIES CUDA_SEPARABLE_COMPILATION ON) 48 | 49 | target_include_directories(gpu_selection PUBLIC "${CMAKE_CURRENT_SOURCE_DIR}") 50 | target_include_directories(gpu_selection PUBLIC "${CMAKE_CURRENT_SOURCE_DIR}/../include") 51 | -------------------------------------------------------------------------------- /lib/generated/gen0.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | namespace gpu { 9 | 10 | template void sampleselect>(double* in, double* tmp, double* 
tree, index* count_tmp, index size, index rank, double* out); 11 | template __host__ __device__ void collect_bucket_indirect>(const float* data, const poracle* oracles_packed, const index* prefix_sum, float* out, index size, const oracle* bucket, index* atomic); 12 | template void sampleselect>(double* in, double* tmp, double* tree, index* count_tmp, index size, index rank, double* out); 13 | template void sampleselect>(float* in, float* tmp, float* tree, index* count_tmp, index size, index rank, float* out); 14 | template __host__ __device__ void count_buckets>(const float* in, const float* tree, index* localcounts, index* counts, poracle* oracles, index size); 15 | template __host__ __device__ void collect_bucket>(const double* data, const poracle* oracles_packed, const index* prefix_sum, double* out, index size, oracle bucket, index* atomic); 16 | template __global__ void kernels::partition_prefixsum>(index* counts, index block_count); 17 | template void sampleselect>(float* in, float* tmp, float* tree, index* count_tmp, index size, index rank, float* out); 18 | template __host__ __device__ void collect_bucket>(const double* data, const poracle* oracles_packed, const index* prefix_sum, double* out, index size, oracle bucket, index* atomic); 19 | template void quickselect_multi>(float* in, float* tmp, index* count_tmp, index size, const index* ranks, index rank_count, float* out); 20 | template __global__ void kernels::partition_prefixsum>(index* counts, index block_count); 21 | } -------------------------------------------------------------------------------- /lib/generated/gen1.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | namespace gpu { 9 | template __host__ __device__ launch_parameters get_launch_parameters>(index size); 10 | template void quickselect>(float* in, float* tmp, index* count_tmp, index size, index rank, float* out); 11 | template __device__ __host__ void partition>(const float* in, float* out, index* counts, index size, float pivot); 12 | template __global__ void kernels::count_buckets>(const double* in, const double* tree, index* counts, poracle* oracles, index size, index workcount); 13 | template __host__ __device__ launch_parameters get_launch_parameters>(index size); 14 | template __global__ void kernels::partition>(const double* in, double* out, index* atomic, index size, double pivot, index workcount); 15 | template __global__ void kernels::count_buckets>(const float* in, const float* tree, index* counts, poracle* oracles, index size, index workcount); 16 | template __global__ void kernels::count_buckets>(const float* in, const float* tree, index* counts, poracle* oracles, index size, index workcount); 17 | template void sampleselect_multi>(float* in, float* tmp, index size, const index* ranks, index rank_count, index* tmp_storage, index* aux_storage, index* aux_atomic, float* out); 18 | template __host__ __device__ launch_parameters get_launch_parameters>(index size); 19 | template void sampleselect>(float* in, float* tmp, float* tree, index* count_tmp, index size, index rank, float* out); 20 | template void sampleselect_multi>(float* in, float* tmp, index size, const index* ranks, index rank_count, index* tmp_storage, index* aux_storage, index* aux_atomic, float* out); 21 | } -------------------------------------------------------------------------------- /lib/generated/gen10.cu: 
-------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | namespace gpu { 9 | template void sampleselect_multi>(double* in, double* tmp, index size, const index* ranks, index rank_count, index* tmp_storage, index* aux_storage, index* aux_atomic, double* out); 10 | template __host__ __device__ void collect_bucket_indirect>(const float* data, const poracle* oracles_packed, const index* prefix_sum, float* out, index size, const oracle* bucket, index* atomic); 11 | template void sampleselect_multi>(double* in, double* tmp, index size, const index* ranks, index rank_count, index* tmp_storage, index* aux_storage, index* aux_atomic, double* out); 12 | template void sampleselect_host>(double* in, double* tmp, double* tree, index* count_tmp, index size, index rank, double* out); 13 | template __host__ __device__ void build_searchtree>(const float* in, float* out, index size); 14 | template __host__ __device__ void ssss_merged>(const float* in, float* out, poracle* oracles, index offset, const index* ranks, index rank_offset, index rank_base, const kernels::ssss_multi_aux>* aux_in, kernels::ssss_multi_aux>* aux_outs, float* out_tree); 15 | template void quickselect>(double* in, double* tmp, index* count_tmp, index size, index rank, double* out); 16 | template void sampleselect>(float* in, float* tmp, float* tree, index* count_tmp, index size, index rank, float* out); 17 | template __host__ __device__ void collect_bucket_indirect>(const double* data, const poracle* oracles_packed, const index* prefix_sum, double* out, index size, const oracle* bucket, index* atomic); 18 | template void sampleselect>(double* in, double* tmp, double* tree, index* count_tmp, index size, index rank, double* out); 19 | template __host__ __device__ void count_buckets>(const double* in, const double* tree, index* localcounts, index* counts, poracle* oracles, index size); 20 | template __host__ __device__ void collect_bucket>(const float* data, const poracle* oracles_packed, const index* prefix_sum, float* out, index size, oracle bucket, index* atomic); 21 | } -------------------------------------------------------------------------------- /lib/generated/gen11.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | namespace gpu { 9 | template __global__ void kernels::reduce_counts>(const index* in, index* out, index); 10 | template void quickselect>(double* in, double* tmp, index* count_tmp, index size, index rank, double* out); 11 | template void sampleselect>(double* in, double* tmp, double* tree, index* count_tmp, index size, index rank, double* out); 12 | template __host__ __device__ launch_parameters get_launch_parameters>(index size); 13 | template void quickselect>(float* in, float* tmp, index* count_tmp, index size, index rank, float* out); 14 | template __host__ __device__ void build_searchtree>(const float* in, float* out, index size); 15 | template void sampleselect_multi>(float* in, float* tmp, index size, const index* ranks, index rank_count, index* tmp_storage, index* aux_storage, index* aux_atomic, float* out); 16 | template __device__ void kernels::masked_prefix_sum<8>(index* counts, const mask* m); 17 | template void sampleselect_host>(float* in, float* tmp, float* tree, index* count_tmp, index size, index rank, float* out); 18 | template __global__ void 
kernels::count_buckets>(const float* in, const float* tree, index* counts, poracle* oracles, index size, index workcount); 19 | template void sampleselect_host>(double* in, double* tmp, double* tree, index* count_tmp, index size, index rank, double* out); 20 | } -------------------------------------------------------------------------------- /lib/generated/gen12.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | namespace gpu { 9 | template __global__ void kernels::count_buckets>(const float* in, const float* tree, index* counts, poracle* oracles, index size, index workcount); 10 | template void sampleselect>(double* in, double* tmp, double* tree, index* count_tmp, index size, index rank, double* out); 11 | template __host__ __device__ void collect_bucket>(const double* data, const poracle* oracles_packed, const index* prefix_sum, double* out, index size, oracle bucket, index* atomic); 12 | template __global__ void kernels::reduce_counts>(const index* in, index* out, index); 13 | template void quickselect>(float* in, float* tmp, index* count_tmp, index size, index rank, float* out); 14 | template void sampleselect_host>(float* in, float* tmp, float* tree, index* count_tmp, index size, index rank, float* out); 15 | template void sampleselect_multi>(double* in, double* tmp, index size, const index* ranks, index rank_count, index* tmp_storage, index* aux_storage, index* aux_atomic, double* out); 16 | template void sampleselect_multi>(float* in, float* tmp, index size, const index* ranks, index rank_count, index* tmp_storage, index* aux_storage, index* aux_atomic, float* out); 17 | template __host__ __device__ void build_searchtree>(const float* in, float* out, index size); 18 | template void quickselect_multi>(double* in, double* tmp, index* count_tmp, index size, const index* ranks, index rank_count, double* out); 19 | template void sampleselect>(float* in, float* tmp, float* tree, index* count_tmp, index size, index rank, float* out); 20 | } -------------------------------------------------------------------------------- /lib/generated/gen13.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | namespace gpu { 9 | template void sampleselect_host>(double* in, double* tmp, double* tree, index* count_tmp, index size, index rank, double* out); 10 | template __global__ void kernels::reduce_counts>(const index* in, index* out, index); 11 | template void sampleselect>(double* in, double* tmp, double* tree, index* count_tmp, index size, index rank, double* out); 12 | template __host__ __device__ void collect_buckets>(const double* data, const poracle* oracles_packed, const index* block_prefix_sum, const index* bucket_out_ranges, double* out, index size, mask* buckets, index* atomic); 13 | template void sampleselect>(float* in, float* tmp, float* tree, index* count_tmp, index size, index rank, float* out); 14 | template void sampleselect_host>(float* in, float* tmp, float* tree, index* count_tmp, index size, index rank, float* out); 15 | template __host__ __device__ void count_buckets>(const float* in, const float* tree, index* localcounts, index* counts, poracle* oracles, index size); 16 | template void sampleselect_multi>(float* in, float* tmp, index size, const index* ranks, index rank_count, index* tmp_storage, index* aux_storage, index* 
aux_atomic, float* out); 17 | template __host__ __device__ launch_parameters get_launch_parameters>(index size); 18 | template void quickselect_multi>(float* in, float* tmp, index* count_tmp, index size, const index* ranks, index rank_count, float* out); 19 | template __host__ __device__ void build_searchtree>(const float* in, float* out, index size); 20 | } -------------------------------------------------------------------------------- /lib/generated/gen14.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | namespace gpu { 9 | template void sampleselect_host>(float* in, float* tmp, float* tree, index* count_tmp, index size, index rank, float* out); 10 | template void sampleselect_multi>(double* in, double* tmp, index size, const index* ranks, index rank_count, index* tmp_storage, index* aux_storage, index* aux_atomic, double* out); 11 | template __host__ __device__ launch_parameters get_launch_parameters>(index size); 12 | template __global__ void kernels::prefix_sum_counts>(index* in, index* out, index); 13 | template __global__ void kernels::count_buckets>(const double* in, const double* tree, index* counts, poracle* oracles, index size, index workcount); 14 | template void sampleselect_host>(float* in, float* tmp, float* tree, index* count_tmp, index size, index rank, float* out); 15 | template void sampleselect_multi>(double* in, double* tmp, index size, const index* ranks, index rank_count, index* tmp_storage, index* aux_storage, index* aux_atomic, double* out); 16 | template void sampleselect>(float* in, float* tmp, float* tree, index* count_tmp, index size, index rank, float* out); 17 | template __global__ void partition_count>(const double* in, index* counts, index size, double pivot, index workcount); 18 | template __host__ __device__ void build_searchtree>(const double* in, double* out, index size); 19 | template void sampleselect_host>(float* in, float* tmp, float* tree, index* count_tmp, index size, index rank, float* out); 20 | } -------------------------------------------------------------------------------- /lib/generated/gen15.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | namespace gpu { 9 | template __global__ void kernels::partition_distr>(const float* in, float* out, const index* counts, index size, float pivot, index workcount); 10 | template void quickselect_multi>(float* in, float* tmp, index* count_tmp, index size, const index* ranks, index rank_count, float* out); 11 | template __global__ void kernels::partition>(const double* in, double* out, index* atomic, index size, double pivot, index workcount); 12 | template void sampleselect_multi>(double* in, double* tmp, index size, const index* ranks, index rank_count, index* tmp_storage, index* aux_storage, index* aux_atomic, double* out); 13 | template __host__ __device__ void collect_bucket>(const float* data, const poracle* oracles_packed, const index* prefix_sum, float* out, index size, oracle bucket, index* atomic); 14 | template __host__ __device__ void collect_bucket_indirect>(const float* data, const poracle* oracles_packed, const index* prefix_sum, float* out, index size, const oracle* bucket, index* atomic); 15 | template __global__ void kernels::reduce_counts>(const index* in, index* out, index); 16 | template void sampleselect_multi>(double* in, 
double* tmp, index size, const index* ranks, index rank_count, index* tmp_storage, index* aux_storage, index* aux_atomic, double* out); 17 | template void sampleselect_host>(float* in, float* tmp, float* tree, index* count_tmp, index size, index rank, float* out); 18 | template void quickselect>(float* in, float* tmp, index* count_tmp, index size, index rank, float* out); 19 | template __global__ void kernels::count_buckets>(const double* in, const double* tree, index* counts, poracle* oracles, index size, index workcount); 20 | } -------------------------------------------------------------------------------- /lib/generated/gen16.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | namespace gpu { 9 | template void sampleselect>(double* in, double* tmp, double* tree, index* count_tmp, index size, index rank, double* out); 10 | template void sampleselect>(float* in, float* tmp, float* tree, index* count_tmp, index size, index rank, float* out); 11 | template __host__ __device__ void build_searchtree>(const double* in, double* out, index size); 12 | template __global__ void kernels::count_buckets>(const float* in, const float* tree, index* counts, poracle* oracles, index size, index workcount); 13 | template void sampleselect_host>(float* in, float* tmp, float* tree, index* count_tmp, index size, index rank, float* out); 14 | template __host__ __device__ void build_searchtree>(const double* in, double* out, index size); 15 | template __host__ __device__ launch_parameters get_launch_parameters>(index size); 16 | template __host__ __device__ void collect_bucket>(const float* data, const poracle* oracles_packed, const index* prefix_sum, float* out, index size, oracle bucket, index* atomic); 17 | template __host__ __device__ void build_searchtree>(const float* in, float* out, index size); 18 | template __host__ __device__ void ssss_merged>(const float* in, float* out, poracle* oracles, index offset, const index* ranks, index rank_offset, index rank_base, const kernels::ssss_multi_aux>* aux_in, kernels::ssss_multi_aux>* aux_outs, float* out_tree); 19 | template void quickselect>(double* in, double* tmp, index* count_tmp, index size, index rank, double* out); 20 | } -------------------------------------------------------------------------------- /lib/generated/gen17.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | namespace gpu { 9 | template __host__ __device__ void count_buckets>(const float* in, const float* tree, index* localcounts, index* counts, poracle* oracles, index size); 10 | template __global__ void kernels::partition>(const float* in, float* out, index* atomic, index size, float pivot, index workcount); 11 | template void sampleselect>(double* in, double* tmp, double* tree, index* count_tmp, index size, index rank, double* out); 12 | template void sampleselect>(float* in, float* tmp, float* tree, index* count_tmp, index size, index rank, float* out); 13 | template void sampleselect_multi>(float* in, float* tmp, index size, const index* ranks, index rank_count, index* tmp_storage, index* aux_storage, index* aux_atomic, float* out); 14 | template void sampleselect_multi>(float* in, float* tmp, index size, const index* ranks, index rank_count, index* tmp_storage, index* aux_storage, index* aux_atomic, float* out); 15 | template void 
sampleselect_multi>(double* in, double* tmp, index size, const index* ranks, index rank_count, index* tmp_storage, index* aux_storage, index* aux_atomic, double* out); 16 | template __host__ __device__ void collect_bucket>(const float* data, const poracle* oracles_packed, const index* prefix_sum, float* out, index size, oracle bucket, index* atomic); 17 | template void sampleselect>(double* in, double* tmp, double* tree, index* count_tmp, index size, index rank, double* out); 18 | template __host__ __device__ void collect_bucket>(const double* data, const poracle* oracles_packed, const index* prefix_sum, double* out, index size, oracle bucket, index* atomic); 19 | template void sampleselect_host>(float* in, float* tmp, float* tree, index* count_tmp, index size, index rank, float* out); 20 | } -------------------------------------------------------------------------------- /lib/generated/gen18.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | namespace gpu { 9 | template __host__ __device__ void ssss_merged>(const float* in, float* out, poracle* oracles, index offset, const index* ranks, index rank_offset, index rank_base, const kernels::ssss_multi_aux>* aux_in, kernels::ssss_multi_aux>* aux_outs, float* out_tree); 10 | template void quickselect>(float* in, float* tmp, index* count_tmp, index size, index rank, float* out); 11 | template __global__ void kernels::reduce_counts>(const index* in, index* out, index); 12 | template __global__ void kernels::count_buckets>(const float* in, const float* tree, index* counts, poracle* oracles, index size, index workcount); 13 | template void sampleselect_multi>(float* in, float* tmp, index size, const index* ranks, index rank_count, index* tmp_storage, index* aux_storage, index* aux_atomic, float* out); 14 | template __global__ void kernels::count_buckets>(const double* in, const double* tree, index* counts, poracle* oracles, index size, index workcount); 15 | template void sampleselect_host>(double* in, double* tmp, double* tree, index* count_tmp, index size, index rank, double* out); 16 | template __global__ void partition_count>(const float* in, index* counts, index size, float pivot, index workcount); 17 | template __global__ void kernels::prefix_sum_counts>(index* in, index* out, index); 18 | template __host__ __device__ void collect_bucket>(const double* data, const poracle* oracles_packed, const index* prefix_sum, double* out, index size, oracle bucket, index* atomic); 19 | template void sampleselect_host>(double* in, double* tmp, double* tree, index* count_tmp, index size, index rank, double* out); 20 | } -------------------------------------------------------------------------------- /lib/generated/gen19.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | namespace gpu { 9 | template void quickselect>(double* in, double* tmp, index* count_tmp, index size, index rank, double* out); 10 | template void sampleselect_multi>(double* in, double* tmp, index size, const index* ranks, index rank_count, index* tmp_storage, index* aux_storage, index* aux_atomic, double* out); 11 | template void sampleselect_multi>(float* in, float* tmp, index size, const index* ranks, index rank_count, index* tmp_storage, index* aux_storage, index* aux_atomic, float* out); 12 | template __host__ __device__ void 
count_buckets>(const double* in, const double* tree, index* localcounts, index* counts, poracle* oracles, index size); 13 | template void sampleselect_host>(float* in, float* tmp, float* tree, index* count_tmp, index size, index rank, float* out); 14 | template __host__ __device__ void build_searchtree>(const float* in, float* out, index size); 15 | template void sampleselect>(float* in, float* tmp, float* tree, index* count_tmp, index size, index rank, float* out); 16 | template void sampleselect_multi>(double* in, double* tmp, index size, const index* ranks, index rank_count, index* tmp_storage, index* aux_storage, index* aux_atomic, double* out); 17 | template __global__ void kernels::partition_distr>(const double* in, double* out, const index* counts, index size, double pivot, index workcount); 18 | template void sampleselect>(double* in, double* tmp, double* tree, index* count_tmp, index size, index rank, double* out); 19 | template __device__ __host__ void partition>(const double* in, double* out, index* counts, index size, double pivot); 20 | } -------------------------------------------------------------------------------- /lib/generated/gen2.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | namespace gpu { 9 | template __host__ __device__ launch_parameters get_launch_parameters>(index size); 10 | template void sampleselect>(double* in, double* tmp, double* tree, index* count_tmp, index size, index rank, double* out); 11 | template void quickselect>(double* in, double* tmp, index* count_tmp, index size, index rank, double* out); 12 | template __host__ __device__ void collect_bucket>(const double* data, const poracle* oracles_packed, const index* prefix_sum, double* out, index size, oracle bucket, index* atomic); 13 | template void quickselect>(double* in, double* tmp, index* count_tmp, index size, index rank, double* out); 14 | template __host__ __device__ void build_searchtree>(const float* in, float* out, index size); 15 | template void sampleselect>(double* in, double* tmp, double* tree, index* count_tmp, index size, index rank, double* out); 16 | template void sampleselect_multi>(float* in, float* tmp, index size, const index* ranks, index rank_count, index* tmp_storage, index* aux_storage, index* aux_atomic, float* out); 17 | template void sampleselect_host>(double* in, double* tmp, double* tree, index* count_tmp, index size, index rank, double* out); 18 | template void sampleselect_host>(float* in, float* tmp, float* tree, index* count_tmp, index size, index rank, float* out); 19 | template __host__ __device__ launch_parameters get_launch_parameters>(index size); 20 | template void quickselect>(double* in, double* tmp, index* count_tmp, index size, index rank, double* out); 21 | } -------------------------------------------------------------------------------- /lib/generated/gen20.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | namespace gpu { 9 | template void quickselect_multi>(double* in, double* tmp, index* count_tmp, index size, const index* ranks, index rank_count, double* out); 10 | template __host__ __device__ void build_searchtree>(const double* in, double* out, index size); 11 | template void sampleselect>(float* in, float* tmp, float* tree, index* count_tmp, index size, index rank, float* out); 12 | template 
void sampleselect_host>(double* in, double* tmp, double* tree, index* count_tmp, index size, index rank, double* out); 13 | template void sampleselect_host>(float* in, float* tmp, float* tree, index* count_tmp, index size, index rank, float* out); 14 | template __global__ void kernels::partition_distr>(const double* in, double* out, const index* counts, index size, double pivot, index workcount); 15 | template __host__ __device__ void collect_buckets>(const double* data, const poracle* oracles_packed, const index* block_prefix_sum, const index* bucket_out_ranges, double* out, index size, mask* buckets, index* atomic); 16 | template void sampleselect_multi>(float* in, float* tmp, index size, const index* ranks, index rank_count, index* tmp_storage, index* aux_storage, index* aux_atomic, float* out); 17 | template __host__ __device__ void collect_bucket>(const float* data, const poracle* oracles_packed, const index* prefix_sum, float* out, index size, oracle bucket, index* atomic); 18 | template void sampleselect_multi>(double* in, double* tmp, index size, const index* ranks, index rank_count, index* tmp_storage, index* aux_storage, index* aux_atomic, double* out); 19 | template __host__ __device__ void collect_bucket>(const float* data, const poracle* oracles_packed, const index* prefix_sum, float* out, index size, oracle bucket, index* atomic); 20 | } -------------------------------------------------------------------------------- /lib/generated/gen21.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | namespace gpu { 9 | template __global__ void kernels::reduce_counts>(const index* in, index* out, index); 10 | template __host__ __device__ launch_parameters get_launch_parameters>(index size); 11 | template __host__ __device__ void collect_bucket>(const double* data, const poracle* oracles_packed, const index* prefix_sum, double* out, index size, oracle bucket, index* atomic); 12 | template __host__ __device__ void build_searchtree>(const float* in, float* out, index size); 13 | template void sampleselect_host>(float* in, float* tmp, float* tree, index* count_tmp, index size, index rank, float* out); 14 | template __global__ void kernels::count_buckets>(const float* in, const float* tree, index* counts, poracle* oracles, index size, index workcount); 15 | template __host__ __device__ void count_buckets>(const double* in, const double* tree, index* localcounts, index* counts, poracle* oracles, index size); 16 | template __global__ void kernels::reduce_counts>(const index* in, index* out, index); 17 | template __host__ __device__ void build_searchtree>(const double* in, double* out, index size); 18 | template void sampleselect_multi>(double* in, double* tmp, index size, const index* ranks, index rank_count, index* tmp_storage, index* aux_storage, index* aux_atomic, double* out); 19 | template __global__ void kernels::prefix_sum_counts>(index* in, index* out, index); 20 | } -------------------------------------------------------------------------------- /lib/generated/gen22.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | namespace gpu { 9 | template void sampleselect>(float* in, float* tmp, float* tree, index* count_tmp, index size, index rank, float* out); 10 | template __host__ __device__ void count_buckets>(const double* in, const 
double* tree, index* localcounts, index* counts, poracle* oracles, index size); 11 | template void sampleselect_host>(float* in, float* tmp, float* tree, index* count_tmp, index size, index rank, float* out); 12 | template __host__ __device__ void build_searchtree>(const double* in, double* out, index size); 13 | template void sampleselect_multi>(double* in, double* tmp, index size, const index* ranks, index rank_count, index* tmp_storage, index* aux_storage, index* aux_atomic, double* out); 14 | template __host__ __device__ void collect_buckets>(const double* data, const poracle* oracles_packed, const index* block_prefix_sum, const index* bucket_out_ranges, double* out, index size, mask* buckets, index* atomic); 15 | template __device__ __host__ void partition>(const double* in, double* out, index* counts, index size, double pivot); 16 | template __global__ void kernels::count_buckets>(const float* in, const float* tree, index* counts, poracle* oracles, index size, index workcount); 17 | template void sampleselect_multi>(double* in, double* tmp, index size, const index* ranks, index rank_count, index* tmp_storage, index* aux_storage, index* aux_atomic, double* out); 18 | template void sampleselect_host>(float* in, float* tmp, float* tree, index* count_tmp, index size, index rank, float* out); 19 | template __global__ void kernels::prefix_sum_counts>(index* in, index* out, index); 20 | } -------------------------------------------------------------------------------- /lib/generated/gen23.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | namespace gpu { 9 | template void sampleselect_host>(double* in, double* tmp, double* tree, index* count_tmp, index size, index rank, double* out); 10 | template __global__ void kernels::prefix_sum_counts>(index* in, index* out, index); 11 | template void sampleselect>(double* in, double* tmp, double* tree, index* count_tmp, index size, index rank, double* out); 12 | template void sampleselect_multi>(float* in, float* tmp, index size, const index* ranks, index rank_count, index* tmp_storage, index* aux_storage, index* aux_atomic, float* out); 13 | template void sampleselect>(double* in, double* tmp, double* tree, index* count_tmp, index size, index rank, double* out); 14 | template __host__ __device__ void collect_bucket>(const double* data, const poracle* oracles_packed, const index* prefix_sum, double* out, index size, oracle bucket, index* atomic); 15 | template __host__ __device__ launch_parameters get_launch_parameters>(index size); 16 | template __host__ __device__ void build_searchtree>(const double* in, double* out, index size); 17 | template __global__ void kernels::partition_distr>(const double* in, double* out, const index* counts, index size, double pivot, index workcount); 18 | template void sampleselect>(float* in, float* tmp, float* tree, index* count_tmp, index size, index rank, float* out); 19 | template __global__ void kernels::prefix_sum_counts>(index* in, index* out, index); 20 | } -------------------------------------------------------------------------------- /lib/generated/gen24.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | namespace gpu { 9 | template void sampleselect>(double* in, double* tmp, double* tree, index* count_tmp, index size, index rank, double* out); 10 | 
template void sampleselect_multi>(double* in, double* tmp, index size, const index* ranks, index rank_count, index* tmp_storage, index* aux_storage, index* aux_atomic, double* out); 11 | template __host__ __device__ void build_searchtree>(const float* in, float* out, index size); 12 | template void sampleselect_host>(double* in, double* tmp, double* tree, index* count_tmp, index size, index rank, double* out); 13 | template __device__ __host__ void partition>(const float* in, float* out, index* counts, index size, float pivot); 14 | template void sampleselect_host>(double* in, double* tmp, double* tree, index* count_tmp, index size, index rank, double* out); 15 | template __host__ __device__ void collect_buckets>(const float* data, const poracle* oracles_packed, const index* block_prefix_sum, const index* bucket_out_ranges, float* out, index size, mask* buckets, index* atomic); 16 | template __host__ __device__ launch_parameters get_launch_parameters>(index size); 17 | template void sampleselect_multi>(double* in, double* tmp, index size, const index* ranks, index rank_count, index* tmp_storage, index* aux_storage, index* aux_atomic, double* out); 18 | template __host__ __device__ void count_buckets>(const float* in, const float* tree, index* localcounts, index* counts, poracle* oracles, index size); 19 | template __device__ __host__ void partition>(const float* in, float* out, index* counts, index size, float pivot); 20 | } -------------------------------------------------------------------------------- /lib/generated/gen25.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | namespace gpu { 9 | template void sampleselect_host>(float* in, float* tmp, float* tree, index* count_tmp, index size, index rank, float* out); 10 | template void sampleselect>(double* in, double* tmp, double* tree, index* count_tmp, index size, index rank, double* out); 11 | template __device__ void kernels::masked_prefix_sum<9>(index* counts, const mask* m); 12 | template void quickselect_multi>(double* in, double* tmp, index* count_tmp, index size, const index* ranks, index rank_count, double* out); 13 | template __host__ __device__ void collect_bucket>(const float* data, const poracle* oracles_packed, const index* prefix_sum, float* out, index size, oracle bucket, index* atomic); 14 | template __device__ void kernels::masked_prefix_sum<10>(index* counts, const mask* m); 15 | template void sampleselect_multi>(double* in, double* tmp, index size, const index* ranks, index rank_count, index* tmp_storage, index* aux_storage, index* aux_atomic, double* out); 16 | template void sampleselect_multi>(float* in, float* tmp, index size, const index* ranks, index rank_count, index* tmp_storage, index* aux_storage, index* aux_atomic, float* out); 17 | template __host__ __device__ void build_searchtree>(const float* in, float* out, index size); 18 | template __host__ __device__ launch_parameters get_launch_parameters>(index size); 19 | template void sampleselect_host>(double* in, double* tmp, double* tree, index* count_tmp, index size, index rank, double* out); 20 | } -------------------------------------------------------------------------------- /lib/generated/gen26.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | namespace gpu { 9 | template void sampleselect>(double* 
in, double* tmp, double* tree, index* count_tmp, index size, index rank, double* out); 10 | template void sampleselect_host>(double* in, double* tmp, double* tree, index* count_tmp, index size, index rank, double* out); 11 | template void sampleselect_host>(float* in, float* tmp, float* tree, index* count_tmp, index size, index rank, float* out); 12 | template void sampleselect_multi>(float* in, float* tmp, index size, const index* ranks, index rank_count, index* tmp_storage, index* aux_storage, index* aux_atomic, float* out); 13 | template __global__ void partition_count>(const double* in, index* counts, index size, double pivot, index workcount); 14 | template __global__ void kernels::reduce_counts>(const index* in, index* out, index); 15 | template void sampleselect>(double* in, double* tmp, double* tree, index* count_tmp, index size, index rank, double* out); 16 | template void sampleselect_host>(float* in, float* tmp, float* tree, index* count_tmp, index size, index rank, float* out); 17 | template void sampleselect>(float* in, float* tmp, float* tree, index* count_tmp, index size, index rank, float* out); 18 | template void quickselect>(double* in, double* tmp, index* count_tmp, index size, index rank, double* out); 19 | template __host__ __device__ void build_searchtree>(const float* in, float* out, index size); 20 | } -------------------------------------------------------------------------------- /lib/generated/gen27.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | namespace gpu { 9 | template __host__ __device__ void build_searchtree>(const double* in, double* out, index size); 10 | template __global__ void kernels::partition_prefixsum>(index* counts, index block_count); 11 | template void sampleselect>(double* in, double* tmp, double* tree, index* count_tmp, index size, index rank, double* out); 12 | template void sampleselect_host>(double* in, double* tmp, double* tree, index* count_tmp, index size, index rank, double* out); 13 | template void quickselect_multi>(float* in, float* tmp, index* count_tmp, index size, const index* ranks, index rank_count, float* out); 14 | template void sampleselect_host>(double* in, double* tmp, double* tree, index* count_tmp, index size, index rank, double* out); 15 | template void sampleselect_host>(float* in, float* tmp, float* tree, index* count_tmp, index size, index rank, float* out); 16 | template __host__ __device__ void build_searchtree>(const double* in, double* out, index size); 17 | template __global__ void kernels::count_buckets>(const double* in, const double* tree, index* counts, poracle* oracles, index size, index workcount); 18 | template void sampleselect_multi>(float* in, float* tmp, index size, const index* ranks, index rank_count, index* tmp_storage, index* aux_storage, index* aux_atomic, float* out); 19 | template void sampleselect>(double* in, double* tmp, double* tree, index* count_tmp, index size, index rank, double* out); 20 | } -------------------------------------------------------------------------------- /lib/generated/gen28.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | namespace gpu { 9 | template __host__ __device__ void collect_bucket>(const double* data, const poracle* oracles_packed, const index* prefix_sum, double* out, index size, oracle bucket, 
index* atomic); 10 | template void sampleselect_host>(float* in, float* tmp, float* tree, index* count_tmp, index size, index rank, float* out); 11 | template void sampleselect_host>(double* in, double* tmp, double* tree, index* count_tmp, index size, index rank, double* out); 12 | template __host__ __device__ void collect_buckets>(const float* data, const poracle* oracles_packed, const index* block_prefix_sum, const index* bucket_out_ranges, float* out, index size, mask* buckets, index* atomic); 13 | template __device__ void kernels::masked_prefix_sum<7>(index* counts, const mask* m); 14 | template void quickselect>(double* in, double* tmp, index* count_tmp, index size, index rank, double* out); 15 | template void sampleselect_multi>(double* in, double* tmp, index size, const index* ranks, index rank_count, index* tmp_storage, index* aux_storage, index* aux_atomic, double* out); 16 | template __global__ void kernels::prefix_sum_counts>(index* in, index* out, index); 17 | template void sampleselect>(float* in, float* tmp, float* tree, index* count_tmp, index size, index rank, float* out); 18 | template void quickselect>(float* in, float* tmp, index* count_tmp, index size, index rank, float* out); 19 | template __host__ __device__ void collect_bucket_indirect>(const double* data, const poracle* oracles_packed, const index* prefix_sum, double* out, index size, const oracle* bucket, index* atomic); 20 | } -------------------------------------------------------------------------------- /lib/generated/gen29.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | namespace gpu { 9 | template __host__ __device__ void collect_bucket>(const float* data, const poracle* oracles_packed, const index* prefix_sum, float* out, index size, oracle bucket, index* atomic); 10 | template void sampleselect_host>(double* in, double* tmp, double* tree, index* count_tmp, index size, index rank, double* out); 11 | template __global__ void kernels::count_buckets>(const double* in, const double* tree, index* counts, poracle* oracles, index size, index workcount); 12 | template __host__ __device__ launch_parameters get_launch_parameters>(index size); 13 | template void sampleselect_host>(double* in, double* tmp, double* tree, index* count_tmp, index size, index rank, double* out); 14 | template void sampleselect>(float* in, float* tmp, float* tree, index* count_tmp, index size, index rank, float* out); 15 | template void sampleselect_host>(double* in, double* tmp, double* tree, index* count_tmp, index size, index rank, double* out); 16 | template void quickselect>(float* in, float* tmp, index* count_tmp, index size, index rank, float* out); 17 | template void sampleselect_multi>(double* in, double* tmp, index size, const index* ranks, index rank_count, index* tmp_storage, index* aux_storage, index* aux_atomic, double* out); 18 | template void sampleselect>(double* in, double* tmp, double* tree, index* count_tmp, index size, index rank, double* out); 19 | template __host__ __device__ void build_searchtree>(const double* in, double* out, index size); 20 | } -------------------------------------------------------------------------------- /lib/generated/gen3.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | namespace gpu { 9 | template void quickselect>(float* in, float* 
tmp, index* count_tmp, index size, index rank, float* out); 10 | template __host__ __device__ void collect_bucket_indirect>(const double* data, const poracle* oracles_packed, const index* prefix_sum, double* out, index size, const oracle* bucket, index* atomic); 11 | template __device__ __host__ void partition>(const double* in, double* out, index* counts, index size, double pivot); 12 | template void sampleselect_multi>(float* in, float* tmp, index size, const index* ranks, index rank_count, index* tmp_storage, index* aux_storage, index* aux_atomic, float* out); 13 | template void sampleselect_host>(double* in, double* tmp, double* tree, index* count_tmp, index size, index rank, double* out); 14 | template __global__ void kernels::reduce_counts>(const index* in, index* out, index); 15 | template void sampleselect_host>(float* in, float* tmp, float* tree, index* count_tmp, index size, index rank, float* out); 16 | template __host__ __device__ void collect_bucket>(const float* data, const poracle* oracles_packed, const index* prefix_sum, float* out, index size, oracle bucket, index* atomic); 17 | template void sampleselect_multi>(float* in, float* tmp, index size, const index* ranks, index rank_count, index* tmp_storage, index* aux_storage, index* aux_atomic, float* out); 18 | template void sampleselect_host>(double* in, double* tmp, double* tree, index* count_tmp, index size, index rank, double* out); 19 | template void sampleselect_multi>(float* in, float* tmp, index size, const index* ranks, index rank_count, index* tmp_storage, index* aux_storage, index* aux_atomic, float* out); 20 | template __global__ void kernels::count_buckets>(const double* in, const double* tree, index* counts, poracle* oracles, index size, index workcount); 21 | } -------------------------------------------------------------------------------- /lib/generated/gen30.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | namespace gpu { 9 | template __host__ __device__ launch_parameters get_launch_parameters>(index size); 10 | template void sampleselect>(float* in, float* tmp, float* tree, index* count_tmp, index size, index rank, float* out); 11 | template __global__ void kernels::count_buckets>(const double* in, const double* tree, index* counts, poracle* oracles, index size, index workcount); 12 | template void quickselect>(float* in, float* tmp, index* count_tmp, index size, index rank, float* out); 13 | template void sampleselect_host>(double* in, double* tmp, double* tree, index* count_tmp, index size, index rank, double* out); 14 | template __host__ __device__ void collect_buckets>(const float* data, const poracle* oracles_packed, const index* block_prefix_sum, const index* bucket_out_ranges, float* out, index size, mask* buckets, index* atomic); 15 | template __host__ __device__ launch_parameters get_launch_parameters>(index size); 16 | template void sampleselect>(float* in, float* tmp, float* tree, index* count_tmp, index size, index rank, float* out); 17 | template __host__ __device__ launch_parameters get_launch_parameters>(index size); 18 | template void sampleselect_multi>(double* in, double* tmp, index size, const index* ranks, index rank_count, index* tmp_storage, index* aux_storage, index* aux_atomic, double* out); 19 | template void sampleselect_multi>(float* in, float* tmp, index size, const index* ranks, index rank_count, index* tmp_storage, index* aux_storage, index* 
aux_atomic, float* out); 20 | } -------------------------------------------------------------------------------- /lib/generated/gen31.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | namespace gpu { 9 | template void sampleselect>(double* in, double* tmp, double* tree, index* count_tmp, index size, index rank, double* out); 10 | template void sampleselect_multi>(float* in, float* tmp, index size, const index* ranks, index rank_count, index* tmp_storage, index* aux_storage, index* aux_atomic, float* out); 11 | template void quickselect>(double* in, double* tmp, index* count_tmp, index size, index rank, double* out); 12 | template __host__ __device__ void collect_bucket>(const float* data, const poracle* oracles_packed, const index* prefix_sum, float* out, index size, oracle bucket, index* atomic); 13 | template __device__ __host__ void partition>(const double* in, double* out, index* counts, index size, double pivot); 14 | template void sampleselect>(float* in, float* tmp, float* tree, index* count_tmp, index size, index rank, float* out); 15 | template __host__ __device__ void collect_bucket>(const double* data, const poracle* oracles_packed, const index* prefix_sum, double* out, index size, oracle bucket, index* atomic); 16 | template void sampleselect_multi>(double* in, double* tmp, index size, const index* ranks, index rank_count, index* tmp_storage, index* aux_storage, index* aux_atomic, double* out); 17 | template __host__ __device__ void ssss_merged>(const double* in, double* out, poracle* oracles, index offset, const index* ranks, index rank_offset, index rank_base, const kernels::ssss_multi_aux>* aux_in, kernels::ssss_multi_aux>* aux_outs, double* out_tree); 18 | template __host__ __device__ launch_parameters get_launch_parameters>(index size); 19 | template void sampleselect>(float* in, float* tmp, float* tree, index* count_tmp, index size, index rank, float* out); 20 | } -------------------------------------------------------------------------------- /lib/generated/gen32.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | namespace gpu { 9 | template __host__ __device__ void ssss_merged>(const double* in, double* out, poracle* oracles, index offset, const index* ranks, index rank_offset, index rank_base, const kernels::ssss_multi_aux>* aux_in, kernels::ssss_multi_aux>* aux_outs, double* out_tree); 10 | template __host__ __device__ launch_parameters get_launch_parameters>(index size); 11 | template void sampleselect_multi>(double* in, double* tmp, index size, const index* ranks, index rank_count, index* tmp_storage, index* aux_storage, index* aux_atomic, double* out); 12 | template __global__ void kernels::prefix_sum_counts>(index* in, index* out, index); 13 | template void sampleselect_multi>(float* in, float* tmp, index size, const index* ranks, index rank_count, index* tmp_storage, index* aux_storage, index* aux_atomic, float* out); 14 | template void sampleselect>(double* in, double* tmp, double* tree, index* count_tmp, index size, index rank, double* out); 15 | template void sampleselect_multi>(float* in, float* tmp, index size, const index* ranks, index rank_count, index* tmp_storage, index* aux_storage, index* aux_atomic, float* out); 16 | template void sampleselect>(float* in, float* tmp, float* tree, index* count_tmp, 
index size, index rank, float* out); 17 | template __host__ __device__ launch_parameters get_launch_parameters>(index size); 18 | template __host__ __device__ void build_searchtree>(const double* in, double* out, index size); 19 | template void sampleselect>(float* in, float* tmp, float* tree, index* count_tmp, index size, index rank, float* out); 20 | } -------------------------------------------------------------------------------- /lib/generated/gen33.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | namespace gpu { 9 | template __global__ void kernels::partition>(const float* in, float* out, index* atomic, index size, float pivot, index workcount); 10 | template void sampleselect_host>(float* in, float* tmp, float* tree, index* count_tmp, index size, index rank, float* out); 11 | template void sampleselect>(float* in, float* tmp, float* tree, index* count_tmp, index size, index rank, float* out); 12 | template __device__ void kernels::masked_prefix_sum<6>(index* counts, const mask* m); 13 | template __host__ __device__ void collect_bucket>(const float* data, const poracle* oracles_packed, const index* prefix_sum, float* out, index size, oracle bucket, index* atomic); 14 | template __global__ void kernels::prefix_sum_counts>(index* in, index* out, index); 15 | template __global__ void kernels::partition>(const float* in, float* out, index* atomic, index size, float pivot, index workcount); 16 | template void sampleselect_multi>(float* in, float* tmp, index size, const index* ranks, index rank_count, index* tmp_storage, index* aux_storage, index* aux_atomic, float* out); 17 | template __global__ void kernels::partition_prefixsum>(index* counts, index block_count); 18 | template void sampleselect_multi>(float* in, float* tmp, index size, const index* ranks, index rank_count, index* tmp_storage, index* aux_storage, index* aux_atomic, float* out); 19 | template void sampleselect_host>(double* in, double* tmp, double* tree, index* count_tmp, index size, index rank, double* out); 20 | } -------------------------------------------------------------------------------- /lib/generated/gen34.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | namespace gpu { 9 | template void sampleselect_host>(float* in, float* tmp, float* tree, index* count_tmp, index size, index rank, float* out); 10 | template __host__ __device__ void collect_bucket>(const float* data, const poracle* oracles_packed, const index* prefix_sum, float* out, index size, oracle bucket, index* atomic); 11 | template __host__ __device__ void build_searchtree>(const double* in, double* out, index size); 12 | template void sampleselect_host>(double* in, double* tmp, double* tree, index* count_tmp, index size, index rank, double* out); 13 | template void sampleselect_host>(double* in, double* tmp, double* tree, index* count_tmp, index size, index rank, double* out); 14 | template __host__ __device__ void collect_bucket>(const float* data, const poracle* oracles_packed, const index* prefix_sum, float* out, index size, oracle bucket, index* atomic); 15 | template void quickselect>(double* in, double* tmp, index* count_tmp, index size, index rank, double* out); 16 | template __host__ __device__ void build_searchtree>(const double* in, double* out, index size); 17 | template __global__ 
void partition_count>(const double* in, index* counts, index size, double pivot, index workcount); 18 | template void sampleselect>(float* in, float* tmp, float* tree, index* count_tmp, index size, index rank, float* out); 19 | template __device__ __host__ void partition>(const float* in, float* out, index* counts, index size, float pivot); 20 | } -------------------------------------------------------------------------------- /lib/generated/gen35.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | namespace gpu { 9 | template void sampleselect_multi>(float* in, float* tmp, index size, const index* ranks, index rank_count, index* tmp_storage, index* aux_storage, index* aux_atomic, float* out); 10 | template void sampleselect_multi>(double* in, double* tmp, index size, const index* ranks, index rank_count, index* tmp_storage, index* aux_storage, index* aux_atomic, double* out); 11 | template __host__ __device__ launch_parameters get_launch_parameters>(index size); 12 | template __host__ __device__ void build_searchtree>(const float* in, float* out, index size); 13 | template __global__ void kernels::partition>(const double* in, double* out, index* atomic, index size, double pivot, index workcount); 14 | template void sampleselect>(float* in, float* tmp, float* tree, index* count_tmp, index size, index rank, float* out); 15 | template void sampleselect>(double* in, double* tmp, double* tree, index* count_tmp, index size, index rank, double* out); 16 | template void quickselect>(double* in, double* tmp, index* count_tmp, index size, index rank, double* out); 17 | template void quickselect>(double* in, double* tmp, index* count_tmp, index size, index rank, double* out); 18 | template __global__ void kernels::prefix_sum_counts>(index* in, index* out, index); 19 | template void sampleselect_multi>(float* in, float* tmp, index size, const index* ranks, index rank_count, index* tmp_storage, index* aux_storage, index* aux_atomic, float* out); 20 | } -------------------------------------------------------------------------------- /lib/generated/gen36.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | namespace gpu { 9 | template void quickselect>(float* in, float* tmp, index* count_tmp, index size, index rank, float* out); 10 | template void sampleselect>(double* in, double* tmp, double* tree, index* count_tmp, index size, index rank, double* out); 11 | template __host__ __device__ void count_buckets>(const float* in, const float* tree, index* localcounts, index* counts, poracle* oracles, index size); 12 | template __host__ __device__ void ssss_merged>(const double* in, double* out, poracle* oracles, index offset, const index* ranks, index rank_offset, index rank_base, const kernels::ssss_multi_aux>* aux_in, kernels::ssss_multi_aux>* aux_outs, double* out_tree); 13 | template void quickselect>(double* in, double* tmp, index* count_tmp, index size, index rank, double* out); 14 | template __host__ __device__ void collect_bucket>(const float* data, const poracle* oracles_packed, const index* prefix_sum, float* out, index size, oracle bucket, index* atomic); 15 | template __global__ void kernels::count_buckets>(const double* in, const double* tree, index* counts, poracle* oracles, index size, index workcount); 16 | template void 
sampleselect_host>(float* in, float* tmp, float* tree, index* count_tmp, index size, index rank, float* out); 17 | template void sampleselect_multi>(double* in, double* tmp, index size, const index* ranks, index rank_count, index* tmp_storage, index* aux_storage, index* aux_atomic, double* out); 18 | template __global__ void kernels::reduce_counts>(const index* in, index* out, index); 19 | template void sampleselect_host>(double* in, double* tmp, double* tree, index* count_tmp, index size, index rank, double* out); 20 | } -------------------------------------------------------------------------------- /lib/generated/gen37.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | namespace gpu { 9 | template __global__ void kernels::partition_distr>(const float* in, float* out, const index* counts, index size, float pivot, index workcount); 10 | template void sampleselect_host>(double* in, double* tmp, double* tree, index* count_tmp, index size, index rank, double* out); 11 | template void quickselect>(float* in, float* tmp, index* count_tmp, index size, index rank, float* out); 12 | template void sampleselect_multi>(double* in, double* tmp, index size, const index* ranks, index rank_count, index* tmp_storage, index* aux_storage, index* aux_atomic, double* out); 13 | template __global__ void partition_count>(const float* in, index* counts, index size, float pivot, index workcount); 14 | template __global__ void kernels::reduce_counts>(const index* in, index* out, index); 15 | template void sampleselect_multi>(double* in, double* tmp, index size, const index* ranks, index rank_count, index* tmp_storage, index* aux_storage, index* aux_atomic, double* out); 16 | template __host__ __device__ void ssss_merged>(const float* in, float* out, poracle* oracles, index offset, const index* ranks, index rank_offset, index rank_base, const kernels::ssss_multi_aux>* aux_in, kernels::ssss_multi_aux>* aux_outs, float* out_tree); 17 | template __global__ void partition_count>(const float* in, index* counts, index size, float pivot, index workcount); 18 | template __global__ void kernels::reduce_counts>(const index* in, index* out, index); 19 | template __host__ __device__ void count_buckets>(const float* in, const float* tree, index* localcounts, index* counts, poracle* oracles, index size); 20 | } -------------------------------------------------------------------------------- /lib/generated/gen38.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | namespace gpu { 9 | template __host__ __device__ launch_parameters get_launch_parameters>(index size); 10 | template void sampleselect_multi>(float* in, float* tmp, index size, const index* ranks, index rank_count, index* tmp_storage, index* aux_storage, index* aux_atomic, float* out); 11 | template __host__ __device__ void collect_bucket>(const double* data, const poracle* oracles_packed, const index* prefix_sum, double* out, index size, oracle bucket, index* atomic); 12 | template void quickselect>(float* in, float* tmp, index* count_tmp, index size, index rank, float* out); 13 | template __global__ void kernels::reduce_counts>(const index* in, index* out, index); 14 | template __host__ __device__ void collect_buckets>(const double* data, const poracle* oracles_packed, const index* block_prefix_sum, const index* 
bucket_out_ranges, double* out, index size, mask* buckets, index* atomic); 15 | template void sampleselect_multi>(float* in, float* tmp, index size, const index* ranks, index rank_count, index* tmp_storage, index* aux_storage, index* aux_atomic, float* out); 16 | template void sampleselect_multi>(double* in, double* tmp, index size, const index* ranks, index rank_count, index* tmp_storage, index* aux_storage, index* aux_atomic, double* out); 17 | template void sampleselect_multi>(float* in, float* tmp, index size, const index* ranks, index rank_count, index* tmp_storage, index* aux_storage, index* aux_atomic, float* out); 18 | template void sampleselect_host>(double* in, double* tmp, double* tree, index* count_tmp, index size, index rank, double* out); 19 | template __host__ __device__ void collect_bucket>(const double* data, const poracle* oracles_packed, const index* prefix_sum, double* out, index size, oracle bucket, index* atomic); 20 | } -------------------------------------------------------------------------------- /lib/generated/gen39.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | namespace gpu { 9 | template __global__ void kernels::count_buckets>(const double* in, const double* tree, index* counts, poracle* oracles, index size, index workcount); 10 | template void sampleselect>(float* in, float* tmp, float* tree, index* count_tmp, index size, index rank, float* out); 11 | template void sampleselect>(double* in, double* tmp, double* tree, index* count_tmp, index size, index rank, double* out); 12 | template __host__ __device__ void collect_bucket_indirect>(const float* data, const poracle* oracles_packed, const index* prefix_sum, float* out, index size, const oracle* bucket, index* atomic); 13 | template __global__ void kernels::count_buckets>(const float* in, const float* tree, index* counts, poracle* oracles, index size, index workcount); 14 | template __global__ void kernels::partition_distr>(const float* in, float* out, const index* counts, index size, float pivot, index workcount); 15 | template __host__ __device__ launch_parameters get_launch_parameters>(index size); 16 | template void sampleselect_host>(float* in, float* tmp, float* tree, index* count_tmp, index size, index rank, float* out); 17 | template __host__ __device__ launch_parameters get_launch_parameters>(index size); 18 | template __global__ void kernels::partition_distr>(const double* in, double* out, const index* counts, index size, double pivot, index workcount); 19 | template __global__ void kernels::prefix_sum_counts>(index* in, index* out, index); 20 | } -------------------------------------------------------------------------------- /lib/generated/gen4.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | namespace gpu { 9 | template __global__ void kernels::count_buckets>(const float* in, const float* tree, index* counts, poracle* oracles, index size, index workcount); 10 | template void sampleselect_host>(float* in, float* tmp, float* tree, index* count_tmp, index size, index rank, float* out); 11 | template void sampleselect_host>(float* in, float* tmp, float* tree, index* count_tmp, index size, index rank, float* out); 12 | template void sampleselect_multi>(float* in, float* tmp, index size, const index* ranks, index rank_count, index* 
tmp_storage, index* aux_storage, index* aux_atomic, float* out); 13 | template void sampleselect>(float* in, float* tmp, float* tree, index* count_tmp, index size, index rank, float* out); 14 | template void quickselect>(float* in, float* tmp, index* count_tmp, index size, index rank, float* out); 15 | template __global__ void kernels::count_buckets>(const float* in, const float* tree, index* counts, poracle* oracles, index size, index workcount); 16 | template void sampleselect>(double* in, double* tmp, double* tree, index* count_tmp, index size, index rank, double* out); 17 | template void sampleselect_host>(double* in, double* tmp, double* tree, index* count_tmp, index size, index rank, double* out); 18 | template __host__ __device__ void collect_buckets>(const float* data, const poracle* oracles_packed, const index* block_prefix_sum, const index* bucket_out_ranges, float* out, index size, mask* buckets, index* atomic); 19 | template __host__ __device__ void collect_bucket>(const double* data, const poracle* oracles_packed, const index* prefix_sum, double* out, index size, oracle bucket, index* atomic); 20 | template __host__ __device__ void build_searchtree>(const float* in, float* out, index size); 21 | } -------------------------------------------------------------------------------- /lib/generated/gen5.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | namespace gpu { 9 | template __host__ __device__ void collect_bucket>(const float* data, const poracle* oracles_packed, const index* prefix_sum, float* out, index size, oracle bucket, index* atomic); 10 | template __global__ void kernels::prefix_sum_counts>(index* in, index* out, index); 11 | template __global__ void kernels::count_buckets>(const double* in, const double* tree, index* counts, poracle* oracles, index size, index workcount); 12 | template __host__ __device__ launch_parameters get_launch_parameters>(index size); 13 | template __global__ void partition_count>(const float* in, index* counts, index size, float pivot, index workcount); 14 | template void sampleselect_multi>(double* in, double* tmp, index size, const index* ranks, index rank_count, index* tmp_storage, index* aux_storage, index* aux_atomic, double* out); 15 | template __host__ __device__ void collect_bucket>(const double* data, const poracle* oracles_packed, const index* prefix_sum, double* out, index size, oracle bucket, index* atomic); 16 | template __global__ void kernels::partition_distr>(const float* in, float* out, const index* counts, index size, float pivot, index workcount); 17 | template __global__ void kernels::count_buckets>(const float* in, const float* tree, index* counts, poracle* oracles, index size, index workcount); 18 | template void sampleselect>(double* in, double* tmp, double* tree, index* count_tmp, index size, index rank, double* out); 19 | template __host__ __device__ void collect_bucket>(const float* data, const poracle* oracles_packed, const index* prefix_sum, float* out, index size, oracle bucket, index* atomic); 20 | template __global__ void kernels::partition>(const double* in, double* out, index* atomic, index size, double pivot, index workcount); 21 | } -------------------------------------------------------------------------------- /lib/generated/gen6.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 
| #include 7 | #include 8 | namespace gpu { 9 | template __host__ __device__ void build_searchtree>(const float* in, float* out, index size); 10 | template void sampleselect>(double* in, double* tmp, double* tree, index* count_tmp, index size, index rank, double* out); 11 | template __global__ void kernels::prefix_sum_counts>(index* in, index* out, index); 12 | template __host__ __device__ void count_buckets>(const float* in, const float* tree, index* localcounts, index* counts, poracle* oracles, index size); 13 | template void sampleselect_multi>(float* in, float* tmp, index size, const index* ranks, index rank_count, index* tmp_storage, index* aux_storage, index* aux_atomic, float* out); 14 | template void quickselect_multi>(double* in, double* tmp, index* count_tmp, index size, const index* ranks, index rank_count, double* out); 15 | template __host__ __device__ void build_searchtree>(const float* in, float* out, index size); 16 | template void sampleselect>(float* in, float* tmp, float* tree, index* count_tmp, index size, index rank, float* out); 17 | template void sampleselect>(double* in, double* tmp, double* tree, index* count_tmp, index size, index rank, double* out); 18 | template void sampleselect_host>(float* in, float* tmp, float* tree, index* count_tmp, index size, index rank, float* out); 19 | template void sampleselect_multi>(double* in, double* tmp, index size, const index* ranks, index rank_count, index* tmp_storage, index* aux_storage, index* aux_atomic, double* out); 20 | template void sampleselect_host>(float* in, float* tmp, float* tree, index* count_tmp, index size, index rank, float* out); 21 | } -------------------------------------------------------------------------------- /lib/generated/gen7.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | namespace gpu { 9 | template void sampleselect_multi>(double* in, double* tmp, index size, const index* ranks, index rank_count, index* tmp_storage, index* aux_storage, index* aux_atomic, double* out); 10 | template void sampleselect>(float* in, float* tmp, float* tree, index* count_tmp, index size, index rank, float* out); 11 | template void sampleselect_host>(float* in, float* tmp, float* tree, index* count_tmp, index size, index rank, float* out); 12 | template __host__ __device__ void ssss_merged>(const double* in, double* out, poracle* oracles, index offset, const index* ranks, index rank_offset, index rank_base, const kernels::ssss_multi_aux>* aux_in, kernels::ssss_multi_aux>* aux_outs, double* out_tree); 13 | template __global__ void kernels::prefix_sum_counts>(index* in, index* out, index); 14 | template __host__ __device__ void collect_bucket_indirect>(const double* data, const poracle* oracles_packed, const index* prefix_sum, double* out, index size, const oracle* bucket, index* atomic); 15 | template void quickselect>(float* in, float* tmp, index* count_tmp, index size, index rank, float* out); 16 | template void sampleselect_host>(double* in, double* tmp, double* tree, index* count_tmp, index size, index rank, double* out); 17 | template __global__ void kernels::count_buckets>(const float* in, const float* tree, index* counts, poracle* oracles, index size, index workcount); 18 | template void sampleselect>(float* in, float* tmp, float* tree, index* count_tmp, index size, index rank, float* out); 19 | template __host__ __device__ void count_buckets>(const double* in, const double* tree, 
index* localcounts, index* counts, poracle* oracles, index size); 20 | template __global__ void kernels::count_buckets>(const double* in, const double* tree, index* counts, poracle* oracles, index size, index workcount); 21 | } -------------------------------------------------------------------------------- /lib/generated/gen8.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | namespace gpu { 9 | template void quickselect>(double* in, double* tmp, index* count_tmp, index size, index rank, double* out); 10 | template void sampleselect>(float* in, float* tmp, float* tree, index* count_tmp, index size, index rank, float* out); 11 | template __global__ void partition_count>(const double* in, index* counts, index size, double pivot, index workcount); 12 | template void sampleselect_host>(double* in, double* tmp, double* tree, index* count_tmp, index size, index rank, double* out); 13 | template void sampleselect_multi>(float* in, float* tmp, index size, const index* ranks, index rank_count, index* tmp_storage, index* aux_storage, index* aux_atomic, float* out); 14 | template void sampleselect_multi>(double* in, double* tmp, index size, const index* ranks, index rank_count, index* tmp_storage, index* aux_storage, index* aux_atomic, double* out); 15 | template void sampleselect>(float* in, float* tmp, float* tree, index* count_tmp, index size, index rank, float* out); 16 | template __global__ void kernels::reduce_counts>(const index* in, index* out, index); 17 | template void quickselect>(double* in, double* tmp, index* count_tmp, index size, index rank, double* out); 18 | template __global__ void kernels::count_buckets>(const float* in, const float* tree, index* counts, poracle* oracles, index size, index workcount); 19 | template __host__ __device__ launch_parameters get_launch_parameters>(index size); 20 | template __host__ __device__ void count_buckets>(const double* in, const double* tree, index* localcounts, index* counts, poracle* oracles, index size); 21 | } -------------------------------------------------------------------------------- /lib/generated/gen9.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | namespace gpu { 9 | template __global__ void kernels::partition>(const float* in, float* out, index* atomic, index size, float pivot, index workcount); 10 | template void sampleselect_host>(float* in, float* tmp, float* tree, index* count_tmp, index size, index rank, float* out); 11 | template void sampleselect_host>(float* in, float* tmp, float* tree, index* count_tmp, index size, index rank, float* out); 12 | template __host__ __device__ void build_searchtree>(const float* in, float* out, index size); 13 | template void sampleselect_host>(double* in, double* tmp, double* tree, index* count_tmp, index size, index rank, double* out); 14 | template __host__ __device__ void build_searchtree>(const double* in, double* out, index size); 15 | template void sampleselect>(double* in, double* tmp, double* tree, index* count_tmp, index size, index rank, double* out); 16 | template void sampleselect>(double* in, double* tmp, double* tree, index* count_tmp, index size, index rank, double* out); 17 | template void sampleselect_multi>(double* in, double* tmp, index size, const index* ranks, index rank_count, index* tmp_storage, index* 
aux_storage, index* aux_atomic, double* out); 18 | template void quickselect>(float* in, float* tmp, index* count_tmp, index size, index rank, float* out); 19 | template void sampleselect_host>(double* in, double* tmp, double* tree, index* count_tmp, index size, index rank, double* out); 20 | template void quickselect>(float* in, float* tmp, index* count_tmp, index size, index rank, float* out); 21 | } -------------------------------------------------------------------------------- /lib/qs_launchers.cuh: -------------------------------------------------------------------------------- 1 | /* 2 | * Parallel selection algorithm on GPUs 3 | * Copyright (c) 2018-2019 Tobias Ribizel (oss@ribizel.de) 4 | * 5 | * This program is free software: you can redistribute it and/or modify 6 | * it under the terms of the GNU General Public License as published by 7 | * the Free Software Foundation, version 3. 8 | * 9 | * This program is distributed in the hope that it will be useful, but 10 | * WITHOUT ANY WARRANTY; without even the implied warranty of 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 12 | * General Public License for more details. 13 | * 14 | * You should have received a copy of the GNU General Public License 15 | * along with this program. If not, see . 16 | */ 17 | #ifndef QS_LAUNCHERS_CUH 18 | #define QS_LAUNCHERS_CUH 19 | 20 | #include "qs_reduce.cuh" 21 | #include "qs_scan.cuh" 22 | 23 | namespace gpu { 24 | 25 | using kernels::partition; 26 | using kernels::partition_count; 27 | using kernels::partition_distr; 28 | using kernels::partition_prefixsum; 29 | 30 | template 31 | __device__ __host__ void partition(const T* in, T* out, index* counts, index size, T pivot) { 32 | auto bsize = Config::algorithm::max_block_size; 33 | auto nblocks = min(ceil_div(size, bsize), Config::algorithm::max_block_count); 34 | auto per_thread = ceil_div(size, nblocks * bsize); 35 | if (Config::algorithm::shared_memory) { 36 | partition_count<<>>(in, counts, size, pivot, per_thread); 37 | partition_prefixsum<<<1, Config::algorithm::max_block_count>>>(counts, nblocks); 38 | partition_distr<<>>(in, out, counts, size, pivot, per_thread); 39 | } else { 40 | partition<<>>(in, out, counts, size, pivot, per_thread); 41 | } 42 | } 43 | 44 | } // namespace gpu 45 | 46 | #endif // QS_LAUNCHERS_CUH -------------------------------------------------------------------------------- /lib/qs_recursion.cuh: -------------------------------------------------------------------------------- 1 | /* 2 | * Parallel selection algorithm on GPUs 3 | * Copyright (c) 2018-2019 Tobias Ribizel (oss@ribizel.de) 4 | * 5 | * This program is free software: you can redistribute it and/or modify 6 | * it under the terms of the GNU General Public License as published by 7 | * the Free Software Foundation, version 3. 8 | * 9 | * This program is distributed in the hope that it will be useful, but 10 | * WITHOUT ANY WARRANTY; without even the implied warranty of 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 12 | * General Public License for more details. 13 | * 14 | * You should have received a copy of the GNU General Public License 15 | * along with this program. If not, see . 
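The partition launcher above chooses between two strategies: with shared memory, a deterministic three-kernel pipeline (partition_count, then partition_prefixsum, then partition_distr); without it, a single partition kernel that orders its writes through two global atomic counters. Both produce the same output layout, sketched below as a sequential host-side reference. This is an illustration only, not part of the library; partition_reference is a hypothetical name, and note that neither GPU path is a stable partition.

#include <vector>

// Sequential sketch of the layout both partition paths produce: elements
// smaller than the pivot are packed at the front, everything else is packed
// backwards from the end (target_idx = size - 1 - rofs in the kernels).
void partition_reference(const std::vector<float>& in, std::vector<float>& out,
                         float pivot, int& lcount, int& rcount) {
    int n = static_cast<int>(in.size());
    out.resize(n);
    lcount = 0;
    rcount = 0;
    for (float el : in) {
        if (el < pivot) {
            out[lcount++] = el;          // left part grows from the front
        } else {
            out[n - 1 - rcount++] = el;  // right part grows from the back
        }
    }
}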
16 | */ 17 | #ifndef QS_RECURSION_CUH 18 | #define QS_RECURSION_CUH 19 | 20 | #include "qs_reduce.cuh" 21 | #include "qs_scan.cuh" 22 | #include "utils_prefixsum.cuh" 23 | #include "utils_sampling.cuh" 24 | #include "utils_basecase.cuh" 25 | #include "utils_search.cuh" 26 | 27 | namespace gpu { 28 | namespace kernels { 29 | 30 | template 31 | __global__ void quickselect_tailcall(T* in, T* tmp, 32 | index* count_tmp, index size, index rank, T pivot, 33 | T* out); 34 | 35 | template 36 | __device__ __forceinline__ void launch_quickselect(T* in, T* tmp, 37 | index* count_tmp, index size, 38 | index rank, T* out) { 39 | auto idx = threadIdx.x; 40 | // assert blockDim.x == warp_size 41 | 42 | if (size <= Config::basecase::size) { 43 | if (idx == 0) { 44 | select_bitonic_basecase<<<1, Config::basecase::launch_size>>>(in, size, rank, out); 45 | } 46 | } else { 47 | // find sample median 48 | auto pick_idx = random_pick_idx(idx, warp_size, size); 49 | auto pick = in[pick_idx]; 50 | auto local = pick; 51 | bitonic_helper_warp::sort(&local, false); 52 | auto pivot = shfl(full_mask, local, warp_size / 2); 53 | 54 | // determine the index of the sample median 55 | auto mask = ballot(full_mask, pick == pivot); 56 | auto pivot_idx = shfl(full_mask, pick_idx, __ffs(mask) - 1); 57 | if (idx > 0) { 58 | return; 59 | } 60 | // swap the sample median to the first position 61 | swap(in[pivot_idx], in[0]); 62 | // reset atomic counters 63 | if (!Config::algorithm::shared_memory) { 64 | count_tmp[0] = 0; 65 | count_tmp[1] = 0; 66 | } 67 | gpu::partition(in + 1, tmp, count_tmp, size - 1, pivot); 68 | quickselect_tailcall 69 | <<<1, warp_size>>>(in + 1, tmp, count_tmp, size - 1, rank, pivot, out); 70 | } 71 | } 72 | 73 | template 74 | __global__ void quickselect_tailcall(T* in, T* tmp, 75 | index* count_tmp, index size, index rank, T pivot, 76 | T* out) { 77 | if (threadIdx.x >= warp_size) { 78 | return; 79 | } 80 | 81 | auto lcount = count_tmp[0]; 82 | auto rcount = count_tmp[1]; 83 | if (rank == lcount) { 84 | if (threadIdx.x == 0) { 85 | *out = pivot; 86 | } 87 | } else if (rank < lcount) { 88 | launch_quickselect(tmp, in, count_tmp, lcount, rank, out); 89 | } else { 90 | launch_quickselect(tmp + lcount, in, count_tmp, rcount, rank - lcount - 1, out); 91 | } 92 | } 93 | 94 | template 95 | __global__ void quickselect(T* in, T* tmp, index* count_tmp, 96 | index size, index rank, T* out) { 97 | launch_quickselect(in, tmp, count_tmp, size, rank, out); 98 | } 99 | 100 | } // namespace kernels 101 | 102 | template 103 | void quickselect(T* in, T* tmp, index* count_tmp, index size, index rank, T* out) { 104 | kernels::quickselect<<<1, warp_size>>>(in, tmp, count_tmp, size, rank, out); 105 | } 106 | 107 | template 108 | index quickselect_alloc_size(index size) { 109 | return sizeof(index) * (Config::algorithm::max_block_count * 2 + 2); 110 | } 111 | 112 | } // namespace gpu 113 | 114 | #endif // QS_RECURSION_CUH 115 | -------------------------------------------------------------------------------- /lib/qs_recursion_multi.cuh: -------------------------------------------------------------------------------- 1 | /* 2 | * Parallel selection algorithm on GPUs 3 | * Copyright (c) 2018-2019 Tobias Ribizel (oss@ribizel.de) 4 | * 5 | * This program is free software: you can redistribute it and/or modify 6 | * it under the terms of the GNU General Public License as published by 7 | * the Free Software Foundation, version 3. 
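The recursion above can be summarized as follows: a warp picks a sample, sorts it with a bitonic network, and takes its median as pivot; the pivot is swapped to the front, the remaining size - 1 elements are partitioned, and quickselect_tailcall descends into the side containing the rank, adjusting the rank by lcount + 1 when going right. Below is a sequential host-side sketch of that control flow, with a simplified pivot (the middle element) standing in for the warp-level sample median; quickselect_reference is an illustrative name, not a library function.

#include <cstddef>
#include <utility>
#include <vector>

float quickselect_reference(std::vector<float> v, int rank) {
    while (v.size() > 1) {
        // simplified pivot choice; the kernels use a sorted warp-sized sample
        std::swap(v[v.size() / 2], v[0]);
        float pivot = v[0];
        std::vector<float> left, right;
        for (std::size_t i = 1; i < v.size(); ++i) {
            (v[i] < pivot ? left : right).push_back(v[i]);
        }
        int lcount = static_cast<int>(left.size());
        if (rank == lcount) {
            return pivot;  // the pivot itself is the sought element
        } else if (rank < lcount) {
            v = std::move(left);
        } else {
            rank -= lcount + 1;  // skip the left part and the pivot
            v = std::move(right);
        }
    }
    return v[0];
}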
8 | * 9 | * This program is distributed in the hope that it will be useful, but 10 | * WITHOUT ANY WARRANTY; without even the implied warranty of 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 12 | * General Public License for more details. 13 | * 14 | * You should have received a copy of the GNU General Public License 15 | * along with this program. If not, see . 16 | */ 17 | #ifndef QS_RECURSION_MULTI_CUH 18 | #define QS_RECURSION_MULTI_CUH 19 | 20 | #include "qs_recursion.cuh" 21 | 22 | namespace gpu { 23 | namespace kernels { 24 | 25 | template 26 | __global__ void quickselect_tailcall_multi(T* in, T* tmp, 27 | index* count_tmp, index size, const index* ranks, index rank_count, index rank_base, T pivot, 28 | T* out); 29 | 30 | template 31 | __device__ __forceinline__ void launch_quickselect_multi(T* in, T* tmp, 32 | index* count_tmp, index size, 33 | const index* ranks, index rank_count, index rank_base, T* out) { 34 | if (rank_count == 0) { 35 | return; 36 | } 37 | auto idx = threadIdx.x; 38 | // assert blockDim.x == warp_size 39 | 40 | if (size <= Config::basecase::size) { 41 | if (idx == 0) { 42 | select_bitonic_multiple_basecase<<<1, Config::basecase::launch_size>>>(in, size, ranks, rank_count, rank_base, out); 43 | } 44 | } else { 45 | // find sample median 46 | auto pick_idx = random_pick_idx(idx, warp_size, size); 47 | auto pick = in[pick_idx]; 48 | auto local = pick; 49 | bitonic_helper_warp::sort(&local, false); 50 | auto pivot = shfl(full_mask, local, warp_size / 2); 51 | 52 | // determine the index of the sample median 53 | auto mask = ballot(full_mask, pick == pivot); 54 | auto pivot_idx = shfl(full_mask, pick_idx, __ffs(mask) - 1); 55 | if (idx > 0) { 56 | return; 57 | } 58 | // swap the sample median to the first position 59 | swap(in[pivot_idx], in[0]); 60 | // reset atomic counters 61 | if (!Config::algorithm::shared_memory) { 62 | count_tmp[0] = 0; 63 | count_tmp[1] = 0; 64 | } 65 | gpu::partition(in + 1, tmp, count_tmp, size - 1, pivot); 66 | quickselect_tailcall_multi 67 | <<<2, warp_size>>>(in + 1, tmp, count_tmp, size - 1, ranks, rank_count, rank_base, pivot, out); 68 | } 69 | } 70 | 71 | template 72 | __global__ void quickselect_tailcall_multi(T* in, T* tmp, 73 | index* count_tmp, index size, const index* ranks, index rank_count, index rank_base, T pivot, 74 | T* out) { 75 | // assert blockDim.x == warp_size 76 | 77 | auto lcount = count_tmp[0]; 78 | auto rcount = count_tmp[1]; 79 | auto middle = binary_search(ranks, rank_count, lcount + rank_base); 80 | if (blockIdx.x == 0) { 81 | if (middle < rank_count && ranks[middle] == lcount + rank_base) { 82 | if (threadIdx.x == 0) { 83 | out[middle] = pivot; 84 | } 85 | if (middle < rank_count - 1) { 86 | launch_quickselect_multi(tmp + lcount, in + lcount, count_tmp + lcount, rcount, 87 | ranks + middle + 1, rank_count - middle - 1, rank_base + (lcount + 1), out + middle + 1); 88 | } 89 | } else { 90 | if (middle < rank_count) { 91 | launch_quickselect_multi(tmp + lcount, in + lcount, count_tmp + lcount, rcount, 92 | ranks + middle, rank_count - middle, rank_base + (lcount + 1), out + middle); 93 | } 94 | } 95 | } else { 96 | if (middle > 0) { 97 | launch_quickselect_multi(tmp, in, count_tmp, lcount, ranks, middle, rank_base, out); 98 | } 99 | } 100 | } 101 | 102 | template 103 | __global__ void quickselect_multi(T* in, T* tmp, index* count_tmp, 104 | index size, const index* ranks, index rank_count, T* out) { 105 | launch_quickselect_multi(in, tmp, count_tmp, size, ranks, rank_count, 0, out); 106 
| } 107 | 108 | } // namespace kernels 109 | 110 | template 111 | void quickselect_multi(T* in, T* tmp, index* count_tmp, index size, const index* ranks, index rank_count, T* out) { 112 | kernels::quickselect_multi<<<1, warp_size>>>(in, tmp, count_tmp, size, ranks, rank_count, out); 113 | } 114 | 115 | template 116 | index quickselect_alloc_size_multi(index size) { 117 | return sizeof(index) * (std::max(Config::algorithm::max_block_count * 2 + 2, size)); 118 | } 119 | 120 | } // namespace gpu 121 | 122 | #endif // QS_RECURSION_MULTI_CUH 123 | -------------------------------------------------------------------------------- /lib/qs_reduce.cuh: -------------------------------------------------------------------------------- 1 | /* 2 | * Parallel selection algorithm on GPUs 3 | * Copyright (c) 2018-2019 Tobias Ribizel (oss@ribizel.de) 4 | * 5 | * This program is free software: you can redistribute it and/or modify 6 | * it under the terms of the GNU General Public License as published by 7 | * the Free Software Foundation, version 3. 8 | * 9 | * This program is distributed in the hope that it will be useful, but 10 | * WITHOUT ANY WARRANTY; without even the implied warranty of 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 12 | * General Public License for more details. 13 | * 14 | * You should have received a copy of the GNU General Public License 15 | * along with this program. If not, see . 16 | */ 17 | #ifndef QS_REDUCE_CUH 18 | #define QS_REDUCE_CUH 19 | 20 | #include "utils_prefixsum.cuh" 21 | 22 | namespace gpu { 23 | namespace kernels { 24 | 25 | template 26 | __global__ void partition_prefixsum(index* counts, index block_count) { 27 | __shared__ index local_lcounts[Config::algorithm::max_block_count]; 28 | __shared__ index local_rcounts[Config::algorithm::max_block_count]; 29 | auto i = threadIdx.x; 30 | auto l = i >= block_count ? 0 : counts[2 * i]; 31 | auto r = i >= block_count ? 0 : counts[2 * i + 1]; 32 | local_lcounts[i] = l; 33 | local_rcounts[i] = r; 34 | small_prefix_sum(local_lcounts); 35 | small_prefix_sum(local_rcounts); 36 | __syncthreads(); 37 | if (i < block_count) { 38 | counts[2 * i + 2] = local_lcounts[i]; 39 | counts[2 * i + 3] = local_rcounts[i]; 40 | } 41 | // store the total sum at the beginning 42 | if (i == block_count - 1) { 43 | counts[0] = l + local_lcounts[i]; 44 | counts[1] = r + local_rcounts[i]; 45 | } 46 | } 47 | 48 | } // namespace kernels 49 | } // namespace gpu 50 | 51 | #endif // QS_REDUCE_CUH -------------------------------------------------------------------------------- /lib/qs_scan.cuh: -------------------------------------------------------------------------------- 1 | /* 2 | * Parallel selection algorithm on GPUs 3 | * Copyright (c) 2018-2019 Tobias Ribizel (oss@ribizel.de) 4 | * 5 | * This program is free software: you can redistribute it and/or modify 6 | * it under the terms of the GNU General Public License as published by 7 | * the Free Software Foundation, version 3. 8 | * 9 | * This program is distributed in the hope that it will be useful, but 10 | * WITHOUT ANY WARRANTY; without even the implied warranty of 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 12 | * General Public License for more details. 13 | * 14 | * You should have received a copy of the GNU General Public License 15 | * along with this program. If not, see . 
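partition_prefixsum packs everything the tail call needs into the single counts array: the per-block pairs written by partition_count at counts[2b] and counts[2b + 1] are replaced by their exclusive prefix sums, shifted forward by one pair (so block b reads its output offsets at counts[2b + 2] and counts[2b + 3] in partition_distr), and the grand totals end up at counts[0] and counts[1]. A sequential reference of that layout, assuming the same conventions; the helper name is illustrative.

#include <vector>

void partition_prefixsum_reference(std::vector<int>& counts, int block_count) {
    // read the per-block pairs first, since the shifted writes below overlap them
    std::vector<int> l(block_count), r(block_count);
    for (int b = 0; b < block_count; ++b) {
        l[b] = counts[2 * b];
        r[b] = counts[2 * b + 1];
    }
    int lsum = 0, rsum = 0;
    for (int b = 0; b < block_count; ++b) {
        counts[2 * b + 2] = lsum;  // exclusive prefix sums, shifted by one pair
        counts[2 * b + 3] = rsum;
        lsum += l[b];
        rsum += r[b];
    }
    counts[0] = lsum;  // grand totals, read by quickselect_tailcall
    counts[1] = rsum;
}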
16 | */ 17 | #ifndef QS_SCAN_CUH 18 | #define QS_SCAN_CUH 19 | 20 | #include "utils_warpaggr.cuh" 21 | #include "utils_work.cuh" 22 | 23 | namespace gpu { 24 | namespace kernels { 25 | 26 | template 27 | __device__ void partition_impl(const T* in, index size, T pivot, index workcount, 28 | Callback callback) { 29 | blockwise_work(workcount, size, [&](index idx, mask amask) { 30 | auto el = in[idx]; 31 | bool left = el < pivot; 32 | auto lmask = ballot(amask, left); 33 | auto rmask = lmask ^ amask; 34 | 35 | callback(el, left, amask, lmask, rmask); 36 | }); 37 | } 38 | 39 | template 40 | __global__ void partition(const T* in, T* out, index* atomic, 41 | index size, T pivot, index workcount) { 42 | partition_impl(in, size, pivot, workcount, 43 | [&](T el, bool l, mask amask, mask lm, mask rm) { 44 | auto lofs = warp_aggr_atomic_count_mask(atomic, amask, lm); 45 | auto rofs = warp_aggr_atomic_count_mask(atomic + 1, amask, rm); 46 | auto target_idx = l ? lofs : size - 1 - rofs; 47 | out[target_idx] = el; 48 | }); 49 | } 50 | 51 | template 52 | __global__ void partition_count(const T* in, index* counts, index size, 53 | T pivot, index workcount) { 54 | __shared__ index lcount, rcount; 55 | if (threadIdx.x == 0) { 56 | lcount = 0; 57 | rcount = 0; 58 | } 59 | __syncthreads(); 60 | partition_impl(in, size, pivot, workcount, 61 | [&](T el, bool l, mask amask, mask lm, mask rm) { 62 | if (threadIdx.x % warp_size == 0) { 63 | atomicAdd(&lcount, __popc(lm)); 64 | atomicAdd(&rcount, __popc(rm)); 65 | } 66 | }); 67 | __syncthreads(); 68 | if (threadIdx.x == 0) { 69 | counts[2 * blockIdx.x] = lcount; 70 | counts[2 * blockIdx.x + 1] = rcount; 71 | } 72 | } 73 | 74 | template 75 | __global__ void partition_distr(const T* in, T* out, 76 | const index* counts, index size, T pivot, 77 | index workcount) { 78 | __shared__ index lcount, rcount; 79 | if (threadIdx.x == 0) { 80 | lcount = counts[2 * blockIdx.x + 2]; 81 | rcount = counts[2 * blockIdx.x + 3]; 82 | } 83 | __syncthreads(); 84 | partition_impl(in, size, pivot, workcount, 85 | [&](T el, bool l, mask amask, mask lm, mask rm) { 86 | auto lofs = warp_aggr_atomic_count_mask(&lcount, amask, lm); 87 | auto rofs = warp_aggr_atomic_count_mask(&rcount, amask, rm); 88 | auto target_idx = l ? lofs : size - 1 - rofs; 89 | out[target_idx] = el; 90 | }); 91 | } 92 | 93 | } // namespace kernels 94 | } // namespace gpu 95 | 96 | #endif // QS_SCAN_CUH -------------------------------------------------------------------------------- /lib/ssss_build_searchtree.cuh: -------------------------------------------------------------------------------- 1 | /* 2 | * Parallel selection algorithm on GPUs 3 | * Copyright (c) 2018-2019 Tobias Ribizel (oss@ribizel.de) 4 | * 5 | * This program is free software: you can redistribute it and/or modify 6 | * it under the terms of the GNU General Public License as published by 7 | * the Free Software Foundation, version 3. 8 | * 9 | * This program is distributed in the hope that it will be useful, but 10 | * WITHOUT ANY WARRANTY; without even the implied warranty of 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 12 | * General Public License for more details. 13 | * 14 | * You should have received a copy of the GNU General Public License 15 | * along with this program. If not, see . 
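The kernels above avoid one atomicAdd per element: each warp ballots its left/right decisions into lmask and rmask, the group leader performs a single atomicAdd per side, and every other lane derives its slot from the popcount of the mask bits below its lane index (prefix_popc in utils_warpaggr.cuh). A host-side emulation of the per-lane offset computation, assuming C++20 <bit>; lane_offset is an illustrative name, and base stands for the value the leader obtained from its atomicAdd.

#include <bit>
#include <cstdint>

int lane_offset(std::uint32_t lmask, int lane, int base) {
    // count the lanes below `lane` whose elements also go to the same side
    std::uint32_t below = lmask & ((1u << lane) - 1);
    return base + std::popcount(below);
}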
16 | */ 17 | #ifndef SSSS_BUILD_SEARCHTREE_CUH 18 | #define SSSS_BUILD_SEARCHTREE_CUH 19 | 20 | #include "utils_sampling.cuh" 21 | #include "utils_sort.cuh" 22 | #include "utils_work.cuh" 23 | 24 | namespace gpu { 25 | namespace kernels { 26 | 27 | template 28 | __device__ __forceinline__ index searchtree_entry(index idx) { 29 | // determine the level by the node index 30 | // rationale: a complete binary tree with 2^k leaves has 2^k - 1 inner nodes 31 | // lvl == log2(idx + 1) 32 | auto lvl = 31 - __clz(idx + 1); 33 | // step == n / 2^lvl 34 | auto step = Config::searchtree::width >> lvl; 35 | // index within the level 36 | auto lvl_idx = idx - (1 << lvl) + 1; 37 | return lvl_idx * step + step / 2; 38 | } 39 | 40 | template 41 | __host__ __device__ bool is_equality_bucket(const T* leaves, index bucket_idx) { 42 | // first and last bucket can't definitely be checked to be equality buckets 43 | return bucket_idx > 0 && bucket_idx < Config::searchtree::width - 1 && leaves[bucket_idx + 1] == add_epsilon(leaves[bucket_idx]); 44 | } 45 | 46 | template 47 | __device__ void equality_bucket(T* leaves) { 48 | auto idx = threadIdx.x; 49 | if (idx < Config::searchtree::width && idx > 0) { 50 | // If we are the last in a sequence of equal elements, we add a small epsilon 51 | bool equality = leaves[idx] == leaves[idx - 1] && 52 | (idx == Config::searchtree::width - 1 || leaves[idx] < leaves[idx + 1]); 53 | if (equality) { 54 | leaves[idx] = add_epsilon(leaves[idx]); 55 | } 56 | } 57 | } 58 | 59 | template 60 | __device__ void build_searchtree_shared(const T* in, index size, T* tree) { 61 | __shared__ T sample_buffer[Config::sample::size]; 62 | static_assert(Config::sample::size >= Config::searchtree::width, "sample too small"); 63 | auto idx = threadIdx.x; 64 | 65 | // pick sample 66 | T local_buffer[Config::sample::local_size]; 67 | if (threadIdx.x * Config::sample::local_size < Config::sample::size) { 68 | for (auto i = 0; i < Config::sample::local_size; ++i) { 69 | local_buffer[i] = in[random_pick_idx(threadIdx.x * Config::sample::local_size + i, Config::sample::size, size)]; 70 | } 71 | } 72 | // sort sample 73 | using sorter = bitonic_helper_global; 74 | sorter::sort(local_buffer, sample_buffer, false); 75 | __syncthreads(); 76 | // pick splitters from sorted sample 77 | if (idx < Config::searchtree::width) { 78 | tree[idx + Config::searchtree::width - 1] = sample_buffer[uniform_pick_idx(idx, 79 | Config::searchtree::width, Config::sample::size)]; 80 | } 81 | __syncthreads(); 82 | // create equality bucket if necessary 83 | equality_bucket(tree + (Config::searchtree::width - 1)); 84 | __syncthreads(); 85 | // inner nodes 86 | if (idx < Config::searchtree::width - 1) { 87 | tree[idx] = tree[searchtree_entry(idx) + Config::searchtree::width - 1]; 88 | } 89 | } 90 | 91 | template 92 | __global__ void build_searchtree(const T* in, T* out, index size) { 93 | build_searchtree_shared(in, size, out); 94 | } 95 | 96 | } // namespace kernels 97 | } // namespace gpu 98 | 99 | #endif // SSSS_BUILD_SEARCHTREE_CU -------------------------------------------------------------------------------- /lib/ssss_collect.cuh: -------------------------------------------------------------------------------- 1 | /* 2 | * Parallel selection algorithm on GPUs 3 | * Copyright (c) 2018-2019 Tobias Ribizel (oss@ribizel.de) 4 | * 5 | * This program is free software: you can redistribute it and/or modify 6 | * it under the terms of the GNU General Public License as published by 7 | * the Free Software Foundation, version 3. 
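searchtree_entry linearizes a complete binary tree stored in breadth-first order: node idx sits on level floor(log2(idx + 1)), each node on that level covers step = width / 2^lvl leaves, and the node's splitter is the middle leaf of its range. Below is a host-side copy with std::bit_width (C++20) in place of the device-only __clz, and the tree width made a parameter for illustration; for width = 8 it yields 4, 2, 6, 1, 3, 5, 7 for inner nodes 0..6, i.e. the root gets the median splitter and its children the quartiles.

#include <bit>

int searchtree_entry_reference(int idx, int width) {
    int lvl = std::bit_width(static_cast<unsigned>(idx + 1)) - 1;  // 31 - __clz(idx + 1)
    int step = width >> lvl;             // leaves covered per node on this level
    int lvl_idx = idx - (1 << lvl) + 1;  // position within the level
    return lvl_idx * step + step / 2;    // middle leaf of the node's range
}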
8 | * 9 | * This program is distributed in the hope that it will be useful, but 10 | * WITHOUT ANY WARRANTY; without even the implied warranty of 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 12 | * General Public License for more details. 13 | * 14 | * You should have received a copy of the GNU General Public License 15 | * along with this program. If not, see . 16 | */ 17 | #ifndef SSSS_COLLECT_CUH 18 | #define SSSS_COLLECT_CUH 19 | 20 | #include "ssss_reduce.cuh" 21 | #include "utils_bytestorage.cuh" 22 | #include "utils_warpaggr.cuh" 23 | #include "utils_work.cuh" 24 | 25 | namespace gpu { 26 | namespace kernels { 27 | 28 | template 29 | __device__ void 30 | collect_bucket_impl(const T* data, const poracle* oracles_packed, 31 | const index* prefix_sum, T* out, index size, 32 | oracle bucket, index* atomic, index workcount) { 33 | __shared__ index count; 34 | // initialize block-local count from prefix sum 35 | if (Config::algorithm::shared_memory && threadIdx.x == 0) { 36 | auto idx = partial_sum_idx(blockIdx.x, bucket, gridDim.x, Config::searchtree::width); 37 | count = prefix_sum[idx]; 38 | } 39 | __syncthreads(); 40 | // extract elements from the specified bucket 41 | blockwise_work(workcount, size, [&](index idx, mask amask) { 42 | // load bucket index 43 | auto packed = load_packed_bytes(oracles_packed, amask, idx); 44 | // determine target location 45 | index ofs{}; 46 | if (Config::algorithm::shared_memory) { 47 | ofs = warp_aggr_atomic_count_predicate(&count, amask, packed == bucket); 48 | } else { 49 | ofs = warp_aggr_atomic_count_predicate(atomic, amask, packed == bucket); 50 | } 51 | // store element 52 | if (packed == bucket) { 53 | out[ofs] = data[idx]; 54 | } 55 | }); 56 | } 57 | 58 | template 59 | __global__ void 60 | collect_bucket(const T* data, const poracle* oracles_packed, 61 | const index* prefix_sum, T* out, index size, oracle bucket, 62 | index* atomic, index workcount) { 63 | collect_bucket_impl(data, oracles_packed, prefix_sum, out, size, bucket, atomic, 64 | workcount); 65 | } 66 | 67 | template 68 | __global__ void 69 | collect_bucket_indirect(const T* data, const poracle* oracles_packed, 70 | const index* prefix_sum, T* out, index size, 71 | const oracle* bucket_ptr, index* atomic, index workcount) { 72 | collect_bucket_impl(data, oracles_packed, prefix_sum, out, size, *bucket_ptr, atomic, 73 | workcount); 74 | } 75 | 76 | } // namespace kernels 77 | } // namespace gpu 78 | 79 | #endif // SSSS_COLLECT_CUH -------------------------------------------------------------------------------- /lib/ssss_collect_multi.cuh: -------------------------------------------------------------------------------- 1 | /* 2 | * Parallel selection algorithm on GPUs 3 | * Copyright (c) 2018-2019 Tobias Ribizel (oss@ribizel.de) 4 | * 5 | * This program is free software: you can redistribute it and/or modify 6 | * it under the terms of the GNU General Public License as published by 7 | * the Free Software Foundation, version 3. 8 | * 9 | * This program is distributed in the hope that it will be useful, but 10 | * WITHOUT ANY WARRANTY; without even the implied warranty of 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 12 | * General Public License for more details. 13 | * 14 | * You should have received a copy of the GNU General Public License 15 | * along with this program. If not, see . 
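Stripped of the parallel bookkeeping, collect_bucket extracts exactly those elements whose precomputed bucket index (oracle) matches the chosen bucket; the per-block prefix sums (shared-memory path) or the global atomic merely parallelize the output offset. A sequential reference of what the kernel computes, with the caveat that the GPU version does not preserve element order; the helper name is illustrative.

#include <cstddef>
#include <cstdint>
#include <vector>

std::vector<float> collect_bucket_reference(const std::vector<float>& data,
                                            const std::vector<std::uint8_t>& oracles,
                                            std::uint8_t bucket) {
    std::vector<float> out;
    for (std::size_t i = 0; i < data.size(); ++i) {
        if (oracles[i] == bucket) {
            out.push_back(data[i]);
        }
    }
    return out;
}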
16 | */ 17 | #ifndef SSSS_COLLECT_MULTI_CUH 18 | #define SSSS_COLLECT_MULTI_CUH 19 | 20 | #include "ssss_reduce.cuh" 21 | #include "utils_bytestorage.cuh" 22 | #include "utils_warpaggr.cuh" 23 | #include "utils_work.cuh" 24 | #include "utils_mask.cuh" 25 | #include "utils_search.cuh" 26 | 27 | namespace gpu { 28 | namespace kernels { 29 | 30 | template 31 | __global__ void 32 | collect_buckets(const T* data, const poracle* oracles_packed, 33 | const index* block_prefix_sum, const index* bucket_out_ranges, 34 | T* out, index size, const mask* buckets, 35 | index* atomic, index workcount) { 36 | // initialize mask cache in shared memory 37 | constexpr auto mask_size = ceil_div(Config::searchtree::width, sizeof(mask) * 8); 38 | __shared__ mask shared_mask[mask_size]; 39 | static_assert(mask_size < 32, "mask too big, just a misconfiguration failsafe"); 40 | if (threadIdx.x < mask_size) { 41 | shared_mask[threadIdx.x] = buckets[threadIdx.x]; 42 | } 43 | 44 | // initialize block-local count from prefix sum 45 | __shared__ index count[Config::searchtree::width]; 46 | if (Config::algorithm::shared_memory) { 47 | blockwise_work_local(Config::searchtree::width, [&](index bucket) { 48 | auto base_idx = partial_sum_idx(blockIdx.x, bucket, gridDim.x, Config::searchtree::width); 49 | count[bucket] = bucket_out_ranges[bucket] + block_prefix_sum[base_idx]; 50 | }); 51 | } else { 52 | blockwise_work_local(Config::searchtree::width, [&](index bucket) { 53 | count[bucket] = bucket_out_ranges[bucket]; 54 | }); 55 | } 56 | 57 | __syncthreads(); 58 | 59 | // extract elements from the specified bucket 60 | blockwise_work(workcount, size, [&](index idx, mask amask) { 61 | // load bucket index 62 | auto bucket = load_packed_bytes(oracles_packed, amask, idx); 63 | // determine target location 64 | index ofs{}; 65 | if (check_mask(bucket, shared_mask)) { 66 | if (Config::algorithm::shared_memory) { 67 | ofs = atomicAdd(&count[bucket], 1); 68 | } else { 69 | ofs = atomicAdd(&atomic[bucket], 1) + count[bucket]; 70 | } 71 | // store element 72 | out[ofs] = data[idx]; 73 | } 74 | }); 75 | } 76 | 77 | } // namespace kernels 78 | } // namespace gpu 79 | 80 | #endif // SSSS_COLLECT_MULTI_CUH -------------------------------------------------------------------------------- /lib/ssss_count.cuh: -------------------------------------------------------------------------------- 1 | /* 2 | * Parallel selection algorithm on GPUs 3 | * Copyright (c) 2018-2019 Tobias Ribizel (oss@ribizel.de) 4 | * 5 | * This program is free software: you can redistribute it and/or modify 6 | * it under the terms of the GNU General Public License as published by 7 | * the Free Software Foundation, version 3. 8 | * 9 | * This program is distributed in the hope that it will be useful, but 10 | * WITHOUT ANY WARRANTY; without even the implied warranty of 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 12 | * General Public License for more details. 13 | * 14 | * You should have received a copy of the GNU General Public License 15 | * along with this program. If not, see . 
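The multi-rank variant above scatters all surviving buckets in one pass: a bucket whose bit is set in the mask gets its own output range starting at bucket_out_ranges[bucket], and elements are appended to their bucket's range as they are encountered. A sequential reference under the same conventions (out must be large enough to hold all selected buckets, order within a range is not preserved on the GPU, and the helper name is illustrative):

#include <cstddef>
#include <cstdint>
#include <vector>

void collect_buckets_reference(const std::vector<float>& data,
                               const std::vector<std::uint8_t>& oracles,
                               const std::vector<bool>& bucket_selected,
                               std::vector<int> next_out,  // copy of bucket_out_ranges
                               std::vector<float>& out) {
    for (std::size_t i = 0; i < data.size(); ++i) {
        auto b = oracles[i];
        if (bucket_selected[b]) {
            out[next_out[b]++] = data[i];  // append to this bucket's output range
        }
    }
}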
16 | */ 17 | #ifndef SSSS_COUNT_CUH 18 | #define SSSS_COUNT_CUH 19 | 20 | #include "ssss_reduce.cuh" 21 | #include "utils_bytestorage.cuh" 22 | #include "utils_warpaggr.cuh" 23 | #include "utils_work.cuh" 24 | 25 | namespace gpu { 26 | namespace kernels { 27 | 28 | template 29 | __device__ oracle searchtree_traversal(const T* searchtree, T el, mask amask, mask& equal_mask, T min_split, T max_split) { 30 | index i = 0; 31 | equal_mask = amask; 32 | if (Config::algorithm::bucket_select) { 33 | auto maxbucket = Config::searchtree::width - 1; 34 | auto floatbucket = (el - min_split) / (max_split - min_split) * Config::searchtree::width - T(0.5); 35 | floatbucket = floatbucket > maxbucket ? maxbucket : floatbucket; 36 | floatbucket = floatbucket < 0 ? 0 : floatbucket; 37 | auto bucket = oracle(floatbucket); 38 | for (index lvl = 0; lvl < Config::searchtree::height; ++lvl) { 39 | auto bit = (bucket >> lvl) & 1; 40 | equal_mask &= ballot(amask, bit) ^ (bit - 1); 41 | } 42 | return oracle(floatbucket); 43 | } else { 44 | auto root_splitter = searchtree[0]; 45 | bool next_smaller = el < root_splitter; 46 | for (index lvl = 0; lvl < Config::searchtree::height; ++lvl) { 47 | // compute next node index 48 | bool smaller = next_smaller; 49 | i = 2 * i + 2 - smaller; 50 | next_smaller = el < searchtree[i]; 51 | // update equality mask 52 | auto local_mask = ballot(amask, smaller) ^ (smaller - 1); 53 | equal_mask &= local_mask; 54 | } 55 | // return leaf rank 56 | return i - (Config::searchtree::width - 1); 57 | } 58 | } 59 | 60 | template 61 | __device__ __forceinline__ void ssss_impl(const T* in, const T* tree, 62 | index size, index workcount, BucketCallback bucket_cb) { 63 | __shared__ T local_tree[Config::algorithm::bucket_select ? 2 : Config::searchtree::size]; 64 | // Load searchtree into shared memory 65 | if (Config::algorithm::bucket_select) { 66 | if (threadIdx.x == 0) { 67 | local_tree[0] = tree[Config::searchtree::width]; 68 | local_tree[1] = tree[Config::searchtree::size - 1]; 69 | } 70 | } else { 71 | blockwise_work_local(Config::searchtree::size, [&](index i) { local_tree[i] = tree[i]; }); 72 | } 73 | __syncthreads(); 74 | // only for bucket select 75 | auto min_split = local_tree[0]; 76 | auto max_split = local_tree[1]; 77 | 78 | // Determine the bucket and equality mask for every entry 79 | blockwise_work(workcount, size, [&](index idx, mask amask) { 80 | mask equal_mask{}; 81 | auto bucket_idx = searchtree_traversal(local_tree, in[idx], amask, equal_mask, min_split, max_split); 82 | bucket_cb(idx, bucket_idx, amask, equal_mask); 83 | }); 84 | } 85 | 86 | template 87 | __global__ void count_buckets(const T* in, const T* tree, 88 | index* counts, poracle* oracles, index size, 89 | index workcount) { 90 | __shared__ index local_counts[Config::searchtree::width]; 91 | // Initialize shared-memory counts 92 | if (Config::algorithm::shared_memory) { 93 | blockwise_work_local(Config::searchtree::width, [&](index i) { local_counts[i] = 0; }); 94 | __syncthreads(); 95 | } 96 | // Traverse searchtree for every entry 97 | ssss_impl( 98 | in, tree, size, workcount, [&](index idx, oracle bucket, mask amask, mask mask) { 99 | static_assert(!Config::algorithm::write || Config::searchtree::height <= 8, 100 | "can't pack bucket idx into byte"); 101 | // Store oracles 102 | if (Config::algorithm::write) { 103 | store_packed_bytes(oracles, amask, bucket, idx); 104 | } 105 | // Increment bucket count 106 | index add = Config::algorithm::warp_aggr ? 
__popc(mask) : 1; 107 | if (!Config::algorithm::warp_aggr || is_group_leader(mask)) { 108 | if (Config::algorithm::shared_memory) { 109 | atomicAdd(&local_counts[bucket], add); 110 | } else { 111 | atomicAdd(&counts[bucket], add); 112 | } 113 | } 114 | }); 115 | // Write shared-memory counts to global memory 116 | if (Config::algorithm::shared_memory) { 117 | __syncthreads(); 118 | // store the local counts grouped by block idx 119 | blockwise_work_local(Config::searchtree::width, [&](oracle bucket) { 120 | auto idx = partial_sum_idx(blockIdx.x, bucket, gridDim.x, Config::searchtree::width); 121 | counts[idx] = local_counts[bucket]; 122 | }); 123 | } 124 | } 125 | 126 | } // namespace kernels 127 | } // namespace gpu 128 | 129 | #endif // SSSS_COUNT_CUH 130 | -------------------------------------------------------------------------------- /lib/ssss_launchers.cuh: -------------------------------------------------------------------------------- 1 | /* 2 | * Parallel selection algorithm on GPUs 3 | * Copyright (c) 2018-2019 Tobias Ribizel (oss@ribizel.de) 4 | * 5 | * This program is free software: you can redistribute it and/or modify 6 | * it under the terms of the GNU General Public License as published by 7 | * the Free Software Foundation, version 3. 8 | * 9 | * This program is distributed in the hope that it will be useful, but 10 | * WITHOUT ANY WARRANTY; without even the implied warranty of 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 12 | * General Public License for more details. 13 | * 14 | * You should have received a copy of the GNU General Public License 15 | * along with this program. If not, see . 16 | */ 17 | #ifndef SSSS_LAUNCHERS_CUH 18 | #define SSSS_LAUNCHERS_CUH 19 | 20 | #include "ssss_build_searchtree.cuh" 21 | #include "ssss_collect.cuh" 22 | #include "ssss_collect_multi.cuh" 23 | #include "ssss_count.cuh" 24 | #include "ssss_reduce.cuh" 25 | #include "ssss_merged.cuh" 26 | 27 | namespace gpu { 28 | 29 | template 30 | __host__ __device__ launch_parameters get_launch_parameters(index size) { 31 | launch_parameters result{}; 32 | result.block_size = Config::algorithm::max_block_size; 33 | result.block_count = min(ceil_div(size, result.block_size), Config::algorithm::max_block_count); 34 | auto threads = result.block_size * result.block_count; 35 | result.work_per_thread = ceil_div(size, threads); 36 | return result; 37 | } 38 | 39 | template 40 | __host__ __device__ void build_searchtree(const T* in, T* out, index size) { 41 | constexpr auto threads = Config::searchtree_kernel_size; 42 | static_assert(threads <= max_block_size, "Work won't fit into a single thread block"); 43 | kernels::build_searchtree<<<1, threads>>>(in, out, size); 44 | } 45 | 46 | template 47 | __host__ __device__ void count_buckets(const T* in, const T* tree, index* localcounts, 48 | index* counts, poracle* oracles, index size) { 49 | auto params = get_launch_parameters(size); 50 | if (Config::algorithm::shared_memory) { 51 | kernels::count_buckets<<>>( 52 | in, tree, localcounts, oracles, size, params.work_per_thread); 53 | constexpr auto reduce_bsize = 54 | min(Config::searchtree::width, Config::algorithm::max_block_count); 55 | constexpr auto reduce_blocks = ceil_div(Config::searchtree::width, reduce_bsize); 56 | if (Config::algorithm::write) { 57 | kernels::prefix_sum_counts 58 | <<>>(localcounts, counts, params.block_count); 59 | } else { 60 | kernels::reduce_counts 61 | <<>>(localcounts, counts, params.block_count); 62 | } 63 | } else { 64 | 
kernels::count_buckets<<>>( 65 | in, tree, counts, oracles, size, params.work_per_thread); 66 | } 67 | } 68 | 69 | template 70 | __host__ __device__ void collect_bucket(const T* data, const poracle* oracles_packed, 71 | const index* prefix_sum, T* out, index size, oracle bucket, 72 | index* atomic) { 73 | auto params = get_launch_parameters(size); 74 | kernels::collect_bucket<<>>( 75 | data, oracles_packed, prefix_sum, out, size, bucket, atomic, params.work_per_thread); 76 | } 77 | 78 | template 79 | __host__ __device__ void collect_bucket_indirect(const T* data, const poracle* oracles_packed, 80 | const index* prefix_sum, T* out, index size, 81 | const oracle* bucket, index* atomic) { 82 | auto params = get_launch_parameters(size); 83 | kernels::collect_bucket_indirect<<>>( 84 | data, oracles_packed, prefix_sum, out, size, bucket, atomic, params.work_per_thread); 85 | } 86 | 87 | template 88 | __host__ __device__ void collect_buckets(const T* data, const poracle* oracles_packed, 89 | const index* block_prefix_sum, const index* bucket_out_ranges, 90 | T* out, index size, mask* buckets, index* atomic) { 91 | auto params = get_launch_parameters(size); 92 | kernels::collect_buckets<<>>( 93 | data, oracles_packed, block_prefix_sum, bucket_out_ranges, out, size, buckets, atomic, params.work_per_thread); 94 | } 95 | 96 | template 97 | __host__ __device__ void ssss_merged( 98 | const T* in, 99 | T* out, 100 | poracle* oracles, 101 | index offset, 102 | const index* ranks, 103 | index rank_offset, 104 | index rank_base, 105 | const kernels::ssss_multi_aux* aux_in, 106 | kernels::ssss_multi_aux* aux_outs, 107 | T* out_trees) { 108 | kernels::ssss_merged_kernel<<>>( 109 | in, out, oracles, offset, ranks, rank_offset, rank_base, aux_in, aux_outs, out_trees); 110 | } 111 | 112 | } // namespace gpu 113 | 114 | #endif // SSSS_LAUNCHERS_CUH -------------------------------------------------------------------------------- /lib/ssss_merged_memory.cuh: -------------------------------------------------------------------------------- 1 | /* 2 | * Parallel selection algorithm on GPUs 3 | * Copyright (c) 2018-2019 Tobias Ribizel (oss@ribizel.de) 4 | * 5 | * This program is free software: you can redistribute it and/or modify 6 | * it under the terms of the GNU General Public License as published by 7 | * the Free Software Foundation, version 3. 8 | * 9 | * This program is distributed in the hope that it will be useful, but 10 | * WITHOUT ANY WARRANTY; without even the implied warranty of 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 12 | * General Public License for more details. 13 | * 14 | * You should have received a copy of the GNU General Public License 15 | * along with this program. If not, see . 16 | */ 17 | #ifndef SSSS_MERGED_MEMORY_CUH 18 | #define SSSS_MERGED_MEMORY_CUH 19 | 20 | #include 21 | #include "utils.cuh" 22 | 23 | namespace gpu { 24 | namespace kernels { 25 | 26 | template 27 | struct ssss_multi_aux { 28 | constexpr static auto mask_size = ceil_div(Config::searchtree::width, sizeof(mask) * 8); 29 | union { 30 | struct { 31 | T tree[Config::searchtree::size]; 32 | index bucket_counts[Config::searchtree::width + 1]; 33 | } stage1; 34 | struct { 35 | mask bucket_mask[mask_size]; 36 | index bucket_prefixsum[Config::searchtree::width + 1]; 37 | index bucket_masked_prefixsum[Config::searchtree::width + 1]; 38 | index rank_ranges[Config::searchtree::width + 1]; 39 | index atomic[Config::algorithm::shared_memory ? 
1 : Config::searchtree::width]; 40 | } stage2; 41 | }; 42 | }; 43 | 44 | } // namespace kernels 45 | } // namespace gpu 46 | 47 | #endif -------------------------------------------------------------------------------- /lib/ssss_reduce.cuh: -------------------------------------------------------------------------------- 1 | /* 2 | * Parallel selection algorithm on GPUs 3 | * Copyright (c) 2018-2019 Tobias Ribizel (oss@ribizel.de) 4 | * 5 | * This program is free software: you can redistribute it and/or modify 6 | * it under the terms of the GNU General Public License as published by 7 | * the Free Software Foundation, version 3. 8 | * 9 | * This program is distributed in the hope that it will be useful, but 10 | * WITHOUT ANY WARRANTY; without even the implied warranty of 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 12 | * General Public License for more details. 13 | * 14 | * You should have received a copy of the GNU General Public License 15 | * along with this program. If not, see . 16 | */ 17 | #ifndef SSSS_REDUCE_CUH 18 | #define SSSS_REDUCE_CUH 19 | 20 | #include "utils.cuh" 21 | 22 | namespace gpu { 23 | namespace kernels { 24 | 25 | __device__ inline index partial_sum_idx(index block, oracle bucket, int num_blocks, 26 | int num_buckets) { 27 | return bucket + block * num_buckets; 28 | } 29 | 30 | template 31 | __global__ void reduce_counts(const index* in, index* out, 32 | index num_blocks) { 33 | index bucket = blockIdx.x * blockDim.x + threadIdx.x; 34 | if (bucket < Config::searchtree::width) { 35 | index sum{}; 36 | for (index block = 0; block < num_blocks; ++block) { 37 | sum += in[partial_sum_idx(block, bucket, num_blocks, Config::searchtree::width)]; 38 | } 39 | out[bucket] = sum; 40 | } 41 | } 42 | 43 | template 44 | __global__ void prefix_sum_counts(index* in, index* out, 45 | index num_blocks) { 46 | index bucket = blockIdx.x * blockDim.x + threadIdx.x; 47 | if (bucket < Config::searchtree::width) { 48 | index sum{}; 49 | for (index block = 0; block < num_blocks; ++block) { 50 | auto idx = partial_sum_idx(block, bucket, num_blocks, Config::searchtree::width); 51 | auto tmp = in[idx]; 52 | in[idx] = sum; 53 | sum += tmp; 54 | } 55 | out[bucket] = sum; 56 | } 57 | } 58 | 59 | } // namespace kernels 60 | } // namespace gpu 61 | 62 | #endif // SSSS_REDUCE_CUH 63 | -------------------------------------------------------------------------------- /lib/utils.cuh: -------------------------------------------------------------------------------- 1 | /* 2 | * Parallel selection algorithm on GPUs 3 | * Copyright (c) 2018-2019 Tobias Ribizel (oss@ribizel.de) 4 | * 5 | * This program is free software: you can redistribute it and/or modify 6 | * it under the terms of the GNU General Public License as published by 7 | * the Free Software Foundation, version 3. 8 | * 9 | * This program is distributed in the hope that it will be useful, but 10 | * WITHOUT ANY WARRANTY; without even the implied warranty of 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 12 | * General Public License for more details. 13 | * 14 | * You should have received a copy of the GNU General Public License 15 | * along with this program. If not, see . 
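In ssss_reduce.cuh, the per-block counts form a row-major matrix with one row per block and searchtree::width columns (partial_sum_idx = bucket + block * num_buckets). reduce_counts sums each column; prefix_sum_counts additionally rewrites each column in place into its exclusive prefix sum over blocks, so every block knows where its elements of a given bucket start. A sequential reference of the latter; the helper name is illustrative.

#include <vector>

void prefix_sum_counts_reference(std::vector<int>& in, std::vector<int>& out,
                                 int num_blocks, int num_buckets) {
    for (int bucket = 0; bucket < num_buckets; ++bucket) {
        int sum = 0;
        for (int block = 0; block < num_blocks; ++block) {
            int idx = bucket + block * num_buckets;  // partial_sum_idx
            int tmp = in[idx];
            in[idx] = sum;  // this block's bucket elements start `sum` positions in
            sum += tmp;
        }
        out[bucket] = sum;  // total count of this bucket across all blocks
    }
}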
16 | */ 17 | #ifndef UTILS_CUH 18 | #define UTILS_CUH 19 | 20 | #include 21 | #include 22 | #include 23 | 24 | namespace gpu { 25 | 26 | struct launch_parameters { 27 | index block_count; 28 | index block_size; 29 | index work_per_thread; 30 | }; 31 | 32 | constexpr mask full_mask = 0xffffffffu; 33 | 34 | template 35 | struct max_helper { 36 | // workaround for ::max being a __host__ function 37 | constexpr static T value = std::numeric_limits::max(); 38 | }; 39 | 40 | __host__ __device__ inline float add_epsilon(float f) { 41 | return nextafterf(f, max_helper::value); 42 | } 43 | 44 | __host__ __device__ inline double add_epsilon(double f) { 45 | return nextafter(f, max_helper::value); 46 | } 47 | 48 | __host__ __device__ inline constexpr index min(index a, index b) { return a > b ? b : a; } 49 | 50 | __host__ __device__ inline constexpr index max(index a, index b) { return a < b ? b : a; } 51 | 52 | __host__ __device__ inline constexpr index ceil_div(index a, index b) { return (a + b - 1) / b; } 53 | 54 | __device__ inline index ceil_log2(index i) { 55 | auto high_bit = 31 - __clz(i); 56 | return __popc(i) <= 1 ? high_bit : high_bit + 1; 57 | } 58 | 59 | namespace kernels { 60 | 61 | template 62 | __device__ void swap(T& a, T& b) { 63 | auto tmp = b; 64 | b = a; 65 | a = tmp; 66 | } 67 | 68 | template 69 | __device__ T shfl(mask amask, T el, index source, index width = warp_size) { 70 | #if (__CUDACC_VER_MAJOR__ >= 9) 71 | return __shfl_sync(amask, el, source, width); 72 | #else 73 | return __shfl(el, source); 74 | #endif 75 | } 76 | 77 | template 78 | __device__ T shfl_xor(mask amask, T el, index lanemask, index width = warp_size) { 79 | #if (__CUDACC_VER_MAJOR__ >= 9) 80 | return __shfl_xor_sync(amask, el, lanemask, width); 81 | #else 82 | return __shfl_xor(el, lanemask); 83 | #endif 84 | } 85 | 86 | __device__ inline mask ballot(mask amask, bool predicate) { 87 | #if (__CUDACC_VER_MAJOR__ >= 9) 88 | return __ballot_sync(amask, predicate); 89 | #else 90 | return __ballot(predicate) & amask; 91 | #endif 92 | } 93 | 94 | __device__ inline void sync_dist(int dist) { 95 | #if (__CUDACC_VER_MAJOR__ >= 9) 96 | if (dist >= warp_size) { 97 | __syncthreads(); 98 | } else { 99 | __syncwarp(); 100 | } 101 | #else 102 | __syncthreads(); 103 | #endif 104 | } 105 | 106 | } // namespace kernels 107 | } // namespace gpu 108 | 109 | #endif // UTILS_CUH 110 | -------------------------------------------------------------------------------- /lib/utils_basecase.cuh: -------------------------------------------------------------------------------- 1 | /* 2 | * Parallel selection algorithm on GPUs 3 | * Copyright (c) 2018-2019 Tobias Ribizel (oss@ribizel.de) 4 | * 5 | * This program is free software: you can redistribute it and/or modify 6 | * it under the terms of the GNU General Public License as published by 7 | * the Free Software Foundation, version 3. 8 | * 9 | * This program is distributed in the hope that it will be useful, but 10 | * WITHOUT ANY WARRANTY; without even the implied warranty of 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 12 | * General Public License for more details. 13 | * 14 | * You should have received a copy of the GNU General Public License 15 | * along with this program. If not, see . 
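ceil_log2 above relies on the device intrinsics __clz and __popc; the identity it implements is ceil(log2(i)): the highest set bit position when i is a power of two, and one more otherwise. A host-side equivalent using C++20 <bit>, useful e.g. for sizing the prefix-sum and bitonic networks on the host; the function name is illustrative.

#include <bit>

int ceil_log2_host(unsigned i) {
    int high_bit = std::bit_width(i) - 1;  // equals 31 - __clz(i) for i > 0
    // powers of two are exact: ceil_log2_host(8) == 3; otherwise round up:
    // ceil_log2_host(9) == 4 (i == 0 yields -1, as in the device code)
    return std::popcount(i) <= 1 ? high_bit : high_bit + 1;
}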
16 | */ 17 | #ifndef UTILS_BASECASE_CUH 18 | #define UTILS_BASECASE_CUH 19 | #include "utils_sort.cuh" 20 | 21 | namespace gpu { 22 | namespace kernels { 23 | 24 | template 25 | __device__ void load_local(const T* in, T* local, index size) { 26 | index idx = threadIdx.x; 27 | for (index i = 0; i < Config::basecase::local_size; ++i) { 28 | auto lidx = idx * Config::basecase::local_size + i; 29 | local[i] = lidx < size ? in[lidx] : max_helper::value; 30 | } 31 | } 32 | 33 | template 34 | __device__ void small_sort_warp(const T* in, T* sorted, index size) { 35 | index idx = threadIdx.x; 36 | // load data padded with sentinels 37 | T local[Config::basecase::local_size]; 38 | load_local(in, local, size); 39 | using sorter = bitonic_helper_warp; 40 | sorter::sort(local, false); 41 | for (index i = 0; i < Config::basecase::local_size; ++i) { 42 | auto lidx = idx * Config::basecase::local_size + i; 43 | sorted[lidx] = local[i]; 44 | } 45 | } 46 | 47 | template 48 | __device__ void small_sort(const T* in, T* sorted, index size) { 49 | // load data padded with sentinels 50 | T local[Config::basecase::local_size]; 51 | load_local(in, local, size); 52 | using sorter = bitonic_helper_global; 53 | sorter::sort(local, sorted, false); 54 | } 55 | 56 | template 57 | __global__ void select_bitonic_basecase(const T* in, index size, index rank, T* out) { 58 | __shared__ T data[Config::basecase::size]; 59 | index idx = threadIdx.x; 60 | if (size <= Config::basecase::local_size * warp_size) { 61 | if (idx >= warp_size) { 62 | return; 63 | } 64 | small_sort_warp(in, data, size); 65 | } else { 66 | small_sort(in, data, size); 67 | } 68 | __syncthreads(); 69 | // store result 70 | if (idx == 0) { 71 | *out = data[rank]; 72 | } 73 | } 74 | 75 | template 76 | __device__ void select_bitonic_multiple_basecase_impl(const T* in, index size, 77 | const index* ranks, index ranks_size, 78 | index rank_base, T* out) { 79 | __shared__ T data[Config::basecase::size]; 80 | index idx = threadIdx.x; 81 | if (size <= Config::basecase::local_size * warp_size) { 82 | if (idx >= warp_size) { 83 | return; 84 | } 85 | small_sort_warp(in, data, size); 86 | } else { 87 | small_sort(in, data, size); 88 | } 89 | __syncthreads(); 90 | for (index i = 0; i < Config::basecase::local_size; i++) { 91 | auto gi = idx * Config::basecase::local_size + i; 92 | if (gi < ranks_size) { 93 | auto pos = ranks[gi] - rank_base; 94 | out[gi] = data[pos]; 95 | } 96 | } 97 | } 98 | 99 | template 100 | __global__ void select_bitonic_multiple_basecase(const T* in, index size, 101 | const index* ranks, index ranks_size, 102 | index rank_base, T* out) { 103 | select_bitonic_multiple_basecase_impl(in, size, ranks, ranks_size, rank_base, out); 104 | } 105 | 106 | } // namespace kernels 107 | } // namespace gpu 108 | 109 | #endif // UTILS_BASECASE_CUH -------------------------------------------------------------------------------- /lib/utils_bytestorage.cuh: -------------------------------------------------------------------------------- 1 | /* 2 | * Parallel selection algorithm on GPUs 3 | * Copyright (c) 2018-2019 Tobias Ribizel (oss@ribizel.de) 4 | * 5 | * This program is free software: you can redistribute it and/or modify 6 | * it under the terms of the GNU General Public License as published by 7 | * the Free Software Foundation, version 3. 8 | * 9 | * This program is distributed in the hope that it will be useful, but 10 | * WITHOUT ANY WARRANTY; without even the implied warranty of 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU 12 | * General Public License for more details. 13 | * 14 | * You should have received a copy of the GNU General Public License 15 | * along with this program. If not, see . 16 | */ 17 | #ifndef UTILS_BYTESTORAGE_CUH 18 | #define UTILS_BYTESTORAGE_CUH 19 | 20 | #include "utils.cuh" 21 | 22 | namespace gpu { 23 | namespace kernels { 24 | 25 | __device__ inline void store_packed_bytes(poracle* output, mask amask, oracle byte, index idx) { 26 | // pack 4 consecutive bytes into a dword 27 | poracle result = byte; 28 | // ------00 -> ----1100 29 | result |= shfl_xor(amask, result, 1, 4) << 8; 30 | // ----1100 -> 33221100 31 | result |= shfl_xor(amask, result, 2, 4) << 16; 32 | if (idx % 4 == 0) { 33 | output[idx / 4] = result; 34 | } 35 | } 36 | 37 | __device__ inline oracle load_packed_bytes(const poracle* input, mask amask, index idx) { 38 | auto char_idx = idx % 4; 39 | auto pack_idx = idx / 4; 40 | poracle packed{}; 41 | // first thread in quartet loads the data 42 | if (char_idx == 0) { 43 | packed = input[pack_idx]; 44 | } 45 | // distribute the data onto all threads 46 | packed = shfl(amask, packed, (pack_idx * 4) % warp_size, 4); 47 | packed >>= char_idx * 8; 48 | packed &= 0xff; 49 | return packed; 50 | } 51 | 52 | } // namespace kernels 53 | } // namespace gpu 54 | 55 | #endif // UTILS_BYTESTORAGE_CUH -------------------------------------------------------------------------------- /lib/utils_mask.cuh: -------------------------------------------------------------------------------- 1 | /* 2 | * Parallel selection algorithm on GPUs 3 | * Copyright (c) 2018-2019 Tobias Ribizel (oss@ribizel.de) 4 | * 5 | * This program is free software: you can redistribute it and/or modify 6 | * it under the terms of the GNU General Public License as published by 7 | * the Free Software Foundation, version 3. 8 | * 9 | * This program is distributed in the hope that it will be useful, but 10 | * WITHOUT ANY WARRANTY; without even the implied warranty of 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 12 | * General Public License for more details. 13 | * 14 | * You should have received a copy of the GNU General Public License 15 | * along with this program. If not, see . 16 | */ 17 | #ifndef UTILS_MASK_CUH 18 | #define UTILS_MASK_CUH 19 | 20 | #include 21 | #include "utils_search.cuh" 22 | 23 | namespace gpu { 24 | namespace kernels { 25 | 26 | template 27 | __device__ int select_mask_local(index rank, mask m) { 28 | static_assert(mask_width <= 32, "mask too wide"); 29 | constexpr auto amask = ~mask{0} >> (32 - mask_width); 30 | auto masked_m = m << (31 - threadIdx.x); 31 | auto result = ballot(amask, __popc(masked_m) >= rank + 1); 32 | return __ffs(result) - 1; 33 | } 34 | 35 | template 36 | __device__ int select_mask(index rank, const mask* m) { 37 | constexpr auto mask_blocks = ceil_div(mask_width, 32); 38 | constexpr auto local_mask_size = mask_width >= 32 ? 
32 : mask_width; 39 | static_assert(mask_blocks <= 32, "mask too wide"); 40 | // we have few enough blocks so we can do this naively 41 | index count{}; 42 | index block{}; 43 | for (; block < mask_blocks; ++block) { 44 | auto partial = __popc(m[block]); 45 | if (rank >= count && rank < count + partial) { 46 | return select_mask_local(rank - count, m[block]) + block * 32; 47 | } 48 | count += partial; 49 | } 50 | // should never be reached 51 | return 0xDEADBEEF; 52 | } 53 | 54 | inline __device__ bool check_mask(index idx, const mask* m) { 55 | static_assert(sizeof(mask) * 8 == warp_size, "Mask and warp size inconsistent"); 56 | auto mask_block = idx / (sizeof(mask) * 8); 57 | auto mask_bit = mask(idx % (sizeof(mask) * 8)); 58 | auto masked_bit = mask(1) << mask_bit; 59 | return bool(m[mask_block] & masked_bit); 60 | } 61 | 62 | inline __device__ void compute_bucket_mask_impl(const index* ranks, index rank_count, index rank_base, const index* bucket_prefixsum, mask* bucket_mask, index* range_begins) { 63 | auto bucket = threadIdx.x; 64 | auto lb = bucket_prefixsum[bucket] + rank_base; 65 | auto ub = bucket_prefixsum[bucket + 1] + rank_base; 66 | auto lb_start = binary_search(ranks, rank_count, lb); 67 | auto ub_start = binary_search(ranks, rank_count, ub); 68 | auto local_mask = ballot(full_mask, lb_start != ub_start); 69 | if (bucket % warp_size == 0) { 70 | bucket_mask[bucket / warp_size] = local_mask; 71 | } 72 | range_begins[bucket] = lb_start; 73 | // this is a deliberate race condition, as both threads compute the same result :) 74 | range_begins[bucket + 1] = ub_start; 75 | } 76 | 77 | } // namespace kernels 78 | } // namespace gpu 79 | #endif // UTILS_MASK_CUH -------------------------------------------------------------------------------- /lib/utils_prefixsum.cuh: -------------------------------------------------------------------------------- 1 | /* 2 | * Parallel selection algorithm on GPUs 3 | * Copyright (c) 2018-2019 Tobias Ribizel (oss@ribizel.de) 4 | * 5 | * This program is free software: you can redistribute it and/or modify 6 | * it under the terms of the GNU General Public License as published by 7 | * the Free Software Foundation, version 3. 8 | * 9 | * This program is distributed in the hope that it will be useful, but 10 | * WITHOUT ANY WARRANTY; without even the implied warranty of 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 12 | * General Public License for more details. 13 | * 14 | * You should have received a copy of the GNU General Public License 15 | * along with this program. If not, see . 
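select_mask answers "which bucket holds the (rank + 1)-th set bit of the bucket mask": it scans the 32-bit mask words, and once the cumulative popcount passes the rank, resolves the final word — on the device via a warp ballot over inclusive prefix popcounts (select_mask_local), here via a plain bit loop. A sequential reference assuming C++20 <bit>; the helper name is illustrative.

#include <bit>
#include <cstdint>

int select_mask_reference(int rank, const std::uint32_t* m, int mask_blocks) {
    int count = 0;
    for (int block = 0; block < mask_blocks; ++block) {
        std::uint32_t w = m[block];
        int partial = std::popcount(w);
        if (rank < count + partial) {
            // the sought bit lies in this word: walk to the (rank + 1)-th set bit
            for (int bit = 0; bit < 32; ++bit) {
                if ((w >> bit) & 1u) {
                    if (count == rank) return block * 32 + bit;
                    ++count;
                }
            }
        }
        count += partial;
    }
    return -1;  // rank exceeds the number of set bits (0xDEADBEEF in the kernel)
}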
16 | */ 17 | #ifndef UTILS_PREFIXSUM_CUH 18 | #define UTILS_PREFIXSUM_CUH 19 | 20 | #include "utils.cuh" 21 | #include "utils_mask.cuh" 22 | 23 | namespace gpu { 24 | namespace kernels { 25 | 26 | template 27 | __device__ void small_prefix_sum_upward(index* data) { 28 | constexpr auto size = 1 << size_log2; 29 | auto idx = threadIdx.x; 30 | // upward phase: reduce 31 | // here we build an implicit reduction tree, overwriting values 32 | // the entry at the end of a power-of-two block stores the sum of this block 33 | // the block sizes are increased stepwise 34 | for (index blocksize = 2; blocksize <= size; blocksize *= 2) { 35 | index base_idx = idx * blocksize; 36 | __syncthreads(); 37 | if (base_idx < size) { 38 | data[base_idx + blocksize - 1] += data[base_idx + blocksize / 2 - 1]; 39 | } 40 | } 41 | } 42 | 43 | template 44 | __device__ void small_prefix_sum_downward(index* data) { 45 | constexpr auto size = 1 << size_log2; 46 | auto idx = threadIdx.x; 47 | // downward phase: build prefix sum 48 | // every right child stores the sum of its left sibling 49 | // every left child stores its own sum 50 | // thus we store zero at the root 51 | if (idx == 0) { 52 | data[size - 1] = 0; 53 | } 54 | for (auto blocksize = size; blocksize != 1; blocksize /= 2) { 55 | auto base_idx = idx * blocksize; 56 | __syncthreads(); 57 | if (base_idx < size) { 58 | // we preserve the invariant for the next level 59 | auto r = data[base_idx + blocksize - 1]; 60 | auto l = data[base_idx + blocksize / 2 - 1]; 61 | data[base_idx + blocksize / 2 - 1] = r; 62 | data[base_idx + blocksize - 1] = l + r; 63 | } 64 | } 65 | } 66 | 67 | template 68 | __device__ void small_prefix_sum(index* data) { 69 | small_prefix_sum_upward(data); 70 | __syncthreads(); 71 | small_prefix_sum_downward(data); 72 | } 73 | 74 | template 75 | __device__ void small_prefix_sum_sentinel(index* data) { 76 | auto size = 1 << size_log2; 77 | gpu::index tmp{}; 78 | if (threadIdx.x == size - 1) tmp = data[size - 1]; 79 | __syncthreads(); 80 | small_prefix_sum(data); 81 | __syncthreads(); 82 | // append sentinel 83 | if (threadIdx.x == size - 1) data[size] = data[size - 1] + tmp; 84 | } 85 | 86 | template 87 | __device__ void masked_prefix_sum(index* counts, const mask* m) { 88 | index bucket = threadIdx.x; 89 | constexpr auto size = 1 << size_log2; 90 | if (bucket < size && !check_mask(bucket, m)) { 91 | counts[bucket] = 0; 92 | } 93 | __syncthreads(); 94 | small_prefix_sum(counts); 95 | } 96 | 97 | template 98 | __device__ void masked_prefix_sum_sentinel(index* counts, const mask* m) { 99 | index bucket = threadIdx.x; 100 | constexpr auto size = 1 << size_log2; 101 | if (bucket < size && !check_mask(bucket, m)) { 102 | counts[bucket] = 0; 103 | } 104 | __syncthreads(); 105 | small_prefix_sum_sentinel(counts); 106 | } 107 | 108 | /* 109 | * Prefix sum selection 110 | */ 111 | template 112 | __device__ void prefix_sum_select(const index* counts, index rank, poracle* out_bucket, 113 | index* out_rank) { 114 | constexpr auto size = 1 << size_log2; 115 | // first compute prefix sum of counts 116 | auto idx = threadIdx.x; 117 | __shared__ index sums[size]; 118 | sums[2 * idx] = counts[2 * idx]; 119 | sums[2 * idx + 1] = counts[2 * idx + 1]; 120 | small_prefix_sum(sums); 121 | __syncthreads(); 122 | if (idx >= warp_size) { 123 | return; 124 | } 125 | // then determine which group of size step the element belongs to 126 | constexpr auto step = size / warp_size; 127 | static_assert(step <= warp_size, "need a third selection level"); 128 | auto mask = 
108 | /* 109 | * Prefix sum selection 110 | */ 111 | template <int size_log2> 112 | __device__ void prefix_sum_select(const index* counts, index rank, poracle* out_bucket, 113 | index* out_rank) { 114 | constexpr auto size = 1 << size_log2; 115 | // first compute prefix sum of counts 116 | auto idx = threadIdx.x; 117 | __shared__ index sums[size]; 118 | sums[2 * idx] = counts[2 * idx]; 119 | sums[2 * idx + 1] = counts[2 * idx + 1]; 120 | small_prefix_sum<size_log2>(sums); 121 | __syncthreads(); 122 | if (idx >= warp_size) { 123 | return; 124 | } 125 | // then determine which group of size step the element belongs to 126 | constexpr auto step = size / warp_size; 127 | static_assert(step <= warp_size, "need a third selection level"); 128 | auto mask = ballot(full_mask, sums[(warp_size - idx - 1) * step] > rank); 129 | if (idx >= step) { 130 | return; 131 | } 132 | auto group = __clz(mask) - 1; 133 | // finally determine which bucket within the group the element belongs to 134 | auto base_idx = step * group; 135 | constexpr auto cur_mask = ((1u << (step - 1)) << 1) - 1; 136 | mask = ballot(cur_mask, sums[base_idx + (step - idx - 1)] > rank); 137 | // here we need to subtract warp_size - step since we only use a subset of the warp 138 | if (idx == 0) { 139 | *out_bucket = __clz(mask) - 1 - (warp_size - step) + base_idx; 140 | *out_rank = rank - sums[*out_bucket]; 141 | } 142 | } 143 | 144 | } // namespace kernels 145 | } // namespace gpu 146 | 147 | #endif // UTILS_PREFIXSUM_CUH 148 | -------------------------------------------------------------------------------- /lib/utils_sampling.cuh: -------------------------------------------------------------------------------- 1 | /* 2 | * Parallel selection algorithm on GPUs 3 | * Copyright (c) 2018-2019 Tobias Ribizel (oss@ribizel.de) 4 | * 5 | * This program is free software: you can redistribute it and/or modify 6 | * it under the terms of the GNU General Public License as published by 7 | * the Free Software Foundation, version 3. 8 | * 9 | * This program is distributed in the hope that it will be useful, but 10 | * WITHOUT ANY WARRANTY; without even the implied warranty of 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 12 | * General Public License for more details. 13 | * 14 | * You should have received a copy of the GNU General Public License 15 | * along with this program. If not, see <https://www.gnu.org/licenses/>. 16 | */ 17 | #ifndef UTILS_SAMPLING_CUH 18 | #define UTILS_SAMPLING_CUH 19 | 20 | #include "utils.cuh" 21 | 22 | namespace gpu { 23 | namespace kernels { 24 | 25 | __device__ inline index uniform_pick_idx(index idx, index samplesize, index size) { 26 | auto stride = size / samplesize; 27 | if (stride == 0) { 28 | return idx * size / samplesize; 29 | } else { 30 | return idx * stride + stride / 2; 31 | } 32 | } 33 | 34 | __device__ inline index random_pick_idx(index idx, index samplesize, index size) { 35 | // TODO 36 | return uniform_pick_idx(idx, samplesize, size); 37 | } 38 | 39 | } // namespace kernels 40 | } // namespace gpu 41 | 42 | #endif // UTILS_SAMPLING_CUH --------------------------------------------------------------------------------
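For illustration: uniform_pick_idx spreads samplesize sample positions evenly over [0, size), centering each pick within its stride. A minimal host-side sketch (the name uniform_pick is hypothetical):

#include <cstdio>

int uniform_pick(int idx, int samplesize, int size) {
    int stride = size / samplesize;
    // for size < samplesize, fall back to a proportional index
    return stride == 0 ? idx * size / samplesize : idx * stride + stride / 2;
}

int main() {
    // size = 1000, samplesize = 10: stride = 100, picks 50 150 250 ... 950
    for (int i = 0; i < 10; ++i) std::printf("%d ", uniform_pick(i, 10, 1000));
}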
/lib/utils_search.cuh: -------------------------------------------------------------------------------- 1 | /* 2 | * Parallel selection algorithm on GPUs 3 | * Copyright (c) 2018-2019 Tobias Ribizel (oss@ribizel.de) 4 | * 5 | * This program is free software: you can redistribute it and/or modify 6 | * it under the terms of the GNU General Public License as published by 7 | * the Free Software Foundation, version 3. 8 | * 9 | * This program is distributed in the hope that it will be useful, but 10 | * WITHOUT ANY WARRANTY; without even the implied warranty of 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 12 | * General Public License for more details. 13 | * 14 | * You should have received a copy of the GNU General Public License 15 | * along with this program. If not, see <https://www.gnu.org/licenses/>. 16 | */ 17 | #ifndef UTILS_SEARCH_CUH 18 | #define UTILS_SEARCH_CUH 19 | 20 | #include "utils.cuh" 21 | 22 | namespace gpu { 23 | namespace kernels { 24 | 25 | // finds the position of the first element >= needle in the sorted haystack (lower bound) 26 | inline __device__ index binary_search(const index* haystack, index haystack_size, index needle) { 27 | auto range_begin = 0; 28 | auto range_size = haystack_size; 29 | while (range_size > 0) { 30 | auto half_size = range_size / 2; 31 | auto middle = range_begin + half_size; 32 | // if the middle is already a candidate: discard everything right of it 33 | auto go_left = haystack[middle] >= needle; 34 | range_begin = go_left ? range_begin : middle + 1; 35 | range_size = go_left ? half_size : (range_size - half_size - 1); 36 | } 37 | return range_begin; 38 | } 39 | 40 | } // namespace kernels 41 | } // namespace gpu 42 | 43 | #endif // UTILS_SEARCH_CUH -------------------------------------------------------------------------------- /lib/utils_warpaggr.cuh: -------------------------------------------------------------------------------- 1 | /* 2 | * Parallel selection algorithm on GPUs 3 | * Copyright (c) 2018-2019 Tobias Ribizel (oss@ribizel.de) 4 | * 5 | * This program is free software: you can redistribute it and/or modify 6 | * it under the terms of the GNU General Public License as published by 7 | * the Free Software Foundation, version 3. 8 | * 9 | * This program is distributed in the hope that it will be useful, but 10 | * WITHOUT ANY WARRANTY; without even the implied warranty of 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 12 | * General Public License for more details. 13 | * 14 | * You should have received a copy of the GNU General Public License 15 | * along with this program. If not, see <https://www.gnu.org/licenses/>. 16 | */ 17 | #ifndef UTILS_WARPAGGR_CUH 18 | #define UTILS_WARPAGGR_CUH 19 | 20 | #include "utils.cuh" 21 | 22 | namespace gpu { 23 | namespace kernels { 24 | 25 | __device__ inline bool is_group_leader(mask amask) { 26 | return (__ffs(amask) - 1) == (threadIdx.x % warp_size); 27 | } 28 | 29 | __device__ inline index prefix_popc(mask amask, index shift) { 30 | mask prefix_mask = (1u << shift) - 1; 31 | return __popc(amask & prefix_mask); 32 | } 33 | 34 | __device__ inline index warp_aggr_atomic_count_mask(index* atomic, mask amask, mask cmask) { 35 | auto lane_idx = threadIdx.x % warp_size; 36 | index ofs{}; 37 | if (lane_idx == 0) { 38 | ofs = atomicAdd(atomic, __popc(cmask)); 39 | } 40 | ofs = shfl(amask, ofs, 0); 41 | auto local_ofs = prefix_popc(cmask, lane_idx); 42 | return ofs + local_ofs; 43 | } 44 | 45 | __device__ inline index warp_aggr_atomic_count_predicate(index* atomic, mask amask, 46 | bool predicate) { 47 | auto mask = ballot(amask, predicate); 48 | return warp_aggr_atomic_count_mask(atomic, amask, mask); 49 | } 50 | 51 | } // namespace kernels 52 | } // namespace gpu 53 | 54 | #endif // UTILS_WARPAGGR_CUH --------------------------------------------------------------------------------
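The helpers above implement warp-aggregated atomics: instead of one atomicAdd per matching thread, only lane 0 of the warp reserves space for all matches at once, and each matching lane derives its slot from the prefix popcount of the ballot. A hypothetical stream-compaction kernel built on them (illustration only; it assumes placement inside namespace gpu, with index, mask, full_mask and the ballot wrapper from the headers above, and a grid made of complete warps):

// compact all elements satisfying pred into out, one atomicAdd per warp
template <typename T, typename Predicate>
__global__ void compact_if(const T* in, T* out, index* counter, index size, Predicate pred) {
    index idx = threadIdx.x + blockDim.x * blockIdx.x;
    bool keep = idx < size && pred(in[idx]);
    // every lane participates in the ballot; matching lanes get consecutive slots
    auto ofs = kernels::warp_aggr_atomic_count_predicate(counter, full_mask, keep);
    if (keep) {
        out[ofs] = in[idx];
    }
}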
/lib/utils_work.cuh: -------------------------------------------------------------------------------- 1 | /* 2 | * Parallel selection algorithm on GPUs 3 | * Copyright (c) 2018-2019 Tobias Ribizel (oss@ribizel.de) 4 | * 5 | * This program is free software: you can redistribute it and/or modify 6 | * it under the terms of the GNU General Public License as published by 7 | * the Free Software Foundation, version 3. 8 | * 9 | * This program is distributed in the hope that it will be useful, but 10 | * WITHOUT ANY WARRANTY; without even the implied warranty of 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 12 | * General Public License for more details. 13 | * 14 | * You should have received a copy of the GNU General Public License 15 | * along with this program. If not, see <https://www.gnu.org/licenses/>. 16 | */ 17 | #ifndef UTILS_WORK_CUH 18 | #define UTILS_WORK_CUH 19 | 20 | #include "utils.cuh" 21 | 22 | namespace gpu { 23 | namespace kernels { 24 | 25 | template <typename Config, typename F> 26 | __device__ void blockwise_work(index local_work, index size, F function) { 27 | auto stride = gridDim.x * blockDim.x; 28 | auto base_idx = threadIdx.x + blockDim.x * blockIdx.x; 29 | constexpr auto unroll = Config::algorithm::unroll; 30 | for (index i = 0; i < local_work; i += unroll) { 31 | // will any thread in the current iteration be without work? 32 | auto warp_last_idx = (base_idx / warp_size) * warp_size + warp_size - 1; 33 | if (warp_last_idx + (i + unroll - 1) * stride >= size) { 34 | // then the compiler cannot benefit from unrolling 35 | for (auto j = 0; j < unroll; ++j) { 36 | auto idx = base_idx + (i + j) * stride; 37 | auto amask = ballot(full_mask, idx < size); 38 | if (idx < size) { 39 | function(idx, amask); 40 | } 41 | } 42 | } else { 43 | // otherwise all predicates above will be true, so we can unroll 44 | #pragma unroll 45 | for (auto j = 0; j < unroll; ++j) { 46 | auto idx = base_idx + (i + j) * stride; 47 | function(idx, full_mask); 48 | } 49 | } 50 | } 51 | } 52 | template <typename Config, typename F> 53 | __device__ void blockwise_work_local_large(index local_work, index size, F function) { 54 | auto stride = blockDim.x; 55 | auto base_idx = threadIdx.x; 56 | constexpr auto unroll = Config::algorithm::unroll; 57 | for (index i = 0; i < local_work; i += unroll) { 58 | // will any thread in the current iteration be without work? 59 | auto warp_last_idx = (base_idx / warp_size) * warp_size + warp_size - 1; 60 | if (warp_last_idx + (i + unroll - 1) * stride >= size) { 61 | // then the compiler cannot benefit from unrolling 62 | for (auto j = 0; j < unroll; ++j) { 63 | auto idx = base_idx + (i + j) * stride; 64 | auto amask = ballot(full_mask, idx < size); 65 | if (idx < size) { 66 | function(idx, amask); 67 | } 68 | } 69 | } else { 70 | // otherwise all predicates above will be true, so we can unroll 71 | #pragma unroll 72 | for (auto j = 0; j < unroll; ++j) { 73 | auto idx = base_idx + (i + j) * stride; 74 | function(idx, full_mask); 75 | } 76 | } 77 | } 78 | } 79 | 80 | template <typename F> 81 | __device__ void blockwise_work_local(index size, F function) { 82 | for (index i = threadIdx.x; i < size; i += blockDim.x) { 83 | function(i); 84 | } 85 | } 86 | 87 | } // namespace kernels 88 | } // namespace gpu 89 | 90 | #endif // UTILS_WORK_CUH --------------------------------------------------------------------------------
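For illustration: blockwise_work distributes size elements over the whole grid in unrolled, grid-strided steps, handing each element index the warp's activity mask, which can be forwarded directly to the warp-aggregation helpers. A hypothetical kernel sketch (illustration only; Config is assumed to provide algorithm::unroll as in kernel_config.cuh, and local_work is the per-thread iteration count, roughly ceil(size / (grid size * unroll)), computed by the caller):

// count the negative entries of in[0..size)
template <typename Config>
__global__ void count_negative(const float* in, index* counter, index size, index local_work) {
    kernels::blockwise_work<Config>(local_work, size, [&](index idx, mask amask) {
        // amask marks the lanes of this warp that received a valid idx
        kernels::warp_aggr_atomic_count_predicate(counter, amask, in[idx] < 0);
    });
}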
/lib/verification.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Parallel selection algorithm on GPUs 3 | * Copyright (c) 2018-2019 Tobias Ribizel (oss@ribizel.de) 4 | * 5 | * This program is free software: you can redistribute it and/or modify 6 | * it under the terms of the GNU General Public License as published by 7 | * the Free Software Foundation, version 3. 8 | * 9 | * This program is distributed in the hope that it will be useful, but 10 | * WITHOUT ANY WARRANTY; without even the implied warranty of 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 12 | * General Public License for more details. 13 | * 14 | * You should have received a copy of the GNU General Public License 15 | * along with this program. If not, see <https://www.gnu.org/licenses/>. 16 | */ 17 | #include <verification.hpp> 18 | 19 | #include <algorithm> 20 | #include <limits> 21 | 22 | namespace verification { 23 | 24 | template <typename T> 25 | std::pair<int, int> count_mispartitioned(const std::vector<T>& data, int pivot_rank, T pivot) { 26 | int lcount{}; 27 | int rcount{}; 28 | for (int i = 0; i < pivot_rank; ++i) { 29 | if (data[i] >= pivot) { 30 | ++lcount; 31 | } 32 | } 33 | for (int i = pivot_rank; i < data.size(); ++i) { 34 | if (data[i] < pivot) { 35 | ++rcount; 36 | } 37 | } 38 | return {lcount, rcount}; 39 | } 40 | 41 | template std::pair<int, int> count_mispartitioned(const std::vector<float>&, int, float); 42 | template std::pair<int, int> count_mispartitioned(const std::vector<double>&, int, double); 43 | 44 | template <typename T> 45 | std::vector<T> nth_elements(const std::vector<T>& data, std::vector<int> ranks) { 46 | auto tmp = data; 47 | std::sort(tmp.begin(), tmp.end()); 48 | std::vector<T> result; 49 | for (auto el : ranks) { 50 | result.push_back(tmp[el]); 51 | } 52 | return result; 53 | } 54 | 55 | template <typename T> 56 | T nth_element(const std::vector<T>& data, int rank) { 57 | auto tmp = data; 58 | std::sort(tmp.begin(), tmp.end()); 59 | return tmp[rank]; 60 | } 61 | 62 | template float nth_element(const std::vector<float>&, int); 63 | template double nth_element(const std::vector<double>&, int); 64 | 65 | template std::vector<float> nth_elements(const std::vector<float>&, std::vector<int>); 66 | template std::vector<double> nth_elements(const std::vector<double>&, std::vector<int>); 67 | 68 | template <typename T> 69 | int count_not_in_bucket(const std::vector<T>& data, T lower, T upper) { 70 | return std::count_if(data.begin(), data.end(), 71 | [&](T val) { return val < lower || val >= upper; }); 72 | } 73 | 74 | template int count_not_in_bucket(const std::vector<float>&, float, float); 75 | template int count_not_in_bucket(const std::vector<double>&, double, double); 76 |
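/*
 * Worked example (illustration, not part of the original file):
 * count_mispartitioned checks the outcome of a partition step. For
 * data = [1 4 2 8 5 7], pivot_rank = 3 and pivot = 5, the left part [1 4 2]
 * contains no element >= 5 and the right part [8 5 7] contains no element < 5,
 * so the result is {0, 0}; any nonzero count flags a mispartitioned element.
 */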
77 | template <typename T> 78 | std::vector<int> count_not_in_buckets(const std::vector<T>& data, std::vector<index> prefix_sum, const std::vector<T>& searchtree) { 79 | auto splitter_count = prefix_sum.size() - 1; 80 | std::vector<int> result(splitter_count); 81 | for (index bucket = 0; bucket < splitter_count; ++bucket) { 82 | // we don't use the smallest splitter 83 | auto lower = bucket == 0 ? 0 : searchtree[bucket + splitter_count - 1]; 84 | // we don't store the sentinels 85 | auto upper = bucket == splitter_count - 1 ? std::numeric_limits<T>::max() : searchtree[bucket + splitter_count]; 86 | result[bucket] = std::count_if(data.begin() + prefix_sum[bucket], data.begin() + prefix_sum[bucket + 1], [&](T val) { 87 | return val < lower || val >= upper; 88 | }); 89 | } 90 | return result; 91 | } 92 | 93 | template std::vector<int> count_not_in_buckets(const std::vector<float>& data, std::vector<index> prefix_sum, const std::vector<float>& searchtree); 94 | template std::vector<int> count_not_in_buckets(const std::vector<double>& data, std::vector<index> prefix_sum, const std::vector<double>& searchtree); 95 | 96 | bool verify_rank_ranges(const std::vector<gpu::index>& ranks, const std::vector<gpu::index>& index_ranges, const std::vector<gpu::index>& rank_ranges) { 97 | auto searchtree_width = rank_ranges.size() - 1; 98 | if (!std::is_sorted(rank_ranges.begin(), rank_ranges.end())) return false; 99 | for (gpu::index i = 0; i < searchtree_width; ++i) { 100 | auto lb = index_ranges[i]; 101 | auto ub = index_ranges[i + 1]; 102 | for (auto j = rank_ranges[i]; j < rank_ranges[i + 1]; ++j) { 103 | if (ranks[j] < lb || ranks[j] >= ub) return false; 104 | } 105 | } 106 | return rank_ranges[0] == 0 && rank_ranges.back() == ranks.size(); 107 | } 108 | 109 | } // namespace verification 110 | --------------------------------------------------------------------------------
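For illustration: verify_rank_ranges accepts the bookkeeping exactly when the rank ranges are sorted, cover all queried ranks, and route every rank to the bucket whose index range contains it. A hypothetical host-side usage sketch (assuming verification.hpp and the gpu::index definition are available on the include path):

#include <cassert>
#include <vector>
#include <verification.hpp>

int main() {
    std::vector<gpu::index> ranks{2, 5, 7, 11};     // sorted query ranks
    std::vector<gpu::index> index_ranges{0, 6, 12}; // bucket 0 covers [0, 6), bucket 1 covers [6, 12)
    std::vector<gpu::index> rank_ranges{0, 2, 4};   // ranks[0..2) -> bucket 0, ranks[2..4) -> bucket 1
    assert(verification::verify_rank_ranges(ranks, index_ranges, rank_ranges));
}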