├── .gitignore
├── CMakeLists.txt
├── LICENSE
├── LICENSE-catch2
├── LICENSE-cub
├── README
├── app
│   ├── CMakeLists.txt
│   ├── benchmark.cu
│   ├── benchmark_main.cu
│   ├── benchmark_sort.cu
│   ├── catch.hpp
│   ├── test_basecase.cu
│   ├── test_fixture.cuh
│   ├── test_helpers.cu
│   ├── test_main.cu
│   ├── test_qs.cu
│   └── test_ssss.cu
├── include
│   ├── cpu_reference.hpp
│   ├── cuda_definitions.cuh
│   ├── cuda_error.cuh
│   ├── cuda_memory.cuh
│   ├── cuda_timer.cuh
│   ├── kernel_config.cuh
│   ├── launcher_fwd.cuh
│   └── verification.hpp
└── lib
    ├── CMakeLists.txt
    ├── cpu_reference.cpp
    ├── gen_instantiations.py
    ├── generated
    │   ├── gen-full.cu
    │   ├── gen0.cu
    │   ├── gen1.cu
    │   ├── gen10.cu
    │   ├── gen11.cu
    │   ├── gen12.cu
    │   ├── gen13.cu
    │   ├── gen14.cu
    │   ├── gen15.cu
    │   ├── gen16.cu
    │   ├── gen17.cu
    │   ├── gen18.cu
    │   ├── gen19.cu
    │   ├── gen2.cu
    │   ├── gen20.cu
    │   ├── gen21.cu
    │   ├── gen22.cu
    │   ├── gen23.cu
    │   ├── gen24.cu
    │   ├── gen25.cu
    │   ├── gen26.cu
    │   ├── gen27.cu
    │   ├── gen28.cu
    │   ├── gen29.cu
    │   ├── gen3.cu
    │   ├── gen30.cu
    │   ├── gen31.cu
    │   ├── gen32.cu
    │   ├── gen33.cu
    │   ├── gen34.cu
    │   ├── gen35.cu
    │   ├── gen36.cu
    │   ├── gen37.cu
    │   ├── gen38.cu
    │   ├── gen39.cu
    │   ├── gen4.cu
    │   ├── gen5.cu
    │   ├── gen6.cu
    │   ├── gen7.cu
    │   ├── gen8.cu
    │   └── gen9.cu
    ├── qs_launchers.cuh
    ├── qs_recursion.cuh
    ├── qs_recursion_multi.cuh
    ├── qs_reduce.cuh
    ├── qs_scan.cuh
    ├── ssss_build_searchtree.cuh
    ├── ssss_collect.cuh
    ├── ssss_collect_multi.cuh
    ├── ssss_count.cuh
    ├── ssss_launchers.cuh
    ├── ssss_merged.cuh
    ├── ssss_merged_memory.cuh
    ├── ssss_recursion.cuh
    ├── ssss_recursion_multi.cuh
    ├── ssss_reduce.cuh
    ├── utils.cuh
    ├── utils_basecase.cuh
    ├── utils_bytestorage.cuh
    ├── utils_mask.cuh
    ├── utils_prefixsum.cuh
    ├── utils_sampling.cuh
    ├── utils_search.cuh
    ├── utils_sort.cuh
    ├── utils_warpaggr.cuh
    ├── utils_work.cuh
    └── verification.cpp
/.gitignore:
--------------------------------------------------------------------------------
1 | build
2 |
--------------------------------------------------------------------------------
/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | cmake_minimum_required(VERSION 3.8 FATAL_ERROR)
2 | project(gpu_selection LANGUAGES CXX CUDA)
3 |
4 | list(APPEND CMAKE_CUDA_FLAGS "-arch=sm_35 -rdc=true --maxrregcount 64 -gencode=arch=compute_35,code=sm_35 -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_52,code=sm_52 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_70,code=compute_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_80,code=compute_80")
5 | add_subdirectory(lib)
6 | add_subdirectory(app)
7 |
--------------------------------------------------------------------------------
/LICENSE-catch2:
--------------------------------------------------------------------------------
1 | Boost Software License - Version 1.0 - August 17th, 2003
2 |
3 | Permission is hereby granted, free of charge, to any person or organization
4 | obtaining a copy of the software and accompanying documentation covered by
5 | this license (the "Software") to use, reproduce, display, distribute,
6 | execute, and transmit the Software, and to prepare derivative works of the
7 | Software, and to permit third-parties to whom the Software is furnished to
8 | do so, all subject to the following:
9 |
10 | The copyright notices in the Software and this entire statement, including
11 | the above license grant, this restriction and the following disclaimer,
12 | must be included in all copies of the Software, in whole or in part, and
13 | all derivative works of the Software, unless such copies or derivative
14 | works are solely in the form of machine-executable object code generated by
15 | a source language processor.
16 |
17 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 | FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
20 | SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
21 | FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
22 | ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
23 | DEALINGS IN THE SOFTWARE.
24 |
--------------------------------------------------------------------------------
/LICENSE-cub:
--------------------------------------------------------------------------------
1 | Copyright (c) 2010-2011, Duane Merrill. All rights reserved.
2 | Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
3 |
4 | Redistribution and use in source and binary forms, with or without
5 | modification, are permitted provided that the following conditions are met:
6 | * Redistributions of source code must retain the above copyright
7 | notice, this list of conditions and the following disclaimer.
8 | * Redistributions in binary form must reproduce the above copyright
9 | notice, this list of conditions and the following disclaimer in the
10 | documentation and/or other materials provided with the distribution.
11 | * Neither the name of the NVIDIA CORPORATION nor the
12 | names of its contributors may be used to endorse or promote products
13 | derived from this software without specific prior written permission.
14 |
15 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
16 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
17 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
18 | DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
19 | DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
20 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
21 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
22 | ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
24 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
--------------------------------------------------------------------------------
/README:
--------------------------------------------------------------------------------
 1 | This library implements a bucket-based selection algorithm on GPUs.
2 |
3 | More details can be found in
4 |
5 | * T. Ribizel and H. Anzt, "Approximate and Exact Selection on GPUs," 2019 IEEE International Parallel and Distributed Processing Symposium Workshops (IPDPSW), Rio de Janeiro, Brazil, 2019, pp. 471-478.
6 | doi: 10.1109/IPDPSW.2019.00088
7 | * T. Ribizel, H. Anzt, "Parallel selection on GPUs," Parallel Computing, Volume 91, 2020, doi: 10.1016/j.parco.2019.102588
8 |
9 | It uses Catch2 as a test framework and the CUB library as a reference implementation for sorting.
10 |
11 | The tests can be run by executing `app/unittest`; the benchmarks can be run by executing `app/benchmark` with one of the following parameters:
12 |
13 | [full] The full benchmark for exact single and multiple selection and the individual kernels (sample, count, reduce, filter)
14 | [full-multionly] The full benchmark for multiple selection only
15 | [approx] The full benchmark for approximate selection with shared-memory atomics
16 | [approx-g] The full benchmark for approximate selection with global-memory atomics
17 | [multi] The full benchmark for multiple selection with different numbers of ranks
18 | [test] A small benchmark that only executes a single benchmark with small input size
19 |
20 | The output of these tests is as follows:
21 | On stdout, they print error messages if the algorithm execution produces invalid results. For the approx tests, the exact and approximate ranks are additionally output in CSV format.
22 | On stderr, they print the individual kernel timings in CSV format for different input sizes, with the input size given by the first CSV field. Runtime breakdowns are listed within parentheses ().
23 |
24 | `app/benchmark-sort` contains a benchmark for the CUB radix sort implementation as a performance baseline for the multiple selection.
25 |
26 | Structure of the project
27 |
28 | include/cpu_reference.hpp - Reference implementations for testing
29 | include/verification.hpp - Validation functions for testing
30 | include/cuda_definitions.cuh - Type definitions and hardware limits
31 | include/cuda_error.cuh - Wrapper for CUDA error handling
32 | include/cuda_memory.cuh - Wrapper for CUDA memory allocations
33 | include/cuda_timer.cuh - Wrapper for CUDA timing measurements
34 | include/kernel_config.cuh - Configuration struct for kernel templates
35 | include/launcher_fwd.cuh - Forward-declarations of launcher and kernel templates
36 |
37 | lib/generated/* - Explicit template instantiations to parallelize compilation
38 | lib/cpu_reference.cpp - Reference implementations for testing
39 | lib/verification.cpp - Validation functions for testing
40 | lib/qs_launchers.cuh - Wrappers for quickselect kernels
41 | lib/qs_recursion.cuh - Kernels for quickselect single-selection
42 | lib/qs_recursion_multi.cuh - Kernels for quickselect multi-selection
43 | lib/qs_reduce.cuh - Kernels for reducing quickselect partial sums
44 | lib/qs_scan.cuh - Kernels for quickselect bipartitioning
45 | lib/ssss_build_searchtree.cuh - Kernels for sampleselect sampling
46 | lib/ssss_collect.cuh - Kernels for sampleselect single-selection filtering
47 | lib/ssss_collect_multi.cuh - Kernels for sampleselect multi-selection filtering
48 | lib/ssss_count.cuh - Kernels for sampleselect counting
49 | lib/ssss_launchers.cuh - Wrappers for sampleselect kernels
50 | lib/ssss_merged.cuh - Kernels for multiple simultaneous sampleselects
51 | lib/ssss_merged_memory.cuh - Auxiliary data structure for sampleselect multi-selection
52 | lib/ssss_recursion.cuh - Kernels for sampleselect single-selection
53 | lib/ssss_recursion_multi.cuh - Kernels for sampleselect multi-selection
54 | lib/ssss_reduce.cuh - Kernels for reducing sampleselect partial sums
55 | lib/utils_basecase.cuh - Kernels for recursion basecase
56 | lib/utils_bytestorage.cuh - Auxiliary functions for reading/writing unaligned bytes
57 | lib/utils_mask.cuh - Auxiliary functions for bitmasks
58 | lib/utils_prefixsum.cuh - Auxiliary functions for tree-based partial sums
59 | lib/utils_sampling.cuh - Auxiliary functions for sampling
60 | lib/utils_search.cuh - Auxiliary functions for binary and warp-ary searches
61 | lib/utils_sort.cuh - Auxiliary functions for bitonic sorting
62 | lib/utils_warpaggr.cuh - Auxiliary functions for warp-aggregation
63 | lib/utils_work.cuh - Auxiliary functions for work-distribution
64 | lib/utils.cuh - Auxiliary wrappers for basic operations
65 |
--------------------------------------------------------------------------------
/app/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | add_executable(benchmark benchmark_main.cu benchmark.cu)
2 | add_executable(benchmark-sort benchmark_main.cu benchmark_sort.cu)
3 | target_include_directories(benchmark-sort PRIVATE ../include ../lib)
4 |
5 | add_executable(unittest test_main.cu test_qs.cu test_ssss.cu test_helpers.cu
6 | # test_basecase.cu
7 | )
8 |
9 | target_link_libraries(benchmark gpu_selection)
10 |
11 | target_link_libraries(unittest gpu_selection)
12 |
13 | set_target_properties(unittest PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
14 | set_target_properties(benchmark PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
15 | set_target_properties(benchmark-sort PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
16 |
--------------------------------------------------------------------------------
/app/benchmark_main.cu:
--------------------------------------------------------------------------------
1 | #define CATCH_CONFIG_MAIN
2 | #include "catch.hpp"
--------------------------------------------------------------------------------
/app/benchmark_sort.cu:
--------------------------------------------------------------------------------
1 | /*
2 | * Parallel selection algorithm on GPUs
3 | * Copyright (c) 2018-2019 Tobias Ribizel (oss@ribizel.de)
4 | *
5 | * This program is free software: you can redistribute it and/or modify
6 | * it under the terms of the GNU General Public License as published by
7 | * the Free Software Foundation, version 3.
8 | *
9 | * This program is distributed in the hope that it will be useful, but
10 | * WITHOUT ANY WARRANTY; without even the implied warranty of
11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 | * General Public License for more details.
13 | *
14 | * You should have received a copy of the GNU General Public License
15 | * along with this program. If not, see <https://www.gnu.org/licenses/>.
16 | */
17 | #include "catch.hpp"
18 | #include "test_fixture.cuh"
19 | #include
20 | #include
21 | #include
22 |
23 | namespace gpu {
24 |
25 | constexpr auto num_runs = 10;
26 |
27 | template
28 | void cub_sort(std::string name, index n, index d, basic_test_data>& data, cuda_timer& timer) {
29 | cub::DoubleBuffer keys{static_cast(data.gpu_data), static_cast(data.gpu_data_out)};
30 | timer.timed(name, num_runs, [&](auto event) {
31 | data.reset();
32 | auto tmp_size = sizeof(T) * n;
33 | event(0);
34 | cub::DeviceRadixSort::SortKeys(static_cast(data.gpu_data_tmp), tmp_size, keys, n);
35 | event(1);
36 | });
37 | auto sorted = data.data;
38 | auto ref = sorted;
39 | cudaCheckError(cudaMemcpy(ref.data(), keys.Current(), n * sizeof(T), cudaMemcpyDeviceToHost));
40 | std::sort(sorted.begin(), sorted.end());
41 | bool is_sorted = sorted == ref;
42 | CHECK(is_sorted);
43 | }
44 |
45 | TEMPLATE_TEST_CASE("sort", "", float, double) {
46 | using T = TestType;
47 | auto n = GENERATE(as{}, 65536, 262144, 524288, 1048576, 2097152, 4194304, 8388608,
48 | 16777216, 33554432, 67108864, 134217728);
49 | auto d = GENERATE(as{}, 1 << 30);
50 | auto seed = GENERATE(take(10, Catch::Generators::random(0, 1000000)));
51 | basic_test_data> data{n, d, index(seed)};
52 | CAPTURE(n);
53 | CAPTURE(d);
54 | CAPTURE(seed);
55 | cuda_timer timer{std::cerr};
56 | auto suffix = "-" + std::to_string(n) + "-" + std::to_string(d) + "-" + typeid(T).name();
57 | // thrust_sort("thrust_sort" + suffix, n, d, data, timer);
58 | cub_sort("cub_sort" + suffix, n, d, data, timer);
59 | }
60 |
61 | } // namespace gpu
62 |
--------------------------------------------------------------------------------
/app/test_basecase.cu:
--------------------------------------------------------------------------------
1 | /*
2 | * Parallel selection algorithm on GPUs
3 | * Copyright (c) 2018-2019 Tobias Ribizel (oss@ribizel.de)
4 | *
5 | * This program is free software: you can redistribute it and/or modify
6 | * it under the terms of the GNU General Public License as published by
7 | * the Free Software Foundation, version 3.
8 | *
9 | * This program is distributed in the hope that it will be useful, but
10 | * WITHOUT ANY WARRANTY; without even the implied warranty of
11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 | * General Public License for more details.
13 | *
14 | * You should have received a copy of the GNU General Public License
15 | * along with this program. If not, see <https://www.gnu.org/licenses/>.
16 | */
17 | #include "catch.hpp"
18 | #include "../lib/utils_basecase.cuh"
19 | #include "test_fixture.cuh"
20 | #include
21 | #include
22 | #include
23 |
24 | namespace gpu {
25 |
26 | template
27 | struct extended_pair {
28 | constexpr static int size = Size;
29 | constexpr static int replsize = Replsize;
30 | using first_type = A;
31 | using second_type = B;
32 | };
33 |
34 | template
35 | using test_data = basic_test_data;
36 |
37 | template
38 | using float_pair = extended_pair;
39 |
40 | TEMPLATE_PRODUCT_TEST_CASE_METHOD(test_data, "basecase", "[basecase]",
41 | (float_pair),
42 | ((select_config<10, 10, 8, true, true, false, 8, 10, 10, false, 8, 0>),
43 | (select_config<10, 10, 8, true, true, false, 8, 10, 10, false, 8, 1>),
44 | (select_config<10, 10, 8, true, true, false, 8, 10, 10, false, 8, 2>),
45 | (select_config<12, 10, 8, true, true, false, 8, 10, 10, false, 8, 2>))) {
46 | using T = typename TestType::first_type;
47 | using Config = typename TestType::second_type;
48 | std::vector ranks(1);
49 | constexpr auto basecase_size = Config::basecase::size;
50 | constexpr auto local_size = Config::basecase::local_size;
51 | constexpr auto cur_launch_size = Config::basecase::launch_size;
52 | auto size = GENERATE(as{}, basecase_size, basecase_size / 5, warp_size * local_size, warp_size * local_size / 5);
53 | auto launch_size = GENERATE(as{}, cur_launch_size, max_block_size);
54 | std::string mode;
55 | SECTION("some ranks") {
56 | mode = "some ranks";
57 | ranks.resize(std::min(100, size / 2));
58 | for (auto i = 0; i < ranks.size(); ++i) {
59 | ranks[i] = i * size / ranks.size();
60 | }
61 | }
62 | SECTION("all ranks") {
63 | mode = "all ranks";
64 | ranks.resize(size);
65 | std::iota(ranks.begin(), ranks.end(), 0);
66 | }
67 | CAPTURE(size);
68 | CAPTURE(launch_size);
69 | CAPTURE(mode);
70 | this->gpu_ranks.copy_from(ranks);
71 | this->run([&]() { kernels::select_bitonic_basecase<<<1, launch_size>>>(this->gpu_data, size, ranks.back(), this->gpu_data_out); });
72 | std::vector result;
73 | this->gpu_data_out.copy_to(result);
74 | auto data = this->data;
75 | data.resize(size);
76 | std::sort(data.begin(), data.end());
77 | CHECK(data[ranks.back()] == result[0]);
78 | this->run([&]() { kernels::select_bitonic_multiple_basecase<<<1, launch_size>>>(this->gpu_data, size, this->gpu_ranks, ranks.size(), 0, this->gpu_data_out); });
79 | this->gpu_data_out.copy_to(result);
80 | index count{};
81 | for (auto i = 0; i < ranks.size(); ++i) {
82 | count += result[i] != data[ranks[i]];
83 | }
84 | CHECK(count == 0);
85 | }
86 |
87 | } // namespace gpu
--------------------------------------------------------------------------------
/app/test_fixture.cuh:
--------------------------------------------------------------------------------
1 | /*
2 | * Parallel selection algorithm on GPUs
3 | * Copyright (c) 2018-2019 Tobias Ribizel (oss@ribizel.de)
4 | *
5 | * This program is free software: you can redistribute it and/or modify
6 | * it under the terms of the GNU General Public License as published by
7 | * the Free Software Foundation, version 3.
8 | *
9 | * This program is distributed in the hope that it will be useful, but
10 | * WITHOUT ANY WARRANTY; without even the implied warranty of
11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 | * General Public License for more details.
13 | *
14 | * You should have received a copy of the GNU General Public License
15 | * along with this program. If not, see <https://www.gnu.org/licenses/>.
16 | */
17 | #include
18 | #include
19 | #include
20 | #include
21 | #include
22 | #include
23 | #include
24 | #include
25 |
26 | namespace gpu {
27 |
28 | constexpr auto max_tree_width = 4096;
29 | constexpr auto max_tree_size = 2 * max_tree_width * 2;
30 | constexpr auto max_block_count = 1024;
31 |
32 | template
33 | struct basic_test_data {
34 | using T = typename Pair::first_type;
35 | using Config = typename Pair::second_type;
36 | index size;
37 | std::vector data;
38 | std::vector tree;
39 | std::vector data_out;
40 | std::vector oracles;
41 | std::vector count_out;
42 | std::vector atomic;
43 | std::vector zeros;
44 | std::vector ranks;
45 | std::vector bucket_mask;
46 | cuda_resettable_array gpu_data;
47 | cuda_array gpu_data_tmp;
48 | cuda_array gpu_tree;
49 | cuda_array gpu_data_out;
50 | cuda_array gpu_oracles;
51 | cuda_array gpu_aux;
52 | cuda_resettable_array gpu_atomic;
53 | cuda_array gpu_count_tmp;
54 | cuda_resettable_array gpu_count_out;
55 | cuda_array gpu_bucket_ranges;
56 | cuda_array gpu_rank_ranges;
57 | cuda_array gpu_ranks;
58 | cuda_array gpu_bucket_mask;
59 | index rank;
60 | T pivot;
61 |
62 | basic_test_data(index size = Size, index valsize = Valsize, index seed = 0)
63 | : size{size}, data(size), tree(max_tree_size), oracles(size), count_out(max_block_count * 2 + 2),
64 | zeros(size + max_block_count * max_tree_width * 16), ranks(287), bucket_mask(max_tree_size / (sizeof(mask) * 8)),
65 | atomic(max_tree_width) {
66 | std::default_random_engine random(seed);
67 | std::uniform_int_distribution dist(0, valsize - 1);
68 | std::uniform_int_distribution idist(0, size - 1);
69 | std::uniform_int_distribution maskdist(mask(0), ~mask(0));
70 | std::vector smallzeros(max_tree_size);
71 | for (auto& el : data) {
72 | el = dist(random);
73 | }
74 | for (auto& el : ranks) {
75 | el = idist(random);
76 | }
77 | ranks.back() = size - 1;
78 | std::sort(ranks.begin(), ranks.end());
79 | rank = idist(random);
80 | pivot = data[rank];
81 | gpu_data.copy_from(data);
82 | gpu_tree.copy_from(tree);
83 | gpu_data_tmp.copy_from(data);
84 | gpu_data_out.copy_from(data);
85 | gpu_atomic.copy_from(atomic);
86 | gpu_count_tmp.copy_from(zeros);
87 | gpu_aux.copy_from(zeros);
88 | gpu_count_out.copy_from(count_out);
89 | gpu_oracles.copy_from(oracles);
90 | gpu_bucket_ranges.copy_from(smallzeros);
91 | gpu_rank_ranges.copy_from(smallzeros);
92 | gpu_ranks.copy_from(ranks);
93 | gpu_bucket_mask.copy_from(bucket_mask);
94 | }
95 |
96 | void reset() {
97 | gpu_data.reset();
98 | gpu_atomic.reset();
99 | gpu_count_out.reset();
100 | }
101 |
102 | void copy_from_gpu() {
103 | gpu_data_out.copy_to(data_out);
104 | gpu_count_out.copy_to(count_out);
105 | gpu_tree.copy_to(tree);
106 | gpu_oracles.copy_to(oracles);
107 | gpu_atomic.copy_to(atomic);
108 | }
109 |
110 | template
111 | void run(F f) {
112 | cudaChecked(f);
113 | copy_from_gpu();
114 | }
115 | };
116 |
117 | inline std::vector unpack(const std::vector& in, int size) {
118 | using uc = unsigned char;
119 | std::vector result;
120 | result.reserve(in.size() * 4);
121 | for (auto el : in) {
122 | result.insert(result.end(), {uc(el), uc(el >> 8), uc(el >> 16), uc(el >> 24)});
123 | }
124 | result.resize(size);
125 | return result;
126 | }
127 |
128 | inline std::vector build_ranks_uniform(index size, index count) {
129 | std::vector result;
130 | for (index i = 0; i < count; ++i) {
131 | result.push_back(int(double(i) * size / count));
132 | }
133 | return result;
134 | }
135 |
136 | inline std::vector build_ranks_clustered(index size) {
137 | std::vector result;
138 | auto step = size / 2;
139 | while (step >= 1) {
140 | result.push_back(step);
141 | step = step / 2;
142 | }
143 | std::reverse(result.begin(), result.end());
144 | return result;
145 | }
146 |
147 | } // namespace gpu
--------------------------------------------------------------------------------
/app/test_main.cu:
--------------------------------------------------------------------------------
1 | #define CATCH_CONFIG_MAIN
2 | #include "catch.hpp"
3 |
--------------------------------------------------------------------------------
/app/test_qs.cu:
--------------------------------------------------------------------------------
1 | /*
2 | * Parallel selection algorithm on GPUs
3 | * Copyright (c) 2018-2019 Tobias Ribizel (oss@ribizel.de)
4 | *
5 | * This program is free software: you can redistribute it and/or modify
6 | * it under the terms of the GNU General Public License as published by
7 | * the Free Software Foundation, version 3.
8 | *
9 | * This program is distributed in the hope that it will be useful, but
10 | * WITHOUT ANY WARRANTY; without even the implied warranty of
11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 | * General Public License for more details.
13 | *
14 | * You should have received a copy of the GNU General Public License
15 | * along with this program. If not, see <https://www.gnu.org/licenses/>.
16 | */
17 | #include "catch.hpp"
18 | #include "test_fixture.cuh"
19 |
20 | #include
21 | #include
22 | #include
23 | #include
24 |
25 | namespace gpu {
26 |
27 | template
28 | using test_data = basic_test_data;
29 |
30 | template
31 | using float_pair = typename std::pair;
32 |
33 | template
34 | using double_pair = typename std::pair;
35 |
36 | TEMPLATE_PRODUCT_TEST_CASE_METHOD(test_data, "bipartition", "[quickselect]",
37 | (float_pair, double_pair),
38 | ((select_config<10, 5, 8, true, true, true, 8, 10, 10>),
39 | (select_config<10, 5, 8, false, true, true, 8, 10, 10>))) {
40 | using T = typename TestType::first_type;
41 | using Config = typename TestType::second_type;
42 | this->run([&]() {
43 | partition(this->gpu_data, this->gpu_data_out, this->gpu_count_out, this->size,
44 | this->pivot);
45 | });
46 | auto lsize = this->count_out[0];
47 | auto rsize = this->count_out[1];
48 | CHECK(lsize + rsize == this->size);
49 | auto counts = verification::count_mispartitioned(this->data_out, lsize, this->pivot);
50 | auto lcount = counts.first;
51 | auto rcount = counts.second;
52 | CHECK(lcount == 0);
53 | CHECK(rcount == 0);
54 | }
55 |
56 | TEMPLATE_PRODUCT_TEST_CASE_METHOD(test_data, "quickselect", "[quickselect]",
57 | (float_pair, double_pair),
58 | ((select_config<10, 5, 8, true, true, true, 8, 10, 10>),
59 | (select_config<10, 5, 8, false, true, true, 8, 10, 10>))) {
60 | using T = typename TestType::first_type;
61 | using Config = typename TestType::second_type;
62 | this->run([&]() {
63 | quickselect(this->gpu_data, this->gpu_data_tmp, this->gpu_count_tmp, this->size, this->rank,
64 | this->gpu_data_out);
65 | });
66 | auto ref = verification::nth_element(this->data, this->rank);
67 | CHECK(ref == this->data_out[0]);
68 | }
69 |
70 | TEMPLATE_PRODUCT_TEST_CASE_METHOD(test_data, "quickselect_multi", "[quickselect]",
71 | (float_pair, double_pair),
72 | ((select_config<10, 5, 8, true, true, true, 8, 10, 10>),
73 | (select_config<10, 5, 8, false, true, true, 8, 10, 10>))) {
74 | using T = typename TestType::first_type;
75 | using Config = typename TestType::second_type;
76 | std::vector ranks;
77 | SECTION("some ranks") {
78 | for (int i = 0; i < 100; ++i) {
79 | ranks.push_back(this->size * i / 120);
80 | ranks.push_back(this->size * i / 120 + 1);
81 | ranks.push_back(this->size * i / 120 + 2);
82 | ranks.push_back(this->size * i / 120 + 10);
83 | }
84 | for (int i = 0; i < 6000; ++i) {
85 | ranks.push_back(i + 4000);
86 | }
87 | std::sort(ranks.begin(), ranks.end());
88 | ranks.erase(std::unique(ranks.begin(), ranks.end()), ranks.end());
89 | }
90 | SECTION("all ranks") {
91 | ranks.resize(this->size);
92 | std::iota(ranks.begin(), ranks.end(), 0);
93 | }
94 | std::vector result(ranks.size());
95 | this->gpu_ranks.copy_from(ranks);
96 | this->gpu_data_out.copy_from(result);
97 | this->run([&]() {
98 | quickselect_multi(this->gpu_data, this->gpu_data_tmp, this->gpu_count_tmp, this->size, this->gpu_ranks, ranks.size(),
99 | this->gpu_data_out);
100 | });
101 | auto ref = this->data;
102 | std::sort(ref.begin(), ref.end());
103 | this->gpu_data_out.copy_to(result);
104 | std::vector reference;
105 | for (auto rank : ranks) {
106 | reference.push_back(ref[rank]);
107 | }
108 | int count{};
109 | for (index i = 0; i < reference.size(); ++i) {
110 | count += reference[i] != result[i];
111 | }
112 | CAPTURE(reference.size());
113 | CHECK(count == 0);
114 | }
115 |
116 | } // namespace gpu
--------------------------------------------------------------------------------
/include/cpu_reference.hpp:
--------------------------------------------------------------------------------
1 | /*
2 | * Parallel selection algorithm on GPUs
3 | * Copyright (c) 2018-2019 Tobias Ribizel (oss@ribizel.de)
4 | *
5 | * This program is free software: you can redistribute it and/or modify
6 | * it under the terms of the GNU General Public License as published by
7 | * the Free Software Foundation, version 3.
8 | *
9 | * This program is distributed in the hope that it will be useful, but
10 | * WITHOUT ANY WARRANTY; without even the implied warranty of
11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 | * General Public License for more details.
13 | *
14 | * You should have received a copy of the GNU General Public License
15 | * along with this program. If not, see <https://www.gnu.org/licenses/>.
16 | */
17 | #ifndef GPU_SELECTION_CPU_REFERENCE_HPP
18 | #define GPU_SELECTION_CPU_REFERENCE_HPP
19 |
20 | #include <cuda_definitions.cuh>
21 | #include <vector>
22 |
23 | namespace cpu {
24 |
25 | using gpu::index;
26 | using gpu::mask;
27 |
28 | template
29 | std::pair partition(const std::vector& data, int begin, int end, std::vector& out,
30 | int pivot_idx);
31 |
32 | template
33 | T quickselect(std::vector& in, std::vector& out, int rank);
34 |
35 | template
36 | std::vector build_searchtree(const std::vector& in, int sample_size, int searchtree_size);
37 |
38 | template
39 | std::pair, std::vector> ssss(const std::vector& data,
40 | const std::vector& tree, bool write);
41 |
42 | std::vector grouped_reduce(const std::vector& data, int searchtree_size);
43 | std::vector grouped_prefix_sum(const std::vector& data, int searchtree_size);
44 |
45 | std::vector compute_rank_ranges(std::vector counts, const std::vector& ranks);
46 | std::vector compute_bucket_mask(const std::vector& rank_ranges);
47 |
48 | std::pair, index> masked_prefix_sum(const std::vector& counts, const std::vector& m);
49 |
50 | } // namespace cpu
51 |
52 | #endif // GPU_SELECTION_CPU_REFERENCE_HPP
53 |
--------------------------------------------------------------------------------
/include/cuda_definitions.cuh:
--------------------------------------------------------------------------------
1 | /*
2 | * Parallel selection algorithm on GPUs
3 | * Copyright (c) 2018-2019 Tobias Ribizel (oss@ribizel.de)
4 | *
5 | * This program is free software: you can redistribute it and/or modify
6 | * it under the terms of the GNU General Public License as published by
7 | * the Free Software Foundation, version 3.
8 | *
9 | * This program is distributed in the hope that it will be useful, but
10 | * WITHOUT ANY WARRANTY; without even the implied warranty of
11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 | * General Public License for more details.
13 | *
14 | * You should have received a copy of the GNU General Public License
15 | * along with this program. If not, see <https://www.gnu.org/licenses/>.
16 | */
17 | #ifndef GPU_SELECTION_CUDA_DEFINITIONS_CUH
18 | #define GPU_SELECTION_CUDA_DEFINITIONS_CUH
19 |
20 | #include <cstdint>
21 |
22 | namespace gpu {
23 |
24 | using index = std::uint32_t;
25 | using poracle = std::uint32_t;
26 | using oracle = std::uint32_t;
27 | using mask = std::uint32_t;
28 |
29 | constexpr index warp_size_log2 = 5;
30 | constexpr index warp_size = 1 << warp_size_log2;
31 | constexpr index max_block_size_log2 = 10;
32 | constexpr index max_block_size = 1 << max_block_size_log2;
33 |
34 | } // namespace gpu
35 |
36 | #endif // GPU_SELECTION_CUDA_DEFINITIONS_CUH
37 |
--------------------------------------------------------------------------------
/include/cuda_error.cuh:
--------------------------------------------------------------------------------
1 | /*
2 | * Parallel selection algorithm on GPUs
3 | * Copyright (c) 2018-2019 Tobias Ribizel (oss@ribizel.de)
4 | *
5 | * This program is free software: you can redistribute it and/or modify
6 | * it under the terms of the GNU General Public License as published by
7 | * the Free Software Foundation, version 3.
8 | *
9 | * This program is distributed in the hope that it will be useful, but
10 | * WITHOUT ANY WARRANTY; without even the implied warranty of
11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 | * General Public License for more details.
13 | *
14 | * You should have received a copy of the GNU General Public License
15 | * along with this program. If not, see <https://www.gnu.org/licenses/>.
16 | */
17 | #ifndef CUDA_CHECK_ERROR_CUH
18 | #define CUDA_CHECK_ERROR_CUH
19 |
20 | #include <stdexcept>
21 |
22 | inline void cudaCheckError(cudaError_t error) {
23 | if (error != cudaSuccess) {
24 | std::string msg{"CUDA error "};
25 | msg += cudaGetErrorName(error);
26 | msg += ": ";
27 | msg += cudaGetErrorString(error);
28 | throw std::runtime_error{msg};
29 | }
30 | }
31 |
32 | template <typename F>
33 | void cudaChecked(F func) {
34 | func();
35 | cudaDeviceSynchronize();
36 | cudaCheckError(cudaGetLastError());
37 | }
38 |
39 | #endif // CUDA_CHECK_ERROR_CUH
40 |
--------------------------------------------------------------------------------
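Usage sketch for the wrappers above (illustrative, not taken from the repository; `double_kernel` and its launch configuration are hypothetical):

#include <cuda_error.cuh>

// Hypothetical kernel, only to illustrate the error-checking wrappers.
__global__ void double_kernel(int* data, int n) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) data[i] *= 2;
}

void checked_launch(int* gpu_data, int n) {
    // cudaChecked runs the lambda, synchronizes the device and turns any
    // launch or execution error into a std::runtime_error.
    cudaChecked([&]() { double_kernel<<<(n + 255) / 256, 256>>>(gpu_data, n); });
    // Individual runtime API calls can be checked directly:
    cudaCheckError(cudaMemset(gpu_data, 0, n * sizeof(int)));
}

--------------------------------------------------------------------------------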
/include/cuda_memory.cuh:
--------------------------------------------------------------------------------
1 | /*
2 | * Parallel selection algorithm on GPUs
3 | * Copyright (c) 2018-2019 Tobias Ribizel (oss@ribizel.de)
4 | *
5 | * This program is free software: you can redistribute it and/or modify
6 | * it under the terms of the GNU General Public License as published by
7 | * the Free Software Foundation, version 3.
8 | *
9 | * This program is distributed in the hope that it will be useful, but
10 | * WITHOUT ANY WARRANTY; without even the implied warranty of
11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 | * General Public License for more details.
13 | *
14 | * You should have received a copy of the GNU General Public License
15 | * along with this program. If not, see <https://www.gnu.org/licenses/>.
16 | */
17 | #ifndef CUDA_MEMORY_CUH
18 | #define CUDA_MEMORY_CUH
19 |
20 | #include <iostream>
21 | #include <vector>
22 |
23 | #include "cuda_error.cuh"
24 |
25 | template <typename T>
26 | class cuda_resettable_array;
27 |
28 | template <typename T>
29 | class cuda_array {
30 | friend class cuda_resettable_array<T>;
31 | public:
32 | cuda_array() : size{}, storage{nullptr} {}
33 | cuda_array(std::size_t size) : size{size}, storage{nullptr} {
34 | cudaCheckError(cudaMalloc(&storage, sizeof(T) * size));
35 | }
36 | ~cuda_array() {
37 | if (storage) {
38 | try {
39 | cudaCheckError(cudaFree(storage));
40 | } catch (std::runtime_error& err) {
41 | std::cerr << err.what() << std::endl;
42 | }
43 | }
44 | }
45 | cuda_array(const cuda_array&) = delete;
46 | cuda_array(cuda_array&& other) {
47 | storage = other.storage;
48 | size = other.size;
49 | other.storage = nullptr;
50 | other.size = 0;
51 | }
52 | cuda_array& operator=(cuda_array&& other) {
53 | this->~cuda_array();
54 | storage = other.storage;
55 | size = other.size;
56 | other.storage = nullptr;
57 | other.size = 0;
58 | return *this;
59 | }
60 |
61 | operator T*() { return storage; }
62 |
63 | void copy_from_raw(const T* src) {
64 | cudaCheckError(cudaMemcpy(storage, src, size * sizeof(T), cudaMemcpyHostToDevice));
65 | }
66 |
67 | void copy_to_raw(T* dst) const {
68 | cudaCheckError(cudaMemcpy(dst, storage, size * sizeof(T), cudaMemcpyDeviceToHost));
69 | }
70 |
71 | void copy_from(const std::vector<T>& vec) {
72 | if (size != vec.size()) {
73 | *this = cuda_array{vec.size()};
74 | }
75 | copy_from_raw(vec.data());
76 | }
77 |
78 | void copy_to(std::vector<T>& vec) const {
79 | vec.resize(size);
80 | copy_to_raw(vec.data());
81 | }
82 |
83 | private:
84 | std::size_t size;
85 | T* storage;
86 | };
87 |
88 | template <typename T>
89 | class cuda_resettable_array {
90 | public:
91 | void copy_from_raw(const T* src) {
92 | storage.copy_from_raw(src);
93 | refstorage.copy_from_raw(src);
94 | }
95 |
96 | void copy_to_raw(T* dst) const {
97 | storage.copy_to_raw(dst);
98 | }
99 |
100 | void copy_from(const std::vector<T>& vec) {
101 | storage.copy_from(vec);
102 | refstorage.copy_from(vec);
103 | }
104 |
105 | void copy_to(std::vector<T>& vec) const {
106 | storage.copy_to(vec);
107 | }
108 |
109 | void reset() {
110 | cudaCheckError(cudaMemcpy(storage, refstorage, storage.size * sizeof(T), cudaMemcpyDeviceToDevice));
111 | }
112 |
113 | operator T*() { return storage; }
114 |
115 | private:
116 | cuda_array<T> storage;
117 | cuda_array<T> refstorage;
118 | };
119 |
120 | #endif // CUDA_MEMORY_CUH
121 |
--------------------------------------------------------------------------------
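Round-trip usage sketch for cuda_array (illustrative; `scale_kernel` is hypothetical): copy_from allocates device memory to match the vector and uploads it, the `operator T*()` conversion passes the buffer straight into a kernel launch, and copy_to resizes the host vector and downloads the result.

#include <cuda_error.cuh>
#include <cuda_memory.cuh>
#include <vector>

__global__ void scale_kernel(float* data, unsigned n, float factor) {
    unsigned i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) data[i] *= factor;
}

std::vector<float> scale_on_gpu(const std::vector<float>& host_in, float factor) {
    cuda_array<float> buf;
    buf.copy_from(host_in);  // allocates and uploads host_in
    auto n = static_cast<unsigned>(host_in.size());
    cudaChecked([&]() { scale_kernel<<<(n + 255) / 256, 256>>>(buf, n, factor); });
    std::vector<float> host_out;
    buf.copy_to(host_out);   // resizes host_out and downloads the result
    return host_out;
}

--------------------------------------------------------------------------------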
/include/cuda_timer.cuh:
--------------------------------------------------------------------------------
1 | /*
2 | * Parallel selection algorithm on GPUs
3 | * Copyright (c) 2018-2019 Tobias Ribizel (oss@ribizel.de)
4 | *
5 | * This program is free software: you can redistribute it and/or modify
6 | * it under the terms of the GNU General Public License as published by
7 | * the Free Software Foundation, version 3.
8 | *
9 | * This program is distributed in the hope that it will be useful, but
10 | * WITHOUT ANY WARRANTY; without even the implied warranty of
11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 | * General Public License for more details.
13 | *
14 | * You should have received a copy of the GNU General Public License
15 | * along with this program. If not, see <https://www.gnu.org/licenses/>.
16 | */
17 | #ifndef CUDA_TIMER_CUH
18 | #define CUDA_TIMER_CUH
19 |
20 | #include "cuda_error.cuh"
21 |
22 | #include
23 | #include
24 | #include
25 | #include
26 | #include
27 |
28 | class cuda_timer {
29 | public:
30 | cuda_timer(std::ostream& output) : m_events(6), m_output{&output} {
31 | for (auto& event : m_events) {
32 | cudaCheckError(cudaEventCreate(&event));
33 | }
34 | }
35 |
36 | ~cuda_timer() {
37 | for (auto& event : m_events) {
38 | cudaEventDestroy(event);
39 | }
40 | }
41 |
42 | template <typename Kernel>
43 | void timed(std::string name, int num_runs, Kernel kernel) {
44 | std::vector<std::vector<float>> results(num_runs, std::vector<float>(m_events.size() - 1));
45 | int max_event = -1;
46 | auto event = [&](int idx_event) {
47 | cudaCheckError(cudaEventRecord(m_events[idx_event]));
48 | max_event = std::max(idx_event, max_event);
49 | };
50 | for (int i = 0; i < num_runs; ++i) {
51 | cudaChecked([&]() { kernel(event); });
52 | cudaCheckError(cudaEventSynchronize(m_events[max_event]));
53 | for (int j = 0; j < max_event; ++j) {
54 | cudaCheckError(cudaEventElapsedTime(&results[i][j], m_events[j], m_events[j + 1]));
55 | }
56 | }
57 | auto& out = *m_output;
58 | out << name;
59 | for (const auto& run : results) {
60 | out << ",(";
61 | std::copy(run.begin(), run.begin() + max_event - 1, std::ostream_iterator<float>(out, ";"));
62 | out << run[max_event - 1] << ')';
63 | }
64 | out << std::endl; // flush output (in case of errors!)
65 | }
66 |
67 | private:
68 | std::vector<cudaEvent_t> m_events;
69 | std::ostream* m_output;
70 | };
71 |
72 | class cpu_timer {
73 | public:
74 | void start() { m_start = std::chrono::high_resolution_clock::now(); }
75 | void stop() { m_end = std::chrono::high_resolution_clock::now(); }
76 | template <typename F>
77 | void timed(F f) {
78 | start();
79 | f();
80 | stop();
81 | }
82 | double elapsed_us(int repetitions = 1) {
83 | return std::chrono::duration(m_end - m_start).count() / repetitions;
84 | }
85 |
86 | private:
87 | std::chrono::high_resolution_clock::time_point m_start;
88 | std::chrono::high_resolution_clock::time_point m_end;
89 | };
90 |
91 | #endif // CUDA_TIMER_CUH
92 |
--------------------------------------------------------------------------------
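Timing sketch following the pattern used in app/benchmark_sort.cu (illustrative): the lambda passed to cuda_timer::timed receives an event recorder, and the elapsed times between consecutive events form one parenthesized breakdown per run in the CSV line. The device-to-device copy stands in for an arbitrary kernel.

#include <cuda_timer.cuh>
#include <iostream>

void time_copy(const float* gpu_src, float* gpu_dst, unsigned n) {
    cuda_timer timer{std::cerr};
    timer.timed("d2d_copy", 10, [&](auto event) {
        event(0);  // start of the measured region
        cudaMemcpy(gpu_dst, gpu_src, n * sizeof(float), cudaMemcpyDeviceToDevice);
        event(1);  // end of the measured region
    });

    // Host-side timing with the cpu_timer companion class:
    cpu_timer t;
    t.timed([&] { cudaCheckError(cudaDeviceSynchronize()); });
    std::cerr << t.elapsed_us() << " us\n";
}

--------------------------------------------------------------------------------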
/include/kernel_config.cuh:
--------------------------------------------------------------------------------
1 | /*
2 | * Parallel selection algorithm on GPUs
3 | * Copyright (c) 2018-2019 Tobias Ribizel (oss@ribizel.de)
4 | *
5 | * This program is free software: you can redistribute it and/or modify
6 | * it under the terms of the GNU General Public License as published by
7 | * the Free Software Foundation, version 3.
8 | *
9 | * This program is distributed in the hope that it will be useful, but
10 | * WITHOUT ANY WARRANTY; without even the implied warranty of
11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 | * General Public License for more details.
13 | *
14 | * You should have received a copy of the GNU General Public License
15 | * along with this program. If not, see <https://www.gnu.org/licenses/>.
16 | */
17 | #include "cuda_definitions.cuh"
18 | #include <algorithm>
19 |
20 | namespace gpu {
21 |
22 | template
23 | struct bitonic_basecase_config {
24 | constexpr static index size_log2 = Size_log2;
25 | constexpr static index size = 1 << size_log2;
26 | constexpr static index local_size_log2 = Local_size_log2;
27 | constexpr static index local_size = 1 << local_size_log2;
28 | constexpr static index launch_size = size / local_size;
29 | };
30 |
31 | template
32 | struct sample_config {
33 | constexpr static index size_log2 = Size_log2;
34 | constexpr static index size = 1 << size_log2;
35 | constexpr static index local_size_log2 = size_log2 > max_block_size_log2 ? size_log2 - max_block_size_log2 : 0;
36 | constexpr static index local_size = 1 << local_size_log2;
37 | };
38 |
39 | template
40 | struct searchtree_config {
41 | constexpr static index height = Height;
42 | constexpr static index width = 1 << height;
43 | constexpr static index size = 2 * width - 1;
44 | };
45 |
46 | template
48 | struct algorithm_config {
49 | constexpr static bool shared_memory = Shared_memory;
50 | constexpr static bool warp_aggr = Warp_aggr;
51 | constexpr static bool write = Write;
52 | constexpr static index unroll = Unroll;
53 | constexpr static index max_block_size_log2 = Max_block_size_log2;
54 | constexpr static index max_block_size = 1 << max_block_size_log2;
55 | constexpr static index max_block_count_log2 = Max_block_count_log2;
56 | constexpr static index max_block_count = 1 << max_block_count_log2;
57 | constexpr static index merged_limit = Merged_limit;
58 | constexpr static bool bucket_select = Bucket_select;
59 | };
60 |
61 | template
64 | struct select_config {
65 | using basecase = bitonic_basecase_config;
66 | using sample = sample_config;
67 | using searchtree = searchtree_config;
68 | using algorithm = algorithm_config;
70 | constexpr static auto searchtree_kernel_size = std::max(std::min(max_block_size, sample::size), searchtree::width);
71 | };
72 |
73 | } // namespace gpu
--------------------------------------------------------------------------------
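Illustration of the derived constants above, assuming the template parameters of each struct follow the order of its members (Size_log2 then Local_size_log2 for the basecase, Height for the search tree); the numeric values are hypothetical examples, not configurations used by the library.

#include <kernel_config.cuh>

// A basecase of 2^10 elements with 2^2 elements handled per thread.
using example_basecase = gpu::bitonic_basecase_config<10, 2>;
static_assert(example_basecase::size == 1024, "size = 1 << size_log2");
static_assert(example_basecase::local_size == 4, "elements per thread = 1 << local_size_log2");
static_assert(example_basecase::launch_size == 256, "threads per block = size / local_size");

// A search tree of height 8: 256 buckets, 511 tree nodes in total.
using example_tree = gpu::searchtree_config<8>;
static_assert(example_tree::width == 256, "leaf count = 1 << height");
static_assert(example_tree::size == 511, "node count = 2 * width - 1");

--------------------------------------------------------------------------------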
/include/launcher_fwd.cuh:
--------------------------------------------------------------------------------
1 | /*
2 | * Parallel selection algorithm on GPUs
3 | * Copyright (c) 2018-2019 Tobias Ribizel (oss@ribizel.de)
4 | *
5 | * This program is free software: you can redistribute it and/or modify
6 | * it under the terms of the GNU General Public License as published by
7 | * the Free Software Foundation, version 3.
8 | *
9 | * This program is distributed in the hope that it will be useful, but
10 | * WITHOUT ANY WARRANTY; without even the implied warranty of
11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 | * General Public License for more details.
13 | *
14 | * You should have received a copy of the GNU General Public License
15 | * along with this program. If not, see <https://www.gnu.org/licenses/>.
16 | */
17 | #ifndef LAUNCHER_FWD_CUH
18 | #define LAUNCHER_FWD_CUH
19 |
20 | #include "cuda_definitions.cuh"
21 | #include "ssss_merged_memory.cuh"
22 |
23 | namespace gpu {
24 |
25 | namespace kernels {
26 | template
27 | struct ssss_multi_aux;
28 |
29 | template
30 | __global__ void partition(const T* in, T* out, index* atomic, index size, T pivot, index workcount);
31 |
32 | template
33 | __global__ void partition_count(const T* in, index* counts, index size, T pivot, index workcount);
34 |
35 | template
36 | __global__ void partition_distr(const T* in, T* out, const index* counts, index size, T pivot, index workcount);
37 |
38 | template
39 | __global__ void reduce_counts(const index* in, index* out, index num_blocks);
40 |
41 | template
42 | __global__ void prefix_sum_counts(index* in, index* out, index num_blocks);
43 |
44 | template
45 | __global__ void partition_prefixsum(index* counts, index block_count);
46 |
47 | template
48 | __global__ void count_buckets(const T* in, const T* tree, index* counts, poracle* oracles, index size, index workcount);
49 |
50 | template
51 | __device__ void masked_prefix_sum(index* counts, const mask* m);
52 | }
53 |
54 | template
55 | __host__ __device__ void build_searchtree(const T* in, T* out, index size);
56 |
57 | template
58 | __host__ __device__ void count_buckets(const T* in, const T* tree, index* localcounts,
59 | index* counts, poracle* oracles, index size);
60 |
61 | template
62 | __host__ __device__ void collect_bucket(const T* data, const poracle* oracles_packed,
63 | const index* prefix_sum, T* out, index size, oracle bucket,
64 | index* atomic);
65 |
66 | template
67 | __host__ __device__ void collect_bucket_indirect(const T* data, const poracle* oracles_packed,
68 | const index* prefix_sum, T* out, index size,
69 | const oracle* bucket, index* atomic);
70 |
71 | template
72 | __host__ __device__ void collect_buckets(const T* data, const poracle* oracles_packed,
73 | const index* block_prefix_sum, const index* bucket_out_ranges,
74 | T* out, index size, mask* buckets, index* atomic);
75 |
76 | template
77 | __host__ __device__ void ssss_merged(
78 | const T* in,
79 | T* out,
80 | poracle* oracles,
81 | index offset,
82 | const index* ranks,
83 | index rank_offset,
84 | index rank_base,
85 | const kernels::ssss_multi_aux* aux_in,
86 | kernels::ssss_multi_aux* aux_outs,
87 | T* out_trees);
88 |
89 | template
90 | void sampleselect(T* in, T* tmp, T* tree, index* count_tmp, index size, index rank, T* out);
91 |
92 | template
93 | void sampleselect_host(T* in, T* tmp, T* tree, index* count_tmp, index size, index rank, T* out);
94 |
95 | template
96 | void sampleselect_multi(T* in, T* tmp, index size, const index* ranks, index rank_count, index* tmp_storage, index* aux_storage, index* aux_atomic, T* out);
97 |
98 | template
99 | __device__ __host__ void partition(const T* in, T* out, index* counts, index size, T pivot);
100 |
101 | template
102 | void quickselect_multi(T* in, T* tmp, index* count_tmp, index size, const index* ranks, index rank_count, T* out);
103 |
104 | template
105 | void quickselect(T* in, T* tmp, index* count_tmp, index size, index rank, T* out);
106 |
107 | template
108 | __host__ __device__ launch_parameters get_launch_parameters(index size);
109 |
110 | } // namespace gpu
111 |
112 | #endif // LAUNCHER_FWD_CUH
--------------------------------------------------------------------------------
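End-to-end sketch of the single-rank quickselect launcher, patterned after app/test_qs.cu and app/test_fixture.cuh. The explicit `quickselect<float, Config>` instantiation and the scratch-buffer sizes are assumptions; the config values match app/test_qs.cu.

#include <cuda_memory.cuh>
#include <kernel_config.cuh>
#include <launcher_fwd.cuh>
#include <vector>

// Config taken from app/test_qs.cu; other parameter choices are possible.
using Config = gpu::select_config<10, 5, 8, true, true, true, 8, 10, 10>;

float select_rank(const std::vector<float>& host_data, gpu::index rank) {
    auto n = gpu::index(host_data.size());
    cuda_array<float> in, tmp, out;
    cuda_array<gpu::index> count_tmp;
    in.copy_from(host_data);               // input values
    tmp.copy_from(host_data);              // scratch buffer of the same size
    out.copy_from(std::vector<float>(1));  // result slot
    // Counter scratch; sized generously here (assumption), see app/test_fixture.cuh
    // for the sizing used by the tests.
    count_tmp.copy_from(std::vector<gpu::index>(n + (1 << 22)));
    gpu::quickselect<float, Config>(in, tmp, count_tmp, n, rank, out);
    std::vector<float> result;
    out.copy_to(result);
    return result[0];                      // the element of rank `rank` in sorted order
}

--------------------------------------------------------------------------------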
/include/verification.hpp:
--------------------------------------------------------------------------------
1 | /*
2 | * Parallel selection algorithm on GPUs
3 | * Copyright (c) 2018-2019 Tobias Ribizel (oss@ribizel.de)
4 | *
5 | * This program is free software: you can redistribute it and/or modify
6 | * it under the terms of the GNU General Public License as published by
7 | * the Free Software Foundation, version 3.
8 | *
9 | * This program is distributed in the hope that it will be useful, but
10 | * WITHOUT ANY WARRANTY; without even the implied warranty of
11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 | * General Public License for more details.
13 | *
14 | * You should have received a copy of the GNU General Public License
15 | * along with this program. If not, see <https://www.gnu.org/licenses/>.
16 | */
17 | #ifndef GPU_SELECTION_VERIFICATION_HPP
18 | #define GPU_SELECTION_VERIFICATION_HPP
19 |
20 | #include <cuda_definitions.cuh>
21 | #include <utility>
22 | #include <vector>
23 |
24 | namespace verification {
25 |
26 | using gpu::index;
27 | using gpu::mask;
28 |
29 | template
30 | std::pair count_mispartitioned(const std::vector& data, int pivot_rank, T pivot);
31 |
32 | template
33 | T nth_element(const std::vector& data, int rank);
34 |
35 | template
36 | std::vector nth_elements(const std::vector& data, std::vector ranks);
37 |
38 | template
39 | int count_not_in_bucket(const std::vector& data, T lower, T upper);
40 |
41 | template
42 | std::vector count_not_in_buckets(const std::vector& data, std::vector prefix_sum, const std::vector& searchtree);
43 |
44 | bool verify_rank_ranges(const std::vector& ranks, const std::vector& index_ranges, const std::vector& rank_ranges);
45 |
46 | } // namespace verification
47 |
48 | #endif // GPU_SELECTION_VERIFICATION_HPP
49 |
--------------------------------------------------------------------------------
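Host-side sketch of how these helpers are used to validate GPU results (compare app/test_qs.cu); the interpretation of the pair returned by count_mispartitioned is inferred from those tests.

#include <verification.hpp>
#include <vector>

// True if no element ended up on the wrong side of the pivot after partitioning.
bool partition_is_valid(const std::vector<float>& partitioned, int left_size, float pivot) {
    auto counts = verification::count_mispartitioned(partitioned, left_size, pivot);
    return counts.first == 0 && counts.second == 0;  // misplaced left/right elements
}

// Cross-check a single GPU selection result against the CPU answer.
bool selection_is_valid(const std::vector<float>& data, int rank, float gpu_result) {
    return verification::nth_element(data, rank) == gpu_result;
}

--------------------------------------------------------------------------------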
/lib/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | add_library(gpu_selection
2 | cpu_reference.cpp
3 | verification.cpp
4 | generated/gen0.cu
5 | generated/gen1.cu
6 | generated/gen2.cu
7 | generated/gen3.cu
8 | generated/gen4.cu
9 | generated/gen5.cu
10 | generated/gen6.cu
11 | generated/gen7.cu
12 | generated/gen8.cu
13 | generated/gen9.cu
14 | generated/gen10.cu
15 | generated/gen11.cu
16 | generated/gen12.cu
17 | generated/gen13.cu
18 | generated/gen14.cu
19 | generated/gen15.cu
20 | generated/gen16.cu
21 | generated/gen17.cu
22 | generated/gen18.cu
23 | generated/gen19.cu
24 | generated/gen20.cu
25 | generated/gen21.cu
26 | generated/gen22.cu
27 | generated/gen23.cu
28 | generated/gen24.cu
29 | generated/gen25.cu
30 | generated/gen26.cu
31 | generated/gen27.cu
32 | generated/gen28.cu
33 | generated/gen29.cu
34 | generated/gen30.cu
35 | generated/gen31.cu
36 | generated/gen32.cu
37 | generated/gen33.cu
38 | generated/gen34.cu
39 | generated/gen35.cu
40 | generated/gen36.cu
41 | generated/gen37.cu
42 | generated/gen38.cu
43 | generated/gen39.cu
44 | )
45 |
46 | target_compile_features(gpu_selection PUBLIC cxx_std_14)
47 | set_target_properties(gpu_selection PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
48 |
49 | target_include_directories(gpu_selection PUBLIC "${CMAKE_CURRENT_SOURCE_DIR}")
50 | target_include_directories(gpu_selection PUBLIC "${CMAKE_CURRENT_SOURCE_DIR}/../include")
51 |
--------------------------------------------------------------------------------
/lib/generated/gen0.cu:
--------------------------------------------------------------------------------
1 | #include
2 | #include
3 | #include
4 | #include
5 | #include
6 | #include
7 | #include
8 | namespace gpu {
9 |
10 | template void sampleselect>(double* in, double* tmp, double* tree, index* count_tmp, index size, index rank, double* out);
11 | template __host__ __device__ void collect_bucket_indirect>(const float* data, const poracle* oracles_packed, const index* prefix_sum, float* out, index size, const oracle* bucket, index* atomic);
12 | template void sampleselect>(double* in, double* tmp, double* tree, index* count_tmp, index size, index rank, double* out);
13 | template void sampleselect>(float* in, float* tmp, float* tree, index* count_tmp, index size, index rank, float* out);
14 | template __host__ __device__ void count_buckets>(const float* in, const float* tree, index* localcounts, index* counts, poracle* oracles, index size);
15 | template __host__ __device__ void collect_bucket>(const double* data, const poracle* oracles_packed, const index* prefix_sum, double* out, index size, oracle bucket, index* atomic);
16 | template __global__ void kernels::partition_prefixsum>(index* counts, index block_count);
17 | template void sampleselect>(float* in, float* tmp, float* tree, index* count_tmp, index size, index rank, float* out);
18 | template __host__ __device__ void collect_bucket>(const double* data, const poracle* oracles_packed, const index* prefix_sum, double* out, index size, oracle bucket, index* atomic);
19 | template void quickselect_multi>(float* in, float* tmp, index* count_tmp, index size, const index* ranks, index rank_count, float* out);
20 | template __global__ void kernels::partition_prefixsum>(index* counts, index block_count);
21 | }
--------------------------------------------------------------------------------
/lib/generated/gen1.cu:
--------------------------------------------------------------------------------
1 | #include
2 | #include
3 | #include
4 | #include
5 | #include
6 | #include
7 | #include
8 | namespace gpu {
9 | template __host__ __device__ launch_parameters get_launch_parameters>(index size);
10 | template void quickselect>(float* in, float* tmp, index* count_tmp, index size, index rank, float* out);
11 | template __device__ __host__ void partition>(const float* in, float* out, index* counts, index size, float pivot);
12 | template __global__ void kernels::count_buckets>(const double* in, const double* tree, index* counts, poracle* oracles, index size, index workcount);
13 | template __host__ __device__ launch_parameters get_launch_parameters>(index size);
14 | template __global__ void kernels::partition>(const double* in, double* out, index* atomic, index size, double pivot, index workcount);
15 | template __global__ void kernels::count_buckets>(const float* in, const float* tree, index* counts, poracle* oracles, index size, index workcount);
16 | template __global__ void kernels::count_buckets>(const float* in, const float* tree, index* counts, poracle* oracles, index size, index workcount);
17 | template void sampleselect_multi>(float* in, float* tmp, index size, const index* ranks, index rank_count, index* tmp_storage, index* aux_storage, index* aux_atomic, float* out);
18 | template __host__ __device__ launch_parameters get_launch_parameters>(index size);
19 | template void sampleselect>(float* in, float* tmp, float* tree, index* count_tmp, index size, index rank, float* out);
20 | template void sampleselect_multi>(float* in, float* tmp, index size, const index* ranks, index rank_count, index* tmp_storage, index* aux_storage, index* aux_atomic, float* out);
21 | }
--------------------------------------------------------------------------------
/lib/generated/gen10.cu:
--------------------------------------------------------------------------------
1 | #include
2 | #include
3 | #include
4 | #include
5 | #include
6 | #include
7 | #include
8 | namespace gpu {
9 | template void sampleselect_multi>(double* in, double* tmp, index size, const index* ranks, index rank_count, index* tmp_storage, index* aux_storage, index* aux_atomic, double* out);
10 | template __host__ __device__ void collect_bucket_indirect>(const float* data, const poracle* oracles_packed, const index* prefix_sum, float* out, index size, const oracle* bucket, index* atomic);
11 | template void sampleselect_multi>(double* in, double* tmp, index size, const index* ranks, index rank_count, index* tmp_storage, index* aux_storage, index* aux_atomic, double* out);
12 | template void sampleselect_host>(double* in, double* tmp, double* tree, index* count_tmp, index size, index rank, double* out);
13 | template __host__ __device__ void build_searchtree>(const float* in, float* out, index size);
14 | template __host__ __device__ void ssss_merged>(const float* in, float* out, poracle* oracles, index offset, const index* ranks, index rank_offset, index rank_base, const kernels::ssss_multi_aux>* aux_in, kernels::ssss_multi_aux>* aux_outs, float* out_tree);
15 | template void quickselect>(double* in, double* tmp, index* count_tmp, index size, index rank, double* out);
16 | template void sampleselect