├── README.rst ├── measure.cc ├── sample.cc ├── parallel_radix_sort_test.cc └── parallel_radix_sort.h /README.rst: -------------------------------------------------------------------------------- 1 | 概要 2 | ---- 3 | OpenMP を用いて並列化した Radix Sort です. 4 | また,参考文献の論文で提案されている高速化手法である Buffer based scheme を採用しています. 5 | 6 | キーのみのソートと,キー・値のペアのソートができます. 7 | キーとして,以下の型がとれます. 8 | 9 | * 符号付き整数 (char, short, int, long, long long) 10 | * 符号なし整数 (上のに unsigned がついたもの) 11 | * 浮動小数点数 (float, double) 12 | 13 | 使い方 14 | ------ 15 | sample.cc や measure.cc を見ると大体分かると思います. 16 | 17 | コンパイル時に -fopenmp を付けないと並列化されないので注意してください. 18 | 19 | 性能 20 | ---- 21 | measure.cc で 2 億要素の int 配列のソートの時間を測ります. 22 | 23 | 実行例:: 24 | 25 | % g++ -O3 measure.cc -fopenmp 26 | % ./a.out 27 | N = 200000000 28 | parallel_radix_sort::KeySort(0): 1.159468 sec 29 | parallel_radix_sort::KeySort(1): 0.972533 sec 30 | parallel_radix_sort::KeySort(2): 1.013231 sec 31 | std::sort(0): 19.788240 sec 32 | std::sort(1): 19.786527 sec 33 | std::sort(2): 19.858960 sec 34 | 35 | 参考文献 36 | -------- 37 | * Nadathur Satish, Changkyu Kim, Jatin Chhugani, Anthony D. Nguyen, Victor W. Lee, Daehyun Kim, and Pradeep Dubey. 2010. Fast sort on CPUs and GPUs: a case for bandwidth oblivious SIMD sort. In Proceedings of the 2010 international conference on Management of data (SIGMOD '10). ACM, New York, NY, USA, 351-362. DOI=10.1145/1807167.1807207 http://doi.acm.org/10.1145/1807167.1807207 38 | -------------------------------------------------------------------------------- /measure.cc: -------------------------------------------------------------------------------- 1 | #include "parallel_radix_sort.h" 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | const int DEFAULT_N = 200000000; 13 | const int TRIAL = 3; 14 | 15 | struct __bench__ { 16 | double start; 17 | char msg[100]; 18 | __bench__(const char* format, ...) 
19 | __attribute__((format(printf, 2, 3))) 20 | { 21 | va_list args; 22 | va_start(args, format); 23 | vsnprintf(msg, sizeof(msg), format, args); 24 | va_end(args); 25 | 26 | start = sec(); 27 | } 28 | ~__bench__() { 29 | fprintf(stderr, "%s: %.6f sec\n", msg, sec() - start); 30 | } 31 | double sec() { 32 | struct timeval tv; 33 | gettimeofday(&tv, NULL); 34 | return tv.tv_sec + tv.tv_usec * 1e-6; 35 | } 36 | operator bool() { return false; } 37 | }; 38 | 39 | #define benchmark(...) if (__bench__ __b__ = __bench__(__VA_ARGS__)); else 40 | 41 | void InitRandom(int *a, int n) { 42 | for (int i = 0; i < n; ++i) { 43 | a[i] = rand(); 44 | } 45 | } 46 | 47 | int main(int argc, char **argv) { 48 | if (argc >= 3) { 49 | fprintf(stderr, "usage: measure [size]\n"); 50 | exit(EXIT_FAILURE); 51 | } 52 | 53 | int N = DEFAULT_N; 54 | if (argc == 2) N = 1 << atoi(argv[1]); 55 | printf("N = %d\n", N); 56 | 57 | int *buf; 58 | buf = new int[N]; 59 | assert(buf); 60 | 61 | parallel_radix_sort::KeySort key_sort; 62 | key_sort.Init(N); 63 | 64 | for (int t = 0; t < TRIAL; ++t) { 65 | InitRandom(buf, N); 66 | benchmark("parallel_radix_sort::KeySort(%d)", t) { 67 | key_sort.Sort(buf, N); 68 | } 69 | } 70 | 71 | for (int t = 0; t < TRIAL; ++t) { 72 | InitRandom(buf, N); 73 | benchmark("std::sort(%d)", t) { 74 | std::sort(buf, buf + N); 75 | } 76 | } 77 | 78 | exit(EXIT_SUCCESS); 79 | } 80 | -------------------------------------------------------------------------------- /sample.cc: -------------------------------------------------------------------------------- 1 | #include "parallel_radix_sort.h" 2 | 3 | #include 4 | 5 | int main() { 6 | // Sorting keys 7 | { 8 | int data[5] = {-1, 2, 0, -2, 1}; 9 | 10 | parallel_radix_sort::SortKeys(data, 5); 11 | 12 | for (int i = 0; i < 5; ++i) printf("%d ", data[i]); 13 | puts("\n"); 14 | } 15 | 16 | // Sorting pairs 17 | { 18 | double keys[5] = {-0.1, 0.2, 0.0, -0.2, 0.1}; 19 | int vals[5] = {1, 2, 3, 4, 5}; 20 | 21 | 
parallel_radix_sort::SortPairs(keys, vals, 5); 22 | 23 | for (int i = 0; i < 5; ++i) printf("%+.1f ", keys[i]); 24 | puts(""); 25 | for (int i = 0; i < 5; ++i) printf("%4d ", vals[i]); 26 | puts("\n"); 27 | } 28 | 29 | // When you perform sorting more than once, you can avoid 30 | // the cost of initialization using classes |KeySort| or |PairSort| 31 | { 32 | int data[5] = {-1, 2, 0, -2, 1}; 33 | 34 | parallel_radix_sort::KeySort key_sort; 35 | key_sort.Init(5); 36 | int *sorted = key_sort.Sort(data, 5); 37 | 38 | for (int i = 0; i < 5; ++i) printf("%d ", sorted[i]); 39 | puts("\n"); 40 | } 41 | { 42 | double keys[5] = {-0.1, 0.2, 0.0, -0.2, 0.1}; 43 | int vals[5] = {1, 2, 3, 4, 5}; 44 | 45 | parallel_radix_sort::PairSort pair_sort; 46 | pair_sort.Init(5); 47 | std::pair sorted = pair_sort.Sort(keys, vals, 5); 48 | 49 | for (int i = 0; i < 5; ++i) printf("%+.1f ", sorted.first[i]); 50 | puts(""); 51 | for (int i = 0; i < 5; ++i) printf("%4d ", sorted.second[i]); 52 | puts("\n"); 53 | } 54 | 55 | // You can specify the number of threads. 56 | // (Otherwise default value given by OpenMP would be used.) 57 | { 58 | int data[5] = {-1, 2, 0, -2, 1}; 59 | 60 | parallel_radix_sort::SortKeys(data, 5, 4); // 4 thread 61 | 62 | for (int i = 0; i < 5; ++i) printf("%d ", data[i]); 63 | puts("\n"); 64 | } 65 | { 66 | int data[5] = {-1, 2, 0, -2, 1}; 67 | 68 | parallel_radix_sort::KeySort key_sort; 69 | key_sort.Init(5, 4); 70 | int *sorted = key_sort.Sort(data, 5, 4); 71 | 72 | for (int i = 0; i < 5; ++i) printf("%d ", sorted[i]); 73 | puts("\n"); 74 | } 75 | 76 | return 0; 77 | } 78 | -------------------------------------------------------------------------------- /parallel_radix_sort_test.cc: -------------------------------------------------------------------------------- 1 | // Copyright 2010, Takuya Akiba 2 | // All rights reserved. 
3 | // 4 | // Redistribution and use in source and binary forms, with or without 5 | // modification, are permitted provided that the following conditions are 6 | // met: 7 | // 8 | // * Redistributions of source code must retain the above copyright 9 | // notice, this list of conditions and the following disclaimer. 10 | // * Redistributions in binary form must reproduce the above 11 | // copyright notice, this list of conditions and the following disclaimer 12 | // in the documentation and/or other materials provided with the 13 | // distribution. 14 | // * Neither the name of Takuya Akiba nor the names of its 15 | // contributors may be used to endorse or promote products derived from 16 | // this software without specific prior written permission. 17 | // 18 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 19 | // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 20 | // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 21 | // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 22 | // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 23 | // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 24 | // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 25 | // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 26 | // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 27 | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 28 | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
29 | 30 | // Usage: 31 | // % g++ -O3 parallel_radix_sort_test.cc -lgtest -lgtest_main -fopenmp 32 | // % ./a.out 33 | 34 | #include "parallel_radix_sort.h" 35 | #include 36 | #include 37 | #include 38 | #include 39 | #include 40 | #include 41 | 42 | using testing::Types; 43 | 44 | const int kMaxNumElems = 100000; 45 | const int kMaxNumThreads = 32; 46 | const int kNumTrials = 100; 47 | 48 | namespace { 49 | uint8_t Random8bit() { 50 | return rand() & 0xFF; 51 | } 52 | 53 | template T Random() { 54 | T r(0); 55 | for (size_t i = 0; i < sizeof(T); ++i) { 56 | r |= static_cast(Random8bit()) << (i * 8); 57 | } 58 | return r; 59 | } 60 | 61 | template<> float Random() { 62 | for (;;) { 63 | union { 64 | uint32_t u; 65 | float f; 66 | } uf; 67 | uf.u = Random(); 68 | if (isnanf(uf.f)) continue; 69 | return uf.f; 70 | } 71 | } 72 | 73 | template<> double Random() { 74 | for (;;) { 75 | union { 76 | uint64_t u; 77 | double f; 78 | } uf; 79 | uf.u = Random(); 80 | if (isnan(uf.f)) continue; 81 | return uf.f; 82 | } 83 | } 84 | 85 | template void FillRandom(T *a, size_t n) { 86 | for (size_t i = 0; i < n; ++i) { 87 | a[i] = Random(); 88 | } 89 | } 90 | } // namespace 91 | 92 | typedef Types SortingTypes; 98 | 99 | template 100 | class ParallelRadixSortTest : public testing::Test {}; 101 | TYPED_TEST_CASE(ParallelRadixSortTest, SortingTypes); 102 | 103 | TYPED_TEST(ParallelRadixSortTest, KeySort) { 104 | TypeParam *dat = new TypeParam[kMaxNumElems]; 105 | TypeParam *ans = new TypeParam[kMaxNumElems]; 106 | ASSERT_NE(reinterpret_cast(NULL), dat); 107 | ASSERT_NE(reinterpret_cast(NULL), ans); 108 | 109 | parallel_radix_sort::KeySort key_sort; 110 | key_sort.Init(kMaxNumElems, kMaxNumThreads); 111 | 112 | for (int t = 0; t < kNumTrials; ++t) { 113 | int num_elems = 1 + rand() % kMaxNumElems; 114 | int num_threads = 1 + rand() % kMaxNumThreads; 115 | 116 | FillRandom(dat, num_elems); 117 | 118 | std::partial_sort_copy(dat, dat + num_elems, ans, ans + num_elems); 119 | TypeParam 
*res = key_sort.Sort(dat, num_elems, num_threads); 120 | 121 | for (int i = 0; i < num_elems; ++i) { 122 | // std::cout << ans[i] << " " << res[i] << std::endl; 123 | ASSERT_EQ(ans[i], res[i]); 124 | } 125 | // puts(""); 126 | } 127 | } 128 | 129 | -------------------------------------------------------------------------------- /parallel_radix_sort.h: -------------------------------------------------------------------------------- 1 | // Copyright 2010, Takuya Akiba 2 | // All rights reserved. 3 | // 4 | // Redistribution and use in source and binary forms, with or without 5 | // modification, are permitted provided that the following conditions are 6 | // met: 7 | // 8 | // * Redistributions of source code must retain the above copyright 9 | // notice, this list of conditions and the following disclaimer. 10 | // * Redistributions in binary form must reproduce the above 11 | // copyright notice, this list of conditions and the following disclaimer 12 | // in the documentation and/or other materials provided with the 13 | // distribution. 14 | // * Neither the name of Takuya Akiba nor the names of its 15 | // contributors may be used to endorse or promote products derived from 16 | // this software without specific prior written permission. 17 | // 18 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 19 | // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 20 | // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 21 | // A PARTICULAR PURPOSE ARE DISCLAIMED. 
// IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

#ifndef PARALLEL_RADIX_SORT_H_
#define PARALLEL_RADIX_SORT_H_

#ifdef _OPENMP
#include <omp.h>
#endif

#include <stdint.h>
#include <cstring>
#include <cassert>
#include <climits>
#include <algorithm>
#include <utility>

// Parallel LSD radix sort (OpenMP) with per-bucket software write buffers.
//
// The input is split into |num_threads| contiguous chunks. Every pass over
// |Base| bits builds one histogram per chunk, turns the histograms into
// destination offsets, and scatters each chunk through small buffers.
// When the header is compiled without OpenMP the chunks are simply processed
// one after another, so any |num_threads| value still sorts correctly.
namespace parallel_radix_sort {
namespace utility {
// These helpers are |inline| because this is a header: without it, including
// the header from two translation units produces duplicate definitions.

// Returns the number of threads OpenMP would use for a parallel region
// (always 1 when compiled without OpenMP).
inline int GetMaxThreads() {
#ifdef _OPENMP
  return omp_get_max_threads();
#else
  return 1;
#endif
}

// Sets the number of threads OpenMP uses for subsequent parallel regions.
// Without OpenMP only |num_threads| == 1 is accepted (checked by assert).
// The sorter itself no longer calls this; it is kept for existing callers.
inline void SetNumThreads(int num_threads) {
#ifdef _OPENMP
  omp_set_num_threads(num_threads);
#else
  if (num_threads != 1) {
    assert(!"compile with -fopenmp");
  }
#endif
}

// Returns the calling thread's number, in [0, number of threads).
inline int GetThreadId() {
#ifdef _OPENMP
  return omp_get_thread_num();
#else
  return 0;
#endif
}
}  // namespace utility

namespace internal {
// Number of elements held in each per-thread, per-bucket write buffer.
const size_t kOutBufferSize = 32;

// Implementation of the sort.
//   PlainType    : key type as seen by the caller.
//   UnsignedType : unsigned integer type of the same size, used for bit access.
//   Encoder      : maps the raw bits to an order-preserving unsigned value.
//   ValueManager : receives Push/Flush/Next callbacks to move attached values.
//   Base         : number of key bits consumed per pass.
template<typename PlainType, typename UnsignedType, typename Encoder,
         typename ValueManager, int Base>
class ParallelRadixSortInternal {
 public:
  ParallelRadixSortInternal()
      : max_elems_(0), max_threads_(0), tmp_(NULL), histo_(NULL),
        out_buf_(NULL), out_buf_n_(NULL), num_threads_(0),
        pos_bgn_(NULL), pos_end_(NULL), value_manager_(NULL) {
    assert(sizeof(PlainType) == sizeof(UnsignedType));
  }

  ~ParallelRadixSortInternal() {
    DeleteAll();
  }

  // Allocates working memory for up to |max_elems| keys and |max_threads|
  // chunks. |max_threads| == -1 selects utility::GetMaxThreads().
  // Any previous allocation is released first.
  void Init(size_t max_elems, int max_threads) {
    DeleteAll();

    if (max_threads == -1) {
      max_threads = utility::GetMaxThreads();
    }
    assert(max_threads >= 1);
    max_elems_ = max_elems;
    max_threads_ = max_threads;

    tmp_ = new UnsignedType[max_elems];

    // The pointer tables are value-initialized (all NULL) so that DeleteAll()
    // stays safe if one of the allocations below throws.
    histo_ = new size_t*[max_threads]();
    for (int i = 0; i < max_threads; ++i) {
      histo_[i] = new size_t[kNumBuckets];
    }

    out_buf_ = new UnsignedType**[max_threads]();
    out_buf_n_ = new size_t*[max_threads]();
    for (int i = 0; i < max_threads; ++i) {
      out_buf_[i] = new UnsignedType*[kNumBuckets]();
      out_buf_n_[i] = new size_t[kNumBuckets];
      for (size_t j = 0; j < kNumBuckets; ++j) {
        out_buf_[i][j] = new UnsignedType[kOutBufferSize];
      }
    }

    pos_bgn_ = new size_t[max_threads];
    pos_end_ = new size_t[max_threads];
  }

  // Sorts |data[0, num_elems)|. The returned pointer is either |data| or the
  // internal buffer, depending on the parity of the number of passes.
  // NOTE: keys are accessed through UnsignedType*, i.e. by type punning.
  PlainType *Sort(PlainType *data, size_t num_elems, int num_threads,
                  ValueManager *value_manager) {
    UnsignedType *src = reinterpret_cast<UnsignedType*>(data);
    UnsignedType *res = SortInternal(src, num_elems, num_threads,
                                     value_manager);
    return reinterpret_cast<PlainType*>(res);
  }

  // One-shot helper: allocates, sorts, and leaves the sorted keys in |data|.
  static void InitAndSort(PlainType *data, size_t num_elems, int num_threads,
                          ValueManager *value_manager) {
    ParallelRadixSortInternal prs;
    prs.Init(num_elems, num_threads);
    const PlainType *res =
        prs.Sort(data, num_elems, num_threads, value_manager);
    if (res != data) {
      for (size_t i = 0; i < num_elems; ++i) data[i] = res[i];
    }
  }

 private:
  static const size_t kNumBuckets = static_cast<size_t>(1) << Base;

  size_t max_elems_;
  int max_threads_;

  UnsignedType *tmp_;
  size_t **histo_;
  UnsignedType ***out_buf_;
  size_t **out_buf_n_;

  int num_threads_;
  size_t *pos_bgn_, *pos_end_;
  ValueManager *value_manager_;

  // Not copyable: the object owns raw buffers and a copy would free them
  // twice. Declared private and left undefined (C++03 idiom).
  ParallelRadixSortInternal(const ParallelRadixSortInternal &);
  void operator=(const ParallelRadixSortInternal &);

  // Releases every buffer and resets the capacity to zero. Safe to call on a
  // default-constructed or partially initialized object.
  void DeleteAll() {
    delete [] tmp_;
    tmp_ = NULL;

    if (histo_ != NULL) {
      for (int i = 0; i < max_threads_; ++i) delete [] histo_[i];
      delete [] histo_;
      histo_ = NULL;
    }

    if (out_buf_ != NULL) {
      for (int i = 0; i < max_threads_; ++i) {
        if (out_buf_[i] == NULL) continue;
        for (size_t j = 0; j < kNumBuckets; ++j) {
          delete [] out_buf_[i][j];
        }
        delete [] out_buf_[i];
      }
      delete [] out_buf_;
      out_buf_ = NULL;
    }

    if (out_buf_n_ != NULL) {
      for (int i = 0; i < max_threads_; ++i) delete [] out_buf_n_[i];
      delete [] out_buf_n_;
      out_buf_n_ = NULL;
    }

    delete [] pos_bgn_;
    delete [] pos_end_;
    pos_bgn_ = pos_end_ = NULL;

    max_elems_ = 0;
    max_threads_ = 0;
  }

  UnsignedType *SortInternal(UnsignedType *data, size_t num_elems,
                             int num_threads, ValueManager *value_manager) {
    assert(num_elems <= max_elems_);

    if (num_threads == -1) {
      // Never use more chunks than Init() allocated buffers for.
      num_threads = std::min(utility::GetMaxThreads(), max_threads_);
    }
    assert(1 <= num_threads && num_threads <= max_threads_);
    // The thread count is passed to each parallel region with a num_threads
    // clause, so the process-wide OpenMP setting is left untouched.
    num_threads_ = num_threads;

    value_manager_ = value_manager;

    // Compute |pos_bgn_| and |pos_end_|
    ComputeRanges(num_elems);

    // Iterate from lower bits to higher bits
    const int bits = CHAR_BIT * sizeof(UnsignedType);
    UnsignedType *src = data, *dst = tmp_;
    for (int b = 0; b < bits; b += Base) {
      ComputeHistogram(b, src);
      Scatter(b, src, dst);

      std::swap(src, dst);
      value_manager->Next();
    }

    return src;
  }

  // Splits [0, num_elems) into |num_threads_| contiguous chunks of nearly
  // equal size and stores their bounds in |pos_bgn_| / |pos_end_|.
  void ComputeRanges(size_t num_elems) {
    pos_bgn_[0] = 0;
    for (int i = 0; i < num_threads_ - 1; ++i) {
      const size_t t = (num_elems - pos_bgn_[i]) / (num_threads_ - i);
      pos_bgn_[i + 1] = pos_end_[i] = pos_bgn_[i] + t;
    }
    pos_end_[num_threads_ - 1] = num_elems;
  }

  // First step of a pass: histogram of the digit in bits [b, b + Base) for
  // every chunk, then conversion into destination offsets.
  void ComputeHistogram(int b, UnsignedType *src) {
    // Iterating over chunk ids (rather than relying on the id of the running
    // thread) guarantees every chunk is processed even when fewer threads
    // than requested are available, or when OpenMP is disabled.
#ifdef _OPENMP
#pragma omp parallel for num_threads(num_threads_) schedule(static, 1)
#endif
    for (int my_id = 0; my_id < num_threads_; ++my_id) {
      const size_t my_bgn = pos_bgn_[my_id];
      const size_t my_end = pos_end_[my_id];
      size_t *my_histo = histo_[my_id];

      std::memset(my_histo, 0, sizeof(size_t) * kNumBuckets);
      for (size_t i = my_bgn; i < my_end; ++i) {
        const UnsignedType s = Encoder::encode(src[i]);
        const size_t t = static_cast<size_t>(s >> b) & (kNumBuckets - 1);
        ++my_histo[t];
      }
    }

    // Exclusive prefix sum in (bucket, chunk) order: afterwards histo_[j][i]
    // is the first output position for bucket i of chunk j.
    size_t s = 0;
    for (size_t i = 0; i < kNumBuckets; ++i) {
      for (int j = 0; j < num_threads_; ++j) {
        const size_t t = s + histo_[j][i];
        histo_[j][i] = s;
        s = t;
      }
    }
  }

  // Second step of a pass: move the elements of |src| to their positions in
  // |dst|, staging them in small per-bucket buffers first.
  void Scatter(int b, UnsignedType *src, UnsignedType *dst) {
#ifdef _OPENMP
#pragma omp parallel for num_threads(num_threads_) schedule(static, 1)
#endif
    for (int my_id = 0; my_id < num_threads_; ++my_id) {
      const size_t my_bgn = pos_bgn_[my_id];
      const size_t my_end = pos_end_[my_id];
      size_t *my_histo = histo_[my_id];
      UnsignedType **my_buf = out_buf_[my_id];
      size_t *my_buf_n = out_buf_n_[my_id];

      std::memset(my_buf_n, 0, sizeof(size_t) * kNumBuckets);
      for (size_t i = my_bgn; i < my_end; ++i) {
        const UnsignedType s = Encoder::encode(src[i]);
        const size_t t = static_cast<size_t>(s >> b) & (kNumBuckets - 1);
        my_buf[t][my_buf_n[t]] = src[i];
        value_manager_->Push(my_id, t, my_buf_n[t], i);
        ++my_buf_n[t];

        // Buffer full: write it out in one burst.
        if (my_buf_n[t] == kOutBufferSize) {
          size_t p = my_histo[t];
          for (size_t j = 0; j < kOutBufferSize; ++j) {
            dst[p++] = my_buf[t][j];
          }
          value_manager_->Flush(my_id, t, kOutBufferSize, my_histo[t]);

          my_histo[t] += kOutBufferSize;
          my_buf_n[t] = 0;
        }
      }

      // Flush whatever is left in the buffers.
      for (size_t i = 0; i < kNumBuckets; ++i) {
        size_t p = my_histo[i];
        for (size_t j = 0; j < my_buf_n[i]; ++j) {
          dst[p++] = my_buf[i][j];
        }
        value_manager_->Flush(my_id, i, my_buf_n[i], my_histo[i]);
      }
    }
  }
};
}  // namespace internal

// Encoders map the raw bits of signed/unsigned integers and floating point
// numbers to unsigned integers whose natural order matches the key order.
namespace encoder {
class EncoderUnsigned {
 public:
  // Unsigned keys are already correctly ordered.
  template<typename UnsignedType>
  inline static UnsignedType encode(UnsignedType x) {
    return x;
  }
};

class EncoderSigned {
 public:
  // Flipping the sign bit puts negative values below non-negative ones.
  template<typename UnsignedType>
  inline static UnsignedType encode(UnsignedType x) {
    return x ^ (UnsignedType(1) << (CHAR_BIT * sizeof(UnsignedType) - 1));
  }
};

class EncoderDecimal {
 public:
  // Sign bit clear: flip only the sign bit.
  // Sign bit set:   flip every bit, which reverses the order of negatives.
  template<typename UnsignedType>
  inline static UnsignedType encode(UnsignedType x) {
    static const int bits = CHAR_BIT * sizeof(UnsignedType);
    const UnsignedType a = x >> (bits - 1);
    const UnsignedType b = (-a) | (UnsignedType(1) << (bits - 1));
    return x ^ b;
  }
};
}  // namespace encoder

// Value managers generalize the algorithm to both key-only sorting and
// key/value pair sorting.
namespace value_manager {
// Used for key-only sorting: every callback is a no-op.
class DummyValueManager {
 public:
  inline void Push(int /* thread */, size_t /* bucket */,
                   size_t /* num */, size_t /* from_pos */) {}

  inline void Flush(int /* thread */, size_t /* bucket */,
                    size_t /* num */, size_t /* to_pos */) {}

  void Next() {}
};

// Moves the values attached to the keys, mirroring the key buffers.
template<typename PlainType, typename ValueType, int Base>
class PairValueManager {
 public:
  PairValueManager()
      : max_elems_(0), max_threads_(0), original_(NULL), tmp_(NULL),
        src_(NULL), dst_(NULL), out_buf_(NULL) {}

  ~PairValueManager() {
    DeleteAll();
  }

  // Allocates buffers for up to |max_elems| values and |max_threads| chunks.
  // |max_threads| == -1 selects utility::GetMaxThreads().
  void Init(size_t max_elems, int max_threads) {
    if (max_threads == -1) {
      max_threads = utility::GetMaxThreads();
    }
    assert(max_threads >= 1);

    DeleteAll();

    max_elems_ = max_elems;
    max_threads_ = max_threads;

    tmp_ = new ValueType[max_elems];

    // Value-initialized so DeleteAll() is safe after a failed allocation.
    out_buf_ = new ValueType**[max_threads]();
    for (int i = 0; i < max_threads; ++i) {
      out_buf_[i] = new ValueType*[kNumBuckets]();
      for (size_t j = 0; j < kNumBuckets; ++j) {
        out_buf_[i][j] = new ValueType[kOutBufferSize];
      }
    }
  }

  // Must be called before each sort with the caller's value array.
  void Start(ValueType *original, size_t num_elems, int num_threads) {
    assert(num_elems <= max_elems_);
    assert(num_threads <= max_threads_);
    (void)num_elems;    // only read by the asserts above
    (void)num_threads;
    src_ = original_ = original;
    dst_ = tmp_;
  }

  // Stages the value at |src_[from_pos]| in slot |num| of the buffer.
  inline void Push(int thread, size_t bucket, size_t num, size_t from_pos) {
    out_buf_[thread][bucket][num] = src_[from_pos];
  }

  // Writes the first |num| staged values to |dst_| starting at |to_pos|.
  inline void Flush(int thread, size_t bucket, size_t num, size_t to_pos) {
    for (size_t i = 0; i < num; ++i) {
      dst_[to_pos++] = out_buf_[thread][bucket][i];
    }
  }

  // Called after each pass: the destination becomes the next source.
  void Next() {
    std::swap(src_, dst_);
  }

  // Array holding the values in sorted-key order after the last pass.
  ValueType *GetResult() {
    return src_;
  }

 private:
  static const size_t kOutBufferSize = internal::kOutBufferSize;
  static const size_t kNumBuckets = static_cast<size_t>(1) << Base;

  size_t max_elems_;
  int max_threads_;

  ValueType *original_, *tmp_;
  ValueType *src_, *dst_;
  ValueType ***out_buf_;

  // Not copyable: owns raw buffers (see ParallelRadixSortInternal).
  PairValueManager(const PairValueManager &);
  void operator=(const PairValueManager &);

  void DeleteAll() {
    delete [] tmp_;
    tmp_ = NULL;

    if (out_buf_ != NULL) {
      for (int i = 0; i < max_threads_; ++i) {
        if (out_buf_[i] == NULL) continue;
        for (size_t j = 0; j < kNumBuckets; ++j) {
          delete [] out_buf_[i][j];
        }
        delete [] out_buf_[i];
      }
      delete [] out_buf_;
      out_buf_ = NULL;
    }

    max_elems_ = 0;
    max_threads_ = 0;
  }
};
}  // namespace value_manager

// Frontend class for sorting keys
template<typename PlainType, typename UnsignedType = PlainType,
         typename Encoder = encoder::EncoderUnsigned, int Base = 8>
class KeySort {
  typedef value_manager::DummyValueManager DummyValueManager;
  typedef internal::ParallelRadixSortInternal
  <PlainType, UnsignedType, Encoder, DummyValueManager, Base> Internal;

 public:
  // In the following functions, when |max_threads| or |num_threads| is -1,
  // the default value given by OpenMP would be used.
  void Init(size_t max_elems, int max_threads = -1) {
    internal_.Init(max_elems, max_threads);
  }

  // Notice that the pointer returned by this
  // does not necessarily equal to |data|.
  PlainType *Sort(PlainType *data, size_t num_elems, int num_threads = -1) {
    return internal_.Sort(data, num_elems, num_threads, &dummy_value_manager_);
  }

  // Allocates, sorts in place, and releases everything again.
  static void InitAndSort(PlainType *data, size_t num_elems,
                          int num_threads = -1) {
    DummyValueManager dvm;
    Internal::InitAndSort(data, num_elems, num_threads, &dvm);
  }

 private:
  Internal internal_;
  DummyValueManager dummy_value_manager_;
};

// Frontend class for sorting pairs
template<typename PlainType, typename ValueType,
         typename UnsignedType = PlainType,
         typename Encoder = encoder::EncoderUnsigned,
         int Base = 8>
class PairSort {
  typedef value_manager::PairValueManager
  <PlainType, ValueType, Base> ValueManager;
  typedef internal::ParallelRadixSortInternal
  <PlainType, UnsignedType, Encoder, ValueManager, Base> Internal;

 public:
  // In the following functions, when |max_threads| or |num_threads| is -1,
  // the default value given by OpenMP would be used.
  void Init(size_t max_elems, int max_threads = -1) {
    internal_.Init(max_elems, max_threads);
    value_manager_.Init(max_elems, max_threads);
  }

  // Notice that the pointers returned by this
  // do not necessarily equal to |keys| and |vals|.
  std::pair<PlainType*, ValueType*> Sort(PlainType *keys, ValueType *vals,
                                         size_t num_elems,
                                         int num_threads = -1) {
    value_manager_.Start(vals, num_elems, num_threads);
    PlainType *res_keys =
        internal_.Sort(keys, num_elems, num_threads, &value_manager_);
    ValueType *res_vals = value_manager_.GetResult();
    return std::make_pair(res_keys, res_vals);
  }

  // Allocates, sorts both arrays in place, and releases everything again.
  static void InitAndSort(PlainType *keys, ValueType *vals,
                          size_t num_elems, int num_threads = -1) {
    ValueManager vm;
    vm.Init(num_elems, num_threads);
    vm.Start(vals, num_elems, num_threads);
    Internal::InitAndSort(keys, num_elems, num_threads, &vm);
    ValueType *res_vals = vm.GetResult();
    if (res_vals != vals) {
      for (size_t i = 0; i < num_elems; ++i) {
        vals[i] = res_vals[i];
      }
    }
  }

 private:
  Internal internal_;
  ValueManager value_manager_;
};

// Selects the unsigned carrier type and the encoder for a given key type.
#define TYPE_CASE(plain_type, unsigned_type, encoder_type)           \
  template<> class KeySort<plain_type>                               \
    : public KeySort<plain_type, unsigned_type,                      \
                     encoder::Encoder ## encoder_type> {};           \
  template<typename V> class PairSort<plain_type, V>                 \
    : public PairSort<plain_type, V, unsigned_type,                  \
                      encoder::Encoder ## encoder_type> {};

// Signed integers
TYPE_CASE(char,      unsigned char,      Signed)
TYPE_CASE(short,     unsigned short,     Signed)
TYPE_CASE(int,       unsigned int,       Signed)
TYPE_CASE(long,      unsigned long,      Signed)
TYPE_CASE(long long, unsigned long long, Signed)

// |signed char| and |char| are treated as different types
TYPE_CASE(signed char, unsigned char, Signed)

// Floating point numbers
TYPE_CASE(float,  uint32_t, Decimal)
TYPE_CASE(double, uint64_t, Decimal)

#undef TYPE_CASE

// Sorts |data[0, num_elems)| in place.
// |num_threads| == -1 uses the OpenMP default.
template<typename KeyType>
void SortKeys(KeyType *data, size_t num_elems, int num_threads = -1) {
  KeySort<KeyType>::InitAndSort(data, num_elems, num_threads);
}

// Sorts |keys| in place and applies the same permutation to |vals|.
// |num_threads| == -1 uses the OpenMP default.
template<typename KeyType, typename ValueType>
void SortPairs(KeyType *keys, ValueType *vals, size_t num_elems,
               int num_threads = -1) {
  PairSort<KeyType, ValueType>::InitAndSort(keys, vals, num_elems,
                                            num_threads);
}
}  // namespace parallel_radix_sort

#endif  // PARALLEL_RADIX_SORT_H_