/* Repo dump index (extraction-tool scaffolding):
├── AverageTests.cpp ├── BinarySearch.h ├── BundleSmallWorkItems.cpp ├── Configuration.h ├── CountingSort.h ├── CountingSortParallel.h ├── CountingSortParallelBenchmark.cpp ├── FillParallel.h ├── Histogram.h ├── HistogramParallel.h ├── InplaceMerge.h ├── InsertionSort.h ├── MIT-LICENSE ├── MemoryUsage.cpp ├── ParallelAlgorithms.cpp ├── ParallelAlgorithms.sln ├── ParallelAlgorithms.vcxproj ├── ParallelMerge.h ├── ParallelMergeSort.h ├── ParallelMergeSortBenchmark.cpp ├── ParallelQuickSort.cpp ├── ParallelStdCppExample.cpp ├── README.md ├── RadixSortCommon.h ├── RadixSortLSD.h ├── RadixSortLsdBenchmark.cpp ├── RadixSortLsdParallel.h ├── RadixSortMSD.h ├── RadixSortMsdBenchmark.cpp ├── RadixSortMsdParallel.h ├── SortParallel.h ├── StdParallelSortMemoryLeakDemo.cpp ├── SumBenchmark.cpp ├── SumParallel.h └── TODO.txt */

/* -------- /AverageTests.cpp -------- */

// NOTE(review): the original seven #include targets were stripped by the
// extraction tool (angle-bracketed names lost); likely <cstdio>, <cstdint>,
// <cstddef> and friends — confirm against the original repository.

// Overflow-free average of two signed ints.
// When both operands share a sign, (b - a) cannot overflow, so the difference
// form is safe; with mixed signs the plain sum cannot overflow. Rounding
// follows C++ integer division (truncation toward zero), so swapping the
// argument order can change the result by 1 for negative midpoints.
int AverageOverflowFree(int a, int b)
{
    const bool same_sign = (a >= 0) == (b >= 0);
    return same_sign ? a + (b - a) / 2 : (a + b) / 2;
}

// 64-bit signed overload; identical reasoning to the int version.
long long AverageOverflowFree(long long a, long long b)
{
    const bool same_sign = (a >= 0) == (b >= 0);
    return same_sign ? a + (b - a) / 2 : (a + b) / 2;
}

// Underflow-free average for unsigned operands: always subtract the smaller
// value from the larger, then offset from the smaller value.
unsigned int AverageUnderflowFree(unsigned int a, unsigned int b)
{
    return (b >= a) ? a + (b - a) / 2 : b + (a - b) / 2;
}

// size_t overload; same larger-minus-smaller scheme.
size_t AverageUnderflowFree(size_t a, size_t b)
{
    return (b >= a) ? a + (b - a) / 2 : b + (a - b) / 2;
}
56 | size_t AverageUnderflowFreeModulo(size_t a, size_t b) 57 | { 58 | return a / 2 + b / 2 + (a % 2 + b % 2) / 2; 59 | } 60 | 61 | void TestAverageOfTwoIntegers() 62 | { 63 | unsigned c_u = 6; 64 | unsigned d_u = 8; 65 | unsigned ave_cd_0 = (c_u + d_u) / 2; // correct result of 7 66 | unsigned ave_cd_1 = c_u + (c_u - d_u) / 2; // wrong result of 2147483653 67 | unsigned ave_cd_2 = c_u + (d_u - c_u) / 2; // correct result of 7 68 | unsigned ave_cd_3 = ((c_u ^ d_u) >> 1) + (c_u & d_u); 69 | 70 | printf("Average #0 = %u Average #1 = %u Average #2 = %u Average #3 = %u\n", ave_cd_0, ave_cd_1, ave_cd_2, ave_cd_3); 71 | 72 | unsigned a_u = 1; 73 | unsigned b_u = UINT32_MAX; 74 | 75 | unsigned ave_ab_0 = (a_u + b_u) / 2; // wrong result of 0 76 | unsigned ave_ab_1 = a_u + (b_u - a_u) / 2; // wrong result of 2 77 | unsigned ave_ab_2 = a_u + (b_u - a_u) / 2; // correct result of 2147483648 78 | unsigned ave_ab_3 = b_u + (a_u - b_u) / 2; // wrong result of 0 79 | unsigned ave_ab_4 = ((a_u ^ b_u) >> 1) + (a_u & b_u); // correct result of 2147483648 80 | 81 | printf("Average #0 = %u Average #1 = %u Average #2 = %u Average #3 = %u (a_u - b_u) = %u AverageMod = %u Average = %u\n", 82 | ave_ab_0, ave_ab_1, ave_ab_2, ave_ab_3, (a_u - b_u), (unsigned)AverageUnderflowFreeModulo(a_u, b_u), ave_ab_4); 83 | 84 | int e_i = 1; 85 | int f_i = INT32_MAX; 86 | 87 | int ave_ef_0 = (e_i + f_i) / 2; // wrong result of -1073741824 88 | int ave_ef_1 = f_i + (e_i - f_i) / 2; // correct result of 1073741824 89 | int ave_ef_2 = e_i + (f_i - e_i) / 2; // correct result of 1073741824 90 | int sum_ef_0 = e_i + f_i; // wrong result of -2147483648 91 | 92 | printf("Average #0 = %d Average #1 = %d Average #2 = %d Sum #0 = %d\n", ave_ef_0, ave_ef_1, ave_ef_2, sum_ef_0); 93 | 94 | e_i = -1; 95 | f_i = INT32_MIN; 96 | 97 | ave_ef_0 = (e_i + f_i) / 2; // wrong result of 1073741823 98 | ave_ef_1 = f_i + (e_i - f_i) / 2; // correct result of -1073741824 99 | ave_ef_2 = e_i + (f_i - e_i) / 2; // correct result 
of -1073741824 100 | int ave_ef_3 = ((unsigned)e_i + (unsigned)f_i) >> 1; // wrong result 1073741823 101 | sum_ef_0 = e_i + f_i; // wrong result of 2147483647 102 | 103 | printf("Average #0 = %d Average #1 = %d Average #2 = %d Average #3 = %d Sum #0 = %d\n", ave_ef_0, ave_ef_1, ave_ef_2, ave_ef_3, sum_ef_0); 104 | 105 | e_i = 1; 106 | f_i = INT32_MIN; 107 | 108 | ave_ef_0 = (e_i + f_i) / 2; // corrent result of -1073741823 109 | ave_ef_1 = f_i + (e_i - f_i) / 2; // wrong result of 1073741825 110 | ave_ef_2 = e_i + (f_i - e_i) / 2; // wrong result of 1073741824 111 | ave_ef_3 = ((unsigned)e_i + (unsigned)f_i) >> 1; // wrong result of 1073741824 112 | sum_ef_0 = e_i + f_i; // correct result of -2147483647 113 | int sub_ef_0 = e_i - f_i; // wrong result of -2147483647 114 | 115 | printf("Average #0 = %d Average #1 = %d Average #2 = %d Average #3 = %d Sum #0 = %d Sub #0 = %d\n", ave_ef_0, ave_ef_1, ave_ef_2, ave_ef_3, sum_ef_0, sub_ef_0); 116 | 117 | // Idea for unsigned: compare the two values, use the case of (larger - smaller) 118 | // Idea for signed: compare the two values with zero, if both negative then compare to each other and use (smaller - larger) 119 | // if both positive then compare to each other and use (larger - smaller), if oposite signs then can subtract without comparing to each other 120 | // Another clever solution for integers if you know they will be positive: ave = ((unsigned)low + (unsigned)high) / 2 . 
This works because 121 | // If we know that high >= low, then int mid = low + ((high - low) / 2 ) works 122 | 123 | if (f_i >= e_i) 124 | ave_ef_2 = e_i + (f_i - e_i) / 2; 125 | else 126 | ave_ef_2 = e_i + (e_i - f_i) / 2; 127 | 128 | printf("\n\n"); 129 | printf("AverageSafe = %d input A = %d input B = %d\n", AverageOverflowFree( -1, INT32_MIN), -1, INT32_MIN); 130 | printf("AverageSafe = %d input A = %d input B = %d\n", AverageOverflowFree(INT32_MIN, -1), INT32_MIN, -1); 131 | printf("AverageSafe = %d input A = %d input B = %d\n", AverageOverflowFree( 1, INT32_MAX), 1, INT32_MAX); 132 | printf("AverageSafe = %d input A = %d input B = %d\n", AverageOverflowFree(INT32_MAX, 1), INT32_MAX, 1); 133 | printf("AverageSafe = %d input A = %d input B = %d\n", AverageOverflowFree(INT32_MAX, INT32_MIN), INT32_MAX, INT32_MIN); 134 | printf("AverageSafe = %d input A = %d input B = %d\n", AverageOverflowFree(INT32_MIN, INT32_MAX), INT32_MIN, INT32_MAX); 135 | printf("AverageSafe = %d input A = %d input B = %d\n", AverageOverflowFree( 5, -1), 5, -1); 136 | printf("AverageSafe = %d input A = %d input B = %d\n", AverageOverflowFree( -1, 5), -1, 5); 137 | 138 | } 139 | -------------------------------------------------------------------------------- /BinarySearch.h: -------------------------------------------------------------------------------- 1 | // Copyright(c), Victor J. Duvanenko, 2010 2 | // Binary search variations that will be used by various other algorithms internally. 3 | 4 | #ifndef _BinarySearch_h 5 | #define _BinarySearch_h 6 | 7 | #pragma once 8 | 9 | #include 10 | 11 | // There are several ways to implement a modified binary search for insertion sort. One way is to compare with the middle array 12 | // element in the first step. Another way is to compare with the largest element in the first step and then the smallest element. 13 | // Or to compare largest and then middle element. 
// (comment continued from previous chunk) ... nearly inverted - i.e. worst and best cases and in between.
//
// What it will boil down to is to find an index where an element is larger and an index where an element is smaller and
// the difference between these indexes is 1 - this is the termination condition of the loop.
// From this termination condition it seems that we must test the upper limit and then the lower limit of the array and then
// reduce the distance between max and min indexes by 2X.
//
// For binary search we want to do the following:
// Compare with the largest element, which is what we are doing in the first if statement, and if the current element is
// bigger then no work is done and the element stays in its current spot.
// Compare with the smallest element and if it's smaller then we are done as well and need to move the entire array over

// Searches for the value within array "a", from a[ left ] to a[ right ] inclusively.
// Returns the left-most index at which the array element is >= value (lower-bound
// semantics), so the return index can be between left and (right + 1).
// Expects "a" to be pre-sorted ascending. right < left (no elements) returns left;
// left == right (one element) is handled by the same loop.
// This version is borrowed from "Introduction to Algorithms" 3rd edition, p. 799.
template< class _Type >
inline size_t my_binary_search( _Type value, const _Type* a, size_t left, size_t right )
{
    size_t lo = left;
    size_t hi = (std::max)( left, right + 1 );  // one past the last candidate slot
    while ( lo < hi )
    {
        // Overflow-free midpoint: hi > lo inside the loop, so (hi - lo) is safe.
        size_t mid = lo + (hi - lo) / 2;
        if ( value <= a[ mid ] )
            hi = mid;       // a[mid] is still a candidate for the answer
        else
            lo = mid + 1;   // value > a[mid], so a[mid] is ruled out
    }
    return hi;
}

// #endif // _BinarySearch_h  (guard kept as a note — see open of this header)

/* -------- /BundleSmallWorkItems.cpp -------- */
//#include "oneapi/tbb/tbbmalloc_proxy.h" // Intel's scalable memory allocator
// NOTE(review): nine #include targets stripped by extraction — likely <cstdio>,
// <vector>, <chrono>, <random>, <algorithm>, <functional>, TBB/PPL headers.

using std::chrono::duration;
using std::chrono::duration_cast;
using std::chrono::high_resolution_clock;
using std::milli;
using std::random_device;
using std::sort;
using std::vector;

// #include "ParallelMergeSort.h"  // NOTE(review): commented out — nothing in
// the visible part of this file uses it; restore if the unseen remainder does.

// Prints a timing line for a boolean result array.
// NOTE(review): the vector's element type was stripped by extraction;
// reconstructed as vector<bool> — the only visible call sites pass the
// vector<bool> return_values. The casts to int are required because
// vector<bool>::front()/back() return proxy objects, which must not be passed
// through printf's varargs. TODO confirm against the original repo.
void print_results(const char* const tag, const vector<bool>& in_array,
    high_resolution_clock::time_point startTime,
    high_resolution_clock::time_point endTime)
{
    printf("%s: size = %zu Lowest: %d Highest: %d Time: %fms\n", tag, in_array.size(), (int)in_array.front(), (int)in_array.back(),
        duration_cast<duration<double, milli>>(endTime - startTime).count());
}

// Take the number_of_items of size_of_each_item:
// - run them sequentially
// - run them in parallel
// - bundle multiple items and run these bundles in parallel
// Bundle 1, 2, 3, ..., small work items, such as std::all_of()
items, such as std::all_of() 36 | int bundling_small_work_items_benchmark(size_t max_bundle_size = 20, size_t number_of_items = 1000, size_t size_of_item = 100) 37 | { 38 | std::vector item(size_of_item, 2); 39 | std::vector< std::vector > all_data(number_of_items, item); // all_data[number_of_items][size_of_item] 40 | std::vector< bool > return_values(number_of_items, false); 41 | std::vector > bundle_of_functions; // vector of function calls which take no arguments and return void 42 | std::vector< std::vector > > all_bundles(number_of_items, bundle_of_functions); // all_bundles[number_of_items][?] 43 | 44 | high_resolution_clock::time_point startTime, endTime; 45 | 46 | if (number_of_items == 0) 47 | return 0; 48 | 49 | // Run and time Sequential algorithms 50 | startTime = high_resolution_clock::now(); 51 | for (size_t i = 0; i < number_of_items; i++) 52 | { 53 | return_values[i] = std::all_of(all_data[i].begin(), all_data[i].end(), [](int j) { return j == 2; }); 54 | } 55 | endTime = high_resolution_clock::now(); 56 | 57 | bool combined_return_values = return_values[0]; 58 | for (size_t i = 1; i < number_of_items; i++) 59 | { 60 | combined_return_values = combined_return_values && return_values[i]; 61 | } 62 | if (combined_return_values) 63 | print_results("Sequential std::all_of", return_values, startTime, endTime); 64 | else 65 | printf("Error: Sequential std::all_of failed!\n"); 66 | 67 | // Parallel execute and time many sequential algorithms 68 | return_values.assign(return_values.size(), false); 69 | startTime = high_resolution_clock::now(); 70 | 71 | #if defined(USE_PPL) 72 | Concurrency::task_group g; 73 | #else 74 | tbb::task_group g; 75 | #endif 76 | 77 | for (size_t i = 0; i < number_of_items; i++) 78 | { 79 | g.run([=, &return_values] { // important to not pass by reference, as all tasks will then get the same/last value 80 | return_values[i] = std::all_of(all_data[i].begin(), all_data[i].end(), [](int j) { return j == 2; }); 81 | }); 82 | } 83 | 
g.wait(); 84 | endTime = high_resolution_clock::now(); 85 | 86 | combined_return_values = return_values[0]; 87 | for (size_t i = 1; i < number_of_items; i++) 88 | { 89 | combined_return_values = combined_return_values && return_values[i]; 90 | } 91 | if (combined_return_values) 92 | print_results("Parallel std::all_of", return_values, startTime, endTime); 93 | else 94 | printf("Error: Parallel std::all_of failed!\n"); 95 | 96 | // Parallel execute and benchmark bundles of sequential algorithms 97 | for (size_t bundle_size = number_of_items; bundle_size > (number_of_items / 512); bundle_size /= 2) 98 | { 99 | return_values.assign(return_values.size(), false); 100 | for (size_t n = 0; n < number_of_items; n++) 101 | all_bundles[n].clear(); 102 | size_t number_of_full_bundles = number_of_items / bundle_size; // i.e. do only full bundles and not the partial bundle at the end 103 | printf("number_of_full_bundles = %zu number_of_items = %zu bundle_size = %zu\n", number_of_full_bundles, number_of_items, bundle_size); 104 | size_t ci = 0; // current item 105 | startTime = high_resolution_clock::now(); 106 | for (size_t b = 0; b < number_of_full_bundles; b++) // create all bundles 107 | { 108 | //printf("b = %zu\b", b); 109 | for (size_t bs = 0; bs < bundle_size; bs++, ci++) // create a single bundle of bundle_size items 110 | { 111 | //printf("bs = %zu ci = %zu\b", bs, ci); 112 | all_bundles[b].push_back([&, ci, b] { return_values[b] = std::all_of(all_data[ci].begin(), all_data[ci].end(), [](int j) { return j == 2; }); }); 113 | } 114 | } 115 | endTime = high_resolution_clock::now(); 116 | print_results("Time to create all bundles", return_values, startTime, endTime); 117 | 118 | startTime = high_resolution_clock::now(); 119 | // Run all bundles in parallel 120 | for (size_t b = 0; b < number_of_full_bundles; b++) 121 | { 122 | g.run([=, &all_bundles] { 123 | for (const auto& func : all_bundles[b]) { 124 | func(); // Execute each function within this bundle sequentially 
125 | } 126 | }); 127 | } 128 | g.wait(); 129 | endTime = high_resolution_clock::now(); 130 | print_results("Time to run all bundles", return_values, startTime, endTime); 131 | 132 | combined_return_values = return_values[0]; 133 | for (size_t i = 1; i < number_of_full_bundles; i++) 134 | { 135 | combined_return_values = combined_return_values && return_values[i]; 136 | } 137 | if (combined_return_values) 138 | print_results("Parallel bundles of std::all_of", return_values, startTime, endTime); 139 | else 140 | printf("Error: Parallel bundles of std::all_of failed!\n"); 141 | } 142 | 143 | return 0; 144 | } 145 | 146 | 147 | -------------------------------------------------------------------------------- /Configuration.h: -------------------------------------------------------------------------------- 1 | // Controls configuration of ParallelAlgorithms 2 | // Supports for Windows, either Microsoft PPL or Intel TBB 3 | // Supports for Linux, Intel TBB 4 | 5 | #pragma once 6 | 7 | //#define USE_TBB // Uncomment for Windows to use Intel TBB 8 | 9 | #if defined(WIN32) || defined(_WIN32) || defined(__WIN32) && !defined(__CYGWIN__) 10 | #ifndef USE_TBB 11 | #define USE_PPL 12 | #include 13 | #else 14 | #include "tbb/tbb.h" 15 | #include 16 | #include 17 | #endif 18 | #else 19 | #define USE_TBB 20 | #include "tbb/tbb.h" 21 | #include 22 | #include 23 | #endif 24 | -------------------------------------------------------------------------------- /CountingSort.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | // Counting Sort implementations 4 | 5 | #ifndef _CountingSort_h 6 | #define _CountingSort_h 7 | 8 | //#include 9 | 10 | #if defined(WIN32) || defined(_WIN32) || defined(__WIN32) && !defined(__CYGWIN__) 11 | #include 12 | #include 13 | #include 14 | #else 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include 23 | #endif 24 | 25 | using 
using std::chrono::duration;
using std::chrono::duration_cast;
using std::chrono::high_resolution_clock;
using std::milli;
using std::random_device;
using std::sort;
using std::vector;

// Prints a tag and an elapsed time in milliseconds.
// NOTE(review): duration_cast's template arguments were stripped by extraction;
// reconstructed as duration<double, milli> from the %fms format — confirm.
void print_results(const char* const tag, high_resolution_clock::time_point startTime, high_resolution_clock::time_point endTime)
{
    printf("%s: Time: %fms\n", tag, duration_cast<duration<double, milli>>(endTime - startTime).count());
}

namespace ParallelAlgorithms
{
    // Counting Sort of a byte array over [l, r) — left-inclusive, right-exclusive.
    // Two passes: histogram the 256 possible byte values, then rewrite the range
    // as ascending runs of each value. O(n + 256); stable by construction.
    // TODO: Turn the first pass into a reusable byte-histogram abstraction.
    inline void counting_sort(unsigned char* array_to_sort, size_t l, size_t r)
    {
        size_t counts[256]{};
        for (size_t _current = l; _current < r; _current++)   // count occurrences of each byte value
            counts[array_to_sort[_current]]++;

        // FIX(review): the output pass previously started at index 0 instead of
        // l, so any call with l > 0 rewrote the wrong region of the array (the
        // in-file callers all pass l == 0, which is why this went unnoticed).
        size_t start_index = l;
        for (size_t count_index = 0; count_index < 256; count_index++)
        {
            // std::fill compiles to memset-speed code here; a hand-written loop
            // was measured to be many times slower (dead variant removed).
            std::fill(array_to_sort + start_index, array_to_sort + start_index + counts[count_index], (unsigned char)count_index);
            start_index += counts[count_index];
        }
    }
}
// #endif — _CountingSort_h include guard (opened as a note in the previous chunk)
-------------------------------------------------------------------------------- /CountingSortParallel.h: -------------------------------------------------------------------------------- 1 | // TODO: Add a version of the Parallel Count/Histogram, which does not allocate the count array, but instead allocates a single array that's big enough to fit all of the needed count arrays 2 | // as a single buffer. A unique ID would also need to be provided to each of the leaf node (have done this before) to select a unique sub-buffer within the single buffer. 3 | // TODO: Combine reading 64-bits technique with multi-buffering that removed dependency across for loop iterations into a single implementation to see if it performs even faster and more consistent. 4 | #pragma once 5 | 6 | // Parallel Counting Sort implementations 7 | 8 | #ifndef _ParallelCountingSort_h 9 | #define _ParallelCountingSort_h 10 | 11 | #include "Configuration.h" 12 | 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | #include 21 | 22 | #include "RadixSortMsdParallel.h" 23 | #include "FillParallel.h" 24 | #include "HistogramParallel.h" 25 | 26 | namespace ParallelAlgorithms 27 | { 28 | // left (l) boundary is inclusive and right (r) boundary is exclusive 29 | template< unsigned long NumberOfBins > 30 | inline void counting_sort_parallel_inner(unsigned char *array_to_sort, size_t l, size_t r, size_t threshold_count = 64 * 1024, size_t threshold_fill = 64 * 1024) 31 | { 32 | //const auto startTimeHistogram = high_resolution_clock::now(); 33 | 34 | size_t* counts = HistogramOneByteComponentParallel_3< NumberOfBins >(array_to_sort, l, r, threshold_count); 35 | //size_t* counts = HistogramOneByteComponentParallel< NumberOfBins >(array_to_sort, l, r, threshold_count); 36 | 37 | //const auto endTimeHistogram = high_resolution_clock::now(); 38 | //print_results_par("Parallel Histogram inside byte array Counting Sort", startTimeHistogram, endTimeHistogram); 39 
| 40 | //const auto startTimeFill = high_resolution_clock::now(); 41 | #if 0 42 | size_t start_index = 0; 43 | for (size_t count_index = 0; count_index < PowerOfTwoRadix; count_index++) 44 | { 45 | // Then use both of these combined to show that full memory bandwidth can be achieved in C# and C++. In C++ we can use memset() to do SSE 46 | parallel_fill(array_to_sort, (unsigned char)count_index, start_index, start_index + counts[count_index], threshold_fill); 47 | //parallel_fill_2(array_to_sort, (unsigned char)count_index, start_index, counts[count_index], threshold_fill); 48 | //memset(array_to_sort + start_index, count_index, counts[count_index]); 49 | //std::fill(array_to_sort + start_index, array_to_sort + start_index + counts[count_index], count_index); 50 | //std::fill(oneapi::dpl::execution::par_unseq, array_to_sort + start_index, array_to_sort + start_index + counts[count_index], count_index); 51 | 52 | //size_t end_index = start_index + counts[count_index]; // for loop leads to 3X slower algorithm 53 | //for (size_t i = start_index; i <= end_index; i++) 54 | // array_to_sort[i] = count_index; 55 | start_index += counts[count_index]; 56 | } 57 | #else 58 | size_t start_indexes[NumberOfBins]; 59 | start_indexes[0] = 0; 60 | for (size_t count_index = 1; count_index < NumberOfBins; count_index++) 61 | start_indexes[count_index] = start_indexes[count_index - 1] + counts[count_index - 1]; 62 | 63 | #if defined(USE_PPL) 64 | Concurrency::parallel_for(size_t(0), size_t(NumberOfBins), [&](size_t count_index) 65 | #else 66 | tbb::parallel_for(size_t(0), size_t(NumberOfBins), [&](size_t count_index) 67 | #endif 68 | { 69 | parallel_fill(array_to_sort, (unsigned char)count_index, start_indexes[count_index], start_indexes[count_index] + counts[count_index], threshold_fill); 70 | //std::fill(oneapi::dpl::execution::par_unseq, array_to_sort + start_indexes[count_index], array_to_sort + start_indexes[count_index] + counts[count_index], count_index); 71 | }); 72 | #endif 
73 | //const auto endTimeFill = high_resolution_clock::now(); 74 | //print_results_par("Parallel Fill inside byte array Counting Sort", startTimeFill, endTimeFill); 75 | delete[] counts; 76 | } 77 | 78 | inline void counting_sort_parallel(unsigned char* a, size_t a_size) 79 | { 80 | if (a_size == 0) return; 81 | 82 | const unsigned long NumberOfBins = 256; 83 | //const long threshold_count = a_size / 18; // 18 cores on 24-core seems to lead to maximal performance. 12-cores seems to be a good value for 6-core CPU 84 | //const long threshold_fill = a_size / 12; // 10-12 cores on 24-core seems to lead to maximal performance, with 24-cores slowing down by 2X. 2-cores seems to be the best value for 6-core CPU. Using std::fill is even better 85 | const long threshold_count = 64 * 1024; // 18 cores on 24-core seems to lead to maximal performance. 12-cores seems to be a good value for 6-core CPU 86 | const long threshold_fill = 64 * 1024; // 10-12 cores on 24-core seems to lead to maximal performance, with 24-cores slowing down by 2X. 
2-cores seems to be the best value for 6-core CPU 87 | 88 | counting_sort_parallel_inner< NumberOfBins >(a, 0, a_size, threshold_count, threshold_fill); 89 | //counting_sort_parallel_inner< PowerOfTwoRadix >(a, 0, a_size); 90 | } 91 | } 92 | #endif -------------------------------------------------------------------------------- /CountingSortParallelBenchmark.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | 11 | #include "CountingSortParallel.h" 12 | #include "CountingSort.h" 13 | 14 | using std::chrono::duration; 15 | using std::chrono::duration_cast; 16 | using std::chrono::high_resolution_clock; 17 | using std::milli; 18 | using std::random_device; 19 | using std::sort; 20 | using std::vector; 21 | 22 | const int iterationCount = 5; 23 | 24 | static void print_results(const char* const tag, const unsigned char* sorted, size_t sortedLength, 25 | high_resolution_clock::time_point startTime, 26 | high_resolution_clock::time_point endTime) { 27 | printf("%s: Lowest: %u Highest: %u Time: %fms\n", tag, 28 | (unsigned)sorted[0], (unsigned)sorted[sortedLength - 1], 29 | duration_cast>(endTime - startTime).count()); 30 | } 31 | 32 | int CountingSortBenchmark(vector& uints) 33 | { 34 | unsigned char* ucharCopy = new unsigned char[uints.size()]; 35 | unsigned char* sorted = new unsigned char[uints.size()]; 36 | unsigned long long* u64array = new unsigned long long[uints.size()]; 37 | 38 | printf("\n"); 39 | // time how long it takes to sort them: 40 | for (int i = 0; i < iterationCount; ++i) 41 | { 42 | for (size_t j = 0; j < uints.size(); j++) { // copy the original random array into the source array each time, since ParallelMergeSort modifies the source array while sorting 43 | //uints[j] = j + 2; // for pre-sorted array testing 44 | ucharCopy[j] = (unsigned char)uints[j]; 45 | sorted[j] = (unsigned char)j; // page 
in the destination array into system memory 46 | u64array[j] = (unsigned long long)j; 47 | } 48 | // Eliminate compiler ability to optimize paging-in of the input and output arrays 49 | // Paging-in source and destination arrays leads to a 50% speed-up on Linux, and 15% on Windows 50 | 51 | vector sorted_reference(uints.size()); 52 | for (size_t j = 0; j < uints.size(); j++) 53 | sorted_reference[j] = (unsigned char)uints[j]; 54 | const auto startTimeRef = high_resolution_clock::now(); 55 | //sort(sorted_reference.begin(), sorted_reference.end()); 56 | sort(std::execution::par_unseq, sorted_reference.begin(), sorted_reference.end()); 57 | //sort(oneapi::dpl::execution::par_unseq, sorted_reference.begin(), sorted_reference.end()); 58 | const auto endTimeRef = high_resolution_clock::now(); 59 | print_results("std::sort of byte array", ucharCopy, uints.size(), startTimeRef, endTimeRef); 60 | 61 | //printf("ulongsCopy address = %p sorted address = %p value at a random location = %lu %lu\n", ucharCopy, sorted, sorted[static_cast(rd()) % uints.size()], ulongsCopy[static_cast(rd()) % uints.size()]); 62 | const auto startTime = high_resolution_clock::now(); 63 | //ParallelAlgorithms::counting_sort(ucharCopy, 0, uints.size()); 64 | ParallelAlgorithms::counting_sort_parallel(ucharCopy, uints.size()); 65 | //ParallelAlgorithms::parallel_fill(u64array, 0, 0, uints.size(), uints.size() / 24); // same performance for filling array of 64-bit 66 | //ParallelAlgorithms::parallel_fill(sorted, 0, 0, uints.size(), uints.size() / 2); // dividing by # cores provides more consistent performance 67 | //std::fill(std::execution::par_unseq, ucharCopy + 0, ucharCopy + uints.size(), 10); // does not support parallel 68 | const auto endTime = high_resolution_clock::now(); 69 | print_results("Parallel Counting Sort", ucharCopy, uints.size(), startTime, endTime); 70 | if (std::equal(sorted_reference.begin(), sorted_reference.end(), ucharCopy)) 71 | printf("Arrays are equal\n"); 72 | else 73 | { 
74 | printf("Arrays are not equal\n"); 75 | exit(1); 76 | } 77 | } 78 | 79 | delete[] sorted; 80 | delete[] ucharCopy; 81 | 82 | return 0; 83 | } 84 | 85 | -------------------------------------------------------------------------------- /FillParallel.h: -------------------------------------------------------------------------------- 1 | // TODO: Possible improvement is to provide an option to go around the CPU cache using SSE instructions for writes that can go around the cache, to not evict items out of the cache. 2 | // Parallel Fill implementations 3 | 4 | #ifndef _ParallelFill_h 5 | #define _ParallelFill_h 6 | 7 | #include "Configuration.h" 8 | 9 | 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | 19 | namespace ParallelAlgorithms 20 | { 21 | // Inclusive-left and exclusive-right boundaries 22 | template< class _Type > 23 | inline void parallel_fill(_Type* src, _Type value, size_t l, size_t r, size_t parallel_threshold = 16 * 1024) 24 | { 25 | if (r <= l) 26 | return; 27 | if ((r - l) < parallel_threshold) 28 | { 29 | std::fill(src + l, src + r, value); // many times faster than for loop 30 | //for (size_t i = l; i < r; i++) 31 | // src[i] = value; 32 | return; 33 | } 34 | size_t m = r / 2 + l / 2 + (r % 2 + l % 2) / 2; // average without overflow 35 | #if defined(USE_PPL) 36 | Concurrency::parallel_invoke( 37 | #else 38 | tbb::parallel_invoke( 39 | #endif 40 | [&] { parallel_fill(src, value, l, m, parallel_threshold); }, 41 | [&] { parallel_fill(src, value, m, r, parallel_threshold); } 42 | ); 43 | } 44 | // Inclusive-left and exclusive-right boundaries 45 | inline void parallel_fill(unsigned char* src, unsigned char value, size_t l, size_t r, size_t parallel_threshold = 16 * 1024) 46 | { 47 | if (r <= l) 48 | return; 49 | if ((r - l) < parallel_threshold) 50 | { 51 | //memset(src + l, (int)value, r - l); // many times faster than the for loop below 52 | std::fill(src + l, src + r, value); // 
// NOTE(review): the tail of ParallelAlgorithms::parallel_fill(unsigned char*,...)
// and the close of FillParallel.h, which originally began this dump chunk, were
// consolidated into the previous chunk during reformatting.

/* -------- /Histogram.h -------- */
// TODO: Switch histogram calculation from mask/shift to union

#pragma once

// Per-byte-lane histograms of 32-bit keys over a[l .. r], INCLUSIVE of both
// bounds (unlike most of this codebase, which uses an exclusive right bound).
// Returns one flat allocation of numberOfDigits * NumberOfBins counters
// (lane d's counts at count[d * NumberOfBins ...]); the caller owns the
// allocation and must delete[] it.
// NOTE(review): numberOfDigits is taken from Log2ofPowerOfTwoRadix (8 for radix
// 256), yet only the 4 byte lanes of a 32-bit key are counted, so rows 4..7
// stay zero — presumably sized to match the radix-sort template machinery;
// confirm.
template< unsigned PowerOfTwoRadix, unsigned Log2ofPowerOfTwoRadix >
inline size_t* HistogramByteComponents_1(unsigned inArray[], size_t l, size_t r)
{
    const unsigned numberOfDigits = Log2ofPowerOfTwoRadix;
    const unsigned NumberOfBins = PowerOfTwoRadix;

    size_t* count = new size_t[numberOfDigits * NumberOfBins];
    for (unsigned i = 0; i < numberOfDigits * NumberOfBins; i++)
        count[i] = 0;

    // Per-lane base pointers avoid a multiply per increment in the hot loop.
    size_t* count0 = count + (0 * NumberOfBins);
    size_t* count1 = count + (1 * NumberOfBins);
    size_t* count2 = count + (2 * NumberOfBins);
    size_t* count3 = count + (3 * NumberOfBins);

    for (size_t current = l; current <= r; current++)   // count each byte lane's value - i.e. size of each bin
    {
        unsigned value = inArray[current];
        count0[ value        & 0xff]++;
        count1[(value >>  8) & 0xff]++;
        count2[(value >> 16) & 0xff]++;
        count3[(value >> 24) & 0xff]++;
    }
    return count;
}

// Per-byte-lane histograms of 64-bit keys over inArray[l .. r] INCLUSIVE.
// Returns numberOfDigits separately-allocated count arrays via a pointer
// array; the caller owns all numberOfDigits + 1 allocations.
// NOTE(review): only the LOW 4 bytes of each 64-bit key are counted (lanes
// 4..7 of the returned arrays stay zero) — the original read the key into an
// `unsigned long`, whose width is implementation-defined (32-bit on Windows,
// 64-bit on Linux), but either way only bytes 0-3 were ever binned. The local
// is now unsigned long long with explicit low-lane masking: behavior is
// identical on every platform and the truncation is no longer implicit.
template< unsigned PowerOfTwoRadix, unsigned Log2ofPowerOfTwoRadix >
inline size_t** HistogramByteComponents(unsigned long long inArray[], int l, int r)
{
    const unsigned numberOfDigits = Log2ofPowerOfTwoRadix;
    const unsigned NumberOfBins = PowerOfTwoRadix;

    size_t** count = new size_t * [numberOfDigits];

    for (unsigned i = 0; i < numberOfDigits; i++)
    {
        count[i] = new size_t[NumberOfBins];
        for (unsigned j = 0; j < NumberOfBins; j++)
            count[i][j] = 0;
    }

    // Flattened lane pointers: one less level of indirection in the hot loop.
    size_t* count0 = count[0];
    size_t* count1 = count[1];
    size_t* count2 = count[2];
    size_t* count3 = count[3];

    for (int current = l; current <= r; current++)   // count each byte lane's value - i.e. size of each bin
    {
        unsigned long long value = inArray[current];
        count0[ value        & 0xff]++;
        count1[(value >>  8) & 0xff]++;
        count2[(value >> 16) & 0xff]++;
        count3[(value >> 24) & 0xff]++;
    }
    return count;
}
size of each bin 54 | { 55 | unsigned long value = inArray[current]; 56 | count0[value & 0xff]++; 57 | count1[(value >> 8) & 0xff]++; 58 | count2[(value >> 16) & 0xff]++; 59 | count3[(value >> 24) & 0xff]++; 60 | } 61 | return count; 62 | } 63 | -------------------------------------------------------------------------------- /HistogramParallel.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "Configuration.h" 4 | 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | 13 | namespace ParallelAlgorithms 14 | { 15 | // left (l) boundary is inclusive and right (r) boundary is exclusive 16 | template< unsigned NumberOfBins > 17 | inline size_t* HistogramOneByteComponentParallel(unsigned char inArray[], size_t l, size_t r, size_t parallelThreshold = 64 * 1024) 18 | { 19 | size_t* countLeft = NULL; 20 | size_t* countRight = NULL; 21 | 22 | if (l >= r) // zero elements to compare 23 | { 24 | countLeft = new size_t[NumberOfBins]{}; 25 | return countLeft; 26 | } 27 | if ((r - l) <= parallelThreshold) 28 | { 29 | countLeft = new size_t[NumberOfBins]{}; 30 | 31 | for (size_t current = l; current < r; current++) // Scan the array and count the number of times each digit value appears - i.e. 
size of each bin 32 | countLeft[inArray[current]]++; 33 | 34 | return countLeft; 35 | } 36 | 37 | size_t m = r / 2 + l / 2 + (r % 2 + l % 2) / 2; // average without overflow 38 | 39 | #if defined(USE_PPL) 40 | Concurrency::parallel_invoke( 41 | #else 42 | tbb::parallel_invoke( 43 | #endif 44 | [&] { countLeft = HistogramOneByteComponentParallel (inArray, l, m, parallelThreshold); }, 45 | [&] { countRight = HistogramOneByteComponentParallel (inArray, m, r, parallelThreshold); } 46 | ); 47 | // Combine left and right results 48 | for (size_t j = 0; j < NumberOfBins; j++) 49 | countLeft[j] += countRight[j]; 50 | 51 | delete[] countRight; 52 | 53 | return countLeft; 54 | } 55 | 56 | // left (l) boundary is inclusive and right (r) boundary is exclusive 57 | template< unsigned NumberOfBins > 58 | inline size_t* HistogramOneByteComponentParallel_4(unsigned char inArray[], size_t l, size_t r, size_t parallelThreshold = 64 * 1024) 59 | { 60 | size_t* countLeft = NULL; 61 | size_t* countRight = NULL; 62 | 63 | if (l >= r) // zero elements to compare 64 | { 65 | countLeft = new size_t[NumberOfBins]{}; 66 | return countLeft; 67 | } 68 | if ((r - l) <= parallelThreshold) 69 | { 70 | countLeft = new size_t[NumberOfBins]{}; 71 | size_t current = l; 72 | //if (((unsigned long long)(inArray + l) & 0x7) != 0) 73 | // printf("Memory alignment is not on 8-byte boundary\n"); 74 | // TODO: Detect not-64-bit aligned address and process bytes individually until alignment is achieved and then do 64-bits at a time 75 | size_t last_by_eight = l + ((r - l) / 8) * 8; 76 | unsigned long long* inArrayCurr = (unsigned long long*)(inArray + current); 77 | for (; current < last_by_eight; current += 8, inArrayCurr++) // Scan the array and count the number of times each digit value appears - i.e. 
size of each bin 78 | { 79 | unsigned long long eight_bytes = *inArrayCurr; 80 | countLeft[ eight_bytes & 0xff]++; 81 | countLeft[(eight_bytes >> 8) & 0xff]++; 82 | countLeft[(eight_bytes >> 16) & 0xff]++; 83 | countLeft[(eight_bytes >> 24) & 0xff]++; 84 | countLeft[(eight_bytes >> 32) & 0xff]++; 85 | countLeft[(eight_bytes >> 40) & 0xff]++; 86 | countLeft[(eight_bytes >> 48) & 0xff]++; 87 | countLeft[(eight_bytes >> 56) & 0xff]++; 88 | } 89 | for (; current < r; current++) // Scan the array and count the number of times each digit value appears - i.e. size of each bin 90 | countLeft[inArray[current]]++; 91 | 92 | return countLeft; 93 | } 94 | 95 | size_t m = r / 2 + l / 2 + (r % 2 + l % 2) / 2; // average without overflow 96 | 97 | #if defined(USE_PPL) 98 | Concurrency::parallel_invoke( 99 | #else 100 | tbb::parallel_invoke( 101 | #endif 102 | [&] { countLeft = HistogramOneByteComponentParallel_4 (inArray, l, m, parallelThreshold); }, 103 | [&] { countRight = HistogramOneByteComponentParallel_4 (inArray, m, r, parallelThreshold); } 104 | ); 105 | // Combine left and right results 106 | for (size_t j = 0; j < NumberOfBins; j++) 107 | countLeft[j] += countRight[j]; 108 | 109 | delete[] countRight; 110 | 111 | return countLeft; 112 | } 113 | 114 | 115 | // left (l) boundary is inclusive and right (r) boundary is exclusive 116 | template< unsigned NumberOfBins > 117 | inline size_t* HistogramOneByteComponentParallel_2(unsigned char inArray[], size_t l, size_t r, size_t parallelThreshold = 64 * 1024) 118 | { 119 | size_t* countLeft_0 = NULL; 120 | size_t* countLeft_1 = NULL; 121 | size_t* countLeft_2 = NULL; 122 | size_t* countLeft_3 = NULL; 123 | size_t* countRight = NULL; 124 | 125 | if (l >= r) // zero elements to compare 126 | { 127 | countLeft_0 = new size_t[NumberOfBins]{}; 128 | return countLeft_0; 129 | } 130 | if ((r - l) <= parallelThreshold) 131 | { 132 | countLeft_0 = new size_t[NumberOfBins]{}; 133 | countLeft_1 = new size_t[NumberOfBins]{}; 134 | 
countLeft_2 = new size_t[NumberOfBins]{}; 135 | countLeft_3 = new size_t[NumberOfBins]{}; 136 | 137 | size_t last_by_four = l + ((r - l) / 4) * 4; 138 | size_t current = l; 139 | for (; current < last_by_four;) // Scan the array and count the number of times each digit value appears - i.e. size of each bin 140 | { 141 | countLeft_0[inArray[current]]++; current++; 142 | countLeft_1[inArray[current]]++; current++; 143 | countLeft_2[inArray[current]]++; current++; 144 | countLeft_3[inArray[current]]++; current++; 145 | } 146 | for (; current < r; current++) // possibly last element 147 | countLeft_0[inArray[current]]++; 148 | 149 | // Combine the two count arrays into a single arrray to return 150 | for (size_t count_index = 0; count_index < NumberOfBins; count_index++) 151 | { 152 | countLeft_0[count_index] += countLeft_1[count_index]; 153 | countLeft_0[count_index] += countLeft_2[count_index]; 154 | countLeft_0[count_index] += countLeft_3[count_index]; 155 | } 156 | 157 | delete[] countLeft_3; 158 | delete[] countLeft_2; 159 | delete[] countLeft_1; 160 | return countLeft_0; 161 | } 162 | 163 | size_t m = r / 2 + l / 2 + (r % 2 + l % 2) / 2; // average without overflow 164 | 165 | #if defined(USE_PPL) 166 | Concurrency::parallel_invoke( 167 | #else 168 | tbb::parallel_invoke( 169 | #endif 170 | [&] { countLeft_0 = HistogramOneByteComponentParallel_2 (inArray, l, m, parallelThreshold); }, 171 | [&] { countRight = HistogramOneByteComponentParallel_2 (inArray, m, r, parallelThreshold); } 172 | ); 173 | // Combine left and right results 174 | for (size_t j = 0; j < NumberOfBins; j++) 175 | countLeft_0[j] += countRight[j]; 176 | 177 | delete[] countRight; 178 | 179 | return countLeft_0; 180 | } 181 | 182 | // left (l) boundary is inclusive and right (r) boundary is exclusive 183 | template< unsigned NumberOfBins > 184 | inline size_t* HistogramOneByteComponentParallel_3(unsigned char inArray[], size_t l, size_t r, size_t parallelThreshold = 64 * 1024) 185 | { 186 | 
size_t* countLeft_0 = NULL; 187 | size_t* countRight = NULL; 188 | 189 | if (l >= r) // zero elements to compare 190 | { 191 | countLeft_0 = new size_t[NumberOfBins]{}; 192 | return countLeft_0; 193 | } 194 | if ((r - l) <= parallelThreshold) 195 | { 196 | countLeft_0 = new size_t[NumberOfBins]{}; 197 | #if defined(WIN32) || defined(_WIN32) || defined(__WIN32) && !defined(__CYGWIN__) 198 | __declspec(align(64)) size_t countLeft_1[NumberOfBins] = { 0 }; 199 | __declspec(align(64)) size_t countLeft_2[NumberOfBins] = { 0 }; 200 | __declspec(align(64)) size_t countLeft_3[NumberOfBins] = { 0 }; 201 | #else 202 | size_t countLeft_1[NumberOfBins] __attribute__((aligned(64))) = { 0 }; 203 | size_t countLeft_2[NumberOfBins] __attribute__((aligned(64))) = { 0 }; 204 | size_t countLeft_3[NumberOfBins] __attribute__((aligned(64))) = { 0 }; 205 | #endif 206 | 207 | size_t last_by_four = l + ((r - l) / 4) * 4; 208 | size_t current = l; 209 | for (; current < last_by_four;) // Scan the array and count the number of times each digit value appears - i.e. 
size of each bin 210 | { 211 | countLeft_0[inArray[current]]++; current++; 212 | countLeft_1[inArray[current]]++; current++; 213 | countLeft_2[inArray[current]]++; current++; 214 | countLeft_3[inArray[current]]++; current++; 215 | } 216 | for (; current < r; current++) // possibly last element 217 | countLeft_0[inArray[current]]++; 218 | 219 | // Combine the two count arrays into a single arrray to return 220 | for (size_t count_index = 0; count_index < NumberOfBins; count_index++) 221 | { 222 | countLeft_0[count_index] += countLeft_1[count_index]; 223 | countLeft_0[count_index] += countLeft_2[count_index]; 224 | countLeft_0[count_index] += countLeft_3[count_index]; 225 | } 226 | 227 | return countLeft_0; 228 | } 229 | 230 | size_t m = r / 2 + l / 2 + (r % 2 + l % 2) / 2; // average without overflow 231 | 232 | #if defined(USE_PPL) 233 | Concurrency::parallel_invoke( 234 | #else 235 | tbb::parallel_invoke( 236 | #endif 237 | [&] { countLeft_0 = HistogramOneByteComponentParallel_3 (inArray, l, m, parallelThreshold); }, 238 | [&] { countRight = HistogramOneByteComponentParallel_3 (inArray, m, r, parallelThreshold); } 239 | ); 240 | // Combine left and right results 241 | for (size_t j = 0; j < NumberOfBins; j++) 242 | countLeft_0[j] += countRight[j]; 243 | 244 | delete[] countRight; 245 | 246 | return countLeft_0; 247 | } 248 | 249 | template< unsigned PowerOfTwoRadix, unsigned Log2ofPowerOfTwoRadix > 250 | inline size_t** HistogramByteComponentsParallel(unsigned inArray[], size_t l, size_t r, int parallelThreshold = 64 * 1024) 251 | { 252 | const unsigned numberOfDigits = Log2ofPowerOfTwoRadix; 253 | const unsigned NumberOfBins = PowerOfTwoRadix; 254 | 255 | size_t** countLeft = NULL; 256 | size_t** countRight = NULL; 257 | 258 | if (l > r) // zero elements to compare 259 | { 260 | countLeft = new size_t* [numberOfDigits]; 261 | 262 | for (unsigned i = 0; i < numberOfDigits; i++) 263 | { 264 | countLeft[i] = new size_t[NumberOfBins]; 265 | for (unsigned j = 0; j < 
NumberOfBins; j++) 266 | countLeft[i][j] = 0; 267 | } 268 | return countLeft; 269 | } 270 | if ((r - l + 1) <= parallelThreshold) 271 | { 272 | countLeft = new size_t* [numberOfDigits]; 273 | 274 | for (unsigned i = 0; i < numberOfDigits; i++) 275 | { 276 | countLeft[i] = new size_t[NumberOfBins]; 277 | for (unsigned j = 0; j < NumberOfBins; j++) 278 | countLeft[i][j] = 0; 279 | } 280 | // Faster version, since it doesn't use a 2-D array, reducing one level of indirection 281 | size_t* count0 = countLeft[0]; 282 | size_t* count1 = countLeft[1]; 283 | size_t* count2 = countLeft[2]; 284 | size_t* count3 = countLeft[3]; 285 | #if 1 286 | for (size_t current = l; current <= r; current++) // Scan the array and count the number of times each digit value appears - i.e. size of each bin 287 | { 288 | unsigned value = inArray[current]; 289 | count0[value & 0xff]++; 290 | count1[(value >> 8) & 0xff]++; 291 | count2[(value >> 16) & 0xff]++; 292 | count3[(value >> 24) & 0xff]++; 293 | } 294 | #else 295 | // Seems to be about the same performance as masking and shifting 296 | for (int current = l; current <= r; current++) // Scan the array and count the number of times each digit value appears - i.e. 
size of each bin 297 | { 298 | either value; 299 | value.dw = inArray[current]; 300 | count0[value.bytes.b[0]]++; 301 | count1[value.bytes.b[1]]++; 302 | count2[value.bytes.b[2]]++; 303 | count3[value.bytes.b[3]]++; 304 | } 305 | #endif 306 | return countLeft; 307 | } 308 | 309 | size_t m = r / 2 + l / 2 + (r % 2 + l % 2) / 2; // average without overflow 310 | 311 | #if defined(USE_PPL) 312 | Concurrency::parallel_invoke( 313 | #else 314 | tbb::parallel_invoke( 315 | #endif 316 | [&] { countLeft = HistogramByteComponentsParallel (inArray, l, m, parallelThreshold); }, 317 | [&] { countRight = HistogramByteComponentsParallel (inArray, m + 1, r, parallelThreshold); } 318 | ); 319 | // Combine left and right results 320 | for (unsigned i = 0; i < numberOfDigits; i++) 321 | for (unsigned j = 0; j < NumberOfBins; j++) 322 | countLeft[i][j] += countRight[i][j]; 323 | 324 | for (unsigned i = 0; i < numberOfDigits; i++) 325 | delete[] countRight[i]; 326 | delete[] countRight; 327 | 328 | return countLeft; 329 | } 330 | 331 | // Returns count[quanta][NumberOfBins] 332 | template< unsigned PowerOfTwoRadix, unsigned Log2ofPowerOfTwoRadix > 333 | inline size_t** HistogramByteComponentsAcrossWorkQuantasQC(unsigned inArray[], size_t l, size_t r, size_t workQuanta, size_t numberOfQuantas, unsigned whichByte) 334 | { 335 | const unsigned NumberOfBins = PowerOfTwoRadix; 336 | const unsigned mask = 0xff; 337 | int shiftRightAmount = (int)(8 * whichByte); 338 | //cout << "HistogramQC: l = " << l << " r = " << r << " workQuanta = " << workQuanta << " quanta = " << quanta << " whichByte = " << whichByte << endl; 339 | 340 | size_t** count = new size_t * [numberOfQuantas]; 341 | for (size_t i = 0; i < numberOfQuantas; i++) 342 | { 343 | count[i] = new size_t[NumberOfBins]; 344 | for (unsigned long j = 0; j < NumberOfBins; j++) 345 | count[i][j] = 0; 346 | } 347 | 348 | if (l > r) 349 | return count; 350 | 351 | size_t startQuanta = l / workQuanta; 352 | size_t endQuanta = r / workQuanta; 
353 | if (startQuanta == endQuanta) // counting within a single workQuanta, either partial or full 354 | { 355 | size_t q = startQuanta; 356 | for (size_t currIndex = l; currIndex <= r; currIndex++) 357 | { 358 | unsigned inByte = (inArray[currIndex] >> shiftRightAmount) & mask; 359 | count[q][inByte]++; 360 | } 361 | } 362 | else 363 | { 364 | size_t currIndex, endIndex; 365 | 366 | // process startQuanta, which is either partial or full 367 | size_t q = startQuanta; 368 | endIndex = startQuanta * workQuanta + (workQuanta - 1); 369 | for (currIndex = l; currIndex <= endIndex; currIndex++) 370 | { 371 | unsigned inByte = (inArray[currIndex] >> shiftRightAmount) & mask; 372 | count[q][inByte]++; 373 | } 374 | 375 | // process endQuanta, which is either partial or full 376 | q = endQuanta; 377 | for (currIndex = endQuanta * workQuanta; currIndex <= r; currIndex++) 378 | { 379 | unsigned inByte = (inArray[currIndex] >> shiftRightAmount) & mask; 380 | count[q][inByte]++; 381 | } 382 | 383 | // process full workQuantas > startQuanta and < endQuanta 384 | currIndex = (startQuanta + 1) * workQuanta; 385 | endQuanta--; 386 | for (q = startQuanta + 1; q <= endQuanta; q++) 387 | { 388 | for (size_t j = 0; j < workQuanta; j++) 389 | { 390 | unsigned inByte = (inArray[currIndex++] >> shiftRightAmount) & mask; 391 | count[q][inByte]++; 392 | } 393 | } 394 | } 395 | 396 | return count; 397 | } 398 | 399 | template< unsigned PowerOfTwoRadix, unsigned Log2ofPowerOfTwoRadix > 400 | inline size_t** HistogramByteComponentsQCParInner(unsigned inArray[], size_t l, size_t r, size_t workQuanta, size_t numberOfQuantas, unsigned whichByte, size_t parallelThreshold = 16 * 1024) 401 | { 402 | const unsigned NumberOfBins = PowerOfTwoRadix; 403 | size_t** countLeft = NULL; 404 | size_t** countRight = NULL; 405 | 406 | if (l > r) // zero elements to compare 407 | { 408 | size_t** countLeft = new size_t * [numberOfQuantas]; 409 | for (size_t i = 0; i < numberOfQuantas; i++) 410 | { 411 | 
countLeft[i] = new size_t[NumberOfBins]; 412 | for (unsigned j = 0; j < NumberOfBins; j++) 413 | countLeft[i][j] = 0; 414 | } 415 | return countLeft; 416 | } 417 | if ((r - l + 1) <= parallelThreshold) 418 | return HistogramByteComponentsAcrossWorkQuantasQC(inArray, l, r, workQuanta, numberOfQuantas, whichByte); 419 | 420 | size_t m = ((r + l) / 2); 421 | 422 | #if defined(USE_PPL) 423 | Concurrency::parallel_invoke( 424 | #else 425 | tbb::parallel_invoke( 426 | #endif 427 | [&] { countLeft = HistogramByteComponentsQCParInner (inArray, l, m, workQuanta, numberOfQuantas, whichByte, parallelThreshold); }, 428 | [&] { countRight = HistogramByteComponentsQCParInner (inArray, m + 1, r, workQuanta, numberOfQuantas, whichByte, parallelThreshold); } 429 | ); 430 | // Combine left and right results (reduce step), only for workQuantas for which the counts were computed 431 | size_t startQuanta = l / workQuanta; 432 | size_t endQuanta = r / workQuanta; 433 | for (size_t i = startQuanta; i <= endQuanta; i++) 434 | for (unsigned long j = 0; j < NumberOfBins; j++) 435 | countLeft[i][j] += countRight[i][j]; 436 | 437 | for (size_t i = 0; i < numberOfQuantas; i++) 438 | delete[] countRight[i]; 439 | delete[] countRight; 440 | 441 | return countLeft; 442 | } 443 | 444 | template< unsigned PowerOfTwoRadix, unsigned Log2ofPowerOfTwoRadix > 445 | inline size_t** HistogramByteComponentsQCPar(unsigned* inArray, size_t l, size_t r, size_t workQuanta, size_t numberOfQuantas, unsigned whichByte, size_t parallelThreshold = 16 * 1024) 446 | { 447 | //may return 0 when not able to detect 448 | auto processor_count = std::thread::hardware_concurrency(); 449 | if (processor_count < 1) 450 | { 451 | processor_count = 1; 452 | //cout << "Warning: Fewer than 1 processor core detected. 
Using only a single core."; 453 | } 454 | 455 | size_t length = r - l + 1; 456 | 457 | if ((parallelThreshold * processor_count) < length) 458 | parallelThreshold = length / processor_count; 459 | #if 1 460 | return HistogramByteComponentsQCParInner(inArray, l, r, workQuanta, numberOfQuantas, whichByte, parallelThreshold); 461 | #else 462 | return HistogramByteComponentsAcrossWorkQuantasQC(inArray, l, r, workQuanta, quanta, whichByte); 463 | #endif 464 | } 465 | 466 | // This version did not seem to speed up over the single count array version. It proves that Histogram is not the bottleneck. 467 | template< unsigned long PowerOfTwoRadix, unsigned long Log2ofPowerOfTwoRadix > 468 | inline size_t* HistogramOneByteComponentParallel_2(unsigned inArray[], size_t l, size_t r, unsigned long shiftRight, size_t parallelThreshold = 64 * 1024) 469 | { 470 | const unsigned long NumberOfBins = PowerOfTwoRadix; 471 | 472 | size_t* countLeft_0 = NULL; 473 | size_t* countLeft_1 = NULL; 474 | size_t* countLeft_2 = NULL; 475 | size_t* countLeft_3 = NULL; 476 | size_t* countRight = NULL; 477 | 478 | if (l > r) // zero elements to compare 479 | { 480 | countLeft_0 = new size_t[NumberOfBins]{}; 481 | return countLeft_0; 482 | } 483 | if ((r - l + 1) <= parallelThreshold) 484 | { 485 | countLeft_0 = new size_t[NumberOfBins]{}; 486 | countLeft_1 = new size_t[NumberOfBins]{}; 487 | countLeft_2 = new size_t[NumberOfBins]{}; 488 | countLeft_3 = new size_t[NumberOfBins]{}; 489 | 490 | size_t last_by_four = l + ((r - l) / 4) * 4; 491 | size_t current = l; 492 | for (; current < last_by_four;) // Scan the array and count the number of times each digit value appears - i.e. 
size of each bin 493 | { 494 | countLeft_0[(inArray[current] >> shiftRight) & 0xff]++; current++; 495 | countLeft_1[(inArray[current] >> shiftRight) & 0xff]++; current++; 496 | countLeft_2[(inArray[current] >> shiftRight) & 0xff]++; current++; 497 | countLeft_3[(inArray[current] >> shiftRight) & 0xff]++; current++; 498 | } 499 | for (; current <= r; current++) // remaining 1-4 elements; r is INCLUSIVE in this overload (bug fix: was 'current < r', which always skipped the element at index r) 500 | countLeft_0[(inArray[current] >> shiftRight) & 0xff]++; 501 | 502 | // Combine the four count arrays into a single array to return 503 | for (size_t count_index = 0; count_index < NumberOfBins; count_index++) 504 | { 505 | countLeft_0[count_index] += countLeft_1[count_index]; 506 | countLeft_0[count_index] += countLeft_2[count_index]; 507 | countLeft_0[count_index] += countLeft_3[count_index]; 508 | } 509 | 510 | delete[] countLeft_3; 511 | delete[] countLeft_2; 512 | delete[] countLeft_1; 513 | return countLeft_0; 514 | } 515 | 516 | size_t m = r / 2 + l / 2 + (r % 2 + l % 2) / 2; 517 | 518 | #if defined(USE_PPL) 519 | Concurrency::parallel_invoke( 520 | #else 521 | tbb::parallel_invoke( 522 | #endif 523 | [&] { countLeft_0 = HistogramOneByteComponentParallel_2 (inArray, l, m, shiftRight, parallelThreshold); }, 524 | [&] { countRight = HistogramOneByteComponentParallel_2 (inArray, m + 1, r, shiftRight, parallelThreshold); } 525 | ); 526 | // Combine left and right results 527 | for (size_t j = 0; j < NumberOfBins; j++) 528 | countLeft_0[j] += countRight[j]; 529 | 530 | delete[] countRight; 531 | 532 | return countLeft_0; 533 | } 534 | 535 | template< unsigned long PowerOfTwoRadix, unsigned long Log2ofPowerOfTwoRadix > 536 | inline size_t* HistogramOneByteComponentParallel(unsigned inArray[], size_t l, size_t r, unsigned long shiftRight, size_t parallelThreshold = 64 * 1024) 537 | { 538 | const size_t NumberOfBins = PowerOfTwoRadix; 539 | const unsigned mask = NumberOfBins - 1; 540 | 541 | size_t* countLeft = NULL; 542 | size_t* countRight = NULL; 543 | 544 | if (l > r) // zero
elements to compare 545 | { 546 | countLeft = new size_t[NumberOfBins]{}; 547 | return countLeft; 548 | } 549 | if ((r - l + 1) <= parallelThreshold) 550 | { 551 | countLeft = new size_t[NumberOfBins]{}; 552 | 553 | for (size_t current = l; current <= r; current++) // Scan the array and count the number of times each digit value appears - i.e. size of each bin 554 | countLeft[(inArray[current] >> shiftRight) & mask]++; 555 | 556 | return countLeft; 557 | } 558 | 559 | size_t m = r / 2 + l / 2 + (r % 2 + l % 2) / 2; 560 | 561 | #if defined(USE_PPL) 562 | Concurrency::parallel_invoke( 563 | #else 564 | tbb::parallel_invoke( 565 | #endif 566 | [&] { countLeft = HistogramOneByteComponentParallel (inArray, l, m, shiftRight, parallelThreshold); }, 567 | [&] { countRight = HistogramOneByteComponentParallel (inArray, m + 1, r, shiftRight, parallelThreshold); } 568 | ); 569 | // Combine left and right results 570 | for (size_t j = 0; j < NumberOfBins; j++) 571 | countLeft[j] += countRight[j]; 572 | 573 | delete[] countRight; 574 | 575 | return countLeft; 576 | } 577 | 578 | } -------------------------------------------------------------------------------- /InsertionSort.h: -------------------------------------------------------------------------------- 1 | // Insertion Sort implementations 2 | 3 | #ifndef _InsertionSort_h 4 | #define _InsertionSort_h 5 | 6 | template< class _Type > 7 | inline void insertionSortSimilarToSTLnoSelfAssignment( _Type* a, size_t a_size ) 8 | { 9 | for ( size_t i = 1; i < a_size; i++ ) 10 | { 11 | if ( a[ i ] < a[ i - 1 ] ) // no need to do (j > 0) compare for the first iteration 12 | { 13 | _Type currentElement = a[ i ]; 14 | a[ i ] = a[ i - 1 ]; 15 | size_t j; 16 | for ( j = i - 1; j > 0 && currentElement < a[ j - 1 ]; j-- ) 17 | { 18 | a[ j ] = a[ j - 1 ]; 19 | } 20 | a[ j ] = currentElement; // always necessary work/write 21 | } 22 | // Perform no work at all if the first comparison fails - i.e. never assign an element to itself! 
23 | } 24 | } 25 | 26 | #endif 27 | -------------------------------------------------------------------------------- /MIT-LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2011-2018 Victor J. Duvanenko 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining 4 | a copy of this software and associated documentation files (the 5 | "Software"), to deal in the Software without restriction, including 6 | without limitation the rights to use, copy, modify, merge, publish, 7 | distribute, sublicense, and/or sell copies of the Software, and to 8 | permit persons to whom the Software is furnished to do so, subject to 9 | the following conditions: 10 | 11 | The above copyright notice and this permission notice shall be 12 | included in all copies or substantial portions of the Software. 13 | 14 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 15 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 16 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 17 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 18 | LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 19 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 20 | WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
21 | -------------------------------------------------------------------------------- /MemoryUsage.cpp: -------------------------------------------------------------------------------- 1 | // from https://stackoverflow.com/questions/63166/how-to-determine-cpu-and-memory-consumption-from-inside-a-process 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | using std::chrono::duration; 9 | using std::chrono::duration_cast; 10 | using std::chrono::high_resolution_clock; 11 | using std::milli; 12 | 13 | #if defined(WIN32) || defined(_WIN32) || defined(__WIN32) && !defined(__CYGWIN__) 14 | #include "windows.h" 15 | #else 16 | #include "sys/types.h" 17 | #include "sys/sysinfo.h" 18 | #endif 19 | 20 | unsigned long long physical_memory_used_in_megabytes() 21 | { 22 | #if defined(WIN32) || defined(_WIN32) || defined(__WIN32) && !defined(__CYGWIN__) 23 | MEMORYSTATUSEX memInfo; 24 | memInfo.dwLength = sizeof(MEMORYSTATUSEX); 25 | GlobalMemoryStatusEx(&memInfo); 26 | //DWORDLONG totalVirtualMem = memInfo.ullTotalPageFile; 27 | //DWORDLONG virtualMemUsed = memInfo.ullTotalPageFile - memInfo.ullAvailPageFile; 28 | //DWORDLONG totalPhysMem = memInfo.ullTotalPhys; 29 | DWORDLONG physMemUsed = memInfo.ullTotalPhys - memInfo.ullAvailPhys; // physical memory used by the system 30 | //SIZE_T physMemUsedByMe = pmc.WorkingSetSize; // by current process 31 | return (physMemUsed / (1024ULL * 1024)); 32 | #else 33 | struct sysinfo memInfo; 34 | 35 | sysinfo(&memInfo); 36 | long long totalVirtualMem = memInfo.totalram; 37 | //Add other values in next statement to avoid int overflow on right hand side... 38 | totalVirtualMem += memInfo.totalswap; 39 | totalVirtualMem *= memInfo.mem_unit; 40 | long long virtualMemUsed = memInfo.totalram - memInfo.freeram; 41 | //Add other values in next statement to avoid int overflow on right hand side... 
42 | virtualMemUsed += memInfo.totalswap - memInfo.freeswap; 43 | virtualMemUsed *= memInfo.mem_unit; 44 | long long totalPhysMem = memInfo.totalram; 45 | //Multiply in next statement to avoid int overflow on right hand side... 46 | totalPhysMem *= memInfo.mem_unit; 47 | long long physMemUsed = memInfo.totalram - memInfo.freeram; 48 | //Multiply in next statement to avoid int overflow on right hand side... 49 | physMemUsed *= memInfo.mem_unit; // total physical memory used by the whole system 50 | return (physMemUsed / (1024ULL * 1024)); 51 | #endif 52 | } 53 | 54 | unsigned long long physical_memory_total_in_megabytes() 55 | { 56 | #if defined(WIN32) || defined(_WIN32) || defined(__WIN32) && !defined(__CYGWIN__) 57 | MEMORYSTATUSEX memInfo; 58 | memInfo.dwLength = sizeof(MEMORYSTATUSEX); 59 | GlobalMemoryStatusEx(&memInfo); 60 | //DWORDLONG totalVirtualMem = memInfo.ullTotalPageFile; 61 | //DWORDLONG virtualMemUsed = memInfo.ullTotalPageFile - memInfo.ullAvailPageFile; 62 | DWORDLONG totalPhysMem = memInfo.ullTotalPhys; 63 | //DWORDLONG physMemUsed = memInfo.ullTotalPhys - memInfo.ullAvailPhys; // physical memory used by the system 64 | //SIZE_T physMemUsedByMe = pmc.WorkingSetSize; // by current process 65 | return (totalPhysMem / (1024ULL * 1024)); 66 | #else 67 | struct sysinfo memInfo; 68 | 69 | sysinfo(&memInfo); 70 | long long totalVirtualMem = memInfo.totalram; 71 | //Add other values in next statement to avoid int overflow on right hand side... 72 | totalVirtualMem += memInfo.totalswap; 73 | totalVirtualMem *= memInfo.mem_unit; 74 | long long virtualMemUsed = memInfo.totalram - memInfo.freeram; 75 | //Add other values in next statement to avoid int overflow on right hand side... 76 | virtualMemUsed += memInfo.totalswap - memInfo.freeswap; 77 | virtualMemUsed *= memInfo.mem_unit; 78 | long long totalPhysMem = memInfo.totalram; 79 | //Multiply in next statement to avoid int overflow on right hand side... 
80 | totalPhysMem *= memInfo.mem_unit; 81 | long long physMemUsed = memInfo.totalram - memInfo.freeram; 82 | //Multiply in next statement to avoid int overflow on right hand side... 83 | physMemUsed *= memInfo.mem_unit; // total physical memory used by the whole system 84 | return (totalPhysMem / (1024ULL * 1024)); 85 | #endif 86 | } 87 | 88 | // Test memory allocation 89 | int TestMemoryAllocation() 90 | { 91 | #if defined(WIN32) || defined(_WIN32) || defined(__WIN32) && !defined(__CYGWIN__) 92 | const size_t NUM_TIMES = 1000; 93 | const size_t SIZE_OF_ARRAY = 100'000'000; 94 | unsigned char* array_of_pointers[NUM_TIMES]{}; 95 | size_t sum = 0; 96 | 97 | for (size_t i = 0; i < NUM_TIMES; ++i) 98 | { 99 | array_of_pointers[i] = new unsigned char[SIZE_OF_ARRAY]; 100 | for (size_t j = 0; j < SIZE_OF_ARRAY; ++j) 101 | array_of_pointers[i][j] = (unsigned char)j; 102 | for (size_t j = 0; j < SIZE_OF_ARRAY; ++j) 103 | sum += array_of_pointers[i][j]; 104 | printf("Allocated array: %zu sum = %zu\n", i, sum); 105 | std::this_thread::sleep_for(std::chrono::milliseconds(300)); 106 | } 107 | printf("Final sum = %zu\n", sum); 108 | 109 | for (size_t i = 0; i < NUM_TIMES; ++i) 110 | { 111 | delete[] array_of_pointers[i]; 112 | } 113 | #endif 114 | return 0; 115 | } 116 | 117 | void print_current_memory_space() 118 | { 119 | printf("physical memory used = %llu physical memory total = %llu\n", 120 | physical_memory_used_in_megabytes(), physical_memory_total_in_megabytes()); 121 | } 122 | 123 | void test_lazy_memory_allocation() 124 | { 125 | const size_t SIZE_OF_ARRAY = 10'000'000'000; 126 | //size_t sum = 0; 127 | 128 | print_current_memory_space(); 129 | 130 | unsigned char* my_array = new unsigned char[SIZE_OF_ARRAY]; 131 | 132 | print_current_memory_space(); 133 | 134 | for (size_t i = 0; i < (SIZE_OF_ARRAY / 2); ++i) 135 | my_array[i] = (unsigned char)i; 136 | 137 | print_current_memory_space(); 138 | 139 | for (size_t i = (SIZE_OF_ARRAY / 2); i < SIZE_OF_ARRAY; ++i) 140 | 
my_array[i] = (unsigned char)i; 141 | 142 | print_current_memory_space(); 143 | 144 | delete[] my_array; 145 | 146 | print_current_memory_space(); 147 | } 148 | -------------------------------------------------------------------------------- /ParallelAlgorithms.cpp: -------------------------------------------------------------------------------- 1 | // ParallelAlgorithms main application entry point 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | using std::random_device; 9 | using std::vector; 10 | 11 | extern int ParallelStdCppExample( vector& doubles); 12 | extern int ParallelStdCppExample( vector& uints, bool stable = false); 13 | extern int ParallelStdCppExample( vector& uints); 14 | extern int RadixSortLsdBenchmark( vector& uints); 15 | extern int ParallelMergeSortBenchmark( vector& doubles); 16 | extern int ParallelMergeSortBenchmark( vector& uints, const size_t& testSize); 17 | extern int ParallelInPlaceMergeSortBenchmark(vector& uints); 18 | extern int ParallelMergeSortBenchmark( vector& uints); 19 | extern int main_quicksort(); 20 | extern int ParallelMergeBenchmark(); 21 | extern int ParallelRadixSortLsdBenchmark( vector& uints); 22 | extern int RadixSortMsdBenchmark( vector& uints); 23 | extern void TestAverageOfTwoIntegers(); 24 | extern int CountingSortBenchmark( vector& uints); 25 | extern int SumBenchmark( vector& uints); 26 | extern int SumBenchmarkChar( vector& uints); 27 | extern int SumBenchmark64( vector& uints); 28 | extern int TestMemoryAllocation(); 29 | extern int std_parallel_sort_leak_demo(); 30 | extern int bundling_small_work_items_benchmark(size_t, size_t, size_t); 31 | 32 | int main() 33 | { 34 | // Test configuration options 35 | bool UseStableStdSort = false; 36 | 37 | // Test cases for averaging of two integers 38 | //TestAverageOfTwoIntegers(); 39 | 40 | // Benchmark QuickSort 41 | //main_quicksort(); 42 | 43 | #if 0 44 | // Demonstrate Parallel Merge 45 | 46 | const unsigned long A_NumElements = 8; 47 | const 
unsigned long B_NumElements = 6; 48 | unsigned long a_array[A_NumElements + B_NumElements] = { 2, 3, 3, 5, 15, 17, 20, 22, 0, 0, 6, 9, 16, 17 }; // first array has 8 elements, second array has 6 49 | const unsigned long C_NumElements = A_NumElements + B_NumElements; 50 | unsigned long c_array[C_NumElements]; 51 | 52 | //merge_parallel_L5(a_array, 0, A_NumElements - 1, A_NumElements, C_NumElements - 1, c_array, 0); 53 | 54 | std::cout << std::endl << "merged array: "; 55 | for (unsigned long i = 0; i < C_NumElements; i++) 56 | std::cout << c_array[i] << " "; 57 | std::cout << std::endl; 58 | 59 | // Demonstrate Parallel Merge Sort 60 | 61 | const size_t NumElements = 8; 62 | //unsigned long unsorted_array[NumElements] = { 10, 3, 5, 2, 4, 11, 0, 3 }; 63 | unsigned long sorted_array[NumElements]; 64 | 65 | //parallel_merge_sort_simplest(unsorted_array, 0, NumElements - 1, sorted_array); // simplest, but slowest 66 | //ParallelAlgorithms::parallel_merge_sort_hybrid_rh_1(unsorted_array, 0, NumElements - 1, sorted_array); // fastest 67 | 68 | std::cout << "sorted array: "; 69 | for (size_t i = 0; i < NumElements; i++) 70 | { 71 | std::cout << sorted_array[i] << " "; 72 | //std::cout << unsorted_array[i] << " "; 73 | } 74 | std::cout << std::endl << std::endl; 75 | #endif 76 | 77 | // Provide the same input random array of doubles to all sorting algorithms 78 | const size_t testSize = 1'000'000'000; 79 | //random_device rd; 80 | std::mt19937_64 dist(1234); 81 | 82 | //const auto processor_count = std::thread::hardware_concurrency(); 83 | //printf("Number of cores = %u \n", processor_count); 84 | 85 | #if 0 86 | // generate some random doubles: 87 | printf("Testing with %zu random doubles...\n", testSize); 88 | vector doubles(testSize); 89 | for (auto& d : doubles) { 90 | d = static_cast(rd()); 91 | } 92 | // Example of C++17 Standard C++ Parallel Sorting 93 | //ParallelStdCppExample(doubles); 94 | 95 | // Benchmark the above Parallel Merge Sort algorithm 96 | 
ParallelMergeSortBenchmark(doubles); 97 | #endif 98 | // generate some random unsigned integers: 99 | printf("\nTesting with %zu random unsigned integers...\n\n", testSize); 100 | vector uints(testSize); 101 | for (auto& d : uints) { 102 | //d = static_cast(rd()); 103 | d = static_cast(dist()); // way faster on Linux 104 | } 105 | // Example of C++17 Standard C++ Parallel Sorting 106 | //ParallelStdCppExample(uints, UseStableStdSort); 107 | 108 | bundling_small_work_items_benchmark(20, 10000, 1000); 109 | 110 | // std_parallel_sort_leak_demo(); 111 | // return 0; 112 | 113 | // RadixSortMsdBenchmark(uints); 114 | 115 | //CountingSortBenchmark(uints); // sorts uchar's and not ulongs 116 | 117 | //SumBenchmarkChar(uints); 118 | SumBenchmark( uints); 119 | //SumBenchmark64( uints); 120 | 121 | return 0; 122 | 123 | // Benchmark the above Parallel Merge Sort algorithm 124 | ParallelMergeSortBenchmark(uints, testSize); 125 | 126 | ParallelInPlaceMergeSortBenchmark(uints); 127 | 128 | ParallelRadixSortLsdBenchmark(uints); 129 | 130 | ParallelMergeBenchmark(); 131 | 132 | RadixSortLsdBenchmark(uints); 133 | 134 | // generate some nearly pre-sorted unsigned integers: 135 | printf("\nTesting with %zu nearly pre-sorted unsigned integers...\n\n", testSize); 136 | //vector uints(testSize); 137 | for (size_t i = 0; i < uints.size(); i++) { 138 | if ((i % 100) == 0) 139 | { 140 | //uints[i] = static_cast(rd()); 141 | uints[i] = static_cast(dist()); // way faster on Linux 142 | } 143 | else 144 | uints[i] = static_cast(i); 145 | } 146 | // Example of C++17 Standard C++ Parallel Sorting 147 | ParallelStdCppExample(uints, UseStableStdSort); 148 | 149 | RadixSortMsdBenchmark(uints); 150 | 151 | //CountingSortBenchmark(uints); // sorts uchar's and not ulongs 152 | 153 | SumBenchmarkChar(uints); 154 | 155 | SumBenchmark(uints); 156 | 157 | // Benchmark the above Parallel Merge Sort algorithm 158 | ParallelMergeSortBenchmark(uints, testSize); 159 | 160 | 
ParallelInPlaceMergeSortBenchmark(uints); 161 | 162 | ParallelRadixSortLsdBenchmark(uints); 163 | 164 | RadixSortLsdBenchmark(uints); 165 | 166 | printf("\nTesting with %zu constant unsigned integers...\n\n", testSize); 167 | for (size_t i = 0; i < uints.size(); i++) { 168 | uints[i] = 10; 169 | } 170 | 171 | // Example of C++17 Standard C++ Parallel Sorting 172 | ParallelStdCppExample(uints, UseStableStdSort); 173 | 174 | RadixSortMsdBenchmark(uints); 175 | 176 | //CountingSortBenchmark(uints); // sorts uchar's and not ulongs 177 | 178 | SumBenchmarkChar(uints); 179 | 180 | SumBenchmark(uints); 181 | 182 | // Benchmark the above Parallel Merge Sort algorithm 183 | ParallelMergeSortBenchmark(uints, testSize); 184 | 185 | ParallelInPlaceMergeSortBenchmark(uints); 186 | 187 | ParallelRadixSortLsdBenchmark(uints); 188 | 189 | RadixSortLsdBenchmark(uints); 190 | 191 | #if 0 192 | // generate some random unsigned longs: 193 | printf("\nTesting with %zu random unsigned longs longs...\n\n", testSize); 194 | vector ulonglongs(testSize); 195 | for (auto& d : ulonglongs) { 196 | //d = static_cast(rd()); 197 | d = static_cast(dist()); // way faster on Linux 198 | } 199 | printf("Finished initializing unsigned long long array\n"); 200 | 201 | #if 1 202 | //RadixSortMsdBenchmark(ulongs); 203 | 204 | //CountingSortBenchmark(ulongs); // sorts uchar's and not ulongs 205 | 206 | //SumBenchmarkChar(ulongs); 207 | //SumBenchmark(ulongs); 208 | 209 | // Example of C++17 Standard C++ Parallel Sorting 210 | //ParallelStdCppExample(ulongs, UseStableStdSort); 211 | 212 | // Benchmark the above Parallel Merge Sort algorithm 213 | //ParallelMergeSortBenchmark(ulongs, testSize); 214 | 215 | #endif 216 | // Benchmark Parallel InPlace Merge Sort algorithm 217 | //ParallelInPlaceMergeSortBenchmark(ulongs); 218 | 219 | // Benchmark Radix Sort LSD algorithm 220 | //RadixSortLsdBenchmark(ulongs); 221 | 222 | // Benchmark Radix Sort LSD algorithm 223 | //ParallelRadixSortLsdBenchmark(ulongs); 224 | 
225 | printf("\nTesting with %zu nearly pre-sorted unsigned long longs...\n\n", testSize); 226 | for (size_t i = 0; i < ulonglongs.size(); i++) { 227 | if ((i % 100) == 0) 228 | { 229 | //ulongs[i] = static_cast(rd()); 230 | ulonglongs[i] = static_cast(dist()); // way faster on Linux 231 | } 232 | else 233 | ulonglongs[i] = static_cast(i); 234 | } 235 | 236 | //CountingSortBenchmark(ulongs); // sorts uchar's and not ulongs 237 | 238 | //SumBenchmarkChar(ulongs); 239 | //SumBenchmark(ulongs); 240 | 241 | //RadixSortMsdBenchmark(ulongs); 242 | 243 | // Example of C++17 Standard C++ Parallel Sorting 244 | ParallelStdCppExample(ulongs, UseStableStdSort); 245 | 246 | // Benchmark the above Parallel Merge Sort algorithm 247 | //ParallelMergeSortBenchmark(ulongs, testSize); 248 | 249 | // Benchmark Parallel InPlace Merge Sort algorithm 250 | //ParallelInPlaceMergeSortBenchmark(ulongs); 251 | 252 | // Benchmark the above Parallel Merge Sort algorithm 253 | //RadixSortLsdBenchmark(ulongs); 254 | 255 | // Benchmark Radix Sort LSD algorithm 256 | //ParallelRadixSortLsdBenchmark(ulongs); 257 | 258 | printf("\nTesting with %zu constant unsigned long longs...\n\n", testSize); 259 | for (size_t i = 0; i < ulonglongs.size(); i++) { 260 | ulonglongs[i] = 10; 261 | } 262 | 263 | //CountingSortBenchmark(ulongs); // sorts uchar's and not ulongs 264 | 265 | //SumBenchmarkChar(ulongs); 266 | //SumBenchmark(ulongs); 267 | 268 | //RadixSortMsdBenchmark(ulongs); 269 | 270 | // Example of C++17 Standard C++ Parallel Sorting 271 | ParallelStdCppExample(ulongs, UseStableStdSort); 272 | 273 | // Benchmark the above Parallel Merge Sort algorithm 274 | //ParallelMergeSortBenchmark(ulongs, testSize); 275 | 276 | // Benchmark Parallel InPlace Merge Sort algorithm 277 | //ParallelInPlaceMergeSortBenchmark(ulongs); 278 | 279 | // Benchmark the above Parallel Merge Sort algorithm 280 | //RadixSortLsdBenchmark(ulongs); 281 | 282 | // Benchmark Radix Sort LSD algorithm 283 | 
//ParallelRadixSortLsdBenchmark(ulongs); 284 | #endif 285 | 286 | return 0; 287 | } 288 | -------------------------------------------------------------------------------- /ParallelAlgorithms.sln: -------------------------------------------------------------------------------- 1 |  2 | Microsoft Visual Studio Solution File, Format Version 12.00 3 | # Visual Studio Version 16 4 | VisualStudioVersion = 16.0.31205.134 5 | MinimumVisualStudioVersion = 10.0.40219.1 6 | Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "ParallelAlgorithms", "ParallelAlgorithms.vcxproj", "{E496E25A-09FF-4BC8-A1D9-7271D4B706BF}" 7 | EndProject 8 | Global 9 | GlobalSection(SolutionConfigurationPlatforms) = preSolution 10 | Debug|x64 = Debug|x64 11 | Debug|x86 = Debug|x86 12 | Release|x64 = Release|x64 13 | Release|x86 = Release|x86 14 | EndGlobalSection 15 | GlobalSection(ProjectConfigurationPlatforms) = postSolution 16 | {E496E25A-09FF-4BC8-A1D9-7271D4B706BF}.Debug|x64.ActiveCfg = Debug|x64 17 | {E496E25A-09FF-4BC8-A1D9-7271D4B706BF}.Debug|x64.Build.0 = Debug|x64 18 | {E496E25A-09FF-4BC8-A1D9-7271D4B706BF}.Debug|x86.ActiveCfg = Debug|Win32 19 | {E496E25A-09FF-4BC8-A1D9-7271D4B706BF}.Debug|x86.Build.0 = Debug|Win32 20 | {E496E25A-09FF-4BC8-A1D9-7271D4B706BF}.Release|x64.ActiveCfg = Release|x64 21 | {E496E25A-09FF-4BC8-A1D9-7271D4B706BF}.Release|x64.Build.0 = Release|x64 22 | {E496E25A-09FF-4BC8-A1D9-7271D4B706BF}.Release|x86.ActiveCfg = Release|Win32 23 | {E496E25A-09FF-4BC8-A1D9-7271D4B706BF}.Release|x86.Build.0 = Release|Win32 24 | EndGlobalSection 25 | GlobalSection(SolutionProperties) = preSolution 26 | HideSolutionNode = FALSE 27 | EndGlobalSection 28 | GlobalSection(ExtensibilityGlobals) = postSolution 29 | SolutionGuid = {DDE00453-8BEE-4362-83FF-3C06ABA33B22} 30 | EndGlobalSection 31 | EndGlobal 32 | -------------------------------------------------------------------------------- /ParallelAlgorithms.vcxproj: 
-------------------------------------------------------------------------------- 1 |  2 | 3 | 4 | 5 | Debug 6 | Win32 7 | 8 | 9 | Release 10 | Win32 11 | 12 | 13 | Debug 14 | x64 15 | 16 | 17 | Release 18 | x64 19 | 20 | 21 | 22 | {E496E25A-09FF-4BC8-A1D9-7271D4B706BF} 23 | Win32Proj 24 | ParallelAlgorithms 25 | 10.0 26 | 27 | 28 | 29 | Application 30 | true 31 | v143 32 | Unicode 33 | 34 | 35 | Application 36 | false 37 | v143 38 | true 39 | Unicode 40 | 41 | 42 | Application 43 | true 44 | v143 45 | Unicode 46 | true 47 | 48 | 49 | Application 50 | false 51 | v143 52 | true 53 | Unicode 54 | true 55 | true 56 | Static_Library 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | true 78 | 79 | 80 | true 81 | 82 | 83 | false 84 | 85 | 86 | false 87 | C:\Program Files (x86)\Intel\oneAPI\dpl\latest\windows\include;C:\Program Files (x86)\Intel\oneAPI\tbb\latest\include;$(IncludePath) 88 | 89 | 90 | 91 | Use 92 | Level3 93 | Disabled 94 | WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) 95 | true 96 | 97 | 98 | Console 99 | true 100 | 101 | 102 | 103 | 104 | NotUsing 105 | Level4 106 | Disabled 107 | _DEBUG;_CONSOLE;%(PreprocessorDefinitions) 108 | true 109 | true 110 | stdcpp17 111 | 112 | 113 | Console 114 | true 115 | 116 | 117 | 118 | 119 | Level3 120 | Use 121 | MaxSpeed 122 | true 123 | true 124 | WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) 125 | true 126 | 127 | 128 | Console 129 | true 130 | true 131 | true 132 | 133 | 134 | 135 | 136 | Level3 137 | NotUsing 138 | Full 139 | true 140 | true 141 | NDEBUG;_CONSOLE;%(PreprocessorDefinitions) 142 | true 143 | false 144 | 145 | 146 | Speed 147 | AdvancedVectorExtensions2 148 | GenerateParallelCode 149 | MultiThreadedDLL 150 | None 151 | Cpp20Support 152 | Default 153 | None 154 | stdcpp17 155 | 156 | 157 | Console 158 | true 159 | true 160 | true 161 | 16777216 162 | 16777216 163 | C:\Program Files 
(x86)\Intel\oneAPI\tbb\latest\lib\intel64\vc14;%(AdditionalLibraryDirectories) 164 | 165 | 166 | 167 | 168 | 169 | 170 | 171 | 172 | 173 | 174 | 175 | 176 | 177 | 178 | 179 | 180 | 181 | 182 | 183 | 184 | 185 | 186 | 187 | 188 | 189 | 190 | 191 | 192 | /std:c++latest %(AdditionalOptions) 193 | /std:c++latest %(AdditionalOptions) 194 | 195 | 196 | 197 | 198 | 199 | 200 | 201 | 202 | 203 | 204 | 205 | 206 | 207 | 208 | 209 | -------------------------------------------------------------------------------- /ParallelMergeSortBenchmark.cpp: -------------------------------------------------------------------------------- 1 | //#include 2 | //#include 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | #include "ParallelMergeSort.h" 13 | #include "SortParallel.h" 14 | 15 | using std::chrono::duration; 16 | using std::chrono::duration_cast; 17 | using std::chrono::high_resolution_clock; 18 | using std::milli; 19 | using std::random_device; 20 | using std::sort; 21 | using std::vector; 22 | 23 | const int iterationCount = 5; 24 | 25 | static void print_results(const char *const tag, const double * sorted, size_t sortedLength, 26 | high_resolution_clock::time_point startTime, 27 | high_resolution_clock::time_point endTime) { 28 | printf("%s: Lowest: %g Highest: %g Time: %fms\n", tag, 29 | sorted[0], sorted[sortedLength - 1], 30 | duration_cast>(endTime - startTime).count()); 31 | } 32 | 33 | static void print_results(const char* const tag, const unsigned* sorted, size_t sortedLength, 34 | high_resolution_clock::time_point startTime, 35 | high_resolution_clock::time_point endTime) { 36 | printf("%s: Lowest: %u Highest: %u Time: %fms\n", tag, 37 | sorted[0], sorted[sortedLength - 1], 38 | duration_cast>(endTime - startTime).count()); 39 | } 40 | 41 | 42 | int ParallelMergeSortBenchmark(vector& doubles) 43 | { 44 | // generate some random uints: 45 | printf("\nBenchmarking Parallel Merge Sort Hybrid with %zu doubles...\n", 
doubles.size()); 46 | double* doublesCopy = new double[doubles.size()]; 47 | double* doublesCopy2 = new double[doubles.size()]; 48 | double* sorted = new double[doubles.size()]; 49 | vector doublesCopyVec(doubles); 50 | 51 | // time how long it takes to sort them: 52 | for (int i = 0; i < iterationCount; ++i) 53 | { 54 | for (size_t j = 0; j < doubles.size(); j++) { // copy the original random array into the source array each time, since ParallelMergeSort modifies the source array while sorting 55 | doublesCopy[ j] = doubles[j]; 56 | doublesCopy2[ j] = doubles[j]; 57 | doublesCopyVec[j] = doubles[j]; 58 | sorted[ j] = (double)j; // page in the destination array into system memory 59 | } 60 | const auto startTime = high_resolution_clock::now(); 61 | ParallelAlgorithms::sort_par(doublesCopy, doubles.size(), sorted, doubles.size(), false); // not-in-place interface 62 | //ParallelAlgorithms::sort_par(doublesCopy, doubles.size()); // in-place adaptive interface 63 | //ParallelAlgorithms::sort_par(doublesCopyVec); // in-place adaptive interface (vector) 64 | const auto endTime = high_resolution_clock::now(); 65 | //printf("ParallelAlgorithms sorting is done\n"); 66 | 67 | sort(std::execution::par_unseq, doublesCopy2, doublesCopy2 + doubles.size()); 68 | //sort(std::execution::par_unseq, ulongsCopyVec2.begin(), ulongsCopyVec2.end()); 69 | //if (std::equal(sorted, sorted + uints.size(), uintsCopy2)) 70 | if (!std::equal(doublesCopy, doublesCopy + doubles.size(), doublesCopy2)) 71 | { 72 | std::cout << "Arrays are not equal "; 73 | exit(1); 74 | } 75 | print_results("Parallel Merge Sort", sorted, doubles.size(), startTime, endTime); 76 | } 77 | 78 | delete[] sorted; 79 | delete[] doublesCopy; 80 | 81 | return 0; 82 | } 83 | 84 | int ParallelMergeSortBenchmark(vector& uints, const size_t& testSize) 85 | { 86 | // generate some random uints: 87 | //printf("\nBenchmarking Parallel Merge Sort Hybrid with %zu unsigned longs (each of %lu bytes)...\n", uints.size(), (unsigned 
long)sizeof(unsigned long)); 88 | //const size_t testSize = 1'000'000'000; 89 | unsigned* uintsCopy = new unsigned[testSize]; 90 | unsigned* uintsCopy2 = new unsigned[testSize]; 91 | unsigned* sorted = new unsigned[testSize]; 92 | //printf("Allocations of arrays succeeded using new\n"); 93 | //vector ulongsCopyVec(uints); 94 | //vector ulongsCopyVec2(uints); 95 | //printf("Allocations of arrays succeeded using vector\n"); 96 | 97 | // time how long it takes to sort them: 98 | for (int i = 0; i < iterationCount; ++i) 99 | { 100 | for (size_t j = 0; j < uints.size(); j++) { // copy the original random array into the source array each time, since ParallelMergeSort modifies the source array while sorting 101 | uintsCopy[ j] = uints[j]; 102 | uintsCopy2[j] = uints[j]; 103 | sorted[j] = (unsigned long)j; // page in the destination array into system memory 104 | //ulongsCopyVec[ j] = uints[j]; 105 | //ulongsCopyVec2[j] = uints[j]; 106 | } 107 | const auto startTime = high_resolution_clock::now(); 108 | // Example of usages, which trade off ease of use and performance 109 | //ParallelAlgorithms::sort_par(uintsCopy, uints.size()); // in-place adaptive interface 110 | //ParallelAlgorithms::sort_par(uintsCopy, 0, uints.size()); // in-place adaptive interface 111 | //ParallelAlgorithms::sort_par(uintsCopy, uints.size(), sorted, uints.size(), false); // in-place interface 112 | //ParallelAlgorithms::sort_par(uintsCopy, uints.size(), sorted, uints.size(), true); // not in-place interface 113 | //ParallelAlgorithms::sort_par(uintsCopy, 0, uints.size(), sorted, uints.size(), false); // in-place interface 114 | //ParallelAlgorithms::sort_par(ulongsCopyVec); // in-place adaptive interface (vector) 115 | //sort(ulongsCopyVec.begin(), ulongsCopyVec.end()); // in-place adaptive interface (vector) 116 | //ParallelAlgorithms::merge_sort(uintsCopy, 0, uints.size() - 1, sorted, false); 117 | //ParallelAlgorithms::merge_sort_hybrid(uintsCopy, 0, uints.size() - 1, sorted, false); 118 | 
//ParallelAlgorithms::parallel_merge_sort_hybrid(uintsCopy, 0, uints.size() - 1, sorted, false); 119 | //ParallelAlgorithms::parallel_merge_sort_hybrid_rh(uintsCopy, 0, uints.size() - 1, sorted, false); 120 | //ParallelAlgorithms::parallel_merge_sort_hybrid_rh_1(uintsCopy, 0, uints.size() - 1, sorted, false); 121 | //ParallelAlgorithms::parallel_merge_merge_sort_hybrid(uintsCopy, 0, uints.size() - 1, sorted, false, uints.size() / 8); 122 | //ParallelAlgorithms::parallel_merge_merge_sort_hybrid(uintsCopy, 0, uints.size() - 1, sorted, false); 123 | //ParallelAlgorithms::parallel_merge_sort_hybrid_radix(uintsCopy, 0, (int)(uints.size() - 1), sorted, false, uints.size() / 8); // ParallelMergeSort modifies the source array (using 8-cores get highest performance on 48-core CPU C5i) 124 | ParallelAlgorithms::parallel_merge_sort_hybrid_radix(uintsCopy, 0, uints.size() - 1, sorted, false); 125 | //ParallelAlgorithms::parallel_inplace_merge_sort_radix_hybrid(uintsCopy, 0, uints.size() - 1, uints.size() / 4); // using 4 cores best performance on 6-core AWS node 126 | //ParallelAlgorithms::parallel_inplace_merge_sort_radix_hybrid(uintsCopy, 0, uints.size() - 1, uints.size() / 18); // using 18 cores best performance on C5.24xlarge 48-core AWS node 127 | //RadixSortLSDPowerOf2Radix_unsigned_TwoPhase(uintsCopy, sorted, uints.size()); 128 | //RadixSortLSDPowerOf2Radix_unsigned_TwoPhase_DeRandomize(uintsCopy, sorted, uints.size()) 129 | const auto endTime = high_resolution_clock::now(); 130 | 131 | //printf("ParallelAlgorithms sorting is done\n"); 132 | sort(std::execution::par_unseq, uintsCopy2, uintsCopy2 + uints.size()); 133 | //sort(std::execution::par_unseq, ulongsCopyVec2.begin(), ulongsCopyVec2.end()); 134 | //if (std::equal(sorted, sorted + uints.size(), uintsCopy2)) 135 | if (!std::equal(uintsCopy, uintsCopy + uints.size(), uintsCopy2)) 136 | { 137 | std::cout << "Arrays are not equal "; 138 | exit(1); 139 | } 140 | print_results("Parallel Merge Sort", sorted, 
uints.size(), startTime, endTime); 141 | } 142 | 143 | delete[] sorted; 144 | delete[] uintsCopy2; 145 | delete[] uintsCopy; 146 | 147 | return 0; 148 | } 149 | 150 | int ParallelInPlaceMergeSortBenchmark(vector& uints) 151 | { 152 | // generate some random uints: 153 | printf("\nBenchmarking InPlace Parallel Merge Sort Hybrid with %zu unsigned (each of %u bytes)...\n", uints.size(), (unsigned)sizeof(unsigned)); 154 | unsigned* uintsCopy = new unsigned[uints.size()]; 155 | unsigned* uintsCopy2 = new unsigned[uints.size()]; 156 | unsigned* sorted = new unsigned[uints.size()]; 157 | 158 | // time how long it takes to sort them: 159 | for (int i = 0; i < iterationCount; ++i) 160 | { 161 | for (size_t j = 0; j < uints.size(); j++) { // copy the original random array into the source array each time, since ParallelMergeSort modifies the source array while sorting 162 | uintsCopy2[j] = uints[j]; 163 | uintsCopy[ j] = uints[j]; 164 | sorted[ j] = (unsigned)j; // page in the destination array into system memory 165 | } 166 | const auto startTime = high_resolution_clock::now(); 167 | //ParallelAlgorithms::parallel_merge_sort_hybrid_rh_1(uintsCopy, 0, uints.size() - 1, sorted); // ParallelMergeSort modifies the source array 168 | //ParallelAlgorithms::merge_sort_bottom_up_inplace(uintsCopy, 0, uints.size()); 169 | //ParallelAlgorithms::merge_sort_bottom_up_inplace_hybrid(uintsCopy, 0, uints.size()); 170 | //ParallelAlgorithms::merge_sort_inplace(uintsCopy, 0, uints.size() - 1); 171 | ParallelAlgorithms::merge_sort_inplace_hybrid_with_insertion(uintsCopy, 0, uints.size() - 1); 172 | //ParallelAlgorithms::merge_sort_inplace_hybrid_with_sort(uintsCopy, 0, uints.size() - 1, false); 173 | //std::cout << "Before parallel inplace merge sort" << std::endl; 174 | //parallel_inplace_merge_sort_hybrid_inner(uintsCopy2, 0, (int)(uints.size() - 1)); 175 | //ParallelAlgorithms::parallel_inplace_merge_sort_hybrid(uintsCopy, 0, uints.size() - 1, true, uints.size() / 4); 176 | 
//ParallelAlgorithms::parallel_inplace_merge_sort_hybrid(uintsCopy, 0, uints.size() - 1, false, uints.size() / 48); 177 | //ParallelAlgorithms::preventative_adaptive_inplace_merge_sort(uintsCopy, 0, uints.size() - 1, 0.75); 178 | //ParallelAlgorithms::parallel_preventative_adaptive_inplace_merge_sort(uintsCopy, 0, uints.size() - 1, 0.75); 179 | //ParallelAlgorithms::parallel_preventative_adaptive_inplace_merge_sort(uintsCopy, 0, uints.size() - 1, false, 0.01, uints.size() / 48); // threshold 48 or 32 * 1024 180 | //ParallelAlgorithms::parallel_preventative_adaptive_inplace_merge_sort_2(uintsCopy, 0, uints.size() - 1, 0.9, uints.size() / 24); // threshold 48 or 32 * 1024 181 | //ParallelAlgorithms::parallel_linear_in_place_preventative_adaptive_sort(uintsCopy, (unsigned long)uints.size(), true, 0.01, uints.size() / 6); // using 4-cores is fastest on 6-core CPU 182 | //ParallelAlgorithms::parallel_linear_in_place_preventative_adaptive_sort(uintsCopy, (unsigned long)uints.size(), true, 0.9, uints.size() / 8); // using 8-cores is fastest on 48-core CPU 183 | //ParallelAlgorithms::parallel_linear_in_place_preventative_adaptive_sort(uintsCopy, (unsigned long)uints.size(), false, 0.01, uints.size() / 24); 184 | //std::sort(uintsCopy, uintsCopy + uints.size()); 185 | const auto endTime = high_resolution_clock::now(); 186 | 187 | std::sort(std::execution::par_unseq, uintsCopy2, uintsCopy2 + uints.size()); 188 | //std::stable_sort(std::execution::par_unseq, uintsCopy2, uintsCopy2 + uints.size()); 189 | //if (std::equal(sorted, sorted + uints.size(), uintsCopy2)) 190 | if (std::equal(uintsCopy, uintsCopy + uints.size(), uintsCopy2)) 191 | std::cout << "Arrays are equal "; 192 | else 193 | std::cout << "Arrays are not equal "; 194 | 195 | print_results("Parallel InPlace Merge Sort", uintsCopy, uints.size(), startTime, endTime); 196 | } 197 | 198 | delete[] sorted; 199 | delete[] uintsCopy2; 200 | delete[] uintsCopy; 201 | 202 | return 0; 203 | } 204 | 205 | int 
ParallelMergeSortBenchmark(vector& uints) 206 | { 207 | // generate some random uints: 208 | printf("\nBenchmarking Parallel Merge Sort Hybrid with %zu unsigned integers (each of %lu bytes)...\n", uints.size(), (unsigned long)sizeof(unsigned)); 209 | unsigned* uintsCopy = new unsigned[uints.size()]; 210 | unsigned* uintsCopy2 = new unsigned[uints.size()]; 211 | unsigned* sorted = new unsigned[uints.size()]; 212 | 213 | // time how long it takes to sort them: 214 | for (int i = 0; i < iterationCount; ++i) 215 | { 216 | for (size_t j = 0; j < uints.size(); j++) { // copy the original random array into the source array each time, since ParallelMergeSort modifies the source array while sorting 217 | uintsCopy[ j] = uints[j]; 218 | uintsCopy2[j] = uints[j]; 219 | sorted[ j] = (unsigned)j; // page in the destination array into system memory 220 | } 221 | const auto startTime = high_resolution_clock::now(); 222 | //ParallelAlgorithms::parallel_merge_sort_hybrid_rh_1(uintsCopy, 0, (int)(uints.size() - 1), sorted); // ParallelMergeSort modifies the source array 223 | ParallelAlgorithms::parallel_merge_sort_hybrid(uintsCopy, (size_t)0, uints.size() - 1, sorted, false); // ParallelMergeSort modifies the source array 224 | //ParallelAlgorithms::parallel_merge_merge_sort_hybrid(uintsCopy, (size_t)0, uints.size() - 1, sorted, false); // ParallelMergeSort modifies the source array 225 | const auto endTime = high_resolution_clock::now(); 226 | 227 | std::sort(std::execution::par_unseq, uintsCopy2, uintsCopy2 + uints.size()); 228 | //std::stable_sort(std::execution::par_unseq, uintsCopy2, uintsCopy2 + uints.size()); 229 | //if (std::equal(sorted, sorted + uints.size(), uintsCopy2)) 230 | if (std::equal(uintsCopy, uintsCopy + uints.size(), uintsCopy2)) 231 | std::cout << "Arrays are equal "; 232 | else 233 | std::cout << "Arrays are not equal "; 234 | 235 | print_results("Parallel Merge Sort", sorted, uints.size(), startTime, endTime); 236 | } 237 | 238 | delete[] sorted; 239 | 
delete[] uintsCopy; 240 | 241 | return 0; 242 | } 243 | 244 | int ParallelMergeBenchmark() 245 | { 246 | const size_t testSize = 10'000'000; 247 | random_device rd; 248 | 249 | // generate some random uints: 250 | vector uints_0(testSize); 251 | for (auto& d : uints_0) 252 | d = static_cast(rd()); 253 | 254 | printf("\nBenchmarking Parallel Merge with %zu unsigned integers (each of %lu bytes)...\n", uints_0.size(), (unsigned long)sizeof(unsigned)); 255 | 256 | #if 1 257 | sort(std::execution::par_unseq, uints_0.begin(), uints_0.begin() + testSize/2); 258 | sort(std::execution::par_unseq, uints_0.begin() + testSize/2, uints_0.end()); 259 | #else 260 | sort(oneapi::dpl::execution::par_unseq, uints_0.begin(), uints_0.begin() + testSize / 2); 261 | sort(oneapi::dpl::execution::par_unseq, uints_0.begin() + testSize / 2, uints_0.end()); 262 | #endif 263 | 264 | // time how long it takes to merge them them: 265 | for (int i = 0; i < iterationCount; ++i) 266 | { 267 | vector uints_work(uints_0); // copy the original into a working vector, since it's an in-place merge 268 | const auto startTime = high_resolution_clock::now(); 269 | std::inplace_merge(std::execution::par_unseq, uints_work.begin(), uints_work.begin() + testSize / 2, uints_work.end()); 270 | const auto endTime = high_resolution_clock::now(); 271 | print_results("Parallel Merge", uints_work.data(), uints_work.size(), startTime, endTime); 272 | } 273 | // time how long it takes to merge them them: 274 | for (int i = 0; i < iterationCount; ++i) 275 | { 276 | vector uints_work(uints_0); // copy the original into a working vector, since it's an in-place merge 277 | const auto startTime = high_resolution_clock::now(); 278 | std::inplace_merge(uints_work.begin(), uints_work.begin() + testSize / 2, uints_work.end()); 279 | const auto endTime = high_resolution_clock::now(); 280 | print_results("Parallel Merge", uints_work.data(), uints_work.size(), startTime, endTime); 281 | } 282 | 283 | return 0; 284 | } 285 | 
-------------------------------------------------------------------------------- /ParallelQuickSort.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (C) 2019 Intel Corporation 3 | 4 | Permission is hereby granted, free of charge, to any person obtaining a copy 5 | of this software and associated documentation files (the "Software"), 6 | to deal in the Software without restriction, including without limitation 7 | the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 | and/or sell copies of the Software, and to permit persons to whom 9 | the Software is furnished to do so, subject to the following conditions: 10 | 11 | The above copyright notice and this permission notice shall be included 12 | in all copies or substantial portions of the Software. 13 | 14 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS 15 | OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 17 | THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES 18 | OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 19 | ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE 20 | OR OTHER DEALINGS IN THE SOFTWARE. 
21 | 22 | SPDX-License-Identifier: MIT 23 | */ 24 | 25 | // avoid Windows macros 26 | #define NOMINMAX 27 | 28 | #include 29 | #include 30 | #include 31 | #include 32 | #include 33 | #include 34 | #include 35 | 36 | struct DataItem { 37 | int id; 38 | double value; 39 | DataItem(int i, double v) : id{ i }, value{ v } {} 40 | }; 41 | 42 | using QSVector = std::vector; 43 | 44 | void serialQuicksortFromBook(QSVector::iterator b, QSVector::iterator e); 45 | void serialQuicksort(QSVector::iterator b, QSVector::iterator e); 46 | 47 | // Has a performance issue for pre-sorted arrays, possibly O(N^2) performance for pre-sorted input data 48 | void parallelQuicksortFromBook(QSVector::iterator b, QSVector::iterator e) { 49 | const int cutoff = 100; 50 | 51 | if (e - b < cutoff) { 52 | serialQuicksortFromBook(b, e); 53 | } 54 | else { 55 | // do shuffle 56 | unsigned pivot_value = *b; 57 | QSVector::iterator i = b, j = e - 1; 58 | while (i != j) { 59 | while (i != j && pivot_value < *j ) --j; 60 | while (i != j && *i <= pivot_value) ++i; 61 | std::iter_swap(i, j); 62 | } 63 | std::iter_swap(b, i); 64 | 65 | // recursive call 66 | tbb::parallel_invoke( 67 | [=]() { parallelQuicksortFromBook(b, i); }, 68 | [=]() { parallelQuicksortFromBook(i + 1, e); } 69 | ); 70 | } 71 | } 72 | 73 | void parallelQuicksort(QSVector::iterator b, QSVector::iterator e) { 74 | 75 | const int cutoff = 100; 76 | 77 | if (e - b < cutoff) { 78 | serialQuicksort(b, e); 79 | } 80 | else { 81 | 82 | // do shuffle 83 | QSVector::iterator i = b, j = e - 1; 84 | auto pivot_value = *(i + (j - i) / 2); 85 | if (*i < pivot_value) 86 | while (*(++i) < pivot_value); 87 | if (*j > pivot_value) 88 | while (*(--j) > pivot_value); 89 | 90 | while (i < j) 91 | { 92 | std::iter_swap(i, j); 93 | while (*(++i) < pivot_value); 94 | while (*(--j) > pivot_value); 95 | } 96 | j++; 97 | 98 | tbb::parallel_invoke( 99 | [=]() { parallelQuicksort(b, j); }, 100 | [=]() { parallelQuicksort(j, e); } 101 | ); 102 | } 103 | } 104 | 
105 | // Has a performance issue for pre-sorted arrays, possibly O(N^2) performance for pre-sorted input data 106 | void serialQuicksortFromBook(QSVector::iterator b, QSVector::iterator e) { 107 | if (b >= e) return; 108 | 109 | // do shuffle 110 | unsigned pivot_value = *b; 111 | QSVector::iterator i = b, j = e - 1; 112 | while (i != j) { 113 | while (i != j && pivot_value < *j ) --j; 114 | while (i != j && *i <= pivot_value) ++i; 115 | std::iter_swap(i, j); 116 | } 117 | std::iter_swap(b, i); 118 | 119 | // recursive call 120 | serialQuicksortFromBook(b, i); 121 | serialQuicksortFromBook(i + 1, e); 122 | } 123 | 124 | // ADAPTED FROM:https://stackoverflow.com/questions/53722004/generic-quicksort-implemented-with-vector-and-iterators-c 125 | void serialQuicksort(QSVector::iterator b, QSVector::iterator e) { 126 | 127 | if ((e - b) < 2) 128 | return; 129 | 130 | // do shuffle 131 | QSVector::iterator i = b, j = e - 1; 132 | auto pivot_value = *(i + (j - i) / 2); 133 | if (*i < pivot_value) 134 | while (*(++i) < pivot_value); 135 | if (*j > pivot_value) 136 | while (*(--j) > pivot_value); 137 | 138 | while (i < j) 139 | { 140 | std::iter_swap(i, j); 141 | while (*(++i) < pivot_value); 142 | while (*(--j) > pivot_value); 143 | } 144 | j++; 145 | 146 | // recursive call 147 | serialQuicksort(b, j); 148 | serialQuicksort(j, e); 149 | } 150 | // FROM:https://stackoverflow.com/questions/53722004/generic-quicksort-implemented-with-vector-and-iterators-c 151 | // Generic implementation 152 | template 153 | void serialQuickSort2(I beg, I end) 154 | { 155 | if (end - beg < 2) 156 | return; 157 | I lft(beg); 158 | I rgt(end - 1); 159 | auto pvt = *(lft + (rgt - lft) / 2); 160 | if (*lft < pvt) 161 | while (*++lft < pvt); 162 | if (*rgt > pvt) 163 | while (*--rgt > pvt); 164 | while (lft < rgt) 165 | { 166 | std::iter_swap(lft, rgt); 167 | while (*++lft < pvt); 168 | while (*--rgt > pvt); 169 | } 170 | rgt++; 171 | serialQuickSort2(beg, rgt); 172 | serialQuickSort2(rgt, end); 
173 | } 174 | 175 | static QSVector makeQSData(int N) { 176 | QSVector v; 177 | #if 0 178 | std::default_random_engine g; 179 | std::uniform_real_distribution d(0, std::numeric_limits::max()); 180 | 181 | for (int i = 0; i < N; ++i) 182 | v.push_back(d(g)); 183 | #endif 184 | std::random_device rd; 185 | 186 | for (int i = 0; i < N; ++i) 187 | v.push_back(static_cast(rd())); 188 | 189 | return v; 190 | } 191 | 192 | static bool checkIsSorted(const QSVector& v) { 193 | double max_value = std::numeric_limits::min(); 194 | for (auto e : v) { 195 | if (e < max_value) { 196 | std::cerr << "Sort FAILED" << std::endl; 197 | return false; 198 | } 199 | max_value = e; 200 | } 201 | return true; 202 | } 203 | 204 | static void warmupTBB() { 205 | tbb::parallel_for(0, (int)std::thread::hardware_concurrency(), [](int) { 206 | tbb::tick_count t0 = tbb::tick_count::now(); 207 | while ((tbb::tick_count::now() - t0).seconds() < 0.01); 208 | }); 209 | } 210 | 211 | int main_quicksort() { 212 | std::cout << std::endl << "Parallel Quicksort" << std::endl; 213 | const int N = 10000000; 214 | 215 | QSVector v = makeQSData(N); 216 | 217 | warmupTBB(); 218 | double parallel_time = 0.0; 219 | { 220 | tbb::tick_count t0 = tbb::tick_count::now(); 221 | parallelQuicksort(v.begin(), v.end()); 222 | //parallelQuicksort(v.begin(), v.end()); // see if sorting a pre-sorted array runs infinitely as it does in C# 223 | //serialQuicksortFromBook(v.begin(), v.end()); 224 | //serialQuicksort(v.begin(), v.end()); 225 | parallel_time = (tbb::tick_count::now() - t0).seconds(); 226 | std::cout << "parallel_time == " << parallel_time << " seconds" << std::endl; 227 | std::cout << "Done with the first pass of serialQuicksort" << std::endl; 228 | t0 = tbb::tick_count::now(); 229 | //serialQuicksortFromBook(v.begin(), v.end()); 230 | //serialQuicksort(v.begin(), v.end()); 231 | parallel_time = (tbb::tick_count::now() - t0).seconds(); 232 | std::cout << "parallel_time == " << parallel_time << " seconds" << 
std::endl; 233 | std::cout << "Done with the second pass of serialQuicksort" << std::endl; 234 | if (!checkIsSorted(v)) { 235 | std::cerr << "ERROR: tbb sorted list out-of-order" << std::endl; 236 | } 237 | } 238 | 239 | //std::cout << "parallel_time == " << parallel_time << " seconds" << std::endl; 240 | return 0; 241 | } 242 | 243 | -------------------------------------------------------------------------------- /ParallelStdCppExample.cpp: -------------------------------------------------------------------------------- 1 | // From https://blogs.msdn.microsoft.com/vcblog/2018/09/11/using-c17-parallel-algorithms-for-better-performance/ 2 | // compile with: 3 | // debug: cl /EHsc /W4 /WX /std:c++latest /Fedebug /MDd .\program.cpp 4 | // release: cl /EHsc /W4 /WX /std:c++latest /Ferelease /MD /O2 .\program.cpp 5 | //#include 6 | //#include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | //#include // TBB 15 | //#include // TBB 16 | 17 | using std::chrono::duration; 18 | using std::chrono::duration_cast; 19 | using std::chrono::high_resolution_clock; 20 | using std::milli; 21 | using std::random_device; 22 | using std::sort; 23 | using std::vector; 24 | 25 | const int iterationCount = 5; 26 | 27 | void print_results(const char *const tag, const vector& sorted, 28 | high_resolution_clock::time_point startTime, 29 | high_resolution_clock::time_point endTime) 30 | { 31 | printf("%s: Lowest: %g Highest: %g Time: %fms\n", tag, sorted.front(), sorted.back(), 32 | duration_cast>(endTime - startTime).count()); 33 | } 34 | 35 | void print_results(const char* const tag, const vector& sorted, 36 | high_resolution_clock::time_point startTime, 37 | high_resolution_clock::time_point endTime) 38 | { 39 | printf("%s: Lowest: %lu Highest: %lu Time: %fms\n", tag, sorted.front(), sorted.back(), 40 | duration_cast>(endTime - startTime).count()); 41 | std::cout << tag << ": Lowest: " << sorted.front() << " " << sizeof(sorted.front()) << 
std::endl; 42 | } 43 | 44 | void print_results(const char* const tag, const vector& sorted, 45 | high_resolution_clock::time_point startTime, 46 | high_resolution_clock::time_point endTime) 47 | { 48 | printf("%s: Lowest: %u Highest: %u Time: %fms\n", tag, sorted.front(), sorted.back(), 49 | duration_cast>(endTime - startTime).count()); 50 | } 51 | 52 | void print_results(const char *const tag, double first, double last, 53 | high_resolution_clock::time_point startTime, 54 | high_resolution_clock::time_point endTime) 55 | { 56 | printf("%s: Lowest: %g Highest: %g Time: %fms\n", tag, first, last, 57 | duration_cast>(endTime - startTime).count()); 58 | } 59 | 60 | int ParallelStdCppExample(vector& doubles) 61 | { 62 | // time how long it takes to sort them: 63 | for (int i = 0; i < iterationCount; ++i) 64 | { 65 | vector sorted(doubles); 66 | const auto startTime = high_resolution_clock::now(); 67 | sort(sorted.begin(), sorted.end()); 68 | const auto endTime = high_resolution_clock::now(); 69 | print_results("Serial", sorted, startTime, endTime); 70 | } 71 | 72 | for (int i = 0; i < iterationCount; ++i) 73 | { 74 | vector sorted(doubles); 75 | const auto startTime = high_resolution_clock::now(); 76 | // same sort call as above, but with par_unseq: 77 | sort(std::execution::par_unseq, sorted.begin(), sorted.end()); 78 | const auto endTime = high_resolution_clock::now(); 79 | // in our output, note that these are the parallel results: 80 | print_results("Parallel", sorted, startTime, endTime); 81 | } 82 | 83 | for (int i = 0; i < iterationCount; ++i) 84 | { 85 | double * s = new double[doubles.size()]; 86 | for (size_t j = 0; j < doubles.size(); j++) { // copy the original random array into the source array each time, since ParallelMergeSort modifies the source array while sorting 87 | s[j] = doubles[j]; 88 | } 89 | const auto startTime = high_resolution_clock::now(); 90 | sort(std::execution::par_unseq, s, s+doubles.size()); 91 | const auto endTime = 
high_resolution_clock::now(); 92 | print_results("Parallel Array", s[0], s[doubles.size() - 1], startTime, endTime); 93 | delete[] s; 94 | } 95 | 96 | return 0; 97 | } 98 | 99 | int ParallelStdCppExample(vector& uints, bool stable = false) 100 | { 101 | // time how long it takes to sort them: 102 | for (int i = 0; i < iterationCount; ++i) 103 | { 104 | vector sorted(uints); 105 | const auto startTime = high_resolution_clock::now(); 106 | if (!stable) 107 | sort(sorted.begin(), sorted.end()); 108 | else 109 | stable_sort(sorted.begin(), sorted.end()); 110 | const auto endTime = high_resolution_clock::now(); 111 | print_results("Serial", sorted, startTime, endTime); 112 | } 113 | 114 | for (int i = 0; i < iterationCount; ++i) 115 | { 116 | vector sorted(uints); 117 | const auto startTime = high_resolution_clock::now(); 118 | // same sort call as above, but with par_unseq: 119 | if (!stable) 120 | sort(std::execution::par_unseq, sorted.begin(), sorted.end()); 121 | else 122 | stable_sort(std::execution::par_unseq, sorted.begin(), sorted.end()); 123 | const auto endTime = high_resolution_clock::now(); 124 | // in our output, note that these are the parallel results: 125 | print_results("Parallel", sorted, startTime, endTime); 126 | } 127 | 128 | for (int i = 0; i < iterationCount; ++i) 129 | { 130 | unsigned* s = new unsigned[uints.size()]; 131 | for (size_t j = 0; j < uints.size(); j++) { // copy the original random array into the source array each time, since ParallelMergeSort modifies the source array while sorting 132 | s[j] = uints[j]; 133 | } 134 | const auto startTime = high_resolution_clock::now(); 135 | if (!stable) 136 | sort(std::execution::par_unseq, s, s + uints.size()); 137 | else 138 | std::stable_sort(std::execution::par_unseq, s, s + uints.size()); 139 | const auto endTime = high_resolution_clock::now(); 140 | print_results("Parallel Array", s[0], s[uints.size() - 1], startTime, endTime); 141 | delete[] s; 142 | } 143 | 144 | return 0; 145 | } 146 | 
147 | int ParallelStdCppExample(vector& uints) 148 | { 149 | // time how long it takes to sort them: 150 | for (int i = 0; i < iterationCount; ++i) 151 | { 152 | vector sorted(uints); 153 | const auto startTime = high_resolution_clock::now(); 154 | sort(sorted.begin(), sorted.end()); 155 | const auto endTime = high_resolution_clock::now(); 156 | print_results("Serial", sorted, startTime, endTime); 157 | } 158 | 159 | for (int i = 0; i < iterationCount; ++i) 160 | { 161 | vector sorted(uints); 162 | const auto startTime = high_resolution_clock::now(); 163 | // same sort call as above, but with par_unseq: 164 | sort(std::execution::par_unseq, sorted.begin(), sorted.end()); 165 | const auto endTime = high_resolution_clock::now(); 166 | // in our output, note that these are the parallel results: 167 | print_results("Parallel", sorted, startTime, endTime); 168 | } 169 | 170 | for (int i = 0; i < iterationCount; ++i) 171 | { 172 | unsigned *s = new unsigned[uints.size()]; 173 | for (size_t j = 0; j < uints.size(); j++) { // copy the original random array into the source array each time, since ParallelMergeSort modifies the source array while sorting 174 | s[j] = uints[j]; 175 | } 176 | const auto startTime = high_resolution_clock::now(); 177 | sort(std::execution::par_unseq, s, s + uints.size()); 178 | const auto endTime = high_resolution_clock::now(); 179 | print_results("Parallel Array", s[0], s[uints.size() - 1], startTime, endTime); 180 | delete[] s; 181 | } 182 | 183 | printf("\nAccumulate/Sum Benchmark:\n"); 184 | 185 | for (int i = 0; i < iterationCount; ++i) 186 | { 187 | unsigned* s = new unsigned[uints.size()]; 188 | for (size_t j = 0; j < uints.size(); j++) { // copy the original random array into the source array each time, since ParallelMergeSort modifies the source array while sorting 189 | s[j] = uints[j]; 190 | } 191 | const auto startTime = high_resolution_clock::now(); 192 | unsigned result_serial = std::accumulate(s, s + uints.size(), 0); 193 | const 
auto endTime = high_resolution_clock::now(); 194 | print_results("Serial Array Sum", result_serial, result_serial, startTime, endTime); 195 | delete[] s; 196 | } 197 | for (int i = 0; i < iterationCount; ++i) 198 | { 199 | unsigned* s = new unsigned[uints.size()]; 200 | for (size_t j = 0; j < uints.size(); j++) { // copy the original random array into the source array each time, since ParallelMergeSort modifies the source array while sorting 201 | s[j] = uints[j]; 202 | } 203 | const auto startTime = high_resolution_clock::now(); 204 | unsigned result_parallel = std::reduce(std::execution::par, s, s + uints.size(), 0); //Faster 205 | const auto endTime = high_resolution_clock::now(); 206 | print_results("Parallel Array Sum", result_parallel, result_parallel, startTime, endTime); 207 | delete[] s; 208 | } 209 | 210 | return 0; 211 | } 212 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # ParallelAlgorithms 2 | 3 | High Performance Parallel (and Sequential) C++ Algorithms, which accompany "[Practical Parallel Algorithms in C++ and C#](https://www.amazon.com/Practical-Parallel-Algorithms-Sorting-Multicore-ebook/dp/B0C3TZPRKZ/ref=sr_1_1?crid=3P7Q0RUP8OBXB&keywords=duvanenko&qid=1702488919&sprefix=duvanenko%2Caps%2C95&sr=8-1)" book. 
4 | 5 | ### Multi-Core Parallel Sorting Algorithms: 6 | 7 | *Algorithm*|*Random*|*Presorted*|*Constant*|*Description* 8 | --- | --- | --- | --- | --- 9 | LSD Radix Sort |2865|2907|4769| 48-core AWS C7a.24xlarge (AMD) 10 | LSD Radix Sort |2338|2297|2255| 48-core AWS C7i.24xlarge (Intel) 11 | LSD Radix Sort | 952| 831| 846| 14-core Intel i7-12700H 12 | Merge Radix Sort | 877| 945| 971| 48-core AWS C7a.24xlarge (AMD) 13 | Merge Sort |1176|1143|1143| 144-core Azure HBV4 (AMD) 14 | Merge Sort | 695| 946|1954| 48-core AWS C7i.24xlarge (Intel) 15 | Merge Sort | 174| 275| 617| 14-core Intel i7-12700H 16 | Merge Sort (in-place) | 272| 502| 549| 48-core AWS C7i.24xlarge (Intel) 17 | Merge Sort (in-place) | 90| 234| 339| 14-core Intel i7-12700H 18 | 19 | The above performance is in millions of unsigned 32-bit integers/second when sorting an array of 100 million elements. 20 | Benchmarks ran on Linux. 21 | 22 | ### High Performance Single-Core Sequential Algorithms 23 | 24 | *Algorithm*|*Random*|*Presorted*|*Constant*|*Description* 25 | --- | --- | --- | --- | --- 26 | LSD Radix Sort (two phase) |153|139|159| 1-core of Intel i7-12700H 27 | Merge Sort | 12|136|177| 1-core of Intel i7-12700H 28 | Merge Sort (in-place) | 12| 97|296| 1-core of Intel i7-12700H 29 | MSD Radix Sort (in-place) | 41| 48| 46| 1-core of Intel i7-12700H 30 | 31 | LSD Radix Sort single-core with two additional performance tools: 32 | - novel two-phase implementation reduces passes over the array to (1 + D), where D is the number of digits 33 | - de-randomization of writes to bins 34 | 35 | ## Other Algorithms 36 | Sorting algorithms provided in this repository: 37 | - Single-core LSD Radix Sort: Novel Two Phase 38 | - Multi-core Parallel LSD Radix Sort : linear time 39 | - Multi-core Parallel Merge Sort 40 | - Single-core In-Place Merge Sort 41 | - Multi-core Parallel In-Place Merge Sort 42 | - Single-core In-Place MSD Radix Sort: linear time 43 | - Numerous hybrid sorting algorithms - e.g.
Parallel Merge Insertion Sort 44 | - Merge Radix Sort hybrids: linear time 45 | - Improved adaptivity to memory resources, even with virtual memory 46 | - Count Sort 47 | - Parallel Histogram 48 | - Block Swap 49 | - Parallel Merge 50 | - Radix Sort to support non-integer data types 51 | - Safer Average calculations 52 | - Blazing Fast sort of byte array 53 | - De-Randomization of Radix Sort writes to bins 54 | - Recursive and non-recursive Parallel Sum 55 | - Bottom-up Non-Recursive In-Place Merge Sort 56 | 57 | 58 | Windows support: 59 | - VisualStudio 2022 Microsoft compiler and Intel's OneAPI compiler. Solution is included. 60 | - Intel's Threading Building Blocks (TBB) and Microsoft's Parallel Patterns Library (PPL) 61 | - C++17 62 | 63 | Linux support: 64 | - g++ using Intel's Threading Building Blocks (TBB) 65 | - C++20 66 | 67 | ## Building on Ubuntu Linux (22.04 LTS) 68 | To install g++ which supports C++17: 69 | ``` 70 | sudo apt update 71 | sudo apt upgrade 72 | # reboot the machine 73 | sudo apt install build-essential 74 | ``` 75 | 76 | To update gcc to support c++17 standard, Parallel STL and Intel's Threading Building Blocks (TBB): 77 | ``` 78 | sudo apt install libtbb-dev 79 | git clone https://github.com/DragonSpit/ParallelAlgorithms.git 80 | cd ParallelAlgorithms 81 | ``` 82 | 83 | To build on WSL Ubuntu, use g++ command and not gcc. The order of the following arguments matters! 84 | ``` 85 | g++ ParallelAlgorithms.cpp ParallelStdCppExample.cpp RadixSortLsdBenchmark.cpp MemoryUsage.cpp CountingSortParallelBenchmark.cpp SumBenchmark.cpp RadixSortMsdBenchmark.cpp ParallelMergeSortBenchmark.cpp -ltbb -std=c++20 -O3 -o ParallelAlgorithms 86 | ``` 87 | To build on AWS Ubuntu, use g++ command and not gcc. The order of the following arguments matters!
88 | ``` 89 | g++ ParallelAlgorithms.cpp ParallelStdCppExample.cpp RadixSortLsdBenchmark.cpp MemoryUsage.cpp CountingSortParallelBenchmark.cpp SumBenchmark.cpp RadixSortMsdBenchmark.cpp ParallelMergeSortBenchmark.cpp -ltbb -std=c++2a -O3 -o ParallelAlgorithms 90 | ``` 91 | To run it: 92 | ``` 93 | ./ParallelAlgorithms 94 | ``` 95 | ## Building on Windows 96 | On Windows, [Visual Studio 2022](https://visualstudio.microsoft.com/downloads/), free and paid versions are supported. To build the project use Visual Studio 2022 to open ParallelAlgorithms.sln file, then 97 | select Build/RebuildSolution. Once the project builds, select "Local Windows Debugger" to run it. Or, open the CommandPrompt application, go to the "x64\Release" directory, and run ParallelAlgorithms.exe. 98 | 99 | By default, the solution/project uses Microsoft C++ compiler, which avoids requiring installation of Intel's OneAPI. Intel's OneAPI C++ compiler is supported by the ParallelAlgorithms.sln VisualStudio 2022 Solution. 100 | Once [Intel's OneAPI](https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit-download.html), which is free, has been installed, select Project/IntelCompiler/UseIntelOneAPICompiler, followed by Build/RebuildSolution. Some of the algorithms are faster when built with Intel's OneAPI compiler. 101 | 102 | ## Other Resources 103 | [Benchmarks of C++ Standard Parallel Algorithms (STL)](https://duvanenko.tech.blog/2023/05/21/c-parallel-stl-benchmark/) are provided, with benchmark code in [ParallelSTL](https://github.com/DragonSpit/ParallelSTL) repository, which builds and runs on Linux and Windows. 104 | 105 | Blogs: 106 | - [Sorting 19X Faster than C++ Parallel Sort](https://duvanenko.tech.blog/2023/10/29/sorting-19x-faster-than-c-parallel-sort/) 107 | - [Parallel Merge](https://duvanenko.tech.blog/2018/01/14/parallel-merge/) for merging two arrays of any data type supporting comparison.
108 | - [Parallel Merge Sort](https://duvanenko.tech.blog/2018/01/13/parallel-merge-sort/) for sorting arrays of any data type supporting comparison. 109 | - [Novel LSD Radix Sort (two-phase)](https://duvanenko.tech.blog/2019/02/27/lsd-radix-sort-performance-improvements/). 110 | - [Parallel Sort from Standard C++17](https://blogs.msdn.microsoft.com/vcblog/2018/09/11/using-c17-parallel-algorithms-for-better-performance/). 111 | - [LSD Radix Sort](https://blogs.msdn.microsoft.com/vcblog/2018/09/11/using-c17-parallel-algorithms-for-better-performance/) for arrays of unsigned long's. 112 | 113 | 114 | [![paypal](https://www.paypalobjects.com/en_US/i/btn/btn_donateCC_LG.gif)](https://www.paypal.com/cgi-bin/webscr?cmd=_s-xclick&hosted_button_id=LDD8L7UPAC7QL) 115 | -------------------------------------------------------------------------------- /RadixSortCommon.h: -------------------------------------------------------------------------------- 1 | // Copyright(c), Victor J. Duvanenko, 2010 2 | // Common items for Radix Sort implementations. 3 | 4 | #ifndef _RadixSortCommon_h 5 | #define _RadixSortCommon_h 6 | 7 | // A set of logical right shift functions to work-around the C++ issue of performing an arithmetic right shift 8 | // for >>= operation on signed types. 
9 | inline char logicalRightShift( char a, unsigned long shiftAmount ) 10 | { 11 | return (char)(((unsigned char)a ) >> shiftAmount ); 12 | } 13 | inline unsigned char logicalRightShift_ru( char a, unsigned long shiftAmount ) 14 | { 15 | return (((unsigned char)a ) >> shiftAmount ); 16 | } 17 | inline short logicalRightShift( short a, unsigned long shiftAmount ) 18 | { 19 | return (short)(((unsigned short)a ) >> shiftAmount ); 20 | } 21 | inline unsigned short logicalRightShift_ru( short a, unsigned long shiftAmount ) 22 | { 23 | return (((unsigned short)a ) >> shiftAmount ); 24 | } 25 | inline long logicalRightShift( long a, unsigned long shiftAmount ) 26 | { 27 | return (long)(((unsigned long)a ) >> shiftAmount ); 28 | } 29 | inline int logicalRightShift( int a, unsigned long shiftAmount ) 30 | { 31 | return (int)(((unsigned long)a ) >> shiftAmount ); 32 | } 33 | inline unsigned long logicalRightShift_ru( long a, unsigned long shiftAmount ) 34 | { 35 | return (((unsigned long)a ) >> shiftAmount ); 36 | } 37 | inline unsigned long logicalRightShift_ru( int a, unsigned long shiftAmount ) 38 | { 39 | return (((unsigned long)a ) >> shiftAmount ); 40 | } 41 | #if 0 42 | inline __int64 logicalRightShift( __int64 a, unsigned long shiftAmount ) 43 | { 44 | return (__int64)(((unsigned __int64)a ) >> shiftAmount ); 45 | } 46 | inline unsigned __int64 logicalRightShift_ru( __int64 a, unsigned long shiftAmount ) 47 | { 48 | return (((unsigned __int64)a ) >> shiftAmount ); 49 | } 50 | #endif 51 | template< class _Type > 52 | inline unsigned extractDigit( _Type a, _Type bitMask, unsigned shiftRightAmount ) 53 | { 54 | unsigned digit = (unsigned)(( a & bitMask ) >> shiftRightAmount ); // extract the digit we are sorting based on 55 | return digit; 56 | } 57 | template< class _Type > 58 | inline unsigned extractDigit_1(_Type a, _Type bitMask, unsigned shiftRightAmount) 59 | { 60 | unsigned digit = (unsigned)((a >> shiftRightAmount) & bitMask); // extract the digit we are sorting 
based on 61 | return digit; 62 | }template< unsigned long PowerOfTwoRadix, class _Type > 63 | inline unsigned long extractDigitNegate( _Type a, _Type bitMask, unsigned long shiftRightAmount ) 64 | { 65 | unsigned long digit = (unsigned long)logicalRightShift_ru((_Type)( a & bitMask ), shiftRightAmount ); // extract the digit we are sorting based on 66 | digit ^= ( PowerOfTwoRadix >> 1 ); 67 | return digit; 68 | } 69 | // Shifts either left or right based on the sign of the shiftAmount argument. Positive values shift left by that many bits, 70 | // zero does not shift at all, and negative values shift right by that many bits. 71 | template< class _Type > 72 | inline _Type shift_left_or_right( _Type a, long shiftAmount ) 73 | { 74 | if ( shiftAmount >= 0 ) return a << shiftAmount; 75 | else return a >> ( -shiftAmount ); 76 | } 77 | 78 | 79 | #endif // _CommonRadixSort_h -------------------------------------------------------------------------------- /RadixSortLSD.h: -------------------------------------------------------------------------------- 1 | // TODO: Allocate a single array (cache-line aligned) for all the count arrays and index into it for each of the counts 2 | // TODO: Create a version of Radix Sort that handles 64-bit indexes (size_t) for arrays larger than 4GigaElements 3 | // TODO: Detect the size of array and use unsigned/32-bit counts for smaller arrays and size_t/64-bit counts for larger arrays 4 | // TODO: sort_radix_in_place_stable_adaptive can be implemented as preventative adaptive and stable/unstable option in a single function 5 | 6 | #ifndef _RadixSortLSD_h 7 | #define _RadixSortLSD_h 8 | 9 | #include "RadixSortCommon.h" 10 | #include "RadixSortMSD.h" 11 | #include "InsertionSort.h" 12 | #include "ParallelMergeSort.h" 13 | #include "Histogram.h" 14 | 15 | extern unsigned long long physical_memory_used_in_megabytes(); 16 | extern unsigned long long physical_memory_total_in_megabytes(); 17 | 18 | // Serial LSD Radix Sort, with Counting separated 
into its own phase, followed by a permutation phase, as is done in HPCsharp in C# 19 | template< unsigned long PowerOfTwoRadix, unsigned long Log2ofPowerOfTwoRadix, long Threshold> 20 | inline void _RadixSortLSD_StableUnsigned_PowerOf2RadixScalar_TwoPhase(unsigned long long* input_array, unsigned long long* output_array, size_t last, unsigned long long bitMask, unsigned long shiftRightAmount, bool inputArrayIsDestination) 21 | { 22 | const unsigned NumberOfBins = PowerOfTwoRadix; 23 | unsigned long long* _input_array = input_array; 24 | unsigned long long* _output_array = output_array; 25 | bool _output_array_has_result = false; 26 | unsigned currentDigit = 0; 27 | 28 | size_t** count2D = HistogramByteComponents (input_array, 0, last); 29 | 30 | while (bitMask != 0) // end processing digits when all the mask bits have been processes and shift out, leaving none 31 | { 32 | size_t* count = count2D[currentDigit]; 33 | 34 | size_t startOfBin[NumberOfBins]; 35 | //long endOfBin[NumberOfBins]; 36 | alignas(64) size_t endOfBin[NumberOfBins]; 37 | //printf("endOfBin address = %p\n", endOfBin); 38 | startOfBin[0] = endOfBin[0] = 0; 39 | for (unsigned i = 1; i < NumberOfBins; i++) 40 | startOfBin[i] = endOfBin[i] = startOfBin[i - 1] + count[i - 1]; 41 | 42 | for (size_t _current = 0; _current <= last; _current++) // permutation phase 43 | _output_array[endOfBin[extractDigit(_input_array[_current], bitMask, shiftRightAmount)]++] = _input_array[_current]; 44 | 45 | bitMask <<= Log2ofPowerOfTwoRadix; 46 | shiftRightAmount += Log2ofPowerOfTwoRadix; 47 | _output_array_has_result = !_output_array_has_result; 48 | std::swap(_input_array, _output_array); 49 | currentDigit++; 50 | } 51 | // Done with processing, copy all of the bins 52 | if (_output_array_has_result && inputArrayIsDestination) 53 | for (long _current = 0; _current <= last; _current++) // copy from output array into the input array 54 | _input_array[_current] = _output_array[_current]; 55 | if 
(!_output_array_has_result && !inputArrayIsDestination) 56 | for (long _current = 0; _current <= last; _current++) // copy from input array back into the output array 57 | _output_array[_current] = _input_array[_current]; 58 | 59 | const unsigned numberOfDigits = Log2ofPowerOfTwoRadix; // deallocate 2D count array, which was allocated in Histogram 60 | for (unsigned i = 0; i < numberOfDigits; i++) 61 | delete[] count2D[i]; 62 | delete[] count2D; 63 | } 64 | 65 | // Serial LSD Radix Sort, with Counting separated into its own phase, followed by a permutation phase, as is done in HPCsharp in C# 66 | template< unsigned long PowerOfTwoRadix, unsigned long Log2ofPowerOfTwoRadix > 67 | inline void _RadixSortLSD_StableUnsigned_PowerOf2RadixScalar_TwoPhase_1(unsigned* input_array, unsigned* output_array, size_t last, unsigned bitMask, unsigned long shiftRightAmount, bool inputArrayIsDestination) 68 | { 69 | const size_t NumberOfBins = PowerOfTwoRadix; 70 | unsigned* _input_array = input_array; 71 | unsigned* _output_array = output_array; 72 | bool _output_array_has_result = false; 73 | unsigned currentDigit = 0; 74 | unsigned maxDigit = sizeof(unsigned); 75 | unsigned bit_mask = NumberOfBins - 1; 76 | 77 | size_t* count2D = HistogramByteComponents_1 (input_array, 0, last); 78 | 79 | while (currentDigit < maxDigit) // end processing digits when all the mask bits have been processes and shift out, leaving none 80 | { 81 | size_t* count = count2D + (currentDigit * NumberOfBins); 82 | 83 | alignas(64) size_t endOfBin[NumberOfBins]; 84 | //printf("endOfBin address = %p\n", endOfBin); 85 | endOfBin[0] = 0; 86 | for (size_t i = 1; i < NumberOfBins; i++) 87 | endOfBin[i] = endOfBin[i - 1] + count[i - 1]; 88 | 89 | // permutation phase 90 | for (size_t _current = 0; _current <= last; _current++) 91 | _output_array[endOfBin[(_input_array[_current] & bit_mask ) >> shiftRightAmount]++] = _input_array[_current]; 92 | 93 | bit_mask <<= Log2ofPowerOfTwoRadix; 94 | shiftRightAmount += 
Log2ofPowerOfTwoRadix; 95 | _output_array_has_result = !_output_array_has_result; 96 | std::swap(_input_array, _output_array); 97 | currentDigit++; 98 | } 99 | // Done with processing, copy all of the bins 100 | if (_output_array_has_result && inputArrayIsDestination) 101 | for (size_t _current = 0; _current <= last; _current++) // copy from output array into the input array 102 | _input_array[_current] = _output_array[_current]; 103 | if (!_output_array_has_result && !inputArrayIsDestination) 104 | for (size_t _current = 0; _current <= last; _current++) // copy from input array back into the output array 105 | _output_array[_current] = _input_array[_current]; 106 | 107 | delete[] count2D; 108 | } 109 | 110 | // LSD Radix Sort - stable (LSD has to be, and this may preclude LSD Radix from being able to be in-place) 111 | inline void RadixSortLSDPowerOf2Radix_unsigned_TwoPhase(unsigned* a, unsigned* b, size_t a_size) 112 | { 113 | const unsigned long Threshold = 100; // Threshold of when to switch to using Insertion Sort 114 | const unsigned long PowerOfTwoRadix = 256; 115 | const unsigned long Log2ofPowerOfTwoRadix = 8; 116 | // Create bit-mask and shift right amount 117 | unsigned long shiftRightAmount = 0; 118 | unsigned bitMask = (unsigned)(((unsigned)(PowerOfTwoRadix - 1)) << shiftRightAmount); // bitMask controls/selects how many and which bits we process at a time 119 | 120 | // The beauty of using template arguments instead of function parameters for the Threshold and Log2ofPowerOfTwoRadix is 121 | // they are not pushed on the stack and are treated as constants, but local. 
122 | if (a_size >= Threshold) { 123 | _RadixSortLSD_StableUnsigned_PowerOf2RadixScalar_TwoPhase_1< PowerOfTwoRadix, Log2ofPowerOfTwoRadix >(a, b, a_size - 1, bitMask, shiftRightAmount, false); 124 | } 125 | else { 126 | // TODO: Substitute Merge Sort, as it will get rid off the for loop, since it's internal to MergeSort 127 | insertionSortSimilarToSTLnoSelfAssignment(a, a_size); 128 | for (size_t j = 0; j < a_size; j++) // copy from input array to the destination array 129 | b[j] = a[j]; 130 | } 131 | } 132 | 133 | // Permute phase of LSD Radix Sort with de-randomized write memory accesses 134 | // Derandomizes system memory accesses by buffering all Radix bin accesses, turning 256-bin random memory writes into sequential writes 135 | template< unsigned long PowerOfTwoRadix, unsigned long Log2ofPowerOfTwoRadix, long Threshold, unsigned long BufferDepth> 136 | inline void _RadixSortLSD_StableUnsigned_PowerOf2Radix_PermuteDerandomized(unsigned* input_array, unsigned* output_array, size_t startIndex, size_t endIndex, unsigned bitMask, unsigned shiftRightAmount, 137 | size_t* endOfBin, size_t bufferIndex[], unsigned bufferDerandomize[][BufferDepth]) 138 | { 139 | const unsigned long NumberOfBins = PowerOfTwoRadix; 140 | 141 | for (size_t _current = startIndex; _current <= endIndex; _current++) 142 | { 143 | unsigned digit = extractDigit(input_array[_current], bitMask, shiftRightAmount); 144 | if (bufferIndex[digit] < BufferDepth) 145 | { 146 | bufferDerandomize[digit][bufferIndex[digit]++] = input_array[_current]; 147 | } 148 | else 149 | { 150 | size_t outIndex = endOfBin[digit]; 151 | unsigned* buff = &(bufferDerandomize[digit][0]); 152 | #if 1 153 | memcpy(&(output_array[outIndex]), buff, BufferDepth * sizeof(unsigned)); // significantly faster than a for loop 154 | #else 155 | unsigned* outBuff = &(output_array[outIndex]); 156 | for (size_t i = 0; i < BufferDepth; i++) 157 | *outBuff++ = *buff++; 158 | #endif 159 | endOfBin[digit] += BufferDepth; 160 | 
bufferDerandomize[digit][0] = input_array[_current]; 161 | bufferIndex[digit] = 1; 162 | } 163 | } 164 | // Flush all the derandomization buffers 165 | for (size_t whichBuff = 0; whichBuff < NumberOfBins; whichBuff++) 166 | { 167 | size_t numOfElementsInBuff = bufferIndex[whichBuff]; 168 | for (size_t i = 0; i < numOfElementsInBuff; i++) 169 | output_array[endOfBin[whichBuff]++] = bufferDerandomize[whichBuff][i]; 170 | bufferIndex[whichBuff] = 0; 171 | } 172 | } 173 | 174 | // Derandomizes system memory accesses by buffering all Radix bin accesses, turning 256-bin random memory writes into sequential writes 175 | // Parallel LSD Radix Sort, with Counting separated into its own parallel phase, followed by a serial permutation phase, as is done in HPCsharp in C# 176 | template< unsigned long PowerOfTwoRadix, unsigned long Log2ofPowerOfTwoRadix, long Threshold> 177 | void _RadixSortLSD_StableUnsigned_PowerOf2Radix_TwoPhase_DeRandomize(unsigned* input_array, unsigned* output_array, size_t last, unsigned bitMask, unsigned long shiftRightAmount, bool inputArrayIsDestination) 178 | { 179 | const size_t NumberOfBins = PowerOfTwoRadix; 180 | unsigned* _input_array = input_array; 181 | unsigned* _output_array = output_array; 182 | bool _output_array_has_result = false; 183 | unsigned currentDigit = 0; 184 | static const size_t bufferDepth = 128; 185 | #if 0 186 | __declspec(align(64)) unsigned bufferDerandomize[NumberOfBins][bufferDepth]; 187 | __declspec(align(64)) size_t bufferIndex[ NumberOfBins] = { 0 }; 188 | #else 189 | auto bufferDerandomize = new unsigned[NumberOfBins][bufferDepth]; 190 | auto bufferIndex = new size_t[ NumberOfBins] { 0 }; 191 | #endif 192 | 193 | size_t* count2D = HistogramByteComponents_1 (input_array, 0, last); 194 | 195 | while (bitMask != 0) // end processing digits when all the mask bits have been processes and shift out, leaving none 196 | { 197 | size_t* count = count2D + (currentDigit * NumberOfBins); 198 | 199 | size_t 
startOfBin[NumberOfBins], endOfBin[NumberOfBins]; 200 | startOfBin[0] = endOfBin[0] = 0; 201 | for (size_t i = 1; i < NumberOfBins; i++) 202 | startOfBin[i] = endOfBin[i] = startOfBin[i - 1] + count[i - 1]; 203 | 204 | _RadixSortLSD_StableUnsigned_PowerOf2Radix_PermuteDerandomized< PowerOfTwoRadix, Log2ofPowerOfTwoRadix, Threshold, bufferDepth>( 205 | _input_array, _output_array, 0, last, bitMask, shiftRightAmount, endOfBin, bufferIndex, bufferDerandomize); 206 | 207 | bitMask <<= Log2ofPowerOfTwoRadix; 208 | shiftRightAmount += Log2ofPowerOfTwoRadix; 209 | _output_array_has_result = !_output_array_has_result; 210 | std::swap(_input_array, _output_array); 211 | currentDigit++; 212 | } 213 | // Done with processing, copy all of the bins 214 | if (_output_array_has_result && inputArrayIsDestination) 215 | for (size_t _current = 0; _current <= last; _current++) // copy from output array into the input array 216 | _input_array[_current] = _output_array[_current]; 217 | if (!_output_array_has_result && !inputArrayIsDestination) 218 | for (size_t _current = 0; _current <= last; _current++) // copy from input array back into the output array 219 | _output_array[_current] = _input_array[_current]; 220 | #if 1 221 | delete[] bufferIndex; 222 | delete[] bufferDerandomize; 223 | #endif 224 | } 225 | 226 | // LSD Radix Sort - stable (LSD has to be, and this may preclude LSD Radix from being able to be in-place) 227 | inline void RadixSortLSDPowerOf2Radix_unsigned_TwoPhase_DeRandomize(unsigned* a, unsigned* b, size_t a_size) 228 | { 229 | const unsigned long Threshold = 100; // Threshold of when to switch to using Insertion Sort 230 | const unsigned long PowerOfTwoRadix = 256; 231 | const unsigned long Log2ofPowerOfTwoRadix = 8; 232 | // Create bit-mask and shift right amount 233 | unsigned long shiftRightAmount = 0; 234 | unsigned bitMask = (unsigned)(((unsigned)(PowerOfTwoRadix - 1)) << shiftRightAmount); // bitMask controls/selects how many and which bits we process at a 
time 235 | 236 | // The beauty of using template arguments instead of function parameters for the Threshold and Log2ofPowerOfTwoRadix is 237 | // they are not pushed on the stack and are treated as constants, but local. 238 | if (a_size >= Threshold) { 239 | _RadixSortLSD_StableUnsigned_PowerOf2Radix_TwoPhase_DeRandomize< PowerOfTwoRadix, Log2ofPowerOfTwoRadix, Threshold >(a, b, a_size - 1, bitMask, shiftRightAmount, false); 240 | } 241 | else { 242 | // TODO: Substitute Merge Sort, as it will get rid off the for loop, since it's internal to MergeSort 243 | insertionSortSimilarToSTLnoSelfAssignment(a, a_size); 244 | for (unsigned long j = 0; j < a_size; j++) // copy from input array to the destination array 245 | b[j] = a[j]; 246 | } 247 | } 248 | 249 | // Stability is not needed when sorting an array of integers 250 | // Post-allocation adaptivity, since the size of allocation is known in advance 251 | inline void sort_radix_in_place_adaptive(unsigned* src, size_t src_size, double physical_memory_threshold_post = 0.75) 252 | { 253 | size_t anticipated_memory_usage = sizeof(unsigned long) * src_size + physical_memory_used_in_megabytes(); 254 | double physical_memory_fraction = (double)anticipated_memory_usage / (double)physical_memory_total_in_megabytes(); 255 | printf("sort_radix_in_place_adaptive: physical memory used = %llu physical memory total = %llu\n", 256 | physical_memory_used_in_megabytes(), physical_memory_total_in_megabytes()); 257 | 258 | if (physical_memory_fraction > physical_memory_threshold_post) 259 | { 260 | printf("Running truly in-place MSD Radix Sort\n"); 261 | hybrid_inplace_msd_radix_sort(src, src_size); // in-place, not stable 262 | } 263 | else 264 | { 265 | unsigned* working_array = new(std::nothrow) unsigned[src_size]; 266 | 267 | if (!working_array) 268 | { 269 | printf("Running truly in-place MSD Radix Sort\n"); 270 | hybrid_inplace_msd_radix_sort(src, src_size); // in-place, not stable 271 | } 272 | else 273 | { 274 | //for (size_t i 
= 0; i < src_size; i++) // page in allocated array. Only then it shows up in memory usage measurements 275 | // working_array[i] = (unsigned)i; 276 | 277 | //physical_memory_fraction = (double)physical_memory_used_in_megabytes() / (double)physical_memory_total_in_megabytes(); 278 | //printf("sort_radix_in_place_adaptive #2: physical memory used = %llu physical memory total = %llu\n", 279 | // physical_memory_used_in_megabytes(), physical_memory_total_in_megabytes()); 280 | 281 | printf("Running not-in-place LSD Radix Sort\n"); 282 | RadixSortLSDPowerOf2Radix_unsigned_TwoPhase(src, working_array, src_size); // not-in-place, stable 283 | delete[] working_array; 284 | } 285 | } 286 | } 287 | 288 | // l boundary is inclusive and r boundary is exclusive 289 | template< class _Type > 290 | inline void merge_sort_inplace_hybrid_with_insertion(_Type* src, size_t l, size_t r) 291 | { 292 | if (r <= l) return; 293 | if ((r - l) <= 48) { 294 | insertionSortSimilarToSTLnoSelfAssignment(src + l, r - l); 295 | return; 296 | } 297 | size_t m = r / 2 + l / 2 + (r % 2 + l % 2) / 2; // average without overflow 298 | 299 | merge_sort_inplace_hybrid_with_insertion(src, l, m); 300 | merge_sort_inplace_hybrid_with_insertion(src, m, r); 301 | 302 | //merge_in_place(src, l, m, r); // merge the results (TODO: Needs size_t for arguments and modified to be truly in-place all the way down) 303 | std::inplace_merge(src + l, src + m, src + r); 304 | } 305 | 306 | inline void sort_radix_in_place_stable_adaptive(unsigned* src, size_t src_size, double physical_memory_threshold_post = 0.75) 307 | { 308 | size_t memory_to_be_allocated_in_megabytes = src_size * sizeof(unsigned) / ((size_t)1024 * 1024); 309 | double physical_memory_fraction = (double)(physical_memory_used_in_megabytes() + memory_to_be_allocated_in_megabytes) 310 | / (double)physical_memory_total_in_megabytes(); 311 | //printf("sort_radix_in_place_adaptive: physical memory used = %llu physical memory total = %llu to be allocated = 
%llu\n", 312 | // physical_memory_used_in_megabytes(), physical_memory_total_in_megabytes(), memory_to_be_allocated_in_megabytes); 313 | 314 | if (physical_memory_fraction > physical_memory_threshold_post) 315 | { 316 | //printf("Running in-place stable adaptive sort\n"); 317 | //std::stable_sort(src + 0, src + src_size); // problematic as it is not purely in-place algorithm, which is what is needed to keep memory footprint low 318 | merge_sort_inplace_hybrid_with_insertion(src, 0, src_size); // truly in-place 319 | } 320 | else 321 | { 322 | unsigned* working_array = new(std::nothrow) unsigned[src_size]; 323 | 324 | if (!working_array) 325 | { 326 | //printf("Running truly in-place MSD Radix Sort\n"); 327 | //std::stable_sort(src + 0, src + src_size); // problematic as it is not purely in-place algorithm, which is what is needed to keep memory footprint low 328 | merge_sort_inplace_hybrid_with_insertion(src, 0, src_size); 329 | } 330 | else 331 | { 332 | //for (size_t i = 0; i < src_size; i++) // page in allocated array. 
Only then it shows up in memory usage measurements 333 | // working_array[i] = (unsigned long)i; 334 | 335 | //physical_memory_fraction = (double)physical_memory_used_in_megabytes() / (double)physical_memory_total_in_megabytes(); 336 | //printf("sort_radix_in_place_adaptive #2: physical memory used = %llu physical memory total = %llu\n", 337 | // physical_memory_used_in_megabytes(), physical_memory_total_in_megabytes()); 338 | 339 | //printf("Running not-in-place LSD Radix Sort\n"); 340 | RadixSortLSDPowerOf2Radix_unsigned_TwoPhase(src, working_array, src_size); // not-in-place, stable 341 | delete[] working_array; 342 | } 343 | } 344 | } 345 | 346 | #endif -------------------------------------------------------------------------------- /RadixSortLsdBenchmark.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | 11 | #include "RadixSortLSD.h" 12 | #include "RadixSortLsdParallel.h" 13 | 14 | using std::chrono::duration; 15 | using std::chrono::duration_cast; 16 | using std::chrono::high_resolution_clock; 17 | using std::milli; 18 | using std::random_device; 19 | using std::sort; 20 | using std::vector; 21 | 22 | const int iterationCount = 5; 23 | 24 | static void print_results(const char* const tag, const unsigned* sorted, size_t sortedLength, 25 | high_resolution_clock::time_point startTime, 26 | high_resolution_clock::time_point endTime) { 27 | printf("%s: Lowest: %u Highest: %u Time: %fms\n", tag, 28 | sorted[0], sorted[sortedLength - 1], 29 | duration_cast>(endTime - startTime).count()); 30 | } 31 | 32 | int RadixSortLsdBenchmark(vector& uints) 33 | { 34 | vector uintsCopy(uints); 35 | vector tmp_working(uints); 36 | 37 | for (int i = 0; i < iterationCount; ++i) 38 | { 39 | for (size_t j = 0; j < uints.size(); j++) { // copy the original random array into the source array each time, since ParallelMergeSort 
modifies the source array while sorting 40 | uintsCopy[ j] = uints[j]; 41 | tmp_working[j] = (unsigned)j; // page in the destination array into system memory 42 | } 43 | // Eliminate compiler ability to optimize paging-in of the input and output arrays 44 | // Paging-in source and destination arrays leads to a 50% speed-up on Linux, and 15% on Windows 45 | 46 | vector sorted_reference(uints); 47 | sort(std::execution::par_unseq, sorted_reference.begin(), sorted_reference.end()); 48 | 49 | //printf("uintsCopy address = %p sorted address = %p value at a random location = %lu %lu\n", uintsCopy, sorted, sorted[static_cast(rd()) % uints.size()], uintsCopy[static_cast(rd()) % uints.size()]); 50 | const auto startTime = high_resolution_clock::now(); 51 | //RadixSortLSDPowerOf2Radix_unsigned_TwoPhase( uintsCopy.data(), tmp_working.data(), uints.size()); 52 | RadixSortLSDPowerOf2Radix_unsigned_TwoPhase_DeRandomize(uintsCopy.data(), tmp_working.data(), uints.size()); 53 | //sort_radix_in_place_adaptive(uintsCopy, (unsigned long)uints.size(), 0.1); 54 | //sort_radix_in_place_stable_adaptive(uintsCopy, uints.size(), 0.9); 55 | const auto endTime = high_resolution_clock::now(); 56 | print_results("Radix Sort LSD", uintsCopy.data(), uints.size(), startTime, endTime); 57 | if (!std::equal(sorted_reference.begin(), sorted_reference.end(), uintsCopy.begin())) 58 | { 59 | printf("Arrays are not equal\n"); 60 | exit(1); 61 | } 62 | } 63 | 64 | return 0; 65 | } 66 | 67 | int ParallelRadixSortLsdBenchmark(vector& uints) 68 | { 69 | vector uintsCopy( uints.size()); 70 | vector tmp_working(uints.size()); 71 | 72 | printf("\n"); 73 | // time how long it takes to sort them: 74 | for (int i = 0; i < iterationCount; ++i) 75 | { 76 | for (size_t j = 0; j < uints.size(); j++) { // copy the original random array into the source array each time, since ParallelMergeSort modifies the source array while sorting 77 | uintsCopy[ j] = uints[j]; 78 | tmp_working[j] = (unsigned)j; // page in the 
destination array into system memory 79 | } 80 | // Eliminate compiler ability to optimize paging-in of the input and output arrays 81 | // Paging-in source and destination arrays leads to a 50% speed-up on Linux, and 15% on Windows 82 | 83 | vector sorted_reference(uints); 84 | stable_sort(std::execution::par_unseq, sorted_reference.begin(), sorted_reference.end()); 85 | 86 | //printf("uintsCopy address = %p sorted address = %p value at a random location = %lu %lu\n", uintsCopy, tmp_working, tmp_working[static_cast(rd()) % uints.size()], uintsCopy[static_cast(rd()) % uints.size()]); 87 | const auto startTime = high_resolution_clock::now(); 88 | //RadixSortLSDPowerOf2Radix_unsigned_TwoPhase(uintsCopy, tmp_working, uints.size()); 89 | //RadixSortLSDPowerOf2RadixParallel_unsigned_TwoPhase_DeRandomize(uintsCopy, tmp_working, (unsigned long)uints.size()); 90 | //SortRadixPar(uintsCopy, tmp_working, uints.size(), uints.size() / 24); // slower than using all cores 91 | ParallelAlgorithms::SortRadixPar(uintsCopy.data(), tmp_working.data(), uints.size()); // fastest on 96-core Intel and AMD AWS c7 nodes 92 | const auto endTime = high_resolution_clock::now(); 93 | print_results("Parallel Radix Sort LSD", uintsCopy.data(), uints.size(), startTime, endTime); 94 | 95 | if (!std::equal(sorted_reference.begin(), sorted_reference.end(), uintsCopy.data())) 96 | { 97 | printf("Arrays are not equal\n"); 98 | exit(1); 99 | } 100 | } 101 | 102 | return 0; 103 | } 104 | -------------------------------------------------------------------------------- /RadixSortMSD.h: -------------------------------------------------------------------------------- 1 | 2 | #ifndef _RadixSortMSD_h 3 | #define _RadixSortMSD_h 4 | 5 | #include "RadixSortCommon.h" 6 | #include "InsertionSort.h" 7 | 8 | // Swap that does not check for self-assignment. 
9 | template< class _Type > 10 | inline void _swap(_Type& a, _Type& b) 11 | { 12 | _Type tmp = a; 13 | a = b; 14 | b = tmp; 15 | } 16 | 17 | // Simplified the implementation of the inner loop. 18 | template< class _Type, unsigned long PowerOfTwoRadix, unsigned long Log2ofPowerOfTwoRadix, long Threshold > 19 | inline void _RadixSort_Unsigned_PowerOf2Radix_L1(_Type* a, size_t a_size, _Type bitMask, unsigned long shiftRightAmount) 20 | { 21 | size_t last = a_size - 1; 22 | size_t count[PowerOfTwoRadix]; 23 | for (unsigned long i = 0; i < PowerOfTwoRadix; i++) count[i] = 0; 24 | for (size_t _current = 0; _current <= last; _current++) // Scan the array and count the number of times each value appears 25 | count[(unsigned)((a[_current] & bitMask) >> shiftRightAmount)]++; 26 | 27 | size_t startOfBin[PowerOfTwoRadix + 1], endOfBin[PowerOfTwoRadix], nextBin = 1; 28 | startOfBin[0] = endOfBin[0] = 0; startOfBin[PowerOfTwoRadix] = 0; // sentinal 29 | for (unsigned long i = 1; i < PowerOfTwoRadix; i++) 30 | startOfBin[i] = endOfBin[i] = startOfBin[i - 1] + count[i - 1]; 31 | 32 | for (size_t _current = 0; _current <= last; ) 33 | { 34 | unsigned digit; 35 | _Type _current_element = a[_current]; // get the compiler to recognize that a register can be used for the loop instead of a[_current] memory location 36 | while (endOfBin[digit = (unsigned)((_current_element & bitMask) >> shiftRightAmount)] != _current) _swap(_current_element, a[endOfBin[digit]++]); 37 | a[_current] = _current_element; 38 | 39 | endOfBin[digit]++; 40 | while (endOfBin[nextBin - 1] == startOfBin[nextBin]) nextBin++; // skip over empty and full bins, when the end of the current bin reaches the start of the next bin 41 | _current = endOfBin[nextBin - 1]; 42 | } 43 | bitMask >>= Log2ofPowerOfTwoRadix; 44 | if (bitMask != 0) // end recursion when all the bits have been processes 45 | { 46 | if (shiftRightAmount >= Log2ofPowerOfTwoRadix) shiftRightAmount -= Log2ofPowerOfTwoRadix; 47 | else shiftRightAmount = 0; 
48 | 49 | for (unsigned long i = 0; i < PowerOfTwoRadix; i++) 50 | { 51 | size_t numberOfElements = endOfBin[i] - startOfBin[i]; 52 | if (numberOfElements >= Threshold) // endOfBin actually points to one beyond the bin 53 | _RadixSort_Unsigned_PowerOf2Radix_L1< _Type, PowerOfTwoRadix, Log2ofPowerOfTwoRadix, Threshold >(&a[startOfBin[i]], numberOfElements, bitMask, shiftRightAmount); 54 | else if (numberOfElements >= 2) 55 | insertionSortSimilarToSTLnoSelfAssignment(&a[startOfBin[i]], numberOfElements); 56 | } 57 | } 58 | } 59 | 60 | inline void hybrid_inplace_msd_radix_sort(unsigned* a, size_t a_size) 61 | { 62 | if (a_size < 2) return; 63 | 64 | const long PowerOfTwoRadix = 256; 65 | const long Log2ofPowerOfTwoRadix = 8; 66 | const long Threshold = 48; 67 | 68 | unsigned bitMask = 0x80000000; // bitMask controls how many bits we process at a time 69 | unsigned long shiftRightAmount = 31; 70 | 71 | for (unsigned long i = 2; i < PowerOfTwoRadix; ) // if not power-of-two value then it will do up to the largest power-of-two value 72 | { // that's smaller than the value provided (e.g. 
radix-10 will do radix-8) 73 | bitMask |= (bitMask >> 1); 74 | shiftRightAmount -= 1; 75 | i <<= 1; 76 | } 77 | 78 | if (a_size >= Threshold) 79 | _RadixSort_Unsigned_PowerOf2Radix_L1< unsigned, PowerOfTwoRadix, Log2ofPowerOfTwoRadix, Threshold >(a, a_size, bitMask, shiftRightAmount); 80 | else 81 | insertionSortSimilarToSTLnoSelfAssignment( a, a_size ); 82 | //insertionSortHybrid(a, a_size); 83 | } 84 | 85 | template< unsigned PowerOfTwoRadix, unsigned Log2ofPowerOfTwoRadix, size_t Threshold, class _Type > 86 | inline void _RadixSort_StableUnsigned_PowerOf2Radix_2(_Type* a, _Type* b, size_t last, _Type bitMask, unsigned shiftRightAmount, bool inputArrayIsDestination) 87 | { 88 | const unsigned NumberOfBins = PowerOfTwoRadix; 89 | size_t count[NumberOfBins] = {}; 90 | 91 | for (size_t _current = 0; _current <= last; _current++) // Scan the array and count the number of times each value appears 92 | count[extractDigit(a[_current], bitMask, shiftRightAmount)]++; 93 | 94 | size_t startOfBin[NumberOfBins], endOfBin[NumberOfBins]; 95 | startOfBin[0] = endOfBin[0] = 0; 96 | for (unsigned i = 1; i < NumberOfBins; i++) 97 | startOfBin[i] = endOfBin[i] = startOfBin[i - 1] + count[i - 1]; 98 | 99 | for (size_t _current = 0; _current <= last; _current++) // permute array elements 100 | b[endOfBin[extractDigit(a[_current], bitMask, shiftRightAmount)]++] = a[_current]; 101 | 102 | bitMask >>= Log2ofPowerOfTwoRadix; 103 | if (bitMask != 0) // end recursion when all the bits have been processes 104 | { 105 | if (shiftRightAmount >= Log2ofPowerOfTwoRadix) shiftRightAmount -= Log2ofPowerOfTwoRadix; 106 | else shiftRightAmount = 0; 107 | inputArrayIsDestination = !inputArrayIsDestination; 108 | for (unsigned i = 0; i < NumberOfBins; i++) 109 | { 110 | size_t numOfElements = endOfBin[i] - startOfBin[i]; 111 | if (numOfElements >= Threshold) 112 | _RadixSort_StableUnsigned_PowerOf2Radix_2< PowerOfTwoRadix, Log2ofPowerOfTwoRadix, Threshold >(&b[startOfBin[i]], &a[startOfBin[i]], 
numOfElements - 1, bitMask, shiftRightAmount, inputArrayIsDestination); 113 | else { 114 | insertionSortSimilarToSTLnoSelfAssignment(&b[startOfBin[i]], numOfElements); 115 | if (inputArrayIsDestination) 116 | for (size_t j = startOfBin[i]; j < endOfBin[i]; j++) // copy from external array back into the input array 117 | a[j] = b[j]; 118 | } 119 | } 120 | } 121 | else { // Done with recursion copy all of the bins 122 | if (!inputArrayIsDestination) 123 | for (size_t _current = 0; _current <= last; _current++) // copy from external array back into the input array 124 | a[_current] = b[_current]; 125 | } 126 | } 127 | 128 | template< class _Type > 129 | inline void RadixSortMSDStablePowerOf2Radix_unsigned(_Type* a, _Type* b, size_t a_size) 130 | { 131 | const size_t Threshold = 100; // Threshold of when to switch to using Insertion Sort 132 | const unsigned PowerOfTwoRadix = 256; 133 | const unsigned Log2ofPowerOfTwoRadix = 8; 134 | // Create bit-mask and shift right amount 135 | unsigned shiftRightAmount = sizeof(_Type) * 8 - Log2ofPowerOfTwoRadix; 136 | _Type bitMask = (_Type)(((_Type)(PowerOfTwoRadix - 1)) << shiftRightAmount); // bitMask controls/selects how many and which bits we process at a time 137 | 138 | // The beauty of using template arguments instead of function parameters for the Threshold and Log2ofPowerOfTwoRadix is 139 | // they are not pushed on the stack and are treated as constants, but local. 
140 | if (a_size >= Threshold) 141 | _RadixSort_StableUnsigned_PowerOf2Radix_2< PowerOfTwoRadix, Log2ofPowerOfTwoRadix, Threshold >(a, b, a_size - 1, bitMask, shiftRightAmount, false); 142 | else 143 | insertionSortSimilarToSTLnoSelfAssignment(a, a_size); 144 | } 145 | 146 | #endif -------------------------------------------------------------------------------- /RadixSortMsdBenchmark.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | 11 | #include "RadixSortMSD.h" 12 | #include "RadixSortMsdParallel.h" 13 | 14 | using std::chrono::duration; 15 | using std::chrono::duration_cast; 16 | using std::chrono::high_resolution_clock; 17 | using std::milli; 18 | using std::random_device; 19 | using std::sort; 20 | using std::vector; 21 | 22 | const int iterationCount = 5; 23 | 24 | static void print_results(const char* const tag, const vector& sorted, size_t sortedLength, 25 | high_resolution_clock::time_point startTime, 26 | high_resolution_clock::time_point endTime) { 27 | printf("%s: Lowest: %u Highest: %u Time: %fms\n", tag, 28 | sorted[0], sorted[sortedLength - 1], 29 | duration_cast>(endTime - startTime).count()); 30 | } 31 | 32 | 33 | int RadixSortMsdBenchmark(vector& uints) 34 | { 35 | vector uintsCopy( uints); 36 | vector tmp_working(uints); 37 | //vector sorted(uints); 38 | 39 | printf("\n"); 40 | // time how long it takes to sort them: 41 | for (int i = 0; i < iterationCount; ++i) 42 | { 43 | for (size_t j = 0; j < uints.size(); j++) { // copy the original random array into the source array each time, since ParallelMergeSort modifies the source array while sorting 44 | //uints[j] = j + 2; // for pre-sorted array testing 45 | uintsCopy[j] = uints[j]; 46 | //sorted[j] = (unsigned)j; // page in the destination array into system memory 47 | } 48 | // Eliminate compiler ability to optimize paging-in of the input 
and output arrays 49 | // Paging-in source and destination arrays leads to a 50% speed-up on Linux, and 15% on Windows 50 | 51 | vector sorted_reference(uints); 52 | sort(std::execution::par_unseq, sorted_reference.begin(), sorted_reference.end()); 53 | 54 | //printf("uintsCopy address = %p sorted address = %p value at a random location = %lu %lu\n", uintsCopy, sorted, sorted[static_cast(rd()) % uints.size()], uintsCopy[static_cast(rd()) % uints.size()]); 55 | const auto startTime = high_resolution_clock::now(); 56 | //RadixSortLSDPowerOf2RadixScalar_unsigned_TwoPhase( uintsCopy.data(), sorted.data(), (unsigned long)uints.size()); 57 | //RadixSortLSDPowerOf2RadixParallel_unsigned_TwoPhase(uintsCopy.data(), sorted.data(), (unsigned long)uints.size()); 58 | //hybrid_inplace_msd_radix_sort(uintsCopy.data(), (unsigned long)uints.size()); 59 | //ParallelAlgorithms::parallel_hybrid_inplace_msd_radix_sort(uintsCopy.data(), (unsigned long)uints.size()); 60 | RadixSortMSDStablePowerOf2Radix_unsigned(uintsCopy.data(), tmp_working.data(), uints.size()); 61 | const auto endTime = high_resolution_clock::now(); 62 | print_results("Radix Sort MSD", uintsCopy, uints.size(), startTime, endTime); 63 | 64 | if (!std::equal(sorted_reference.begin(), sorted_reference.end(), uintsCopy.begin())) 65 | { 66 | printf("Arrays are not equal\n"); 67 | exit(1); 68 | } 69 | } 70 | 71 | return 0; 72 | } 73 | -------------------------------------------------------------------------------- /RadixSortMsdParallel.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #ifndef _RadixSortMsdParallel_h 4 | #define _RadixSortMsdParallel_h 5 | 6 | // TBB-only implementation 7 | #include "tbb/tbb.h" 8 | #include 9 | 10 | #include "InsertionSort.h" 11 | #include "BinarySearch.h" 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | #include 21 | 22 | using namespace tbb; 23 | 24 | #include 
"RadixSortCommon.h" 25 | #include "RadixSortMSD.h" 26 | #include "InsertionSort.h" 27 | #include "HistogramParallel.h" 28 | 29 | namespace ParallelAlgorithms 30 | { 31 | // Simplified the implementation of the inner loop. 32 | template< class _Type, unsigned long PowerOfTwoRadix, unsigned long Log2ofPowerOfTwoRadix, long Threshold > 33 | inline void _RadixSort_Unsigned_PowerOf2Radix_Par_L1(_Type* a, size_t a_size, _Type bitMask, unsigned long shiftRightAmount) 34 | { 35 | size_t last = a_size - 1; 36 | #if 0 37 | size_t count[PowerOfTwoRadix]; 38 | 39 | for (unsigned long i = 0; i < PowerOfTwoRadix; i++) count[i] = 0; 40 | for (size_t _current = 0; _current <= last; _current++) // Scan the array and count the number of times each value appears 41 | count[(unsigned long)((a[_current] & bitMask) >> shiftRightAmount)]++; 42 | #else 43 | size_t* count = HistogramOneByteComponentParallel< PowerOfTwoRadix, Log2ofPowerOfTwoRadix >(a, 0, last, shiftRightAmount); 44 | #endif 45 | 46 | size_t startOfBin[PowerOfTwoRadix + 1], endOfBin[PowerOfTwoRadix], nextBin = 1; 47 | startOfBin[0] = endOfBin[0] = 0; startOfBin[PowerOfTwoRadix] = 0; // sentinal 48 | for (unsigned long i = 1; i < PowerOfTwoRadix; i++) 49 | startOfBin[i] = endOfBin[i] = startOfBin[i - 1] + count[i - 1]; 50 | 51 | for (size_t _current = 0; _current <= last; ) 52 | { 53 | unsigned digit; 54 | _Type _current_element = a[_current]; // get the compiler to recognize that a register can be used for the loop instead of a[_current] memory location 55 | while (endOfBin[digit = (unsigned)((_current_element & bitMask) >> shiftRightAmount)] != _current) _swap(_current_element, a[endOfBin[digit]++]); 56 | a[_current] = _current_element; 57 | 58 | endOfBin[digit]++; 59 | while (endOfBin[nextBin - 1] == startOfBin[nextBin]) nextBin++; // skip over empty and full bins, when the end of the current bin reaches the start of the next bin 60 | _current = endOfBin[nextBin - 1]; 61 | } 62 | 63 | bitMask >>= Log2ofPowerOfTwoRadix; 64 
| if (bitMask != 0) // end recursion when all the bits have been processes 65 | { 66 | if (shiftRightAmount >= Log2ofPowerOfTwoRadix) shiftRightAmount -= Log2ofPowerOfTwoRadix; 67 | else shiftRightAmount = 0; 68 | 69 | #if 0 70 | for (unsigned long i = 0; i < PowerOfTwoRadix; i++) 71 | { 72 | size_t numberOfElements = endOfBin[i] - startOfBin[i]; // endOfBin actually points to one beyond the bin 73 | if (numberOfElements >= Threshold) 74 | _RadixSort_Unsigned_PowerOf2Radix_Par_L1< _Type, PowerOfTwoRadix, Log2ofPowerOfTwoRadix, Threshold >(&a[startOfBin[i]], numberOfElements, bitMask, shiftRightAmount); 75 | else if (numberOfElements >= 2) 76 | insertionSortSimilarToSTLnoSelfAssignment(&a[startOfBin[i]], numberOfElements); 77 | } 78 | #else 79 | // Multi-core version of the algorithm 80 | #if defined(USE_PPL) 81 | Concurrency::task_group g; 82 | #else 83 | tbb::task_group g; 84 | #endif 85 | for (unsigned long i = 0; i < PowerOfTwoRadix; i++) 86 | { 87 | size_t numberOfElements = endOfBin[i] - startOfBin[i]; 88 | if (numberOfElements >= Threshold) // endOfBin actually points to one beyond the bin 89 | g.run([=] { // important to not pass by reference, as all tasks will then get the same/last value 90 | _RadixSort_Unsigned_PowerOf2Radix_Par_L1< _Type, PowerOfTwoRadix, Log2ofPowerOfTwoRadix, Threshold >(&a[startOfBin[i]], numberOfElements, bitMask, shiftRightAmount); 91 | }); 92 | else if (numberOfElements >= 2) 93 | insertionSortSimilarToSTLnoSelfAssignment(&a[startOfBin[i]], numberOfElements); 94 | } 95 | g.wait(); 96 | #endif 97 | } 98 | } 99 | 100 | // Permute phase of MSD Radix Sort with de-randomized write memory accesses 101 | // Derandomizes system memory accesses by buffering all Radix bin accesses, turning 256-bin random memory writes into sequential writes 102 | // Separates read pointers from write pointers of/to each bin 103 | // Idea: It may be better to implement read/write buffering where all of the swaps happen within those buffers with writes dumping 
out to the bins and fetching the next buffer 104 | // It's similar to caching, but doing it in a more cache-friendly way where all of the bin-buffers can fit into the cache and not map on top of each other. Otherwise, with 105 | // bins we get data-dependent cache thrashing. Plus, all of the swapping will be within the buffers which are well organized for caching. 106 | template< unsigned long PowerOfTwoRadix, unsigned long Log2ofPowerOfTwoRadix, long Threshold, unsigned long BufferDepth> 107 | inline void _RadixSortMSD_StableUnsigned_PowerOf2Radix_PermuteDerandomized_1(unsigned long* inout_array, size_t startIndex, size_t endIndex, unsigned long bitMask, unsigned long shiftRightAmount, 108 | size_t* startOfBin, size_t* endOfBin, unsigned long bufferIndex[], unsigned long bufferDerandomize[][BufferDepth]) 109 | { 110 | // TODO: This version is broken and needs to be fixed!! 111 | 112 | //printf("Permute Derandomized #1: startIndex = %zu endIndex = %zu bitMask = %lx shiftRight = %lu \n", startIndex, endIndex, bitMask, shiftRightAmount); 113 | const unsigned long NumberOfBins = PowerOfTwoRadix; 114 | size_t writeEndOfBin[NumberOfBins]; // write pointers to each bin 115 | std::copy(endOfBin + 0, endOfBin + NumberOfBins, writeEndOfBin); // copy read pointers (endOfBin) to write pointers 116 | 117 | size_t nextBin = 1; 118 | for (size_t _current = startIndex; _current <= endIndex;) 119 | { 120 | unsigned long digit; 121 | unsigned long _current_element = inout_array[_current]; // get the compiler to recognize that a register can be used for the loop instead of a[_current] memory location 122 | while (endOfBin[digit = (unsigned long)((_current_element & bitMask) >> shiftRightAmount)] != _current) 123 | { 124 | unsigned long tmp = _current_element; // read of the current element to squirl it away 125 | _current_element = inout_array[endOfBin[digit]]; // read from a bin and place in the current element 126 | endOfBin[digit]++; // advance the read pointer of that bin 127 | 
128 | //a[endOfBin[digit]] = tmp; // write the current element to that bin. This is the one (the write) which needs to be buffered 129 | 130 | if (bufferIndex[digit] < BufferDepth) 131 | { 132 | bufferDerandomize[digit][bufferIndex[digit]++] = tmp; // write the current element into the buffer for that bin 133 | } 134 | else 135 | { 136 | unsigned long outIndex = writeEndOfBin[digit]; 137 | unsigned long* buff = &(bufferDerandomize[digit][0]); 138 | #if 1 139 | memcpy(&(inout_array[outIndex]), buff, BufferDepth * sizeof(unsigned long)); // significantly faster than a for loop. Need to try std::copy - simpler interface 140 | #else 141 | unsigned long* outBuff = &(output_array[outIndex]); 142 | for (unsigned long i = 0; i < BufferDepth; i++) 143 | *outBuff++ = *buff++; 144 | #endif 145 | writeEndOfBin[digit] += BufferDepth; 146 | bufferDerandomize[digit][0] = tmp; // write the current element into the buffer for that bin 147 | bufferIndex[digit] = 1; 148 | } 149 | } 150 | inout_array[_current] = _current_element; // write the current element into the current bin 151 | endOfBin[digit]++; // advance the read pointer 152 | writeEndOfBin[digit]++; // advance the write pointer 153 | 154 | while (endOfBin[nextBin - 1] == startOfBin[nextBin]) nextBin++; // skip over empty and full bins, when the end of the current bin reaches the start of the next bin 155 | _current = endOfBin[nextBin - 1]; 156 | } 157 | // Flush all the derandomization buffers 158 | for (unsigned long whichBuff = 0; whichBuff < NumberOfBins; whichBuff++) 159 | { 160 | unsigned long numOfElementsInBuff = bufferIndex[whichBuff]; 161 | for (size_t i = 0; i < numOfElementsInBuff; i++) 162 | inout_array[writeEndOfBin[whichBuff]++] = bufferDerandomize[whichBuff][i]; 163 | bufferIndex[whichBuff] = 0; 164 | } 165 | } 166 | 167 | // Permute phase of MSD Radix Sort with de-randomized write memory accesses 168 | // Derandomizes system memory accesses by buffering all Radix bin accesses, turning 256-bin random memory 
writes into sequential writes
    // Permute phase of MSD Radix Sort with de-randomized write memory accesses.
    // Routes each element into a small per-bin buffer; a full buffer is flushed to its
    // bin with a single sequential memcpy, turning 256-way random writes into streams.
    template< unsigned long PowerOfTwoRadix, unsigned long Log2ofPowerOfTwoRadix, long Threshold, unsigned long BufferDepth>
    inline void _RadixSortMSD_StableUnsigned_PowerOf2Radix_PermuteDerandomized(unsigned long* inout_array, size_t startIndex, size_t endIndex, unsigned long bitMask, unsigned long shiftRightAmount,
        size_t* endOfBin, unsigned long bufferIndex[], unsigned long bufferDerandomize[][BufferDepth])
    {
        // NOTE(review): debug trace left enabled — remove or guard before benchmarking runs.
        printf("Permute Derandomized: startIndex = %zu endIndex = %zu bitMask = %lx shiftRight = %lu \n", startIndex, endIndex, bitMask, shiftRightAmount);
        const unsigned long NumberOfBins = PowerOfTwoRadix;

        for (size_t _current = startIndex; _current <= endIndex; _current++)
        {
            unsigned long digit = extractDigit(inout_array[_current], bitMask, shiftRightAmount);
            if (bufferIndex[digit] < BufferDepth)
            {
                bufferDerandomize[digit][bufferIndex[digit]++] = inout_array[_current];
            }
            else
            {
                // FIX: outIndex was declared unsigned long, which truncates the size_t bin
                // offset on LLP64 platforms (Windows) for arrays beyond 2^32 elements.
                size_t outIndex = endOfBin[digit];
                unsigned long* buff = &(bufferDerandomize[digit][0]);
#if 1
                memcpy(&(inout_array[outIndex]), buff, BufferDepth * sizeof(unsigned long));    // significantly faster than a for loop
#else
                // FIX: this disabled branch referenced a nonexistent `output_array`; the
                // destination in this in-place variant is inout_array.
                unsigned long* outBuff = &(inout_array[outIndex]);
                for (unsigned long i = 0; i < BufferDepth; i++)
                    *outBuff++ = *buff++;
#endif
                endOfBin[digit] += BufferDepth;
                bufferDerandomize[digit][0] = inout_array[_current];
                bufferIndex[digit] = 1;
            }
        }
        // Flush all the derandomization buffers
        for (unsigned long whichBuff = 0; whichBuff < NumberOfBins; whichBuff++)
        {
            size_t numOfElementsInBuff = bufferIndex[whichBuff];    // size_t for consistency (was unsigned long)
            for (size_t i = 0; i < numOfElementsInBuff; i++)
                inout_array[endOfBin[whichBuff]++] = bufferDerandomize[whichBuff][i];
            bufferIndex[whichBuff] = 0;
        }
    }

    // Simplified the implementation of the
inner loop. 210 | template< unsigned long PowerOfTwoRadix, unsigned long Log2ofPowerOfTwoRadix, long Threshold > 211 | inline void _RadixSort_Unsigned_PowerOf2Radix_Derandomized_Par_L1(unsigned long* a, size_t a_size, unsigned long bitMask, unsigned long shiftRightAmount) 212 | { 213 | size_t last = a_size - 1; 214 | #if 0 215 | size_t count[PowerOfTwoRadix]; 216 | 217 | for (unsigned long i = 0; i < PowerOfTwoRadix; i++) count[i] = 0; 218 | for (size_t _current = 0; _current <= last; _current++) // Scan the array and count the number of times each value appears 219 | count[(unsigned long)((a[_current] & bitMask) >> shiftRightAmount)]++; 220 | #else 221 | size_t* count = HistogramOneByteComponentParallel< PowerOfTwoRadix, Log2ofPowerOfTwoRadix >(a, 0, last, shiftRightAmount); 222 | #endif 223 | 224 | size_t startOfBin[PowerOfTwoRadix + 1], endOfBin[PowerOfTwoRadix]; 225 | startOfBin[0] = endOfBin[0] = 0; startOfBin[PowerOfTwoRadix] = 0; // sentinal 226 | for (unsigned long i = 1; i < PowerOfTwoRadix; i++) 227 | startOfBin[i] = endOfBin[i] = startOfBin[i - 1] + count[i - 1]; 228 | 229 | #if 1 230 | size_t nextBin = 1; 231 | for (size_t _current = 0; _current <= last; ) 232 | { 233 | unsigned long digit; 234 | unsigned long _current_element = a[_current]; // get the compiler to recognize that a register can be used for the loop instead of a[_current] memory location 235 | while (endOfBin[digit = (unsigned long)((_current_element & bitMask) >> shiftRightAmount)] != _current) _swap(_current_element, a[endOfBin[digit]++]); 236 | a[_current] = _current_element; 237 | 238 | endOfBin[digit]++; 239 | while (endOfBin[nextBin - 1] == startOfBin[nextBin]) nextBin++; // skip over empty and full bins, when the end of the current bin reaches the start of the next bin 240 | _current = endOfBin[nextBin - 1]; 241 | } 242 | #else 243 | // TODO: This version is broken and needs to be fixed!! 
244 | const unsigned long NumberOfBins = PowerOfTwoRadix; 245 | static const unsigned long bufferDepth = 128; 246 | __declspec(align(64)) unsigned long bufferDerandomize[NumberOfBins][bufferDepth]; 247 | __declspec(align(64)) unsigned long bufferIndex[NumberOfBins] = { 0 }; 248 | 249 | //printf("Before Permute Derandomized \n"); 250 | _RadixSortMSD_StableUnsigned_PowerOf2Radix_PermuteDerandomized_1< PowerOfTwoRadix, Log2ofPowerOfTwoRadix, Threshold, bufferDepth>( 251 | a, 0, last, bitMask, shiftRightAmount, startOfBin, endOfBin, bufferIndex, bufferDerandomize); 252 | //printf("After Permute Derandomized \n"); 253 | #endif 254 | bitMask >>= Log2ofPowerOfTwoRadix; 255 | if (bitMask != 0) // end recursion when all the bits have been processes 256 | { 257 | if (shiftRightAmount >= Log2ofPowerOfTwoRadix) shiftRightAmount -= Log2ofPowerOfTwoRadix; 258 | else shiftRightAmount = 0; 259 | 260 | #if 0 261 | for (unsigned long i = 0; i < PowerOfTwoRadix; i++) 262 | { 263 | size_t numberOfElements = endOfBin[i] - startOfBin[i]; // endOfBin actually points to one beyond the bin 264 | if (numberOfElements >= Threshold) 265 | _RadixSort_Unsigned_PowerOf2Radix_Derandomized_Par_L1< PowerOfTwoRadix, Log2ofPowerOfTwoRadix, Threshold >(&a[startOfBin[i]], numberOfElements, bitMask, shiftRightAmount); 266 | else if (numberOfElements >= 2) 267 | insertionSortSimilarToSTLnoSelfAssignment(&a[startOfBin[i]], numberOfElements); 268 | } 269 | #else 270 | // Multi-core version of the algorithm 271 | #if defined(USE_PPL) 272 | Concurrency::task_group g; 273 | #else 274 | tbb::task_group g; 275 | #endif 276 | for (unsigned long i = 0; i < PowerOfTwoRadix; i++) 277 | { 278 | size_t numberOfElements = endOfBin[i] - startOfBin[i]; 279 | if (numberOfElements >= Threshold) // endOfBin actually points to one beyond the bin 280 | g.run([=] { // important to not pass by reference, as all tasks will then get the same/last value 281 | _RadixSort_Unsigned_PowerOf2Radix_Derandomized_Par_L1< PowerOfTwoRadix, 
Log2ofPowerOfTwoRadix, Threshold >(&a[startOfBin[i]], numberOfElements, bitMask, shiftRightAmount); 282 | }); 283 | else if (numberOfElements >= 2) 284 | insertionSortSimilarToSTLnoSelfAssignment(&a[startOfBin[i]], numberOfElements); 285 | } 286 | g.wait(); 287 | #endif 288 | } 289 | } 290 | 291 | inline void parallel_hybrid_inplace_msd_radix_sort(unsigned* a, size_t a_size) 292 | { 293 | if (a_size < 2) return; 294 | 295 | const long PowerOfTwoRadix = 256; 296 | const long Log2ofPowerOfTwoRadix = 8; 297 | const long Threshold = 100; 298 | 299 | unsigned bitMask = 0x80000000; // bitMask controls how many bits we process at a time 300 | unsigned shiftRightAmount = 31; 301 | 302 | for (size_t i = 2; i < PowerOfTwoRadix; ) // if not power-of-two value then it will do up to the largest power-of-two value 303 | { // that's smaller than the value provided (e.g. radix-10 will do radix-8) 304 | bitMask |= (bitMask >> 1); 305 | shiftRightAmount -= 1; 306 | i <<= 1; 307 | } 308 | 309 | if (a_size >= Threshold) 310 | { 311 | _RadixSort_Unsigned_PowerOf2Radix_Par_L1< unsigned, PowerOfTwoRadix, Log2ofPowerOfTwoRadix, Threshold >(a, a_size, bitMask, shiftRightAmount); // same speed as de-randomization on 6-core 312 | //_RadixSort_Unsigned_PowerOf2Radix_Derandomized_Par_L1< PowerOfTwoRadix, Log2ofPowerOfTwoRadix, Threshold >(a, a_size, bitMask, shiftRightAmount); 313 | } 314 | else 315 | insertionSortSimilarToSTLnoSelfAssignment(a, a_size); 316 | //insertionSortHybrid(a, a_size); 317 | } 318 | } 319 | #endif -------------------------------------------------------------------------------- /SortParallel.h: -------------------------------------------------------------------------------- 1 | // TODO: Benchmark how long memory allocation takes 2 | // TODO: Benchmark how much better algorithm does where dst/working buffer is provided, versus one that is provided and paged in 3 | #pragma once 4 | 5 | #include "Configuration.h" 6 | 7 | #include 8 | #include 9 | #include 10 | #include 11 
| #include 12 | #include 13 | #include 14 | #include 15 | 16 | #include "ParallelMergeSort.h" 17 | 18 | namespace ParallelAlgorithms 19 | { 20 | // Sort the entire array of any data type with comparable elements 21 | // Adaptive algorithm: if enough memory to allocate a temporary working buffer, then faster not-in-place parallel merge sort is used. 22 | // if not enough memory, then the standard C++ in-place parallel sort is used, which is slower. 23 | template< class _Type > 24 | inline void sort_par(_Type* src, size_t src_size) 25 | { 26 | ParallelAlgorithms::sort_par(src, 0, src_size); 27 | } 28 | 29 | template< class _Type > 30 | inline void sort_par(std::vector<_Type>& src) 31 | { 32 | ParallelAlgorithms::sort_par(src, 0, src.size()); 33 | } 34 | 35 | // Array bounds includes l/left, but does not include r/right 36 | template< class _Type > 37 | inline void sort_par(_Type* src, size_t l, size_t r) 38 | { 39 | size_t src_size = r; 40 | _Type* sorted = new(std::nothrow) _Type[src_size]; 41 | 42 | if (!sorted) 43 | sort(std::execution::par_unseq, src + l, src + r); 44 | else 45 | { 46 | ParallelAlgorithms::parallel_merge_sort_hybrid_rh_1(src, l, r - 1, sorted, false); // r - 1 because this algorithm wants inclusive bounds 47 | 48 | delete[] sorted; 49 | } 50 | } 51 | 52 | // Array bounds includes l/left, but does not include r/right 53 | template< class _Type > 54 | inline void sort_par(std::vector<_Type>& src, size_t l, size_t r) 55 | { 56 | try 57 | { 58 | size_t src_size = r; 59 | std::vector<_Type> sorted(src_size); 60 | ParallelAlgorithms::parallel_merge_sort_hybrid_rh_1(src.data(), l, r - 1, sorted.data(), false); // r - 1 because this algorithm wants inclusive bounds 61 | } 62 | catch (std::bad_alloc& ba) 63 | { 64 | sort(std::execution::par_unseq, src.begin() + l, src.begin() + r); 65 | } 66 | } 67 | 68 | // Array bounds includes l/left, but does not include r/right 69 | // dst buffer must be large enough to provide elements dst[0 to r-1], as the result 
is placed in dst[l to r-1] 70 | // Two use cases: 71 | // - in-place interface, where the dst buffer is a temporary work buffer 72 | // - not-in-place interface, where the dst buffer is the destination memory buffer 73 | template< class _Type > 74 | inline void sort_par(_Type* src, size_t l, size_t r, _Type* dst, size_t dst_size, bool srcToDst = false) 75 | { 76 | if (!dst) 77 | throw std::invalid_argument("dst is null, which is not supported"); 78 | size_t src_size = r; 79 | if (dst_size < src_size) 80 | throw std::invalid_argument("dst_size must be larger or equal to r, to be able to return dst[l to r-1]"); 81 | 82 | ParallelAlgorithms::parallel_merge_sort_hybrid_rh_2(src, l, r - 1, dst, srcToDst); // r - 1 because this algorithm wants inclusive bounds 83 | } 84 | 85 | // dst buffer must be the same or larger in size than the src 86 | // Two use cases: 87 | // - in-place interface, where the dst buffer is a temporary work buffer 88 | // - not-in-place interface, where the dst buffer is the destination memory buffer 89 | template< class _Type > 90 | inline void sort_par(_Type* src, size_t src_size, _Type* dst, size_t dst_size, bool srcToDst = false) 91 | { 92 | ParallelAlgorithms::sort_par(src, (size_t)0, src_size - 1, dst, dst_size, srcToDst); 93 | } 94 | 95 | } -------------------------------------------------------------------------------- /StdParallelSortMemoryLeakDemo.cpp: -------------------------------------------------------------------------------- 1 | // ParallelAlgorithms main application entry point 2 | #if 0 3 | 4 | #include 5 | #include 6 | 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | 15 | using std::random_device; 16 | using std::vector; 17 | using std::chrono::duration; 18 | using std::chrono::duration_cast; 19 | using std::chrono::high_resolution_clock; 20 | using std::milli; 21 | using std::random_device; 22 | using std::sort; 23 | using std::vector; 24 | 25 | static const int iterationCount 
= 1000; 26 | 27 | static void print_results(const char* const tag, const vector& sorted, high_resolution_clock::time_point startTime, high_resolution_clock::time_point endTime) 28 | { 29 | printf("%s: Lowest: %u Highest: %u Time: %fms\n", tag, sorted.front(), sorted.back(), 30 | duration_cast>(endTime - startTime).count()); 31 | } 32 | 33 | static int ParallelStdCppExample(vector& uints, bool stable = false) 34 | { 35 | for (int i = 0; i < iterationCount; ++i) 36 | { 37 | vector sorted(uints); 38 | const auto startTime = high_resolution_clock::now(); 39 | // same sort call as above, but with par_unseq: 40 | if (!stable) 41 | { 42 | //sort(std::execution::par_unseq, sorted.begin(), sorted.end()); 43 | sort(oneapi::dpl::execution::par_unseq, sorted.begin(), sorted.end()); 44 | } 45 | else 46 | { 47 | //stable_sort(std::execution::par_unseq, sorted.begin(), sorted.end()); 48 | stable_sort(oneapi::dpl::execution::par_unseq, sorted.begin(), sorted.end()); 49 | } 50 | const auto endTime = high_resolution_clock::now(); 51 | // in our output, note that these are the parallel results: 52 | print_results("Parallel", sorted, startTime, endTime); 53 | } 54 | 55 | return 0; 56 | } 57 | 58 | 59 | //int main() 60 | int std_parallel_sort_leak_demo() 61 | { 62 | // Test configuration options 63 | bool UseStableStdSort = false; 64 | 65 | // Provide the same input random array of doubles to all sorting algorithms 66 | const size_t testSize = 1'000'000'000; 67 | //random_device rd; 68 | std::mt19937_64 dist(1234); 69 | 70 | // generate some random unsigned integers: 71 | printf("\nTesting with %zu random unsigned integers...\n\n", testSize); 72 | vector uints(testSize); 73 | for (auto& d : uints) { 74 | //d = static_cast(rd()); 75 | d = static_cast(dist()); // way faster on Linux 76 | } 77 | // Example of C++17 Standard C++ Parallel Sorting 78 | ParallelStdCppExample(uints, UseStableStdSort); 79 | 80 | return 0; 81 | } 82 | #endif 
-------------------------------------------------------------------------------- /SumBenchmark.cpp: -------------------------------------------------------------------------------- 1 | #if defined(WIN32) || defined(_WIN32) || defined(__WIN32) && !defined(__CYGWIN__) 2 | // #define DPL_ALGORITHMS // Includes Intel's OneAPI parallel algorithm implementations 3 | #define MICROSOFT_ALGORITHMS // Excludes single-core SIMD implementations, which Microsoft does not support 4 | #endif 5 | 6 | #ifdef DPL_ALGORITHMS 7 | // oneDPL headers should be included before standard headers 8 | #include 9 | #include 10 | #include 11 | #else 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | #endif 21 | 22 | //#include 23 | //#define __TBB_PREVIEW_TASK_ARENA_CONSTRAINTS_EXTENSION_PRESENT 1 24 | //#include 25 | 26 | #include "SumParallel.h" 27 | 28 | using std::chrono::duration; 29 | using std::chrono::duration_cast; 30 | using std::chrono::high_resolution_clock; 31 | using std::milli; 32 | using std::random_device; 33 | using std::sort; 34 | using std::vector; 35 | 36 | const int iterationCount = 20; 37 | 38 | //extern void print_results(const char* const tag, const unsigned long long sum, size_t sum_array_length, 39 | // high_resolution_clock::time_point startTime, high_resolution_clock::time_point endTime); 40 | 41 | void print_results(const char* const tag, const unsigned long long sum, size_t sum_array_length, 42 | high_resolution_clock::time_point startTime, 43 | high_resolution_clock::time_point endTime) 44 | { 45 | printf("%s: Sum: %llu Array Length: %zu Time: %fms Throughput: %llu millions/second\n", tag, sum, sum_array_length, 46 | duration_cast>(endTime - startTime).count(), (unsigned long long)(sum_array_length / duration_cast>(endTime - startTime).count() * 1000.0 / 1000000.0)); 47 | } 48 | void print_results(const char* const tag, const unsigned long long sum, size_t sum_array_length, 49 | 
high_resolution_clock::time_point startTime, 50 | high_resolution_clock::time_point endTime, 51 | double thruput_average, double thruput_std_dev 52 | ) 53 | { 54 | printf("%s: Sum: %llu Array Length: %zu Time: %fms Throughput Average: %.lf million Standard Deviation: %.lf\n", tag, sum, sum_array_length, 55 | duration_cast>(endTime - startTime).count(), thruput_average, thruput_std_dev); 56 | } 57 | 58 | // From: https://stackoverflow.com/questions/7616511/calculate-mean-and-standard-deviation-from-a-vector-of-samples-in-c-using-boos 59 | double std_deviation(vector& v) 60 | { 61 | double sum = std::accumulate(v.begin(), v.end(), 0.0); 62 | double mean = sum / v.size(); 63 | 64 | std::vector diff(v.size()); 65 | //std::transform(v.begin(), v.end(), diff.begin(), std::bind2nd(std::minus(), mean)); 66 | std::transform(v.begin(), v.end(), diff.begin(), [mean](double x) { return x - mean; }); 67 | double sq_sum = std::inner_product(diff.begin(), diff.end(), diff.begin(), 0.0); 68 | double stdev = std::sqrt(sq_sum / v.size()); 69 | return(stdev); 70 | } 71 | 72 | int SumBenchmarkChar(vector& uints) 73 | { 74 | vector u8Copy(uints.size()); 75 | vector u8Array(uints.size()); 76 | 77 | // time how long it takes to sort them: 78 | for (int i = 0; i < iterationCount; ++i) 79 | { 80 | for (size_t j = 0; j < uints.size(); j++) { // copy the original random array into the source array each time, since ParallelMergeSort modifies the source array while sorting 81 | u8Array[j] = (unsigned char)uints[j]; 82 | u8Copy[ j] = (unsigned char)uints[j]; 83 | } 84 | // Eliminate compiler ability to optimize paging-in of the input and output arrays 85 | // Paging-in source and destination arrays leads to a 50% speed-up on Linux, and 15% on Windows 86 | 87 | const auto startTimeRef = high_resolution_clock::now(); 88 | unsigned long long sum_ref = 0; 89 | //for (size_t i = 0; i < uints.size(); i++) 90 | // sum_ref += u8Copy[i]; 91 | sum_ref = std::accumulate(u8Copy.begin(), u8Copy.end(), 
0ULL); 92 | const auto endTimeRef = high_resolution_clock::now(); 93 | print_results("std::accumulate", sum_ref, u8Copy.size(), startTimeRef, endTimeRef); 94 | 95 | unsigned long long sum = 0; 96 | //for (size_t k = 0; k < 100; k++) 97 | //{ 98 | const auto startTime = high_resolution_clock::now(); 99 | //long long sum = ParallelAlgorithms::SumParallel(u8Array, 0, uints.size()); 100 | //sum = ParallelAlgorithms::SumParallel(u8Array, 0, uints.size(), uints.size() / 24); // Running on 24-core is fastest, however with 2.7X run-to-run variation 101 | sum = ParallelAlgorithms::SumParallel(u8Array.data(), 0, uints.size()); 102 | const auto endTime = high_resolution_clock::now(); 103 | print_results("Parallel Sum of uchars", sum, uints.size(), startTime, endTime); 104 | //} 105 | if (sum == sum_ref) 106 | printf("Sums are equal\n"); 107 | else 108 | { 109 | printf("Sums are not equal\n"); 110 | exit(1); 111 | } 112 | } 113 | return 0; 114 | } 115 | 116 | int SumBenchmark(vector& uints) 117 | { 118 | vector u32Copy( uints.size()); 119 | vector u32Array(uints.size()); 120 | 121 | // time how long it takes to sort them: 122 | for (int i = 0; i < iterationCount; ++i) 123 | { 124 | for (size_t j = 0; j < uints.size(); j++) { // copy the original random array into the source array each time, since ParallelMergeSort modifies the source array while sorting 125 | u32Array[j] = (unsigned)uints[j]; 126 | u32Copy[ j] = (unsigned)uints[j]; 127 | } 128 | // Eliminate compiler ability to optimize paging-in of the input and output arrays 129 | // Paging-in source and destination arrays leads to a 50% speed-up on Linux, and 15% on Windows 130 | 131 | const auto startTimeRef = high_resolution_clock::now(); 132 | long long sum_ref = 0; 133 | //for (size_t i = 0; i < uints.size(); i++) 134 | // sum_ref += u8Copy[i]; 135 | sum_ref = std::accumulate(u32Copy.begin(), u32Copy.end(), 0LL); 136 | const auto endTimeRef = high_resolution_clock::now(); 137 | print_results("std::accumulate", sum_ref, 
u32Copy.size(), startTimeRef, endTimeRef); 138 | 139 | long long sum = 0; 140 | //for (size_t k = 0; k < 100; k++) 141 | //{ 142 | const auto startTime = high_resolution_clock::now(); 143 | //long long sum = ParallelAlgorithms::SumParallel(u8Array, 0, uints.size()); 144 | //sum = ParallelAlgorithms::SumParallel(u8Array, 0, uints.size(), uints.size() / 24); // Running on 24-core is fastest, however with 2.7X run-to-run variation 145 | //sum = ParallelAlgorithms::SumParallel(u32Array.data(), 0, uints.size()); 146 | sum = ParallelAlgorithms::SumParallelNonRecursive(u32Array.data(), 0, uints.size()); 147 | //sum = ParallelAlgorithms::SumParallelNonRecursiveBuffered(u32Array.data(), 0, uints.size()); 148 | //sum = ParallelAlgorithms::SumParallelNonRecursiveBufferedLocally(u32Array.data(), 0, uints.size()); 149 | //sum = ParallelAlgorithms::SumParallelNonRecursiveNoHyperthreading(u32Array.data(), 0, uints.size()); 150 | //sum = ParallelAlgorithms::SumParallelNonRecursiveBufferedLocallyNoHyperthreading(u32Array.data(), 0, uints.size()); 151 | const auto endTime = high_resolution_clock::now(); 152 | print_results("Parallel Sum of unsigned", sum, uints.size(), startTime, endTime); 153 | //} 154 | if (sum == sum_ref) 155 | printf("Sums are equal\n"); 156 | else 157 | { 158 | printf("Sums are not equal\n"); 159 | exit(1); 160 | } 161 | } 162 | return 0; 163 | } 164 | 165 | int SumBenchmark64(vector& uints) 166 | { 167 | vector u64Copy( uints.size()); 168 | vector u64Array(uints.size()); 169 | size_t num_times = 10; 170 | double thruput_sum; 171 | std::vector thruputs(num_times); 172 | 173 | // time how long it takes to sort them: 174 | for (int i = 0; i < iterationCount; ++i) 175 | { 176 | for (size_t j = 0; j < uints.size(); j++) { // copy the original random array into the source array each time, since ParallelMergeSort modifies the source array while sorting 177 | u64Array[j] = (unsigned long long)uints[j]; 178 | u64Copy[ j] = (unsigned long long)uints[j]; 179 | } 180 | // 
Eliminate compiler ability to optimize paging-in of the input and output arrays 181 | // Paging-in source and destination arrays leads to a 50% speed-up on Linux, and 15% on Windows 182 | 183 | const auto startTimeRef = high_resolution_clock::now(); 184 | unsigned long long sum_ref = 0; 185 | //for (size_t i = 0; i < uints.size(); i++) 186 | // sum_ref += u64Copy[i]; 187 | //sum_ref = std::accumulate(u64Copy.begin(), u64Copy.end(), 0ULL); 188 | sum_ref = std::accumulate(u64Copy.begin(), u64Copy.end(), 0ULL); 189 | //std::fill(oneapi::dpl::execution::par_unseq, u64Copy.begin(), u64Copy.end(), 42); 190 | //std::fill(u64Copy.begin(), u64Copy.end(), 42); 191 | const auto endTimeRef = high_resolution_clock::now(); 192 | print_results("std::accumulate", sum_ref, u64Copy.size(), startTimeRef, endTimeRef); 193 | //unsigned long long sum_array[1000] = { 0 }; 194 | 195 | auto startTime = high_resolution_clock::now(); 196 | auto endTime = high_resolution_clock::now(); 197 | unsigned long long sum = 0; 198 | thruput_sum = 0.0; 199 | for (size_t j = 0; j < num_times; ++j) 200 | { 201 | startTime = high_resolution_clock::now(); 202 | sum = 0; 203 | 204 | //unsigned long long sum = ParallelAlgorithms::SumParallel(u64Array, 0, uints.size()); // Running on 24-core is fastest, however with 2.7X run-to-run variation 205 | //unsigned long long sum = ParallelAlgorithms::SumParallel(u64Array, 0, uints.size(), uints.size() / 24); // Running on 24-core is fastest, however with 2.7X run-to-run variation 206 | //unsigned long long sum = ParallelAlgorithms::SumParallelNonRecursive(u64Array, 0, uints.size()); 207 | //unsigned long long sum = ParallelAlgorithms::SumParallelNonRecursive(u64Array, 0, uints.size(), uints.size() / 8); 208 | //sum = ParallelAlgorithms::SumParallelNonRecursiveNoHyperthreading(u64Array, 0, uints.size(), uints.size() / 14); 209 | //sum = ParallelAlgorithms::SumNonRecursive(u64Array, 0, uints.size(), uints.size() / 2); 210 | //sum = 
ParallelAlgorithms::SumParallelNonRecursive(u64Array, 0, uints.size(), uints.size() / 4); 211 | //sum = ParallelAlgorithms::SumParallelNonRecursive(u64Array, 0, uints.size(), sum_array); 212 | //sum = ParallelAlgorithms::SumParallelNonRecursive(u64Array.data(), 0, uints.size()); 213 | //sum = ParallelAlgorithms::SumParallelNonRecursiveBuffered(u64Array, 0, uints.size()); 214 | sum = ParallelAlgorithms::SumParallel(u64Array.data(), 0, uints.size()); 215 | //sum = ParallelAlgorithms::SumParallel(u64Array, 0, uints.size(), uints.size() / 4); 216 | //sum = ParallelAlgorithms::SumParallel(u64Array, 0, uints.size(), uints.size() / 16); // highest performance with /15 and /17 at half the performance 217 | 218 | endTime = high_resolution_clock::now(); 219 | thruputs[j] = (double)uints.size() / (duration_cast>(endTime - startTime).count() / 1000.0) / 1000000.0; 220 | thruput_sum += (double)uints.size() / (duration_cast>(endTime - startTime).count() / 1000.0) / 1000000.0; 221 | if (sum != sum_ref) 222 | { 223 | printf("Sums are not equal\n"); 224 | exit(1); 225 | } 226 | } 227 | print_results("Parallel 64-bit Sum", sum, uints.size(), startTime, endTime, thruput_sum / num_times, std_deviation(thruputs)); 228 | } 229 | return 0; 230 | } 231 | -------------------------------------------------------------------------------- /SumParallel.h: -------------------------------------------------------------------------------- 1 | // TODO: Implement a more efficient suggestion of using task_group to split the array into chunks with each returning its sum into one index of an array of sums 2 | #pragma once 3 | 4 | // Parallel Sum implementations 5 | 6 | #ifndef _SumParallel_h 7 | #define _SumParallel_h 8 | 9 | #include "Configuration.h" 10 | 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | 20 | #if defined(WIN32) || defined(_WIN32) || defined(__WIN32) && !defined(__CYGWIN__) 21 | #define 
__TBB_PREVIEW_TASK_ARENA_CONSTRAINTS_EXTENSION_PRESENT 1 22 | #include 23 | #endif 24 | 25 | #include "RadixSortMsdParallel.h" 26 | #include "FillParallel.h" 27 | 28 | using std::chrono::duration; 29 | using std::chrono::duration_cast; 30 | using std::chrono::high_resolution_clock; 31 | using std::milli; 32 | using std::random_device; 33 | using std::sort; 34 | using std::vector; 35 | 36 | 37 | namespace ParallelAlgorithms 38 | { 39 | // left (l) boundary is inclusive and right (r) boundary is exclusive 40 | inline unsigned long long Sum(unsigned long long in_array[], size_t l, size_t r) 41 | { 42 | unsigned long long sum = 0; 43 | for (size_t current = l; current < r; current++) 44 | sum += in_array[current]; 45 | //unsigned long long sum_left = std::accumulate(in_array + l, in_array + r, 0); // may be implemented using SIMD/SSE 46 | return sum; 47 | } 48 | 49 | // left (l) boundary is inclusive and right (r) boundary is exclusive 50 | inline unsigned long long Sum(unsigned in_array[], size_t l, size_t r) 51 | { 52 | unsigned long long sum = 0; 53 | for (size_t current = l; current < r; current++) 54 | sum += (unsigned long long)in_array[current]; 55 | //unsigned long long sum_left = std::accumulate(in_array + l, in_array + r, 0); // may be implemented using SIMD/SSE 56 | return sum; 57 | } 58 | 59 | // left (l) boundary is inclusive and right (r) boundary is exclusive 60 | // TODO: Not yet implemented 61 | inline unsigned long long SumUnrolled(unsigned in_array[], size_t l, size_t r) 62 | { 63 | unsigned long long sum = 0; 64 | for (size_t current = l; current < r; current++) 65 | sum += (unsigned long long)in_array[current]; 66 | //unsigned long long sum_left = std::accumulate(in_array + l, in_array + r, 0); // may be implemented using SIMD/SSE 67 | return sum; 68 | } 69 | #if 0 70 | size_t last_by_four = l + ((r - l) / 4) * 4; 71 | size_t current = l; 72 | for (; current < last_by_four;) // Scan the array and count the number of times each digit value appears - 
i.e. size of each bin 73 | { 74 | countLeft_0[inArray[current]]++; current++; 75 | countLeft_1[inArray[current]]++; current++; 76 | countLeft_2[inArray[current]]++; current++; 77 | countLeft_3[inArray[current]]++; current++; 78 | } 79 | #endif 80 | // left (l) boundary is inclusive and right (r) boundary is exclusive 81 | inline unsigned long long SumBufferedLocally(unsigned in_array[], size_t l, size_t r) 82 | { 83 | const size_t BUFFER_DEPTH = 1024; 84 | unsigned buffer_loc[BUFFER_DEPTH]; 85 | size_t num_buffers = (r - l + (BUFFER_DEPTH - 1)) / BUFFER_DEPTH; 86 | unsigned long long sum = 0; 87 | size_t current = l; 88 | size_t i = 0; 89 | for (; i < (num_buffers - 1); ++i) 90 | { 91 | std::copy(in_array + current, in_array + current + BUFFER_DEPTH, buffer_loc); // possibly using SIMD/SSE. TODO: Need to switch to Intel's unseq version 92 | for (size_t j = 0; j < BUFFER_DEPTH; ++j) 93 | sum += buffer_loc[j]; 94 | current += BUFFER_DEPTH; 95 | } 96 | for (; current < r; ++current) 97 | sum += in_array[current]; 98 | return sum; 99 | } 100 | 101 | // left (l) boundary is inclusive and right (r) boundary is exclusive 102 | inline unsigned long long SumBufferedLocally(unsigned long long in_array[], size_t l, size_t r) 103 | { 104 | const size_t BUFFER_DEPTH = 1024; 105 | unsigned long long buffer_loc[BUFFER_DEPTH]; 106 | size_t num_buffers = (r - l + (BUFFER_DEPTH - 1)) / BUFFER_DEPTH; 107 | unsigned long long sum = 0; 108 | size_t current = l; 109 | size_t i = 0; 110 | for (; i < (num_buffers - 1); ++i) 111 | { 112 | std::copy(in_array + current, in_array + current + BUFFER_DEPTH, buffer_loc); // possibly using SIMD/SSE. 
TODO: Need to switch to Intel's unseq version 113 | for (size_t j = 0; j < BUFFER_DEPTH; ++j) 114 | sum += buffer_loc[j]; 115 | current += BUFFER_DEPTH; 116 | } 117 | for (; current < r; ++current) 118 | sum += in_array[current]; 119 | return sum; 120 | } 121 | 122 | // left (l) boundary is inclusive and right (r) boundary is exclusive 123 | inline unsigned long long SumBufferedExternally(unsigned in_array[], size_t l, size_t r, unsigned buffer[], size_t buffer_depth) 124 | { 125 | size_t num_buffers = (r - l + (buffer_depth - 1)) / buffer_depth; 126 | unsigned long long sum = 0; 127 | size_t current = l; 128 | size_t i = 0; 129 | for (; i < (num_buffers - 1); ++i) 130 | { 131 | std::copy(in_array + current, in_array + current + buffer_depth, buffer); // possibly using SIMD/SSE. TODO: Need to switch to Intel's unseq version 132 | for (size_t j = 0; j < buffer_depth; ++j) 133 | sum += buffer[j]; 134 | current += buffer_depth; 135 | } 136 | for (; current < r; ++current) 137 | sum += in_array[current]; 138 | return sum; 139 | } 140 | 141 | // left (l) boundary is inclusive and right (r) boundary is exclusive 142 | // 50% slower than non-buffered version 143 | inline unsigned long long SumBufferedExternally(unsigned long long in_array[], size_t l, size_t r, unsigned long long buffer[], size_t buffer_depth) 144 | { 145 | size_t num_buffers = (r - l + (buffer_depth - 1)) / buffer_depth; 146 | unsigned long long sum = 0; 147 | size_t current = l; 148 | size_t i = 0; 149 | for (; i < (num_buffers - 1); ++i) 150 | { 151 | std::copy(in_array + current, in_array + current + buffer_depth, buffer); // possibly using SIMD/SSE. 
TODO: Need to switch to Intel's unseq version 152 | for (size_t j = 0; j < buffer_depth; ++j) 153 | sum += buffer[j]; 154 | current += buffer_depth; 155 | } 156 | for (; current < r; ++current) 157 | sum += in_array[current]; 158 | return sum; 159 | } 160 | 161 | // left (l) boundary is inclusive and right (r) boundary is exclusive 162 | inline unsigned long long SumParallel(unsigned long long in_array[], size_t l, size_t r, size_t parallelThreshold = 16 * 1024) 163 | { 164 | //if (((unsigned long long)(in_array + l) & 0x7) != 0) 165 | // printf("Memory alignment is not on 8-byte boundary\n"); 166 | if ((r - l) <= parallelThreshold) 167 | return Sum( in_array, l, r ); 168 | //return std::accumulate(in_array + l, in_array + r, 0ULL); 169 | 170 | unsigned long long sum_left = 0, sum_right = 0; 171 | 172 | size_t m = r / 2 + l / 2 + (r % 2 + l % 2) / 2; // average without overflow 173 | 174 | #if defined(USE_PPL) 175 | Concurrency::parallel_invoke( 176 | #else 177 | tbb::parallel_invoke( 178 | #endif 179 | [&] { sum_left = SumParallel(in_array, l, m, parallelThreshold); }, 180 | [&] { sum_right = SumParallel(in_array, m, r, parallelThreshold); } 181 | ); 182 | // Combine left and right results 183 | sum_left += sum_right; 184 | 185 | return sum_left; 186 | } 187 | #if 0 188 | inline unsigned long long SumParallel(unsigned long long in_array[], size_t l, size_t r, size_t parallelThreshold = 16 * 1024) 189 | { 190 | //may return 0 when not able to detect 191 | auto processor_count = std::thread::hardware_concurrency(); 192 | if (processor_count < 1) 193 | { 194 | processor_count = 1; 195 | //cout << "Warning: Fewer than 1 processor core detected. 
Using only a single core."; 196 | } 197 | 198 | size_t length = r - l + 1; 199 | 200 | if ((parallelThreshold * processor_count) < length) 201 | parallelThreshold = length / processor_count; 202 | return SumParallel_inner(in_array, l, r, parallelThreshold); 203 | } 204 | #endif 205 | // Sum of an arbitrary numerical type to a 64-bit sum 206 | // left (l) boundary is inclusive and right (r) boundary is exclusive 207 | template< class _Type > 208 | inline long long SumParallel(_Type in_array[], size_t l, size_t r, size_t parallelThreshold = 16 * 1024) 209 | { 210 | //if (((unsigned long long)(in_array + l) & 0x7) != 0) 211 | // printf("Memory alignment is not on 8-byte boundary\n"); 212 | if ((r - l) <= parallelThreshold) 213 | { 214 | long long sum_left = 0; 215 | for (size_t current = l; current < r; current++) 216 | sum_left += (long long)in_array[current]; 217 | //long long sum_left = std::accumulate(in_array + l, in_array + r, 0LL); 218 | return sum_left; 219 | } 220 | 221 | long long sum_left = 0, sum_right = 0; 222 | 223 | size_t m = r / 2 + l / 2 + (r % 2 + l % 2) / 2; // average without overflow 224 | 225 | #if defined(USE_PPL) 226 | Concurrency::parallel_invoke( 227 | #else 228 | tbb::parallel_invoke( 229 | #endif 230 | [&] { sum_left = SumParallel(in_array, l, m, parallelThreshold); }, 231 | [&] { sum_right = SumParallel(in_array, m, r, parallelThreshold); } 232 | ); 233 | // Combine left and right results 234 | sum_left += sum_right; 235 | 236 | return sum_left; 237 | } 238 | // Non-recursive Sum 239 | // left (l) boundary is inclusive and right (r) boundary is exclusive 240 | inline unsigned long long SumNonRecursive(unsigned long long in_array[], size_t l, size_t r, size_t parallelThreshold = 128 * 1024) 241 | { 242 | size_t num_tasks = (r - l + (parallelThreshold - 1)) / parallelThreshold; 243 | unsigned long long* sum_array = new unsigned long long[num_tasks] {}; 244 | 245 | size_t i = 0; 246 | for (; i < (num_tasks - 1); i++) 247 | sum_array[i] = 
Sum(in_array, l + parallelThreshold * i, l + parallelThreshold * (i + 1)); // process full parallelThreshold chunks 248 | 249 | sum_array[num_tasks - 1] = Sum(in_array, l + parallelThreshold * i, r); // process the last partial parallelThreshold chunk 250 | 251 | unsigned long long sum = 0; 252 | for (size_t i = 0; i < num_tasks; i++) 253 | sum += sum_array[i]; 254 | 255 | delete[] sum_array; 256 | return sum; 257 | } 258 | 259 | // Non-recursive Parallel Sum 260 | // left (l) boundary is inclusive and right (r) boundary is exclusive 261 | inline unsigned long long SumParallelNonRecursive(unsigned in_array[], size_t l, size_t r, size_t parallelThreshold = 16 * 1024) 262 | { 263 | size_t num_tasks = (r - l + (parallelThreshold - 1)) / parallelThreshold; 264 | unsigned long long* sum_array = new unsigned long long[num_tasks] {}; 265 | tbb::task_group g; 266 | 267 | size_t i = 0; 268 | for (; i < (num_tasks - 1); i++) 269 | g.run([=] {sum_array[i] = Sum(in_array, l + parallelThreshold * i, l + parallelThreshold * (i + 1)); }); // process full parallelThreshold chunks 270 | 271 | g.run([=] {sum_array[num_tasks - 1] = Sum(in_array, l + parallelThreshold * i, r); }); // process the last partial parallelThreshold chunk 272 | 273 | g.wait(); // wait for all tasks to complete 274 | 275 | unsigned long long sum = 0; 276 | for (size_t i = 0; i < num_tasks; i++) 277 | sum += sum_array[i]; 278 | 279 | delete[] sum_array; 280 | return sum; 281 | } 282 | 283 | // Non-recursive Parallel Sum 284 | // left (l) boundary is inclusive and right (r) boundary is exclusive 285 | inline unsigned long long SumParallelNonRecursive(unsigned long long in_array[], size_t l, size_t r, size_t parallelThreshold = 16 * 1024) 286 | { 287 | size_t num_tasks = (r - l + (parallelThreshold - 1)) / parallelThreshold; 288 | unsigned long long* sum_array = new unsigned long long[num_tasks] {}; 289 | tbb::task_group g; 290 | 291 | size_t i = 0; 292 | for (; i < (num_tasks - 1); i++) 293 | g.run([=] 
{sum_array[i] = Sum(in_array, l + parallelThreshold * i, l + parallelThreshold * (i + 1)); }); // process full parallelThreshold chunks 294 | 295 | g.run([=] {sum_array[num_tasks - 1] = Sum(in_array, l + parallelThreshold * i, r); }); // process the last partial parallelThreshold chunk 296 | 297 | g.wait(); // wait for all tasks to complete 298 | 299 | unsigned long long sum = 0; 300 | for (size_t i = 0; i < num_tasks; i++) 301 | sum += sum_array[i]; 302 | 303 | delete[] sum_array; 304 | return sum; 305 | } 306 | 307 | // Non-recursive Parallel Sum 308 | // left (l) boundary is inclusive and right (r) boundary is exclusive 309 | inline unsigned long long SumParallelNonRecursiveBufferedLocally(unsigned in_array[], size_t l, size_t r, size_t parallelThreshold = 16 * 1024) 310 | { 311 | size_t num_tasks = (r - l + (parallelThreshold - 1)) / parallelThreshold; 312 | unsigned long long* sum_array = new unsigned long long[num_tasks] {}; 313 | tbb::task_group g; 314 | 315 | size_t i = 0; 316 | for (; i < (num_tasks - 1); i++) 317 | g.run([=] { // process full parallelThreshold chunks 318 | sum_array[i] = SumBufferedLocally(in_array, l + parallelThreshold * i, l + parallelThreshold * (i + 1)); 319 | }); 320 | 321 | g.run([=] { // process the last partial parallelThreshold chunk 322 | sum_array[num_tasks - 1] = SumBufferedLocally(in_array, l + parallelThreshold * i, r); 323 | }); 324 | 325 | g.wait(); // wait for all tasks to complete 326 | 327 | unsigned long long sum = 0; 328 | for (size_t i = 0; i < num_tasks; i++) 329 | sum += sum_array[i]; 330 | 331 | delete[] sum_array; 332 | return sum; 333 | } 334 | 335 | // Non-recursive Parallel Sum 336 | // left (l) boundary is inclusive and right (r) boundary is exclusive 337 | inline unsigned long long SumParallelNonRecursiveBufferedLocally(unsigned long long in_array[], size_t l, size_t r, size_t parallelThreshold = 16 * 1024) 338 | { 339 | size_t num_tasks = (r - l + (parallelThreshold - 1)) / parallelThreshold; 340 | 
unsigned long long* sum_array = new unsigned long long[num_tasks] {}; 341 | const size_t BUFFER_DEPTH_PER_TASK = 1024; 342 | tbb::task_group g; 343 | 344 | size_t i = 0; 345 | for (; i < (num_tasks - 1); i++) 346 | g.run([=] { // process full parallelThreshold chunks 347 | sum_array[i] = SumBufferedLocally(in_array, l + parallelThreshold * i, l + parallelThreshold * (i + 1)); 348 | }); 349 | 350 | g.run([=] { // process the last partial parallelThreshold chunk 351 | sum_array[num_tasks - 1] = SumBufferedLocally(in_array, l + parallelThreshold * i, r); 352 | }); 353 | 354 | g.wait(); // wait for all tasks to complete 355 | 356 | unsigned long long sum = 0; 357 | for (size_t i = 0; i < num_tasks; i++) 358 | sum += sum_array[i]; 359 | 360 | delete[] sum_array; 361 | return sum; 362 | } 363 | 364 | // Non-recursive Parallel Sum 365 | // left (l) boundary is inclusive and right (r) boundary is exclusive 366 | inline unsigned long long SumParallelNonRecursiveBuffered(unsigned in_array[], size_t l, size_t r, size_t parallelThreshold = 16 * 1024) 367 | { 368 | size_t num_tasks = (r - l + (parallelThreshold - 1)) / parallelThreshold; 369 | unsigned long long* sum_array = new unsigned long long[num_tasks] {}; 370 | const size_t BUFFER_DEPTH_PER_TASK = 1024; 371 | unsigned* buffers = new unsigned[num_tasks * BUFFER_DEPTH_PER_TASK] {}; 372 | tbb::task_group g; 373 | 374 | size_t i = 0; 375 | for (; i < (num_tasks - 1); i++) 376 | g.run([=] {sum_array[i] = SumBufferedExternally(in_array, l + parallelThreshold * i, l + parallelThreshold * (i + 1), &buffers[i * BUFFER_DEPTH_PER_TASK], BUFFER_DEPTH_PER_TASK); }); // process full parallelThreshold chunks 377 | 378 | g.run([=] {sum_array[num_tasks - 1] = Sum(in_array, l + parallelThreshold * i, r); }); // process the last partial parallelThreshold chunk 379 | 380 | g.wait(); // wait for all tasks to complete 381 | 382 | unsigned long long sum = 0; 383 | for (size_t i = 0; i < num_tasks; i++) 384 | sum += sum_array[i]; 385 | 386 | 
delete[] sum_array; 387 | return sum; 388 | } 389 | 390 | // Non-recursive Parallel Sum 391 | // left (l) boundary is inclusive and right (r) boundary is exclusive 392 | // 50% slower than non-buffered version 393 | inline unsigned long long SumParallelNonRecursiveBuffered(unsigned long long in_array[], size_t l, size_t r, size_t parallelThreshold = 16 * 1024) 394 | { 395 | size_t num_tasks = (r - l + (parallelThreshold - 1)) / parallelThreshold; 396 | unsigned long long* sum_array = new unsigned long long[num_tasks] {}; 397 | const size_t BUFFER_DEPTH_PER_TASK = 1024; 398 | unsigned long long* buffers = new unsigned long long[num_tasks * BUFFER_DEPTH_PER_TASK] {}; 399 | tbb::task_group g; 400 | 401 | size_t i = 0; 402 | for (; i < (num_tasks - 1); i++) 403 | g.run([=] {sum_array[i] = SumBufferedExternally(in_array, l + parallelThreshold * i, l + parallelThreshold * (i + 1), &buffers[i * BUFFER_DEPTH_PER_TASK], BUFFER_DEPTH_PER_TASK); }); // process full parallelThreshold chunks 404 | 405 | g.run([=] {sum_array[num_tasks - 1] = Sum(in_array, l + parallelThreshold * i, r); }); // process the last partial parallelThreshold chunk 406 | 407 | g.wait(); // wait for all tasks to complete 408 | 409 | unsigned long long sum = 0; 410 | for (size_t i = 0; i < num_tasks; i++) 411 | sum += sum_array[i]; 412 | 413 | delete[] sum_array; 414 | return sum; 415 | } 416 | 417 | // Non-recursive Parallel Sum 418 | // left (l) boundary is inclusive and right (r) boundary is exclusive 419 | inline unsigned long long SumParallelNonRecursive(unsigned in_array[], size_t l, size_t r, unsigned long long* sum_array, size_t parallelThreshold = 16 * 1024) 420 | { 421 | size_t num_tasks = (r - l + (parallelThreshold - 1)) / parallelThreshold; 422 | //unsigned long long* sum_array = new unsigned long long[num_tasks] {}; 423 | tbb::task_group g; 424 | 425 | size_t i = 0; 426 | for (; i < (num_tasks - 1); i++) 427 | g.run([=] {sum_array[i] = Sum(in_array, l + parallelThreshold * i, l + 
parallelThreshold * (i + 1)); }); // process full parallelThreshold chunks 428 | 429 | g.run([=] {sum_array[num_tasks - 1] = Sum(in_array, l + parallelThreshold * i, r); }); // process the last partial parallelThreshold chunk 430 | 431 | g.wait(); // wait for all tasks to complete 432 | 433 | unsigned long long sum = 0; 434 | for (size_t i = 0; i < num_tasks; i++) 435 | sum += sum_array[i]; 436 | 437 | return sum; 438 | } 439 | 440 | // Non-recursive Parallel Sum 441 | // left (l) boundary is inclusive and right (r) boundary is exclusive 442 | inline unsigned long long SumParallelNonRecursive(unsigned long long in_array[], size_t l, size_t r, unsigned long long* sum_array, size_t parallelThreshold = 16 * 1024) 443 | { 444 | size_t num_tasks = (r - l + (parallelThreshold - 1)) / parallelThreshold; 445 | //unsigned long long* sum_array = new unsigned long long[num_tasks] {}; 446 | tbb::task_group g; 447 | 448 | size_t i = 0; 449 | for (; i < (num_tasks - 1); i++) 450 | g.run([=] {sum_array[i] = Sum(in_array, l + parallelThreshold * i, l + parallelThreshold * (i + 1)); }); // process full parallelThreshold chunks 451 | 452 | g.run([=] {sum_array[num_tasks - 1] = Sum(in_array, l + parallelThreshold * i, r); }); // process the last partial parallelThreshold chunk 453 | 454 | g.wait(); // wait for all tasks to complete 455 | 456 | unsigned long long sum = 0; 457 | for (size_t i = 0; i < num_tasks; i++) 458 | sum += sum_array[i]; 459 | 460 | return sum; 461 | } 462 | 463 | #if defined(WIN32) || defined(_WIN32) || defined(__WIN32) && !defined(__CYGWIN__) 464 | // Non-recursive Parallel Sum without Hyperthreading 465 | // left (l) boundary is inclusive and right (r) boundary is exclusive 466 | inline unsigned long long SumParallelNonRecursiveNoHyperthreading(unsigned long long in_array[], size_t l, size_t r, size_t parallelThreshold = 16 * 1024) 467 | { 468 | size_t num_tasks = (r - l + (parallelThreshold - 1)) / parallelThreshold; 469 | unsigned long long* sum_array = new 
unsigned long long[num_tasks] {}; 470 | 471 | int no_ht_concurrency = tbb::info::default_concurrency( 472 | tbb::task_arena::constraints{}.set_max_threads_per_core(1) 473 | ); 474 | tbb::task_arena arena(no_ht_concurrency); 475 | arena.execute([=] { 476 | tbb::task_group g; 477 | size_t i = 0; 478 | for (; i < (num_tasks - 1); i++) 479 | g.run([=] {sum_array[i] = Sum(in_array, l + parallelThreshold * i, l + parallelThreshold * (i + 1)); }); // process full parallelThreshold chunks 480 | 481 | g.run([=] {sum_array[num_tasks - 1] = Sum(in_array, l + parallelThreshold * i, r); }); // process the last partial parallelThreshold chunk 482 | 483 | g.wait(); // wait for all tasks to complete 484 | }); 485 | 486 | unsigned long long sum = 0; 487 | for (size_t i = 0; i < num_tasks; i++) 488 | sum += sum_array[i]; 489 | 490 | delete[] sum_array; 491 | return sum; 492 | } 493 | 494 | // Non-recursive Parallel Sum without Hyperthreading 495 | // left (l) boundary is inclusive and right (r) boundary is exclusive 496 | inline unsigned long long SumParallelNonRecursiveNoHyperthreading(unsigned in_array[], size_t l, size_t r, size_t parallelThreshold = 16 * 1024) 497 | { 498 | size_t num_tasks = (r - l + (parallelThreshold - 1)) / parallelThreshold; 499 | unsigned long long* sum_array = new unsigned long long[num_tasks] {}; 500 | 501 | int no_ht_concurrency = tbb::info::default_concurrency( 502 | tbb::task_arena::constraints{}.set_max_threads_per_core(1) 503 | ); 504 | tbb::task_arena arena(no_ht_concurrency); 505 | arena.execute([=] { 506 | tbb::task_group g; 507 | size_t i = 0; 508 | for (; i < (num_tasks - 1); i++) 509 | g.run([=] {sum_array[i] = Sum(in_array, l + parallelThreshold * i, l + parallelThreshold * (i + 1)); }); // process full parallelThreshold chunks 510 | 511 | g.run([=] {sum_array[num_tasks - 1] = Sum(in_array, l + parallelThreshold * i, r); }); // process the last partial parallelThreshold chunk 512 | 513 | g.wait(); // wait for all tasks to complete 514 | }); 
515 | 516 | unsigned long long sum = 0; 517 | for (size_t i = 0; i < num_tasks; i++) 518 | sum += sum_array[i]; 519 | 520 | delete[] sum_array; 521 | return sum; 522 | } 523 | 524 | // Non-recursive Parallel Sum without Hyperthreading 525 | // left (l) boundary is inclusive and right (r) boundary is exclusive 526 | inline unsigned long long SumParallelNonRecursiveBufferedLocallyNoHyperthreading( 527 | unsigned in_array[], size_t l, size_t r, size_t parallelThreshold = 16 * 1024) 528 | { 529 | size_t num_tasks = (r - l + (parallelThreshold - 1)) / parallelThreshold; 530 | unsigned long long* sum_array = new unsigned long long[num_tasks] {}; 531 | 532 | int no_ht_concurrency = tbb::info::default_concurrency( 533 | tbb::task_arena::constraints{}.set_max_threads_per_core(1) 534 | ); 535 | tbb::task_arena arena(no_ht_concurrency); 536 | arena.execute([=] { 537 | tbb::task_group g; 538 | size_t i = 0; 539 | for (; i < (num_tasks - 1); i++) 540 | g.run([=] {sum_array[i] = SumBufferedLocally(in_array, l + parallelThreshold * i, l + parallelThreshold * (i + 1)); }); // process full parallelThreshold chunks 541 | 542 | g.run([=] {sum_array[num_tasks - 1] = SumBufferedLocally(in_array, l + parallelThreshold * i, r); }); // process the last partial parallelThreshold chunk 543 | 544 | g.wait(); // wait for all tasks to complete 545 | }); 546 | 547 | unsigned long long sum = 0; 548 | for (size_t i = 0; i < num_tasks; i++) 549 | sum += sum_array[i]; 550 | 551 | delete[] sum_array; 552 | return sum; 553 | } 554 | #endif 555 | } 556 | 557 | #endif -------------------------------------------------------------------------------- /TODO.txt: -------------------------------------------------------------------------------- 1 | TODO: Consider switching interfaces of most functions to use std::vector instead of old-style C arrays/pointers. Returning vectors should be fine and less for users to think about for deallocation. 
2 | TODO: Create functions for integer division with ceiling (round up) and division with round. 3 | TODO: Explore parallelism at small workloads, such as small arrays. For example, sum of an array I was exploring in the book didn't improve in performance when I tried to reduce the overhead of small tasks in the body 4 | of the recursive tree. The mistake I made is using a large tree. Instead, this reduction should pay off big time for small trees, possibly with small parallel_threshold (cutoff) points. It may enable smaller parallel cutoff, 5 | which would be a win for parallelism for smaller arrays. 6 | Idea: The whole "parallelism for the small" could be our value add. We could develop techniques and technologies that increase usefulness at the small array scale and enable use of parallelism and higher performance for more use cases, 7 | expanding the market for parallelism. It would benefit not only C++, but also C#, both of which would enjoy the benefit of parallelism performance gains in more cases - i.e. not just the very large cases. This would blow the doors 8 | off the parallel performance market. 9 | One tool is to reduce the overhead of the "recursive body nodes" by eliminating them as I did in the book by replacing recursion with a single 10 | level of recursion. This should work for small arrays well, to reduce the overhead of these due to their inefficiency. Allocation of the single 11 | array for the single recursion level can also be done on the stack for small arrays, reducing the overhead of allocation. This needs to be benchmarked 12 | for small arrays to show how much benefit can be harvested for small arrays. Hopefully, it's enough to increase parallelism usefulness to smallest 13 | of arrays, extending usefulness of parallelism to all arrays. 14 | TODO: C++ added support for new without throwing exception to improve performance. This needs to be used everywhere, especially in Adaptive algorithms.
15 | _Type* sorted = new(std::nothrow) _Type[src_size]; 16 | Benchmarking for comparison would be a great one too, to see how much time exception throwing takes 17 | TODO: Change all interfaces to be either start_iterator/end_iterator or to start_pointer/size. Otherwise, size_t left and size_t right have a problem of not being able to 18 | support zero elements at zero starting location, forcing the right/end index to be exclusive and left/start being inclusive which seems awkward since they are not the same. 19 | Another possibility is to adopt the C++ method of iterators for both bounds, and C++ users are used to having the end iterator being exclusive. 20 | Another possibility is to adopt the C# method of using start/length pair for bounds specification, which I like better than C++ convention of inclusive/exclusive start/end iterator pair. 21 | Right now we are getting away with it because sorting 0 elements and sorting 1 element are equivalent to not doing anything and it's impossible to tell the difference, as sorting 22 | starts permuting with 2 array elements and larger, otherwise nothing is done. 23 | TODO: For interfaces that use size_t left and right boundary, support for zero elements in the array needs to be supported in some way. 24 | This can be done in two ways: start-length pair like C# prefers, or left-inclusive and right-exclusive like C++ prefers. 25 | For C++ it seems like this should be handled in a C++ fashion to make algorithms have a familiar interface for C++ developers. 26 | It's possible to add a wrapper that does inclusive-left and exclusive-right in C++, checks for zero length and returns. This handles size_t left and right 27 | boundary condition. This may be the solution at the moment. 28 | TODO: Add output and possibly input buffering to merge where C++ copy is used to bring inputs and outputs in and out by using SSE/SIMD instructions. (Using Microsoft's version of copy to buffer did not help. 29 | Need to use Intel's unseq version.)
30 | TODO: Implement adaptive algorithms that provide a way to save and restore settings for tuning of algorithms - i.e. provide persistence. --------------------------------------------------------------------------------