├── Windows └── ParallelSTL │ ├── ParallelSTL.sln │ └── ParallelSTL.vcxproj ├── README.md ├── LICENSE └── src └── main.cpp /Windows/ParallelSTL/ParallelSTL.sln: -------------------------------------------------------------------------------- 1 | 2 | Microsoft Visual Studio Solution File, Format Version 12.00 3 | # Visual Studio Version 17 4 | VisualStudioVersion = 17.7.34221.43 5 | MinimumVisualStudioVersion = 10.0.40219.1 6 | Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "ParallelSTL", "ParallelSTL.vcxproj", "{D45A65A4-6B11-46F0-8223-F15CE5DF669A}" 7 | EndProject 8 | Global 9 | GlobalSection(SolutionConfigurationPlatforms) = preSolution 10 | Debug|x64 = Debug|x64 11 | Debug|x86 = Debug|x86 12 | Release|x64 = Release|x64 13 | Release|x86 = Release|x86 14 | EndGlobalSection 15 | GlobalSection(ProjectConfigurationPlatforms) = postSolution 16 | {D45A65A4-6B11-46F0-8223-F15CE5DF669A}.Debug|x64.ActiveCfg = Debug|x64 17 | {D45A65A4-6B11-46F0-8223-F15CE5DF669A}.Debug|x64.Build.0 = Debug|x64 18 | {D45A65A4-6B11-46F0-8223-F15CE5DF669A}.Debug|x86.ActiveCfg = Debug|Win32 19 | {D45A65A4-6B11-46F0-8223-F15CE5DF669A}.Debug|x86.Build.0 = Debug|Win32 20 | {D45A65A4-6B11-46F0-8223-F15CE5DF669A}.Release|x64.ActiveCfg = Release|x64 21 | {D45A65A4-6B11-46F0-8223-F15CE5DF669A}.Release|x64.Build.0 = Release|x64 22 | {D45A65A4-6B11-46F0-8223-F15CE5DF669A}.Release|x86.ActiveCfg = Release|Win32 23 | {D45A65A4-6B11-46F0-8223-F15CE5DF669A}.Release|x86.Build.0 = Release|Win32 24 | EndGlobalSection 25 | GlobalSection(SolutionProperties) = preSolution 26 | HideSolutionNode = FALSE 27 | EndGlobalSection 28 | GlobalSection(ExtensibilityGlobals) = postSolution 29 | SolutionGuid = {566B586F-8381-49AE-A0EE-5D95DB3DBECA} 30 | EndGlobalSection 31 | EndGlobal 32 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # ParallelSTL 2 | Benchmark of C++ Parallel Standard 
Library Algorithms (STL) 3 | 4 | To set up Intel's C++ Standard Parallel Algorithms, the following are needed: 5 | - (Windows) OneAPI 6 | - (Linux) Threading Building Blocks (TBB) 7 | 8 | ## Building on Ubuntu 20.04 Linux (or WSL on Windows) 9 | To install g++ which supports C++17: 10 | ``` 11 | sudo apt update 12 | sudo apt upgrade 13 | # reboot the machine 14 | sudo apt install build-essential 15 | ``` 16 | 17 | To update gcc to support the C++17 standard, which uses Intel's Threading Building Blocks (TBB) for C++ Parallel Standard Algorithms: 18 | ``` 19 | sudo apt install libtbb-dev 20 | ``` 21 | 22 | To build, use the g++ command, not gcc. The order of the following arguments matters! 23 | ``` 24 | g++ /mnt/c/repos/ParallelSTL/src/main.cpp -ltbb -std=c++17 -O3 -o benchmark_parallel_std 25 | ``` 26 | 27 | ## Building NVidia's Parallel STL on Ubuntu 20.04 Linux (or WSL on Windows) 28 | To set up NVidia's compiler on Linux, follow these instructions: 29 | https://developer.nvidia.com/blog/accelerating-standard-c-with-gpus-using-stdpar/ 30 | 31 | To set up NVidia's compiler and C++ Standard parallel algorithms on Linux: 32 | ``` 33 | curl https://developer.download.nvidia.com/hpc-sdk/ubuntu/DEB-GPG-KEY-NVIDIA-HPC-SDK | sudo gpg --dearmor -o /usr/share/keyrings/nvidia-hpcsdk-archive-keyring.gpg 34 | echo 'deb [signed-by=/usr/share/keyrings/nvidia-hpcsdk-archive-keyring.gpg] https://developer.download.nvidia.com/hpc-sdk/ubuntu/amd64 /' | sudo tee /etc/apt/sources.list.d/nvhpc.list 35 | sudo apt-get update -y 36 | sudo apt-get install -y nvhpc-23-5 37 | ``` 38 | 39 | To compile the benchmark using NVidia's compiler, targeting an Intel multicore CPU: 40 | ``` 41 | nvc++ /mnt/c/repos/ParallelSTL/src/main.cpp -stdpar=multicore -O3 -o benchmark_nvc 42 | ``` 43 | ## Building on Windows 44 | In the Windows/ParallelSTL sub-directory, a Visual Studio 2022 solution/project can be used to build an executable using either Microsoft's compiler or Intel's compiler. 
45 | Microsoft implements Parallel standard algorithms, but not all of them. 46 | Intel implements more standard parallel algorithms with higher performance as shown in [C++ Parallel STL Benchmark.](https://duvanenko.tech.blog/2023/05/21/c-parallel-stl-benchmark/) 47 | 48 | [Intel OneAPI](https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit-download.html) must be installed to obtain Intel's implementation of C++ Parallel Standard Algorithms. If Intel implementations are not desired, then comment out "#define DPL_ALGORITHMS" at the top of "main.cpp". 49 | 50 | Both compilers (Microsoft or Intel) can be used to build this project, with or without Intel's algorithms. To switch between compilers, select "Project/Intel-Compiler" from the Visual Studio 2022 menu. 51 | 52 | ## Additional Resources 53 | The following book provides examples, explanations, and a free code repository for developing your own parallel algorithms: 54 | - [Practical Parallel Algorithms in C++ and C#: Part 1: Sorting on Multicore CPUs](https://www.amazon.com/Practical-Parallel-Algorithms-Sorting-Multicore-ebook/dp/B0C3TZPRKZ/ref=sr_1_2?crid=2WH4J28ICJ1DV&keywords=duvanenko&qid=1700855661&sprefix=duvanenko%2Caps%2C103&sr=8-2) 55 | - [C++ open source repository of parallel algorithms for the above book](https://github.com/DragonSpit/ParallelAlgorithms) 56 | 57 | Related blogs to improve performance of C++ Standard Parallel Algorithms: 58 | - [Sorting 19X Faster than C++ Parallel Sort](https://duvanenko.tech.blog/2023/10/29/sorting-19x-faster-than-c-parallel-sort/) 59 | - [Can C++ Parallel Standard Algorithms Accelerate, Even Small Arrays](https://duvanenko.tech.blog/2023/05/31/c-parallel-stl-performance-for-small-ish-arrays/) 60 | - [Improving Parallel Performance for Small Arrays](https://duvanenko.tech.blog/2023/05/31/c-parallel-stl-performance-for-small-ish-arrays/) -------------------------------------------------------------------------------- 
/Windows/ParallelSTL/ParallelSTL.vcxproj: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Debug 6 | Win32 7 | 8 | 9 | Release 10 | Win32 11 | 12 | 13 | Debug 14 | x64 15 | 16 | 17 | Release 18 | x64 19 | 20 | 21 | 22 | 23 | 24 | 25 | 17.0 26 | Win32Proj 27 | {d45a65a4-6b11-46f0-8223-f15ce5df669a} 28 | ParallelSTL 29 | 10.0 30 | 31 | 32 | 33 | Application 34 | true 35 | v143 36 | Unicode 37 | 38 | 39 | Application 40 | false 41 | v143 42 | true 43 | Unicode 44 | 45 | 46 | Application 47 | true 48 | v143 49 | Unicode 50 | 51 | 52 | Application 53 | false 54 | v143 55 | true 56 | Unicode 57 | true 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | C:\Program Files (x86)\Intel\oneAPI\2024.0\include;$(VC_IncludePath);$(WindowsSDK_IncludePath); 79 | 80 | 81 | 82 | Level3 83 | true 84 | WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) 85 | true 86 | 87 | 88 | Console 89 | true 90 | 91 | 92 | 93 | 94 | Level3 95 | true 96 | true 97 | true 98 | WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) 99 | true 100 | 101 | 102 | Console 103 | true 104 | true 105 | true 106 | 107 | 108 | 109 | 110 | Level3 111 | true 112 | _DEBUG;_CONSOLE;%(PreprocessorDefinitions) 113 | true 114 | 115 | 116 | Console 117 | true 118 | 119 | 120 | 121 | 122 | Level3 123 | true 124 | true 125 | true 126 | NDEBUG;_CONSOLE;%(PreprocessorDefinitions);_SILENCE_CXX17_ITERATOR_BASE_CLASS_DEPRECATION_WARNING 127 | true 128 | stdcpp20 129 | 130 | 131 | Console 132 | true 133 | true 134 | true 135 | 136 | 137 | 138 | 139 | 140 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 
8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. 
For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /src/main.cpp: -------------------------------------------------------------------------------- 1 | // Additional reference besides oneapi/dpl/algorithm header itself: https://www.intel.com/content/www/us/en/developer/articles/guide/get-started-with-parallel-stl.html (this reference provides links for each standard algorithm manual page) 2 | // On Windows std::sort is Microsoft's when using Microsoft or Intel compilers. 3 | // On Linux std::sort is g++'s. 4 | // 5 | // TODO: These benchmarks show that bandwidth limits parallel scaling. Maybe instead of large arrays, small enough arrays need to be used that fit in cache, using algorithms repeatedly within cache 6 | // to show parallel scaling when higher bandwidth is available. This would be a good demonstration of parallel scaling of each algorithm with higher bandwidth availability. 
7 | // TODO: Demonstrate a nice cache effect on performance: for a small array of 1,000,000 elements, which fits into cache, the first run is much slower than the rest of the runs, with a different 8 | // implementation winning in performance. However, as an algorithm or an array is used again and again, performance goes up substantially. Need to measure not only the first run time 9 | // but also the rest of the run times, to show this clearly by reporting the run time of each use. Show that for large arrays, running once versus running again and again, the times stay the same. 10 | // This complicates parallel algorithm usage for arrays that fit into the cache. It becomes unclear which algorithm is best to use (parallel or serial) and when to use each. 11 | // This belongs in a blog entry of its own! 12 | // TODO: Make sure to page-in the buffer, even for fill, before benchmarking. Didn't seem to make much difference on my laptop. 13 | // Conclusion: Not all parallel algorithms are advantageous when arrays fit into cache, with serial algorithms outperforming the parallel ones on the first few runs, and only for some algorithms. 
14 | 15 | #if defined(WIN32) || defined(_WIN32) || defined(__WIN32) && !defined(__CYGWIN__) 16 | #define DPL_ALGORITHMS // Includes Intel's OneAPI parallel algorithm implementations 17 | #define MICROSOFT_ALGORITHMS // Excludes single-core SIMD implementations, which Microsoft does not support 18 | #endif 19 | 20 | #ifdef DPL_ALGORITHMS 21 | // oneDPL headers should be included before standard headers 22 | #include 23 | #include 24 | #include 25 | #else 26 | #include 27 | #include 28 | #include 29 | #include 30 | #endif 31 | 32 | #include 33 | #include 34 | #include 35 | 36 | #include 37 | 38 | using namespace std; 39 | using std::chrono::duration; 40 | using std::chrono::duration_cast; 41 | using std::chrono::high_resolution_clock; 42 | using std::milli; 43 | 44 | void print_results(const char* const tag, const vector& in_array, 45 | high_resolution_clock::time_point startTime, 46 | high_resolution_clock::time_point endTime) 47 | { 48 | printf("%s: size = %zu Lowest: %d Highest: %d Time: %fms\n", tag, in_array.size(), in_array.front(), in_array.back(), 49 | duration_cast>(endTime - startTime).count()); 50 | } 51 | 52 | void print_results(const char* const tag, std::vector::iterator result, const vector& in_array, 53 | high_resolution_clock::time_point startTime, 54 | high_resolution_clock::time_point endTime) 55 | { 56 | printf("%s: size = %zu Result: %d Lowest: %d Highest: %d Time: %fms\n", tag, in_array.size(), *result, 57 | in_array.front(), in_array.back(), duration_cast>(endTime - startTime).count()); 58 | } 59 | 60 | void print_results(const char* const tag, size_t result, const vector& in_array, 61 | high_resolution_clock::time_point startTime, 62 | high_resolution_clock::time_point endTime) 63 | { 64 | printf("%s: size = %zu Result: %zu Lowest: %d Highest: %d Time: %fms\n", tag, in_array.size(), result, 65 | in_array.front(), in_array.back(), duration_cast>(endTime - startTime).count()); 66 | } 67 | 68 | void print_results(const char* const tag, const 
vector& in_array, 69 | high_resolution_clock::time_point startTime, 70 | high_resolution_clock::time_point endTime) 71 | { 72 | printf("%s: size = %zu Lowest: %lld Highest: %lld Time: %fms\n", tag, in_array.size(), in_array.front(), in_array.back(), 73 | duration_cast>(endTime - startTime).count()); 74 | } 75 | 76 | void print_results(const char* const tag, const vector& in_array, 77 | high_resolution_clock::time_point startTime, 78 | high_resolution_clock::time_point endTime) 79 | { 80 | printf("%s: size = %zu %p Lowest: %g Highest: %g Time: %fms\n", tag, in_array.size(), in_array.data(), in_array.front(), in_array.back(), 81 | duration_cast>(endTime - startTime).count()); 82 | } 83 | 84 | void print_results(const char* const tag, const vector& in_array, 85 | high_resolution_clock::time_point startTime, 86 | high_resolution_clock::time_point endTime) 87 | { 88 | printf("%s: size = %zu %p Lowest: %zu Highest: %zu Time: %fms\n", tag, in_array.size(), in_array.data(), in_array.front(), in_array.back(), 89 | duration_cast>(endTime - startTime).count()); 90 | } 91 | 92 | void fill_scalar_around_cache(vector& data, int value) 93 | { 94 | int* p_data = data.data(); 95 | 96 | for (size_t i = 0; i < data.size(); i++, p_data++) 97 | { 98 | _mm_stream_si32(p_data, value); 99 | } 100 | } 101 | 102 | void fill_scalar_around_cache_64(vector& data, int value) 103 | { 104 | long long* p_data = (long long *)data.data(); 105 | size_t end = data.size() / 2; 106 | 107 | for (size_t i = 0; i < end; i++, p_data++) 108 | { 109 | _mm_stream_si64(p_data, (long long)value); 110 | } 111 | } 112 | 113 | void fill_scalar_around_cache(int* p_data, size_t l, size_t r, int value) 114 | { 115 | int* r_data = p_data + (r - l); 116 | for (; p_data != r_data; p_data++) 117 | { 118 | _mm_stream_si32(p_data, value); 119 | } 120 | } 121 | 122 | void fill_benchmark(size_t array_size, size_t num_times) 123 | { 124 | high_resolution_clock::time_point startTime, endTime; 125 | std::vector data(array_size); 
126 | 127 | printf("\n\n"); 128 | 129 | for (size_t i = 0; i < num_times; i++) 130 | { 131 | startTime = high_resolution_clock::now(); 132 | std::fill(std::execution::seq, data.begin(), data.end(), 42); 133 | endTime = high_resolution_clock::now(); 134 | print_results("Serial std::fill", data, startTime, endTime); 135 | } 136 | 137 | for (size_t i = 0; i < num_times; i++) 138 | { 139 | startTime = high_resolution_clock::now(); 140 | fill_scalar_around_cache(data, 42); 141 | endTime = high_resolution_clock::now(); 142 | print_results("fill_scalar_around_cache", data, startTime, endTime); 143 | } 144 | 145 | //for (size_t i = 0; i < num_times; i++) 146 | //{ 147 | // startTime = high_resolution_clock::now(); 148 | // //ParallelAlgorithms::parallel_fill(data.data(), 42, 0, data.size() - 1, data.size() / 8); 149 | // ParallelAlgorithms::parallel_fill(data.data(), 42, 0, data.size() - 1, data.size() / 8); 150 | // endTime = high_resolution_clock::now(); 151 | // print_results("parallel_fill", data, startTime, endTime); 152 | //} 153 | #ifndef MICROSOFT_ALGORITHMS 154 | for (size_t i = 0; i < num_times; i++) 155 | { 156 | startTime = high_resolution_clock::now(); 157 | std::fill(std::execution::unseq, data.begin(), data.end(), 42); 158 | endTime = high_resolution_clock::now(); 159 | print_results("Serial SIMD std::fill", data, startTime, endTime); 160 | } 161 | #endif 162 | for (size_t i = 0; i < num_times; i++) 163 | { 164 | startTime = high_resolution_clock::now(); 165 | std::fill(std::execution::par, data.begin(), data.end(), 42); 166 | endTime = high_resolution_clock::now(); 167 | print_results("Parallel std::fill", data, startTime, endTime); 168 | } 169 | 170 | for (size_t i = 0; i < num_times; i++) 171 | { 172 | startTime = high_resolution_clock::now(); 173 | std::fill(std::execution::par_unseq, data.begin(), data.end(), 42); 174 | endTime = high_resolution_clock::now(); 175 | print_results("Parallel SIMD std::fill", data, startTime, endTime); 176 | } 177 | 178 | 
#ifdef DPL_ALGORITHMS 179 | for (size_t i = 0; i < num_times; i++) 180 | { 181 | startTime = high_resolution_clock::now(); 182 | std::fill(oneapi::dpl::execution::seq, data.begin(), data.end(), 42); 183 | endTime = high_resolution_clock::now(); 184 | print_results("Serial dpl::fill", data, startTime, endTime); 185 | } 186 | 187 | for (size_t i = 0; i < num_times; i++) 188 | { 189 | startTime = high_resolution_clock::now(); 190 | std::fill(oneapi::dpl::execution::unseq, data.begin(), data.end(), 42); 191 | endTime = high_resolution_clock::now(); 192 | print_results("SIMD dpl::fill", data, startTime, endTime); 193 | } 194 | 195 | for (size_t i = 0; i < num_times; i++) 196 | { 197 | startTime = high_resolution_clock::now(); 198 | std::fill(oneapi::dpl::execution::par, data.begin(), data.end(), 42); 199 | endTime = high_resolution_clock::now(); 200 | print_results("Parallel dpl::fill", data, startTime, endTime); 201 | } 202 | 203 | for (size_t i = 0; i < num_times; i++) 204 | { 205 | startTime = high_resolution_clock::now(); 206 | std::fill(oneapi::dpl::execution::par_unseq, data.begin(), data.end(), 42); 207 | endTime = high_resolution_clock::now(); 208 | print_results("Parallel SIMD dpl::fill", data, startTime, endTime); 209 | } 210 | 211 | // for (size_t i = 0; i < num_times; i++) 212 | // { 213 | // startTime = high_resolution_clock::now(); 214 | // std::fill(oneapi::dpl::execution::dpcpp_default, data.begin(), data.end(), 42); 215 | // endTime = high_resolution_clock::now(); 216 | // print_results("Parallel DPCPP_DEFAULT dpl::fill", data, startTime, endTime); 217 | // } 218 | #endif 219 | } 220 | 221 | void fill_long_long_benchmark(size_t array_size, int num_times) 222 | { 223 | high_resolution_clock::time_point startTime, endTime; 224 | std::vector data(array_size); 225 | 226 | printf("\n\n"); 227 | 228 | for (size_t i = 0; i < num_times; i++) 229 | { 230 | startTime = high_resolution_clock::now(); 231 | std::fill(std::execution::seq, data.begin(), data.end(), 
42); 232 | endTime = high_resolution_clock::now(); 233 | print_results("Serial std::fill", data, startTime, endTime); 234 | } 235 | 236 | //startTime = high_resolution_clock::now(); 237 | //std::fill(std::execution::unseq, data.begin(), data.end(), 42); 238 | //endTime = high_resolution_clock::now(); 239 | //print_results("SIMD Fill", data, startTime, endTime); 240 | 241 | for (size_t i = 0; i < num_times; i++) 242 | { 243 | startTime = high_resolution_clock::now(); 244 | std::fill(std::execution::par, data.begin(), data.end(), 42); 245 | endTime = high_resolution_clock::now(); 246 | print_results("Parallel std::fill", data, startTime, endTime); 247 | } 248 | 249 | for (size_t i = 0; i < num_times; i++) 250 | { 251 | startTime = high_resolution_clock::now(); 252 | std::fill(std::execution::par_unseq, data.begin(), data.end(), 42); 253 | endTime = high_resolution_clock::now(); 254 | print_results("Parallel SIMD std::fill", data, startTime, endTime); 255 | } 256 | 257 | #ifdef DPL_ALGORITHMS 258 | for (size_t i = 0; i < num_times; i++) 259 | { 260 | startTime = high_resolution_clock::now(); 261 | std::fill(oneapi::dpl::execution::seq, data.begin(), data.end(), 42); 262 | endTime = high_resolution_clock::now(); 263 | print_results("Serial dpl::fill", data, startTime, endTime); 264 | } 265 | 266 | for (size_t i = 0; i < num_times; i++) 267 | { 268 | startTime = high_resolution_clock::now(); 269 | std::fill(oneapi::dpl::execution::unseq, data.begin(), data.end(), 42); 270 | endTime = high_resolution_clock::now(); 271 | print_results("SIMD dpl::fill", data, startTime, endTime); 272 | } 273 | 274 | for (size_t i = 0; i < num_times; i++) 275 | { 276 | startTime = high_resolution_clock::now(); 277 | std::fill(oneapi::dpl::execution::par, data.begin(), data.end(), 42); 278 | endTime = high_resolution_clock::now(); 279 | print_results("Parallel dpl::fill", data, startTime, endTime); 280 | } 281 | 282 | for (size_t i = 0; i < num_times; i++) 283 | { 284 | startTime = 
high_resolution_clock::now(); 285 | std::fill(oneapi::dpl::execution::par_unseq, data.begin(), data.end(), 42); 286 | endTime = high_resolution_clock::now(); 287 | print_results("Parallel SIMD dpl::fill", data, startTime, endTime); 288 | } 289 | 290 | // for (size_t i = 0; i < num_times; i++) 291 | // { 292 | // startTime = high_resolution_clock::now(); 293 | // std::fill(oneapi::dpl::execution::dpcpp_default, data.begin(), data.end(), 42); 294 | // endTime = high_resolution_clock::now(); 295 | // print_results("Parallel DPCPP_DEFAULT dpl::fill", data, startTime, endTime); 296 | // } 297 | #endif 298 | } 299 | 300 | void sort_benchmark(size_t array_size, size_t num_times) 301 | { 302 | //std::cout << "Size of int: " << sizeof(int) << std::endl; 303 | printf("\n\n"); 304 | 305 | std::vector data( array_size); 306 | std::vector data_copy(array_size); 307 | 308 | high_resolution_clock::time_point startTime, endTime; 309 | //random_device rd; 310 | std::mt19937_64 dist(1234); 311 | 312 | for (auto& d : data) { 313 | //d = static_cast(rd()); 314 | d = static_cast(dist()); // way faster on Linux 315 | } 316 | 317 | // std::sort benchmarks 318 | 319 | for (size_t i = 0; i < num_times; i++) 320 | { 321 | copy(std::execution::par, data.begin(), data.end(), data_copy.begin()); 322 | 323 | startTime = high_resolution_clock::now(); 324 | sort(std::execution::seq, data_copy.begin(), data_copy.end()); 325 | endTime = high_resolution_clock::now(); 326 | print_results("Serial std::sort", data, startTime, endTime); 327 | } 328 | 329 | #ifndef MICROSOFT_ALGORITHMS 330 | for (size_t i = 0; i < num_times; i++) 331 | { 332 | copy(std::execution::par, data.begin(), data.end(), data_copy.begin()); 333 | 334 | startTime = high_resolution_clock::now(); 335 | sort(std::execution::unseq, data_copy.begin(), data_copy.end()); 336 | endTime = high_resolution_clock::now(); 337 | print_results("Serial SIMD std::sort", data, startTime, endTime); 338 | } 339 | #endif 340 | 341 | for (size_t i = 0; 
i < num_times; i++) 342 | { 343 | copy(std::execution::par, data.begin(), data.end(), data_copy.begin()); 344 | 345 | startTime = high_resolution_clock::now(); 346 | sort(std::execution::par, data_copy.begin(), data_copy.end()); 347 | endTime = high_resolution_clock::now(); 348 | print_results("Parallel std::sort", data, startTime, endTime); 349 | } 350 | 351 | for (size_t i = 0; i < num_times; i++) 352 | { 353 | copy(std::execution::par, data.begin(), data.end(), data_copy.begin()); 354 | 355 | startTime = high_resolution_clock::now(); 356 | sort(std::execution::par_unseq, data_copy.begin(), data_copy.end()); 357 | endTime = high_resolution_clock::now(); 358 | print_results("Parallel SIMD std::sort", data, startTime, endTime); 359 | } 360 | 361 | // dpl::sort benchmarks 362 | 363 | #ifdef DPL_ALGORITHMS 364 | for (size_t i = 0; i < num_times; i++) 365 | { 366 | copy(std::execution::par, data.begin(), data.end(), data_copy.begin()); 367 | 368 | startTime = high_resolution_clock::now(); 369 | sort(oneapi::dpl::execution::seq, data_copy.begin(), data_copy.end()); 370 | endTime = high_resolution_clock::now(); 371 | print_results("Serial dpl::sort", data, startTime, endTime); 372 | } 373 | 374 | for (size_t i = 0; i < num_times; i++) 375 | { 376 | copy(std::execution::par, data.begin(), data.end(), data_copy.begin()); 377 | 378 | startTime = high_resolution_clock::now(); 379 | sort(oneapi::dpl::execution::unseq, data_copy.begin(), data_copy.end()); 380 | endTime = high_resolution_clock::now(); 381 | print_results("SIMD dpl::sort", data, startTime, endTime); 382 | } 383 | 384 | for (size_t i = 0; i < num_times; i++) 385 | { 386 | copy(std::execution::par, data.begin(), data.end(), data_copy.begin()); 387 | 388 | startTime = high_resolution_clock::now(); 389 | sort(oneapi::dpl::execution::par, data_copy.begin(), data_copy.end()); 390 | endTime = high_resolution_clock::now(); 391 | print_results("Parallel dpl::sort", data, startTime, endTime); 392 | } 393 | 394 | for 
(size_t i = 0; i < num_times; i++) 395 | { 396 | copy(std::execution::par, data.begin(), data.end(), data_copy.begin()); 397 | 398 | startTime = high_resolution_clock::now(); 399 | sort(oneapi::dpl::execution::par_unseq, data_copy.begin(), data_copy.end()); 400 | endTime = high_resolution_clock::now(); 401 | print_results("Parallel SIMD dpl::sort", data, startTime, endTime); 402 | } 403 | #endif 404 | } 405 | 406 | void sort_doubles_benchmark(size_t array_size, int num_times, bool reuse_array) 407 | { 408 | //std::cout << "Size of int: " << sizeof(int) << std::endl; 409 | printf("\n\n"); 410 | 411 | vector data(array_size); 412 | high_resolution_clock::time_point startTime, endTime; 413 | random_device rd; 414 | 415 | for (auto& d : data) { 416 | d = static_cast(rd()); 417 | } 418 | 419 | // std::sort benchmarks 420 | 421 | for (size_t i = 0; i < num_times; i++) 422 | { 423 | if (reuse_array) 424 | { 425 | for (auto& d : data) { 426 | d = static_cast(rd()); 427 | } 428 | startTime = high_resolution_clock::now(); 429 | sort(std::execution::seq, data.begin(), data.end()); 430 | endTime = high_resolution_clock::now(); 431 | print_results("Serial std::sort", data, startTime, endTime); 432 | } 433 | else 434 | { 435 | vector data_loc(array_size); 436 | for (auto& d : data_loc) { 437 | d = static_cast(rd()); 438 | } 439 | startTime = high_resolution_clock::now(); 440 | sort(std::execution::seq, data_loc.begin(), data_loc.end()); 441 | endTime = high_resolution_clock::now(); 442 | print_results("Serial std::sort", data_loc, startTime, endTime); 443 | } 444 | } 445 | 446 | //for (auto& d : data) { 447 | // d = static_cast(rd()); 448 | //} 449 | 450 | //startTime = high_resolution_clock::now(); 451 | //sort(std::execution::unseq, data.begin(), data.end()); 452 | //endTime = high_resolution_clock::now(); 453 | //print_results("SIMD std::sort", data, startTime, endTime); 454 | 455 | for (size_t i = 0; i < num_times; i++) 456 | { 457 | if (reuse_array) 458 | { 459 | for (auto& 
d : data) { 460 | d = static_cast(rd()); 461 | } 462 | startTime = high_resolution_clock::now(); 463 | sort(std::execution::par, data.begin(), data.end()); 464 | endTime = high_resolution_clock::now(); 465 | print_results("Parallel std::sort", data, startTime, endTime); 466 | } 467 | else 468 | { 469 | vector data_loc(array_size); 470 | for (auto& d : data_loc) { 471 | d = static_cast(rd()); 472 | } 473 | startTime = high_resolution_clock::now(); 474 | sort(std::execution::par, data_loc.begin(), data_loc.end()); 475 | endTime = high_resolution_clock::now(); 476 | print_results("Parallel std::sort", data_loc, startTime, endTime); 477 | } 478 | } 479 | 480 | for (size_t i = 0; i < num_times; i++) 481 | { 482 | if (reuse_array) 483 | { 484 | for (auto& d : data) { 485 | d = static_cast(rd()); 486 | } 487 | startTime = high_resolution_clock::now(); 488 | sort(std::execution::par_unseq, data.begin(), data.end()); 489 | endTime = high_resolution_clock::now(); 490 | print_results("Parallel SIMD std::sort", data, startTime, endTime); 491 | } 492 | else 493 | { 494 | vector data_loc(array_size); 495 | for (auto& d : data_loc) { 496 | d = static_cast(rd()); 497 | } 498 | startTime = high_resolution_clock::now(); 499 | sort(std::execution::par_unseq, data_loc.begin(), data_loc.end()); 500 | endTime = high_resolution_clock::now(); 501 | print_results("Parallel SIMD std::sort", data_loc, startTime, endTime); 502 | } 503 | } 504 | 505 | // dpl::sort benchmarks 506 | 507 | #ifdef DPL_ALGORITHMS 508 | for (size_t i = 0; i < num_times; i++) 509 | { 510 | if (reuse_array) 511 | { 512 | for (auto& d : data) { 513 | d = static_cast(rd()); 514 | } 515 | startTime = high_resolution_clock::now(); 516 | sort(oneapi::dpl::execution::seq, data.begin(), data.end()); 517 | endTime = high_resolution_clock::now(); 518 | print_results("Serial dpl::sort", data, startTime, endTime); 519 | } 520 | else 521 | { 522 | vector data_loc(array_size); 523 | for (auto& d : data_loc) { 524 | d = 
static_cast(rd()); 525 | } 526 | startTime = high_resolution_clock::now(); 527 | sort(oneapi::dpl::execution::seq, data_loc.begin(), data_loc.end()); 528 | endTime = high_resolution_clock::now(); 529 | print_results("Serial dpl::sort", data_loc, startTime, endTime); 530 | } 531 | } 532 | 533 | for (size_t i = 0; i < num_times; i++) 534 | { 535 | if (reuse_array) 536 | { 537 | for (auto& d : data) { 538 | d = static_cast(rd()); 539 | } 540 | startTime = high_resolution_clock::now(); 541 | sort(oneapi::dpl::execution::unseq, data.begin(), data.end()); 542 | endTime = high_resolution_clock::now(); 543 | print_results("Serial SIMD dpl::sort", data, startTime, endTime); 544 | } 545 | else 546 | { 547 | vector data_loc(array_size); 548 | for (auto& d : data_loc) { 549 | d = static_cast(rd()); 550 | } 551 | startTime = high_resolution_clock::now(); 552 | sort(oneapi::dpl::execution::unseq, data_loc.begin(), data_loc.end()); 553 | endTime = high_resolution_clock::now(); 554 | print_results("Serial SIMD dpl::sort", data_loc, startTime, endTime); 555 | } 556 | } 557 | 558 | 559 | for (size_t i = 0; i < num_times; i++) 560 | { 561 | if (reuse_array) 562 | { 563 | for (auto& d : data) { 564 | d = static_cast(rd()); 565 | } 566 | startTime = high_resolution_clock::now(); 567 | sort(oneapi::dpl::execution::par, data.begin(), data.end()); 568 | endTime = high_resolution_clock::now(); 569 | print_results("Parallel dpl::sort", data, startTime, endTime); 570 | } 571 | else 572 | { 573 | vector data_loc(array_size); 574 | for (auto& d : data_loc) { 575 | d = static_cast(rd()); 576 | } 577 | startTime = high_resolution_clock::now(); 578 | sort(oneapi::dpl::execution::par, data_loc.begin(), data_loc.end()); 579 | endTime = high_resolution_clock::now(); 580 | print_results("Parallel dpl::sort", data_loc, startTime, endTime); 581 | } 582 | } 583 | 584 | for (size_t i = 0; i < num_times; i++) 585 | { 586 | if (reuse_array) 587 | { 588 | for (auto& d : data) { 589 | d = static_cast(rd()); 
590 | } 591 | startTime = high_resolution_clock::now(); 592 | sort(oneapi::dpl::execution::par_unseq, data.begin(), data.end()); 593 | endTime = high_resolution_clock::now(); 594 | print_results("Parallel SIMD dpl::sort", data, startTime, endTime); 595 | } 596 | else 597 | { 598 | vector data_loc(array_size); 599 | for (auto& d : data_loc) { 600 | d = static_cast(rd()); 601 | } 602 | startTime = high_resolution_clock::now(); 603 | sort(oneapi::dpl::execution::par_unseq, data_loc.begin(), data_loc.end()); 604 | endTime = high_resolution_clock::now(); 605 | print_results("Parallel SIMD dpl::sort", data_loc, startTime, endTime); 606 | } 607 | } 608 | #endif 609 | } 610 | 611 | void stable_sort_benchmark(size_t array_size, size_t num_times) 612 | { 613 | vector data(array_size); 614 | vector data_copy(array_size); 615 | high_resolution_clock::time_point startTime, endTime; 616 | //random_device rd; 617 | std::mt19937_64 dist(1234); 618 | 619 | for (auto& d : data) { 620 | //d = static_cast(rd()); 621 | d = static_cast(dist()); // way faster on Linux 622 | } 623 | 624 | // std::stable_sort benchmarks 625 | printf("\n\n"); 626 | 627 | for (size_t i = 0; i < num_times; i++) 628 | { 629 | copy(std::execution::par, data.begin(), data.end(), data_copy.begin()); 630 | 631 | startTime = high_resolution_clock::now(); 632 | stable_sort(std::execution::seq, data_copy.begin(), data_copy.end()); 633 | endTime = high_resolution_clock::now(); 634 | print_results("Serial std::stable_sort", data, startTime, endTime); 635 | } 636 | #ifndef MICROSOFT_ALGORITHMS 637 | for (size_t i = 0; i < num_times; i++) 638 | { 639 | copy(std::execution::par, data.begin(), data.end(), data_copy.begin()); 640 | 641 | startTime = high_resolution_clock::now(); 642 | stable_sort(std::execution::unseq, data_copy.begin(), data_copy.end()); 643 | endTime = high_resolution_clock::now(); 644 | print_results("Serial SIMD std::stable_sort", data, startTime, endTime); 645 | } 646 | #endif 647 | for (size_t i = 0; 
i < num_times; i++) 648 | { 649 | copy(std::execution::par, data.begin(), data.end(), data_copy.begin()); 650 | 651 | startTime = high_resolution_clock::now(); 652 | stable_sort(std::execution::par, data_copy.begin(), data_copy.end()); 653 | endTime = high_resolution_clock::now(); 654 | print_results("Parallel std::stable_sort", data, startTime, endTime); 655 | } 656 | 657 | for (size_t i = 0; i < num_times; i++) 658 | { 659 | copy(std::execution::par, data.begin(), data.end(), data_copy.begin()); 660 | 661 | startTime = high_resolution_clock::now(); 662 | stable_sort(std::execution::par_unseq, data_copy.begin(), data_copy.end()); 663 | endTime = high_resolution_clock::now(); 664 | print_results("Parallel SIMD std::stable_sort", data, startTime, endTime); 665 | } 666 | 667 | // dpl::stable_sort benchmarks 668 | #ifdef DPL_ALGORITHMS 669 | 670 | for (size_t i = 0; i < num_times; i++) 671 | { 672 | copy(std::execution::par, data.begin(), data.end(), data_copy.begin()); 673 | 674 | startTime = high_resolution_clock::now(); 675 | stable_sort(oneapi::dpl::execution::seq, data_copy.begin(), data_copy.end()); 676 | endTime = high_resolution_clock::now(); 677 | print_results("Serial dpl::stable_sort", data, startTime, endTime); 678 | } 679 | 680 | for (size_t i = 0; i < num_times; i++) 681 | { 682 | copy(std::execution::par, data.begin(), data.end(), data_copy.begin()); 683 | 684 | startTime = high_resolution_clock::now(); 685 | stable_sort(oneapi::dpl::execution::unseq, data_copy.begin(), data_copy.end()); 686 | endTime = high_resolution_clock::now(); 687 | print_results("SIMD dpl::stable_sort", data, startTime, endTime); 688 | } 689 | 690 | for (size_t i = 0; i < num_times; i++) 691 | { 692 | copy(std::execution::par, data.begin(), data.end(), data_copy.begin()); 693 | 694 | startTime = high_resolution_clock::now(); 695 | stable_sort(oneapi::dpl::execution::par, data_copy.begin(), data_copy.end()); 696 | endTime = high_resolution_clock::now(); 697 | 
print_results("Parallel dpl::stable_sort", data, startTime, endTime); 698 | } 699 | 700 | for (size_t i = 0; i < num_times; i++) 701 | { 702 | copy(std::execution::par, data.begin(), data.end(), data_copy.begin()); 703 | 704 | startTime = high_resolution_clock::now(); 705 | stable_sort(oneapi::dpl::execution::par_unseq, data_copy.begin(), data_copy.end()); 706 | endTime = high_resolution_clock::now(); 707 | print_results("Parallel SIMD dpl::stable_sort", data, startTime, endTime); 708 | } 709 | #endif 710 | } 711 | 712 | void merge_benchmark(size_t array_size, size_t num_times) 713 | { 714 | std::vector data_int_src_0(array_size); 715 | std::vector data_int_src_1(array_size); 716 | std::vector data_int_dst( 2 * array_size, 1); // initializate destination to page in and cache it 717 | high_resolution_clock::time_point startTime, endTime; 718 | 719 | //random_device rd; 720 | mt19937_64 dist(1234); 721 | 722 | printf("\n\n"); 723 | 724 | for (auto& d : data_int_src_0) { 725 | //d = static_cast(rd()); 726 | d = static_cast(dist()); // way faster on Linux 727 | } 728 | for (auto& d : data_int_src_1) { 729 | //d = static_cast(rd()); 730 | d = static_cast(dist()); // way faster on Linux 731 | } 732 | 733 | sort(std::execution::par, data_int_src_0.begin(), data_int_src_0.end()); 734 | sort(std::execution::par, data_int_src_1.begin(), data_int_src_1.end()); 735 | 736 | // std::merge benchmarks 737 | 738 | for (size_t i = 0; i < num_times; i++) 739 | { 740 | startTime = high_resolution_clock::now(); 741 | merge(std::execution::seq, data_int_src_0.begin(), data_int_src_0.end(), data_int_src_1.begin(), data_int_src_1.end(), data_int_dst.begin()); 742 | endTime = high_resolution_clock::now(); 743 | print_results("Serial std::merge", data_int_dst, startTime, endTime); 744 | } 745 | #ifndef MICROSOFT_ALGORITHMS 746 | for (size_t i = 0; i < num_times; i++) 747 | { 748 | startTime = high_resolution_clock::now(); 749 | merge(std::execution::unseq, data_int_src_0.begin(), 
data_int_src_0.end(), data_int_src_1.begin(), data_int_src_1.end(), data_int_dst.begin()); 750 | endTime = high_resolution_clock::now(); 751 | print_results("Serial SIMD std::merge", data_int_dst, startTime, endTime); 752 | } 753 | #endif 754 | for (size_t i = 0; i < num_times; i++) 755 | { 756 | startTime = high_resolution_clock::now(); 757 | merge(std::execution::par, data_int_src_0.begin(), data_int_src_0.end(), data_int_src_1.begin(), data_int_src_1.end(), data_int_dst.begin()); 758 | endTime = high_resolution_clock::now(); 759 | print_results("Parallel std::merge", data_int_dst, startTime, endTime); 760 | } 761 | 762 | for (size_t i = 0; i < num_times; i++) 763 | { 764 | startTime = high_resolution_clock::now(); 765 | merge(std::execution::par_unseq, data_int_src_0.begin(), data_int_src_0.end(), data_int_src_1.begin(), data_int_src_1.end(), data_int_dst.begin()); 766 | endTime = high_resolution_clock::now(); 767 | print_results("Parallel SIMD std::merge", data_int_dst, startTime, endTime); 768 | } 769 | 770 | // dpl::stable_sort benchmarks 771 | #ifdef DPL_ALGORITHMS 772 | 773 | for (size_t i = 0; i < num_times; i++) 774 | { 775 | startTime = high_resolution_clock::now(); 776 | merge(oneapi::dpl::execution::seq, data_int_src_0.begin(), data_int_src_0.end(), data_int_src_1.begin(), data_int_src_1.end(), data_int_dst.begin()); 777 | endTime = high_resolution_clock::now(); 778 | print_results("Serial dpl::merge", data_int_dst, startTime, endTime); 779 | } 780 | 781 | for (size_t i = 0; i < num_times; i++) 782 | { 783 | startTime = high_resolution_clock::now(); 784 | merge(oneapi::dpl::execution::unseq, data_int_src_0.begin(), data_int_src_0.end(), data_int_src_1.begin(), data_int_src_1.end(), data_int_dst.begin()); 785 | endTime = high_resolution_clock::now(); 786 | print_results("SIMD dpl::merge", data_int_dst, startTime, endTime); 787 | } 788 | 789 | for (size_t i = 0; i < num_times; i++) 790 | { 791 | startTime = high_resolution_clock::now(); 792 | 
merge(oneapi::dpl::execution::par, data_int_src_0.begin(), data_int_src_0.end(), data_int_src_1.begin(), data_int_src_1.end(), data_int_dst.begin()); 793 | endTime = high_resolution_clock::now(); 794 | print_results("Parallel dpl::merge", data_int_dst, startTime, endTime); 795 | } 796 | 797 | for (size_t i = 0; i < num_times; i++) 798 | { 799 | startTime = high_resolution_clock::now(); 800 | merge(oneapi::dpl::execution::par_unseq, data_int_src_0.begin(), data_int_src_0.end(), data_int_src_1.begin(), data_int_src_1.end(), data_int_dst.begin()); 801 | endTime = high_resolution_clock::now(); 802 | print_results("Parallel SIMD dpl::merge", data_int_dst, startTime, endTime); 803 | } 804 | 805 | #endif 806 | } 807 | 808 | void inplace_merge_benchmark(size_t array_size, size_t num_times) 809 | { 810 | std::vector data_int( array_size * 2); 811 | std::vector data_copy(array_size * 2); 812 | high_resolution_clock::time_point startTime, endTime; 813 | 814 | //random_device rd; 815 | mt19937_64 dist(1234); 816 | 817 | for (auto& d : data_int) { 818 | //d = static_cast(rd()); 819 | d = static_cast(dist()); // way faster on Linux 820 | } 821 | 822 | printf("\n\n"); 823 | 824 | // std::inplace_merge benchmarks 825 | 826 | for (size_t i = 0; i < num_times; i++) 827 | { 828 | copy(std::execution::par, data_int.begin(), data_int.end(), data_copy.begin()); 829 | 830 | sort(std::execution::par, data_copy.begin(), data_copy.begin() + data_copy.size() / 2); // left half 831 | sort(std::execution::par, data_copy.begin() + data_copy.size() / 2, data_copy.end()); // right half 832 | 833 | startTime = high_resolution_clock::now(); 834 | inplace_merge(std::execution::seq, data_copy.begin(), data_copy.begin() + data_copy.size() / 2, data_copy.end()); 835 | endTime = high_resolution_clock::now(); 836 | print_results("Serial std::inplace_merge", data_int, startTime, endTime); 837 | } 838 | #ifndef MICROSOFT_ALGORITHMS 839 | for (size_t i = 0; i < num_times; i++) 840 | { 841 | 
copy(std::execution::par, data_int.begin(), data_int.end(), data_copy.begin()); 842 | 843 | sort(std::execution::par, data_copy.begin(), data_copy.begin() + data_copy.size() / 2); // left half 844 | sort(std::execution::par, data_copy.begin() + data_copy.size() / 2, data_copy.end()); // right half 845 | 846 | startTime = high_resolution_clock::now(); 847 | inplace_merge(std::execution::unseq, data_copy.begin(), data_copy.begin() + data_copy.size() / 2, data_copy.end()); 848 | endTime = high_resolution_clock::now(); 849 | print_results("Serial SIMD std::inplace_merge", data_int, startTime, endTime); 850 | } 851 | #endif 852 | for (size_t i = 0; i < num_times; i++) 853 | { 854 | copy(std::execution::par, data_int.begin(), data_int.end(), data_copy.begin()); 855 | 856 | sort(std::execution::par, data_copy.begin(), data_copy.begin() + data_copy.size() / 2); // left half 857 | sort(std::execution::par, data_copy.begin() + data_copy.size() / 2, data_copy.end()); // right half 858 | 859 | startTime = high_resolution_clock::now(); 860 | inplace_merge(std::execution::par, data_copy.begin(), data_copy.begin() + data_copy.size() / 2, data_copy.end()); 861 | endTime = high_resolution_clock::now(); 862 | print_results("Parallel std::inplace_merge", data_int, startTime, endTime); 863 | } 864 | 865 | for (size_t i = 0; i < num_times; i++) 866 | { 867 | copy(std::execution::par, data_int.begin(), data_int.end(), data_copy.begin()); 868 | 869 | sort(std::execution::par, data_copy.begin(), data_copy.begin() + data_copy.size() / 2); // left half 870 | sort(std::execution::par, data_copy.begin() + data_copy.size() / 2, data_copy.end()); // right half 871 | 872 | startTime = high_resolution_clock::now(); 873 | inplace_merge(std::execution::par_unseq, data_copy.begin(), data_copy.begin() + data_copy.size() / 2, data_copy.end()); 874 | endTime = high_resolution_clock::now(); 875 | print_results("Parallel SIMD std::inplace_merge", data_int, startTime, endTime); 876 | } 877 | 878 | // 
dpl::stable_sort benchmarks 879 | #ifdef DPL_ALGORITHMS 880 | 881 | for (size_t i = 0; i < num_times; i++) 882 | { 883 | copy(std::execution::par, data_int.begin(), data_int.end(), data_copy.begin()); 884 | 885 | sort(std::execution::par, data_copy.begin(), data_copy.begin() + data_copy.size() / 2); // left half 886 | sort(std::execution::par, data_copy.begin() + data_copy.size() / 2, data_copy.end()); // right half 887 | 888 | startTime = high_resolution_clock::now(); 889 | inplace_merge(oneapi::dpl::execution::seq, data_copy.begin(), data_copy.begin() + data_copy.size() / 2, data_copy.end()); 890 | endTime = high_resolution_clock::now(); 891 | print_results("Serial dpl::inplace_merge", data_int, startTime, endTime); 892 | } 893 | 894 | for (size_t i = 0; i < num_times; i++) 895 | { 896 | copy(std::execution::par, data_int.begin(), data_int.end(), data_copy.begin()); 897 | 898 | sort(std::execution::par, data_copy.begin(), data_copy.begin() + data_copy.size() / 2); // left half 899 | sort(std::execution::par, data_copy.begin() + data_copy.size() / 2, data_copy.end()); // right half 900 | 901 | startTime = high_resolution_clock::now(); 902 | inplace_merge(oneapi::dpl::execution::unseq, data_copy.begin(), data_copy.begin() + data_copy.size() / 2, data_copy.end()); 903 | endTime = high_resolution_clock::now(); 904 | print_results("SIMD dpl::inplace_merge", data_int, startTime, endTime); 905 | } 906 | 907 | for (size_t i = 0; i < num_times; i++) 908 | { 909 | copy(std::execution::par, data_int.begin(), data_int.end(), data_copy.begin()); 910 | 911 | sort(std::execution::par, data_copy.begin(), data_copy.begin() + data_copy.size() / 2); // left half 912 | sort(std::execution::par, data_copy.begin() + data_copy.size() / 2, data_copy.end()); // right half 913 | 914 | startTime = high_resolution_clock::now(); 915 | inplace_merge(oneapi::dpl::execution::par, data_copy.begin(), data_copy.begin() + data_copy.size() / 2, data_copy.end()); 916 | endTime = 
high_resolution_clock::now(); 917 | print_results("Parallel dpl::inplace_merge", data_int, startTime, endTime); 918 | } 919 | 920 | for (size_t i = 0; i < num_times; i++) 921 | { 922 | copy(std::execution::par, data_int.begin(), data_int.end(), data_copy.begin()); 923 | 924 | sort(std::execution::par, data_copy.begin(), data_copy.begin() + data_copy.size() / 2); // left half 925 | sort(std::execution::par, data_copy.begin() + data_copy.size() / 2, data_copy.end()); // right half 926 | 927 | startTime = high_resolution_clock::now(); 928 | inplace_merge(oneapi::dpl::execution::par_unseq, data_copy.begin(), data_copy.begin() + data_copy.size() / 2, data_copy.end()); 929 | endTime = high_resolution_clock::now(); 930 | print_results("Parallel SIMD dpl::inplace_merge", data_int, startTime, endTime); 931 | } 932 | 933 | #endif 934 | } 935 | 936 | void merge_dual_buffer_benchmark(size_t array_size, int num_times) 937 | { 938 | std::vector data_int_src(2 * array_size); 939 | std::vector data_int_dst(2 * array_size); 940 | high_resolution_clock::time_point startTime, endTime; 941 | 942 | random_device rd; 943 | 944 | printf("\n\n"); 945 | 946 | for (auto& d : data_int_src) { 947 | d = static_cast(rd()); 948 | } 949 | for (auto& d : data_int_dst) { 950 | d = static_cast(rd()); 951 | } 952 | 953 | sort(std::execution::par, data_int_src.begin(), data_int_src.begin() + array_size); 954 | sort(std::execution::par, data_int_src.begin() + array_size, data_int_src.begin() + 2 * array_size); 955 | sort(std::execution::par, data_int_dst.begin(), data_int_dst.begin() + 2 * array_size); 956 | 957 | // std::merge benchmarks 958 | 959 | for (size_t i = 0; i < num_times; i++) 960 | { 961 | startTime = high_resolution_clock::now(); 962 | merge(std::execution::seq, data_int_src.begin(), data_int_src.begin() + array_size, 963 | data_int_src.begin() + array_size, data_int_src.begin() + 2 * array_size, data_int_dst.begin()); 964 | endTime = high_resolution_clock::now(); 965 | 
print_results("Serial single array std::merge", data_int_dst, startTime, endTime); 966 | } 967 | 968 | //startTime = high_resolution_clock::now(); 969 | //merge(std::execution::unseq, data_int_src_0.begin(), data_int_src_0.end(), data_int_src_1.begin(), data_int_src_1.end(), data_int_dst.begin()); 970 | //endTime = high_resolution_clock::now(); 971 | //print_results("SIMD std::merge", data_int_dst, startTime, endTime); 972 | 973 | for (size_t i = 0; i < num_times; i++) 974 | { 975 | startTime = high_resolution_clock::now(); 976 | merge(std::execution::par, data_int_src.begin(), data_int_src.begin() + array_size, 977 | data_int_src.begin() + array_size, data_int_src.begin() + 2 * array_size, data_int_dst.begin()); 978 | endTime = high_resolution_clock::now(); 979 | print_results("Parallel single array std::merge", data_int_dst, startTime, endTime); 980 | } 981 | 982 | for (size_t i = 0; i < num_times; i++) 983 | { 984 | startTime = high_resolution_clock::now(); 985 | merge(std::execution::par_unseq, data_int_src.begin(), data_int_src.begin() + array_size, 986 | data_int_src.begin() + array_size, data_int_src.begin() + 2 * array_size, data_int_dst.begin()); 987 | endTime = high_resolution_clock::now(); 988 | print_results("Parallel SIMD std::merge", data_int_dst, startTime, endTime); 989 | } 990 | 991 | // dpl::stable_sort benchmarks 992 | #ifdef DPL_ALGORITHMS 993 | 994 | for (size_t i = 0; i < num_times; i++) 995 | { 996 | startTime = high_resolution_clock::now(); 997 | merge(oneapi::dpl::execution::seq, data_int_src.begin(), data_int_src.begin() + array_size, 998 | data_int_src.begin() + array_size, data_int_src.begin() + 2 * array_size, data_int_dst.begin()); 999 | endTime = high_resolution_clock::now(); 1000 | print_results("Serial dpl::merge", data_int_dst, startTime, endTime); 1001 | } 1002 | 1003 | for (size_t i = 0; i < num_times; i++) 1004 | { 1005 | startTime = high_resolution_clock::now(); 1006 | merge(oneapi::dpl::execution::unseq, data_int_src.begin(), 
data_int_src.begin() + array_size, 1007 | data_int_src.begin() + array_size, data_int_src.begin() + 2 * array_size, data_int_dst.begin()); 1008 | endTime = high_resolution_clock::now(); 1009 | print_results("SIMD dpl::merge", data_int_dst, startTime, endTime); 1010 | } 1011 | 1012 | for (size_t i = 0; i < num_times; i++) 1013 | { 1014 | startTime = high_resolution_clock::now(); 1015 | merge(oneapi::dpl::execution::par, data_int_src.begin(), data_int_src.begin() + array_size, 1016 | data_int_src.begin() + array_size, data_int_src.begin() + 2 * array_size, data_int_dst.begin()); 1017 | endTime = high_resolution_clock::now(); 1018 | print_results("Parallel dpl::merge", data_int_dst, startTime, endTime); 1019 | } 1020 | 1021 | for (size_t i = 0; i < num_times; i++) 1022 | { 1023 | startTime = high_resolution_clock::now(); 1024 | merge(oneapi::dpl::execution::par_unseq, data_int_src.begin(), data_int_src.begin() + array_size, 1025 | data_int_src.begin() + array_size, data_int_src.begin() + 2 * array_size, data_int_dst.begin()); 1026 | endTime = high_resolution_clock::now(); 1027 | print_results("Parallel SIMD dpl::merge", data_int_dst, startTime, endTime); 1028 | } 1029 | 1030 | printf("done\n"); 1031 | 1032 | #endif 1033 | 1034 | //for (size_t i = 0; i < num_times; i++) 1035 | //{ 1036 | // startTime = high_resolution_clock::now(); 1037 | // merge_parallel_L5(data_int_src.data(), 0, array_size - 1, array_size, 2 * array_size - 1, data_int_dst.data(), 0); 1038 | // endTime = high_resolution_clock::now(); 1039 | // print_results("Parallel Victor's merge", data_int_dst, startTime, endTime); 1040 | //} 1041 | 1042 | } 1043 | void merge_single_buffer_benchmark(size_t array_size, int num_times) 1044 | { 1045 | std::vector data_int_src_dst(4 * array_size); 1046 | high_resolution_clock::time_point startTime, endTime; 1047 | 1048 | random_device rd; 1049 | 1050 | printf("\n\n"); 1051 | 1052 | for (auto& d : data_int_src_dst) { 1053 | d = static_cast(rd()); 1054 | } 1055 | 1056 
| sort(std::execution::par, data_int_src_dst.begin(), data_int_src_dst.begin() + array_size); 1057 | sort(std::execution::par, data_int_src_dst.begin() + array_size, data_int_src_dst.begin() + 2 * array_size); 1058 | sort(std::execution::par, data_int_src_dst.begin() + 2 * array_size, data_int_src_dst.end()); 1059 | 1060 | // std::merge benchmarks 1061 | 1062 | for (size_t i = 0; i < num_times; i++) 1063 | { 1064 | startTime = high_resolution_clock::now(); 1065 | merge(std::execution::seq, data_int_src_dst.begin(), data_int_src_dst.begin() + array_size, 1066 | data_int_src_dst.begin() + array_size, data_int_src_dst.begin() + 2 * array_size, data_int_src_dst.begin() + 2 * array_size); 1067 | endTime = high_resolution_clock::now(); 1068 | print_results("Serial single array std::merge", data_int_src_dst, startTime, endTime); 1069 | } 1070 | 1071 | //startTime = high_resolution_clock::now(); 1072 | //merge(std::execution::unseq, data_int_src_0.begin(), data_int_src_0.end(), data_int_src_1.begin(), data_int_src_1.end(), data_int_dst.begin()); 1073 | //endTime = high_resolution_clock::now(); 1074 | //print_results("SIMD std::merge", data_int_dst, startTime, endTime); 1075 | 1076 | for (size_t i = 0; i < num_times; i++) 1077 | { 1078 | startTime = high_resolution_clock::now(); 1079 | merge(std::execution::par, data_int_src_dst.begin(), data_int_src_dst.begin() + array_size, 1080 | data_int_src_dst.begin() + array_size, data_int_src_dst.begin() + 2 * array_size, data_int_src_dst.begin() + 2 * array_size); 1081 | endTime = high_resolution_clock::now(); 1082 | print_results("Parallel single array std::merge", data_int_src_dst, startTime, endTime); 1083 | } 1084 | 1085 | for (size_t i = 0; i < num_times; i++) 1086 | { 1087 | startTime = high_resolution_clock::now(); 1088 | merge(std::execution::par_unseq, data_int_src_dst.begin(), data_int_src_dst.begin() + array_size, 1089 | data_int_src_dst.begin() + array_size, data_int_src_dst.begin() + 2 * array_size, 
data_int_src_dst.begin() + 2 * array_size); 1090 | endTime = high_resolution_clock::now(); 1091 | print_results("Parallel SIMD std::merge", data_int_src_dst, startTime, endTime); 1092 | } 1093 | 1094 | // dpl::stable_sort benchmarks 1095 | #ifdef DPL_ALGORITHMS 1096 | 1097 | for (size_t i = 0; i < num_times; i++) 1098 | { 1099 | startTime = high_resolution_clock::now(); 1100 | merge(oneapi::dpl::execution::seq, data_int_src_dst.begin(), data_int_src_dst.begin() + array_size, 1101 | data_int_src_dst.begin() + array_size, data_int_src_dst.begin() + 2 * array_size, data_int_src_dst.begin() + 2 * array_size); 1102 | endTime = high_resolution_clock::now(); 1103 | print_results("Serial dpl::merge", data_int_src_dst, startTime, endTime); 1104 | } 1105 | 1106 | for (size_t i = 0; i < num_times; i++) 1107 | { 1108 | startTime = high_resolution_clock::now(); 1109 | merge(oneapi::dpl::execution::unseq, data_int_src_dst.begin(), data_int_src_dst.begin() + array_size, 1110 | data_int_src_dst.begin() + array_size, data_int_src_dst.begin() + 2 * array_size, data_int_src_dst.begin() + 2 * array_size); 1111 | endTime = high_resolution_clock::now(); 1112 | print_results("SIMD dpl::merge", data_int_src_dst, startTime, endTime); 1113 | } 1114 | 1115 | for (size_t i = 0; i < num_times; i++) 1116 | { 1117 | startTime = high_resolution_clock::now(); 1118 | merge(oneapi::dpl::execution::par, data_int_src_dst.begin(), data_int_src_dst.begin() + array_size, 1119 | data_int_src_dst.begin() + array_size, data_int_src_dst.begin() + 2 * array_size, data_int_src_dst.begin() + 2 * array_size); 1120 | endTime = high_resolution_clock::now(); 1121 | print_results("Parallel dpl::merge", data_int_src_dst, startTime, endTime); 1122 | } 1123 | 1124 | for (size_t i = 0; i < num_times; i++) 1125 | { 1126 | startTime = high_resolution_clock::now(); 1127 | merge(oneapi::dpl::execution::par_unseq, data_int_src_dst.begin(), data_int_src_dst.begin() + array_size, 1128 | data_int_src_dst.begin() + array_size, 
data_int_src_dst.begin() + 2 * array_size, data_int_src_dst.begin() + 2 * array_size); 1129 | endTime = high_resolution_clock::now(); 1130 | print_results("Parallel SIMD dpl::merge", data_int_src_dst, startTime, endTime); 1131 | } 1132 | #endif 1133 | 1134 | //for (size_t i = 0; i < num_times; i++) 1135 | //{ 1136 | // startTime = high_resolution_clock::now(); 1137 | // merge_parallel_L5(data_int_src_dst.data(), 0, array_size - 1, array_size, 2 * array_size - 1, data_int_src_dst.data(), 2 * array_size); 1138 | // endTime = high_resolution_clock::now(); 1139 | // print_results("Parallel Victor's merge", data_int_src_dst, startTime, endTime); 1140 | //} 1141 | } 1142 | 1143 | void all_of_benchmark(size_t array_size, size_t num_times) 1144 | { 1145 | std::vector data_int(array_size, 2); 1146 | high_resolution_clock::time_point startTime, endTime; 1147 | 1148 | printf("\n\n"); 1149 | 1150 | // std::inplace_merge benchmarks 1151 | 1152 | for (size_t i = 0; i < num_times; i++) 1153 | { 1154 | startTime = high_resolution_clock::now(); 1155 | if (all_of(std::execution::seq, data_int.begin(), data_int.end(), [](int i) { return i == 2; })) 1156 | printf("All numbers in the array are equal to 2\n"); 1157 | endTime = high_resolution_clock::now(); 1158 | print_results("Serial std::all_of", data_int, startTime, endTime); 1159 | } 1160 | #ifndef MICROSOFT_ALGORITHMS 1161 | for (size_t i = 0; i < num_times; i++) 1162 | { 1163 | startTime = high_resolution_clock::now(); 1164 | if (all_of(std::execution::unseq, data_int.begin(), data_int.end(), [](int i) { return i == 2; })) 1165 | printf("All numbers in the array are equal to 2\n"); 1166 | endTime = high_resolution_clock::now(); 1167 | print_results("Serial SIMD std::all_of", data_int, startTime, endTime); 1168 | } 1169 | #endif 1170 | for (size_t i = 0; i < num_times; i++) 1171 | { 1172 | startTime = high_resolution_clock::now(); 1173 | if (all_of(std::execution::par, data_int.begin(), data_int.end(), [](int i) { return i == 2; 
})) 1174 | printf("All numbers in the array are equal to 2\n"); 1175 | endTime = high_resolution_clock::now(); 1176 | print_results("Parallel std::all_of", data_int, startTime, endTime); 1177 | } 1178 | 1179 | for (size_t i = 0; i < num_times; i++) 1180 | { 1181 | startTime = high_resolution_clock::now(); 1182 | if (all_of(std::execution::par_unseq, data_int.begin(), data_int.end(), [](int i) { return i == 2; })) 1183 | printf("All numbers in the array are equal to 2\n"); 1184 | endTime = high_resolution_clock::now(); 1185 | print_results("Parallel SIMD std::all_of", data_int, startTime, endTime); 1186 | } 1187 | 1188 | // dpl::stable_sort benchmarks 1189 | #ifdef DPL_ALGORITHMS 1190 | 1191 | for (size_t i = 0; i < num_times; i++) 1192 | { 1193 | startTime = high_resolution_clock::now(); 1194 | if (all_of(oneapi::dpl::execution::seq, data_int.begin(), data_int.end(), [](int i) { return i == 2; })) 1195 | printf("All numbers in the array are equal to 2\n"); 1196 | endTime = high_resolution_clock::now(); 1197 | print_results("Serial dpl::all_of", data_int, startTime, endTime); 1198 | } 1199 | 1200 | for (size_t i = 0; i < num_times; i++) 1201 | { 1202 | startTime = high_resolution_clock::now(); 1203 | if (all_of(oneapi::dpl::execution::unseq, data_int.begin(), data_int.end(), [](int i) { return i == 2; })) 1204 | printf("All numbers in the array are equal to 2\n"); 1205 | endTime = high_resolution_clock::now(); 1206 | print_results("SIMD dpl::all_of", data_int, startTime, endTime); 1207 | } 1208 | 1209 | for (size_t i = 0; i < num_times; i++) 1210 | { 1211 | startTime = high_resolution_clock::now(); 1212 | if (all_of(oneapi::dpl::execution::par, data_int.begin(), data_int.end(), [](int i) { return i == 2; })) 1213 | printf("All numbers in the array are equal to 2\n"); 1214 | endTime = high_resolution_clock::now(); 1215 | print_results("Parallel dpl::all_of", data_int, startTime, endTime); 1216 | } 1217 | 1218 | for (size_t i = 0; i < num_times; i++) 1219 | { 1220 | 
startTime = high_resolution_clock::now(); 1221 | if (all_of(oneapi::dpl::execution::par_unseq, data_int.begin(), data_int.end(), [](int i) { return i == 2; })) 1222 | printf("All numbers in the array are equal to 2\n"); 1223 | endTime = high_resolution_clock::now(); 1224 | print_results("Parallel SIMD dpl::all_of", data_int, startTime, endTime); 1225 | } 1226 | #endif 1227 | } 1228 | 1229 | void any_of_benchmark(size_t array_size, size_t num_times) 1230 | { 1231 | std::vector data_int(array_size, 2); 1232 | high_resolution_clock::time_point startTime, endTime; 1233 | 1234 | printf("\n\n"); 1235 | 1236 | // std::inplace_merge benchmarks 1237 | 1238 | for (size_t i = 0; i < num_times; i++) 1239 | { 1240 | startTime = high_resolution_clock::now(); 1241 | if (!any_of(std::execution::seq, data_int.begin(), data_int.end(), [](int i) { return i == 3; })) 1242 | printf("No numbers in the array are equal to 3\n"); 1243 | endTime = high_resolution_clock::now(); 1244 | print_results("Serial std::any_of", data_int, startTime, endTime); 1245 | } 1246 | #ifndef MICROSOFT_ALGORITHMS 1247 | for (size_t i = 0; i < num_times; i++) 1248 | { 1249 | startTime = high_resolution_clock::now(); 1250 | if (!any_of(std::execution::unseq, data_int.begin(), data_int.end(), [](int i) { return i == 3; })) 1251 | printf("No numbers in the array are equal to 3\n"); 1252 | endTime = high_resolution_clock::now(); 1253 | print_results("Serial SIMD std::any_of", data_int, startTime, endTime); 1254 | } 1255 | #endif 1256 | for (size_t i = 0; i < num_times; i++) 1257 | { 1258 | startTime = high_resolution_clock::now(); 1259 | if (!any_of(std::execution::par, data_int.begin(), data_int.end(), [](int i) { return i == 3; })) 1260 | printf("No numbers in the array are equal to 3\n"); 1261 | endTime = high_resolution_clock::now(); 1262 | print_results("Parallel std::any_of", data_int, startTime, endTime); 1263 | } 1264 | 1265 | for (size_t i = 0; i < num_times; i++) 1266 | { 1267 | startTime = 
high_resolution_clock::now(); 1268 | if (!any_of(std::execution::par_unseq, data_int.begin(), data_int.end(), [](int i) { return i == 3; })) 1269 | printf("No numbers in the array are equal to 3\n"); 1270 | endTime = high_resolution_clock::now(); 1271 | print_results("Parallel SIMD std::any_of", data_int, startTime, endTime); 1272 | } 1273 | 1274 | // dpl::stable_sort benchmarks 1275 | #ifdef DPL_ALGORITHMS 1276 | 1277 | for (size_t i = 0; i < num_times; i++) 1278 | { 1279 | startTime = high_resolution_clock::now(); 1280 | if (!any_of(oneapi::dpl::execution::seq, data_int.begin(), data_int.end(), [](int i) { return i == 3; })) 1281 | printf("No numbers in the array are equal to 3\n"); 1282 | endTime = high_resolution_clock::now(); 1283 | print_results("Serial dpl::any_of", data_int, startTime, endTime); 1284 | } 1285 | 1286 | for (size_t i = 0; i < num_times; i++) 1287 | { 1288 | startTime = high_resolution_clock::now(); 1289 | if (!any_of(oneapi::dpl::execution::unseq, data_int.begin(), data_int.end(), [](int i) { return i == 3; })) 1290 | printf("No numbers in the array are equal to 3\n"); 1291 | endTime = high_resolution_clock::now(); 1292 | print_results("SIMD dpl::any_of", data_int, startTime, endTime); 1293 | } 1294 | 1295 | for (size_t i = 0; i < num_times; i++) 1296 | { 1297 | startTime = high_resolution_clock::now(); 1298 | if (!any_of(oneapi::dpl::execution::par, data_int.begin(), data_int.end(), [](int i) { return i == 3; })) 1299 | printf("No numbers in the array are equal to 3\n"); 1300 | endTime = high_resolution_clock::now(); 1301 | print_results("Parallel dpl::any_of", data_int, startTime, endTime); 1302 | } 1303 | 1304 | for (size_t i = 0; i < num_times; i++) 1305 | { 1306 | startTime = high_resolution_clock::now(); 1307 | if (!any_of(oneapi::dpl::execution::par_unseq, data_int.begin(), data_int.end(), [](int i) { return i == 3; })) 1308 | printf("No numbers in the array are equal to 3\n"); 1309 | endTime = high_resolution_clock::now(); 1310 | 
print_results("Parallel SIMD dpl::any_of", data_int, startTime, endTime); 1311 | } 1312 | #endif 1313 | } 1314 | 1315 | void copy_benchmark(size_t array_size, size_t num_times) 1316 | { 1317 | std::vector data_int_src(array_size); 1318 | std::vector data_int_dst(array_size); 1319 | high_resolution_clock::time_point startTime, endTime; 1320 | 1321 | printf("\n\n"); 1322 | 1323 | for (size_t i = 0; i < array_size; i++) 1324 | { 1325 | data_int_src[i] = (int)i; 1326 | } 1327 | //for (auto& d : data_int_src) { 1328 | // d = static_cast(rd()); 1329 | //} 1330 | 1331 | printf("\n\n"); 1332 | 1333 | // std::merge benchmarks 1334 | 1335 | for (size_t i = 0; i < num_times; i++) 1336 | { 1337 | startTime = high_resolution_clock::now(); 1338 | copy(std::execution::seq, data_int_src.begin(), data_int_src.end(), data_int_dst.begin()); 1339 | endTime = high_resolution_clock::now(); 1340 | print_results("Serial std::copy", data_int_dst, startTime, endTime); 1341 | } 1342 | #ifndef MICROSOFT_ALGORITHMS 1343 | for (size_t i = 0; i < num_times; i++) 1344 | { 1345 | startTime = high_resolution_clock::now(); 1346 | copy(std::execution::unseq, data_int_src.begin(), data_int_src.end(), data_int_dst.begin()); 1347 | endTime = high_resolution_clock::now(); 1348 | print_results("Serial SIMD std::copy", data_int_dst, startTime, endTime); 1349 | } 1350 | #endif 1351 | for (size_t i = 0; i < num_times; i++) 1352 | { 1353 | startTime = high_resolution_clock::now(); 1354 | copy(std::execution::par, data_int_src.begin(), data_int_src.end(), data_int_dst.begin()); 1355 | endTime = high_resolution_clock::now(); 1356 | print_results("Parallel std::copy", data_int_dst, startTime, endTime); 1357 | } 1358 | 1359 | for (size_t i = 0; i < num_times; i++) 1360 | { 1361 | startTime = high_resolution_clock::now(); 1362 | copy(std::execution::par_unseq, data_int_src.begin(), data_int_src.end(), data_int_dst.begin()); 1363 | endTime = high_resolution_clock::now(); 1364 | print_results("Parallel SIMD 
std::copy", data_int_dst, startTime, endTime); 1365 | } 1366 | 1367 | // dpl::stable_sort benchmarks 1368 | #ifdef DPL_ALGORITHMS 1369 | 1370 | for (size_t i = 0; i < num_times; i++) 1371 | { 1372 | startTime = high_resolution_clock::now(); 1373 | copy(oneapi::dpl::execution::seq, data_int_src.begin(), data_int_src.end(), data_int_dst.begin()); 1374 | endTime = high_resolution_clock::now(); 1375 | print_results("Serial dpl::copy", data_int_dst, startTime, endTime); 1376 | } 1377 | 1378 | for (size_t i = 0; i < num_times; i++) 1379 | { 1380 | startTime = high_resolution_clock::now(); 1381 | copy(oneapi::dpl::execution::unseq, data_int_src.begin(), data_int_src.end(), data_int_dst.begin()); 1382 | endTime = high_resolution_clock::now(); 1383 | print_results("SIMD dpl::copy", data_int_dst, startTime, endTime); 1384 | } 1385 | 1386 | for (size_t i = 0; i < num_times; i++) 1387 | { 1388 | startTime = high_resolution_clock::now(); 1389 | copy(oneapi::dpl::execution::par, data_int_src.begin(), data_int_src.end(), data_int_dst.begin()); 1390 | endTime = high_resolution_clock::now(); 1391 | print_results("Parallel dpl::copy", data_int_dst, startTime, endTime); 1392 | } 1393 | 1394 | for (size_t i = 0; i < num_times; i++) 1395 | { 1396 | startTime = high_resolution_clock::now(); 1397 | copy(oneapi::dpl::execution::par_unseq, data_int_src.begin(), data_int_src.end(), data_int_dst.begin()); 1398 | endTime = high_resolution_clock::now(); 1399 | print_results("Parallel SIMD dpl::copy", data_int_dst, startTime, endTime); 1400 | } 1401 | 1402 | #endif 1403 | } 1404 | 1405 | void equal_benchmark(size_t array_size, size_t num_times) 1406 | { 1407 | std::vector data_int_src_0(100000000, 0); 1408 | std::vector data_int_src_1(100000000, 0); 1409 | high_resolution_clock::time_point startTime, endTime; 1410 | 1411 | printf("\n\n"); 1412 | 1413 | // std::equal benchmarks 1414 | 1415 | for (size_t i = 0; i < num_times; i++) 1416 | { 1417 | startTime = high_resolution_clock::now(); 1418 | 
bool equals = equal(std::execution::seq, data_int_src_0.begin(), data_int_src_0.end(), data_int_src_1.begin()); 1419 | endTime = high_resolution_clock::now(); 1420 | if (equals) 1421 | print_results("Serial std::equal", data_int_src_0, startTime, endTime); 1422 | else 1423 | exit(1); 1424 | } 1425 | #ifndef MICROSOFT_ALGORITHMS 1426 | for (size_t i = 0; i < num_times; i++) 1427 | { 1428 | startTime = high_resolution_clock::now(); 1429 | if (equal(std::execution::unseq, data_int_src_0.begin(), data_int_src_0.end(), data_int_src_1.begin())) 1430 | printf("Arrays are equal\n"); 1431 | endTime = high_resolution_clock::now(); 1432 | print_results("Serial SIMD std::equal", data_int_src_0, startTime, endTime); 1433 | } 1434 | #endif 1435 | for (size_t i = 0; i < num_times; i++) 1436 | { 1437 | startTime = high_resolution_clock::now(); 1438 | bool equals = equal(std::execution::par, data_int_src_0.begin(), data_int_src_0.end(), data_int_src_1.begin()); 1439 | endTime = high_resolution_clock::now(); 1440 | if (equals) 1441 | print_results("Parallel std::equal", data_int_src_0, startTime, endTime); 1442 | else 1443 | exit(1); 1444 | } 1445 | 1446 | for (size_t i = 0; i < num_times; i++) 1447 | { 1448 | startTime = high_resolution_clock::now(); 1449 | bool equals = equal(std::execution::par_unseq, data_int_src_0.begin(), data_int_src_0.end(), data_int_src_1.begin()); 1450 | endTime = high_resolution_clock::now(); 1451 | if (equals) 1452 | print_results("Parallel SIMD std::equal", data_int_src_0, startTime, endTime); 1453 | else 1454 | exit(1); 1455 | } 1456 | 1457 | // dpl::stable_sort benchmarks 1458 | #ifdef DPL_ALGORITHMS 1459 | 1460 | for (size_t i = 0; i < num_times; i++) 1461 | { 1462 | startTime = high_resolution_clock::now(); 1463 | bool equals = equal(oneapi::dpl::execution::seq, data_int_src_0.begin(), data_int_src_0.end(), data_int_src_1.begin()); 1464 | endTime = high_resolution_clock::now(); 1465 | if (equals) 1466 | print_results("Serial dpl::equal", 
data_int_src_0, startTime, endTime); 1467 | else 1468 | exit(1); 1469 | } 1470 | 1471 | for (size_t i = 0; i < num_times; i++) 1472 | { 1473 | startTime = high_resolution_clock::now(); 1474 | bool equals = equal(oneapi::dpl::execution::unseq, data_int_src_0.begin(), data_int_src_0.end(), data_int_src_1.begin()); 1475 | endTime = high_resolution_clock::now(); 1476 | if (equals) 1477 | print_results("SIMD dpl::equal", data_int_src_0, startTime, endTime); 1478 | else 1479 | exit(1); 1480 | } 1481 | 1482 | for (size_t i = 0; i < num_times; i++) 1483 | { 1484 | startTime = high_resolution_clock::now(); 1485 | bool equals = equal(oneapi::dpl::execution::par, data_int_src_0.begin(), data_int_src_0.end(), data_int_src_1.begin()); 1486 | endTime = high_resolution_clock::now(); 1487 | if (equals) 1488 | print_results("Parallel dpl::equal", data_int_src_0, startTime, endTime); 1489 | else 1490 | exit(1); 1491 | } 1492 | 1493 | for (size_t i = 0; i < num_times; i++) 1494 | { 1495 | startTime = high_resolution_clock::now(); 1496 | bool equals = equal(oneapi::dpl::execution::par_unseq, data_int_src_0.begin(), data_int_src_0.end(), data_int_src_1.begin()); 1497 | endTime = high_resolution_clock::now(); 1498 | if (equals) 1499 | print_results("Parallel SIMD dpl::equal", data_int_src_0, startTime, endTime); 1500 | else 1501 | exit(1); 1502 | } 1503 | #endif 1504 | } 1505 | 1506 | 1507 | void count_benchmark(size_t array_size, size_t num_times) 1508 | { 1509 | std::vector data_int_src(array_size); 1510 | size_t num_items; 1511 | high_resolution_clock::time_point startTime, endTime; 1512 | 1513 | printf("\n\n"); 1514 | 1515 | for (size_t i = 0; i < array_size; i++) 1516 | { 1517 | data_int_src[i] = (int)i; 1518 | } 1519 | //for (auto& d : data_int_src) { 1520 | // d = static_cast(rd()); 1521 | //} 1522 | 1523 | // std::count benchmarks 1524 | 1525 | for (size_t i = 0; i < num_times; i++) 1526 | { 1527 | startTime = high_resolution_clock::now(); 1528 | num_items = 
count(std::execution::seq, data_int_src.begin(), data_int_src.end(), 42); 1529 | endTime = high_resolution_clock::now(); 1530 | print_results("Serial std::count", num_items, data_int_src, startTime, endTime); 1531 | } 1532 | #ifndef MICROSOFT_ALGORITHMS 1533 | for (size_t i = 0; i < num_times; i++) 1534 | { 1535 | startTime = high_resolution_clock::now(); 1536 | num_items = count(std::execution::unseq, data_int_src.begin(), data_int_src.end(), 42); 1537 | endTime = high_resolution_clock::now(); 1538 | print_results("Serial SIMD std::count", num_items, data_int_src, startTime, endTime); 1539 | } 1540 | #endif 1541 | for (size_t i = 0; i < num_times; i++) 1542 | { 1543 | startTime = high_resolution_clock::now(); 1544 | num_items = count(std::execution::par, data_int_src.begin(), data_int_src.end(), 42); 1545 | endTime = high_resolution_clock::now(); 1546 | print_results("Parallel std::count", num_items, data_int_src, startTime, endTime); 1547 | } 1548 | 1549 | for (size_t i = 0; i < num_times; i++) 1550 | { 1551 | startTime = high_resolution_clock::now(); 1552 | num_items = count(std::execution::par_unseq, data_int_src.begin(), data_int_src.end(), 42); 1553 | endTime = high_resolution_clock::now(); 1554 | print_results("Parallel SIMD std::count", num_items, data_int_src, startTime, endTime); 1555 | } 1556 | 1557 | // dpl::count benchmarks 1558 | #ifdef DPL_ALGORITHMS 1559 | 1560 | for (size_t i = 0; i < num_times; i++) 1561 | { 1562 | startTime = high_resolution_clock::now(); 1563 | num_items = count(oneapi::dpl::execution::seq, data_int_src.begin(), data_int_src.end(), 42); 1564 | endTime = high_resolution_clock::now(); 1565 | print_results("Serial dpl::count", num_items, data_int_src, startTime, endTime); 1566 | } 1567 | 1568 | for (size_t i = 0; i < num_times; i++) 1569 | { 1570 | startTime = high_resolution_clock::now(); 1571 | num_items = count(oneapi::dpl::execution::unseq, data_int_src.begin(), data_int_src.end(), 42); 1572 | endTime = 
high_resolution_clock::now(); 1573 | print_results("SIMD dpl::count", num_items, data_int_src, startTime, endTime); 1574 | } 1575 | 1576 | for (size_t i = 0; i < num_times; i++) 1577 | { 1578 | startTime = high_resolution_clock::now(); 1579 | num_items = count(oneapi::dpl::execution::par, data_int_src.begin(), data_int_src.end(), 42); 1580 | endTime = high_resolution_clock::now(); 1581 | print_results("Parallel dpl::count", num_items, data_int_src, startTime, endTime); 1582 | } 1583 | 1584 | for (size_t i = 0; i < num_times; i++) 1585 | { 1586 | startTime = high_resolution_clock::now(); 1587 | num_items = count(oneapi::dpl::execution::par_unseq, data_int_src.begin(), data_int_src.end(), 42); 1588 | endTime = high_resolution_clock::now(); 1589 | print_results("Parallel SIMD dpl::count", num_items, data_int_src, startTime, endTime); 1590 | } 1591 | 1592 | #endif 1593 | } 1594 | 1595 | void adjacent_find_benchmark(size_t array_size, size_t num_times) 1596 | { 1597 | std::vector data_int(array_size, 2); 1598 | high_resolution_clock::time_point startTime, endTime; 1599 | 1600 | printf("\nAdjacent Find\n"); 1601 | 1602 | for (size_t i = 0; i < array_size; i++) // force all adjacent elements to be different - i.e. 
no matching pairs 1603 | { 1604 | if (i % 2 == 0) 1605 | data_int[i] = 3; 1606 | } 1607 | 1608 | // std::adjacent_find benchmarks 1609 | 1610 | for (size_t i = 0; i < num_times; i++) 1611 | { 1612 | startTime = high_resolution_clock::now(); 1613 | auto iequal = adjacent_find (std::execution::seq, data_int.begin(), data_int.end()); 1614 | endTime = high_resolution_clock::now(); 1615 | if (iequal == data_int.end()) printf("No equal pairs found\n"); 1616 | print_results("Serial std::adjacent_find", data_int, startTime, endTime); 1617 | } 1618 | #ifndef MICROSOFT_ALGORITHMS 1619 | for (size_t i = 0; i < num_times; i++) 1620 | { 1621 | startTime = high_resolution_clock::now(); 1622 | auto iequal = adjacent_find(std::execution::unseq, data_int.begin(), data_int.end()); 1623 | endTime = high_resolution_clock::now(); 1624 | if (iequal == data_int.end()) printf("No equal pairs found\n"); 1625 | print_results("Serial SIMD std::adjacent_find", data_int, startTime, endTime); 1626 | } 1627 | #endif 1628 | for (size_t i = 0; i < num_times; i++) 1629 | { 1630 | startTime = high_resolution_clock::now(); 1631 | auto iequal = adjacent_find(std::execution::par, data_int.begin(), data_int.end()); 1632 | endTime = high_resolution_clock::now(); 1633 | if (iequal == data_int.end()) printf("No equal pairs found\n"); 1634 | print_results("Parallel std::adjacent_find", data_int, startTime, endTime); 1635 | } 1636 | 1637 | for (size_t i = 0; i < num_times; i++) 1638 | { 1639 | startTime = high_resolution_clock::now(); 1640 | auto iequal = adjacent_find(std::execution::par_unseq, data_int.begin(), data_int.end()); 1641 | endTime = high_resolution_clock::now(); 1642 | if (iequal == data_int.end()) printf("No equal pairs found\n"); 1643 | print_results("Parallel SIMD std::adjacent_find", data_int, startTime, endTime); 1644 | } 1645 | // dpl::adjacent_find benchmarks 1646 | #ifdef DPL_ALGORITHMS 1647 | 1648 | for (size_t i = 0; i < num_times; i++) 1649 | { 1650 | startTime = 
high_resolution_clock::now(); 1651 | auto iequal = adjacent_find(oneapi::dpl::execution::seq, data_int.begin(), data_int.end()); 1652 | endTime = high_resolution_clock::now(); 1653 | if (iequal == data_int.end()) printf("No equal pairs found\n"); 1654 | print_results("Serial dpl::adjacent_find", data_int, startTime, endTime); 1655 | } 1656 | 1657 | for (size_t i = 0; i < num_times; i++) 1658 | { 1659 | startTime = high_resolution_clock::now(); 1660 | auto iequal = adjacent_find(oneapi::dpl::execution::unseq, data_int.begin(), data_int.end()); 1661 | endTime = high_resolution_clock::now(); 1662 | if (iequal == data_int.end()) printf("No equal pairs found\n"); 1663 | print_results("SIMD dpl::adjacent_find", data_int, startTime, endTime); 1664 | } 1665 | 1666 | for (size_t i = 0; i < num_times; i++) 1667 | { 1668 | startTime = high_resolution_clock::now(); 1669 | auto iequal = adjacent_find(oneapi::dpl::execution::par, data_int.begin(), data_int.end()); 1670 | endTime = high_resolution_clock::now(); 1671 | if (iequal == data_int.end()) printf("No equal pairs found\n"); 1672 | print_results("Parallel dpl::adjacent_find", data_int, startTime, endTime); 1673 | } 1674 | 1675 | for (size_t i = 0; i < num_times; i++) 1676 | { 1677 | startTime = high_resolution_clock::now(); 1678 | auto iequal = adjacent_find(oneapi::dpl::execution::par_unseq, data_int.begin(), data_int.end()); 1679 | endTime = high_resolution_clock::now(); 1680 | if (iequal == data_int.end()) printf("No equal pairs found\n"); 1681 | print_results("Parallel SIMD dpl::adjacent_find", data_int, startTime, endTime); 1682 | } 1683 | #endif 1684 | } 1685 | 1686 | void adjacent_difference_benchmark(size_t array_size, size_t num_times) 1687 | { 1688 | std::vector data_int_src(array_size); 1689 | std::vector data_int_dst(array_size, 10); 1690 | high_resolution_clock::time_point startTime, endTime; 1691 | 1692 | printf("\nAdjacent Difference\n"); 1693 | 1694 | for (size_t i = 0; i < array_size; i++) 1695 | { 1696 | 
data_int_src[i] = (int)i; 1697 | } 1698 | //for (auto& d : data_int_src) { 1699 | // d = static_cast(rd()); 1700 | //} 1701 | 1702 | // std::adjacent_difference benchmarks 1703 | printf("Benchmarks:\n"); 1704 | 1705 | for (size_t i = 0; i < num_times; i++) 1706 | { 1707 | startTime = high_resolution_clock::now(); 1708 | adjacent_difference(std::execution::seq, data_int_src.begin(), data_int_src.end(), data_int_dst.begin()); 1709 | endTime = high_resolution_clock::now(); 1710 | print_results("Serial std::adjacent_difference", data_int_src, startTime, endTime); 1711 | } 1712 | #ifndef MICROSOFT_ALGORITHMS 1713 | for (size_t i = 0; i < num_times; i++) 1714 | { 1715 | startTime = high_resolution_clock::now(); 1716 | adjacent_difference(std::execution::unseq, data_int_src.begin(), data_int_src.end(), data_int_dst.begin()); 1717 | endTime = high_resolution_clock::now(); 1718 | print_results("Serial SIMD std::adjacent_difference", data_int_src, startTime, endTime); 1719 | } 1720 | #endif 1721 | for (size_t i = 0; i < num_times; i++) 1722 | { 1723 | startTime = high_resolution_clock::now(); 1724 | adjacent_difference(std::execution::par, data_int_src.begin(), data_int_src.end(), data_int_dst.begin()); 1725 | endTime = high_resolution_clock::now(); 1726 | print_results("Parallel std::adjacent_difference", data_int_src, startTime, endTime); 1727 | } 1728 | 1729 | for (size_t i = 0; i < num_times; i++) 1730 | { 1731 | startTime = high_resolution_clock::now(); 1732 | adjacent_difference(std::execution::par_unseq, data_int_src.begin(), data_int_src.end(), data_int_dst.begin()); 1733 | endTime = high_resolution_clock::now(); 1734 | print_results("Parallel SIMD std::adjacent_difference", data_int_src, startTime, endTime); 1735 | } 1736 | #if 0 1737 | // dpl::adjacent_difference benchmarks (Intel doesn't implement!) 
1738 | #ifdef DPL_ALGORITHMS 1739 | 1740 | for (size_t i = 0; i < num_times; i++) 1741 | { 1742 | startTime = high_resolution_clock::now(); 1743 | adjacent_difference(oneapi::dpl::execution::seq, data_int_src.begin(), data_int_src.end(), data_int_dst.begin()); 1744 | endTime = high_resolution_clock::now(); 1745 | print_results("Serial dpl::adjacent_difference", data_int_src, startTime, endTime); 1746 | } 1747 | 1748 | for (size_t i = 0; i < num_times; i++) 1749 | { 1750 | startTime = high_resolution_clock::now(); 1751 | adjacent_difference(oneapi::dpl::execution::unseq, data_int_src.begin(), data_int_src.end(), data_int_dst.begin()); 1752 | endTime = high_resolution_clock::now(); 1753 | print_results("SIMD dpl::adjacent_difference", data_int_src, startTime, endTime); 1754 | } 1755 | 1756 | for (size_t i = 0; i < num_times; i++) 1757 | { 1758 | startTime = high_resolution_clock::now(); 1759 | adjacent_difference(oneapi::dpl::execution::par, data_int_src.begin(), data_int_src.end(), data_int_dst.begin()); 1760 | endTime = high_resolution_clock::now(); 1761 | print_results("Parallel dpl::adjacent_difference", data_int_src, startTime, endTime); 1762 | } 1763 | 1764 | for (size_t i = 0; i < num_times; i++) 1765 | { 1766 | startTime = high_resolution_clock::now(); 1767 | adjacent_difference(oneapi::dpl::execution::par_unseq, data_int_src.begin(), data_int_src.end(), data_int_dst.begin()); 1768 | endTime = high_resolution_clock::now(); 1769 | print_results("Parallel SIMD dpl::adjacent_difference", data_int_src, startTime, endTime); 1770 | } 1771 | 1772 | #endif 1773 | #endif 1774 | } 1775 | 1776 | void max_element_benchmark(size_t array_size, size_t num_times) 1777 | { 1778 | std::vector data_int_src(array_size); 1779 | high_resolution_clock::time_point startTime, endTime; 1780 | std::vector::iterator max_index; 1781 | 1782 | printf("\n\n"); 1783 | 1784 | for (size_t i = 0; i < array_size; i++) 1785 | { 1786 | data_int_src[i] = (int)i; 1787 | } 1788 | //for (auto& d : 
data_int_src) { 1789 | // d = static_cast(rd()); 1790 | //} 1791 | #if 1 1792 | // std::max_element benchmarks 1793 | 1794 | for (size_t i = 0; i < num_times; i++) 1795 | { 1796 | startTime = high_resolution_clock::now(); 1797 | max_index = max_element(std::execution::seq, data_int_src.begin(), data_int_src.end()); 1798 | endTime = high_resolution_clock::now(); 1799 | print_results("Serial std::max_element", max_index, data_int_src, startTime, endTime); 1800 | } 1801 | #ifndef MICROSOFT_ALGORITHMS 1802 | for (size_t i = 0; i < num_times; i++) 1803 | { 1804 | startTime = high_resolution_clock::now(); 1805 | max_index = max_element(std::execution::unseq, data_int_src.begin(), data_int_src.end()); 1806 | endTime = high_resolution_clock::now(); 1807 | print_results("Serial SIMD std::max_element", max_index, data_int_src, startTime, endTime); 1808 | } 1809 | #endif 1810 | for (size_t i = 0; i < num_times; i++) 1811 | { 1812 | startTime = high_resolution_clock::now(); 1813 | max_index = max_element(std::execution::par, data_int_src.begin(), data_int_src.end()); 1814 | endTime = high_resolution_clock::now(); 1815 | print_results("Parallel std::max_element", max_index, data_int_src, startTime, endTime); 1816 | } 1817 | 1818 | for (size_t i = 0; i < num_times; i++) 1819 | { 1820 | startTime = high_resolution_clock::now(); 1821 | max_index = max_element(std::execution::par_unseq, data_int_src.begin(), data_int_src.end()); 1822 | endTime = high_resolution_clock::now(); 1823 | print_results("Parallel SIMD std::max_element", max_index, data_int_src, startTime, endTime); 1824 | } 1825 | #endif 1826 | // dpl::max_element benchmarks 1827 | #ifdef DPL_ALGORITHMS 1828 | 1829 | for (size_t i = 0; i < num_times; i++) 1830 | { 1831 | startTime = high_resolution_clock::now(); 1832 | max_index = max_element(oneapi::dpl::execution::seq, data_int_src.begin(), data_int_src.end()); 1833 | endTime = high_resolution_clock::now(); 1834 | print_results("Serial dpl::max_element", max_index, 
data_int_src, startTime, endTime); 1835 | } 1836 | 1837 | for (size_t i = 0; i < num_times; i++) 1838 | { 1839 | startTime = high_resolution_clock::now(); 1840 | max_index = max_element(oneapi::dpl::execution::unseq, data_int_src.begin(), data_int_src.end()); 1841 | endTime = high_resolution_clock::now(); 1842 | print_results("SIMD dpl::max_element", max_index, data_int_src, startTime, endTime); 1843 | } 1844 | 1845 | for (size_t i = 0; i < num_times; i++) 1846 | { 1847 | startTime = high_resolution_clock::now(); 1848 | max_index = max_element(oneapi::dpl::execution::par, data_int_src.begin(), data_int_src.end()); 1849 | endTime = high_resolution_clock::now(); 1850 | print_results("Parallel dpl::max_element", max_index, data_int_src, startTime, endTime); 1851 | } 1852 | 1853 | for (size_t i = 0; i < num_times; i++) 1854 | { 1855 | startTime = high_resolution_clock::now(); 1856 | max_index = max_element(oneapi::dpl::execution::par_unseq, data_int_src.begin(), data_int_src.end()); 1857 | endTime = high_resolution_clock::now(); 1858 | print_results("Parallel SIMD dpl::max_element", max_index, data_int_src, startTime, endTime); 1859 | } 1860 | 1861 | #endif 1862 | } 1863 | 1864 | 1865 | int main() 1866 | { 1867 | size_t array_size = 100'000'000; 1868 | size_t number_of_tests = 5; 1869 | 1870 | max_element_benchmark( array_size, number_of_tests); // for small arrays parallel implementation is much slower than serial 1871 | adjacent_difference_benchmark(array_size, number_of_tests); // for small arrays parallel implementation is much slower than serial 1872 | adjacent_find_benchmark( array_size, number_of_tests); // for small arrays parallel implementation is much slower than serial 1873 | all_of_benchmark( array_size, number_of_tests); 1874 | any_of_benchmark( array_size, number_of_tests); 1875 | count_benchmark( array_size, number_of_tests); 1876 | //count_benchmark(10000, 20); // for small arrays parallel implementations are much slower than serial 1877 | 
equal_benchmark( array_size, number_of_tests); 1878 | copy_benchmark( array_size, number_of_tests); 1879 | //copy_benchmark( 10000, 10); // for small arrays parallel implementation is much slower than serial 1880 | fill_benchmark(array_size, number_of_tests); 1881 | merge_benchmark( array_size, number_of_tests); 1882 | inplace_merge_benchmark( array_size, number_of_tests); 1883 | //merge_dual_buffer_benchmark( 100000000, 10); 1884 | //merge_single_buffer_benchmark( 10000, 10); 1885 | //fill_long_long_benchmark( 100000000, 10); 1886 | sort_benchmark( array_size, number_of_tests); 1887 | //sort_doubles_benchmark( 100000000, 10, true ); 1888 | //sort_doubles_benchmark( 100000000, 10, false); 1889 | stable_sort_benchmark( array_size, number_of_tests); 1890 | 1891 | return 0; 1892 | } 1893 | --------------------------------------------------------------------------------