├── vcpkg ├── versions │ └── r- │ │ └── rtc-benchmarksuite.json └── ports │ └── rtc-benchmarksuite │ ├── portfile.cmake │ └── vcpkg.json ├── .gitignore ├── License.md ├── .github └── workflows │ ├── Construct-Vcpkg-Info.yml │ ├── DeleteRuns.yml │ └── benchmark.yml ├── cmake ├── benchmarksuiteConfig.cmake.in ├── detection │ ├── CMakeLists.txt │ ├── benchmarksuite_cpu_properties.hpp.in │ ├── benchmarksuite_gpu_properties.hpp.in │ ├── benchmarksuite_detect_gpu_properties.cmake │ ├── main.cpp │ └── benchmarksuite_detect_cpu_properties.cmake └── flags_and_options.cmake ├── include └── bnch_swt │ ├── config.hpp │ ├── benchmarksuite_cpu_properties.hpp │ ├── counters │ ├── andriod_events.hpp │ ├── windows_perf_events.hpp │ ├── cuda_perf_events.hpp │ └── linux_perf_events.hpp │ ├── file_loader.hpp │ ├── benchmarksuite_gpu_properties.hpp │ ├── aligned_const.hpp │ ├── do_not_optimize.hpp │ ├── printable.hpp │ ├── event_counter.hpp │ ├── string_literal.hpp │ ├── cache_clearer.hpp │ ├── random_generators.hpp │ ├── concepts.hpp │ ├── metrics.hpp │ └── index.hpp ├── .clang-format ├── src ├── main.cpp ├── CMakeLists.txt └── main.cu ├── CMakePresets.json └── CMakeLists.txt /vcpkg/versions/r-/rtc-benchmarksuite.json: -------------------------------------------------------------------------------- 1 | { 2 | "versions": [ 3 | { 4 | "git-tree": "70b1a7b0086e9e0d1ab2d892830c54aa076af23d", 5 | "version": "1.0.0", 6 | "port-version": 0 7 | } 8 | ] 9 | } 10 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .vs/ 2 | out/ 3 | Build/ 4 | build/ 5 | 6 | cmake/detection/Build-Gpu-* 7 | cmake/detection/Build-Cpu-* 8 | cmake/detection/build_feature_tester_gpu_properties.bat 9 | cmake/detection/build_feature_tester_cpu_properties.bat 10 | -------------------------------------------------------------------------------- /vcpkg/ports/rtc-benchmarksuite/portfile.cmake: 
-------------------------------------------------------------------------------- 1 | vcpkg_from_github( 2 | OUT_SOURCE_PATH SOURCE_PATH 3 | REPO realtimechris/benchmarksuite 4 | REF "v${VERSION}" 5 | SHA512 277f8e33d836c99c9a2f7b51e92c6c2df8bc549483118d77022a0776c493423975c118482b369c6fd728907fd76af02474d7d2d34ac9e335bb8314bed0866268 6 | HEAD_REF main 7 | ) 8 | 9 | set(VCPKG_BUILD_TYPE release) # header-only 10 | 11 | vcpkg_cmake_configure( 12 | SOURCE_PATH "${SOURCE_PATH}" 13 | ) 14 | 15 | vcpkg_cmake_install() 16 | 17 | vcpkg_install_copyright(FILE_LIST "${SOURCE_PATH}/License.md") 18 | -------------------------------------------------------------------------------- /vcpkg/ports/rtc-benchmarksuite/vcpkg.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "rtc-benchmarksuite", 3 | "version": "1.0.1", 4 | "description": "A header-only C++ benchmarking library with cross-platform hardware performance counter integration, providing precise measurements of cycles, instructions, branches, cache behavior, and throughput with minimal overhead.", 5 | "homepage": "https://github.com/realtimechris/benchmarksuite", 6 | "license": "MIT", 7 | "supports": "(windows & x64 & !xbox) | (linux & x64) | (osx & x64)", 8 | "dependencies": [ 9 | { 10 | "name": "vcpkg-cmake", 11 | "host": true 12 | } 13 | ] 14 | } 15 | -------------------------------------------------------------------------------- /License.md: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2025 RealTimeChris 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software 
is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /.github/workflows/Construct-Vcpkg-Info.yml: -------------------------------------------------------------------------------- 1 | name: Prepare release for VCPKG 2 | 3 | on: 4 | release: 5 | types: [published] 6 | workflow_dispatch: 7 | 8 | permissions: 9 | contents: write 10 | 11 | jobs: 12 | Collect-Vcpkg-Info: 13 | runs-on: ubuntu-latest 14 | steps: 15 | - name: Setup PHP 16 | uses: shivammathur/setup-php@v2 17 | with: 18 | php-version: '8.1' 19 | 20 | - name: Checkout benchmarksuite 21 | uses: actions/checkout@v4 22 | with: 23 | submodules: recursive 24 | 25 | - name: Update vcpkg and install other dependencies 26 | run: sudo apt-get update && 27 | sudo apt-get install nasm linux-headers-$(uname -r) && 28 | cd /usr/local/share/vcpkg && 29 | sudo ./bootstrap-vcpkg.sh && 30 | sudo git stash && 31 | sudo git pull && 32 | sudo vcpkg update 33 | 34 | - name: Run vcpkg release builder 35 | run: git fetch --tags --force && 36 | cd build_tools && 37 | php make_vcpkg.php "${{ github.repository_owner }}" "${{ secrets.GITHUB_TOKEN }}" 38 | -------------------------------------------------------------------------------- /cmake/benchmarksuiteConfig.cmake.in: 
-------------------------------------------------------------------------------- 1 | # MIT License 2 | # 3 | # Copyright (c) 2024 RealTimeChris 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy of this 6 | # software and associated documentation files (the "Software"), to deal in the Software 7 | # without restriction, including without limitation the rights to use, copy, modify, merge, 8 | # publish, distribute, sublicense, and/or sell copies of the Software, and to permit 9 | # persons to whom the Software is furnished to do so, subject to the following conditions: 10 | # 11 | # The above copyright notice and this permission notice shall be included in all copies or 12 | # substantial portions of the Software. 13 | # 14 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, 15 | # INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR 16 | # PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE 17 | # FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR 18 | # OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 19 | # DEALINGS IN THE SOFTWARE. 
20 | 21 | @PACKAGE_INIT@ 22 | 23 | set_and_check(EXPORT_TARGETS_FILE_NEW "@PACKAGE_EXPORTED_TARGETS_FILE_PATH@") 24 | 25 | include("${EXPORT_TARGETS_FILE_NEW}") 26 | 27 | check_required_components("@PROJECT_NAME@") -------------------------------------------------------------------------------- /include/bnch_swt/config.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | MIT License 3 | 4 | Copyright (c) 2024 RealTimeChris 5 | 6 | Permission is hereby granted, free of charge, to any person obtaining a copy of this 7 | software and associated documentation files (the "Software"), to deal in the Software 8 | without restriction, including without limitation the rights to use, copy, modify, merge, 9 | publish, distribute, sublicense, and/or sell copies of the Software, and to permit 10 | persons to whom the Software is furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all copies or 13 | substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, 16 | INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR 17 | PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE 18 | FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR 19 | OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 20 | DEALINGS IN THE SOFTWARE. 
21 | */ 22 | /// https://github.com/RealTimeChris/benchmarksuite 23 | #pragma once 24 | 25 | #include 26 | #include 27 | 28 | #if BNCH_SWT_COMPILER_CUDA 29 | #define BNCH_SWT_ALIGN(x) __align__(x) 30 | #include 31 | #include 32 | #else 33 | #define BNCH_SWT_ALIGN(x) alignas(x) 34 | #endif 35 | 36 | namespace bnch_swt { 37 | 38 | using clock_type = std::conditional_t; 39 | using duration_type = std::chrono::duration; 40 | using time_point_type = std::chrono::time_point; 41 | 42 | enum class benchmark_types { 43 | cpu, 44 | cuda, 45 | }; 46 | 47 | namespace internal { 48 | 49 | template struct event_collector_type; 50 | 51 | } 52 | 53 | } 54 | -------------------------------------------------------------------------------- /.clang-format: -------------------------------------------------------------------------------- 1 | --- 2 | AlignAfterOpenBracket: DontAlign 3 | AlignConsecutiveAssignments: true 4 | AlignConsecutiveDeclarations: false 5 | AlignConsecutiveMacros: false 6 | AlignEscapedNewlines: DontAlign 7 | AlignOperands: false 8 | AlignTrailingComments: false 9 | AllowAllArgumentsOnNextLine: false 10 | AllowAllConstructorInitializersOnNextLine: false 11 | AllowAllParametersOfDeclarationOnNextLine: false 12 | AllowShortBlocksOnASingleLine: false 13 | AllowShortCaseLabelsOnASingleLine: false 14 | AllowShortLambdasOnASingleLine: None 15 | AllowShortFunctionsOnASingleLine: None 16 | AllowShortIfStatementsOnASingleLine: Never 17 | AllowShortLoopsOnASingleLine: false 18 | AlwaysBreakAfterDefinitionReturnType: None 19 | AlwaysBreakAfterReturnType: None 20 | AlwaysBreakTemplateDeclarations: No 21 | ColumnLimit: 180 22 | CompactNamespaces: false 23 | ConstructorInitializerIndentWidth: 4 24 | ContinuationIndentWidth: 4 25 | Cpp11BracedListStyle: false 26 | FixNamespaceComments: false 27 | IndentCaseLabels: true 28 | IndentPPDirectives: BeforeHash 29 | IndentWidth: 4 30 | IndentWrappedFunctionNames: false 31 | KeepEmptyLinesAtTheStartOfBlocks: false 32 | Language: Cpp 33 | 
MaxEmptyLinesToKeep: 4 34 | NamespaceIndentation: All 35 | ObjCBinPackProtocolList: Auto 36 | ObjCBlockIndentWidth: 4 37 | ObjCSpaceAfterProperty: false 38 | ObjCSpaceBeforeProtocolList: true 39 | PointerAlignment: Left 40 | ReflowComments: false 41 | SortIncludes: false 42 | SortUsingDeclarations: false 43 | SpaceAfterCStyleCast: false 44 | SpaceAfterLogicalNot: false 45 | SpaceAfterTemplateKeyword: false 46 | SpaceBeforeAssignmentOperators: true 47 | SpaceBeforeCpp11BracedList: false 48 | SpaceBeforeCtorInitializerColon: true 49 | SpaceBeforeInheritanceColon: true 50 | SpaceBeforeParens: ControlStatements 51 | SpaceBeforeRangeBasedForLoopColon: false 52 | SpaceInEmptyParentheses: false 53 | SpacesBeforeTrailingComments: 0 54 | SpacesInAngles: false 55 | SpacesInContainerLiterals: true 56 | SpacesInCStyleCastParentheses: true 57 | SpacesInParentheses: false 58 | SpacesInSquareBrackets: false 59 | Standard: Cpp11 60 | TabWidth: 4 61 | UseTab: Always 62 | -------------------------------------------------------------------------------- /cmake/detection/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # MIT License 2 | # 3 | # Copyright (c) 2024 RealTimeChris 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy of this 6 | # software and associated documentation files (the "Software"), to deal in the Software 7 | # without restriction, including without limitation the rights to use, copy, modify, merge, 8 | # publish, distribute, sublicense, and/or sell copies of the Software, and to permit 9 | # persons to whom the Software is furnished to do so, subject to the following conditions: 10 | # 11 | # The above copyright notice and this permission notice shall be included in all copies or 12 | # substantial portions of the Software. 
13 | # 14 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, 15 | # INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR 16 | # PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE 17 | # FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR 18 | # OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 19 | # DEALINGS IN THE SOFTWARE. 20 | 21 | cmake_minimum_required(VERSION 3.28) 22 | 23 | project(feature_detector 24 | VERSION 1.0.0 25 | LANGUAGES CXX 26 | ) 27 | 28 | add_executable(feature_detector main.cpp) 29 | 30 | include(${CMAKE_SOURCE_DIR}/../flags_and_options.cmake) 31 | 32 | if(BNCH_SWT_DETECT_CPU_PROPERTIES) 33 | target_compile_definitions(feature_detector 34 | PUBLIC 35 | BNCH_SWT_DETECT_CPU_PROPERTIES 36 | ${BNCH_SWT_COMPILE_DEFINITIONS} 37 | ) 38 | 39 | elseif(BNCH_SWT_DETECT_GPU_PROPERTIES) 40 | find_package(CUDAToolkit REQUIRED) 41 | set_property(GLOBAL PROPERTY CUDA_ARCHITECTURES native) 42 | 43 | target_link_libraries(feature_detector 44 | PUBLIC 45 | CUDA::cudart 46 | ) 47 | 48 | target_compile_definitions(feature_detector 49 | PUBLIC 50 | BNCH_SWT_DETECT_GPU_PROPERTIES 51 | ${BNCH_SWT_COMPILE_DEFINITIONS} 52 | ) 53 | endif() -------------------------------------------------------------------------------- /include/bnch_swt/benchmarksuite_cpu_properties.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | MIT License 3 | 4 | Copyright (c) 2024 RealTimeChris 5 | 6 | Permission is hereby granted, free of charge, to any person obtaining a copy of this 7 | software and associated documentation files (the "Software"), to deal in the Software 8 | without restriction, including without limitation the rights to use, copy, modify, merge, 9 | publish, distribute, sublicense, and/or sell copies of the Software, and to permit 10 | persons to whom 
the Software is furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all copies or 13 | substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, 16 | INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR 17 | PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE 18 | FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR 19 | OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 20 | DEALINGS IN THE SOFTWARE. 21 | */ 22 | /// https://github.com/RealTimeChris/benchmarksuite 23 | #pragma once 24 | 25 | #include 26 | 27 | namespace bnch_swt { 28 | 29 | struct cpu_properties { 30 | protected: 31 | static constexpr aligned_const thread_count_raw{ 32ull }; 32 | static constexpr aligned_const l1_cache_size_raw{ 49152ull }; 33 | static constexpr aligned_const l2_cache_size_raw{ 2097152ull }; 34 | static constexpr aligned_const l3_cache_size_raw{ 37748736ull }; 35 | static constexpr aligned_const cpu_arch_index_raw{ 1ull }; 36 | static constexpr aligned_const cpu_alignment_raw{ 32ull }; 37 | 38 | public: 39 | static constexpr const uint64_t& thread_count{ *thread_count_raw }; 40 | static constexpr const uint64_t& l1_cache_size{ *l1_cache_size_raw }; 41 | static constexpr const uint64_t& l2_cache_size{ *l2_cache_size_raw }; 42 | static constexpr const uint64_t& l3_cache_size{ *l3_cache_size_raw }; 43 | static constexpr const uint64_t& cpu_arch_index{ *cpu_arch_index_raw }; 44 | static constexpr const uint64_t& cpu_alignment{ *cpu_alignment_raw }; 45 | }; 46 | 47 | } 48 | -------------------------------------------------------------------------------- /include/bnch_swt/counters/andriod_events.hpp: -------------------------------------------------------------------------------- 1 | /* 
2 | MIT License 3 | 4 | Copyright (c) 2024 RealTimeChris 5 | 6 | Permission is hereby granted, free of charge, to any person obtaining a copy of this 7 | software and associated documentation files (the "Software"), to deal in the Software 8 | without restriction, including without limitation the rights to use, copy, modify, merge, 9 | publish, distribute, sublicense, and/or sell copies of the Software, and to permit 10 | persons to whom the Software is furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all copies or 13 | substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, 16 | INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR 17 | PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE 18 | FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR 19 | OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 20 | DEALINGS IN THE SOFTWARE. 21 | */ 22 | /// https://github.com/RealTimeChris/benchmarksuite 23 | 24 | #pragma once 25 | 26 | #include 27 | #include 28 | #include 29 | 30 | #if BNCH_SWT_PLATFORM_ANDROID 31 | 32 | namespace bnch_swt::internal { 33 | /// Android event collector: a vector of per-run records, filled one slot per run() call, holding only the returned byte count and the elapsed clock_type time. 34 | template struct event_collector_type : public std::vector { 35 | uint64_t current_index{}; 36 | /// NOTE(review): the Windows collector passes `count` to its base vector here; confirm `count_t` is the size parameter of this specialization. 37 | BNCH_SWT_HOST event_collector_type() : std::vector{ count_t } {}; 38 | /// Calls function_type::impl(args...) once, stores its result as bytes processed and the clock_type time around the call as elapsed time, then advances current_index. 39 | template BNCH_SWT_HOST void run(arg_types&&...
args) { 40 | const auto start_clock = clock_type::now(); 41 | std::vector::operator[](current_index).bytes_processed_val.emplace(static_cast(function_type::impl(std::forward(args)...))); 42 | const auto end_clock = clock_type::now(); 43 | std::vector::operator[](current_index).elapsed_ns_val.emplace(end_clock - start_clock); 44 | ++current_index; 45 | return; 46 | } 47 | }; 48 | 49 | } 50 | 51 | #endif 52 | -------------------------------------------------------------------------------- /cmake/detection/benchmarksuite_cpu_properties.hpp.in: -------------------------------------------------------------------------------- 1 | /* 2 | MIT License 3 | 4 | Copyright (c) 2024 RealTimeChris 5 | 6 | Permission is hereby granted, free of charge, to any person obtaining a copy of this 7 | software and associated documentation files (the "Software"), to deal in the Software 8 | without restriction, including without limitation the rights to use, copy, modify, merge, 9 | publish, distribute, sublicense, and/or sell copies of the Software, and to permit 10 | persons to whom the Software is furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all copies or 13 | substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, 16 | INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR 17 | PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE 18 | FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR 19 | OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 20 | DEALINGS IN THE SOFTWARE.
21 | */ 22 | /// https://github.com/RealTimeChris/benchmarksuite 23 | #pragma once 24 | 25 | #include 26 | 27 | namespace bnch_swt { 28 | 29 | struct cpu_properties { 30 | protected: 31 | static constexpr aligned_const thread_count_raw{ @BNCH_SWT_THREAD_COUNT@ull }; 32 | static constexpr aligned_const l1_cache_size_raw{ @BNCH_SWT_CPU_L1_CACHE_SIZE@ull }; 33 | static constexpr aligned_const l2_cache_size_raw{ @BNCH_SWT_CPU_L2_CACHE_SIZE@ull }; 34 | static constexpr aligned_const l3_cache_size_raw{ @BNCH_SWT_CPU_L3_CACHE_SIZE@ull }; 35 | static constexpr aligned_const cpu_arch_index_raw{ @BNCH_SWT_CPU_ARCH_INDEX@ull }; 36 | static constexpr aligned_const cpu_alignment_raw{ @BNCH_SWT_CPU_ALIGNMENT@ull }; 37 | 38 | public: 39 | static constexpr const uint64_t& thread_count{ *thread_count_raw }; 40 | static constexpr const uint64_t& l1_cache_size{ *l1_cache_size_raw }; 41 | static constexpr const uint64_t& l2_cache_size{ *l2_cache_size_raw }; 42 | static constexpr const uint64_t& l3_cache_size{ *l3_cache_size_raw }; 43 | static constexpr const uint64_t& cpu_arch_index{ *cpu_arch_index_raw }; 44 | static constexpr const uint64_t& cpu_alignment{ *cpu_alignment_raw }; 45 | }; 46 | 47 | } 48 | -------------------------------------------------------------------------------- /include/bnch_swt/counters/windows_perf_events.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | MIT License 3 | 4 | Copyright (c) 2024 RealTimeChris 5 | 6 | Permission is hereby granted, free of charge, to any person obtaining a copy of this 7 | software and associated documentation files (the "Software"), to deal in the Software 8 | without restriction, including without limitation the rights to use, copy, modify, merge, 9 | publish, distribute, sublicense, and/or sell copies of the Software, and to permit 10 | persons to whom the Software is furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission 
notice shall be included in all copies or 13 | substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, 16 | INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR 17 | PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE 18 | FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR 19 | OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 20 | DEALINGS IN THE SOFTWARE. 21 | */ 22 | /// https://github.com/RealTimeChris/benchmarksuite 23 | 24 | #pragma once 25 | 26 | #include 27 | 28 | #if BNCH_SWT_PLATFORM_WINDOWS 29 | 30 | #include 31 | #include 32 | 33 | namespace bnch_swt::internal { 34 | 35 | template struct event_collector_type : public std::vector { 36 | uint64_t current_index{}; 37 | 38 | BNCH_SWT_HOST event_collector_type() : std::vector{ count } {}; 39 | 40 | template BNCH_SWT_HOST void run(arg_types&&... 
args) { 41 | uint64_t result; 42 | const auto start_clock = clock_type::now(); 43 | volatile uint64_t cycleStart = __rdtsc(); 44 | result = static_cast(function_type::impl(std::forward(args)...)); 45 | volatile uint64_t cycleEnd = __rdtsc(); 46 | const auto end_clock = clock_type::now(); 47 | std::vector::operator[](current_index).cycles_val.emplace(cycleEnd - cycleStart); 48 | std::vector::operator[](current_index).elapsed_ns_val.emplace(end_clock - start_clock); 49 | std::vector::operator[](current_index).bytes_processed_val.emplace(result); 50 | ++current_index; 51 | return; 52 | } 53 | }; 54 | 55 | } 56 | #endif 57 | -------------------------------------------------------------------------------- /.github/workflows/DeleteRuns.yml: -------------------------------------------------------------------------------- 1 | name: Delete old workflow runs 2 | on: 3 | workflow_dispatch: 4 | inputs: 5 | days: 6 | description: 'Days-worth of runs to keep for each workflow' 7 | required: true 8 | default: '0' 9 | minimum_runs: 10 | description: 'Minimum runs to keep for each workflow' 11 | required: true 12 | default: '1' 13 | delete_workflow_pattern: 14 | description: 'Name or filename of the workflow (if not set, all workflows are targeted)' 15 | required: false 16 | delete_workflow_by_state_pattern: 17 | description: 'Filter workflows by state: active, deleted, disabled_fork, disabled_inactivity, disabled_manually' 18 | required: true 19 | default: "ALL" 20 | type: choice 21 | options: 22 | - "ALL" 23 | - active 24 | - deleted 25 | - disabled_inactivity 26 | - disabled_manually 27 | delete_run_by_conclusion_pattern: 28 | description: 'Remove runs based on conclusion: action_required, cancelled, failure, skipped, success' 29 | required: true 30 | default: "ALL" 31 | type: choice 32 | options: 33 | - "ALL" 34 | - "Unsuccessful: action_required,cancelled,failure,skipped" 35 | - action_required 36 | - cancelled 37 | - failure 38 | - skipped 39 | - success 40 | dry_run: 41 | 
description: 'Logs simulated changes, no deletions are performed' 42 | required: false 43 | 44 | jobs: 45 | del_runs: 46 | runs-on: ubuntu-latest 47 | permissions: 48 | actions: write 49 | contents: read 50 | steps: 51 | - name: Delete workflow runs 52 | uses: Mattraks/delete-workflow-runs@v2 53 | with: 54 | token: ${{ github.token }} 55 | repository: ${{ github.repository }} 56 | retain_days: ${{ github.event.inputs.days }} 57 | keep_minimum_runs: ${{ github.event.inputs.minimum_runs }} 58 | delete_workflow_pattern: ${{ github.event.inputs.delete_workflow_pattern }} 59 | delete_workflow_by_state_pattern: ${{ github.event.inputs.delete_workflow_by_state_pattern }} 60 | delete_run_by_conclusion_pattern: >- 61 | ${{ 62 | startsWith(github.event.inputs.delete_run_by_conclusion_pattern, 'Unsuccessful:') 63 | && 'action_required,cancelled,failure,skipped' 64 | || github.event.inputs.delete_run_by_conclusion_pattern 65 | }} 66 | dry_run: ${{ github.event.inputs.dry_run }} -------------------------------------------------------------------------------- /src/main.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | MIT License 3 | 4 | Copyright (c) 2024 RealTimeChris 5 | 6 | Permission is hereby granted, free of charge, to any person obtaining a copy of this 7 | software and associated documentation files (the "Software"), to deal in the Software 8 | without restriction, including without limitation the rights to use, copy, modify, merge, 9 | publish, distribute, sublicense, and/or sell copies of the Software, and to permit 10 | persons to whom the Software is furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all copies or 13 | substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, 16 | INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR 17 | PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE 18 | FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR 19 | OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 20 | DEALINGS IN THE SOFTWARE. 21 | */ 22 | /// https://github.com/RealTimeChris/benchmarksuite 23 | #include 24 | 25 | static constexpr uint64_t total_iteration_count{ 10 }; 26 | static constexpr uint64_t measured_iterations{ 10 }; 27 | 28 | template struct test_struct_no_pause { 29 | BNCH_SWT_HOST static uint64_t impl([[maybe_unused]] value_type& values_01) { 30 | [[maybe_unused]] auto start = bnch_swt::clock_type::now(); 31 | [[maybe_unused]] auto end = bnch_swt::clock_type::now(); 32 | while ((end - start).count() < 10000) { 33 | end = bnch_swt::clock_type::now(); 34 | } 35 | values_01[0] += values_01[2]; 36 | bnch_swt::do_not_optimize_away(values_01); 37 | return 200000ull; 38 | } 39 | }; 40 | 41 | template struct test_struct_pause { 42 | BNCH_SWT_HOST static uint64_t impl([[maybe_unused]] value_type& values_01) { 43 | [[maybe_unused]] auto start = bnch_swt::clock_type::now(); 44 | [[maybe_unused]] auto end = bnch_swt::clock_type::now(); 45 | while ((end - start).count() < 10000) { 46 | end = bnch_swt::clock_type::now(); 47 | } 48 | values_01[0] += values_01[2]; 49 | bnch_swt::do_not_optimize_away(values_01); 50 | return 200000ull; 51 | } 52 | }; 53 | 54 | int main() { 55 | using bench_type = bnch_swt::benchmark_stage<"test_stage", total_iteration_count, measured_iterations, bnch_swt::benchmark_types::cpu, false>; 56 | std::vector doubles{}; 57 | for (uint64_t x = 0; x < 1024; ++x) { 58 | doubles.emplace_back(bnch_swt::random_generator::impl()); 59 | } 60 | 61 | 
bench_type::run_benchmark<"no-yield-std::string", test_struct_no_pause>>(doubles); 62 | bench_type::run_benchmark<"yield-std::string", test_struct_pause>>(doubles); 63 | 64 | bench_type::print_results(); 65 | return 0; 66 | } 67 | -------------------------------------------------------------------------------- /include/bnch_swt/file_loader.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | MIT License 3 | 4 | Copyright (c) 2024 RealTimeChris 5 | 6 | Permission is hereby granted, free of charge, to any person obtaining a copy of this 7 | software and associated documentation files (the "Software"), to deal in the Software 8 | without restriction, including without limitation the rights to use, copy, modify, merge, 9 | publish, distribute, sublicense, and/or sell copies of the Software, and to permit 10 | persons to whom the Software is furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all copies or 13 | substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, 16 | INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR 17 | PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE 18 | FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR 19 | OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 20 | DEALINGS IN THE SOFTWARE. 
21 | */ 22 | /// https://github.com/RealTimeChris/benchmarksuite 23 | 24 | #pragma once 25 | 26 | #include 27 | #include 28 | #include 29 | #include 30 | 31 | namespace bnch_swt { 32 | /// Static helpers that read and write whole files through std::filesystem and the standard file streams. 33 | class file_loader { 34 | public: 35 | constexpr file_loader() { 36 | } 37 | /// Returns the contents of file_path, first creating its parent directory and an empty file when they do not exist; a path without '/' has no parent directory to create, so that step is skipped. 38 | static std::string load_file(const std::string& file_path) { 39 | std::string directory{ file_path.substr(0, file_path.find_last_of("/") + 1) }; 40 | if (!directory.empty() && !std::filesystem::exists(directory)) { 41 | std::filesystem::create_directories(directory); 42 | } 43 | if (!std::filesystem::exists(static_cast(file_path))) { 44 | std::ofstream create_file{ file_path.data() }; 45 | create_file.close(); 46 | } 47 | std::ifstream the_stream{ file_path.data(), std::ios::binary | std::ios::in }; 48 | std::stringstream input_stream{}; 49 | input_stream << the_stream.rdbuf(); 50 | the_stream.close(); 51 | return input_stream.str(); 52 | } 53 | /// Writes file_to_save to file_path (truncating) and reports success only when the stream opened and the flushed write left it in a good state; on failure with a missing parent directory it creates the directory and retries once (retry = false on the second attempt). 54 | static void save_file(const std::string& file_to_save, const std::string& file_path, bool retry = true) { 55 | std::ofstream the_stream{ file_path.data(), std::ios::binary | std::ios::out | std::ios::trunc }; 56 | the_stream.write(file_to_save.data(), static_cast(file_to_save.size())); 57 | if (the_stream.is_open() && the_stream.flush().good()) { 58 | std::cout << "File successfully written to: " << file_path << std::endl; 59 | } else { 60 | std::string directory{ file_path.substr(0, file_path.find_last_of("/") + 1) }; 61 | if (retry && !directory.empty() && !std::filesystem::exists(directory)) { 62 | std::filesystem::create_directories(directory); 63 | return save_file(file_to_save, file_path, false); 64 | } 65 | std::cerr << "File failed to be written to: " << file_path << std::endl; 66 | } 67 | the_stream.close(); 68 | } 69 | }; 70 | 71 | } 72 | -------------------------------------------------------------------------------- /include/bnch_swt/benchmarksuite_gpu_properties.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | MIT License 3 | 4 | Copyright (c) 2024 RealTimeChris 5 |
6 | Permission is hereby granted, free of charge, to any person obtaining a copy of this 7 | software and associated documentation files (the "Software"), to deal in the Software 8 | without restriction, including without limitation the rights to use, copy, modify, merge, 9 | publish, distribute, sublicense, and/or sell copies of the Software, and to permit 10 | persons to whom the Software is furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all copies or 13 | substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, 16 | INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR 17 | PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE 18 | FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR 19 | OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 20 | DEALINGS IN THE SOFTWARE. 
21 | */ 22 | /// https://github.com/RealTimeChris/benchmarksuite 23 | #pragma once 24 | 25 | #include 26 | 27 | namespace bnch_swt { 28 | 29 | struct gpu_properties { 30 | protected: 31 | static constexpr aligned_const sm_count_raw{ 70ull }; 32 | static constexpr aligned_const max_threads_per_sm_raw{ 1536ull }; 33 | static constexpr aligned_const max_threads_per_block_raw{ 1024ull }; 34 | static constexpr aligned_const warp_size_raw{ 32ull }; 35 | static constexpr aligned_const l2_cache_size_raw{ 50331648ull }; 36 | static constexpr aligned_const shared_mem_per_block_raw{ 49152ull }; 37 | static constexpr aligned_const max_grid_size_x_raw{ 2147483647ull }; 38 | static constexpr aligned_const max_grid_size_y_raw{ 65535ull }; 39 | static constexpr aligned_const max_grid_size_z_raw{ 65535ull }; 40 | static constexpr aligned_const gpu_arch_index_raw{ 4ull }; 41 | static constexpr aligned_const total_threads_raw{ 107520ull }; 42 | 43 | public: 44 | static constexpr const uint64_t& sm_count{ *sm_count_raw }; 45 | static constexpr const uint64_t& max_threads_per_sm{ *max_threads_per_sm_raw }; 46 | static constexpr const uint64_t& max_threads_per_block{ *max_threads_per_block_raw }; 47 | static constexpr const uint64_t& warp_size{ *warp_size_raw }; 48 | static constexpr const uint64_t& l2_cache_size{ *l2_cache_size_raw }; 49 | static constexpr const uint64_t& shared_mem_per_block{ *shared_mem_per_block_raw }; 50 | static constexpr const uint64_t& max_grid_size_x{ *max_grid_size_x_raw }; 51 | static constexpr const uint64_t& max_grid_size_y{ *max_grid_size_y_raw }; 52 | static constexpr const uint64_t& max_grid_size_z{ *max_grid_size_z_raw }; 53 | static constexpr const uint64_t& total_threads{ *total_threads_raw }; 54 | static constexpr const uint64_t& gpu_arch_index{ *gpu_arch_index_raw }; 55 | }; 56 | } 57 | -------------------------------------------------------------------------------- /cmake/detection/benchmarksuite_gpu_properties.hpp.in: 
-------------------------------------------------------------------------------- 1 | /* 2 | MIT License 3 | 4 | Copyright (c) 2024 RealTimeChris 5 | 6 | Permission is hereby granted, free of charge, to any person obtaining a copy of this 7 | software and associated documentation files (the "Software"), to deal in the Software 8 | without restriction, including without limitation the rights to use, copy, modify, merge, 9 | publish, distribute, sublicense, and/or sell copies of the Software, and to permit 10 | persons to whom the Software is furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all copies or 13 | substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, 16 | INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR 17 | PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE 18 | FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR 19 | OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 20 | DEALINGS IN THE SOFTWARE. 
21 | */ 22 | /// https://github.com/RealTimeChris/benchmarksuite 23 | #pragma once 24 | 25 | #include 26 | 27 | namespace bnch_swt { 28 | 29 | struct gpu_properties { 30 | protected: 31 | static constexpr aligned_const sm_count_raw{ @BNCH_SWT_SM_COUNT@ull }; 32 | static constexpr aligned_const max_threads_per_sm_raw{ @BNCH_SWT_MAX_THREADS_PER_SM@ull }; 33 | static constexpr aligned_const max_threads_per_block_raw{ @BNCH_SWT_MAX_THREADS_PER_BLOCK@ull }; 34 | static constexpr aligned_const warp_size_raw{ @BNCH_SWT_WARP_SIZE@ull }; 35 | static constexpr aligned_const l2_cache_size_raw{ @BNCH_SWT_GPU_L2_CACHE_SIZE@ull }; 36 | static constexpr aligned_const shared_mem_per_block_raw{ @BNCH_SWT_SHARED_MEM_PER_BLOCK@ull }; 37 | static constexpr aligned_const max_grid_size_x_raw{ @BNCH_SWT_MAX_GRID_SIZE_X@ull }; 38 | static constexpr aligned_const max_grid_size_y_raw{ @BNCH_SWT_MAX_GRID_SIZE_Y@ull }; 39 | static constexpr aligned_const max_grid_size_z_raw{ @BNCH_SWT_MAX_GRID_SIZE_Z@ull }; 40 | static constexpr aligned_const gpu_arch_index_raw{ @BNCH_SWT_GPU_ARCH_INDEX@ull }; 41 | static constexpr aligned_const total_threads_raw{ @BNCH_SWT_TOTAL_THREADS@ull }; 42 | 43 | public: 44 | static constexpr const uint64_t& sm_count{ *sm_count_raw }; 45 | static constexpr const uint64_t& max_threads_per_sm{ *max_threads_per_sm_raw }; 46 | static constexpr const uint64_t& max_threads_per_block{ *max_threads_per_block_raw }; 47 | static constexpr const uint64_t& warp_size{ *warp_size_raw }; 48 | static constexpr const uint64_t& l2_cache_size{ *l2_cache_size_raw }; 49 | static constexpr const uint64_t& shared_mem_per_block{ *shared_mem_per_block_raw }; 50 | static constexpr const uint64_t& max_grid_size_x{ *max_grid_size_x_raw }; 51 | static constexpr const uint64_t& max_grid_size_y{ *max_grid_size_y_raw }; 52 | static constexpr const uint64_t& max_grid_size_z{ *max_grid_size_z_raw }; 53 | static constexpr const uint64_t& total_threads{ *total_threads_raw }; 54 | static constexpr const 
uint64_t& gpu_arch_index{ *gpu_arch_index_raw }; 55 | }; 56 | } 57 | -------------------------------------------------------------------------------- /include/bnch_swt/aligned_const.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | MIT License 3 | 4 | Copyright (c) 2024 RealTimeChris 5 | 6 | Permission is hereby granted, free of charge, to any person obtaining a copy of this 7 | software and associated documentation files (the "Software"), to deal in the Software 8 | without restriction, including without limitation the rights to use, copy, modify, merge, 9 | publish, distribute, sublicense, and/or sell copies of the Software, and to permit 10 | persons to whom the Software is furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all copies or 13 | substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, 16 | INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR 17 | PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE 18 | FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR 19 | OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 20 | DEALINGS IN THE SOFTWARE. 
21 | */ 22 | /// https://github.com/RealTimeChris/benchmarksuite 23 | 24 | #pragma once 25 | 26 | #include 27 | 28 | namespace bnch_swt { 29 | 30 | static constexpr uint64_t device_alignment{ [] { 31 | if constexpr (BNCH_SWT_COMPILER_CUDA) { 32 | return 16ull; 33 | } else { 34 | return 64ull; 35 | } 36 | }() }; 37 | 38 | template struct BNCH_SWT_ALIGN(device_alignment) aligned_const { 39 | using value_type = value_type_new; 40 | value_type value{}; 41 | 42 | BNCH_SWT_HOST_DEVICE constexpr aligned_const() { 43 | } 44 | BNCH_SWT_HOST_DEVICE constexpr aligned_const(const value_type& v) : value(v) { 45 | } 46 | BNCH_SWT_HOST_DEVICE constexpr aligned_const(value_type&& v) : value(std::move(v)) { 47 | } 48 | 49 | BNCH_SWT_HOST_DEVICE constexpr operator const value_type&() const& { 50 | return value; 51 | } 52 | 53 | BNCH_SWT_HOST_DEVICE explicit constexpr operator value_type&() & { 54 | return value; 55 | } 56 | 57 | BNCH_SWT_HOST_DEVICE explicit constexpr operator value_type&&() && { 58 | return std::move(value); 59 | } 60 | 61 | BNCH_SWT_HOST_DEVICE constexpr const value_type* get() const { 62 | return &value; 63 | } 64 | 65 | BNCH_SWT_HOST_DEVICE constexpr value_type* get() { 66 | return &value; 67 | } 68 | 69 | BNCH_SWT_HOST_DEVICE constexpr const value_type& operator*() const { 70 | return value; 71 | } 72 | 73 | BNCH_SWT_HOST_DEVICE constexpr value_type& operator*() { 74 | return value; 75 | } 76 | 77 | template BNCH_SWT_HOST_DEVICE constexpr void emplace(value_type_newer&& value_new) { 78 | value = std::forward(value_new); 79 | } 80 | 81 | BNCH_SWT_HOST_DEVICE constexpr value_type multiply(const aligned_const& other) const { 82 | return value * other.value; 83 | } 84 | 85 | BNCH_SWT_HOST_DEVICE constexpr bool operator==(const aligned_const& other) const { 86 | return value == other.value; 87 | } 88 | 89 | BNCH_SWT_HOST_DEVICE constexpr bool operator!=(const aligned_const& other) const { 90 | return value != other.value; 91 | } 92 | 93 | BNCH_SWT_HOST_DEVICE 
constexpr bool operator<(const aligned_const& other) const { 94 | return value < other.value; 95 | } 96 | 97 | BNCH_SWT_HOST_DEVICE constexpr bool operator>(const aligned_const& other) const { 98 | return value > other.value; 99 | } 100 | }; 101 | 102 | template aligned_const(value_type) -> aligned_const; 103 | 104 | } 105 | -------------------------------------------------------------------------------- /CMakePresets.json: -------------------------------------------------------------------------------- 1 | { 2 | "version": 3, 3 | "configurePresets": [ 4 | { 5 | "name": "${hostSystemName}-base", 6 | "hidden": true, 7 | "generator": "Ninja", 8 | "binaryDir": "${sourceDir}/out/build/${presetName}", 9 | "installDir": "${sourceDir}/out/build/${presetName}", 10 | "toolchainFile": "C:/vcpkg/scripts/buildsystems/vcpkg.cmake", 11 | "cacheVariables": { 12 | "CMAKE_C_COMPILER": "cl.exe", 13 | "CMAKE_CXX_COMPILER": "cl.exe" 14 | }, 15 | "architecture": { 16 | "value": "x64", 17 | "strategy": "external" 18 | }, 19 | "condition": { 20 | "type": "equals", 21 | "lhs": "${hostSystemName}", 22 | "rhs": "Windows" 23 | } 24 | }, 25 | { 26 | "name": "debug-base", 27 | "hidden": true, 28 | "inherits": "${hostSystemName}-base", 29 | "cacheVariables": { 30 | "CMAKE_BUILD_TYPE": "Debug" 31 | } 32 | }, 33 | { 34 | "name": "release-base", 35 | "hidden": true, 36 | "inherits": "${hostSystemName}-base", 37 | "cacheVariables": { 38 | "CMAKE_BUILD_TYPE": "Release" 39 | } 40 | }, 41 | { 42 | "name": "debug-benchmarksuite-cpu", 43 | "inherits": "debug-base", 44 | "cacheVariables": { 45 | "BNCH_SWT_CPU": true 46 | } 47 | }, 48 | { 49 | "name": "release-benchmarksuite-cpu", 50 | "inherits": "release-base", 51 | "cacheVariables": { 52 | "BNCH_SWT_CPU": true 53 | } 54 | }, 55 | { 56 | "name": "debug-dev-benchmarksuite-cpu", 57 | "inherits": "debug-base", 58 | "cacheVariables": { 59 | "BNCH_SWT_CPU": true, 60 | "BENCHMARKS": true 61 | } 62 | }, 63 | { 64 | "name": "release-dev-benchmarksuite-cpu", 65 
| "inherits": "release-base", 66 | "cacheVariables": { 67 | "BNCH_SWT_CPU": true, 68 | "BENCHMARKS": true 69 | } 70 | }, 71 | { 72 | "name": "debug-benchmarksuite-cuda", 73 | "inherits": "debug-base", 74 | "cacheVariables": { 75 | "BNCH_SWT_CUDA": true 76 | } 77 | }, 78 | { 79 | "name": "release-benchmarksuite-cuda", 80 | "inherits": "release-base", 81 | "cacheVariables": { 82 | "BNCH_SWT_CUDA": true 83 | } 84 | }, 85 | 86 | { 87 | "name": "debug-dev-benchmarksuite-cuda", 88 | "inherits": "debug-base", 89 | "cacheVariables": { 90 | "BNCH_SWT_CUDA": true, 91 | "BENCHMARKS": true 92 | } 93 | }, 94 | { 95 | "name": "release-dev-benchmarksuite-cuda", 96 | "inherits": "release-base", 97 | "cacheVariables": { 98 | "BNCH_SWT_CUDA": true, 99 | "BENCHMARKS": true 100 | } 101 | }, 102 | { 103 | "name": "linux-debug", 104 | "displayName": "Linux Debug", 105 | "generator": "Ninja", 106 | "binaryDir": "${sourceDir}/out/build/${presetName}", 107 | "installDir": "${sourceDir}/out/install/${presetName}", 108 | "cacheVariables": { 109 | "CMAKE_BUILD_TYPE": "Debug" 110 | }, 111 | "condition": { 112 | "type": "equals", 113 | "lhs": "${hostSystemName}", 114 | "rhs": "Linux" 115 | }, 116 | "vendor": { 117 | "microsoft.com/VisualStudioRemoteSettings/cmake/1.0": { 118 | "sourceDir": "$env{HOME}/.vs/$ms{projectDirName}" 119 | } 120 | } 121 | }, 122 | { 123 | "name": "macos-debug", 124 | "displayName": "macOS Debug", 125 | "generator": "Ninja", 126 | "binaryDir": "${sourceDir}/out/build/${presetName}", 127 | "installDir": "${sourceDir}/out/install/${presetName}", 128 | "cacheVariables": { 129 | "CMAKE_BUILD_TYPE": "Debug" 130 | }, 131 | "condition": { 132 | "type": "equals", 133 | "lhs": "${hostSystemName}", 134 | "rhs": "Darwin" 135 | }, 136 | "vendor": { 137 | "microsoft.com/VisualStudioRemoteSettings/cmake/1.0": { 138 | "sourceDir": "$env{HOME}/.vs/$ms{projectDirName}" 139 | } 140 | } 141 | } 142 | ] 143 | } 
-------------------------------------------------------------------------------- /src/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # MIT License 2 | # 3 | # Copyright (c) 2024 RealTimeChris 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy of this 6 | # software and associated documentation files (the "Software"), to deal in the Software 7 | # without restriction, including without limitation the rights to use, copy, modify, merge, 8 | # publish, distribute, sublicense, and/or sell copies of the Software, and to permit 9 | # persons to whom the Software is furnished to do so, subject to the following conditions: 10 | # 11 | # The above copyright notice and this permission notice shall be included in all copies or 12 | # substantial portions of the Software. 13 | # 14 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, 15 | # INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR 16 | # PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE 17 | # FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR 18 | # OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 19 | # DEALINGS IN THE SOFTWARE. 20 | 21 | set(MAIN_SOURCE $,main_deploy,main>${BNCH_SWT_MAIN_FILE_EXTENSION}) 22 | 23 | add_executable(benchmarksuite-main ${MAIN_SOURCE}) 24 | 25 | if (PRINT_ASSEMBLY) 26 | if (CMAKE_CXX_COMPILER_ID MATCHES "Clang" OR CMAKE_CXX_COMPILER_ID MATCHES "GNU") 27 | message(STATUS "Assembly printing enabled. 
Using Post-Build step to generate assembly file.") 28 | 29 | if(CMAKE_SYSTEM_NAME MATCHES "Darwin") 30 | add_custom_command( 31 | TARGET benchmarksuite-main POST_BUILD 32 | COMMAND ${CMAKE_COMMAND} -E make_directory "$" 33 | COMMAND ${CMAKE_COMMAND} -E echo "Assembly file location: $/benchmarksuite-main.asm" 34 | COMMAND sh -c "otool -tV '$' > '$/benchmarksuite-main.asm'" 35 | COMMENT "Generating final assembly for benchmarksuite-main" 36 | VERBATIM 37 | ) 38 | else() 39 | add_custom_command( 40 | TARGET benchmarksuite-main POST_BUILD 41 | COMMAND ${CMAKE_COMMAND} -E make_directory "$" 42 | COMMAND ${CMAKE_COMMAND} -E echo "Assembly file location: $/benchmarksuite-main.asm" 43 | COMMAND sh -c "objdump -d '$' > '$/benchmarksuite-main.asm'" 44 | COMMENT "Generating final assembly for benchmarksuite-main" 45 | VERBATIM 46 | ) 47 | endif() 48 | elseif (CMAKE_CXX_COMPILER_ID STREQUAL "MSVC") 49 | set(ASM_OUTPUT_DIR "${CMAKE_CURRENT_BINARY_DIR}/asm_output") 50 | file(MAKE_DIRECTORY ${ASM_OUTPUT_DIR}) 51 | 52 | target_compile_options(benchmarksuite-main PRIVATE 53 | "/FAcs" 54 | "/Fa${ASM_OUTPUT_DIR}/" 55 | ) 56 | 57 | message(STATUS "Assembly printing enabled for MSVC. Assembly files will be generated in: ${ASM_OUTPUT_DIR}") 58 | 59 | add_custom_command( 60 | TARGET benchmarksuite-main POST_BUILD 61 | COMMAND ${CMAKE_COMMAND} -E echo "Copying MSVC assembly files from ${ASM_OUTPUT_DIR}..." 
62 | COMMAND ${CMAKE_COMMAND} -E copy_directory "${ASM_OUTPUT_DIR}" "$" 63 | COMMAND ${CMAKE_COMMAND} -E echo "Assembly files location: $" 64 | COMMENT "Copying MSVC assembly output" 65 | ) 66 | endif() 67 | endif() 68 | 69 | target_link_libraries(benchmarksuite-main 70 | PUBLIC 71 | benchmarksuite::benchmarksuite 72 | ) 73 | 74 | if(MSVC) 75 | set_property(TARGET benchmarksuite-main PROPERTY 76 | MSVC_RUNTIME_LIBRARY "MultiThreaded$<$:Debug>" 77 | ) 78 | endif() 79 | 80 | install( 81 | FILES 82 | $ 83 | DESTINATION bin 84 | OPTIONAL 85 | ) 86 | -------------------------------------------------------------------------------- /include/bnch_swt/counters/cuda_perf_events.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | MIT License 3 | 4 | Copyright (c) 2024 RealTimeChris 5 | 6 | Permission is hereby granted, free of charge, to any person obtaining a copy of this 7 | software and associated documentation files (the "Software"), to deal in the Software 8 | without restriction, including without limitation the rights to use, copy, modify, merge, 9 | publish, distribute, sublicense, and/or sell copies of the Software, and to permit 10 | persons to whom the Software is furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all copies or 13 | substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, 16 | INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR 17 | PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE 18 | FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR 19 | OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 20 | DEALINGS IN THE SOFTWARE. 
21 | */ 22 | /// https://github.com/RealTimeChris/benchmarksuite 23 | 24 | #pragma once 25 | 26 | #include 27 | 28 | #if BNCH_SWT_COMPILER_CUDA 29 | 30 | #include 31 | #include 32 | 33 | namespace bnch_swt::internal { 34 | 35 | struct cuda_timer { 36 | BNCH_SWT_HOST cuda_timer() noexcept { 37 | if (cudaEventCreate(&start_val) != cudaSuccess) { 38 | return; 39 | } 40 | if (cudaEventCreate(&stop_val) != cudaSuccess) { 41 | return; 42 | } 43 | } 44 | 45 | BNCH_SWT_HOST void start() noexcept { 46 | cudaEventRecord(start_val, 0); 47 | } 48 | 49 | BNCH_SWT_HOST void stop() noexcept { 50 | cudaEventRecord(stop_val, 0); 51 | cudaEventSynchronize(stop_val); 52 | } 53 | 54 | BNCH_SWT_HOST double get_time() noexcept { 55 | float milliseconds = 0; 56 | cudaEventElapsedTime(&milliseconds, start_val, stop_val); 57 | return static_cast(milliseconds); 58 | } 59 | 60 | BNCH_SWT_HOST ~cuda_timer() noexcept { 61 | cudaEventDestroy(start_val); 62 | cudaEventDestroy(stop_val); 63 | } 64 | 65 | protected: 66 | cudaEvent_t start_val{}, stop_val{}; 67 | }; 68 | 69 | template BNCH_SWT_GLOBAL static void profiling_wrapper(args_types... args) { 70 | function_type::impl(args...); 71 | } 72 | 73 | template struct event_collector_type : public std::vector { 74 | std::vector events{}; 75 | uint64_t current_index{}; 76 | 77 | BNCH_SWT_HOST event_collector_type() : std::vector(count), current_index(0) { 78 | events.resize(count); 79 | } 80 | 81 | BNCH_SWT_HOST ~event_collector_type() { 82 | } 83 | 84 | template BNCH_SWT_HOST void run(dim3 grid, dim3 block, uint64_t shared_mem, uint64_t bytes_processed, args_types... 
args) { 85 | if (current_index >= count) { 86 | return; 87 | } 88 | events[current_index].start(); 89 | profiling_wrapper<<>>(args...); 90 | events[current_index].stop(); 91 | double ms{ events[current_index].get_time() }; 92 | std::vector::operator[](current_index).elapsed_ns_val.emplace(duration_type(ms)); 93 | std::vector::operator[](current_index).cuda_event_ms_val.emplace(ms); 94 | std::vector::operator[](current_index).bytes_processed_val.emplace(bytes_processed); 95 | int clock_rate_khz; 96 | cudaDeviceGetAttribute(&clock_rate_khz, cudaDevAttrClockRate, 0); 97 | uint64_t cycles = static_cast(ms * 1e-3 * clock_rate_khz * 1000.0); 98 | std::vector::operator[](current_index).cycles_val.emplace(cycles); 99 | ++current_index; 100 | } 101 | 102 | template BNCH_SWT_HOST void run(dim3 grid, dim3 block, uint64_t shared_mem, uint64_t bytes_processed, args_types... args) { 103 | if (current_index >= count) { 104 | return; 105 | } 106 | events[current_index].start(); 107 | function<<>>(args...); 108 | events[current_index].stop(); 109 | double ms{ events[current_index].get_time() }; 110 | std::vector::operator[](current_index).elapsed_ns_val.emplace(duration_type(ms)); 111 | std::vector::operator[](current_index).cuda_event_ms_val.emplace(ms); 112 | std::vector::operator[](current_index).bytes_processed_val.emplace(bytes_processed); 113 | int clock_rate_khz; 114 | cudaDeviceGetAttribute(&clock_rate_khz, cudaDevAttrClockRate, 0); 115 | uint64_t cycles = static_cast(ms * 1e-3 * clock_rate_khz * 1000.0); 116 | std::vector::operator[](current_index).cycles_val.emplace(cycles); 117 | ++current_index; 118 | } 119 | 120 | BNCH_SWT_HOST void set_bytes_processed(uint64_t bytes) { 121 | if (current_index > 0) { 122 | std::vector::operator[](current_index - 1).bytes_processed_val.emplace(bytes); 123 | } 124 | } 125 | }; 126 | 127 | } 128 | 129 | #endif 130 | -------------------------------------------------------------------------------- /include/bnch_swt/do_not_optimize.hpp: 
-------------------------------------------------------------------------------- 1 | /* 2 | MIT License 3 | 4 | Copyright (c) 2024 RealTimeChris 5 | 6 | Permission is hereby granted, free of charge, to any person obtaining a copy of this 7 | software and associated documentation files (the "Software"), to deal in the Software 8 | without restriction, including without limitation the rights to use, copy, modify, merge, 9 | publish, distribute, sublicense, and/or sell copies of the Software, and to permit 10 | persons to whom the Software is furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all copies or 13 | substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, 16 | INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR 17 | PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE 18 | FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR 19 | OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 20 | DEALINGS IN THE SOFTWARE. 
21 | */ 22 | /// https://github.com/RealTimeChris/benchmarksuite 23 | /// Sep 1, 2024 24 | #pragma once 25 | 26 | #include 27 | 28 | namespace bnch_swt::internal { 29 | 30 | template 31 | concept invocable = std::is_invocable_v, arg_types...>; 32 | 33 | template 34 | concept not_invocable = !invocable; 35 | 36 | template 37 | concept invocable_void = invocable && std::is_void_v>; 38 | 39 | template 40 | concept invocable_not_void = invocable && !std::is_void_v>; 41 | 42 | template 43 | concept small_trivially_copyable = std::is_trivially_copyable_v && (sizeof(value_type) <= sizeof(value_type*)); 44 | 45 | template 46 | concept large_or_non_trivially_copyable = !std::is_trivially_copyable_v || (sizeof(value_type) > sizeof(value_type*)); 47 | 48 | inline void const volatile* volatile global_force_escape_pointer; 49 | 50 | BNCH_SWT_HOST static void use_char_pointer(void const volatile* const v) { 51 | global_force_escape_pointer = v; 52 | } 53 | 54 | #if BNCH_SWT_COMPILER_MSVC 55 | 56 | template BNCH_SWT_HOST static void do_not_optimize(value_type const& value) { 57 | use_char_pointer(static_cast(&value)); 58 | _ReadWriteBarrier(); 59 | } 60 | 61 | template BNCH_SWT_HOST static void do_not_optimize(value_type&& value) { 62 | use_char_pointer(static_cast(&value)); 63 | _ReadWriteBarrier(); 64 | } 65 | 66 | #elif BNCH_SWT_COMPILER_CLANG 67 | template BNCH_SWT_HOST static void do_not_optimize(value_type const& value) { 68 | asm volatile("" : : "r,m"(value) : "memory"); 69 | } 70 | 71 | template BNCH_SWT_HOST static void do_not_optimize(value_type&& value) { 72 | asm volatile("" : "+r,m"(value) : : "memory"); 73 | } 74 | 75 | #elif BNCH_SWT_COMPILER_GCC 76 | template BNCH_SWT_HOST static void do_not_optimize(value_type const& value) { 77 | asm volatile("" : : "r,m"(value) : "memory"); 78 | } 79 | 80 | template BNCH_SWT_HOST static void do_not_optimize(value_type const& value) { 81 | asm volatile("" : : "m"(value) : "memory"); 82 | } 83 | 84 | template BNCH_SWT_HOST static 
void do_not_optimize(value_type&& value) { 85 | asm volatile("" : "+m,r"(value) : : "memory"); 86 | } 87 | 88 | template BNCH_SWT_HOST static void do_not_optimize(value_type&& value) { 89 | asm volatile("" : "+m"(value) : : "memory"); 90 | } 91 | #else 92 | 93 | template inline BNCH_SWT_HOST static void do_not_optimize(value_type&& value) { 94 | internal::use_char_pointer(&reinterpret_cast(value)); 95 | } 96 | 97 | #endif 98 | 99 | BNCH_SWT_HOST static void clobber_memory() { 100 | #if BNCH_SWT_COMPILER_MSVC 101 | _ReadWriteBarrier(); 102 | #elif BNCH_SWT_COMPILER_CLANG || BNCH_SWT_COMPILER_GCC 103 | asm volatile("" ::: "memory"); 104 | #endif 105 | } 106 | } 107 | 108 | namespace bnch_swt { 109 | 110 | template BNCH_SWT_HOST static void do_not_optimize_away(value_type&& value) { 111 | internal::do_not_optimize(value); 112 | } 113 | 114 | template BNCH_SWT_HOST static void do_not_optimize_away(function_type&& value, arg_types&&... args) { 115 | std::forward(value)(std::forward(args)...); 116 | internal::clobber_memory(); 117 | } 118 | 119 | template BNCH_SWT_HOST static auto do_not_optimize_away(function_type&& value, arg_types&&... 
args) { 120 | auto result_val = std::forward(value)(std::forward(args)...); 121 | internal::do_not_optimize(result_val); 122 | return result_val; 123 | } 124 | 125 | } 126 | -------------------------------------------------------------------------------- /include/bnch_swt/printable.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | MIT License 3 | 4 | Copyright (c) 2024 RealTimeChris 5 | 6 | Permission is hereby granted, free of charge, to any person obtaining a copy of this 7 | software and associated documentation files (the "Software"), to deal in the Software 8 | without restriction, including without limitation the rights to use, copy, modify, merge, 9 | publish, distribute, sublicense, and/or sell copies of the Software, and to permit 10 | persons to whom the Software is furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all copies or 13 | substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, 16 | INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR 17 | PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE 18 | FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR 19 | OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 20 | DEALINGS IN THE SOFTWARE. 21 | */ 22 | /// https://github.com/RealTimeChris/benchmarksuite 23 | #pragma once 24 | 25 | #include 26 | #include 27 | #include 28 | #include 29 | #include 30 | 31 | namespace bnch_swt { 32 | 33 | namespace internal { 34 | 35 | template 36 | constexpr void visit(variant_type&& variant, arg_types&&... 
args) noexcept { 37 | if constexpr (current_index < std::variant_size_v>) { 38 | variant_type&& variant_new = std::forward(variant); 39 | if (variant_new.index() == current_index) { 40 | function(std::get(std::forward(variant_new)), std::forward(args)...); 41 | return; 42 | } 43 | visit(std::forward(variant_new), std::forward(args)...); 44 | } 45 | } 46 | 47 | template void print_value(std::ostream& os, const value_type& value) { 48 | os << value; 49 | } 50 | 51 | template void print_value(std::ostream& os, const value_type& value) { 52 | os << std::boolalpha << value; 53 | } 54 | 55 | template void print_value(std::ostream& os, const value_type& value) { 56 | os << "\"" << value << "\""; 57 | } 58 | 59 | template void print_value(std::ostream& os, const value_type& value) { 60 | os << "\"" << value << "\""; 61 | } 62 | 63 | template void print_value(std::ostream& os, const value_type& value) { 64 | os << "["; 65 | for (uint64_t x = 0; x < value.size(); ++x) { 66 | print_value(os, value[x]); 67 | if (x < value.size() - 1) { 68 | os << ','; 69 | } 70 | } 71 | os << "]"; 72 | } 73 | 74 | template void print_value(std::ostream& os, const value_type& value) { 75 | os << "{"; 76 | uint64_t index{}; 77 | for (auto iter = value.begin(); iter != value.end(); ++iter) { 78 | print_value(os, iter->first); 79 | os << ":"; 80 | print_value(os, iter->second); 81 | if (index < value.size() - 1) { 82 | os << ","; 83 | } 84 | ++index; 85 | } 86 | os << "}"; 87 | } 88 | 89 | template void print_value(std::ostream& os, const value_type& value) { 90 | static constexpr auto lambda = [](auto&& value_new, auto& os_new) { 91 | print_value(os_new, value_new); 92 | }; 93 | visit(value, os); 94 | } 95 | 96 | template void print_value(std::ostream& os, const value_type& value) { 97 | if (value.has_value()) { 98 | print_value(os, value.value()); 99 | } 100 | } 101 | 102 | template void print_value(std::ostream& os, const value_type& value) { 103 | os << "{"; 104 | print_value(os, 
std::get<0>(value)); 105 | os << ","; 106 | print_value(os, std::get<1>(value)); 107 | os << "}"; 108 | } 109 | 110 | template void print_value(std::ostream& os, const value_type& value) { 111 | if constexpr (index < std::tuple_size_v>) { 112 | print_value(os, std::get(value)); 113 | if constexpr (index < std::tuple_size_v> - 1) { 114 | os << ","; 115 | } 116 | print_value(os, value); 117 | } 118 | } 119 | 120 | template void print_value(std::ostream& os, const value_type& value) { 121 | os << "{"; 122 | print_value(os, value); 123 | os << "}"; 124 | } 125 | 126 | } 127 | 128 | } 129 | 130 | template std::ostream& operator<<(std::ostream& os, const value_type& value) { 131 | bnch_swt::internal::print_value(os, value); 132 | return os; 133 | } 134 | 135 | template std::ostream& operator<<(std::ostream& os, const value_type& value) { 136 | bnch_swt::internal::print_value(os, value); 137 | return os; 138 | } 139 | 140 | template std::ostream& operator<<(std::ostream& os, const value_type& value) { 141 | bnch_swt::internal::print_value(os, value); 142 | return os; 143 | } 144 | 145 | template std::ostream& operator<<(std::ostream& os, const value_type& value) { 146 | bnch_swt::internal::print_value(os, value); 147 | return os; 148 | } 149 | 150 | template std::ostream& operator<<(std::ostream& os, const value_type& value) { 151 | bnch_swt::internal::print_value(os, value); 152 | return os; 153 | } 154 | 155 | template std::ostream& operator<<(std::ostream& os, const value_type& value) { 156 | bnch_swt::internal::print_value(os, value); 157 | return os; 158 | } 159 | -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # MIT License 2 | # 3 | # Copyright (c) 2024 RealTimeChris 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy of this 6 | # software and associated documentation files (the "Software"), to 
deal in the Software 7 | # without restriction, including without limitation the rights to use, copy, modify, merge, 8 | # publish, distribute, sublicense, and/or sell copies of the Software, and to permit 9 | # persons to whom the Software is furnished to do so, subject to the following conditions: 10 | # 11 | # The above copyright notice and this permission notice shall be included in all copies or 12 | # substantial portions of the Software. 13 | # 14 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, 15 | # INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR 16 | # PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE 17 | # FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR 18 | # OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 19 | # DEALINGS IN THE SOFTWARE. 20 | cmake_minimum_required(VERSION 3.28) 21 | 22 | if(WIN32) 23 | set(VCPKG_TARGET_TRIPLET x64-windows-static CACHE STRING "" FORCE) 24 | elseif(UNIX AND NOT APPLE) 25 | set(VCPKG_TARGET_TRIPLET x64-linux CACHE STRING "" FORCE) 26 | elseif(APPLE) 27 | set(VCPKG_TARGET_TRIPLET x64-osx CACHE STRING "" FORCE) 28 | endif() 29 | 30 | option(BUILD_SHARED_LIBS "Build using shared libraries" OFF) 31 | option(BNCH_SWT_DETECT_CPU_PROPERTIES "Override cpu-cache-size selection" OFF) 32 | option(BNCH_SWT_DETECT_GPU_PROPERTIES "Override cuda-cache-size selection" OFF) 33 | option(BNCH_SWT_CUDA "Enable CUDA support" OFF) 34 | 35 | if(BNCH_SWT_CUDA) 36 | set(BNCH_SWT_LANGUAGE CUDA) 37 | project(benchmarksuite 38 | VERSION 1.0.0 39 | DESCRIPTION "A header-only C++ benchmarking library with cross-platform hardware performance counter integration,\ 40 | providing precise measurements of cycles, instructions, branches, cache behavior, and throughput with minimal overhead." 
41 | LANGUAGES CUDA 42 | ) 43 | find_package(CUDAToolkit REQUIRED COMPONENTS cudart) 44 | include(cmake/detection/benchmarksuite_detect_gpu_properties.cmake) 45 | set(CMAKE_CUDA_ARCHITECTURES ${BNCH_SWT_MAJOR_COMPUTE_CAPABILITY}${BNCH_SWT_MINOR_COMPUTE_CAPABILITY} CACHE STRING "Cuda architectures." FORCE) 46 | set(BNCH_SWT_MAIN_FILE_EXTENSION .cu) 47 | set(CMAKE_CUDA_STANDARD 20) 48 | set(CMAKE_CUDA_STANDARD_REQUIRED ON) 49 | else() 50 | set(BNCH_SWT_LANGUAGE CXX) 51 | project(benchmarksuite 52 | VERSION 1.0.0 53 | DESCRIPTION "A header-only C++ benchmarking library with cross-platform hardware performance counter integration,\ 54 | providing precise measurements of cycles, instructions, branches, cache behavior, and throughput with minimal overhead." 55 | LANGUAGES CXX 56 | ) 57 | set(BNCH_SWT_MAIN_FILE_EXTENSION .cpp) 58 | set(CMAKE_CXX_STANDARD 20) 59 | set(CMAKE_CXX_STANDARD_REQUIRED ON) 60 | endif() 61 | 62 | include(cmake/detection/benchmarksuite_detect_cpu_properties.cmake) 63 | 64 | set(BNCH_SWT_COMPILER_ID ${CMAKE_${BNCH_SWT_LANGUAGE}_COMPILER_ID}) 65 | 66 | file(GLOB_RECURSE BNCH_SWT_HEADERS CONFIGURE_DEPENDS include/*.hpp) 67 | 68 | add_library(${PROJECT_NAME} INTERFACE ${BNCH_SWT_HEADERS}) 69 | 70 | add_library(${PROJECT_NAME}::${PROJECT_NAME} ALIAS ${PROJECT_NAME}) 71 | 72 | include(cmake/flags_and_options.cmake) 73 | 74 | target_include_directories(${PROJECT_NAME} 75 | INTERFACE 76 | $ 77 | $ 78 | ) 79 | 80 | target_link_libraries(${PROJECT_NAME} 81 | INTERFACE 82 | $<$:CUDA::cudart_static> 83 | ) 84 | 85 | target_compile_options(${PROJECT_NAME} 86 | INTERFACE 87 | ${BNCH_SWT_COMPILE_OPTIONS} 88 | ) 89 | 90 | target_link_options(${PROJECT_NAME} 91 | INTERFACE 92 | ${BNCH_SWT_LINK_OPTIONS} 93 | ) 94 | 95 | target_compile_definitions(${PROJECT_NAME} 96 | INTERFACE 97 | ${BNCH_SWT_COMPILE_DEFINITIONS} 98 | ) 99 | 100 | set(CONFIG_FILE_NAME "${PROJECT_NAME}Config.cmake") 101 | set(EXPORTED_TARGETS_NAME "${PROJECT_NAME}Targets") 102 | 
set(EXPORTED_TARGETS_FILE_NAME "${EXPORTED_TARGETS_NAME}.cmake") 103 | set(EXPORTED_TARGETS_FILE_PATH "share/benchmarksuite/${EXPORTED_TARGETS_FILE_NAME}") 104 | 105 | include(CMakePackageConfigHelpers) 106 | configure_package_config_file( 107 | "${CMAKE_CURRENT_SOURCE_DIR}/cmake/${CONFIG_FILE_NAME}.in" 108 | "${CMAKE_CURRENT_BINARY_DIR}/${CONFIG_FILE_NAME}" 109 | INSTALL_DESTINATION "share/benchmarksuite" 110 | PATH_VARS 111 | EXPORTED_TARGETS_FILE_PATH 112 | ) 113 | 114 | set(VERSION_FILE_NAME "${PROJECT_NAME}ConfigVersion.cmake") 115 | #[[ The package version file carries the version declared by project(... VERSION ...), which CMake exposes as PROJECT_VERSION. ]] 116 | write_basic_package_version_file( 117 | "${CMAKE_CURRENT_BINARY_DIR}/${VERSION_FILE_NAME}" 118 | VERSION "${PROJECT_VERSION}" 119 | COMPATIBILITY AnyNewerVersion 120 | ) 121 | 122 | install( 123 | FILES 124 | "${CMAKE_CURRENT_BINARY_DIR}/${CONFIG_FILE_NAME}" 125 | "${CMAKE_CURRENT_BINARY_DIR}/${VERSION_FILE_NAME}" 126 | DESTINATION "share/benchmarksuite" 127 | ) 128 | 129 | install( 130 | DIRECTORY 131 | "${CMAKE_CURRENT_SOURCE_DIR}/include/" 132 | DESTINATION "include" 133 | ) 134 | 135 | install( 136 | TARGETS "${PROJECT_NAME}" 137 | EXPORT "${EXPORTED_TARGETS_NAME}" 138 | ) 139 | 140 | install( 141 | EXPORT "${EXPORTED_TARGETS_NAME}" 142 | FILE "${EXPORTED_TARGETS_FILE_NAME}" 143 | NAMESPACE "${PROJECT_NAME}::" 144 | DESTINATION "share/benchmarksuite" 145 | ) 146 | 147 | message(STATUS "benchmarksuite Configuration Summary:") 148 | message(STATUS "=============================") 149 | message(STATUS "Version: ${PROJECT_VERSION}") 150 | message(STATUS "Max Thread Count: ${BNCH_SWT_THREAD_COUNT}") 151 | message(STATUS "Architecture: ${CMAKE_SYSTEM_PROCESSOR}") 152 | message(STATUS "Compiler: ${BNCH_SWT_COMPILER_ID}") 153 | message(STATUS "CPU Variants: ${BNCH_SWT_INSTRUCTION_SET_NAME}") 154 | 155 | if(BNCH_SWT_CUDA) 156 | message(STATUS "CUDA Architectures: ${CMAKE_CUDA_ARCHITECTURES}") 157 | endif() 158 | 159 | if(BENCHMARKS) 160 | add_subdirectory(src) 161 | endif() 
-------------------------------------------------------------------------------- /include/bnch_swt/event_counter.hpp: -------------------------------------------------------------------------------- 1 | // bnch_swt/event_counter.hpp 2 | /* 3 | MIT License 4 | 5 | Copyright (c) 2024 RealTimeChris 6 | 7 | Permission is hereby granted, free of charge, to any person obtaining a copy of this 8 | software and associated documentation files (the "Software"), to deal in the Software 9 | without restriction, including without limitation the rights to use, copy, modify, merge, 10 | publish, distribute, sublicense, and/or sell copies of the Software, and to permit 11 | persons to whom the Software is furnished to do so, subject to the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be included in all copies or 14 | substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, 17 | INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR 18 | PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE 19 | FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR 20 | OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 21 | DEALINGS IN THE SOFTWARE. 
22 | */ 23 | /// https://github.com/RealTimeChris/benchmarksuite 24 | 25 | #pragma once 26 | 27 | #include 28 | #include 29 | #include 30 | #include 31 | #include 32 | #include 33 | #include 34 | #include 35 | 36 | namespace bnch_swt::internal { 37 | 38 | template struct event_count; 39 | 40 | template<> struct event_count { 41 | template friend struct event_collector_type; 42 | 43 | BNCH_SWT_HOST event_count() noexcept { 44 | } 45 | 46 | BNCH_SWT_HOST bool elapsed_ns(double& elapsed_ns_new) const noexcept { 47 | if (elapsed_ns_val.has_value()) { 48 | elapsed_ns_new = elapsed_ns_val.value().count(); 49 | return true; 50 | } else { 51 | return false; 52 | } 53 | } 54 | 55 | BNCH_SWT_HOST bool bytes_processed(uint64_t& bytes_processed_new) const noexcept { 56 | if (bytes_processed_val.has_value()) { 57 | bytes_processed_new = bytes_processed_val.value(); 58 | return true; 59 | } else { 60 | return false; 61 | } 62 | } 63 | 64 | BNCH_SWT_HOST bool cycles(double& cycles_new) const { 65 | if (cycles_val.has_value()) { 66 | cycles_new = static_cast(cycles_val.value()); 67 | return true; 68 | } else { 69 | return false; 70 | } 71 | } 72 | 73 | BNCH_SWT_HOST bool instructions(double& instructions_new) const noexcept { 74 | if (instructions_val.has_value()) { 75 | instructions_new = static_cast(instructions_val.value()); 76 | return true; 77 | } else { 78 | return false; 79 | } 80 | } 81 | 82 | BNCH_SWT_HOST bool branches(double& branches_new) const noexcept { 83 | if (branches_val.has_value()) { 84 | branches_new = static_cast(branches_val.value()); 85 | return true; 86 | } else { 87 | return false; 88 | } 89 | } 90 | 91 | BNCH_SWT_HOST bool branch_misses(double& branch_misses_new) const noexcept { 92 | if (branch_misses_val.has_value()) { 93 | branch_misses_new = static_cast(branch_misses_val.value()); 94 | return true; 95 | } else { 96 | return false; 97 | } 98 | } 99 | 100 | BNCH_SWT_HOST bool cache_misses(double& cache_misses_new) const noexcept { 101 | if 
(cache_misses_val.has_value()) { 102 | cache_misses_new = static_cast(cache_misses_val.value()); 103 | return true; 104 | } else { 105 | return false; 106 | } 107 | } 108 | 109 | BNCH_SWT_HOST bool cache_references(double& cache_references_new) const noexcept { 110 | if (cache_references_val.has_value()) { 111 | cache_references_new = static_cast(cache_references_val.value()); 112 | return true; 113 | } else { 114 | return false; 115 | } 116 | } 117 | 118 | protected: 119 | std::optional cache_references_val{}; 120 | std::optional bytes_processed_val{}; 121 | std::optional elapsed_ns_val{}; 122 | std::optional branch_misses_val{}; 123 | std::optional instructions_val{}; 124 | std::optional cache_misses_val{}; 125 | std::optional branches_val{}; 126 | std::optional cycles_val{}; 127 | }; 128 | 129 | template<> struct event_count { 130 | BNCH_SWT_HOST event_count() noexcept { 131 | } 132 | 133 | BNCH_SWT_HOST bool elapsed_ns(double& elapsed_ns_new) const noexcept { 134 | if (elapsed_ns_val.has_value()) { 135 | elapsed_ns_new = elapsed_ns_val.value().count(); 136 | return true; 137 | } else { 138 | return false; 139 | } 140 | } 141 | 142 | BNCH_SWT_HOST bool cuda_event_ms(double& cuda_event_ms_new) const noexcept { 143 | if (cuda_event_ms_val.has_value()) { 144 | cuda_event_ms_new = cuda_event_ms_val.value(); 145 | return true; 146 | } 147 | return false; 148 | } 149 | 150 | BNCH_SWT_HOST bool bytes_processed(uint64_t& bytes_processed_new) const noexcept { 151 | if (bytes_processed_val.has_value()) { 152 | bytes_processed_new = bytes_processed_val.value(); 153 | return true; 154 | } 155 | return false; 156 | } 157 | 158 | BNCH_SWT_HOST bool cycles(double& cycles_new) const { 159 | if (cycles_val.has_value()) { 160 | cycles_new = static_cast(cycles_val.value()); 161 | return true; 162 | } 163 | return false; 164 | } 165 | 166 | protected: 167 | template friend struct event_collector_type; 168 | std::optional bytes_processed_val{}; 169 | std::optional elapsed_ns_val{}; 
170 | std::optional cuda_event_ms_val{}; 171 | std::optional cycles_val{}; 172 | }; 173 | 174 | template using event_collector = event_collector_type, benchmark_type, count>; 175 | 176 | } 177 | -------------------------------------------------------------------------------- /include/bnch_swt/string_literal.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | MIT License 3 | 4 | Copyright (c) 2024 RealTimeChris 5 | 6 | Permission is hereby granted, free of charge, to any person obtaining a copy of this 7 | software and associated documentation files (the "Software"), to deal in the Software 8 | without restriction, including without limitation the rights to use, copy, modify, merge, 9 | publish, distribute, sublicense, and/or sell copies of the Software, and to permit 10 | persons to whom the Software is furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all copies or 13 | substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, 16 | INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR 17 | PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE 18 | FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR 19 | OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 20 | DEALINGS IN THE SOFTWARE. 
21 | */ 22 | /// https://github.com/RealTimeChris/benchmarksuite 23 | 24 | #pragma once 25 | 26 | #include 27 | #include 28 | #include 29 | #include 30 | 31 | namespace bnch_swt { 32 | 33 | template struct BNCH_SWT_ALIGN(64) string_literal { 34 | using value_type = char; 35 | using const_reference = const value_type&; 36 | using reference = value_type&; 37 | using const_pointer = const value_type*; 38 | using pointer = value_type*; 39 | using size_type = uint64_t; 40 | 41 | static constexpr size_type length{ size_val > 0 ? size_val - 1 : 0 }; 42 | 43 | constexpr string_literal() noexcept { 44 | } 45 | 46 | constexpr string_literal(const char (&str)[size_val]) noexcept { 47 | std::copy_n(str, size_val, values); 48 | values[length] = '\0'; 49 | } 50 | 51 | constexpr const_pointer data() const noexcept { 52 | return values; 53 | } 54 | 55 | constexpr pointer data() noexcept { 56 | return values; 57 | } 58 | 59 | template constexpr auto operator+=(const string_literal& str) const noexcept { 60 | string_literal new_literal{}; 61 | std::copy_n(values, size(), new_literal.data()); 62 | std::copy_n(str.data(), size_new, new_literal.data() + size()); 63 | return new_literal; 64 | } 65 | 66 | template constexpr auto operator+=(const value_type (&str)[size_new]) const noexcept { 67 | string_literal new_literal{}; 68 | std::copy_n(values, size(), new_literal.data()); 69 | std::copy_n(str, size_new, new_literal.data() + size()); 70 | return new_literal; 71 | } 72 | 73 | template constexpr auto operator+(const string_literal& str) const noexcept { 74 | string_literal new_literal{}; 75 | std::copy_n(values, size(), new_literal.data()); 76 | std::copy_n(str.data(), size_new, new_literal.data() + size()); 77 | return new_literal; 78 | } 79 | 80 | template constexpr auto operator+(const value_type (&str)[size_new]) const noexcept { 81 | string_literal new_literal{}; 82 | std::copy_n(values, size(), new_literal.data()); 83 | std::copy_n(str, size_new, new_literal.data() + size()); 84 
| return new_literal; 85 | } 86 | 87 | template constexpr friend auto operator+(const value_type (&lhs)[size_new], const string_literal& str) noexcept { 88 | return string_literal{ lhs } + str; 89 | } 90 | 91 | constexpr reference operator[](size_type index) noexcept { 92 | return values[index]; 93 | } 94 | 95 | constexpr const_reference operator[](size_type index) const noexcept { 96 | return values[index]; 97 | } 98 | 99 | constexpr size_type size() const noexcept { 100 | return length; 101 | } 102 | 103 | template constexpr operator string_type() const { 104 | BNCH_SWT_ALIGN(64) string_type return_values{ values, length }; 105 | return return_values; 106 | } 107 | 108 | BNCH_SWT_ALIGN(64) char values[size_val > 0 ? size_val : 1] {}; 109 | }; 110 | 111 | template string_literal(const char (&str)[size]) -> string_literal; 112 | 113 | namespace internal { 114 | 115 | template constexpr auto string_literal_from_view(string_type str) noexcept { 116 | string_literal sl{}; 117 | std::copy_n(str.data(), str.size(), sl.values); 118 | sl[N] = '\0'; 119 | return sl; 120 | } 121 | /* Inserts the literal's characters (as a std::string_view) into the stream supplied by the caller and returns that same stream, so chained insertions keep targeting it. */ 122 | template BNCH_SWT_HOST std::ostream& operator<<(std::ostream& os, const string_literal& input) noexcept { 123 | os << input.operator std::string_view(); 124 | return os; 125 | } 126 | 127 | template constexpr uint64_t count_digits(value_type number) noexcept { 128 | uint64_t count = 0; 129 | if (static_cast(number) < 0) { 130 | number *= -1; 131 | ++count; 132 | } 133 | do { 134 | ++count; 135 | number /= 10; 136 | } while (number != 0); 137 | return count; 138 | } 139 | 140 | template constexpr string_literal to_string_literal() noexcept { 141 | char buffer[num_digits + 1]{}; 142 | char* ptr = buffer + num_digits; 143 | *ptr = '\0'; 144 | int64_t temp{}; 145 | if constexpr (number < 0) { 146 | temp = number * -1; 147 | *(ptr - num_digits) = '-'; 148 | } else { 149 | temp = number; 150 | } 151 | do { 152 | *--ptr = '0' + (temp % 10); 153 | temp /= 10; 154 | } while (temp != 0); 
155 | return string_literal{ buffer }; 156 | } 157 | 158 | constexpr char to_lower(char input) noexcept { 159 | return (input >= 'A' && input <= 'Z') ? (input + 32) : input; 160 | } 161 | 162 | template constexpr auto to_lower(string_literal input) noexcept { 163 | string_literal output{}; 164 | for (uint64_t x = 0; x < size; ++x) { 165 | output[x] = to_lower(input[x]); 166 | } 167 | return output; 168 | } 169 | 170 | } 171 | 172 | } 173 | -------------------------------------------------------------------------------- /include/bnch_swt/cache_clearer.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | MIT License 3 | 4 | Copyright (c) 2024 RealTimeChris 5 | 6 | Permission is hereby granted, free of charge, to any person obtaining a copy of this 7 | software and associated documentation files (the "Software"), to deal in the Software 8 | without restriction, including without limitation the rights to use, copy, modify, merge, 9 | publish, distribute, sublicense, and/or sell copies of the Software, and to permit 10 | persons to whom the Software is furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all copies or 13 | substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, 16 | INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR 17 | PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE 18 | FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR 19 | OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 20 | DEALINGS IN THE SOFTWARE. 
21 | */ 22 | /// https://github.com/RealTimeChris/benchmarksuite 23 | 24 | #pragma once 25 | 26 | #include 27 | #include 28 | #include 29 | #include 30 | 31 | #if BNCH_SWT_PLATFORM_WINDOWS 32 | #include 33 | #include 34 | #elif BNCH_SWT_PLATFORM_LINUX 35 | #include 36 | #include 37 | #include 38 | #if defined(__i386__) || defined(__x86_64__) 39 | #include 40 | #endif 41 | #elif BNCH_SWT_PLATFORM_MAC 42 | #include 43 | #include 44 | #include 45 | #include 46 | #endif 47 | 48 | namespace bnch_swt::internal { 49 | 50 | enum class cache_level { 51 | one = 1, 52 | two = 2, 53 | three = 3, 54 | }; 55 | 56 | BNCH_SWT_HOST size_t get_cache_line_size() { 57 | #if BNCH_SWT_PLATFORM_WINDOWS 58 | DWORD buffer_size = 0; 59 | GetLogicalProcessorInformation(nullptr, &buffer_size); 60 | 61 | std::vector buffer_raw(static_cast(buffer_size) / sizeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION)); 62 | DWORD actual_size = buffer_size; 63 | 64 | if (!GetLogicalProcessorInformation(buffer_raw.data(), &actual_size)) { 65 | return 64; 66 | } 67 | 68 | size_t num_elements = actual_size / sizeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION); 69 | 70 | const auto* buffer = buffer_raw.data(); 71 | 72 | for (size_t i = 0; i < num_elements; ++i) { 73 | const auto& info = buffer[i]; 74 | if (info.Relationship == RelationCache && info.Cache.Level == 1) { 75 | return info.Cache.LineSize; 76 | } 77 | } 78 | 79 | return 64; 80 | 81 | #elif BNCH_SWT_PLATFORM_LINUX 82 | long line_size = sysconf(_SC_LEVEL1_DCACHE_LINESIZE); 83 | if (line_size <= 0) { 84 | std::cerr << "Failed to retrieve cache line size using sysconf! Falling back to 64." << std::endl; 85 | return 64; 86 | } 87 | return static_cast(line_size); 88 | #elif BNCH_SWT_PLATFORM_MAC 89 | size_t line_size = 0; 90 | size_t size = sizeof(line_size); 91 | if (sysctlbyname("hw.cachelinesize", &line_size, &size, nullptr, 0) != 0) { 92 | std::cerr << "Failed to retrieve cache line size using sysctl! Falling back to 64." 
<< std::endl; 93 | return 64; 94 | } 95 | return line_size; 96 | #else 97 | std::cerr << "Unsupported platform! Falling back to 64." << std::endl; 98 | return 64; 99 | #endif 100 | return 64; 101 | } 102 | 103 | BNCH_SWT_HOST static void flush_cache(void* ptr, size_t size, [[maybe_unused]] size_t cache_line_size, bool clear_instruction_cache = false) { 104 | if (cache_line_size == 0) { 105 | return; 106 | } 107 | #if BNCH_SWT_PLATFORM_MAC 108 | if (clear_instruction_cache) { 109 | sys_icache_invalidate(ptr, size); 110 | } else { 111 | sys_dcache_flush(ptr, size); 112 | } 113 | #else 114 | char* buffer = static_cast(ptr); 115 | #if BNCH_SWT_PLATFORM_WINDOWS 116 | for (size_t i = 0; i < size; i += cache_line_size) { 117 | _mm_clflush(buffer + i); 118 | } 119 | _mm_sfence(); 120 | 121 | if (clear_instruction_cache) { 122 | if (!FlushInstructionCache(GetCurrentProcess(), buffer, size)) { 123 | std::cerr << "Failed to flush instruction cache!" << std::endl; 124 | } 125 | } 126 | #elif BNCH_SWT_PLATFORM_LINUX 127 | #if BNCH_SWT_ARCH_X64 128 | for (size_t i = 0; i < size; i += cache_line_size) { 129 | __builtin_ia32_clflush(buffer + i); 130 | } 131 | __builtin_ia32_sfence(); 132 | #elif BNCH_SWT_ARCH_ARM || BNCH_SWT_ARCH_ARM64 133 | for (size_t i = 0; i < size; i += cache_line_size) { 134 | #if BNCH_SWT_ARCH_ARM64 135 | __asm__ __volatile__("dc civac, %0" : : "r"(buffer + i) : "memory"); 136 | #else 137 | __builtin___clear_cache(buffer + i, buffer + i + cache_line_size); 138 | #endif 139 | } 140 | __asm__ __volatile__("dsb sy" : : : "memory"); 141 | #endif 142 | 143 | if (clear_instruction_cache) { 144 | __builtin___clear_cache(buffer, buffer + size); 145 | } 146 | #elif BNCH_SWT_PLATFORM_ANDROID 147 | if (clear_instruction_cache) { 148 | __builtin___clear_cache(buffer, buffer + size); 149 | } 150 | #endif 151 | #endif 152 | } 153 | 154 | template class cache_clearer { 155 | size_t cache_line_size{ get_cache_line_size() }; 156 | std::array cache_sizes{ { 
cpu_properties::l1_cache_size, cpu_properties::l2_cache_size, cpu_properties::l3_cache_size } }; 157 | 158 | size_t max_cache_size{ std::max({ cache_sizes[0], cache_sizes[1], cache_sizes[2] }) }; 159 | 160 | std::vector evict_buffer{ [&] { 161 | std::vector return_values{}; 162 | if (max_cache_size > 0) { 163 | return_values.resize(max_cache_size * 4 + cache_line_size); 164 | } 165 | return return_values; 166 | }() }; 167 | 168 | BNCH_SWT_HOST void evict_cache(size_t cache_level) { 169 | if (cache_level >= 1 && cache_level <= 3 && cache_sizes[cache_level - 1] > 0 && !evict_buffer.empty()) { 170 | size_t target_size = cache_sizes[cache_level - 1] * 4; 171 | const size_t stride = 4093; 172 | volatile char sink = 0; 173 | 174 | for (size_t offset = 0; offset < target_size; offset += cache_line_size) { 175 | size_t idx = (offset * stride) % evict_buffer.size(); 176 | evict_buffer[idx] = static_cast(idx); 177 | sink = sink + evict_buffer[idx]; 178 | } 179 | 180 | flush_cache(evict_buffer.data(), evict_buffer.size(), cache_line_size); 181 | if (cache_level == 1) { 182 | flush_cache(evict_buffer.data(), evict_buffer.size(), cache_line_size, true); 183 | } 184 | } 185 | } 186 | 187 | public: 188 | BNCH_SWT_HOST void evict_caches() { 189 | evict_cache(3); 190 | evict_cache(2); 191 | evict_cache(1); 192 | } 193 | }; 194 | } 195 | -------------------------------------------------------------------------------- /include/bnch_swt/counters/linux_perf_events.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | MIT License 3 | 4 | Copyright (c) 2024 RealTimeChris 5 | 6 | Permission is hereby granted, free of charge, to any person obtaining a copy of this 7 | software and associated documentation files (the "Software"), to deal in the Software 8 | without restriction, including without limitation the rights to use, copy, modify, merge, 9 | publish, distribute, sublicense, and/or sell copies of the Software, and to permit 10 | persons to 
whom the Software is furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all copies or 13 | substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, 16 | INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR 17 | PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE 18 | FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR 19 | OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 20 | DEALINGS IN THE SOFTWARE. 21 | */ 22 | /// https://github.com/RealTimeChris/benchmarksuite 23 | 24 | // Sampled mostly from https://github.com/fastfloat/fast_float 25 | #pragma once 26 | 27 | #include 28 | 29 | #if BNCH_SWT_PLATFORM_LINUX 30 | 31 | #include 32 | #include 33 | #include 34 | #include 35 | #include 36 | #include 37 | 38 | namespace bnch_swt::internal { 39 | 40 | BNCH_SWT_HOST uint64_t rdtsc() { 41 | #if defined(__x86_64__) 42 | uint32_t a, d; 43 | __asm__ volatile("rdtsc" : "=a"(a), "=d"(d)); 44 | return static_cast(a) | (static_cast(d) << 32); 45 | #elif defined(__i386__) 46 | uint64_t x; 47 | __asm__ volatile("rdtsc" : "=A"(x)); 48 | return x; 49 | #else 50 | return 0; 51 | #endif 52 | } 53 | 54 | class linux_events { 55 | protected: 56 | std::vector temp_result_vec{}; 57 | std::vector ids{}; 58 | perf_event_attr attribs{}; 59 | uint64_t num_events{}; 60 | bool working{}; 61 | int32_t fd{}; 62 | 63 | public: 64 | BNCH_SWT_HOST explicit linux_events(std::vector config_vec) : working(true) { 65 | memset(&attribs, 0, sizeof(attribs)); 66 | attribs.type = PERF_TYPE_HARDWARE; 67 | attribs.size = sizeof(attribs); 68 | attribs.disabled = 1; 69 | attribs.exclude_kernel = 1; 70 | attribs.exclude_hv = 1; 71 | 72 | attribs.sample_period = 0; 73 | attribs.read_format = 
PERF_FORMAT_GROUP | PERF_FORMAT_ID; 74 | const int32_t pid = 0; 75 | const int32_t cpu = -1; 76 | const unsigned long flags = 0; 77 | 78 | int32_t group = -1; 79 | num_events = config_vec.size(); 80 | ids.resize(config_vec.size()); 81 | uint32_t i = 0; 82 | for (auto config: config_vec) { 83 | attribs.config = static_cast(config); 84 | int32_t _fd = static_cast(syscall(__NR_perf_event_open, &attribs, pid, cpu, group, flags)); 85 | if (_fd == -1) { 86 | report_error("perf_event_open"); 87 | } 88 | ioctl(_fd, PERF_EVENT_IOC_ID, &ids[i++]); 89 | if (group == -1) { 90 | group = _fd; 91 | fd = _fd; 92 | } 93 | } 94 | 95 | temp_result_vec.resize(num_events * 2 + 1); 96 | } 97 | 98 | /* Closes the descriptor stored in fd, if one was stored. NOTE(review): the constructor loop stores only the first descriptor it opens in fd; the descriptors of the later events are not kept anywhere and are not closed here - confirm this is intended. */ BNCH_SWT_HOST ~linux_events() { 99 | if (fd != -1) { 100 | close(fd); 101 | } 102 | } 103 | 104 | /* Starts a measurement: resets and then enables counting through fd, passing PERF_IOC_FLAG_GROUP to both ioctl calls. Does nothing when fd is -1. A failing ioctl is recorded through report_error and is not propagated to the caller. */ BNCH_SWT_HOST void run() { 105 | if (fd != -1) { 106 | if (ioctl(fd, PERF_EVENT_IOC_RESET, PERF_IOC_FLAG_GROUP) == -1) { 107 | report_error("ioctl(PERF_EVENT_IOC_RESET)"); 108 | } 109 | 110 | if (ioctl(fd, PERF_EVENT_IOC_ENABLE, PERF_IOC_FLAG_GROUP) == -1) { 111 | report_error("ioctl(PERF_EVENT_IOC_ENABLE)"); 112 | } 113 | } 114 | } 115 | 116 | /* Stops a measurement and unpacks it into results. When fd is valid, counting is disabled and temp_result_vec.size() * 8 bytes are read from fd into temp_result_vec (the 8 is presumably the element size - confirm). Slot 0 of temp_result_vec is skipped; slots 1, 3, 5, ... are copied to results[0], results[1], ...; slots 2, 4, 6, ... are compared with ids[0], ids[1], ... . results is indexed without being resized, so the caller must size it beforehand. NOTE(review): the two unpacking loops also run when fd is -1. */ BNCH_SWT_HOST void end(std::vector& results) { 117 | if (fd != -1) { 118 | if (ioctl(fd, PERF_EVENT_IOC_DISABLE, PERF_IOC_FLAG_GROUP) == -1) { 119 | report_error("ioctl(PERF_EVENT_IOC_DISABLE)"); 120 | } 121 | 122 | if (read(fd, temp_result_vec.data(), temp_result_vec.size() * 8) == -1) { 123 | report_error("read"); 124 | } 125 | } 126 | 127 | /* odd slots carry the counter values */ for (uint32_t i = 1; i < temp_result_vec.size(); i += 2) { 128 | results[i / 2] = temp_result_vec[i]; 129 | } 130 | /* even slots from 2 onward carry the event id that belongs to the preceding value; it must match the id obtained with PERF_EVENT_IOC_ID in the constructor */ for (uint32_t i = 2; i < temp_result_vec.size(); i += 2) { 131 | if (ids[i / 2 - 1] != temp_result_vec[i]) { 132 | report_error("event mismatch"); 133 | } 134 | } 135 | } 136 | 137 | /* Returns the working flag, which report_error sets to false. */ bool is_working() { 138 | return working; 139 | } 140 | 141 | protected: 142 | /* Records a failure by setting working to false. The message parameter is unnamed and is not used. */ BNCH_SWT_HOST void report_error(const std::string&) { 143 | working = false; 144 | } 145 | }; 146 | 147 | /* Sample collector built on linux_events and on a std::vector base. Each run() call fills the element of the vector base selected by current_index and then advances current_index. */ template struct event_collector_type : 
public linux_events, public std::vector { 148 | std::vector results{}; 149 | uint64_t current_index{}; 150 | /* Requests six hardware events in this order: CPU cycles, instructions, branch instructions, branch misses, cache references, cache misses. run() depends on this order when it reads results[1] through results[5]. The vector base is constructed from count. */ BNCH_SWT_HOST event_collector_type() 151 | : linux_events{ std::vector{ PERF_COUNT_HW_CPU_CYCLES, PERF_COUNT_HW_INSTRUCTIONS, PERF_COUNT_HW_BRANCH_INSTRUCTIONS, PERF_COUNT_HW_BRANCH_MISSES, 152 | PERF_COUNT_HW_CACHE_REFERENCES, PERF_COUNT_HW_CACHE_MISSES } }, 153 | std::vector{ count } { 154 | } 155 | 156 | /* True while the linux_events base has not reported an error. */ BNCH_SWT_HOST bool has_events() { 157 | return linux_events::is_working(); 158 | } 159 | 160 | /* Measures one call of function_type::impl(args...) and stores the sample at current_index: the rdtsc difference goes to cycles_val, the clock_type difference to elapsed_ns_val and the value returned by impl to bytes_processed_val. When has_events() is true the hardware counters are started before the call and, after it, results[1] to results[5] are stored as instructions, branches, branch misses, cache references and cache misses; results[0] is not used here. NOTE(review): current_index is never checked against the size of the vector base in this function. */ template BNCH_SWT_HOST void run(arg_types&&... args) { 161 | if (has_events()) { 162 | linux_events::run(); 163 | } 164 | uint64_t result; 165 | const auto start_clock = clock_type::now(); 166 | volatile uint64_t cycle_start = rdtsc(); 167 | result = static_cast(function_type::impl(std::forward(args)...)); 168 | volatile uint64_t cycle_end = rdtsc(); 169 | const auto end_clock = clock_type::now(); 170 | std::vector::operator[](current_index).cycles_val.emplace(cycle_end - cycle_start); 171 | std::vector::operator[](current_index).elapsed_ns_val.emplace(end_clock - start_clock); 172 | std::vector::operator[](current_index).bytes_processed_val.emplace(result); 173 | if (has_events()) { 174 | /* end() indexes results without resizing it, so it is sized here first */ if (results.size() != linux_events::temp_result_vec.size()) { 175 | results.resize(linux_events::temp_result_vec.size()); 176 | } 177 | linux_events::end(results); 178 | std::vector::operator[](current_index).instructions_val.emplace(results[1]); 179 | std::vector::operator[](current_index).branches_val.emplace(results[2]); 180 | std::vector::operator[](current_index).branch_misses_val.emplace(results[3]); 181 | std::vector::operator[](current_index).cache_references_val.emplace(results[4]); 182 | std::vector::operator[](current_index).cache_misses_val.emplace(results[5]); 183 | } 184 | ++current_index; 185 | return; 186 | } 187 | }; 188 | } 189 | 190 | #endif 191 | -------------------------------------------------------------------------------- 
/.github/workflows/benchmark.yml: -------------------------------------------------------------------------------- 1 | name: Benchmark 2 | on: 3 | push: 4 | branches: [ "**" ] 5 | pull_request: 6 | branches: [ "**" ] 7 | workflow_dispatch: 8 | jobs: 9 | build: 10 | strategy: 11 | fail-fast: false 12 | matrix: 13 | include: 14 | - os: ubuntu-latest 15 | compiler: clang 16 | cc: clang 17 | cxx: clang++ 18 | name: "Ubuntu Clang" 19 | cmake_cxx: /usr/bin/clang++-20 20 | - os: ubuntu-latest 21 | compiler: gcc 22 | cc: gcc 23 | cxx: g++ 24 | name: "Ubuntu GCC" 25 | cmake_cxx: /usr/bin/g++-14 26 | - os: macos-latest 27 | compiler: clang 28 | cc: clang 29 | cxx: clang++ 30 | name: "macOS Clang" 31 | cmake_cxx: "" 32 | - os: macos-latest 33 | compiler: gcc 34 | cc: gcc 35 | cxx: g++ 36 | name: "macOS GCC" 37 | cmake_cxx: "" 38 | - os: windows-latest 39 | compiler: msvc 40 | name: "Windows MSVC" 41 | cmake_cxx: "" 42 | runs-on: ${{ matrix.os }} 43 | name: Build on ${{ matrix.name }} 44 | steps: 45 | - name: Checkout Repository 46 | uses: actions/checkout@v4 47 | 48 | - name: Setup Clang (Ubuntu) 49 | if: matrix.os == 'ubuntu-latest' && matrix.compiler == 'clang' 50 | run: | 51 | sudo apt-get update 52 | sudo apt update 53 | wget https://apt.llvm.org/llvm.sh 54 | chmod u+x llvm.sh 55 | sudo ./llvm.sh 20 56 | 57 | - name: Setup GCC (Ubuntu) 58 | if: matrix.os == 'ubuntu-latest' && matrix.compiler == 'gcc' 59 | run: | 60 | sudo apt-get install build-essential 61 | sudo apt-get install g++-14 62 | 63 | - name: Setup Clang (macOS) 64 | if: matrix.os == 'macos-latest' && matrix.compiler == 'clang' 65 | run: | 66 | brew install llvm 67 | 68 | - name: Setup GCC (macOS) 69 | if: matrix.os == 'macos-latest' && matrix.compiler == 'gcc' 70 | run: | 71 | brew install gcc 72 | GCC_PATH=$(brew --prefix gcc) 73 | GCC_VER=$(ls ${GCC_PATH}/bin/gcc-* 2>/dev/null | grep -oE '[0-9]+$' | sort -rn | head -1) 74 | echo "CC=${GCC_PATH}/bin/gcc-${GCC_VER}" >> $GITHUB_ENV 75 | echo 
"CXX=${GCC_PATH}/bin/g++-${GCC_VER}" >> $GITHUB_ENV 76 | ${GCC_PATH}/bin/gcc-${GCC_VER} --version 77 | ${GCC_PATH}/bin/g++-${GCC_VER} --version 78 | 79 | - name: Setup MSVC (Windows) 80 | if: matrix.os == 'windows-latest' 81 | uses: ilammy/msvc-dev-cmd@v1 82 | 83 | - name: Create Build Directory 84 | run: mkdir -p Build 85 | 86 | - name: Configure CMake (Ubuntu-GCC) 87 | if: matrix.os == 'ubuntu-latest' && matrix.compiler == 'gcc' 88 | run: | 89 | cmake -S . -B ./Build -DCMAKE_BUILD_TYPE=Release -DCMAKE_CXX_COMPILER=${{ matrix.cmake_cxx }} -DBENCHMARKS=TRUE -DPRINT_ASSEMBLY=TRUE 90 | 91 | - name: Configure CMake (Ubuntu-Clang) 92 | if: matrix.os == 'ubuntu-latest' && matrix.compiler == 'clang' 93 | run: | 94 | cmake -S . -B ./Build -DCMAKE_BUILD_TYPE=Release -DCMAKE_CXX_COMPILER=${{ matrix.cmake_cxx }} -DBENCHMARKS=TRUE -DPRINT_ASSEMBLY=TRUE 95 | 96 | - name: Configure CMake (macOS-Clang) 97 | if: matrix.os == 'macos-latest' && matrix.compiler == 'clang' 98 | run: | 99 | cmake -S . -B ./Build -DCMAKE_BUILD_TYPE=Release -DBENCHMARKS=TRUE -DPRINT_ASSEMBLY=TRUE 100 | 101 | - name: Configure CMake (macOS-GCC) 102 | if: matrix.os == 'macos-latest' && matrix.compiler == 'gcc' 103 | run: | 104 | cmake -S . -B ./Build -DCMAKE_BUILD_TYPE=Release -DCMAKE_C_COMPILER=${{ env.CC }} -DCMAKE_CXX_COMPILER=${{ env.CXX }} -DBENCHMARKS=TRUE -DPRINT_ASSEMBLY=TRUE 105 | 106 | - name: Configure CMake (Windows) 107 | if: matrix.os == 'windows-latest' 108 | run: | 109 | cmake -S . 
-B ./Build -DCMAKE_BUILD_TYPE=Release -DBENCHMARKS=TRUE -DPRINT_ASSEMBLY=TRUE 110 | 111 | - name: Build the Test 112 | run: | 113 | cmake --build ./Build --config=Release 114 | 115 | - name: Find and Print Assembly Files (Unix) 116 | if: matrix.os != 'windows-latest' 117 | run: | 118 | echo "=== Searching for assembly files ===" 119 | find ./Build -name "*.asm" -type f 120 | echo "" 121 | echo "=== Assembly file details ===" 122 | find ./Build -name "*.asm" -type f -exec sh -c 'echo "File: {}"; ls -lh "{}"; echo "Lines: $(wc -l < "{}")"; echo ""' \; 123 | echo "" 124 | echo "=== Printing COMPLETE assembly files ===" 125 | find ./Build -name "*.asm" -type f -exec sh -c 'echo "========== {} =========="; cat "{}"; echo ""; echo "========== END OF {} =========="; echo ""' \; 126 | 127 | - name: Find and Print Assembly Files (Windows) 128 | if: matrix.os == 'windows-latest' 129 | run: | 130 | echo "=== Searching for assembly files ===" 131 | echo "Current directory:" 132 | Get-Location 133 | echo "" 134 | echo "Build directory contents:" 135 | Get-ChildItem -Path ./Build -Recurse | Select-Object FullName 136 | echo "" 137 | echo "Specifically checking ./Build/src/:" 138 | if (Test-Path ./Build/src) { 139 | Get-ChildItem -Path ./Build/src -File | Select-Object Name, Length 140 | } else { 141 | echo "./Build/src does not exist!" 142 | } 143 | echo "" 144 | echo "Searching for .cod files recursively:" 145 | $asmFiles = Get-ChildItem -Path ./Build -Recurse -Filter "*.cod" -File -ErrorAction SilentlyContinue 146 | 147 | if ($null -eq $asmFiles -or $asmFiles.Count -eq 0) { 148 | echo "NO .ASM FILES FOUND!" 
149 | echo "" 150 | echo "Checking if /Fa flag was applied - looking for compilation output:" 151 | Get-ChildItem -Path ./Build/src -Recurse -File | Where-Object { $_.Extension -match '\.(obj|cod|cod)' } | Select-Object FullName 152 | } else { 153 | echo "Found $($asmFiles.Count) assembly file(s):" 154 | $asmFiles | ForEach-Object { echo $_.FullName } 155 | 156 | echo "" 157 | echo "=== Assembly file details ===" 158 | $asmFiles | ForEach-Object { 159 | echo "File: $($_.FullName)" 160 | echo "Size: $($_.Length) bytes" 161 | echo "Lines: $((Get-Content $_.FullName).Count)" 162 | echo "" 163 | } 164 | 165 | echo "" 166 | echo "=== Printing COMPLETE assembly files ===" 167 | $asmFiles | ForEach-Object { 168 | echo "========== $($_.FullName) ==========" 169 | Get-Content $_.FullName 170 | echo "" 171 | echo "========== END OF $($_.FullName) ==========" 172 | echo "" 173 | } 174 | } 175 | 176 | - name: Install the Test (Unix) 177 | if: matrix.os != 'windows-latest' 178 | run: | 179 | sudo cmake --install ./Build --config=Release 180 | 181 | - name: Install the Test (Windows) 182 | if: matrix.os == 'windows-latest' 183 | run: | 184 | cmake --install ./Build --config=Release 185 | 186 | - name: Run the Test (Unix) 187 | if: matrix.os != 'windows-latest' 188 | run: | 189 | sudo chmod +x /usr/local/bin/benchmarksuite-main 190 | sudo /usr/local/bin/benchmarksuite-main 191 | 192 | - name: Run the Test (Windows) 193 | if: matrix.os == 'windows-latest' 194 | run: | 195 | & "C:/Program Files (x86)/benchmarksuite/bin/benchmarksuite-main.exe" -------------------------------------------------------------------------------- /include/bnch_swt/random_generators.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | MIT License 3 | 4 | Copyright (c) 2024 RealTimeChris 5 | 6 | Permission is hereby granted, free of charge, to any person obtaining a copy of this 7 | software and associated documentation files (the "Software"), to deal in the 
Software 8 | without restriction, including without limitation the rights to use, copy, modify, merge, 9 | publish, distribute, sublicense, and/or sell copies of the Software, and to permit 10 | persons to whom the Software is furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all copies or 13 | substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, 16 | INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR 17 | PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE 18 | FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR 19 | OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 20 | DEALINGS IN THE SOFTWARE. 21 | */ 22 | /// https://github.com/RealTimeChris/benchmarksuite 23 | /// Feb 3, 2023 24 | #pragma once 25 | 26 | #include 27 | #include 28 | #include 29 | #include 30 | #include 31 | #include 32 | 33 | namespace bnch_swt { 34 | 35 | /* Returns a seed taken from the current time: clock_type::now().time_since_epoch(), converted with duration_cast, as a tick count. */ BNCH_SWT_HOST static uint64_t get_time_based_seed() noexcept { 36 | return std::chrono::duration_cast>(clock_type::now().time_since_epoch()).count(); 37 | } 38 | 39 | /* Seed selector for xoshiro_256_base. deterministic is the first enumerator (value 0) and is used directly as the seed; time_based is a max() sentinel that makes the constructor call get_time_based_seed() instead. */ enum class xoshiro_256_seeds : uint64_t { 40 | deterministic, 41 | time_based = std::numeric_limits::max(), 42 | }; 43 | 44 | /* Pseudo-random generator with four 64-bit state words; operator() performs the xoshiro256** step. The seed source is chosen at compile time through xoshiro_256_seed. */ template 45 | struct xoshiro_256_base { 46 | /* Fills the four state words with successive splitmix64 outputs of the chosen seed, then discards the first two generator outputs. */ BNCH_SWT_HOST constexpr xoshiro_256_base() { 47 | if constexpr (xoshiro_256_seed == xoshiro_256_seeds::time_based) { 48 | uint64_t s = get_time_based_seed(); 49 | for (uint64_t y = 0; y < 4; ++y) { 50 | state[y] = splitmix64(s); 51 | } 52 | } else { 53 | uint64_t s = static_cast(xoshiro_256_seed); 54 | for (uint64_t y = 0; y < 4; ++y) { 55 | state[y] = splitmix64(s); 56 | } 57 | } 58 | 59 | this->operator()(); 60 | this->operator()(); 61 | } 62 | 63 | /* Returns the next 64-bit output and advances the state. The output is computed from state[1] before the state words are mixed. */ BNCH_SWT_HOST constexpr uint64_t operator()() 
noexcept { 64 | const uint64_t result = rotl(state[1ull] * 5ull, 7ull) * 9ull; 65 | const uint64_t t = state[1ull] << 17ull; 66 | 67 | state[2ull] ^= state[0ull]; 68 | state[3ull] ^= state[1ull]; 69 | state[1ull] ^= state[2ull]; 70 | state[0ull] ^= state[3ull]; 71 | 72 | state[2ull] ^= t; 73 | 74 | state[3ull] = rotl(state[3ull], 45ull); 75 | 76 | return result; 77 | } 78 | 79 | protected: 80 | mutable std::array state{}; 81 | 82 | /* Rotates x left by k bits. k must be between 1 and 63: the right shift uses 64 - k, so k == 0 would shift by the full width. The callers in this struct pass 7 and 45. */ BNCH_SWT_HOST constexpr uint64_t rotl(const uint64_t x, const uint64_t k) const noexcept { 83 | return (x << k) | (x >> (64ull - k)); 84 | } 85 | 86 | /* One splitmix64 step, used only for seeding. seed64 is taken by reference and is advanced in place by 0x9E3779B97F4A7C15, so consecutive calls with the same variable yield different values. */ BNCH_SWT_HOST constexpr uint64_t splitmix64(uint64_t& seed64) const noexcept { 87 | uint64_t result = seed64 += 0x9E3779B97F4A7C15ull; 88 | result = (result ^ (result >> 30ull)) * 0xBF58476D1CE4E5B9ull; 89 | result = (result ^ (result >> 27ull)) * 0x94D049BB133111EBull; 90 | return result ^ (result >> 31ull); 91 | } 92 | }; 93 | 94 | /* Integer front end: draws values between min and max with both bounds included. */ template struct xoshiro_256 : public xoshiro_256_base { 95 | using value_type = std::make_unsigned_t; 96 | 97 | /* Returns min when min >= max. Otherwise returns min plus a draw reduced modulo (max - min + 1). When the range covers every value of value_type the raw draw is cast and returned directly. */ BNCH_SWT_HOST value_type_new operator()(value_type_new min, value_type_new max) { 98 | if (min >= max) { 99 | return min; 100 | } 101 | 102 | value_type range = static_cast(max) - static_cast(min); 103 | 104 | if (range == std::numeric_limits::max()) { 105 | return static_cast(xoshiro_256_base::operator()()); 106 | } 107 | 108 | constexpr uint64_t max_val = std::numeric_limits::max(); 109 | const uint64_t bucket_size = range + 1; 110 | /* largest multiple of bucket_size that does not exceed max_val; draws at or above it are rejected so that every remainder is equally likely */ const uint64_t threshold = (max_val / bucket_size) * bucket_size; 111 | 112 | uint64_t result; 113 | do { 114 | result = xoshiro_256_base::operator()(); 115 | } while (result >= threshold); 116 | 117 | return static_cast(static_cast(min) + (result % bucket_size)); 118 | } 119 | }; 120 | 121 | /* Per-type constants used to turn a 64-bit draw into a floating-point fraction. */ template struct xoshiro_256_traits; 122 | 123 | /* 4-byte types: shifting right by 40 keeps the top 24 bits of the draw; multiplying by 2^-24 scales them below 1. */ template 124 | requires(sizeof(value_type) == 4) 125 | struct xoshiro_256_traits { 126 | static constexpr value_type multiplicand{ 0x1.0p-24 }; 127 | static constexpr uint64_t shift{ 
40 }; 128 | }; 129 | 130 | /* 8-byte types: shifting right by 11 keeps the top 53 bits of the draw; multiplying by 2^-53 scales them below 1. */ template 131 | requires(sizeof(value_type) == 8) 132 | struct xoshiro_256_traits { 133 | static constexpr value_type multiplicand{ 0x1.0p-53 }; 134 | static constexpr uint64_t shift{ 11 }; 135 | }; 136 | 137 | /* Floating-point front end. */ template struct xoshiro_256 138 | : public xoshiro_256_base { 139 | /* Returns min + (max - min) * next(). No ordering of min and max is checked. */ BNCH_SWT_HOST value_type operator()(value_type min, value_type max) { 140 | return min + (max - min) * next(); 141 | } 142 | 143 | protected: 144 | /* Returns a fraction that is at least 0 and less than 1: the draw is shifted right by the traits shift and multiplied by the traits multiplicand. */ BNCH_SWT_HOST value_type next() { 145 | return static_cast( 146 | (xoshiro_256_base::operator()() >> xoshiro_256_traits::shift) * xoshiro_256_traits::multiplicand); 147 | } 148 | }; 149 | 150 | /* Random value factories, one specialization per kind of value_type. Each impl keeps its engine in a function-local static thread_local variable, so every thread has its own engine. */ template struct random_generator; 151 | 152 | /* Resizable-container case: impl(length) returns a value_type resized to length whose elements are each random_engine(32, 127) cast to the element type. */ template struct random_generator { 153 | BNCH_SWT_HOST static value_type impl(uint64_t length) { 154 | static thread_local xoshiro_256 random_engine{}; 155 | value_type result{}; 156 | result.resize(length); 157 | for (uint64_t x = 0; x < length; ++x) { 158 | result[x] = static_cast(random_engine(32, 127)); 159 | } 160 | return result; 161 | } 162 | }; 163 | 164 | /* Two-valued case: impl() returns random_engine(0, 1) cast to value_type. */ template struct random_generator { 165 | BNCH_SWT_HOST static value_type impl() { 166 | static thread_local xoshiro_256 random_engine{}; 167 | return static_cast(random_engine(0, 1)); 168 | } 169 | }; 170 | 171 | /* Case with floating-point defaults: impl(min, max) forwards to the engine; min defaults to -1.0 and max to 1.0. */ template struct random_generator { 172 | BNCH_SWT_HOST static value_type impl(value_type min = static_cast(-1.0), value_type max = static_cast(1.0)) { 173 | static thread_local xoshiro_256 random_engine{}; 174 | return random_engine(min, max); 175 | } 176 | }; 177 | 178 | /* Unsigned case (std::is_unsigned_v): impl(min, max) defaults to the numeric_limits min() and max() bounds. */ template 179 | requires(std::is_unsigned_v) 180 | struct random_generator { 181 | BNCH_SWT_HOST static value_type impl(value_type min = std::numeric_limits::min(), value_type max = std::numeric_limits::max()) { 182 | static thread_local xoshiro_256 random_engine{}; 183 | return static_cast(random_engine(min, max)); 184 | } 185 | }; 186 | 187 | /* Signed case (std::is_signed_v): impl(min, max) defaults to the numeric_limits min() and max() bounds. */ template 188 | requires(std::is_signed_v) 189 | struct random_generator { 190 | BNCH_SWT_HOST static 
value_type impl(value_type min = std::numeric_limits::min(), value_type max = std::numeric_limits::max()) { 191 | static thread_local xoshiro_256 random_engine{}; 192 | return random_engine(min, max); 193 | } 194 | }; 195 | 196 | } 197 | -------------------------------------------------------------------------------- /cmake/detection/benchmarksuite_detect_gpu_properties.cmake: -------------------------------------------------------------------------------- 1 | # MIT License 2 | # 3 | # Copyright (c) 2024 RealTimeChris 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy of this 6 | # software and associated documentation files (the "Software"), to deal in the Software 7 | # without restriction, including without limitation the rights to use, copy, modify, merge, 8 | # publish, distribute, sublicense, and/or sell copies of the Software, and to permit 9 | # persons to whom the Software is furnished to do so, subject to the following conditions: 10 | # 11 | # The above copyright notice and this permission notice shall be included in all copies or 12 | # substantial portions of the Software. 13 | # 14 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, 15 | # INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR 16 | # PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE 17 | # FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR 18 | # OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 19 | # DEALINGS IN THE SOFTWARE. 
20 | 21 | if(UNIX OR APPLE) 22 | file(WRITE ${CMAKE_CURRENT_SOURCE_DIR}/cmake/detection/build_feature_tester_gpu_properties.sh "#!/bin/bash\n" 23 | "\"${CMAKE_COMMAND}\" -S ./ -B ./Build-Gpu-Properties -DCMAKE_BUILD_TYPE=Release -DCMAKE_CXX_COMPILER=\"${CMAKE_CXX_COMPILER}\" -DBNCH_SWT_DETECT_GPU_PROPERTIES=TRUE\n" 24 | "\"${CMAKE_COMMAND}\" --build ./Build-Gpu-Properties --config=Release" 25 | ) 26 | 27 | execute_process( 28 | COMMAND chmod +x ${CMAKE_CURRENT_SOURCE_DIR}/cmake/detection/build_feature_tester_gpu_properties.sh 29 | RESULT_VARIABLE CHMOD_RESULT 30 | ) 31 | 32 | if(NOT CHMOD_RESULT EQUAL 0) 33 | message(FATAL_ERROR "Failed to set executable permissions for build_feature_tester_gpu_properties.sh") 34 | endif() 35 | 36 | execute_process( 37 | COMMAND ${CMAKE_CURRENT_SOURCE_DIR}/cmake/detection/build_feature_tester_gpu_properties.sh 38 | WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/cmake/detection 39 | ) 40 | 41 | set(FEATURE_TESTER_FILE ${CMAKE_CURRENT_SOURCE_DIR}/cmake/detection/Build-Gpu-Properties/feature_detector) 42 | 43 | elseif(WIN32) 44 | file(WRITE ${CMAKE_CURRENT_SOURCE_DIR}/cmake/detection/build_feature_tester_gpu_properties.bat 45 | "\"${CMAKE_COMMAND}\" -S ./ -B ./Build-Gpu-Properties -DBNCH_SWT_DETECT_GPU_PROPERTIES=TRUE\n" 46 | "\"${CMAKE_COMMAND}\" --build ./Build-Gpu-Properties --config=Release" 47 | ) 48 | 49 | execute_process( 50 | COMMAND ${CMAKE_CURRENT_SOURCE_DIR}/cmake/detection/build_feature_tester_gpu_properties.bat 51 | WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/cmake/detection 52 | ) 53 | 54 | set(FEATURE_TESTER_FILE ${CMAKE_CURRENT_SOURCE_DIR}/cmake/detection/Build-Gpu-Properties/Release/feature_detector.exe) 55 | endif() 56 | 57 | if(NOT DEFINED BNCH_SWT_SM_COUNT OR 58 | NOT DEFINED BNCH_SWT_MAX_THREADS_PER_SM OR 59 | NOT DEFINED BNCH_SWT_MAX_THREADS_PER_BLOCK OR 60 | NOT DEFINED BNCH_SWT_WARP_SIZE OR 61 | NOT DEFINED BNCH_SWT_GPU_L2_CACHE_SIZE OR 62 | NOT DEFINED BNCH_SWT_SHARED_MEM_PER_BLOCK OR 63 | NOT DEFINED 
BNCH_SWT_MAX_GRID_SIZE_X OR 64 | NOT DEFINED BNCH_SWT_MAX_GRID_SIZE_Y OR 65 | NOT DEFINED BNCH_SWT_MAX_GRID_SIZE_Z OR 66 | NOT DEFINED BNCH_SWT_GPU_ARCH_INDEX OR 67 | NOT BNCH_SWT_DETECT_GPU_PROPERTIES) 68 | 69 | execute_process( 70 | COMMAND ${FEATURE_TESTER_FILE} 71 | RESULT_VARIABLE FEATURE_TESTER_EXIT_CODE 72 | OUTPUT_VARIABLE GPU_PROPERTIES_OUTPUT 73 | ERROR_VARIABLE FEATURE_TESTER_ERROR 74 | OUTPUT_STRIP_TRAILING_WHITESPACE 75 | ) 76 | endif() 77 | 78 | if(FEATURE_TESTER_EXIT_CODE EQUAL 0 AND GPU_PROPERTIES_OUTPUT MATCHES "GPU_SUCCESS=1") 79 | 80 | string(REGEX MATCH "SM_COUNT=([0-9]+)" _ ${GPU_PROPERTIES_OUTPUT}) 81 | if(NOT DEFINED BNCH_SWT_SM_COUNT) 82 | set(BNCH_SWT_SM_COUNT ${CMAKE_MATCH_1} CACHE STRING "GPU SM count" FORCE) 83 | endif() 84 | 85 | string(REGEX MATCH "MAX_THREADS_PER_SM=([0-9]+)" _ ${GPU_PROPERTIES_OUTPUT}) 86 | if(NOT DEFINED BNCH_SWT_MAX_THREADS_PER_SM) 87 | set(BNCH_SWT_MAX_THREADS_PER_SM ${CMAKE_MATCH_1} CACHE STRING "GPU max threads per SM" FORCE) 88 | endif() 89 | 90 | string(REGEX MATCH "MAX_THREADS_PER_BLOCK=([0-9]+)" _ ${GPU_PROPERTIES_OUTPUT}) 91 | if(NOT DEFINED BNCH_SWT_MAX_THREADS_PER_BLOCK) 92 | set(BNCH_SWT_MAX_THREADS_PER_BLOCK ${CMAKE_MATCH_1} CACHE STRING "GPU max threads per block" FORCE) 93 | endif() 94 | 95 | string(REGEX MATCH "WARP_SIZE=([0-9]+)" _ ${GPU_PROPERTIES_OUTPUT}) 96 | if(NOT DEFINED BNCH_SWT_WARP_SIZE) 97 | set(BNCH_SWT_WARP_SIZE ${CMAKE_MATCH_1} CACHE STRING "GPU warp size" FORCE) 98 | endif() 99 | 100 | string(REGEX MATCH "L2_CACHE_SIZE=([0-9]+)" _ ${GPU_PROPERTIES_OUTPUT}) 101 | if(NOT DEFINED BNCH_SWT_GPU_L2_CACHE_SIZE) 102 | set(BNCH_SWT_GPU_L2_CACHE_SIZE ${CMAKE_MATCH_1} CACHE STRING "GPU L2 cache size" FORCE) 103 | endif() 104 | 105 | string(REGEX MATCH "SHARED_MEM_PER_BLOCK=([0-9]+)" _ ${GPU_PROPERTIES_OUTPUT}) 106 | if(NOT DEFINED BNCH_SWT_SHARED_MEM_PER_BLOCK) 107 | set(BNCH_SWT_SHARED_MEM_PER_BLOCK ${CMAKE_MATCH_1} CACHE STRING "GPU shared memory per block" FORCE) 108 | endif() 109 | 110 | 
string(REGEX MATCH "MAX_GRID_SIZE_X=([0-9]+)" _ ${GPU_PROPERTIES_OUTPUT}) 111 | if(NOT DEFINED BNCH_SWT_MAX_GRID_SIZE_X) 112 | set(BNCH_SWT_MAX_GRID_SIZE_X ${CMAKE_MATCH_1} CACHE STRING "GPU max grid size X" FORCE) 113 | endif() 114 | 115 | string(REGEX MATCH "MAX_GRID_SIZE_Y=([0-9]+)" _ ${GPU_PROPERTIES_OUTPUT}) 116 | if(NOT DEFINED BNCH_SWT_MAX_GRID_SIZE_Y) 117 | set(BNCH_SWT_MAX_GRID_SIZE_Y ${CMAKE_MATCH_1} CACHE STRING "GPU max grid size Y" FORCE) 118 | endif() 119 | 120 | string(REGEX MATCH "MAX_GRID_SIZE_Z=([0-9]+)" _ ${GPU_PROPERTIES_OUTPUT}) 121 | if(NOT DEFINED BNCH_SWT_MAX_GRID_SIZE_Z) 122 | set(BNCH_SWT_MAX_GRID_SIZE_Z ${CMAKE_MATCH_1} CACHE STRING "GPU max grid size Z" FORCE) 123 | endif() 124 | 125 | string(REGEX MATCH "MAJOR_COMPUTE_CAPABILITY=([0-9]+)" _ ${GPU_PROPERTIES_OUTPUT}) 126 | if(NOT DEFINED BNCH_SWT_MAJOR_COMPUTE_CAPABILITY) 127 | set(BNCH_SWT_MAJOR_COMPUTE_CAPABILITY ${CMAKE_MATCH_1} CACHE STRING "GPU major compute capability" FORCE) 128 | endif() 129 | 130 | string(REGEX MATCH "MINOR_COMPUTE_CAPABILITY=([0-9]+)" _ ${GPU_PROPERTIES_OUTPUT}) 131 | if(NOT DEFINED BNCH_SWT_MINOR_COMPUTE_CAPABILITY) 132 | set(BNCH_SWT_MINOR_COMPUTE_CAPABILITY ${CMAKE_MATCH_1} CACHE STRING "GPU minor compute capability" FORCE) 133 | endif() 134 | 135 | string(REGEX MATCH "GPU_ARCH_INDEX=([0-9]+)" _ ${GPU_PROPERTIES_OUTPUT}) 136 | if(NOT DEFINED BNCH_SWT_GPU_ARCH_INDEX) 137 | set(BNCH_SWT_GPU_ARCH_INDEX ${CMAKE_MATCH_1} CACHE STRING "GPU architecture index" FORCE) 138 | endif() 139 | 140 | if(NOT DEFINED BNCH_SWT_GPU_PROPERTIES_ERECTED) 141 | set(BNCH_SWT_GPU_PROPERTIES_ERECTED TRUE CACHE BOOL "GPU properties successfully detected" FORCE) 142 | endif() 143 | 144 | message(STATUS "GPU Properties detected successfully") 145 | 146 | else() 147 | message(WARNING "GPU feature detector failed, using reasonable default values for unset properties") 148 | 149 | if(NOT DEFINED BNCH_SWT_SM_COUNT) 150 | set(BNCH_SWT_SM_COUNT 16 CACHE STRING "GPU SM count (fallback)" FORCE) 
151 | endif() 152 | 153 | if(NOT DEFINED BNCH_SWT_MAX_THREADS_PER_SM) 154 | set(BNCH_SWT_MAX_THREADS_PER_SM 1024 CACHE STRING "GPU max threads per SM (fallback)" FORCE) 155 | endif() 156 | 157 | if(NOT DEFINED BNCH_SWT_MAX_THREADS_PER_BLOCK) 158 | set(BNCH_SWT_MAX_THREADS_PER_BLOCK 1024 CACHE STRING "GPU max threads per block (fallback)" FORCE) 159 | endif() 160 | 161 | if(NOT DEFINED BNCH_SWT_WARP_SIZE) 162 | set(BNCH_SWT_WARP_SIZE 32 CACHE STRING "GPU warp size (fallback)" FORCE) 163 | endif() 164 | 165 | if(NOT DEFINED BNCH_SWT_GPU_L2_CACHE_SIZE) 166 | set(BNCH_SWT_GPU_L2_CACHE_SIZE 2097152 CACHE STRING "GPU L2 cache size (fallback)" FORCE) 167 | endif() 168 | 169 | if(NOT DEFINED BNCH_SWT_SHARED_MEM_PER_BLOCK) 170 | set(BNCH_SWT_SHARED_MEM_PER_BLOCK 49152 CACHE STRING "GPU shared memory per block (fallback)" FORCE) 171 | endif() 172 | 173 | if(NOT DEFINED BNCH_SWT_MAX_GRID_SIZE_X) 174 | set(BNCH_SWT_MAX_GRID_SIZE_X 2147483647 CACHE STRING "GPU max grid size X (fallback)" FORCE) 175 | endif() 176 | 177 | if(NOT DEFINED BNCH_SWT_MAX_GRID_SIZE_Y) 178 | set(BNCH_SWT_MAX_GRID_SIZE_Y 65535 CACHE STRING "GPU max grid size Y (fallback)" FORCE) 179 | endif() 180 | 181 | if(NOT DEFINED BNCH_SWT_MAX_GRID_SIZE_Z) 182 | set(BNCH_SWT_MAX_GRID_SIZE_Z 65535 CACHE STRING "GPU max grid size Z (fallback)" FORCE) 183 | endif() 184 | 185 | if(NOT DEFINED BNCH_SWT_GPU_ARCH_INDEX) 186 | set(BNCH_SWT_GPU_ARCH_INDEX 0 CACHE STRING "GPU architecture index (fallback)" FORCE) 187 | endif() 188 | endif() 189 | 190 | if(NOT DEFINED BNCH_SWT_TOTAL_THREADS) 191 | math(EXPR BNCH_SWT_TOTAL_THREADS "${BNCH_SWT_SM_COUNT} * ${BNCH_SWT_MAX_THREADS_PER_SM}") 192 | set(BNCH_SWT_TOTAL_THREADS ${BNCH_SWT_TOTAL_THREADS} CACHE STRING "GPU total concurrent threads" FORCE) 193 | endif() 194 | 195 | message(STATUS "GPU Configuration: ${BNCH_SWT_SM_COUNT} SMs, ${BNCH_SWT_TOTAL_THREADS} total threads") 196 | 197 | configure_file( 198 | 
${CMAKE_CURRENT_SOURCE_DIR}/cmake/detection/benchmarksuite_gpu_properties.hpp.in 199 | ${CMAKE_CURRENT_SOURCE_DIR}/include/bnch_swt/benchmarksuite_gpu_properties.hpp 200 | @ONLY 201 | ) -------------------------------------------------------------------------------- /cmake/flags_and_options.cmake: -------------------------------------------------------------------------------- 1 | # MIT License 2 | # 3 | # Copyright (c) 2024 RealTimeChris 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy of this 6 | # software and associated documentation files (the "Software"), to deal in the Software 7 | # without restriction, including without limitation the rights to use, copy, modify, merge, 8 | # publish, distribute, sublicense, and/or sell copies of the Software, and to permit 9 | # persons to whom the Software is furnished to do so, subject to the following conditions: 10 | # 11 | # The above copyright notice and this permission notice shall be included in all copies or 12 | # substantial portions of the Software. 13 | # 14 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, 15 | # INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR 16 | # PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE 17 | # FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR 18 | # OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 19 | # DEALINGS IN THE SOFTWARE. 
20 | 21 | set(BNCH_SWT_COMPILE_DEFINITIONS 22 | BNCH_SWT_COMPILER_CUDA=$,1,0> 23 | BNCH_SWT_ARCH_X64=$,$>,1,0> 24 | BNCH_SWT_ARCH_ARM64=$,$,$>,1,0> 25 | BNCH_SWT_PLATFORM_ANDROID=$,1,0> 26 | BNCH_SWT_PLATFORM_WINDOWS=$,1,0> 27 | BNCH_SWT_PLATFORM_LINUX=$,1,0> 28 | BNCH_SWT_PLATFORM_MAC=$,1,0> 29 | BNCH_SWT_COMPILER_CLANG=$,$>,1,0> 30 | BNCH_SWT_COMPILER_MSVC=$,1,0> 31 | BNCH_SWT_COMPILER_GCC=$,1,0> 32 | BNCH_SWT_DEV=$,1,0> 33 | BNCH_SWT_CUDA_TENSOR_CORES=$,$>,1,0> 34 | BNCH_SWT_CUDA_MAX_REGISTERS=$,128,0> 35 | "BNCH_SWT_HOST_DEVICE=$,$,__forceinline__ __host__ __device__,__noinline__ __host__ __device__>,$,$,[[msvc::forceinline]] inline,inline __attribute__((always_inline))>,$,[[msvc::noinline]],__attribute__((noinline))>>>" 36 | "BNCH_SWT_HOST=$,$,__forceinline__ __host__,__noinline__ __host__>,$,$,[[msvc::forceinline]] inline,inline __attribute__((always_inline))>,$,[[msvc::noinline]],__attribute__((noinline))>>>" 37 | "BNCH_SWT_STATIC_HOST=$,$,static __forceinline__ __host__,static __noinline__ __host__>,$,$,[[msvc::forceinline]] static inline,inline static __attribute__((always_inline))>,$,[[msvc::noinline]] static ,__attribute__((noinline))>>>" 38 | "BNCH_SWT_NOINLINE_DEVICE=$,$,__noinline__ __device__,__noinline__ __device__>,$,$,[[msvc::noinline]],__attribute__((noinline))>,$,[[msvc::noinline]],__attribute__((noinline))>>>" 39 | "BNCH_SWT_NOINLINE=$,$,__noinline__,__noinline__>,$,$,[[msvc::noinline]],__attribute__((noinline))>,$,[[msvc::noinline]],__attribute__((noinline))>>>" 40 | "BNCH_SWT_DEVICE=$,$,__forceinline__ __device__,__noinline__ __device__>,$,$,[[msvc::forceinline]] inline,inline __attribute__((always_inline))>,$,[[msvc::noinline]],__attribute__((noinline))>>>" 41 | "BNCH_SWT_GLOBAL=__global__" 42 | "half=$,__half,uint16_t>" 43 | "half2=$,__half2,uint32_t>" 44 | "bf16_t=$,__nv_bfloat16,uint16_t>" 45 | $<$:NOMINMAX;WIN32_LEAN_AND_MEAN> 46 | ${BNCH_SWT_SIMD_DEFINITIONS} 47 | ) 48 | 49 | set(BNCH_SWT_CLANG_COMPILE_OPTIONS 50 | -O3 51 | 
-funroll-loops 52 | -fvectorize 53 | -fslp-vectorize 54 | -finline-functions 55 | -fomit-frame-pointer 56 | -fmerge-all-constants 57 | -ffunction-sections 58 | -fdata-sections 59 | -falign-functions=32 60 | -fno-math-errno 61 | -ffp-contract=on 62 | -fvisibility=hidden 63 | -fvisibility-inlines-hidden 64 | -fno-rtti 65 | -fno-asynchronous-unwind-tables 66 | -fno-unwind-tables 67 | -fno-ident 68 | -pipe 69 | -fno-common 70 | -fwrapv 71 | -Weverything 72 | -Wnon-virtual-dtor 73 | -Wno-c++98-compat 74 | -Wno-c++98-compat-pedantic 75 | -Wno-unsafe-buffer-usage 76 | -Wno-padded 77 | -Wno-c++20-compat 78 | -Wno-exit-time-destructors 79 | -Wno-c++20-extensions 80 | ) 81 | 82 | set(BNCH_SWT_APPLECLANG_COMPILE_OPTIONS 83 | -O3 84 | -funroll-loops 85 | -fvectorize 86 | -fslp-vectorize 87 | -finline-functions 88 | -fomit-frame-pointer 89 | -fmerge-all-constants 90 | -ffunction-sections 91 | -fdata-sections 92 | -falign-functions=32 93 | -fno-math-errno 94 | -ffp-contract=on 95 | -fvisibility=hidden 96 | -fvisibility-inlines-hidden 97 | -fno-rtti 98 | -fno-asynchronous-unwind-tables 99 | -fno-unwind-tables 100 | -fno-ident 101 | -pipe 102 | -fno-common 103 | -fwrapv 104 | -Weverything 105 | -Wnon-virtual-dtor 106 | -Wno-c++98-compat 107 | -Wno-c++98-compat-pedantic 108 | -Wno-unsafe-buffer-usage 109 | -Wno-padded 110 | -Wno-c++20-compat 111 | -Wno-exit-time-destructors 112 | -Wno-poison-system-directories 113 | -Wno-c++20-extensions 114 | ) 115 | 116 | set(BNCH_SWT_GNU_COMPILE_OPTIONS 117 | -O3 118 | -funroll-loops 119 | -finline-functions 120 | -fomit-frame-pointer 121 | -fno-math-errno 122 | -falign-functions=32 123 | -falign-loops=32 124 | -fprefetch-loop-arrays 125 | -ftree-vectorize 126 | -fstrict-aliasing 127 | -ffunction-sections 128 | -fdata-sections 129 | -fvisibility=hidden 130 | -fvisibility-inlines-hidden 131 | -fno-keep-inline-functions 132 | -fno-ident 133 | -fmerge-all-constants 134 | -fno-rtti 135 | -fgcse-after-reload 136 | -ftree-loop-distribute-patterns 137 
| -fpredictive-commoning 138 | -funswitch-loops 139 | -ftree-loop-vectorize 140 | -ftree-slp-vectorize 141 | -Wall 142 | -Wextra 143 | -Wpedantic 144 | -Wnon-virtual-dtor 145 | -Wlogical-op 146 | -Wduplicated-cond 147 | -Wduplicated-branches 148 | -Wnull-dereference 149 | -Wdouble-promotion 150 | ) 151 | 152 | set(BNCH_SWT_MSVC_RELEASE_FLAGS 153 | /Ob3 154 | /Ot 155 | /Oy 156 | /GT 157 | $<$>:/GL> 158 | /fp:precise 159 | /Qpar 160 | /GS- 161 | ) 162 | 163 | set(BNCH_SWT_MSVC_COMPILE_OPTIONS 164 | /Gy 165 | /Gw 166 | $<$>:/Zc:inline> 167 | /Zc:throwingNew 168 | /W4 169 | $<$>:/bigobj> 170 | /permissive- 171 | /Zc:__cplusplus 172 | /wd4820 173 | /wd4324 174 | /wd5002 175 | /Zc:alignedNew 176 | /Zc:auto 177 | /Zc:forScope 178 | /Zc:implicitNoexcept 179 | /Zc:noexceptTypes 180 | /Zc:referenceBinding 181 | /Zc:rvalueCast 182 | /Zc:sizedDealloc 183 | /Zc:strictStrings 184 | /Zc:ternary 185 | /Zc:wchar_t 186 | $<$:${BNCH_SWT_MSVC_RELEASE_FLAGS}> 187 | ) 188 | 189 | string(TOUPPER "${CMAKE_CUDA_HOST_COMPILER_ID}" BNCH_SWT_HOST_COMPILER_ID) 190 | 191 | set(BNCH_SWT_NVCC_HOST_FLAGS "") 192 | foreach(flag ${BNCH_SWT_${BNCH_SWT_HOST_COMPILER_ID}_COMPILE_OPTIONS}) 193 | list(APPEND BNCH_SWT_NVCC_HOST_FLAGS "-Xcompiler=${flag}") 194 | endforeach() 195 | 196 | set(BNCH_SWT_NVCC_COMPILE_OPTIONS 197 | ${BNCH_SWT_NVCC_HOST_FLAGS} 198 | $<$:-g -G> 199 | $<$>:-O3> 200 | --fmad=false 201 | --prec-div=true 202 | --prec-sqrt=true 203 | --restrict 204 | --extended-lambda 205 | ) 206 | 207 | set(BNCH_SWT_CXX_COMPILE_OPTIONS 208 | $<$:${BNCH_SWT_CLANG_COMPILE_OPTIONS}> 209 | $<$:${BNCH_SWT_APPLECLANG_COMPILE_OPTIONS}> 210 | $<$:${BNCH_SWT_GNU_COMPILE_OPTIONS}> 211 | $<$:${BNCH_SWT_MSVC_COMPILE_OPTIONS}> 212 | ) 213 | 214 | set(BNCH_SWT_COMPILE_OPTIONS 215 | $<$:${BNCH_SWT_CXX_COMPILE_OPTIONS}> 216 | $<$:${BNCH_SWT_NVCC_COMPILE_OPTIONS}> 217 | ${BNCH_SWT_SIMD_FLAGS} 218 | ) 219 | 220 | set(BNCH_SWT_LINK_OPTIONS 221 | $<$,$>: 222 | -Wl,-dead_strip 223 | -Wl,-x 224 | -Wl,-S 225 | > 226 | 
$<$,$>: 227 | -Wl,-dead_strip 228 | -Wl,-x 229 | -Wl,-S 230 | > 231 | $<$,$>: 232 | -Wl,-dead_strip 233 | -Wl,-x 234 | -Wl,-S 235 | > 236 | $<$,$>: 237 | -Wl,--gc-sections 238 | -Wl,--strip-all 239 | -Wl,--build-id=none 240 | -Wl,--hash-style=gnu 241 | -Wl,-z,now 242 | -Wl,-z,relro 243 | -flto=thin 244 | -fwhole-program-vtables 245 | > 246 | $<$,$>: 247 | -Wl,--gc-sections 248 | -Wl,--strip-all 249 | -Wl,--as-needed 250 | -Wl,-O3 251 | > 252 | $<$,$>: 253 | /DYNAMICBASE:NO 254 | /OPT:REF 255 | /OPT:ICF 256 | /INCREMENTAL:NO 257 | /MACHINE:X64 258 | /LTCG 259 | > 260 | $<$,$>: 261 | -lcudart_static 262 | -lrt 263 | -ldl 264 | -lpthread 265 | > 266 | ) -------------------------------------------------------------------------------- /cmake/detection/main.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | MIT License 3 | 4 | Copyright (c) 2024 RealTimeChris 5 | 6 | Permission is hereby granted, free of charge, to any person obtaining a copy of this 7 | software and associated documentation files (the "Software"), to deal in the Software 8 | without restriction, including without limitation the rights to use, copy, modify, merge, 9 | publish, distribute, sublicense, and/or sell copies of the Software, and to permit 10 | persons to whom the Software is furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all copies or 13 | substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, 16 | INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR 17 | PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE 18 | FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR 19 | OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 20 | DEALINGS IN THE SOFTWARE. 21 | */ 22 | /// https://github.com/RealTimeChris/benchmarksuite 23 | #if defined(BNCH_SWT_DETECT_GPU_PROPERTIES) 24 | #include 25 | #include 26 | 27 | int32_t main() { 28 | cudaDeviceProp device_prop; 29 | cudaError_t result = cudaGetDeviceProperties(&device_prop, 0); 30 | 31 | if (result != cudaSuccess) { 32 | std::cout << "CUDA_ERROR=1" << std::endl; 33 | return 1; 34 | } 35 | 36 | uint32_t gpu_arch_index = 0; 37 | if (device_prop.major == 9) { 38 | gpu_arch_index = 1; 39 | } else if (device_prop.major == 10) { 40 | gpu_arch_index = 2; 41 | } else if (device_prop.major == 11) { 42 | gpu_arch_index = 3; 43 | } else if (device_prop.major == 12) { 44 | gpu_arch_index = 4; 45 | } else { 46 | gpu_arch_index = 0; 47 | } 48 | 49 | std::cout << "SM_COUNT=" << device_prop.multiProcessorCount << std::endl; 50 | std::cout << "MAX_THREADS_PER_SM=" << device_prop.maxThreadsPerMultiProcessor << std::endl; 51 | std::cout << "MAX_THREADS_PER_BLOCK=" << device_prop.maxThreadsPerBlock << std::endl; 52 | std::cout << "WARP_SIZE=" << device_prop.warpSize << std::endl; 53 | std::cout << "L2_CACHE_SIZE=" << device_prop.l2CacheSize << std::endl; 54 | std::cout << "SHARED_MEM_PER_BLOCK=" << device_prop.sharedMemPerBlock << std::endl; 55 | std::cout << "MEMORY_BUS_WIDTH=" << device_prop.memoryBusWidth << std::endl; 56 | std::cout << "MEMORY_CLOCK_RATE=" << device_prop.memoryClockRate << std::endl; 57 | std::cout << "MAJOR_COMPUTE_CAPABILITY=" << device_prop.major << std::endl; 58 | std::cout << "MINOR_COMPUTE_CAPABILITY=" << device_prop.minor << std::endl; 59 | std::cout << "MAX_GRID_SIZE_X=" << device_prop.maxGridSize[0] << std::endl; 60 | std::cout << "MAX_GRID_SIZE_Y=" << device_prop.maxGridSize[1] << std::endl; 61 
| std::cout << "MAX_GRID_SIZE_Z=" << device_prop.maxGridSize[2] << std::endl; 62 | std::cout << "MAX_BLOCK_SIZE_X=" << device_prop.maxThreadsPerBlock << std::endl; 63 | std::cout << "GPU_ARCH_INDEX=" << gpu_arch_index << std::endl; 64 | std::cout << "GPU_SUCCESS=1" << std::endl; 65 | 66 | return 0; 67 | } 68 | #elif defined(BNCH_SWT_DETECT_CPU_PROPERTIES) 69 | #include 70 | #include 71 | #include 72 | #include 73 | #include 74 | #include 75 | 76 | #if BNCH_SWT_COMPILER_MSVC 77 | #include 78 | #elif defined(HAVE_GCC_GET_CPUID) && defined(USE_GCC_GET_CPUID) 79 | #include 80 | #endif 81 | 82 | #if BNCH_SWT_PLATFORM_WINDOWS 83 | #include 84 | #endif 85 | #if BNCH_SWT_PLATFORM_LINUX || BNCH_SWT_PLATFORM_ANDROID 86 | #include 87 | #include 88 | #endif 89 | #if BNCH_SWT_PLATFORM_MAC 90 | #include 91 | #include 92 | #include 93 | #endif 94 | 95 | #if BNCH_SWT_ARCH_ARM64 96 | #if BNCH_SWT_PLATFORM_LINUX 97 | #include 98 | #include 99 | #elif BNCH_SWT_PLATFORM_MAC 100 | #include 101 | #endif 102 | #endif 103 | 104 | enum class instruction_set { 105 | FALLBACK = 0x0, 106 | AVX2 = 0x1, 107 | AVX512f = 0x2, 108 | NEON = 0x4, 109 | SVE2 = 0x8, 110 | }; 111 | 112 | enum class cache_level { 113 | one = 1, 114 | two = 2, 115 | three = 3, 116 | }; 117 | 118 | #if BNCH_SWT_ARCH_ARM64 119 | inline static uint32_t detect_supported_architectures() { 120 | uint32_t host_isa = static_cast(instruction_set::NEON); 121 | 122 | #if BNCH_SWT_PLATFORM_LINUX 123 | unsigned long hwcap = getauxval(AT_HWCAP); 124 | if (hwcap & HWCAP_SVE) { 125 | host_isa |= static_cast(instruction_set::SVE2); 126 | } 127 | #endif 128 | 129 | return host_isa; 130 | } 131 | 132 | #elif BNCH_SWT_ARCH_X64 133 | static constexpr uint64_t cpuid_avx2_bit = 1ul << 5; 134 | static constexpr uint64_t cpuid_avx512_bit = 1ul << 16; 135 | static constexpr uint64_t cpuid_avx256_saved = 1ull << 2; 136 | static constexpr uint64_t cpuid_avx512_saved = 7ull << 5; 137 | static constexpr uint64_t cpuid_osx_save = (1ul << 26) | (1ul << 
/// Executes the x86 CPUID instruction.
///
/// @param eax in: the leaf to query; out: the resulting EAX value.
/// @param ebx out: the resulting EBX value.
/// @param ecx in: the sub-leaf to query; out: the resulting ECX value.
/// @param edx out: the resulting EDX value.
///
/// Three implementations are selected at compile time: the MSVC intrinsic,
/// the <cpuid.h> helper (when HAVE_GCC_GET_CPUID and USE_GCC_GET_CPUID are
/// both defined), or raw inline assembly.
inline static void cpuid(uint32_t* eax, uint32_t* ebx, uint32_t* ecx, uint32_t* edx) {
#if BNCH_SWT_COMPILER_MSVC
    int32_t cpu_info[4];
    __cpuidex(cpu_info, static_cast<int32_t>(*eax), static_cast<int32_t>(*ecx));
    *eax = static_cast<uint32_t>(cpu_info[0]);
    *ebx = static_cast<uint32_t>(cpu_info[1]);
    *ecx = static_cast<uint32_t>(cpu_info[2]);
    *edx = static_cast<uint32_t>(cpu_info[3]);
#elif defined(HAVE_GCC_GET_CPUID) && defined(USE_GCC_GET_CPUID)
    const uint32_t leaf    = *eax;
    const uint32_t subleaf = *ecx;
    // __get_cpuid_count forwards the sub-leaf, which the plain __get_cpuid
    // helper does not; callers set ecx explicitly before querying leaf 7.
    if (!__get_cpuid_count(leaf, subleaf, eax, ebx, ecx, edx)) {
        // Unsupported leaf: the helper leaves the outputs untouched, so clear
        // them rather than letting the caller test stale register values.
        *eax = 0;
        *ebx = 0;
        *ecx = 0;
        *edx = 0;
    }
#else
    uint32_t a = *eax, b, c = *ecx, d;
    asm volatile("cpuid" : "=a"(a), "=b"(b), "=c"(c), "=d"(d) : "a"(a), "c"(c));
    *eax = a;
    *ebx = b;
    *ecx = c;
    *edx = d;
#endif
}
return static_cast(instruction_set::FALLBACK); 212 | } 213 | #endif 214 | 215 | inline uint64_t get_cache_size(cache_level level) { 216 | #if BNCH_SWT_PLATFORM_WINDOWS 217 | DWORD buffer_size = 0; 218 | std::vector buffer{}; 219 | GetLogicalProcessorInformation(nullptr, &buffer_size); 220 | buffer.resize(buffer_size / sizeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION)); 221 | 222 | if (!GetLogicalProcessorInformation(buffer.data(), &buffer_size)) { 223 | return 0; 224 | } 225 | 226 | const auto info_count = buffer_size / sizeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION); 227 | for (uint64_t i = 0; i < info_count; ++i) { 228 | if (buffer[i].Relationship == RelationCache && buffer[i].Cache.Level == static_cast(level)) { 229 | if (level == cache_level::one && buffer[i].Cache.Type == CacheData) { 230 | return buffer[i].Cache.Size; 231 | } else if (level != cache_level::one && buffer[i].Cache.Type == CacheUnified) { 232 | return buffer[i].Cache.Size; 233 | } 234 | } 235 | } 236 | return 0; 237 | 238 | #elif BNCH_SWT_PLATFORM_LINUX || BNCH_SWT_PLATFORM_ANDROID 239 | auto get_cache_size_from_file = [](const std::string& index) { 240 | const std::string cache_file_path = "/sys/devices/system/cpu/cpu0/cache/index" + index + "/size"; 241 | std::ifstream file(cache_file_path); 242 | if (!file.is_open()) { 243 | return static_cast(0); 244 | } 245 | 246 | std::string size_str; 247 | file >> size_str; 248 | file.close(); 249 | 250 | uint64_t size = 0; 251 | if (size_str.back() == 'K') { 252 | size = std::stoul(size_str) * 1024; 253 | } else if (size_str.back() == 'M') { 254 | size = std::stoul(size_str) * 1024 * 1024; 255 | } else { 256 | size = std::stoul(size_str); 257 | } 258 | return size; 259 | }; 260 | 261 | if (level == cache_level::one) { 262 | return get_cache_size_from_file("0"); 263 | } else { 264 | std::string index = (level == cache_level::two) ? 
"2" : "3"; 265 | return get_cache_size_from_file(index); 266 | } 267 | 268 | #elif BNCH_SWT_PLATFORM_MAC 269 | auto get_cache_size_for_mac = [](const std::string& cache_type) { 270 | uint64_t cache_size = 0; 271 | size_t size = sizeof(cache_size); 272 | std::string sysctl_query = "hw." + cache_type + "cachesize"; 273 | if (sysctlbyname(sysctl_query.c_str(), &cache_size, &size, nullptr, 0) != 0) { 274 | return uint64_t{ 0 }; 275 | } 276 | return cache_size; 277 | }; 278 | 279 | if (level == cache_level::one) { 280 | return get_cache_size_for_mac("l1d"); 281 | } else if (level == cache_level::two) { 282 | return get_cache_size_for_mac("l2"); 283 | } else { 284 | return get_cache_size_for_mac("l3"); 285 | } 286 | #endif 287 | 288 | return 0; 289 | } 290 | 291 | int32_t main() { 292 | const uint32_t thread_count = std::thread::hardware_concurrency(); 293 | const uint32_t supported_isa = detect_supported_architectures(); 294 | const uint64_t l1_cache_size = get_cache_size(cache_level::one); 295 | const uint64_t l2_cache_size = get_cache_size(cache_level::two); 296 | const uint64_t l3_cache_size = get_cache_size(cache_level::three); 297 | std::cout << "THREAD_COUNT=" << thread_count << std::endl; 298 | std::cout << "INSTRUCTION_SET=" << supported_isa << std::endl; 299 | std::cout << "HAS_AVX2=" << ((supported_isa & static_cast(instruction_set::AVX2)) ? 1 : 0) << std::endl; 300 | std::cout << "HAS_AVX512=" << ((supported_isa & static_cast(instruction_set::AVX512f)) ? 1 : 0) << std::endl; 301 | std::cout << "HAS_NEON=" << ((supported_isa & static_cast(instruction_set::NEON)) ? 1 : 0) << std::endl; 302 | std::cout << "HAS_SVE2=" << ((supported_isa & static_cast(instruction_set::SVE2)) ? 
1 : 0) << std::endl; 303 | std::cout << "L1_CACHE_SIZE=" << l1_cache_size << std::endl; 304 | std::cout << "L2_CACHE_SIZE=" << l2_cache_size << std::endl; 305 | std::cout << "L3_CACHE_SIZE=" << l3_cache_size << std::endl; 306 | std::cout << "CPU_SUCCESS=1" << std::endl; 307 | return 0; 308 | } 309 | #else 310 | int32_t main() { 311 | return -1; 312 | } 313 | #endif -------------------------------------------------------------------------------- /cmake/detection/benchmarksuite_detect_cpu_properties.cmake: -------------------------------------------------------------------------------- 1 | # MIT License 2 | # 3 | # Copyright (c) 2024 RealTimeChris 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy of this 6 | # software and associated documentation files (the "Software"), to deal in the Software 7 | # without restriction, including without limitation the rights to use, copy, modify, merge, 8 | # publish, distribute, sublicense, and/or sell copies of the Software, and to permit 9 | # persons to whom the Software is furnished to do so, subject to the following conditions: 10 | # 11 | # The above copyright notice and this permission notice shall be included in all copies or 12 | # substantial portions of the Software. 13 | # 14 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, 15 | # INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR 16 | # PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE 17 | # FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR 18 | # OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 19 | # DEALINGS IN THE SOFTWARE. 
20 | 21 | if(UNIX OR APPLE) 22 | file(WRITE ${CMAKE_CURRENT_SOURCE_DIR}/cmake/detection/build_feature_tester_cpu_properties.sh "#!/bin/bash\n" 23 | "\"${CMAKE_COMMAND}\" -S ./ -B ./Build-Cpu-Properties -DCMAKE_BUILD_TYPE=Release -DCMAKE_CXX_COMPILER=\"${CMAKE_CXX_COMPILER}\" -DBNCH_SWT_DETECT_CPU_PROPERTIES=TRUE\n" 24 | "\"${CMAKE_COMMAND}\" --build ./Build-Cpu-Properties --config=Release" 25 | ) 26 | 27 | execute_process( 28 | COMMAND chmod +x ${CMAKE_CURRENT_SOURCE_DIR}/cmake/detection/build_feature_tester_cpu_properties.sh 29 | RESULT_VARIABLE CHMOD_RESULT 30 | ) 31 | 32 | if(NOT CHMOD_RESULT EQUAL 0) 33 | message(FATAL_ERROR "Failed to set executable permissions for build_feature_tester_cpu_properties.sh") 34 | endif() 35 | 36 | execute_process( 37 | COMMAND ${CMAKE_CURRENT_SOURCE_DIR}/cmake/detection/build_feature_tester_cpu_properties.sh 38 | WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/cmake/detection 39 | ) 40 | 41 | set(FEATURE_TESTER_FILE ${CMAKE_CURRENT_SOURCE_DIR}/cmake/detection/Build-Cpu-Properties/feature_detector) 42 | 43 | elseif(WIN32) 44 | file(WRITE ${CMAKE_CURRENT_SOURCE_DIR}/cmake/detection/build_feature_tester_cpu_properties.bat 45 | "\"${CMAKE_COMMAND}\" -S ./ -B ./Build-Cpu-Properties -DBNCH_SWT_DETECT_CPU_PROPERTIES=TRUE\n" 46 | "\"${CMAKE_COMMAND}\" --build ./Build-Cpu-Properties --config=Release" 47 | ) 48 | 49 | execute_process( 50 | COMMAND ${CMAKE_CURRENT_SOURCE_DIR}/cmake/detection/build_feature_tester_cpu_properties.bat 51 | WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/cmake/detection 52 | ) 53 | 54 | set(FEATURE_TESTER_FILE ${CMAKE_CURRENT_SOURCE_DIR}/cmake/detection/Build-Cpu-Properties/Release/feature_detector.exe) 55 | endif() 56 | 57 | if(NOT DEFINED BNCH_SWT_THREAD_COUNT OR 58 | NOT DEFINED BNCH_SWT_CPU_L1_CACHE_SIZE OR 59 | NOT DEFINED BNCH_SWT_CPU_L2_CACHE_SIZE OR 60 | NOT DEFINED BNCH_SWT_CPU_L3_CACHE_SIZE OR 61 | NOT BNCH_SWT_DETECT_CPU_PROPERTIES) 62 | 63 | execute_process( 64 | COMMAND ${FEATURE_TESTER_FILE} 65 | 
RESULT_VARIABLE FEATURE_TESTER_EXIT_CODE 66 | OUTPUT_VARIABLE CPU_PROPERTIES_OUTPUT 67 | ERROR_VARIABLE FEATURE_TESTER_ERROR 68 | OUTPUT_STRIP_TRAILING_WHITESPACE 69 | ) 70 | endif() 71 | 72 | message(STATUS "CPU detector exit code: ${FEATURE_TESTER_EXIT_CODE}") 73 | message(STATUS "CPU detector output: '${CPU_PROPERTIES_OUTPUT}'") 74 | message(STATUS "CPU detector error: '${FEATURE_TESTER_ERROR}'") 75 | 76 | if(FEATURE_TESTER_EXIT_CODE EQUAL 0 AND CPU_PROPERTIES_OUTPUT MATCHES "CPU_SUCCESS=1") 77 | 78 | string(REGEX MATCH "THREAD_COUNT=([0-9]+)" _ ${CPU_PROPERTIES_OUTPUT}) 79 | if(NOT DEFINED BNCH_SWT_THREAD_COUNT) 80 | set(BNCH_SWT_THREAD_COUNT ${CMAKE_MATCH_1} CACHE STRING "CPU thread count" FORCE) 81 | endif() 82 | 83 | string(REGEX MATCH "INSTRUCTION_SET=([0-9]+)" _ ${CPU_PROPERTIES_OUTPUT}) 84 | if(NOT DEFINED BNCH_SWT_INSTRUCTION_SET) 85 | set(BNCH_SWT_INSTRUCTION_SET ${CMAKE_MATCH_1} CACHE STRING "CPU instruction set bitmask" FORCE) 86 | endif() 87 | 88 | string(REGEX MATCH "HAS_AVX2=([0-1]+)" _ ${CPU_PROPERTIES_OUTPUT}) 89 | if(NOT DEFINED BNCH_SWT_HAS_AVX2) 90 | set(BNCH_SWT_HAS_AVX2 ${CMAKE_MATCH_1} CACHE STRING "CPU has AVX2 support" FORCE) 91 | endif() 92 | 93 | string(REGEX MATCH "HAS_AVX512=([0-1]+)" _ ${CPU_PROPERTIES_OUTPUT}) 94 | if(NOT DEFINED BNCH_SWT_HAS_AVX512) 95 | set(BNCH_SWT_HAS_AVX512 ${CMAKE_MATCH_1} CACHE STRING "CPU has AVX512 support" FORCE) 96 | endif() 97 | 98 | string(REGEX MATCH "HAS_NEON=([0-1]+)" _ ${CPU_PROPERTIES_OUTPUT}) 99 | if(NOT DEFINED BNCH_SWT_HAS_NEON) 100 | set(BNCH_SWT_HAS_NEON ${CMAKE_MATCH_1} CACHE STRING "CPU has NEON support" FORCE) 101 | endif() 102 | 103 | string(REGEX MATCH "HAS_SVE2=([0-1]+)" _ ${CPU_PROPERTIES_OUTPUT}) 104 | if(NOT DEFINED BNCH_SWT_HAS_SVE2) 105 | set(BNCH_SWT_HAS_SVE2 ${CMAKE_MATCH_1} CACHE STRING "CPU has SVE2 support" FORCE) 106 | endif() 107 | 108 | string(REGEX MATCH "L1_CACHE_SIZE=([0-9]+)" _ ${CPU_PROPERTIES_OUTPUT}) 109 | if(NOT DEFINED BNCH_SWT_CPU_L1_CACHE_SIZE) 110 | 
set(BNCH_SWT_CPU_L1_CACHE_SIZE ${CMAKE_MATCH_1} CACHE STRING "CPU L1 cache size" FORCE) 111 | endif() 112 | 113 | string(REGEX MATCH "L2_CACHE_SIZE=([0-9]+)" _ ${CPU_PROPERTIES_OUTPUT}) 114 | if(NOT DEFINED BNCH_SWT_CPU_L2_CACHE_SIZE) 115 | set(BNCH_SWT_CPU_L2_CACHE_SIZE ${CMAKE_MATCH_1} CACHE STRING "CPU L2 cache size" FORCE) 116 | endif() 117 | 118 | string(REGEX MATCH "L3_CACHE_SIZE=([0-9]+)" _ ${CPU_PROPERTIES_OUTPUT}) 119 | if(NOT DEFINED BNCH_SWT_CPU_L3_CACHE_SIZE) 120 | set(BNCH_SWT_CPU_L3_CACHE_SIZE ${CMAKE_MATCH_1} CACHE STRING "CPU L3 cache size" FORCE) 121 | endif() 122 | 123 | if(NOT DEFINED BNCH_SWT_CPU_PROPERTIES_ERECTED) 124 | set(BNCH_SWT_CPU_PROPERTIES_ERECTED TRUE CACHE BOOL "CPU properties successfully detected" FORCE) 125 | endif() 126 | 127 | message(STATUS "CPU Properties detected successfully") 128 | 129 | else() 130 | message(WARNING "CPU feature detector failed, using reasonable default values for unset properties") 131 | 132 | if(NOT DEFINED BNCH_SWT_THREAD_COUNT) 133 | set(BNCH_SWT_THREAD_COUNT 4 CACHE STRING "CPU thread count (fallback)" FORCE) 134 | endif() 135 | 136 | if(NOT DEFINED BNCH_SWT_INSTRUCTION_SET) 137 | set(BNCH_SWT_INSTRUCTION_SET 0 CACHE STRING "CPU instruction set bitmask (fallback)" FORCE) 138 | endif() 139 | 140 | if(NOT DEFINED BNCH_SWT_HAS_AVX2) 141 | set(BNCH_SWT_HAS_AVX2 0 CACHE STRING "CPU has AVX2 support (fallback)" FORCE) 142 | endif() 143 | 144 | if(NOT DEFINED BNCH_SWT_HAS_AVX512) 145 | set(BNCH_SWT_HAS_AVX512 0 CACHE STRING "CPU has AVX512 support (fallback)" FORCE) 146 | endif() 147 | 148 | if(NOT DEFINED BNCH_SWT_HAS_NEON) 149 | set(BNCH_SWT_HAS_NEON 0 CACHE STRING "CPU has NEON support (fallback)" FORCE) 150 | endif() 151 | 152 | if(NOT DEFINED BNCH_SWT_HAS_SVE2) 153 | set(BNCH_SWT_HAS_SVE2 0 CACHE STRING "CPU has SVE2 support (fallback)" FORCE) 154 | endif() 155 | 156 | if(NOT DEFINED BNCH_SWT_CPU_L1_CACHE_SIZE) 157 | set(BNCH_SWT_CPU_L1_CACHE_SIZE 32768 CACHE STRING "CPU L1 cache size - 32KB (fallback)" 
FORCE) 158 | endif() 159 | 160 | if(NOT DEFINED BNCH_SWT_CPU_L2_CACHE_SIZE) 161 | set(BNCH_SWT_CPU_L2_CACHE_SIZE 262144 CACHE STRING "CPU L2 cache size - 256KB (fallback)" FORCE) 162 | endif() 163 | 164 | if(NOT DEFINED BNCH_SWT_CPU_L3_CACHE_SIZE) 165 | set(BNCH_SWT_CPU_L3_CACHE_SIZE 8388608 CACHE STRING "CPU L3 cache size - 8MB (fallback)" FORCE) 166 | endif() 167 | endif() 168 | 169 | if(NOT DEFINED BNCH_SWT_CPU_ARCH_INDEX) 170 | if(BNCH_SWT_HAS_AVX512) 171 | set(BNCH_SWT_CPU_ARCH_INDEX 2 CACHE STRING "CPU architecture index - AVX512" FORCE) 172 | set(BNCH_SWT_CPU_ALIGNMENT 64 CACHE STRING "CPU Alignment" FORCE) 173 | set(BNCH_SWT_SIMD_FLAGS $,,$,/arch:AVX512,-mavx512f;-mavx512bw;-mfma;-mavx2;-mavx;-mlzcnt;-mpopcnt;-mbmi;-mbmi2;-msse4.2;-mf16c>> CACHE STRING "SIMD flags" FORCE) 174 | set(BNCH_SWT_SIMD_DEFINITIONS BNCH_SWT_SVE2=0;BNCH_SWT_AVX512=1;BNCH_SWT_AVX2=0;BNCH_SWT_NEON=0 CACHE STRING "SIMD definitions" FORCE) 175 | set(BNCH_SWT_INSTRUCTION_SET_NAME AVX512 CACHE STRING "Instruction set name" FORCE) 176 | elseif(BNCH_SWT_HAS_AVX2) 177 | set(BNCH_SWT_CPU_ARCH_INDEX 1 CACHE STRING "CPU architecture index - AVX2" FORCE) 178 | set(BNCH_SWT_CPU_ALIGNMENT 32 CACHE STRING "CPU Alignment" FORCE) 179 | set(BNCH_SWT_SIMD_FLAGS $,,$,/arch:AVX2,-mavx2;-mfma;-mavx;-mlzcnt;-mpopcnt;-mbmi;-mbmi2;-msse4.2;-mf16c>> CACHE STRING "SIMD flags" FORCE) 180 | set(BNCH_SWT_SIMD_DEFINITIONS BNCH_SWT_SVE2=0;BNCH_SWT_AVX512=0;BNCH_SWT_AVX2=1;BNCH_SWT_NEON=0 CACHE STRING "SIMD definitions" FORCE) 181 | set(BNCH_SWT_INSTRUCTION_SET_NAME AVX2 CACHE STRING "Instruction set name" FORCE) 182 | elseif(BNCH_SWT_HAS_SVE2) 183 | set(BNCH_SWT_CPU_ARCH_INDEX 2 CACHE STRING "CPU architecture index - SVE2" FORCE) 184 | set(BNCH_SWT_CPU_ALIGNMENT 64 CACHE STRING "CPU Alignment" FORCE) 185 | set(BNCH_SWT_SIMD_FLAGS $,,$,,-march=armv8-a+sve;-msve-vector-bits=scalable;-march=armv8-a+sve+sve2>> CACHE STRING "SIMD flags" FORCE) 186 | set(BNCH_SWT_SIMD_DEFINITIONS 
BNCH_SWT_SVE2=1;BNCH_SWT_AVX512=0;BNCH_SWT_AVX2=0;BNCH_SWT_NEON=0 CACHE STRING "SIMD definitions" FORCE) 187 | set(BNCH_SWT_INSTRUCTION_SET_NAME SVE2 CACHE STRING "Instruction set name" FORCE) 188 | elseif(BNCH_SWT_HAS_NEON) 189 | set(BNCH_SWT_CPU_ARCH_INDEX 1 CACHE STRING "CPU architecture index - NEON" FORCE) 190 | set(BNCH_SWT_CPU_ALIGNMENT 16 CACHE STRING "CPU Alignment" FORCE) 191 | set(BNCH_SWT_SIMD_FLAGS $,,$,,-march=armv8-a>> CACHE STRING "SIMD flags" FORCE) 192 | set(BNCH_SWT_SIMD_DEFINITIONS BNCH_SWT_SVE2=0;BNCH_SWT_AVX512=0;BNCH_SWT_AVX2=0;BNCH_SWT_NEON=1 CACHE STRING "SIMD definitions" FORCE) 193 | set(BNCH_SWT_INSTRUCTION_SET_NAME NEON CACHE STRING "Instruction set name" FORCE) 194 | else() 195 | set(BNCH_SWT_CPU_ARCH_INDEX 0 CACHE STRING "CPU architecture index - fallback" FORCE) 196 | set(BNCH_SWT_CPU_ALIGNMENT 16 CACHE STRING "CPU Alignment" FORCE) 197 | set(BNCH_SWT_SIMD_FLAGS "" CACHE STRING "SIMD flags" FORCE) 198 | set(BNCH_SWT_SIMD_DEFINITIONS BNCH_SWT_SVE2=0;BNCH_SWT_AVX512=0;BNCH_SWT_AVX2=0;BNCH_SWT_NEON=0 CACHE STRING "SIMD definitions" FORCE) 199 | set(BNCH_SWT_INSTRUCTION_SET_NAME NONE CACHE STRING "Instruction set name" FORCE) 200 | endif() 201 | endif() 202 | 203 | message(STATUS "CPU Configuration: ${BNCH_SWT_THREAD_COUNT} threads, L1: ${BNCH_SWT_CPU_L1_CACHE_SIZE}B, arch index: ${BNCH_SWT_CPU_ARCH_INDEX}") 204 | 205 | configure_file( 206 | ${CMAKE_CURRENT_SOURCE_DIR}/cmake/detection/benchmarksuite_cpu_properties.hpp.in 207 | ${CMAKE_CURRENT_SOURCE_DIR}/include/bnch_swt/benchmarksuite_cpu_properties.hpp 208 | @ONLY 209 | ) -------------------------------------------------------------------------------- /include/bnch_swt/concepts.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | MIT License 3 | 4 | Copyright (c) 2024 RealTimeChris 5 | 6 | Permission is hereby granted, free of charge, to any person obtaining a copy of this 7 | software and associated documentation files (the "Software"), to 
deal in the Software 8 | without restriction, including without limitation the rights to use, copy, modify, merge, 9 | publish, distribute, sublicense, and/or sell copies of the Software, and to permit 10 | persons to whom the Software is furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all copies or 13 | substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, 16 | INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR 17 | PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE 18 | FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR 19 | OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 20 | DEALINGS IN THE SOFTWARE. 21 | */ 22 | /// https://github.com/RealTimeChris/benchmarksuite 23 | #pragma once 24 | 25 | #include 26 | #include 27 | #include 28 | #include 29 | #include 30 | #include 31 | #include 32 | #include 33 | 34 | namespace bnch_swt { 35 | 36 | namespace internal { 37 | 38 | template 39 | concept has_range = requires(std::remove_cvref_t value) { 40 | { value.begin() }; 41 | { value.end() }; 42 | }; 43 | 44 | template 45 | concept map_subscriptable = requires(std::remove_cvref_t value) { 46 | { value[typename std::remove_cvref_t::key_type{}] } -> std::same_as::mapped_type&>; 47 | } || requires(std::remove_cvref_t value) { 48 | { value[typename std::remove_cvref_t::key_type{}] } -> std::same_as::mapped_type&>; 49 | }; 50 | 51 | template 52 | concept vector_subscriptable = requires(std::remove_cvref_t value) { 53 | { value[typename std::remove_cvref_t::size_type{}] } -> std::same_as::const_reference>; 54 | } || requires(std::remove_cvref_t value) { 55 | { value[typename std::remove_cvref_t::size_type{}] } -> std::same_as::reference>; 56 | }; 57 | 58 | 
template 59 | concept has_size = requires(std::remove_cvref_t value) { 60 | { value.size() } -> std::same_as::size_type>; 61 | }; 62 | 63 | template 64 | concept has_empty = requires(std::remove_cvref_t value) { 65 | { value.empty() } -> std::same_as; 66 | }; 67 | 68 | template 69 | concept variant_t = requires(std::remove_cvref_t var) { 70 | { var.index() } -> std::same_as; 71 | { var.valueless_by_exception() } -> std::same_as; 72 | { std::holds_alternative(var))>(var) } -> std::same_as; 73 | { std::get<0>(var) } -> std::same_as(var))&>; 74 | { std::get_if<0>(&var) } -> std::same_as(var))>*>; 75 | }; 76 | 77 | template 78 | concept has_resize = requires(std::remove_cvref_t value) { value.resize(typename std::remove_cvref_t::size_type{}); }; 79 | 80 | template 81 | concept has_reserve = requires(std::remove_cvref_t value) { value.reserve(typename std::remove_cvref_t::size_type{}); }; 82 | 83 | template 84 | concept has_data = requires(std::remove_cvref_t value) { 85 | { value.data() } -> std::same_as::const_pointer>; 86 | } || requires(std::remove_cvref_t value) { 87 | { value.data() } -> std::same_as::pointer>; 88 | }; 89 | 90 | template 91 | concept stateless = std::is_empty_v>; 92 | 93 | template 94 | concept bool_t = std::same_as, bool> || std::same_as, std::vector::reference> || 95 | std::same_as, std::vector::const_reference>; 96 | 97 | template 98 | concept always_null_t = std::same_as, std::nullptr_t> || std::same_as, std::monostate> || 99 | std::same_as, std::nullopt_t>; 100 | 101 | template 102 | concept pointer_t = (std::is_pointer_v> || 103 | ( std::is_null_pointer_v> && !std::is_array_v> )) && 104 | !always_null_t; 105 | 106 | template 107 | concept floating_point_t = std::floating_point>; 108 | 109 | template 110 | concept char_t = std::same_as, char>; 111 | 112 | template 113 | concept has_substr = requires(std::remove_cvref_t value) { 114 | { 115 | value.substr(typename std::remove_cvref_t::size_type{}, typename std::remove_cvref_t::size_type{}) 116 
| } -> std::same_as>; 117 | }; 118 | 119 | template 120 | concept has_find = requires(std::remove_cvref_t value) { 121 | { value.find(typename std::remove_cvref_t::value_type{}) } -> std::same_as::size_type>; 122 | } || requires(std::remove_cvref_t value) { 123 | { value.find(typename std::remove_cvref_t::key_type{}) } -> std::same_as::iterator>; 124 | } || requires(std::remove_cvref_t value) { 125 | { value.find(typename std::remove_cvref_t::key_type{}) } -> std::same_as::const_iterator>; 126 | }; 127 | 128 | template 129 | concept string_t = has_substr && has_data && has_size && vector_subscriptable && has_find; 130 | 131 | template 132 | concept string_view_t = 133 | has_substr && has_data && has_size && vector_subscriptable && has_find && !has_resize; 134 | 135 | template 136 | concept map_t = map_subscriptable && has_range && has_size && has_find && has_empty; 137 | 138 | template 139 | concept pair_t = requires(std::remove_cvref_t value) { 140 | typename std::remove_cvref_t::first_type; 141 | typename std::remove_cvref_t::second_type; 142 | }; 143 | 144 | template 145 | concept has_fill = requires(std::remove_cvref_t value) { 146 | { value.fill(typename std::remove_cvref_t::value_type{}) } -> std::same_as; 147 | }; 148 | 149 | template 150 | concept has_emplace_back = requires(std::remove_cvref_t value) { 151 | { value.emplace_back(typename std::remove_cvref_t::value_type{}) } -> std::same_as::reference>; 152 | }; 153 | 154 | template 155 | concept has_release = requires(std::remove_cvref_t value) { 156 | { value.release() } -> std::same_as::pointer>; 157 | }; 158 | 159 | template 160 | concept has_reset = requires(std::remove_cvref_t value) { 161 | { value.reset() } -> std::same_as; 162 | }; 163 | 164 | template 165 | concept has_get = requires(std::remove_cvref_t value) { 166 | { value.get() } -> std::same_as::element_type*>; 167 | }; 168 | 169 | template 170 | concept copyable = std::copyable>; 171 | 172 | template 173 | concept unique_ptr_t = 
requires(std::remove_cvref_t value) { 174 | typename std::remove_cvref_t::element_type; 175 | typename std::remove_cvref_t::deleter_type; 176 | } && has_release && has_get; 177 | 178 | template 179 | concept shared_ptr_t = has_reset && has_get && copyable; 180 | 181 | template 182 | concept nullable_t = !string_t && requires(std::remove_cvref_t value) { 183 | bool(value); 184 | { *value }; 185 | }; 186 | 187 | template 188 | concept null_t = nullable_t || always_null_t; 189 | 190 | template constexpr bool has_size_equal_to_zero{ std::tuple_size_v> == 0 }; 191 | 192 | template 193 | concept has_get_template = requires(std::remove_cvref_t value) { 194 | { std::get<0>(value) } -> std::same_as(value))&>; 195 | }; 196 | 197 | template 198 | concept tuple_t = requires(std::remove_cvref_t t) { std::tuple_size>::value; } && 199 | (has_size_equal_to_zero || has_get_template) && !has_data; 200 | 201 | template 202 | concept optional_t = requires(std::remove_cvref_t opt) { 203 | { opt.has_value() } -> std::same_as; 204 | { opt.value() } -> std::same_as::value_type&>; 205 | { *opt } -> std::same_as::value_type&>; 206 | { opt.reset() } -> std::same_as; 207 | { opt.emplace(typename std::remove_cvref_t::value_type{}) } -> std::same_as::value_type&>; 208 | }; 209 | 210 | template 211 | concept enum_t = std::is_enum_v>; 212 | 213 | template 214 | concept vector_t = vector_subscriptable && !string_t; 215 | 216 | template 217 | concept raw_array_t = ( std::is_array_v> && !std::is_pointer_v> ) || 218 | (vector_subscriptable && !vector_t && !has_substr && !tuple_t); 219 | 220 | template 221 | concept integer_t = std::integral> && !bool_t && !std::floating_point>; 222 | 223 | template 224 | concept printable = requires(std::remove_cvref_t value) { std::cout << value << std::endl; }; 225 | 226 | template 227 | concept not_printable = !printable; 228 | 229 | } 230 | 231 | } 232 | -------------------------------------------------------------------------------- 
/include/bnch_swt/metrics.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | MIT License 3 | 4 | Copyright (c) 2024 RealTimeChris 5 | 6 | Permission is hereby granted, free of charge, to any person obtaining a copy of this 7 | software and associated documentation files (the "Software"), to deal in the Software 8 | without restriction, including without limitation the rights to use, copy, modify, merge, 9 | publish, distribute, sublicense, and/or sell copies of the Software, and to permit 10 | persons to whom the Software is furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all copies or 13 | substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, 16 | INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR 17 | PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE 18 | FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR 19 | OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 20 | DEALINGS IN THE SOFTWARE. 
21 | */ 22 | /// https://github.com/RealTimeChris/benchmarksuite 23 | #pragma once 24 | 25 | #include 26 | #include 27 | #include 28 | 29 | namespace bnch_swt { 30 | 31 | BNCH_SWT_HOST double calculate_throughput_mbps(double nanoseconds, double bytes_processed) { 32 | constexpr double bytes_per_mb = 1024.0 * 1024.0; 33 | constexpr double nanos_per_second = 1e9; 34 | double megabytes = bytes_processed / bytes_per_mb; 35 | double seconds = nanoseconds / nanos_per_second; 36 | if (seconds == 0.0) { 37 | return 0.0; 38 | } 39 | return megabytes / seconds; 40 | } 41 | 42 | BNCH_SWT_HOST double calculate_units_ps(double nanoseconds, double bytes_processed) { 43 | return (bytes_processed * 1000000000.0) / nanoseconds; 44 | } 45 | 46 | template struct performance_metrics; 47 | 48 | template<> struct performance_metrics { 49 | double throughput_percentage_deviation{ std::numeric_limits::max() }; 50 | std::optional cache_references_per_execution{}; 51 | std::optional instructions_per_execution{}; 52 | std::optional branch_misses_per_execution{}; 53 | std::optional cache_misses_per_execution{}; 54 | std::optional instructions_per_cycle{}; 55 | std::optional branches_per_execution{}; 56 | std::optional instructions_per_byte{}; 57 | std::optional cycles_per_execution{}; 58 | std::optional cycles_per_byte{}; 59 | std::optional frequency_ghz{}; 60 | 61 | uint64_t measured_iteration_count{}; 62 | uint64_t iterations_to_stabilize{}; 63 | uint64_t total_iteration_count{}; 64 | double throughput_mb_per_sec{}; 65 | uint64_t bytes_processed{}; 66 | std::string name{}; 67 | double time_in_ns{}; 68 | 69 | BNCH_SWT_HOST bool operator>(const performance_metrics& other) const { 70 | return throughput_mb_per_sec > other.throughput_mb_per_sec; 71 | } 72 | 73 | template BNCH_SWT_HOST static performance_metrics collect_metrics( 74 | std::span>&& events_newer, uint64_t iterations_to_stabilize, uint64_t total_iteration_count) { 75 | static constexpr string_literal benchmark_name{ 
benchmark_name_new }; 76 | 77 | if (events_newer.empty()) { 78 | return {}; 79 | } 80 | 81 | performance_metrics metrics{}; 82 | metrics.name = benchmark_name.operator std::string(); 83 | metrics.measured_iteration_count = events_newer.size(); 84 | metrics.total_iteration_count = total_iteration_count; 85 | metrics.iterations_to_stabilize = iterations_to_stabilize; 86 | 87 | double throughput_total{}; 88 | double throughput_min{ std::numeric_limits::max() }; 89 | uint64_t valid_throughput_count{ 0 }; 90 | uint64_t bytes_processed_total{}; 91 | double ns_total{}; 92 | double cycles_total{}; 93 | double instructions_total{}; 94 | double branches_total{}; 95 | double branch_misses_total{}; 96 | double cache_references_total{}; 97 | double cache_misses_total{}; 98 | 99 | for (const auto& e: events_newer) { 100 | double ns{}; 101 | if (e.elapsed_ns(ns)) { 102 | ns_total += ns; 103 | uint64_t bytes_processed{}; 104 | if (e.bytes_processed(bytes_processed)) { 105 | bytes_processed_total += bytes_processed; 106 | 107 | double throughput{}; 108 | if constexpr (mbps) { 109 | throughput = calculate_throughput_mbps(ns, static_cast(bytes_processed)); 110 | } else { 111 | throughput = calculate_units_ps(ns, static_cast(bytes_processed)); 112 | } 113 | 114 | if (throughput > 0.0) { 115 | throughput_total += throughput; 116 | throughput_min = std::min(throughput, throughput_min); 117 | ++valid_throughput_count; 118 | } 119 | } 120 | 121 | double value{}; 122 | if (e.cycles(value)) { 123 | cycles_total += value; 124 | } 125 | if (e.instructions(value)) { 126 | instructions_total += value; 127 | } 128 | if (e.branches(value)) { 129 | branches_total += value; 130 | } 131 | if (e.branch_misses(value)) { 132 | branch_misses_total += value; 133 | } 134 | if (e.cache_references(value)) { 135 | cache_references_total += value; 136 | } 137 | if (e.cache_misses(value)) { 138 | cache_misses_total += value; 139 | } 140 | } 141 | } 142 | 143 | const double inv_size = 1.0 / 
static_cast(events_newer.size()); 144 | const uint64_t bytes_processed_avg = bytes_processed_total / events_newer.size(); 145 | const double ns_avg = ns_total * inv_size; 146 | const double cycles_avg = cycles_total * inv_size; 147 | const double instructions_avg = instructions_total * inv_size; 148 | const double branches_avg = branches_total * inv_size; 149 | const double branch_misses_avg = branch_misses_total * inv_size; 150 | const double cache_references_avg = cache_references_total * inv_size; 151 | const double cache_misses_avg = cache_misses_total * inv_size; 152 | 153 | metrics.time_in_ns = ns_avg; 154 | 155 | constexpr double epsilon = 1e-6; 156 | 157 | const double throughput_avg = valid_throughput_count > 0 ? throughput_total / static_cast(valid_throughput_count) : 0.0; 158 | if (valid_throughput_count > 0 && throughput_avg > epsilon) { 159 | metrics.bytes_processed = bytes_processed_avg; 160 | metrics.throughput_mb_per_sec = throughput_avg; 161 | metrics.throughput_percentage_deviation = ((throughput_avg - throughput_min) * 100.0) / throughput_avg; 162 | } 163 | 164 | if (std::abs(cycles_avg) > epsilon) { 165 | if (bytes_processed_avg > 0) { 166 | metrics.cycles_per_byte.emplace(cycles_avg / static_cast(bytes_processed_avg)); 167 | } 168 | metrics.cycles_per_execution.emplace(cycles_avg); 169 | metrics.frequency_ghz.emplace(cycles_avg / ns_avg); 170 | } 171 | 172 | if (std::abs(instructions_avg) > epsilon) { 173 | if (bytes_processed_avg > 0) { 174 | metrics.instructions_per_byte.emplace(instructions_avg / static_cast(bytes_processed_avg)); 175 | } 176 | if (std::abs(cycles_avg) > epsilon) { 177 | metrics.instructions_per_cycle.emplace(instructions_avg / cycles_avg); 178 | } 179 | metrics.instructions_per_execution.emplace(instructions_avg); 180 | } 181 | 182 | if (std::abs(branches_avg) > epsilon) { 183 | metrics.branches_per_execution.emplace(branches_avg); 184 | metrics.branch_misses_per_execution.emplace(branch_misses_avg); 185 | } 186 | 187 | if 
(std::abs(cache_misses_avg) > epsilon) { 188 | metrics.cache_misses_per_execution.emplace(cache_misses_avg); 189 | } 190 | 191 | if (std::abs(cache_references_avg) > epsilon) { 192 | metrics.cache_references_per_execution.emplace(cache_references_avg); 193 | } 194 | 195 | return metrics; 196 | } 197 | }; 198 | 199 | template<> struct performance_metrics { 200 | double throughput_percentage_deviation{ std::numeric_limits::max() }; 201 | std::optional cycles_per_execution{}; 202 | std::optional cuda_event_ms_avg{}; 203 | std::optional cycles_per_byte{}; 204 | 205 | uint64_t measured_iteration_count{}; 206 | uint64_t iterations_to_stabilize{}; 207 | uint64_t total_iteration_count{}; 208 | double throughput_mb_per_sec{}; 209 | uint64_t bytes_processed{}; 210 | std::string name{}; 211 | double time_in_ns{}; 212 | 213 | BNCH_SWT_HOST bool operator>(const performance_metrics& other) const { 214 | return throughput_mb_per_sec > other.throughput_mb_per_sec; 215 | } 216 | 217 | BNCH_SWT_HOST bool operator<(const performance_metrics& other) const { 218 | return throughput_mb_per_sec < other.throughput_mb_per_sec; 219 | } 220 | 221 | template BNCH_SWT_HOST static performance_metrics collect_metrics( 222 | std::span>&& events_newer, uint64_t iterations_to_stabilize, uint64_t total_iteration_count) { 223 | static constexpr string_literal benchmark_name{ benchmark_name_new }; 224 | performance_metrics metrics{}; 225 | metrics.name = benchmark_name.operator std::string(); 226 | metrics.measured_iteration_count = events_newer.size(); 227 | metrics.total_iteration_count = total_iteration_count; 228 | metrics.iterations_to_stabilize = iterations_to_stabilize; 229 | double throughput{}; 230 | double throughput_total{}; 231 | double throughput_avg{}; 232 | double throughput_min{ std::numeric_limits::max() }; 233 | uint64_t bytes_processed{}; 234 | uint64_t bytes_processed_total{}; 235 | uint64_t bytes_processed_avg{}; 236 | double ns{}; 237 | double ns_total{}; 238 | double ns_avg{}; 
239 | double ms{}; 240 | double ms_total{}; 241 | double ms_avg{}; 242 | double cycles{}; 243 | double cycles_total{}; 244 | double cycles_avg{}; 245 | for (const internal::event_count& e: events_newer) { 246 | if (e.elapsed_ns(ns)) { 247 | ns_total += ns; 248 | e.cuda_event_ms(ms); 249 | ms_total += ms; 250 | 251 | if (e.bytes_processed(bytes_processed)) { 252 | bytes_processed_total += bytes_processed; 253 | if constexpr (mbps) { 254 | throughput = calculate_throughput_mbps(ns, static_cast(bytes_processed)); 255 | } else { 256 | throughput = calculate_units_ps(ns, static_cast(bytes_processed)); 257 | } 258 | throughput_total += throughput; 259 | throughput_min = throughput < throughput_min ? throughput : throughput_min; 260 | } 261 | 262 | if (e.cycles(cycles)) { 263 | cycles_total += cycles; 264 | } 265 | } 266 | } 267 | if (events_newer.size() > 0) { 268 | bytes_processed_avg = bytes_processed_total / events_newer.size(); 269 | ns_avg = ns_total / static_cast(events_newer.size()); 270 | ms_avg = ms_total / static_cast(events_newer.size()); 271 | throughput_avg = throughput_total / static_cast(events_newer.size()); 272 | cycles_avg = cycles_total / static_cast(events_newer.size()); 273 | metrics.time_in_ns = ns_avg; 274 | } else { 275 | return {}; 276 | } 277 | 278 | constexpr double epsilon = 1e-6; 279 | if (std::abs(ns_avg) > epsilon) { 280 | metrics.bytes_processed = bytes_processed_avg; 281 | metrics.throughput_mb_per_sec = throughput_avg; 282 | metrics.throughput_percentage_deviation = ((throughput_avg - throughput_min) * 100.0) / throughput_avg; 283 | metrics.cuda_event_ms_avg.emplace(ms_avg); 284 | } 285 | if (std::abs(cycles_avg) > epsilon) { 286 | if (metrics.bytes_processed > 0) { 287 | metrics.cycles_per_byte.emplace(cycles_avg / static_cast(metrics.bytes_processed)); 288 | } 289 | metrics.cycles_per_execution.emplace(cycles_total / static_cast(events_newer.size())); 290 | } 291 | return metrics; 292 | } 293 | }; 294 | } 295 | 
-------------------------------------------------------------------------------- /include/bnch_swt/index.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | MIT License 3 | 4 | Copyright (c) 2024 RealTimeChris 5 | 6 | Permission is hereby granted, free of charge, to any person obtaining a copy of this 7 | software and associated documentation files (the "Software"), to deal in the Software 8 | without restriction, including without limitation the rights to use, copy, modify, merge, 9 | publish, distribute, sublicense, and/or sell copies of the Software, and to permit 10 | persons to whom the Software is furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all copies or 13 | substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, 16 | INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR 17 | PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE 18 | FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR 19 | OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 20 | DEALINGS IN THE SOFTWARE. 
21 | */ 22 | /// https://github.com/RealTimeChris/benchmarksuite 23 | 24 | #pragma once 25 | 26 | #include 27 | #include 28 | #include 29 | #include 30 | #include 31 | #include 32 | #include 33 | #include 34 | #include 35 | #include 36 | #include 37 | #include 38 | 39 | namespace bnch_swt { 40 | 41 | template struct result_printer; 42 | 43 | /* print_metric: writes the label left-aligned in a 60-column field followed by the value; an optional_t argument that holds no value prints nothing at all. */ template BNCH_SWT_HOST static auto print_metric(std::string_view label, const value_type& value_new) { 44 | static constexpr uint64_t LABEL_WIDTH = 60; 45 | 46 | if constexpr (internal::optional_t) { 47 | if (value_new.has_value()) { 48 | std::cout << std::left << std::setw(LABEL_WIDTH) << label << ": "; 49 | std::cout << value_new.value(); 50 | std::cout << std::endl; 51 | } 52 | } else { 53 | std::cout << std::left << std::setw(LABEL_WIDTH) << label << ": "; 54 | std::cout << value_new; 55 | std::cout << std::endl; 56 | } 57 | } 58 | 59 | /* result_printer specialization that prints the CPU banner: the row labels are built at compile time from metric_name_new (falling back to the MB/s and Byte wording when it is empty), one table is printed per result, then the pairwise comparison. */ template struct result_printer { 60 | BNCH_SWT_HOST static void impl(const std::vector>& results_new, bool show_comparison = true, bool show_metrics = true) { 61 | std::cout << "CPU Performance Metrics for: " << stage_name_new << std::endl; 62 | 63 | if (show_metrics) { 64 | constexpr string_literal throughput_label = []() { 65 | if constexpr (metric_name_new.size() > 0) { 66 | string_literal throughput_label_new{ "Throughput (" + internal::string_literal_from_view(metric_name_new) + "/s)" }; 67 | return throughput_label_new; 68 | } else { 69 | return string_literal{ "Throughput (MB/s)" }; 70 | } 71 | }(); 72 | 73 | constexpr string_literal metric_label = []() { 74 | if constexpr (metric_name_new.size() > 0) { 75 | string_literal metric_label_new{ string_literal{ internal::string_literal_from_view(metric_name_new) + "s Processed" } }; 76 | return metric_label_new; 77 | } else { 78 | return string_literal{ "Bytes Processed" }; 79 | } 80 | }(); 81 | 82 | constexpr string_literal cycle_label = []() { 83 | if constexpr (metric_name_new.size() > 0) { 84 | return string_literal{ "Cycles per " +
internal::string_literal_from_view(metric_name_new) } ; 85 | } else { 86 | return string_literal{ "Cycles per Byte" }; 87 | } 88 | }(); 89 | 90 | constexpr string_literal instruction_label = []() { 91 | if constexpr (metric_name_new.size() > 0) { 92 | return string_literal{ "Instructions per " + internal::string_literal_from_view(metric_name_new) } ; 93 | } else { 94 | return string_literal{ "Instructions per Byte" }; 95 | } 96 | }(); 97 | 98 | /* One table per result. std::fixed and setprecision(2) are set on std::cout here and are not restored afterwards. */ for (const auto& value: results_new) { 99 | std::cout << "Metrics for: " << value.name << std::endl; 100 | std::cout << std::fixed << std::setprecision(2); 101 | 102 | print_metric("Total Iterations", value.total_iteration_count); 103 | print_metric("Total Iterations to Stabilize", value.iterations_to_stabilize); 104 | print_metric("Measured Iterations", value.measured_iteration_count); 105 | print_metric(metric_label, value.bytes_processed); 106 | 107 | print_metric("Nanoseconds per Execution", value.time_in_ns); 108 | print_metric("Frequency (GHz)", value.frequency_ghz); 109 | 110 | print_metric(throughput_label, value.throughput_mb_per_sec); 111 | 112 | print_metric("Throughput Percentage Deviation (+/-%)", value.throughput_percentage_deviation); 113 | print_metric("Cycles per Execution", value.cycles_per_execution); 114 | print_metric(cycle_label, value.cycles_per_byte); 115 | 116 | print_metric("Instructions per Execution", value.instructions_per_execution); 117 | print_metric("Instructions per Cycle", value.instructions_per_cycle); 118 | print_metric(instruction_label, value.instructions_per_byte); 119 | 120 | print_metric("Branches per Execution", value.branches_per_execution); 121 | print_metric("Branch Misses per Execution", value.branch_misses_per_execution); 122 | print_metric("Cache References per Execution", value.cache_references_per_execution); 123 | print_metric("Cache Misses per Execution", value.cache_misses_per_execution); 124 | 125 | std::cout << "----------------------------------------" << std::endl;
126 | } 127 | } 128 | 129 | /* Compares each entry with the next one, in the order the vector was passed in. NOTE(review): the divisor is the next entry's throughput_mb_per_sec; when that is 0 the printed percentage is inf or NaN - confirm whether such results can reach this point. */ if (show_comparison && results_new.size() > 1) { 130 | for (uint64_t x = 0; x < results_new.size() - 1; ++x) { 131 | double difference = ((results_new[x].throughput_mb_per_sec - results_new[x + 1].throughput_mb_per_sec) / results_new[x + 1].throughput_mb_per_sec) * 100.0; 132 | 133 | std::cout << "Library " << results_new[x].name << " is faster than library " << results_new[x + 1].name << " by " << difference << "%." << std::endl; 134 | } 135 | } 136 | } 137 | }; 138 | 139 | /* result_printer specialization that prints the GPU banner: same layout as the one above, without the instruction, branch and cache rows, and with an extra milliseconds row. */ template struct result_printer { 140 | BNCH_SWT_HOST static void impl(const std::vector>& results_new, bool show_comparison = true, bool show_metrics = true) { 141 | std::cout << "GPU Performance Metrics for: " << stage_name_new << std::endl; 142 | 143 | if (show_metrics) { 144 | constexpr string_literal throughput_label = []() { 145 | if constexpr (metric_name_new.size() > 0) { 146 | string_literal throughput_label_new{ "Throughput (" + internal::string_literal_from_view(metric_name_new) + "/s)" }; 147 | return throughput_label_new; 148 | } else { 149 | return string_literal{ "Throughput (MB/s)" }; 150 | } 151 | }(); 152 | 153 | constexpr string_literal metric_label = []() { 154 | if constexpr (metric_name_new.size() > 0) { 155 | string_literal metric_label_new{ string_literal{ internal::string_literal_from_view(metric_name_new) + "s Processed" } }; 156 | return metric_label_new; 157 | } else { 158 | return string_literal{ "Bytes Processed" }; 159 | } 160 | }(); 161 | 162 | constexpr string_literal cycle_label = []() { 163 | if constexpr (metric_name_new.size() > 0) { 164 | return string_literal{ "GPU Cycles per " + internal::string_literal_from_view(metric_name_new) }; 165 | } else { 166 | return string_literal{ "GPU Cycles per Byte" }; 167 | } 168 | }(); 169 | 170 | for (const auto& value: results_new) { 171 | std::cout << "Metrics for: " << value.name << std::endl; 172 | std::cout << std::fixed << std::setprecision(2); 173 | print_metric("Total Iterations",
value.total_iteration_count); 174 | print_metric("Total Iterations to Stabilize", value.iterations_to_stabilize); 175 | print_metric("Measured Iterations", value.measured_iteration_count); 176 | print_metric(metric_label, value.bytes_processed); 177 | 178 | print_metric("Milliseconds per Execution", value.cuda_event_ms_avg); 179 | print_metric("Nanoseconds per Execution", value.time_in_ns); 180 | 181 | print_metric(throughput_label, value.throughput_mb_per_sec); 182 | 183 | print_metric("Throughput Percentage Deviation (+/-%)", value.throughput_percentage_deviation); 184 | print_metric("Cycles per Execution", value.cycles_per_execution); 185 | print_metric(cycle_label, value.cycles_per_byte); 186 | 187 | std::cout << "(CPU metrics like instructions/branches/cache are not available on GPU)" << std::endl; 188 | 189 | std::cout << "----------------------------------------" << std::endl; 190 | } 191 | } 192 | 193 | /* Same pairwise comparison as in the CPU printer, with the same unguarded division by the next entry's throughput (NOTE(review): inf or NaN when that value is 0). */ if (show_comparison && results_new.size() > 1) { 194 | for (uint64_t x = 0; x < results_new.size() - 1; ++x) { 195 | double difference = ((results_new[x].throughput_mb_per_sec - results_new[x + 1].throughput_mb_per_sec) / results_new[x + 1].throughput_mb_per_sec) * 100.0; 196 | 197 | std::cout << "Kernel " << results_new[x].name << " is faster than kernel " << results_new[x + 1].name << " by " << difference << "%."
<< std::endl; 198 | } 199 | } 200 | } 201 | }; 202 | 203 | /* benchmark_stage: runs benchmark subjects and keeps one performance_metrics per subject name. NOTE(review): the template parameter list of this struct, including its default arguments, is missing from this listing. */ template{}> 205 | struct benchmark_stage { 206 | static_assert(max_execution_count % measured_iteration_count == 0, "Sorry, but please enter a max_execution_count that is divisible by measured_iteration_count."); 207 | 208 | /* Storage for the results: one map per thread (static thread_local), indexed with the subject name. */ BNCH_SWT_HOST static auto& get_results_internal() { 209 | static thread_local std::unordered_map> results{}; 210 | return results; 211 | } 212 | 213 | /* NOTE(review): despite its name this flag is true when metric_name_new is EMPTY. */ static constexpr bool use_non_mbps_metric{ metric_name_new.size() == 0 }; 214 | 215 | /* Copies the stored results into a vector, sorts it with std::greater (so by operator> of performance_metrics, highest throughput first) and hands it to result_printer; prints nothing when no result is stored. */ BNCH_SWT_HOST static void print_results(bool show_comparison = true, bool show_metrics = true) { 216 | std::vector> results_new{}; 217 | for (const auto& [key, value]: get_results_internal()) { 218 | results_new.emplace_back(value); 219 | } 220 | if (results_new.size() > 0) { 221 | std::sort(results_new.begin(), results_new.end(), std::greater>{}); 222 | static constexpr std::string_view stage_name_newer{ stage_name_new.operator std::string_view() }; 223 | static constexpr std::string_view metric_name_newer{ metric_name_new.operator std::string_view() }; 224 | result_printer::impl(results_new, show_comparison, show_metrics); 225 | } 226 | } 227 | 228 | /* Returns a sorted copy of the stored results (same ordering as print_results) without printing. */ BNCH_SWT_HOST static auto get_results() { 229 | std::vector> results_new{}; 230 | for (const auto& [key, value]: get_results_internal()) { 231 | results_new.emplace_back(value); 232 | } 233 | if (results_new.size() > 0) { 234 | std::sort(results_new.begin(), results_new.end(), std::greater>{}); 235 | } 236 | return results_new; 237 | } 238 | 239 | /* run_benchmark: executes the subject, selects the most stable measurement window, stores it under the subject name and returns the stored value. */ template 240 | BNCH_SWT_HOST static performance_metrics run_benchmark(arg_types&&...
args) { 241 | static constexpr string_literal subject_name{ subject_name_new }; 242 | if constexpr (benchmark_type == benchmark_types::cpu) { 243 | static_assert(std::convertible_to, uint64_t>, 244 | "Sorry, but the lambda passed to run_benchmark() must return a uint64_t, reflecting the number of bytes processed!"); 245 | } 246 | internal::event_collector events{}; 247 | internal::cache_clearer cache_clearer{}; 248 | performance_metrics lowest_results{}; 249 | performance_metrics results_temp{}; 250 | uint64_t current_global_index{ measured_iteration_count }; 251 | /* Phase 1: call events.run max_execution_count times, evicting the caches before each call when that option is on and the benchmark type is cpu. */ for (uint64_t x = 0; x < max_execution_count; ++x) { 252 | if constexpr (clear_cpu_cache_between_each_iteration && benchmark_type == benchmark_types::cpu) { 253 | cache_clearer.evict_caches(); 254 | } 255 | events.template run(std::forward(args)...); 256 | } 257 | std::span> new_ptr{ static_cast>&>(events) }; 258 | static constexpr uint64_t final_measured_iteration_count{ max_execution_count - measured_iteration_count > 0 ? max_execution_count - measured_iteration_count : 1 }; 259 | /* Phase 2: slide a window of measured_iteration_count events over the recording and keep the window whose throughput_percentage_deviation is smallest. NOTE(review): when max_execution_count is larger than measured_iteration_count the start index stops one short of max_execution_count - measured_iteration_count, so the last full window is never examined - confirm that this is intended. */ for (uint64_t x = 0; x < final_measured_iteration_count; ++x, ++current_global_index) { 260 | results_temp = performance_metrics::template collect_metrics(new_ptr.subspan(x, measured_iteration_count), 261 | current_global_index - 1, max_execution_count); 262 | lowest_results = results_temp.throughput_percentage_deviation < lowest_results.throughput_percentage_deviation ? results_temp : lowest_results; 263 | } 264 | get_results_internal()[subject_name.operator std::string_view()] = lowest_results; 265 | return get_results_internal()[subject_name.operator std::string_view()]; 266 | } 267 | 268 | /* Second run_benchmark overload. Its visible body reads the same as the one above, statement for statement; what distinguishes the two was in the template parameter lists, which are missing from this listing. */ template 269 | BNCH_SWT_HOST static performance_metrics run_benchmark(arg_types&&...
args) { 270 | static constexpr string_literal subject_name{ subject_name_new }; 271 | if constexpr (benchmark_type == benchmark_types::cpu) { 272 | static_assert(std::convertible_to, uint64_t>, 273 | "Sorry, but the lambda passed to run_benchmark() must return a uint64_t, reflecting the number of bytes processed!"); 274 | } 275 | internal::event_collector events{}; 276 | internal::cache_clearer cache_clearer{}; 277 | performance_metrics lowest_results{}; 278 | performance_metrics results_temp{}; 279 | uint64_t current_global_index{ measured_iteration_count }; 280 | for (uint64_t x = 0; x < max_execution_count; ++x) { 281 | if constexpr (clear_cpu_cache_between_each_iteration && benchmark_type == benchmark_types::cpu) { 282 | cache_clearer.evict_caches(); 283 | } 284 | events.template run(std::forward(args)...); 285 | } 286 | std::span> new_ptr{ static_cast>&>(events) }; 287 | static constexpr uint64_t final_measured_iteration_count{ max_execution_count - measured_iteration_count > 0 ? max_execution_count - measured_iteration_count : 1 }; 288 | for (uint64_t x = 0; x < final_measured_iteration_count; ++x, ++current_global_index) { 289 | results_temp = performance_metrics::template collect_metrics(new_ptr.subspan(x, measured_iteration_count), 290 | current_global_index - 1, max_execution_count); 291 | lowest_results = results_temp.throughput_percentage_deviation < lowest_results.throughput_percentage_deviation ?
results_temp : lowest_results; 292 | } 293 | get_results_internal()[subject_name.operator std::string_view()] = lowest_results; 294 | return get_results_internal()[subject_name.operator std::string_view()]; 295 | } 296 | }; 297 | 298 | } 299 | -------------------------------------------------------------------------------- /src/main.cu: -------------------------------------------------------------------------------- 1 | /* 2 | MIT License 3 | 4 | Copyright (c) 2024 RealTimeChris 5 | 6 | Permission is hereby granted, free of charge, to any person obtaining a copy of this 7 | software and associated documentation files (the "Software"), to deal in the Software 8 | without restriction, including without limitation the rights to use, copy, modify, merge, 9 | publish, distribute, sublicense, and/or sell copies of the Software, and to permit 10 | persons to whom the Software is furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all copies or 13 | substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, 16 | INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR 17 | PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE 18 | FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR 19 | OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 20 | DEALINGS IN THE SOFTWARE. 21 | */ 22 | /// https://github.com/RealTimeChris/BenchmarkSuite 23 | #include 24 | #include 25 | #include 26 | #include 27 | #include 28 | #include 29 | #include 30 | 31 | enum class core_types { 32 | // Weights.
33 | attn_q, 34 | attn_k, 35 | attn_v, 36 | attn_output, 37 | attn_norm, 38 | ffn_gate, 39 | ffn_up, 40 | ffn_down, 41 | moe_gate, 42 | moe_experts_gate, 43 | moe_experts_up, 44 | moe_experts_down, 45 | ffn_norm, 46 | token_embd, 47 | rope_freqs, 48 | output_norm, 49 | output, 50 | end_of_weights, 51 | // Global Inputs. 52 | inp_tokens, 53 | inp_pos, 54 | inp_out_ids, 55 | cache_k, 56 | cache_v, 57 | kq_mask, 58 | benchmark_data, 59 | end_of_input_only, 60 | // Token-Embeddings Mega-Kernel. 61 | inp_embd_get_rows, 62 | end_of_global_inputs, 63 | // attn_prep_and_score Mega-Kernel. 64 | norm_rms_norm, 65 | attn_norm_mul, 66 | qcur_mul_mat, 67 | qcur_reshape, 68 | qcur_rope, 69 | kcur_mul_mat, 70 | kcur_reshape, 71 | kcur_rope, 72 | vcur_mul_mat, 73 | k_cache_view, 74 | k_cache_view_copy, 75 | vcur_transpose, 76 | v_cache_view, 77 | v_cache_view_copy, 78 | v_view, 79 | k_view, 80 | q_permute, 81 | kq_mul_mat, 82 | // attn_and_ffn_out Mega-Kernel (Dense FFN - Llama). 83 | kq_soft_max, 84 | kqv_mul_mat, 85 | kqv_merged_permute, 86 | kqv_merged_cont, 87 | kqv_out_mul_mat, 88 | ffn_inp_add, 89 | norm_pre_ffn_rms_norm, 90 | ffn_norm_mul, 91 | ffn_gate_mul_mat, 92 | ffn_silu, 93 | ffn_up_mul_mat, 94 | ffn_gate_par_mul, 95 | ffn_out_mul_mat, 96 | // attn_and_moe_out Mega-Kernel (MoE - Grok). 97 | moe_inp_add, 98 | norm_pre_moe_rms_norm, 99 | moe_norm_mul, 100 | moe_router_mul_mat, 101 | moe_router_softmax, 102 | moe_expert_select, 103 | moe_expert_gate_mul_mat, 104 | moe_expert_silu, 105 | moe_expert_up_mul_mat, 106 | moe_expert_gate_par_mul, 107 | moe_expert_down_mul_mat, 108 | moe_expert_weighted_sum, 109 | layer_out_add, 110 | end_of_per_block, 111 | // global_output_and_sampling Mega-Kernel (Dense FFN - Llama).
112 | node_1016_get_rows, 113 | node_1017_get_rows, 114 | final_ffn_inp_add, 115 | final_norm_pre_rms_norm, 116 | final_ffn_norm_mul, 117 | final_ffn_gate_mul_mat, 118 | final_ffn_silu, 119 | final_ffn_up_mul_mat, 120 | final_ffn_gate_par_mul, 121 | final_ffn_out_mul_mat, 122 | // global_output_and_sampling Mega-Kernel (MoE - Grok). 123 | final_moe_inp_add, 124 | final_norm_pre_moe_rms_norm, 125 | final_moe_norm_mul, 126 | final_moe_router_mul_mat, 127 | final_moe_router_softmax, 128 | final_moe_expert_select, 129 | final_moe_expert_gate_mul_mat, 130 | final_moe_expert_silu, 131 | final_moe_expert_up_mul_mat, 132 | final_moe_expert_gate_par_mul, 133 | final_moe_expert_down_mul_mat, 134 | final_moe_expert_weighted_sum, 135 | final_layer_out_add, 136 | final_norm_rms_norm, 137 | result_norm_mul, 138 | result_output_mul_mat, 139 | sample_tokens, 140 | count 141 | }; 142 | 143 | enum class kernel_types : uint8_t { 144 | weights, 145 | global_inputs, 146 | get_rows, 147 | rms_norm, 148 | mul, 149 | mul_mat, 150 | mul_mat_moe, 151 | reshape, 152 | transpose, 153 | permute, 154 | view, 155 | rope, 156 | softmax, 157 | silu, 158 | copy, 159 | cont, 160 | add, 161 | sub, 162 | div, 163 | top_k, 164 | weighted_sum, 165 | sample_tokens, 166 | count, 167 | }; 168 | 169 | enum class device_types : uint8_t { 170 | cpu, 171 | gpu, 172 | numa, 173 | }; 174 | 175 | enum class model_arches : uint8_t { 176 | llama, 177 | deci, 178 | falcon, 179 | baichuan, 180 | grok, 181 | gpt2, 182 | gptj, 183 | gptneox, 184 | mpt, 185 | starcoder, 186 | refact, 187 | bert, 188 | nomic_bert, 189 | jina_bert_v2, 190 | bloom, 191 | stablelm, 192 | qwen, 193 | qwen2, 194 | qwen2moe, 195 | qwen2vl, 196 | phi2, 197 | phi3, 198 | phimoe, 199 | plamo, 200 | codeshell, 201 | orion, 202 | internlm2, 203 | minicpm, 204 | minicpm3, 205 | gemma, 206 | gemma2, 207 | starcoder2, 208 | mamba, 209 | xverse, 210 | command_r, 211 | cohere2, 212 | dbrx, 213 | olmo, 214 | olmo2, 215 | olmoe, 216 | openelm, 217 |
arctic, 218 | deepseek, 219 | deepseek2, 220 | chatglm, 221 | bitnet, 222 | t5, 223 | t5encoder, 224 | jais, 225 | nemotron, 226 | exaone, 227 | rwkv6, 228 | rwkv6qwen2, 229 | granite, 230 | granite_moe, 231 | chameleon, 232 | wavtokenizer_dec, 233 | unknown, 234 | count, 235 | }; 236 | 237 | enum class kernel_type_profiles : uint8_t { 238 | fp16_mha, 239 | fp16_moe, 240 | bf16_mha, 241 | bf16_gqa, 242 | q4_mha, 243 | q4_gqa, 244 | q4_moe, 245 | q8_mha, 246 | q8_gqa, 247 | q8_moe, 248 | mixed_fp16_fp32, 249 | mixed_bf16_fp32, 250 | count, 251 | }; 252 | 253 | enum class model_generations : uint8_t { 254 | v1, 255 | v1_v2, 256 | v1_5, 257 | v2, 258 | v3, 259 | v3_1, 260 | v3_2, 261 | count, 262 | }; 263 | 264 | enum class model_sizes : uint8_t { 265 | llm_unknown, 266 | llm_14M, 267 | llm_17M, 268 | llm_22M, 269 | llm_33M, 270 | llm_60M, 271 | llm_70M, 272 | llm_80M, 273 | llm_109M, 274 | llm_137M, 275 | llm_160M, 276 | llm_220M, 277 | llm_250M, 278 | llm_270M, 279 | llm_335M, 280 | llm_410M, 281 | llm_450M, 282 | llm_770M, 283 | llm_780M, 284 | llm_0_5B, 285 | llm_1B, 286 | llm_1_3B, 287 | llm_1_4B, 288 | llm_1_5B, 289 | llm_1_6B, 290 | llm_2B, 291 | llm_2_8B, 292 | llm_3B, 293 | llm_4B, 294 | llm_6B, 295 | llm_6_9B, 296 | llm_7B, 297 | llm_8B, 298 | llm_9B, 299 | llm_11B, 300 | llm_12B, 301 | llm_13B, 302 | llm_14B, 303 | llm_15B, 304 | llm_16B, 305 | llm_20B, 306 | llm_30B, 307 | llm_32B, 308 | llm_34B, 309 | llm_35B, 310 | llm_40B, 311 | llm_46B, 312 | llm_65B, 313 | llm_70B, 314 | llm_314B, 315 | llm_405B, 316 | llm_SMALL, 317 | llm_MEDIUM, 318 | llm_LARGE, 319 | llm_XL, 320 | llm_A1_7B, 321 | llm_A2_7B, 322 | llm_8x7B, 323 | llm_8x22B, 324 | llm_16x12B, 325 | llm_16x3_8B, 326 | llm_10B_128x3_66B, 327 | llm_57B_A14B, 328 | llm_27B, 329 | count, 330 | }; 331 | 332 | /* model_traits: compile-time constants for the model whose name member is given just below; rope_dimension_count and n_embd_kv_gqa are derived from the other members. */ struct model_traits { 333 | static constexpr const char name[]{ "llama-3.1-8B" }; 334 | static constexpr model_arches model_arch{ model_arches::llama }; 335 | static constexpr model_generations
model_generation{ model_generations::v3_1 }; 336 | static constexpr model_sizes model_size{ model_sizes::llm_8B }; 337 | static constexpr float layer_norm_rms_epsilon = 1e-5f; 338 | static constexpr float rope_freq_base = 500000.0f; 339 | static constexpr uint32_t vocab_size = 128256; 340 | static constexpr uint32_t embedding_length = 4096; 341 | static constexpr uint32_t block_count = 32; 342 | static constexpr uint32_t feed_forward_length = 14336; 343 | static constexpr uint32_t attention_head_count = 32; 344 | static constexpr uint32_t attention_head_count_kv = 8; 345 | static constexpr uint32_t rope_dimension_count = embedding_length / attention_head_count; 346 | static constexpr uint32_t context_length = 131072; 347 | static constexpr uint64_t n_embd_kv_gqa = rope_dimension_count * attention_head_count_kv; 348 | }; 349 | 350 | /* Rounds value up to the next multiple of the compile-time constant multiple: mask arithmetic when multiple is a power of two, divide-then-multiply otherwise. */ template BNCH_SWT_HOST constexpr value_type_01 round_up_to_multiple(value_type_01 value) noexcept { 351 | if constexpr ((multiple > 0) && ((multiple & (multiple - 1)) == 0)) { 352 | constexpr value_type_01 mulSub1{ multiple - 1 }; 353 | return (value + mulSub1) & ~mulSub1; 354 | } else { 355 | return ((value + multiple - 1) / multiple) * multiple; 356 | } 357 | } 358 | 359 | /* File-local move: casts its argument to an rvalue reference. */ template BNCH_SWT_HOST constexpr decltype(auto) move(value_type&& arg) noexcept { 360 | return static_cast&&>(arg); 361 | } 362 | 363 | /* File-local swap: three moves through a temporary. */ template BNCH_SWT_HOST constexpr void swap(value_type_01& left, value_type_01& right) noexcept( 364 | std::is_nothrow_move_constructible_v && std::is_nothrow_move_assignable_v) { 365 | value_type_01 tmp = ::move(left); 366 | left = ::move(right); 367 | right = ::move(tmp); 368 | } 369 | 370 | /* cuda_buffer: move-only owner of one cudaMalloc allocation; clear(), which init, deinit and the destructor all call, releases it with cudaFree. */ struct cuda_buffer { 371 | using size_type = uint64_t; 372 | using value_type = std::byte; 373 | using pointer = value_type*; 374 | BNCH_SWT_HOST cuda_buffer() noexcept { 375 | } 376 | BNCH_SWT_HOST cuda_buffer& operator=(const cuda_buffer&) noexcept = delete; 377 | BNCH_SWT_HOST cuda_buffer(const cuda_buffer&) noexcept = delete; 378 | 379 |
BNCH_SWT_HOST cuda_buffer& operator=(cuda_buffer&& other) noexcept { 380 | if (this != &other) { 381 | ::swap(data_val, other.data_val); 382 | ::swap(size_val, other.size_val); 383 | } 384 | return *this; 385 | } 386 | 387 | BNCH_SWT_HOST cuda_buffer(cuda_buffer&& other) noexcept { 388 | *this = std::move(other); 389 | } 390 | 391 | /* init: releases any previous allocation, then requests size bytes with cudaMalloc. When the request fails data_val is null and size_val is 0, so size() no longer reports a capacity that was never obtained (it used to record size unconditionally). */ BNCH_SWT_HOST void init(uint64_t size) noexcept { 392 | if (data_val) { 393 | clear(); 394 | } 395 | 396 | cudaError_t result = cudaMalloc(&data_val, size); 397 | if (result != cudaSuccess) { 398 | data_val = nullptr; 399 | } 400 | 401 | size_val = data_val ? size : 0; 402 | } 403 | 404 | BNCH_SWT_HOST void deinit() noexcept { 405 | clear(); 406 | } 407 | 408 | BNCH_SWT_HOST size_type size() noexcept { 409 | return size_val; 410 | } 411 | 412 | BNCH_SWT_HOST pointer data() noexcept { 413 | return data_val; 414 | } 415 | 416 | /* claim_memory: returns data_val advanced by offset_to_claim rounded up to a multiple of 64. NOTE(review): the offset is not checked against size_val and data_val is not checked for null - confirm that callers guarantee both. */ BNCH_SWT_HOST void* claim_memory(uint64_t offset_to_claim) noexcept { 417 | uint64_t aligned_amount = round_up_to_multiple<64ull>(offset_to_claim); 418 | pointer return_value = data_val + aligned_amount; 419 | return return_value; 420 | } 421 | 422 | BNCH_SWT_HOST ~cuda_buffer() noexcept { 423 | clear(); 424 | } 425 | 426 | protected: 427 | size_type size_val{}; 428 | pointer data_val{}; 429 | 430 | BNCH_SWT_HOST void clear() noexcept { 431 | if (data_val) { 432 | cudaFree(data_val); 433 | data_val = nullptr; 434 | size_val = 0; 435 | } 436 | } 437 | }; 438 | 439 | template 440 | concept integral_types = std::is_integral_v>; 441 | 442 | /* Shift operators whose shift distance is carried by a std::integral_constant argument, so the amount is a compile-time constant inside each body. */ template BNCH_SWT_HOST_DEVICE constexpr value_type operator<<(const value_type arg, std::integral_constant) noexcept { 443 | constexpr uint64_t shift_amount{ shift }; 444 | return arg << shift_amount; 445 | } 446 | 447 | template BNCH_SWT_HOST_DEVICE constexpr value_type& operator<<=(value_type& arg, std::integral_constant) noexcept { 448 | constexpr uint64_t shift_amount{ shift }; 449 | return arg = arg << shift_amount; 450 | } 451 | 452 | template BNCH_SWT_HOST_DEVICE constexpr value_type
operator>>(const value_type arg, std::integral_constant) noexcept { 453 | constexpr uint64_t shift_amount{ shift }; 454 | return arg >> shift_amount; 455 | } 456 | 457 | template BNCH_SWT_HOST_DEVICE constexpr value_type& operator>>=(value_type& arg, std::integral_constant) noexcept { 458 | constexpr uint64_t shift_amount{ shift }; 459 | return arg = arg >> shift_amount; 460 | } 461 | 462 | template BNCH_SWT_HOST_DEVICE constexpr std::byte operator<<(const std::byte _Arg, std::integral_constant) noexcept { 463 | constexpr uint64_t shift_amount{ shift }; 464 | return static_cast(static_cast(static_cast(_Arg) << shift_amount)); 465 | } 466 | 467 | struct cpu_buffer { 468 | using size_type = uint64_t; 469 | using value_type = std::byte; 470 | using pointer = value_type*; 471 | BNCH_SWT_HOST cpu_buffer() noexcept { 472 | } 473 | BNCH_SWT_HOST cpu_buffer& operator=(const cpu_buffer&) noexcept = delete; 474 | BNCH_SWT_HOST cpu_buffer(const cpu_buffer&) noexcept = delete; 475 | 476 | BNCH_SWT_HOST cpu_buffer& operator=(cpu_buffer&& other) noexcept { 477 | if (this != &other) { 478 | ::swap(data_val, other.data_val); 479 | ::swap(size_val, other.size_val); 480 | } 481 | return *this; 482 | } 483 | 484 | /* NOTE(review): the parameter type of this constructor is cuda_buffer&&, not cpu_buffer&& - it looks like a copy-paste slip from cuda_buffer. Left as found because the rest of cpu_buffer lies beyond this excerpt. */ BNCH_SWT_HOST cpu_buffer(cuda_buffer&& other) noexcept { 485 | *this = std::move(other); 486 | } 487 | 488 | BNCH_SWT_HOST void init(uint64_t size) noexcept { 489 | if (data_val.size()) { 490 | clear(); 491 | } 492 | data_val.resize(size); 493 | 494 | size_val = size; 495 | } 496 | 497 | BNCH_SWT_HOST void deinit() noexcept { 498 | clear(); 499 | } 500 | 501 | BNCH_SWT_HOST size_type size() noexcept { 502 | return size_val; 503 | } 504 | 505 | BNCH_SWT_HOST pointer data() noexcept { 506 | return data_val.data(); 507 | } 508 | 509 | BNCH_SWT_HOST void* claim_memory(uint64_t offset_to_claim) noexcept { 510 | uint64_t aligned_amount = round_up_to_multiple<64ull>(offset_to_claim); 511 | pointer return_value = data_val.data() + aligned_amount; 512 | return return_value; 513 | }
514 | 515 | BNCH_SWT_HOST ~cpu_buffer() noexcept { 516 | clear(); 517 | } 518 | 519 | protected: 520 | std::vector data_val{}; 521 | size_type size_val{}; 522 | 523 | BNCH_SWT_HOST void clear() noexcept { 524 | data_val.clear(); 525 | } 526 | }; 527 | 528 | template struct tensor { 529 | static constexpr uint64_t dim_01{ dim_01_new }; 530 | static constexpr uint64_t dim_02{ dim_02_new }; 531 | static constexpr uint64_t dim_03{ dim_03_new }; 532 | static constexpr uint64_t dim_04{ dim_04_new }; 533 | static constexpr uint64_t element_count{ dim_01 * dim_02 * dim_03 * dim_04 }; 534 | static constexpr uint64_t byte_count{ sizeof(value_type) * element_count }; 535 | 536 | void* data{}; 537 | }; 538 | 539 | struct memory_footprint { 540 | uint64_t byte_count{}; 541 | void* const* data{}; 542 | }; 543 | 544 | struct cuda_tensors { 545 | tensor attn_q_weight{}; 546 | 547 | tensor attn_k_weight{}; 548 | 549 | tensor attn_v_weight{}; 550 | 551 | tensor attn_output_weight{}; 552 | 553 | tensor attn_norm_weight{}; 554 | 555 | tensor ffn_gate_weight{}; 556 | 557 | tensor ffn_up_weight{}; 558 | 559 | tensor ffn_down_weight{}; 560 | 561 | tensor ffn_norm_weight{}; 562 | 563 | tensor token_embd_weight{}; 564 | 565 | tensor rope_freqs_weight{}; 566 | 567 | tensor output_norm_weight{}; 568 | 569 | tensor output_weight{}; 570 | }; 571 | 572 | constexpr cuda_tensors cuda_tensors_val{}; 573 | 574 | constexpr std::array footprints{ memory_footprint{ cuda_tensors_val.attn_q_weight.byte_count, &cuda_tensors_val.attn_q_weight.data }, 575 | memory_footprint{ cuda_tensors_val.attn_k_weight.byte_count, &cuda_tensors_val.attn_k_weight.data }, 576 | memory_footprint{ cuda_tensors_val.attn_v_weight.byte_count, &cuda_tensors_val.attn_v_weight.data }, 577 | memory_footprint{ cuda_tensors_val.attn_output_weight.byte_count, &cuda_tensors_val.attn_output_weight.data }, 578 | memory_footprint{ cuda_tensors_val.attn_norm_weight.byte_count, &cuda_tensors_val.attn_norm_weight.data }, 579 | 
memory_footprint{ cuda_tensors_val.ffn_gate_weight.byte_count, &cuda_tensors_val.ffn_gate_weight.data }, 580 | memory_footprint{ cuda_tensors_val.ffn_up_weight.byte_count, &cuda_tensors_val.ffn_up_weight.data }, 581 | memory_footprint{ cuda_tensors_val.ffn_down_weight.byte_count, &cuda_tensors_val.ffn_down_weight.data }, 582 | memory_footprint{ cuda_tensors_val.ffn_norm_weight.byte_count, &cuda_tensors_val.ffn_norm_weight.data }, 583 | memory_footprint{ cuda_tensors_val.token_embd_weight.byte_count, &cuda_tensors_val.token_embd_weight.data }, 584 | memory_footprint{ cuda_tensors_val.rope_freqs_weight.byte_count, &cuda_tensors_val.rope_freqs_weight.data }, 585 | memory_footprint{ cuda_tensors_val.output_norm_weight.byte_count, &cuda_tensors_val.output_norm_weight.data }, 586 | memory_footprint{ cuda_tensors_val.output_weight.byte_count, &cuda_tensors_val.output_weight.data } }; 587 | 588 | constexpr uint64_t byte_count{ [] { 589 | uint64_t return_value{}; 590 | for (uint64_t x = 0; x < footprints.size(); ++x) { 591 | return_value += footprints[x].byte_count; 592 | } 593 | return return_value; 594 | }() }; 595 | 596 | template::min(), value_type max = std::numeric_limits::max()> 597 | BNCH_SWT_HOST void generate_values(void* cuda_memory) { 598 | static std::vector return_values{}; 599 | auto size = return_values.size(); 600 | for (uint64_t x = 0; x < size; ++x) { 601 | return_values[x] = bnch_swt::random_generator::impl(min, max); 602 | } 603 | for (uint64_t x = size; x < value_count; ++x) { 604 | return_values.emplace_back(bnch_swt::random_generator::impl(min, max)); 605 | } 606 | if (auto result = cudaMemcpy(cuda_memory, return_values.data(), return_values.size() * sizeof(value_type), cudaMemcpyKind::cudaMemcpyHostToDevice); result) { 607 | std::cout << "cudaMemcpy Error: " << cudaGetErrorString(result) << std::endl; 608 | } 609 | return; 610 | } 611 | 612 | template BNCH_SWT_HOST void generate_cuda_data(cuda_buffer& buffer) { 613 | uint64_t current_offset = 0; 614 
| 615 | generate_values(static_cast(buffer.data()) + current_offset); 616 | current_offset += cuda_tensors_val.attn_q_weight.byte_count; 617 | 618 | generate_values(static_cast(buffer.data()) + current_offset); 619 | current_offset += cuda_tensors_val.attn_k_weight.byte_count; 620 | 621 | generate_values(static_cast(buffer.data()) + current_offset); 622 | current_offset += cuda_tensors_val.attn_v_weight.byte_count; 623 | 624 | generate_values(static_cast(buffer.data()) + current_offset); 625 | current_offset += cuda_tensors_val.attn_output_weight.byte_count; 626 | 627 | generate_values(static_cast(buffer.data()) + current_offset); 628 | current_offset += cuda_tensors_val.attn_norm_weight.byte_count; 629 | 630 | generate_values(static_cast(buffer.data()) + current_offset); 631 | current_offset += cuda_tensors_val.ffn_gate_weight.byte_count; 632 | 633 | generate_values(static_cast(buffer.data()) + current_offset); 634 | current_offset += cuda_tensors_val.ffn_up_weight.byte_count; 635 | 636 | generate_values(static_cast(buffer.data()) + current_offset); 637 | current_offset += cuda_tensors_val.ffn_down_weight.byte_count; 638 | 639 | generate_values(static_cast(buffer.data()) + current_offset); 640 | current_offset += cuda_tensors_val.ffn_norm_weight.byte_count; 641 | 642 | generate_values(static_cast(buffer.data()) + current_offset); 643 | current_offset += cuda_tensors_val.token_embd_weight.byte_count; 644 | 645 | generate_values(static_cast(buffer.data()) + current_offset); 646 | current_offset += cuda_tensors_val.rope_freqs_weight.byte_count; 647 | 648 | generate_values(static_cast(buffer.data()) + current_offset); 649 | current_offset += cuda_tensors_val.output_norm_weight.byte_count; 650 | 651 | generate_values(static_cast(buffer.data()) + current_offset); 652 | } 653 | 654 | static constexpr uint64_t total_iteration_count{ 4 }; 655 | static constexpr uint64_t measured_iterations{ 2 }; 656 | 657 | cuda_buffer buffer{ [] { 658 | cuda_buffer return_values{}; 659 | 
return_values.init(byte_count); 660 | return return_values; 661 | }() }; 662 | 663 | struct benchmark_test_cpu { 664 | BNCH_SWT_HOST static uint64_t impl() { 665 | generate_cuda_data(buffer); 666 | return buffer.size(); 667 | } 668 | }; 669 | 670 | struct benchmark_ggml { 671 | BNCH_SWT_DEVICE static void impl() { 672 | } 673 | }; 674 | 675 | struct benchmark_nihilus { 676 | BNCH_SWT_DEVICE static void impl(cuda_tensors cuda_tensors_val_new) { 677 | } 678 | }; 679 | 680 | BNCH_SWT_GLOBAL void test_function() {}; 681 | 682 | int main() { 683 | static constexpr auto test_function_ptr{ &test_function }; 684 | uint64_t test_byte{}; 685 | test_byte << std::integral_constant{}; 686 | using benchmark = bnch_swt::benchmark_stage<"kernel-gegen-kernel", total_iteration_count, measured_iterations, bnch_swt::benchmark_types::cuda>; 687 | using test_benchmark = bnch_swt::benchmark_stage<"kernel-gegen-kernel-test", total_iteration_count, measured_iterations, bnch_swt::benchmark_types::cpu>; 688 | generate_cuda_data(buffer); 689 | dim3 grid{}; 690 | dim3 block{}; 691 | test_function_ptr<<<1, 3>>>(); 692 | uint64_t bytes_transferred{}; 693 | test_benchmark::run_benchmark<"cuda-setup", benchmark_test_cpu>(); 694 | benchmark::run_benchmark<"ggml", test_function_ptr>(grid, block, 0, bytes_transferred); 695 | 696 | benchmark::run_benchmark<"nihilus", benchmark_nihilus>(grid, block, 0, bytes_transferred, cuda_tensors_val); 697 | 698 | benchmark::print_results(); 699 | test_benchmark::print_results(); 700 | return 0; 701 | } --------------------------------------------------------------------------------