├── examples ├── .gitignore ├── counter_definition.cpp ├── statistics │ ├── single_thread.cpp │ ├── live_events.cpp │ ├── inherit_thread.cpp │ ├── multi_thread.cpp │ ├── metric.cpp │ ├── multi_cpu.cpp │ └── multi_process.cpp ├── sampling │ ├── flame_graph.cpp │ ├── perf_record.cpp │ ├── context_switch.cpp │ ├── memory_access_analyzer.cpp │ ├── instruction_pointer.cpp │ ├── register.cpp │ ├── counter.cpp │ ├── multi_thread.cpp │ ├── multi_cpu.cpp │ ├── multi_event.cpp │ ├── memory_address.cpp │ └── branch.cpp ├── README.md ├── access_benchmark.h ├── access_benchmark.cpp └── CMakeLists.txt ├── test ├── events.csv ├── events-and-metrics.csv ├── hardware_info.cpp ├── CMakeLists.txt ├── access_benchmark.h ├── access_benchmark.cpp ├── counter_definition.cpp └── requested_event.cpp ├── .gitignore ├── include └── perfcpp │ ├── throttle.h │ ├── cgroup.h │ ├── period.h │ ├── precision.h │ ├── metric │ └── expression │ │ ├── function.h │ │ ├── tokenizer.h │ │ ├── parser.h │ │ └── token.h │ ├── context_switch.h │ ├── time_event.h │ ├── counter_result.h │ ├── util │ ├── unique_file_descriptor.h │ ├── table.h │ └── graph.h │ ├── feature.h │ ├── analyzer │ ├── flame_graph_generator.h │ └── data_type.h │ ├── metadata.h │ ├── branch.h │ ├── mmap_buffer.h │ └── hardware_info.h ├── src ├── config.cpp ├── metric │ └── expression │ │ ├── token.cpp │ │ ├── function.cpp │ │ └── expression.cpp ├── exception.cpp ├── counter_result.cpp └── requested_event.cpp ├── docs ├── README.md ├── perf-paranoid.md ├── build.md ├── recording-live-events.md ├── sampling-symbols-and-flamegraphs.md └── analyzing-memory-access-patterns.md ├── events └── x86 │ ├── intel │ ├── clearwater-forest.csv │ └── panther-lake.csv │ └── micro-architecture-register.csv └── script └── create_perf_list.py /examples/.gitignore: -------------------------------------------------------------------------------- 1 | bin/ -------------------------------------------------------------------------------- /test/events.csv: -------------------------------------------------------------------------------- 1 | #name, config, config1, type 2 | EVENT.TEST0,0x1f3010e 3 | event-test-1,0x1CD,3 4 | -------------------------------------------------------------------------------- /test/events-and-metrics.csv: -------------------------------------------------------------------------------- 1 | EVENT.TEST0,0x1f3010e 2 | event-test-1,0x1CD,3 3 | test-metric,3*'event-test-1' -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | Makefile 2 | libperf-cpp.a 3 | CMakeFiles 4 | cmake-build-debug 5 | .cmake 6 | CMakeCache.txt 7 | cmake_install.cmake 8 | build/ 9 | tests 10 | src/processor_specific_event_provider.cpp -------------------------------------------------------------------------------- /test/hardware_info.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | TEST_CASE("number of performance counters", "[HardwareInfo]") 5 | { 6 | REQUIRE(perf::HardwareInfo::physical_performance_counters_per_logical_core() > 1U); 7 | } 8 | 9 | TEST_CASE("number of events per performance counter", "[HardwareInfo]") 10 | { 11 | REQUIRE(perf::HardwareInfo::events_per_physical_performance_counter() > 1U); 12 | } -------------------------------------------------------------------------------- /include/perfcpp/throttle.h: -------------------------------------------------------------------------------- 1 | 
#pragma once 2 | 3 | namespace perf { 4 | class Throttle 5 | { 6 | public: 7 | explicit Throttle(const bool is_throttle) noexcept 8 | : _is_throttle(is_throttle) 9 | { 10 | } 11 | ~Throttle() noexcept = default; 12 | 13 | /** 14 | * @return True, if the event was a throttle event. 15 | */ 16 | [[nodiscard]] bool is_throttle() const noexcept { return _is_throttle; } 17 | 18 | /** 19 | * @return True, if the event was an unthrottle event. 20 | */ 21 | [[nodiscard]] bool is_unthrottle() const noexcept { return !_is_throttle; } 22 | 23 | private: 24 | bool _is_throttle; 25 | }; 26 | } -------------------------------------------------------------------------------- /include/perfcpp/cgroup.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | 6 | namespace perf { 7 | class CGroup 8 | { 9 | public: 10 | CGroup(const std::uint64_t id, std::string&& path) noexcept 11 | : _id(id) 12 | , _path(std::move(path)) 13 | { 14 | } 15 | ~CGroup() = default; 16 | 17 | /** 18 | * @return Id of the CGgroup (as found in samples). 19 | */ 20 | [[nodiscard]] std::uint64_t id() const noexcept { return _id; } 21 | 22 | /** 23 | * @return Path of the CGroup. 24 | */ 25 | [[nodiscard]] const std::string& path() const noexcept { return _path; } 26 | 27 | private: 28 | std::uint64_t _id; 29 | std::string _path; 30 | }; 31 | } -------------------------------------------------------------------------------- /include/perfcpp/period.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | 5 | namespace perf { 6 | class Period 7 | { 8 | public: 9 | explicit Period(const std::uint64_t period) noexcept 10 | : _period(period) 11 | { 12 | } 13 | ~Period() noexcept = default; 14 | 15 | [[nodiscard]] std::uint64_t get() const noexcept { return _period; } 16 | 17 | private: 18 | std::uint64_t _period; 19 | }; 20 | 21 | class Frequency 22 | { 23 | public: 24 | explicit Frequency(const std::uint64_t frequency) noexcept 25 | : _frequency(frequency) 26 | { 27 | } 28 | ~Frequency() noexcept = default; 29 | 30 | [[nodiscard]] std::uint64_t get() const noexcept { return _frequency; } 31 | 32 | private: 33 | std::uint64_t _frequency; 34 | }; 35 | 36 | using PeriodOrFrequency = std::variant; 37 | } -------------------------------------------------------------------------------- /test/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | Include(FetchContent) 2 | FetchContent_Declare( 3 | Catch2 4 | GIT_REPOSITORY https://github.com/catchorg/Catch2.git 5 | GIT_TAG v3.8.1 # or a later release 6 | ) 7 | FetchContent_MakeAvailable(Catch2) 8 | 9 | set(PERF_CPP_TEST 10 | test/access_benchmark.cpp 11 | test/counter_definition.cpp 12 | test/metric.cpp 13 | test/event_counter.cpp 14 | test/sampler.cpp 15 | test/hardware_info.cpp 16 | test/requested_event.cpp 17 | ) 18 | 19 | add_executable(tests ${PERF_CPP_TEST}) 20 | target_link_libraries(tests PRIVATE Catch2::Catch2WithMain perf-cpp) 21 | target_include_directories(tests PRIVATE Catch2::Catch2WithMain test) 22 | set_target_properties(tests 23 | PROPERTIES 24 | RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/bin" 25 | ) -------------------------------------------------------------------------------- /src/config.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | perf::Process perf::Process::Any = perf::Process{ -1 }; 5 | perf::Process 
perf::Process::Calling = perf::Process{ 0 }; 6 | perf::CpuCore perf::CpuCore::Any = perf::CpuCore{ -1 }; 7 | 8 | perf::Config::Config() noexcept 9 | { 10 | /// Try to read the number of physical performance counters from the hardware (either from cpuid or by trying). 11 | if (const auto physical_performance_counters = HardwareInfo::physical_performance_counters_per_logical_core(); 12 | physical_performance_counters > 0U) { 13 | /// If that worked, also read the number of events per physical performance counter. 14 | this->_num_physical_counters = physical_performance_counters; 15 | this->_num_events_per_physical_counter = HardwareInfo::events_per_physical_performance_counter(); 16 | } 17 | } -------------------------------------------------------------------------------- /examples/counter_definition.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | int 6 | main() 7 | { 8 | std::cout 9 | << "libperf-cpp example: This example prints all automatically read events stored in the perf::CounterDefinition.\n" 10 | << std::endl; 11 | 12 | std::cout << "Scanning the underlying hardware for hardware counters..." << std::endl; 13 | std::cout << "Physical Hardware Counters = " 14 | << std::uint16_t(perf::HardwareInfo::physical_performance_counters_per_logical_core()) << "\n"; 15 | std::cout << "Events per Hardware Counter = " 16 | << std::uint16_t(perf::HardwareInfo::events_per_physical_performance_counter()) << "\n" 17 | << std::endl; 18 | 19 | /// Create custom instance of the counter definition. 20 | const auto counter_definition = perf::CounterDefinition{}; 21 | 22 | /// Dump to the console without adding further events. 23 | std::cout << "Detected events:\n" << counter_definition.to_string() << std::endl; 24 | 25 | return 0; 26 | } -------------------------------------------------------------------------------- /src/metric/expression/token.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | std::string 4 | perf::metric::expression::Token::TokenToStringVisitor::operator()( 5 | const perf::metric::expression::Operator_ metric_operator) const 6 | { 7 | switch (metric_operator) { 8 | case Operator_::Plus: 9 | return "+"; 10 | case Operator_::Minus: 11 | return "-"; 12 | case Operator_::Times: 13 | return "*"; 14 | case Operator_::Divide: 15 | return "/"; 16 | default: 17 | return ""; 18 | } 19 | } 20 | 21 | std::string 22 | perf::metric::expression::Token::TokenToStringVisitor::operator()( 23 | const perf::metric::expression::Token::Punctuation punctutation) const 24 | { 25 | switch (punctutation) { 26 | case Token::Punctuation::LeftParentheses: 27 | return "("; 28 | case Token::Punctuation::RightParentheses: 29 | return ")"; 30 | case Token::Punctuation::Comma: 31 | return ","; 32 | default: 33 | return ""; 34 | } 35 | } 36 | 37 | std::string 38 | perf::metric::expression::Token::to_string() const 39 | { 40 | return std::visit(TokenToStringVisitor{}, this->_token); 41 | } -------------------------------------------------------------------------------- /docs/README.md: -------------------------------------------------------------------------------- 1 | # Documentation 2 | 3 | Welcome to the comprehensive documentation for the *perf-cpp* library. 4 | This guide is designed to assist you with everything from initial setup to advanced functionalities. 5 | Explore the sections below to gain insights and instructions tailored to your needs. 
6 | 7 | --- 8 | - [Building and Including the *perf-cpp* Library](build.md) 9 | - **Counting Performance Events** 10 | - [Basics of Recording Performance Events](recording.md) 11 | - [Multi-threading and Multi-CPU Event Recording](recording-parallel.md) 12 | - [Access Statistics without Stopping the Counter](recording-live-events.md) 13 | - [Defining and Using Metrics](metrics.md) 14 | - **Sampling Techniques** 15 | - [Basics of Event Sampling](sampling.md) 16 | - [Multi-threading and Multi-CPU Event Sampling](sampling-parallel.md) 17 | - [Use the Linux Perf Tool to Analyze Recorded Samples](analyzing-samples-with-perf-report.md) 18 | - [Symbols and Flamegraphs](sampling-symbols-and-flamegraphs.md) 19 | - [Analyzing Memory Access Patterns using Sampling](analyzing-memory-access-patterns.md) 20 | - [Built-in and Hardware-specific Performance Events](counters.md) 21 | - [Understanding the Perf Paranoid Value](perf-paranoid.md) -------------------------------------------------------------------------------- /include/perfcpp/precision.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | 6 | namespace perf { 7 | /** 8 | * The precision controls the skid, which refers to the amount of 9 | * instructions between the event and the kernel records the sample. 10 | * 11 | * For more information see "precise_ip" on https://man7.org/linux/man-pages/man2/perf_event_open.2.html. 12 | */ 13 | enum Precision : std::uint8_t 14 | { 15 | /// The recorded instruction pointer may land anywhere within a broad, implementation-defined window around the real 16 | /// instruction. 17 | AllowArbitrarySkid = 0U, 18 | 19 | /// The recorded instruction pointer must have a constant, repeatable skid offset (still non-zero) so the displacement 20 | /// is predictable even if not exact. 21 | MustHaveConstantSkid = 1U, 22 | 23 | /// Request–but do not insist on—zero skid, asking the PMU for exact instruction pointer attribution while allowing 24 | /// fallback on CPUs that cannot guarantee it. 25 | RequestZeroSkid = 2U, 26 | 27 | /// Require zero skid: the sample instruction pointer must be the exact triggering instruction; if the hardware cannot 28 | /// provide this, perf-cpp will lower the precision. 
29 | MustHaveZeroSkid = 3U, 30 | }; 31 | } -------------------------------------------------------------------------------- /events/x86/intel/clearwater-forest.csv: -------------------------------------------------------------------------------- 1 | br_inst_retired.all_branches, 0xc4 2 | br_misp_retired.all_branches, 0xc5 3 | cpu_clk_unhalted.core_p, 0x3c 4 | cpu_clk_unhalted.ref_tsc_p, 0x13c 5 | cpu_clk_unhalted.thread_p, 0x3c 6 | dtlb_load_misses.walk_completed, 0xe08 7 | dtlb_store_misses.walk_completed, 0xe49 8 | icache.accesses, 0x380 9 | icache.misses, 0x280 10 | inst_retired.any_p, 0xc0 11 | itlb_misses.walk_completed, 0xe85 12 | longest_lat_cache.miss, 0x412e 13 | longest_lat_cache.reference, 0x4f2e 14 | mem_uops_retired.all_loads, 0x81d0 15 | mem_uops_retired.all_stores, 0x82d0 16 | mem_uops_retired.load_latency_gt_1024, 0x5d0, 0x400 17 | mem_uops_retired.load_latency_gt_128, 0x5d0, 0x80 18 | mem_uops_retired.load_latency_gt_16, 0x5d0, 0x10 19 | mem_uops_retired.load_latency_gt_2048, 0x5d0, 0x800 20 | mem_uops_retired.load_latency_gt_256, 0x5d0, 0x100 21 | mem_uops_retired.load_latency_gt_32, 0x5d0, 0x20 22 | mem_uops_retired.load_latency_gt_4, 0x5d0, 0x4 23 | mem_uops_retired.load_latency_gt_512, 0x5d0, 0x200 24 | mem_uops_retired.load_latency_gt_64, 0x5d0, 0x40 25 | mem_uops_retired.load_latency_gt_8, 0x5d0, 0x8 26 | mem_uops_retired.store_latency, 0x6d0 27 | ocr.demand_data_rd.any_response, 0x1b7, 0x10001 28 | ocr.demand_data_rd.l3_miss, 0x1b7, 0x33fbfc00001 29 | ocr.demand_rfo.any_response, 0x1b7, 0x10002 30 | ocr.demand_rfo.l3_miss, 0x1b7, 0x33fbfc00002 31 | topdown_be_bound.all, 0x2a4 32 | topdown_be_bound.all_p, 0x2a4 33 | -------------------------------------------------------------------------------- /src/metric/expression/function.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | std::optional 5 | perf::metric::expression::DRatioFunction::evaluate(const std::optional left, 6 | const std::optional right) const 7 | { 8 | if (left.has_value() && right.has_value() && right.value() != .0) { 9 | return left.value() / right.value(); 10 | } 11 | 12 | /// If one of the operands cannot be evaluated OR the right operand is zero, we cannot calculate the ratio. 13 | return std::nullopt; 14 | } 15 | 16 | std::optional 17 | perf::metric::expression::SumFunction::evaluate(const perf::CounterResult& result) const 18 | { 19 | auto sum = .0; 20 | 21 | for (const auto& argument : this->_arguments) { 22 | /// Evaluate the argument. 23 | const auto evaluated_argument = argument->evaluate(result); 24 | 25 | /// If the argument cannot be evaluated, the function fails. 26 | if (!evaluated_argument.has_value()) { 27 | return std::nullopt; 28 | } 29 | 30 | /// Accumulate. 
31 | sum += evaluated_argument.value(); 32 | } 33 | 34 | return sum; 35 | } 36 | 37 | void 38 | perf::metric::expression::SumFunction::add_required_hardware_counter( 39 | std::vector& hardware_counter_names) const 40 | { 41 | for (const auto& argument : this->_arguments) { 42 | argument->add_required_hardware_counter(hardware_counter_names); 43 | } 44 | } -------------------------------------------------------------------------------- /src/metric/expression/expression.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | std::optional 4 | perf::metric::expression::AdditionExpression::evaluate(const std::optional left, 5 | const std::optional right) const 6 | { 7 | if (left.has_value() && right.has_value()) { 8 | return left.value() + right.value(); 9 | } 10 | 11 | return std::nullopt; 12 | } 13 | 14 | std::optional 15 | perf::metric::expression::SubtractionExpression::evaluate(const std::optional left, 16 | const std::optional right) const 17 | { 18 | if (left.has_value() && right.has_value()) { 19 | return left.value() - right.value(); 20 | } 21 | 22 | return std::nullopt; 23 | } 24 | 25 | std::optional 26 | perf::metric::expression::MultiplyExpression::evaluate(const std::optional left, 27 | const std::optional right) const 28 | { 29 | if (left.has_value() && right.has_value()) { 30 | return left.value() * right.value(); 31 | } 32 | 33 | return std::nullopt; 34 | } 35 | 36 | std::optional 37 | perf::metric::expression::DivideExpression::evaluate(const std::optional left, 38 | const std::optional right) const 39 | { 40 | if (left.has_value() && right.has_value()) { 41 | return left.value() / right.value(); 42 | } 43 | 44 | return std::nullopt; 45 | } -------------------------------------------------------------------------------- /include/perfcpp/metric/expression/function.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "expression.h" 4 | #include 5 | #include 6 | #include 7 | 8 | namespace perf::metric::expression { 9 | /** 10 | * Calculates the ratio between both operands. 11 | */ 12 | class DRatioFunction final : public BinaryExpression 13 | { 14 | public: 15 | DRatioFunction(std::unique_ptr&& left, std::unique_ptr&& right) 16 | : BinaryExpression(std::move(left), std::move(right)) 17 | { 18 | } 19 | 20 | ~DRatioFunction() override = default; 21 | 22 | protected: 23 | [[nodiscard]] std::optional evaluate(std::optional left, std::optional right) const override; 24 | }; 25 | 26 | class SumFunction final : public ExpressionInterface 27 | { 28 | public: 29 | explicit SumFunction(std::vector>&& arguments) 30 | : _arguments(std::move(arguments)) 31 | { 32 | } 33 | 34 | ~SumFunction() override = default; 35 | 36 | /** 37 | * Sums up all arguments. 38 | * 39 | * @param result List of results. 40 | * @return The sum of all arguments. 41 | */ 42 | [[nodiscard]] std::optional evaluate(const CounterResult& result) const override; 43 | 44 | /** 45 | * Adds all counters for all arguments. 46 | * 47 | * @param hardware_counter_names List of hardware counters that will be augmented. 
48 | */ 49 | void add_required_hardware_counter(std::vector& hardware_counter_names) const override; 50 | 51 | private: 52 | std::vector> _arguments; 53 | }; 54 | } -------------------------------------------------------------------------------- /include/perfcpp/context_switch.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | 5 | namespace perf { 6 | class ContextSwitch 7 | { 8 | public: 9 | ContextSwitch(const bool is_out, 10 | const bool is_preempt, 11 | const std::optional process_id, 12 | const std::optional thread_id) noexcept 13 | : _is_out(is_out) 14 | , _is_preempt(is_preempt) 15 | , _process_id(process_id) 16 | , _thread_id(thread_id) 17 | { 18 | } 19 | ~ContextSwitch() noexcept = default; 20 | 21 | /** 22 | * @return True, if the process/thread was switched out. 23 | */ 24 | [[nodiscard]] bool is_out() const noexcept { return _is_out; } 25 | 26 | /** 27 | * @return True, if the process/thread was switched in. 28 | */ 29 | [[nodiscard]] bool is_in() const noexcept { return !_is_out; } 30 | 31 | /** 32 | * @return True, if the process/thread was preempted. 33 | */ 34 | [[nodiscard]] bool is_preempt() const noexcept { return _is_preempt; } 35 | 36 | /** 37 | * @return Id of the process, or std::nullopt if not provided (currently only provided on CPU-wide sampling). 38 | */ 39 | [[nodiscard]] std::optional process_id() const noexcept { return _process_id; } 40 | 41 | /** 42 | * @return Id of the thread, or std::nullopt if not provided (currently only provided on CPU-wide sampling). 43 | */ 44 | [[nodiscard]] std::optional thread_id() const noexcept { return _thread_id; } 45 | 46 | private: 47 | bool _is_out; 48 | bool _is_preempt; 49 | std::optional _process_id{ std::nullopt }; 50 | std::optional _thread_id{ std::nullopt }; 51 | }; 52 | } 53 | -------------------------------------------------------------------------------- /events/x86/micro-architecture-register.csv: -------------------------------------------------------------------------------- 1 | regex,vendor,micro-architecture 2 | AuthenticAMD-23-([12][0-9A-F]|[0-9A-F]),amd,zen-1 3 | AuthenticAMD-23-[0-9A-F]+,amd,zen-2 4 | AuthenticAMD-25-([245][0-9A-F]|[0-9A-F]),amd,zen-3 5 | AuthenticAMD-25-[0-9A-F]+,amd,zen-4 6 | AuthenticAMD-26-[0-9A-F]+,amd,zen-5 7 | GenuineIntel-6-(1C|26|27|35|36),intel,bonnell 8 | GenuineIntel-6-(37|4A|4C|4D|5A),intel,silvermont 9 | GenuineIntel-6-(3C|45|46),intel,haswell 10 | GenuineIntel-6-(3D|47),intel,broadwell 11 | GenuineIntel-6-(4E|5E|8E|9E|A5|A6),intel,skylake 12 | GenuineIntel-6-(57|85),intel,knights-landing 13 | GenuineIntel-6-(97|9A|B7|BA|BF),intel,alder-lake 14 | GenuineIntel-6-(AA|AC|B5),intel,meteor-lake 15 | GenuineIntel-6-1[AEF],intel,nehalem-ep 16 | GenuineIntel-6-25,intel,westmere-ep-sp 17 | GenuineIntel-6-2A,intel,sandy-bridge 18 | GenuineIntel-6-2C,intel,westmere-ep-dp 19 | GenuineIntel-6-2D,intel,jake-town 20 | GenuineIntel-6-2E,intel,nehalem-ex 21 | GenuineIntel-6-2F,intel,westmere-ex 22 | GenuineIntel-6-3A,intel,ivy-bridge 23 | GenuineIntel-6-3E,intel,ivy-town 24 | GenuineIntel-6-3F,intel,haswell-x 25 | GenuineIntel-6-4F,intel,broadwell-x 26 | GenuineIntel-6-55-[01234],intel,skylake-x 27 | GenuineIntel-6-55-[56789ABCDEF],intel,cascade-lake-x 28 | GenuineIntel-6-56,intel,broadwell-de 29 | GenuineIntel-6-5[CF],intel,goldmont 30 | GenuineIntel-6-6[AC],intel,ice-lake-x 31 | GenuineIntel-6-7A,intel,goldmont-plus 32 | GenuineIntel-6-7[DE],intel,ice-lake 33 | GenuineIntel-6-86,intel,snow-ridge-x 34 | 
GenuineIntel-6-8F,intel,sapphire-rapids 35 | GenuineIntel-6-8[CD],intel,tiger-lake 36 | GenuineIntel-6-9[6C],intel,elkhart-lake 37 | GenuineIntel-6-A7,intel,rocket-lake 38 | GenuineIntel-6-AF,intel,sierra-forest 39 | GenuineIntel-6-A[DE],intel,granite-rapids 40 | GenuineIntel-6-B6,intel,grand-ridge 41 | GenuineIntel-6-BD,intel,lunar-lake 42 | GenuineIntel-6-BE,intel,alder-lake-n 43 | GenuineIntel-6-CC,intel,panther-lake 44 | GenuineIntel-6-CF,intel,emerald-rapids 45 | GenuineIntel-6-C[56],intel,arrow-lake 46 | GenuineIntel-6-DD,intel,clearwater-forest 47 | -------------------------------------------------------------------------------- /include/perfcpp/time_event.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | 5 | namespace perf { 6 | class TimeEvent 7 | { 8 | public: 9 | virtual ~TimeEvent() noexcept = default; 10 | [[nodiscard]] virtual double calculate(std::chrono::steady_clock::time_point start, 11 | std::chrono::steady_clock::time_point end) const noexcept = 0; 12 | }; 13 | 14 | class SecondsTimeEvent final : public TimeEvent 15 | { 16 | public: 17 | [[nodiscard]] double calculate(const std::chrono::steady_clock::time_point start, 18 | const std::chrono::steady_clock::time_point end) const noexcept override 19 | { 20 | return static_cast(std::chrono::duration_cast(end - start).count()) / 1000000000.; 21 | } 22 | }; 23 | 24 | class MillisecondsTimeEvent final : public TimeEvent 25 | { 26 | public: 27 | [[nodiscard]] double calculate(const std::chrono::steady_clock::time_point start, 28 | const std::chrono::steady_clock::time_point end) const noexcept override 29 | { 30 | return static_cast(std::chrono::duration_cast(end - start).count()) / 1000000.; 31 | } 32 | }; 33 | 34 | class MicrosecondsTimeEvent final : public TimeEvent 35 | { 36 | public: 37 | [[nodiscard]] double calculate(const std::chrono::steady_clock::time_point start, 38 | const std::chrono::steady_clock::time_point end) const noexcept override 39 | { 40 | return static_cast(std::chrono::duration_cast(end - start).count()) / 1000.; 41 | } 42 | }; 43 | 44 | class NanosecondsTimeEvent final : public TimeEvent 45 | { 46 | public: 47 | [[nodiscard]] double calculate(const std::chrono::steady_clock::time_point start, 48 | const std::chrono::steady_clock::time_point end) const noexcept override 49 | { 50 | return static_cast(std::chrono::duration_cast(end - start).count()); 51 | } 52 | }; 53 | } -------------------------------------------------------------------------------- /examples/statistics/single_thread.cpp: -------------------------------------------------------------------------------- 1 | #include "perfcpp/event_counter.h" 2 | #include 3 | 4 | #include "../access_benchmark.h" 5 | 6 | int 7 | main() 8 | { 9 | std::cout << "libperf-cpp example: Record performance counter for " 10 | "single-threaded random access to an in-memory array." 11 | << std::endl; 12 | 13 | /// Initialize performance counters. 14 | auto event_counter = perf::EventCounter{}; 15 | 16 | /// Add all the performance counters we want to record. 17 | try { 18 | event_counter.add( 19 | { "instructions", "cycles", "branches", "branch-misses", "cycles-per-instruction", "nanoseconds", "gigahertz" }); 20 | } catch (std::runtime_error& e) { 21 | std::cerr << e.what() << std::endl; 22 | return 1; 23 | } 24 | 25 | /// Create random access benchmark. 
26 | auto benchmark = perf::example::AccessBenchmark{ /*randomize the accesses*/ true, 27 | /* create benchmark of 512 MB */ 512 }; 28 | 29 | /// Start recording. 30 | try { 31 | event_counter.start(); 32 | } catch (std::runtime_error& exception) { 33 | std::cerr << exception.what() << std::endl; 34 | return 1; 35 | } 36 | 37 | /// Execute the benchmark (accessing cache lines in a random order). 38 | auto value = 0ULL; 39 | for (auto index = 0U; index < benchmark.size(); ++index) { 40 | value += benchmark[index].value; 41 | } 42 | 43 | /// We do not want the compiler to optimize away this (otherwise) unused value (and consequently the loop above). 44 | benchmark.pretend_to_use(value); 45 | 46 | /// Stop recording counters. 47 | event_counter.stop(); 48 | 49 | /// Get the result (normalized per cache line). 50 | const auto result = event_counter.result(benchmark.size()); 51 | 52 | /// Print the performance counters manually. 53 | std::cout << "\nResults:\n"; 54 | for (const auto& [counter_name, counter_value] : result) { 55 | std::cout << counter_value << " " << counter_name << " / cache line" << std::endl; 56 | } 57 | 58 | /// Print the performance counters as table. 59 | std::cout << "\nResults as table:\n" << result.to_string() << std::endl; 60 | 61 | return 0; 62 | } -------------------------------------------------------------------------------- /src/exception.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | std::string 4 | perf::CannotOpenCounterError::create_error_message_from_code(const std::int64_t error_code) 5 | { 6 | switch (error_code) { 7 | case ENOENT: 8 | return "configuration might not be valid (e.g., unsupported event)"; 9 | case E2BIG: 10 | return "perf_event_attr.size was not configured properly – this could be a bug in the perf-cpp library"; 11 | case EACCES: 12 | return "insufficient access rights to start the counter, e.g., profiling a not user-owned process or " 13 | "perf_event_paranoid value too high (see " 14 | "https://github.com/jmuehlig/perf-cpp/blob/dev/docs/perf-paranoid.md)"; 15 | #ifndef PERFCPP_NO_ERROR_EBUSY /// Busy error is reported since Linux 4.1 16 | case EBUSY: 17 | return "another event has exclusive access to the PMU"; 18 | #endif 19 | case EINVAL: 20 | return "counter is configured with an invalid argument (e.g., too high sample frequency, unknown CPU, invalid " 21 | "sample type)"; 22 | case EMFILE: 23 | return "too many open file descriptors (e.g., too many opened counters?)"; 24 | case ENODEV: 25 | return "configured with feature that does not exist on this CPU"; 26 | case EOVERFLOW: 27 | return "maximal callchain stack size is higher than the maximum (see /proc/sys/kernel/perf_event_max_stack)"; 28 | case EPERM: 29 | return "one of the following features is set but not supported: excluding hypervisor, excluding idle, " 30 | "excluding " 31 | "user, or excluding kernel"; 32 | case ESRCH: 33 | return "specified process does not exist"; 34 | default: 35 | return "perf_event_open failed with unknown error"; 36 | } 37 | } 38 | 39 | std::string 40 | perf::IoctlError::create_error_message_from_code(const std::int64_t error_code) 41 | { 42 | switch (error_code) { 43 | case EBADF: 44 | return "file descriptor is not valid"; 45 | case EFAULT: 46 | return "references inaccessible memory area"; 47 | case ENOTTY: 48 | return "file descriptor cannot be used"; 49 | default: 50 | return "::ioctl failed with unknown error"; 51 | } 52 | } 
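/// Usage sketch: the messages assembled above surface through exceptions that the bundled examples
/// catch as std::runtime_error around EventCounter::start() and Sampler::start(). A minimal pattern
/// (the event name is illustrative):
///
///   auto event_counter = perf::EventCounter{};
///   event_counter.add({ "cycles" });
///   try {
///     event_counter.start();
///   } catch (std::runtime_error& exception) {
///     /// Prints, e.g., "Cannot open perf counter: insufficient access rights to start the counter, ...".
///     std::cerr << exception.what() << std::endl;
///   }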
-------------------------------------------------------------------------------- /examples/sampling/flame_graph.cpp: -------------------------------------------------------------------------------- 1 | #include "../access_benchmark.h" 2 | #include "perfcpp/analyzer/flame_graph_generator.h" 3 | #include "perfcpp/sampler.h" 4 | #include 5 | 6 | int 7 | main() 8 | { 9 | std::cout << "libperf-cpp example: Record perf samples including time, " 10 | "instruction pointer, and callchain for flamegraph generation." 11 | << std::endl; 12 | 13 | auto sampler = perf::Sampler{}; 14 | 15 | /// Event that generates an overflow which is samples. 16 | sampler.trigger("cycles", perf::Precision::RequestZeroSkid, perf::Period{ 50000U }); 17 | 18 | /// Include Timestamp, period, instruction pointer, and CPU number into samples. 19 | sampler.values().timestamp(true).instruction_pointer(true).callchain(true); 20 | 21 | /// Start sampling. 22 | try { 23 | sampler.start(); 24 | } catch (std::runtime_error& exception) { 25 | std::cerr << exception.what() << std::endl; 26 | return 1; 27 | } 28 | 29 | /// Create random access benchmark. 30 | auto benchmark = perf::example::AccessBenchmark{ /*randomize the accesses*/ true, 31 | /* create benchmark of 512 MB */ 512U }; 32 | 33 | /// Execute the benchmark (accessing cache lines in a random order). 34 | auto value = 0ULL; 35 | for (auto index = 0U; index < benchmark.size(); ++index) { 36 | value += benchmark[index].value; 37 | } 38 | 39 | /// We do not want the compiler to optimize away this (otherwise) unused value (and consequently the loop above). 40 | benchmark.pretend_to_use(value); 41 | 42 | /// Stop sampling. 43 | sampler.stop(); 44 | 45 | /// Get all the recorded samples. 46 | const auto samples = sampler.result(true); 47 | 48 | /// Translate into frame graph entries. 49 | auto flame_graph_generator = perf::analyzer::FlameGraphGenerator{}; 50 | flame_graph_generator.map(samples, "flamegraphs.txt"); 51 | 52 | std::cout << "Wrote samples into flamegraphs.txt" << std::endl; 53 | std::cout << "You can upload the flamgraphs.txt here: https://flamegraph.com/" << std::endl; 54 | 55 | /// Close the sampler. 56 | /// Note that the sampler can only be closed after reading the samples. 57 | sampler.close(); 58 | 59 | return 0; 60 | } -------------------------------------------------------------------------------- /test/access_benchmark.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | 6 | namespace perf::test { 7 | 8 | /** 9 | * Generator for unique and zipf data sets. 10 | */ 11 | class DataGenerator 12 | { 13 | public: 14 | [[nodiscard]] static std::vector generate_unique(std::size_t size); 15 | 16 | private: 17 | [[nodiscard]] static std::vector alphabet(std::size_t size); 18 | 19 | [[nodiscard]] static std::vector lookup_table(double zipf_param, const std::vector& alphabet); 20 | }; 21 | 22 | /** 23 | * Benchmark accessing benchmarks in random or sequential order. 24 | * This is an example to demonstrate the perfcpp library. 25 | */ 26 | class AccessBenchmark 27 | { 28 | public: 29 | /** 30 | * Object sized of one cache line. 
31 | */ 32 | struct alignas(64U) cache_line 33 | { 34 | cache_line() noexcept = default; 35 | explicit cache_line(const std::uint64_t value_) noexcept 36 | : value(value_) 37 | { 38 | } 39 | ~cache_line() noexcept = default; 40 | 41 | std::uint64_t value; 42 | }; 43 | 44 | AccessBenchmark(bool is_random, std::uint64_t access_data_size_in_mb, bool is_write = false); 45 | ~AccessBenchmark() = default; 46 | 47 | /** 48 | * @return Number of cache lines. 49 | */ 50 | [[nodiscard]] std::size_t size() const noexcept { return _indices.size(); } 51 | 52 | /** 53 | * Grant access to the i-th cache line, considering the defined access order. 54 | * 55 | * @param index Index of the cache line to access. 56 | * @return Cache line. 57 | */ 58 | [[nodiscard]] const cache_line& operator[](const std::size_t index) const noexcept 59 | { 60 | return _data_to_read[_indices[index]]; 61 | } 62 | 63 | void set(const std::size_t index, const std::uint64_t value) { _data_to_write[_indices[index]].value = value; } 64 | 65 | void run(); 66 | 67 | [[nodiscard]] const std::vector& indices() const noexcept { return _indices; } 68 | [[nodiscard]] const std::vector& data_to_read() const noexcept { return _data_to_read; } 69 | 70 | private: 71 | /// Indices, defining the order in which the memory chunk is accessed. 72 | std::vector _indices; 73 | 74 | /// Memory chunk that is read during the benchmark. 75 | std::vector _data_to_read; 76 | 77 | /// Memory chunk that is written during the benchmark. 78 | std::vector _data_to_write; 79 | }; 80 | } -------------------------------------------------------------------------------- /script/create_perf_list.py: -------------------------------------------------------------------------------- 1 | import re 2 | import subprocess 3 | import os 4 | 5 | 6 | codes_regex = re.compile(r'Codes\s+:\s?([a-zA-Z0-9]+)') 7 | 8 | def extract_events(content: str): 9 | # Find the event name 10 | name_match = re.search(r"Name\s+:\s+([a-zA-Z0-9\_\-]+)", content) 11 | if not name_match: 12 | return [] 13 | event_name = name_match.group(1) 14 | 15 | # Find all Umask entries and their descriptions 16 | umask_matches = re.findall(r"Umask-\d+\s+:\s+0x[0-9a-fA-F]+\s+:\s+PMU\s+:\s+\[(.*?)\]", content) 17 | 18 | if not umask_matches: 19 | return [event_name] 20 | 21 | # Combine the event name with each Umask 22 | return [f"{event_name}.{umask.replace(' ', '_')}" for umask in umask_matches] 23 | 24 | ## Clone or pull the repository 25 | if not os.path.exists('libpfm4'): 26 | subprocess.run(['git', 'clone', '-b', 'master', '--single-branch', 'https://github.com/wcohen/libpfm4.git', 'libpfm4'], stdout=subprocess.PIPE) 27 | else: 28 | os.chdir('libpfm4') 29 | subprocess.run(['git', 'pull'], stdout=subprocess.PIPE) 30 | os.chdir('..') 31 | 32 | ## Make the libpfm4 lib 33 | os.chdir('libpfm4') 34 | subprocess.run(['make'], stdout=subprocess.PIPE) 35 | 36 | ## Read all the event infos 37 | events_result = subprocess.run(['examples/showevtinfo'], stdout=subprocess.PIPE) 38 | counters = [] 39 | 40 | ## Transform into counters 41 | for events_content in str(events_result.stdout).split('#-----------------------------'): 42 | events = extract_events(events_content) 43 | for event_to_check in events: 44 | event_result = subprocess.run([f'examples/check_events', event_to_check], stdout=subprocess.PIPE) 45 | codes_match = re.search(r"Codes\s+:\s+(0x[0-9a-fA-F]+)", str(event_result.stdout)) 46 | 47 | if codes_match: 48 | counters.append((event_to_check, codes_match.group(1))) 49 | 50 | with open('../perf_list.csv', 'w') as 
perf_out_file: 51 | for counter in counters: 52 | perf_out_file.write(f'{counter[0]},{counter[1]}\n') 53 | 54 | print('-------------------------------------------------------------------------------------------------------------------------------------------------') 55 | print('WARNING: The \'perf-list\' target is deprecated and will be removed with perf-cpp v0.14! Use the processor-specific event files in events/ instead.') 56 | print('-------------------------------------------------------------------------------------------------------------------------------------------------') 57 | print(f'Wrote {len(counters)} counter definitions to \'perf_list.csv\'.') 58 | 59 | -------------------------------------------------------------------------------- /test/access_benchmark.cpp: -------------------------------------------------------------------------------- 1 | #include "access_benchmark.h" 2 | #include 3 | #include 4 | #include 5 | 6 | perf::test::AccessBenchmark::AccessBenchmark(const bool is_random, 7 | const std::uint64_t access_data_size_in_mb, 8 | const bool is_write) 9 | { 10 | const auto count_cache_lines = (access_data_size_in_mb * 1024U * 1024U) / sizeof(cache_line); 11 | 12 | /// Fill the data array with some unique. 13 | this->_data_to_read.reserve(count_cache_lines); 14 | for (const auto item : DataGenerator::generate_unique(count_cache_lines)) { 15 | this->_data_to_read.emplace_back(item); 16 | } 17 | 18 | if (is_write) { 19 | this->_data_to_write.resize(count_cache_lines); 20 | } 21 | 22 | /// Create the access pattern by filling the indices and shuffle, if we want a 23 | /// random access pattern. 24 | this->_indices.resize(count_cache_lines); 25 | std::iota(this->_indices.begin(), this->_indices.end(), 0U); 26 | 27 | if (is_random) { 28 | std::shuffle(this->_indices.begin(), this->_indices.end(), std::mt19937{ std::random_device{}() }); 29 | } 30 | } 31 | 32 | std::vector 33 | perf::test::DataGenerator::generate_unique(const std::size_t size) 34 | { 35 | /// Create a list for the tuples. 36 | auto relation = std::vector{}; 37 | relation.reserve(size); 38 | 39 | /// Create tuples. 40 | auto generator = std::mt19937{ 864896UL }; 41 | auto distribution = std::uniform_int_distribution{}; 42 | for (auto i = 0ULL; i < size; ++i) { 43 | relation.emplace_back(distribution(generator)); 44 | } 45 | 46 | /// Shuffle the relation. 47 | std::shuffle(relation.begin(), relation.end(), generator); 48 | 49 | return relation; 50 | } 51 | 52 | void 53 | perf::test::AccessBenchmark::run() 54 | { 55 | const auto is_readonly = this->_data_to_read.size() != this->_data_to_write.size(); 56 | auto value = 0ULL; 57 | 58 | if (is_readonly) { 59 | for (auto index = 0U; index < this->size(); ++index) { 60 | value += this->_data_to_read[this->_indices[index]].value; 61 | } 62 | } else { 63 | for (auto index = 0U; index < this->size(); ++index) { 64 | value += this->_data_to_read[this->_indices[index]].value; 65 | 66 | this->_data_to_write[this->_indices[index]].value = value; 67 | } 68 | } 69 | 70 | asm volatile("" 71 | : "+r,m"(value) 72 | : 73 | : "memory"); /// We do not want the compiler to optimize away 74 | /// this unused value. 
75 | } -------------------------------------------------------------------------------- /docs/perf-paranoid.md: -------------------------------------------------------------------------------- 1 | # Perf Paranoid 2 | 3 | This section explains the *perf paranoid* setting in Linux, which controls access to performance monitoring features and can restrict unprivileged users from using performance counters. 4 | 5 | ## Understanding Perf Paranoid 6 | The *perf paranoid* level is defined in `/proc/sys/kernel/perf_event_paranoid`, with values ranging from *highly restrictive* to *fully permissive*: 7 | 8 | | Value | Access Level | 9 | |--------|----------------------------------------------------------------------| 10 | | `-1` | No restrictions (full access). | 11 | | `0` | Allow normal users access, but no raw tracepoint samples. | 12 | | `1` | Allow user and kernel-level profiling (default before Linux `4.6`). | 13 | | `>= 2` | Only user-level measurements allowed (**default since Linux** `4.6`). | 14 | 15 | 16 | If the setting is too restrictive, you may encounter errors like: 17 | 18 | Cannot open perf counter: insufficient access rights to start the counter, 19 | e.g., profiling a not user-owned process or perf_event_paranoid value too high. 20 | 21 | This can be resolved either by [modifying the paranoid level](#setting-the-perf-paranoid-value) or [adjusting monitoring settings](#adjusting-monitoring-configuration). 22 | 23 | ## Setting the Perf Paranoid Value 24 | To enable full access (if permitted by system policy), you can lower the paranoid value **temporarily** with: 25 | 26 | ```bash 27 | sudo sysctl -w kernel.perf_event_paranoid=-1 28 | ``` 29 | 30 | For a **persistent** change, add the following line to `/etc/sysctl.conf`: 31 | 32 | ``` 33 | kernel.perf_event_paranoid = -1 34 | ``` 35 | 36 | Then apply changes with: 37 | 38 | ```bash 39 | sudo sysctl --system 40 | ``` 41 | 42 | ## Adjusting Monitoring Configuration 43 | If you **cannot modify the paranoid level**, you may still be able to record user-level events only. 44 | Use the `perf::Config` class to **disable kernel/hypervisor-level** measurements, which allows profiling under restrictive `perf_event_paranoid` settings (`>= 2`). 45 | 46 | ```cpp 47 | const auto counter_definitions = perf::CounterDefinition{}; 48 | 49 | auto config = perf::Config{}; 50 | config.include_kernel(false); /// Disable kernel event sampling 51 | config.include_hypervisor(false); /// Disable hypervisor event sampling 52 | 53 | auto event_counter = perf::EventCounter{ counter_definitions, config }; 54 | event_counter.add(...); 55 | 56 | event_counter.start(); /// Will only record user-level events. 57 | ``` 58 | 59 | To further restrict monitoring exclude guest events: 60 | 61 | ```cpp 62 | config.include_guest(false); 63 | ``` -------------------------------------------------------------------------------- /examples/statistics/live_events.cpp: -------------------------------------------------------------------------------- 1 | #include "../access_benchmark.h" 2 | #include "perfcpp/event_counter.h" 3 | #include 4 | 5 | int 6 | main() 7 | { 8 | std::cout 9 | << "libperf-cpp example: Record a events and live events where the latter can be read without stopping the " 10 | "hardware performance counters. As a benchmark, we use random access to an in-memory array multiple times." 11 | << std::endl; 12 | 13 | auto event_counter = perf::EventCounter{}; 14 | 15 | try { 16 | /// Add counters that are recorded over the entire period (from start to end). 
17 | // event_counter.add({ "cycles", "instructions", "cache-references", "cache-misses", "branches" }); 18 | 19 | /// Add live counters that can be read without stopping the EventCounter. 20 | event_counter.add_live(std::vector{ "cache-references", "cache-misses", "branches" }); 21 | } catch (std::runtime_error& e) { 22 | std::cerr << e.what() << std::endl; 23 | return 1; 24 | } 25 | 26 | /// Create random access benchmark. 27 | auto benchmark = perf::example::AccessBenchmark{ /*randomize the accesses*/ true, 28 | /* create benchmark of 512 MB */ 512U }; 29 | 30 | /// Access to live events. Needs to be initiated after adding all live events. 31 | auto live_events = perf::LiveEventCounter{ event_counter }; 32 | 33 | /// Start recording. 34 | try { 35 | event_counter.start(); 36 | } catch (std::runtime_error& exception) { 37 | std::cerr << exception.what() << std::endl; 38 | return 1; 39 | } 40 | 41 | /// Execute the benchmark (accessing cache lines in a random order). 42 | constexpr auto iterations = 20U; 43 | for (auto i = 0U; i < iterations; ++i) { 44 | /// Read current values of live events and mark them as "start" values. 45 | live_events.start(); 46 | 47 | /// Perform benchmark. 48 | auto value = 0ULL; 49 | for (auto index = 0U; index < benchmark.size(); ++index) { 50 | value += benchmark[index].value; 51 | } 52 | 53 | /// We do not want the compiler to optimize away this (otherwise) unused value (and consequently the loop above). 54 | benchmark.pretend_to_use(value); 55 | 56 | /// Read the current counter value after the benchmark. 57 | live_events.stop(); 58 | 59 | /// Print the live values. 60 | std::cout << "Live results: " << live_events.get("cache-references", benchmark.size()) << " cache-references, " 61 | << live_events.get("cache-misses", benchmark.size()) << " cache-misses, " 62 | << live_events.get("branches", benchmark.size()) << " branches" << std::endl; 63 | } 64 | 65 | /// Stop recording counters. 66 | event_counter.stop(); 67 | 68 | return 0; 69 | } -------------------------------------------------------------------------------- /examples/sampling/perf_record.cpp: -------------------------------------------------------------------------------- 1 | #include "../access_benchmark.h" 2 | #include "perfcpp/hardware_info.h" 3 | #include "perfcpp/sampler.h" 4 | #include 5 | 6 | int 7 | main() 8 | { 9 | std::cout << "libperf-cpp example: Record perf samples including time, " 10 | "logical memory address, latency, data source, and instruction and write as a perf data file `perf.dat`. " 11 | << std::endl; 12 | 13 | /// Initialize sampler. 14 | auto sampler = perf::Sampler{}; 15 | 16 | /// Setup which counters trigger the writing of samples (depends on the underlying hardware substrate). 17 | if (perf::HardwareInfo::is_amd_ibs_supported()) { 18 | sampler.trigger("ibs_op_uops", perf::Precision::MustHaveZeroSkid, perf::Period{ 16000 }); 19 | } else if (perf::HardwareInfo::is_intel()) { 20 | sampler.trigger("mem-loads", perf::Precision::MustHaveZeroSkid, perf::Period{ 16000 }); 21 | } else { 22 | std::cout << "Error: Memory sampling is not supported on this CPU." << std::endl; 23 | return 1; 24 | } 25 | 26 | /// Setup which data will be included into samples (timestamp, virtual memory address, data source like L1d or RAM, 27 | /// latency, instruction address, thread id, and the callstack). 28 | sampler.values().timestamp(true).logical_memory_address(true).data_source(true).latency(true).instruction_pointer(true).thread_id(true).callchain(true); 29 | 30 | /// Start sampling. 
31 | try { 32 | sampler.start(); 33 | } catch (std::runtime_error& exception) { 34 | std::cerr << exception.what() << std::endl; 35 | return 1; 36 | } 37 | 38 | /// Create random access benchmark. 39 | auto benchmark = perf::example::AccessBenchmark{ /*randomize the accesses*/ true, 40 | /* create benchmark of 1024 MB */ 1024U }; 41 | 42 | /// Execute the benchmark (accessing cache lines in a random order). 43 | auto value = 0ULL; 44 | for (auto index = 0U; index < benchmark.size(); ++index) { 45 | value += benchmark[index].value; 46 | } 47 | 48 | /// We do not want the compiler to optimize away this (otherwise) unused value (and consequently the loop above). 49 | benchmark.pretend_to_use(value); 50 | 51 | /// Stop sampling. 52 | sampler.stop(); 53 | 54 | /// Write sample results to perf file. 55 | sampler.to_perf_file("perf.data"); 56 | 57 | std::cout << "Wrote " << sampler.result().size() << " samples to `perf.data`." 58 | << "\n Run `perf report` to show overhead per symbol" 59 | << "\n Run `perf mem report` to show overhead per data object" 60 | << std::endl; 61 | 62 | 63 | /// Close the sampler. 64 | /// Note that the sampler can only be closed after reading the samples. 65 | sampler.close(); 66 | 67 | return 0; 68 | } -------------------------------------------------------------------------------- /include/perfcpp/counter_result.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | namespace perf { 10 | class CounterResult 11 | { 12 | public: 13 | using iterator = std::vector>::iterator; 14 | using const_iterator = std::vector>::const_iterator; 15 | 16 | CounterResult() = default; 17 | CounterResult(CounterResult&&) noexcept = default; 18 | CounterResult(const CounterResult&) = default; 19 | explicit CounterResult(std::vector>&& results) noexcept 20 | : _results(std::move(results)) 21 | { 22 | } 23 | 24 | ~CounterResult() = default; 25 | 26 | CounterResult& operator=(CounterResult&&) noexcept = default; 27 | CounterResult& operator=(const CounterResult&) = default; 28 | 29 | /** 30 | * Access the result of the counter or metric with the given name. 31 | * 32 | * @param name Name of the counter or metric to access. 33 | * @return The value, or std::nullopt of the result has no counter or value with the requested name. 34 | */ 35 | [[nodiscard]] std::optional get(std::string_view name) const noexcept; 36 | 37 | [[nodiscard]] std::optional operator[](const std::string_view name) const noexcept { return get(name); } 38 | 39 | [[nodiscard]] iterator begin() { return _results.begin(); } 40 | [[nodiscard]] iterator end() { return _results.end(); } 41 | [[nodiscard]] const_iterator begin() const { return _results.begin(); } 42 | [[nodiscard]] const_iterator end() const { return _results.end(); } 43 | 44 | /** 45 | * Adds the given result to the end of the results. 46 | * 47 | * @param name Name of the result. 48 | * @param value Value of the result. 49 | */ 50 | void emplace_back(const std::string_view name, const double value) { _results.emplace_back(name, value); } 51 | 52 | /** 53 | * Converts the result to a json-formatted string. 54 | * @return Result in JSON format. 55 | */ 56 | [[nodiscard]] std::string to_json() const; 57 | 58 | /** 59 | * Converts the result to a CSV-formatted string. 60 | * 61 | * @param delimiter Char to separate columns (',' by default). 62 | * @param print_header If true, the header will be printed first (true by default). 
63 | * @return Result in CSV format. 64 | */ 65 | [[nodiscard]] std::string to_csv(char delimiter = ',', bool print_header = true) const; 66 | 67 | /** 68 | * Converts the result to a table-formatted string. 69 | * @return Result as a table-formatted string. 70 | */ 71 | [[nodiscard]] std::string to_string() const; 72 | 73 | private: 74 | std::vector> _results; 75 | }; 76 | } -------------------------------------------------------------------------------- /include/perfcpp/util/unique_file_descriptor.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | #include 5 | 6 | namespace perf::util { 7 | /** 8 | * The unique file descriptor is backed by a common file descriptor; however, it does not allow copying. 9 | * Destroying the unique file descriptor leads to closing the underyling file descriptor – comparable to a unique 10 | * pointer. 11 | */ 12 | class UniqueFileDescriptor 13 | { 14 | public: 15 | UniqueFileDescriptor() noexcept = default; 16 | 17 | explicit UniqueFileDescriptor(const std::int64_t file_descriptor) noexcept 18 | : _file_descriptor(static_cast(file_descriptor)) 19 | { 20 | } 21 | 22 | UniqueFileDescriptor(UniqueFileDescriptor&& other) noexcept 23 | : _file_descriptor(std::exchange(other._file_descriptor, -1L)) 24 | { 25 | } 26 | 27 | /** 28 | * Closes the file descriptor underneath. 29 | */ 30 | ~UniqueFileDescriptor() 31 | { 32 | if (has_value()) { 33 | ::close(value()); 34 | } 35 | } 36 | 37 | UniqueFileDescriptor& operator=(UniqueFileDescriptor&& other) noexcept 38 | { 39 | _file_descriptor = std::exchange(other._file_descriptor, -1L); 40 | return *this; 41 | } 42 | 43 | UniqueFileDescriptor& operator=(const std::int64_t file_descriptor) noexcept 44 | { 45 | _file_descriptor = static_cast(file_descriptor); 46 | return *this; 47 | } 48 | 49 | UniqueFileDescriptor& operator=(const std::int32_t file_descriptor) noexcept 50 | { 51 | _file_descriptor = file_descriptor; 52 | return *this; 53 | } 54 | 55 | /** 56 | * @return True, if the filter descriptor underneath is opened. 57 | */ 58 | [[nodiscard]] bool has_value() const noexcept { return _file_descriptor > -1LL; } 59 | 60 | /** 61 | * @return The "real" filter descriptor. 62 | */ 63 | [[nodiscard]] std::int32_t value() const noexcept { return _file_descriptor; } 64 | 65 | private: 66 | std::int32_t _file_descriptor{ -1LL }; 67 | }; 68 | 69 | /** 70 | * The file descriptor view grants access to a file descriptor without closing it when destroyed. 71 | * Logically, the view does not own the file descriptor; it can be outdated. 72 | */ 73 | class FileDescriptorView 74 | { 75 | public: 76 | FileDescriptorView() noexcept = default; 77 | 78 | explicit FileDescriptorView(const UniqueFileDescriptor& file_descriptor) noexcept 79 | : _file_descriptor(file_descriptor.value()) 80 | { 81 | } 82 | FileDescriptorView(const FileDescriptorView&) noexcept = default; 83 | 84 | ~FileDescriptorView() noexcept = default; 85 | 86 | /** 87 | * @return True, if the filter descriptor underneath is opened. 88 | */ 89 | [[nodiscard]] bool has_value() const noexcept { return _file_descriptor > -1LL; } 90 | 91 | /** 92 | * @return The "real" filter descriptor. 
93 | */ 94 | [[nodiscard]] std::int32_t value() const noexcept { return _file_descriptor; } 95 | 96 | private: 97 | std::int32_t _file_descriptor{ -1LL }; 98 | }; 99 | } -------------------------------------------------------------------------------- /events/x86/intel/panther-lake.csv: -------------------------------------------------------------------------------- 1 | br_inst_retired.all_branches, 0xc4 2 | br_misp_retired.all_branches, 0xc5 3 | cpu_clk_unhalted.core_p, 0x3c 4 | cpu_clk_unhalted.ref_tsc_p, 0x13c 5 | cpu_clk_unhalted.thread_p, 0x3c 6 | dtlb_load_misses.walk_completed, 0xe08 7 | dtlb_store_misses.walk_completed, 0xe49 8 | icache.accesses, 0x380 9 | icache.misses, 0x280 10 | idq_bubbles.core, 0x19c 11 | inst_retired.any_p, 0xc0 12 | itlb_misses.walk_completed, 0xe85 13 | l2_request.all, 0x1ff24 14 | l2_rqsts.all_code_rd, 0xe424 15 | l2_rqsts.all_demand_data_rd, 0xe124 16 | ld_blocks.store_forward, 0x203 17 | longest_lat_cache.miss, 0x412e 18 | longest_lat_cache.reference, 0x4f2e 19 | mem_inst_retired.all_loads, 0x81d0 20 | mem_inst_retired.all_stores, 0x82d0 21 | mem_trans_retired.load_latency_gt_1024, 0x1cd, 0x400 22 | mem_trans_retired.load_latency_gt_128, 0x1cd, 0x80 23 | mem_trans_retired.load_latency_gt_16, 0x1cd, 0x10 24 | mem_trans_retired.load_latency_gt_2048, 0x1cd, 0x800 25 | mem_trans_retired.load_latency_gt_256, 0x1cd, 0x100 26 | mem_trans_retired.load_latency_gt_32, 0x1cd, 0x20 27 | mem_trans_retired.load_latency_gt_4, 0x1cd, 0x4 28 | mem_trans_retired.load_latency_gt_512, 0x1cd, 0x200 29 | mem_trans_retired.load_latency_gt_64, 0x1cd, 0x40 30 | mem_trans_retired.load_latency_gt_8, 0x1cd, 0x8 31 | mem_trans_retired.store_sample, 0x2cd 32 | mem_uops_retired.all_loads, 0x81d0 33 | mem_uops_retired.all_stores, 0x82d0 34 | mem_uops_retired.load_latency_gt_1024, 0x5d0, 0x400 35 | mem_uops_retired.load_latency_gt_128, 0x5d0, 0x80 36 | mem_uops_retired.load_latency_gt_16, 0x5d0, 0x10 37 | mem_uops_retired.load_latency_gt_2048, 0x5d0, 0x800 38 | mem_uops_retired.load_latency_gt_256, 0x5d0, 0x100 39 | mem_uops_retired.load_latency_gt_32, 0x5d0, 0x20 40 | mem_uops_retired.load_latency_gt_4, 0x5d0, 0x4 41 | mem_uops_retired.load_latency_gt_512, 0x5d0, 0x200 42 | mem_uops_retired.load_latency_gt_64, 0x5d0, 0x40 43 | mem_uops_retired.load_latency_gt_8, 0x5d0, 0x8 44 | mem_uops_retired.store_latency, 0x6d0 45 | misc_retired.lbr_inserts, 0x1e4 46 | ocr.demand_data_rd.any_response, 0x1b7, 0x10001 47 | ocr.demand_data_rd.any_response_0, 0x12a, 0x10001 48 | ocr.demand_data_rd.any_response_1, 0x12b, 0x10001 49 | ocr.demand_data_rd.dram, 0x1b7, 0x7bc000001 50 | ocr.demand_data_rd.dram_0, 0x12a, 0x1e780000001 51 | ocr.demand_data_rd.dram_1, 0x12b, 0x1e780000001 52 | ocr.demand_data_rd.l3_miss, 0x1b7, 0x13fbfc00001 53 | ocr.demand_data_rd.l3_miss_0, 0x12a, 0x9e7fa000001 54 | ocr.demand_data_rd.l3_miss_1, 0x12b, 0x9e7fa000001 55 | ocr.demand_rfo.any_response, 0x1b7, 0x10002 56 | ocr.demand_rfo.any_response_0, 0x12a, 0x10002 57 | ocr.demand_rfo.any_response_1, 0x12b, 0x10002 58 | ocr.demand_rfo.l3_miss, 0x1b7, 0x13fbfc00002 59 | ocr.demand_rfo.l3_miss_0, 0x12a, 0x9e7fa000002 60 | ocr.demand_rfo.l3_miss_1, 0x12b, 0x9e7fa000002 61 | topdown.backend_bound_slots, 0x2a4 62 | topdown.slots_p, 0x1a4 63 | topdown_bad_speculation.all_p, 0x73 64 | topdown_be_bound.all, 0x2a4 65 | topdown_be_bound.all_p, 0x2a4 66 | topdown_fe_bound.all_p, 0x19c 67 | topdown_retiring.all_p, 0x2c2 68 | uops_retired.slots, 0x2c2 69 | -------------------------------------------------------------------------------- 
/examples/README.md: -------------------------------------------------------------------------------- 1 | # *perf-cpp* Examples 2 | 3 | We included various examples to teach you how to use *perf-cpp* and leverage hardware performance counter results directly from your application. 4 | 5 | ## How to Build the Examples 6 | 7 | ``` 8 | # Clone the repository 9 | git clone https://github.com/jmuehlig/perf-cpp.git 10 | 11 | # Switch to the cloned folder 12 | cd perf-cpp 13 | 14 | # Generate the Makefile 15 | cmake . -B build -DBUILD_EXAMPLES=1 16 | 17 | # Build the examples 18 | cmake --build build --target examples 19 | ``` 20 | 21 | The examples will be built to `build/examples/bin/`. 22 | 23 | ## List of Examples 24 | ### Counting Hardware Events 25 | - [statistics/single_thread.cpp](statistics/single_thread.cpp) provides an example to record and read performance counters for a specific code segment on a **single** thread. 26 | - [statistics/inherit_thread.cpp](statistics/inherit_thread.cpp) advances the example to record counter statistics not only from one but also for its **child-threads**. 27 | - [statistics/multi_thread.cpp](statistics/multi_thread.cpp) shows how to record performance counter statistics on **multiple** threads. 28 | - [statistics/multi_cpu.cpp](statistics/multi_cpu.cpp) shows how to pin performance counters to **specific CPU cores** instead of focussing on threads and processes. 29 | - [statistics/live_events.cpp](statistics/live_events.cpp) shows how to access hardware counters with **low latency**. 30 | - [statistics/metric.cpp](statistics/metric.cpp) shows how define new metrics. 31 | 32 | 33 | ### Sampling 34 | - [sampling/instruction_pointer.cpp](sampling/instruction_pointer.cpp) provides an example to sample instruction pointers on a single thread. 35 | - [sampling/flame_graph.cpp](sampling/flame_graph.cpp) provides an example to generate a format that can be used by flamegraph generators. 36 | - [sampling/perf_record.cpp](sampling/perf_record.cpp) generates a `perf.data` file that can be read and analyzed via the *Linux perf* tool using `perf report` or `perf mem report`. 37 | - [sampling/memory_address.cpp](sampling/memory_address.cpp) provides an example to sample virtual memory addresses, their latency, and their origin. 38 | - [sampling/counter.cpp](sampling/counter.cpp) shows how to include values of further hardware performance counters into samples. 39 | - [sampling/branch.cpp](sampling/branch.cpp) exemplifies sampling for last branch records and their prediction success. 40 | - [sampling/register.cpp](sampling/register.cpp) provides an example on how to include values of specific registers into samples. 41 | - [sampling/context_switch.cpp](sampling/context_switch.cpp) provides an example that samples context switches on a single thread. 42 | - [sampling/multi_event.cpp](sampling/multi_event.cpp) exemplifies how to use multiple events as a trigger using Intel counters as an example. 43 | - [sampling/multi_thread.cpp)](sampling/multi_thread.cpp) explains how to sample data on multiple threads at the same time. 44 | - [sampling/multi_cpu.cpp](sampling/multi_cpu.cpp) provides an example that monitors multiple CPU cores and records samples. 
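### Minimal Counting Example (Sketch)

The counting examples above all follow the structure of [statistics/single_thread.cpp](statistics/single_thread.cpp): create a `perf::EventCounter`, add the events to record, wrap the code of interest in `start()`/`stop()`, and read the result. A condensed sketch (error handling omitted; the event names and the normalization factor passed to `result()` are illustrative):

```cpp
#include "perfcpp/event_counter.h"
#include <iostream>

int
main()
{
  auto event_counter = perf::EventCounter{};
  event_counter.add({ "instructions", "cycles" });

  event_counter.start();
  /// ... the code you want to measure ...
  event_counter.stop();

  /// Read the counters; the argument normalizes the values
  /// (single_thread.cpp, for example, normalizes per accessed cache line).
  const auto result = event_counter.result(1U);
  for (const auto& [counter_name, counter_value] : result) {
    std::cout << counter_value << " " << counter_name << std::endl;
  }

  return 0;
}
```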
45 | -------------------------------------------------------------------------------- /include/perfcpp/feature.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | /// The features of the perf subsystem have evolved over time (more precisely over Linux Kernel generations). 6 | /// In this file, we define some preprocessor variables to keep up with older Linux Kernel versions without yielding 7 | /// errors at compile- and runtime. 8 | /// The documentation for the perf_event_open system call (https://man7.org/linux/man-pages/man2/perf_event_open.2.html) 9 | /// has a great overview of features added in various versions. 10 | /// For the time being, we support Linux 4.0 and newer. 11 | 12 | #if LINUX_VERSION_CODE < KERNEL_VERSION(4, 1, 0) 13 | #define PERFCPP_NO_ERROR_EBUSY 14 | #define PERFCPP_NO_MMAP_DATA_SIZE 15 | #endif 16 | 17 | #if LINUX_VERSION_CODE < KERNEL_VERSION(4, 2, 0) 18 | #define PERFCPP_NO_SAMPLE_BRANCH_IND_JUMP 19 | #define PERFCPP_NO_RECORD_LOST_SAMPLES 20 | #endif 21 | 22 | #if LINUX_VERSION_CODE < KERNEL_VERSION(4, 3, 0) 23 | #define PERFCPP_NO_RECORD_SWITCH 24 | #define PERFCPP_NO_BRANCH_STACK_CYCLES 25 | #endif 26 | 27 | #if LINUX_VERSION_CODE < KERNEL_VERSION(4, 4, 0) 28 | #define PERFCPP_NO_SAMPLE_BRANCH_CALL 29 | #define PERFCPP_NO_COUNT_SW_BPF_OUTPUT 30 | #endif 31 | 32 | #if LINUX_VERSION_CODE < KERNEL_VERSION(4, 8, 0) 33 | #define PERFCPP_NO_SAMPLE_MAX_STACK 34 | #endif 35 | 36 | #if LINUX_VERSION_CODE < KERNEL_VERSION(4, 13, 0) 37 | #define PERFCPP_NO_SAMPLE_PHYS_ADDR 38 | #endif 39 | 40 | #if LINUX_VERSION_CODE < KERNEL_VERSION(4, 14, 0) 41 | #define PERFCPP_NO_MEM_REMOTE 42 | #define PERFCPP_NO_MEM_SNOOPX 43 | #endif 44 | 45 | #if LINUX_VERSION_CODE < KERNEL_VERSION(4, 17, 0) 46 | #define PERFCPP_NO_RECORD_MISC_SWITCH_OUT_PREEMPT 47 | #endif 48 | 49 | #if LINUX_VERSION_CODE < KERNEL_VERSION(5, 7, 0) 50 | #define PERFCPP_NO_RECORD_CGROUP 51 | #define PERFCPP_NO_SAMPLE_CGROUP 52 | #endif 53 | 54 | #if LINUX_VERSION_CODE < KERNEL_VERSION(5, 11, 0) 55 | #define PERFCPP_NO_SAMPLE_DATA_PAGE_SIZE 56 | #define PERFCPP_NO_SAMPLE_CODE_PAGE_SIZE 57 | #endif 58 | 59 | #if LINUX_VERSION_CODE < KERNEL_VERSION(5, 12, 0) 60 | #define PERFCPP_NO_SAMPLE_WEIGHT_STRUCT 61 | #define PERFCPP_NO_MEM_BLK 62 | #endif 63 | 64 | #if LINUX_VERSION_CODE < KERNEL_VERSION(5, 13, 0) 65 | #define PERFCPP_NO_CGROUP_SWITCHES 66 | #define PERFCPP_NO_INHERIT_THREAD 67 | #endif 68 | 69 | #if LINUX_VERSION_CODE < KERNEL_VERSION(5, 16, 0) 70 | #define PERFCPP_NO_MEM_HOPS_0 71 | #endif 72 | 73 | #if LINUX_VERSION_CODE < KERNEL_VERSION(5, 17, 0) 74 | #define PERFCPP_NO_MEM_HOPS_1_3 75 | #endif 76 | 77 | #if LINUX_VERSION_CODE < KERNEL_VERSION(6, 0, 0) 78 | #define PERFCPP_NO_FORMAT_LOST 79 | #endif 80 | 81 | #if LINUX_VERSION_CODE < KERNEL_VERSION(6, 1, 0) 82 | #define PERFCPP_NO_MEM_LVLNUM 83 | #define PERFCPP_NO_MEM_LVLNUM_PMEM 84 | #define PERFCPP_NO_MEM_LVLNUM_IO 85 | #define PERFCPP_NO_MEM_LVLNUM_CXL 86 | #define PERFCPP_NO_MEM_SNOOPX_PEER 87 | #endif 88 | 89 | #if LINUX_VERSION_CODE < KERNEL_VERSION(6, 6, 0) 90 | #define PERFCPP_NO_MEM_LVLNUM_UNC 91 | #endif 92 | 93 | #if LINUX_VERSION_CODE < KERNEL_VERSION(6, 11, 0) 94 | #define PERFCPP_NO_MEM_LVLNUM_L2_MHB 95 | #endif 96 | -------------------------------------------------------------------------------- /include/perfcpp/metric/expression/tokenizer.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "token.h" 
4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | namespace perf::metric::expression { 11 | /** 12 | * The Tokenizer translates a given input string into a queue of tokens. 13 | */ 14 | class Tokenizer 15 | { 16 | public: 17 | explicit Tokenizer(std::string&& input) 18 | : _input(std::move(input)) 19 | { 20 | } 21 | 22 | /** 23 | * @return The original input. 24 | */ 25 | [[nodiscard]] const std::string& input() const noexcept { return _input; } 26 | 27 | /** 28 | * Tokenizes the input string. 29 | * @return Queue of tokens. 30 | */ 31 | [[nodiscard]] std::optional next(); 32 | 33 | private: 34 | /// The expression to tokenize. 35 | const std::string _input; 36 | 37 | std::size_t _position{ 0U }; 38 | 39 | /** 40 | * @return Position after skipping all whitespaces. 41 | */ 42 | [[nodiscard]] std::size_t skip_whitespaces() const noexcept; 43 | 44 | /** 45 | * Reads a constant number (e.g., 13.37) from the input string, starting at the given position. 46 | * 47 | * @param begin Position within the input string. 48 | * @return A tuple (token containing the constant, new position). 49 | */ 50 | [[nodiscard]] std::pair read_constant(std::size_t begin) const; 51 | 52 | /** 53 | * Reads an identifier (e.g., a hardware counter name) from the input string, starting at the given position. 54 | * 55 | * @param begin Position within the input string. 56 | * @return A tuple (token containing the identifier, new position). 57 | */ 58 | [[nodiscard]] std::pair read_identifier(std::size_t begin) const; 59 | 60 | /** 61 | * Reads an operator (e.g., +) from the given char. 62 | * 63 | * @param current_char Current char from input string. 64 | * @return The metric operator. 65 | */ 66 | [[nodiscard]] Operator_ read_operator(char current_char) const; 67 | 68 | /** 69 | * Checks if the given char is an escape character. 70 | * 71 | * @param current_char Current character. 72 | * @return True, if the given char is an escape character. 73 | */ 74 | [[nodiscard]] static bool is_escape_char(char current_char) noexcept 75 | { 76 | return current_char == '\'' || current_char == '`'; 77 | } 78 | 79 | /** 80 | * Checks if the given char could belong to an identifier (alphanumerical chars, _, ., etc.). 81 | * 82 | * @param char_ Char to check. 83 | * @return True, if the char could belong to an identifier. 84 | */ 85 | [[nodiscard]] static bool is_identifier_char(const char char_) noexcept 86 | { 87 | return std::isalnum(char_) || char_ == '_' || char_ == '.'; 88 | } 89 | 90 | /** 91 | * Checks if the given char is a scientific 'e'. 92 | * 93 | * @param char_ Char to check. 94 | * @return True, if the char could is a scientific 'e'. 
95 | */ 96 | [[nodiscard]] static bool is_scientific_e(const char char_) noexcept { return char_ == 'e' || char_ == 'E'; } 97 | }; 98 | } -------------------------------------------------------------------------------- /src/counter_result.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | std::optional 7 | perf::CounterResult::get(std::string_view name) const noexcept 8 | { 9 | if (const auto result_iterator = std::find_if( 10 | this->_results.begin(), this->_results.end(), [&name](const auto res) { return name == res.first; }); 11 | result_iterator != this->_results.end()) { 12 | return result_iterator->second; 13 | } 14 | 15 | return std::nullopt; 16 | } 17 | 18 | std::string 19 | perf::CounterResult::to_json() const 20 | { 21 | auto json_stream = std::stringstream{}; 22 | 23 | json_stream << "{"; 24 | 25 | for (auto i = 0U; i < this->_results.size(); ++i) { 26 | if (i > 0U) { 27 | json_stream << ","; 28 | } 29 | 30 | json_stream << "\"" << this->_results[i].first << "\": " << this->_results[i].second; 31 | } 32 | 33 | json_stream << "}"; 34 | 35 | return json_stream.str(); 36 | } 37 | 38 | std::string 39 | perf::CounterResult::to_csv(const char delimiter, const bool print_header) const 40 | { 41 | auto csv_stream = std::stringstream{}; 42 | 43 | if (print_header) { 44 | csv_stream << "counter" << delimiter << "value\n"; 45 | } 46 | 47 | for (auto i = 0U; i < this->_results.size(); ++i) { 48 | if (i > 0U) { 49 | csv_stream << "\n"; 50 | } 51 | 52 | csv_stream << this->_results[i].first << delimiter << this->_results[i].second; 53 | } 54 | 55 | return csv_stream.str(); 56 | } 57 | 58 | std::string 59 | perf::CounterResult::to_string() const 60 | { 61 | auto result = std::vector>{}; 62 | result.reserve(this->_results.size()); 63 | 64 | /// Default column lengths, equal to the header. 65 | auto max_name_length = 12UL, max_value_length = 5UL; 66 | 67 | /// Collect counter names and values as strings. 68 | for (const auto& [name, value] : this->_results) { 69 | auto value_string = std::to_string(value); 70 | 71 | max_name_length = std::max(max_name_length, name.size()); 72 | max_value_length = std::max(max_value_length, value_string.size()); 73 | 74 | result.emplace_back(name, std::move(value_string)); 75 | } 76 | 77 | /// Format the counters as a table. 78 | auto table_stream = std::stringstream{}; 79 | table_stream 80 | /// Print the header. 81 | << "| Value" << std::setw(static_cast(max_value_length) - 4) << " " << "| Counter" 82 | << std::setw(static_cast(max_name_length) - 6) << " " 83 | << "|\n" 84 | 85 | /// Print the separator line. 86 | << "|" << std::string(max_value_length + 2U, '-') << "|" << std::string(max_name_length + 2U, '-') << "|"; 87 | 88 | /// Print the results as columns. 
89 | for (const auto& [name, value] : result) { 90 | table_stream << "\n| " << std::setw(static_cast(max_value_length)) << value << " | " << name 91 | << std::setw(static_cast(max_name_length - name.size()) + 1) << " " << "|"; 92 | } 93 | 94 | table_stream << std::flush; 95 | 96 | return table_stream.str(); 97 | } -------------------------------------------------------------------------------- /include/perfcpp/analyzer/flame_graph_generator.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | namespace perf::analyzer { 13 | class FlameGraphGenerator 14 | { 15 | public: 16 | FlameGraphGenerator() = default; 17 | ~FlameGraphGenerator() = default; 18 | 19 | [[nodiscard]] std::vector, std::uint64_t>> map( 20 | const std::vector& samples) const; 21 | 22 | [[nodiscard]] std::vector, std::uint64_t>> map( 23 | const std::vector& samples, 24 | std::function::const_iterator begin, std::vector::const_iterator end)> 25 | mapper) const; 26 | 27 | void map(const std::vector& samples, std::string&& out_file_path) const 28 | { 29 | return map(samples, out_file_path); 30 | } 31 | 32 | void map(const std::vector& samples, const std::string& out_file_path) const; 33 | 34 | void map(const std::vector& samples, 35 | std::function::const_iterator begin, 36 | std::vector::const_iterator end)> mapper, 37 | std::string&& out_file_path) const 38 | { 39 | map(samples, std::move(mapper), out_file_path); 40 | } 41 | 42 | void map(const std::vector& samples, 43 | std::function::const_iterator begin, 44 | std::vector::const_iterator end)> mapper, 45 | const std::string& out_file_path) const; 46 | 47 | private: 48 | class SymbolCache 49 | { 50 | public: 51 | explicit SymbolCache(const SymbolResolver& symbol_resolver) noexcept 52 | : _symbol_resolver(symbol_resolver) 53 | { 54 | } 55 | ~SymbolCache() = default; 56 | 57 | [[nodiscard]] std::optional> symbol( 58 | std::uintptr_t logical_instruction_pointer); 59 | 60 | private: 61 | const SymbolResolver _symbol_resolver; 62 | std::unordered_map>> _cache; 63 | }; 64 | 65 | SymbolResolver _symbol_resolver; 66 | 67 | [[nodiscard]] static bool have_equal_callchains(SymbolCache& symbol_cache, 68 | const Sample& original_sample, 69 | const Sample& follow_up_sample) noexcept; 70 | 71 | [[nodiscard]] static bool have_equal_symbols( 72 | std::optional> first, 73 | std::optional> second) noexcept; 74 | 75 | [[nodiscard]] std::vector resolve_symbols( 76 | const std::optional>& callchain, 77 | std::optional top_logical_instruction_pointer) const; 78 | }; 79 | } -------------------------------------------------------------------------------- /examples/access_benchmark.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | 6 | namespace perf::example { 7 | 8 | /** 9 | * Generator for unique and zipf data sets. 
10 | */ 11 | class DataGenerator 12 | { 13 | public: 14 | [[nodiscard]] static std::vector generate_unique(std::size_t size); 15 | 16 | [[nodiscard]] static std::vector generate_zipf(std::size_t size, 17 | std::size_t alphabet_size, 18 | double zipf_param); 19 | 20 | private: 21 | [[nodiscard]] static std::vector alphabet(std::size_t size); 22 | 23 | [[nodiscard]] static std::vector lookup_table(double zipf_param, const std::vector& alphabet); 24 | }; 25 | 26 | /** 27 | * Benchmark accessing benchmarks in random or sequential order. 28 | * This is an example to demonstrate the perfcpp library. 29 | */ 30 | class AccessBenchmark 31 | { 32 | public: 33 | /** 34 | * Object sized of one cache line. 35 | */ 36 | struct alignas(64U) cache_line 37 | { 38 | cache_line() noexcept = default; 39 | explicit cache_line(const std::uint64_t value_) noexcept 40 | : value(value_) 41 | { 42 | } 43 | ~cache_line() noexcept = default; 44 | 45 | std::uint64_t value; 46 | }; 47 | 48 | AccessBenchmark(bool is_random, std::uint64_t access_data_size_in_mb, bool is_write = false); 49 | ~AccessBenchmark() = default; 50 | 51 | /** 52 | * @return Number of cache lines. 53 | */ 54 | [[nodiscard]] std::size_t size() const noexcept { return _indices.size(); } 55 | 56 | /** 57 | * Grant access to the i-th cache line, considering the defined access order. 58 | * 59 | * @param index Index of the cache line to access. 60 | * @return Cache line. 61 | */ 62 | [[nodiscard]] const cache_line& operator[](const std::size_t index) const noexcept 63 | { 64 | return _data_to_read[_indices[index]]; 65 | } 66 | 67 | void set(const std::size_t index, const std::uint64_t value) { _data_to_write[_indices[index]].value = value; } 68 | 69 | [[nodiscard]] const std::vector& indices() const noexcept { return _indices; } 70 | [[nodiscard]] const std::vector& data_to_read() const noexcept { return _data_to_read; } 71 | 72 | /** 73 | * Makes the compiler think that the result is used – consequently, the optimizer cannot optimize the value away. 74 | * 75 | * @param result Value that should not be optimized away. 76 | */ 77 | template 78 | inline void pretend_to_use(T& result) const noexcept 79 | { 80 | #ifdef __clang__ 81 | asm volatile("" : "+r,m"(result) : : "memory"); 82 | #else 83 | asm volatile("" : "+m,r"(result) : : "memory"); 84 | #endif 85 | } 86 | 87 | private: 88 | /// Indices, defining the order in which the memory chunk is accessed. 89 | std::vector _indices; 90 | 91 | /// Memory chunk that is read during the benchmark. 92 | std::vector _data_to_read; 93 | 94 | /// Memory chunk that is written during the benchmark. 95 | std::vector _data_to_write; 96 | }; 97 | } -------------------------------------------------------------------------------- /examples/sampling/context_switch.cpp: -------------------------------------------------------------------------------- 1 | #include "../access_benchmark.h" 2 | #include "perfcpp/sampler.h" 3 | #include 4 | 5 | int 6 | main() 7 | { 8 | std::cout << "libperf-cpp example: Record perf samples including time, " 9 | "instruction pointer, and cpu id for single-threaded random " 10 | "access to an in-memory array." 11 | << std::endl; 12 | 13 | auto sampler = perf::Sampler{}; 14 | 15 | /// Event that generates an overflow which is samples. 16 | sampler.trigger("cycles", perf::Precision::RequestZeroSkid, perf::Period{ 50000 }); 17 | 18 | /// Include Timestamp, period, instruction pointer, and CPU number into samples. 
19 | sampler.values().timestamp(true).cpu_id(true).context_switch(true); 20 | 21 | /// Create random access benchmark. 22 | auto benchmark = perf::example::AccessBenchmark{ /*randomize the accesses*/ true, 23 | /* create benchmark of 2 GB */ 2048U }; 24 | 25 | /// Start sampling. 26 | try { 27 | sampler.start(); 28 | } catch (std::runtime_error& exception) { 29 | std::cerr << exception.what() << std::endl; 30 | return 1; 31 | } 32 | 33 | /// Execute the benchmark (accessing cache lines in a random order). 34 | auto value = 0ULL; 35 | for (auto index = 0U; index < benchmark.size(); ++index) { 36 | value += benchmark[index].value; 37 | } 38 | 39 | /// We do not want the compiler to optimize away this (otherwise) unused value (and consequently the loop above). 40 | benchmark.pretend_to_use(value); 41 | 42 | /// Stop sampling. 43 | sampler.stop(); 44 | 45 | /// Get all the recorded samples. 46 | auto samples = sampler.result(); 47 | const auto count_samples_before_filter = samples.size(); 48 | 49 | /// Filter out samples without context switch. 50 | samples.erase(std::remove_if(samples.begin(), 51 | samples.end(), 52 | [](const auto& sample) { 53 | return !sample.metadata().cpu_id().has_value() || 54 | !sample.metadata().timestamp().has_value() || 55 | !sample.context_switch().has_value(); 56 | }), 57 | samples.end()); 58 | 59 | /// Print the first samples. 60 | const auto count_show_samples = std::min(samples.size(), 40U); 61 | std::cout << "\nRecorded " << count_samples_before_filter << " samples. " << samples.size() 62 | << " remaining after filter." << std::endl; 63 | std::cout << "Here are the first " << count_show_samples << " recorded samples:\n" << std::endl; 64 | for (auto index = 0U; index < count_show_samples; ++index) { 65 | const auto& sample = samples[index]; 66 | 67 | std::cout << "Time = " << sample.metadata().timestamp().value() 68 | << " | CPU ID = " << sample.metadata().cpu_id().value() 69 | << " | is in = " << sample.context_switch().value().is_in() 70 | << " | is preempt = " << sample.context_switch().value().is_preempt() << "\n"; 71 | } 72 | std::cout << std::flush; 73 | 74 | /// Close the sampler. 75 | /// Note that the sampler can only be closed after reading the samples. 76 | sampler.close(); 77 | 78 | return 0; 79 | } -------------------------------------------------------------------------------- /examples/sampling/memory_access_analyzer.cpp: -------------------------------------------------------------------------------- 1 | #include "../access_benchmark.h" 2 | #include "perfcpp/analyzer/memory_access.h" 3 | #include "perfcpp/hardware_info.h" 4 | #include "perfcpp/sampler.h" 5 | #include 6 | 7 | int 8 | main() 9 | { 10 | std::cout << "libperf-cpp example: Sample memory addresses and analyze data objects." << std::endl; 11 | 12 | /// Initialize sampler. 13 | auto sampler = perf::Sampler{}; 14 | 15 | /// Setup which counters trigger the writing of samples (depends on the underlying hardware substrate). 16 | if (perf::HardwareInfo::is_amd_ibs_supported()) { 17 | sampler.trigger("ibs_op_uops", perf::Precision::MustHaveZeroSkid, perf::Period{ 4000U }); 18 | } else if (perf::HardwareInfo::is_intel()) { 19 | sampler.trigger("mem-loads", perf::Precision::MustHaveZeroSkid, perf::Period{ 2000U }); 20 | } else { 21 | std::cout << "Error: Memory sampling is not supported on this CPU." << std::endl; 22 | return 1; 23 | } 24 | 25 | /// Setup which data will be included into samples (timestamp, virtual memory address, data source like L1d or RAM, 26 | /// and latency). 
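/// On AMD, the branch below additionally requests the raw sample payload, presumably so the memory-access analyzer can decode IBS-specific memory details from it.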
27 | sampler.values().logical_memory_address(true).data_source(true).latency(true); 28 | if (perf::HardwareInfo::is_amd()) { 29 | sampler.values().raw(true); 30 | } 31 | 32 | /// Create random access benchmark. 33 | auto benchmark = perf::example::AccessBenchmark{ /*randomize the accesses*/ true, 34 | /* create benchmark of 2 GB */ 2048 }; 35 | 36 | /// Start sampling. 37 | try { 38 | sampler.start(); 39 | } catch (std::runtime_error& exception) { 40 | std::cerr << exception.what() << std::endl; 41 | return 1; 42 | } 43 | 44 | /// Execute the benchmark (accessing cache lines in a random order). 45 | auto value = 0ULL; 46 | for (auto index = 0U; index < benchmark.size(); ++index) { 47 | value += benchmark[index].value; 48 | } 49 | 50 | /// We do not want the compiler to optimize away this (otherwise) unused value (and consequently the loop above). 51 | benchmark.pretend_to_use(value); 52 | 53 | /// Stop sampling. 54 | sampler.stop(); 55 | 56 | /// Create data types for analyzer. 57 | auto data_analyzer = perf::analyzer::MemoryAccess{}; 58 | 59 | /// 1) Create and add the "index" data type (normal u64 that dictates the pattern through the data array in the random 60 | /// access benchmark). 61 | auto index = perf::analyzer::DataType{ "index", sizeof(std::uint64_t) }; 62 | index.add("index"); 63 | data_analyzer.add(std::move(index)); 64 | 65 | /// 2) Create and add the "data_cache_line" data type (single cache line that is accessed in the random access 66 | /// benchmark). 67 | auto cache_line = perf::analyzer::DataType{ "data_cache_line", sizeof(perf::example::AccessBenchmark::cache_line) }; 68 | cache_line.add("value"); 69 | data_analyzer.add(std::move(cache_line)); 70 | 71 | /// 3) Register instances in memory for both data types. 72 | data_analyzer.annotate("index", benchmark.indices()); 73 | data_analyzer.annotate("data_cache_line", benchmark.data_to_read()); 74 | 75 | /// 4) Get all the recorded samples. 76 | const auto samples = sampler.result(); 77 | 78 | /// 5) Map the samples to data type instances. 79 | const auto result = data_analyzer.map(samples); 80 | 81 | /// 6) Print the results to the console. 82 | std::cout << result.to_string() << std::flush; 83 | 84 | return 0; 85 | } -------------------------------------------------------------------------------- /examples/sampling/instruction_pointer.cpp: -------------------------------------------------------------------------------- 1 | #include "../access_benchmark.h" 2 | #include "perfcpp/sampler.h" 3 | #include "perfcpp/symbol_resolver.h" 4 | #include 5 | 6 | int 7 | main() 8 | { 9 | std::cout << "libperf-cpp example: Record perf samples including time, " 10 | "instruction pointer, and cpu id for single-threaded random " 11 | "access to an in-memory array." 12 | << std::endl; 13 | 14 | auto sampler = perf::Sampler{}; 15 | 16 | /// Event that generates an overflow which is samples. 17 | sampler.trigger("cycles", perf::Precision::RequestZeroSkid, perf::Period{ 50000U }); 18 | 19 | /// Include Timestamp, period, instruction pointer, and CPU number into samples. 20 | sampler.values().timestamp(true).period(true).instruction_pointer(true).cpu_id(true); 21 | 22 | /// Create random access benchmark. 23 | auto benchmark = perf::example::AccessBenchmark{ /*randomize the accesses*/ true, 24 | /* create benchmark of 512 MB */ 512U }; 25 | 26 | /// Start sampling. 
27 | try { 28 | sampler.start(); 29 | } catch (std::runtime_error& exception) { 30 | std::cerr << exception.what() << std::endl; 31 | return 1; 32 | } 33 | 34 | /// Execute the benchmark (accessing cache lines in a random order). 35 | auto value = 0ULL; 36 | for (auto index = 0U; index < benchmark.size(); ++index) { 37 | value += benchmark[index].value; 38 | } 39 | 40 | /// We do not want the compiler to optimize away this (otherwise) unused value (and consequently the loop above). 41 | benchmark.pretend_to_use(value); 42 | 43 | /// Stop sampling. 44 | sampler.stop(); 45 | 46 | /// Get all the recorded samples. 47 | const auto samples = sampler.result(); 48 | 49 | auto symbol_resolver = perf::SymbolResolver{}; 50 | 51 | /// Print the first samples. 52 | const auto count_show_samples = std::min(samples.size(), 400U); 53 | std::cout << "\nRecorded " << samples.size() << " samples." << std::endl; 54 | std::cout << "Here are the first " << count_show_samples << " recorded samples:\n" << std::endl; 55 | for (auto index = 0U; index < count_show_samples; ++index) { 56 | const auto& sample = samples[index]; 57 | 58 | /// Since we recorded the time, period, the instruction pointer, and the CPU 59 | /// id, we can only read these values. 60 | if (sample.metadata().timestamp().has_value() && sample.metadata().period().has_value() && 61 | sample.instruction_execution().logical_instruction_pointer().has_value() && 62 | sample.metadata().cpu_id().has_value()) { 63 | 64 | auto symbol = std::string{ "??" }; 65 | if (auto sym = symbol_resolver.resolve(sample.instruction_execution().logical_instruction_pointer().value()); 66 | sym.has_value()) { 67 | symbol = sym->to_string(); 68 | } 69 | 70 | std::cout << "Time = " << sample.metadata().timestamp().value() 71 | << " | Period = " << sample.metadata().period().value() << " | Instruction Pointer = 0x" << std::hex 72 | << sample.instruction_execution().logical_instruction_pointer().value() << std::dec 73 | << " | Symbol = " << symbol << " | CPU ID = " << sample.metadata().cpu_id().value() << " | " 74 | << (sample.instruction_execution().logical_instruction_pointer() ? "exact" : "not exact") << "\n"; 75 | } 76 | } 77 | std::cout << std::flush; 78 | 79 | /// Close the sampler. 80 | /// Note that the sampler can only be closed after reading the samples. 81 | sampler.close(); 82 | 83 | return 0; 84 | } -------------------------------------------------------------------------------- /examples/statistics/inherit_thread.cpp: -------------------------------------------------------------------------------- 1 | #include "../access_benchmark.h" 2 | #include "perfcpp/event_counter.h" 3 | #include 4 | #include 5 | #include 6 | 7 | int 8 | main() 9 | { 10 | std::cout << "libperf-cpp example: Record performance counter for " 11 | "multi-threaded random access to an in-memory array." 12 | << std::endl; 13 | std::cout << "We will record the counters for all threads spawned by the main-thread." << std::endl; 14 | 15 | /// In this example, we will perform the benchmark multi-threaded and record 16 | /// all child-threads. If `include_child_threads` is not set to true, we would 17 | /// only record the main-thread. 18 | auto config = perf::Config{}; 19 | config.include_child_threads(true); 20 | auto event_counter = perf::EventCounter{ config }; 21 | 22 | /// Add all the performance counters we want to record. 
23 | try { 24 | event_counter.add({ "instructions", 25 | "cycles", 26 | "branches", 27 | "cache-misses", 28 | "dTLB-miss-ratio", 29 | "L1-data-miss-ratio", 30 | "cycles-per-instruction" }); 31 | } catch (std::runtime_error& e) { 32 | std::cerr << e.what() << std::endl; 33 | return 1; 34 | } 35 | 36 | /// Create random access benchmark. 37 | auto benchmark = perf::example::AccessBenchmark{ /*randomize the accesses*/ true, 38 | /* create benchmark of 1024 MB */ 1024U }; 39 | 40 | /// One event_counter instance for every thread. 41 | const auto count_threads = 2U; 42 | const auto items_per_thread = benchmark.size() / count_threads; 43 | auto threads = std::vector{}; 44 | auto thread_local_results = std::vector(2U, 0U); /// Array to store the thread-local results. 45 | 46 | /// Start the performance counters. Note that the counters will also record 47 | /// the thread-creation. 48 | try { 49 | event_counter.start(); 50 | } catch (std::runtime_error& exception) { 51 | std::cerr << exception.what() << std::endl; 52 | return 1; 53 | } 54 | 55 | for (auto thread_index = 0U; thread_index < count_threads; ++thread_index) { 56 | threads.emplace_back([thread_index, items_per_thread, &thread_local_results, &benchmark]() { 57 | auto local_value = 0ULL; 58 | 59 | /// Process the data. 60 | for (auto index = 0U; index < items_per_thread; ++index) { 61 | local_value += benchmark[(thread_index * items_per_thread) + index].value; 62 | } 63 | 64 | thread_local_results[thread_index] = local_value; 65 | }); 66 | } 67 | 68 | /// Wait for all threads to finish. 69 | for (auto& thread : threads) { 70 | thread.join(); 71 | } 72 | 73 | /// Stop recording counters. 74 | event_counter.stop(); 75 | 76 | /// Add up the results so that the compiler does not get the idea of 77 | /// optimizing away the accesses. 78 | auto value = std::accumulate(thread_local_results.begin(), thread_local_results.end(), 0UL); 79 | 80 | /// We do not want the compiler to optimize away this (otherwise) unused value (and consequently the loop above). 81 | benchmark.pretend_to_use(value); 82 | 83 | /// Get the result (normalized per cache line). 84 | const auto result = event_counter.result(benchmark.size()); 85 | 86 | /// Print the performance counters. 87 | std::cout << "\nResults:\n"; 88 | for (const auto& [counter_name, counter_value] : result) { 89 | std::cout << counter_value << " " << counter_name << " / cache line" << std::endl; 90 | } 91 | 92 | return 0; 93 | } -------------------------------------------------------------------------------- /include/perfcpp/metadata.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | 6 | namespace perf { 7 | class Metadata 8 | { 9 | public: 10 | enum class Mode 11 | { 12 | Kernel, 13 | User, 14 | Hypervisor, 15 | GuestKernel, 16 | GuestUser, 17 | Unknown, /// DEPRECATED: Will be removed in v0.12 18 | }; 19 | 20 | /** 21 | * Set the mode of the sample. 22 | * @param mode Mode. 23 | */ 24 | void mode(const std::optional mode) noexcept { _mode = mode; } 25 | 26 | /** 27 | * Set the sample ID. 28 | * @param sample_id Sample ID. 29 | */ 30 | void sample_id(const std::uint64_t sample_id) noexcept { _sample_id = sample_id; } 31 | 32 | /** 33 | * Set the stream ID. 34 | * @param stream_id Stream ID. 35 | */ 36 | void stream_id(const std::uint64_t stream_id) noexcept { _stream_id = stream_id; } 37 | 38 | /** 39 | * Set the timestamp. 40 | * @param timestamp Timestamp. 
41 | */ 42 | void timestamp(const std::uint64_t timestamp) noexcept { _timestamp = timestamp; } 43 | 44 | /** 45 | * Set the period. 46 | * @param period Period. 47 | */ 48 | void period(const std::uint64_t period) noexcept { _period = period; } 49 | 50 | /** 51 | * Set the CPU ID. 52 | * @param cpu_id CPU ID. 53 | */ 54 | void cpu_id(const std::uint32_t cpu_id) noexcept { _cpu_id = cpu_id; } 55 | 56 | /** 57 | * Set the process ID. 58 | * @param process_id Process ID. 59 | */ 60 | void process_id(const std::uint32_t process_id) noexcept { _process_id = process_id; } 61 | 62 | /** 63 | * Set the thread ID. 64 | * @param thread_id Thread ID. 65 | */ 66 | void thread_id(const std::uint32_t thread_id) noexcept { _thread_id = thread_id; } 67 | 68 | /** 69 | * @return Mode, if included in the sample. std::nullopt otherwise. 70 | */ 71 | [[nodiscard]] std::optional mode() const noexcept { return _mode; } 72 | 73 | /** 74 | * @return Sample ID, if included in the sample. std::nullopt otherwise. 75 | */ 76 | [[nodiscard]] std::optional sample_id() const noexcept { return _sample_id; } 77 | 78 | /** 79 | * @return Stream ID, if included in the sample. std::nullopt otherwise. 80 | */ 81 | [[nodiscard]] std::optional stream_id() const noexcept { return _stream_id; } 82 | 83 | /** 84 | * @return Timestamp, if included in the sample. std::nullopt otherwise. 85 | */ 86 | [[nodiscard]] std::optional timestamp() const noexcept { return _timestamp; } 87 | 88 | /** 89 | * @return Period, if included in the sample. std::nullopt otherwise. 90 | */ 91 | [[nodiscard]] std::optional period() const noexcept { return _period; } 92 | 93 | /** 94 | * @return CPU ID, if included in the sample. std::nullopt otherwise. 95 | */ 96 | [[nodiscard]] std::optional cpu_id() const noexcept { return _cpu_id; } 97 | 98 | /** 99 | * @return Process ID, if included in the sample. std::nullopt otherwise. 100 | */ 101 | [[nodiscard]] std::optional process_id() const noexcept { return _process_id; } 102 | 103 | /** 104 | * @return Thread ID, if included in the sample. std::nullopt otherwise. 105 | */ 106 | [[nodiscard]] std::optional thread_id() const noexcept { return _thread_id; } 107 | 108 | private: 109 | std::optional _mode; 110 | std::optional _sample_id; 111 | std::optional _stream_id; 112 | std::optional _timestamp; 113 | std::optional _period; 114 | std::optional _cpu_id; 115 | std::optional _process_id; 116 | std::optional _thread_id; 117 | }; 118 | } -------------------------------------------------------------------------------- /examples/sampling/register.cpp: -------------------------------------------------------------------------------- 1 | #include "../access_benchmark.h" 2 | #include "perfcpp/sampler.h" 3 | #include 4 | 5 | int 6 | main() 7 | { 8 | std::cout << "libperf-cpp example: Record perf samples including time, " 9 | "user_registers, and cpu id for single-threaded random " 10 | "access to an in-memory array." 11 | << std::endl; 12 | 13 | auto sampler = perf::Sampler{}; 14 | sampler.trigger("cycles", perf::Period{ 100000 }); 15 | sampler.values() 16 | .timestamp(true) 17 | .user_registers( 18 | perf::Registers{ { perf::Registers::x86::IP, perf::Registers::x86::DI, perf::Registers::x86::R10 } }) 19 | .kernel_registers( 20 | perf::Registers{ { perf::Registers::x86::IP, perf::Registers::x86::DI, perf::Registers::x86::R10 } }) 21 | .cpu_id(true); 22 | 23 | /// Create random access benchmark. 
24 | auto benchmark = perf::example::AccessBenchmark{ /*randomize the accesses*/ true, 25 | /* create benchmark of 512 MB */ 512U }; 26 | 27 | /// Start sampling. 28 | try { 29 | sampler.start(); 30 | } catch (std::runtime_error& exception) { 31 | std::cerr << exception.what() << std::endl; 32 | return 1; 33 | } 34 | 35 | /// Execute the benchmark (accessing cache lines in a random order). 36 | auto value = 0ULL; 37 | for (auto index = 0U; index < benchmark.size(); ++index) { 38 | value += benchmark[index].value; 39 | } 40 | 41 | /// We do not want the compiler to optimize away this (otherwise) unused value (and consequently the loop above). 42 | benchmark.pretend_to_use(value); 43 | 44 | /// Stop sampling. 45 | sampler.stop(); 46 | 47 | /// Get all the recorded samples. 48 | const auto samples = sampler.result(); 49 | 50 | /// Print the first samples. 51 | const auto count_show_samples = std::min(samples.size(), 40U); 52 | std::cout << "\nRecorded " << samples.size() << " samples." << std::endl; 53 | std::cout << "Here are the first " << count_show_samples << " recorded samples:\n" << std::endl; 54 | for (auto index = 0U; index < count_show_samples; ++index) { 55 | const auto& sample = samples[index]; 56 | 57 | /// Since we recorded the time, period, the instruction pointer, and the CPU 58 | /// id, we can only read these values. 59 | if (sample.metadata().timestamp().has_value() && 60 | (sample.user_registers().has_value() || sample.kernel_registers().has_value()) && 61 | sample.metadata().cpu_id().has_value()) { 62 | 63 | std::cout << "Time = " << sample.metadata().timestamp().value() 64 | << " | CPU ID = " << sample.metadata().cpu_id().value(); 65 | 66 | if (sample.user_registers().has_value()) { 67 | const auto& user_registers = sample.user_registers().value(); 68 | std::cout << " | User Registers = IP(" << user_registers.get(perf::Registers::x86::IP).value_or(0) << "), DI(" 69 | << user_registers.get(perf::Registers::x86::DI).value_or(0) << "), R10(" 70 | << user_registers.get(perf::Registers::x86::R10).value_or(0) << ")"; 71 | } 72 | 73 | if (sample.kernel_registers().has_value()) { 74 | const auto& kernel_registers = sample.kernel_registers().value(); 75 | std::cout << " | Kernel Registers = IP(" << kernel_registers.get(perf::Registers::x86::IP).value_or(0) 76 | << "), DI(" << kernel_registers.get(perf::Registers::x86::DI).value_or(0) << "), R10(" 77 | << kernel_registers.get(perf::Registers::x86::R10).value_or(0) << ")"; 78 | } 79 | 80 | std::cout << "\n"; 81 | } 82 | } 83 | std::cout << std::flush; 84 | 85 | /// Close the sampler. 86 | /// Note that the sampler can only be closed after reading the samples. 87 | sampler.close(); 88 | 89 | return 0; 90 | } -------------------------------------------------------------------------------- /include/perfcpp/branch.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | namespace perf { 9 | /** 10 | * Branch types that can be filtered when recording branches via sampling. 
11 | */ 12 | enum BranchType : std::uint64_t 13 | { 14 | None = 0ULL, 15 | 16 | User = PERF_SAMPLE_BRANCH_USER, 17 | Kernel = PERF_SAMPLE_BRANCH_KERNEL, 18 | HyperVisor = PERF_SAMPLE_BRANCH_HV, 19 | 20 | Any = PERF_SAMPLE_BRANCH_ANY, 21 | #ifndef PERFCPP_NO_SAMPLE_BRANCH_CALL 22 | Call = PERF_SAMPLE_BRANCH_ANY_CALL, 23 | #else 24 | Call = 1ULL << 61, 25 | #endif 26 | #ifndef PERFCPP_NO_SAMPLE_BRANCH_CALL 27 | DirectCall = PERF_SAMPLE_BRANCH_CALL, 28 | #else 29 | DirectCall = 1ULL << 62, 30 | #endif 31 | IndirectCall = PERF_SAMPLE_BRANCH_IND_CALL, 32 | Return = PERF_SAMPLE_BRANCH_ANY_RETURN, 33 | #ifndef PERFCPP_NO_SAMPLE_BRANCH_IND_JUMP 34 | IndirectJump = PERF_SAMPLE_BRANCH_IND_JUMP, 35 | #else 36 | IndirectJump = 1ULL << 63, 37 | #endif 38 | Conditional = PERF_SAMPLE_BRANCH_COND, 39 | TransactionalMemoryAbort = PERF_SAMPLE_BRANCH_ABORT_TX, 40 | InTransaction = PERF_SAMPLE_BRANCH_IN_TX, 41 | NotInTransaction = PERF_SAMPLE_BRANCH_NO_TX 42 | }; 43 | 44 | /** 45 | * A Branch represents one branch from the branch stack, including information where the branch started (and in case of 46 | * jmp/call where the branch ended), if the branch was predicted correctly, and how long 47 | */ 48 | class Branch 49 | { 50 | public: 51 | Branch(const std::uintptr_t instruction_pointer_from, 52 | const std::uintptr_t instruction_pointer_to, 53 | const bool is_mispredicted, 54 | const bool is_predicted, 55 | const bool is_in_transaction, 56 | const bool is_transaction_abort, 57 | const std::optional cycles) 58 | : _instruction_pointer_from(instruction_pointer_from) 59 | , _instruction_pointer_to(instruction_pointer_to) 60 | , _is_mispredicted(is_mispredicted) 61 | , _is_predicted(is_predicted) 62 | , _is_in_transaction(is_in_transaction) 63 | , _is_transaction_abort(is_transaction_abort) 64 | , _cycles(cycles) 65 | { 66 | } 67 | 68 | /** 69 | * @return The instruction pointer the branch started. 70 | */ 71 | [[nodiscard]] std::uintptr_t instruction_pointer_from() const noexcept { return _instruction_pointer_from; } 72 | 73 | /** 74 | * @return The instruction pointer the branch ended. 75 | */ 76 | [[nodiscard]] std::uintptr_t instruction_pointer_to() const noexcept { return _instruction_pointer_to; } 77 | 78 | /** 79 | * @return True, if the branch was not predicted properly. 80 | */ 81 | [[nodiscard]] bool is_mispredicted() const noexcept { return _is_mispredicted; } 82 | 83 | /** 84 | * @return True, if the branch was predicted correctly. 85 | */ 86 | [[nodiscard]] bool is_predicted() const noexcept { return _is_predicted; } 87 | 88 | /** 89 | * @return True, if the branch was within a memory transaction. 90 | */ 91 | [[nodiscard]] bool is_in_transaction() const noexcept { return _is_in_transaction; } 92 | 93 | /** 94 | * @return True, if the branch was a transaction abort. 95 | */ 96 | [[nodiscard]] bool is_transaction_abort() const noexcept { return _is_transaction_abort; } 97 | 98 | /** 99 | * @return The number of cycles of the branch (zero if not supported on the underlying hardware). 
100 | */ 101 | [[nodiscard]] std::optional cycles() const noexcept { return _cycles; } 102 | 103 | private: 104 | std::uintptr_t _instruction_pointer_from; 105 | std::uintptr_t _instruction_pointer_to; 106 | bool _is_mispredicted; 107 | bool _is_predicted; 108 | bool _is_in_transaction; 109 | bool _is_transaction_abort; 110 | std::optional _cycles; 111 | }; 112 | } -------------------------------------------------------------------------------- /examples/statistics/multi_thread.cpp: -------------------------------------------------------------------------------- 1 | #include "../access_benchmark.h" 2 | #include "perfcpp/event_counter.h" 3 | #include 4 | #include 5 | #include 6 | 7 | int 8 | main() 9 | { 10 | std::cout << "libperf-cpp example: Record performance counter for " 11 | "multi-threaded random access to an in-memory array." 12 | << std::endl; 13 | std::cout << "We will record the counters per thread and merge the results " 14 | "afterwards." 15 | << std::endl; 16 | 17 | constexpr auto count_threads = 2U; 18 | 19 | /// Initialize performance counters.# 20 | auto multithread_event_counter = perf::MultiThreadEventCounter{ count_threads }; 21 | 22 | /// Add all the performance counters we want to record. 23 | try { 24 | multithread_event_counter.add({ "instructions", 25 | "cycles", 26 | "branches", 27 | "cache-misses", 28 | "dTLB-miss-ratio", 29 | "L1-data-miss-ratio", 30 | "cycles-per-instruction", 31 | "nanoseconds" }); 32 | } catch (std::runtime_error& e) { 33 | std::cerr << e.what() << std::endl; 34 | return 1; 35 | } 36 | 37 | /// Create random access benchmark. 38 | auto benchmark = perf::example::AccessBenchmark{ /*randomize the accesses*/ true, 39 | /* create benchmark of 1024 MB */ 1024U }; 40 | 41 | /// One event_counter instance for every thread. 42 | const auto items_per_thread = benchmark.size() / count_threads; 43 | auto threads = std::vector{}; 44 | auto thread_local_results = 45 | std::vector(count_threads, 0U); /// Array to store the thread-local results. 46 | 47 | for (auto thread_index = std::uint16_t(0U); thread_index < count_threads; ++thread_index) { 48 | threads.emplace_back( 49 | [thread_index, items_per_thread, &thread_local_results, &benchmark, &multithread_event_counter]() { 50 | auto local_value = 0ULL; 51 | 52 | /// Start recording counters. 53 | /// In contrast to the inherit-thread example (see inherit_thread.cpp), we 54 | /// will record the performance counters on each thread. 55 | try { 56 | multithread_event_counter.start(thread_index); 57 | } catch (std::runtime_error& exception) { 58 | std::cerr << exception.what() << std::endl; 59 | return; 60 | } 61 | 62 | /// Process the data. 63 | for (auto index = 0U; index < items_per_thread; ++index) { 64 | local_value += benchmark[(thread_index * items_per_thread) + index].value; 65 | } 66 | 67 | /// Stop recording counters on this thread. 68 | multithread_event_counter.stop(thread_index); 69 | 70 | thread_local_results[thread_index] = local_value; 71 | }); 72 | } 73 | 74 | /// Wait for all threads to finish. 75 | for (auto& thread : threads) { 76 | thread.join(); 77 | } 78 | 79 | /// Add up the results so that the compiler does not get the idea of 80 | /// optimizing away the accesses. 81 | auto value = std::accumulate(thread_local_results.begin(), thread_local_results.end(), 0UL); 82 | 83 | /// We do not want the compiler to optimize away this (otherwise) unused value. 
84 | benchmark.pretend_to_use(value); 85 | 86 | /// Get the result (normalized per cache line) from the 87 | /// multithread_event_counter. 88 | auto result = multithread_event_counter.result(benchmark.size()); 89 | 90 | /// Print the performance counters. 91 | std::cout << "\nResults:\n"; 92 | for (const auto& [counter_name, counter_value] : result) { 93 | std::cout << counter_value << " " << counter_name << " / cache line" << std::endl; 94 | } 95 | 96 | return 0; 97 | } 98 | -------------------------------------------------------------------------------- /test/counter_definition.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | TEST_CASE("adding new events and metrics", "[CounterDefinition]") 5 | { 6 | auto definition = perf::CounterDefinition{}; 7 | 8 | auto test_counter = std::string{ "some-test-counter-name" }; 9 | 10 | SECTION("events do not exist") 11 | { 12 | REQUIRE(definition.counter(test_counter).empty()); 13 | REQUIRE(definition.is_metric(test_counter) == false); 14 | REQUIRE(definition.metric(test_counter).has_value() == false); 15 | } 16 | 17 | SECTION("add hardware counter") 18 | { 19 | definition.add(std::string{ test_counter }, 100U, 0x1234); 20 | REQUIRE(definition.counter(test_counter).size() == 1U); 21 | REQUIRE(std::get<1>(definition.counter(test_counter).front()) == test_counter); 22 | REQUIRE(std::get<2>(definition.counter(test_counter).front()).configs()[0U] == 0x1234); 23 | REQUIRE(std::get<2>(definition.counter(test_counter).front()).type() == 100U); 24 | REQUIRE(definition.is_metric(test_counter) == false); 25 | REQUIRE(definition.metric(test_counter).has_value() == false); 26 | } 27 | 28 | SECTION("add metric") 29 | { 30 | auto test_metric = std::string{ "some-test-metric-name" }; 31 | definition.add(std::string{ test_metric }, "cycles/instructions"); 32 | REQUIRE(definition.counter(test_metric).empty()); 33 | REQUIRE(definition.is_metric(test_metric)); 34 | REQUIRE(definition.metric(test_metric).has_value()); 35 | REQUIRE(std::get<0>(definition.metric(test_metric).value()) == test_metric); 36 | } 37 | 38 | SECTION("read csv counter-only") 39 | { 40 | const auto event0 = std::string{ "EVENT.TEST0" }; 41 | const auto event1 = std::string{ "event-test-1" }; 42 | 43 | REQUIRE(definition.counter(event0).empty()); 44 | REQUIRE(definition.counter(event1).empty()); 45 | 46 | const auto definition_with_file = perf::CounterDefinition{ "test/events.csv" }; 47 | 48 | REQUIRE_FALSE(definition_with_file.counter(event0).empty()); 49 | REQUIRE_FALSE(definition_with_file.counter(event1).empty()); 50 | 51 | REQUIRE(std::get<2>(definition_with_file.counter(event0).front()).configs()[0U] == 0x1f3010e); 52 | REQUIRE(std::get<2>(definition_with_file.counter(event0).front()).configs()[1U] == 0U); 53 | REQUIRE(std::get<2>(definition_with_file.counter(event0).front()).configs()[2U] == 0U); 54 | 55 | REQUIRE(std::get<2>(definition_with_file.counter(event1).front()).configs()[0U] == 0x1CD); 56 | REQUIRE(std::get<2>(definition_with_file.counter(event1).front()).configs()[1U] == 3U); 57 | REQUIRE(std::get<2>(definition_with_file.counter(event1).front()).configs()[2U] == 0U); 58 | } 59 | 60 | SECTION("read csv with metric") 61 | { 62 | const auto event0 = std::string{ "EVENT.TEST0" }; 63 | const auto event1 = std::string{ "event-test-1" }; 64 | const auto test_metric = std::string{ "test-metric" }; 65 | 66 | REQUIRE(definition.counter(event0).empty()); 67 | REQUIRE(definition.counter(event1).empty()); 68 | 
REQUIRE(definition.counter(test_metric).empty()); 69 | 70 | REQUIRE_FALSE(definition.is_metric(test_metric)); 71 | 72 | const auto definition_with_file = perf::CounterDefinition{ "test/events-and-metrics.csv" }; 73 | 74 | REQUIRE_FALSE(definition_with_file.counter(event0).empty()); 75 | REQUIRE_FALSE(definition_with_file.counter(event1).empty()); 76 | REQUIRE(definition.counter(test_metric).empty()); 77 | 78 | REQUIRE(definition_with_file.is_metric(test_metric)); 79 | 80 | auto metric = definition_with_file.metric(test_metric); 81 | REQUIRE(metric.has_value()); 82 | 83 | auto counter_result = perf::CounterResult{ std::vector>{ 84 | std::make_pair("EVENT.TEST0", 100U), std::make_pair("event-test-1", 500U) } }; 85 | const auto metric_result = metric->second.calculate(counter_result); 86 | REQUIRE(metric_result.has_value()); 87 | REQUIRE(metric_result.value() == 1500U); 88 | } 89 | } -------------------------------------------------------------------------------- /examples/sampling/counter.cpp: -------------------------------------------------------------------------------- 1 | #include "../access_benchmark.h" 2 | #include "perfcpp/sampler.h" 3 | #include 4 | 5 | int 6 | main() 7 | { 8 | std::cout << "libperf-cpp example: Record perf samples including performance " 9 | "counters for single-threaded random access to an in-memory array." 10 | << std::endl; 11 | 12 | /// Initialize counter definitions. 13 | /// Note that the perf::CounterDefinition holds all counter names and must be 14 | /// alive until the benchmark finishes. 15 | auto counter_definition = perf::CounterDefinition{}; 16 | 17 | /// Add metric that calculates the L1d miss ratio. 18 | counter_definition.add("L1d-misses-per-load", "'L1-dcache-load-misses'/'L1-dcache-loads'"); 19 | 20 | auto sampler = perf::Sampler{ counter_definition }; 21 | 22 | /// Setup the event that will trigger writing samples. 23 | sampler.trigger("cycles", perf::Precision::AllowArbitrarySkid, perf::Period{ 50000 }); 24 | 25 | /// Setup which data should be included (L1 hit and miss counter, timestamp). 26 | sampler.values().counter({ "L1-dcache-loads", "L1-dcache-load-misses", "L1d-misses-per-load" }).timestamp(true); 27 | 28 | /// Create random access benchmark. 29 | auto benchmark = perf::example::AccessBenchmark{ /*randomize the accesses*/ true, 30 | /* create benchmark of 512 MB */ 512U }; 31 | 32 | /// Start sampling. 33 | try { 34 | sampler.start(); 35 | } catch (std::runtime_error& exception) { 36 | std::cerr << exception.what() << std::endl; 37 | return 1; 38 | } 39 | 40 | /// Execute the benchmark (accessing cache lines in a random order). 41 | auto value = 0ULL; 42 | for (auto index = 0U; index < benchmark.size(); ++index) { 43 | value += benchmark[index].value; 44 | } 45 | 46 | /// We do not want the compiler to optimize away this (otherwise) unused value (and consequently the loop above). 47 | benchmark.pretend_to_use(value); 48 | 49 | /// Stop sampling. 50 | sampler.stop(); 51 | 52 | /// Get all the recorded samples. 53 | const auto samples = sampler.result(); 54 | 55 | /// Print the first samples. 56 | const auto count_show_samples = std::min(samples.size(), 40U); 57 | std::cout << "\nRecorded " << samples.size() << " samples." << std::endl; 58 | std::cout << "Here are the first " << count_show_samples << " recorded samples:\n" << std::endl; 59 | 60 | std::optional last_counter_result = std::nullopt; /// Remember the last counter result to show 61 | /// only the difference. 
62 | 63 | for (auto index = 0U; index < count_show_samples; ++index) { 64 | const auto& sample = samples[index]; 65 | 66 | /// Since we recorded the time, period, the instruction pointer, and the CPU 67 | /// id, we can only read these values. 68 | if (sample.metadata().timestamp().has_value() && sample.counter().has_value()) { 69 | if (last_counter_result.has_value()) { 70 | std::cout << "Time = " << sample.metadata().timestamp().value() << " | cycles (diff) = " 71 | << sample.counter()->get("cycles").value_or(.0) - last_counter_result->get("cycles").value_or(.0) 72 | << " | L1-dcache-loads (diff) = " 73 | << sample.counter()->get("L1-dcache-loads").value_or(.0) - 74 | last_counter_result->get("L1-dcache-loads").value_or(.0) 75 | << " | L1-dcache-load-misses (diff) = " 76 | << sample.counter()->get("L1-dcache-load-misses").value_or(.0) - 77 | last_counter_result->get("L1-dcache-load-misses").value_or(.0) 78 | << " | L1d-misses-per-load = " << sample.counter()->get("L1d-misses-per-load").value_or(.0) << "\n"; 79 | } 80 | 81 | last_counter_result = sample.counter(); 82 | } 83 | } 84 | std::cout << std::flush; 85 | 86 | /// Close the sampler. 87 | /// Note that the sampler can only be closed after reading the samples. 88 | sampler.close(); 89 | 90 | return 0; 91 | } -------------------------------------------------------------------------------- /include/perfcpp/metric/expression/parser.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "expression.h" 4 | #include "tokenizer.h" 5 | #include 6 | #include 7 | #include 8 | 9 | namespace perf::metric::expression { 10 | /** 11 | * The parser translates an expression (string) to an executable metric expression. 12 | */ 13 | class Parser 14 | { 15 | class TokenVisitor; 16 | friend TokenVisitor; 17 | 18 | public: 19 | explicit Parser(std::string&& input) 20 | : _tokenizer(std::move(input)) 21 | , _current_token(_tokenizer.next()) 22 | { 23 | } 24 | 25 | ~Parser() = default; 26 | 27 | /** 28 | * Builds an evaluable expression from the given expression-string. 29 | * 30 | * @return Evaluable expression. 31 | */ 32 | [[nodiscard]] std::unique_ptr parse(); 33 | 34 | private: 35 | Tokenizer _tokenizer; 36 | 37 | std::optional _current_token; 38 | 39 | /** 40 | * Consume the current token by moving to the next one. 41 | */ 42 | void consume() { this->_current_token = this->_tokenizer.next(); } 43 | 44 | /** 45 | * Verifies that the current token is equal to the expected punctuation. Throws an exception otherwise. 46 | * If the current token is equal to the expected punctuation, the current token is consumed. 47 | * 48 | * @param expected_punctuation Expected punctuation. 49 | */ 50 | void consume(const Token::Punctuation expected_punctuation) 51 | { 52 | if (!(this->_current_token.has_value() && this->_current_token.value() == expected_punctuation)) { 53 | throw CannotParseMetricExpressionError{ this->_tokenizer.input() }; 54 | } 55 | 56 | consume(); 57 | } 58 | 59 | /** 60 | * Parses the input and returns a parsed expression. 61 | * 62 | * @return The parsed expression. 63 | */ 64 | [[nodiscard]] std::unique_ptr parse_expression() { return this->parse_additive_expression(); } 65 | 66 | /** 67 | * Parses an additive expression (with "+" and "-" operations). 68 | * 69 | * @return The parsed expression. 70 | */ 71 | [[nodiscard]] std::unique_ptr parse_additive_expression(); 72 | 73 | /** 74 | * Parses a multiplicative expression (with "*" and "/" operations). 
75 | * 76 | * @return The parsed expression. 77 | */ 78 | [[nodiscard]] std::unique_ptr parse_multiplicative_expression(); 79 | 80 | /** 81 | * Parse numbers, identifiers, functions, and parenthesized expressions. 82 | * 83 | * @return The parsed expression. 84 | */ 85 | [[nodiscard]] std::unique_ptr parse_primary(); 86 | 87 | /** 88 | * Creates a function with the given name and arguments. 89 | * 90 | * @param function_name Name of the function. 91 | * @param arguments Arguments of the function. 92 | * @return Function expression. 93 | */ 94 | [[nodiscard]] std::unique_ptr build_function( 95 | std::string&& function_name, 96 | std::vector>&& arguments) const; 97 | 98 | /** 99 | * Visits a token and translates it into an expression, called by the parse_primary() function. 100 | * This may include recursive calls for parenthesized expressions. 101 | */ 102 | class TokenVisitor 103 | { 104 | public: 105 | explicit TokenVisitor(Parser& parser) noexcept 106 | : _parser(parser) 107 | { 108 | } 109 | ~TokenVisitor() noexcept = default; 110 | 111 | [[nodiscard]] std::unique_ptr operator()(std::string& identifier); 112 | [[nodiscard]] std::unique_ptr operator()(double constant); 113 | [[nodiscard]] std::unique_ptr operator()(Operator_ metric_operator); 114 | [[nodiscard]] std::unique_ptr operator()(Token::Punctuation punctutation); 115 | 116 | private: 117 | /// The calling parser to consume tokens and continue parsing. 118 | Parser& _parser; 119 | }; 120 | }; 121 | } -------------------------------------------------------------------------------- /examples/statistics/metric.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #include "../access_benchmark.h" 6 | 7 | /** 8 | * Example of a metric implementation that calculates the number of branch misses per executed branch instruction. 9 | */ 10 | class BranchMissesPerBranchInstruction final : public perf::Metric 11 | { 12 | public: 13 | [[nodiscard]] std::string name() const override { return "branch-misses-per-branch-instruction"; } 14 | 15 | [[nodiscard]] std::vector required_counter_names() const override 16 | { 17 | return { "branch-misses", "branch-instructions" }; 18 | } 19 | 20 | [[nodiscard]] std::optional calculate(const perf::CounterResult& result) const override 21 | { 22 | const auto branch_misses = result.get("branch-misses"); 23 | const auto branch_instructions = result.get("branch-instructions"); 24 | 25 | if (branch_misses.has_value() && branch_instructions.has_value()) { 26 | if (branch_instructions.value() > 0U) { 27 | return branch_misses.value() / branch_instructions.value(); 28 | } 29 | } 30 | 31 | return std::nullopt; 32 | } 33 | 34 | private: 35 | }; 36 | 37 | int 38 | main() 39 | { 40 | std::cout << "libperf-cpp example: Implementing new metrics." 
<< std::endl; 41 | 42 | auto counter_definition = perf::CounterDefinition{}; 43 | 44 | /// Define a metric that returns the number of cache misses per cache reference: 45 | counter_definition.add("cache-misses-per-reference", "d_ratio(`cache-misses`, `cache-references`)"); 46 | 47 | /// Define a metric that sums up all L1 loads: 48 | counter_definition.add("l1-loads", "`L1-dcache-loads` + `L1-icache-loads`"); 49 | 50 | /// Define a metric that sums up all L1 load misses: 51 | counter_definition.add("l1-load-misses", "sum(`L1-dcache-load-misses`, `L1-icache-load-misses`)"); 52 | 53 | /// Define a metric that calculates the ratio between L1 load misses and L1 loads: 54 | counter_definition.add("l1-misses-per-load", "`l1-load-misses` / `l1-loads`"); 55 | 56 | /// Initialize the above defined metric that returns the number of branch misses per branch instruction. 57 | counter_definition.add(std::make_unique()); 58 | 59 | /// Initialize performance counters. 60 | auto event_counter = perf::EventCounter{ counter_definition }; 61 | 62 | /// Add the new defined metrics. 63 | try { 64 | event_counter.add(std::vector{ "cache-misses-per-reference", 65 | "branch-misses-per-branch-instruction", 66 | "l1-loads", 67 | "l1-load-misses", 68 | "l1-misses-per-load" }); 69 | } catch (std::runtime_error& e) { 70 | std::cerr << e.what() << std::endl; 71 | return 1; 72 | } 73 | 74 | /// Create random access benchmark. 75 | auto benchmark = perf::example::AccessBenchmark{ /*randomize the accesses*/ true, 76 | /* create benchmark of 512 MB */ 512 }; 77 | 78 | /// Start recording. 79 | try { 80 | event_counter.start(); 81 | } catch (std::runtime_error& exception) { 82 | std::cerr << exception.what() << std::endl; 83 | return 1; 84 | } 85 | 86 | /// Execute the benchmark (accessing cache lines in a random order). 87 | auto value = 0ULL; 88 | for (auto index = 0U; index < benchmark.size(); ++index) { 89 | value += benchmark[index].value; 90 | } 91 | 92 | /// We do not want the compiler to optimize away this (otherwise) unused value (and consequently the loop above). 93 | benchmark.pretend_to_use(value); 94 | 95 | /// Stop recording counters. 96 | event_counter.stop(); 97 | 98 | /// Get the result. 99 | const auto result = event_counter.result(); 100 | 101 | /// Print the metrics as table. 102 | std::cout << "\nResults as table:\n" << result.to_string() << std::endl; 103 | 104 | return 0; 105 | } -------------------------------------------------------------------------------- /examples/sampling/multi_thread.cpp: -------------------------------------------------------------------------------- 1 | #include "../access_benchmark.h" 2 | #include "perfcpp/sampler.h" 3 | #include 4 | #include 5 | #include 6 | 7 | int 8 | main() 9 | { 10 | std::cout << "libperf-cpp example: Record perf samples including time, " 11 | "instruction pointer, and cpu id for single-threaded random " 12 | "access to an in-memory array on multiple threads." 13 | << std::endl; 14 | 15 | constexpr auto count_threads = 4U; 16 | 17 | auto sampler = perf::MultiThreadSampler{ count_threads }; 18 | 19 | /// Setup event that triggers writing samples. 20 | sampler.trigger("cycles", perf::Period{ 50000 }); 21 | 22 | /// Setup what data the samples should include (timestamp, instruction pointer, CPU id, thread id). 23 | sampler.values().timestamp(true).instruction_pointer(true).cpu_id(true).thread_id(true); 24 | 25 | /// Create random access benchmark. 
26 | auto benchmark = perf::example::AccessBenchmark{ /*randomize the accesses*/ true, 27 | /* create benchmark of 512 MB */ 1024U }; 28 | 29 | /// Allocate space for threads and their results. 30 | const auto items_per_thread = benchmark.size() / count_threads; 31 | auto threads = std::vector{}; 32 | auto thread_local_results = 33 | std::vector(count_threads, 0U); /// Array to store the thread-local results. 34 | 35 | for (auto thread_index = std::uint16_t(0U); thread_index < count_threads; ++thread_index) { 36 | threads.emplace_back([thread_index, items_per_thread, &thread_local_results, &benchmark, &sampler]() { 37 | auto local_value = 0ULL; 38 | 39 | /// Start sampling per thread. 40 | try { 41 | sampler.start(thread_index); 42 | } catch (std::runtime_error& exception) { 43 | std::cerr << exception.what() << std::endl; 44 | return; 45 | } 46 | 47 | /// Process the data. 48 | for (auto index = 0U; index < items_per_thread; ++index) { 49 | local_value += benchmark[(thread_index * items_per_thread) + index].value; 50 | } 51 | 52 | /// Stop sampling on this thread. 53 | sampler.stop(thread_index); 54 | 55 | thread_local_results[thread_index] = local_value; 56 | }); 57 | } 58 | 59 | /// Wait for all threads to finish. 60 | for (auto& thread : threads) { 61 | thread.join(); 62 | } 63 | 64 | /// Add up the results so that the compiler does not get the idea of 65 | /// optimizing away the accesses. 66 | auto value = std::accumulate(thread_local_results.begin(), thread_local_results.end(), 0UL); 67 | 68 | /// We do not want the compiler to optimize away this (otherwise) unused value. 69 | benchmark.pretend_to_use(value); 70 | 71 | /// Get all the recorded samples – ordered by timestamp. 72 | auto samples = sampler.result(true); 73 | 74 | /// Print the first samples. 75 | const auto count_show_samples = std::min(samples.size(), 40U); 76 | std::cout << "\nRecorded " << samples.size() << " samples." << std::endl; 77 | std::cout << "Here are the first " << count_show_samples << " recorded samples:\n" << std::endl; 78 | for (auto index = 0U; index < count_show_samples; ++index) { 79 | const auto& sample = samples[index]; 80 | 81 | /// Since we recorded the time, period, the instruction pointer, and the CPU 82 | /// id, we can only read these values. 83 | if (sample.metadata().timestamp().has_value() && sample.metadata().thread_id().has_value() && 84 | sample.instruction_execution().logical_instruction_pointer().has_value() && 85 | sample.metadata().cpu_id().has_value()) { 86 | std::cout << "Time = " << sample.metadata().timestamp().value() 87 | << " | CPU ID = " << sample.metadata().cpu_id().value() 88 | << " | Thread ID = " << sample.metadata().thread_id().value() << " | Instruction Pointer = 0x" 89 | << std::hex << sample.instruction_execution().logical_instruction_pointer().value() << std::dec << "\n"; 90 | } 91 | } 92 | std::cout << std::flush; 93 | 94 | /// Close the sampler. 95 | /// Note that the sampler can only be closed after reading the samples. 96 | sampler.close(); 97 | 98 | return 0; 99 | } -------------------------------------------------------------------------------- /include/perfcpp/metric/expression/token.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | 7 | namespace perf::metric::expression { 8 | /** 9 | * Representation of the supported operators. 
10 | */ 11 | enum class Operator_ : std::uint8_t 12 | { 13 | Plus, 14 | Minus, 15 | Times, 16 | Divide 17 | }; 18 | 19 | /** 20 | * A token represents a single constant, identifier, operator (like +,-, etc.), or punctutations. 21 | */ 22 | class Token 23 | { 24 | public: 25 | enum class Punctuation : std::uint8_t 26 | { 27 | LeftParentheses, 28 | RightParentheses, 29 | Comma, 30 | }; 31 | 32 | /** 33 | * Token can be an identifier, a constant number, an operator, or a punctutation. 34 | */ 35 | using token_t = std::variant; 36 | 37 | Token(Token&&) noexcept = default; 38 | Token(const Token&) = default; 39 | 40 | explicit Token(const Operator_ operator_) 41 | : _token(operator_) 42 | { 43 | } 44 | 45 | explicit Token(const double number) 46 | : _token(number) 47 | { 48 | } 49 | 50 | explicit Token(std::string&& text) 51 | : _token(std::move(text)) 52 | { 53 | } 54 | 55 | explicit Token(const Punctuation punctutation) 56 | : _token(punctutation) 57 | { 58 | } 59 | 60 | ~Token() = default; 61 | 62 | Token& operator=(Token&&) noexcept = default; 63 | 64 | /** 65 | * @return True, if this token is a left parenthesis. 66 | */ 67 | [[nodiscard]] bool is_left_parenthesis() const noexcept 68 | { 69 | return std::holds_alternative(_token) && std::get(_token) == Punctuation::LeftParentheses; 70 | } 71 | 72 | /** 73 | * @return True, if this token is a right parenthesis. 74 | */ 75 | [[nodiscard]] bool is_right_parenthesis() const noexcept 76 | { 77 | return std::holds_alternative(_token) && 78 | std::get(_token) == Punctuation::RightParentheses; 79 | } 80 | 81 | /** 82 | * @return True, if this token is a comma. 83 | */ 84 | [[nodiscard]] bool is_comma() const noexcept 85 | { 86 | return std::holds_alternative(_token) && std::get(_token) == Punctuation::Comma; 87 | } 88 | 89 | /** 90 | * @return True, if this token is an additive operator. 91 | */ 92 | [[nodiscard]] bool is_additive_operator() const noexcept 93 | { 94 | return std::holds_alternative(_token) && 95 | (std::get(_token) == Operator_::Plus || std::get(_token) == Operator_::Minus); 96 | } 97 | 98 | /** 99 | * @return True, if this token is a multiplicative operator. 100 | */ 101 | [[nodiscard]] bool is_multiplicative_operator() const noexcept 102 | { 103 | return std::holds_alternative(_token) && 104 | (std::get(_token) == Operator_::Divide || std::get(_token) == Operator_::Times); 105 | } 106 | 107 | /** 108 | * @return The operator inside the token. 109 | */ 110 | [[nodiscard]] Operator_ operator_() const noexcept { return std::get(_token); } 111 | 112 | /** 113 | * @return Ownership of the underlying token data. 114 | */ 115 | [[nodiscard]] token_t& data() noexcept { return _token; } 116 | 117 | [[nodiscard]] bool operator==(const Punctuation punctuation) const noexcept 118 | { 119 | return std::holds_alternative(_token) && std::get(_token) == punctuation; 120 | } 121 | 122 | /** 123 | * @return A text representation of this token. 124 | */ 125 | [[nodiscard]] std::string to_string() const; 126 | 127 | private: 128 | token_t _token; 129 | 130 | /** 131 | * Visits a token and translates it into an std::string. 
132 | */ 133 | class TokenToStringVisitor 134 | { 135 | public: 136 | [[nodiscard]] std::string operator()(const std::string& identifier) const { return identifier; } 137 | 138 | [[nodiscard]] std::string operator()(const double constant) const { return std::to_string(constant); } 139 | 140 | [[nodiscard]] std::string operator()(Operator_ metric_operator) const; 141 | 142 | [[nodiscard]] std::string operator()(Token::Punctuation punctutation) const; 143 | }; 144 | }; 145 | } -------------------------------------------------------------------------------- /examples/access_benchmark.cpp: -------------------------------------------------------------------------------- 1 | #include "access_benchmark.h" 2 | #include 3 | #include 4 | #include 5 | 6 | perf::example::AccessBenchmark::AccessBenchmark(const bool is_random, 7 | const std::uint64_t access_data_size_in_mb, 8 | const bool is_write) 9 | { 10 | const auto count_cache_lines = (access_data_size_in_mb * 1024U * 1024U) / sizeof(cache_line); 11 | 12 | /// Fill the data array with some unique. 13 | this->_data_to_read.reserve(count_cache_lines); 14 | for (const auto item : DataGenerator::generate_unique(count_cache_lines)) { 15 | this->_data_to_read.emplace_back(item); 16 | } 17 | 18 | if (is_write) { 19 | this->_data_to_write.resize(count_cache_lines); 20 | } 21 | 22 | /// Create the access pattern by filling the indices and shuffle, if we want a 23 | /// random access pattern. 24 | this->_indices.resize(count_cache_lines); 25 | std::iota(this->_indices.begin(), this->_indices.end(), 0U); 26 | 27 | if (is_random) { 28 | std::shuffle(this->_indices.begin(), this->_indices.end(), std::mt19937{ std::random_device{}() }); 29 | } 30 | } 31 | 32 | std::vector 33 | perf::example::DataGenerator::generate_unique(const std::size_t size) 34 | { 35 | /// Create a list for the tuples. 36 | auto relation = std::vector{}; 37 | relation.reserve(size); 38 | 39 | /// Create tuples. 40 | auto generator = std::mt19937{ 864896UL }; 41 | auto distribution = std::uniform_int_distribution{}; 42 | for (auto i = 0ULL; i < size; ++i) { 43 | relation.emplace_back(distribution(generator)); 44 | } 45 | 46 | /// Shuffle the relation. 47 | std::shuffle(relation.begin(), relation.end(), generator); 48 | 49 | return relation; 50 | } 51 | 52 | std::vector 53 | perf::example::DataGenerator::generate_zipf(const std::size_t size, 54 | const std::size_t alphabet_size, 55 | const double zipf_param) 56 | { 57 | /// Create a list for the tuples. 58 | auto relation = std::vector{}; 59 | relation.reserve(size); 60 | 61 | const auto alphabet = DataGenerator::alphabet(alphabet_size); 62 | const auto lookup_table = DataGenerator::lookup_table(zipf_param, alphabet); 63 | 64 | std::srand(6854686UL); 65 | for (auto i = 0ULL; i < size; ++i) { 66 | const auto random_key = static_cast(std::rand()) / RAND_MAX; 67 | 68 | if (lookup_table[0U] >= random_key) { 69 | relation.emplace_back(alphabet[0U]); 70 | } else { 71 | auto left = 0ULL; 72 | auto right = alphabet_size - 1ULL; 73 | std::uint64_t mid; 74 | while (right - left > 1ULL) { 75 | mid = (left + right) / 2; 76 | if (lookup_table[mid] < random_key) { 77 | left = mid; 78 | } else { 79 | right = mid; 80 | } 81 | } 82 | 83 | relation.emplace_back(alphabet[right]); 84 | } 85 | } 86 | 87 | return relation; 88 | } 89 | 90 | std::vector 91 | perf::example::DataGenerator::alphabet(const std::size_t size) 92 | { 93 | auto alphabet = std::vector{}; 94 | alphabet.reserve(size); 95 | 96 | /// Fill the alphabet. 
97 | for (auto i = 0ULL; i < size; ++i) { 98 | alphabet.emplace_back(i); 99 | } 100 | 101 | /// Permute the alphabet. 102 | auto generator = std::mt19937{ 864896UL }; 103 | std::shuffle(alphabet.begin(), alphabet.end(), generator); 104 | 105 | return alphabet; 106 | } 107 | 108 | std::vector 109 | perf::example::DataGenerator::lookup_table(double zipf_param, const std::vector& alphabet) 110 | { 111 | auto lookup_table = std::vector{}; 112 | lookup_table.reserve(alphabet.size()); 113 | 114 | /// Compute scaling factor such that sum (lookup_table[i], i=1..alphabet_size) = 1.0 115 | auto scaling_factor = 0.0; 116 | for (auto i = 0ULL; i < alphabet.size(); ++i) { 117 | scaling_factor += 1.0 / std::pow(double(i) + 1., zipf_param); 118 | } 119 | 120 | /// Generate the lookup table. 121 | auto sum = 0.0; 122 | for (auto i = 0ULL; i < alphabet.size(); ++i) { 123 | sum += 1.0 / std::pow(double(i) + 1.0, zipf_param); 124 | lookup_table.emplace_back(sum / scaling_factor); 125 | } 126 | 127 | return lookup_table; 128 | } -------------------------------------------------------------------------------- /examples/statistics/multi_cpu.cpp: -------------------------------------------------------------------------------- 1 | #include "../access_benchmark.h" 2 | #include "perfcpp/event_counter.h" 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | int 9 | main() 10 | { 11 | std::cout << "libperf-cpp example: Record performance counter for " 12 | "random access to an in-memory array on all CPU cores." 13 | << std::endl; 14 | std::cout << "We will record the counters per (logical) CPU core and merge the results " 15 | "afterwards." 16 | << std::endl; 17 | 18 | /// Create a list of cpus to record performance counters on (all available, in this example). 19 | auto cpus_to_watch = std::vector(std::thread::hardware_concurrency()); 20 | std::iota(cpus_to_watch.begin(), cpus_to_watch.end(), 0U); 21 | std::cout << "Creating counters for CPUs: "; 22 | for (auto cpu : cpus_to_watch) { 23 | std::cout << std::int32_t(cpu) << " "; 24 | } 25 | std::cout << std::endl; 26 | 27 | /// Initialize performance counters. 28 | auto multi_cpu_event_counter = perf::MultiCoreEventCounter{ std::move(cpus_to_watch) }; 29 | 30 | /// Add all the performance counters we want to record. 31 | try { 32 | multi_cpu_event_counter.add({ "instructions", 33 | "cycles", 34 | "branches", 35 | "cache-misses", 36 | "dTLB-miss-ratio", 37 | "L1-data-miss-ratio", 38 | "cycles-per-instruction" }); 39 | } catch (std::runtime_error& e) { 40 | std::cerr << e.what() << std::endl; 41 | return 1; 42 | } 43 | 44 | /// Create random access benchmark. 45 | auto benchmark = perf::example::AccessBenchmark{ /*randomize the accesses*/ true, 46 | /* create benchmark of 1024 MB */ 1024U }; 47 | 48 | /// One event_counter instance for every thread. 49 | constexpr auto count_threads = 2U; 50 | const auto items_per_thread = benchmark.size() / count_threads; 51 | auto threads = std::vector{}; 52 | auto thread_local_results = std::vector(2U, 0U); /// Array to store the thread-local results. 53 | 54 | /// Barrier for the threads to wait. 55 | auto thread_barrier = std::atomic{ false }; 56 | 57 | for (auto thread_index = 0U; thread_index < count_threads; ++thread_index) { 58 | threads.emplace_back([thread_index, items_per_thread, &thread_local_results, &benchmark, &thread_barrier]() { 59 | auto local_value = 0ULL; 60 | 61 | /// Wait for the barrier to become "true", i.e., all threads are spawned. 62 | while (!thread_barrier) 63 | ; 64 | 65 | /// Process the data. 
66 | for (auto index = 0U; index < items_per_thread; ++index) { 67 | local_value += benchmark[(thread_index * items_per_thread) + index].value; 68 | } 69 | 70 | thread_local_results[thread_index] = local_value; 71 | }); 72 | } 73 | 74 | /// Start recording performance counter. 75 | /// In contrast to the inherit-thread example (see inherit_thread.cpp), we 76 | /// will record the performance counters on each logical CPU core. 77 | try { 78 | multi_cpu_event_counter.start(); 79 | } catch (std::runtime_error& exception) { 80 | std::cerr << exception.what() << std::endl; 81 | return 1; 82 | } 83 | 84 | /// Let threads start. 85 | thread_barrier = true; 86 | 87 | /// Wait for all threads to finish. 88 | for (auto& thread : threads) { 89 | thread.join(); 90 | } 91 | 92 | /// Stop performance counter recording. 93 | multi_cpu_event_counter.stop(); 94 | 95 | /// Add up the results so that the compiler does not get the idea of 96 | /// optimizing away the accesses. 97 | auto value = std::accumulate(thread_local_results.begin(), thread_local_results.end(), 0UL); 98 | 99 | /// We do not want the compiler to optimize away this (otherwise) unused value. 100 | benchmark.pretend_to_use(value); 101 | 102 | /// Get the result (normalized per cache line) from the 103 | /// multithread_event_counter. 104 | auto result = multi_cpu_event_counter.result(benchmark.size()); 105 | 106 | /// Print the performance counters. 107 | std::cout << "\nResults:\n"; 108 | for (const auto& [counter_name, counter_value] : result) { 109 | std::cout << counter_value << " " << counter_name << " / cache line" << std::endl; 110 | } 111 | 112 | return 0; 113 | } 114 | -------------------------------------------------------------------------------- /include/perfcpp/util/table.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | namespace perf::util { 8 | class Table 9 | { 10 | public: 11 | enum class Alignment : std::uint8_t 12 | { 13 | Left, 14 | Center, 15 | Right 16 | }; 17 | 18 | class Header 19 | { 20 | public: 21 | Header(std::string&& text, const std::uint8_t span, const bool has_separator) noexcept 22 | : _text(std::move(text)) 23 | , _span(span) 24 | , _has_separator(has_separator) 25 | { 26 | } 27 | explicit Header(std::string&& text, 28 | const Alignment alignment = Alignment::Right, 29 | const bool has_separator = false) noexcept 30 | : _text(std::move(text)) 31 | , _alignment(alignment) 32 | , _has_separator(has_separator) 33 | { 34 | } 35 | ~Header() = default; 36 | 37 | [[nodiscard]] const std::string& text() const noexcept { return _text; } 38 | [[nodiscard]] Alignment alignment() const noexcept { return _alignment; } 39 | [[nodiscard]] std::uint8_t span() const noexcept { return _span; } 40 | [[nodiscard]] bool has_separator() const noexcept { return _has_separator; } 41 | 42 | private: 43 | std::string _text; 44 | Alignment _alignment{ Alignment::Left }; 45 | std::uint8_t _span{ 1U }; 46 | bool _has_separator; 47 | }; 48 | 49 | class Row 50 | { 51 | public: 52 | Row() { _columns.reserve(32U); } 53 | ~Row() = default; 54 | 55 | void add(std::string&& column) { _columns.push_back(std::move(column)); } 56 | 57 | Row& operator<<(std::string&& column) 58 | { 59 | _columns.emplace_back(std::move(column)); 60 | return *this; 61 | } 62 | 63 | Row& operator<<(const std::string& column) 64 | { 65 | _columns.emplace_back(column); 66 | return *this; 67 | } 68 | 69 | Row& operator<<(const std::size_t column) 70 | { 71 | 
_columns.emplace_back(std::to_string(column)); 72 | return *this; 73 | } 74 | 75 | Row& operator<<(const std::uint32_t column) 76 | { 77 | _columns.emplace_back(std::to_string(column)); 78 | return *this; 79 | } 80 | 81 | Row& operator<<(const std::uint16_t column) 82 | { 83 | _columns.emplace_back(std::to_string(column)); 84 | return *this; 85 | } 86 | 87 | Row& operator<<(const std::uint8_t column) 88 | { 89 | _columns.emplace_back(std::to_string(column)); 90 | return *this; 91 | } 92 | 93 | Row& operator<<(const float column) 94 | { 95 | _columns.emplace_back(std::to_string(column)); 96 | return *this; 97 | } 98 | 99 | Row& operator<<(const double column) 100 | { 101 | _columns.emplace_back(std::to_string(column)); 102 | return *this; 103 | } 104 | 105 | [[nodiscard]] const std::vector& columns() const noexcept { return _columns; } 106 | [[nodiscard]] std::vector& columns() noexcept { return _columns; } 107 | 108 | private: 109 | std::vector _columns; 110 | }; 111 | 112 | Table() = default; 113 | explicit Table(const std::uint64_t offset) 114 | : _offset(offset) 115 | { 116 | } 117 | ~Table() = default; 118 | 119 | /** 120 | * Reserve space for the given number of rows. 121 | * @param count_rows Number of rows to reserve. 122 | */ 123 | void reserve(const std::size_t count_rows) { _rows.reserve(count_rows); } 124 | 125 | /** 126 | * Adds a header row to the table. 127 | * @param header_row Header row to add. 128 | */ 129 | void add(std::vector
&& header_row); 130 | 131 | /** 132 | * Adds the given row to the table. 133 | * @param row Row to add. 134 | */ 135 | void add(Row&& row); 136 | 137 | /** 138 | * @return Turns the table into a printable string. 139 | */ 140 | [[nodiscard]] std::string to_string() const; 141 | 142 | private: 143 | std::optional _count_columns{ std::nullopt }; 144 | 145 | /// Headers 146 | std::vector> _header_row; 147 | 148 | /// Rows 149 | std::vector _rows; 150 | 151 | /// Offset of each row in number of empty spaces. 152 | std::uint64_t _offset{ 0U }; 153 | 154 | void static print_text_aligned(std::stringstream& stream, 155 | Alignment alignment, 156 | const std::string& text, 157 | std::size_t column_size); 158 | }; 159 | } -------------------------------------------------------------------------------- /include/perfcpp/mmap_buffer.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "util/unique_file_descriptor.h" 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | namespace perf { 13 | class MmapBuffer; 14 | 15 | /** 16 | * The MmapBufferOverflowWorker is responsible for handling an extra thread that waits for overflows to happen and then 17 | * triggers the MmapBuffer to handle the overflow. 18 | */ 19 | class MmapBufferOverflowWorker 20 | { 21 | public: 22 | MmapBufferOverflowWorker(MmapBuffer& mmap_buffer, const util::UniqueFileDescriptor& counter_file_descriptor); 23 | ~MmapBufferOverflowWorker() = default; 24 | 25 | MmapBufferOverflowWorker(MmapBufferOverflowWorker&&) = delete; 26 | MmapBufferOverflowWorker(const MmapBufferOverflowWorker&) = delete; 27 | 28 | /** 29 | * Cancels the worker thread and awaits its shutdown. 30 | */ 31 | void cancel(); 32 | 33 | private: 34 | /// Thread to run for handling overflows. 35 | std::thread _overflow_handle_thread; 36 | 37 | /// File descriptor to communicate with the thread in case of canceling the worker. 38 | util::UniqueFileDescriptor _cancel_thread_file_descriptor; 39 | 40 | /** 41 | * Worker function that waits for an overflow within the mmap-ed buffer–calling the mmap buffer to handle the 42 | * overflow–and a cancel signal to shut down the worker thread. 43 | * 44 | * @param mmap_buffer Mmap buffer that will be triggered to handle the overflow. 45 | * @param counter_file_descriptor File descriptor of the counter that is used to mmap the buffer; used to await the 46 | * overflow. 47 | * @param cancel_file_descriptor File descriptor used to communicate canceling the worker thread. 48 | */ 49 | static void run(MmapBuffer& mmap_buffer, 50 | util::FileDescriptorView counter_file_descriptor, 51 | util::FileDescriptorView cancel_file_descriptor) noexcept; 52 | }; 53 | 54 | class MmapBuffer 55 | { 56 | public: 57 | explicit MmapBuffer(const util::UniqueFileDescriptor& file_descriptor, std::uint64_t count_pages = 1ULL); 58 | ~MmapBuffer(); 59 | 60 | MmapBuffer(MmapBuffer&&) = delete; 61 | MmapBuffer(const MmapBuffer&) = delete; 62 | 63 | /** 64 | * Reads a performance monitoring counter value from the mmap-ed buffer via the `rdpmc` instruction. 65 | * 66 | * @return PMC value read via `rdpmc` from the buffer. 67 | */ 68 | [[nodiscard]] std::optional read_performance_monitoring_counter() const noexcept; 69 | 70 | /** 71 | * @return The entire data from the buffer, including all data copied from overflows. This will consume the data, 72 | * i.e., the caller owns the data. 
73 | */ 74 | [[nodiscard]] std::vector> consume_data(); 75 | 76 | /** 77 | * Copies the data from the mmap-ed buffer into a specific application-level buffer. 78 | * Overflow handling (i.e., this function) will be triggered by the overflow handler. 79 | */ 80 | void handle_overflow(); 81 | 82 | [[nodiscard]] explicit operator bool() const noexcept { return _ringbuffer_header != nullptr; } 83 | 84 | private: 85 | /// First page of the mmap-ed buffer; pointing to the buffer's header. 86 | perf_event_mmap_page* _ringbuffer_header{ nullptr }; 87 | 88 | /// Number of pages allocated via mmap. 89 | std::uint64_t _count_pages{ 0ULL }; 90 | 91 | /// List of data copied from the mmap-ed buffer after an overflow happened. 92 | std::vector> _overflow_data; 93 | 94 | /// Worker that waits for overflows and triggers the overflow handling. 95 | std::optional _overflow_worker{ std::nullopt }; 96 | 97 | /// Mutex for accessing the overflow data. 98 | alignas(64) std::mutex _overflow_data_mutex; 99 | 100 | /** 101 | * Copies the sample data from the mmap-ed buffer into a vector and sets the tail of the mmap-ed buffer accordingly. 102 | * 103 | * @return Data copied from the buffer. 104 | */ 105 | [[nodiscard]] std::vector copy_data_from_ringbuffer(); 106 | 107 | /** 108 | * Aligns the number of buffer pages to a number that is a power of two plus one for the header. 109 | * 110 | * @param number_of_buffer_pages Current number of buffer pages. 111 | * @return An aligned number that is a power of two plus one. Nothing changes if the number is already aligned. 112 | */ 113 | [[nodiscard]] static std::uint64_t align_number_of_buffer_pages(std::uint64_t number_of_buffer_pages) noexcept; 114 | }; 115 | } -------------------------------------------------------------------------------- /examples/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | ## Single-threaded 2 | add_executable(single-thread-statistics EXCLUDE_FROM_ALL examples/statistics/single_thread.cpp examples/access_benchmark.cpp) 3 | target_link_libraries(single-thread-statistics perf-cpp) 4 | 5 | ## Multi-threaded; but inherit counter from main-thread 6 | add_executable(inherit-thread-statistics EXCLUDE_FROM_ALL examples/statistics/inherit_thread.cpp examples/access_benchmark.cpp) 7 | target_link_libraries(inherit-thread-statistics perf-cpp) 8 | 9 | ## Multi-threaded with thread-local counter 10 | add_executable(multi-thread-statistics EXCLUDE_FROM_ALL examples/statistics/multi_thread.cpp examples/access_benchmark.cpp) 11 | target_link_libraries(multi-thread-statistics perf-cpp) 12 | 13 | ## Multi-CPU with per-CPU counter 14 | add_executable(multi-cpu-statistics EXCLUDE_FROM_ALL examples/statistics/multi_cpu.cpp examples/access_benchmark.cpp) 15 | target_link_libraries(multi-cpu-statistics perf-cpp) 16 | 17 | ## Multi-Process with per-process counter 18 | add_executable(multi-process-statistics EXCLUDE_FROM_ALL examples/statistics/multi_process.cpp examples/access_benchmark.cpp) 19 | target_link_libraries(multi-process-statistics perf-cpp) 20 | 21 | ## Metrics 22 | add_executable(metric EXCLUDE_FROM_ALL examples/statistics/metric.cpp examples/access_benchmark.cpp) 23 | target_link_libraries(metric perf-cpp) 24 | 25 | ## Live Events 26 | add_executable(live-events EXCLUDE_FROM_ALL examples/statistics/live_events.cpp examples/access_benchmark.cpp) 27 | target_link_libraries(live-events perf-cpp) 28 | 29 | ## Sampling instruction pointers 30 | add_executable(instruction-pointer-sampling EXCLUDE_FROM_ALL 
examples/sampling/instruction_pointer.cpp examples/access_benchmark.cpp) 31 | target_link_libraries(instruction-pointer-sampling perf-cpp) 32 | 33 | ## Sampling instruction pointers 34 | add_executable(counter-sampling EXCLUDE_FROM_ALL examples/sampling/counter.cpp examples/access_benchmark.cpp) 35 | target_link_libraries(counter-sampling perf-cpp) 36 | 37 | ## Branch sampling 38 | add_executable(branch-sampling EXCLUDE_FROM_ALL examples/sampling/branch.cpp examples/access_benchmark.cpp) 39 | target_link_libraries(branch-sampling perf-cpp) 40 | 41 | ## Memory address sampling 42 | add_executable(memory-address-sampling EXCLUDE_FROM_ALL examples/sampling/memory_address.cpp examples/access_benchmark.cpp) 43 | target_link_libraries(memory-address-sampling perf-cpp) 44 | 45 | ## Sampling user_registers 46 | add_executable(register-sampling EXCLUDE_FROM_ALL examples/sampling/register.cpp examples/access_benchmark.cpp) 47 | target_link_libraries(register-sampling perf-cpp) 48 | 49 | ## Sampling on multiple threads 50 | add_executable(multi-thread-sampling EXCLUDE_FROM_ALL examples/sampling/multi_thread.cpp examples/access_benchmark.cpp) 51 | target_link_libraries(multi-thread-sampling perf-cpp) 52 | 53 | ## Sampling on multiple threads 54 | add_executable(multi-cpu-sampling EXCLUDE_FROM_ALL examples/sampling/multi_cpu.cpp examples/access_benchmark.cpp) 55 | target_link_libraries(multi-cpu-sampling perf-cpp) 56 | 57 | ## Sampling with multiple events 58 | add_executable(multi-event-sampling EXCLUDE_FROM_ALL examples/sampling/multi_event.cpp examples/access_benchmark.cpp) 59 | target_link_libraries(multi-event-sampling perf-cpp) 60 | 61 | ## Sampling with raw values 62 | add_executable(context-switch-sampling EXCLUDE_FROM_ALL examples/sampling/context_switch.cpp examples/access_benchmark.cpp) 63 | target_link_libraries(context-switch-sampling perf-cpp) 64 | 65 | ## Analyze Samples with DataAnalyzer 66 | add_executable(memory-access-analyzer EXCLUDE_FROM_ALL examples/sampling/memory_access_analyzer.cpp examples/access_benchmark.cpp) 67 | target_link_libraries(memory-access-analyzer perf-cpp) 68 | 69 | ## Flame graph 70 | add_executable(flame-graph EXCLUDE_FROM_ALL examples/sampling/flame_graph.cpp examples/access_benchmark.cpp) 71 | target_compile_options(flame-graph PUBLIC "-g3") 72 | target_link_libraries(flame-graph perf-cpp) 73 | 74 | ## Analyze Samples with the perf tool 75 | add_executable(perf-record EXCLUDE_FROM_ALL examples/sampling/perf_record.cpp examples/access_benchmark.cpp) 76 | target_link_libraries(perf-record perf-cpp) 77 | 78 | ## List counters 79 | add_executable(counter-definition EXCLUDE_FROM_ALL examples/counter_definition.cpp) 80 | target_link_libraries(counter-definition perf-cpp) 81 | 82 | ## One target for all examples 83 | add_custom_target(examples) 84 | add_dependencies(examples 85 | single-thread-statistics inherit-thread-statistics multi-thread-statistics multi-cpu-statistics multi-process-statistics 86 | metric instruction-pointer-sampling counter-sampling branch-sampling 87 | memory-address-sampling register-sampling multi-thread-sampling multi-cpu-sampling 88 | multi-event-sampling context-switch-sampling 89 | memory-access-analyzer live-events flame-graph perf-record counter-definition) -------------------------------------------------------------------------------- /docs/build.md: -------------------------------------------------------------------------------- 1 | # How to build and include *perf-cpp* in your project 2 | *perf-cpp* can be build manually or 
included into CMake projects. 3 | 4 | ## Table of Contents 5 | - [Building Manually](#building-manually) 6 | - [Build the Library](#build-the-library) 7 | - [Install the Library](#install-the-library) 8 | - [Build the Examples](#build-examples) 9 | - [Building as a Dynamically Linked Library](#building-as-a-dynamically-linked-library) 10 | - [Use CMake](#including-into-cmakeliststxt) 11 | - [ExternalProject](#via-externalproject) 12 | - [FetchContent](#via-fetchcontent) 13 | - [find_package](#via-find_package) 14 | --- 15 | 16 | ## Building Manually 17 | > [!NOTE] 18 | > Throughout the documentation, we use `./build` as the build directory. 19 | > However, the build directory can be any directory of your choice (including `.`). 20 | 21 | ### Build the Library 22 | #### Download the source code 23 | 24 | ```bash 25 | git clone https://github.com/jmuehlig/perf-cpp.git 26 | cd perf-cpp 27 | 28 | # Optional: switch to this development version 29 | git checkout v0.12.4 30 | ``` 31 | 32 | #### Generate the Makefile and Build 33 | 34 | ```bash 35 | cmake . -B build 36 | cmake --build build 37 | ``` 38 | 39 | ### Install the Library 40 | To install the library, specify the `CMAKE_INSTALL_PREFIX`: 41 | ```bash 42 | # Generate Makefile 43 | cmake . -B build -DCMAKE_INSTALL_PREFIX=/path/to/install/dir 44 | 45 | # Build 46 | cmake --build build 47 | 48 | # Install 49 | cmake --install build 50 | ``` 51 | 52 | The library will then be available for discovery via CMake and `find_package` (see [below](#via-find_package)). 53 | 54 | ### Generate Processor-specific Events 55 | With `-DGEN_PROCESSOR_EVENTS=1`, the build process will try to read the processor-specific events from the event library ([events/](../events)) and generate a source file (`src/processor_specific_event_provider.cpp`) that adds these events to every (the default and *manually* instantiated) `perf::CounterDefinition` (see also the documentation on [hardware events](counters.md)). 56 | 57 | With this option, processor-specific events can be used like *built-in* ones. 58 | 59 | ```bash 60 | # Generate Makefile and source file for processor-specific events 61 | cmake . -B build -DGEN_PROCESSOR_EVENTS=1 62 | 63 | # Build Library 64 | cmake --build build 65 | ``` 66 | 67 | > [!IMPORTANT] 68 | > Depending on the underlying processor, the source file can grow very large and increase compilation time significantly. 69 | 70 | 71 | ### Build Examples 72 | Enable example compilation with `-DBUILD_EXAMPLES=1` and build the `examples` target: 73 | 74 | ```bash 75 | # Generate Makefile 76 | cmake . -B build -DBUILD_EXAMPLES=1 77 | 78 | # Build Library and Examples 79 | cmake --build build --target examples 80 | ``` 81 | 82 | The example binaries will be located in `build/examples/bin`. 83 | 84 | ### Building as a Dynamically Linked Library 85 | By default, *perf-cpp* is build as a **static** library. 86 | You can request to build a **shared** library with `-DBUILD_LIB_SHARED=1`: 87 | 88 | ```bash 89 | cmake . -B build -DBUILD_LIB_SHARED=1 90 | cmake --build build 91 | ``` 92 | 93 | ## Including into `CMakeLists.txt` 94 | *perf-cpp* uses [CMake](https://cmake.org/) as its build system, facilitating integration into additional CMake projects. 
95 | Choose from the following methods: 96 | 97 | ### Via ExternalProject 98 | Include `ExternalProject` in your `CMakeLists.txt` and define the project: 99 | 100 | ```cmake 101 | include(ExternalProject) 102 | ExternalProject_Add( 103 | perf-cpp-external 104 | GIT_REPOSITORY "https://github.com/jmuehlig/perf-cpp" 105 | GIT_TAG "v0.12.4" 106 | PREFIX "lib/perf-cpp" 107 | INSTALL_COMMAND cmake -E echo "" 108 | ) 109 | ``` 110 | * Add `lib/perf-cpp/src/perf-cpp-external/include` to your `include_directories()`. 111 | * Add `lib/perf-cpp/src/perf-cpp-external-build` to your `link_directories()`. 112 | 113 | Note: The directory `lib/` can be any folder of your choice. 114 | 115 | ### Via FetchContent 116 | Include `FetchContent` in your `CMakeLists.txt` and define the project: 117 | 118 | ```cmake 119 | include(FetchContent) 120 | FetchContent_Declare( 121 | perf-cpp-external 122 | GIT_REPOSITORY "https://github.com/jmuehlig/perf-cpp" 123 | GIT_TAG "v0.12.4" 124 | ) 125 | FetchContent_MakeAvailable(perf-cpp-external) 126 | ``` 127 | * Add `perf-cpp` to your linked libraries. 128 | * Add `${perf-cpp-external_SOURCE_DIR}/include/` to your include directories. 129 | 130 | ### Via find_package 131 | If *perf-cpp* is already installed on your system (see [install instructions above](#install-the-library)), you can simply use `find_package` to link it with your project: 132 | 133 | ```cmake 134 | find_package(perf-cpp REQUIRED) 135 | target_link_libraries(your-target PRIVATE perf-cpp::perf-cpp) # link the imported target against your own target 136 | ``` 137 | -------------------------------------------------------------------------------- /examples/sampling/multi_cpu.cpp: -------------------------------------------------------------------------------- 1 | #include "../access_benchmark.h" 2 | #include "perfcpp/sampler.h" 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | int 9 | main() 10 | { 11 | std::cout << "libperf-cpp example: Record perf samples including time, " 12 | "instruction pointer, and cpu id for multi-threaded random " 13 | "access to an in-memory array on multiple CPU cores." 14 | << std::endl; 15 | 16 | constexpr auto count_threads = 4U; 17 | 18 | /// Create a list of cpus to sample (all available, in this example). 19 | auto cpus_to_watch = std::vector(std::min(4U, std::thread::hardware_concurrency())); 20 | std::iota(cpus_to_watch.begin(), cpus_to_watch.end(), 0U); 21 | 22 | auto sampler = perf::MultiCoreSampler{ std::move(cpus_to_watch) }; 23 | 24 | /// Setup event that triggers writing samples. 25 | sampler.trigger("cycles", perf::Period{ 50000 }); 26 | 27 | /// Setup what data the samples should include (timestamp, instruction pointer, CPU id, thread id). 28 | sampler.values().timestamp(true).instruction_pointer(true).cpu_id(true).thread_id(true); 29 | 30 | /// Create random access benchmark. 31 | auto benchmark = perf::example::AccessBenchmark{ /*randomize the accesses*/ true, 32 | /* create benchmark of 1024 MB */ 1024U }; 33 | 34 | /// Allocate space for threads and their results. 35 | const auto items_per_thread = benchmark.size() / count_threads; 36 | auto threads = std::vector{}; 37 | auto thread_local_results = 38 | std::vector(count_threads, 0U); /// Array to store the thread-local results. 39 | 40 | /// Barrier for the threads to wait in order to start them all at the same time.
41 | auto thread_barrier = std::atomic{ false }; 42 | 43 | for (auto thread_index = 0U; thread_index < count_threads; ++thread_index) { 44 | threads.emplace_back([thread_index, items_per_thread, &thread_local_results, &benchmark, &thread_barrier]() { 45 | auto local_value = 0ULL; 46 | 47 | /// Wait for the barrier to become "true", i.e., all threads are spawned. 48 | while (!thread_barrier) 49 | ; 50 | 51 | /// Process the data. 52 | for (auto index = 0U; index < items_per_thread; ++index) { 53 | local_value += benchmark[(thread_index * items_per_thread) + index].value; 54 | } 55 | 56 | thread_local_results[thread_index] = local_value; 57 | }); 58 | } 59 | 60 | /// Start sampling for all specified CPUs at once. 61 | try { 62 | sampler.start(); 63 | } catch (std::runtime_error& exception) { 64 | std::cerr << exception.what() << std::endl; 65 | return 1; 66 | } 67 | 68 | /// Let threads start. 69 | thread_barrier = true; 70 | 71 | /// Wait for all threads to finish. 72 | for (auto& thread : threads) { 73 | thread.join(); 74 | } 75 | 76 | /// Stop sampling on all CPUs. 77 | sampler.stop(); 78 | 79 | /// Add up the results so that the compiler does not get the idea of 80 | /// optimizing away the accesses. 81 | auto value = std::accumulate(thread_local_results.begin(), thread_local_results.end(), 0UL); 82 | 83 | /// We do not want the compiler to optimize away this (otherwise) unused value. 84 | benchmark.pretend_to_use(value); 85 | 86 | /// Get all the recorded samples – ordered by timestamp. 87 | auto samples = sampler.result(true); 88 | 89 | /// Print the first samples. 90 | const auto count_show_samples = std::min(samples.size(), 40U); 91 | std::cout << "\nRecorded " << samples.size() << " samples." << std::endl; 92 | std::cout << "Here are the first " << count_show_samples << " recorded samples:\n" << std::endl; 93 | for (auto index = 0U; index < count_show_samples; ++index) { 94 | const auto& sample = samples[index]; 95 | 96 | /// Since we recorded the time, period, the instruction pointer, and the CPU 97 | /// id, we can only read these values. 98 | if (sample.metadata().timestamp().has_value() && sample.metadata().cpu_id().has_value() && 99 | sample.metadata().thread_id().has_value() && 100 | sample.instruction_execution().logical_instruction_pointer().has_value()) { 101 | std::cout << "Time = " << sample.metadata().timestamp().value() 102 | << " | CPU ID = " << sample.metadata().cpu_id().value() 103 | << " | Thread ID = " << sample.metadata().thread_id().value() << " | Instruction Pointer = 0x" 104 | << std::hex << sample.instruction_execution().logical_instruction_pointer().value() << std::dec << "\n"; 105 | } 106 | } 107 | std::cout << std::flush; 108 | 109 | /// Close the sampler. 110 | /// Note that the sampler can only be closed after reading the samples. 111 | sampler.close(); 112 | 113 | return 0; 114 | } -------------------------------------------------------------------------------- /include/perfcpp/hardware_info.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | 5 | #include 6 | #if defined(__x86_64__) || defined(__i386__) 7 | #include 8 | #endif 9 | 10 | #if !(defined(__x86_64__) || defined(__i386__)) 11 | #define __builtin_cpu_is(x) 0 12 | #endif 13 | 14 | namespace perf { 15 | /** 16 | * Access to information about the underlying hardware substrate like manufacturer and perf specifics. 
17 | */ 18 | class HardwareInfo 19 | { 20 | public: 21 | /** 22 | * @return True, if the underlying hardware is an Intel processor. 23 | */ 24 | [[nodiscard]] static bool is_intel() noexcept { return static_cast(__builtin_cpu_is("intel")); } 25 | 26 | /** 27 | * @return True, if the underlying Intel processor requires an aux counter for memory sampling. 28 | */ 29 | [[nodiscard]] static bool is_intel_aux_counter_required(); 30 | 31 | /** 32 | * @return True, if the underlying Intel processor is equal or newer than the 12th generation. 33 | */ 34 | [[nodiscard]] static bool is_intel_12th_generation_or_newer(); 35 | 36 | /** 37 | * @return True, if the underlying hardware is an AMD processor. 38 | */ 39 | [[nodiscard]] static bool is_amd() noexcept { return static_cast(__builtin_cpu_is("amd")); } 40 | 41 | /** 42 | * @return True, if the underlying AMD processor supports Instruction Based Sampling (IBS). 43 | */ 44 | [[nodiscard]] static bool is_amd_ibs_supported() noexcept; 45 | 46 | /** 47 | * @return True, if the underlying AMD processor supports Instruction Based Sampling (IBS) with L3 filter. 48 | */ 49 | [[nodiscard]] static bool is_ibs_l3_filter_supported() noexcept; 50 | 51 | /** 52 | * @return The page size of memory of the underlying machine. 53 | */ 54 | [[nodiscard]] static std::uint64_t memory_page_size(); 55 | 56 | /** 57 | * @return The number of physical performance counters per logical CPU core. 58 | */ 59 | [[nodiscard]] static std::uint8_t physical_performance_counters_per_logical_core(); 60 | 61 | /** 62 | * @return The number of events that can be scheduled to the same physical performance counter. 63 | */ 64 | [[nodiscard]] static std::uint8_t events_per_physical_performance_counter(); 65 | 66 | private: 67 | static std::optional _is_intel_aux_event_required; 68 | static std::optional _is_intel_12th_generation_or_newer; 69 | static std::optional _is_amd_ibs_supported; 70 | static std::optional _is_ibs_l3_filter_supported; 71 | static std::optional _memory_page_size; 72 | static std::optional _physical_performance_counters_per_logical_core; 73 | static std::optional _events_per_physical_performance_counter; 74 | 75 | #if defined(__x86_64__) || defined(__i386__) 76 | /** 77 | * Result of a __get_cpuid call. 78 | */ 79 | class CPUIDResult 80 | { 81 | public: 82 | CPUIDResult() noexcept = default; 83 | ~CPUIDResult() noexcept = default; 84 | 85 | std::uint32_t eax; 86 | std::uint32_t ebx; 87 | std::uint32_t ecx; 88 | std::uint32_t edx; 89 | }; 90 | 91 | /** 92 | * Fires a __get_cpuid call with the provided leaf and sub leaf. In case the call was successful, the register values 93 | * are returned. 94 | * 95 | * @param leaf Leaf. 96 | * @param sub_leaf Sub leaf (0 by default). 97 | * @return Register values (eax, ebx, ecx, edx) in case the cpuid request was successful. 98 | */ 99 | static std::optional cpuid(std::uint32_t leaf, std::uint32_t sub_leaf = 0U) noexcept; 100 | #endif 101 | 102 | /** 103 | * Writes a value into the cache variable and returns the value. 104 | * 105 | * @param variable Cache variable. 106 | * @param value Value to write into the cache variable. 107 | * @return The cached value. 108 | */ 109 | template 110 | [[nodiscard]] static T cache_value(std::optional& variable, const T value) 111 | { 112 | variable = value; 113 | return value; 114 | } 115 | 116 | /** 117 | * Tries to open a performance counter with more and more events until it cannot open more events on a single physical 118 | * performance counter. 
119 | * 120 | * @param is_identify_hardware_counters If true, identify the number of hardware counters. Otherwise, identify the 121 | * number of events per hardware counter. 122 | * @return The maximum number of events on a single physical performance counter. 123 | */ 124 | [[nodiscard]] static std::optional explore_hardware_counters_experimentally( 125 | bool is_identify_hardware_counters); 126 | 127 | /** 128 | * Creates a list for hardware counter and event identification. The list may depend on the underlying hardware (e.g., 129 | * some ARM CPUs do not support all events defined by the perf subsystem). 130 | * 131 | * @return List of events to experiment with for hardware counter and event identification. 132 | */ 133 | [[nodiscard]] static std::vector generate_events_for_counter_identification(); 134 | }; 135 | } -------------------------------------------------------------------------------- /docs/recording-live-events.md: -------------------------------------------------------------------------------- 1 | # Accessing Live Event Counts 2 | The *perf-cpp* library supports reading hardware performance counter values without stopping the counters ("live" events), particularly on `x86` systems using the [rdpmc](https://www.felixcloutier.com/x86/rdpmc) instruction. 3 | This feature allows for interim results during ongoing computations, ideal for real-time monitoring and adjustments. 4 | 5 | The `perf::EventCounter` class is designed to support both standard and "live" events, allowing configuration of hardware performance counters to access results either "live" (for interim results) or after stopping. 6 | For the latter, see [the recording basics documentation](recording.md). 7 | 8 | > [!TIP] 9 | > Our examples include a working code example: **[statistics/live_events.cpp](../examples/statistics/live_events.cpp)**. 10 | 11 | --- 12 | ## Table of Contents 13 | - [Setting Up Live Events](#setting-up-live-events) 14 | - [Initializing the Hardware Counters *(optional)*](#initializing-the-hardware-counters-optional) 15 | - [Reading Live Events During Computation](#reading-live-events-during-computation) 16 | - [Finalizing and Retrieving Results](#finalizing-and-retrieving-results) 17 | --- 18 | 19 | ## Setting Up Live Events 20 | Define which events to monitor live and which to read post-computation using the `perf::EventCounter`: 21 | 22 | ```cpp 23 | #include <perfcpp/event_counter.h> 24 | 25 | auto event_counter = perf::EventCounter{}; 26 | 27 | try { 28 | /// Events for live monitoring. 29 | event_counter.add_live({"cache-misses", "cache-references", "branches"}); 30 | } catch (std::runtime_error& e) { 31 | std::cerr << e.what() << std::endl; 32 | } 33 | ``` 34 | 35 | > [!IMPORTANT] 36 | > In our experience, not mixing live and "traditional" events leads to more consistent results. 37 | 38 | > [!NOTE] 39 | > Live events can only capture hardware events, not metrics. 40 | 41 | ## Initializing the Hardware Counters *(optional)* 42 | Optionally, prepare the hardware counters ahead of time to exclude the configuration time from your measurements; if you skip this step, it is handled automatically when the counters are started: 43 | 44 | ```cpp 45 | try { 46 | event_counter.open(); 47 | } catch (std::runtime_error& e) { 48 | std::cerr << e.what() << std::endl; 49 | } 50 | ``` 51 | 52 | ## Reading Live Events During Computation 53 | The library provides two methods for accessing live events during computation: directly via the `EventCounter` and using a simplified `LiveEventCounter` wrapper.
54 | 55 | ### Option 1: Direct Access via `EventCounter` 56 | Events added as live events (via `add_live()`) can be directly accessed from the `EventCounter` without stopping. 57 | To be efficient, read live event counts by pre-allocating memory for the results to avoid allocation overheads during critical measurement phases: 58 | 59 | ```cpp 60 | try { 61 | event_counter.start(); 62 | } catch (std::runtime_error& e) { 63 | std::cerr << e.what() << std::endl; 64 | } 65 | 66 | /// Pre-allocated containers for live results. 67 | auto start_values = std::vector{/* cache-misses */ .0, /* cache-references */ .0}; 68 | auto end_values = std::vector{/* cache-misses */ .0, /* cache-references */ .0}; 69 | 70 | for (auto i = 0U; i < runs; ++i) { 71 | /// Capture start values. 72 | event_counter.live_results(start_values); 73 | 74 | /// Computation here... 75 | 76 | /// Capture end values after computation. 77 | event_counter.live_results(end_values); 78 | 79 | std::cout << "Live Results: " 80 | << "cache-misses: " << end_values[0U] - start_values[0U] << "," 81 | << "cache-references: " << end_values[1U] - start_values[1U] << std::endl; 82 | } 83 | ``` 84 | 85 | ### Option 2: Simplified Access via `LiveEventCounter` Wrapper 86 | The `LiveEventCounter` provides a streamlined method to manage live event monitoring by handling memory management and calculation of differences internally. 87 | 88 | ```cpp 89 | /// Initiate the LiveEventCounter wrapper before starting. 90 | auto live_event_counter = perf::LiveEventCounter{ event_counter }; 91 | 92 | try { 93 | event_counter.start(); 94 | } catch (std::runtime_error& e) { 95 | std::cerr << e.what() << std::endl; 96 | } 97 | 98 | for (auto i = 0U; i < runs; ++i) { 99 | /// Capture start values. 100 | live_event_counter.start(); 101 | 102 | /// Computation here... 103 | 104 | /// Capture end values after computation. 105 | live_event_counter.stop(); 106 | 107 | std::cout << "Live Results: " 108 | << "cache-misses: " << live_event_counter.get("cache-misses") << "," 109 | << "cache-references: " << live_event_counter.get("cache-references") << std::endl; 110 | } 111 | ``` 112 | 113 | ## Finalizing and Retrieving Results 114 | Upon completion, stop the counters: 115 | 116 | ```cpp 117 | /// Stop the counter after processing. 118 | event_counter.stop(); 119 | ``` 120 | 121 | For further information, refer to the [recording basics documentation](recording.md) and the [code example](../examples/statistics/live_events.cpp). 122 | -------------------------------------------------------------------------------- /include/perfcpp/util/graph.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | namespace perf::util { 9 | template 10 | class DirectedGraph 11 | { 12 | public: 13 | DirectedGraph() = default; 14 | ~DirectedGraph() = default; 15 | 16 | /** 17 | * Inserts a node into the graph. 18 | * 19 | * @param node Node to insert. 20 | */ 21 | void insert(const N& node) { _nodes_and_edges.insert(std::make_pair(node, std::unordered_set())); } 22 | 23 | /** 24 | * Creates an edge between the node and the successor. 25 | * 26 | * @param node Starting node. 27 | * @param successor Ending node. 28 | */ 29 | void connect(const N& node, const N& successor) 30 | { 31 | /// Insert the successor node, if the node does not exist in the graph. 
32 | if (_nodes_and_edges.find(successor) == _nodes_and_edges.end()) { 33 | insert(successor); 34 | } 35 | 36 | /// If the node exists, add the successor into the successor set. 37 | if (auto iterator = _nodes_and_edges.find(node); iterator != _nodes_and_edges.end()) { 38 | iterator->second.insert(std::move(successor)); 39 | } 40 | 41 | /// Otherwise, create a new node with the successor. 42 | else { 43 | _nodes_and_edges.insert(std::make_pair(node, std::unordered_set{ successor })); 44 | } 45 | } 46 | 47 | /** 48 | * @return True, when the graph is empty. 49 | */ 50 | [[nodiscard]] bool empty() const noexcept { return _nodes_and_edges.empty(); } 51 | 52 | /** 53 | * @return The first node that has no incoming edge. 54 | */ 55 | [[nodiscard]] std::optional pop() 56 | { 57 | for (auto& [node, _] : _nodes_and_edges) { 58 | /// Check every node if the node is not a successor. 59 | if (this->is_successor(node) == false) { 60 | 61 | const auto node_without_successor = node; 62 | 63 | /// If the node is not a successor, remove the node and return it. 64 | this->erase(node); 65 | 66 | return node_without_successor; 67 | } 68 | } 69 | 70 | return std::nullopt; 71 | } 72 | 73 | /** 74 | * Checks if the directed graph contains a cycle. 75 | * Uses DFS with three-color approach for optimal O(V + E) performance. 76 | * 77 | * @return True if the graph has a cycle, false otherwise. 78 | */ 79 | [[nodiscard]] bool is_cyclic() const noexcept 80 | { 81 | if (empty()) { 82 | return false; 83 | } 84 | 85 | // Three-color DFS: 0 = white (unvisited), 1 = gray (in current path), 2 = black (finished) 86 | auto node_color = std::unordered_map{}; 87 | 88 | // Initialize all nodes as white 89 | for (const auto& [node, _] : _nodes_and_edges) { 90 | node_color.insert(std::make_pair(node, 0U)); 91 | } 92 | 93 | // Check each unvisited node 94 | for (const auto& [node, _] : _nodes_and_edges) { 95 | if (node_color[node] == 0U && dfs_has_cycle(node, node_color)) { 96 | return true; 97 | } 98 | } 99 | 100 | return false; 101 | } 102 | 103 | private: 104 | /// Map of nodes and their successors. 105 | std::unordered_map> _nodes_and_edges; 106 | 107 | /** 108 | * Checks if the node is in any successor list. 109 | * 110 | * @param node Node to check. 111 | * @return True, of the node is in any successor list. 112 | */ 113 | [[nodiscard]] bool is_successor(const N& node) const noexcept 114 | { 115 | for (const auto& [_, successors] : _nodes_and_edges) { 116 | if (successors.find(node) != successors.end()) { 117 | return true; 118 | } 119 | } 120 | 121 | return false; 122 | } 123 | 124 | /** 125 | * Removes the node from the graph. 126 | * 127 | * @param node Node to remove. 128 | */ 129 | void erase(const N& node) { _nodes_and_edges.erase(node); } 130 | 131 | /** 132 | * DFS helper function for cycle detection. 133 | * 134 | * @param node Current node being explored. 135 | * @param node_color Color map tracking node states. 136 | * @return True if a cycle is found during this DFS traversal. 
137 | */ 138 | [[nodiscard]] bool dfs_has_cycle(const N& node, std::unordered_map& node_color) const noexcept 139 | { 140 | // Mark current node as gray (in current path) 141 | node_color[node] = 1U; 142 | 143 | // Find the node's successors 144 | if (const auto iterator = _nodes_and_edges.find(node); iterator != _nodes_and_edges.end()) { 145 | for (const auto& successor : iterator->second) { 146 | if (node_color[successor] == 1U) { 147 | // Found a back edge (gray node) - cycle detected 148 | return true; 149 | } 150 | if (node_color[successor] == 0U && dfs_has_cycle(successor, node_color)) { 151 | // Recursively check unvisited successors 152 | return true; 153 | } 154 | } 155 | } 156 | 157 | // Mark current node as black (finished processing) 158 | node_color[node] = 2U; 159 | return false; 160 | } 161 | }; 162 | } -------------------------------------------------------------------------------- /include/perfcpp/analyzer/data_type.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | namespace perf::analyzer { 9 | /** 10 | * The DataType projects a data object with members (attributes). 11 | */ 12 | class DataType 13 | { 14 | public: 15 | class Member 16 | { 17 | public: 18 | Member(std::string&& name, const std::size_t offset, const std::size_t size) noexcept 19 | : _name(std::move(name)) 20 | , _offset(offset) 21 | , _size(size) 22 | { 23 | } 24 | ~Member() = default; 25 | 26 | [[nodiscard]] const std::string& name() const noexcept { return _name; } 27 | [[nodiscard]] std::size_t offset() const noexcept { return _offset; } 28 | [[nodiscard]] std::size_t size() const noexcept { return _size; } 29 | [[nodiscard]] const std::vector& samples() const noexcept { return _samples; } 30 | [[nodiscard]] std::vector& samples() noexcept { return _samples; } 31 | 32 | private: 33 | std::string _name; 34 | std::size_t _offset; 35 | std::size_t _size; 36 | 37 | std::vector _samples; 38 | }; 39 | 40 | DataType(std::string&& name, const std::size_t size) 41 | : _name(std::move(name)) 42 | , _size(size) 43 | { 44 | } 45 | DataType(const DataType&) = default; 46 | DataType(DataType&&) noexcept = default; 47 | DataType(std::string&& new_name, const DataType& other) 48 | : _name(std::move(new_name)) 49 | , _size(other._size) 50 | , _members(other._members) 51 | { 52 | } 53 | ~DataType() = default; 54 | 55 | DataType& operator=(const DataType&) = default; 56 | DataType& operator=(DataType&&) noexcept = default; 57 | 58 | /** 59 | * @return Name of the data type. 60 | */ 61 | [[nodiscard]] const std::string& name() const noexcept { return _name; } 62 | 63 | /** 64 | * @return Size of the data type. 65 | */ 66 | [[nodiscard]] std::size_t size() const noexcept { return _size; } 67 | 68 | /** 69 | * @return List of all members. 70 | */ 71 | [[nodiscard]] const std::vector& members() const noexcept { return _members; } 72 | 73 | /** 74 | * @return List of all members. 75 | */ 76 | [[nodiscard]] std::vector& members() noexcept { return _members; } 77 | 78 | /** 79 | * Adds a member with the given name and size to the data type. A member can be, for example, an attribute of the data 80 | * type. 81 | * 82 | * @param member_name Name of the member. 83 | * @param size Size of the member. 
84 | */ 85 | void add(std::string&& member_name, const std::size_t size) 86 | { 87 | if (_members.empty()) { 88 | add(std::move(member_name), 0U, size); 89 | } else { 90 | add(std::move(member_name), _members.back().offset() + _members.back().size(), size); 91 | } 92 | } 93 | 94 | /** 95 | * Adds a member with the given name and size with a specified offset (relative to the beginning of the data object) 96 | * to the data type wit. A member can be, for example, an attribute of the data type. 97 | * 98 | * @param member_name Name of the member. 99 | * @param offset Offset of the member, relative to the data type. 100 | * @param size Size of the member. 101 | */ 102 | void add(std::string&& member_name, const std::size_t offset, const std::size_t size) 103 | { 104 | _members.emplace_back(std::move(member_name), offset, size); 105 | } 106 | 107 | /** 108 | * Adds a member to the data type. Name and size will be derived from the type in the template. 109 | */ 110 | template 111 | void add() 112 | { 113 | add(std::string{ typeid(T).name() }, sizeof(T)); 114 | } 115 | 116 | /** 117 | * Adds a member with a given name to the data type. The Size will be derived from the type in the template. 118 | * 119 | * @param name Name of the member. 120 | */ 121 | template 122 | void add(std::string&& name) 123 | { 124 | add(std::move(name), sizeof(T)); 125 | } 126 | 127 | /** 128 | * Adds a member with a given name to the data type. The Size will be derived from the type in the template. 129 | * 130 | * @param name Name of the member. 131 | */ 132 | template 133 | void add(const std::string& name) 134 | { 135 | add(std::string{ name }, sizeof(T)); 136 | } 137 | 138 | /** 139 | * Adds a member at a specific offset to the data type. Name and size will be derived from the type in the template. 140 | * 141 | * @param offset Offset relative to the data type. 142 | */ 143 | template 144 | void add(const std::size_t offset) 145 | { 146 | add(typeid(T).name(), offset, sizeof(T)); 147 | } 148 | 149 | /** 150 | * Adds a member with a given name at a specific offset to the data type. The ize will be derived from the type in the 151 | * template. 152 | * 153 | * @param name 154 | * @param offset 155 | */ 156 | template 157 | void add(std::string&& name, const std::size_t offset) 158 | { 159 | add(std::move(name), offset, sizeof(T)); 160 | } 161 | 162 | private: 163 | std::string _name; 164 | std::size_t _size; 165 | std::vector _members; 166 | }; 167 | } -------------------------------------------------------------------------------- /examples/sampling/multi_event.cpp: -------------------------------------------------------------------------------- 1 | #include "../access_benchmark.h" 2 | #include "perfcpp/hardware_info.h" 3 | #include "perfcpp/sampler.h" 4 | #include 5 | 6 | int 7 | main() 8 | { 9 | std::cout << "libperf-cpp example: Record perf samples including time, " 10 | "logical memory address, latency, and data source for " 11 | "single-threaded random access to an in-memory array " 12 | "using multiple events as trigger." 13 | << std::endl; 14 | 15 | /// Initialize sampler. 
16 | auto sampler = perf::Sampler{}; 17 | 18 | if (perf::HardwareInfo::is_intel()) { 19 | sampler.trigger(std::vector>{ 20 | { 21 | perf::Sampler::Trigger{ "mem-loads", perf::Precision::RequestZeroSkid, perf::Period{ 8000U } } /// Loads 22 | }, 23 | { perf::Sampler::Trigger{ "mem-stores", perf::Precision::MustHaveZeroSkid, perf::Period{ 8000U } } } /// Stores 24 | }); 25 | } else { 26 | std::cout << "Error: Memory sampling with multiple triggers is not supported on this CPU." << std::endl; 27 | return 1; 28 | } 29 | 30 | /// Define what to sample. 31 | sampler.values().timestamp(true).logical_memory_address(true).data_source(true).latency(true); 32 | 33 | /// Create random access benchmark. 34 | auto benchmark = perf::example::AccessBenchmark{ /*randomize the accesses*/ true, 35 | /* create benchmark of 512 MB */ 512U, 36 | /* also support writing */ true }; 37 | 38 | /// Start sampling. 39 | try { 40 | sampler.start(); 41 | } catch (std::runtime_error& exception) { 42 | std::cerr << exception.what() << std::endl; 43 | return 1; 44 | } 45 | 46 | /// Execute the benchmark (accessing cache lines in a random order). 47 | auto value = 0ULL; 48 | for (auto index = 0U; index < benchmark.size(); ++index) { 49 | value += benchmark[index].value; 50 | 51 | /// Also write a value to get store events. 52 | benchmark.set(index, value); 53 | } 54 | 55 | /// We do not want the compiler to optimize away this (otherwise) unused value. 56 | benchmark.pretend_to_use(value); 57 | 58 | /// Stop sampling. 59 | sampler.stop(); 60 | 61 | /// Get all the recorded samples. 62 | auto samples = sampler.result(/* sort by time */ true); 63 | const auto count_samples_before_filter = samples.size(); 64 | 65 | /// Print the first samples. 66 | const auto count_show_samples = std::min(samples.size(), 40U); 67 | std::cout << "\nRecorded " << count_samples_before_filter << " samples. " << samples.size() 68 | << " remaining after filter." << std::endl; 69 | std::cout << "Here are the first " << count_show_samples << " recorded samples:\n" << std::endl; 70 | for (auto index = 0U; index < count_show_samples; ++index) { 71 | const auto& sample = samples[index]; 72 | 73 | /// Since we recorded the time, period, the instruction pointer, and the CPU 74 | /// id, we can only read these values. 
75 | if (sample.metadata().timestamp().has_value() && sample.data_access().logical_memory_address().has_value() && 76 | sample.data_access().source().has_value()) { 77 | auto data_source = "N/A"; 78 | if (sample.data_access().source()->is_l1_hit()) { 79 | data_source = "L1d"; 80 | } else if (sample.data_access().source()->is_mhb_hit().value_or(false)) { 81 | data_source = "LFB/MAB"; 82 | } else if (sample.data_access().source()->is_l2_hit()) { 83 | data_source = "L2"; 84 | } else if (sample.data_access().source()->is_l3_hit()) { 85 | data_source = "L3"; 86 | } else if (sample.data_access().source()->is_memory_hit()) { 87 | data_source = "RAM"; 88 | } 89 | 90 | auto type = "N/A"; 91 | if (sample.instruction_execution().type().has_value()) { 92 | if (sample.data_access().is_load()) { 93 | type = "Load"; 94 | } else if (sample.data_access().is_store()) { 95 | type = "Store"; 96 | } 97 | } 98 | 99 | const auto instruction_latency = sample.instruction_execution().latency().instruction_retirement().value_or( 100 | sample.instruction_execution().latency().uop_tag_to_retirement().value_or(0U)); 101 | const auto cache_latency = 102 | sample.data_access().latency().cache_miss().value_or(sample.data_access().latency().cache_miss().value_or(0U)); 103 | 104 | std::cout << "Time = " << sample.metadata().timestamp().value() << " | Logical Mem Address = 0x" << std::hex 105 | << sample.data_access().logical_memory_address().value() << std::dec 106 | << " | Latency (cache, instruction) = " << cache_latency << ", " << instruction_latency 107 | << " | Type = " << type << " | Data Source = " << data_source << "\n"; 108 | } else if (sample.count_loss().has_value()) { 109 | std::cout << "Loss = " << sample.count_loss().value() << "\n"; 110 | } 111 | } 112 | std::cout << std::flush; 113 | 114 | /// Close the sampler. 115 | /// Note that the sampler can only be closed after reading the samples. 116 | sampler.close(); 117 | 118 | return 0; 119 | } -------------------------------------------------------------------------------- /docs/sampling-symbols-and-flamegraphs.md: -------------------------------------------------------------------------------- 1 | # Symbols and Flamegraphs 2 | Performance bottlenecks often hide inside deep call stacks: the one slow function that is really stalling your frame rate sits four layers below the code you are looking at. 3 | A flamegraph ([example](https://www.brendangregg.com/flamegraphs.html)) collapses thousands of sampled call-stacks into a single, interactive SVG where the widest bars show the functions that burn the most CPU time. 4 | 5 | *perf‑cpp* now provides two building blocks for flamegraph generation: 6 | - **Symbol resolution**: translate raw instruction pointers into `::+` strings. 7 | - **Collapsed‑stack export**: emit samples in the canonical `func1;func2;func3 ` format understood by tools such as [Brendan Gregg's FlameGraph](https://github.com/brendangregg/FlameGraph), [Speedscope](https://www.speedscope.app/), or [flamegraph.com](https://flamegraph.com/). 8 | 9 | With just a few lines of code you can record samples, resolve symbols, and open a browser to an interactive heat‑map of your code. 
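To make the second building block concrete, here is what a collapsed-stack file looks like: one line per unique call stack, frames separated by semicolons (outermost caller first), followed by a space and the number of samples attributed to that stack. The function names below are purely illustrative, not the output of any specific run:

```
main;run_benchmark;AccessBenchmark::operator[] 184
main;run_benchmark;compute_checksum 97
main;parse_arguments 3
```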
10 | 11 | --- 12 | ## Table of Contents 13 | - [Translating Instruction Pointers into Symbols](#translating-instruction-pointers-into-symbols) 14 | - [Translating Sampler Results into Flame Graphs](#translating-sampler-results-into-flame-graphs) 15 | - [Setting up the Sampler](#setting-up-the-sampler) 16 | - [Generating Flamegraphs](#generating-flamegraphs) 17 | --- 18 | 19 | ## Translating Instruction Pointers into Symbols 20 | The `perf::SymbolResolver` translates logical instruction pointers into symbols (i.e., the name of the module, the name of the function, and the offset within that function). 21 | 22 | ```cpp 23 | #include 24 | #include 25 | 26 | auto sampler = perf::Sampler{ }; 27 | sampler.trigger("cycles", perf::Precision::RequestZeroSkid, perf::Period{ 50000U }); 28 | sampler.values().instruction_pointer(true); 29 | 30 | sampler.start(); 31 | /// Run some code 32 | sampler.stop(); 33 | 34 | auto symbol_resolver = perf::SymbolResolver{}; 35 | 36 | for (const auto& sample : sampler.result()) { 37 | const auto instruction_pointer = sample.instruction_execution().logical_instruction_pointer(); 38 | if (instruction_pointer.has_value()) { 39 | 40 | /// Resolve the symbol. 41 | const auto symbol = symbol_resolver.resolve(instruction_pointer.value()); 42 | 43 | /// Translate the symbol into a string. 44 | const auto symbol_name = symbol.has_value() ? symbol->to_string() : std::string{"??"}; 45 | 46 | std::cout << "Instruction Pointer = 0x" << std::hex 47 | << instruction_pointer.value() << std::dec 48 | << " | Symbol = " << symbol_name 49 | << "\n"; 50 | } 51 | } 52 | ``` 53 | 54 | The output could look like the following: 55 | 56 | ```bash 57 | Instruction Pointer = 0x57459be95faf | Symbol = [instruction-pointer-sampling] _ZNK4perf7example15AccessBenchmarkixEm+47 58 | Instruction Pointer = 0x57459be95faf | Symbol = [instruction-pointer-sampling] _ZNK4perf7example15AccessBenchmarkixEm+47 59 | Instruction Pointer = 0x57459be987d0 | Symbol = [instruction-pointer-sampling] _ZNKSt6vectorIN4perf7example15AccessBenchmark10cache_lineESaIS3_EEixEm+0 60 | ``` 61 | 62 | **→ [See a practical example](../examples/sampling/instruction_pointer.cpp)** 63 | 64 | ### Translating Sampler Results into Flame Graphs 65 | 66 | #### Setting up the Sampler 67 | To generate flamegraphs, we need to include 68 | - the (logical) *instruction pointer* (to identify the leaf frame) 69 | - and the *callchain* (to reconstruct the stack) 70 | 71 | in the samples. 72 | For more condensed outputs, it is also recommended to include the *timestamp* and sort the results afterward. 73 | 74 | ```cpp 75 | #include 76 | 77 | auto sampler = perf::Sampler{ }; 78 | sampler.trigger("cycles"); 79 | sampler.values() 80 | .instruction_pointer(true) 81 | .callchain(true) 82 | .timestamp(true); 83 | ``` 84 | 85 | #### Generating Flamegraphs 86 | After sampling, the `perf::analyzer::FlameGraphGenerator` can map the samples into a format that can be read by commonly used flamegraph generators: 87 | 88 | ```cpp 89 | #include 90 | 91 | sampler.start(); 92 | /// Code to sample will be called here... 93 | sampler.stop(); 94 | 95 | /// Get all the recorded samples and sort for condensed outputs 96 | /// (sorting via `true` flag is optional). 97 | const auto samples = sampler.result(/*sort = */ true); 98 | 99 | /// Translate into a flame graph format and write the result to "flamegraphs.txt".
100 | auto flame_graph_generator = perf::analyzer::FlameGraphGenerator{}; 101 | flame_graph_generator.map(samples, "flamegraphs.txt"); 102 | ``` 103 | 104 | After writing the output, we can use that file as an input to flamegraph generators, for example: 105 | - [Brendan Gregg's FlameGraph](https://github.com/brendangregg/FlameGraph): Download the project and translate `flamegraphs.txt` into an SVG via `./flamegraph.pl flamegraphs.txt > flamegraphs.svg` 106 | - [flamegraph.com](https://flamegraph.com/): Upload the `flamegraphs.txt` 107 | - [Speedscope](https://www.speedscope.app/): Upload the `flamegraphs.txt` 108 | 109 | **→ [See full example](../examples/sampling/flame_graph.cpp)** -------------------------------------------------------------------------------- /examples/sampling/memory_address.cpp: -------------------------------------------------------------------------------- 1 | #include "../access_benchmark.h" 2 | #include "perfcpp/hardware_info.h" 3 | #include "perfcpp/sampler.h" 4 | #include 5 | 6 | int 7 | main() 8 | { 9 | std::cout << "libperf-cpp example: Record perf samples including time, " 10 | "logical memory address, latency, and data source for " 11 | "single-threaded random access to an in-memory array." 12 | << std::endl; 13 | 14 | /// Initialize sampler. 15 | auto sampler = perf::Sampler{}; 16 | 17 | /// Setup which counters trigger the writing of samples (depends on the underlying hardware substrate). 18 | if (perf::HardwareInfo::is_amd_ibs_supported()) { 19 | sampler.trigger("ibs_op_uops", perf::Precision::MustHaveZeroSkid, perf::Period{ 4000U }); 20 | } else if (perf::HardwareInfo::is_intel()) { 21 | sampler.trigger("mem-loads", perf::Precision::MustHaveZeroSkid, perf::Period{ 4000U }); 22 | } else { 23 | std::cout << "Error: Memory sampling is not supported on this CPU." << std::endl; 24 | return 1; 25 | } 26 | 27 | /// Setup which data will be included into samples (timestamp, virtual memory address, data source like L1d or RAM, 28 | /// and latency). 29 | sampler.values().timestamp(true).logical_memory_address(true).data_source(true).latency(true); 30 | 31 | /// Create random access benchmark. 32 | auto benchmark = perf::example::AccessBenchmark{ /*randomize the accesses*/ true, 33 | /* create benchmark of 1024 MB */ 1024U }; 34 | 35 | /// Start sampling. 36 | try { 37 | sampler.start(); 38 | } catch (std::runtime_error& exception) { 39 | std::cerr << exception.what() << std::endl; 40 | return 1; 41 | } 42 | 43 | /// Execute the benchmark (accessing cache lines in a random order). 44 | auto value = 0ULL; 45 | for (auto index = 0U; index < benchmark.size(); ++index) { 46 | value += benchmark[index].value; 47 | } 48 | 49 | /// We do not want the compiler to optimize away this (otherwise) unused value (and consequently the loop above). 50 | benchmark.pretend_to_use(value); 51 | 52 | /// Stop sampling. 53 | sampler.stop(); 54 | 55 | /// Get all the recorded samples. 56 | auto samples = sampler.result(); 57 | const auto count_samples_before_filter = samples.size(); 58 | 59 | /// Filter out samples without data source (AMD samples all instructions, not only data-related). 60 | samples.erase(std::remove_if(samples.begin(), 61 | samples.end(), 62 | [](const auto& sample) { 63 | return sample.count_loss().has_value() || !sample.data_access().source().has_value() || 64 | sample.data_access().logical_memory_address().value_or(0U) == 0U; 65 | }), 66 | samples.end()); 67 | 68 | /// Print the first samples. 
69 | const auto count_show_samples = std::min(samples.size(), 40U); 70 | std::cout << "\nRecorded " << count_samples_before_filter << " samples. " << samples.size() 71 | << " remaining after filter." << std::endl; 72 | std::cout << "Here are the first " << count_show_samples << " recorded samples:\n" << std::endl; 73 | for (auto index = 0U; index < count_show_samples; ++index) { 74 | const auto& sample = samples[index]; 75 | 76 | /// Since we recorded the time, period, the instruction pointer, and the CPU 77 | /// id, we can only read these values. 78 | auto data_source = "N/A"; 79 | if (sample.data_access().source()->is_l1_hit()) { 80 | data_source = "L1d"; 81 | } else if (sample.data_access().source()->is_mhb_hit().value_or(false)) { 82 | data_source = "LFB/MAB"; 83 | } else if (sample.data_access().source()->is_l2_hit()) { 84 | data_source = "L2"; 85 | } else if (sample.data_access().source()->is_l3_hit()) { 86 | data_source = "L3"; 87 | } else if (sample.data_access().source()->is_memory_hit()) { 88 | data_source = "RAM"; 89 | } 90 | 91 | auto instruction_latency = 0ULL; 92 | auto cache_latency = 0ULL; 93 | 94 | if (perf::HardwareInfo::is_intel()) { 95 | instruction_latency = sample.instruction_execution().latency().instruction_retirement().value_or(0U); 96 | cache_latency = sample.data_access().latency().cache_access().value_or(0U); 97 | } else if (perf::HardwareInfo::is_amd()) { 98 | instruction_latency = sample.instruction_execution().latency().uop_tag_to_retirement().value_or(0U); 99 | cache_latency = sample.data_access().latency().cache_miss().value_or(0U); 100 | } 101 | 102 | std::cout << "Time = " << sample.metadata().timestamp().value_or(0U) << " | Logical Mem Address = 0x" << std::hex 103 | << sample.data_access().logical_memory_address().value() << std::dec 104 | << " | Latency (cache, instruction) = " << cache_latency << ", " << instruction_latency 105 | << " | Is Load = " << sample.data_access().is_load() << " | Data Source = " << data_source << "\n"; 106 | } 107 | std::cout << std::flush; 108 | 109 | /// Close the sampler. 110 | /// Note that the sampler can only be closed after reading the samples. 111 | sampler.close(); 112 | 113 | return 0; 114 | } -------------------------------------------------------------------------------- /examples/statistics/multi_process.cpp: -------------------------------------------------------------------------------- 1 | #include "../access_benchmark.h" 2 | #include "perfcpp/event_counter.h" 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | int 9 | main() 10 | { 11 | std::cout << "libperf-cpp example: Record performance counter for " 12 | "random access to an in-memory array per process." 13 | << std::endl; 14 | std::cout << "We will record the counters per process and merge the results " 15 | "afterwards." 16 | << std::endl; 17 | 18 | /// Create random access benchmark. 19 | auto benchmark = perf::example::AccessBenchmark{ /*randomize the accesses*/ true, 20 | /* create benchmark of 1024 MB */ 1024U }; 21 | 22 | /// One event_counter instance for every thread. 23 | constexpr auto count_threads = 2U; 24 | const auto items_per_thread = benchmark.size() / count_threads; 25 | auto threads = std::vector{}; 26 | auto thread_local_results = 27 | std::vector(count_threads, 0U); /// Array to store the thread-local results. 28 | 29 | /// process ids to record performance counters. 30 | auto process_ids = std::vector{}; 31 | process_ids.resize(count_threads); 32 | 33 | /// Barrier for the threads to wait. 
34 | auto thread_barrier = std::atomic{ false }; 35 | 36 | /// Barrier for main thread to wait until threads have written their process ids. 37 | auto written_pid_counter = std::atomic{ 0U }; 38 | 39 | for (auto thread_index = 0U; thread_index < count_threads; ++thread_index) { 40 | threads.emplace_back([thread_index, 41 | items_per_thread, 42 | &thread_local_results, 43 | &benchmark, 44 | &process_ids, 45 | &thread_barrier, 46 | &written_pid_counter]() { 47 | auto local_value = 0ULL; 48 | 49 | /// Store the process id for creating performance counters on that thread. 50 | /// To the best of our knowledge, there is no other way to get the (linux) tid/pid of an std::thread :-(. 51 | process_ids[thread_index] = gettid(); 52 | 53 | /// Notify the main thread that the pid/tid was written by this thread. 54 | written_pid_counter.fetch_add(1U); 55 | 56 | /// Wait for the barrier to become "true". 57 | while (!thread_barrier) 58 | ; 59 | 60 | /// Process the data. 61 | for (auto index = 0U; index < items_per_thread; ++index) { 62 | local_value += benchmark[(thread_index * items_per_thread) + index].value; 63 | } 64 | 65 | thread_local_results[thread_index] = local_value; 66 | }); 67 | } 68 | 69 | /// Wait for all threads to write their pid/tid. 70 | while (written_pid_counter < count_threads) 71 | ; 72 | 73 | /// Create process ids to watch. 74 | std::cout << "Creating counters for Processes: "; 75 | for (const auto pid : process_ids) { 76 | std::cout << pid << " "; 77 | } 78 | std::cout << std::endl; 79 | 80 | /// Initialize performance counters. 81 | auto multi_cpu_event_counter = perf::MultiProcessEventCounter{ std::move(process_ids) }; 82 | 83 | /// Add all the performance counters we want to record. 84 | try { 85 | multi_cpu_event_counter.add({ "instructions", 86 | "cycles", 87 | "branches", 88 | "cache-misses", 89 | "dTLB-miss-ratio", 90 | "L1-data-miss-ratio", 91 | "cycles-per-instruction" }); 92 | } catch (std::runtime_error& e) { 93 | std::cerr << e.what() << std::endl; 94 | return 1; 95 | } 96 | 97 | /// Start recording performance counter. 98 | /// In contrast to the inherit-thread example (see inherit_thread.cpp), we 99 | /// will record the performance counters on each thread on every core. 100 | try { 101 | multi_cpu_event_counter.start(); 102 | } catch (std::runtime_error& exception) { 103 | std::cerr << exception.what() << std::endl; 104 | return 1; 105 | } 106 | 107 | /// Let threads start. 108 | thread_barrier = true; 109 | 110 | /// Wait for all threads to finish. 111 | for (auto& thread : threads) { 112 | thread.join(); 113 | } 114 | 115 | /// Stop performance counter recording. 116 | multi_cpu_event_counter.stop(); 117 | 118 | /// Add up the results so that the compiler does not get the idea of 119 | /// optimizing away the accesses. 120 | auto value = std::accumulate(thread_local_results.begin(), thread_local_results.end(), 0UL); 121 | 122 | /// We do not want the compiler to optimize away this (otherwise) unused value. 123 | benchmark.pretend_to_use(value); 124 | 125 | /// Get the result (normalized per cache line) from the 126 | /// multithread_event_counter. 127 | auto result = multi_cpu_event_counter.result(benchmark.size()); 128 | 129 | /// Print the performance counters. 
130 | std::cout << "\nResults:\n"; 131 | for (const auto& [counter_name, counter_value] : result) { 132 | std::cout << counter_value << " " << counter_name << " / cache line" << std::endl; 133 | } 134 | 135 | return 0; 136 | } 137 | -------------------------------------------------------------------------------- /src/requested_event.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | bool 4 | perf::RequestedEventSet::add(const perf::RequestedEvent& event) 5 | { 6 | /// If the event is not already added (in that case adjust_visibility_if_present() will return false), add it. 7 | /// If the event is already in the set, adjust_visibility_if_present() will adjust the visibility to true, if 8 | /// is_shown_in_results is true. 9 | if (!this->adjust_visibility_if_present(event.pmu_name(), event.event_name(), event.is_shown_in_results())) { 10 | this->_requested_events.push_back(event); 11 | return true; 12 | } 13 | 14 | return false; 15 | } 16 | 17 | bool 18 | perf::RequestedEventSet::adjust_visibility_if_present(const std::optional pmu_name, 19 | const std::string_view event_name, 20 | const bool is_shown_in_results) 21 | { 22 | auto iterator = std::find_if( 23 | this->_requested_events.begin(), this->_requested_events.end(), [pmu_name, event_name](const auto& event) { 24 | return event.pmu_name() == pmu_name && event.event_name() == event_name; 25 | }); 26 | 27 | /// If the event is not in the set, notify the caller that the event needs to be added. 28 | if (iterator == this->_requested_events.end()) { 29 | return false; 30 | } 31 | 32 | /// If the event should be included into the results, mark it accordingly. 33 | if (is_shown_in_results) { 34 | iterator->is_shown_in_results(true); 35 | } 36 | 37 | return true; 38 | } 39 | 40 | perf::CounterResult 41 | perf::RequestedEventSet::result(const perf::CounterDefinition& counter_definition, 42 | perf::CounterResult&& hardware_events_result, 43 | const std::uint64_t normalization) const 44 | { 45 | /// Combine all hardware events and metrics into a single result, showing only the requested events and metrics, in 46 | /// the requested order. Accordingly, we need to calculate the metrics first, using the given hardware events. 47 | /// However, since metrics can be referenced recursively (metric_a uses metric_b), we need to resolve the metrics in a 48 | /// specific order (metric_b before metric_a in this example). To do so, we calculate a metric dependency graph first 49 | /// and calculate metrics without dependencies, until all metrics are calculated. 50 | auto metric_graph = this->build_metric_graph(counter_definition); 51 | 52 | /// Check if the metric graph has a cycle. In that case, we cannot evaluate the metrics. 53 | if (metric_graph.is_cyclic()) { 54 | throw CannotEvaluateMetricsBecauseOfCycleError{}; 55 | } 56 | 57 | /// Walk through the metric graph, removing one metric without dependencies ata time. 58 | while (!metric_graph.empty()) { 59 | 60 | /// Get metric without un-calculated dependency. 61 | if (const auto metric_name = metric_graph.pop(); metric_name.has_value()) { 62 | if (auto metric = counter_definition.metric(metric_name.value()); metric.has_value()) { 63 | 64 | /// Calculate the metric. 65 | if (const auto calculated_metric_value = std::get<1>(metric.value()).calculate(hardware_events_result); 66 | calculated_metric_value.has_value()) { 67 | 68 | /// Add it to the results. 
69 | hardware_events_result.emplace_back(metric_name.value(), calculated_metric_value.value()); 70 | } 71 | } 72 | } 73 | } 74 | 75 | auto event_results = std::vector>{}; 76 | event_results.reserve(this->_requested_events.size()); 77 | 78 | /// Transform hardware events (now also containing metric results) into a set that is ordered as dictated by the 79 | /// requested events. 80 | for (const auto& requested_event : this->_requested_events) { 81 | if (requested_event.is_shown_in_results()) { 82 | if (const auto result = hardware_events_result.get(requested_event.event_name()); result.has_value()) { 83 | 84 | /// Normalize hardware and time events. 85 | if (requested_event.is_hardware_event() || requested_event.is_time_event()) { 86 | event_results.emplace_back(requested_event.event_name(), result.value() / static_cast<double>(normalization)); 87 | } 88 | 89 | /// Add metrics without normalization. 90 | else { 91 | event_results.emplace_back(requested_event.event_name(), result.value()); 92 | } 93 | } 94 | } 95 | } 96 | 97 | return CounterResult{ std::move(event_results) }; 98 | } 99 | 100 | perf::util::DirectedGraph 101 | perf::RequestedEventSet::build_metric_graph(const perf::CounterDefinition& counter_definition) const 102 | { 103 | auto metric_graph = util::DirectedGraph{}; 104 | for (const auto& requested_event : this->_requested_events) { 105 | if (const auto metric = counter_definition.metric(requested_event.event_name()); metric.has_value()) { 106 | 107 | /// Add the metric as a node to the graph. 108 | metric_graph.insert(requested_event.event_name()); 109 | 110 | /// Add an edge for every dependent metric: dependent_metric -> metric 111 | for (const auto& dependency : std::get<1>(metric.value()).required_counter_names()) { 112 | if (const auto dependent_metric = counter_definition.metric(dependency); dependent_metric.has_value()) { 113 | metric_graph.connect(std::get<0>(dependent_metric.value()), requested_event.event_name()); 114 | } 115 | } 116 | } 117 | } 118 | 119 | return metric_graph; 120 | } -------------------------------------------------------------------------------- /examples/sampling/branch.cpp: -------------------------------------------------------------------------------- 1 | #include "../access_benchmark.h" 2 | #include "perfcpp/sampler.h" 3 | #include 4 | 5 | /** 6 | * A function with multiple branches that are hard for the compiler to optimize, 7 | * used to demonstrate branch sampling. 8 | * 9 | * @param cache_line Cache line to use as an input. 10 | * @return Another value through a handful of branches. 11 | */ 12 | [[nodiscard]] std::uint64_t 13 | branchy_function(const perf::example::AccessBenchmark::cache_line& cache_line); 14 | 15 | int 16 | main() 17 | { 18 | 19 | std::cout << "libperf-cpp example: Record perf branch samples for " 20 | "single-threaded sequential access to an in-memory array." 21 | << std::endl; 22 | 23 | /// Initialize sampler. 24 | auto sampler = perf::Sampler{}; 25 | 26 | /// Setup which counters trigger the writing of samples. 27 | sampler.trigger("cycles", perf::Precision::AllowArbitrarySkid, perf::Period{ 1000000U }); 28 | 29 | /// Setup which data will be included into samples (timestamp and stack of branches). 30 | sampler.values().timestamp(true).branch_stack( 31 | { perf::BranchType::User, perf::BranchType::Conditional }) /// Only sample conditional branches in user-mode. 32 | ; 33 | 34 | /// Create sequential access benchmark.
35 | auto benchmark = perf::example::AccessBenchmark{ /*sequential accesses*/ false, 36 | /* create benchmark of 512 MB */ 512U }; 37 | 38 | /// Start sampling. 39 | try { 40 | sampler.start(); 41 | } catch (std::runtime_error& exception) { 42 | std::cerr << exception.what() << std::endl; 43 | return 1; 44 | } 45 | 46 | /// Execute the benchmark (accessing cache lines in sequential order). 47 | auto value = 0ULL; 48 | for (auto index = 0U; index < benchmark.size(); ++index) { 49 | value += branchy_function(benchmark[index]); 50 | } 51 | 52 | /// We do not want the compiler to optimize away this (otherwise) unused value (and consequently the loop above). 53 | benchmark.pretend_to_use(value); 54 | 55 | /// Stop sampling. 56 | sampler.stop(); 57 | 58 | /// Get all the recorded samples. 59 | const auto samples = sampler.result(); 60 | 61 | /// Print the first samples. 62 | const auto count_show_samples = std::min(samples.size(), 10U); 63 | std::cout << "\nRecorded " << samples.size() << " samples." << std::endl; 64 | std::cout << "Here are the first " << count_show_samples << " recorded samples:\n" << std::endl; 65 | 66 | for (auto index = 0U; index < count_show_samples; ++index) { 67 | const auto& sample = samples[index]; 68 | 69 | /// Since we recorded the timestamp and the branch stack, we can 70 | /// read only these values. 71 | if (sample.metadata().timestamp().has_value() && sample.branch_stack().has_value()) { 72 | std::cout << "Time = " << sample.metadata().timestamp().value() << "\n"; 73 | for (const auto& branch : sample.branch_stack().value()) { 74 | std::cout << "\tpredicted correct = " << branch.is_predicted() << " | from instruction 0x" << std::hex 75 | << branch.instruction_pointer_from() << std::dec << " | to instruction 0x" << std::hex 76 | << branch.instruction_pointer_to() << std::dec; 77 | if (branch.cycles().has_value()) { 78 | std::cout << " | cycles = " << branch.cycles().value(); 79 | } 80 | 81 | std::cout << "\n"; 82 | } 83 | } 84 | } 85 | std::cout << std::flush; 86 | 87 | /// Close the sampler. 88 | /// Note that the sampler can only be closed after reading the samples.
89 | sampler.close(); 90 | 91 | return 0; 92 | } 93 | 94 | std::uint64_t 95 | branchy_function(const perf::example::AccessBenchmark::cache_line& cache_line) 96 | { 97 | auto result = cache_line.value; 98 | 99 | for (auto i = 0U; i < 10U; ++i) { 100 | switch ((cache_line.value >> (4U * i)) & 0xF) { // Extract 4 bits at a time 101 | case 0ULL: 102 | result += cache_line.value * (i + 1U); 103 | break; 104 | case 1ULL: 105 | result -= cache_line.value / (i + 2U); 106 | break; 107 | case 2ULL: 108 | result *= cache_line.value + (i * 3U); 109 | break; 110 | case 3ULL: 111 | result /= (cache_line.value - i) | 1U; 112 | break; // Avoid division by zero 113 | case 4ULL: 114 | result ^= cache_line.value << i; 115 | break; 116 | case 5ULL: 117 | result %= (cache_line.value >> i) | 1U; 118 | break; 119 | case 6ULL: 120 | result = ~result; 121 | break; 122 | case 7ULL: 123 | result &= cache_line.value | (std::uint64_t(0xFF) << (i * 8U)); 124 | break; 125 | case 8ULL: 126 | result |= cache_line.value & (std::uint64_t(0xFFFF) << (i * 16U)); 127 | break; 128 | case 9ULL: 129 | result >>= cache_line.value % (i + 1); 130 | break; 131 | case 10ULL: 132 | result <<= cache_line.value % (i + 2); 133 | break; 134 | case 11ULL: 135 | result += cache_line.value + i * 7; 136 | break; 137 | case 12ULL: 138 | result -= cache_line.value - i * 11; 139 | break; 140 | case 13ULL: 141 | result *= cache_line.value * (i + 5); 142 | break; 143 | case 14ULL: 144 | result /= (cache_line.value / (i + 3)) | 1; 145 | break; 146 | case 15ULL: 147 | result ^= cache_line.value ^ (i * 13); 148 | break; 149 | default: 150 | result = cache_line.value; 151 | } 152 | } 153 | return result; 154 | } -------------------------------------------------------------------------------- /docs/analyzing-memory-access-patterns.md: -------------------------------------------------------------------------------- 1 | # Analyzing Memory Access Patterns of Data Structures 2 | 3 | Modern applications often contain multiple instances of complex data structures, making it challenging to analyze their memory access patterns. 4 | While tools like Linux Perf and Intel VTune excel at identifying resource-intensive instructions, they cannot differentiate between different instances of the same data structure sharing identical code - for example, different nodes within a tree structure experiencing varying access patterns. 5 | 6 | *perf-cpp* addresses this limitation through its **Memory Access Analyzer** component, which works in conjunction with memory-based sampling ([detailed in the sampling documentation](sampling.md)). 7 | The Memory Access Analyzer helps identify which specific memory addresses experience high access latency by: 8 | 9 | * Mapping samples to individual data object instances 10 | * Generating detailed access statistics including cache hits/misses, TLB performance, and average latency metrics 11 | 12 | → [For a practical implementation, check out our random-access-benchmark example.](../examples/sampling/memory_access_analyzer.cpp) 13 | 14 | --- 15 | ## Table of Contents 16 | - [Describing Data Types](#step-1-describing-data-types) 17 | - [Registering Data Type Instances](#step-2-registering-data-type-instances) 18 | - [Mapping Samples to Data Type Instances](#step-3-mapping-samples-to-data-type-instances) 19 | - [Processing the Result](#step-4-processing-the-result) 20 | --- 21 | 22 | ## Step 1: Describing Data Types 23 | The **Memory Access Analyzer** requires information about the structure of your data types. 
24 | Let's walk through an example using a binary tree node: 25 | ```cpp 26 | class BinaryTreeNode { 27 | std::uint64_t value; 28 | BinaryTreeNode* left_child; 29 | BinaryTreeNode* right_child; 30 | }; 31 | ``` 32 | 33 | To analyze this structure, create a `perf::analyzer::DataType` definition: 34 | 35 | ```cpp 36 | #include 37 | 38 | auto binary_tree_node = perf::analyzer::DataType{"BinaryTreeNode", sizeof(BinaryTreeNode)}; 39 | binary_tree_node.add("value", sizeof(std::uint64_t)); /// Describe the "value" attribute. 40 | binary_tree_node.add("left_child", sizeof(BinaryTreeNode*)); /// Describe the "left_child" attribute. 41 | binary_tree_node.add("right_child", sizeof(BinaryTreeNode*)); /// Describe the "right_child" attribute. 42 | ``` 43 | 44 | > [!TIP] 45 | > For accurate size and offset information, you can use [**pahole**](https://linux.die.net/man/1/pahole). See [Paramoud Kumbhar's detailed guide](https://pramodkumbhar.com/2023/11/pahole-to-analyz-data-structure-memory-layouts-with-ease/) for usage instructions. 46 | 47 | ## Step 2: Registering Data Type Instances 48 | Since each instance of a data structure may exhibit different access patterns, the Memory Access Analyzer needs to track individual instances. 49 | Here's how to register them: 50 | 51 | ```cpp 52 | #include 53 | auto memory_access_analyzer = perf::analyzer::MemoryAccess{}; 54 | 55 | /// Expose the data type to the Analyzer. 56 | memory_access_analyzer.add(std::move(binary_tree_node)); 57 | 58 | /// Expose memory addresses to the Analyzer. 59 | for (auto* node : tree->nodes()) { 60 | /// The first argument is the name describing the data type. 61 | /// The second argument is a pointer to the instance. 62 | memory_access_analyzer.annotate("BinaryTreeNode", node); 63 | } 64 | ``` 65 | 66 | ## Step 3: Mapping Samples to Data Type Instances 67 | To collect memory access data, use *perf-cpp*'s [sampling mechanism](sampling.md) with the following key requirements: 68 | * Include logical memory addresses 69 | * Capture data source information 70 | * Record latency data ("weight") 71 | * Use a memory-address-capable sample trigger (e.g., `mem-loads` on Intel, `ibs_op` on AMD – see the [documentation](sampling.md#specific-notes-for-different-cpu-vendors)) 72 | 73 | ```cpp 74 | #include 75 | #include 76 | 77 | auto sampler = perf::Sampler{}; 78 | 79 | /// Set trigger that enables memory sampling. 80 | sampler.trigger("mem-loads", perf::Precision::MustHaveZeroSkid, perf::Period{ 1000U }); 81 | 82 | /// Include addresses, data source, and latency. 83 | sampler.values() 84 | .logical_memory_address(true) 85 | .data_src(true) 86 | .weight_struct(true); 87 | 88 | /// Run the workload while recording samples. 89 | sampler.start(); 90 | ///... execute .... 91 | sampler.stop(); 92 | 93 | /// Get the samples and map to described and registered data types and instances. 94 | const auto samples = sampler.result(); 95 | const auto result = memory_access_analyzer.map(samples); 96 | ``` 97 | 98 | ## Step 4: Processing the Result 99 | The analyzer generates detailed statistics for each data type attribute. 
100 | To view the results: 101 | ```cpp 102 | std::cout << result.to_string() << std::endl; 103 | ``` 104 | 105 | Example output: 106 | 107 | ```bash 108 | DataType BinaryTreeNode (24B) { 109 | | loads | cache hits | RAM hits | TLB | stores 110 | samples | count latency | L1d LFB L2 L3 | local remote | L1 hits L2 hits misses | count latency 111 | 0: value (8B) 373 | 373 439 | 154 0 0 7 | 212 0 | 190 5 178 | 0 0 112 | 8: left_child (8B) 146 | 146 720 | 1 0 0 5 | 140 0 | 12 18 116 | 0 0 113 | 16: right_child (8B) 528 | 528 173 | 393 0 1 14 | 120 0 | 415 4 109 | 0 0 114 | } 115 | ``` 116 | 117 | The output shows: 118 | * Attribute details (offset, name, size) 119 | * Sample counts 120 | * Detailed performance metrics per attribute 121 | 122 | For further analysis, export the results in structured formats: 123 | 124 | ```cpp 125 | result.to_json(); /// JSON format 126 | result.to_csv(); /// CSV format 127 | ``` -------------------------------------------------------------------------------- /test/requested_event.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | TEST_CASE("empty RequestedEventSet", "[RequestedEventSet]") 7 | { 8 | auto event_set = perf::RequestedEventSet{}; 9 | auto counter_definition = perf::CounterDefinition{}; 10 | 11 | SECTION("newly created event set is empty") 12 | { 13 | REQUIRE(event_set.empty()); 14 | REQUIRE(event_set.size() == 0U); 15 | REQUIRE(event_set.begin() == event_set.end()); 16 | } 17 | 18 | SECTION("event set with reserved capacity is still empty") 19 | { 20 | auto event_set_with_capacity = perf::RequestedEventSet{ 10U }; 21 | 22 | REQUIRE(event_set_with_capacity.empty()); 23 | REQUIRE(event_set_with_capacity.size() == 0U); 24 | REQUIRE(event_set_with_capacity.begin() == event_set_with_capacity.end()); 25 | } 26 | 27 | SECTION("result with empty hardware_events_result should be empty") 28 | { 29 | auto empty_hardware_result = perf::CounterResult{}; 30 | 31 | auto result = event_set.result(counter_definition, std::move(empty_hardware_result), 1U); 32 | 33 | REQUIRE(result.begin() == result.end()); 34 | REQUIRE_FALSE(result.get("instructions").has_value()); 35 | REQUIRE_FALSE(result.get("cycles").has_value()); 36 | } 37 | 38 | SECTION("result with non-empty hardware_events_result but no requested events should be empty") 39 | { 40 | auto hardware_result = 41 | perf::CounterResult{ std::vector>{ std::make_pair("instructions", 1000.0) } }; 42 | 43 | auto result = event_set.result(counter_definition, std::move(hardware_result), 1U); 44 | 45 | REQUIRE(result.begin() == result.end()); 46 | REQUIRE_FALSE(result.get("instructions").has_value()); 47 | REQUIRE_FALSE(result.get("cycles").has_value()); 48 | } 49 | 50 | SECTION("result with reserved capacity - empty hardware_events_result should be empty") 51 | { 52 | auto event_set_with_capacity = perf::RequestedEventSet{ 10U }; 53 | auto empty_hardware_result = perf::CounterResult{}; 54 | 55 | auto result = event_set_with_capacity.result(counter_definition, std::move(empty_hardware_result), 1U); 56 | 57 | REQUIRE(result.begin() == result.end()); 58 | REQUIRE_FALSE(result.get("instructions").has_value()); 59 | REQUIRE_FALSE(result.get("cycles").has_value()); 60 | } 61 | 62 | SECTION("result with reserved capacity - non-empty hardware_events_result but no requested events should be empty") 63 | { 64 | auto event_set_with_capacity = perf::RequestedEventSet{ 10U }; 65 | auto hardware_result = 66 | perf::CounterResult{ 
std::vector>{ std::make_pair("instructions", 1000.0) } }; 67 | 68 | auto result = event_set_with_capacity.result(counter_definition, std::move(hardware_result), 1U); 69 | 70 | REQUIRE(result.begin() == result.end()); 71 | REQUIRE_FALSE(result.get("instructions").has_value()); 72 | REQUIRE_FALSE(result.get("cycles").has_value()); 73 | } 74 | } 75 | 76 | TEST_CASE("RequestedEventSet with requested events", "[RequestedEventSet]") 77 | { 78 | auto counter_definition = perf::CounterDefinition{}; 79 | 80 | SECTION("result contains requested event when present in hardware result") 81 | { 82 | auto event_set = perf::RequestedEventSet{}; 83 | 84 | /// Add the "instructions" event as a hardware event 85 | REQUIRE( 86 | event_set.add(perf::RequestedEvent{ "cpu", "instructions", true, perf::RequestedEvent::Type::HardwareEvent })); 87 | 88 | /// Verify the event set is no longer empty 89 | REQUIRE_FALSE(event_set.empty()); 90 | REQUIRE(event_set.size() == 1U); 91 | 92 | /// Create hardware result containing instructions 93 | auto hardware_result = 94 | perf::CounterResult{ std::vector>{ std::make_pair("instructions", 1000.0) } }; 95 | 96 | auto result = event_set.result(counter_definition, std::move(hardware_result), 1U); 97 | 98 | /// The result should contain the requested instructions event 99 | REQUIRE(result.get("instructions").has_value()); 100 | REQUIRE(result.get("instructions").value() == 1000.0); 101 | 102 | /// But should not contain unrequested events 103 | REQUIRE_FALSE(result.get("cycles").has_value()); 104 | 105 | /// Verify iterator access 106 | REQUIRE(result.begin() != result.end()); 107 | auto it = result.begin(); 108 | REQUIRE(it->first == "instructions"); 109 | REQUIRE(it->second == 1000.0); 110 | ++it; 111 | REQUIRE(it == result.end()); 112 | } 113 | 114 | SECTION("result is empty when requested event not in hardware result") 115 | { 116 | auto event_set = perf::RequestedEventSet{}; 117 | 118 | /// Add the "instructions" event as a hardware event 119 | REQUIRE( 120 | event_set.add(perf::RequestedEvent{ "cpu", "instructions", true, perf::RequestedEvent::Type::HardwareEvent })); 121 | 122 | /// Create hardware result NOT containing instructions 123 | auto hardware_result = 124 | perf::CounterResult{ std::vector>{ std::make_pair("cycles", 2000.0) } }; 125 | 126 | auto result = event_set.result(counter_definition, std::move(hardware_result), 1U); 127 | 128 | /// The result should be empty since requested event is not in hardware result 129 | REQUIRE(result.begin() == result.end()); 130 | REQUIRE_FALSE(result.get("instructions").has_value()); 131 | REQUIRE_FALSE(result.get("cycles").has_value()); 132 | } 133 | 134 | SECTION("result respects normalization for hardware events") 135 | { 136 | auto event_set = perf::RequestedEventSet{}; 137 | 138 | /// Add the "instructions" event as a hardware event 139 | REQUIRE( 140 | event_set.add(perf::RequestedEvent{ "cpu", "instructions", true, perf::RequestedEvent::Type::HardwareEvent })); 141 | 142 | /// Create hardware result containing instructions 143 | auto hardware_result = 144 | perf::CounterResult{ std::vector>{ std::make_pair("instructions", 1000.0) } }; 145 | 146 | auto result = event_set.result(counter_definition, std::move(hardware_result), 10U); 147 | 148 | /// The result should be normalized (1000.0 / 10 = 100.0) 149 | REQUIRE(result.get("instructions").has_value()); 150 | REQUIRE(result.get("instructions").value() == 100.0); 151 | } 152 | } --------------------------------------------------------------------------------