├── examples ├── .gitignore ├── counter_definition.cpp ├── statistics │ ├── single_thread.cpp │ ├── live_events.cpp │ ├── inherit_thread.cpp │ ├── multi_thread.cpp │ ├── metric.cpp │ ├── multi_cpu.cpp │ └── multi_process.cpp ├── sampling │ ├── flame_graph.cpp │ ├── perf_record.cpp │ ├── context_switch.cpp │ ├── memory_access_analyzer.cpp │ ├── instruction_pointer.cpp │ ├── register.cpp │ ├── counter.cpp │ ├── multi_thread.cpp │ ├── multi_cpu.cpp │ ├── multi_event.cpp │ ├── memory_address.cpp │ └── branch.cpp ├── README.md ├── access_benchmark.h ├── access_benchmark.cpp └── CMakeLists.txt ├── test ├── events.csv ├── events-and-metrics.csv ├── hardware_info.cpp ├── CMakeLists.txt ├── access_benchmark.h ├── access_benchmark.cpp ├── counter_definition.cpp └── requested_event.cpp ├── .gitignore ├── include └── perfcpp │ ├── throttle.h │ ├── cgroup.h │ ├── period.h │ ├── precision.h │ ├── metric │ └── expression │ │ ├── function.h │ │ ├── tokenizer.h │ │ ├── parser.h │ │ └── token.h │ ├── context_switch.h │ ├── time_event.h │ ├── counter_result.h │ ├── util │ ├── unique_file_descriptor.h │ ├── table.h │ └── graph.h │ ├── feature.h │ ├── analyzer │ ├── flame_graph_generator.h │ └── data_type.h │ ├── metadata.h │ ├── branch.h │ ├── mmap_buffer.h │ └── hardware_info.h ├── src ├── config.cpp ├── metric │ └── expression │ │ ├── token.cpp │ │ ├── function.cpp │ │ └── expression.cpp ├── exception.cpp ├── counter_result.cpp └── requested_event.cpp ├── docs ├── README.md ├── perf-paranoid.md ├── build.md ├── recording-live-events.md ├── sampling-symbols-and-flamegraphs.md └── analyzing-memory-access-patterns.md ├── events └── x86 │ ├── intel │ ├── clearwater-forest.csv │ └── panther-lake.csv │ └── micro-architecture-register.csv └── script └── create_perf_list.py /examples/.gitignore: -------------------------------------------------------------------------------- 1 | bin/ -------------------------------------------------------------------------------- /test/events.csv: -------------------------------------------------------------------------------- 1 | #name, config, config1, type 2 | EVENT.TEST0,0x1f3010e 3 | event-test-1,0x1CD,3 4 | -------------------------------------------------------------------------------- /test/events-and-metrics.csv: -------------------------------------------------------------------------------- 1 | EVENT.TEST0,0x1f3010e 2 | event-test-1,0x1CD,3 3 | test-metric,3*'event-test-1' -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | Makefile 2 | libperf-cpp.a 3 | CMakeFiles 4 | cmake-build-debug 5 | .cmake 6 | CMakeCache.txt 7 | cmake_install.cmake 8 | build/ 9 | tests 10 | src/processor_specific_event_provider.cpp -------------------------------------------------------------------------------- /test/hardware_info.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | TEST_CASE("number of performance counters", "[HardwareInfo]") 5 | { 6 | REQUIRE(perf::HardwareInfo::physical_performance_counters_per_logical_core() > 1U); 7 | } 8 | 9 | TEST_CASE("number of events per performance counter", "[HardwareInfo]") 10 | { 11 | REQUIRE(perf::HardwareInfo::events_per_physical_performance_counter() > 1U); 12 | } -------------------------------------------------------------------------------- /include/perfcpp/throttle.h: -------------------------------------------------------------------------------- 1 | 
#pragma once 2 | 3 | namespace perf { 4 | class Throttle 5 | { 6 | public: 7 | explicit Throttle(const bool is_throttle) noexcept 8 | : _is_throttle(is_throttle) 9 | { 10 | } 11 | ~Throttle() noexcept = default; 12 | 13 | /** 14 | * @return True, if the event was a throttle event. 15 | */ 16 | [[nodiscard]] bool is_throttle() const noexcept { return _is_throttle; } 17 | 18 | /** 19 | * @return True, if the event was an unthrottle event. 20 | */ 21 | [[nodiscard]] bool is_unthrottle() const noexcept { return !_is_throttle; } 22 | 23 | private: 24 | bool _is_throttle; 25 | }; 26 | } -------------------------------------------------------------------------------- /include/perfcpp/cgroup.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | 6 | namespace perf { 7 | class CGroup 8 | { 9 | public: 10 | CGroup(const std::uint64_t id, std::string&& path) noexcept 11 | : _id(id) 12 | , _path(std::move(path)) 13 | { 14 | } 15 | ~CGroup() = default; 16 | 17 | /** 18 | * @return Id of the CGgroup (as found in samples). 19 | */ 20 | [[nodiscard]] std::uint64_t id() const noexcept { return _id; } 21 | 22 | /** 23 | * @return Path of the CGroup. 24 | */ 25 | [[nodiscard]] const std::string& path() const noexcept { return _path; } 26 | 27 | private: 28 | std::uint64_t _id; 29 | std::string _path; 30 | }; 31 | } -------------------------------------------------------------------------------- /include/perfcpp/period.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | 5 | namespace perf { 6 | class Period 7 | { 8 | public: 9 | explicit Period(const std::uint64_t period) noexcept 10 | : _period(period) 11 | { 12 | } 13 | ~Period() noexcept = default; 14 | 15 | [[nodiscard]] std::uint64_t get() const noexcept { return _period; } 16 | 17 | private: 18 | std::uint64_t _period; 19 | }; 20 | 21 | class Frequency 22 | { 23 | public: 24 | explicit Frequency(const std::uint64_t frequency) noexcept 25 | : _frequency(frequency) 26 | { 27 | } 28 | ~Frequency() noexcept = default; 29 | 30 | [[nodiscard]] std::uint64_t get() const noexcept { return _frequency; } 31 | 32 | private: 33 | std::uint64_t _frequency; 34 | }; 35 | 36 | using PeriodOrFrequency = std::variant; 37 | } -------------------------------------------------------------------------------- /test/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | Include(FetchContent) 2 | FetchContent_Declare( 3 | Catch2 4 | GIT_REPOSITORY https://github.com/catchorg/Catch2.git 5 | GIT_TAG v3.8.1 # or a later release 6 | ) 7 | FetchContent_MakeAvailable(Catch2) 8 | 9 | set(PERF_CPP_TEST 10 | test/access_benchmark.cpp 11 | test/counter_definition.cpp 12 | test/metric.cpp 13 | test/event_counter.cpp 14 | test/sampler.cpp 15 | test/hardware_info.cpp 16 | test/requested_event.cpp 17 | ) 18 | 19 | add_executable(tests ${PERF_CPP_TEST}) 20 | target_link_libraries(tests PRIVATE Catch2::Catch2WithMain perf-cpp) 21 | target_include_directories(tests PRIVATE Catch2::Catch2WithMain test) 22 | set_target_properties(tests 23 | PROPERTIES 24 | RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/bin" 25 | ) -------------------------------------------------------------------------------- /src/config.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | perf::Process perf::Process::Any = perf::Process{ -1 }; 5 | perf::Process 
perf::Process::Calling = perf::Process{ 0 }; 6 | perf::CpuCore perf::CpuCore::Any = perf::CpuCore{ -1 }; 7 | 8 | perf::Config::Config() noexcept 9 | { 10 | /// Try to read the number of physical performance counters from the hardware (either from cpuid or by trying). 11 | if (const auto physical_performance_counters = HardwareInfo::physical_performance_counters_per_logical_core(); 12 | physical_performance_counters > 0U) { 13 | /// If that worked, also read the number of events per physical performance counter. 14 | this->_num_physical_counters = physical_performance_counters; 15 | this->_num_events_per_physical_counter = HardwareInfo::events_per_physical_performance_counter(); 16 | } 17 | } -------------------------------------------------------------------------------- /examples/counter_definition.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | int 6 | main() 7 | { 8 | std::cout 9 | << "libperf-cpp example: This example prints all automatically read events stored in the perf::CounterDefinition.\n" 10 | << std::endl; 11 | 12 | std::cout << "Scanning the underlying hardware for hardware counters..." << std::endl; 13 | std::cout << "Physical Hardware Counters = " 14 | << std::uint16_t(perf::HardwareInfo::physical_performance_counters_per_logical_core()) << "\n"; 15 | std::cout << "Events per Hardware Counter = " 16 | << std::uint16_t(perf::HardwareInfo::events_per_physical_performance_counter()) << "\n" 17 | << std::endl; 18 | 19 | /// Create custom instance of the counter definition. 20 | const auto counter_definition = perf::CounterDefinition{}; 21 | 22 | /// Dump to the console without adding further events. 23 | std::cout << "Detected events:\n" << counter_definition.to_string() << std::endl; 24 | 25 | return 0; 26 | } -------------------------------------------------------------------------------- /src/metric/expression/token.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | std::string 4 | perf::metric::expression::Token::TokenToStringVisitor::operator()( 5 | const perf::metric::expression::Operator_ metric_operator) const 6 | { 7 | switch (metric_operator) { 8 | case Operator_::Plus: 9 | return "+"; 10 | case Operator_::Minus: 11 | return "-"; 12 | case Operator_::Times: 13 | return "*"; 14 | case Operator_::Divide: 15 | return "/"; 16 | default: 17 | return ""; 18 | } 19 | } 20 | 21 | std::string 22 | perf::metric::expression::Token::TokenToStringVisitor::operator()( 23 | const perf::metric::expression::Token::Punctuation punctutation) const 24 | { 25 | switch (punctutation) { 26 | case Token::Punctuation::LeftParentheses: 27 | return "("; 28 | case Token::Punctuation::RightParentheses: 29 | return ")"; 30 | case Token::Punctuation::Comma: 31 | return ","; 32 | default: 33 | return ""; 34 | } 35 | } 36 | 37 | std::string 38 | perf::metric::expression::Token::to_string() const 39 | { 40 | return std::visit(TokenToStringVisitor{}, this->_token); 41 | } -------------------------------------------------------------------------------- /docs/README.md: -------------------------------------------------------------------------------- 1 | # Documentation 2 | 3 | Welcome to the comprehensive documentation for the *perf-cpp* library. 4 | This guide is designed to assist you with everything from initial setup to advanced functionalities. 5 | Explore the sections below to gain insights and instructions tailored to your needs. 
6 | 7 | --- 8 | - [Building and Including the *perf-cpp* Library](build.md) 9 | - **Counting Performance Events** 10 | - [Basics of Recording Performance Events](recording.md) 11 | - [Multi-threading and Multi-CPU Event Recording](recording-parallel.md) 12 | - [Access Statistics without Stopping the Counter](recording-live-events.md) 13 | - [Defining and Using Metrics](metrics.md) 14 | - **Sampling Techniques** 15 | - [Basics of Event Sampling](sampling.md) 16 | - [Multi-threading and Multi-CPU Event Sampling](sampling-parallel.md) 17 | - [Use the Linux Perf Tool to Analyze Recorded Samples](analyzing-samples-with-perf-report.md) 18 | - [Symbols and Flamegraphs](sampling-symbols-and-flamegraphs.md) 19 | - [Analyzing Memory Access Patterns using Sampling](analyzing-memory-access-patterns.md) 20 | - [Built-in and Hardware-specific Performance Events](counters.md) 21 | - [Understanding the Perf Paranoid Value](perf-paranoid.md) -------------------------------------------------------------------------------- /include/perfcpp/precision.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | 6 | namespace perf { 7 | /** 8 | * The precision controls the skid, which refers to the amount of 9 | * instructions between the event and the kernel records the sample. 10 | * 11 | * For more information see "precise_ip" on https://man7.org/linux/man-pages/man2/perf_event_open.2.html. 12 | */ 13 | enum Precision : std::uint8_t 14 | { 15 | /// The recorded instruction pointer may land anywhere within a broad, implementation-defined window around the real 16 | /// instruction. 17 | AllowArbitrarySkid = 0U, 18 | 19 | /// The recorded instruction pointer must have a constant, repeatable skid offset (still non-zero) so the displacement 20 | /// is predictable even if not exact. 21 | MustHaveConstantSkid = 1U, 22 | 23 | /// Request–but do not insist on—zero skid, asking the PMU for exact instruction pointer attribution while allowing 24 | /// fallback on CPUs that cannot guarantee it. 25 | RequestZeroSkid = 2U, 26 | 27 | /// Require zero skid: the sample instruction pointer must be the exact triggering instruction; if the hardware cannot 28 | /// provide this, perf-cpp will lower the precision. 
29 | MustHaveZeroSkid = 3U, 30 | }; 31 | } -------------------------------------------------------------------------------- /events/x86/intel/clearwater-forest.csv: -------------------------------------------------------------------------------- 1 | br_inst_retired.all_branches, 0xc4 2 | br_misp_retired.all_branches, 0xc5 3 | cpu_clk_unhalted.core_p, 0x3c 4 | cpu_clk_unhalted.ref_tsc_p, 0x13c 5 | cpu_clk_unhalted.thread_p, 0x3c 6 | dtlb_load_misses.walk_completed, 0xe08 7 | dtlb_store_misses.walk_completed, 0xe49 8 | icache.accesses, 0x380 9 | icache.misses, 0x280 10 | inst_retired.any_p, 0xc0 11 | itlb_misses.walk_completed, 0xe85 12 | longest_lat_cache.miss, 0x412e 13 | longest_lat_cache.reference, 0x4f2e 14 | mem_uops_retired.all_loads, 0x81d0 15 | mem_uops_retired.all_stores, 0x82d0 16 | mem_uops_retired.load_latency_gt_1024, 0x5d0, 0x400 17 | mem_uops_retired.load_latency_gt_128, 0x5d0, 0x80 18 | mem_uops_retired.load_latency_gt_16, 0x5d0, 0x10 19 | mem_uops_retired.load_latency_gt_2048, 0x5d0, 0x800 20 | mem_uops_retired.load_latency_gt_256, 0x5d0, 0x100 21 | mem_uops_retired.load_latency_gt_32, 0x5d0, 0x20 22 | mem_uops_retired.load_latency_gt_4, 0x5d0, 0x4 23 | mem_uops_retired.load_latency_gt_512, 0x5d0, 0x200 24 | mem_uops_retired.load_latency_gt_64, 0x5d0, 0x40 25 | mem_uops_retired.load_latency_gt_8, 0x5d0, 0x8 26 | mem_uops_retired.store_latency, 0x6d0 27 | ocr.demand_data_rd.any_response, 0x1b7, 0x10001 28 | ocr.demand_data_rd.l3_miss, 0x1b7, 0x33fbfc00001 29 | ocr.demand_rfo.any_response, 0x1b7, 0x10002 30 | ocr.demand_rfo.l3_miss, 0x1b7, 0x33fbfc00002 31 | topdown_be_bound.all, 0x2a4 32 | topdown_be_bound.all_p, 0x2a4 33 | -------------------------------------------------------------------------------- /src/metric/expression/function.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | std::optional 5 | perf::metric::expression::DRatioFunction::evaluate(const std::optional left, 6 | const std::optional right) const 7 | { 8 | if (left.has_value() && right.has_value() && right.value() != .0) { 9 | return left.value() / right.value(); 10 | } 11 | 12 | /// If one of the operands cannot be evaluated OR the right operand is zero, we cannot calculate the ratio. 13 | return std::nullopt; 14 | } 15 | 16 | std::optional 17 | perf::metric::expression::SumFunction::evaluate(const perf::CounterResult& result) const 18 | { 19 | auto sum = .0; 20 | 21 | for (const auto& argument : this->_arguments) { 22 | /// Evaluate the argument. 23 | const auto evaluated_argument = argument->evaluate(result); 24 | 25 | /// If the argument cannot be evaluated, the function fails. 26 | if (!evaluated_argument.has_value()) { 27 | return std::nullopt; 28 | } 29 | 30 | /// Accumulate. 
31 | sum += evaluated_argument.value(); 32 | } 33 | 34 | return sum; 35 | } 36 | 37 | void 38 | perf::metric::expression::SumFunction::add_required_hardware_counter( 39 | std::vector& hardware_counter_names) const 40 | { 41 | for (const auto& argument : this->_arguments) { 42 | argument->add_required_hardware_counter(hardware_counter_names); 43 | } 44 | } -------------------------------------------------------------------------------- /src/metric/expression/expression.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | std::optional 4 | perf::metric::expression::AdditionExpression::evaluate(const std::optional left, 5 | const std::optional right) const 6 | { 7 | if (left.has_value() && right.has_value()) { 8 | return left.value() + right.value(); 9 | } 10 | 11 | return std::nullopt; 12 | } 13 | 14 | std::optional 15 | perf::metric::expression::SubtractionExpression::evaluate(const std::optional left, 16 | const std::optional right) const 17 | { 18 | if (left.has_value() && right.has_value()) { 19 | return left.value() - right.value(); 20 | } 21 | 22 | return std::nullopt; 23 | } 24 | 25 | std::optional 26 | perf::metric::expression::MultiplyExpression::evaluate(const std::optional left, 27 | const std::optional right) const 28 | { 29 | if (left.has_value() && right.has_value()) { 30 | return left.value() * right.value(); 31 | } 32 | 33 | return std::nullopt; 34 | } 35 | 36 | std::optional 37 | perf::metric::expression::DivideExpression::evaluate(const std::optional left, 38 | const std::optional right) const 39 | { 40 | if (left.has_value() && right.has_value()) { 41 | return left.value() / right.value(); 42 | } 43 | 44 | return std::nullopt; 45 | } -------------------------------------------------------------------------------- /include/perfcpp/metric/expression/function.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "expression.h" 4 | #include 5 | #include 6 | #include 7 | 8 | namespace perf::metric::expression { 9 | /** 10 | * Calculates the ratio between both operands. 11 | */ 12 | class DRatioFunction final : public BinaryExpression 13 | { 14 | public: 15 | DRatioFunction(std::unique_ptr&& left, std::unique_ptr&& right) 16 | : BinaryExpression(std::move(left), std::move(right)) 17 | { 18 | } 19 | 20 | ~DRatioFunction() override = default; 21 | 22 | protected: 23 | [[nodiscard]] std::optional evaluate(std::optional left, std::optional right) const override; 24 | }; 25 | 26 | class SumFunction final : public ExpressionInterface 27 | { 28 | public: 29 | explicit SumFunction(std::vector>&& arguments) 30 | : _arguments(std::move(arguments)) 31 | { 32 | } 33 | 34 | ~SumFunction() override = default; 35 | 36 | /** 37 | * Sums up all arguments. 38 | * 39 | * @param result List of results. 40 | * @return The sum of all arguments. 41 | */ 42 | [[nodiscard]] std::optional evaluate(const CounterResult& result) const override; 43 | 44 | /** 45 | * Adds all counters for all arguments. 46 | * 47 | * @param hardware_counter_names List of hardware counters that will be augmented. 
48 | */ 49 | void add_required_hardware_counter(std::vector& hardware_counter_names) const override; 50 | 51 | private: 52 | std::vector> _arguments; 53 | }; 54 | } -------------------------------------------------------------------------------- /include/perfcpp/context_switch.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | 5 | namespace perf { 6 | class ContextSwitch 7 | { 8 | public: 9 | ContextSwitch(const bool is_out, 10 | const bool is_preempt, 11 | const std::optional process_id, 12 | const std::optional thread_id) noexcept 13 | : _is_out(is_out) 14 | , _is_preempt(is_preempt) 15 | , _process_id(process_id) 16 | , _thread_id(thread_id) 17 | { 18 | } 19 | ~ContextSwitch() noexcept = default; 20 | 21 | /** 22 | * @return True, if the process/thread was switched out. 23 | */ 24 | [[nodiscard]] bool is_out() const noexcept { return _is_out; } 25 | 26 | /** 27 | * @return True, if the process/thread was switched in. 28 | */ 29 | [[nodiscard]] bool is_in() const noexcept { return !_is_out; } 30 | 31 | /** 32 | * @return True, if the process/thread was preempted. 33 | */ 34 | [[nodiscard]] bool is_preempt() const noexcept { return _is_preempt; } 35 | 36 | /** 37 | * @return Id of the process, or std::nullopt if not provided (currently only provided on CPU-wide sampling). 38 | */ 39 | [[nodiscard]] std::optional process_id() const noexcept { return _process_id; } 40 | 41 | /** 42 | * @return Id of the thread, or std::nullopt if not provided (currently only provided on CPU-wide sampling). 43 | */ 44 | [[nodiscard]] std::optional thread_id() const noexcept { return _thread_id; } 45 | 46 | private: 47 | bool _is_out; 48 | bool _is_preempt; 49 | std::optional _process_id{ std::nullopt }; 50 | std::optional _thread_id{ std::nullopt }; 51 | }; 52 | } 53 | -------------------------------------------------------------------------------- /events/x86/micro-architecture-register.csv: -------------------------------------------------------------------------------- 1 | regex,vendor,micro-architecture 2 | AuthenticAMD-23-([12][0-9A-F]|[0-9A-F]),amd,zen-1 3 | AuthenticAMD-23-[0-9A-F]+,amd,zen-2 4 | AuthenticAMD-25-([245][0-9A-F]|[0-9A-F]),amd,zen-3 5 | AuthenticAMD-25-[0-9A-F]+,amd,zen-4 6 | AuthenticAMD-26-[0-9A-F]+,amd,zen-5 7 | GenuineIntel-6-(1C|26|27|35|36),intel,bonnell 8 | GenuineIntel-6-(37|4A|4C|4D|5A),intel,silvermont 9 | GenuineIntel-6-(3C|45|46),intel,haswell 10 | GenuineIntel-6-(3D|47),intel,broadwell 11 | GenuineIntel-6-(4E|5E|8E|9E|A5|A6),intel,skylake 12 | GenuineIntel-6-(57|85),intel,knights-landing 13 | GenuineIntel-6-(97|9A|B7|BA|BF),intel,alder-lake 14 | GenuineIntel-6-(AA|AC|B5),intel,meteor-lake 15 | GenuineIntel-6-1[AEF],intel,nehalem-ep 16 | GenuineIntel-6-25,intel,westmere-ep-sp 17 | GenuineIntel-6-2A,intel,sandy-bridge 18 | GenuineIntel-6-2C,intel,westmere-ep-dp 19 | GenuineIntel-6-2D,intel,jake-town 20 | GenuineIntel-6-2E,intel,nehalem-ex 21 | GenuineIntel-6-2F,intel,westmere-ex 22 | GenuineIntel-6-3A,intel,ivy-bridge 23 | GenuineIntel-6-3E,intel,ivy-town 24 | GenuineIntel-6-3F,intel,haswell-x 25 | GenuineIntel-6-4F,intel,broadwell-x 26 | GenuineIntel-6-55-[01234],intel,skylake-x 27 | GenuineIntel-6-55-[56789ABCDEF],intel,cascade-lake-x 28 | GenuineIntel-6-56,intel,broadwell-de 29 | GenuineIntel-6-5[CF],intel,goldmont 30 | GenuineIntel-6-6[AC],intel,ice-lake-x 31 | GenuineIntel-6-7A,intel,goldmont-plus 32 | GenuineIntel-6-7[DE],intel,ice-lake 33 | GenuineIntel-6-86,intel,snow-ridge-x 34 | 
GenuineIntel-6-8F,intel,sapphire-rapids 35 | GenuineIntel-6-8[CD],intel,tiger-lake 36 | GenuineIntel-6-9[6C],intel,elkhart-lake 37 | GenuineIntel-6-A7,intel,rocket-lake 38 | GenuineIntel-6-AF,intel,sierra-forest 39 | GenuineIntel-6-A[DE],intel,granite-rapids 40 | GenuineIntel-6-B6,intel,grand-ridge 41 | GenuineIntel-6-BD,intel,lunar-lake 42 | GenuineIntel-6-BE,intel,alder-lake-n 43 | GenuineIntel-6-CC,intel,panther-lake 44 | GenuineIntel-6-CF,intel,emerald-rapids 45 | GenuineIntel-6-C[56],intel,arrow-lake 46 | GenuineIntel-6-DD,intel,clearwater-forest 47 | -------------------------------------------------------------------------------- /include/perfcpp/time_event.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | 5 | namespace perf { 6 | class TimeEvent 7 | { 8 | public: 9 | virtual ~TimeEvent() noexcept = default; 10 | [[nodiscard]] virtual double calculate(std::chrono::steady_clock::time_point start, 11 | std::chrono::steady_clock::time_point end) const noexcept = 0; 12 | }; 13 | 14 | class SecondsTimeEvent final : public TimeEvent 15 | { 16 | public: 17 | [[nodiscard]] double calculate(const std::chrono::steady_clock::time_point start, 18 | const std::chrono::steady_clock::time_point end) const noexcept override 19 | { 20 | return static_cast(std::chrono::duration_cast(end - start).count()) / 1000000000.; 21 | } 22 | }; 23 | 24 | class MillisecondsTimeEvent final : public TimeEvent 25 | { 26 | public: 27 | [[nodiscard]] double calculate(const std::chrono::steady_clock::time_point start, 28 | const std::chrono::steady_clock::time_point end) const noexcept override 29 | { 30 | return static_cast(std::chrono::duration_cast(end - start).count()) / 1000000.; 31 | } 32 | }; 33 | 34 | class MicrosecondsTimeEvent final : public TimeEvent 35 | { 36 | public: 37 | [[nodiscard]] double calculate(const std::chrono::steady_clock::time_point start, 38 | const std::chrono::steady_clock::time_point end) const noexcept override 39 | { 40 | return static_cast(std::chrono::duration_cast(end - start).count()) / 1000.; 41 | } 42 | }; 43 | 44 | class NanosecondsTimeEvent final : public TimeEvent 45 | { 46 | public: 47 | [[nodiscard]] double calculate(const std::chrono::steady_clock::time_point start, 48 | const std::chrono::steady_clock::time_point end) const noexcept override 49 | { 50 | return static_cast(std::chrono::duration_cast(end - start).count()); 51 | } 52 | }; 53 | } -------------------------------------------------------------------------------- /examples/statistics/single_thread.cpp: -------------------------------------------------------------------------------- 1 | #include "perfcpp/event_counter.h" 2 | #include 3 | 4 | #include "../access_benchmark.h" 5 | 6 | int 7 | main() 8 | { 9 | std::cout << "libperf-cpp example: Record performance counter for " 10 | "single-threaded random access to an in-memory array." 11 | << std::endl; 12 | 13 | /// Initialize performance counters. 14 | auto event_counter = perf::EventCounter{}; 15 | 16 | /// Add all the performance counters we want to record. 17 | try { 18 | event_counter.add( 19 | { "instructions", "cycles", "branches", "branch-misses", "cycles-per-instruction", "nanoseconds", "gigahertz" }); 20 | } catch (std::runtime_error& e) { 21 | std::cerr << e.what() << std::endl; 22 | return 1; 23 | } 24 | 25 | /// Create random access benchmark. 
26 | auto benchmark = perf::example::AccessBenchmark{ /*randomize the accesses*/ true, 27 | /* create benchmark of 512 MB */ 512 }; 28 | 29 | /// Start recording. 30 | try { 31 | event_counter.start(); 32 | } catch (std::runtime_error& exception) { 33 | std::cerr << exception.what() << std::endl; 34 | return 1; 35 | } 36 | 37 | /// Execute the benchmark (accessing cache lines in a random order). 38 | auto value = 0ULL; 39 | for (auto index = 0U; index < benchmark.size(); ++index) { 40 | value += benchmark[index].value; 41 | } 42 | 43 | /// We do not want the compiler to optimize away this (otherwise) unused value (and consequently the loop above). 44 | benchmark.pretend_to_use(value); 45 | 46 | /// Stop recording counters. 47 | event_counter.stop(); 48 | 49 | /// Get the result (normalized per cache line). 50 | const auto result = event_counter.result(benchmark.size()); 51 | 52 | /// Print the performance counters manually. 53 | std::cout << "\nResults:\n"; 54 | for (const auto& [counter_name, counter_value] : result) { 55 | std::cout << counter_value << " " << counter_name << " / cache line" << std::endl; 56 | } 57 | 58 | /// Print the performance counters as table. 59 | std::cout << "\nResults as table:\n" << result.to_string() << std::endl; 60 | 61 | return 0; 62 | } -------------------------------------------------------------------------------- /src/exception.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | std::string 4 | perf::CannotOpenCounterError::create_error_message_from_code(const std::int64_t error_code) 5 | { 6 | switch (error_code) { 7 | case ENOENT: 8 | return "configuration might not be valid (e.g., unsupported event)"; 9 | case E2BIG: 10 | return "perf_event_attr.size was not configured properly – this could be a bug in the perf-cpp library"; 11 | case EACCES: 12 | return "insufficient access rights to start the counter, e.g., profiling a not user-owned process or " 13 | "perf_event_paranoid value too high (see " 14 | "https://github.com/jmuehlig/perf-cpp/blob/dev/docs/perf-paranoid.md)"; 15 | #ifndef PERFCPP_NO_ERROR_EBUSY /// Busy error is reported since Linux 4.1 16 | case EBUSY: 17 | return "another event has exclusive access to the PMU"; 18 | #endif 19 | case EINVAL: 20 | return "counter is configured with an invalid argument (e.g., too high sample frequency, unknown CPU, invalid " 21 | "sample type)"; 22 | case EMFILE: 23 | return "too many open file descriptors (e.g., too many opened counters?)"; 24 | case ENODEV: 25 | return "configured with feature that does not exist on this CPU"; 26 | case EOVERFLOW: 27 | return "maximal callchain stack size is higher than the maximum (see /proc/sys/kernel/perf_event_max_stack)"; 28 | case EPERM: 29 | return "one of the following features is set but not supported: excluding hypervisor, excluding idle, " 30 | "excluding " 31 | "user, or excluding kernel"; 32 | case ESRCH: 33 | return "specified process does not exist"; 34 | default: 35 | return "perf_event_open failed with unknown error"; 36 | } 37 | } 38 | 39 | std::string 40 | perf::IoctlError::create_error_message_from_code(const std::int64_t error_code) 41 | { 42 | switch (error_code) { 43 | case EBADF: 44 | return "file descriptor is not valid"; 45 | case EFAULT: 46 | return "references inaccessible memory area"; 47 | case ENOTTY: 48 | return "file descriptor cannot be used"; 49 | default: 50 | return "::ioctl failed with unknown error"; 51 | } 52 | } 
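/// Usage sketch: the messages assembled above surface through exceptions that the bundled examples
/// catch as std::runtime_error around EventCounter::start() and Sampler::start(). A minimal pattern
/// (the event name is illustrative):
///
///   auto event_counter = perf::EventCounter{};
///   event_counter.add({ "cycles" });
///   try {
///     event_counter.start();
///   } catch (std::runtime_error& exception) {
///     /// Prints, e.g., "Cannot open perf counter: insufficient access rights to start the counter, ...".
///     std::cerr << exception.what() << std::endl;
///   }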
-------------------------------------------------------------------------------- /examples/sampling/flame_graph.cpp: -------------------------------------------------------------------------------- 1 | #include "../access_benchmark.h" 2 | #include "perfcpp/analyzer/flame_graph_generator.h" 3 | #include "perfcpp/sampler.h" 4 | #include 5 | 6 | int 7 | main() 8 | { 9 | std::cout << "libperf-cpp example: Record perf samples including time, " 10 | "instruction pointer, and callchain for flamegraph generation." 11 | << std::endl; 12 | 13 | auto sampler = perf::Sampler{}; 14 | 15 | /// Event that generates an overflow which is samples. 16 | sampler.trigger("cycles", perf::Precision::RequestZeroSkid, perf::Period{ 50000U }); 17 | 18 | /// Include Timestamp, period, instruction pointer, and CPU number into samples. 19 | sampler.values().timestamp(true).instruction_pointer(true).callchain(true); 20 | 21 | /// Start sampling. 22 | try { 23 | sampler.start(); 24 | } catch (std::runtime_error& exception) { 25 | std::cerr << exception.what() << std::endl; 26 | return 1; 27 | } 28 | 29 | /// Create random access benchmark. 30 | auto benchmark = perf::example::AccessBenchmark{ /*randomize the accesses*/ true, 31 | /* create benchmark of 512 MB */ 512U }; 32 | 33 | /// Execute the benchmark (accessing cache lines in a random order). 34 | auto value = 0ULL; 35 | for (auto index = 0U; index < benchmark.size(); ++index) { 36 | value += benchmark[index].value; 37 | } 38 | 39 | /// We do not want the compiler to optimize away this (otherwise) unused value (and consequently the loop above). 40 | benchmark.pretend_to_use(value); 41 | 42 | /// Stop sampling. 43 | sampler.stop(); 44 | 45 | /// Get all the recorded samples. 46 | const auto samples = sampler.result(true); 47 | 48 | /// Translate into frame graph entries. 49 | auto flame_graph_generator = perf::analyzer::FlameGraphGenerator{}; 50 | flame_graph_generator.map(samples, "flamegraphs.txt"); 51 | 52 | std::cout << "Wrote samples into flamegraphs.txt" << std::endl; 53 | std::cout << "You can upload the flamgraphs.txt here: https://flamegraph.com/" << std::endl; 54 | 55 | /// Close the sampler. 56 | /// Note that the sampler can only be closed after reading the samples. 57 | sampler.close(); 58 | 59 | return 0; 60 | } -------------------------------------------------------------------------------- /test/access_benchmark.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | 6 | namespace perf::test { 7 | 8 | /** 9 | * Generator for unique and zipf data sets. 10 | */ 11 | class DataGenerator 12 | { 13 | public: 14 | [[nodiscard]] static std::vector generate_unique(std::size_t size); 15 | 16 | private: 17 | [[nodiscard]] static std::vector alphabet(std::size_t size); 18 | 19 | [[nodiscard]] static std::vector lookup_table(double zipf_param, const std::vector& alphabet); 20 | }; 21 | 22 | /** 23 | * Benchmark accessing benchmarks in random or sequential order. 24 | * This is an example to demonstrate the perfcpp library. 25 | */ 26 | class AccessBenchmark 27 | { 28 | public: 29 | /** 30 | * Object sized of one cache line. 
31 | */ 32 | struct alignas(64U) cache_line 33 | { 34 | cache_line() noexcept = default; 35 | explicit cache_line(const std::uint64_t value_) noexcept 36 | : value(value_) 37 | { 38 | } 39 | ~cache_line() noexcept = default; 40 | 41 | std::uint64_t value; 42 | }; 43 | 44 | AccessBenchmark(bool is_random, std::uint64_t access_data_size_in_mb, bool is_write = false); 45 | ~AccessBenchmark() = default; 46 | 47 | /** 48 | * @return Number of cache lines. 49 | */ 50 | [[nodiscard]] std::size_t size() const noexcept { return _indices.size(); } 51 | 52 | /** 53 | * Grant access to the i-th cache line, considering the defined access order. 54 | * 55 | * @param index Index of the cache line to access. 56 | * @return Cache line. 57 | */ 58 | [[nodiscard]] const cache_line& operator[](const std::size_t index) const noexcept 59 | { 60 | return _data_to_read[_indices[index]]; 61 | } 62 | 63 | void set(const std::size_t index, const std::uint64_t value) { _data_to_write[_indices[index]].value = value; } 64 | 65 | void run(); 66 | 67 | [[nodiscard]] const std::vector& indices() const noexcept { return _indices; } 68 | [[nodiscard]] const std::vector& data_to_read() const noexcept { return _data_to_read; } 69 | 70 | private: 71 | /// Indices, defining the order in which the memory chunk is accessed. 72 | std::vector _indices; 73 | 74 | /// Memory chunk that is read during the benchmark. 75 | std::vector _data_to_read; 76 | 77 | /// Memory chunk that is written during the benchmark. 78 | std::vector _data_to_write; 79 | }; 80 | } -------------------------------------------------------------------------------- /script/create_perf_list.py: -------------------------------------------------------------------------------- 1 | import re 2 | import subprocess 3 | import os 4 | 5 | 6 | codes_regex = re.compile(r'Codes\s+:\s?([a-zA-Z0-9]+)') 7 | 8 | def extract_events(content: str): 9 | # Find the event name 10 | name_match = re.search(r"Name\s+:\s+([a-zA-Z0-9\_\-]+)", content) 11 | if not name_match: 12 | return [] 13 | event_name = name_match.group(1) 14 | 15 | # Find all Umask entries and their descriptions 16 | umask_matches = re.findall(r"Umask-\d+\s+:\s+0x[0-9a-fA-F]+\s+:\s+PMU\s+:\s+\[(.*?)\]", content) 17 | 18 | if not umask_matches: 19 | return [event_name] 20 | 21 | # Combine the event name with each Umask 22 | return [f"{event_name}.{umask.replace(' ', '_')}" for umask in umask_matches] 23 | 24 | ## Clone or pull the repository 25 | if not os.path.exists('libpfm4'): 26 | subprocess.run(['git', 'clone', '-b', 'master', '--single-branch', 'https://github.com/wcohen/libpfm4.git', 'libpfm4'], stdout=subprocess.PIPE) 27 | else: 28 | os.chdir('libpfm4') 29 | subprocess.run(['git', 'pull'], stdout=subprocess.PIPE) 30 | os.chdir('..') 31 | 32 | ## Make the libpfm4 lib 33 | os.chdir('libpfm4') 34 | subprocess.run(['make'], stdout=subprocess.PIPE) 35 | 36 | ## Read all the event infos 37 | events_result = subprocess.run(['examples/showevtinfo'], stdout=subprocess.PIPE) 38 | counters = [] 39 | 40 | ## Transform into counters 41 | for events_content in str(events_result.stdout).split('#-----------------------------'): 42 | events = extract_events(events_content) 43 | for event_to_check in events: 44 | event_result = subprocess.run([f'examples/check_events', event_to_check], stdout=subprocess.PIPE) 45 | codes_match = re.search(r"Codes\s+:\s+(0x[0-9a-fA-F]+)", str(event_result.stdout)) 46 | 47 | if codes_match: 48 | counters.append((event_to_check, codes_match.group(1))) 49 | 50 | with open('../perf_list.csv', 'w') as 
perf_out_file: 51 | for counter in counters: 52 | perf_out_file.write(f'{counter[0]},{counter[1]}\n') 53 | 54 | print('-------------------------------------------------------------------------------------------------------------------------------------------------') 55 | print('WARNING: The \'perf-list\' target is deprecated and will be removed with perf-cpp v0.14! Use the processor-specific event files in events/ instead.') 56 | print('-------------------------------------------------------------------------------------------------------------------------------------------------') 57 | print(f'Wrote {len(counters)} counter definitions to \'perf_list.csv\'.') 58 | 59 | -------------------------------------------------------------------------------- /test/access_benchmark.cpp: -------------------------------------------------------------------------------- 1 | #include "access_benchmark.h" 2 | #include 3 | #include 4 | #include 5 | 6 | perf::test::AccessBenchmark::AccessBenchmark(const bool is_random, 7 | const std::uint64_t access_data_size_in_mb, 8 | const bool is_write) 9 | { 10 | const auto count_cache_lines = (access_data_size_in_mb * 1024U * 1024U) / sizeof(cache_line); 11 | 12 | /// Fill the data array with some unique. 13 | this->_data_to_read.reserve(count_cache_lines); 14 | for (const auto item : DataGenerator::generate_unique(count_cache_lines)) { 15 | this->_data_to_read.emplace_back(item); 16 | } 17 | 18 | if (is_write) { 19 | this->_data_to_write.resize(count_cache_lines); 20 | } 21 | 22 | /// Create the access pattern by filling the indices and shuffle, if we want a 23 | /// random access pattern. 24 | this->_indices.resize(count_cache_lines); 25 | std::iota(this->_indices.begin(), this->_indices.end(), 0U); 26 | 27 | if (is_random) { 28 | std::shuffle(this->_indices.begin(), this->_indices.end(), std::mt19937{ std::random_device{}() }); 29 | } 30 | } 31 | 32 | std::vector 33 | perf::test::DataGenerator::generate_unique(const std::size_t size) 34 | { 35 | /// Create a list for the tuples. 36 | auto relation = std::vector{}; 37 | relation.reserve(size); 38 | 39 | /// Create tuples. 40 | auto generator = std::mt19937{ 864896UL }; 41 | auto distribution = std::uniform_int_distribution{}; 42 | for (auto i = 0ULL; i < size; ++i) { 43 | relation.emplace_back(distribution(generator)); 44 | } 45 | 46 | /// Shuffle the relation. 47 | std::shuffle(relation.begin(), relation.end(), generator); 48 | 49 | return relation; 50 | } 51 | 52 | void 53 | perf::test::AccessBenchmark::run() 54 | { 55 | const auto is_readonly = this->_data_to_read.size() != this->_data_to_write.size(); 56 | auto value = 0ULL; 57 | 58 | if (is_readonly) { 59 | for (auto index = 0U; index < this->size(); ++index) { 60 | value += this->_data_to_read[this->_indices[index]].value; 61 | } 62 | } else { 63 | for (auto index = 0U; index < this->size(); ++index) { 64 | value += this->_data_to_read[this->_indices[index]].value; 65 | 66 | this->_data_to_write[this->_indices[index]].value = value; 67 | } 68 | } 69 | 70 | asm volatile("" 71 | : "+r,m"(value) 72 | : 73 | : "memory"); /// We do not want the compiler to optimize away 74 | /// this unused value. 
75 | } -------------------------------------------------------------------------------- /docs/perf-paranoid.md: -------------------------------------------------------------------------------- 1 | # Perf Paranoid 2 | 3 | This section explains the *perf paranoid* setting in Linux, which controls access to performance monitoring features and can restrict unprivileged users from using performance counters. 4 | 5 | ## Understanding Perf Paranoid 6 | The *perf paranoid* level is defined in `/proc/sys/kernel/perf_event_paranoid`, with values ranging from *highly restrictive* to *fully permissive*: 7 | 8 | | Value | Access Level | 9 | |--------|----------------------------------------------------------------------| 10 | | `-1` | No restrictions (full access). | 11 | | `0` | Allow normal users access, but no raw tracepoint samples. | 12 | | `1` | Allow user and kernel-level profiling (default before Linux `4.6`). | 13 | | `>= 2` | Only user-level measurements allowed (**default since Linux** `4.6`). | 14 | 15 | 16 | If the setting is too restrictive, you may encounter errors like: 17 | 18 | Cannot open perf counter: insufficient access rights to start the counter, 19 | e.g., profiling a not user-owned process or perf_event_paranoid value too high. 20 | 21 | This can be resolved either by [modifying the paranoid level](#setting-the-perf-paranoid-value) or [adjusting monitoring settings](#adjusting-monitoring-configuration). 22 | 23 | ## Setting the Perf Paranoid Value 24 | To enable full access (if permitted by system policy), you can lower the paranoid value **temporarily** with: 25 | 26 | ```bash 27 | sudo sysctl -w kernel.perf_event_paranoid=-1 28 | ``` 29 | 30 | For a **persistent** change, add the following line to `/etc/sysctl.conf`: 31 | 32 | ``` 33 | kernel.perf_event_paranoid = -1 34 | ``` 35 | 36 | Then apply changes with: 37 | 38 | ```bash 39 | sudo sysctl --system 40 | ``` 41 | 42 | ## Adjusting Monitoring Configuration 43 | If you **cannot modify the paranoid level**, you may still be able to record user-level events only. 44 | Use the `perf::Config` class to **disable kernel/hypervisor-level** measurements, which allows profiling under restrictive `perf_event_paranoid` settings (`>= 2`). 45 | 46 | ```cpp 47 | const auto counter_definitions = perf::CounterDefinition{}; 48 | 49 | auto config = perf::Config{}; 50 | config.include_kernel(false); /// Disable kernel event sampling 51 | config.include_hypervisor(false); /// Disable hypervisor event sampling 52 | 53 | auto event_counter = perf::EventCounter{ counter_definitions, config }; 54 | event_counter.add(...); 55 | 56 | event_counter.start(); /// Will only record user-level events. 57 | ``` 58 | 59 | To further restrict monitoring exclude guest events: 60 | 61 | ```cpp 62 | config.include_guest(false); 63 | ``` -------------------------------------------------------------------------------- /examples/statistics/live_events.cpp: -------------------------------------------------------------------------------- 1 | #include "../access_benchmark.h" 2 | #include "perfcpp/event_counter.h" 3 | #include 4 | 5 | int 6 | main() 7 | { 8 | std::cout 9 | << "libperf-cpp example: Record a events and live events where the latter can be read without stopping the " 10 | "hardware performance counters. As a benchmark, we use random access to an in-memory array multiple times." 11 | << std::endl; 12 | 13 | auto event_counter = perf::EventCounter{}; 14 | 15 | try { 16 | /// Add counters that are recorded over the entire period (from start to end). 
17 | // event_counter.add({ "cycles", "instructions", "cache-references", "cache-misses", "branches" }); 18 | 19 | /// Add live counters that can be read without stopping the EventCounter. 20 | event_counter.add_live(std::vector{ "cache-references", "cache-misses", "branches" }); 21 | } catch (std::runtime_error& e) { 22 | std::cerr << e.what() << std::endl; 23 | return 1; 24 | } 25 | 26 | /// Create random access benchmark. 27 | auto benchmark = perf::example::AccessBenchmark{ /*randomize the accesses*/ true, 28 | /* create benchmark of 512 MB */ 512U }; 29 | 30 | /// Access to live events. Needs to be initiated after adding all live events. 31 | auto live_events = perf::LiveEventCounter{ event_counter }; 32 | 33 | /// Start recording. 34 | try { 35 | event_counter.start(); 36 | } catch (std::runtime_error& exception) { 37 | std::cerr << exception.what() << std::endl; 38 | return 1; 39 | } 40 | 41 | /// Execute the benchmark (accessing cache lines in a random order). 42 | constexpr auto iterations = 20U; 43 | for (auto i = 0U; i < iterations; ++i) { 44 | /// Read current values of live events and mark them as "start" values. 45 | live_events.start(); 46 | 47 | /// Perform benchmark. 48 | auto value = 0ULL; 49 | for (auto index = 0U; index < benchmark.size(); ++index) { 50 | value += benchmark[index].value; 51 | } 52 | 53 | /// We do not want the compiler to optimize away this (otherwise) unused value (and consequently the loop above). 54 | benchmark.pretend_to_use(value); 55 | 56 | /// Read the current counter value after the benchmark. 57 | live_events.stop(); 58 | 59 | /// Print the live values. 60 | std::cout << "Live results: " << live_events.get("cache-references", benchmark.size()) << " cache-references, " 61 | << live_events.get("cache-misses", benchmark.size()) << " cache-misses, " 62 | << live_events.get("branches", benchmark.size()) << " branches" << std::endl; 63 | } 64 | 65 | /// Stop recording counters. 66 | event_counter.stop(); 67 | 68 | return 0; 69 | } -------------------------------------------------------------------------------- /examples/sampling/perf_record.cpp: -------------------------------------------------------------------------------- 1 | #include "../access_benchmark.h" 2 | #include "perfcpp/hardware_info.h" 3 | #include "perfcpp/sampler.h" 4 | #include 5 | 6 | int 7 | main() 8 | { 9 | std::cout << "libperf-cpp example: Record perf samples including time, " 10 | "logical memory address, latency, data source, and instruction and write as a perf data file `perf.dat`. " 11 | << std::endl; 12 | 13 | /// Initialize sampler. 14 | auto sampler = perf::Sampler{}; 15 | 16 | /// Setup which counters trigger the writing of samples (depends on the underlying hardware substrate). 17 | if (perf::HardwareInfo::is_amd_ibs_supported()) { 18 | sampler.trigger("ibs_op_uops", perf::Precision::MustHaveZeroSkid, perf::Period{ 16000 }); 19 | } else if (perf::HardwareInfo::is_intel()) { 20 | sampler.trigger("mem-loads", perf::Precision::MustHaveZeroSkid, perf::Period{ 16000 }); 21 | } else { 22 | std::cout << "Error: Memory sampling is not supported on this CPU." << std::endl; 23 | return 1; 24 | } 25 | 26 | /// Setup which data will be included into samples (timestamp, virtual memory address, data source like L1d or RAM, 27 | /// latency, instruction address, thread id, and the callstack). 28 | sampler.values().timestamp(true).logical_memory_address(true).data_source(true).latency(true).instruction_pointer(true).thread_id(true).callchain(true); 29 | 30 | /// Start sampling. 
31 | try { 32 | sampler.start(); 33 | } catch (std::runtime_error& exception) { 34 | std::cerr << exception.what() << std::endl; 35 | return 1; 36 | } 37 | 38 | /// Create random access benchmark. 39 | auto benchmark = perf::example::AccessBenchmark{ /*randomize the accesses*/ true, 40 | /* create benchmark of 1024 MB */ 1024U }; 41 | 42 | /// Execute the benchmark (accessing cache lines in a random order). 43 | auto value = 0ULL; 44 | for (auto index = 0U; index < benchmark.size(); ++index) { 45 | value += benchmark[index].value; 46 | } 47 | 48 | /// We do not want the compiler to optimize away this (otherwise) unused value (and consequently the loop above). 49 | benchmark.pretend_to_use(value); 50 | 51 | /// Stop sampling. 52 | sampler.stop(); 53 | 54 | /// Write sample results to perf file. 55 | sampler.to_perf_file("perf.data"); 56 | 57 | std::cout << "Wrote " << sampler.result().size() << " samples to `perf.data`." 58 | << "\n Run `perf report` to show overhead per symbol" 59 | << "\n Run `perf mem report` to show overhead per data object" 60 | << std::endl; 61 | 62 | 63 | /// Close the sampler. 64 | /// Note that the sampler can only be closed after reading the samples. 65 | sampler.close(); 66 | 67 | return 0; 68 | } -------------------------------------------------------------------------------- /include/perfcpp/counter_result.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | namespace perf { 10 | class CounterResult 11 | { 12 | public: 13 | using iterator = std::vector>::iterator; 14 | using const_iterator = std::vector>::const_iterator; 15 | 16 | CounterResult() = default; 17 | CounterResult(CounterResult&&) noexcept = default; 18 | CounterResult(const CounterResult&) = default; 19 | explicit CounterResult(std::vector>&& results) noexcept 20 | : _results(std::move(results)) 21 | { 22 | } 23 | 24 | ~CounterResult() = default; 25 | 26 | CounterResult& operator=(CounterResult&&) noexcept = default; 27 | CounterResult& operator=(const CounterResult&) = default; 28 | 29 | /** 30 | * Access the result of the counter or metric with the given name. 31 | * 32 | * @param name Name of the counter or metric to access. 33 | * @return The value, or std::nullopt of the result has no counter or value with the requested name. 34 | */ 35 | [[nodiscard]] std::optional get(std::string_view name) const noexcept; 36 | 37 | [[nodiscard]] std::optional operator[](const std::string_view name) const noexcept { return get(name); } 38 | 39 | [[nodiscard]] iterator begin() { return _results.begin(); } 40 | [[nodiscard]] iterator end() { return _results.end(); } 41 | [[nodiscard]] const_iterator begin() const { return _results.begin(); } 42 | [[nodiscard]] const_iterator end() const { return _results.end(); } 43 | 44 | /** 45 | * Adds the given result to the end of the results. 46 | * 47 | * @param name Name of the result. 48 | * @param value Value of the result. 49 | */ 50 | void emplace_back(const std::string_view name, const double value) { _results.emplace_back(name, value); } 51 | 52 | /** 53 | * Converts the result to a json-formatted string. 54 | * @return Result in JSON format. 55 | */ 56 | [[nodiscard]] std::string to_json() const; 57 | 58 | /** 59 | * Converts the result to a CSV-formatted string. 60 | * 61 | * @param delimiter Char to separate columns (',' by default). 62 | * @param print_header If true, the header will be printed first (true by default). 
63 | * @return Result in CSV format. 64 | */ 65 | [[nodiscard]] std::string to_csv(char delimiter = ',', bool print_header = true) const; 66 | 67 | /** 68 | * Converts the result to a table-formatted string. 69 | * @return Result as a table-formatted string. 70 | */ 71 | [[nodiscard]] std::string to_string() const; 72 | 73 | private: 74 | std::vector> _results; 75 | }; 76 | } -------------------------------------------------------------------------------- /include/perfcpp/util/unique_file_descriptor.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | #include 5 | 6 | namespace perf::util { 7 | /** 8 | * The unique file descriptor is backed by a common file descriptor; however, it does not allow copying. 9 | * Destroying the unique file descriptor leads to closing the underyling file descriptor – comparable to a unique 10 | * pointer. 11 | */ 12 | class UniqueFileDescriptor 13 | { 14 | public: 15 | UniqueFileDescriptor() noexcept = default; 16 | 17 | explicit UniqueFileDescriptor(const std::int64_t file_descriptor) noexcept 18 | : _file_descriptor(static_cast(file_descriptor)) 19 | { 20 | } 21 | 22 | UniqueFileDescriptor(UniqueFileDescriptor&& other) noexcept 23 | : _file_descriptor(std::exchange(other._file_descriptor, -1L)) 24 | { 25 | } 26 | 27 | /** 28 | * Closes the file descriptor underneath. 29 | */ 30 | ~UniqueFileDescriptor() 31 | { 32 | if (has_value()) { 33 | ::close(value()); 34 | } 35 | } 36 | 37 | UniqueFileDescriptor& operator=(UniqueFileDescriptor&& other) noexcept 38 | { 39 | _file_descriptor = std::exchange(other._file_descriptor, -1L); 40 | return *this; 41 | } 42 | 43 | UniqueFileDescriptor& operator=(const std::int64_t file_descriptor) noexcept 44 | { 45 | _file_descriptor = static_cast(file_descriptor); 46 | return *this; 47 | } 48 | 49 | UniqueFileDescriptor& operator=(const std::int32_t file_descriptor) noexcept 50 | { 51 | _file_descriptor = file_descriptor; 52 | return *this; 53 | } 54 | 55 | /** 56 | * @return True, if the filter descriptor underneath is opened. 57 | */ 58 | [[nodiscard]] bool has_value() const noexcept { return _file_descriptor > -1LL; } 59 | 60 | /** 61 | * @return The "real" filter descriptor. 62 | */ 63 | [[nodiscard]] std::int32_t value() const noexcept { return _file_descriptor; } 64 | 65 | private: 66 | std::int32_t _file_descriptor{ -1LL }; 67 | }; 68 | 69 | /** 70 | * The file descriptor view grants access to a file descriptor without closing it when destroyed. 71 | * Logically, the view does not own the file descriptor; it can be outdated. 72 | */ 73 | class FileDescriptorView 74 | { 75 | public: 76 | FileDescriptorView() noexcept = default; 77 | 78 | explicit FileDescriptorView(const UniqueFileDescriptor& file_descriptor) noexcept 79 | : _file_descriptor(file_descriptor.value()) 80 | { 81 | } 82 | FileDescriptorView(const FileDescriptorView&) noexcept = default; 83 | 84 | ~FileDescriptorView() noexcept = default; 85 | 86 | /** 87 | * @return True, if the filter descriptor underneath is opened. 88 | */ 89 | [[nodiscard]] bool has_value() const noexcept { return _file_descriptor > -1LL; } 90 | 91 | /** 92 | * @return The "real" filter descriptor. 
93 | */ 94 | [[nodiscard]] std::int32_t value() const noexcept { return _file_descriptor; } 95 | 96 | private: 97 | std::int32_t _file_descriptor{ -1LL }; 98 | }; 99 | } -------------------------------------------------------------------------------- /events/x86/intel/panther-lake.csv: -------------------------------------------------------------------------------- 1 | br_inst_retired.all_branches, 0xc4 2 | br_misp_retired.all_branches, 0xc5 3 | cpu_clk_unhalted.core_p, 0x3c 4 | cpu_clk_unhalted.ref_tsc_p, 0x13c 5 | cpu_clk_unhalted.thread_p, 0x3c 6 | dtlb_load_misses.walk_completed, 0xe08 7 | dtlb_store_misses.walk_completed, 0xe49 8 | icache.accesses, 0x380 9 | icache.misses, 0x280 10 | idq_bubbles.core, 0x19c 11 | inst_retired.any_p, 0xc0 12 | itlb_misses.walk_completed, 0xe85 13 | l2_request.all, 0x1ff24 14 | l2_rqsts.all_code_rd, 0xe424 15 | l2_rqsts.all_demand_data_rd, 0xe124 16 | ld_blocks.store_forward, 0x203 17 | longest_lat_cache.miss, 0x412e 18 | longest_lat_cache.reference, 0x4f2e 19 | mem_inst_retired.all_loads, 0x81d0 20 | mem_inst_retired.all_stores, 0x82d0 21 | mem_trans_retired.load_latency_gt_1024, 0x1cd, 0x400 22 | mem_trans_retired.load_latency_gt_128, 0x1cd, 0x80 23 | mem_trans_retired.load_latency_gt_16, 0x1cd, 0x10 24 | mem_trans_retired.load_latency_gt_2048, 0x1cd, 0x800 25 | mem_trans_retired.load_latency_gt_256, 0x1cd, 0x100 26 | mem_trans_retired.load_latency_gt_32, 0x1cd, 0x20 27 | mem_trans_retired.load_latency_gt_4, 0x1cd, 0x4 28 | mem_trans_retired.load_latency_gt_512, 0x1cd, 0x200 29 | mem_trans_retired.load_latency_gt_64, 0x1cd, 0x40 30 | mem_trans_retired.load_latency_gt_8, 0x1cd, 0x8 31 | mem_trans_retired.store_sample, 0x2cd 32 | mem_uops_retired.all_loads, 0x81d0 33 | mem_uops_retired.all_stores, 0x82d0 34 | mem_uops_retired.load_latency_gt_1024, 0x5d0, 0x400 35 | mem_uops_retired.load_latency_gt_128, 0x5d0, 0x80 36 | mem_uops_retired.load_latency_gt_16, 0x5d0, 0x10 37 | mem_uops_retired.load_latency_gt_2048, 0x5d0, 0x800 38 | mem_uops_retired.load_latency_gt_256, 0x5d0, 0x100 39 | mem_uops_retired.load_latency_gt_32, 0x5d0, 0x20 40 | mem_uops_retired.load_latency_gt_4, 0x5d0, 0x4 41 | mem_uops_retired.load_latency_gt_512, 0x5d0, 0x200 42 | mem_uops_retired.load_latency_gt_64, 0x5d0, 0x40 43 | mem_uops_retired.load_latency_gt_8, 0x5d0, 0x8 44 | mem_uops_retired.store_latency, 0x6d0 45 | misc_retired.lbr_inserts, 0x1e4 46 | ocr.demand_data_rd.any_response, 0x1b7, 0x10001 47 | ocr.demand_data_rd.any_response_0, 0x12a, 0x10001 48 | ocr.demand_data_rd.any_response_1, 0x12b, 0x10001 49 | ocr.demand_data_rd.dram, 0x1b7, 0x7bc000001 50 | ocr.demand_data_rd.dram_0, 0x12a, 0x1e780000001 51 | ocr.demand_data_rd.dram_1, 0x12b, 0x1e780000001 52 | ocr.demand_data_rd.l3_miss, 0x1b7, 0x13fbfc00001 53 | ocr.demand_data_rd.l3_miss_0, 0x12a, 0x9e7fa000001 54 | ocr.demand_data_rd.l3_miss_1, 0x12b, 0x9e7fa000001 55 | ocr.demand_rfo.any_response, 0x1b7, 0x10002 56 | ocr.demand_rfo.any_response_0, 0x12a, 0x10002 57 | ocr.demand_rfo.any_response_1, 0x12b, 0x10002 58 | ocr.demand_rfo.l3_miss, 0x1b7, 0x13fbfc00002 59 | ocr.demand_rfo.l3_miss_0, 0x12a, 0x9e7fa000002 60 | ocr.demand_rfo.l3_miss_1, 0x12b, 0x9e7fa000002 61 | topdown.backend_bound_slots, 0x2a4 62 | topdown.slots_p, 0x1a4 63 | topdown_bad_speculation.all_p, 0x73 64 | topdown_be_bound.all, 0x2a4 65 | topdown_be_bound.all_p, 0x2a4 66 | topdown_fe_bound.all_p, 0x19c 67 | topdown_retiring.all_p, 0x2c2 68 | uops_retired.slots, 0x2c2 69 | -------------------------------------------------------------------------------- 
/examples/README.md: -------------------------------------------------------------------------------- 1 | # *perf-cpp* Examples 2 | 3 | We included various examples to teach you how to use *perf-cpp* and leverage hardware performance counter results directly from your application. 4 | 5 | ## How to Build the Examples 6 | 7 | ``` 8 | # Clone the repository 9 | git clone https://github.com/jmuehlig/perf-cpp.git 10 | 11 | # Switch to the cloned folder 12 | cd perf-cpp 13 | 14 | # Generate the Makefile 15 | cmake . -B build -DBUILD_EXAMPLES=1 16 | 17 | # Build the examples 18 | cmake --build build --target examples 19 | ``` 20 | 21 | The examples will be built to `build/examples/bin/`. 22 | 23 | ## List of Examples 24 | ### Counting Hardware Events 25 | - [statistics/single_thread.cpp](statistics/single_thread.cpp) provides an example to record and read performance counters for a specific code segment on a **single** thread. 26 | - [statistics/inherit_thread.cpp](statistics/inherit_thread.cpp) advances the example to record counter statistics not only from one but also for its **child-threads**. 27 | - [statistics/multi_thread.cpp](statistics/multi_thread.cpp) shows how to record performance counter statistics on **multiple** threads. 28 | - [statistics/multi_cpu.cpp](statistics/multi_cpu.cpp) shows how to pin performance counters to **specific CPU cores** instead of focussing on threads and processes. 29 | - [statistics/live_events.cpp](statistics/live_events.cpp) shows how to access hardware counters with **low latency**. 30 | - [statistics/metric.cpp](statistics/metric.cpp) shows how define new metrics. 31 | 32 | 33 | ### Sampling 34 | - [sampling/instruction_pointer.cpp](sampling/instruction_pointer.cpp) provides an example to sample instruction pointers on a single thread. 35 | - [sampling/flame_graph.cpp](sampling/flame_graph.cpp) provides an example to generate a format that can be used by flamegraph generators. 36 | - [sampling/perf_record.cpp](sampling/perf_record.cpp) generates a `perf.data` file that can be read and analyzed via the *Linux perf* tool using `perf report` or `perf mem report`. 37 | - [sampling/memory_address.cpp](sampling/memory_address.cpp) provides an example to sample virtual memory addresses, their latency, and their origin. 38 | - [sampling/counter.cpp](sampling/counter.cpp) shows how to include values of further hardware performance counters into samples. 39 | - [sampling/branch.cpp](sampling/branch.cpp) exemplifies sampling for last branch records and their prediction success. 40 | - [sampling/register.cpp](sampling/register.cpp) provides an example on how to include values of specific registers into samples. 41 | - [sampling/context_switch.cpp](sampling/context_switch.cpp) provides an example that samples context switches on a single thread. 42 | - [sampling/multi_event.cpp](sampling/multi_event.cpp) exemplifies how to use multiple events as a trigger using Intel counters as an example. 43 | - [sampling/multi_thread.cpp)](sampling/multi_thread.cpp) explains how to sample data on multiple threads at the same time. 44 | - [sampling/multi_cpu.cpp](sampling/multi_cpu.cpp) provides an example that monitors multiple CPU cores and records samples. 
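### Minimal Counting Example (Sketch)

The counting examples above all follow the structure of [statistics/single_thread.cpp](statistics/single_thread.cpp): create a `perf::EventCounter`, add the events to record, wrap the code of interest in `start()`/`stop()`, and read the result. A condensed sketch (error handling omitted; the event names and the normalization factor passed to `result()` are illustrative):

```cpp
#include "perfcpp/event_counter.h"
#include <iostream>

int
main()
{
  auto event_counter = perf::EventCounter{};
  event_counter.add({ "instructions", "cycles" });

  event_counter.start();
  /// ... the code you want to measure ...
  event_counter.stop();

  /// Read the counters; the argument normalizes the values
  /// (single_thread.cpp, for example, normalizes per accessed cache line).
  const auto result = event_counter.result(1U);
  for (const auto& [counter_name, counter_value] : result) {
    std::cout << counter_value << " " << counter_name << std::endl;
  }

  return 0;
}
```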
45 | -------------------------------------------------------------------------------- /include/perfcpp/feature.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | /// The features of the perf subsystem have evolved over time (more precisely over Linux Kernel generations). 6 | /// In this file, we define some preprocessor variables to keep up with older Linux Kernel versions without yielding 7 | /// errors at compile- and runtime. 8 | /// The documentation for the perf_event_open system call (https://man7.org/linux/man-pages/man2/perf_event_open.2.html) 9 | /// has a great overview of features added in various versions. 10 | /// For the time being, we support Linux 4.0 and newer. 11 | 12 | #if LINUX_VERSION_CODE < KERNEL_VERSION(4, 1, 0) 13 | #define PERFCPP_NO_ERROR_EBUSY 14 | #define PERFCPP_NO_MMAP_DATA_SIZE 15 | #endif 16 | 17 | #if LINUX_VERSION_CODE < KERNEL_VERSION(4, 2, 0) 18 | #define PERFCPP_NO_SAMPLE_BRANCH_IND_JUMP 19 | #define PERFCPP_NO_RECORD_LOST_SAMPLES 20 | #endif 21 | 22 | #if LINUX_VERSION_CODE < KERNEL_VERSION(4, 3, 0) 23 | #define PERFCPP_NO_RECORD_SWITCH 24 | #define PERFCPP_NO_BRANCH_STACK_CYCLES 25 | #endif 26 | 27 | #if LINUX_VERSION_CODE < KERNEL_VERSION(4, 4, 0) 28 | #define PERFCPP_NO_SAMPLE_BRANCH_CALL 29 | #define PERFCPP_NO_COUNT_SW_BPF_OUTPUT 30 | #endif 31 | 32 | #if LINUX_VERSION_CODE < KERNEL_VERSION(4, 8, 0) 33 | #define PERFCPP_NO_SAMPLE_MAX_STACK 34 | #endif 35 | 36 | #if LINUX_VERSION_CODE < KERNEL_VERSION(4, 13, 0) 37 | #define PERFCPP_NO_SAMPLE_PHYS_ADDR 38 | #endif 39 | 40 | #if LINUX_VERSION_CODE < KERNEL_VERSION(4, 14, 0) 41 | #define PERFCPP_NO_MEM_REMOTE 42 | #define PERFCPP_NO_MEM_SNOOPX 43 | #endif 44 | 45 | #if LINUX_VERSION_CODE < KERNEL_VERSION(4, 17, 0) 46 | #define PERFCPP_NO_RECORD_MISC_SWITCH_OUT_PREEMPT 47 | #endif 48 | 49 | #if LINUX_VERSION_CODE < KERNEL_VERSION(5, 7, 0) 50 | #define PERFCPP_NO_RECORD_CGROUP 51 | #define PERFCPP_NO_SAMPLE_CGROUP 52 | #endif 53 | 54 | #if LINUX_VERSION_CODE < KERNEL_VERSION(5, 11, 0) 55 | #define PERFCPP_NO_SAMPLE_DATA_PAGE_SIZE 56 | #define PERFCPP_NO_SAMPLE_CODE_PAGE_SIZE 57 | #endif 58 | 59 | #if LINUX_VERSION_CODE < KERNEL_VERSION(5, 12, 0) 60 | #define PERFCPP_NO_SAMPLE_WEIGHT_STRUCT 61 | #define PERFCPP_NO_MEM_BLK 62 | #endif 63 | 64 | #if LINUX_VERSION_CODE < KERNEL_VERSION(5, 13, 0) 65 | #define PERFCPP_NO_CGROUP_SWITCHES 66 | #define PERFCPP_NO_INHERIT_THREAD 67 | #endif 68 | 69 | #if LINUX_VERSION_CODE < KERNEL_VERSION(5, 16, 0) 70 | #define PERFCPP_NO_MEM_HOPS_0 71 | #endif 72 | 73 | #if LINUX_VERSION_CODE < KERNEL_VERSION(5, 17, 0) 74 | #define PERFCPP_NO_MEM_HOPS_1_3 75 | #endif 76 | 77 | #if LINUX_VERSION_CODE < KERNEL_VERSION(6, 0, 0) 78 | #define PERFCPP_NO_FORMAT_LOST 79 | #endif 80 | 81 | #if LINUX_VERSION_CODE < KERNEL_VERSION(6, 1, 0) 82 | #define PERFCPP_NO_MEM_LVLNUM 83 | #define PERFCPP_NO_MEM_LVLNUM_PMEM 84 | #define PERFCPP_NO_MEM_LVLNUM_IO 85 | #define PERFCPP_NO_MEM_LVLNUM_CXL 86 | #define PERFCPP_NO_MEM_SNOOPX_PEER 87 | #endif 88 | 89 | #if LINUX_VERSION_CODE < KERNEL_VERSION(6, 6, 0) 90 | #define PERFCPP_NO_MEM_LVLNUM_UNC 91 | #endif 92 | 93 | #if LINUX_VERSION_CODE < KERNEL_VERSION(6, 11, 0) 94 | #define PERFCPP_NO_MEM_LVLNUM_L2_MHB 95 | #endif 96 | -------------------------------------------------------------------------------- /include/perfcpp/metric/expression/tokenizer.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "token.h" 
4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | namespace perf::metric::expression { 11 | /** 12 | * The Tokenizer translates a given input string into a queue of tokens. 13 | */ 14 | class Tokenizer 15 | { 16 | public: 17 | explicit Tokenizer(std::string&& input) 18 | : _input(std::move(input)) 19 | { 20 | } 21 | 22 | /** 23 | * @return The original input. 24 | */ 25 | [[nodiscard]] const std::string& input() const noexcept { return _input; } 26 | 27 | /** 28 | * Tokenizes the input string. 29 | * @return Queue of tokens. 30 | */ 31 | [[nodiscard]] std::optional next(); 32 | 33 | private: 34 | /// The expression to tokenize. 35 | const std::string _input; 36 | 37 | std::size_t _position{ 0U }; 38 | 39 | /** 40 | * @return Position after skipping all whitespaces. 41 | */ 42 | [[nodiscard]] std::size_t skip_whitespaces() const noexcept; 43 | 44 | /** 45 | * Reads a constant number (e.g., 13.37) from the input string, starting at the given position. 46 | * 47 | * @param begin Position within the input string. 48 | * @return A tuple (token containing the constant, new position). 49 | */ 50 | [[nodiscard]] std::pair read_constant(std::size_t begin) const; 51 | 52 | /** 53 | * Reads an identifier (e.g., a hardware counter name) from the input string, starting at the given position. 54 | * 55 | * @param begin Position within the input string. 56 | * @return A tuple (token containing the identifier, new position). 57 | */ 58 | [[nodiscard]] std::pair read_identifier(std::size_t begin) const; 59 | 60 | /** 61 | * Reads an operator (e.g., +) from the given char. 62 | * 63 | * @param current_char Current char from input string. 64 | * @return The metric operator. 65 | */ 66 | [[nodiscard]] Operator_ read_operator(char current_char) const; 67 | 68 | /** 69 | * Checks if the given char is an escape character. 70 | * 71 | * @param current_char Current character. 72 | * @return True, if the given char is an escape character. 73 | */ 74 | [[nodiscard]] static bool is_escape_char(char current_char) noexcept 75 | { 76 | return current_char == '\'' || current_char == '`'; 77 | } 78 | 79 | /** 80 | * Checks if the given char could belong to an identifier (alphanumerical chars, _, ., etc.). 81 | * 82 | * @param char_ Char to check. 83 | * @return True, if the char could belong to an identifier. 84 | */ 85 | [[nodiscard]] static bool is_identifier_char(const char char_) noexcept 86 | { 87 | return std::isalnum(char_) || char_ == '_' || char_ == '.'; 88 | } 89 | 90 | /** 91 | * Checks if the given char is a scientific 'e'. 92 | * 93 | * @param char_ Char to check. 94 | * @return True, if the char could is a scientific 'e'. 
95 | */ 96 | [[nodiscard]] static bool is_scientific_e(const char char_) noexcept { return char_ == 'e' || char_ == 'E'; } 97 | }; 98 | } -------------------------------------------------------------------------------- /src/counter_result.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | std::optional 7 | perf::CounterResult::get(std::string_view name) const noexcept 8 | { 9 | if (const auto result_iterator = std::find_if( 10 | this->_results.begin(), this->_results.end(), [&name](const auto res) { return name == res.first; }); 11 | result_iterator != this->_results.end()) { 12 | return result_iterator->second; 13 | } 14 | 15 | return std::nullopt; 16 | } 17 | 18 | std::string 19 | perf::CounterResult::to_json() const 20 | { 21 | auto json_stream = std::stringstream{}; 22 | 23 | json_stream << "{"; 24 | 25 | for (auto i = 0U; i < this->_results.size(); ++i) { 26 | if (i > 0U) { 27 | json_stream << ","; 28 | } 29 | 30 | json_stream << "\"" << this->_results[i].first << "\": " << this->_results[i].second; 31 | } 32 | 33 | json_stream << "}"; 34 | 35 | return json_stream.str(); 36 | } 37 | 38 | std::string 39 | perf::CounterResult::to_csv(const char delimiter, const bool print_header) const 40 | { 41 | auto csv_stream = std::stringstream{}; 42 | 43 | if (print_header) { 44 | csv_stream << "counter" << delimiter << "value\n"; 45 | } 46 | 47 | for (auto i = 0U; i < this->_results.size(); ++i) { 48 | if (i > 0U) { 49 | csv_stream << "\n"; 50 | } 51 | 52 | csv_stream << this->_results[i].first << delimiter << this->_results[i].second; 53 | } 54 | 55 | return csv_stream.str(); 56 | } 57 | 58 | std::string 59 | perf::CounterResult::to_string() const 60 | { 61 | auto result = std::vector>{}; 62 | result.reserve(this->_results.size()); 63 | 64 | /// Default column lengths, equal to the header. 65 | auto max_name_length = 12UL, max_value_length = 5UL; 66 | 67 | /// Collect counter names and values as strings. 68 | for (const auto& [name, value] : this->_results) { 69 | auto value_string = std::to_string(value); 70 | 71 | max_name_length = std::max(max_name_length, name.size()); 72 | max_value_length = std::max(max_value_length, value_string.size()); 73 | 74 | result.emplace_back(name, std::move(value_string)); 75 | } 76 | 77 | /// Format the counters as a table. 78 | auto table_stream = std::stringstream{}; 79 | table_stream 80 | /// Print the header. 81 | << "| Value" << std::setw(static_cast(max_value_length) - 4) << " " << "| Counter" 82 | << std::setw(static_cast(max_name_length) - 6) << " " 83 | << "|\n" 84 | 85 | /// Print the separator line. 86 | << "|" << std::string(max_value_length + 2U, '-') << "|" << std::string(max_name_length + 2U, '-') << "|"; 87 | 88 | /// Print the results as columns. 
89 | for (const auto& [name, value] : result) { 90 | table_stream << "\n| " << std::setw(static_cast(max_value_length)) << value << " | " << name 91 | << std::setw(static_cast(max_name_length - name.size()) + 1) << " " << "|"; 92 | } 93 | 94 | table_stream << std::flush; 95 | 96 | return table_stream.str(); 97 | } -------------------------------------------------------------------------------- /include/perfcpp/analyzer/flame_graph_generator.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | namespace perf::analyzer { 13 | class FlameGraphGenerator 14 | { 15 | public: 16 | FlameGraphGenerator() = default; 17 | ~FlameGraphGenerator() = default; 18 | 19 | [[nodiscard]] std::vector, std::uint64_t>> map( 20 | const std::vector& samples) const; 21 | 22 | [[nodiscard]] std::vector, std::uint64_t>> map( 23 | const std::vector& samples, 24 | std::function::const_iterator begin, std::vector::const_iterator end)> 25 | mapper) const; 26 | 27 | void map(const std::vector& samples, std::string&& out_file_path) const 28 | { 29 | return map(samples, out_file_path); 30 | } 31 | 32 | void map(const std::vector& samples, const std::string& out_file_path) const; 33 | 34 | void map(const std::vector& samples, 35 | std::function::const_iterator begin, 36 | std::vector::const_iterator end)> mapper, 37 | std::string&& out_file_path) const 38 | { 39 | map(samples, std::move(mapper), out_file_path); 40 | } 41 | 42 | void map(const std::vector& samples, 43 | std::function::const_iterator begin, 44 | std::vector::const_iterator end)> mapper, 45 | const std::string& out_file_path) const; 46 | 47 | private: 48 | class SymbolCache 49 | { 50 | public: 51 | explicit SymbolCache(const SymbolResolver& symbol_resolver) noexcept 52 | : _symbol_resolver(symbol_resolver) 53 | { 54 | } 55 | ~SymbolCache() = default; 56 | 57 | [[nodiscard]] std::optional> symbol( 58 | std::uintptr_t logical_instruction_pointer); 59 | 60 | private: 61 | const SymbolResolver _symbol_resolver; 62 | std::unordered_map>> _cache; 63 | }; 64 | 65 | SymbolResolver _symbol_resolver; 66 | 67 | [[nodiscard]] static bool have_equal_callchains(SymbolCache& symbol_cache, 68 | const Sample& original_sample, 69 | const Sample& follow_up_sample) noexcept; 70 | 71 | [[nodiscard]] static bool have_equal_symbols( 72 | std::optional> first, 73 | std::optional> second) noexcept; 74 | 75 | [[nodiscard]] std::vector resolve_symbols( 76 | const std::optional>& callchain, 77 | std::optional top_logical_instruction_pointer) const; 78 | }; 79 | } -------------------------------------------------------------------------------- /examples/access_benchmark.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | 6 | namespace perf::example { 7 | 8 | /** 9 | * Generator for unique and zipf data sets. 
10 | */ 11 | class DataGenerator 12 | { 13 | public: 14 | [[nodiscard]] static std::vector generate_unique(std::size_t size); 15 | 16 | [[nodiscard]] static std::vector generate_zipf(std::size_t size, 17 | std::size_t alphabet_size, 18 | double zipf_param); 19 | 20 | private: 21 | [[nodiscard]] static std::vector alphabet(std::size_t size); 22 | 23 | [[nodiscard]] static std::vector lookup_table(double zipf_param, const std::vector& alphabet); 24 | }; 25 | 26 | /** 27 | * Benchmark accessing benchmarks in random or sequential order. 28 | * This is an example to demonstrate the perfcpp library. 29 | */ 30 | class AccessBenchmark 31 | { 32 | public: 33 | /** 34 | * Object sized of one cache line. 35 | */ 36 | struct alignas(64U) cache_line 37 | { 38 | cache_line() noexcept = default; 39 | explicit cache_line(const std::uint64_t value_) noexcept 40 | : value(value_) 41 | { 42 | } 43 | ~cache_line() noexcept = default; 44 | 45 | std::uint64_t value; 46 | }; 47 | 48 | AccessBenchmark(bool is_random, std::uint64_t access_data_size_in_mb, bool is_write = false); 49 | ~AccessBenchmark() = default; 50 | 51 | /** 52 | * @return Number of cache lines. 53 | */ 54 | [[nodiscard]] std::size_t size() const noexcept { return _indices.size(); } 55 | 56 | /** 57 | * Grant access to the i-th cache line, considering the defined access order. 58 | * 59 | * @param index Index of the cache line to access. 60 | * @return Cache line. 61 | */ 62 | [[nodiscard]] const cache_line& operator[](const std::size_t index) const noexcept 63 | { 64 | return _data_to_read[_indices[index]]; 65 | } 66 | 67 | void set(const std::size_t index, const std::uint64_t value) { _data_to_write[_indices[index]].value = value; } 68 | 69 | [[nodiscard]] const std::vector& indices() const noexcept { return _indices; } 70 | [[nodiscard]] const std::vector& data_to_read() const noexcept { return _data_to_read; } 71 | 72 | /** 73 | * Makes the compiler think that the result is used – consequently, the optimizer cannot optimize the value away. 74 | * 75 | * @param result Value that should not be optimized away. 76 | */ 77 | template 78 | inline void pretend_to_use(T& result) const noexcept 79 | { 80 | #ifdef __clang__ 81 | asm volatile("" : "+r,m"(result) : : "memory"); 82 | #else 83 | asm volatile("" : "+m,r"(result) : : "memory"); 84 | #endif 85 | } 86 | 87 | private: 88 | /// Indices, defining the order in which the memory chunk is accessed. 89 | std::vector _indices; 90 | 91 | /// Memory chunk that is read during the benchmark. 92 | std::vector _data_to_read; 93 | 94 | /// Memory chunk that is written during the benchmark. 95 | std::vector _data_to_write; 96 | }; 97 | } -------------------------------------------------------------------------------- /examples/sampling/context_switch.cpp: -------------------------------------------------------------------------------- 1 | #include "../access_benchmark.h" 2 | #include "perfcpp/sampler.h" 3 | #include 4 | 5 | int 6 | main() 7 | { 8 | std::cout << "libperf-cpp example: Record perf samples including time, " 9 | "instruction pointer, and cpu id for single-threaded random " 10 | "access to an in-memory array." 11 | << std::endl; 12 | 13 | auto sampler = perf::Sampler{}; 14 | 15 | /// Event that generates an overflow which is samples. 16 | sampler.trigger("cycles", perf::Precision::RequestZeroSkid, perf::Period{ 50000 }); 17 | 18 | /// Include Timestamp, period, instruction pointer, and CPU number into samples. 
19 | sampler.values().timestamp(true).cpu_id(true).context_switch(true); 20 | 21 | /// Create random access benchmark. 22 | auto benchmark = perf::example::AccessBenchmark{ /*randomize the accesses*/ true, 23 | /* create benchmark of 2 GB */ 2048U }; 24 | 25 | /// Start sampling. 26 | try { 27 | sampler.start(); 28 | } catch (std::runtime_error& exception) { 29 | std::cerr << exception.what() << std::endl; 30 | return 1; 31 | } 32 | 33 | /// Execute the benchmark (accessing cache lines in a random order). 34 | auto value = 0ULL; 35 | for (auto index = 0U; index < benchmark.size(); ++index) { 36 | value += benchmark[index].value; 37 | } 38 | 39 | /// We do not want the compiler to optimize away this (otherwise) unused value (and consequently the loop above). 40 | benchmark.pretend_to_use(value); 41 | 42 | /// Stop sampling. 43 | sampler.stop(); 44 | 45 | /// Get all the recorded samples. 46 | auto samples = sampler.result(); 47 | const auto count_samples_before_filter = samples.size(); 48 | 49 | /// Filter out samples without context switch. 50 | samples.erase(std::remove_if(samples.begin(), 51 | samples.end(), 52 | [](const auto& sample) { 53 | return !sample.metadata().cpu_id().has_value() || 54 | !sample.metadata().timestamp().has_value() || 55 | !sample.context_switch().has_value(); 56 | }), 57 | samples.end()); 58 | 59 | /// Print the first samples. 60 | const auto count_show_samples = std::min(samples.size(), 40U); 61 | std::cout << "\nRecorded " << count_samples_before_filter << " samples. " << samples.size() 62 | << " remaining after filter." << std::endl; 63 | std::cout << "Here are the first " << count_show_samples << " recorded samples:\n" << std::endl; 64 | for (auto index = 0U; index < count_show_samples; ++index) { 65 | const auto& sample = samples[index]; 66 | 67 | std::cout << "Time = " << sample.metadata().timestamp().value() 68 | << " | CPU ID = " << sample.metadata().cpu_id().value() 69 | << " | is in = " << sample.context_switch().value().is_in() 70 | << " | is preempt = " << sample.context_switch().value().is_preempt() << "\n"; 71 | } 72 | std::cout << std::flush; 73 | 74 | /// Close the sampler. 75 | /// Note that the sampler can only be closed after reading the samples. 76 | sampler.close(); 77 | 78 | return 0; 79 | } -------------------------------------------------------------------------------- /examples/sampling/memory_access_analyzer.cpp: -------------------------------------------------------------------------------- 1 | #include "../access_benchmark.h" 2 | #include "perfcpp/analyzer/memory_access.h" 3 | #include "perfcpp/hardware_info.h" 4 | #include "perfcpp/sampler.h" 5 | #include 6 | 7 | int 8 | main() 9 | { 10 | std::cout << "libperf-cpp example: Sample memory addresses and analyze data objects." << std::endl; 11 | 12 | /// Initialize sampler. 13 | auto sampler = perf::Sampler{}; 14 | 15 | /// Setup which counters trigger the writing of samples (depends on the underlying hardware substrate). 16 | if (perf::HardwareInfo::is_amd_ibs_supported()) { 17 | sampler.trigger("ibs_op_uops", perf::Precision::MustHaveZeroSkid, perf::Period{ 4000U }); 18 | } else if (perf::HardwareInfo::is_intel()) { 19 | sampler.trigger("mem-loads", perf::Precision::MustHaveZeroSkid, perf::Period{ 2000U }); 20 | } else { 21 | std::cout << "Error: Memory sampling is not supported on this CPU." << std::endl; 22 | return 1; 23 | } 24 | 25 | /// Setup which data will be included into samples (timestamp, virtual memory address, data source like L1d or RAM, 26 | /// and latency). 
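/// On AMD, the branch below additionally requests the raw sample payload, presumably so the memory-access analyzer can decode IBS-specific memory details from it.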
27 | sampler.values().logical_memory_address(true).data_source(true).latency(true); 28 | if (perf::HardwareInfo::is_amd()) { 29 | sampler.values().raw(true); 30 | } 31 | 32 | /// Create random access benchmark. 33 | auto benchmark = perf::example::AccessBenchmark{ /*randomize the accesses*/ true, 34 | /* create benchmark of 2 GB */ 2048 }; 35 | 36 | /// Start sampling. 37 | try { 38 | sampler.start(); 39 | } catch (std::runtime_error& exception) { 40 | std::cerr << exception.what() << std::endl; 41 | return 1; 42 | } 43 | 44 | /// Execute the benchmark (accessing cache lines in a random order). 45 | auto value = 0ULL; 46 | for (auto index = 0U; index < benchmark.size(); ++index) { 47 | value += benchmark[index].value; 48 | } 49 | 50 | /// We do not want the compiler to optimize away this (otherwise) unused value (and consequently the loop above). 51 | benchmark.pretend_to_use(value); 52 | 53 | /// Stop sampling. 54 | sampler.stop(); 55 | 56 | /// Create data types for analyzer. 57 | auto data_analyzer = perf::analyzer::MemoryAccess{}; 58 | 59 | /// 1) Create and add the "index" data type (normal u64 that dictates the pattern through the data array in the random 60 | /// access benchmark). 61 | auto index = perf::analyzer::DataType{ "index", sizeof(std::uint64_t) }; 62 | index.add("index"); 63 | data_analyzer.add(std::move(index)); 64 | 65 | /// 2) Create and add the "data_cache_line" data type (single cache line that is accessed in the random access 66 | /// benchmark). 67 | auto cache_line = perf::analyzer::DataType{ "data_cache_line", sizeof(perf::example::AccessBenchmark::cache_line) }; 68 | cache_line.add("value"); 69 | data_analyzer.add(std::move(cache_line)); 70 | 71 | /// 3) Register instances in memory for both data types. 72 | data_analyzer.annotate("index", benchmark.indices()); 73 | data_analyzer.annotate("data_cache_line", benchmark.data_to_read()); 74 | 75 | /// 4) Get all the recorded samples. 76 | const auto samples = sampler.result(); 77 | 78 | /// 5) Map the samples to data type instances. 79 | const auto result = data_analyzer.map(samples); 80 | 81 | /// 6) Print the results to the console. 82 | std::cout << result.to_string() << std::flush; 83 | 84 | return 0; 85 | } -------------------------------------------------------------------------------- /examples/sampling/instruction_pointer.cpp: -------------------------------------------------------------------------------- 1 | #include "../access_benchmark.h" 2 | #include "perfcpp/sampler.h" 3 | #include "perfcpp/symbol_resolver.h" 4 | #include 5 | 6 | int 7 | main() 8 | { 9 | std::cout << "libperf-cpp example: Record perf samples including time, " 10 | "instruction pointer, and cpu id for single-threaded random " 11 | "access to an in-memory array." 12 | << std::endl; 13 | 14 | auto sampler = perf::Sampler{}; 15 | 16 | /// Event that generates an overflow which is samples. 17 | sampler.trigger("cycles", perf::Precision::RequestZeroSkid, perf::Period{ 50000U }); 18 | 19 | /// Include Timestamp, period, instruction pointer, and CPU number into samples. 20 | sampler.values().timestamp(true).period(true).instruction_pointer(true).cpu_id(true); 21 | 22 | /// Create random access benchmark. 23 | auto benchmark = perf::example::AccessBenchmark{ /*randomize the accesses*/ true, 24 | /* create benchmark of 512 MB */ 512U }; 25 | 26 | /// Start sampling. 
27 | try { 28 | sampler.start(); 29 | } catch (std::runtime_error& exception) { 30 | std::cerr << exception.what() << std::endl; 31 | return 1; 32 | } 33 | 34 | /// Execute the benchmark (accessing cache lines in a random order). 35 | auto value = 0ULL; 36 | for (auto index = 0U; index < benchmark.size(); ++index) { 37 | value += benchmark[index].value; 38 | } 39 | 40 | /// We do not want the compiler to optimize away this (otherwise) unused value (and consequently the loop above). 41 | benchmark.pretend_to_use(value); 42 | 43 | /// Stop sampling. 44 | sampler.stop(); 45 | 46 | /// Get all the recorded samples. 47 | const auto samples = sampler.result(); 48 | 49 | auto symbol_resolver = perf::SymbolResolver{}; 50 | 51 | /// Print the first samples. 52 | const auto count_show_samples = std::min(samples.size(), 400U); 53 | std::cout << "\nRecorded " << samples.size() << " samples." << std::endl; 54 | std::cout << "Here are the first " << count_show_samples << " recorded samples:\n" << std::endl; 55 | for (auto index = 0U; index < count_show_samples; ++index) { 56 | const auto& sample = samples[index]; 57 | 58 | /// Since we recorded the time, period, the instruction pointer, and the CPU 59 | /// id, we can only read these values. 60 | if (sample.metadata().timestamp().has_value() && sample.metadata().period().has_value() && 61 | sample.instruction_execution().logical_instruction_pointer().has_value() && 62 | sample.metadata().cpu_id().has_value()) { 63 | 64 | auto symbol = std::string{ "??" }; 65 | if (auto sym = symbol_resolver.resolve(sample.instruction_execution().logical_instruction_pointer().value()); 66 | sym.has_value()) { 67 | symbol = sym->to_string(); 68 | } 69 | 70 | std::cout << "Time = " << sample.metadata().timestamp().value() 71 | << " | Period = " << sample.metadata().period().value() << " | Instruction Pointer = 0x" << std::hex 72 | << sample.instruction_execution().logical_instruction_pointer().value() << std::dec 73 | << " | Symbol = " << symbol << " | CPU ID = " << sample.metadata().cpu_id().value() << " | " 74 | << (sample.instruction_execution().logical_instruction_pointer() ? "exact" : "not exact") << "\n"; 75 | } 76 | } 77 | std::cout << std::flush; 78 | 79 | /// Close the sampler. 80 | /// Note that the sampler can only be closed after reading the samples. 81 | sampler.close(); 82 | 83 | return 0; 84 | } -------------------------------------------------------------------------------- /examples/statistics/inherit_thread.cpp: -------------------------------------------------------------------------------- 1 | #include "../access_benchmark.h" 2 | #include "perfcpp/event_counter.h" 3 | #include 4 | #include 5 | #include 6 | 7 | int 8 | main() 9 | { 10 | std::cout << "libperf-cpp example: Record performance counter for " 11 | "multi-threaded random access to an in-memory array." 12 | << std::endl; 13 | std::cout << "We will record the counters for all threads spawned by the main-thread." << std::endl; 14 | 15 | /// In this example, we will perform the benchmark multi-threaded and record 16 | /// all child-threads. If `include_child_threads` is not set to true, we would 17 | /// only record the main-thread. 18 | auto config = perf::Config{}; 19 | config.include_child_threads(true); 20 | auto event_counter = perf::EventCounter{ config }; 21 | 22 | /// Add all the performance counters we want to record. 
23 | try { 24 | event_counter.add({ "instructions", 25 | "cycles", 26 | "branches", 27 | "cache-misses", 28 | "dTLB-miss-ratio", 29 | "L1-data-miss-ratio", 30 | "cycles-per-instruction" }); 31 | } catch (std::runtime_error& e) { 32 | std::cerr << e.what() << std::endl; 33 | return 1; 34 | } 35 | 36 | /// Create random access benchmark. 37 | auto benchmark = perf::example::AccessBenchmark{ /*randomize the accesses*/ true, 38 | /* create benchmark of 1024 MB */ 1024U }; 39 | 40 | /// One event_counter instance for every thread. 41 | const auto count_threads = 2U; 42 | const auto items_per_thread = benchmark.size() / count_threads; 43 | auto threads = std::vector{}; 44 | auto thread_local_results = std::vector(2U, 0U); /// Array to store the thread-local results. 45 | 46 | /// Start the performance counters. Note that the counters will also record 47 | /// the thread-creation. 48 | try { 49 | event_counter.start(); 50 | } catch (std::runtime_error& exception) { 51 | std::cerr << exception.what() << std::endl; 52 | return 1; 53 | } 54 | 55 | for (auto thread_index = 0U; thread_index < count_threads; ++thread_index) { 56 | threads.emplace_back([thread_index, items_per_thread, &thread_local_results, &benchmark]() { 57 | auto local_value = 0ULL; 58 | 59 | /// Process the data. 60 | for (auto index = 0U; index < items_per_thread; ++index) { 61 | local_value += benchmark[(thread_index * items_per_thread) + index].value; 62 | } 63 | 64 | thread_local_results[thread_index] = local_value; 65 | }); 66 | } 67 | 68 | /// Wait for all threads to finish. 69 | for (auto& thread : threads) { 70 | thread.join(); 71 | } 72 | 73 | /// Stop recording counters. 74 | event_counter.stop(); 75 | 76 | /// Add up the results so that the compiler does not get the idea of 77 | /// optimizing away the accesses. 78 | auto value = std::accumulate(thread_local_results.begin(), thread_local_results.end(), 0UL); 79 | 80 | /// We do not want the compiler to optimize away this (otherwise) unused value (and consequently the loop above). 81 | benchmark.pretend_to_use(value); 82 | 83 | /// Get the result (normalized per cache line). 84 | const auto result = event_counter.result(benchmark.size()); 85 | 86 | /// Print the performance counters. 87 | std::cout << "\nResults:\n"; 88 | for (const auto& [counter_name, counter_value] : result) { 89 | std::cout << counter_value << " " << counter_name << " / cache line" << std::endl; 90 | } 91 | 92 | return 0; 93 | } -------------------------------------------------------------------------------- /include/perfcpp/metadata.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | 6 | namespace perf { 7 | class Metadata 8 | { 9 | public: 10 | enum class Mode 11 | { 12 | Kernel, 13 | User, 14 | Hypervisor, 15 | GuestKernel, 16 | GuestUser, 17 | Unknown, /// DEPRECATED: Will be removed in v0.12 18 | }; 19 | 20 | /** 21 | * Set the mode of the sample. 22 | * @param mode Mode. 23 | */ 24 | void mode(const std::optional mode) noexcept { _mode = mode; } 25 | 26 | /** 27 | * Set the sample ID. 28 | * @param sample_id Sample ID. 29 | */ 30 | void sample_id(const std::uint64_t sample_id) noexcept { _sample_id = sample_id; } 31 | 32 | /** 33 | * Set the stream ID. 34 | * @param stream_id Stream ID. 35 | */ 36 | void stream_id(const std::uint64_t stream_id) noexcept { _stream_id = stream_id; } 37 | 38 | /** 39 | * Set the timestamp. 40 | * @param timestamp Timestamp. 
41 | */ 42 | void timestamp(const std::uint64_t timestamp) noexcept { _timestamp = timestamp; } 43 | 44 | /** 45 | * Set the period. 46 | * @param period Period. 47 | */ 48 | void period(const std::uint64_t period) noexcept { _period = period; } 49 | 50 | /** 51 | * Set the CPU ID. 52 | * @param cpu_id CPU ID. 53 | */ 54 | void cpu_id(const std::uint32_t cpu_id) noexcept { _cpu_id = cpu_id; } 55 | 56 | /** 57 | * Set the process ID. 58 | * @param process_id Process ID. 59 | */ 60 | void process_id(const std::uint32_t process_id) noexcept { _process_id = process_id; } 61 | 62 | /** 63 | * Set the thread ID. 64 | * @param thread_id Thread ID. 65 | */ 66 | void thread_id(const std::uint32_t thread_id) noexcept { _thread_id = thread_id; } 67 | 68 | /** 69 | * @return Mode, if included in the sample. std::nullopt otherwise. 70 | */ 71 | [[nodiscard]] std::optional mode() const noexcept { return _mode; } 72 | 73 | /** 74 | * @return Sample ID, if included in the sample. std::nullopt otherwise. 75 | */ 76 | [[nodiscard]] std::optional sample_id() const noexcept { return _sample_id; } 77 | 78 | /** 79 | * @return Stream ID, if included in the sample. std::nullopt otherwise. 80 | */ 81 | [[nodiscard]] std::optional stream_id() const noexcept { return _stream_id; } 82 | 83 | /** 84 | * @return Timestamp, if included in the sample. std::nullopt otherwise. 85 | */ 86 | [[nodiscard]] std::optional timestamp() const noexcept { return _timestamp; } 87 | 88 | /** 89 | * @return Period, if included in the sample. std::nullopt otherwise. 90 | */ 91 | [[nodiscard]] std::optional period() const noexcept { return _period; } 92 | 93 | /** 94 | * @return CPU ID, if included in the sample. std::nullopt otherwise. 95 | */ 96 | [[nodiscard]] std::optional cpu_id() const noexcept { return _cpu_id; } 97 | 98 | /** 99 | * @return Process ID, if included in the sample. std::nullopt otherwise. 100 | */ 101 | [[nodiscard]] std::optional process_id() const noexcept { return _process_id; } 102 | 103 | /** 104 | * @return Thread ID, if included in the sample. std::nullopt otherwise. 105 | */ 106 | [[nodiscard]] std::optional thread_id() const noexcept { return _thread_id; } 107 | 108 | private: 109 | std::optional _mode; 110 | std::optional _sample_id; 111 | std::optional _stream_id; 112 | std::optional _timestamp; 113 | std::optional _period; 114 | std::optional _cpu_id; 115 | std::optional _process_id; 116 | std::optional _thread_id; 117 | }; 118 | } -------------------------------------------------------------------------------- /examples/sampling/register.cpp: -------------------------------------------------------------------------------- 1 | #include "../access_benchmark.h" 2 | #include "perfcpp/sampler.h" 3 | #include 4 | 5 | int 6 | main() 7 | { 8 | std::cout << "libperf-cpp example: Record perf samples including time, " 9 | "user_registers, and cpu id for single-threaded random " 10 | "access to an in-memory array." 11 | << std::endl; 12 | 13 | auto sampler = perf::Sampler{}; 14 | sampler.trigger("cycles", perf::Period{ 100000 }); 15 | sampler.values() 16 | .timestamp(true) 17 | .user_registers( 18 | perf::Registers{ { perf::Registers::x86::IP, perf::Registers::x86::DI, perf::Registers::x86::R10 } }) 19 | .kernel_registers( 20 | perf::Registers{ { perf::Registers::x86::IP, perf::Registers::x86::DI, perf::Registers::x86::R10 } }) 21 | .cpu_id(true); 22 | 23 | /// Create random access benchmark. 
24 | auto benchmark = perf::example::AccessBenchmark{ /*randomize the accesses*/ true, 25 | /* create benchmark of 512 MB */ 512U }; 26 | 27 | /// Start sampling. 28 | try { 29 | sampler.start(); 30 | } catch (std::runtime_error& exception) { 31 | std::cerr << exception.what() << std::endl; 32 | return 1; 33 | } 34 | 35 | /// Execute the benchmark (accessing cache lines in a random order). 36 | auto value = 0ULL; 37 | for (auto index = 0U; index < benchmark.size(); ++index) { 38 | value += benchmark[index].value; 39 | } 40 | 41 | /// We do not want the compiler to optimize away this (otherwise) unused value (and consequently the loop above). 42 | benchmark.pretend_to_use(value); 43 | 44 | /// Stop sampling. 45 | sampler.stop(); 46 | 47 | /// Get all the recorded samples. 48 | const auto samples = sampler.result(); 49 | 50 | /// Print the first samples. 51 | const auto count_show_samples = std::min(samples.size(), 40U); 52 | std::cout << "\nRecorded " << samples.size() << " samples." << std::endl; 53 | std::cout << "Here are the first " << count_show_samples << " recorded samples:\n" << std::endl; 54 | for (auto index = 0U; index < count_show_samples; ++index) { 55 | const auto& sample = samples[index]; 56 | 57 | /// Since we recorded the time, period, the instruction pointer, and the CPU 58 | /// id, we can only read these values. 59 | if (sample.metadata().timestamp().has_value() && 60 | (sample.user_registers().has_value() || sample.kernel_registers().has_value()) && 61 | sample.metadata().cpu_id().has_value()) { 62 | 63 | std::cout << "Time = " << sample.metadata().timestamp().value() 64 | << " | CPU ID = " << sample.metadata().cpu_id().value(); 65 | 66 | if (sample.user_registers().has_value()) { 67 | const auto& user_registers = sample.user_registers().value(); 68 | std::cout << " | User Registers = IP(" << user_registers.get(perf::Registers::x86::IP).value_or(0) << "), DI(" 69 | << user_registers.get(perf::Registers::x86::DI).value_or(0) << "), R10(" 70 | << user_registers.get(perf::Registers::x86::R10).value_or(0) << ")"; 71 | } 72 | 73 | if (sample.kernel_registers().has_value()) { 74 | const auto& kernel_registers = sample.kernel_registers().value(); 75 | std::cout << " | Kernel Registers = IP(" << kernel_registers.get(perf::Registers::x86::IP).value_or(0) 76 | << "), DI(" << kernel_registers.get(perf::Registers::x86::DI).value_or(0) << "), R10(" 77 | << kernel_registers.get(perf::Registers::x86::R10).value_or(0) << ")"; 78 | } 79 | 80 | std::cout << "\n"; 81 | } 82 | } 83 | std::cout << std::flush; 84 | 85 | /// Close the sampler. 86 | /// Note that the sampler can only be closed after reading the samples. 87 | sampler.close(); 88 | 89 | return 0; 90 | } -------------------------------------------------------------------------------- /include/perfcpp/branch.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | namespace perf { 9 | /** 10 | * Branch types that can be filtered when recording branches via sampling. 
11 | */ 12 | enum BranchType : std::uint64_t 13 | { 14 | None = 0ULL, 15 | 16 | User = PERF_SAMPLE_BRANCH_USER, 17 | Kernel = PERF_SAMPLE_BRANCH_KERNEL, 18 | HyperVisor = PERF_SAMPLE_BRANCH_HV, 19 | 20 | Any = PERF_SAMPLE_BRANCH_ANY, 21 | #ifndef PERFCPP_NO_SAMPLE_BRANCH_CALL 22 | Call = PERF_SAMPLE_BRANCH_ANY_CALL, 23 | #else 24 | Call = 1ULL << 61, 25 | #endif 26 | #ifndef PERFCPP_NO_SAMPLE_BRANCH_CALL 27 | DirectCall = PERF_SAMPLE_BRANCH_CALL, 28 | #else 29 | DirectCall = 1ULL << 62, 30 | #endif 31 | IndirectCall = PERF_SAMPLE_BRANCH_IND_CALL, 32 | Return = PERF_SAMPLE_BRANCH_ANY_RETURN, 33 | #ifndef PERFCPP_NO_SAMPLE_BRANCH_IND_JUMP 34 | IndirectJump = PERF_SAMPLE_BRANCH_IND_JUMP, 35 | #else 36 | IndirectJump = 1ULL << 63, 37 | #endif 38 | Conditional = PERF_SAMPLE_BRANCH_COND, 39 | TransactionalMemoryAbort = PERF_SAMPLE_BRANCH_ABORT_TX, 40 | InTransaction = PERF_SAMPLE_BRANCH_IN_TX, 41 | NotInTransaction = PERF_SAMPLE_BRANCH_NO_TX 42 | }; 43 | 44 | /** 45 | * A Branch represents one branch from the branch stack, including information where the branch started (and in case of 46 | * jmp/call where the branch ended), if the branch was predicted correctly, and how long 47 | */ 48 | class Branch 49 | { 50 | public: 51 | Branch(const std::uintptr_t instruction_pointer_from, 52 | const std::uintptr_t instruction_pointer_to, 53 | const bool is_mispredicted, 54 | const bool is_predicted, 55 | const bool is_in_transaction, 56 | const bool is_transaction_abort, 57 | const std::optional cycles) 58 | : _instruction_pointer_from(instruction_pointer_from) 59 | , _instruction_pointer_to(instruction_pointer_to) 60 | , _is_mispredicted(is_mispredicted) 61 | , _is_predicted(is_predicted) 62 | , _is_in_transaction(is_in_transaction) 63 | , _is_transaction_abort(is_transaction_abort) 64 | , _cycles(cycles) 65 | { 66 | } 67 | 68 | /** 69 | * @return The instruction pointer the branch started. 70 | */ 71 | [[nodiscard]] std::uintptr_t instruction_pointer_from() const noexcept { return _instruction_pointer_from; } 72 | 73 | /** 74 | * @return The instruction pointer the branch ended. 75 | */ 76 | [[nodiscard]] std::uintptr_t instruction_pointer_to() const noexcept { return _instruction_pointer_to; } 77 | 78 | /** 79 | * @return True, if the branch was not predicted properly. 80 | */ 81 | [[nodiscard]] bool is_mispredicted() const noexcept { return _is_mispredicted; } 82 | 83 | /** 84 | * @return True, if the branch was predicted correctly. 85 | */ 86 | [[nodiscard]] bool is_predicted() const noexcept { return _is_predicted; } 87 | 88 | /** 89 | * @return True, if the branch was within a memory transaction. 90 | */ 91 | [[nodiscard]] bool is_in_transaction() const noexcept { return _is_in_transaction; } 92 | 93 | /** 94 | * @return True, if the branch was a transaction abort. 95 | */ 96 | [[nodiscard]] bool is_transaction_abort() const noexcept { return _is_transaction_abort; } 97 | 98 | /** 99 | * @return The number of cycles of the branch (zero if not supported on the underlying hardware). 
100 | */ 101 | [[nodiscard]] std::optional cycles() const noexcept { return _cycles; } 102 | 103 | private: 104 | std::uintptr_t _instruction_pointer_from; 105 | std::uintptr_t _instruction_pointer_to; 106 | bool _is_mispredicted; 107 | bool _is_predicted; 108 | bool _is_in_transaction; 109 | bool _is_transaction_abort; 110 | std::optional _cycles; 111 | }; 112 | } -------------------------------------------------------------------------------- /examples/statistics/multi_thread.cpp: -------------------------------------------------------------------------------- 1 | #include "../access_benchmark.h" 2 | #include "perfcpp/event_counter.h" 3 | #include 4 | #include 5 | #include 6 | 7 | int 8 | main() 9 | { 10 | std::cout << "libperf-cpp example: Record performance counter for " 11 | "multi-threaded random access to an in-memory array." 12 | << std::endl; 13 | std::cout << "We will record the counters per thread and merge the results " 14 | "afterwards." 15 | << std::endl; 16 | 17 | constexpr auto count_threads = 2U; 18 | 19 | /// Initialize performance counters.# 20 | auto multithread_event_counter = perf::MultiThreadEventCounter{ count_threads }; 21 | 22 | /// Add all the performance counters we want to record. 23 | try { 24 | multithread_event_counter.add({ "instructions", 25 | "cycles", 26 | "branches", 27 | "cache-misses", 28 | "dTLB-miss-ratio", 29 | "L1-data-miss-ratio", 30 | "cycles-per-instruction", 31 | "nanoseconds" }); 32 | } catch (std::runtime_error& e) { 33 | std::cerr << e.what() << std::endl; 34 | return 1; 35 | } 36 | 37 | /// Create random access benchmark. 38 | auto benchmark = perf::example::AccessBenchmark{ /*randomize the accesses*/ true, 39 | /* create benchmark of 1024 MB */ 1024U }; 40 | 41 | /// One event_counter instance for every thread. 42 | const auto items_per_thread = benchmark.size() / count_threads; 43 | auto threads = std::vector{}; 44 | auto thread_local_results = 45 | std::vector(count_threads, 0U); /// Array to store the thread-local results. 46 | 47 | for (auto thread_index = std::uint16_t(0U); thread_index < count_threads; ++thread_index) { 48 | threads.emplace_back( 49 | [thread_index, items_per_thread, &thread_local_results, &benchmark, &multithread_event_counter]() { 50 | auto local_value = 0ULL; 51 | 52 | /// Start recording counters. 53 | /// In contrast to the inherit-thread example (see inherit_thread.cpp), we 54 | /// will record the performance counters on each thread. 55 | try { 56 | multithread_event_counter.start(thread_index); 57 | } catch (std::runtime_error& exception) { 58 | std::cerr << exception.what() << std::endl; 59 | return; 60 | } 61 | 62 | /// Process the data. 63 | for (auto index = 0U; index < items_per_thread; ++index) { 64 | local_value += benchmark[(thread_index * items_per_thread) + index].value; 65 | } 66 | 67 | /// Stop recording counters on this thread. 68 | multithread_event_counter.stop(thread_index); 69 | 70 | thread_local_results[thread_index] = local_value; 71 | }); 72 | } 73 | 74 | /// Wait for all threads to finish. 75 | for (auto& thread : threads) { 76 | thread.join(); 77 | } 78 | 79 | /// Add up the results so that the compiler does not get the idea of 80 | /// optimizing away the accesses. 81 | auto value = std::accumulate(thread_local_results.begin(), thread_local_results.end(), 0UL); 82 | 83 | /// We do not want the compiler to optimize away this (otherwise) unused value. 
84 | benchmark.pretend_to_use(value); 85 | 86 | /// Get the result (normalized per cache line) from the 87 | /// multithread_event_counter. 88 | auto result = multithread_event_counter.result(benchmark.size()); 89 | 90 | /// Print the performance counters. 91 | std::cout << "\nResults:\n"; 92 | for (const auto& [counter_name, counter_value] : result) { 93 | std::cout << counter_value << " " << counter_name << " / cache line" << std::endl; 94 | } 95 | 96 | return 0; 97 | } 98 | -------------------------------------------------------------------------------- /test/counter_definition.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | TEST_CASE("adding new events and metrics", "[CounterDefinition]") 5 | { 6 | auto definition = perf::CounterDefinition{}; 7 | 8 | auto test_counter = std::string{ "some-test-counter-name" }; 9 | 10 | SECTION("events do not exist") 11 | { 12 | REQUIRE(definition.counter(test_counter).empty()); 13 | REQUIRE(definition.is_metric(test_counter) == false); 14 | REQUIRE(definition.metric(test_counter).has_value() == false); 15 | } 16 | 17 | SECTION("add hardware counter") 18 | { 19 | definition.add(std::string{ test_counter }, 100U, 0x1234); 20 | REQUIRE(definition.counter(test_counter).size() == 1U); 21 | REQUIRE(std::get<1>(definition.counter(test_counter).front()) == test_counter); 22 | REQUIRE(std::get<2>(definition.counter(test_counter).front()).configs()[0U] == 0x1234); 23 | REQUIRE(std::get<2>(definition.counter(test_counter).front()).type() == 100U); 24 | REQUIRE(definition.is_metric(test_counter) == false); 25 | REQUIRE(definition.metric(test_counter).has_value() == false); 26 | } 27 | 28 | SECTION("add metric") 29 | { 30 | auto test_metric = std::string{ "some-test-metric-name" }; 31 | definition.add(std::string{ test_metric }, "cycles/instructions"); 32 | REQUIRE(definition.counter(test_metric).empty()); 33 | REQUIRE(definition.is_metric(test_metric)); 34 | REQUIRE(definition.metric(test_metric).has_value()); 35 | REQUIRE(std::get<0>(definition.metric(test_metric).value()) == test_metric); 36 | } 37 | 38 | SECTION("read csv counter-only") 39 | { 40 | const auto event0 = std::string{ "EVENT.TEST0" }; 41 | const auto event1 = std::string{ "event-test-1" }; 42 | 43 | REQUIRE(definition.counter(event0).empty()); 44 | REQUIRE(definition.counter(event1).empty()); 45 | 46 | const auto definition_with_file = perf::CounterDefinition{ "test/events.csv" }; 47 | 48 | REQUIRE_FALSE(definition_with_file.counter(event0).empty()); 49 | REQUIRE_FALSE(definition_with_file.counter(event1).empty()); 50 | 51 | REQUIRE(std::get<2>(definition_with_file.counter(event0).front()).configs()[0U] == 0x1f3010e); 52 | REQUIRE(std::get<2>(definition_with_file.counter(event0).front()).configs()[1U] == 0U); 53 | REQUIRE(std::get<2>(definition_with_file.counter(event0).front()).configs()[2U] == 0U); 54 | 55 | REQUIRE(std::get<2>(definition_with_file.counter(event1).front()).configs()[0U] == 0x1CD); 56 | REQUIRE(std::get<2>(definition_with_file.counter(event1).front()).configs()[1U] == 3U); 57 | REQUIRE(std::get<2>(definition_with_file.counter(event1).front()).configs()[2U] == 0U); 58 | } 59 | 60 | SECTION("read csv with metric") 61 | { 62 | const auto event0 = std::string{ "EVENT.TEST0" }; 63 | const auto event1 = std::string{ "event-test-1" }; 64 | const auto test_metric = std::string{ "test-metric" }; 65 | 66 | REQUIRE(definition.counter(event0).empty()); 67 | REQUIRE(definition.counter(event1).empty()); 68 | 
REQUIRE(definition.counter(test_metric).empty()); 69 | 70 | REQUIRE_FALSE(definition.is_metric(test_metric)); 71 | 72 | const auto definition_with_file = perf::CounterDefinition{ "test/events-and-metrics.csv" }; 73 | 74 | REQUIRE_FALSE(definition_with_file.counter(event0).empty()); 75 | REQUIRE_FALSE(definition_with_file.counter(event1).empty()); 76 | REQUIRE(definition.counter(test_metric).empty()); 77 | 78 | REQUIRE(definition_with_file.is_metric(test_metric)); 79 | 80 | auto metric = definition_with_file.metric(test_metric); 81 | REQUIRE(metric.has_value()); 82 | 83 | auto counter_result = perf::CounterResult{ std::vector>{ 84 | std::make_pair("EVENT.TEST0", 100U), std::make_pair("event-test-1", 500U) } }; 85 | const auto metric_result = metric->second.calculate(counter_result); 86 | REQUIRE(metric_result.has_value()); 87 | REQUIRE(metric_result.value() == 1500U); 88 | } 89 | } -------------------------------------------------------------------------------- /examples/sampling/counter.cpp: -------------------------------------------------------------------------------- 1 | #include "../access_benchmark.h" 2 | #include "perfcpp/sampler.h" 3 | #include 4 | 5 | int 6 | main() 7 | { 8 | std::cout << "libperf-cpp example: Record perf samples including performance " 9 | "counters for single-threaded random access to an in-memory array." 10 | << std::endl; 11 | 12 | /// Initialize counter definitions. 13 | /// Note that the perf::CounterDefinition holds all counter names and must be 14 | /// alive until the benchmark finishes. 15 | auto counter_definition = perf::CounterDefinition{}; 16 | 17 | /// Add metric that calculates the L1d miss ratio. 18 | counter_definition.add("L1d-misses-per-load", "'L1-dcache-load-misses'/'L1-dcache-loads'"); 19 | 20 | auto sampler = perf::Sampler{ counter_definition }; 21 | 22 | /// Setup the event that will trigger writing samples. 23 | sampler.trigger("cycles", perf::Precision::AllowArbitrarySkid, perf::Period{ 50000 }); 24 | 25 | /// Setup which data should be included (L1 hit and miss counter, timestamp). 26 | sampler.values().counter({ "L1-dcache-loads", "L1-dcache-load-misses", "L1d-misses-per-load" }).timestamp(true); 27 | 28 | /// Create random access benchmark. 29 | auto benchmark = perf::example::AccessBenchmark{ /*randomize the accesses*/ true, 30 | /* create benchmark of 512 MB */ 512U }; 31 | 32 | /// Start sampling. 33 | try { 34 | sampler.start(); 35 | } catch (std::runtime_error& exception) { 36 | std::cerr << exception.what() << std::endl; 37 | return 1; 38 | } 39 | 40 | /// Execute the benchmark (accessing cache lines in a random order). 41 | auto value = 0ULL; 42 | for (auto index = 0U; index < benchmark.size(); ++index) { 43 | value += benchmark[index].value; 44 | } 45 | 46 | /// We do not want the compiler to optimize away this (otherwise) unused value (and consequently the loop above). 47 | benchmark.pretend_to_use(value); 48 | 49 | /// Stop sampling. 50 | sampler.stop(); 51 | 52 | /// Get all the recorded samples. 53 | const auto samples = sampler.result(); 54 | 55 | /// Print the first samples. 56 | const auto count_show_samples = std::min(samples.size(), 40U); 57 | std::cout << "\nRecorded " << samples.size() << " samples." << std::endl; 58 | std::cout << "Here are the first " << count_show_samples << " recorded samples:\n" << std::endl; 59 | 60 | std::optional last_counter_result = std::nullopt; /// Remember the last counter result to show 61 | /// only the difference. 
62 | 63 | for (auto index = 0U; index < count_show_samples; ++index) { 64 | const auto& sample = samples[index]; 65 | 66 | /// Since we recorded the time, period, the instruction pointer, and the CPU 67 | /// id, we can only read these values. 68 | if (sample.metadata().timestamp().has_value() && sample.counter().has_value()) { 69 | if (last_counter_result.has_value()) { 70 | std::cout << "Time = " << sample.metadata().timestamp().value() << " | cycles (diff) = " 71 | << sample.counter()->get("cycles").value_or(.0) - last_counter_result->get("cycles").value_or(.0) 72 | << " | L1-dcache-loads (diff) = " 73 | << sample.counter()->get("L1-dcache-loads").value_or(.0) - 74 | last_counter_result->get("L1-dcache-loads").value_or(.0) 75 | << " | L1-dcache-load-misses (diff) = " 76 | << sample.counter()->get("L1-dcache-load-misses").value_or(.0) - 77 | last_counter_result->get("L1-dcache-load-misses").value_or(.0) 78 | << " | L1d-misses-per-load = " << sample.counter()->get("L1d-misses-per-load").value_or(.0) << "\n"; 79 | } 80 | 81 | last_counter_result = sample.counter(); 82 | } 83 | } 84 | std::cout << std::flush; 85 | 86 | /// Close the sampler. 87 | /// Note that the sampler can only be closed after reading the samples. 88 | sampler.close(); 89 | 90 | return 0; 91 | } -------------------------------------------------------------------------------- /include/perfcpp/metric/expression/parser.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "expression.h" 4 | #include "tokenizer.h" 5 | #include 6 | #include 7 | #include 8 | 9 | namespace perf::metric::expression { 10 | /** 11 | * The parser translates an expression (string) to an executable metric expression. 12 | */ 13 | class Parser 14 | { 15 | class TokenVisitor; 16 | friend TokenVisitor; 17 | 18 | public: 19 | explicit Parser(std::string&& input) 20 | : _tokenizer(std::move(input)) 21 | , _current_token(_tokenizer.next()) 22 | { 23 | } 24 | 25 | ~Parser() = default; 26 | 27 | /** 28 | * Builds an evaluable expression from the given expression-string. 29 | * 30 | * @return Evaluable expression. 31 | */ 32 | [[nodiscard]] std::unique_ptr parse(); 33 | 34 | private: 35 | Tokenizer _tokenizer; 36 | 37 | std::optional _current_token; 38 | 39 | /** 40 | * Consume the current token by moving to the next one. 41 | */ 42 | void consume() { this->_current_token = this->_tokenizer.next(); } 43 | 44 | /** 45 | * Verifies that the current token is equal to the expected punctuation. Throws an exception otherwise. 46 | * If the current token is equal to the expected punctuation, the current token is consumed. 47 | * 48 | * @param expected_punctuation Expected punctuation. 49 | */ 50 | void consume(const Token::Punctuation expected_punctuation) 51 | { 52 | if (!(this->_current_token.has_value() && this->_current_token.value() == expected_punctuation)) { 53 | throw CannotParseMetricExpressionError{ this->_tokenizer.input() }; 54 | } 55 | 56 | consume(); 57 | } 58 | 59 | /** 60 | * Parses the input and returns a parsed expression. 61 | * 62 | * @return The parsed expression. 63 | */ 64 | [[nodiscard]] std::unique_ptr parse_expression() { return this->parse_additive_expression(); } 65 | 66 | /** 67 | * Parses an additive expression (with "+" and "-" operations). 68 | * 69 | * @return The parsed expression. 70 | */ 71 | [[nodiscard]] std::unique_ptr parse_additive_expression(); 72 | 73 | /** 74 | * Parses a multiplicative expression (with "*" and "/" operations). 
75 | * 76 | * @return The parsed expression. 77 | */ 78 | [[nodiscard]] std::unique_ptr parse_multiplicative_expression(); 79 | 80 | /** 81 | * Parse numbers, identifiers, functions, and parenthesized expressions. 82 | * 83 | * @return The parsed expression. 84 | */ 85 | [[nodiscard]] std::unique_ptr parse_primary(); 86 | 87 | /** 88 | * Creates a function with the given name and arguments. 89 | * 90 | * @param function_name Name of the function. 91 | * @param arguments Arguments of the function. 92 | * @return Function expression. 93 | */ 94 | [[nodiscard]] std::unique_ptr build_function( 95 | std::string&& function_name, 96 | std::vector>&& arguments) const; 97 | 98 | /** 99 | * Visits a token and translates it into an expression, called by the parse_primary() function. 100 | * This may include recursive calls for parenthesized expressions. 101 | */ 102 | class TokenVisitor 103 | { 104 | public: 105 | explicit TokenVisitor(Parser& parser) noexcept 106 | : _parser(parser) 107 | { 108 | } 109 | ~TokenVisitor() noexcept = default; 110 | 111 | [[nodiscard]] std::unique_ptr operator()(std::string& identifier); 112 | [[nodiscard]] std::unique_ptr operator()(double constant); 113 | [[nodiscard]] std::unique_ptr operator()(Operator_ metric_operator); 114 | [[nodiscard]] std::unique_ptr operator()(Token::Punctuation punctutation); 115 | 116 | private: 117 | /// The calling parser to consume tokens and continue parsing. 118 | Parser& _parser; 119 | }; 120 | }; 121 | } -------------------------------------------------------------------------------- /examples/statistics/metric.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #include "../access_benchmark.h" 6 | 7 | /** 8 | * Example of a metric implementation that calculates the number of branch misses per executed branch instruction. 9 | */ 10 | class BranchMissesPerBranchInstruction final : public perf::Metric 11 | { 12 | public: 13 | [[nodiscard]] std::string name() const override { return "branch-misses-per-branch-instruction"; } 14 | 15 | [[nodiscard]] std::vector required_counter_names() const override 16 | { 17 | return { "branch-misses", "branch-instructions" }; 18 | } 19 | 20 | [[nodiscard]] std::optional calculate(const perf::CounterResult& result) const override 21 | { 22 | const auto branch_misses = result.get("branch-misses"); 23 | const auto branch_instructions = result.get("branch-instructions"); 24 | 25 | if (branch_misses.has_value() && branch_instructions.has_value()) { 26 | if (branch_instructions.value() > 0U) { 27 | return branch_misses.value() / branch_instructions.value(); 28 | } 29 | } 30 | 31 | return std::nullopt; 32 | } 33 | 34 | private: 35 | }; 36 | 37 | int 38 | main() 39 | { 40 | std::cout << "libperf-cpp example: Implementing new metrics." 
<< std::endl; 41 | 42 | auto counter_definition = perf::CounterDefinition{}; 43 | 44 | /// Define a metric that returns the number of cache misses per cache reference: 45 | counter_definition.add("cache-misses-per-reference", "d_ratio(`cache-misses`, `cache-references`)"); 46 | 47 | /// Define a metric that sums up all L1 loads: 48 | counter_definition.add("l1-loads", "`L1-dcache-loads` + `L1-icache-loads`"); 49 | 50 | /// Define a metric that sums up all L1 load misses: 51 | counter_definition.add("l1-load-misses", "sum(`L1-dcache-load-misses`, `L1-icache-load-misses`)"); 52 | 53 | /// Define a metric that calculates the ratio between L1 load misses and L1 loads: 54 | counter_definition.add("l1-misses-per-load", "`l1-load-misses` / `l1-loads`"); 55 | 56 | /// Initialize the above defined metric that returns the number of branch misses per branch instruction. 57 | counter_definition.add(std::make_unique()); 58 | 59 | /// Initialize performance counters. 60 | auto event_counter = perf::EventCounter{ counter_definition }; 61 | 62 | /// Add the new defined metrics. 63 | try { 64 | event_counter.add(std::vector{ "cache-misses-per-reference", 65 | "branch-misses-per-branch-instruction", 66 | "l1-loads", 67 | "l1-load-misses", 68 | "l1-misses-per-load" }); 69 | } catch (std::runtime_error& e) { 70 | std::cerr << e.what() << std::endl; 71 | return 1; 72 | } 73 | 74 | /// Create random access benchmark. 75 | auto benchmark = perf::example::AccessBenchmark{ /*randomize the accesses*/ true, 76 | /* create benchmark of 512 MB */ 512 }; 77 | 78 | /// Start recording. 79 | try { 80 | event_counter.start(); 81 | } catch (std::runtime_error& exception) { 82 | std::cerr << exception.what() << std::endl; 83 | return 1; 84 | } 85 | 86 | /// Execute the benchmark (accessing cache lines in a random order). 87 | auto value = 0ULL; 88 | for (auto index = 0U; index < benchmark.size(); ++index) { 89 | value += benchmark[index].value; 90 | } 91 | 92 | /// We do not want the compiler to optimize away this (otherwise) unused value (and consequently the loop above). 93 | benchmark.pretend_to_use(value); 94 | 95 | /// Stop recording counters. 96 | event_counter.stop(); 97 | 98 | /// Get the result. 99 | const auto result = event_counter.result(); 100 | 101 | /// Print the metrics as table. 102 | std::cout << "\nResults as table:\n" << result.to_string() << std::endl; 103 | 104 | return 0; 105 | } -------------------------------------------------------------------------------- /examples/sampling/multi_thread.cpp: -------------------------------------------------------------------------------- 1 | #include "../access_benchmark.h" 2 | #include "perfcpp/sampler.h" 3 | #include 4 | #include 5 | #include 6 | 7 | int 8 | main() 9 | { 10 | std::cout << "libperf-cpp example: Record perf samples including time, " 11 | "instruction pointer, and cpu id for single-threaded random " 12 | "access to an in-memory array on multiple threads." 13 | << std::endl; 14 | 15 | constexpr auto count_threads = 4U; 16 | 17 | auto sampler = perf::MultiThreadSampler{ count_threads }; 18 | 19 | /// Setup event that triggers writing samples. 20 | sampler.trigger("cycles", perf::Period{ 50000 }); 21 | 22 | /// Setup what data the samples should include (timestamp, instruction pointer, CPU id, thread id). 23 | sampler.values().timestamp(true).instruction_pointer(true).cpu_id(true).thread_id(true); 24 | 25 | /// Create random access benchmark. 
26 | auto benchmark = perf::example::AccessBenchmark{ /*randomize the accesses*/ true, 27 | /* create benchmark of 512 MB */ 1024U }; 28 | 29 | /// Allocate space for threads and their results. 30 | const auto items_per_thread = benchmark.size() / count_threads; 31 | auto threads = std::vector{}; 32 | auto thread_local_results = 33 | std::vector(count_threads, 0U); /// Array to store the thread-local results. 34 | 35 | for (auto thread_index = std::uint16_t(0U); thread_index < count_threads; ++thread_index) { 36 | threads.emplace_back([thread_index, items_per_thread, &thread_local_results, &benchmark, &sampler]() { 37 | auto local_value = 0ULL; 38 | 39 | /// Start sampling per thread. 40 | try { 41 | sampler.start(thread_index); 42 | } catch (std::runtime_error& exception) { 43 | std::cerr << exception.what() << std::endl; 44 | return; 45 | } 46 | 47 | /// Process the data. 48 | for (auto index = 0U; index < items_per_thread; ++index) { 49 | local_value += benchmark[(thread_index * items_per_thread) + index].value; 50 | } 51 | 52 | /// Stop sampling on this thread. 53 | sampler.stop(thread_index); 54 | 55 | thread_local_results[thread_index] = local_value; 56 | }); 57 | } 58 | 59 | /// Wait for all threads to finish. 60 | for (auto& thread : threads) { 61 | thread.join(); 62 | } 63 | 64 | /// Add up the results so that the compiler does not get the idea of 65 | /// optimizing away the accesses. 66 | auto value = std::accumulate(thread_local_results.begin(), thread_local_results.end(), 0UL); 67 | 68 | /// We do not want the compiler to optimize away this (otherwise) unused value. 69 | benchmark.pretend_to_use(value); 70 | 71 | /// Get all the recorded samples – ordered by timestamp. 72 | auto samples = sampler.result(true); 73 | 74 | /// Print the first samples. 75 | const auto count_show_samples = std::min(samples.size(), 40U); 76 | std::cout << "\nRecorded " << samples.size() << " samples." << std::endl; 77 | std::cout << "Here are the first " << count_show_samples << " recorded samples:\n" << std::endl; 78 | for (auto index = 0U; index < count_show_samples; ++index) { 79 | const auto& sample = samples[index]; 80 | 81 | /// Since we recorded the time, period, the instruction pointer, and the CPU 82 | /// id, we can only read these values. 83 | if (sample.metadata().timestamp().has_value() && sample.metadata().thread_id().has_value() && 84 | sample.instruction_execution().logical_instruction_pointer().has_value() && 85 | sample.metadata().cpu_id().has_value()) { 86 | std::cout << "Time = " << sample.metadata().timestamp().value() 87 | << " | CPU ID = " << sample.metadata().cpu_id().value() 88 | << " | Thread ID = " << sample.metadata().thread_id().value() << " | Instruction Pointer = 0x" 89 | << std::hex << sample.instruction_execution().logical_instruction_pointer().value() << std::dec << "\n"; 90 | } 91 | } 92 | std::cout << std::flush; 93 | 94 | /// Close the sampler. 95 | /// Note that the sampler can only be closed after reading the samples. 96 | sampler.close(); 97 | 98 | return 0; 99 | } -------------------------------------------------------------------------------- /include/perfcpp/metric/expression/token.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | 7 | namespace perf::metric::expression { 8 | /** 9 | * Representation of the supported operators. 
10 | */ 11 | enum class Operator_ : std::uint8_t 12 | { 13 | Plus, 14 | Minus, 15 | Times, 16 | Divide 17 | }; 18 | 19 | /** 20 | * A token represents a single constant, identifier, operator (like +,-, etc.), or punctutations. 21 | */ 22 | class Token 23 | { 24 | public: 25 | enum class Punctuation : std::uint8_t 26 | { 27 | LeftParentheses, 28 | RightParentheses, 29 | Comma, 30 | }; 31 | 32 | /** 33 | * Token can be an identifier, a constant number, an operator, or a punctutation. 34 | */ 35 | using token_t = std::variant; 36 | 37 | Token(Token&&) noexcept = default; 38 | Token(const Token&) = default; 39 | 40 | explicit Token(const Operator_ operator_) 41 | : _token(operator_) 42 | { 43 | } 44 | 45 | explicit Token(const double number) 46 | : _token(number) 47 | { 48 | } 49 | 50 | explicit Token(std::string&& text) 51 | : _token(std::move(text)) 52 | { 53 | } 54 | 55 | explicit Token(const Punctuation punctutation) 56 | : _token(punctutation) 57 | { 58 | } 59 | 60 | ~Token() = default; 61 | 62 | Token& operator=(Token&&) noexcept = default; 63 | 64 | /** 65 | * @return True, if this token is a left parenthesis. 66 | */ 67 | [[nodiscard]] bool is_left_parenthesis() const noexcept 68 | { 69 | return std::holds_alternative(_token) && std::get(_token) == Punctuation::LeftParentheses; 70 | } 71 | 72 | /** 73 | * @return True, if this token is a right parenthesis. 74 | */ 75 | [[nodiscard]] bool is_right_parenthesis() const noexcept 76 | { 77 | return std::holds_alternative(_token) && 78 | std::get(_token) == Punctuation::RightParentheses; 79 | } 80 | 81 | /** 82 | * @return True, if this token is a comma. 83 | */ 84 | [[nodiscard]] bool is_comma() const noexcept 85 | { 86 | return std::holds_alternative(_token) && std::get(_token) == Punctuation::Comma; 87 | } 88 | 89 | /** 90 | * @return True, if this token is an additive operator. 91 | */ 92 | [[nodiscard]] bool is_additive_operator() const noexcept 93 | { 94 | return std::holds_alternative(_token) && 95 | (std::get(_token) == Operator_::Plus || std::get(_token) == Operator_::Minus); 96 | } 97 | 98 | /** 99 | * @return True, if this token is a multiplicative operator. 100 | */ 101 | [[nodiscard]] bool is_multiplicative_operator() const noexcept 102 | { 103 | return std::holds_alternative(_token) && 104 | (std::get(_token) == Operator_::Divide || std::get(_token) == Operator_::Times); 105 | } 106 | 107 | /** 108 | * @return The operator inside the token. 109 | */ 110 | [[nodiscard]] Operator_ operator_() const noexcept { return std::get(_token); } 111 | 112 | /** 113 | * @return Ownership of the underlying token data. 114 | */ 115 | [[nodiscard]] token_t& data() noexcept { return _token; } 116 | 117 | [[nodiscard]] bool operator==(const Punctuation punctuation) const noexcept 118 | { 119 | return std::holds_alternative(_token) && std::get(_token) == punctuation; 120 | } 121 | 122 | /** 123 | * @return A text representation of this token. 124 | */ 125 | [[nodiscard]] std::string to_string() const; 126 | 127 | private: 128 | token_t _token; 129 | 130 | /** 131 | * Visits a token and translates it into an std::string. 
132 | */ 133 | class TokenToStringVisitor 134 | { 135 | public: 136 | [[nodiscard]] std::string operator()(const std::string& identifier) const { return identifier; } 137 | 138 | [[nodiscard]] std::string operator()(const double constant) const { return std::to_string(constant); } 139 | 140 | [[nodiscard]] std::string operator()(Operator_ metric_operator) const; 141 | 142 | [[nodiscard]] std::string operator()(Token::Punctuation punctutation) const; 143 | }; 144 | }; 145 | } -------------------------------------------------------------------------------- /examples/access_benchmark.cpp: -------------------------------------------------------------------------------- 1 | #include "access_benchmark.h" 2 | #include 3 | #include 4 | #include 5 | 6 | perf::example::AccessBenchmark::AccessBenchmark(const bool is_random, 7 | const std::uint64_t access_data_size_in_mb, 8 | const bool is_write) 9 | { 10 | const auto count_cache_lines = (access_data_size_in_mb * 1024U * 1024U) / sizeof(cache_line); 11 | 12 | /// Fill the data array with some unique. 13 | this->_data_to_read.reserve(count_cache_lines); 14 | for (const auto item : DataGenerator::generate_unique(count_cache_lines)) { 15 | this->_data_to_read.emplace_back(item); 16 | } 17 | 18 | if (is_write) { 19 | this->_data_to_write.resize(count_cache_lines); 20 | } 21 | 22 | /// Create the access pattern by filling the indices and shuffle, if we want a 23 | /// random access pattern. 24 | this->_indices.resize(count_cache_lines); 25 | std::iota(this->_indices.begin(), this->_indices.end(), 0U); 26 | 27 | if (is_random) { 28 | std::shuffle(this->_indices.begin(), this->_indices.end(), std::mt19937{ std::random_device{}() }); 29 | } 30 | } 31 | 32 | std::vector 33 | perf::example::DataGenerator::generate_unique(const std::size_t size) 34 | { 35 | /// Create a list for the tuples. 36 | auto relation = std::vector{}; 37 | relation.reserve(size); 38 | 39 | /// Create tuples. 40 | auto generator = std::mt19937{ 864896UL }; 41 | auto distribution = std::uniform_int_distribution{}; 42 | for (auto i = 0ULL; i < size; ++i) { 43 | relation.emplace_back(distribution(generator)); 44 | } 45 | 46 | /// Shuffle the relation. 47 | std::shuffle(relation.begin(), relation.end(), generator); 48 | 49 | return relation; 50 | } 51 | 52 | std::vector 53 | perf::example::DataGenerator::generate_zipf(const std::size_t size, 54 | const std::size_t alphabet_size, 55 | const double zipf_param) 56 | { 57 | /// Create a list for the tuples. 58 | auto relation = std::vector{}; 59 | relation.reserve(size); 60 | 61 | const auto alphabet = DataGenerator::alphabet(alphabet_size); 62 | const auto lookup_table = DataGenerator::lookup_table(zipf_param, alphabet); 63 | 64 | std::srand(6854686UL); 65 | for (auto i = 0ULL; i < size; ++i) { 66 | const auto random_key = static_cast(std::rand()) / RAND_MAX; 67 | 68 | if (lookup_table[0U] >= random_key) { 69 | relation.emplace_back(alphabet[0U]); 70 | } else { 71 | auto left = 0ULL; 72 | auto right = alphabet_size - 1ULL; 73 | std::uint64_t mid; 74 | while (right - left > 1ULL) { 75 | mid = (left + right) / 2; 76 | if (lookup_table[mid] < random_key) { 77 | left = mid; 78 | } else { 79 | right = mid; 80 | } 81 | } 82 | 83 | relation.emplace_back(alphabet[right]); 84 | } 85 | } 86 | 87 | return relation; 88 | } 89 | 90 | std::vector 91 | perf::example::DataGenerator::alphabet(const std::size_t size) 92 | { 93 | auto alphabet = std::vector{}; 94 | alphabet.reserve(size); 95 | 96 | /// Fill the alphabet. 
97 | for (auto i = 0ULL; i < size; ++i) { 98 | alphabet.emplace_back(i); 99 | } 100 | 101 | /// Permute the alphabet. 102 | auto generator = std::mt19937{ 864896UL }; 103 | std::shuffle(alphabet.begin(), alphabet.end(), generator); 104 | 105 | return alphabet; 106 | } 107 | 108 | std::vector 109 | perf::example::DataGenerator::lookup_table(double zipf_param, const std::vector& alphabet) 110 | { 111 | auto lookup_table = std::vector{}; 112 | lookup_table.reserve(alphabet.size()); 113 | 114 | /// Compute scaling factor such that sum (lookup_table[i], i=1..alphabet_size) = 1.0 115 | auto scaling_factor = 0.0; 116 | for (auto i = 0ULL; i < alphabet.size(); ++i) { 117 | scaling_factor += 1.0 / std::pow(double(i) + 1., zipf_param); 118 | } 119 | 120 | /// Generate the lookup table. 121 | auto sum = 0.0; 122 | for (auto i = 0ULL; i < alphabet.size(); ++i) { 123 | sum += 1.0 / std::pow(double(i) + 1.0, zipf_param); 124 | lookup_table.emplace_back(sum / scaling_factor); 125 | } 126 | 127 | return lookup_table; 128 | } -------------------------------------------------------------------------------- /examples/statistics/multi_cpu.cpp: -------------------------------------------------------------------------------- 1 | #include "../access_benchmark.h" 2 | #include "perfcpp/event_counter.h" 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | int 9 | main() 10 | { 11 | std::cout << "libperf-cpp example: Record performance counter for " 12 | "random access to an in-memory array on all CPU cores." 13 | << std::endl; 14 | std::cout << "We will record the counters per (logical) CPU core and merge the results " 15 | "afterwards." 16 | << std::endl; 17 | 18 | /// Create a list of cpus to record performance counters on (all available, in this example). 19 | auto cpus_to_watch = std::vector(std::thread::hardware_concurrency()); 20 | std::iota(cpus_to_watch.begin(), cpus_to_watch.end(), 0U); 21 | std::cout << "Creating counters for CPUs: "; 22 | for (auto cpu : cpus_to_watch) { 23 | std::cout << std::int32_t(cpu) << " "; 24 | } 25 | std::cout << std::endl; 26 | 27 | /// Initialize performance counters. 28 | auto multi_cpu_event_counter = perf::MultiCoreEventCounter{ std::move(cpus_to_watch) }; 29 | 30 | /// Add all the performance counters we want to record. 31 | try { 32 | multi_cpu_event_counter.add({ "instructions", 33 | "cycles", 34 | "branches", 35 | "cache-misses", 36 | "dTLB-miss-ratio", 37 | "L1-data-miss-ratio", 38 | "cycles-per-instruction" }); 39 | } catch (std::runtime_error& e) { 40 | std::cerr << e.what() << std::endl; 41 | return 1; 42 | } 43 | 44 | /// Create random access benchmark. 45 | auto benchmark = perf::example::AccessBenchmark{ /*randomize the accesses*/ true, 46 | /* create benchmark of 1024 MB */ 1024U }; 47 | 48 | /// One event_counter instance for every thread. 49 | constexpr auto count_threads = 2U; 50 | const auto items_per_thread = benchmark.size() / count_threads; 51 | auto threads = std::vector{}; 52 | auto thread_local_results = std::vector(2U, 0U); /// Array to store the thread-local results. 53 | 54 | /// Barrier for the threads to wait. 55 | auto thread_barrier = std::atomic{ false }; 56 | 57 | for (auto thread_index = 0U; thread_index < count_threads; ++thread_index) { 58 | threads.emplace_back([thread_index, items_per_thread, &thread_local_results, &benchmark, &thread_barrier]() { 59 | auto local_value = 0ULL; 60 | 61 | /// Wait for the barrier to become "true", i.e., all threads are spawned. 62 | while (!thread_barrier) 63 | ; 64 | 65 | /// Process the data. 
66 | for (auto index = 0U; index < items_per_thread; ++index) { 67 | local_value += benchmark[(thread_index * items_per_thread) + index].value; 68 | } 69 | 70 | thread_local_results[thread_index] = local_value; 71 | }); 72 | } 73 | 74 | /// Start recording performance counter. 75 | /// In contrast to the inherit-thread example (see inherit_thread.cpp), we 76 | /// will record the performance counters on each logical CPU core. 77 | try { 78 | multi_cpu_event_counter.start(); 79 | } catch (std::runtime_error& exception) { 80 | std::cerr << exception.what() << std::endl; 81 | return 1; 82 | } 83 | 84 | /// Let threads start. 85 | thread_barrier = true; 86 | 87 | /// Wait for all threads to finish. 88 | for (auto& thread : threads) { 89 | thread.join(); 90 | } 91 | 92 | /// Stop performance counter recording. 93 | multi_cpu_event_counter.stop(); 94 | 95 | /// Add up the results so that the compiler does not get the idea of 96 | /// optimizing away the accesses. 97 | auto value = std::accumulate(thread_local_results.begin(), thread_local_results.end(), 0UL); 98 | 99 | /// We do not want the compiler to optimize away this (otherwise) unused value. 100 | benchmark.pretend_to_use(value); 101 | 102 | /// Get the result (normalized per cache line) from the 103 | /// multithread_event_counter. 104 | auto result = multi_cpu_event_counter.result(benchmark.size()); 105 | 106 | /// Print the performance counters. 107 | std::cout << "\nResults:\n"; 108 | for (const auto& [counter_name, counter_value] : result) { 109 | std::cout << counter_value << " " << counter_name << " / cache line" << std::endl; 110 | } 111 | 112 | return 0; 113 | } 114 | -------------------------------------------------------------------------------- /include/perfcpp/util/table.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | namespace perf::util { 8 | class Table 9 | { 10 | public: 11 | enum class Alignment : std::uint8_t 12 | { 13 | Left, 14 | Center, 15 | Right 16 | }; 17 | 18 | class Header 19 | { 20 | public: 21 | Header(std::string&& text, const std::uint8_t span, const bool has_separator) noexcept 22 | : _text(std::move(text)) 23 | , _span(span) 24 | , _has_separator(has_separator) 25 | { 26 | } 27 | explicit Header(std::string&& text, 28 | const Alignment alignment = Alignment::Right, 29 | const bool has_separator = false) noexcept 30 | : _text(std::move(text)) 31 | , _alignment(alignment) 32 | , _has_separator(has_separator) 33 | { 34 | } 35 | ~Header() = default; 36 | 37 | [[nodiscard]] const std::string& text() const noexcept { return _text; } 38 | [[nodiscard]] Alignment alignment() const noexcept { return _alignment; } 39 | [[nodiscard]] std::uint8_t span() const noexcept { return _span; } 40 | [[nodiscard]] bool has_separator() const noexcept { return _has_separator; } 41 | 42 | private: 43 | std::string _text; 44 | Alignment _alignment{ Alignment::Left }; 45 | std::uint8_t _span{ 1U }; 46 | bool _has_separator; 47 | }; 48 | 49 | class Row 50 | { 51 | public: 52 | Row() { _columns.reserve(32U); } 53 | ~Row() = default; 54 | 55 | void add(std::string&& column) { _columns.push_back(std::move(column)); } 56 | 57 | Row& operator<<(std::string&& column) 58 | { 59 | _columns.emplace_back(std::move(column)); 60 | return *this; 61 | } 62 | 63 | Row& operator<<(const std::string& column) 64 | { 65 | _columns.emplace_back(column); 66 | return *this; 67 | } 68 | 69 | Row& operator<<(const std::size_t column) 70 | { 71 | 
_columns.emplace_back(std::to_string(column)); 72 | return *this; 73 | } 74 | 75 | Row& operator<<(const std::uint32_t column) 76 | { 77 | _columns.emplace_back(std::to_string(column)); 78 | return *this; 79 | } 80 | 81 | Row& operator<<(const std::uint16_t column) 82 | { 83 | _columns.emplace_back(std::to_string(column)); 84 | return *this; 85 | } 86 | 87 | Row& operator<<(const std::uint8_t column) 88 | { 89 | _columns.emplace_back(std::to_string(column)); 90 | return *this; 91 | } 92 | 93 | Row& operator<<(const float column) 94 | { 95 | _columns.emplace_back(std::to_string(column)); 96 | return *this; 97 | } 98 | 99 | Row& operator<<(const double column) 100 | { 101 | _columns.emplace_back(std::to_string(column)); 102 | return *this; 103 | } 104 | 105 | [[nodiscard]] const std::vector& columns() const noexcept { return _columns; } 106 | [[nodiscard]] std::vector& columns() noexcept { return _columns; } 107 | 108 | private: 109 | std::vector _columns; 110 | }; 111 | 112 | Table() = default; 113 | explicit Table(const std::uint64_t offset) 114 | : _offset(offset) 115 | { 116 | } 117 | ~Table() = default; 118 | 119 | /** 120 | * Reserve space for the given number of rows. 121 | * @param count_rows Number of rows to reserve. 122 | */ 123 | void reserve(const std::size_t count_rows) { _rows.reserve(count_rows); } 124 | 125 | /** 126 | * Adds a header row to the table. 127 | * @param header_row Header row to add. 128 | */ 129 | void add(std::vector
&& header_row); 130 | 131 | /** 132 | * Adds the given row to the table. 133 | * @param row Row to add. 134 | */ 135 | void add(Row&& row); 136 | 137 | /** 138 | * @return Turns the table into a printable string. 139 | */ 140 | [[nodiscard]] std::string to_string() const; 141 | 142 | private: 143 | std::optional _count_columns{ std::nullopt }; 144 | 145 | /// Headers 146 | std::vector> _header_row; 147 | 148 | /// Rows 149 | std::vector _rows; 150 | 151 | /// Offset of each row in number of empty spaces. 152 | std::uint64_t _offset{ 0U }; 153 | 154 | void static print_text_aligned(std::stringstream& stream, 155 | Alignment alignment, 156 | const std::string& text, 157 | std::size_t column_size); 158 | }; 159 | } -------------------------------------------------------------------------------- /include/perfcpp/mmap_buffer.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "util/unique_file_descriptor.h" 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | namespace perf { 13 | class MmapBuffer; 14 | 15 | /** 16 | * The MmapBufferOverflowWorker is responsible for handling an extra thread that waits for overflows to happen and then 17 | * triggers the MmapBuffer to handle the overflow. 18 | */ 19 | class MmapBufferOverflowWorker 20 | { 21 | public: 22 | MmapBufferOverflowWorker(MmapBuffer& mmap_buffer, const util::UniqueFileDescriptor& counter_file_descriptor); 23 | ~MmapBufferOverflowWorker() = default; 24 | 25 | MmapBufferOverflowWorker(MmapBufferOverflowWorker&&) = delete; 26 | MmapBufferOverflowWorker(const MmapBufferOverflowWorker&) = delete; 27 | 28 | /** 29 | * Cancels the worker thread and awaits its shutdown. 30 | */ 31 | void cancel(); 32 | 33 | private: 34 | /// Thread to run for handling overflows. 35 | std::thread _overflow_handle_thread; 36 | 37 | /// File descriptor to communicate with the thread in case of canceling the worker. 38 | util::UniqueFileDescriptor _cancel_thread_file_descriptor; 39 | 40 | /** 41 | * Worker function that waits for an overflow within the mmap-ed buffer–calling the mmap buffer to handle the 42 | * overflow–and a cancel signal to shut down the worker thread. 43 | * 44 | * @param mmap_buffer Mmap buffer that will be triggered to handle the overflow. 45 | * @param counter_file_descriptor File descriptor of the counter that is used to mmap the buffer; used to await the 46 | * overflow. 47 | * @param cancel_file_descriptor File descriptor used to communicate canceling the worker thread. 48 | */ 49 | static void run(MmapBuffer& mmap_buffer, 50 | util::FileDescriptorView counter_file_descriptor, 51 | util::FileDescriptorView cancel_file_descriptor) noexcept; 52 | }; 53 | 54 | class MmapBuffer 55 | { 56 | public: 57 | explicit MmapBuffer(const util::UniqueFileDescriptor& file_descriptor, std::uint64_t count_pages = 1ULL); 58 | ~MmapBuffer(); 59 | 60 | MmapBuffer(MmapBuffer&&) = delete; 61 | MmapBuffer(const MmapBuffer&) = delete; 62 | 63 | /** 64 | * Reads a performance monitoring counter value from the mmap-ed buffer via the `rdpmc` instruction. 65 | * 66 | * @return PMC value read via `rdpmc` from the buffer. 67 | */ 68 | [[nodiscard]] std::optional read_performance_monitoring_counter() const noexcept; 69 | 70 | /** 71 | * @return The entire data from the buffer, including all data copied from overflows. This will consume the data, 72 | * i.e., the caller owns the data. 
73 | */ 74 | [[nodiscard]] std::vector> consume_data(); 75 | 76 | /** 77 | * Copies the data from the mmap-ed buffer into a specific application-level buffer. 78 | * Overflow handling (i.e., this function) will be triggered by the overflow handler. 79 | */ 80 | void handle_overflow(); 81 | 82 | [[nodiscard]] explicit operator bool() const noexcept { return _ringbuffer_header != nullptr; } 83 | 84 | private: 85 | /// First page of the mmap-ed buffer; pointing to the buffer's header. 86 | perf_event_mmap_page* _ringbuffer_header{ nullptr }; 87 | 88 | /// Number of pages allocated via mmap. 89 | std::uint64_t _count_pages{ 0ULL }; 90 | 91 | /// List of data copied from the mmap-ed buffer after an overflow happened. 92 | std::vector> _overflow_data; 93 | 94 | /// Worker that waits for overflows and triggers the overflow handling. 95 | std::optional _overflow_worker{ std::nullopt }; 96 | 97 | /// Mutex for accessing the overflow data. 98 | alignas(64) std::mutex _overflow_data_mutex; 99 | 100 | /** 101 | * Copies the sample data from the mmap-ed buffer into a vector and sets the tail of the mmap-ed buffer accordingly. 102 | * 103 | * @return Data copied from the buffer. 104 | */ 105 | [[nodiscard]] std::vector copy_data_from_ringbuffer(); 106 | 107 | /** 108 | * Aligns the number of buffer pages to a number that is a power of two plus one for the header. 109 | * 110 | * @param number_of_buffer_pages Current number of buffer pages. 111 | * @return An aligned number that is a power of two plus one. Nothing changes if the number is already aligned. 112 | */ 113 | [[nodiscard]] static std::uint64_t align_number_of_buffer_pages(std::uint64_t number_of_buffer_pages) noexcept; 114 | }; 115 | } -------------------------------------------------------------------------------- /examples/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | ## Single-threaded 2 | add_executable(single-thread-statistics EXCLUDE_FROM_ALL examples/statistics/single_thread.cpp examples/access_benchmark.cpp) 3 | target_link_libraries(single-thread-statistics perf-cpp) 4 | 5 | ## Multi-threaded; but inherit counter from main-thread 6 | add_executable(inherit-thread-statistics EXCLUDE_FROM_ALL examples/statistics/inherit_thread.cpp examples/access_benchmark.cpp) 7 | target_link_libraries(inherit-thread-statistics perf-cpp) 8 | 9 | ## Multi-threaded with thread-local counter 10 | add_executable(multi-thread-statistics EXCLUDE_FROM_ALL examples/statistics/multi_thread.cpp examples/access_benchmark.cpp) 11 | target_link_libraries(multi-thread-statistics perf-cpp) 12 | 13 | ## Multi-CPU with per-CPU counter 14 | add_executable(multi-cpu-statistics EXCLUDE_FROM_ALL examples/statistics/multi_cpu.cpp examples/access_benchmark.cpp) 15 | target_link_libraries(multi-cpu-statistics perf-cpp) 16 | 17 | ## Multi-Process with per-process counter 18 | add_executable(multi-process-statistics EXCLUDE_FROM_ALL examples/statistics/multi_process.cpp examples/access_benchmark.cpp) 19 | target_link_libraries(multi-process-statistics perf-cpp) 20 | 21 | ## Metrics 22 | add_executable(metric EXCLUDE_FROM_ALL examples/statistics/metric.cpp examples/access_benchmark.cpp) 23 | target_link_libraries(metric perf-cpp) 24 | 25 | ## Live Events 26 | add_executable(live-events EXCLUDE_FROM_ALL examples/statistics/live_events.cpp examples/access_benchmark.cpp) 27 | target_link_libraries(live-events perf-cpp) 28 | 29 | ## Sampling instruction pointers 30 | add_executable(instruction-pointer-sampling EXCLUDE_FROM_ALL 
examples/sampling/instruction_pointer.cpp examples/access_benchmark.cpp) 31 | target_link_libraries(instruction-pointer-sampling perf-cpp) 32 | 33 | ## Sampling instruction pointers 34 | add_executable(counter-sampling EXCLUDE_FROM_ALL examples/sampling/counter.cpp examples/access_benchmark.cpp) 35 | target_link_libraries(counter-sampling perf-cpp) 36 | 37 | ## Branch sampling 38 | add_executable(branch-sampling EXCLUDE_FROM_ALL examples/sampling/branch.cpp examples/access_benchmark.cpp) 39 | target_link_libraries(branch-sampling perf-cpp) 40 | 41 | ## Memory address sampling 42 | add_executable(memory-address-sampling EXCLUDE_FROM_ALL examples/sampling/memory_address.cpp examples/access_benchmark.cpp) 43 | target_link_libraries(memory-address-sampling perf-cpp) 44 | 45 | ## Sampling user_registers 46 | add_executable(register-sampling EXCLUDE_FROM_ALL examples/sampling/register.cpp examples/access_benchmark.cpp) 47 | target_link_libraries(register-sampling perf-cpp) 48 | 49 | ## Sampling on multiple threads 50 | add_executable(multi-thread-sampling EXCLUDE_FROM_ALL examples/sampling/multi_thread.cpp examples/access_benchmark.cpp) 51 | target_link_libraries(multi-thread-sampling perf-cpp) 52 | 53 | ## Sampling on multiple threads 54 | add_executable(multi-cpu-sampling EXCLUDE_FROM_ALL examples/sampling/multi_cpu.cpp examples/access_benchmark.cpp) 55 | target_link_libraries(multi-cpu-sampling perf-cpp) 56 | 57 | ## Sampling with multiple events 58 | add_executable(multi-event-sampling EXCLUDE_FROM_ALL examples/sampling/multi_event.cpp examples/access_benchmark.cpp) 59 | target_link_libraries(multi-event-sampling perf-cpp) 60 | 61 | ## Sampling with raw values 62 | add_executable(context-switch-sampling EXCLUDE_FROM_ALL examples/sampling/context_switch.cpp examples/access_benchmark.cpp) 63 | target_link_libraries(context-switch-sampling perf-cpp) 64 | 65 | ## Analyze Samples with DataAnalyzer 66 | add_executable(memory-access-analyzer EXCLUDE_FROM_ALL examples/sampling/memory_access_analyzer.cpp examples/access_benchmark.cpp) 67 | target_link_libraries(memory-access-analyzer perf-cpp) 68 | 69 | ## Flame graph 70 | add_executable(flame-graph EXCLUDE_FROM_ALL examples/sampling/flame_graph.cpp examples/access_benchmark.cpp) 71 | target_compile_options(flame-graph PUBLIC "-g3") 72 | target_link_libraries(flame-graph perf-cpp) 73 | 74 | ## Analyze Samples with the perf tool 75 | add_executable(perf-record EXCLUDE_FROM_ALL examples/sampling/perf_record.cpp examples/access_benchmark.cpp) 76 | target_link_libraries(perf-record perf-cpp) 77 | 78 | ## List counters 79 | add_executable(counter-definition EXCLUDE_FROM_ALL examples/counter_definition.cpp) 80 | target_link_libraries(counter-definition perf-cpp) 81 | 82 | ## One target for all examples 83 | add_custom_target(examples) 84 | add_dependencies(examples 85 | single-thread-statistics inherit-thread-statistics multi-thread-statistics multi-cpu-statistics multi-process-statistics 86 | metric instruction-pointer-sampling counter-sampling branch-sampling 87 | memory-address-sampling register-sampling multi-thread-sampling multi-cpu-sampling 88 | multi-event-sampling context-switch-sampling 89 | memory-access-analyzer live-events flame-graph perf-record counter-definition) -------------------------------------------------------------------------------- /docs/build.md: -------------------------------------------------------------------------------- 1 | # How to build and include *perf-cpp* in your project 2 | *perf-cpp* can be build manually or 
included into CMake projects. 3 | 4 | ## Table of Contents 5 | - [Building Manually](#building-manually) 6 | - [Build the Library](#build-the-library) 7 | - [Install the Library](#install-the-library) 8 | - [Build the Examples](#build-examples) 9 | - [Building as a Dynamically Linked Library](#building-as-a-dynamically-linked-library) 10 | - [Use CMake](#including-into-cmakeliststxt) 11 | - [ExternalProject](#via-externalproject) 12 | - [FetchContent](#via-fetchcontent) 13 | - [find_package](#via-find_package) 14 | --- 15 | 16 | ## Building Manually 17 | > [!NOTE] 18 | > Throughout the documentation, we use `./build` as the build directory. 19 | > However, the build directory can be any directory of your choice (including `.`). 20 | 21 | ### Build the Library 22 | #### Download the source code 23 | 24 | ```bash 25 | git clone https://github.com/jmuehlig/perf-cpp.git 26 | cd perf-cpp 27 | 28 | # Optional: switch to this development version 29 | git checkout v0.12.4 30 | ``` 31 | 32 | #### Generate the Makefile and Build 33 | 34 | ```bash 35 | cmake . -B build 36 | cmake --build build 37 | ``` 38 | 39 | ### Install the Library 40 | To install the library, specify the `CMAKE_INSTALL_PREFIX`: 41 | ```bash 42 | # Generate Makefile 43 | cmake . -B build -DCMAKE_INSTALL_PREFIX=/path/to/install/dir 44 | 45 | # Build 46 | cmake --build build 47 | 48 | # Install 49 | cmake --install build 50 | ``` 51 | 52 | The library will then be available for discovery via CMake and `find_package` (see [below](#via-find_package)). 53 | 54 | ### Generate Processor-specific Events 55 | With `-DGEN_PROCESSOR_EVENTS=1`, the build process will try to read the processor-specific events from the event library ([events/](../events)) and generate a source file (`src/processor_specific_event_provider.cpp`) that adds these events to every (the default and *manually* instantiated) `perf::CounterDefinition` (see also the documentation on [hardware events](counters.md)). 56 | 57 | With this option, processor-specific events can be used like *built-in* ones. 58 | 59 | ```bash 60 | # Generate Makefile and source file for processor-specific events 61 | cmake . -B build -DGEN_PROCESSOR_EVENTS=1 62 | 63 | # Build Library 64 | cmake --build build 65 | ``` 66 | 67 | > [!IMPORTANT] 68 | > Depending on the underlying processor, the source file can grow very large and increase compilation time significantly. 69 | 70 | 71 | ### Build Examples 72 | Enable example compilation with `-DBUILD_EXAMPLES=1` and build the `examples` target: 73 | 74 | ```bash 75 | # Generate Makefile 76 | cmake . -B build -DBUILD_EXAMPLES=1 77 | 78 | # Build Library and Examples 79 | cmake --build build --target examples 80 | ``` 81 | 82 | The example binaries will be located in `build/examples/bin`. 83 | 84 | ### Building as a Dynamically Linked Library 85 | By default, *perf-cpp* is build as a **static** library. 86 | You can request to build a **shared** library with `-DBUILD_LIB_SHARED=1`: 87 | 88 | ```bash 89 | cmake . -B build -DBUILD_LIB_SHARED=1 90 | cmake --build build 91 | ``` 92 | 93 | ## Including into `CMakeLists.txt` 94 | *perf-cpp* uses [CMake](https://cmake.org/) as its build system, facilitating integration into additional CMake projects. 
95 | Choose from the following methods: 96 | 97 | ### Via ExternalProject 98 | Include `ExternalProject` in your `CMakeLists.txt` and define the project: 99 | 100 | ```cmake 101 | include(ExternalProject) 102 | ExternalProject_Add( 103 | perf-cpp-external 104 | GIT_REPOSITORY "https://github.com/jmuehlig/perf-cpp" 105 | GIT_TAG "v0.12.4" 106 | PREFIX "lib/perf-cpp" 107 | INSTALL_COMMAND cmake -E echo "" 108 | ) 109 | ``` 110 | * Add `lib/perf-cpp/src/perf-cpp-external/include` to your `include_directories()`. 111 | * Add `lib/perf-cpp/src/perf-cpp-external-build` to your `link_directories()`. 112 | 113 | Note: The directory `lib/` can be any folder of your choice. 114 | 115 | ### Via FetchContent 116 | Include `FetchContent` in your `CMakeLists.txt` and define the project: 117 | 118 | ```cmake 119 | include(FetchContent) 120 | FetchContent_Declare( 121 | perf-cpp-external 122 | GIT_REPOSITORY "https://github.com/jmuehlig/perf-cpp" 123 | GIT_TAG "v0.12.4" 124 | ) 125 | FetchContent_MakeAvailable(perf-cpp-external) 126 | ``` 127 | * Add `perf-cpp` to your linked libraries. 128 | * Add `${perf-cpp-external_SOURCE_DIR}/include/` to your include directories. 129 | 130 | ### Via find_package 131 | If *perf-cpp* is already installed on your system (see [install instructions above](#install-the-library)), you can simply use `find_package` to link it with your project: 132 | 133 | ```cmake 134 | find_package(perf-cpp REQUIRED) 135 | target_link_libraries(your-target PRIVATE perf-cpp::perf-cpp) # link the imported target against your own target 136 | ``` 137 | -------------------------------------------------------------------------------- /examples/sampling/multi_cpu.cpp: -------------------------------------------------------------------------------- 1 | #include "../access_benchmark.h" 2 | #include "perfcpp/sampler.h" 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | int 9 | main() 10 | { 11 | std::cout << "libperf-cpp example: Record perf samples including time, " 12 | "instruction pointer, and cpu id for multi-threaded random " 13 | "access to an in-memory array on multiple CPU cores." 14 | << std::endl; 15 | 16 | constexpr auto count_threads = 4U; 17 | 18 | /// Create a list of cpus to sample (all available, in this example). 19 | auto cpus_to_watch = std::vector(std::min(4U, std::thread::hardware_concurrency())); 20 | std::iota(cpus_to_watch.begin(), cpus_to_watch.end(), 0U); 21 | 22 | auto sampler = perf::MultiCoreSampler{ std::move(cpus_to_watch) }; 23 | 24 | /// Setup event that triggers writing samples. 25 | sampler.trigger("cycles", perf::Period{ 50000 }); 26 | 27 | /// Setup what data the samples should include (timestamp, instruction pointer, CPU id, thread id). 28 | sampler.values().timestamp(true).instruction_pointer(true).cpu_id(true).thread_id(true); 29 | 30 | /// Create random access benchmark. 31 | auto benchmark = perf::example::AccessBenchmark{ /*randomize the accesses*/ true, 32 | /* create benchmark of 1024 MB */ 1024U }; 33 | 34 | /// Allocate space for threads and their results. 35 | const auto items_per_thread = benchmark.size() / count_threads; 36 | auto threads = std::vector{}; 37 | auto thread_local_results = 38 | std::vector(count_threads, 0U); /// Array to store the thread-local results. 39 | 40 | /// Barrier for the threads to wait in order to start them all at the same time.
41 | auto thread_barrier = std::atomic{ false }; 42 | 43 | for (auto thread_index = 0U; thread_index < count_threads; ++thread_index) { 44 | threads.emplace_back([thread_index, items_per_thread, &thread_local_results, &benchmark, &thread_barrier]() { 45 | auto local_value = 0ULL; 46 | 47 | /// Wait for the barrier to become "true", i.e., all threads are spawned. 48 | while (!thread_barrier) 49 | ; 50 | 51 | /// Process the data. 52 | for (auto index = 0U; index < items_per_thread; ++index) { 53 | local_value += benchmark[(thread_index * items_per_thread) + index].value; 54 | } 55 | 56 | thread_local_results[thread_index] = local_value; 57 | }); 58 | } 59 | 60 | /// Start sampling for all specified CPUs at once. 61 | try { 62 | sampler.start(); 63 | } catch (std::runtime_error& exception) { 64 | std::cerr << exception.what() << std::endl; 65 | return 1; 66 | } 67 | 68 | /// Let threads start. 69 | thread_barrier = true; 70 | 71 | /// Wait for all threads to finish. 72 | for (auto& thread : threads) { 73 | thread.join(); 74 | } 75 | 76 | /// Stop sampling on all CPUs. 77 | sampler.stop(); 78 | 79 | /// Add up the results so that the compiler does not get the idea of 80 | /// optimizing away the accesses. 81 | auto value = std::accumulate(thread_local_results.begin(), thread_local_results.end(), 0UL); 82 | 83 | /// We do not want the compiler to optimize away this (otherwise) unused value. 84 | benchmark.pretend_to_use(value); 85 | 86 | /// Get all the recorded samples – ordered by timestamp. 87 | auto samples = sampler.result(true); 88 | 89 | /// Print the first samples. 90 | const auto count_show_samples = std::min(samples.size(), 40U); 91 | std::cout << "\nRecorded " << samples.size() << " samples." << std::endl; 92 | std::cout << "Here are the first " << count_show_samples << " recorded samples:\n" << std::endl; 93 | for (auto index = 0U; index < count_show_samples; ++index) { 94 | const auto& sample = samples[index]; 95 | 96 | /// Since we recorded the time, period, the instruction pointer, and the CPU 97 | /// id, we can only read these values. 98 | if (sample.metadata().timestamp().has_value() && sample.metadata().cpu_id().has_value() && 99 | sample.metadata().thread_id().has_value() && 100 | sample.instruction_execution().logical_instruction_pointer().has_value()) { 101 | std::cout << "Time = " << sample.metadata().timestamp().value() 102 | << " | CPU ID = " << sample.metadata().cpu_id().value() 103 | << " | Thread ID = " << sample.metadata().thread_id().value() << " | Instruction Pointer = 0x" 104 | << std::hex << sample.instruction_execution().logical_instruction_pointer().value() << std::dec << "\n"; 105 | } 106 | } 107 | std::cout << std::flush; 108 | 109 | /// Close the sampler. 110 | /// Note that the sampler can only be closed after reading the samples. 111 | sampler.close(); 112 | 113 | return 0; 114 | } -------------------------------------------------------------------------------- /include/perfcpp/hardware_info.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | 5 | #include 6 | #if defined(__x86_64__) || defined(__i386__) 7 | #include 8 | #endif 9 | 10 | #if !(defined(__x86_64__) || defined(__i386__)) 11 | #define __builtin_cpu_is(x) 0 12 | #endif 13 | 14 | namespace perf { 15 | /** 16 | * Access to information about the underlying hardware substrate like manufacturer and perf specifics. 
17 | */ 18 | class HardwareInfo 19 | { 20 | public: 21 | /** 22 | * @return True, if the underlying hardware is an Intel processor. 23 | */ 24 | [[nodiscard]] static bool is_intel() noexcept { return static_cast(__builtin_cpu_is("intel")); } 25 | 26 | /** 27 | * @return True, if the underlying Intel processor requires an aux counter for memory sampling. 28 | */ 29 | [[nodiscard]] static bool is_intel_aux_counter_required(); 30 | 31 | /** 32 | * @return True, if the underlying Intel processor is equal or newer than the 12th generation. 33 | */ 34 | [[nodiscard]] static bool is_intel_12th_generation_or_newer(); 35 | 36 | /** 37 | * @return True, if the underlying hardware is an AMD processor. 38 | */ 39 | [[nodiscard]] static bool is_amd() noexcept { return static_cast(__builtin_cpu_is("amd")); } 40 | 41 | /** 42 | * @return True, if the underlying AMD processor supports Instruction Based Sampling (IBS). 43 | */ 44 | [[nodiscard]] static bool is_amd_ibs_supported() noexcept; 45 | 46 | /** 47 | * @return True, if the underlying AMD processor supports Instruction Based Sampling (IBS) with L3 filter. 48 | */ 49 | [[nodiscard]] static bool is_ibs_l3_filter_supported() noexcept; 50 | 51 | /** 52 | * @return The page size of memory of the underlying machine. 53 | */ 54 | [[nodiscard]] static std::uint64_t memory_page_size(); 55 | 56 | /** 57 | * @return The number of physical performance counters per logical CPU core. 58 | */ 59 | [[nodiscard]] static std::uint8_t physical_performance_counters_per_logical_core(); 60 | 61 | /** 62 | * @return The number of events that can be scheduled to the same physical performance counter. 63 | */ 64 | [[nodiscard]] static std::uint8_t events_per_physical_performance_counter(); 65 | 66 | private: 67 | static std::optional _is_intel_aux_event_required; 68 | static std::optional _is_intel_12th_generation_or_newer; 69 | static std::optional _is_amd_ibs_supported; 70 | static std::optional _is_ibs_l3_filter_supported; 71 | static std::optional _memory_page_size; 72 | static std::optional _physical_performance_counters_per_logical_core; 73 | static std::optional _events_per_physical_performance_counter; 74 | 75 | #if defined(__x86_64__) || defined(__i386__) 76 | /** 77 | * Result of a __get_cpuid call. 78 | */ 79 | class CPUIDResult 80 | { 81 | public: 82 | CPUIDResult() noexcept = default; 83 | ~CPUIDResult() noexcept = default; 84 | 85 | std::uint32_t eax; 86 | std::uint32_t ebx; 87 | std::uint32_t ecx; 88 | std::uint32_t edx; 89 | }; 90 | 91 | /** 92 | * Fires a __get_cpuid call with the provided leaf and sub leaf. In case the call was successful, the register values 93 | * are returned. 94 | * 95 | * @param leaf Leaf. 96 | * @param sub_leaf Sub leaf (0 by default). 97 | * @return Register values (eax, ebx, ecx, edx) in case the cpuid request was successful. 98 | */ 99 | static std::optional cpuid(std::uint32_t leaf, std::uint32_t sub_leaf = 0U) noexcept; 100 | #endif 101 | 102 | /** 103 | * Writes a value into the cache variable and returns the value. 104 | * 105 | * @param variable Cache variable. 106 | * @param value Value to write into the cache variable. 107 | * @return The cached value. 108 | */ 109 | template 110 | [[nodiscard]] static T cache_value(std::optional& variable, const T value) 111 | { 112 | variable = value; 113 | return value; 114 | } 115 | 116 | /** 117 | * Tries to open a performance counter with more and more events until it cannot open more events on a single physical 118 | * performance counter. 
119 | * 120 | * @param is_identify_hardware_counters If true, identify the number of hardware counters. Otherwise, identify the 121 | * number of events per hardware counter. 122 | * @return The maximum number of events on a single physical performance counter. 123 | */ 124 | [[nodiscard]] static std::optional explore_hardware_counters_experimentally( 125 | bool is_identify_hardware_counters); 126 | 127 | /** 128 | * Creates a list for hardware counter and event identification. The list may depend on the underlying hardware (e.g., 129 | * some ARM CPUs do not support all events defined by the perf subsystem). 130 | * 131 | * @return List of events to experiment with for hardware counter and event identification. 132 | */ 133 | [[nodiscard]] static std::vector generate_events_for_counter_identification(); 134 | }; 135 | } -------------------------------------------------------------------------------- /docs/recording-live-events.md: -------------------------------------------------------------------------------- 1 | # Accessing Live Event Counts 2 | The *perf-cpp* library supports reading hardware performance counter values without stopping the counters ("live" events), particularly on `x86` systems using the [rdpmc](https://www.felixcloutier.com/x86/rdpmc) instruction. 3 | This feature allows for interim results during ongoing computations, ideal for real-time monitoring and adjustments. 4 | 5 | The `perf::EventCounter` class is designed to support both standard and "live" events, allowing configuration of hardware performance counters to access results either "live" (for interim results) or after stopping. 6 | For the latter, see [the recording basics documentation](recording.md). 7 | 8 | > [!TIP] 9 | > Our examples include a working code example: **[statistics/live_events.cpp](../examples/statistics/live_events.cpp)**. 10 | 11 | --- 12 | ## Table of Contents 13 | - [Setting Up Live Events](#setting-up-live-events) 14 | - [Initializing the Hardware Counters *(optional)*](#initializing-the-hardware-counters-optional) 15 | - [Reading Live Events During Computation](#reading-live-events-during-computation) 16 | - [Finalizing and Retrieving Results](#finalizing-and-retrieving-results) 17 | --- 18 | 19 | ## Setting Up Live Events 20 | Define which events to monitor live and which to read post-computation using the `perf::EventCounter`: 21 | 22 | ```cpp 23 | #include <perfcpp/event_counter.h> 24 | 25 | auto event_counter = perf::EventCounter{}; 26 | 27 | try { 28 | /// Events for live monitoring. 29 | event_counter.add_live({"cache-misses", "cache-references", "branches"}); 30 | } catch (std::runtime_error& e) { 31 | std::cerr << e.what() << std::endl; 32 | } 33 | ``` 34 | 35 | > [!IMPORTANT] 36 | > In our experience, not mixing live and "traditional" events leads to more consistent results. 37 | 38 | > [!NOTE] 39 | > Live events can only capture hardware events, not metrics. 40 | 41 | ## Initializing the Hardware Counters *(optional)* 42 | Optionally, prepare the hardware counters ahead of time to exclude the configuration time from your measurements; if you skip this step, it is handled automatically when the counters are started: 43 | 44 | ```cpp 45 | try { 46 | event_counter.open(); 47 | } catch (std::runtime_error& e) { 48 | std::cerr << e.what() << std::endl; 49 | } 50 | ``` 51 | 52 | ## Reading Live Events During Computation 53 | The library provides two methods for accessing live events during computation: directly via the `EventCounter` and using a simplified `LiveEventCounter` wrapper.
54 | 55 | ### Option 1: Direct Access via `EventCounter` 56 | Events added as live events (via `add_live()`) can be directly accessed from the `EventCounter` without stopping. 57 | To be efficient, read live event counts by pre-allocating memory for the results to avoid allocation overheads during critical measurement phases: 58 | 59 | ```cpp 60 | try { 61 | event_counter.start(); 62 | } catch (std::runtime_error& e) { 63 | std::cerr << e.what() << std::endl; 64 | } 65 | 66 | /// Pre-allocated containers for live results. 67 | auto start_values = std::vector{/* cache-misses */ .0, /* cache-references */ .0}; 68 | auto end_values = std::vector{/* cache-misses */ .0, /* cache-references */ .0}; 69 | 70 | for (auto i = 0U; i < runs; ++i) { 71 | /// Capture start values. 72 | event_counter.live_results(start_values); 73 | 74 | /// Computation here... 75 | 76 | /// Capture end values after computation. 77 | event_counter.live_results(end_values); 78 | 79 | std::cout << "Live Results: " 80 | << "cache-misses: " << end_values[0U] - start_values[0U] << "," 81 | << "cache-references: " << end_values[1U] - start_values[1U] << std::endl; 82 | } 83 | ``` 84 | 85 | ### Option 2: Simplified Access via `LiveEventCounter` Wrapper 86 | The `LiveEventCounter` provides a streamlined method to manage live event monitoring by handling memory management and calculation of differences internally. 87 | 88 | ```cpp 89 | /// Initiate the LiveEventCounter wrapper before starting. 90 | auto live_event_counter = perf::LiveEventCounter{ event_counter }; 91 | 92 | try { 93 | event_counter.start(); 94 | } catch (std::runtime_error& e) { 95 | std::cerr << e.what() << std::endl; 96 | } 97 | 98 | for (auto i = 0U; i < runs; ++i) { 99 | /// Capture start values. 100 | live_event_counter.start(); 101 | 102 | /// Computation here... 103 | 104 | /// Capture end values after computation. 105 | live_event_counter.stop(); 106 | 107 | std::cout << "Live Results: " 108 | << "cache-misses: " << live_event_counter.get("cache-misses") << "," 109 | << "cache-references: " << live_event_counter.get("cache-references") << std::endl; 110 | } 111 | ``` 112 | 113 | ## Finalizing and Retrieving Results 114 | Upon completion, stop the counters: 115 | 116 | ```cpp 117 | /// Stop the counter after processing. 118 | event_counter.stop(); 119 | ``` 120 | 121 | For further information, refer to the [recording basics documentation](recording.md) and the [code example](../examples/statistics/live_events.cpp). 122 | -------------------------------------------------------------------------------- /include/perfcpp/util/graph.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | namespace perf::util { 9 | template 10 | class DirectedGraph 11 | { 12 | public: 13 | DirectedGraph() = default; 14 | ~DirectedGraph() = default; 15 | 16 | /** 17 | * Inserts a node into the graph. 18 | * 19 | * @param node Node to insert. 20 | */ 21 | void insert(const N& node) { _nodes_and_edges.insert(std::make_pair(node, std::unordered_set())); } 22 | 23 | /** 24 | * Creates an edge between the node and the successor. 25 | * 26 | * @param node Starting node. 27 | * @param successor Ending node. 28 | */ 29 | void connect(const N& node, const N& successor) 30 | { 31 | /// Insert the successor node, if the node does not exist in the graph. 
32 | if (_nodes_and_edges.find(successor) == _nodes_and_edges.end()) { 33 | insert(successor); 34 | } 35 | 36 | /// If the node exists, add the successor into the successor set. 37 | if (auto iterator = _nodes_and_edges.find(node); iterator != _nodes_and_edges.end()) { 38 | iterator->second.insert(std::move(successor)); 39 | } 40 | 41 | /// Otherwise, create a new node with the successor. 42 | else { 43 | _nodes_and_edges.insert(std::make_pair(node, std::unordered_set{ successor })); 44 | } 45 | } 46 | 47 | /** 48 | * @return True, when the graph is empty. 49 | */ 50 | [[nodiscard]] bool empty() const noexcept { return _nodes_and_edges.empty(); } 51 | 52 | /** 53 | * @return The first node that has no incoming edge. 54 | */ 55 | [[nodiscard]] std::optional pop() 56 | { 57 | for (auto& [node, _] : _nodes_and_edges) { 58 | /// Check every node if the node is not a successor. 59 | if (this->is_successor(node) == false) { 60 | 61 | const auto node_without_successor = node; 62 | 63 | /// If the node is not a successor, remove the node and return it. 64 | this->erase(node); 65 | 66 | return node_without_successor; 67 | } 68 | } 69 | 70 | return std::nullopt; 71 | } 72 | 73 | /** 74 | * Checks if the directed graph contains a cycle. 75 | * Uses DFS with three-color approach for optimal O(V + E) performance. 76 | * 77 | * @return True if the graph has a cycle, false otherwise. 78 | */ 79 | [[nodiscard]] bool is_cyclic() const noexcept 80 | { 81 | if (empty()) { 82 | return false; 83 | } 84 | 85 | // Three-color DFS: 0 = white (unvisited), 1 = gray (in current path), 2 = black (finished) 86 | auto node_color = std::unordered_map{}; 87 | 88 | // Initialize all nodes as white 89 | for (const auto& [node, _] : _nodes_and_edges) { 90 | node_color.insert(std::make_pair(node, 0U)); 91 | } 92 | 93 | // Check each unvisited node 94 | for (const auto& [node, _] : _nodes_and_edges) { 95 | if (node_color[node] == 0U && dfs_has_cycle(node, node_color)) { 96 | return true; 97 | } 98 | } 99 | 100 | return false; 101 | } 102 | 103 | private: 104 | /// Map of nodes and their successors. 105 | std::unordered_map> _nodes_and_edges; 106 | 107 | /** 108 | * Checks if the node is in any successor list. 109 | * 110 | * @param node Node to check. 111 | * @return True, of the node is in any successor list. 112 | */ 113 | [[nodiscard]] bool is_successor(const N& node) const noexcept 114 | { 115 | for (const auto& [_, successors] : _nodes_and_edges) { 116 | if (successors.find(node) != successors.end()) { 117 | return true; 118 | } 119 | } 120 | 121 | return false; 122 | } 123 | 124 | /** 125 | * Removes the node from the graph. 126 | * 127 | * @param node Node to remove. 128 | */ 129 | void erase(const N& node) { _nodes_and_edges.erase(node); } 130 | 131 | /** 132 | * DFS helper function for cycle detection. 133 | * 134 | * @param node Current node being explored. 135 | * @param node_color Color map tracking node states. 136 | * @return True if a cycle is found during this DFS traversal. 
137 | */ 138 | [[nodiscard]] bool dfs_has_cycle(const N& node, std::unordered_map& node_color) const noexcept 139 | { 140 | // Mark current node as gray (in current path) 141 | node_color[node] = 1U; 142 | 143 | // Find the node's successors 144 | if (const auto iterator = _nodes_and_edges.find(node); iterator != _nodes_and_edges.end()) { 145 | for (const auto& successor : iterator->second) { 146 | if (node_color[successor] == 1U) { 147 | // Found a back edge (gray node) - cycle detected 148 | return true; 149 | } 150 | if (node_color[successor] == 0U && dfs_has_cycle(successor, node_color)) { 151 | // Recursively check unvisited successors 152 | return true; 153 | } 154 | } 155 | } 156 | 157 | // Mark current node as black (finished processing) 158 | node_color[node] = 2U; 159 | return false; 160 | } 161 | }; 162 | } -------------------------------------------------------------------------------- /include/perfcpp/analyzer/data_type.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | namespace perf::analyzer { 9 | /** 10 | * The DataType projects a data object with members (attributes). 11 | */ 12 | class DataType 13 | { 14 | public: 15 | class Member 16 | { 17 | public: 18 | Member(std::string&& name, const std::size_t offset, const std::size_t size) noexcept 19 | : _name(std::move(name)) 20 | , _offset(offset) 21 | , _size(size) 22 | { 23 | } 24 | ~Member() = default; 25 | 26 | [[nodiscard]] const std::string& name() const noexcept { return _name; } 27 | [[nodiscard]] std::size_t offset() const noexcept { return _offset; } 28 | [[nodiscard]] std::size_t size() const noexcept { return _size; } 29 | [[nodiscard]] const std::vector& samples() const noexcept { return _samples; } 30 | [[nodiscard]] std::vector& samples() noexcept { return _samples; } 31 | 32 | private: 33 | std::string _name; 34 | std::size_t _offset; 35 | std::size_t _size; 36 | 37 | std::vector _samples; 38 | }; 39 | 40 | DataType(std::string&& name, const std::size_t size) 41 | : _name(std::move(name)) 42 | , _size(size) 43 | { 44 | } 45 | DataType(const DataType&) = default; 46 | DataType(DataType&&) noexcept = default; 47 | DataType(std::string&& new_name, const DataType& other) 48 | : _name(std::move(new_name)) 49 | , _size(other._size) 50 | , _members(other._members) 51 | { 52 | } 53 | ~DataType() = default; 54 | 55 | DataType& operator=(const DataType&) = default; 56 | DataType& operator=(DataType&&) noexcept = default; 57 | 58 | /** 59 | * @return Name of the data type. 60 | */ 61 | [[nodiscard]] const std::string& name() const noexcept { return _name; } 62 | 63 | /** 64 | * @return Size of the data type. 65 | */ 66 | [[nodiscard]] std::size_t size() const noexcept { return _size; } 67 | 68 | /** 69 | * @return List of all members. 70 | */ 71 | [[nodiscard]] const std::vector& members() const noexcept { return _members; } 72 | 73 | /** 74 | * @return List of all members. 75 | */ 76 | [[nodiscard]] std::vector& members() noexcept { return _members; } 77 | 78 | /** 79 | * Adds a member with the given name and size to the data type. A member can be, for example, an attribute of the data 80 | * type. 81 | * 82 | * @param member_name Name of the member. 83 | * @param size Size of the member. 
84 | */ 85 | void add(std::string&& member_name, const std::size_t size) 86 | { 87 | if (_members.empty()) { 88 | add(std::move(member_name), 0U, size); 89 | } else { 90 | add(std::move(member_name), _members.back().offset() + _members.back().size(), size); 91 | } 92 | } 93 | 94 | /** 95 | * Adds a member with the given name and size with a specified offset (relative to the beginning of the data object) 96 | * to the data type wit. A member can be, for example, an attribute of the data type. 97 | * 98 | * @param member_name Name of the member. 99 | * @param offset Offset of the member, relative to the data type. 100 | * @param size Size of the member. 101 | */ 102 | void add(std::string&& member_name, const std::size_t offset, const std::size_t size) 103 | { 104 | _members.emplace_back(std::move(member_name), offset, size); 105 | } 106 | 107 | /** 108 | * Adds a member to the data type. Name and size will be derived from the type in the template. 109 | */ 110 | template 111 | void add() 112 | { 113 | add(std::string{ typeid(T).name() }, sizeof(T)); 114 | } 115 | 116 | /** 117 | * Adds a member with a given name to the data type. The Size will be derived from the type in the template. 118 | * 119 | * @param name Name of the member. 120 | */ 121 | template 122 | void add(std::string&& name) 123 | { 124 | add(std::move(name), sizeof(T)); 125 | } 126 | 127 | /** 128 | * Adds a member with a given name to the data type. The Size will be derived from the type in the template. 129 | * 130 | * @param name Name of the member. 131 | */ 132 | template 133 | void add(const std::string& name) 134 | { 135 | add(std::string{ name }, sizeof(T)); 136 | } 137 | 138 | /** 139 | * Adds a member at a specific offset to the data type. Name and size will be derived from the type in the template. 140 | * 141 | * @param offset Offset relative to the data type. 142 | */ 143 | template 144 | void add(const std::size_t offset) 145 | { 146 | add(typeid(T).name(), offset, sizeof(T)); 147 | } 148 | 149 | /** 150 | * Adds a member with a given name at a specific offset to the data type. The ize will be derived from the type in the 151 | * template. 152 | * 153 | * @param name 154 | * @param offset 155 | */ 156 | template 157 | void add(std::string&& name, const std::size_t offset) 158 | { 159 | add(std::move(name), offset, sizeof(T)); 160 | } 161 | 162 | private: 163 | std::string _name; 164 | std::size_t _size; 165 | std::vector _members; 166 | }; 167 | } -------------------------------------------------------------------------------- /examples/sampling/multi_event.cpp: -------------------------------------------------------------------------------- 1 | #include "../access_benchmark.h" 2 | #include "perfcpp/hardware_info.h" 3 | #include "perfcpp/sampler.h" 4 | #include 5 | 6 | int 7 | main() 8 | { 9 | std::cout << "libperf-cpp example: Record perf samples including time, " 10 | "logical memory address, latency, and data source for " 11 | "single-threaded random access to an in-memory array " 12 | "using multiple events as trigger." 13 | << std::endl; 14 | 15 | /// Initialize sampler. 
16 | auto sampler = perf::Sampler{}; 17 | 18 | if (perf::HardwareInfo::is_intel()) { 19 | sampler.trigger(std::vector>{ 20 | { 21 | perf::Sampler::Trigger{ "mem-loads", perf::Precision::RequestZeroSkid, perf::Period{ 8000U } } /// Loads 22 | }, 23 | { perf::Sampler::Trigger{ "mem-stores", perf::Precision::MustHaveZeroSkid, perf::Period{ 8000U } } } /// Stores 24 | }); 25 | } else { 26 | std::cout << "Error: Memory sampling with multiple triggers is not supported on this CPU." << std::endl; 27 | return 1; 28 | } 29 | 30 | /// Define what to sample. 31 | sampler.values().timestamp(true).logical_memory_address(true).data_source(true).latency(true); 32 | 33 | /// Create random access benchmark. 34 | auto benchmark = perf::example::AccessBenchmark{ /*randomize the accesses*/ true, 35 | /* create benchmark of 512 MB */ 512U, 36 | /* also support writing */ true }; 37 | 38 | /// Start sampling. 39 | try { 40 | sampler.start(); 41 | } catch (std::runtime_error& exception) { 42 | std::cerr << exception.what() << std::endl; 43 | return 1; 44 | } 45 | 46 | /// Execute the benchmark (accessing cache lines in a random order). 47 | auto value = 0ULL; 48 | for (auto index = 0U; index < benchmark.size(); ++index) { 49 | value += benchmark[index].value; 50 | 51 | /// Also write a value to get store events. 52 | benchmark.set(index, value); 53 | } 54 | 55 | /// We do not want the compiler to optimize away this (otherwise) unused value. 56 | benchmark.pretend_to_use(value); 57 | 58 | /// Stop sampling. 59 | sampler.stop(); 60 | 61 | /// Get all the recorded samples. 62 | auto samples = sampler.result(/* sort by time */ true); 63 | const auto count_samples_before_filter = samples.size(); 64 | 65 | /// Print the first samples. 66 | const auto count_show_samples = std::min(samples.size(), 40U); 67 | std::cout << "\nRecorded " << count_samples_before_filter << " samples. " << samples.size() 68 | << " remaining after filter." << std::endl; 69 | std::cout << "Here are the first " << count_show_samples << " recorded samples:\n" << std::endl; 70 | for (auto index = 0U; index < count_show_samples; ++index) { 71 | const auto& sample = samples[index]; 72 | 73 | /// Since we recorded the time, period, the instruction pointer, and the CPU 74 | /// id, we can only read these values. 
75 | if (sample.metadata().timestamp().has_value() && sample.data_access().logical_memory_address().has_value() && 76 | sample.data_access().source().has_value()) { 77 | auto data_source = "N/A"; 78 | if (sample.data_access().source()->is_l1_hit()) { 79 | data_source = "L1d"; 80 | } else if (sample.data_access().source()->is_mhb_hit().value_or(false)) { 81 | data_source = "LFB/MAB"; 82 | } else if (sample.data_access().source()->is_l2_hit()) { 83 | data_source = "L2"; 84 | } else if (sample.data_access().source()->is_l3_hit()) { 85 | data_source = "L3"; 86 | } else if (sample.data_access().source()->is_memory_hit()) { 87 | data_source = "RAM"; 88 | } 89 | 90 | auto type = "N/A"; 91 | if (sample.instruction_execution().type().has_value()) { 92 | if (sample.data_access().is_load()) { 93 | type = "Load"; 94 | } else if (sample.data_access().is_store()) { 95 | type = "Store"; 96 | } 97 | } 98 | 99 | const auto instruction_latency = sample.instruction_execution().latency().instruction_retirement().value_or( 100 | sample.instruction_execution().latency().uop_tag_to_retirement().value_or(0U)); 101 | const auto cache_latency = 102 | sample.data_access().latency().cache_miss().value_or(sample.data_access().latency().cache_miss().value_or(0U)); 103 | 104 | std::cout << "Time = " << sample.metadata().timestamp().value() << " | Logical Mem Address = 0x" << std::hex 105 | << sample.data_access().logical_memory_address().value() << std::dec 106 | << " | Latency (cache, instruction) = " << cache_latency << ", " << instruction_latency 107 | << " | Type = " << type << " | Data Source = " << data_source << "\n"; 108 | } else if (sample.count_loss().has_value()) { 109 | std::cout << "Loss = " << sample.count_loss().value() << "\n"; 110 | } 111 | } 112 | std::cout << std::flush; 113 | 114 | /// Close the sampler. 115 | /// Note that the sampler can only be closed after reading the samples. 116 | sampler.close(); 117 | 118 | return 0; 119 | } -------------------------------------------------------------------------------- /docs/sampling-symbols-and-flamegraphs.md: -------------------------------------------------------------------------------- 1 | # Symbols and Flamegraphs 2 | Performance bottlenecks often hide inside deep call stacks: the one slow function that is really stalling your frame rate sits four layers below the code you are looking at. 3 | A flamegraph ([example](https://www.brendangregg.com/flamegraphs.html)) collapses thousands of sampled call-stacks into a single, interactive SVG where the widest bars show the functions that burn the most CPU time. 4 | 5 | *perf‑cpp* now provides two building blocks for flamegraph generation: 6 | - **Symbol resolution**: translate raw instruction pointers into `::+` strings. 7 | - **Collapsed‑stack export**: emit samples in the canonical `func1;func2;func3 ` format understood by tools such as [Brendan Gregg's FlameGraph](https://github.com/brendangregg/FlameGraph), [Speedscope](https://www.speedscope.app/), or [flamegraph.com](https://flamegraph.com/). 8 | 9 | With just a few lines of code you can record samples, resolve symbols, and open a browser to an interactive heat‑map of your code. 
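To make the second building block concrete, here is what a collapsed-stack file looks like: one line per unique call stack, frames separated by semicolons (outermost caller first), followed by a space and the number of samples attributed to that stack. The function names below are purely illustrative, not the output of any specific run:

```
main;run_benchmark;AccessBenchmark::operator[] 184
main;run_benchmark;compute_checksum 97
main;parse_arguments 3
```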
10 | 11 | --- 12 | ## Table of Contents 13 | - [Translating Instruction Pointers into Symbols](#translating-instruction-pointers-into-symbols) 14 | - [Translating Sampler Results into Flame Graphs](#translating-sampler-results-into-flame-graphs) 15 | - [Setting up the Sampler](#setting-up-the-sampler) 16 | - [Generating Flamegraphs](#generating-flamegraphs) 17 | --- 18 | 19 | ## Translating Instruction Pointers into Symbols 20 | The `perf::SymbolResolver` translates logical instruction pointers into symbols (i.e., the name of the module, the name of the function, and the offset within that function). 21 | 22 | ```cpp 23 | #include 24 | #include 25 | 26 | auto sampler = perf::Sampler{ }; 27 | sampler.trigger("cycles", perf::Precision::RequestZeroSkid, perf::Period{ 50000U }); 28 | sampler.values().instruction_pointer(true); 29 | 30 | sampler.start(); 31 | /// Run some code 32 | sampler.stop(); 33 | 34 | auto symbol_resolver = perf::SymbolResolver{}; 35 | 36 | for (const auto& sample : sampler.result()) { 37 | const auto instruction_pointer = sample.instruction_execution().logical_instruction_pointer(); 38 | if (instruction_pointer.has_value()) { 39 | 40 | /// Resolve the symbol. 41 | const auto symbol = symbol_resolver.resolve(instruction_pointer.value()); 42 | 43 | /// Translate the symbol into a string. 44 | const auto symbol_name = symbol.has_value() ? symbol->to_string() : std::string{"??"}; 45 | 46 | std::cout << "Instruction Pointer = 0x" << std::hex 47 | << instruction_pointer.value() << std::dec 48 | << " | Symbol = " << symbol_name 49 | << "\n"; 50 | } 51 | } 52 | ``` 53 | 54 | The output could look like the following: 55 | 56 | ```bash 57 | Instruction Pointer = 0x57459be95faf | Symbol = [instruction-pointer-sampling] _ZNK4perf7example15AccessBenchmarkixEm+47 58 | Instruction Pointer = 0x57459be95faf | Symbol = [instruction-pointer-sampling] _ZNK4perf7example15AccessBenchmarkixEm+47 59 | Instruction Pointer = 0x57459be987d0 | Symbol = [instruction-pointer-sampling] _ZNKSt6vectorIN4perf7example15AccessBenchmark10cache_lineESaIS3_EEixEm+0 60 | ``` 61 | 62 | **→ [See a practical example](../examples/sampling/instruction_pointer.cpp)** 63 | 64 | ### Translating Sampler Results into Flame Graphs 65 | 66 | #### Setting up the Sampler 67 | To generate flamegraphs, we need to include 68 | - the (logical) *instruction pointer* (to identify the leaf frame) 69 | - and the *callchain* (to reconstruct the stack) 70 | 71 | in the samples. 72 | For more condensed outputs, it is also recommended to include the *timestamp* and sort the results afterward. 73 | 74 | ```cpp 75 | #include 76 | 77 | auto sampler = perf::Sampler{ }; 78 | sampler.trigger("cycles"); 79 | sampler.values() 80 | .instruction_pointer(true) 81 | .callchain(true) 82 | .timestamp(true); 83 | ``` 84 | 85 | #### Generating Flamegraphs 86 | After sampling, the `perf::analyzer::FlameGraphGenerator` can map the samples into a format that can be read by commonly used flamegraph generators: 87 | 88 | ```cpp 89 | #include 90 | 91 | sampler.start(); 92 | /// Code to sample will be called here... 93 | sampler.stop(); 94 | 95 | /// Get all the recorded samples and sort for condensed outputs 96 | /// (sorting via `true` flag is optional). 97 | const auto samples = sampler.result(/*sort = */ true); 98 | 99 | /// Translate into a flame graph format and write the result to "flamegraphs.txt".
100 | auto flame_graph_generator = perf::analyzer::FlameGraphGenerator{}; 101 | flame_graph_generator.map(samples, "flamegraphs.txt"); 102 | ``` 103 | 104 | After writing the output, we can use that file as an input to flamegraph generators, for example: 105 | - [Brendan Gregg's FlameGraph](https://github.com/brendangregg/FlameGraph): Download the project and translate `flamegraphs.txt` into an SVG via `./flamegraph.pl flamegraphs.txt > flamegraphs.svg` 106 | - [flamegraph.com](https://flamegraph.com/): Upload the `flamegraphs.txt` 107 | - [Speedscope](https://www.speedscope.app/): Upload the `flamegraphs.txt` 108 | 109 | **→ [See full example](../examples/sampling/flame_graph.cpp)** -------------------------------------------------------------------------------- /examples/sampling/memory_address.cpp: -------------------------------------------------------------------------------- 1 | #include "../access_benchmark.h" 2 | #include "perfcpp/hardware_info.h" 3 | #include "perfcpp/sampler.h" 4 | #include 5 | 6 | int 7 | main() 8 | { 9 | std::cout << "libperf-cpp example: Record perf samples including time, " 10 | "logical memory address, latency, and data source for " 11 | "single-threaded random access to an in-memory array." 12 | << std::endl; 13 | 14 | /// Initialize sampler. 15 | auto sampler = perf::Sampler{}; 16 | 17 | /// Setup which counters trigger the writing of samples (depends on the underlying hardware substrate). 18 | if (perf::HardwareInfo::is_amd_ibs_supported()) { 19 | sampler.trigger("ibs_op_uops", perf::Precision::MustHaveZeroSkid, perf::Period{ 4000U }); 20 | } else if (perf::HardwareInfo::is_intel()) { 21 | sampler.trigger("mem-loads", perf::Precision::MustHaveZeroSkid, perf::Period{ 4000U }); 22 | } else { 23 | std::cout << "Error: Memory sampling is not supported on this CPU." << std::endl; 24 | return 1; 25 | } 26 | 27 | /// Setup which data will be included into samples (timestamp, virtual memory address, data source like L1d or RAM, 28 | /// and latency). 29 | sampler.values().timestamp(true).logical_memory_address(true).data_source(true).latency(true); 30 | 31 | /// Create random access benchmark. 32 | auto benchmark = perf::example::AccessBenchmark{ /*randomize the accesses*/ true, 33 | /* create benchmark of 1024 MB */ 1024U }; 34 | 35 | /// Start sampling. 36 | try { 37 | sampler.start(); 38 | } catch (std::runtime_error& exception) { 39 | std::cerr << exception.what() << std::endl; 40 | return 1; 41 | } 42 | 43 | /// Execute the benchmark (accessing cache lines in a random order). 44 | auto value = 0ULL; 45 | for (auto index = 0U; index < benchmark.size(); ++index) { 46 | value += benchmark[index].value; 47 | } 48 | 49 | /// We do not want the compiler to optimize away this (otherwise) unused value (and consequently the loop above). 50 | benchmark.pretend_to_use(value); 51 | 52 | /// Stop sampling. 53 | sampler.stop(); 54 | 55 | /// Get all the recorded samples. 56 | auto samples = sampler.result(); 57 | const auto count_samples_before_filter = samples.size(); 58 | 59 | /// Filter out samples without data source (AMD samples all instructions, not only data-related). 60 | samples.erase(std::remove_if(samples.begin(), 61 | samples.end(), 62 | [](const auto& sample) { 63 | return sample.count_loss().has_value() || !sample.data_access().source().has_value() || 64 | sample.data_access().logical_memory_address().value_or(0U) == 0U; 65 | }), 66 | samples.end()); 67 | 68 | /// Print the first samples. 
69 | const auto count_show_samples = std::min(samples.size(), 40U); 70 | std::cout << "\nRecorded " << count_samples_before_filter << " samples. " << samples.size() 71 | << " remaining after filter." << std::endl; 72 | std::cout << "Here are the first " << count_show_samples << " recorded samples:\n" << std::endl; 73 | for (auto index = 0U; index < count_show_samples; ++index) { 74 | const auto& sample = samples[index]; 75 | 76 | /// Since we recorded the time, period, the instruction pointer, and the CPU 77 | /// id, we can only read these values. 78 | auto data_source = "N/A"; 79 | if (sample.data_access().source()->is_l1_hit()) { 80 | data_source = "L1d"; 81 | } else if (sample.data_access().source()->is_mhb_hit().value_or(false)) { 82 | data_source = "LFB/MAB"; 83 | } else if (sample.data_access().source()->is_l2_hit()) { 84 | data_source = "L2"; 85 | } else if (sample.data_access().source()->is_l3_hit()) { 86 | data_source = "L3"; 87 | } else if (sample.data_access().source()->is_memory_hit()) { 88 | data_source = "RAM"; 89 | } 90 | 91 | auto instruction_latency = 0ULL; 92 | auto cache_latency = 0ULL; 93 | 94 | if (perf::HardwareInfo::is_intel()) { 95 | instruction_latency = sample.instruction_execution().latency().instruction_retirement().value_or(0U); 96 | cache_latency = sample.data_access().latency().cache_access().value_or(0U); 97 | } else if (perf::HardwareInfo::is_amd()) { 98 | instruction_latency = sample.instruction_execution().latency().uop_tag_to_retirement().value_or(0U); 99 | cache_latency = sample.data_access().latency().cache_miss().value_or(0U); 100 | } 101 | 102 | std::cout << "Time = " << sample.metadata().timestamp().value_or(0U) << " | Logical Mem Address = 0x" << std::hex 103 | << sample.data_access().logical_memory_address().value() << std::dec 104 | << " | Latency (cache, instruction) = " << cache_latency << ", " << instruction_latency 105 | << " | Is Load = " << sample.data_access().is_load() << " | Data Source = " << data_source << "\n"; 106 | } 107 | std::cout << std::flush; 108 | 109 | /// Close the sampler. 110 | /// Note that the sampler can only be closed after reading the samples. 111 | sampler.close(); 112 | 113 | return 0; 114 | } -------------------------------------------------------------------------------- /examples/statistics/multi_process.cpp: -------------------------------------------------------------------------------- 1 | #include "../access_benchmark.h" 2 | #include "perfcpp/event_counter.h" 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | int 9 | main() 10 | { 11 | std::cout << "libperf-cpp example: Record performance counter for " 12 | "random access to an in-memory array per process." 13 | << std::endl; 14 | std::cout << "We will record the counters per process and merge the results " 15 | "afterwards." 16 | << std::endl; 17 | 18 | /// Create random access benchmark. 19 | auto benchmark = perf::example::AccessBenchmark{ /*randomize the accesses*/ true, 20 | /* create benchmark of 1024 MB */ 1024U }; 21 | 22 | /// One event_counter instance for every thread. 23 | constexpr auto count_threads = 2U; 24 | const auto items_per_thread = benchmark.size() / count_threads; 25 | auto threads = std::vector{}; 26 | auto thread_local_results = 27 | std::vector(count_threads, 0U); /// Array to store the thread-local results. 28 | 29 | /// process ids to record performance counters. 30 | auto process_ids = std::vector{}; 31 | process_ids.resize(count_threads); 32 | 33 | /// Barrier for the threads to wait. 
34 | auto thread_barrier = std::atomic{ false }; 35 | 36 | /// Barrier for main thread to wait until threads have written their process ids. 37 | auto written_pid_counter = std::atomic{ 0U }; 38 | 39 | for (auto thread_index = 0U; thread_index < count_threads; ++thread_index) { 40 | threads.emplace_back([thread_index, 41 | items_per_thread, 42 | &thread_local_results, 43 | &benchmark, 44 | &process_ids, 45 | &thread_barrier, 46 | &written_pid_counter]() { 47 | auto local_value = 0ULL; 48 | 49 | /// Store the process id for creating performance counters on that thread. 50 | /// To the best of our knowledge, there is no other way to get the (linux) tid/pid of an std::thread :-(. 51 | process_ids[thread_index] = gettid(); 52 | 53 | /// Notify the main thread that the pid/tid was written by this thread. 54 | written_pid_counter.fetch_add(1U); 55 | 56 | /// Wait for the barrier to become "true". 57 | while (!thread_barrier) 58 | ; 59 | 60 | /// Process the data. 61 | for (auto index = 0U; index < items_per_thread; ++index) { 62 | local_value += benchmark[(thread_index * items_per_thread) + index].value; 63 | } 64 | 65 | thread_local_results[thread_index] = local_value; 66 | }); 67 | } 68 | 69 | /// Wait for all threads to write their pid/tid. 70 | while (written_pid_counter < count_threads) 71 | ; 72 | 73 | /// Create process ids to watch. 74 | std::cout << "Creating counters for Processes: "; 75 | for (const auto pid : process_ids) { 76 | std::cout << pid << " "; 77 | } 78 | std::cout << std::endl; 79 | 80 | /// Initialize performance counters. 81 | auto multi_cpu_event_counter = perf::MultiProcessEventCounter{ std::move(process_ids) }; 82 | 83 | /// Add all the performance counters we want to record. 84 | try { 85 | multi_cpu_event_counter.add({ "instructions", 86 | "cycles", 87 | "branches", 88 | "cache-misses", 89 | "dTLB-miss-ratio", 90 | "L1-data-miss-ratio", 91 | "cycles-per-instruction" }); 92 | } catch (std::runtime_error& e) { 93 | std::cerr << e.what() << std::endl; 94 | return 1; 95 | } 96 | 97 | /// Start recording performance counter. 98 | /// In contrast to the inherit-thread example (see inherit_thread.cpp), we 99 | /// will record the performance counters on each thread on every core. 100 | try { 101 | multi_cpu_event_counter.start(); 102 | } catch (std::runtime_error& exception) { 103 | std::cerr << exception.what() << std::endl; 104 | return 1; 105 | } 106 | 107 | /// Let threads start. 108 | thread_barrier = true; 109 | 110 | /// Wait for all threads to finish. 111 | for (auto& thread : threads) { 112 | thread.join(); 113 | } 114 | 115 | /// Stop performance counter recording. 116 | multi_cpu_event_counter.stop(); 117 | 118 | /// Add up the results so that the compiler does not get the idea of 119 | /// optimizing away the accesses. 120 | auto value = std::accumulate(thread_local_results.begin(), thread_local_results.end(), 0UL); 121 | 122 | /// We do not want the compiler to optimize away this (otherwise) unused value. 123 | benchmark.pretend_to_use(value); 124 | 125 | /// Get the result (normalized per cache line) from the 126 | /// multithread_event_counter. 127 | auto result = multi_cpu_event_counter.result(benchmark.size()); 128 | 129 | /// Print the performance counters. 
130 | std::cout << "\nResults:\n"; 131 | for (const auto& [counter_name, counter_value] : result) { 132 | std::cout << counter_value << " " << counter_name << " / cache line" << std::endl; 133 | } 134 | 135 | return 0; 136 | } 137 | -------------------------------------------------------------------------------- /src/requested_event.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | bool 4 | perf::RequestedEventSet::add(const perf::RequestedEvent& event) 5 | { 6 | /// If the event is not already added (in that case adjust_visibility_if_present() will return false), add it. 7 | /// If the event is already in the set, adjust_visibility_if_present() will adjust the visibility to true, if 8 | /// is_shown_in_results is true. 9 | if (!this->adjust_visibility_if_present(event.pmu_name(), event.event_name(), event.is_shown_in_results())) { 10 | this->_requested_events.push_back(event); 11 | return true; 12 | } 13 | 14 | return false; 15 | } 16 | 17 | bool 18 | perf::RequestedEventSet::adjust_visibility_if_present(const std::optional pmu_name, 19 | const std::string_view event_name, 20 | const bool is_shown_in_results) 21 | { 22 | auto iterator = std::find_if( 23 | this->_requested_events.begin(), this->_requested_events.end(), [pmu_name, event_name](const auto& event) { 24 | return event.pmu_name() == pmu_name && event.event_name() == event_name; 25 | }); 26 | 27 | /// If the event is not in the set, notify the caller that the event needs to be added. 28 | if (iterator == this->_requested_events.end()) { 29 | return false; 30 | } 31 | 32 | /// If the event should be included into the results, mark it accordingly. 33 | if (is_shown_in_results) { 34 | iterator->is_shown_in_results(true); 35 | } 36 | 37 | return true; 38 | } 39 | 40 | perf::CounterResult 41 | perf::RequestedEventSet::result(const perf::CounterDefinition& counter_definition, 42 | perf::CounterResult&& hardware_events_result, 43 | const std::uint64_t normalization) const 44 | { 45 | /// Combine all hardware events and metrics into a single result, showing only the requested events and metrics, in 46 | /// the requested order. Accordingly, we need to calculate the metrics first, using the given hardware events. 47 | /// However, since metrics can be referenced recursively (metric_a uses metric_b), we need to resolve the metrics in a 48 | /// specific order (metric_b before metric_a in this example). To do so, we calculate a metric dependency graph first 49 | /// and calculate metrics without dependencies, until all metrics are calculated. 50 | auto metric_graph = this->build_metric_graph(counter_definition); 51 | 52 | /// Check if the metric graph has a cycle. In that case, we cannot evaluate the metrics. 53 | if (metric_graph.is_cyclic()) { 54 | throw CannotEvaluateMetricsBecauseOfCycleError{}; 55 | } 56 | 57 | /// Walk through the metric graph, removing one metric without dependencies ata time. 58 | while (!metric_graph.empty()) { 59 | 60 | /// Get metric without un-calculated dependency. 61 | if (const auto metric_name = metric_graph.pop(); metric_name.has_value()) { 62 | if (auto metric = counter_definition.metric(metric_name.value()); metric.has_value()) { 63 | 64 | /// Calculate the metric. 65 | if (const auto calculated_metric_value = std::get<1>(metric.value()).calculate(hardware_events_result); 66 | calculated_metric_value.has_value()) { 67 | 68 | /// Add it to the results. 
69 | hardware_events_result.emplace_back(metric_name.value(), calculated_metric_value.value()); 70 | } 71 | } 72 | } 73 | } 74 | 75 | auto event_results = std::vector>{}; 76 | event_results.reserve(this->_requested_events.size()); 77 | 78 | /// Transform hardware events (now also containing metric results) into a set that is ordered as dictated by the 79 | /// requested events. 80 | for (const auto& requested_event : this->_requested_events) { 81 | if (requested_event.is_shown_in_results()) { 82 | if (const auto result = hardware_events_result.get(requested_event.event_name()); result.has_value()) { 83 | 84 | /// Normalize hardware and time events. 85 | if (requested_event.is_hardware_event() || requested_event.is_time_event()) { 86 | event_results.emplace_back(requested_event.event_name(), result.value() / static_cast<double>(normalization)); 87 | } 88 | 89 | /// Add metrics without normalization. 90 | else { 91 | event_results.emplace_back(requested_event.event_name(), result.value()); 92 | } 93 | } 94 | } 95 | } 96 | 97 | return CounterResult{ std::move(event_results) }; 98 | } 99 | 100 | perf::util::DirectedGraph 101 | perf::RequestedEventSet::build_metric_graph(const perf::CounterDefinition& counter_definition) const 102 | { 103 | auto metric_graph = util::DirectedGraph{}; 104 | for (const auto& requested_event : this->_requested_events) { 105 | if (const auto metric = counter_definition.metric(requested_event.event_name()); metric.has_value()) { 106 | 107 | /// Add the metric as a node to the graph. 108 | metric_graph.insert(requested_event.event_name()); 109 | 110 | /// Add an edge for every dependent metric: dependent_metric -> metric 111 | for (const auto& dependency : std::get<1>(metric.value()).required_counter_names()) { 112 | if (const auto dependent_metric = counter_definition.metric(dependency); dependent_metric.has_value()) { 113 | metric_graph.connect(std::get<0>(dependent_metric.value()), requested_event.event_name()); 114 | } 115 | } 116 | } 117 | } 118 | 119 | return metric_graph; 120 | } -------------------------------------------------------------------------------- /examples/sampling/branch.cpp: -------------------------------------------------------------------------------- 1 | #include "../access_benchmark.h" 2 | #include "perfcpp/sampler.h" 3 | #include 4 | 5 | /** 6 | * A function with multiple branches that are hard for the compiler to optimize, 7 | * used to demonstrate branch sampling. 8 | * 9 | * @param cache_line Cache line to use as an input. 10 | * @return Another value through a handful of branches. 11 | */ 12 | [[nodiscard]] std::uint64_t 13 | branchy_function(const perf::example::AccessBenchmark::cache_line& cache_line); 14 | 15 | int 16 | main() 17 | { 18 | 19 | std::cout << "libperf-cpp example: Record perf branch samples for " 20 | "single-threaded sequential access to an in-memory array." 21 | << std::endl; 22 | 23 | /// Initialize sampler. 24 | auto sampler = perf::Sampler{}; 25 | 26 | /// Setup which counters trigger the writing of samples. 27 | sampler.trigger("cycles", perf::Precision::AllowArbitrarySkid, perf::Period{ 1000000U }); 28 | 29 | /// Setup which data will be included into samples (timestamp and stack of branches). 30 | sampler.values().timestamp(true).branch_stack( 31 | { perf::BranchType::User, perf::BranchType::Conditional }) /// Only sample conditional branches in user-mode. 32 | ; 33 | 34 | /// Create sequential access benchmark.
35 | auto benchmark = perf::example::AccessBenchmark{ /*sequential accesses*/ false, 36 | /* create benchmark of 512 MB */ 512U }; 37 | 38 | /// Start sampling. 39 | try { 40 | sampler.start(); 41 | } catch (std::runtime_error& exception) { 42 | std::cerr << exception.what() << std::endl; 43 | return 1; 44 | } 45 | 46 | /// Execute the benchmark (accessing cache lines in sequential order). 47 | auto value = 0ULL; 48 | for (auto index = 0U; index < benchmark.size(); ++index) { 49 | value += branchy_function(benchmark[index]); 50 | } 51 | 52 | /// We do not want the compiler to optimize away this (otherwise) unused value (and consequently the loop above). 53 | benchmark.pretend_to_use(value); 54 | 55 | /// Stop sampling. 56 | sampler.stop(); 57 | 58 | /// Get all the recorded samples. 59 | const auto samples = sampler.result(); 60 | 61 | /// Print the first samples. 62 | const auto count_show_samples = std::min(samples.size(), 10U); 63 | std::cout << "\nRecorded " << samples.size() << " samples." << std::endl; 64 | std::cout << "Here are the first " << count_show_samples << " recorded samples:\n" << std::endl; 65 | 66 | for (auto index = 0U; index < count_show_samples; ++index) { 67 | const auto& sample = samples[index]; 68 | 69 | /// Since we recorded the timestamp and the branch stack, we can 70 | /// read only these values. 71 | if (sample.metadata().timestamp().has_value() && sample.branch_stack().has_value()) { 72 | std::cout << "Time = " << sample.metadata().timestamp().value() << "\n"; 73 | for (const auto& branch : sample.branch_stack().value()) { 74 | std::cout << "\tpredicted correct = " << branch.is_predicted() << " | from instruction 0x" << std::hex 75 | << branch.instruction_pointer_from() << std::dec << " | to instruction 0x" << std::hex 76 | << branch.instruction_pointer_to() << std::dec; 77 | if (branch.cycles().has_value()) { 78 | std::cout << " | cycles = " << branch.cycles().value(); 79 | } 80 | 81 | std::cout << "\n"; 82 | } 83 | } 84 | } 85 | std::cout << std::flush; 86 | 87 | /// Close the sampler. 88 | /// Note that the sampler can only be closed after reading the samples.
89 | sampler.close(); 90 | 91 | return 0; 92 | } 93 | 94 | std::uint64_t 95 | branchy_function(const perf::example::AccessBenchmark::cache_line& cache_line) 96 | { 97 | auto result = cache_line.value; 98 | 99 | for (auto i = 0U; i < 10U; ++i) { 100 | switch ((cache_line.value >> (4U * i)) & 0xF) { // Extract 4 bits at a time 101 | case 0ULL: 102 | result += cache_line.value * (i + 1U); 103 | break; 104 | case 1ULL: 105 | result -= cache_line.value / (i + 2U); 106 | break; 107 | case 2ULL: 108 | result *= cache_line.value + (i * 3U); 109 | break; 110 | case 3ULL: 111 | result /= (cache_line.value - i) | 1U; 112 | break; // Avoid division by zero 113 | case 4ULL: 114 | result ^= cache_line.value << i; 115 | break; 116 | case 5ULL: 117 | result %= (cache_line.value >> i) | 1U; 118 | break; 119 | case 6ULL: 120 | result = ~result; 121 | break; 122 | case 7ULL: 123 | result &= cache_line.value | (std::uint64_t(0xFF) << (i * 8U)); 124 | break; 125 | case 8ULL: 126 | result |= cache_line.value & (std::uint64_t(0xFFFF) << (i * 16U)); 127 | break; 128 | case 9ULL: 129 | result >>= cache_line.value % (i + 1); 130 | break; 131 | case 10ULL: 132 | result <<= cache_line.value % (i + 2); 133 | break; 134 | case 11ULL: 135 | result += cache_line.value + i * 7; 136 | break; 137 | case 12ULL: 138 | result -= cache_line.value - i * 11; 139 | break; 140 | case 13ULL: 141 | result *= cache_line.value * (i + 5); 142 | break; 143 | case 14ULL: 144 | result /= (cache_line.value / (i + 3)) | 1; 145 | break; 146 | case 15ULL: 147 | result ^= cache_line.value ^ (i * 13); 148 | break; 149 | default: 150 | result = cache_line.value; 151 | } 152 | } 153 | return result; 154 | } -------------------------------------------------------------------------------- /docs/analyzing-memory-access-patterns.md: -------------------------------------------------------------------------------- 1 | # Analyzing Memory Access Patterns of Data Structures 2 | 3 | Modern applications often contain multiple instances of complex data structures, making it challenging to analyze their memory access patterns. 4 | While tools like Linux Perf and Intel VTune excel at identifying resource-intensive instructions, they cannot differentiate between different instances of the same data structure sharing identical code - for example, different nodes within a tree structure experiencing varying access patterns. 5 | 6 | *perf-cpp* addresses this limitation through its **Memory Access Analyzer** component, which works in conjunction with memory-based sampling ([detailed in the sampling documentation](sampling.md)). 7 | The Memory Access Analyzer helps identify which specific memory addresses experience high access latency by: 8 | 9 | * Mapping samples to individual data object instances 10 | * Generating detailed access statistics including cache hits/misses, TLB performance, and average latency metrics 11 | 12 | → [For a practical implementation, check out our random-access-benchmark example.](../examples/sampling/memory_access_analyzer.cpp) 13 | 14 | --- 15 | ## Table of Contents 16 | - [Describing Data Types](#step-1-describing-data-types) 17 | - [Registering Data Type Instances](#step-2-registering-data-type-instances) 18 | - [Mapping Samples to Data Type Instances](#step-3-mapping-samples-to-data-type-instances) 19 | - [Processing the Result](#step-4-processing-the-result) 20 | --- 21 | 22 | ## Step 1: Describing Data Types 23 | The **Memory Access Analyzer** requires information about the structure of your data types. 
24 | Let's walk through an example using a binary tree node: 25 | ```cpp 26 | class BinaryTreeNode { 27 | std::uint64_t value; 28 | BinaryTreeNode* left_child; 29 | BinaryTreeNode* right_child; 30 | }; 31 | ``` 32 | 33 | To analyze this structure, create a `perf::analyzer::DataType` definition: 34 | 35 | ```cpp 36 | #include 37 | 38 | auto binary_tree_node = perf::analyzer::DataType{"BinaryTreeNode", sizeof(BinaryTreeNode)}; 39 | binary_tree_node.add("value", sizeof(std::uint64_t)); /// Describe the "value" attribute. 40 | binary_tree_node.add("left_child", sizeof(BinaryTreeNode*)); /// Describe the "left_child" attribute. 41 | binary_tree_node.add("right_child", sizeof(BinaryTreeNode*)); /// Describe the "right_child" attribute. 42 | ``` 43 | 44 | > [!TIP] 45 | > For accurate size and offset information, you can use [**pahole**](https://linux.die.net/man/1/pahole). See [Paramoud Kumbhar's detailed guide](https://pramodkumbhar.com/2023/11/pahole-to-analyz-data-structure-memory-layouts-with-ease/) for usage instructions. 46 | 47 | ## Step 2: Registering Data Type Instances 48 | Since each instance of a data structure may exhibit different access patterns, the Memory Access Analyzer needs to track individual instances. 49 | Here's how to register them: 50 | 51 | ```cpp 52 | #include 53 | auto memory_access_analyzer = perf::analyzer::MemoryAccess{}; 54 | 55 | /// Expose the data type to the Analyzer. 56 | memory_access_analyzer.add(std::move(binary_tree_node)); 57 | 58 | /// Expose memory addresses to the Analyzer. 59 | for (auto* node : tree->nodes()) { 60 | /// The first argument is the name describing the data type. 61 | /// The second argument is a pointer to the instance. 62 | memory_access_analyzer.annotate("BinaryTreeNode", node); 63 | } 64 | ``` 65 | 66 | ## Step 3: Mapping Samples to Data Type Instances 67 | To collect memory access data, use *perf-cpp*'s [sampling mechanism](sampling.md) with the following key requirements: 68 | * Include logical memory addresses 69 | * Capture data source information 70 | * Record latency data ("weight") 71 | * Use a memory-address-capable sample trigger (e.g., `mem-loads` on Intel, `ibs_op` on AMD – see the [documentation](sampling.md#specific-notes-for-different-cpu-vendors)) 72 | 73 | ```cpp 74 | #include 75 | #include 76 | 77 | auto sampler = perf::Sampler{}; 78 | 79 | /// Set trigger that enables memory sampling. 80 | sampler.trigger("mem-loads", perf::Precision::MustHaveZeroSkid, perf::Period{ 1000U }); 81 | 82 | /// Include addresses, data source, and latency. 83 | sampler.values() 84 | .logical_memory_address(true) 85 | .data_src(true) 86 | .weight_struct(true); 87 | 88 | /// Run the workload while recording samples. 89 | sampler.start(); 90 | ///... execute .... 91 | sampler.stop(); 92 | 93 | /// Get the samples and map to described and registered data types and instances. 94 | const auto samples = sampler.result(); 95 | const auto result = memory_access_analyzer.map(samples); 96 | ``` 97 | 98 | ## Step 4: Processing the Result 99 | The analyzer generates detailed statistics for each data type attribute. 
100 | To view the results: 101 | ```cpp 102 | std::cout << result.to_string() << std::endl; 103 | ``` 104 | 105 | Example output: 106 | 107 | ```bash 108 | DataType BinaryTreeNode (24B) { 109 | | loads | cache hits | RAM hits | TLB | stores 110 | samples | count latency | L1d LFB L2 L3 | local remote | L1 hits L2 hits misses | count latency 111 | 0: value (8B) 373 | 373 439 | 154 0 0 7 | 212 0 | 190 5 178 | 0 0 112 | 8: left_child (8B) 146 | 146 720 | 1 0 0 5 | 140 0 | 12 18 116 | 0 0 113 | 16: right_child (8B) 528 | 528 173 | 393 0 1 14 | 120 0 | 415 4 109 | 0 0 114 | } 115 | ``` 116 | 117 | The output shows: 118 | * Attribute details (offset, name, size) 119 | * Sample counts 120 | * Detailed performance metrics per attribute 121 | 122 | For further analysis, export the results in structured formats: 123 | 124 | ```cpp 125 | result.to_json(); /// JSON format 126 | result.to_csv(); /// CSV format 127 | ``` -------------------------------------------------------------------------------- /test/requested_event.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | TEST_CASE("empty RequestedEventSet", "[RequestedEventSet]") 7 | { 8 | auto event_set = perf::RequestedEventSet{}; 9 | auto counter_definition = perf::CounterDefinition{}; 10 | 11 | SECTION("newly created event set is empty") 12 | { 13 | REQUIRE(event_set.empty()); 14 | REQUIRE(event_set.size() == 0U); 15 | REQUIRE(event_set.begin() == event_set.end()); 16 | } 17 | 18 | SECTION("event set with reserved capacity is still empty") 19 | { 20 | auto event_set_with_capacity = perf::RequestedEventSet{ 10U }; 21 | 22 | REQUIRE(event_set_with_capacity.empty()); 23 | REQUIRE(event_set_with_capacity.size() == 0U); 24 | REQUIRE(event_set_with_capacity.begin() == event_set_with_capacity.end()); 25 | } 26 | 27 | SECTION("result with empty hardware_events_result should be empty") 28 | { 29 | auto empty_hardware_result = perf::CounterResult{}; 30 | 31 | auto result = event_set.result(counter_definition, std::move(empty_hardware_result), 1U); 32 | 33 | REQUIRE(result.begin() == result.end()); 34 | REQUIRE_FALSE(result.get("instructions").has_value()); 35 | REQUIRE_FALSE(result.get("cycles").has_value()); 36 | } 37 | 38 | SECTION("result with non-empty hardware_events_result but no requested events should be empty") 39 | { 40 | auto hardware_result = 41 | perf::CounterResult{ std::vector>{ std::make_pair("instructions", 1000.0) } }; 42 | 43 | auto result = event_set.result(counter_definition, std::move(hardware_result), 1U); 44 | 45 | REQUIRE(result.begin() == result.end()); 46 | REQUIRE_FALSE(result.get("instructions").has_value()); 47 | REQUIRE_FALSE(result.get("cycles").has_value()); 48 | } 49 | 50 | SECTION("result with reserved capacity - empty hardware_events_result should be empty") 51 | { 52 | auto event_set_with_capacity = perf::RequestedEventSet{ 10U }; 53 | auto empty_hardware_result = perf::CounterResult{}; 54 | 55 | auto result = event_set_with_capacity.result(counter_definition, std::move(empty_hardware_result), 1U); 56 | 57 | REQUIRE(result.begin() == result.end()); 58 | REQUIRE_FALSE(result.get("instructions").has_value()); 59 | REQUIRE_FALSE(result.get("cycles").has_value()); 60 | } 61 | 62 | SECTION("result with reserved capacity - non-empty hardware_events_result but no requested events should be empty") 63 | { 64 | auto event_set_with_capacity = perf::RequestedEventSet{ 10U }; 65 | auto hardware_result = 66 | perf::CounterResult{ 
std::vector>{ std::make_pair("instructions", 1000.0) } }; 67 | 68 | auto result = event_set_with_capacity.result(counter_definition, std::move(hardware_result), 1U); 69 | 70 | REQUIRE(result.begin() == result.end()); 71 | REQUIRE_FALSE(result.get("instructions").has_value()); 72 | REQUIRE_FALSE(result.get("cycles").has_value()); 73 | } 74 | } 75 | 76 | TEST_CASE("RequestedEventSet with requested events", "[RequestedEventSet]") 77 | { 78 | auto counter_definition = perf::CounterDefinition{}; 79 | 80 | SECTION("result contains requested event when present in hardware result") 81 | { 82 | auto event_set = perf::RequestedEventSet{}; 83 | 84 | /// Add the "instructions" event as a hardware event 85 | REQUIRE( 86 | event_set.add(perf::RequestedEvent{ "cpu", "instructions", true, perf::RequestedEvent::Type::HardwareEvent })); 87 | 88 | /// Verify the event set is no longer empty 89 | REQUIRE_FALSE(event_set.empty()); 90 | REQUIRE(event_set.size() == 1U); 91 | 92 | /// Create hardware result containing instructions 93 | auto hardware_result = 94 | perf::CounterResult{ std::vector>{ std::make_pair("instructions", 1000.0) } }; 95 | 96 | auto result = event_set.result(counter_definition, std::move(hardware_result), 1U); 97 | 98 | /// The result should contain the requested instructions event 99 | REQUIRE(result.get("instructions").has_value()); 100 | REQUIRE(result.get("instructions").value() == 1000.0); 101 | 102 | /// But should not contain unrequested events 103 | REQUIRE_FALSE(result.get("cycles").has_value()); 104 | 105 | /// Verify iterator access 106 | REQUIRE(result.begin() != result.end()); 107 | auto it = result.begin(); 108 | REQUIRE(it->first == "instructions"); 109 | REQUIRE(it->second == 1000.0); 110 | ++it; 111 | REQUIRE(it == result.end()); 112 | } 113 | 114 | SECTION("result is empty when requested event not in hardware result") 115 | { 116 | auto event_set = perf::RequestedEventSet{}; 117 | 118 | /// Add the "instructions" event as a hardware event 119 | REQUIRE( 120 | event_set.add(perf::RequestedEvent{ "cpu", "instructions", true, perf::RequestedEvent::Type::HardwareEvent })); 121 | 122 | /// Create hardware result NOT containing instructions 123 | auto hardware_result = 124 | perf::CounterResult{ std::vector>{ std::make_pair("cycles", 2000.0) } }; 125 | 126 | auto result = event_set.result(counter_definition, std::move(hardware_result), 1U); 127 | 128 | /// The result should be empty since requested event is not in hardware result 129 | REQUIRE(result.begin() == result.end()); 130 | REQUIRE_FALSE(result.get("instructions").has_value()); 131 | REQUIRE_FALSE(result.get("cycles").has_value()); 132 | } 133 | 134 | SECTION("result respects normalization for hardware events") 135 | { 136 | auto event_set = perf::RequestedEventSet{}; 137 | 138 | /// Add the "instructions" event as a hardware event 139 | REQUIRE( 140 | event_set.add(perf::RequestedEvent{ "cpu", "instructions", true, perf::RequestedEvent::Type::HardwareEvent })); 141 | 142 | /// Create hardware result containing instructions 143 | auto hardware_result = 144 | perf::CounterResult{ std::vector>{ std::make_pair("instructions", 1000.0) } }; 145 | 146 | auto result = event_set.result(counter_definition, std::move(hardware_result), 10U); 147 | 148 | /// The result should be normalized (1000.0 / 10 = 100.0) 149 | REQUIRE(result.get("instructions").has_value()); 150 | REQUIRE(result.get("instructions").value() == 100.0); 151 | } 152 | } --------------------------------------------------------------------------------