├── .github ├── ISSUE_TEMPLATE.md ├── PULL_REQUEST_TEMPLATE.md └── workflows │ └── cargo.yml ├── .gitignore ├── CHANGELOG.md ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── Cargo.lock ├── Cargo.toml ├── LICENSE ├── README.md ├── build └── ci.sh ├── configs ├── ci.toml ├── example.toml ├── macos.toml ├── memcache.toml └── usercall.toml ├── docs ├── DESIGN.md └── METRICS.md ├── rust-toolchain └── src ├── common ├── bpf.rs ├── mod.rs └── value_to_index2.c ├── config ├── exposition │ ├── kafka.rs │ └── mod.rs ├── general.rs ├── mod.rs └── samplers.rs ├── exposition ├── http.rs ├── kafka.rs └── mod.rs ├── main.rs ├── metrics ├── channel │ └── mod.rs ├── entry │ └── mod.rs ├── error │ └── mod.rs ├── metrics │ └── mod.rs ├── mod.rs ├── outputs │ └── mod.rs ├── source │ └── mod.rs ├── summary │ └── mod.rs └── traits │ ├── count.rs │ ├── float_convert.rs │ ├── mod.rs │ ├── primitive.rs │ ├── statistic.rs │ └── value.rs └── samplers ├── cpu ├── config.rs ├── mod.rs ├── perf.c └── stat.rs ├── disk ├── bpf.c ├── config.rs ├── mod.rs └── stat.rs ├── ext4 ├── bpf.c ├── config.rs ├── mod.rs └── stat.rs ├── http ├── config.rs ├── mod.rs └── stat.rs ├── interrupt ├── bpf.c ├── config.rs ├── mod.rs └── stat.rs ├── krb5kdc ├── bpf.c ├── config.rs ├── mod.rs └── stat.rs ├── memcache ├── config.rs ├── mod.rs └── stat.rs ├── memory ├── config.rs ├── mod.rs └── stat.rs ├── mod.rs ├── network ├── bpf.c ├── config.rs ├── mod.rs └── stat.rs ├── ntp ├── config.rs ├── mod.rs └── stat.rs ├── nvidia ├── config.rs ├── mod.rs └── stat.rs ├── page_cache ├── bpf.c ├── config.rs ├── mod.rs └── stat.rs ├── process ├── config.rs ├── mod.rs └── stat.rs ├── rezolus ├── config.rs ├── mod.rs └── stat.rs ├── scheduler ├── bpf.c ├── config.rs ├── mod.rs ├── perf.c └── stat.rs ├── softnet ├── config.rs ├── mod.rs └── stat.rs ├── tcp ├── bpf.c ├── config.rs ├── mod.rs └── stat.rs ├── udp ├── config.rs ├── mod.rs └── stat.rs ├── usercall ├── config.rs ├── mod.rs └── stat.rs └── xfs ├── bpf.c ├── config.rs ├── 
mod.rs └── stat.rs /.github/ISSUE_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | One line summary of the issue here. 2 | 3 | ### Expected behavior 4 | 5 | As concisely as possible, describe the expected behavior. 6 | 7 | ### Actual behavior 8 | 9 | As concisely as possible, describe the observed behavior. 10 | 11 | ### Steps to reproduce the behavior 12 | 13 | Please list all relevant steps to reproduce the observed behavior. 14 | -------------------------------------------------------------------------------- /.github/PULL_REQUEST_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | Problem 2 | 3 | Explain the context and why you're making that change. What is the 4 | problem you're trying to solve? In some cases there is not a problem 5 | and this can be thought of being the motivation for your change. 6 | 7 | Solution 8 | 9 | Describe the modifications you've done. 10 | 11 | Result 12 | 13 | What will change as a result of your pull request? Note that sometimes 14 | this section is unnecessary because it is self-explanatory based on 15 | the solution. 16 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .cargo 2 | target 3 | **/*.rs.bk -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | We feel that a welcoming community is important and we ask that you follow Twitter's 2 | [Open Source Code of Conduct](https://github.com/twitter/code-of-conduct/blob/master/code-of-conduct.md) 3 | in all interactions with the community. 
4 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # How to Contribute 2 | 3 | We'd love to get patches from you! 4 | 5 | ## Getting Started 6 | 7 | You can find [open issues](https://github.com/twitter/rezolus/issues) to work 8 | on labelled 'help-wanted' or 'easy'. If you have and idea for an improvement or 9 | feature that's not covered by an existing issue, please create one first to get 10 | early feedback on your idea. 11 | 12 | ## Building 13 | 14 | A guide to building can be found in the README 15 | 16 | ## Workflow 17 | 18 | We follow the [GitHub Flow Workflow](https://guides.github.com/introduction/flow/) 19 | 20 | 1. Fork the project 21 | 1. Check out the `master` branch 22 | 1. Create a feature branch 23 | 1. Write code and tests for your change 24 | 1. From your branch, make a pull request against `twitter/rezolus/master` 25 | 1. Work with repo maintainers to get your change reviewed 26 | 1. Wait for your change to be pulled into `twitter/rezolus/master` 27 | 1. Delete your feature branch 28 | 29 | ## Testing 30 | 31 | All testing is driven by the standard Rust toolchain using `cargo test` to run 32 | tests locally. In addition, tests will be run automatically in travis-ci for all 33 | pull requests and merges into this repository. 34 | 35 | ## Style 36 | 37 | We use rustfmt to enforce code style. Please be sure to run `cargo fmt` to make 38 | sure your changes adhere to the style. As rustfmt is under constant development, 39 | you may find that it changes style for files you haven't edited. In this case, 40 | open a [new issue](https://github.com/twitter/rezolus/issues/new). Do not 41 | include formatting changes for unrelated files in your main pull request as it 42 | can make review more time consuming to understand the changes. 
A separate pull 43 | request to first address any existing style issues will help keep code review 44 | as fast as possible. You can get rustfmt via: `rustup component add rustfmt` 45 | 46 | Additionally, we use clippy as our linting tool. Please be sure to run 47 | `cargo clippy` to make sure your changes pass the linter. As with rustfmt, 48 | clippy is under constant development and new lints are added regularly. If you 49 | find that clippy is catching existing issues unrelated to your changes, open a 50 | [new issue](https://github.com/twitter/rezolus/issues/new). Keeping these 51 | changes in a separate pull request will help keep review as fast as possible. 52 | 53 | Style and linting checks will be run automatically in travis-ci for all pull 54 | requests and merges into this repository. 55 | 56 | ## Issues 57 | 58 | When creating an issue please try to adhere to the following format: 59 | 60 | module-name: One line summary of the issue (less than 72 characters) 61 | 62 | ### Expected behavior 63 | 64 | As concisely as possible, describe the expected behavior. 65 | 66 | ### Actual behavior 67 | 68 | As concisely as possible, describe the observed behavior. 69 | 70 | ### Steps to reproduce the behavior 71 | 72 | List all relevant steps to reproduce the observed behavior. 73 | 74 | ## Pull Requests 75 | 76 | Comments should be formatted to a width no greater than 80 columns. 77 | 78 | Files should be free of trailing spaces. 79 | 80 | We adhere to a specific format for commit messages. Please write your commit 81 | messages along these guidelines. Please keep the line width no greater than 80 82 | columns (You can use `fmt -n -p -w 80` to accomplish this). 83 | 84 | module-name: One line description of your change (less than 72 characters) 85 | 86 | Problem 87 | 88 | Explain the context and why you're making that change. What is the problem 89 | you're trying to solve? 
In some cases there is not a problem and this can be 90 | thought of being the motivation for your change. 91 | 92 | Solution 93 | 94 | Describe the modifications you've done. 95 | 96 | Result 97 | 98 | What will change as a result of your pull request? Note that sometimes this 99 | section is unnecessary because it is self-explanatory based on the solution. 100 | 101 | Some important notes regarding the summary line: 102 | 103 | * Describe what was done; not the result 104 | * Use the active voice 105 | * Use the present tense 106 | * Capitalize properly 107 | * Do not end in a period — this is a title/subject 108 | * Prefix the subject with its scope 109 | 110 | ## Code Review 111 | 112 | All pull requests will be reviewed on GitHub and changes may be requested prior 113 | to merging your pull request. Once the changes are approved, the pull request 114 | will be squash-merged into a single commit which retains authorship metadata. 115 | 116 | ## Documentation 117 | 118 | We also welcome improvements to the project documentation or to the existing 119 | docs. Please file an [issue](https://github.com/twitter/rezolus/issues). 120 | 121 | # License 122 | 123 | By contributing your code, you agree to license your contribution under the 124 | terms of the APLv2: https://github.com/twitter/rezolus/blob/master/LICENSE 125 | 126 | # Code of Conduct 127 | 128 | Read our [Code of Conduct](CODE_OF_CONDUCT.md) for the project. 
129 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "rezolus" 3 | version = "2.16.4-alpha.0" 4 | authors = ["Brian Martin "] 5 | license = "Apache-2.0" 6 | publish = false 7 | edition = "2021" 8 | description = "High resolution systems performance telemetry agent" 9 | 10 | [dependencies] 11 | anyhow = "1.0.57" 12 | async-trait = "0.1.56" 13 | bcc = { version = "0.0.32", optional = true } 14 | clap = "3.2.1" 15 | crossbeam = "0.8.1" 16 | ctrlc = { version = "3.2.2", features = ["termination"] } 17 | dashmap = "5.3.4" 18 | json = "0.12.4" 19 | kafka = { version = "0.8.0", optional = true } 20 | libc = "0.2.126" 21 | num = "0.4.0" 22 | num-derive = "0.3.3" 23 | num-traits = "0.2.15" 24 | nvml-wrapper = "0.7.0" 25 | regex = "1.5.6" 26 | reqwest = { version = "0.11.10", default-features = false, features = ["blocking"] } 27 | rustcommon-atomics = { git = "https://github.com/twitter/rustcommon", rev = "ff5ca96b31461e1b08c59df770ae17903c54c1b2" } 28 | rustcommon-heatmap = { git = "https://github.com/twitter/rustcommon", rev = "ff5ca96b31461e1b08c59df770ae17903c54c1b2" } 29 | rustcommon-logger = { git = "https://github.com/twitter/rustcommon", rev = "ff5ca96b31461e1b08c59df770ae17903c54c1b2" } 30 | rustcommon-streamstats = { git = "https://github.com/twitter/rustcommon", rev = "ff5ca96b31461e1b08c59df770ae17903c54c1b2" } 31 | rustcommon-time = { git = "https://github.com/twitter/rustcommon", rev = "ff5ca96b31461e1b08c59df770ae17903c54c1b2" } 32 | serde = "1.0.137" 33 | serde_derive = "1.0.137" 34 | strum = "0.24.1" 35 | strum_macros = "0.24.1" 36 | sysconf = "0.3.4" 37 | thiserror = "1.0.31" 38 | tiny_http = "0.10.0" 39 | tokio = { version = "1.19.2", features = ["full"] } 40 | toml = "0.5.9" 41 | uuid = "0.8.2" 42 | walkdir = "2.3.2" 43 | 44 | [features] 45 | all = ["bpf", "push_kafka"] 46 | default = [] 47 | bpf = 
["bcc"] 48 | bpf_static = ["bpf", "bcc/static"] 49 | bpf_static_llvm_8 = ["bpf", "bcc/llvm_8", "bcc/static"] 50 | bpf_static_llvm_9 = ["bpf", "bcc/llvm_9", "bcc/static"] 51 | bpf_v0_12_0 = ["bpf", "bcc/v0_12_0"] 52 | bpf_v0_13_0 = ["bpf", "bcc/v0_13_0"] 53 | bpf_v0_14_0 = ["bpf", "bcc/v0_14_0"] 54 | bpf_v0_15_0 = ["bpf", "bcc/v0_15_0"] 55 | bpf_v0_16_0 = ["bpf", "bcc/v0_16_0"] 56 | bpf_v0_17_0 = ["bpf", "bcc/v0_17_0"] 57 | bpf_v0_18_0 = ["bpf", "bcc/v0_18_0"] 58 | bpf_v0_19_0 = ["bpf", "bcc/v0_19_0"] 59 | bpf_v0_20_0 = ["bpf", "bcc/v0_20_0"] 60 | bpf_v0_21_0 = ["bpf", "bcc/v0_21_0"] 61 | bpf_v0_22_0 = ["bpf", "bcc/v0_22_0"] 62 | bpf_v0_23_0 = ["bpf", "bcc/v0_23_0"] 63 | push_kafka = ["kafka"] 64 | 65 | [profile.bench] 66 | debug = true 67 | lto = true 68 | codegen-units = 1 69 | 70 | [profile.release] 71 | debug = true 72 | lto = true 73 | codegen-units = 1 74 | -------------------------------------------------------------------------------- /configs/ci.toml: -------------------------------------------------------------------------------- 1 | # This configuration is intended for use on CI environments where we want any 2 | # errors to be critical and cause the run to fail. As such, it is not suitable 3 | # for real usage. 4 | 5 | # Samplers which don't properly function due to limitations of the CI 6 | # environment should be disabled or limited portions which are expected to work. 
7 | 8 | [general] 9 | fault_tolerant = false 10 | listen = "0.0.0.0:4242" 11 | 12 | [samplers] 13 | [samplers.cpu] 14 | enabled = false 15 | perf_events = false 16 | 17 | [samplers.disk] 18 | bpf = false 19 | enabled = true 20 | 21 | [samplers.ext4] 22 | bpf = true 23 | enabled = true 24 | 25 | [samplers.interrupt] 26 | bpf = true 27 | enabled = true 28 | 29 | [samplers.memory] 30 | enabled = true 31 | 32 | [samplers.network] 33 | bpf = true 34 | enabled = true 35 | 36 | [samplers.ntp] 37 | enabled = true 38 | 39 | [samplers.page_cache] 40 | bpf = false 41 | enabled = true 42 | 43 | [samplers.rezolus] 44 | enabled = true 45 | 46 | [samplers.scheduler] 47 | bpf = true 48 | enabled = true 49 | perf_events = true 50 | 51 | [samplers.softnet] 52 | enabled = true 53 | 54 | [samplers.tcp] 55 | bpf = false 56 | enabled = true 57 | 58 | [samplers.udp] 59 | enabled = true 60 | 61 | [samplers.xfs] 62 | bpf = false 63 | enabled = true -------------------------------------------------------------------------------- /configs/macos.toml: -------------------------------------------------------------------------------- 1 | # This configuration is intended for use on CI environments and testing on macOS 2 | 3 | # NOTE: Not all samplers are supported on macOS. 4 | 5 | [general] 6 | fault_tolerant = false 7 | listen = "0.0.0.0:4242" 8 | 9 | [samplers] 10 | [samplers.ntp] 11 | enabled = true 12 | -------------------------------------------------------------------------------- /configs/memcache.toml: -------------------------------------------------------------------------------- 1 | # An example config that produces percentile metrics for specific memcache stats 2 | # while preserving the original metric names. 
3 | 4 | [general] 5 | listen = "0.0.0.0:4242" 6 | fault_tolerant = false 7 | reading_suffix = "" 8 | 9 | [samplers] 10 | [samplers.memcache] 11 | enabled = true 12 | endpoint = "localhost:11211" 13 | -------------------------------------------------------------------------------- /configs/usercall.toml: -------------------------------------------------------------------------------- 1 | # This example configuration covers detailed configuration for the usercall sampler 2 | # This sampler allows you to attach a probe to a user space library and export the number of times 3 | # that it has been called. 4 | 5 | [general] 6 | listen = "0.0.0.0:4242" 7 | 8 | [samplers.usercall] 9 | enabled = true 10 | 11 | 12 | 13 | # WARNING: Probing the same function in the same library file will result in one of the probes 14 | # being discarded. 15 | 16 | # Exporting metrics for libcurl by searching in the default paths. 17 | [[samplers.usercall.libraries]] 18 | # This is the name for the library that is exported. It will show up in the metric path. 19 | name = "curl" 20 | # Since no path is specified, rezolus will search in the following directories: 21 | # "/lib64", "/usr/lib64", "/usr/local/lib64", "/lib", "/usr/lib", "/usr/local/lib" looking for 22 | # variants of curl.so, libcurl.so and libcurl.so.5 23 | functions = ["curl_global_init"] 24 | 25 | 26 | # Below is an example of exporting metrics for two different versions of the same KRB library by 27 | # specifying the exact file to be probed. 28 | 29 | [[samplers.usercall.libraries]] 30 | # This is the name for the library that is exported. It will show up in the metric path. 31 | name = "krb3" 32 | # This is the path to the library that will be probed. 
33 | path = "/usr/lib/x86_64-linux-gnu/libkrb5.so.3" 34 | # Export usercall/krb3/krb5_cc_get_principal and usercall/krb3/krb5_parse_name_flags 35 | functions = ["krb5_cc_get_principal", "krb5_parse_name_flags"] 36 | 37 | [[samplers.usercall.libraries]] 38 | # This is the name for the library that is exported. It will show up in the metric path. 39 | name = "krb26" 40 | # This is the path to the library that will be probed. 41 | path = "/usr/lib/x86_64-linux-gnu/libkrb5.so.26" 42 | # Export usercall/krb26/krb5_cc_get_principal and usercall/krb26/krb5_parse_name_flags 43 | functions = ["krb5_cc_get_principal", "krb5_parse_name_flags"] 44 | -------------------------------------------------------------------------------- /docs/DESIGN.md: -------------------------------------------------------------------------------- 1 | # Design 2 | 3 | The primary goal for Rezolus is to provide rich telemetry with a low resource 4 | utilization. We want to make it so that Rezolus can be run everywhere so that 5 | we have enhanced visibility into performance anomalies. With this visibility, 6 | we will be able to capture data about runtime performance problems, profile 7 | systems performance to identify tuning and optimization opportunities, and 8 | measure how we are using our infrastructure. 9 | 10 | ## Goals 11 | 12 | * Rich telemetry for performance visibility. 13 | * High-resolution local sampling to capture bursts and brief anomalies. 14 | * Low runtime overhead to enable wide deployment. 15 | 16 | ## Background 17 | 18 | Rezolus helps address the issues around sampling rate and metrics collection. It 19 | can be very expensive to collect secondly metrics across large infrastructure, 20 | but often we care about performance changes that are on the timescale of an a 21 | typical request, which is typically well below one second. To address this 22 | issue, we built Rezolus to do local high-resolution sampling and produce summary 23 | metrics across a moving window. 
By exporting percentiles across the past minute, 24 | we provide insight into sub-minutely behaviors without the expense of collecting 25 | telemetry at high resolution. For instance, if we collect six percentile metrics 26 | every minute, that's one tenth the number of metrics to collect and aggregate 27 | than if we collected secondly. The savings improves further as sample rate 28 | increases. 29 | 30 | ## Resource utilization 31 | 32 | To ensure it can be deployed broadly, we are deeply focused on making sure that 33 | the resource footprint is kept to a reasonable level. It can be difficult to 34 | quantify exact utilization, because Rezolus is able to be configured to collect 35 | from a variety of sources with configurable resolution. These parameters 36 | greatly influence the amount of CPU and memory that Rezolus will require. 37 | 38 | As a rough estimate, at 1Hz sampling with all samplers enabled, Rezolus will 39 | occupy 125MB of RAM and utilize approximately 0.08 CPUs. With eBPF disabled, 40 | the footprint drops to approximately 20MB RAM and 0.03 CPUs and increasing the 41 | sampling rate to 10Hz results in approximately 50MB RAM and 0.12 CPUs 42 | utilized. We believe these levels of resource utilization are well-balanced 43 | against the enhanced telemetry that Rezolus is able to provide. 44 | 45 | ## Samplers 46 | 47 | All samplers implement the same set of core functions. This makes it easy to 48 | add new samplers and manage them as a collection without worrying about 49 | implementation details. You may think of them as "plugins", even though they 50 | are compiled with the rest of the code. 51 | 52 | At a high-level, a sampler takes a sample and records values into a metrics 53 | library. The sampler must also be able to add and remove metrics from the 54 | metrics registry in addition to specifying what types of telemetry will be 55 | exposed for aggregation. 
For instance, a sampler may specify specific 56 | percentiles to export for one or more metrics. 57 | 58 | We recommend taking a look at the rest of the documentation and at a few of the 59 | samplers within this repository to get a sense of how they can be implemented. 60 | 61 | ## Metrics 62 | 63 | We are using the metrics library provided in the [rustcommon][1] project. This 64 | metrics library is focused on performance and precision. 65 | 66 | The metrics library provides all of the core functionalities related to 67 | tracking values and producing the types of telemetry we get from oversampling. 68 | We can simply write consecutive readings of a counter into the metrics library, 69 | and it can generate percentiles across a time interval in addition to tracking 70 | the counters value. We can also directly insert bucketized readings like we get 71 | from BPF samplers to transfer the kernel-space aggregate over to user-space. 72 | 73 | Perhaps the most critical aspect of this library to understand in the context 74 | of its usage in Rezolus is how it handles counter measurements with regard to 75 | oversampling and producing percentiles across a time range. The first time a 76 | counter is recorded, it simply stores the current value and time the counter 77 | was read. When this counter is again measured and recorded, it calculates the 78 | delta between the two consecutive measurements in both value and time. It uses 79 | the difference in value and time to calculate a rate which is normalized to a 80 | secondly rate. 81 | 82 | Assuming that we have asked the library to track one or more percentiles for 83 | this counter, the secondly rate is recorded into a histogram. In Rezolus, we 84 | use moving histograms which retain values across a rolling window. As values 85 | age-out, they are dropped from the histogram. 
This means when we poll Rezolus's 86 | exposition endpoint, we are given values which represent secondly rates across 87 | the configured time interval. 88 | 89 | For instance, we typically would use a one-minute window, and the p50 value 90 | would tell us the secondly rate for which half of the samples would be at or 91 | below this value and the other half would be at or above this value. 92 | Additionally, the p100 value would represent the highest rate seen between two 93 | consecutive samplings of the counter. 94 | 95 | [1]: https://github.com/twitter/rustcommon 96 | -------------------------------------------------------------------------------- /rust-toolchain: -------------------------------------------------------------------------------- 1 | stable -------------------------------------------------------------------------------- /src/common/mod.rs: -------------------------------------------------------------------------------- 1 | // Copyright 2019 Twitter, Inc. 2 | // Licensed under the Apache License, Version 2.0 3 | // http://www.apache.org/licenses/LICENSE-2.0 4 | 5 | use std::collections::HashMap; 6 | use std::io::BufRead; 7 | use std::io::SeekFrom; 8 | 9 | use dashmap::DashMap; 10 | use tokio::fs::File; 11 | use tokio::io::{AsyncBufReadExt, AsyncSeekExt, BufReader}; 12 | 13 | pub mod bpf; 14 | 15 | pub const VERSION: &str = env!("CARGO_PKG_VERSION"); 16 | pub const NAME: &str = env!("CARGO_PKG_NAME"); 17 | 18 | pub const SECOND: u64 = 1_000 * MILLISECOND; 19 | pub const MILLISECOND: u64 = 1_000 * MICROSECOND; 20 | pub const MICROSECOND: u64 = 1_000 * NANOSECOND; 21 | pub const NANOSECOND: u64 = 1; 22 | 23 | pub struct HardwareInfo { 24 | numa_mapping: DashMap, 25 | } 26 | 27 | impl HardwareInfo { 28 | pub fn new() -> Self { 29 | let numa_mapping = DashMap::new(); 30 | let mut node = 0; 31 | loop { 32 | let path = format!("/sys/devices/system/node/node{}/cpulist", node); 33 | if let Ok(f) = std::fs::File::open(path) { 34 | let mut reader = 
std::io::BufReader::new(f); 35 | let mut line = String::new(); 36 | if reader.read_line(&mut line).is_ok() { 37 | let ranges: Vec<&str> = line.trim().split(',').collect(); 38 | for range in ranges { 39 | let parts: Vec<&str> = range.split('-').collect(); 40 | if parts.len() == 1 { 41 | if let Ok(id) = parts[0].parse() { 42 | numa_mapping.insert(id, node); 43 | } 44 | } else if parts.len() == 2 { 45 | if let Ok(start) = parts[0].parse() { 46 | if let Ok(stop) = parts[1].parse() { 47 | for id in start..=stop { 48 | numa_mapping.insert(id, node); 49 | } 50 | } 51 | } 52 | } 53 | } 54 | } 55 | } else { 56 | break; 57 | } 58 | node += 1; 59 | } 60 | Self { numa_mapping } 61 | } 62 | 63 | pub fn get_numa(&self, core: u64) -> Option { 64 | self.numa_mapping.get(&core).map(|v| *v.value()) 65 | } 66 | } 67 | 68 | /// helper function to discover the number of hardware threads 69 | pub fn hardware_threads() -> Result { 70 | let path = "/sys/devices/system/cpu/present"; 71 | let f = 72 | std::fs::File::open(path).map_err(|e| debug!("failed to open file ({:?}): {}", path, e))?; 73 | let mut f = std::io::BufReader::new(f); 74 | 75 | let mut line = String::new(); 76 | f.read_line(&mut line) 77 | .map_err(|_| debug!("failed to read line"))?; 78 | let line = line.trim(); 79 | let a: Vec<&str> = line.split('-').collect(); 80 | a.last() 81 | .unwrap_or(&"0") 82 | .parse::() 83 | .map_err(|e| debug!("could not parse num cpus from file ({:?}): {}", path, e)) 84 | .map(|i| i + 1) 85 | } 86 | 87 | /// helper function to create a nested map from files with the form of 88 | /// pkey1 lkey1 lkey2 ... lkeyN 89 | /// pkey1 value1 value2 ... valueN 90 | /// pkey2 ... 
91 | pub async fn nested_map_from_file( 92 | file: &mut File, 93 | ) -> Result>, std::io::Error> { 94 | file.seek(SeekFrom::Start(0)).await?; 95 | let mut ret = HashMap::>::new(); 96 | let mut reader = BufReader::new(file); 97 | let mut keys = String::new(); 98 | let mut values = String::new(); 99 | while reader.read_line(&mut keys).await? > 0 { 100 | if reader.read_line(&mut values).await? > 0 { 101 | let mut keys_split = keys.trim().split_whitespace(); 102 | let mut values_split = values.trim().split_whitespace(); 103 | 104 | if let Some(pkey) = keys_split.next() { 105 | let _ = values_split.next(); 106 | if !ret.contains_key(pkey) { 107 | ret.insert(pkey.to_string(), Default::default()); 108 | } 109 | let inner = ret.get_mut(pkey).unwrap(); 110 | for key in keys_split { 111 | if let Some(Ok(value)) = values_split.next().map(|v| v.parse()) { 112 | inner.insert(key.to_owned(), value); 113 | } 114 | } 115 | } 116 | keys.clear(); 117 | values.clear(); 118 | } 119 | } 120 | Ok(ret) 121 | } 122 | 123 | pub fn default_percentiles() -> Vec { 124 | vec![1.0, 10.0, 50.0, 90.0, 99.0] 125 | } 126 | 127 | #[allow(dead_code)] 128 | pub struct KernelInfo { 129 | release: String, 130 | } 131 | 132 | #[allow(dead_code)] 133 | impl KernelInfo { 134 | pub fn new() -> Result { 135 | let output = std::process::Command::new("uname").args(["-r"]).output()?; 136 | let release = std::str::from_utf8(&output.stdout) 137 | .map_err(|_| std::io::Error::from(std::io::ErrorKind::InvalidInput))?; 138 | 139 | Ok(Self { 140 | release: release.to_string(), 141 | }) 142 | } 143 | 144 | pub fn release_major(&self) -> Result { 145 | let parts: Vec<&str> = self.release.split('.').collect(); 146 | if let Some(s) = parts.get(0) { 147 | return s 148 | .parse::() 149 | .map_err(|_| std::io::Error::from(std::io::ErrorKind::InvalidInput)); 150 | } 151 | Err(std::io::Error::from(std::io::ErrorKind::InvalidInput)) 152 | } 153 | 154 | pub fn release_minor(&self) -> Result { 155 | let parts: Vec<&str> = 
self.release.split('.').collect(); 156 | if let Some(s) = parts.get(1) { 157 | return s 158 | .parse::() 159 | .map_err(|_| std::io::Error::from(std::io::ErrorKind::InvalidInput)); 160 | } 161 | Err(std::io::Error::from(std::io::ErrorKind::InvalidInput)) 162 | } 163 | } 164 | -------------------------------------------------------------------------------- /src/common/value_to_index2.c: -------------------------------------------------------------------------------- 1 | /* This file contains the value_to_index2 that are shared, 2 | * it's tailored to be replaced into the main bpf.c for each sampler */ 3 | 4 | // histogram indexing 5 | static unsigned int value_to_index2(unsigned int value) { 6 | unsigned int index = 460; 7 | if (value < 100) { 8 | // 0-99 => [0..100) 9 | // 0 => 0 10 | // 99 => 99 11 | index = value; 12 | } else if (value < 1000) { 13 | // 100-999 => [100..190) 14 | // 100 => 100 15 | // 999 => 189 16 | index = 90 + value / 10; 17 | } else if (value < 10000) { 18 | // 1_000-9_999 => [190..280) 19 | // 1000 => 190 20 | // 9999 => 279 21 | index = 180 + value / 100; 22 | } else if (value < 100000) { 23 | // 10_000-99_999 => [280..370) 24 | // 10000 => 280 25 | // 99999 => 369 26 | index = 270 + value / 1000; 27 | } else if (value < 1000000) { 28 | // 100_000-999_999 => [370..460) 29 | // 100000 => 370 30 | // 999999 => 459 31 | index = 360 + value / 10000; 32 | } else { 33 | index = 460; 34 | } 35 | return index; 36 | } -------------------------------------------------------------------------------- /src/config/exposition/kafka.rs: -------------------------------------------------------------------------------- 1 | // Copyright 2019 Twitter, Inc. 
2 | // Licensed under the Apache License, Version 2.0 3 | // http://www.apache.org/licenses/LICENSE-2.0 4 | 5 | use crate::config::*; 6 | use rustcommon_atomics::*; 7 | 8 | #[derive(Debug, Deserialize)] 9 | #[serde(deny_unknown_fields)] 10 | #[allow(dead_code)] 11 | pub struct Kafka { 12 | #[serde(default = "default_enabled")] 13 | enabled: AtomicBool, 14 | #[serde(default = "default_interval")] 15 | interval: AtomicUsize, 16 | hosts: Vec, 17 | topic: Option, 18 | } 19 | 20 | impl Default for Kafka { 21 | fn default() -> Kafka { 22 | Kafka { 23 | enabled: default_enabled(), 24 | interval: default_interval(), 25 | hosts: Default::default(), 26 | topic: Default::default(), 27 | } 28 | } 29 | } 30 | 31 | fn default_enabled() -> AtomicBool { 32 | AtomicBool::new(false) 33 | } 34 | 35 | fn default_interval() -> AtomicUsize { 36 | AtomicUsize::new(500) 37 | } 38 | 39 | #[cfg(feature = "push_kafka")] 40 | impl Kafka { 41 | pub fn enabled(&self) -> bool { 42 | self.enabled.load(Ordering::Relaxed) 43 | } 44 | 45 | pub fn interval(&self) -> usize { 46 | self.interval.load(Ordering::Relaxed) 47 | } 48 | 49 | pub fn hosts(&self) -> Vec { 50 | self.hosts.clone() 51 | } 52 | 53 | pub fn topic(&self) -> Option { 54 | self.topic.clone() 55 | } 56 | } 57 | -------------------------------------------------------------------------------- /src/config/exposition/mod.rs: -------------------------------------------------------------------------------- 1 | // Copyright 2019 Twitter, Inc. 
2 | // Licensed under the Apache License, Version 2.0 3 | // http://www.apache.org/licenses/LICENSE-2.0 4 | 5 | use serde_derive::*; 6 | 7 | mod kafka; 8 | 9 | use self::kafka::*; 10 | 11 | #[derive(Debug, Default, Deserialize)] 12 | #[serde(deny_unknown_fields)] 13 | pub struct Exposition { 14 | #[serde(default)] 15 | #[allow(dead_code)] 16 | kafka: Kafka, 17 | } 18 | 19 | impl Exposition { 20 | #[cfg(feature = "push_kafka")] 21 | pub fn kafka(&self) -> &Kafka { 22 | &self.kafka 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /src/config/general.rs: -------------------------------------------------------------------------------- 1 | // Copyright 2019 Twitter, Inc. 2 | // Licensed under the Apache License, Version 2.0 3 | // http://www.apache.org/licenses/LICENSE-2.0 4 | 5 | use rustcommon_atomics::*; 6 | 7 | use crate::config::*; 8 | 9 | #[derive(Debug, Deserialize)] 10 | #[serde(deny_unknown_fields)] 11 | pub struct General { 12 | listen: Option, 13 | #[serde(with = "LevelDef")] 14 | #[serde(default = "default_logging_level")] 15 | logging: Level, 16 | #[serde(default = "default_interval")] 17 | interval: AtomicUsize, 18 | #[serde(default = "default_threads")] 19 | threads: usize, 20 | #[serde(default = "default_window")] 21 | window: AtomicUsize, 22 | #[serde(default = "default_fault_tolerant")] 23 | fault_tolerant: AtomicBool, 24 | #[serde(default = "default_reading_suffix")] 25 | reading_suffix: String, 26 | } 27 | 28 | impl General { 29 | pub fn listen(&self) -> Option { 30 | self.listen.clone() 31 | } 32 | 33 | pub fn logging(&self) -> Level { 34 | self.logging 35 | } 36 | 37 | pub fn set_logging(&mut self, level: Level) { 38 | self.logging = level; 39 | } 40 | 41 | /// interval in ms between samples if no sampler specific interval 42 | pub fn interval(&self) -> usize { 43 | self.interval.load(Ordering::Relaxed) 44 | } 45 | 46 | pub fn threads(&self) -> usize { 47 | self.threads 48 | } 49 | 50 | /// windows 
for histogram lookback 51 | pub fn window(&self) -> usize { 52 | self.window.load(Ordering::Relaxed) as usize 53 | } 54 | 55 | pub fn fault_tolerant(&self) -> bool { 56 | self.fault_tolerant.load(Ordering::Relaxed) 57 | } 58 | 59 | pub fn reading_suffix(&self) -> Option<&str> { 60 | if self.reading_suffix.is_empty() { 61 | None 62 | } else { 63 | Some(&self.reading_suffix) 64 | } 65 | } 66 | } 67 | 68 | impl Default for General { 69 | fn default() -> General { 70 | General { 71 | listen: None, 72 | logging: default_logging_level(), 73 | interval: default_interval(), 74 | threads: default_threads(), 75 | window: default_window(), 76 | fault_tolerant: default_fault_tolerant(), 77 | reading_suffix: default_reading_suffix(), 78 | } 79 | } 80 | } 81 | 82 | fn default_interval() -> AtomicUsize { 83 | AtomicUsize::new(1000) 84 | } 85 | 86 | fn default_threads() -> usize { 87 | 1 88 | } 89 | 90 | fn default_window() -> AtomicUsize { 91 | AtomicUsize::new(60) 92 | } 93 | 94 | fn default_fault_tolerant() -> AtomicBool { 95 | AtomicBool::new(true) 96 | } 97 | 98 | fn default_reading_suffix() -> String { 99 | "count".to_string() 100 | } 101 | 102 | #[derive(Clone, Deserialize, Debug)] 103 | #[serde(rename_all = "lowercase")] 104 | #[serde(remote = "Level")] 105 | #[serde(deny_unknown_fields)] 106 | enum LevelDef { 107 | Error, 108 | Warn, 109 | Info, 110 | Debug, 111 | Trace, 112 | } 113 | 114 | fn default_logging_level() -> Level { 115 | Level::Info 116 | } 117 | -------------------------------------------------------------------------------- /src/config/mod.rs: -------------------------------------------------------------------------------- 1 | // Copyright 2019 Twitter, Inc. 
2 | // Licensed under the Apache License, Version 2.0 3 | // http://www.apache.org/licenses/LICENSE-2.0 4 | 5 | mod exposition; 6 | mod general; 7 | mod samplers; 8 | 9 | use std::io::Read; 10 | use std::net::{SocketAddr, ToSocketAddrs}; 11 | 12 | use clap::{App, Arg}; 13 | use rustcommon_logger::Level; 14 | use serde_derive::*; 15 | 16 | use crate::*; 17 | 18 | use config::exposition::*; 19 | pub use config::general::General; 20 | use config::samplers::*; 21 | 22 | pub const VERSION: &str = env!("CARGO_PKG_VERSION"); 23 | pub const NAME: &str = env!("CARGO_PKG_NAME"); 24 | 25 | #[derive(Debug, Default, Deserialize)] 26 | #[serde(deny_unknown_fields)] 27 | pub struct Config { 28 | #[serde(default)] 29 | exposition: Exposition, 30 | #[serde(default)] 31 | general: General, 32 | #[serde(default)] 33 | samplers: Samplers, 34 | } 35 | 36 | impl Config { 37 | /// parse command line options and return `Config` 38 | pub fn new() -> Config { 39 | let app = App::new(NAME) 40 | .version(VERSION) 41 | .author("Brian Martin ") 42 | .about("High-Resolution Systems Performance Telemetry") 43 | .arg( 44 | Arg::new("config") 45 | .long("config") 46 | .value_name("FILE") 47 | .help("TOML config file") 48 | .takes_value(true), 49 | ) 50 | .arg( 51 | Arg::new("verbose") 52 | .short('v') 53 | .long("verbose") 54 | .help("Increase verbosity by one level. 
Can be used more than once") 55 | .multiple_occurrences(true), 56 | ); 57 | 58 | let matches = app.get_matches(); 59 | 60 | let mut config = if let Some(file) = matches.value_of("config") { 61 | Config::load_from_file(file) 62 | } else { 63 | println!("NOTE: using builtin base configuration"); 64 | Default::default() 65 | }; 66 | 67 | match matches.occurrences_of("verbose") { 68 | 0 => {} // don't do anything, default is Info 69 | 1 => { 70 | if config.general.logging() == Level::Info { 71 | config.general.set_logging(Level::Debug); 72 | } 73 | } 74 | _ => config.general.set_logging(Level::Trace), 75 | } 76 | 77 | config 78 | } 79 | 80 | /// get listen address 81 | pub fn listen(&self) -> Option { 82 | self.general 83 | .listen() 84 | .map(|v| v.to_socket_addrs().unwrap().next().unwrap()) 85 | } 86 | 87 | /// get logging level 88 | pub fn logging(&self) -> Level { 89 | self.general.logging() 90 | } 91 | 92 | #[allow(dead_code)] 93 | pub fn exposition(&self) -> &Exposition { 94 | &self.exposition 95 | } 96 | 97 | pub fn general(&self) -> &General { 98 | &self.general 99 | } 100 | 101 | pub fn samplers(&self) -> &Samplers { 102 | &self.samplers 103 | } 104 | 105 | pub fn fault_tolerant(&self) -> bool { 106 | self.general().fault_tolerant() 107 | } 108 | 109 | fn load_from_file(filename: &str) -> Config { 110 | let mut file = std::fs::File::open(filename).expect("failed to open workload file"); 111 | let mut content = String::new(); 112 | file.read_to_string(&mut content).expect("failed to read"); 113 | let toml = toml::from_str(&content); 114 | match toml { 115 | Ok(toml) => toml, 116 | Err(e) => { 117 | println!("Failed to parse TOML config: {}", filename); 118 | println!("{}", e); 119 | std::process::exit(1); 120 | } 121 | } 122 | } 123 | } 124 | 125 | pub trait SamplerConfig { 126 | type Statistic; 127 | fn bpf(&self) -> bool { 128 | false 129 | } 130 | fn enabled(&self) -> bool { 131 | false 132 | } 133 | fn interval(&self) -> Option; 134 | fn percentiles(&self) 
// Copyright 2019 Twitter, Inc.
// Licensed under the Apache License, Version 2.0
// http://www.apache.org/licenses/LICENSE-2.0

use crate::config::*;

// One config type per sampler module; each implements `SamplerConfig`.
use samplers::cpu::CpuConfig;
use samplers::disk::DiskConfig;
use samplers::ext4::Ext4Config;
use samplers::http::HttpConfig;
use samplers::interrupt::InterruptConfig;
use samplers::krb5kdc::Krb5kdcConfig;
use samplers::memcache::MemcacheConfig;
use samplers::memory::MemoryConfig;
use samplers::network::NetworkConfig;
use samplers::ntp::NtpConfig;
use samplers::nvidia::NvidiaConfig;
use samplers::page_cache::PageCacheConfig;
use samplers::process::ProcessConfig;
use samplers::rezolus::RezolusConfig;
use samplers::scheduler::SchedulerConfig;
use samplers::softnet::SoftnetConfig;
use samplers::tcp::TcpConfig;
use samplers::udp::UdpConfig;
use samplers::usercall::UsercallConfig;
use samplers::xfs::XfsConfig;

/// The `[samplers]` section of the config: one optional sub-section per
/// sampler. Every field defaults via each config type's `Default`, so an
/// empty config file still deserializes; unknown sections are rejected.
#[derive(Debug, Default, Deserialize)]
#[serde(deny_unknown_fields)]
pub struct Samplers {
    #[serde(default)]
    cpu: CpuConfig,
    #[serde(default)]
    disk: DiskConfig,
    #[serde(default)]
    ext4: Ext4Config,
    #[serde(default)]
    http: HttpConfig,
    #[serde(default)]
    interrupt: InterruptConfig,
    #[serde(default)]
    krb5kdc: Krb5kdcConfig,
    #[serde(default)]
    memcache: MemcacheConfig,
    #[serde(default)]
    memory: MemoryConfig,
    #[serde(default)]
    network: NetworkConfig,
    #[serde(default)]
    ntp: NtpConfig,
    #[serde(default)]
    nvidia: NvidiaConfig,
    #[serde(default)]
    page_cache: PageCacheConfig,
    #[serde(default)]
    process: ProcessConfig,
    #[serde(default)]
    rezolus: RezolusConfig,
    #[serde(default)]
    scheduler: SchedulerConfig,
    #[serde(default)]
    softnet: SoftnetConfig,
    #[serde(default)]
    tcp: TcpConfig,
    #[serde(default)]
    udp: UdpConfig,
    #[serde(default)]
    usercall: UsercallConfig,
    #[serde(default)]
    xfs: XfsConfig,
}

// Plain read-only accessors — one per sampler, returning a borrow of the
// corresponding config section.
impl Samplers {
    pub fn cpu(&self) -> &CpuConfig {
        &self.cpu
    }

    pub fn disk(&self) -> &DiskConfig {
        &self.disk
    }

    pub fn ext4(&self) -> &Ext4Config {
        &self.ext4
    }

    pub fn http(&self) -> &HttpConfig {
        &self.http
    }

    pub fn interrupt(&self) -> &InterruptConfig {
        &self.interrupt
    }

    pub fn krb5kdc(&self) -> &Krb5kdcConfig {
        &self.krb5kdc
    }

    pub fn memcache(&self) -> &MemcacheConfig {
        &self.memcache
    }

    pub fn memory(&self) -> &MemoryConfig {
        &self.memory
    }

    pub fn network(&self) -> &NetworkConfig {
        &self.network
    }

    pub fn ntp(&self) -> &NtpConfig {
        &self.ntp
    }

    pub fn nvidia(&self) -> &NvidiaConfig {
        &self.nvidia
    }

    pub fn page_cache(&self) -> &PageCacheConfig {
        &self.page_cache
    }

    pub fn process(&self) -> &ProcessConfig {
        &self.process
    }

    pub fn rezolus(&self) -> &RezolusConfig {
        &self.rezolus
    }

    pub fn scheduler(&self) -> &SchedulerConfig {
        &self.scheduler
    }

    pub fn softnet(&self) -> &SoftnetConfig {
        &self.softnet
    }

    pub fn tcp(&self) -> &TcpConfig {
        &self.tcp
    }

    pub fn udp(&self) -> &UdpConfig {
        &self.udp
    }

    pub fn usercall(&self) -> &UsercallConfig {
        &self.usercall
    }

    pub fn xfs(&self) -> &XfsConfig {
        &self.xfs
    }
}
154 | -------------------------------------------------------------------------------- /src/exposition/http.rs: -------------------------------------------------------------------------------- 1 | // Copyright 2019 Twitter, Inc. 2 | // Licensed under the Apache License, Version 2.0 3 | // http://www.apache.org/licenses/LICENSE-2.0 4 | 5 | use std::net::SocketAddr; 6 | use std::sync::Arc; 7 | use std::time::{Duration, Instant}; 8 | 9 | use crate::*; 10 | use rustcommon_logger::*; 11 | use tiny_http::{Method, Response, Server}; 12 | 13 | use super::MetricsSnapshot; 14 | 15 | pub struct Http { 16 | snapshot: MetricsSnapshot, 17 | server: Server, 18 | updated: Instant, 19 | } 20 | 21 | impl Http { 22 | pub fn new(address: SocketAddr, metrics: Arc, count_label: Option<&str>) -> Self { 23 | let server = tiny_http::Server::http(address); 24 | if server.is_err() { 25 | fatal!("Failed to open {} for HTTP Stats listener", address); 26 | } 27 | Self { 28 | snapshot: MetricsSnapshot::new(metrics, count_label), 29 | server: server.unwrap(), 30 | updated: Instant::now(), 31 | } 32 | } 33 | 34 | pub fn run(&mut self) { 35 | if let Ok(Some(request)) = self.server.try_recv() { 36 | if self.updated.elapsed() >= Duration::from_millis(500) { 37 | self.snapshot.refresh(); 38 | self.updated = Instant::now(); 39 | } 40 | let url = request.url(); 41 | let parts: Vec<&str> = url.split('?').collect(); 42 | let url = parts[0]; 43 | match request.method() { 44 | Method::Get => match url { 45 | "/" => { 46 | debug!("Serving GET on index"); 47 | let _ = request.respond(Response::from_string(format!( 48 | "Welcome to {}\nVersion: {}\n", 49 | crate::config::NAME, 50 | crate::config::VERSION, 51 | ))); 52 | } 53 | "/metrics" => { 54 | debug!("Serving Prometheus compatible stats"); 55 | let _ = request.respond(Response::from_string(self.snapshot.prometheus())); 56 | } 57 | "/metrics.json" | "/vars.json" | "/admin/metrics.json" => { 58 | debug!("Serving machine readable stats"); 59 | let _ = 
request.respond(Response::from_string(self.snapshot.json(false))); 60 | } 61 | "/vars" => { 62 | debug!("Serving human readable stats"); 63 | let _ = request.respond(Response::from_string(self.snapshot.human())); 64 | } 65 | url => { 66 | debug!("GET on non-existent url: {}", url); 67 | debug!("Serving machine readable stats"); 68 | let _ = request.respond(Response::from_string(self.snapshot.json(false))); 69 | } 70 | }, 71 | method => { 72 | debug!("unsupported request method: {}", method); 73 | let _ = request.respond(Response::empty(404)); 74 | } 75 | } 76 | } 77 | std::thread::sleep(std::time::Duration::from_millis(1)); 78 | } 79 | } 80 | -------------------------------------------------------------------------------- /src/exposition/kafka.rs: -------------------------------------------------------------------------------- 1 | // Copyright 2019 Twitter, Inc. 2 | // Licensed under the Apache License, Version 2.0 3 | // http://www.apache.org/licenses/LICENSE-2.0 4 | 5 | use std::convert::TryInto; 6 | use std::sync::Arc; 7 | use std::time::{Duration, Instant}; 8 | 9 | use crate::*; 10 | use kafka::producer::{Producer, Record}; 11 | 12 | use crate::config::Config; 13 | use crate::exposition::MetricsSnapshot; 14 | 15 | pub struct KafkaProducer { 16 | snapshot: MetricsSnapshot, 17 | producer: Producer, 18 | topic: String, 19 | interval: Duration, 20 | } 21 | 22 | impl KafkaProducer { 23 | pub fn new(config: Arc, metrics: Arc) -> Self { 24 | Self { 25 | snapshot: MetricsSnapshot::new(metrics, config.general().reading_suffix()), 26 | producer: Producer::from_hosts(config.exposition().kafka().hosts()) 27 | .create() 28 | .unwrap(), 29 | topic: config.exposition().kafka().topic().unwrap(), 30 | interval: Duration::from_millis( 31 | config.exposition().kafka().interval().try_into().unwrap(), 32 | ), 33 | } 34 | } 35 | 36 | pub fn run(&mut self) { 37 | let start = Instant::now(); 38 | self.snapshot.refresh(); 39 | let _ = self 40 | .producer 41 | 
.send(&Record::from_value(&self.topic, self.snapshot.json(false))); 42 | let stop = Instant::now(); 43 | if start + self.interval > stop { 44 | std::thread::sleep(self.interval - (stop - start)); 45 | } 46 | } 47 | } 48 | -------------------------------------------------------------------------------- /src/exposition/mod.rs: -------------------------------------------------------------------------------- 1 | // Copyright 2019 Twitter, Inc. 2 | // Licensed under the Apache License, Version 2.0 3 | // http://www.apache.org/licenses/LICENSE-2.0 4 | 5 | use std::collections::HashMap; 6 | use std::sync::Arc; 7 | use std::time::Instant; 8 | 9 | use crate::*; 10 | 11 | mod http; 12 | #[cfg(feature = "push_kafka")] 13 | mod kafka; 14 | 15 | pub use self::http::Http; 16 | #[cfg(feature = "push_kafka")] 17 | pub use self::kafka::KafkaProducer; 18 | 19 | pub struct MetricsSnapshot { 20 | metrics: Arc, 21 | snapshot: HashMap, 22 | refreshed: Instant, 23 | count_label: Option, 24 | } 25 | 26 | impl<'a> MetricsSnapshot { 27 | pub fn new(metrics: Arc, count_label: Option<&str>) -> Self { 28 | Self { 29 | metrics, 30 | snapshot: HashMap::new(), 31 | refreshed: Instant::now(), 32 | count_label: count_label.map(std::string::ToString::to_string), 33 | } 34 | } 35 | 36 | pub fn refresh(&mut self) { 37 | self.snapshot = self.metrics.snapshot(); 38 | self.refreshed = Instant::now(); 39 | } 40 | 41 | pub fn prometheus(&self) -> String { 42 | let mut data = Vec::new(); 43 | for (metric, value) in &self.snapshot { 44 | let label = metric.statistic().name(); 45 | let output = metric.output(); 46 | match output { 47 | Output::Reading => { 48 | data.push(format!("# TYPE {} gauge\n{} {}", label, label, value)); 49 | } 50 | Output::Percentile(percentile) => { 51 | data.push(format!( 52 | "# TYPE {} gauge\n{}{{percentile=\"{:02}\"}} {}", 53 | label, label, percentile, value 54 | )); 55 | } 56 | } 57 | } 58 | data.sort(); 59 | let mut content = data.join("\n"); 60 | content += "\n"; 61 | let 
parts: Vec<&str> = content.split('/').collect(); 62 | parts.join("_") 63 | } 64 | 65 | pub fn human(&self) -> String { 66 | let mut data = Vec::new(); 67 | for (metric, value) in &self.snapshot { 68 | let label = metric.statistic().name(); 69 | let output = metric.output(); 70 | match output { 71 | Output::Reading => { 72 | if let Some(ref count_label) = self.count_label { 73 | data.push(format!("{}/{}: {}", label, count_label, value)); 74 | } else { 75 | data.push(format!("{}: {}", label, value)); 76 | } 77 | } 78 | Output::Percentile(percentile) => { 79 | data.push(format!("{}/histogram/p{:02}: {}", label, percentile, value)); 80 | } 81 | } 82 | } 83 | data.sort(); 84 | let mut content = data.join("\n"); 85 | content += "\n"; 86 | content 87 | } 88 | 89 | fn json(&self, pretty: bool) -> String { 90 | let mut head = "{".to_owned(); 91 | if pretty { 92 | head += "\n "; 93 | } 94 | let mut data = Vec::new(); 95 | for (metric, value) in &self.snapshot { 96 | let label = metric.statistic().name(); 97 | let output = metric.output(); 98 | match output { 99 | Output::Reading => { 100 | if let Some(ref count_label) = self.count_label { 101 | data.push(format!("\"{}/{}\": {}", label, count_label, value)); 102 | } else { 103 | data.push(format!("\"{}\": {}", label, value)); 104 | } 105 | } 106 | Output::Percentile(percentile) => { 107 | data.push(format!( 108 | "\"{}/histogram/p{:02}\": {}", 109 | label, percentile, value 110 | )); 111 | } 112 | } 113 | } 114 | data.sort(); 115 | let body = if pretty { 116 | data.join(",\n ") 117 | } else { 118 | data.join(",") 119 | }; 120 | let mut content = head; 121 | content += &body; 122 | if pretty { 123 | content += "\n"; 124 | } 125 | content += "}"; 126 | content 127 | } 128 | } 129 | -------------------------------------------------------------------------------- /src/main.rs: -------------------------------------------------------------------------------- 1 | // Copyright 2019 Twitter, Inc. 
// Copyright 2019 Twitter, Inc.
// Licensed under the Apache License, Version 2.0
// http://www.apache.org/licenses/LICENSE-2.0

#[macro_use]
extern crate rustcommon_logger;

#[macro_use]
extern crate anyhow;

use rustcommon_atomics::{Atomic, Ordering};
use std::sync::Arc;

use rustcommon_atomics::AtomicBool;
use rustcommon_logger::Logger;
use tokio::runtime::Builder;

mod common;
mod config;
mod exposition;
mod metrics;
mod samplers;

use common::*;
use config::Config;
use metrics::*;
use samplers::*;

// Crate-wide clock types with nanosecond precision.
pub type Instant = rustcommon_time::Instant<Nanoseconds<u64>>;
pub type Duration = rustcommon_time::Duration<Nanoseconds<u64>>;

/// Entry point: loads config, initializes logging/signals/metrics, spawns
/// every sampler onto a shared tokio runtime, optionally starts the Kafka
/// push thread, then serves the HTTP stats endpoint until SIGINT/SIGTERM.
fn main() -> Result<(), Box<dyn std::error::Error>> {
    // get config
    let config = Arc::new(Config::new());

    // initialize logging
    Logger::new()
        .label(common::NAME)
        .level(config.logging())
        .init()
        .expect("Failed to initialize logger");

    info!("----------");
    info!("{} {}", common::NAME, common::VERSION);
    info!("----------");
    debug!("host cores: {}", hardware_threads().unwrap_or(1));

    // shared shutdown flag: the signal handler clears it, the HTTP loop
    // below polls it
    let runnable = Arc::new(AtomicBool::new(true));
    let r = runnable.clone();

    // initialize signal handler
    debug!("initializing signal handler");
    ctrlc::set_handler(move || {
        r.store(false, Ordering::Relaxed);
    })
    .expect("Failed to set handler for SIGINT / SIGTERM");

    // initialize metrics
    debug!("initializing metrics");
    let metrics = Arc::new(Metrics::new());

    // initialize async runtime
    // worker and blocking pools are both sized from `general.threads`
    debug!("initializing async runtime");
    let runtime = Arc::new(
        Builder::new_multi_thread()
            .enable_all()
            .worker_threads(config.general().threads())
            .max_blocking_threads(config.general().threads())
            .thread_name("rezolus-worker")
            .build()
            .unwrap(),
    );

    // spawn samplers
    // each spawn clones the shared Common (config + metrics + runtime);
    // the last one moves it
    debug!("spawning samplers");
    let common = Common::new(config.clone(), metrics.clone(), runtime);
    Cpu::spawn(common.clone());
    Disk::spawn(common.clone());
    Ext4::spawn(common.clone());
    Http::spawn(common.clone());
    Interrupt::spawn(common.clone());
    Krb5kdc::spawn(common.clone());
    Memcache::spawn(common.clone());
    Memory::spawn(common.clone());
    PageCache::spawn(common.clone());
    Network::spawn(common.clone());
    Ntp::spawn(common.clone());
    Nvidia::spawn(common.clone());
    Process::spawn(common.clone());
    Rezolus::spawn(common.clone());
    Scheduler::spawn(common.clone());
    Softnet::spawn(common.clone());
    Tcp::spawn(common.clone());
    Udp::spawn(common.clone());
    Usercall::spawn(common.clone());
    Xfs::spawn(common);

    // optional Kafka push exposition runs on its own OS thread
    #[cfg(feature = "push_kafka")]
    {
        if config.exposition().kafka().enabled() {
            let mut kafka_producer =
                exposition::KafkaProducer::new(config.clone(), metrics.clone());
            let _ = std::thread::Builder::new()
                .name("kafka".to_string())
                .spawn(move || loop {
                    kafka_producer.run();
                });
        }
    }

    debug!("beginning stats exposition");
    let mut http = exposition::Http::new(
        config.listen().expect("no listen address"),
        metrics,
        config.general().reading_suffix(),
    );

    // serve until the signal handler flips the flag
    while runnable.load(Ordering::Relaxed) {
        http.run();
    }

    Ok(())
}
2 | // Licensed under the Apache License, Version 2.0 3 | // http://www.apache.org/licenses/LICENSE-2.0 4 | 5 | use crate::metrics::entry::Entry; 6 | use crate::metrics::outputs::ApproxOutput; 7 | use crate::metrics::summary::SummaryStruct; 8 | use crate::metrics::traits::*; 9 | use crate::metrics::MetricsError; 10 | use crate::metrics::Output; 11 | use crate::metrics::Summary; 12 | use rustcommon_atomics::Arithmetic; 13 | use rustcommon_atomics::AtomicU64; 14 | use rustcommon_time::*; 15 | 16 | use crossbeam::atomic::AtomicCell; 17 | use dashmap::DashSet; 18 | use rustcommon_atomics::{Atomic, AtomicBool, Ordering}; 19 | 20 | /// Internal type which stores fields necessary to track a corresponding 21 | /// statistic. 22 | pub struct Channel { 23 | refreshed: AtomicCell>>, 24 | statistic: Entry, 25 | empty: AtomicBool, 26 | reading: AtomicU64, 27 | summary: Option, 28 | outputs: DashSet, 29 | } 30 | 31 | impl Channel { 32 | /// Creates an empty channel for a statistic. 33 | pub fn new(statistic: &dyn Statistic) -> Self { 34 | let summary = statistic.summary().map(|v| v.build()); 35 | Self { 36 | empty: AtomicBool::new(true), 37 | statistic: Entry::from(statistic), 38 | reading: Default::default(), 39 | refreshed: AtomicCell::new(Instant::>::now()), 40 | summary, 41 | outputs: Default::default(), 42 | } 43 | } 44 | 45 | /// Records a bucket value + count pair into the summary. 46 | pub fn record_bucket( 47 | &self, 48 | time: Instant>, 49 | value: u64, 50 | count: u32, 51 | ) -> Result<(), MetricsError> { 52 | if let Some(summary) = &self.summary { 53 | summary.increment(time, value, count); 54 | Ok(()) 55 | } else { 56 | Err(MetricsError::NoSummary) 57 | } 58 | } 59 | 60 | /// Updates a counter to a new value if the reading is newer than the stored 61 | /// reading. 
62 | pub fn record_counter(&self, time: Instant>, value: u64) { 63 | let t0 = self.refreshed.load(); 64 | if time <= t0 { 65 | return; 66 | } 67 | if !self.empty.load(Ordering::Relaxed) { 68 | if let Some(summary) = &self.summary { 69 | self.refreshed.store(time); 70 | let v0 = self.reading.load(Ordering::Relaxed); 71 | let dt = time - t0; 72 | let dv = (value - v0).to_float(); 73 | let rate = (dv 74 | / (dt.as_secs() as f64 + dt.subsec_nanos() as f64 / 1_000_000_000.0)) 75 | .ceil(); 76 | summary.increment(time, u64::from_float(rate), 1_u8.into()); 77 | } 78 | self.reading.store(value, Ordering::Relaxed); 79 | } else { 80 | self.reading.store(value, Ordering::Relaxed); 81 | self.empty.store(false, Ordering::Relaxed); 82 | self.refreshed.store(time); 83 | } 84 | } 85 | 86 | /// Increment a counter by an amount 87 | pub fn increment_counter(&self, value: u64) { 88 | self.empty.store(false, Ordering::Relaxed); 89 | self.reading.fetch_add(value, Ordering::Relaxed); 90 | } 91 | 92 | /// Updates a gauge reading if the new value is newer than the stored value. 93 | pub fn record_gauge(&self, time: Instant>, value: u64) { 94 | { 95 | let t0 = self.refreshed.load(); 96 | if time <= t0 { 97 | return; 98 | } 99 | } 100 | if let Some(summary) = &self.summary { 101 | summary.increment(time, value, 1_u8.into()); 102 | } 103 | self.reading.store(value, Ordering::Relaxed); 104 | self.empty.store(false, Ordering::Relaxed); 105 | self.refreshed.store(time); 106 | } 107 | 108 | /// Returns a percentile across stored readings/rates/... 
109 | pub fn percentile(&self, percentile: f64) -> Result { 110 | if let Some(summary) = &self.summary { 111 | summary.percentile(percentile).map_err(MetricsError::from) 112 | } else { 113 | Err(MetricsError::NoSummary) 114 | } 115 | } 116 | 117 | /// Returns the main reading for the channel (eg: counter, gauge) 118 | pub fn reading(&self) -> Result { 119 | if !self.empty.load(Ordering::Relaxed) { 120 | Ok(self.reading.load(Ordering::Relaxed)) 121 | } else { 122 | Err(MetricsError::Empty) 123 | } 124 | } 125 | 126 | /// Set a summary to be used for an existing channel 127 | pub fn set_summary(&mut self, summary: Summary) { 128 | let summary = summary.build(); 129 | self.summary = Some(summary); 130 | } 131 | 132 | /// Set a summary to be used for an existing channel 133 | pub fn add_summary(&mut self, summary: Summary) { 134 | if self.summary.is_none() { 135 | self.set_summary(summary); 136 | } 137 | } 138 | 139 | pub fn statistic(&self) -> &dyn Statistic { 140 | &self.statistic 141 | } 142 | 143 | pub fn outputs(&self) -> Vec { 144 | let mut ret = Vec::new(); 145 | for output in self.outputs.iter().map(|v| *v) { 146 | ret.push(output); 147 | } 148 | ret 149 | } 150 | 151 | pub fn add_output(&self, output: Output) { 152 | self.outputs.insert(ApproxOutput::from(output)); 153 | } 154 | 155 | pub fn remove_output(&self, output: Output) { 156 | self.outputs.remove(&ApproxOutput::from(output)); 157 | } 158 | } 159 | -------------------------------------------------------------------------------- /src/metrics/entry/mod.rs: -------------------------------------------------------------------------------- 1 | // Copyright 2020 Twitter, Inc. 
2 | // Licensed under the Apache License, Version 2.0 3 | // http://www.apache.org/licenses/LICENSE-2.0 4 | 5 | use core::hash::Hash; 6 | use core::hash::Hasher; 7 | 8 | use crate::metrics::*; 9 | 10 | pub struct Entry { 11 | name: String, 12 | source: Source, 13 | } 14 | 15 | impl Clone for Entry { 16 | fn clone(&self) -> Self { 17 | Self { 18 | name: self.name.clone(), 19 | source: self.source, 20 | } 21 | } 22 | } 23 | 24 | impl Statistic for Entry { 25 | fn name(&self) -> &str { 26 | &self.name 27 | } 28 | 29 | fn source(&self) -> Source { 30 | self.source 31 | } 32 | } 33 | 34 | impl Hash for Entry { 35 | fn hash(&self, state: &mut H) { 36 | self.name.hash(state); 37 | } 38 | } 39 | 40 | impl From<&dyn Statistic> for Entry { 41 | fn from(statistic: &dyn Statistic) -> Self { 42 | Self { 43 | name: statistic.name().to_string(), 44 | source: statistic.source(), 45 | } 46 | } 47 | } 48 | impl PartialEq for Entry { 49 | fn eq(&self, other: &Self) -> bool { 50 | self.name == other.name 51 | } 52 | } 53 | 54 | impl Eq for Entry {} 55 | -------------------------------------------------------------------------------- /src/metrics/error/mod.rs: -------------------------------------------------------------------------------- 1 | // Copyright 2020 Twitter, Inc. 2 | // Licensed under the Apache License, Version 2.0 3 | // http://www.apache.org/licenses/LICENSE-2.0 4 | 5 | use rustcommon_heatmap::HeatmapError; 6 | use rustcommon_streamstats::StreamstatsError; 7 | use thiserror::Error; 8 | 9 | /// Possible errors returned by operations on a histogram. 10 | #[derive(Error, Debug, PartialEq)] 11 | pub enum MetricsError { 12 | #[error("no samples for the statistic")] 13 | /// The summary contains no samples. 
// Copyright 2020 Twitter, Inc.
// Licensed under the Apache License, Version 2.0
// http://www.apache.org/licenses/LICENSE-2.0

use rustcommon_heatmap::HeatmapError;
use rustcommon_streamstats::StreamstatsError;
use thiserror::Error;

/// Possible errors returned by operations on a histogram.
#[derive(Error, Debug, PartialEq)]
pub enum MetricsError {
    #[error("no samples for the statistic")]
    /// The summary contains no samples.
    Empty,
    #[error("invalid percentile")]
    /// The provided percentile is outside of the range 0.0 - 100.0 (inclusive)
    InvalidPercentile,
    #[error("statistic is not registered")]
    /// The statistic has not been registered
    NotRegistered,
    #[error("no summary configured for the statistic")]
    /// The statistic does not have a configured summary
    NoSummary,
    #[error("value out of range")]
    /// The requested value is out of range.
    OutOfRange,
    #[error("method does not apply for this statistic")]
    /// A method has been called which does not match the statistic source
    SourceMismatch,
}

// SummaryError is the internal summary-layer error; map it onto the public
// MetricsError variant-for-variant so callers only ever see MetricsError.
impl From<SummaryError> for MetricsError {
    fn from(other: SummaryError) -> Self {
        match other {
            SummaryError::Empty => Self::Empty,
            SummaryError::InvalidPercentile => Self::InvalidPercentile,
            SummaryError::OutOfRange => Self::OutOfRange,
            SummaryError::NoSummary => Self::NoSummary,
        }
    }
}

/// Errors from the summary layer, unifying the underlying heatmap and
/// streamstats error types behind one enum.
#[derive(Error, Debug, PartialEq)]
pub enum SummaryError {
    #[error("summary contains no samples")]
    /// The summary contains no samples.
    Empty,
    #[error("invalid percentile")]
    /// The provided percentile is outside of the range 0.0 - 100.0 (inclusive)
    InvalidPercentile,
    #[error("no summary configured for the statistic")]
    /// There is no summary for the statistic
    NoSummary,
    #[error("value out of range")]
    /// The requested value is out of range.
    OutOfRange,
}

// Lift heatmap-backed summary errors into SummaryError.
impl From<HeatmapError> for SummaryError {
    fn from(other: HeatmapError) -> Self {
        match other {
            HeatmapError::Empty => Self::Empty,
            HeatmapError::InvalidPercentile => Self::InvalidPercentile,
            HeatmapError::OutOfRange => Self::OutOfRange,
        }
    }
}

// Lift streamstats-backed summary errors into SummaryError.
impl From<StreamstatsError> for SummaryError {
    fn from(other: StreamstatsError) -> Self {
        match other {
            StreamstatsError::Empty => Self::Empty,
            StreamstatsError::InvalidPercentile => Self::InvalidPercentile,
        }
    }
}
    }

    /// Counter lifecycle: no reading before the first record; subsequent
    /// records store the absolute value, and pairs of readings produce rates
    /// into the summary (percentile 0.0/100.0 reflect the min/max rate seen).
    #[test]
    fn basic() {
        let metrics = Metrics::new();
        metrics.register(&TestStat::Alpha);
        assert!(metrics.reading(&TestStat::Alpha).is_err());
        metrics
            .record_counter(&TestStat::Alpha, Instant::<Nanoseconds<u64>>::now(), 0)
            .expect("failed to record counter");
        assert_eq!(metrics.reading(&TestStat::Alpha), Ok(0));
        let now = Instant::<Nanoseconds<u64>>::now();
        metrics
            .record_counter(&TestStat::Alpha, now + Duration::from_millis(500), 0)
            .expect("failed to record counter");
        assert_eq!(metrics.reading(&TestStat::Alpha), Ok(0));
        assert_eq!(metrics.percentile(&TestStat::Alpha, 0.0), Ok(0));
        metrics
            .record_counter(&TestStat::Alpha, now + Duration::from_millis(1500), 1)
            .expect("failed to record counter");
        assert_eq!(metrics.reading(&TestStat::Alpha), Ok(1));
        assert_eq!(metrics.percentile(&TestStat::Alpha, 100.0), Ok(1));
    }

    /// Snapshots contain nothing until an output is registered; afterwards
    /// each registered output appears as one snapshot entry.
    #[test]
    fn outputs() {
        let metrics = Metrics::new();
        metrics.register(&TestStat::Alpha);
        assert!(metrics.snapshot().is_empty());
        metrics.add_output(&TestStat::Alpha, Output::Reading);
        let _ = metrics.record_counter(&TestStat::Alpha, Instant::<Nanoseconds<u64>>::now(), 1);
        assert_eq!(metrics.snapshot().len(), 1);
        assert_eq!(metrics.reading(&TestStat::Alpha), Ok(1));
    }

    /// Absolute counters: readings track the latest value, percentiles track
    /// the derived per-second rate, and stale (older-timestamp) records are
    /// ignored rather than overwriting the current reading.
    #[test]
    fn absolute_counter() {
        let metrics = Metrics::new();
        metrics.register(&TestStat::Alpha);
        let start = Instant::<Nanoseconds<u64>>::now();
        assert!(metrics.reading(&TestStat::Alpha).is_err());
        metrics.record_counter(&TestStat::Alpha, start, 0).unwrap();
        assert_eq!(metrics.reading(&TestStat::Alpha), Ok(0));
        // 0 -> 1M over 1s: rate 1M/s
        metrics
            .record_counter(
                &TestStat::Alpha,
                start + Duration::from_millis(1000),
                1000000,
            )
            .unwrap();
        assert_eq!(metrics.reading(&TestStat::Alpha), Ok(1000000));
        assert_eq!(metrics.percentile(&TestStat::Alpha, 99.9), Ok(1000000));
        // 1M -> 3M over the next 1s: rate 2M/s becomes the new max
        metrics
            .record_counter(
                &TestStat::Alpha,
                start + Duration::from_millis(2000),
                3000000,
            )
            .unwrap();
        assert_eq!(metrics.reading(&TestStat::Alpha), Ok(3000000));
        assert_eq!(metrics.percentile(&TestStat::Alpha, 99.9), Ok(2000000));
        // stale timestamp: record must be ignored
        metrics.record_counter(&TestStat::Alpha, start, 42).unwrap();
        assert_ne!(metrics.reading(&TestStat::Alpha), Ok(42));
    }

    /// Incrementing adds to the stored reading (including by zero).
    #[test]
    fn increment_counter() {
        let metrics = Metrics::new();
        metrics.register(&TestStat::Alpha);
        assert!(metrics.reading(&TestStat::Alpha).is_err());
        metrics.increment_counter(&TestStat::Alpha, 1).unwrap();
        assert_eq!(metrics.reading(&TestStat::Alpha), Ok(1));
        metrics.increment_counter(&TestStat::Alpha, 0).unwrap();
        assert_eq!(metrics.reading(&TestStat::Alpha), Ok(1));
        metrics.increment_counter(&TestStat::Alpha, 10).unwrap();
        assert_eq!(metrics.reading(&TestStat::Alpha), Ok(11));
    }
}
2 | // Licensed under the Apache License, Version 2.0 3 | // http://www.apache.org/licenses/LICENSE-2.0 4 | 5 | // Internal representation which approximates the percentile 6 | #[derive(PartialEq, Eq, Hash, Copy, Clone)] 7 | pub enum ApproxOutput { 8 | Reading, 9 | Percentile(u64), 10 | } 11 | 12 | /// Defines an output that should be reported in a snapshot for a statistic 13 | #[derive(Copy, Clone)] 14 | pub enum Output { 15 | /// A counter or gauge reading 16 | Reading, 17 | /// A percentile from a statistic summary 18 | Percentile(f64), 19 | } 20 | 21 | impl From for ApproxOutput { 22 | fn from(output: Output) -> Self { 23 | match output { 24 | Output::Reading => Self::Reading, 25 | Output::Percentile(percentile) => { 26 | Self::Percentile((percentile * 1000000.0).ceil() as u64) 27 | } 28 | } 29 | } 30 | } 31 | 32 | impl From for Output { 33 | fn from(output: ApproxOutput) -> Self { 34 | match output { 35 | ApproxOutput::Reading => Self::Reading, 36 | ApproxOutput::Percentile(percentile) => Self::Percentile(percentile as f64 / 1000000.0), 37 | } 38 | } 39 | } 40 | -------------------------------------------------------------------------------- /src/metrics/source/mod.rs: -------------------------------------------------------------------------------- 1 | // Copyright 2020 Twitter, Inc. 2 | // Licensed under the Apache License, Version 2.0 3 | // http://www.apache.org/licenses/LICENSE-2.0 4 | 5 | /// Defines the source for a given statistic 6 | #[derive(PartialEq, Eq, Debug, Hash, Copy, Clone)] 7 | pub enum Source { 8 | /// Indicates that the source is a monotonically incrementing count. 9 | Counter, 10 | /// Indicates that the source is an instantaneous gauge reading. 11 | Gauge, 12 | /// Indicates that the source is an underlying distribution (histogram). 
13 | Distribution, 14 | } 15 | -------------------------------------------------------------------------------- /src/metrics/summary/mod.rs: -------------------------------------------------------------------------------- 1 | // Copyright 2020 Twitter, Inc. 2 | // Licensed under the Apache License, Version 2.0 3 | // http://www.apache.org/licenses/LICENSE-2.0 4 | 5 | use crate::metrics::error::SummaryError; 6 | use crate::metrics::*; 7 | 8 | use rustcommon_heatmap::{AtomicHeatmap, Duration, Instant}; 9 | use rustcommon_streamstats::AtomicStreamstats; 10 | 11 | pub(crate) enum SummaryStruct { 12 | Heatmap(AtomicHeatmap), 13 | Stream(AtomicStreamstats), 14 | } 15 | 16 | impl SummaryStruct { 17 | pub fn increment(&self, time: Instant>, value: u64, count: u32) { 18 | match self { 19 | Self::Heatmap(heatmap) => heatmap.increment(time, value, count), 20 | Self::Stream(stream) => stream.insert(value), 21 | } 22 | } 23 | 24 | pub fn percentile(&self, percentile: f64) -> Result { 25 | match self { 26 | Self::Heatmap(heatmap) => heatmap.percentile(percentile).map_err(SummaryError::from), 27 | Self::Stream(stream) => stream.percentile(percentile).map_err(SummaryError::from), 28 | } 29 | } 30 | 31 | pub fn heatmap( 32 | max: u64, 33 | precision: u8, 34 | span: Duration>, 35 | resolution: Duration>, 36 | ) -> Self { 37 | Self::Heatmap(AtomicHeatmap::new(max, precision, span, resolution)) 38 | } 39 | 40 | pub fn stream(samples: usize) -> Self { 41 | Self::Stream(AtomicStreamstats::new(samples)) 42 | } 43 | } 44 | 45 | enum SummaryType { 46 | Heatmap( 47 | u64, 48 | u8, 49 | Duration>, 50 | Duration>, 51 | ), 52 | Stream(usize), 53 | } 54 | 55 | pub struct Summary { 56 | inner: SummaryType, 57 | } 58 | 59 | impl Summary { 60 | pub fn heatmap( 61 | max: u64, 62 | precision: u8, 63 | span: Duration>, 64 | resolution: Duration>, 65 | ) -> Summary { 66 | Self { 67 | inner: SummaryType::Heatmap(max, precision, span, resolution), 68 | } 69 | } 70 | 71 | pub fn stream(samples: usize) -> 
Summary { 72 | Self { 73 | inner: SummaryType::Stream(samples), 74 | } 75 | } 76 | 77 | pub(crate) fn build(&self) -> SummaryStruct { 78 | match self.inner { 79 | SummaryType::Heatmap(max, precision, span, resolution) => { 80 | SummaryStruct::heatmap(max, precision, span, resolution) 81 | } 82 | SummaryType::Stream(samples) => SummaryStruct::stream(samples), 83 | } 84 | } 85 | } 86 | -------------------------------------------------------------------------------- /src/metrics/traits/count.rs: -------------------------------------------------------------------------------- 1 | // Copyright 2020 Twitter, Inc. 2 | // Licensed under the Apache License, Version 2.0 3 | // http://www.apache.org/licenses/LICENSE-2.0 4 | 5 | use rustcommon_atomics::*; 6 | use rustcommon_heatmap::AtomicCounter; 7 | 8 | /// Count types are used internally for some types of summary datastructures, 9 | /// such as heatmaps. The selected atomic is used as the internal counter width. 10 | /// A well matched type would be large enough to hold maximum number of 11 | /// observations that would fall into the same bucket in a heatmap. Using types 12 | /// that are oversized will result in higher memory utilization for heatmap 13 | /// summaries, but has no effect on basic counter/gauge values or streaming 14 | /// summary sizes. 15 | pub trait Count: Atomic + Default + AtomicCounter {} 16 | 17 | impl Count for AtomicU8 {} 18 | impl Count for AtomicU16 {} 19 | impl Count for AtomicU32 {} 20 | impl Count for AtomicU64 {} 21 | -------------------------------------------------------------------------------- /src/metrics/traits/float_convert.rs: -------------------------------------------------------------------------------- 1 | // Copyright 2020 Twitter, Inc. 
2 | // Licensed under the Apache License, Version 2.0 3 | // http://www.apache.org/licenses/LICENSE-2.0 4 | 5 | pub trait FloatConvert { 6 | fn to_float(self) -> f64; 7 | fn from_float(value: f64) -> Self; 8 | } 9 | 10 | impl FloatConvert for u64 { 11 | fn to_float(self) -> f64 { 12 | self as f64 13 | } 14 | fn from_float(value: f64) -> Self { 15 | value as Self 16 | } 17 | } 18 | 19 | impl FloatConvert for u32 { 20 | fn to_float(self) -> f64 { 21 | self as f64 22 | } 23 | fn from_float(value: f64) -> Self { 24 | value as Self 25 | } 26 | } 27 | 28 | impl FloatConvert for u16 { 29 | fn to_float(self) -> f64 { 30 | self as f64 31 | } 32 | fn from_float(value: f64) -> Self { 33 | value as Self 34 | } 35 | } 36 | 37 | impl FloatConvert for u8 { 38 | fn to_float(self) -> f64 { 39 | self as f64 40 | } 41 | fn from_float(value: f64) -> Self { 42 | value as Self 43 | } 44 | } 45 | -------------------------------------------------------------------------------- /src/metrics/traits/mod.rs: -------------------------------------------------------------------------------- 1 | // Copyright 2020 Twitter, Inc. 2 | // Licensed under the Apache License, Version 2.0 3 | // http://www.apache.org/licenses/LICENSE-2.0 4 | 5 | mod count; 6 | mod float_convert; 7 | mod primitive; 8 | mod statistic; 9 | mod value; 10 | 11 | pub use count::Count; 12 | pub use float_convert::FloatConvert; 13 | pub use primitive::Primitive; 14 | pub use statistic::Statistic; 15 | pub use value::Value; 16 | -------------------------------------------------------------------------------- /src/metrics/traits/primitive.rs: -------------------------------------------------------------------------------- 1 | // Copyright 2020 Twitter, Inc. 
2 | // Licensed under the Apache License, Version 2.0 3 | // http://www.apache.org/licenses/LICENSE-2.0 4 | 5 | use crate::metrics::traits::*; 6 | 7 | use rustcommon_heatmap::Indexing; 8 | 9 | use core::ops::Sub; 10 | 11 | /// A trait that is used to track primitive types that correspond to supported 12 | /// atomic types. 13 | pub trait Primitive: 14 | Ord + Indexing + Copy + From + Sub + FloatConvert 15 | { 16 | } 17 | 18 | impl Primitive for u8 {} 19 | impl Primitive for u16 {} 20 | impl Primitive for u32 {} 21 | impl Primitive for u64 {} 22 | -------------------------------------------------------------------------------- /src/metrics/traits/statistic.rs: -------------------------------------------------------------------------------- 1 | // Copyright 2020 Twitter, Inc. 2 | // Licensed under the Apache License, Version 2.0 3 | // http://www.apache.org/licenses/LICENSE-2.0 4 | 5 | use crate::metrics::{Source, Summary}; 6 | 7 | use core::hash::{Hash, Hasher}; 8 | 9 | /// A statistic represents a named entity that has associated measurements which 10 | /// are recorded and metrics which are reported. This trait defines a set of 11 | /// methods which uniquely identify the statistic, help the metrics library 12 | /// track it appropriately, and allow including metadata in the exposition 13 | /// format. 14 | pub trait Statistic { 15 | /// The name is used to lookup the channel for the statistic and should be 16 | /// unique for each statistic. This field is used to hash the statistic in 17 | /// the core structure. 18 | fn name(&self) -> &str; 19 | /// Indicates which source type the statistic tracks. 20 | fn source(&self) -> Source; 21 | /// Optionally, specify a summary builder which configures a summary 22 | /// aggregation for producing additional metrics such as percentiles. 
23 | fn summary(&self) -> Option { 24 | None 25 | } 26 | } 27 | 28 | impl Hash for dyn Statistic { 29 | fn hash(&self, state: &mut H) { 30 | self.name().to_string().hash(state); 31 | } 32 | } 33 | 34 | impl PartialEq for dyn Statistic { 35 | fn eq(&self, other: &Self) -> bool { 36 | self.name() == other.name() 37 | } 38 | } 39 | 40 | impl Eq for dyn Statistic {} 41 | -------------------------------------------------------------------------------- /src/metrics/traits/value.rs: -------------------------------------------------------------------------------- 1 | // Copyright 2020 Twitter, Inc. 2 | // Licensed under the Apache License, Version 2.0 3 | // http://www.apache.org/licenses/LICENSE-2.0 4 | 5 | use rustcommon_atomics::*; 6 | 7 | /// Value types may be used to store the primary value for a metric. For example 8 | /// counter readings, gauge readings, or buckets values from underlying 9 | /// distributions. Lower precision atomics help reduce in-memory representation 10 | /// for stored values and streaming summaries, but are unable to represent large 11 | /// counter and gauge values. 12 | pub trait Value: Atomic + Arithmetic + Default {} 13 | 14 | impl Value for AtomicU8 {} 15 | impl Value for AtomicU16 {} 16 | impl Value for AtomicU32 {} 17 | impl Value for AtomicU64 {} 18 | -------------------------------------------------------------------------------- /src/samplers/cpu/config.rs: -------------------------------------------------------------------------------- 1 | // Copyright 2019 Twitter, Inc. 
2 | // Licensed under the Apache License, Version 2.0 3 | // http://www.apache.org/licenses/LICENSE-2.0 4 | 5 | use serde_derive::Deserialize; 6 | use strum::IntoEnumIterator; 7 | 8 | use crate::config::SamplerConfig; 9 | 10 | use super::stat::*; 11 | 12 | #[derive(Debug, Deserialize)] 13 | #[serde(deny_unknown_fields)] 14 | pub struct CpuConfig { 15 | #[serde(default)] 16 | enabled: bool, 17 | #[serde(default)] 18 | interval: Option, 19 | #[serde(default = "crate::common::default_percentiles")] 20 | percentiles: Vec, 21 | #[serde(default)] 22 | perf_events: bool, 23 | #[serde(default = "default_statistics")] 24 | statistics: Vec, 25 | } 26 | 27 | impl Default for CpuConfig { 28 | fn default() -> Self { 29 | Self { 30 | enabled: Default::default(), 31 | interval: Default::default(), 32 | percentiles: crate::common::default_percentiles(), 33 | perf_events: Default::default(), 34 | statistics: default_statistics(), 35 | } 36 | } 37 | } 38 | 39 | fn default_statistics() -> Vec { 40 | CpuStatistic::iter().collect() 41 | } 42 | 43 | impl SamplerConfig for CpuConfig { 44 | type Statistic = CpuStatistic; 45 | fn enabled(&self) -> bool { 46 | self.enabled 47 | } 48 | 49 | fn interval(&self) -> Option { 50 | self.interval 51 | } 52 | 53 | fn percentiles(&self) -> &[f64] { 54 | &self.percentiles 55 | } 56 | 57 | fn perf_events(&self) -> bool { 58 | self.perf_events 59 | } 60 | 61 | fn statistics(&self) -> Vec<::Statistic> { 62 | let mut enabled = Vec::new(); 63 | for statistic in self.statistics.iter() { 64 | if statistic.table().is_some() { 65 | if self.perf_events() { 66 | enabled.push(*statistic); 67 | } 68 | } else { 69 | enabled.push(*statistic); 70 | } 71 | } 72 | enabled 73 | } 74 | } 75 | -------------------------------------------------------------------------------- /src/samplers/cpu/perf.c: -------------------------------------------------------------------------------- 1 | // Copyright 2019 Twitter, Inc. 
2 | // Licensed under the Apache License, Version 2.0 3 | // http://www.apache.org/licenses/LICENSE-2.0 4 | 5 | #include 6 | #include 7 | 8 | // Arrays which hold the perf counters 9 | BPF_PERF_ARRAY(branch_instructions_array, NUM_CPU); 10 | BPF_PERF_ARRAY(branch_misses_array, NUM_CPU); 11 | BPF_PERF_ARRAY(cache_misses_array, NUM_CPU); 12 | BPF_PERF_ARRAY(cache_references_array, NUM_CPU); 13 | BPF_PERF_ARRAY(cycles_array, NUM_CPU); 14 | BPF_PERF_ARRAY(dtlb_load_miss_array, NUM_CPU); 15 | BPF_PERF_ARRAY(dtlb_load_access_array, NUM_CPU); 16 | BPF_PERF_ARRAY(dtlb_store_miss_array, NUM_CPU); 17 | BPF_PERF_ARRAY(dtlb_store_access_array, NUM_CPU); 18 | BPF_PERF_ARRAY(instructions_array, NUM_CPU); 19 | BPF_PERF_ARRAY(reference_cycles_array, NUM_CPU); 20 | 21 | // Tables which are read in user space 22 | BPF_ARRAY(branch_instructions, u64, NUM_CPU); 23 | BPF_ARRAY(branch_misses, u64, NUM_CPU); 24 | BPF_ARRAY(cache_misses, u64, NUM_CPU); 25 | BPF_ARRAY(cache_references, u64, NUM_CPU); 26 | BPF_ARRAY(cycles, u64, NUM_CPU); 27 | BPF_ARRAY(dtlb_load_access, u64, NUM_CPU); 28 | BPF_ARRAY(dtlb_load_miss, u64, NUM_CPU); 29 | BPF_ARRAY(dtlb_store_access, u64, NUM_CPU); 30 | BPF_ARRAY(dtlb_store_miss, u64, NUM_CPU); 31 | BPF_ARRAY(instructions, u64, NUM_CPU); 32 | BPF_ARRAY(reference_cycles, u64, NUM_CPU); 33 | 34 | int do_count(struct bpf_perf_event_data *ctx) { 35 | u32 cpu = bpf_get_smp_processor_id(); 36 | u64 count = 0; 37 | 38 | count = branch_instructions_array.perf_read(CUR_CPU_IDENTIFIER); 39 | if ((s64)count < -256 || (s64)count > 0) { 40 | branch_instructions.update(&cpu, &count); 41 | } 42 | 43 | count = branch_misses_array.perf_read(CUR_CPU_IDENTIFIER); 44 | if ((s64)count < -256 || (s64)count > 0) { 45 | branch_misses.update(&cpu, &count); 46 | } 47 | 48 | count = cache_misses_array.perf_read(CUR_CPU_IDENTIFIER); 49 | if ((s64)count < -256 || (s64)count > 0) { 50 | cache_misses.update(&cpu, &count); 51 | } 52 | 53 | count = 
cache_references_array.perf_read(CUR_CPU_IDENTIFIER); 54 | if ((s64)count < -256 || (s64)count > 0) { 55 | cache_references.update(&cpu, &count); 56 | } 57 | 58 | count = cycles_array.perf_read(CUR_CPU_IDENTIFIER); 59 | if ((s64)count < -256 || (s64)count > 0) { 60 | cycles.update(&cpu, &count); 61 | } 62 | 63 | count = dtlb_load_access_array.perf_read(CUR_CPU_IDENTIFIER); 64 | if ((s64)count < -256 || (s64)count > 0) { 65 | dtlb_load_access.update(&cpu, &count); 66 | } 67 | 68 | count = dtlb_load_miss_array.perf_read(CUR_CPU_IDENTIFIER); 69 | if ((s64)count < -256 || (s64)count > 0) { 70 | dtlb_load_miss.update(&cpu, &count); 71 | } 72 | 73 | count = dtlb_store_access_array.perf_read(CUR_CPU_IDENTIFIER); 74 | if ((s64)count < -256 || (s64)count > 0) { 75 | dtlb_store_access.update(&cpu, &count); 76 | } 77 | 78 | count = dtlb_store_miss_array.perf_read(CUR_CPU_IDENTIFIER); 79 | if ((s64)count < -256 || (s64)count > 0) { 80 | dtlb_store_miss.update(&cpu, &count); 81 | } 82 | 83 | count = instructions_array.perf_read(CUR_CPU_IDENTIFIER); 84 | if ((s64)count < -256 || (s64)count > 0) { 85 | instructions.update(&cpu, &count); 86 | } 87 | 88 | count = reference_cycles_array.perf_read(CUR_CPU_IDENTIFIER); 89 | if ((s64)count < -256 || (s64)count > 0) { 90 | reference_cycles.update(&cpu, &count); 91 | } 92 | 93 | return 0; 94 | } 95 | -------------------------------------------------------------------------------- /src/samplers/cpu/stat.rs: -------------------------------------------------------------------------------- 1 | // Copyright 2019 Twitter, Inc. 
2 | // Licensed under the Apache License, Version 2.0 3 | // http://www.apache.org/licenses/LICENSE-2.0 4 | 5 | use core::str::FromStr; 6 | 7 | #[cfg(feature = "bpf")] 8 | use bcc::perf_event::*; 9 | 10 | use crate::metrics::*; 11 | use serde_derive::{Deserialize, Serialize}; 12 | use strum_macros::{EnumIter, EnumString, IntoStaticStr}; 13 | 14 | #[derive( 15 | Clone, 16 | Copy, 17 | Debug, 18 | Deserialize, 19 | EnumIter, 20 | EnumString, 21 | Eq, 22 | IntoStaticStr, 23 | PartialEq, 24 | Hash, 25 | Serialize, 26 | )] 27 | #[serde(deny_unknown_fields, try_from = "&str", into = "&str")] 28 | pub enum CpuStatistic { 29 | #[strum(serialize = "cpu/usage/user")] 30 | UsageUser, 31 | #[strum(serialize = "cpu/usage/nice")] 32 | UsageNice, 33 | #[strum(serialize = "cpu/usage/system")] 34 | UsageSystem, 35 | #[strum(serialize = "cpu/usage/idle")] 36 | UsageIdle, 37 | #[strum(serialize = "cpu/usage/irq")] 38 | UsageIrq, 39 | #[strum(serialize = "cpu/usage/softirq")] 40 | UsageSoftirq, 41 | #[strum(serialize = "cpu/usage/steal")] 42 | UsageSteal, 43 | #[strum(serialize = "cpu/usage/guest")] 44 | UsageGuest, 45 | #[strum(serialize = "cpu/usage/guestnice")] 46 | UsageGuestNice, 47 | #[strum(serialize = "cpu/cache/miss")] 48 | CacheMiss, 49 | #[strum(serialize = "cpu/cache/access")] 50 | CacheAccess, 51 | #[strum(serialize = "cpu/bpu/branch")] 52 | BpuBranches, 53 | #[strum(serialize = "cpu/bpu/miss")] 54 | BpuMiss, 55 | #[strum(serialize = "cpu/cycles")] 56 | Cycles, 57 | #[strum(serialize = "cpu/dtlb/load/miss")] 58 | DtlbLoadMiss, 59 | #[strum(serialize = "cpu/dtlb/load/access")] 60 | DtlbLoadAccess, 61 | #[strum(serialize = "cpu/dtlb/store/access")] 62 | DtlbStoreAccess, 63 | #[strum(serialize = "cpu/dtlb/store/miss")] 64 | DtlbStoreMiss, 65 | #[strum(serialize = "cpu/instructions")] 66 | Instructions, 67 | #[strum(serialize = "cpu/reference_cycles")] 68 | ReferenceCycles, 69 | #[strum(serialize = "cpu/cstate/c0/time")] 70 | CstateC0Time, 71 | #[strum(serialize = 
"cpu/cstate/c1/time")] 72 | CstateC1Time, 73 | #[strum(serialize = "cpu/cstate/c1e/time")] 74 | CstateC1ETime, 75 | #[strum(serialize = "cpu/cstate/c2/time")] 76 | CstateC2Time, 77 | #[strum(serialize = "cpu/cstate/c3/time")] 78 | CstateC3Time, 79 | #[strum(serialize = "cpu/cstate/c6/time")] 80 | CstateC6Time, 81 | #[strum(serialize = "cpu/cstate/c7/time")] 82 | CstateC7Time, 83 | #[strum(serialize = "cpu/cstate/c8/time")] 84 | CstateC8Time, 85 | #[strum(serialize = "cpu/frequency")] 86 | Frequency, 87 | } 88 | 89 | impl Statistic for CpuStatistic { 90 | fn name(&self) -> &str { 91 | (*self).into() 92 | } 93 | 94 | fn source(&self) -> Source { 95 | match self { 96 | Self::Frequency => Source::Gauge, 97 | _ => Source::Counter, 98 | } 99 | } 100 | } 101 | 102 | impl CpuStatistic { 103 | #[cfg(feature = "bpf")] 104 | pub fn event(self) -> Option { 105 | match self { 106 | Self::BpuBranches => Some(Event::Hardware(HardwareEvent::BranchInstructions)), 107 | Self::BpuMiss => Some(Event::Hardware(HardwareEvent::BranchMisses)), 108 | Self::CacheAccess => Some(Event::Hardware(HardwareEvent::CacheReferences)), 109 | Self::CacheMiss => Some(Event::Hardware(HardwareEvent::CacheMisses)), 110 | Self::Cycles => Some(Event::Hardware(HardwareEvent::CpuCycles)), 111 | Self::DtlbLoadMiss => Some(Event::HardwareCache( 112 | CacheId::DTLB, 113 | CacheOp::Read, 114 | CacheResult::Miss, 115 | )), 116 | Self::DtlbLoadAccess => Some(Event::HardwareCache( 117 | CacheId::DTLB, 118 | CacheOp::Read, 119 | CacheResult::Access, 120 | )), 121 | Self::DtlbStoreMiss => Some(Event::HardwareCache( 122 | CacheId::DTLB, 123 | CacheOp::Write, 124 | CacheResult::Miss, 125 | )), 126 | Self::DtlbStoreAccess => Some(Event::HardwareCache( 127 | CacheId::DTLB, 128 | CacheOp::Write, 129 | CacheResult::Access, 130 | )), 131 | Self::Instructions => Some(Event::Hardware(HardwareEvent::Instructions)), 132 | Self::ReferenceCycles => Some(Event::Hardware(HardwareEvent::RefCpuCycles)), 133 | _ => None, 134 | } 135 | 
} 136 | 137 | pub fn table(self) -> Option<&'static str> { 138 | match self { 139 | Self::BpuBranches => Some("branch_instructions"), 140 | Self::BpuMiss => Some("branch_misses"), 141 | Self::CacheMiss => Some("cache_misses"), 142 | Self::CacheAccess => Some("cache_references"), 143 | Self::Cycles => Some("cycles"), 144 | Self::DtlbLoadMiss => Some("dtlb_load_miss"), 145 | Self::DtlbLoadAccess => Some("dtlb_load_access"), 146 | Self::DtlbStoreMiss => Some("dtlb_store_miss"), 147 | Self::DtlbStoreAccess => Some("dtlb_store_access"), 148 | Self::Instructions => Some("instructions"), 149 | Self::ReferenceCycles => Some("reference_cycles"), 150 | _ => None, 151 | } 152 | } 153 | } 154 | 155 | #[derive(Debug)] 156 | pub struct ParseCStateError; 157 | 158 | impl std::fmt::Display for ParseCStateError { 159 | fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { 160 | write!(f, "Error parsing cstate") 161 | } 162 | } 163 | 164 | impl std::error::Error for ParseCStateError { 165 | fn description(&self) -> &str { 166 | "Error parsing cstate" 167 | } 168 | } 169 | 170 | #[derive(Clone, Copy, Debug, Deserialize, Eq, PartialEq, Hash)] 171 | pub enum CState { 172 | C0, 173 | C1, 174 | C1E, 175 | C2, 176 | C3, 177 | C6, 178 | C7, 179 | C8, 180 | } 181 | 182 | impl FromStr for CState { 183 | type Err = ParseCStateError; 184 | 185 | fn from_str(s: &str) -> Result { 186 | match s { 187 | "POLL" | "C0" => Ok(CState::C0), 188 | "C1" => Ok(CState::C1), 189 | "C1E" => Ok(CState::C1E), 190 | "C2" => Ok(CState::C2), 191 | "C3" => Ok(CState::C3), 192 | "C6" => Ok(CState::C6), 193 | "C7" => Ok(CState::C7), 194 | "C8" => Ok(CState::C8), 195 | _ => Err(ParseCStateError), 196 | } 197 | } 198 | } 199 | -------------------------------------------------------------------------------- /src/samplers/disk/bpf.c: -------------------------------------------------------------------------------- 1 | // Copyright 2019 Twitter, Inc. 
2 | // Licensed under the Apache License, Version 2.0 3 | // http://www.apache.org/licenses/LICENSE-2.0 4 | 5 | #include 6 | #include 7 | 8 | struct val_t { 9 | char name[TASK_COMM_LEN]; 10 | }; 11 | 12 | // hashes to track request details 13 | BPF_HASH(queue_start, struct request *); 14 | BPF_HASH(request_start, struct request *); 15 | BPF_HASH(commbyreq, struct request *, struct val_t); 16 | 17 | // value_to_index2() gives us from 0-460 as the index 18 | BPF_HISTOGRAM(io_size_read, int, 461); 19 | BPF_HISTOGRAM(latency_read, int, 461); 20 | BPF_HISTOGRAM(device_latency_read, int, 461); 21 | BPF_HISTOGRAM(queue_latency_read, int, 461); 22 | BPF_HISTOGRAM(io_size_write, int, 461); 23 | BPF_HISTOGRAM(latency_write, int, 461); 24 | BPF_HISTOGRAM(device_latency_write, int, 461); 25 | BPF_HISTOGRAM(queue_latency_write, int, 461); 26 | 27 | VALUE_TO_INDEX2_FUNC 28 | 29 | int trace_pid_start(struct pt_regs *ctx, struct request *req) 30 | { 31 | struct val_t val = {}; 32 | if (bpf_get_current_comm(&val.name, sizeof(val.name)) == 0) { 33 | u64 ts = bpf_ktime_get_ns(); 34 | queue_start.update(&req, &ts); 35 | commbyreq.update(&req, &val); 36 | } 37 | return 0; 38 | } 39 | 40 | int trace_req_start(struct pt_regs *ctx, struct request *req) 41 | { 42 | u64 now = bpf_ktime_get_ns(); 43 | 44 | u64 rwflag = 0; 45 | #ifdef REQ_WRITE 46 | rwflag = !!(req->cmd_flags & REQ_WRITE); 47 | #elif defined(REQ_OP_SHIFT) 48 | rwflag = !!((req->cmd_flags >> REQ_OP_SHIFT) == REQ_OP_WRITE); 49 | #else 50 | rwflag = !!((req->cmd_flags & REQ_OP_MASK) == REQ_OP_WRITE); 51 | #endif 52 | 53 | u64 *enqueued; 54 | enqueued = queue_start.lookup(&req); 55 | if (enqueued != 0) { 56 | unsigned int index = value_to_index2((now - *enqueued) / 1000); 57 | if (rwflag == 1) { 58 | queue_latency_write.increment(index); 59 | } else { 60 | queue_latency_read.increment(index); 61 | } 62 | } 63 | request_start.update(&req, &now); 64 | return 0; 65 | } 66 | 67 | int do_count(struct pt_regs *ctx, struct request *req) 
68 | { 69 | u64 now = bpf_ktime_get_ns(); 70 | 71 | u64 rwflag = 0; 72 | #ifdef REQ_WRITE 73 | rwflag = !!(req->cmd_flags & REQ_WRITE); 74 | #elif defined(REQ_OP_SHIFT) 75 | rwflag = !!((req->cmd_flags >> REQ_OP_SHIFT) == REQ_OP_WRITE); 76 | #else 77 | rwflag = !!((req->cmd_flags & REQ_OP_MASK) == REQ_OP_WRITE); 78 | #endif 79 | 80 | // Size 81 | struct val_t *valp; 82 | valp = commbyreq.lookup(&req); 83 | if (valp == 0) { 84 | return 0; 85 | } 86 | u64 delta = req->__data_len / 1024; 87 | unsigned int index = value_to_index2(delta); 88 | if (req->__data_len > 0) { 89 | if (rwflag == 1) { 90 | io_size_write.increment(index); 91 | } else { 92 | io_size_read.increment(index); 93 | } 94 | } 95 | 96 | // Latency 97 | u64 *enqueued, *requested; 98 | 99 | // total latency including queued time 100 | enqueued = queue_start.lookup(&req); 101 | if (enqueued != 0) { 102 | unsigned int index = value_to_index2((now - *enqueued) / 1000); 103 | if (rwflag == 1) { 104 | latency_write.increment(index); 105 | } else { 106 | latency_read.increment(index); 107 | } 108 | } 109 | 110 | // request latency not including queued time 111 | requested = request_start.lookup(&req); 112 | if (requested != 0) { 113 | unsigned int index = value_to_index2((now - *requested) / 1000); 114 | if (rwflag == 1) { 115 | device_latency_write.increment(index); 116 | } else { 117 | device_latency_read.increment(index); 118 | } 119 | } 120 | 121 | return 0; 122 | } 123 | -------------------------------------------------------------------------------- /src/samplers/disk/config.rs: -------------------------------------------------------------------------------- 1 | // Copyright 2019 Twitter, Inc. 
2 | // Licensed under the Apache License, Version 2.0 3 | // http://www.apache.org/licenses/LICENSE-2.0 4 | 5 | use serde_derive::Deserialize; 6 | use strum::IntoEnumIterator; 7 | 8 | use crate::config::SamplerConfig; 9 | 10 | use super::stat::*; 11 | 12 | #[derive(Debug, Deserialize)] 13 | #[serde(deny_unknown_fields)] 14 | pub struct DiskConfig { 15 | #[serde(default)] 16 | bpf: bool, 17 | #[serde(default)] 18 | enabled: bool, 19 | #[serde(default)] 20 | interval: Option, 21 | #[serde(default = "crate::common::default_percentiles")] 22 | percentiles: Vec, 23 | #[serde(default = "default_statistics")] 24 | statistics: Vec, 25 | } 26 | 27 | impl Default for DiskConfig { 28 | fn default() -> Self { 29 | Self { 30 | bpf: Default::default(), 31 | enabled: Default::default(), 32 | interval: Default::default(), 33 | percentiles: crate::common::default_percentiles(), 34 | statistics: default_statistics(), 35 | } 36 | } 37 | } 38 | 39 | fn default_statistics() -> Vec { 40 | DiskStatistic::iter().collect() 41 | } 42 | 43 | impl SamplerConfig for DiskConfig { 44 | type Statistic = DiskStatistic; 45 | 46 | fn bpf(&self) -> bool { 47 | self.bpf 48 | } 49 | 50 | fn enabled(&self) -> bool { 51 | self.enabled 52 | } 53 | 54 | fn interval(&self) -> Option { 55 | self.interval 56 | } 57 | 58 | fn percentiles(&self) -> &[f64] { 59 | &self.percentiles 60 | } 61 | 62 | fn statistics(&self) -> Vec<::Statistic> { 63 | let mut enabled = Vec::new(); 64 | for statistic in self.statistics.iter() { 65 | if statistic.bpf_table().is_some() { 66 | if self.bpf() { 67 | enabled.push(*statistic); 68 | } 69 | } else { 70 | enabled.push(*statistic); 71 | } 72 | } 73 | enabled 74 | } 75 | } 76 | -------------------------------------------------------------------------------- /src/samplers/disk/stat.rs: -------------------------------------------------------------------------------- 1 | // Copyright 2019 Twitter, Inc. 
2 | // Licensed under the Apache License, Version 2.0 3 | // http://www.apache.org/licenses/LICENSE-2.0 4 | 5 | use crate::metrics::*; 6 | use serde_derive::{Deserialize, Serialize}; 7 | use strum_macros::{EnumIter, EnumString, IntoStaticStr}; 8 | 9 | #[cfg(feature = "bpf")] 10 | use crate::common::bpf::*; 11 | 12 | #[derive( 13 | Clone, 14 | Copy, 15 | Debug, 16 | Deserialize, 17 | EnumIter, 18 | EnumString, 19 | Eq, 20 | IntoStaticStr, 21 | PartialEq, 22 | Hash, 23 | Serialize, 24 | )] 25 | #[serde(deny_unknown_fields, try_from = "&str", into = "&str")] 26 | pub enum DiskStatistic { 27 | #[strum(serialize = "disk/read/bytes")] 28 | BandwidthRead, 29 | #[strum(serialize = "disk/write/bytes")] 30 | BandwidthWrite, 31 | #[strum(serialize = "disk/discard/bytes")] 32 | BandwidthDiscard, 33 | #[strum(serialize = "disk/read/operations")] 34 | OperationsRead, 35 | #[strum(serialize = "disk/write/operations")] 36 | OperationsWrite, 37 | #[strum(serialize = "disk/discard/operations")] 38 | OperationsDiscard, 39 | #[strum(serialize = "disk/read/latency")] 40 | LatencyRead, 41 | #[strum(serialize = "disk/write/latency")] 42 | LatencyWrite, 43 | #[strum(serialize = "disk/read/device_latency")] 44 | DeviceLatencyRead, 45 | #[strum(serialize = "disk/write/device_latency")] 46 | DeviceLatencyWrite, 47 | #[strum(serialize = "disk/read/queue_latency")] 48 | QueueLatencyRead, 49 | #[strum(serialize = "disk/write/queue_latency")] 50 | QueueLatencyWrite, 51 | #[strum(serialize = "disk/read/io_size")] 52 | IoSizeRead, 53 | #[strum(serialize = "disk/write/io_size")] 54 | IoSizeWrite, 55 | } 56 | 57 | impl DiskStatistic { 58 | pub fn bpf_table(self) -> Option<&'static str> { 59 | match self { 60 | Self::LatencyRead => Some("latency_read"), 61 | Self::LatencyWrite => Some("latency_write"), 62 | Self::DeviceLatencyRead => Some("device_latency_read"), 63 | Self::DeviceLatencyWrite => Some("device_latency_write"), 64 | Self::QueueLatencyRead => Some("queue_latency_read"), 65 | 
Self::QueueLatencyWrite => Some("queue_latency_write"), 66 | Self::IoSizeRead => Some("io_size_read"), 67 | Self::IoSizeWrite => Some("io_size_write"), 68 | _ => None, 69 | } 70 | } 71 | 72 | #[cfg(feature = "bpf")] 73 | pub fn bpf_probes_required(self) -> Vec { 74 | // define the unique probes below. 75 | let pid_start_probe = Probe { 76 | name: "blk_account_io_start".to_string(), 77 | handler: "trace_pid_start".to_string(), 78 | probe_type: ProbeType::Kernel, 79 | probe_location: ProbeLocation::Entry, 80 | binary_path: None, 81 | sub_system: None, 82 | }; 83 | let request_start_probe = Probe { 84 | name: "blk_start_request".to_string(), 85 | handler: "trace_req_start".to_string(), 86 | probe_type: ProbeType::Kernel, 87 | probe_location: ProbeLocation::Entry, 88 | binary_path: None, 89 | sub_system: None, 90 | }; 91 | let request_mq_start_request_probe = Probe { 92 | name: "blk_mq_start_request".to_string(), 93 | handler: "trace_req_start".to_string(), 94 | probe_type: ProbeType::Kernel, 95 | probe_location: ProbeLocation::Entry, 96 | binary_path: None, 97 | sub_system: None, 98 | }; 99 | let pid_done_probe = Probe { 100 | name: "blk_account_io_done".to_string(), 101 | handler: "do_count".to_string(), 102 | probe_type: ProbeType::Kernel, 103 | probe_location: ProbeLocation::Return, 104 | binary_path: None, 105 | sub_system: None, 106 | }; 107 | let pid_completion_probe = Probe { 108 | name: "blk_account_io_completion".to_string(), 109 | handler: "do_count".to_string(), 110 | probe_type: ProbeType::Kernel, 111 | probe_location: ProbeLocation::Entry, 112 | binary_path: None, 113 | sub_system: None, 114 | }; 115 | 116 | // specify what probes are required for each telemetry. 
117 | match self { 118 | Self::LatencyRead | Self::LatencyWrite | Self::IoSizeRead | Self::IoSizeWrite => { 119 | vec![pid_start_probe, pid_done_probe, pid_completion_probe] 120 | } 121 | Self::DeviceLatencyRead | Self::DeviceLatencyWrite => vec![ 122 | request_start_probe, 123 | request_mq_start_request_probe, 124 | pid_done_probe, 125 | pid_completion_probe, 126 | ], 127 | Self::QueueLatencyRead | Self::QueueLatencyWrite => vec![ 128 | pid_start_probe, 129 | request_start_probe, 130 | request_mq_start_request_probe, 131 | ], 132 | _ => Vec::new(), 133 | } 134 | } 135 | } 136 | 137 | impl Statistic for DiskStatistic { 138 | fn name(&self) -> &str { 139 | (*self).into() 140 | } 141 | 142 | fn source(&self) -> Source { 143 | if self.bpf_table().is_some() { 144 | Source::Distribution 145 | } else { 146 | Source::Counter 147 | } 148 | } 149 | } 150 | -------------------------------------------------------------------------------- /src/samplers/ext4/bpf.c: -------------------------------------------------------------------------------- 1 | // Based on: https://github.com/iovisor/bcc/blob/master/tools/ext4dist.py 2 | 3 | #include 4 | #include 5 | #include 6 | 7 | #define OP_NAME_LEN 8 8 | 9 | typedef struct dist_key { 10 | char op[OP_NAME_LEN]; 11 | u64 slot; 12 | } dist_key_t; 13 | 14 | BPF_HASH(start, u32); 15 | 16 | // value_to_index2() gives us from 0-460 as the index 17 | BPF_HISTOGRAM(read, int, 461); 18 | BPF_HISTOGRAM(write, int, 461); 19 | BPF_HISTOGRAM(open, int, 461); 20 | BPF_HISTOGRAM(fsync, int, 461); 21 | 22 | VALUE_TO_INDEX2_FUNC 23 | 24 | int trace_entry(struct pt_regs *ctx) 25 | { 26 | u32 pid = bpf_get_current_pid_tgid(); 27 | u64 ts = bpf_ktime_get_ns(); 28 | start.update(&pid, &ts); 29 | return 0; 30 | } 31 | 32 | int trace_read_entry(struct pt_regs *ctx, struct kiocb *iocb) 33 | { 34 | u32 pid = bpf_get_current_pid_tgid(); 35 | struct file *fp = iocb->ki_filp; 36 | if ((u64)fp->f_op == EXT4_FILE_OPERATIONS) 37 | return 0; 38 | u64 ts = 
bpf_ktime_get_ns(); 39 | start.update(&pid, &ts); 40 | return 0; 41 | } 42 | 43 | static int trace_return(struct pt_regs *ctx, int op) 44 | { 45 | // get pid 46 | u32 pid = bpf_get_current_pid_tgid(); 47 | 48 | // lookup start 49 | u64 *tsp = start.lookup(&pid); 50 | 51 | // skip events with unknown start 52 | if (tsp == 0) { 53 | return 0; 54 | } 55 | 56 | // calculate latency 57 | u64 delta = (bpf_ktime_get_ns() - *tsp) / 1000; 58 | 59 | // store as histogram 60 | unsigned int index = value_to_index2(delta); 61 | if (op == 0) { 62 | read.increment(index); 63 | } else if (op == 1) { 64 | write.increment(index); 65 | } else if (op == 2) { 66 | open.increment(index); 67 | } else if (op == 3) { 68 | fsync.increment(index); 69 | } 70 | 71 | // clear the start entry from the map 72 | start.delete(&pid); 73 | 74 | return 0; 75 | } 76 | 77 | int trace_read_return(struct pt_regs *ctx) 78 | { 79 | return trace_return(ctx, 0); 80 | } 81 | 82 | int trace_write_return(struct pt_regs *ctx) 83 | { 84 | return trace_return(ctx, 1); 85 | } 86 | 87 | int trace_open_return(struct pt_regs *ctx) 88 | { 89 | return trace_return(ctx, 2); 90 | } 91 | 92 | int trace_fsync_return(struct pt_regs *ctx) 93 | { 94 | return trace_return(ctx, 3); 95 | } 96 | -------------------------------------------------------------------------------- /src/samplers/ext4/config.rs: -------------------------------------------------------------------------------- 1 | // Copyright 2019 Twitter, Inc. 
2 | // Licensed under the Apache License, Version 2.0 3 | // http://www.apache.org/licenses/LICENSE-2.0 4 | 5 | use serde_derive::Deserialize; 6 | use strum::IntoEnumIterator; 7 | 8 | use crate::config::SamplerConfig; 9 | 10 | use super::stat::*; 11 | 12 | #[derive(Debug, Deserialize)] 13 | #[serde(deny_unknown_fields)] 14 | pub struct Ext4Config { 15 | #[serde(default)] 16 | bpf: bool, 17 | #[serde(default)] 18 | enabled: bool, 19 | #[serde(default)] 20 | interval: Option, 21 | #[serde(default = "crate::common::default_percentiles")] 22 | percentiles: Vec, 23 | #[serde(default = "default_statistics")] 24 | statistics: Vec, 25 | } 26 | 27 | impl Default for Ext4Config { 28 | fn default() -> Self { 29 | Self { 30 | bpf: Default::default(), 31 | enabled: Default::default(), 32 | interval: Default::default(), 33 | percentiles: crate::common::default_percentiles(), 34 | statistics: default_statistics(), 35 | } 36 | } 37 | } 38 | 39 | fn default_statistics() -> Vec { 40 | Ext4Statistic::iter().collect() 41 | } 42 | 43 | impl SamplerConfig for Ext4Config { 44 | type Statistic = Ext4Statistic; 45 | 46 | fn bpf(&self) -> bool { 47 | self.bpf 48 | } 49 | 50 | fn enabled(&self) -> bool { 51 | self.enabled 52 | } 53 | 54 | fn interval(&self) -> Option { 55 | self.interval 56 | } 57 | 58 | fn percentiles(&self) -> &[f64] { 59 | &self.percentiles 60 | } 61 | 62 | fn statistics(&self) -> Vec<::Statistic> { 63 | let mut enabled = Vec::new(); 64 | for statistic in self.statistics.iter() { 65 | if statistic.bpf_table().is_some() { 66 | if self.bpf() { 67 | enabled.push(*statistic); 68 | } 69 | } else { 70 | enabled.push(*statistic); 71 | } 72 | } 73 | enabled 74 | } 75 | } 76 | -------------------------------------------------------------------------------- /src/samplers/ext4/stat.rs: -------------------------------------------------------------------------------- 1 | // Copyright 2019 Twitter, Inc. 
2 | // Licensed under the Apache License, Version 2.0 3 | // http://www.apache.org/licenses/LICENSE-2.0 4 | 5 | use crate::metrics::*; 6 | use serde_derive::{Deserialize, Serialize}; 7 | use strum_macros::{EnumIter, EnumString, IntoStaticStr}; 8 | 9 | #[cfg(feature = "bpf")] 10 | use crate::common::bpf::*; 11 | 12 | #[derive( 13 | Clone, 14 | Copy, 15 | Debug, 16 | Deserialize, 17 | EnumIter, 18 | EnumString, 19 | Eq, 20 | IntoStaticStr, 21 | PartialEq, 22 | Hash, 23 | Serialize, 24 | )] 25 | #[serde(deny_unknown_fields, try_from = "&str", into = "&str")] 26 | #[allow(clippy::enum_variant_names)] 27 | pub enum Ext4Statistic { 28 | #[strum(serialize = "ext4/read/latency")] 29 | ReadLatency, 30 | #[strum(serialize = "ext4/write/latency")] 31 | WriteLatency, 32 | #[strum(serialize = "ext4/open/latency")] 33 | OpenLatency, 34 | #[strum(serialize = "ext4/fsync/latency")] 35 | FsyncLatency, 36 | } 37 | 38 | impl Ext4Statistic { 39 | #[allow(dead_code)] 40 | pub fn bpf_table(self) -> Option<&'static str> { 41 | match self { 42 | Self::ReadLatency => Some("read"), 43 | Self::WriteLatency => Some("write"), 44 | Self::OpenLatency => Some("open"), 45 | Self::FsyncLatency => Some("fsync"), 46 | } 47 | } 48 | 49 | #[cfg(feature = "bpf")] 50 | pub fn bpf_probes_required(self) -> Vec { 51 | // define the unique probes below. 
52 | let generic_file_read_probe = Probe { 53 | name: "generic_file_read_iter".to_string(), 54 | handler: "trace_read_entry".to_string(), 55 | probe_type: ProbeType::Kernel, 56 | probe_location: ProbeLocation::Entry, 57 | binary_path: None, 58 | sub_system: None, 59 | }; 60 | let ext4_file_write_probe = Probe { 61 | name: "ext4_file_write_iter".to_string(), 62 | handler: "trace_entry".to_string(), 63 | probe_type: ProbeType::Kernel, 64 | probe_location: ProbeLocation::Entry, 65 | binary_path: None, 66 | sub_system: None, 67 | }; 68 | let ext4_file_open_probe = Probe { 69 | name: "ext4_file_open".to_string(), 70 | handler: "trace_entry".to_string(), 71 | probe_type: ProbeType::Kernel, 72 | probe_location: ProbeLocation::Entry, 73 | binary_path: None, 74 | sub_system: None, 75 | }; 76 | let ext4_sync_file_probe = Probe { 77 | name: "ext4_sync_file".to_string(), 78 | handler: "trace_entry".to_string(), 79 | probe_type: ProbeType::Kernel, 80 | probe_location: ProbeLocation::Entry, 81 | binary_path: None, 82 | sub_system: None, 83 | }; 84 | let generic_file_read_ret_probe = Probe { 85 | name: "generic_file_read_iter".to_string(), 86 | handler: "trace_read_return".to_string(), 87 | probe_type: ProbeType::Kernel, 88 | probe_location: ProbeLocation::Return, 89 | binary_path: None, 90 | sub_system: None, 91 | }; 92 | let ext4_file_write_ret_probe = Probe { 93 | name: "ext4_file_write_iter".to_string(), 94 | handler: "trace_write_return".to_string(), 95 | probe_type: ProbeType::Kernel, 96 | probe_location: ProbeLocation::Return, 97 | binary_path: None, 98 | sub_system: None, 99 | }; 100 | let ext4_file_open_ret_probe = Probe { 101 | name: "ext4_file_open".to_string(), 102 | handler: "trace_open_return".to_string(), 103 | probe_type: ProbeType::Kernel, 104 | probe_location: ProbeLocation::Return, 105 | binary_path: None, 106 | sub_system: None, 107 | }; 108 | let ext4_sync_file_ret_probe = Probe { 109 | name: "ext4_sync_file".to_string(), 110 | handler: 
"trace_fsync_return".to_string(), 111 | probe_type: ProbeType::Kernel, 112 | probe_location: ProbeLocation::Return, 113 | binary_path: None, 114 | sub_system: None, 115 | }; 116 | 117 | // specify what probes are required for each telemetry. 118 | match self { 119 | Self::ReadLatency => vec![generic_file_read_probe, generic_file_read_ret_probe], 120 | Self::WriteLatency => vec![ext4_file_write_probe, ext4_file_write_ret_probe], 121 | Self::OpenLatency => vec![ext4_file_open_probe, ext4_file_open_ret_probe], 122 | Self::FsyncLatency => vec![ext4_sync_file_probe, ext4_sync_file_ret_probe], 123 | } 124 | } 125 | } 126 | 127 | impl Statistic for Ext4Statistic { 128 | fn name(&self) -> &str { 129 | (*self).into() 130 | } 131 | 132 | fn source(&self) -> Source { 133 | Source::Distribution 134 | } 135 | } 136 | -------------------------------------------------------------------------------- /src/samplers/http/config.rs: -------------------------------------------------------------------------------- 1 | // Copyright 2019 Twitter, Inc. 
2 | // Licensed under the Apache License, Version 2.0 3 | // http://www.apache.org/licenses/LICENSE-2.0 4 | 5 | use serde_derive::Deserialize; 6 | 7 | use crate::config::SamplerConfig; 8 | 9 | use super::stat::*; 10 | 11 | fn default_timeout() -> u64 { 12 | 200 13 | } 14 | 15 | #[derive(Debug, Deserialize)] 16 | #[serde(deny_unknown_fields)] 17 | pub struct HttpConfig { 18 | counters: Vec, 19 | #[serde(default)] 20 | enabled: bool, 21 | gauges: Vec, 22 | #[serde(default)] 23 | interval: Option, 24 | #[serde(default)] 25 | passthrough: bool, 26 | #[serde(default = "crate::common::default_percentiles")] 27 | percentiles: Vec, 28 | url: Option, 29 | // http request timeout in milliseconds 30 | #[serde(default = "default_timeout")] 31 | timeout: u64, 32 | } 33 | 34 | impl Default for HttpConfig { 35 | fn default() -> Self { 36 | Self { 37 | counters: Vec::new(), 38 | enabled: Default::default(), 39 | gauges: Vec::new(), 40 | interval: Default::default(), 41 | passthrough: Default::default(), 42 | percentiles: crate::common::default_percentiles(), 43 | url: None, 44 | timeout: default_timeout(), 45 | } 46 | } 47 | } 48 | 49 | impl HttpConfig { 50 | /// The URL to query metrics from 51 | pub fn url(&self) -> Option { 52 | self.url.clone() 53 | } 54 | 55 | /// Timeout for HTTP requests 56 | pub fn timeout(&self) -> core::time::Duration { 57 | core::time::Duration::from_millis(self.timeout) 58 | } 59 | 60 | /// A list of metric names that should be processed as gauges with 61 | /// percentiles 62 | pub fn gauges(&self) -> &[String] { 63 | &self.gauges 64 | } 65 | 66 | /// A list of metric names that should be processed as counters with 67 | /// percentiles 68 | pub fn counters(&self) -> &[String] { 69 | &self.counters 70 | } 71 | 72 | /// Whether unlisted metrics should be passed through to the output, which 73 | /// internally treats them as gauges without percentiles 74 | pub fn passthrough(&self) -> bool { 75 | self.passthrough 76 | } 77 | } 78 | 79 | impl SamplerConfig 
for HttpConfig { 80 | type Statistic = HttpStatistic; 81 | 82 | fn enabled(&self) -> bool { 83 | self.enabled 84 | } 85 | 86 | fn interval(&self) -> Option { 87 | self.interval 88 | } 89 | 90 | fn percentiles(&self) -> &[f64] { 91 | &self.percentiles 92 | } 93 | 94 | fn statistics(&self) -> Vec<::Statistic> { 95 | // we don't know the statistics yet, register at runtime instead 96 | Vec::new() 97 | } 98 | } 99 | -------------------------------------------------------------------------------- /src/samplers/http/stat.rs: -------------------------------------------------------------------------------- 1 | // Copyright 2019 Twitter, Inc. 2 | // Licensed under the Apache License, Version 2.0 3 | // http://www.apache.org/licenses/LICENSE-2.0 4 | 5 | use crate::metrics::*; 6 | use crate::Statistic; 7 | 8 | // #[derive(Eq, PartialEq, Hash)] 9 | pub struct HttpStatistic { 10 | name: String, 11 | source: Source, 12 | } 13 | 14 | impl HttpStatistic { 15 | pub fn new(name: String, source: Source) -> Self { 16 | Self { name, source } 17 | } 18 | } 19 | 20 | impl Statistic for HttpStatistic { 21 | fn name(&self) -> &str { 22 | &self.name 23 | } 24 | 25 | fn source(&self) -> Source { 26 | self.source 27 | } 28 | } 29 | -------------------------------------------------------------------------------- /src/samplers/interrupt/bpf.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | // This code is taken from: 7 | // https://github.com/iovisor/bcc/blob/master/tools/hardirqs.py 8 | // https://github.com/iovisor/bcc/blob/master/tools/softirqs.py 9 | // 10 | // Copyright (c) 2015 Brendan Gregg. 
11 | // Licensed under the Apache License, Version 2.0 (the "License") 12 | 13 | typedef struct account_val { 14 | u64 ts; 15 | u32 vec; 16 | } account_val_t; 17 | 18 | // Software IRQ 19 | BPF_HASH(soft_start, u32, account_val_t); 20 | BPF_HISTOGRAM(hi, int, 461); 21 | BPF_HISTOGRAM(timer, int, 461); 22 | BPF_HISTOGRAM(net_tx, int, 461); 23 | BPF_HISTOGRAM(net_rx, int, 461); 24 | BPF_HISTOGRAM(block, int, 461); 25 | BPF_HISTOGRAM(irq_poll, int, 461); 26 | BPF_HISTOGRAM(tasklet, int, 461); 27 | BPF_HISTOGRAM(sched, int, 461); 28 | BPF_HISTOGRAM(hr_timer, int, 461); 29 | BPF_HISTOGRAM(rcu, int, 461); 30 | BPF_HISTOGRAM(unknown, int, 461); 31 | 32 | // Hardware IRQ 33 | BPF_HASH(hard_start, u32, u64); 34 | BPF_HISTOGRAM(hardirq_total, int, 461); 35 | 36 | VALUE_TO_INDEX2_FUNC 37 | 38 | // Software IRQ 39 | int softirq_entry(struct tracepoint__irq__softirq_entry *args) 40 | { 41 | u32 pid = bpf_get_current_pid_tgid(); 42 | account_val_t val = {}; 43 | val.ts = bpf_ktime_get_ns(); 44 | val.vec = args->vec; 45 | soft_start.update(&pid, &val); 46 | return 0; 47 | } 48 | 49 | // For bcc 0.7.0 + 50 | int softirq_exit(struct tracepoint__irq__softirq_exit *args) 51 | { 52 | u64 delta_us; 53 | u32 vec; 54 | u32 pid = bpf_get_current_pid_tgid(); 55 | account_val_t *valp; 56 | 57 | // fetch timestamp and calculate delta 58 | valp = soft_start.lookup(&pid); 59 | if (valp == 0) { 60 | return 0; // missed start 61 | } 62 | delta_us = (bpf_ktime_get_ns() - valp->ts) / 1000ul; 63 | vec = valp->vec; 64 | u64 index = value_to_index2(delta_us); 65 | 66 | // May need updates if more softirqs are added 67 | switch (vec) { 68 | case 0: hi.increment(index); break; 69 | case 1: timer.increment(index); break; 70 | case 2: net_tx.increment(index); break; 71 | case 3: net_rx.increment(index); break; 72 | case 4: block.increment(index); break; 73 | case 5: irq_poll.increment(index); break; 74 | case 6: tasklet.increment(index); break; 75 | case 7: sched.increment(index); break; 76 | case 8: 
hr_timer.increment(index); break; 77 | case 9: rcu.increment(index); break; 78 | default: unknown.increment(index); break; 79 | } 80 | 81 | soft_start.delete(&pid); 82 | return 0; 83 | } 84 | 85 | // Hardware IRQ 86 | int hardirq_entry(struct pt_regs *ctx, struct irq_desc *desc) 87 | { 88 | u32 pid = bpf_get_current_pid_tgid(); 89 | u64 ts = bpf_ktime_get_ns(); 90 | hard_start.update(&pid, &ts); 91 | return 0; 92 | } 93 | 94 | int hardirq_exit(struct pt_regs *ctx) 95 | { 96 | u64 *tsp, delta_us, index; 97 | u32 pid = bpf_get_current_pid_tgid(); 98 | 99 | // fetch timestamp and calculate delta 100 | tsp = hard_start.lookup(&pid); 101 | if (tsp == 0 ) { 102 | return 0; // missed start 103 | } 104 | 105 | delta_us = (bpf_ktime_get_ns() - *tsp) / 1000ul; 106 | index = value_to_index2(delta_us); 107 | hardirq_total.increment(index); 108 | 109 | hard_start.delete(&pid); 110 | return 0; 111 | } 112 | -------------------------------------------------------------------------------- /src/samplers/interrupt/config.rs: -------------------------------------------------------------------------------- 1 | // Copyright 2019 Twitter, Inc. 
2 | // Licensed under the Apache License, Version 2.0 3 | // http://www.apache.org/licenses/LICENSE-2.0 4 | 5 | use serde_derive::Deserialize; 6 | use strum::IntoEnumIterator; 7 | 8 | use crate::config::SamplerConfig; 9 | 10 | use super::stat::*; 11 | 12 | #[derive(Debug, Deserialize)] 13 | #[serde(deny_unknown_fields)] 14 | pub struct InterruptConfig { 15 | #[serde(default)] 16 | bpf: bool, 17 | #[serde(default)] 18 | enabled: bool, 19 | #[serde(default)] 20 | interval: Option, 21 | #[serde(default = "crate::common::default_percentiles")] 22 | percentiles: Vec, 23 | #[serde(default = "default_statistics")] 24 | statistics: Vec, 25 | } 26 | 27 | impl Default for InterruptConfig { 28 | fn default() -> Self { 29 | Self { 30 | bpf: Default::default(), 31 | enabled: Default::default(), 32 | interval: Default::default(), 33 | percentiles: crate::common::default_percentiles(), 34 | statistics: default_statistics(), 35 | } 36 | } 37 | } 38 | 39 | fn default_statistics() -> Vec { 40 | InterruptStatistic::iter().collect() 41 | } 42 | 43 | impl SamplerConfig for InterruptConfig { 44 | type Statistic = InterruptStatistic; 45 | 46 | fn bpf(&self) -> bool { 47 | self.bpf 48 | } 49 | 50 | fn enabled(&self) -> bool { 51 | self.enabled 52 | } 53 | 54 | fn interval(&self) -> Option { 55 | self.interval 56 | } 57 | 58 | fn percentiles(&self) -> &[f64] { 59 | &self.percentiles 60 | } 61 | 62 | fn statistics(&self) -> Vec<::Statistic> { 63 | let mut enabled = Vec::new(); 64 | for statistic in self.statistics.iter() { 65 | if statistic.bpf_table().is_some() { 66 | if self.bpf() { 67 | enabled.push(*statistic); 68 | } 69 | } else { 70 | enabled.push(*statistic); 71 | } 72 | } 73 | enabled 74 | } 75 | } 76 | -------------------------------------------------------------------------------- /src/samplers/interrupt/stat.rs: -------------------------------------------------------------------------------- 1 | // Copyright 2019 Twitter, Inc. 
2 | // Licensed under the Apache License, Version 2.0 3 | // http://www.apache.org/licenses/LICENSE-2.0 4 | 5 | use crate::metrics::*; 6 | use serde_derive::{Deserialize, Serialize}; 7 | use strum_macros::{EnumIter, EnumString, IntoStaticStr}; 8 | 9 | #[cfg(feature = "bpf")] 10 | use crate::common::bpf::*; 11 | 12 | #[derive( 13 | Clone, 14 | Copy, 15 | Debug, 16 | Deserialize, 17 | EnumIter, 18 | EnumString, 19 | Eq, 20 | IntoStaticStr, 21 | PartialEq, 22 | Hash, 23 | Serialize, 24 | )] 25 | #[serde(deny_unknown_fields, try_from = "&str", into = "&str")] 26 | pub enum InterruptStatistic { 27 | #[strum(serialize = "interrupt/total")] 28 | Total, 29 | #[strum(serialize = "interrupt/timer")] 30 | Timer, 31 | #[strum(serialize = "interrupt/nmi")] 32 | NonMaskable, 33 | #[strum(serialize = "interrupt/nvme")] 34 | Nvme, 35 | #[strum(serialize = "interrupt/network")] 36 | Network, 37 | #[strum(serialize = "interrupt/local_timer")] 38 | LocalTimer, 39 | #[strum(serialize = "interrupt/spurious")] 40 | Spurious, 41 | #[strum(serialize = "interrupt/performance_monitoring")] 42 | PerformanceMonitoring, 43 | #[strum(serialize = "interrupt/rescheduling")] 44 | Rescheduling, 45 | #[strum(serialize = "interrupt/function_call")] 46 | FunctionCall, 47 | #[strum(serialize = "interrupt/tlb_shootdowns")] 48 | TlbShootdowns, 49 | #[strum(serialize = "interrupt/thermal_event")] 50 | ThermalEvent, 51 | #[strum(serialize = "interrupt/machine_check_exception")] 52 | MachineCheckException, 53 | #[strum(serialize = "interrupt/rtc")] 54 | RealTimeClock, 55 | #[strum(serialize = "interrupt/node0/total")] 56 | Node0Total, 57 | #[strum(serialize = "interrupt/node1/total")] 58 | Node1Total, 59 | #[strum(serialize = "interrupt/node0/network")] 60 | Node0Network, 61 | #[strum(serialize = "interrupt/node1/network")] 62 | Node1Network, 63 | #[strum(serialize = "interrupt/node0/nvme")] 64 | Node0Nvme, 65 | #[strum(serialize = "interrupt/node1/nvme")] 66 | Node1Nvme, 67 | #[strum(serialize = 
"interrupt/softirq/hi")] 68 | SoftIrqHI, 69 | #[strum(serialize = "interrupt/softirq/timer")] 70 | SoftIrqTimer, 71 | #[strum(serialize = "interrupt/softirq/net_rx")] 72 | SoftIrqNetRx, 73 | #[strum(serialize = "interrupt/softirq/net_tx")] 74 | SoftIrqNetTx, 75 | #[strum(serialize = "interrupt/softirq/block")] 76 | SoftIrqBlock, 77 | #[strum(serialize = "interrupt/softirq/irq_poll")] 78 | SoftIrqPoll, 79 | #[strum(serialize = "interrupt/softirq/tasklet")] 80 | SoftIrqTasklet, 81 | #[strum(serialize = "interrupt/softirq/sched")] 82 | SoftIrqSched, 83 | #[strum(serialize = "interrupt/softirq/hr_timer")] 84 | SoftIrqHRTimer, 85 | #[strum(serialize = "interrupt/softirq/rcu")] 86 | SoftIrqRCU, 87 | #[strum(serialize = "interrupt/softirq/unknown")] 88 | SoftIrqUnknown, 89 | #[strum(serialize = "interrupt/hardirq")] 90 | HardIrq, 91 | } 92 | 93 | impl InterruptStatistic { 94 | pub fn bpf_table(self) -> Option<&'static str> { 95 | match self { 96 | Self::SoftIrqHI => Some("hi"), 97 | Self::SoftIrqTimer => Some("timer"), 98 | Self::SoftIrqNetRx => Some("net_rx"), 99 | Self::SoftIrqNetTx => Some("net_tx"), 100 | Self::SoftIrqBlock => Some("block"), 101 | Self::SoftIrqPoll => Some("irq_poll"), 102 | Self::SoftIrqTasklet => Some("tasklet"), 103 | Self::SoftIrqSched => Some("sched"), 104 | Self::SoftIrqHRTimer => Some("hr_timer"), 105 | Self::SoftIrqRCU => Some("rcu"), 106 | Self::SoftIrqUnknown => Some("unknown"), 107 | Self::HardIrq => Some("hardirq_total"), 108 | _ => None, 109 | } 110 | } 111 | 112 | #[cfg(feature = "bpf")] 113 | pub fn bpf_probes_required(self) -> Vec { 114 | // define the unique probes below. 
115 | let irq_event_percpu_probe = Probe { 116 | name: "handle_irq_event_percpu".to_string(), 117 | handler: "hardirq_entry".to_string(), 118 | probe_type: ProbeType::Kernel, 119 | probe_location: ProbeLocation::Entry, 120 | binary_path: None, 121 | sub_system: None, 122 | }; 123 | let irq_event_percpu_ret_probe = Probe { 124 | name: "handle_irq_event_percpu".to_string(), 125 | handler: "hardirq_exit".to_string(), 126 | probe_type: ProbeType::Kernel, 127 | probe_location: ProbeLocation::Return, 128 | binary_path: None, 129 | sub_system: None, 130 | }; 131 | let softirq_entry_tracepoint = Probe { 132 | name: "softirq_entry".to_string(), 133 | handler: "softirq_entry".to_string(), 134 | probe_type: ProbeType::Tracepoint, 135 | probe_location: ProbeLocation::Entry, 136 | binary_path: None, 137 | sub_system: Some("irq".to_string()), 138 | }; 139 | let softirq_exit_tracepoint = Probe { 140 | name: "softirq_exit".to_string(), 141 | handler: "softirq_exit".to_string(), 142 | probe_type: ProbeType::Tracepoint, 143 | probe_location: ProbeLocation::Entry, 144 | binary_path: None, 145 | sub_system: Some("irq".to_string()), 146 | }; 147 | 148 | // specify what probes are required for each telemetry. 
149 | match self { 150 | Self::SoftIrqHI 151 | | Self::SoftIrqTimer 152 | | Self::SoftIrqNetRx 153 | | Self::SoftIrqNetTx 154 | | Self::SoftIrqBlock 155 | | Self::SoftIrqPoll 156 | | Self::SoftIrqTasklet 157 | | Self::SoftIrqSched 158 | | Self::SoftIrqHRTimer 159 | | Self::SoftIrqRCU 160 | | Self::SoftIrqUnknown => vec![softirq_entry_tracepoint, softirq_exit_tracepoint], 161 | Self::HardIrq => vec![irq_event_percpu_probe, irq_event_percpu_ret_probe], 162 | _ => Vec::new(), 163 | } 164 | } 165 | } 166 | 167 | impl Statistic for InterruptStatistic { 168 | fn name(&self) -> &str { 169 | (*self).into() 170 | } 171 | 172 | fn source(&self) -> Source { 173 | if self.bpf_table().is_some() { 174 | Source::Distribution 175 | } else { 176 | Source::Counter 177 | } 178 | } 179 | } 180 | -------------------------------------------------------------------------------- /src/samplers/krb5kdc/config.rs: -------------------------------------------------------------------------------- 1 | // Copyright 2021 Twitter, Inc. 
2 | // Licensed under the Apache License, Version 2.0 3 | // http://www.apache.org/licenses/LICENSE-2.0 4 | 5 | use serde_derive::Deserialize; 6 | use strum::IntoEnumIterator; 7 | 8 | use crate::config::SamplerConfig; 9 | 10 | use super::stat::*; 11 | 12 | #[derive(Debug, Deserialize)] 13 | #[serde(deny_unknown_fields)] 14 | pub struct Krb5kdcConfig { 15 | #[serde(default)] 16 | bpf: bool, 17 | #[serde(default)] 18 | enabled: bool, 19 | #[serde(default)] 20 | interval: Option, 21 | #[serde(default)] 22 | percentiles: Vec, 23 | #[serde(default = "default_statistics")] 24 | statistics: Vec, 25 | #[serde(default)] 26 | path: String, 27 | } 28 | 29 | impl Default for Krb5kdcConfig { 30 | fn default() -> Self { 31 | Self { 32 | bpf: Default::default(), 33 | enabled: Default::default(), 34 | interval: Default::default(), 35 | percentiles: crate::common::default_percentiles(), 36 | statistics: default_statistics(), 37 | path: Default::default(), 38 | } 39 | } 40 | } 41 | 42 | impl Krb5kdcConfig { 43 | pub fn path(&self) -> String { 44 | self.path.clone() 45 | } 46 | } 47 | 48 | fn default_statistics() -> Vec { 49 | Krb5kdcStatistic::iter().collect() 50 | } 51 | 52 | impl SamplerConfig for Krb5kdcConfig { 53 | type Statistic = Krb5kdcStatistic; 54 | 55 | fn bpf(&self) -> bool { 56 | self.bpf 57 | } 58 | 59 | fn enabled(&self) -> bool { 60 | self.enabled 61 | } 62 | 63 | fn interval(&self) -> Option { 64 | self.interval 65 | } 66 | 67 | fn percentiles(&self) -> &[f64] { 68 | &self.percentiles 69 | } 70 | 71 | fn statistics(&self) -> Vec<::Statistic> { 72 | let mut enabled = Vec::new(); 73 | for statistic in self.statistics.iter() { 74 | enabled.push(*statistic); 75 | } 76 | enabled 77 | } 78 | } 79 | -------------------------------------------------------------------------------- /src/samplers/memcache/config.rs: -------------------------------------------------------------------------------- 1 | // Copyright 2019 Twitter, Inc. 
2 | // Licensed under the Apache License, Version 2.0 3 | // http://www.apache.org/licenses/LICENSE-2.0 4 | 5 | use serde_derive::Deserialize; 6 | 7 | use crate::config::SamplerConfig; 8 | 9 | use super::stat::*; 10 | 11 | #[derive(Debug, Deserialize)] 12 | #[serde(deny_unknown_fields)] 13 | pub struct MemcacheConfig { 14 | #[serde(default)] 15 | enabled: bool, 16 | #[serde(default)] 17 | interval: Option, 18 | #[serde(default = "crate::common::default_percentiles")] 19 | percentiles: Vec, 20 | endpoint: Option, 21 | } 22 | 23 | impl Default for MemcacheConfig { 24 | fn default() -> Self { 25 | Self { 26 | enabled: Default::default(), 27 | interval: Default::default(), 28 | percentiles: crate::common::default_percentiles(), 29 | endpoint: None, 30 | } 31 | } 32 | } 33 | 34 | impl MemcacheConfig { 35 | pub fn endpoint(&self) -> Option { 36 | self.endpoint.clone() 37 | } 38 | } 39 | 40 | impl SamplerConfig for MemcacheConfig { 41 | type Statistic = MemcacheStatistic; 42 | 43 | fn enabled(&self) -> bool { 44 | self.enabled 45 | } 46 | 47 | fn interval(&self) -> Option { 48 | self.interval 49 | } 50 | 51 | fn percentiles(&self) -> &[f64] { 52 | &self.percentiles 53 | } 54 | 55 | fn statistics(&self) -> Vec<::Statistic> { 56 | Vec::new() 57 | } 58 | } 59 | -------------------------------------------------------------------------------- /src/samplers/memcache/stat.rs: -------------------------------------------------------------------------------- 1 | // Copyright 2019 Twitter, Inc. 
2 | // Licensed under the Apache License, Version 2.0 3 | // http://www.apache.org/licenses/LICENSE-2.0 4 | 5 | use crate::Statistic; 6 | 7 | use crate::metrics::*; 8 | 9 | #[derive(Debug, Eq, PartialEq, Hash)] 10 | pub struct MemcacheStatistic { 11 | inner: String, 12 | } 13 | 14 | impl MemcacheStatistic { 15 | pub fn new(name: String) -> Self { 16 | Self { inner: name } 17 | } 18 | 19 | pub fn summary_type(&self) -> Option { 20 | match self.inner.as_ref() { 21 | "data_read" | "data_written" | "cmd_total" | "conn_total" | "conn_yield" 22 | | "process_req" | "tcp_accept" | "tcp_recv_byte" | "tcp_send_byte" => { 23 | Some(Source::Counter) 24 | } 25 | "hotkey_bw" | "hotkey_qps" => Some(Source::Gauge), 26 | _ => None, 27 | } 28 | } 29 | } 30 | 31 | impl Statistic for MemcacheStatistic { 32 | fn name(&self) -> &str { 33 | &self.inner 34 | } 35 | 36 | fn source(&self) -> Source { 37 | self.summary_type().unwrap_or(Source::Gauge) 38 | } 39 | } 40 | -------------------------------------------------------------------------------- /src/samplers/memory/config.rs: -------------------------------------------------------------------------------- 1 | // Copyright 2019 Twitter, Inc. 
2 | // Licensed under the Apache License, Version 2.0 3 | // http://www.apache.org/licenses/LICENSE-2.0 4 | 5 | use serde_derive::Deserialize; 6 | use strum::IntoEnumIterator; 7 | 8 | use crate::config::SamplerConfig; 9 | 10 | use super::stat::*; 11 | 12 | #[derive(Debug, Deserialize)] 13 | #[serde(deny_unknown_fields)] 14 | pub struct MemoryConfig { 15 | #[serde(default)] 16 | enabled: bool, 17 | #[serde(default)] 18 | interval: Option, 19 | #[serde(default = "crate::common::default_percentiles")] 20 | percentiles: Vec, 21 | #[serde(default = "default_statistics")] 22 | statistics: Vec, 23 | } 24 | 25 | impl Default for MemoryConfig { 26 | fn default() -> Self { 27 | Self { 28 | enabled: Default::default(), 29 | interval: Default::default(), 30 | percentiles: crate::common::default_percentiles(), 31 | statistics: default_statistics(), 32 | } 33 | } 34 | } 35 | 36 | fn default_statistics() -> Vec { 37 | MemoryStatistic::iter().collect() 38 | } 39 | 40 | impl SamplerConfig for MemoryConfig { 41 | type Statistic = MemoryStatistic; 42 | fn enabled(&self) -> bool { 43 | self.enabled 44 | } 45 | 46 | fn interval(&self) -> Option { 47 | self.interval 48 | } 49 | 50 | fn percentiles(&self) -> &[f64] { 51 | &self.percentiles 52 | } 53 | 54 | fn statistics(&self) -> Vec<::Statistic> { 55 | self.statistics.clone() 56 | } 57 | } 58 | -------------------------------------------------------------------------------- /src/samplers/network/bpf.c: -------------------------------------------------------------------------------- 1 | // Copyright 2019 Twitter, Inc. 
2 | // Licensed under the Apache License, Version 2.0 3 | // http://www.apache.org/licenses/LICENSE-2.0 4 | 5 | #include 6 | 7 | BPF_HISTOGRAM(rx_size, int, 461); 8 | BPF_HISTOGRAM(tx_size, int, 461); 9 | 10 | VALUE_TO_INDEX2_FUNC 11 | 12 | int trace_transmit(struct tracepoint__net__net_dev_queue *args) 13 | { 14 | u64 index = value_to_index2(args->len); 15 | tx_size.increment(index); 16 | return 0; 17 | } 18 | 19 | int trace_receive(struct tracepoint__net__netif_rx *args) 20 | { 21 | u64 index = value_to_index2(args->len); 22 | rx_size.increment(index); 23 | return 0; 24 | } 25 | -------------------------------------------------------------------------------- /src/samplers/network/config.rs: -------------------------------------------------------------------------------- 1 | // Copyright 2019 Twitter, Inc. 2 | // Licensed under the Apache License, Version 2.0 3 | // http://www.apache.org/licenses/LICENSE-2.0 4 | 5 | use serde_derive::Deserialize; 6 | use strum::IntoEnumIterator; 7 | 8 | use crate::config::SamplerConfig; 9 | 10 | use super::stat::*; 11 | 12 | #[derive(Debug, Deserialize)] 13 | #[serde(deny_unknown_fields)] 14 | pub struct NetworkConfig { 15 | #[serde(default)] 16 | bpf: bool, 17 | #[serde(default)] 18 | enabled: bool, 19 | #[serde(default)] 20 | interval: Option, 21 | #[serde(default = "crate::common::default_percentiles")] 22 | percentiles: Vec, 23 | #[serde(default = "default_statistics")] 24 | statistics: Vec, 25 | } 26 | 27 | impl Default for NetworkConfig { 28 | fn default() -> Self { 29 | Self { 30 | bpf: Default::default(), 31 | enabled: Default::default(), 32 | interval: Default::default(), 33 | percentiles: crate::common::default_percentiles(), 34 | statistics: default_statistics(), 35 | } 36 | } 37 | } 38 | 39 | fn default_statistics() -> Vec { 40 | NetworkStatistic::iter().collect() 41 | } 42 | 43 | impl SamplerConfig for NetworkConfig { 44 | type Statistic = NetworkStatistic; 45 | 46 | fn bpf(&self) -> bool { 47 | self.bpf 48 | } 49 | 
50 | fn enabled(&self) -> bool { 51 | self.enabled 52 | } 53 | 54 | fn interval(&self) -> Option { 55 | self.interval 56 | } 57 | 58 | fn percentiles(&self) -> &[f64] { 59 | &self.percentiles 60 | } 61 | 62 | fn statistics(&self) -> Vec<::Statistic> { 63 | let mut enabled = Vec::new(); 64 | for statistic in self.statistics.iter() { 65 | if statistic.bpf_table().is_some() { 66 | if self.bpf() { 67 | enabled.push(*statistic); 68 | } 69 | } else { 70 | enabled.push(*statistic); 71 | } 72 | } 73 | enabled 74 | } 75 | } 76 | -------------------------------------------------------------------------------- /src/samplers/network/stat.rs: -------------------------------------------------------------------------------- 1 | // Copyright 2019 Twitter, Inc. 2 | // Licensed under the Apache License, Version 2.0 3 | // http://www.apache.org/licenses/LICENSE-2.0 4 | 5 | use crate::metrics::*; 6 | use serde_derive::{Deserialize, Serialize}; 7 | use strum_macros::{EnumIter, EnumString, IntoStaticStr}; 8 | 9 | #[cfg(feature = "bpf")] 10 | use crate::common::bpf::*; 11 | 12 | #[derive( 13 | Clone, 14 | Copy, 15 | Debug, 16 | Deserialize, 17 | EnumIter, 18 | EnumString, 19 | Eq, 20 | IntoStaticStr, 21 | PartialEq, 22 | Hash, 23 | Serialize, 24 | )] 25 | #[serde(deny_unknown_fields, try_from = "&str", into = "&str")] 26 | pub enum NetworkStatistic { 27 | #[strum(serialize = "network/receive/bytes")] 28 | ReceiveBytes, 29 | #[strum(serialize = "network/receive/packets")] 30 | ReceivePackets, 31 | #[strum(serialize = "network/receive/errors")] 32 | ReceiveErrors, 33 | #[strum(serialize = "network/receive/drops")] 34 | ReceiveDrops, 35 | #[strum(serialize = "network/receive/fifo")] 36 | ReceiveFifo, 37 | #[strum(serialize = "network/receive/frame")] 38 | ReceiveFrame, 39 | #[strum(serialize = "network/receive/compressed")] 40 | ReceiveCompressed, 41 | #[strum(serialize = "network/receive/multicast")] 42 | ReceiveMulticast, 43 | #[strum(serialize = "network/transmit/bytes")] 44 | 
TransmitBytes, 45 | #[strum(serialize = "network/transmit/packets")] 46 | TransmitPackets, 47 | #[strum(serialize = "network/transmit/errors")] 48 | TransmitErrors, 49 | #[strum(serialize = "network/transmit/drops")] 50 | TransmitDrops, 51 | #[strum(serialize = "network/transmit/fifo")] 52 | TransmitFifo, 53 | #[strum(serialize = "network/transmit/collisions")] 54 | TransmitCollisions, 55 | #[strum(serialize = "network/transmit/carrier")] 56 | TransmitCarrier, 57 | #[strum(serialize = "network/transmit/compressed")] 58 | TransmitCompressed, 59 | #[strum(serialize = "network/receive/size")] 60 | ReceiveSize, 61 | #[strum(serialize = "network/transmit/size")] 62 | TransmitSize, 63 | } 64 | 65 | impl NetworkStatistic { 66 | pub fn field_number(self) -> Option { 67 | match self { 68 | Self::ReceiveBytes => Some(1), 69 | Self::ReceivePackets => Some(2), 70 | Self::ReceiveErrors => Some(3), 71 | Self::ReceiveDrops => Some(4), 72 | Self::ReceiveFifo => Some(5), 73 | Self::ReceiveFrame => Some(6), 74 | Self::ReceiveCompressed => Some(7), 75 | Self::ReceiveMulticast => Some(8), 76 | Self::TransmitBytes => Some(9), 77 | Self::TransmitPackets => Some(10), 78 | Self::TransmitErrors => Some(11), 79 | Self::TransmitDrops => Some(12), 80 | Self::TransmitFifo => Some(13), 81 | Self::TransmitCollisions => Some(14), 82 | Self::TransmitCarrier => Some(15), 83 | Self::TransmitCompressed => Some(16), 84 | _ => None, 85 | } 86 | } 87 | 88 | pub fn bpf_table(self) -> Option<&'static str> { 89 | match self { 90 | Self::ReceiveSize => Some("rx_size"), 91 | Self::TransmitSize => Some("tx_size"), 92 | _ => None, 93 | } 94 | } 95 | 96 | #[cfg(feature = "bpf")] 97 | pub fn bpf_probes_required(self) -> Vec { 98 | // define the unique probes below. 
99 | let tx_tracepoint = Probe { 100 | name: "net_dev_queue".to_string(), 101 | handler: "trace_transmit".to_string(), 102 | probe_type: ProbeType::Tracepoint, 103 | probe_location: ProbeLocation::Entry, 104 | binary_path: None, 105 | sub_system: Some("net".to_string()), 106 | }; 107 | let rx_tracepoint = Probe { 108 | name: "netif_rx".to_string(), 109 | handler: "trace_receive".to_string(), 110 | probe_type: ProbeType::Tracepoint, 111 | probe_location: ProbeLocation::Entry, 112 | binary_path: None, 113 | sub_system: Some("net".to_string()), 114 | }; 115 | 116 | match self { 117 | Self::ReceiveSize => vec![rx_tracepoint], 118 | Self::TransmitSize => vec![tx_tracepoint], 119 | _ => Vec::new(), 120 | } 121 | } 122 | } 123 | 124 | impl Statistic for NetworkStatistic { 125 | fn name(&self) -> &str { 126 | (*self).into() 127 | } 128 | 129 | fn source(&self) -> Source { 130 | if self.bpf_table().is_some() { 131 | Source::Distribution 132 | } else { 133 | Source::Counter 134 | } 135 | } 136 | } 137 | -------------------------------------------------------------------------------- /src/samplers/ntp/config.rs: -------------------------------------------------------------------------------- 1 | // Copyright 2019 Twitter, Inc. 
2 | // Licensed under the Apache License, Version 2.0 3 | // http://www.apache.org/licenses/LICENSE-2.0 4 | 5 | use serde_derive::Deserialize; 6 | use strum::IntoEnumIterator; 7 | 8 | use crate::config::SamplerConfig; 9 | 10 | use super::stat::*; 11 | 12 | #[derive(Debug, Deserialize)] 13 | #[serde(deny_unknown_fields)] 14 | pub struct NtpConfig { 15 | #[serde(default)] 16 | enabled: bool, 17 | #[serde(default)] 18 | interval: Option, 19 | #[serde(default = "crate::common::default_percentiles")] 20 | percentiles: Vec, 21 | #[serde(default = "default_statistics")] 22 | statistics: Vec, 23 | } 24 | 25 | impl Default for NtpConfig { 26 | fn default() -> Self { 27 | Self { 28 | enabled: Default::default(), 29 | interval: Default::default(), 30 | percentiles: crate::common::default_percentiles(), 31 | statistics: default_statistics(), 32 | } 33 | } 34 | } 35 | 36 | fn default_statistics() -> Vec { 37 | NtpStatistic::iter().collect() 38 | } 39 | 40 | impl SamplerConfig for NtpConfig { 41 | type Statistic = NtpStatistic; 42 | fn enabled(&self) -> bool { 43 | self.enabled 44 | } 45 | 46 | fn interval(&self) -> Option { 47 | self.interval 48 | } 49 | 50 | fn percentiles(&self) -> &[f64] { 51 | &self.percentiles 52 | } 53 | 54 | fn statistics(&self) -> Vec<::Statistic> { 55 | let mut enabled = Vec::new(); 56 | for statistic in self.statistics.iter() { 57 | enabled.push(*statistic); 58 | } 59 | enabled 60 | } 61 | } 62 | -------------------------------------------------------------------------------- /src/samplers/ntp/mod.rs: -------------------------------------------------------------------------------- 1 | // Copyright 2019 Twitter, Inc. 
2 | // Licensed under the Apache License, Version 2.0 3 | // http://www.apache.org/licenses/LICENSE-2.0 4 | 5 | use async_trait::async_trait; 6 | 7 | use crate::common::*; 8 | use crate::config::SamplerConfig; 9 | use crate::samplers::Common; 10 | use crate::*; 11 | 12 | mod config; 13 | mod stat; 14 | 15 | pub use config::*; 16 | pub use stat::*; 17 | 18 | #[allow(dead_code)] 19 | pub struct Ntp { 20 | common: Common, 21 | statistics: Vec, 22 | } 23 | 24 | #[async_trait] 25 | impl Sampler for Ntp { 26 | type Statistic = NtpStatistic; 27 | 28 | fn new(common: Common) -> Result { 29 | let statistics = common.config().samplers().ntp().statistics(); 30 | #[allow(unused_mut)] 31 | let mut sampler = Self { common, statistics }; 32 | 33 | if sampler.sampler_config().enabled() { 34 | sampler.register(); 35 | } 36 | 37 | Ok(sampler) 38 | } 39 | 40 | fn spawn(common: Common) { 41 | debug!("spawning"); 42 | if common.config().samplers().ntp().enabled() { 43 | debug!("sampler is enabled"); 44 | if let Ok(mut ntp) = Ntp::new(common.clone()) { 45 | common.runtime().spawn(async move { 46 | loop { 47 | let _ = ntp.sample().await; 48 | } 49 | }); 50 | } else if !common.config.fault_tolerant() { 51 | fatal!("failed to initialize ntp sampler"); 52 | } else { 53 | error!("failed to initialize ntp sampler"); 54 | } 55 | } 56 | } 57 | 58 | fn common(&self) -> &Common { 59 | &self.common 60 | } 61 | 62 | fn common_mut(&mut self) -> &mut Common { 63 | &mut self.common 64 | } 65 | 66 | fn sampler_config(&self) -> &dyn SamplerConfig { 67 | self.common.config().samplers().ntp() 68 | } 69 | 70 | async fn sample(&mut self) -> Result<(), std::io::Error> { 71 | if let Some(ref mut delay) = self.delay() { 72 | delay.tick().await; 73 | } 74 | 75 | if !self.sampler_config().enabled() { 76 | return Ok(()); 77 | } 78 | 79 | debug!("sampling"); 80 | 81 | let r = self.sample_ntp_adjtime().await; 82 | self.map_result(r)?; 83 | 84 | Ok(()) 85 | } 86 | } 87 | 88 | impl Ntp { 89 | #[cfg(not(target_env = 
"musl"))] 90 | async fn sample_ntp_adjtime(&mut self) -> Result<(), std::io::Error> { 91 | let mut timeval = default_ntptimeval(); 92 | let time = Instant::now(); 93 | let status = unsafe { libc::ntp_gettime(&mut timeval) }; 94 | if status == 0 { 95 | let _ = self.metrics().record_gauge( 96 | &NtpStatistic::MaximumError, 97 | time, 98 | timeval.maxerror as u64 * MICROSECOND, 99 | ); 100 | 101 | #[cfg(all(not(target_os = "macos"), not(target_os = "ios"), unix))] 102 | let _ = self.metrics().record_gauge( 103 | &NtpStatistic::EstimatedError, 104 | time, 105 | timeval.esterror as u64 * MICROSECOND, 106 | ); 107 | } 108 | Ok(()) 109 | } 110 | 111 | #[cfg(target_env = "musl")] 112 | async fn sample_ntp_adjtime(&mut self) -> Result<(), std::io::Error> { 113 | // TODO: implement NTP sampling for musl 114 | Ok(()) 115 | } 116 | } 117 | 118 | #[cfg(any(target_os = "macos", target_os = "ios"))] 119 | fn default_ntptimeval() -> libc::ntptimeval { 120 | libc::ntptimeval { 121 | time: libc::timespec { 122 | tv_sec: 0, 123 | tv_nsec: 0, 124 | }, 125 | maxerror: 0, 126 | esterror: 0, 127 | tai: 0, 128 | time_state: 0, 129 | } 130 | } 131 | 132 | #[cfg(all( 133 | not(target_os = "macos"), 134 | not(target_os = "ios"), 135 | not(target_env = "musl"), 136 | unix 137 | ))] 138 | fn default_ntptimeval() -> libc::ntptimeval { 139 | libc::ntptimeval { 140 | time: libc::timeval { 141 | tv_sec: 0, 142 | tv_usec: 0, 143 | }, 144 | maxerror: 0, 145 | esterror: 0, 146 | tai: 0, 147 | __glibc_reserved1: 0, 148 | __glibc_reserved2: 0, 149 | __glibc_reserved3: 0, 150 | __glibc_reserved4: 0, 151 | } 152 | } 153 | -------------------------------------------------------------------------------- /src/samplers/ntp/stat.rs: -------------------------------------------------------------------------------- 1 | // Copyright 2019 Twitter, Inc. 
2 | // Licensed under the Apache License, Version 2.0 3 | // http://www.apache.org/licenses/LICENSE-2.0 4 | 5 | use crate::metrics::*; 6 | use serde_derive::{Deserialize, Serialize}; 7 | use strum_macros::{EnumIter, EnumString, IntoStaticStr}; 8 | 9 | #[derive( 10 | Clone, 11 | Copy, 12 | Debug, 13 | Deserialize, 14 | EnumIter, 15 | EnumString, 16 | Eq, 17 | IntoStaticStr, 18 | PartialEq, 19 | Hash, 20 | Serialize, 21 | )] 22 | #[serde(deny_unknown_fields, try_from = "&str", into = "&str")] 23 | pub enum NtpStatistic { 24 | #[strum(serialize = "ntp/estimated_error")] 25 | EstimatedError, 26 | #[strum(serialize = "ntp/maximum_error")] 27 | MaximumError, 28 | } 29 | 30 | impl Statistic for NtpStatistic { 31 | fn name(&self) -> &str { 32 | (*self).into() 33 | } 34 | 35 | fn source(&self) -> Source { 36 | Source::Gauge 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /src/samplers/nvidia/config.rs: -------------------------------------------------------------------------------- 1 | // Copyright 2019 Twitter, Inc. 
2 | // Licensed under the Apache License, Version 2.0 3 | // http://www.apache.org/licenses/LICENSE-2.0 4 | 5 | use nvml_wrapper::NVML; 6 | use serde_derive::Deserialize; 7 | use strum::IntoEnumIterator; 8 | 9 | use crate::config::SamplerConfig; 10 | 11 | use super::stat::*; 12 | 13 | #[derive(Debug, Deserialize)] 14 | #[serde(deny_unknown_fields)] 15 | pub struct NvidiaConfig { 16 | #[serde(default)] 17 | enabled: bool, 18 | #[serde(default)] 19 | interval: Option, 20 | #[serde(default = "crate::common::default_percentiles")] 21 | percentiles: Vec, 22 | #[serde(default = "default_statistics")] 23 | pub(crate) statistics: Vec, 24 | } 25 | 26 | impl Default for NvidiaConfig { 27 | fn default() -> Self { 28 | Self { 29 | enabled: Default::default(), 30 | interval: Default::default(), 31 | percentiles: crate::common::default_percentiles(), 32 | statistics: default_statistics(), 33 | } 34 | } 35 | } 36 | 37 | fn default_statistics() -> Vec { 38 | NvidiaConfigStatistic::iter().collect() 39 | } 40 | 41 | impl SamplerConfig for NvidiaConfig { 42 | type Statistic = NvidiaStatistic; 43 | 44 | fn enabled(&self) -> bool { 45 | self.enabled 46 | } 47 | 48 | fn interval(&self) -> Option { 49 | self.interval 50 | } 51 | 52 | fn percentiles(&self) -> &[f64] { 53 | &self.percentiles 54 | } 55 | 56 | fn statistics(&self) -> Vec<::Statistic> { 57 | let mut enabled = Vec::new(); 58 | if let Ok(nvml) = NVML::builder().init() { 59 | let devices = nvml.device_count().unwrap_or(0); 60 | for statistic in self.statistics.iter() { 61 | for id in 0..devices { 62 | match statistic { 63 | NvidiaConfigStatistic::GpuTemperature => { 64 | enabled.push(NvidiaStatistic::GpuTemperature(id)); 65 | } 66 | NvidiaConfigStatistic::GpuUtilization => { 67 | enabled.push(NvidiaStatistic::GpuUtilization(id)); 68 | } 69 | NvidiaConfigStatistic::MemoryEccEnabled => { 70 | enabled.push(NvidiaStatistic::MemoryEccEnabled(id)); 71 | } 72 | NvidiaConfigStatistic::MemoryEccSbe => { 73 | 
enabled.push(NvidiaStatistic::MemoryEccSbe(id)); 74 | } 75 | NvidiaConfigStatistic::MemoryEccDbe => { 76 | enabled.push(NvidiaStatistic::MemoryEccDbe(id)); 77 | } 78 | NvidiaConfigStatistic::MemoryUtilization => { 79 | enabled.push(NvidiaStatistic::MemoryUtilization(id)); 80 | } 81 | NvidiaConfigStatistic::EncoderUtilization => { 82 | enabled.push(NvidiaStatistic::EncoderUtilization(id)); 83 | } 84 | NvidiaConfigStatistic::DecoderUtilization => { 85 | enabled.push(NvidiaStatistic::DecoderUtilization(id)); 86 | } 87 | NvidiaConfigStatistic::PowerUsage => { 88 | enabled.push(NvidiaStatistic::PowerUsage(id)); 89 | } 90 | NvidiaConfigStatistic::PowerLimit => { 91 | enabled.push(NvidiaStatistic::PowerLimit(id)); 92 | } 93 | NvidiaConfigStatistic::EnergyConsumption => { 94 | enabled.push(NvidiaStatistic::EnergyConsumption(id)); 95 | } 96 | NvidiaConfigStatistic::ClockSMCurrent => { 97 | enabled.push(NvidiaStatistic::ClockSMCurrent(id)); 98 | } 99 | NvidiaConfigStatistic::ClockMemoryCurrent => { 100 | enabled.push(NvidiaStatistic::ClockMemoryCurrent(id)); 101 | } 102 | NvidiaConfigStatistic::PcieReplay => { 103 | enabled.push(NvidiaStatistic::PcieReplay(id)); 104 | } 105 | NvidiaConfigStatistic::PcieRxThroughput => { 106 | enabled.push(NvidiaStatistic::PcieRxThroughput(id)); 107 | } 108 | NvidiaConfigStatistic::PcieTxThroughput => { 109 | enabled.push(NvidiaStatistic::PcieTxThroughput(id)); 110 | } 111 | NvidiaConfigStatistic::MemoryFbFree => { 112 | enabled.push(NvidiaStatistic::MemoryFbFree(id)); 113 | } 114 | NvidiaConfigStatistic::MemoryFbTotal => { 115 | enabled.push(NvidiaStatistic::MemoryFbTotal(id)); 116 | } 117 | NvidiaConfigStatistic::MemoryFbUsed => { 118 | enabled.push(NvidiaStatistic::MemoryFbUsed(id)); 119 | } 120 | NvidiaConfigStatistic::MemoryRetiredSbe => { 121 | enabled.push(NvidiaStatistic::MemoryRetiredSbe(id)); 122 | } 123 | NvidiaConfigStatistic::MemoryRetiredDbe => { 124 | enabled.push(NvidiaStatistic::MemoryRetiredDbe(id)); 125 | } 126 | 
NvidiaConfigStatistic::MemoryRetiredPending => { 127 | enabled.push(NvidiaStatistic::MemoryRetiredPending(id)); 128 | } 129 | NvidiaConfigStatistic::ProcessesCompute => { 130 | enabled.push(NvidiaStatistic::ProcessesCompute(id)); 131 | } 132 | } 133 | } 134 | } 135 | } 136 | enabled 137 | } 138 | } 139 | -------------------------------------------------------------------------------- /src/samplers/page_cache/bpf.c: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | BPF_ARRAY(page_accessed, u64, 1); 4 | BPF_ARRAY(buffer_dirty, u64, 1); 5 | BPF_ARRAY(add_to_page_cache_lru, u64, 1); 6 | BPF_ARRAY(page_dirtied, u64, 1); 7 | 8 | int trace_mark_page_accessed(struct pt_regs *ctx) 9 | { 10 | int zero = 0; 11 | u64 *count = page_accessed.lookup(&zero); 12 | if (count) lock_xadd(count, 1); 13 | return 0; 14 | } 15 | 16 | int trace_mark_buffer_dirty(struct pt_regs *ctx) 17 | { 18 | int zero = 0; 19 | u64 *count = buffer_dirty.lookup(&zero); 20 | if (count) lock_xadd(count, 1); 21 | return 0; 22 | } 23 | 24 | int trace_add_to_page_cache_lru(struct pt_regs *ctx) 25 | { 26 | int zero = 0; 27 | u64 *count = add_to_page_cache_lru.lookup(&zero); 28 | if (count) lock_xadd(count, 1); 29 | return 0; 30 | } 31 | 32 | int trace_account_page_dirtied(struct pt_regs *ctx) 33 | { 34 | int zero = 0; 35 | u64 *count = page_dirtied.lookup(&zero); 36 | if (count) lock_xadd(count, 1); 37 | return 0; 38 | } 39 | -------------------------------------------------------------------------------- /src/samplers/page_cache/config.rs: -------------------------------------------------------------------------------- 1 | // Copyright 2019 Twitter, Inc. 
2 | // Licensed under the Apache License, Version 2.0 3 | // http://www.apache.org/licenses/LICENSE-2.0 4 | 5 | use serde_derive::Deserialize; 6 | use strum::IntoEnumIterator; 7 | 8 | use crate::config::SamplerConfig; 9 | 10 | use super::stat::*; 11 | 12 | #[derive(Debug, Deserialize)] 13 | #[serde(deny_unknown_fields)] 14 | pub struct PageCacheConfig { 15 | #[serde(default)] 16 | bpf: bool, 17 | #[serde(default)] 18 | enabled: bool, 19 | #[serde(default)] 20 | interval: Option, 21 | #[serde(default = "crate::common::default_percentiles")] 22 | percentiles: Vec, 23 | #[serde(default = "default_statistics")] 24 | statistics: Vec, 25 | } 26 | 27 | impl Default for PageCacheConfig { 28 | fn default() -> Self { 29 | Self { 30 | bpf: Default::default(), 31 | enabled: Default::default(), 32 | interval: Default::default(), 33 | percentiles: crate::common::default_percentiles(), 34 | statistics: default_statistics(), 35 | } 36 | } 37 | } 38 | 39 | fn default_statistics() -> Vec { 40 | PageCacheStatistic::iter().collect() 41 | } 42 | 43 | impl SamplerConfig for PageCacheConfig { 44 | type Statistic = PageCacheStatistic; 45 | 46 | fn bpf(&self) -> bool { 47 | self.bpf 48 | } 49 | 50 | fn enabled(&self) -> bool { 51 | self.enabled 52 | } 53 | 54 | fn interval(&self) -> Option { 55 | self.interval 56 | } 57 | 58 | fn percentiles(&self) -> &[f64] { 59 | &self.percentiles 60 | } 61 | 62 | fn statistics(&self) -> Vec<::Statistic> { 63 | let mut enabled = Vec::new(); 64 | for statistic in self.statistics.iter() { 65 | if statistic.is_bpf() { 66 | if self.bpf() { 67 | enabled.push(*statistic); 68 | } 69 | } else { 70 | enabled.push(*statistic); 71 | } 72 | } 73 | enabled 74 | } 75 | } 76 | -------------------------------------------------------------------------------- /src/samplers/page_cache/stat.rs: -------------------------------------------------------------------------------- 1 | // Copyright 2019 Twitter, Inc. 
2 | // Licensed under the Apache License, Version 2.0 3 | // http://www.apache.org/licenses/LICENSE-2.0 4 | 5 | use crate::metrics::*; 6 | use serde_derive::{Deserialize, Serialize}; 7 | use strum_macros::{EnumIter, EnumString, IntoStaticStr}; 8 | 9 | #[cfg(feature = "bpf")] 10 | use crate::common::bpf::*; 11 | 12 | #[derive( 13 | Clone, 14 | Copy, 15 | Debug, 16 | Deserialize, 17 | EnumIter, 18 | EnumString, 19 | Eq, 20 | IntoStaticStr, 21 | PartialEq, 22 | Hash, 23 | Serialize, 24 | )] 25 | #[serde(deny_unknown_fields, try_from = "&str", into = "&str")] 26 | pub enum PageCacheStatistic { 27 | #[strum(serialize = "page_cache/hit")] 28 | Hit, 29 | #[strum(serialize = "page_cache/miss")] 30 | Miss, 31 | } 32 | 33 | impl PageCacheStatistic { 34 | pub fn is_bpf(&self) -> bool { 35 | true 36 | } 37 | 38 | #[cfg(feature = "bpf")] 39 | pub fn bpf_probes_required(self) -> Vec { 40 | // define the unique probes below. 41 | let page_accessed_probe = Probe { 42 | name: "mark_page_accessed".to_string(), 43 | handler: "trace_mark_page_accessed".to_string(), 44 | probe_type: ProbeType::Kernel, 45 | probe_location: ProbeLocation::Entry, 46 | binary_path: None, 47 | sub_system: None, 48 | }; 49 | let buffer_dirty_probe = Probe { 50 | name: "mark_buffer_dirty".to_string(), 51 | handler: "trace_mark_buffer_dirty".to_string(), 52 | probe_type: ProbeType::Kernel, 53 | probe_location: ProbeLocation::Entry, 54 | binary_path: None, 55 | sub_system: None, 56 | }; 57 | let page_cache_lru_probe = Probe { 58 | name: "add_to_page_cache_lru".to_string(), 59 | handler: "trace_add_to_page_cache_lru".to_string(), 60 | probe_type: ProbeType::Kernel, 61 | probe_location: ProbeLocation::Entry, 62 | binary_path: None, 63 | sub_system: None, 64 | }; 65 | let page_dirtied_probe = Probe { 66 | name: "account_page_dirtied".to_string(), 67 | handler: "trace_account_page_dirtied".to_string(), 68 | probe_type: ProbeType::Kernel, 69 | probe_location: ProbeLocation::Entry, 70 | binary_path: None, 71 | 
sub_system: None, 72 | }; 73 | 74 | match self { 75 | Self::Hit | Self::Miss => vec![ 76 | page_accessed_probe, 77 | buffer_dirty_probe, 78 | page_cache_lru_probe, 79 | page_dirtied_probe, 80 | ], 81 | } 82 | } 83 | } 84 | 85 | impl Statistic for PageCacheStatistic { 86 | fn name(&self) -> &str { 87 | (*self).into() 88 | } 89 | 90 | fn source(&self) -> Source { 91 | Source::Counter 92 | } 93 | } 94 | -------------------------------------------------------------------------------- /src/samplers/process/config.rs: -------------------------------------------------------------------------------- 1 | // Copyright 2019 Twitter, Inc. 2 | // Licensed under the Apache License, Version 2.0 3 | // http://www.apache.org/licenses/LICENSE-2.0 4 | 5 | use std::io::{BufRead, BufReader}; 6 | 7 | use serde_derive::Deserialize; 8 | use strum::IntoEnumIterator; 9 | 10 | use crate::config::SamplerConfig; 11 | 12 | use super::stat::*; 13 | 14 | #[derive(Debug, Deserialize)] 15 | #[serde(deny_unknown_fields)] 16 | pub struct ProcessConfig { 17 | #[serde(default)] 18 | enabled: bool, 19 | #[serde(default)] 20 | interval: Option, 21 | #[serde(default = "crate::common::default_percentiles")] 22 | percentiles: Vec, 23 | #[serde(default = "default_statistics")] 24 | statistics: Vec, 25 | #[serde(default)] 26 | pid_file: Option, 27 | } 28 | 29 | impl Default for ProcessConfig { 30 | fn default() -> Self { 31 | Self { 32 | enabled: Default::default(), 33 | interval: Default::default(), 34 | percentiles: crate::common::default_percentiles(), 35 | statistics: default_statistics(), 36 | pid_file: Default::default(), 37 | } 38 | } 39 | } 40 | 41 | fn default_statistics() -> Vec { 42 | ProcessStatistic::iter().collect() 43 | } 44 | 45 | impl SamplerConfig for ProcessConfig { 46 | type Statistic = ProcessStatistic; 47 | 48 | fn enabled(&self) -> bool { 49 | self.enabled 50 | } 51 | 52 | fn interval(&self) -> Option { 53 | self.interval 54 | } 55 | 56 | fn percentiles(&self) -> &[f64] { 57 | 
&self.percentiles 58 | } 59 | 60 | fn statistics(&self) -> Vec<::Statistic> { 61 | self.statistics.clone() 62 | } 63 | } 64 | 65 | impl ProcessConfig { 66 | pub fn pid(&self) -> Option { 67 | if let Some(filename) = &self.pid_file { 68 | if let Ok(file) = std::fs::File::open(filename) { 69 | let mut buffer = BufReader::new(file); 70 | let mut line = String::new(); 71 | if buffer.read_line(&mut line).is_ok() { 72 | let line = line.trim(); 73 | if let Ok(pid) = line.parse::() { 74 | return Some(pid); 75 | } else { 76 | debug!("PID file did not parse: {}", line); 77 | } 78 | } else { 79 | debug!("failed to read line from PID file"); 80 | } 81 | } else { 82 | debug!("failed to open PID file"); 83 | } 84 | } else { 85 | debug!("no PID file provided"); 86 | } 87 | None 88 | } 89 | } 90 | -------------------------------------------------------------------------------- /src/samplers/process/stat.rs: -------------------------------------------------------------------------------- 1 | // Copyright 2019 Twitter, Inc. 
2 | // Licensed under the Apache License, Version 2.0 3 | // http://www.apache.org/licenses/LICENSE-2.0 4 | 5 | use crate::metrics::*; 6 | use serde_derive::{Deserialize, Serialize}; 7 | use strum_macros::{EnumIter, EnumString, IntoStaticStr}; 8 | 9 | #[derive( 10 | Clone, 11 | Copy, 12 | Debug, 13 | Deserialize, 14 | EnumIter, 15 | EnumString, 16 | Eq, 17 | IntoStaticStr, 18 | PartialEq, 19 | Hash, 20 | Serialize, 21 | )] 22 | #[serde(deny_unknown_fields, try_from = "&str", into = "&str")] 23 | pub enum ProcessStatistic { 24 | #[strum(serialize = "process/cpu/user")] 25 | CpuUser, 26 | #[strum(serialize = "process/cpu/system")] 27 | CpuSystem, 28 | #[strum(serialize = "process/memory/virtual")] 29 | MemoryVirtual, 30 | #[strum(serialize = "process/memory/resident")] 31 | MemoryResident, 32 | } 33 | 34 | impl Statistic for ProcessStatistic { 35 | fn name(&self) -> &str { 36 | (*self).into() 37 | } 38 | 39 | fn source(&self) -> Source { 40 | match self { 41 | Self::MemoryVirtual | Self::MemoryResident => Source::Gauge, 42 | _ => Source::Counter, 43 | } 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /src/samplers/rezolus/config.rs: -------------------------------------------------------------------------------- 1 | // Copyright 2019 Twitter, Inc. 
2 | // Licensed under the Apache License, Version 2.0 3 | // http://www.apache.org/licenses/LICENSE-2.0 4 | 5 | use serde_derive::Deserialize; 6 | use strum::IntoEnumIterator; 7 | 8 | use crate::config::SamplerConfig; 9 | 10 | use super::stat::*; 11 | 12 | #[derive(Debug, Deserialize)] 13 | #[serde(deny_unknown_fields)] 14 | pub struct RezolusConfig { 15 | #[serde(default)] 16 | enabled: bool, 17 | #[serde(default)] 18 | interval: Option, 19 | #[serde(default = "crate::common::default_percentiles")] 20 | percentiles: Vec, 21 | #[serde(default = "default_statistics")] 22 | statistics: Vec, 23 | } 24 | 25 | impl Default for RezolusConfig { 26 | fn default() -> Self { 27 | Self { 28 | enabled: Default::default(), 29 | interval: Default::default(), 30 | percentiles: crate::common::default_percentiles(), 31 | statistics: default_statistics(), 32 | } 33 | } 34 | } 35 | 36 | fn default_statistics() -> Vec { 37 | RezolusStatistic::iter().collect() 38 | } 39 | 40 | impl SamplerConfig for RezolusConfig { 41 | type Statistic = RezolusStatistic; 42 | 43 | fn enabled(&self) -> bool { 44 | self.enabled 45 | } 46 | 47 | fn interval(&self) -> Option { 48 | self.interval 49 | } 50 | 51 | fn percentiles(&self) -> &[f64] { 52 | &self.percentiles 53 | } 54 | 55 | fn statistics(&self) -> Vec<::Statistic> { 56 | self.statistics.clone() 57 | } 58 | } 59 | -------------------------------------------------------------------------------- /src/samplers/rezolus/mod.rs: -------------------------------------------------------------------------------- 1 | // Copyright 2019 Twitter, Inc. 
2 | // Licensed under the Apache License, Version 2.0 3 | // http://www.apache.org/licenses/LICENSE-2.0 4 | 5 | use std::collections::HashMap; 6 | use std::io::SeekFrom; 7 | 8 | use async_trait::async_trait; 9 | use tokio::fs::File; 10 | use tokio::io::{AsyncBufReadExt, AsyncSeekExt, BufReader}; 11 | 12 | use crate::common::*; 13 | use crate::config::SamplerConfig; 14 | use crate::samplers::Common; 15 | use crate::*; 16 | 17 | mod config; 18 | mod stat; 19 | 20 | pub use config::*; 21 | pub use stat::*; 22 | 23 | pub fn nanos_per_tick() -> u64 { 24 | let ticks_per_second = sysconf::raw::sysconf(sysconf::raw::SysconfVariable::ScClkTck) 25 | .expect("Failed to get Clock Ticks per Second") as u64; 26 | SECOND / ticks_per_second 27 | } 28 | 29 | pub struct Rezolus { 30 | common: Common, 31 | nanos_per_tick: u64, 32 | proc_stat: Option, 33 | proc_statm: Option, 34 | statistics: Vec, 35 | } 36 | 37 | #[async_trait] 38 | impl Sampler for Rezolus { 39 | type Statistic = RezolusStatistic; 40 | 41 | fn new(common: Common) -> Result { 42 | let statistics = common.config().samplers().rezolus().statistics(); 43 | let sampler = Self { 44 | common, 45 | nanos_per_tick: nanos_per_tick() as u64, 46 | proc_stat: None, 47 | proc_statm: None, 48 | statistics, 49 | }; 50 | if sampler.sampler_config().enabled() { 51 | sampler.register(); 52 | } 53 | Ok(sampler) 54 | } 55 | 56 | fn spawn(common: Common) { 57 | if common.config().samplers().rezolus().enabled() { 58 | if let Ok(mut sampler) = Self::new(common.clone()) { 59 | common.runtime().spawn(async move { 60 | loop { 61 | let _ = sampler.sample().await; 62 | } 63 | }); 64 | } else if !common.config.fault_tolerant() { 65 | fatal!("failed to initialize rezolus sampler"); 66 | } else { 67 | error!("failed to initialize rezolus sampler"); 68 | } 69 | } 70 | } 71 | 72 | fn common(&self) -> &Common { 73 | &self.common 74 | } 75 | 76 | fn common_mut(&mut self) -> &mut Common { 77 | &mut self.common 78 | } 79 | 80 | fn sampler_config(&self) 
-> &dyn SamplerConfig { 81 | self.common.config().samplers().rezolus() 82 | } 83 | 84 | async fn sample(&mut self) -> Result<(), std::io::Error> { 85 | if let Some(ref mut delay) = self.delay() { 86 | delay.tick().await; 87 | } 88 | 89 | if !self.sampler_config().enabled() { 90 | return Ok(()); 91 | } 92 | 93 | debug!("sampling"); 94 | let r = self.sample_memory().await; 95 | self.map_result(r)?; 96 | 97 | let r = self.sample_cpu().await; 98 | self.map_result(r)?; 99 | 100 | Ok(()) 101 | } 102 | } 103 | 104 | impl Rezolus { 105 | async fn sample_cpu(&mut self) -> Result<(), std::io::Error> { 106 | if self.proc_stat.is_none() { 107 | let pid: u32 = std::process::id(); 108 | let path = format!("/proc/{}/stat", pid); 109 | let file = File::open(path).await?; 110 | self.proc_stat = Some(file); 111 | } 112 | 113 | if let Some(file) = &mut self.proc_stat { 114 | file.seek(SeekFrom::Start(0)).await?; 115 | let mut reader = BufReader::new(file); 116 | let mut result = HashMap::new(); 117 | let mut line = String::new(); 118 | if reader.read_line(&mut line).await? 
> 0 { 119 | let parts: Vec<&str> = line.split_whitespace().collect(); 120 | let user = parts.get(13).map(|v| v.parse().unwrap_or(0)).unwrap_or(0) 121 | + parts.get(15).map(|v| v.parse().unwrap_or(0)).unwrap_or(0); 122 | let system = parts.get(14).map(|v| v.parse().unwrap_or(0)).unwrap_or(0) 123 | + parts.get(16).map(|v| v.parse().unwrap_or(0)).unwrap_or(0); 124 | result.insert(RezolusStatistic::CpuUser, user * self.nanos_per_tick); 125 | result.insert(RezolusStatistic::CpuSystem, system * self.nanos_per_tick); 126 | line.clear(); 127 | } 128 | 129 | let time = Instant::now(); 130 | for statistic in &self.statistics { 131 | if let Some(value) = result.get(statistic) { 132 | let _ = self.metrics().record_counter(statistic, time, *value); 133 | } 134 | } 135 | } 136 | 137 | Ok(()) 138 | } 139 | 140 | async fn sample_memory(&mut self) -> Result<(), std::io::Error> { 141 | if self.proc_statm.is_none() { 142 | let pid: u32 = std::process::id(); 143 | let path = format!("/proc/{}/statm", pid); 144 | let file = File::open(path).await?; 145 | self.proc_statm = Some(file); 146 | } 147 | 148 | if let Some(file) = &mut self.proc_statm { 149 | file.seek(SeekFrom::Start(0)).await?; 150 | let mut result_memory = HashMap::new(); 151 | let mut reader = BufReader::new(file); 152 | let mut line = String::new(); 153 | if reader.read_line(&mut line).await? 
> 0 { 154 | let parts: Vec<&str> = line.split_whitespace().collect(); 155 | let vm = parts.get(0).map(|v| v.parse().unwrap_or(0)).unwrap_or(0); 156 | let rss = parts.get(1).map(|v| v.parse().unwrap_or(0)).unwrap_or(0); 157 | result_memory.insert(RezolusStatistic::MemoryVirtual, vm); 158 | result_memory.insert(RezolusStatistic::MemoryResident, rss); 159 | line.clear(); 160 | } 161 | 162 | let time = Instant::now(); 163 | for statistic in &self.statistics { 164 | if let Some(value) = result_memory.get(statistic) { 165 | let _ = self.metrics().record_gauge(statistic, time, *value * 4096); 166 | } 167 | } 168 | } 169 | 170 | Ok(()) 171 | } 172 | } 173 | -------------------------------------------------------------------------------- /src/samplers/rezolus/stat.rs: -------------------------------------------------------------------------------- 1 | // Copyright 2019 Twitter, Inc. 2 | // Licensed under the Apache License, Version 2.0 3 | // http://www.apache.org/licenses/LICENSE-2.0 4 | 5 | use crate::metrics::*; 6 | use serde_derive::{Deserialize, Serialize}; 7 | use strum_macros::{EnumIter, EnumString, IntoStaticStr}; 8 | 9 | #[derive( 10 | Clone, 11 | Copy, 12 | Debug, 13 | Deserialize, 14 | EnumIter, 15 | EnumString, 16 | Eq, 17 | IntoStaticStr, 18 | PartialEq, 19 | Hash, 20 | Serialize, 21 | )] 22 | #[serde(deny_unknown_fields, try_from = "&str", into = "&str")] 23 | pub enum RezolusStatistic { 24 | #[strum(serialize = "rezolus/cpu/user")] 25 | CpuUser, 26 | #[strum(serialize = "rezolus/cpu/system")] 27 | CpuSystem, 28 | #[strum(serialize = "rezolus/memory/virtual")] 29 | MemoryVirtual, 30 | #[strum(serialize = "rezolus/memory/resident")] 31 | MemoryResident, 32 | } 33 | 34 | impl Statistic for RezolusStatistic { 35 | fn name(&self) -> &str { 36 | (*self).into() 37 | } 38 | 39 | fn source(&self) -> Source { 40 | match self { 41 | Self::MemoryVirtual | Self::MemoryResident => Source::Gauge, 42 | _ => Source::Counter, 43 | } 44 | } 45 | } 46 | 
-------------------------------------------------------------------------------- /src/samplers/scheduler/bpf.c: -------------------------------------------------------------------------------- 1 | // Based on: https://github.com/iovisor/bcc/blob/master/tools/runqlat.py 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | typedef struct pid_key { 9 | u64 id; 10 | u64 slot; 11 | } pid_key_t; 12 | 13 | typedef struct pidns_key { 14 | u64 id; 15 | u64 slot; 16 | } pidns_key_t; 17 | 18 | BPF_TABLE("hash", u32, u64, start, 65536); 19 | 20 | // value_to_index() gives us from 0-460 as the index 21 | BPF_HISTOGRAM(runqueue_latency, int, 461); 22 | 23 | struct rq; 24 | 25 | // from /sys/kernel/debug/tracing/events/sched/sched_wakeup/format 26 | struct sched_wakeup_arg { 27 | u64 __unused__; 28 | char comm[16]; 29 | pid_t pid; 30 | int prio; 31 | int success; 32 | int target_cpu; 33 | }; 34 | 35 | static int trace_enqueue(u32 tgid, u32 pid) 36 | { 37 | u64 ts = bpf_ktime_get_ns(); 38 | start.update(&pid, &ts); 39 | return 0; 40 | } 41 | 42 | int trace_wake_up_new_task(struct pt_regs *ctx, struct task_struct *p) 43 | { 44 | return trace_enqueue(p->tgid, p->pid); 45 | } 46 | 47 | int trace_ttwu_do_wakeup(struct pt_regs *ctx, struct rq *rq, struct task_struct *p, 48 | int wake_flags) 49 | { 50 | return trace_enqueue(p->tgid, p->pid); 51 | } 52 | 53 | // from /sys/kernel/debug/tracing/events/sched/sched_switch/format 54 | struct sched_switch_arg { 55 | u64 __unused__; 56 | char prev_comm[16]; 57 | pid_t prev_pid; 58 | int prev_prio; 59 | long prev_state; 60 | char next_comm[16]; 61 | pid_t next_pid; 62 | int next_prio; 63 | }; 64 | 65 | VALUE_TO_INDEX2_FUNC 66 | 67 | int trace_run(struct pt_regs *ctx, struct task_struct *prev) 68 | { 69 | // handle involuntary context switch 70 | if (prev->STATE_FIELD == TASK_RUNNING) { 71 | u32 tgid = prev->tgid; 72 | u32 pid = prev->pid; 73 | u64 ts = bpf_ktime_get_ns(); 74 | start.update(&pid, &ts); 75 | } 76 | 77 | // get tgid and 
pid 78 | u32 tgid = bpf_get_current_pid_tgid() >> 32; 79 | u32 pid = bpf_get_current_pid_tgid(); 80 | 81 | // lookup start time 82 | u64 *tsp = start.lookup(&pid); 83 | 84 | // skip events with unknown start 85 | if (tsp == 0) { 86 | return 0; 87 | } 88 | 89 | // calculate latency in microseconds 90 | u64 delta = (bpf_ktime_get_ns() - *tsp) / 1000; 91 | 92 | // calculate index and increment histogram 93 | unsigned int index = value_to_index2(delta); 94 | runqueue_latency.increment(index); 95 | 96 | // clear the start time 97 | start.delete(&pid); 98 | return 0; 99 | } 100 | -------------------------------------------------------------------------------- /src/samplers/scheduler/config.rs: -------------------------------------------------------------------------------- 1 | // Copyright 2019 Twitter, Inc. 2 | // Licensed under the Apache License, Version 2.0 3 | // http://www.apache.org/licenses/LICENSE-2.0 4 | 5 | use serde_derive::Deserialize; 6 | use strum::IntoEnumIterator; 7 | 8 | use crate::config::SamplerConfig; 9 | 10 | use super::stat::*; 11 | 12 | #[derive(Debug, Deserialize)] 13 | #[serde(deny_unknown_fields)] 14 | pub struct SchedulerConfig { 15 | #[serde(default)] 16 | bpf: bool, 17 | #[serde(default)] 18 | enabled: bool, 19 | #[serde(default)] 20 | interval: Option, 21 | #[serde(default = "crate::common::default_percentiles")] 22 | percentiles: Vec, 23 | #[serde(default)] 24 | perf_events: bool, 25 | #[serde(default = "default_statistics")] 26 | statistics: Vec, 27 | } 28 | 29 | impl Default for SchedulerConfig { 30 | fn default() -> Self { 31 | Self { 32 | bpf: Default::default(), 33 | enabled: Default::default(), 34 | interval: Default::default(), 35 | percentiles: crate::common::default_percentiles(), 36 | perf_events: Default::default(), 37 | statistics: default_statistics(), 38 | } 39 | } 40 | } 41 | 42 | fn default_statistics() -> Vec { 43 | SchedulerStatistic::iter().collect() 44 | } 45 | 46 | impl SamplerConfig for SchedulerConfig { 47 | type 
Statistic = SchedulerStatistic; 48 | 49 | fn bpf(&self) -> bool { 50 | self.bpf 51 | } 52 | 53 | fn enabled(&self) -> bool { 54 | self.enabled 55 | } 56 | 57 | fn interval(&self) -> Option { 58 | self.interval 59 | } 60 | 61 | fn percentiles(&self) -> &[f64] { 62 | &self.percentiles 63 | } 64 | 65 | fn perf_events(&self) -> bool { 66 | self.perf_events 67 | } 68 | 69 | fn statistics(&self) -> Vec<::Statistic> { 70 | let mut enabled = Vec::new(); 71 | for statistic in self.statistics.iter() { 72 | if statistic.perf_table().is_some() { 73 | if self.perf_events() { 74 | enabled.push(*statistic); 75 | } 76 | } else if statistic.bpf_table().is_some() { 77 | if self.bpf() { 78 | enabled.push(*statistic); 79 | } 80 | } else { 81 | enabled.push(*statistic); 82 | } 83 | } 84 | enabled 85 | } 86 | } 87 | -------------------------------------------------------------------------------- /src/samplers/scheduler/perf.c: -------------------------------------------------------------------------------- 1 | // Copyright 2019 Twitter, Inc. 2 | // Licensed under the Apache License, Version 2.0 3 | // http://www.apache.org/licenses/LICENSE-2.0 4 | 5 | #include 6 | #include 7 | 8 | // Arrays which hold the perf counters 9 | BPF_PERF_ARRAY(cpu_migrations_array, NUM_CPU); 10 | 11 | 12 | // Tables which are read in user space 13 | BPF_ARRAY(cpu_migrations, u64, NUM_CPU); 14 | 15 | 16 | int do_count(struct bpf_perf_event_data *ctx) { 17 | u32 cpu = bpf_get_smp_processor_id(); 18 | u64 count = 0; 19 | 20 | count = cpu_migrations_array.perf_read(CUR_CPU_IDENTIFIER); 21 | if ((s64)count < -256 || (s64)count > 0) { 22 | cpu_migrations.update(&cpu, &count); 23 | } 24 | 25 | return 0; 26 | } 27 | -------------------------------------------------------------------------------- /src/samplers/scheduler/stat.rs: -------------------------------------------------------------------------------- 1 | // Copyright 2019 Twitter, Inc. 
2 | // Licensed under the Apache License, Version 2.0 3 | // http://www.apache.org/licenses/LICENSE-2.0 4 | 5 | use crate::common::SECOND; 6 | use crate::metrics::*; 7 | #[cfg(feature = "bpf")] 8 | use bcc::perf_event::*; 9 | use serde_derive::{Deserialize, Serialize}; 10 | use strum_macros::{EnumIter, EnumString, IntoStaticStr}; 11 | 12 | #[cfg(feature = "bpf")] 13 | use crate::common::bpf::*; 14 | 15 | #[derive( 16 | Clone, 17 | Copy, 18 | Debug, 19 | Deserialize, 20 | EnumIter, 21 | EnumString, 22 | Eq, 23 | IntoStaticStr, 24 | PartialEq, 25 | Hash, 26 | Serialize, 27 | )] 28 | #[serde(deny_unknown_fields, try_from = "&str", into = "&str")] 29 | pub enum SchedulerStatistic { 30 | #[strum(serialize = "scheduler/cpu_migrations")] 31 | CpuMigrations, 32 | #[strum(serialize = "scheduler/runqueue/latency")] 33 | RunqueueLatency, 34 | #[strum(serialize = "scheduler/context_switches")] 35 | ContextSwitches, 36 | #[strum(serialize = "scheduler/processes/created")] 37 | ProcessesCreated, 38 | #[strum(serialize = "scheduler/processes/running")] 39 | ProcessesRunning, 40 | #[strum(serialize = "scheduler/processes/blocked")] 41 | ProcessesBlocked, 42 | } 43 | 44 | impl SchedulerStatistic { 45 | #[allow(dead_code)] 46 | pub fn bpf_table(self) -> Option<&'static str> { 47 | match self { 48 | Self::RunqueueLatency => Some("runqueue_latency"), 49 | _ => None, 50 | } 51 | } 52 | 53 | #[allow(dead_code)] 54 | pub fn perf_table(self) -> Option<&'static str> { 55 | match self { 56 | Self::CpuMigrations => Some("cpu_migrations"), 57 | _ => None, 58 | } 59 | } 60 | 61 | #[cfg(feature = "bpf")] 62 | pub fn bpf_probes_required(self) -> Vec { 63 | // define the unique probes below. 
64 | 65 | let finish_task_switch = if symbol_lookup("finish_task_switch.isra.0").is_some() { 66 | "finish_task_switch.isra.0".to_string() 67 | } else { 68 | "finish_task_switch".to_string() 69 | }; 70 | 71 | let finish_task_probe = Probe { 72 | name: finish_task_switch, 73 | handler: "trace_run".to_string(), 74 | probe_type: ProbeType::Kernel, 75 | probe_location: ProbeLocation::Entry, 76 | binary_path: None, 77 | sub_system: None, 78 | }; 79 | let wakeup_probe = Probe { 80 | name: "ttwu_do_wakeup".to_string(), 81 | handler: "trace_ttwu_do_wakeup".to_string(), 82 | probe_type: ProbeType::Kernel, 83 | probe_location: ProbeLocation::Entry, 84 | binary_path: None, 85 | sub_system: None, 86 | }; 87 | let new_task_probe = Probe { 88 | name: "wake_up_new_task".to_string(), 89 | handler: "trace_wake_up_new_task".to_string(), 90 | probe_type: ProbeType::Kernel, 91 | probe_location: ProbeLocation::Entry, 92 | binary_path: None, 93 | sub_system: None, 94 | }; 95 | 96 | match self { 97 | Self::RunqueueLatency => vec![finish_task_probe, wakeup_probe, new_task_probe], 98 | _ => Vec::new(), 99 | } 100 | } 101 | 102 | #[cfg(feature = "bpf")] 103 | pub fn event(self) -> Option { 104 | match self { 105 | Self::CpuMigrations => Some(Event::Software(SoftwareEvent::CpuMigrations)), 106 | _ => None, 107 | } 108 | } 109 | 110 | pub fn max(&self) -> u64 { 111 | match self { 112 | Self::RunqueueLatency => SECOND, 113 | _ => 1_000_000_000, 114 | } 115 | } 116 | } 117 | 118 | impl Statistic for SchedulerStatistic { 119 | fn name(&self) -> &str { 120 | (*self).into() 121 | } 122 | 123 | fn source(&self) -> Source { 124 | match *self { 125 | Self::RunqueueLatency => Source::Distribution, 126 | Self::ProcessesRunning | Self::ProcessesBlocked => Source::Gauge, 127 | _ => Source::Counter, 128 | } 129 | } 130 | } 131 | -------------------------------------------------------------------------------- /src/samplers/softnet/config.rs: 
-------------------------------------------------------------------------------- 1 | // Copyright 2019 Twitter, Inc. 2 | // Licensed under the Apache License, Version 2.0 3 | // http://www.apache.org/licenses/LICENSE-2.0 4 | 5 | use serde_derive::Deserialize; 6 | use strum::IntoEnumIterator; 7 | 8 | use crate::config::SamplerConfig; 9 | 10 | use super::stat::*; 11 | 12 | #[derive(Debug, Deserialize)] 13 | #[serde(deny_unknown_fields)] 14 | pub struct SoftnetConfig { 15 | #[serde(default)] 16 | enabled: bool, 17 | #[serde(default)] 18 | interval: Option, 19 | #[serde(default = "crate::common::default_percentiles")] 20 | percentiles: Vec, 21 | #[serde(default = "default_statistics")] 22 | statistics: Vec, 23 | } 24 | 25 | impl Default for SoftnetConfig { 26 | fn default() -> Self { 27 | Self { 28 | enabled: Default::default(), 29 | interval: Default::default(), 30 | percentiles: crate::common::default_percentiles(), 31 | statistics: default_statistics(), 32 | } 33 | } 34 | } 35 | 36 | fn default_statistics() -> Vec { 37 | SoftnetStatistic::iter().collect() 38 | } 39 | 40 | impl SamplerConfig for SoftnetConfig { 41 | type Statistic = SoftnetStatistic; 42 | 43 | fn enabled(&self) -> bool { 44 | self.enabled 45 | } 46 | 47 | fn interval(&self) -> Option { 48 | self.interval 49 | } 50 | 51 | fn percentiles(&self) -> &[f64] { 52 | &self.percentiles 53 | } 54 | 55 | fn statistics(&self) -> Vec<::Statistic> { 56 | self.statistics.clone() 57 | } 58 | } 59 | -------------------------------------------------------------------------------- /src/samplers/softnet/mod.rs: -------------------------------------------------------------------------------- 1 | // Copyright 2019 Twitter, Inc. 
2 | // Licensed under the Apache License, Version 2.0 3 | // http://www.apache.org/licenses/LICENSE-2.0 4 | 5 | use std::collections::HashMap; 6 | use std::io::SeekFrom; 7 | 8 | use async_trait::async_trait; 9 | 10 | use tokio::fs::File; 11 | use tokio::io::{AsyncBufReadExt, AsyncSeekExt, BufReader}; 12 | 13 | use crate::config::SamplerConfig; 14 | use crate::samplers::Common; 15 | use crate::*; 16 | 17 | mod config; 18 | mod stat; 19 | 20 | pub use config::*; 21 | pub use stat::*; 22 | 23 | pub struct Softnet { 24 | common: Common, 25 | softnet_stat: Option, 26 | statistics: Vec, 27 | } 28 | 29 | #[async_trait] 30 | impl Sampler for Softnet { 31 | type Statistic = SoftnetStatistic; 32 | fn new(common: Common) -> Result { 33 | let statistics = common.config().samplers().softnet().statistics(); 34 | let sampler = Self { 35 | common, 36 | softnet_stat: None, 37 | statistics, 38 | }; 39 | if sampler.sampler_config().enabled() { 40 | sampler.register(); 41 | } 42 | Ok(sampler) 43 | } 44 | 45 | fn spawn(common: Common) { 46 | if common.config().samplers().softnet().enabled() { 47 | if let Ok(mut sampler) = Self::new(common.clone()) { 48 | common.runtime().spawn(async move { 49 | loop { 50 | let _ = sampler.sample().await; 51 | } 52 | }); 53 | } else if !common.config.fault_tolerant() { 54 | fatal!("failed to initialize softnet sampler"); 55 | } else { 56 | error!("failed to initialize softnet sampler"); 57 | } 58 | } 59 | } 60 | 61 | fn common(&self) -> &Common { 62 | &self.common 63 | } 64 | 65 | fn common_mut(&mut self) -> &mut Common { 66 | &mut self.common 67 | } 68 | 69 | fn sampler_config(&self) -> &dyn SamplerConfig { 70 | self.common.config().samplers().softnet() 71 | } 72 | 73 | async fn sample(&mut self) -> Result<(), std::io::Error> { 74 | if let Some(ref mut delay) = self.delay() { 75 | delay.tick().await; 76 | } 77 | 78 | if !self.sampler_config().enabled() { 79 | return Ok(()); 80 | } 81 | 82 | debug!("sampling"); 83 | 84 | let r = 
self.sample_softnet_stats().await; 85 | self.map_result(r)?; 86 | 87 | Ok(()) 88 | } 89 | } 90 | 91 | impl Softnet { 92 | async fn sample_softnet_stats(&mut self) -> Result<(), std::io::Error> { 93 | if self.softnet_stat.is_none() { 94 | let file = File::open("/proc/net/softnet_stat").await?; 95 | self.softnet_stat = Some(file); 96 | } 97 | 98 | if let Some(file) = &mut self.softnet_stat { 99 | file.seek(SeekFrom::Start(0)).await?; 100 | let mut reader = BufReader::new(file); 101 | let mut line = String::new(); 102 | let mut result = HashMap::::new(); 103 | 104 | while reader.read_line(&mut line).await? > 0 { 105 | for (id, part) in line.split_whitespace().enumerate() { 106 | if let Some(statistic) = num::FromPrimitive::from_usize(id) { 107 | result.entry(statistic).or_insert(0); 108 | let current = result.get_mut(&statistic).unwrap(); 109 | *current += u64::from_str_radix(part, 16).unwrap_or(0); 110 | } 111 | } 112 | line.clear(); 113 | } 114 | 115 | let time = Instant::now(); 116 | for statistic in &self.statistics { 117 | if let Some(value) = result.get(statistic) { 118 | let _ = self.metrics().record_counter(statistic, time, *value); 119 | } 120 | } 121 | } 122 | 123 | Ok(()) 124 | } 125 | } 126 | -------------------------------------------------------------------------------- /src/samplers/softnet/stat.rs: -------------------------------------------------------------------------------- 1 | // Copyright 2019 Twitter, Inc. 
2 | // Licensed under the Apache License, Version 2.0 3 | // http://www.apache.org/licenses/LICENSE-2.0 4 | 5 | use crate::*; 6 | 7 | use num_derive::FromPrimitive; 8 | use serde_derive::{Deserialize, Serialize}; 9 | use strum_macros::{EnumIter, EnumString, IntoStaticStr}; 10 | // Statistics parsed from /proc/net/softnet_stat. Each variant's discriminant is the
// whitespace-separated column index within a softnet_stat row; the sampler decodes the
// column position back into a variant via num::FromPrimitive (see softnet/mod.rs).
11 | #[derive( 12 | Clone, 13 | Copy, 14 | Debug, 15 | Deserialize, 16 | EnumIter, 17 | EnumString, 18 | Eq, 19 | FromPrimitive, 20 | IntoStaticStr, 21 | PartialEq, 22 | Hash, 23 | Serialize, 24 | )] 25 | #[serde(deny_unknown_fields, try_from = "&str", into = "&str")] 26 | pub enum SoftnetStatistic { 27 | #[strum(serialize = "softnet/processed")] 28 | Processed = 0, 29 | #[strum(serialize = "softnet/dropped")] 30 | Dropped = 1, 31 | #[strum(serialize = "softnet/time_squeezed")] 32 | TimeSqueezed = 2, 33 | #[strum(serialize = "softnet/cpu_collision")] 34 | CpuCollision = 3, 35 | #[strum(serialize = "softnet/received_rps")] 36 | ReceivedRps = 4, 37 | #[strum(serialize = "softnet/flow_limit_count")] 38 | FlowLimitCount = 5, 39 | } 40 | 41 | impl Statistic for SoftnetStatistic { // Metric name is the strum serialization string, e.g. "softnet/processed". 42 | fn name(&self) -> &str { 43 | (*self).into() 44 | } 45 | // Every softnet statistic is a monotonically increasing event count, hence Source::Counter. 46 | fn source(&self) -> Source { 47 | Source::Counter 48 | } 49 | } 50 | -------------------------------------------------------------------------------- /src/samplers/tcp/config.rs: -------------------------------------------------------------------------------- 1 | // Copyright 2019 Twitter, Inc.
2 | // Licensed under the Apache License, Version 2.0 3 | // http://www.apache.org/licenses/LICENSE-2.0 4 | 5 | use serde_derive::Deserialize; 6 | use strum::IntoEnumIterator; 7 | 8 | use crate::config::SamplerConfig; 9 | 10 | use super::stat::*; 11 | 12 | #[derive(Debug, Deserialize)] 13 | #[serde(deny_unknown_fields)] 14 | pub struct TcpConfig { 15 | #[serde(default)] 16 | bpf: bool, 17 | #[serde(default)] 18 | enabled: bool, 19 | #[serde(default)] 20 | interval: Option, 21 | #[serde(default = "crate::common::default_percentiles")] 22 | percentiles: Vec, 23 | #[serde(default = "default_statistics")] 24 | statistics: Vec, 25 | } 26 | 27 | impl Default for TcpConfig { 28 | fn default() -> Self { 29 | Self { 30 | bpf: Default::default(), 31 | enabled: Default::default(), 32 | interval: Default::default(), 33 | percentiles: crate::common::default_percentiles(), 34 | statistics: default_statistics(), 35 | } 36 | } 37 | } 38 | 39 | fn default_statistics() -> Vec { 40 | TcpStatistic::iter().collect() 41 | } 42 | 43 | impl SamplerConfig for TcpConfig { 44 | type Statistic = TcpStatistic; 45 | 46 | fn bpf(&self) -> bool { 47 | self.bpf 48 | } 49 | 50 | fn enabled(&self) -> bool { 51 | self.enabled 52 | } 53 | 54 | fn interval(&self) -> Option { 55 | self.interval 56 | } 57 | 58 | fn percentiles(&self) -> &[f64] { 59 | &self.percentiles 60 | } 61 | 62 | fn statistics(&self) -> Vec<::Statistic> { 63 | let mut enabled = Vec::new(); 64 | for statistic in self.statistics.iter() { 65 | if statistic.bpf_table().is_some() { 66 | if self.bpf() { 67 | enabled.push(*statistic); 68 | } 69 | } else { 70 | enabled.push(*statistic); 71 | } 72 | } 73 | enabled 74 | } 75 | } 76 | -------------------------------------------------------------------------------- /src/samplers/udp/config.rs: -------------------------------------------------------------------------------- 1 | // Copyright 2019 Twitter, Inc. 
2 | // Licensed under the Apache License, Version 2.0 3 | // http://www.apache.org/licenses/LICENSE-2.0 4 | 5 | use serde_derive::Deserialize; 6 | use strum::IntoEnumIterator; 7 | 8 | use crate::config::SamplerConfig; 9 | 10 | use super::stat::*; 11 | // Configuration for the UDP sampler, deserialized from the TOML config.
// NOTE(review): angle-bracketed generic parameters appear to have been stripped by the
// listing tool in this dump (interval is presumably Option<u64>, percentiles Vec<f64>,
// statistics Vec<UdpStatistic>) — confirm against the repository before relying on types.
12 | #[derive(Debug, Deserialize)] 13 | #[serde(deny_unknown_fields)] 14 | pub struct UdpConfig { 15 | #[serde(default)] 16 | enabled: bool, 17 | #[serde(default)] 18 | interval: Option, 19 | #[serde(default = "crate::common::default_percentiles")] 20 | percentiles: Vec, 21 | #[serde(default = "default_statistics")] 22 | statistics: Vec, 23 | } 24 | 25 | impl Default for UdpConfig { 26 | fn default() -> Self { 27 | Self { 28 | enabled: Default::default(), 29 | interval: Default::default(), 30 | percentiles: crate::common::default_percentiles(), 31 | statistics: default_statistics(), 32 | } 33 | } 34 | } 35 | // Default statistics list: every UdpStatistic variant is enabled. 36 | fn default_statistics() -> Vec { 37 | UdpStatistic::iter().collect() 38 | } 39 | 40 | impl SamplerConfig for UdpConfig { 41 | type Statistic = UdpStatistic; 42 | 43 | fn enabled(&self) -> bool { 44 | self.enabled 45 | } 46 | 47 | fn interval(&self) -> Option { 48 | self.interval 49 | } 50 | 51 | fn percentiles(&self) -> &[f64] { 52 | &self.percentiles 53 | } 54 | 55 | fn statistics(&self) -> Vec<::Statistic> { 56 | self.statistics.clone() 57 | } 58 | } 59 | -------------------------------------------------------------------------------- /src/samplers/udp/mod.rs: -------------------------------------------------------------------------------- 1 | // Copyright 2019 Twitter, Inc.
2 | // Licensed under the Apache License, Version 2.0 3 | // http://www.apache.org/licenses/LICENSE-2.0 4 | 5 | use async_trait::async_trait; 6 | use tokio::fs::File; 7 | 8 | use crate::config::SamplerConfig; 9 | use crate::samplers::Common; 10 | use crate::*; 11 | 12 | mod config; 13 | mod stat; 14 | 15 | pub use config::UdpConfig; 16 | pub use stat::UdpStatistic; 17 | 18 | #[allow(dead_code)] 19 | pub struct Udp { 20 | common: Common, 21 | proc_net_snmp: Option, 22 | proc_net_netstat: Option, 23 | statistics: Vec, 24 | } 25 | 26 | #[async_trait] 27 | impl Sampler for Udp { 28 | type Statistic = UdpStatistic; 29 | 30 | fn new(common: Common) -> Result { 31 | let statistics = common.config().samplers().udp().statistics(); 32 | 33 | let sampler = Self { 34 | common, 35 | proc_net_snmp: None, 36 | proc_net_netstat: None, 37 | statistics, 38 | }; 39 | if sampler.sampler_config().enabled() { 40 | sampler.register(); 41 | } 42 | Ok(sampler) 43 | } 44 | 45 | fn spawn(common: Common) { 46 | if common.config().samplers().udp().enabled() { 47 | if let Ok(mut sampler) = Self::new(common.clone()) { 48 | common.runtime().spawn(async move { 49 | loop { 50 | let _ = sampler.sample().await; 51 | } 52 | }); 53 | } else if !common.config.fault_tolerant() { 54 | fatal!("failed to initialize udp sampler"); 55 | } else { 56 | error!("failed to initialize udp sampler"); 57 | } 58 | } 59 | } 60 | 61 | fn common(&self) -> &Common { 62 | &self.common 63 | } 64 | 65 | fn common_mut(&mut self) -> &mut Common { 66 | &mut self.common 67 | } 68 | 69 | async fn sample(&mut self) -> Result<(), std::io::Error> { 70 | if let Some(ref mut delay) = self.delay() { 71 | delay.tick().await; 72 | } 73 | 74 | if !self.sampler_config().enabled() { 75 | return Ok(()); 76 | } 77 | 78 | debug!("sampling"); 79 | 80 | let r = self.sample_snmp().await; 81 | self.map_result(r)?; 82 | 83 | let r = self.sample_netstat().await; 84 | self.map_result(r)?; 85 | 86 | Ok(()) 87 | } 88 | 89 | fn sampler_config(&self) -> 
&dyn SamplerConfig { 90 | self.common.config().samplers().udp() 91 | } 92 | } 93 | 94 | impl Udp { 95 | async fn sample_snmp(&mut self) -> Result<(), std::io::Error> { 96 | if self.proc_net_snmp.is_none() { 97 | let file = File::open("/proc/net/snmp").await?; 98 | self.proc_net_snmp = Some(file); 99 | } 100 | if let Some(file) = &mut self.proc_net_snmp { 101 | let parsed = crate::common::nested_map_from_file(file).await?; 102 | let time = Instant::now(); 103 | for statistic in &self.statistics { 104 | if let Some((pkey, lkey)) = statistic.keys() { 105 | if let Some(inner) = parsed.get(pkey) { 106 | if let Some(value) = inner.get(lkey) { 107 | let _ = self.metrics().record_counter(statistic, time, *value); 108 | } 109 | } 110 | } 111 | } 112 | } 113 | 114 | Ok(()) 115 | } 116 | 117 | async fn sample_netstat(&mut self) -> Result<(), std::io::Error> { 118 | if self.proc_net_netstat.is_none() { 119 | let file = File::open("/proc/net/netstat").await?; 120 | self.proc_net_netstat = Some(file); 121 | } 122 | if let Some(file) = &mut self.proc_net_netstat { 123 | let parsed = crate::common::nested_map_from_file(file).await?; 124 | let time = Instant::now(); 125 | for statistic in &self.statistics { 126 | if let Some((pkey, lkey)) = statistic.keys() { 127 | if let Some(inner) = parsed.get(pkey) { 128 | if let Some(value) = inner.get(lkey) { 129 | let _ = self.metrics().record_counter(statistic, time, *value); 130 | } 131 | } 132 | } 133 | } 134 | } 135 | Ok(()) 136 | } 137 | } 138 | -------------------------------------------------------------------------------- /src/samplers/udp/stat.rs: -------------------------------------------------------------------------------- 1 | // Copyright 2019 Twitter, Inc. 
2 | // Licensed under the Apache License, Version 2.0 3 | // http://www.apache.org/licenses/LICENSE-2.0 4 | 5 | use crate::metrics::*; 6 | use serde_derive::{Deserialize, Serialize}; 7 | use strum_macros::{EnumIter, EnumString, IntoStaticStr}; 8 | // Statistics exported by the UDP sampler; values are read from /proc/net/snmp and
// /proc/net/netstat (see sample_snmp/sample_netstat in udp/mod.rs).
9 | #[derive( 10 | Clone, 11 | Copy, 12 | Debug, 13 | Deserialize, 14 | EnumIter, 15 | EnumString, 16 | Eq, 17 | IntoStaticStr, 18 | PartialEq, 19 | Hash, 20 | Serialize, 21 | )] 22 | #[serde(deny_unknown_fields, try_from = "&str", into = "&str")] 23 | pub enum UdpStatistic { 24 | #[strum(serialize = "udp/receive/datagrams")] 25 | InDatagrams, 26 | #[strum(serialize = "udp/receive/errors")] 27 | InErrors, 28 | #[strum(serialize = "udp/transmit/datagrams")] 29 | OutDatagrams, 30 | } 31 | 32 | impl UdpStatistic { // Returns the (section, field) key pair used to locate this statistic in the
// nested map parsed from the procfs file; the sampler looks up parsed[pkey][lkey].
33 | pub fn keys(self) -> Option<(&'static str, &'static str)> { 34 | match self { 35 | Self::InDatagrams => Some(("Udp:", "InDatagrams")), 36 | Self::InErrors => Some(("Udp:", "InErrors")), 37 | Self::OutDatagrams => Some(("Udp:", "OutDatagrams")), 38 | } 39 | } 40 | } 41 | 42 | impl Statistic for UdpStatistic { // Metric name is the strum serialization string, e.g. "udp/receive/datagrams". 43 | fn name(&self) -> &str { 44 | (*self).into() 45 | } 46 | // All UDP statistics are monotonic counters. 47 | fn source(&self) -> Source { 48 | Source::Counter 49 | } 50 | } 51 | -------------------------------------------------------------------------------- /src/samplers/usercall/config.rs: -------------------------------------------------------------------------------- 1 | // Copyright 2021 Twitter, Inc.
2 | // Licensed under the Apache License, Version 2.0 3 | // http://www.apache.org/licenses/LICENSE-2.0 4 | 5 | use serde_derive::Deserialize; 6 | use std::collections::BTreeMap; 7 | 8 | use crate::config::SamplerConfig; 9 | 10 | use super::stat::UsercallStatistic; 11 | 12 | pub const NAMESPACE: &str = "usercall"; 13 | 14 | #[derive(Debug, Deserialize, Default, Clone, PartialEq)] 15 | pub struct LibraryProbeConfig { 16 | pub name: String, 17 | pub path: Option, 18 | pub functions: Vec, 19 | } 20 | 21 | #[derive(Debug, Deserialize, Default)] 22 | #[serde(deny_unknown_fields)] 23 | pub struct UsercallConfig { 24 | #[serde(default)] 25 | bpf: bool, 26 | #[serde(default)] 27 | enabled: bool, 28 | #[serde(default)] 29 | interval: Option, 30 | #[serde(default)] 31 | percentiles: Vec, 32 | #[serde(default)] 33 | libraries: Vec, 34 | } 35 | 36 | impl UsercallConfig { 37 | pub fn libraries(&self) -> Vec { 38 | let mut lib_map: BTreeMap, LibraryProbeConfig>> = 39 | BTreeMap::new(); 40 | for lib_conf in self.libraries.iter() { 41 | if lib_conf.name.is_empty() { 42 | warn!("Skipping library config without a name: {:?}", lib_conf); 43 | continue; 44 | } 45 | 46 | match lib_map.get_mut(&lib_conf.name) { 47 | Some(file_map) => { 48 | if let Some(conf) = file_map.get(&None) { 49 | warn!( 50 | "Removing duplicate user call library search path: {:?}", 51 | conf 52 | ); 53 | file_map.remove(&None); 54 | } 55 | if lib_conf.path.is_some() && !file_map.contains_key(&lib_conf.path) { 56 | file_map.insert(lib_conf.path.clone(), lib_conf.clone()); 57 | } else { 58 | warn!("Removing duplicate user call config: {:?}", lib_conf); 59 | } 60 | } 61 | None => { 62 | let mut to_add = BTreeMap::new(); 63 | to_add.insert(lib_conf.path.clone(), lib_conf.clone()); 64 | lib_map.insert(lib_conf.name.clone(), to_add); 65 | } 66 | }; 67 | } 68 | lib_map 69 | .values() 70 | .map(|m| m.values()) 71 | .flatten() 72 | .cloned() 73 | .collect() 74 | } 75 | } 76 | 77 | impl SamplerConfig for UsercallConfig { 
78 | type Statistic = UsercallStatistic; 79 | 80 | fn bpf(&self) -> bool { 81 | self.bpf 82 | } 83 | 84 | fn enabled(&self) -> bool { 85 | self.enabled 86 | } 87 | 88 | fn interval(&self) -> Option { 89 | self.interval 90 | } 91 | 92 | fn percentiles(&self) -> &[f64] { 93 | &self.percentiles 94 | } 95 | 96 | fn statistics(&self) -> Vec<::Statistic> { 97 | let mut stats = Vec::new(); 98 | for lib_conf in self.libraries().iter() { 99 | for func in lib_conf.functions.iter() { 100 | stats.push(UsercallStatistic { 101 | stat_path: format!("{}/{}/{}", NAMESPACE, lib_conf.name, func), 102 | }); 103 | } 104 | } 105 | stats 106 | } 107 | } 108 | 109 | #[cfg(test)] 110 | mod tests { 111 | use super::*; 112 | 113 | macro_rules! dedup_tests { 114 | ($($name:ident: $value:expr,)*) => { 115 | $( 116 | #[test] 117 | fn $name() { 118 | let (input, expected) : (Vec, Vec) = $value; 119 | let mut config = UsercallConfig::default(); 120 | config.libraries = input; 121 | assert_eq!(expected, config.libraries()); 122 | } 123 | )* 124 | } 125 | } 126 | 127 | macro_rules! vals { 128 | ($lt:literal) => { 129 | match ($lt) { 130 | "cTmp" => LibraryProbeConfig { 131 | path: Some("/tmp".into()), 132 | name: "c".into(), 133 | functions: vec![], 134 | }, 135 | "cUsr" => LibraryProbeConfig { 136 | path: Some("/usr".into()), 137 | name: "c".into(), 138 | functions: vec![], 139 | }, 140 | "cSearch" => LibraryProbeConfig { 141 | path: None, 142 | name: "c".into(), 143 | functions: vec![], 144 | }, 145 | _ => LibraryProbeConfig::default(), 146 | } 147 | }; 148 | } 149 | 150 | dedup_tests! 
{ 151 | dedup_1: (vec![vals!("")], vec![]), 152 | dedup_2: (vec![vals!("cTmp")], vec![vals!("cTmp")]), 153 | dedup_3: (vec![vals!("cUsr"), vals!("cTmp")], vec![vals!("cTmp"), vals!("cUsr")]), 154 | dedup_4: (vec![vals!("cUsr"), vals!("cTmp"), vals!("cSearch")], vec![vals!("cTmp"), vals!("cUsr")]), 155 | dedup_5: (vec![vals!("cSearch"), vals!("cTmp"), vals!("cUsr")], vec![vals!("cTmp"), vals!("cUsr")]), 156 | dedup_6: (vec![vals!("cTmp"), vals!("cTmp"), vals!("cUsr")], vec![vals!("cTmp"), vals!("cUsr")]), 157 | } 158 | } 159 | -------------------------------------------------------------------------------- /src/samplers/usercall/stat.rs: -------------------------------------------------------------------------------- 1 | // Copyright 2021 Twitter, Inc. 2 | // Licensed under the Apache License, Version 2.0 3 | // http://www.apache.org/licenses/LICENSE-2.0 4 | 5 | use crate::metrics::{Source, Statistic}; 6 | 7 | #[derive(Clone, Debug, Eq, PartialEq, Hash)] 8 | pub struct UsercallStatistic { 9 | pub stat_path: String, 10 | } 11 | 12 | impl Statistic for UsercallStatistic { 13 | fn name(&self) -> &str { 14 | &self.stat_path 15 | } 16 | 17 | fn source(&self) -> Source { 18 | Source::Counter 19 | } 20 | } 21 | -------------------------------------------------------------------------------- /src/samplers/xfs/bpf.c: -------------------------------------------------------------------------------- 1 | // Copyright 2019 Twitter, Inc. 
2 | // Licensed under the Apache License, Version 2.0 3 | // http://www.apache.org/licenses/LICENSE-2.0 4 | 5 | // Based on: https://github.com/iovisor/bcc/blob/master/tools/xfsdist.py 6 | 7 | #include 8 | #include 9 | #include 10 | 11 | #define OP_NAME_LEN 8 12 | 13 | typedef struct dist_key { 14 | char op[OP_NAME_LEN]; 15 | u64 slot; 16 | } dist_key_t; 17 | 18 | BPF_HASH(start, u32); 19 | 20 | // value_to_index2() gives us from 0-460 as the index 21 | BPF_HISTOGRAM(read, int, 461); 22 | BPF_HISTOGRAM(write, int, 461); 23 | BPF_HISTOGRAM(open, int, 461); 24 | BPF_HISTOGRAM(fsync, int, 461); 25 | 26 | VALUE_TO_INDEX2_FUNC 27 | 28 | int trace_entry(struct pt_regs *ctx) 29 | { 30 | u32 pid = bpf_get_current_pid_tgid(); 31 | u64 ts = bpf_ktime_get_ns(); 32 | start.update(&pid, &ts); 33 | return 0; 34 | } 35 | 36 | static int trace_return(struct pt_regs *ctx, int op) 37 | { 38 | // get pid 39 | u32 pid = bpf_get_current_pid_tgid(); 40 | 41 | // lookup start time 42 | u64 *tsp = start.lookup(&pid); 43 | 44 | // skip events without start 45 | if (tsp == 0) { 46 | return 0; 47 | } 48 | 49 | // calculate latency in microseconds 50 | u64 delta = (bpf_ktime_get_ns() - *tsp) / 1000; 51 | 52 | // calculate index 53 | u64 index = value_to_index2(delta); 54 | 55 | // store into correct histogram for OP 56 | if (op == 0) { 57 | read.increment(index); 58 | } else if (op == 1) { 59 | write.increment(index); 60 | } else if (op == 2) { 61 | open.increment(index); 62 | } else if (op == 3) { 63 | fsync.increment(index); 64 | } 65 | 66 | // clear the start time 67 | start.delete(&pid); 68 | 69 | return 0; 70 | } 71 | 72 | int trace_read_return(struct pt_regs *ctx) 73 | { 74 | return trace_return(ctx, 0); 75 | } 76 | 77 | int trace_write_return(struct pt_regs *ctx) 78 | { 79 | return trace_return(ctx, 1); 80 | } 81 | 82 | int trace_open_return(struct pt_regs *ctx) 83 | { 84 | return trace_return(ctx, 2); 85 | } 86 | 87 | int trace_fsync_return(struct pt_regs *ctx) 88 | { 89 | return 
trace_return(ctx, 3); 90 | } 91 | -------------------------------------------------------------------------------- /src/samplers/xfs/config.rs: -------------------------------------------------------------------------------- 1 | // Copyright 2019 Twitter, Inc. 2 | // Licensed under the Apache License, Version 2.0 3 | // http://www.apache.org/licenses/LICENSE-2.0 4 | 5 | use serde_derive::Deserialize; 6 | use strum::IntoEnumIterator; 7 | 8 | use crate::config::SamplerConfig; 9 | 10 | use super::stat::*; 11 | 12 | #[derive(Debug, Deserialize)] 13 | #[serde(deny_unknown_fields)] 14 | pub struct XfsConfig { 15 | #[serde(default)] 16 | bpf: bool, 17 | #[serde(default)] 18 | enabled: bool, 19 | #[serde(default)] 20 | interval: Option, 21 | #[serde(default = "crate::common::default_percentiles")] 22 | percentiles: Vec, 23 | #[serde(default = "default_statistics")] 24 | statistics: Vec, 25 | } 26 | 27 | impl Default for XfsConfig { 28 | fn default() -> Self { 29 | Self { 30 | bpf: Default::default(), 31 | enabled: Default::default(), 32 | interval: Default::default(), 33 | percentiles: crate::common::default_percentiles(), 34 | statistics: default_statistics(), 35 | } 36 | } 37 | } 38 | 39 | fn default_statistics() -> Vec { 40 | XfsStatistic::iter().collect() 41 | } 42 | 43 | impl SamplerConfig for XfsConfig { 44 | type Statistic = XfsStatistic; 45 | 46 | fn bpf(&self) -> bool { 47 | self.bpf 48 | } 49 | 50 | fn enabled(&self) -> bool { 51 | self.enabled 52 | } 53 | 54 | fn interval(&self) -> Option { 55 | self.interval 56 | } 57 | 58 | fn percentiles(&self) -> &[f64] { 59 | &self.percentiles 60 | } 61 | 62 | fn statistics(&self) -> Vec<::Statistic> { 63 | let mut enabled = Vec::new(); 64 | for statistic in self.statistics.iter() { 65 | if statistic.bpf_table().is_some() { 66 | if self.bpf() { 67 | enabled.push(*statistic); 68 | } 69 | } else { 70 | enabled.push(*statistic); 71 | } 72 | } 73 | enabled 74 | } 75 | } 76 | 
-------------------------------------------------------------------------------- /src/samplers/xfs/mod.rs: -------------------------------------------------------------------------------- 1 | // Copyright 2019 Twitter, Inc. 2 | // Licensed under the Apache License, Version 2.0 3 | // http://www.apache.org/licenses/LICENSE-2.0 4 | 5 | #[cfg(feature = "bpf")] 6 | use std::collections::HashSet; 7 | use std::sync::{Arc, Mutex}; 8 | 9 | use async_trait::async_trait; 10 | 11 | use crate::common::bpf::*; 12 | use crate::config::SamplerConfig; 13 | use crate::samplers::Common; 14 | use crate::Sampler; 15 | use crate::*; 16 | 17 | mod config; 18 | mod stat; 19 | 20 | pub use config::*; 21 | pub use stat::*; 22 | 23 | #[allow(dead_code)] 24 | pub struct Xfs { 25 | bpf: Option>>, 26 | bpf_last: Arc>, 27 | common: Common, 28 | statistics: Vec, 29 | } 30 | 31 | #[async_trait] 32 | impl Sampler for Xfs { 33 | type Statistic = XfsStatistic; 34 | fn new(common: Common) -> Result { 35 | let fault_tolerant = common.config.general().fault_tolerant(); 36 | let statistics = common.config().samplers().xfs().statistics(); 37 | 38 | #[allow(unused_mut)] 39 | let mut sampler = Self { 40 | bpf: None, 41 | bpf_last: Arc::new(Mutex::new(Instant::now())), 42 | common, 43 | statistics, 44 | }; 45 | 46 | if let Err(e) = sampler.initialize_bpf() { 47 | error!("failed to initialize bpf: {}", e); 48 | if !fault_tolerant { 49 | return Err(e); 50 | } 51 | } 52 | 53 | if sampler.sampler_config().enabled() { 54 | sampler.register(); 55 | } 56 | 57 | Ok(sampler) 58 | } 59 | 60 | fn spawn(common: Common) { 61 | if common.config().samplers().xfs().enabled() { 62 | if let Ok(mut sampler) = Self::new(common.clone()) { 63 | common.runtime().spawn(async move { 64 | loop { 65 | let _ = sampler.sample().await; 66 | } 67 | }); 68 | } else if !common.config.fault_tolerant() { 69 | fatal!("failed to initialize xfs sampler"); 70 | } else { 71 | error!("failed to initialize xfs sampler"); 72 | } 73 | } 74 | } 75 | 
76 | fn common(&self) -> &Common { 77 | &self.common 78 | } 79 | 80 | fn common_mut(&mut self) -> &mut Common { 81 | &mut self.common 82 | } 83 | 84 | fn sampler_config(&self) -> &dyn SamplerConfig { 85 | self.common.config().samplers().xfs() 86 | } 87 | 88 | async fn sample(&mut self) -> Result<(), std::io::Error> { 89 | if let Some(ref mut delay) = self.delay() { 90 | delay.tick().await; 91 | } 92 | 93 | if !self.sampler_config().enabled() { 94 | return Ok(()); 95 | } 96 | 97 | debug!("sampling"); 98 | 99 | // sample bpf 100 | #[cfg(feature = "bpf")] 101 | self.map_result(self.sample_bpf())?; 102 | 103 | Ok(()) 104 | } 105 | } 106 | 107 | impl Xfs { 108 | // checks that bpf is enabled in config and one or more bpf stats enabled 109 | #[cfg(feature = "bpf")] 110 | fn bpf_enabled(&self) -> bool { 111 | if self.sampler_config().bpf() { 112 | for statistic in &self.statistics { 113 | if statistic.bpf_table().is_some() { 114 | return true; 115 | } 116 | } 117 | } 118 | false 119 | } 120 | 121 | fn initialize_bpf(&mut self) -> Result<(), anyhow::Error> { 122 | #[cfg(feature = "bpf")] 123 | { 124 | if self.enabled() && self.bpf_enabled() { 125 | debug!("initializing bpf"); 126 | 127 | // load the code and compile 128 | let code = include_str!("bpf.c"); 129 | let code = code.replace( 130 | "VALUE_TO_INDEX2_FUNC", 131 | include_str!("../../common/value_to_index2.c"), 132 | ); 133 | let mut bpf = bcc::BPF::new(&code)?; 134 | 135 | // collect the set of probes required from the statistics enabled. 136 | let mut probes = HashSet::new(); 137 | for statistic in &self.statistics { 138 | for probe in statistic.bpf_probes_required() { 139 | probes.insert(probe); 140 | } 141 | } 142 | 143 | // load + attach the kernel probes that are required to the bpf instance. 
144 | for probe in probes { 145 | if self.common.config.fault_tolerant() { 146 | if let Err(e) = probe.try_attach_to_bpf(&mut bpf) { 147 | warn!("skipping {} with error: {}", probe.name, e); 148 | } 149 | } else { 150 | probe.try_attach_to_bpf(&mut bpf)?; 151 | } 152 | } 153 | 154 | self.bpf = Some(Arc::new(Mutex::new(BPF { inner: bpf }))); 155 | } 156 | } 157 | 158 | Ok(()) 159 | } 160 | 161 | #[cfg(feature = "bpf")] 162 | fn sample_bpf(&self) -> Result<(), std::io::Error> { 163 | if self.bpf_last.lock().unwrap().elapsed() 164 | >= Duration::from_secs(self.general_config().window() as u64) 165 | { 166 | if let Some(ref bpf) = self.bpf { 167 | let bpf = bpf.lock().unwrap(); 168 | let time = Instant::now(); 169 | for statistic in self.statistics.iter().filter(|s| s.bpf_table().is_some()) { 170 | if let Ok(mut table) = (*bpf).inner.table(statistic.bpf_table().unwrap()) { 171 | for (&value, &count) in &map_from_table(&mut table) { 172 | if count > 0 { 173 | let _ = self.metrics().record_bucket( 174 | statistic, 175 | time, 176 | value * crate::MICROSECOND, 177 | count, 178 | ); 179 | } 180 | } 181 | } 182 | } 183 | } 184 | *self.bpf_last.lock().unwrap() = Instant::now(); 185 | } 186 | Ok(()) 187 | } 188 | } 189 | -------------------------------------------------------------------------------- /src/samplers/xfs/stat.rs: -------------------------------------------------------------------------------- 1 | // Copyright 2019 Twitter, Inc. 
2 | // Licensed under the Apache License, Version 2.0 3 | // http://www.apache.org/licenses/LICENSE-2.0 4 | 5 | use crate::metrics::*; 6 | use serde_derive::{Deserialize, Serialize}; 7 | use strum_macros::{EnumIter, EnumString, IntoStaticStr}; 8 | 9 | #[cfg(feature = "bpf")] 10 | use crate::common::bpf::*; 11 | 12 | #[derive( 13 | Clone, 14 | Copy, 15 | Debug, 16 | Deserialize, 17 | EnumIter, 18 | EnumString, 19 | Eq, 20 | IntoStaticStr, 21 | PartialEq, 22 | Hash, 23 | Serialize, 24 | )] 25 | #[serde(deny_unknown_fields, try_from = "&str", into = "&str")] 26 | #[allow(clippy::enum_variant_names)] 27 | pub enum XfsStatistic { 28 | #[strum(serialize = "xfs/read/latency")] 29 | ReadLatency, 30 | #[strum(serialize = "xfs/write/latency")] 31 | WriteLatency, 32 | #[strum(serialize = "xfs/open/latency")] 33 | OpenLatency, 34 | #[strum(serialize = "xfs/fsync/latency")] 35 | FsyncLatency, 36 | } 37 | 38 | impl XfsStatistic { 39 | #[allow(dead_code)] 40 | pub fn bpf_table(self) -> Option<&'static str> { 41 | match self { 42 | Self::ReadLatency => Some("read"), 43 | Self::WriteLatency => Some("write"), 44 | Self::OpenLatency => Some("open"), 45 | Self::FsyncLatency => Some("fsync"), 46 | } 47 | } 48 | 49 | #[cfg(feature = "bpf")] 50 | pub fn bpf_probes_required(self) -> Vec { 51 | // define the unique probes below. 
52 | let file_read_probe = Probe { 53 | name: "xfs_file_read_iter".to_string(), 54 | handler: "trace_entry".to_string(), 55 | probe_type: ProbeType::Kernel, 56 | probe_location: ProbeLocation::Entry, 57 | binary_path: None, 58 | sub_system: None, 59 | }; 60 | let file_write_probe = Probe { 61 | name: "xfs_file_write_iter".to_string(), 62 | handler: "trace_entry".to_string(), 63 | probe_type: ProbeType::Kernel, 64 | probe_location: ProbeLocation::Entry, 65 | binary_path: None, 66 | sub_system: None, 67 | }; 68 | let file_open_probe = Probe { 69 | name: "xfs_file_open".to_string(), 70 | handler: "trace_entry".to_string(), 71 | probe_type: ProbeType::Kernel, 72 | probe_location: ProbeLocation::Entry, 73 | binary_path: None, 74 | sub_system: None, 75 | }; 76 | let file_sync_probe = Probe { 77 | name: "xfs_file_fsync".to_string(), 78 | handler: "trace_entry".to_string(), 79 | probe_type: ProbeType::Kernel, 80 | probe_location: ProbeLocation::Entry, 81 | binary_path: None, 82 | sub_system: None, 83 | }; 84 | let file_read_ret_probe = Probe { 85 | name: "xfs_file_read_iter".to_string(), 86 | handler: "trace_read_return".to_string(), 87 | probe_type: ProbeType::Kernel, 88 | probe_location: ProbeLocation::Return, 89 | binary_path: None, 90 | sub_system: None, 91 | }; 92 | let file_write_ret_probe = Probe { 93 | name: "xfs_file_write_iter".to_string(), 94 | handler: "trace_write_return".to_string(), 95 | probe_type: ProbeType::Kernel, 96 | probe_location: ProbeLocation::Return, 97 | binary_path: None, 98 | sub_system: None, 99 | }; 100 | let file_open_ret_probe = Probe { 101 | name: "xfs_file_open".to_string(), 102 | handler: "trace_open_return".to_string(), 103 | probe_type: ProbeType::Kernel, 104 | probe_location: ProbeLocation::Return, 105 | binary_path: None, 106 | sub_system: None, 107 | }; 108 | let file_sync_ret_probe = Probe { 109 | name: "xfs_file_fsync".to_string(), 110 | handler: "trace_fsync_return".to_string(), 111 | probe_type: ProbeType::Kernel, 112 | 
probe_location: ProbeLocation::Return, 113 | binary_path: None, 114 | sub_system: None, 115 | }; 116 | 117 | // specify what probes are required for each telemetry. 118 | match self { 119 | Self::ReadLatency => vec![file_read_probe, file_read_ret_probe], 120 | Self::WriteLatency => vec![file_write_probe, file_write_ret_probe], 121 | Self::OpenLatency => vec![file_open_probe, file_open_ret_probe], 122 | Self::FsyncLatency => vec![file_sync_probe, file_sync_ret_probe], 123 | } 124 | } 125 | } 126 | 127 | impl Statistic for XfsStatistic { 128 | fn name(&self) -> &str { 129 | (*self).into() 130 | } 131 | 132 | fn source(&self) -> Source { 133 | Source::Distribution 134 | } 135 | } 136 | --------------------------------------------------------------------------------