├── .github
│   └── workflows
│       └── ci.yml
├── .gitignore
├── Cargo.toml
├── README.md
└── src
    ├── lib.rs
    └── tokio_runtime_metrics.rs

/.github/workflows/ci.yml:
--------------------------------------------------------------------------------
 1 | name: Continuous Integration
 2 | 
 3 | on:
 4 |   push:
 5 |     branches: [ "main" ]
 6 |   pull_request:
 7 | 
 8 | env:
 9 |   CARGO_TERM_COLOR: always
10 |   RUST_BACKTRACE: full
11 | 
12 | jobs:
13 |   # build rust
14 |   build:
15 |     runs-on: ubuntu-latest
16 |     strategy:
17 |       matrix:
18 |         RUSTFLAGS: ["--deny warnings", "--cfg tokio_unstable --deny warnings", "--cfg tokio_unstable --cfg tokio_taskdump --deny warnings"]
19 |         TARGET: ["", "--target wasm32-unknown-unknown"] # An arbitrary 32-bit target
20 |         exclude: # This doesn't work because tokio_taskdump doesn't support wasm32
21 |           - RUSTFLAGS: "--cfg tokio_unstable --cfg tokio_taskdump --deny warnings"
22 |             TARGET: "--target wasm32-unknown-unknown"
23 | 
24 |     steps:
25 |       - uses: actions/checkout@v4
26 |       - name: Rustup update
27 |         run: rustup update && rustup target list
28 |       - name: Install wasm32 stdlib
29 |         run: rustup target add wasm32-unknown-unknown
30 |       - name: Show cargo version
31 |         run: cargo --version
32 |       - name: rust build caching
33 |         uses: Swatinem/rust-cache@v2
34 |         with:
35 |           workspaces: . -> target
36 |           save-if: ${{ github.ref == 'refs/heads/main' }}
37 |       - name: Build Rust
38 |         env:
39 |           RUSTFLAGS: ${{ matrix.RUSTFLAGS }}
40 |         run: cargo build --verbose ${{ matrix.TARGET }}
41 | 
42 |   # lint rust
43 |   lint:
44 |     runs-on: ubuntu-latest
45 |     steps:
46 |       - uses: actions/checkout@v4
47 |       - name: Rustup update
48 |         run: rustup update
49 |       - name: Show cargo version
50 |         run: cargo --version
51 |       - name: rust build caching
52 |         uses: Swatinem/rust-cache@v2
53 |         with:
54 |           workspaces: . -> target
55 |           save-if: ${{ github.ref == 'refs/heads/main' }}
56 |       - name: Format
57 |         run: cargo fmt -- --check
58 |       - name: Clippy
59 |         run: cargo clippy --all --all-features -- -D warnings
60 | 
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | Cargo.lock
2 | target/**
3 | 
4 | 
--------------------------------------------------------------------------------
/Cargo.toml:
--------------------------------------------------------------------------------
 1 | [package]
 2 | name = "tokio_util_watchdog"
 3 | version = "0.1.2"
 4 | authors = ["beck.ct@gmail.com"]
 5 | edition = "2021"
 6 | license = "MIT OR Apache-2.0"
 7 | readme = "README.md"
 8 | rust-version = "1.74"
 9 | repository = "https://github.com/cbeck88/tokio_util_watchdog"
10 | description = "A watchdog utility for tokio runtimes"
11 | categories = ["asynchronous", "development-tools::debugging"]
12 | keywords = [
13 |     "async",
14 |     "tokio",
15 |     "utility",
16 |     "watchdog",
17 | ]
18 | 
19 | [lints.rust]
20 | unexpected_cfgs = { level = "allow", check-cfg = ['cfg(tokio_unstable)'] }
21 | 
22 | [dependencies]
23 | tokio = { version = "1.43.0", features = ["rt", "time"] }
24 | tracing = { version = "0.1" }
25 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # tokio_util_watchdog
  2 | 
  3 | A watchdog utility for detecting deadlocks in tokio runtimes.
  4 | 
  5 | [![Crates.io](https://img.shields.io/crates/v/tokio_util_watchdog?style=flat-square)](https://crates.io/crates/tokio_util_watchdog)
  6 | [![Crates.io](https://img.shields.io/crates/d/tokio_util_watchdog?style=flat-square)](https://crates.io/crates/tokio_util_watchdog)
  7 | [![License](https://img.shields.io/badge/license-Apache%202.0-blue?style=flat-square)](LICENSE-APACHE)
  8 | [![License](https://img.shields.io/badge/license-MIT-blue?style=flat-square)](LICENSE-MIT)
  9 | [![Build Status](https://img.shields.io/github/actions/workflow/status/cbeck88/tokio_util_watchdog/ci.yml?branch=main&style=flat-square)](https://github.com/cbeck88/tokio_util_watchdog/actions/workflows/ci.yml?query=branch%3Amain)
 10 | 
 11 | [API Docs](https://docs.rs/tokio_util_watchdog/latest/tokio_util_watchdog/)
 12 | 
 13 | ---
 14 | 
 15 | If we get a tokio deadlock, i.e. all worker threads get blocked and no more
 16 | asynchronous futures can be driven, it can be hard to diagnose and debug
 17 | in production.
 18 | 
 19 | This watchdog uses a very simple strategy to detect and try to recover from that
 20 | situation:
 21 | 
 22 | * Spawn a task on the runtime that periodically records "heartbeats", e.g. once a second.
 23 | * Spawn a thread (using std) outside of the runtime that wakes up periodically and checks
 24 |   for those heartbeats.
 25 | * If heartbeats are not detected for a few seconds (configurable), panic.
 26 | * Before we panic, try to collect and log [`tokio::runtime::RuntimeMetrics`](https://docs.rs/tokio/latest/tokio/runtime/struct.RuntimeMetrics.html) for this runtime for a few seconds (configurable).
 27 | * If `cfg(tokio_unstable)` and `cfg(tokio_taskdump)` were used, also try to collect and log a [task dump](https://docs.rs/tokio/latest/tokio/runtime/dump/struct.Dump.html) for a few seconds.
 28 | 
 29 | The assumption here is that when the panic occurs, your deployment infrastructure will detect that this happened
 30 | and restart the process. Hopefully the process will recover and not immediately deadlock again. And meanwhile, you
 31 | will automatically get more information than you would otherwise, which might help you fix the underlying issue,
 32 | especially if you used the extra features.
 33 | 
 34 | (If you used Django in the past, you might have seen similar behavior, where timed-out worker processes are automatically
 35 | killed and restarted, with some error logging, without blocking or starving the whole webserver.)
 36 | 
 37 | Note that this is a different type of watchdog from e.g. [`simple-tokio-watchdog`](https://crates.io/crates/simple-tokio-watchdog) and
 38 | some other such crates -- our crate is specifically for checking the tokio runtime itself for liveness, and then logging any useful diagnostics
 39 | and panicking (configurable).
 40 | 
 41 | ## Quick start
 42 | 
 43 | 1. Add `tokio_util_watchdog = "0.1"` to your `Cargo.toml`.
 44 | 1. In `main.rs` somewhere, add lines such as:
 45 | 
 46 | ```rust
 47 | use tokio_util_watchdog::Watchdog;
 48 | 
 49 | // ...
 50 | 
 51 | #[tokio::main]
 52 | async fn main() {
 53 |     // ...
 54 | 
 55 |     let _watchdog = Watchdog::builder().build();
 56 | 
 57 |     // ...
 58 | }
 59 | ```
 60 | 
 61 | See the [builder documentation](https://docs.rs/tokio_util_watchdog/latest/tokio_util_watchdog/struct.Builder.html) for configuration options. The watchdog is disarmed gracefully if it is dropped.
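
For example, a sketch of a more customized setup (the builder methods shown are the ones documented on `Builder`; the specific values are illustrative, not recommendations):

```rust
use std::time::Duration;
use tokio_util_watchdog::Watchdog;

#[tokio::main]
async fn main() {
    // Keep the handle alive for the lifetime of the program;
    // dropping it disarms the watchdog and joins its thread.
    let _watchdog = Watchdog::builder()
        // How often the in-runtime heartbeat task beats.
        .heartbeat_period(Duration::from_secs(1))
        // How long we tolerate going without a heartbeat before triggering.
        .watchdog_timeout(Duration::from_secs(10))
        // Panic after logging diagnostics (this is the default).
        .panic(true)
        .build();

    // ... the rest of your application ...
}
```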
 62 | 
 63 | **Optional:**
 64 | 
 65 | In `.cargo/config.toml`, add content such as:
 66 | 
 67 | ```toml
 68 | # We only enable tokio_taskdump on Linux targets since it's not supported on macOS
 69 | [build]
 70 | rustflags = ["--cfg", "tokio_unstable"]
 71 | 
 72 | [target.x86_64-unknown-linux-gnu]
 73 | rustflags = ["--cfg", "tokio_unstable", "--cfg", "tokio_taskdump"]
 74 | 
 75 | [target.aarch64-unknown-linux-gnu]
 76 | rustflags = ["--cfg", "tokio_unstable", "--cfg", "tokio_taskdump"]
 77 | ```
 78 | 
 79 | This will enable collection of additional [`tokio::runtime::RuntimeMetrics`](https://docs.rs/tokio/latest/tokio/runtime/struct.RuntimeMetrics.html)
 80 | and task dumps, which will be logged if a deadlock is detected.
 81 | 
 82 | Note: Some parts of `tokio::runtime::RuntimeMetrics` have been stabilized, so you will still get some data without these flags, although you will miss many metrics
 83 | and won't get task dumps. See the [tokio unstable features documentation](https://docs.rs/tokio/latest/tokio/index.html#unstable-features).
 84 | 
 85 | ## Pros and Cons
 86 | 
 87 | Some types of deployment infrastructure will do external liveness checking of your process, e.g. using http requests.
 88 | Then, if this check fails, your process might get SIGTERM before SIGKILL, so you could try to tie this type of data collection and logging to
 89 | the SIGTERM signal handler instead of an internal timer.
 90 | 
 91 | There are a few advantages that I've seen to the internal watchdog timer approach:
 92 | 
 93 | * Not everything that uses async rust is an http server, and adding an http server just for liveness checks may feel heavy or awkward, as you will also have to configure it.
 94 | * Signal handling can itself be a can of worms.
 95 | * Sometimes if there are deadlocks in your system, a good way to reproduce them is to limit the runtime to 1 or 2 worker threads (e.g. via the `TOKIO_WORKER_THREADS` environment variable or `runtime::Builder::worker_threads`), and
 96 |   exercise some part of your system via integration tests in CI. You may want those tests to be very simple and not involve Docker etc.,
 97 |   and at that point internal liveness checking such as by this watchdog may be attractive.
 98 | * The other thing I like to do when smoking out these issues is to not run the binary directly in CI, but run it through `gdb`, such as:
 99 |   `gdb -return-child-result -batch -ex run -ex 'thread apply all bt' -ex quit --args target/release/my_bin` (annotated below).
100 |   This will make it so that your process runs with `gdb` already attached, and when it stops, the command `thread apply all bt` is run.
101 |   Then `gdb` quits and returns the child's exit code, so CI fails if a panic occurred.
102 |   If the process runs this way and the watchdog panics, you will get a backtrace from every thread
103 |   in the program, in the logs, automatically, without having to ssh into the CI worker and attach gdb manually. These backtraces are thread backtraces, not
104 |   async-aware task backtraces, so they aren't as helpful or informative as the task dump -- the higher frames of the stack are likely to be unrelated to whatever
105 |   sequence of async calls was happening. However, the innermost frames can be very interesting -- if your thread is sitting in a sleep, or one of the mutex-related
106 |   `pthread` calls, or in a C library like `libpq`, that can help you figure out what blocking calls might be happening and narrow down where your problem might be. And you will
107 |   get this data even if the watchdog was unable to get a task dump.
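
  The same invocation broken out, with comments on what each flag does (illustrative; `target/release/my_bin` stands in for your own binary):

  ```sh
  # -return-child-result       : exit with the child's exit code, so CI still fails if the binary panicked
  # -batch                     : run non-interactively and exit when the commands finish
  # -ex run                    : start the program immediately
  # -ex 'thread apply all bt'  : once the program stops, print a backtrace of every thread
  gdb -return-child-result -batch \
      -ex run \
      -ex 'thread apply all bt' \
      -ex quit \
      --args target/release/my_bin
  ```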
108 | * The in-process heartbeat system is very simple, whereas an http-based liveness check could also fail because of a networking issue rather than a deadlock.
109 |   Note that nothing stops you from using both and putting a longer timeout on the http-based check.
110 | * I have not experienced any false positives from this system in production or in CI testing -- the watchdog triggering has always been traced back to an actual problem.
111 | 
112 | You do pay the cost of having an extra thread in your process, but it only wakes up once a second (configurable) and this is typically negligible.
113 | In any case, any scheme of getting more tokio metrics after your runtime is deadlocked will require you to have a thread somewhere outside the runtime that can still do some work.
114 | 
115 | Another option is to use the [`tokio_metrics`](https://github.com/tokio-rs/tokio-metrics) crate, which is geared towards always collecting these metrics and publishing them e.g. via Prometheus. If you do that, you might choose to set `triggered_metrics_collections` to `0` on the watchdog, so that it won't bother collecting any metrics. You can still benefit from logging of task dumps performed by the watchdog, and you can even set `panic` to `false`, so that the only thing the watchdog does is attempt to collect task dumps and log them when heartbeats are missed.
116 | 
117 | ## License
118 | 
119 | MIT or Apache 2.0
120 | 
--------------------------------------------------------------------------------
/src/lib.rs:
--------------------------------------------------------------------------------
  1 | //! This crate provides utilities for trying to catch and debug a blocked tokio runtime.
  2 | //!
  3 | //! * A watchdog which consists of a thread outside of tokio, and also a task within tokio,
  4 | //!   which sends heartbeats to the watchdog thread. If the heartbeats do not come frequently
  5 | //!   enough, the watchdog decides that the tokio executor is probably blocked.
  6 | //!   It will then attempt to collect metrics from the runtime and log them,
  7 | //!   any other relevant info (backtraces would be ideal),
  8 | //!   and eventually panic, although this behavior is configurable via the builder.
  9 | //! * Helper functions for obtaining runtime metrics etc. are also exposed.
 10 | 
 11 | #![deny(missing_docs)]
 12 | #![allow(deprecated)]
 13 | 
 14 | use std::{
 15 |     path::Path,
 16 |     sync::{
 17 |         atomic::{AtomicBool, Ordering},
 18 |         Arc, Mutex,
 19 |     },
 20 |     time::{Duration, Instant},
 21 | };
 22 | use tokio::runtime;
 23 | #[allow(unused)]
 24 | use tracing::{error, info, warn, warn_span, Instrument};
 25 | 
 26 | mod tokio_runtime_metrics;
 27 | 
 28 | pub use tokio_runtime_metrics::TokioRuntimeMetrics;
 29 | 
 30 | #[derive(Clone, Debug)]
 31 | struct Config {
 32 |     heartbeat_period: Duration,
 33 |     watchdog_timeout: Duration,
 34 |     triggered_metrics_duration: Duration,
 35 |     triggered_metrics_collections: u32,
 36 |     task_dump_deadline: Duration,
 37 |     panic: bool,
 38 |     thread_name: String,
 39 | }
 40 | 
 41 | impl Default for Config {
 42 |     fn default() -> Self {
 43 |         Self {
 44 |             heartbeat_period: Duration::from_secs(1),
 45 |             watchdog_timeout: Duration::from_secs(5),
 46 |             triggered_metrics_duration: Duration::from_secs(2),
 47 |             triggered_metrics_collections: 20,
 48 |             task_dump_deadline: Duration::from_secs(5),
 49 |             panic: true,
 50 |             thread_name: "tokio-watchdog".into(),
 51 |         }
 52 |     }
 53 | }
 54 | 
 55 | /// Builder which can configure a watchdog object.
 56 | #[derive(Clone, Debug, Default)]
 57 | pub struct Builder {
 58 |     config: Config,
 59 | }
 60 | 
 61 | impl Builder {
 62 |     /// Set the heartbeat period, i.e. how frequently the heartbeat task beats.
 63 |     /// Defaults to 1s.
 64 |     pub fn heartbeat_period(mut self, d: Duration) -> Self {
 65 |         self.config.heartbeat_period = d;
 66 |         self
 67 |     }
 68 | 
 69 |     /// Set the watchdog timeout, i.e. how long we can go without seeing a heartbeat before the watchdog is triggered.
 70 |     /// Defaults to 5s.
 71 |     pub fn watchdog_timeout(mut self, d: Duration) -> Self {
 72 |         self.config.watchdog_timeout = d;
 73 |         self
 74 |     }
 75 | 
 76 |     /// Set how long we collect metrics for when triggered.
 77 |     /// Defaults to 2s.
 78 |     pub fn triggered_metrics_duration(mut self, d: Duration) -> Self {
 79 |         self.config.triggered_metrics_duration = d;
 80 |         self
 81 |     }
 82 | 
 83 |     /// Set how many times we will try to collect metrics during the period.
 84 |     /// Defaults to 20. Set to 0 to disable metrics collection.
 85 |     pub fn triggered_metrics_collections(mut self, n: u32) -> Self {
 86 |         self.config.triggered_metrics_collections = n;
 87 |         self
 88 |     }
 89 | 
 90 |     /// Set how long we will wait for a taskdump when triggered.
 91 |     /// Defaults to 5s.
 92 |     pub fn task_dump_deadline(mut self, d: Duration) -> Self {
 93 |         self.config.task_dump_deadline = d;
 94 |         self
 95 |     }
 96 | 
 97 |     /// Set whether or not to panic when triggered. Defaults to true.
 98 |     pub fn panic(mut self, b: bool) -> Self {
 99 |         self.config.panic = b;
100 |         self
101 |     }
102 | 
103 |     /// Set the thread name. Defaults to "tokio-watchdog".
104 |     pub fn thread_name(mut self, s: &str) -> Self {
105 |         self.config.thread_name = s.to_owned();
106 |         self
107 |     }
108 | 
109 |     /// Build a watchdog instance for the current tokio runtime.
110 |     /// Panics if there is no current runtime.
111 |     pub fn build(self) -> Watchdog {
112 |         self.build_for_runtime(runtime::Handle::current())
113 |     }
114 | 
115 |     /// Build a watchdog instance for a given tokio runtime.
116 |     pub fn build_for_runtime(self, handle: runtime::Handle) -> Watchdog {
117 |         Watchdog::new_for_runtime(self.config, handle)
118 |     }
119 | }
120 | 
121 | /// The watchdog object monitors a given tokio runtime to see if it looks deadlocked.
122 | ///
123 | /// It spawns a thread outside of the runtime which watches for heartbeats from an async task
124 | /// that it spawns in the runtime.
125 | ///
126 | /// When a long enough time passes without a heartbeat, the watchdog is "triggered".
127 | ///
128 | /// By default, when it is triggered it will:
129 | /// * Try to collect tokio runtime metrics for a few seconds and log them.
130 | /// * Try to log a tokio task dump for a few seconds (giving up if it doesn't succeed).
131 | /// * Panic, so that the process can restart and hopefully recover.
132 | ///
133 | /// The panic / restart idea is similar in spirit to how gunicorn / Django will try to
134 | /// restart worker processes that time out.
135 | ///
136 | /// Dropping the watchdog will join the watchdog thread and the heartbeat task.
137 | pub struct Watchdog {
138 |     watchdog_thread: Option<std::thread::JoinHandle<()>>,
139 |     stop_requested: Arc<AtomicBool>,
140 | }
141 | 
142 | impl Drop for Watchdog {
143 |     fn drop(&mut self) {
144 |         if let Some(handle) = self.watchdog_thread.take() {
145 |             self.stop_requested.store(true, Ordering::SeqCst);
146 |             handle.join().expect("Could not join watchdog thread");
147 |         }
148 |     }
149 | }
150 | 
151 | impl Watchdog {
152 |     /// Create a new watchdog builder, to configure a watchdog instance.
153 |     pub fn builder() -> Builder {
154 |         Builder::default()
155 |     }
156 | 
157 |     /// Make a new watchdog for a given tokio runtime.
158 |     /// Starts the heartbeat task and the watchdog thread.
159 |     /// Drop this handle in order to stop both.
160 |     fn new_for_runtime(config: Config, handle: runtime::Handle) -> Self {
161 |         let stop_requested = Arc::new(AtomicBool::default());
162 |         let thread_stop_requested = stop_requested.clone();
163 | 
164 |         let watchdog_thread = Some(
165 |             std::thread::Builder::new()
166 |                 .name(format!("{}-thread", config.thread_name))
167 |                 .spawn(move || {
168 |                     Self::watchdog_thread_entrypoint(config, handle, thread_stop_requested)
169 |                 })
170 |                 .expect("could not spawn thread"),
171 |         );
172 | 
173 |         Self {
174 |             watchdog_thread,
175 |             stop_requested,
176 |         }
177 |     }
178 | 
179 |     fn exe_name() -> Option<String> {
180 |         Some(
181 |             Path::new(&std::env::args_os().next()?)
182 |                 .file_name()?
183 |                 .to_string_lossy()
184 |                 .as_ref()
185 |                 .to_owned(),
186 |         )
187 |     }
188 | 
189 |     fn watchdog_thread_entrypoint(
190 |         config: Config,
191 |         handle: runtime::Handle,
192 |         stop_requested: Arc<AtomicBool>,
193 |     ) {
194 |         let exe_name = Self::exe_name().unwrap_or_else(|| "?".into());
195 |         let span = warn_span!("watchdog", exe = exe_name);
196 |         let span_clone = span.clone();
197 |         span.in_scope(move || {
198 |             #[allow(unused)]
199 |             let Config {
200 |                 heartbeat_period,
201 |                 watchdog_timeout,
202 |                 triggered_metrics_duration,
203 |                 triggered_metrics_collections,
204 |                 task_dump_deadline,
205 |                 panic,
206 |                 thread_name,
207 |             } = config;
208 | 
209 |             // The heartbeat channel is a std::Mutex<Instant> shared between the heartbeat task and the watchdog thread.
210 |             // The heartbeat task updates it periodically, and the watchdog reads it periodically.
211 |             let heartbeat_channel = Arc::new(Mutex::new(Instant::now()));
212 |             let task_heartbeat_channel = heartbeat_channel.clone();
213 |             let task_stop_requested = stop_requested.clone();
214 |             let task_name = thread_name.clone();
215 | 
216 |             // Spawn the tokio task that will periodically update the heartbeat channel
217 |             handle.spawn(async move {
218 |                 info!("{task_name} heartbeat task started");
219 |                 loop {
220 |                     *task_heartbeat_channel.lock().unwrap() = Instant::now();
221 |                     tokio::time::sleep(heartbeat_period).await;
222 |                     if task_stop_requested.load(Ordering::SeqCst) {
223 |                         info!("{task_name} heartbeat task stop requested");
224 |                         break;
225 |                     }
226 |                 }
227 |             }.instrument(span_clone));
228 | 
229 |             info!("{thread_name} thread started");
230 | 
231 |             // Now enter the watchdog loop
232 |             loop {
233 |                 let last_heartbeat = *heartbeat_channel.lock().unwrap();
234 |                 let elapsed = last_heartbeat.elapsed();
235 | 
236 |                 if elapsed > watchdog_timeout {
237 |                     error!("{thread_name} thread: Watchdog has been triggered: {elapsed:?} since last heartbeat > {watchdog_timeout:?}");
238 | 
239 |                     for i in 0..triggered_metrics_collections {
240 |                         let metrics = TokioRuntimeMetrics::from(&handle);
241 |                         warn!("Runtime metrics {i}/{triggered_metrics_collections}: {metrics:#?}");
242 |                         std::thread::sleep(triggered_metrics_duration / triggered_metrics_collections);
243 |                     }
244 | 
245 |                     // If task dumps are enabled, try also to acquire a task dump.
246 |                     // According to the docs, a taskdump requires all polled futures to eventually yield;
247 |                     // it then polls them in a special tracing mode. So this won't work if the runtime is actually
248 |                     // deadlocked. If it's just slow however, then we will get this additional data.
249 |                     // If the task dump works, then we cancel the panic.
250 |                     // We have to run the task dump on a separate thread since the runtime may be FUBAR.
251 |                     #[cfg(all(tokio_unstable, tokio_taskdump))]
252 |                     {
253 |                         use std::sync::Condvar;
254 | 
255 |                         warn!("{thread_name}: Attempting to collect a taskdump");
256 | 
257 |                         let pair = Arc::new((Mutex::new(false), Condvar::new()));
258 |                         let thread_pair = pair.clone();
259 |                         let thread_handle = handle.clone();
260 | 
261 |                         if let Err(err) = std::thread::Builder::new()
262 |                             .name(format!("{thread_name}-taskdump"))
263 |                             .spawn(move || {
264 |                                 let fut = tokio::time::timeout(task_dump_deadline, thread_handle.dump());
265 |                                 match thread_handle.block_on(fut) {
266 |                                     Ok(dump) => {
267 |                                         for (i, task) in dump.tasks().iter().enumerate() {
268 |                                             let trace = task.trace().to_string();
269 |                                             warn!(task = i, "{trace}");
270 |                                         }
271 |                                         let (lk, cvar) = &*thread_pair;
272 |                                         *lk.lock().unwrap() = true;
273 |                                         cvar.notify_one();
274 |                                     }
275 |                                     Err(err) => {
276 |                                         warn!("task dump error: {err}");
277 |                                     }
278 |                                 }
279 |                             }) {
280 |                             error!("{thread_name}: Could not spawn taskdump thread: {err}");
281 |                         } else {
282 |                             // Wait at least task_dump_deadline for the task dump job to complete.
283 |                             let (lk, cvar) = &*pair;
284 |                             let (gd, _timeout_result) = cvar.wait_timeout_while(
285 |                                 lk.lock().unwrap(),
286 |                                 task_dump_deadline,
287 |                                 |&mut done| !done
288 |                             ).expect("Error waiting for condvar");
289 | 
290 |                             // Check if success was recorded
291 |                             if *gd {
292 |                                 info!("{thread_name}: Task dump was successful, this indicates that the runtime is not deadlocked. Watchdog is being reset");
293 | 
294 |                                 // Re-enter the loop. We should not immediately retrigger the watchdog, because if all futures polled successfully,
295 |                                 // then the heartbeat task should have run at least once in the last task_dump_deadline, which presumably is <= watchdog_timeout.
296 |                                 continue;
297 |                             } else {
298 |                                 warn!("{thread_name}: Task dump was unsuccessful after {task_dump_deadline:?}");
299 |                             }
300 |                         }
301 |                     }
302 | 
303 |                     if panic {
304 |                         // Check if stop was requested immediately before panicking
305 |                         if stop_requested.load(Ordering::SeqCst) {
306 |                             info!("{thread_name} stop requested");
307 |                             break;
308 |                         }
309 |                         panic!("{thread_name} panicked: {elapsed:?} since last heartbeat > {watchdog_timeout:?}, exe = {exe_name}");
310 |                     }
311 |                 }
312 | 
313 |                 // Sleep for a bit
314 |                 std::thread::sleep(heartbeat_period);
315 | 
316 |                 // Exit if requested
317 |                 if stop_requested.load(Ordering::SeqCst) {
318 |                     info!("{thread_name} stop requested");
319 |                     break;
320 |                 }
321 |             }
322 |         })
323 |     }
324 | }
325 | 
--------------------------------------------------------------------------------
/src/tokio_runtime_metrics.rs:
--------------------------------------------------------------------------------
  1 | use std::time::Duration;
  2 | use tokio::runtime;
  3 | 
  4 | /// Data collected from tokio::runtime::RuntimeMetrics
  5 | ///
  6 | /// <https://docs.rs/tokio/latest/tokio/runtime/struct.RuntimeMetrics.html>
  7 | ///
  8 | /// Annoyingly, debug logging that thing doesn't do quite what you would expect.
  9 | ///
 10 | /// I didn't want to use the tokio_metrics crate because that thing assumes that
 11 | /// you want to continuously publish these metrics, rather than just helping me
 12 | /// process snapshots from tokio.
 13 | ///
 14 | /// To populate this, usually one might do something like `TokioRuntimeMetrics::from(&Handle::current())`,
 15 | /// or simply `TokioRuntimeMetrics::current()`.
 16 | ///
 17 | /// The only thing you can do with this is debug print it.
 18 | #[allow(dead_code)]
 19 | #[derive(Clone, Debug)]
 20 | pub struct TokioRuntimeMetrics {
 21 |     /// Number of worker threads
 22 |     num_workers: usize,
 23 |     /// Number of active tasks
 24 |     num_alive_tasks: usize,
 25 |     /// Global queue depth
 26 |     global_queue_depth: usize,
 27 |     /// Number of blocking threads
 28 |     #[cfg(tokio_unstable)]
 29 |     num_blocking_threads: usize,
 30 |     /// Number of idle blocking threads
 31 |     #[cfg(tokio_unstable)]
 32 |     num_idle_blocking_threads: usize,
 33 |     /// The number of tasks currently in the blocking tasks queue, created via spawn_blocking.
 34 |     #[cfg(tokio_unstable)]
 35 |     blocking_queue_depth: usize,
 36 |     /// Total number of tasks spawned on this runtime
 37 |     #[cfg(all(tokio_unstable, target_has_atomic = "64"))]
 38 |     spawned_tasks_count: u64,
 39 |     /// Number of times that a thread outside the runtime has scheduled a task
 40 |     #[cfg(all(tokio_unstable, target_has_atomic = "64"))]
 41 |     remote_schedule_count: u64,
 42 |     /// Number of times that tasks have been forced to yield back to the scheduler after exhausting their task budgets.
 43 |     #[cfg(all(tokio_unstable, target_has_atomic = "64"))]
 44 |     budget_forced_yield_count: u64,
 45 |     /// How many times each worker has parked
 46 |     worker_park_count: Vec<u64>,
 47 |     /// How many times each worker has woken up and immediately parked again
 48 |     worker_noop_count: Vec<u64>,
 49 |     /// How many tasks each worker has stolen from another worker thread
 50 |     worker_steal_count: Vec<u64>,
 51 |     /// How many steal operations each worker has performed which stole at least one task
 52 |     worker_steal_operations: Vec<u64>,
 53 |     /// How many times each worker has polled a task
 54 |     worker_poll_count: Vec<u64>,
 55 |     /// The total amount of time each worker has been busy
 56 |     worker_total_busy_duration: Vec<Duration>,
 57 |     /// How many times a worker has scheduled a task (from within the runtime) onto its own queue
 58 |     worker_local_schedule_count: Vec<u64>,
 59 |     /// How many times a worker's local queue has become full. When this happens, it sends tasks to the injection queue
 60 |     worker_overflow_count: Vec<u64>,
 61 |     /// The number of tasks currently in each worker's local queue
 62 |     worker_local_queue_depth: Vec<usize>,
 63 |     /// The mean duration of task poll times for each worker, as an exponentially weighted moving average.
 64 |     worker_mean_poll_time: Vec<Duration>,
 65 | }
 66 | 
 67 | impl TokioRuntimeMetrics {
 68 |     /// Construct self from a tokio RuntimeMetrics object
 69 |     #[allow(unused_mut)]
 70 |     pub fn new(src: runtime::RuntimeMetrics) -> Self {
 71 |         let num_workers = src.num_workers();
 72 |         let mut worker_park_count = vec![];
 73 |         let mut worker_noop_count = vec![];
 74 |         let mut worker_steal_count = vec![];
 75 |         let mut worker_steal_operations = vec![];
 76 |         let mut worker_poll_count = vec![];
 77 |         let mut worker_total_busy_duration = vec![];
 78 |         let mut worker_local_schedule_count = vec![];
 79 |         let mut worker_overflow_count = vec![];
 80 |         let mut worker_local_queue_depth = vec![];
 81 |         let mut worker_mean_poll_time = vec![];
 82 | 
 83 |         #[cfg(tokio_unstable)]
 84 |         for i in 0..num_workers {
 85 |             worker_local_queue_depth.push(src.worker_local_queue_depth(i));
 86 |         }
 87 | 
 88 |         #[cfg(all(tokio_unstable, target_has_atomic = "64"))]
 89 |         for i in 0..num_workers {
 90 |             worker_park_count.push(src.worker_park_count(i));
 91 |             worker_noop_count.push(src.worker_noop_count(i));
 92 |             worker_steal_count.push(src.worker_steal_count(i));
 93 |             worker_steal_operations.push(src.worker_steal_operations(i));
 94 |             worker_poll_count.push(src.worker_poll_count(i));
 95 |             worker_total_busy_duration.push(src.worker_total_busy_duration(i));
 96 |             worker_local_schedule_count.push(src.worker_local_schedule_count(i));
 97 |             worker_overflow_count.push(src.worker_overflow_count(i));
 98 |             worker_mean_poll_time.push(src.worker_mean_poll_time(i));
 99 |         }
100 | 
101 |         Self {
102 |             num_workers,
103 |             num_alive_tasks: src.num_alive_tasks(),
104 |             global_queue_depth: src.global_queue_depth(),
105 |             #[cfg(tokio_unstable)]
106 |             blocking_queue_depth: src.blocking_queue_depth(),
107 |             #[cfg(tokio_unstable)]
108 |             num_blocking_threads: src.num_blocking_threads(),
109 |             #[cfg(tokio_unstable)]
110 |             num_idle_blocking_threads: src.num_idle_blocking_threads(),
111 |             #[cfg(all(tokio_unstable, target_has_atomic = "64"))]
112 |             spawned_tasks_count: src.spawned_tasks_count(),
113 |             #[cfg(all(tokio_unstable, target_has_atomic = "64"))]
114 |             remote_schedule_count: src.remote_schedule_count(),
115 |             #[cfg(all(tokio_unstable, target_has_atomic = "64"))]
116 |             budget_forced_yield_count: src.budget_forced_yield_count(),
117 |             worker_park_count,
118 |             worker_noop_count,
119 |             worker_steal_count,
120 |             worker_steal_operations,
121 |             worker_poll_count,
122 |             worker_total_busy_duration,
123 |             worker_local_schedule_count,
124 |             worker_overflow_count,
125 |             worker_local_queue_depth,
126 |             worker_mean_poll_time,
127 |         }
128 |     }
129 | 
130 |     /// Construct self using metrics from the current tokio runtime.
131 |     /// Panics if there is no current runtime.
132 |     pub fn current() -> Self {
133 |         Self::from(&runtime::Handle::current())
134 |     }
135 | }
136 | 
137 | impl From<&runtime::Handle> for TokioRuntimeMetrics {
138 |     fn from(src: &runtime::Handle) -> Self {
139 |         Self::new(src.metrics())
140 |     }
141 | }
142 | 
--------------------------------------------------------------------------------
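
As an illustration of how the helper above is meant to be consumed (a user-side sketch, not part of the crate: it just takes one snapshot from inside a runtime and debug-prints it, which is the only supported use of `TokioRuntimeMetrics`):

```rust
use tokio_util_watchdog::TokioRuntimeMetrics;

#[tokio::main]
async fn main() {
    // Snapshot the current runtime's metrics; panics if called outside a runtime.
    let metrics = TokioRuntimeMetrics::current();
    // The struct derives Debug, so pretty debug-printing is how you inspect it.
    println!("{metrics:#?}");
}
```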