├── .github
│   └── workflows
│       └── ci.yml
├── .gitignore
├── Cargo.toml
├── README.md
└── src
    ├── lib.rs
    └── tokio_runtime_metrics.rs

/.github/workflows/ci.yml:
--------------------------------------------------------------------------------
 1 | name: Continuous Integration
 2 | 
 3 | on:
 4 |   push:
 5 |     branches: [ "main" ]
 6 |   pull_request:
 7 | 
 8 | env:
 9 |   CARGO_TERM_COLOR: always
10 |   RUST_BACKTRACE: full
11 | 
12 | jobs:
13 |   # build rust
14 |   build:
15 |     runs-on: ubuntu-latest
16 |     strategy:
17 |       matrix:
18 |         RUSTFLAGS: ["--deny warnings", "--cfg tokio_unstable --deny warnings", "--cfg tokio_unstable --cfg tokio_taskdump --deny warnings"]
19 |         TARGET: ["", "--target wasm32-unknown-unknown"] # An arbitrary 32-bit target
20 |         exclude: # This doesn't work because tokio_taskdump doesn't support wasm32
21 |           - RUSTFLAGS: "--cfg tokio_unstable --cfg tokio_taskdump --deny warnings"
22 |             TARGET: "--target wasm32-unknown-unknown"
23 | 
24 |     steps:
25 |       - uses: actions/checkout@v4
26 |       - name: Rustup update
27 |         run: rustup update && rustup target list
28 |       - name: Install wasm32 stdlib
29 |         run: rustup target add wasm32-unknown-unknown
30 |       - name: Show cargo version
31 |         run: cargo --version
32 |       - name: rust build caching
33 |         uses: Swatinem/rust-cache@v2
34 |         with:
35 |           workspaces: . -> target
36 |           save-if: ${{ github.ref == 'refs/heads/main' }}
37 |       - name: Build Rust
38 |         env:
39 |           RUSTFLAGS: ${{ matrix.RUSTFLAGS }}
40 |         run: cargo build --verbose ${{ matrix.TARGET }}
41 | 
42 |   # lint rust
43 |   lint:
44 |     runs-on: ubuntu-latest
45 |     steps:
46 |       - uses: actions/checkout@v4
47 |       - name: Rustup update
48 |         run: rustup update
49 |       - name: Show cargo version
50 |         run: cargo --version
51 |       - name: rust build caching
52 |         uses: Swatinem/rust-cache@v2
53 |         with:
54 |           workspaces: . -> target
55 |           save-if: ${{ github.ref == 'refs/heads/main' }}
56 |       - name: Format
57 |         run: cargo fmt -- --check
58 |       - name: Clippy
59 |         run: cargo clippy --all --all-features -- -D warnings
60 | 
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | Cargo.lock
2 | target/**
3 | 
4 | 
--------------------------------------------------------------------------------
/Cargo.toml:
--------------------------------------------------------------------------------
 1 | [package]
 2 | name = "tokio_util_watchdog"
 3 | version = "0.1.2"
 4 | authors = ["beck.ct@gmail.com"]
 5 | edition = "2021"
 6 | license = "MIT OR Apache-2.0"
 7 | readme = "README.md"
 8 | rust-version = "1.74"
 9 | repository = "https://github.com/cbeck88/tokio_util_watchdog"
10 | description = "A watchdog utility for tokio runtimes"
11 | categories = ["asynchronous", "development-tools::debugging"]
12 | keywords = [
13 |     "async",
14 |     "tokio",
15 |     "utility",
16 |     "watchdog",
17 | ]
18 | 
19 | [lints.rust]
20 | unexpected_cfgs = { level = "allow", check-cfg = ['cfg(tokio_unstable)'] }
21 | 
22 | [dependencies]
23 | tokio = { version = "1.43.0", features = ["rt", "time"] }
24 | tracing = { version = "0.1" }
25 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # tokio_util_watchdog
  2 | 
  3 | A watchdog utility for detecting deadlocks in tokio runtimes.
  4 | 
  5 | [![Crates.io](https://img.shields.io/crates/v/tokio_util_watchdog?style=flat-square)](https://crates.io/crates/tokio_util_watchdog)
  6 | [![Crates.io](https://img.shields.io/crates/d/tokio_util_watchdog?style=flat-square)](https://crates.io/crates/tokio_util_watchdog)
  7 | [![License](https://img.shields.io/badge/license-Apache%202.0-blue?style=flat-square)](LICENSE-APACHE)
  8 | [![License](https://img.shields.io/badge/license-MIT-blue?style=flat-square)](LICENSE-MIT)
  9 | [![Build Status](https://img.shields.io/github/actions/workflow/status/cbeck88/tokio_util_watchdog/ci.yml?branch=main&style=flat-square)](https://github.com/cbeck88/tokio_util_watchdog/actions/workflows/ci.yml?query=branch%3Amain)
 10 | 
 11 | [API Docs](https://docs.rs/tokio_util_watchdog/latest/tokio_util_watchdog/)
 12 | 
 13 | ---
 14 | 
 15 | If we get a tokio deadlock, i.e. all worker threads get blocked and no more
 16 | asynchronous futures can be driven, it can be hard to diagnose and debug
 17 | in production.
 18 | 
 19 | This watchdog uses a very simple strategy to detect and try to recover from that
 20 | situation:
 21 | 
 22 | * Spawn a task on the runtime that periodically records "heartbeats", e.g. once a second.
 23 | * Spawn a thread (using std) outside of the runtime that wakes up periodically and checks
 24 |   for those heartbeats.
 25 | * If heartbeats are not detected for a few seconds (configurable), panic.
 26 | * Before we panic, try to collect and log [`tokio::runtime::RuntimeMetrics`](https://docs.rs/tokio/latest/tokio/runtime/struct.RuntimeMetrics.html) for this runtime for a few seconds (configurable).
 27 | * If `cfg(tokio_unstable)` and `cfg(tokio_taskdump)` were used, also try to collect and log a [task dump](https://docs.rs/tokio/latest/tokio/runtime/dump/struct.Dump.html) for a few seconds.
 28 | 
 29 | The assumption here is that when the panic occurs, your deployment infrastructure will detect that this happened
 30 | and restart the process. Hopefully the process will recover and not immediately deadlock again. And meanwhile, you
 31 | will automatically get more information than you would otherwise, which might help you fix the underlying issue,
 32 | especially if you used the extra features.
 33 | 
 34 | (If you used Django in the past, you might have seen similar behavior, where timed-out worker processes are automatically
 35 | killed and restarted, with some error logging, without blocking or starving the whole webserver.)
 36 | 
 37 | Note that this is a different type of watchdog from e.g. [`simple-tokio-watchdog`](https://crates.io/crates/simple-tokio-watchdog) and
 38 | some other such crates -- our crate is specifically for checking the tokio runtime itself for liveness, and then logging any useful diagnostics
 39 | and panicking (configurable).
 40 | 
 41 | ## Quick start
 42 | 
 43 | 1. Add `tokio_util_watchdog = "0.1"` to your `Cargo.toml`.
 44 | 1. In `main.rs` somewhere, add lines such as:
 45 | 
 46 | ```rust
 47 | use tokio_util_watchdog::Watchdog;
 48 | 
 49 | // ...
 50 | 
 51 | #[tokio::main]
 52 | async fn main() {
 53 |     // ...
 54 | 
 55 |     let _watchdog = Watchdog::builder().build();
 56 | 
 57 |     // ...
 58 | }
 59 | ```
 60 | 
 61 | See the [builder documentation](https://docs.rs/tokio_util_watchdog/latest/tokio_util_watchdog/struct.Builder.html) for configuration options. The watchdog is disarmed gracefully if it is dropped.
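
For example, a sketch of a more customized setup (the builder methods shown are the ones documented on `Builder`; the specific values are illustrative, not recommendations):

```rust
use std::time::Duration;
use tokio_util_watchdog::Watchdog;

#[tokio::main]
async fn main() {
    // Keep the handle alive for the lifetime of the program;
    // dropping it disarms the watchdog and joins its thread.
    let _watchdog = Watchdog::builder()
        // How often the in-runtime heartbeat task beats.
        .heartbeat_period(Duration::from_secs(1))
        // How long we tolerate going without a heartbeat before triggering.
        .watchdog_timeout(Duration::from_secs(10))
        // Panic after logging diagnostics (this is the default).
        .panic(true)
        .build();

    // ... the rest of your application ...
}
```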
 62 | 
 63 | **Optional:**
 64 | 
 65 | In `.cargo/config.toml`, add content such as:
 66 | 
 67 | ```toml
 68 | # We only enable tokio_taskdump on Linux targets since it's not supported on macOS
 69 | [build]
 70 | rustflags = ["--cfg", "tokio_unstable"]
 71 | 
 72 | [target.x86_64-unknown-linux-gnu]
 73 | rustflags = ["--cfg", "tokio_unstable", "--cfg", "tokio_taskdump"]
 74 | 
 75 | [target.aarch64-unknown-linux-gnu]
 76 | rustflags = ["--cfg", "tokio_unstable", "--cfg", "tokio_taskdump"]
 77 | ```
 78 | 
 79 | This will enable collection of additional [`tokio::runtime::RuntimeMetrics`](https://docs.rs/tokio/latest/tokio/runtime/struct.RuntimeMetrics.html)
 80 | and task dumps, which will be logged if a deadlock is detected.
 81 | 
 82 | Note: Some parts of `tokio::runtime::RuntimeMetrics` have been stabilized, so you will still get some data without these flags, although you will miss many metrics
 83 | and won't get task dumps. See the [tokio unstable features documentation](https://docs.rs/tokio/latest/tokio/index.html#unstable-features).
 84 | 
 85 | ## Pros and Cons
 86 | 
 87 | Some types of deployment infrastructure will do external liveness checking of your process, e.g. using http requests.
 88 | Then, if this check fails, your process might get SIGTERM before SIGKILL, so you could try to tie this type of data collection and logging to
 89 | the SIGTERM signal handler instead of an internal timer.
 90 | 
 91 | There are a few advantages that I've seen to the internal watchdog timer approach:
 92 | 
 93 | * Not everything that uses async rust is an http server, and adding an http server just for liveness checks may feel heavy or awkward, as you will also have to configure it.
 94 | * Signal handling can itself be a can of worms.
 95 | * Sometimes if there are deadlocks in your system, a good way to reproduce them is to limit the runtime to 1 or 2 worker threads (e.g. via the `TOKIO_WORKER_THREADS` environment variable or `runtime::Builder::worker_threads`), and
 96 |   exercise some part of your system via integration tests in CI. You may want those tests to be very simple and not involve Docker etc.,
 97 |   and at that point internal liveness checking such as by this watchdog may be attractive.
 98 | * The other thing I like to do when smoking out these issues is to not run the binary directly in CI, but run it through `gdb`, such as:
 99 |   `gdb -return-child-result -batch -ex run -ex 'thread apply all bt' -ex quit --args target/release/my_bin` (annotated below).
100 |   This will make it so that your process runs with `gdb` already attached, and when it stops, the command `thread apply all bt` is run.
101 |   Then `gdb` quits and returns the child's exit code, so CI fails if a panic occurred.
102 |   If the process runs this way and the watchdog panics, you will get a backtrace from every thread
103 |   in the program, in the logs, automatically, without having to ssh into the CI worker and attach gdb manually. These backtraces are thread backtraces, not
104 |   async-aware task backtraces, so they aren't as helpful or informative as the task dump -- the higher frames of the stack are likely to be unrelated to whatever
105 |   sequence of async calls was happening. However, the innermost frames can be very interesting -- if your thread is sitting in a sleep, or one of the mutex-related
106 |   `pthread` calls, or in a C library like `libpq`, that can help you figure out what blocking calls might be happening and narrow down where your problem might be. And you will
107 |   get this data even if the watchdog was unable to get a task dump.
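
  The same invocation broken out, with comments on what each flag does (illustrative; `target/release/my_bin` stands in for your own binary):

  ```sh
  # -return-child-result       : exit with the child's exit code, so CI still fails if the binary panicked
  # -batch                     : run non-interactively and exit when the commands finish
  # -ex run                    : start the program immediately
  # -ex 'thread apply all bt'  : once the program stops, print a backtrace of every thread
  gdb -return-child-result -batch \
      -ex run \
      -ex 'thread apply all bt' \
      -ex quit \
      --args target/release/my_bin
  ```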
108 | * The in-process heartbeat system is very simple, whereas an http-based liveness check could also fail because of a networking issue rather than a deadlock.
109 |   Note that nothing stops you from using both and putting a longer timeout on the http-based check.
110 | * I have not experienced any false positives from this system in production or in CI testing -- the watchdog triggering has always been traced back to an actual problem.
111 | 
112 | You do pay the cost of having an extra thread in your process, but it only wakes up once a second (configurable) and this is typically negligible.
113 | In any case, any scheme of getting more tokio metrics after your runtime is deadlocked will require you to have a thread somewhere outside the runtime that can still do some work.
114 | 
115 | Another option is to use the [`tokio_metrics`](https://github.com/tokio-rs/tokio-metrics) crate, which is geared towards always collecting these metrics and publishing them e.g. via Prometheus. If you do that, you might choose to set `triggered_metrics_collections` to `0` on the watchdog, so that it won't bother collecting any metrics. You can still benefit from logging of task dumps performed by the watchdog, and you can even set `panic` to `false`, so that the only thing the watchdog does is attempt to collect task dumps and log them when heartbeats are missed.
116 | 
117 | ## License
118 | 
119 | MIT or Apache 2.0
120 | 
--------------------------------------------------------------------------------
/src/lib.rs:
--------------------------------------------------------------------------------
  1 | //! This crate provides utilities for trying to catch and debug a blocked tokio runtime.
  2 | //!
  3 | //! * A watchdog which consists of a thread outside of tokio, and also a task within tokio,
  4 | //!   which sends heartbeats to the watchdog thread. If the heartbeats do not come frequently
  5 | //!   enough, the watchdog decides that the tokio executor is probably blocked.
  6 | //!   It will then attempt to collect metrics from the runtime and log them,
  7 | //!   any other relevant info (backtraces would be ideal),
  8 | //!   and eventually panic, although this behavior is configurable via the builder.
  9 | //! * Helper functions for obtaining runtime metrics etc. are also exposed.
 10 | 
 11 | #![deny(missing_docs)]
 12 | #![allow(deprecated)]
 13 | 
 14 | use std::{
 15 |     path::Path,
 16 |     sync::{
 17 |         atomic::{AtomicBool, Ordering},
 18 |         Arc, Mutex,
 19 |     },
 20 |     time::{Duration, Instant},
 21 | };
 22 | use tokio::runtime;
 23 | #[allow(unused)]
 24 | use tracing::{error, info, warn, warn_span, Instrument};
 25 | 
 26 | mod tokio_runtime_metrics;
 27 | 
 28 | pub use tokio_runtime_metrics::TokioRuntimeMetrics;
 29 | 
 30 | #[derive(Clone, Debug)]
 31 | struct Config {
 32 |     heartbeat_period: Duration,
 33 |     watchdog_timeout: Duration,
 34 |     triggered_metrics_duration: Duration,
 35 |     triggered_metrics_collections: u32,
 36 |     task_dump_deadline: Duration,
 37 |     panic: bool,
 38 |     thread_name: String,
 39 | }
 40 | 
 41 | impl Default for Config {
 42 |     fn default() -> Self {
 43 |         Self {
 44 |             heartbeat_period: Duration::from_secs(1),
 45 |             watchdog_timeout: Duration::from_secs(5),
 46 |             triggered_metrics_duration: Duration::from_secs(2),
 47 |             triggered_metrics_collections: 20,
 48 |             task_dump_deadline: Duration::from_secs(5),
 49 |             panic: true,
 50 |             thread_name: "tokio-watchdog".into(),
 51 |         }
 52 |     }
 53 | }
 54 | 
 55 | /// Builder which can configure a watchdog object.
 56 | #[derive(Clone, Debug, Default)]
 57 | pub struct Builder {
 58 |     config: Config,
 59 | }
 60 | 
 61 | impl Builder {
 62 |     /// Set the heartbeat period, i.e. how frequently the heartbeat task beats.
 63 |     /// Defaults to 1s.
 64 |     pub fn heartbeat_period(mut self, d: Duration) -> Self {
 65 |         self.config.heartbeat_period = d;
 66 |         self
 67 |     }
 68 | 
 69 |     /// Set the watchdog timeout, i.e. how long we can go without seeing a heartbeat before the watchdog is triggered.
 70 |     /// Defaults to 5s.
 71 |     pub fn watchdog_timeout(mut self, d: Duration) -> Self {
 72 |         self.config.watchdog_timeout = d;
 73 |         self
 74 |     }
 75 | 
 76 |     /// Set how long we collect metrics for when triggered.
 77 |     /// Defaults to 2s.
 78 |     pub fn triggered_metrics_duration(mut self, d: Duration) -> Self {
 79 |         self.config.triggered_metrics_duration = d;
 80 |         self
 81 |     }
 82 | 
 83 |     /// Set how many times we will try to collect metrics during the period.
 84 |     /// Defaults to 20. Set to 0 to disable metrics collection.
 85 |     pub fn triggered_metrics_collections(mut self, n: u32) -> Self {
 86 |         self.config.triggered_metrics_collections = n;
 87 |         self
 88 |     }
 89 | 
 90 |     /// Set how long we will wait for a taskdump when triggered.
 91 |     /// Defaults to 5s.
 92 |     pub fn task_dump_deadline(mut self, d: Duration) -> Self {
 93 |         self.config.task_dump_deadline = d;
 94 |         self
 95 |     }
 96 | 
 97 |     /// Set whether or not to panic when triggered. Defaults to true.
 98 |     pub fn panic(mut self, b: bool) -> Self {
 99 |         self.config.panic = b;
100 |         self
101 |     }
102 | 
103 |     /// Set the thread name. Defaults to "tokio-watchdog".
104 |     pub fn thread_name(mut self, s: &str) -> Self {
105 |         self.config.thread_name = s.to_owned();
106 |         self
107 |     }
108 | 
109 |     /// Build a watchdog instance for the current tokio runtime.
110 |     /// Panics if there is no current runtime.
111 |     pub fn build(self) -> Watchdog {
112 |         self.build_for_runtime(runtime::Handle::current())
113 |     }
114 | 
115 |     /// Build a watchdog instance for a given tokio runtime.
116 |     pub fn build_for_runtime(self, handle: runtime::Handle) -> Watchdog {
117 |         Watchdog::new_for_runtime(self.config, handle)
118 |     }
119 | }
120 | 
121 | /// The watchdog object monitors a given tokio runtime to see if it looks deadlocked.
122 | ///
123 | /// It spawns a thread outside of the runtime which watches for heartbeats from an async task
124 | /// that it spawns in the runtime.
125 | ///
126 | /// When a long enough time passes without a heartbeat, the watchdog is "triggered".
127 | ///
128 | /// By default, when it is triggered it will:
129 | /// * Try to collect tokio runtime metrics for a few seconds and log them.
130 | /// * Try to log a tokio task dump for a few seconds (giving up if it doesn't succeed).
131 | /// * Panic, so that the process can restart and hopefully recover.
132 | ///
133 | /// The panic / restart idea is similar in spirit to how gunicorn / Django will try to
134 | /// restart worker processes that time out.
135 | ///
136 | /// Dropping the watchdog will join the watchdog thread and the heartbeat task.
137 | pub struct Watchdog {
138 |     watchdog_thread: Option<std::thread::JoinHandle<()>>,
139 |     stop_requested: Arc<AtomicBool>,
140 | }
141 | 
142 | impl Drop for Watchdog {
143 |     fn drop(&mut self) {
144 |         if let Some(handle) = self.watchdog_thread.take() {
145 |             self.stop_requested.store(true, Ordering::SeqCst);
146 |             handle.join().expect("Could not join watchdog thread");
147 |         }
148 |     }
149 | }
150 | 
151 | impl Watchdog {
152 |     /// Create a new watchdog builder, to configure a watchdog instance.
153 |     pub fn builder() -> Builder {
154 |         Builder::default()
155 |     }
156 | 
157 |     /// Make a new watchdog for a given tokio runtime.
158 |     /// Starts the heartbeat task and the watchdog thread.
159 |     /// Drop this handle in order to stop both.
160 |     fn new_for_runtime(config: Config, handle: runtime::Handle) -> Self {
161 |         let stop_requested = Arc::new(AtomicBool::default());
162 |         let thread_stop_requested = stop_requested.clone();
163 | 
164 |         let watchdog_thread = Some(
165 |             std::thread::Builder::new()
166 |                 .name(format!("{}-thread", config.thread_name))
167 |                 .spawn(move || {
168 |                     Self::watchdog_thread_entrypoint(config, handle, thread_stop_requested)
169 |                 })
170 |                 .expect("could not spawn thread"),
171 |         );
172 | 
173 |         Self {
174 |             watchdog_thread,
175 |             stop_requested,
176 |         }
177 |     }
178 | 
179 |     fn exe_name() -> Option<String> {
180 |         Some(
181 |             Path::new(&std::env::args_os().next()?)
182 |                 .file_name()?
183 |                 .to_string_lossy()
184 |                 .as_ref()
185 |                 .to_owned(),
186 |         )
187 |     }
188 | 
189 |     fn watchdog_thread_entrypoint(
190 |         config: Config,
191 |         handle: runtime::Handle,
192 |         stop_requested: Arc<AtomicBool>,
193 |     ) {
194 |         let exe_name = Self::exe_name().unwrap_or_else(|| "?".into());
195 |         let span = warn_span!("watchdog", exe = exe_name);
196 |         let span_clone = span.clone();
197 |         span.in_scope(move || {
198 |             #[allow(unused)]
199 |             let Config {
200 |                 heartbeat_period,
201 |                 watchdog_timeout,
202 |                 triggered_metrics_duration,
203 |                 triggered_metrics_collections,
204 |                 task_dump_deadline,
205 |                 panic,
206 |                 thread_name,
207 |             } = config;
208 | 
209 |             // The heartbeat channel is a std::Mutex<Instant> shared between the heartbeat task and the watchdog thread.
210 |             // The heartbeat task updates it periodically, and the watchdog reads it periodically.
211 |             let heartbeat_channel = Arc::new(Mutex::new(Instant::now()));
212 |             let task_heartbeat_channel = heartbeat_channel.clone();
213 |             let task_stop_requested = stop_requested.clone();
214 |             let task_name = thread_name.clone();
215 | 
216 |             // Spawn the tokio task that will periodically update the heartbeat channel
217 |             handle.spawn(async move {
218 |                 info!("{task_name} heartbeat task started");
219 |                 loop {
220 |                     *task_heartbeat_channel.lock().unwrap() = Instant::now();
221 |                     tokio::time::sleep(heartbeat_period).await;
222 |                     if task_stop_requested.load(Ordering::SeqCst) {
223 |                         info!("{task_name} heartbeat task stop requested");
224 |                         break;
225 |                     }
226 |                 }
227 |             }.instrument(span_clone));
228 | 
229 |             info!("{thread_name} thread started");
230 | 
231 |             // Now enter the watchdog loop
232 |             loop {
233 |                 let last_heartbeat = *heartbeat_channel.lock().unwrap();
234 |                 let elapsed = last_heartbeat.elapsed();
235 | 
236 |                 if elapsed > watchdog_timeout {
237 |                     error!("{thread_name} thread: Watchdog has been triggered: {elapsed:?} since last heartbeat > {watchdog_timeout:?}");
238 | 
239 |                     for i in 0..triggered_metrics_collections {
240 |                         let metrics = TokioRuntimeMetrics::from(&handle);
241 |                         warn!("Runtime metrics {i}/{triggered_metrics_collections}: {metrics:#?}");
242 |                         std::thread::sleep(triggered_metrics_duration / triggered_metrics_collections);
243 |                     }
244 | 
245 |                     // If task dumps are enabled, try also to acquire a task dump.
246 |                     // According to the docs, a taskdump requires all polled futures to eventually yield;
247 |                     // it then polls them in a special tracing mode. So this won't work if the runtime is actually
248 |                     // deadlocked. If it's just slow however, then we will get this additional data.
249 |                     // If the task dump works, then we cancel the panic.
250 |                     // We have to run the task dump on a separate thread since the runtime may be FUBAR.
251 |                     #[cfg(all(tokio_unstable, tokio_taskdump))]
252 |                     {
253 |                         use std::sync::Condvar;
254 | 
255 |                         warn!("{thread_name}: Attempting to collect a taskdump");
256 | 
257 |                         let pair = Arc::new((Mutex::new(false), Condvar::new()));
258 |                         let thread_pair = pair.clone();
259 |                         let thread_handle = handle.clone();
260 | 
261 |                         if let Err(err) = std::thread::Builder::new()
262 |                             .name(format!("{thread_name}-taskdump"))
263 |                             .spawn(move || {
264 |                                 let fut = tokio::time::timeout(task_dump_deadline, thread_handle.dump());
265 |                                 match thread_handle.block_on(fut) {
266 |                                     Ok(dump) => {
267 |                                         for (i, task) in dump.tasks().iter().enumerate() {
268 |                                             let trace = task.trace().to_string();
269 |                                             warn!(task = i, "{trace}");
270 |                                         }
271 |                                         let (lk, cvar) = &*thread_pair;
272 |                                         *lk.lock().unwrap() = true;
273 |                                         cvar.notify_one();
274 |                                     }
275 |                                     Err(err) => {
276 |                                         warn!("task dump error: {err}");
277 |                                     }
278 |                                 }
279 |                             }) {
280 |                             error!("{thread_name}: Could not spawn taskdump thread: {err}");
281 |                         } else {
282 |                             // Wait at least task_dump_deadline for the task dump job to complete.
283 |                             let (lk, cvar) = &*pair;
284 |                             let (gd, _timeout_result) = cvar.wait_timeout_while(
285 |                                 lk.lock().unwrap(),
286 |                                 task_dump_deadline,
287 |                                 |&mut done| !done
288 |                             ).expect("Error waiting for condvar");
289 | 
290 |                             // Check if success was recorded
291 |                             if *gd {
292 |                                 info!("{thread_name}: Task dump was successful, this indicates that the runtime is not deadlocked. Watchdog is being reset");
293 | 
294 |                                 // Re-enter the loop. We should not immediately retrigger the watchdog, because if all futures polled successfully,
295 |                                 // then the heartbeat task should have run at least once in the last task_dump_deadline, which presumably is <= watchdog_timeout.
296 |                                 continue;
297 |                             } else {
298 |                                 warn!("{thread_name}: Task dump was unsuccessful after {task_dump_deadline:?}");
299 |                             }
300 |                         }
301 |                     }
302 | 
303 |                     if panic {
304 |                         // Check if stop was requested immediately before panicking
305 |                         if stop_requested.load(Ordering::SeqCst) {
306 |                             info!("{thread_name} stop requested");
307 |                             break;
308 |                         }
309 |                         panic!("{thread_name} panicked: {elapsed:?} since last heartbeat > {watchdog_timeout:?}, exe = {exe_name}");
310 |                     }
311 |                 }
312 | 
313 |                 // Sleep for a bit
314 |                 std::thread::sleep(heartbeat_period);
315 | 
316 |                 // Exit if requested
317 |                 if stop_requested.load(Ordering::SeqCst) {
318 |                     info!("{thread_name} stop requested");
319 |                     break;
320 |                 }
321 |             }
322 |         })
323 |     }
324 | }
325 | 
--------------------------------------------------------------------------------
/src/tokio_runtime_metrics.rs:
--------------------------------------------------------------------------------
  1 | use std::time::Duration;
  2 | use tokio::runtime;
  3 | 
  4 | /// Data collected from tokio::runtime::RuntimeMetrics
  5 | ///
  6 | /// <https://docs.rs/tokio/latest/tokio/runtime/struct.RuntimeMetrics.html>
  7 | ///
  8 | /// Annoyingly, debug logging that thing doesn't do quite what you would expect.
  9 | ///
 10 | /// I didn't want to use the tokio_metrics crate because that thing assumes that
 11 | /// you want to continuously publish these metrics, rather than just helping me
 12 | /// process snapshots from tokio.
 13 | ///
 14 | /// To populate this, usually one might do something like `TokioRuntimeMetrics::from(&Handle::current())`,
 15 | /// or simply `TokioRuntimeMetrics::current()`.
 16 | ///
 17 | /// The only thing you can do with this is debug print it.
 18 | #[allow(dead_code)]
 19 | #[derive(Clone, Debug)]
 20 | pub struct TokioRuntimeMetrics {
 21 |     /// Number of worker threads
 22 |     num_workers: usize,
 23 |     /// Number of active tasks
 24 |     num_alive_tasks: usize,
 25 |     /// Global queue depth
 26 |     global_queue_depth: usize,
 27 |     /// Number of blocking threads
 28 |     #[cfg(tokio_unstable)]
 29 |     num_blocking_threads: usize,
 30 |     /// Number of idle blocking threads
 31 |     #[cfg(tokio_unstable)]
 32 |     num_idle_blocking_threads: usize,
 33 |     /// The number of tasks currently in the blocking tasks queue, created via spawn_blocking.
 34 |     #[cfg(tokio_unstable)]
 35 |     blocking_queue_depth: usize,
 36 |     /// Total number of tasks spawned on this runtime
 37 |     #[cfg(all(tokio_unstable, target_has_atomic = "64"))]
 38 |     spawned_tasks_count: u64,
 39 |     /// Number of times that a thread outside the runtime has scheduled a task
 40 |     #[cfg(all(tokio_unstable, target_has_atomic = "64"))]
 41 |     remote_schedule_count: u64,
 42 |     /// Number of times that tasks have been forced to yield back to the scheduler after exhausting their task budgets.
 43 |     #[cfg(all(tokio_unstable, target_has_atomic = "64"))]
 44 |     budget_forced_yield_count: u64,
 45 |     /// How many times each worker has parked
 46 |     worker_park_count: Vec<u64>,
 47 |     /// How many times each worker has woken up and immediately parked again
 48 |     worker_noop_count: Vec<u64>,
 49 |     /// How many tasks each worker has stolen from another worker thread
 50 |     worker_steal_count: Vec<u64>,
 51 |     /// How many steal operations each worker has performed which stole at least one task
 52 |     worker_steal_operations: Vec<u64>,
 53 |     /// How many times each worker has polled a task
 54 |     worker_poll_count: Vec<u64>,
 55 |     /// The total amount of time each worker has been busy
 56 |     worker_total_busy_duration: Vec<Duration>,
 57 |     /// How many times a worker has scheduled a task (from within the runtime) onto its own queue
 58 |     worker_local_schedule_count: Vec<u64>,
 59 |     /// How many times a worker's local queue has become full. When this happens, it sends tasks to the injection queue
 60 |     worker_overflow_count: Vec<u64>,
 61 |     /// The number of tasks currently in each worker's local queue
 62 |     worker_local_queue_depth: Vec<usize>,
 63 |     /// The mean duration of task poll times for each worker, as an exponentially weighted moving average.
 64 |     worker_mean_poll_time: Vec<Duration>,
 65 | }
 66 | 
 67 | impl TokioRuntimeMetrics {
 68 |     /// Construct self from a tokio RuntimeMetrics object
 69 |     #[allow(unused_mut)]
 70 |     pub fn new(src: runtime::RuntimeMetrics) -> Self {
 71 |         let num_workers = src.num_workers();
 72 |         let mut worker_park_count = vec![];
 73 |         let mut worker_noop_count = vec![];
 74 |         let mut worker_steal_count = vec![];
 75 |         let mut worker_steal_operations = vec![];
 76 |         let mut worker_poll_count = vec![];
 77 |         let mut worker_total_busy_duration = vec![];
 78 |         let mut worker_local_schedule_count = vec![];
 79 |         let mut worker_overflow_count = vec![];
 80 |         let mut worker_local_queue_depth = vec![];
 81 |         let mut worker_mean_poll_time = vec![];
 82 | 
 83 |         #[cfg(tokio_unstable)]
 84 |         for i in 0..num_workers {
 85 |             worker_local_queue_depth.push(src.worker_local_queue_depth(i));
 86 |         }
 87 | 
 88 |         #[cfg(all(tokio_unstable, target_has_atomic = "64"))]
 89 |         for i in 0..num_workers {
 90 |             worker_park_count.push(src.worker_park_count(i));
 91 |             worker_noop_count.push(src.worker_noop_count(i));
 92 |             worker_steal_count.push(src.worker_steal_count(i));
 93 |             worker_steal_operations.push(src.worker_steal_operations(i));
 94 |             worker_poll_count.push(src.worker_poll_count(i));
 95 |             worker_total_busy_duration.push(src.worker_total_busy_duration(i));
 96 |             worker_local_schedule_count.push(src.worker_local_schedule_count(i));
 97 |             worker_overflow_count.push(src.worker_overflow_count(i));
 98 |             worker_mean_poll_time.push(src.worker_mean_poll_time(i));
 99 |         }
100 | 
101 |         Self {
102 |             num_workers,
103 |             num_alive_tasks: src.num_alive_tasks(),
104 |             global_queue_depth: src.global_queue_depth(),
105 |             #[cfg(tokio_unstable)]
106 |             blocking_queue_depth: src.blocking_queue_depth(),
107 |             #[cfg(tokio_unstable)]
108 |             num_blocking_threads: src.num_blocking_threads(),
109 |             #[cfg(tokio_unstable)]
110 |             num_idle_blocking_threads: src.num_idle_blocking_threads(),
111 |             #[cfg(all(tokio_unstable, target_has_atomic = "64"))]
112 |             spawned_tasks_count: src.spawned_tasks_count(),
113 |             #[cfg(all(tokio_unstable, target_has_atomic = "64"))]
114 |             remote_schedule_count: src.remote_schedule_count(),
115 |             #[cfg(all(tokio_unstable, target_has_atomic = "64"))]
116 |             budget_forced_yield_count: src.budget_forced_yield_count(),
117 |             worker_park_count,
118 |             worker_noop_count,
119 |             worker_steal_count,
120 |             worker_steal_operations,
121 |             worker_poll_count,
122 |             worker_total_busy_duration,
123 |             worker_local_schedule_count,
124 |             worker_overflow_count,
125 |             worker_local_queue_depth,
126 |             worker_mean_poll_time,
127 |         }
128 |     }
129 | 
130 |     /// Construct self using metrics from the current tokio runtime.
131 |     /// Panics if there is no current runtime.
132 |     pub fn current() -> Self {
133 |         Self::from(&runtime::Handle::current())
134 |     }
135 | }
136 | 
137 | impl From<&runtime::Handle> for TokioRuntimeMetrics {
138 |     fn from(src: &runtime::Handle) -> Self {
139 |         Self::new(src.metrics())
140 |     }
141 | }
142 | 
--------------------------------------------------------------------------------
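
As an illustration of how the helper above is meant to be consumed (a user-side sketch, not part of the crate: it just takes one snapshot from inside a runtime and debug-prints it, which is the only supported use of `TokioRuntimeMetrics`):

```rust
use tokio_util_watchdog::TokioRuntimeMetrics;

#[tokio::main]
async fn main() {
    // Snapshot the current runtime's metrics; panics if called outside a runtime.
    let metrics = TokioRuntimeMetrics::current();
    // The struct derives Debug, so pretty debug-printing is how you inspect it.
    println!("{metrics:#?}");
}
```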