├── .gitignore ├── .github └── workflows │ ├── ci.yml │ ├── semgrep.yml │ └── publish.yml ├── Cargo.toml ├── LICENCE ├── README.md ├── src ├── lifecycle.rs ├── pipes.rs ├── restart_coordination_socket.rs ├── shutdown.rs └── lib.rs └── examples └── restarter.rs /.gitignore: -------------------------------------------------------------------------------- 1 | /target 2 | /Cargo.lock 3 | -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: CI 2 | 3 | on: 4 | push: 5 | branches: [ "main" ] 6 | pull_request: 7 | branches: [ "main" ] 8 | 9 | env: 10 | CARGO_TERM_COLOR: always 11 | 12 | jobs: 13 | lint: 14 | runs-on: ubuntu-latest 15 | steps: 16 | - uses: actions/checkout@v3 17 | - name: Check format 18 | run: cargo fmt -- --check 19 | - name: Run clippy 20 | run: cargo clippy --all --all-targets 21 | build: 22 | runs-on: ubuntu-latest 23 | steps: 24 | - uses: actions/checkout@v3 25 | - name: Build 26 | run: cargo build --verbose 27 | - name: Run tests 28 | run: cargo test --verbose 29 | -------------------------------------------------------------------------------- /.github/workflows/semgrep.yml: -------------------------------------------------------------------------------- 1 | on: 2 | pull_request: {} 3 | workflow_dispatch: {} 4 | push: 5 | branches: 6 | - main 7 | - master 8 | schedule: 9 | - cron: '0 0 * * *' 10 | name: Semgrep config 11 | jobs: 12 | semgrep: 13 | name: semgrep/ci 14 | runs-on: ubuntu-latest 15 | env: 16 | SEMGREP_APP_TOKEN: ${{ secrets.SEMGREP_APP_TOKEN }} 17 | SEMGREP_URL: https://cloudflare.semgrep.dev 18 | SEMGREP_APP_URL: https://cloudflare.semgrep.dev 19 | SEMGREP_VERSION_CHECK_URL: https://cloudflare.semgrep.dev/api/check-version 20 | container: 21 | image: returntocorp/semgrep 22 | steps: 23 | - uses: actions/checkout@v4 24 | - run: semgrep ci 25 | 
-------------------------------------------------------------------------------- /.github/workflows/publish.yml: -------------------------------------------------------------------------------- 1 | name: Rust 2 | 3 | on: 4 | release: 5 | types: [published] # Only publish to crates.io when we formally publish a release 6 | # For more on how to formally release shellflip, check out https://help.github.com/en/articles/creating-releases 7 | 8 | jobs: 9 | build: 10 | 11 | runs-on: ubuntu-latest 12 | 13 | steps: 14 | - uses: actions/checkout@v3 15 | - name: Login to crates.io 16 | run: cargo login $CRATES_IO_TOKEN 17 | env: 18 | CRATES_IO_TOKEN: ${{ secrets.crates_io_token }} # https://help.github.com/en/articles/virtual-environments-for-github-actions#creating-and-using-secrets-encrypted-variables 19 | - name: Dry run publish 20 | run: cargo publish --dry-run 21 | - name: Publish 22 | run: cargo publish 23 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "shellflip" 3 | version = "2.1.2" 4 | edition = "2021" 5 | description = "Graceful process restarts in Rust" 6 | repository = "https://github.com/cloudflare/shellflip" 7 | documentation = "https://docs.rs/shellflip" 8 | keywords = ["restart", "systemd"] 9 | license = "BSD-3-Clause" 10 | readme = "README.md" 11 | 12 | [dependencies] 13 | async-trait = "0.1.61" 14 | anyhow = "1.0.56" 15 | futures = "0.3" 16 | libc = "0.2.76" 17 | log = "0.4.17" 18 | nix = "0.25" 19 | sd-notify = "0.3" 20 | serde = { version = "1", features = ["derive", "rc"] } 21 | serde_json = "1.0" 22 | thiserror = "1.0" 23 | tokio = { version = "1.24.1", features = ["full", "test-util"] } 24 | tokio-stream = { version = "0.1", features = ["net", "io-util" ] } 25 | tokio-util = { version = "0.7.4", features = ["compat", "time", "codec"] } 26 | 27 | [dev-dependencies] 28 | clap = { version = "4.1.8", 
features = ["derive"] } 29 | env_logger = "0.10.0" 30 | rand = { version = "0.8", features = ["small_rng"] } 31 | -------------------------------------------------------------------------------- /LICENCE: -------------------------------------------------------------------------------- 1 | Copyright (C) 2023, Cloudflare, Inc. 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without modification, 5 | are permitted provided that the following conditions are met: 6 | 7 | 1. Redistributions of source code must retain the above copyright notice, this 8 | list of conditions and the following disclaimer. 9 | 10 | 2. Redistributions in binary form must reproduce the above copyright notice, 11 | this list of conditions and the following disclaimer in the documentation and/or 12 | other materials provided with the distribution. 13 | 14 | 3. Neither the name of the copyright holder nor the names of its contributors 15 | may be used to endorse or promote products derived from this software without 16 | specific prior written permission. 17 | 18 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 19 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 20 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 21 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR 22 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 23 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 24 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON 25 | ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 26 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 27 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
28 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # shellflip 2 | [![crates.io](https://img.shields.io/crates/v/shellflip.svg)](https://crates.io/crates/shellflip) 3 | [![docs.rs](https://docs.rs/shellflip/badge.svg)](https://docs.rs/shellflip) 4 | 5 | Graceful process restarts in Rust. 6 | 7 | This crate facilitates upgrading or reconfiguring a service without disrupting existing connections. 8 | This is achieved by forking the process and communicating a small amount of state between the old 9 | and new processes; once the new process has started successfully the old process may terminate. 10 | 11 | This crate has the following goals: 12 | 13 | * No old code keeps running after a successful upgrade (and inevitable shutdown of the old process) 14 | * The new process has a grace period for performing initialisation 15 | * Crashing during initialisation is OK 16 | * At most one upgrade is ever in progress at a time 17 | * It is possible for the user/process initiating the upgrade to know if the upgrade succeeded 18 | 19 | Inspired by the [tableflip](https://github.com/cloudflare/tableflip) Go package but not a direct 20 | replacement. 21 | 22 | # Using the library 23 | 24 | A full example is given in the [restarter example service](examples/restarter.rs). 25 | 26 | The main struct of interest is `RestartConfig` which has methods for detecting or initiating 27 | restart. For shutting down a restarted process, the `ShutdownCoordinator` provides the means for 28 | both signalling a shutdown event to spawned tasks, and awaiting their completion. 29 | 30 | ## License 31 | 32 | BSD licensed. See the [LICENCE](LICENCE) file for details. 33 | 34 | 🦀ノ( º _ ºノ) - respect crables! 
35 | -------------------------------------------------------------------------------- /src/lifecycle.rs: -------------------------------------------------------------------------------- 1 | use super::ENV_HANDOVER_PIPE; 2 | use crate::pipes::FdStringExt; 3 | use async_trait::async_trait; 4 | use std::env; 5 | use std::io; 6 | use std::pin::Pin; 7 | use tokio::fs::File; 8 | use tokio::io::{AsyncRead, AsyncWrite}; 9 | 10 | pub type PipeReader = Pin>; 11 | pub type PipeWriter = Pin>; 12 | 13 | #[async_trait] 14 | pub trait LifecycleHandler: Send { 15 | /// Called after the child process has been spawned, allowing the current process to send state 16 | /// to the child process. The child process can receive this data by calling 17 | /// `receive_from_old_process`. 18 | async fn send_to_new_process(&mut self, _write_pipe: PipeWriter) -> io::Result<()> { 19 | Ok(()) 20 | } 21 | 22 | /// Called before the child process has been spawned. 23 | async fn pre_new_process(&mut self) {} 24 | 25 | /// Called after `send_to_new_process` if the child process fails to start successfully. 26 | /// This gives you an opportunity to undo any state changes made in `send_to_new_process`. 27 | async fn new_process_failed(&mut self) {} 28 | } 29 | 30 | /// A default implementation of LifecycleHandler that does nothing in response to lifecycle events. 31 | pub struct NullLifecycleHandler; 32 | 33 | impl LifecycleHandler for NullLifecycleHandler {} 34 | 35 | /// If this process has been spawned due to graceful restart, returns a `PipeReader` used to receive 36 | /// data from the parent process's implementation of `LifecycleHandler::send_to_new_process`. 37 | /// 38 | /// The behaviour of this function is undefined if the environment variables used by this crate to 39 | /// pass file descriptor numbers were set by something other than shellflip spawning a new instance 40 | /// of the calling process. 
41 | pub fn receive_from_old_process() -> Option<PipeReader> {
42 |     if let Ok(handover_fd) = env::var(ENV_HANDOVER_PIPE) {
43 |         unsafe { File::from_fd_string(&handover_fd) }
44 |             .ok()
45 |             .map(|x| Box::pin(x) as PipeReader)
46 |     } else {
47 |         None
48 |     }
49 | }
50 |
--------------------------------------------------------------------------------
/src/pipes.rs:
--------------------------------------------------------------------------------
1 | use libc::c_int;
2 | use std::fs::File;
3 | use std::io::{self, Read, Write};
4 | use std::os::fd::{AsRawFd, FromRawFd};
5 |
6 | pub(crate) enum PipeMode {
7 |     ParentWrites,
8 |     ChildWrites,
9 | }
10 |
11 | /// Create a pair of pipes.
12 | /// The first element is the read side, and the second element is the write side.
13 | /// `mode` determines whether the read or write end will be inherited by the child.
14 | pub(crate) fn create_paired_pipes(mode: PipeMode) -> io::Result<(File, File)> {
15 |     let mut fds: [c_int; 2] = [0; 2];
16 |     let res = unsafe { libc::pipe(fds.as_mut_ptr()) };
17 |     if res != 0 {
18 |         return Err(io::Error::last_os_error());
19 |     }
20 |
21 |     // SAFETY: pipe() succeeded, so both descriptors are open and owned by nobody else.
22 |     // Wrap them before anything fallible runs so that they are closed on drop, rather than
23 |     // leaked, if `set_cloexec` fails below.
24 |     let reader = unsafe { File::from_raw_fd(fds[0]) };
25 |     let writer = unsafe { File::from_raw_fd(fds[1]) };
26 |
27 |     // Mark the end the parent keeps as close-on-exec; the other end is left inheritable.
28 |     match mode {
29 |         PipeMode::ParentWrites => set_cloexec(writer.as_raw_fd())?,
30 |         PipeMode::ChildWrites => set_cloexec(reader.as_raw_fd())?,
31 |     };
32 |
33 |     Ok((reader, writer))
34 | }
35 |
36 | fn set_cloexec(fd: c_int) -> io::Result<()> {
37 |     let res = unsafe { libc::fcntl(fd, libc::F_SETFD, libc::FD_CLOEXEC) };
38 |     if res != 0 {
39 |         return Err(io::Error::last_os_error());
40 |     }
41 |     Ok(())
42 | }
43 |
44 | pub(crate) fn completion_pipes() -> io::Result<(CompletionReceiver, CompletionSender)> {
45 |     let (reader, writer) = create_paired_pipes(PipeMode::ChildWrites)?;
46 |
47 |     let reader = CompletionReceiver(reader);
48 |     let writer = CompletionSender(writer);
49 |
50 |     Ok((reader, writer))
51 | }
52 |
53 | pub(crate) struct CompletionReceiver(File);
54 |
55 | impl CompletionReceiver {
56 |     pub(crate) fn recv(&mut self) -> io::Result<()> {
57 |         let mut buf = [0u8; 1];
58 |
59 |         if self.0.read(&mut buf)? != 1 { // a zero-length read (EOF) means nothing was sent
60 |             Err(io::Error::new(
61 |                 io::ErrorKind::Other,
62 |                 "child failed to notify parent",
63 |             ))
64 |         } else {
65 |             Ok(())
66 |         }
67 |     }
68 | }
69 |
70 | pub(crate) struct CompletionSender(pub(crate) File);
71 |
72 | impl CompletionSender {
73 |     pub(crate) fn send(&mut self) -> io::Result<()> {
74 |         if self.0.write(b"1")? != 1 {
75 |             Err(io::Error::new(
76 |                 io::ErrorKind::Other,
77 |                 "failed to signal parent to close",
78 |             ))
79 |         } else {
80 |             Ok(())
81 |         }
82 |     }
83 | }
84 |
85 | pub(crate) trait FdStringExt {
86 |     fn fd_string(&self) -> String;
87 |     unsafe fn from_fd_string(fd_str: &str) -> io::Result<Self>
88 |     where
89 |         Self: Sized;
90 | }
91 |
92 | impl<T: FromRawFd + AsRawFd> FdStringExt for T {
93 |     unsafe fn from_fd_string(fd_str: &str) -> io::Result<Self> {
94 |         match fd_str.parse() {
95 |             Ok(fd) => Ok(Self::from_raw_fd(fd)),
96 |             Err(_) => Err(io::Error::new(
97 |                 io::ErrorKind::InvalidInput,
98 |                 "invalid notify socket fd",
99 |             )),
100 |         }
101 |     }
102 |
103 |     fn fd_string(&self) -> String {
104 |         self.as_raw_fd().to_string()
105 |     }
106 | }
107 |
--------------------------------------------------------------------------------
/src/restart_coordination_socket.rs:
--------------------------------------------------------------------------------
1 | //! Communication with a running process over a unix domain socket.
2 | use crate::RestartResult;
3 | use anyhow::{anyhow, Context};
4 | use futures::sink::SinkExt;
5 | use futures::stream::StreamExt;
6 | use serde::{Deserialize, Serialize};
7 | use tokio::net::UnixStream;
8 | use tokio_util::codec::length_delimited::LengthDelimitedCodec;
9 | use tokio_util::codec::{Decoder, Framed};
10 |
11 | /// Represents the restart coordination socket, used for communicating with a running oxy process.
12 | /// This is used to trigger a restart and receive notification of its completion or failure.
13 | pub struct RestartCoordinationSocket {
14 |     codec: Framed<UnixStream, LengthDelimitedCodec>,
15 | }
16 |
17 | impl RestartCoordinationSocket {
18 |     /// Create a new RestartCoordinationSocket wrapping a unix socket.
19 |     pub fn new(socket: UnixStream) -> Self {
20 |         RestartCoordinationSocket {
21 |             codec: LengthDelimitedCodec::new().framed(socket),
22 |         }
23 |     }
24 |
25 |     /// Sends a restart command through the socket. Returns Ok(child_pid) on success or an error
26 |     /// if the restart failed for any reason.
27 |     pub async fn send_restart_command(&mut self) -> RestartResult<u32> {
28 |         self.send_message(RestartMessage::Request(RestartRequest::TryRestart))
29 |             .await?;
30 |         match self.receive_message().await? {
31 |             RestartMessage::Response(RestartResponse::RestartComplete(pid)) => Ok(pid),
32 |             RestartMessage::Response(RestartResponse::RestartFailed(reason)) => {
33 |                 Err(anyhow!(reason))
34 |             }
35 |             _ => Err(anyhow!("unexpected message received")),
36 |         }
37 |     }
38 |
39 |     /// Send a message over the socket as one length-delimited JSON frame.
40 |     pub async fn send_message(&mut self, msg: RestartMessage) -> RestartResult<()> {
41 |         self.codec
42 |             .send(serde_json::to_string(&msg)?.into()) // report encoding errors, don't panic
43 |             .await?;
44 |
45 |         Ok(())
46 |     }
47 |
48 |     /// Receive a message from the socket. Fails if the peer closes the connection first.
49 |     pub async fn receive_message(&mut self) -> RestartResult<RestartMessage> {
50 |         let message = self
51 |             .codec
52 |             .next()
53 |             .await
54 |             .context("connection closed while awaiting a message")??;
55 |
56 |         Ok(serde_json::from_slice(&message)?)
57 |     }
58 | }
59 |
60 | /// Represents any message that may be sent over the socket.
61 | #[derive(Debug, Serialize, Deserialize)]
62 | pub enum RestartMessage {
63 |     Request(RestartRequest),
64 |     Response(RestartResponse),
65 | }
66 |
67 | /// A request message that expects a response.
68 | #[derive(Debug, Serialize, Deserialize)] 69 | pub enum RestartRequest { 70 | TryRestart, 71 | } 72 | 73 | /// A response to a request message. 74 | #[derive(Debug, Serialize, Deserialize)] 75 | pub enum RestartResponse { 76 | // Restart completed. The child PID is provided. 77 | RestartComplete(u32), 78 | // Restart failed. The error message is attached. 79 | RestartFailed(String), 80 | } 81 | 82 | #[cfg(test)] 83 | mod tests { 84 | use super::*; 85 | 86 | #[tokio::test] 87 | async fn test_restart_complete() { 88 | let (client, server) = UnixStream::pair().unwrap(); 89 | let mut client = RestartCoordinationSocket::new(client); 90 | let mut server = RestartCoordinationSocket::new(server); 91 | let child_pid = 42; 92 | 93 | tokio::spawn(async move { 94 | let message = server.receive_message().await.unwrap(); 95 | assert!(matches!( 96 | message, 97 | RestartMessage::Request(RestartRequest::TryRestart) 98 | )); 99 | let response = RestartMessage::Response(RestartResponse::RestartComplete(child_pid)); 100 | server.send_message(response).await.unwrap(); 101 | }); 102 | 103 | assert_eq!(client.send_restart_command().await.unwrap(), child_pid); 104 | } 105 | 106 | #[tokio::test] 107 | async fn test_restart_failed() { 108 | let (client, server) = UnixStream::pair().unwrap(); 109 | let mut client = RestartCoordinationSocket::new(client); 110 | let mut server = RestartCoordinationSocket::new(server); 111 | let error_message = "huge success"; 112 | 113 | tokio::spawn(async move { 114 | let message = server.receive_message().await.unwrap(); 115 | assert!(matches!( 116 | message, 117 | RestartMessage::Request(RestartRequest::TryRestart) 118 | )); 119 | let response = 120 | RestartMessage::Response(RestartResponse::RestartFailed(error_message.into())); 121 | server.send_message(response).await.unwrap(); 122 | }); 123 | 124 | let r = client.send_restart_command().await; 125 | assert_eq!(r.err().map(|e| e.to_string()), Some(error_message.into())); 126 | } 127 | } 128 | 
-------------------------------------------------------------------------------- /src/shutdown.rs: -------------------------------------------------------------------------------- 1 | use std::default::Default; 2 | use std::fmt; 3 | use std::sync::{Arc, Weak}; 4 | use tokio::sync::{mpsc, watch}; 5 | 6 | /// A handle held by an active task, which can be used to receive 7 | /// shutdown notifications and to delay program termination until the 8 | /// task has completed. 9 | pub struct ShutdownHandle { 10 | /// Receives cancellation notifications. 11 | cancellation_rx: ShutdownSignal, 12 | /// Signals connection completion (when dropped) 13 | _shutdown_tx: mpsc::Sender<()>, 14 | } 15 | 16 | impl Default for ShutdownHandle { 17 | fn default() -> Self { 18 | let (_shutdown_tx, _) = mpsc::channel(1); 19 | ShutdownHandle { 20 | cancellation_rx: ShutdownSignal::default(), 21 | _shutdown_tx, 22 | } 23 | } 24 | } 25 | 26 | impl fmt::Debug for ShutdownHandle { 27 | fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { 28 | f.debug_struct("ShutdownHandle").finish() 29 | } 30 | } 31 | 32 | /// Receives a shutdown signal, which can be awaited on once using the `on_shutdown` method. 33 | #[derive(Clone)] 34 | pub enum ShutdownSignal { 35 | WaitingForSignal(watch::Receiver), 36 | Signalled, 37 | } 38 | 39 | impl ShutdownSignal { 40 | /// Wait for a shutdown signal to happen once. 41 | /// This function can be called multiple times but it will only resolve once, 42 | /// so it can be used in a `select!` block to receive the shutdown signal once. 43 | pub async fn on_shutdown(&mut self) { 44 | match self { 45 | ShutdownSignal::WaitingForSignal(r) => { 46 | let _ = r.changed().await; 47 | *self = ShutdownSignal::Signalled; 48 | } 49 | ShutdownSignal::Signalled => { 50 | futures::future::pending::<()>().await; 51 | } 52 | } 53 | } 54 | } 55 | 56 | /// The default implementation of ShutdownSignal will never be signalled. 
57 | impl Default for ShutdownSignal { 58 | fn default() -> Self { 59 | ShutdownSignal::Signalled 60 | } 61 | } 62 | 63 | impl From<&ShutdownHandle> for ShutdownSignal { 64 | fn from(handle: &ShutdownHandle) -> Self { 65 | handle.cancellation_rx.clone() 66 | } 67 | } 68 | 69 | /// Coordinates the shutdown process for a group of tasks. 70 | /// This allows the tasks to get notified when a shutdown is requested, 71 | /// and allows the main thread to defer termination until all of the tasks have successfully 72 | /// completed. 73 | pub struct ShutdownCoordinator { 74 | /// Holds onto the ShutdownHandle until shutdown starts. 75 | shutdown_handle: Arc, 76 | /// Used to notify tasks to start shutdown 77 | cancellation_tx: watch::Sender, 78 | /// Used to wait for all connections to shutdown successfully. 79 | shutdown_rx: mpsc::Receiver<()>, 80 | } 81 | 82 | impl ShutdownCoordinator { 83 | /// Get a cancellation channel that, when dropped, can be used to close all proxy endpoints created from this object. 84 | pub fn new() -> Self { 85 | let (cancellation_tx, cancellation_rx) = watch::channel(false); 86 | let (shutdown_tx, shutdown_rx) = mpsc::channel(1); 87 | let shutdown_handle = Arc::new(ShutdownHandle { 88 | cancellation_rx: ShutdownSignal::WaitingForSignal(cancellation_rx), 89 | _shutdown_tx: shutdown_tx, 90 | }); 91 | ShutdownCoordinator { 92 | shutdown_handle, 93 | cancellation_tx, 94 | shutdown_rx, 95 | } 96 | } 97 | 98 | /// Get a ShutdownHandle to be held by a task that needs to be waited on during shutdown. 99 | pub fn handle(&self) -> Arc { 100 | Arc::clone(&self.shutdown_handle) 101 | } 102 | 103 | /// Get a ShutdownHandle that can be held by a task that does not need to be waited on, but may 104 | /// spawn tasks that should be waited on. If the task can upgrade the handle with Arc::upgrade, 105 | /// then shutdown has not yet started. 
106 | pub fn handle_weak(&self) -> Weak { 107 | Arc::downgrade(&self.shutdown_handle) 108 | } 109 | 110 | /// Initiate shutdown and wait for its successful completion. 111 | /// To prevent new connections from being accepted, drop any listening tasks first. 112 | pub async fn shutdown(mut self) { 113 | let _ = self.cancellation_tx.send(true); 114 | drop(self.shutdown_handle); 115 | let _ = self.shutdown_rx.recv().await; 116 | } 117 | 118 | /// Shutdown, waiting a maximum amount of time before returning. 119 | pub async fn shutdown_with_timeout(self, timeout: u64) { 120 | let _ = 121 | tokio::time::timeout(tokio::time::Duration::from_secs(timeout), self.shutdown()).await; 122 | } 123 | } 124 | 125 | impl Default for ShutdownCoordinator { 126 | fn default() -> Self { 127 | Self::new() 128 | } 129 | } 130 | 131 | #[cfg(test)] 132 | mod tests { 133 | use super::*; 134 | use futures::{pin_mut, FutureExt}; 135 | 136 | #[tokio::test] 137 | async fn test_shutdown_coordinator() { 138 | // Shutdown with no active tasks should happen immediately 139 | assert!(ShutdownCoordinator::new() 140 | .shutdown() 141 | .now_or_never() 142 | .is_some()); 143 | 144 | // Shutdown is delayed with an active task 145 | let sc = ShutdownCoordinator::new(); 146 | let handle = sc.handle(); 147 | let shutdown_fut = sc.shutdown(); 148 | pin_mut!(shutdown_fut); 149 | assert!(shutdown_fut.as_mut().now_or_never().is_none()); 150 | drop(handle); 151 | assert!(shutdown_fut.now_or_never().is_some()); 152 | } 153 | 154 | #[tokio::test] 155 | async fn test_default_shutdown_handle() { 156 | let handle = ShutdownHandle::default(); 157 | let mut signal = ShutdownSignal::from(&handle); 158 | assert!(signal.on_shutdown().now_or_never().is_none()); 159 | } 160 | } 161 | -------------------------------------------------------------------------------- /examples/restarter.rs: -------------------------------------------------------------------------------- 1 | //! Sample restarter application. 2 | //! 
This implements a TCP server that accepts connections, 3 | //! outputs a short line describing the running process, 4 | //! then echoes back anything sent to it by the client. 5 | //! 6 | //! While the application is running, another instance can be invoked with the 7 | //! `restart` command which will trigger a restart. Existing connections will be maintained and the 8 | //! old process will terminate as soon as all clients disconnect. The new process will listen on 9 | //! another socket (as this library does not provide for socket inheritance or rebinding). 10 | use anyhow::Error; 11 | use async_trait::async_trait; 12 | use clap::{Parser, Subcommand}; 13 | use shellflip::lifecycle::*; 14 | use shellflip::{RestartConfig, ShutdownCoordinator, ShutdownHandle, ShutdownSignal}; 15 | use std::sync::Arc; 16 | use tokio::io::{AsyncReadExt, AsyncWriteExt}; 17 | use tokio::net::{TcpListener, TcpStream}; 18 | use tokio::{pin, select}; 19 | 20 | /// Simple program to test graceful shutdown and restart 21 | #[derive(Parser)] 22 | #[command(author, version, about, long_about = None)] 23 | struct Args { 24 | #[command(subcommand)] 25 | command: Option, 26 | /// Restart coordination socket path 27 | #[arg(short, long, default_value = "/tmp/restarter.sock")] 28 | socket: String, 29 | } 30 | 31 | #[derive(Subcommand)] 32 | enum Commands { 33 | /// Trigger restart 34 | Restart, 35 | } 36 | 37 | struct AppData { 38 | restart_generation: u32, 39 | } 40 | 41 | #[async_trait] 42 | impl LifecycleHandler for AppData { 43 | async fn send_to_new_process(&mut self, mut write_pipe: PipeWriter) -> std::io::Result<()> { 44 | if self.restart_generation > 4 { 45 | log::info!("Four restarts is more than anybody needs, surely?"); 46 | return Err(std::io::Error::new( 47 | std::io::ErrorKind::Other, 48 | "The operation completed successfully", 49 | )); 50 | } 51 | write_pipe.write_u32(self.restart_generation).await?; 52 | Ok(()) 53 | } 54 | } 55 | 56 | #[tokio::main] 57 | async fn main() -> 
Result<(), Error> { 58 | env_logger::init(); 59 | let args = Args::parse(); 60 | let mut app_data = AppData { 61 | restart_generation: 0, 62 | }; 63 | 64 | if let Some(mut handover_pipe) = receive_from_old_process() { 65 | app_data.restart_generation = handover_pipe.read_u32().await? + 1; 66 | } 67 | 68 | let restart_generation = app_data.restart_generation; 69 | 70 | // Configure the essential requirements for implementing graceful restart. 71 | let restart_conf = RestartConfig { 72 | enabled: true, 73 | coordination_socket_path: args.socket.into(), 74 | lifecycle_handler: Box::new(app_data), 75 | ..Default::default() 76 | }; 77 | 78 | match args.command { 79 | // Restart an already-running process 80 | Some(Commands::Restart) => { 81 | let res = restart_conf.request_restart().await; 82 | match res { 83 | Ok(id) => { 84 | log::info!("Restart succeeded, child pid is {}", id); 85 | return Ok(()); 86 | } 87 | Err(e) => { 88 | log::error!("Restart failed: {}", e); 89 | return Err(e); 90 | } 91 | } 92 | } 93 | // Standard operating mode 94 | None => {} 95 | } 96 | 97 | // Start the restart thread and get a task that will complete when a restart completes. 98 | let restart_task = restart_conf.try_into_restart_task()?; 99 | // (need to pin this because of the loop below!) 100 | pin!(restart_task); 101 | // Create a shutdown coordinator so that we can wait for all client connections to complete. 102 | let shutdown_coordinator = ShutdownCoordinator::new(); 103 | // Bind a TCP listener socket to give us something to do 104 | let listener = TcpListener::bind("127.0.0.1:0").await.unwrap(); 105 | println!( 106 | "Instance no. {} listening on {}", 107 | restart_generation, 108 | listener.local_addr().unwrap() 109 | ); 110 | 111 | loop { 112 | select! { 113 | res = listener.accept() => { 114 | match res { 115 | Ok((sock, addr)) => { 116 | log::info!("Received connection from {}", addr); 117 | // Spawn a new task to handle the client connection. 
118 | // Give it a shutdown handle so we can await its completion. 119 | tokio::spawn(echo(sock, shutdown_coordinator.handle())); 120 | } 121 | Err(e) => { 122 | log::warn!("Accept error: {}", e); 123 | } 124 | } 125 | } 126 | res = &mut restart_task => { 127 | match res { 128 | Ok(_) => { 129 | log::info!("Restart successful, waiting for tasks to complete"); 130 | } 131 | Err(e) => { 132 | log::error!("Restart task failed: {}", e); 133 | } 134 | } 135 | // Wait for all clients to complete. 136 | shutdown_coordinator.shutdown().await; 137 | log::info!("Exiting..."); 138 | return Ok(()); 139 | } 140 | } 141 | } 142 | } 143 | 144 | async fn echo(mut sock: TcpStream, shutdown_handle: Arc) { 145 | // Get notification that shutdown has been requested. 146 | // Note that we still keep the shutdown_handle active during the lifetime of this task. 147 | let mut shutdown_signal = ShutdownSignal::from(&*shutdown_handle); 148 | let mut buf = [0u8; 1024]; 149 | let out = format!("Hello, this is process {}\n", std::process::id()); 150 | let _ = sock.write_all(out.as_bytes()).await; 151 | 152 | loop { 153 | select! { 154 | r = sock.read(&mut buf) => { 155 | match r { 156 | Ok(0) => return, 157 | Ok(n) => { 158 | if let Err(e) = sock.write_all(&buf[..n]).await { 159 | log::error!("write failed: {}", e); 160 | return; 161 | } 162 | } 163 | Err(e) => { 164 | log::error!("read failed: {}", e); 165 | return; 166 | } 167 | } 168 | } 169 | _ = shutdown_signal.on_shutdown() => { 170 | log::info!("shutdown requested but client {} is still active", sock.peer_addr().unwrap()); 171 | } 172 | } 173 | } 174 | } 175 | -------------------------------------------------------------------------------- /src/lib.rs: -------------------------------------------------------------------------------- 1 | //! Graceful restart management inspired by tableflip, but more basic. 2 | //! 3 | //! To implement restarts, the simplest thing to do is to generate a `RestartConfig` from 4 | //! 
command-line values or hardcoded defaults, then call `RestartConfig::try_into_restart_task`. If 5 | //! you implement a restart command using unix sockets for interactive error reporting, call 6 | //! `RestartConfig::request_restart` and return the Result in your main() function. 7 | //! 8 | //! The process is automatically placed into the ready state the first time the restart task is 9 | //! polled. This should be put into a select statement with other futures your app may await on. 10 | //! The restart task will resolve with `Ok(())` if a restart signal was sent and the new process 11 | //! spawned successfully. If the task is unable to handle future restart signals for any reason, 12 | //! it will resolve to an `Err`. 13 | //! 14 | //! The process can also be restarted by sending it SIGUSR1. After any kind of restart request, the 15 | //! old process will terminate if the new process starts up successfully, otherwise it will 16 | //! continue if possible. 17 | //! 18 | //! For coordinating graceful shutdown of the old process, see `ShutdownCoordinator` in the 19 | //! `shutdown` module. 20 | //! 21 | //! # Restart thread 22 | //! 23 | //! Process restarts are handled by a dedicated thread which is spawned when calling either 24 | //! `RestartConfig::try_into_restart_task` or `spawn_restart_task`. If you are dropping privileges, 25 | //! capabilities or using seccomp policies to limit the syscalls that can execute, it is a good 26 | //! idea to call the aforementioned functions before locking down the main & future child threads. 27 | //! You likely don't want the restart thread to have the same restrictions and limitations that may 28 | //! otherwise prevent you from calling execve() or doing certain I/O operations. 29 | //! 30 | //! # Transferring state to the new process 31 | //! 32 | //! It is possible for the old process to serialise state and send it to the new process to 33 | //! continue processing. 
Your code must set an implementation of `LifecycleHandler` on the 34 | //! `RestartConfig`. After a new process is spawned, shellflip will call 35 | //! `LifecycleHandler::send_to_new_process` which gives you a unidirectional pipe to write data to 36 | //! the new process, which receives this data by calling the `receive_from_old_process` function. 37 | //! 38 | //! The data should be received and validated by the new process before it signals readiness by 39 | //! polling the restart task, in case the data is unusable e.g. if the data format changed slightly 40 | //! between versions causing serialisation to fail. If the new process fails to signal readiness, 41 | //! `LifecycleHandler::new_process_failed` is called and you can undo any changes you made in 42 | //! preparation for handover. If the new process succeeds, however, the restart task will resolve 43 | //! and you may terminate the process as usual. 44 | pub mod lifecycle; 45 | mod pipes; 46 | pub mod restart_coordination_socket; 47 | pub mod shutdown; 48 | 49 | pub use shutdown::{ShutdownCoordinator, ShutdownHandle, ShutdownSignal}; 50 | 51 | use crate::lifecycle::LifecycleHandler; 52 | use crate::pipes::{ 53 | completion_pipes, create_paired_pipes, CompletionReceiver, CompletionSender, FdStringExt, 54 | PipeMode, 55 | }; 56 | use crate::restart_coordination_socket::{ 57 | RestartCoordinationSocket, RestartMessage, RestartRequest, RestartResponse, 58 | }; 59 | use anyhow::anyhow; 60 | use futures::stream::{Stream, StreamExt}; 61 | use std::env; 62 | use std::ffi::OsString; 63 | use std::fs::{remove_file, File as StdFile}; 64 | use std::future::Future; 65 | use std::io; 66 | use std::os::fd::{AsFd, AsRawFd, BorrowedFd, OwnedFd, RawFd}; 67 | use std::os::unix::net::UnixListener as StdUnixListener; 68 | use std::os::unix::process::CommandExt; 69 | use std::path::{Path, PathBuf}; 70 | use std::process; 71 | use std::thread; 72 | use thiserror::Error; 73 | use tokio::fs::File; 74 | use 
tokio::net::{UnixListener, UnixStream};
use tokio::select;
use tokio::signal::unix::{signal, Signal, SignalKind};
use tokio::sync::mpsc::{channel, Receiver, Sender};
use tokio_stream::wrappers::UnixListenerStream;

/// Result type used by the restart APIs; failures are reported as [`anyhow::Error`].
pub type RestartResult<T> = anyhow::Result<T>;

/// Environment variable holding the fd number of the startup notification pipe.
const ENV_NOTIFY_SOCKET: &str = "OXY_NOTIFY_SOCKET";
/// Environment variable holding the fd number of the inherited restart coordination socket.
const ENV_RESTART_SOCKET: &str = "OXY_RESTART_SOCKET";
/// Environment variable holding the fd number of the state handover pipe.
const ENV_HANDOVER_PIPE: &str = "OXY_HANDOVER_PIPE";
/// Name of the `LISTEN_PID` variable that is rewritten for the child process.
const ENV_SYSTEMD_PID: &str = "LISTEN_PID";
/// Placeholder `LISTEN_PID` value which the child replaces with its own pid.
const REBIND_SYSTEMD_PID: &str = "auto";

/// Settings for graceful restarts
pub struct RestartConfig {
    /// Enables the restart coordination socket for graceful restarts as an alternative to a Unix signal.
    pub enabled: bool,
    /// Socket path
    pub coordination_socket_path: PathBuf,
    /// Sets environment variables on the newly-started process
    pub environment: Vec<(OsString, OsString)>,
    /// Receive fine-grained events on the lifecycle of the new process and support data transfer.
    pub lifecycle_handler: Box<dyn LifecycleHandler>,
    /// Exits early when the child process fails to start
    pub exit_on_error: bool,
    /// Sets the signal to listen to on restart. This defaults to SIGUSR1.
    pub restart_signal: SignalKind,
}

impl RestartConfig {
    /// Prepare the current process to handle restarts, if enabled.
    ///
    /// Rewrites the systemd pid variable via [`fixup_systemd_env`] and then delegates to
    /// [`spawn_restart_task`], returning whatever that function returns.
    pub fn try_into_restart_task(
        self,
    ) -> io::Result<impl Future<Output = RestartResult<process::Child>> + Send> {
        fixup_systemd_env();
        spawn_restart_task(self)
    }

    /// Request an already-running service to restart.
    ///
    /// # Errors
    ///
    /// Fails if the coordination socket is not enabled in this config, if connecting to
    /// `coordination_socket_path` fails, or if the restart command itself reports an error.
    pub async fn request_restart(self) -> RestartResult<u32> {
        if !self.enabled {
            return Err(anyhow!(
                "no restart coordination socket defined in config"
            ));
        }

        let socket = UnixStream::connect(self.coordination_socket_path).await?;
        RestartCoordinationSocket::new(socket)
            .send_restart_command()
            .await
    }

    /// Request an already-running service to restart.
    /// Does not require the tokio runtime to be started yet.
    ///
    /// # Errors
    ///
    /// In addition to the failures of [`RestartConfig::request_restart`], returns an error
    /// (rather than panicking) if a temporary tokio runtime cannot be created.
    pub fn request_restart_sync(self) -> RestartResult<u32> {
        let runtime = tokio::runtime::Runtime::new()
            .map_err(|e| anyhow!("failed to start tokio runtime: {}", e))?;
        runtime.block_on(self.request_restart())
    }
}

impl Default for RestartConfig {
    /// Coordination socket disabled, no extra environment, a no-op lifecycle handler,
    /// exit on a failed restart, and SIGUSR1 as the restart signal.
    fn default() -> Self {
        RestartConfig {
            enabled: false,
            coordination_socket_path: Default::default(),
            environment: vec![],
            lifecycle_handler: Box::new(lifecycle::NullLifecycleHandler),
            exit_on_error: true,
            restart_signal: SignalKind::user_defined1(),
        }
    }
}

/// When the proxy restarts itself, it sets the child's LISTEN_PID env to a
/// special value so that the child can replace it with the real child PID.
/// Doing this is easier than reimplementing rust's process spawn code just so
/// we can call execvpe to replace the environment in the forked process.
///
/// This is usually called by `RestartConfig::try_into_restart_task` but this function is available
/// if it needs to be done at an earlier or more convenient time, such as the top of `fn main()`.
pub fn fixup_systemd_env() {
    // Only rewrite the variable when it still carries the placeholder written by the parent.
    #[cfg(target_os = "linux")]
    if matches!(
        env::var(ENV_SYSTEMD_PID).as_deref(),
        Ok(REBIND_SYSTEMD_PID)
    ) {
        env::set_var(ENV_SYSTEMD_PID, process::id().to_string());
    }
}

/// Notify systemd and the parent process (if any) that the proxy has started successfully.
164 | /// Returns an error if there was a parent process and we failed to notify it. 165 | /// 166 | /// This is usually called by the restart task returned from `RestartConfig::try_into_restart_task` 167 | /// but this function is available if indicating readiness needs to happen sooner or at a more 168 | /// convenient time then first polling the restart task. 169 | /// 170 | /// The behaviour of this function is undefined if the environment variables used by this crate to 171 | /// pass file descriptor numbers were set by something other than shellflip spawning a new instance 172 | /// of the calling process. 173 | pub fn startup_complete() -> io::Result<()> { 174 | if let Ok(notify_fd) = env::var(ENV_NOTIFY_SOCKET) { 175 | pipes::CompletionSender(unsafe { std::fs::File::from_fd_string(¬ify_fd)? }).send()?; 176 | } 177 | // Avoid sending twice on the notification pipe, if this is manually called outside 178 | // of the restart task. 179 | env::remove_var(ENV_NOTIFY_SOCKET); 180 | 181 | let _ = sd_notify::notify(false, &[sd_notify::NotifyState::Ready]); 182 | Ok(()) 183 | } 184 | 185 | /// Returns the restart completion or error message through the restart coordination socket, if used. 186 | struct RestartResponder { 187 | rpc: Option, 188 | } 189 | 190 | impl RestartResponder { 191 | /// Send success or failure to the restart coordination socket client. 192 | async fn respond(self, result: Result) { 193 | let response = match result { 194 | Ok(pid) => RestartResponse::RestartComplete(pid), 195 | Err(e) => RestartResponse::RestartFailed(e), 196 | }; 197 | if let Some(mut rpc) = self.rpc { 198 | if let Err(e) = rpc.send_message(RestartMessage::Response(response)).await { 199 | log::warn!("Failed to respond to restart coordinator: {}", e); 200 | } 201 | } 202 | } 203 | } 204 | 205 | /// Spawns a thread that can be used to restart the process. 206 | /// Returns a future that resolves when a restart succeeds, or if restart 207 | /// becomes impossible. 
208 | /// The child spawner thread needs to be created before seccomp locks down fork/exec. 209 | pub fn spawn_restart_task( 210 | settings: RestartConfig, 211 | ) -> io::Result> + Send> { 212 | let socket = match settings.enabled { 213 | true => Some(settings.coordination_socket_path.as_ref()), 214 | false => None, 215 | }; 216 | 217 | let mut signal_stream = signal(settings.restart_signal)?; 218 | let (restart_fd, mut socket_stream) = new_restart_coordination_socket_stream(socket)?; 219 | let mut child_spawner = 220 | ChildSpawner::new(restart_fd, settings.environment, settings.lifecycle_handler); 221 | 222 | Ok(async move { 223 | startup_complete()?; 224 | loop { 225 | let responder = next_restart_request(&mut signal_stream, &mut socket_stream).await?; 226 | 227 | log::debug!("Spawning new process"); 228 | let res = child_spawner.spawn_new_process().await; 229 | 230 | responder 231 | .respond(res.as_ref().map(|p| p.id()).map_err(|e| e.to_string())) 232 | .await; 233 | 234 | match res { 235 | Ok(child) => { 236 | log::debug!("New process spawned with pid {}", child.id()); 237 | 238 | if let Err(e) = 239 | sd_notify::notify(true, &[sd_notify::NotifyState::MainPid(child.id())]) 240 | { 241 | log::error!("Failed to notify systemd: {}", e); 242 | } 243 | 244 | return Ok(child); 245 | } 246 | Err(ChildSpawnError::ChildError(e)) => { 247 | if settings.exit_on_error { 248 | return Err(anyhow!("Restart failed: {}", e)); 249 | } else { 250 | log::error!("Restart failed: {}", e); 251 | } 252 | } 253 | Err(ChildSpawnError::RestartThreadGone) => { 254 | res?; 255 | } 256 | } 257 | } 258 | }) 259 | } 260 | 261 | /// Handles forking a new client in a more privileged thread. 262 | struct ChildSpawner { 263 | signal_sender: Sender<()>, 264 | pid_receiver: Receiver>, 265 | } 266 | 267 | impl ChildSpawner { 268 | /// Create a ChildSpawner that will pass restart_fd to child processes. 
269 | fn new( 270 | restart_fd: Option, 271 | environment: Vec<(OsString, OsString)>, 272 | mut lifecycle_handler: Box, 273 | ) -> Self { 274 | let (signal_sender, mut signal_receiver) = channel(1); 275 | let (pid_sender, pid_receiver) = channel(1); 276 | 277 | thread::spawn(move || { 278 | let restart_fd = restart_fd.as_ref().map(OwnedFd::as_fd); 279 | 280 | while let Some(()) = signal_receiver.blocking_recv() { 281 | let child = tokio::runtime::Runtime::new() 282 | .unwrap() 283 | .block_on(spawn_child( 284 | restart_fd, 285 | &environment, 286 | &mut *lifecycle_handler, 287 | )); 288 | 289 | pid_sender 290 | .blocking_send(child) 291 | .expect("parent needs to receive the child"); 292 | } 293 | }); 294 | 295 | ChildSpawner { 296 | signal_sender, 297 | pid_receiver, 298 | } 299 | } 300 | 301 | /// Spawn a process via IPC to the privileged thread. 302 | /// Returns the child pid on success. 303 | async fn spawn_new_process(&mut self) -> Result { 304 | self.signal_sender 305 | .send(()) 306 | .await 307 | .map_err(|_| ChildSpawnError::RestartThreadGone)?; 308 | match self.pid_receiver.recv().await { 309 | Some(Ok(child)) => Ok(child), 310 | Some(Err(e)) => Err(ChildSpawnError::ChildError(e)), 311 | None => Err(ChildSpawnError::RestartThreadGone), 312 | } 313 | } 314 | } 315 | 316 | /// Indicates an error that happened during child forking. 317 | #[derive(Error, Debug)] 318 | pub enum ChildSpawnError { 319 | #[error("Restart thread exited")] 320 | RestartThreadGone, 321 | #[error("Child failed to start: {0}")] 322 | ChildError(io::Error), 323 | } 324 | 325 | /// Await the next request to gracefully restart the process. 326 | /// Returns a RestartResponder used to receive the outcome of the restart attempt. 327 | async fn next_restart_request( 328 | signal_stream: &mut Signal, 329 | mut socket_stream: impl Stream + Unpin, 330 | ) -> RestartResult { 331 | select! 
{ 332 | _ = signal_stream.recv() => Ok(RestartResponder{ rpc: None }), 333 | r = socket_stream.next() => match r { 334 | Some(r) => Ok(r), 335 | None => { 336 | // Technically we can still support signal restart! However if you have the restart coordination 337 | // socket enabled you probably don't want to use signals, and need to recover the process such 338 | // that you can use the restart coordinator socket again. 339 | Err(anyhow!("Restart coordinator socket acceptor terminated")) 340 | } 341 | } 342 | } 343 | } 344 | 345 | fn new_restart_coordination_socket_stream( 346 | restart_coordination_socket: Option<&Path>, 347 | ) -> io::Result<(Option, impl Stream)> { 348 | if let Some(path) = restart_coordination_socket { 349 | let listener = bind_restart_coordination_socket(path)?; 350 | listener.set_nonblocking(true)?; 351 | let inherit_socket = OwnedFd::from(listener.try_clone()?); 352 | let listener = UnixListener::from_std(listener)?; 353 | let st = listen_for_restart_events(listener); 354 | Ok((Some(inherit_socket), st.boxed())) 355 | } else { 356 | Ok((None, futures::stream::pending().boxed())) 357 | } 358 | } 359 | 360 | fn bind_restart_coordination_socket(path: &Path) -> io::Result { 361 | match env::var(ENV_RESTART_SOCKET) { 362 | Err(_) => { 363 | // This may fail but binding will succeed despite that. If binding fails, 364 | // that's the error we really care about. 
365 | let _ = remove_file(path); 366 | StdUnixListener::bind(path) 367 | } 368 | Ok(maybe_sock_fd) => unsafe { StdUnixListener::from_fd_string(&maybe_sock_fd) }, 369 | } 370 | } 371 | 372 | fn listen_for_restart_events( 373 | restart_coordination_socket: UnixListener, 374 | ) -> impl Stream { 375 | UnixListenerStream::new(restart_coordination_socket).filter_map(move |r| async move { 376 | let sock = match r { 377 | Ok(sock) => sock, 378 | Err(e) => { 379 | log::error!("Restart coordination socket accept error: {}", e); 380 | return None; 381 | } 382 | }; 383 | 384 | let mut rpc = RestartCoordinationSocket::new(sock); 385 | match rpc.receive_message().await { 386 | Ok(RestartMessage::Request(RestartRequest::TryRestart)) => { 387 | Some(RestartResponder { rpc: Some(rpc) }) 388 | } 389 | Ok(m) => { 390 | log::warn!( 391 | "Restart coordination socket received unexpected message: {:?}", 392 | m 393 | ); 394 | None 395 | } 396 | Err(e) => { 397 | log::warn!("Restart coordination socket connection error: {}", e); 398 | None 399 | } 400 | } 401 | }) 402 | } 403 | 404 | /// Clears the FD_CLOEXEC flag on a fd so it can be inherited by a child process. 405 | fn clear_cloexec(fd: RawFd) -> nix::Result<()> { 406 | use nix::fcntl::*; 407 | let mut current_flags = FdFlag::from_bits_truncate(fcntl(fd, FcntlArg::F_GETFD)?); 408 | current_flags.remove(FdFlag::FD_CLOEXEC); 409 | fcntl(fd, FcntlArg::F_SETFD(current_flags))?; 410 | Ok(()) 411 | } 412 | 413 | /// Attempt to start a new instance of this proxy. 
414 | async fn spawn_child( 415 | restart_fd: Option>, 416 | user_envs: &[(OsString, OsString)], 417 | lifecycle_handler: &mut dyn LifecycleHandler, 418 | ) -> io::Result { 419 | lifecycle_handler.pre_new_process().await; 420 | 421 | let mut args = env::args(); 422 | let process_name = args.next().unwrap(); 423 | 424 | // Create a pipe for the child to notify us on successful startup 425 | let (notif_r, notif_w) = completion_pipes()?; 426 | 427 | // And another pair of pipes to hand over data to the child process. 428 | let (handover_r, handover_w) = create_paired_pipes(PipeMode::ParentWrites)?; 429 | 430 | let mut cmd = process::Command::new(process_name); 431 | cmd.args(args) 432 | .envs(user_envs.iter().map(|(k, v)| (k, v))) 433 | .env(ENV_SYSTEMD_PID, REBIND_SYSTEMD_PID) 434 | .env(ENV_HANDOVER_PIPE, handover_r.fd_string()) 435 | .env(ENV_NOTIFY_SOCKET, notif_w.0.fd_string()); 436 | 437 | if let Some(fd) = restart_fd { 438 | // Let the child inherit the restart coordination socket 439 | let fd = fd.as_raw_fd(); 440 | unsafe { 441 | cmd.env(ENV_RESTART_SOCKET, fd.to_string()) 442 | .pre_exec(move || { 443 | clear_cloexec(fd)?; 444 | Ok(()) 445 | }); 446 | } 447 | } 448 | let mut child = cmd.spawn()?; 449 | 450 | if let Err(e) = send_parent_state(lifecycle_handler, notif_r, notif_w, handover_w).await { 451 | if child.kill().is_err() { 452 | log::error!("Child process has already exited. 
Failed to send parent state: {e:?}"); 453 | } else { 454 | log::error!("Killed child process because failed to send parent state: {e:?}"); 455 | } 456 | return Err(e); 457 | } 458 | 459 | Ok(child) 460 | } 461 | 462 | async fn send_parent_state( 463 | lifecycle_handler: &mut dyn LifecycleHandler, 464 | mut notif_r: CompletionReceiver, 465 | notif_w: CompletionSender, 466 | handover_w: StdFile, 467 | ) -> io::Result<()> { 468 | lifecycle_handler 469 | .send_to_new_process(Box::pin(File::from(handover_w))) 470 | .await?; 471 | 472 | // only the child needs the write end 473 | drop(notif_w); 474 | match notif_r.recv() { 475 | Ok(_) => Ok(()), 476 | Err(e) => { 477 | lifecycle_handler.new_process_failed().await; 478 | Err(e) 479 | } 480 | } 481 | } 482 | --------------------------------------------------------------------------------