├── .dockerignore ├── .gitignore ├── tools ├── build.sh ├── bake_debian.sh └── selfpack.sh ├── src ├── console.rs ├── firecracker.rs ├── embed.rs ├── ssh_launcher.rs ├── fileshare.rs ├── util.rs ├── raw_udp.rs ├── wireguard.rs ├── vm_console.rs ├── socks5.rs ├── vminit.rs └── main.rs ├── Cargo.toml ├── LICENSE ├── .github └── workflows │ └── ci.yml ├── Dockerfile └── README.md /.dockerignore: -------------------------------------------------------------------------------- 1 | target/ 2 | *.bin 3 | *.elf 4 | output/ 5 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | target/ 2 | *.bin 3 | *.elf 4 | output/ 5 | *.note 6 | -------------------------------------------------------------------------------- /tools/build.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | 5 | cd "$(dirname $0)/.." 6 | if [ -f "$HOME/proxy.sh" ]; then 7 | . "$HOME/proxy.sh" 8 | echo "Loaded proxy configuration" 9 | fi 10 | 11 | docker build -t bake --build-arg http_proxy --build-arg https_proxy . 12 | -------------------------------------------------------------------------------- /src/console.rs: -------------------------------------------------------------------------------- 1 | #[derive(rkyv::Archive, rkyv::Serialize, rkyv::Deserialize)] 2 | pub enum ConsoleRequest { 3 | // Bytes flowing from VM (pty) to host 4 | Data(Vec), 5 | } 6 | 7 | #[derive(rkyv::Archive, rkyv::Serialize, rkyv::Deserialize)] 8 | pub enum ConsoleResponse { 9 | // Bytes flowing from host stdin to VM (pty) 10 | Data(Vec), 11 | // TTY control: update pty window size 12 | SetWindowSize { rows: u16, cols: u16 }, 13 | } 14 | -------------------------------------------------------------------------------- /tools/bake_debian.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | 5 | cd "$(dirname $0)/.." 6 | 7 | arch="$(uname -m)" 8 | if [ "$arch" = "x86_64" ]; then 9 | arch=amd64 10 | elif [ "$arch" = "aarch64" ]; then 11 | arch=arm64 12 | else 13 | echo "unsupported arch: $arch" 14 | exit 1 15 | fi 16 | 17 | if [ "$TARGETARCH" = "amd64" ]; then 18 | echo -n "" 19 | elif [ "$TARGETARCH" = "arm64" ]; then 20 | echo -n "" 21 | else 22 | echo "Invalid TARGETARCH" >&2 23 | exit 1 24 | fi 25 | 26 | container_id=$(docker create --platform "linux/$TARGETARCH" "debian:trixie") 27 | tempfile=$(mktemp -t debian-bake-XXXXXXXX) 28 | rm -f "$tempfile" 29 | docker export $container_id | sqfstar "$tempfile" 30 | docker rm $container_id 31 | 32 | ./bake.$arch.elf --cpus 1 -v ./output:/output -v "$tempfile":/rootfs.img:ro -- \ 33 | --input /opt/bake/bake.$TARGETARCH \ 34 | --firecracker /opt/bake/firecracker.$TARGETARCH \ 35 | --kernel /opt/bake/kernel.$TARGETARCH \ 36 | --initrd /opt/bake/initrd.$TARGETARCH.img \ 37 | --output /output/debian.$TARGETARCH.elf --rootfs /rootfs.img 38 | 39 | rm -f "$tempfile" 40 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "bake" 3 | version = "0.1.0" 4 | edition = "2024" 5 | 6 | [profile.release] 7 | panic = "abort" 8 | opt-level = "z" 9 | lto = "thin" 10 | 11 | [dependencies] 12 | anyhow = "1" 13 | bytes = "1" 14 | chrono = "0.4" 15 | clap = { version = "4", features = ["derive", "env"] } 16 | fast-socks5 = "0.10" 17 | faster-hex = "0.10" 18 | fdlimit = "0.3" 19 | futures = "0.3" 20 | landlock = "0.4" 21 | libc = "0.2" 22 | memmap2 = "0.9" 23 | moka = { version = "0.12", features = ["sync"] } 24 | nix = { version = "0.30", features = ["mount"] } 25 | p9 = "0.3" 26 | rand = "0.9" 27 | rand_core_06 = { package = "rand_core", version = "0.6", features = ["getrandom"] } 28 | rkyv = { version = "0.8", features = ["bytes-1"] } 29 | rkyv_derive = "0.8" 30 | scopeguard = "1" 31 | shell-escape = "0.1" 32 | serde = { version = "1", features = ["derive"] } 33 | serde_json = "1" 34 | ssh-key = { version = "0.6", features = ["p256"] } 35 | termion = "4" 36 | tokio = { version = "1", features = ["full"] } 37 | tokio-vsock = "0.7" 38 | tun = { version = "0.8", features = ["async"] } 39 | urlencoding = "2" 40 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2025 Heyang Zhou 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /tools/selfpack.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | 5 | cd "$(dirname $0)/.." 6 | ./tools/build.sh 7 | 8 | arch="$(uname -m)" 9 | if [ "$arch" = "x86_64" ]; then 10 | arch=amd64 11 | elif [ "$arch" = "aarch64" ]; then 12 | arch=arm64 13 | else 14 | echo "unsupported arch: $arch" 15 | exit 1 16 | fi 17 | 18 | container_id="$(docker create bake)" 19 | tempdir="$(mktemp -d -t bake-selfpack-XXXXXXXX)" 20 | docker export "$container_id" | sqfstar "$tempdir/rootfs.img" 21 | 22 | for target in amd64 arm64; do 23 | docker run -it --rm -v "$tempdir:/data" \ 24 | --entrypoint /opt/bake/bake.$arch \ 25 | bake \ 26 | --input /opt/bake/bake.$target \ 27 | --firecracker /opt/bake/firecracker.$target \ 28 | --kernel /opt/bake/kernel.$target \ 29 | --initrd /opt/bake/initrd.$target.img \ 30 | --rootfs /data/rootfs.img \ 31 | --entrypoint /opt/bake/bake.$target \ 32 | --env BAKE_NOT_INIT=1 \ 33 | --env BAKE_BUILD_FIRECRACKER=/opt/bake/firecracker.$target \ 34 | --env BAKE_BUILD_KERNEL=/opt/bake/kernel.$target \ 35 | --env BAKE_BUILD_INITRD=/opt/bake/initrd.$target.img \ 36 | --output /data/app.$target.elf 37 | done 38 | 39 | cp "$tempdir/app.amd64.elf" ./bake.amd64.elf 40 | cp "$tempdir/app.arm64.elf" ./bake.arm64.elf 41 | rm -rf "$tempdir" 42 | -------------------------------------------------------------------------------- /src/firecracker.rs: -------------------------------------------------------------------------------- 1 | use serde::{Deserialize, Serialize}; 2 | 3 | #[derive(Serialize, Deserialize, Debug)] 4 | #[serde(rename_all = "kebab-case")] 5 | pub struct FirecrackerConfig { 6 | pub boot_source: BootSource, 7 | pub drives: Vec, 8 | pub machine_config: MachineConfig, 9 | pub network_interfaces: Vec, 10 | pub vsock: VsockConfig, 11 | } 12 | 13 | #[derive(Serialize, Deserialize, Debug)] 14 | #[serde(rename_all = "snake_case")] 15 | pub struct BootSource { 16 | pub kernel_image_path: String, 17 | pub initrd_path: String, 18 | pub boot_args: String, 19 | } 20 | 21 | #[derive(Serialize, Deserialize, Debug)] 22 | #[serde(rename_all = "snake_case")] 23 | pub struct Drive { 24 | pub drive_id: String, 25 | pub is_root_device: bool, 26 | pub is_read_only: bool, 27 | pub io_engine: String, 28 | pub path_on_host: String, 29 | } 30 | 31 | #[derive(Serialize, Deserialize, Debug)] 32 | #[serde(rename_all = "snake_case")] 33 | pub struct MachineConfig { 34 | pub vcpu_count: u32, 35 | pub mem_size_mib: u32, 36 | } 37 | 38 | #[derive(Serialize, Deserialize, Debug)] 39 | #[serde(rename_all = "snake_case")] 40 | pub struct NetworkInterface { 41 | pub iface_id: String, 42 | pub guest_mac: String, 43 | pub host_dev_name: String, 44 | } 45 | 46 | #[derive(Serialize, Deserialize, Debug)] 47 | #[serde(rename_all = "snake_case")] 48 | pub struct VsockConfig { 49 | pub guest_cid: u32, 50 | pub uds_path: String, 51 | } 52 | -------------------------------------------------------------------------------- /src/embed.rs: -------------------------------------------------------------------------------- 1 | use std::{ 2 | fs::File, 3 | io::{Read, Write}, 4 | ptr::NonNull, 5 | }; 6 | 7 | use memmap2::Mmap; 8 | 9 | use crate::util::align_up; 10 | 11 | const MAGIC: [u8; 56] = *b"fd9b84110b992e9bc0e7bf44f166abe83fad5dc2a281271c5193BAKE"; 12 | 13 | pub fn write_embedded_data( 14 | data: &mut [&mut dyn Read], 15 | w: &mut impl Write, 16 | current_size: usize, 17 | ) -> std::io::Result<()> { 18 | // align to 512 bytes 19 | const ALIGN: usize = 512; 20 | let alignment_fill_bytes = align_up(current_size, ALIGN) - current_size; 21 | assert!(alignment_fill_bytes < ALIGN); 22 | w.write_all(&vec![0u8; alignment_fill_bytes])?; 23 | let mut len = 0usize; 24 | for data in data { 25 | let n = std::io::copy(data, w)?; 26 | len += n as usize; 27 | } 28 | w.write_all(&(len as u64).to_le_bytes())?; 29 | w.write_all(&MAGIC)?; 30 | Ok(()) 31 | } 32 | 33 | #[derive(Copy, Clone)] 34 | pub struct EmbeddedInfo { 35 | pub base: NonNull, 36 | pub data: &'static [u8], 37 | } 38 | 39 | unsafe impl Send for EmbeddedInfo {} 40 | unsafe impl Sync for EmbeddedInfo {} 41 | 42 | pub fn get_embedded_data() -> Option { 43 | let me = unsafe { Mmap::map(&File::open("/proc/self/exe").ok()?) }.ok()?; 44 | if me.len() < 64 { 45 | return None; 46 | }; 47 | let (prefix, trailer) = me.split_at(me.len() - 64); 48 | if &trailer[8..64] != &MAGIC[..] { 49 | return None; 50 | } 51 | let len = u64::from_le_bytes(trailer[0..8].try_into().unwrap()) as usize; 52 | if prefix.len() < len { 53 | return None; 54 | }; 55 | let data: &'static [u8] = 56 | unsafe { std::mem::transmute::<&[u8], &'static [u8]>(&prefix[prefix.len() - len..]) }; 57 | let base = NonNull::from(me.as_ref()).cast::(); 58 | std::mem::forget(me); 59 | return Some(EmbeddedInfo { base, data }); 60 | } 61 | -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: ci 2 | 3 | on: 4 | push: 5 | branches: [ main ] 6 | tags: [ 'v*' ] 7 | pull_request: 8 | 9 | permissions: 10 | contents: read 11 | packages: write 12 | 13 | jobs: 14 | docker: 15 | runs-on: ubuntu-latest 16 | timeout-minutes: 120 17 | steps: 18 | - name: Checkout 19 | uses: actions/checkout@v4 20 | 21 | - name: Set up QEMU (binfmt) 22 | uses: docker/setup-qemu-action@v3 23 | with: 24 | platforms: arm64 25 | 26 | - name: Set up Docker Buildx 27 | uses: docker/setup-buildx-action@v3 28 | 29 | - name: Compute image name (lowercase) 30 | id: vars 31 | run: | 32 | IMAGE_ID=ghcr.io/${{ github.repository_owner }}/bake 33 | echo "image=${IMAGE_ID,,}" >> "$GITHUB_OUTPUT" 34 | 35 | - name: Log in to GHCR 36 | if: github.event_name != 'pull_request' 37 | uses: docker/login-action@v3 38 | with: 39 | registry: ghcr.io 40 | username: ${{ github.actor }} 41 | password: ${{ secrets.GITHUB_TOKEN }} 42 | 43 | - name: Extract Docker metadata 44 | id: meta 45 | uses: docker/metadata-action@v5 46 | with: 47 | images: | 48 | ${{ steps.vars.outputs.image }} 49 | tags: | 50 | type=ref,event=branch,enable=true 51 | type=ref,event=branch,branch=main,value=latest 52 | type=semver,pattern={{version}} 53 | type=semver,pattern={{major}}.{{minor}} 54 | type=semver,pattern={{major}} 55 | type=sha,prefix=sha-,format=short 56 | 57 | - name: Build and push Docker image 58 | uses: docker/build-push-action@v6 59 | with: 60 | context: . 61 | file: ./Dockerfile 62 | push: ${{ github.event_name != 'pull_request' }} 63 | platforms: linux/amd64 64 | labels: ${{ steps.meta.outputs.labels }} 65 | tags: ${{ steps.meta.outputs.tags }} 66 | cache-from: type=gha 67 | cache-to: type=gha,mode=max 68 | -------------------------------------------------------------------------------- /src/ssh_launcher.rs: -------------------------------------------------------------------------------- 1 | use std::os::unix::fs::MetadataExt; 2 | 3 | pub fn launch_ssh(sel_pid: Option, ssh_args: Vec) -> anyhow::Result<()> { 4 | use std::fs; 5 | use std::os::unix::process::CommandExt as _; 6 | 7 | // Identify this executable by (dev,inode) 8 | let self_meta = std::fs::metadata("/proc/self/exe")?; 9 | let self_dev = self_meta.dev(); 10 | let self_ino = self_meta.ino(); 11 | 12 | // Collect candidate PIDs whose /proc//exe matches our (dev,ino) 13 | struct Candidate { 14 | pid: i32, 15 | ssh_proxy_fd: i32, 16 | id_ecdsa_fd: i32, 17 | } 18 | 19 | let mut candidates: Vec = Vec::new(); 20 | 21 | let mut collect_for_pid = |pid: i32| { 22 | let exe_meta = match fs::metadata(format!("/proc/{}/exe", pid)) { 23 | Ok(m) => m, 24 | Err(_) => return, 25 | }; 26 | if exe_meta.dev() != self_dev || exe_meta.ino() != self_ino { 27 | return; 28 | } 29 | let mut ssh_proxy_fd: Option = None; 30 | let mut id_ecdsa_fd: Option = None; 31 | let fd_dir = format!("/proc/{}/fd", pid); 32 | let Ok(fd_iter) = fs::read_dir(&fd_dir) else { 33 | return; 34 | }; 35 | for fdent in fd_iter.flatten() { 36 | let fd_name = fdent.file_name(); 37 | let fd_str = match fd_name.to_str() { 38 | Some(s) => s, 39 | None => continue, 40 | }; 41 | let fd_num: i32 = match fd_str.parse() { 42 | Ok(n) => n, 43 | Err(_) => continue, 44 | }; 45 | let link_target = match fs::read_link(fdent.path()) { 46 | Ok(p) => p, 47 | Err(_) => continue, 48 | }; 49 | let lt = link_target.as_os_str().as_encoded_bytes(); 50 | if lt == b"/memfd:ssh_proxy_path (deleted)" { 51 | ssh_proxy_fd = Some(fd_num); 52 | } else if lt == b"/memfd:id_ecdsa (deleted)" { 53 | id_ecdsa_fd = Some(fd_num); 54 | } 55 | if ssh_proxy_fd.is_some() && id_ecdsa_fd.is_some() { 56 | break; 57 | } 58 | } 59 | if let (Some(ssh_fd), Some(key_fd)) = (ssh_proxy_fd, id_ecdsa_fd) { 60 | candidates.push(Candidate { 61 | pid, 62 | ssh_proxy_fd: ssh_fd, 63 | id_ecdsa_fd: key_fd, 64 | }); 65 | } 66 | }; 67 | 68 | if let Some(target) = sel_pid { 69 | collect_for_pid(target); 70 | } else { 71 | for entry in fs::read_dir("/proc")? { 72 | let entry = match entry { 73 | Ok(e) => e, 74 | Err(_) => continue, 75 | }; 76 | let file_name = entry.file_name(); 77 | let name = match file_name.to_str() { 78 | Some(s) => s, 79 | None => continue, 80 | }; 81 | if !name.chars().all(|c| c.is_ascii_digit()) { 82 | continue; 83 | } 84 | let pid: i32 = match name.parse() { 85 | Ok(p) => p, 86 | Err(_) => continue, 87 | }; 88 | collect_for_pid(pid); 89 | } 90 | } 91 | 92 | match candidates.len() { 93 | 0 => { 94 | if let Some(p) = sel_pid { 95 | anyhow::bail!("specified pid {} is not a matching running instance", p); 96 | } else { 97 | eprintln!("No running instance of this binary with SSH detected."); 98 | eprintln!( 99 | "Start it first, then run: {} ssh", 100 | std::env::args().next().unwrap_or_else(|| "app.elf".into()) 101 | ); 102 | anyhow::bail!("no running instance found"); 103 | } 104 | } 105 | n if n > 1 => { 106 | let mut pids: Vec = candidates.iter().map(|c| c.pid.to_string()).collect(); 107 | pids.sort(); 108 | eprintln!("Multiple running instances detected: {}", pids.join(", ")); 109 | eprintln!("Please specify which to connect to by stopping others."); 110 | anyhow::bail!("multiple instances"); 111 | } 112 | _ => {} 113 | } 114 | 115 | let c = &candidates[0]; 116 | 117 | // Read the proxy socket path from the other process' memfd 118 | let proxy_path_bytes = fs::read(format!("/proc/{}/fd/{}", c.pid, c.ssh_proxy_fd))?; 119 | // Trim trailing newlines/NULs, then build a UTF-8 string if possible 120 | let mut proxy_path = String::from_utf8_lossy(&proxy_path_bytes).to_string(); 121 | proxy_path.truncate(proxy_path.trim_end_matches(['\n', '\0']).len()); 122 | 123 | // Build ssh invocation 124 | let mut cmd = std::process::Command::new("ssh"); 125 | cmd.arg("-o").arg(format!( 126 | "ProxyCommand=nc -U {}", 127 | shell_escape::escape(std::borrow::Cow::Borrowed(proxy_path.as_str())) 128 | )); 129 | cmd.arg("-i") 130 | .arg(format!("/proc/{}/fd/{}", c.pid, c.id_ecdsa_fd)); 131 | cmd.arg("-o").arg("UserKnownHostsFile=/dev/null"); 132 | cmd.arg("-o").arg("StrictHostKeyChecking=no"); 133 | cmd.arg("root@localhost"); 134 | if !ssh_args.is_empty() { 135 | cmd.args(ssh_args); 136 | } 137 | 138 | Err(anyhow::anyhow!("exec failed: {:?}", cmd.exec())) 139 | } 140 | -------------------------------------------------------------------------------- /src/fileshare.rs: -------------------------------------------------------------------------------- 1 | use std::{ 2 | collections::{BTreeMap, HashSet}, 3 | io::{BufReader, ErrorKind, Read, Write}, 4 | os::unix::net::{UnixListener, UnixStream}, 5 | path::Path, 6 | sync::Arc, 7 | }; 8 | 9 | use anyhow::Context; 10 | use landlock::{ 11 | Access, AccessFs, Ruleset, RulesetAttr, RulesetCreatedAttr, RulesetStatus, path_beneath_rules, 12 | }; 13 | 14 | #[derive(Clone, Debug)] 15 | pub struct VolumeSpec { 16 | pub host: String, 17 | pub guest: String, 18 | pub ro: bool, 19 | pub ext4: bool, 20 | pub is_file: bool, 21 | } 22 | 23 | pub fn spawn_file_server(specs: Vec, uds_path: &Path) -> Vec { 24 | let listener = UnixListener::bind(&uds_path).expect("spawn_file_server: bind failed"); 25 | let mut volumes: Vec = vec![]; 26 | 27 | for spec_text in specs { 28 | if let Some(spec) = parse_volume(&spec_text) { 29 | volumes.push(spec); 30 | } else { 31 | panic!("invalid -v/--volume spec: {}", spec_text); 32 | } 33 | } 34 | let volumes_clone = volumes.clone(); 35 | std::thread::spawn(move || serve(listener, volumes_clone)); 36 | volumes 37 | } 38 | 39 | fn parse_volume(spec_str: &str) -> Option { 40 | let parts: Vec<&str> = spec_str.split(':').collect(); 41 | let spec = match parts.len() { 42 | 2 => Some(VolumeSpec { 43 | host: parts[0].to_string(), 44 | guest: parts[1].to_string(), 45 | ro: false, 46 | ext4: false, 47 | is_file: false, 48 | }), 49 | 3 => { 50 | let flags = parts[2] 51 | .split(',') 52 | .map(|x| x.trim()) 53 | .filter(|x| !x.is_empty()) 54 | .collect::>(); 55 | Some(VolumeSpec { 56 | host: parts[0].to_string(), 57 | guest: parts[1].to_string(), 58 | ro: flags.contains("ro"), 59 | ext4: flags.contains("ext4"), 60 | is_file: false, 61 | }) 62 | } 63 | _ => None, 64 | }; 65 | let mut spec = spec?; 66 | if !spec.guest.starts_with('/') { 67 | eprintln!("volume: {}: guest path must start with '/'", spec_str); 68 | return None; 69 | } 70 | spec.is_file = match std::fs::metadata(&spec.host) { 71 | Ok(x) => x.is_file(), 72 | Err(e) => { 73 | eprintln!("volume: {}: host path is inaccessible: {:?}", spec_str, e); 74 | return None; 75 | } 76 | }; 77 | if spec.ext4 && !spec.is_file { 78 | eprintln!( 79 | "volume: {}: ext4 mount requested but host path is not a file", 80 | spec_str 81 | ); 82 | return None; 83 | } 84 | Some(spec) 85 | } 86 | 87 | fn serve(listener: UnixListener, volumes: Vec) { 88 | let volumes = Arc::new(volumes); 89 | loop { 90 | let (conn, _) = listener.accept().expect("failed to accept fileshare conn"); 91 | let volumes = volumes.clone(); 92 | std::thread::spawn(move || { 93 | if let Err(e) = serve_conn(conn, volumes) { 94 | let is_eof = e 95 | .downcast_ref::() 96 | .map(|x| x.kind() == ErrorKind::UnexpectedEof) 97 | .unwrap_or_default(); 98 | if !is_eof { 99 | eprintln!("failed serving fileshare conn: {:?}", e); 100 | } 101 | } 102 | }); 103 | } 104 | } 105 | 106 | fn serve_conn(conn: UnixStream, volumes: Arc>) -> anyhow::Result<()> { 107 | let mut conn = BufReader::new(conn); 108 | let mut name_len: [u8; 4] = [0u8; 4]; 109 | conn.read_exact(&mut name_len)?; 110 | let name_len = u32::from_le_bytes(name_len) as usize; 111 | if name_len > 256 { 112 | anyhow::bail!("invalid name len"); 113 | } 114 | let mut name = vec![0u8; name_len]; 115 | conn.read_exact(&mut name)?; 116 | let volume = volumes 117 | .iter() 118 | .find(|x| x.guest.as_bytes() == name) 119 | .ok_or_else(|| anyhow::anyhow!("requested volume not found"))?; 120 | if volume.ext4 { 121 | anyhow::bail!("ext4 volumes must not be mounted through 9pfs"); 122 | } 123 | let abi = landlock::ABI::V2; 124 | let ruleset = Ruleset::default().handle_access(AccessFs::from_all(abi))?; 125 | let status = ruleset 126 | .create()? 127 | .add_rules(path_beneath_rules( 128 | [volume.host.as_str()], 129 | if volume.ro { 130 | AccessFs::from_read(abi) 131 | } else { 132 | AccessFs::from_all(abi) 133 | }, 134 | ))? 135 | .restrict_self() 136 | .expect("Failed to enforce ruleset"); 137 | 138 | if status.ruleset != RulesetStatus::FullyEnforced { 139 | anyhow::bail!("Landlock V2 is not supported by the running kernel."); 140 | } 141 | 142 | let host_path = Path::new(&volume.host); 143 | let serve_dir = if volume.is_file { 144 | host_path.parent().unwrap_or(host_path) 145 | } else { 146 | host_path 147 | }; 148 | let mut server = p9::Server::new(serve_dir, BTreeMap::new(), BTreeMap::new()) 149 | .with_context(|| "failed to start p9 server")?; 150 | let mut writebuf: Vec = vec![]; 151 | loop { 152 | writebuf.clear(); 153 | server 154 | .handle_message(&mut conn, &mut writebuf) 155 | .with_context(|| "p9 server failed")?; 156 | if !writebuf.is_empty() { 157 | conn.get_mut().write_all(&writebuf)?; 158 | } 159 | } 160 | } 161 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM rust:1.89-slim-bullseye AS build_bake 2 | RUN apt-get update && apt-get install -o Acquire::Retries="5" -y musl-tools python3-pip && python3 -m pip install cargo-zigbuild 3 | RUN rustup target add x86_64-unknown-linux-musl aarch64-unknown-linux-musl 4 | WORKDIR /build 5 | COPY Cargo.toml Cargo.lock /build/bake/ 6 | COPY ./src/ /build/bake/src/ 7 | RUN cd /build/bake && cargo zigbuild --release --target x86_64-unknown-linux-musl && \ 8 | cargo zigbuild --release --target aarch64-unknown-linux-musl && \ 9 | mkdir ../bin && \ 10 | cp target/x86_64-unknown-linux-musl/release/bake ../bin/bake.amd64 && \ 11 | cp target/aarch64-unknown-linux-musl/release/bake ../bin/bake.arm64 12 | 13 | FROM golang:1.25-alpine AS build_tun2socks 14 | WORKDIR /opt 15 | RUN apk add git 16 | RUN mkdir tun2socks && cd tun2socks && git init && \ 17 | git remote add origin https://github.com/xjasonlyu/tun2socks && \ 18 | git fetch --depth 1 origin a1a64030c4c08b1970736e6dca5dbf070535407a && \ 19 | git checkout FETCH_HEAD 20 | RUN cd tun2socks && \ 21 | GOOS=linux GOARCH=amd64 CGO_ENABLED=0 go build -ldflags "-s -w" -o /opt/tun2socks.amd64 . && \ 22 | GOOS=linux GOARCH=arm64 CGO_ENABLED=0 go build -ldflags "-s -w" -o /opt/tun2socks.arm64 . 23 | 24 | FROM debian:bullseye-slim AS build_vm_initrd_amd64 25 | RUN apt-get update && apt-get install -y curl cpio 26 | WORKDIR /build 27 | RUN curl -fsSL -o alpine.tar.gz https://dl-cdn.alpinelinux.org/alpine/v3.22/releases/x86_64/alpine-minirootfs-3.22.1-x86_64.tar.gz 28 | RUN mkdir rootfs && cd rootfs && tar xzf ../alpine.tar.gz && cat /etc/resolv.conf > etc/resolv.conf && \ 29 | LD_LIBRARY_PATH=$(pwd)/lib:$(pwd)/usr/lib ./lib/ld-musl-x86_64.so.1 ./sbin/apk add --root . --no-scripts \ 30 | runc device-mapper iproute2 nftables e2fsprogs openssh wireguard-tools 31 | COPY --from=build_bake /build/bin/bake.amd64 ./rootfs/init 32 | COPY --from=build_tun2socks /opt/tun2socks.amd64 ./rootfs/usr/bin/tun2socks 33 | RUN cd rootfs && bash -c "set -euo pipefail; find . | cpio -o --format=newc | gzip > /build/initrd.cpio.gz" 34 | 35 | FROM debian:bullseye-slim AS build_vm_initrd_arm64 36 | RUN apt-get update && apt-get install -y curl cpio 37 | WORKDIR /build 38 | RUN curl -fsSL -o alpine.tar.gz https://dl-cdn.alpinelinux.org/alpine/v3.22/releases/aarch64/alpine-minirootfs-3.22.1-aarch64.tar.gz 39 | RUN mkdir rootfs && cd rootfs && tar xzf ../alpine.tar.gz && cat /etc/resolv.conf > etc/resolv.conf && \ 40 | LD_LIBRARY_PATH=$(pwd)/lib:$(pwd)/usr/lib ./lib/ld-musl-aarch64.so.1 ./sbin/apk add --root . --no-scripts \ 41 | runc device-mapper iproute2 nftables e2fsprogs openssh wireguard-tools 42 | COPY --from=build_bake /build/bin/bake.arm64 ./rootfs/init 43 | COPY --from=build_tun2socks /opt/tun2socks.arm64 ./rootfs/usr/bin/tun2socks 44 | RUN cd rootfs && bash -c "set -euo pipefail; find . | cpio -o --format=newc | gzip > /build/initrd.cpio.gz" 45 | 46 | FROM debian:bookworm-slim AS build_kernel_base 47 | RUN apt-get update && apt-get install -y --no-install-recommends \ 48 | build-essential \ 49 | bc \ 50 | bison \ 51 | flex \ 52 | libssl-dev \ 53 | libelf-dev \ 54 | dwarves \ 55 | curl \ 56 | xz-utils \ 57 | ca-certificates \ 58 | python3 \ 59 | clang \ 60 | lld \ 61 | llvm \ 62 | pkg-config \ 63 | rsync \ 64 | file && \ 65 | rm -rf /var/lib/apt/lists/* 66 | WORKDIR /build/linux 67 | RUN curl -fsSL -o linux.tar.xz https://cdn.kernel.org/pub/linux/kernel/v6.x/linux-6.1.149.tar.xz && \ 68 | tar -xJf linux.tar.xz --strip-components=1 && rm linux.tar.xz 69 | 70 | COPY ./kernel_config/microvm-kernel-ci-x86_64-6.1.config /tmp/config.x86_64 71 | COPY ./kernel_config/microvm-kernel-ci-aarch64-6.1.config /tmp/config.aarch64 72 | 73 | FROM build_kernel_base AS build_kernel_amd64 74 | WORKDIR /build/linux 75 | RUN make mrproper && \ 76 | cp /tmp/config.x86_64 .config && \ 77 | make LLVM=1 ARCH=x86_64 olddefconfig && \ 78 | make LLVM=1 ARCH=x86_64 -j"$(nproc)" vmlinux && \ 79 | mkdir -p /opt && cp vmlinux /opt/kernel.amd64 80 | 81 | FROM build_kernel_base AS build_kernel_arm64 82 | WORKDIR /build/linux 83 | RUN make mrproper && \ 84 | cp /tmp/config.aarch64 .config && \ 85 | make LLVM=1 ARCH=arm64 olddefconfig && \ 86 | make LLVM=1 ARCH=arm64 -j"$(nproc)" Image && \ 87 | mkdir -p /opt && cp arch/arm64/boot/Image /opt/kernel.arm64 88 | 89 | FROM debian:bullseye-slim AS fetch_firecracker 90 | RUN apt-get update && apt-get install -y curl 91 | WORKDIR /opt 92 | RUN curl -fsSL -o firecracker-v1.13.1-x86_64.tgz https://github.com/firecracker-microvm/firecracker/releases/download/v1.13.1/firecracker-v1.13.1-x86_64.tgz && \ 93 | curl -fsSL -o firecracker-v1.13.1-aarch64.tgz https://github.com/firecracker-microvm/firecracker/releases/download/v1.13.1/firecracker-v1.13.1-aarch64.tgz && \ 94 | tar xzf firecracker-v1.13.1-x86_64.tgz && \ 95 | tar xzf firecracker-v1.13.1-aarch64.tgz && \ 96 | mv release-v1.13.1-x86_64/firecracker-v1.13.1-x86_64 ./firecracker.amd64 && \ 97 | mv release-v1.13.1-aarch64/firecracker-v1.13.1-aarch64 ./firecracker.arm64 98 | 99 | # We are doing kinda strange thing here - 100 | # this image is ALWAYS built for amd64, regardless of the current platform architecture. 101 | FROM --platform=linux/amd64 debian:bullseye-slim AS bake 102 | RUN apt-get update && apt-get install -y --no-install-recommends \ 103 | squashfs-tools \ 104 | erofs-utils \ 105 | ca-certificates makefs && \ 106 | rm -rf /var/lib/apt/lists/* 107 | WORKDIR /opt/bake 108 | COPY --from=build_kernel_amd64 /opt/kernel.amd64 ./kernel.amd64 109 | COPY --from=build_kernel_arm64 /opt/kernel.arm64 ./kernel.arm64 110 | COPY --from=fetch_firecracker /opt/firecracker.amd64 /opt/firecracker.arm64 ./ 111 | COPY --from=build_bake /build/bin/ /opt/bake/ 112 | COPY --from=build_vm_initrd_amd64 /build/initrd.cpio.gz ./initrd.amd64.img 113 | COPY --from=build_vm_initrd_arm64 /build/initrd.cpio.gz ./initrd.arm64.img 114 | ENV BAKE_NOT_INIT=1 115 | -------------------------------------------------------------------------------- /src/util.rs: -------------------------------------------------------------------------------- 1 | use std::{ 2 | collections::HashMap, 3 | os::fd::{AsFd, AsRawFd, BorrowedFd, OwnedFd}, 4 | path::Path, 5 | sync::atomic::Ordering, 6 | }; 7 | 8 | use fdlimit::Outcome; 9 | use tokio::{ 10 | io::{AsyncReadExt, AsyncWriteExt, Interest, unix::AsyncFd}, 11 | net::UnixStream, 12 | }; 13 | use tokio_vsock::VsockStream; 14 | 15 | use crate::DEBUG; 16 | 17 | #[derive(rkyv::Serialize, rkyv::Deserialize, rkyv::Archive, Clone, Debug)] 18 | pub struct BootManifest { 19 | pub entrypoint: Option, 20 | pub args: Vec, 21 | pub env: HashMap, 22 | pub cwd: Option, 23 | pub volumes: Vec, 24 | pub uid: Option, 25 | pub gid: Option, 26 | pub disable_hostnet: bool, 27 | pub wireguard_conf: Option, 28 | pub ssh_ecdsa_private_key: String, 29 | pub ssh_ecdsa_public_key: String, 30 | } 31 | 32 | #[derive(rkyv::Serialize, rkyv::Deserialize, rkyv::Archive, Clone, Debug)] 33 | pub struct VolumeManifest { 34 | pub guest_path: String, 35 | pub host_filename: Option, 36 | pub ro: bool, 37 | pub ext4: bool, 38 | } 39 | 40 | pub fn align_up(value: usize, align: usize) -> usize { 41 | assert_eq!(align.count_ones(), 1); 42 | (value + (align - 1)) & !(align - 1) 43 | } 44 | 45 | pub async fn vsock_uds_connect(uds_path: &Path, port: u32) -> anyhow::Result { 46 | 'outer: loop { 47 | let mut stream = UnixStream::connect(uds_path).await?; 48 | stream 49 | .write_all(format!("CONNECT {}\n", port).as_bytes()) 50 | .await?; 51 | stream.flush().await?; 52 | 53 | let mut recv_buf = [0u8; 64]; 54 | let mut recv_cursor = 0usize; 55 | loop { 56 | let b = match stream.read_u8().await { 57 | Ok(b) => b, 58 | Err(e) => { 59 | if e.kind() == std::io::ErrorKind::UnexpectedEof { 60 | tokio::time::sleep(std::time::Duration::from_millis(10)).await; 61 | continue 'outer; 62 | } 63 | return Err(e.into()); 64 | } 65 | }; 66 | if b == b'\n' { 67 | break; 68 | } 69 | 70 | if recv_cursor == recv_buf.len() { 71 | anyhow::bail!("buffer overflow"); 72 | } 73 | recv_buf[recv_cursor] = b; 74 | recv_cursor += 1; 75 | } 76 | let msg = std::str::from_utf8(&recv_buf[..recv_cursor])?; 77 | if !msg.starts_with("OK ") { 78 | anyhow::bail!("unexpected response: {}", msg); 79 | } 80 | 81 | return Ok(stream); 82 | } 83 | } 84 | 85 | pub fn set_nonblocking(fd: BorrowedFd, nb: bool) -> std::io::Result<()> { 86 | unsafe { 87 | let flags = libc::fcntl(fd.as_raw_fd(), libc::F_GETFL, 0); 88 | if flags < 0 { 89 | return Err(std::io::Error::last_os_error()); 90 | } 91 | if libc::fcntl( 92 | fd.as_raw_fd(), 93 | libc::F_SETFL, 94 | if nb { 95 | flags | libc::O_NONBLOCK 96 | } else { 97 | flags & !libc::O_NONBLOCK 98 | }, 99 | ) < 0 100 | { 101 | return Err(std::io::Error::last_os_error()); 102 | } 103 | } 104 | Ok(()) 105 | } 106 | 107 | pub fn quote_systemd_string(s: &str) -> String { 108 | let mut output = String::with_capacity(s.len()); 109 | for ch in s.chars() { 110 | if ch.is_ascii_alphanumeric() || ch == '-' || ch == '_' || ch == '/' || ch == '=' { 111 | output.push(ch); 112 | } else { 113 | let mut bytes = [0u8; 4]; 114 | let bytes = ch.encode_utf8(&mut bytes); 115 | for b in bytes.as_bytes() { 116 | output.push_str(&format!("\\x{:02x}", b)); 117 | } 118 | } 119 | } 120 | output 121 | } 122 | 123 | pub fn best_effort_raise_fd_limit() { 124 | match fdlimit::raise_fd_limit() { 125 | Ok(Outcome::LimitRaised { from, to }) => { 126 | if DEBUG.load(Ordering::Relaxed) { 127 | eprintln!("raised fd limit from {} to {}", from, to); 128 | } 129 | } 130 | Ok(Outcome::Unsupported) => { 131 | eprintln!("raising fd limit is not supported on this platform"); 132 | } 133 | Err(e) => { 134 | eprintln!("failed to raise fd limit: {:?}", e); 135 | } 136 | } 137 | } 138 | 139 | pub async fn copy_bidirectional_fastclose(a: OwnedFd, b: OwnedFd) -> std::io::Result<()> { 140 | async fn copy<'b>( 141 | a: &'b AsyncFd, 142 | b: &'b AsyncFd, 143 | drain: bool, 144 | ) -> std::io::Result<()> { 145 | const BUFFER_SIZE: usize = 8192; 146 | let mut buf = vec![0u8; BUFFER_SIZE]; 147 | loop { 148 | let n = if drain { 149 | nix::unistd::read(a.get_ref(), &mut buf).map_err(std::io::Error::from)? 150 | } else { 151 | a.async_io(Interest::READABLE, |x| { 152 | nix::unistd::read(x, &mut buf).map_err(std::io::Error::from) 153 | }) 154 | .await? 155 | }; 156 | if n == 0 { 157 | if drain { 158 | unsafe { 159 | libc::shutdown(b.as_raw_fd(), libc::SHUT_WR); 160 | } 161 | } 162 | return Ok(()); 163 | } 164 | let mut buf = &buf[..n]; 165 | while !buf.is_empty() { 166 | let written = b 167 | .async_io(Interest::WRITABLE, |x| { 168 | nix::unistd::write(x, buf).map_err(std::io::Error::from) 169 | }) 170 | .await?; 171 | assert!(written > 0); 172 | buf = &buf[written..]; 173 | } 174 | } 175 | } 176 | let a = AsyncFd::with_interest(a, Interest::READABLE | Interest::WRITABLE)?; 177 | let b = AsyncFd::with_interest(b, Interest::READABLE | Interest::WRITABLE)?; 178 | let ret = tokio::select! { 179 | biased; 180 | x = copy(&a, &b, false) => x, 181 | x = copy(&b, &a, false) => x, 182 | }; 183 | let _ = tokio::join!(copy(&a, &b, true), copy(&b, &a, true)); 184 | ret 185 | } 186 | 187 | pub fn decompose_vsock_stream(s: VsockStream) -> std::io::Result { 188 | s.as_fd().try_clone_to_owned() 189 | } 190 | -------------------------------------------------------------------------------- /src/raw_udp.rs: -------------------------------------------------------------------------------- 1 | use std::io::{Error, ErrorKind, Result}; 2 | use std::net::Ipv4Addr; 3 | 4 | use tokio::io::{AsyncReadExt, AsyncWriteExt}; 5 | use tun::{AsyncDevice, Configuration, DeviceReader, DeviceWriter, Layer}; 6 | 7 | /// Parsed packet view for convenience when receiving 8 | #[derive(Debug, Clone, rkyv::Archive, rkyv::Serialize, rkyv::Deserialize)] 9 | pub struct UdpPacket { 10 | pub src_ip: Ipv4Addr, 11 | pub dst_ip: Ipv4Addr, 12 | pub src_port: u16, 13 | pub dst_port: u16, 14 | pub payload: Vec, 15 | } 16 | 17 | pub struct RawUdp { 18 | tun_r: tokio::sync::Mutex, 19 | tun_w: tokio::sync::Mutex, 20 | } 21 | 22 | impl RawUdp { 23 | /// Create a TUN device for UDP packet processing. 24 | pub fn open(ifname: &str, address: Ipv4Addr) -> Result { 25 | let mut config = Configuration::default(); 26 | config 27 | .tun_name(ifname) 28 | .layer(Layer::L3) 29 | .address(address) 30 | .mtu(1500) 31 | .up(); 32 | 33 | let device = tun::create(&config).map_err(|e| { 34 | Error::new( 35 | ErrorKind::Other, 36 | format!("Failed to create TUN device: {}", e), 37 | ) 38 | })?; 39 | 40 | let tun = AsyncDevice::new(device).map_err(|e| { 41 | Error::new( 42 | ErrorKind::Other, 43 | format!("Failed to create async TUN device: {}", e), 44 | ) 45 | })?; 46 | let (tun_w, tun_r) = tun.split().unwrap(); 47 | 48 | Ok(Self { 49 | tun_w: tokio::sync::Mutex::new(tun_w), 50 | tun_r: tokio::sync::Mutex::new(tun_r), 51 | }) 52 | } 53 | 54 | /// Receive one raw IPv4+UDP packet from the TUN device. 55 | /// Blocks until a packet arrives. 56 | pub async fn recv(&self) -> Result { 57 | loop { 58 | // 9600B buffer for raw IPv4 packets 59 | let mut buf = vec![0u8; 9600]; 60 | let n = self.tun_r.lock().await.read(&mut buf).await?; 61 | 62 | if n < 20 { 63 | // eprintln!("DEBUG: Packet too short: {} bytes, first 16 bytes: {:02x?}", n, &buf[..n.min(16)]); 64 | continue; // Skip this packet 65 | } 66 | 67 | // Check if this is IPv4 68 | let version = buf[0] >> 4; 69 | if version != 4 { 70 | // eprintln!("DEBUG: Not IPv4 packet, version: {}", version); 71 | continue; // Skip non-IPv4 packets 72 | } 73 | 74 | // Parse IPv4 75 | let ihl_bytes = (buf[0] & 0x0F) as usize * 4; 76 | let protocol = buf[9]; 77 | 78 | // eprintln!("DEBUG: Packet length: {}, version: {}, IHL: {} bytes, protocol: {}", 79 | // n, version, ihl_bytes, protocol); 80 | 81 | if ihl_bytes < 20 || n < ihl_bytes + 8 { 82 | // eprintln!("DEBUG: Invalid packet structure - IHL: {}, total len: {}, needed: {}", 83 | // ihl_bytes, n, ihl_bytes + 8); 84 | continue; // Skip malformed packets 85 | } 86 | 87 | if protocol != 17 { 88 | // eprintln!("DEBUG: Not UDP packet, protocol: {}", protocol); 89 | continue; // Skip non-UDP packets 90 | } 91 | 92 | // This is a valid IPv4+UDP packet, process it 93 | let src_ip = Ipv4Addr::new(buf[12], buf[13], buf[14], buf[15]); 94 | let dst_ip = Ipv4Addr::new(buf[16], buf[17], buf[18], buf[19]); 95 | 96 | // Parse UDP 97 | let u = ihl_bytes; 98 | let src_port = u16::from_be_bytes([buf[u], buf[u + 1]]); 99 | let dst_port = u16::from_be_bytes([buf[u + 2], buf[u + 3]]); 100 | let udp_len = u16::from_be_bytes([buf[u + 4], buf[u + 5]]) as usize; 101 | 102 | if udp_len < 8 || u + udp_len > n { 103 | // eprintln!("DEBUG: Invalid UDP length: {} vs available: {}", udp_len, n - u); 104 | continue; // Skip malformed UDP packets 105 | } 106 | 107 | let payload = buf[u + 8..u + udp_len].to_vec(); 108 | 109 | return Ok(UdpPacket { 110 | src_ip, 111 | dst_ip, 112 | src_port, 113 | dst_port, 114 | payload, 115 | }); 116 | } 117 | } 118 | 119 | /// Inject a UDP packet via the TUN device. 120 | /// 121 | /// Creates a complete IPv4+UDP packet and writes it to the TUN interface. 122 | pub async fn inject( 123 | &self, 124 | src_ip: Ipv4Addr, 125 | dst_ip: Ipv4Addr, 126 | src_port: u16, 127 | dst_port: u16, 128 | payload: &[u8], 129 | ttl: u8, 130 | ) -> Result<()> { 131 | // Build UDP 132 | let udp_len = 8 + payload.len(); 133 | let mut udp = vec![0u8; 8]; 134 | put_u16(&mut udp[0..2], src_port); 135 | put_u16(&mut udp[2..4], dst_port); 136 | put_u16(&mut udp[4..6], udp_len as u16); 137 | put_u16(&mut udp[6..8], 0); // checksum placeholder 138 | 139 | // Build IPv4 header 140 | let total_len = 20 + udp_len; 141 | let mut ip = vec![0u8; 20]; 142 | ip[0] = (4u8 << 4) | (5u8); // version=4, IHL=5 143 | ip[1] = 0; // DSCP/ECN 144 | put_u16(&mut ip[2..4], total_len as u16); 145 | put_u16(&mut ip[4..6], 0); // identification 146 | put_u16(&mut ip[6..8], 0x4000); // flags: DF set, no fragmentation 147 | ip[8] = ttl; 148 | ip[9] = 17; // UDP 149 | put_u32(&mut ip[12..16], src_ip); 150 | put_u32(&mut ip[16..20], dst_ip); 151 | // checksum over the IPv4 header 152 | put_u16(&mut ip[10..12], 0); 153 | let ip_sum = checksum16(&ip); 154 | put_u16(&mut ip[10..12], ip_sum); 155 | 156 | // UDP checksum with IPv4 pseudo-header 157 | let mut pseudo = Vec::with_capacity(12 + udp_len); 158 | // Pseudo header: src, dst, zero, proto, udp_len 159 | pseudo.extend_from_slice(&src_ip.octets()); 160 | pseudo.extend_from_slice(&dst_ip.octets()); 161 | pseudo.push(0); 162 | pseudo.push(17); 163 | pseudo.extend_from_slice(&(udp_len as u16).to_be_bytes()); 164 | // UDP header + payload 165 | pseudo.extend_from_slice(&udp); 166 | pseudo.extend_from_slice(payload); 167 | 168 | // If odd length, pad with zero for checksum calc 169 | let checksum = checksum16_pad(&pseudo); 170 | let udp_checksum = if checksum == 0 { 0xFFFF } else { checksum }; 171 | put_u16(&mut udp[6..8], udp_checksum); 172 | 173 | // Final packet: IP + UDP + payload 174 | let mut pkt = Vec::with_capacity(total_len); 175 | pkt.extend_from_slice(&ip); 176 | pkt.extend_from_slice(&udp); 177 | pkt.extend_from_slice(payload); 178 | 179 | // Write to TUN device 180 | self.tun_w.lock().await.write_all(&pkt).await?; 181 | Ok(()) 182 | } 183 | } 184 | 185 | /* -------------------- helpers -------------------- */ 186 | 187 | fn put_u16(dst: &mut [u8], v: u16) { 188 | dst.copy_from_slice(&v.to_be_bytes()); 189 | } 190 | fn put_u32(dst: &mut [u8], ip: Ipv4Addr) { 191 | dst.copy_from_slice(&ip.octets()); 192 | } 193 | 194 | /// Internet checksum (ones’ complement) over even-length slice 195 | fn checksum16(data: &[u8]) -> u16 { 196 | let mut sum: u32 = 0; 197 | let mut i = 0; 198 | while i + 1 < data.len() { 199 | let word = u16::from_be_bytes([data[i], data[i + 1]]) as u32; 200 | sum = sum.wrapping_add(word); 201 | i += 2; 202 | } 203 | // Fold to 16 bits 204 | while (sum >> 16) != 0 { 205 | sum = (sum & 0xFFFF) + (sum >> 16); 206 | } 207 | !(sum as u16) 208 | } 209 | 210 | /// Internet checksum permitting odd-length input (pads with 0) 211 | fn checksum16_pad(data: &[u8]) -> u16 { 212 | if data.len() % 2 == 0 { 213 | return checksum16(data); 214 | } 215 | let mut tmp = Vec::with_capacity(data.len() + 1); 216 | tmp.extend_from_slice(data); 217 | tmp.push(0); 218 | checksum16(&tmp) 219 | } 220 | -------------------------------------------------------------------------------- /src/wireguard.rs: -------------------------------------------------------------------------------- 1 | #[derive(Debug, Clone, PartialEq, Eq)] 2 | pub struct WgSection { 3 | pub name: String, 4 | pub items: Vec<(String, Vec)>, 5 | } 6 | 7 | #[derive(Debug, Clone, PartialEq, Eq)] 8 | pub struct ParsedWireguardConf { 9 | pub sections: Vec, 10 | } 11 | 12 | pub fn parse_wireguard_conf(conf: &str) -> ParsedWireguardConf { 13 | fn strip_comment(mut s: &str) -> &str { 14 | let bytes = s.as_bytes(); 15 | for (i, &b) in bytes.iter().enumerate() { 16 | if b == b'#' || b == b';' { 17 | s = &s[..i]; 18 | break; 19 | } 20 | } 21 | s.trim() 22 | } 23 | fn split_values(v: &str) -> Vec { 24 | let mut out: Vec = Vec::new(); 25 | let mut buf = String::new(); 26 | let mut in_squote = false; 27 | let mut in_dquote = false; 28 | let flush = |buf: &mut String, out: &mut Vec| { 29 | let mut tok = buf.trim().to_string(); 30 | if tok.len() >= 2 { 31 | let b = tok.as_bytes(); 32 | let first = b[0]; 33 | let last = *b.last().unwrap(); 34 | if (first == b'"' && last == b'"') || (first == b'\'' && last == b'\'') { 35 | tok = tok[1..tok.len() - 1].to_string(); 36 | } 37 | } 38 | if !tok.is_empty() { 39 | out.push(tok); 40 | } 41 | buf.clear(); 42 | }; 43 | for ch in v.chars() { 44 | match ch { 45 | '\'' if !in_dquote => { 46 | in_squote = !in_squote; 47 | buf.push(ch); 48 | } 49 | '"' if !in_squote => { 50 | in_dquote = !in_dquote; 51 | buf.push(ch); 52 | } 53 | ',' | ' ' | '\t' | '\n' | '\r' if !in_squote && !in_dquote => { 54 | if !buf.trim().is_empty() { 55 | flush(&mut buf, &mut out); 56 | } 57 | } 58 | _ => buf.push(ch), 59 | } 60 | } 61 | if !buf.trim().is_empty() { 62 | flush(&mut buf, &mut out); 63 | } 64 | out 65 | } 66 | 67 | let mut sections: Vec = Vec::new(); 68 | let mut current: Option = None; 69 | 70 | for raw in conf.lines() { 71 | let line = strip_comment(raw); 72 | if line.is_empty() { 73 | continue; 74 | } 75 | if line.starts_with('[') && line.ends_with(']') { 76 | if let Some(sec) = current.take() { 77 | sections.push(sec); 78 | } 79 | let mut name = &line[1..line.len() - 1]; 80 | name = name.trim(); 81 | current = Some(WgSection { 82 | name: name.to_string(), 83 | items: Vec::new(), 84 | }); 85 | continue; 86 | } 87 | let Some((k, v)) = line.split_once('=') else { 88 | continue; 89 | }; 90 | let key = k.trim().to_string(); 91 | let vals = split_values(v.trim()); 92 | if current.is_none() { 93 | // Ignore items outside any section 94 | continue; 95 | } 96 | current.as_mut().unwrap().items.push((key, vals)); 97 | } 98 | 99 | if let Some(sec) = current.take() { 100 | sections.push(sec); 101 | } 102 | 103 | ParsedWireguardConf { sections } 104 | } 105 | 106 | pub fn serialize_without_keys(conf: &ParsedWireguardConf, drop_keys_ci: &[&str]) -> String { 107 | let mut out = String::new(); 108 | for sec in &conf.sections { 109 | out.push('['); 110 | out.push_str(&sec.name); 111 | out.push_str("]\n"); 112 | for (k, vals) in &sec.items { 113 | if drop_keys_ci.iter().any(|d| k.eq_ignore_ascii_case(d)) { 114 | continue; 115 | } 116 | out.push_str(k); 117 | out.push_str(" = "); 118 | out.push_str(&vals.join(", ")); 119 | out.push('\n'); 120 | } 121 | out.push('\n'); 122 | } 123 | out 124 | } 125 | 126 | #[cfg(test)] 127 | mod tests { 128 | use super::*; 129 | 130 | #[test] 131 | fn parses_basic_interface_and_peer() { 132 | let conf = r#" 133 | [Interface] 134 | Address = 10.0.0.2/24, 10.0.1.2/24 # inline comment 135 | ListenPort = 51820 136 | 137 | [Peer] 138 | AllowedIPs = 10.0.0.0/24,10.0.2.0/24 ; another comment 139 | PublicKey = abc 140 | "#; 141 | let p = parse_wireguard_conf(conf); 142 | assert_eq!(p.sections.len(), 2); 143 | assert_eq!(p.sections[0].name.to_ascii_lowercase(), "interface"); 144 | let iface_addr = p.sections[0] 145 | .items 146 | .iter() 147 | .find(|(k, _)| k.eq_ignore_ascii_case("address")) 148 | .map(|(_, v)| v.clone()) 149 | .unwrap(); 150 | assert_eq!(iface_addr, vec!["10.0.0.2/24", "10.0.1.2/24"]); 151 | assert_eq!(p.sections[1].name.to_ascii_lowercase(), "peer"); 152 | let peer_allowed = p.sections[1] 153 | .items 154 | .iter() 155 | .find(|(k, _)| k.eq_ignore_ascii_case("allowedips")) 156 | .map(|(_, v)| v.clone()) 157 | .unwrap(); 158 | assert_eq!(peer_allowed, vec!["10.0.0.0/24", "10.0.2.0/24"]); 159 | } 160 | 161 | #[test] 162 | fn splits_and_preserves_tokens() { 163 | let conf = r#" 164 | [Interface] 165 | Address=10.0.0.2/24 10.0.0.2/24,10.0.1.2/24 166 | [Peer] 167 | AllowedIPs = 10.0.0.0/24 10.0.0.0/24 , 10.0.2.0/24 168 | "#; 169 | let p = parse_wireguard_conf(conf); 170 | let iface = p 171 | .sections 172 | .iter() 173 | .find(|s| s.name.eq_ignore_ascii_case("interface")) 174 | .unwrap(); 175 | let addr = iface 176 | .items 177 | .iter() 178 | .find(|(k, _)| k.eq_ignore_ascii_case("address")) 179 | .unwrap() 180 | .1 181 | .clone(); 182 | assert_eq!(addr, vec!["10.0.0.2/24", "10.0.0.2/24", "10.0.1.2/24"]); 183 | let peer = p 184 | .sections 185 | .iter() 186 | .find(|s| s.name.eq_ignore_ascii_case("peer")) 187 | .unwrap(); 188 | let allowed = peer 189 | .items 190 | .iter() 191 | .find(|(k, _)| k.eq_ignore_ascii_case("allowedips")) 192 | .unwrap() 193 | .1 194 | .clone(); 195 | assert_eq!(allowed, vec!["10.0.0.0/24", "10.0.0.0/24", "10.0.2.0/24"]); 196 | } 197 | 198 | #[test] 199 | fn quotes_and_ipv6() { 200 | let conf = r#" 201 | [Interface] 202 | Address = "2001:db8::1/64", '10.2.3.4/32' 203 | [Peer] 204 | AllowedIPs = "fd00::/8", 0.0.0.0/0 205 | "#; 206 | let p = parse_wireguard_conf(conf); 207 | let iface = p 208 | .sections 209 | .iter() 210 | .find(|s| s.name.eq_ignore_ascii_case("interface")) 211 | .unwrap(); 212 | let addr = iface 213 | .items 214 | .iter() 215 | .find(|(k, _)| k.eq_ignore_ascii_case("address")) 216 | .unwrap() 217 | .1 218 | .clone(); 219 | assert_eq!(addr, vec!["2001:db8::1/64", "10.2.3.4/32"]); 220 | let peer = p 221 | .sections 222 | .iter() 223 | .find(|s| s.name.eq_ignore_ascii_case("peer")) 224 | .unwrap(); 225 | let allowed = peer 226 | .items 227 | .iter() 228 | .find(|(k, _)| k.eq_ignore_ascii_case("allowedips")) 229 | .unwrap() 230 | .1 231 | .clone(); 232 | assert_eq!(allowed, vec!["fd00::/8", "0.0.0.0/0"]); 233 | } 234 | 235 | #[test] 236 | fn multiple_peers_collect_all_allowedips() { 237 | let conf = r#" 238 | [Peer] 239 | AllowedIPs = 10.0.0.0/24 240 | [Peer] 241 | AllowedIPs = 10.0.1.0/24,10.0.2.0/24 242 | "#; 243 | let p = parse_wireguard_conf(conf); 244 | let peers: Vec<_> = p 245 | .sections 246 | .iter() 247 | .filter(|s| s.name.eq_ignore_ascii_case("peer")) 248 | .collect(); 249 | assert_eq!(peers.len(), 2); 250 | let vals: Vec = peers 251 | .iter() 252 | .flat_map(|sec| sec.items.iter()) 253 | .filter(|(k, _)| k.eq_ignore_ascii_case("allowedips")) 254 | .flat_map(|(_, v)| v.clone()) 255 | .collect(); 256 | assert_eq!(vals, vec!["10.0.0.0/24", "10.0.1.0/24", "10.0.2.0/24"]); 257 | } 258 | 259 | #[test] 260 | fn serializer_drops_address_and_dns() { 261 | let conf = r#" 262 | [Interface] 263 | Address = 10.0.0.2/24, 10.0.1.2/24 264 | DNS = 1.1.1.1 265 | PrivateKey = abc 266 | 267 | [Peer] 268 | PublicKey = def 269 | AllowedIPs = 10.0.0.0/24 270 | "#; 271 | let p = parse_wireguard_conf(conf); 272 | let s = serialize_without_keys(&p, &["address", "dns"]); 273 | let expected = "[Interface]\nPrivateKey = abc\n\n[Peer]\nPublicKey = def\nAllowedIPs = 10.0.0.0/24\n\n"; 274 | assert_eq!(s, expected); 275 | } 276 | } 277 | -------------------------------------------------------------------------------- /src/vm_console.rs: -------------------------------------------------------------------------------- 1 | use std::{ 2 | os::fd::{AsFd, AsRawFd, FromRawFd, OwnedFd}, 3 | path::Path, 4 | }; 5 | 6 | use anyhow::Context; 7 | use tokio::{ 8 | io::{AsyncReadExt, AsyncWriteExt, Interest, unix::AsyncFd}, 9 | net::UnixListener, 10 | runtime::Runtime, 11 | task::JoinHandle, 12 | }; 13 | use tokio_vsock::{VsockAddr, VsockStream}; 14 | 15 | use crate::{ 16 | console::{ArchivedConsoleResponse, ConsoleRequest, ConsoleResponse}, 17 | util::set_nonblocking, 18 | }; 19 | 20 | pub fn start_console_bridge() -> anyhow::Result { 21 | // Runtime to host the vsock bridge tasks; keep it alive. 22 | let rt = tokio::runtime::Builder::new_multi_thread() 23 | .enable_all() 24 | .worker_threads(1) 25 | .thread_name("bake-vm-console") 26 | .build() 27 | .unwrap(); 28 | 29 | // Create a pty pair 30 | let (master_fd, slave_fd) = unsafe { 31 | let mut master: libc::c_int = -1; 32 | let mut slave: libc::c_int = -1; 33 | if libc::openpty( 34 | &mut master, 35 | &mut slave, 36 | std::ptr::null_mut(), 37 | std::ptr::null(), 38 | std::ptr::null(), 39 | ) != 0 40 | { 41 | anyhow::bail!("openpty failed: {:?}", std::io::Error::last_os_error()); 42 | } 43 | (OwnedFd::from_raw_fd(master), OwnedFd::from_raw_fd(slave)) 44 | }; 45 | 46 | let slave_fd_for_ioctl = slave_fd.try_clone().unwrap(); 47 | 48 | let conn = rt 49 | .block_on(async { VsockStream::connect(VsockAddr::new(2, 14)).await }) 50 | .expect("failed to connect to vsock (2, 14)"); 51 | 52 | rt.spawn(async move { 53 | set_nonblocking(master_fd.as_fd(), true).expect("failed to set nonblocking"); 54 | let master_fd = AsyncFd::new(master_fd).unwrap(); 55 | let (mut conn_r, mut conn_w) = conn.into_split(); 56 | let rd_fut = async { 57 | let mut buf = vec![0u8; 4096]; 58 | loop { 59 | let n = master_fd.async_io(Interest::READABLE, |x| nix::unistd::read(x, &mut buf).map_err(std::io::Error::from)).await?; 60 | if n == 0 { 61 | break; 62 | } 63 | let msg = ConsoleRequest::Data(buf[..n].to_vec()); 64 | let bytes = rkyv::to_bytes::(&msg)?; 65 | write_frame(&mut conn_w, &bytes).await.with_context(|| "writing frame")?; 66 | } 67 | Ok::<_, anyhow::Error>(()) 68 | }; 69 | 70 | let wr_fut = async { 71 | loop { 72 | let Ok(frame) = read_frame(&mut conn_r).await else { 73 | break; 74 | }; 75 | let archived = rkyv::access::(&frame)?; 76 | match archived { 77 | crate::console::ArchivedConsoleResponse::Data(data) => { 78 | let mut data = &data[..]; 79 | while !data.is_empty() { 80 | let n = master_fd.async_io(Interest::WRITABLE, |x| nix::unistd::write(x, data).map_err(std::io::Error::from)).await?; 81 | data = &data[n..]; 82 | } 83 | } 84 | crate::console::ArchivedConsoleResponse::SetWindowSize { rows, cols } => { 85 | unsafe { 86 | let mut ws: libc::winsize = std::mem::zeroed(); 87 | ws.ws_row = (*rows).into(); 88 | ws.ws_col = (*cols).into(); 89 | let _ = libc::ioctl(slave_fd_for_ioctl.as_raw_fd(), libc::TIOCSWINSZ, &ws); 90 | } 91 | } 92 | } 93 | } 94 | Ok::<_, anyhow::Error>(()) 95 | }; 96 | 97 | let ret = tokio::select! { 98 | biased; 99 | x = rd_fut => x, 100 | x = wr_fut => x, 101 | }; 102 | panic!("vm_console exited: {:?}", ret); 103 | }); 104 | 105 | // Keep runtime alive for the lifetime of the process 106 | std::mem::forget(rt); 107 | 108 | Ok(slave_fd) 109 | } 110 | 111 | pub fn host_run_console(rt: &Runtime, path: &Path) -> anyhow::Result> { 112 | let listener = rt.block_on(async { UnixListener::bind(path) })?; 113 | let task = rt.spawn(async move { 114 | // only accept once 115 | let Ok((conn, _)) = listener.accept().await else { 116 | return; 117 | }; 118 | 119 | // Enable raw mode on host tty for pass-through control characters 120 | struct TermiosGuard(Option<(i32, libc::termios)>); 121 | impl Drop for TermiosGuard { 122 | fn drop(&mut self) { 123 | if let Some((fd, orig)) = self.0.take() { 124 | unsafe { 125 | let _ = libc::tcsetattr(fd, libc::TCSANOW, &orig); 126 | } 127 | } 128 | } 129 | } 130 | 131 | let mut guard = TermiosGuard(None); 132 | let tty_fd = choose_tty_fd(); 133 | unsafe { 134 | if tty_fd >= 0 { 135 | let mut tio: libc::termios = std::mem::zeroed(); 136 | if libc::tcgetattr(tty_fd, &mut tio) == 0 { 137 | let orig = tio; 138 | tio.c_iflag &= 139 | !(libc::BRKINT | libc::ICRNL | libc::INPCK | libc::ISTRIP | libc::IXON); 140 | tio.c_oflag &= !(libc::OPOST); 141 | tio.c_cflag |= libc::CS8; 142 | tio.c_lflag &= !(libc::ECHO | libc::ICANON | libc::IEXTEN | libc::ISIG); 143 | tio.c_cc[libc::VMIN] = 1; 144 | tio.c_cc[libc::VTIME] = 0; 145 | let _ = libc::tcsetattr(tty_fd, libc::TCSANOW, &tio); 146 | guard.0 = Some((tty_fd, orig)); 147 | } 148 | } 149 | } 150 | 151 | let (mut conn_r, conn_w) = conn.into_split(); 152 | let conn_w = tokio::sync::Mutex::new(conn_w); 153 | 154 | // stdin -> tx 155 | let stdin_fut = async { 156 | let mut stdin = tokio::io::stdin(); 157 | let mut buf = vec![0u8; 4096]; 158 | loop { 159 | let n = match stdin.read(&mut buf).await { 160 | Ok(0) => break, 161 | Ok(n) => n, 162 | Err(_) => break, 163 | }; 164 | let msg = ConsoleResponse::Data(buf[..n].to_vec()); 165 | let bytes = match rkyv::to_bytes::(&msg) { 166 | Ok(b) => b, 167 | Err(_) => break, 168 | }; 169 | if write_frame(&mut *conn_w.lock().await, &bytes) 170 | .await 171 | .is_err() 172 | { 173 | break; 174 | } 175 | } 176 | }; 177 | 178 | // winsize updates via SIGWINCH 179 | let winsize_fut = async { 180 | if tty_fd < 0 { 181 | futures::future::pending::<()>().await; 182 | unreachable!(); 183 | } 184 | 185 | use tokio::signal::unix::{SignalKind, signal}; 186 | let mut sig = match signal(SignalKind::window_change()) { 187 | Ok(s) => s, 188 | Err(_) => return, 189 | }; 190 | let mut last_rows = 0u16; 191 | let mut last_cols = 0u16; 192 | let mut first_iteration = true; 193 | loop { 194 | if first_iteration { 195 | first_iteration = false; 196 | } else { 197 | if sig.recv().await.is_none() { 198 | break; 199 | } 200 | } 201 | unsafe { 202 | let mut ws: libc::winsize = std::mem::zeroed(); 203 | if libc::ioctl(tty_fd, libc::TIOCGWINSZ, &mut ws) != 0 { 204 | continue; 205 | } 206 | if ws.ws_row == 0 && ws.ws_col == 0 { 207 | continue; 208 | } 209 | if ws.ws_row == last_rows && ws.ws_col == last_cols { 210 | continue; 211 | } 212 | last_rows = ws.ws_row; 213 | last_cols = ws.ws_col; 214 | let msg = ConsoleResponse::SetWindowSize { 215 | rows: ws.ws_row, 216 | cols: ws.ws_col, 217 | }; 218 | let bytes = rkyv::to_bytes::(&msg).unwrap(); 219 | let _ = write_frame(&mut *conn_w.lock().await, &bytes).await; 220 | } 221 | } 222 | }; 223 | 224 | // VM -> stdout (ConsoleRequest::Data) 225 | let stdout_fut = async { 226 | let mut stdout = tokio::io::stdout(); 227 | loop { 228 | let frame = match read_frame(&mut conn_r).await { 229 | Ok(f) => f, 230 | Err(_) => break, 231 | }; 232 | let archived = match rkyv::access::< 233 | crate::console::ArchivedConsoleRequest, 234 | rkyv::rancor::Error, 235 | >(&frame) 236 | { 237 | Ok(a) => a, 238 | Err(_) => break, 239 | }; 240 | match archived { 241 | crate::console::ArchivedConsoleRequest::Data(data) => { 242 | if stdout.write_all(data).await.is_err() { 243 | break; 244 | } 245 | let _ = stdout.flush().await; 246 | } 247 | } 248 | } 249 | }; 250 | 251 | tokio::join!(stdin_fut, stdout_fut, winsize_fut); 252 | drop(guard); 253 | }); 254 | Ok(task) 255 | } 256 | 257 | async fn read_frame(conn: &mut R) -> anyhow::Result> 258 | where 259 | R: tokio::io::AsyncRead + Unpin, 260 | { 261 | let mut len_buf = [0u8; 4]; 262 | tokio::io::AsyncReadExt::read_exact(conn, &mut len_buf).await?; 263 | let len = u32::from_le_bytes(len_buf) as usize; 264 | let mut buf = vec![0u8; len]; 265 | tokio::io::AsyncReadExt::read_exact(conn, &mut buf).await?; 266 | Ok(buf) 267 | } 268 | 269 | async fn write_frame(conn: &mut W, bytes: &[u8]) -> anyhow::Result<()> 270 | where 271 | W: tokio::io::AsyncWrite + Unpin, 272 | { 273 | let len = bytes.len() as u32; 274 | tokio::io::AsyncWriteExt::write_all(conn, &len.to_le_bytes()).await?; 275 | tokio::io::AsyncWriteExt::write_all(conn, bytes).await?; 276 | tokio::io::AsyncWriteExt::flush(conn).await?; 277 | Ok(()) 278 | } 279 | 280 | // Resolve controlling TTY fd to use for termios/ioctl 281 | fn choose_tty_fd() -> i32 { 282 | for fd in [0, 1, 2] { 283 | let is_tty = unsafe { libc::isatty(fd) }; 284 | if is_tty == 1 { 285 | return fd; 286 | } 287 | } 288 | -1 289 | } 290 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # bake 2 | 3 | `bake` is a Linux CLI tool that can embed microVM resources (firecracker binary, kernel, initrd, boot disk) into itself. It also implements bidirectional communication between VM and host - including networking and directory sharing - entirely in userspace, without requiring root privilege. 4 | 5 | ## Usage 6 | 7 | The Docker image includes pre-packaged `bake`, firecracker, kernel and initrd binaries for amd64 and arm64 platforms. 8 | 9 | ```bash 10 | # make sure `./rootfs.squashfs.img` exists 11 | # create output directory 12 | $ mkdir -p output 13 | 14 | # assuming you are building on an amd64 host for an amd64 target 15 | $ docker run -it --rm \ 16 | -v ./rootfs.squashfs.img:/rootfs.img:ro \ 17 | -v ./output:/output \ 18 | --entrypoint /opt/bake/bake.amd64 \ 19 | ghcr.io/losfair/bake \ 20 | --input /opt/bake/bake.amd64 \ 21 | --firecracker /opt/bake/firecracker.amd64 \ 22 | --kernel /opt/bake/kernel.amd64 \ 23 | --initrd /opt/bake/initrd.amd64.img \ 24 | --rootfs /rootfs.img \ 25 | --output /output/app.elf 26 | 27 | # start microVM and print uname 28 | $ ./output/app.elf -- uname -a 29 | Linux container 6.1.149-bottlefire #1 SMP Sat Sep 6 13:50:25 UTC 2025 x86_64 GNU/Linux 30 | 31 | # show usage 32 | $ ./output/app.elf --help 33 | Bottlefire microVM Image 34 | 35 | Usage: app.elf [OPTIONS] [SUBCOMMAND] 36 | 37 | Options: 38 | --cpus Number of CPU cores 39 | --memory Amount of memory (in MB) allocated to the microVM [default: 256] 40 | --boot-args Kernel command line [default: "console=ttyS0 reboot=k panic=-1"] 41 | --entrypoint Container entrypoint 42 | -- Separator; everything after goes to the container 43 | --env Container environment variables 44 | --verbose Enable verbose output 45 | --cwd Container working directory [default: ] 46 | -p, --publish Publish host:vm port forward (e.g. -p 8080:8080) 47 | -v, --volume Directory/volume mappings (e.g. -v ./data:/data) 48 | --allow-net Allow outbound network to IPv4 address or CIDR (repeatable) 49 | --disable-hostnet Disable outbound network bridge 50 | --wireguard-conf-file Provide a WireGuard config (wg setconf format) 51 | -h, --help Print help 52 | 53 | Subcommands: 54 | ssh Auto-connect to the running microVM via SSH 55 | Options: -p, --pid 56 | Pass-through: arguments after `--` go to ssh(1) 57 | systemd Print a systemd service unit and exit 58 | ``` 59 | 60 | ## How it works 61 | 62 | Depending on whether embedded data is detected and whether running as PID 1, `bake` runs in one of the following modes: 63 | 64 | - If PID is 1 and env var `BAKE_NOT_INIT` is not `1`: vminit mode. `bake` assumes that it is running as the init task inside the Firecracker VM, and perform the init sequence. 65 | - If PID is not 1, and embedded data is detected: run mode - accept Firecracker startup parameters (e.g. number of CPUs, memory size, network config), extract kernel and initrd into memfd, start firecracker. 66 | - If PID is not 1, and embedded data is not detected: build mode - accept `--input`, `--firecracker`, `--kernel`, `--initrd`, `--rootfs`, build a binary from `/proc/self/exe` (or the provided input elf) with everything embedded. 67 | 68 | ### Init sequence (src/vminit.rs) 69 | 70 | When running as PID 1 inside the microVM, `bake` executes an init routine that prepares the root filesystem, host-guest connectivity, optional volume mounts, and finally launches the container process with `runc`. 71 | 72 | - Bootstrap system mounts and loopback 73 | - Mount `proc`, `sysfs`, `devtmpfs`, and unified `cgroup2`. 74 | - Bring `lo` up. 75 | 76 | - Parse kernel cmdline and banner 77 | - Read `/proc/cmdline`, parse `bake.*` parameters and `quiet`. 78 | - If not quiet, print a banner and `/proc/version` for diagnostics. 79 | - Fetch BootManifest from host vsock port 13 containing container runtime parameters. 80 | 81 | - Expose embedded rootfs via device-mapper 82 | - Read `bake.rootfs_offset` and `bake.rootfs_size` (sectors) from cmdline. 83 | - Create a linear mapping `rootfs` with `dmsetup` over `/dev/vda` at the given offset/size. 84 | 85 | - Build overlay root on top of ephemeral disk 86 | - Format `/dev/vdb` as ext4 and mount at `/ephemeral`. 87 | - Prepare overlay dirs: `/ephemeral/rootfs.overlay/{upper,work}` and `/ephemeral/container-tmp` (mode 1777). 88 | - Mount the base rootfs from `/dev/mapper/rootfs` at `/rootfs.base`. 89 | - Mount an overlay at `/rootfs` with `lowerdir=/rootfs.base`, `upperdir=/ephemeral/rootfs.overlay/upper`, `workdir=/ephemeral/rootfs.overlay/work`. 90 | 91 | - Set up host-guest networking over vsock with SOCKS5 and tun2socks 92 | - Inside the VM, start a SOCKS5 server listening on vsock port 10. 93 | - Start a small TCP proxy that exposes that vsock service on `127.0.0.10:10` for local clients. 94 | - Create a TUN device `hostnet` (L3), assign `198.18.0.1/32`, bring it up, and add a default route via `hostnet`. 95 | - Start a UDP bridge that exchanges UDP packets with the host over vsock port 11 (length-prefixed rkyv-encoded frames). 96 | - Add nftables and `ip rule` entries to policy-route UDP (fwmark `0x64`) via table 100 (via interface `hostudp` created by the UDP injector). 97 | - Launch `tun2socks` to route TCP over the local SOCKS5 proxy (`socks5://127.0.0.10:10`), keeping the VM’s loopback as the outgoing interface. 98 | 99 | - Mount shared volumes via 9p over vsock (optional) 100 | - If volumes are specified in the BootManifest, start a per-volume Unix-to-vsock proxy that connects to host vsock port 12 and first writes the length-prefixed guest path. 101 | - Mount each volume into the overlay root under `/rootfs` using `9p` with `trans=unix,version=9p2000.L` pointing at the per-volume UDS. 102 | 103 | - Launch the container with runc 104 | - Read container runtime params from BootManifest: 105 | - entrypoint, args, env variables, and working directory. 106 | - Create a container bundle at `/var/lib/container` and generate `config.json` (OCI runtime spec): 107 | - Root at `/rootfs` (overlay), terminal enabled, UID/GID 0, wide capabilities enabled. 108 | - Namespaces: `pid`, `ipc`, `uts`, `mount`. 109 | - Mounts: `proc`, `sys` (ro), `cgroup` (ro), `dev` (tmpfs) + `devpts`, bind `/etc/resolv.conf`, bind `/ephemeral/container-tmp` to `/tmp`. 110 | - PATH is set; `env`/`cwd` applied if specified. 111 | - Execute `runc run --no-pivot container1` in the bundle directory with stdio attached. 112 | 113 | - Shutdown 114 | - On container exit, log status (if non-zero) and trigger a reboot via `/proc/sysrq-trigger` (`b`). 115 | 116 | ### Host-side flow (run mode) 117 | 118 | When invoked on the host with embedded resources present, `bake` prepares resources, sets up vsock-backed host services, and launches Firecracker: 119 | 120 | - Embedded data and params 121 | - Locate embedded archive and rootfs trailer via the magic footer; deserialize metadata (firecracker, kernel, initrd, rootfs size, optional entrypoint/args/env/cwd). 122 | - Merge CLI overrides with embedded values and BAKE_VM_ environment variables. 123 | - Create a BootManifest containing entrypoint, args, env, cwd, and volumes, served to the VM over vsock port 13. 124 | - Compute and pass `bake.rootfs_offset` and `bake.rootfs_size` (in 512-byte sectors) so the guest can expose the rootfs from the host ELF. 125 | 126 | - Transient workspace and cleanup 127 | - Create a temp dir for Firecracker artifacts and UDS endpoints; install signal and panic hooks to remove it on exit. 128 | 129 | - Vsock endpoints for guest services 130 | - Start Unix-socket services that Firecracker's vsock backend connects to per guest port: 131 | - Port 10: SOCKS5 TCP proxy (for guest outbound TCP). 132 | - Port 11: UDP bridge/injector (guest<->host UDP via framed rkyv messages). 133 | - Port 12: 9p file server (guest volume mounts). 134 | - Port 13: BootManifest server (provides container runtime parameters to VM). 135 | - If `-v/--volume` is provided, start the 9p server and include volume mount points in the BootManifest. 136 | 137 | - Host TCP port forwards (`-p/--publish`) 138 | - For each `HOST:VM` mapping, bind a host TCP listener and, on accept, open a vsock connection (via the Firecracker UDS) to guest port 10, perform a SOCKS5 CONNECT to `127.0.0.1:VM`, and pipe data bidirectionally. 139 | 140 | - Memfd resources and drives 141 | - Copy firecracker, kernel, and initrd bytes into sealed `memfd`s (no CLOEXEC) and reference them by `/proc/self/fd/` paths. 142 | - Point Firecracker root drive at our own executable FD (read-only) so the guest can slice out the embedded rootfs; create a 2GiB ephemeral ext4 disk file (read-write) for overlay upper/work/tmp. 143 | 144 | - Firecracker launch 145 | - Generate a minimal config (boot source, two drives, vsock with `guest_cid=3`, no network interfaces, machine config for vCPUs/mem). Honor `--verbose` by adjusting log level. 146 | - Write the config to a `memfd`, then exec Firecracker with `--config-file --no-api --enable-pci`; set `PR_SET_PDEATHSIG=SIGKILL` to ensure teardown with the parent. 147 | - If `BAKE_DRY_RUN=1`, print the config JSON and exit instead of launching. 148 | 149 | ### SSH helper 150 | 151 | When a microVM is running, `bake` exposes two `memfd` FDs from the host process: 152 | 153 | - `memfd:ssh_proxy_path`: contains the Unix socket path for the host-side SSH proxy. 154 | - `memfd:id_ecdsa`: contains the private key used by the guest SSH server. 155 | 156 | To simplify connecting, `app.elf ssh` auto-discovers a running instance of the same binary, and then `exec`s `ssh` with the correct `ProxyCommand` and identity key: 157 | 158 | ```bash 159 | $ ./output/app.elf ssh 160 | 161 | # Or target a specific PID if multiple are running 162 | $ ./output/app.elf ssh --pid 1260276 163 | 164 | # Pass arbitrary ssh options after `--` 165 | $ ./output/app.elf ssh -- -L 8080:localhost:8080 -o ConnectTimeout=5 166 | ``` 167 | 168 | If multiple instances are running, it prints their PIDs and exits so you can stop the others and retry. 169 | 170 | ### Network allowlist 171 | 172 | Use `--allow-net` to restrict outbound network destinations from the guest (via the host SOCKS/UDP bridges). 173 | 174 | - Without any `--allow-net`, all destinations are allowed (default-allow). 175 | - Repeat the flag to allow multiple IPv4 addresses. 176 | - IPv6 is permitted only if it is IPv4-mapped and the mapped IPv4 appears in the allowlist. 177 | 178 | Examples: 179 | 180 | ```bash 181 | # Allow only 1.2.3.4 182 | $ ./output/app.elf --allow-net 1.2.3.4 -- curl http://1.2.3.4/ 183 | 184 | # Allow 1.2.3.4 and 8.8.8.8 185 | $ ./output/app.elf --allow-net 1.2.3.4 --allow-net 8.8.8.8 -- some_command 186 | ``` 187 | 188 | To disable proxied outbound network, add `--disable-hostnet`: 189 | 190 | ```bash 191 | $ ./output/app.elf --disable-hostnet -- some_command 192 | ``` 193 | 194 | ### WireGuard 195 | 196 | Pass a WireGuard config file with `--wireguard-conf-file`. In the guest, the interface `wg0` is created and configured using the `wg` CLI (not `wg-quick`). If the config contains `Address=` entries, they are applied to `wg0`. All `AllowedIPs` entries are parsed and added as routes via `wg0`. If omitted, configure addresses/routes yourself as needed. 197 | 198 | Example: 199 | 200 | ```bash 201 | $ ./output/app.elf --wireguard-conf-file ./wg.conf -- some_command 202 | ``` 203 | -------------------------------------------------------------------------------- /src/socks5.rs: -------------------------------------------------------------------------------- 1 | use std::{ 2 | collections::HashSet, 3 | net::{IpAddr, Ipv4Addr, SocketAddr, SocketAddrV4}, 4 | os::fd::{AsFd, OwnedFd}, 5 | path::Path, 6 | sync::{Arc, LazyLock, atomic::Ordering}, 7 | time::Duration, 8 | }; 9 | 10 | use fast_socks5::{ 11 | ReplyError, Socks5Command, consts, 12 | server::{AcceptAuthentication, Config, Socks5Socket}, 13 | util::target_addr::TargetAddr, 14 | }; 15 | use tokio::{ 16 | io::{AsyncRead, AsyncReadExt, AsyncWrite, AsyncWriteExt, BufReader, Interest, unix::AsyncFd}, 17 | net::{TcpListener, TcpStream, UdpSocket, UnixListener, UnixStream}, 18 | sync::{OnceCell, broadcast}, 19 | }; 20 | use tokio_vsock::{VsockAddr, VsockListener, VsockStream}; 21 | 22 | use crate::{ 23 | DEBUG, 24 | raw_udp::{ArchivedUdpPacket, RawUdp, UdpPacket}, 25 | util::{copy_bidirectional_fastclose, decompose_vsock_stream}, 26 | }; 27 | 28 | static UDPBUS_RX: LazyLock> = 29 | LazyLock::new(|| broadcast::Sender::new(128)); 30 | 31 | #[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)] 32 | struct Ipv4Cidr { 33 | network: u32, 34 | prefix: u8, 35 | } 36 | 37 | impl Ipv4Cidr { 38 | fn mask(prefix: u8) -> u32 { 39 | if prefix == 0 { 40 | 0 41 | } else { 42 | u32::MAX << (32 - prefix as u32) 43 | } 44 | } 45 | fn contains(&self, ip: Ipv4Addr) -> bool { 46 | let ip_u = u32::from(ip); 47 | let m = Self::mask(self.prefix); 48 | (ip_u & m) == (self.network & m) 49 | } 50 | } 51 | 52 | fn parse_ipv4_cidr_or_addr(s: &str) -> Option { 53 | if let Some((ip_s, pref_s)) = s.split_once('/') { 54 | let ip = ip_s.parse::().ok()?; 55 | let prefix = pref_s.parse::().ok()?; 56 | if prefix > 32 { 57 | return None; 58 | } 59 | let m = Ipv4Cidr::mask(prefix); 60 | let net = u32::from(ip) & m; 61 | Some(Ipv4Cidr { 62 | network: net, 63 | prefix, 64 | }) 65 | } else { 66 | let ip = s.parse::().ok()?; 67 | Some(Ipv4Cidr { 68 | network: u32::from(ip), 69 | prefix: 32, 70 | }) 71 | } 72 | } 73 | 74 | // Outbound network allowlist (IPv4 CIDRs). Empty/None means allow all. 75 | static ALLOW_NET: OnceCell>> = OnceCell::const_new(); 76 | 77 | pub fn set_allow_net(entries: Vec) { 78 | let mut set: HashSet = HashSet::new(); 79 | for e in entries { 80 | if let Some(c) = parse_ipv4_cidr_or_addr(e.trim()) { 81 | set.insert(c); 82 | } else if DEBUG.load(Ordering::Relaxed) { 83 | eprintln!("invalid --allow-net entry, ignoring: {}", e); 84 | } 85 | } 86 | let _ = ALLOW_NET.set(Arc::new(set)); 87 | } 88 | 89 | fn ip_allowed(ip: IpAddr) -> bool { 90 | match ALLOW_NET.get() { 91 | None => true, 92 | Some(set) => match ip { 93 | IpAddr::V4(v4) => set.iter().any(|c| c.contains(v4)), 94 | IpAddr::V6(v6) => v6 95 | .to_ipv4_mapped() 96 | .map_or(false, |v4| set.iter().any(|c| c.contains(v4))), 97 | }, 98 | } 99 | } 100 | 101 | pub fn run_socks5_unix(uds_path: &Path) -> anyhow::Result<()> { 102 | let rt = tokio::runtime::Builder::new_multi_thread() 103 | .enable_all() 104 | .worker_threads(1) 105 | .thread_name("bake-s5unix") 106 | .build() 107 | .unwrap(); 108 | let listener = rt.block_on(async { UnixListener::bind(uds_path) })?; 109 | rt.spawn(async move { 110 | loop { 111 | let Ok((conn, _)) = listener.accept().await else { 112 | break; 113 | }; 114 | tokio::spawn(async move { 115 | let Ok(conn) = conn.into_std() else { 116 | return; 117 | }; 118 | if let Err(e) = serve_socks5(conn.into()).await { 119 | if DEBUG.load(Ordering::Relaxed) { 120 | eprintln!("run_socks5_unix: {:?}", e); 121 | } 122 | } 123 | }); 124 | } 125 | }); 126 | std::mem::forget(rt); 127 | Ok(()) 128 | } 129 | 130 | pub fn run_socks5_udp_unix(uds_path: &Path) -> anyhow::Result<()> { 131 | let rt = tokio::runtime::Builder::new_multi_thread() 132 | .enable_all() 133 | .worker_threads(1) 134 | .thread_name("bake-s5udp") 135 | .build() 136 | .unwrap(); 137 | let listener = rt.block_on(async { UnixListener::bind(uds_path) })?; 138 | rt.spawn(async move { 139 | loop { 140 | let Ok((conn, _)) = listener.accept().await else { 141 | break; 142 | }; 143 | let Ok(sockfd) = conn.as_fd().try_clone_to_owned() else { 144 | continue; 145 | }; 146 | 147 | tokio::spawn(async move { 148 | if let Err(e) = serve(conn, sockfd, serve_socks5_udp).await { 149 | eprintln!("run_socks5_udp_unix: {:?}", e); 150 | } 151 | }); 152 | } 153 | }); 154 | std::mem::forget(rt); 155 | Ok(()) 156 | } 157 | 158 | pub fn run_socks5_vsock() -> anyhow::Result<()> { 159 | let rt = tokio::runtime::Builder::new_multi_thread() 160 | .enable_all() 161 | .worker_threads(1) 162 | .thread_name("bake-s5vsock") 163 | .build() 164 | .unwrap(); 165 | let listener = rt.block_on(async { VsockListener::bind(VsockAddr::new(u32::MAX, 10)) })?; 166 | rt.spawn(async move { 167 | loop { 168 | let Ok((conn, _)) = listener.accept().await else { 169 | break; 170 | }; 171 | tokio::spawn(async move { 172 | let Ok(conn) = decompose_vsock_stream(conn) else { 173 | return; 174 | }; 175 | if let Err(e) = serve_socks5(conn).await { 176 | eprintln!("run_socks5_vsock: {:?}", e); 177 | } 178 | }); 179 | } 180 | }); 181 | std::mem::forget(rt); 182 | Ok(()) 183 | } 184 | 185 | pub fn run_socks5_tcp_to_vsock_proxy() -> anyhow::Result<()> { 186 | let rt = tokio::runtime::Builder::new_multi_thread() 187 | .enable_all() 188 | .worker_threads(1) 189 | .thread_name("bake-s5t2v") 190 | .build() 191 | .unwrap(); 192 | let listener = rt.block_on(async { TcpListener::bind("127.0.0.10:10").await })?; 193 | rt.spawn(async move { 194 | loop { 195 | let Ok((conn, _)) = listener.accept().await else { 196 | break; 197 | }; 198 | tokio::spawn(async move { 199 | let outbound = tokio::select! { 200 | biased; 201 | _ = conn.ready(Interest::PRIORITY) => return, 202 | x = VsockStream::connect(VsockAddr::new(2, 10)) => x, 203 | }; 204 | let Ok(outbound) = outbound else { 205 | return; 206 | }; 207 | let Ok(outbound) = decompose_vsock_stream(outbound) else { 208 | return; 209 | }; 210 | let Ok(conn) = conn.into_std() else { 211 | return; 212 | }; 213 | let _ = copy_bidirectional_fastclose(conn.into(), outbound).await; 214 | }); 215 | } 216 | }); 217 | std::mem::forget(rt); 218 | Ok(()) 219 | } 220 | 221 | pub fn run_socks5_udp_injection(tun2socks_ifname: &str) -> anyhow::Result<()> { 222 | let rt = tokio::runtime::Builder::new_multi_thread() 223 | .enable_all() 224 | .worker_threads(1) 225 | .thread_name("bake-s5udpinj") 226 | .build() 227 | .unwrap(); 228 | let udp = 229 | rt.block_on(async { RawUdp::open(tun2socks_ifname, "198.18.0.2".parse().unwrap()) })?; 230 | let udp = Arc::new(udp); 231 | let host = rt.block_on(async { VsockStream::connect(VsockAddr::new(2, 11)).await })?; 232 | rt.spawn(async move { 233 | let (mut rx, mut tx) = tokio::io::split(host); 234 | let udp_rx = udp.clone(); 235 | let udp_tx = udp; 236 | 237 | let rx_fut = async { 238 | loop { 239 | let msg = udp_rx.recv().await?; 240 | let msg = rkyv::to_bytes::(&msg).unwrap(); 241 | tx.write_all(&(msg.len() as u32).to_le_bytes()).await?; 242 | tx.write_all(&msg).await?; 243 | } 244 | }; 245 | let tx_fut = async { 246 | let mut pkt = vec![0u8; 9600]; 247 | loop { 248 | let len = rx.read_u32_le().await? as usize; 249 | assert!(len <= pkt.len()); 250 | rx.read_exact(&mut pkt[..len]).await?; 251 | let pkt = &pkt[..len]; 252 | let pkt = rkyv::access::(pkt).unwrap(); 253 | 254 | // eprintln!( 255 | // "INJECT: {}:{}->{}:{}", 256 | // pkt.src_ip.as_ipv4(), 257 | // pkt.src_port.to_native(), 258 | // pkt.dst_ip.as_ipv4(), 259 | // pkt.dst_port.to_native(), 260 | // ); 261 | udp_tx 262 | .inject( 263 | pkt.src_ip.as_ipv4(), 264 | pkt.dst_ip.as_ipv4(), 265 | pkt.src_port.to_native(), 266 | pkt.dst_port.to_native(), 267 | &pkt.payload[..], 268 | 64, 269 | ) 270 | .await?; 271 | } 272 | }; 273 | let res: anyhow::Result<()> = tokio::select! { 274 | biased; 275 | x = tx_fut => x, 276 | x = rx_fut => x, 277 | }; 278 | res.expect("udp injection task failed"); 279 | }); 280 | std::mem::forget(rt); 281 | Ok(()) 282 | } 283 | 284 | async fn serve< 285 | C: AsyncRead + AsyncWrite + Unpin + 'static, 286 | Fut: Future>, 287 | >( 288 | conn: C, 289 | sockfd: OwnedFd, 290 | f: impl FnOnce(C) -> Fut, 291 | ) -> anyhow::Result<()> { 292 | let sockfd = AsyncFd::with_interest(sockfd.as_fd(), Interest::PRIORITY)?; 293 | tokio::select! { 294 | biased; 295 | _ = sockfd.ready(Interest::PRIORITY) => { 296 | // eprintln!("PRIORITY received, shutting down"); 297 | Ok(()) 298 | } 299 | x = f(conn) => x 300 | } 301 | } 302 | 303 | async fn serve_socks5(conn: OwnedFd) -> anyhow::Result<()> { 304 | static CONFIG: OnceCell>> = OnceCell::const_new(); 305 | let config = CONFIG 306 | .get_or_init(|| async { 307 | let mut config = Config::default(); 308 | config.set_dns_resolve(false); 309 | config.set_execute_command(false); 310 | Arc::new(config) 311 | }) 312 | .await; 313 | let conn = UnixStream::from_std(std::os::unix::net::UnixStream::from(conn))?; 314 | let sock = Socks5Socket::new(conn, config.clone()); 315 | let sock = sock.upgrade_to_socks5().await?; 316 | match sock.cmd() { 317 | Some(Socks5Command::TCPConnect) => { 318 | let target_addr = match sock.target_addr() { 319 | Some(x) => match x.clone().resolve_dns().await { 320 | Ok(TargetAddr::Ip(x)) => x, 321 | Ok(_) => panic!("unexpected target addr type"), 322 | Err(e) => { 323 | anyhow::bail!("dns resolution failed: {:?}: {:?}", x, e); 324 | } 325 | }, 326 | _ => anyhow::bail!("invalid target addr"), 327 | }; 328 | // Check allowlist before proceeding 329 | if !ip_allowed(target_addr.ip()) { 330 | if DEBUG.load(Ordering::Relaxed) { 331 | eprintln!("TCP blocked by allowlist: {}", target_addr); 332 | } 333 | let mut sock = sock.into_inner(); 334 | sock.write_all(&new_reply(&ReplyError::ConnectionNotAllowed, target_addr)) 335 | .await?; 336 | return Ok(()); 337 | } 338 | let mut sock = sock.into_inner(); 339 | sock.write_all(&new_reply(&ReplyError::Succeeded, target_addr)) 340 | .await?; 341 | let sock = sock.into_std()?; 342 | let outbound = TcpStream::connect(target_addr).await?; 343 | outbound.set_nodelay(true)?; 344 | 345 | let _ = copy_bidirectional_fastclose(sock.into(), outbound.into_std()?.into()).await; 346 | } 347 | _ => {} 348 | } 349 | Ok(()) 350 | } 351 | 352 | async fn serve_socks5_udp( 353 | conn: impl AsyncRead + AsyncWrite + Unpin + 'static, 354 | ) -> anyhow::Result<()> { 355 | // vm local port -> host socket 356 | let portmap: moka::sync::Cache, Arc>)> = 357 | moka::sync::Cache::builder() 358 | .time_to_idle(Duration::from_secs(60)) 359 | .build(); 360 | let (rx, mut tx) = tokio::io::split(conn); 361 | let rx_fut = async { 362 | let mut rx = BufReader::new(rx); 363 | let mut buf = vec![0u8; 9600]; 364 | loop { 365 | let len = rx.read_u32_le().await? as usize; 366 | if len > buf.len() { 367 | anyhow::bail!("packet too large"); 368 | } 369 | rx.read_exact(&mut buf[..len]).await?; 370 | let packet = rkyv::access::(&buf[..len])?; 371 | let client_addr = 372 | SocketAddrV4::new(packet.src_ip.as_ipv4(), packet.src_port.to_native()); 373 | let socket = portmap 374 | .try_get_with(client_addr.port(), || portmap_elem_init(client_addr)) 375 | .map_err(|e: Arc| { 376 | anyhow::anyhow!("failed to create host udp socket: {:?}", e) 377 | })? 378 | .0; 379 | // Allowlist check for UDP destination 380 | if !ip_allowed(IpAddr::V4(packet.dst_ip.as_ipv4())) { 381 | if DEBUG.load(Ordering::Relaxed) { 382 | eprintln!( 383 | "UDP blocked by allowlist: {}:{}", 384 | packet.dst_ip.as_ipv4(), 385 | packet.dst_port.to_native() 386 | ); 387 | } 388 | continue; 389 | } 390 | socket 391 | .send_to( 392 | &packet.payload, 393 | SocketAddr::new(packet.dst_ip.as_ipv4().into(), packet.dst_port.to_native()), 394 | ) 395 | .await?; 396 | if DEBUG.load(Ordering::Relaxed) { 397 | eprintln!( 398 | "UDP TX {}:{}->{}:{}", 399 | packet.src_ip.as_ipv4(), 400 | packet.src_port.to_native(), 401 | packet.dst_ip.as_ipv4(), 402 | packet.dst_port.to_native(), 403 | ); 404 | } 405 | } 406 | }; 407 | let tx_fut = async { 408 | let mut sub = UDPBUS_RX.subscribe(); 409 | loop { 410 | let pkt = match sub.recv().await { 411 | Ok(x) => x, 412 | Err(broadcast::error::RecvError::Lagged(_)) => continue, 413 | Err(broadcast::error::RecvError::Closed) => unreachable!(), 414 | }; 415 | let pkt = rkyv::to_bytes::(&pkt).unwrap(); 416 | tx.write_all(&(pkt.len() as u32).to_le_bytes()).await?; 417 | tx.write_all(&pkt).await?; 418 | tx.flush().await?; 419 | } 420 | }; 421 | tokio::select! { 422 | biased; 423 | x = rx_fut => x, 424 | x = tx_fut => x, 425 | } 426 | } 427 | 428 | // copied from fast_socks5 429 | fn new_reply(error: &ReplyError, sock_addr: SocketAddr) -> Vec { 430 | let (addr_type, mut ip_oct, mut port) = match sock_addr { 431 | SocketAddr::V4(sock) => ( 432 | consts::SOCKS5_ADDR_TYPE_IPV4, 433 | sock.ip().octets().to_vec(), 434 | sock.port().to_be_bytes().to_vec(), 435 | ), 436 | SocketAddr::V6(sock) => ( 437 | consts::SOCKS5_ADDR_TYPE_IPV6, 438 | sock.ip().octets().to_vec(), 439 | sock.port().to_be_bytes().to_vec(), 440 | ), 441 | }; 442 | 443 | let mut reply = vec![ 444 | consts::SOCKS5_VERSION, 445 | error.as_u8(), // transform the error into byte code 446 | 0x00, // reserved 447 | addr_type, // address type (ipv4, v6, domain) 448 | ]; 449 | reply.append(&mut ip_oct); 450 | reply.append(&mut port); 451 | 452 | reply 453 | } 454 | 455 | fn portmap_elem_init( 456 | client_addr: SocketAddrV4, 457 | ) -> anyhow::Result<(Arc, Arc>)> { 458 | let socket = std::net::UdpSocket::bind("[::]:0")?; 459 | socket.set_nonblocking(true)?; 460 | let socket = Arc::new(tokio::net::UdpSocket::from_std(socket)?); 461 | let socket_clone = socket.clone(); 462 | let (kill_tx, kill_rx) = tokio::sync::oneshot::channel(); 463 | tokio::spawn(async move { 464 | let fut = async { 465 | let mut pkt = vec![0u8; 9600]; 466 | loop { 467 | let Ok((n, mut addr)) = socket_clone.recv_from(&mut pkt).await else { 468 | break; 469 | }; 470 | if let IpAddr::V6(x) = addr.ip() { 471 | if let Some(x) = x.to_ipv4_mapped() { 472 | addr.set_ip(IpAddr::V4(x)); 473 | } 474 | } 475 | let IpAddr::V4(ip) = addr.ip() else { 476 | if DEBUG.load(Ordering::Relaxed) { 477 | eprintln!("dropping packet with invalid source: {}", addr); 478 | } 479 | continue; 480 | }; 481 | if DEBUG.load(Ordering::Relaxed) { 482 | eprintln!( 483 | "UDP RX: {}:{} -> {}:{}", 484 | ip, 485 | addr.port(), 486 | client_addr.ip(), 487 | client_addr.port() 488 | ); 489 | } 490 | let _ = UDPBUS_RX.send(UdpPacket { 491 | src_ip: ip, 492 | dst_ip: *client_addr.ip(), 493 | src_port: addr.port(), 494 | dst_port: client_addr.port(), 495 | payload: pkt[..n].to_vec(), 496 | }); 497 | } 498 | }; 499 | tokio::select! { 500 | biased; 501 | _ = fut => {} 502 | _ = kill_rx => {} 503 | } 504 | }); 505 | Ok((socket, Arc::new(kill_tx))) 506 | } 507 | -------------------------------------------------------------------------------- /src/vminit.rs: -------------------------------------------------------------------------------- 1 | use crate::util::{ 2 | ArchivedVolumeManifest, best_effort_raise_fd_limit, copy_bidirectional_fastclose, 3 | decompose_vsock_stream, set_nonblocking, 4 | }; 5 | use anyhow::Context; 6 | use nix::mount::MsFlags; 7 | use serde_json::json; 8 | use std::{ 9 | borrow::Cow, 10 | collections::HashMap, 11 | fs::{self, OpenOptions, Permissions}, 12 | io::Write, 13 | net::{IpAddr, SocketAddr}, 14 | os::{ 15 | fd::{AsFd, AsRawFd}, 16 | unix::{ 17 | fs::{OpenOptionsExt, PermissionsExt}, 18 | process::CommandExt, 19 | }, 20 | }, 21 | path::Path, 22 | process::{Command, ExitStatus, Stdio}, 23 | str::FromStr, 24 | sync::atomic::Ordering, 25 | }; 26 | use tokio::{io::AsyncReadExt, net::UnixListener}; 27 | 28 | use crate::{DEBUG, util::ArchivedBootManifest}; 29 | use tokio_vsock::{VsockAddr, VsockListener, VsockStream}; 30 | 31 | const ALL_NS: &[&str] = &[ 32 | "CAP_AUDIT_CONTROL", 33 | "CAP_AUDIT_READ", 34 | "CAP_AUDIT_WRITE", 35 | "CAP_BLOCK_SUSPEND", 36 | "CAP_CHOWN", 37 | "CAP_DAC_OVERRIDE", 38 | "CAP_DAC_READ_SEARCH", 39 | "CAP_FOWNER", 40 | "CAP_FSETID", 41 | "CAP_IPC_LOCK", 42 | "CAP_IPC_OWNER", 43 | "CAP_KILL", 44 | "CAP_LEASE", 45 | "CAP_LINUX_IMMUTABLE", 46 | "CAP_MAC_ADMIN", 47 | "CAP_MAC_OVERRIDE", 48 | "CAP_MKNOD", 49 | "CAP_NET_ADMIN", 50 | "CAP_NET_BIND_SERVICE", 51 | "CAP_NET_BROADCAST", 52 | "CAP_NET_RAW", 53 | "CAP_SETGID", 54 | "CAP_SETFCAP", 55 | "CAP_SETPCAP", 56 | "CAP_SETUID", 57 | "CAP_SYS_ADMIN", 58 | "CAP_SYS_BOOT", 59 | "CAP_SYS_CHROOT", 60 | "CAP_SYS_MODULE", 61 | "CAP_SYS_NICE", 62 | "CAP_SYS_PACCT", 63 | "CAP_SYS_PTRACE", 64 | "CAP_SYS_RAWIO", 65 | "CAP_SYS_RESOURCE", 66 | "CAP_SYS_TIME", 67 | "CAP_SYS_TTY_CONFIG", 68 | "CAP_SYSLOG", 69 | "CAP_WAKE_ALARM", 70 | ]; 71 | 72 | pub fn run() -> anyhow::Result<()> { 73 | cmd(r#"set -e 74 | mount -t proc proc /proc 75 | mount -t sysfs sysfs /sys 76 | mount -t devtmpfs devtmpfs /dev 77 | mkdir -p /dev/pts 78 | mount -t devpts devpts /dev/pts 79 | mount -t cgroup2 cgroup2 /sys/fs/cgroup 80 | ip link set lo up 81 | "#); 82 | let cmdline = Box::leak( 83 | std::fs::read_to_string("/proc/cmdline") 84 | .unwrap() 85 | .into_boxed_str(), 86 | ) 87 | .split(' ') 88 | .filter(|x| !x.is_empty()) 89 | .map(|x| x.split_once('=').unwrap_or((x, ""))) 90 | .collect::>(); 91 | let quiet = cmdline.contains_key("quiet"); 92 | if !quiet { 93 | DEBUG.store(true, Ordering::Relaxed); 94 | println!("Bottlefire v0.1.0"); 95 | cmd("cat /proc/version"); 96 | } 97 | best_effort_raise_fd_limit(); 98 | let rootfs_offset = cmdline 99 | .get("bake.rootfs_offset") 100 | .and_then(|x| x.parse::().ok()) 101 | .expect("bake.rootfs_offset not found"); 102 | let rootfs_size = cmdline 103 | .get("bake.rootfs_size") 104 | .and_then(|x| x.parse::().ok()) 105 | .expect("bake.rootfs_size not found"); 106 | cmd(&format!( 107 | "echo '0 {} linear /dev/vda {}' | dmsetup create rootfs", 108 | rootfs_size, rootfs_offset 109 | )); 110 | cmd(r#" 111 | set -e 112 | mkdir /rootfs.base /rootfs /ephemeral 113 | mkfs.ext4 -q /dev/vdb 114 | mount -t ext4 /dev/vdb /ephemeral 115 | mkdir -p /ephemeral/rootfs.overlay/upper /ephemeral/rootfs.overlay/work /ephemeral/container-tmp 116 | chmod 1777 /ephemeral/container-tmp 117 | mount /dev/mapper/rootfs /rootfs.base 118 | mount -t overlay -o rw,lowerdir=/rootfs.base,upperdir=/ephemeral/rootfs.overlay/upper,workdir=/ephemeral/rootfs.overlay/work overlay /rootfs 119 | "#); 120 | if !quiet { 121 | cmd("ls /dev; mount"); 122 | } 123 | 124 | // Fetch boot manifest 125 | let mut boot_manifest = Vec::new(); 126 | tokio::runtime::Builder::new_current_thread() 127 | .enable_all() 128 | .build() 129 | .unwrap() 130 | .block_on(async { 131 | VsockStream::connect(VsockAddr::new(2, 13)) 132 | .await? 133 | .read_to_end(&mut boot_manifest) 134 | .await 135 | })?; 136 | let boot_manifest = rkyv::access::(&boot_manifest) 137 | .with_context(|| "invalid boot request")?; 138 | 139 | // start socks5 server 140 | crate::socks5::run_socks5_vsock().expect("failed to start socks5 server"); 141 | 142 | // proxy socks5 to host 143 | crate::socks5::run_socks5_tcp_to_vsock_proxy().expect("failed to start socks5 host proxy"); 144 | 145 | crate::socks5::run_socks5_udp_injection("hostudp").expect("failed to start udp injection task"); 146 | if !boot_manifest.disable_hostnet { 147 | // configure udp routing and start tun2socks 148 | cmd(r#"set -e 149 | ip route add default dev hostudp table 100 150 | nft add table inet mangle 151 | nft 'add chain inet mangle output { type route hook output priority mangle; }' 152 | nft 'add rule inet mangle output meta l4proto udp meta mark set 0x64' 153 | ip rule add preference 100 fwmark 0x64 lookup 100 154 | 155 | ip tuntap add mode tun dev hostnet 156 | ip addr add 198.18.0.1/32 dev hostnet 157 | ip link set dev hostnet up 158 | ip route add default dev hostnet 159 | "#); 160 | let mut tun2socks = Command::new("/usr/bin/tun2socks") 161 | .arg("-device") 162 | .arg("hostnet") 163 | .arg("-proxy") 164 | .arg("socks5://127.0.0.10:10") 165 | .arg("-interface") 166 | .arg("lo") 167 | .stdin(Stdio::null()) 168 | .stdout(if quiet { 169 | Stdio::null() 170 | } else { 171 | Stdio::inherit() 172 | }) 173 | .stderr(if quiet { 174 | Stdio::null() 175 | } else { 176 | Stdio::inherit() 177 | }) 178 | .spawn() 179 | .unwrap(); 180 | std::thread::spawn(move || { 181 | let ret = tun2socks.wait(); 182 | panic!("tun2socks exited: {:?}", ret); 183 | }); 184 | } 185 | 186 | // Configure WireGuard if provided by host 187 | if let Some(conf) = boot_manifest.wireguard_conf.as_deref() { 188 | std::fs::write("/ephemeral/wg.conf", conf).expect("failed to write wg.conf"); 189 | let parsed = crate::wireguard::parse_wireguard_conf(conf); 190 | let sanitized = crate::wireguard::serialize_without_keys(&parsed, &["address", "dns"]); 191 | std::fs::write("/ephemeral/wg.setconf", sanitized).expect("failed to write wg.setconf"); 192 | use std::collections::BTreeSet; 193 | let mut addr_set = BTreeSet::new(); 194 | let mut allowed_set = BTreeSet::new(); 195 | let mut endpoints = BTreeSet::new(); 196 | for sec in &parsed.sections { 197 | if sec.name.eq_ignore_ascii_case("interface") { 198 | for (k, v) in &sec.items { 199 | if k.eq_ignore_ascii_case("address") { 200 | for item in v { 201 | addr_set.insert(item.clone()); 202 | } 203 | } 204 | } 205 | } else if sec.name.eq_ignore_ascii_case("peer") { 206 | for (k, v) in &sec.items { 207 | if k.eq_ignore_ascii_case("allowedips") { 208 | for item in v { 209 | allowed_set.insert(item.clone()); 210 | } 211 | } 212 | if k.eq_ignore_ascii_case("endpoint") { 213 | for item in v { 214 | endpoints.insert(item.clone()); 215 | } 216 | } 217 | } 218 | } 219 | } 220 | // Create interface, apply config, assign addresses, bring up 221 | cmd("ip link add dev wg0 mtu 1280 type wireguard"); 222 | cmd("wg setconf wg0 /ephemeral/wg.setconf"); 223 | for addr in addr_set { 224 | cmd(&format!( 225 | "ip addr add {} dev wg0", 226 | shell_escape::escape(addr.into()) 227 | )); 228 | } 229 | cmd("ip link set up dev wg0"); 230 | for cidr in allowed_set { 231 | // Use -4/-6 depending on address family for clarity 232 | if cidr.contains(':') { 233 | cmd(&format!( 234 | "ip -6 route add {} dev wg0 table 99", 235 | shell_escape::escape(cidr.into()) 236 | )); 237 | } else { 238 | cmd(&format!( 239 | "ip -4 route add {} dev wg0 table 99", 240 | shell_escape::escape(cidr.into()) 241 | )); 242 | } 243 | } 244 | for endpoint in endpoints { 245 | let Ok(addr) = SocketAddr::from_str(&endpoint) else { 246 | continue; 247 | }; 248 | let IpAddr::V4(ip) = addr.ip() else { 249 | continue; 250 | }; 251 | let status = Command::new("/sbin/ip") 252 | .arg("route") 253 | .arg("add") 254 | .arg(ip.to_string()) 255 | .arg("dev") 256 | .arg("hostudp") 257 | .arg("table") 258 | .arg("99") 259 | .stdin(Stdio::inherit()) 260 | .stdout(Stdio::inherit()) 261 | .stderr(Stdio::inherit()) 262 | .status() 263 | .unwrap(); 264 | if !status.success() { 265 | panic!("ip route add dev hostudp failed"); 266 | } 267 | } 268 | cmd("ip rule add preference 99 from all lookup 99"); 269 | } 270 | 271 | if !boot_manifest.volumes.is_empty() { 272 | setup_9p_volumes(&boot_manifest.volumes); 273 | } 274 | 275 | std::fs::write( 276 | "/etc/ssh/sshd_config", 277 | r#"HostKey /etc/ssh/ssh_host_ecdsa_key 278 | PermitRootLogin without-password 279 | AuthorizedKeysFile .ssh/authorized_keys 280 | PasswordAuthentication no 281 | KbdInteractiveAuthentication no 282 | ForceCommand /ssh.sh 283 | "#, 284 | ) 285 | .with_context(|| "failed to write sshd_config")?; 286 | 287 | std::fs::write( 288 | "/ssh.sh", 289 | r#"#!/bin/sh 290 | set -e 291 | cd /var/lib/container 292 | if [ -z "$SSH_TTY" ]; then 293 | exec runc exec container1 sh -c "$SSH_ORIGINAL_COMMAND" 294 | else 295 | if [ -z "$SSH_ORIGINAL_COMMAND" ]; then 296 | exec runc exec -t container1 sh 297 | else 298 | exec runc exec -t container1 sh -c "$SSH_ORIGINAL_COMMAND" 299 | fi 300 | fi 301 | "#, 302 | ) 303 | .with_context(|| "failed to write ssh.sh")?; 304 | std::fs::set_permissions("/ssh.sh", Permissions::from_mode(0o555))?; 305 | 306 | std::fs::write( 307 | "/etc/ssh/ssh_host_ecdsa_key", 308 | boot_manifest.ssh_ecdsa_private_key.as_bytes(), 309 | ) 310 | .with_context(|| "failed to write ssh_host_ecdsa_key")?; 311 | std::fs::set_permissions("/etc/ssh/ssh_host_ecdsa_key", Permissions::from_mode(0o600))?; 312 | std::fs::create_dir_all("/root/.ssh")?; 313 | std::fs::write( 314 | "/root/.ssh/authorized_keys", 315 | boot_manifest.ssh_ecdsa_public_key.as_bytes(), 316 | ) 317 | .with_context(|| "failed to write authorized_keys")?; 318 | std::fs::set_permissions("/root/.ssh/authorized_keys", Permissions::from_mode(0o600))?; 319 | 320 | sshd_vsock(VsockAddr::new(u32::MAX, 22)) 321 | .with_context(|| "failed to start sshd vsock listener")?; 322 | 323 | let res = start_container( 324 | boot_manifest 325 | .uid 326 | .as_ref() 327 | .map(|x| x.to_native()) 328 | .unwrap_or(0), 329 | boot_manifest 330 | .gid 331 | .as_ref() 332 | .map(|x| x.to_native()) 333 | .unwrap_or(0), 334 | boot_manifest.entrypoint.as_ref(), 335 | &boot_manifest.args[..], 336 | boot_manifest 337 | .env 338 | .iter() 339 | .map(|(k, v)| format!("{}={}", k, v)) 340 | .chain( 341 | std::iter::once(|| "TERM=xterm".to_string()) 342 | .filter(|_| !boot_manifest.env.contains_key("TERM")) 343 | .map(|x| x()), 344 | ), 345 | boot_manifest.cwd.as_deref().unwrap_or_default(), 346 | )?; 347 | if !res.success() { 348 | eprintln!("exit status: {:?}", res); 349 | } 350 | // Respect kernel cmdline flag to avoid reboot for debugging 351 | if cmdline.contains_key("bake.noreboot") { 352 | if DEBUG.load(Ordering::Relaxed) { 353 | eprintln!("[vminit] bake.noreboot present; holding VM after container exit"); 354 | } 355 | loop { 356 | std::thread::sleep(std::time::Duration::from_secs(3600)); 357 | } 358 | } else { 359 | unsafe { 360 | libc::sync(); 361 | } 362 | if !DEBUG.load(Ordering::Relaxed) { 363 | let _ = std::fs::write("/proc/sys/kernel/printk", b"0"); 364 | } 365 | unsafe { 366 | libc::reboot(libc::RB_AUTOBOOT); 367 | } 368 | } 369 | 370 | Ok(()) 371 | } 372 | 373 | fn cmd(cmd: &str) { 374 | assert!( 375 | Command::new("/bin/busybox") 376 | .arg("sh") 377 | .arg("-c") 378 | .arg(cmd) 379 | .stdin(Stdio::inherit()) 380 | .stdout(Stdio::inherit()) 381 | .stderr(Stdio::inherit()) 382 | .status() 383 | .unwrap() 384 | .success() 385 | ); 386 | } 387 | 388 | fn setup_9p_volumes(vols: &[ArchivedVolumeManifest]) { 389 | // Create a lightweight runtime to host listeners; keep it alive. 390 | let rt = tokio::runtime::Builder::new_multi_thread() 391 | .enable_all() 392 | .worker_threads(1) 393 | .build() 394 | .unwrap(); 395 | 396 | // Base dir for unix sockets 397 | let base = "/ephemeral/9p-sock"; 398 | let _ = std::fs::create_dir_all(base); 399 | 400 | let mut vd_names = ('c'..='z').map(|x| format!("/dev/vd{}", x)); 401 | 402 | for (idx, vol) in vols.iter().enumerate() { 403 | let uds_path = format!("{}/vol{}.sock", base, idx); 404 | 405 | // Start a unix-to-vsock proxy that writes the guest path prefix 406 | let uds_path_clone = uds_path.clone(); 407 | rt.block_on(async { start_9p_unix_to_vsock_proxy(&uds_path_clone, &vol.guest_path) }) 408 | .expect("failed to start unix to vsock proxy"); 409 | 410 | let mount_it = |path: &Path| { 411 | nix::mount::mount( 412 | Some(uds_path.as_str()), 413 | path, 414 | Some("9p"), 415 | MsFlags::empty(), 416 | Some("trans=unix,version=9p2000.L"), 417 | ) 418 | .expect("9p mount failed"); 419 | }; 420 | let guest_path = Path::new("/rootfs").join(vol.guest_path.as_str().trim_start_matches('/')); 421 | if let Some(host_filename) = vol.host_filename.as_deref() { 422 | if vol.ext4 { 423 | let _ = std::fs::create_dir_all(&guest_path); 424 | let vd = vd_names.next().expect("too many ext4 volumes"); 425 | let status = Command::new("mount") 426 | .arg("-t") 427 | .arg("ext4") 428 | .arg("-o") 429 | .arg(if vol.ro { "ro,relatime" } else { "rw,relatime" }) 430 | .arg(vd) 431 | .arg(&guest_path) 432 | .stdin(Stdio::inherit()) 433 | .stdout(Stdio::inherit()) 434 | .stderr(Stdio::inherit()) 435 | .status() 436 | .unwrap(); 437 | if !status.success() { 438 | panic!("ext4 mount failed: {}", vol.guest_path); 439 | } 440 | } else { 441 | let filebase = format!("/filebase/{}", idx); 442 | let _ = std::fs::create_dir_all(&filebase); 443 | mount_it(Path::new(&filebase)); 444 | if let Some(parent) = guest_path.parent() { 445 | let _ = std::fs::create_dir_all(parent); 446 | } 447 | let _ = OpenOptions::new() 448 | .write(true) 449 | .create_new(true) 450 | .mode(0o000) 451 | .open(&guest_path); 452 | 453 | nix::mount::mount( 454 | Some(format!("{}/{}", filebase, host_filename).as_str()), 455 | guest_path.as_path(), 456 | None::<&str>, 457 | MsFlags::MS_BIND, 458 | None::<&str>, 459 | ) 460 | .expect("bind mount failed"); 461 | } 462 | } else { 463 | let _ = std::fs::create_dir_all(&guest_path); 464 | mount_it(&guest_path); 465 | } 466 | } 467 | 468 | // Keep runtime alive for the lifetime of the init process 469 | std::mem::forget(rt); 470 | } 471 | 472 | fn start_9p_unix_to_vsock_proxy(uds_path: &str, guest_path: &str) -> anyhow::Result<()> { 473 | let listener = UnixListener::bind(uds_path)?; 474 | let guest_path = guest_path.to_string(); 475 | tokio::spawn(async move { 476 | loop { 477 | let Ok((inbound, _)) = listener.accept().await else { 478 | break; 479 | }; 480 | let guest_path = guest_path.clone(); 481 | 482 | // Connect to host vsock CID 2, port 12 483 | // Then write the length-prefixed guest path, little-endian 484 | tokio::spawn(async move { 485 | let mut outbound = VsockStream::connect(VsockAddr::new(2, 12)).await?; 486 | 487 | let name_bytes = guest_path.as_bytes(); 488 | ::write_all( 489 | &mut outbound, 490 | &(name_bytes.len() as u32).to_le_bytes(), 491 | ) 492 | .await?; 493 | ::write_all(&mut outbound, name_bytes) 494 | .await?; 495 | 496 | let e = copy_bidirectional_fastclose( 497 | inbound.into_std()?.into(), 498 | decompose_vsock_stream(outbound)?, 499 | ) 500 | .await; 501 | if let Err(e) = e { 502 | eprintln!("9p proxy error: {:?}", e); 503 | } 504 | Ok::<_, anyhow::Error>(()) 505 | }); 506 | } 507 | }); 508 | Ok(()) 509 | } 510 | 511 | fn start_container( 512 | uid: u32, 513 | gid: u32, 514 | entrypoint: Option>, 515 | args: &[impl AsRef], 516 | env: impl Iterator, 517 | cwd: &str, 518 | ) -> anyhow::Result { 519 | // Create container directories 520 | cmd("mkdir -p /var/lib/container"); 521 | std::env::set_current_dir("/var/lib/container")?; 522 | 523 | // Create resolv.conf with Google DNS 524 | fs::write("/var/lib/container/resolv.conf", "nameserver 8.8.8.8\n")?; 525 | 526 | // Create hosts 527 | fs::write("/var/lib/container/hosts", "127.0.0.1 localhost\n")?; 528 | 529 | let mut env_vars = 530 | vec!["PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin".to_string()]; 531 | env_vars.extend(env); 532 | 533 | // Determine the process command and args 534 | let mut process_args = if let Some(entrypoint) = &entrypoint { 535 | vec![entrypoint.as_ref()] 536 | } else { 537 | vec![] 538 | }; 539 | process_args.extend(args.iter().map(|x| x.as_ref())); 540 | 541 | // Generate OCI runtime spec 542 | let spec = generate_oci_spec(uid, gid, &process_args, &env_vars, cwd); 543 | 544 | // Write config.json 545 | let mut config_file = fs::File::create("/var/lib/container/config.json")?; 546 | config_file.write_all(serde_json::to_string_pretty(&spec)?.as_bytes())?; 547 | drop(config_file); 548 | 549 | // Console bridge (vsock CID 2, port 14) 550 | let tty = crate::vm_console::start_console_bridge()?; 551 | 552 | // Start container with runc (we're already in the bundle directory) 553 | let mut cmd = Command::new("runc"); 554 | cmd.arg("run") 555 | .arg("--no-pivot") 556 | .arg("container1") 557 | .stdin(Stdio::inherit()) 558 | .stdout(Stdio::inherit()) 559 | .stderr(Stdio::inherit()); 560 | unsafe { 561 | let tty_fd = tty.as_raw_fd(); 562 | cmd.pre_exec(move || { 563 | libc::login_tty(tty_fd); 564 | Ok(()) 565 | }); 566 | } 567 | let status = cmd.status()?; 568 | 569 | Ok(status) 570 | } 571 | 572 | fn sshd_vsock(vsock_listen: VsockAddr) -> anyhow::Result<()> { 573 | let rt = tokio::runtime::Builder::new_multi_thread() 574 | .enable_all() 575 | .worker_threads(1) 576 | .build() 577 | .unwrap(); 578 | let sock = rt.block_on(async { VsockListener::bind(vsock_listen) })?; 579 | rt.spawn(async move { 580 | loop { 581 | let Ok((conn, _)) = sock.accept().await else { 582 | break; 583 | }; 584 | let mut cmd = Command::new("/usr/sbin/sshd"); 585 | cmd.arg("-i") 586 | .stdin(Stdio::null()) 587 | .stdout(Stdio::null()) 588 | .stderr(Stdio::inherit()); 589 | set_nonblocking(conn.as_fd(), false).expect("failed to set nonblocking"); 590 | let fd = conn.as_fd().as_raw_fd(); 591 | unsafe { 592 | let ppid = libc::getpid(); 593 | cmd.pre_exec(move || { 594 | if libc::prctl(libc::PR_SET_PDEATHSIG, libc::SIGKILL) != 0 595 | || libc::getppid() != ppid 596 | { 597 | libc::abort(); 598 | } 599 | if libc::dup2(fd, 0) < 0 || libc::dup2(fd, 1) < 0 { 600 | return Err(std::io::Error::last_os_error()); 601 | } 602 | Ok(()) 603 | }); 604 | } 605 | if let Err(e) = cmd.spawn() { 606 | eprintln!("failed to spawn sshd: {:?}", e); 607 | } 608 | } 609 | }); 610 | std::mem::forget(rt); 611 | Ok(()) 612 | } 613 | 614 | fn generate_oci_spec( 615 | uid: u32, 616 | gid: u32, 617 | args: &[&str], 618 | env: &[String], 619 | cwd: &str, 620 | ) -> serde_json::Value { 621 | json!({ 622 | "ociVersion": "1.0.0", 623 | "process": { 624 | "terminal": true, 625 | "user": { 626 | "uid": uid, 627 | "gid": gid 628 | }, 629 | "args": args, 630 | "env": env, 631 | "cwd": if cwd.is_empty() { "/" } else { cwd }, 632 | "capabilities": { 633 | "bounding": ALL_NS, 634 | "effective": ALL_NS, 635 | "inheritable": ALL_NS, 636 | "permitted": ALL_NS, 637 | "ambient": ALL_NS 638 | }, 639 | "rlimits": [ 640 | { 641 | "type": "RLIMIT_NOFILE", 642 | "hard": 1048576, 643 | "soft": 1048576 644 | } 645 | ], 646 | "noNewPrivileges": false 647 | }, 648 | "root": { 649 | "path": "/rootfs", 650 | "readonly": false 651 | }, 652 | "hostname": "container", 653 | "mounts": [ 654 | { 655 | "destination": "/proc", 656 | "type": "proc", 657 | "source": "proc" 658 | }, 659 | { 660 | "destination": "/dev", 661 | "type": "tmpfs", 662 | "source": "tmpfs", 663 | "options": [ 664 | "nosuid", 665 | "strictatime", 666 | "mode=755", 667 | "size=65536k" 668 | ] 669 | }, 670 | { 671 | "destination": "/dev/pts", 672 | "type": "devpts", 673 | "source": "devpts", 674 | "options": [ 675 | "nosuid", 676 | "noexec", 677 | "newinstance", 678 | "ptmxmode=0666", 679 | "mode=0620", 680 | "gid=5" 681 | ] 682 | }, 683 | { 684 | "destination": "/sys", 685 | "type": "sysfs", 686 | "source": "sysfs", 687 | "options": [ 688 | "nosuid", 689 | "noexec", 690 | "nodev", 691 | "ro" 692 | ] 693 | }, 694 | { 695 | "destination": "/sys/fs/cgroup", 696 | "type": "cgroup", 697 | "source": "cgroup", 698 | "options": [ 699 | "nosuid", 700 | "noexec", 701 | "nodev", 702 | "relatime", 703 | "ro" 704 | ] 705 | }, 706 | { 707 | "destination": "/etc/resolv.conf", 708 | "type": "bind", 709 | "source": "/var/lib/container/resolv.conf", 710 | "options": [ 711 | "bind", 712 | "ro" 713 | ] 714 | }, 715 | { 716 | "destination": "/etc/hosts", 717 | "type": "bind", 718 | "source": "/var/lib/container/hosts", 719 | "options": [ 720 | "bind", 721 | "ro" 722 | ] 723 | }, 724 | { 725 | "destination": "/tmp", 726 | "type": "bind", 727 | "source": "/ephemeral/container-tmp", 728 | "options": [ 729 | "bind", 730 | "rw" 731 | ] 732 | } 733 | ], 734 | "linux": { 735 | "resources": { 736 | "devices": [ 737 | { 738 | "allow": true, 739 | "access": "rwm" 740 | } 741 | ] 742 | }, 743 | "namespaces": [ 744 | { 745 | "type": "pid" 746 | }, 747 | { 748 | "type": "ipc" 749 | }, 750 | { 751 | "type": "uts" 752 | }, 753 | { 754 | "type": "mount" 755 | } 756 | ], 757 | "maskedPaths": [], 758 | "readonlyPaths": [] 759 | } 760 | }) 761 | } 762 | -------------------------------------------------------------------------------- /src/main.rs: -------------------------------------------------------------------------------- 1 | mod console; 2 | mod embed; 3 | mod fileshare; 4 | mod firecracker; 5 | mod raw_udp; 6 | mod socks5; 7 | mod ssh_launcher; 8 | mod util; 9 | mod vm_console; 10 | mod vminit; 11 | mod wireguard; 12 | 13 | use anyhow::Context; 14 | use bytes::Bytes; 15 | use clap::Parser; 16 | use memmap2::Mmap; 17 | use rand::Rng; 18 | use rkyv::{Archive, Deserialize, Serialize}; 19 | use std::collections::HashMap; 20 | use std::fs::{self, File, OpenOptions, Permissions}; 21 | use std::io::{self, Cursor, Read, Seek, SeekFrom, Write}; 22 | use std::os::raw::{c_char, c_void}; 23 | use std::os::unix::fs::{MetadataExt, OpenOptionsExt, PermissionsExt}; 24 | use std::os::unix::process::CommandExt; 25 | use std::path::{Path, PathBuf}; 26 | use std::process::{Command as ProcessCommand, Stdio}; 27 | use std::sync::atomic::{AtomicBool, Ordering}; 28 | use std::sync::{Arc, Mutex, OnceLock}; 29 | use tokio::net::UnixListener; 30 | use tokio::runtime::Runtime; 31 | 32 | use crate::embed::{EmbeddedInfo, get_embedded_data, write_embedded_data}; 33 | use crate::fileshare::spawn_file_server; 34 | use crate::firecracker::{BootSource, Drive, FirecrackerConfig, MachineConfig, VsockConfig}; 35 | use crate::util::{ 36 | BootManifest, VolumeManifest, align_up, best_effort_raise_fd_limit, 37 | copy_bidirectional_fastclose, 38 | }; 39 | use crate::util::{quote_systemd_string, vsock_uds_connect}; 40 | use crate::vm_console::host_run_console; 41 | use tokio::io::{AsyncReadExt, AsyncWriteExt}; 42 | 43 | static DEBUG: AtomicBool = AtomicBool::new(false); 44 | static TMP_BASE_DIR: Mutex> = Mutex::new(None); 45 | static RT: OnceLock = OnceLock::new(); 46 | 47 | #[derive(Archive, Deserialize, Serialize)] 48 | struct Embedded { 49 | firecracker: Bytes, 50 | kernel: Bytes, 51 | initrd: Bytes, 52 | rootfs_size: u64, 53 | entrypoint: Option, 54 | args: Vec, 55 | env: Vec, 56 | cwd: String, 57 | uid: Option, 58 | gid: Option, 59 | } 60 | 61 | fn main() -> anyhow::Result<()> { 62 | if std::env::var("BAKE_DEBUG").ok().as_deref() == Some("1") { 63 | DEBUG.store(true, Ordering::Relaxed); 64 | } 65 | 66 | if std::env::var("BAKE_NOT_INIT").ok().as_deref() != Some("1") && unsafe { libc::getpid() } == 1 67 | { 68 | return vminit::run(); 69 | } 70 | 71 | // Check if we have embedded data by looking for our custom sections 72 | let embedded = check_for_embedded_sections(); 73 | 74 | if let Some(embedded) = embedded { 75 | run_mode(embedded) 76 | } else { 77 | build_mode() 78 | } 79 | } 80 | 81 | #[derive(Debug, Parser)] 82 | #[command(name = "bake", about = "Embed Firecracker resources into a binary")] 83 | struct BuildArgs { 84 | #[arg(short, long, default_value = "/proc/self/exe")] 85 | input: String, 86 | 87 | #[arg(short, long)] 88 | output: String, 89 | 90 | #[arg(long, env = "BAKE_BUILD_FIRECRACKER")] 91 | firecracker: String, 92 | 93 | #[arg(long, env = "BAKE_BUILD_KERNEL")] 94 | kernel: String, 95 | 96 | #[arg(long, env = "BAKE_BUILD_INITRD")] 97 | initrd: String, 98 | 99 | #[arg(long, env = "BAKE_BUILD_ROOTFS")] 100 | rootfs: String, 101 | 102 | #[arg(long)] 103 | entrypoint: Option, 104 | 105 | #[arg(long)] 106 | arg: Vec, 107 | 108 | #[arg(long, value_name = "KEY=VALUE")] 109 | env: Vec, 110 | 111 | #[arg(long)] 112 | cwd: Option, 113 | 114 | #[arg(long)] 115 | uid: Option, 116 | 117 | #[arg(long)] 118 | gid: Option, 119 | } 120 | 121 | fn check_for_embedded_sections() -> Option<(EmbeddedInfo, &'static [u8], &'static ArchivedEmbedded)> 122 | { 123 | let info = get_embedded_data()?; 124 | let embedded_len = u32::from_le_bytes(info.data[0..4].try_into().unwrap()) as usize; 125 | let embedded = &info.data[16..16 + embedded_len]; 126 | let rootfs_offset = align_up(16 + embedded_len, 512); 127 | if DEBUG.load(Ordering::Relaxed) { 128 | eprintln!( 129 | "embedded data @ {:p}, header length {}, rootfs offset {}", 130 | info.data.as_ptr(), 131 | embedded_len, 132 | rootfs_offset 133 | ); 134 | } 135 | let archived = rkyv::access::(embedded) 136 | .expect("invalid archived data"); 137 | let rootfs = 138 | &info.data[rootfs_offset..rootfs_offset + archived.rootfs_size.to_native() as usize]; 139 | Some((info, rootfs, archived)) 140 | } 141 | 142 | fn build_mode() -> anyhow::Result<()> { 143 | let args = BuildArgs::parse(); 144 | 145 | let output_path = &args.output; 146 | let firecracker_path = &args.firecracker; 147 | let kernel_path = &args.kernel; 148 | let initrd_path = &args.initrd; 149 | let rootfs_path = &args.rootfs; 150 | let input_path = &args.input; 151 | let entrypoint = args.entrypoint.clone(); 152 | let args_vec: Vec = args.arg.clone(); 153 | let env_vec: Vec = args.env.clone(); 154 | let cwd = args.cwd.clone().unwrap_or_default(); 155 | let uid = args.uid.clone(); 156 | let gid = args.gid.clone(); 157 | 158 | // Read resource files 159 | let firecracker_data = &*Box::leak(Box::new(unsafe { 160 | Mmap::map(&File::open(firecracker_path)?)? 161 | })); 162 | let kernel_data = &*Box::leak(Box::new(unsafe { Mmap::map(&File::open(kernel_path)?)? })); 163 | let initrd_data = &*Box::leak(Box::new(unsafe { Mmap::map(&File::open(initrd_path)?)? })); 164 | let mut rootfs_file = File::open(rootfs_path)?; 165 | let rootfs_size = rootfs_file.metadata()?.size(); 166 | let input = unsafe { Mmap::map(&File::open(input_path)?)? }; 167 | let embedded = Embedded { 168 | firecracker: Bytes::from_static(firecracker_data), 169 | kernel: Bytes::from_static(kernel_data), 170 | initrd: Bytes::from_static(initrd_data), 171 | rootfs_size, 172 | entrypoint, 173 | args: args_vec, 174 | env: env_vec, 175 | cwd, 176 | uid, 177 | gid, 178 | }; 179 | let embedded = rkyv::to_bytes::(&embedded).expect("serialization failed"); 180 | let embedded_len = (embedded.len() as u32).to_le_bytes(); 181 | let align_fill_bytes_1 = align_up(embedded.len() + 16, 512) - (embedded.len() + 16); 182 | assert!(align_fill_bytes_1 < 512); 183 | let align_fill_bytes_1 = vec![0u8; align_fill_bytes_1]; 184 | let align_fill_bytes_2 = align_up(rootfs_size as usize, 512) - rootfs_size as usize; 185 | assert!(align_fill_bytes_2 < 512); 186 | let align_fill_bytes_2 = vec![0u8; align_fill_bytes_2]; 187 | 188 | // Create ELF with embedded sections 189 | let mut output_file = File::create(output_path)?; 190 | output_file.write_all(&input)?; 191 | write_embedded_data( 192 | &mut [ 193 | &mut Cursor::new(&embedded_len[..]), 194 | &mut Cursor::new(&[0u8; 12]), 195 | &mut Cursor::new(&embedded), 196 | &mut Cursor::new(&align_fill_bytes_1), 197 | &mut rootfs_file, 198 | &mut Cursor::new(&align_fill_bytes_2), 199 | ], 200 | &mut output_file, 201 | input.len(), 202 | )?; 203 | drop(output_file); 204 | 205 | fs::set_permissions(output_path, Permissions::from_mode(0o755))?; 206 | 207 | println!("{}", output_path); 208 | Ok(()) 209 | } 210 | 211 | #[derive(clap::Subcommand, Debug)] 212 | enum RunSubcommand { 213 | /// Connect to the running microVM via SSH 214 | Ssh { 215 | /// PID of the target instance 216 | #[arg(short = 'p', long = "pid")] 217 | pid: Option, 218 | /// Extra ssh(1) arguments after `--` 219 | #[arg(trailing_var_arg = true, last = true)] 220 | ssh_args: Vec, 221 | }, 222 | /// Print a systemd service unit for current options 223 | Systemd { 224 | /// Container arguments (after `--`) 225 | #[arg(trailing_var_arg = true, last = true)] 226 | container_args: Vec, 227 | }, 228 | } 229 | 230 | #[derive(Debug, Parser)] 231 | #[command(name = "bake", about = "Bottlefire microVM Image")] 232 | struct RunArgs { 233 | /// Number of CPU cores 234 | #[arg(long)] 235 | cpus: Option, 236 | 237 | /// Amount of memory (in MB) allocated to the microVM 238 | #[arg(long, default_value_t = 256)] 239 | memory: u32, 240 | 241 | /// Kernel command line 242 | #[arg(long = "boot-args", default_value = "console=ttyS0 reboot=k panic=-1")] 243 | boot_args: String, 244 | 245 | /// Container entrypoint 246 | #[arg(long)] 247 | entrypoint: Option, 248 | 249 | /// Container arguments (after `--`) 250 | #[arg(trailing_var_arg = true, last = true)] 251 | container_args: Vec, 252 | 253 | /// Container environment variables 254 | #[arg(short = 'e', long, value_name = "KEY=VALUE")] 255 | env: Vec, 256 | 257 | /// Enable verbose output 258 | #[arg(long)] 259 | verbose: bool, 260 | 261 | /// Container working directory 262 | #[arg(long, default_value = "")] 263 | cwd: String, 264 | 265 | /// Container user ID inside the microVM 266 | #[arg(long)] 267 | uid: Option, 268 | 269 | /// Container group ID inside the microVM 270 | #[arg(long)] 271 | gid: Option, 272 | 273 | /// Publish host:vm port forward (e.g. -p 8080:8080) 274 | #[arg(short = 'p', long = "publish", value_name = "HOST:VM")] 275 | publish: Vec, 276 | 277 | /// Directory/volume mappings (e.g. -v ./data:/data) 278 | #[arg(short = 'v', long = "volume", value_name = "HOST:VM[:ro]")] 279 | volume: Vec, 280 | 281 | /// Allow outbound network to IPv4 address or CIDR (repeatable) 282 | #[arg(long = "allow-net")] 283 | allow_net: Vec, 284 | 285 | /// Disable outbound network bridge 286 | #[arg(long = "disable-hostnet")] 287 | disable_hostnet: bool, 288 | 289 | /// WireGuard config file path (wg setconf format) 290 | #[arg(long = "wireguard-conf-file")] 291 | wireguard_conf_file: Option, 292 | 293 | /// Size of ephemeral disk (in MB) for overlay filesystem [default: 2048] 294 | #[arg(long, default_value_t = 2048)] 295 | ephemeral_disk_size: u32, 296 | 297 | /// Path to write SSH private key to 298 | #[arg(long, env = "BAKE_SSH_PRIVATE_KEY_PATH")] 299 | ssh_private_key_path: Option, 300 | 301 | /// Unix socket path to listen for SSH connections 302 | #[arg(long, env = "BAKE_SSH_SOCK_PATH")] 303 | ssh_sock_path: Option, 304 | 305 | /// Path to write SSH connect script to 306 | #[arg(long, env = "BAKE_SSH_SCRIPT_PATH")] 307 | ssh_script_path: Option, 308 | 309 | /// Subcommands for interacting with a running instance 310 | #[command(subcommand)] 311 | cmd: Option, 312 | } 313 | 314 | fn generate_systemd_unit(args: &RunArgs) -> anyhow::Result<()> { 315 | let executable_path = std::env::current_exe()?; 316 | let executable_path = executable_path.to_string_lossy(); 317 | 318 | let mut service_args = Vec::new(); 319 | 320 | if let Some(cpus) = args.cpus { 321 | service_args.push(format!("--cpus {}", cpus)); 322 | } 323 | 324 | service_args.push(format!("--memory {}", args.memory)); 325 | 326 | service_args.push(format!( 327 | "--boot-args {}", 328 | shell_escape::escape(args.boot_args.as_str().into()) 329 | )); 330 | 331 | if let Some(ref entrypoint) = args.entrypoint { 332 | service_args.push(format!( 333 | "--entrypoint {}", 334 | shell_escape::escape(entrypoint.as_str().into()) 335 | )); 336 | } 337 | 338 | for env in &args.env { 339 | service_args.push(format!( 340 | "--env {}", 341 | shell_escape::escape(env.as_str().into()) 342 | )); 343 | } 344 | 345 | if !args.cwd.is_empty() { 346 | service_args.push(format!( 347 | "--cwd {}", 348 | shell_escape::escape(args.cwd.as_str().into()) 349 | )); 350 | } 351 | 352 | if let Some(uid) = args.uid { 353 | service_args.push(format!("--uid {}", uid)); 354 | } 355 | if let Some(gid) = args.gid { 356 | service_args.push(format!("--gid {}", gid)); 357 | } 358 | 359 | for publish in &args.publish { 360 | service_args.push(format!( 361 | "--publish {}", 362 | shell_escape::escape(publish.as_str().into()) 363 | )); 364 | } 365 | 366 | for volume in &args.volume { 367 | service_args.push(format!( 368 | "--volume {}", 369 | shell_escape::escape(volume.as_str().into()) 370 | )); 371 | } 372 | 373 | for ip in &args.allow_net { 374 | service_args.push(format!("--allow-net {}", ip)); 375 | } 376 | if args.disable_hostnet { 377 | service_args.push("--disable-hostnet".into()); 378 | } 379 | if let Some(path) = &args.wireguard_conf_file { 380 | service_args.push(format!( 381 | "--wireguard-conf-file {}", 382 | shell_escape::escape(path.to_string_lossy().into()) 383 | )); 384 | } 385 | 386 | service_args.push(format!( 387 | "--ephemeral-disk-size {}", 388 | args.ephemeral_disk_size 389 | )); 390 | 391 | if !args.container_args.is_empty() { 392 | service_args.push("--".into()); 393 | for carg in &args.container_args { 394 | service_args.push(format!("{}", shell_escape::escape(carg.as_str().into()))); 395 | } 396 | } 397 | 398 | let args_str = if service_args.is_empty() { 399 | String::new() 400 | } else { 401 | format!(" \\\n {}", service_args.join(" \\\n ")) 402 | }; 403 | 404 | let mut service_file = format!( 405 | r#"[Unit] 406 | Description=Bottlefire microVM Service 407 | 408 | [Service] 409 | Type=simple 410 | ExecStart={}{} 411 | Restart=always 412 | RestartSec=5 413 | PrivateTmp=true 414 | ProtectSystem=strict 415 | CapabilityBoundingSet= 416 | NoNewPrivileges=true 417 | Environment=BAKE_SSH_PRIVATE_KEY_PATH=/tmp/id_ecdsa 418 | Environment=BAKE_SSH_SOCK_PATH=/tmp/ssh.sock 419 | Environment=BAKE_SSH_SCRIPT_PATH=/tmp/ssh.sh 420 | "#, 421 | executable_path, args_str 422 | ); 423 | 424 | for env in &args.env { 425 | service_file.push_str("Environment=\"BAKE_VM_"); 426 | service_file.push_str("e_systemd_string(env)); 427 | service_file.push_str("\"\n"); 428 | } 429 | 430 | service_file.push_str( 431 | r#" 432 | [Install] 433 | WantedBy=multi-user.target 434 | "#, 435 | ); 436 | 437 | print!("{}", service_file); 438 | Ok(()) 439 | } 440 | 441 | fn run_mode( 442 | (info, rootfs, embedded): (EmbeddedInfo, &'static [u8], &'static ArchivedEmbedded), 443 | ) -> anyhow::Result<()> { 444 | let mut parsed = RunArgs::parse(); 445 | 446 | // If a subcommand is specified, handle it and exit. 447 | if let Some(cmd) = &parsed.cmd { 448 | match cmd { 449 | RunSubcommand::Ssh { pid, ssh_args } => { 450 | return ssh_launcher::launch_ssh(*pid, ssh_args.clone()); 451 | } 452 | RunSubcommand::Systemd { container_args } => { 453 | parsed.container_args = container_args.clone(); 454 | return generate_systemd_unit(&parsed); 455 | } 456 | } 457 | } 458 | 459 | best_effort_raise_fd_limit(); 460 | 461 | RT.set( 462 | tokio::runtime::Builder::new_multi_thread() 463 | .enable_all() 464 | .worker_threads(1) 465 | .thread_name("bake-worker") 466 | .build() 467 | .unwrap(), 468 | ) 469 | .ok() 470 | .expect("RT.set()"); 471 | 472 | // Auto-detect CPUs if not provided 473 | let cpus: u32 = parsed.cpus.unwrap_or_else(|| { 474 | let n = unsafe { libc::sysconf(libc::_SC_NPROCESSORS_ONLN) }; 475 | if n > 0 { n as u32 } else { 1 } 476 | }); 477 | let memory = &parsed.memory; 478 | let boot_args = &parsed.boot_args; 479 | let verbose = &parsed.verbose; 480 | let cwd = &parsed.cwd; 481 | 482 | // CLI params take precedence over embedded params 483 | let entrypoint = parsed 484 | .entrypoint 485 | .as_ref() 486 | .map(|x| x.as_str()) 487 | .or_else(|| embedded.entrypoint.as_ref().map(|x| x.as_str())); 488 | let args: Vec = if !parsed.container_args.is_empty() { 489 | parsed.container_args.clone() 490 | } else { 491 | embedded.args.iter().map(|x| x.to_string()).collect() 492 | }; 493 | let mut env: HashMap = HashMap::new(); 494 | 495 | // Merge embedded env with BAKE_VM_ env and CLI env 496 | // Precedence: CLI > BAKE_VM_ > embedded 497 | 498 | // Embedded env 499 | for x in &*embedded.env { 500 | let Some((key, value)) = x.split_once('=') else { 501 | continue; 502 | }; 503 | env.insert(key.to_string(), value.to_string()); 504 | } 505 | 506 | // Collect BAKE_VM_ environment variables from host and strip prefix 507 | for (key, value) in std::env::vars() { 508 | let Some(stripped_key) = key.strip_prefix("BAKE_VM_") else { 509 | continue; 510 | }; 511 | env.insert(stripped_key.to_string(), value); 512 | } 513 | 514 | // Add CLI env vars 515 | for x in &parsed.env { 516 | let Some((key, value)) = x.split_once('=') else { 517 | continue; 518 | }; 519 | env.insert(key.to_string(), value.to_string()); 520 | } 521 | 522 | let cwd = if cwd.is_empty() { 523 | embedded.cwd.to_string() 524 | } else { 525 | cwd.clone() 526 | }; 527 | 528 | // Determine default uid/gid: CLI > embedded > None 529 | let uid: Option = parsed 530 | .uid 531 | .or_else(|| embedded.uid.as_ref().map(|x| x.to_native())); 532 | let gid: Option = parsed 533 | .gid 534 | .or_else(|| embedded.gid.as_ref().map(|x| x.to_native())); 535 | 536 | // Create memfd for firecracker binary 537 | let firecracker_path = unsafe { memfd_from_mmap("firecracker", &embedded.firecracker)? }; 538 | 539 | // Create memfd for kernel 540 | let kernel_path = unsafe { memfd_from_mmap("kernel", &embedded.kernel)? }; 541 | 542 | // Create memfd for initrd 543 | let initrd_path = unsafe { memfd_from_mmap("initrd", &embedded.initrd)? }; 544 | 545 | // No O_CLOEXEC to be inherited by firecracker 546 | let exe_fd = unsafe { 547 | libc::open( 548 | b"/proc/self/exe\0".as_ptr() as *const c_char, 549 | libc::O_RDONLY, 550 | ) 551 | }; 552 | let exe_path = format!("/proc/self/fd/{}", exe_fd); 553 | 554 | let rootfs_offset = unsafe { rootfs.as_ptr().offset_from(info.base.as_ptr()) }; 555 | assert!(rootfs_offset % 512 == 0); 556 | 557 | let mut boot_args = format!( 558 | "{} bake.rootfs_offset={} bake.rootfs_size={}", 559 | boot_args, 560 | rootfs_offset / 512, 561 | align_up(rootfs.len(), 512) / 512 562 | ); 563 | 564 | if !verbose { 565 | boot_args.push_str(" quiet"); 566 | } 567 | 568 | // Propagate host debug no-reboot flag into VM kernel cmdline 569 | if std::env::var("BAKE_NO_REBOOT").ok().as_deref() == Some("1") { 570 | boot_args.push_str(" bake.noreboot=1"); 571 | } 572 | 573 | let tmp_base_dir = std::env::temp_dir().join(format!( 574 | "bottlefire-bake-fc-{}", 575 | faster_hex::hex_string(&rand::rng().random::<[u8; 16]>()) 576 | )); 577 | std::fs::create_dir(&tmp_base_dir).with_context(|| { 578 | format!( 579 | "failed to create vsock base dir at {}", 580 | tmp_base_dir.display() 581 | ) 582 | })?; 583 | TMP_BASE_DIR 584 | .try_lock() 585 | .unwrap() 586 | .replace(tmp_base_dir.clone()); 587 | unsafe { 588 | libc::signal(libc::SIGTERM, term_signal as usize); 589 | libc::signal(libc::SIGINT, term_signal as usize); 590 | libc::signal(libc::SIGHUP, term_signal as usize); 591 | } 592 | scopeguard::defer! { 593 | let _ = std::fs::remove_dir_all(&tmp_base_dir); 594 | } 595 | let prev_hook = std::panic::take_hook(); 596 | std::panic::set_hook(Box::new(move |info| { 597 | if let Ok(x) = TMP_BASE_DIR.try_lock() { 598 | if let Some(path) = &*x { 599 | let _ = std::fs::remove_dir_all(path); 600 | } 601 | } 602 | prev_hook(info) 603 | })); 604 | let ephemeral_disk = tmp_base_dir.join("ephemeral.img"); 605 | let vsock_outbound_uds = tmp_base_dir.join("fc.sock"); 606 | let vsock_inbound_socks5_uds = tmp_base_dir.join("fc.sock_10"); 607 | let vsock_inbound_socks5_udp_uds = tmp_base_dir.join("fc.sock_11"); 608 | let vsock_inbound_9p = tmp_base_dir.join("fc.sock_12"); 609 | let vsock_inbound_boot_manifest_request = tmp_base_dir.join("fc.sock_13"); 610 | let vsock_inbound_console = tmp_base_dir.join("fc.sock_14"); 611 | // Apply network allowlist (if any) for outbound network 612 | if !parsed.allow_net.is_empty() || parsed.disable_hostnet { 613 | crate::socks5::set_allow_net(parsed.allow_net.clone()); 614 | } 615 | 616 | if !parsed.disable_hostnet { 617 | crate::socks5::run_socks5_unix(&vsock_inbound_socks5_uds) 618 | .with_context(|| "failed to start socks5 uds listener")?; 619 | } 620 | crate::socks5::run_socks5_udp_unix(&vsock_inbound_socks5_udp_uds) 621 | .with_context(|| "failed to start socks5 udp uds listener")?; 622 | let console_task = host_run_console(RT.get().unwrap(), &vsock_inbound_console) 623 | .with_context(|| "failed to start console listener")?; 624 | 625 | // Start requested TCP port forwards via vsock SOCKS5 (port 10) 626 | if !parsed.publish.is_empty() { 627 | let uds_path = vsock_outbound_uds 628 | .to_str() 629 | .expect("invalid vsock_outbound_uds") 630 | .to_string(); 631 | spawn_port_forwards(parsed.publish, uds_path); 632 | } 633 | 634 | // Start plan9 filesystem server 635 | let volumes = if !parsed.volume.is_empty() { 636 | spawn_file_server(parsed.volume, &vsock_inbound_9p) 637 | } else { 638 | vec![] 639 | }; 640 | 641 | if volumes.iter().filter(|x| x.ext4).count() > 20 { 642 | panic!("too many ext4 volumes, max 20"); 643 | } 644 | 645 | let ssh_ecdsa_private_key = ssh_key::PrivateKey::random( 646 | &mut rand_core_06::OsRng, 647 | ssh_key::Algorithm::Ecdsa { 648 | curve: ssh_key::EcdsaCurve::NistP256, 649 | }, 650 | ) 651 | .with_context(|| "failed to generate ssh key")?; 652 | let ssh_ecdsa_public_key = ssh_ecdsa_private_key.public_key().to_openssh().unwrap(); 653 | let ssh_ecdsa_private_key = ssh_ecdsa_private_key 654 | .to_openssh(ssh_key::LineEnding::LF) 655 | .unwrap() 656 | .to_string(); 657 | let ssh_ecdsa_private_key_path = mkmemfd( 658 | "id_ecdsa", 659 | ssh_ecdsa_private_key.as_bytes(), 660 | Permissions::from_mode(0o400), 661 | )?; 662 | mkmemfd( 663 | "id_ecdsa.pub", 664 | ssh_ecdsa_public_key.as_bytes(), 665 | Permissions::from_mode(0o400), 666 | )?; 667 | 668 | let ssh_ecdsa_private_key_path: &Path = if let Some(x) = &parsed.ssh_private_key_path { 669 | OpenOptions::new() 670 | .write(true) 671 | .create(true) 672 | .truncate(true) 673 | .mode(0o600) 674 | .open(x) 675 | .and_then(|mut x| x.write_all(ssh_ecdsa_private_key.as_bytes())) 676 | .with_context(|| "failed to write ssh key to BAKE_SSH_PRIVATE_KEY_PATH")?; 677 | x 678 | } else { 679 | Path::new(&ssh_ecdsa_private_key_path) 680 | }; 681 | 682 | let ssh_proxy_path = parsed 683 | .ssh_sock_path 684 | .unwrap_or_else(|| tmp_base_dir.join("ssh.sock")); 685 | serve_ssh_proxy(&ssh_proxy_path, &vsock_outbound_uds) 686 | .with_context(|| "failed to start ssh proxy service")?; 687 | mkmemfd( 688 | "ssh_proxy_path", 689 | ssh_proxy_path.as_os_str().as_encoded_bytes(), 690 | Permissions::from_mode(0o444), 691 | )?; 692 | 693 | if let Some(x) = &parsed.ssh_script_path { 694 | OpenOptions::new() 695 | .write(true) 696 | .create(true) 697 | .truncate(true) 698 | .mode(0o700) 699 | .open(x) 700 | .and_then(|mut x| x.write_all(format!(r#"#!/bin/sh 701 | exec ssh -i {} -o "ProxyCommand nc -U {}" -o "UserKnownHostsFile=/dev/null" -o "StrictHostKeyChecking=no" root@localhost 702 | "#, shell_escape::escape(ssh_ecdsa_private_key_path.to_string_lossy()), shell_escape::escape(ssh_proxy_path.to_string_lossy())).as_bytes())) 703 | .with_context(|| "failed to write to BAKE_SSH_SCRIPT_PATH")?; 704 | } 705 | 706 | let manifest = BootManifest { 707 | entrypoint: entrypoint.map(|x| x.to_string()), 708 | args, 709 | cwd: if cwd.is_empty() { None } else { Some(cwd) }, 710 | env, 711 | volumes: volumes 712 | .iter() 713 | .map(|x| VolumeManifest { 714 | guest_path: x.guest.clone(), 715 | host_filename: if x.is_file { 716 | Some( 717 | Path::new(&x.host) 718 | .file_name() 719 | .and_then(|x| x.to_str()) 720 | .unwrap_or_else(|| panic!("cannot determine host filename: {:?}", x)) 721 | .to_string(), 722 | ) 723 | } else { 724 | None 725 | }, 726 | ext4: x.ext4, 727 | ro: x.ro, 728 | }) 729 | .collect(), 730 | uid, 731 | gid, 732 | disable_hostnet: parsed.disable_hostnet, 733 | wireguard_conf: if let Some(ref path) = parsed.wireguard_conf_file { 734 | Some( 735 | std::fs::read_to_string(path) 736 | .with_context(|| "failed to read wireguard conf file")?, 737 | ) 738 | } else { 739 | None 740 | }, 741 | ssh_ecdsa_private_key, 742 | ssh_ecdsa_public_key, 743 | }; 744 | serve_boot_manifest_request(&vsock_inbound_boot_manifest_request, &manifest)?; 745 | 746 | let mut drives = vec![ 747 | Drive { 748 | drive_id: "rootfs".into(), 749 | is_root_device: true, 750 | is_read_only: true, 751 | io_engine: "Async".into(), 752 | path_on_host: exe_path, 753 | }, 754 | Drive { 755 | drive_id: "ephemeral".into(), 756 | is_root_device: false, 757 | is_read_only: false, 758 | io_engine: "Async".into(), 759 | path_on_host: ephemeral_disk 760 | .to_str() 761 | .expect("invalid ephemeral disk path") 762 | .to_string(), 763 | }, 764 | ]; 765 | 766 | for (i, vol) in volumes.iter().enumerate() { 767 | if vol.ext4 { 768 | drives.push(Drive { 769 | drive_id: format!("vol-{}", i), 770 | is_root_device: false, 771 | is_read_only: vol.ro, 772 | io_engine: "Async".into(), 773 | path_on_host: vol.host.clone(), 774 | }) 775 | } 776 | } 777 | 778 | { 779 | // Convert MB to bytes 780 | let disk_size: u64 = parsed.ephemeral_disk_size as u64 * 1024 * 1024; 781 | let mut disk = OpenOptions::new() 782 | .write(true) 783 | .create_new(true) 784 | .open(&ephemeral_disk) 785 | .with_context(|| "failed to open ephemeral disk")?; 786 | disk.seek(SeekFrom::Start(disk_size - 1)) 787 | .and_then(|_| disk.write(&[0u8])) 788 | .with_context(|| "failed to initialize ephemeral disk")?; 789 | } 790 | 791 | let firecracker_config = FirecrackerConfig { 792 | boot_source: BootSource { 793 | kernel_image_path: kernel_path, 794 | initrd_path, 795 | boot_args, 796 | }, 797 | drives, 798 | machine_config: MachineConfig { 799 | vcpu_count: cpus, 800 | mem_size_mib: *memory, 801 | }, 802 | network_interfaces: vec![], 803 | vsock: VsockConfig { 804 | guest_cid: 3, 805 | uds_path: vsock_outbound_uds 806 | .to_str() 807 | .expect("invalid vsock_outbound_uds") 808 | .to_string(), 809 | }, 810 | }; 811 | 812 | // Check for dry run mode 813 | if std::env::var("BAKE_DRY_RUN").ok().as_deref() == Some("1") { 814 | let config_json = serde_json::to_string_pretty(&firecracker_config)?; 815 | println!("{}", config_json); 816 | return Ok(()); 817 | } 818 | 819 | let config_json = serde_json::to_vec(&firecracker_config)?; 820 | let config_path = mkmemfd("config", &config_json, Permissions::from_mode(0o444))?; 821 | 822 | // Start firecracker with the specified parameters 823 | let mut cmd = ProcessCommand::new(&firecracker_path); 824 | cmd.arg("--config-file") 825 | .arg(config_path) 826 | .arg("--no-api") 827 | .arg("--enable-pci") 828 | .stdin(Stdio::null()) 829 | .stdout(Stdio::piped()) 830 | .stderr(Stdio::piped()); 831 | if !*verbose { 832 | cmd.arg("--level").arg("error"); 833 | } 834 | unsafe { 835 | let ppid = libc::getpid(); 836 | cmd.pre_exec(move || { 837 | if libc::prctl(libc::PR_SET_PDEATHSIG, libc::SIGKILL) != 0 || libc::getppid() != ppid { 838 | libc::abort(); 839 | } 840 | Ok(()) 841 | }); 842 | } 843 | 844 | let mut cmd = cmd.spawn()?; 845 | let stdout = cmd.stdout.take().unwrap(); 846 | let stderr = cmd.stderr.take().unwrap(); 847 | for mut pipe in [ 848 | Box::new(stdout) as Box, 849 | Box::new(stderr) as Box, 850 | ] { 851 | std::thread::spawn(move || { 852 | let mut buf = vec![0u8; 4096]; 853 | loop { 854 | let Ok(n) = pipe.read(&mut buf) else { 855 | break; 856 | }; 857 | let mut stdout = std::io::stdout().lock(); 858 | let _ = stdout.write_all(&buf[..n]); 859 | let _ = stdout.flush(); 860 | } 861 | }); 862 | } 863 | let status = cmd.wait()?; 864 | console_task.abort(); 865 | if status.success() { 866 | Ok(()) 867 | } else { 868 | Err(anyhow::anyhow!( 869 | "firecracker exited with status {}", 870 | status 871 | .code() 872 | .map(|x| x.to_string()) 873 | .unwrap_or_else(|| "unknown".into()) 874 | )) 875 | } 876 | } 877 | 878 | fn spawn_port_forwards(publishes: Vec, uds_path: String) { 879 | RT.get().unwrap().spawn(async move { 880 | for spec in publishes { 881 | if let Some((ip, host, vm)) = parse_publish(&spec) { 882 | let uds_path = uds_path.clone(); 883 | tokio::spawn(async move { 884 | if let Err(e) = forward_listener(ip, host, vm, &uds_path).await { 885 | eprintln!("port forward {}:{}:{} failed: {:?}", ip, host, vm, e); 886 | } 887 | }); 888 | } else { 889 | eprintln!("invalid -p/--publish spec: {} (expected HOST:VM)", spec); 890 | } 891 | } 892 | }); 893 | } 894 | 895 | fn parse_publish(spec: &str) -> Option<(std::net::IpAddr, u16, u16)> { 896 | let parts: Vec<&str> = spec.split(':').collect(); 897 | match parts.len() { 898 | 2 => { 899 | let host: u16 = parts[0].parse().ok()?; 900 | let vm: u16 = parts[1].parse().ok()?; 901 | Some((std::net::IpAddr::from([127, 0, 0, 1]), host, vm)) 902 | } 903 | 3 => { 904 | let ip: std::net::IpAddr = parts[0].parse().ok()?; 905 | let host: u16 = parts[1].parse().ok()?; 906 | let vm: u16 = parts[2].parse().ok()?; 907 | Some((ip, host, vm)) 908 | } 909 | _ => None, 910 | } 911 | } 912 | 913 | async fn forward_listener( 914 | bind_ip: std::net::IpAddr, 915 | host_port: u16, 916 | vm_port: u16, 917 | uds_path: &str, 918 | ) -> anyhow::Result<()> { 919 | let bind_addr = std::net::SocketAddr::new(bind_ip, host_port); 920 | let listener = tokio::net::TcpListener::bind(bind_addr).await?; 921 | loop { 922 | let (inbound, _) = listener.accept().await?; 923 | let uds_path = Path::new(uds_path).to_path_buf(); 924 | tokio::spawn(async move { 925 | match vsock_uds_connect(&uds_path, 10).await { 926 | Ok(mut stream) => { 927 | // SOCKS5 handshake: no auth 928 | let mut resp = [0u8; 2]; 929 | if stream.write_all(&[0x05, 0x01, 0x00]).await.is_err() 930 | || stream.flush().await.is_err() 931 | || stream.read_exact(&mut resp).await.is_err() 932 | || resp != [0x05, 0x00] 933 | { 934 | return; 935 | } 936 | 937 | // CONNECT 127.0.0.1:vm_port 938 | let port_be = vm_port.to_be_bytes(); 939 | let req = [0x05, 0x01, 0x00, 0x01, 127, 0, 0, 1, port_be[0], port_be[1]]; 940 | if stream.write_all(&req).await.is_err() || stream.flush().await.is_err() { 941 | return; 942 | } 943 | 944 | // Reply: version, status, rsv, atyp, addr..., port 945 | let mut hdr = [0u8; 4]; 946 | if stream.read_exact(&mut hdr).await.is_err() 947 | || hdr[0] != 0x05 948 | || hdr[1] != 0x00 949 | { 950 | return; 951 | } 952 | let addr_len = match hdr[3] { 953 | 0x01 => 4, 954 | 0x03 => { 955 | let mut l = [0u8; 1]; 956 | if stream.read_exact(&mut l).await.is_err() { 957 | return; 958 | } 959 | l[0] as usize 960 | } 961 | 0x04 => 16, 962 | _ => 0, 963 | }; 964 | if addr_len == 0 { 965 | return; 966 | } 967 | let mut skip = vec![0u8; addr_len + 2]; 968 | if let Err(_) = stream.read_exact(&mut skip).await { 969 | return; 970 | } 971 | 972 | let Ok(inbound) = inbound.into_std() else { 973 | return; 974 | }; 975 | let Ok(stream) = stream.into_std() else { 976 | return; 977 | }; 978 | // Pipe data both ways 979 | if let Err(e) = 980 | copy_bidirectional_fastclose(inbound.into(), stream.into()).await 981 | { 982 | eprintln!("forward connection failed: {:?}", e); 983 | } 984 | } 985 | Err(e) => { 986 | eprintln!("failed to connect vsock proxy: {:?}", e); 987 | } 988 | } 989 | }); 990 | } 991 | } 992 | 993 | fn mkmemfd(name: &str, data: &[u8], permissions: Permissions) -> anyhow::Result { 994 | use std::ffi::CString; 995 | use std::os::unix::io::FromRawFd; 996 | 997 | let name_cstring = CString::new(name) 998 | .map_err(|_| io::Error::new(io::ErrorKind::InvalidInput, "Invalid name for memfd"))?; 999 | 1000 | // Create memfd 1001 | // No cloexec! 1002 | let fd = unsafe { libc::memfd_create(name_cstring.as_ptr(), libc::MFD_ALLOW_SEALING) }; 1003 | 1004 | if fd == -1 { 1005 | return Err(io::Error::last_os_error().into()); 1006 | } 1007 | 1008 | // Write data to memfd 1009 | let mut file = unsafe { File::from_raw_fd(fd) }; 1010 | file.write_all(data)?; 1011 | file.flush()?; 1012 | 1013 | // Seal it 1014 | if unsafe { 1015 | libc::fcntl( 1016 | fd, 1017 | libc::F_ADD_SEALS, 1018 | libc::F_SEAL_GROW | libc::F_SEAL_SHRINK | libc::F_SEAL_SEAL, 1019 | ) 1020 | } != 0 1021 | { 1022 | anyhow::bail!("file sealing failed: {:?}", std::io::Error::last_os_error()); 1023 | } 1024 | 1025 | if unsafe { libc::fchmod(fd, permissions.mode()) } < 0 { 1026 | anyhow::bail!("fchmod failed: {:?}", std::io::Error::last_os_error()); 1027 | } 1028 | 1029 | // Return the file descriptor (but don't close it) 1030 | std::mem::forget(file); 1031 | Ok(format!("/proc/self/fd/{}", fd)) 1032 | } 1033 | 1034 | fn serve_boot_manifest_request(path: &Path, manifest: &BootManifest) -> anyhow::Result<()> { 1035 | let listener = std::os::unix::net::UnixListener::bind(path)?; 1036 | let manifest = rkyv::to_bytes::(manifest)?; 1037 | std::thread::spawn(move || { 1038 | // only serve the manifest once 1039 | let Ok((mut conn, _)) = listener.accept() else { 1040 | return; 1041 | }; 1042 | let _: Result<_, _> = conn 1043 | .write_all(&manifest) 1044 | .and_then(|_| conn.shutdown(std::net::Shutdown::Write)); 1045 | }); 1046 | Ok(()) 1047 | } 1048 | 1049 | fn serve_ssh_proxy(path: &Path, vsock_outbound_uds: &Path) -> anyhow::Result<()> { 1050 | let listener = RT 1051 | .get() 1052 | .unwrap() 1053 | .block_on(async { UnixListener::bind(path) })?; 1054 | let vsock_outbound_uds = Arc::new(vsock_outbound_uds.to_path_buf()); 1055 | RT.get().unwrap().spawn(async move { 1056 | loop { 1057 | let Ok((conn, _)) = listener.accept().await else { 1058 | break; 1059 | }; 1060 | let vsock_outbound_uds = vsock_outbound_uds.clone(); 1061 | tokio::spawn(async move { 1062 | let Ok(outbound) = vsock_uds_connect(&vsock_outbound_uds, 22).await else { 1063 | return; 1064 | }; 1065 | let Ok(conn) = conn.into_std() else { 1066 | return; 1067 | }; 1068 | let Ok(outbound) = outbound.into_std() else { 1069 | return; 1070 | }; 1071 | let _ = copy_bidirectional_fastclose(conn.into(), outbound.into()).await; 1072 | }); 1073 | } 1074 | }); 1075 | Ok(()) 1076 | } 1077 | 1078 | unsafe fn memfd_from_mmap(name: &str, data: &'static [u8]) -> anyhow::Result { 1079 | unsafe { 1080 | let pgsize = libc::sysconf(libc::_SC_PAGESIZE); 1081 | assert!(pgsize >= 4096); 1082 | let pgsize = pgsize as usize; 1083 | 1084 | let path = mkmemfd(name, data, Permissions::from_mode(0o777))?; 1085 | let ptr = data.as_ptr(); 1086 | let end = ptr.add(data.len()); 1087 | let ptr = align_up(ptr as usize, pgsize); 1088 | let end = end as usize & !(pgsize - 1); 1089 | 1090 | if end > ptr { 1091 | if DEBUG.load(Ordering::Relaxed) { 1092 | eprintln!( 1093 | "madvise({:p}, {:#x}, MADV_DONTNEED)", 1094 | ptr as *mut c_void, 1095 | end - ptr 1096 | ); 1097 | } 1098 | assert_eq!( 1099 | libc::madvise(ptr as *mut c_void, end - ptr, libc::MADV_DONTNEED), 1100 | 0 1101 | ); 1102 | } 1103 | Ok(path) 1104 | } 1105 | } 1106 | 1107 | unsafe extern "C" fn term_signal(sig: i32) { 1108 | if let Ok(x) = TMP_BASE_DIR.try_lock() { 1109 | if let Some(path) = &*x { 1110 | let _ = std::fs::remove_dir_all(path); 1111 | } 1112 | } 1113 | unsafe { 1114 | libc::signal(sig, libc::SIG_DFL); 1115 | libc::raise(sig); 1116 | } 1117 | } 1118 | --------------------------------------------------------------------------------