├── .envrc ├── lib ├── default.nix ├── constants.nix └── helpers.nix ├── scripts ├── bump-version │ ├── default.nix │ └── bump-version.sh └── benchmark-vm │ ├── default.nix │ └── benchmark-vm.sh ├── .gitignore ├── pkgs ├── vm-image │ ├── default.nix │ └── image-config.nix └── vm-runner │ ├── src │ └── virby_vm_runner │ │ ├── exceptions.py │ │ ├── constants.py │ │ ├── __init__.py │ │ ├── ssh.py │ │ ├── circuit_breaker.py │ │ ├── signal_manager.py │ │ ├── cli.py │ │ ├── api.py │ │ ├── ip_discovery.py │ │ ├── socket_activation.py │ │ ├── config.py │ │ ├── runner.py │ │ └── vm_process.py │ ├── .envrc │ ├── default.nix │ ├── pyproject.toml │ └── CHANGELOG.md ├── flake.lock ├── LICENSE ├── Justfile ├── .github └── workflows │ └── image.yml ├── flake.nix ├── module ├── options.nix └── default.nix └── README.md /.envrc: -------------------------------------------------------------------------------- 1 | use flake 2 | -------------------------------------------------------------------------------- /lib/default.nix: -------------------------------------------------------------------------------- 1 | # Library for shared constants and helper functions for Virby 2 | { lib }: 3 | 4 | let 5 | constants = import ./constants.nix; 6 | helpers = import ./helpers.nix { inherit lib; }; 7 | in 8 | 9 | { 10 | inherit constants helpers; 11 | } 12 | -------------------------------------------------------------------------------- /scripts/bump-version/default.nix: -------------------------------------------------------------------------------- 1 | { 2 | bash, 3 | commitizen, 4 | git, 5 | writeShellApplication, 6 | }: 7 | 8 | writeShellApplication { 9 | name = "bump-version"; 10 | 11 | runtimeInputs = [ 12 | bash 13 | commitizen 14 | git 15 | ]; 16 | 17 | text = '' 18 | bash ${./bump-version.sh} "$@" 19 | ''; 20 | } 21 | -------------------------------------------------------------------------------- /scripts/benchmark-vm/default.nix: 
-------------------------------------------------------------------------------- 1 | { 2 | bash, 3 | curl, 4 | hyperfine, 5 | writeShellApplication, 6 | }: 7 | 8 | writeShellApplication { 9 | name = "benchmark-virby-vm"; 10 | 11 | runtimeInputs = [ 12 | bash 13 | curl 14 | hyperfine 15 | ]; 16 | 17 | text = '' 18 | bash ${./benchmark-vm.sh} "$@" 19 | ''; 20 | } 21 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Python 2 | *.egg-info/ 3 | *.test.json 4 | .*_cache/ 5 | .basedpyright/ 6 | .coverage 7 | .mypy_cache/ 8 | .pytest_cache/ 9 | .venv/ 10 | __pycache__/ 11 | dist/ 12 | 13 | # Nix 14 | result/ 15 | 16 | # Testing 17 | .env 18 | benchmark-results/ 19 | test/ 20 | virby-vm-benchmark-* 21 | 22 | # Misc 23 | *.bak 24 | *.bck 25 | .direnv/ 26 | .DS_Store 27 | .rules 28 | .zed/ 29 | 30 | # Agent related 31 | .claude/ 32 | .context/ 33 | .rules 34 | CLAUDE.md 35 | GEMINI.md 36 | -------------------------------------------------------------------------------- /pkgs/vm-image/default.nix: -------------------------------------------------------------------------------- 1 | { 2 | _lib, 3 | inputs, 4 | lib, 5 | pkgs, 6 | 7 | debug ? false, 8 | extraConfig ? { }, 9 | onDemand ? { 10 | enable = false; 11 | ttl = 180; 12 | }, 13 | rosetta ? 
false, 14 | }: 15 | 16 | let 17 | cfg = { inherit debug onDemand rosetta; }; 18 | 19 | nixosSystem = lib.nixosSystem { 20 | inherit pkgs; 21 | specialArgs = { inherit _lib cfg inputs; }; 22 | modules = [ 23 | ./image-config.nix 24 | extraConfig 25 | ]; 26 | }; 27 | in 28 | 29 | nixosSystem.config.system.build.images.raw-efi 30 | -------------------------------------------------------------------------------- /flake.lock: -------------------------------------------------------------------------------- 1 | { 2 | "nodes": { 3 | "nixpkgs": { 4 | "locked": { 5 | "lastModified": 1763966396, 6 | "narHash": "sha256-6eeL1YPcY1MV3DDStIDIdy/zZCDKgHdkCmsrLJFiZf0=", 7 | "owner": "nixos", 8 | "repo": "nixpkgs", 9 | "rev": "5ae3b07d8d6527c42f17c876e404993199144b6a", 10 | "type": "github" 11 | }, 12 | "original": { 13 | "owner": "nixos", 14 | "ref": "nixos-unstable", 15 | "repo": "nixpkgs", 16 | "type": "github" 17 | } 18 | }, 19 | "root": { 20 | "inputs": { 21 | "nixpkgs": "nixpkgs" 22 | } 23 | } 24 | }, 25 | "root": "root", 26 | "version": 7 27 | } 28 | -------------------------------------------------------------------------------- /pkgs/vm-runner/src/virby_vm_runner/exceptions.py: -------------------------------------------------------------------------------- 1 | """Exceptions for the Virby VM runner.""" 2 | 3 | 4 | class VirbyVMError(Exception): 5 | """Base exception for all Virby VM errors.""" 6 | 7 | pass 8 | 9 | 10 | class VMConfigurationError(VirbyVMError): 11 | """Raised when VM configuration is invalid.""" 12 | 13 | pass 14 | 15 | 16 | class VMStartupError(VirbyVMError): 17 | """Raised when VM fails to start.""" 18 | 19 | pass 20 | 21 | 22 | class VMRuntimeError(VirbyVMError): 23 | """Raised when VM encounters runtime errors.""" 24 | 25 | pass 26 | 27 | 28 | class IPDiscoveryError(VirbyVMError): 29 | """Raised when IP discovery fails.""" 30 | 31 | pass 32 | 33 | 34 | class SSHConnectivityError(VirbyVMError): 35 | """Raised when SSH connectivity fails.""" 36 | 37 | pass 
"""Variable constants for Virby

WARNING: This file's contents are overwritten as part of the nix build process. The values here are
just for testing without building the nix package. The actual values used in the package are
derived from _lib.constants, or ../../lib/constants.nix.
"""

# Host-side state directory for the VM runner.
WORKING_DIRECTORY = "/var/lib/virby"

# DHCP lease file location on macOS
DHCPD_LEASES_FILE_PATH = "/var/db/dhcpd_leases"

# VM configuration
VM_USER = "builder"
VM_HOST_NAME = "virby-vm"

# SSH key material shared between host and guest. Public-key names are derived
# from the private-key names, mirroring lib/constants.nix.
SSH_HOST_PRIVATE_KEY_FILE_NAME = "ssh_host_ed25519_key"
SSH_HOST_PUBLIC_KEY_FILE_NAME = SSH_HOST_PRIVATE_KEY_FILE_NAME + ".pub"
SSH_USER_PRIVATE_KEY_FILE_NAME = "ssh_user_ed25519_key"
SSH_USER_PUBLIC_KEY_FILE_NAME = SSH_USER_PRIVATE_KEY_FILE_NAME + ".pub"
SSHD_KEYS_SHARED_DIR_NAME = "vm_sshd_keys"
SSH_KNOWN_HOSTS_FILE_NAME = "ssh_known_hosts"

# VM runtime files
EFI_VARIABLE_STORE_FILE_NAME = "efistore.nvram"
BASE_DISK_FILE_NAME = "base.img"
DIFF_DISK_FILE_NAME = "diff.img"
SERIAL_LOG_FILE_NAME = "serial.log"
# Constants for the Virby Nix-darwin module.
#
# Exposed as a plain attribute set. These values must stay in sync with the
# generated pkgs/vm-runner/src/virby_vm_runner/constants.py (written at build
# time by the vm-runner package's preBuild hook).
rec {
  baseDiskFileName = "base.img";
  dhcpdLeasesFilePath = "/var/db/dhcpd_leases";
  diffDiskFileName = "diff.img";
  efiVariableStoreFileName = "efistore.nvram";
  serialLogFileName = "serial.log";
  sshdKeysSharedDirName = "vm_sshd_keys";
  sshHostPrivateKeyFileName = "ssh_host_ed25519_key";
  # Public-key names are derived from the private-key names.
  sshHostPublicKeyFileName = sshHostPrivateKeyFileName + ".pub";
  sshKnownHostsFileName = "ssh_known_hosts";
  sshUserPrivateKeyFileName = "ssh_user_ed25519_key";
  sshUserPublicKeyFileName = sshUserPrivateKeyFileName + ".pub";
  vmHostName = "virby-vm";
  vmUser = "builder";
  workingDirectory = "/var/lib/virby";
}
-------------------------------------------------------------------------------- /Justfile: -------------------------------------------------------------------------------- 1 | set working-directory := "pkgs/vm-runner" 2 | 3 | default: help 4 | 5 | help: 6 | @echo "Available commands:" 7 | @echo " build Build the Python package" 8 | @echo " clean Clean build artifacts and cache dirs" 9 | @echo " format Format code with ruff and isort" 10 | @echo " help Show this help message" 11 | @echo " lint Run linting checks" 12 | @echo " type-check Run mypy type checking" 13 | 14 | [working-directory("../..")] 15 | clean: 16 | @echo "Cleaning cache dirs..." 17 | @rm -rf dist build *.egg-info 18 | @find . -type d \ 19 | -not -path "*/.venv/*" "(" \ 20 | -name "*.egg-info" -or \ 21 | -name ".*_cache" -or \ 22 | -name "__pycache__" -or \ 23 | -name "build" -or \ 24 | -name "dist" -or \ 25 | -name "result" \ 26 | ")" -exec rm -rf {} + 2>/dev/null || true 27 | 28 | format: 29 | @echo "Formatting code..." 30 | @uv run ruff format src/ 31 | @uv run isort src/ 32 | 33 | lint: 34 | @echo "Running linting checks..." 35 | @uv run ruff check --fix src/ 36 | @uv run isort --check-only --diff src/ 37 | 38 | type-check: 39 | @echo "Running type checks..." 40 | @uv run mypy 41 | 42 | build: 43 | @echo "Building package..." 44 | @uv build 45 | 46 | check: lint type-check 47 | @echo "All checks passed!" 
"""Virby VM runner package."""

from importlib.metadata import PackageNotFoundError, version

from .exceptions import (
    IPDiscoveryError,
    SSHConnectivityError,
    VirbyVMError,
    VMConfigurationError,
    VMRuntimeError,
    VMStartupError,
)

# When running straight from a source checkout (the workflow constants.py's
# docstring explicitly supports), the distribution metadata may not be
# installed; fall back instead of crashing the whole import.
try:
    __version__ = version("virby_vm_runner")
except PackageNotFoundError:
    __version__ = "0.0.0"


# Lazy attribute -> (submodule, attribute) table. These imports are deferred so
# that importing the package (e.g. just for the exception types) does not pull
# in the heavier runtime dependencies of the submodules.
_LAZY_ATTRS = {
    "VirbyVMRunner": ("runner", "VirbyVMRunner"),
    "VMConfig": ("config", "VMConfig"),
    "VfkitAPIClient": ("api", "VfkitAPIClient"),
    "VirtualMachineState": ("api", "VirtualMachineState"),
}


def __getattr__(name):
    """PEP 562 lazy loader for the heavyweight public classes.

    Raises:
        AttributeError: If *name* is not one of the lazily exported attributes.
    """
    try:
        module_name, attr = _LAZY_ATTRS[name]
    except KeyError:
        raise AttributeError(f"module '{__name__}' has no attribute '{name}'") from None
    from importlib import import_module

    return getattr(import_module(f".{module_name}", __name__), attr)


__all__ = [
    "__version__",
    "VirbyVMRunner",
    "VMConfig",
    "VfkitAPIClient",
    "VirtualMachineState",
    "VirbyVMError",
    "VMConfigurationError",
    "VMStartupError",
    "VMRuntimeError",
    "IPDiscoveryError",
    "SSHConnectivityError",
]
2 | # Taken from: https://github.com/direnv/direnv/blob/a65ac6098669721ffbf5784dc206e62d064fa749/stdlib.sh#L1077-L1110 3 | layout_uv() { 4 | local python_version=${1:-} 5 | # If they have specified a python version, check to see if there is already a 6 | # .python-version file. If there is, and the specified version is different, 7 | # then recreate the virtual environment with the new version. 8 | # Otherwise, just use the existing virtual environment, if there is already a 9 | # .venv directory. 10 | 11 | VIRTUAL_ENV="${PWD}/.venv" 12 | # Get the current python version from the .python-version file 13 | local python_version_file=".python-version" 14 | local current_python_version="" 15 | if [[ -f "$python_version_file" ]]; then 16 | current_python_version=$(<"$python_version_file") 17 | fi 18 | 19 | # Check to see if there is already an existing virtual environment, 20 | # OR if the current python version is different from the one specified in .python-version 21 | if [[ -z $VIRTUAL_ENV || ! -d $VIRTUAL_ENV || (-n $python_version && $current_python_version != "$python_version") ]]; then 22 | log_status "No virtual environment exists. Executing \`uv venv\` to create one." 
23 | if [[ -n $python_version ]]; then 24 | uv venv --python "$python_version" 25 | # Write the python version to the .python-version file 26 | echo "$python_version" > .python-version 27 | else 28 | uv venv 29 | fi 30 | fi 31 | 32 | PATH_add "$VIRTUAL_ENV/bin" 33 | export UV_ACTIVE=1 34 | export VIRTUAL_ENV 35 | export UV_PROJECT_ENVIRONMENT=$VIRTUAL_ENV 36 | } 37 | 38 | layout uv 39 | source_up 40 | -------------------------------------------------------------------------------- /pkgs/vm-runner/default.nix: -------------------------------------------------------------------------------- 1 | { 2 | _lib, 3 | lib, 4 | aiofiles, 5 | buildPythonApplication, 6 | hatchling, 7 | httpx, 8 | vfkit, 9 | writeText, 10 | }: 11 | 12 | let 13 | inherit (_lib.helpers) toScreamingSnakeCase; 14 | in 15 | 16 | buildPythonApplication { 17 | pname = "virby-vm-runner"; 18 | version = (fromTOML (builtins.readFile ./pyproject.toml)).project.version; 19 | 20 | pyproject = true; 21 | src = ./.; 22 | 23 | build-system = [ hatchling ]; 24 | 25 | dependencies = [ 26 | aiofiles 27 | httpx 28 | vfkit 29 | ]; 30 | 31 | pythonImportsCheck = [ 32 | "virby_vm_runner" 33 | "virby_vm_runner.api" 34 | "virby_vm_runner.circuit_breaker" 35 | "virby_vm_runner.cli" 36 | "virby_vm_runner.config" 37 | "virby_vm_runner.constants" 38 | "virby_vm_runner.exceptions" 39 | "virby_vm_runner.ip_discovery" 40 | "virby_vm_runner.runner" 41 | "virby_vm_runner.signal_manager" 42 | "virby_vm_runner.socket_activation" 43 | "virby_vm_runner.ssh" 44 | "virby_vm_runner.vm_process" 45 | ]; 46 | 47 | preBuild = '' 48 | cat ${writeText "constants.py" '' 49 | """Variable constants for Virby. 50 | 51 | This file was generated by Nix using the variables from `lib/constants.nix`. 
52 | """ 53 | 54 | ${lib.concatStringsSep "\n" ( 55 | lib.mapAttrsToList (name: value: "${toScreamingSnakeCase name} = \"${value}\"") _lib.constants 56 | )} 57 | ''} > src/virby_vm_runner/constants.py 58 | ''; 59 | 60 | meta = { 61 | description = "Vfkit-based VM runner for Virby"; 62 | homepage = "https://github.com/quinneden/virby-nix-darwin"; 63 | license = lib.licenses.mit; 64 | platforms = lib.platforms.darwin; 65 | mainProgram = "virby-vm"; 66 | }; 67 | } 68 | -------------------------------------------------------------------------------- /.github/workflows/image.yml: -------------------------------------------------------------------------------- 1 | # Build and cache the VM disk images for both architectures 2 | 3 | name: Build and Cache VM Images 4 | 5 | on: 6 | push: 7 | branches: [main] 8 | paths: 9 | [ 10 | "!CLAUDE.md", 11 | "!flake.nix", 12 | "!lib/**", 13 | "!module/**", 14 | "!pkgs/vm-runner/**", 15 | "!README.md", 16 | "flake.lock", 17 | "pkgs/vm-image/**", 18 | ] 19 | pull_request: 20 | types: [closed] 21 | branches: [main] 22 | paths: 23 | [ 24 | "!CLAUDE.md", 25 | "!flake.nix", 26 | "!lib/**", 27 | "!module/**", 28 | "!pkgs/vm-runner/**", 29 | "!README.md", 30 | "flake.lock", 31 | "pkgs/vm-image/**", 32 | ] 33 | workflow_dispatch: 34 | 35 | jobs: 36 | build: 37 | name: packages.${{ matrix.arch }}.vm-image 38 | if: github.event_name != 'pull_request' || github.event.pull_request.merged == true 39 | strategy: 40 | matrix: 41 | include: 42 | - arch: aarch64-linux 43 | runner: ubuntu-24.04-arm 44 | - arch: x86_64-linux 45 | runner: ubuntu-latest 46 | fail-fast: false 47 | 48 | runs-on: ${{ matrix.runner }} 49 | 50 | steps: 51 | - uses: actions/checkout@v4.2.2 52 | 53 | - name: Set up Nix 54 | uses: cachix/install-nix-action@v31.4.1 55 | with: 56 | enable_kvm: true 57 | extra_nix_config: "system-features = benchmark big-parallel kvm nixos-test uid-range" 58 | github_access_token: ${{ secrets.GITHUB_TOKEN }} 59 | 60 | - name: Set up Cachix 61 | uses: 
cachix/cachix-action@v16 62 | with: 63 | name: virby-nix-darwin 64 | authToken: ${{ secrets.CACHIX_AUTH_TOKEN }} 65 | 66 | - name: Nix build 67 | run: nix build -L .#packages.${{ matrix.arch }}.vm-image 68 | -------------------------------------------------------------------------------- /pkgs/vm-runner/pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "virby-vm-runner" 3 | version = "0.3.0" 4 | description = "Vfkit-based VM runner for Virby" 5 | requires-python = ">=3.10" 6 | authors = [{ name = "Quinn Edenfield", email = "quinn@qeden.dev" }] 7 | dependencies = [ 8 | "aiofiles>=24.1.0", 9 | "httpx>=0.28.1", 10 | ] 11 | 12 | [project.scripts] 13 | virby-vm = "virby_vm_runner.cli:cli_main" 14 | 15 | [dependency-groups] 16 | dev = [ 17 | "basedpyright>=1.31.0", 18 | "commitizen>=4.8.3", 19 | "ipython>=8.37.0", 20 | "isort>=5.12.0", 21 | "mypy>=1.0.0", 22 | "ruff>=0.1.0", 23 | "types-aiofiles>=24.1.0.20250606", 24 | ] 25 | 26 | [build-system] 27 | requires = ["hatchling"] 28 | build-backend = "hatchling.build" 29 | 30 | [tool.basedpyright] 31 | include = ["src"] 32 | exclude = [ 33 | "**/node_modules", 34 | "**/__pycache__", 35 | ] 36 | reportMissingImports = "none" 37 | reportMissingTypeStubs = false 38 | reportUnsupportedDunderAll = "none" 39 | 40 | [tool.commitizen] 41 | name = "cz_conventional_commits" 42 | tag_format = "vm-runner-v$version" 43 | ignored_tag_formats = ["latest"] 44 | version_scheme = "pep440" 45 | version_provider = "pep621" 46 | update_changelog_on_bump = true 47 | 48 | [tool.hatch.build.targets.wheel] 49 | packages = ["src/virby_vm_runner"] 50 | 51 | [tool.hatch.build.targets.sdist] 52 | only-include = ["src"] 53 | 54 | [tool.isort] 55 | ensure_newline_before_comments = true 56 | force_grid_wrap = 0 57 | include_trailing_comma = true 58 | line_length = 100 59 | multi_line_output = 3 60 | use_parentheses = true 61 | 62 | [tool.mypy] 63 | check_untyped_defs = true 64 | 
mypy_path = ["src"] 65 | packages = ["virby_vm_runner"] 66 | pretty = true 67 | python_version = "3.13" 68 | warn_no_return = true 69 | warn_redundant_casts = true 70 | warn_return_any = true 71 | warn_unused_configs = true 72 | warn_unused_ignores = true 73 | 74 | [tool.ruff] 75 | fix = true 76 | line-length = 100 77 | 78 | [tool.uv] 79 | package = true 80 | default-groups = ["dev"] 81 | -------------------------------------------------------------------------------- /flake.nix: -------------------------------------------------------------------------------- 1 | { 2 | description = "A vfkit-based linux builder for Nix-darwin"; 3 | 4 | inputs = { 5 | nixpkgs.url = "github:nixos/nixpkgs?ref=nixos-unstable"; 6 | }; 7 | 8 | outputs = 9 | { self, nixpkgs }@inputs: 10 | 11 | let 12 | inherit (nixpkgs) lib; 13 | _lib = import ./lib { inherit lib; }; 14 | 15 | darwinSystems = lib.systems.doubles.darwin; 16 | linuxSystems = _lib.helpers.doppelganger darwinSystems; 17 | 18 | pkgsFor = systems: f: lib.genAttrs systems (system: f (import nixpkgs { inherit system; })); 19 | 20 | perDarwinSystem = pkgsFor darwinSystems; 21 | perLinuxSystem = pkgsFor linuxSystems; 22 | in 23 | 24 | { 25 | darwinModules = { 26 | default = self.darwinModules.virby; 27 | virby = import ./module { inherit _lib self; }; 28 | }; 29 | 30 | packages = 31 | perDarwinSystem (pkgs: { 32 | default = self.packages.${pkgs.stdenv.hostPlatform.system}.vm-runner; 33 | vm-runner = pkgs.python3Packages.callPackage ./pkgs/vm-runner { inherit _lib; }; 34 | }) 35 | // perLinuxSystem (pkgs: { 36 | default = self.packages.${pkgs.stdenv.hostPlatform.system}.vm-image; 37 | vm-image = pkgs.callPackage ./pkgs/vm-image { inherit _lib inputs lib; }; 38 | }); 39 | 40 | apps = perDarwinSystem (pkgs: { 41 | benchmark-vm = { 42 | type = "app"; 43 | program = lib.getExe (pkgs.callPackage ./scripts/benchmark-vm { }); 44 | }; 45 | bump-version = { 46 | type = "app"; 47 | program = lib.getExe (pkgs.callPackage ./scripts/bump-version 
# Don't run this script directly.
# Use:
#   `nix run .#bump-version -- {package} [options]`
#
# This script bumps the version of the package specified by $1. Currently, the
# only valid option is `vm-runner`.

set -eo pipefail

# Print usage information to stderr.
show_help() {
  echo "Bump the version of the vm-runner package" >&2
  echo >&2
  echo "Usage:" >&2
  echo "  bump-version {package} [-h|--help]" >&2
  echo >&2
  echo "Arguments:" >&2
  echo "  package  The package to bump the version of (currently, only accepts 'vm-runner')" >&2
  echo >&2
  echo "Options:" >&2
  echo "  -h, --help  Show this help message" >&2
}

while [[ $# -gt 0 ]]; do
  case "$1" in
    -h|--help)
      show_help
      exit 0
      ;;
    vm-runner)
      package="$1"
      shift
      ;;
    *)
      echo "Error: Unknown option '$1'" >&2
      show_help
      exit 1
      ;;
  esac
done

# Fail early when no package argument was given; previously `$package` expanded
# empty and produced a malformed tag like "-v1.2.3".
if [[ -z ${package:-} ]]; then
  echo "Error: no package specified" >&2
  show_help
  exit 1
fi

# Releases are only cut from main.
if [[ $(git symbolic-ref --short HEAD) != "main" ]]; then
  echo "Error: script must be run from main branch" >&2
  exit 1
fi

# Refuse to bump with a dirty worktree.
uncommitted_changes=$(git diff --compact-summary)
if [[ -n "$uncommitted_changes" ]]; then
  echo "Error: there are uncommitted changes:" >&2
  echo "$uncommitted_changes" >&2
  exit 1
fi

# BUGFIX: `git fetch <url> <ref>` only writes FETCH_HEAD -- it does NOT update
# the `origin/main` remote-tracking ref, so the old comparison against
# `origin/main..main` could silently use a stale ref. Compare against
# FETCH_HEAD, which is exactly what was just fetched.
git fetch "git@github.com:quinneden/virby-nix-darwin" main
unpushed_commits=$(git log --format=oneline FETCH_HEAD..main)
if [[ -n "$unpushed_commits" ]]; then
  echo "Error: there are unpushed commits:" >&2
  echo "$unpushed_commits" >&2
  exit 1
fi

pushd pkgs/vm-runner &>/dev/null || exit 1

# Compute the next version first so it can be embedded in the commit message.
version=$(cz bump --major-version-zero --get-next)
cz bump \
  --major-version-zero \
  --tag-format="$package-v\$version" \
  --bump-message="chore($package): bump version to $version"

popd &>/dev/null || exit 1

read -rN1 -p "Push $package-v$version to remote? (y/N): " input
if [[ $input != [yY] ]]; then
  echo "To push the changes, run:"
  echo
  echo "  git push origin main $package-v$version"
  exit 0
else
  git push origin main "$package-v$version"
fi
class SSHConnectivityTester:
    """Cached SSH connectivity tester.

    The invariant part of the ssh argv is assembled once in the constructor;
    each probe only appends the per-call ConnectTimeout and the target.
    """

    def __init__(self, working_dir: Path, username: str = VM_USER):
        self.working_dir = working_dir
        self.username = username
        self.ssh_key_path = working_dir / SSH_USER_PRIVATE_KEY_FILE_NAME
        self.known_hosts_path = working_dir / SSH_KNOWN_HOSTS_FILE_NAME

        # Options that never change between probes, in the same order as before.
        fixed_options = (
            "BatchMode=yes",
            "LogLevel=ERROR",
            "PasswordAuthentication=no",
            "StrictHostKeyChecking=accept-new",
            f"UserKnownHostsFile={self.known_hosts_path}",
        )
        base = ["ssh"]
        for opt in fixed_options:
            base += ["-o", opt]
        base += ["-p", "22", "-i", str(self.ssh_key_path)]
        self._ssh_base_command = base

    async def test_connectivity(self, ip_address: str, timeout: int = 10) -> bool:
        """Test SSH connectivity with cached command."""
        # Without the user key the probe cannot possibly succeed.
        if not self.ssh_key_path.exists():
            logger.debug(f"SSH key not found at {self.ssh_key_path}")
            return False

        argv = [
            *self._ssh_base_command,
            "-o",
            f"ConnectTimeout={timeout}",
            f"{self.username}@{ip_address}",
            "true",
        ]

        try:
            proc = await asyncio.create_subprocess_exec(
                *argv,
                stdout=asyncio.subprocess.DEVNULL,
                stderr=asyncio.subprocess.DEVNULL,
            )
            try:
                await asyncio.wait_for(proc.wait(), timeout=timeout)
            except asyncio.TimeoutError:
                # The ssh process hung past its own ConnectTimeout; reap it.
                proc.kill()
                await proc.wait()
                return False
            if proc.returncode == 0:
                logger.debug(f"SSH connection to {ip_address} successful")
                return True
            return False
        except Exception as e:
            logger.debug(f"SSH connectivity test failed: {e}")
            return False
class CircuitState(Enum):
    """Circuit breaker states."""

    CLOSED = "closed"  # Normal operation
    OPEN = "open"  # Failing, reject requests
    HALF_OPEN = "half_open"  # Testing if service recovered


class CircuitBreaker:
    """Circuit breaker to prevent cascading failures in VM operations.

    Consecutive failures open the circuit; while open, calls are rejected
    immediately. After ``timeout`` seconds a probe call is allowed
    (half-open); success closes the circuit, another failure re-opens it.
    """

    def __init__(self, failure_threshold: int = 5, timeout: float = 30.0):
        """Initialize circuit breaker.

        Args:
            failure_threshold: Number of failures before opening circuit
            timeout: Time to wait before attempting recovery (seconds)
        """
        self.failure_threshold = failure_threshold
        self.timeout = timeout
        self.failure_count = 0
        # Wall-clock time of the most recent failure; None until the first
        # failure occurs (and again after reset()).
        self.last_failure_time: float | None = None
        self.state = CircuitState.CLOSED

    async def call(self, func: Callable, *args, **kwargs) -> Any:
        """Execute function via circuit breaker.

        Args:
            func: Async function to execute
            *args: Function arguments
            **kwargs: Function keyword arguments

        Returns:
            Function result

        Raises:
            VMRuntimeError: If circuit is open
            Exception: Whatever ``func`` raises (recorded as a failure first)
        """
        if self.state == CircuitState.OPEN:
            # last_failure_time is always set when the circuit opens via
            # _on_failure(); the None guard protects against externally mutated
            # state, where the previous code raised a confusing TypeError from
            # `float - None`.
            if (
                self.last_failure_time is not None
                and time.time() - self.last_failure_time > self.timeout
            ):
                self.state = CircuitState.HALF_OPEN
            else:
                raise VMRuntimeError("Circuit breaker is OPEN")

        try:
            result = await func(*args, **kwargs)
        except Exception:
            self._on_failure()
            raise
        self._on_success()
        return result

    def _on_success(self) -> None:
        """Handle successful operation: close the circuit and clear failures."""
        self.failure_count = 0
        self.state = CircuitState.CLOSED

    def _on_failure(self) -> None:
        """Handle failed operation: count it and open the circuit at threshold."""
        self.failure_count += 1
        self.last_failure_time = time.time()

        if self.failure_count >= self.failure_threshold:
            self.state = CircuitState.OPEN

    def reset(self) -> None:
        """Manually reset circuit breaker to closed state."""
        self.failure_count = 0
        self.last_failure_time = None
        self.state = CircuitState.CLOSED

    @property
    def is_open(self) -> bool:
        """Check if circuit is open."""
        return self.state == CircuitState.OPEN

    @property
    def is_half_open(self) -> bool:
        """Check if circuit is half-open."""
        return self.state == CircuitState.HALF_OPEN

    @property
    def is_closed(self) -> bool:
        """Check if circuit is closed."""
        return self.state == CircuitState.CLOSED
30 | 31 | Args: 32 | handler: Callable to be removed from shutdown handlers 33 | """ 34 | self._handlers.discard(handler) 35 | 36 | def setup_signal_handlers(self): 37 | """Setup signal handlers once.""" 38 | if self._signals_setup: 39 | logger.debug("Signal handlers already setup") 40 | return 41 | 42 | def signal_handler(signum, frame): 43 | logger.info(f"Received signal {signum}, initiating shutdown") 44 | self._shutdown_event.set() 45 | 46 | # Call registered handlers 47 | for handler in self._handlers: 48 | try: 49 | handler() 50 | except Exception as e: 51 | logger.error(f"Error in shutdown handler: {e}") 52 | 53 | # Store original handlers for cleanup 54 | self._original_handlers[signal.SIGTERM] = signal.signal(signal.SIGTERM, signal_handler) 55 | self._original_handlers[signal.SIGINT] = signal.signal(signal.SIGINT, signal_handler) 56 | 57 | self._signals_setup = True 58 | logger.debug("Signal handlers setup complete") 59 | 60 | @property 61 | def shutdown_event(self) -> asyncio.Event: 62 | """Get the shutdown event for async coordination.""" 63 | return self._shutdown_event 64 | 65 | def request_shutdown(self): 66 | """Manually request shutdown.""" 67 | logger.info("Shutdown requested programmatically") 68 | self._shutdown_event.set() 69 | 70 | def cleanup(self): 71 | """Restore original signal handlers and cleanup resources.""" 72 | if not self._signals_setup: 73 | return 74 | 75 | for sig, handler in self._original_handlers.items(): 76 | try: 77 | signal.signal(sig, handler) 78 | except Exception as e: 79 | logger.error(f"Error restoring signal handler for {sig}: {e}") 80 | 81 | self._original_handlers.clear() 82 | self._handlers.clear() 83 | self._signals_setup = False 84 | logger.debug("Signal handlers cleaned up") 85 | 86 | def is_shutdown_requested(self) -> bool: 87 | """Check if shutdown has been requested.""" 88 | return self._shutdown_event.is_set() 89 | -------------------------------------------------------------------------------- 
"""CLI entry point for the Virby VM runner."""

import asyncio
import logging
import os
import stat
import sys

from .config import VMConfig
from .runner import VirbyVMRunner
from .signal_manager import SignalManager
from .vm_process import cleanup_orphaned_vfkit_processes


def setup_logging(debug: bool = False) -> None:
    """Setup logging configuration.

    Args:
        debug: When True, log at DEBUG level; otherwise INFO.
    """
    level = logging.DEBUG if debug else logging.INFO
    logging.basicConfig(
        level=level,
        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
        handlers=[logging.StreamHandler(sys.stdout)],
    )


def debug_startup_environment() -> None:
    """Debug environment and file descriptors at startup.

    Must be called *after* setup_logging(): it gates itself on the logger's
    effective level, which is WARNING (the root default) until logging has
    been configured, so calling it earlier silently does nothing.
    """
    logger = logging.getLogger(__name__)

    if not logger.isEnabledFor(logging.DEBUG):
        return

    logger.debug("=== STARTUP DEBUG ===")

    env_vars = [
        "VIRBY_VM_CONFIG_FILE",
        "VIRBY_WORKING_DIRECTORY",
        "LISTEN_FDS",
        "LISTEN_PID",
        "LAUNCH_DAEMON_SOCKET_NAME",
    ]

    env_info = []
    for var in env_vars:
        value = os.environ.get(var, "null")
        env_info.append(f"{var}={value}")
    logger.debug(f"ENV: {', '.join(env_info)}")

    # Only check file descriptors if really needed and limit to first 5 FDs
    socket_fds = []
    for fd in range(5):
        try:
            fd_stat = os.fstat(fd)
            if stat.S_ISSOCK(fd_stat.st_mode):
                socket_fds.append(str(fd))
        except OSError:
            continue

    if socket_fds:
        logger.debug(f"Socket FDs: {', '.join(socket_fds)}")

    logger.debug("=== END STARTUP DEBUG ===")


async def main() -> int:
    """Main CLI entry point.

    Returns:
        Process exit code: 0 on success or interrupt, 1 on fatal error.
    """
    signal_manager = SignalManager()

    try:
        config_file_env = os.getenv("VIRBY_VM_CONFIG_FILE")
        config = VMConfig(config_path=config_file_env)

        setup_logging(config.debug_enabled)

        # BUGFIX: this previously ran before setup_logging(), so its
        # DEBUG-level gate always failed and the startup debug info was
        # never logged, even with debug enabled.
        debug_startup_environment()

        # Setup signal handling once
        signal_manager.setup_signal_handlers()

        # Clean up any orphaned processes from previous runs; best-effort,
        # a failure here must not prevent VM startup.
        try:
            await cleanup_orphaned_vfkit_processes(config.working_directory)
        except Exception as e:
            logging.warning(f"Error during orphan cleanup: {e}")

        runner = VirbyVMRunner(config, signal_manager)
        await runner.run()

        return 0

    except KeyboardInterrupt:
        logging.info("Interrupted by user")
        return 0
    except Exception as e:
        logging.error(f"Fatal error: {e}")
        return 1
    finally:
        # Always restore original signal handlers, even on error paths.
        signal_manager.cleanup()


def cli_main() -> None:
    """Entry point for CLI."""
    try:
        exit_code = asyncio.run(main())
        sys.exit(exit_code)
    except KeyboardInterrupt:
        # 128 + SIGINT(2): conventional exit status for interrupt.
        sys.exit(130)


if __name__ == "__main__":
    cli_main()
**vm-runner**: add support for VM pause/resume operations
- add centralized signal manager for VM shutdown coordination
- add circuit breaker pattern for VM operations
- add vfkit API client for virtual machine management
- add benchmark-vm flake application
- **vm-runner**: add early signal handling and orphaned process cleanup

### Fix

- **bump-version**: pass positional args to script
- **bump-version**: fix cz commands
- rewrap comment in code block
- **flake**: fix path to benchmark-vm script
- **vm-runner,module**: add on-demand to json config file
- **vm-runner**: optimize VM startup performance
- **module**: initialize `NEEDS_GENERATE_SSH_KEYS` variable before logic
- **flake**: change `nixpkgs` branch
- comment out logind auto-poweroff config and add vm pause/resume

### Refactor

- **module**: refactor ssh keygen logic
- **vm-runner**: remove unnecessary `debug_file_descriptors`

## vm-runner-v0.1.0 (2025-07-27)

### BREAKING CHANGE

- The `rosetta.enable` option has been removed and replaced with
  `rosetta`. Update your configuration accordingly.
56 | 57 | ### Feat 58 | 59 | - add CI workflow 60 | - **vm-runner**: add socket activation with tcp port forwarding 61 | - **vm-runner**: add comprehensive Python VM runner implementation 62 | - add python VM runner package 63 | 64 | ### Fix 65 | 66 | - **vm-runner**: refactor code structure 67 | - **module**: fix option type for 68 | - **vm-image**: fix `virtualization.rosetta.enable` setting 69 | - **module,vm-runner**: change option `rosetta.enable` to `rosetta` 70 | - **vm-runner**: fix SSH in always-on mode 71 | - **module**: fix default values for `onDemand` and `rosetta` options 72 | - direct both logInfo and logError to stdout 73 | - **module**: compare store path of source image instead of hash 74 | - remove gvproxy from dev-shell packages 75 | - change package name to `virby-vm-runner` in pkgs/vm-runner/default.nix 76 | - `pkgs.hostPlatform.system` -> `pkgs.system` in virby/default.nix 77 | - remove redundant `pkgs` reference in `default.nix` 78 | - **docs**: add information about binary-cache to README 79 | - use nix in .envrc 80 | - update reference to vm-runner executable in darwin module 81 | - **pkgs/vm-runner**: Fix SSH connection issues with VM runner 82 | 83 | ### Refactor 84 | 85 | - **vm-image**: simplify install-sshd-keys service 86 | - update functions in flake.nix 87 | - add `default` attr for vm-runner, lib.replaceString -> lib.replaceStrings 88 | - remove `lib/option-defaults.nix`, cleanup code 89 | - **module,vm-runner**: generate python constants from nix code 90 | - **Justfile**: add setup-test-working-directory 91 | - include `layout_uv` in `.envrc` and update `pyproject.toml` 92 | - update module and image configuration 93 | - **vm-image**: change kernel flag 94 | -------------------------------------------------------------------------------- /pkgs/vm-image/image-config.nix: -------------------------------------------------------------------------------- 1 | { 2 | _lib, 3 | cfg, 4 | config, 5 | inputs, 6 | lib, 7 | pkgs, 8 | ... 
# NixOS configuration for the Virby guest VM disk image.
{
  _lib,
  cfg,
  config,
  inputs,
  lib,
  pkgs,
  ...
}:

let
  inherit (_lib.constants)
    sshHostPrivateKeyFileName
    sshUserPublicKeyFileName
    vmHostName
    vmUser
    ;

  sshDirPath = "/etc/ssh/";
  sshHostPrivateKeyPath = sshDirPath + sshHostPrivateKeyFileName;
in

{
  imports = [ "${inputs.nixpkgs}/nixos/modules/image/file-options.nix" ];

  boot = {
    enableContainers = lib.mkDefault false;
    # Serial console on the Virtualization.framework virtio console device.
    kernelParams = [ "console=hvc0" ];
    loader = {
      efi.canTouchEfiVariables = true;
      systemd-boot.enable = true;
      timeout = 0;
    };
  };

  # Documentation is dead weight in a headless builder image.
  documentation = {
    enable = false;
    nixos.enable = false;
    man.enable = false;
    info.enable = false;
    doc.enable = false;
  };

  environment = {
    defaultPackages = lib.mkDefault [ ];
    stub-ld.enable = lib.mkDefault false;
  };

  fileSystems = {
    "/".options = [
      "discard"
      "noatime"
    ];
    "/boot".options = [
      "discard"
      "noatime"
      "umask=0077"
    ];
  };

  image = lib.mkForce {
    baseName = "virby-vm-nixos-image-${config.system.nixos.label}-${pkgs.stdenv.hostPlatform.system}";
    extension = "img";
  };

  networking = {
    hostName = lib.mkForce vmHostName;
    # Empty client-id so dhcpcd identifies by MAC address only.
    dhcpcd.extraConfig = lib.mkForce ''
      clientid ""
    '';
  };

  nix = {
    channel.enable = false;
    registry.nixpkgs.flake = inputs.nixpkgs;

    settings =
      let
        gibibyte = 1024 * 1024 * 1024;
      in
      {
        auto-optimise-store = true;
        experimental-features = [
          "flakes"
          "nix-command"
        ];
        # Garbage-collect when free space drops below 5 GiB, stop at 7 GiB.
        min-free = gibibyte * 5;
        max-free = gibibyte * 7;
        trusted-users = [ vmUser ];
      };
  };

  programs = {
    less.lessopen = lib.mkDefault null;
    command-not-found.enable = lib.mkDefault false;
    fish.generateCompletions = lib.mkDefault false;
  };

  # sudo is only available in debug builds of the image.
  security.sudo = {
    enable = cfg.debug;
    wheelNeedsPassword = !cfg.debug;
  };

  services = {
    getty = lib.optionalAttrs cfg.debug { autologinUser = vmUser; };
    logrotate.enable = lib.mkDefault false;

    openssh = {
      enable = true;
      hostKeys = [ ]; # disable automatic host key generation

      settings = {
        # Host key is installed by the install-sshd-keys service below.
        HostKey = sshHostPrivateKeyPath;
        PasswordAuthentication = false;
      };
    };

    udisks2.enable = lib.mkDefault false;
  };

  system = {
    disableInstallerTools = true;
    nixos.revision = null;
    stateVersion = "25.05";
    systemBuilderArgs.allowSubstitutes = true;
  };

  # Virtualization.framework's virtiofs implementation will grant any guest user access
  # to mounted files; they always appear to be owned by the effective UID and so access cannot
  # be restricted.
  # To protect the guest's SSH host key, the VM is configured to prevent any logins (via
  # console, SSH, etc) by default. This service then runs before sshd, mounts virtiofs,
  # copies the keys to local files (with appropriate ownership and permissions), and unmounts
  # the filesystem before allowing SSH to start.
  # Once SSH has been allowed to start (and given the guest user a chance to log in), the
  # virtiofs must never be mounted again (as the user could have left some process active to
  # read its secrets). This is prevented by `unitconfig.ConditionPathExists` below.
  systemd.services.install-sshd-keys =
    let
      mountTag = "sshd-keys";
      mountPoint = "/var/${mountTag}";
      authorizedKeysDir = "${sshDirPath}/authorized_keys.d";
    in
    {
      description = "Install sshd's host and authorized keys";

      path = with pkgs; [
        coreutils
        mount
        umount
      ];

      before = [ "sshd.service" ];
      requiredBy = [ "sshd.service" ];

      enableStrictShellChecks = true;
      serviceConfig.Type = "oneshot";
      # Once the authorized key exists, never run (or mount virtiofs) again.
      unitConfig.ConditionPathExists = "!${authorizedKeysDir}/${vmUser}";

      script = ''
        mkdir -p ${mountPoint}
        mount -t virtiofs -o nodev,noexec,nosuid,ro ${mountTag} ${mountPoint}

        install -Dm600 -t ${sshDirPath} ${mountPoint}/${sshHostPrivateKeyFileName}
        install -Dm644 ${mountPoint}/${sshUserPublicKeyFileName} ${authorizedKeysDir}/${vmUser}

        umount ${mountPoint}
        rm -rf ${mountPoint}
      '';
    };

  users = {
    allowNoPasswordLogin = true;
    mutableUsers = false;

    users.${vmUser} = {
      isNormalUser = true;
      # wheel (and thus sudo) only in debug images.
      extraGroups = lib.optional cfg.debug "wheel";
    };
  };

  virtualisation = {
    rosetta.enable = cfg.rosetta;
  };

  xdg = {
    autostart.enable = lib.mkDefault false;
    icons.enable = lib.mkDefault false;
    mime.enable = lib.mkDefault false;
    sounds.enable = lib.mkDefault false;
  };
}
logging.getLogger("httpcore.http11").setLevel(logging.ERROR)


class VirtualMachineState:
    """Enumeration of virtual machine states returned by vfkit's RESTful API."""

    RUNNING = "VirtualMachineStateRunning"
    STOPPED = "VirtualMachineStateStopped"
    PAUSED = "VirtualMachineStatePaused"
    ERROR = "VirtualMachineStateError"
    STARTING = "VirtualMachineStateStarting"
    PAUSING = "VirtualMachineStatePausing"
    RESUMING = "VirtualMachineStateResuming"
    STOPPING = "VirtualMachineStateStopping"
    SAVING = "VirtualMachineStateSaving"
    RESTORING = "VirtualMachineStateRestoring"


def retry_on_failure(max_retries=3, base_delay=0.1):
    """Decorator to retry async operations with exponential backoff.

    Only transient transport errors (connect/timeout) are retried; any
    other exception propagates immediately.

    Args:
        max_retries: Number of retries after the initial attempt.
        base_delay: Initial backoff delay in seconds; doubled per attempt,
            with up to 0.1s of random jitter added.
    """

    def decorator(func):
        @wraps(func)
        async def wrapper(*args, **kwargs):
            for attempt in range(max_retries + 1):
                try:
                    return await func(*args, **kwargs)
                except (httpx.ConnectError, httpx.TimeoutException):
                    # Out of retries: surface the last transport error.
                    if attempt == max_retries:
                        raise
                    delay = base_delay * (2**attempt) + random.uniform(0, 0.1)
                    await asyncio.sleep(delay)
            return None  # unreachable; keeps static analyzers satisfied

        return wrapper

    return decorator


class VfkitAPIClient:
    """Client for interacting with vfkit's RESTful API."""

    def __init__(self, api_port: int, is_running_check: callable = None):
        """Initialize the vfkit API client.

        Args:
            api_port: Port number where the vfkit API is running
            is_running_check: Optional callable that returns True if VM is running
        """
        self._vfkit_api_port = api_port
        self._is_running_check = is_running_check
        self._client: httpx.AsyncClient | None = None
        self._client_lock = asyncio.Lock()

    async def _get_client(self) -> httpx.AsyncClient:
        """Get or create HTTP client with connection pooling."""
        if self._client is None:
            async with self._client_lock:
                if self._client is None:  # Double-check locking
                    self._client = httpx.AsyncClient(
                        timeout=httpx.Timeout(5.0, connect=2.0),
                        limits=httpx.Limits(max_connections=5, max_keepalive_connections=2),
                        http2=False,  # Disable HTTP/2 for simplicity
                    )
        return self._client

    async def close(self):
        """Close HTTP client and cleanup resources."""
        if self._client:
            await self._client.aclose()
            self._client = None

    async def __aenter__(self):
        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb):
        await self.close()

    async def get(self, endpoint: str) -> dict | None:
        """Make a GET request to vfkit's RESTful API.

        Args:
            endpoint: API endpoint to call

        Returns:
            dict | None: JSON response from the API, or None if no content
        """
        # BUGFIX: annotation was `-> dict` although None is returned for
        # empty/non-JSON responses (as the docstring already stated).
        return await self._call_api(endpoint, "GET")

    async def post(self, endpoint: str, data: dict | None = None) -> dict | None:
        """Make a POST request to vfkit's RESTful API.

        Args:
            endpoint: API endpoint to call
            data: JSON data to send in the request

        Returns:
            dict | None: JSON response from the API, or None if no content
        """
        return await self._call_api(endpoint, "POST", data)

    @retry_on_failure(max_retries=2, base_delay=0.1)
    async def _call_api(
        self, endpoint: str, method: str = "GET", data: dict | None = None
    ) -> dict | None:
        """Make a request to vfkit's RESTful API.

        Args:
            endpoint: API endpoint to call
            method: HTTP method to use (default: GET)
            data: JSON data to send in the request (default: None)

        Returns:
            dict | None: JSON response from the API, or None if no content

        Raises:
            VMRuntimeError: If the VM is not running or the request fails.
        """
        if self._is_running_check and not self._is_running_check():
            raise VMRuntimeError("Cannot make API request: VM is not running")

        url = f"http://localhost:{self._vfkit_api_port}{endpoint}"

        try:
            client = await self._get_client()
            response = await client.request(method, url, json=data)
            response.raise_for_status()

            if response.content:
                try:
                    return response.json()
                except (ValueError, JSONDecodeError):
                    # Non-JSON body is treated the same as no content.
                    return None
            return None

        except httpx.HTTPError as e:
            # Chain the cause so the underlying HTTP error stays visible.
            raise VMRuntimeError(f"vfkit API request failed: {e}") from e
        except Exception as e:
            raise VMRuntimeError(f"Unexpected error in vfkit API request: {e}") from e
# Regex for trimming leading zeros from MAC addresses
LEADING_ZERO_REGEXP = re.compile(r"0([A-Fa-f0-9](:|$))")


class DHCPEntry:
    """Holds a parsed DHCP entry."""

    def __init__(self):
        self.name: str | None = None
        self.ip_address: str | None = None
        self.hw_address: str | None = None
        self.identifier: str | None = None
        self.lease: str | None = None

    def __repr__(self) -> str:
        return (
            f"DHCPEntry(name='{self.name}', ip_address='{self.ip_address}', "
            f"hw_address='{self.hw_address}', identifier='{self.identifier}', lease='{self.lease}')"
        )


class IPDiscovery:
    """Discovers VM IP address via DHCP lease file parsing."""

    def __init__(self, mac_address: str, leases_file: str = DHCPD_LEASES_FILE_PATH):
        """Initialize IP discovery.

        Args:
            mac_address: MAC address to search for
            leases_file: Path to DHCP leases file
        """
        self.mac_address = self._normalize_mac(mac_address)
        self.leases_file = leases_file
        # Cache for file reading optimization: the leases file is re-read
        # and re-parsed only when its mtime changes.
        self._cached_entries: list[DHCPEntry] | None = None
        self._cached_mtime: float | None = None

    def _normalize_mac(self, mac: str) -> str:
        """Normalize MAC address by lowercasing and trimming leading zeros."""
        return LEADING_ZERO_REGEXP.sub(r"\1", mac.lower())

    async def discover_ip(self) -> str | None:
        """Discover IP address for the configured MAC address.

        Returns:
            IP address if found, None otherwise

        Raises:
            IPDiscoveryError: On unexpected (non-I/O) failures.
        """
        try:
            leases_path = Path(self.leases_file)
            if not leases_path.exists():
                logger.debug(f"DHCP leases file not found: {self.leases_file}")
                return None

            # Check if file has been modified since last read
            current_mtime = leases_path.stat().st_mtime
            if (
                self._cached_entries is not None
                and self._cached_mtime is not None
                and current_mtime == self._cached_mtime
            ):
                # Use cached entries
                entries = self._cached_entries
            else:
                # Read and cache new entries
                async with aiofiles.open(self.leases_file, "r") as file:
                    content = await file.read()
                entries = self._parse_dhcp_leases(content)
                self._cached_entries = entries
                self._cached_mtime = current_mtime

            for entry in entries:
                if entry.hw_address == self.mac_address:
                    logger.debug(f"Found IP {entry.ip_address} for MAC {self.mac_address}")
                    return entry.ip_address

            logger.debug(f"No IP found for MAC {self.mac_address}")
            return None

        except OSError as e:
            # BUGFIX: IOError has been an alias of OSError since Python 3.3,
            # so catching (OSError, IOError) was redundant.
            logger.error(f"Failed to read DHCP leases file {self.leases_file}: {e}")
            # Clear cache on error
            self._cached_entries = None
            self._cached_mtime = None
            return None
        except Exception as e:
            logger.error(f"Unexpected error discovering IP for MAC {self.mac_address}: {e}")
            raise IPDiscoveryError(f"IP discovery failed: {e}") from e

    def _parse_dhcp_leases(self, content: str) -> list[DHCPEntry]:
        """Parse DHCP leases file content.

        The file is a sequence of brace-delimited records of `key=value`
        lines (macOS /var/db/dhcpd_leases format).

        Args:
            content: Raw file content

        Returns:
            List of DHCP entries
        """
        entries = []
        current_entry = None

        for line in content.splitlines():
            line = line.strip()

            if line == "{":
                current_entry = DHCPEntry()
                continue
            elif line == "}":
                if current_entry:
                    entries.append(current_entry)
                current_entry = None
                continue

            # Ignore anything outside a record or without key=value shape.
            if current_entry is None:
                continue

            if "=" not in line:
                continue

            key, value = line.split("=", 1)
            key = key.strip()
            value = value.strip()

            if key == "name":
                current_entry.name = value
            elif key == "ip_address":
                current_entry.ip_address = value
            elif key == "hw_address":
                # Remove the hardware-type prefix ("1," = ethernet) before
                # normalizing.
                if value.startswith("1,"):
                    current_entry.hw_address = self._normalize_mac(value[2:])
                else:
                    current_entry.hw_address = self._normalize_mac(value)
            elif key == "identifier":
                current_entry.identifier = value
            elif key == "lease":
                current_entry.lease = value

        return entries
# Option declarations for the Virby nix-darwin module.
{ lib, ... }:

{
  options.services.virby = {
    enable = lib.mkEnableOption "Virby, a vfkit-based linux builder for nix-darwin";

    allowUserSsh = lib.mkOption {
      type = lib.types.bool;
      default = false;
      description = ''
        Whether to allow non-root users to SSH into the VM.

        This is useful for debugging, but it means that any user on the host machine can ssh into
        the VM without root privileges, which could pose a security risk.
      '';
    };

    cores = lib.mkOption {
      type = lib.types.int;
      default = 8;
      description = ''
        The number of CPU cores allocated to the VM.

        This also sets the `nix.buildMachines.max-jobs` setting.
      '';
    };

    debug = lib.mkOption {
      type = lib.types.bool;
      default = false;
      description = ''
        Whether to enable debug logging for the VM.

        When enabled, the launchd daemon will direct all stdout/stderr output to log files, as well
        as the VM's serial output. This is useful for debugging issues with the VM, but it may pose
        a security risk and should only be enabled when necessary.
      '';
    };

    diskSize = lib.mkOption {
      type = lib.types.str;
      default = "100GiB";
      description = ''
        The size of the disk image for the VM.

        The option value must be a string with a number, followed by the suffix "GiB".
      '';
    };

    extraConfig = lib.mkOption {
      type = lib.types.deferredModule;
      default = { };
      description = ''
        Additional NixOS modules to include in the VM's system configuration.

        The VM's default configuration allows it to be securely used as a builder. Be aware when
        using this option, that additional configuration could potentially expose the VM to
        security risks such as compromised derivations being added to the nix store.

        Any changes made to this option's value will cause a rebuild of the VM's disk image, and
        the copy-on-write overlay image will be recreated from the new base image.

        Options defined here which are also defined by the default configuration, but not forced in
        the default configuration, will override the default values. Some options in the default
        configuration are forced (with `lib.mkForce`), such as `networking.hostName`. Any options
        defined here which are forced in the default configuration will be silently ignored.
      '';
    };

    memory = lib.mkOption {
      type = with lib.types; either int str;
      default = 6144;
      description = ''
        The amount of memory to allocate to the VM in MiB.

        This can be specified as either: an integer representing an amount in MiB, e.g., `6144`, or
        a string, e.g., `"6GiB"`.
      '';
    };

    onDemand = lib.mkOption {
      type =
        with lib.types;
        (submodule {
          options = {
            enable = lib.mkOption {
              type = bool;
              default = false;
              description = ''
                Whether to enable on-demand activation of the VM.
              '';
            };
            ttl = lib.mkOption {
              type = int;
              default = 180;
              description = ''
                This specifies the number of minutes of inactivity which must pass before the VM
                shuts down.

                This option is only relevant when `onDemand.enable` is true.
              '';
            };
          };
        });
      default = { };
      # Fixed grammar: "allows builds to started" -> "allows builds to be started".
      description = ''
        By default, the VM is always-on, running as a daemon in the background. This allows builds
        to be started right away, but also means the VM will always be consuming (a small amount of)
        cpu and memory resources.

        When enabled, this option will allow the VM to be activated on-demand; when not in use, the
        VM will not be running. When a build job requiring use of the VM is initiated, it signals
        the VM to start, and once an SSH connection can be established, the VM continues the build.
        After a period of time passes in which the VM stays idle, it will shut down.

        By default, the VM waits 3 hours before shutting down, but this can be configured using the
        option `onDemand.ttl`.
      '';
    };

    port = lib.mkOption {
      type = lib.types.port;
      default = 31222;
      description = ''
        The SSH port used by the VM.
      '';
    };

    rosetta = lib.mkOption {
      type = lib.types.bool;
      default = false;
      description = ''
        Whether to enable Rosetta support for the VM.

        This is only supported on aarch64-darwin systems and allows the VM to build x86_64-linux
        packages using Rosetta translation. It is recommended to only enable this option if you
        need that functionality, as Rosetta causes a slight performance decrease in VMs when
        enabled, even when it's not being utilized.
      '';
    };

    sharedDirectories = lib.mkOption {
      type = with lib.types; attrsOf str;
      default = { };
      description = ''
        An attribute set of directories that will be shared with the VM as virtio-fs devices.

        The attribute name will be used as the mount tag.
      '';
      example = {
        tmp-share = "/tmp/virby";
      };
    };

    speedFactor = lib.mkOption {
      type = lib.types.int;
      default = 1;
      description = ''
        The speed factor to set for the VM in `nix.buildMachines`.

        This is an arbitrary integer that indicates the speed of this builder, relative to other
        builders. Higher is faster.
      '';
    };
  };
}
"""Launchd socket activation logic for Virby VM."""

import ctypes
import ctypes.util
import logging
import os
import socket
import stat
from contextlib import asynccontextmanager

from .exceptions import VMStartupError

logger = logging.getLogger(__name__)


@asynccontextmanager
async def managed_socket(fd: int, family: int, type: int):
    """Context manager for socket file descriptors.

    Args:
        fd: File descriptor to wrap (socket.fromfd duplicates it).
        family: Socket address family, e.g. socket.AF_INET.
        type: Socket type, e.g. socket.SOCK_STREAM; the name mirrors
            socket.fromfd() and intentionally shadows the builtin here.
    """
    sock = None
    try:
        sock = socket.fromfd(fd, family, type)
        yield sock
    except Exception as e:
        logger.debug(f"Error with socket FD {fd}: {e}")
        raise
    finally:
        if sock:
            try:
                sock.close()
            except Exception as e:
                logger.debug(f"Error closing socket FD {fd}: {e}")


class SocketActivation:
    """Handles launchd socket activation and file descriptor management."""

    def __init__(self, port: int, debug: bool = False):
        """Initialize socket activation manager.

        Args:
            port: Expected port number for socket activation
            debug: Enable debug logging
        """
        self.port = port
        self.debug = debug

    def _call_launch_activate_socket(self, socket_name: str) -> list[int]:
        """Use launch_activate_socket to get socket file descriptors.

        Returns an empty list when the API is unavailable or reports an
        error, so callers can fall back to manual FD scanning.
        """
        try:
            # Load the System library which contains launch_activate_socket
            libsystem = ctypes.CDLL(ctypes.util.find_library("System"))

            # Verify function exists
            if not hasattr(libsystem, "launch_activate_socket"):
                logger.debug("launch_activate_socket not available")
                return []

            # Define the function signature
            # int launch_activate_socket(const char *name, int **fds, size_t *cnt);
            launch_activate_socket = libsystem.launch_activate_socket
            launch_activate_socket.argtypes = [
                ctypes.c_char_p,  # const char *name
                ctypes.POINTER(ctypes.POINTER(ctypes.c_int)),  # int **fds
                ctypes.POINTER(ctypes.c_size_t),  # size_t *cnt
            ]
            launch_activate_socket.restype = ctypes.c_int

            # Prepare parameters
            name_bytes = socket_name.encode("utf-8")
            fds_ptr = ctypes.POINTER(ctypes.c_int)()
            count = ctypes.c_size_t()

            # Call the function
            result = launch_activate_socket(name_bytes, ctypes.byref(fds_ptr), ctypes.byref(count))

            if result != 0:
                logger.debug(f"launch_activate_socket returned error: {result}")
                return []

            if count.value == 0:
                logger.debug("launch_activate_socket returned 0 file descriptors")
                return []

            # Extract file descriptors from the returned array
            fds = []
            for i in range(count.value):
                fds.append(fds_ptr[i])

            logger.debug(f"launch_activate_socket returned {count.value} file descriptors: {fds}")
            return fds

        except (OSError, AttributeError) as e:
            logger.debug(f"Failed to load launch_activate_socket: {e}")
            return []

    def get_activation_socket(self) -> socket.socket:
        """Get the socket passed by launchd for activation.

        Raises:
            VMStartupError: If no socket bound to the expected port is found.
        """
        logger.debug("Attempting to find activation socket...")

        # Try proper launchd API first
        socket_fds = self._call_launch_activate_socket("Listener")

        if socket_fds:
            return self._process_launchd_sockets(socket_fds)

        # Limited fallback scanning
        return self._fallback_socket_scan()

    def _process_launchd_sockets(self, socket_fds: list[int]) -> socket.socket:
        """Process sockets from launchd with proper cleanup."""
        for fd in socket_fds:
            try:
                test_sock = socket.fromfd(fd, socket.AF_INET, socket.SOCK_STREAM)
                sock_name = test_sock.getsockname()
                logger.info(f"Found launchd socket on FD {fd}, bound to {sock_name}")

                if sock_name[1] == self.port:
                    # BUGFIX: socket.fromfd() duplicates the descriptor
                    # itself, so the previous os.dup() + fromfd() pattern
                    # duplicated twice and leaked the intermediate FD.
                    final_sock = socket.fromfd(fd, socket.AF_INET, socket.SOCK_STREAM)
                    test_sock.close()
                    return final_sock
                else:
                    test_sock.close()

            except Exception as e:
                logger.debug(f"Failed to process FD {fd}: {e}")
                continue

        raise VMStartupError("No matching socket found in launchd file descriptors")

    def _fallback_socket_scan(self) -> socket.socket:
        """Limited fallback file descriptor scanning."""
        logger.debug("Falling back to manual file descriptor scanning...")

        # Check environment variables for additional clues
        for env_var in ["LISTEN_FDS", "LISTEN_PID", "LAUNCH_DAEMON_SOCKET_NAME"]:
            value = os.environ.get(env_var)
            if value:
                logger.debug(f"Found env var {env_var}={value}")

        # Scan standard range for launchd sockets (typically 3-10)
        for fd in range(3, 11):
            try:
                fd_stat = os.fstat(fd)
                if not stat.S_ISSOCK(fd_stat.st_mode):
                    continue

                test_sock = socket.fromfd(fd, socket.AF_INET, socket.SOCK_STREAM)
                try:
                    sock_name = test_sock.getsockname()
                    logger.debug(f"FD {fd}: Socket bound to {sock_name}")

                    if sock_name[1] == self.port:
                        logger.info(f"Found matching socket on FD {fd}, bound to {sock_name}")
                        # BUGFIX: same double-dup FD leak as above; fromfd()
                        # already duplicates, so no manual os.dup() needed.
                        final_sock = socket.fromfd(fd, socket.AF_INET, socket.SOCK_STREAM)
                        test_sock.close()
                        return final_sock
                    else:
                        test_sock.close()

                except Exception as e:
                    logger.debug(f"Failed to get socket info for FD {fd}: {e}")
                    try:
                        test_sock.close()
                    except Exception:
                        pass

            except Exception:
                # Was `(OSError, Exception)` — OSError is already covered by
                # Exception, so the tuple was redundant.
                continue

        raise VMStartupError(f"No activation socket found on port {self.port}")
}: { 23 | darwinConfigurations."myHost" = { 24 | # Import the module 25 | modules = [ virby.darwinModules.default ]; 26 | }; 27 | }; 28 | } 29 | ``` 30 | 31 | > [!Important] 32 | > When enabling Virby for the first time, you must add the binary cache to your Nix configuration. This ensures that the prebuilt VM image is available for download, rather than having to be built locally, which requires an existing linux builder. You can do this in one of two ways: 33 | 34 | Add the binary cache to your configuration **before** enabling Virby: 35 | 36 | ```nix 37 | { 38 | nix.settings.extra-substituters = [ "https://virby-nix-darwin.cachix.org" ]; 39 | nix.settings.extra-trusted-public-keys = [ 40 | "virby-nix-darwin.cachix.org-1:z9GiEZeBU5bEeoDQjyfHPMGPBaIQJOOvYOOjGMKIlLo=" 41 | ]; 42 | 43 | services.virby.enable = false; 44 | } 45 | ``` 46 | 47 | Run `darwin-rebuild`, then enable Virby: 48 | 49 | ```nix 50 | { 51 | nix.settings.extra-substituters = [ "https://virby-nix-darwin.cachix.org" ]; 52 | nix.settings.extra-trusted-public-keys = [ 53 | "virby-nix-darwin.cachix.org-1:z9GiEZeBU5bEeoDQjyfHPMGPBaIQJOOvYOOjGMKIlLo=" 54 | ]; 55 | 56 | # Don't configure any other Virby options until after you've switched to the new 57 | # configuration. If the hash for the disk image derivation doesn't match the one 58 | # in the binary cache, then nix will try to build the image locally. 59 | services.virby.enable = true; 60 | } 61 | ``` 62 | 63 | Finally, rebuild again. 
64 | 65 | **OR** 66 | 67 | Run the `darwin-rebuild` command with the following options: 68 | 69 | ```bash 70 | sudo darwin-rebuild switch --flake .#myHost \ 71 | --option "extra-substituters" "https://virby-nix-darwin.cachix.org" \ 72 | --option "extra-trusted-public-keys" "virby-nix-darwin.cachix.org-1:z9GiEZeBU5bEeoDQjyfHPMGPBaIQJOOvYOOjGMKIlLo=" 73 | ``` 74 | 75 | If you prefer building the image locally, you can enable the `nix.linux-builder` option before enabling Virby: 76 | 77 | ```nix 78 | { 79 | nix.linux-builder.enable = true; 80 | 81 | services.virby.enable = false; 82 | } 83 | ``` 84 | 85 | ## Key Features 86 | 87 | - **On-demand activation** (optional) - VM is started when needed, then shuts down after a period of inactivity 88 | - **Rosetta support** (optional) - Build `x86_64-linux` packages on Apple Silicon using Rosetta translation 89 | - **Secure by default** - Host-only access via loopback (i.e. `127.0.0.1`), with automatic ED25519 key generation 90 | - **Fully configurable** - Adjust VM resources and add custom NixOS modules 91 | 92 | ## Configuration 93 | 94 | ### Basic Settings 95 | 96 | | Option | Type | Default | Description | 97 | |---------------|------------|------------|----------------------------------------------| 98 | | `enable` | _bool_ | `false` | Enable the service | 99 | | `cores` | _int_ | `8` | CPU cores allocated to VM | 100 | | `memory` | _int_ or _string_ | `6144` | Memory in MiB or string format (e.g. 
"6GiB") | 101 | | `diskSize` | _string_ | `"100GiB"` | VM disk size | 102 | | `port` | _int_ | `31222` | SSH port for VM access | 103 | | `speedFactor` | _int_ | `1` | Speed factor for Nix build machine | 104 | 105 | ### Other Settings 106 | 107 | **On-demand Activation** 108 | 109 | ```nix 110 | { 111 | services.virby.onDemand.enable = true; 112 | services.virby.onDemand.ttl = 180; # Idle timeout in minutes 113 | } 114 | ``` 115 | 116 | **Rosetta Support** 117 | 118 | ```nix 119 | # Requires `aarch64-darwin` host 120 | { 121 | services.virby.rosetta = true; 122 | } 123 | ``` 124 | 125 | **Custom NixOS Configuration** 126 | 127 | 128 | ```nix 129 | { 130 | services.virby.extraConfig = { 131 | inherit (config.nix) settings; 132 | # Some NixOS options which are defined in the default VM configuration cannot 133 | # be overridden, such as `networking.hostName`. Others may be overridden with 134 | # `lib.mkForce`. Also note that anything changed here will cause a rebuild of 135 | # the VM image, and SSH keys will be regenerated. 136 | }; 137 | } 138 | ``` 139 | > [!Warning] 140 | > This option allows you to arbitrarily change the NixOS configuration, which could expose the VM to security risks. 
141 | 142 | **Debug Options** (insecure, for troubleshooting only) 143 | 144 | ```nix 145 | { 146 | services.virby.debug = true; # Enable verbose logging 147 | services.virby.allowUserSsh = true; # Allow non-root SSH access 148 | } 149 | ``` 150 | 151 | ## Architecture 152 | 153 | Virby integrates three components: 154 | 155 | - **nix-darwin Module** - Configures VM as a Nix build machine for host 156 | - **VM Image** - Minimal NixOS disk image configured for secure ssh access and build isolation 157 | - **VM Runner** - Python package managing VM lifecycle and SSH proxying 158 | 159 | **Build workflow:** Linux build requested → VM started (if needed) → Build on VM → Results copied to host → VM shutdown (after idle timeout) 160 | 161 | **Security model:** 162 | - VM doesn't accept remote connections as it binds to the loopback interface 163 | - SSH keys are generated and copied to the VM on first run. 164 | - `builder` user has minimal permissions, root access is restricted by default 165 | 166 | ## Benchmarks 167 | 168 | | Test | Command | Mean [s] | Min [s] | Max [s] | Relative | 169 | |:-----|:--------|---------:|--------:|--------:|---------:| 170 | | Boot | `ssh virby-vm -- true` (triggers startup in on-demand mode) | 9.203 ± 0.703 | 7.795 | 9.818 | 1.00 | 171 | | Build | `nix build --rebuild nixpkgs#hello` | 8.136 ± 0.031 | 8.087 | 8.173 | 1.00 | 172 | 173 | ## Troubleshooting 174 | 175 | **Debug logging** 176 | ```nix 177 | { 178 | # Enable debug logging to `/tmp/virbyd.log` 179 | services.virby.debug = true; 180 | } 181 | ``` 182 | 183 | ```bash 184 | # View daemon logs 185 | tail -f /tmp/virbyd.log 186 | ``` 187 | 188 | **SSH into VM** 189 | 190 | ```bash 191 | # Requires `allowUserSsh = true` 192 | ssh virby-vm 193 | # or use sudo 194 | ``` 195 | 196 | ## Acknowledgments 197 | 198 | - Inspired by [nix-rosetta-builder](https://github.com/cpick/nix-rosetta-builder) 199 | - Uses [vfkit](https://github.com/crc-org/vfkit) 200 | 201 | --- 202 | 203 | 
**License**: MIT - see [LICENSE](LICENSE) file for details. 204 | -------------------------------------------------------------------------------- /scripts/benchmark-vm/benchmark-vm.sh: -------------------------------------------------------------------------------- 1 | # Don't run this script directly. 2 | # Use: 3 | # `nix run .#benchmark-vm -- {subcommand} [options]` 4 | # 5 | # This script benchmarks the performance of the Virby VM. It depends on a nix-darwin configuration 6 | # with `services.virby.enable = true`. In the future, I may add logic to setup a temporary VM and 7 | # mock launchd environment. 8 | # 9 | # Currently, it is not possible to build derivations on a remote builder while specifying the `--rebuild` 10 | # flag (see: https://github.com/NixOS/nix/issues/10451), so, the workaround is to manually ssh into 11 | # the VM and run the `nix build` command instead. 12 | 13 | set -eo pipefail 14 | 15 | show_help() { 16 | echo "Benchmark the performance of the Virby VM" 17 | echo 18 | echo "Usage:" 19 | echo " benchmark-vm {boot|build|help} [options]" 20 | echo 21 | echo "Subcommands:" 22 | echo " boot Measure the time it takes to boot the VM from a cold start" 23 | echo " build [DERIVATION] Measure the time it takes to build a derivation on the VM (default: 'nixpkgs#hello')" 24 | echo " help Show this help message" 25 | echo 26 | echo "Options:" 27 | echo " -d, --output-dir DIRECTORY Specify the output directory for the results (default: current directory)" 28 | echo " -f, --format FORMAT Specify the format in which the results will be exported (default: markdown)" 29 | echo " Supported formats: asciidoc, csv, json, markdown, org" 30 | echo " -h, --help Show this help message" 31 | echo " -r, --runs RUNS Specify the number of times to run the benchmark (default: 5)" 32 | } 33 | 34 | show_ssh_warning() { 35 | echo -e "${YELLOW}Warning: The script may be unable to connect to the VM via SSH. 
run_benchmark() {
  # Run the selected hyperfine benchmark and export the results to a file.
  #
  # $1: benchmark type, either 'boot' or 'build'. Relies on the globals set
  # during argument parsing: filename_prefix, filename_extension, output_dir,
  # export_format, runs, derivation.
  #
  # Fix: the export path previously used the undefined command substitution
  # "$(unknown)" instead of the computed "$filename" local, so hyperfine's
  # export and the failure cleanup pointed at a bogus path.
  local benchmark_type filename timestamp
  local args=()
  benchmark_type="$1"
  timestamp=$(date +%Y%m%d-%H%M%S)
  filename="${filename_prefix}-${benchmark_type}-${timestamp}.${filename_extension}"

  echo -e "${BOLD}Running benchmark:${RESET} ${benchmark_type}"
  echo -e "${BOLD}Export format:${RESET} ${export_format}"
  echo -e "${BOLD}Output file:${RESET} ${output_dir}/${filename}"
  echo -e "${BOLD}Runs:${RESET} ${runs}"
  echo

  args+=(
    "--style" "full"
    "--runs" "${runs}"
    "--export-${export_format}" "${output_dir}/${filename}"
  )

  if [[ $benchmark_type == 'boot' ]]; then
    # Cold-boot measurement: ensure the VM has come up at least once, then
    # stop it before every timed run so each SSH triggers a fresh startup.
    args+=(
      "--setup" "check_vm_is_started || (ssh_vm; sleep 3)"
      "--prepare" "stop_vm; sleep 3"
      "ssh_vm"
    )
  elif [[ $benchmark_type == 'build' ]]; then
    # Build measurement: pre-build once in --setup so only the forced
    # rebuild is timed.
    args+=(
      "--warmup" "1"
      "--setup" "ssh_vm nix build --no-link $derivation"
      "ssh_vm nix build --no-link --rebuild $derivation"
    )
  fi

  if ! hyperfine "${args[@]}"; then
    # Remove an empty export file left behind by a failed run.
    if [[ -f ${output_dir}/${filename} && -z $(cat "${output_dir}/${filename}") ]]; then
      rm -f "${output_dir}/${filename}"
    fi

    show_ssh_warning
    exit 1
  fi
}
-d $1 ]]; then 145 | echo -e "${RED}Error: $1: does not exist, or is not a directory${RESET}" >&2 146 | exit 1 147 | fi 148 | output_dir="$1" 149 | shift 150 | ;; 151 | 152 | -f | --format) 153 | shift 154 | if [[ -z $1 ]]; then 155 | echo -e "${RED}Error: argument requires 1 arg, but 0 were given${RESET}" >&2 156 | exit 1 157 | fi 158 | case $1 in 159 | asciidoc) 160 | export_format="$1" 161 | filename_extension="adoc" 162 | ;; 163 | csv) 164 | export_format="$1" 165 | filename_extension="csv" 166 | ;; 167 | json) 168 | export_format="$1" 169 | filename_extension="json" 170 | ;; 171 | markdown) 172 | export_format="$1" 173 | filename_extension="md" 174 | ;; 175 | org) 176 | export_format="$1" 177 | filename_extension="org" 178 | ;; 179 | *) 180 | echo -e "${RED}Error: '$1' is not one of: asciidoc, csv, json, markdown, org${RESET}" >&2 181 | exit 1 182 | ;; 183 | esac 184 | shift 185 | ;; 186 | 187 | -r | --runs) 188 | shift 189 | if [[ -z $1 ]]; then 190 | echo -e "${RED}Error: argument requires 1 arg, but 0 were given${RESET}" >&2 191 | exit 1 192 | elif ! [[ $1 =~ ^[0-9]+$ ]]; then 193 | echo -e "${RED}Error: Invalid number of runs: $1${RESET}" >&2 194 | exit 1 195 | fi 196 | runs="$1" 197 | shift 198 | ;; 199 | 200 | boot) 201 | shift 202 | command="run_benchmark boot" 203 | ;; 204 | 205 | build) 206 | shift 207 | if [[ -n $1 && $1 != -* ]]; then 208 | if ! 
"""Configuration management for the Virby VM runner."""

import json
import logging
import os
from pathlib import Path
from typing import Any, Dict

from .constants import WORKING_DIRECTORY
from .exceptions import VMConfigurationError

logger = logging.getLogger(__name__)


class VMConfig:
    """VM configuration management."""

    def __init__(self, config_path: str | None = None):
        """Initialize VM configuration.

        Args:
            config_path: Path to JSON configuration file.

        Raises:
            ValueError: If no configuration path is given.
            VMConfigurationError: If the file is missing, malformed, or
                contains invalid settings.
        """
        if not config_path:
            raise ValueError("Configuration file path must be provided")

        self.config_path: Path = Path(config_path)
        self._config: Dict[str, Any] = self._load_config()
        self._validate_and_store_config()

    def _load_config(self) -> Dict[str, Any]:
        """Load configuration from JSON file."""
        try:
            with open(self.config_path) as f:
                config: Dict[str, Any] = json.load(f)
            logger.debug(f"Loaded configuration from {self.config_path}")
            return config
        except FileNotFoundError:
            raise VMConfigurationError(f"Configuration file not found: {self.config_path}")
        except json.JSONDecodeError as e:
            raise VMConfigurationError(f"Invalid JSON in configuration file: {e}")
        except Exception as e:
            raise VMConfigurationError(f"Failed to load configuration: {e}")

    def _validate_and_store_config(self) -> None:
        """Validate configuration parameters and store validated values."""
        required_fields = ["cores", "memory"]

        for field in required_fields:
            if field not in self._config:
                raise VMConfigurationError(f"Required configuration field missing: {field}")

        # Validate and store cores
        cores = self._config["cores"]
        if not isinstance(cores, int) or cores < 1:
            raise VMConfigurationError(
                f"Invalid cores setting: {cores}. Expected: positive integer"
            )
        self._cores = cores

        # Validate and store memory
        memory = self._config["memory"]
        if not isinstance(memory, int) or memory < 1024:
            raise VMConfigurationError(
                f"Invalid memory setting: {memory}. Expected: at least 1024 MiB"
            )
        self._memory = memory

        # Validate and store debug
        debug = self._config.get("debug", False)
        if not isinstance(debug, bool):
            raise VMConfigurationError(f"Invalid debug setting: {debug}. Expected: boolean")
        self._debug_enabled = debug

        # Validate and store port. A missing port fails validation too,
        # because None is not an int.
        port = self._config.get("port", None)
        if not isinstance(port, int) or port < 1024 or port > 65535:
            raise VMConfigurationError(
                f"Invalid port: {port}. Expected: integer between 1024 and 65535"
            )
        self._port = port

        # Validate and store rosetta
        rosetta = self._config.get("rosetta", False)
        if not isinstance(rosetta, bool):
            raise VMConfigurationError(f"Invalid rosetta setting: {rosetta}. Expected: boolean")
        self._rosetta_enabled = rosetta

        # Validate and store on-demand
        on_demand = self._config.get("on-demand", False)
        if not isinstance(on_demand, bool):
            raise VMConfigurationError(f"Invalid on-demand setting: {on_demand}. Expected: boolean")
        self._on_demand_enabled = on_demand

        # Validate and store TTL (seconds; 0 means no idle shutdown)
        on_demand_ttl = self._config.get("ttl", 10800)
        if not isinstance(on_demand_ttl, int) or on_demand_ttl < 0:
            raise VMConfigurationError(
                f"Invalid ttl: {on_demand_ttl}. Expected: non-negative integer"
            )
        self._on_demand_ttl = on_demand_ttl

        # Validate and store shared-dirs
        self._shared_dirs: dict[str, Path] = {}
        shared_dirs = self._config.get("shared-dirs", {})
        if not isinstance(shared_dirs, dict):
            raise VMConfigurationError(f"Invalid shared-dirs: {shared_dirs}. Expected: dictionary")
        for tag, path in shared_dirs.items():
            host_path = Path(path)
            if not host_path.exists():
                raise VMConfigurationError(f"Shared directory does not exist on host: {host_path}")
            self._shared_dirs[tag] = host_path.resolve()

        # IP discovery / SSH readiness timeouts
        self._ip_discovery_timeout = self._config.get("ip_discovery_timeout", 60)
        self._ssh_ready_timeout = self._config.get("ssh_ready_timeout", 30)

        # VM operation timeouts
        self._vm_pause_timeout = self._config.get("vm_pause_timeout", 30)
        self._vm_resume_timeout = self._config.get("vm_resume_timeout", 30)
        self._vm_stop_timeout = self._config.get("vm_stop_timeout", 30)

        # Fix: ip_discovery_timeout and ssh_ready_timeout were previously
        # stored unvalidated (unlike every other numeric setting), so bad
        # values only surfaced later as TypeErrors in the property
        # accessors. Validate all timeouts uniformly here instead.
        for timeout_name, timeout_val in [
            ("ip_discovery_timeout", self._ip_discovery_timeout),
            ("ssh_ready_timeout", self._ssh_ready_timeout),
            ("vm_pause_timeout", self._vm_pause_timeout),
            ("vm_resume_timeout", self._vm_resume_timeout),
            ("vm_stop_timeout", self._vm_stop_timeout),
        ]:
            if not isinstance(timeout_val, int) or timeout_val < 1:
                raise VMConfigurationError(f"Invalid {timeout_name}: {timeout_val}")

    @property
    def cores(self) -> int:
        """Get number of CPU cores."""
        return self._cores

    @property
    def memory(self) -> int:
        """Get memory size in MiB."""
        return self._memory

    @property
    def debug_enabled(self) -> bool:
        """Check if debug mode is enabled."""
        return self._debug_enabled

    @property
    def port(self) -> int:
        """Get SSH port."""
        return self._port

    @property
    def rosetta_enabled(self) -> bool:
        """Check if Rosetta is enabled."""
        return bool(self._rosetta_enabled)

    @property
    def working_directory(self) -> Path:
        """Get working directory (env var override wins over the constant)."""
        value = os.getenv("VIRBY_WORKING_DIRECTORY", WORKING_DIRECTORY)
        return Path(value)

    @property
    def ip_discovery_timeout(self) -> int:
        """Get IP discovery timeout in seconds."""
        return int(self._ip_discovery_timeout)

    @property
    def ssh_ready_timeout(self) -> int:
        """Get SSH ready timeout in seconds."""
        return int(self._ssh_ready_timeout)

    @property
    def on_demand_enabled(self) -> bool:
        """Check if on-demand activation is enabled."""
        return bool(self._on_demand_enabled)

    @property
    def on_demand_ttl(self) -> int:
        """Get TTL (time to live) in seconds for on-demand VM shutdown."""
        return int(self._on_demand_ttl)

    @property
    def shared_dirs(self) -> Dict[str, Path]:
        """Get shared directories mapping."""
        return self._shared_dirs

    @property
    def vm_pause_timeout(self) -> int:
        """Get VM pause timeout in seconds."""
        return self._vm_pause_timeout

    @property
    def vm_resume_timeout(self) -> int:
        """Get VM resume timeout in seconds."""
        return self._vm_resume_timeout

    @property
    def vm_stop_timeout(self) -> int:
        """Get VM stop timeout in seconds."""
        return self._vm_stop_timeout

    def __repr__(self) -> str:
        """String representation of configuration."""
        return ", ".join(
            [
                f"VMConfig(cores={self.cores}",
                f"debug={self.debug_enabled}",
                f"ip_discovery_timeout={self.ip_discovery_timeout}",
                f"memory={self.memory}MiB",
                f"on_demand_enabled={self._on_demand_enabled}",
                f"on_demand_ttl={self.on_demand_ttl}",
                f"port={self.port}",
                f"rosetta_enabled={self.rosetta_enabled}",
                f"shared_dirs={self.shared_dirs}",
                f"ssh_ready_timeout={self.ssh_ready_timeout}",
                f"vm_pause_timeout={self.vm_pause_timeout}",
                f"vm_resume_timeout={self.vm_resume_timeout}",
                f"vm_stop_timeout={self.vm_stop_timeout}",
                f"working_directory={self.working_directory})",
            ]
        )
# Virby nix-darwin module: provisions the darwin-side service account and
# working directory, the launchd daemon that runs the VM, and the nix
# build-machine entry that exposes the Linux VM as a remote builder.
{ _lib, self }:
{
  config,
  lib,
  pkgs,
  ...
}:

let
  # Shared path/user-name constants so the darwin side and the VM image
  # agree on file locations.
  inherit (_lib.constants)
    baseDiskFileName
    diffDiskFileName
    sshdKeysSharedDirName
    sshHostPrivateKeyFileName
    sshHostPublicKeyFileName
    sshKnownHostsFileName
    sshUserPrivateKeyFileName
    sshUserPublicKeyFileName
    vmHostName
    vmUser
    workingDirectory
    ;

  inherit (_lib.helpers)
    doppelganger
    logError
    logInfo
    parseMemoryMiB
    setupLogFunctions
    ;

  cfg = config.services.virby;

  # Tools available to the launchd runner script below.
  binPath = lib.makeBinPath (
    with pkgs;
    [
      coreutils
      findutils
      gnugrep
      nix
      openssh
      self.packages.${stdenv.hostPlatform.system}.vm-runner
    ]
  );

  # Linux counterpart of the darwin host system (helper from _lib.helpers;
  # presumably aarch64-darwin -> aarch64-linux — see lib/helpers.nix).
  linuxSystem = doppelganger pkgs.stdenv.hostPlatform.system;

  # VM image rebuilt with the user's final option values; changing any of
  # these triggers a new image (and, below, fresh disks and SSH keys).
  imageWithFinalConfig = self.packages.${linuxSystem}.vm-image.override {
    inherit (cfg)
      debug
      extraConfig
      onDemand
      rosetta
      ;
  };

  baseDiskPath = "${workingDirectory}/${baseDiskFileName}";
  diffDiskPath = "${workingDirectory}/${diffDiskFileName}";
  sourceImagePath = "${imageWithFinalConfig}/${imageWithFinalConfig.passthru.filePath}";

  sshHostKeyAlias = "${vmHostName}-key";

  daemonName = "virbyd";

  # Dedicated hidden service account; UID intentionally equals GID.
  darwinGid = 348;
  darwinGroup = "virby";
  darwinUid = darwinGid;
  darwinUser = "_${darwinGroup}";
  groupPath = "/Groups/${darwinGroup}";
  userPath = "/Users/${darwinUser}";

  # JSON config consumed by the Python vm-runner (see VMConfig keys).
  vmConfigJson = pkgs.writeText "virby-vm-config.json" (
    builtins.toJSON {
      cores = cfg.cores;
      debug = cfg.debug;
      memory = parseMemoryMiB cfg.memory;
      on-demand = cfg.onDemand.enable;
      port = cfg.port;
      rosetta = cfg.rosetta;
      ttl = cfg.onDemand.ttl * 60; # Convert to seconds
      shared-dirs = cfg.sharedDirectories;
    }
  );

  # Launchd entry point: ensures disk images and SSH key material exist in
  # the working directory, fixes permissions, then execs the vm-runner.
  runnerScript = pkgs.writeShellScript "${daemonName}-runner" ''
    PATH=${binPath}:$PATH

    set -euo pipefail

    NEEDS_GENERATE_SSH_KEYS=0

    should_generate_ssh_keys() {
      local key_files=(
        ${sshdKeysSharedDirName}/${sshHostPrivateKeyFileName}
        ${sshdKeysSharedDirName}/${sshUserPublicKeyFileName}
        ${sshHostPublicKeyFileName}
        ${sshUserPrivateKeyFileName}
      )

      [[ $NEEDS_GENERATE_SSH_KEYS == 1 ]] && return 0

      for file in "''${key_files[@]}"; do
        [[ ! -f $file ]] && return 0
      done
    }

    generate_ssh_keys() {
      local temp_dir=$(mktemp -d)
      local temp_host_key="$temp_dir/host_key"
      local temp_user_key="$temp_dir/user_key"
      local user_key_required_mode=${if cfg.allowUserSsh then "644" else "600"}

      trap "rm -rf $temp_dir" RETURN

      ssh-keygen -C ${darwinUser}@darwin -f "$temp_user_key" -N "" -t ed25519 || return 1
      ssh-keygen -C root@${vmHostName} -f "$temp_host_key" -N "" -t ed25519 || return 1

      # Set permissions based on `cfg.allowUserSsh`
      chmod 640 "$temp_host_key.pub" "$temp_user_key.pub"
      chmod 600 "$temp_host_key"
      chmod "$user_key_required_mode" "$temp_user_key"

      # Remove old keys if they exist
      rm -f ${sshUserPrivateKeyFileName} ${sshHostPublicKeyFileName}
      rm -rf ${sshdKeysSharedDirName}

      echo "${sshHostKeyAlias} $(cat $temp_host_key.pub)" > ${sshKnownHostsFileName}

      mkdir -p ${sshdKeysSharedDirName}

      mv "$temp_user_key" ${sshUserPrivateKeyFileName}
      mv "$temp_host_key.pub" ${sshHostPublicKeyFileName}
      mv "$temp_host_key" ${sshdKeysSharedDirName}/${sshHostPrivateKeyFileName}
      mv "$temp_user_key.pub" ${sshdKeysSharedDirName}/${sshUserPublicKeyFileName}
    }

    umask 'g-w,o='
    chmod 'g-w,o=x' .

    source_image_path_marker="${workingDirectory}/.disk-image-store-path"
    current_source_image_path=$(cat $source_image_path_marker 2>/dev/null) || true

    if [[ ! -f ${diffDiskPath} ]] || [[ $current_source_image_path != ${imageWithFinalConfig} ]]; then
      ${logInfo} "Creating VM disk images..."

      rm -f ${baseDiskPath} ${diffDiskPath}

      if ! cp ${sourceImagePath} ${baseDiskPath}; then
        ${logError} "Failed to copy source image to ${baseDiskPath}"
        exit 1
      fi
      ${logInfo} "Copied base disk image to ${baseDiskPath}"

      if ! (cp --reflink=always ${baseDiskPath} ${diffDiskPath} && chmod 'u+w' ${diffDiskPath}); then
        ${logError} "Failed to create diff disk image"
        exit 1
      fi
      ${logInfo} "Created diff disk image: ${diffDiskPath}"

      if ! truncate -s ${cfg.diskSize} ${diffDiskPath}; then
        ${logError} "Failed to resize diff disk image to ${cfg.diskSize}"
        exit 1
      fi
      ${logInfo} "Resized diff disk image to ${cfg.diskSize}"

      echo ${imageWithFinalConfig} > "$source_image_path_marker"

      NEEDS_GENERATE_SSH_KEYS=1
    fi

    if should_generate_ssh_keys; then
      ${logInfo} "Generating SSH keys..."
      if ! generate_ssh_keys; then
        ${logError} "Failed to generate SSH keys"
        exit 1
      fi
    fi

    # If `cfg.allowUserSsh` is true, the user key should be group-readable, otherwise it
    # should be owner-only
    user_key_required_mode=${if cfg.allowUserSsh then "644" else "600"}
    user_key_actual_mode=$(stat -c "%a" ${sshUserPrivateKeyFileName} 2>/dev/null)

    if [[ $user_key_required_mode -ne $user_key_actual_mode ]]; then
      if ! chmod "$user_key_required_mode" ${sshUserPrivateKeyFileName}; then
        ${logError} "Failed to set permissions on ${sshUserPrivateKeyFileName}"
        exit 1
      fi
    fi

    if ! chmod 'go+r' ${sshKnownHostsFileName}; then
      ${logError} "Failed to set permissions on ${sshKnownHostsFileName}"
      exit 1
    fi

    ${logInfo} "Starting VM..."

    if ! exec virby-vm; then
      ${logError} "Failed to start the VM"
      exit 1
    fi
  '';
in

{
  imports = [ ./options.nix ];

  config = lib.mkMerge [
    # When disabled: tear down the working directory and the dedicated
    # service account/group (refusing to delete a mismatched UID/GID,
    # which would indicate they belong to something else).
    (lib.mkIf (!cfg.enable) {
      system.activationScripts.postActivation.text = lib.mkBefore ''
        ${setupLogFunctions}

        if [[ -d ${workingDirectory} ]]; then
          logInfo "Removing working directory..."
          rm -rf ${workingDirectory}
        fi

        if uid=$(id -u ${darwinUser} 2>/dev/null); then
          if [[ $uid -ne ${toString darwinUid} ]]; then
            logError "Existing user: ${darwinUser} has unexpected UID: $uid"
            exit 1
          fi

          logInfo "Deleting user ${darwinUser}..."
          dscl . -delete ${userPath}
        fi

        unset 'uid'

        if primaryGroupId=$(dscl . -read ${groupPath} 'PrimaryGroupID' 2>/dev/null | cut -d' ' -f2); then
          if [[ $primaryGroupId -ne ${toString darwinGid} ]]; then
            logError "Existing group: ${darwinGroup} has unexpected GID: $primaryGroupId"
            exit 1
          fi

          logInfo "Deleting group ${darwinGroup}..."
          dscl . -delete ${groupPath}
        fi

        unset 'primaryGroupId'
      '';
    })

    # When enabled: create the account, daemon, SSH client config, and
    # register the VM as a distributed-build machine.
    (lib.mkIf cfg.enable {
      assertions = [
        {
          assertion = !(pkgs.stdenv.hostPlatform.system != "aarch64-darwin" && cfg.rosetta);
          message = "Rosetta is only supported on aarch64-darwin systems.";
        }
      ];

      system.activationScripts.extraActivation.text = lib.mkAfter ''
        ${setupLogFunctions}

        # Create group
        if ! primaryGroupId=$(dscl . -read ${groupPath} 'PrimaryGroupID' 2>/dev/null | cut -d' ' -f2); then
          logInfo "Creating group ${darwinGroup}..."
          dscl . -create ${groupPath} 'PrimaryGroupID' ${toString darwinGid}
        elif [[ $primaryGroupId -ne ${toString darwinGid} ]]; then
          logError "Existing group: ${darwinGroup} has unexpected GID: $primaryGroupId, expected: ${toString darwinGid}"
          exit 1
        fi

        unset 'primaryGroupId'

        # Create user
        if ! uid=$(id -u ${darwinUser} 2>/dev/null); then
          logInfo "Setting up user ${darwinUser}..."
          dscl . -create ${userPath}
          dscl . -create ${userPath} 'PrimaryGroupID' ${toString darwinGid}
          dscl . -create ${userPath} 'NFSHomeDirectory' ${workingDirectory}
          dscl . -create ${userPath} 'UserShell' /usr/bin/false
          dscl . -create ${userPath} 'IsHidden' 1
          dscl . -create ${userPath} 'UniqueID' ${toString darwinUid}
        elif [[ $uid -ne ${toString darwinUid} ]]; then
          logError "Existing user: ${darwinUser} has unexpected UID: $uid, expected: ${toString darwinUid}"
          exit 1
        fi

        unset 'uid'

        # Setup working directory
        if [[ ! -d ${workingDirectory} ]]; then
          logInfo "Setting up working directory..."
          mkdir -p ${workingDirectory}
        fi

        chown ${darwinUser}:${darwinGroup} ${workingDirectory}
      '';

      # Client-side SSH config: pin the host key via alias/known-hosts and
      # always connect to the forwarded loopback port.
      environment.etc."ssh/ssh_config.d/100-${vmHostName}.conf".text = ''
        Host ${vmHostName}
          GlobalKnownHostsFile ${workingDirectory}/${sshKnownHostsFileName}
          UserKnownHostsFile /dev/null
          HostKeyAlias ${sshHostKeyAlias}
          Hostname localhost
          AddressFamily inet
          IdentitiesOnly yes
          IdentityFile ${workingDirectory}/${sshUserPrivateKeyFileName}
          Port ${toString cfg.port}
          StrictHostKeyChecking yes
          User ${vmUser}
      '';

      launchd.daemons = {
        ${daemonName} = {
          path = [ "/bin" ];
          command = runnerScript;

          serviceConfig = {
            UserName = darwinUser;
            WorkingDirectory = workingDirectory;
            # On-demand mode relies on socket activation instead of KeepAlive.
            KeepAlive = !cfg.onDemand.enable;

            # Launchd-held listener the vm-runner adopts via socket activation.
            Sockets.Listener = {
              SockFamily = "IPv4";
              SockNodeName = "localhost";
              SockServiceName = toString cfg.port;
            };

            EnvironmentVariables = {
              VIRBY_VM_CONFIG_FILE = toString vmConfigJson;
            };
          }
          // lib.optionalAttrs cfg.debug { StandardOutPath = "/tmp/${daemonName}.log"; };
        };
      };

      nix = {
        buildMachines = [
          {
            hostName = vmHostName;
            maxJobs = cfg.cores;
            protocol = "ssh-ng";
            supportedFeatures = [
              "benchmark"
              "big-parallel"
              "kvm"
              "nixos-test"
            ];
            speedFactor = cfg.speedFactor;
            systems = [ linuxSystem ] ++ lib.optional cfg.rosetta "x86_64-linux";
          }
        ];

        distributedBuilds = lib.mkForce true;
        settings.builders-use-substitutes = lib.mkDefault true;
      };
    })
  ];
}
"""Virby VM runner."""

import asyncio
import logging
import socket
import sys
import time

from .config import VMConfig
from .exceptions import VMStartupError
from .signal_manager import SignalManager
from .socket_activation import SocketActivation
from .vm_process import VMProcess

logger = logging.getLogger(__name__)


class VirbyVMRunner:
    """VM runner that integrates with the Virby Nix-darwin module.

    Serves the launchd activation socket, proxying each client connection to
    the VM's SSH port (22). In on-demand mode the VM is started/resumed lazily
    on the first connection and paused/stopped after the TTL expires with no
    active connections.
    """

    def __init__(self, config: VMConfig, signal_manager: SignalManager):
        """Initialize the runner.

        Args:
            config: Parsed VM configuration.
            signal_manager: Shared coordinator used to observe/request shutdown.
        """
        self.config = config
        self.signal_manager = signal_manager

        # Initialize components
        self.vm_process = VMProcess(config, config.working_directory)
        self.socket_activation = SocketActivation(config.port, config.debug_enabled)

        # Runner state
        self._shutdown_requested = False
        self._activation_socket: socket.socket | None = None
        self._active_connections: int = 0
        self._last_connection_time: int | float = 0
        self._shutdown_timer: asyncio.Task | None = None

    async def _handle_activation_connections(self) -> None:
        """Serve the inherited activation socket until cancelled.

        Raises:
            VMStartupError: If no activation socket was obtained beforehand.
        """
        if not self._activation_socket:
            raise VMStartupError("No activation socket available")

        async def handle_client(reader: asyncio.StreamReader, writer: asyncio.StreamWriter) -> None:
            """Handle a single client connection."""
            try:
                await self._proxy_connection(reader, writer)
            except Exception as e:
                logger.error(f"Error handling client connection: {e}")

        # Start server using the socket inherited from launchd (socket activation)
        server = await asyncio.start_server(handle_client, sock=self._activation_socket)

        logger.info("Started proxy server for socket activation")

        # Keep serving until shutdown
        async with server:
            try:
                await server.serve_forever()
            except asyncio.CancelledError:
                logger.info("Proxy server cancelled")

    async def _ensure_vm_ready(self) -> None:
        """Ensure the VM is started (or resumed) and ready for connections.

        Raises:
            VMStartupError: If a shutdown has already been requested.
        """
        # Check for shutdown signal
        if self.signal_manager.is_shutdown_requested():
            raise VMStartupError("Shutdown requested, not starting VM")

        # Also check VM process shutdown flag if it exists
        if hasattr(self, "vm_process") and self.vm_process._shutdown_requested:
            raise VMStartupError("VM shutdown already requested")

        vm_running = self.vm_process.is_running
        if self.config.on_demand_enabled:
            # FIX: can_resume() is a coroutine (async def in vm_process.py).
            # Without `await` the returned coroutine object was always truthy,
            # so resume_or_start() ran unconditionally and a "coroutine was
            # never awaited" warning was emitted.
            can_resume = await self.vm_process.can_resume()
            if can_resume or not vm_running:
                ip = await self.vm_process.resume_or_start()
                logger.info(f"VM ready (ip: {ip})")
        else:
            if not vm_running:
                ip = await self.vm_process.start()
                logger.info(f"VM ready (ip: {ip})")

    async def _proxy_connection(
        self, client_reader: asyncio.StreamReader, client_writer: asyncio.StreamWriter
    ) -> None:
        """Proxy a client connection to the VM's SSH port.

        Tracks the active-connection count and (in on-demand mode) arms the
        TTL shutdown timer when the last connection closes.
        """
        self._active_connections += 1
        self._last_connection_time = time.time()

        # A new connection cancels any pending TTL shutdown
        if self._shutdown_timer and not self._shutdown_timer.done():
            self._shutdown_timer.cancel()
            self._shutdown_timer = None

        try:
            # Check for shutdown signal
            if self.signal_manager.is_shutdown_requested():
                logger.info("Shutdown requested, rejecting connection")
                return

            # Check VM process shutdown state
            if hasattr(self, "vm_process") and self.vm_process._shutdown_requested:
                logger.info("VM shutdown requested, rejecting connection")
                return

            # Ensure VM is ready
            await self._ensure_vm_ready()

            # Connect to VM's SSH port
            vm_reader, vm_writer = await asyncio.open_connection(self.vm_process.ip_address, 22)

            logger.debug(
                f"Proxying connection to VM at {self.vm_process.ip_address}:22 (active connections: {self._active_connections})"
            )

            async def pipe_data(
                src_reader: asyncio.StreamReader, dst_writer: asyncio.StreamWriter
            ) -> None:
                """Pipe data from source to destination."""
                try:
                    while True:
                        data = await src_reader.read(4096)
                        if not data:
                            break
                        dst_writer.write(data)
                        await dst_writer.drain()
                except (asyncio.CancelledError, ConnectionResetError):
                    pass
                finally:
                    try:
                        dst_writer.close()
                        await dst_writer.wait_closed()
                    except Exception:
                        pass

            # Start bidirectional piping
            await asyncio.gather(
                pipe_data(client_reader, vm_writer),
                pipe_data(vm_reader, client_writer),
                return_exceptions=True,
            )

        except Exception as e:
            logger.error(f"Connection proxy error: {e}")
        finally:
            self._active_connections -= 1
            logger.debug(f"Connection closed (active connections: {self._active_connections})")

            try:
                client_writer.close()
                await client_writer.wait_closed()
            except Exception:
                pass

            # In on-demand mode, start shutdown timer after connection ends
            if self.config.on_demand_enabled and self._active_connections == 0:
                self._shutdown_timer = asyncio.create_task(self._schedule_shutdown_check())

    async def _schedule_shutdown_check(self) -> None:
        """Pause/stop the VM after the on-demand TTL expires with no connections.

        Cancelled (re-raising CancelledError) whenever a new connection arrives.
        """
        try:
            ttl_seconds = self.config.on_demand_ttl

            logger.debug(f"Scheduling shutdown check in {ttl_seconds} seconds")
            await asyncio.sleep(ttl_seconds)

            # Check if we should shutdown
            if self._active_connections == 0:
                logger.info("TTL expired with no active connections, shutting down VM")
                # In on-demand mode, try pause before stop.
                # NOTE(review): this timer is only ever armed when on-demand is
                # enabled, so the else-branch below is defensive/unreachable.
                if self.config.on_demand_enabled:
                    was_paused = await self.vm_process.pause_or_stop()
                    if was_paused:
                        logger.info("VM paused")
                    else:
                        logger.info("VM stopped")
                else:
                    await self.stop()
            else:
                logger.debug(
                    f"TTL expired but there are {self._active_connections} active connections, not shutting down"
                )
        except asyncio.CancelledError:
            logger.debug("Shutdown timer cancelled due to new connection")
            raise

    async def start(self) -> None:
        """Start the VM and wait for it to be ready."""
        await self.vm_process.start()

    async def stop(self, timeout: int = 30) -> None:
        """Stop the VM gracefully, cancelling any pending TTL timer."""
        self._shutdown_requested = True

        if self._shutdown_timer and not self._shutdown_timer.done():
            self._shutdown_timer.cancel()
            self._shutdown_timer = None

        await self.vm_process.stop(timeout)

    async def resume(self) -> None:
        """Resume the VM if it was paused."""
        await self.vm_process.resume()

    async def pause(self, timeout: int = 30) -> None:
        """Pause the VM."""
        await self.vm_process.pause(timeout)

    async def run(self) -> None:
        """Main run loop: serve connections until a shutdown is requested."""
        # Check if shutdown was already requested
        if self.signal_manager.is_shutdown_requested():
            logger.info("Shutdown already requested, exiting immediately")
            return

        try:
            self._activation_socket = self.socket_activation.get_activation_socket()

            # Start VM immediately if not on-demand
            if not self.config.on_demand_enabled:
                logger.info("Starting VM")
                await self.start()

            # Start connection handling
            proxy_task = asyncio.create_task(self._handle_activation_connections())

            # Add periodic check for VM shutdown requests
            async def monitor_shutdown_signals():
                while not self.signal_manager.is_shutdown_requested():
                    # Check if VM process has requested shutdown
                    if hasattr(self, "vm_process") and self.vm_process._shutdown_requested:
                        logger.info("VM process requested shutdown")
                        self.signal_manager.request_shutdown()
                        break

                    await asyncio.sleep(1)  # Check every second

            monitor_task = asyncio.create_task(monitor_shutdown_signals())

            # Wait for shutdown signal or tasks to complete
            await asyncio.wait(
                [
                    asyncio.create_task(self.signal_manager.shutdown_event.wait()),
                    proxy_task,
                    monitor_task,
                ],
                return_when=asyncio.FIRST_COMPLETED,
            )

            # Cancel remaining tasks
            for task in [proxy_task, monitor_task]:
                if not task.done():
                    task.cancel()
                    try:
                        await task
                    except asyncio.CancelledError:
                        pass

        except KeyboardInterrupt:
            logger.info("Received keyboard interrupt")
        except Exception as e:
            logger.error(f"VM runner error: {e}")
            raise
        finally:
            await self.stop()

    @property
    def is_running(self) -> bool:
        """Check if VM is running."""
        return self.vm_process.is_running

    @property
    def ip_address(self) -> str | None:
        """Get VM IP address."""
        return self.vm_process.ip_address


async def main() -> None:
    """Main entry point."""
    # Set up logging
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
    )

    try:
        # Load configuration
        config = VMConfig()

        # FIX: VirbyVMRunner.__init__ requires a SignalManager; the previous
        # single-argument call raised TypeError on every startup.
        # NOTE(review): assumes SignalManager has a no-argument constructor —
        # confirm against signal_manager.py.
        signal_manager = SignalManager()

        # Create and run VM
        runner = VirbyVMRunner(config, signal_manager)
        await runner.run()

    except KeyboardInterrupt:
        logger.info("Interrupted by user")
    except Exception as e:
        logger.error(f"Fatal error: {e}")
        sys.exit(1)


if __name__ == "__main__":
    asyncio.run(main())
"""VM process lifecycle management for Virby VM."""

import asyncio
import atexit
import fcntl
import logging
import os
import random
import signal
import tempfile
import time
from pathlib import Path
from typing import Any, Callable

from .api import VfkitAPIClient, VirtualMachineState
from .circuit_breaker import CircuitBreaker
from .config import VMConfig
from .constants import (
    DIFF_DISK_FILE_NAME,
    EFI_VARIABLE_STORE_FILE_NAME,
    SERIAL_LOG_FILE_NAME,
    SSHD_KEYS_SHARED_DIR_NAME,
)
from .exceptions import VMRuntimeError, VMStartupError
from .ip_discovery import IPDiscovery
from .ssh import SSHConnectivityTester

logger = logging.getLogger(__name__)


async def with_timeout(
    coro: Callable[..., Any], timeout: float, operation_name: str, *args, **kwargs
) -> Any:
    """Execute coroutine with timeout and proper error handling.

    Args:
        coro: Coroutine function to invoke with *args/**kwargs.
        timeout: Maximum seconds to wait.
        operation_name: Human-readable name used in the timeout error message.

    Raises:
        VMRuntimeError: If the operation does not complete within `timeout`.
    """
    try:
        return await asyncio.wait_for(coro(*args, **kwargs), timeout=timeout)
    except asyncio.TimeoutError:
        raise VMRuntimeError(f"{operation_name} timed out after {timeout} seconds")


class VMProcessState:
    """VM process state enumeration (plain string constants)."""

    RUNNING = "running"
    STOPPED = "stopped"
    PAUSED = "paused"
    UNKNOWN = "unknown"


def _read_orphan_pid(pid_file: Path) -> int | None:
    """Read and validate a PID from `pid_file` under a shared non-blocking lock.

    Returns the PID, or None when the file is empty or holds an invalid value
    (the stale file is removed in that case). Propagates FileNotFoundError when
    the file is absent and BlockingIOError when the lock is held by a live
    owner — callers treat both as "nothing to clean up".
    """
    with open(pid_file, "r") as f:
        fcntl.flock(f.fileno(), fcntl.LOCK_SH | fcntl.LOCK_NB)
        pid_str = f.read().strip()

    if not pid_str:
        pid_file.unlink(missing_ok=True)
        return None

    try:
        pid = int(pid_str)
    except ValueError:
        logger.warning(f"Invalid PID in file {pid_file}: {pid_str}")
        pid_file.unlink(missing_ok=True)
        return None

    if pid <= 0:
        logger.warning(f"Invalid PID value: {pid}")
        pid_file.unlink(missing_ok=True)
        return None

    return pid


async def cleanup_orphaned_vfkit_processes(working_dir: Path) -> None:
    """Async cleanup of orphaned vfkit processes using PID files.

    This function can be called during startup to clean up any processes
    that were orphaned due to unclean shutdowns.
    """
    pid_file = working_dir / "vfkit.pid"

    try:
        pid = _read_orphan_pid(pid_file)
        if pid is None:
            return

        # Quick check if process exists
        try:
            os.kill(pid, 0)  # Process exists
            logger.info(f"Found orphaned vfkit process with PID {pid}")

            # Kill gracefully then forcefully if needed
            os.kill(pid, signal.SIGTERM)
            await asyncio.sleep(0.5)  # Non-blocking sleep

            try:
                os.kill(pid, 0)  # Still exists?
                os.kill(pid, signal.SIGKILL)
                logger.info(f"Force killed orphaned vfkit process {pid}")
            except ProcessLookupError:
                logger.info(f"Orphaned vfkit process {pid} terminated gracefully")

        except ProcessLookupError:
            # Process doesn't exist, just clean up the PID file
            logger.debug(f"Orphaned vfkit process {pid} no longer exists")

        pid_file.unlink(missing_ok=True)
        logger.info("Cleaned up orphaned vfkit process")

    except (FileNotFoundError, BlockingIOError):
        # File doesn't exist or is locked by an active process
        return
    except Exception as e:
        logger.error(f"Error cleaning up orphaned vfkit process: {e}")


def cleanup_orphaned_vfkit_processes_sync(working_dir: Path) -> None:
    """Synchronous wrapper for atexit compatibility.

    NOTE(review): when a loop is running, the scheduled task is not awaited,
    so cleanup is best-effort in that case.
    """
    try:
        # Try to get current event loop
        loop = asyncio.get_running_loop()
        # If we're in an async context, schedule async
        loop.create_task(cleanup_orphaned_vfkit_processes(working_dir))
    except RuntimeError:
        # No event loop, use synchronous
        _cleanup_orphaned_vfkit_processes_sync(working_dir)


def _cleanup_orphaned_vfkit_processes_sync(working_dir: Path) -> None:
    """Synchronous implementation for atexit.

    Mirrors cleanup_orphaned_vfkit_processes() but uses a blocking sleep;
    the PID parsing/validation is shared via _read_orphan_pid().
    """
    pid_file = working_dir / "vfkit.pid"

    try:
        pid = _read_orphan_pid(pid_file)
        if pid is None:
            return

        # Quick check if process exists
        try:
            os.kill(pid, 0)  # Process exists
            logger.info(f"Found orphaned vfkit process with PID {pid}")

            # Kill gracefully then forcefully if needed
            os.kill(pid, signal.SIGTERM)
            time.sleep(0.5)  # Synchronous sleep for atexit

            try:
                os.kill(pid, 0)  # Still exists?
                os.kill(pid, signal.SIGKILL)
                logger.info(f"Force killed orphaned vfkit process {pid}")
            except ProcessLookupError:
                logger.info(f"Orphaned vfkit process {pid} terminated gracefully")

        except ProcessLookupError:
            # Process doesn't exist, just clean up the PID file
            logger.debug(f"Orphaned vfkit process {pid} no longer exists")

        pid_file.unlink(missing_ok=True)
        logger.info("Cleaned up orphaned vfkit process")

    except (FileNotFoundError, BlockingIOError):
        # File doesn't exist or is locked by active process
        return
    except Exception as e:
        logger.error(f"Error cleaning up orphaned vfkit process: {e}")
is_running_check=self.is_running, 210 | ) 211 | 212 | # Initialize circuit breaker for API operations 213 | self._api_circuit_breaker = CircuitBreaker(failure_threshold=3, timeout=10.0) 214 | 215 | # Setup cleanup handler 216 | atexit.register(self._cleanup_on_exit) 217 | 218 | def _generate_mac_address(self) -> str: 219 | """Generate a random MAC address for VM usage.""" 220 | prefix = "02:94" # Locally administered, unicast 221 | suffix = ":".join(f"{random.randint(0, 255):02x}" for _ in range(4)) 222 | return f"{prefix}:{suffix}" 223 | 224 | def build_vfkit_command(self) -> list[str]: 225 | """Build vfkit command from configuration.""" 226 | diff_disk = self.working_dir / DIFF_DISK_FILE_NAME 227 | efi_store = self.working_dir / EFI_VARIABLE_STORE_FILE_NAME 228 | sshd_keys = self.working_dir / SSHD_KEYS_SHARED_DIR_NAME 229 | 230 | cmd = [ 231 | "vfkit", 232 | "--cpus", 233 | str(self.config.cores), 234 | "--memory", 235 | str(self.config.memory), 236 | "--bootloader", 237 | f"efi,variable-store={efi_store},create", 238 | "--device", 239 | f"virtio-blk,path={diff_disk}", 240 | "--device", 241 | f"virtio-fs,sharedDir={sshd_keys},mountTag=sshd-keys", 242 | "--device", 243 | f"virtio-net,nat,mac={self.mac_address}", 244 | "--restful-uri", 245 | f"tcp://localhost:{self._vfkit_api_port}", 246 | "--device", 247 | "virtio-rng", 248 | "--device", 249 | "virtio-balloon", 250 | ] 251 | 252 | if self.config.debug_enabled: 253 | serial_log = self.working_dir / SERIAL_LOG_FILE_NAME 254 | cmd.extend(["--device", f"virtio-serial,logFilePath={serial_log}"]) 255 | 256 | if self.config.rosetta_enabled: 257 | cmd.extend(["--device", "rosetta,mountTag=rosetta"]) 258 | 259 | # Add shared directories, if any 260 | shared_dirs = self.config.shared_dirs 261 | for tag, path in shared_dirs.items(): 262 | cmd.extend(["--device", f"virtio-fs,sharedDir={path},mountTag={tag}"]) 263 | 264 | return cmd 265 | 266 | async def _get_state_info(self, max_retries: int = 3) -> dict | None: 267 | """Get 
VM state via vfkit API with retry logic for transient failures.""" 268 | for attempt in range(max_retries): 269 | try: 270 | return await self.api_client.get("/vm/state") 271 | except VMRuntimeError as e: 272 | if attempt < max_retries - 1: 273 | # Exponential backoff with jitter 274 | delay = (0.1 * (2**attempt)) + random.uniform(0, 0.05) 275 | logger.debug(f"VM state query failed, retrying in {delay:.2f}s: {e}") 276 | await asyncio.sleep(delay) 277 | else: 278 | logger.error(f"VM state query failed after {max_retries} attempts: {e}") 279 | 280 | return None 281 | 282 | async def _get_state_info_with_breaker(self) -> dict | None: 283 | """Get VM state with circuit breaker protection.""" 284 | try: 285 | return await self._api_circuit_breaker.call(self._get_state_info_raw) 286 | except VMRuntimeError: 287 | logger.warning("Circuit breaker prevented VM state query") 288 | return None 289 | 290 | async def _get_state_info_raw(self) -> dict | None: 291 | """Raw VM state query with no retry logic.""" 292 | return await self.api_client.get("/vm/state") 293 | 294 | async def get_current_state(self) -> str: 295 | """Get current VM process state with validation and recovery. 
296 | 297 | Returns: 298 | str: One of VMProcessState constants 299 | """ 300 | if not self.is_running: 301 | return VMProcessState.STOPPED 302 | 303 | # Try to get state with circuit breaker 304 | state_info = await self._get_state_info_with_breaker() 305 | 306 | if not state_info: 307 | # If API is unavailable, check process status 308 | if self.vm_process and self.vm_process.returncode is None: 309 | logger.warning("VM API unavailable but process running") 310 | return VMProcessState.UNKNOWN 311 | else: 312 | return VMProcessState.STOPPED 313 | 314 | if "state" not in state_info: 315 | logger.warning("Invalid VM state response") 316 | return VMProcessState.UNKNOWN 317 | 318 | vm_state = state_info.get("state") 319 | if vm_state == VirtualMachineState.RUNNING: 320 | return VMProcessState.RUNNING 321 | elif vm_state == VirtualMachineState.PAUSED: 322 | return VMProcessState.PAUSED 323 | elif vm_state == VirtualMachineState.STOPPED: 324 | return VMProcessState.STOPPED 325 | else: 326 | logger.debug(f"Unhandled VM state returned: {vm_state}") 327 | return VMProcessState.UNKNOWN 328 | 329 | async def can_pause(self) -> bool: 330 | """Check if VM can be paused.""" 331 | if not self.is_running: 332 | return False 333 | vm_state = await self._get_state_info() 334 | return vm_state.get("canPause", False) if vm_state else False 335 | 336 | async def can_resume(self) -> bool: 337 | """Check if VM can be resumed.""" 338 | if not self.is_running: 339 | return False 340 | vm_state = await self._get_state_info() 341 | return vm_state.get("canResume", False) if vm_state else False 342 | 343 | async def _start_vm_process(self) -> None: 344 | """Start the VM process.""" 345 | if self.vm_process is not None: 346 | raise VMStartupError("VM process is already running") 347 | 348 | cmd = self.build_vfkit_command() 349 | logger.info(f"Starting VM with command: {' '.join(cmd)}") 350 | 351 | try: 352 | kwargs: dict = {"cwd": self.working_dir} 353 | 354 | if self.config.debug_enabled: 
355 | kwargs.update( 356 | { 357 | "stdout": asyncio.subprocess.PIPE, 358 | "stderr": asyncio.subprocess.PIPE, 359 | } 360 | ) 361 | else: 362 | kwargs.update( 363 | { 364 | "stdout": asyncio.subprocess.DEVNULL, 365 | "stderr": asyncio.subprocess.DEVNULL, 366 | } 367 | ) 368 | 369 | self.vm_process = await asyncio.create_subprocess_exec(*cmd, **kwargs) 370 | logger.info(f"VM started with PID {self.vm_process.pid}") 371 | 372 | # Write PID file for external cleanup 373 | self._write_pid_file(self.vm_process.pid) 374 | 375 | # Start background task to consume output if debug is enabled 376 | if self.config.debug_enabled and self.vm_process.stdout and self.vm_process.stderr: 377 | self._output_task = asyncio.create_task(self._consume_vm_process_output()) 378 | 379 | except Exception as e: 380 | raise VMStartupError(f"Failed to start VM process: {e}") 381 | 382 | async def _discover_ip_address(self) -> str: 383 | """Discover the VM's IP address via DHCP.""" 384 | logger.info("Discovering VM IP address...") 385 | 386 | timeout = self.config.ip_discovery_timeout 387 | start_time = asyncio.get_event_loop().time() 388 | interval = 0.1 # Start with 100ms 389 | max_interval = 2.0 # Cap at 2s 390 | 391 | while (asyncio.get_event_loop().time() - start_time) < timeout: 392 | # Check for shutdown signals 393 | await self._check_shutdown_signals() 394 | 395 | if self._shutdown_requested: 396 | raise VMRuntimeError("Shutdown requested during IP discovery") 397 | 398 | if self.vm_process and self.vm_process.returncode is not None: 399 | raise VMRuntimeError("VM process died during IP discovery") 400 | 401 | ip = await self.ip_discovery.discover_ip() 402 | if ip: 403 | logger.info(f"Discovered VM IP: {ip}") 404 | self._ip_address = ip 405 | return ip 406 | 407 | await asyncio.sleep(interval) 408 | # Exponential backoff: 100ms -> 200ms -> 400ms -> 800ms -> 1.6s -> 2s 409 | interval = min(interval * 2, max_interval) 410 | 411 | raise VMRuntimeError(f"Failed to discover VM IP within 
{timeout} seconds") 412 | 413 | async def _consume_vm_process_output(self) -> None: 414 | """Consume VM process (vfkit) stdout/stderr to prevent buffer overflow.""" 415 | if not self.vm_process or not self.vm_process.stdout or not self.vm_process.stderr: 416 | return 417 | 418 | async def read_stream(stream, name): 419 | try: 420 | while True: 421 | line = await stream.readline() 422 | if not line: 423 | break 424 | logger.debug(f"VM {name}: {line.decode().rstrip()}") 425 | except Exception as e: 426 | logger.debug(f"Error reading VM {name}: {e}") 427 | 428 | try: 429 | await asyncio.gather( 430 | read_stream(self.vm_process.stdout, "stdout"), 431 | read_stream(self.vm_process.stderr, "stderr"), 432 | return_exceptions=True, 433 | ) 434 | except Exception as e: 435 | logger.error(f"Error consuming VM output: {e}") 436 | 437 | async def _check_shutdown_signals(self) -> None: 438 | """Check for shutdown signals from environment.""" 439 | if os.environ.get("VIRBY_SHUTDOWN_REQUESTED"): 440 | logger.info("Early shutdown signal detected") 441 | self._shutdown_requested = True 442 | os.environ.pop("VIRBY_SHUTDOWN_REQUESTED", None) 443 | 444 | async def _monitor_vm(self) -> None: 445 | """Monitor VM process for unexpected death.""" 446 | if self.vm_process: 447 | await self.vm_process.wait() 448 | 449 | # Log VM shutdown 450 | if self.vm_process.returncode == 0: 451 | logger.info("VM shut down normally") 452 | elif not self._shutdown_requested: 453 | logger.error(f"VM process died unexpectedly with code {self.vm_process.returncode}") 454 | 455 | # Clean up VM state so it can be restarted 456 | self._cleanup_vm_state() 457 | 458 | def _write_pid_file(self, pid: int) -> None: 459 | """Write process PID to file atomically.""" 460 | try: 461 | # Write to temporary file first 462 | pid_dir = self.pid_file.parent 463 | with tempfile.NamedTemporaryFile( 464 | mode="w", dir=pid_dir, prefix=f"{self.pid_file.name}.tmp.", delete=False 465 | ) as tmp_file: 466 | 
tmp_file.write(str(pid)) 467 | tmp_file.flush() 468 | os.fsync(tmp_file.fileno()) 469 | tmp_path = tmp_file.name 470 | 471 | # Atomic move to final location 472 | os.rename(tmp_path, self.pid_file) 473 | logger.debug(f"Wrote PID {pid} to {self.pid_file}") 474 | 475 | except Exception as e: 476 | logger.error(f"Error writing PID file: {e}") 477 | # Clean up temporary file if it exists 478 | try: 479 | if "tmp_path" in locals(): 480 | os.unlink(tmp_path) 481 | except Exception: 482 | pass 483 | 484 | def _validate_pid_file(self) -> bool: 485 | """Validate PID file format and content.""" 486 | try: 487 | if not self.pid_file.exists(): 488 | return False 489 | 490 | content = self.pid_file.read_text().strip() 491 | if not content: 492 | return False 493 | 494 | pid = int(content) 495 | if pid <= 0: 496 | return False 497 | 498 | # Check if process exists 499 | try: 500 | os.kill(pid, 0) 501 | return True 502 | except ProcessLookupError: 503 | # Process doesn't exist, remove stale PID file 504 | self._cleanup_pid_file() 505 | return False 506 | 507 | except (ValueError, OSError) as e: 508 | logger.warning(f"Invalid PID file {self.pid_file}: {e}") 509 | self._cleanup_pid_file() 510 | return False 511 | 512 | def _cleanup_pid_file(self) -> None: 513 | """Remove PID file.""" 514 | try: 515 | self.pid_file.unlink(missing_ok=True) 516 | logger.debug(f"Removed PID file {self.pid_file}") 517 | except Exception as e: 518 | logger.error(f"Error removing PID file: {e}") 519 | 520 | def _cleanup_on_exit(self) -> None: 521 | """Cleanup handler called by atexit - must be synchronous.""" 522 | logger.debug("atexit cleanup handler called") 523 | try: 524 | self._cleanup_process_sync() 525 | self._cleanup_pid_file() 526 | except Exception as e: 527 | logger.error(f"Error in atexit cleanup: {e}") 528 | 529 | def _cleanup_process_sync(self) -> None: 530 | """Synchronous process cleanup for atexit handler.""" 531 | if self.vm_process and self.vm_process.returncode is None: 532 | try: 533 
| pid = self.vm_process.pid 534 | logger.debug(f"Synchronously terminating VM process {pid}") 535 | 536 | try: 537 | # Try graceful termination first 538 | os.kill(pid, signal.SIGTERM) 539 | time.sleep(2) 540 | 541 | # Force kill if still running 542 | try: 543 | os.kill(pid, signal.SIGKILL) 544 | except ProcessLookupError: 545 | pass # Already dead 546 | 547 | except ProcessLookupError: 548 | pass # Process already dead 549 | 550 | except Exception as e: 551 | logger.error(f"Error in synchronous process cleanup: {e}") 552 | 553 | def _cleanup_vm_state(self) -> None: 554 | """Clean up VM state after shutdown.""" 555 | # Schedule API client cleanup 556 | if self.api_client: 557 | asyncio.create_task(self.api_client.close()) 558 | 559 | # Cancel output task 560 | if self._output_task and not self._output_task.done(): 561 | self._output_task.cancel() 562 | 563 | # Reset state variables 564 | self.vm_process = None 565 | self._ip_address = None 566 | self._output_task = None 567 | 568 | logger.debug("VM state cleaned up") 569 | # Cleanup PID file 570 | self._cleanup_pid_file() 571 | 572 | async def start(self) -> str: 573 | """Start the VM and wait for it to be ready. 
574 | 575 | Returns: 576 | IP address of the started VM 577 | """ 578 | logger.info("Starting Virby VM...") 579 | 580 | try: 581 | # Start VM process 582 | await self._start_vm_process() 583 | 584 | # Start monitoring task 585 | asyncio.create_task(self._monitor_vm()) 586 | 587 | # Pre-create SSH tester while discovering IP 588 | ssh_tester = SSHConnectivityTester(self.working_dir) 589 | 590 | # Discover IP 591 | ip = await self._discover_ip_address() 592 | 593 | logger.info(f"VM IP discovered: {ip}, testing SSH connectivity...") 594 | 595 | # Wait for SSH 596 | if not await self._wait_for_ssh(ip, ssh_tester): 597 | raise VMRuntimeError("SSH did not become ready in time") 598 | 599 | logger.info(f"VM is ready at {ip}") 600 | return ip 601 | 602 | except Exception as e: 603 | logger.error(f"Failed to start VM: {e}") 604 | await self.stop() 605 | raise 606 | 607 | async def _wait_for_ssh(self, ip: str, ssh_tester) -> bool: 608 | """Wait for SSH to become ready.""" 609 | logger.info(f"Waiting for SSH connectivity to {ip}") 610 | 611 | timeout = self.config.ssh_ready_timeout 612 | start_time = asyncio.get_event_loop().time() 613 | check_interval = 0.5 614 | 615 | while (asyncio.get_event_loop().time() - start_time) < timeout: 616 | if await ssh_tester.test_connectivity(ip, timeout=5): 617 | logger.info("SSH is ready") 618 | return True 619 | 620 | await asyncio.sleep(check_interval) 621 | # Gradual backoff 622 | check_interval = min(check_interval * 1.5, 1.0) 623 | 624 | logger.warning(f"SSH not ready within {timeout} seconds") 625 | return False 626 | 627 | async def stop(self, timeout: int = 30) -> None: 628 | """Stop the VM gracefully.""" 629 | logger.info("Stopping VM...") 630 | self._shutdown_requested = True 631 | 632 | # Close API client first 633 | await self.api_client.close() 634 | 635 | # Cancel output task 636 | if self._output_task and not self._output_task.done(): 637 | self._output_task.cancel() 638 | try: 639 | await self._output_task 640 | except 
asyncio.CancelledError: 641 | pass 642 | 643 | if self.vm_process and self.vm_process.returncode is None: 644 | try: 645 | self.vm_process.terminate() 646 | logger.info(f"Sent SIGTERM to VM process {self.vm_process.pid}") 647 | 648 | # Wait for graceful shutdown 649 | try: 650 | await asyncio.wait_for(self.vm_process.wait(), timeout=timeout) 651 | logger.info("VM stopped gracefully") 652 | except asyncio.TimeoutError: 653 | logger.warning("VM did not stop gracefully, killing...") 654 | self.vm_process.kill() 655 | await self.vm_process.wait() 656 | logger.info("VM killed") 657 | except Exception as e: 658 | logger.error(f"Error stopping VM: {e}") 659 | 660 | self._cleanup_vm_state() 661 | 662 | async def pause(self, timeout: int = 30) -> None: 663 | """Pause the VM via the vfkit API with timeout.""" 664 | logger.info("Pausing the VM...") 665 | 666 | if not self.is_running: 667 | raise VMRuntimeError("Cannot pause: VM is not running") 668 | 669 | # Check if VM can be paused 670 | can_pause = await with_timeout(self.can_pause, 5.0, "Can pause check") 671 | 672 | if not can_pause: 673 | raise VMRuntimeError("VM cannot be paused in current state") 674 | 675 | try: 676 | data = {"state": "Pause"} 677 | await with_timeout( 678 | lambda: self.api_client.post("/vm/state", data), 679 | timeout=timeout, 680 | operation_name="VM pause", 681 | ) 682 | logger.info("VM paused successfully") 683 | 684 | except VMRuntimeError: 685 | raise 686 | except Exception as e: 687 | raise VMRuntimeError(f"Error pausing VM: {e}") 688 | 689 | async def resume(self, timeout: int = 30) -> None: 690 | """Resume the VM via the vfkit API with timeout.""" 691 | logger.info("Resuming the VM...") 692 | 693 | if not self.is_running: 694 | raise VMRuntimeError("Cannot resume: VM is not running") 695 | 696 | # Check if VM can be resumed with timeout 697 | can_resume = await with_timeout(self.can_resume, 5.0, "Can resume check") 698 | 699 | if not can_resume: 700 | raise VMRuntimeError("VM cannot be 
resumed in current state") 701 | 702 | try: 703 | data = {"state": "Resume"} 704 | await with_timeout( 705 | lambda: self.api_client.post("/vm/state", data), 706 | timeout=timeout, 707 | operation_name="VM resume", 708 | ) 709 | logger.info("VM resumed successfully") 710 | 711 | except VMRuntimeError: 712 | raise 713 | except Exception as e: 714 | raise VMRuntimeError(f"Error resuming VM: {e}") 715 | 716 | async def pause_or_stop(self, timeout: int = 30) -> bool: 717 | """Attempt to pause VM, fall back to stop. 718 | 719 | Args: 720 | timeout: Total timeout for the operation 721 | 722 | Returns: 723 | bool: True if paused, False if stopped 724 | """ 725 | if not self.is_running: 726 | logger.debug("VM not running, nothing to pause or stop") 727 | return False 728 | 729 | # Try to pause first 730 | try: 731 | pause_timeout = min(timeout // 2, 15) 732 | if await with_timeout(self.can_pause, 1.0, "Can pause check"): 733 | await self.pause(pause_timeout) 734 | return True 735 | else: 736 | logger.debug("VM cannot be paused, stopping instead...") 737 | 738 | except VMRuntimeError as e: 739 | logger.warning(f"Failed to pause VM: {e}, stopping instead...") 740 | 741 | # Fall back to stop 742 | stop_timeout = max(timeout - pause_timeout if "pause_timeout" in locals() else timeout, 10) 743 | await self.stop(stop_timeout) 744 | return False 745 | 746 | async def resume_or_start(self) -> str: 747 | """Attempt to resume VM, fall back to start. 
748 | 749 | Returns: 750 | str: IP address of the VM 751 | """ 752 | current_state = await self.get_current_state() 753 | 754 | # If VM is already running, return IP 755 | if current_state == VMProcessState.RUNNING: 756 | if self._ip_address: 757 | return self._ip_address 758 | else: 759 | logger.debug("VM running but no cached IP found, rediscovering...") 760 | ip = await self._discover_ip_address() 761 | return ip 762 | 763 | # If VM is paused, try to resume 764 | if current_state == VMProcessState.PAUSED: 765 | try: 766 | if await self.can_resume(): 767 | logger.info("Attempting to resume VM...") 768 | await self.resume() 769 | 770 | if self._ip_address: 771 | logger.info(f"Successfully resumed VM (ip: {self._ip_address})") 772 | return self._ip_address 773 | else: 774 | # IP not cached, rediscover 775 | logger.debug("VM resumed but IP not cached, rediscovering...") 776 | ip = await self._discover_ip_address() 777 | return ip 778 | else: 779 | logger.debug("VM cannot be resumed, starting instead...") 780 | except Exception as e: 781 | logger.warning(f"Failed to resume VM: {e}, starting instead...") 782 | # Ensure VM is properly stopped before starting 783 | await self.stop() 784 | 785 | return await self.start() 786 | 787 | @property 788 | def is_running(self) -> bool: 789 | """Check if VM is running.""" 790 | return self.vm_process is not None and self.vm_process.returncode is None 791 | 792 | @property 793 | def ip_address(self) -> str | None: 794 | """Get VM IP address.""" 795 | return self._ip_address 796 | --------------------------------------------------------------------------------