├── screenshot.png ├── dashboard ├── assets │ ├── img │ │ ├── logo.png │ │ └── screenshot.png │ └── css │ │ └── main.css ├── public │ ├── favicon.ico │ ├── screenshot.png │ ├── robots.txt │ ├── browserconfig.xml │ ├── sitemap.xml │ └── site.webmanifest ├── app │ └── app.vue ├── .gitignore ├── tsconfig.json ├── package.json ├── nuxt.config.ts ├── README.md ├── components │ └── AppSidebar.vue └── tailwind.config.js ├── src ├── lib.rs ├── version.rs ├── util.rs ├── config.rs ├── remote.rs ├── hotaisle_client.rs ├── render.rs ├── proc.rs └── process_mgmt.rs ├── mcp ├── src │ ├── lib.rs │ ├── main.rs │ ├── types.rs │ ├── server.rs │ └── resources.rs ├── Cargo.toml └── README.md ├── .gitignore ├── audit.toml ├── Cargo.toml ├── scripts ├── install.ps1 ├── test-hotaisle-integration-simple.sh ├── install.sh ├── setup-gpu-runner.sh └── run-gpu-tests.sh ├── .github └── workflows │ ├── test-hotaisle-integration.yml │ ├── release.yml │ ├── hotaisle-gpu-testing.yml │ ├── gpu-testing.yml │ └── self-hosted-setup.md ├── LICENSE ├── docs ├── CLOUD_GPU_SETUP.md └── HOTAISLE_INTEGRATION.md ├── README.md └── deny.toml /screenshot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/treadiehq/gpu-kill/HEAD/screenshot.png -------------------------------------------------------------------------------- /dashboard/assets/img/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/treadiehq/gpu-kill/HEAD/dashboard/assets/img/logo.png -------------------------------------------------------------------------------- /dashboard/public/favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/treadiehq/gpu-kill/HEAD/dashboard/public/favicon.ico -------------------------------------------------------------------------------- /dashboard/public/screenshot.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/treadiehq/gpu-kill/HEAD/dashboard/public/screenshot.png -------------------------------------------------------------------------------- /dashboard/assets/img/screenshot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/treadiehq/gpu-kill/HEAD/dashboard/assets/img/screenshot.png -------------------------------------------------------------------------------- /dashboard/app/app.vue: -------------------------------------------------------------------------------- 1 | 7 | -------------------------------------------------------------------------------- /dashboard/.gitignore: -------------------------------------------------------------------------------- 1 | # Nuxt dev/build outputs 2 | .output 3 | .data 4 | .nuxt 5 | .nitro 6 | .cache 7 | dist 8 | 9 | # Node dependencies 10 | node_modules 11 | 12 | # Logs 13 | logs 14 | *.log 15 | 16 | # Misc 17 | .DS_Store 18 | .fleet 19 | .idea 20 | 21 | # Local env files 22 | .env 23 | .env.* 24 | !.env.example 25 | -------------------------------------------------------------------------------- /dashboard/public/robots.txt: -------------------------------------------------------------------------------- 1 | User-agent: * 2 | Allow: / 3 | 4 | # Sitemap 5 | Sitemap: https://gpukill.com/sitemap.xml 6 | 7 | # Crawl-delay for respectful crawling 8 | Crawl-delay: 1 9 | 10 | # Disallow admin or sensitive areas (if any) 11 | # Disallow: /admin/ 12 | # Disallow: /api/ 13 | 14 | # Allow all other content 15 | Allow: /dashboard/ 16 | Allow: /docs/ 17 | Allow: /assets/ -------------------------------------------------------------------------------- /src/lib.rs: -------------------------------------------------------------------------------- 1 | pub mod args; 2 | pub mod audit; 3 | pub mod config; 4 | pub mod coordinator; 5 | pub mod guard_mode; 6 | pub mod 
nvml_api; 7 | pub mod proc; 8 | pub mod process_mgmt; 9 | pub mod remote; 10 | pub mod render; 11 | pub mod rogue_config; 12 | pub mod rogue_detection; 13 | pub mod util; 14 | pub mod vendor; 15 | pub mod version; 16 | 17 | #[cfg(feature = "hotaisle")] 18 | pub mod hotaisle_client; 19 | -------------------------------------------------------------------------------- /dashboard/tsconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | // https://nuxt.com/docs/guide/concepts/typescript 3 | "files": [], 4 | "references": [ 5 | { 6 | "path": "./.nuxt/tsconfig.app.json" 7 | }, 8 | { 9 | "path": "./.nuxt/tsconfig.server.json" 10 | }, 11 | { 12 | "path": "./.nuxt/tsconfig.shared.json" 13 | }, 14 | { 15 | "path": "./.nuxt/tsconfig.node.json" 16 | } 17 | ] 18 | } 19 | -------------------------------------------------------------------------------- /dashboard/public/browserconfig.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | #1e40af 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /mcp/src/lib.rs: -------------------------------------------------------------------------------- 1 | //! GPU Kill MCP Server 2 | //! 3 | //! This module provides a Model Context Protocol (MCP) server for GPU Kill, 4 | //! enabling AI assistants and other tools to interact with GPU management 5 | //! functionality through a standardized interface. 
6 | 7 | pub mod resources; 8 | pub mod server; 9 | pub mod tools; 10 | pub mod types; 11 | 12 | pub use server::GpuKillMCPServer; 13 | pub use types::*; 14 | 15 | /// MCP Server version 16 | pub const MCP_VERSION: &str = "2024-11-05"; 17 | 18 | /// GPU Kill MCP Server capabilities 19 | pub const CAPABILITIES: &[&str] = &["resources", "tools", "logging"]; 20 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Rust 2 | /target/ 3 | **/*.rs.bk 4 | Cargo.lock 5 | 6 | # IDE 7 | .vscode/ 8 | .idea/ 9 | *.swp 10 | *.swo 11 | *~ 12 | 13 | # OS 14 | .DS_Store 15 | .DS_Store? 16 | ._* 17 | .Spotlight-V100 18 | .Trashes 19 | ehthumbs.db 20 | Thumbs.db 21 | 22 | # Logs 23 | *.log 24 | 25 | # Temporary files 26 | *.tmp 27 | *.temp 28 | 29 | # Build artifacts 30 | dist/ 31 | *.tar.gz 32 | *.zip 33 | 34 | # Configuration files (optional) 35 | config.toml 36 | .env 37 | 38 | # Test artifacts 39 | test_output/ 40 | coverage/ 41 | 42 | # Documentation build 43 | book/ 44 | .DS_Store 45 | 46 | # Dashboard (separate project) 47 | REMOVED.md -------------------------------------------------------------------------------- /dashboard/public/sitemap.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | https://gpukill.com/ 5 | 2024-01-20 6 | daily 7 | 1.0 8 | 9 | 10 | https://gpukill.com/dashboard/ 11 | 2024-01-20 12 | daily 13 | 0.9 14 | 15 | 16 | https://gpukill.com/docs/ 17 | 2024-01-20 18 | weekly 19 | 0.8 20 | 21 | 22 | -------------------------------------------------------------------------------- /audit.toml: -------------------------------------------------------------------------------- 1 | # Cargo audit configuration 2 | [advisories] 3 | # Allow unmaintained crates (warnings only, not errors) 4 | unmaintained = "warn" 5 | 6 | # License configuration 7 | [licenses] 8 | # Allow common open source 
licenses 9 | allow = [ 10 | "MIT", 11 | "Apache-2.0", 12 | "Apache-2.0 OR MIT", 13 | "BSD-2-Clause", 14 | "BSD-3-Clause", 15 | "ISC", 16 | "Unlicense", 17 | "0BSD", 18 | "Zlib", 19 | "CC0-1.0", 20 | "MPL-2.0", 21 | "LGPL-2.1", 22 | "LGPL-3.0", 23 | "GPL-2.0", 24 | "GPL-3.0", 25 | ] 26 | 27 | # Deny proprietary licenses 28 | deny = [ 29 | "proprietary", 30 | "commercial", 31 | ] 32 | 33 | # Allow unknown licenses (for crates without explicit license info) 34 | unknown = "warn" 35 | -------------------------------------------------------------------------------- /dashboard/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "gpukill-dashboard", 3 | "private": true, 4 | "type": "module", 5 | "scripts": { 6 | "build": "nuxt build", 7 | "dev": "nuxt dev --port 3000", 8 | "generate": "nuxt generate", 9 | "preview": "nuxt preview", 10 | "postinstall": "nuxt prepare", 11 | "start": "nuxt dev --port 3000" 12 | }, 13 | "dependencies": { 14 | "@headlessui/vue": "^1.7.23", 15 | "@heroicons/vue": "^2.2.0", 16 | "@nuxtjs/tailwindcss": "^6.14.0", 17 | "@tailwindcss/aspect-ratio": "^0.4.2", 18 | "@tailwindcss/forms": "^0.5.10", 19 | "@tailwindcss/typography": "^0.5.18", 20 | "chart.js": "^4.5.0", 21 | "nuxt": "^3.13.0", 22 | "vue": "^3.5.18", 23 | "vue-chartjs": "^5.3.2", 24 | "vue-router": "^4.5.1" 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /mcp/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "gpukill-mcp" 3 | version = "0.1.0" 4 | edition = "2021" 5 | authors = ["GPU Kill Team"] 6 | description = "MCP server for GPU Kill - AI-accessible GPU management" 7 | license = "MIT" 8 | repository = "https://github.com/treadiehq/gpu-kill" 9 | 10 | [dependencies] 11 | # Core MCP dependencies 12 | tokio = { version = "1.0", features = ["rt", "rt-multi-thread", "net", "fs", "macros"] } 13 | serde = { version 
= "1.0", features = ["derive"] } 14 | serde_json = "1.0" 15 | anyhow = "1.0" 16 | tracing = "0.1" 17 | tracing-subscriber = { version = "0.3", features = ["env-filter"] } 18 | 19 | # GPU Kill integration 20 | gpukill = { path = "../" } 21 | 22 | # HTTP server for MCP protocol 23 | axum = { version = "0.7", features = ["ws", "macros"] } 24 | tower = "0.4" 25 | tower-http = { version = "0.5", features = ["cors", "trace"] } 26 | 27 | # JSON-RPC for MCP protocol 28 | jsonrpc-core = "18.0" 29 | jsonrpc-derive = "18.0" 30 | jsonrpc-ws-server = "18.0" 31 | 32 | # UUID for request IDs 33 | uuid = { version = "1.0", features = ["v4", "serde"] } 34 | 35 | [dev-dependencies] 36 | tempfile = "3.0" 37 | -------------------------------------------------------------------------------- /src/version.rs: -------------------------------------------------------------------------------- 1 | /// Version information for the gpukill CLI tool 2 | pub const VERSION: &str = env!("CARGO_PKG_VERSION"); 3 | 4 | /// Build information 5 | pub const BUILD_DATE: &str = env!("BUILD_DATE"); 6 | pub const BUILD_TARGET: &str = env!("BUILD_TARGET"); 7 | #[allow(dead_code)] 8 | pub const GIT_COMMIT: &str = env!("GIT_COMMIT"); 9 | 10 | /// Get formatted version string 11 | pub fn get_version_string() -> String { 12 | format!("gpukill {} ({} {})", VERSION, BUILD_TARGET, BUILD_DATE) 13 | } 14 | 15 | /// Get detailed version information 16 | #[allow(dead_code)] 17 | pub fn get_detailed_version() -> String { 18 | format!( 19 | "gpukill version {}\n\ 20 | Build target: {}\n\ 21 | Build date: {}\n\ 22 | Git commit: {}", 23 | VERSION, BUILD_TARGET, BUILD_DATE, GIT_COMMIT 24 | ) 25 | } 26 | 27 | #[cfg(test)] 28 | mod tests { 29 | use super::*; 30 | 31 | #[test] 32 | fn test_version_string_format() { 33 | let version = get_version_string(); 34 | assert!(version.contains("gpukill")); 35 | assert!(version.contains(VERSION)); 36 | } 37 | 38 | #[test] 39 | fn test_detailed_version_format() { 40 | let detailed = 
get_detailed_version(); 41 | assert!(detailed.contains("gpukill version")); 42 | assert!(detailed.contains(VERSION)); 43 | assert!(detailed.contains("Build target:")); 44 | assert!(detailed.contains("Build date:")); 45 | assert!(detailed.contains("Git commit:")); 46 | } 47 | } 48 | -------------------------------------------------------------------------------- /dashboard/nuxt.config.ts: -------------------------------------------------------------------------------- 1 | // https://nuxt.com/docs/api/configuration/nuxt-config 2 | export default defineNuxtConfig({ 3 | compatibilityDate: '2024-04-03', 4 | devtools: { enabled: true }, 5 | modules: [ 6 | '@nuxtjs/tailwindcss' 7 | ], 8 | runtimeConfig: { 9 | public: { 10 | apiBase: process.env.API_BASE || 'http://localhost:8080' 11 | } 12 | }, 13 | ssr: true, 14 | app: { 15 | head: { 16 | title: 'GPU Kill - Cluster Management Dashboard', 17 | titleTemplate: '%s', 18 | meta: [ 19 | { charset: 'utf-8' }, 20 | { name: 'viewport', content: 'width=device-width, initial-scale=1' }, 21 | { name: 'format-detection', content: 'telephone=no' }, 22 | { name: 'theme-color', content: '#1e40af' } 23 | ], 24 | link: [ 25 | { rel: 'icon', type: 'image/x-icon', href: '/favicon.ico' }, 26 | { rel: 'preconnect', href: 'https://fonts.googleapis.com' }, 27 | { rel: 'preconnect', href: 'https://fonts.gstatic.com', crossorigin: '' } 28 | ], 29 | style: [ 30 | { 31 | innerHTML: ` 32 | html, body, #__nuxt { 33 | background: #000000 !important; 34 | background-color: #000000 !important; 35 | overscroll-behavior: none !important; 36 | } 37 | * { 38 | overscroll-behavior: none !important; 39 | } 40 | ` 41 | } 42 | ] 43 | } 44 | }, 45 | nitro: { 46 | devProxy: { 47 | '/api': { 48 | target: 'http://localhost:8080/api', 49 | changeOrigin: true 50 | } 51 | } 52 | } 53 | }) 54 | -------------------------------------------------------------------------------- /mcp/src/main.rs: 
-------------------------------------------------------------------------------- 1 | //! GPU Kill MCP Server - Main entry point 2 | 3 | use gpukill_mcp::GpuKillMCPServer; 4 | use std::env; 5 | use tracing::{error, info}; 6 | 7 | #[tokio::main] 8 | async fn main() -> anyhow::Result<()> { 9 | // Initialize logging 10 | tracing_subscriber::fmt() 11 | .with_env_filter(tracing_subscriber::EnvFilter::from_default_env()) 12 | .init(); 13 | 14 | info!("Starting GPU Kill MCP Server"); 15 | 16 | // Get port from environment or use default 17 | let port = env::var("MCP_PORT") 18 | .unwrap_or_else(|_| "3001".to_string()) 19 | .parse::() 20 | .unwrap_or(3001); 21 | 22 | // Create and start the MCP server 23 | let server = GpuKillMCPServer::new().await?; 24 | 25 | info!("GPU Kill MCP Server initialized successfully"); 26 | info!("Available resources:"); 27 | info!(" - gpu://list - Current GPU status and utilization"); 28 | info!(" - gpu://processes - Currently running GPU processes"); 29 | info!(" - gpu://audit - Historical GPU usage data"); 30 | info!(" - gpu://policies - Current Guard Mode policies"); 31 | info!(" - gpu://rogue-detection - Security scan results"); 32 | 33 | info!("Available tools:"); 34 | info!(" - kill_gpu_process - Kill a GPU process by PID"); 35 | info!(" - reset_gpu - Reset a GPU by ID"); 36 | info!(" - scan_rogue_activity - Scan for suspicious GPU activity"); 37 | info!(" - create_user_policy - Create a user policy for Guard Mode"); 38 | info!(" - get_gpu_status - Get detailed status of a specific GPU"); 39 | info!(" - kill_processes_by_name - Kill all processes matching a name pattern"); 40 | 41 | // Start the server 42 | if let Err(e) = server.start(port).await { 43 | error!("Failed to start MCP server: {}", e); 44 | return Err(e); 45 | } 46 | 47 | Ok(()) 48 | } 49 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | 
[workspace] 2 | members = [ 3 | ".", 4 | "mcp", 5 | ] 6 | 7 | [package] 8 | name = "gpukill" 9 | version = "0.1.8" 10 | edition = "2021" 11 | authors = ["Kage "] 12 | description = "A CLI tool for GPU management and monitoring supporting NVIDIA, AMD, Intel, and Apple Silicon GPUs" 13 | license = "FSL-1.1-MIT" 14 | repository = "https://github.com/treadiehq/gpu-kill" 15 | keywords = ["gpu", "nvidia", "amd", "intel", "apple", "metal", "nvml", "rocm", "cli", "monitoring"] 16 | categories = ["command-line-utilities", "development-tools"] 17 | 18 | [[bin]] 19 | name = "gpukill" 20 | path = "src/main.rs" 21 | 22 | [dependencies] 23 | clap = { version = "4.4", features = ["derive", "env"] } 24 | tabled = "0.15" 25 | tracing = "0.1" 26 | tracing-subscriber = { version = "0.3", features = ["env-filter"] } 27 | nvml-wrapper = "0.11" 28 | sysinfo = "0.30" 29 | color-eyre = "0.6" 30 | serde = { version = "1.0", features = ["derive"] } 31 | serde_json = "1.0" 32 | chrono = { version = "0.4", features = ["serde"] } 33 | nix = { version = "0.27", features = ["process", "signal"] } 34 | tokio = { version = "1.0", features = ["rt", "time", "process", "net", "fs"] } 35 | anyhow = "1.0" 36 | hostname = "0.3" 37 | libc = "0.2" 38 | toml = "0.8" 39 | dirs = "5.0" 40 | regex = "1.10" 41 | glob = "0.3" 42 | reqwest = { version = "0.11", features = ["json"] } 43 | 44 | # HTTP server dependencies 45 | axum = { version = "0.7", features = ["ws", "macros"] } 46 | tower = "0.4" 47 | tower-http = { version = "0.5", features = ["cors", "trace"] } 48 | uuid = { version = "1.0", features = ["v4", "serde"] } 49 | futures-util = "0.3" 50 | 51 | # SSH remote support (using system SSH for now) 52 | # ssh2 = "0.9" 53 | # rpassword = "7.3" 54 | 55 | 56 | # Apple Silicon GPU support 57 | [target.'cfg(target_os = "macos")'.dependencies] 58 | core-foundation = "0.9" 59 | core-foundation-sys = "0.8" 60 | io-kit-sys = "0.2" 61 | 62 | [dev-dependencies] 63 | tempfile = "3.0" 64 | mockall = "0.12" 65 | 66 | 
[build-dependencies] 67 | chrono = "0.4" 68 | 69 | [features] 70 | default = [] 71 | mock_nvml = [] 72 | hotaisle = [] 73 | 74 | [profile.release] 75 | # Optimized for faster builds during development 76 | lto = "thin" # Much faster than "true" (fat LTO) 77 | codegen-units = 4 # Allow parallel codegen for faster builds 78 | panic = "abort" 79 | strip = true 80 | 81 | # Fast release profile for development 82 | [profile.release-fast] 83 | inherits = "release" 84 | lto = false 85 | codegen-units = 16 86 | opt-level = 2 # Slightly less optimization for speed 87 | 88 | # Maximum optimization profile for final releases 89 | [profile.release-max] 90 | inherits = "release" 91 | lto = true 92 | codegen-units = 1 93 | opt-level = 3 94 | -------------------------------------------------------------------------------- /dashboard/public/site.webmanifest: -------------------------------------------------------------------------------- 1 | { 2 | "name": "GPU Kill - Cluster Management Dashboard", 3 | "short_name": "GPU Kill", 4 | "description": "Professional GPU cluster management dashboard with real-time monitoring, rogue detection, and policy enforcement.", 5 | "start_url": "/", 6 | "display": "standalone", 7 | "background_color": "#0f172a", 8 | "theme_color": "#1e40af", 9 | "orientation": "portrait-primary", 10 | "scope": "/", 11 | "lang": "en", 12 | "categories": ["developer", "productivity", "utilities"], 13 | "icons": [ 14 | { 15 | "src": "/favicon-16x16.png", 16 | "sizes": "16x16", 17 | "type": "image/png" 18 | }, 19 | { 20 | "src": "/favicon-32x32.png", 21 | "sizes": "32x32", 22 | "type": "image/png" 23 | }, 24 | { 25 | "src": "/apple-touch-icon.png", 26 | "sizes": "180x180", 27 | "type": "image/png" 28 | }, 29 | { 30 | "src": "/android-chrome-192x192.png", 31 | "sizes": "192x192", 32 | "type": "image/png", 33 | "purpose": "any maskable" 34 | }, 35 | { 36 | "src": "/android-chrome-512x512.png", 37 | "sizes": "512x512", 38 | "type": "image/png", 39 | "purpose": "any 
maskable" 40 | } 41 | ], 42 | "screenshots": [ 43 | { 44 | "src": "/screenshot-desktop.png", 45 | "sizes": "1280x720", 46 | "type": "image/png", 47 | "form_factor": "wide", 48 | "label": "GPU Kill Dashboard - Desktop View" 49 | }, 50 | { 51 | "src": "/screenshot-mobile.png", 52 | "sizes": "390x844", 53 | "type": "image/png", 54 | "form_factor": "narrow", 55 | "label": "GPU Kill Dashboard - Mobile View" 56 | } 57 | ], 58 | "shortcuts": [ 59 | { 60 | "name": "Cluster Overview", 61 | "short_name": "Overview", 62 | "description": "View cluster overview and statistics", 63 | "url": "/#cluster-overview", 64 | "icons": [ 65 | { 66 | "src": "/shortcut-overview.png", 67 | "sizes": "96x96" 68 | } 69 | ] 70 | }, 71 | { 72 | "name": "Rogue Detection", 73 | "short_name": "Rogue", 74 | "description": "Scan for suspicious GPU activities", 75 | "url": "/#rogue-detection", 76 | "icons": [ 77 | { 78 | "src": "/shortcut-rogue.png", 79 | "sizes": "96x96" 80 | } 81 | ] 82 | }, 83 | { 84 | "name": "Guard Mode", 85 | "short_name": "Guard", 86 | "description": "Manage policy enforcement", 87 | "url": "/#guard-mode", 88 | "icons": [ 89 | { 90 | "src": "/shortcut-guard.png", 91 | "sizes": "96x96" 92 | } 93 | ] 94 | } 95 | ] 96 | } 97 | -------------------------------------------------------------------------------- /scripts/install.ps1: -------------------------------------------------------------------------------- 1 | $ErrorActionPreference = "Stop" 2 | 3 | # gpukill Windows installer: prefer winget, fallback to zip from GitHub Releases 4 | 5 | param( 6 | [string]$Version = "", 7 | [string]$BinDir = "$env:LOCALAPPDATA\Programs\gpukill", 8 | [switch]$Yes, 9 | [switch]$Insecure 10 | ) 11 | 12 | function Get-Arch { 13 | if ([System.Environment]::Is64BitOperatingSystem) { return "x86_64" } else { return "x86" } 14 | } 15 | 16 | # Try winget first 17 | try { 18 | if (Get-Command winget -ErrorAction SilentlyContinue) { 19 | winget install --id TreadieHQ.GPUKill --silent 
--accept-package-agreements --accept-source-agreements 20 | if ($LASTEXITCODE -eq 0) { Write-Host "✅ Installed via winget"; exit 0 } 21 | } 22 | } catch {} 23 | 24 | # Fallback to GitHub Releases 25 | $Owner = "treadiehq" 26 | $Repo = "gpu-kill" 27 | if ($Version -ne "") { 28 | $ApiUrl = "https://api.github.com/repos/$Owner/$Repo/releases/tags/$Version" 29 | } else { 30 | $ApiUrl = "https://api.github.com/repos/$Owner/$Repo/releases/latest" 31 | } 32 | 33 | Write-Host "Resolving release…" 34 | $resp = Invoke-RestMethod -Uri $ApiUrl -UseBasicParsing 35 | $Tag = $resp.tag_name 36 | if (-not $Tag) { throw "Failed to resolve release tag" } 37 | 38 | $arch = Get-Arch 39 | $assetName = "gpukill-$Tag-windows-$arch.zip" 40 | $asset = $resp.assets | Where-Object { $_.name -eq $assetName } 41 | if (-not $asset) { throw "No asset named $assetName in release $Tag" } 42 | 43 | $tmp = New-Item -ItemType Directory -Path ([System.IO.Path]::GetTempPath() + [System.Guid]::NewGuid()) 44 | $zipPath = Join-Path $tmp $assetName 45 | $sumsAsset = $resp.assets | Where-Object { $_.name -eq 'SHA256SUMS' } 46 | $sumsPath = Join-Path $tmp 'SHA256SUMS' 47 | 48 | Write-Host "Downloading $assetName…" 49 | Invoke-WebRequest -Uri $asset.browser_download_url -OutFile $zipPath -UseBasicParsing 50 | if ($sumsAsset) { 51 | Invoke-WebRequest -Uri $sumsAsset.browser_download_url -OutFile $sumsPath -UseBasicParsing 52 | } 53 | 54 | if (Test-Path $sumsPath) { 55 | $hash = (Get-FileHash -Algorithm SHA256 $zipPath).Hash.ToLower() 56 | $sums = Get-Content $sumsPath 57 | if (-not ($sums -match $hash)) { 58 | if (-not $Insecure) { throw "Checksum verification failed" } 59 | Write-Warning "Checksum verification skipped (--Insecure)" 60 | } 61 | } 62 | 63 | Write-Host "Extracting…" 64 | Expand-Archive -Path $zipPath -DestinationPath $tmp -Force 65 | 66 | New-Item -ItemType Directory -Force -Path $BinDir | Out-Null 67 | Copy-Item -Path (Join-Path $tmp 'gpukill.exe') -Destination (Join-Path $BinDir 'gpukill.exe') 
-Force 68 | 69 | # Add to PATH for current session 70 | $env:PATH = "$BinDir;$env:PATH" 71 | Write-Host "✅ Installed to $BinDir" 72 | & (Join-Path $BinDir 'gpukill.exe') --version 73 | 74 | -------------------------------------------------------------------------------- /.github/workflows/test-hotaisle-integration.yml: -------------------------------------------------------------------------------- 1 | name: Test Hot Aisle Integration 2 | 3 | on: 4 | push: 5 | branches: [main, develop] 6 | paths: 7 | - 'src/hotaisle_client.rs' 8 | - 'scripts/run-gpu-tests.sh' 9 | - 'scripts/test-hotaisle-integration-simple.sh' 10 | - '.github/workflows/test-hotaisle-integration.yml' 11 | - '.github/workflows/hotaisle-gpu-testing.yml' 12 | - 'docs/HOTAISLE_INTEGRATION.md' 13 | pull_request: 14 | branches: [main] 15 | paths: 16 | - 'src/hotaisle_client.rs' 17 | - 'scripts/run-gpu-tests.sh' 18 | - 'scripts/test-hotaisle-integration-simple.sh' 19 | - '.github/workflows/test-hotaisle-integration.yml' 20 | - '.github/workflows/hotaisle-gpu-testing.yml' 21 | - 'docs/HOTAISLE_INTEGRATION.md' 22 | workflow_dispatch: 23 | 24 | permissions: 25 | contents: read 26 | 27 | env: 28 | RUST_BACKTRACE: 1 29 | RUST_LOG: info 30 | 31 | jobs: 32 | test-integration: 33 | name: Test Hot Aisle Integration 34 | runs-on: ubuntu-latest 35 | timeout-minutes: 15 36 | 37 | steps: 38 | - name: Checkout code 39 | uses: actions/checkout@v4 40 | 41 | - name: Install Rust 42 | uses: dtolnay/rust-toolchain@stable 43 | with: 44 | components: rustfmt, clippy 45 | 46 | - name: Install system dependencies 47 | run: | 48 | sudo apt-get update 49 | sudo apt-get install -y build-essential libssl-dev pkg-config curl jq 50 | 51 | - name: Make test script executable 52 | run: chmod +x scripts/test-hotaisle-integration-simple.sh 53 | 54 | - name: Run Hot Aisle Integration Tests 55 | run: | 56 | echo "Running comprehensive Hot Aisle integration tests..." 
57 | ./scripts/test-hotaisle-integration-simple.sh 58 | 59 | - name: Validate YAML Syntax 60 | run: | 61 | echo "Validating GitHub Actions workflow syntax..." 62 | python3 -c " 63 | import yaml 64 | import sys 65 | try: 66 | with open('.github/workflows/hotaisle-gpu-testing.yml', 'r') as f: 67 | yaml.safe_load(f) 68 | print('✅ YAML syntax is valid') 69 | except yaml.YAMLError as e: 70 | print(f'❌ YAML syntax error: {e}') 71 | sys.exit(1) 72 | except Exception as e: 73 | print(f'❌ Error reading YAML file: {e}') 74 | sys.exit(1) 75 | " || echo "⚠️ Python YAML validation skipped (module not available)" 76 | 77 | - name: Integration Test Summary 78 | run: | 79 | echo "========================================" 80 | echo "🎉 Hot Aisle Integration Test Summary" 81 | echo "========================================" 82 | echo "✅ All integration tests passed!" 83 | echo "✅ Hot Aisle integration is ready for use!" 84 | echo "" 85 | echo "To use Hot Aisle GPU testing:" 86 | echo "1. Set up HOTAISLE_API_KEY in GitHub Secrets" 87 | echo "2. Manually trigger the 'Hot Aisle GPU Testing' workflow" 88 | echo "3. 
Monitor results in the Actions tab" 89 | -------------------------------------------------------------------------------- /.github/workflows/release.yml: -------------------------------------------------------------------------------- 1 | name: Release 2 | 3 | on: 4 | push: 5 | tags: 6 | - 'v*' 7 | workflow_dispatch: 8 | inputs: 9 | tag: 10 | description: 'Tag to release (e.g., v0.1.1)' 11 | required: true 12 | type: string 13 | 14 | permissions: 15 | contents: write 16 | 17 | env: 18 | TAG: ${{ github.ref_type == 'tag' && github.ref_name || inputs.tag }} 19 | 20 | jobs: 21 | build-linux-x86_64: 22 | runs-on: ubuntu-latest 23 | steps: 24 | - uses: actions/checkout@v4 25 | - uses: dtolnay/rust-toolchain@stable 26 | with: 27 | targets: x86_64-unknown-linux-gnu 28 | - name: Install system dependencies (NVML) 29 | run: | 30 | sudo apt-get update 31 | sudo apt-get install -y libnvidia-ml-dev pkg-config 32 | - name: Build 33 | run: cargo build --release --target x86_64-unknown-linux-gnu 34 | - name: Prepare artifact 35 | run: | 36 | mkdir -p dist 37 | cp target/x86_64-unknown-linux-gnu/release/gpukill dist/gpukill-${{ env.TAG }}-linux-x86_64 38 | - uses: actions/upload-artifact@v4 39 | with: 40 | name: linux-x86_64 41 | path: dist/gpukill-${{ env.TAG }}-linux-x86_64 42 | 43 | build-macos-arm64: 44 | runs-on: macos-14 45 | steps: 46 | - uses: actions/checkout@v4 47 | - uses: dtolnay/rust-toolchain@stable 48 | - name: Build 49 | run: cargo build --release 50 | - name: Prepare artifact 51 | run: | 52 | mkdir -p dist 53 | cp target/release/gpukill dist/gpukill-${{ env.TAG }}-macos-aarch64 54 | - uses: actions/upload-artifact@v4 55 | with: 56 | name: macos-aarch64 57 | path: dist/gpukill-${{ env.TAG }}-macos-aarch64 58 | 59 | build-windows-x86_64: 60 | runs-on: windows-latest 61 | steps: 62 | - uses: actions/checkout@v4 63 | - uses: dtolnay/rust-toolchain@stable 64 | - name: Build 65 | run: cargo build --release 66 | - name: Prepare zip 67 | shell: pwsh 68 | run: | 69 | 
New-Item -ItemType Directory -Force -Path dist | Out-Null 70 | Copy-Item target/release/gpukill.exe dist/gpukill.exe 71 | Compress-Archive -Path dist/gpukill.exe -DestinationPath dist/gpukill-${{ env.TAG }}-windows-x86_64.zip -Force 72 | - uses: actions/upload-artifact@v4 73 | with: 74 | name: windows-x86_64 75 | path: dist/gpukill-${{ env.TAG }}-windows-x86_64.zip 76 | 77 | release: 78 | runs-on: ubuntu-latest 79 | needs: [build-linux-x86_64, build-macos-arm64, build-windows-x86_64] 80 | steps: 81 | - uses: actions/checkout@v4 82 | - name: Download artifacts 83 | uses: actions/download-artifact@v4 84 | with: 85 | path: dist 86 | - name: Flatten artifacts and compute checksums 87 | run: | 88 | mkdir -p upload 89 | find dist -type f -maxdepth 2 -exec cp {} upload/ \; 90 | (cd upload && sha256sum * > SHA256SUMS) || (cd upload && shasum -a 256 * > SHA256SUMS) 91 | - name: Create GitHub Release 92 | uses: softprops/action-gh-release@v2 93 | with: 94 | tag_name: ${{ env.TAG }} 95 | files: | 96 | upload/* 97 | env: 98 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 99 | -------------------------------------------------------------------------------- /.github/workflows/hotaisle-gpu-testing.yml: -------------------------------------------------------------------------------- 1 | name: Hot Aisle GPU Testing 2 | 3 | # This workflow only runs when manually triggered and when API key is configured 4 | on: 5 | workflow_dispatch: 6 | inputs: 7 | gpu_types: 8 | description: 'Comma-separated GPU types to test (nvidia,amd,intel,apple-silicon)' 9 | required: false 10 | default: 'nvidia,amd,intel' 11 | test_duration: 12 | description: 'Test duration in minutes' 13 | required: false 14 | default: '30' 15 | 16 | jobs: 17 | preflight: 18 | name: Preflight Checks 19 | runs-on: ubuntu-latest 20 | outputs: 21 | api_key_configured: ${{ steps.check_api_key.outputs.configured }} 22 | steps: 23 | - name: Check Hot Aisle API Key 24 | id: check_api_key 25 | run: | 26 | if [[ -n "${{ 
secrets.HOTAISLE_API_KEY }}" ]]; then 27 | echo "configured=true" >> $GITHUB_OUTPUT 28 | echo "✅ Hot Aisle API key is configured" 29 | else 30 | echo "configured=false" >> $GITHUB_OUTPUT 31 | echo "❌ Hot Aisle API key is not configured" 32 | echo "Please set HOTAISLE_API_KEY in repository secrets to use this workflow" 33 | exit 1 34 | fi 35 | 36 | gpu-testing: 37 | name: GPU Testing on Hot Aisle 38 | needs: preflight 39 | if: needs.preflight.outputs.api_key_configured == 'true' 40 | runs-on: ubuntu-latest 41 | strategy: 42 | matrix: 43 | gpu_type: [nvidia, amd, intel] 44 | steps: 45 | - name: Checkout code 46 | uses: actions/checkout@v4 47 | 48 | - name: Set up Rust 49 | uses: actions-rs/toolchain@v1 50 | with: 51 | toolchain: stable 52 | components: rustfmt, clippy 53 | 54 | - name: Build GPU Kill with Hot Aisle support 55 | run: | 56 | cargo build --release --features hotaisle 57 | # Verify binary was created 58 | ls -la target/release/gpukill 59 | 60 | - name: Test Hot Aisle Integration 61 | run: | 62 | chmod +x scripts/test-hotaisle-integration.sh 63 | ./scripts/test-hotaisle-integration.sh 64 | 65 | - name: Provision GPU Instance 66 | id: provision 67 | run: | 68 | # This would use the Hot Aisle client to provision an instance 69 | echo "Provisioning ${{ matrix.gpu_type }} GPU instance..." 70 | # For now, we'll simulate this step 71 | echo "instance_id=test-instance-123" >> $GITHUB_OUTPUT 72 | echo "instance_ip=192.168.1.100" >> $GITHUB_OUTPUT 73 | 74 | - name: Deploy and Test on GPU Instance 75 | run: | 76 | echo "Deploying GPU Kill to instance ${{ steps.provision.outputs.instance_id }}" 77 | echo "Running GPU tests on ${{ matrix.gpu_type }} hardware..." 
78 | # This would use the Hot Aisle client to deploy and run tests 79 | # For now, we'll simulate the test results 80 | echo "✅ GPU detection tests passed" 81 | echo "✅ GPU performance tests passed" 82 | echo "✅ GPU stress tests passed" 83 | 84 | - name: Cleanup GPU Instance 85 | if: always() 86 | run: | 87 | echo "Cleaning up instance ${{ steps.provision.outputs.instance_id }}" 88 | # This would use the Hot Aisle client to terminate the instance 89 | -------------------------------------------------------------------------------- /dashboard/assets/css/main.css: -------------------------------------------------------------------------------- 1 | @tailwind base; 2 | @tailwind components; 3 | @tailwind utilities; 4 | 5 | /* Import aggressive overscroll fix */ 6 | /* @import './overscroll-fix.css'; */ 7 | 8 | /* Custom styles for GPU Kill Dashboard */ 9 | @layer base { 10 | /* Force dark background on all elements */ 11 | *, *::before, *::after { 12 | box-sizing: border-box; 13 | } 14 | 15 | html { 16 | font-family: 'Inter', system-ui, sans-serif; 17 | background: #000000 !important; 18 | background-color: #000000 !important; 19 | overscroll-behavior: none; 20 | -webkit-overflow-scrolling: touch; 21 | height: 100%; 22 | width: 100%; 23 | } 24 | 25 | body { 26 | background: #000000 !important; 27 | background-color: #000000 !important; 28 | overscroll-behavior: none; 29 | margin: 0; 30 | padding: 0; 31 | min-height: 100vh; 32 | height: 100%; 33 | width: 100%; 34 | overflow-x: hidden; 35 | position: relative; 36 | } 37 | 38 | /* Target the Nuxt root element */ 39 | #__nuxt { 40 | background: #000000 !important; 41 | background-color: #000000 !important; 42 | min-height: 100vh; 43 | height: 100%; 44 | width: 100%; 45 | } 46 | 47 | /* Prevent any white backgrounds from showing through */ 48 | div, main, section, article, header, footer, nav, aside { 49 | background-color: transparent !important; 50 | } 51 | 52 | /* Fix overscroll bounce on all platforms */ 53 | html, 
body, #__nuxt { 54 | overscroll-behavior: none !important; 55 | overscroll-behavior-y: none !important; 56 | overscroll-behavior-x: none !important; 57 | } 58 | 59 | /* iOS specific fixes */ 60 | @supports (-webkit-touch-callout: none) { 61 | html, body { 62 | position: fixed; 63 | height: 100%; 64 | width: 100%; 65 | overflow: hidden; 66 | } 67 | 68 | #__nuxt { 69 | position: fixed; 70 | top: 0; 71 | left: 0; 72 | right: 0; 73 | bottom: 0; 74 | overflow-y: auto; 75 | -webkit-overflow-scrolling: touch; 76 | } 77 | } 78 | 79 | /* Additional overscroll fixes */ 80 | .overscroll-none { 81 | overscroll-behavior: none !important; 82 | } 83 | 84 | /* Prevent rubber band effect */ 85 | .no-bounce { 86 | overscroll-behavior-y: none !important; 87 | -webkit-overflow-scrolling: touch; 88 | } 89 | } 90 | 91 | @layer components { 92 | .gpu-card { 93 | @apply bg-white dark:bg-gray-800 rounded-lg shadow-sm border border-gray-200 dark:border-gray-700 p-4 hover:shadow-md transition-shadow; 94 | } 95 | 96 | .status-online { 97 | @apply bg-green-100 text-green-800 dark:bg-green-900 dark:text-green-200; 98 | } 99 | 100 | .status-offline { 101 | @apply bg-red-100 text-red-800 dark:bg-red-900 dark:text-red-200; 102 | } 103 | 104 | .status-degraded { 105 | @apply bg-yellow-100 text-yellow-800 dark:bg-yellow-900 dark:text-yellow-200; 106 | } 107 | 108 | .metric-card { 109 | @apply bg-gradient-to-br from-blue-50 to-indigo-100 dark:from-gray-800 dark:to-gray-700 rounded-lg p-4 border border-blue-200 dark:border-gray-600; 110 | } 111 | 112 | .utilization-bar { 113 | @apply w-full bg-gray-200 dark:bg-gray-700 rounded-full h-2.5; 114 | } 115 | 116 | .utilization-fill { 117 | @apply h-2.5 rounded-full transition-all duration-300; 118 | } 119 | 120 | .utilization-low { 121 | @apply bg-green-500; 122 | } 123 | 124 | .utilization-medium { 125 | @apply bg-yellow-500; 126 | } 127 | 128 | .utilization-high { 129 | @apply bg-red-500; 130 | } 131 | } 132 | 
-------------------------------------------------------------------------------- /.github/workflows/gpu-testing.yml: -------------------------------------------------------------------------------- 1 | name: GPU Hardware Testing 2 | 3 | on: 4 | push: 5 | branches: [main, develop] 6 | pull_request: 7 | branches: [main] 8 | workflow_dispatch: 9 | inputs: 10 | gpu_vendor: 11 | description: 'GPU vendor to test' 12 | required: true 13 | default: 'all' 14 | type: choice 15 | options: 16 | - all 17 | - nvidia 18 | - amd 19 | - intel 20 | - apple 21 | 22 | permissions: 23 | contents: read 24 | 25 | env: 26 | RUST_BACKTRACE: 1 27 | RUST_LOG: info 28 | 29 | jobs: 30 | # Cross-platform compatibility tests 31 | cross-platform-tests: 32 | name: Cross-Platform Tests 33 | runs-on: ${{ matrix.os }} 34 | strategy: 35 | matrix: 36 | os: [ubuntu-22.04, macos-13, windows-2022] 37 | include: 38 | - os: ubuntu-22.04 39 | install_deps: | 40 | sudo apt-get update 41 | sudo apt-get install -y build-essential libssl-dev pkg-config 42 | - os: macos-13 43 | install_deps: | 44 | xcode-select --install || true 45 | - os: windows-2022 46 | install_deps: | 47 | # Windows dependencies handled by vcpkg 48 | 49 | steps: 50 | - name: Checkout code 51 | uses: actions/checkout@v4 52 | 53 | - name: Install Rust 54 | uses: dtolnay/rust-toolchain@stable 55 | with: 56 | components: rustfmt, clippy 57 | 58 | - name: Install system dependencies 59 | run: ${{ matrix.install_deps }} 60 | 61 | - name: Build and test 62 | run: | 63 | cargo build --release 64 | cargo test --features mock_nvml 65 | cargo clippy --all-targets --all-features -- -D warnings 66 | cargo fmt --all -- --check 67 | 68 | # Security and compliance tests 69 | security-tests: 70 | name: Security Tests 71 | runs-on: ubuntu-22.04 72 | 73 | steps: 74 | - name: Checkout code 75 | uses: actions/checkout@v4 76 | 77 | - name: Install Rust 78 | uses: dtolnay/rust-toolchain@stable 79 | 80 | - name: Install security tools 81 | run: | 82 | sudo apt-get 
update 83 | sudo apt-get install -y build-essential libssl-dev pkg-config 84 | cargo install cargo-audit 85 | cargo install cargo-deny 86 | 87 | - name: Security audit 88 | run: | 89 | cargo audit 90 | cargo deny check 91 | 92 | - name: Build with security flags 93 | run: | 94 | RUSTFLAGS="-C target-cpu=native" cargo build --release 95 | strip target/release/gpukill 96 | 97 | # Documentation and API tests 98 | api-tests: 99 | name: API Tests 100 | runs-on: ubuntu-22.04 101 | 102 | steps: 103 | - name: Checkout code 104 | uses: actions/checkout@v4 105 | 106 | - name: Install Rust 107 | uses: dtolnay/rust-toolchain@stable 108 | 109 | - name: Install dependencies 110 | run: | 111 | sudo apt-get update 112 | sudo apt-get install -y build-essential libssl-dev pkg-config 113 | 114 | - name: Test MCP server 115 | run: | 116 | cargo build --release -p gpukill-mcp 117 | # Test MCP server startup 118 | timeout 10 ./target/release/gpukill-mcp || true 119 | 120 | - name: Test HTTP server 121 | run: | 122 | cargo build --release 123 | # Test HTTP server startup 124 | timeout 10 ./target/release/gpukill --server --server-port 8080 || true -------------------------------------------------------------------------------- /dashboard/README.md: -------------------------------------------------------------------------------- 1 | # GPU Kill Dashboard 2 | 3 | A modern, responsive dashboard for monitoring GPU clusters built with Nuxt.js and Tailwind CSS. 
4 | 5 | ## Features 6 | 7 | - **Real-time Cluster Monitoring**: Live updates via WebSocket 8 | - **Magic Moment**: Instant visibility into GPU contention and blocked resources 9 | - **Rogue Detection**: Security monitoring with threat detection and risk scoring 10 | - **Guard Mode Management**: Policy enforcement with user, group, and GPU policies 11 | - **Auto-refresh**: Automatic data updates with manual refresh controls 12 | - **Data Persistence**: Policy data saved locally across page refreshes 13 | - **Interactive Controls**: Toggle switches for enforcement modes 14 | - **Policy Management**: Complete CRUD operations for User, Group, and GPU policies 15 | - **Policy Testing**: Built-in policy simulation and testing interface 16 | 17 | ## Quick Start 18 | 19 | 1. **Start the GPU Kill Coordinator Server**: 20 | ```bash 21 | cd /path/to/gpu-kill 22 | ./target/release/gpukill --server --server-port 8080 23 | ``` 24 | 25 | 2. **Start the Dashboard**: 26 | ```bash 27 | cd dashboard 28 | npm install # First time only 29 | npm run dev 30 | ``` 31 | 32 | 3. 
**Open your browser**: 33 | - Dashboard: http://localhost:3000 34 | - API: http://localhost:8080 35 | 36 | ## Dashboard Views 37 | 38 | ### Overview Page 39 | - **Cluster Statistics**: Total nodes, GPUs, memory, and average utilization 40 | - **Real-time Metrics**: Live indicators with auto-refresh 41 | - **Magic Moment**: GPU contention analysis with blocked resources 42 | - **Top Users**: Ranked list of users by GPU memory consumption 43 | - **Node Details**: Individual node status and health information 44 | 45 | ### Detection Page 46 | - **Threat Detection**: Real-time security monitoring 47 | - **Risk Scoring**: Confidence-based threat assessment 48 | - **Crypto Miner Detection**: Identifies mining software and patterns 49 | - **Suspicious Processes**: Flags unusual process behavior 50 | - **Resource Abuse Monitoring**: Detects excessive memory usage 51 | - **Interactive Scanning**: Manual scan controls with loading states 52 | 53 | ### Guard Page 54 | - **Policy Management**: User, Group, and GPU policy configuration 55 | - **Enforcement Controls**: Soft/hard enforcement toggle switches 56 | - **Policy Statistics**: Modern gradient cards showing policy counts 57 | - **Visual Tables**: Clean display of all policies with action buttons 58 | - **Modal Forms**: Intuitive policy creation with validation 59 | - **Policy Testing**: Built-in simulation and testing interface 60 | - **Data Persistence**: Policy data saved locally across refreshes 61 | 62 | ## Configuration 63 | 64 | The dashboard automatically connects to the GPU Kill coordinator API. 
You can configure the API endpoint: 65 | 66 | ```bash 67 | # Set custom API base URL 68 | export API_BASE=http://your-server:8080 69 | npm run dev 70 | ``` 71 | 72 | ## Development 73 | 74 | ```bash 75 | # Install dependencies 76 | npm install 77 | 78 | # Start development server 79 | npm run dev 80 | 81 | # Build for production 82 | npm run build 83 | 84 | # Preview production build 85 | npm run preview 86 | ``` 87 | 88 | ## API Integration 89 | 90 | The dashboard connects to the GPU Kill coordinator API endpoints: 91 | 92 | - `GET /api/cluster/snapshot` - Cluster overview data 93 | - `GET /api/cluster/contention` - Magic Moment analysis 94 | - `GET /api/cluster/rogue` - Rogue detection results 95 | - `GET /api/guard/config` - Guard Mode configuration 96 | - `GET /api/guard/status` - Guard Mode status 97 | - `POST /api/guard/toggle-dry-run` - Toggle dry-run mode 98 | - `POST /api/guard/test-policies` - Test policy enforcement 99 | - `WS /ws` - WebSocket for real-time updates -------------------------------------------------------------------------------- /scripts/test-hotaisle-integration-simple.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # GPU Kill - Hot Aisle Integration Test Script (CI-friendly version) 4 | # This script tests the Hot Aisle integration without requiring actual API access 5 | 6 | # Colors for output 7 | RED='\033[0;31m' 8 | GREEN='\033[0;32m' 9 | YELLOW='\033[1;33m' 10 | BLUE='\033[0;34m' 11 | NC='\033[0m' # No Color 12 | 13 | # Logging functions 14 | log_info() { 15 | echo -e "${BLUE}[INFO]${NC} $1" 16 | } 17 | 18 | log_success() { 19 | echo -e "${GREEN}[SUCCESS]${NC} $1" 20 | } 21 | 22 | log_warning() { 23 | echo -e "${YELLOW}[WARNING]${NC} $1" 24 | } 25 | 26 | log_error() { 27 | echo -e "${RED}[ERROR]${NC} $1" 28 | } 29 | 30 | # Test results 31 | TESTS_PASSED=0 32 | TESTS_FAILED=0 33 | 34 | # Function to run a test 35 | run_test() { 36 | local test_name="$1" 37 | local 
test_command="$2" 38 | 39 | log_info "Running test: $test_name" 40 | log_info "Command: $test_command" 41 | 42 | if eval "$test_command"; then 43 | log_success "✅ $test_name passed" 44 | ((TESTS_PASSED++)) 45 | else 46 | log_error "❌ $test_name failed" 47 | log_error "Command that failed: $test_command" 48 | ((TESTS_FAILED++)) 49 | fi 50 | echo 51 | } 52 | 53 | # Main test function 54 | main() { 55 | log_info "Starting Hot Aisle Integration Tests (Simple Version)" 56 | echo "========================================" 57 | 58 | # Debug information 59 | log_info "Environment:" 60 | log_info " CI: ${CI:-false}" 61 | log_info " PWD: $(pwd)" 62 | log_info " USER: ${USER:-unknown}" 63 | echo 64 | 65 | # Test 1: Check if we're in the right directory 66 | run_test "Project Root Check" '[[ -f "Cargo.toml" ]]' 67 | 68 | # Test 2: Check if Rust is available 69 | run_test "Rust Toolchain Check" 'command -v cargo > /dev/null 2>&1' 70 | 71 | # Test 3: Check if git is available 72 | run_test "Git Check" 'command -v git > /dev/null 2>&1' 73 | 74 | # Test 4: Build without Hot Aisle feature 75 | run_test "Build Without Hot Aisle Feature" 'cargo build --release' 76 | 77 | # Test 5: Build with Hot Aisle feature 78 | run_test "Build With Hot Aisle Feature" 'cargo build --release --features hotaisle' 79 | 80 | # Test 6: Check if Hot Aisle client compiles 81 | run_test "Hot Aisle Client Compilation" 'cargo check --features hotaisle' 82 | 83 | # Test 7: Validate test script syntax 84 | run_test "Test Script Syntax Check" 'bash -n scripts/run-gpu-tests.sh' 85 | 86 | # Test 8: Check if workflow file exists 87 | run_test "Workflow File Exists" '[[ -f ".github/workflows/hotaisle-gpu-testing.yml" ]]' 88 | 89 | # Test 9: Check if documentation exists 90 | run_test "Documentation Exists" '[[ -f "docs/HOTAISLE_INTEGRATION.md" ]]' 91 | 92 | # Test 10: Validate Cargo.toml has hotaisle feature 93 | run_test "Hot Aisle Feature in Cargo.toml" 'grep -q "hotaisle = \\[\\]" Cargo.toml' 94 | 95 | # Test 
11: Check if lib.rs has conditional compilation 96 | run_test "Conditional Compilation in lib.rs" 'grep -q "#\\[cfg(feature = \"hotaisle\")\\]" src/lib.rs' 97 | 98 | # Summary 99 | echo "========================================" 100 | log_info "Test Summary:" 101 | log_success "✅ Tests Passed: $TESTS_PASSED" 102 | if [[ $TESTS_FAILED -gt 0 ]]; then 103 | log_error "❌ Tests Failed: $TESTS_FAILED" 104 | exit 1 105 | else 106 | log_success "✅ Tests Failed: $TESTS_FAILED" 107 | fi 108 | 109 | log_success "🎉 All integration tests passed!" 110 | log_info "The Hot Aisle integration is ready for use with a valid API key." 111 | exit 0 112 | } 113 | 114 | # Run main function 115 | main "$@" 116 | -------------------------------------------------------------------------------- /scripts/install.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env sh 2 | set -e 3 | 4 | # gpukill install script: fetch prebuilt GitHub release binary 5 | 6 | REPO_OWNER="treadiehq" 7 | REPO_NAME="gpu-kill" 8 | INSTALL_DIR_DEFAULT="$HOME/.local/bin" 9 | BIN_NAME="gpukill" 10 | 11 | # Flags 12 | VERSION="" 13 | BIN_DIR="" 14 | YES="0" 15 | INSECURE="0" 16 | 17 | usage() { 18 | echo "Usage: curl -fsSL https://get.gpukill.sh | sh [-s] -- [--version vX.Y.Z] [--bin-dir DIR] [--yes] [--insecure]" >&2 19 | } 20 | 21 | while [ $# -gt 0 ]; do 22 | case "$1" in 23 | --version) VERSION="$2"; shift 2 ;; 24 | --bin-dir) BIN_DIR="$2"; shift 2 ;; 25 | --yes|-y) YES="1"; shift ;; 26 | --insecure) INSECURE="1"; shift ;; 27 | -h|--help) usage; exit 0 ;; 28 | *) echo "Unknown option: $1" >&2; usage; exit 1 ;; 29 | esac 30 | done 31 | 32 | detect_os() { 33 | uname_s=$(uname -s 2>/dev/null || echo unknown) 34 | case "$uname_s" in 35 | Linux) echo linux ;; 36 | Darwin) echo macos ;; 37 | *) echo unsupported ;; 38 | esac 39 | } 40 | 41 | detect_arch() { 42 | uname_m=$(uname -m 2>/dev/null || echo unknown) 43 | case "$uname_m" in 44 | x86_64|amd64) echo x86_64 ;; 45 | 
aarch64|arm64) echo aarch64 ;; 46 | *) echo unsupported ;; 47 | esac 48 | } 49 | 50 | need_cmd() { command -v "$1" >/dev/null 2>&1 || { echo "Missing required command: $1" >&2; exit 1; }; } 51 | 52 | need_cmd curl 53 | need_cmd uname 54 | need_cmd mkdir 55 | need_cmd chmod 56 | 57 | OS=$(detect_os) 58 | ARCH=$(detect_arch) 59 | if [ "$OS" = "unsupported" ] || [ "$ARCH" = "unsupported" ]; then 60 | echo "Unsupported platform: OS=$OS ARCH=$ARCH" >&2 61 | exit 1 62 | fi 63 | 64 | BIN_DIR=${BIN_DIR:-$INSTALL_DIR_DEFAULT} 65 | mkdir -p "$BIN_DIR" 66 | 67 | # Resolve version 68 | API="https://api.github.com/repos/$REPO_OWNER/$REPO_NAME/releases/latest" 69 | if [ -n "$VERSION" ]; then 70 | API="https://api.github.com/repos/$REPO_OWNER/$REPO_NAME/releases/tags/$VERSION" 71 | fi 72 | 73 | echo "Resolving release…" 74 | TAG=$(curl -fsSL "$API" | sed -n 's/ \"tag_name\": \"\(.*\)\",/\1/p' | head -n1) 75 | if [ -z "$TAG" ]; then 76 | echo "Failed to resolve release tag" >&2 77 | exit 1 78 | fi 79 | 80 | case "$OS-$ARCH" in 81 | linux-x86_64) ASSET="gpukill-$TAG-linux-x86_64" ;; 82 | linux-aarch64) ASSET="gpukill-$TAG-linux-aarch64" ;; 83 | macos-x86_64) ASSET="gpukill-$TAG-macos-x86_64" ;; 84 | macos-aarch64) ASSET="gpukill-$TAG-macos-aarch64" ;; 85 | # FIX: without a default branch ASSET stayed empty for any unmatched platform and
 # the script went on to download a bogus URL; fail fast with a clear message.
 86 | *) echo "No prebuilt binary for $OS-$ARCH" >&2; exit 1 ;; 87 | esac 88 | # NOTE(review): these names assume raw-binary release assets; release.yml packages at
 # least the Windows build as a .zip — confirm Linux/macOS assets really are raw binaries.
 89 | URL_BASE="https://github.com/$REPO_OWNER/$REPO_NAME/releases/download/$TAG" 90 | BIN_URL="$URL_BASE/$ASSET" 91 | SUMS_URL="$URL_BASE/SHA256SUMS" 92 | 93 | TMPDIR=${TMPDIR:-/tmp} 94 | TMP_BIN="$TMPDIR/$ASSET" 95 | TMP_SUMS="$TMPDIR/${REPO_NAME}_SHA256SUMS" 96 | 97 | echo "Downloading binary: $BIN_URL" 98 | curl -fsSL "$BIN_URL" -o "$TMP_BIN" 99 | 100 | echo "Downloading checksums: $SUMS_URL" 101 | curl -fsSL "$SUMS_URL" -o "$TMP_SUMS" || true 102 | 103 | if [ -s "$TMP_SUMS" ]; then 104 | # FIX: need_cmd exits the whole script when its argument is missing, so the old
 # "need_cmd shasum || need_cmd sha256sum" could never reach the || branch and
 # aborted on systems that only have sha256sum. Require at least one tool instead.
 105 | if ! command -v shasum >/dev/null 2>&1 && ! command -v sha256sum >/dev/null 2>&1; then 106 | echo "Missing required command: shasum or sha256sum" >&2 107 | exit 1 108 | fi 109 | if command -v shasum >/dev/null 2>&1; then 110 | SUM=$(shasum -a 256 "$TMP_BIN" | awk '{print $1}') 111 | else 112 | SUM=$(sha256sum "$TMP_BIN" | awk '{print $1}') 113 | fi 114 | if ! grep -q "$SUM" "$TMP_SUMS"; then 115 | if [ "$INSECURE" != "1" ]; then 116 | echo "Checksum verification failed" >&2 117 | exit 1 118 | else 119 | echo "WARNING: checksum verification skipped (--insecure)" >&2 120 | fi 121 | fi 122 | else 123 | echo "WARNING: no checksum file found in release; proceeding" >&2 124 | fi 125 | 126 | DEST="$BIN_DIR/$BIN_NAME" 127 | mv "$TMP_BIN" "$DEST" 128 | chmod +x "$DEST" 129 | 130 | if ! printf %s ":$PATH:" | grep -q ":$BIN_DIR:"; then 131 | echo "Installed to $DEST but $BIN_DIR is not in PATH" >&2 132 | echo "Add this to your shell rc: export PATH=\"$BIN_DIR:\$PATH\"" >&2 133 | fi 134 | 135 | echo "✅ Installed $BIN_NAME $TAG to $DEST" 136 | "$DEST" --version || true 137 | 138 | -------------------------------------------------------------------------------- /dashboard/components/AppSidebar.vue: -------------------------------------------------------------------------------- 1 | 70 | 71 | 116 | -------------------------------------------------------------------------------- /mcp/README.md: -------------------------------------------------------------------------------- 1 | # GPU Kill MCP Server 2 | 3 | An MCP server for GPU Kill, enabling AI assistants and other tools to interact with GPU management functionality through a standardized interface.
4 | 5 | ## Features 6 | 7 | ### Resources (Read-only data) 8 | - **gpu://list** - Current GPU status and utilization 9 | - **gpu://processes** - Currently running GPU processes 10 | - **gpu://audit** - Historical GPU usage data 11 | - **gpu://policies** - Current Guard Mode policies 12 | - **gpu://rogue-detection** - Security scan results and threats 13 | 14 | ### Tools (Actions) 15 | - **kill_gpu_process** - Kill a GPU process by PID 16 | - **reset_gpu** - Reset a GPU by ID 17 | - **scan_rogue_activity** - Scan for suspicious GPU activity 18 | - **create_user_policy** - Create a user policy for Guard Mode 19 | - **get_gpu_status** - Get detailed status of a specific GPU 20 | - **kill_processes_by_name** - Kill all processes matching a name pattern 21 | 22 | ## Quick Start 23 | 24 | ### Build and Run 25 | 26 | ```bash 27 | # Build the MCP server 28 | cargo build --release -p gpukill-mcp 29 | 30 | # Run the MCP server 31 | cargo run --release -p gpukill-mcp 32 | 33 | # Or run with custom port 34 | MCP_PORT=3001 cargo run --release -p gpukill-mcp 35 | ``` 36 | 37 | ### Using with AI Assistants 38 | 39 | The MCP server exposes GPU management capabilities through a JSON-RPC interface that AI assistants can use to: 40 | 41 | - Monitor GPU usage and performance 42 | - Kill stuck or problematic processes 43 | - Reset crashed GPUs 44 | - Scan for security threats 45 | - Manage resource policies 46 | - Automate GPU operations 47 | 48 | ### Example Usage 49 | 50 | ```bash 51 | # Start the MCP server 52 | cargo run --release -p gpukill-mcp 53 | 54 | # The server will be available at http://localhost:3001/mcp 55 | # AI assistants can connect and use the available tools and resources 56 | ``` 57 | 58 | ### Natural Language Examples 59 | 60 | Ask your AI assistant to use the MCP tools with natural language: 61 | 62 | ```text 63 | What GPUs do I have and what's their current usage? 
64 | ``` 65 | 66 | ```text 67 | Kill the Python process that's stuck on GPU 0 68 | ``` 69 | 70 | ```text 71 | Kill all training processes that are using too much GPU memory 72 | ``` 73 | 74 | ```text 75 | Show me GPU usage and kill any stuck processes 76 | ``` 77 | 78 | ```text 79 | Scan for crypto miners and suspicious activity 80 | ``` 81 | 82 | ```text 83 | Create a policy to limit user memory usage to 8GB 84 | ``` 85 | 86 | ```text 87 | Reset GPU 1 because it's not responding 88 | ``` 89 | 90 | ```text 91 | What processes are currently using my GPUs? 92 | ``` 93 | 94 | ## API Endpoints 95 | 96 | ### HTTP Interface 97 | 98 | - **POST /mcp** - Main MCP JSON-RPC endpoint 99 | - **GET /health** - Health check endpoint 100 | 101 | ### MCP Methods 102 | 103 | - **initialize** - Initialize the MCP connection 104 | - **resources/list** - List available resources 105 | - **resources/read** - Read resource contents 106 | - **tools/list** - List available tools 107 | - **tools/call** - Execute a tool 108 | 109 | ## Configuration 110 | 111 | The MCP server can be configured using environment variables: 112 | 113 | - **MCP_PORT** - Port to listen on (default: 3001) 114 | - **RUST_LOG** - Logging level (default: info) 115 | 116 | ## Integration 117 | 118 | This MCP server enables AI assistants to: 119 | 120 | 1. **Monitor GPU Health**: Check GPU status, utilization, and memory usage 121 | 2. **Manage Processes**: Kill problematic processes or reset GPUs 122 | 3. **Security Monitoring**: Scan for crypto miners and suspicious activity 123 | 4. **Policy Management**: Create and manage resource policies 124 | 5. 
**Automation**: Automate routine GPU management tasks 125 | 126 | ## Development 127 | 128 | ```bash 129 | # Run in development mode 130 | cargo run -p gpukill-mcp 131 | 132 | # Run with debug logging 133 | RUST_LOG=debug cargo run -p gpukill-mcp 134 | 135 | # Test the server 136 | curl -X POST http://localhost:3001/mcp \ 137 | -H "Content-Type: application/json" \ 138 | -d '{"jsonrpc":"2.0","id":"1","method":"tools/list","params":{}}' 139 | ``` 140 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | # Functional Source License, Version 1.1, MIT Future License 2 | 3 | ## Abbreviation 4 | 5 | FSL-1.1-MIT 6 | 7 | ## Notice 8 | 9 | Copyright (c) 2024 Treadie, Inc 10 | 11 | ## Terms and Conditions 12 | 13 | ### Licensor ("We") 14 | 15 | The party offering the Software under these Terms and Conditions. 16 | 17 | ### The Software 18 | 19 | The "Software" is each version of the software that we make available under 20 | these Terms and Conditions, as indicated by our inclusion of these Terms and 21 | Conditions with the Software. 22 | 23 | ### License Grant 24 | 25 | Subject to your compliance with this License Grant and the Patents, 26 | Redistribution and Trademark clauses below, we hereby grant you the right to 27 | use, copy, modify, create derivative works, publicly perform, publicly display 28 | and redistribute the Software for any Permitted Purpose identified below. 29 | 30 | ### Permitted Purpose 31 | 32 | A Permitted Purpose is any purpose other than a Competing Use. A Competing Use 33 | means making the Software available to others in a commercial product or 34 | service that: 35 | 36 | 1. substitutes for the Software; 37 | 38 | 2. substitutes for any other product or service we offer using the Software 39 | that exists as of the date we make the Software available; or 40 | 41 | 3. 
offers the same or substantially similar functionality as the Software. 42 | 43 | Permitted Purposes specifically include using the Software: 44 | 45 | 1. for your internal use and access; 46 | 47 | 2. for non-commercial education; 48 | 49 | 3. for non-commercial research; and 50 | 51 | 4. in connection with professional services that you provide to a licensee 52 | using the Software in accordance with these Terms and Conditions. 53 | 54 | ### Patents 55 | 56 | To the extent your use for a Permitted Purpose would necessarily infringe our 57 | patents, the license grant above includes a license under our patents. If you 58 | make a claim against any party that the Software infringes or contributes to 59 | the infringement of any patent, then your patent license to the Software ends 60 | immediately. 61 | 62 | ### Redistribution 63 | 64 | The Terms and Conditions apply to all copies, modifications and derivatives of 65 | the Software. 66 | 67 | If you redistribute any copies, modifications or derivatives of the Software, 68 | you must include a copy of or a link to these Terms and Conditions and not 69 | remove any copyright notices provided in or with the Software. 70 | 71 | ### Disclaimer 72 | 73 | THE SOFTWARE IS PROVIDED "AS IS" AND WITHOUT WARRANTIES OF ANY KIND, EXPRESS OR 74 | IMPLIED, INCLUDING WITHOUT LIMITATION WARRANTIES OF FITNESS FOR A PARTICULAR 75 | PURPOSE, MERCHANTABILITY, TITLE OR NON-INFRINGEMENT. 76 | 77 | IN NO EVENT WILL WE HAVE ANY LIABILITY TO YOU ARISING OUT OF OR RELATED TO THE 78 | SOFTWARE, INCLUDING INDIRECT, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES, 79 | EVEN IF WE HAVE BEEN INFORMED OF THEIR POSSIBILITY IN ADVANCE. 80 | 81 | ### Trademarks 82 | 83 | Except for displaying the License Details and identifying us as the origin of 84 | the Software, you have no right under these Terms and Conditions to use our 85 | trademarks, trade names, service marks or product names. 
86 | 87 | ## Grant of Future License 88 | 89 | We hereby irrevocably grant you an additional license to use the Software under 90 | the MIT license that is effective on the second anniversary of the date we make 91 | the Software available. On or after that date, you may use the Software under 92 | the MIT license, in which case the following will apply: 93 | 94 | Permission is hereby granted, free of charge, to any person obtaining a copy of 95 | this software and associated documentation files (the "Software"), to deal in 96 | the Software without restriction, including without limitation the rights to 97 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies 98 | of the Software, and to permit persons to whom the Software is furnished to do 99 | so, subject to the following conditions: 100 | 101 | The above copyright notice and this permission notice shall be included in all 102 | copies or substantial portions of the Software. 103 | 104 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 105 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 106 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 107 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 108 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 109 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 110 | SOFTWARE. 
-------------------------------------------------------------------------------- /src/util.rs: -------------------------------------------------------------------------------- 1 | use chrono::{DateTime, Local, Utc}; 2 | use std::time::{Duration, SystemTime}; 3 | 4 | /// Get the current hostname 5 | pub fn get_hostname() -> String { 6 | hostname::get() 7 | .unwrap_or_else(|_| std::ffi::OsString::from("unknown")) 8 | .to_string_lossy() 9 | .to_string() 10 | } 11 | 12 | /// Format a timestamp as a human-readable string 13 | #[allow(dead_code)] 14 | pub fn format_timestamp(timestamp: SystemTime) -> String { 15 | // FIX: the generic parameter was dropped by the extraction (bare `DateTime` does not
 // compile); local time for the human-readable format, matching the `Local` import.
 let datetime: DateTime<Local> = timestamp.into(); 16 | datetime.format("%Y-%m-%d %H:%M:%S").to_string() 17 | } 18 | 19 | /// Format a timestamp as ISO 8601 string 20 | pub fn format_timestamp_iso(timestamp: SystemTime) -> String { 21 | // FIX: restored `<Utc>` (also stripped); the format string appends a literal "Z",
 // which is only truthful for a UTC timestamp.
 let datetime: DateTime<Utc> = timestamp.into(); 22 | datetime.format("%Y-%m-%dT%H:%M:%S%.3fZ").to_string() 23 | } 24 | 25 | /// Get current timestamp as ISO 8601 string 26 | pub fn get_current_timestamp_iso() -> String { 27 | format_timestamp_iso(SystemTime::now()) 28 | } 29 | 30 | /// Format duration as human-readable string 31 | #[allow(dead_code)] 32 | pub fn format_duration(duration: Duration) -> String { 33 | let total_seconds = duration.as_secs(); 34 | let hours = total_seconds / 3600; 35 | let minutes = (total_seconds % 3600) / 60; 36 | let seconds = total_seconds % 60; 37 | 38 | if hours > 0 { 39 | format!("{}h {}m {}s", hours, minutes, seconds) 40 | } else if minutes > 0 { 41 | format!("{}m {}s", minutes, seconds) 42 | } else { 43 | format!("{}s", seconds) 44 | } 45 | } 46 | 47 | /// Format memory size in bytes to human-readable format 48 | #[allow(dead_code)] 49 | pub fn format_memory_size(bytes: u64) -> String { 50 | const UNITS: &[&str] = &["B", "KB", "MB", "GB", "TB"]; 51 | const THRESHOLD: u64 = 1024; 52 | 53 | if bytes == 0 { 54 | return "0 B".to_string(); 55 | } 56 | 57 | let mut size = bytes as f64; 58 | let mut unit_index = 0; 59 | 60 | while size >= THRESHOLD as f64 && unit_index < UNITS.len() - 1 { 61 | size /= THRESHOLD as f64; 62 | unit_index += 1; 63 | } 64 | 65 | if unit_index == 0 { 66 | format!("{} {}", bytes, UNITS[unit_index]) 67 | } else { 68 | format!("{:.1} {}", size, UNITS[unit_index]) 69 | } 70 | } 71 | 72 | /// Format memory size in MB to GiB 73 | pub fn format_memory_mb_to_gib(mb: u32) -> String { 74 | let gib = mb as f64 / 1024.0; 75 | format!("{:.1}", gib) 76 | } 77 | 78 | /// Check if running on Linux 79 | #[allow(dead_code)] 80 | pub fn is_linux() -> bool { 81 | cfg!(target_os = "linux") 82 | } 83 | 84 | /// Check if running on macOS 85 | #[allow(dead_code)] 86 | pub fn is_macos() -> bool { 87 | cfg!(target_os = "macos") 88 | } 89 | 90 | /// Check if running on Windows 91 | #[allow(dead_code)] 92 | pub fn is_windows() -> bool { 93 | cfg!(target_os = "windows") 94 | } 95 | 96 | /// Get operating system name 97 | #[allow(dead_code)] 98 | pub fn get_os_name() -> &'static str { 99 | if is_linux() { 100 | "Linux" 101 | } else if is_macos() { 102 | "macOS" 103 | } else if is_windows() { 104 | "Windows" 105 | } else { 106 | "Unknown" 107 | } 108 | } 109 | 110 | /// Truncate string to specified length (in chars) with ellipsis 111 | pub fn truncate_string(s: &str, max_len: usize) -> String { 112 | // FIX: the previous byte-index slice (`&s[..max_len - 3]`) panics when the cut
 // lands inside a multi-byte UTF-8 character; count and cut by chars instead.
 // Behavior for ASCII input is unchanged (the in-file tests still pass).
 if s.chars().count() <= max_len { 113 | s.to_string() 114 | } else { 115 | let kept: String = s.chars().take(max_len.saturating_sub(3)).collect(); 116 | format!("{}...", kept) 117 | } 118 | } 119 | 120 | /// Parse process start time from system time 121 | #[allow(dead_code)] 122 | pub fn parse_process_start_time(start_time: SystemTime) -> String { 123 | let now = SystemTime::now(); 124 | let duration = now.duration_since(start_time).unwrap_or_default(); 125 | format_duration(duration) 126 | } 127 | 128 | #[cfg(test)] 129 | mod tests { 130 | use super::*; 131 | use std::time::Duration; 132 | 133 | #[test] 134 | fn test_format_duration() { 135 | assert_eq!(format_duration(Duration::from_secs(30)), "30s"); 136 | 
#[cfg(test)]
mod tests {
    use super::*;
    use std::time::Duration;

    // Exercises all three output branches: seconds only, minutes+seconds,
    // and hours+minutes+seconds.
    #[test]
    fn test_format_duration() {
        assert_eq!(format_duration(Duration::from_secs(30)), "30s");
        assert_eq!(format_duration(Duration::from_secs(90)), "1m 30s");
        assert_eq!(format_duration(Duration::from_secs(3661)), "1h 1m 1s");
    }

    // Checks the zero special case and each power-of-1024 unit boundary.
    #[test]
    fn test_format_memory_size() {
        assert_eq!(format_memory_size(0), "0 B");
        assert_eq!(format_memory_size(1024), "1.0 KB");
        assert_eq!(format_memory_size(1024 * 1024), "1.0 MB");
        assert_eq!(format_memory_size(1024 * 1024 * 1024), "1.0 GB");
    }

    // MB -> GiB is a plain divide-by-1024 rendered with one decimal place.
    #[test]
    fn test_format_memory_mb_to_gib() {
        assert_eq!(format_memory_mb_to_gib(0), "0.0");
        assert_eq!(format_memory_mb_to_gib(1024), "1.0");
        assert_eq!(format_memory_mb_to_gib(2048), "2.0");
    }

    // Strings at or under the limit pass through unchanged; longer strings
    // are cut so that the result (including "...") fits max_len.
    #[test]
    fn test_truncate_string() {
        assert_eq!(truncate_string("short", 10), "short");
        assert_eq!(truncate_string("very long string", 10), "very lo...");
        assert_eq!(truncate_string("abc", 3), "abc");
    }

    // Sanity check: the build platform must map to a recognized OS name.
    #[test]
    fn test_os_detection() {
        // These tests will pass on the respective platforms
        assert!(get_os_name() != "Unknown");
    }
}
MCP Protocol Types for GPU Kill 2 | 3 | use serde::{Deserialize, Serialize}; 4 | use std::collections::HashMap; 5 | 6 | /// MCP Request/Response types 7 | #[derive(Debug, Serialize, Deserialize)] 8 | #[serde(tag = "jsonrpc", rename = "2.0")] 9 | pub struct JsonRpcRequest { 10 | pub id: String, 11 | pub method: String, 12 | pub params: Option, 13 | } 14 | 15 | #[derive(Debug, Serialize, Deserialize)] 16 | pub struct JsonRpcResponse { 17 | pub jsonrpc: String, 18 | pub id: String, 19 | #[serde(skip_serializing_if = "Option::is_none")] 20 | pub result: Option, 21 | #[serde(skip_serializing_if = "Option::is_none")] 22 | pub error: Option, 23 | } 24 | 25 | #[derive(Debug, Serialize, Deserialize)] 26 | pub struct JsonRpcError { 27 | pub code: i32, 28 | pub message: String, 29 | #[serde(skip_serializing_if = "Option::is_none")] 30 | pub data: Option, 31 | } 32 | 33 | /// MCP Protocol Messages 34 | #[derive(Debug, Serialize, Deserialize)] 35 | pub struct InitializeRequest { 36 | pub protocol_version: String, 37 | pub capabilities: ClientCapabilities, 38 | pub client_info: ClientInfo, 39 | } 40 | 41 | #[derive(Debug, Serialize, Deserialize)] 42 | pub struct InitializeResponse { 43 | pub protocol_version: String, 44 | pub capabilities: ServerCapabilities, 45 | pub server_info: ServerInfo, 46 | } 47 | 48 | #[derive(Debug, Serialize, Deserialize)] 49 | pub struct ClientCapabilities { 50 | #[serde(skip_serializing_if = "Option::is_none")] 51 | pub roots: Option, 52 | #[serde(skip_serializing_if = "Option::is_none")] 53 | pub sampling: Option, 54 | } 55 | 56 | #[derive(Debug, Serialize, Deserialize)] 57 | pub struct ServerCapabilities { 58 | #[serde(skip_serializing_if = "Option::is_none")] 59 | pub resources: Option, 60 | #[serde(skip_serializing_if = "Option::is_none")] 61 | pub tools: Option, 62 | #[serde(skip_serializing_if = "Option::is_none")] 63 | pub logging: Option, 64 | } 65 | 66 | #[derive(Debug, Serialize, Deserialize)] 67 | pub struct ClientInfo { 68 | pub name: 
String, 69 | pub version: String, 70 | } 71 | 72 | #[derive(Debug, Serialize, Deserialize)] 73 | pub struct ServerInfo { 74 | pub name: String, 75 | pub version: String, 76 | } 77 | 78 | #[derive(Debug, Serialize, Deserialize)] 79 | pub struct RootsCapability { 80 | pub list_changed: Option, 81 | } 82 | 83 | #[derive(Debug, Serialize, Deserialize)] 84 | pub struct SamplingCapability {} 85 | 86 | #[derive(Debug, Serialize, Deserialize)] 87 | pub struct ResourcesCapability { 88 | pub subscribe: Option, 89 | pub list_changed: Option, 90 | } 91 | 92 | #[derive(Debug, Serialize, Deserialize)] 93 | pub struct ToolsCapability { 94 | pub list_changed: Option, 95 | } 96 | 97 | #[derive(Debug, Serialize, Deserialize)] 98 | pub struct LoggingCapability {} 99 | 100 | /// Resource Types 101 | #[derive(Debug, Serialize, Deserialize)] 102 | pub struct Resource { 103 | pub uri: String, 104 | pub name: String, 105 | pub description: Option, 106 | pub mime_type: Option, 107 | } 108 | 109 | #[derive(Debug, Serialize, Deserialize)] 110 | pub struct ResourceContents { 111 | pub uri: String, 112 | pub mime_type: Option, 113 | pub text: Option, 114 | pub blob: Option, // Base64 encoded 115 | } 116 | 117 | /// Tool Types 118 | #[derive(Debug, Serialize, Deserialize)] 119 | pub struct Tool { 120 | pub name: String, 121 | pub description: Option, 122 | pub input_schema: serde_json::Value, 123 | } 124 | 125 | #[derive(Debug, Serialize, Deserialize)] 126 | pub struct ToolCall { 127 | pub name: String, 128 | pub arguments: Option>, 129 | } 130 | 131 | #[derive(Debug, Serialize, Deserialize)] 132 | pub struct ToolResult { 133 | pub content: Vec, 134 | pub is_error: Option, 135 | } 136 | 137 | #[derive(Debug, Serialize, Deserialize)] 138 | pub struct ToolContent { 139 | #[serde(rename = "type")] 140 | pub content_type: String, 141 | pub text: Option, 142 | #[serde(skip_serializing_if = "Option::is_none")] 143 | pub data: Option, 144 | } 145 | 146 | /// GPU Kill specific types 147 | 
#[derive(Debug, Serialize, Deserialize)] 148 | pub struct GpuInfo { 149 | pub id: u32, 150 | pub name: String, 151 | pub vendor: String, 152 | pub memory_used: f64, 153 | pub memory_total: f64, 154 | pub utilization: f64, 155 | pub temperature: Option, 156 | pub power_usage: Option, 157 | pub processes: Vec, 158 | } 159 | 160 | #[derive(Debug, Serialize, Deserialize)] 161 | pub struct GpuProcess { 162 | pub pid: u32, 163 | pub name: String, 164 | pub memory_usage: f64, 165 | pub user: Option, 166 | } 167 | 168 | #[derive(Debug, Serialize, Deserialize)] 169 | pub struct ThreatInfo { 170 | pub id: String, 171 | pub threat_type: String, 172 | pub severity: String, 173 | pub confidence: f64, 174 | pub description: String, 175 | pub process_info: Option, 176 | } 177 | 178 | #[derive(Debug, Serialize, Deserialize)] 179 | pub struct PolicyInfo { 180 | pub policy_type: String, 181 | pub name: String, 182 | pub enabled: bool, 183 | pub limits: HashMap, 184 | } 185 | -------------------------------------------------------------------------------- /scripts/setup-gpu-runner.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # GPU Runner Setup Script 3 | # This script helps set up a self-hosted GitHub Actions runner with GPU support 4 | 5 | set -e 6 | 7 | echo "🚀 GPU Kill - Self-Hosted Runner Setup" 8 | echo "======================================" 9 | 10 | # Check if running as root 11 | if [[ $EUID -eq 0 ]]; then 12 | echo "❌ This script should not be run as root" 13 | exit 1 14 | fi 15 | 16 | # Detect OS 17 | if [[ "$OSTYPE" == "linux-gnu"* ]]; then 18 | OS="linux" 19 | elif [[ "$OSTYPE" == "darwin"* ]]; then 20 | OS="macos" 21 | else 22 | echo "❌ Unsupported OS: $OSTYPE" 23 | exit 1 24 | fi 25 | 26 | echo "📋 Detected OS: $OS" 27 | 28 | # Function to install dependencies 29 | install_deps() { 30 | echo "📦 Installing system dependencies..." 
# Function to install dependencies
install_deps() {
    echo "📦 Installing system dependencies..."

    if [[ "$OS" == "linux" ]]; then
        sudo apt-get update
        sudo apt-get install -y build-essential libssl-dev pkg-config curl tar

        # Install GPU-specific tools
        echo "🔧 Installing GPU tools..."

        # NVIDIA
        if command -v nvidia-smi &> /dev/null; then
            echo "✅ NVIDIA GPU detected"
            # Fix: quote the package pattern so the shell cannot glob-expand it
            # against files in the current directory; apt receives it verbatim.
            sudo apt-get install -y 'nvidia-utils-*' || echo "⚠️ NVIDIA utils installation failed"
        else
            echo "ℹ️ No NVIDIA GPU detected"
        fi

        # AMD
        if command -v rocm-smi &> /dev/null; then
            echo "✅ AMD GPU with ROCm detected"
        else
            echo "ℹ️ Installing ROCm tools..."
            sudo apt-get install -y rocm-smi || echo "⚠️ ROCm installation failed"
        fi

        # Intel
        echo "ℹ️ Installing Intel GPU tools..."
        sudo apt-get install -y intel-gpu-tools || echo "⚠️ Intel GPU tools installation failed"

    elif [[ "$OS" == "macos" ]]; then
        # Check for Xcode command line tools
        if ! command -v xcode-select &> /dev/null; then
            echo "📱 Installing Xcode command line tools..."
            xcode-select --install || echo "⚠️ Xcode tools installation failed"
        else
            echo "✅ Xcode command line tools already installed"
        fi
    fi
}

# Function to install Rust
install_rust() {
    echo "🦀 Installing Rust..."

    if command -v rustc &> /dev/null; then
        echo "✅ Rust already installed: $(rustc --version)"
    else
        curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y
        # Bring cargo/rustc into this shell's PATH for the version check below.
        # shellcheck disable=SC1090
        source ~/.cargo/env
        echo "✅ Rust installed: $(rustc --version)"
    fi
}
# Function to setup GitHub Actions runner
setup_runner() {
    echo "🏃 Setting up GitHub Actions runner..."

    # Get repository URL and token from user.
    # -r keeps backslashes literal; -s stops the PAT (a credential) from being
    # echoed to the terminal or captured in scrollback.
    read -r -p "📝 Enter your GitHub repository URL (e.g., https://github.com/username/gpu-kill): " REPO_URL
    read -r -s -p "🔑 Enter your GitHub Personal Access Token (with repo and admin:org permissions): " GITHUB_TOKEN
    echo ""

    # Create runner directory
    RUNNER_DIR="$HOME/actions-runner"
    mkdir -p "$RUNNER_DIR"
    cd "$RUNNER_DIR"

    # Download runner (version pinned; OS was validated at script start)
    if [[ "$OS" == "linux" ]]; then
        RUNNER_FILE="actions-runner-linux-x64-2.311.0.tar.gz"
    elif [[ "$OS" == "macos" ]]; then
        RUNNER_FILE="actions-runner-osx-x64-2.311.0.tar.gz"
    fi

    echo "📥 Downloading GitHub Actions runner..."
    curl -o "$RUNNER_FILE" -L "https://github.com/actions/runner/releases/download/v2.311.0/$RUNNER_FILE"
    tar xzf "$RUNNER_FILE"

    # Configure runner
    echo "⚙️ Configuring runner..."
    ./config.sh --url "$REPO_URL" --token "$GITHUB_TOKEN" --labels "gpu,$OS" --name "gpu-runner-$(hostname)"

    echo "✅ Runner configured successfully!"
    echo ""
    echo "🎯 To start the runner:"
    echo "   cd $RUNNER_DIR"
    echo "   ./run.sh"
    echo ""
    echo "🎯 To run as a service:"
    echo "   sudo ./svc.sh install"
    echo "   sudo ./svc.sh start"
}

# Function to test GPU detection
test_gpu() {
    echo "🧪 Testing GPU detection..."

    # Clone and build GPU Kill
    if [[ ! -d "gpu-kill" ]]; then
        git clone https://github.com/treadiehq/gpu-kill.git
    fi

    cd gpu-kill
    cargo build --release

    echo "🔍 GPU Detection Results:"
    ./target/release/gpukill --list || echo "No GPUs detected"

    echo "🧪 Running GPU hardware tests..."
    cargo test --test gpu_hardware_tests || echo "GPU tests completed (some may have been skipped)"
}
# Main execution: print the menu, read one choice, and dispatch to the
# matching setup steps. Exits non-zero on an unrecognized choice.
main() {
    echo "🎯 What would you like to do?"
    echo "1) Install dependencies only"
    echo "2) Setup GitHub Actions runner"
    echo "3) Test GPU detection"
    echo "4) Full setup (dependencies + runner + test)"
    echo "5) Exit"

    read -p "Choose an option (1-5): " choice

    # Dispatch on the selected option (guard-style chain instead of case).
    if [[ "$choice" == "1" ]]; then
        install_deps
        install_rust
    elif [[ "$choice" == "2" ]]; then
        install_deps
        install_rust
        setup_runner
    elif [[ "$choice" == "3" ]]; then
        install_deps
        install_rust
        test_gpu
    elif [[ "$choice" == "4" ]]; then
        install_deps
        install_rust
        setup_runner
        test_gpu
    elif [[ "$choice" == "5" ]]; then
        echo "👋 Goodbye!"
        exit 0
    else
        echo "❌ Invalid option"
        exit 1
    fi

    echo "✅ Setup completed!"
}

# Run main function
main "$@"
**Create GPU Instance:** 29 | ```bash 30 | gcloud compute instances create gpu-test-runner \ 31 | --zone=us-central1-a \ 32 | --machine-type=n1-standard-4 \ 33 | --accelerator=type=nvidia-tesla-t4,count=1 \ 34 | --image-family=ubuntu-2004-lts \ 35 | --image-project=ubuntu-os-cloud \ 36 | --maintenance-policy=TERMINATE \ 37 | --restart-on-failure 38 | ``` 39 | 40 | 2. **Setup:** 41 | ```bash 42 | gcloud compute ssh gpu-test-runner --zone=us-central1-a 43 | curl -sSL https://raw.githubusercontent.com/treadiehq/gpu-kill/main/scripts/setup-gpu-runner.sh | bash 44 | ``` 45 | 46 | ### Option 3: Azure with GPU 47 | 48 | 1. **Create VM:** 49 | ```bash 50 | az vm create \ 51 | --resource-group myResourceGroup \ 52 | --name gpu-test-vm \ 53 | --image UbuntuLTS \ 54 | --size Standard_NC6s_v3 \ 55 | --admin-username azureuser \ 56 | --generate-ssh-keys 57 | ``` 58 | 59 | 2. **Setup:** 60 | ```bash 61 | ssh azureuser@your-vm-ip 62 | curl -sSL https://raw.githubusercontent.com/treadiehq/gpu-kill/main/scripts/setup-gpu-runner.sh | bash 63 | ``` 64 | 65 | ## Cost-Effective Options 66 | 67 | ### Spot Instances 68 | - **AWS Spot**: Up to 90% savings 69 | - **GCP Preemptible**: Up to 80% savings 70 | - **Azure Spot**: Up to 90% savings 71 | 72 | ### Example Spot Instance Setup (AWS): 73 | ```bash 74 | aws ec2 request-spot-instances \ 75 | --spot-price "0.50" \ 76 | --instance-count 1 \ 77 | --type "one-time" \ 78 | --launch-specification '{ 79 | "ImageId": "ami-0c02fb55956c7d316", 80 | "InstanceType": "g4dn.xlarge", 81 | "KeyName": "your-key", 82 | "SecurityGroupIds": ["sg-xxxxxxxxx"] 83 | }' 84 | ``` 85 | 86 | ## Docker-Based Testing 87 | 88 | ### NVIDIA Docker Setup 89 | ```bash 90 | # Install NVIDIA Docker 91 | distribution=$(. 
/etc/os-release;echo $ID$VERSION_ID) 92 | curl -s -L https://nvidia.github.io/nvidia-docker/gpgkey | sudo apt-key add - 93 | curl -s -L https://nvidia.github.io/nvidia-docker/$distribution/nvidia-docker.list | sudo tee /etc/apt/sources.list.d/nvidia-docker.list 94 | 95 | sudo apt-get update && sudo apt-get install -y nvidia-docker2 96 | sudo systemctl restart docker 97 | 98 | # Test GPU access 99 | docker run --rm --gpus all nvidia/cuda:11.0-base nvidia-smi 100 | ``` 101 | 102 | ### GPU Kill Docker Testing 103 | ```bash 104 | # Build GPU Kill with GPU support 105 | docker build -t gpukill:gpu . 106 | 107 | # Run tests with GPU access 108 | docker run --rm --gpus all gpukill:gpu cargo test --test gpu_hardware_tests 109 | ``` 110 | 111 | ## GitHub Actions Integration 112 | 113 | ### Enable GPU Tests 114 | Once you have a self-hosted runner set up: 115 | 116 | 1. **Remove the `if: false` condition** in `.github/workflows/ci.yml`: 117 | ```yaml 118 | gpu-hardware-tests: 119 | name: GPU Hardware Tests 120 | runs-on: [self-hosted, gpu] 121 | # if: false # Remove this line 122 | ``` 123 | 124 | 2. 
**Add runner labels** when setting up: 125 | ```bash 126 | ./config.sh --labels "gpu,nvidia,linux" --name "nvidia-gpu-runner" 127 | ``` 128 | 129 | ### Conditional GPU Testing 130 | The CI will automatically: 131 | - ✅ **Run GPU tests** when GPU hardware is available 132 | - ✅ **Skip gracefully** when no GPU hardware is found 133 | - ✅ **Work on any runner** (hosted or self-hosted) 134 | 135 | ## Cost Optimization 136 | 137 | ### Scheduled Testing 138 | Set up runners to only run during business hours: 139 | ```yaml 140 | on: 141 | schedule: 142 | - cron: '0 9 * * 1-5' # 9 AM, Monday-Friday 143 | ``` 144 | 145 | ### Auto-shutdown 146 | Add auto-shutdown to cloud instances: 147 | ```bash 148 | # AWS 149 | aws ec2 create-tags --resources i-1234567890abcdef0 --tags Key=shutdown,Value=yes 150 | 151 | # GCP 152 | gcloud compute instances add-metadata gpu-test-runner \ 153 | --metadata shutdown-script='sudo shutdown -h +60' 154 | ``` 155 | 156 | ## Monitoring and Alerts 157 | 158 | ### Set up monitoring for: 159 | - GPU utilization during tests 160 | - Test success/failure rates 161 | - Runner availability 162 | - Cost tracking 163 | 164 | ### Example monitoring script: 165 | ```bash 166 | #!/bin/bash 167 | # Monitor GPU test results 168 | curl -H "Authorization: token $GITHUB_TOKEN" \ 169 | "https://api.github.com/repos/treadiehq/gpu-kill/actions/runs" | \ 170 | jq '.workflow_runs[] | select(.name=="GPU Hardware Tests") | {status, conclusion, created_at}' 171 | ``` 172 | 173 | ## Troubleshooting 174 | 175 | ### Common Issues: 176 | 177 | 1. **GPU not detected:** 178 | ```bash 179 | # Check NVIDIA 180 | nvidia-smi 181 | 182 | # Check AMD 183 | rocm-smi --showid 184 | 185 | # Check Intel 186 | intel_gpu_top 187 | ``` 188 | 189 | 2. **Permission issues:** 190 | ```bash 191 | # Add user to docker group 192 | sudo usermod -aG docker $USER 193 | 194 | # Check GPU permissions 195 | ls -la /dev/nvidia* 196 | ``` 197 | 198 | 3. 
**Driver issues:** 199 | ```bash 200 | # Update NVIDIA drivers 201 | sudo apt-get install nvidia-driver-470 202 | 203 | # Update AMD drivers 204 | sudo apt-get install rocm-dkms 205 | ``` 206 | 207 | ## Next Steps 208 | 209 | 1. **Choose your cloud provider** (AWS, GCP, Azure) 210 | 2. **Set up a GPU instance** using the scripts above 211 | 3. **Configure the GitHub Actions runner** with GPU labels 212 | 4. **Enable GPU tests** in the CI workflow 213 | 5. **Monitor and optimize** costs and performance 214 | 215 | The GPU tests will now run automatically whenever GPU hardware is available! 🚀 216 | -------------------------------------------------------------------------------- /dashboard/tailwind.config.js: -------------------------------------------------------------------------------- 1 | /** @type {import('tailwindcss').Config} */ 2 | export default { 3 | content: [ 4 | "./components/**/*.{js,vue,ts}", 5 | "./layouts/**/*.vue", 6 | "./pages/**/*.vue", 7 | "./plugins/**/*.{js,ts}", 8 | "./app.vue", 9 | "./error.vue" 10 | ], 11 | theme: { 12 | extend: { 13 | colors: { 14 | // Custom GPU Kill brand colors 15 | primary: { 16 | 50: '#eff6ff', 17 | 100: '#dbeafe', 18 | 200: '#bfdbfe', 19 | 300: '#93c5fd', 20 | 400: '#60a5fa', 21 | 500: '#3b82f6', 22 | 600: '#2563eb', 23 | 700: '#1d4ed8', 24 | 800: '#1e40af', 25 | 900: '#1e3a8a', 26 | 950: '#172554', 27 | }, 28 | gpu: { 29 | 50: '#f0f9ff', 30 | 100: '#e0f2fe', 31 | 200: '#bae6fd', 32 | 300: '#7dd3fc', 33 | 400: '#38bdf8', 34 | 500: '#0ea5e9', 35 | 600: '#0284c7', 36 | 700: '#0369a1', 37 | 800: '#075985', 38 | 900: '#0c4a6e', 39 | 950: '#082f49', 40 | }, 41 | danger: { 42 | 50: '#fef2f2', 43 | 100: '#fee2e2', 44 | 200: '#fecaca', 45 | 300: '#fca5a5', 46 | 400: '#f87171', 47 | 500: '#ef4444', 48 | 600: '#dc2626', 49 | 700: '#b91c1c', 50 | 800: '#991b1b', 51 | 900: '#7f1d1d', 52 | 950: '#450a0a', 53 | }, 54 | warning: { 55 | 50: '#fffbeb', 56 | 100: '#fef3c7', 57 | 200: '#fde68a', 58 | 300: '#fcd34d', 59 | 400: '#fbbf24', 
/** @type {import('tailwindcss').Config} */
export default {
  // Files Tailwind scans for class names (Nuxt layout).
  content: [
    "./components/**/*.{js,vue,ts}",
    "./layouts/**/*.vue",
    "./pages/**/*.vue",
    "./plugins/**/*.{js,ts}",
    "./app.vue",
    "./error.vue"
  ],
  theme: {
    extend: {
      colors: {
        // Custom GPU Kill brand colors
        primary: {
          50: '#eff6ff',
          100: '#dbeafe',
          200: '#bfdbfe',
          300: '#93c5fd',
          400: '#60a5fa',
          500: '#3b82f6',
          600: '#2563eb',
          700: '#1d4ed8',
          800: '#1e40af',
          900: '#1e3a8a',
          950: '#172554',
        },
        gpu: {
          50: '#f0f9ff',
          100: '#e0f2fe',
          200: '#bae6fd',
          300: '#7dd3fc',
          400: '#38bdf8',
          500: '#0ea5e9',
          600: '#0284c7',
          700: '#0369a1',
          800: '#075985',
          900: '#0c4a6e',
          950: '#082f49',
        },
        danger: {
          50: '#fef2f2',
          100: '#fee2e2',
          200: '#fecaca',
          300: '#fca5a5',
          400: '#f87171',
          500: '#ef4444',
          600: '#dc2626',
          700: '#b91c1c',
          800: '#991b1b',
          900: '#7f1d1d',
          950: '#450a0a',
        },
        warning: {
          50: '#fffbeb',
          100: '#fef3c7',
          200: '#fde68a',
          300: '#fcd34d',
          400: '#fbbf24',
          500: '#f59e0b',
          600: '#d97706',
          700: '#b45309',
          800: '#92400e',
          900: '#78350f',
          950: '#451a03',
        },
        success: {
          50: '#f0fdf4',
          100: '#dcfce7',
          200: '#bbf7d0',
          300: '#86efac',
          400: '#4ade80',
          500: '#22c55e',
          600: '#16a34a',
          700: '#15803d',
          800: '#166534',
          900: '#14532d',
          950: '#052e16',
        },
        // Dark theme colors
        dark: {
          50: '#f8fafc',
          100: '#f1f5f9',
          200: '#e2e8f0',
          300: '#cbd5e1',
          400: '#94a3b8',
          500: '#64748b',
          600: '#475569',
          700: '#334155',
          800: '#1e293b',
          900: '#0f172a',
          950: '#020617',
        }
      },
      fontFamily: {
        sans: ['Inter', 'system-ui', 'sans-serif'],
        mono: ['JetBrains Mono', 'Fira Code', 'monospace'],
      },
      // Named animations; each custom name below references a keyframes
      // entry defined in the `keyframes` block that follows.
      animation: {
        'pulse-slow': 'pulse 3s cubic-bezier(0.4, 0, 0.6, 1) infinite',
        'bounce-slow': 'bounce 2s infinite',
        'spin-slow': 'spin 3s linear infinite',
        'ping-slow': 'ping 2s cubic-bezier(0, 0, 0.2, 1) infinite',
        'fade-in': 'fadeIn 0.5s ease-in-out',
        'slide-up': 'slideUp 0.3s ease-out',
        'slide-down': 'slideDown 0.3s ease-out',
        'scale-in': 'scaleIn 0.2s ease-out',
        'glow': 'glow 2s ease-in-out infinite alternate',
      },
      keyframes: {
        fadeIn: {
          '0%': { opacity: '0' },
          '100%': { opacity: '1' },
        },
        slideUp: {
          '0%': { transform: 'translateY(10px)', opacity: '0' },
          '100%': { transform: 'translateY(0)', opacity: '1' },
        },
        slideDown: {
          '0%': { transform: 'translateY(-10px)', opacity: '0' },
          '100%': { transform: 'translateY(0)', opacity: '1' },
        },
        scaleIn: {
          '0%': { transform: 'scale(0.95)', opacity: '0' },
          '100%': { transform: 'scale(1)', opacity: '1' },
        },
        glow: {
          '0%': { boxShadow: '0 0 5px rgba(59, 130, 246, 0.5)' },
          '100%': { boxShadow: '0 0 20px rgba(59, 130, 246, 0.8)' },
        },
      },
      backdropBlur: {
        xs: '2px',
      },
      // Glow shadows reuse the primary/danger/success/warning 500 colors.
      boxShadow: {
        'glow': '0 0 20px rgba(59, 130, 246, 0.3)',
        'glow-lg': '0 0 30px rgba(59, 130, 246, 0.4)',
        'glow-danger': '0 0 20px rgba(239, 68, 68, 0.3)',
        'glow-success': '0 0 20px rgba(34, 197, 94, 0.3)',
        'glow-warning': '0 0 20px rgba(245, 158, 11, 0.3)',
        'inner-lg': 'inset 0 2px 4px 0 rgba(0, 0, 0, 0.1)',
      },
      borderRadius: {
        '4xl': '2rem',
        '5xl': '2.5rem',
      },
      spacing: {
        '18': '4.5rem',
        '88': '22rem',
        '128': '32rem',
      },
      zIndex: {
        '60': '60',
        '70': '70',
        '80': '80',
        '90': '90',
        '100': '100',
      },
      screens: {
        'xs': '475px',
        '3xl': '1600px',
      },
      // @tailwindcss/typography overrides, tuned for the dark theme.
      typography: {
        DEFAULT: {
          css: {
            maxWidth: 'none',
            color: '#e5e7eb',
            a: {
              color: '#60a5fa',
              '&:hover': {
                color: '#93c5fd',
              },
            },
            h1: {
              color: '#ffffff',
            },
            h2: {
              color: '#ffffff',
            },
            h3: {
              color: '#ffffff',
            },
            h4: {
              color: '#ffffff',
            },
            strong: {
              color: '#ffffff',
            },
            code: {
              color: '#fbbf24',
              backgroundColor: '#1f2937',
              padding: '0.25rem 0.375rem',
              borderRadius: '0.25rem',
            },
            // Suppress the default backtick markers around inline code.
            'code::before': {
              content: '""',
            },
            'code::after': {
              content: '""',
            },
          },
        },
      },
    },
  },
  plugins: [
    require('@tailwindcss/typography'),
    require('@tailwindcss/forms'),
    require('@tailwindcss/aspect-ratio'),
  ],
  // Dark mode is toggled via a `class` on the root element, not media query.
  darkMode: 'class',
}
MCP Server implementation for GPU Kill 2 | 3 | use crate::resources::ResourceHandler; 4 | use crate::tools::ToolHandler; 5 | use crate::types::*; 6 | use crate::MCP_VERSION; 7 | use anyhow::Result; 8 | use serde_json::json; 9 | use std::sync::Arc; 10 | use tokio::sync::RwLock; 11 | use tracing::{debug, error, info}; 12 | 13 | /// GPU Kill MCP Server 14 | pub struct GpuKillMCPServer { 15 | resource_handler: Arc, 16 | tool_handler: Arc>, 17 | } 18 | 19 | impl GpuKillMCPServer { 20 | /// Create a new MCP server instance 21 | pub async fn new() -> Result { 22 | let resource_handler = Arc::new(ResourceHandler::new().await?); 23 | let tool_handler = Arc::new(RwLock::new(ToolHandler::new().await?)); 24 | 25 | Ok(Self { 26 | resource_handler, 27 | tool_handler, 28 | }) 29 | } 30 | 31 | /// Handle an MCP request 32 | pub async fn handle_request(&self, request: JsonRpcRequest) -> Result { 33 | debug!("Handling MCP request: {}", request.method); 34 | 35 | let result = match request.method.as_str() { 36 | "initialize" => self.handle_initialize(request.params).await, 37 | "resources/list" => self.handle_resources_list().await, 38 | "resources/read" => self.handle_resources_read(request.params).await, 39 | "tools/list" => self.handle_tools_list().await, 40 | "tools/call" => self.handle_tools_call(request.params).await, 41 | _ => Err(anyhow::anyhow!("Unknown method: {}", request.method)), 42 | }; 43 | 44 | match result { 45 | Ok(data) => Ok(JsonRpcResponse { 46 | jsonrpc: "2.0".to_string(), 47 | id: request.id, 48 | result: Some(data), 49 | error: None, 50 | }), 51 | Err(e) => { 52 | error!("Error handling request {}: {}", request.method, e); 53 | Ok(JsonRpcResponse { 54 | jsonrpc: "2.0".to_string(), 55 | id: request.id, 56 | result: None, 57 | error: Some(JsonRpcError { 58 | code: -32603, 59 | message: "Internal error".to_string(), 60 | data: Some(json!({ "details": e.to_string() })), 61 | }), 62 | }) 63 | } 64 | } 65 | } 66 | 67 | async fn handle_initialize( 68 | &self, 69 | 
_params: Option, 70 | ) -> Result { 71 | info!("MCP client initializing"); 72 | 73 | let response = InitializeResponse { 74 | protocol_version: MCP_VERSION.to_string(), 75 | capabilities: ServerCapabilities { 76 | resources: Some(ResourcesCapability { 77 | subscribe: Some(false), 78 | list_changed: Some(false), 79 | }), 80 | tools: Some(ToolsCapability { 81 | list_changed: Some(false), 82 | }), 83 | logging: Some(LoggingCapability {}), 84 | }, 85 | server_info: ServerInfo { 86 | name: "GPU Kill MCP Server".to_string(), 87 | version: env!("CARGO_PKG_VERSION").to_string(), 88 | }, 89 | }; 90 | 91 | Ok(serde_json::to_value(response)?) 92 | } 93 | 94 | async fn handle_resources_list(&self) -> Result { 95 | let resources = self.resource_handler.list_resources(); 96 | Ok(json!({ "resources": resources })) 97 | } 98 | 99 | async fn handle_resources_read( 100 | &self, 101 | params: Option, 102 | ) -> Result { 103 | let params = params.ok_or_else(|| anyhow::anyhow!("Missing parameters"))?; 104 | let uri = params 105 | .get("uri") 106 | .and_then(|v| v.as_str()) 107 | .ok_or_else(|| anyhow::anyhow!("Missing uri parameter"))?; 108 | 109 | let contents = self.resource_handler.get_resource(uri).await?; 110 | Ok(json!({ "contents": contents })) 111 | } 112 | 113 | async fn handle_tools_list(&self) -> Result { 114 | let tool_handler = self.tool_handler.read().await; 115 | let tools = tool_handler.list_tools(); 116 | Ok(json!({ "tools": tools })) 117 | } 118 | 119 | async fn handle_tools_call( 120 | &self, 121 | params: Option, 122 | ) -> Result { 123 | let params = params.ok_or_else(|| anyhow::anyhow!("Missing parameters"))?; 124 | let name = params 125 | .get("name") 126 | .and_then(|v| v.as_str()) 127 | .ok_or_else(|| anyhow::anyhow!("Missing name parameter"))?; 128 | 129 | let arguments = params 130 | .get("arguments") 131 | .and_then(|v| v.as_object()) 132 | .map(|obj| obj.iter().map(|(k, v)| (k.clone(), v.clone())).collect()); 133 | 134 | let mut tool_handler = 
self.tool_handler.write().await; 135 | let result = tool_handler.execute_tool(name, arguments).await?; 136 | 137 | Ok(json!({ "content": result.content, "isError": result.is_error })) 138 | } 139 | 140 | /// Start the MCP server 141 | pub async fn start(self, port: u16) -> Result<()> { 142 | info!("Starting GPU Kill MCP Server on port {}", port); 143 | 144 | let server = Arc::new(self); 145 | 146 | // For now, we'll implement a simple HTTP-based MCP server 147 | // In a full implementation, this would use stdio or WebSocket transport 148 | let app = axum::Router::new() 149 | .route( 150 | "/mcp", 151 | axum::routing::post({ 152 | let server = server.clone(); 153 | move |request: axum::extract::Json| { 154 | let server = server.clone(); 155 | async move { 156 | match server.handle_request(request.0).await { 157 | Ok(response) => axum::response::Json(response), 158 | Err(e) => { 159 | error!("Failed to handle HTTP request: {}", e); 160 | axum::response::Json(JsonRpcResponse { 161 | jsonrpc: "2.0".to_string(), 162 | id: "error".to_string(), 163 | result: None, 164 | error: Some(JsonRpcError { 165 | code: -32603, 166 | message: "Internal error".to_string(), 167 | data: Some(json!({ "details": e.to_string() })), 168 | }), 169 | }) 170 | } 171 | } 172 | } 173 | } 174 | }), 175 | ) 176 | .route("/health", axum::routing::get(|| async { "OK" })); 177 | 178 | let listener = tokio::net::TcpListener::bind(format!("0.0.0.0:{}", port)).await?; 179 | info!("MCP Server listening on http://0.0.0.0:{}", port); 180 | 181 | axum::serve(listener, app).await?; 182 | Ok(()) 183 | } 184 | } 185 | 186 | // Remove Default implementation since new() is now async 187 | -------------------------------------------------------------------------------- /src/config.rs: -------------------------------------------------------------------------------- 1 | use anyhow::{Context, Result}; 2 | use serde::{Deserialize, Serialize}; 3 | use std::fs; 4 | use std::path::Path; 5 | 6 | /// Configuration 
structure for gpukill 7 | #[derive(Debug, Clone, Serialize, Deserialize)] 8 | pub struct Config { 9 | /// Default log level 10 | pub log_level: String, 11 | 12 | /// Default output format 13 | pub output_format: String, 14 | 15 | /// Default timeout for process termination 16 | pub default_timeout_secs: u16, 17 | 18 | /// Whether to show detailed process information by default 19 | pub show_details: bool, 20 | 21 | /// Watch mode refresh interval in seconds 22 | pub watch_interval_secs: u64, 23 | 24 | /// Maximum number of processes to show in summary 25 | pub max_processes_summary: usize, 26 | 27 | /// Table width limit 28 | pub table_width: usize, 29 | 30 | /// Whether to use colors in output 31 | pub use_colors: bool, 32 | } 33 | 34 | impl Default for Config { 35 | fn default() -> Self { 36 | Self { 37 | log_level: "info".to_string(), 38 | output_format: "table".to_string(), 39 | default_timeout_secs: 5, 40 | show_details: false, 41 | watch_interval_secs: 2, 42 | max_processes_summary: 10, 43 | table_width: 120, 44 | use_colors: true, 45 | } 46 | } 47 | } 48 | 49 | /// Configuration manager 50 | pub struct ConfigManager { 51 | config: Config, 52 | } 53 | 54 | impl Default for ConfigManager { 55 | fn default() -> Self { 56 | Self::new() 57 | } 58 | } 59 | 60 | #[allow(dead_code)] 61 | impl ConfigManager { 62 | /// Create a new configuration manager 63 | pub fn new() -> Self { 64 | Self { 65 | config: Config::default(), 66 | } 67 | } 68 | 69 | /// Load configuration from file 70 | pub fn load_from_file>(path: P) -> Result { 71 | let config_path = path.as_ref(); 72 | 73 | if !config_path.exists() { 74 | tracing::debug!("Config file not found at {:?}, using defaults", config_path); 75 | return Ok(Self::new()); 76 | } 77 | 78 | let content = fs::read_to_string(config_path) 79 | .with_context(|| format!("Failed to read config file: {:?}", config_path))?; 80 | 81 | let config: Config = toml::from_str(&content) 82 | .with_context(|| format!("Failed to parse config file: 
{:?}", config_path))?; 83 | 84 | tracing::info!("Loaded configuration from {:?}", config_path); 85 | Ok(Self { config }) 86 | } 87 | 88 | /// Load configuration from environment variables 89 | pub fn load_from_env() -> Self { 90 | let mut config = Config::default(); 91 | 92 | // Override with environment variables if present 93 | if let Ok(log_level) = std::env::var("GPUKILL_LOG_LEVEL") { 94 | config.log_level = log_level; 95 | } 96 | 97 | if let Ok(output_format) = std::env::var("GPUKILL_OUTPUT_FORMAT") { 98 | config.output_format = output_format; 99 | } 100 | 101 | if let Ok(timeout) = std::env::var("GPUKILL_DEFAULT_TIMEOUT") { 102 | if let Ok(timeout_secs) = timeout.parse::() { 103 | config.default_timeout_secs = timeout_secs; 104 | } 105 | } 106 | 107 | if let Ok(show_details) = std::env::var("GPUKILL_SHOW_DETAILS") { 108 | config.show_details = show_details.parse().unwrap_or(false); 109 | } 110 | 111 | if let Ok(watch_interval) = std::env::var("GPUKILL_WATCH_INTERVAL") { 112 | if let Ok(interval_secs) = watch_interval.parse::() { 113 | config.watch_interval_secs = interval_secs; 114 | } 115 | } 116 | 117 | if let Ok(table_width) = std::env::var("GPUKILL_TABLE_WIDTH") { 118 | if let Ok(width) = table_width.parse::() { 119 | config.table_width = width; 120 | } 121 | } 122 | 123 | if let Ok(use_colors) = std::env::var("GPUKILL_USE_COLORS") { 124 | config.use_colors = use_colors.parse().unwrap_or(true); 125 | } 126 | 127 | Self { config } 128 | } 129 | 130 | /// Get the current configuration 131 | pub fn config(&self) -> &Config { 132 | &self.config 133 | } 134 | 135 | /// Get a mutable reference to the configuration 136 | pub fn config_mut(&mut self) -> &mut Config { 137 | &mut self.config 138 | } 139 | 140 | /// Save configuration to file 141 | pub fn save_to_file>(&self, path: P) -> Result<()> { 142 | let config_path = path.as_ref(); 143 | let content = 144 | toml::to_string_pretty(&self.config).context("Failed to serialize configuration")?; 145 | 146 | 
fs::write(config_path, content) 147 | .with_context(|| format!("Failed to write config file: {:?}", config_path))?; 148 | 149 | tracing::info!("Saved configuration to {:?}", config_path); 150 | Ok(()) 151 | } 152 | 153 | /// Get default configuration file path 154 | pub fn default_config_path() -> Result { 155 | let home_dir = dirs::home_dir() 156 | .ok_or_else(|| anyhow::anyhow!("Could not determine home directory"))?; 157 | 158 | Ok(home_dir.join(".config").join("gpukill").join("config.toml")) 159 | } 160 | 161 | /// Load configuration from default location 162 | pub fn load_default() -> Result { 163 | let config_path = Self::default_config_path()?; 164 | Self::load_from_file(config_path) 165 | } 166 | 167 | /// Create default configuration file 168 | pub fn create_default_config() -> Result<()> { 169 | let config_path = Self::default_config_path()?; 170 | 171 | // Create directory if it doesn't exist 172 | if let Some(parent) = config_path.parent() { 173 | fs::create_dir_all(parent) 174 | .with_context(|| format!("Failed to create config directory: {:?}", parent))?; 175 | } 176 | 177 | let config_manager = Self::new(); 178 | config_manager.save_to_file(config_path)?; 179 | 180 | Ok(()) 181 | } 182 | } 183 | 184 | /// Get configuration with fallback chain 185 | pub fn get_config(config_path: Option) -> Result { 186 | // 1. Try to load from specified path 187 | if let Some(path) = config_path { 188 | return ConfigManager::load_from_file(path); 189 | } 190 | 191 | // 2. Try to load from default location 192 | if let Ok(config_manager) = ConfigManager::load_default() { 193 | return Ok(config_manager); 194 | } 195 | 196 | // 3. 
Load from environment variables 197 | Ok(ConfigManager::load_from_env()) 198 | } 199 | 200 | #[cfg(test)] 201 | mod tests { 202 | use super::*; 203 | use tempfile::NamedTempFile; 204 | 205 | #[test] 206 | fn test_default_config() { 207 | let config = Config::default(); 208 | assert_eq!(config.log_level, "info"); 209 | assert_eq!(config.output_format, "table"); 210 | assert_eq!(config.default_timeout_secs, 5); 211 | assert!(!config.show_details); 212 | assert_eq!(config.watch_interval_secs, 2); 213 | } 214 | 215 | #[test] 216 | fn test_config_serialization() { 217 | let config = Config::default(); 218 | let toml_str = toml::to_string(&config).unwrap(); 219 | let deserialized: Config = toml::from_str(&toml_str).unwrap(); 220 | 221 | assert_eq!(config.log_level, deserialized.log_level); 222 | assert_eq!(config.output_format, deserialized.output_format); 223 | } 224 | 225 | #[test] 226 | fn test_config_file_loading() { 227 | let config = Config::default(); 228 | let toml_str = toml::to_string_pretty(&config).unwrap(); 229 | 230 | let temp_file = NamedTempFile::new().unwrap(); 231 | std::fs::write(temp_file.path(), toml_str).unwrap(); 232 | 233 | let loaded_config = ConfigManager::load_from_file(temp_file.path()).unwrap(); 234 | assert_eq!(loaded_config.config().log_level, config.log_level); 235 | } 236 | 237 | #[test] 238 | fn test_config_manager_creation() { 239 | let manager = ConfigManager::new(); 240 | assert_eq!(manager.config().log_level, "info"); 241 | } 242 | } 243 | -------------------------------------------------------------------------------- /src/remote.rs: -------------------------------------------------------------------------------- 1 | use anyhow::{Context, Result}; 2 | use std::{ 3 | process::{Command, Stdio}, 4 | time::Duration, 5 | }; 6 | use tracing::{debug, info, warn}; 7 | 8 | /// SSH connection configuration 9 | #[derive(Debug, Clone)] 10 | pub struct SshConfig { 11 | pub host: String, 12 | pub port: u16, 13 | pub username: String, 14 | pub 
key_path: Option, 15 | pub password: Option, 16 | pub timeout: Duration, 17 | } 18 | 19 | impl SshConfig { 20 | /// Create a new SSH configuration 21 | pub fn new(host: String, port: u16, username: String) -> Self { 22 | Self { 23 | host, 24 | port, 25 | username, 26 | key_path: None, 27 | password: None, 28 | timeout: Duration::from_secs(30), 29 | } 30 | } 31 | 32 | /// Set SSH key path 33 | pub fn with_key_path(mut self, key_path: String) -> Self { 34 | self.key_path = Some(key_path); 35 | self 36 | } 37 | 38 | /// Set SSH password 39 | pub fn with_password(mut self, password: String) -> Self { 40 | self.password = Some(password); 41 | self 42 | } 43 | 44 | /// Set connection timeout 45 | pub fn with_timeout(mut self, timeout: Duration) -> Self { 46 | self.timeout = timeout; 47 | self 48 | } 49 | } 50 | 51 | /// SSH remote connection manager using system SSH 52 | pub struct SshRemote { 53 | config: SshConfig, 54 | } 55 | 56 | impl SshRemote { 57 | /// Create a new SSH remote connection 58 | pub fn new(config: SshConfig) -> Self { 59 | Self { config } 60 | } 61 | 62 | /// Execute a command on the remote host 63 | pub fn execute_command(&self, command: &str) -> Result { 64 | debug!("Executing remote command: {}", command); 65 | 66 | let mut ssh_cmd = Command::new("ssh"); 67 | 68 | // Add SSH options 69 | ssh_cmd 70 | .arg("-o") 71 | .arg("ConnectTimeout=30") 72 | .arg("-o") 73 | .arg("StrictHostKeyChecking=no") 74 | .arg("-o") 75 | .arg("UserKnownHostsFile=/dev/null") 76 | .arg("-o") 77 | .arg("LogLevel=ERROR"); 78 | 79 | // Add port if not default 80 | if self.config.port != 22 { 81 | ssh_cmd.arg("-p").arg(self.config.port.to_string()); 82 | } 83 | 84 | // Add key file if specified 85 | if let Some(key_path) = &self.config.key_path { 86 | ssh_cmd.arg("-i").arg(key_path); 87 | } 88 | 89 | // Add password authentication if specified 90 | if self.config.password.is_some() { 91 | ssh_cmd.arg("-o").arg("PasswordAuthentication=yes"); 92 | } 93 | 94 | // Add host and 
command 95 | let host_spec = format!("{}@{}", self.config.username, self.config.host); 96 | ssh_cmd.arg(host_spec).arg(command); 97 | 98 | // Set up input for password if needed 99 | if let Some(_password) = &self.config.password { 100 | ssh_cmd.stdin(Stdio::piped()); 101 | } 102 | 103 | debug!("Running SSH command: {:?}", ssh_cmd); 104 | 105 | let mut child = ssh_cmd 106 | .stdout(Stdio::piped()) 107 | .stderr(Stdio::piped()) 108 | .spawn() 109 | .context("Failed to spawn SSH command")?; 110 | 111 | // Send password if provided 112 | if let Some(password) = &self.config.password { 113 | if let Some(stdin) = child.stdin.as_mut() { 114 | use std::io::Write; 115 | stdin 116 | .write_all(password.as_bytes()) 117 | .context("Failed to write password to SSH stdin")?; 118 | stdin 119 | .write_all(b"\n") 120 | .context("Failed to write newline to SSH stdin")?; 121 | } 122 | } 123 | 124 | let output = child 125 | .wait_with_output() 126 | .context("Failed to wait for SSH command")?; 127 | 128 | if !output.status.success() { 129 | let stderr = String::from_utf8_lossy(&output.stderr); 130 | return Err(anyhow::anyhow!( 131 | "SSH command failed with exit code {}: {}", 132 | output.status.code().unwrap_or(-1), 133 | stderr 134 | )); 135 | } 136 | 137 | let stdout = String::from_utf8(output.stdout) 138 | .context("Failed to decode SSH command output as UTF-8")?; 139 | 140 | debug!( 141 | "Command executed successfully, output length: {} bytes", 142 | stdout.len() 143 | ); 144 | Ok(stdout) 145 | } 146 | 147 | /// Execute gpukill command on remote host 148 | pub fn execute_gpukill(&self, args: &[String]) -> Result { 149 | let command = format!("gpukill {}", args.join(" ")); 150 | self.execute_command(&command) 151 | } 152 | 153 | /// Check if gpukill is available on remote host 154 | pub fn check_gpukill_availability(&self) -> Result { 155 | match self.execute_command("which gpukill") { 156 | Ok(output) => { 157 | let available = !output.trim().is_empty(); 158 | if available { 
159 | info!("gpukill is available on remote host"); 160 | } else { 161 | warn!("gpukill not found on remote host"); 162 | } 163 | Ok(available) 164 | } 165 | Err(_) => { 166 | warn!("Failed to check gpukill availability on remote host"); 167 | Ok(false) 168 | } 169 | } 170 | } 171 | 172 | /// Get remote host information 173 | pub fn get_host_info(&self) -> Result { 174 | let hostname = self.execute_command("hostname")?.trim().to_string(); 175 | let os_info = self.execute_command("uname -a")?.trim().to_string(); 176 | let gpu_info = self.execute_command("nvidia-smi --query-gpu=name --format=csv,noheader,nounits 2>/dev/null || echo 'No NVIDIA GPUs'")?.trim().to_string(); 177 | 178 | Ok(RemoteHostInfo { 179 | hostname, 180 | os_info, 181 | gpu_info, 182 | }) 183 | } 184 | } 185 | 186 | /// Information about the remote host 187 | #[derive(Debug, Clone)] 188 | pub struct RemoteHostInfo { 189 | pub hostname: String, 190 | pub os_info: String, 191 | #[allow(dead_code)] 192 | pub gpu_info: String, 193 | } 194 | 195 | /// Execute a local gpukill command with remote forwarding 196 | pub fn execute_remote_operation(config: SshConfig, local_args: &[String]) -> Result<()> { 197 | let remote = SshRemote::new(config); 198 | 199 | // Check if gpukill is available on remote host 200 | if !remote.check_gpukill_availability()? { 201 | return Err(anyhow::anyhow!( 202 | "gpukill is not available on the remote host. Please install gpukill on the remote host first." 
203 | )); 204 | } 205 | 206 | // Get remote host info 207 | let host_info = remote.get_host_info()?; 208 | info!( 209 | "Remote host: {} ({})", 210 | host_info.hostname, host_info.os_info 211 | ); 212 | 213 | // Execute the command on remote host 214 | let output = remote.execute_gpukill(local_args)?; 215 | 216 | // Print the output 217 | print!("{}", output); 218 | 219 | Ok(()) 220 | } 221 | 222 | #[cfg(test)] 223 | mod tests { 224 | use super::*; 225 | use std::time::Duration; 226 | 227 | #[test] 228 | fn test_ssh_config_creation() { 229 | let config = SshConfig::new("localhost".to_string(), 22, "testuser".to_string()); 230 | assert_eq!(config.host, "localhost"); 231 | assert_eq!(config.port, 22); 232 | assert_eq!(config.username, "testuser"); 233 | assert_eq!(config.timeout, Duration::from_secs(30)); 234 | } 235 | 236 | #[test] 237 | fn test_ssh_config_with_options() { 238 | let config = SshConfig::new("localhost".to_string(), 22, "testuser".to_string()) 239 | .with_key_path("/path/to/key".to_string()) 240 | .with_password("password".to_string()) 241 | .with_timeout(Duration::from_secs(60)); 242 | 243 | assert_eq!(config.key_path, Some("/path/to/key".to_string())); 244 | assert_eq!(config.password, Some("password".to_string())); 245 | assert_eq!(config.timeout, Duration::from_secs(60)); 246 | } 247 | } 248 | -------------------------------------------------------------------------------- /.github/workflows/self-hosted-setup.md: -------------------------------------------------------------------------------- 1 | # Self-Hosted GPU Runner Setup Guide 2 | 3 | This guide explains how to set up self-hosted GitHub Actions runners with GPU hardware for testing of GPU Kill. 4 | 5 | ## Overview 6 | 7 | GPU Kill requires actual GPU hardware to test all functionality. 
This setup provides: 8 | - **NVIDIA GPU testing** with CUDA/NVML 9 | - **AMD GPU testing** with ROCm 10 | - **Intel GPU testing** with intel-gpu-tools 11 | - **Apple Silicon testing** on macOS 12 | - **Cross-platform compatibility** testing 13 | 14 | ## Hardware Requirements 15 | 16 | ### NVIDIA Runner 17 | - **GPU**: Any NVIDIA GPU with CUDA support 18 | - **OS**: Ubuntu 22.04 LTS 19 | - **RAM**: 16GB+ recommended 20 | - **Storage**: 100GB+ SSD 21 | - **CPU**: 4+ cores 22 | 23 | ### AMD Runner 24 | - **GPU**: AMD GPU with ROCm support (RX 5000/6000 series, MI series) 25 | - **OS**: Ubuntu 22.04 LTS 26 | - **RAM**: 16GB+ recommended 27 | - **Storage**: 100GB+ SSD 28 | - **CPU**: 4+ cores 29 | 30 | ### Intel Runner 31 | - **GPU**: Intel Arc, Iris Xe, or integrated GPU 32 | - **OS**: Ubuntu 22.04 LTS 33 | - **RAM**: 8GB+ recommended 34 | - **Storage**: 50GB+ SSD 35 | - **CPU**: 4+ cores 36 | 37 | ### Apple Silicon Runner 38 | - **Hardware**: Mac Studio, MacBook Pro, or Mac mini with M1/M2/M3/M4 39 | - **OS**: macOS 13+ (Ventura) 40 | - **RAM**: 16GB+ recommended 41 | - **Storage**: 100GB+ SSD 42 | 43 | ## Setup Instructions 44 | 45 | ### 1. 
NVIDIA Runner Setup 46 | 47 | ```bash 48 | # Install Ubuntu 22.04 LTS 49 | # Update system 50 | sudo apt update && sudo apt upgrade -y 51 | 52 | # Install NVIDIA drivers 53 | sudo apt install -y nvidia-driver-535 54 | sudo reboot 55 | 56 | # Verify NVIDIA installation 57 | nvidia-smi 58 | 59 | # Install development tools 60 | sudo apt install -y build-essential curl git libssl-dev pkg-config 61 | 62 | # Install Rust 63 | curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh 64 | source ~/.cargo/env 65 | 66 | # Install GitHub Actions runner 67 | mkdir actions-runner && cd actions-runner 68 | curl -o actions-runner-linux-x64-2.311.0.tar.gz -L https://github.com/actions/runner/releases/download/v2.311.0/actions-runner-linux-x64-2.311.0.tar.gz 69 | tar xzf ./actions-runner-linux-x64-2.311.0.tar.gz 70 | 71 | # Configure runner (get token from GitHub repo settings) 72 | ./config.sh --url https://github.com/treadiehq/gpu-kill --token 73 | ./config.sh --name "nvidia-gpu-runner" --labels "self-hosted,gpu,nvidia,ubuntu-22.04" 74 | 75 | # Install as service 76 | sudo ./svc.sh install 77 | sudo ./svc.sh start 78 | ``` 79 | 80 | ### 2. 
AMD Runner Setup 81 | 82 | ```bash 83 | # Install Ubuntu 22.04 LTS 84 | # Update system 85 | sudo apt update && sudo apt upgrade -y 86 | 87 | # Install ROCm 88 | wget https://repo.radeon.com/amdgpu-install/5.7/ubuntu/jammy/amdgpu-install_5.7.50700-1_all.deb 89 | sudo dpkg -i amdgpu-install_5.7.50700-1_all.deb 90 | sudo apt-get update 91 | sudo amdgpu-install --usecase=rocm 92 | 93 | # Verify ROCm installation 94 | rocm-smi 95 | rocminfo 96 | 97 | # Install development tools 98 | sudo apt install -y build-essential curl git libssl-dev pkg-config 99 | 100 | # Install Rust 101 | curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh 102 | source ~/.cargo/env 103 | 104 | # Install GitHub Actions runner (same as NVIDIA) 105 | mkdir actions-runner && cd actions-runner 106 | curl -o actions-runner-linux-x64-2.311.0.tar.gz -L https://github.com/actions/runner/releases/download/v2.311.0/actions-runner-linux-x64-2.311.0.tar.gz 107 | tar xzf ./actions-runner-linux-x64-2.311.0.tar.gz 108 | 109 | # Configure runner 110 | ./config.sh --url https://github.com/treadiehq/gpu-kill --token 111 | ./config.sh --name "amd-gpu-runner" --labels "self-hosted,gpu,amd,ubuntu-22.04" 112 | 113 | # Install as service 114 | sudo ./svc.sh install 115 | sudo ./svc.sh start 116 | ``` 117 | 118 | ### 3. 
Intel Runner Setup 119 | 120 | ```bash 121 | # Install Ubuntu 22.04 LTS 122 | # Update system 123 | sudo apt update && sudo apt upgrade -y 124 | 125 | # Install Intel GPU tools 126 | sudo apt install -y intel-gpu-tools 127 | 128 | # Verify Intel GPU tools 129 | intel_gpu_top --help 130 | 131 | # Install development tools 132 | sudo apt install -y build-essential curl git libssl-dev pkg-config 133 | 134 | # Install Rust 135 | curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh 136 | source ~/.cargo/env 137 | 138 | # Install GitHub Actions runner 139 | mkdir actions-runner && cd actions-runner 140 | curl -o actions-runner-linux-x64-2.311.0.tar.gz -L https://github.com/actions/runner/releases/download/v2.311.0/actions-runner-linux-x64-2.311.0.tar.gz 141 | tar xzf ./actions-runner-linux-x64-2.311.0.tar.gz 142 | 143 | # Configure runner 144 | ./config.sh --url https://github.com/treadiehq/gpu-kill --token 145 | ./config.sh --name "intel-gpu-runner" --labels "self-hosted,gpu,intel,ubuntu-22.04" 146 | 147 | # Install as service 148 | sudo ./svc.sh install 149 | sudo ./svc.sh start 150 | ``` 151 | 152 | ### 4. 
Apple Silicon Runner Setup 153 | 154 | ```bash 155 | # Install macOS 13+ (Ventura) 156 | # Install Xcode command line tools 157 | xcode-select --install 158 | 159 | # Install Homebrew 160 | /bin/bash -c "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/HEAD/install.sh)" 161 | 162 | # Install Rust 163 | curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh 164 | source ~/.cargo/env 165 | 166 | # Verify Apple Silicon GPU 167 | system_profiler SPDisplaysDataType 168 | 169 | # Install GitHub Actions runner 170 | mkdir actions-runner && cd actions-runner 171 | curl -o actions-runner-osx-arm64-2.311.0.tar.gz -L https://github.com/actions/runner/releases/download/v2.311.0/actions-runner-osx-arm64-2.311.0.tar.gz 172 | tar xzf ./actions-runner-osx-arm64-2.311.0.tar.gz 173 | 174 | # Configure runner 175 | ./config.sh --url https://github.com/treadiehq/gpu-kill --token 176 | ./config.sh --name "apple-gpu-runner" --labels "self-hosted,gpu,apple,macos-13" 177 | 178 | # Install as service 179 | ./svc.sh install 180 | ./svc.sh start 181 | ``` 182 | 183 | ## Runner Labels 184 | 185 | Each runner should be configured with these labels: 186 | - `self-hosted` - Required for self-hosted runners 187 | - `gpu` - Indicates GPU hardware availability 188 | - `nvidia`/`amd`/`intel`/`apple` - GPU vendor 189 | - `ubuntu-22.04`/`macos-13` - Operating system 190 | - `stress-test` - For runners capable of stress testing 191 | 192 | ## Security Considerations 193 | 194 | 1. **Network Security**: Ensure runners are behind a firewall 195 | 2. **Access Control**: Limit who can access the runner machines 196 | 3. **Token Management**: Regularly rotate GitHub tokens 197 | 4. **Monitoring**: Monitor runner health and performance 198 | 5. 
**Updates**: Keep runners updated with security patches 199 | 200 | ## Monitoring and Maintenance 201 | 202 | ### Health Checks 203 | ```bash 204 | # Check runner status 205 | sudo systemctl status actions.runner.* 206 | 207 | # Check GPU status 208 | nvidia-smi # NVIDIA 209 | rocm-smi # AMD 210 | intel_gpu_top --help # Intel 211 | system_profiler SPDisplaysDataType # Apple 212 | ``` 213 | 214 | ### Logs 215 | ```bash 216 | # View runner logs 217 | sudo journalctl -u actions.runner.* -f 218 | 219 | # View GitHub Actions logs 220 | tail -f /home/runner/_diag/Runner_*.log 221 | ``` 222 | 223 | ### Updates 224 | ```bash 225 | # Update runner software 226 | cd actions-runner 227 | ./config.sh remove --token 228 | # Download new version 229 | ./config.sh --url https://github.com/treadiehq/gpu-kill --token 230 | ``` 231 | 232 | ## Cost Optimization 233 | 234 | 1. **Scheduled Testing**: Run tests during off-peak hours 235 | 2. **Resource Scaling**: Use smaller instances for basic tests 236 | 3. **Caching**: Implement aggressive caching for dependencies 237 | 4. **Parallel Testing**: Run multiple test suites in parallel 238 | 239 | ## Troubleshooting 240 | 241 | ### Common Issues 242 | 243 | 1. **GPU Not Detected** 244 | ```bash 245 | # Check GPU status 246 | lspci | grep -i vga 247 | nvidia-smi # or rocm-smi, intel_gpu_top 248 | ``` 249 | 250 | 2. **Permission Issues** 251 | ```bash 252 | # Add user to video group 253 | sudo usermod -a -G video $USER 254 | sudo usermod -a -G render $USER 255 | ``` 256 | 257 | 3. **Driver Issues** 258 | ```bash 259 | # Reinstall drivers 260 | sudo apt purge nvidia-* # NVIDIA 261 | sudo apt purge rocm-* # AMD 262 | sudo apt install nvidia-driver-535 # Reinstall 263 | ``` 264 | 265 | 4. 
**Runner Connection Issues** 266 | ```bash 267 | # Check network connectivity 268 | curl -I https://github.com 269 | # Restart runner service 270 | sudo systemctl restart actions.runner.* 271 | ``` 272 | 273 | ## Integration with GPU Kill 274 | 275 | The runners will automatically execute the GPU testing workflow when: 276 | - Code is pushed to main/develop branches 277 | - Pull requests are opened 278 | - Manual workflow dispatch is triggered 279 | 280 | Tests include: 281 | - GPU detection and enumeration 282 | - Performance benchmarking 283 | - Memory usage testing 284 | - Stress testing 285 | - Cross-platform compatibility 286 | - Security auditing 287 | -------------------------------------------------------------------------------- /docs/HOTAISLE_INTEGRATION.md: -------------------------------------------------------------------------------- 1 | # Hot Aisle Integration for GPU Testing 2 | 3 | This document describes the **optional** integration between GPU Kill and Hot Aisle's infrastructure for automated GPU testing in CI/CD pipelines. 4 | 5 | > **Note**: Hot Aisle integration is an optional feature that must be enabled with the `hotaisle` feature flag. 6 | 7 | ## Overview 8 | 9 | The Hot Aisle integration enables GPU Kill to run comprehensive tests on real GPU hardware by: 10 | 11 | 1. **Provisioning GPU instances** on-demand via Hot Aisle's API 12 | 2. **Running GPU tests** on actual hardware (NVIDIA, AMD, Intel, Apple Silicon) 13 | 3. **Automated cleanup** to minimize costs 14 | 4. **Comprehensive reporting** of test results 15 | 16 | ## Architecture 17 | 18 | ``` 19 | ┌─────────────────┐ ┌─────────────────┐ ┌─────────────────┐ 20 | │ GitHub │ │ Hot Aisle │ │ GPU Hardware │ 21 | │ Actions │◄──►│ API │◄──►│ (NVIDIA/AMD) │ 22 | │ (CI/CD) │ │ (Backend) │ │ (Intel/Apple) │ 23 | └─────────────────┘ └─────────────────┘ └─────────────────┘ 24 | ``` 25 | 26 | ## Components 27 | 28 | ### 1. 
Hot Aisle API Client (`src/hotaisle_client.rs`) 29 | 30 | Rust client for interacting with Hot Aisle's API: 31 | 32 | ```rust 33 | use gpukill::hotaisle_client::{HotAisleClient, GpuInstanceConfig}; 34 | 35 | let client = HotAisleClient::new(api_key, None); 36 | 37 | let config = GpuInstanceConfig { 38 | gpu_type: "nvidia".to_string(), 39 | duration_minutes: 30, 40 | instance_type: Some("g4dn.xlarge".to_string()), 41 | labels: Some(vec!["ci-test".to_string()]), 42 | }; 43 | 44 | let instance = client.provision_gpu_instance(config).await?; 45 | ``` 46 | 47 | ### 2. GPU Test Script (`scripts/run-gpu-tests.sh`) 48 | 49 | Comprehensive test script that runs on provisioned instances: 50 | 51 | ### 3. Integration Test Script (`scripts/test-hotaisle-integration-simple.sh`) 52 | 53 | CI-friendly test script that validates the Hot Aisle integration without requiring API access: 54 | 55 | - **GPU Detection Tests**: Verify GPU enumeration and information retrieval 56 | - **Vendor-Specific Tests**: NVIDIA (nvidia-smi), AMD (rocm-smi, amd-smi), Intel (intel_gpu_top) 57 | - **Performance Tests**: Run GPU hardware tests and benchmarks 58 | - **Stress Tests**: Multiple iterations to ensure reliability 59 | - **Report Generation**: Detailed test reports with system information 60 | 61 | ### 4. GitHub Actions Workflow (`.github/workflows/hotaisle-gpu-testing.yml`) 62 | 63 | Automated CI/CD pipeline that: 64 | 65 | - **Provisions GPU instances** based on matrix strategy 66 | - **Deploys GPU Kill** to instances 67 | - **Runs comprehensive tests** on real hardware 68 | - **Collects results** and uploads artifacts 69 | - **Cleans up instances** automatically 70 | 71 | ## Setup Instructions 72 | 73 | ### 1. Enable Hot Aisle Feature 74 | 75 | Build GPU Kill with the Hot Aisle feature enabled: 76 | 77 | ```bash 78 | # Build with Hot Aisle integration 79 | cargo build --release --features hotaisle 80 | 81 | # Or install with Hot Aisle integration 82 | cargo install --path . 
--features hotaisle 83 | ``` 84 | 85 | ### 2. Hot Aisle API Key 86 | 87 | Add your Hot Aisle API key to GitHub Secrets: 88 | 89 | ```bash 90 | # In your GitHub repository settings: 91 | # Settings → Secrets and variables → Actions → New repository secret 92 | # Name: HOTAISLE_API_KEY 93 | # Value: your-hotaisle-api-key 94 | ``` 95 | 96 | ### 3. Configure GPU Types 97 | 98 | The workflow supports testing multiple GPU types: 99 | 100 | ```yaml 101 | # Default configuration 102 | matrix: 103 | gpu_type: [nvidia, amd, intel] 104 | 105 | # Manual dispatch with custom GPU types 106 | # Use workflow_dispatch with inputs: 107 | # gpu_types: "nvidia,amd,intel,apple-silicon" 108 | ``` 109 | 110 | ### 4. Test Duration 111 | 112 | Configure test duration to balance thoroughness with cost: 113 | 114 | ```yaml 115 | # Default: 30 minutes 116 | # Can be overridden via workflow_dispatch 117 | test_duration: "30" # minutes 118 | ``` 119 | 120 | ## Usage 121 | 122 | ### Integration Testing 123 | 124 | The integration is validated automatically via the "Test Hot Aisle Integration" workflow: 125 | - **Runs on**: Changes to Hot Aisle-related files 126 | - **Validates**: Build system, feature flags, documentation, and workflow syntax 127 | - **No API key required**: Tests the integration structure without actual GPU provisioning 128 | 129 | ### Manual GPU Testing 130 | 131 | Trigger tests manually via GitHub Actions: 132 | 133 | 1. Go to **Actions** tab in your repository 134 | 2. Select **Hot Aisle GPU Testing** workflow 135 | 3. Click **Run workflow** 136 | 4. 
Configure parameters: 137 | - **GPU types**: Comma-separated list (e.g., `nvidia,amd,intel`) 138 | - **Test duration**: Minutes (e.g., `30`) 139 | 140 | ### Local Testing 141 | 142 | Test the integration locally: 143 | 144 | ```bash 145 | # Build GPU Kill 146 | cargo build --release 147 | 148 | # Run GPU tests (requires GPU hardware) 149 | ./scripts/run-gpu-tests.sh nvidia 150 | ``` 151 | 152 | ## Supported GPU Types 153 | 154 | | GPU Type | Tools Used | Tests | 155 | |----------|------------|-------| 156 | | **NVIDIA** | nvidia-smi, NVML | GPU enumeration, memory, utilization, temperature, power | 157 | | **AMD** | rocm-smi, amd-smi | GPU enumeration, memory, utilization, temperature, power | 158 | | **Intel** | intel_gpu_top | GPU enumeration, utilization, memory estimation | 159 | | **Apple Silicon** | system_profiler | GPU enumeration, memory usage, Metal processes | 160 | 161 | ## Cost Optimization 162 | 163 | ### 1. Instance Lifecycle Management 164 | 165 | - **Automatic provisioning** only when needed 166 | - **Immediate cleanup** after tests complete 167 | - **Timeout protection** to prevent runaway costs 168 | 169 | ### 2. Test Duration Control 170 | 171 | - **Configurable duration** (default: 30 minutes) 172 | - **Fast failure** for quick feedback 173 | - **Comprehensive testing** when needed 174 | 175 | ### 3. 
Resource Efficiency 176 | 177 | - **Parallel testing** across GPU types 178 | - **Shared infrastructure** via Hot Aisle 179 | - **No always-on runners** required 180 | 181 | ## Test Results 182 | 183 | ### Artifacts 184 | 185 | Each test run produces: 186 | 187 | - **Test Output Log**: Detailed execution logs 188 | - **Test Report**: Comprehensive system and GPU information 189 | - **Retention**: 30 days for debugging 190 | 191 | ### Metrics 192 | 193 | Tests measure: 194 | 195 | - **GPU Detection**: Number of GPUs found 196 | - **Information Retrieval**: JSON validity and completeness 197 | - **Performance**: Test execution time 198 | - **Reliability**: Stress test success rate 199 | 200 | ## Troubleshooting 201 | 202 | ### Common Issues 203 | 204 | 1. **Instance Provisioning Fails** 205 | - Check Hot Aisle API key validity 206 | - Verify GPU type availability 207 | - Check Hot Aisle service status 208 | 209 | 2. **SSH Connection Issues** 210 | - Verify instance IP address 211 | - Check SSH key generation 212 | - Ensure instance is ready 213 | 214 | 3. **Test Failures** 215 | - Review test output logs 216 | - Check GPU driver installation 217 | - Verify tool availability (nvidia-smi, rocm-smi, etc.) 
218 | 219 | ### Debug Mode 220 | 221 | Enable debug logging: 222 | 223 | ```bash 224 | export RUST_LOG=debug 225 | export RUST_BACKTRACE=1 226 | ``` 227 | 228 | ## API Reference 229 | 230 | ### HotAisleClient 231 | 232 | ```rust 233 | impl HotAisleClient { 234 | pub fn new(api_key: String, base_url: Option) -> Self 235 | pub async fn provision_gpu_instance(&self, config: GpuInstanceConfig) -> Result 236 | pub async fn wait_for_instance_ready(&self, instance_id: &str, timeout_minutes: u32) -> Result 237 | pub async fn get_instance(&self, instance_id: &str) -> Result 238 | pub async fn run_gpu_tests(&self, instance: &GpuInstance, test_config: &GpuTestConfig) -> Result 239 | pub async fn terminate_instance(&self, instance_id: &str) -> Result<()> 240 | pub async fn list_available_gpu_types(&self) -> Result> 241 | } 242 | ``` 243 | 244 | ### Configuration Types 245 | 246 | ```rust 247 | pub struct GpuInstanceConfig { 248 | pub gpu_type: String, // nvidia, amd, intel, apple-silicon 249 | pub duration_minutes: u32, // Instance lifetime 250 | pub instance_type: Option, // Auto-selected if None 251 | pub labels: Option>, // Custom labels 252 | } 253 | 254 | pub struct GpuTestConfig { 255 | pub test_command: String, // Command to execute 256 | pub timeout_minutes: u32, // Test timeout 257 | pub env_vars: Option>, // Environment variables 258 | pub working_dir: Option, // Working directory 259 | } 260 | ``` 261 | 262 | ## Future Enhancements 263 | 264 | ### Planned Features 265 | 266 | 1. **Advanced GPU Testing** 267 | - CUDA/ROCm kernel testing 268 | - Memory bandwidth benchmarks 269 | - Multi-GPU coordination tests 270 | 271 | 2. **Cost Analytics** 272 | - Test cost tracking 273 | - Optimization recommendations 274 | - Budget alerts 275 | 276 | 3. **Integration Improvements** 277 | - Webhook notifications 278 | - Slack/Teams integration 279 | - Custom test configurations 280 | 281 | ### Contributing 282 | 283 | To contribute to the Hot Aisle integration: 284 | 285 | 1. 
**Fork the repository** 286 | 2. **Create a feature branch** 287 | 3. **Add tests** for new functionality 288 | 4. **Update documentation** 289 | 5. **Submit a pull request** 290 | 291 | ## Support 292 | 293 | For issues related to: 294 | 295 | - **GPU Kill**: Create an issue in this repository 296 | - **Hot Aisle API**: Contact Hot Aisle support 297 | - **Integration**: Check the troubleshooting section above 298 | 299 | ## License 300 | 301 | This integration is part of GPU Kill and follows the same license terms. 302 | -------------------------------------------------------------------------------- /src/hotaisle_client.rs: -------------------------------------------------------------------------------- 1 | //! Hot Aisle API client for GPU instance provisioning and management 2 | //! 3 | //! This module provides integration with Hot Aisle's infrastructure 4 | //! for on-demand GPU testing in CI/CD pipelines. 5 | 6 | use anyhow::Result; 7 | use serde::{Deserialize, Serialize}; 8 | use std::time::Duration; 9 | use tokio::time::sleep; 10 | 11 | /// Hot Aisle API client for managing GPU instances 12 | pub struct HotAisleClient { 13 | api_key: String, 14 | base_url: String, 15 | client: reqwest::Client, 16 | } 17 | 18 | /// GPU instance configuration 19 | #[derive(Debug, Clone, Serialize, Deserialize)] 20 | pub struct GpuInstanceConfig { 21 | /// GPU type (nvidia, amd, intel, apple-silicon) 22 | pub gpu_type: String, 23 | /// Instance duration in minutes 24 | pub duration_minutes: u32, 25 | /// Instance size/type 26 | pub instance_type: Option, 27 | /// Custom labels for the instance 28 | pub labels: Option>, 29 | } 30 | 31 | /// GPU instance information 32 | #[derive(Debug, Clone, Serialize, Deserialize)] 33 | pub struct GpuInstance { 34 | /// Unique instance ID 35 | pub id: String, 36 | /// Instance IP address 37 | pub ip_address: String, 38 | /// SSH connection details 39 | pub ssh_config: SshConfig, 40 | /// GPU type 41 | pub gpu_type: String, 42 | /// Instance 
status
    pub status: String,
    /// Creation timestamp
    pub created_at: String,
    /// Expiration timestamp
    pub expires_at: String,
}

/// SSH connection configuration
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SshConfig {
    /// SSH username
    pub username: String,
    /// SSH port (default: 22)
    pub port: u16,
    /// SSH key path or content
    pub key_path: Option<String>,
}

/// Test results from GPU instance
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct GpuTestResults {
    /// Instance ID where tests were run
    pub instance_id: String,
    /// Test execution status
    pub status: String,
    /// Test output/logs
    pub output: String,
    /// Test duration in seconds
    pub duration_seconds: u64,
    /// Number of tests passed
    pub tests_passed: u32,
    /// Number of tests failed
    pub tests_failed: u32,
    /// Number of tests skipped
    pub tests_skipped: u32,
}

impl HotAisleClient {
    /// Create a new Hot Aisle client.
    ///
    /// `base_url` falls back to the public Hot Aisle admin API when `None`.
    pub fn new(api_key: String, base_url: Option<String>) -> Self {
        let base_url = base_url.unwrap_or_else(|| "https://admin.hotaisle.app/api".to_string());

        Self {
            api_key,
            base_url,
            client: reqwest::Client::new(),
        }
    }

    /// Pass successful HTTP responses through; turn any non-success status
    /// into an error that includes the status code and response body.
    ///
    /// Centralizes the status/error-body handling that every endpoint
    /// method below previously repeated inline. The produced message is
    /// exactly `"{context}: {status} - {body}"`, matching the original
    /// per-endpoint messages.
    async fn ensure_success(
        response: reqwest::Response,
        context: &str,
    ) -> Result<reqwest::Response> {
        let status = response.status();
        if status.is_success() {
            Ok(response)
        } else {
            let error_text = response.text().await?;
            Err(anyhow::anyhow!("{}: {} - {}", context, status, error_text))
        }
    }

    /// Provision a new GPU instance
    pub async fn provision_gpu_instance(&self, config: GpuInstanceConfig) -> Result<GpuInstance> {
        let url = format!("{}/instances", self.base_url);

        let response = self
            .client
            .post(&url)
            .header("Authorization", format!("Bearer {}", self.api_key))
            .header("Content-Type", "application/json")
            .json(&config)
            .send()
            .await?;

        let response = Self::ensure_success(response, "Failed to provision GPU instance").await?;

        let instance: GpuInstance = response.json().await?;
        Ok(instance)
    }

    /// Wait for instance to be ready.
    ///
    /// Polls `get_instance` every 10 seconds until the instance reports
    /// "ready"/"running", returns an error on "failed"/"error", or the
    /// timeout elapses.
    pub async fn wait_for_instance_ready(
        &self,
        instance_id: &str,
        timeout_minutes: u32,
    ) -> Result<GpuInstance> {
        let timeout = Duration::from_secs(timeout_minutes as u64 * 60);
        let start = std::time::Instant::now();

        while start.elapsed() < timeout {
            let instance = self.get_instance(instance_id).await?;

            match instance.status.as_str() {
                "ready" | "running" => return Ok(instance),
                "failed" | "error" => {
                    return Err(anyhow::anyhow!("Instance {} failed to start", instance_id));
                }
                _ => {
                    // Still provisioning, wait and retry
                    sleep(Duration::from_secs(10)).await;
                }
            }
        }

        Err(anyhow::anyhow!(
            "Instance {} did not become ready within {} minutes",
            instance_id,
            timeout_minutes
        ))
    }

    /// Get instance information
    pub async fn get_instance(&self, instance_id: &str) -> Result<GpuInstance> {
        let url = format!("{}/instances/{}", self.base_url, instance_id);

        let response = self
            .client
            .get(&url)
            .header("Authorization", format!("Bearer {}", self.api_key))
            .send()
            .await?;

        let response = Self::ensure_success(
            response,
            &format!("Failed to get instance {}", instance_id),
        )
        .await?;

        let instance: GpuInstance = response.json().await?;
        Ok(instance)
    }

    /// Execute GPU tests on an instance
    pub async fn run_gpu_tests(
        &self,
        instance: &GpuInstance,
        test_config: &GpuTestConfig,
    ) -> Result<GpuTestResults> {
        let url = format!("{}/instances/{}/execute", self.base_url, instance.id);

        let response = self
            .client
            .post(&url)
            .header("Authorization", format!("Bearer {}", self.api_key))
            .header("Content-Type", "application/json")
            .json(test_config)
            .send()
            .await?;

        let response = Self::ensure_success(
            response,
            &format!("Failed to run tests on instance {}", instance.id),
        )
        .await?;

        let results: GpuTestResults = response.json().await?;
        Ok(results)
    }

    /// Terminate an instance
    pub async fn terminate_instance(&self, instance_id: &str) -> Result<()> {
        let url = format!("{}/instances/{}", self.base_url, instance_id);

        let response = self
            .client
            .delete(&url)
            .header("Authorization", format!("Bearer {}", self.api_key))
            .send()
            .await?;

        Self::ensure_success(
            response,
            &format!("Failed to terminate instance {}", instance_id),
        )
        .await?;

        Ok(())
    }

    /// List available GPU types
    pub async fn list_available_gpu_types(&self) -> Result<Vec<String>> {
        let url = format!("{}/gpu-types", self.base_url);

        let response = self
            .client
            .get(&url)
            .header("Authorization", format!("Bearer {}", self.api_key))
            .send()
            .await?;

        let response = Self::ensure_success(response, "Failed to list GPU types").await?;

        let gpu_types: Vec<String> = response.json().await?;
        Ok(gpu_types)
    }
}

/// GPU test configuration
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct GpuTestConfig {
| /// Test command to execute 263 | pub test_command: String, 264 | /// Test timeout in minutes 265 | pub timeout_minutes: u32, 266 | /// Environment variables 267 | pub env_vars: Option>, 268 | /// Working directory 269 | pub working_dir: Option, 270 | } 271 | 272 | #[cfg(test)] 273 | mod tests { 274 | use super::*; 275 | 276 | #[tokio::test] 277 | async fn test_hotaisle_client_creation() { 278 | let client = HotAisleClient::new("test-key".to_string(), None); 279 | assert_eq!(client.base_url, "https://admin.hotaisle.app/api"); 280 | } 281 | 282 | #[tokio::test] 283 | async fn test_gpu_instance_config() { 284 | let config = GpuInstanceConfig { 285 | gpu_type: "nvidia".to_string(), 286 | duration_minutes: 30, 287 | instance_type: Some("g4dn.xlarge".to_string()), 288 | labels: Some(vec!["ci-test".to_string(), "gpu-kill".to_string()]), 289 | }; 290 | 291 | assert_eq!(config.gpu_type, "nvidia"); 292 | assert_eq!(config.duration_minutes, 30); 293 | } 294 | } 295 | -------------------------------------------------------------------------------- /scripts/run-gpu-tests.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # GPU Kill - Hot Aisle GPU Testing Script 4 | # This script runs comprehensive GPU tests on Hot Aisle provisioned instances 5 | 6 | set -euo pipefail 7 | 8 | # Configuration 9 | SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" 10 | PROJECT_ROOT="$(dirname "$SCRIPT_DIR")" 11 | LOG_FILE="/tmp/gpu-kill-tests.log" 12 | 13 | # Colors for output 14 | RED='\033[0;31m' 15 | GREEN='\033[0;32m' 16 | YELLOW='\033[1;33m' 17 | BLUE='\033[0;34m' 18 | NC='\033[0m' # No Color 19 | 20 | # Logging functions 21 | log_info() { 22 | echo -e "${BLUE}[INFO]${NC} $1" | tee -a "$LOG_FILE" 23 | } 24 | 25 | log_success() { 26 | echo -e "${GREEN}[SUCCESS]${NC} $1" | tee -a "$LOG_FILE" 27 | } 28 | 29 | log_warning() { 30 | echo -e "${YELLOW}[WARNING]${NC} $1" | tee -a "$LOG_FILE" 31 | } 32 | 33 | log_error() { 34 | echo 
-e "${RED}[ERROR]${NC} $1" | tee -a "$LOG_FILE"
}

# Function to check prerequisites
check_prerequisites() {
    log_info "Checking prerequisites..."

    # Check if we're in the right directory
    if [[ ! -f "$PROJECT_ROOT/Cargo.toml" ]]; then
        log_error "Not in GPU Kill project root. Please run from project directory."
        exit 1
    fi

    # Check if cargo is available
    if ! command -v cargo &> /dev/null; then
        log_error "Cargo not found. Please install Rust toolchain."
        exit 1
    fi

    # Check if git is available
    if ! command -v git &> /dev/null; then
        log_error "Git not found. Please install git."
        exit 1
    fi

    log_success "Prerequisites check passed"
}

# Function to build GPU Kill
build_gpukill() {
    log_info "Building GPU Kill..."

    cd "$PROJECT_ROOT"

    # Build in release mode for better performance
    if cargo build --release; then
        log_success "GPU Kill built successfully"
    else
        log_error "Failed to build GPU Kill"
        exit 1
    fi
}

# Function to run basic GPU detection tests
run_gpu_detection_tests() {
    log_info "Running GPU detection tests..."

    local gpu_type="$1"
    local test_results=()

    # Test 1: List GPUs
    log_info "Testing GPU enumeration..."
    if ./target/release/gpukill --list > /tmp/gpu-list.txt 2>&1; then
        # Bug fix: `grep -c` prints "0" AND exits non-zero when nothing
        # matches, so the previous `|| echo "0"` fallback produced "0\n0"
        # on GPU-less hosts. `|| true` keeps the printed count and only
        # swallows the non-zero exit. Declaring and assigning separately
        # also avoids `local` masking the substitution's exit status
        # (ShellCheck SC2155) under `set -e`.
        local gpu_count
        gpu_count=$(grep -c "GPU [0-9]" /tmp/gpu-list.txt || true)
        log_success "Found $gpu_count GPU(s)"
        test_results+=("gpu_enumeration:passed:$gpu_count")
    else
        log_error "GPU enumeration failed"
        test_results+=("gpu_enumeration:failed:0")
    fi

    # Test 2: GPU information
    # NOTE(review): this captures plain `--list` output and then validates it
    # as JSON; `--list` renders a table by default, so confirm whether a JSON
    # output flag was intended for this invocation.
    log_info "Testing GPU information retrieval..."
    if ./target/release/gpukill --list > /tmp/gpu-info.json 2>&1; then
        local json_valid
        json_valid=$(python3 -m json.tool /tmp/gpu-info.json > /dev/null 2>&1 && echo "true" || echo "false")
        if [[ "$json_valid" == "true" ]]; then
            log_success "GPU information JSON is valid"
            test_results+=("gpu_info_json:passed:valid")
        else
            log_warning "GPU information JSON is invalid"
            test_results+=("gpu_info_json:failed:invalid")
        fi
    else
        log_error "GPU information retrieval failed"
        test_results+=("gpu_info_json:failed:error")
    fi

    # Test 3: GPU-specific tests based on type
    case "$gpu_type" in
        "nvidia")
            run_nvidia_specific_tests
            ;;
        "amd")
            run_amd_specific_tests
            ;;
        "intel")
            run_intel_specific_tests
            ;;
        "apple-silicon")
            run_apple_specific_tests
            ;;
        *)
            log_warning "Unknown GPU type: $gpu_type"
            ;;
    esac

    # Output test results
    echo "=== GPU Detection Test Results ==="
    for result in "${test_results[@]}"; do
        echo "$result"
    done
}

# Function to run NVIDIA-specific tests
run_nvidia_specific_tests() {
    log_info "Running NVIDIA-specific tests..."

    # Test nvidia-smi availability
    if command -v nvidia-smi &> /dev/null; then
        log_success "nvidia-smi is available"
        nvidia-smi --query-gpu=name,memory.total,memory.used --format=csv,noheader,nounits
    else
        log_warning "nvidia-smi not found"
    fi
}

# Function to run AMD-specific tests
run_amd_specific_tests() {
    log_info "Running AMD-specific tests..."

    # Test rocm-smi availability
    if command -v rocm-smi &> /dev/null; then
        log_success "rocm-smi is available"
        rocm-smi --showproductname
        rocm-smi --showuse
        rocm-smi --showtemp
        rocm-smi --showpower
        rocm-smi --showmemuse
    else
        log_warning "rocm-smi not found"
    fi

    # Test amd-smi availability (newer tool)
    if command -v amd-smi &> /dev/null; then
        log_success "amd-smi is available"
        amd-smi
    else
        log_warning "amd-smi not found"
    fi
}

# Function to run Intel-specific tests
run_intel_specific_tests() {
    log_info "Running Intel-specific tests..."

    # Test intel_gpu_top availability
    if command -v intel_gpu_top &> /dev/null; then
        log_success "intel_gpu_top is available"
        # Bounded sample; intel_gpu_top would otherwise run indefinitely
        timeout 5 intel_gpu_top -l 1 || true
    else
        log_warning "intel_gpu_top not found"
    fi
}

# Function to run Apple Silicon-specific tests
run_apple_specific_tests() {
    log_info "Running Apple Silicon-specific tests..."

    # Test system_profiler for GPU info
    if command -v system_profiler &> /dev/null; then
        log_success "system_profiler is available"
        system_profiler SPDisplaysDataType | grep -A 5 "Chipset Model" || true
    else
        log_warning "system_profiler not found"
    fi
}

# Function to run performance tests
run_performance_tests() {
    log_info "Running GPU performance tests..."

    local gpu_type="$1"
    # Split declaration/assignment so a failing substitution is not masked
    # by `local` (ShellCheck SC2155).
    local start_time
    start_time=$(date +%s)

    # Run GPU hardware tests
    # NOTE(review): $gpu_type is accepted but currently unused here.
    if cargo test --test gpu_hardware_tests --release; then
        local end_time duration
        end_time=$(date +%s)
        duration=$((end_time - start_time))
        log_success "GPU performance tests completed in ${duration}s"
    else
        log_warning "Some GPU performance tests failed or were skipped"
    fi
}

# Function to run stress tests
run_stress_tests() {
    log_info "Running GPU stress tests..."

    # Run multiple iterations of GPU detection
    for i in {1..5}; do
        log_info "Stress test iteration $i/5..."
        if ./target/release/gpukill --list > /dev/null 2>&1; then
            log_success "Iteration $i passed"
        else
            log_error "Iteration $i failed"
            return 1
        fi
        sleep 1
    done

    log_success "All stress test iterations passed"
}

# Function to generate test report
generate_test_report() {
    log_info "Generating test report..."

    local gpu_type="$1"
    local report_file
    report_file="/tmp/gpu-kill-test-report-$(date +%Y%m%d-%H%M%S).txt"

    {
        echo "=== GPU Kill Test Report ==="
        echo "Date: $(date)"
        echo "GPU Type: $gpu_type"
        echo "Hostname: $(hostname)"
        echo "OS: $(uname -a)"
        echo ""
        echo "=== GPU Detection Results ==="
        cat /tmp/gpu-list.txt 2>/dev/null || echo "No GPU list available"
        echo ""
        echo "=== GPU Information (JSON) ==="
        cat /tmp/gpu-info.json 2>/dev/null || echo "No GPU info available"
        echo ""
        echo "=== System Information ==="
        # CPU info (cross-platform)
        if command -v lscpu &> /dev/null; then
            echo "CPU: $(lscpu | grep "Model name" | cut -d: -f2 | xargs || echo "Unknown")"
        elif command -v sysctl &> /dev/null; then
            echo "CPU: $(sysctl -n machdep.cpu.brand_string 2>/dev/null || echo "Unknown")"
        else
            echo "CPU: Unknown"
        fi

        # Memory info (cross-platform)
        if command -v free &> /dev/null; then
            echo "Memory: $(free -h | grep "Mem:" | awk '{print $2}' || echo "Unknown")"
        elif command -v vm_stat &> /dev/null; then
            # NOTE(review): this branch gates on vm_stat but reads
            # system_profiler — confirm which tool is intended on macOS.
            echo "Memory: $(system_profiler SPHardwareDataType | grep "Memory:" | awk '{print $2, $3}' || echo "Unknown")"
        else
            echo "Memory: Unknown"
        fi

        # GPU drivers (cross-platform)
        echo "GPU Drivers:"
        if command -v lsmod &> /dev/null; then
            lsmod | grep -E "(nvidia|amdgpu|i915)" || echo "No GPU drivers found"
        elif command -v kextstat &> /dev/null; then
            kextstat | grep -E "(nvidia|amd|intel)" || echo "No GPU drivers found"
        else
            echo "No GPU drivers found"
        fi
    } > "$report_file"

    log_success "Test report generated: $report_file"
    cat "$report_file"
}

# Main function
main() {
    local gpu_type="${1:-unknown}"

    log_info "Starting GPU Kill tests on Hot Aisle instance"
    log_info "GPU
Type: $gpu_type" 297 | log_info "Project Root: $PROJECT_ROOT" 298 | 299 | # Initialize log file 300 | echo "=== GPU Kill Test Log - $(date) ===" > "$LOG_FILE" 301 | 302 | # Run test suite 303 | check_prerequisites 304 | build_gpukill 305 | run_gpu_detection_tests "$gpu_type" 306 | run_performance_tests "$gpu_type" 307 | run_stress_tests 308 | generate_test_report "$gpu_type" 309 | 310 | log_success "All GPU Kill tests completed successfully!" 311 | } 312 | 313 | # Run main function with all arguments 314 | main "$@" 315 | -------------------------------------------------------------------------------- /src/render.rs: -------------------------------------------------------------------------------- 1 | use crate::args::OutputFormat; 2 | use crate::nvml_api::Snapshot; 3 | use crate::util::{format_memory_mb_to_gib, truncate_string}; 4 | // serde_json is used via serde_json::to_string_pretty 5 | use std::io::{self, Write}; 6 | use tabled::{ 7 | settings::{object::Rows, style::Style, Alignment, Modify, Padding, Width}, 8 | Table, Tabled, 9 | }; 10 | 11 | /// Render GPU information to various output formats 12 | #[derive(Clone)] 13 | pub struct Renderer { 14 | output_format: OutputFormat, 15 | } 16 | 17 | #[allow(dead_code)] 18 | impl Renderer { 19 | /// Create a new renderer 20 | pub fn new(output_format: OutputFormat) -> Self { 21 | Self { output_format } 22 | } 23 | 24 | /// Render a complete snapshot 25 | pub fn render_snapshot( 26 | &self, 27 | snapshot: &Snapshot, 28 | details: bool, 29 | ) -> Result<(), Box> { 30 | match self.output_format { 31 | OutputFormat::Table => self.render_table(snapshot, details), 32 | OutputFormat::Json => self.render_json(snapshot), 33 | } 34 | } 35 | 36 | /// Render as a table 37 | fn render_table( 38 | &self, 39 | snapshot: &Snapshot, 40 | details: bool, 41 | ) -> Result<(), Box> { 42 | if details { 43 | self.render_detailed_table(snapshot) 44 | } else { 45 | self.render_summary_table(snapshot) 46 | } 47 | } 48 | 49 | /// Render 
summary table (one row per GPU) 50 | fn render_summary_table(&self, snapshot: &Snapshot) -> Result<(), Box> { 51 | let mut table_data = Vec::new(); 52 | 53 | for gpu in &snapshot.gpus { 54 | let mem_used_gib = format_memory_mb_to_gib(gpu.mem_used_mb); 55 | let mem_total_gib = format_memory_mb_to_gib(gpu.mem_total_mb); 56 | let mem_usage = format!("{}/{} GiB", mem_used_gib, mem_total_gib); 57 | 58 | let top_proc_info = if let Some(ref top_proc) = gpu.top_proc { 59 | format!( 60 | "{}:{}:{}MB", 61 | truncate_string(&top_proc.proc_name, 15), 62 | top_proc.pid, 63 | top_proc.used_mem_mb 64 | ) 65 | } else { 66 | "-".to_string() 67 | }; 68 | 69 | let ecc_info = gpu 70 | .ecc_volatile 71 | .map(|e| e.to_string()) 72 | .unwrap_or_else(|| "-".to_string()); 73 | 74 | table_data.push(SummaryRow { 75 | gpu: gpu.gpu_index.to_string(), 76 | name: truncate_string(&gpu.name, 20), 77 | memory: mem_usage, 78 | utilization: format!("{:.1}%", gpu.util_pct), 79 | temperature: format!("{}°C", gpu.temp_c), 80 | power: format!("{:.1}W", gpu.power_w), 81 | ecc_volatile: ecc_info, 82 | pids: gpu.pids.to_string(), 83 | top_process: top_proc_info, 84 | }); 85 | } 86 | 87 | let table = Table::new(&table_data) 88 | .with(Style::modern()) 89 | .with(Modify::new(Rows::new(1..)).with(Alignment::left())) 90 | .with(Modify::new(Rows::new(1..)).with(Padding::new(1, 1, 0, 0))) 91 | .with(Width::wrap(120)) 92 | .to_string(); 93 | 94 | println!("{}", table); 95 | Ok(()) 96 | } 97 | 98 | /// Render detailed table (one row per process) 99 | fn render_detailed_table(&self, snapshot: &Snapshot) -> Result<(), Box> { 100 | // First render summary 101 | self.render_summary_table(snapshot)?; 102 | println!(); 103 | 104 | // Then render process details 105 | if !snapshot.procs.is_empty() { 106 | let mut table_data = Vec::new(); 107 | 108 | for proc in &snapshot.procs { 109 | let container_info = proc 110 | .container 111 | .as_ref() 112 | .map(|c| truncate_string(c, 15)) 113 | .unwrap_or_else(|| 
"-".to_string()); 114 | 115 | table_data.push(ProcessRow { 116 | gpu: proc.gpu_index.to_string(), 117 | pid: proc.pid.to_string(), 118 | user: truncate_string(&proc.user, 12), 119 | process: truncate_string(&proc.proc_name, 20), 120 | vram_mb: format!("{}MB", proc.used_mem_mb), 121 | start_time: truncate_string(&proc.start_time, 10), 122 | container: container_info, 123 | }); 124 | } 125 | 126 | let table = Table::new(&table_data) 127 | .with(Style::modern()) 128 | .with(Modify::new(Rows::new(1..)).with(Alignment::left())) 129 | .with(Modify::new(Rows::new(1..)).with(Padding::new(1, 1, 0, 0))) 130 | .with(Width::wrap(120)) 131 | .to_string(); 132 | 133 | println!("Process Details:"); 134 | println!("{}", table); 135 | } 136 | 137 | Ok(()) 138 | } 139 | 140 | /// Render as JSON 141 | fn render_json(&self, snapshot: &Snapshot) -> Result<(), Box> { 142 | let json = serde_json::to_string_pretty(snapshot)?; 143 | println!("{}", json); 144 | Ok(()) 145 | } 146 | 147 | /// Render JSON snapshot for watch mode (newline-delimited) 148 | pub fn render_json_snapshot( 149 | &self, 150 | snapshot: &Snapshot, 151 | ) -> Result<(), Box> { 152 | let json = serde_json::to_string(snapshot)?; 153 | println!("{}", json); 154 | io::stdout().flush()?; 155 | Ok(()) 156 | } 157 | 158 | /// Clear screen for watch mode 159 | pub fn clear_screen(&self) { 160 | print!("\x1B[2J\x1B[1;1H"); 161 | io::stdout().flush().unwrap_or_default(); 162 | } 163 | 164 | /// Get output format 165 | pub fn get_output_format(&self) -> OutputFormat { 166 | self.output_format.clone() 167 | } 168 | } 169 | 170 | /// Summary table row structure 171 | #[derive(Tabled)] 172 | struct SummaryRow { 173 | #[tabled(rename = "GPU")] 174 | gpu: String, 175 | #[tabled(rename = "NAME")] 176 | name: String, 177 | #[tabled(rename = "MEM_USED/TOTAL")] 178 | memory: String, 179 | #[tabled(rename = "UTIL(%)")] 180 | utilization: String, 181 | #[tabled(rename = "TEMP(°C)")] 182 | temperature: String, 183 | #[tabled(rename = 
"POWER(W)")] 184 | power: String, 185 | #[tabled(rename = "ECC(volatile)")] 186 | ecc_volatile: String, 187 | #[tabled(rename = "PIDS")] 188 | pids: String, 189 | #[tabled(rename = "TOP_PROC")] 190 | top_process: String, 191 | } 192 | 193 | /// Process table row structure 194 | #[derive(Tabled)] 195 | struct ProcessRow { 196 | #[tabled(rename = "GPU")] 197 | gpu: String, 198 | #[tabled(rename = "PID")] 199 | pid: String, 200 | #[tabled(rename = "USER")] 201 | user: String, 202 | #[tabled(rename = "PROC")] 203 | process: String, 204 | #[tabled(rename = "VRAM_MB")] 205 | vram_mb: String, 206 | #[tabled(rename = "START_TIME")] 207 | start_time: String, 208 | #[tabled(rename = "CONTAINER?")] 209 | container: String, 210 | } 211 | 212 | /// Render error messages 213 | pub fn render_error(message: &str) { 214 | eprintln!("Error: {}", message); 215 | } 216 | 217 | /// Render warning messages 218 | pub fn render_warning(message: &str) { 219 | eprintln!("Warning: {}", message); 220 | } 221 | 222 | /// Render info messages 223 | pub fn render_info(message: &str) { 224 | println!("Info: {}", message); 225 | } 226 | 227 | /// Render success messages 228 | pub fn render_success(message: &str) { 229 | println!("Success: {}", message); 230 | } 231 | 232 | #[cfg(test)] 233 | mod tests { 234 | use super::*; 235 | use crate::nvml_api::{GpuProc, GpuSnapshot, Snapshot}; 236 | 237 | fn create_test_snapshot() -> Snapshot { 238 | Snapshot { 239 | host: "test-host".to_string(), 240 | ts: "2024-01-01T00:00:00Z".to_string(), 241 | gpus: vec![GpuSnapshot { 242 | gpu_index: 0, 243 | name: "Test GPU".to_string(), 244 | vendor: crate::vendor::GpuVendor::Unknown, 245 | mem_used_mb: 2048, 246 | mem_total_mb: 8192, 247 | util_pct: 50.0, 248 | temp_c: 75, 249 | power_w: 150.0, 250 | ecc_volatile: Some(0), 251 | pids: 2, 252 | top_proc: Some(GpuProc { 253 | gpu_index: 0, 254 | pid: 12345, 255 | user: "testuser".to_string(), 256 | proc_name: "test_process".to_string(), 257 | used_mem_mb: 1024, 258 | 
start_time: "1h 30m".to_string(), 259 | container: None, 260 | }), 261 | }], 262 | procs: vec![GpuProc { 263 | gpu_index: 0, 264 | pid: 12345, 265 | user: "testuser".to_string(), 266 | proc_name: "test_process".to_string(), 267 | used_mem_mb: 1024, 268 | start_time: "1h 30m".to_string(), 269 | container: None, 270 | }], 271 | } 272 | } 273 | 274 | #[test] 275 | fn test_renderer_creation() { 276 | let renderer = Renderer::new(OutputFormat::Table); 277 | assert!(matches!(renderer.output_format, OutputFormat::Table)); 278 | } 279 | 280 | #[test] 281 | fn test_json_rendering() { 282 | let renderer = Renderer::new(OutputFormat::Json); 283 | let snapshot = create_test_snapshot(); 284 | 285 | // This should not panic 286 | let result = renderer.render_json(&snapshot); 287 | assert!(result.is_ok()); 288 | } 289 | 290 | #[test] 291 | fn test_table_rendering() { 292 | let renderer = Renderer::new(OutputFormat::Table); 293 | let snapshot = create_test_snapshot(); 294 | 295 | // This should not panic 296 | let result = renderer.render_table(&snapshot, false); 297 | assert!(result.is_ok()); 298 | } 299 | 300 | #[test] 301 | fn test_detailed_table_rendering() { 302 | let renderer = Renderer::new(OutputFormat::Table); 303 | let snapshot = create_test_snapshot(); 304 | 305 | // This should not panic 306 | let result = renderer.render_table(&snapshot, true); 307 | assert!(result.is_ok()); 308 | } 309 | } 310 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # GPU Kill 2 | 3 | A CLI tool for managing GPUs across NVIDIA, AMD, Intel, and Apple Silicon systems. Monitor, control, and secure your GPU infrastructure with ease. 
4 | 5 | ## Community & Support 6 | 7 | Join our Discord community for discussions, support, and updates: 8 | 9 | [![Discord](https://img.shields.io/badge/Discord-Join%20our%20community-7289DA?style=for-the-badge&logo=discord&logoColor=white)](https://discord.gg/KqdBcqRk5E) 10 | 11 | 12 | ## Features 13 | 14 | - **Monitor GPUs**: Real-time usage, memory, temperature, and processes 15 | - **Kill Processes**: Gracefully terminate stuck GPU processes 16 | - **Security**: Detect crypto miners and suspicious activity 17 | - **Guard Mode**: Policy enforcement to prevent resource abuse 18 | - **Remote**: Manage GPUs across multiple servers 19 | - **Multi-Vendor**: Works with NVIDIA, AMD, Intel, and Apple Silicon 20 | - **AI Integration**: MCP server for AI assistant integration 21 | 22 | ## Requirements 23 | 24 | ### Build Performance 25 | 26 | **For faster development builds:** 27 | ```bash 28 | # Fast release build (recommended for development) 29 | cargo build --profile release-fast 30 | 31 | # Standard release build (optimized for production) 32 | cargo build --release 33 | 34 | # Maximum optimization (slowest, best performance) 35 | cargo build --profile release-max 36 | ``` 37 | 38 | **Build times on typical hardware:** 39 | - Debug build: ~3 seconds 40 | - Release-fast: ~28 seconds 41 | - Release: ~28 seconds (improved from 76 seconds) 42 | - Release-max: ~60+ seconds (maximum optimization) 43 | 44 | ### System Dependencies 45 | 46 | **Linux (Ubuntu/Debian):** 47 | ```bash 48 | sudo apt install build-essential libssl-dev pkg-config 49 | ``` 50 | 51 | **Linux (Fedora/RHEL/CentOS):** 52 | ```bash 53 | sudo dnf install gcc gcc-c++ pkg-config openssl-devel 54 | # or for older systems: 55 | # sudo yum install gcc gcc-c++ pkg-config openssl-devel 56 | ``` 57 | 58 | **macOS:** 59 | ```bash 60 | # Install Xcode command line tools 61 | xcode-select --install 62 | # OpenSSL is included with macOS 63 | ``` 64 | 65 | **Windows:** 66 | - Install Visual Studio Build Tools 67 | - 
OpenSSL is handled automatically by vcpkg 68 | 69 | ### GPU Drivers 70 | 71 | - **NVIDIA**: NVIDIA drivers installed 72 | - **AMD**: ROCm drivers installed 73 | - **Intel**: intel-gpu-tools package installed 74 | - **Apple Silicon**: macOS with Apple Silicon (M1/M2/M3/M4) 75 | 76 | ### Build Requirements 77 | 78 | - **OS**: Linux, macOS, or Windows 79 | - **Rust**: 1.70+ (for building from source) 80 | 81 | ## Quick Start 82 | 83 | ### Install & Run 84 | ```bash 85 | # Build from source (first build may take 2-3 minutes) 86 | git clone https://github.com/treadiehq/gpu-kill.git 87 | cd gpu-kill 88 | cargo build --release 89 | 90 | # Or install via Cargo 91 | cargo install gpukill 92 | 93 | # Or one-liner installers (recommended) 94 | # macOS/Linux 95 | curl -fsSL https://raw.githubusercontent.com/treadiehq/gpu-kill/refs/heads/main/scripts/install.sh | sh 96 | # Windows (PowerShell) 97 | irm https://raw.githubusercontent.com/treadiehq/gpu-kill/refs/heads/main/scripts/install.ps1 | iex 98 | 99 | # List your GPUs 100 | gpukill --list 101 | 102 | # Watch GPU usage in real-time 103 | gpukill --list --watch 104 | ``` 105 | 106 | ### Dead-simple cheatsheet 107 | ```bash 108 | # Live watch (alias) 109 | gpukill watch # = gpukill --list --watch 110 | 111 | # Kill job by PID (positional alias) 112 | gpukill 12345 # = gpukill --kill --pid 12345 113 | 114 | # Free a specific GPU index (kill all jobs on GPU 0) 115 | gpukill --kill --gpu 0 # add --batch to actually kill; preview without it 116 | 117 | # Force reset a GPU (shorthand) 118 | gpukill --reset 0 # = gpukill --reset --gpu 0 119 | 120 | # Safe mode: dry-run first (no changes) 121 | gpukill 12345 --safe # alias: --dry-run 122 | ``` 123 | 124 | ## Dashboard (Local Development) 125 | 126 | The GPU Kill dashboard provides a modern web interface for GPU cluster monitoring. The dashboard is included in the repository for local development but is **not required** for core GPU Kill functionality. 

![GPU Kill Dashboard](dashboard/public/screenshot.png)

### Quick Start

```bash
# 1. Start the backend API server
gpukill --server --server-port 8080

# 2. In a new terminal, start the dashboard UI
cd dashboard
npm install  # First time only
npm run dev

# 3. Access the dashboard
open http://localhost:3000
```

**Requirements:**
- Node.js 18+ and npm
- GPU Kill backend server running (provides the API)

**Note**: You need both the backend server (port 8080) and frontend UI (port 3000) running for the dashboard to work.

### Dashboard Features

- **Real-time monitoring** of all GPUs across your cluster
- **Security detection** with threat analysis and risk scoring
- **Policy management** for resource control and enforcement
- **Cluster overview** with Magic Moment contention insights
- **Interactive controls** for process management and GPU operations

### Production Deployment

For production GPU monitoring solutions, check the [Kill Suite](https://treadie.com) website.

## MCP Server

GPU Kill includes an MCP server that enables AI assistants to interact with GPU management functionality:

- **Resources**: Read GPU status, processes, audit data, policies, and security scans
- **Tools**: Kill processes, reset GPUs, scan for threats, create policies

```bash
# Start the MCP server
cargo run --release -p gpukill-mcp

# Server runs on http://localhost:3001/mcp
```

## Usage

Ask your AI to use the tools.

```text
What GPUs do I have and what's their current usage?
183 | ``` 184 | 185 | ```text 186 | Kill the Python process that's stuck on GPU 0 187 | ``` 188 | 189 | ```text 190 | Kill all training processes that are using too much GPU memory 191 | ``` 192 | 193 | ```text 194 | Show me GPU usage and kill any stuck processes 195 | ``` 196 | 197 | ```text 198 | Scan for crypto miners and suspicious activity 199 | ``` 200 | 201 | ```text 202 | Create a policy to limit user memory usage to 8GB 203 | ``` 204 | 205 | ```text 206 | Reset GPU 1 because it's not responding 207 | ``` 208 | 209 | ```text 210 | What processes are currently using my GPUs? 211 | ``` 212 | 213 | See [mcp/README.md](mcp/README.md) for detailed MCP server documentation. 214 | 215 | 216 | ## Security & Policies 217 | 218 | ### Detect Threats 219 | ```bash 220 | # Scan for crypto miners and suspicious activity 221 | gpukill --audit --rogue 222 | 223 | # Configure detection rules 224 | gpukill --audit --rogue-config 225 | ``` 226 | 227 | ### Policy Enforcement 228 | ```bash 229 | # Enable Guard Mode 230 | gpukill --guard --guard-enable 231 | 232 | # Test policies safely 233 | gpukill --guard --guard-test-policies 234 | ``` 235 | 236 | *For detailed security and policy documentation, see [DETAILED.md](DETAILED.md).* 237 | 238 | ## Remote Management 239 | 240 | Manage GPUs across multiple servers via SSH: 241 | 242 | ```bash 243 | # List GPUs on remote server 244 | gpukill --remote staging-server --list 245 | 246 | # Kill process on remote server 247 | gpukill --remote prod-gpu-01 --kill --pid 1234 248 | 249 | # Reset GPU on remote server 250 | gpukill --remote gpu-cluster --reset --gpu 0 251 | ``` 252 | 253 | ## Troubleshooting 254 | 255 | ### Build Issues 256 | 257 | **OpenSSL not found:** 258 | ```bash 259 | # Ubuntu/Debian 260 | sudo apt install build-essential libssl-dev pkg-config 261 | 262 | # Fedora/RHEL/CentOS 263 | sudo dnf install gcc gcc-c++ pkg-config openssl-devel 264 | ``` 265 | 266 | **Other common build issues:** 267 | - Ensure you have the latest 
Rust toolchain: `rustup update` 268 | - Clean and rebuild: `cargo clean && cargo build --release` 269 | - Check system dependencies are installed (see Requirements section) 270 | 271 | ## Need Help? 272 | 273 | ```bash 274 | gpukill --help # Show all options 275 | gpukill --version # Show version 276 | ``` 277 | 278 | ## CI/CD and Testing 279 | 280 | GPU Kill uses a CI/CD pipeline with **automatic GPU testing**: 281 | 282 | - **✅ Conditional GPU testing** - Runs automatically when GPU hardware is available 283 | - **✅ Multi-vendor GPU testing** on real hardware (NVIDIA, AMD, Intel, Apple Silicon) 284 | - **✅ Hot Aisle integration** - Optional on-demand GPU instance provisioning for comprehensive testing 285 | - **✅ Cross-platform compatibility** testing 286 | - **✅ Performance benchmarking** and profiling 287 | - **✅ Security auditing** and compliance checks 288 | - **✅ Stress testing** for reliability validation 289 | 290 | ### How GPU Testing Works 291 | 292 | - **On GitHub hosted runners**: GPU tests skip gracefully (no GPU hardware) 293 | - **On self-hosted runners**: GPU tests run automatically when GPU hardware is detected 294 | - **On cloud instances**: GPU tests run automatically when GPU hardware is available 295 | - **On developer machines**: GPU tests run automatically when GPU hardware is detected 296 | - **Via Hot Aisle**: On-demand GPU instance provisioning for comprehensive testing 297 | 298 | ### Quick Setup 299 | 300 | **Option 1: Test Locally (Already Working)** 301 | ```bash 302 | cargo test --test gpu_hardware_tests # Runs on your GPU hardware 303 | ``` 304 | 305 | **Option 2: Set Up Cloud GPU (5 minutes)** 306 | ```bash 307 | # On any cloud GPU instance: 308 | curl -sSL https://raw.githubusercontent.com/treadiehq/gpu-kill/main/scripts/setup-gpu-runner.sh | bash 309 | ``` 310 | 311 | **Option 3: Self-Hosted Runner** 312 | See **[CI_CD.md](CI_CD.md)** for detailed information about our testing infrastructure and how to set up self-hosted runners 
with GPU hardware. 313 | 314 | **Option 4: Hot Aisle Integration (Optional)** 315 | ```bash 316 | # Build with Hot Aisle feature 317 | cargo build --release --features hotaisle 318 | 319 | # Integration tests run automatically (no API key required) 320 | # For actual GPU testing: 321 | # 1. Set up HOTAISLE_API_KEY in GitHub Secrets 322 | # 2. Manually trigger "Hot Aisle GPU Testing" workflow 323 | # 3. Tests run on real GPU hardware with automatic cleanup 324 | ``` 325 | 326 | **Option 5: Cloud GPU Setup** 327 | See **[docs/CLOUD_GPU_SETUP.md](docs/CLOUD_GPU_SETUP.md)** for AWS, GCP, and Azure GPU instance setup. 328 | 329 | ## Documentation 330 | 331 | - **[DETAILED.md](DETAILED.md)** - Complete documentation, API reference, and advanced features 332 | - **[CI_CD.md](CI_CD.md)** - CI/CD pipeline and testing infrastructure 333 | - **[docs/HOTAISLE_INTEGRATION.md](docs/HOTAISLE_INTEGRATION.md)** - Hot Aisle integration guide 334 | - **[docs/CLOUD_GPU_SETUP.md](docs/CLOUD_GPU_SETUP.md)** - Cloud GPU setup guide (AWS, GCP, Azure) 335 | 336 | ## License 337 | 338 | This project is licensed under the FSL-1.1-MIT License. See the LICENSE file for details. 
-------------------------------------------------------------------------------- /deny.toml: -------------------------------------------------------------------------------- 1 | # This template contains all of the possible sections and their default values 2 | 3 | # Note that all fields that take a lint level have these possible values: 4 | # * deny - An error will be produced and the check will fail 5 | # * warn - A warning will be produced, but the check will not fail 6 | # * allow - No warning or error will be produced, though in some cases a note 7 | # will be 8 | 9 | # The values provided in this template are the default values that will be used 10 | # when any section or field is not specified in your own configuration 11 | 12 | # Root options 13 | 14 | # The graph table configures how the dependency graph is constructed and thus 15 | # which crates the checks are performed against 16 | [graph] 17 | # If 1 or more target triples (and optionally, target_features) are specified, 18 | # only the specified targets will be checked when running `cargo deny check`. 19 | # This means, if a particular package is only ever used as a target specific 20 | # dependency, such as, for example, the `nix` crate only being used via the 21 | # `target_family = "unix"` configuration, that only having windows targets in 22 | # this list would mean the nix crate, as well as any of its exclusive 23 | # dependencies not shared by any other crates, would be ignored, as the target 24 | # list here is effectively saying which targets you are building for. 25 | targets = [ 26 | # The triple can be any string, but only the target triples built in to 27 | # rustc (as of 1.40) can be checked against actual config expressions 28 | #"x86_64-unknown-linux-musl", 29 | # You can also specify which target_features you promise are enabled for a 30 | # particular target. target_features are currently not validated against 31 | # the actual valid features supported by the target architecture. 
32 | #{ triple = "wasm32-unknown-unknown", features = ["atomics"] }, 33 | ] 34 | # When creating the dependency graph used as the source of truth when checks are 35 | # executed, this field can be used to prune crates from the graph, removing them 36 | # from the view of cargo-deny. This is an extremely heavy hammer, as if a crate 37 | # is pruned from the graph, all of its dependencies will also be pruned unless 38 | # they are connected to another crate in the graph that hasn't been pruned, 39 | # so it should be used with care. The identifiers are [Package ID Specifications] 40 | # (https://doc.rust-lang.org/cargo/reference/pkgid-spec.html) 41 | #exclude = [] 42 | # If true, metadata will be collected with `--all-features`. Note that this can't 43 | # be toggled off if true, if you want to conditionally enable `--all-features` it 44 | # is recommended to pass `--all-features` on the cmd line instead 45 | all-features = false 46 | # If true, metadata will be collected with `--no-default-features`. The same 47 | # caveat with `all-features` applies 48 | no-default-features = false 49 | # If set, these feature will be enabled when collecting metadata. If `--features` 50 | # is specified on the cmd line they will take precedence over this option. 51 | #features = [] 52 | 53 | # The output table provides options for how/if diagnostics are outputted 54 | [output] 55 | # When outputting inclusion graphs in diagnostics that include features, this 56 | # option can be used to specify the depth at which feature edges will be added. 57 | # This option is included since the graphs can be quite large and the addition 58 | # of features from the crate(s) to all of the graph roots can be far too verbose. 
59 | # This option can be overridden via `--feature-depth` on the cmd line 60 | feature-depth = 1 61 | 62 | # This section is considered when running `cargo deny check advisories` 63 | # More documentation for the advisories section can be found here: 64 | # https://embarkstudios.github.io/cargo-deny/checks/advisories/cfg.html 65 | [advisories] 66 | # The path where the advisory databases are cloned/fetched into 67 | #db-path = "$CARGO_HOME/advisory-dbs" 68 | # The url(s) of the advisory databases to use 69 | #db-urls = ["https://github.com/rustsec/advisory-db"] 70 | # A list of advisory IDs to ignore. Note that ignored advisories will still 71 | # output a note when they are encountered. 72 | ignore = [ 73 | # Allow unmaintained crates as warnings (not errors) 74 | { id = "RUSTSEC-2020-0168", reason = "mach crate is unmaintained but still functional" }, 75 | { id = "RUSTSEC-2024-0370", reason = "proc-macro-error is unmaintained but still functional" }, 76 | ] 77 | # If this is true, then cargo deny will use the git executable to fetch advisory database. 78 | # If this is false, then it uses a built-in git library. 79 | # Setting this to true can be helpful if you have special authentication requirements that cargo-deny does not support. 80 | # See Git Authentication for more information about setting up git authentication. 81 | #git-fetch-with-cli = true 82 | 83 | # This section is considered when running `cargo deny check licenses` 84 | # More documentation for the licenses section can be found here: 85 | # https://embarkstudios.github.io/cargo-deny/checks/licenses/cfg.html 86 | [licenses] 87 | # List of explicitly allowed licenses 88 | # See https://spdx.org/licenses/ for list of possible licenses 89 | # [possible values: any SPDX 3.11 short identifier (+ optional exception)]. 
90 | allow = [ 91 | "MIT", 92 | "Apache-2.0", 93 | "Apache-2.0 WITH LLVM-exception", 94 | "BSD-2-Clause", 95 | "BSD-3-Clause", 96 | "ISC", 97 | "Unlicense", 98 | "0BSD", 99 | "Zlib", 100 | "CC0-1.0", 101 | "MPL-2.0", 102 | "LGPL-2.1", 103 | "LGPL-3.0", 104 | "GPL-2.0", 105 | "GPL-3.0", 106 | "FSL-1.1-MIT", 107 | "Unicode-3.0", 108 | ] 109 | # The confidence threshold for detecting a license from license text. 110 | # The higher the value, the more closely the license text must be to the 111 | # canonical license text of a valid SPDX license file. 112 | # [possible values: any between 0.0 and 1.0]. 113 | confidence-threshold = 0.8 114 | # Allow 1 or more licenses on a per-crate basis, so that particular licenses 115 | # aren't accepted for every possible crate as with the normal allow list 116 | exceptions = [ 117 | # Each entry is the crate and version constraint, and its specific allow 118 | # list 119 | #{ allow = ["Zlib"], crate = "adler32" }, 120 | ] 121 | 122 | # Some crates don't have (easily) machine readable licensing information, 123 | # adding a clarification entry for it allows you to manually specify the 124 | # licensing information 125 | #[[licenses.clarify]] 126 | # The package spec the clarification applies to 127 | #crate = "ring" 128 | # The SPDX expression for the license requirements of the crate 129 | #expression = "MIT AND ISC AND OpenSSL" 130 | # One or more files in the crate's source used as the "source of truth" for 131 | # the license expression. 
If the contents match, the clarification will be used 132 | # when running the license check, otherwise the clarification will be ignored 133 | # and the crate will be checked normally, which may produce warnings or errors 134 | # depending on the rest of your configuration 135 | #license-files = [ 136 | # Each entry is a crate relative path, and the (opaque) hash of its contents 137 | #{ path = "LICENSE", hash = 0xbd0eed23 } 138 | #] 139 | 140 | [licenses.private] 141 | # If true, ignores workspace crates that aren't published, or are only 142 | # published to private registries. 143 | # To see how to mark a crate as unpublished (to the official registry), 144 | # visit https://doc.rust-lang.org/cargo/reference/manifest.html#the-publish-field. 145 | ignore = false 146 | # One or more private registries that you might publish crates to, if a crate 147 | # is only published to private registries, and ignore is true, the crate will 148 | # not have its license(s) checked 149 | registries = [ 150 | #"https://sekretz.com/registry 151 | ] 152 | 153 | # This section is considered when running `cargo deny check bans`. 154 | # More documentation about the 'bans' section can be found here: 155 | # https://embarkstudios.github.io/cargo-deny/checks/bans/cfg.html 156 | [bans] 157 | # Lint level for when multiple versions of the same crate are detected 158 | multiple-versions = "warn" 159 | # Lint level for when a crate version requirement is `*` 160 | wildcards = "allow" 161 | # The graph highlighting used when creating dotgraphs for crates 162 | # with multiple versions 163 | # * lowest-version - The path to the lowest versioned duplicate is highlighted 164 | # * simplest-path - The path to the version with the fewest edges is highlighted 165 | # * all - Both lowest-version and simplest-path are used 166 | highlight = "all" 167 | # The default lint level for `default` features for crates that are members of 168 | # the workspace that is being checked. 
This can be overridden by allowing/denying 169 | # `default` on a crate-by-crate basis if desired. 170 | workspace-default-features = "allow" 171 | # The default lint level for `default` features for external crates that are not 172 | # members of the workspace. This can be overridden by allowing/denying `default` 173 | # on a crate-by-crate basis if desired. 174 | external-default-features = "allow" 175 | # List of crates that are allowed. Use with care! 176 | allow = [ 177 | #"ansi_term@0.11.0", 178 | #{ crate = "ansi_term@0.11.0", reason = "you can specify a reason it is allowed" }, 179 | ] 180 | # List of crates to deny 181 | deny = [ 182 | #"ansi_term@0.11.0", 183 | #{ crate = "ansi_term@0.11.0", reason = "you can specify a reason it is banned" }, 184 | # Wrapper crates can optionally be specified to allow the crate when it 185 | # is a direct dependency of the otherwise banned crate 186 | #{ crate = "ansi_term@0.11.0", wrappers = ["this-crate-directly-depends-on-ansi_term"] }, 187 | ] 188 | 189 | # List of features to allow/deny 190 | # Each entry the name of a crate and a version range. If version is 191 | # not specified, all versions will be matched. 192 | #[[bans.features]] 193 | #crate = "reqwest" 194 | # Features to not allow 195 | #deny = ["json"] 196 | # Features to allow 197 | #allow = [ 198 | # "rustls", 199 | # "__rustls", 200 | # "__tls", 201 | # "hyper-rustls", 202 | # "rustls", 203 | # "rustls-pemfile", 204 | # "rustls-tls-webpki-roots", 205 | # "tokio-rustls", 206 | # "webpki-roots", 207 | #] 208 | # If true, the allowed features must exactly match the enabled feature set. If 209 | # this is set there is no point setting `deny` 210 | #exact = true 211 | 212 | # Certain crates/versions that will be skipped when doing duplicate detection. 
213 | skip = [ 214 | #"ansi_term@0.11.0", 215 | #{ crate = "ansi_term@0.11.0", reason = "you can specify a reason why it can't be updated/removed" }, 216 | ] 217 | # Similarly to `skip` allows you to skip certain crates during duplicate 218 | # detection. Unlike skip, it also includes the entire tree of transitive 219 | # dependencies starting at the specified crate, up to a certain depth, which is 220 | # by default infinite. 221 | skip-tree = [ 222 | #"ansi_term@0.11.0", # will be skipped along with _all_ of its direct and transitive dependencies 223 | #{ crate = "ansi_term@0.11.0", depth = 20 }, 224 | ] 225 | 226 | # This section is considered when running `cargo deny check sources`. 227 | # More documentation about the 'sources' section can be found here: 228 | # https://embarkstudios.github.io/cargo-deny/checks/sources/cfg.html 229 | [sources] 230 | # Lint level for what to happen when a crate from a crate registry that is not 231 | # in the allow list is encountered 232 | unknown-registry = "warn" 233 | # Lint level for what to happen when a crate from a git repository that is not 234 | # in the allow list is encountered 235 | unknown-git = "warn" 236 | # List of URLs for allowed crate registries. Defaults to the crates.io index 237 | # if not specified. If it is specified but empty, no registries are allowed. 
238 | allow-registry = ["https://github.com/rust-lang/crates.io-index"] 239 | # List of URLs for allowed Git repositories 240 | allow-git = [] 241 | 242 | [sources.allow-org] 243 | # github.com organizations to allow git sources for 244 | github = [] 245 | # gitlab.com organizations to allow git sources for 246 | gitlab = [] 247 | # bitbucket.org organizations to allow git sources for 248 | bitbucket = [] 249 | -------------------------------------------------------------------------------- /src/proc.rs: -------------------------------------------------------------------------------- 1 | use crate::nvml_api::NvmlApi; 2 | use crate::util::parse_process_start_time; 3 | use anyhow::{Context, Result}; 4 | #[cfg(unix)] 5 | use nix::sys::signal::{kill, Signal}; 6 | #[cfg(unix)] 7 | use nix::unistd::Pid; 8 | // use std::process::Command; // Used conditionally below 9 | use std::time::{Duration, SystemTime}; 10 | use sysinfo::{Pid as SysPid, System}; 11 | 12 | /// Process information for a running process 13 | #[derive(Debug, Clone)] 14 | pub struct ProcessInfo { 15 | #[allow(dead_code)] 16 | pub pid: u32, 17 | pub user: String, 18 | pub name: String, 19 | #[allow(dead_code)] 20 | pub start_time: SystemTime, 21 | #[allow(dead_code)] 22 | pub cmdline: String, 23 | } 24 | 25 | /// Process management utilities 26 | pub struct ProcessManager { 27 | nvml_api: NvmlApi, 28 | system: System, 29 | } 30 | 31 | #[allow(dead_code)] 32 | impl ProcessManager { 33 | /// Create a new process manager 34 | pub fn new(nvml_api: NvmlApi) -> Self { 35 | let mut system = System::new_all(); 36 | system.refresh_all(); 37 | 38 | Self { nvml_api, system } 39 | } 40 | 41 | /// Get process information by PID 42 | pub fn get_process_info(&mut self, pid: u32) -> Result { 43 | self.system.refresh_processes(); 44 | 45 | let sys_pid = SysPid::from_u32(pid); 46 | let process = self 47 | .system 48 | .process(sys_pid) 49 | .ok_or_else(|| anyhow::anyhow!("Process with PID {} not found", pid))?; 50 | 51 | let 
user = get_process_user(pid).unwrap_or_else(|_| "unknown".to_string()); 52 | 53 | let start_time = process.start_time(); 54 | let start_time_system = SystemTime::UNIX_EPOCH + Duration::from_secs(start_time); 55 | 56 | Ok(ProcessInfo { 57 | pid, 58 | user, 59 | name: process.name().to_string(), 60 | start_time: start_time_system, 61 | cmdline: process.cmd().join(" "), 62 | }) 63 | } 64 | 65 | /// Check if a process is using any GPU 66 | pub fn is_process_using_gpu(&self, pid: u32) -> Result { 67 | self.nvml_api.is_process_using_gpu(pid) 68 | } 69 | 70 | /// Gracefully terminate a process with timeout and escalation 71 | #[cfg(unix)] 72 | pub fn graceful_kill(&self, pid: u32, timeout_secs: u16, force: bool) -> Result<()> { 73 | let pid = Pid::from_raw(pid as i32); 74 | 75 | // First, try SIGTERM 76 | tracing::info!("Sending SIGTERM to process {}", pid); 77 | kill(pid, Signal::SIGTERM).map_err(|e| anyhow::anyhow!("Failed to send SIGTERM: {}", e))?; 78 | 79 | // Wait for the process to terminate 80 | let timeout = Duration::from_secs(timeout_secs as u64); 81 | let start = SystemTime::now(); 82 | 83 | while SystemTime::now().duration_since(start).unwrap_or_default() < timeout { 84 | // Check if process still exists 85 | if !self.is_process_running(pid.as_raw() as u32)? { 86 | tracing::info!("Process {} terminated gracefully", pid); 87 | return Ok(()); 88 | } 89 | 90 | std::thread::sleep(Duration::from_millis(100)); 91 | } 92 | 93 | // Process didn't terminate, escalate if force is enabled 94 | if force { 95 | tracing::warn!("Process {} did not terminate, escalating to SIGKILL", pid); 96 | kill(pid, Signal::SIGKILL) 97 | .map_err(|e| anyhow::anyhow!("Failed to send SIGKILL: {}", e))?; 98 | 99 | // Wait a bit more for SIGKILL to take effect 100 | std::thread::sleep(Duration::from_millis(500)); 101 | 102 | if !self.is_process_running(pid.as_raw() as u32)? 
{ 103 | tracing::info!("Process {} terminated with SIGKILL", pid); 104 | Ok(()) 105 | } else { 106 | Err(anyhow::anyhow!( 107 | "Process {} still running after SIGKILL", 108 | pid 109 | )) 110 | } 111 | } else { 112 | Err(anyhow::anyhow!( 113 | "Process {} did not terminate within {} seconds. Use --force to escalate to SIGKILL", 114 | pid, 115 | timeout_secs 116 | )) 117 | } 118 | } 119 | 120 | /// Gracefully terminate a process with timeout and escalation (Windows stub) 121 | #[cfg(windows)] 122 | pub fn graceful_kill(&self, _pid: u32, _timeout_secs: u16, _force: bool) -> Result<()> { 123 | // On Windows, we can't use Unix signals, so we'll use a different approach 124 | // For now, just return an error indicating this feature isn't available on Windows 125 | Err(anyhow::anyhow!( 126 | "Process termination not yet implemented for Windows" 127 | )) 128 | } 129 | 130 | /// Check if a process is still running 131 | fn is_process_running(&self, pid: u32) -> Result { 132 | let sys_pid = SysPid::from_u32(pid); 133 | Ok(self.system.process(sys_pid).is_some()) 134 | } 135 | 136 | /// Enrich GPU processes with system information 137 | pub fn enrich_gpu_processes( 138 | &mut self, 139 | mut processes: Vec, 140 | ) -> Result> { 141 | self.system.refresh_processes(); 142 | 143 | for process in &mut processes { 144 | if let Ok(process_info) = self.get_process_info(process.pid) { 145 | process.user = process_info.user; 146 | process.proc_name = process_info.name; 147 | process.start_time = parse_process_start_time(process_info.start_time); 148 | } 149 | } 150 | 151 | Ok(processes) 152 | } 153 | 154 | /// Get all processes using GPUs with enriched information 155 | pub fn get_enriched_gpu_processes(&mut self) -> Result> { 156 | let processes = self.nvml_api.get_gpu_processes()?; 157 | self.enrich_gpu_processes(processes) 158 | } 159 | 160 | /// Validate that a process exists and optionally check GPU usage 161 | pub fn validate_process(&self, pid: u32, check_gpu_usage: bool) -> 
Result<()> { 162 | // Check if process exists 163 | let sys_pid = SysPid::from_u32(pid); 164 | if self.system.process(sys_pid).is_none() { 165 | return Err(anyhow::anyhow!("Process with PID {} not found", pid)); 166 | } 167 | 168 | // Check GPU usage if requested 169 | if check_gpu_usage { 170 | let is_using_gpu = self.is_process_using_gpu(pid)?; 171 | if !is_using_gpu { 172 | return Err(anyhow::anyhow!( 173 | "Process {} is not using any GPU. Use --force to kill anyway.", 174 | pid 175 | )); 176 | } 177 | } 178 | 179 | Ok(()) 180 | } 181 | 182 | /// Get device count 183 | pub fn device_count(&self) -> Result { 184 | self.nvml_api.device_count() 185 | } 186 | 187 | /// Create snapshot 188 | pub fn create_snapshot(&self) -> Result { 189 | self.nvml_api.create_snapshot() 190 | } 191 | 192 | /// Reset GPU 193 | pub fn reset_gpu(&self, index: u32) -> Result<()> { 194 | self.nvml_api.reset_gpu(index) 195 | } 196 | } 197 | 198 | /// Get the username for a process (cross-platform) 199 | fn get_process_user(pid: u32) -> Result { 200 | #[cfg(target_os = "linux")] 201 | { 202 | // On Linux, read from /proc//status 203 | let status_path = format!("/proc/{}/status", pid); 204 | let status = std::fs::read_to_string(&status_path) 205 | .with_context(|| format!("Failed to read process status from {}", status_path))?; 206 | 207 | for line in status.lines() { 208 | if line.starts_with("Uid:") { 209 | let parts: Vec<&str> = line.split_whitespace().collect(); 210 | if parts.len() >= 2 { 211 | let uid = parts[1] 212 | .parse::() 213 | .with_context(|| format!("Failed to parse UID: {}", parts[1]))?; 214 | 215 | // Get username from UID 216 | return get_username_from_uid(uid); 217 | } 218 | } 219 | } 220 | } 221 | 222 | #[cfg(target_os = "macos")] 223 | { 224 | use std::process::Command; 225 | // On macOS, use ps command 226 | let output = Command::new("ps") 227 | .args(["-o", "user=", "-p", &pid.to_string()]) 228 | .output() 229 | .context("Failed to execute ps command")?; 230 | 231 | 
if output.status.success() {
            let user = String::from_utf8_lossy(&output.stdout).trim().to_string();
            if !user.is_empty() {
                return Ok(user);
            }
        }
    }

    #[cfg(target_os = "windows")]
    {
        use std::process::Command;
        // On Windows, query the process via `wmic`.
        // NOTE(review): this only verifies the process exists (has an
        // ExecutablePath) and then returns a placeholder; real owner lookup
        // is not implemented yet — confirm before relying on this value.
        let output = Command::new("wmic")
            .args([
                "process",
                "where",
                &format!("ProcessId={}", pid),
                "get",
                "ExecutablePath",
                "/format:value",
            ])
            .output()
            .context("Failed to execute wmic command")?;

        if output.status.success() {
            let output_str = String::from_utf8_lossy(&output.stdout);
            for line in output_str.lines() {
                if line.starts_with("ExecutablePath=") {
                    let path = line.strip_prefix("ExecutablePath=").unwrap_or("");
                    if !path.is_empty() {
                        // Extract username from path or use a default
                        return Ok("windows_user".to_string());
                    }
                }
            }
        }
    }

    Ok("unknown".to_string())
}

/// Resolve a UID to a username via `getpwuid(3)`.
///
/// Returns `uid_<n>` when the UID has no passwd entry (or the entry has no
/// name), so callers always get a displayable string.
#[cfg(target_os = "linux")]
fn get_username_from_uid(uid: u32) -> Result<String> {
    use std::ffi::CStr;

    // SAFETY: `getpwuid` returns either NULL or a pointer to a passwd record
    // owned by libc (static storage, possibly overwritten by the next call).
    // We only *borrow* `pw_name` through `CStr::from_ptr` and copy the bytes
    // out before returning; we never take ownership of or free libc's buffer.
    //
    // BUG FIX: the previous implementation used `CString::from_raw` on
    // `pw_name`, which claims ownership of memory Rust did not allocate —
    // undefined behavior per the `CString::from_raw` contract — and papered
    // it over with `mem::forget`. Borrowing via `CStr` is the correct form.
    unsafe {
        let passwd = libc::getpwuid(uid as libc::uid_t);
        if passwd.is_null() {
            // No passwd entry for this UID; synthesize a stable placeholder.
            return Ok(format!("uid_{}", uid));
        }

        let name_ptr = (*passwd).pw_name;
        if name_ptr.is_null() {
            return Ok(format!("uid_{}", uid));
        }

        let username = CStr::from_ptr(name_ptr).to_string_lossy().into_owned();
        Ok(username)
    }
}

/// Non-Linux fallback: UID-to-name resolution is not implemented here.
#[cfg(not(target_os = "linux"))]
#[allow(dead_code)]
fn get_username_from_uid(_uid: u32) -> Result<String> {
    Ok("unknown".to_string())
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::nvml_api::NvmlApi;

    #[test]
    fn test_process_info_creation() {
        // Skip this test if NVML is not available
        let nvml_api
= match NvmlApi::new() { 305 | Ok(api) => api, 306 | Err(_) => { 307 | // Skip test if NVML is not available 308 | return; 309 | } 310 | }; 311 | 312 | let mut proc_mgr = ProcessManager::new(nvml_api); 313 | 314 | // Test with a known process (init/systemd) 315 | if let Ok(info) = proc_mgr.get_process_info(1) { 316 | assert_eq!(info.pid, 1); 317 | assert!(!info.name.is_empty()); 318 | } 319 | } 320 | 321 | #[test] 322 | fn test_process_validation() { 323 | // Skip this test if NVML is not available 324 | let nvml_api = match NvmlApi::new() { 325 | Ok(api) => api, 326 | Err(_) => { 327 | // Skip test if NVML is not available 328 | return; 329 | } 330 | }; 331 | 332 | let proc_mgr = ProcessManager::new(nvml_api); 333 | 334 | // Test validation of non-existent process 335 | let result = proc_mgr.validate_process(999999, false); 336 | assert!(result.is_err()); 337 | } 338 | } 339 | -------------------------------------------------------------------------------- /mcp/src/resources.rs: -------------------------------------------------------------------------------- 1 | //! 
MCP Resources for GPU Kill 2 | 3 | use crate::types::*; 4 | use gpukill::audit::AuditManager; 5 | use gpukill::guard_mode::GuardModeManager; 6 | use gpukill::rogue_detection::RogueDetector; 7 | use gpukill::vendor::GpuManager; 8 | use serde_json::json; 9 | use std::collections::HashMap; 10 | 11 | /// Resource handler for GPU Kill MCP server 12 | pub struct ResourceHandler { 13 | gpu_manager: GpuManager, 14 | guard_mode: Option, 15 | rogue_detector: Option, 16 | audit_manager: Option, 17 | } 18 | 19 | impl ResourceHandler { 20 | pub async fn new() -> anyhow::Result { 21 | let gpu_manager = GpuManager::initialize()?; 22 | 23 | // Initialize optional components 24 | let guard_mode = GuardModeManager::new().ok(); 25 | let audit_manager = AuditManager::new().await.ok(); 26 | let rogue_detector = if let Some(am) = audit_manager { 27 | Some(RogueDetector::new(am)) 28 | } else { 29 | None 30 | }; 31 | 32 | Ok(Self { 33 | gpu_manager, 34 | guard_mode, 35 | rogue_detector, 36 | audit_manager: None, // We moved it to rogue_detector 37 | }) 38 | } 39 | 40 | /// List all available resources 41 | pub fn list_resources(&self) -> Vec { 42 | vec![ 43 | Resource { 44 | uri: "gpu://list".to_string(), 45 | name: "GPU List".to_string(), 46 | description: Some("Current GPU status and utilization".to_string()), 47 | mime_type: Some("application/json".to_string()), 48 | }, 49 | Resource { 50 | uri: "gpu://processes".to_string(), 51 | name: "GPU Processes".to_string(), 52 | description: Some("Currently running GPU processes".to_string()), 53 | mime_type: Some("application/json".to_string()), 54 | }, 55 | Resource { 56 | uri: "gpu://audit".to_string(), 57 | name: "GPU Audit".to_string(), 58 | description: Some("Historical GPU usage data".to_string()), 59 | mime_type: Some("application/json".to_string()), 60 | }, 61 | Resource { 62 | uri: "gpu://policies".to_string(), 63 | name: "Guard Mode Policies".to_string(), 64 | description: Some("Current Guard Mode policies".to_string()), 65 | 
mime_type: Some("application/json".to_string()), 66 | }, 67 | Resource { 68 | uri: "gpu://rogue-detection".to_string(), 69 | name: "Rogue Detection".to_string(), 70 | description: Some("Security scan results and threats".to_string()), 71 | mime_type: Some("application/json".to_string()), 72 | }, 73 | ] 74 | } 75 | 76 | /// Get resource contents by URI 77 | pub async fn get_resource(&self, uri: &str) -> anyhow::Result { 78 | match uri { 79 | "gpu://list" => self.get_gpu_list().await, 80 | "gpu://processes" => self.get_gpu_processes().await, 81 | "gpu://audit" => self.get_audit_data().await, 82 | "gpu://policies" => self.get_policies().await, 83 | "gpu://rogue-detection" => self.get_rogue_detection().await, 84 | _ => Err(anyhow::anyhow!("Unknown resource URI: {}", uri)), 85 | } 86 | } 87 | 88 | async fn get_gpu_list(&self) -> anyhow::Result { 89 | let gpus = self.gpu_manager.get_all_snapshots()?; 90 | let gpu_info: Vec = gpus 91 | .into_iter() 92 | .map(|gpu| GpuInfo { 93 | id: gpu.gpu_index as u32, 94 | name: gpu.name, 95 | vendor: gpu.vendor.to_string(), 96 | memory_used: gpu.mem_used_mb as f64, 97 | memory_total: gpu.mem_total_mb as f64, 98 | utilization: gpu.util_pct as f64, 99 | temperature: Some(gpu.temp_c as f64), 100 | power_usage: Some(gpu.power_w as f64), 101 | processes: gpu 102 | .top_proc 103 | .map(|proc| GpuProcess { 104 | pid: proc.pid, 105 | name: proc.proc_name, 106 | memory_usage: proc.used_mem_mb as f64, 107 | user: Some(proc.user), 108 | }) 109 | .into_iter() 110 | .collect(), 111 | }) 112 | .collect(); 113 | 114 | let json_text = serde_json::to_string_pretty(&gpu_info)?; 115 | 116 | Ok(ResourceContents { 117 | uri: "gpu://list".to_string(), 118 | mime_type: Some("application/json".to_string()), 119 | text: Some(json_text), 120 | blob: None, 121 | }) 122 | } 123 | 124 | async fn get_gpu_processes(&self) -> anyhow::Result { 125 | let gpus = self.gpu_manager.get_all_snapshots()?; 126 | let mut all_processes = Vec::new(); 127 | 128 | for gpu in gpus 
// NOTE(review): this chunk opens mid-way through an async resource getter
// (apparently get_gpu_processes); its visible tail is reproduced as-is below.
// Generic parameters stripped by the text extraction (e.g.
// `anyhow::Result<ResourceContents>`) have been restored from the `Ok(...)`
// values — confirm against the original source.
        { // loop body continued from an unseen `for gpu in ...` header
            if let Some(proc) = gpu.top_proc {
                all_processes.push(GpuProcess {
                    pid: proc.pid,
                    name: proc.proc_name,
                    memory_usage: proc.used_mem_mb as f64,
                    user: Some(proc.user),
                });
            }
        }

        let json_text = serde_json::to_string_pretty(&all_processes)?;

        Ok(ResourceContents {
            uri: "gpu://processes".to_string(),
            mime_type: Some("application/json".to_string()),
            text: Some(json_text),
            blob: None,
        })
    }

    /// Return the audit trail resource.
    ///
    /// Currently a stub that always returns an empty JSON array: this struct
    /// has no handle on the audit manager, so real data cannot be produced yet.
    async fn get_audit_data(&self) -> anyhow::Result<ResourceContents> {
        // For now, return empty audit data since we don't have access to audit_manager
        // In a full implementation, we would need to restructure to share the audit_manager
        Ok(ResourceContents {
            uri: "gpu://audit".to_string(),
            mime_type: Some("application/json".to_string()),
            text: Some("[]".to_string()),
            blob: None,
        })
    }

    /// Return the configured guard-mode user policies as pretty-printed JSON.
    ///
    /// Falls back to an empty JSON array when guard mode is not configured.
    async fn get_policies(&self) -> anyhow::Result<ResourceContents> {
        if let Some(guard_mode) = &self.guard_mode {
            let config = guard_mode.get_config();
            let policies: Vec<PolicyInfo> = config
                .user_policies
                .iter()
                .map(|(name, policy)| {
                    // Flatten the policy limits into a generic name -> JSON value map.
                    let mut limits = HashMap::new();
                    limits.insert("memory_limit_gb".to_string(), json!(policy.memory_limit_gb));
                    limits.insert(
                        "utilization_limit_pct".to_string(),
                        json!(policy.utilization_limit_pct),
                    );
                    limits.insert(
                        "process_limit".to_string(),
                        json!(policy.max_concurrent_processes),
                    );

                    PolicyInfo {
                        policy_type: "user".to_string(),
                        name: name.clone(),
                        enabled: true,
                        limits,
                    }
                })
                .collect();

            let json_text = serde_json::to_string_pretty(&policies)?;

            Ok(ResourceContents {
                uri: "gpu://policies".to_string(),
                mime_type: Some("application/json".to_string()),
                text: Some(json_text),
                blob: None,
            })
        } else {
            // Guard mode not configured: expose an empty policy list.
            Ok(ResourceContents {
                uri: "gpu://policies".to_string(),
                mime_type: Some("application/json".to_string()),
                text: Some("[]".to_string()),
                blob: None,
            })
        }
    }

    /// Run rogue-activity detection and return all detected threats as JSON.
    ///
    /// The four threat categories (suspicious processes, crypto miners,
    /// resource abusers, data exfiltrators) are flattened into one list of
    /// `ThreatInfo`. Returns an empty JSON array when no detector is configured.
    async fn get_rogue_detection(&self) -> anyhow::Result<ResourceContents> {
        if let Some(rogue_detector) = &self.rogue_detector {
            // NOTE(review): 24 is presumably a lookback window in hours — confirm
            // against detect_rogue_activity's signature.
            let result = rogue_detector.detect_rogue_activity(24).await?;

            // Combine all threat types into a single list
            let mut all_threats = Vec::new();

            // Add suspicious processes
            for threat in result.suspicious_processes {
                all_threats.push(ThreatInfo {
                    id: format!("suspicious_{}", threat.process.pid),
                    threat_type: "suspicious_process".to_string(),
                    severity: "medium".to_string(),
                    confidence: threat.confidence as f64,
                    description: format!("Suspicious process: {}", threat.process.proc_name),
                    process_info: Some(GpuProcess {
                        pid: threat.process.pid,
                        name: threat.process.proc_name,
                        memory_usage: threat.process.used_mem_mb as f64,
                        user: Some(threat.process.user),
                    }),
                });
            }

            // Add crypto miners
            for threat in result.crypto_miners {
                all_threats.push(ThreatInfo {
                    id: format!("crypto_{}", threat.process.pid),
                    threat_type: "crypto_miner".to_string(),
                    severity: "high".to_string(),
                    confidence: threat.confidence as f64,
                    description: format!("Crypto miner detected: {}", threat.process.proc_name),
                    process_info: Some(GpuProcess {
                        pid: threat.process.pid,
                        name: threat.process.proc_name,
                        memory_usage: threat.process.used_mem_mb as f64,
                        user: Some(threat.process.user),
                    }),
                });
            }

            // Add resource abusers
            for threat in result.resource_abusers {
                all_threats.push(ThreatInfo {
                    id: format!("abuser_{}", threat.process.pid),
                    threat_type: "resource_abuser".to_string(),
                    severity: "medium".to_string(),
                    // NOTE(review): the other branches map `threat.confidence` here;
                    // this one casts a numeric `severity` field into `confidence`.
                    // The cast implies the abuser type has no confidence field —
                    // confirm this mapping is intentional.
                    confidence: threat.severity as f64,
                    description: format!("Resource abuser: {}", threat.process.proc_name),
                    process_info: Some(GpuProcess {
                        pid: threat.process.pid,
                        name: threat.process.proc_name,
                        memory_usage: threat.process.used_mem_mb as f64,
                        user: Some(threat.process.user),
                    }),
                });
            }

            // Add data exfiltrators
            for threat in result.data_exfiltrators {
                all_threats.push(ThreatInfo {
                    id: format!("exfil_{}", threat.process.pid),
                    threat_type: "data_exfiltrator".to_string(),
                    severity: "high".to_string(),
                    confidence: threat.confidence as f64,
                    description: format!("Data exfiltrator: {}", threat.process.proc_name),
                    process_info: Some(GpuProcess {
                        pid: threat.process.pid,
                        name: threat.process.proc_name,
                        memory_usage: threat.process.used_mem_mb as f64,
                        user: Some(threat.process.user),
                    }),
                });
            }

            // (Dropped a redundant `let threat_info = all_threats;` rebinding.)
            let json_text = serde_json::to_string_pretty(&all_threats)?;

            Ok(ResourceContents {
                uri: "gpu://rogue-detection".to_string(),
                mime_type: Some("application/json".to_string()),
                text: Some(json_text),
                blob: None,
            })
        } else {
            // No detector configured: expose an empty threat list.
            Ok(ResourceContents {
                uri: "gpu://rogue-detection".to_string(),
                mime_type: Some("application/json".to_string()),
                text: Some("[]".to_string()),
                blob: None,
            })
        }
    }
}

--------------------------------------------------------------------------------
/src/process_mgmt.rs:
--------------------------------------------------------------------------------
use crate::nvml_api::GpuProc;
use crate::proc::ProcessManager;
use anyhow::Result;
use regex::Regex;
use std::collections::HashMap;
use sysinfo::{Pid as SysPid, System};

/// Enhanced process management with filtering and batch operations
pub struct EnhancedProcessManager {
    pub process_manager: ProcessManager,
// NOTE(review): this chunk opens inside the EnhancedProcessManager struct
// declaration; the remaining field and the impl block follow. Generic
// parameters stripped by the text extraction (e.g. `Result<Vec<GpuProc>>`)
// have been restored from the bodies — confirm against the original source.
    system: System, // cached sysinfo handle, refreshed before each process-table query
}

#[allow(dead_code)]
impl EnhancedProcessManager {
    /// Wrap a `ProcessManager` and take a full snapshot of the system process table.
    pub fn new(process_manager: ProcessManager) -> Self {
        Self {
            process_manager,
            system: System::new_all(),
        }
    }

    /// Filter processes by name pattern (supports regex)
    ///
    /// # Errors
    /// Returns an error when `pattern` is not a valid regular expression.
    pub fn filter_processes_by_name(
        &mut self,
        processes: &[GpuProc],
        pattern: &str,
    ) -> Result<Vec<GpuProc>> {
        let regex = Regex::new(pattern)
            .map_err(|e| anyhow::anyhow!("Invalid regex pattern '{}': {}", pattern, e))?;

        Ok(processes
            .iter()
            .filter(|proc| regex.is_match(&proc.proc_name))
            .cloned()
            .collect())
    }

    /// Filter processes by user (the `user` argument is treated as a regex).
    ///
    /// # Errors
    /// Returns an error when `user` is not a valid regular expression.
    pub fn filter_processes_by_user(
        &mut self,
        processes: &[GpuProc],
        user: &str,
    ) -> Result<Vec<GpuProc>> {
        let regex = Regex::new(user)
            .map_err(|e| anyhow::anyhow!("Invalid regex pattern '{}': {}", user, e))?;

        Ok(processes
            .iter()
            .filter(|proc| regex.is_match(&proc.user))
            .cloned()
            .collect())
    }

    /// Keep only processes whose GPU memory usage is at least `min_mb`.
    pub fn filter_processes_by_memory(
        &mut self,
        processes: &[GpuProc],
        min_mb: u32,
    ) -> Vec<GpuProc> {
        processes
            .iter()
            .filter(|proc| proc.used_mem_mb >= min_mb)
            .cloned()
            .collect()
    }

    /// Get the PIDs of `root_pid` and all of its transitive children.
    ///
    /// FIX: skips PIDs already collected. The previous version pushed every
    /// match unconditionally, so duplicate parent links (or a cycle caused by
    /// PID reuse in sysinfo's snapshot) produced duplicate PIDs and could loop
    /// forever.
    pub fn get_process_tree(&mut self, root_pid: u32) -> Result<Vec<u32>> {
        self.system.refresh_processes();

        let mut pids = Vec::new();
        let mut to_process = vec![root_pid];

        while let Some(pid) = to_process.pop() {
            if pids.contains(&pid) {
                continue; // already visited — guards against cycles/duplicates
            }
            pids.push(pid);

            // Queue the direct children of `pid` for expansion.
            for process in self.system.processes().values() {
                if let Some(parent) = process.parent() {
                    if parent.as_u32() == pid {
                        to_process.push(process.pid().as_u32());
                    }
                }
            }
        }

        Ok(pids)
    }

    /// Kill a process and its children.
    ///
    /// Children are killed before the parent; individual kill failures are
    /// logged and skipped rather than aborting the whole tree.
    pub fn kill_process_tree(
        &mut self,
        root_pid: u32,
        timeout_secs: u16,
        force: bool,
    ) -> Result<()> {
        let pids = self.get_process_tree(root_pid)?;

        tracing::info!("Killing process tree: {:?}", pids);

        // Kill children first, then parent
        for pid in pids.iter().rev() {
            if let Err(e) = self
                .process_manager
                .graceful_kill(*pid, timeout_secs, force)
            {
                tracing::warn!("Failed to kill process {}: {}", pid, e);
            }
        }

        Ok(())
    }

    /// Batch kill the given processes.
    ///
    /// Returns the PIDs that were killed, or — when any kill fails — an error
    /// listing the failed PIDs (successful kills still happened in that case).
    pub fn batch_kill_processes(
        &mut self,
        processes: &[GpuProc],
        timeout_secs: u16,
        force: bool,
    ) -> Result<Vec<u32>> {
        let mut killed_pids = Vec::new();
        let mut failed_pids = Vec::new();

        for proc in processes {
            match self
                .process_manager
                .graceful_kill(proc.pid, timeout_secs, force)
            {
                Ok(()) => {
                    killed_pids.push(proc.pid);
                    tracing::info!(
                        "Successfully killed process {} ({})",
                        proc.pid,
                        proc.proc_name
                    );
                }
                Err(e) => {
                    failed_pids.push(proc.pid);
                    tracing::warn!(
                        "Failed to kill process {} ({}): {}",
                        proc.pid,
                        proc.proc_name,
                        e
                    );
                }
            }
        }

        if !failed_pids.is_empty() {
            return Err(anyhow::anyhow!(
                "Failed to kill {} processes: {:?}",
                failed_pids.len(),
                failed_pids
            ));
        }

        Ok(killed_pids)
    }

    /// Best-effort detection of the container runtime a process runs under.
    ///
    /// Heuristic only: scans the command line and environment for well-known
    /// runtime markers. Returns `Ok(None)` when nothing matches.
    ///
    /// # Errors
    /// Errors when `pid` is not present in the refreshed process table.
    pub fn detect_container(&mut self, pid: u32) -> Result<Option<String>> {
        self.system.refresh_processes();

        let sys_pid = SysPid::from_u32(pid);
        let process = self
            .system
            .process(sys_pid)
            .ok_or_else(|| anyhow::anyhow!("Process {} not found", pid))?;

        // Check for common container indicators
        let cmdline = process.cmd().join(" ");

        // Docker
        if cmdline.contains("docker") || cmdline.contains("containerd") {
            return Ok(Some("docker".to_string()));
        }

        // Podman
        if cmdline.contains("podman") {
            return Ok(Some("podman".to_string()));
        }

        // Kubernetes
        if cmdline.contains("kubelet") || cmdline.contains("k8s") {
            return Ok(Some("kubernetes".to_string()));
        }

        // LXC
        if cmdline.contains("lxc") {
            return Ok(Some("lxc".to_string()));
        }

        // Check environment variables for container indicators
        let env = process.environ();
        for env_var in env {
            if env_var.starts_with("CONTAINER")
                || env_var.starts_with("DOCKER")
                || env_var.starts_with("KUBERNETES")
            {
                return Ok(Some("container".to_string()));
            }
        }

        Ok(None)
    }

    /// Annotate each process with its detected container runtime (best effort).
    ///
    /// Detection failures are logged and leave `container` as `None` rather
    /// than failing the whole batch.
    pub fn enrich_with_containers(&mut self, mut processes: Vec<GpuProc>) -> Result<Vec<GpuProc>> {
        for proc in &mut processes {
            match self.detect_container(proc.pid) {
                Ok(container) => proc.container = container,
                Err(e) => {
                    tracing::warn!("Failed to detect container for PID {}: {}", proc.pid, e);
                    proc.container = None;
                }
            }
        }

        Ok(processes)
    }

    /// Aggregate totals and per-user / per-name / per-container counts.
    pub fn get_process_stats(&mut self, processes: &[GpuProc]) -> ProcessStats {
        let mut stats = ProcessStats::default();

        for proc in processes {
            stats.total_processes += 1;
            // FIX: saturate instead of overflowing — a u32 total across many
            // large processes could otherwise panic in debug builds.
            stats.total_memory_mb = stats.total_memory_mb.saturating_add(proc.used_mem_mb);

            // Count by user
            *stats.users.entry(proc.user.clone()).or_insert(0) += 1;

            // Count by process name
            *stats
                .process_names
                .entry(proc.proc_name.clone())
                .or_insert(0) += 1;

            // Count containers
            if let Some(container) =
// NOTE(review): this chunk opens mid-statement inside get_process_stats();
// the container-counting branch below completes the truncated `if let`.
&proc.container {
                *stats.containers.entry(container.clone()).or_insert(0) += 1;
            } else {
                stats.non_container_processes += 1;
            }
        }

        stats
    }
}

/// Process statistics
///
/// NOTE(review): the map value types were stripped by the text extraction;
/// `usize` counters are restored here from the `or_insert(0) += 1` usage —
/// confirm against the original source.
#[derive(Debug, Default)]
pub struct ProcessStats {
    pub total_processes: usize,
    pub total_memory_mb: u32,
    pub non_container_processes: usize,
    pub users: HashMap<String, usize>,
    pub process_names: HashMap<String, usize>,
    pub containers: HashMap<String, usize>,
}

impl std::fmt::Display for ProcessStats {
    /// Render a human-readable multi-line report.
    ///
    /// FIX: counter sections are printed in sorted key order so the output is
    /// deterministic (HashMap iteration order is unspecified).
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        // Print one labelled, key-sorted counter section; skipped when empty.
        fn section(
            f: &mut std::fmt::Formatter<'_>,
            label: &str,
            map: &HashMap<String, usize>,
        ) -> std::fmt::Result {
            if !map.is_empty() {
                writeln!(f, "  {}:", label)?;
                let mut entries: Vec<_> = map.iter().collect();
                entries.sort_by(|a, b| a.0.cmp(b.0));
                for (key, count) in entries {
                    writeln!(f, "    {}: {}", key, count)?;
                }
            }
            Ok(())
        }

        writeln!(f, "Process Statistics:")?;
        writeln!(f, "  Total processes: {}", self.total_processes)?;
        writeln!(f, "  Total memory: {} MB", self.total_memory_mb)?;
        writeln!(
            f,
            "  Non-container processes: {}",
            self.non_container_processes
        )?;

        section(f, "Users", &self.users)?;
        section(f, "Process names", &self.process_names)?;
        section(f, "Containers", &self.containers)?;

        Ok(())
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::nvml_api::GpuProc;

    /// Build a minimal GpuProc fixture for the tests below.
    fn create_test_process(pid: u32, name: &str, user: &str, memory: u32) -> GpuProc {
        GpuProc {
            gpu_index: 0,
            pid,
            user: user.to_string(),
            proc_name: name.to_string(),
            used_mem_mb: memory,
            start_time: "1h".to_string(),
            container: None,
        }
    }

    #[test]
    fn test_filter_processes_by_name() {
        let processes = vec![
            create_test_process(1, "python", "user1", 100),
            create_test_process(2, "python3", "user1", 200),
            create_test_process(3, "java", "user2", 300),
        ];

        // Skip test if NVML is not available
        if let Ok(nvml_api) = crate::nvml_api::NvmlApi::new() {
            let mut manager = EnhancedProcessManager {
                process_manager: ProcessManager::new(nvml_api),
                system: System::new_all(),
            };

            let filtered = manager
                .filter_processes_by_name(&processes, "python")
                .unwrap();
            assert_eq!(filtered.len(), 2);
            assert_eq!(filtered[0].proc_name, "python");
            assert_eq!(filtered[1].proc_name, "python3");
        }
    }

    #[test]
    fn test_filter_processes_by_memory() {
        let processes = vec![
            create_test_process(1, "python", "user1", 100),
            create_test_process(2, "python3", "user1", 200),
            create_test_process(3, "java", "user2", 300),
        ];

        // Skip test if NVML is not available
        if let Ok(nvml_api) = crate::nvml_api::NvmlApi::new() {
            let mut manager = EnhancedProcessManager {
                process_manager: ProcessManager::new(nvml_api),
                system: System::new_all(),
            };

            let filtered = manager.filter_processes_by_memory(&processes, 200);
            assert_eq!(filtered.len(), 2);
            assert!(filtered.iter().all(|p| p.used_mem_mb >= 200));
        }
    }

    #[test]
    fn test_process_stats() {
        let processes = vec![
            create_test_process(1, "python", "user1", 100),
            create_test_process(2, "python", "user1", 200),
            create_test_process(3, "java", "user2", 300),
        ];

        // Skip test if NVML is not available
        if let Ok(nvml_api) = crate::nvml_api::NvmlApi::new() {
            let mut manager = EnhancedProcessManager {
                process_manager: ProcessManager::new(nvml_api),
                system: System::new_all(),
            };

            let stats = manager.get_process_stats(&processes);
            assert_eq!(stats.total_processes, 3);
            assert_eq!(stats.total_memory_mb, 600);
            assert_eq!(stats.users.len(), 2);
            assert_eq!(stats.process_names.len(), 2);
        }
    }
}
--------------------------------------------------------------------------------