├── screenshot.png
├── dashboard
├── assets
│ ├── img
│ │ ├── logo.png
│ │ └── screenshot.png
│ └── css
│ │ └── main.css
├── public
│ ├── favicon.ico
│ ├── screenshot.png
│ ├── robots.txt
│ ├── browserconfig.xml
│ ├── sitemap.xml
│ └── site.webmanifest
├── app
│ └── app.vue
├── .gitignore
├── tsconfig.json
├── package.json
├── nuxt.config.ts
├── README.md
├── components
│ └── AppSidebar.vue
└── tailwind.config.js
├── src
├── lib.rs
├── version.rs
├── util.rs
├── config.rs
├── remote.rs
├── hotaisle_client.rs
├── render.rs
├── proc.rs
└── process_mgmt.rs
├── mcp
├── src
│ ├── lib.rs
│ ├── main.rs
│ ├── types.rs
│ ├── server.rs
│ └── resources.rs
├── Cargo.toml
└── README.md
├── .gitignore
├── audit.toml
├── Cargo.toml
├── scripts
├── install.ps1
├── test-hotaisle-integration-simple.sh
├── install.sh
├── setup-gpu-runner.sh
└── run-gpu-tests.sh
├── .github
└── workflows
│ ├── test-hotaisle-integration.yml
│ ├── release.yml
│ ├── hotaisle-gpu-testing.yml
│ ├── gpu-testing.yml
│ └── self-hosted-setup.md
├── LICENSE
├── docs
├── CLOUD_GPU_SETUP.md
└── HOTAISLE_INTEGRATION.md
├── README.md
└── deny.toml
/screenshot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/treadiehq/gpu-kill/HEAD/screenshot.png
--------------------------------------------------------------------------------
/dashboard/assets/img/logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/treadiehq/gpu-kill/HEAD/dashboard/assets/img/logo.png
--------------------------------------------------------------------------------
/dashboard/public/favicon.ico:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/treadiehq/gpu-kill/HEAD/dashboard/public/favicon.ico
--------------------------------------------------------------------------------
/dashboard/public/screenshot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/treadiehq/gpu-kill/HEAD/dashboard/public/screenshot.png
--------------------------------------------------------------------------------
/dashboard/assets/img/screenshot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/treadiehq/gpu-kill/HEAD/dashboard/assets/img/screenshot.png
--------------------------------------------------------------------------------
/dashboard/app/app.vue:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
--------------------------------------------------------------------------------
/dashboard/.gitignore:
--------------------------------------------------------------------------------
1 | # Nuxt dev/build outputs
2 | .output
3 | .data
4 | .nuxt
5 | .nitro
6 | .cache
7 | dist
8 |
9 | # Node dependencies
10 | node_modules
11 |
12 | # Logs
13 | logs
14 | *.log
15 |
16 | # Misc
17 | .DS_Store
18 | .fleet
19 | .idea
20 |
21 | # Local env files
22 | .env
23 | .env.*
24 | !.env.example
25 |
--------------------------------------------------------------------------------
/dashboard/public/robots.txt:
--------------------------------------------------------------------------------
1 | User-agent: *
2 | Allow: /
3 |
4 | # Sitemap
5 | Sitemap: https://gpukill.com/sitemap.xml
6 |
7 | # Crawl-delay for respectful crawling
8 | Crawl-delay: 1
9 |
10 | # Disallow admin or sensitive areas (if any)
11 | # Disallow: /admin/
12 | # Disallow: /api/
13 |
14 | # Allow all other content
15 | Allow: /dashboard/
16 | Allow: /docs/
17 | Allow: /assets/
--------------------------------------------------------------------------------
/src/lib.rs:
--------------------------------------------------------------------------------
1 | pub mod args;
2 | pub mod audit;
3 | pub mod config;
4 | pub mod coordinator;
5 | pub mod guard_mode;
6 | pub mod nvml_api;
7 | pub mod proc;
8 | pub mod process_mgmt;
9 | pub mod remote;
10 | pub mod render;
11 | pub mod rogue_config;
12 | pub mod rogue_detection;
13 | pub mod util;
14 | pub mod vendor;
15 | pub mod version;
16 |
17 | #[cfg(feature = "hotaisle")]
18 | pub mod hotaisle_client;
19 |
--------------------------------------------------------------------------------
/dashboard/tsconfig.json:
--------------------------------------------------------------------------------
1 | {
2 | // https://nuxt.com/docs/guide/concepts/typescript
3 | "files": [],
4 | "references": [
5 | {
6 | "path": "./.nuxt/tsconfig.app.json"
7 | },
8 | {
9 | "path": "./.nuxt/tsconfig.server.json"
10 | },
11 | {
12 | "path": "./.nuxt/tsconfig.shared.json"
13 | },
14 | {
15 | "path": "./.nuxt/tsconfig.node.json"
16 | }
17 | ]
18 | }
19 |
--------------------------------------------------------------------------------
/dashboard/public/browserconfig.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="utf-8"?>
2 | <browserconfig>
3 | <msapplication>
4 | <tile>
5 | <square70x70logo src="/mstile-70x70.png"/>
6 | <square150x150logo src="/mstile-150x150.png"/>
7 | <square310x310logo src="/mstile-310x310.png"/>
8 | <wide310x150logo src="/mstile-310x150.png"/>
9 | <TileColor>#1e40af</TileColor>
10 | </tile>
11 | </msapplication>
12 | </browserconfig>
13 |
--------------------------------------------------------------------------------
/mcp/src/lib.rs:
--------------------------------------------------------------------------------
1 | //! GPU Kill MCP Server
2 | //!
3 | //! This module provides a Model Context Protocol (MCP) server for GPU Kill,
4 | //! enabling AI assistants and other tools to interact with GPU management
5 | //! functionality through a standardized interface.
6 |
7 | pub mod resources;
8 | pub mod server;
9 | pub mod tools;
10 | pub mod types;
11 |
12 | pub use server::GpuKillMCPServer;
13 | pub use types::*;
14 |
15 | /// MCP Server version
16 | pub const MCP_VERSION: &str = "2024-11-05";
17 |
18 | /// GPU Kill MCP Server capabilities
19 | pub const CAPABILITIES: &[&str] = &["resources", "tools", "logging"];
20 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Rust
2 | /target/
3 | **/*.rs.bk
4 | Cargo.lock
5 |
6 | # IDE
7 | .vscode/
8 | .idea/
9 | *.swp
10 | *.swo
11 | *~
12 |
13 | # OS
14 | .DS_Store
15 | .DS_Store?
16 | ._*
17 | .Spotlight-V100
18 | .Trashes
19 | ehthumbs.db
20 | Thumbs.db
21 |
22 | # Logs
23 | *.log
24 |
25 | # Temporary files
26 | *.tmp
27 | *.temp
28 |
29 | # Build artifacts
30 | dist/
31 | *.tar.gz
32 | *.zip
33 |
34 | # Configuration files (optional)
35 | config.toml
36 | .env
37 |
38 | # Test artifacts
39 | test_output/
40 | coverage/
41 |
42 | # Documentation build
43 | book/
44 | .DS_Store
45 |
46 | # Dashboard (separate project)
47 | REMOVED.md
--------------------------------------------------------------------------------
/dashboard/public/sitemap.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
3 | <url>
4 | <loc>https://gpukill.com/</loc>
5 | <lastmod>2024-01-20</lastmod>
6 | <changefreq>daily</changefreq>
7 | <priority>1.0</priority>
8 | </url>
9 | <url>
10 | <loc>https://gpukill.com/dashboard/</loc>
11 | <lastmod>2024-01-20</lastmod>
12 | <changefreq>daily</changefreq>
13 | <priority>0.9</priority>
14 | </url>
15 | <url>
16 | <loc>https://gpukill.com/docs/</loc>
17 | <lastmod>2024-01-20</lastmod>
18 | <changefreq>weekly</changefreq>
19 | <priority>0.8</priority>
20 | </url>
21 | </urlset>
22 |
--------------------------------------------------------------------------------
/audit.toml:
--------------------------------------------------------------------------------
1 | # Cargo audit configuration
2 | [advisories]
3 | # Allow unmaintained crates (warnings only, not errors)
4 | unmaintained = "warn"
5 |
6 | # License configuration
7 | [licenses]
8 | # Allow common open source licenses
9 | allow = [
10 | "MIT",
11 | "Apache-2.0",
12 | "Apache-2.0 OR MIT",
13 | "BSD-2-Clause",
14 | "BSD-3-Clause",
15 | "ISC",
16 | "Unlicense",
17 | "0BSD",
18 | "Zlib",
19 | "CC0-1.0",
20 | "MPL-2.0",
21 | "LGPL-2.1",
22 | "LGPL-3.0",
23 | "GPL-2.0",
24 | "GPL-3.0",
25 | ]
26 |
27 | # Deny proprietary licenses
28 | deny = [
29 | "proprietary",
30 | "commercial",
31 | ]
32 |
33 | # Allow unknown licenses (for crates without explicit license info)
34 | unknown = "warn"
35 |
--------------------------------------------------------------------------------
/dashboard/package.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "gpukill-dashboard",
3 | "private": true,
4 | "type": "module",
5 | "scripts": {
6 | "build": "nuxt build",
7 | "dev": "nuxt dev --port 3000",
8 | "generate": "nuxt generate",
9 | "preview": "nuxt preview",
10 | "postinstall": "nuxt prepare",
11 | "start": "nuxt dev --port 3000"
12 | },
13 | "dependencies": {
14 | "@headlessui/vue": "^1.7.23",
15 | "@heroicons/vue": "^2.2.0",
16 | "@nuxtjs/tailwindcss": "^6.14.0",
17 | "@tailwindcss/aspect-ratio": "^0.4.2",
18 | "@tailwindcss/forms": "^0.5.10",
19 | "@tailwindcss/typography": "^0.5.18",
20 | "chart.js": "^4.5.0",
21 | "nuxt": "^3.13.0",
22 | "vue": "^3.5.18",
23 | "vue-chartjs": "^5.3.2",
24 | "vue-router": "^4.5.1"
25 | }
26 | }
27 |
--------------------------------------------------------------------------------
/mcp/Cargo.toml:
--------------------------------------------------------------------------------
1 | [package]
2 | name = "gpukill-mcp"
3 | version = "0.1.0"
4 | edition = "2021"
5 | authors = ["GPU Kill Team"]
6 | description = "MCP server for GPU Kill - AI-accessible GPU management"
7 | license = "MIT"
8 | repository = "https://github.com/treadiehq/gpu-kill"
9 |
10 | [dependencies]
11 | # Core MCP dependencies
12 | tokio = { version = "1.0", features = ["rt", "rt-multi-thread", "net", "fs", "macros"] }
13 | serde = { version = "1.0", features = ["derive"] }
14 | serde_json = "1.0"
15 | anyhow = "1.0"
16 | tracing = "0.1"
17 | tracing-subscriber = { version = "0.3", features = ["env-filter"] }
18 |
19 | # GPU Kill integration
20 | gpukill = { path = "../" }
21 |
22 | # HTTP server for MCP protocol
23 | axum = { version = "0.7", features = ["ws", "macros"] }
24 | tower = "0.4"
25 | tower-http = { version = "0.5", features = ["cors", "trace"] }
26 |
27 | # JSON-RPC for MCP protocol
28 | jsonrpc-core = "18.0"
29 | jsonrpc-derive = "18.0"
30 | jsonrpc-ws-server = "18.0"
31 |
32 | # UUID for request IDs
33 | uuid = { version = "1.0", features = ["v4", "serde"] }
34 |
35 | [dev-dependencies]
36 | tempfile = "3.0"
37 |
--------------------------------------------------------------------------------
/src/version.rs:
--------------------------------------------------------------------------------
1 | /// Version information for the gpukill CLI tool
2 | pub const VERSION: &str = env!("CARGO_PKG_VERSION");
3 |
4 | /// Build information
5 | pub const BUILD_DATE: &str = env!("BUILD_DATE");
6 | pub const BUILD_TARGET: &str = env!("BUILD_TARGET");
7 | #[allow(dead_code)]
8 | pub const GIT_COMMIT: &str = env!("GIT_COMMIT");
9 |
10 | /// Get formatted version string
11 | pub fn get_version_string() -> String {
12 | format!("gpukill {} ({} {})", VERSION, BUILD_TARGET, BUILD_DATE)
13 | }
14 |
15 | /// Get detailed version information
16 | #[allow(dead_code)]
17 | pub fn get_detailed_version() -> String {
18 | format!(
19 | "gpukill version {}\n\
20 | Build target: {}\n\
21 | Build date: {}\n\
22 | Git commit: {}",
23 | VERSION, BUILD_TARGET, BUILD_DATE, GIT_COMMIT
24 | )
25 | }
26 |
27 | #[cfg(test)]
28 | mod tests {
29 | use super::*;
30 |
31 | #[test]
32 | fn test_version_string_format() {
33 | let version = get_version_string();
34 | assert!(version.contains("gpukill"));
35 | assert!(version.contains(VERSION));
36 | }
37 |
38 | #[test]
39 | fn test_detailed_version_format() {
40 | let detailed = get_detailed_version();
41 | assert!(detailed.contains("gpukill version"));
42 | assert!(detailed.contains(VERSION));
43 | assert!(detailed.contains("Build target:"));
44 | assert!(detailed.contains("Build date:"));
45 | assert!(detailed.contains("Git commit:"));
46 | }
47 | }
48 |
--------------------------------------------------------------------------------
/dashboard/nuxt.config.ts:
--------------------------------------------------------------------------------
1 | // https://nuxt.com/docs/api/configuration/nuxt-config
2 | export default defineNuxtConfig({
3 | compatibilityDate: '2024-04-03',
4 | devtools: { enabled: true },
5 | modules: [
6 | '@nuxtjs/tailwindcss'
7 | ],
8 | runtimeConfig: {
9 | public: {
10 | apiBase: process.env.API_BASE || 'http://localhost:8080'
11 | }
12 | },
13 | ssr: true,
14 | app: {
15 | head: {
16 | title: 'GPU Kill - Cluster Management Dashboard',
17 | titleTemplate: '%s',
18 | meta: [
19 | { charset: 'utf-8' },
20 | { name: 'viewport', content: 'width=device-width, initial-scale=1' },
21 | { name: 'format-detection', content: 'telephone=no' },
22 | { name: 'theme-color', content: '#1e40af' }
23 | ],
24 | link: [
25 | { rel: 'icon', type: 'image/x-icon', href: '/favicon.ico' },
26 | { rel: 'preconnect', href: 'https://fonts.googleapis.com' },
27 | { rel: 'preconnect', href: 'https://fonts.gstatic.com', crossorigin: '' }
28 | ],
29 | style: [
30 | {
31 | innerHTML: `
32 | html, body, #__nuxt {
33 | background: #000000 !important;
34 | background-color: #000000 !important;
35 | overscroll-behavior: none !important;
36 | }
37 | * {
38 | overscroll-behavior: none !important;
39 | }
40 | `
41 | }
42 | ]
43 | }
44 | },
45 | nitro: {
46 | devProxy: {
47 | '/api': {
48 | target: 'http://localhost:8080/api',
49 | changeOrigin: true
50 | }
51 | }
52 | }
53 | })
54 |
--------------------------------------------------------------------------------
/mcp/src/main.rs:
--------------------------------------------------------------------------------
1 | //! GPU Kill MCP Server - Main entry point
2 |
3 | use gpukill_mcp::GpuKillMCPServer;
4 | use std::env;
5 | use tracing::{error, info};
6 |
7 | #[tokio::main]
8 | async fn main() -> anyhow::Result<()> {
9 | // Initialize logging
10 | tracing_subscriber::fmt()
11 | .with_env_filter(tracing_subscriber::EnvFilter::from_default_env())
12 | .init();
13 |
14 | info!("Starting GPU Kill MCP Server");
15 |
16 | // Get port from environment or use default
17 | let port = env::var("MCP_PORT")
18 | .unwrap_or_else(|_| "3001".to_string())
19 | .parse::<u16>()
20 | .unwrap_or(3001);
21 |
22 | // Create and start the MCP server
23 | let server = GpuKillMCPServer::new().await?;
24 |
25 | info!("GPU Kill MCP Server initialized successfully");
26 | info!("Available resources:");
27 | info!(" - gpu://list - Current GPU status and utilization");
28 | info!(" - gpu://processes - Currently running GPU processes");
29 | info!(" - gpu://audit - Historical GPU usage data");
30 | info!(" - gpu://policies - Current Guard Mode policies");
31 | info!(" - gpu://rogue-detection - Security scan results");
32 |
33 | info!("Available tools:");
34 | info!(" - kill_gpu_process - Kill a GPU process by PID");
35 | info!(" - reset_gpu - Reset a GPU by ID");
36 | info!(" - scan_rogue_activity - Scan for suspicious GPU activity");
37 | info!(" - create_user_policy - Create a user policy for Guard Mode");
38 | info!(" - get_gpu_status - Get detailed status of a specific GPU");
39 | info!(" - kill_processes_by_name - Kill all processes matching a name pattern");
40 |
41 | // Start the server
42 | if let Err(e) = server.start(port).await {
43 | error!("Failed to start MCP server: {}", e);
44 | return Err(e);
45 | }
46 |
47 | Ok(())
48 | }
49 |
--------------------------------------------------------------------------------
/Cargo.toml:
--------------------------------------------------------------------------------
1 | [workspace]
2 | members = [
3 | ".",
4 | "mcp",
5 | ]
6 |
7 | [package]
8 | name = "gpukill"
9 | version = "0.1.8"
10 | edition = "2021"
11 | authors = ["Kage "]
12 | description = "A CLI tool for GPU management and monitoring supporting NVIDIA, AMD, Intel, and Apple Silicon GPUs"
13 | license = "FSL-1.1-MIT"
14 | repository = "https://github.com/treadiehq/gpu-kill"
15 | keywords = ["gpu", "nvidia", "amd", "intel", "apple", "metal", "nvml", "rocm", "cli", "monitoring"]
16 | categories = ["command-line-utilities", "development-tools"]
17 |
18 | [[bin]]
19 | name = "gpukill"
20 | path = "src/main.rs"
21 |
22 | [dependencies]
23 | clap = { version = "4.4", features = ["derive", "env"] }
24 | tabled = "0.15"
25 | tracing = "0.1"
26 | tracing-subscriber = { version = "0.3", features = ["env-filter"] }
27 | nvml-wrapper = "0.11"
28 | sysinfo = "0.30"
29 | color-eyre = "0.6"
30 | serde = { version = "1.0", features = ["derive"] }
31 | serde_json = "1.0"
32 | chrono = { version = "0.4", features = ["serde"] }
33 | nix = { version = "0.27", features = ["process", "signal"] }
34 | tokio = { version = "1.0", features = ["rt", "time", "process", "net", "fs"] }
35 | anyhow = "1.0"
36 | hostname = "0.3"
37 | libc = "0.2"
38 | toml = "0.8"
39 | dirs = "5.0"
40 | regex = "1.10"
41 | glob = "0.3"
42 | reqwest = { version = "0.11", features = ["json"] }
43 |
44 | # HTTP server dependencies
45 | axum = { version = "0.7", features = ["ws", "macros"] }
46 | tower = "0.4"
47 | tower-http = { version = "0.5", features = ["cors", "trace"] }
48 | uuid = { version = "1.0", features = ["v4", "serde"] }
49 | futures-util = "0.3"
50 |
51 | # SSH remote support (using system SSH for now)
52 | # ssh2 = "0.9"
53 | # rpassword = "7.3"
54 |
55 |
56 | # Apple Silicon GPU support
57 | [target.'cfg(target_os = "macos")'.dependencies]
58 | core-foundation = "0.9"
59 | core-foundation-sys = "0.8"
60 | io-kit-sys = "0.2"
61 |
62 | [dev-dependencies]
63 | tempfile = "3.0"
64 | mockall = "0.12"
65 |
66 | [build-dependencies]
67 | chrono = "0.4"
68 |
69 | [features]
70 | default = []
71 | mock_nvml = []
72 | hotaisle = []
73 |
74 | [profile.release]
75 | # Optimized for faster builds during development
76 | lto = "thin" # Much faster than "true" (fat LTO)
77 | codegen-units = 4 # Allow parallel codegen for faster builds
78 | panic = "abort"
79 | strip = true
80 |
81 | # Fast release profile for development
82 | [profile.release-fast]
83 | inherits = "release"
84 | lto = false
85 | codegen-units = 16
86 | opt-level = 2 # Slightly less optimization for speed
87 |
88 | # Maximum optimization profile for final releases
89 | [profile.release-max]
90 | inherits = "release"
91 | lto = true
92 | codegen-units = 1
93 | opt-level = 3
94 |
--------------------------------------------------------------------------------
/dashboard/public/site.webmanifest:
--------------------------------------------------------------------------------
1 | {
2 | "name": "GPU Kill - Cluster Management Dashboard",
3 | "short_name": "GPU Kill",
4 | "description": "Professional GPU cluster management dashboard with real-time monitoring, rogue detection, and policy enforcement.",
5 | "start_url": "/",
6 | "display": "standalone",
7 | "background_color": "#0f172a",
8 | "theme_color": "#1e40af",
9 | "orientation": "portrait-primary",
10 | "scope": "/",
11 | "lang": "en",
12 | "categories": ["developer", "productivity", "utilities"],
13 | "icons": [
14 | {
15 | "src": "/favicon-16x16.png",
16 | "sizes": "16x16",
17 | "type": "image/png"
18 | },
19 | {
20 | "src": "/favicon-32x32.png",
21 | "sizes": "32x32",
22 | "type": "image/png"
23 | },
24 | {
25 | "src": "/apple-touch-icon.png",
26 | "sizes": "180x180",
27 | "type": "image/png"
28 | },
29 | {
30 | "src": "/android-chrome-192x192.png",
31 | "sizes": "192x192",
32 | "type": "image/png",
33 | "purpose": "any maskable"
34 | },
35 | {
36 | "src": "/android-chrome-512x512.png",
37 | "sizes": "512x512",
38 | "type": "image/png",
39 | "purpose": "any maskable"
40 | }
41 | ],
42 | "screenshots": [
43 | {
44 | "src": "/screenshot-desktop.png",
45 | "sizes": "1280x720",
46 | "type": "image/png",
47 | "form_factor": "wide",
48 | "label": "GPU Kill Dashboard - Desktop View"
49 | },
50 | {
51 | "src": "/screenshot-mobile.png",
52 | "sizes": "390x844",
53 | "type": "image/png",
54 | "form_factor": "narrow",
55 | "label": "GPU Kill Dashboard - Mobile View"
56 | }
57 | ],
58 | "shortcuts": [
59 | {
60 | "name": "Cluster Overview",
61 | "short_name": "Overview",
62 | "description": "View cluster overview and statistics",
63 | "url": "/#cluster-overview",
64 | "icons": [
65 | {
66 | "src": "/shortcut-overview.png",
67 | "sizes": "96x96"
68 | }
69 | ]
70 | },
71 | {
72 | "name": "Rogue Detection",
73 | "short_name": "Rogue",
74 | "description": "Scan for suspicious GPU activities",
75 | "url": "/#rogue-detection",
76 | "icons": [
77 | {
78 | "src": "/shortcut-rogue.png",
79 | "sizes": "96x96"
80 | }
81 | ]
82 | },
83 | {
84 | "name": "Guard Mode",
85 | "short_name": "Guard",
86 | "description": "Manage policy enforcement",
87 | "url": "/#guard-mode",
88 | "icons": [
89 | {
90 | "src": "/shortcut-guard.png",
91 | "sizes": "96x96"
92 | }
93 | ]
94 | }
95 | ]
96 | }
97 |
--------------------------------------------------------------------------------
/scripts/install.ps1:
--------------------------------------------------------------------------------
1 | # gpukill Windows installer: prefer winget, fallback to zip from GitHub Releases
2 |
3 | param(
4 | [string]$Version = "",
5 | [string]$BinDir = "$env:LOCALAPPDATA\Programs\gpukill",
6 | [switch]$Yes,
7 | [switch]$Insecure
8 | )
9 |
10 | $ErrorActionPreference = "Stop"
11 |
12 | function Get-Arch {
13 | if ([System.Environment]::Is64BitOperatingSystem) { return "x86_64" } else { return "x86" }
14 | }
15 |
16 | # Try winget first
17 | try {
18 | if (Get-Command winget -ErrorAction SilentlyContinue) {
19 | winget install --id TreadieHQ.GPUKill --silent --accept-package-agreements --accept-source-agreements
20 | if ($LASTEXITCODE -eq 0) { Write-Host "✅ Installed via winget"; exit 0 }
21 | }
22 | } catch {}
23 |
24 | # Fallback to GitHub Releases
25 | $Owner = "treadiehq"
26 | $Repo = "gpu-kill"
27 | if ($Version -ne "") {
28 | $ApiUrl = "https://api.github.com/repos/$Owner/$Repo/releases/tags/$Version"
29 | } else {
30 | $ApiUrl = "https://api.github.com/repos/$Owner/$Repo/releases/latest"
31 | }
32 |
33 | Write-Host "Resolving release…"
34 | $resp = Invoke-RestMethod -Uri $ApiUrl -UseBasicParsing
35 | $Tag = $resp.tag_name
36 | if (-not $Tag) { throw "Failed to resolve release tag" }
37 |
38 | $arch = Get-Arch
39 | $assetName = "gpukill-$Tag-windows-$arch.zip"
40 | $asset = $resp.assets | Where-Object { $_.name -eq $assetName }
41 | if (-not $asset) { throw "No asset named $assetName in release $Tag" }
42 |
43 | $tmp = New-Item -ItemType Directory -Path ([System.IO.Path]::GetTempPath() + [System.Guid]::NewGuid())
44 | $zipPath = Join-Path $tmp $assetName
45 | $sumsAsset = $resp.assets | Where-Object { $_.name -eq 'SHA256SUMS' }
46 | $sumsPath = Join-Path $tmp 'SHA256SUMS'
47 |
48 | Write-Host "Downloading $assetName…"
49 | Invoke-WebRequest -Uri $asset.browser_download_url -OutFile $zipPath -UseBasicParsing
50 | if ($sumsAsset) {
51 | Invoke-WebRequest -Uri $sumsAsset.browser_download_url -OutFile $sumsPath -UseBasicParsing
52 | }
53 |
54 | if (Test-Path $sumsPath) {
55 | $hash = (Get-FileHash -Algorithm SHA256 $zipPath).Hash.ToLower()
56 | $sums = Get-Content $sumsPath
57 | if (-not ($sums -match $hash)) {
58 | if (-not $Insecure) { throw "Checksum verification failed" }
59 | Write-Warning "Checksum verification skipped (--Insecure)"
60 | }
61 | }
62 |
63 | Write-Host "Extracting…"
64 | Expand-Archive -Path $zipPath -DestinationPath $tmp -Force
65 |
66 | New-Item -ItemType Directory -Force -Path $BinDir | Out-Null
67 | Copy-Item -Path (Join-Path $tmp 'gpukill.exe') -Destination (Join-Path $BinDir 'gpukill.exe') -Force
68 |
69 | # Add to PATH for current session
70 | $env:PATH = "$BinDir;$env:PATH"
71 | Write-Host "✅ Installed to $BinDir"
72 | & (Join-Path $BinDir 'gpukill.exe') --version
73 |
74 |
--------------------------------------------------------------------------------
/.github/workflows/test-hotaisle-integration.yml:
--------------------------------------------------------------------------------
1 | name: Test Hot Aisle Integration
2 |
3 | on:
4 | push:
5 | branches: [main, develop]
6 | paths:
7 | - 'src/hotaisle_client.rs'
8 | - 'scripts/run-gpu-tests.sh'
9 | - 'scripts/test-hotaisle-integration-simple.sh'
10 | - '.github/workflows/test-hotaisle-integration.yml'
11 | - '.github/workflows/hotaisle-gpu-testing.yml'
12 | - 'docs/HOTAISLE_INTEGRATION.md'
13 | pull_request:
14 | branches: [main]
15 | paths:
16 | - 'src/hotaisle_client.rs'
17 | - 'scripts/run-gpu-tests.sh'
18 | - 'scripts/test-hotaisle-integration-simple.sh'
19 | - '.github/workflows/test-hotaisle-integration.yml'
20 | - '.github/workflows/hotaisle-gpu-testing.yml'
21 | - 'docs/HOTAISLE_INTEGRATION.md'
22 | workflow_dispatch:
23 |
24 | permissions:
25 | contents: read
26 |
27 | env:
28 | RUST_BACKTRACE: 1
29 | RUST_LOG: info
30 |
31 | jobs:
32 | test-integration:
33 | name: Test Hot Aisle Integration
34 | runs-on: ubuntu-latest
35 | timeout-minutes: 15
36 |
37 | steps:
38 | - name: Checkout code
39 | uses: actions/checkout@v4
40 |
41 | - name: Install Rust
42 | uses: dtolnay/rust-toolchain@stable
43 | with:
44 | components: rustfmt, clippy
45 |
46 | - name: Install system dependencies
47 | run: |
48 | sudo apt-get update
49 | sudo apt-get install -y build-essential libssl-dev pkg-config curl jq
50 |
51 | - name: Make test script executable
52 | run: chmod +x scripts/test-hotaisle-integration-simple.sh
53 |
54 | - name: Run Hot Aisle Integration Tests
55 | run: |
56 | echo "Running comprehensive Hot Aisle integration tests..."
57 | ./scripts/test-hotaisle-integration-simple.sh
58 |
59 | - name: Validate YAML Syntax
60 | run: |
61 | echo "Validating GitHub Actions workflow syntax..."
62 | python3 -c "
63 | import yaml
64 | import sys
65 | try:
66 | with open('.github/workflows/hotaisle-gpu-testing.yml', 'r') as f:
67 | yaml.safe_load(f)
68 | print('✅ YAML syntax is valid')
69 | except yaml.YAMLError as e:
70 | print(f'❌ YAML syntax error: {e}')
71 | sys.exit(1)
72 | except Exception as e:
73 | print(f'❌ Error reading YAML file: {e}')
74 | sys.exit(1)
75 | " || echo "⚠️ Python YAML validation skipped (module not available)"
76 |
77 | - name: Integration Test Summary
78 | run: |
79 | echo "========================================"
80 | echo "🎉 Hot Aisle Integration Test Summary"
81 | echo "========================================"
82 | echo "✅ All integration tests passed!"
83 | echo "✅ Hot Aisle integration is ready for use!"
84 | echo ""
85 | echo "To use Hot Aisle GPU testing:"
86 | echo "1. Set up HOTAISLE_API_KEY in GitHub Secrets"
87 | echo "2. Manually trigger the 'Hot Aisle GPU Testing' workflow"
88 | echo "3. Monitor results in the Actions tab"
89 |
--------------------------------------------------------------------------------
/.github/workflows/release.yml:
--------------------------------------------------------------------------------
1 | name: Release
2 |
3 | on:
4 | push:
5 | tags:
6 | - 'v*'
7 | workflow_dispatch:
8 | inputs:
9 | tag:
10 | description: 'Tag to release (e.g., v0.1.1)'
11 | required: true
12 | type: string
13 |
14 | permissions:
15 | contents: write
16 |
17 | env:
18 | TAG: ${{ github.ref_type == 'tag' && github.ref_name || inputs.tag }}
19 |
20 | jobs:
21 | build-linux-x86_64:
22 | runs-on: ubuntu-latest
23 | steps:
24 | - uses: actions/checkout@v4
25 | - uses: dtolnay/rust-toolchain@stable
26 | with:
27 | targets: x86_64-unknown-linux-gnu
28 | - name: Install system dependencies (NVML)
29 | run: |
30 | sudo apt-get update
31 | sudo apt-get install -y libnvidia-ml-dev pkg-config
32 | - name: Build
33 | run: cargo build --release --target x86_64-unknown-linux-gnu
34 | - name: Prepare artifact
35 | run: |
36 | mkdir -p dist
37 | cp target/x86_64-unknown-linux-gnu/release/gpukill dist/gpukill-${{ env.TAG }}-linux-x86_64
38 | - uses: actions/upload-artifact@v4
39 | with:
40 | name: linux-x86_64
41 | path: dist/gpukill-${{ env.TAG }}-linux-x86_64
42 |
43 | build-macos-arm64:
44 | runs-on: macos-14
45 | steps:
46 | - uses: actions/checkout@v4
47 | - uses: dtolnay/rust-toolchain@stable
48 | - name: Build
49 | run: cargo build --release
50 | - name: Prepare artifact
51 | run: |
52 | mkdir -p dist
53 | cp target/release/gpukill dist/gpukill-${{ env.TAG }}-macos-aarch64
54 | - uses: actions/upload-artifact@v4
55 | with:
56 | name: macos-aarch64
57 | path: dist/gpukill-${{ env.TAG }}-macos-aarch64
58 |
59 | build-windows-x86_64:
60 | runs-on: windows-latest
61 | steps:
62 | - uses: actions/checkout@v4
63 | - uses: dtolnay/rust-toolchain@stable
64 | - name: Build
65 | run: cargo build --release
66 | - name: Prepare zip
67 | shell: pwsh
68 | run: |
69 | New-Item -ItemType Directory -Force -Path dist | Out-Null
70 | Copy-Item target/release/gpukill.exe dist/gpukill.exe
71 | Compress-Archive -Path dist/gpukill.exe -DestinationPath dist/gpukill-${{ env.TAG }}-windows-x86_64.zip -Force
72 | - uses: actions/upload-artifact@v4
73 | with:
74 | name: windows-x86_64
75 | path: dist/gpukill-${{ env.TAG }}-windows-x86_64.zip
76 |
77 | release:
78 | runs-on: ubuntu-latest
79 | needs: [build-linux-x86_64, build-macos-arm64, build-windows-x86_64]
80 | steps:
81 | - uses: actions/checkout@v4
82 | - name: Download artifacts
83 | uses: actions/download-artifact@v4
84 | with:
85 | path: dist
86 | - name: Flatten artifacts and compute checksums
87 | run: |
88 | mkdir -p upload
89 | find dist -type f -maxdepth 2 -exec cp {} upload/ \;
90 | (cd upload && sha256sum * > SHA256SUMS) || (cd upload && shasum -a 256 * > SHA256SUMS)
91 | - name: Create GitHub Release
92 | uses: softprops/action-gh-release@v2
93 | with:
94 | tag_name: ${{ env.TAG }}
95 | files: |
96 | upload/*
97 | env:
98 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
99 |
--------------------------------------------------------------------------------
/.github/workflows/hotaisle-gpu-testing.yml:
--------------------------------------------------------------------------------
1 | name: Hot Aisle GPU Testing
2 |
3 | # This workflow only runs when manually triggered and when API key is configured
4 | on:
5 | workflow_dispatch:
6 | inputs:
7 | gpu_types:
8 | description: 'Comma-separated GPU types to test (nvidia,amd,intel,apple-silicon)'
9 | required: false
10 | default: 'nvidia,amd,intel'
11 | test_duration:
12 | description: 'Test duration in minutes'
13 | required: false
14 | default: '30'
15 |
16 | jobs:
17 | preflight:
18 | name: Preflight Checks
19 | runs-on: ubuntu-latest
20 | outputs:
21 | api_key_configured: ${{ steps.check_api_key.outputs.configured }}
22 | steps:
23 | - name: Check Hot Aisle API Key
24 | id: check_api_key
25 | run: |
26 | if [[ -n "${{ secrets.HOTAISLE_API_KEY }}" ]]; then
27 | echo "configured=true" >> $GITHUB_OUTPUT
28 | echo "✅ Hot Aisle API key is configured"
29 | else
30 | echo "configured=false" >> $GITHUB_OUTPUT
31 | echo "❌ Hot Aisle API key is not configured"
32 | echo "Please set HOTAISLE_API_KEY in repository secrets to use this workflow"
33 | exit 1
34 | fi
35 |
36 | gpu-testing:
37 | name: GPU Testing on Hot Aisle
38 | needs: preflight
39 | if: needs.preflight.outputs.api_key_configured == 'true'
40 | runs-on: ubuntu-latest
41 | strategy:
42 | matrix:
43 | gpu_type: [nvidia, amd, intel]
44 | steps:
45 | - name: Checkout code
46 | uses: actions/checkout@v4
47 |
48 | - name: Set up Rust
49 | uses: dtolnay/rust-toolchain@stable
50 | with:
51 | toolchain: stable
52 | components: rustfmt, clippy
53 |
54 | - name: Build GPU Kill with Hot Aisle support
55 | run: |
56 | cargo build --release --features hotaisle
57 | # Verify binary was created
58 | ls -la target/release/gpukill
59 |
60 | - name: Test Hot Aisle Integration
61 | run: |
62 | chmod +x scripts/test-hotaisle-integration-simple.sh
63 | ./scripts/test-hotaisle-integration-simple.sh
64 |
65 | - name: Provision GPU Instance
66 | id: provision
67 | run: |
68 | # This would use the Hot Aisle client to provision an instance
69 | echo "Provisioning ${{ matrix.gpu_type }} GPU instance..."
70 | # For now, we'll simulate this step
71 | echo "instance_id=test-instance-123" >> $GITHUB_OUTPUT
72 | echo "instance_ip=192.168.1.100" >> $GITHUB_OUTPUT
73 |
74 | - name: Deploy and Test on GPU Instance
75 | run: |
76 | echo "Deploying GPU Kill to instance ${{ steps.provision.outputs.instance_id }}"
77 | echo "Running GPU tests on ${{ matrix.gpu_type }} hardware..."
78 | # This would use the Hot Aisle client to deploy and run tests
79 | # For now, we'll simulate the test results
80 | echo "✅ GPU detection tests passed"
81 | echo "✅ GPU performance tests passed"
82 | echo "✅ GPU stress tests passed"
83 |
84 | - name: Cleanup GPU Instance
85 | if: always()
86 | run: |
87 | echo "Cleaning up instance ${{ steps.provision.outputs.instance_id }}"
88 | # This would use the Hot Aisle client to terminate the instance
89 |
--------------------------------------------------------------------------------
/dashboard/assets/css/main.css:
--------------------------------------------------------------------------------
1 | @tailwind base;
2 | @tailwind components;
3 | @tailwind utilities;
4 |
5 | /* Import aggressive overscroll fix */
6 | /* @import './overscroll-fix.css'; */
7 |
8 | /* Custom styles for GPU Kill Dashboard */
9 | @layer base {
10 | /* Force dark background on all elements */
11 | *, *::before, *::after {
12 | box-sizing: border-box;
13 | }
14 |
15 | html {
16 | font-family: 'Inter', system-ui, sans-serif;
17 | background: #000000 !important;
18 | background-color: #000000 !important;
19 | overscroll-behavior: none;
20 | -webkit-overflow-scrolling: touch;
21 | height: 100%;
22 | width: 100%;
23 | }
24 |
25 | body {
26 | background: #000000 !important;
27 | background-color: #000000 !important;
28 | overscroll-behavior: none;
29 | margin: 0;
30 | padding: 0;
31 | min-height: 100vh;
32 | height: 100%;
33 | width: 100%;
34 | overflow-x: hidden;
35 | position: relative;
36 | }
37 |
38 | /* Target the Nuxt root element */
39 | #__nuxt {
40 | background: #000000 !important;
41 | background-color: #000000 !important;
42 | min-height: 100vh;
43 | height: 100%;
44 | width: 100%;
45 | }
46 |
47 | /* Prevent any white backgrounds from showing through */
48 | div, main, section, article, header, footer, nav, aside {
49 | background-color: transparent !important;
50 | }
51 |
52 | /* Fix overscroll bounce on all platforms */
53 | html, body, #__nuxt {
54 | overscroll-behavior: none !important;
55 | overscroll-behavior-y: none !important;
56 | overscroll-behavior-x: none !important;
57 | }
58 |
59 | /* iOS specific fixes */
60 | @supports (-webkit-touch-callout: none) {
61 | html, body {
62 | position: fixed;
63 | height: 100%;
64 | width: 100%;
65 | overflow: hidden;
66 | }
67 |
68 | #__nuxt {
69 | position: fixed;
70 | top: 0;
71 | left: 0;
72 | right: 0;
73 | bottom: 0;
74 | overflow-y: auto;
75 | -webkit-overflow-scrolling: touch;
76 | }
77 | }
78 |
79 | /* Additional overscroll fixes */
80 | .overscroll-none {
81 | overscroll-behavior: none !important;
82 | }
83 |
84 | /* Prevent rubber band effect */
85 | .no-bounce {
86 | overscroll-behavior-y: none !important;
87 | -webkit-overflow-scrolling: touch;
88 | }
89 | }
90 |
91 | @layer components {
92 | .gpu-card {
93 | @apply bg-white dark:bg-gray-800 rounded-lg shadow-sm border border-gray-200 dark:border-gray-700 p-4 hover:shadow-md transition-shadow;
94 | }
95 |
96 | .status-online {
97 | @apply bg-green-100 text-green-800 dark:bg-green-900 dark:text-green-200;
98 | }
99 |
100 | .status-offline {
101 | @apply bg-red-100 text-red-800 dark:bg-red-900 dark:text-red-200;
102 | }
103 |
104 | .status-degraded {
105 | @apply bg-yellow-100 text-yellow-800 dark:bg-yellow-900 dark:text-yellow-200;
106 | }
107 |
108 | .metric-card {
109 | @apply bg-gradient-to-br from-blue-50 to-indigo-100 dark:from-gray-800 dark:to-gray-700 rounded-lg p-4 border border-blue-200 dark:border-gray-600;
110 | }
111 |
112 | .utilization-bar {
113 | @apply w-full bg-gray-200 dark:bg-gray-700 rounded-full h-2.5;
114 | }
115 |
116 | .utilization-fill {
117 | @apply h-2.5 rounded-full transition-all duration-300;
118 | }
119 |
120 | .utilization-low {
121 | @apply bg-green-500;
122 | }
123 |
124 | .utilization-medium {
125 | @apply bg-yellow-500;
126 | }
127 |
128 | .utilization-high {
129 | @apply bg-red-500;
130 | }
131 | }
132 |
--------------------------------------------------------------------------------
/.github/workflows/gpu-testing.yml:
--------------------------------------------------------------------------------
1 | name: GPU Hardware Testing
2 |
3 | on:
4 | push:
5 | branches: [main, develop]
6 | pull_request:
7 | branches: [main]
8 | workflow_dispatch:
9 | inputs:
10 | gpu_vendor:
11 | description: 'GPU vendor to test'
12 | required: true
13 | default: 'all'
14 | type: choice
15 | options:
16 | - all
17 | - nvidia
18 | - amd
19 | - intel
20 | - apple
21 |
22 | permissions:
23 | contents: read
24 |
25 | env:
26 | RUST_BACKTRACE: 1
27 | RUST_LOG: info
28 |
29 | jobs:
30 | # Cross-platform compatibility tests
31 | cross-platform-tests:
32 | name: Cross-Platform Tests
33 | runs-on: ${{ matrix.os }}
34 | strategy:
35 | matrix:
36 | os: [ubuntu-22.04, macos-13, windows-2022]
37 | include:
38 | - os: ubuntu-22.04
39 | install_deps: |
40 | sudo apt-get update
41 | sudo apt-get install -y build-essential libssl-dev pkg-config
42 | - os: macos-13
43 | install_deps: |
44 | xcode-select --install || true
45 | - os: windows-2022
46 | install_deps: |
47 | # Windows dependencies handled by vcpkg
48 |
49 | steps:
50 | - name: Checkout code
51 | uses: actions/checkout@v4
52 |
53 | - name: Install Rust
54 | uses: dtolnay/rust-toolchain@stable
55 | with:
56 | components: rustfmt, clippy
57 |
58 | - name: Install system dependencies
59 | run: ${{ matrix.install_deps }}
60 |
61 | - name: Build and test
62 | run: |
63 | cargo build --release
64 | cargo test --features mock_nvml
65 | cargo clippy --all-targets --all-features -- -D warnings
66 | cargo fmt --all -- --check
67 |
68 | # Security and compliance tests
69 | security-tests:
70 | name: Security Tests
71 | runs-on: ubuntu-22.04
72 |
73 | steps:
74 | - name: Checkout code
75 | uses: actions/checkout@v4
76 |
77 | - name: Install Rust
78 | uses: dtolnay/rust-toolchain@stable
79 |
80 | - name: Install security tools
81 | run: |
82 | sudo apt-get update
83 | sudo apt-get install -y build-essential libssl-dev pkg-config
84 | cargo install cargo-audit
85 | cargo install cargo-deny
86 |
87 | - name: Security audit
88 | run: |
89 | cargo audit
90 | cargo deny check
91 |
92 | - name: Build with security flags
93 | run: |
94 | RUSTFLAGS="-C target-cpu=native" cargo build --release
95 | strip target/release/gpukill
96 |
97 | # Documentation and API tests
98 | api-tests:
99 | name: API Tests
100 | runs-on: ubuntu-22.04
101 |
102 | steps:
103 | - name: Checkout code
104 | uses: actions/checkout@v4
105 |
106 | - name: Install Rust
107 | uses: dtolnay/rust-toolchain@stable
108 |
109 | - name: Install dependencies
110 | run: |
111 | sudo apt-get update
112 | sudo apt-get install -y build-essential libssl-dev pkg-config
113 |
114 | - name: Test MCP server
115 | run: |
116 | cargo build --release -p gpukill-mcp
117 | # Test MCP server startup
118 | timeout 10 ./target/release/gpukill-mcp || true
119 |
120 | - name: Test HTTP server
121 | run: |
122 | cargo build --release
123 | # Test HTTP server startup
124 | timeout 10 ./target/release/gpukill --server --server-port 8080 || true
--------------------------------------------------------------------------------
/dashboard/README.md:
--------------------------------------------------------------------------------
1 | # GPU Kill Dashboard
2 |
3 | A modern, responsive dashboard for monitoring GPU clusters built with Nuxt.js and Tailwind CSS.
4 |
5 | ## Features
6 |
7 | - **Real-time Cluster Monitoring**: Live updates via WebSocket
8 | - **Magic Moment**: Instant visibility into GPU contention and blocked resources
9 | - **Rogue Detection**: Security monitoring with threat detection and risk scoring
10 | - **Guard Mode Management**: Policy enforcement with user, group, and GPU policies
11 | - **Auto-refresh**: Automatic data updates with manual refresh controls
12 | - **Data Persistence**: Policy data saved locally across page refreshes
13 | - **Interactive Controls**: Toggle switches for enforcement modes
14 | - **Policy Management**: Complete CRUD operations for User, Group, and GPU policies
15 | - **Policy Testing**: Built-in policy simulation and testing interface
16 |
17 | ## Quick Start
18 |
19 | 1. **Start the GPU Kill Coordinator Server**:
20 | ```bash
21 | cd /path/to/gpu-kill
22 | ./target/release/gpukill --server --server-port 8080
23 | ```
24 |
25 | 2. **Start the Dashboard**:
26 | ```bash
27 | cd dashboard
28 | npm install # First time only
29 | npm run dev
30 | ```
31 |
32 | 3. **Open your browser**:
33 | - Dashboard: http://localhost:3000
34 | - API: http://localhost:8080
35 |
36 | ## Dashboard Views
37 |
38 | ### Overview Page
39 | - **Cluster Statistics**: Total nodes, GPUs, memory, and average utilization
40 | - **Real-time Metrics**: Live indicators with auto-refresh
41 | - **Magic Moment**: GPU contention analysis with blocked resources
42 | - **Top Users**: Ranked list of users by GPU memory consumption
43 | - **Node Details**: Individual node status and health information
44 |
45 | ### Detection Page
46 | - **Threat Detection**: Real-time security monitoring
47 | - **Risk Scoring**: Confidence-based threat assessment
48 | - **Crypto Miner Detection**: Identifies mining software and patterns
49 | - **Suspicious Processes**: Flags unusual process behavior
50 | - **Resource Abuse Monitoring**: Detects excessive memory usage
51 | - **Interactive Scanning**: Manual scan controls with loading states
52 |
53 | ### Guard Page
54 | - **Policy Management**: User, Group, and GPU policy configuration
55 | - **Enforcement Controls**: Soft/hard enforcement toggle switches
56 | - **Policy Statistics**: Modern gradient cards showing policy counts
57 | - **Visual Tables**: Clean display of all policies with action buttons
58 | - **Modal Forms**: Intuitive policy creation with validation
59 | - **Policy Testing**: Built-in simulation and testing interface
60 | - **Data Persistence**: Policy data saved locally across refreshes
61 |
62 | ## Configuration
63 |
64 | The dashboard automatically connects to the GPU Kill coordinator API. You can configure the API endpoint:
65 |
66 | ```bash
67 | # Set custom API base URL
68 | export API_BASE=http://your-server:8080
69 | npm run dev
70 | ```
71 |
72 | ## Development
73 |
74 | ```bash
75 | # Install dependencies
76 | npm install
77 |
78 | # Start development server
79 | npm run dev
80 |
81 | # Build for production
82 | npm run build
83 |
84 | # Preview production build
85 | npm run preview
86 | ```
87 |
88 | ## API Integration
89 |
90 | The dashboard connects to the GPU Kill coordinator API endpoints:
91 |
92 | - `GET /api/cluster/snapshot` - Cluster overview data
93 | - `GET /api/cluster/contention` - Magic Moment analysis
94 | - `GET /api/cluster/rogue` - Rogue detection results
95 | - `GET /api/guard/config` - Guard Mode configuration
96 | - `GET /api/guard/status` - Guard Mode status
97 | - `POST /api/guard/toggle-dry-run` - Toggle dry-run mode
98 | - `POST /api/guard/test-policies` - Test policy enforcement
99 | - `WS /ws` - WebSocket for real-time updates
--------------------------------------------------------------------------------
/scripts/test-hotaisle-integration-simple.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # GPU Kill - Hot Aisle Integration Test Script (CI-friendly version)
4 | # This script tests the Hot Aisle integration without requiring actual API access
5 |
6 | # Colors for output
7 | RED='\033[0;31m'
8 | GREEN='\033[0;32m'
9 | YELLOW='\033[1;33m'
10 | BLUE='\033[0;34m'
11 | NC='\033[0m' # No Color
12 |
# Logging functions: each prints a colored "[TAG] message" line using the
# ANSI color variables defined above.
_log_tagged() {
    # $1 = color escape, $2 = tag, $3 = message
    echo -e "${1}[${2}]${NC} ${3}"
}

log_info() {
    _log_tagged "$BLUE" "INFO" "$1"
}

log_success() {
    _log_tagged "$GREEN" "SUCCESS" "$1"
}

log_warning() {
    _log_tagged "$YELLOW" "WARNING" "$1"
}

log_error() {
    _log_tagged "$RED" "ERROR" "$1"
}
33 |
# Execute one named test case.
#   $1 = display name, $2 = shell command string (run via eval)
# Updates TESTS_PASSED / TESTS_FAILED and logs the outcome.
run_test() {
    local name="$1"
    local cmd="$2"

    log_info "Running test: $name"
    log_info "Command: $cmd"

    if ! eval "$cmd"; then
        log_error "❌ $name failed"
        log_error "Command that failed: $cmd"
        TESTS_FAILED=$((TESTS_FAILED + 1))
    else
        log_success "✅ $name passed"
        TESTS_PASSED=$((TESTS_PASSED + 1))
    fi
    echo
}
52 |
# Main test function
#
# Runs a sequence of static checks (builds, feature flags, required files)
# that validate the Hot Aisle integration without contacting the Hot Aisle
# API. Exits 1 if any test failed, 0 otherwise.
main() {
    log_info "Starting Hot Aisle Integration Tests (Simple Version)"
    echo "========================================"

    # Debug information
    log_info "Environment:"
    log_info " CI: ${CI:-false}"
    log_info " PWD: $(pwd)"
    log_info " USER: ${USER:-unknown}"
    echo

    # Test 1: Check if we're in the right directory
    run_test "Project Root Check" '[[ -f "Cargo.toml" ]]'

    # Test 2: Check if Rust is available
    run_test "Rust Toolchain Check" 'command -v cargo > /dev/null 2>&1'

    # Test 3: Check if git is available
    run_test "Git Check" 'command -v git > /dev/null 2>&1'

    # Test 4: Build without Hot Aisle feature
    run_test "Build Without Hot Aisle Feature" 'cargo build --release'

    # Test 5: Build with Hot Aisle feature
    run_test "Build With Hot Aisle Feature" 'cargo build --release --features hotaisle'

    # Test 6: Check if Hot Aisle client compiles
    run_test "Hot Aisle Client Compilation" 'cargo check --features hotaisle'

    # Test 7: Validate test script syntax
    run_test "Test Script Syntax Check" 'bash -n scripts/run-gpu-tests.sh'

    # Test 8: Check if workflow file exists
    run_test "Workflow File Exists" '[[ -f ".github/workflows/hotaisle-gpu-testing.yml" ]]'

    # Test 9: Check if documentation exists
    run_test "Documentation Exists" '[[ -f "docs/HOTAISLE_INTEGRATION.md" ]]'

    # Test 10: Validate Cargo.toml has hotaisle feature
    run_test "Hot Aisle Feature in Cargo.toml" 'grep -q "hotaisle = \\[\\]" Cargo.toml'

    # Test 11: Check if lib.rs has conditional compilation
    run_test "Conditional Compilation in lib.rs" 'grep -q "#\\[cfg(feature = \"hotaisle\")\\]" src/lib.rs'

    # Summary
    echo "========================================"
    log_info "Test Summary:"
    log_success "✅ Tests Passed: $TESTS_PASSED"
    if [[ $TESTS_FAILED -gt 0 ]]; then
        log_error "❌ Tests Failed: $TESTS_FAILED"
        exit 1
    else
        # NOTE(review): this prints "✅ Tests Failed: 0" — reads oddly but
        # is informational only; consider rewording.
        log_success "✅ Tests Failed: $TESTS_FAILED"
    fi

    log_success "🎉 All integration tests passed!"
    log_info "The Hot Aisle integration is ready for use with a valid API key."
    exit 0
}
113 |
114 | # Run main function
115 | main "$@"
116 |
--------------------------------------------------------------------------------
/scripts/install.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env sh
2 | set -e
3 |
4 | # gpukill install script: fetch prebuilt GitHub release binary
5 |
6 | REPO_OWNER="treadiehq"
7 | REPO_NAME="gpu-kill"
8 | INSTALL_DIR_DEFAULT="$HOME/.local/bin"
9 | BIN_NAME="gpukill"
10 |
11 | # Flags
12 | VERSION=""
13 | BIN_DIR=""
14 | YES="0"
15 | INSECURE="0"
16 |
# Print the usage synopsis to stderr.
usage() {
    echo "Usage: curl -fsSL https://get.gpukill.sh | sh [-s] -- [--version vX.Y.Z] [--bin-dir DIR] [--yes] [--insecure]" >&2
}
20 |
# Parse command-line flags (these may arrive via `sh -s -- …` when piped).
while [ $# -gt 0 ]; do
    case "$1" in
        --version) VERSION="$2"; shift 2 ;;   # pin a specific release tag
        --bin-dir) BIN_DIR="$2"; shift 2 ;;   # install destination directory
        --yes|-y) YES="1"; shift ;;           # non-interactive mode flag
        --insecure) INSECURE="1"; shift ;;    # tolerate checksum mismatch
        -h|--help) usage; exit 0 ;;
        *) echo "Unknown option: $1" >&2; usage; exit 1 ;;
    esac
done
31 |
# Map `uname -s` to a release-asset OS name: linux, macos, or unsupported.
detect_os() {
    case "$(uname -s 2>/dev/null || echo unknown)" in
        Linux) echo linux ;;
        Darwin) echo macos ;;
        *) echo unsupported ;;
    esac
}
40 |
# Map `uname -m` to a release-asset architecture: x86_64, aarch64, or
# unsupported.
detect_arch() {
    case "$(uname -m 2>/dev/null || echo unknown)" in
        x86_64|amd64) echo x86_64 ;;
        aarch64|arm64) echo aarch64 ;;
        *) echo unsupported ;;
    esac
}
49 |
# Abort the whole script if the given command is not on PATH.
need_cmd() {
    if ! command -v "$1" >/dev/null 2>&1; then
        echo "Missing required command: $1" >&2
        exit 1
    fi
}
51 |
# --- Preflight ---------------------------------------------------------
need_cmd curl
need_cmd uname
need_cmd mkdir
need_cmd chmod

OS=$(detect_os)
ARCH=$(detect_arch)
if [ "$OS" = "unsupported" ] || [ "$ARCH" = "unsupported" ]; then
    echo "Unsupported platform: OS=$OS ARCH=$ARCH" >&2
    exit 1
fi

BIN_DIR=${BIN_DIR:-$INSTALL_DIR_DEFAULT}
mkdir -p "$BIN_DIR"

# --- Resolve release tag (latest unless --version was given) -----------
API="https://api.github.com/repos/$REPO_OWNER/$REPO_NAME/releases/latest"
if [ -n "$VERSION" ]; then
    API="https://api.github.com/repos/$REPO_OWNER/$REPO_NAME/releases/tags/$VERSION"
fi

echo "Resolving release…"
TAG=$(curl -fsSL "$API" | sed -n 's/ \"tag_name\": \"\(.*\)\",/\1/p' | head -n1)
if [ -z "$TAG" ]; then
    echo "Failed to resolve release tag" >&2
    exit 1
fi

case "$OS-$ARCH" in
    linux-x86_64) ASSET="gpukill-$TAG-linux-x86_64" ;;
    linux-aarch64) ASSET="gpukill-$TAG-linux-aarch64" ;;
    macos-x86_64) ASSET="gpukill-$TAG-macos-x86_64" ;;
    macos-aarch64) ASSET="gpukill-$TAG-macos-aarch64" ;;
    *)
        # Defensive: OS/ARCH were validated above so this should be
        # unreachable, but fail loudly rather than download an empty
        # asset name.
        echo "No prebuilt asset for $OS-$ARCH" >&2
        exit 1
        ;;
esac

URL_BASE="https://github.com/$REPO_OWNER/$REPO_NAME/releases/download/$TAG"
BIN_URL="$URL_BASE/$ASSET"
SUMS_URL="$URL_BASE/SHA256SUMS"

TMPDIR=${TMPDIR:-/tmp}
TMP_BIN="$TMPDIR/$ASSET"
TMP_SUMS="$TMPDIR/${REPO_NAME}_SHA256SUMS"

echo "Downloading binary: $BIN_URL"
curl -fsSL "$BIN_URL" -o "$TMP_BIN"

# Checksums are optional in the release; verification is skipped (with a
# warning) when the file is absent.
echo "Downloading checksums: $SUMS_URL"
curl -fsSL "$SUMS_URL" -o "$TMP_SUMS" || true

if [ -s "$TMP_SUMS" ]; then
    # BUGFIX: was `need_cmd shasum || need_cmd sha256sum`, but need_cmd
    # exits the script when a command is missing, so the sha256sum
    # fallback was unreachable and Linux hosts without shasum aborted.
    if command -v shasum >/dev/null 2>&1; then
        SUM=$(shasum -a 256 "$TMP_BIN" | awk '{print $1}')
    elif command -v sha256sum >/dev/null 2>&1; then
        SUM=$(sha256sum "$TMP_BIN" | awk '{print $1}')
    else
        echo "Missing required command: shasum or sha256sum" >&2
        exit 1
    fi
    if ! grep -q "$SUM" "$TMP_SUMS"; then
        if [ "$INSECURE" != "1" ]; then
            echo "Checksum verification failed" >&2
            exit 1
        else
            echo "WARNING: checksum verification skipped (--insecure)" >&2
        fi
    fi
else
    echo "WARNING: no checksum file found in release; proceeding" >&2
fi

DEST="$BIN_DIR/$BIN_NAME"
mv "$TMP_BIN" "$DEST"
chmod +x "$DEST"

# Warn when the chosen bin dir is not on PATH so the user knows why the
# command may not resolve in their shell.
if ! printf %s ":$PATH:" | grep -q ":$BIN_DIR:"; then
    echo "Installed to $DEST but $BIN_DIR is not in PATH" >&2
    echo "Add this to your shell rc: export PATH=\"$BIN_DIR:\$PATH\"" >&2
fi

echo "✅ Installed $BIN_NAME $TAG to $DEST"
"$DEST" --version || true
131 |
132 |
--------------------------------------------------------------------------------
/dashboard/components/AppSidebar.vue:
--------------------------------------------------------------------------------
1 |
2 |
69 |
70 |
71 |
116 |
--------------------------------------------------------------------------------
/mcp/README.md:
--------------------------------------------------------------------------------
1 | # GPU Kill MCP Server
2 |
An MCP server for GPU Kill, enabling AI assistants and other tools to interact with GPU management functionality through a standardized interface.
4 |
5 | ## Features
6 |
7 | ### Resources (Read-only data)
8 | - **gpu://list** - Current GPU status and utilization
9 | - **gpu://processes** - Currently running GPU processes
10 | - **gpu://audit** - Historical GPU usage data
11 | - **gpu://policies** - Current Guard Mode policies
12 | - **gpu://rogue-detection** - Security scan results and threats
13 |
14 | ### Tools (Actions)
15 | - **kill_gpu_process** - Kill a GPU process by PID
16 | - **reset_gpu** - Reset a GPU by ID
17 | - **scan_rogue_activity** - Scan for suspicious GPU activity
18 | - **create_user_policy** - Create a user policy for Guard Mode
19 | - **get_gpu_status** - Get detailed status of a specific GPU
20 | - **kill_processes_by_name** - Kill all processes matching a name pattern
21 |
22 | ## Quick Start
23 |
24 | ### Build and Run
25 |
26 | ```bash
27 | # Build the MCP server
28 | cargo build --release -p gpukill-mcp
29 |
30 | # Run the MCP server
31 | cargo run --release -p gpukill-mcp
32 |
33 | # Or run with custom port
34 | MCP_PORT=3001 cargo run --release -p gpukill-mcp
35 | ```
36 |
37 | ### Using with AI Assistants
38 |
39 | The MCP server exposes GPU management capabilities through a JSON-RPC interface that AI assistants can use to:
40 |
41 | - Monitor GPU usage and performance
42 | - Kill stuck or problematic processes
43 | - Reset crashed GPUs
44 | - Scan for security threats
45 | - Manage resource policies
46 | - Automate GPU operations
47 |
48 | ### Example Usage
49 |
50 | ```bash
51 | # Start the MCP server
52 | cargo run --release -p gpukill-mcp
53 |
54 | # The server will be available at http://localhost:3001/mcp
55 | # AI assistants can connect and use the available tools and resources
56 | ```
57 |
58 | ### Natural Language Examples
59 |
60 | Ask your AI assistant to use the MCP tools with natural language:
61 |
62 | ```text
63 | What GPUs do I have and what's their current usage?
64 | ```
65 |
66 | ```text
67 | Kill the Python process that's stuck on GPU 0
68 | ```
69 |
70 | ```text
71 | Kill all training processes that are using too much GPU memory
72 | ```
73 |
74 | ```text
75 | Show me GPU usage and kill any stuck processes
76 | ```
77 |
78 | ```text
79 | Scan for crypto miners and suspicious activity
80 | ```
81 |
82 | ```text
83 | Create a policy to limit user memory usage to 8GB
84 | ```
85 |
86 | ```text
87 | Reset GPU 1 because it's not responding
88 | ```
89 |
90 | ```text
91 | What processes are currently using my GPUs?
92 | ```
93 |
94 | ## API Endpoints
95 |
96 | ### HTTP Interface
97 |
98 | - **POST /mcp** - Main MCP JSON-RPC endpoint
99 | - **GET /health** - Health check endpoint
100 |
101 | ### MCP Methods
102 |
103 | - **initialize** - Initialize the MCP connection
104 | - **resources/list** - List available resources
105 | - **resources/read** - Read resource contents
106 | - **tools/list** - List available tools
107 | - **tools/call** - Execute a tool
108 |
109 | ## Configuration
110 |
111 | The MCP server can be configured using environment variables:
112 |
113 | - **MCP_PORT** - Port to listen on (default: 3001)
114 | - **RUST_LOG** - Logging level (default: info)
115 |
116 | ## Integration
117 |
118 | This MCP server enables AI assistants to:
119 |
120 | 1. **Monitor GPU Health**: Check GPU status, utilization, and memory usage
121 | 2. **Manage Processes**: Kill problematic processes or reset GPUs
122 | 3. **Security Monitoring**: Scan for crypto miners and suspicious activity
123 | 4. **Policy Management**: Create and manage resource policies
124 | 5. **Automation**: Automate routine GPU management tasks
125 |
126 | ## Development
127 |
128 | ```bash
129 | # Run in development mode
130 | cargo run -p gpukill-mcp
131 |
132 | # Run with debug logging
133 | RUST_LOG=debug cargo run -p gpukill-mcp
134 |
135 | # Test the server
136 | curl -X POST http://localhost:3001/mcp \
137 | -H "Content-Type: application/json" \
138 | -d '{"jsonrpc":"2.0","id":"1","method":"tools/list","params":{}}'
139 | ```
140 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | # Functional Source License, Version 1.1, MIT Future License
2 |
3 | ## Abbreviation
4 |
5 | FSL-1.1-MIT
6 |
7 | ## Notice
8 |
9 | Copyright (c) 2024 Treadie, Inc
10 |
11 | ## Terms and Conditions
12 |
13 | ### Licensor ("We")
14 |
15 | The party offering the Software under these Terms and Conditions.
16 |
17 | ### The Software
18 |
19 | The "Software" is each version of the software that we make available under
20 | these Terms and Conditions, as indicated by our inclusion of these Terms and
21 | Conditions with the Software.
22 |
23 | ### License Grant
24 |
25 | Subject to your compliance with this License Grant and the Patents,
26 | Redistribution and Trademark clauses below, we hereby grant you the right to
27 | use, copy, modify, create derivative works, publicly perform, publicly display
28 | and redistribute the Software for any Permitted Purpose identified below.
29 |
30 | ### Permitted Purpose
31 |
32 | A Permitted Purpose is any purpose other than a Competing Use. A Competing Use
33 | means making the Software available to others in a commercial product or
34 | service that:
35 |
36 | 1. substitutes for the Software;
37 |
38 | 2. substitutes for any other product or service we offer using the Software
39 | that exists as of the date we make the Software available; or
40 |
41 | 3. offers the same or substantially similar functionality as the Software.
42 |
43 | Permitted Purposes specifically include using the Software:
44 |
45 | 1. for your internal use and access;
46 |
47 | 2. for non-commercial education;
48 |
49 | 3. for non-commercial research; and
50 |
51 | 4. in connection with professional services that you provide to a licensee
52 | using the Software in accordance with these Terms and Conditions.
53 |
54 | ### Patents
55 |
56 | To the extent your use for a Permitted Purpose would necessarily infringe our
57 | patents, the license grant above includes a license under our patents. If you
58 | make a claim against any party that the Software infringes or contributes to
59 | the infringement of any patent, then your patent license to the Software ends
60 | immediately.
61 |
62 | ### Redistribution
63 |
64 | The Terms and Conditions apply to all copies, modifications and derivatives of
65 | the Software.
66 |
67 | If you redistribute any copies, modifications or derivatives of the Software,
68 | you must include a copy of or a link to these Terms and Conditions and not
69 | remove any copyright notices provided in or with the Software.
70 |
71 | ### Disclaimer
72 |
73 | THE SOFTWARE IS PROVIDED "AS IS" AND WITHOUT WARRANTIES OF ANY KIND, EXPRESS OR
74 | IMPLIED, INCLUDING WITHOUT LIMITATION WARRANTIES OF FITNESS FOR A PARTICULAR
75 | PURPOSE, MERCHANTABILITY, TITLE OR NON-INFRINGEMENT.
76 |
77 | IN NO EVENT WILL WE HAVE ANY LIABILITY TO YOU ARISING OUT OF OR RELATED TO THE
78 | SOFTWARE, INCLUDING INDIRECT, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES,
79 | EVEN IF WE HAVE BEEN INFORMED OF THEIR POSSIBILITY IN ADVANCE.
80 |
81 | ### Trademarks
82 |
83 | Except for displaying the License Details and identifying us as the origin of
84 | the Software, you have no right under these Terms and Conditions to use our
85 | trademarks, trade names, service marks or product names.
86 |
87 | ## Grant of Future License
88 |
89 | We hereby irrevocably grant you an additional license to use the Software under
90 | the MIT license that is effective on the second anniversary of the date we make
91 | the Software available. On or after that date, you may use the Software under
92 | the MIT license, in which case the following will apply:
93 |
94 | Permission is hereby granted, free of charge, to any person obtaining a copy of
95 | this software and associated documentation files (the "Software"), to deal in
96 | the Software without restriction, including without limitation the rights to
97 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
98 | of the Software, and to permit persons to whom the Software is furnished to do
99 | so, subject to the following conditions:
100 |
101 | The above copyright notice and this permission notice shall be included in all
102 | copies or substantial portions of the Software.
103 |
104 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
105 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
106 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
107 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
108 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
109 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
110 | SOFTWARE.
--------------------------------------------------------------------------------
/src/util.rs:
--------------------------------------------------------------------------------
1 | use chrono::{DateTime, Local, Utc};
2 | use std::time::{Duration, SystemTime};
3 |
4 | /// Get the current hostname
5 | pub fn get_hostname() -> String {
6 | hostname::get()
7 | .unwrap_or_else(|_| std::ffi::OsString::from("unknown"))
8 | .to_string_lossy()
9 | .to_string()
10 | }
11 |
12 | /// Format a timestamp as a human-readable string
13 | #[allow(dead_code)]
14 | pub fn format_timestamp(timestamp: SystemTime) -> String {
15 | let datetime: DateTime = timestamp.into();
16 | datetime.format("%Y-%m-%d %H:%M:%S").to_string()
17 | }
18 |
19 | /// Format a timestamp as ISO 8601 string
20 | pub fn format_timestamp_iso(timestamp: SystemTime) -> String {
21 | let datetime: DateTime = timestamp.into();
22 | datetime.format("%Y-%m-%dT%H:%M:%S%.3fZ").to_string()
23 | }
24 |
25 | /// Get current timestamp as ISO 8601 string
26 | pub fn get_current_timestamp_iso() -> String {
27 | format_timestamp_iso(SystemTime::now())
28 | }
29 |
/// Render a duration as "Xh Ym Zs", omitting leading zero-valued units
/// (e.g. 90s -> "1m 30s", 30s -> "30s").
#[allow(dead_code)]
pub fn format_duration(duration: Duration) -> String {
    let secs = duration.as_secs();
    let (hours, minutes, seconds) = (secs / 3600, (secs % 3600) / 60, secs % 60);

    match (hours, minutes) {
        (0, 0) => format!("{}s", seconds),
        (0, _) => format!("{}m {}s", minutes, seconds),
        _ => format!("{}h {}m {}s", hours, minutes, seconds),
    }
}
46 |
/// Human-readable byte count using 1024-based units (B through TB).
/// Values under 1 KB are printed as whole bytes; larger values get one
/// decimal place.
#[allow(dead_code)]
pub fn format_memory_size(bytes: u64) -> String {
    const UNITS: &[&str] = &["B", "KB", "MB", "GB", "TB"];

    if bytes == 0 {
        return "0 B".to_string();
    }

    let mut value = bytes as f64;
    let mut idx = 0usize;
    while value >= 1024.0 && idx + 1 < UNITS.len() {
        value /= 1024.0;
        idx += 1;
    }

    match idx {
        0 => format!("{} {}", bytes, UNITS[0]),
        _ => format!("{:.1} {}", value, UNITS[idx]),
    }
}
71 |
/// Convert a mebibyte count to GiB, rendered with one decimal place.
pub fn format_memory_mb_to_gib(mb: u32) -> String {
    format!("{:.1}", f64::from(mb) / 1024.0)
}
77 |
/// Check if running on Linux
#[allow(dead_code)]
pub fn is_linux() -> bool {
    std::env::consts::OS == "linux"
}
83 |
/// Check if running on macOS
#[allow(dead_code)]
pub fn is_macos() -> bool {
    std::env::consts::OS == "macos"
}
89 |
/// Check if running on Windows
#[allow(dead_code)]
pub fn is_windows() -> bool {
    std::env::consts::OS == "windows"
}
95 |
/// Display name of the operating system this binary was built for:
/// "Linux", "macOS", "Windows", or "Unknown".
#[allow(dead_code)]
pub fn get_os_name() -> &'static str {
    match std::env::consts::OS {
        "linux" => "Linux",
        "macos" => "macOS",
        "windows" => "Windows",
        _ => "Unknown",
    }
}
109 |
/// Truncate `s` to at most `max_len` bytes, appending "..." when cut.
///
/// Strings that already fit are returned unchanged. The cut point is
/// backed up to a UTF-8 character boundary so multi-byte text cannot
/// cause a panic (the previous byte-index slice `&s[..max_len - 3]`
/// panicked when the index landed inside a multi-byte character).
pub fn truncate_string(s: &str, max_len: usize) -> String {
    if s.len() <= max_len {
        return s.to_string();
    }
    // Reserve room for the ellipsis, then walk back to a char boundary.
    let mut cut = max_len.saturating_sub(3);
    while cut > 0 && !s.is_char_boundary(cut) {
        cut -= 1;
    }
    format!("{}...", &s[..cut])
}
118 |
119 | /// Parse process start time from system time
120 | #[allow(dead_code)]
121 | pub fn parse_process_start_time(start_time: SystemTime) -> String {
122 | let now = SystemTime::now();
123 | let duration = now.duration_since(start_time).unwrap_or_default();
124 | format_duration(duration)
125 | }
126 |
#[cfg(test)]
mod tests {
    use super::*;
    use std::time::Duration;

    // Covers only the pure, platform-independent formatting helpers;
    // hostname and wall-clock formatting are not unit-tested here.

    #[test]
    fn test_format_duration() {
        assert_eq!(format_duration(Duration::from_secs(30)), "30s");
        assert_eq!(format_duration(Duration::from_secs(90)), "1m 30s");
        assert_eq!(format_duration(Duration::from_secs(3661)), "1h 1m 1s");
    }

    #[test]
    fn test_format_memory_size() {
        assert_eq!(format_memory_size(0), "0 B");
        assert_eq!(format_memory_size(1024), "1.0 KB");
        assert_eq!(format_memory_size(1024 * 1024), "1.0 MB");
        assert_eq!(format_memory_size(1024 * 1024 * 1024), "1.0 GB");
    }

    #[test]
    fn test_format_memory_mb_to_gib() {
        assert_eq!(format_memory_mb_to_gib(0), "0.0");
        assert_eq!(format_memory_mb_to_gib(1024), "1.0");
        assert_eq!(format_memory_mb_to_gib(2048), "2.0");
    }

    #[test]
    fn test_truncate_string() {
        assert_eq!(truncate_string("short", 10), "short");
        assert_eq!(truncate_string("very long string", 10), "very lo...");
        assert_eq!(truncate_string("abc", 3), "abc");
    }

    #[test]
    fn test_os_detection() {
        // These tests will pass on the respective platforms
        assert!(get_os_name() != "Unknown");
    }
}
167 |
--------------------------------------------------------------------------------
/mcp/src/types.rs:
--------------------------------------------------------------------------------
1 | //! MCP Protocol Types for GPU Kill
2 |
3 | use serde::{Deserialize, Serialize};
4 | use std::collections::HashMap;
5 |
6 | /// MCP Request/Response types
7 | #[derive(Debug, Serialize, Deserialize)]
8 | #[serde(tag = "jsonrpc", rename = "2.0")]
9 | pub struct JsonRpcRequest {
10 | pub id: String,
11 | pub method: String,
12 | pub params: Option,
13 | }
14 |
15 | #[derive(Debug, Serialize, Deserialize)]
16 | pub struct JsonRpcResponse {
17 | pub jsonrpc: String,
18 | pub id: String,
19 | #[serde(skip_serializing_if = "Option::is_none")]
20 | pub result: Option,
21 | #[serde(skip_serializing_if = "Option::is_none")]
22 | pub error: Option,
23 | }
24 |
25 | #[derive(Debug, Serialize, Deserialize)]
26 | pub struct JsonRpcError {
27 | pub code: i32,
28 | pub message: String,
29 | #[serde(skip_serializing_if = "Option::is_none")]
30 | pub data: Option,
31 | }
32 |
33 | /// MCP Protocol Messages
34 | #[derive(Debug, Serialize, Deserialize)]
35 | pub struct InitializeRequest {
36 | pub protocol_version: String,
37 | pub capabilities: ClientCapabilities,
38 | pub client_info: ClientInfo,
39 | }
40 |
41 | #[derive(Debug, Serialize, Deserialize)]
42 | pub struct InitializeResponse {
43 | pub protocol_version: String,
44 | pub capabilities: ServerCapabilities,
45 | pub server_info: ServerInfo,
46 | }
47 |
48 | #[derive(Debug, Serialize, Deserialize)]
49 | pub struct ClientCapabilities {
50 | #[serde(skip_serializing_if = "Option::is_none")]
51 | pub roots: Option,
52 | #[serde(skip_serializing_if = "Option::is_none")]
53 | pub sampling: Option,
54 | }
55 |
56 | #[derive(Debug, Serialize, Deserialize)]
57 | pub struct ServerCapabilities {
58 | #[serde(skip_serializing_if = "Option::is_none")]
59 | pub resources: Option,
60 | #[serde(skip_serializing_if = "Option::is_none")]
61 | pub tools: Option,
62 | #[serde(skip_serializing_if = "Option::is_none")]
63 | pub logging: Option,
64 | }
65 |
66 | #[derive(Debug, Serialize, Deserialize)]
67 | pub struct ClientInfo {
68 | pub name: String,
69 | pub version: String,
70 | }
71 |
72 | #[derive(Debug, Serialize, Deserialize)]
73 | pub struct ServerInfo {
74 | pub name: String,
75 | pub version: String,
76 | }
77 |
78 | #[derive(Debug, Serialize, Deserialize)]
79 | pub struct RootsCapability {
80 | pub list_changed: Option,
81 | }
82 |
83 | #[derive(Debug, Serialize, Deserialize)]
84 | pub struct SamplingCapability {}
85 |
86 | #[derive(Debug, Serialize, Deserialize)]
87 | pub struct ResourcesCapability {
88 | pub subscribe: Option,
89 | pub list_changed: Option,
90 | }
91 |
92 | #[derive(Debug, Serialize, Deserialize)]
93 | pub struct ToolsCapability {
94 | pub list_changed: Option,
95 | }
96 |
97 | #[derive(Debug, Serialize, Deserialize)]
98 | pub struct LoggingCapability {}
99 |
100 | /// Resource Types
101 | #[derive(Debug, Serialize, Deserialize)]
102 | pub struct Resource {
103 | pub uri: String,
104 | pub name: String,
105 | pub description: Option,
106 | pub mime_type: Option,
107 | }
108 |
109 | #[derive(Debug, Serialize, Deserialize)]
110 | pub struct ResourceContents {
111 | pub uri: String,
112 | pub mime_type: Option,
113 | pub text: Option,
114 | pub blob: Option, // Base64 encoded
115 | }
116 |
117 | /// Tool Types
118 | #[derive(Debug, Serialize, Deserialize)]
119 | pub struct Tool {
120 | pub name: String,
121 | pub description: Option,
122 | pub input_schema: serde_json::Value,
123 | }
124 |
125 | #[derive(Debug, Serialize, Deserialize)]
126 | pub struct ToolCall {
127 | pub name: String,
128 | pub arguments: Option>,
129 | }
130 |
131 | #[derive(Debug, Serialize, Deserialize)]
132 | pub struct ToolResult {
133 | pub content: Vec,
134 | pub is_error: Option,
135 | }
136 |
137 | #[derive(Debug, Serialize, Deserialize)]
138 | pub struct ToolContent {
139 | #[serde(rename = "type")]
140 | pub content_type: String,
141 | pub text: Option,
142 | #[serde(skip_serializing_if = "Option::is_none")]
143 | pub data: Option,
144 | }
145 |
146 | /// GPU Kill specific types
147 | #[derive(Debug, Serialize, Deserialize)]
148 | pub struct GpuInfo {
149 | pub id: u32,
150 | pub name: String,
151 | pub vendor: String,
152 | pub memory_used: f64,
153 | pub memory_total: f64,
154 | pub utilization: f64,
155 | pub temperature: Option,
156 | pub power_usage: Option,
157 | pub processes: Vec,
158 | }
159 |
160 | #[derive(Debug, Serialize, Deserialize)]
161 | pub struct GpuProcess {
162 | pub pid: u32,
163 | pub name: String,
164 | pub memory_usage: f64,
165 | pub user: Option,
166 | }
167 |
168 | #[derive(Debug, Serialize, Deserialize)]
169 | pub struct ThreatInfo {
170 | pub id: String,
171 | pub threat_type: String,
172 | pub severity: String,
173 | pub confidence: f64,
174 | pub description: String,
175 | pub process_info: Option,
176 | }
177 |
178 | #[derive(Debug, Serialize, Deserialize)]
179 | pub struct PolicyInfo {
180 | pub policy_type: String,
181 | pub name: String,
182 | pub enabled: bool,
183 | pub limits: HashMap,
184 | }
185 |
--------------------------------------------------------------------------------
/scripts/setup-gpu-runner.sh:
--------------------------------------------------------------------------------
#!/bin/bash
# GPU Runner Setup Script
# This script helps set up a self-hosted GitHub Actions runner with GPU support

set -e

echo "🚀 GPU Kill - Self-Hosted Runner Setup"
echo "======================================"

# Refuse to run as root: the runner and rustup are per-user installs
if [[ $EUID -eq 0 ]]; then
    echo "❌ This script should not be run as root"
    exit 1
fi

# Detect OS from bash's $OSTYPE; only Linux and macOS are supported
case "$OSTYPE" in
    linux-gnu*)
        OS="linux"
        ;;
    darwin*)
        OS="macos"
        ;;
    *)
        echo "❌ Unsupported OS: $OSTYPE"
        exit 1
        ;;
esac

echo "📋 Detected OS: $OS"
27 |
# Function to install dependencies
# Installs build prerequisites, then best-effort vendor GPU tooling for the
# detected hardware. Individual tool installs use `|| echo` so a missing
# package warns instead of aborting the whole script under `set -e`.
install_deps() {
    echo "📦 Installing system dependencies..."

    if [[ "$OS" == "linux" ]]; then
        sudo apt-get update
        sudo apt-get install -y build-essential libssl-dev pkg-config curl tar

        # Install GPU-specific tools
        echo "🔧 Installing GPU tools..."

        # NVIDIA: only attempt utils when a driver (nvidia-smi) is already present
        if command -v nvidia-smi &> /dev/null; then
            echo "✅ NVIDIA GPU detected"
            sudo apt-get install -y nvidia-utils-* || echo "⚠️ NVIDIA utils installation failed"
        else
            echo "ℹ️ No NVIDIA GPU detected"
        fi

        # AMD: install rocm-smi only when it is not already on PATH
        if command -v rocm-smi &> /dev/null; then
            echo "✅ AMD GPU with ROCm detected"
        else
            echo "ℹ️ Installing ROCm tools..."
            sudo apt-get install -y rocm-smi || echo "⚠️ ROCm installation failed"
        fi

        # Intel: always attempted; harmless when no Intel GPU is present
        echo "ℹ️ Installing Intel GPU tools..."
        sudo apt-get install -y intel-gpu-tools || echo "⚠️ Intel GPU tools installation failed"

    elif [[ "$OS" == "macos" ]]; then
        # macOS needs only the Xcode command line tools for building
        if ! command -v xcode-select &> /dev/null; then
            echo "📱 Installing Xcode command line tools..."
            xcode-select --install || echo "⚠️ Xcode tools installation failed"
        else
            echo "✅ Xcode command line tools already installed"
        fi
    fi
}
69 |
# Function to install Rust
# Installs the toolchain via rustup unless rustc is already on PATH.
install_rust() {
    echo "🦀 Installing Rust..."

    if ! command -v rustc &> /dev/null; then
        curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y
        source ~/.cargo/env
        echo "✅ Rust installed: $(rustc --version)"
    else
        echo "✅ Rust already installed: $(rustc --version)"
    fi
}
82 |
# Function to setup GitHub Actions runner
# Downloads a pinned runner release, then configures it against the
# user-supplied repository with "gpu,<os>" labels.
setup_runner() {
    echo "🏃 Setting up GitHub Actions runner..."

    # Get repository URL and token from user
    read -p "📝 Enter your GitHub repository URL (e.g., https://github.com/username/gpu-kill): " REPO_URL
    # -s keeps the token (a secret) out of the terminal and scrollback
    read -s -p "🔑 Enter your GitHub Personal Access Token (with repo and admin:org permissions): " GITHUB_TOKEN
    echo  # -s suppresses the user's newline; restore it

    # Create runner directory
    RUNNER_DIR="$HOME/actions-runner"
    mkdir -p "$RUNNER_DIR"
    cd "$RUNNER_DIR"

    # Download runner (version pinned so the checksum/behavior is stable)
    if [[ "$OS" == "linux" ]]; then
        RUNNER_FILE="actions-runner-linux-x64-2.311.0.tar.gz"
    elif [[ "$OS" == "macos" ]]; then
        RUNNER_FILE="actions-runner-osx-x64-2.311.0.tar.gz"
    else
        # Defensive: OS is validated at startup, but avoid curl with an
        # empty filename if this function is ever called standalone
        echo "❌ Unsupported OS: $OS"
        return 1
    fi

    echo "📥 Downloading GitHub Actions runner..."
    curl -o "$RUNNER_FILE" -L "https://github.com/actions/runner/releases/download/v2.311.0/$RUNNER_FILE"
    tar xzf "$RUNNER_FILE"

    # Configure runner
    echo "⚙️ Configuring runner..."
    ./config.sh --url "$REPO_URL" --token "$GITHUB_TOKEN" --labels "gpu,$OS" --name "gpu-runner-$(hostname)"

    echo "✅ Runner configured successfully!"
    echo ""
    echo "🎯 To start the runner:"
    echo "   cd $RUNNER_DIR"
    echo "   ./run.sh"
    echo ""
    echo "🎯 To run as a service:"
    echo "   sudo ./svc.sh install"
    echo "   sudo ./svc.sh start"
}
121 |
# Function to test GPU detection
# Builds gpukill from source and runs its listing plus the hardware test
# suite; both steps tolerate machines without GPUs.
test_gpu() {
    echo "🧪 Testing GPU detection..."

    # Clone only on first run; reuse the checkout afterwards
    [[ -d "gpu-kill" ]] || git clone https://github.com/treadiehq/gpu-kill.git

    cd gpu-kill
    cargo build --release

    echo "🔍 GPU Detection Results:"
    ./target/release/gpukill --list || echo "No GPUs detected"

    echo "🧪 Running GPU hardware tests..."
    cargo test --test gpu_hardware_tests || echo "GPU tests completed (some may have been skipped)"
}
140 |
# Main execution
# Interactive menu: each option is a superset of the previous ones
# (deps -> rust -> runner -> test).
main() {
    echo "🎯 What would you like to do?"
    echo "1) Install dependencies only"
    echo "2) Setup GitHub Actions runner"
    echo "3) Test GPU detection"
    echo "4) Full setup (dependencies + runner + test)"
    echo "5) Exit"

    read -p "Choose an option (1-5): " choice

    case "$choice" in
        1) install_deps; install_rust ;;
        2) install_deps; install_rust; setup_runner ;;
        3) install_deps; install_rust; test_gpu ;;
        4) install_deps; install_rust; setup_runner; test_gpu ;;
        5) echo "👋 Goodbye!"; exit 0 ;;
        *) echo "❌ Invalid option"; exit 1 ;;
    esac

    echo "✅ Setup completed!"
}
185 |
186 | # Run main function
187 | main "$@"
188 |
--------------------------------------------------------------------------------
/docs/CLOUD_GPU_SETUP.md:
--------------------------------------------------------------------------------
1 | # Cloud GPU Setup Guide
2 |
3 | This guide shows how to set up GPU testing using cloud services.
4 |
5 | ## Quick Start
6 |
7 | ### Option 1: AWS EC2 with GPU
8 |
9 | 1. **Launch GPU Instance:**
10 | ```bash
11 | # Using AWS CLI
12 | aws ec2 run-instances \
13 | --image-id ami-0c02fb55956c7d316 \
14 | --instance-type g4dn.xlarge \
15 | --key-name your-key \
16 | --security-group-ids sg-xxxxxxxxx \
17 | --subnet-id subnet-xxxxxxxxx
18 | ```
19 |
20 | 2. **Connect and Setup:**
21 | ```bash
22 | ssh -i your-key.pem ubuntu@your-instance-ip
23 | curl -sSL https://raw.githubusercontent.com/treadiehq/gpu-kill/main/scripts/setup-gpu-runner.sh | bash
24 | ```
25 |
26 | ### Option 2: Google Cloud with GPU
27 |
28 | 1. **Create GPU Instance:**
29 | ```bash
30 | gcloud compute instances create gpu-test-runner \
31 | --zone=us-central1-a \
32 | --machine-type=n1-standard-4 \
33 | --accelerator=type=nvidia-tesla-t4,count=1 \
34 | --image-family=ubuntu-2004-lts \
35 | --image-project=ubuntu-os-cloud \
36 | --maintenance-policy=TERMINATE \
37 | --restart-on-failure
38 | ```
39 |
40 | 2. **Setup:**
41 | ```bash
42 | gcloud compute ssh gpu-test-runner --zone=us-central1-a
43 | curl -sSL https://raw.githubusercontent.com/treadiehq/gpu-kill/main/scripts/setup-gpu-runner.sh | bash
44 | ```
45 |
46 | ### Option 3: Azure with GPU
47 |
48 | 1. **Create VM:**
49 | ```bash
50 | az vm create \
51 | --resource-group myResourceGroup \
52 | --name gpu-test-vm \
53 | --image UbuntuLTS \
54 | --size Standard_NC6s_v3 \
55 | --admin-username azureuser \
56 | --generate-ssh-keys
57 | ```
58 |
59 | 2. **Setup:**
60 | ```bash
61 | ssh azureuser@your-vm-ip
62 | curl -sSL https://raw.githubusercontent.com/treadiehq/gpu-kill/main/scripts/setup-gpu-runner.sh | bash
63 | ```
64 |
65 | ## Cost-Effective Options
66 |
67 | ### Spot Instances
68 | - **AWS Spot**: Up to 90% savings
69 | - **GCP Preemptible**: Up to 80% savings
70 | - **Azure Spot**: Up to 90% savings
71 |
72 | ### Example Spot Instance Setup (AWS):
73 | ```bash
74 | aws ec2 request-spot-instances \
75 | --spot-price "0.50" \
76 | --instance-count 1 \
77 | --type "one-time" \
78 | --launch-specification '{
79 | "ImageId": "ami-0c02fb55956c7d316",
80 | "InstanceType": "g4dn.xlarge",
81 | "KeyName": "your-key",
82 | "SecurityGroupIds": ["sg-xxxxxxxxx"]
83 | }'
84 | ```
85 |
86 | ## Docker-Based Testing
87 |
88 | ### NVIDIA Docker Setup
89 | ```bash
90 | # Install NVIDIA Docker
91 | distribution=$(. /etc/os-release;echo $ID$VERSION_ID)
92 | curl -s -L https://nvidia.github.io/nvidia-docker/gpgkey | sudo apt-key add -
93 | curl -s -L https://nvidia.github.io/nvidia-docker/$distribution/nvidia-docker.list | sudo tee /etc/apt/sources.list.d/nvidia-docker.list
94 |
95 | sudo apt-get update && sudo apt-get install -y nvidia-docker2
96 | sudo systemctl restart docker
97 |
98 | # Test GPU access
99 | docker run --rm --gpus all nvidia/cuda:11.0-base nvidia-smi
100 | ```
101 |
102 | ### GPU Kill Docker Testing
103 | ```bash
104 | # Build GPU Kill with GPU support
105 | docker build -t gpukill:gpu .
106 |
107 | # Run tests with GPU access
108 | docker run --rm --gpus all gpukill:gpu cargo test --test gpu_hardware_tests
109 | ```
110 |
111 | ## GitHub Actions Integration
112 |
113 | ### Enable GPU Tests
114 | Once you have a self-hosted runner set up:
115 |
1. **Remove the `if: false` condition** in `.github/workflows/gpu-testing.yml` (this repository has no `ci.yml`):
117 | ```yaml
118 | gpu-hardware-tests:
119 | name: GPU Hardware Tests
120 | runs-on: [self-hosted, gpu]
121 | # if: false # Remove this line
122 | ```
123 |
124 | 2. **Add runner labels** when setting up:
125 | ```bash
126 | ./config.sh --labels "gpu,nvidia,linux" --name "nvidia-gpu-runner"
127 | ```
128 |
129 | ### Conditional GPU Testing
130 | The CI will automatically:
131 | - ✅ **Run GPU tests** when GPU hardware is available
132 | - ✅ **Skip gracefully** when no GPU hardware is found
133 | - ✅ **Work on any runner** (hosted or self-hosted)
134 |
135 | ## Cost Optimization
136 |
137 | ### Scheduled Testing
138 | Set up runners to only run during business hours:
139 | ```yaml
140 | on:
141 | schedule:
142 | - cron: '0 9 * * 1-5' # 9 AM, Monday-Friday
143 | ```
144 |
145 | ### Auto-shutdown
146 | Add auto-shutdown to cloud instances:
147 | ```bash
148 | # AWS
149 | aws ec2 create-tags --resources i-1234567890abcdef0 --tags Key=shutdown,Value=yes
150 |
151 | # GCP
152 | gcloud compute instances add-metadata gpu-test-runner \
153 | --metadata shutdown-script='sudo shutdown -h +60'
154 | ```
155 |
156 | ## Monitoring and Alerts
157 |
158 | ### Set up monitoring for:
159 | - GPU utilization during tests
160 | - Test success/failure rates
161 | - Runner availability
162 | - Cost tracking
163 |
164 | ### Example monitoring script:
165 | ```bash
166 | #!/bin/bash
167 | # Monitor GPU test results
168 | curl -H "Authorization: token $GITHUB_TOKEN" \
169 | "https://api.github.com/repos/treadiehq/gpu-kill/actions/runs" | \
170 | jq '.workflow_runs[] | select(.name=="GPU Hardware Tests") | {status, conclusion, created_at}'
171 | ```
172 |
173 | ## Troubleshooting
174 |
175 | ### Common Issues:
176 |
177 | 1. **GPU not detected:**
178 | ```bash
179 | # Check NVIDIA
180 | nvidia-smi
181 |
182 | # Check AMD
183 | rocm-smi --showid
184 |
185 | # Check Intel
186 | intel_gpu_top
187 | ```
188 |
189 | 2. **Permission issues:**
190 | ```bash
191 | # Add user to docker group
192 | sudo usermod -aG docker $USER
193 |
194 | # Check GPU permissions
195 | ls -la /dev/nvidia*
196 | ```
197 |
198 | 3. **Driver issues:**
199 | ```bash
200 | # Update NVIDIA drivers
201 | sudo apt-get install nvidia-driver-470
202 |
203 | # Update AMD drivers
204 | sudo apt-get install rocm-dkms
205 | ```
206 |
207 | ## Next Steps
208 |
209 | 1. **Choose your cloud provider** (AWS, GCP, Azure)
210 | 2. **Set up a GPU instance** using the scripts above
211 | 3. **Configure the GitHub Actions runner** with GPU labels
212 | 4. **Enable GPU tests** in the CI workflow
213 | 5. **Monitor and optimize** costs and performance
214 |
215 | The GPU tests will now run automatically whenever GPU hardware is available! 🚀
216 |
--------------------------------------------------------------------------------
/dashboard/tailwind.config.js:
--------------------------------------------------------------------------------
/** @type {import('tailwindcss').Config} */
// Tailwind configuration for the GPU Kill dashboard (Nuxt/Vue).
// Pure data: palettes, animations, and plugin wiring — no runtime logic.
export default {
  // Files scanned for class names (content-aware purging)
  content: [
    "./components/**/*.{js,vue,ts}",
    "./layouts/**/*.vue",
    "./pages/**/*.vue",
    "./plugins/**/*.{js,ts}",
    "./app.vue",
    "./error.vue"
  ],
  theme: {
    extend: {
      colors: {
        // Custom GPU Kill brand colors
        primary: {
          50: '#eff6ff',
          100: '#dbeafe',
          200: '#bfdbfe',
          300: '#93c5fd',
          400: '#60a5fa',
          500: '#3b82f6',
          600: '#2563eb',
          700: '#1d4ed8',
          800: '#1e40af',
          900: '#1e3a8a',
          950: '#172554',
        },
        // Accent palette for GPU-related UI elements
        gpu: {
          50: '#f0f9ff',
          100: '#e0f2fe',
          200: '#bae6fd',
          300: '#7dd3fc',
          400: '#38bdf8',
          500: '#0ea5e9',
          600: '#0284c7',
          700: '#0369a1',
          800: '#075985',
          900: '#0c4a6e',
          950: '#082f49',
        },
        danger: {
          50: '#fef2f2',
          100: '#fee2e2',
          200: '#fecaca',
          300: '#fca5a5',
          400: '#f87171',
          500: '#ef4444',
          600: '#dc2626',
          700: '#b91c1c',
          800: '#991b1b',
          900: '#7f1d1d',
          950: '#450a0a',
        },
        warning: {
          50: '#fffbeb',
          100: '#fef3c7',
          200: '#fde68a',
          300: '#fcd34d',
          400: '#fbbf24',
          500: '#f59e0b',
          600: '#d97706',
          700: '#b45309',
          800: '#92400e',
          900: '#78350f',
          950: '#451a03',
        },
        success: {
          50: '#f0fdf4',
          100: '#dcfce7',
          200: '#bbf7d0',
          300: '#86efac',
          400: '#4ade80',
          500: '#22c55e',
          600: '#16a34a',
          700: '#15803d',
          800: '#166534',
          900: '#14532d',
          950: '#052e16',
        },
        // Dark theme colors
        dark: {
          50: '#f8fafc',
          100: '#f1f5f9',
          200: '#e2e8f0',
          300: '#cbd5e1',
          400: '#94a3b8',
          500: '#64748b',
          600: '#475569',
          700: '#334155',
          800: '#1e293b',
          900: '#0f172a',
          950: '#020617',
        }
      },
      fontFamily: {
        sans: ['Inter', 'system-ui', 'sans-serif'],
        mono: ['JetBrains Mono', 'Fira Code', 'monospace'],
      },
      // Named animations; custom ones reference the keyframes block below
      animation: {
        'pulse-slow': 'pulse 3s cubic-bezier(0.4, 0, 0.6, 1) infinite',
        'bounce-slow': 'bounce 2s infinite',
        'spin-slow': 'spin 3s linear infinite',
        'ping-slow': 'ping 2s cubic-bezier(0, 0, 0.2, 1) infinite',
        'fade-in': 'fadeIn 0.5s ease-in-out',
        'slide-up': 'slideUp 0.3s ease-out',
        'slide-down': 'slideDown 0.3s ease-out',
        'scale-in': 'scaleIn 0.2s ease-out',
        'glow': 'glow 2s ease-in-out infinite alternate',
      },
      keyframes: {
        fadeIn: {
          '0%': { opacity: '0' },
          '100%': { opacity: '1' },
        },
        slideUp: {
          '0%': { transform: 'translateY(10px)', opacity: '0' },
          '100%': { transform: 'translateY(0)', opacity: '1' },
        },
        slideDown: {
          '0%': { transform: 'translateY(-10px)', opacity: '0' },
          '100%': { transform: 'translateY(0)', opacity: '1' },
        },
        scaleIn: {
          '0%': { transform: 'scale(0.95)', opacity: '0' },
          '100%': { transform: 'scale(1)', opacity: '1' },
        },
        glow: {
          '0%': { boxShadow: '0 0 5px rgba(59, 130, 246, 0.5)' },
          '100%': { boxShadow: '0 0 20px rgba(59, 130, 246, 0.8)' },
        },
      },
      backdropBlur: {
        xs: '2px',
      },
      // Glow shadows keyed to the semantic palettes above
      boxShadow: {
        'glow': '0 0 20px rgba(59, 130, 246, 0.3)',
        'glow-lg': '0 0 30px rgba(59, 130, 246, 0.4)',
        'glow-danger': '0 0 20px rgba(239, 68, 68, 0.3)',
        'glow-success': '0 0 20px rgba(34, 197, 94, 0.3)',
        'glow-warning': '0 0 20px rgba(245, 158, 11, 0.3)',
        'inner-lg': 'inset 0 2px 4px 0 rgba(0, 0, 0, 0.1)',
      },
      borderRadius: {
        '4xl': '2rem',
        '5xl': '2.5rem',
      },
      spacing: {
        '18': '4.5rem',
        '88': '22rem',
        '128': '32rem',
      },
      zIndex: {
        '60': '60',
        '70': '70',
        '80': '80',
        '90': '90',
        '100': '100',
      },
      // Extra responsive breakpoints beyond the defaults
      screens: {
        'xs': '475px',
        '3xl': '1600px',
      },
      // Dark-tuned prose styles for @tailwindcss/typography
      typography: {
        DEFAULT: {
          css: {
            maxWidth: 'none',
            color: '#e5e7eb',
            a: {
              color: '#60a5fa',
              '&:hover': {
                color: '#93c5fd',
              },
            },
            h1: {
              color: '#ffffff',
            },
            h2: {
              color: '#ffffff',
            },
            h3: {
              color: '#ffffff',
            },
            h4: {
              color: '#ffffff',
            },
            strong: {
              color: '#ffffff',
            },
            code: {
              color: '#fbbf24',
              backgroundColor: '#1f2937',
              padding: '0.25rem 0.375rem',
              borderRadius: '0.25rem',
            },
            // Suppress the default backtick markers around inline code
            'code::before': {
              content: '""',
            },
            'code::after': {
              content: '""',
            },
          },
        },
      },
    },
  },
  plugins: [
    require('@tailwindcss/typography'),
    require('@tailwindcss/forms'),
    require('@tailwindcss/aspect-ratio'),
  ],
  darkMode: 'class',
}
213 |
--------------------------------------------------------------------------------
/mcp/src/server.rs:
--------------------------------------------------------------------------------
1 | //! MCP Server implementation for GPU Kill
2 |
3 | use crate::resources::ResourceHandler;
4 | use crate::tools::ToolHandler;
5 | use crate::types::*;
6 | use crate::MCP_VERSION;
7 | use anyhow::Result;
8 | use serde_json::json;
9 | use std::sync::Arc;
10 | use tokio::sync::RwLock;
11 | use tracing::{debug, error, info};
12 |
13 | /// GPU Kill MCP Server
14 | pub struct GpuKillMCPServer {
15 | resource_handler: Arc,
16 | tool_handler: Arc>,
17 | }
18 |
19 | impl GpuKillMCPServer {
20 | /// Create a new MCP server instance
21 | pub async fn new() -> Result {
22 | let resource_handler = Arc::new(ResourceHandler::new().await?);
23 | let tool_handler = Arc::new(RwLock::new(ToolHandler::new().await?));
24 |
25 | Ok(Self {
26 | resource_handler,
27 | tool_handler,
28 | })
29 | }
30 |
31 | /// Handle an MCP request
32 | pub async fn handle_request(&self, request: JsonRpcRequest) -> Result {
33 | debug!("Handling MCP request: {}", request.method);
34 |
35 | let result = match request.method.as_str() {
36 | "initialize" => self.handle_initialize(request.params).await,
37 | "resources/list" => self.handle_resources_list().await,
38 | "resources/read" => self.handle_resources_read(request.params).await,
39 | "tools/list" => self.handle_tools_list().await,
40 | "tools/call" => self.handle_tools_call(request.params).await,
41 | _ => Err(anyhow::anyhow!("Unknown method: {}", request.method)),
42 | };
43 |
44 | match result {
45 | Ok(data) => Ok(JsonRpcResponse {
46 | jsonrpc: "2.0".to_string(),
47 | id: request.id,
48 | result: Some(data),
49 | error: None,
50 | }),
51 | Err(e) => {
52 | error!("Error handling request {}: {}", request.method, e);
53 | Ok(JsonRpcResponse {
54 | jsonrpc: "2.0".to_string(),
55 | id: request.id,
56 | result: None,
57 | error: Some(JsonRpcError {
58 | code: -32603,
59 | message: "Internal error".to_string(),
60 | data: Some(json!({ "details": e.to_string() })),
61 | }),
62 | })
63 | }
64 | }
65 | }
66 |
67 | async fn handle_initialize(
68 | &self,
69 | _params: Option,
70 | ) -> Result {
71 | info!("MCP client initializing");
72 |
73 | let response = InitializeResponse {
74 | protocol_version: MCP_VERSION.to_string(),
75 | capabilities: ServerCapabilities {
76 | resources: Some(ResourcesCapability {
77 | subscribe: Some(false),
78 | list_changed: Some(false),
79 | }),
80 | tools: Some(ToolsCapability {
81 | list_changed: Some(false),
82 | }),
83 | logging: Some(LoggingCapability {}),
84 | },
85 | server_info: ServerInfo {
86 | name: "GPU Kill MCP Server".to_string(),
87 | version: env!("CARGO_PKG_VERSION").to_string(),
88 | },
89 | };
90 |
91 | Ok(serde_json::to_value(response)?)
92 | }
93 |
94 | async fn handle_resources_list(&self) -> Result {
95 | let resources = self.resource_handler.list_resources();
96 | Ok(json!({ "resources": resources }))
97 | }
98 |
99 | async fn handle_resources_read(
100 | &self,
101 | params: Option,
102 | ) -> Result {
103 | let params = params.ok_or_else(|| anyhow::anyhow!("Missing parameters"))?;
104 | let uri = params
105 | .get("uri")
106 | .and_then(|v| v.as_str())
107 | .ok_or_else(|| anyhow::anyhow!("Missing uri parameter"))?;
108 |
109 | let contents = self.resource_handler.get_resource(uri).await?;
110 | Ok(json!({ "contents": contents }))
111 | }
112 |
113 | async fn handle_tools_list(&self) -> Result {
114 | let tool_handler = self.tool_handler.read().await;
115 | let tools = tool_handler.list_tools();
116 | Ok(json!({ "tools": tools }))
117 | }
118 |
119 | async fn handle_tools_call(
120 | &self,
121 | params: Option,
122 | ) -> Result {
123 | let params = params.ok_or_else(|| anyhow::anyhow!("Missing parameters"))?;
124 | let name = params
125 | .get("name")
126 | .and_then(|v| v.as_str())
127 | .ok_or_else(|| anyhow::anyhow!("Missing name parameter"))?;
128 |
129 | let arguments = params
130 | .get("arguments")
131 | .and_then(|v| v.as_object())
132 | .map(|obj| obj.iter().map(|(k, v)| (k.clone(), v.clone())).collect());
133 |
134 | let mut tool_handler = self.tool_handler.write().await;
135 | let result = tool_handler.execute_tool(name, arguments).await?;
136 |
137 | Ok(json!({ "content": result.content, "isError": result.is_error }))
138 | }
139 |
140 | /// Start the MCP server
141 | pub async fn start(self, port: u16) -> Result<()> {
142 | info!("Starting GPU Kill MCP Server on port {}", port);
143 |
144 | let server = Arc::new(self);
145 |
146 | // For now, we'll implement a simple HTTP-based MCP server
147 | // In a full implementation, this would use stdio or WebSocket transport
148 | let app = axum::Router::new()
149 | .route(
150 | "/mcp",
151 | axum::routing::post({
152 | let server = server.clone();
153 | move |request: axum::extract::Json| {
154 | let server = server.clone();
155 | async move {
156 | match server.handle_request(request.0).await {
157 | Ok(response) => axum::response::Json(response),
158 | Err(e) => {
159 | error!("Failed to handle HTTP request: {}", e);
160 | axum::response::Json(JsonRpcResponse {
161 | jsonrpc: "2.0".to_string(),
162 | id: "error".to_string(),
163 | result: None,
164 | error: Some(JsonRpcError {
165 | code: -32603,
166 | message: "Internal error".to_string(),
167 | data: Some(json!({ "details": e.to_string() })),
168 | }),
169 | })
170 | }
171 | }
172 | }
173 | }
174 | }),
175 | )
176 | .route("/health", axum::routing::get(|| async { "OK" }));
177 |
178 | let listener = tokio::net::TcpListener::bind(format!("0.0.0.0:{}", port)).await?;
179 | info!("MCP Server listening on http://0.0.0.0:{}", port);
180 |
181 | axum::serve(listener, app).await?;
182 | Ok(())
183 | }
184 | }
185 |
// Note: no `Default` impl is provided because construction is async
// (`new()` must be awaited), which `Default::default()` cannot express.
187 |
--------------------------------------------------------------------------------
/src/config.rs:
--------------------------------------------------------------------------------
1 | use anyhow::{Context, Result};
2 | use serde::{Deserialize, Serialize};
3 | use std::fs;
4 | use std::path::Path;
5 |
/// Configuration structure for gpukill
///
/// Serialized to/from TOML by `ConfigManager`; field names map 1:1 to
/// config-file keys, and each can also be overridden via a
/// `GPUKILL_*` environment variable (see `ConfigManager::load_from_env`).
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Config {
    /// Default log level
    pub log_level: String,

    /// Default output format
    pub output_format: String,

    /// Default timeout for process termination
    pub default_timeout_secs: u16,

    /// Whether to show detailed process information by default
    pub show_details: bool,

    /// Watch mode refresh interval in seconds
    pub watch_interval_secs: u64,

    /// Maximum number of processes to show in summary
    pub max_processes_summary: usize,

    /// Table width limit
    pub table_width: usize,

    /// Whether to use colors in output
    pub use_colors: bool,
}
33 |
// Baseline values used when no config file or environment overrides exist.
impl Default for Config {
    fn default() -> Self {
        Self {
            log_level: "info".to_string(),
            output_format: "table".to_string(),
            default_timeout_secs: 5,
            show_details: false,
            watch_interval_secs: 2,
            max_processes_summary: 10,
            table_width: 120,
            use_colors: true,
        }
    }
}
48 |
/// Configuration manager
///
/// Thin owner of a `Config`; loads it from file or environment and can
/// persist it back to disk.
pub struct ConfigManager {
    config: Config,
}
53 |
// Delegates to `new()` so `Default` and `new` cannot drift apart.
impl Default for ConfigManager {
    fn default() -> Self {
        Self::new()
    }
}
59 |
60 | #[allow(dead_code)]
61 | impl ConfigManager {
62 | /// Create a new configuration manager
63 | pub fn new() -> Self {
64 | Self {
65 | config: Config::default(),
66 | }
67 | }
68 |
69 | /// Load configuration from file
70 | pub fn load_from_file>(path: P) -> Result {
71 | let config_path = path.as_ref();
72 |
73 | if !config_path.exists() {
74 | tracing::debug!("Config file not found at {:?}, using defaults", config_path);
75 | return Ok(Self::new());
76 | }
77 |
78 | let content = fs::read_to_string(config_path)
79 | .with_context(|| format!("Failed to read config file: {:?}", config_path))?;
80 |
81 | let config: Config = toml::from_str(&content)
82 | .with_context(|| format!("Failed to parse config file: {:?}", config_path))?;
83 |
84 | tracing::info!("Loaded configuration from {:?}", config_path);
85 | Ok(Self { config })
86 | }
87 |
88 | /// Load configuration from environment variables
89 | pub fn load_from_env() -> Self {
90 | let mut config = Config::default();
91 |
92 | // Override with environment variables if present
93 | if let Ok(log_level) = std::env::var("GPUKILL_LOG_LEVEL") {
94 | config.log_level = log_level;
95 | }
96 |
97 | if let Ok(output_format) = std::env::var("GPUKILL_OUTPUT_FORMAT") {
98 | config.output_format = output_format;
99 | }
100 |
101 | if let Ok(timeout) = std::env::var("GPUKILL_DEFAULT_TIMEOUT") {
102 | if let Ok(timeout_secs) = timeout.parse::() {
103 | config.default_timeout_secs = timeout_secs;
104 | }
105 | }
106 |
107 | if let Ok(show_details) = std::env::var("GPUKILL_SHOW_DETAILS") {
108 | config.show_details = show_details.parse().unwrap_or(false);
109 | }
110 |
111 | if let Ok(watch_interval) = std::env::var("GPUKILL_WATCH_INTERVAL") {
112 | if let Ok(interval_secs) = watch_interval.parse::() {
113 | config.watch_interval_secs = interval_secs;
114 | }
115 | }
116 |
117 | if let Ok(table_width) = std::env::var("GPUKILL_TABLE_WIDTH") {
118 | if let Ok(width) = table_width.parse::() {
119 | config.table_width = width;
120 | }
121 | }
122 |
123 | if let Ok(use_colors) = std::env::var("GPUKILL_USE_COLORS") {
124 | config.use_colors = use_colors.parse().unwrap_or(true);
125 | }
126 |
127 | Self { config }
128 | }
129 |
130 | /// Get the current configuration
131 | pub fn config(&self) -> &Config {
132 | &self.config
133 | }
134 |
135 | /// Get a mutable reference to the configuration
136 | pub fn config_mut(&mut self) -> &mut Config {
137 | &mut self.config
138 | }
139 |
140 | /// Save configuration to file
141 | pub fn save_to_file>(&self, path: P) -> Result<()> {
142 | let config_path = path.as_ref();
143 | let content =
144 | toml::to_string_pretty(&self.config).context("Failed to serialize configuration")?;
145 |
146 | fs::write(config_path, content)
147 | .with_context(|| format!("Failed to write config file: {:?}", config_path))?;
148 |
149 | tracing::info!("Saved configuration to {:?}", config_path);
150 | Ok(())
151 | }
152 |
153 | /// Get default configuration file path
154 | pub fn default_config_path() -> Result {
155 | let home_dir = dirs::home_dir()
156 | .ok_or_else(|| anyhow::anyhow!("Could not determine home directory"))?;
157 |
158 | Ok(home_dir.join(".config").join("gpukill").join("config.toml"))
159 | }
160 |
161 | /// Load configuration from default location
162 | pub fn load_default() -> Result {
163 | let config_path = Self::default_config_path()?;
164 | Self::load_from_file(config_path)
165 | }
166 |
167 | /// Create default configuration file
168 | pub fn create_default_config() -> Result<()> {
169 | let config_path = Self::default_config_path()?;
170 |
171 | // Create directory if it doesn't exist
172 | if let Some(parent) = config_path.parent() {
173 | fs::create_dir_all(parent)
174 | .with_context(|| format!("Failed to create config directory: {:?}", parent))?;
175 | }
176 |
177 | let config_manager = Self::new();
178 | config_manager.save_to_file(config_path)?;
179 |
180 | Ok(())
181 | }
182 | }
183 |
184 | /// Get configuration with fallback chain
185 | pub fn get_config(config_path: Option) -> Result {
186 | // 1. Try to load from specified path
187 | if let Some(path) = config_path {
188 | return ConfigManager::load_from_file(path);
189 | }
190 |
191 | // 2. Try to load from default location
192 | if let Ok(config_manager) = ConfigManager::load_default() {
193 | return Ok(config_manager);
194 | }
195 |
196 | // 3. Load from environment variables
197 | Ok(ConfigManager::load_from_env())
198 | }
199 |
#[cfg(test)]
mod tests {
    use super::*;
    use tempfile::NamedTempFile;

    // Defaults must match the documented out-of-the-box behavior.
    #[test]
    fn test_default_config() {
        let config = Config::default();
        assert_eq!(config.log_level, "info");
        assert_eq!(config.output_format, "table");
        assert_eq!(config.default_timeout_secs, 5);
        assert!(!config.show_details);
        assert_eq!(config.watch_interval_secs, 2);
    }

    // TOML round-trip must preserve field values.
    #[test]
    fn test_config_serialization() {
        let config = Config::default();
        let toml_str = toml::to_string(&config).unwrap();
        let deserialized: Config = toml::from_str(&toml_str).unwrap();

        assert_eq!(config.log_level, deserialized.log_level);
        assert_eq!(config.output_format, deserialized.output_format);
    }

    // Loading a written config file yields the same values back.
    #[test]
    fn test_config_file_loading() {
        let config = Config::default();
        let toml_str = toml::to_string_pretty(&config).unwrap();

        let temp_file = NamedTempFile::new().unwrap();
        std::fs::write(temp_file.path(), toml_str).unwrap();

        let loaded_config = ConfigManager::load_from_file(temp_file.path()).unwrap();
        assert_eq!(loaded_config.config().log_level, config.log_level);
    }

    // A fresh manager starts from the defaults.
    #[test]
    fn test_config_manager_creation() {
        let manager = ConfigManager::new();
        assert_eq!(manager.config().log_level, "info");
    }
}
243 |
--------------------------------------------------------------------------------
/src/remote.rs:
--------------------------------------------------------------------------------
1 | use anyhow::{Context, Result};
2 | use std::{
3 | process::{Command, Stdio},
4 | time::Duration,
5 | };
6 | use tracing::{debug, info, warn};
7 |
/// SSH connection configuration
#[derive(Debug, Clone)]
pub struct SshConfig {
    /// Remote host name or IP address.
    pub host: String,
    /// SSH port (22 is treated as the default and omitted from the
    /// spawned command line).
    pub port: u16,
    /// Remote login user.
    pub username: String,
    /// Optional private key file path, passed to `ssh -i`.
    pub key_path: Option<String>,
    /// Optional password, written to ssh's stdin when set.
    pub password: Option<String>,
    /// Connection timeout.
    pub timeout: Duration,
}
18 |
19 | impl SshConfig {
20 | /// Create a new SSH configuration
21 | pub fn new(host: String, port: u16, username: String) -> Self {
22 | Self {
23 | host,
24 | port,
25 | username,
26 | key_path: None,
27 | password: None,
28 | timeout: Duration::from_secs(30),
29 | }
30 | }
31 |
32 | /// Set SSH key path
33 | pub fn with_key_path(mut self, key_path: String) -> Self {
34 | self.key_path = Some(key_path);
35 | self
36 | }
37 |
38 | /// Set SSH password
39 | pub fn with_password(mut self, password: String) -> Self {
40 | self.password = Some(password);
41 | self
42 | }
43 |
44 | /// Set connection timeout
45 | pub fn with_timeout(mut self, timeout: Duration) -> Self {
46 | self.timeout = timeout;
47 | self
48 | }
49 | }
50 |
/// SSH remote connection manager using system SSH
///
/// Shells out to the local `ssh` binary rather than embedding an SSH
/// library; the binary must be on PATH.
pub struct SshRemote {
    // Connection parameters reused for every spawned ssh process.
    config: SshConfig,
}
55 |
56 | impl SshRemote {
57 | /// Create a new SSH remote connection
58 | pub fn new(config: SshConfig) -> Self {
59 | Self { config }
60 | }
61 |
62 | /// Execute a command on the remote host
63 | pub fn execute_command(&self, command: &str) -> Result {
64 | debug!("Executing remote command: {}", command);
65 |
66 | let mut ssh_cmd = Command::new("ssh");
67 |
68 | // Add SSH options
69 | ssh_cmd
70 | .arg("-o")
71 | .arg("ConnectTimeout=30")
72 | .arg("-o")
73 | .arg("StrictHostKeyChecking=no")
74 | .arg("-o")
75 | .arg("UserKnownHostsFile=/dev/null")
76 | .arg("-o")
77 | .arg("LogLevel=ERROR");
78 |
79 | // Add port if not default
80 | if self.config.port != 22 {
81 | ssh_cmd.arg("-p").arg(self.config.port.to_string());
82 | }
83 |
84 | // Add key file if specified
85 | if let Some(key_path) = &self.config.key_path {
86 | ssh_cmd.arg("-i").arg(key_path);
87 | }
88 |
89 | // Add password authentication if specified
90 | if self.config.password.is_some() {
91 | ssh_cmd.arg("-o").arg("PasswordAuthentication=yes");
92 | }
93 |
94 | // Add host and command
95 | let host_spec = format!("{}@{}", self.config.username, self.config.host);
96 | ssh_cmd.arg(host_spec).arg(command);
97 |
98 | // Set up input for password if needed
99 | if let Some(_password) = &self.config.password {
100 | ssh_cmd.stdin(Stdio::piped());
101 | }
102 |
103 | debug!("Running SSH command: {:?}", ssh_cmd);
104 |
105 | let mut child = ssh_cmd
106 | .stdout(Stdio::piped())
107 | .stderr(Stdio::piped())
108 | .spawn()
109 | .context("Failed to spawn SSH command")?;
110 |
111 | // Send password if provided
112 | if let Some(password) = &self.config.password {
113 | if let Some(stdin) = child.stdin.as_mut() {
114 | use std::io::Write;
115 | stdin
116 | .write_all(password.as_bytes())
117 | .context("Failed to write password to SSH stdin")?;
118 | stdin
119 | .write_all(b"\n")
120 | .context("Failed to write newline to SSH stdin")?;
121 | }
122 | }
123 |
124 | let output = child
125 | .wait_with_output()
126 | .context("Failed to wait for SSH command")?;
127 |
128 | if !output.status.success() {
129 | let stderr = String::from_utf8_lossy(&output.stderr);
130 | return Err(anyhow::anyhow!(
131 | "SSH command failed with exit code {}: {}",
132 | output.status.code().unwrap_or(-1),
133 | stderr
134 | ));
135 | }
136 |
137 | let stdout = String::from_utf8(output.stdout)
138 | .context("Failed to decode SSH command output as UTF-8")?;
139 |
140 | debug!(
141 | "Command executed successfully, output length: {} bytes",
142 | stdout.len()
143 | );
144 | Ok(stdout)
145 | }
146 |
147 | /// Execute gpukill command on remote host
148 | pub fn execute_gpukill(&self, args: &[String]) -> Result {
149 | let command = format!("gpukill {}", args.join(" "));
150 | self.execute_command(&command)
151 | }
152 |
153 | /// Check if gpukill is available on remote host
154 | pub fn check_gpukill_availability(&self) -> Result {
155 | match self.execute_command("which gpukill") {
156 | Ok(output) => {
157 | let available = !output.trim().is_empty();
158 | if available {
159 | info!("gpukill is available on remote host");
160 | } else {
161 | warn!("gpukill not found on remote host");
162 | }
163 | Ok(available)
164 | }
165 | Err(_) => {
166 | warn!("Failed to check gpukill availability on remote host");
167 | Ok(false)
168 | }
169 | }
170 | }
171 |
172 | /// Get remote host information
173 | pub fn get_host_info(&self) -> Result {
174 | let hostname = self.execute_command("hostname")?.trim().to_string();
175 | let os_info = self.execute_command("uname -a")?.trim().to_string();
176 | let gpu_info = self.execute_command("nvidia-smi --query-gpu=name --format=csv,noheader,nounits 2>/dev/null || echo 'No NVIDIA GPUs'")?.trim().to_string();
177 |
178 | Ok(RemoteHostInfo {
179 | hostname,
180 | os_info,
181 | gpu_info,
182 | })
183 | }
184 | }
185 |
/// Information about the remote host
#[derive(Debug, Clone)]
pub struct RemoteHostInfo {
    /// Output of `hostname` on the remote machine (trimmed).
    pub hostname: String,
    /// Output of `uname -a` on the remote machine (trimmed).
    pub os_info: String,
    /// NVIDIA GPU names from nvidia-smi, or the literal
    /// 'No NVIDIA GPUs' when the tool is absent.
    #[allow(dead_code)]
    pub gpu_info: String,
}
194 |
195 | /// Execute a local gpukill command with remote forwarding
196 | pub fn execute_remote_operation(config: SshConfig, local_args: &[String]) -> Result<()> {
197 | let remote = SshRemote::new(config);
198 |
199 | // Check if gpukill is available on remote host
200 | if !remote.check_gpukill_availability()? {
201 | return Err(anyhow::anyhow!(
202 | "gpukill is not available on the remote host. Please install gpukill on the remote host first."
203 | ));
204 | }
205 |
206 | // Get remote host info
207 | let host_info = remote.get_host_info()?;
208 | info!(
209 | "Remote host: {} ({})",
210 | host_info.hostname, host_info.os_info
211 | );
212 |
213 | // Execute the command on remote host
214 | let output = remote.execute_gpukill(local_args)?;
215 |
216 | // Print the output
217 | print!("{}", output);
218 |
219 | Ok(())
220 | }
221 |
#[cfg(test)]
mod tests {
    use super::*;
    use std::time::Duration;

    // A freshly constructed config carries only the mandatory fields
    // plus the 30-second default timeout.
    #[test]
    fn test_ssh_config_creation() {
        let cfg = SshConfig::new("localhost".to_string(), 22, "testuser".to_string());
        assert_eq!(cfg.host, "localhost");
        assert_eq!(cfg.port, 22);
        assert_eq!(cfg.username, "testuser");
        assert_eq!(cfg.timeout, Duration::from_secs(30));
    }

    // Each builder setter must be reflected in the final value.
    #[test]
    fn test_ssh_config_with_options() {
        let cfg = SshConfig::new("localhost".to_string(), 22, "testuser".to_string())
            .with_timeout(Duration::from_secs(60))
            .with_password("password".to_string())
            .with_key_path("/path/to/key".to_string());

        assert_eq!(cfg.key_path, Some("/path/to/key".to_string()));
        assert_eq!(cfg.password, Some("password".to_string()));
        assert_eq!(cfg.timeout, Duration::from_secs(60));
    }
}
248 |
--------------------------------------------------------------------------------
/.github/workflows/self-hosted-setup.md:
--------------------------------------------------------------------------------
1 | # Self-Hosted GPU Runner Setup Guide
2 |
3 | This guide explains how to set up self-hosted GitHub Actions runners with GPU hardware for testing of GPU Kill.
4 |
5 | ## Overview
6 |
7 | GPU Kill requires actual GPU hardware to test all functionality. This setup provides:
8 | - **NVIDIA GPU testing** with CUDA/NVML
9 | - **AMD GPU testing** with ROCm
10 | - **Intel GPU testing** with intel-gpu-tools
11 | - **Apple Silicon testing** on macOS
12 | - **Cross-platform compatibility** testing
13 |
14 | ## Hardware Requirements
15 |
16 | ### NVIDIA Runner
17 | - **GPU**: Any NVIDIA GPU with CUDA support
18 | - **OS**: Ubuntu 22.04 LTS
19 | - **RAM**: 16GB+ recommended
20 | - **Storage**: 100GB+ SSD
21 | - **CPU**: 4+ cores
22 |
23 | ### AMD Runner
24 | - **GPU**: AMD GPU with ROCm support (RX 5000/6000 series, MI series)
25 | - **OS**: Ubuntu 22.04 LTS
26 | - **RAM**: 16GB+ recommended
27 | - **Storage**: 100GB+ SSD
28 | - **CPU**: 4+ cores
29 |
30 | ### Intel Runner
31 | - **GPU**: Intel Arc, Iris Xe, or integrated GPU
32 | - **OS**: Ubuntu 22.04 LTS
33 | - **RAM**: 8GB+ recommended
34 | - **Storage**: 50GB+ SSD
35 | - **CPU**: 4+ cores
36 |
37 | ### Apple Silicon Runner
38 | - **Hardware**: Mac Studio, MacBook Pro, or Mac mini with M1/M2/M3/M4
39 | - **OS**: macOS 13+ (Ventura)
40 | - **RAM**: 16GB+ recommended
41 | - **Storage**: 100GB+ SSD
42 |
43 | ## Setup Instructions
44 |
45 | ### 1. NVIDIA Runner Setup
46 |
47 | ```bash
48 | # Install Ubuntu 22.04 LTS
49 | # Update system
50 | sudo apt update && sudo apt upgrade -y
51 |
52 | # Install NVIDIA drivers
53 | sudo apt install -y nvidia-driver-535
54 | sudo reboot
55 |
56 | # Verify NVIDIA installation
57 | nvidia-smi
58 |
59 | # Install development tools
60 | sudo apt install -y build-essential curl git libssl-dev pkg-config
61 |
62 | # Install Rust
63 | curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh
64 | source ~/.cargo/env
65 |
66 | # Install GitHub Actions runner
67 | mkdir actions-runner && cd actions-runner
68 | curl -o actions-runner-linux-x64-2.311.0.tar.gz -L https://github.com/actions/runner/releases/download/v2.311.0/actions-runner-linux-x64-2.311.0.tar.gz
69 | tar xzf ./actions-runner-linux-x64-2.311.0.tar.gz
70 |
71 | # Configure runner (get token from GitHub repo settings)
72 | ./config.sh --url https://github.com/treadiehq/gpu-kill --token <YOUR_RUNNER_TOKEN> \
73 |   --name "nvidia-gpu-runner" --labels "self-hosted,gpu,nvidia,ubuntu-22.04"
74 |
75 | # Install as service
76 | sudo ./svc.sh install
77 | sudo ./svc.sh start
78 | ```
79 |
80 | ### 2. AMD Runner Setup
81 |
82 | ```bash
83 | # Install Ubuntu 22.04 LTS
84 | # Update system
85 | sudo apt update && sudo apt upgrade -y
86 |
87 | # Install ROCm
88 | wget https://repo.radeon.com/amdgpu-install/5.7/ubuntu/jammy/amdgpu-install_5.7.50700-1_all.deb
89 | sudo dpkg -i amdgpu-install_5.7.50700-1_all.deb
90 | sudo apt-get update
91 | sudo amdgpu-install --usecase=rocm
92 |
93 | # Verify ROCm installation
94 | rocm-smi
95 | rocminfo
96 |
97 | # Install development tools
98 | sudo apt install -y build-essential curl git libssl-dev pkg-config
99 |
100 | # Install Rust
101 | curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh
102 | source ~/.cargo/env
103 |
104 | # Install GitHub Actions runner (same as NVIDIA)
105 | mkdir actions-runner && cd actions-runner
106 | curl -o actions-runner-linux-x64-2.311.0.tar.gz -L https://github.com/actions/runner/releases/download/v2.311.0/actions-runner-linux-x64-2.311.0.tar.gz
107 | tar xzf ./actions-runner-linux-x64-2.311.0.tar.gz
108 |
109 | # Configure runner
110 | ./config.sh --url https://github.com/treadiehq/gpu-kill --token <YOUR_RUNNER_TOKEN> \
111 |   --name "amd-gpu-runner" --labels "self-hosted,gpu,amd,ubuntu-22.04"
112 |
113 | # Install as service
114 | sudo ./svc.sh install
115 | sudo ./svc.sh start
116 | ```
117 |
118 | ### 3. Intel Runner Setup
119 |
120 | ```bash
121 | # Install Ubuntu 22.04 LTS
122 | # Update system
123 | sudo apt update && sudo apt upgrade -y
124 |
125 | # Install Intel GPU tools
126 | sudo apt install -y intel-gpu-tools
127 |
128 | # Verify Intel GPU tools
129 | intel_gpu_top --help
130 |
131 | # Install development tools
132 | sudo apt install -y build-essential curl git libssl-dev pkg-config
133 |
134 | # Install Rust
135 | curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh
136 | source ~/.cargo/env
137 |
138 | # Install GitHub Actions runner
139 | mkdir actions-runner && cd actions-runner
140 | curl -o actions-runner-linux-x64-2.311.0.tar.gz -L https://github.com/actions/runner/releases/download/v2.311.0/actions-runner-linux-x64-2.311.0.tar.gz
141 | tar xzf ./actions-runner-linux-x64-2.311.0.tar.gz
142 |
143 | # Configure runner
144 | ./config.sh --url https://github.com/treadiehq/gpu-kill --token <YOUR_RUNNER_TOKEN> \
145 |   --name "intel-gpu-runner" --labels "self-hosted,gpu,intel,ubuntu-22.04"
146 |
147 | # Install as service
148 | sudo ./svc.sh install
149 | sudo ./svc.sh start
150 | ```
151 |
152 | ### 4. Apple Silicon Runner Setup
153 |
154 | ```bash
155 | # Install macOS 13+ (Ventura)
156 | # Install Xcode command line tools
157 | xcode-select --install
158 |
159 | # Install Homebrew
160 | /bin/bash -c "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/HEAD/install.sh)"
161 |
162 | # Install Rust
163 | curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh
164 | source ~/.cargo/env
165 |
166 | # Verify Apple Silicon GPU
167 | system_profiler SPDisplaysDataType
168 |
169 | # Install GitHub Actions runner
170 | mkdir actions-runner && cd actions-runner
171 | curl -o actions-runner-osx-arm64-2.311.0.tar.gz -L https://github.com/actions/runner/releases/download/v2.311.0/actions-runner-osx-arm64-2.311.0.tar.gz
172 | tar xzf ./actions-runner-osx-arm64-2.311.0.tar.gz
173 |
174 | # Configure runner
175 | ./config.sh --url https://github.com/treadiehq/gpu-kill --token <YOUR_RUNNER_TOKEN> \
176 |   --name "apple-gpu-runner" --labels "self-hosted,gpu,apple,macos-13"
177 |
178 | # Install as service
179 | ./svc.sh install
180 | ./svc.sh start
181 | ```
182 |
183 | ## Runner Labels
184 |
185 | Each runner should be configured with these labels:
186 | - `self-hosted` - Required for self-hosted runners
187 | - `gpu` - Indicates GPU hardware availability
188 | - `nvidia`/`amd`/`intel`/`apple` - GPU vendor
189 | - `ubuntu-22.04`/`macos-13` - Operating system
190 | - `stress-test` - For runners capable of stress testing
191 |
192 | ## Security Considerations
193 |
194 | 1. **Network Security**: Ensure runners are behind a firewall
195 | 2. **Access Control**: Limit who can access the runner machines
196 | 3. **Token Management**: Regularly rotate GitHub tokens
197 | 4. **Monitoring**: Monitor runner health and performance
198 | 5. **Updates**: Keep runners updated with security patches
199 |
200 | ## Monitoring and Maintenance
201 |
202 | ### Health Checks
203 | ```bash
204 | # Check runner status
205 | sudo systemctl status actions.runner.*
206 |
207 | # Check GPU status
208 | nvidia-smi # NVIDIA
209 | rocm-smi # AMD
210 | intel_gpu_top --help # Intel
211 | system_profiler SPDisplaysDataType # Apple
212 | ```
213 |
214 | ### Logs
215 | ```bash
216 | # View runner logs
217 | sudo journalctl -u actions.runner.* -f
218 |
219 | # View GitHub Actions logs
220 | tail -f /home/runner/_diag/Runner_*.log
221 | ```
222 |
223 | ### Updates
224 | ```bash
225 | # Update runner software
226 | cd actions-runner
227 | ./config.sh remove --token <YOUR_RUNNER_TOKEN>
228 | # Download new version
229 | ./config.sh --url https://github.com/treadiehq/gpu-kill --token <YOUR_RUNNER_TOKEN>
230 | ```
231 |
232 | ## Cost Optimization
233 |
234 | 1. **Scheduled Testing**: Run tests during off-peak hours
235 | 2. **Resource Scaling**: Use smaller instances for basic tests
236 | 3. **Caching**: Implement aggressive caching for dependencies
237 | 4. **Parallel Testing**: Run multiple test suites in parallel
238 |
239 | ## Troubleshooting
240 |
241 | ### Common Issues
242 |
243 | 1. **GPU Not Detected**
244 | ```bash
245 | # Check GPU status
246 | lspci | grep -i vga
247 | nvidia-smi # or rocm-smi, intel_gpu_top
248 | ```
249 |
250 | 2. **Permission Issues**
251 | ```bash
252 | # Add user to video group
253 | sudo usermod -a -G video $USER
254 | sudo usermod -a -G render $USER
255 | ```
256 |
257 | 3. **Driver Issues**
258 | ```bash
259 | # Reinstall drivers
260 | sudo apt purge nvidia-* # NVIDIA
261 | sudo apt purge rocm-* # AMD
262 | sudo apt install nvidia-driver-535 # Reinstall
263 | ```
264 |
265 | 4. **Runner Connection Issues**
266 | ```bash
267 | # Check network connectivity
268 | curl -I https://github.com
269 | # Restart runner service
270 | sudo systemctl restart actions.runner.*
271 | ```
272 |
273 | ## Integration with GPU Kill
274 |
275 | The runners will automatically execute the GPU testing workflow when:
276 | - Code is pushed to main/develop branches
277 | - Pull requests are opened
278 | - Manual workflow dispatch is triggered
279 |
280 | Tests include:
281 | - GPU detection and enumeration
282 | - Performance benchmarking
283 | - Memory usage testing
284 | - Stress testing
285 | - Cross-platform compatibility
286 | - Security auditing
287 |
--------------------------------------------------------------------------------
/docs/HOTAISLE_INTEGRATION.md:
--------------------------------------------------------------------------------
1 | # Hot Aisle Integration for GPU Testing
2 |
3 | This document describes the **optional** integration between GPU Kill and Hot Aisle's infrastructure for automated GPU testing in CI/CD pipelines.
4 |
5 | > **Note**: Hot Aisle integration is an optional feature that must be enabled with the `hotaisle` feature flag.
6 |
7 | ## Overview
8 |
9 | The Hot Aisle integration enables GPU Kill to run comprehensive tests on real GPU hardware by:
10 |
11 | 1. **Provisioning GPU instances** on-demand via Hot Aisle's API
12 | 2. **Running GPU tests** on actual hardware (NVIDIA, AMD, Intel, Apple Silicon)
13 | 3. **Automated cleanup** to minimize costs
14 | 4. **Comprehensive reporting** of test results
15 |
16 | ## Architecture
17 |
18 | ```
19 | ┌─────────────────┐ ┌─────────────────┐ ┌─────────────────┐
20 | │ GitHub │ │ Hot Aisle │ │ GPU Hardware │
21 | │ Actions │◄──►│ API │◄──►│ (NVIDIA/AMD) │
22 | │ (CI/CD) │ │ (Backend) │ │ (Intel/Apple) │
23 | └─────────────────┘ └─────────────────┘ └─────────────────┘
24 | ```
25 |
26 | ## Components
27 |
28 | ### 1. Hot Aisle API Client (`src/hotaisle_client.rs`)
29 |
30 | Rust client for interacting with Hot Aisle's API:
31 |
32 | ```rust
33 | use gpukill::hotaisle_client::{HotAisleClient, GpuInstanceConfig};
34 |
35 | let client = HotAisleClient::new(api_key, None);
36 |
37 | let config = GpuInstanceConfig {
38 | gpu_type: "nvidia".to_string(),
39 | duration_minutes: 30,
40 | instance_type: Some("g4dn.xlarge".to_string()),
41 | labels: Some(vec!["ci-test".to_string()]),
42 | };
43 |
44 | let instance = client.provision_gpu_instance(config).await?;
45 | ```
46 |
47 | ### 2. GPU Test Script (`scripts/run-gpu-tests.sh`)
48 |
49 | Comprehensive test script that runs on provisioned instances:
50 |
51 | ### 3. Integration Test Script (`scripts/test-hotaisle-integration-simple.sh`)
52 |
53 | CI-friendly test script that validates the Hot Aisle integration without requiring API access:
54 |
55 | - **GPU Detection Tests**: Verify GPU enumeration and information retrieval
56 | - **Vendor-Specific Tests**: NVIDIA (nvidia-smi), AMD (rocm-smi, amd-smi), Intel (intel_gpu_top)
57 | - **Performance Tests**: Run GPU hardware tests and benchmarks
58 | - **Stress Tests**: Multiple iterations to ensure reliability
59 | - **Report Generation**: Detailed test reports with system information
60 |
61 | ### 4. GitHub Actions Workflow (`.github/workflows/hotaisle-gpu-testing.yml`)
62 |
63 | Automated CI/CD pipeline that:
64 |
65 | - **Provisions GPU instances** based on matrix strategy
66 | - **Deploys GPU Kill** to instances
67 | - **Runs comprehensive tests** on real hardware
68 | - **Collects results** and uploads artifacts
69 | - **Cleans up instances** automatically
70 |
71 | ## Setup Instructions
72 |
73 | ### 1. Enable Hot Aisle Feature
74 |
75 | Build GPU Kill with the Hot Aisle feature enabled:
76 |
77 | ```bash
78 | # Build with Hot Aisle integration
79 | cargo build --release --features hotaisle
80 |
81 | # Or install with Hot Aisle integration
82 | cargo install --path . --features hotaisle
83 | ```
84 |
85 | ### 2. Hot Aisle API Key
86 |
87 | Add your Hot Aisle API key to GitHub Secrets:
88 |
89 | ```bash
90 | # In your GitHub repository settings:
91 | # Settings → Secrets and variables → Actions → New repository secret
92 | # Name: HOTAISLE_API_KEY
93 | # Value: your-hotaisle-api-key
94 | ```
95 |
96 | ### 3. Configure GPU Types
97 |
98 | The workflow supports testing multiple GPU types:
99 |
100 | ```yaml
101 | # Default configuration
102 | matrix:
103 | gpu_type: [nvidia, amd, intel]
104 |
105 | # Manual dispatch with custom GPU types
106 | # Use workflow_dispatch with inputs:
107 | # gpu_types: "nvidia,amd,intel,apple-silicon"
108 | ```
109 |
110 | ### 4. Test Duration
111 |
112 | Configure test duration to balance thoroughness with cost:
113 |
114 | ```yaml
115 | # Default: 30 minutes
116 | # Can be overridden via workflow_dispatch
117 | test_duration: "30" # minutes
118 | ```
119 |
120 | ## Usage
121 |
122 | ### Integration Testing
123 |
124 | The integration is validated automatically via the "Test Hot Aisle Integration" workflow:
125 | - **Runs on**: Changes to Hot Aisle-related files
126 | - **Validates**: Build system, feature flags, documentation, and workflow syntax
127 | - **No API key required**: Tests the integration structure without actual GPU provisioning
128 |
129 | ### Manual GPU Testing
130 |
131 | Trigger tests manually via GitHub Actions:
132 |
133 | 1. Go to **Actions** tab in your repository
134 | 2. Select **Hot Aisle GPU Testing** workflow
135 | 3. Click **Run workflow**
136 | 4. Configure parameters:
137 | - **GPU types**: Comma-separated list (e.g., `nvidia,amd,intel`)
138 | - **Test duration**: Minutes (e.g., `30`)
139 |
140 | ### Local Testing
141 |
142 | Test the integration locally:
143 |
144 | ```bash
145 | # Build GPU Kill
146 | cargo build --release
147 |
148 | # Run GPU tests (requires GPU hardware)
149 | ./scripts/run-gpu-tests.sh nvidia
150 | ```
151 |
152 | ## Supported GPU Types
153 |
154 | | GPU Type | Tools Used | Tests |
155 | |----------|------------|-------|
156 | | **NVIDIA** | nvidia-smi, NVML | GPU enumeration, memory, utilization, temperature, power |
157 | | **AMD** | rocm-smi, amd-smi | GPU enumeration, memory, utilization, temperature, power |
158 | | **Intel** | intel_gpu_top | GPU enumeration, utilization, memory estimation |
159 | | **Apple Silicon** | system_profiler | GPU enumeration, memory usage, Metal processes |
160 |
161 | ## Cost Optimization
162 |
163 | ### 1. Instance Lifecycle Management
164 |
165 | - **Automatic provisioning** only when needed
166 | - **Immediate cleanup** after tests complete
167 | - **Timeout protection** to prevent runaway costs
168 |
169 | ### 2. Test Duration Control
170 |
171 | - **Configurable duration** (default: 30 minutes)
172 | - **Fast failure** for quick feedback
173 | - **Comprehensive testing** when needed
174 |
175 | ### 3. Resource Efficiency
176 |
177 | - **Parallel testing** across GPU types
178 | - **Shared infrastructure** via Hot Aisle
179 | - **No always-on runners** required
180 |
181 | ## Test Results
182 |
183 | ### Artifacts
184 |
185 | Each test run produces:
186 |
187 | - **Test Output Log**: Detailed execution logs
188 | - **Test Report**: Comprehensive system and GPU information
189 | - **Retention**: 30 days for debugging
190 |
191 | ### Metrics
192 |
193 | Tests measure:
194 |
195 | - **GPU Detection**: Number of GPUs found
196 | - **Information Retrieval**: JSON validity and completeness
197 | - **Performance**: Test execution time
198 | - **Reliability**: Stress test success rate
199 |
200 | ## Troubleshooting
201 |
202 | ### Common Issues
203 |
204 | 1. **Instance Provisioning Fails**
205 | - Check Hot Aisle API key validity
206 | - Verify GPU type availability
207 | - Check Hot Aisle service status
208 |
209 | 2. **SSH Connection Issues**
210 | - Verify instance IP address
211 | - Check SSH key generation
212 | - Ensure instance is ready
213 |
214 | 3. **Test Failures**
215 | - Review test output logs
216 | - Check GPU driver installation
217 | - Verify tool availability (nvidia-smi, rocm-smi, etc.)
218 |
219 | ### Debug Mode
220 |
221 | Enable debug logging:
222 |
223 | ```bash
224 | export RUST_LOG=debug
225 | export RUST_BACKTRACE=1
226 | ```
227 |
228 | ## API Reference
229 |
230 | ### HotAisleClient
231 |
232 | ```rust
233 | impl HotAisleClient {
234 |     pub fn new(api_key: String, base_url: Option<String>) -> Self
235 |     pub async fn provision_gpu_instance(&self, config: GpuInstanceConfig) -> Result<GpuInstance>
236 |     pub async fn wait_for_instance_ready(&self, instance_id: &str, timeout_minutes: u32) -> Result<GpuInstance>
237 |     pub async fn get_instance(&self, instance_id: &str) -> Result<GpuInstance>
238 |     pub async fn run_gpu_tests(&self, instance: &GpuInstance, test_config: &GpuTestConfig) -> Result<GpuTestResults>
239 |     pub async fn terminate_instance(&self, instance_id: &str) -> Result<()>
240 |     pub async fn list_available_gpu_types(&self) -> Result<Vec<String>>
241 | }
242 | ```
243 |
244 | ### Configuration Types
245 |
246 | ```rust
247 | pub struct GpuInstanceConfig {
248 |     pub gpu_type: String,                 // nvidia, amd, intel, apple-silicon
249 |     pub duration_minutes: u32,            // Instance lifetime
250 |     pub instance_type: Option<String>,    // Auto-selected if None
251 |     pub labels: Option<Vec<String>>,      // Custom labels
252 | }
253 |
254 | pub struct GpuTestConfig {
255 |     pub test_command: String,                       // Command to execute
256 |     pub timeout_minutes: u32,                       // Test timeout
257 |     pub env_vars: Option<HashMap<String, String>>,  // Environment variables
258 |     pub working_dir: Option<String>,                // Working directory
259 | }
260 | ```
261 |
262 | ## Future Enhancements
263 |
264 | ### Planned Features
265 |
266 | 1. **Advanced GPU Testing**
267 | - CUDA/ROCm kernel testing
268 | - Memory bandwidth benchmarks
269 | - Multi-GPU coordination tests
270 |
271 | 2. **Cost Analytics**
272 | - Test cost tracking
273 | - Optimization recommendations
274 | - Budget alerts
275 |
276 | 3. **Integration Improvements**
277 | - Webhook notifications
278 | - Slack/Teams integration
279 | - Custom test configurations
280 |
281 | ### Contributing
282 |
283 | To contribute to the Hot Aisle integration:
284 |
285 | 1. **Fork the repository**
286 | 2. **Create a feature branch**
287 | 3. **Add tests** for new functionality
288 | 4. **Update documentation**
289 | 5. **Submit a pull request**
290 |
291 | ## Support
292 |
293 | For issues related to:
294 |
295 | - **GPU Kill**: Create an issue in this repository
296 | - **Hot Aisle API**: Contact Hot Aisle support
297 | - **Integration**: Check the troubleshooting section above
298 |
299 | ## License
300 |
301 | This integration is part of GPU Kill and follows the same license terms.
302 |
--------------------------------------------------------------------------------
/src/hotaisle_client.rs:
--------------------------------------------------------------------------------
1 | //! Hot Aisle API client for GPU instance provisioning and management
2 | //!
3 | //! This module provides integration with Hot Aisle's infrastructure
4 | //! for on-demand GPU testing in CI/CD pipelines.
5 |
6 | use anyhow::Result;
7 | use serde::{Deserialize, Serialize};
8 | use std::time::Duration;
9 | use tokio::time::sleep;
10 |
/// Hot Aisle API client for managing GPU instances
pub struct HotAisleClient {
    // API key for the Hot Aisle service (how it is attached to
    // requests is determined where requests are built).
    api_key: String,
    // API root; defaults to "https://admin.hotaisle.app/api".
    base_url: String,
    // HTTP client, shared/reused across requests.
    client: reqwest::Client,
}
17 |
18 | /// GPU instance configuration
19 | #[derive(Debug, Clone, Serialize, Deserialize)]
20 | pub struct GpuInstanceConfig {
21 | /// GPU type (nvidia, amd, intel, apple-silicon)
22 | pub gpu_type: String,
23 | /// Instance duration in minutes
24 | pub duration_minutes: u32,
25 | /// Instance size/type
26 | pub instance_type: Option,
27 | /// Custom labels for the instance
28 | pub labels: Option>,
29 | }
30 |
/// GPU instance information
///
/// Deserialized from the Hot Aisle API's instance JSON responses.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct GpuInstance {
    /// Unique instance ID
    pub id: String,
    /// Instance IP address
    pub ip_address: String,
    /// SSH connection details
    pub ssh_config: SshConfig,
    /// GPU type
    pub gpu_type: String,
    /// Instance status (observed values include "ready", "running",
    /// "failed", "error"; anything else is treated as "still provisioning")
    pub status: String,
    /// Creation timestamp (API-provided string; format not parsed here)
    pub created_at: String,
    /// Expiration timestamp (API-provided string; format not parsed here)
    pub expires_at: String,
}
49 |
50 | /// SSH connection configuration
51 | #[derive(Debug, Clone, Serialize, Deserialize)]
52 | pub struct SshConfig {
53 | /// SSH username
54 | pub username: String,
55 | /// SSH port (default: 22)
56 | pub port: u16,
57 | /// SSH key path or content
58 | pub key_path: Option,
59 | }
60 |
/// Test results from GPU instance
///
/// Returned by the API after a remote test execution completes.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct GpuTestResults {
    /// Instance ID where tests were run
    pub instance_id: String,
    /// Test execution status
    pub status: String,
    /// Test output/logs
    pub output: String,
    /// Test duration in seconds
    pub duration_seconds: u64,
    /// Number of tests passed
    pub tests_passed: u32,
    /// Number of tests failed
    pub tests_failed: u32,
    /// Number of tests skipped
    pub tests_skipped: u32,
}
79 |
80 | impl HotAisleClient {
81 | /// Create a new Hot Aisle client
82 | pub fn new(api_key: String, base_url: Option) -> Self {
83 | let base_url = base_url.unwrap_or_else(|| "https://admin.hotaisle.app/api".to_string());
84 |
85 | Self {
86 | api_key,
87 | base_url,
88 | client: reqwest::Client::new(),
89 | }
90 | }
91 |
92 | /// Provision a new GPU instance
93 | pub async fn provision_gpu_instance(&self, config: GpuInstanceConfig) -> Result {
94 | let url = format!("{}/instances", self.base_url);
95 |
96 | let response = self
97 | .client
98 | .post(&url)
99 | .header("Authorization", format!("Bearer {}", self.api_key))
100 | .header("Content-Type", "application/json")
101 | .json(&config)
102 | .send()
103 | .await?;
104 |
105 | let status = response.status();
106 | if !status.is_success() {
107 | let error_text = response.text().await?;
108 | return Err(anyhow::anyhow!(
109 | "Failed to provision GPU instance: {} - {}",
110 | status,
111 | error_text
112 | ));
113 | }
114 |
115 | let instance: GpuInstance = response.json().await?;
116 | Ok(instance)
117 | }
118 |
119 | /// Wait for instance to be ready
120 | pub async fn wait_for_instance_ready(
121 | &self,
122 | instance_id: &str,
123 | timeout_minutes: u32,
124 | ) -> Result {
125 | let timeout = Duration::from_secs(timeout_minutes as u64 * 60);
126 | let start = std::time::Instant::now();
127 |
128 | while start.elapsed() < timeout {
129 | let instance = self.get_instance(instance_id).await?;
130 |
131 | match instance.status.as_str() {
132 | "ready" | "running" => return Ok(instance),
133 | "failed" | "error" => {
134 | return Err(anyhow::anyhow!("Instance {} failed to start", instance_id));
135 | }
136 | _ => {
137 | // Still provisioning, wait and retry
138 | sleep(Duration::from_secs(10)).await;
139 | }
140 | }
141 | }
142 |
143 | Err(anyhow::anyhow!(
144 | "Instance {} did not become ready within {} minutes",
145 | instance_id,
146 | timeout_minutes
147 | ))
148 | }
149 |
150 | /// Get instance information
151 | pub async fn get_instance(&self, instance_id: &str) -> Result {
152 | let url = format!("{}/instances/{}", self.base_url, instance_id);
153 |
154 | let response = self
155 | .client
156 | .get(&url)
157 | .header("Authorization", format!("Bearer {}", self.api_key))
158 | .send()
159 | .await?;
160 |
161 | let status = response.status();
162 | if !status.is_success() {
163 | let error_text = response.text().await?;
164 | return Err(anyhow::anyhow!(
165 | "Failed to get instance {}: {} - {}",
166 | instance_id,
167 | status,
168 | error_text
169 | ));
170 | }
171 |
172 | let instance: GpuInstance = response.json().await?;
173 | Ok(instance)
174 | }
175 |
176 | /// Execute GPU tests on an instance
177 | pub async fn run_gpu_tests(
178 | &self,
179 | instance: &GpuInstance,
180 | test_config: &GpuTestConfig,
181 | ) -> Result {
182 | let url = format!("{}/instances/{}/execute", self.base_url, instance.id);
183 |
184 | let response = self
185 | .client
186 | .post(&url)
187 | .header("Authorization", format!("Bearer {}", self.api_key))
188 | .header("Content-Type", "application/json")
189 | .json(test_config)
190 | .send()
191 | .await?;
192 |
193 | let status = response.status();
194 | if !status.is_success() {
195 | let error_text = response.text().await?;
196 | return Err(anyhow::anyhow!(
197 | "Failed to run tests on instance {}: {} - {}",
198 | instance.id,
199 | status,
200 | error_text
201 | ));
202 | }
203 |
204 | let results: GpuTestResults = response.json().await?;
205 | Ok(results)
206 | }
207 |
208 | /// Terminate an instance
209 | pub async fn terminate_instance(&self, instance_id: &str) -> Result<()> {
210 | let url = format!("{}/instances/{}", self.base_url, instance_id);
211 |
212 | let response = self
213 | .client
214 | .delete(&url)
215 | .header("Authorization", format!("Bearer {}", self.api_key))
216 | .send()
217 | .await?;
218 |
219 | let status = response.status();
220 | if !status.is_success() {
221 | let error_text = response.text().await?;
222 | return Err(anyhow::anyhow!(
223 | "Failed to terminate instance {}: {} - {}",
224 | instance_id,
225 | status,
226 | error_text
227 | ));
228 | }
229 |
230 | Ok(())
231 | }
232 |
233 | /// List available GPU types
234 | pub async fn list_available_gpu_types(&self) -> Result> {
235 | let url = format!("{}/gpu-types", self.base_url);
236 |
237 | let response = self
238 | .client
239 | .get(&url)
240 | .header("Authorization", format!("Bearer {}", self.api_key))
241 | .send()
242 | .await?;
243 |
244 | let status = response.status();
245 | if !status.is_success() {
246 | let error_text = response.text().await?;
247 | return Err(anyhow::anyhow!(
248 | "Failed to list GPU types: {} - {}",
249 | status,
250 | error_text
251 | ));
252 | }
253 |
254 | let gpu_types: Vec = response.json().await?;
255 | Ok(gpu_types)
256 | }
257 | }
258 |
259 | /// GPU test configuration
260 | #[derive(Debug, Clone, Serialize, Deserialize)]
261 | pub struct GpuTestConfig {
262 | /// Test command to execute
263 | pub test_command: String,
264 | /// Test timeout in minutes
265 | pub timeout_minutes: u32,
266 | /// Environment variables
267 | pub env_vars: Option>,
268 | /// Working directory
269 | pub working_dir: Option,
270 | }
271 |
#[cfg(test)]
mod tests {
    use super::*;

    // Verifies that the default production endpoint is used when no
    // base-URL override is supplied.
    #[tokio::test]
    async fn test_hotaisle_client_creation() {
        let client = HotAisleClient::new("test-key".to_string(), None);
        assert_eq!(client.base_url, "https://admin.hotaisle.app/api");
    }

    // Smoke-tests constructing a request config with all optional fields
    // populated.
    #[tokio::test]
    async fn test_gpu_instance_config() {
        let config = GpuInstanceConfig {
            gpu_type: "nvidia".to_string(),
            duration_minutes: 30,
            instance_type: Some("g4dn.xlarge".to_string()),
            labels: Some(vec!["ci-test".to_string(), "gpu-kill".to_string()]),
        };

        assert_eq!(config.gpu_type, "nvidia");
        assert_eq!(config.duration_minutes, 30);
    }
}
295 |
--------------------------------------------------------------------------------
/scripts/run-gpu-tests.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # GPU Kill - Hot Aisle GPU Testing Script
4 | # This script runs comprehensive GPU tests on Hot Aisle provisioned instances
5 |
set -euo pipefail

# Configuration
# SCRIPT_DIR/PROJECT_ROOT are resolved from this script's own location so
# the script works regardless of the caller's working directory.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_ROOT="$(dirname "$SCRIPT_DIR")"
LOG_FILE="/tmp/gpu-kill-tests.log"

# Colors for output
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m' # No Color
19 |
# Logging functions
# Each prints a colour-coded, level-tagged line to stdout and appends the
# same line (including ANSI codes) to $LOG_FILE via tee.
log_info() {
    echo -e "${BLUE}[INFO]${NC} $1" | tee -a "$LOG_FILE"
}

log_success() {
    echo -e "${GREEN}[SUCCESS]${NC} $1" | tee -a "$LOG_FILE"
}

log_warning() {
    echo -e "${YELLOW}[WARNING]${NC} $1" | tee -a "$LOG_FILE"
}

log_error() {
    echo -e "${RED}[ERROR]${NC} $1" | tee -a "$LOG_FILE"
}
36 |
# Function to check prerequisites
# Verifies we are in the project root and that cargo and git are on PATH;
# exits with status 1 otherwise.
check_prerequisites() {
    log_info "Checking prerequisites..."

    # Check if we're in the right directory
    if [[ ! -f "$PROJECT_ROOT/Cargo.toml" ]]; then
        log_error "Not in GPU Kill project root. Please run from project directory."
        exit 1
    fi

    # Check if cargo is available
    if ! command -v cargo &> /dev/null; then
        log_error "Cargo not found. Please install Rust toolchain."
        exit 1
    fi

    # Check if git is available
    if ! command -v git &> /dev/null; then
        log_error "Git not found. Please install git."
        exit 1
    fi

    log_success "Prerequisites check passed"
}
61 |
# Function to build GPU Kill
# Builds the release binary from the project root; exits 1 on failure.
build_gpukill() {
    log_info "Building GPU Kill..."

    # Note: this changes the working directory for the rest of the script;
    # later functions rely on ./target/release paths.
    cd "$PROJECT_ROOT"

    # Build in release mode for better performance
    if cargo build --release; then
        log_success "GPU Kill built successfully"
    else
        log_error "Failed to build GPU Kill"
        exit 1
    fi
}
76 |
# Function to run basic GPU detection tests
# $1: GPU type (nvidia|amd|intel|apple-silicon). Dispatches vendor-specific
# checks and echoes "name:status:detail" result records at the end.
run_gpu_detection_tests() {
    log_info "Running GPU detection tests..."

    local gpu_type="$1"
    local test_results=()
    local gpu_count json_valid

    # Test 1: List GPUs
    log_info "Testing GPU enumeration..."
    if ./target/release/gpukill --list > /tmp/gpu-list.txt 2>&1; then
        # grep -c prints "0" itself (and exits 1) when nothing matches, so
        # the previous `|| echo "0"` produced a two-line "0\n0" value;
        # ignore the exit status instead and default only when empty.
        gpu_count=$(grep -c "GPU [0-9]" /tmp/gpu-list.txt || true)
        gpu_count=${gpu_count:-0}
        log_success "Found $gpu_count GPU(s)"
        test_results+=("gpu_enumeration:passed:$gpu_count")
    else
        log_error "GPU enumeration failed"
        test_results+=("gpu_enumeration:failed:0")
    fi

    # Test 2: GPU information
    # NOTE(review): this runs the same table-producing `--list` as Test 1
    # but then validates the file as JSON; it likely needs a JSON output
    # flag — confirm against the gpukill CLI before changing the command.
    log_info "Testing GPU information retrieval..."
    if ./target/release/gpukill --list > /tmp/gpu-info.json 2>&1; then
        json_valid=$(python3 -m json.tool /tmp/gpu-info.json > /dev/null 2>&1 && echo "true" || echo "false")
        if [[ "$json_valid" == "true" ]]; then
            log_success "GPU information JSON is valid"
            test_results+=("gpu_info_json:passed:valid")
        else
            log_warning "GPU information JSON is invalid"
            test_results+=("gpu_info_json:failed:invalid")
        fi
    else
        log_error "GPU information retrieval failed"
        test_results+=("gpu_info_json:failed:error")
    fi

    # Test 3: GPU-specific tests based on type
    case "$gpu_type" in
        "nvidia")
            run_nvidia_specific_tests
            ;;
        "amd")
            run_amd_specific_tests
            ;;
        "intel")
            run_intel_specific_tests
            ;;
        "apple-silicon")
            run_apple_specific_tests
            ;;
        *)
            log_warning "Unknown GPU type: $gpu_type"
            ;;
    esac

    # Output test results
    echo "=== GPU Detection Test Results ==="
    for result in "${test_results[@]}"; do
        echo "$result"
    done
}
136 |
# Function to run NVIDIA-specific tests
# Checks that vendor tooling is present and prints basic per-GPU stats.
run_nvidia_specific_tests() {
    log_info "Running NVIDIA-specific tests..."

    # Test nvidia-smi availability
    if command -v nvidia-smi &> /dev/null; then
        log_success "nvidia-smi is available"
        # NOTE(review): `set -e` is active, so a failing nvidia-smi query
        # aborts the entire script — confirm that is intended.
        nvidia-smi --query-gpu=name,memory.total,memory.used --format=csv,noheader,nounits
    else
        log_warning "nvidia-smi not found"
    fi
}
149 |
# Function to run AMD-specific tests
# Probes both the legacy (rocm-smi) and newer (amd-smi) tooling.
run_amd_specific_tests() {
    log_info "Running AMD-specific tests..."

    # Test rocm-smi availability
    if command -v rocm-smi &> /dev/null; then
        log_success "rocm-smi is available"
        # NOTE(review): under `set -e`, any failing rocm-smi subcommand
        # aborts the whole script — confirm that is intended.
        rocm-smi --showproductname
        rocm-smi --showuse
        rocm-smi --showtemp
        rocm-smi --showpower
        rocm-smi --showmemuse
    else
        log_warning "rocm-smi not found"
    fi

    # Test amd-smi availability (newer tool)
    if command -v amd-smi &> /dev/null; then
        log_success "amd-smi is available"
        amd-smi
    else
        log_warning "amd-smi not found"
    fi
}
174 |
# Function to run Intel-specific tests
# Samples intel_gpu_top briefly; `|| true` tolerates the timeout kill.
run_intel_specific_tests() {
    log_info "Running Intel-specific tests..."

    # Test intel_gpu_top availability
    if command -v intel_gpu_top &> /dev/null; then
        log_success "intel_gpu_top is available"
        # 5-second cap; timeout's non-zero exit is ignored so set -e
        # does not abort the script here.
        timeout 5 intel_gpu_top -l 1 || true
    else
        log_warning "intel_gpu_top not found"
    fi
}
187 |
# Function to run Apple Silicon-specific tests
# Reads GPU info via system_profiler; grep failure is tolerated.
run_apple_specific_tests() {
    log_info "Running Apple Silicon-specific tests..."

    # Test system_profiler for GPU info
    if command -v system_profiler &> /dev/null; then
        log_success "system_profiler is available"
        system_profiler SPDisplaysDataType | grep -A 5 "Chipset Model" || true
    else
        log_warning "system_profiler not found"
    fi
}
200 |
# Function to run performance tests
# $1: GPU type. NOTE(review): gpu_type is assigned but never used in this
# function body — confirm whether it was meant to select a test subset.
run_performance_tests() {
    log_info "Running GPU performance tests..."

    local gpu_type="$1"
    local start_time=$(date +%s)

    # Run GPU hardware tests
    if cargo test --test gpu_hardware_tests --release; then
        local end_time=$(date +%s)
        local duration=$((end_time - start_time))
        log_success "GPU performance tests completed in ${duration}s"
    else
        log_warning "Some GPU performance tests failed or were skipped"
    fi
}
217 |
# Function to run stress tests
# Repeats GPU enumeration to catch flaky detection; returns 1 on the
# first failing iteration.
run_stress_tests() {
    log_info "Running GPU stress tests..."

    # Run multiple iterations of GPU detection
    for i in {1..5}; do
        log_info "Stress test iteration $i/5..."
        if ./target/release/gpukill --list > /dev/null 2>&1; then
            log_success "Iteration $i passed"
        else
            log_error "Iteration $i failed"
            return 1
        fi
        sleep 1
    done

    log_success "All stress test iterations passed"
}
236 |
# Function to generate test report
# $1: GPU type. Writes a timestamped report to /tmp (reusing the artifacts
# from earlier test phases) and echoes it to stdout.
generate_test_report() {
    log_info "Generating test report..."

    local gpu_type="$1"
    local report_file="/tmp/gpu-kill-test-report-$(date +%Y%m%d-%H%M%S).txt"

    {
        echo "=== GPU Kill Test Report ==="
        echo "Date: $(date)"
        echo "GPU Type: $gpu_type"
        echo "Hostname: $(hostname)"
        echo "OS: $(uname -a)"
        echo ""
        echo "=== GPU Detection Results ==="
        cat /tmp/gpu-list.txt 2>/dev/null || echo "No GPU list available"
        echo ""
        echo "=== GPU Information (JSON) ==="
        cat /tmp/gpu-info.json 2>/dev/null || echo "No GPU info available"
        echo ""
        echo "=== System Information ==="
        # CPU info (cross-platform)
        if command -v lscpu &> /dev/null; then
            echo "CPU: $(lscpu | grep "Model name" | cut -d: -f2 | xargs || echo "Unknown")"
        elif command -v sysctl &> /dev/null; then
            echo "CPU: $(sysctl -n machdep.cpu.brand_string 2>/dev/null || echo "Unknown")"
        else
            echo "CPU: Unknown"
        fi

        # Memory info (cross-platform)
        if command -v free &> /dev/null; then
            echo "Memory: $(free -h | grep "Mem:" | awk '{print $2}' || echo "Unknown")"
        elif command -v vm_stat &> /dev/null; then
            # NOTE(review): this branch tests for vm_stat but then calls
            # system_profiler — confirm which macOS tool is intended.
            echo "Memory: $(system_profiler SPHardwareDataType | grep "Memory:" | awk '{print $2, $3}' || echo "Unknown")"
        else
            echo "Memory: Unknown"
        fi

        # GPU drivers (cross-platform)
        echo "GPU Drivers:"
        if command -v lsmod &> /dev/null; then
            lsmod | grep -E "(nvidia|amdgpu|i915)" || echo "No GPU drivers found"
        elif command -v kextstat &> /dev/null; then
            kextstat | grep -E "(nvidia|amd|intel)" || echo "No GPU drivers found"
        else
            echo "No GPU drivers found"
        fi
    } > "$report_file"

    log_success "Test report generated: $report_file"
    cat "$report_file"
}
290 |
# Main function
# $1: GPU type under test (defaults to "unknown").
main() {
    local gpu_type="${1:-unknown}"

    log_info "Starting GPU Kill tests on Hot Aisle instance"
    log_info "GPU Type: $gpu_type"
    log_info "Project Root: $PROJECT_ROOT"

    # Initialize log file
    echo "=== GPU Kill Test Log - $(date) ===" > "$LOG_FILE"

    # Run test suite
    check_prerequisites
    build_gpukill
    run_gpu_detection_tests "$gpu_type"
    run_performance_tests "$gpu_type"
    run_stress_tests
    generate_test_report "$gpu_type"

    log_success "All GPU Kill tests completed successfully!"
}
312 |
313 | # Run main function with all arguments
314 | main "$@"
315 |
--------------------------------------------------------------------------------
/src/render.rs:
--------------------------------------------------------------------------------
1 | use crate::args::OutputFormat;
2 | use crate::nvml_api::Snapshot;
3 | use crate::util::{format_memory_mb_to_gib, truncate_string};
4 | // serde_json is used via serde_json::to_string_pretty
5 | use std::io::{self, Write};
6 | use tabled::{
7 | settings::{object::Rows, style::Style, Alignment, Modify, Padding, Width},
8 | Table, Tabled,
9 | };
10 |
/// Render GPU information to various output formats
#[derive(Clone)]
pub struct Renderer {
    // Chosen at construction; selects table vs. JSON rendering.
    output_format: OutputFormat,
}
16 |
17 | #[allow(dead_code)]
18 | impl Renderer {
19 | /// Create a new renderer
20 | pub fn new(output_format: OutputFormat) -> Self {
21 | Self { output_format }
22 | }
23 |
24 | /// Render a complete snapshot
25 | pub fn render_snapshot(
26 | &self,
27 | snapshot: &Snapshot,
28 | details: bool,
29 | ) -> Result<(), Box> {
30 | match self.output_format {
31 | OutputFormat::Table => self.render_table(snapshot, details),
32 | OutputFormat::Json => self.render_json(snapshot),
33 | }
34 | }
35 |
36 | /// Render as a table
37 | fn render_table(
38 | &self,
39 | snapshot: &Snapshot,
40 | details: bool,
41 | ) -> Result<(), Box> {
42 | if details {
43 | self.render_detailed_table(snapshot)
44 | } else {
45 | self.render_summary_table(snapshot)
46 | }
47 | }
48 |
49 | /// Render summary table (one row per GPU)
50 | fn render_summary_table(&self, snapshot: &Snapshot) -> Result<(), Box> {
51 | let mut table_data = Vec::new();
52 |
53 | for gpu in &snapshot.gpus {
54 | let mem_used_gib = format_memory_mb_to_gib(gpu.mem_used_mb);
55 | let mem_total_gib = format_memory_mb_to_gib(gpu.mem_total_mb);
56 | let mem_usage = format!("{}/{} GiB", mem_used_gib, mem_total_gib);
57 |
58 | let top_proc_info = if let Some(ref top_proc) = gpu.top_proc {
59 | format!(
60 | "{}:{}:{}MB",
61 | truncate_string(&top_proc.proc_name, 15),
62 | top_proc.pid,
63 | top_proc.used_mem_mb
64 | )
65 | } else {
66 | "-".to_string()
67 | };
68 |
69 | let ecc_info = gpu
70 | .ecc_volatile
71 | .map(|e| e.to_string())
72 | .unwrap_or_else(|| "-".to_string());
73 |
74 | table_data.push(SummaryRow {
75 | gpu: gpu.gpu_index.to_string(),
76 | name: truncate_string(&gpu.name, 20),
77 | memory: mem_usage,
78 | utilization: format!("{:.1}%", gpu.util_pct),
79 | temperature: format!("{}°C", gpu.temp_c),
80 | power: format!("{:.1}W", gpu.power_w),
81 | ecc_volatile: ecc_info,
82 | pids: gpu.pids.to_string(),
83 | top_process: top_proc_info,
84 | });
85 | }
86 |
87 | let table = Table::new(&table_data)
88 | .with(Style::modern())
89 | .with(Modify::new(Rows::new(1..)).with(Alignment::left()))
90 | .with(Modify::new(Rows::new(1..)).with(Padding::new(1, 1, 0, 0)))
91 | .with(Width::wrap(120))
92 | .to_string();
93 |
94 | println!("{}", table);
95 | Ok(())
96 | }
97 |
98 | /// Render detailed table (one row per process)
99 | fn render_detailed_table(&self, snapshot: &Snapshot) -> Result<(), Box> {
100 | // First render summary
101 | self.render_summary_table(snapshot)?;
102 | println!();
103 |
104 | // Then render process details
105 | if !snapshot.procs.is_empty() {
106 | let mut table_data = Vec::new();
107 |
108 | for proc in &snapshot.procs {
109 | let container_info = proc
110 | .container
111 | .as_ref()
112 | .map(|c| truncate_string(c, 15))
113 | .unwrap_or_else(|| "-".to_string());
114 |
115 | table_data.push(ProcessRow {
116 | gpu: proc.gpu_index.to_string(),
117 | pid: proc.pid.to_string(),
118 | user: truncate_string(&proc.user, 12),
119 | process: truncate_string(&proc.proc_name, 20),
120 | vram_mb: format!("{}MB", proc.used_mem_mb),
121 | start_time: truncate_string(&proc.start_time, 10),
122 | container: container_info,
123 | });
124 | }
125 |
126 | let table = Table::new(&table_data)
127 | .with(Style::modern())
128 | .with(Modify::new(Rows::new(1..)).with(Alignment::left()))
129 | .with(Modify::new(Rows::new(1..)).with(Padding::new(1, 1, 0, 0)))
130 | .with(Width::wrap(120))
131 | .to_string();
132 |
133 | println!("Process Details:");
134 | println!("{}", table);
135 | }
136 |
137 | Ok(())
138 | }
139 |
140 | /// Render as JSON
141 | fn render_json(&self, snapshot: &Snapshot) -> Result<(), Box> {
142 | let json = serde_json::to_string_pretty(snapshot)?;
143 | println!("{}", json);
144 | Ok(())
145 | }
146 |
147 | /// Render JSON snapshot for watch mode (newline-delimited)
148 | pub fn render_json_snapshot(
149 | &self,
150 | snapshot: &Snapshot,
151 | ) -> Result<(), Box> {
152 | let json = serde_json::to_string(snapshot)?;
153 | println!("{}", json);
154 | io::stdout().flush()?;
155 | Ok(())
156 | }
157 |
158 | /// Clear screen for watch mode
159 | pub fn clear_screen(&self) {
160 | print!("\x1B[2J\x1B[1;1H");
161 | io::stdout().flush().unwrap_or_default();
162 | }
163 |
164 | /// Get output format
165 | pub fn get_output_format(&self) -> OutputFormat {
166 | self.output_format.clone()
167 | }
168 | }
169 |
/// Summary table row structure
///
/// One row per GPU in the summary view; all cells are pre-formatted
/// strings so the table layer only handles layout.
#[derive(Tabled)]
struct SummaryRow {
    #[tabled(rename = "GPU")]
    gpu: String,
    #[tabled(rename = "NAME")]
    name: String,
    #[tabled(rename = "MEM_USED/TOTAL")]
    memory: String,
    #[tabled(rename = "UTIL(%)")]
    utilization: String,
    #[tabled(rename = "TEMP(°C)")]
    temperature: String,
    #[tabled(rename = "POWER(W)")]
    power: String,
    #[tabled(rename = "ECC(volatile)")]
    ecc_volatile: String,
    #[tabled(rename = "PIDS")]
    pids: String,
    #[tabled(rename = "TOP_PROC")]
    top_process: String,
}
192 |
/// Process table row structure
///
/// One row per GPU process in the detailed view; cells are pre-formatted
/// strings (already truncated/suffixed by the renderer).
#[derive(Tabled)]
struct ProcessRow {
    #[tabled(rename = "GPU")]
    gpu: String,
    #[tabled(rename = "PID")]
    pid: String,
    #[tabled(rename = "USER")]
    user: String,
    #[tabled(rename = "PROC")]
    process: String,
    #[tabled(rename = "VRAM_MB")]
    vram_mb: String,
    #[tabled(rename = "START_TIME")]
    start_time: String,
    #[tabled(rename = "CONTAINER?")]
    container: String,
}
211 |
/// Render error messages
///
/// Writes a line of the form `Error: <message>` to stderr.
pub fn render_error(message: &str) {
    let line = format!("Error: {}", message);
    eprintln!("{}", line);
}
216 |
/// Render warning messages
///
/// Writes a line of the form `Warning: <message>` to stderr.
pub fn render_warning(message: &str) {
    let line = format!("Warning: {}", message);
    eprintln!("{}", line);
}
221 |
/// Render info messages
///
/// Writes a line of the form `Info: <message>` to stdout.
pub fn render_info(message: &str) {
    let line = format!("Info: {}", message);
    println!("{}", line);
}
226 |
/// Render success messages
///
/// Writes a line of the form `Success: <message>` to stdout.
pub fn render_success(message: &str) {
    let line = format!("Success: {}", message);
    println!("{}", line);
}
231 |
#[cfg(test)]
mod tests {
    use super::*;
    use crate::nvml_api::{GpuProc, GpuSnapshot, Snapshot};

    // Builds a fixed one-GPU, one-process snapshot used by all the
    // rendering tests below.
    fn create_test_snapshot() -> Snapshot {
        Snapshot {
            host: "test-host".to_string(),
            ts: "2024-01-01T00:00:00Z".to_string(),
            gpus: vec![GpuSnapshot {
                gpu_index: 0,
                name: "Test GPU".to_string(),
                vendor: crate::vendor::GpuVendor::Unknown,
                mem_used_mb: 2048,
                mem_total_mb: 8192,
                util_pct: 50.0,
                temp_c: 75,
                power_w: 150.0,
                ecc_volatile: Some(0),
                pids: 2,
                top_proc: Some(GpuProc {
                    gpu_index: 0,
                    pid: 12345,
                    user: "testuser".to_string(),
                    proc_name: "test_process".to_string(),
                    used_mem_mb: 1024,
                    start_time: "1h 30m".to_string(),
                    container: None,
                }),
            }],
            procs: vec![GpuProc {
                gpu_index: 0,
                pid: 12345,
                user: "testuser".to_string(),
                proc_name: "test_process".to_string(),
                used_mem_mb: 1024,
                start_time: "1h 30m".to_string(),
                container: None,
            }],
        }
    }

    // Construction stores the requested output format.
    #[test]
    fn test_renderer_creation() {
        let renderer = Renderer::new(OutputFormat::Table);
        assert!(matches!(renderer.output_format, OutputFormat::Table));
    }

    // JSON rendering succeeds on a well-formed snapshot.
    #[test]
    fn test_json_rendering() {
        let renderer = Renderer::new(OutputFormat::Json);
        let snapshot = create_test_snapshot();

        // This should not panic
        let result = renderer.render_json(&snapshot);
        assert!(result.is_ok());
    }

    // Summary-only table rendering succeeds.
    #[test]
    fn test_table_rendering() {
        let renderer = Renderer::new(OutputFormat::Table);
        let snapshot = create_test_snapshot();

        // This should not panic
        let result = renderer.render_table(&snapshot, false);
        assert!(result.is_ok());
    }

    // Detailed (summary + process) table rendering succeeds.
    #[test]
    fn test_detailed_table_rendering() {
        let renderer = Renderer::new(OutputFormat::Table);
        let snapshot = create_test_snapshot();

        // This should not panic
        let result = renderer.render_table(&snapshot, true);
        assert!(result.is_ok());
    }
}
310 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # GPU Kill
2 |
3 | A CLI tool for managing GPUs across NVIDIA, AMD, Intel, and Apple Silicon systems. Monitor, control, and secure your GPU infrastructure with ease.
4 |
5 | ## Community & Support
6 |
7 | Join our Discord community for discussions, support, and updates:
8 |
9 | [](https://discord.gg/KqdBcqRk5E)
10 |
11 |
12 | ## Features
13 |
14 | - **Monitor GPUs**: Real-time usage, memory, temperature, and processes
15 | - **Kill Processes**: Gracefully terminate stuck GPU processes
16 | - **Security**: Detect crypto miners and suspicious activity
17 | - **Guard Mode**: Policy enforcement to prevent resource abuse
18 | - **Remote**: Manage GPUs across multiple servers
19 | - **Multi-Vendor**: Works with NVIDIA, AMD, Intel, and Apple Silicon
20 | - **AI Integration**: MCP server for AI assistant integration
21 |
22 | ## Requirements
23 |
24 | ### Build Performance
25 |
26 | **For faster development builds:**
27 | ```bash
28 | # Fast release build (recommended for development)
29 | cargo build --profile release-fast
30 |
31 | # Standard release build (optimized for production)
32 | cargo build --release
33 |
34 | # Maximum optimization (slowest, best performance)
35 | cargo build --profile release-max
36 | ```
37 |
38 | **Build times on typical hardware:**
39 | - Debug build: ~3 seconds
40 | - Release-fast: ~28 seconds
41 | - Release: ~28 seconds (improved from 76 seconds)
42 | - Release-max: ~60+ seconds (maximum optimization)
43 |
44 | ### System Dependencies
45 |
46 | **Linux (Ubuntu/Debian):**
47 | ```bash
48 | sudo apt install build-essential libssl-dev pkg-config
49 | ```
50 |
51 | **Linux (Fedora/RHEL/CentOS):**
52 | ```bash
53 | sudo dnf install gcc gcc-c++ pkg-config openssl-devel
54 | # or for older systems:
55 | # sudo yum install gcc gcc-c++ pkg-config openssl-devel
56 | ```
57 |
58 | **macOS:**
59 | ```bash
60 | # Install Xcode command line tools
61 | xcode-select --install
62 | # OpenSSL is included with macOS
63 | ```
64 |
65 | **Windows:**
66 | - Install Visual Studio Build Tools
67 | - OpenSSL is handled automatically by vcpkg
68 |
69 | ### GPU Drivers
70 |
71 | - **NVIDIA**: NVIDIA drivers installed
72 | - **AMD**: ROCm drivers installed
73 | - **Intel**: intel-gpu-tools package installed
74 | - **Apple Silicon**: macOS with Apple Silicon (M1/M2/M3/M4)
75 |
76 | ### Build Requirements
77 |
78 | - **OS**: Linux, macOS, or Windows
79 | - **Rust**: 1.70+ (for building from source)
80 |
81 | ## Quick Start
82 |
83 | ### Install & Run
84 | ```bash
85 | # Build from source (first build may take 2-3 minutes)
86 | git clone https://github.com/treadiehq/gpu-kill.git
87 | cd gpu-kill
88 | cargo build --release
89 |
90 | # Or install via Cargo
91 | cargo install gpukill
92 |
93 | # Or one-liner installers (recommended)
94 | # macOS/Linux
95 | curl -fsSL https://raw.githubusercontent.com/treadiehq/gpu-kill/refs/heads/main/scripts/install.sh | sh
96 | # Windows (PowerShell)
97 | irm https://raw.githubusercontent.com/treadiehq/gpu-kill/refs/heads/main/scripts/install.ps1 | iex
98 |
99 | # List your GPUs
100 | gpukill --list
101 |
102 | # Watch GPU usage in real-time
103 | gpukill --list --watch
104 | ```
105 |
106 | ### Dead-simple cheatsheet
107 | ```bash
108 | # Live watch (alias)
109 | gpukill watch # = gpukill --list --watch
110 |
111 | # Kill job by PID (positional alias)
112 | gpukill 12345 # = gpukill --kill --pid 12345
113 |
114 | # Free a specific GPU index (kill all jobs on GPU 0)
115 | gpukill --kill --gpu 0 # add --batch to actually kill; preview without it
116 |
117 | # Force reset a GPU (shorthand)
118 | gpukill --reset 0 # = gpukill --reset --gpu 0
119 |
120 | # Safe mode: dry-run first (no changes)
121 | gpukill 12345 --safe # alias: --dry-run
122 | ```
123 |
124 | ## Dashboard (Local Development)
125 |
126 | The GPU Kill dashboard provides a modern web interface for GPU cluster monitoring. The dashboard is included in the repository for local development but is **not required** for core GPU Kill functionality.
127 |
128 | 
129 |
130 | ### Quick Start
131 |
132 | ```bash
133 | # 1. Start the backend API server
134 | gpukill --server --server-port 8080
135 |
136 | # 2. In a new terminal, start the dashboard UI
137 | cd dashboard
138 | npm install # First time only
139 | npm run dev
140 |
141 | # 3. Access the dashboard
142 | open http://localhost:3000
143 | ```
144 |
145 | **Requirements:**
146 | - Node.js 18+ and npm
147 | - GPU Kill backend server running (provides the API)
148 |
149 | **Note**: You need both the backend server (port 8080) and frontend UI (port 3000) running for the dashboard to work.
150 |
151 | ### Dashboard Features
152 |
153 | - **Real-time monitoring** of all GPUs across your cluster
154 | - **Security detection** with threat analysis and risk scoring
155 | - **Policy management** for resource control and enforcement
156 | - **Cluster overview** with Magic Moment contention insights
157 | - **Interactive controls** for process management and GPU operations
158 |
159 | ### Production Deployment
160 |
161 | For production GPU monitoring solutions, check the [Kill Suite](https://treadie.com) website.
162 |
163 | ## MCP Server
164 |
165 | GPU Kill includes an MCP server that enables AI assistants to interact with GPU management functionality:
166 |
167 | - **Resources**: Read GPU status, processes, audit data, policies, and security scans
168 | - **Tools**: Kill processes, reset GPUs, scan for threats, create policies
169 |
170 | ```bash
171 | # Start the MCP server
172 | cargo run --release -p gpukill-mcp
173 |
174 | # Server runs on http://localhost:3001/mcp
175 | ```
176 |
177 | ## Usage
178 |
179 | Ask your AI to use the tools.
180 |
181 | ```text
182 | What GPUs do I have and what's their current usage?
183 | ```
184 |
185 | ```text
186 | Kill the Python process that's stuck on GPU 0
187 | ```
188 |
189 | ```text
190 | Kill all training processes that are using too much GPU memory
191 | ```
192 |
193 | ```text
194 | Show me GPU usage and kill any stuck processes
195 | ```
196 |
197 | ```text
198 | Scan for crypto miners and suspicious activity
199 | ```
200 |
201 | ```text
202 | Create a policy to limit user memory usage to 8GB
203 | ```
204 |
205 | ```text
206 | Reset GPU 1 because it's not responding
207 | ```
208 |
209 | ```text
210 | What processes are currently using my GPUs?
211 | ```
212 |
213 | See [mcp/README.md](mcp/README.md) for detailed MCP server documentation.
214 |
215 |
216 | ## Security & Policies
217 |
218 | ### Detect Threats
219 | ```bash
220 | # Scan for crypto miners and suspicious activity
221 | gpukill --audit --rogue
222 |
223 | # Configure detection rules
224 | gpukill --audit --rogue-config
225 | ```
226 |
227 | ### Policy Enforcement
228 | ```bash
229 | # Enable Guard Mode
230 | gpukill --guard --guard-enable
231 |
232 | # Test policies safely
233 | gpukill --guard --guard-test-policies
234 | ```
235 |
236 | *For detailed security and policy documentation, see [DETAILED.md](DETAILED.md).*
237 |
238 | ## Remote Management
239 |
240 | Manage GPUs across multiple servers via SSH:
241 |
242 | ```bash
243 | # List GPUs on remote server
244 | gpukill --remote staging-server --list
245 |
246 | # Kill process on remote server
247 | gpukill --remote prod-gpu-01 --kill --pid 1234
248 |
249 | # Reset GPU on remote server
250 | gpukill --remote gpu-cluster --reset --gpu 0
251 | ```
252 |
253 | ## Troubleshooting
254 |
255 | ### Build Issues
256 |
257 | **OpenSSL not found:**
258 | ```bash
259 | # Ubuntu/Debian
260 | sudo apt install build-essential libssl-dev pkg-config
261 |
262 | # Fedora/RHEL/CentOS
263 | sudo dnf install gcc gcc-c++ pkg-config openssl-devel
264 | ```
265 |
266 | **Other common build issues:**
267 | - Ensure you have the latest Rust toolchain: `rustup update`
268 | - Clean and rebuild: `cargo clean && cargo build --release`
269 | - Check system dependencies are installed (see Requirements section)
270 |
271 | ## Need Help?
272 |
273 | ```bash
274 | gpukill --help # Show all options
275 | gpukill --version # Show version
276 | ```
277 |
278 | ## CI/CD and Testing
279 |
280 | GPU Kill uses a CI/CD pipeline with **automatic GPU testing**:
281 |
282 | - **✅ Conditional GPU testing** - Runs automatically when GPU hardware is available
283 | - **✅ Multi-vendor GPU testing** on real hardware (NVIDIA, AMD, Intel, Apple Silicon)
284 | - **✅ Hot Aisle integration** - Optional on-demand GPU instance provisioning for comprehensive testing
285 | - **✅ Cross-platform compatibility** testing
286 | - **✅ Performance benchmarking** and profiling
287 | - **✅ Security auditing** and compliance checks
288 | - **✅ Stress testing** for reliability validation
289 |
290 | ### How GPU Testing Works
291 |
292 | - **On GitHub hosted runners**: GPU tests skip gracefully (no GPU hardware)
293 | - **On self-hosted runners**: GPU tests run automatically when GPU hardware is detected
294 | - **On cloud instances**: GPU tests run automatically when GPU hardware is available
295 | - **On developer machines**: GPU tests run automatically when GPU hardware is detected
296 | - **Via Hot Aisle**: On-demand GPU instance provisioning for comprehensive testing
297 |
298 | ### Quick Setup
299 |
300 | **Option 1: Test Locally (Already Working)**
301 | ```bash
302 | cargo test --test gpu_hardware_tests # Runs on your GPU hardware
303 | ```
304 |
305 | **Option 2: Set Up Cloud GPU (5 minutes)**
306 | ```bash
307 | # On any cloud GPU instance:
308 | curl -sSL https://raw.githubusercontent.com/treadiehq/gpu-kill/main/scripts/setup-gpu-runner.sh | bash
309 | ```
310 |
311 | **Option 3: Self-Hosted Runner**
312 | See **[CI_CD.md](CI_CD.md)** for detailed information about our testing infrastructure and how to set up self-hosted runners with GPU hardware.
313 |
314 | **Option 4: Hot Aisle Integration (Optional)**
315 | ```bash
316 | # Build with Hot Aisle feature
317 | cargo build --release --features hotaisle
318 |
319 | # Integration tests run automatically (no API key required)
320 | # For actual GPU testing:
321 | # 1. Set up HOTAISLE_API_KEY in GitHub Secrets
322 | # 2. Manually trigger "Hot Aisle GPU Testing" workflow
323 | # 3. Tests run on real GPU hardware with automatic cleanup
324 | ```
325 |
326 | **Option 5: Cloud GPU Setup**
327 | See **[docs/CLOUD_GPU_SETUP.md](docs/CLOUD_GPU_SETUP.md)** for AWS, GCP, and Azure GPU instance setup.
328 |
329 | ## Documentation
330 |
331 | - **[DETAILED.md](DETAILED.md)** - Complete documentation, API reference, and advanced features
332 | - **[CI_CD.md](CI_CD.md)** - CI/CD pipeline and testing infrastructure
333 | - **[docs/HOTAISLE_INTEGRATION.md](docs/HOTAISLE_INTEGRATION.md)** - Hot Aisle integration guide
334 | - **[docs/CLOUD_GPU_SETUP.md](docs/CLOUD_GPU_SETUP.md)** - Cloud GPU setup guide (AWS, GCP, Azure)
335 |
336 | ## License
337 |
338 | This project is licensed under the FSL-1.1-MIT License. See the LICENSE file for details.
--------------------------------------------------------------------------------
/deny.toml:
--------------------------------------------------------------------------------
1 | # This template contains all of the possible sections and their default values
2 |
3 | # Note that all fields that take a lint level have these possible values:
4 | # * deny - An error will be produced and the check will fail
5 | # * warn - A warning will be produced, but the check will not fail
# * allow - No warning or error will be produced, though in some cases a note
# will be produced
8 |
9 | # The values provided in this template are the default values that will be used
10 | # when any section or field is not specified in your own configuration
11 |
12 | # Root options
13 |
14 | # The graph table configures how the dependency graph is constructed and thus
15 | # which crates the checks are performed against
16 | [graph]
17 | # If 1 or more target triples (and optionally, target_features) are specified,
18 | # only the specified targets will be checked when running `cargo deny check`.
19 | # This means, if a particular package is only ever used as a target specific
20 | # dependency, such as, for example, the `nix` crate only being used via the
21 | # `target_family = "unix"` configuration, that only having windows targets in
22 | # this list would mean the nix crate, as well as any of its exclusive
23 | # dependencies not shared by any other crates, would be ignored, as the target
24 | # list here is effectively saying which targets you are building for.
25 | targets = [
26 | # The triple can be any string, but only the target triples built in to
27 | # rustc (as of 1.40) can be checked against actual config expressions
28 | #"x86_64-unknown-linux-musl",
29 | # You can also specify which target_features you promise are enabled for a
30 | # particular target. target_features are currently not validated against
31 | # the actual valid features supported by the target architecture.
32 | #{ triple = "wasm32-unknown-unknown", features = ["atomics"] },
33 | ]
34 | # When creating the dependency graph used as the source of truth when checks are
35 | # executed, this field can be used to prune crates from the graph, removing them
36 | # from the view of cargo-deny. This is an extremely heavy hammer, as if a crate
37 | # is pruned from the graph, all of its dependencies will also be pruned unless
38 | # they are connected to another crate in the graph that hasn't been pruned,
39 | # so it should be used with care. The identifiers are [Package ID Specifications]
40 | # (https://doc.rust-lang.org/cargo/reference/pkgid-spec.html)
41 | #exclude = []
42 | # If true, metadata will be collected with `--all-features`. Note that this can't
43 | # be toggled off if true, if you want to conditionally enable `--all-features` it
44 | # is recommended to pass `--all-features` on the cmd line instead
45 | all-features = false
46 | # If true, metadata will be collected with `--no-default-features`. The same
47 | # caveat with `all-features` applies
48 | no-default-features = false
49 | # If set, these feature will be enabled when collecting metadata. If `--features`
50 | # is specified on the cmd line they will take precedence over this option.
51 | #features = []
52 |
53 | # The output table provides options for how/if diagnostics are outputted
54 | [output]
55 | # When outputting inclusion graphs in diagnostics that include features, this
56 | # option can be used to specify the depth at which feature edges will be added.
57 | # This option is included since the graphs can be quite large and the addition
58 | # of features from the crate(s) to all of the graph roots can be far too verbose.
59 | # This option can be overridden via `--feature-depth` on the cmd line
60 | feature-depth = 1
61 |
62 | # This section is considered when running `cargo deny check advisories`
63 | # More documentation for the advisories section can be found here:
64 | # https://embarkstudios.github.io/cargo-deny/checks/advisories/cfg.html
65 | [advisories]
66 | # The path where the advisory databases are cloned/fetched into
67 | #db-path = "$CARGO_HOME/advisory-dbs"
68 | # The url(s) of the advisory databases to use
69 | #db-urls = ["https://github.com/rustsec/advisory-db"]
70 | # A list of advisory IDs to ignore. Note that ignored advisories will still
71 | # output a note when they are encountered.
72 | ignore = [
73 | # Allow unmaintained crates as warnings (not errors)
74 | { id = "RUSTSEC-2020-0168", reason = "mach crate is unmaintained but still functional" },
75 | { id = "RUSTSEC-2024-0370", reason = "proc-macro-error is unmaintained but still functional" },
76 | ]
77 | # If this is true, then cargo deny will use the git executable to fetch advisory database.
78 | # If this is false, then it uses a built-in git library.
79 | # Setting this to true can be helpful if you have special authentication requirements that cargo-deny does not support.
80 | # See Git Authentication for more information about setting up git authentication.
81 | #git-fetch-with-cli = true
82 |
83 | # This section is considered when running `cargo deny check licenses`
84 | # More documentation for the licenses section can be found here:
85 | # https://embarkstudios.github.io/cargo-deny/checks/licenses/cfg.html
86 | [licenses]
87 | # List of explicitly allowed licenses
88 | # See https://spdx.org/licenses/ for list of possible licenses
89 | # [possible values: any SPDX 3.11 short identifier (+ optional exception)].
90 | allow = [
91 | "MIT",
92 | "Apache-2.0",
93 | "Apache-2.0 WITH LLVM-exception",
94 | "BSD-2-Clause",
95 | "BSD-3-Clause",
96 | "ISC",
97 | "Unlicense",
98 | "0BSD",
99 | "Zlib",
100 | "CC0-1.0",
101 | "MPL-2.0",
102 | "LGPL-2.1",
103 | "LGPL-3.0",
104 | "GPL-2.0",
105 | "GPL-3.0",
106 | "FSL-1.1-MIT",
107 | "Unicode-3.0",
108 | ]
109 | # The confidence threshold for detecting a license from license text.
110 | # The higher the value, the more closely the license text must be to the
111 | # canonical license text of a valid SPDX license file.
112 | # [possible values: any between 0.0 and 1.0].
113 | confidence-threshold = 0.8
114 | # Allow 1 or more licenses on a per-crate basis, so that particular licenses
115 | # aren't accepted for every possible crate as with the normal allow list
116 | exceptions = [
117 | # Each entry is the crate and version constraint, and its specific allow
118 | # list
119 | #{ allow = ["Zlib"], crate = "adler32" },
120 | ]
121 |
122 | # Some crates don't have (easily) machine readable licensing information,
123 | # adding a clarification entry for it allows you to manually specify the
124 | # licensing information
125 | #[[licenses.clarify]]
126 | # The package spec the clarification applies to
127 | #crate = "ring"
128 | # The SPDX expression for the license requirements of the crate
129 | #expression = "MIT AND ISC AND OpenSSL"
130 | # One or more files in the crate's source used as the "source of truth" for
131 | # the license expression. If the contents match, the clarification will be used
132 | # when running the license check, otherwise the clarification will be ignored
133 | # and the crate will be checked normally, which may produce warnings or errors
134 | # depending on the rest of your configuration
135 | #license-files = [
136 | # Each entry is a crate relative path, and the (opaque) hash of its contents
137 | #{ path = "LICENSE", hash = 0xbd0eed23 }
138 | #]
139 |
140 | [licenses.private]
141 | # If true, ignores workspace crates that aren't published, or are only
142 | # published to private registries.
143 | # To see how to mark a crate as unpublished (to the official registry),
144 | # visit https://doc.rust-lang.org/cargo/reference/manifest.html#the-publish-field.
145 | ignore = false
146 | # One or more private registries that you might publish crates to, if a crate
147 | # is only published to private registries, and ignore is true, the crate will
148 | # not have its license(s) checked
149 | registries = [
    #"https://sekretz.com/registry"
151 | ]
152 |
153 | # This section is considered when running `cargo deny check bans`.
154 | # More documentation about the 'bans' section can be found here:
155 | # https://embarkstudios.github.io/cargo-deny/checks/bans/cfg.html
156 | [bans]
157 | # Lint level for when multiple versions of the same crate are detected
158 | multiple-versions = "warn"
159 | # Lint level for when a crate version requirement is `*`
160 | wildcards = "allow"
161 | # The graph highlighting used when creating dotgraphs for crates
162 | # with multiple versions
163 | # * lowest-version - The path to the lowest versioned duplicate is highlighted
164 | # * simplest-path - The path to the version with the fewest edges is highlighted
165 | # * all - Both lowest-version and simplest-path are used
166 | highlight = "all"
167 | # The default lint level for `default` features for crates that are members of
168 | # the workspace that is being checked. This can be overridden by allowing/denying
169 | # `default` on a crate-by-crate basis if desired.
170 | workspace-default-features = "allow"
171 | # The default lint level for `default` features for external crates that are not
172 | # members of the workspace. This can be overridden by allowing/denying `default`
173 | # on a crate-by-crate basis if desired.
174 | external-default-features = "allow"
175 | # List of crates that are allowed. Use with care!
176 | allow = [
177 | #"ansi_term@0.11.0",
178 | #{ crate = "ansi_term@0.11.0", reason = "you can specify a reason it is allowed" },
179 | ]
180 | # List of crates to deny
181 | deny = [
182 | #"ansi_term@0.11.0",
183 | #{ crate = "ansi_term@0.11.0", reason = "you can specify a reason it is banned" },
184 | # Wrapper crates can optionally be specified to allow the crate when it
185 | # is a direct dependency of the otherwise banned crate
186 | #{ crate = "ansi_term@0.11.0", wrappers = ["this-crate-directly-depends-on-ansi_term"] },
187 | ]
188 |
189 | # List of features to allow/deny
190 | # Each entry the name of a crate and a version range. If version is
191 | # not specified, all versions will be matched.
192 | #[[bans.features]]
193 | #crate = "reqwest"
194 | # Features to not allow
195 | #deny = ["json"]
196 | # Features to allow
197 | #allow = [
198 | # "rustls",
199 | # "__rustls",
200 | # "__tls",
201 | # "hyper-rustls",
202 | # "rustls",
203 | # "rustls-pemfile",
204 | # "rustls-tls-webpki-roots",
205 | # "tokio-rustls",
206 | # "webpki-roots",
207 | #]
208 | # If true, the allowed features must exactly match the enabled feature set. If
209 | # this is set there is no point setting `deny`
210 | #exact = true
211 |
212 | # Certain crates/versions that will be skipped when doing duplicate detection.
213 | skip = [
214 | #"ansi_term@0.11.0",
215 | #{ crate = "ansi_term@0.11.0", reason = "you can specify a reason why it can't be updated/removed" },
216 | ]
217 | # Similarly to `skip` allows you to skip certain crates during duplicate
218 | # detection. Unlike skip, it also includes the entire tree of transitive
219 | # dependencies starting at the specified crate, up to a certain depth, which is
220 | # by default infinite.
221 | skip-tree = [
222 | #"ansi_term@0.11.0", # will be skipped along with _all_ of its direct and transitive dependencies
223 | #{ crate = "ansi_term@0.11.0", depth = 20 },
224 | ]
225 |
226 | # This section is considered when running `cargo deny check sources`.
227 | # More documentation about the 'sources' section can be found here:
228 | # https://embarkstudios.github.io/cargo-deny/checks/sources/cfg.html
229 | [sources]
230 | # Lint level for what to happen when a crate from a crate registry that is not
231 | # in the allow list is encountered
232 | unknown-registry = "warn"
233 | # Lint level for what to happen when a crate from a git repository that is not
234 | # in the allow list is encountered
235 | unknown-git = "warn"
236 | # List of URLs for allowed crate registries. Defaults to the crates.io index
237 | # if not specified. If it is specified but empty, no registries are allowed.
238 | allow-registry = ["https://github.com/rust-lang/crates.io-index"]
239 | # List of URLs for allowed Git repositories
240 | allow-git = []
241 |
242 | [sources.allow-org]
243 | # github.com organizations to allow git sources for
244 | github = []
245 | # gitlab.com organizations to allow git sources for
246 | gitlab = []
247 | # bitbucket.org organizations to allow git sources for
248 | bitbucket = []
249 |
--------------------------------------------------------------------------------
/src/proc.rs:
--------------------------------------------------------------------------------
1 | use crate::nvml_api::NvmlApi;
2 | use crate::util::parse_process_start_time;
3 | use anyhow::{Context, Result};
4 | #[cfg(unix)]
5 | use nix::sys::signal::{kill, Signal};
6 | #[cfg(unix)]
7 | use nix::unistd::Pid;
8 | // use std::process::Command; // Used conditionally below
9 | use std::time::{Duration, SystemTime};
10 | use sysinfo::{Pid as SysPid, System};
11 |
/// Process information for a running process
///
/// Populated by `ProcessManager::get_process_info` from the sysinfo
/// process table plus a platform-specific username lookup.
#[derive(Debug, Clone)]
pub struct ProcessInfo {
    // Some fields are only read in certain build configurations, hence
    // the per-field dead_code allowances.
    #[allow(dead_code)]
    pub pid: u32,
    // Owning username, or "unknown" when resolution fails.
    pub user: String,
    // Executable/process name as reported by sysinfo.
    pub name: String,
    #[allow(dead_code)]
    pub start_time: SystemTime,
    // Full command line, space-joined from the argv vector.
    #[allow(dead_code)]
    pub cmdline: String,
}
24 |
/// Process management utilities
///
/// Combines NVML-backed GPU process queries with sysinfo-based system
/// process lookups.
pub struct ProcessManager {
    nvml_api: NvmlApi,
    // sysinfo snapshot of the process table; refreshed via
    // refresh_processes() before lookups in some (not all) methods.
    system: System,
}
30 |
// NOTE(review): several generic parameters appear stripped in this listing
// (e.g. `Result` where `Result<ProcessInfo>`, `Result<bool>`, `Result<u32>`
// are expected) — restore against the original source before compiling.
#[allow(dead_code)]
impl ProcessManager {
    /// Create a new process manager with a fully-populated sysinfo snapshot.
    pub fn new(nvml_api: NvmlApi) -> Self {
        let mut system = System::new_all();
        system.refresh_all();

        Self { nvml_api, system }
    }

    /// Get process information by PID
    ///
    /// Refreshes the process table first, then resolves the owning user
    /// (best effort — falls back to "unknown").
    ///
    /// # Errors
    /// Fails when no process with the given PID exists.
    pub fn get_process_info(&mut self, pid: u32) -> Result {
        self.system.refresh_processes();

        let sys_pid = SysPid::from_u32(pid);
        let process = self
            .system
            .process(sys_pid)
            .ok_or_else(|| anyhow::anyhow!("Process with PID {} not found", pid))?;

        // Username resolution is best-effort; never fail the whole lookup.
        let user = get_process_user(pid).unwrap_or_else(|_| "unknown".to_string());

        // sysinfo reports start time as seconds since the Unix epoch.
        let start_time = process.start_time();
        let start_time_system = SystemTime::UNIX_EPOCH + Duration::from_secs(start_time);

        Ok(ProcessInfo {
            pid,
            user,
            name: process.name().to_string(),
            start_time: start_time_system,
            cmdline: process.cmd().join(" "),
        })
    }

    /// Check if a process is using any GPU (delegates to NVML).
    pub fn is_process_using_gpu(&self, pid: u32) -> Result {
        self.nvml_api.is_process_using_gpu(pid)
    }

    /// Gracefully terminate a process with timeout and escalation
    ///
    /// Sends SIGTERM, polls every 100 ms for up to `timeout_secs`, then —
    /// only when `force` is set — escalates to SIGKILL.
    ///
    /// # Errors
    /// Fails if a signal cannot be delivered, or if the process survives
    /// the timeout (and, with `force`, survives SIGKILL as well).
    #[cfg(unix)]
    pub fn graceful_kill(&self, pid: u32, timeout_secs: u16, force: bool) -> Result<()> {
        let pid = Pid::from_raw(pid as i32);

        // First, try SIGTERM
        tracing::info!("Sending SIGTERM to process {}", pid);
        kill(pid, Signal::SIGTERM).map_err(|e| anyhow::anyhow!("Failed to send SIGTERM: {}", e))?;

        // Wait for the process to terminate
        let timeout = Duration::from_secs(timeout_secs as u64);
        let start = SystemTime::now();

        while SystemTime::now().duration_since(start).unwrap_or_default() < timeout {
            // Check if process still exists
            if !self.is_process_running(pid.as_raw() as u32)? {
                tracing::info!("Process {} terminated gracefully", pid);
                return Ok(());
            }

            // Poll interval: 100 ms between liveness checks.
            std::thread::sleep(Duration::from_millis(100));
        }

        // Process didn't terminate, escalate if force is enabled
        if force {
            tracing::warn!("Process {} did not terminate, escalating to SIGKILL", pid);
            kill(pid, Signal::SIGKILL)
                .map_err(|e| anyhow::anyhow!("Failed to send SIGKILL: {}", e))?;

            // Wait a bit more for SIGKILL to take effect
            std::thread::sleep(Duration::from_millis(500));

            if !self.is_process_running(pid.as_raw() as u32)? {
                tracing::info!("Process {} terminated with SIGKILL", pid);
                Ok(())
            } else {
                Err(anyhow::anyhow!(
                    "Process {} still running after SIGKILL",
                    pid
                ))
            }
        } else {
            Err(anyhow::anyhow!(
                "Process {} did not terminate within {} seconds. Use --force to escalate to SIGKILL",
                pid,
                timeout_secs
            ))
        }
    }

    /// Gracefully terminate a process with timeout and escalation (Windows stub)
    #[cfg(windows)]
    pub fn graceful_kill(&self, _pid: u32, _timeout_secs: u16, _force: bool) -> Result<()> {
        // On Windows, we can't use Unix signals, so we'll use a different approach
        // For now, just return an error indicating this feature isn't available on Windows
        Err(anyhow::anyhow!(
            "Process termination not yet implemented for Windows"
        ))
    }

    /// Check if a process is still running
    // NOTE(review): this consults the cached `self.system` snapshot without
    // refreshing it (refreshing needs &mut self), so a process that exited
    // after the last refresh may still appear alive — confirm the snapshot
    // is fresh enough at the graceful_kill call site.
    fn is_process_running(&self, pid: u32) -> Result {
        let sys_pid = SysPid::from_u32(pid);
        Ok(self.system.process(sys_pid).is_some())
    }

    /// Enrich GPU processes with system information
    ///
    /// For each GPU process, overwrites user/name/start-time with data from
    /// the system process table when the PID can be resolved; entries that
    /// cannot be resolved are left unchanged.
    pub fn enrich_gpu_processes(
        &mut self,
        mut processes: Vec,
    ) -> Result> {
        self.system.refresh_processes();

        for process in &mut processes {
            if let Ok(process_info) = self.get_process_info(process.pid) {
                process.user = process_info.user;
                process.proc_name = process_info.name;
                process.start_time = parse_process_start_time(process_info.start_time);
            }
        }

        Ok(processes)
    }

    /// Get all processes using GPUs with enriched information
    pub fn get_enriched_gpu_processes(&mut self) -> Result> {
        let processes = self.nvml_api.get_gpu_processes()?;
        self.enrich_gpu_processes(processes)
    }

    /// Validate that a process exists and optionally check GPU usage
    ///
    /// # Errors
    /// Fails when the PID does not exist, or (with `check_gpu_usage`) when
    /// the process is not using any GPU.
    pub fn validate_process(&self, pid: u32, check_gpu_usage: bool) -> Result<()> {
        // Check if process exists
        let sys_pid = SysPid::from_u32(pid);
        if self.system.process(sys_pid).is_none() {
            return Err(anyhow::anyhow!("Process with PID {} not found", pid));
        }

        // Check GPU usage if requested
        if check_gpu_usage {
            let is_using_gpu = self.is_process_using_gpu(pid)?;
            if !is_using_gpu {
                return Err(anyhow::anyhow!(
                    "Process {} is not using any GPU. Use --force to kill anyway.",
                    pid
                ));
            }
        }

        Ok(())
    }

    /// Get device count (delegates to NVML).
    pub fn device_count(&self) -> Result {
        self.nvml_api.device_count()
    }

    /// Create snapshot of current GPU state (delegates to NVML).
    pub fn create_snapshot(&self) -> Result {
        self.nvml_api.create_snapshot()
    }

    /// Reset GPU at the given index (delegates to NVML).
    pub fn reset_gpu(&self, index: u32) -> Result<()> {
        self.nvml_api.reset_gpu(index)
    }
}
197 |
/// Get the username for a process (cross-platform)
///
/// Strategy per platform:
/// - Linux: parse the real UID from `/proc/<pid>/status` and resolve it.
/// - macOS: shell out to `ps -o user= -p <pid>`.
/// - Windows: query wmic; currently returns a placeholder (see below).
///
/// Falls through to `Ok("unknown")` when no branch produced a name.
// NOTE(review): the turbofish on `parse` lost its type parameter in this
// listing (`parse::()` should read `parse::<u32>()`).
fn get_process_user(pid: u32) -> Result {
    #[cfg(target_os = "linux")]
    {
        // On Linux, read from /proc//status
        let status_path = format!("/proc/{}/status", pid);
        let status = std::fs::read_to_string(&status_path)
            .with_context(|| format!("Failed to read process status from {}", status_path))?;

        for line in status.lines() {
            if line.starts_with("Uid:") {
                // "Uid:" line format: real, effective, saved, filesystem UID;
                // parts[1] is the real UID.
                let parts: Vec<&str> = line.split_whitespace().collect();
                if parts.len() >= 2 {
                    let uid = parts[1]
                        .parse::()
                        .with_context(|| format!("Failed to parse UID: {}", parts[1]))?;

                    // Get username from UID
                    return get_username_from_uid(uid);
                }
            }
        }
    }

    #[cfg(target_os = "macos")]
    {
        use std::process::Command;
        // On macOS, use ps command
        let output = Command::new("ps")
            .args(["-o", "user=", "-p", &pid.to_string()])
            .output()
            .context("Failed to execute ps command")?;

        if output.status.success() {
            let user = String::from_utf8_lossy(&output.stdout).trim().to_string();
            if !user.is_empty() {
                return Ok(user);
            }
        }
    }

    #[cfg(target_os = "windows")]
    {
        use std::process::Command;
        // On Windows, use wmic command
        let output = Command::new("wmic")
            .args([
                "process",
                "where",
                &format!("ProcessId={}", pid),
                "get",
                "ExecutablePath",
                "/format:value",
            ])
            .output()
            .context("Failed to execute wmic command")?;

        if output.status.success() {
            let output_str = String::from_utf8_lossy(&output.stdout);
            for line in output_str.lines() {
                if line.starts_with("ExecutablePath=") {
                    let path = line.strip_prefix("ExecutablePath=").unwrap_or("");
                    if !path.is_empty() {
                        // NOTE(review): this branch never returns the real
                        // account name — it yields the fixed placeholder
                        // "windows_user" whenever any executable path exists.
                        // Extract username from path or use a default
                        return Ok("windows_user".to_string());
                    }
                }
            }
        }
    }

    Ok("unknown".to_string())
}
271 |
272 | #[cfg(target_os = "linux")]
273 | fn get_username_from_uid(uid: u32) -> Result {
274 | use std::ffi::CString;
275 | // use std::os::unix::ffi::OsStringExt; // Unused for now
276 |
277 | unsafe {
278 | let passwd = libc::getpwuid(uid as libc::uid_t);
279 | if passwd.is_null() {
280 | return Ok(format!("uid_{}", uid));
281 | }
282 |
283 | let username = CString::from_raw((*passwd).pw_name);
284 | let username_str = username.to_string_lossy().to_string();
285 | std::mem::forget(username); // Don't free the passwd struct
286 | Ok(username_str)
287 | }
288 | }
289 |
#[cfg(not(target_os = "linux"))]
#[allow(dead_code)]
/// Non-Linux stub: UID-to-username resolution is only implemented for
/// Linux; other platforms resolve the user elsewhere (`ps` on macOS,
/// `wmic` on Windows), so this always reports "unknown".
fn get_username_from_uid(_uid: u32) -> Result<String> {
    Ok("unknown".to_string())
}
295 |
#[cfg(test)]
mod tests {
    use super::*;
    use crate::nvml_api::NvmlApi;

    /// Looks up PID 1 (init/systemd) and sanity-checks the result.
    /// Silently skipped when NVML cannot be initialized on this machine.
    #[test]
    fn test_process_info_creation() {
        if let Ok(nvml_api) = NvmlApi::new() {
            let mut proc_mgr = ProcessManager::new(nvml_api);

            // PID 1 should exist on Unix hosts; when it does, the info
            // must echo the PID back and carry a non-empty name.
            if let Ok(info) = proc_mgr.get_process_info(1) {
                assert_eq!(info.pid, 1);
                assert!(!info.name.is_empty());
            }
        }
        // No NVML available: nothing to test.
    }

    /// A wildly out-of-range PID must be rejected by validate_process.
    /// Silently skipped when NVML cannot be initialized on this machine.
    #[test]
    fn test_process_validation() {
        if let Ok(nvml_api) = NvmlApi::new() {
            let proc_mgr = ProcessManager::new(nvml_api);

            assert!(proc_mgr.validate_process(999999, false).is_err());
        }
        // No NVML available: nothing to test.
    }
}
339 |
--------------------------------------------------------------------------------
/mcp/src/resources.rs:
--------------------------------------------------------------------------------
1 | //! MCP Resources for GPU Kill
2 |
3 | use crate::types::*;
4 | use gpukill::audit::AuditManager;
5 | use gpukill::guard_mode::GuardModeManager;
6 | use gpukill::rogue_detection::RogueDetector;
7 | use gpukill::vendor::GpuManager;
8 | use serde_json::json;
9 | use std::collections::HashMap;
10 |
/// Resource handler for GPU Kill MCP server
// NOTE(review): the `Option` fields lost their generic parameters in this
// listing (presumably Option<GuardModeManager>, Option<RogueDetector>,
// Option<AuditManager>) — confirm against the original source.
pub struct ResourceHandler {
    // Always-present, vendor-agnostic GPU access layer.
    gpu_manager: GpuManager,
    // Optional subsystems; None when initialization failed in new().
    guard_mode: Option,
    rogue_detector: Option,
    // Left as None by new(): the audit manager is moved into the rogue
    // detector during construction.
    audit_manager: Option,
}
18 |
19 | impl ResourceHandler {
20 | pub async fn new() -> anyhow::Result {
21 | let gpu_manager = GpuManager::initialize()?;
22 |
23 | // Initialize optional components
24 | let guard_mode = GuardModeManager::new().ok();
25 | let audit_manager = AuditManager::new().await.ok();
26 | let rogue_detector = if let Some(am) = audit_manager {
27 | Some(RogueDetector::new(am))
28 | } else {
29 | None
30 | };
31 |
32 | Ok(Self {
33 | gpu_manager,
34 | guard_mode,
35 | rogue_detector,
36 | audit_manager: None, // We moved it to rogue_detector
37 | })
38 | }
39 |
40 | /// List all available resources
41 | pub fn list_resources(&self) -> Vec {
42 | vec![
43 | Resource {
44 | uri: "gpu://list".to_string(),
45 | name: "GPU List".to_string(),
46 | description: Some("Current GPU status and utilization".to_string()),
47 | mime_type: Some("application/json".to_string()),
48 | },
49 | Resource {
50 | uri: "gpu://processes".to_string(),
51 | name: "GPU Processes".to_string(),
52 | description: Some("Currently running GPU processes".to_string()),
53 | mime_type: Some("application/json".to_string()),
54 | },
55 | Resource {
56 | uri: "gpu://audit".to_string(),
57 | name: "GPU Audit".to_string(),
58 | description: Some("Historical GPU usage data".to_string()),
59 | mime_type: Some("application/json".to_string()),
60 | },
61 | Resource {
62 | uri: "gpu://policies".to_string(),
63 | name: "Guard Mode Policies".to_string(),
64 | description: Some("Current Guard Mode policies".to_string()),
65 | mime_type: Some("application/json".to_string()),
66 | },
67 | Resource {
68 | uri: "gpu://rogue-detection".to_string(),
69 | name: "Rogue Detection".to_string(),
70 | description: Some("Security scan results and threats".to_string()),
71 | mime_type: Some("application/json".to_string()),
72 | },
73 | ]
74 | }
75 |
76 | /// Get resource contents by URI
77 | pub async fn get_resource(&self, uri: &str) -> anyhow::Result {
78 | match uri {
79 | "gpu://list" => self.get_gpu_list().await,
80 | "gpu://processes" => self.get_gpu_processes().await,
81 | "gpu://audit" => self.get_audit_data().await,
82 | "gpu://policies" => self.get_policies().await,
83 | "gpu://rogue-detection" => self.get_rogue_detection().await,
84 | _ => Err(anyhow::anyhow!("Unknown resource URI: {}", uri)),
85 | }
86 | }
87 |
88 | async fn get_gpu_list(&self) -> anyhow::Result {
89 | let gpus = self.gpu_manager.get_all_snapshots()?;
90 | let gpu_info: Vec = gpus
91 | .into_iter()
92 | .map(|gpu| GpuInfo {
93 | id: gpu.gpu_index as u32,
94 | name: gpu.name,
95 | vendor: gpu.vendor.to_string(),
96 | memory_used: gpu.mem_used_mb as f64,
97 | memory_total: gpu.mem_total_mb as f64,
98 | utilization: gpu.util_pct as f64,
99 | temperature: Some(gpu.temp_c as f64),
100 | power_usage: Some(gpu.power_w as f64),
101 | processes: gpu
102 | .top_proc
103 | .map(|proc| GpuProcess {
104 | pid: proc.pid,
105 | name: proc.proc_name,
106 | memory_usage: proc.used_mem_mb as f64,
107 | user: Some(proc.user),
108 | })
109 | .into_iter()
110 | .collect(),
111 | })
112 | .collect();
113 |
114 | let json_text = serde_json::to_string_pretty(&gpu_info)?;
115 |
116 | Ok(ResourceContents {
117 | uri: "gpu://list".to_string(),
118 | mime_type: Some("application/json".to_string()),
119 | text: Some(json_text),
120 | blob: None,
121 | })
122 | }
123 |
124 | async fn get_gpu_processes(&self) -> anyhow::Result {
125 | let gpus = self.gpu_manager.get_all_snapshots()?;
126 | let mut all_processes = Vec::new();
127 |
128 | for gpu in gpus {
129 | if let Some(proc) = gpu.top_proc {
130 | all_processes.push(GpuProcess {
131 | pid: proc.pid,
132 | name: proc.proc_name,
133 | memory_usage: proc.used_mem_mb as f64,
134 | user: Some(proc.user),
135 | });
136 | }
137 | }
138 |
139 | let json_text = serde_json::to_string_pretty(&all_processes)?;
140 |
141 | Ok(ResourceContents {
142 | uri: "gpu://processes".to_string(),
143 | mime_type: Some("application/json".to_string()),
144 | text: Some(json_text),
145 | blob: None,
146 | })
147 | }
148 |
149 | async fn get_audit_data(&self) -> anyhow::Result {
150 | // For now, return empty audit data since we don't have access to audit_manager
151 | // In a full implementation, we would need to restructure to share the audit_manager
152 | Ok(ResourceContents {
153 | uri: "gpu://audit".to_string(),
154 | mime_type: Some("application/json".to_string()),
155 | text: Some("[]".to_string()),
156 | blob: None,
157 | })
158 | }
159 |
160 | async fn get_policies(&self) -> anyhow::Result {
161 | if let Some(guard_mode) = &self.guard_mode {
162 | let config = guard_mode.get_config();
163 | let policies: Vec = config
164 | .user_policies
165 | .iter()
166 | .map(|(name, policy)| {
167 | let mut limits = HashMap::new();
168 | limits.insert("memory_limit_gb".to_string(), json!(policy.memory_limit_gb));
169 | limits.insert(
170 | "utilization_limit_pct".to_string(),
171 | json!(policy.utilization_limit_pct),
172 | );
173 | limits.insert(
174 | "process_limit".to_string(),
175 | json!(policy.max_concurrent_processes),
176 | );
177 |
178 | PolicyInfo {
179 | policy_type: "user".to_string(),
180 | name: name.clone(),
181 | enabled: true,
182 | limits,
183 | }
184 | })
185 | .collect();
186 |
187 | let json_text = serde_json::to_string_pretty(&policies)?;
188 |
189 | Ok(ResourceContents {
190 | uri: "gpu://policies".to_string(),
191 | mime_type: Some("application/json".to_string()),
192 | text: Some(json_text),
193 | blob: None,
194 | })
195 | } else {
196 | Ok(ResourceContents {
197 | uri: "gpu://policies".to_string(),
198 | mime_type: Some("application/json".to_string()),
199 | text: Some("[]".to_string()),
200 | blob: None,
201 | })
202 | }
203 | }
204 |
205 | async fn get_rogue_detection(&self) -> anyhow::Result {
206 | if let Some(rogue_detector) = &self.rogue_detector {
207 | let result = rogue_detector.detect_rogue_activity(24).await?;
208 |
209 | // Combine all threat types into a single list
210 | let mut all_threats = Vec::new();
211 |
212 | // Add suspicious processes
213 | for threat in result.suspicious_processes {
214 | all_threats.push(ThreatInfo {
215 | id: format!("suspicious_{}", threat.process.pid),
216 | threat_type: "suspicious_process".to_string(),
217 | severity: "medium".to_string(),
218 | confidence: threat.confidence as f64,
219 | description: format!("Suspicious process: {}", threat.process.proc_name),
220 | process_info: Some(GpuProcess {
221 | pid: threat.process.pid,
222 | name: threat.process.proc_name,
223 | memory_usage: threat.process.used_mem_mb as f64,
224 | user: Some(threat.process.user),
225 | }),
226 | });
227 | }
228 |
229 | // Add crypto miners
230 | for threat in result.crypto_miners {
231 | all_threats.push(ThreatInfo {
232 | id: format!("crypto_{}", threat.process.pid),
233 | threat_type: "crypto_miner".to_string(),
234 | severity: "high".to_string(),
235 | confidence: threat.confidence as f64,
236 | description: format!("Crypto miner detected: {}", threat.process.proc_name),
237 | process_info: Some(GpuProcess {
238 | pid: threat.process.pid,
239 | name: threat.process.proc_name,
240 | memory_usage: threat.process.used_mem_mb as f64,
241 | user: Some(threat.process.user),
242 | }),
243 | });
244 | }
245 |
246 | // Add resource abusers
247 | for threat in result.resource_abusers {
248 | all_threats.push(ThreatInfo {
249 | id: format!("abuser_{}", threat.process.pid),
250 | threat_type: "resource_abuser".to_string(),
251 | severity: "medium".to_string(),
252 | confidence: threat.severity as f64,
253 | description: format!("Resource abuser: {}", threat.process.proc_name),
254 | process_info: Some(GpuProcess {
255 | pid: threat.process.pid,
256 | name: threat.process.proc_name,
257 | memory_usage: threat.process.used_mem_mb as f64,
258 | user: Some(threat.process.user),
259 | }),
260 | });
261 | }
262 |
263 | // Add data exfiltrators
264 | for threat in result.data_exfiltrators {
265 | all_threats.push(ThreatInfo {
266 | id: format!("exfil_{}", threat.process.pid),
267 | threat_type: "data_exfiltrator".to_string(),
268 | severity: "high".to_string(),
269 | confidence: threat.confidence as f64,
270 | description: format!("Data exfiltrator: {}", threat.process.proc_name),
271 | process_info: Some(GpuProcess {
272 | pid: threat.process.pid,
273 | name: threat.process.proc_name,
274 | memory_usage: threat.process.used_mem_mb as f64,
275 | user: Some(threat.process.user),
276 | }),
277 | });
278 | }
279 |
280 | let threat_info = all_threats;
281 |
282 | let json_text = serde_json::to_string_pretty(&threat_info)?;
283 |
284 | Ok(ResourceContents {
285 | uri: "gpu://rogue-detection".to_string(),
286 | mime_type: Some("application/json".to_string()),
287 | text: Some(json_text),
288 | blob: None,
289 | })
290 | } else {
291 | Ok(ResourceContents {
292 | uri: "gpu://rogue-detection".to_string(),
293 | mime_type: Some("application/json".to_string()),
294 | text: Some("[]".to_string()),
295 | blob: None,
296 | })
297 | }
298 | }
299 | }
300 |
--------------------------------------------------------------------------------
/src/process_mgmt.rs:
--------------------------------------------------------------------------------
1 | use crate::nvml_api::GpuProc;
2 | use crate::proc::ProcessManager;
3 | use anyhow::Result;
4 | use regex::Regex;
5 | use std::collections::HashMap;
6 | use sysinfo::{Pid as SysPid, System};
7 |
/// Enhanced process management with filtering and batch operations
pub struct EnhancedProcessManager {
    // Underlying manager that performs the actual (graceful/forced) kills.
    pub process_manager: ProcessManager,
    // sysinfo system handle; refreshed before process-table walks.
    system: System,
}
13 |
14 | #[allow(dead_code)]
15 | impl EnhancedProcessManager {
16 | pub fn new(process_manager: ProcessManager) -> Self {
17 | Self {
18 | process_manager,
19 | system: System::new_all(),
20 | }
21 | }
22 |
23 | /// Filter processes by name pattern (supports regex)
24 | pub fn filter_processes_by_name(
25 | &mut self,
26 | processes: &[GpuProc],
27 | pattern: &str,
28 | ) -> Result> {
29 | let regex = Regex::new(pattern)
30 | .map_err(|e| anyhow::anyhow!("Invalid regex pattern '{}': {}", pattern, e))?;
31 |
32 | let mut filtered = Vec::new();
33 | for proc in processes {
34 | if regex.is_match(&proc.proc_name) {
35 | filtered.push(proc.clone());
36 | }
37 | }
38 |
39 | Ok(filtered)
40 | }
41 |
42 | /// Filter processes by user
43 | pub fn filter_processes_by_user(
44 | &mut self,
45 | processes: &[GpuProc],
46 | user: &str,
47 | ) -> Result> {
48 | let regex = Regex::new(user)
49 | .map_err(|e| anyhow::anyhow!("Invalid regex pattern '{}': {}", user, e))?;
50 |
51 | let mut filtered = Vec::new();
52 | for proc in processes {
53 | if regex.is_match(&proc.user) {
54 | filtered.push(proc.clone());
55 | }
56 | }
57 |
58 | Ok(filtered)
59 | }
60 |
61 | /// Filter processes by memory usage threshold
62 | pub fn filter_processes_by_memory(
63 | &mut self,
64 | processes: &[GpuProc],
65 | min_mb: u32,
66 | ) -> Vec {
67 | processes
68 | .iter()
69 | .filter(|proc| proc.used_mem_mb >= min_mb)
70 | .cloned()
71 | .collect()
72 | }
73 |
74 | /// Get process tree for a given PID
75 | pub fn get_process_tree(&mut self, root_pid: u32) -> Result> {
76 | self.system.refresh_processes();
77 |
78 | let mut pids = Vec::new();
79 | let mut to_process = vec![root_pid];
80 |
81 | while let Some(pid) = to_process.pop() {
82 | pids.push(pid);
83 |
84 | // Find child processes
85 | for process in self.system.processes().values() {
86 | if let Some(parent) = process.parent() {
87 | if parent.as_u32() == pid {
88 | to_process.push(process.pid().as_u32());
89 | }
90 | }
91 | }
92 | }
93 |
94 | Ok(pids)
95 | }
96 |
97 | /// Kill a process and its children
98 | pub fn kill_process_tree(
99 | &mut self,
100 | root_pid: u32,
101 | timeout_secs: u16,
102 | force: bool,
103 | ) -> Result<()> {
104 | let pids = self.get_process_tree(root_pid)?;
105 |
106 | tracing::info!("Killing process tree: {:?}", pids);
107 |
108 | // Kill children first, then parent
109 | for pid in pids.iter().rev() {
110 | if let Err(e) = self
111 | .process_manager
112 | .graceful_kill(*pid, timeout_secs, force)
113 | {
114 | tracing::warn!("Failed to kill process {}: {}", pid, e);
115 | }
116 | }
117 |
118 | Ok(())
119 | }
120 |
121 | /// Batch kill processes matching a pattern
122 | pub fn batch_kill_processes(
123 | &mut self,
124 | processes: &[GpuProc],
125 | timeout_secs: u16,
126 | force: bool,
127 | ) -> Result> {
128 | let mut killed_pids = Vec::new();
129 | let mut failed_pids = Vec::new();
130 |
131 | for proc in processes {
132 | match self
133 | .process_manager
134 | .graceful_kill(proc.pid, timeout_secs, force)
135 | {
136 | Ok(()) => {
137 | killed_pids.push(proc.pid);
138 | tracing::info!(
139 | "Successfully killed process {} ({})",
140 | proc.pid,
141 | proc.proc_name
142 | );
143 | }
144 | Err(e) => {
145 | failed_pids.push(proc.pid);
146 | tracing::warn!(
147 | "Failed to kill process {} ({}): {}",
148 | proc.pid,
149 | proc.proc_name,
150 | e
151 | );
152 | }
153 | }
154 | }
155 |
156 | if !failed_pids.is_empty() {
157 | return Err(anyhow::anyhow!(
158 | "Failed to kill {} processes: {:?}",
159 | failed_pids.len(),
160 | failed_pids
161 | ));
162 | }
163 |
164 | Ok(killed_pids)
165 | }
166 |
167 | /// Detect if a process is running in a container
168 | pub fn detect_container(&mut self, pid: u32) -> Result