├── screenshot.png
├── dashboard
    ├── assets
    │   ├── img
    │   │   ├── logo.png
    │   │   └── screenshot.png
    │   └── css
    │   │   └── main.css
    ├── public
    │   ├── favicon.ico
    │   ├── screenshot.png
    │   ├── robots.txt
    │   ├── browserconfig.xml
    │   ├── sitemap.xml
    │   └── site.webmanifest
    ├── app
    │   └── app.vue
    ├── .gitignore
    ├── tsconfig.json
    ├── package.json
    ├── nuxt.config.ts
    ├── README.md
    ├── components
    │   └── AppSidebar.vue
    └── tailwind.config.js
├── src
    ├── lib.rs
    ├── version.rs
    ├── util.rs
    ├── config.rs
    ├── remote.rs
    ├── hotaisle_client.rs
    ├── render.rs
    ├── proc.rs
    └── process_mgmt.rs
├── mcp
    ├── src
    │   ├── lib.rs
    │   ├── main.rs
    │   ├── types.rs
    │   ├── server.rs
    │   └── resources.rs
    ├── Cargo.toml
    └── README.md
├── .gitignore
├── audit.toml
├── Cargo.toml
├── scripts
    ├── install.ps1
    ├── test-hotaisle-integration-simple.sh
    ├── install.sh
    ├── setup-gpu-runner.sh
    └── run-gpu-tests.sh
├── .github
    └── workflows
    │   ├── test-hotaisle-integration.yml
    │   ├── release.yml
    │   ├── hotaisle-gpu-testing.yml
    │   ├── gpu-testing.yml
    │   └── self-hosted-setup.md
├── LICENSE
├── docs
    ├── CLOUD_GPU_SETUP.md
    └── HOTAISLE_INTEGRATION.md
├── README.md
└── deny.toml


/screenshot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/treadiehq/gpu-kill/HEAD/screenshot.png


--------------------------------------------------------------------------------
/dashboard/assets/img/logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/treadiehq/gpu-kill/HEAD/dashboard/assets/img/logo.png


--------------------------------------------------------------------------------
/dashboard/public/favicon.ico:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/treadiehq/gpu-kill/HEAD/dashboard/public/favicon.ico


--------------------------------------------------------------------------------
/dashboard/public/screenshot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/treadiehq/gpu-kill/HEAD/dashboard/public/screenshot.png


--------------------------------------------------------------------------------
/dashboard/assets/img/screenshot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/treadiehq/gpu-kill/HEAD/dashboard/assets/img/screenshot.png


--------------------------------------------------------------------------------
/dashboard/app/app.vue:
--------------------------------------------------------------------------------
1 | <template>
2 |   <div class="min-h-screen bg-gray-50 dark:bg-gray-900">
3 |     <NuxtRouteAnnouncer />
4 |     <NuxtPage />
5 |   </div>
6 | </template>
7 | 


--------------------------------------------------------------------------------
/dashboard/.gitignore:
--------------------------------------------------------------------------------
 1 | # Nuxt dev/build outputs
 2 | .output
 3 | .data
 4 | .nuxt
 5 | .nitro
 6 | .cache
 7 | dist
 8 | 
 9 | # Node dependencies
10 | node_modules
11 | 
12 | # Logs
13 | logs
14 | *.log
15 | 
16 | # Misc
17 | .DS_Store
18 | .fleet
19 | .idea
20 | 
21 | # Local env files
22 | .env
23 | .env.*
24 | !.env.example
25 | 


--------------------------------------------------------------------------------
/dashboard/public/robots.txt:
--------------------------------------------------------------------------------
 1 | User-agent: *
 2 | Allow: /
 3 | 
 4 | # Sitemap
 5 | Sitemap: https://gpukill.com/sitemap.xml
 6 | 
 7 | # Crawl-delay for respectful crawling
 8 | Crawl-delay: 1
 9 | 
10 | # Disallow admin or sensitive areas (if any)
11 | # Disallow: /admin/
12 | # Disallow: /api/
13 | 
14 | # Allow all other content
15 | Allow: /dashboard/
16 | Allow: /docs/
17 | Allow: /assets/


--------------------------------------------------------------------------------
/src/lib.rs:
--------------------------------------------------------------------------------
 1 | pub mod args;
 2 | pub mod audit;
 3 | pub mod config;
 4 | pub mod coordinator;
 5 | pub mod guard_mode;
 6 | pub mod nvml_api;
 7 | pub mod proc;
 8 | pub mod process_mgmt;
 9 | pub mod remote;
10 | pub mod render;
11 | pub mod rogue_config;
12 | pub mod rogue_detection;
13 | pub mod util;
14 | pub mod vendor;
15 | pub mod version;
16 | 
17 | #[cfg(feature = "hotaisle")]
18 | pub mod hotaisle_client;
19 | 


--------------------------------------------------------------------------------
/dashboard/tsconfig.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   // https://nuxt.com/docs/guide/concepts/typescript
 3 |   "files": [],
 4 |   "references": [
 5 |     {
 6 |       "path": "./.nuxt/tsconfig.app.json"
 7 |     },
 8 |     {
 9 |       "path": "./.nuxt/tsconfig.server.json"
10 |     },
11 |     {
12 |       "path": "./.nuxt/tsconfig.shared.json"
13 |     },
14 |     {
15 |       "path": "./.nuxt/tsconfig.node.json"
16 |     }
17 |   ]
18 | }
19 | 


--------------------------------------------------------------------------------
/dashboard/public/browserconfig.xml:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" encoding="utf-8"?>
 2 | <browserconfig>
 3 |   <msapplication>
 4 |     <tile>
 5 |       <square70x70logo src="/mstile-70x70.png"/>
 6 |       <square150x150logo src="/mstile-150x150.png"/>
 7 |       <square310x310logo src="/mstile-310x310.png"/>
 8 |       <wide310x150logo src="/mstile-310x150.png"/>
 9 |       <TileColor>#1e40af</TileColor>
10 |     </tile>
11 |   </msapplication>
12 | </browserconfig>
13 | 


--------------------------------------------------------------------------------
/mcp/src/lib.rs:
--------------------------------------------------------------------------------
 1 | //! GPU Kill MCP Server
 2 | //!
 3 | //! This module provides a Model Context Protocol (MCP) server for GPU Kill,
 4 | //! enabling AI assistants and other tools to interact with GPU management
 5 | //! functionality through a standardized interface.
 6 | 
 7 | pub mod resources;
 8 | pub mod server;
 9 | pub mod tools;
10 | pub mod types;
11 | 
12 | pub use server::GpuKillMCPServer;
13 | pub use types::*;
14 | 
15 | /// MCP Server version
16 | pub const MCP_VERSION: &str = "2024-11-05";
17 | 
18 | /// GPU Kill MCP Server capabilities
19 | pub const CAPABILITIES: &[&str] = &["resources", "tools", "logging"];
20 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | # Rust
 2 | /target/
 3 | **/*.rs.bk
 4 | Cargo.lock
 5 | 
 6 | # IDE
 7 | .vscode/
 8 | .idea/
 9 | *.swp
10 | *.swo
11 | *~
12 | 
13 | # OS
14 | .DS_Store
15 | .DS_Store?
16 | ._*
17 | .Spotlight-V100
18 | .Trashes
19 | ehthumbs.db
20 | Thumbs.db
21 | 
22 | # Logs
23 | *.log
24 | 
25 | # Temporary files
26 | *.tmp
27 | *.temp
28 | 
29 | # Build artifacts
30 | dist/
31 | *.tar.gz
32 | *.zip
33 | 
34 | # Configuration files (optional)
35 | config.toml
36 | .env
37 | 
38 | # Test artifacts
39 | test_output/
40 | coverage/
41 | 
42 | # Documentation build
43 | book/
44 | .DS_Store
45 | 
46 | # Dashboard (separate project)
47 | REMOVED.md


--------------------------------------------------------------------------------
/dashboard/public/sitemap.xml:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" encoding="UTF-8"?>
 2 | <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
 3 |   <url>
 4 |     <loc>https://gpukill.com/</loc>
 5 |     <lastmod>2024-01-20</lastmod>
 6 |     <changefreq>daily</changefreq>
 7 |     <priority>1.0</priority>
 8 |   </url>
 9 |   <url>
10 |     <loc>https://gpukill.com/dashboard/</loc>
11 |     <lastmod>2024-01-20</lastmod>
12 |     <changefreq>daily</changefreq>
13 |     <priority>0.9</priority>
14 |   </url>
15 |   <url>
16 |     <loc>https://gpukill.com/docs/</loc>
17 |     <lastmod>2024-01-20</lastmod>
18 |     <changefreq>weekly</changefreq>
19 |     <priority>0.8</priority>
20 |   </url>
21 | </urlset>
22 | 


--------------------------------------------------------------------------------
/audit.toml:
--------------------------------------------------------------------------------
 1 | # Cargo audit configuration
 2 | [advisories]
 3 | # Allow unmaintained crates (warnings only, not errors)
 4 | unmaintained = "warn"
 5 | 
 6 | # License configuration
 7 | [licenses]
 8 | # Allow common open source licenses
 9 | allow = [
10 |     "MIT",
11 |     "Apache-2.0",
12 |     "Apache-2.0 OR MIT",
13 |     "BSD-2-Clause",
14 |     "BSD-3-Clause",
15 |     "ISC",
16 |     "Unlicense",
17 |     "0BSD",
18 |     "Zlib",
19 |     "CC0-1.0",
20 |     "MPL-2.0",
21 |     "LGPL-2.1",
22 |     "LGPL-3.0",
23 |     "GPL-2.0",
24 |     "GPL-3.0",
25 | ]
26 | 
27 | # Deny proprietary licenses
28 | deny = [
29 |     "proprietary",
30 |     "commercial",
31 | ]
32 | 
33 | # Allow unknown licenses (for crates without explicit license info)
34 | unknown = "warn"
35 | 


--------------------------------------------------------------------------------
/dashboard/package.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "name": "gpukill-dashboard",
 3 |   "private": true,
 4 |   "type": "module",
 5 |   "scripts": {
 6 |     "build": "nuxt build",
 7 |     "dev": "nuxt dev --port 3000",
 8 |     "generate": "nuxt generate",
 9 |     "preview": "nuxt preview",
10 |     "postinstall": "nuxt prepare",
11 |     "start": "nuxt dev --port 3000"
12 |   },
13 |   "dependencies": {
14 |     "@headlessui/vue": "^1.7.23",
15 |     "@heroicons/vue": "^2.2.0",
16 |     "@nuxtjs/tailwindcss": "^6.14.0",
17 |     "@tailwindcss/aspect-ratio": "^0.4.2",
18 |     "@tailwindcss/forms": "^0.5.10",
19 |     "@tailwindcss/typography": "^0.5.18",
20 |     "chart.js": "^4.5.0",
21 |     "nuxt": "^3.13.0",
22 |     "vue": "^3.5.18",
23 |     "vue-chartjs": "^5.3.2",
24 |     "vue-router": "^4.5.1"
25 |   }
26 | }
27 | 


--------------------------------------------------------------------------------
/mcp/Cargo.toml:
--------------------------------------------------------------------------------
 1 | [package]
 2 | name = "gpukill-mcp"
 3 | version = "0.1.0"
 4 | edition = "2021"
 5 | authors = ["GPU Kill Team"]
 6 | description = "MCP server for GPU Kill - AI-accessible GPU management"
 7 | license = "MIT"
 8 | repository = "https://github.com/treadiehq/gpu-kill"
 9 | 
10 | [dependencies]
11 | # Core MCP dependencies
12 | tokio = { version = "1.0", features = ["rt", "rt-multi-thread", "net", "fs", "macros"] }
13 | serde = { version = "1.0", features = ["derive"] }
14 | serde_json = "1.0"
15 | anyhow = "1.0"
16 | tracing = "0.1"
17 | tracing-subscriber = { version = "0.3", features = ["env-filter"] }
18 | 
19 | # GPU Kill integration
20 | gpukill = { path = "../" }
21 | 
22 | # HTTP server for MCP protocol
23 | axum = { version = "0.7", features = ["ws", "macros"] }
24 | tower = "0.4"
25 | tower-http = { version = "0.5", features = ["cors", "trace"] }
26 | 
27 | # JSON-RPC for MCP protocol
28 | jsonrpc-core = "18.0"
29 | jsonrpc-derive = "18.0"
30 | jsonrpc-ws-server = "18.0"
31 | 
32 | # UUID for request IDs
33 | uuid = { version = "1.0", features = ["v4", "serde"] }
34 | 
35 | [dev-dependencies]
36 | tempfile = "3.0"
37 | 


--------------------------------------------------------------------------------
/src/version.rs:
--------------------------------------------------------------------------------
 1 | /// Version information for the gpukill CLI tool
 2 | pub const VERSION: &str = env!("CARGO_PKG_VERSION");
 3 | 
 4 | /// Build information
 5 | pub const BUILD_DATE: &str = env!("BUILD_DATE");
 6 | pub const BUILD_TARGET: &str = env!("BUILD_TARGET");
 7 | #[allow(dead_code)]
 8 | pub const GIT_COMMIT: &str = env!("GIT_COMMIT");
 9 | 
10 | /// Get formatted version string
11 | pub fn get_version_string() -> String {
12 |     format!("gpukill {} ({} {})", VERSION, BUILD_TARGET, BUILD_DATE)
13 | }
14 | 
15 | /// Get detailed version information
16 | #[allow(dead_code)]
17 | pub fn get_detailed_version() -> String {
18 |     format!(
19 |         "gpukill version {}\n\
20 |          Build target: {}\n\
21 |          Build date: {}\n\
22 |          Git commit: {}",
23 |         VERSION, BUILD_TARGET, BUILD_DATE, GIT_COMMIT
24 |     )
25 | }
26 | 
27 | #[cfg(test)]
28 | mod tests {
29 |     use super::*;
30 | 
31 |     #[test]
32 |     fn test_version_string_format() {
33 |         let version = get_version_string();
34 |         assert!(version.contains("gpukill"));
35 |         assert!(version.contains(VERSION));
36 |     }
37 | 
38 |     #[test]
39 |     fn test_detailed_version_format() {
40 |         let detailed = get_detailed_version();
41 |         assert!(detailed.contains("gpukill version"));
42 |         assert!(detailed.contains(VERSION));
43 |         assert!(detailed.contains("Build target:"));
44 |         assert!(detailed.contains("Build date:"));
45 |         assert!(detailed.contains("Git commit:"));
46 |     }
47 | }
48 | 


--------------------------------------------------------------------------------
/dashboard/nuxt.config.ts:
--------------------------------------------------------------------------------
 1 | // https://nuxt.com/docs/api/configuration/nuxt-config
 2 | export default defineNuxtConfig({
 3 |   compatibilityDate: '2024-04-03',
 4 |   devtools: { enabled: true },
 5 |   modules: [
 6 |     '@nuxtjs/tailwindcss'
 7 |   ],
 8 |   runtimeConfig: {
 9 |     public: {
10 |       apiBase: process.env.API_BASE || 'http://localhost:8080'
11 |     }
12 |   },
13 |   ssr: true,
14 |   app: {
15 |     head: {
16 |       title: 'GPU Kill - Cluster Management Dashboard',
17 |       titleTemplate: '%s',
18 |       meta: [
19 |         { charset: 'utf-8' },
20 |         { name: 'viewport', content: 'width=device-width, initial-scale=1' },
21 |         { name: 'format-detection', content: 'telephone=no' },
22 |         { name: 'theme-color', content: '#1e40af' }
23 |       ],
24 |       link: [
25 |         { rel: 'icon', type: 'image/x-icon', href: '/favicon.ico' },
26 |         { rel: 'preconnect', href: 'https://fonts.googleapis.com' },
27 |         { rel: 'preconnect', href: 'https://fonts.gstatic.com', crossorigin: '' }
28 |       ],
29 |       style: [
30 |         {
31 |           innerHTML: `
32 |             html, body, #__nuxt {
33 |               background: #000000 !important;
34 |               background-color: #000000 !important;
35 |               overscroll-behavior: none !important;
36 |             }
37 |             * {
38 |               overscroll-behavior: none !important;
39 |             }
40 |           `
41 |         }
42 |       ]
43 |     }
44 |   },
45 |   nitro: {
46 |     devProxy: {
47 |       '/api': {
48 |         target: 'http://localhost:8080/api',
49 |         changeOrigin: true
50 |       }
51 |     }
52 |   }
53 | })
54 | 


--------------------------------------------------------------------------------
/mcp/src/main.rs:
--------------------------------------------------------------------------------
 1 | //! GPU Kill MCP Server - Main entry point
 2 | 
 3 | use gpukill_mcp::GpuKillMCPServer;
 4 | use std::env;
 5 | use tracing::{error, info};
 6 | 
 7 | #[tokio::main]
 8 | async fn main() -> anyhow::Result<()> {
 9 |     // Initialize logging
10 |     tracing_subscriber::fmt()
11 |         .with_env_filter(tracing_subscriber::EnvFilter::from_default_env())
12 |         .init();
13 | 
14 |     info!("Starting GPU Kill MCP Server");
15 | 
16 |     // Get port from environment or use default
17 |     let port = env::var("MCP_PORT")
18 |         .unwrap_or_else(|_| "3001".to_string())
19 |         .parse::<u16>()
20 |         .unwrap_or(3001);
21 | 
22 |     // Create and start the MCP server
23 |     let server = GpuKillMCPServer::new().await?;
24 | 
25 |     info!("GPU Kill MCP Server initialized successfully");
26 |     info!("Available resources:");
27 |     info!("  - gpu://list - Current GPU status and utilization");
28 |     info!("  - gpu://processes - Currently running GPU processes");
29 |     info!("  - gpu://audit - Historical GPU usage data");
30 |     info!("  - gpu://policies - Current Guard Mode policies");
31 |     info!("  - gpu://rogue-detection - Security scan results");
32 | 
33 |     info!("Available tools:");
34 |     info!("  - kill_gpu_process - Kill a GPU process by PID");
35 |     info!("  - reset_gpu - Reset a GPU by ID");
36 |     info!("  - scan_rogue_activity - Scan for suspicious GPU activity");
37 |     info!("  - create_user_policy - Create a user policy for Guard Mode");
38 |     info!("  - get_gpu_status - Get detailed status of a specific GPU");
39 |     info!("  - kill_processes_by_name - Kill all processes matching a name pattern");
40 | 
41 |     // Start the server
42 |     if let Err(e) = server.start(port).await {
43 |         error!("Failed to start MCP server: {}", e);
44 |         return Err(e);
45 |     }
46 | 
47 |     Ok(())
48 | }
49 | 


--------------------------------------------------------------------------------
/Cargo.toml:
--------------------------------------------------------------------------------
 1 | [workspace]
 2 | members = [
 3 |     ".",
 4 |     "mcp",
 5 | ]
 6 | 
 7 | [package]
 8 | name = "gpukill"
 9 | version = "0.1.8"
10 | edition = "2021"
11 | authors = ["Kage <info@treadie.com>"]
12 | description = "A CLI tool for GPU management and monitoring supporting NVIDIA, AMD, Intel, and Apple Silicon GPUs"
13 | license = "FSL-1.1-MIT"
14 | repository = "https://github.com/treadiehq/gpu-kill"
15 | keywords = ["gpu", "nvidia", "amd", "intel", "apple", "metal", "nvml", "rocm", "cli", "monitoring"]
16 | categories = ["command-line-utilities", "development-tools"]
17 | 
18 | [[bin]]
19 | name = "gpukill"
20 | path = "src/main.rs"
21 | 
22 | [dependencies]
23 | clap = { version = "4.4", features = ["derive", "env"] }
24 | tabled = "0.15"
25 | tracing = "0.1"
26 | tracing-subscriber = { version = "0.3", features = ["env-filter"] }
27 | nvml-wrapper = "0.11"
28 | sysinfo = "0.30"
29 | color-eyre = "0.6"
30 | serde = { version = "1.0", features = ["derive"] }
31 | serde_json = "1.0"
32 | chrono = { version = "0.4", features = ["serde"] }
33 | nix = { version = "0.27", features = ["process", "signal"] }
34 | tokio = { version = "1.0", features = ["rt", "time", "process", "net", "fs"] }
35 | anyhow = "1.0"
36 | hostname = "0.3"
37 | libc = "0.2"
38 | toml = "0.8"
39 | dirs = "5.0"
40 | regex = "1.10"
41 | glob = "0.3"
42 | reqwest = { version = "0.11", features = ["json"] }
43 | 
44 | # HTTP server dependencies
45 | axum = { version = "0.7", features = ["ws", "macros"] }
46 | tower = "0.4"
47 | tower-http = { version = "0.5", features = ["cors", "trace"] }
48 | uuid = { version = "1.0", features = ["v4", "serde"] }
49 | futures-util = "0.3"
50 | 
51 | # SSH remote support (using system SSH for now)
52 | # ssh2 = "0.9"
53 | # rpassword = "7.3"
54 | 
55 | 
56 | # Apple Silicon GPU support
57 | [target.'cfg(target_os = "macos")'.dependencies]
58 | core-foundation = "0.9"
59 | core-foundation-sys = "0.8"
60 | io-kit-sys = "0.2"
61 | 
62 | [dev-dependencies]
63 | tempfile = "3.0"
64 | mockall = "0.12"
65 | 
66 | [build-dependencies]
67 | chrono = "0.4"
68 | 
69 | [features]
70 | default = []
71 | mock_nvml = []
72 | hotaisle = []
73 | 
74 | [profile.release]
75 | # Optimized for faster builds during development
76 | lto = "thin"  # Much faster than "true" (fat LTO)
77 | codegen-units = 4  # Allow parallel codegen for faster builds
78 | panic = "abort"
79 | strip = true
80 | 
81 | # Fast release profile for development
82 | [profile.release-fast]
83 | inherits = "release"
84 | lto = false
85 | codegen-units = 16
86 | opt-level = 2  # Slightly less optimization for speed
87 | 
88 | # Maximum optimization profile for final releases
89 | [profile.release-max]
90 | inherits = "release"
91 | lto = true
92 | codegen-units = 1
93 | opt-level = 3
94 | 


--------------------------------------------------------------------------------
/dashboard/public/site.webmanifest:
--------------------------------------------------------------------------------
 1 | {
 2 |   "name": "GPU Kill - Cluster Management Dashboard",
 3 |   "short_name": "GPU Kill",
 4 |   "description": "Professional GPU cluster management dashboard with real-time monitoring, rogue detection, and policy enforcement.",
 5 |   "start_url": "/",
 6 |   "display": "standalone",
 7 |   "background_color": "#0f172a",
 8 |   "theme_color": "#1e40af",
 9 |   "orientation": "portrait-primary",
10 |   "scope": "/",
11 |   "lang": "en",
12 |   "categories": ["developer", "productivity", "utilities"],
13 |   "icons": [
14 |     {
15 |       "src": "/favicon-16x16.png",
16 |       "sizes": "16x16",
17 |       "type": "image/png"
18 |     },
19 |     {
20 |       "src": "/favicon-32x32.png",
21 |       "sizes": "32x32",
22 |       "type": "image/png"
23 |     },
24 |     {
25 |       "src": "/apple-touch-icon.png",
26 |       "sizes": "180x180",
27 |       "type": "image/png"
28 |     },
29 |     {
30 |       "src": "/android-chrome-192x192.png",
31 |       "sizes": "192x192",
32 |       "type": "image/png",
33 |       "purpose": "any maskable"
34 |     },
35 |     {
36 |       "src": "/android-chrome-512x512.png",
37 |       "sizes": "512x512",
38 |       "type": "image/png",
39 |       "purpose": "any maskable"
40 |     }
41 |   ],
42 |   "screenshots": [
43 |     {
44 |       "src": "/screenshot-desktop.png",
45 |       "sizes": "1280x720",
46 |       "type": "image/png",
47 |       "form_factor": "wide",
48 |       "label": "GPU Kill Dashboard - Desktop View"
49 |     },
50 |     {
51 |       "src": "/screenshot-mobile.png",
52 |       "sizes": "390x844",
53 |       "type": "image/png",
54 |       "form_factor": "narrow",
55 |       "label": "GPU Kill Dashboard - Mobile View"
56 |     }
57 |   ],
58 |   "shortcuts": [
59 |     {
60 |       "name": "Cluster Overview",
61 |       "short_name": "Overview",
62 |       "description": "View cluster overview and statistics",
63 |       "url": "/#cluster-overview",
64 |       "icons": [
65 |         {
66 |           "src": "/shortcut-overview.png",
67 |           "sizes": "96x96"
68 |         }
69 |       ]
70 |     },
71 |     {
72 |       "name": "Rogue Detection",
73 |       "short_name": "Rogue",
74 |       "description": "Scan for suspicious GPU activities",
75 |       "url": "/#rogue-detection",
76 |       "icons": [
77 |         {
78 |           "src": "/shortcut-rogue.png",
79 |           "sizes": "96x96"
80 |         }
81 |       ]
82 |     },
83 |     {
84 |       "name": "Guard Mode",
85 |       "short_name": "Guard",
86 |       "description": "Manage policy enforcement",
87 |       "url": "/#guard-mode",
88 |       "icons": [
89 |         {
90 |           "src": "/shortcut-guard.png",
91 |           "sizes": "96x96"
92 |         }
93 |       ]
94 |     }
95 |   ]
96 | }
97 | 


--------------------------------------------------------------------------------
/scripts/install.ps1:
--------------------------------------------------------------------------------
 1 | $ErrorActionPreference = "Stop"
 2 | 
 3 | # gpukill Windows installer: prefer winget, fallback to zip from GitHub Releases
 4 | 
 5 | param(
 6 |   [string]$Version = "",
 7 |   [string]$BinDir = "$env:LOCALAPPDATA\Programs\gpukill",
 8 |   [switch]$Yes,
 9 |   [switch]$Insecure
10 | )
11 | 
12 | function Get-Arch {
13 |   if ([System.Environment]::Is64BitOperatingSystem) { return "x86_64" } else { return "x86" }
14 | }
15 | 
16 | # Try winget first
17 | try {
18 |   if (Get-Command winget -ErrorAction SilentlyContinue) {
19 |     winget install --id TreadieHQ.GPUKill --silent --accept-package-agreements --accept-source-agreements
20 |     if ($LASTEXITCODE -eq 0) { Write-Host "✅ Installed via winget"; exit 0 }
21 |   }
22 | } catch {}
23 | 
24 | # Fallback to GitHub Releases
25 | $Owner = "treadiehq"
26 | $Repo = "gpu-kill"
27 | if ($Version -ne "") {
28 |   $ApiUrl = "https://api.github.com/repos/$Owner/$Repo/releases/tags/$Version"
29 | } else {
30 |   $ApiUrl = "https://api.github.com/repos/$Owner/$Repo/releases/latest"
31 | }
32 | 
33 | Write-Host "Resolving release…"
34 | $resp = Invoke-RestMethod -Uri $ApiUrl -UseBasicParsing
35 | $Tag = $resp.tag_name
36 | if (-not $Tag) { throw "Failed to resolve release tag" }
37 | 
38 | $arch = Get-Arch
39 | $assetName = "gpukill-$Tag-windows-$arch.zip"
40 | $asset = $resp.assets | Where-Object { $_.name -eq $assetName }
41 | if (-not $asset) { throw "No asset named $assetName in release $Tag" }
42 | 
43 | $tmp = New-Item -ItemType Directory -Path ([System.IO.Path]::GetTempPath() + [System.Guid]::NewGuid())
44 | $zipPath = Join-Path $tmp $assetName
45 | $sumsAsset = $resp.assets | Where-Object { $_.name -eq 'SHA256SUMS' }
46 | $sumsPath = Join-Path $tmp 'SHA256SUMS'
47 | 
48 | Write-Host "Downloading $assetName…"
49 | Invoke-WebRequest -Uri $asset.browser_download_url -OutFile $zipPath -UseBasicParsing
50 | if ($sumsAsset) {
51 |   Invoke-WebRequest -Uri $sumsAsset.browser_download_url -OutFile $sumsPath -UseBasicParsing
52 | }
53 | 
54 | if (Test-Path $sumsPath) {
55 |   $hash = (Get-FileHash -Algorithm SHA256 $zipPath).Hash.ToLower()
56 |   $sums = Get-Content $sumsPath
57 |   if (-not ($sums -match $hash)) {
58 |     if (-not $Insecure) { throw "Checksum verification failed" }
59 |     Write-Warning "Checksum verification skipped (--Insecure)"
60 |   }
61 | }
62 | 
63 | Write-Host "Extracting…"
64 | Expand-Archive -Path $zipPath -DestinationPath $tmp -Force
65 | 
66 | New-Item -ItemType Directory -Force -Path $BinDir | Out-Null
67 | Copy-Item -Path (Join-Path $tmp 'gpukill.exe') -Destination (Join-Path $BinDir 'gpukill.exe') -Force
68 | 
69 | # Add to PATH for current session
70 | $env:PATH = "$BinDir;$env:PATH"
71 | Write-Host "✅ Installed to $BinDir"
72 | & (Join-Path $BinDir 'gpukill.exe') --version
73 | 
74 | 


--------------------------------------------------------------------------------
/.github/workflows/test-hotaisle-integration.yml:
--------------------------------------------------------------------------------
 1 | name: Test Hot Aisle Integration
 2 | 
 3 | on:
 4 |   push:
 5 |     branches: [main, develop]
 6 |     paths:
 7 |       - 'src/hotaisle_client.rs'
 8 |       - 'scripts/run-gpu-tests.sh'
 9 |       - 'scripts/test-hotaisle-integration-simple.sh'
10 |       - '.github/workflows/test-hotaisle-integration.yml'
11 |       - '.github/workflows/hotaisle-gpu-testing.yml'
12 |       - 'docs/HOTAISLE_INTEGRATION.md'
13 |   pull_request:
14 |     branches: [main]
15 |     paths:
16 |       - 'src/hotaisle_client.rs'
17 |       - 'scripts/run-gpu-tests.sh'
18 |       - 'scripts/test-hotaisle-integration-simple.sh'
19 |       - '.github/workflows/test-hotaisle-integration.yml'
20 |       - '.github/workflows/hotaisle-gpu-testing.yml'
21 |       - 'docs/HOTAISLE_INTEGRATION.md'
22 |   workflow_dispatch:
23 | 
24 | permissions:
25 |   contents: read
26 | 
27 | env:
28 |   RUST_BACKTRACE: 1
29 |   RUST_LOG: info
30 | 
31 | jobs:
32 |   test-integration:
33 |     name: Test Hot Aisle Integration
34 |     runs-on: ubuntu-latest
35 |     timeout-minutes: 15
36 |     
37 |     steps:
38 |       - name: Checkout code
39 |         uses: actions/checkout@v4
40 | 
41 |       - name: Install Rust
42 |         uses: dtolnay/rust-toolchain@stable
43 |         with:
44 |           components: rustfmt, clippy
45 | 
46 |       - name: Install system dependencies
47 |         run: |
48 |           sudo apt-get update
49 |           sudo apt-get install -y build-essential libssl-dev pkg-config curl jq
50 | 
51 |       - name: Make test script executable
52 |         run: chmod +x scripts/test-hotaisle-integration-simple.sh
53 | 
54 |       - name: Run Hot Aisle Integration Tests
55 |         run: |
56 |           echo "Running comprehensive Hot Aisle integration tests..."
57 |           ./scripts/test-hotaisle-integration-simple.sh
58 | 
59 |       - name: Validate YAML Syntax
60 |         run: |
61 |           echo "Validating GitHub Actions workflow syntax..."
62 |           python3 -c "
63 |           import yaml
64 |           import sys
65 |           try:
66 |               with open('.github/workflows/hotaisle-gpu-testing.yml', 'r') as f:
67 |                   yaml.safe_load(f)
68 |               print('✅ YAML syntax is valid')
69 |           except yaml.YAMLError as e:
70 |               print(f'❌ YAML syntax error: {e}')
71 |               sys.exit(1)
72 |           except Exception as e:
73 |               print(f'❌ Error reading YAML file: {e}')
74 |               sys.exit(1)
75 |           " || echo "⚠️  Python YAML validation skipped (module not available)"
76 | 
77 |       - name: Integration Test Summary
78 |         run: |
79 |           echo "========================================"
80 |           echo "🎉 Hot Aisle Integration Test Summary"
81 |           echo "========================================"
82 |           echo "✅ All integration tests passed!"
83 |           echo "✅ Hot Aisle integration is ready for use!"
84 |           echo ""
85 |           echo "To use Hot Aisle GPU testing:"
86 |           echo "1. Set up HOTAISLE_API_KEY in GitHub Secrets"
87 |           echo "2. Manually trigger the 'Hot Aisle GPU Testing' workflow"
88 |           echo "3. Monitor results in the Actions tab"
89 | 


--------------------------------------------------------------------------------
/.github/workflows/release.yml:
--------------------------------------------------------------------------------
 1 | name: Release
 2 | 
 3 | on:
 4 |   push:
 5 |     tags:
 6 |       - 'v*'
 7 |   workflow_dispatch:
 8 |     inputs:
 9 |       tag:
10 |         description: 'Tag to release (e.g., v0.1.1)'
11 |         required: true
12 |         type: string
13 | 
14 | permissions:
15 |   contents: write
16 | 
17 | env:
18 |   TAG: ${{ github.ref_type == 'tag' && github.ref_name || inputs.tag }}
19 | 
20 | jobs:
21 |   build-linux-x86_64:
22 |     runs-on: ubuntu-latest
23 |     steps:
24 |       - uses: actions/checkout@v4
25 |       - uses: dtolnay/rust-toolchain@stable
26 |         with:
27 |           targets: x86_64-unknown-linux-gnu
28 |       - name: Install system dependencies (NVML)
29 |         run: |
30 |           sudo apt-get update
31 |           sudo apt-get install -y libnvidia-ml-dev pkg-config
32 |       - name: Build
33 |         run: cargo build --release --target x86_64-unknown-linux-gnu
34 |       - name: Prepare artifact
35 |         run: |
36 |           mkdir -p dist
37 |           cp target/x86_64-unknown-linux-gnu/release/gpukill dist/gpukill-${{ env.TAG }}-linux-x86_64
38 |       - uses: actions/upload-artifact@v4
39 |         with:
40 |           name: linux-x86_64
41 |           path: dist/gpukill-${{ env.TAG }}-linux-x86_64
42 | 
43 |   build-macos-arm64:
44 |     runs-on: macos-14
45 |     steps:
46 |       - uses: actions/checkout@v4
47 |       - uses: dtolnay/rust-toolchain@stable
48 |       - name: Build
49 |         run: cargo build --release
50 |       - name: Prepare artifact
51 |         run: |
52 |           mkdir -p dist
53 |           cp target/release/gpukill dist/gpukill-${{ env.TAG }}-macos-aarch64
54 |       - uses: actions/upload-artifact@v4
55 |         with:
56 |           name: macos-aarch64
57 |           path: dist/gpukill-${{ env.TAG }}-macos-aarch64
58 | 
59 |   build-windows-x86_64:
60 |     runs-on: windows-latest
61 |     steps:
62 |       - uses: actions/checkout@v4
63 |       - uses: dtolnay/rust-toolchain@stable
64 |       - name: Build
65 |         run: cargo build --release
66 |       - name: Prepare zip
67 |         shell: pwsh
68 |         run: |
69 |           New-Item -ItemType Directory -Force -Path dist | Out-Null
70 |           Copy-Item target/release/gpukill.exe dist/gpukill.exe
71 |           Compress-Archive -Path dist/gpukill.exe -DestinationPath dist/gpukill-${{ env.TAG }}-windows-x86_64.zip -Force
72 |       - uses: actions/upload-artifact@v4
73 |         with:
74 |           name: windows-x86_64
75 |           path: dist/gpukill-${{ env.TAG }}-windows-x86_64.zip
76 | 
77 |   release:
78 |     runs-on: ubuntu-latest
79 |     needs: [build-linux-x86_64, build-macos-arm64, build-windows-x86_64]
80 |     steps:
81 |       - uses: actions/checkout@v4
82 |       - name: Download artifacts
83 |         uses: actions/download-artifact@v4
84 |         with:
85 |           path: dist
86 |       - name: Flatten artifacts and compute checksums
87 |         run: |
88 |           mkdir -p upload
89 |           find dist -type f -maxdepth 2 -exec cp {} upload/ \;
90 |           (cd upload && sha256sum * > SHA256SUMS) || (cd upload && shasum -a 256 * > SHA256SUMS)
91 |       - name: Create GitHub Release
92 |         uses: softprops/action-gh-release@v2
93 |         with:
94 |           tag_name: ${{ env.TAG }}
95 |           files: |
96 |             upload/*
97 |         env:
98 |           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
99 | 


--------------------------------------------------------------------------------
/.github/workflows/hotaisle-gpu-testing.yml:
--------------------------------------------------------------------------------
 1 | name: Hot Aisle GPU Testing
 2 | 
 3 | # This workflow only runs when manually triggered and when API key is configured
 4 | on:
 5 |   workflow_dispatch:
 6 |     inputs:
 7 |       gpu_types:
 8 |         description: 'Comma-separated GPU types to test (nvidia,amd,intel,apple-silicon)'
 9 |         required: false
10 |         default: 'nvidia,amd,intel'
11 |       test_duration:
12 |         description: 'Test duration in minutes'
13 |         required: false
14 |         default: '30'
15 | 
16 | jobs:
17 |   preflight:
18 |     name: Preflight Checks
19 |     runs-on: ubuntu-latest
20 |     outputs:
21 |       api_key_configured: ${{ steps.check_api_key.outputs.configured }}
22 |     steps:
23 |       - name: Check Hot Aisle API Key
24 |         id: check_api_key
25 |         run: |
26 |           if [[ -n "${{ secrets.HOTAISLE_API_KEY }}" ]]; then
27 |             echo "configured=true" >> $GITHUB_OUTPUT
28 |             echo "✅ Hot Aisle API key is configured"
29 |           else
30 |             echo "configured=false" >> $GITHUB_OUTPUT
31 |             echo "❌ Hot Aisle API key is not configured"
32 |             echo "Please set HOTAISLE_API_KEY in repository secrets to use this workflow"
33 |             exit 1
34 |           fi
35 | 
36 |   gpu-testing:
37 |     name: GPU Testing on Hot Aisle
38 |     needs: preflight
39 |     if: needs.preflight.outputs.api_key_configured == 'true'
40 |     runs-on: ubuntu-latest
41 |     strategy:
42 |       matrix:
43 |         gpu_type: [nvidia, amd, intel]
44 |     steps:
45 |       - name: Checkout code
46 |         uses: actions/checkout@v4
47 | 
48 |       - name: Set up Rust
49 |         uses: actions-rs/toolchain@v1
50 |         with:
51 |           toolchain: stable
52 |           components: rustfmt, clippy
53 | 
54 |       - name: Build GPU Kill with Hot Aisle support
55 |         run: |
56 |           cargo build --release --features hotaisle
57 |           # Verify binary was created
58 |           ls -la target/release/gpukill
59 | 
60 |       - name: Test Hot Aisle Integration
61 |         run: |
62 |           chmod +x scripts/test-hotaisle-integration.sh
63 |           ./scripts/test-hotaisle-integration.sh
64 | 
65 |       - name: Provision GPU Instance
66 |         id: provision
67 |         run: |
68 |           # This would use the Hot Aisle client to provision an instance
69 |           echo "Provisioning ${{ matrix.gpu_type }} GPU instance..."
70 |           # For now, we'll simulate this step
71 |           echo "instance_id=test-instance-123" >> $GITHUB_OUTPUT
72 |           echo "instance_ip=192.168.1.100" >> $GITHUB_OUTPUT
73 | 
74 |       - name: Deploy and Test on GPU Instance
75 |         run: |
76 |           echo "Deploying GPU Kill to instance ${{ steps.provision.outputs.instance_id }}"
77 |           echo "Running GPU tests on ${{ matrix.gpu_type }} hardware..."
78 |           # This would use the Hot Aisle client to deploy and run tests
79 |           # For now, we'll simulate the test results
80 |           echo "✅ GPU detection tests passed"
81 |           echo "✅ GPU performance tests passed"
82 |           echo "✅ GPU stress tests passed"
83 | 
84 |       - name: Cleanup GPU Instance
85 |         if: always()
86 |         run: |
87 |           echo "Cleaning up instance ${{ steps.provision.outputs.instance_id }}"
88 |           # This would use the Hot Aisle client to terminate the instance
89 | 


--------------------------------------------------------------------------------
/dashboard/assets/css/main.css:
--------------------------------------------------------------------------------
  1 | @tailwind base;
  2 | @tailwind components;
  3 | @tailwind utilities;
  4 | 
  5 | /* Import aggressive overscroll fix */
  6 | /* @import './overscroll-fix.css'; */
  7 | 
  8 | /* Custom styles for GPU Kill Dashboard */
  9 | @layer base {
 10 |   /* Force dark background on all elements */
 11 |   *, *::before, *::after {
 12 |     box-sizing: border-box;
 13 |   }
 14 |   
 15 |   html {
 16 |     font-family: 'Inter', system-ui, sans-serif;
 17 |     background: #000000 !important;
 18 |     background-color: #000000 !important;
 19 |     overscroll-behavior: none;
 20 |     -webkit-overflow-scrolling: touch;
 21 |     height: 100%;
 22 |     width: 100%;
 23 |   }
 24 |   
 25 |   body {
 26 |     background: #000000 !important;
 27 |     background-color: #000000 !important;
 28 |     overscroll-behavior: none;
 29 |     margin: 0;
 30 |     padding: 0;
 31 |     min-height: 100vh;
 32 |     height: 100%;
 33 |     width: 100%;
 34 |     overflow-x: hidden;
 35 |     position: relative;
 36 |   }
 37 |   
 38 |   /* Target the Nuxt root element */
 39 |   #__nuxt {
 40 |     background: #000000 !important;
 41 |     background-color: #000000 !important;
 42 |     min-height: 100vh;
 43 |     height: 100%;
 44 |     width: 100%;
 45 |   }
 46 |   
 47 |   /* Prevent any white backgrounds from showing through */
 48 |   div, main, section, article, header, footer, nav, aside {
 49 |     background-color: transparent !important;
 50 |   }
 51 |   
 52 |   /* Fix overscroll bounce on all platforms */
 53 |   html, body, #__nuxt {
 54 |     overscroll-behavior: none !important;
 55 |     overscroll-behavior-y: none !important;
 56 |     overscroll-behavior-x: none !important;
 57 |   }
 58 |   
 59 |   /* iOS specific fixes */
 60 |   @supports (-webkit-touch-callout: none) {
 61 |     html, body {
 62 |       position: fixed;
 63 |       height: 100%;
 64 |       width: 100%;
 65 |       overflow: hidden;
 66 |     }
 67 |     
 68 |     #__nuxt {
 69 |       position: fixed;
 70 |       top: 0;
 71 |       left: 0;
 72 |       right: 0;
 73 |       bottom: 0;
 74 |       overflow-y: auto;
 75 |       -webkit-overflow-scrolling: touch;
 76 |     }
 77 |   }
 78 |   
 79 |   /* Additional overscroll fixes */
 80 |   .overscroll-none {
 81 |     overscroll-behavior: none !important;
 82 |   }
 83 |   
 84 |   /* Prevent rubber band effect */
 85 |   .no-bounce {
 86 |     overscroll-behavior-y: none !important;
 87 |     -webkit-overflow-scrolling: touch;
 88 |   }
 89 | }
 90 | 
 91 | @layer components {
 92 |   .gpu-card {
 93 |     @apply bg-white dark:bg-gray-800 rounded-lg shadow-sm border border-gray-200 dark:border-gray-700 p-4 hover:shadow-md transition-shadow;
 94 |   }
 95 |   
 96 |   .status-online {
 97 |     @apply bg-green-100 text-green-800 dark:bg-green-900 dark:text-green-200;
 98 |   }
 99 |   
100 |   .status-offline {
101 |     @apply bg-red-100 text-red-800 dark:bg-red-900 dark:text-red-200;
102 |   }
103 |   
104 |   .status-degraded {
105 |     @apply bg-yellow-100 text-yellow-800 dark:bg-yellow-900 dark:text-yellow-200;
106 |   }
107 |   
108 |   .metric-card {
109 |     @apply bg-gradient-to-br from-blue-50 to-indigo-100 dark:from-gray-800 dark:to-gray-700 rounded-lg p-4 border border-blue-200 dark:border-gray-600;
110 |   }
111 |   
112 |   .utilization-bar {
113 |     @apply w-full bg-gray-200 dark:bg-gray-700 rounded-full h-2.5;
114 |   }
115 |   
116 |   .utilization-fill {
117 |     @apply h-2.5 rounded-full transition-all duration-300;
118 |   }
119 |   
120 |   .utilization-low {
121 |     @apply bg-green-500;
122 |   }
123 |   
124 |   .utilization-medium {
125 |     @apply bg-yellow-500;
126 |   }
127 |   
128 |   .utilization-high {
129 |     @apply bg-red-500;
130 |   }
131 | }
132 | 


--------------------------------------------------------------------------------
/.github/workflows/gpu-testing.yml:
--------------------------------------------------------------------------------
  1 | name: GPU Hardware Testing
  2 | 
  3 | on:
  4 |   push:
  5 |     branches: [main, develop]
  6 |   pull_request:
  7 |     branches: [main]
  8 |   workflow_dispatch:
  9 |     inputs:
 10 |       gpu_vendor:
 11 |         description: 'GPU vendor to test'
 12 |         required: true
 13 |         default: 'all'
 14 |         type: choice
 15 |         options:
 16 |           - all
 17 |           - nvidia
 18 |           - amd
 19 |           - intel
 20 |           - apple
 21 | 
 22 | permissions:
 23 |   contents: read
 24 | 
 25 | env:
 26 |   RUST_BACKTRACE: 1
 27 |   RUST_LOG: info
 28 | 
 29 | jobs:
 30 |   # Cross-platform compatibility tests
 31 |   cross-platform-tests:
 32 |     name: Cross-Platform Tests
 33 |     runs-on: ${{ matrix.os }}
 34 |     strategy:
 35 |       matrix:
 36 |         os: [ubuntu-22.04, macos-13, windows-2022]
 37 |         include:
 38 |           - os: ubuntu-22.04
 39 |             install_deps: |
 40 |               sudo apt-get update
 41 |               sudo apt-get install -y build-essential libssl-dev pkg-config
 42 |           - os: macos-13
 43 |             install_deps: |
 44 |               xcode-select --install || true
 45 |           - os: windows-2022
 46 |             install_deps: |
 47 |               # Windows dependencies handled by vcpkg
 48 | 
 49 |     steps:
 50 |       - name: Checkout code
 51 |         uses: actions/checkout@v4
 52 | 
 53 |       - name: Install Rust
 54 |         uses: dtolnay/rust-toolchain@stable
 55 |         with:
 56 |           components: rustfmt, clippy
 57 | 
 58 |       - name: Install system dependencies
 59 |         run: ${{ matrix.install_deps }}
 60 | 
 61 |       - name: Build and test
 62 |         run: |
 63 |           cargo build --release
 64 |           cargo test --features mock_nvml
 65 |           cargo clippy --all-targets --all-features -- -D warnings
 66 |           cargo fmt --all -- --check
 67 | 
 68 |   # Security and compliance tests
 69 |   security-tests:
 70 |     name: Security Tests
 71 |     runs-on: ubuntu-22.04
 72 |     
 73 |     steps:
 74 |       - name: Checkout code
 75 |         uses: actions/checkout@v4
 76 | 
 77 |       - name: Install Rust
 78 |         uses: dtolnay/rust-toolchain@stable
 79 | 
 80 |       - name: Install security tools
 81 |         run: |
 82 |           sudo apt-get update
 83 |           sudo apt-get install -y build-essential libssl-dev pkg-config
 84 |           cargo install cargo-audit
 85 |           cargo install cargo-deny
 86 | 
 87 |       - name: Security audit
 88 |         run: |
 89 |           cargo audit
 90 |           cargo deny check
 91 | 
 92 |       - name: Build with security flags
 93 |         run: |
 94 |           RUSTFLAGS="-C target-cpu=native" cargo build --release
 95 |           strip target/release/gpukill
 96 | 
 97 |   # Documentation and API tests
 98 |   api-tests:
 99 |     name: API Tests
100 |     runs-on: ubuntu-22.04
101 |     
102 |     steps:
103 |       - name: Checkout code
104 |         uses: actions/checkout@v4
105 | 
106 |       - name: Install Rust
107 |         uses: dtolnay/rust-toolchain@stable
108 | 
109 |       - name: Install dependencies
110 |         run: |
111 |           sudo apt-get update
112 |           sudo apt-get install -y build-essential libssl-dev pkg-config
113 | 
114 |       - name: Test MCP server
115 |         run: |
116 |           cargo build --release -p gpukill-mcp
117 |           # Test MCP server startup
118 |           timeout 10 ./target/release/gpukill-mcp || true
119 | 
120 |       - name: Test HTTP server
121 |         run: |
122 |           cargo build --release
123 |           # Test HTTP server startup
124 |           timeout 10 ./target/release/gpukill --server --server-port 8080 || true


--------------------------------------------------------------------------------
/dashboard/README.md:
--------------------------------------------------------------------------------
 1 | # GPU Kill Dashboard
 2 | 
 3 | A modern, responsive dashboard for monitoring GPU clusters built with Nuxt.js and Tailwind CSS.
 4 | 
 5 | ## Features
 6 | 
 7 | - **Real-time Cluster Monitoring**: Live updates via WebSocket
 8 | - **Magic Moment**: Instant visibility into GPU contention and blocked resources
 9 | - **Rogue Detection**: Security monitoring with threat detection and risk scoring
10 | - **Guard Mode Management**: Policy enforcement with user, group, and GPU policies
11 | - **Auto-refresh**: Automatic data updates with manual refresh controls
12 | - **Data Persistence**: Policy data saved locally across page refreshes
13 | - **Interactive Controls**: Toggle switches for enforcement modes
14 | - **Policy Management**: Complete CRUD operations for User, Group, and GPU policies
15 | - **Policy Testing**: Built-in policy simulation and testing interface
16 | 
17 | ## Quick Start
18 | 
19 | 1. **Start the GPU Kill Coordinator Server**:
20 |    ```bash
21 |    cd /path/to/gpu-kill
22 |    ./target/release/gpukill --server --server-port 8080
23 |    ```
24 | 
25 | 2. **Start the Dashboard**:
26 |    ```bash
27 |    cd dashboard
28 |    npm install  # First time only
29 |    npm run dev
30 |    ```
31 | 
32 | 3. **Open your browser**:
33 |    - Dashboard: http://localhost:3000
34 |    - API: http://localhost:8080
35 | 
36 | ## Dashboard Views
37 | 
38 | ### Overview Page
39 | - **Cluster Statistics**: Total nodes, GPUs, memory, and average utilization
40 | - **Real-time Metrics**: Live indicators with auto-refresh
41 | - **Magic Moment**: GPU contention analysis with blocked resources
42 | - **Top Users**: Ranked list of users by GPU memory consumption
43 | - **Node Details**: Individual node status and health information
44 | 
45 | ### Detection Page
46 | - **Threat Detection**: Real-time security monitoring
47 | - **Risk Scoring**: Confidence-based threat assessment
48 | - **Crypto Miner Detection**: Identifies mining software and patterns
49 | - **Suspicious Processes**: Flags unusual process behavior
50 | - **Resource Abuse Monitoring**: Detects excessive memory usage
51 | - **Interactive Scanning**: Manual scan controls with loading states
52 | 
53 | ### Guard Page
54 | - **Policy Management**: User, Group, and GPU policy configuration
55 | - **Enforcement Controls**: Soft/hard enforcement toggle switches
56 | - **Policy Statistics**: Modern gradient cards showing policy counts
57 | - **Visual Tables**: Clean display of all policies with action buttons
58 | - **Modal Forms**: Intuitive policy creation with validation
59 | - **Policy Testing**: Built-in simulation and testing interface
60 | - **Data Persistence**: Policy data saved locally across refreshes
61 | 
62 | ## Configuration
63 | 
64 | The dashboard automatically connects to the GPU Kill coordinator API. You can configure the API endpoint:
65 | 
66 | ```bash
67 | # Set custom API base URL
68 | export API_BASE=http://your-server:8080
69 | npm run dev
70 | ```
71 | 
72 | ## Development
73 | 
74 | ```bash
75 | # Install dependencies
76 | npm install
77 | 
78 | # Start development server
79 | npm run dev
80 | 
81 | # Build for production
82 | npm run build
83 | 
84 | # Preview production build
85 | npm run preview
86 | ```
87 | 
88 | ## API Integration
89 | 
90 | The dashboard connects to the GPU Kill coordinator API endpoints:
91 | 
92 | - `GET /api/cluster/snapshot` - Cluster overview data
93 | - `GET /api/cluster/contention` - Magic Moment analysis
94 | - `GET /api/cluster/rogue` - Rogue detection results
95 | - `GET /api/guard/config` - Guard Mode configuration
96 | - `GET /api/guard/status` - Guard Mode status
97 | - `POST /api/guard/toggle-dry-run` - Toggle dry-run mode
98 | - `POST /api/guard/test-policies` - Test policy enforcement
99 | - `WS /ws` - WebSocket for real-time updates


--------------------------------------------------------------------------------
/scripts/test-hotaisle-integration-simple.sh:
--------------------------------------------------------------------------------
  1 | #!/bin/bash
  2 | 
  3 | # GPU Kill - Hot Aisle Integration Test Script (CI-friendly version)
  4 | # This script tests the Hot Aisle integration without requiring actual API access
  5 | 
  6 | # Colors for output
  7 | RED='\033[0;31m'
  8 | GREEN='\033[0;32m'
  9 | YELLOW='\033[1;33m'
 10 | BLUE='\033[0;34m'
 11 | NC='\033[0m' # No Color
 12 | 
 13 | # Logging functions
 14 | log_info() {
 15 |     echo -e "${BLUE}[INFO]${NC} $1"
 16 | }
 17 | 
 18 | log_success() {
 19 |     echo -e "${GREEN}[SUCCESS]${NC} $1"
 20 | }
 21 | 
 22 | log_warning() {
 23 |     echo -e "${YELLOW}[WARNING]${NC} $1"
 24 | }
 25 | 
 26 | log_error() {
 27 |     echo -e "${RED}[ERROR]${NC} $1"
 28 | }
 29 | 
 30 | # Test results
 31 | TESTS_PASSED=0
 32 | TESTS_FAILED=0
 33 | 
 34 | # Function to run a test
 35 | run_test() {
 36 |     local test_name="$1"
 37 |     local test_command="$2"
 38 |     
 39 |     log_info "Running test: $test_name"
 40 |     log_info "Command: $test_command"
 41 |     
 42 |     if eval "$test_command"; then
 43 |         log_success "✅ $test_name passed"
 44 |         ((TESTS_PASSED++))
 45 |     else
 46 |         log_error "❌ $test_name failed"
 47 |         log_error "Command that failed: $test_command"
 48 |         ((TESTS_FAILED++))
 49 |     fi
 50 |     echo
 51 | }
 52 | 
 53 | # Main test function
 54 | main() {
 55 |     log_info "Starting Hot Aisle Integration Tests (Simple Version)"
 56 |     echo "========================================"
 57 |     
 58 |     # Debug information
 59 |     log_info "Environment:"
 60 |     log_info "  CI: ${CI:-false}"
 61 |     log_info "  PWD: $(pwd)"
 62 |     log_info "  USER: ${USER:-unknown}"
 63 |     echo
 64 |     
 65 |     # Test 1: Check if we're in the right directory
 66 |     run_test "Project Root Check" '[[ -f "Cargo.toml" ]]'
 67 |     
 68 |     # Test 2: Check if Rust is available
 69 |     run_test "Rust Toolchain Check" 'command -v cargo > /dev/null 2>&1'
 70 |     
 71 |     # Test 3: Check if git is available
 72 |     run_test "Git Check" 'command -v git > /dev/null 2>&1'
 73 |     
 74 |     # Test 4: Build without Hot Aisle feature
 75 |     run_test "Build Without Hot Aisle Feature" 'cargo build --release'
 76 |     
 77 |     # Test 5: Build with Hot Aisle feature
 78 |     run_test "Build With Hot Aisle Feature" 'cargo build --release --features hotaisle'
 79 |     
 80 |     # Test 6: Check if Hot Aisle client compiles
 81 |     run_test "Hot Aisle Client Compilation" 'cargo check --features hotaisle'
 82 |     
 83 |     # Test 7: Validate test script syntax
 84 |     run_test "Test Script Syntax Check" 'bash -n scripts/run-gpu-tests.sh'
 85 |     
 86 |     # Test 8: Check if workflow file exists
 87 |     run_test "Workflow File Exists" '[[ -f ".github/workflows/hotaisle-gpu-testing.yml" ]]'
 88 |     
 89 |     # Test 9: Check if documentation exists
 90 |     run_test "Documentation Exists" '[[ -f "docs/HOTAISLE_INTEGRATION.md" ]]'
 91 |     
 92 |     # Test 10: Validate Cargo.toml has hotaisle feature
 93 |     run_test "Hot Aisle Feature in Cargo.toml" 'grep -q "hotaisle = \\[\\]" Cargo.toml'
 94 |     
 95 |     # Test 11: Check if lib.rs has conditional compilation
 96 |     run_test "Conditional Compilation in lib.rs" 'grep -q "#\\[cfg(feature = \"hotaisle\")\\]" src/lib.rs'
 97 |     
 98 |     # Summary
 99 |     echo "========================================"
100 |     log_info "Test Summary:"
101 |     log_success "✅ Tests Passed: $TESTS_PASSED"
102 |     if [[ $TESTS_FAILED -gt 0 ]]; then
103 |         log_error "❌ Tests Failed: $TESTS_FAILED"
104 |         exit 1
105 |     else
106 |         log_success "✅ Tests Failed: $TESTS_FAILED"
107 |     fi
108 |     
109 |     log_success "🎉 All integration tests passed!"
110 |     log_info "The Hot Aisle integration is ready for use with a valid API key."
111 |     exit 0
112 | }
113 | 
114 | # Run main function
115 | main "$@"
116 | 


--------------------------------------------------------------------------------
/scripts/install.sh:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env sh
  2 | set -e
  3 | 
  4 | # gpukill install script: fetch prebuilt GitHub release binary
  5 | 
  6 | REPO_OWNER="treadiehq"
  7 | REPO_NAME="gpu-kill"
  8 | INSTALL_DIR_DEFAULT="$HOME/.local/bin"
  9 | BIN_NAME="gpukill"
 10 | 
 11 | # Flags
 12 | VERSION=""
 13 | BIN_DIR=""
 14 | YES="0"
 15 | INSECURE="0"
 16 | 
 17 | usage() {
 18 |   echo "Usage: curl -fsSL https://get.gpukill.sh | sh [-s] -- [--version vX.Y.Z] [--bin-dir DIR] [--yes] [--insecure]" >&2
 19 | }
 20 | 
 21 | while [ $# -gt 0 ]; do
 22 |   case "$1" in
 23 |     --version) VERSION="$2"; shift 2 ;;
 24 |     --bin-dir) BIN_DIR="$2"; shift 2 ;;
 25 |     --yes|-y) YES="1"; shift ;;
 26 |     --insecure) INSECURE="1"; shift ;;
 27 |     -h|--help) usage; exit 0 ;;
 28 |     *) echo "Unknown option: $1" >&2; usage; exit 1 ;;
 29 |   esac
 30 | done
 31 | 
 32 | detect_os() {
 33 |   uname_s=$(uname -s 2>/dev/null || echo unknown)
 34 |   case "$uname_s" in
 35 |     Linux) echo linux ;;
 36 |     Darwin) echo macos ;;
 37 |     *) echo unsupported ;;
 38 |   esac
 39 | }
 40 | 
 41 | detect_arch() {
 42 |   uname_m=$(uname -m 2>/dev/null || echo unknown)
 43 |   case "$uname_m" in
 44 |     x86_64|amd64) echo x86_64 ;;
 45 |     aarch64|arm64) echo aarch64 ;;
 46 |     *) echo unsupported ;;
 47 |   esac
 48 | }
 49 | 
 50 | need_cmd() { command -v "$1" >/dev/null 2>&1 || { echo "Missing required command: $1" >&2; exit 1; }; }
 51 | 
 52 | need_cmd curl
 53 | need_cmd uname
 54 | need_cmd mkdir
 55 | need_cmd chmod
 56 | 
 57 | OS=$(detect_os)
 58 | ARCH=$(detect_arch)
 59 | if [ "$OS" = "unsupported" ] || [ "$ARCH" = "unsupported" ]; then
 60 |   echo "Unsupported platform: OS=$OS ARCH=$ARCH" >&2
 61 |   exit 1
 62 | fi
 63 | 
 64 | BIN_DIR=${BIN_DIR:-$INSTALL_DIR_DEFAULT}
 65 | mkdir -p "$BIN_DIR"
 66 | 
 67 | # Resolve version
 68 | API="https://api.github.com/repos/$REPO_OWNER/$REPO_NAME/releases/latest"
 69 | if [ -n "$VERSION" ]; then
 70 |   API="https://api.github.com/repos/$REPO_OWNER/$REPO_NAME/releases/tags/$VERSION"
 71 | fi
 72 | 
 73 | echo "Resolving release…"
 74 | TAG=$(curl -fsSL "$API" | sed -n 's/  \"tag_name\": \"\(.*\)\",/\1/p' | head -n1)
 75 | if [ -z "$TAG" ]; then
 76 |   echo "Failed to resolve release tag" >&2
 77 |   exit 1
 78 | fi
 79 | 
 80 | case "$OS-$ARCH" in
 81 |   linux-x86_64) ASSET="gpukill-$TAG-linux-x86_64" ;;
 82 |   linux-aarch64) ASSET="gpukill-$TAG-linux-aarch64" ;;
 83 |   macos-x86_64) ASSET="gpukill-$TAG-macos-x86_64" ;;
 84 |   macos-aarch64) ASSET="gpukill-$TAG-macos-aarch64" ;;
 85 | esac
 86 | 
 87 | URL_BASE="https://github.com/$REPO_OWNER/$REPO_NAME/releases/download/$TAG"
 88 | BIN_URL="$URL_BASE/$ASSET"
 89 | SUMS_URL="$URL_BASE/SHA256SUMS"
 90 | 
 91 | TMPDIR=${TMPDIR:-/tmp}
 92 | TMP_BIN="$TMPDIR/$ASSET"
 93 | TMP_SUMS="$TMPDIR/${REPO_NAME}_SHA256SUMS"
 94 | 
 95 | echo "Downloading binary: $BIN_URL"
 96 | curl -fsSL "$BIN_URL" -o "$TMP_BIN"
 97 | 
 98 | echo "Downloading checksums: $SUMS_URL"
 99 | curl -fsSL "$SUMS_URL" -o "$TMP_SUMS" || true
100 | 
101 | if [ -s "$TMP_SUMS" ]; then
102 |   need_cmd shasum || need_cmd sha256sum
103 |   if command -v shasum >/dev/null 2>&1; then
104 |     SUM=$(shasum -a 256 "$TMP_BIN" | awk '{print $1}')
105 |   else
106 |     SUM=$(sha256sum "$TMP_BIN" | awk '{print $1}')
107 |   fi
108 |   if ! grep -q "$SUM" "$TMP_SUMS"; then
109 |     if [ "$INSECURE" != "1" ]; then
110 |       echo "Checksum verification failed" >&2
111 |       exit 1
112 |     else
113 |       echo "WARNING: checksum verification skipped (--insecure)" >&2
114 |     fi
115 |   fi
116 | else
117 |   echo "WARNING: no checksum file found in release; proceeding" >&2
118 | fi
119 | 
120 | DEST="$BIN_DIR/$BIN_NAME"
121 | mv "$TMP_BIN" "$DEST"
122 | chmod +x "$DEST"
123 | 
124 | if ! printf %s ":$PATH:" | grep -q ":$BIN_DIR:"; then
125 |   echo "Installed to $DEST but $BIN_DIR is not in PATH" >&2
126 |   echo "Add this to your shell rc: export PATH=\"$BIN_DIR:\$PATH\"" >&2
127 | fi
128 | 
129 | echo "✅ Installed $BIN_NAME $TAG to $DEST"
130 | "$DEST" --version || true
131 | 
132 | 


--------------------------------------------------------------------------------
/dashboard/components/AppSidebar.vue:
--------------------------------------------------------------------------------
  1 | <template>
  2 |   <aside class="w-64 h-screen flex flex-col sticky top-0 overflow-y-auto">
  3 |     <!-- Sidebar Header -->
  4 |     <div class="p-4 py-3 flex items-center justify-between">
  5 |       <div class="flex items-center space-x-1">
  6 |         <img src="/assets/img/logo.png" alt="GPU Kill Logo" class="w-6 h-6" />
  7 |         <div>
  8 |           <h1 class="text-sm font-semibold text-white">GPU Kill</h1>
  9 |         </div>
 10 |       </div>
 11 |       <div class="flex items-center space-x-2 px-2 py-1.5">
 12 |         <div class="w-2 h-2 bg-green-400 rounded-full animate-pulse-slow"></div>
 13 |         <span class="text-xs text-gray-400 font-medium">Connected</span>
 14 |       </div>
 15 |     </div>
 16 | 
 17 |     <!-- Navigation -->
 18 |     <nav class="flex-1 p-4 space-y-2 overflow-y-auto">
 19 |       <a 
 20 |         href="/" 
 21 |         @click="setActiveTab('overview')"
 22 |         :class="[
 23 |           'flex items-center space-x-3 text-sm px-3 py-2 border border-transparent rounded-xl transition-colors',
 24 |           activeTab === 'overview' 
 25 |             ? 'bg-gray-500/10 border !border-gray-500/10 text-white' 
 26 |             : 'text-white hover:bg-gray-500/10 hover:border-gray-500/10'
 27 |         ]"
 28 |       >
 29 |         <ChartBarIcon :class="[
 30 |           'w-4 h-4',
 31 |           activeTab === 'overview' ? 'text-gray-500' : 'text-gray-500'
 32 |         ]" />
 33 |         <span>Overview</span>
 34 |       </a>
 35 |       <a 
 36 |         href="/detection" 
 37 |         @click="setActiveTab('detection')"
 38 |         :class="[
 39 |           'flex items-center space-x-3 text-sm px-3 py-2 border border-transparent rounded-xl transition-colors',
 40 |           activeTab === 'detection' 
 41 |             ? 'bg-gray-500/10 border !border-gray-500/10 text-white' 
 42 |             : 'text-white hover:bg-gray-500/10 hover:border-gray-500/10'
 43 |         ]"
 44 |       >
 45 |         <ShieldExclamationIcon :class="[
 46 |           'w-4 h-4',
 47 |           activeTab === 'detection' ? 'text-gray-500' : 'text-gray-500'
 48 |         ]" />
 49 |         <span>Detection</span>
 50 |       </a>
 51 |       <a 
 52 |         href="/guard" 
 53 |         @click="setActiveTab('guard')"
 54 |         :class="[
 55 |           'flex items-center space-x-3 text-sm px-3 py-2 border border-transparent rounded-xl transition-colors',
 56 |           activeTab === 'guard' 
 57 |             ? 'bg-gray-500/10 border border-gray-500/10 text-white' 
 58 |             : 'text-white hover:bg-gray-500/10 hover:border-gray-500/10'
 59 |         ]"
 60 |       >
 61 |         <ShieldCheckIcon :class="[
 62 |           'w-4 h-4',
 63 |           activeTab === 'guard' ? 'text-gray-500' : 'text-gray-500'
 64 |         ]" />
 65 |         <span>Guard</span>
 66 |       </a>
 67 |     </nav>
 68 |   </aside>
 69 | </template>
 70 | 
 71 | <script setup>
 72 | import { ref, onMounted, watch } from 'vue'
 73 | import {
 74 |   ChartBarIcon,
 75 |   ShieldExclamationIcon,
 76 |   ShieldCheckIcon
 77 | } from '@heroicons/vue/24/solid'
 78 | 
 79 | // Props
 80 | const props = defineProps({
 81 |   currentTab: {
 82 |     type: String,
 83 |     default: 'overview'
 84 |   }
 85 | })
 86 | 
 87 | // Emits
 88 | const emit = defineEmits(['tab-change'])
 89 | 
 90 | // Reactive data
 91 | const activeTab = ref(props.currentTab)
 92 | 
 93 | // Methods
 94 | const setActiveTab = (tab) => {
 95 |   activeTab.value = tab
 96 |   emit('tab-change', tab)
 97 | }
 98 | 
 99 | // Watch for prop changes
100 | watch(() => props.currentTab, (newTab) => {
101 |   activeTab.value = newTab
102 | })
103 | 
104 | // Set initial active tab based on current route
105 | onMounted(() => {
106 |   const route = useRoute()
107 |   if (route.path === '/') {
108 |     activeTab.value = 'overview'
109 |   } else if (route.path === '/detection') {
110 |     activeTab.value = 'detection'
111 |   } else if (route.path === '/guard') {
112 |     activeTab.value = 'guard'
113 |   }
114 | })
115 | </script>
116 | 


--------------------------------------------------------------------------------
/mcp/README.md:
--------------------------------------------------------------------------------
  1 | # GPU Kill MCP Server
  2 | 
  3 | A MCP server for GPU Kill, enabling AI assistants and other tools to interact with GPU management functionality through a standardized interface.
  4 | 
  5 | ## Features
  6 | 
  7 | ### Resources (Read-only data)
  8 | - **gpu://list** - Current GPU status and utilization
  9 | - **gpu://processes** - Currently running GPU processes  
 10 | - **gpu://audit** - Historical GPU usage data
 11 | - **gpu://policies** - Current Guard Mode policies
 12 | - **gpu://rogue-detection** - Security scan results and threats
 13 | 
 14 | ### Tools (Actions)
 15 | - **kill_gpu_process** - Kill a GPU process by PID
 16 | - **reset_gpu** - Reset a GPU by ID
 17 | - **scan_rogue_activity** - Scan for suspicious GPU activity
 18 | - **create_user_policy** - Create a user policy for Guard Mode
 19 | - **get_gpu_status** - Get detailed status of a specific GPU
 20 | - **kill_processes_by_name** - Kill all processes matching a name pattern
 21 | 
 22 | ## Quick Start
 23 | 
 24 | ### Build and Run
 25 | 
 26 | ```bash
 27 | # Build the MCP server
 28 | cargo build --release -p gpukill-mcp
 29 | 
 30 | # Run the MCP server
 31 | cargo run --release -p gpukill-mcp
 32 | 
 33 | # Or run with custom port
 34 | MCP_PORT=3001 cargo run --release -p gpukill-mcp
 35 | ```
 36 | 
 37 | ### Using with AI Assistants
 38 | 
 39 | The MCP server exposes GPU management capabilities through a JSON-RPC interface that AI assistants can use to:
 40 | 
 41 | - Monitor GPU usage and performance
 42 | - Kill stuck or problematic processes
 43 | - Reset crashed GPUs
 44 | - Scan for security threats
 45 | - Manage resource policies
 46 | - Automate GPU operations
 47 | 
 48 | ### Example Usage
 49 | 
 50 | ```bash
 51 | # Start the MCP server
 52 | cargo run --release -p gpukill-mcp
 53 | 
 54 | # The server will be available at http://localhost:3001/mcp
 55 | # AI assistants can connect and use the available tools and resources
 56 | ```
 57 | 
 58 | ### Natural Language Examples
 59 | 
 60 | Ask your AI assistant to use the MCP tools with natural language:
 61 | 
 62 | ```text
 63 | What GPUs do I have and what's their current usage?
 64 | ```
 65 | 
 66 | ```text
 67 | Kill the Python process that's stuck on GPU 0
 68 | ```
 69 | 
 70 | ```text
 71 | Kill all training processes that are using too much GPU memory
 72 | ```
 73 | 
 74 | ```text
 75 | Show me GPU usage and kill any stuck processes
 76 | ```
 77 | 
 78 | ```text
 79 | Scan for crypto miners and suspicious activity
 80 | ```
 81 | 
 82 | ```text
 83 | Create a policy to limit user memory usage to 8GB
 84 | ```
 85 | 
 86 | ```text
 87 | Reset GPU 1 because it's not responding
 88 | ```
 89 | 
 90 | ```text
 91 | What processes are currently using my GPUs?
 92 | ```
 93 | 
 94 | ## API Endpoints
 95 | 
 96 | ### HTTP Interface
 97 | 
 98 | - **POST /mcp** - Main MCP JSON-RPC endpoint
 99 | - **GET /health** - Health check endpoint
100 | 
101 | ### MCP Methods
102 | 
103 | - **initialize** - Initialize the MCP connection
104 | - **resources/list** - List available resources
105 | - **resources/read** - Read resource contents
106 | - **tools/list** - List available tools
107 | - **tools/call** - Execute a tool
108 | 
109 | ## Configuration
110 | 
111 | The MCP server can be configured using environment variables:
112 | 
113 | - **MCP_PORT** - Port to listen on (default: 3001)
114 | - **RUST_LOG** - Logging level (default: info)
115 | 
116 | ## Integration
117 | 
118 | This MCP server enables AI assistants to:
119 | 
120 | 1. **Monitor GPU Health**: Check GPU status, utilization, and memory usage
121 | 2. **Manage Processes**: Kill problematic processes or reset GPUs
122 | 3. **Security Monitoring**: Scan for crypto miners and suspicious activity
123 | 4. **Policy Management**: Create and manage resource policies
124 | 5. **Automation**: Automate routine GPU management tasks
125 | 
126 | ## Development
127 | 
128 | ```bash
129 | # Run in development mode
130 | cargo run -p gpukill-mcp
131 | 
132 | # Run with debug logging
133 | RUST_LOG=debug cargo run -p gpukill-mcp
134 | 
135 | # Test the server
136 | curl -X POST http://localhost:3001/mcp \
137 |   -H "Content-Type: application/json" \
138 |   -d '{"jsonrpc":"2.0","id":"1","method":"tools/list","params":{}}'
139 | ```
140 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
  1 | # Functional Source License, Version 1.1, MIT Future License
  2 | 
  3 | ## Abbreviation
  4 | 
  5 | FSL-1.1-MIT
  6 | 
  7 | ## Notice
  8 | 
  9 | Copyright (c) 2024 Treadie, Inc
 10 | 
 11 | ## Terms and Conditions
 12 | 
 13 | ### Licensor ("We")
 14 | 
 15 | The party offering the Software under these Terms and Conditions.
 16 | 
 17 | ### The Software
 18 | 
 19 | The "Software" is each version of the software that we make available under
 20 | these Terms and Conditions, as indicated by our inclusion of these Terms and
 21 | Conditions with the Software.
 22 | 
 23 | ### License Grant
 24 | 
 25 | Subject to your compliance with this License Grant and the Patents,
 26 | Redistribution and Trademark clauses below, we hereby grant you the right to
 27 | use, copy, modify, create derivative works, publicly perform, publicly display
 28 | and redistribute the Software for any Permitted Purpose identified below.
 29 | 
 30 | ### Permitted Purpose
 31 | 
 32 | A Permitted Purpose is any purpose other than a Competing Use. A Competing Use
 33 | means making the Software available to others in a commercial product or
 34 | service that:
 35 | 
 36 | 1. substitutes for the Software;
 37 | 
 38 | 2. substitutes for any other product or service we offer using the Software
 39 |    that exists as of the date we make the Software available; or
 40 | 
 41 | 3. offers the same or substantially similar functionality as the Software.
 42 | 
 43 | Permitted Purposes specifically include using the Software:
 44 | 
 45 | 1. for your internal use and access;
 46 | 
 47 | 2. for non-commercial education;
 48 | 
 49 | 3. for non-commercial research; and
 50 | 
 51 | 4. in connection with professional services that you provide to a licensee
 52 |    using the Software in accordance with these Terms and Conditions.
 53 | 
 54 | ### Patents
 55 | 
 56 | To the extent your use for a Permitted Purpose would necessarily infringe our
 57 | patents, the license grant above includes a license under our patents. If you
 58 | make a claim against any party that the Software infringes or contributes to
 59 | the infringement of any patent, then your patent license to the Software ends
 60 | immediately.
 61 | 
 62 | ### Redistribution
 63 | 
 64 | The Terms and Conditions apply to all copies, modifications and derivatives of
 65 | the Software.
 66 | 
 67 | If you redistribute any copies, modifications or derivatives of the Software,
 68 | you must include a copy of or a link to these Terms and Conditions and not
 69 | remove any copyright notices provided in or with the Software.
 70 | 
 71 | ### Disclaimer
 72 | 
 73 | THE SOFTWARE IS PROVIDED "AS IS" AND WITHOUT WARRANTIES OF ANY KIND, EXPRESS OR
 74 | IMPLIED, INCLUDING WITHOUT LIMITATION WARRANTIES OF FITNESS FOR A PARTICULAR
 75 | PURPOSE, MERCHANTABILITY, TITLE OR NON-INFRINGEMENT.
 76 | 
 77 | IN NO EVENT WILL WE HAVE ANY LIABILITY TO YOU ARISING OUT OF OR RELATED TO THE
 78 | SOFTWARE, INCLUDING INDIRECT, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES,
 79 | EVEN IF WE HAVE BEEN INFORMED OF THEIR POSSIBILITY IN ADVANCE.
 80 | 
 81 | ### Trademarks
 82 | 
 83 | Except for displaying the License Details and identifying us as the origin of
 84 | the Software, you have no right under these Terms and Conditions to use our
 85 | trademarks, trade names, service marks or product names.
 86 | 
 87 | ## Grant of Future License
 88 | 
 89 | We hereby irrevocably grant you an additional license to use the Software under
 90 | the MIT license that is effective on the second anniversary of the date we make
 91 | the Software available. On or after that date, you may use the Software under
 92 | the MIT license, in which case the following will apply:
 93 | 
 94 | Permission is hereby granted, free of charge, to any person obtaining a copy of
 95 | this software and associated documentation files (the "Software"), to deal in
 96 | the Software without restriction, including without limitation the rights to
 97 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
 98 | of the Software, and to permit persons to whom the Software is furnished to do
 99 | so, subject to the following conditions:
100 | 
101 | The above copyright notice and this permission notice shall be included in all
102 | copies or substantial portions of the Software.
103 | 
104 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
105 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
106 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
107 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
108 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
109 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
110 | SOFTWARE.


--------------------------------------------------------------------------------
/src/util.rs:
--------------------------------------------------------------------------------
  1 | use chrono::{DateTime, Local, Utc};
  2 | use std::time::{Duration, SystemTime};
  3 | 
  4 | /// Get the current hostname
  5 | pub fn get_hostname() -> String {
  6 |     hostname::get()
  7 |         .unwrap_or_else(|_| std::ffi::OsString::from("unknown"))
  8 |         .to_string_lossy()
  9 |         .to_string()
 10 | }
 11 | 
 12 | /// Format a timestamp as a human-readable string
 13 | #[allow(dead_code)]
 14 | pub fn format_timestamp(timestamp: SystemTime) -> String {
 15 |     let datetime: DateTime<Local> = timestamp.into();
 16 |     datetime.format("%Y-%m-%d %H:%M:%S").to_string()
 17 | }
 18 | 
 19 | /// Format a timestamp as ISO 8601 string
 20 | pub fn format_timestamp_iso(timestamp: SystemTime) -> String {
 21 |     let datetime: DateTime<Utc> = timestamp.into();
 22 |     datetime.format("%Y-%m-%dT%H:%M:%S%.3fZ").to_string()
 23 | }
 24 | 
 25 | /// Get current timestamp as ISO 8601 string
 26 | pub fn get_current_timestamp_iso() -> String {
 27 |     format_timestamp_iso(SystemTime::now())
 28 | }
 29 | 
 30 | /// Format duration as human-readable string
 31 | #[allow(dead_code)]
 32 | pub fn format_duration(duration: Duration) -> String {
 33 |     let total_seconds = duration.as_secs();
 34 |     let hours = total_seconds / 3600;
 35 |     let minutes = (total_seconds % 3600) / 60;
 36 |     let seconds = total_seconds % 60;
 37 | 
 38 |     if hours > 0 {
 39 |         format!("{}h {}m {}s", hours, minutes, seconds)
 40 |     } else if minutes > 0 {
 41 |         format!("{}m {}s", minutes, seconds)
 42 |     } else {
 43 |         format!("{}s", seconds)
 44 |     }
 45 | }
 46 | 
 47 | /// Format memory size in bytes to human-readable format
 48 | #[allow(dead_code)]
 49 | pub fn format_memory_size(bytes: u64) -> String {
 50 |     const UNITS: &[&str] = &["B", "KB", "MB", "GB", "TB"];
 51 |     const THRESHOLD: u64 = 1024;
 52 | 
 53 |     if bytes == 0 {
 54 |         return "0 B".to_string();
 55 |     }
 56 | 
 57 |     let mut size = bytes as f64;
 58 |     let mut unit_index = 0;
 59 | 
 60 |     while size >= THRESHOLD as f64 && unit_index < UNITS.len() - 1 {
 61 |         size /= THRESHOLD as f64;
 62 |         unit_index += 1;
 63 |     }
 64 | 
 65 |     if unit_index == 0 {
 66 |         format!("{} {}", bytes, UNITS[unit_index])
 67 |     } else {
 68 |         format!("{:.1} {}", size, UNITS[unit_index])
 69 |     }
 70 | }
 71 | 
 72 | /// Format memory size in MB to GiB
 73 | pub fn format_memory_mb_to_gib(mb: u32) -> String {
 74 |     let gib = mb as f64 / 1024.0;
 75 |     format!("{:.1}", gib)
 76 | }
 77 | 
 78 | /// Check if running on Linux
 79 | #[allow(dead_code)]
 80 | pub fn is_linux() -> bool {
 81 |     cfg!(target_os = "linux")
 82 | }
 83 | 
 84 | /// Check if running on macOS
 85 | #[allow(dead_code)]
 86 | pub fn is_macos() -> bool {
 87 |     cfg!(target_os = "macos")
 88 | }
 89 | 
 90 | /// Check if running on Windows
 91 | #[allow(dead_code)]
 92 | pub fn is_windows() -> bool {
 93 |     cfg!(target_os = "windows")
 94 | }
 95 | 
 96 | /// Get operating system name
 97 | #[allow(dead_code)]
 98 | pub fn get_os_name() -> &'static str {
 99 |     if is_linux() {
100 |         "Linux"
101 |     } else if is_macos() {
102 |         "macOS"
103 |     } else if is_windows() {
104 |         "Windows"
105 |     } else {
106 |         "Unknown"
107 |     }
108 | }
109 | 
110 | /// Truncate string to specified length with ellipsis
111 | pub fn truncate_string(s: &str, max_len: usize) -> String {
112 |     if s.len() <= max_len {
113 |         s.to_string()
114 |     } else {
115 |         format!("{}...", &s[..max_len.saturating_sub(3)])
116 |     }
117 | }
118 | 
119 | /// Parse process start time from system time
120 | #[allow(dead_code)]
121 | pub fn parse_process_start_time(start_time: SystemTime) -> String {
122 |     let now = SystemTime::now();
123 |     let duration = now.duration_since(start_time).unwrap_or_default();
124 |     format_duration(duration)
125 | }
126 | 
127 | #[cfg(test)]
128 | mod tests {
129 |     use super::*;
130 |     use std::time::Duration;
131 | 
132 |     #[test]
133 |     fn test_format_duration() {
134 |         assert_eq!(format_duration(Duration::from_secs(30)), "30s");
135 |         assert_eq!(format_duration(Duration::from_secs(90)), "1m 30s");
136 |         assert_eq!(format_duration(Duration::from_secs(3661)), "1h 1m 1s");
137 |     }
138 | 
139 |     #[test]
140 |     fn test_format_memory_size() {
141 |         assert_eq!(format_memory_size(0), "0 B");
142 |         assert_eq!(format_memory_size(1024), "1.0 KB");
143 |         assert_eq!(format_memory_size(1024 * 1024), "1.0 MB");
144 |         assert_eq!(format_memory_size(1024 * 1024 * 1024), "1.0 GB");
145 |     }
146 | 
147 |     #[test]
148 |     fn test_format_memory_mb_to_gib() {
149 |         assert_eq!(format_memory_mb_to_gib(0), "0.0");
150 |         assert_eq!(format_memory_mb_to_gib(1024), "1.0");
151 |         assert_eq!(format_memory_mb_to_gib(2048), "2.0");
152 |     }
153 | 
154 |     #[test]
155 |     fn test_truncate_string() {
156 |         assert_eq!(truncate_string("short", 10), "short");
157 |         assert_eq!(truncate_string("very long string", 10), "very lo...");
158 |         assert_eq!(truncate_string("abc", 3), "abc");
159 |     }
160 | 
161 |     #[test]
162 |     fn test_os_detection() {
163 |         // These tests will pass on the respective platforms
164 |         assert!(get_os_name() != "Unknown");
165 |     }
166 | }
167 | 


--------------------------------------------------------------------------------
/mcp/src/types.rs:
--------------------------------------------------------------------------------
  1 | //! MCP Protocol Types for GPU Kill
  2 | 
  3 | use serde::{Deserialize, Serialize};
  4 | use std::collections::HashMap;
  5 | 
  6 | /// MCP Request/Response types
  7 | #[derive(Debug, Serialize, Deserialize)]
  8 | #[serde(tag = "jsonrpc", rename = "2.0")]
  9 | pub struct JsonRpcRequest {
 10 |     pub id: String,
 11 |     pub method: String,
 12 |     pub params: Option<serde_json::Value>,
 13 | }
 14 | 
 15 | #[derive(Debug, Serialize, Deserialize)]
 16 | pub struct JsonRpcResponse {
 17 |     pub jsonrpc: String,
 18 |     pub id: String,
 19 |     #[serde(skip_serializing_if = "Option::is_none")]
 20 |     pub result: Option<serde_json::Value>,
 21 |     #[serde(skip_serializing_if = "Option::is_none")]
 22 |     pub error: Option<JsonRpcError>,
 23 | }
 24 | 
 25 | #[derive(Debug, Serialize, Deserialize)]
 26 | pub struct JsonRpcError {
 27 |     pub code: i32,
 28 |     pub message: String,
 29 |     #[serde(skip_serializing_if = "Option::is_none")]
 30 |     pub data: Option<serde_json::Value>,
 31 | }
 32 | 
 33 | /// MCP Protocol Messages
 34 | #[derive(Debug, Serialize, Deserialize)]
 35 | pub struct InitializeRequest {
 36 |     pub protocol_version: String,
 37 |     pub capabilities: ClientCapabilities,
 38 |     pub client_info: ClientInfo,
 39 | }
 40 | 
 41 | #[derive(Debug, Serialize, Deserialize)]
 42 | pub struct InitializeResponse {
 43 |     pub protocol_version: String,
 44 |     pub capabilities: ServerCapabilities,
 45 |     pub server_info: ServerInfo,
 46 | }
 47 | 
 48 | #[derive(Debug, Serialize, Deserialize)]
 49 | pub struct ClientCapabilities {
 50 |     #[serde(skip_serializing_if = "Option::is_none")]
 51 |     pub roots: Option<RootsCapability>,
 52 |     #[serde(skip_serializing_if = "Option::is_none")]
 53 |     pub sampling: Option<SamplingCapability>,
 54 | }
 55 | 
 56 | #[derive(Debug, Serialize, Deserialize)]
 57 | pub struct ServerCapabilities {
 58 |     #[serde(skip_serializing_if = "Option::is_none")]
 59 |     pub resources: Option<ResourcesCapability>,
 60 |     #[serde(skip_serializing_if = "Option::is_none")]
 61 |     pub tools: Option<ToolsCapability>,
 62 |     #[serde(skip_serializing_if = "Option::is_none")]
 63 |     pub logging: Option<LoggingCapability>,
 64 | }
 65 | 
 66 | #[derive(Debug, Serialize, Deserialize)]
 67 | pub struct ClientInfo {
 68 |     pub name: String,
 69 |     pub version: String,
 70 | }
 71 | 
 72 | #[derive(Debug, Serialize, Deserialize)]
 73 | pub struct ServerInfo {
 74 |     pub name: String,
 75 |     pub version: String,
 76 | }
 77 | 
 78 | #[derive(Debug, Serialize, Deserialize)]
 79 | pub struct RootsCapability {
 80 |     pub list_changed: Option<bool>,
 81 | }
 82 | 
 83 | #[derive(Debug, Serialize, Deserialize)]
 84 | pub struct SamplingCapability {}
 85 | 
 86 | #[derive(Debug, Serialize, Deserialize)]
 87 | pub struct ResourcesCapability {
 88 |     pub subscribe: Option<bool>,
 89 |     pub list_changed: Option<bool>,
 90 | }
 91 | 
 92 | #[derive(Debug, Serialize, Deserialize)]
 93 | pub struct ToolsCapability {
 94 |     pub list_changed: Option<bool>,
 95 | }
 96 | 
 97 | #[derive(Debug, Serialize, Deserialize)]
 98 | pub struct LoggingCapability {}
 99 | 
100 | /// Resource Types
101 | #[derive(Debug, Serialize, Deserialize)]
102 | pub struct Resource {
103 |     pub uri: String,
104 |     pub name: String,
105 |     pub description: Option<String>,
106 |     pub mime_type: Option<String>,
107 | }
108 | 
109 | #[derive(Debug, Serialize, Deserialize)]
110 | pub struct ResourceContents {
111 |     pub uri: String,
112 |     pub mime_type: Option<String>,
113 |     pub text: Option<String>,
114 |     pub blob: Option<String>, // Base64 encoded
115 | }
116 | 
117 | /// Tool Types
118 | #[derive(Debug, Serialize, Deserialize)]
119 | pub struct Tool {
120 |     pub name: String,
121 |     pub description: Option<String>,
122 |     pub input_schema: serde_json::Value,
123 | }
124 | 
125 | #[derive(Debug, Serialize, Deserialize)]
126 | pub struct ToolCall {
127 |     pub name: String,
128 |     pub arguments: Option<HashMap<String, serde_json::Value>>,
129 | }
130 | 
131 | #[derive(Debug, Serialize, Deserialize)]
132 | pub struct ToolResult {
133 |     pub content: Vec<ToolContent>,
134 |     pub is_error: Option<bool>,
135 | }
136 | 
137 | #[derive(Debug, Serialize, Deserialize)]
138 | pub struct ToolContent {
139 |     #[serde(rename = "type")]
140 |     pub content_type: String,
141 |     pub text: Option<String>,
142 |     #[serde(skip_serializing_if = "Option::is_none")]
143 |     pub data: Option<serde_json::Value>,
144 | }
145 | 
146 | /// GPU Kill specific types
147 | #[derive(Debug, Serialize, Deserialize)]
148 | pub struct GpuInfo {
149 |     pub id: u32,
150 |     pub name: String,
151 |     pub vendor: String,
152 |     pub memory_used: f64,
153 |     pub memory_total: f64,
154 |     pub utilization: f64,
155 |     pub temperature: Option<f64>,
156 |     pub power_usage: Option<f64>,
157 |     pub processes: Vec<GpuProcess>,
158 | }
159 | 
160 | #[derive(Debug, Serialize, Deserialize)]
161 | pub struct GpuProcess {
162 |     pub pid: u32,
163 |     pub name: String,
164 |     pub memory_usage: f64,
165 |     pub user: Option<String>,
166 | }
167 | 
168 | #[derive(Debug, Serialize, Deserialize)]
169 | pub struct ThreatInfo {
170 |     pub id: String,
171 |     pub threat_type: String,
172 |     pub severity: String,
173 |     pub confidence: f64,
174 |     pub description: String,
175 |     pub process_info: Option<GpuProcess>,
176 | }
177 | 
178 | #[derive(Debug, Serialize, Deserialize)]
179 | pub struct PolicyInfo {
180 |     pub policy_type: String,
181 |     pub name: String,
182 |     pub enabled: bool,
183 |     pub limits: HashMap<String, serde_json::Value>,
184 | }
185 | 


--------------------------------------------------------------------------------
/scripts/setup-gpu-runner.sh:
--------------------------------------------------------------------------------
  1 | #!/bin/bash
  2 | # GPU Runner Setup Script
  3 | # This script helps set up a self-hosted GitHub Actions runner with GPU support
  4 | 
  5 | set -e
  6 | 
  7 | echo "🚀 GPU Kill - Self-Hosted Runner Setup"
  8 | echo "======================================"
  9 | 
 10 | # Check if running as root
 11 | if [[ $EUID -eq 0 ]]; then
 12 |    echo "❌ This script should not be run as root"
 13 |    exit 1
 14 | fi
 15 | 
 16 | # Detect OS
 17 | if [[ "$OSTYPE" == "linux-gnu"* ]]; then
 18 |     OS="linux"
 19 | elif [[ "$OSTYPE" == "darwin"* ]]; then
 20 |     OS="macos"
 21 | else
 22 |     echo "❌ Unsupported OS: $OSTYPE"
 23 |     exit 1
 24 | fi
 25 | 
 26 | echo "📋 Detected OS: $OS"
 27 | 
 28 | # Function to install dependencies
 29 | install_deps() {
 30 |     echo "📦 Installing system dependencies..."
 31 |     
 32 |     if [[ "$OS" == "linux" ]]; then
 33 |         sudo apt-get update
 34 |         sudo apt-get install -y build-essential libssl-dev pkg-config curl tar
 35 |         
 36 |         # Install GPU-specific tools
 37 |         echo "🔧 Installing GPU tools..."
 38 |         
 39 |         # NVIDIA
 40 |         if command -v nvidia-smi &> /dev/null; then
 41 |             echo "✅ NVIDIA GPU detected"
 42 |             sudo apt-get install -y nvidia-utils-* || echo "⚠️  NVIDIA utils installation failed"
 43 |         else
 44 |             echo "ℹ️  No NVIDIA GPU detected"
 45 |         fi
 46 |         
 47 |         # AMD
 48 |         if command -v rocm-smi &> /dev/null; then
 49 |             echo "✅ AMD GPU with ROCm detected"
 50 |         else
 51 |             echo "ℹ️  Installing ROCm tools..."
 52 |             sudo apt-get install -y rocm-smi || echo "⚠️  ROCm installation failed"
 53 |         fi
 54 |         
 55 |         # Intel
 56 |         echo "ℹ️  Installing Intel GPU tools..."
 57 |         sudo apt-get install -y intel-gpu-tools || echo "⚠️  Intel GPU tools installation failed"
 58 |         
 59 |     elif [[ "$OS" == "macos" ]]; then
 60 |         # Check for Xcode command line tools
 61 |         if ! command -v xcode-select &> /dev/null; then
 62 |             echo "📱 Installing Xcode command line tools..."
 63 |             xcode-select --install || echo "⚠️  Xcode tools installation failed"
 64 |         else
 65 |             echo "✅ Xcode command line tools already installed"
 66 |         fi
 67 |     fi
 68 | }
 69 | 
 70 | # Function to install Rust
 71 | install_rust() {
 72 |     echo "🦀 Installing Rust..."
 73 |     
 74 |     if command -v rustc &> /dev/null; then
 75 |         echo "✅ Rust already installed: $(rustc --version)"
 76 |     else
 77 |         curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y
 78 |         source ~/.cargo/env
 79 |         echo "✅ Rust installed: $(rustc --version)"
 80 |     fi
 81 | }
 82 | 
 83 | # Function to setup GitHub Actions runner
 84 | setup_runner() {
 85 |     echo "🏃 Setting up GitHub Actions runner..."
 86 |     
 87 |     # Get repository URL and token from user
 88 |     read -p "📝 Enter your GitHub repository URL (e.g., https://github.com/username/gpu-kill): " REPO_URL
 89 |     read -p "🔑 Enter your GitHub Personal Access Token (with repo and admin:org permissions): " GITHUB_TOKEN
 90 |     
 91 |     # Create runner directory
 92 |     RUNNER_DIR="$HOME/actions-runner"
 93 |     mkdir -p "$RUNNER_DIR"
 94 |     cd "$RUNNER_DIR"
 95 |     
 96 |     # Download runner
 97 |     if [[ "$OS" == "linux" ]]; then
 98 |         RUNNER_FILE="actions-runner-linux-x64-2.311.0.tar.gz"
 99 |     elif [[ "$OS" == "macos" ]]; then
100 |         RUNNER_FILE="actions-runner-osx-x64-2.311.0.tar.gz"
101 |     fi
102 |     
103 |     echo "📥 Downloading GitHub Actions runner..."
104 |     curl -o "$RUNNER_FILE" -L "https://github.com/actions/runner/releases/download/v2.311.0/$RUNNER_FILE"
105 |     tar xzf "$RUNNER_FILE"
106 |     
107 |     # Configure runner
108 |     echo "⚙️  Configuring runner..."
109 |     ./config.sh --url "$REPO_URL" --token "$GITHUB_TOKEN" --labels "gpu,$OS" --name "gpu-runner-$(hostname)"
110 |     
111 |     echo "✅ Runner configured successfully!"
112 |     echo ""
113 |     echo "🎯 To start the runner:"
114 |     echo "   cd $RUNNER_DIR"
115 |     echo "   ./run.sh"
116 |     echo ""
117 |     echo "🎯 To run as a service:"
118 |     echo "   sudo ./svc.sh install"
119 |     echo "   sudo ./svc.sh start"
120 | }
121 | 
122 | # Function to test GPU detection
123 | test_gpu() {
124 |     echo "🧪 Testing GPU detection..."
125 |     
126 |     # Clone and build GPU Kill
127 |     if [[ ! -d "gpu-kill" ]]; then
128 |         git clone https://github.com/treadiehq/gpu-kill.git
129 |     fi
130 |     
131 |     cd gpu-kill
132 |     cargo build --release
133 |     
134 |     echo "🔍 GPU Detection Results:"
135 |     ./target/release/gpukill --list || echo "No GPUs detected"
136 |     
137 |     echo "🧪 Running GPU hardware tests..."
138 |     cargo test --test gpu_hardware_tests || echo "GPU tests completed (some may have been skipped)"
139 | }
140 | 
141 | # Main execution
142 | main() {
143 |     echo "🎯 What would you like to do?"
144 |     echo "1) Install dependencies only"
145 |     echo "2) Setup GitHub Actions runner"
146 |     echo "3) Test GPU detection"
147 |     echo "4) Full setup (dependencies + runner + test)"
148 |     echo "5) Exit"
149 |     
150 |     read -p "Choose an option (1-5): " choice
151 |     
152 |     case $choice in
153 |         1)
154 |             install_deps
155 |             install_rust
156 |             ;;
157 |         2)
158 |             install_deps
159 |             install_rust
160 |             setup_runner
161 |             ;;
162 |         3)
163 |             install_deps
164 |             install_rust
165 |             test_gpu
166 |             ;;
167 |         4)
168 |             install_deps
169 |             install_rust
170 |             setup_runner
171 |             test_gpu
172 |             ;;
173 |         5)
174 |             echo "👋 Goodbye!"
175 |             exit 0
176 |             ;;
177 |         *)
178 |             echo "❌ Invalid option"
179 |             exit 1
180 |             ;;
181 |     esac
182 |     
183 |     echo "✅ Setup completed!"
184 | }
185 | 
186 | # Run main function
187 | main "$@"
188 | 


--------------------------------------------------------------------------------
/docs/CLOUD_GPU_SETUP.md:
--------------------------------------------------------------------------------
  1 | # Cloud GPU Setup Guide
  2 | 
  3 | This guide shows how to set up GPU testing using cloud services.
  4 | 
  5 | ## Quick Start
  6 | 
  7 | ### Option 1: AWS EC2 with GPU
  8 | 
  9 | 1. **Launch GPU Instance:**
 10 |    ```bash
 11 |    # Using AWS CLI
 12 |    aws ec2 run-instances \
 13 |      --image-id ami-0c02fb55956c7d316 \
 14 |      --instance-type g4dn.xlarge \
 15 |      --key-name your-key \
 16 |      --security-group-ids sg-xxxxxxxxx \
 17 |      --subnet-id subnet-xxxxxxxxx
 18 |    ```
 19 | 
 20 | 2. **Connect and Setup:**
 21 |    ```bash
 22 |    ssh -i your-key.pem ubuntu@your-instance-ip
 23 |    curl -sSL https://raw.githubusercontent.com/treadiehq/gpu-kill/main/scripts/setup-gpu-runner.sh | bash
 24 |    ```
 25 | 
 26 | ### Option 2: Google Cloud with GPU
 27 | 
 28 | 1. **Create GPU Instance:**
 29 |    ```bash
 30 |    gcloud compute instances create gpu-test-runner \
 31 |      --zone=us-central1-a \
 32 |      --machine-type=n1-standard-4 \
 33 |      --accelerator=type=nvidia-tesla-t4,count=1 \
 34 |      --image-family=ubuntu-2004-lts \
 35 |      --image-project=ubuntu-os-cloud \
 36 |      --maintenance-policy=TERMINATE \
 37 |      --restart-on-failure
 38 |    ```
 39 | 
 40 | 2. **Setup:**
 41 |    ```bash
 42 |    gcloud compute ssh gpu-test-runner --zone=us-central1-a
 43 |    curl -sSL https://raw.githubusercontent.com/treadiehq/gpu-kill/main/scripts/setup-gpu-runner.sh | bash
 44 |    ```
 45 | 
 46 | ### Option 3: Azure with GPU
 47 | 
 48 | 1. **Create VM:**
 49 |    ```bash
 50 |    az vm create \
 51 |      --resource-group myResourceGroup \
 52 |      --name gpu-test-vm \
 53 |      --image UbuntuLTS \
 54 |      --size Standard_NC6s_v3 \
 55 |      --admin-username azureuser \
 56 |      --generate-ssh-keys
 57 |    ```
 58 | 
 59 | 2. **Setup:**
 60 |    ```bash
 61 |    ssh azureuser@your-vm-ip
 62 |    curl -sSL https://raw.githubusercontent.com/treadiehq/gpu-kill/main/scripts/setup-gpu-runner.sh | bash
 63 |    ```
 64 | 
 65 | ## Cost-Effective Options
 66 | 
 67 | ### Spot Instances
 68 | - **AWS Spot**: Up to 90% savings
 69 | - **GCP Preemptible**: Up to 80% savings
 70 | - **Azure Spot**: Up to 90% savings
 71 | 
 72 | ### Example Spot Instance Setup (AWS):
 73 | ```bash
 74 | aws ec2 request-spot-instances \
 75 |   --spot-price "0.50" \
 76 |   --instance-count 1 \
 77 |   --type "one-time" \
 78 |   --launch-specification '{
 79 |     "ImageId": "ami-0c02fb55956c7d316",
 80 |     "InstanceType": "g4dn.xlarge",
 81 |     "KeyName": "your-key",
 82 |     "SecurityGroupIds": ["sg-xxxxxxxxx"]
 83 |   }'
 84 | ```
 85 | 
 86 | ## Docker-Based Testing
 87 | 
 88 | ### NVIDIA Docker Setup
 89 | ```bash
 90 | # Install NVIDIA Docker
 91 | distribution=$(. /etc/os-release;echo $ID$VERSION_ID)
 92 | curl -s -L https://nvidia.github.io/nvidia-docker/gpgkey | sudo apt-key add -
 93 | curl -s -L https://nvidia.github.io/nvidia-docker/$distribution/nvidia-docker.list | sudo tee /etc/apt/sources.list.d/nvidia-docker.list
 94 | 
 95 | sudo apt-get update && sudo apt-get install -y nvidia-docker2
 96 | sudo systemctl restart docker
 97 | 
 98 | # Test GPU access
 99 | docker run --rm --gpus all nvidia/cuda:11.0-base nvidia-smi
100 | ```
101 | 
102 | ### GPU Kill Docker Testing
103 | ```bash
104 | # Build GPU Kill with GPU support
105 | docker build -t gpukill:gpu .
106 | 
107 | # Run tests with GPU access
108 | docker run --rm --gpus all gpukill:gpu cargo test --test gpu_hardware_tests
109 | ```
110 | 
111 | ## GitHub Actions Integration
112 | 
113 | ### Enable GPU Tests
114 | Once you have a self-hosted runner set up:
115 | 
116 | 1. **Remove the `if: false` condition** in `.github/workflows/ci.yml`:
117 |    ```yaml
118 |    gpu-hardware-tests:
119 |      name: GPU Hardware Tests
120 |      runs-on: [self-hosted, gpu]
121 |      # if: false  # Remove this line
122 |    ```
123 | 
124 | 2. **Add runner labels** when setting up:
125 |    ```bash
126 |    ./config.sh --labels "gpu,nvidia,linux" --name "nvidia-gpu-runner"
127 |    ```
128 | 
129 | ### Conditional GPU Testing
130 | The CI will automatically:
131 | - ✅ **Run GPU tests** when GPU hardware is available
132 | - ✅ **Skip gracefully** when no GPU hardware is found
133 | - ✅ **Work on any runner** (hosted or self-hosted)
134 | 
135 | ## Cost Optimization
136 | 
137 | ### Scheduled Testing
138 | Set up runners to only run during business hours:
139 | ```yaml
140 | on:
141 |   schedule:
142 |     - cron: '0 9 * * 1-5'  # 9 AM, Monday-Friday
143 | ```
144 | 
145 | ### Auto-shutdown
146 | Add auto-shutdown to cloud instances:
147 | ```bash
148 | # AWS
149 | aws ec2 create-tags --resources i-1234567890abcdef0 --tags Key=shutdown,Value=yes
150 | 
151 | # GCP
152 | gcloud compute instances add-metadata gpu-test-runner \
153 |   --metadata shutdown-script='sudo shutdown -h +60'
154 | ```
155 | 
156 | ## Monitoring and Alerts
157 | 
158 | ### Set up monitoring for:
159 | - GPU utilization during tests
160 | - Test success/failure rates
161 | - Runner availability
162 | - Cost tracking
163 | 
164 | ### Example monitoring script:
165 | ```bash
166 | #!/bin/bash
167 | # Monitor GPU test results
168 | curl -H "Authorization: token $GITHUB_TOKEN" \
169 |   "https://api.github.com/repos/treadiehq/gpu-kill/actions/runs" | \
170 |   jq '.workflow_runs[] | select(.name=="GPU Hardware Tests") | {status, conclusion, created_at}'
171 | ```
172 | 
173 | ## Troubleshooting
174 | 
175 | ### Common Issues:
176 | 
177 | 1. **GPU not detected:**
178 |    ```bash
179 |    # Check NVIDIA
180 |    nvidia-smi
181 |    
182 |    # Check AMD
183 |    rocm-smi --showid
184 |    
185 |    # Check Intel
186 |    intel_gpu_top
187 |    ```
188 | 
189 | 2. **Permission issues:**
190 |    ```bash
191 |    # Add user to docker group
192 |    sudo usermod -aG docker $USER
193 |    
194 |    # Check GPU permissions
195 |    ls -la /dev/nvidia*
196 |    ```
197 | 
198 | 3. **Driver issues:**
199 |    ```bash
200 |    # Update NVIDIA drivers
201 |    sudo apt-get install nvidia-driver-470
202 |    
203 |    # Update AMD drivers
204 |    sudo apt-get install rocm-dkms
205 |    ```
206 | 
207 | ## Next Steps
208 | 
209 | 1. **Choose your cloud provider** (AWS, GCP, Azure)
210 | 2. **Set up a GPU instance** using the scripts above
211 | 3. **Configure the GitHub Actions runner** with GPU labels
212 | 4. **Enable GPU tests** in the CI workflow
213 | 5. **Monitor and optimize** costs and performance
214 | 
215 | The GPU tests will now run automatically whenever GPU hardware is available! 🚀
216 | 


--------------------------------------------------------------------------------
/dashboard/tailwind.config.js:
--------------------------------------------------------------------------------
  1 | /** @type {import('tailwindcss').Config} */
  2 | export default {
  3 |   content: [
  4 |     "./components/**/*.{js,vue,ts}",
  5 |     "./layouts/**/*.vue",
  6 |     "./pages/**/*.vue",
  7 |     "./plugins/**/*.{js,ts}",
  8 |     "./app.vue",
  9 |     "./error.vue"
 10 |   ],
 11 |   theme: {
 12 |     extend: {
 13 |       colors: {
 14 |         // Custom GPU Kill brand colors
 15 |         primary: {
 16 |           50: '#eff6ff',
 17 |           100: '#dbeafe',
 18 |           200: '#bfdbfe',
 19 |           300: '#93c5fd',
 20 |           400: '#60a5fa',
 21 |           500: '#3b82f6',
 22 |           600: '#2563eb',
 23 |           700: '#1d4ed8',
 24 |           800: '#1e40af',
 25 |           900: '#1e3a8a',
 26 |           950: '#172554',
 27 |         },
 28 |         gpu: {
 29 |           50: '#f0f9ff',
 30 |           100: '#e0f2fe',
 31 |           200: '#bae6fd',
 32 |           300: '#7dd3fc',
 33 |           400: '#38bdf8',
 34 |           500: '#0ea5e9',
 35 |           600: '#0284c7',
 36 |           700: '#0369a1',
 37 |           800: '#075985',
 38 |           900: '#0c4a6e',
 39 |           950: '#082f49',
 40 |         },
 41 |         danger: {
 42 |           50: '#fef2f2',
 43 |           100: '#fee2e2',
 44 |           200: '#fecaca',
 45 |           300: '#fca5a5',
 46 |           400: '#f87171',
 47 |           500: '#ef4444',
 48 |           600: '#dc2626',
 49 |           700: '#b91c1c',
 50 |           800: '#991b1b',
 51 |           900: '#7f1d1d',
 52 |           950: '#450a0a',
 53 |         },
 54 |         warning: {
 55 |           50: '#fffbeb',
 56 |           100: '#fef3c7',
 57 |           200: '#fde68a',
 58 |           300: '#fcd34d',
 59 |           400: '#fbbf24',
 60 |           500: '#f59e0b',
 61 |           600: '#d97706',
 62 |           700: '#b45309',
 63 |           800: '#92400e',
 64 |           900: '#78350f',
 65 |           950: '#451a03',
 66 |         },
 67 |         success: {
 68 |           50: '#f0fdf4',
 69 |           100: '#dcfce7',
 70 |           200: '#bbf7d0',
 71 |           300: '#86efac',
 72 |           400: '#4ade80',
 73 |           500: '#22c55e',
 74 |           600: '#16a34a',
 75 |           700: '#15803d',
 76 |           800: '#166534',
 77 |           900: '#14532d',
 78 |           950: '#052e16',
 79 |         },
 80 |         // Dark theme colors
 81 |         dark: {
 82 |           50: '#f8fafc',
 83 |           100: '#f1f5f9',
 84 |           200: '#e2e8f0',
 85 |           300: '#cbd5e1',
 86 |           400: '#94a3b8',
 87 |           500: '#64748b',
 88 |           600: '#475569',
 89 |           700: '#334155',
 90 |           800: '#1e293b',
 91 |           900: '#0f172a',
 92 |           950: '#020617',
 93 |         }
 94 |       },
 95 |       fontFamily: {
 96 |         sans: ['Inter', 'system-ui', 'sans-serif'],
 97 |         mono: ['JetBrains Mono', 'Fira Code', 'monospace'],
 98 |       },
 99 |       animation: {
100 |         'pulse-slow': 'pulse 3s cubic-bezier(0.4, 0, 0.6, 1) infinite',
101 |         'bounce-slow': 'bounce 2s infinite',
102 |         'spin-slow': 'spin 3s linear infinite',
103 |         'ping-slow': 'ping 2s cubic-bezier(0, 0, 0.2, 1) infinite',
104 |         'fade-in': 'fadeIn 0.5s ease-in-out',
105 |         'slide-up': 'slideUp 0.3s ease-out',
106 |         'slide-down': 'slideDown 0.3s ease-out',
107 |         'scale-in': 'scaleIn 0.2s ease-out',
108 |         'glow': 'glow 2s ease-in-out infinite alternate',
109 |       },
110 |       keyframes: {
111 |         fadeIn: {
112 |           '0%': { opacity: '0' },
113 |           '100%': { opacity: '1' },
114 |         },
115 |         slideUp: {
116 |           '0%': { transform: 'translateY(10px)', opacity: '0' },
117 |           '100%': { transform: 'translateY(0)', opacity: '1' },
118 |         },
119 |         slideDown: {
120 |           '0%': { transform: 'translateY(-10px)', opacity: '0' },
121 |           '100%': { transform: 'translateY(0)', opacity: '1' },
122 |         },
123 |         scaleIn: {
124 |           '0%': { transform: 'scale(0.95)', opacity: '0' },
125 |           '100%': { transform: 'scale(1)', opacity: '1' },
126 |         },
127 |         glow: {
128 |           '0%': { boxShadow: '0 0 5px rgba(59, 130, 246, 0.5)' },
129 |           '100%': { boxShadow: '0 0 20px rgba(59, 130, 246, 0.8)' },
130 |         },
131 |       },
132 |       backdropBlur: {
133 |         xs: '2px',
134 |       },
135 |       boxShadow: {
136 |         'glow': '0 0 20px rgba(59, 130, 246, 0.3)',
137 |         'glow-lg': '0 0 30px rgba(59, 130, 246, 0.4)',
138 |         'glow-danger': '0 0 20px rgba(239, 68, 68, 0.3)',
139 |         'glow-success': '0 0 20px rgba(34, 197, 94, 0.3)',
140 |         'glow-warning': '0 0 20px rgba(245, 158, 11, 0.3)',
141 |         'inner-lg': 'inset 0 2px 4px 0 rgba(0, 0, 0, 0.1)',
142 |       },
143 |       borderRadius: {
144 |         '4xl': '2rem',
145 |         '5xl': '2.5rem',
146 |       },
147 |       spacing: {
148 |         '18': '4.5rem',
149 |         '88': '22rem',
150 |         '128': '32rem',
151 |       },
152 |       zIndex: {
153 |         '60': '60',
154 |         '70': '70',
155 |         '80': '80',
156 |         '90': '90',
157 |         '100': '100',
158 |       },
159 |       screens: {
160 |         'xs': '475px',
161 |         '3xl': '1600px',
162 |       },
163 |       typography: {
164 |         DEFAULT: {
165 |           css: {
166 |             maxWidth: 'none',
167 |             color: '#e5e7eb',
168 |             a: {
169 |               color: '#60a5fa',
170 |               '&:hover': {
171 |                 color: '#93c5fd',
172 |               },
173 |             },
174 |             h1: {
175 |               color: '#ffffff',
176 |             },
177 |             h2: {
178 |               color: '#ffffff',
179 |             },
180 |             h3: {
181 |               color: '#ffffff',
182 |             },
183 |             h4: {
184 |               color: '#ffffff',
185 |             },
186 |             strong: {
187 |               color: '#ffffff',
188 |             },
189 |             code: {
190 |               color: '#fbbf24',
191 |               backgroundColor: '#1f2937',
192 |               padding: '0.25rem 0.375rem',
193 |               borderRadius: '0.25rem',
194 |             },
195 |             'code::before': {
196 |               content: '""',
197 |             },
198 |             'code::after': {
199 |               content: '""',
200 |             },
201 |           },
202 |         },
203 |       },
204 |     },
205 |   },
206 |   plugins: [
207 |     require('@tailwindcss/typography'),
208 |     require('@tailwindcss/forms'),
209 |     require('@tailwindcss/aspect-ratio'),
210 |   ],
211 |   darkMode: 'class',
212 | }
213 | 


--------------------------------------------------------------------------------
/mcp/src/server.rs:
--------------------------------------------------------------------------------
  1 | //! MCP Server implementation for GPU Kill
  2 | 
  3 | use crate::resources::ResourceHandler;
  4 | use crate::tools::ToolHandler;
  5 | use crate::types::*;
  6 | use crate::MCP_VERSION;
  7 | use anyhow::Result;
  8 | use serde_json::json;
  9 | use std::sync::Arc;
 10 | use tokio::sync::RwLock;
 11 | use tracing::{debug, error, info};
 12 | 
 13 | /// GPU Kill MCP Server
 14 | pub struct GpuKillMCPServer {
 15 |     resource_handler: Arc<ResourceHandler>,
 16 |     tool_handler: Arc<RwLock<ToolHandler>>,
 17 | }
 18 | 
 19 | impl GpuKillMCPServer {
 20 |     /// Create a new MCP server instance
 21 |     pub async fn new() -> Result<Self> {
 22 |         let resource_handler = Arc::new(ResourceHandler::new().await?);
 23 |         let tool_handler = Arc::new(RwLock::new(ToolHandler::new().await?));
 24 | 
 25 |         Ok(Self {
 26 |             resource_handler,
 27 |             tool_handler,
 28 |         })
 29 |     }
 30 | 
 31 |     /// Handle an MCP request
 32 |     pub async fn handle_request(&self, request: JsonRpcRequest) -> Result<JsonRpcResponse> {
 33 |         debug!("Handling MCP request: {}", request.method);
 34 | 
 35 |         let result = match request.method.as_str() {
 36 |             "initialize" => self.handle_initialize(request.params).await,
 37 |             "resources/list" => self.handle_resources_list().await,
 38 |             "resources/read" => self.handle_resources_read(request.params).await,
 39 |             "tools/list" => self.handle_tools_list().await,
 40 |             "tools/call" => self.handle_tools_call(request.params).await,
 41 |             _ => Err(anyhow::anyhow!("Unknown method: {}", request.method)),
 42 |         };
 43 | 
 44 |         match result {
 45 |             Ok(data) => Ok(JsonRpcResponse {
 46 |                 jsonrpc: "2.0".to_string(),
 47 |                 id: request.id,
 48 |                 result: Some(data),
 49 |                 error: None,
 50 |             }),
 51 |             Err(e) => {
 52 |                 error!("Error handling request {}: {}", request.method, e);
 53 |                 Ok(JsonRpcResponse {
 54 |                     jsonrpc: "2.0".to_string(),
 55 |                     id: request.id,
 56 |                     result: None,
 57 |                     error: Some(JsonRpcError {
 58 |                         code: -32603,
 59 |                         message: "Internal error".to_string(),
 60 |                         data: Some(json!({ "details": e.to_string() })),
 61 |                     }),
 62 |                 })
 63 |             }
 64 |         }
 65 |     }
 66 | 
 67 |     async fn handle_initialize(
 68 |         &self,
 69 |         _params: Option<serde_json::Value>,
 70 |     ) -> Result<serde_json::Value> {
 71 |         info!("MCP client initializing");
 72 | 
 73 |         let response = InitializeResponse {
 74 |             protocol_version: MCP_VERSION.to_string(),
 75 |             capabilities: ServerCapabilities {
 76 |                 resources: Some(ResourcesCapability {
 77 |                     subscribe: Some(false),
 78 |                     list_changed: Some(false),
 79 |                 }),
 80 |                 tools: Some(ToolsCapability {
 81 |                     list_changed: Some(false),
 82 |                 }),
 83 |                 logging: Some(LoggingCapability {}),
 84 |             },
 85 |             server_info: ServerInfo {
 86 |                 name: "GPU Kill MCP Server".to_string(),
 87 |                 version: env!("CARGO_PKG_VERSION").to_string(),
 88 |             },
 89 |         };
 90 | 
 91 |         Ok(serde_json::to_value(response)?)
 92 |     }
 93 | 
 94 |     async fn handle_resources_list(&self) -> Result<serde_json::Value> {
 95 |         let resources = self.resource_handler.list_resources();
 96 |         Ok(json!({ "resources": resources }))
 97 |     }
 98 | 
 99 |     async fn handle_resources_read(
100 |         &self,
101 |         params: Option<serde_json::Value>,
102 |     ) -> Result<serde_json::Value> {
103 |         let params = params.ok_or_else(|| anyhow::anyhow!("Missing parameters"))?;
104 |         let uri = params
105 |             .get("uri")
106 |             .and_then(|v| v.as_str())
107 |             .ok_or_else(|| anyhow::anyhow!("Missing uri parameter"))?;
108 | 
109 |         let contents = self.resource_handler.get_resource(uri).await?;
110 |         Ok(json!({ "contents": contents }))
111 |     }
112 | 
113 |     async fn handle_tools_list(&self) -> Result<serde_json::Value> {
114 |         let tool_handler = self.tool_handler.read().await;
115 |         let tools = tool_handler.list_tools();
116 |         Ok(json!({ "tools": tools }))
117 |     }
118 | 
119 |     async fn handle_tools_call(
120 |         &self,
121 |         params: Option<serde_json::Value>,
122 |     ) -> Result<serde_json::Value> {
123 |         let params = params.ok_or_else(|| anyhow::anyhow!("Missing parameters"))?;
124 |         let name = params
125 |             .get("name")
126 |             .and_then(|v| v.as_str())
127 |             .ok_or_else(|| anyhow::anyhow!("Missing name parameter"))?;
128 | 
129 |         let arguments = params
130 |             .get("arguments")
131 |             .and_then(|v| v.as_object())
132 |             .map(|obj| obj.iter().map(|(k, v)| (k.clone(), v.clone())).collect());
133 | 
134 |         let mut tool_handler = self.tool_handler.write().await;
135 |         let result = tool_handler.execute_tool(name, arguments).await?;
136 | 
137 |         Ok(json!({ "content": result.content, "isError": result.is_error }))
138 |     }
139 | 
140 |     /// Start the MCP server
141 |     pub async fn start(self, port: u16) -> Result<()> {
142 |         info!("Starting GPU Kill MCP Server on port {}", port);
143 | 
144 |         let server = Arc::new(self);
145 | 
146 |         // For now, we'll implement a simple HTTP-based MCP server
147 |         // In a full implementation, this would use stdio or WebSocket transport
148 |         let app = axum::Router::new()
149 |             .route(
150 |                 "/mcp",
151 |                 axum::routing::post({
152 |                     let server = server.clone();
153 |                     move |request: axum::extract::Json<JsonRpcRequest>| {
154 |                         let server = server.clone();
155 |                         async move {
156 |                             match server.handle_request(request.0).await {
157 |                                 Ok(response) => axum::response::Json(response),
158 |                                 Err(e) => {
159 |                                     error!("Failed to handle HTTP request: {}", e);
160 |                                     axum::response::Json(JsonRpcResponse {
161 |                                         jsonrpc: "2.0".to_string(),
162 |                                         id: "error".to_string(),
163 |                                         result: None,
164 |                                         error: Some(JsonRpcError {
165 |                                             code: -32603,
166 |                                             message: "Internal error".to_string(),
167 |                                             data: Some(json!({ "details": e.to_string() })),
168 |                                         }),
169 |                                     })
170 |                                 }
171 |                             }
172 |                         }
173 |                     }
174 |                 }),
175 |             )
176 |             .route("/health", axum::routing::get(|| async { "OK" }));
177 | 
178 |         let listener = tokio::net::TcpListener::bind(format!("0.0.0.0:{}", port)).await?;
179 |         info!("MCP Server listening on http://0.0.0.0:{}", port);
180 | 
181 |         axum::serve(listener, app).await?;
182 |         Ok(())
183 |     }
184 | }
185 | 
186 | // Remove Default implementation since new() is now async
187 | 


--------------------------------------------------------------------------------
/src/config.rs:
--------------------------------------------------------------------------------
  1 | use anyhow::{Context, Result};
  2 | use serde::{Deserialize, Serialize};
  3 | use std::fs;
  4 | use std::path::Path;
  5 | 
  6 | /// Configuration structure for gpukill
  7 | #[derive(Debug, Clone, Serialize, Deserialize)]
  8 | pub struct Config {
  9 |     /// Default log level
 10 |     pub log_level: String,
 11 | 
 12 |     /// Default output format
 13 |     pub output_format: String,
 14 | 
 15 |     /// Default timeout for process termination
 16 |     pub default_timeout_secs: u16,
 17 | 
 18 |     /// Whether to show detailed process information by default
 19 |     pub show_details: bool,
 20 | 
 21 |     /// Watch mode refresh interval in seconds
 22 |     pub watch_interval_secs: u64,
 23 | 
 24 |     /// Maximum number of processes to show in summary
 25 |     pub max_processes_summary: usize,
 26 | 
 27 |     /// Table width limit
 28 |     pub table_width: usize,
 29 | 
 30 |     /// Whether to use colors in output
 31 |     pub use_colors: bool,
 32 | }
 33 | 
 34 | impl Default for Config {
 35 |     fn default() -> Self {
 36 |         Self {
 37 |             log_level: "info".to_string(),
 38 |             output_format: "table".to_string(),
 39 |             default_timeout_secs: 5,
 40 |             show_details: false,
 41 |             watch_interval_secs: 2,
 42 |             max_processes_summary: 10,
 43 |             table_width: 120,
 44 |             use_colors: true,
 45 |         }
 46 |     }
 47 | }
 48 | 
 49 | /// Configuration manager
 50 | pub struct ConfigManager {
 51 |     config: Config,
 52 | }
 53 | 
 54 | impl Default for ConfigManager {
 55 |     fn default() -> Self {
 56 |         Self::new()
 57 |     }
 58 | }
 59 | 
 60 | #[allow(dead_code)]
 61 | impl ConfigManager {
 62 |     /// Create a new configuration manager
 63 |     pub fn new() -> Self {
 64 |         Self {
 65 |             config: Config::default(),
 66 |         }
 67 |     }
 68 | 
 69 |     /// Load configuration from file
 70 |     pub fn load_from_file<P: AsRef<Path>>(path: P) -> Result<Self> {
 71 |         let config_path = path.as_ref();
 72 | 
 73 |         if !config_path.exists() {
 74 |             tracing::debug!("Config file not found at {:?}, using defaults", config_path);
 75 |             return Ok(Self::new());
 76 |         }
 77 | 
 78 |         let content = fs::read_to_string(config_path)
 79 |             .with_context(|| format!("Failed to read config file: {:?}", config_path))?;
 80 | 
 81 |         let config: Config = toml::from_str(&content)
 82 |             .with_context(|| format!("Failed to parse config file: {:?}", config_path))?;
 83 | 
 84 |         tracing::info!("Loaded configuration from {:?}", config_path);
 85 |         Ok(Self { config })
 86 |     }
 87 | 
 88 |     /// Load configuration from environment variables
 89 |     pub fn load_from_env() -> Self {
 90 |         let mut config = Config::default();
 91 | 
 92 |         // Override with environment variables if present
 93 |         if let Ok(log_level) = std::env::var("GPUKILL_LOG_LEVEL") {
 94 |             config.log_level = log_level;
 95 |         }
 96 | 
 97 |         if let Ok(output_format) = std::env::var("GPUKILL_OUTPUT_FORMAT") {
 98 |             config.output_format = output_format;
 99 |         }
100 | 
101 |         if let Ok(timeout) = std::env::var("GPUKILL_DEFAULT_TIMEOUT") {
102 |             if let Ok(timeout_secs) = timeout.parse::<u16>() {
103 |                 config.default_timeout_secs = timeout_secs;
104 |             }
105 |         }
106 | 
107 |         if let Ok(show_details) = std::env::var("GPUKILL_SHOW_DETAILS") {
108 |             config.show_details = show_details.parse().unwrap_or(false);
109 |         }
110 | 
111 |         if let Ok(watch_interval) = std::env::var("GPUKILL_WATCH_INTERVAL") {
112 |             if let Ok(interval_secs) = watch_interval.parse::<u64>() {
113 |                 config.watch_interval_secs = interval_secs;
114 |             }
115 |         }
116 | 
117 |         if let Ok(table_width) = std::env::var("GPUKILL_TABLE_WIDTH") {
118 |             if let Ok(width) = table_width.parse::<usize>() {
119 |                 config.table_width = width;
120 |             }
121 |         }
122 | 
123 |         if let Ok(use_colors) = std::env::var("GPUKILL_USE_COLORS") {
124 |             config.use_colors = use_colors.parse().unwrap_or(true);
125 |         }
126 | 
127 |         Self { config }
128 |     }
129 | 
130 |     /// Get the current configuration
131 |     pub fn config(&self) -> &Config {
132 |         &self.config
133 |     }
134 | 
135 |     /// Get a mutable reference to the configuration
136 |     pub fn config_mut(&mut self) -> &mut Config {
137 |         &mut self.config
138 |     }
139 | 
140 |     /// Save configuration to file
141 |     pub fn save_to_file<P: AsRef<Path>>(&self, path: P) -> Result<()> {
142 |         let config_path = path.as_ref();
143 |         let content =
144 |             toml::to_string_pretty(&self.config).context("Failed to serialize configuration")?;
145 | 
146 |         fs::write(config_path, content)
147 |             .with_context(|| format!("Failed to write config file: {:?}", config_path))?;
148 | 
149 |         tracing::info!("Saved configuration to {:?}", config_path);
150 |         Ok(())
151 |     }
152 | 
153 |     /// Get default configuration file path
154 |     pub fn default_config_path() -> Result<std::path::PathBuf> {
155 |         let home_dir = dirs::home_dir()
156 |             .ok_or_else(|| anyhow::anyhow!("Could not determine home directory"))?;
157 | 
158 |         Ok(home_dir.join(".config").join("gpukill").join("config.toml"))
159 |     }
160 | 
161 |     /// Load configuration from default location
162 |     pub fn load_default() -> Result<Self> {
163 |         let config_path = Self::default_config_path()?;
164 |         Self::load_from_file(config_path)
165 |     }
166 | 
167 |     /// Create default configuration file
168 |     pub fn create_default_config() -> Result<()> {
169 |         let config_path = Self::default_config_path()?;
170 | 
171 |         // Create directory if it doesn't exist
172 |         if let Some(parent) = config_path.parent() {
173 |             fs::create_dir_all(parent)
174 |                 .with_context(|| format!("Failed to create config directory: {:?}", parent))?;
175 |         }
176 | 
177 |         let config_manager = Self::new();
178 |         config_manager.save_to_file(config_path)?;
179 | 
180 |         Ok(())
181 |     }
182 | }
183 | 
184 | /// Get configuration with fallback chain
185 | pub fn get_config(config_path: Option<String>) -> Result<ConfigManager> {
186 |     // 1. Try to load from specified path
187 |     if let Some(path) = config_path {
188 |         return ConfigManager::load_from_file(path);
189 |     }
190 | 
191 |     // 2. Try to load from default location
192 |     if let Ok(config_manager) = ConfigManager::load_default() {
193 |         return Ok(config_manager);
194 |     }
195 | 
196 |     // 3. Load from environment variables
197 |     Ok(ConfigManager::load_from_env())
198 | }
199 | 
200 | #[cfg(test)]
201 | mod tests {
202 |     use super::*;
203 |     use tempfile::NamedTempFile;
204 | 
205 |     #[test]
206 |     fn test_default_config() {
207 |         let config = Config::default();
208 |         assert_eq!(config.log_level, "info");
209 |         assert_eq!(config.output_format, "table");
210 |         assert_eq!(config.default_timeout_secs, 5);
211 |         assert!(!config.show_details);
212 |         assert_eq!(config.watch_interval_secs, 2);
213 |     }
214 | 
215 |     #[test]
216 |     fn test_config_serialization() {
217 |         let config = Config::default();
218 |         let toml_str = toml::to_string(&config).unwrap();
219 |         let deserialized: Config = toml::from_str(&toml_str).unwrap();
220 | 
221 |         assert_eq!(config.log_level, deserialized.log_level);
222 |         assert_eq!(config.output_format, deserialized.output_format);
223 |     }
224 | 
225 |     #[test]
226 |     fn test_config_file_loading() {
227 |         let config = Config::default();
228 |         let toml_str = toml::to_string_pretty(&config).unwrap();
229 | 
230 |         let temp_file = NamedTempFile::new().unwrap();
231 |         std::fs::write(temp_file.path(), toml_str).unwrap();
232 | 
233 |         let loaded_config = ConfigManager::load_from_file(temp_file.path()).unwrap();
234 |         assert_eq!(loaded_config.config().log_level, config.log_level);
235 |     }
236 | 
237 |     #[test]
238 |     fn test_config_manager_creation() {
239 |         let manager = ConfigManager::new();
240 |         assert_eq!(manager.config().log_level, "info");
241 |     }
242 | }
243 | 


--------------------------------------------------------------------------------
/src/remote.rs:
--------------------------------------------------------------------------------
  1 | use anyhow::{Context, Result};
  2 | use std::{
  3 |     process::{Command, Stdio},
  4 |     time::Duration,
  5 | };
  6 | use tracing::{debug, info, warn};
  7 | 
  8 | /// SSH connection configuration
  9 | #[derive(Debug, Clone)]
 10 | pub struct SshConfig {
 11 |     pub host: String,
 12 |     pub port: u16,
 13 |     pub username: String,
 14 |     pub key_path: Option<String>,
 15 |     pub password: Option<String>,
 16 |     pub timeout: Duration,
 17 | }
 18 | 
 19 | impl SshConfig {
 20 |     /// Create a new SSH configuration
 21 |     pub fn new(host: String, port: u16, username: String) -> Self {
 22 |         Self {
 23 |             host,
 24 |             port,
 25 |             username,
 26 |             key_path: None,
 27 |             password: None,
 28 |             timeout: Duration::from_secs(30),
 29 |         }
 30 |     }
 31 | 
 32 |     /// Set SSH key path
 33 |     pub fn with_key_path(mut self, key_path: String) -> Self {
 34 |         self.key_path = Some(key_path);
 35 |         self
 36 |     }
 37 | 
 38 |     /// Set SSH password
 39 |     pub fn with_password(mut self, password: String) -> Self {
 40 |         self.password = Some(password);
 41 |         self
 42 |     }
 43 | 
 44 |     /// Set connection timeout
 45 |     pub fn with_timeout(mut self, timeout: Duration) -> Self {
 46 |         self.timeout = timeout;
 47 |         self
 48 |     }
 49 | }
 50 | 
 51 | /// SSH remote connection manager using system SSH
 52 | pub struct SshRemote {
 53 |     config: SshConfig,
 54 | }
 55 | 
 56 | impl SshRemote {
 57 |     /// Create a new SSH remote connection
 58 |     pub fn new(config: SshConfig) -> Self {
 59 |         Self { config }
 60 |     }
 61 | 
 62 |     /// Execute a command on the remote host
 63 |     pub fn execute_command(&self, command: &str) -> Result<String> {
 64 |         debug!("Executing remote command: {}", command);
 65 | 
 66 |         let mut ssh_cmd = Command::new("ssh");
 67 | 
 68 |         // Add SSH options
 69 |         ssh_cmd
 70 |             .arg("-o")
 71 |             .arg("ConnectTimeout=30")
 72 |             .arg("-o")
 73 |             .arg("StrictHostKeyChecking=no")
 74 |             .arg("-o")
 75 |             .arg("UserKnownHostsFile=/dev/null")
 76 |             .arg("-o")
 77 |             .arg("LogLevel=ERROR");
 78 | 
 79 |         // Add port if not default
 80 |         if self.config.port != 22 {
 81 |             ssh_cmd.arg("-p").arg(self.config.port.to_string());
 82 |         }
 83 | 
 84 |         // Add key file if specified
 85 |         if let Some(key_path) = &self.config.key_path {
 86 |             ssh_cmd.arg("-i").arg(key_path);
 87 |         }
 88 | 
 89 |         // Add password authentication if specified
 90 |         if self.config.password.is_some() {
 91 |             ssh_cmd.arg("-o").arg("PasswordAuthentication=yes");
 92 |         }
 93 | 
 94 |         // Add host and command
 95 |         let host_spec = format!("{}@{}", self.config.username, self.config.host);
 96 |         ssh_cmd.arg(host_spec).arg(command);
 97 | 
 98 |         // Set up input for password if needed
 99 |         if let Some(_password) = &self.config.password {
100 |             ssh_cmd.stdin(Stdio::piped());
101 |         }
102 | 
103 |         debug!("Running SSH command: {:?}", ssh_cmd);
104 | 
105 |         let mut child = ssh_cmd
106 |             .stdout(Stdio::piped())
107 |             .stderr(Stdio::piped())
108 |             .spawn()
109 |             .context("Failed to spawn SSH command")?;
110 | 
111 |         // Send password if provided
112 |         if let Some(password) = &self.config.password {
113 |             if let Some(stdin) = child.stdin.as_mut() {
114 |                 use std::io::Write;
115 |                 stdin
116 |                     .write_all(password.as_bytes())
117 |                     .context("Failed to write password to SSH stdin")?;
118 |                 stdin
119 |                     .write_all(b"\n")
120 |                     .context("Failed to write newline to SSH stdin")?;
121 |             }
122 |         }
123 | 
124 |         let output = child
125 |             .wait_with_output()
126 |             .context("Failed to wait for SSH command")?;
127 | 
128 |         if !output.status.success() {
129 |             let stderr = String::from_utf8_lossy(&output.stderr);
130 |             return Err(anyhow::anyhow!(
131 |                 "SSH command failed with exit code {}: {}",
132 |                 output.status.code().unwrap_or(-1),
133 |                 stderr
134 |             ));
135 |         }
136 | 
137 |         let stdout = String::from_utf8(output.stdout)
138 |             .context("Failed to decode SSH command output as UTF-8")?;
139 | 
140 |         debug!(
141 |             "Command executed successfully, output length: {} bytes",
142 |             stdout.len()
143 |         );
144 |         Ok(stdout)
145 |     }
146 | 
147 |     /// Execute gpukill command on remote host
148 |     pub fn execute_gpukill(&self, args: &[String]) -> Result<String> {
149 |         let command = format!("gpukill {}", args.join(" "));
150 |         self.execute_command(&command)
151 |     }
152 | 
153 |     /// Check if gpukill is available on remote host
154 |     pub fn check_gpukill_availability(&self) -> Result<bool> {
155 |         match self.execute_command("which gpukill") {
156 |             Ok(output) => {
157 |                 let available = !output.trim().is_empty();
158 |                 if available {
159 |                     info!("gpukill is available on remote host");
160 |                 } else {
161 |                     warn!("gpukill not found on remote host");
162 |                 }
163 |                 Ok(available)
164 |             }
165 |             Err(_) => {
166 |                 warn!("Failed to check gpukill availability on remote host");
167 |                 Ok(false)
168 |             }
169 |         }
170 |     }
171 | 
172 |     /// Get remote host information
173 |     pub fn get_host_info(&self) -> Result<RemoteHostInfo> {
174 |         let hostname = self.execute_command("hostname")?.trim().to_string();
175 |         let os_info = self.execute_command("uname -a")?.trim().to_string();
176 |         let gpu_info = self.execute_command("nvidia-smi --query-gpu=name --format=csv,noheader,nounits 2>/dev/null || echo 'No NVIDIA GPUs'")?.trim().to_string();
177 | 
178 |         Ok(RemoteHostInfo {
179 |             hostname,
180 |             os_info,
181 |             gpu_info,
182 |         })
183 |     }
184 | }
185 | 
186 | /// Information about the remote host
187 | #[derive(Debug, Clone)]
188 | pub struct RemoteHostInfo {
189 |     pub hostname: String,
190 |     pub os_info: String,
191 |     #[allow(dead_code)]
192 |     pub gpu_info: String,
193 | }
194 | 
195 | /// Execute a local gpukill command with remote forwarding
196 | pub fn execute_remote_operation(config: SshConfig, local_args: &[String]) -> Result<()> {
197 |     let remote = SshRemote::new(config);
198 | 
199 |     // Check if gpukill is available on remote host
200 |     if !remote.check_gpukill_availability()? {
201 |         return Err(anyhow::anyhow!(
202 |             "gpukill is not available on the remote host. Please install gpukill on the remote host first."
203 |         ));
204 |     }
205 | 
206 |     // Get remote host info
207 |     let host_info = remote.get_host_info()?;
208 |     info!(
209 |         "Remote host: {} ({})",
210 |         host_info.hostname, host_info.os_info
211 |     );
212 | 
213 |     // Execute the command on remote host
214 |     let output = remote.execute_gpukill(local_args)?;
215 | 
216 |     // Print the output
217 |     print!("{}", output);
218 | 
219 |     Ok(())
220 | }
221 | 
222 | #[cfg(test)]
223 | mod tests {
224 |     use super::*;
225 |     use std::time::Duration;
226 | 
227 |     #[test]
228 |     fn test_ssh_config_creation() {
229 |         let config = SshConfig::new("localhost".to_string(), 22, "testuser".to_string());
230 |         assert_eq!(config.host, "localhost");
231 |         assert_eq!(config.port, 22);
232 |         assert_eq!(config.username, "testuser");
233 |         assert_eq!(config.timeout, Duration::from_secs(30));
234 |     }
235 | 
236 |     #[test]
237 |     fn test_ssh_config_with_options() {
238 |         let config = SshConfig::new("localhost".to_string(), 22, "testuser".to_string())
239 |             .with_key_path("/path/to/key".to_string())
240 |             .with_password("password".to_string())
241 |             .with_timeout(Duration::from_secs(60));
242 | 
243 |         assert_eq!(config.key_path, Some("/path/to/key".to_string()));
244 |         assert_eq!(config.password, Some("password".to_string()));
245 |         assert_eq!(config.timeout, Duration::from_secs(60));
246 |     }
247 | }
248 | 


--------------------------------------------------------------------------------
/.github/workflows/self-hosted-setup.md:
--------------------------------------------------------------------------------
  1 | # Self-Hosted GPU Runner Setup Guide
  2 | 
  3 | This guide explains how to set up self-hosted GitHub Actions runners with GPU hardware for testing of GPU Kill.
  4 | 
  5 | ## Overview
  6 | 
  7 | GPU Kill requires actual GPU hardware to test all functionality. This setup provides:
  8 | - **NVIDIA GPU testing** with CUDA/NVML
  9 | - **AMD GPU testing** with ROCm
 10 | - **Intel GPU testing** with intel-gpu-tools
 11 | - **Apple Silicon testing** on macOS
 12 | - **Cross-platform compatibility** testing
 13 | 
 14 | ## Hardware Requirements
 15 | 
 16 | ### NVIDIA Runner
 17 | - **GPU**: Any NVIDIA GPU with CUDA support
 18 | - **OS**: Ubuntu 22.04 LTS
 19 | - **RAM**: 16GB+ recommended
 20 | - **Storage**: 100GB+ SSD
 21 | - **CPU**: 4+ cores
 22 | 
 23 | ### AMD Runner
 24 | - **GPU**: AMD GPU with ROCm support (RX 5000/6000 series, MI series)
 25 | - **OS**: Ubuntu 22.04 LTS
 26 | - **RAM**: 16GB+ recommended
 27 | - **Storage**: 100GB+ SSD
 28 | - **CPU**: 4+ cores
 29 | 
 30 | ### Intel Runner
 31 | - **GPU**: Intel Arc, Iris Xe, or integrated GPU
 32 | - **OS**: Ubuntu 22.04 LTS
 33 | - **RAM**: 8GB+ recommended
 34 | - **Storage**: 50GB+ SSD
 35 | - **CPU**: 4+ cores
 36 | 
 37 | ### Apple Silicon Runner
 38 | - **Hardware**: Mac Studio, MacBook Pro, or Mac mini with M1/M2/M3/M4
 39 | - **OS**: macOS 13+ (Ventura)
 40 | - **RAM**: 16GB+ recommended
 41 | - **Storage**: 100GB+ SSD
 42 | 
 43 | ## Setup Instructions
 44 | 
 45 | ### 1. NVIDIA Runner Setup
 46 | 
 47 | ```bash
 48 | # Install Ubuntu 22.04 LTS
 49 | # Update system
 50 | sudo apt update && sudo apt upgrade -y
 51 | 
 52 | # Install NVIDIA drivers
 53 | sudo apt install -y nvidia-driver-535
 54 | sudo reboot
 55 | 
 56 | # Verify NVIDIA installation
 57 | nvidia-smi
 58 | 
 59 | # Install development tools
 60 | sudo apt install -y build-essential curl git libssl-dev pkg-config
 61 | 
 62 | # Install Rust
 63 | curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh
 64 | source ~/.cargo/env
 65 | 
 66 | # Install GitHub Actions runner
 67 | mkdir actions-runner && cd actions-runner
 68 | curl -o actions-runner-linux-x64-2.311.0.tar.gz -L https://github.com/actions/runner/releases/download/v2.311.0/actions-runner-linux-x64-2.311.0.tar.gz
 69 | tar xzf ./actions-runner-linux-x64-2.311.0.tar.gz
 70 | 
 71 | # Configure runner (get token from GitHub repo settings)
 72 | ./config.sh --url https://github.com/treadiehq/gpu-kill --token <YOUR_TOKEN>
 73 | ./config.sh --name "nvidia-gpu-runner" --labels "self-hosted,gpu,nvidia,ubuntu-22.04"
 74 | 
 75 | # Install as service
 76 | sudo ./svc.sh install
 77 | sudo ./svc.sh start
 78 | ```
 79 | 
 80 | ### 2. AMD Runner Setup
 81 | 
 82 | ```bash
 83 | # Install Ubuntu 22.04 LTS
 84 | # Update system
 85 | sudo apt update && sudo apt upgrade -y
 86 | 
 87 | # Install ROCm
 88 | wget https://repo.radeon.com/amdgpu-install/5.7/ubuntu/jammy/amdgpu-install_5.7.50700-1_all.deb
 89 | sudo dpkg -i amdgpu-install_5.7.50700-1_all.deb
 90 | sudo apt-get update
 91 | sudo amdgpu-install --usecase=rocm
 92 | 
 93 | # Verify ROCm installation
 94 | rocm-smi
 95 | rocminfo
 96 | 
 97 | # Install development tools
 98 | sudo apt install -y build-essential curl git libssl-dev pkg-config
 99 | 
100 | # Install Rust
101 | curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh
102 | source ~/.cargo/env
103 | 
104 | # Install GitHub Actions runner (same as NVIDIA)
105 | mkdir actions-runner && cd actions-runner
106 | curl -o actions-runner-linux-x64-2.311.0.tar.gz -L https://github.com/actions/runner/releases/download/v2.311.0/actions-runner-linux-x64-2.311.0.tar.gz
107 | tar xzf ./actions-runner-linux-x64-2.311.0.tar.gz
108 | 
109 | # Configure runner
110 | ./config.sh --url https://github.com/treadiehq/gpu-kill --token <YOUR_TOKEN>
111 | ./config.sh --name "amd-gpu-runner" --labels "self-hosted,gpu,amd,ubuntu-22.04"
112 | 
113 | # Install as service
114 | sudo ./svc.sh install
115 | sudo ./svc.sh start
116 | ```
117 | 
118 | ### 3. Intel Runner Setup
119 | 
120 | ```bash
121 | # Install Ubuntu 22.04 LTS
122 | # Update system
123 | sudo apt update && sudo apt upgrade -y
124 | 
125 | # Install Intel GPU tools
126 | sudo apt install -y intel-gpu-tools
127 | 
128 | # Verify Intel GPU tools
129 | intel_gpu_top --help
130 | 
131 | # Install development tools
132 | sudo apt install -y build-essential curl git libssl-dev pkg-config
133 | 
134 | # Install Rust
135 | curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh
136 | source ~/.cargo/env
137 | 
138 | # Install GitHub Actions runner
139 | mkdir actions-runner && cd actions-runner
140 | curl -o actions-runner-linux-x64-2.311.0.tar.gz -L https://github.com/actions/runner/releases/download/v2.311.0/actions-runner-linux-x64-2.311.0.tar.gz
141 | tar xzf ./actions-runner-linux-x64-2.311.0.tar.gz
142 | 
143 | # Configure runner
144 | ./config.sh --url https://github.com/treadiehq/gpu-kill --token <YOUR_TOKEN>
145 | ./config.sh --name "intel-gpu-runner" --labels "self-hosted,gpu,intel,ubuntu-22.04"
146 | 
147 | # Install as service
148 | sudo ./svc.sh install
149 | sudo ./svc.sh start
150 | ```
151 | 
152 | ### 4. Apple Silicon Runner Setup
153 | 
154 | ```bash
155 | # Install macOS 13+ (Ventura)
156 | # Install Xcode command line tools
157 | xcode-select --install
158 | 
159 | # Install Homebrew
160 | /bin/bash -c "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/HEAD/install.sh)"
161 | 
162 | # Install Rust
163 | curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh
164 | source ~/.cargo/env
165 | 
166 | # Verify Apple Silicon GPU
167 | system_profiler SPDisplaysDataType
168 | 
169 | # Install GitHub Actions runner
170 | mkdir actions-runner && cd actions-runner
171 | curl -o actions-runner-osx-arm64-2.311.0.tar.gz -L https://github.com/actions/runner/releases/download/v2.311.0/actions-runner-osx-arm64-2.311.0.tar.gz
172 | tar xzf ./actions-runner-osx-arm64-2.311.0.tar.gz
173 | 
174 | # Configure runner
175 | ./config.sh --url https://github.com/treadiehq/gpu-kill --token <YOUR_TOKEN>
176 | ./config.sh --name "apple-gpu-runner" --labels "self-hosted,gpu,apple,macos-13"
177 | 
178 | # Install as service
179 | ./svc.sh install
180 | ./svc.sh start
181 | ```
182 | 
183 | ## Runner Labels
184 | 
185 | Each runner should be configured with these labels:
186 | - `self-hosted` - Required for self-hosted runners
187 | - `gpu` - Indicates GPU hardware availability
188 | - `nvidia`/`amd`/`intel`/`apple` - GPU vendor
189 | - `ubuntu-22.04`/`macos-13` - Operating system
190 | - `stress-test` - For runners capable of stress testing
191 | 
192 | ## Security Considerations
193 | 
194 | 1. **Network Security**: Ensure runners are behind a firewall
195 | 2. **Access Control**: Limit who can access the runner machines
196 | 3. **Token Management**: Regularly rotate GitHub tokens
197 | 4. **Monitoring**: Monitor runner health and performance
198 | 5. **Updates**: Keep runners updated with security patches
199 | 
200 | ## Monitoring and Maintenance
201 | 
202 | ### Health Checks
203 | ```bash
204 | # Check runner status
205 | sudo systemctl status actions.runner.*
206 | 
207 | # Check GPU status
208 | nvidia-smi  # NVIDIA
209 | rocm-smi    # AMD
210 | intel_gpu_top --help  # Intel
211 | system_profiler SPDisplaysDataType  # Apple
212 | ```
213 | 
214 | ### Logs
215 | ```bash
216 | # View runner logs
217 | sudo journalctl -u actions.runner.* -f
218 | 
219 | # View GitHub Actions logs
220 | tail -f /home/runner/_diag/Runner_*.log
221 | ```
222 | 
223 | ### Updates
224 | ```bash
225 | # Update runner software
226 | cd actions-runner
227 | ./config.sh remove --token <NEW_TOKEN>
228 | # Download new version
229 | ./config.sh --url https://github.com/treadiehq/gpu-kill --token <NEW_TOKEN>
230 | ```
231 | 
232 | ## Cost Optimization
233 | 
234 | 1. **Scheduled Testing**: Run tests during off-peak hours
235 | 2. **Resource Scaling**: Use smaller instances for basic tests
236 | 3. **Caching**: Implement aggressive caching for dependencies
237 | 4. **Parallel Testing**: Run multiple test suites in parallel
238 | 
239 | ## Troubleshooting
240 | 
241 | ### Common Issues
242 | 
243 | 1. **GPU Not Detected**
244 |    ```bash
245 |    # Check GPU status
246 |    lspci | grep -i vga
247 |    nvidia-smi  # or rocm-smi, intel_gpu_top
248 |    ```
249 | 
250 | 2. **Permission Issues**
251 |    ```bash
252 |    # Add user to video group
253 |    sudo usermod -a -G video $USER
254 |    sudo usermod -a -G render $USER
255 |    ```
256 | 
257 | 3. **Driver Issues**
258 |    ```bash
259 |    # Reinstall drivers
260 |    sudo apt purge nvidia-*  # NVIDIA
261 |    sudo apt purge rocm-*    # AMD
262 |    sudo apt install nvidia-driver-535  # Reinstall
263 |    ```
264 | 
265 | 4. **Runner Connection Issues**
266 |    ```bash
267 |    # Check network connectivity
268 |    curl -I https://github.com
269 |    # Restart runner service
270 |    sudo systemctl restart actions.runner.*
271 |    ```
272 | 
273 | ## Integration with GPU Kill
274 | 
275 | The runners will automatically execute the GPU testing workflow when:
276 | - Code is pushed to main/develop branches
277 | - Pull requests are opened
278 | - Manual workflow dispatch is triggered
279 | 
280 | Tests include:
281 | - GPU detection and enumeration
282 | - Performance benchmarking
283 | - Memory usage testing
284 | - Stress testing
285 | - Cross-platform compatibility
286 | - Security auditing
287 | 


--------------------------------------------------------------------------------
/docs/HOTAISLE_INTEGRATION.md:
--------------------------------------------------------------------------------
  1 | # Hot Aisle Integration for GPU Testing
  2 | 
  3 | This document describes the **optional** integration between GPU Kill and Hot Aisle's infrastructure for automated GPU testing in CI/CD pipelines.
  4 | 
  5 | > **Note**: Hot Aisle integration is an optional feature that must be enabled with the `hotaisle` feature flag.
  6 | 
  7 | ## Overview
  8 | 
  9 | The Hot Aisle integration enables GPU Kill to run comprehensive tests on real GPU hardware by:
 10 | 
 11 | 1. **Provisioning GPU instances** on-demand via Hot Aisle's API
 12 | 2. **Running GPU tests** on actual hardware (NVIDIA, AMD, Intel, Apple Silicon)
 13 | 3. **Automated cleanup** to minimize costs
 14 | 4. **Comprehensive reporting** of test results
 15 | 
 16 | ## Architecture
 17 | 
 18 | ```
 19 | ┌─────────────────┐    ┌─────────────────┐    ┌─────────────────┐
 20 | │   GitHub        │    │   Hot Aisle     │    │   GPU Hardware  │
 21 | │   Actions       │◄──►│   API           │◄──►│   (NVIDIA/AMD)  │
 22 | │   (CI/CD)       │    │   (Backend)     │    │   (Intel/Apple) │
 23 | └─────────────────┘    └─────────────────┘    └─────────────────┘
 24 | ```
 25 | 
 26 | ## Components
 27 | 
 28 | ### 1. Hot Aisle API Client (`src/hotaisle_client.rs`)
 29 | 
 30 | Rust client for interacting with Hot Aisle's API:
 31 | 
 32 | ```rust
 33 | use gpukill::hotaisle_client::{HotAisleClient, GpuInstanceConfig};
 34 | 
 35 | let client = HotAisleClient::new(api_key, None);
 36 | 
 37 | let config = GpuInstanceConfig {
 38 |     gpu_type: "nvidia".to_string(),
 39 |     duration_minutes: 30,
 40 |     instance_type: Some("g4dn.xlarge".to_string()),
 41 |     labels: Some(vec!["ci-test".to_string()]),
 42 | };
 43 | 
 44 | let instance = client.provision_gpu_instance(config).await?;
 45 | ```
 46 | 
 47 | ### 2. GPU Test Script (`scripts/run-gpu-tests.sh`)
 48 | 
 49 | Comprehensive test script that runs on provisioned instances:
 50 | 
 51 | ### 3. Integration Test Script (`scripts/test-hotaisle-integration-simple.sh`)
 52 | 
 53 | CI-friendly test script that validates the Hot Aisle integration without requiring API access:
 54 | 
 55 | - **GPU Detection Tests**: Verify GPU enumeration and information retrieval
 56 | - **Vendor-Specific Tests**: NVIDIA (nvidia-smi), AMD (rocm-smi, amd-smi), Intel (intel_gpu_top)
 57 | - **Performance Tests**: Run GPU hardware tests and benchmarks
 58 | - **Stress Tests**: Multiple iterations to ensure reliability
 59 | - **Report Generation**: Detailed test reports with system information
 60 | 
 61 | ### 3. GitHub Actions Workflow (`.github/workflows/hotaisle-gpu-testing.yml`)
 62 | 
 63 | Automated CI/CD pipeline that:
 64 | 
 65 | - **Provisions GPU instances** based on matrix strategy
 66 | - **Deploys GPU Kill** to instances
 67 | - **Runs comprehensive tests** on real hardware
 68 | - **Collects results** and uploads artifacts
 69 | - **Cleans up instances** automatically
 70 | 
 71 | ## Setup Instructions
 72 | 
 73 | ### 1. Enable Hot Aisle Feature
 74 | 
 75 | Build GPU Kill with the Hot Aisle feature enabled:
 76 | 
 77 | ```bash
 78 | # Build with Hot Aisle integration
 79 | cargo build --release --features hotaisle
 80 | 
 81 | # Or install with Hot Aisle integration
 82 | cargo install --path . --features hotaisle
 83 | ```
 84 | 
 85 | ### 2. Hot Aisle API Key
 86 | 
 87 | Add your Hot Aisle API key to GitHub Secrets:
 88 | 
 89 | ```bash
 90 | # In your GitHub repository settings:
 91 | # Settings → Secrets and variables → Actions → New repository secret
 92 | # Name: HOTAISLE_API_KEY
 93 | # Value: your-hotaisle-api-key
 94 | ```
 95 | 
 96 | ### 3. Configure GPU Types
 97 | 
 98 | The workflow supports testing multiple GPU types:
 99 | 
100 | ```yaml
101 | # Default configuration
102 | matrix:
103 |   gpu_type: [nvidia, amd, intel]
104 | 
105 | # Manual dispatch with custom GPU types
106 | # Use workflow_dispatch with inputs:
107 | # gpu_types: "nvidia,amd,intel,apple-silicon"
108 | ```
109 | 
110 | ### 4. Test Duration
111 | 
112 | Configure test duration to balance thoroughness with cost:
113 | 
114 | ```yaml
115 | # Default: 30 minutes
116 | # Can be overridden via workflow_dispatch
117 | test_duration: "30"  # minutes
118 | ```
119 | 
120 | ## Usage
121 | 
122 | ### Integration Testing
123 | 
124 | The integration is validated automatically via the "Test Hot Aisle Integration" workflow:
125 | - **Runs on**: Changes to Hot Aisle-related files
126 | - **Validates**: Build system, feature flags, documentation, and workflow syntax
127 | - **No API key required**: Tests the integration structure without actual GPU provisioning
128 | 
129 | ### Manual GPU Testing
130 | 
131 | Trigger tests manually via GitHub Actions:
132 | 
133 | 1. Go to **Actions** tab in your repository
134 | 2. Select **Hot Aisle GPU Testing** workflow
135 | 3. Click **Run workflow**
136 | 4. Configure parameters:
137 |    - **GPU types**: Comma-separated list (e.g., `nvidia,amd,intel`)
138 |    - **Test duration**: Minutes (e.g., `30`)
139 | 
140 | ### Local Testing
141 | 
142 | Test the integration locally:
143 | 
144 | ```bash
145 | # Build GPU Kill
146 | cargo build --release
147 | 
148 | # Run GPU tests (requires GPU hardware)
149 | ./scripts/run-gpu-tests.sh nvidia
150 | ```
151 | 
152 | ## Supported GPU Types
153 | 
154 | | GPU Type | Tools Used | Tests |
155 | |----------|------------|-------|
156 | | **NVIDIA** | nvidia-smi, NVML | GPU enumeration, memory, utilization, temperature, power |
157 | | **AMD** | rocm-smi, amd-smi | GPU enumeration, memory, utilization, temperature, power |
158 | | **Intel** | intel_gpu_top | GPU enumeration, utilization, memory estimation |
159 | | **Apple Silicon** | system_profiler | GPU enumeration, memory usage, Metal processes |
160 | 
161 | ## Cost Optimization
162 | 
163 | ### 1. Instance Lifecycle Management
164 | 
165 | - **Automatic provisioning** only when needed
166 | - **Immediate cleanup** after tests complete
167 | - **Timeout protection** to prevent runaway costs
168 | 
169 | ### 2. Test Duration Control
170 | 
171 | - **Configurable duration** (default: 30 minutes)
172 | - **Fast failure** for quick feedback
173 | - **Comprehensive testing** when needed
174 | 
175 | ### 3. Resource Efficiency
176 | 
177 | - **Parallel testing** across GPU types
178 | - **Shared infrastructure** via Hot Aisle
179 | - **No always-on runners** required
180 | 
181 | ## Test Results
182 | 
183 | ### Artifacts
184 | 
185 | Each test run produces:
186 | 
187 | - **Test Output Log**: Detailed execution logs
188 | - **Test Report**: Comprehensive system and GPU information
189 | - **Retention**: 30 days for debugging
190 | 
191 | ### Metrics
192 | 
193 | Tests measure:
194 | 
195 | - **GPU Detection**: Number of GPUs found
196 | - **Information Retrieval**: JSON validity and completeness
197 | - **Performance**: Test execution time
198 | - **Reliability**: Stress test success rate
199 | 
200 | ## Troubleshooting
201 | 
202 | ### Common Issues
203 | 
204 | 1. **Instance Provisioning Fails**
205 |    - Check Hot Aisle API key validity
206 |    - Verify GPU type availability
207 |    - Check Hot Aisle service status
208 | 
209 | 2. **SSH Connection Issues**
210 |    - Verify instance IP address
211 |    - Check SSH key generation
212 |    - Ensure instance is ready
213 | 
214 | 3. **Test Failures**
215 |    - Review test output logs
216 |    - Check GPU driver installation
217 |    - Verify tool availability (nvidia-smi, rocm-smi, etc.)
218 | 
219 | ### Debug Mode
220 | 
221 | Enable debug logging:
222 | 
223 | ```bash
224 | export RUST_LOG=debug
225 | export RUST_BACKTRACE=1
226 | ```
227 | 
228 | ## API Reference
229 | 
230 | ### HotAisleClient
231 | 
232 | ```rust
233 | impl HotAisleClient {
234 |     pub fn new(api_key: String, base_url: Option<String>) -> Self
235 |     pub async fn provision_gpu_instance(&self, config: GpuInstanceConfig) -> Result<GpuInstance>
236 |     pub async fn wait_for_instance_ready(&self, instance_id: &str, timeout_minutes: u32) -> Result<GpuInstance>
237 |     pub async fn get_instance(&self, instance_id: &str) -> Result<GpuInstance>
238 |     pub async fn run_gpu_tests(&self, instance: &GpuInstance, test_config: &GpuTestConfig) -> Result<GpuTestResults>
239 |     pub async fn terminate_instance(&self, instance_id: &str) -> Result<()>
240 |     pub async fn list_available_gpu_types(&self) -> Result<Vec<String>>
241 | }
242 | ```
243 | 
244 | ### Configuration Types
245 | 
246 | ```rust
247 | pub struct GpuInstanceConfig {
248 |     pub gpu_type: String,           // nvidia, amd, intel, apple-silicon
249 |     pub duration_minutes: u32,      // Instance lifetime
250 |     pub instance_type: Option<String>, // Auto-selected if None
251 |     pub labels: Option<Vec<String>>,   // Custom labels
252 | }
253 | 
254 | pub struct GpuTestConfig {
255 |     pub test_command: String,       // Command to execute
256 |     pub timeout_minutes: u32,       // Test timeout
257 |     pub env_vars: Option<HashMap<String, String>>, // Environment variables
258 |     pub working_dir: Option<String>, // Working directory
259 | }
260 | ```
261 | 
262 | ## Future Enhancements
263 | 
264 | ### Planned Features
265 | 
266 | 1. **Advanced GPU Testing**
267 |    - CUDA/ROCm kernel testing
268 |    - Memory bandwidth benchmarks
269 |    - Multi-GPU coordination tests
270 | 
271 | 2. **Cost Analytics**
272 |    - Test cost tracking
273 |    - Optimization recommendations
274 |    - Budget alerts
275 | 
276 | 3. **Integration Improvements**
277 |    - Webhook notifications
278 |    - Slack/Teams integration
279 |    - Custom test configurations
280 | 
281 | ### Contributing
282 | 
283 | To contribute to the Hot Aisle integration:
284 | 
285 | 1. **Fork the repository**
286 | 2. **Create a feature branch**
287 | 3. **Add tests** for new functionality
288 | 4. **Update documentation**
289 | 5. **Submit a pull request**
290 | 
291 | ## Support
292 | 
293 | For issues related to:
294 | 
295 | - **GPU Kill**: Create an issue in this repository
296 | - **Hot Aisle API**: Contact Hot Aisle support
297 | - **Integration**: Check the troubleshooting section above
298 | 
299 | ## License
300 | 
301 | This integration is part of GPU Kill and follows the same license terms.
302 | 


--------------------------------------------------------------------------------
/src/hotaisle_client.rs:
--------------------------------------------------------------------------------
  1 | //! Hot Aisle API client for GPU instance provisioning and management
  2 | //!
  3 | //! This module provides integration with Hot Aisle's infrastructure
  4 | //! for on-demand GPU testing in CI/CD pipelines.
  5 | 
  6 | use anyhow::Result;
  7 | use serde::{Deserialize, Serialize};
  8 | use std::time::Duration;
  9 | use tokio::time::sleep;
 10 | 
 11 | /// Hot Aisle API client for managing GPU instances
 12 | pub struct HotAisleClient {
 13 |     api_key: String,
 14 |     base_url: String,
 15 |     client: reqwest::Client,
 16 | }
 17 | 
 18 | /// GPU instance configuration
 19 | #[derive(Debug, Clone, Serialize, Deserialize)]
 20 | pub struct GpuInstanceConfig {
 21 |     /// GPU type (nvidia, amd, intel, apple-silicon)
 22 |     pub gpu_type: String,
 23 |     /// Instance duration in minutes
 24 |     pub duration_minutes: u32,
 25 |     /// Instance size/type
 26 |     pub instance_type: Option<String>,
 27 |     /// Custom labels for the instance
 28 |     pub labels: Option<Vec<String>>,
 29 | }
 30 | 
 31 | /// GPU instance information
 32 | #[derive(Debug, Clone, Serialize, Deserialize)]
 33 | pub struct GpuInstance {
 34 |     /// Unique instance ID
 35 |     pub id: String,
 36 |     /// Instance IP address
 37 |     pub ip_address: String,
 38 |     /// SSH connection details
 39 |     pub ssh_config: SshConfig,
 40 |     /// GPU type
 41 |     pub gpu_type: String,
 42 |     /// Instance status
 43 |     pub status: String,
 44 |     /// Creation timestamp
 45 |     pub created_at: String,
 46 |     /// Expiration timestamp
 47 |     pub expires_at: String,
 48 | }
 49 | 
 50 | /// SSH connection configuration
 51 | #[derive(Debug, Clone, Serialize, Deserialize)]
 52 | pub struct SshConfig {
 53 |     /// SSH username
 54 |     pub username: String,
 55 |     /// SSH port (default: 22)
 56 |     pub port: u16,
 57 |     /// SSH key path or content
 58 |     pub key_path: Option<String>,
 59 | }
 60 | 
 61 | /// Test results from GPU instance
 62 | #[derive(Debug, Clone, Serialize, Deserialize)]
 63 | pub struct GpuTestResults {
 64 |     /// Instance ID where tests were run
 65 |     pub instance_id: String,
 66 |     /// Test execution status
 67 |     pub status: String,
 68 |     /// Test output/logs
 69 |     pub output: String,
 70 |     /// Test duration in seconds
 71 |     pub duration_seconds: u64,
 72 |     /// Number of tests passed
 73 |     pub tests_passed: u32,
 74 |     /// Number of tests failed
 75 |     pub tests_failed: u32,
 76 |     /// Number of tests skipped
 77 |     pub tests_skipped: u32,
 78 | }
 79 | 
 80 | impl HotAisleClient {
 81 |     /// Create a new Hot Aisle client
 82 |     pub fn new(api_key: String, base_url: Option<String>) -> Self {
 83 |         let base_url = base_url.unwrap_or_else(|| "https://admin.hotaisle.app/api".to_string());
 84 | 
 85 |         Self {
 86 |             api_key,
 87 |             base_url,
 88 |             client: reqwest::Client::new(),
 89 |         }
 90 |     }
 91 | 
 92 |     /// Provision a new GPU instance
 93 |     pub async fn provision_gpu_instance(&self, config: GpuInstanceConfig) -> Result<GpuInstance> {
 94 |         let url = format!("{}/instances", self.base_url);
 95 | 
 96 |         let response = self
 97 |             .client
 98 |             .post(&url)
 99 |             .header("Authorization", format!("Bearer {}", self.api_key))
100 |             .header("Content-Type", "application/json")
101 |             .json(&config)
102 |             .send()
103 |             .await?;
104 | 
105 |         let status = response.status();
106 |         if !status.is_success() {
107 |             let error_text = response.text().await?;
108 |             return Err(anyhow::anyhow!(
109 |                 "Failed to provision GPU instance: {} - {}",
110 |                 status,
111 |                 error_text
112 |             ));
113 |         }
114 | 
115 |         let instance: GpuInstance = response.json().await?;
116 |         Ok(instance)
117 |     }
118 | 
119 |     /// Wait for instance to be ready
120 |     pub async fn wait_for_instance_ready(
121 |         &self,
122 |         instance_id: &str,
123 |         timeout_minutes: u32,
124 |     ) -> Result<GpuInstance> {
125 |         let timeout = Duration::from_secs(timeout_minutes as u64 * 60);
126 |         let start = std::time::Instant::now();
127 | 
128 |         while start.elapsed() < timeout {
129 |             let instance = self.get_instance(instance_id).await?;
130 | 
131 |             match instance.status.as_str() {
132 |                 "ready" | "running" => return Ok(instance),
133 |                 "failed" | "error" => {
134 |                     return Err(anyhow::anyhow!("Instance {} failed to start", instance_id));
135 |                 }
136 |                 _ => {
137 |                     // Still provisioning, wait and retry
138 |                     sleep(Duration::from_secs(10)).await;
139 |                 }
140 |             }
141 |         }
142 | 
143 |         Err(anyhow::anyhow!(
144 |             "Instance {} did not become ready within {} minutes",
145 |             instance_id,
146 |             timeout_minutes
147 |         ))
148 |     }
149 | 
150 |     /// Get instance information
151 |     pub async fn get_instance(&self, instance_id: &str) -> Result<GpuInstance> {
152 |         let url = format!("{}/instances/{}", self.base_url, instance_id);
153 | 
154 |         let response = self
155 |             .client
156 |             .get(&url)
157 |             .header("Authorization", format!("Bearer {}", self.api_key))
158 |             .send()
159 |             .await?;
160 | 
161 |         let status = response.status();
162 |         if !status.is_success() {
163 |             let error_text = response.text().await?;
164 |             return Err(anyhow::anyhow!(
165 |                 "Failed to get instance {}: {} - {}",
166 |                 instance_id,
167 |                 status,
168 |                 error_text
169 |             ));
170 |         }
171 | 
172 |         let instance: GpuInstance = response.json().await?;
173 |         Ok(instance)
174 |     }
175 | 
176 |     /// Execute GPU tests on an instance
177 |     pub async fn run_gpu_tests(
178 |         &self,
179 |         instance: &GpuInstance,
180 |         test_config: &GpuTestConfig,
181 |     ) -> Result<GpuTestResults> {
182 |         let url = format!("{}/instances/{}/execute", self.base_url, instance.id);
183 | 
184 |         let response = self
185 |             .client
186 |             .post(&url)
187 |             .header("Authorization", format!("Bearer {}", self.api_key))
188 |             .header("Content-Type", "application/json")
189 |             .json(test_config)
190 |             .send()
191 |             .await?;
192 | 
193 |         let status = response.status();
194 |         if !status.is_success() {
195 |             let error_text = response.text().await?;
196 |             return Err(anyhow::anyhow!(
197 |                 "Failed to run tests on instance {}: {} - {}",
198 |                 instance.id,
199 |                 status,
200 |                 error_text
201 |             ));
202 |         }
203 | 
204 |         let results: GpuTestResults = response.json().await?;
205 |         Ok(results)
206 |     }
207 | 
208 |     /// Terminate an instance
209 |     pub async fn terminate_instance(&self, instance_id: &str) -> Result<()> {
210 |         let url = format!("{}/instances/{}", self.base_url, instance_id);
211 | 
212 |         let response = self
213 |             .client
214 |             .delete(&url)
215 |             .header("Authorization", format!("Bearer {}", self.api_key))
216 |             .send()
217 |             .await?;
218 | 
219 |         let status = response.status();
220 |         if !status.is_success() {
221 |             let error_text = response.text().await?;
222 |             return Err(anyhow::anyhow!(
223 |                 "Failed to terminate instance {}: {} - {}",
224 |                 instance_id,
225 |                 status,
226 |                 error_text
227 |             ));
228 |         }
229 | 
230 |         Ok(())
231 |     }
232 | 
233 |     /// List available GPU types
234 |     pub async fn list_available_gpu_types(&self) -> Result<Vec<String>> {
235 |         let url = format!("{}/gpu-types", self.base_url);
236 | 
237 |         let response = self
238 |             .client
239 |             .get(&url)
240 |             .header("Authorization", format!("Bearer {}", self.api_key))
241 |             .send()
242 |             .await?;
243 | 
244 |         let status = response.status();
245 |         if !status.is_success() {
246 |             let error_text = response.text().await?;
247 |             return Err(anyhow::anyhow!(
248 |                 "Failed to list GPU types: {} - {}",
249 |                 status,
250 |                 error_text
251 |             ));
252 |         }
253 | 
254 |         let gpu_types: Vec<String> = response.json().await?;
255 |         Ok(gpu_types)
256 |     }
257 | }
258 | 
259 | /// GPU test configuration
260 | #[derive(Debug, Clone, Serialize, Deserialize)]
261 | pub struct GpuTestConfig {
262 |     /// Test command to execute
263 |     pub test_command: String,
264 |     /// Test timeout in minutes
265 |     pub timeout_minutes: u32,
266 |     /// Environment variables
267 |     pub env_vars: Option<std::collections::HashMap<String, String>>,
268 |     /// Working directory
269 |     pub working_dir: Option<String>,
270 | }
271 | 
272 | #[cfg(test)]
273 | mod tests {
274 |     use super::*;
275 | 
276 |     #[tokio::test]
277 |     async fn test_hotaisle_client_creation() {
278 |         let client = HotAisleClient::new("test-key".to_string(), None);
279 |         assert_eq!(client.base_url, "https://admin.hotaisle.app/api");
280 |     }
281 | 
282 |     #[tokio::test]
283 |     async fn test_gpu_instance_config() {
284 |         let config = GpuInstanceConfig {
285 |             gpu_type: "nvidia".to_string(),
286 |             duration_minutes: 30,
287 |             instance_type: Some("g4dn.xlarge".to_string()),
288 |             labels: Some(vec!["ci-test".to_string(), "gpu-kill".to_string()]),
289 |         };
290 | 
291 |         assert_eq!(config.gpu_type, "nvidia");
292 |         assert_eq!(config.duration_minutes, 30);
293 |     }
294 | }
295 | 


--------------------------------------------------------------------------------
/scripts/run-gpu-tests.sh:
--------------------------------------------------------------------------------
  1 | #!/bin/bash
  2 | 
  3 | # GPU Kill - Hot Aisle GPU Testing Script
  4 | # This script runs comprehensive GPU tests on Hot Aisle provisioned instances
  5 | 
  6 | set -euo pipefail
  7 | 
  8 | # Configuration
  9 | SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
 10 | PROJECT_ROOT="$(dirname "$SCRIPT_DIR")"
 11 | LOG_FILE="/tmp/gpu-kill-tests.log"
 12 | 
 13 | # Colors for output
 14 | RED='\033[0;31m'
 15 | GREEN='\033[0;32m'
 16 | YELLOW='\033[1;33m'
 17 | BLUE='\033[0;34m'
 18 | NC='\033[0m' # No Color
 19 | 
 20 | # Logging functions
 21 | log_info() {
 22 |     echo -e "${BLUE}[INFO]${NC} $1" | tee -a "$LOG_FILE"
 23 | }
 24 | 
 25 | log_success() {
 26 |     echo -e "${GREEN}[SUCCESS]${NC} $1" | tee -a "$LOG_FILE"
 27 | }
 28 | 
 29 | log_warning() {
 30 |     echo -e "${YELLOW}[WARNING]${NC} $1" | tee -a "$LOG_FILE"
 31 | }
 32 | 
 33 | log_error() {
 34 |     echo -e "${RED}[ERROR]${NC} $1" | tee -a "$LOG_FILE"
 35 | }
 36 | 
 37 | # Function to check prerequisites
 38 | check_prerequisites() {
 39 |     log_info "Checking prerequisites..."
 40 |     
 41 |     # Check if we're in the right directory
 42 |     if [[ ! -f "$PROJECT_ROOT/Cargo.toml" ]]; then
 43 |         log_error "Not in GPU Kill project root. Please run from project directory."
 44 |         exit 1
 45 |     fi
 46 |     
 47 |     # Check if cargo is available
 48 |     if ! command -v cargo &> /dev/null; then
 49 |         log_error "Cargo not found. Please install Rust toolchain."
 50 |         exit 1
 51 |     fi
 52 |     
 53 |     # Check if git is available
 54 |     if ! command -v git &> /dev/null; then
 55 |         log_error "Git not found. Please install git."
 56 |         exit 1
 57 |     fi
 58 |     
 59 |     log_success "Prerequisites check passed"
 60 | }
 61 | 
 62 | # Function to build GPU Kill
 63 | build_gpukill() {
 64 |     log_info "Building GPU Kill..."
 65 |     
 66 |     cd "$PROJECT_ROOT"
 67 |     
 68 |     # Build in release mode for better performance
 69 |     if cargo build --release; then
 70 |         log_success "GPU Kill built successfully"
 71 |     else
 72 |         log_error "Failed to build GPU Kill"
 73 |         exit 1
 74 |     fi
 75 | }
 76 | 
 77 | # Function to run basic GPU detection tests
 78 | run_gpu_detection_tests() {
 79 |     log_info "Running GPU detection tests..."
 80 |     
 81 |     local gpu_type="$1"
 82 |     local test_results=()
 83 |     
 84 |     # Test 1: List GPUs
 85 |     log_info "Testing GPU enumeration..."
 86 |     if ./target/release/gpukill --list > /tmp/gpu-list.txt 2>&1; then
 87 |         local gpu_count=$(grep -c "GPU [0-9]" /tmp/gpu-list.txt || echo "0")
 88 |         log_success "Found $gpu_count GPU(s)"
 89 |         test_results+=("gpu_enumeration:passed:$gpu_count")
 90 |     else
 91 |         log_error "GPU enumeration failed"
 92 |         test_results+=("gpu_enumeration:failed:0")
 93 |     fi
 94 |     
 95 |     # Test 2: GPU information
 96 |     log_info "Testing GPU information retrieval..."
 97 |     if ./target/release/gpukill --list > /tmp/gpu-info.json 2>&1; then
 98 |         local json_valid=$(python3 -m json.tool /tmp/gpu-info.json > /dev/null 2>&1 && echo "true" || echo "false")
 99 |         if [[ "$json_valid" == "true" ]]; then
100 |             log_success "GPU information JSON is valid"
101 |             test_results+=("gpu_info_json:passed:valid")
102 |         else
103 |             log_warning "GPU information JSON is invalid"
104 |             test_results+=("gpu_info_json:failed:invalid")
105 |         fi
106 |     else
107 |         log_error "GPU information retrieval failed"
108 |         test_results+=("gpu_info_json:failed:error")
109 |     fi
110 |     
111 |     # Test 3: GPU-specific tests based on type
112 |     case "$gpu_type" in
113 |         "nvidia")
114 |             run_nvidia_specific_tests
115 |             ;;
116 |         "amd")
117 |             run_amd_specific_tests
118 |             ;;
119 |         "intel")
120 |             run_intel_specific_tests
121 |             ;;
122 |         "apple-silicon")
123 |             run_apple_specific_tests
124 |             ;;
125 |         *)
126 |             log_warning "Unknown GPU type: $gpu_type"
127 |             ;;
128 |     esac
129 |     
130 |     # Output test results
131 |     echo "=== GPU Detection Test Results ==="
132 |     for result in "${test_results[@]}"; do
133 |         echo "$result"
134 |     done
135 | }
136 | 
137 | # Function to run NVIDIA-specific tests
138 | run_nvidia_specific_tests() {
139 |     log_info "Running NVIDIA-specific tests..."
140 |     
141 |     # Test nvidia-smi availability
142 |     if command -v nvidia-smi &> /dev/null; then
143 |         log_success "nvidia-smi is available"
144 |         nvidia-smi --query-gpu=name,memory.total,memory.used --format=csv,noheader,nounits
145 |     else
146 |         log_warning "nvidia-smi not found"
147 |     fi
148 | }
149 | 
150 | # Function to run AMD-specific tests
151 | run_amd_specific_tests() {
152 |     log_info "Running AMD-specific tests..."
153 |     
154 |     # Test rocm-smi availability
155 |     if command -v rocm-smi &> /dev/null; then
156 |         log_success "rocm-smi is available"
157 |         rocm-smi --showproductname
158 |         rocm-smi --showuse
159 |         rocm-smi --showtemp
160 |         rocm-smi --showpower
161 |         rocm-smi --showmemuse
162 |     else
163 |         log_warning "rocm-smi not found"
164 |     fi
165 |     
166 |     # Test amd-smi availability (newer tool)
167 |     if command -v amd-smi &> /dev/null; then
168 |         log_success "amd-smi is available"
169 |         amd-smi
170 |     else
171 |         log_warning "amd-smi not found"
172 |     fi
173 | }
174 | 
175 | # Function to run Intel-specific tests
176 | run_intel_specific_tests() {
177 |     log_info "Running Intel-specific tests..."
178 |     
179 |     # Test intel_gpu_top availability
180 |     if command -v intel_gpu_top &> /dev/null; then
181 |         log_success "intel_gpu_top is available"
182 |         timeout 5 intel_gpu_top -l 1 || true
183 |     else
184 |         log_warning "intel_gpu_top not found"
185 |     fi
186 | }
187 | 
188 | # Function to run Apple Silicon-specific tests
189 | run_apple_specific_tests() {
190 |     log_info "Running Apple Silicon-specific tests..."
191 |     
192 |     # Test system_profiler for GPU info
193 |     if command -v system_profiler &> /dev/null; then
194 |         log_success "system_profiler is available"
195 |         system_profiler SPDisplaysDataType | grep -A 5 "Chipset Model" || true
196 |     else
197 |         log_warning "system_profiler not found"
198 |     fi
199 | }
200 | 
201 | # Function to run performance tests
202 | run_performance_tests() {
203 |     log_info "Running GPU performance tests..."
204 |     
205 |     local gpu_type="$1"
206 |     local start_time=$(date +%s)
207 |     
208 |     # Run GPU hardware tests
209 |     if cargo test --test gpu_hardware_tests --release; then
210 |         local end_time=$(date +%s)
211 |         local duration=$((end_time - start_time))
212 |         log_success "GPU performance tests completed in ${duration}s"
213 |     else
214 |         log_warning "Some GPU performance tests failed or were skipped"
215 |     fi
216 | }
217 | 
218 | # Function to run stress tests
219 | run_stress_tests() {
220 |     log_info "Running GPU stress tests..."
221 |     
222 |     # Run multiple iterations of GPU detection
223 |     for i in {1..5}; do
224 |         log_info "Stress test iteration $i/5..."
225 |         if ./target/release/gpukill --list > /dev/null 2>&1; then
226 |             log_success "Iteration $i passed"
227 |         else
228 |             log_error "Iteration $i failed"
229 |             return 1
230 |         fi
231 |         sleep 1
232 |     done
233 |     
234 |     log_success "All stress test iterations passed"
235 | }
236 | 
237 | # Function to generate test report
238 | generate_test_report() {
239 |     log_info "Generating test report..."
240 |     
241 |     local gpu_type="$1"
242 |     local report_file="/tmp/gpu-kill-test-report-$(date +%Y%m%d-%H%M%S).txt"
243 |     
244 |     {
245 |         echo "=== GPU Kill Test Report ==="
246 |         echo "Date: $(date)"
247 |         echo "GPU Type: $gpu_type"
248 |         echo "Hostname: $(hostname)"
249 |         echo "OS: $(uname -a)"
250 |         echo ""
251 |         echo "=== GPU Detection Results ==="
252 |         cat /tmp/gpu-list.txt 2>/dev/null || echo "No GPU list available"
253 |         echo ""
254 |         echo "=== GPU Information (JSON) ==="
255 |         cat /tmp/gpu-info.json 2>/dev/null || echo "No GPU info available"
256 |         echo ""
257 |         echo "=== System Information ==="
258 |         # CPU info (cross-platform)
259 |         if command -v lscpu &> /dev/null; then
260 |             echo "CPU: $(lscpu | grep "Model name" | cut -d: -f2 | xargs || echo "Unknown")"
261 |         elif command -v sysctl &> /dev/null; then
262 |             echo "CPU: $(sysctl -n machdep.cpu.brand_string 2>/dev/null || echo "Unknown")"
263 |         else
264 |             echo "CPU: Unknown"
265 |         fi
266 |         
267 |         # Memory info (cross-platform)
268 |         if command -v free &> /dev/null; then
269 |             echo "Memory: $(free -h | grep "Mem:" | awk '{print $2}' || echo "Unknown")"
270 |         elif command -v vm_stat &> /dev/null; then
271 |             echo "Memory: $(system_profiler SPHardwareDataType | grep "Memory:" | awk '{print $2, $3}' || echo "Unknown")"
272 |         else
273 |             echo "Memory: Unknown"
274 |         fi
275 |         
276 |         # GPU drivers (cross-platform)
277 |         echo "GPU Drivers:"
278 |         if command -v lsmod &> /dev/null; then
279 |             lsmod | grep -E "(nvidia|amdgpu|i915)" || echo "No GPU drivers found"
280 |         elif command -v kextstat &> /dev/null; then
281 |             kextstat | grep -E "(nvidia|amd|intel)" || echo "No GPU drivers found"
282 |         else
283 |             echo "No GPU drivers found"
284 |         fi
285 |     } > "$report_file"
286 |     
287 |     log_success "Test report generated: $report_file"
288 |     cat "$report_file"
289 | }
290 | 
291 | # Main function
292 | main() {
293 |     local gpu_type="${1:-unknown}"
294 |     
295 |     log_info "Starting GPU Kill tests on Hot Aisle instance"
296 |     log_info "GPU Type: $gpu_type"
297 |     log_info "Project Root: $PROJECT_ROOT"
298 |     
299 |     # Initialize log file
300 |     echo "=== GPU Kill Test Log - $(date) ===" > "$LOG_FILE"
301 |     
302 |     # Run test suite
303 |     check_prerequisites
304 |     build_gpukill
305 |     run_gpu_detection_tests "$gpu_type"
306 |     run_performance_tests "$gpu_type"
307 |     run_stress_tests
308 |     generate_test_report "$gpu_type"
309 |     
310 |     log_success "All GPU Kill tests completed successfully!"
311 | }
312 | 
313 | # Run main function with all arguments
314 | main "$@"
315 | 


--------------------------------------------------------------------------------
/src/render.rs:
--------------------------------------------------------------------------------
  1 | use crate::args::OutputFormat;
  2 | use crate::nvml_api::Snapshot;
  3 | use crate::util::{format_memory_mb_to_gib, truncate_string};
  4 | // serde_json is used via serde_json::to_string_pretty
  5 | use std::io::{self, Write};
  6 | use tabled::{
  7 |     settings::{object::Rows, style::Style, Alignment, Modify, Padding, Width},
  8 |     Table, Tabled,
  9 | };
 10 | 
 11 | /// Render GPU information to various output formats
 12 | #[derive(Clone)]
 13 | pub struct Renderer {
 14 |     output_format: OutputFormat,
 15 | }
 16 | 
 17 | #[allow(dead_code)]
 18 | impl Renderer {
 19 |     /// Create a new renderer
 20 |     pub fn new(output_format: OutputFormat) -> Self {
 21 |         Self { output_format }
 22 |     }
 23 | 
 24 |     /// Render a complete snapshot
 25 |     pub fn render_snapshot(
 26 |         &self,
 27 |         snapshot: &Snapshot,
 28 |         details: bool,
 29 |     ) -> Result<(), Box<dyn std::error::Error>> {
 30 |         match self.output_format {
 31 |             OutputFormat::Table => self.render_table(snapshot, details),
 32 |             OutputFormat::Json => self.render_json(snapshot),
 33 |         }
 34 |     }
 35 | 
 36 |     /// Render as a table
 37 |     fn render_table(
 38 |         &self,
 39 |         snapshot: &Snapshot,
 40 |         details: bool,
 41 |     ) -> Result<(), Box<dyn std::error::Error>> {
 42 |         if details {
 43 |             self.render_detailed_table(snapshot)
 44 |         } else {
 45 |             self.render_summary_table(snapshot)
 46 |         }
 47 |     }
 48 | 
 49 |     /// Render summary table (one row per GPU)
 50 |     fn render_summary_table(&self, snapshot: &Snapshot) -> Result<(), Box<dyn std::error::Error>> {
 51 |         let mut table_data = Vec::new();
 52 | 
 53 |         for gpu in &snapshot.gpus {
 54 |             let mem_used_gib = format_memory_mb_to_gib(gpu.mem_used_mb);
 55 |             let mem_total_gib = format_memory_mb_to_gib(gpu.mem_total_mb);
 56 |             let mem_usage = format!("{}/{} GiB", mem_used_gib, mem_total_gib);
 57 | 
 58 |             let top_proc_info = if let Some(ref top_proc) = gpu.top_proc {
 59 |                 format!(
 60 |                     "{}:{}:{}MB",
 61 |                     truncate_string(&top_proc.proc_name, 15),
 62 |                     top_proc.pid,
 63 |                     top_proc.used_mem_mb
 64 |                 )
 65 |             } else {
 66 |                 "-".to_string()
 67 |             };
 68 | 
 69 |             let ecc_info = gpu
 70 |                 .ecc_volatile
 71 |                 .map(|e| e.to_string())
 72 |                 .unwrap_or_else(|| "-".to_string());
 73 | 
 74 |             table_data.push(SummaryRow {
 75 |                 gpu: gpu.gpu_index.to_string(),
 76 |                 name: truncate_string(&gpu.name, 20),
 77 |                 memory: mem_usage,
 78 |                 utilization: format!("{:.1}%", gpu.util_pct),
 79 |                 temperature: format!("{}°C", gpu.temp_c),
 80 |                 power: format!("{:.1}W", gpu.power_w),
 81 |                 ecc_volatile: ecc_info,
 82 |                 pids: gpu.pids.to_string(),
 83 |                 top_process: top_proc_info,
 84 |             });
 85 |         }
 86 | 
 87 |         let table = Table::new(&table_data)
 88 |             .with(Style::modern())
 89 |             .with(Modify::new(Rows::new(1..)).with(Alignment::left()))
 90 |             .with(Modify::new(Rows::new(1..)).with(Padding::new(1, 1, 0, 0)))
 91 |             .with(Width::wrap(120))
 92 |             .to_string();
 93 | 
 94 |         println!("{}", table);
 95 |         Ok(())
 96 |     }
 97 | 
 98 |     /// Render detailed table (one row per process)
 99 |     fn render_detailed_table(&self, snapshot: &Snapshot) -> Result<(), Box<dyn std::error::Error>> {
100 |         // First render summary
101 |         self.render_summary_table(snapshot)?;
102 |         println!();
103 | 
104 |         // Then render process details
105 |         if !snapshot.procs.is_empty() {
106 |             let mut table_data = Vec::new();
107 | 
108 |             for proc in &snapshot.procs {
109 |                 let container_info = proc
110 |                     .container
111 |                     .as_ref()
112 |                     .map(|c| truncate_string(c, 15))
113 |                     .unwrap_or_else(|| "-".to_string());
114 | 
115 |                 table_data.push(ProcessRow {
116 |                     gpu: proc.gpu_index.to_string(),
117 |                     pid: proc.pid.to_string(),
118 |                     user: truncate_string(&proc.user, 12),
119 |                     process: truncate_string(&proc.proc_name, 20),
120 |                     vram_mb: format!("{}MB", proc.used_mem_mb),
121 |                     start_time: truncate_string(&proc.start_time, 10),
122 |                     container: container_info,
123 |                 });
124 |             }
125 | 
126 |             let table = Table::new(&table_data)
127 |                 .with(Style::modern())
128 |                 .with(Modify::new(Rows::new(1..)).with(Alignment::left()))
129 |                 .with(Modify::new(Rows::new(1..)).with(Padding::new(1, 1, 0, 0)))
130 |                 .with(Width::wrap(120))
131 |                 .to_string();
132 | 
133 |             println!("Process Details:");
134 |             println!("{}", table);
135 |         }
136 | 
137 |         Ok(())
138 |     }
139 | 
140 |     /// Render as JSON
141 |     fn render_json(&self, snapshot: &Snapshot) -> Result<(), Box<dyn std::error::Error>> {
142 |         let json = serde_json::to_string_pretty(snapshot)?;
143 |         println!("{}", json);
144 |         Ok(())
145 |     }
146 | 
147 |     /// Render JSON snapshot for watch mode (newline-delimited)
148 |     pub fn render_json_snapshot(
149 |         &self,
150 |         snapshot: &Snapshot,
151 |     ) -> Result<(), Box<dyn std::error::Error>> {
152 |         let json = serde_json::to_string(snapshot)?;
153 |         println!("{}", json);
154 |         io::stdout().flush()?;
155 |         Ok(())
156 |     }
157 | 
158 |     /// Clear screen for watch mode
159 |     pub fn clear_screen(&self) {
160 |         print!("\x1B[2J\x1B[1;1H");
161 |         io::stdout().flush().unwrap_or_default();
162 |     }
163 | 
164 |     /// Get output format
165 |     pub fn get_output_format(&self) -> OutputFormat {
166 |         self.output_format.clone()
167 |     }
168 | }
169 | 
170 | /// Summary table row structure
171 | #[derive(Tabled)]
172 | struct SummaryRow {
173 |     #[tabled(rename = "GPU")]
174 |     gpu: String,
175 |     #[tabled(rename = "NAME")]
176 |     name: String,
177 |     #[tabled(rename = "MEM_USED/TOTAL")]
178 |     memory: String,
179 |     #[tabled(rename = "UTIL(%)")]
180 |     utilization: String,
181 |     #[tabled(rename = "TEMP(°C)")]
182 |     temperature: String,
183 |     #[tabled(rename = "POWER(W)")]
184 |     power: String,
185 |     #[tabled(rename = "ECC(volatile)")]
186 |     ecc_volatile: String,
187 |     #[tabled(rename = "PIDS")]
188 |     pids: String,
189 |     #[tabled(rename = "TOP_PROC")]
190 |     top_process: String,
191 | }
192 | 
193 | /// Process table row structure
194 | #[derive(Tabled)]
195 | struct ProcessRow {
196 |     #[tabled(rename = "GPU")]
197 |     gpu: String,
198 |     #[tabled(rename = "PID")]
199 |     pid: String,
200 |     #[tabled(rename = "USER")]
201 |     user: String,
202 |     #[tabled(rename = "PROC")]
203 |     process: String,
204 |     #[tabled(rename = "VRAM_MB")]
205 |     vram_mb: String,
206 |     #[tabled(rename = "START_TIME")]
207 |     start_time: String,
208 |     #[tabled(rename = "CONTAINER?")]
209 |     container: String,
210 | }
211 | 
212 | /// Render error messages
213 | pub fn render_error(message: &str) {
214 |     eprintln!("Error: {}", message);
215 | }
216 | 
217 | /// Render warning messages
218 | pub fn render_warning(message: &str) {
219 |     eprintln!("Warning: {}", message);
220 | }
221 | 
222 | /// Render info messages
223 | pub fn render_info(message: &str) {
224 |     println!("Info: {}", message);
225 | }
226 | 
227 | /// Render success messages
228 | pub fn render_success(message: &str) {
229 |     println!("Success: {}", message);
230 | }
231 | 
232 | #[cfg(test)]
233 | mod tests {
234 |     use super::*;
235 |     use crate::nvml_api::{GpuProc, GpuSnapshot, Snapshot};
236 | 
237 |     fn create_test_snapshot() -> Snapshot {
238 |         Snapshot {
239 |             host: "test-host".to_string(),
240 |             ts: "2024-01-01T00:00:00Z".to_string(),
241 |             gpus: vec![GpuSnapshot {
242 |                 gpu_index: 0,
243 |                 name: "Test GPU".to_string(),
244 |                 vendor: crate::vendor::GpuVendor::Unknown,
245 |                 mem_used_mb: 2048,
246 |                 mem_total_mb: 8192,
247 |                 util_pct: 50.0,
248 |                 temp_c: 75,
249 |                 power_w: 150.0,
250 |                 ecc_volatile: Some(0),
251 |                 pids: 2,
252 |                 top_proc: Some(GpuProc {
253 |                     gpu_index: 0,
254 |                     pid: 12345,
255 |                     user: "testuser".to_string(),
256 |                     proc_name: "test_process".to_string(),
257 |                     used_mem_mb: 1024,
258 |                     start_time: "1h 30m".to_string(),
259 |                     container: None,
260 |                 }),
261 |             }],
262 |             procs: vec![GpuProc {
263 |                 gpu_index: 0,
264 |                 pid: 12345,
265 |                 user: "testuser".to_string(),
266 |                 proc_name: "test_process".to_string(),
267 |                 used_mem_mb: 1024,
268 |                 start_time: "1h 30m".to_string(),
269 |                 container: None,
270 |             }],
271 |         }
272 |     }
273 | 
274 |     #[test]
275 |     fn test_renderer_creation() {
276 |         let renderer = Renderer::new(OutputFormat::Table);
277 |         assert!(matches!(renderer.output_format, OutputFormat::Table));
278 |     }
279 | 
280 |     #[test]
281 |     fn test_json_rendering() {
282 |         let renderer = Renderer::new(OutputFormat::Json);
283 |         let snapshot = create_test_snapshot();
284 | 
285 |         // This should not panic
286 |         let result = renderer.render_json(&snapshot);
287 |         assert!(result.is_ok());
288 |     }
289 | 
290 |     #[test]
291 |     fn test_table_rendering() {
292 |         let renderer = Renderer::new(OutputFormat::Table);
293 |         let snapshot = create_test_snapshot();
294 | 
295 |         // This should not panic
296 |         let result = renderer.render_table(&snapshot, false);
297 |         assert!(result.is_ok());
298 |     }
299 | 
300 |     #[test]
301 |     fn test_detailed_table_rendering() {
302 |         let renderer = Renderer::new(OutputFormat::Table);
303 |         let snapshot = create_test_snapshot();
304 | 
305 |         // This should not panic
306 |         let result = renderer.render_table(&snapshot, true);
307 |         assert!(result.is_ok());
308 |     }
309 | }
310 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # GPU Kill
  2 | 
  3 | A CLI tool for managing GPUs across NVIDIA, AMD, Intel, and Apple Silicon systems. Monitor, control, and secure your GPU infrastructure with ease.
  4 | 
  5 | ## Community & Support
  6 | 
  7 | Join our Discord community for discussions, support, and updates:
  8 | 
  9 | [![Discord](https://img.shields.io/badge/Discord-Join%20our%20community-7289DA?style=for-the-badge&logo=discord&logoColor=white)](https://discord.gg/KqdBcqRk5E)
 10 | 
 11 | 
 12 | ## Features
 13 | 
 14 | - **Monitor GPUs**: Real-time usage, memory, temperature, and processes
 15 | - **Kill Processes**: Gracefully terminate stuck GPU processes
 16 | - **Security**: Detect crypto miners and suspicious activity
 17 | - **Guard Mode**: Policy enforcement to prevent resource abuse
 18 | - **Remote**: Manage GPUs across multiple servers
 19 | - **Multi-Vendor**: Works with NVIDIA, AMD, Intel, and Apple Silicon
 20 | - **AI Integration**: MCP server for AI assistant integration
 21 | 
 22 | ## Requirements
 23 | 
 24 | ### Build Performance
 25 | 
 26 | **For faster development builds:**
 27 | ```bash
 28 | # Fast release build (recommended for development)
 29 | cargo build --profile release-fast
 30 | 
 31 | # Standard release build (optimized for production)
 32 | cargo build --release
 33 | 
 34 | # Maximum optimization (slowest, best performance)
 35 | cargo build --profile release-max
 36 | ```
 37 | 
 38 | **Build times on typical hardware:**
 39 | - Debug build: ~3 seconds
 40 | - Release-fast: ~28 seconds  
 41 | - Release: ~28 seconds (improved from 76 seconds)
 42 | - Release-max: ~60+ seconds (maximum optimization)
 43 | 
 44 | ### System Dependencies
 45 | 
 46 | **Linux (Ubuntu/Debian):**
 47 | ```bash
 48 | sudo apt install build-essential libssl-dev pkg-config
 49 | ```
 50 | 
 51 | **Linux (Fedora/RHEL/CentOS):**
 52 | ```bash
 53 | sudo dnf install gcc gcc-c++ pkg-config openssl-devel
 54 | # or for older systems:
 55 | # sudo yum install gcc gcc-c++ pkg-config openssl-devel
 56 | ```
 57 | 
 58 | **macOS:**
 59 | ```bash
 60 | # Install Xcode command line tools
 61 | xcode-select --install
 62 | # OpenSSL is included with macOS
 63 | ```
 64 | 
 65 | **Windows:**
 66 | - Install Visual Studio Build Tools
 67 | - OpenSSL is handled automatically by vcpkg
 68 | 
 69 | ### GPU Drivers
 70 | 
 71 | - **NVIDIA**: NVIDIA drivers installed
 72 | - **AMD**: ROCm drivers installed  
 73 | - **Intel**: intel-gpu-tools package installed
 74 | - **Apple Silicon**: macOS with Apple Silicon (M1/M2/M3/M4)
 75 | 
 76 | ### Build Requirements
 77 | 
 78 | - **OS**: Linux, macOS, or Windows
 79 | - **Rust**: 1.70+ (for building from source)
 80 | 
 81 | ## Quick Start
 82 | 
 83 | ### Install & Run
 84 | ```bash
 85 | # Build from source (first build may take 2-3 minutes)
 86 | git clone https://github.com/treadiehq/gpu-kill.git
 87 | cd gpu-kill
 88 | cargo build --release
 89 | 
 90 | # Or install via Cargo
 91 | cargo install gpukill
 92 | 
 93 | # Or one-liner installers (recommended)
 94 | # macOS/Linux
 95 | curl -fsSL https://raw.githubusercontent.com/treadiehq/gpu-kill/refs/heads/main/scripts/install.sh | sh
 96 | # Windows (PowerShell)
 97 | irm https://raw.githubusercontent.com/treadiehq/gpu-kill/refs/heads/main/scripts/install.ps1 | iex
 98 | 
 99 | # List your GPUs
100 | gpukill --list
101 | 
102 | # Watch GPU usage in real-time
103 | gpukill --list --watch
104 | ```
105 | 
106 | ### Dead-simple cheatsheet
107 | ```bash
108 | # Live watch (alias)
109 | gpukill watch            # = gpukill --list --watch
110 | 
111 | # Kill job by PID (positional alias)
112 | gpukill 12345            # = gpukill --kill --pid 12345
113 | 
114 | # Free a specific GPU index (kill all jobs on GPU 0)
115 | gpukill --kill --gpu 0   # add --batch to actually kill; preview without it
116 | 
117 | # Force reset a GPU (shorthand)
118 | gpukill --reset 0        # = gpukill --reset --gpu 0
119 | 
120 | # Safe mode: dry-run first (no changes)
121 | gpukill 12345 --safe     # alias: --dry-run
122 | ```
123 | 
124 | ## Dashboard (Local Development)
125 | 
126 | The GPU Kill dashboard provides a modern web interface for GPU cluster monitoring. The dashboard is included in the repository for local development but is **not required** for core GPU Kill functionality.
127 | 
128 | ![GPU Kill Dashboard](dashboard/public/screenshot.png)
129 | 
130 | ### Quick Start
131 | 
132 | ```bash
133 | # 1. Start the backend API server
134 | gpukill --server --server-port 8080
135 | 
136 | # 2. In a new terminal, start the dashboard UI
137 | cd dashboard
138 | npm install  # First time only
139 | npm run dev
140 | 
141 | # 3. Access the dashboard
142 | open http://localhost:3000
143 | ```
144 | 
145 | **Requirements:**
146 | - Node.js 18+ and npm
147 | - GPU Kill backend server running (provides the API)
148 | 
149 | **Note**: You need both the backend server (port 8080) and frontend UI (port 3000) running for the dashboard to work.
150 | 
151 | ### Dashboard Features
152 | 
153 | - **Real-time monitoring** of all GPUs across your cluster
154 | - **Security detection** with threat analysis and risk scoring
155 | - **Policy management** for resource control and enforcement
156 | - **Cluster overview** with Magic Moment contention insights
157 | - **Interactive controls** for process management and GPU operations
158 | 
159 | ### Production Deployment
160 | 
161 | For production GPU monitoring solutions, check the [Kill Suite](https://treadie.com) website.
162 | 
163 | ## MCP Server
164 | 
165 | GPU Kill includes a MCP server that enables AI assistants to interact with GPU management functionality:
166 | 
167 | - **Resources**: Read GPU status, processes, audit data, policies, and security scans
168 | - **Tools**: Kill processes, reset GPUs, scan for threats, create policies
169 | 
170 | ```bash
171 | # Start the MCP server
172 | cargo run --release -p gpukill-mcp
173 | 
174 | # Server runs on http://localhost:3001/mcp
175 | ```
176 | 
177 | ## Usage
178 | 
179 | Ask your AI to use the tools.
180 | 
181 | ```text
182 | What GPUs do I have and what's their current usage?
183 | ```
184 | 
185 | ```text
186 | Kill the Python process that's stuck on GPU 0
187 | ```
188 | 
189 | ```text
190 | Kill all training processes that are using too much GPU memory
191 | ```
192 | 
193 | ```text
194 | Show me GPU usage and kill any stuck processes
195 | ```
196 | 
197 | ```text
198 | Scan for crypto miners and suspicious activity
199 | ```
200 | 
201 | ```text
202 | Create a policy to limit user memory usage to 8GB
203 | ```
204 | 
205 | ```text
206 | Reset GPU 1 because it's not responding
207 | ```
208 | 
209 | ```text
210 | What processes are currently using my GPUs?
211 | ```
212 | 
213 | See [mcp/README.md](mcp/README.md) for detailed MCP server documentation.
214 | 
215 | 
216 | ## Security & Policies
217 | 
218 | ### Detect Threats
219 | ```bash
220 | # Scan for crypto miners and suspicious activity
221 | gpukill --audit --rogue
222 | 
223 | # Configure detection rules
224 | gpukill --audit --rogue-config
225 | ```
226 | 
227 | ### Policy Enforcement
228 | ```bash
229 | # Enable Guard Mode
230 | gpukill --guard --guard-enable
231 | 
232 | # Test policies safely
233 | gpukill --guard --guard-test-policies
234 | ```
235 | 
236 | *For detailed security and policy documentation, see [DETAILED.md](DETAILED.md).*
237 | 
238 | ## Remote Management
239 | 
240 | Manage GPUs across multiple servers via SSH:
241 | 
242 | ```bash
243 | # List GPUs on remote server
244 | gpukill --remote staging-server --list
245 | 
246 | # Kill process on remote server
247 | gpukill --remote prod-gpu-01 --kill --pid 1234
248 | 
249 | # Reset GPU on remote server
250 | gpukill --remote gpu-cluster --reset --gpu 0
251 | ```
252 | 
253 | ## Troubleshooting
254 | 
255 | ### Build Issues
256 | 
257 | **OpenSSL not found:**
258 | ```bash
259 | # Ubuntu/Debian
260 | sudo apt install build-essential libssl-dev pkg-config
261 | 
262 | # Fedora/RHEL/CentOS
263 | sudo dnf install gcc gcc-c++ pkg-config openssl-devel
264 | ```
265 | 
266 | **Other common build issues:**
267 | - Ensure you have the latest Rust toolchain: `rustup update`
268 | - Clean and rebuild: `cargo clean && cargo build --release`
269 | - Check system dependencies are installed (see Requirements section)
270 | 
271 | ## Need Help?
272 | 
273 | ```bash
274 | gpukill --help                    # Show all options
275 | gpukill --version                 # Show version
276 | ```
277 | 
278 | ## CI/CD and Testing
279 | 
280 | GPU Kill uses a CI/CD pipeline with **automatic GPU testing**:
281 | 
282 | - **✅ Conditional GPU testing** - Runs automatically when GPU hardware is available
283 | - **✅ Multi-vendor GPU testing** on real hardware (NVIDIA, AMD, Intel, Apple Silicon)
284 | - **✅ Hot Aisle integration** - Optional on-demand GPU instance provisioning for comprehensive testing
285 | - **✅ Cross-platform compatibility** testing
286 | - **✅ Performance benchmarking** and profiling
287 | - **✅ Security auditing** and compliance checks
288 | - **✅ Stress testing** for reliability validation
289 | 
290 | ### How GPU Testing Works
291 | 
292 | - **On GitHub hosted runners**: GPU tests skip gracefully (no GPU hardware)
293 | - **On self-hosted runners**: GPU tests run automatically when GPU hardware is detected
294 | - **On cloud instances**: GPU tests run automatically when GPU hardware is available
295 | - **On developer machines**: GPU tests run automatically when GPU hardware is detected
296 | - **Via Hot Aisle**: On-demand GPU instance provisioning for comprehensive testing
297 | 
298 | ### Quick Setup
299 | 
300 | **Option 1: Test Locally (Already Working)**
301 | ```bash
302 | cargo test --test gpu_hardware_tests  # Runs on your GPU hardware
303 | ```
304 | 
305 | **Option 2: Set Up Cloud GPU (5 minutes)**
306 | ```bash
307 | # On any cloud GPU instance:
308 | curl -sSL https://raw.githubusercontent.com/treadiehq/gpu-kill/main/scripts/setup-gpu-runner.sh | bash
309 | ```
310 | 
311 | **Option 3: Self-Hosted Runner**
312 | See **[CI_CD.md](CI_CD.md)** for detailed information about our testing infrastructure and how to set up self-hosted runners with GPU hardware.
313 | 
314 | **Option 4: Hot Aisle Integration (Optional)**
315 | ```bash
316 | # Build with Hot Aisle feature
317 | cargo build --release --features hotaisle
318 | 
319 | # Integration tests run automatically (no API key required)
320 | # For actual GPU testing:
321 | # 1. Set up HOTAISLE_API_KEY in GitHub Secrets
322 | # 2. Manually trigger "Hot Aisle GPU Testing" workflow
323 | # 3. Tests run on real GPU hardware with automatic cleanup
324 | ```
325 | 
326 | **Option 5: Cloud GPU Setup**
327 | See **[docs/CLOUD_GPU_SETUP.md](docs/CLOUD_GPU_SETUP.md)** for AWS, GCP, and Azure GPU instance setup.
328 | 
329 | ## Documentation
330 | 
331 | - **[DETAILED.md](DETAILED.md)** - Complete documentation, API reference, and advanced features
332 | - **[CI_CD.md](CI_CD.md)** - CI/CD pipeline and testing infrastructure
333 | - **[docs/HOTAISLE_INTEGRATION.md](docs/HOTAISLE_INTEGRATION.md)** - Hot Aisle integration guide
334 | - **[docs/CLOUD_GPU_SETUP.md](docs/CLOUD_GPU_SETUP.md)** - Cloud GPU setup guide (AWS, GCP, Azure)
335 | 
336 | ## License
337 | 
338 | This project is licensed under the FSL-1.1-MIT License. See the LICENSE file for details.


--------------------------------------------------------------------------------
/deny.toml:
--------------------------------------------------------------------------------
  1 | # This template contains all of the possible sections and their default values
  2 | 
  3 | # Note that all fields that take a lint level have these possible values:
  4 | # * deny - An error will be produced and the check will fail
  5 | # * warn - A warning will be produced, but the check will not fail
  6 | # * allow - No warning or error will be produced, though in some cases a note
  7 | # will be
  8 | 
  9 | # The values provided in this template are the default values that will be used
 10 | # when any section or field is not specified in your own configuration
 11 | 
 12 | # Root options
 13 | 
 14 | # The graph table configures how the dependency graph is constructed and thus
 15 | # which crates the checks are performed against
 16 | [graph]
 17 | # If 1 or more target triples (and optionally, target_features) are specified,
 18 | # only the specified targets will be checked when running `cargo deny check`.
 19 | # This means, if a particular package is only ever used as a target specific
 20 | # dependency, such as, for example, the `nix` crate only being used via the
 21 | # `target_family = "unix"` configuration, that only having windows targets in
 22 | # this list would mean the nix crate, as well as any of its exclusive
 23 | # dependencies not shared by any other crates, would be ignored, as the target
 24 | # list here is effectively saying which targets you are building for.
 25 | targets = [
 26 |     # The triple can be any string, but only the target triples built in to
 27 |     # rustc (as of 1.40) can be checked against actual config expressions
 28 |     #"x86_64-unknown-linux-musl",
 29 |     # You can also specify which target_features you promise are enabled for a
 30 |     # particular target. target_features are currently not validated against
 31 |     # the actual valid features supported by the target architecture.
 32 |     #{ triple = "wasm32-unknown-unknown", features = ["atomics"] },
 33 | ]
 34 | # When creating the dependency graph used as the source of truth when checks are
 35 | # executed, this field can be used to prune crates from the graph, removing them
 36 | # from the view of cargo-deny. This is an extremely heavy hammer, as if a crate
 37 | # is pruned from the graph, all of its dependencies will also be pruned unless
 38 | # they are connected to another crate in the graph that hasn't been pruned,
 39 | # so it should be used with care. The identifiers are [Package ID Specifications]
 40 | # (https://doc.rust-lang.org/cargo/reference/pkgid-spec.html)
 41 | #exclude = []
 42 | # If true, metadata will be collected with `--all-features`. Note that this can't
 43 | # be toggled off if true, if you want to conditionally enable `--all-features` it
 44 | # is recommended to pass `--all-features` on the cmd line instead
 45 | all-features = false
 46 | # If true, metadata will be collected with `--no-default-features`. The same
 47 | # caveat with `all-features` applies
 48 | no-default-features = false
 49 | # If set, these feature will be enabled when collecting metadata. If `--features`
 50 | # is specified on the cmd line they will take precedence over this option.
 51 | #features = []
 52 | 
 53 | # The output table provides options for how/if diagnostics are outputted
 54 | [output]
 55 | # When outputting inclusion graphs in diagnostics that include features, this
 56 | # option can be used to specify the depth at which feature edges will be added.
 57 | # This option is included since the graphs can be quite large and the addition
 58 | # of features from the crate(s) to all of the graph roots can be far too verbose.
 59 | # This option can be overridden via `--feature-depth` on the cmd line
 60 | feature-depth = 1
 61 | 
 62 | # This section is considered when running `cargo deny check advisories`
 63 | # More documentation for the advisories section can be found here:
 64 | # https://embarkstudios.github.io/cargo-deny/checks/advisories/cfg.html
 65 | [advisories]
 66 | # The path where the advisory databases are cloned/fetched into
 67 | #db-path = "$CARGO_HOME/advisory-dbs"
 68 | # The url(s) of the advisory databases to use
 69 | #db-urls = ["https://github.com/rustsec/advisory-db"]
 70 | # A list of advisory IDs to ignore. Note that ignored advisories will still
 71 | # output a note when they are encountered.
 72 | ignore = [
 73 |     # Allow unmaintained crates as warnings (not errors)
 74 |     { id = "RUSTSEC-2020-0168", reason = "mach crate is unmaintained but still functional" },
 75 |     { id = "RUSTSEC-2024-0370", reason = "proc-macro-error is unmaintained but still functional" },
 76 | ]
 77 | # If this is true, then cargo deny will use the git executable to fetch advisory database.
 78 | # If this is false, then it uses a built-in git library.
 79 | # Setting this to true can be helpful if you have special authentication requirements that cargo-deny does not support.
 80 | # See Git Authentication for more information about setting up git authentication.
 81 | #git-fetch-with-cli = true
 82 | 
 83 | # This section is considered when running `cargo deny check licenses`
 84 | # More documentation for the licenses section can be found here:
 85 | # https://embarkstudios.github.io/cargo-deny/checks/licenses/cfg.html
 86 | [licenses]
 87 | # List of explicitly allowed licenses
 88 | # See https://spdx.org/licenses/ for list of possible licenses
 89 | # [possible values: any SPDX 3.11 short identifier (+ optional exception)].
 90 | allow = [
 91 |     "MIT",
 92 |     "Apache-2.0",
 93 |     "Apache-2.0 WITH LLVM-exception",
 94 |     "BSD-2-Clause",
 95 |     "BSD-3-Clause",
 96 |     "ISC",
 97 |     "Unlicense",
 98 |     "0BSD",
 99 |     "Zlib",
100 |     "CC0-1.0",
101 |     "MPL-2.0",
102 |     "LGPL-2.1",
103 |     "LGPL-3.0",
104 |     "GPL-2.0",
105 |     "GPL-3.0",
106 |     "FSL-1.1-MIT",
107 |     "Unicode-3.0",
108 | ]
109 | # The confidence threshold for detecting a license from license text.
110 | # The higher the value, the more closely the license text must be to the
111 | # canonical license text of a valid SPDX license file.
112 | # [possible values: any between 0.0 and 1.0].
113 | confidence-threshold = 0.8
114 | # Allow 1 or more licenses on a per-crate basis, so that particular licenses
115 | # aren't accepted for every possible crate as with the normal allow list
116 | exceptions = [
117 |     # Each entry is the crate and version constraint, and its specific allow
118 |     # list
119 |     #{ allow = ["Zlib"], crate = "adler32" },
120 | ]
121 | 
122 | # Some crates don't have (easily) machine readable licensing information,
123 | # adding a clarification entry for it allows you to manually specify the
124 | # licensing information
125 | #[[licenses.clarify]]
126 | # The package spec the clarification applies to
127 | #crate = "ring"
128 | # The SPDX expression for the license requirements of the crate
129 | #expression = "MIT AND ISC AND OpenSSL"
130 | # One or more files in the crate's source used as the "source of truth" for
131 | # the license expression. If the contents match, the clarification will be used
132 | # when running the license check, otherwise the clarification will be ignored
133 | # and the crate will be checked normally, which may produce warnings or errors
134 | # depending on the rest of your configuration
135 | #license-files = [
136 | # Each entry is a crate relative path, and the (opaque) hash of its contents
137 | #{ path = "LICENSE", hash = 0xbd0eed23 }
138 | #]
139 | 
140 | [licenses.private]
141 | # If true, ignores workspace crates that aren't published, or are only
142 | # published to private registries.
143 | # To see how to mark a crate as unpublished (to the official registry),
144 | # visit https://doc.rust-lang.org/cargo/reference/manifest.html#the-publish-field.
145 | ignore = false
146 | # One or more private registries that you might publish crates to, if a crate
147 | # is only published to private registries, and ignore is true, the crate will
148 | # not have its license(s) checked
149 | registries = [
150 |     #"https://sekretz.com/registry
151 | ]
152 | 
153 | # This section is considered when running `cargo deny check bans`.
154 | # More documentation about the 'bans' section can be found here:
155 | # https://embarkstudios.github.io/cargo-deny/checks/bans/cfg.html
156 | [bans]
157 | # Lint level for when multiple versions of the same crate are detected
158 | multiple-versions = "warn"
159 | # Lint level for when a crate version requirement is `*`
160 | wildcards = "allow"
161 | # The graph highlighting used when creating dotgraphs for crates
162 | # with multiple versions
163 | # * lowest-version - The path to the lowest versioned duplicate is highlighted
164 | # * simplest-path - The path to the version with the fewest edges is highlighted
165 | # * all - Both lowest-version and simplest-path are used
166 | highlight = "all"
167 | # The default lint level for `default` features for crates that are members of
168 | # the workspace that is being checked. This can be overridden by allowing/denying
169 | # `default` on a crate-by-crate basis if desired.
170 | workspace-default-features = "allow"
171 | # The default lint level for `default` features for external crates that are not
172 | # members of the workspace. This can be overridden by allowing/denying `default`
173 | # on a crate-by-crate basis if desired.
174 | external-default-features = "allow"
175 | # List of crates that are allowed. Use with care!
176 | allow = [
177 |     #"ansi_term@0.11.0",
178 |     #{ crate = "ansi_term@0.11.0", reason = "you can specify a reason it is allowed" },
179 | ]
180 | # List of crates to deny
181 | deny = [
182 |     #"ansi_term@0.11.0",
183 |     #{ crate = "ansi_term@0.11.0", reason = "you can specify a reason it is banned" },
184 |     # Wrapper crates can optionally be specified to allow the crate when it
185 |     # is a direct dependency of the otherwise banned crate
186 |     #{ crate = "ansi_term@0.11.0", wrappers = ["this-crate-directly-depends-on-ansi_term"] },
187 | ]
188 | 
189 | # List of features to allow/deny
190 | # Each entry the name of a crate and a version range. If version is
191 | # not specified, all versions will be matched.
192 | #[[bans.features]]
193 | #crate = "reqwest"
194 | # Features to not allow
195 | #deny = ["json"]
196 | # Features to allow
197 | #allow = [
198 | #    "rustls",
199 | #    "__rustls",
200 | #    "__tls",
201 | #    "hyper-rustls",
202 | #    "rustls",
203 | #    "rustls-pemfile",
204 | #    "rustls-tls-webpki-roots",
205 | #    "tokio-rustls",
206 | #    "webpki-roots",
207 | #]
208 | # If true, the allowed features must exactly match the enabled feature set. If
209 | # this is set there is no point setting `deny`
210 | #exact = true
211 | 
212 | # Certain crates/versions that will be skipped when doing duplicate detection.
213 | skip = [
214 |     #"ansi_term@0.11.0",
215 |     #{ crate = "ansi_term@0.11.0", reason = "you can specify a reason why it can't be updated/removed" },
216 | ]
217 | # Similarly to `skip` allows you to skip certain crates during duplicate
218 | # detection. Unlike skip, it also includes the entire tree of transitive
219 | # dependencies starting at the specified crate, up to a certain depth, which is
220 | # by default infinite.
221 | skip-tree = [
222 |     #"ansi_term@0.11.0", # will be skipped along with _all_ of its direct and transitive dependencies
223 |     #{ crate = "ansi_term@0.11.0", depth = 20 },
224 | ]
225 | 
226 | # This section is considered when running `cargo deny check sources`.
227 | # More documentation about the 'sources' section can be found here:
228 | # https://embarkstudios.github.io/cargo-deny/checks/sources/cfg.html
229 | [sources]
230 | # Lint level for what to happen when a crate from a crate registry that is not
231 | # in the allow list is encountered
232 | unknown-registry = "warn"
233 | # Lint level for what to happen when a crate from a git repository that is not
234 | # in the allow list is encountered
235 | unknown-git = "warn"
236 | # List of URLs for allowed crate registries. Defaults to the crates.io index
237 | # if not specified. If it is specified but empty, no registries are allowed.
238 | allow-registry = ["https://github.com/rust-lang/crates.io-index"]
239 | # List of URLs for allowed Git repositories
240 | allow-git = []
241 | 
242 | [sources.allow-org]
243 | # github.com organizations to allow git sources for
244 | github = []
245 | # gitlab.com organizations to allow git sources for
246 | gitlab = []
247 | # bitbucket.org organizations to allow git sources for
248 | bitbucket = []
249 | 


--------------------------------------------------------------------------------
/src/proc.rs:
--------------------------------------------------------------------------------
  1 | use crate::nvml_api::NvmlApi;
  2 | use crate::util::parse_process_start_time;
  3 | use anyhow::{Context, Result};
  4 | #[cfg(unix)]
  5 | use nix::sys::signal::{kill, Signal};
  6 | #[cfg(unix)]
  7 | use nix::unistd::Pid;
  8 | // use std::process::Command; // Used conditionally below
  9 | use std::time::{Duration, SystemTime};
 10 | use sysinfo::{Pid as SysPid, System};
 11 | 
 12 | /// Process information for a running process
 13 | #[derive(Debug, Clone)]
 14 | pub struct ProcessInfo {
 15 |     #[allow(dead_code)]
 16 |     pub pid: u32,
 17 |     pub user: String,
 18 |     pub name: String,
 19 |     #[allow(dead_code)]
 20 |     pub start_time: SystemTime,
 21 |     #[allow(dead_code)]
 22 |     pub cmdline: String,
 23 | }
 24 | 
 25 | /// Process management utilities
 26 | pub struct ProcessManager {
 27 |     nvml_api: NvmlApi,
 28 |     system: System,
 29 | }
 30 | 
 31 | #[allow(dead_code)]
 32 | impl ProcessManager {
 33 |     /// Create a new process manager
 34 |     pub fn new(nvml_api: NvmlApi) -> Self {
 35 |         let mut system = System::new_all();
 36 |         system.refresh_all();
 37 | 
 38 |         Self { nvml_api, system }
 39 |     }
 40 | 
 41 |     /// Get process information by PID
 42 |     pub fn get_process_info(&mut self, pid: u32) -> Result<ProcessInfo> {
 43 |         self.system.refresh_processes();
 44 | 
 45 |         let sys_pid = SysPid::from_u32(pid);
 46 |         let process = self
 47 |             .system
 48 |             .process(sys_pid)
 49 |             .ok_or_else(|| anyhow::anyhow!("Process with PID {} not found", pid))?;
 50 | 
 51 |         let user = get_process_user(pid).unwrap_or_else(|_| "unknown".to_string());
 52 | 
 53 |         let start_time = process.start_time();
 54 |         let start_time_system = SystemTime::UNIX_EPOCH + Duration::from_secs(start_time);
 55 | 
 56 |         Ok(ProcessInfo {
 57 |             pid,
 58 |             user,
 59 |             name: process.name().to_string(),
 60 |             start_time: start_time_system,
 61 |             cmdline: process.cmd().join(" "),
 62 |         })
 63 |     }
 64 | 
 65 |     /// Check if a process is using any GPU
 66 |     pub fn is_process_using_gpu(&self, pid: u32) -> Result<bool> {
 67 |         self.nvml_api.is_process_using_gpu(pid)
 68 |     }
 69 | 
 70 |     /// Gracefully terminate a process with timeout and escalation
 71 |     #[cfg(unix)]
 72 |     pub fn graceful_kill(&self, pid: u32, timeout_secs: u16, force: bool) -> Result<()> {
 73 |         let pid = Pid::from_raw(pid as i32);
 74 | 
 75 |         // First, try SIGTERM
 76 |         tracing::info!("Sending SIGTERM to process {}", pid);
 77 |         kill(pid, Signal::SIGTERM).map_err(|e| anyhow::anyhow!("Failed to send SIGTERM: {}", e))?;
 78 | 
 79 |         // Wait for the process to terminate
 80 |         let timeout = Duration::from_secs(timeout_secs as u64);
 81 |         let start = SystemTime::now();
 82 | 
 83 |         while SystemTime::now().duration_since(start).unwrap_or_default() < timeout {
 84 |             // Check if process still exists
 85 |             if !self.is_process_running(pid.as_raw() as u32)? {
 86 |                 tracing::info!("Process {} terminated gracefully", pid);
 87 |                 return Ok(());
 88 |             }
 89 | 
 90 |             std::thread::sleep(Duration::from_millis(100));
 91 |         }
 92 | 
 93 |         // Process didn't terminate, escalate if force is enabled
 94 |         if force {
 95 |             tracing::warn!("Process {} did not terminate, escalating to SIGKILL", pid);
 96 |             kill(pid, Signal::SIGKILL)
 97 |                 .map_err(|e| anyhow::anyhow!("Failed to send SIGKILL: {}", e))?;
 98 | 
 99 |             // Wait a bit more for SIGKILL to take effect
100 |             std::thread::sleep(Duration::from_millis(500));
101 | 
102 |             if !self.is_process_running(pid.as_raw() as u32)? {
103 |                 tracing::info!("Process {} terminated with SIGKILL", pid);
104 |                 Ok(())
105 |             } else {
106 |                 Err(anyhow::anyhow!(
107 |                     "Process {} still running after SIGKILL",
108 |                     pid
109 |                 ))
110 |             }
111 |         } else {
112 |             Err(anyhow::anyhow!(
113 |                 "Process {} did not terminate within {} seconds. Use --force to escalate to SIGKILL",
114 |                 pid,
115 |                 timeout_secs
116 |             ))
117 |         }
118 |     }
119 | 
120 |     /// Gracefully terminate a process with timeout and escalation (Windows stub)
121 |     #[cfg(windows)]
122 |     pub fn graceful_kill(&self, _pid: u32, _timeout_secs: u16, _force: bool) -> Result<()> {
123 |         // On Windows, we can't use Unix signals, so we'll use a different approach
124 |         // For now, just return an error indicating this feature isn't available on Windows
125 |         Err(anyhow::anyhow!(
126 |             "Process termination not yet implemented for Windows"
127 |         ))
128 |     }
129 | 
130 |     /// Check if a process is still running
131 |     fn is_process_running(&self, pid: u32) -> Result<bool> {
132 |         let sys_pid = SysPid::from_u32(pid);
133 |         Ok(self.system.process(sys_pid).is_some())
134 |     }
135 | 
136 |     /// Enrich GPU processes with system information
137 |     pub fn enrich_gpu_processes(
138 |         &mut self,
139 |         mut processes: Vec<crate::nvml_api::GpuProc>,
140 |     ) -> Result<Vec<crate::nvml_api::GpuProc>> {
141 |         self.system.refresh_processes();
142 | 
143 |         for process in &mut processes {
144 |             if let Ok(process_info) = self.get_process_info(process.pid) {
145 |                 process.user = process_info.user;
146 |                 process.proc_name = process_info.name;
147 |                 process.start_time = parse_process_start_time(process_info.start_time);
148 |             }
149 |         }
150 | 
151 |         Ok(processes)
152 |     }
153 | 
154 |     /// Get all processes using GPUs with enriched information
155 |     pub fn get_enriched_gpu_processes(&mut self) -> Result<Vec<crate::nvml_api::GpuProc>> {
156 |         let processes = self.nvml_api.get_gpu_processes()?;
157 |         self.enrich_gpu_processes(processes)
158 |     }
159 | 
160 |     /// Validate that a process exists and optionally check GPU usage
161 |     pub fn validate_process(&self, pid: u32, check_gpu_usage: bool) -> Result<()> {
162 |         // Check if process exists
163 |         let sys_pid = SysPid::from_u32(pid);
164 |         if self.system.process(sys_pid).is_none() {
165 |             return Err(anyhow::anyhow!("Process with PID {} not found", pid));
166 |         }
167 | 
168 |         // Check GPU usage if requested
169 |         if check_gpu_usage {
170 |             let is_using_gpu = self.is_process_using_gpu(pid)?;
171 |             if !is_using_gpu {
172 |                 return Err(anyhow::anyhow!(
173 |                     "Process {} is not using any GPU. Use --force to kill anyway.",
174 |                     pid
175 |                 ));
176 |             }
177 |         }
178 | 
179 |         Ok(())
180 |     }
181 | 
182 |     /// Get device count
183 |     pub fn device_count(&self) -> Result<u32> {
184 |         self.nvml_api.device_count()
185 |     }
186 | 
187 |     /// Create snapshot
188 |     pub fn create_snapshot(&self) -> Result<crate::nvml_api::Snapshot> {
189 |         self.nvml_api.create_snapshot()
190 |     }
191 | 
192 |     /// Reset GPU
193 |     pub fn reset_gpu(&self, index: u32) -> Result<()> {
194 |         self.nvml_api.reset_gpu(index)
195 |     }
196 | }
197 | 
198 | /// Get the username for a process (cross-platform)
199 | fn get_process_user(pid: u32) -> Result<String> {
200 |     #[cfg(target_os = "linux")]
201 |     {
202 |         // On Linux, read from /proc/<pid>/status
203 |         let status_path = format!("/proc/{}/status", pid);
204 |         let status = std::fs::read_to_string(&status_path)
205 |             .with_context(|| format!("Failed to read process status from {}", status_path))?;
206 | 
207 |         for line in status.lines() {
208 |             if line.starts_with("Uid:") {
209 |                 let parts: Vec<&str> = line.split_whitespace().collect();
210 |                 if parts.len() >= 2 {
211 |                     let uid = parts[1]
212 |                         .parse::<u32>()
213 |                         .with_context(|| format!("Failed to parse UID: {}", parts[1]))?;
214 | 
215 |                     // Get username from UID
216 |                     return get_username_from_uid(uid);
217 |                 }
218 |             }
219 |         }
220 |     }
221 | 
222 |     #[cfg(target_os = "macos")]
223 |     {
224 |         use std::process::Command;
225 |         // On macOS, use ps command
226 |         let output = Command::new("ps")
227 |             .args(["-o", "user=", "-p", &pid.to_string()])
228 |             .output()
229 |             .context("Failed to execute ps command")?;
230 | 
231 |         if output.status.success() {
232 |             let user = String::from_utf8_lossy(&output.stdout).trim().to_string();
233 |             if !user.is_empty() {
234 |                 return Ok(user);
235 |             }
236 |         }
237 |     }
238 | 
239 |     #[cfg(target_os = "windows")]
240 |     {
241 |         use std::process::Command;
242 |         // On Windows, use wmic command
243 |         let output = Command::new("wmic")
244 |             .args([
245 |                 "process",
246 |                 "where",
247 |                 &format!("ProcessId={}", pid),
248 |                 "get",
249 |                 "ExecutablePath",
250 |                 "/format:value",
251 |             ])
252 |             .output()
253 |             .context("Failed to execute wmic command")?;
254 | 
255 |         if output.status.success() {
256 |             let output_str = String::from_utf8_lossy(&output.stdout);
257 |             for line in output_str.lines() {
258 |                 if line.starts_with("ExecutablePath=") {
259 |                     let path = line.strip_prefix("ExecutablePath=").unwrap_or("");
260 |                     if !path.is_empty() {
261 |                         // Extract username from path or use a default
262 |                         return Ok("windows_user".to_string());
263 |                     }
264 |                 }
265 |             }
266 |         }
267 |     }
268 | 
269 |     Ok("unknown".to_string())
270 | }
271 | 
272 | #[cfg(target_os = "linux")]
273 | fn get_username_from_uid(uid: u32) -> Result<String> {
274 |     use std::ffi::CString;
275 |     // use std::os::unix::ffi::OsStringExt; // Unused for now
276 | 
277 |     unsafe {
278 |         let passwd = libc::getpwuid(uid as libc::uid_t);
279 |         if passwd.is_null() {
280 |             return Ok(format!("uid_{}", uid));
281 |         }
282 | 
283 |         let username = CString::from_raw((*passwd).pw_name);
284 |         let username_str = username.to_string_lossy().to_string();
285 |         std::mem::forget(username); // Don't free the passwd struct
286 |         Ok(username_str)
287 |     }
288 | }
289 | 
290 | #[cfg(not(target_os = "linux"))]
291 | #[allow(dead_code)]
292 | fn get_username_from_uid(_uid: u32) -> Result<String> {
293 |     Ok("unknown".to_string())
294 | }
295 | 
296 | #[cfg(test)]
297 | mod tests {
298 |     use super::*;
299 |     use crate::nvml_api::NvmlApi;
300 | 
301 |     #[test]
302 |     fn test_process_info_creation() {
303 |         // Skip this test if NVML is not available
304 |         let nvml_api = match NvmlApi::new() {
305 |             Ok(api) => api,
306 |             Err(_) => {
307 |                 // Skip test if NVML is not available
308 |                 return;
309 |             }
310 |         };
311 | 
312 |         let mut proc_mgr = ProcessManager::new(nvml_api);
313 | 
314 |         // Test with a known process (init/systemd)
315 |         if let Ok(info) = proc_mgr.get_process_info(1) {
316 |             assert_eq!(info.pid, 1);
317 |             assert!(!info.name.is_empty());
318 |         }
319 |     }
320 | 
321 |     #[test]
322 |     fn test_process_validation() {
323 |         // Skip this test if NVML is not available
324 |         let nvml_api = match NvmlApi::new() {
325 |             Ok(api) => api,
326 |             Err(_) => {
327 |                 // Skip test if NVML is not available
328 |                 return;
329 |             }
330 |         };
331 | 
332 |         let proc_mgr = ProcessManager::new(nvml_api);
333 | 
334 |         // Test validation of non-existent process
335 |         let result = proc_mgr.validate_process(999999, false);
336 |         assert!(result.is_err());
337 |     }
338 | }
339 | 


--------------------------------------------------------------------------------
/mcp/src/resources.rs:
--------------------------------------------------------------------------------
  1 | //! MCP Resources for GPU Kill
  2 | 
  3 | use crate::types::*;
  4 | use gpukill::audit::AuditManager;
  5 | use gpukill::guard_mode::GuardModeManager;
  6 | use gpukill::rogue_detection::RogueDetector;
  7 | use gpukill::vendor::GpuManager;
  8 | use serde_json::json;
  9 | use std::collections::HashMap;
 10 | 
 11 | /// Resource handler for GPU Kill MCP server
 12 | pub struct ResourceHandler {
 13 |     gpu_manager: GpuManager,
 14 |     guard_mode: Option<GuardModeManager>,
 15 |     rogue_detector: Option<RogueDetector>,
 16 |     audit_manager: Option<AuditManager>,
 17 | }
 18 | 
 19 | impl ResourceHandler {
 20 |     pub async fn new() -> anyhow::Result<Self> {
 21 |         let gpu_manager = GpuManager::initialize()?;
 22 | 
 23 |         // Initialize optional components
 24 |         let guard_mode = GuardModeManager::new().ok();
 25 |         let audit_manager = AuditManager::new().await.ok();
 26 |         let rogue_detector = if let Some(am) = audit_manager {
 27 |             Some(RogueDetector::new(am))
 28 |         } else {
 29 |             None
 30 |         };
 31 | 
 32 |         Ok(Self {
 33 |             gpu_manager,
 34 |             guard_mode,
 35 |             rogue_detector,
 36 |             audit_manager: None, // We moved it to rogue_detector
 37 |         })
 38 |     }
 39 | 
 40 |     /// List all available resources
 41 |     pub fn list_resources(&self) -> Vec<Resource> {
 42 |         vec![
 43 |             Resource {
 44 |                 uri: "gpu://list".to_string(),
 45 |                 name: "GPU List".to_string(),
 46 |                 description: Some("Current GPU status and utilization".to_string()),
 47 |                 mime_type: Some("application/json".to_string()),
 48 |             },
 49 |             Resource {
 50 |                 uri: "gpu://processes".to_string(),
 51 |                 name: "GPU Processes".to_string(),
 52 |                 description: Some("Currently running GPU processes".to_string()),
 53 |                 mime_type: Some("application/json".to_string()),
 54 |             },
 55 |             Resource {
 56 |                 uri: "gpu://audit".to_string(),
 57 |                 name: "GPU Audit".to_string(),
 58 |                 description: Some("Historical GPU usage data".to_string()),
 59 |                 mime_type: Some("application/json".to_string()),
 60 |             },
 61 |             Resource {
 62 |                 uri: "gpu://policies".to_string(),
 63 |                 name: "Guard Mode Policies".to_string(),
 64 |                 description: Some("Current Guard Mode policies".to_string()),
 65 |                 mime_type: Some("application/json".to_string()),
 66 |             },
 67 |             Resource {
 68 |                 uri: "gpu://rogue-detection".to_string(),
 69 |                 name: "Rogue Detection".to_string(),
 70 |                 description: Some("Security scan results and threats".to_string()),
 71 |                 mime_type: Some("application/json".to_string()),
 72 |             },
 73 |         ]
 74 |     }
 75 | 
 76 |     /// Get resource contents by URI
 77 |     pub async fn get_resource(&self, uri: &str) -> anyhow::Result<ResourceContents> {
 78 |         match uri {
 79 |             "gpu://list" => self.get_gpu_list().await,
 80 |             "gpu://processes" => self.get_gpu_processes().await,
 81 |             "gpu://audit" => self.get_audit_data().await,
 82 |             "gpu://policies" => self.get_policies().await,
 83 |             "gpu://rogue-detection" => self.get_rogue_detection().await,
 84 |             _ => Err(anyhow::anyhow!("Unknown resource URI: {}", uri)),
 85 |         }
 86 |     }
 87 | 
 88 |     async fn get_gpu_list(&self) -> anyhow::Result<ResourceContents> {
 89 |         let gpus = self.gpu_manager.get_all_snapshots()?;
 90 |         let gpu_info: Vec<GpuInfo> = gpus
 91 |             .into_iter()
 92 |             .map(|gpu| GpuInfo {
 93 |                 id: gpu.gpu_index as u32,
 94 |                 name: gpu.name,
 95 |                 vendor: gpu.vendor.to_string(),
 96 |                 memory_used: gpu.mem_used_mb as f64,
 97 |                 memory_total: gpu.mem_total_mb as f64,
 98 |                 utilization: gpu.util_pct as f64,
 99 |                 temperature: Some(gpu.temp_c as f64),
100 |                 power_usage: Some(gpu.power_w as f64),
101 |                 processes: gpu
102 |                     .top_proc
103 |                     .map(|proc| GpuProcess {
104 |                         pid: proc.pid,
105 |                         name: proc.proc_name,
106 |                         memory_usage: proc.used_mem_mb as f64,
107 |                         user: Some(proc.user),
108 |                     })
109 |                     .into_iter()
110 |                     .collect(),
111 |             })
112 |             .collect();
113 | 
114 |         let json_text = serde_json::to_string_pretty(&gpu_info)?;
115 | 
116 |         Ok(ResourceContents {
117 |             uri: "gpu://list".to_string(),
118 |             mime_type: Some("application/json".to_string()),
119 |             text: Some(json_text),
120 |             blob: None,
121 |         })
122 |     }
123 | 
124 |     async fn get_gpu_processes(&self) -> anyhow::Result<ResourceContents> {
125 |         let gpus = self.gpu_manager.get_all_snapshots()?;
126 |         let mut all_processes = Vec::new();
127 | 
128 |         for gpu in gpus {
129 |             if let Some(proc) = gpu.top_proc {
130 |                 all_processes.push(GpuProcess {
131 |                     pid: proc.pid,
132 |                     name: proc.proc_name,
133 |                     memory_usage: proc.used_mem_mb as f64,
134 |                     user: Some(proc.user),
135 |                 });
136 |             }
137 |         }
138 | 
139 |         let json_text = serde_json::to_string_pretty(&all_processes)?;
140 | 
141 |         Ok(ResourceContents {
142 |             uri: "gpu://processes".to_string(),
143 |             mime_type: Some("application/json".to_string()),
144 |             text: Some(json_text),
145 |             blob: None,
146 |         })
147 |     }
148 | 
149 |     async fn get_audit_data(&self) -> anyhow::Result<ResourceContents> {
150 |         // For now, return empty audit data since we don't have access to audit_manager
151 |         // In a full implementation, we would need to restructure to share the audit_manager
152 |         Ok(ResourceContents {
153 |             uri: "gpu://audit".to_string(),
154 |             mime_type: Some("application/json".to_string()),
155 |             text: Some("[]".to_string()),
156 |             blob: None,
157 |         })
158 |     }
159 | 
160 |     async fn get_policies(&self) -> anyhow::Result<ResourceContents> {
161 |         if let Some(guard_mode) = &self.guard_mode {
162 |             let config = guard_mode.get_config();
163 |             let policies: Vec<PolicyInfo> = config
164 |                 .user_policies
165 |                 .iter()
166 |                 .map(|(name, policy)| {
167 |                     let mut limits = HashMap::new();
168 |                     limits.insert("memory_limit_gb".to_string(), json!(policy.memory_limit_gb));
169 |                     limits.insert(
170 |                         "utilization_limit_pct".to_string(),
171 |                         json!(policy.utilization_limit_pct),
172 |                     );
173 |                     limits.insert(
174 |                         "process_limit".to_string(),
175 |                         json!(policy.max_concurrent_processes),
176 |                     );
177 | 
178 |                     PolicyInfo {
179 |                         policy_type: "user".to_string(),
180 |                         name: name.clone(),
181 |                         enabled: true,
182 |                         limits,
183 |                     }
184 |                 })
185 |                 .collect();
186 | 
187 |             let json_text = serde_json::to_string_pretty(&policies)?;
188 | 
189 |             Ok(ResourceContents {
190 |                 uri: "gpu://policies".to_string(),
191 |                 mime_type: Some("application/json".to_string()),
192 |                 text: Some(json_text),
193 |                 blob: None,
194 |             })
195 |         } else {
196 |             Ok(ResourceContents {
197 |                 uri: "gpu://policies".to_string(),
198 |                 mime_type: Some("application/json".to_string()),
199 |                 text: Some("[]".to_string()),
200 |                 blob: None,
201 |             })
202 |         }
203 |     }
204 | 
205 |     async fn get_rogue_detection(&self) -> anyhow::Result<ResourceContents> {
206 |         if let Some(rogue_detector) = &self.rogue_detector {
207 |             let result = rogue_detector.detect_rogue_activity(24).await?;
208 | 
209 |             // Combine all threat types into a single list
210 |             let mut all_threats = Vec::new();
211 | 
212 |             // Add suspicious processes
213 |             for threat in result.suspicious_processes {
214 |                 all_threats.push(ThreatInfo {
215 |                     id: format!("suspicious_{}", threat.process.pid),
216 |                     threat_type: "suspicious_process".to_string(),
217 |                     severity: "medium".to_string(),
218 |                     confidence: threat.confidence as f64,
219 |                     description: format!("Suspicious process: {}", threat.process.proc_name),
220 |                     process_info: Some(GpuProcess {
221 |                         pid: threat.process.pid,
222 |                         name: threat.process.proc_name,
223 |                         memory_usage: threat.process.used_mem_mb as f64,
224 |                         user: Some(threat.process.user),
225 |                     }),
226 |                 });
227 |             }
228 | 
229 |             // Add crypto miners
230 |             for threat in result.crypto_miners {
231 |                 all_threats.push(ThreatInfo {
232 |                     id: format!("crypto_{}", threat.process.pid),
233 |                     threat_type: "crypto_miner".to_string(),
234 |                     severity: "high".to_string(),
235 |                     confidence: threat.confidence as f64,
236 |                     description: format!("Crypto miner detected: {}", threat.process.proc_name),
237 |                     process_info: Some(GpuProcess {
238 |                         pid: threat.process.pid,
239 |                         name: threat.process.proc_name,
240 |                         memory_usage: threat.process.used_mem_mb as f64,
241 |                         user: Some(threat.process.user),
242 |                     }),
243 |                 });
244 |             }
245 | 
246 |             // Add resource abusers
247 |             for threat in result.resource_abusers {
248 |                 all_threats.push(ThreatInfo {
249 |                     id: format!("abuser_{}", threat.process.pid),
250 |                     threat_type: "resource_abuser".to_string(),
251 |                     severity: "medium".to_string(),
252 |                     confidence: threat.severity as f64,
253 |                     description: format!("Resource abuser: {}", threat.process.proc_name),
254 |                     process_info: Some(GpuProcess {
255 |                         pid: threat.process.pid,
256 |                         name: threat.process.proc_name,
257 |                         memory_usage: threat.process.used_mem_mb as f64,
258 |                         user: Some(threat.process.user),
259 |                     }),
260 |                 });
261 |             }
262 | 
263 |             // Add data exfiltrators
264 |             for threat in result.data_exfiltrators {
265 |                 all_threats.push(ThreatInfo {
266 |                     id: format!("exfil_{}", threat.process.pid),
267 |                     threat_type: "data_exfiltrator".to_string(),
268 |                     severity: "high".to_string(),
269 |                     confidence: threat.confidence as f64,
270 |                     description: format!("Data exfiltrator: {}", threat.process.proc_name),
271 |                     process_info: Some(GpuProcess {
272 |                         pid: threat.process.pid,
273 |                         name: threat.process.proc_name,
274 |                         memory_usage: threat.process.used_mem_mb as f64,
275 |                         user: Some(threat.process.user),
276 |                     }),
277 |                 });
278 |             }
279 | 
280 |             let threat_info = all_threats;
281 | 
282 |             let json_text = serde_json::to_string_pretty(&threat_info)?;
283 | 
284 |             Ok(ResourceContents {
285 |                 uri: "gpu://rogue-detection".to_string(),
286 |                 mime_type: Some("application/json".to_string()),
287 |                 text: Some(json_text),
288 |                 blob: None,
289 |             })
290 |         } else {
291 |             Ok(ResourceContents {
292 |                 uri: "gpu://rogue-detection".to_string(),
293 |                 mime_type: Some("application/json".to_string()),
294 |                 text: Some("[]".to_string()),
295 |                 blob: None,
296 |             })
297 |         }
298 |     }
299 | }
300 | 


--------------------------------------------------------------------------------
/src/process_mgmt.rs:
--------------------------------------------------------------------------------
  1 | use crate::nvml_api::GpuProc;
  2 | use crate::proc::ProcessManager;
  3 | use anyhow::Result;
  4 | use regex::Regex;
  5 | use std::collections::HashMap;
  6 | use sysinfo::{Pid as SysPid, System};
  7 | 
  8 | /// Enhanced process management with filtering and batch operations
  9 | pub struct EnhancedProcessManager {
 10 |     pub process_manager: ProcessManager,
 11 |     system: System,
 12 | }
 13 | 
 14 | #[allow(dead_code)]
 15 | impl EnhancedProcessManager {
 16 |     pub fn new(process_manager: ProcessManager) -> Self {
 17 |         Self {
 18 |             process_manager,
 19 |             system: System::new_all(),
 20 |         }
 21 |     }
 22 | 
 23 |     /// Filter processes by name pattern (supports regex)
 24 |     pub fn filter_processes_by_name(
 25 |         &mut self,
 26 |         processes: &[GpuProc],
 27 |         pattern: &str,
 28 |     ) -> Result<Vec<GpuProc>> {
 29 |         let regex = Regex::new(pattern)
 30 |             .map_err(|e| anyhow::anyhow!("Invalid regex pattern '{}': {}", pattern, e))?;
 31 | 
 32 |         let mut filtered = Vec::new();
 33 |         for proc in processes {
 34 |             if regex.is_match(&proc.proc_name) {
 35 |                 filtered.push(proc.clone());
 36 |             }
 37 |         }
 38 | 
 39 |         Ok(filtered)
 40 |     }
 41 | 
 42 |     /// Filter processes by user
 43 |     pub fn filter_processes_by_user(
 44 |         &mut self,
 45 |         processes: &[GpuProc],
 46 |         user: &str,
 47 |     ) -> Result<Vec<GpuProc>> {
 48 |         let regex = Regex::new(user)
 49 |             .map_err(|e| anyhow::anyhow!("Invalid regex pattern '{}': {}", user, e))?;
 50 | 
 51 |         let mut filtered = Vec::new();
 52 |         for proc in processes {
 53 |             if regex.is_match(&proc.user) {
 54 |                 filtered.push(proc.clone());
 55 |             }
 56 |         }
 57 | 
 58 |         Ok(filtered)
 59 |     }
 60 | 
 61 |     /// Filter processes by memory usage threshold
 62 |     pub fn filter_processes_by_memory(
 63 |         &mut self,
 64 |         processes: &[GpuProc],
 65 |         min_mb: u32,
 66 |     ) -> Vec<GpuProc> {
 67 |         processes
 68 |             .iter()
 69 |             .filter(|proc| proc.used_mem_mb >= min_mb)
 70 |             .cloned()
 71 |             .collect()
 72 |     }
 73 | 
 74 |     /// Get process tree for a given PID
 75 |     pub fn get_process_tree(&mut self, root_pid: u32) -> Result<Vec<u32>> {
 76 |         self.system.refresh_processes();
 77 | 
 78 |         let mut pids = Vec::new();
 79 |         let mut to_process = vec![root_pid];
 80 | 
 81 |         while let Some(pid) = to_process.pop() {
 82 |             pids.push(pid);
 83 | 
 84 |             // Find child processes
 85 |             for process in self.system.processes().values() {
 86 |                 if let Some(parent) = process.parent() {
 87 |                     if parent.as_u32() == pid {
 88 |                         to_process.push(process.pid().as_u32());
 89 |                     }
 90 |                 }
 91 |             }
 92 |         }
 93 | 
 94 |         Ok(pids)
 95 |     }
 96 | 
 97 |     /// Kill a process and its children
 98 |     pub fn kill_process_tree(
 99 |         &mut self,
100 |         root_pid: u32,
101 |         timeout_secs: u16,
102 |         force: bool,
103 |     ) -> Result<()> {
104 |         let pids = self.get_process_tree(root_pid)?;
105 | 
106 |         tracing::info!("Killing process tree: {:?}", pids);
107 | 
108 |         // Kill children first, then parent
109 |         for pid in pids.iter().rev() {
110 |             if let Err(e) = self
111 |                 .process_manager
112 |                 .graceful_kill(*pid, timeout_secs, force)
113 |             {
114 |                 tracing::warn!("Failed to kill process {}: {}", pid, e);
115 |             }
116 |         }
117 | 
118 |         Ok(())
119 |     }
120 | 
121 |     /// Batch kill processes matching a pattern
122 |     pub fn batch_kill_processes(
123 |         &mut self,
124 |         processes: &[GpuProc],
125 |         timeout_secs: u16,
126 |         force: bool,
127 |     ) -> Result<Vec<u32>> {
128 |         let mut killed_pids = Vec::new();
129 |         let mut failed_pids = Vec::new();
130 | 
131 |         for proc in processes {
132 |             match self
133 |                 .process_manager
134 |                 .graceful_kill(proc.pid, timeout_secs, force)
135 |             {
136 |                 Ok(()) => {
137 |                     killed_pids.push(proc.pid);
138 |                     tracing::info!(
139 |                         "Successfully killed process {} ({})",
140 |                         proc.pid,
141 |                         proc.proc_name
142 |                     );
143 |                 }
144 |                 Err(e) => {
145 |                     failed_pids.push(proc.pid);
146 |                     tracing::warn!(
147 |                         "Failed to kill process {} ({}): {}",
148 |                         proc.pid,
149 |                         proc.proc_name,
150 |                         e
151 |                     );
152 |                 }
153 |             }
154 |         }
155 | 
156 |         if !failed_pids.is_empty() {
157 |             return Err(anyhow::anyhow!(
158 |                 "Failed to kill {} processes: {:?}",
159 |                 failed_pids.len(),
160 |                 failed_pids
161 |             ));
162 |         }
163 | 
164 |         Ok(killed_pids)
165 |     }
166 | 
167 |     /// Detect if a process is running in a container
168 |     pub fn detect_container(&mut self, pid: u32) -> Result<Option<String>> {
169 |         self.system.refresh_processes();
170 | 
171 |         let sys_pid = SysPid::from_u32(pid);
172 |         let process = self
173 |             .system
174 |             .process(sys_pid)
175 |             .ok_or_else(|| anyhow::anyhow!("Process {} not found", pid))?;
176 | 
177 |         // Check for common container indicators
178 |         let cmdline = process.cmd().join(" ");
179 | 
180 |         // Docker
181 |         if cmdline.contains("docker") || cmdline.contains("containerd") {
182 |             return Ok(Some("docker".to_string()));
183 |         }
184 | 
185 |         // Podman
186 |         if cmdline.contains("podman") {
187 |             return Ok(Some("podman".to_string()));
188 |         }
189 | 
190 |         // Kubernetes
191 |         if cmdline.contains("kubelet") || cmdline.contains("k8s") {
192 |             return Ok(Some("kubernetes".to_string()));
193 |         }
194 | 
195 |         // LXC
196 |         if cmdline.contains("lxc") {
197 |             return Ok(Some("lxc".to_string()));
198 |         }
199 | 
200 |         // Check environment variables for container indicators
201 |         let env = process.environ();
202 |         for env_var in env {
203 |             if env_var.starts_with("CONTAINER")
204 |                 || env_var.starts_with("DOCKER")
205 |                 || env_var.starts_with("KUBERNETES")
206 |             {
207 |                 return Ok(Some("container".to_string()));
208 |             }
209 |         }
210 | 
211 |         Ok(None)
212 |     }
213 | 
214 |     /// Enrich GPU processes with container information
215 |     pub fn enrich_with_containers(&mut self, mut processes: Vec<GpuProc>) -> Result<Vec<GpuProc>> {
216 |         for proc in &mut processes {
217 |             match self.detect_container(proc.pid) {
218 |                 Ok(container) => proc.container = container,
219 |                 Err(e) => {
220 |                     tracing::warn!("Failed to detect container for PID {}: {}", proc.pid, e);
221 |                     proc.container = None;
222 |                 }
223 |             }
224 |         }
225 | 
226 |         Ok(processes)
227 |     }
228 | 
229 |     /// Get process statistics
230 |     pub fn get_process_stats(&mut self, processes: &[GpuProc]) -> ProcessStats {
231 |         let mut stats = ProcessStats::default();
232 | 
233 |         for proc in processes {
234 |             stats.total_processes += 1;
235 |             stats.total_memory_mb += proc.used_mem_mb;
236 | 
237 |             // Count by user
238 |             *stats.users.entry(proc.user.clone()).or_insert(0) += 1;
239 | 
240 |             // Count by process name
241 |             *stats
242 |                 .process_names
243 |                 .entry(proc.proc_name.clone())
244 |                 .or_insert(0) += 1;
245 | 
246 |             // Count containers
247 |             if let Some(container) = &proc.container {
248 |                 *stats.containers.entry(container.clone()).or_insert(0) += 1;
249 |             } else {
250 |                 stats.non_container_processes += 1;
251 |             }
252 |         }
253 | 
254 |         stats
255 |     }
256 | }
257 | 
258 | /// Process statistics
259 | #[derive(Debug, Default)]
260 | pub struct ProcessStats {
261 |     pub total_processes: usize,
262 |     pub total_memory_mb: u32,
263 |     pub non_container_processes: usize,
264 |     pub users: HashMap<String, usize>,
265 |     pub process_names: HashMap<String, usize>,
266 |     pub containers: HashMap<String, usize>,
267 | }
268 | 
269 | impl std::fmt::Display for ProcessStats {
270 |     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
271 |         writeln!(f, "Process Statistics:")?;
272 |         writeln!(f, "  Total processes: {}", self.total_processes)?;
273 |         writeln!(f, "  Total memory: {} MB", self.total_memory_mb)?;
274 |         writeln!(
275 |             f,
276 |             "  Non-container processes: {}",
277 |             self.non_container_processes
278 |         )?;
279 | 
280 |         if !self.users.is_empty() {
281 |             writeln!(f, "  Users:")?;
282 |             for (user, count) in &self.users {
283 |                 writeln!(f, "    {}: {}", user, count)?;
284 |             }
285 |         }
286 | 
287 |         if !self.process_names.is_empty() {
288 |             writeln!(f, "  Process names:")?;
289 |             for (name, count) in &self.process_names {
290 |                 writeln!(f, "    {}: {}", name, count)?;
291 |             }
292 |         }
293 | 
294 |         if !self.containers.is_empty() {
295 |             writeln!(f, "  Containers:")?;
296 |             for (container, count) in &self.containers {
297 |                 writeln!(f, "    {}: {}", container, count)?;
298 |             }
299 |         }
300 | 
301 |         Ok(())
302 |     }
303 | }
304 | 
305 | #[cfg(test)]
306 | mod tests {
307 |     use super::*;
308 |     use crate::nvml_api::GpuProc;
309 | 
310 |     fn create_test_process(pid: u32, name: &str, user: &str, memory: u32) -> GpuProc {
311 |         GpuProc {
312 |             gpu_index: 0,
313 |             pid,
314 |             user: user.to_string(),
315 |             proc_name: name.to_string(),
316 |             used_mem_mb: memory,
317 |             start_time: "1h".to_string(),
318 |             container: None,
319 |         }
320 |     }
321 | 
322 |     #[test]
323 |     fn test_filter_processes_by_name() {
324 |         let processes = vec![
325 |             create_test_process(1, "python", "user1", 100),
326 |             create_test_process(2, "python3", "user1", 200),
327 |             create_test_process(3, "java", "user2", 300),
328 |         ];
329 | 
330 |         // Skip test if NVML is not available
331 |         if let Ok(nvml_api) = crate::nvml_api::NvmlApi::new() {
332 |             let mut manager = EnhancedProcessManager {
333 |                 process_manager: ProcessManager::new(nvml_api),
334 |                 system: System::new_all(),
335 |             };
336 | 
337 |             let filtered = manager
338 |                 .filter_processes_by_name(&processes, "python")
339 |                 .unwrap();
340 |             assert_eq!(filtered.len(), 2);
341 |             assert_eq!(filtered[0].proc_name, "python");
342 |             assert_eq!(filtered[1].proc_name, "python3");
343 |         }
344 |     }
345 | 
346 |     #[test]
347 |     fn test_filter_processes_by_memory() {
348 |         let processes = vec![
349 |             create_test_process(1, "python", "user1", 100),
350 |             create_test_process(2, "python3", "user1", 200),
351 |             create_test_process(3, "java", "user2", 300),
352 |         ];
353 | 
354 |         // Skip test if NVML is not available
355 |         if let Ok(nvml_api) = crate::nvml_api::NvmlApi::new() {
356 |             let mut manager = EnhancedProcessManager {
357 |                 process_manager: ProcessManager::new(nvml_api),
358 |                 system: System::new_all(),
359 |             };
360 | 
361 |             let filtered = manager.filter_processes_by_memory(&processes, 200);
362 |             assert_eq!(filtered.len(), 2);
363 |             assert!(filtered.iter().all(|p| p.used_mem_mb >= 200));
364 |         }
365 |     }
366 | 
367 |     #[test]
368 |     fn test_process_stats() {
369 |         let processes = vec![
370 |             create_test_process(1, "python", "user1", 100),
371 |             create_test_process(2, "python", "user1", 200),
372 |             create_test_process(3, "java", "user2", 300),
373 |         ];
374 | 
375 |         // Skip test if NVML is not available
376 |         if let Ok(nvml_api) = crate::nvml_api::NvmlApi::new() {
377 |             let mut manager = EnhancedProcessManager {
378 |                 process_manager: ProcessManager::new(nvml_api),
379 |                 system: System::new_all(),
380 |             };
381 | 
382 |             let stats = manager.get_process_stats(&processes);
383 |             assert_eq!(stats.total_processes, 3);
384 |             assert_eq!(stats.total_memory_mb, 600);
385 |             assert_eq!(stats.users.len(), 2);
386 |             assert_eq!(stats.process_names.len(), 2);
387 |         }
388 |     }
389 | }
390 | 


--------------------------------------------------------------------------------