├── docs ├── .nojekyll ├── 404.html ├── demo.html └── index.html ├── gpu-hot.png ├── version.py ├── static ├── favicon.svg └── js │ ├── app.js │ ├── ui.js │ ├── chart-config.js │ ├── socket-handlers.js │ └── chart-manager.js ├── requirements.txt ├── core ├── metrics │ ├── __init__.py │ ├── utils.py │ └── collector.py ├── __init__.py ├── config.py ├── hub_handlers.py ├── handlers.py ├── hub.py ├── nvidia_smi_fallback.py └── monitor.py ├── .dockerignore ├── tests ├── Dockerfile.test ├── docker-compose.test.yml ├── README.md └── test_cluster.py ├── docker-compose.yml ├── .editorconfig ├── Dockerfile ├── LICENSE ├── .gitignore ├── .github └── workflows │ └── publish.yml ├── README.md ├── app.py └── templates └── index.html /docs/.nojekyll: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /gpu-hot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/psalias2006/gpu-hot/HEAD/gpu-hot.png -------------------------------------------------------------------------------- /version.py: -------------------------------------------------------------------------------- 1 | """Version information for GPU Hot""" 2 | 3 | __version__ = "1.6.0" 4 | 5 | -------------------------------------------------------------------------------- /static/favicon.svg: -------------------------------------------------------------------------------- 1 | 2 | 🔥 3 | 4 | 5 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | fastapi==0.104.1 2 | uvicorn[standard]==0.24.0 3 | websockets==12.0 4 | psutil==5.9.6 5 | nvidia-ml-py==13.580.82 6 | requests==2.31.0 7 | websocket-client==1.6.3 8 | aiohttp==3.9.1 -------------------------------------------------------------------------------- 
/core/metrics/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | GPU Metrics Collection 3 | Organized collection of GPU metrics from NVML 4 | """ 5 | 6 | from .collector import MetricsCollector 7 | 8 | __all__ = ['MetricsCollector'] 9 | 10 | -------------------------------------------------------------------------------- /.dockerignore: -------------------------------------------------------------------------------- 1 | __pycache__/ 2 | *.pyc 3 | *.pyo 4 | *.pyd 5 | .Python 6 | *.so 7 | *.egg 8 | *.egg-info/ 9 | dist/ 10 | build/ 11 | .git/ 12 | .gitignore 13 | *.md 14 | !README.md 15 | docs/ 16 | *.png 17 | LICENSE 18 | .DS_Store 19 | -------------------------------------------------------------------------------- /core/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | GPU Hot - Core Package 3 | Real-time NVIDIA GPU monitoring application 4 | """ 5 | 6 | __version__ = '1.0.0' 7 | __author__ = 'GPU Hot Team' 8 | 9 | from .monitor import GPUMonitor 10 | from . import config 11 | 12 | __all__ = ['GPUMonitor', 'config'] 13 | 14 | -------------------------------------------------------------------------------- /tests/Dockerfile.test: -------------------------------------------------------------------------------- 1 | FROM python:3.9-slim 2 | 3 | WORKDIR /app 4 | 5 | # Install dependencies 6 | RUN pip install --no-cache-dir \ 7 | fastapi==0.104.1 \ 8 | uvicorn[standard]==0.24.0 \ 9 | websockets==12.0 10 | 11 | # Copy test script 12 | COPY tests/test_cluster.py . 
"""Utility functions for metrics collection"""

import pynvml  # kept for backward compatibility: callers may reference NVML errors via this module


def safe_get(func, *args, default=None):
    """Call an NVML query function, returning *default* on any failure.

    NVML raises ``pynvml.NVMLError`` for unsupported queries, but invalid
    handles or binding quirks can surface other exception types, so metric
    collection is kept best-effort by catching ``Exception`` (which already
    subsumes ``NVMLError`` — the original two-tuple was redundant).

    Args:
        func: NVML query callable to invoke.
        *args: Positional arguments forwarded to ``func``.
        default: Value returned when the query fails or yields ``None``.
    """
    try:
        result = func(*args)
        # Some NVML queries return None for "no data"; normalize to default.
        return result if result is not None else default
    except Exception:
        return default


def decode_bytes(value):
    """Decode UTF-8 bytes to str; pass non-bytes values through unchanged."""
    return value.decode('utf-8') if isinstance(value, bytes) else value


def to_mib(bytes_value):
    """Convert a byte count to MiB (binary mebibytes) as a float."""
    return float(bytes_value / (1024 ** 2))


def to_watts(milliwatts):
    """Convert milliwatts (NVML's power unit) to watts as a float."""
    return float(milliwatts / 1000.0)
-------------------------------------------------------------------------------- /.editorconfig: -------------------------------------------------------------------------------- 1 | # EditorConfig is awesome: https://EditorConfig.org 2 | 3 | # top-most EditorConfig file 4 | root = true 5 | 6 | # Unix-style newlines with a newline ending every file 7 | [*] 8 | end_of_line = lf 9 | insert_final_newline = true 10 | charset = utf-8 11 | trim_trailing_whitespace = true 12 | 13 | # Python files 14 | [*.py] 15 | indent_style = space 16 | indent_size = 4 17 | max_line_length = 120 18 | 19 | # HTML/CSS/JS files 20 | [*.{html,css,js}] 21 | indent_style = space 22 | indent_size = 2 23 | 24 | # YAML files 25 | [*.{yml,yaml}] 26 | indent_style = space 27 | indent_size = 2 28 | 29 | # Markdown files 30 | [*.md] 31 | trim_trailing_whitespace = false 32 | max_line_length = off 33 | 34 | # Dockerfile 35 | [Dockerfile] 36 | indent_style = space 37 | indent_size = 2 38 | 39 | # Shell scripts 40 | [*.sh] 41 | indent_style = space 42 | indent_size = 2 43 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM nvidia/cuda:12.2.2-runtime-ubuntu22.04 2 | 3 | # Set environment variables 4 | ENV DEBIAN_FRONTEND=noninteractive 5 | ENV PYTHONUNBUFFERED=1 6 | 7 | # Install system dependencies 8 | RUN apt-get update && apt-get install -y \ 9 | python3 \ 10 | python3-pip \ 11 | curl \ 12 | && rm -rf /var/lib/apt/lists/* 13 | 14 | # Set working directory 15 | WORKDIR /app 16 | 17 | # Copy requirements and install Python dependencies 18 | COPY requirements.txt . 19 | RUN pip3 install --no-cache-dir -r requirements.txt 20 | 21 | # Copy application code 22 | COPY . . 
"""
Configuration settings for GPU Hot
"""

import os
import socket

# Server configuration (served by FastAPI in app.py).
# NOTE(review): SECRET_KEY looks like a Flask-era leftover — confirm it is still read anywhere.
SECRET_KEY = 'gpu_hot_secret'
HOST = '0.0.0.0'  # bind on all interfaces (container-friendly)
PORT = 1312
DEBUG = False  # also selects the log level in app.py (DEBUG -> logging.DEBUG, else INFO)

# Monitoring Configuration
UPDATE_INTERVAL = 0.5  # Update interval for NVML (sub-second monitoring)
NVIDIA_SMI_INTERVAL = 2.0  # Update interval for nvidia-smi fallback (slower to reduce overhead)

# GPU Monitoring Mode
# Can be set via environment variable: NVIDIA_SMI=true
NVIDIA_SMI = os.getenv('NVIDIA_SMI', 'false').lower() == 'true'

# Multi-Node Configuration
# MODE: default (single node monitoring), hub (aggregate multiple nodes)
MODE = os.getenv('GPU_HOT_MODE', 'default')
# Display name for this node; falls back to the machine hostname.
NODE_NAME = os.getenv('NODE_NAME', socket.gethostname())
# NODE_URLS: comma-separated URLs for hub mode (e.g., http://node1:1312,http://node2:1312)
# Empty entries (trailing commas, blank var) are filtered out.
NODE_URLS = [url.strip() for url in os.getenv('NODE_URLS', '').split(',') if url.strip()]
including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Python 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | *.so 6 | .Python 7 | *.egg 8 | *.egg-info/ 9 | dist/ 10 | build/ 11 | eggs/ 12 | .eggs/ 13 | lib/ 14 | lib64/ 15 | parts/ 16 | sdist/ 17 | var/ 18 | wheels/ 19 | pip-wheel-metadata/ 20 | share/python-wheels/ 21 | *.manifest 22 | *.spec 23 | 24 | # Virtual Environment 25 | venv/ 26 | env/ 27 | ENV/ 28 | env.bak/ 29 | venv.bak/ 30 | .venv/ 31 | 32 | # IDE / Editor 33 | .vscode/ 34 | .idea/ 35 | *.swp 36 | *.swo 37 | *~ 38 | .project 39 | .pydevproject 40 | .settings/ 41 | *.sublime-project 42 | *.sublime-workspace 43 | .DS_Store 44 | 45 | # Flask 46 | instance/ 47 | .webassets-cache 48 | .env 49 | .flaskenv 50 | 51 | # Testing 52 | .pytest_cache/ 53 | .coverage 54 | .coverage.* 55 | htmlcov/ 56 | .tox/ 57 | .nox/ 58 | .hypothesis/ 59 | 60 | # Logs 61 | *.log 62 | logs/ 63 | *.log.* 64 | 65 | # Database 66 | *.db 67 | *.sqlite 68 | *.sqlite3 69 | 70 
/**
 * GPU Hot - Main Application
 * Initializes the application when the DOM is ready
 */

// Application initialization
document.addEventListener('DOMContentLoaded', function() {
    console.log('GPU Hot application initialized');

    // All functionality is loaded from other modules:
    // - chart-config.js / chart-manager.js: Chart configurations and updates
    // - ui.js: UI interactions and navigation
    // - socket-handlers.js: Real-time data updates via the WebSocket connection

    // The socket connection is established automatically when socket-handlers.js loads

    // Check for version updates
    checkVersion();
});

/**
 * Check current version and update availability.
 * Fetches /api/version and updates the version label; when an update is
 * available, reveals the badge and points the link at the release page.
 * Every DOM element is guarded: the original only guarded #version-current
 * and would throw a TypeError if #update-badge or #update-link were absent.
 */
async function checkVersion() {
    try {
        const response = await fetch('/api/version');
        const data = await response.json();

        const versionCurrent = document.getElementById('version-current');
        const updateBadge = document.getElementById('update-badge');
        const updateLink = document.getElementById('update-link');

        if (versionCurrent) {
            versionCurrent.textContent = `v${data.current}`;
        }

        if (data.update_available && data.latest) {
            if (updateBadge) {
                updateBadge.style.display = 'inline-block';
            }
            if (updateLink) {
                updateLink.href = data.release_url || 'https://github.com/psalias2006/gpu-hot/releases/latest';
                updateLink.title = `Update to v${data.latest}`;
            }
        }
    } catch (error) {
        // Network/parse failures are non-fatal; show a placeholder instead.
        console.debug('Failed to check version:', error);
        const versionCurrent = document.getElementById('version-current');
        if (versionCurrent) {
            versionCurrent.textContent = 'Unknown';
        }
    }
}
54 | platforms: linux/amd64,linux/arm64 55 | push: ${{ github.event_name != 'pull_request' }} 56 | tags: ${{ steps.meta.outputs.tags }} 57 | labels: ${{ steps.meta.outputs.labels }} 58 | cache-from: type=gha 59 | cache-to: type=gha,mode=max 60 | build-args: | 61 | BUILDKIT_INLINE_CACHE=1 62 | 63 | - name: Generate artifact 64 | if: github.event_name != 'pull_request' 65 | uses: actions/attest-build-provenance@v1 66 | with: 67 | subject-name: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }} 68 | subject-digest: ${{ steps.build.outputs.digest }} 69 | push-to-registry: true 70 | -------------------------------------------------------------------------------- /tests/docker-compose.test.yml: -------------------------------------------------------------------------------- 1 | services: 2 | # Mock GPU cluster (simulates multiple GPU servers) 3 | mock-cluster: 4 | build: 5 | context: .. 6 | dockerfile: tests/Dockerfile.test 7 | container_name: gpu-hot-mock-cluster 8 | hostname: mock-cluster 9 | ports: 10 | - "13120-13150:13120-13150" 11 | environment: 12 | # LOAD TEST PRESETS - uncomment one: 13 | # LIGHT: 3 nodes, 5 GPUs (typical small lab) 14 | - NODES=1,2,2 15 | # MEDIUM: 8 nodes, 64 GPUs (medium cluster) 16 | #- NODES=8,8,8,8,8,8,8,8 17 | # HEAVY: 20 nodes, 160 GPUs (large production) 18 | # - NODES=8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8 19 | 20 | - BASE_PORT=13120 21 | - PREFIX=gpu-server 22 | networks: 23 | gpu-hot-test: 24 | aliases: 25 | - mock-cluster 26 | 27 | # Hub (aggregates all mock nodes) 28 | hub: 29 | build: 30 | context: .. 
"""Async WebSocket handlers for hub mode"""

import asyncio
import logging
import json
from fastapi import WebSocket

logger = logging.getLogger(__name__)

# Global set of connected dashboard WebSocket clients.
websocket_connections = set()


def register_hub_handlers(app, hub):
    """Register the FastAPI WebSocket endpoint serving dashboard clients.

    The first client connection lazily starts the hub broadcast loop and the
    upstream node connections.
    """

    @app.websocket("/socket.io/")
    async def websocket_endpoint(websocket: WebSocket):
        await websocket.accept()
        websocket_connections.add(websocket)
        logger.debug('Dashboard client connected')

        # Start the broadcast loop once, on the first connection.
        if not hub.running:
            hub.running = True
            asyncio.create_task(hub_loop(hub, websocket_connections))

        # Start node connections if not already started
        if not hub._connection_started:
            hub._connection_started = True
            asyncio.create_task(hub._connect_all_nodes())

        try:
            # Keep the connection open; inbound messages are ignored.
            while True:
                await websocket.receive_text()
        except Exception as e:
            logger.debug(f'Dashboard client disconnected: {e}')
        finally:
            websocket_connections.discard(websocket)


async def hub_loop(hub, connections):
    """Async background loop that emits aggregated cluster data.

    Broadcasts ``hub.get_cluster_data()`` to every connected client roughly
    every 0.5s, pruning clients whose send fails.
    """
    logger.info("Hub monitoring loop started")

    while hub.running:
        try:
            cluster_data = await hub.get_cluster_data()

            # Send to all connected clients
            if connections:
                payload = json.dumps(cluster_data)  # serialize once per tick
                disconnected = set()
                # Iterate over a snapshot: the endpoint coroutine can add or
                # discard entries while we await sends, and mutating a set
                # during iteration raises RuntimeError.
                for websocket in list(connections):
                    try:
                        await websocket.send_text(payload)
                    except Exception:
                        # Failed send => treat the client as disconnected.
                        disconnected.add(websocket)

                # Remove disconnected clients
                connections -= disconnected

        except Exception as e:
            logger.error(f"Error in hub loop: {e}")

        # Match node update rate for real-time responsiveness
        await asyncio.sleep(0.5)
13 | 14 | ## Architecture 15 | 16 | - **FastAPI + AsyncIO**: Modern async Python for better performance 17 | - **Native WebSockets**: No Socket.IO overhead, direct WebSocket protocol 18 | - **Concurrent Mock Nodes**: Multiple nodes running in parallel 19 | - **Realistic GPU Patterns**: Training jobs with epochs, warmup, validation 20 | 21 | ## Load Test Presets 22 | 23 | Edit `docker-compose.test.yml` and uncomment the preset you want: 24 | 25 | ### LIGHT (3 nodes, 14 GPUs) 26 | Good for development and quick testing. 27 | ```yaml 28 | - NODES=2,4,8 29 | - NODE_URLS=http://mock-cluster:13120,http://mock-cluster:13121,http://mock-cluster:13122 30 | ``` 31 | 32 | ### MEDIUM (8 nodes, 64 GPUs) ⭐ Default 33 | Realistic medium-sized cluster. 34 | ```yaml 35 | - NODES=8,8,8,8,8,8,8,8 36 | - NODE_URLS=http://mock-cluster:13120,...,http://mock-cluster:13127 37 | ``` 38 | 39 | ### HEAVY (20 nodes, 160 GPUs) 40 | Stress test for large production environments. 41 | ```yaml 42 | - NODES=8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8 43 | - NODE_URLS=http://mock-cluster:13120,...,http://mock-cluster:13139 44 | ``` 45 | 46 | ## What's Simulated 47 | 48 | - **Realistic GPU patterns**: Training jobs with epochs, warmup, validation 49 | - **Idle + busy GPUs**: ~40% utilization typical of real clusters 50 | - **Stable memory**: Memory allocated at job start, stays constant 51 | - **Clock speeds**: Proper P-states (P0/P2/P8) 52 | - **Data loading dips**: Periodic utilization drops 53 | - **Temperature correlation**: Realistic thermal behavior 54 | 55 | ## Files 56 | 57 | - `test_cluster.py` - Mock GPU node with realistic patterns (FastAPI + AsyncIO) 58 | - `docker-compose.test.yml` - Test stack with preset configurations 59 | - `Dockerfile.test` - Container for mock nodes (FastAPI dependencies) 60 | 61 | ## Performance Benefits 62 | 63 | - **20-40% latency reduction** with true async/await 64 | - **2-3x more concurrent connections** supported 65 | - **Better resource utilization** for hub 
"""Async WebSocket handlers for real-time monitoring"""

import asyncio
import psutil
import logging
import json
from datetime import datetime
from fastapi import WebSocket
from . import config

logger = logging.getLogger(__name__)

# Global set of connected dashboard WebSocket clients.
websocket_connections = set()


def register_handlers(app, monitor):
    """Register the FastAPI WebSocket endpoint for the single-node dashboard.

    The first client connection lazily starts the monitoring loop.
    """

    @app.websocket("/socket.io/")
    async def websocket_endpoint(websocket: WebSocket):
        await websocket.accept()
        websocket_connections.add(websocket)
        logger.debug('Dashboard client connected')

        # Start the monitoring loop once, on the first connection.
        if not monitor.running:
            monitor.running = True
            asyncio.create_task(monitor_loop(monitor, websocket_connections))

        try:
            # Keep the connection open; inbound messages are ignored.
            while True:
                await websocket.receive_text()
        except Exception as e:
            logger.debug(f'Dashboard client disconnected: {e}')
        finally:
            websocket_connections.discard(websocket)


async def monitor_loop(monitor, connections):
    """Async background loop that collects and emits GPU data."""
    # Determine update interval based on whether any GPU uses nvidia-smi
    uses_nvidia_smi = any(monitor.use_smi.values()) if hasattr(monitor, 'use_smi') else False
    update_interval = config.NVIDIA_SMI_INTERVAL if uses_nvidia_smi else config.UPDATE_INTERVAL

    if uses_nvidia_smi:
        logger.info(f"Using nvidia-smi polling interval: {update_interval}s")
    else:
        logger.info(f"Using NVML polling interval: {update_interval}s")

    while monitor.running:
        try:
            # Collect GPU metrics and process list concurrently.
            gpu_data, processes = await asyncio.gather(
                monitor.get_gpu_data(),
                monitor.get_processes()
            )

            system_info = {
                'cpu_percent': psutil.cpu_percent(percpu=False),
                'memory_percent': psutil.virtual_memory().percent,
                'timestamp': datetime.now().isoformat()
            }

            data = {
                'mode': config.MODE,
                'node_name': config.NODE_NAME,
                'gpus': gpu_data,
                'processes': processes,
                'system': system_info
            }

            # Send to all connected clients
            if connections:
                payload = json.dumps(data)  # serialize once per tick
                disconnected = set()
                # Iterate over a snapshot: the endpoint coroutine can add or
                # discard entries while we await sends, and mutating a set
                # during iteration raises RuntimeError.
                for websocket in list(connections):
                    try:
                        await websocket.send_text(payload)
                    except Exception:
                        # Failed send => treat the client as disconnected.
                        disconnected.add(websocket)

                # Remove disconnected clients
                connections -= disconnected

        except Exception as e:
            logger.error(f"Error in monitor loop: {e}")

        await asyncio.sleep(update_interval)
2 | 3 | # GPU Hot 4 | 5 | Real-time NVIDIA GPU monitoring dashboard. Web-based, no SSH required. 6 | 7 | [![Python](https://img.shields.io/badge/Python-3.8+-3776AB?style=flat-square&logo=python&logoColor=white)](https://www.python.org/) 8 | [![Docker](https://img.shields.io/badge/Docker-Ready-2496ED?style=flat-square&logo=docker&logoColor=white)](https://www.docker.com/) 9 | [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](LICENSE) 10 | [![NVIDIA](https://img.shields.io/badge/NVIDIA-GPU-76B900?style=flat-square&logo=nvidia&logoColor=white)](https://www.nvidia.com/) 11 | 12 | GPU Hot Dashboard 13 | 14 |
15 | 16 | --- 17 | 18 | ## Usage 19 | 20 | Monitor a single machine or an entire cluster with the same Docker image. 21 | 22 | **Single machine:** 23 | ```bash 24 | docker run -d --gpus all -p 1312:1312 ghcr.io/psalias2006/gpu-hot:latest 25 | ``` 26 | 27 | **Multiple machines:** 28 | ```bash 29 | # On each GPU server 30 | docker run -d --gpus all -p 1312:1312 -e NODE_NAME=$(hostname) ghcr.io/psalias2006/gpu-hot:latest 31 | 32 | # On a hub machine (no GPU required) 33 | docker run -d -p 1312:1312 -e GPU_HOT_MODE=hub -e NODE_URLS=http://server1:1312,http://server2:1312,http://server3:1312 ghcr.io/psalias2006/gpu-hot:latest 34 | ``` 35 | 36 | Open `http://localhost:1312` 37 | 38 | **Older GPUs:** Add `-e NVIDIA_SMI=true` if metrics don't appear. 39 | 40 | **Process monitoring:** Add `--init --pid=host` to see process names. Note: This allows the container to access host process information. 41 | 42 | **From source:** 43 | ```bash 44 | git clone https://github.com/psalias2006/gpu-hot 45 | cd gpu-hot 46 | docker-compose up --build 47 | ``` 48 | 49 | **Requirements:** Docker + [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html) 50 | 51 | --- 52 | 53 | ## Features 54 | 55 | - Real-time metrics (sub-second) 56 | - Automatic multi-GPU detection 57 | - Process monitoring (PID, memory usage) 58 | - Historical charts (utilization, temperature, power, clocks) 59 | - System metrics (CPU, RAM) 60 | - Scale from 1 to 100+ GPUs 61 | 62 | **Metrics:** Utilization, temperature, memory, power draw, fan speed, clock speeds, PCIe info, P-State, throttle status, encoder/decoder sessions 63 | 64 | --- 65 | 66 | ## Configuration 67 | 68 | **Environment variables:** 69 | ```bash 70 | NVIDIA_VISIBLE_DEVICES=0,1 # Specific GPUs (default: all) 71 | NVIDIA_SMI=true # Force nvidia-smi mode for older GPUs 72 | GPU_HOT_MODE=hub # Set to 'hub' for multi-node aggregation (default: single node) 73 | NODE_NAME=gpu-server-1 # Node 
display name (default: hostname) 74 | NODE_URLS=http://host:1312... # Comma-separated node URLs (required for hub mode) 75 | ``` 76 | 77 | **Backend (`core/config.py`):** 78 | ```python 79 | UPDATE_INTERVAL = 0.5 # Polling interval 80 | PORT = 1312 # Server port 81 | ``` 82 | 83 | --- 84 | 85 | ## API 86 | 87 | ### HTTP 88 | ```bash 89 | GET / # Dashboard 90 | GET /api/gpu-data # JSON metrics 91 | ``` 92 | 93 | ### WebSocket 94 | ```javascript 95 | socket.on('gpu_data', (data) => { 96 | // Updates every 0.5s (configurable) 97 | // Contains: data.gpus, data.processes, data.system 98 | }); 99 | ``` 100 | --- 101 | 102 | ## Project Structure 103 | 104 | ```bash 105 | gpu-hot/ 106 | ├── app.py # Flask + WebSocket server 107 | ├── core/ 108 | │ ├── config.py # Configuration 109 | │ ├── monitor.py # NVML GPU monitoring 110 | │ ├── handlers.py # WebSocket handlers 111 | │ ├── routes.py # HTTP routes 112 | │ └── metrics/ 113 | │ ├── collector.py # Metrics collection 114 | │ └── utils.py # Metric utilities 115 | ├── static/ 116 | │ ├── js/ 117 | │ │ ├── charts.js # Chart configs 118 | │ │ ├── gpu-cards.js # UI components 119 | │ │ ├── socket-handlers.js # WebSocket + rendering 120 | │ │ ├── ui.js # View management 121 | │ │ └── app.js # Init 122 | │ └── css/styles.css 123 | ├── templates/index.html 124 | ├── Dockerfile 125 | └── docker-compose.yml 126 | ``` 127 | 128 | --- 129 | 130 | ## Troubleshooting 131 | 132 | **No GPUs detected:** 133 | ```bash 134 | nvidia-smi # Verify drivers work 135 | docker run --rm --gpus all nvidia/cuda:12.1.0-base-ubuntu22.04 nvidia-smi # Test Docker GPU access 136 | ``` 137 | 138 | **Hub can't connect to nodes:** 139 | ```bash 140 | curl http://node-ip:1312/api/gpu-data # Test connectivity 141 | sudo ufw allow 1312/tcp # Check firewall 142 | ``` 143 | 144 | **Performance issues:** Increase `UPDATE_INTERVAL` in `core/config.py` 145 | 146 | --- 147 | 148 | ## Star History 149 | 150 | [![Star History 
Chart](https://api.star-history.com/svg?repos=psalias2006/gpu-hot&type=date&legend=top-left)](https://www.star-history.com/#psalias2006/gpu-hot&type=date&legend=top-left) 151 | 152 | ## Contributing 153 | 154 | PRs welcome. Open an issue for major changes. 155 | 156 | ## License 157 | 158 | MIT - see [LICENSE](LICENSE) 159 | -------------------------------------------------------------------------------- /app.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """GPU Hot - Real-time NVIDIA GPU Monitoring Dashboard (FastAPI + AsyncIO)""" 3 | 4 | import asyncio 5 | import logging 6 | import aiohttp 7 | from fastapi import FastAPI, WebSocket, WebSocketDisconnect 8 | from fastapi.staticfiles import StaticFiles 9 | from fastapi.responses import HTMLResponse, JSONResponse 10 | from core import config 11 | from version import __version__ 12 | 13 | # Setup logging 14 | logging.basicConfig( 15 | level=logging.DEBUG if config.DEBUG else logging.INFO, 16 | format='%(asctime)s - %(name)s - %(levelname)s - %(message)s' 17 | ) 18 | logger = logging.getLogger(__name__) 19 | 20 | app = FastAPI(title="GPU Hot", version=__version__) 21 | 22 | # Serve static files 23 | app.mount("/static", StaticFiles(directory="static"), name="static") 24 | 25 | # Mode selection 26 | if config.MODE == 'hub': 27 | # Hub mode: aggregate data from multiple nodes 28 | if not config.NODE_URLS: 29 | raise ValueError("Hub mode requires NODE_URLS environment variable") 30 | 31 | logger.info("Starting GPU Hot in HUB mode (FastAPI)") 32 | logger.info(f"Connecting to {len(config.NODE_URLS)} node(s): {config.NODE_URLS}") 33 | 34 | from core.hub import Hub 35 | from core.hub_handlers import register_hub_handlers 36 | 37 | hub = Hub(config.NODE_URLS) 38 | register_hub_handlers(app, hub) 39 | monitor_or_hub = hub 40 | 41 | else: 42 | # Default mode: monitor local GPUs and serve dashboard 43 | logger.info("Starting GPU Hot (FastAPI)") 44 | 
def compare_versions(current, latest):
    """Compare two dotted semantic version strings.

    Returns True when *latest* is strictly newer than *current*;
    False when it is equal, older, or either string is unparseable.
    """
    try:
        cur = [int(part) for part in current.split('.')]
        new = [int(part) for part in latest.split('.')]
    except (ValueError, AttributeError):
        # Non-numeric components or a non-string input never report an update.
        return False

    # Pad the shorter version with zeros so "1.6" compares like "1.6.0".
    width = max(len(cur), len(new))
    cur.extend([0] * (width - len(cur)))
    new.extend([0] * (width - len(new)))

    # Element-wise list comparison is exactly lexicographic semver ordering.
    return new > cur
data = await response.json() 110 | latest_version = data.get("tag_name", "").lstrip("v") 111 | 112 | # Only show update if latest > current 113 | update_available = compare_versions(current_version, latest_version) if latest_version else False 114 | 115 | return JSONResponse({ 116 | "current": current_version, 117 | "latest": latest_version, 118 | "update_available": update_available, 119 | "release_url": data.get("html_url", "") 120 | }) 121 | except Exception as e: 122 | logger.debug(f"Failed to check for updates: {e}") 123 | 124 | # Return current version even if GitHub check fails 125 | return JSONResponse({ 126 | "current": current_version, 127 | "latest": None, 128 | "update_available": False, 129 | "release_url": None 130 | }) 131 | 132 | 133 | if __name__ == '__main__': 134 | import uvicorn 135 | try: 136 | logger.info(f"Server running on {config.HOST}:{config.PORT}") 137 | uvicorn.run(app, host=config.HOST, port=config.PORT, log_level="info") 138 | finally: 139 | if hasattr(monitor_or_hub, 'shutdown'): 140 | asyncio.run(monitor_or_hub.shutdown()) 141 | -------------------------------------------------------------------------------- /templates/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | GPU Hot 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 |
18 |
19 |

🔥 GPU Hot

20 |

Real-time NVIDIA GPU monitoring dashboard

21 | 35 |
36 | 37 |
38 |
39 |
40 | Live Monitoring 41 |
42 |
Connecting...
43 |
44 | 45 | 46 |
47 | 48 |
49 | 50 | 51 |
52 |
53 |
54 |
55 | Loading GPU data... 56 |
57 |
58 |
59 | 60 | 61 | 62 |
63 |
64 |
65 | Active GPU Processes 66 |
67 |
68 | 0 processes 69 | 70 |
71 |
72 |
73 |
74 |
75 |
76 | Loading processes... 77 |
78 |
79 |
80 |
81 | 82 |
83 |
84 | 85 |
0%
86 |
System CPU
87 |
Host Processor
88 |
89 |
90 | 91 |
0%
92 |
System RAM
93 |
Host Memory
94 |
95 |
96 |
97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | -------------------------------------------------------------------------------- /static/js/ui.js: -------------------------------------------------------------------------------- 1 | /** 2 | * UI Interactions and navigation 3 | */ 4 | 5 | // Global state 6 | let currentTab = 'overview'; 7 | let registeredGPUs = new Set(); 8 | let hasAutoSwitched = false; // Track if we've done initial auto-switch 9 | 10 | // Toggle processes section 11 | function toggleProcesses() { 12 | const content = document.getElementById('processes-content'); 13 | const header = document.querySelector('.processes-header'); 14 | const icon = document.querySelector('.toggle-icon'); 15 | 16 | content.classList.toggle('expanded'); 17 | header.classList.toggle('expanded'); 18 | icon.classList.toggle('expanded'); 19 | } 20 | 21 | // Tab switching with smooth transitions 22 | function switchToView(viewName) { 23 | if (!viewName) { 24 | console.warn('switchToView: Missing viewName'); 25 | return; 26 | } 27 | 28 | currentTab = viewName; 29 | 30 | // Update view selector states 31 | document.querySelectorAll('.view-option').forEach(btn => { 32 | btn.classList.remove('active'); 33 | if (btn.dataset.view === viewName) { 34 | btn.classList.add('active'); 35 | } 36 | }); 37 | 38 | // Switch tab content with animation 39 | document.querySelectorAll('.tab-content').forEach(content => { 40 | content.classList.remove('active'); 41 | }); 42 | 43 | const targetContent = document.getElementById(`tab-${viewName}`); 44 | if (!targetContent) { 45 | console.warn(`switchToView: Tab content not found for "${viewName}"`); 46 | return; 47 | } 48 | 49 | targetContent.classList.add('active'); 50 | 51 | // Trigger chart resize for visible charts immediately without animation 52 | if (viewName.startsWith('gpu-')) { 53 | const gpuId = viewName.replace('gpu-', ''); 54 | 55 | // Disable animations during resize to prevent glitchy transitions 56 | if (charts && 
charts[gpuId]) { 57 | Object.values(charts[gpuId]).forEach(chart => { 58 | if (!chart) return; 59 | 60 | try { 61 | if (chart.options) { 62 | // Store original animation setting 63 | const originalAnimation = chart.options.animation; 64 | 65 | // Temporarily disable all animations 66 | chart.options.animation = false; 67 | 68 | // Resize without animation 69 | if (typeof chart.resize === 'function') { 70 | chart.resize(); 71 | } 72 | 73 | // Force immediate update without animation 74 | if (typeof chart.update === 'function') { 75 | chart.update('none'); 76 | } 77 | 78 | // Restore original animation setting 79 | chart.options.animation = originalAnimation; 80 | } 81 | } catch (error) { 82 | console.error(`Error resizing chart for GPU ${gpuId}:`, error); 83 | } 84 | }); 85 | } 86 | } 87 | } 88 | 89 | // Create or update GPU tab 90 | function ensureGPUTab(gpuId, gpuInfo, shouldUpdateDOM = true) { 91 | if (!registeredGPUs.has(gpuId)) { 92 | // Add view option 93 | const viewSelector = document.getElementById('view-selector'); 94 | const viewOption = document.createElement('button'); 95 | viewOption.className = 'view-option'; 96 | viewOption.dataset.view = `gpu-${gpuId}`; 97 | viewOption.textContent = `GPU ${gpuId}`; 98 | viewOption.onclick = () => switchToView(`gpu-${gpuId}`); 99 | viewSelector.appendChild(viewOption); 100 | 101 | // Create tab content 102 | const tabContent = document.createElement('div'); 103 | tabContent.id = `tab-gpu-${gpuId}`; 104 | tabContent.className = 'tab-content'; 105 | tabContent.innerHTML = `
// Tear down everything associated with a GPU's tab: the selector button,
// the content pane, its Chart.js instances, and the registry entry.
function removeGPUTab(gpuId) {
    // Nothing to do for GPUs that were never registered.
    if (!registeredGPUs.has(gpuId)) {
        return;
    }

    const viewKey = `gpu-${gpuId}`;

    // Don't leave the user staring at a tab that is about to disappear.
    if (currentTab === viewKey) {
        switchToView('overview');
    }

    // Drop the view-selector button, if present.
    const button = document.querySelector(`.view-option[data-view="${viewKey}"]`);
    if (button) {
        button.remove();
    }

    // Drop the tab's content pane, if present.
    const pane = document.getElementById(`tab-${viewKey}`);
    if (pane) {
        pane.remove();
    }

    // Free Chart.js resources before forgetting the GPU.
    const gpuCharts = charts[gpuId];
    if (gpuCharts) {
        for (const chart of Object.values(gpuCharts)) {
            if (chart && chart.destroy) {
                chart.destroy();
            }
        }
        delete charts[gpuId];
    }

    registeredGPUs.delete(gpuId);
}
ensure DOM is ready 169 | hasAutoSwitched = true; 170 | } 171 | } 172 | 173 | // Make switchToView globally available 174 | window.switchToView = switchToView; 175 | -------------------------------------------------------------------------------- /core/hub.py: -------------------------------------------------------------------------------- 1 | """Async Hub mode - aggregates data from multiple nodes""" 2 | 3 | import asyncio 4 | import logging 5 | import json 6 | import websockets 7 | from datetime import datetime 8 | from . import config 9 | 10 | logger = logging.getLogger(__name__) 11 | 12 | 13 | class Hub: 14 | """Aggregates GPU data from multiple nodes""" 15 | 16 | def __init__(self, node_urls): 17 | self.node_urls = node_urls 18 | self.nodes = {} # node_name -> {client, data, status, last_update} 19 | self.url_to_node = {} # url -> node_name mapping 20 | self.running = False 21 | self._connection_started = False 22 | 23 | # Initialize nodes as offline 24 | for url in node_urls: 25 | self.nodes[url] = { 26 | 'url': url, 27 | 'websocket': None, 28 | 'data': None, 29 | 'status': 'offline', 30 | 'last_update': None 31 | } 32 | self.url_to_node[url] = url 33 | 34 | async def _connect_all_nodes(self): 35 | """Connect to all nodes in background with retries""" 36 | # Wait a bit for Docker network to be ready 37 | await asyncio.sleep(2) 38 | 39 | # Connect to all nodes concurrently 40 | tasks = [self._connect_node_with_retry(url) for url in self.node_urls] 41 | await asyncio.gather(*tasks, return_exceptions=True) 42 | 43 | async def _connect_node_with_retry(self, url): 44 | """Connect to a node with retry logic""" 45 | max_retries = 5 46 | retry_delay = 2 47 | 48 | for attempt in range(max_retries): 49 | try: 50 | await self._connect_node(url) 51 | return # Success 52 | except Exception as e: 53 | if attempt < max_retries - 1: 54 | logger.warning(f'Connection attempt {attempt + 1}/{max_retries} failed for {url}: {str(e)}, retrying in {retry_delay}s...') 55 | await 
asyncio.sleep(retry_delay) 56 | else: 57 | logger.error(f'Failed to connect to node {url} after {max_retries} attempts: {str(e)}') 58 | 59 | async def _connect_node(self, url): 60 | """Connect to a node using native WebSocket""" 61 | while self.running: 62 | try: 63 | # Convert HTTP URL to WebSocket URL 64 | ws_url = url.replace('http://', 'ws://').replace('https://', 'wss://') + '/socket.io/' 65 | 66 | logger.info(f'Connecting to node WebSocket: {ws_url}') 67 | 68 | async with websockets.connect(ws_url) as websocket: 69 | logger.info(f'Connected to node: {url}') 70 | 71 | # Mark node as online 72 | node_name = self.url_to_node.get(url, url) 73 | self.nodes[node_name] = { 74 | 'url': url, 75 | 'websocket': websocket, 76 | 'data': None, 77 | 'status': 'online', 78 | 'last_update': datetime.now().isoformat() 79 | } 80 | 81 | # Listen for data from the node 82 | async for message in websocket: 83 | try: 84 | data = json.loads(message) 85 | 86 | # Extract node name from data or use URL as fallback 87 | node_name = data.get('node_name', url) 88 | 89 | # Update URL to node mapping 90 | self.url_to_node[url] = node_name 91 | 92 | # Update node entry with received data 93 | self.nodes[node_name] = { 94 | 'url': url, 95 | 'websocket': websocket, 96 | 'data': data, 97 | 'status': 'online', 98 | 'last_update': datetime.now().isoformat() 99 | } 100 | 101 | except json.JSONDecodeError as e: 102 | logger.error(f'Failed to parse message from {url}: {e}') 103 | except Exception as e: 104 | logger.error(f'Error processing message from {url}: {e}') 105 | 106 | except websockets.exceptions.ConnectionClosed: 107 | logger.warning(f'WebSocket connection closed for node: {url}') 108 | # Mark node as offline 109 | node_name = self.url_to_node.get(url, url) 110 | if node_name in self.nodes: 111 | self.nodes[node_name]['status'] = 'offline' 112 | logger.info(f'Marked node {node_name} as offline') 113 | except Exception as e: 114 | logger.error(f'Failed to connect to node {url}: {e}') 115 | 
# Mark node as offline 116 | node_name = self.url_to_node.get(url, url) 117 | if node_name in self.nodes: 118 | self.nodes[node_name]['status'] = 'offline' 119 | logger.info(f'Marked node {node_name} as offline') 120 | 121 | # Wait before retrying connection 122 | if self.running: 123 | await asyncio.sleep(5) 124 | 125 | async def get_cluster_data(self): 126 | """Get aggregated data from all nodes""" 127 | nodes = {} 128 | total_gpus = 0 129 | online_nodes = 0 130 | 131 | for node_name, node_info in self.nodes.items(): 132 | if node_info['status'] == 'online' and node_info['data']: 133 | nodes[node_name] = { 134 | 'status': 'online', 135 | 'gpus': node_info['data'].get('gpus', {}), 136 | 'processes': node_info['data'].get('processes', []), 137 | 'system': node_info['data'].get('system', {}), 138 | 'last_update': node_info['last_update'] 139 | } 140 | total_gpus += len(node_info['data'].get('gpus', {})) 141 | online_nodes += 1 142 | else: 143 | nodes[node_name] = { 144 | 'status': 'offline', 145 | 'gpus': {}, 146 | 'processes': [], 147 | 'system': {}, 148 | 'last_update': node_info.get('last_update') 149 | } 150 | 151 | return { 152 | 'mode': 'hub', 153 | 'nodes': nodes, 154 | 'cluster_stats': { 155 | 'total_nodes': len(self.nodes), 156 | 'online_nodes': online_nodes, 157 | 'total_gpus': total_gpus 158 | } 159 | } 160 | 161 | async def shutdown(self): 162 | """Disconnect from all nodes""" 163 | self.running = False 164 | for node_info in self.nodes.values(): 165 | if node_info.get('websocket'): 166 | try: 167 | await node_info['websocket'].close() 168 | except: 169 | pass 170 | 171 | -------------------------------------------------------------------------------- /core/nvidia_smi_fallback.py: -------------------------------------------------------------------------------- 1 | """ 2 | Simple nvidia-smi fallback parser 3 | Based on the original working implementation 4 | """ 5 | 6 | import subprocess 7 | import logging 8 | from datetime import datetime 9 | 10 | logger 
= logging.getLogger(__name__) 11 | 12 | 13 | def parse_nvidia_smi(): 14 | """Parse nvidia-smi output and extract comprehensive GPU information""" 15 | try: 16 | result = subprocess.run([ 17 | 'nvidia-smi', 18 | '--query-gpu=index,name,uuid,driver_version,vbios_version,' 19 | 'temperature.gpu,utilization.gpu,utilization.memory,' 20 | 'memory.used,memory.total,memory.free,power.draw,power.limit,' 21 | 'fan.speed,clocks.gr,clocks.sm,clocks.mem,' 22 | 'clocks.max.gr,clocks.max.sm,clocks.max.mem,' 23 | 'pcie.link.gen.current,pcie.link.gen.max,pcie.link.width.current,pcie.link.width.max,' 24 | 'encoder.stats.sessionCount,encoder.stats.averageFps,encoder.stats.averageLatency,' 25 | 'pstate,compute_mode', 26 | '--format=csv,noheader,nounits' 27 | ], capture_output=True, text=True, timeout=10) 28 | 29 | if result.returncode != 0: 30 | logger.warning(f"nvidia-smi comprehensive query failed (code {result.returncode}), trying basic query") 31 | return parse_nvidia_smi_fallback() 32 | 33 | lines = result.stdout.strip().split('\n') 34 | gpu_data = {} 35 | 36 | for line in lines: 37 | if line.strip(): 38 | parts = [p.strip() for p in line.split(',')] 39 | if len(parts) >= 27: 40 | gpu_id = parts[0] 41 | gpu_data[gpu_id] = { 42 | 'index': parts[0], 43 | 'name': parts[1], 44 | 'uuid': parts[2] if parts[2] not in ['N/A', '[N/A]', ''] else 'N/A', 45 | 'driver_version': parts[3] if parts[3] not in ['N/A', '[N/A]', ''] else 'N/A', 46 | 'vbios_version': parts[4] if parts[4] not in ['N/A', '[N/A]', ''] else 'N/A', 47 | 'temperature': float(parts[5]) if parts[5] not in ['N/A', '[N/A]', ''] else 0, 48 | 'temperature_memory': 0, 49 | 'utilization': float(parts[6]) if parts[6] not in ['N/A', '[N/A]', ''] else 0, 50 | 'memory_utilization': float(parts[7]) if parts[7] not in ['N/A', '[N/A]', ''] else 0, 51 | 'memory_used': float(parts[8]) if parts[8] not in ['N/A', '[N/A]', ''] else 0, 52 | 'memory_total': float(parts[9]) if parts[9] not in ['N/A', '[N/A]', ''] else 0, 53 | 'memory_free': 
float(parts[10]) if parts[10] not in ['N/A', '[N/A]', ''] else 0, 54 | 'power_draw': float(parts[11]) if parts[11] not in ['N/A', '[N/A]', ''] else 0, 55 | 'power_limit': float(parts[12]) if parts[12] not in ['N/A', '[N/A]', ''] else 0, 56 | 'power_default_limit': 0, 57 | 'fan_speed': float(parts[13]) if parts[13] not in ['N/A', '[N/A]', ''] else 0, 58 | 'clock_graphics': float(parts[14]) if parts[14] not in ['N/A', '[N/A]', ''] else 0, 59 | 'clock_sm': float(parts[15]) if parts[15] not in ['N/A', '[N/A]', ''] else 0, 60 | 'clock_memory': float(parts[16]) if parts[16] not in ['N/A', '[N/A]', ''] else 0, 61 | 'clock_video': 0, 62 | 'clock_max_graphics': float(parts[17]) if parts[17] not in ['N/A', '[N/A]', ''] else 0, 63 | 'clock_max_sm': float(parts[18]) if parts[18] not in ['N/A', '[N/A]', ''] else 0, 64 | 'clock_max_memory': float(parts[19]) if parts[19] not in ['N/A', '[N/A]', ''] else 0, 65 | 'pcie_gen': parts[20] if parts[20] not in ['N/A', '[N/A]', ''] else 'N/A', 66 | 'pcie_gen_max': parts[21] if parts[21] not in ['N/A', '[N/A]', ''] else 'N/A', 67 | 'pcie_width': parts[22] if parts[22] not in ['N/A', '[N/A]', ''] else 'N/A', 68 | 'pcie_width_max': parts[23] if parts[23] not in ['N/A', '[N/A]', ''] else 'N/A', 69 | 'encoder_sessions': int(parts[24]) if parts[24] not in ['N/A', '[N/A]', ''] else 0, 70 | 'encoder_fps': float(parts[25]) if parts[25] not in ['N/A', '[N/A]', ''] else 0, 71 | 'encoder_latency': float(parts[26]) if parts[26] not in ['N/A', '[N/A]', ''] else 0, 72 | 'decoder_sessions': 0, 73 | 'decoder_fps': 0, 74 | 'decoder_latency': 0, 75 | 'performance_state': parts[27] if len(parts) > 27 and parts[27] not in ['N/A', '[N/A]', ''] else 'N/A', 76 | 'compute_mode': parts[28] if len(parts) > 28 and parts[28] not in ['N/A', '[N/A]', ''] else 'N/A', 77 | 'throttle_reasons': 'None', 78 | 'timestamp': datetime.now().isoformat(), 79 | '_fallback_mode': True 80 | } 81 | 82 | if gpu_data: 83 | logger.debug(f"nvidia-smi returned data for {len(gpu_data)} 
GPU(s)") 84 | return gpu_data 85 | 86 | except subprocess.TimeoutExpired: 87 | logger.error("nvidia-smi command timed out (>10s)") 88 | return {} 89 | except Exception as e: 90 | logger.error(f"nvidia-smi comprehensive query error: {e}, trying basic query") 91 | return parse_nvidia_smi_fallback() 92 | 93 | 94 | def parse_nvidia_smi_fallback(): 95 | """Fallback parser with minimal, widely-supported fields""" 96 | try: 97 | logger.info("Using basic nvidia-smi query (minimal fields)") 98 | result = subprocess.run([ 99 | 'nvidia-smi', 100 | '--query-gpu=index,name,temperature.gpu,utilization.gpu,utilization.memory,' 101 | 'memory.used,memory.total,power.draw,power.limit,fan.speed,' 102 | 'clocks.gr,clocks.sm,clocks.mem,pstate', 103 | '--format=csv,noheader,nounits' 104 | ], capture_output=True, text=True, timeout=10) 105 | 106 | if result.returncode != 0: 107 | logger.error(f"Basic nvidia-smi query also failed (code {result.returncode})") 108 | return {} 109 | 110 | lines = result.stdout.strip().split('\n') 111 | gpu_data = {} 112 | 113 | for line in lines: 114 | if line.strip(): 115 | parts = [p.strip() for p in line.split(',')] 116 | if len(parts) >= 14: 117 | gpu_id = parts[0] 118 | gpu_data[gpu_id] = { 119 | 'index': parts[0], 120 | 'name': parts[1], 121 | 'uuid': 'N/A', 122 | 'driver_version': 'N/A', 123 | 'vbios_version': 'N/A', 124 | 'temperature': float(parts[2]) if parts[2] not in ['N/A', '[N/A]', ''] else 0, 125 | 'temperature_memory': 0, 126 | 'utilization': float(parts[3]) if parts[3] not in ['N/A', '[N/A]', ''] else 0, 127 | 'memory_utilization': float(parts[4]) if parts[4] not in ['N/A', '[N/A]', ''] else 0, 128 | 'memory_used': float(parts[5]) if parts[5] not in ['N/A', '[N/A]', ''] else 0, 129 | 'memory_total': float(parts[6]) if parts[6] not in ['N/A', '[N/A]', ''] else 0, 130 | 'memory_free': float(parts[6]) - float(parts[5]) if parts[6] not in ['N/A', '[N/A]', ''] and parts[5] not in ['N/A', '[N/A]', ''] else 0, 131 | 'power_draw': float(parts[7]) if 
parts[7] not in ['N/A', '[N/A]', ''] else 0, 132 | 'power_limit': float(parts[8]) if parts[8] not in ['N/A', '[N/A]', ''] else 0, 133 | 'power_default_limit': 0, 134 | 'fan_speed': float(parts[9]) if parts[9] not in ['N/A', '[N/A]', ''] else 0, 135 | 'clock_graphics': float(parts[10]) if parts[10] not in ['N/A', '[N/A]', ''] else 0, 136 | 'clock_sm': float(parts[11]) if parts[11] not in ['N/A', '[N/A]', ''] else 0, 137 | 'clock_memory': float(parts[12]) if parts[12] not in ['N/A', '[N/A]', ''] else 0, 138 | 'clock_video': 0, 139 | 'clock_max_graphics': 0, 140 | 'clock_max_sm': 0, 141 | 'clock_max_memory': 0, 142 | 'pcie_gen': 'N/A', 143 | 'pcie_gen_max': 'N/A', 144 | 'pcie_width': 'N/A', 145 | 'pcie_width_max': 'N/A', 146 | 'encoder_sessions': 0, 147 | 'encoder_fps': 0, 148 | 'encoder_latency': 0, 149 | 'decoder_sessions': 0, 150 | 'decoder_fps': 0, 151 | 'decoder_latency': 0, 152 | 'performance_state': parts[13] if parts[13] not in ['N/A', '[N/A]', ''] else 'N/A', 153 | 'compute_mode': 'N/A', 154 | 'throttle_reasons': 'None', 155 | 'timestamp': datetime.now().isoformat(), 156 | '_fallback_mode': True 157 | } 158 | 159 | if gpu_data: 160 | logger.info(f"Basic nvidia-smi query successful - Found {len(gpu_data)} GPU(s)") 161 | return gpu_data 162 | 163 | except Exception as e: 164 | logger.error(f"Basic nvidia-smi query failed: {e}") 165 | return {} 166 | 167 | -------------------------------------------------------------------------------- /core/monitor.py: -------------------------------------------------------------------------------- 1 | """Async GPU monitoring using NVML""" 2 | 3 | import asyncio 4 | import pynvml 5 | import psutil 6 | import logging 7 | from .metrics import MetricsCollector 8 | from .nvidia_smi_fallback import parse_nvidia_smi 9 | from .config import NVIDIA_SMI 10 | 11 | logger = logging.getLogger(__name__) 12 | 13 | 14 | class GPUMonitor: 15 | """Monitor NVIDIA GPUs using NVML""" 16 | 17 | def __init__(self): 18 | self.running = False 19 | 
self.gpu_data = {} 20 | self.collector = MetricsCollector() 21 | self.use_smi = {} # Track which GPUs use nvidia-smi (decided at boot) 22 | 23 | try: 24 | pynvml.nvmlInit() 25 | self.initialized = True 26 | version = pynvml.nvmlSystemGetDriverVersion() 27 | if isinstance(version, bytes): 28 | version = version.decode('utf-8') 29 | logger.info(f"NVML initialized - Driver: {version}") 30 | 31 | # Detect which GPUs need nvidia-smi (once at boot) 32 | self._detect_smi_gpus() 33 | 34 | except Exception as e: 35 | logger.error(f"Failed to initialize NVML: {e}") 36 | self.initialized = False 37 | 38 | def _detect_smi_gpus(self): 39 | """Detect which GPUs need nvidia-smi fallback (called once at boot)""" 40 | try: 41 | device_count = pynvml.nvmlDeviceGetCount() 42 | logger.info(f"Detected {device_count} GPU(s)") 43 | 44 | if NVIDIA_SMI: 45 | logger.warning("NVIDIA_SMI=True - Forcing nvidia-smi for all GPUs") 46 | for i in range(device_count): 47 | self.use_smi[str(i)] = True 48 | return 49 | 50 | # Auto-detect per GPU 51 | for i in range(device_count): 52 | gpu_id = str(i) 53 | try: 54 | handle = pynvml.nvmlDeviceGetHandleByIndex(i) 55 | data = self.collector.collect_all(handle, gpu_id) 56 | gpu_name = data.get('name', 'Unknown') 57 | 58 | if 'utilization' not in data or data.get('utilization') is None: 59 | self.use_smi[gpu_id] = True 60 | logger.warning(f"GPU {i} ({gpu_name}): Utilization metric not available via NVML") 61 | logger.warning(f"GPU {i} ({gpu_name}): Switching to nvidia-smi mode") 62 | else: 63 | self.use_smi[gpu_id] = False 64 | logger.info(f"GPU {i} ({gpu_name}): Using NVML (utilization: {data.get('utilization')}%)") 65 | 66 | except Exception as e: 67 | self.use_smi[gpu_id] = True 68 | logger.error(f"GPU {i}: NVML detection failed - {e}") 69 | logger.warning(f"GPU {i}: Falling back to nvidia-smi") 70 | 71 | # Summary 72 | nvml_count = sum(1 for use_smi in self.use_smi.values() if not use_smi) 73 | smi_count = sum(1 for use_smi in self.use_smi.values() if 
use_smi) 74 | if smi_count > 0: 75 | logger.info(f"Boot detection complete: {nvml_count} GPU(s) using NVML, {smi_count} GPU(s) using nvidia-smi") 76 | else: 77 | logger.info(f"Boot detection complete: All {nvml_count} GPU(s) using NVML") 78 | 79 | except Exception as e: 80 | logger.error(f"Failed to detect GPUs: {e}") 81 | 82 | async def get_gpu_data(self): 83 | """Async collect metrics from all detected GPUs""" 84 | if not self.initialized: 85 | logger.error("Cannot get GPU data - NVML not initialized") 86 | return {} 87 | 88 | try: 89 | device_count = pynvml.nvmlDeviceGetCount() 90 | gpu_data = {} 91 | 92 | # Get nvidia-smi data once if any GPU needs it 93 | smi_data = None 94 | if any(self.use_smi.values()): 95 | try: 96 | # Run nvidia-smi in thread pool to avoid blocking 97 | smi_data = await asyncio.get_event_loop().run_in_executor( 98 | None, parse_nvidia_smi 99 | ) 100 | except Exception as e: 101 | logger.error(f"nvidia-smi failed: {e}") 102 | 103 | # Collect GPU data concurrently 104 | tasks = [] 105 | for i in range(device_count): 106 | gpu_id = str(i) 107 | if self.use_smi.get(gpu_id, False): 108 | # Use nvidia-smi data 109 | if smi_data and gpu_id in smi_data: 110 | gpu_data[gpu_id] = smi_data[gpu_id] 111 | else: 112 | logger.warning(f"GPU {i}: No data from nvidia-smi") 113 | else: 114 | # Use NVML - run in thread pool to avoid blocking 115 | task = asyncio.get_event_loop().run_in_executor( 116 | None, self._collect_single_gpu, i 117 | ) 118 | tasks.append((gpu_id, task)) 119 | 120 | # Wait for all NVML tasks to complete 121 | if tasks: 122 | results = await asyncio.gather(*[task for _, task in tasks], return_exceptions=True) 123 | for (gpu_id, _), result in zip(tasks, results): 124 | if isinstance(result, Exception): 125 | logger.error(f"GPU {gpu_id}: Error - {result}") 126 | else: 127 | gpu_data[gpu_id] = result 128 | 129 | if not gpu_data: 130 | logger.error("No GPU data collected from any source") 131 | 132 | self.gpu_data = gpu_data 133 | return 
gpu_data 134 | 135 | except Exception as e: 136 | logger.error(f"Failed to get GPU data: {e}") 137 | return {} 138 | 139 | def _collect_single_gpu(self, gpu_index): 140 | """Collect data for a single GPU (runs in thread pool)""" 141 | try: 142 | handle = pynvml.nvmlDeviceGetHandleByIndex(gpu_index) 143 | return self.collector.collect_all(handle, str(gpu_index)) 144 | except Exception as e: 145 | logger.error(f"GPU {gpu_index}: Error - {e}") 146 | return {} 147 | 148 | async def get_processes(self): 149 | """Async get GPU process information""" 150 | if not self.initialized: 151 | return [] 152 | 153 | try: 154 | # Run process collection in thread pool 155 | return await asyncio.get_event_loop().run_in_executor( 156 | None, self._get_processes_sync 157 | ) 158 | except Exception as e: 159 | logger.error(f"Error getting processes: {e}") 160 | return [] 161 | 162 | def _get_processes_sync(self): 163 | """Synchronous process collection (runs in thread pool)""" 164 | try: 165 | device_count = pynvml.nvmlDeviceGetCount() 166 | all_processes = [] 167 | gpu_process_counts = {} 168 | 169 | for i in range(device_count): 170 | try: 171 | handle = pynvml.nvmlDeviceGetHandleByIndex(i) 172 | uuid = pynvml.nvmlDeviceGetUUID(handle) 173 | if isinstance(uuid, bytes): 174 | uuid = uuid.decode('utf-8') 175 | 176 | gpu_id = str(i) 177 | gpu_process_counts[gpu_id] = {'compute': 0, 'graphics': 0} 178 | 179 | try: 180 | procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) 181 | gpu_process_counts[gpu_id]['compute'] = len(procs) 182 | 183 | for proc in procs: 184 | all_processes.append({ 185 | 'pid': str(proc.pid), 186 | 'name': self._get_process_name(proc.pid), 187 | 'gpu_uuid': uuid, 188 | 'gpu_id': gpu_id, 189 | 'memory': float(proc.usedGpuMemory / (1024 ** 2)) 190 | }) 191 | except pynvml.NVMLError: 192 | pass 193 | 194 | except pynvml.NVMLError: 195 | continue 196 | 197 | for gpu_id, counts in gpu_process_counts.items(): 198 | if gpu_id in self.gpu_data: 199 | 
self.gpu_data[gpu_id]['compute_processes_count'] = counts['compute'] 200 | self.gpu_data[gpu_id]['graphics_processes_count'] = counts['graphics'] 201 | 202 | return all_processes 203 | 204 | except Exception as e: 205 | logger.error(f"Error getting processes: {e}") 206 | return [] 207 | 208 | def _get_process_name(self, pid): 209 | """Extract readable process name from PID with improved logic""" 210 | try: 211 | p = psutil.Process(pid) 212 | 213 | # First try to get the process name 214 | try: 215 | process_name = p.name() 216 | if process_name and process_name not in ['python', 'python3', 'sh', 'bash']: 217 | return process_name 218 | except (psutil.AccessDenied, psutil.NoSuchProcess, psutil.ZombieProcess): 219 | pass 220 | 221 | # Try to get command line for better name extraction 222 | try: 223 | cmdline = p.cmdline() 224 | if cmdline: 225 | # Look for the actual executable or script name 226 | for i, arg in enumerate(cmdline): 227 | if not arg or arg.startswith('-'): 228 | continue 229 | 230 | # Skip common interpreters and shells 231 | if arg in ['python', 'python3', 'node', 'java', 'sh', 'bash', 'zsh']: 232 | continue 233 | 234 | # Extract filename from path 235 | filename = arg.split('/')[-1].split('\\')[-1] 236 | 237 | # Skip if it's still a generic name 238 | if filename in ['python', 'python3', 'node', 'java', 'sh', 'bash']: 239 | continue 240 | 241 | # Found a meaningful name 242 | if filename: 243 | return filename 244 | 245 | # Fallback to first argument if nothing else worked 246 | if cmdline[0]: 247 | return cmdline[0].split('/')[-1].split('\\')[-1] 248 | 249 | except (psutil.AccessDenied, psutil.NoSuchProcess, psutil.ZombieProcess): 250 | pass 251 | 252 | # Final fallback 253 | return f'PID:{pid}' 254 | 255 | except (psutil.NoSuchProcess, psutil.ZombieProcess): 256 | return f'PID:{pid}' 257 | except Exception as e: 258 | logger.debug(f"Error getting process name for PID {pid}: {e}") 259 | return f'PID:{pid}' 260 | 261 | async def shutdown(self): 262 
| """Async shutdown""" 263 | if self.initialized: 264 | try: 265 | pynvml.nvmlShutdown() 266 | self.initialized = False 267 | logger.info("NVML shutdown") 268 | except Exception as e: 269 | logger.error(f"Error shutting down NVML: {e}") 270 | 271 | -------------------------------------------------------------------------------- /docs/404.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 404 - Page Not Found | GPU Hot 7 | 8 | 9 | 10 | 348 | 349 | 350 |
351 | 352 | 357 | 358 | 359 |
360 |
404
361 |

Page Not Found

362 |

This page doesn't exist. Even the GPU couldn't compute this one.

363 | Back to Home 364 | 368 |
369 |
370 | 371 | 372 | 379 | 380 | 381 | 382 | -------------------------------------------------------------------------------- /tests/test_cluster.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | Mock GPU cluster for load testing hub mode 4 | Simulates realistic GPU workloads across multiple servers 5 | """ 6 | 7 | import time 8 | import random 9 | import asyncio 10 | import json 11 | from datetime import datetime 12 | import argparse 13 | import logging 14 | from fastapi import FastAPI, WebSocket 15 | import uvicorn 16 | 17 | logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(message)s') 18 | logger = logging.getLogger(__name__) 19 | 20 | 21 | class MockGPUNode: 22 | """Simulates a GPU node with realistic metrics for load testing""" 23 | 24 | def __init__(self, node_name, gpu_count, port=1312): 25 | self.node_name = node_name 26 | self.gpu_count = gpu_count 27 | self.port = port 28 | self.app = FastAPI(title=f"Mock GPU Node {node_name}") 29 | self.websocket_connections = set() 30 | self.broadcasting = False 31 | 32 | # Initialize per-GPU state for realistic patterns 33 | self.gpu_states = [] 34 | for gpu_id in range(gpu_count): 35 | self.gpu_states.append({ 36 | 'base_temp': random.randint(45, 55), 37 | 'is_busy': random.random() < 0.4, # 40% of GPUs are busy 38 | 'job_start': time.time() - random.uniform(0, 300), # Random job start time 39 | 'memory': random.choice([12288, 24576]), # Mix of 3080 (12GB) and 3090 (24GB) 40 | 'allocated_memory': 0, 41 | 'clock_base': random.randint(1710, 1890), # Stable boost clock 42 | }) 43 | 44 | self.start_time = time.time() 45 | 46 | def _generate_realistic_utilization(self, state, timestamp): 47 | """Generate realistic ML training utilization patterns""" 48 | if not state['is_busy']: 49 | # Idle GPU - occasionally switch to busy 50 | if random.random() < 0.001: # 0.1% chance per update to start job 51 | state['is_busy'] = True 52 | 
state['job_start'] = timestamp 53 | state['allocated_memory'] = state['memory'] * random.uniform(0.85, 0.95) 54 | return random.uniform(0, 3) 55 | 56 | # Busy GPU - simulate training epoch pattern 57 | job_duration = timestamp - state['job_start'] 58 | epoch_time = 120 # 2 minute epochs 59 | epoch_progress = (job_duration % epoch_time) / epoch_time 60 | 61 | # Occasionally finish job 62 | if random.random() < 0.0005: # Job finishes 63 | state['is_busy'] = False 64 | state['allocated_memory'] = 0 65 | return 0 66 | 67 | # Training pattern with data loading dips 68 | if epoch_progress < 0.05: # Warmup phase 69 | return random.gauss(25, 5) 70 | elif epoch_progress > 0.93: # Validation phase 71 | return random.gauss(65, 5) 72 | else: # Main training 73 | base_util = random.gauss(96, 2) 74 | # Data loading dips every ~5 seconds 75 | if (timestamp % 5) < 0.4: 76 | base_util *= 0.75 77 | return max(0, min(100, base_util)) 78 | 79 | def generate_gpu_data(self): 80 | """Generate realistic GPU metrics for load testing""" 81 | timestamp = time.time() 82 | gpus = {} 83 | processes = [] 84 | 85 | for gpu_id in range(self.gpu_count): 86 | state = self.gpu_states[gpu_id] 87 | 88 | # Realistic utilization pattern 89 | util = self._generate_realistic_utilization(state, timestamp) 90 | 91 | # Memory: allocated at job start, stays constant during training 92 | if state['is_busy']: 93 | mem_used = state['allocated_memory'] 94 | else: 95 | mem_used = random.uniform(0, 100) # Minimal idle usage 96 | 97 | # Temperature: correlates with utilization, slow changes 98 | target_temp = state['base_temp'] + (util / 100) * 35 99 | temp_variation = random.gauss(0, 1) 100 | temp = max(30, min(92, target_temp + temp_variation)) 101 | 102 | # Power: correlates with utilization 103 | mem_base = state['memory'] 104 | max_power = 250 if mem_base == 12288 else 350 105 | power = (util / 100) * max_power * random.uniform(0.85, 1.0) 106 | 107 | # Clock speeds: stable based on load 108 | if util > 50: 109 | 
clock_graphics = state['clock_base'] + random.randint(-20, 20) 110 | pstate = 'P0' 111 | elif util > 10: 112 | clock_graphics = int(state['clock_base'] * 0.8) + random.randint(-15, 15) 113 | pstate = 'P2' 114 | else: 115 | clock_graphics = random.randint(210, 500) 116 | pstate = 'P8' 117 | 118 | gpus[str(gpu_id)] = { 119 | 'index': gpu_id, 120 | 'name': f'NVIDIA RTX {"3090" if mem_base == 24576 else "3080"}', 121 | 'utilization': round(util, 1), 122 | 'temperature': round(temp, 1), 123 | 'memory_used': round(mem_used, 0), 124 | 'memory_total': mem_base, 125 | 'power_draw': round(power, 1), 126 | 'power_limit': max_power, 127 | 'fan_speed': round(min(100, 30 + max(0, temp - 40) * 1.5)), 128 | 'clock_graphics': clock_graphics, 129 | 'clock_sm': clock_graphics, 130 | 'clock_memory': 9501 if mem_base == 24576 else 9001, 131 | 'pcie_gen': 4, 132 | 'pcie_width': 16, 133 | 'pstate': pstate, 134 | 'encoder_sessions': 0, 135 | 'decoder_sessions': 0, 136 | 'throttle_reasons': [] 137 | } 138 | 139 | # Add processes for busy GPUs 140 | if state['is_busy']: 141 | process_count = random.randint(1, 2) 142 | for p in range(process_count): 143 | processes.append({ 144 | 'pid': random.randint(1000, 99999), 145 | 'name': random.choice(['python3', 'train.py', 'pytorch', 'python']), 146 | 'gpu_memory': round(mem_used / process_count, 0), 147 | 'gpu_id': gpu_id 148 | }) 149 | 150 | # System metrics: correlate with GPU load 151 | avg_gpu_util = sum(g['utilization'] for g in gpus.values()) / len(gpus) 152 | system = { 153 | 'cpu_percent': round(random.gauss(15 + avg_gpu_util * 0.3, 5), 1), 154 | 'memory_percent': round(random.gauss(60, 10), 1), 155 | 'memory_used': round(random.gauss(80, 15), 1), 156 | 'memory_total': 128.0 157 | } 158 | 159 | return { 160 | 'node_name': self.node_name, 161 | 'gpus': gpus, 162 | 'processes': processes, 163 | 'system': system 164 | } 165 | 166 | async def _broadcast_loop(self): 167 | """Background task to broadcast GPU data every 0.5s""" 168 | while 
self.broadcasting: 169 | try: 170 | data = self.generate_gpu_data() 171 | 172 | # Send to all connected clients 173 | if self.websocket_connections: 174 | disconnected = set() 175 | for websocket in self.websocket_connections: 176 | try: 177 | await websocket.send_text(json.dumps(data)) 178 | except: 179 | disconnected.add(websocket) 180 | 181 | # Remove disconnected clients 182 | self.websocket_connections -= disconnected 183 | 184 | except Exception as e: 185 | logger.error(f'[{self.node_name}] Error in broadcast loop: {e}') 186 | await asyncio.sleep(0.5) 187 | 188 | def setup_routes(self): 189 | """Setup WebSocket routes""" 190 | 191 | @self.app.websocket("/socket.io/") 192 | async def websocket_endpoint(websocket: WebSocket): 193 | await websocket.accept() 194 | self.websocket_connections.add(websocket) 195 | logger.info(f'[{self.node_name}] Client connected') 196 | 197 | # Start broadcasting when first client connects 198 | if not self.broadcasting: 199 | self.broadcasting = True 200 | asyncio.create_task(self._broadcast_loop()) 201 | 202 | try: 203 | # Keep connection alive 204 | while True: 205 | await websocket.receive_text() 206 | except Exception as e: 207 | logger.debug(f'[{self.node_name}] Client disconnected: {e}') 208 | finally: 209 | self.websocket_connections.discard(websocket) 210 | 211 | async def run(self): 212 | """Run the mock node server""" 213 | self.setup_routes() 214 | 215 | logger.info(f'[{self.node_name}] Starting mock node with {self.gpu_count} GPUs on port {self.port}') 216 | 217 | # Create server config 218 | config = uvicorn.Config( 219 | self.app, 220 | host='0.0.0.0', 221 | port=self.port, 222 | log_level='info', 223 | access_log=False 224 | ) 225 | server = uvicorn.Server(config) 226 | await server.serve() 227 | 228 | 229 | async def start_mock_node(node_name, gpu_count, port): 230 | """Start a mock node as async task""" 231 | node = MockGPUNode(node_name, gpu_count, port) 232 | await node.run() 233 | 234 | 235 | async def main(): 
236 | parser = argparse.ArgumentParser(description='Mock GPU cluster for testing') 237 | parser.add_argument('--nodes', type=str, default='2,4,8', 238 | help='Comma-separated GPU counts for each node (e.g., "2,4,8")') 239 | parser.add_argument('--base-port', type=int, default=13120, 240 | help='Base port for nodes (increments for each node)') 241 | parser.add_argument('--prefix', type=str, default='gpu-server', 242 | help='Prefix for node names') 243 | 244 | args = parser.parse_args() 245 | 246 | gpu_counts = [int(x.strip()) for x in args.nodes.split(',')] 247 | 248 | print("\n" + "="*60) 249 | print("GPU Hot - Mock Cluster Test (FastAPI + AsyncIO)") 250 | print("="*60) 251 | print(f"\nStarting {len(gpu_counts)} mock GPU servers:\n") 252 | 253 | node_urls = [] 254 | for i, gpu_count in enumerate(gpu_counts): 255 | port = args.base_port + i 256 | node_name = f"{args.prefix}-{i+1}" 257 | node_urls.append(f"http://localhost:{port}") 258 | print(f" • {node_name}: {gpu_count} GPUs on port {port}") 259 | 260 | print("\n" + "-"*60) 261 | print("Mock nodes running! 
// Base chart options shared across all charts.
// Returns a fresh object on every call so each chart can safely
// mutate its own copy (axis limits, tooltip callbacks, etc).
function getBaseChartOptions() {
    const xTicks = {
        color: 'rgba(255, 255, 255, 0.6)',
        font: { size: 11, weight: '500' },
        maxRotation: 0,
        autoSkip: true,
        maxTicksLimit: 7,
        padding: 8,
        align: 'center'
    };

    const yTicks = {
        color: 'rgba(255, 255, 255, 0.7)',
        font: { size: 12, weight: '500' },
        padding: 12,
        count: 6
    };

    return {
        responsive: true,
        maintainAspectRatio: false,
        animation: false, // Disable all animations globally
        interaction: { intersect: false, mode: 'index' },
        layout: { padding: { left: 0, right: 0, top: 5, bottom: 10 } },
        scales: {
            x: {
                display: true,
                offset: true,
                grid: { display: false, drawBorder: false, offset: true },
                ticks: xTicks
            },
            y: {
                min: 0,
                grid: {
                    color: 'rgba(255, 255, 255, 0.08)',
                    borderDash: [2, 3],
                    drawBorder: false,
                    lineWidth: 1
                },
                ticks: yTicks
            }
        },
        plugins: {
            legend: { display: false },
            tooltip: {
                backgroundColor: 'rgba(0, 0, 0, 0.9)',
                titleColor: '#ffffff',
                bodyColor: '#ffffff',
                borderWidth: 2,
                cornerRadius: 12,
                displayColors: true,
                padding: 12,
                titleFont: { size: 14, weight: 'bold' },
                bodyFont: { size: 13 }
            }
        }
    };
}
// Create multi-line chart (for clocks, pcie, etc)
// Maps each dataset descriptor onto the shared line-chart styling,
// then layers on optional Y-axis formatting, legend, and tooltips.
function createMultiLineChartConfig(options) {
    const datasets = options.datasets;
    const yUnit = options.yUnit;
    const tooltipTitle = options.tooltipTitle;
    const showLegend = options.showLegend !== undefined ? options.showLegend : false;
    const ySuggestedMax = options.ySuggestedMax;
    const decimals = options.decimals !== undefined ? options.decimals : 0;

    // Per-dataset defaults: color-derived fill, 2.5px line, no fill.
    const toDataset = (ds) => ({
        label: ds.label,
        data: [],
        borderColor: ds.color,
        backgroundColor: ds.bgColor || `${ds.color}15`,
        borderWidth: ds.width || 2.5,
        tension: 0.35,
        fill: ds.fill !== undefined ? ds.fill : false,
        pointRadius: 0,
        pointHitRadius: 12,
        pointBackgroundColor: ds.color,
        pointBorderColor: '#fff',
        pointBorderWidth: 2,
        borderCapStyle: 'round',
        borderJoinStyle: 'round'
    });

    const config = {
        type: 'line',
        data: { labels: [], datasets: datasets.map(toDataset) },
        options: getBaseChartOptions()
    };

    // Y axis customization
    const yScale = config.options.scales.y;
    if (ySuggestedMax) yScale.suggestedMax = ySuggestedMax;
    if (yUnit) {
        yScale.ticks.callback = function(value) {
            return value.toFixed(decimals) + yUnit;
        };
    }

    // Legend
    if (showLegend) {
        const legend = config.options.plugins.legend;
        legend.display = true;
        legend.position = 'top';
        legend.align = 'end';
        legend.labels = {
            color: 'rgba(255, 255, 255, 0.8)',
            font: { size: 11 },
            boxWidth: 10,
            boxHeight: 10,
            padding: 10,
            usePointStyle: true
        };
    }

    // Tooltip styled after the first dataset's color.
    const tooltip = config.options.plugins.tooltip;
    tooltip.borderColor = datasets[0].color;
    tooltip.callbacks = {
        title: function(context) {
            return tooltipTitle;
        },
        label: function(context) {
            const label = context.dataset.label || '';
            const value = context.parsed.y;
            return `${label}: ${value.toFixed(decimals)}${yUnit || ''}`;
        }
    };

    return config;
}
254 | if (value > 50) return '✓ Active'; 255 | return '💤 Low'; 256 | } 257 | }), 258 | 259 | temperature: createLineChartConfig({ 260 | label: 'GPU Temperature', 261 | borderColor: '#f5576c', 262 | backgroundColor: 'rgba(245, 87, 108, 0.15)', 263 | ySuggestedMax: 90, 264 | yStepSize: 15, 265 | yUnit: '°C', 266 | tooltipTitle: 'GPU Temperature', 267 | thresholds: [ 268 | { label: 'Warning (75°C)', color: 'rgba(254, 202, 87, 0.6)', dash: [5, 5] }, 269 | { label: 'Danger (85°C)', color: 'rgba(250, 112, 154, 0.8)', dash: [10, 5] } 270 | ], 271 | tooltipAfterLabel: (value) => { 272 | if (value > 85) return '🚨 DANGER'; 273 | if (value > 75) return '⚠️ Warning'; 274 | if (value > 60) return '🌡️ Normal'; 275 | return '❄️ Cool'; 276 | } 277 | }), 278 | 279 | memory: createLineChartConfig({ 280 | label: 'Memory Usage', 281 | borderColor: '#4facfe', 282 | backgroundColor: 'rgba(79, 172, 254, 0.15)', 283 | yMax: 100, 284 | yStepSize: 20, 285 | yUnit: '%', 286 | tooltipTitle: 'VRAM Usage', 287 | thresholds: [ 288 | { label: 'High Usage (90%)', color: 'rgba(250, 112, 154, 0.6)', dash: [5, 5] } 289 | ], 290 | tooltipAfterLabel: (value) => { 291 | if (value > 95) return '🚨 Critical'; 292 | if (value > 90) return '⚠️ Very High'; 293 | if (value > 75) return '📊 High'; 294 | return '✓ Normal'; 295 | } 296 | }), 297 | 298 | power: createLineChartConfig({ 299 | label: 'Power Draw', 300 | borderColor: '#43e97b', 301 | backgroundColor: 'rgba(67, 233, 123, 0.15)', 302 | ySuggestedMax: 200, 303 | yStepSize: 50, 304 | yUnit: ' W', 305 | tooltipTitle: 'Power Draw', 306 | tooltipLabel: 'Power', // Shortened label for tooltip 307 | tooltipAfterLabel: (value) => { 308 | if (value > 200) return '⚡ Maximum Performance'; 309 | if (value > 150) return '🔥 High Performance'; 310 | if (value > 100) return '💪 Active'; 311 | if (value > 50) return '✓ Moderate'; 312 | return '💤 Idle'; 313 | } 314 | }), 315 | 316 | fanSpeed: createLineChartConfig({ 317 | label: 'Fan Speed', 318 | borderColor: '#38bdf8', 
319 | backgroundColor: 'rgba(56, 189, 248, 0.15)', 320 | yMax: 100, 321 | yStepSize: 20, 322 | yUnit: '%', 323 | tooltipTitle: 'Fan Speed', 324 | tooltipAfterLabel: (value) => { 325 | if (value > 90) return '🌪️ Maximum'; 326 | if (value > 70) return '💨 High'; 327 | if (value > 40) return '🌬️ Active'; 328 | if (value > 10) return '✓ Low'; 329 | return '⏸️ Idle'; 330 | } 331 | }), 332 | 333 | clocks: createMultiLineChartConfig({ 334 | datasets: [ 335 | { label: 'Graphics Clock', color: '#a78bfa', bgColor: 'rgba(167, 139, 250, 0.1)' }, 336 | { label: 'SM Clock', color: '#fb923c', bgColor: 'rgba(251, 146, 60, 0.1)' }, 337 | { label: 'Memory Clock', color: '#34d399', bgColor: 'rgba(52, 211, 153, 0.1)' } 338 | ], 339 | yUnit: ' MHz', 340 | tooltipTitle: 'Clock Speeds', 341 | showLegend: true, 342 | decimals: 0 343 | }), 344 | 345 | efficiency: createLineChartConfig({ 346 | label: 'Power Efficiency', 347 | borderColor: '#fbbf24', 348 | backgroundColor: 'rgba(251, 191, 36, 0.15)', 349 | yUnit: ' %/W', 350 | tooltipTitle: 'Power Efficiency', 351 | tooltipLabel: 'Efficiency', // Shortened label for tooltip 352 | decimals: 2, 353 | tooltipAfterLabel: (value) => { 354 | if (value > 0.8) return '⭐ Excellent'; 355 | if (value > 0.5) return '✓ Good'; 356 | if (value > 0.3) return '📊 Fair'; 357 | if (value > 0.1) return '⚡ Active'; 358 | return '💤 Idle'; 359 | } 360 | }), 361 | 362 | pcie: createMultiLineChartConfig({ 363 | datasets: [ 364 | { label: 'RX Throughput', color: '#3b82f6', backgroundColor: 'rgba(59, 130, 246, 0.15)', width: 3, fill: true }, 365 | { label: 'TX Throughput', color: '#8b5cf6', backgroundColor: 'rgba(139, 92, 246, 0.15)', width: 3, fill: true } 366 | ], 367 | yUnit: ' KB/s', 368 | tooltipTitle: 'PCIe Throughput', 369 | showLegend: true, 370 | decimals: 0 371 | }), 372 | 373 | appclocks: createMultiLineChartConfig({ 374 | datasets: [ 375 | { label: 'Graphics Clock', color: '#4facfe', backgroundColor: 'rgba(79, 172, 254, 0.15)', width: 2, fill: true }, 376 | 
{ label: 'Memory Clock', color: '#f59e0b', backgroundColor: 'rgba(245, 158, 11, 0.15)', width: 2, fill: true }, 377 | { label: 'SM Clock', color: '#ec4899', backgroundColor: 'rgba(236, 72, 153, 0.15)', width: 2, fill: true }, 378 | { label: 'Video Clock', color: '#10b981', backgroundColor: 'rgba(16, 185, 129, 0.15)', width: 2, fill: true } 379 | ], 380 | yUnit: ' MHz', 381 | tooltipTitle: 'Application Clocks', 382 | showLegend: true, 383 | decimals: 0 384 | }) 385 | }; 386 | 387 | -------------------------------------------------------------------------------- /docs/demo.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | GPU Hot - Interactive Demo 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 29 | 30 | 31 |
32 |
33 |

🔥 GPU Hot

34 |

Real-time NVIDIA GPU monitoring dashboard

35 |
36 | 37 |
38 | ⚠️ Interactive Demo - Simulated Data 39 |
40 | 41 |
42 |
43 |
44 | Live Monitoring (Demo) 45 |
46 |
Running Demo
47 |
48 | 49 | 50 |
51 | 52 |
53 | 54 | 55 |
56 |
57 |
58 |
59 | 60 | 61 | 62 |
63 |
64 |
65 | Active GPU Processes 66 |
67 |
68 | 0 processes 69 | 70 |
71 |
72 |
73 |
74 |
75 |
76 |
77 | 78 |
79 |
80 | 81 |
0%
82 |
System CPU
83 |
Host Processor
84 |
85 |
86 | 87 |
0%
88 |
System RAM
89 |
Host Memory
90 |
91 |
92 |
93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 344 | 345 | 346 | -------------------------------------------------------------------------------- /core/metrics/collector.py: -------------------------------------------------------------------------------- 1 | """GPU metrics collector using NVML""" 2 | 3 | import time 4 | import pynvml 5 | from datetime import datetime 6 | from .utils import safe_get, decode_bytes, to_mib, to_watts 7 | 8 | 9 | class MetricsCollector: 10 | """Collect all available GPU metrics via NVML""" 11 | 12 | def __init__(self): 13 | self.previous_samples = {} 14 | self.last_sample_time = {} 15 | 16 | def collect_all(self, handle, gpu_id): 17 | """Collect all available metrics for a GPU""" 18 | data = { 19 | 'index': gpu_id, 20 | 'timestamp': datetime.now().isoformat() 21 | } 22 | current_time = time.time() 23 | 24 | self._add_basic_info(handle, data) 25 | self._add_performance(handle, data) 26 | self._add_memory(handle, data, gpu_id, current_time) 27 | self._add_power_thermal(handle, data) 28 | self._add_clocks(handle, data) 29 | self._add_connectivity(handle, data) 30 | self._add_media_engines(handle, data) 31 | self._add_health_status(handle, data) 32 | self._add_advanced(handle, data) 33 | 34 | self.previous_samples[gpu_id] = data.copy() 35 | self.last_sample_time[gpu_id] = current_time 36 | 37 | return data 38 | 39 | def _add_basic_info(self, handle, data): 40 | """Basic GPU information""" 41 | if name := safe_get(pynvml.nvmlDeviceGetName, handle): 42 | data['name'] = decode_bytes(name) 43 | 44 | if uuid := safe_get(pynvml.nvmlDeviceGetUUID, handle): 45 | data['uuid'] = decode_bytes(uuid) 46 | 47 | if driver := safe_get(pynvml.nvmlSystemGetDriverVersion): 48 | data['driver_version'] = decode_bytes(driver) 49 | 50 | if vbios := safe_get(pynvml.nvmlDeviceGetVbiosVersion, handle): 51 | data['vbios_version'] = decode_bytes(vbios) 52 | 53 | # Brand and architecture with smart detection 54 | self._detect_brand(handle, data) 55 | 
self._detect_architecture(handle, data) 56 | 57 | # CUDA capability 58 | if cap := safe_get(pynvml.nvmlDeviceGetCudaComputeCapability, handle): 59 | data['cuda_compute_capability'] = f"{cap[0]}.{cap[1]}" 60 | 61 | # Serial number 62 | if serial := safe_get(pynvml.nvmlDeviceGetSerial, handle): 63 | data['serial'] = decode_bytes(serial) 64 | 65 | def _detect_brand(self, handle, data): 66 | """Detect GPU brand from NVML""" 67 | BRAND_MAP = { 68 | 1: 'GeForce', 2: 'Quadro', 3: 'Tesla', 69 | 4: 'NVS', 5: 'GRID', 6: 'Titan', 70 | 7: 'GeForce GTX', 8: 'GeForce RTX', 9: 'Titan RTX' 71 | } 72 | 73 | if brand := safe_get(pynvml.nvmlDeviceGetBrand, handle): 74 | data['brand'] = BRAND_MAP.get(brand, f'Brand {brand}') 75 | 76 | def _detect_architecture(self, handle, data): 77 | """Detect GPU architecture with fallback to name-based detection""" 78 | ARCH_MAP = { 79 | 0: 'Kepler', 1: 'Maxwell', 2: 'Pascal', 3: 'Volta', 80 | 4: 'Turing', 5: 'Ampere', 6: 'Ada Lovelace', 7: 'Hopper', 81 | 8: 'Ada Lovelace', 9: 'Ada Lovelace' # Driver variations 82 | } 83 | 84 | # Try NVML first 85 | if arch := safe_get(pynvml.nvmlDeviceGetArchitecture, handle): 86 | data['architecture'] = ARCH_MAP.get(arch, self._detect_arch_from_name(data.get('name', ''))) 87 | # Fallback to name-based detection 88 | elif 'name' in data: 89 | data['architecture'] = self._detect_arch_from_name(data['name']) 90 | 91 | def _detect_arch_from_name(self, gpu_name): 92 | """Detect architecture from GPU model name""" 93 | name = gpu_name.upper() 94 | 95 | arch_patterns = [ 96 | (['RTX 40', 'RTX 4', 'L40', 'L4'], 'Ada Lovelace'), 97 | (['H100', 'H200'], 'Hopper'), 98 | (['RTX 30', 'RTX 3', 'A100', 'A40', 'A30', 'A10', 'A6000', 'A5000', 'A4000', 'A2000'], 'Ampere'), 99 | (['RTX 20', 'RTX 2', 'GTX 16', 'T1000', 'T2000', 'T600'], 'Turing'), 100 | (['GTX 10', 'TITAN X', 'P100', 'P40', 'P6'], 'Pascal'), 101 | (['GTX 9', 'TITAN M', 'M60', 'M40'], 'Maxwell'), 102 | (['GTX 7', 'GTX 6', 'K80', 'K40'], 'Kepler'), 103 | (['V100'], 
'Volta'), 104 | ] 105 | 106 | for patterns, arch in arch_patterns: 107 | if any(pattern in name for pattern in patterns): 108 | return arch 109 | 110 | return 'Unknown' 111 | 112 | def _add_performance(self, handle, data): 113 | """Performance metrics""" 114 | # Utilization 115 | if util := safe_get(pynvml.nvmlDeviceGetUtilizationRates, handle): 116 | data['utilization'] = float(util.gpu) 117 | data['memory_utilization'] = float(util.memory) 118 | 119 | # Performance state 120 | if pstate := safe_get(pynvml.nvmlDeviceGetPerformanceState, handle): 121 | data['performance_state'] = f'P{pstate}' 122 | 123 | # Compute mode 124 | if mode := safe_get(pynvml.nvmlDeviceGetComputeMode, handle): 125 | modes = {0: 'Default', 1: 'Exclusive Thread', 126 | 2: 'Prohibited', 3: 'Exclusive Process'} 127 | data['compute_mode'] = modes.get(mode, 'Unknown') 128 | 129 | def _add_memory(self, handle, data, gpu_id, current_time): 130 | """Memory metrics""" 131 | if mem := safe_get(pynvml.nvmlDeviceGetMemoryInfo, handle): 132 | data['memory_used'] = to_mib(mem.used) 133 | data['memory_total'] = to_mib(mem.total) 134 | data['memory_free'] = to_mib(mem.free) 135 | 136 | # Calculate change rate 137 | if gpu_id in self.previous_samples: 138 | prev = self.previous_samples[gpu_id] 139 | if 'memory_used' in prev: 140 | dt = current_time - self.last_sample_time.get(gpu_id, current_time) 141 | if dt > 0: 142 | delta = data['memory_used'] - prev['memory_used'] 143 | data['memory_change_rate'] = float(delta / dt) 144 | 145 | # BAR1 memory 146 | if bar1 := safe_get(pynvml.nvmlDeviceGetBAR1MemoryInfo, handle): 147 | data['bar1_memory_used'] = to_mib(bar1.bar1Used) 148 | data['bar1_memory_total'] = to_mib(bar1.bar1Total) 149 | 150 | def _add_power_thermal(self, handle, data): 151 | """Power and thermal metrics""" 152 | self._add_temperature(handle, data) 153 | self._add_power(handle, data) 154 | self._add_fan_speeds(handle, data) 155 | self._add_throttling(handle, data) 156 | 157 | def 
_add_temperature(self, handle, data): 158 | if temp := safe_get(pynvml.nvmlDeviceGetTemperature, handle, pynvml.NVML_TEMPERATURE_GPU): 159 | data['temperature'] = float(temp) 160 | 161 | if temp_mem := safe_get(pynvml.nvmlDeviceGetTemperature, handle, 1): 162 | if temp_mem > 0: 163 | data['temperature_memory'] = float(temp_mem) 164 | 165 | def _add_power(self, handle, data): 166 | if power := safe_get(pynvml.nvmlDeviceGetPowerUsage, handle): 167 | data['power_draw'] = to_watts(power) 168 | 169 | if limit := safe_get(pynvml.nvmlDeviceGetPowerManagementLimit, handle): 170 | data['power_limit'] = to_watts(limit) 171 | 172 | if constraints := safe_get(pynvml.nvmlDeviceGetPowerManagementLimitConstraints, handle): 173 | if isinstance(constraints, tuple) and len(constraints) >= 2: 174 | data['power_limit_min'] = to_watts(constraints[0]) 175 | data['power_limit_max'] = to_watts(constraints[1]) 176 | 177 | if energy := safe_get(pynvml.nvmlDeviceGetTotalEnergyConsumption, handle): 178 | data['energy_consumption'] = float(energy) / 1000.0 179 | data['energy_consumption_wh'] = float(energy) / 3600000.0 180 | 181 | def _add_fan_speeds(self, handle, data): 182 | if fan := safe_get(pynvml.nvmlDeviceGetFanSpeed, handle): 183 | data['fan_speed'] = float(fan) 184 | 185 | if hasattr(pynvml, 'nvmlDeviceGetNumFans') and hasattr(pynvml, 'nvmlDeviceGetFanSpeed_v2'): 186 | if num_fans := safe_get(pynvml.nvmlDeviceGetNumFans, handle): 187 | fans = [] 188 | for i in range(num_fans): 189 | if speed := safe_get(pynvml.nvmlDeviceGetFanSpeed_v2, handle, i): 190 | fans.append(float(speed)) 191 | if fans: 192 | data['fan_speeds'] = fans 193 | 194 | def _add_throttling(self, handle, data): 195 | if throttle := safe_get(pynvml.nvmlDeviceGetCurrentClocksThrottleReasons, handle): 196 | throttle_map = [ 197 | (pynvml.nvmlClocksThrottleReasonGpuIdle, 'GPU Idle'), 198 | (pynvml.nvmlClocksThrottleReasonApplicationsClocksSetting, 'App Settings'), 199 | (pynvml.nvmlClocksThrottleReasonSwPowerCap, 'SW Power 
Cap'), 200 | (pynvml.nvmlClocksThrottleReasonHwSlowdown, 'HW Slowdown'), 201 | (pynvml.nvmlClocksThrottleReasonSwThermalSlowdown, 'SW Thermal'), 202 | (pynvml.nvmlClocksThrottleReasonHwThermalSlowdown, 'HW Thermal'), 203 | (pynvml.nvmlClocksThrottleReasonHwPowerBrakeSlowdown, 'Power Brake'), 204 | ] 205 | reasons = [label for flag, label in throttle_map if throttle & flag] 206 | data['throttle_reasons'] = ', '.join(reasons) if reasons else 'None' 207 | 208 | def _add_clocks(self, handle, data): 209 | """Clock speed metrics""" 210 | clock_types = [ 211 | ('clock_graphics', pynvml.NVML_CLOCK_GRAPHICS), 212 | ('clock_sm', pynvml.NVML_CLOCK_SM), 213 | ('clock_memory', pynvml.NVML_CLOCK_MEM), 214 | ('clock_video', pynvml.NVML_CLOCK_VIDEO), 215 | ] 216 | 217 | for key, clock_type in clock_types: 218 | # Current clocks 219 | if clock := safe_get(pynvml.nvmlDeviceGetClockInfo, handle, clock_type): 220 | data[key] = float(clock) 221 | 222 | # Max clocks 223 | if max_clock := safe_get(pynvml.nvmlDeviceGetMaxClockInfo, handle, clock_type): 224 | data[f'{key}_max'] = float(max_clock) 225 | 226 | # Application clocks (target clocks set by user/driver) 227 | if app_clock := safe_get(pynvml.nvmlDeviceGetApplicationsClock, handle, clock_type): 228 | data[f'{key}_app'] = float(app_clock) 229 | 230 | # Default application clocks 231 | if default_clock := safe_get(pynvml.nvmlDeviceGetDefaultApplicationsClock, handle, clock_type): 232 | data[f'{key}_default'] = float(default_clock) 233 | 234 | # Supported memory clocks (list of all available clock speeds) 235 | try: 236 | if mem_clocks := safe_get(pynvml.nvmlDeviceGetSupportedMemoryClocks, handle): 237 | if mem_clocks and len(mem_clocks) > 0: 238 | data['supported_memory_clocks'] = [float(c) for c in mem_clocks[:10]] # Limit to first 10 239 | except: 240 | pass 241 | 242 | def _add_connectivity(self, handle, data): 243 | """PCIe and interconnect metrics""" 244 | # PCIe 245 | pcie_metrics = [ 246 | ('pcie_gen', 
pynvml.nvmlDeviceGetCurrPcieLinkGeneration), 247 | ('pcie_gen_max', pynvml.nvmlDeviceGetMaxPcieLinkGeneration), 248 | ('pcie_width', pynvml.nvmlDeviceGetCurrPcieLinkWidth), 249 | ('pcie_width_max', pynvml.nvmlDeviceGetMaxPcieLinkWidth), 250 | ] 251 | 252 | for key, func in pcie_metrics: 253 | if value := safe_get(func, handle): 254 | data[key] = str(value) 255 | 256 | # PCIe throughput 257 | if tx := safe_get(pynvml.nvmlDeviceGetPcieThroughput, handle, 258 | pynvml.NVML_PCIE_UTIL_TX_BYTES): 259 | data['pcie_tx_throughput'] = float(tx) 260 | 261 | if rx := safe_get(pynvml.nvmlDeviceGetPcieThroughput, handle, 262 | pynvml.NVML_PCIE_UTIL_RX_BYTES): 263 | data['pcie_rx_throughput'] = float(rx) 264 | 265 | # PCI info 266 | if pci := safe_get(pynvml.nvmlDeviceGetPciInfo, handle): 267 | data['pci_bus_id'] = decode_bytes(pci.busId) 268 | 269 | def _add_media_engines(self, handle, data): 270 | """Encoder/decoder metrics""" 271 | # Encoder 272 | if enc := safe_get(pynvml.nvmlDeviceGetEncoderUtilization, handle): 273 | if isinstance(enc, tuple) and len(enc) >= 2: 274 | data['encoder_utilization'] = float(enc[0]) 275 | 276 | try: 277 | if sessions := pynvml.nvmlDeviceGetEncoderSessions(handle): 278 | data['encoder_sessions'] = len(sessions) 279 | if fps := [s.averageFps for s in sessions if hasattr(s, 'averageFps')]: 280 | data['encoder_fps'] = float(sum(fps) / len(fps)) 281 | except: 282 | pass 283 | 284 | # Decoder 285 | if dec := safe_get(pynvml.nvmlDeviceGetDecoderUtilization, handle): 286 | if isinstance(dec, tuple) and len(dec) >= 2: 287 | data['decoder_utilization'] = float(dec[0]) 288 | 289 | try: 290 | if sessions := pynvml.nvmlDeviceGetDecoderSessions(handle): 291 | data['decoder_sessions'] = len(sessions) 292 | except: 293 | pass 294 | 295 | def _add_health_status(self, handle, data): 296 | """ECC and health metrics""" 297 | try: 298 | if ecc := pynvml.nvmlDeviceGetEccMode(handle): 299 | if ecc[0]: 300 | data['ecc_enabled'] = True 301 | 302 | # ECC errors 303 | if 
err := safe_get(pynvml.nvmlDeviceGetTotalEccErrors, handle, 304 | pynvml.NVML_MEMORY_ERROR_TYPE_CORRECTED, 305 | pynvml.NVML_VOLATILE_ECC): 306 | data['ecc_errors_corrected'] = int(err) 307 | except: 308 | pass 309 | 310 | # Retired pages 311 | try: 312 | if pages := pynvml.nvmlDeviceGetRetiredPages(handle, 313 | pynvml.NVML_PAGE_RETIREMENT_CAUSE_DOUBLE_BIT_ECC_ERROR): 314 | data['retired_pages'] = len(pages) 315 | except: 316 | pass 317 | 318 | def _add_advanced(self, handle, data): 319 | """Advanced features""" 320 | if mode := safe_get(pynvml.nvmlDeviceGetPersistenceMode, handle): 321 | data['persistence_mode'] = 'Enabled' if mode else 'Disabled' 322 | 323 | if display := safe_get(pynvml.nvmlDeviceGetDisplayActive, handle): 324 | data['display_active'] = bool(display) 325 | 326 | if multi := safe_get(pynvml.nvmlDeviceGetMultiGpuBoard, handle): 327 | data['multi_gpu_board'] = bool(multi) 328 | 329 | if procs := safe_get(pynvml.nvmlDeviceGetGraphicsRunningProcesses, handle, default=[]): 330 | data['graphics_processes_count'] = len(procs) 331 | 332 | self._add_mig_mode(handle, data) 333 | self._add_nvlink(handle, data) 334 | 335 | def _add_mig_mode(self, handle, data): 336 | if hasattr(pynvml, 'nvmlDeviceGetMigMode'): 337 | if mig := safe_get(pynvml.nvmlDeviceGetMigMode, handle): 338 | if isinstance(mig, tuple) and len(mig) >= 2: 339 | data['mig_mode_current'] = 'Enabled' if mig[0] else 'Disabled' 340 | data['mig_mode_pending'] = 'Enabled' if mig[1] else 'Disabled' 341 | 342 | def _add_nvlink(self, handle, data): 343 | if hasattr(pynvml, 'nvmlDeviceGetNvLinkState'): 344 | nvlinks = [] 345 | active_count = 0 346 | 347 | for link_id in range(6): 348 | if state := safe_get(pynvml.nvmlDeviceGetNvLinkState, handle, link_id): 349 | link_data = {'id': link_id, 'active': bool(state)} 350 | 351 | if hasattr(pynvml, 'nvmlDeviceGetNvLinkCapability'): 352 | if hasattr(pynvml, 'NVML_NVLINK_CAP_P2P_SUPPORTED'): 353 | if caps := safe_get(pynvml.nvmlDeviceGetNvLinkCapability, 
handle, 354 | link_id, pynvml.NVML_NVLINK_CAP_P2P_SUPPORTED): 355 | link_data['p2p_supported'] = bool(caps) 356 | 357 | nvlinks.append(link_data) 358 | if state: 359 | active_count += 1 360 | else: 361 | break 362 | 363 | if nvlinks: 364 | data['nvlink_links'] = nvlinks 365 | data['nvlink_active_count'] = active_count 366 | 367 | -------------------------------------------------------------------------------- /static/js/socket-handlers.js: -------------------------------------------------------------------------------- 1 | /** 2 | * WebSocket event handlers 3 | */ 4 | 5 | // WebSocket connection with auto-reconnect 6 | let socket = null; 7 | let reconnectInterval = null; 8 | let reconnectAttempts = 0; 9 | const MAX_RECONNECT_ATTEMPTS = 10; 10 | const RECONNECT_DELAY = 2000; // Start with 2 seconds 11 | 12 | function createWebSocketConnection() { 13 | const protocol = window.location.protocol === 'https:' ? 'wss:' : 'ws:'; 14 | const ws = new WebSocket(protocol + '//' + window.location.host + '/socket.io/'); 15 | return ws; 16 | } 17 | 18 | function connectWebSocket() { 19 | if (socket && (socket.readyState === WebSocket.CONNECTING || socket.readyState === WebSocket.OPEN)) { 20 | return; // Already connected or connecting 21 | } 22 | 23 | socket = createWebSocketConnection(); 24 | setupWebSocketHandlers(); 25 | } 26 | 27 | function setupWebSocketHandlers() { 28 | if (!socket) return; 29 | 30 | socket.onopen = handleSocketOpen; 31 | socket.onmessage = handleSocketMessage; 32 | socket.onclose = handleSocketClose; 33 | socket.onerror = handleSocketError; 34 | } 35 | 36 | function handleSocketOpen() { 37 | console.log('Connected to server'); 38 | reconnectAttempts = 0; 39 | clearInterval(reconnectInterval); 40 | reconnectInterval = null; 41 | 42 | const statusEl = document.getElementById('connection-status'); 43 | if (statusEl) { 44 | statusEl.textContent = 'Connected'; 45 | statusEl.style.color = '#43e97b'; 46 | } 47 | } 48 | 49 | function handleSocketClose() { 50 | 
/**
 * Kick off a periodic reconnection loop (no-op if one is already running).
 * After MAX_RECONNECT_ATTEMPTS failures the loop stops and the status
 * element becomes a tap-to-reload control.
 */
function attemptReconnect() {
    // A timer already exists — another loop would double the attempts.
    if (reconnectInterval) return;

    reconnectInterval = setInterval(() => {
        const exhausted = reconnectAttempts >= MAX_RECONNECT_ATTEMPTS;
        if (exhausted) {
            // Give up: stop the timer and let the user reload manually.
            clearInterval(reconnectInterval);
            reconnectInterval = null;
            const statusEl = document.getElementById('connection-status');
            if (statusEl) {
                statusEl.textContent = 'Disconnected - Tap to Reload';
                statusEl.style.color = '#f5576c';
                statusEl.style.cursor = 'pointer';
                statusEl.onclick = () => location.reload();
            }
            return;
        }

        reconnectAttempts += 1;
        console.log(`Reconnection attempt ${reconnectAttempts}/${MAX_RECONNECT_ATTEMPTS}`);
        connectWebSocket();
    }, RECONNECT_DELAY);
}
SCROLL_PAUSE_DURATION); 113 | }; 114 | 115 | // Wait for DOM to be ready 116 | setTimeout(() => { 117 | // Listen to window scroll (primary scroll container) 118 | window.addEventListener('scroll', handleScroll, { passive: true }); 119 | 120 | // Also listen to .container as fallback 121 | const container = document.querySelector('.container'); 122 | if (container) { 123 | container.addEventListener('scroll', handleScroll, { passive: true }); 124 | } 125 | }, 500); 126 | } 127 | 128 | // Initialize scroll detection 129 | setupScrollDetection(); 130 | 131 | // Performance: Batched rendering system using requestAnimationFrame 132 | // Batches all DOM updates into a single frame to minimize reflows/repaints 133 | let pendingUpdates = new Map(); // Queue of pending GPU/system updates 134 | let rafScheduled = false; // Flag to prevent duplicate RAF scheduling 135 | 136 | // Performance: Throttle text updates (less critical than charts) 137 | const lastDOMUpdate = {}; // Track last update time per GPU 138 | const DOM_UPDATE_INTERVAL = 1000; // Text/card updates every 1s, charts update every frame 139 | 140 | // Handle incoming GPU data 141 | function handleSocketMessage(event) { 142 | const data = JSON.parse(event.data); 143 | // Hub mode: different data structure with nodes 144 | if (data.mode === 'hub') { 145 | handleClusterData(data); 146 | return; 147 | } 148 | 149 | const overviewContainer = document.getElementById('overview-container'); 150 | 151 | // Clear loading state 152 | if (overviewContainer.innerHTML.includes('Loading GPU data')) { 153 | overviewContainer.innerHTML = ''; 154 | } 155 | 156 | const gpuCount = Object.keys(data.gpus).length; 157 | const now = Date.now(); 158 | 159 | // Performance: Skip ALL DOM updates during active scrolling 160 | if (isScrolling) { 161 | // Still update chart data arrays (lightweight) to maintain continuity 162 | // This ensures no data gaps when scroll ends 163 | Object.keys(data.gpus).forEach(gpuId => { 164 | const gpuInfo 
= data.gpus[gpuId]; 165 | if (!chartData[gpuId]) { 166 | initGPUData(gpuId, { 167 | utilization: gpuInfo.utilization, 168 | temperature: gpuInfo.temperature, 169 | memory: (gpuInfo.memory_used / gpuInfo.memory_total) * 100, 170 | power: gpuInfo.power_draw, 171 | fanSpeed: gpuInfo.fan_speed, 172 | clockGraphics: gpuInfo.clock_graphics, 173 | clockSm: gpuInfo.clock_sm, 174 | clockMemory: gpuInfo.clock_memory 175 | }); 176 | } 177 | updateAllChartDataOnly(gpuId, gpuInfo); 178 | }); 179 | return; // Exit early - zero DOM work during scroll = smooth 60 FPS 180 | } 181 | 182 | // Process each GPU - queue updates for batched rendering 183 | Object.keys(data.gpus).forEach(gpuId => { 184 | const gpuInfo = data.gpus[gpuId]; 185 | 186 | // Initialize chart data structures if first time seeing this GPU 187 | if (!chartData[gpuId]) { 188 | initGPUData(gpuId, { 189 | utilization: gpuInfo.utilization, 190 | temperature: gpuInfo.temperature, 191 | memory: (gpuInfo.memory_used / gpuInfo.memory_total) * 100, 192 | power: gpuInfo.power_draw, 193 | fanSpeed: gpuInfo.fan_speed, 194 | clockGraphics: gpuInfo.clock_graphics, 195 | clockSm: gpuInfo.clock_sm, 196 | clockMemory: gpuInfo.clock_memory 197 | }); 198 | } 199 | 200 | // Determine if text/card DOM should update (throttled) or just charts (every frame) 201 | const shouldUpdateDOM = !lastDOMUpdate[gpuId] || (now - lastDOMUpdate[gpuId]) >= DOM_UPDATE_INTERVAL; 202 | 203 | // Queue this GPU's update instead of executing immediately 204 | pendingUpdates.set(gpuId, { 205 | gpuInfo, 206 | shouldUpdateDOM, 207 | now 208 | }); 209 | 210 | // Handle initial card creation (can't be batched since we need the DOM element) 211 | const existingOverview = overviewContainer.querySelector(`[data-gpu-id="${gpuId}"]`); 212 | if (!existingOverview) { 213 | overviewContainer.insertAdjacentHTML('beforeend', createOverviewCard(gpuId, gpuInfo)); 214 | initOverviewMiniChart(gpuId, gpuInfo.utilization); 215 | lastDOMUpdate[gpuId] = now; 216 | } 217 | }); 
218 | 219 | // Queue system updates (processes/CPU/RAM) for batching 220 | if (!lastDOMUpdate.system || (now - lastDOMUpdate.system) >= DOM_UPDATE_INTERVAL) { 221 | pendingUpdates.set('_system', { 222 | processes: data.processes, 223 | system: data.system, 224 | now 225 | }); 226 | } 227 | 228 | // Schedule single batched render (if not already scheduled) 229 | // This ensures all updates happen in ONE animation frame 230 | if (!rafScheduled && pendingUpdates.size > 0) { 231 | rafScheduled = true; 232 | requestAnimationFrame(processBatchedUpdates); 233 | } 234 | 235 | // Auto-switch to single GPU view if only 1 GPU detected (first time only) 236 | autoSwitchSingleGPU(gpuCount, Object.keys(data.gpus)); 237 | } 238 | 239 | /** 240 | * Process all batched updates in a single animation frame 241 | * Called by requestAnimationFrame at optimal timing (~60 FPS) 242 | * 243 | * Performance benefit: All DOM updates execute in ONE layout/paint cycle 244 | * instead of multiple cycles, eliminating layout thrashing 245 | */ 246 | function processBatchedUpdates() { 247 | rafScheduled = false; 248 | 249 | // Execute all queued updates in a single batch 250 | pendingUpdates.forEach((update, gpuId) => { 251 | if (gpuId === '_system') { 252 | // System updates (CPU, RAM, processes) 253 | updateProcesses(update.processes); 254 | updateSystemInfo(update.system); 255 | lastDOMUpdate.system = update.now; 256 | } else { 257 | // GPU updates 258 | const { gpuInfo, shouldUpdateDOM, now } = update; 259 | 260 | // Update overview card (always for charts, conditionally for text) 261 | updateOverviewCard(gpuId, gpuInfo, shouldUpdateDOM); 262 | if (shouldUpdateDOM) { 263 | lastDOMUpdate[gpuId] = now; 264 | } 265 | 266 | // Performance: Only update detail view if tab is visible 267 | // Invisible tabs = zero wasted processing 268 | const isDetailTabVisible = currentTab === `gpu-${gpuId}`; 269 | if (isDetailTabVisible || !registeredGPUs.has(gpuId)) { 270 | ensureGPUTab(gpuId, gpuInfo, 
/**
 * Append the latest samples to a GPU's chart data arrays WITHOUT rendering.
 *
 * Used while the user is scrolling: data continuity is preserved, but all
 * DOM/canvas work is skipped so scrolling stays smooth.
 *
 * @param {string} gpuId - GPU identifier
 * @param {object} gpuInfo - GPU metrics payload from the server
 */
function updateAllChartDataOnly(gpuId, gpuInfo) {
    const gpuCharts = chartData[gpuId];
    if (!gpuCharts) return;

    const stamp = new Date().toLocaleTimeString();
    const memUsed = gpuInfo.memory_used || 0;
    const memTotal = gpuInfo.memory_total || 1;
    const watts = gpuInfo.power_draw || 0;

    // One sample per single-line chart; efficiency = utilization per watt.
    const samples = {
        utilization: gpuInfo.utilization || 0,
        temperature: gpuInfo.temperature || 0,
        memory: (memUsed / memTotal) * 100,
        power: watts,
        fanSpeed: gpuInfo.fan_speed || 0,
        efficiency: watts > 0 ? (gpuInfo.utilization || 0) / watts : 0
    };

    for (const [kind, value] of Object.entries(samples)) {
        const series = gpuCharts[kind];
        if (!series?.labels || !series?.data) continue;

        series.labels.push(stamp);
        series.data.push(Number(value) || 0);

        // Fixed threshold guide-lines for the charts that draw them.
        if (kind === 'utilization' && series.thresholdData) {
            series.thresholdData.push(80);
        } else if (kind === 'temperature') {
            if (series.warningData) series.warningData.push(75);
            if (series.dangerData) series.dangerData.push(85);
        } else if (kind === 'memory' && series.thresholdData) {
            series.thresholdData.push(90);
        }

        // Rolling window: 120 points = 60s at the 0.5s sample interval.
        if (series.labels.length > 120) {
            series.labels.shift();
            series.data.shift();
            if (series.thresholdData) series.thresholdData.shift();
            if (series.warningData) series.warningData.shift();
            if (series.dangerData) series.dangerData.shift();
        }
    }

    // Multi-line clocks chart shares one label axis across three series.
    const clocks = gpuCharts.clocks;
    if (clocks?.labels) {
        clocks.labels.push(stamp);
        clocks.graphicsData.push(gpuInfo.clock_graphics || 0);
        clocks.smData.push(gpuInfo.clock_sm || 0);
        clocks.memoryData.push(gpuInfo.clock_memory || 0);

        if (clocks.labels.length > 120) {
            clocks.labels.shift();
            clocks.graphicsData.shift();
            clocks.smData.shift();
            clocks.memoryData.shift();
        }
    }
}
|| socket.readyState !== WebSocket.OPEN) { 358 | // Connection is closed, reconnect immediately 359 | reconnectAttempts = 0; 360 | clearInterval(reconnectInterval); 361 | reconnectInterval = null; 362 | connectWebSocket(); 363 | } 364 | } 365 | }); 366 | 367 | // Also handle page focus (additional safety) 368 | window.addEventListener('focus', () => { 369 | if (!socket || socket.readyState !== WebSocket.OPEN) { 370 | console.log('Window focused - checking connection'); 371 | reconnectAttempts = 0; 372 | clearInterval(reconnectInterval); 373 | reconnectInterval = null; 374 | connectWebSocket(); 375 | } 376 | }); 377 | 378 | /** 379 | * Handle cluster/hub mode data 380 | * Data structure: { mode: 'hub', nodes: {...}, cluster_stats: {...} } 381 | */ 382 | function handleClusterData(data) { 383 | const overviewContainer = document.getElementById('overview-container'); 384 | const now = Date.now(); 385 | 386 | // Clear loading state 387 | if (overviewContainer.innerHTML.includes('Loading GPU data')) { 388 | overviewContainer.innerHTML = ''; 389 | } 390 | 391 | // Skip DOM updates during scrolling 392 | if (isScrolling) { 393 | // Still update chart data for continuity 394 | Object.entries(data.nodes).forEach(([nodeName, nodeData]) => { 395 | if (nodeData.status === 'online') { 396 | Object.entries(nodeData.gpus).forEach(([gpuId, gpuInfo]) => { 397 | const fullGpuId = `${nodeName}-${gpuId}`; 398 | if (!chartData[fullGpuId]) { 399 | initGPUData(fullGpuId, { 400 | utilization: gpuInfo.utilization, 401 | temperature: gpuInfo.temperature, 402 | memory: (gpuInfo.memory_used / gpuInfo.memory_total) * 100, 403 | power: gpuInfo.power_draw, 404 | fanSpeed: gpuInfo.fan_speed, 405 | clockGraphics: gpuInfo.clock_graphics, 406 | clockSm: gpuInfo.clock_sm, 407 | clockMemory: gpuInfo.clock_memory 408 | }); 409 | } 410 | updateAllChartDataOnly(fullGpuId, gpuInfo); 411 | }); 412 | } 413 | }); 414 | return; 415 | } 416 | 417 | // Render GPUs grouped by node (minimal grouping) 418 | 
Object.entries(data.nodes).forEach(([nodeName, nodeData]) => { 419 | // Get or create node group container 420 | let nodeGroup = overviewContainer.querySelector(`[data-node="${nodeName}"]`); 421 | if (!nodeGroup) { 422 | overviewContainer.insertAdjacentHTML('beforeend', ` 423 |
424 |
${nodeName}
425 |
426 |
427 | `); 428 | nodeGroup = overviewContainer.querySelector(`[data-node="${nodeName}"]`); 429 | } 430 | 431 | const nodeGrid = nodeGroup.querySelector('.node-grid'); 432 | 433 | if (nodeData.status === 'online') { 434 | // Node is online - process its GPUs normally 435 | Object.entries(nodeData.gpus).forEach(([gpuId, gpuInfo]) => { 436 | const fullGpuId = `${nodeName}-${gpuId}`; 437 | 438 | // Initialize chart data with current values 439 | if (!chartData[fullGpuId]) { 440 | initGPUData(fullGpuId, { 441 | utilization: gpuInfo.utilization, 442 | temperature: gpuInfo.temperature, 443 | memory: (gpuInfo.memory_used / gpuInfo.memory_total) * 100, 444 | power: gpuInfo.power_draw, 445 | fanSpeed: gpuInfo.fan_speed, 446 | clockGraphics: gpuInfo.clock_graphics, 447 | clockSm: gpuInfo.clock_sm, 448 | clockMemory: gpuInfo.clock_memory 449 | }); 450 | } 451 | 452 | // Queue update 453 | const shouldUpdateDOM = !lastDOMUpdate[fullGpuId] || (now - lastDOMUpdate[fullGpuId]) >= DOM_UPDATE_INTERVAL; 454 | pendingUpdates.set(fullGpuId, { 455 | gpuInfo, 456 | shouldUpdateDOM, 457 | now, 458 | nodeName 459 | }); 460 | 461 | // Create card if doesn't exist 462 | const existingCard = nodeGrid.querySelector(`[data-gpu-id="${fullGpuId}"]`); 463 | if (!existingCard) { 464 | nodeGrid.insertAdjacentHTML('beforeend', createClusterGPUCard(nodeName, gpuId, gpuInfo)); 465 | initOverviewMiniChart(fullGpuId, gpuInfo.utilization); 466 | lastDOMUpdate[fullGpuId] = now; 467 | } 468 | }); 469 | } else { 470 | // Node is offline - remove entire node group 471 | const existingCards = nodeGrid.querySelectorAll('[data-gpu-id]'); 472 | existingCards.forEach(card => { 473 | const gpuId = card.getAttribute('data-gpu-id'); 474 | // Clean up chart data 475 | if (chartData[gpuId]) { 476 | delete chartData[gpuId]; 477 | } 478 | if (lastDOMUpdate[gpuId]) { 479 | delete lastDOMUpdate[gpuId]; 480 | } 481 | // Remove the GPU tab 482 | removeGPUTab(gpuId); 483 | }); 484 | 485 | // Remove the entire node group from 
the UI 486 | nodeGroup.remove(); 487 | } 488 | }); 489 | 490 | // Update processes and system info (use first online node) 491 | const firstOnlineNode = Object.values(data.nodes).find(n => n.status === 'online'); 492 | if (firstOnlineNode) { 493 | if (!lastDOMUpdate.system || (now - lastDOMUpdate.system) >= DOM_UPDATE_INTERVAL) { 494 | pendingUpdates.set('_system', { 495 | processes: firstOnlineNode.processes || [], 496 | system: firstOnlineNode.system || {}, 497 | now 498 | }); 499 | } 500 | } 501 | 502 | // Schedule batched render 503 | if (!rafScheduled && pendingUpdates.size > 0) { 504 | rafScheduled = true; 505 | requestAnimationFrame(processBatchedUpdates); 506 | } 507 | } 508 | 509 | /** 510 | * Create GPU card for cluster view (includes node name) 511 | */ 512 | function createClusterGPUCard(nodeName, gpuId, gpuInfo) { 513 | const fullGpuId = `${nodeName}-${gpuId}`; 514 | const memory_used = getMetricValue(gpuInfo, 'memory_used', 0); 515 | const memory_total = getMetricValue(gpuInfo, 'memory_total', 1); 516 | const memPercent = (memory_used / memory_total) * 100; 517 | 518 | return ` 519 |
520 |
521 |
522 |

523 | GPU ${gpuId} 524 |

525 |

${getMetricValue(gpuInfo, 'name', 'Unknown GPU')}

526 |
527 |
528 | 529 | ONLINE 530 |
531 |
532 | 533 |
534 |
535 |
${getMetricValue(gpuInfo, 'utilization', 0)}%
536 |
GPU Usage
537 |
538 |
539 |
${getMetricValue(gpuInfo, 'temperature', 0)}°C
540 |
Temperature
541 |
542 |
543 |
${Math.round(memPercent)}%
544 |
Memory
545 |
546 |
547 |
${getMetricValue(gpuInfo, 'power_draw', 0).toFixed(0)}W
548 |
Power Draw
549 |
550 |
551 | 552 |
553 |
554 | 555 |
556 |
557 |
558 | `; 559 | } 560 | -------------------------------------------------------------------------------- /docs/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | GPU Hot - Real-time NVIDIA GPU Monitoring 7 | 8 | 9 | 10 | 11 | 604 | 605 | 606 |
607 | 608 | 614 | 615 | 616 |
617 |

GPU Hot
Metrics in Seconds

618 |

Real-time GPU monitoring in your browser. Start with one server, scale to dozens. No infrastructure, no setup, no SSH. Just one command.

619 |
620 | 621 | Try Interactive Demo 622 | 623 | 624 | 625 | 626 | 627 | View on GitHub 628 | 629 | 630 | 631 | 632 |
633 |
634 | 635 | 636 |
637 |

Get Started

638 |
639 |
640 | terminal 641 | 644 |
645 |
646 | $ docker run -d --name gpu-hot --gpus all -p 1312:1312 ghcr.io/psalias2006/gpu-hot:latest 647 |
648 |
649 |

650 | → http://localhost:1312 651 |

652 |

653 |
654 |
655 | 656 | 657 | 669 | 670 | 717 | 718 | 719 | -------------------------------------------------------------------------------- /static/js/chart-manager.js: -------------------------------------------------------------------------------- 1 | /** 2 | * Chart management - data storage, updates, and initialization 3 | * Requires: chart-config.js to be loaded first 4 | */ 5 | 6 | // Detect if we're on a mobile device 7 | function isMobile() { 8 | return window.innerWidth <= 768; 9 | } 10 | 11 | // Get mobile-optimized chart options 12 | function getMobileChartOptions(baseOptions) { 13 | if (!isMobile()) return baseOptions; 14 | 15 | // Clone the options to avoid mutating the base config 16 | const mobileOptions = JSON.parse(JSON.stringify(baseOptions)); 17 | 18 | const isVerySmall = window.innerWidth <= 375; 19 | 20 | // Simplify axes for mobile - minimal but readable 21 | if (mobileOptions.scales) { 22 | if (mobileOptions.scales.x) { 23 | mobileOptions.scales.x.display = false; // Hide x-axis time labels 24 | } 25 | if (mobileOptions.scales.y) { 26 | // Keep y-axis visible and simple 27 | mobileOptions.scales.y.display = true; 28 | mobileOptions.scales.y.ticks = mobileOptions.scales.y.ticks || {}; 29 | mobileOptions.scales.y.ticks.font = { size: isVerySmall ? 
8 : 9 }; 30 | mobileOptions.scales.y.ticks.padding = 3; 31 | mobileOptions.scales.y.ticks.color = 'rgba(255, 255, 255, 0.5)'; 32 | mobileOptions.scales.y.ticks.maxTicksLimit = 3; 33 | mobileOptions.scales.y.grid = mobileOptions.scales.y.grid || {}; 34 | mobileOptions.scales.y.grid.color = 'rgba(255, 255, 255, 0.08)'; 35 | mobileOptions.scales.y.grid.lineWidth = 1; 36 | mobileOptions.scales.y.grid.drawBorder = true; 37 | } 38 | } 39 | 40 | // Keep tooltips but simplify them 41 | if (mobileOptions.plugins && mobileOptions.plugins.tooltip) { 42 | mobileOptions.plugins.tooltip.enabled = true; 43 | mobileOptions.plugins.tooltip.padding = 8; 44 | mobileOptions.plugins.tooltip.titleFont = { size: 11 }; 45 | mobileOptions.plugins.tooltip.bodyFont = { size: 10 }; 46 | } 47 | 48 | // Hide legends on mobile 49 | if (mobileOptions.plugins && mobileOptions.plugins.legend) { 50 | mobileOptions.plugins.legend.display = false; 51 | } 52 | 53 | // Keep some padding so chart renders properly 54 | if (mobileOptions.layout && mobileOptions.layout.padding) { 55 | mobileOptions.layout.padding = { left: 10, right: 15, top: 5, bottom: 10 }; 56 | } 57 | 58 | // Ensure chart renders 59 | mobileOptions.responsive = true; 60 | mobileOptions.maintainAspectRatio = false; 61 | 62 | return mobileOptions; 63 | } 64 | 65 | // Store charts and data 66 | const charts = {}; 67 | const chartData = {}; 68 | 69 | // Initialize chart data for a GPU with pre-filled baseline data 70 | function initGPUData(gpuId, initialValues = {}) { 71 | const dataPoints = 120; // 60 seconds at 0.5s interval 72 | const labels = []; 73 | 74 | // Create labels for the full timeline 75 | for (let i = dataPoints - 1; i >= 0; i--) { 76 | const time = new Date(Date.now() - i * 500); 77 | labels.push(time.toLocaleTimeString()); 78 | } 79 | 80 | // Helper to create filled array with initial value 81 | const createFilledArray = (value = 0) => new Array(dataPoints).fill(value); 82 | 83 | chartData[gpuId] = { 84 | utilization: { 85 | 
// Compute summary statistics (min/max/avg/current) over a numeric series.
// NaN/±Infinity samples are ignored; an empty or invalid series yields zeros.
function calculateStats(data) {
    const zeros = { min: 0, max: 0, avg: 0, current: 0 };
    if (!data || !Array.isArray(data) || data.length === 0) {
        return zeros;
    }

    // Keep only finite samples before aggregating.
    const valid = data.filter((v) => isFinite(v));
    if (valid.length === 0) {
        return zeros;
    }

    const latest = valid[valid.length - 1];
    const lo = Math.min(...valid);
    const hi = Math.max(...valid);
    const mean = valid.reduce((sum, v) => sum + v, 0) / valid.length;

    // Final isFinite guards mirror the input filtering, as a belt-and-braces
    // defense against overflow in the aggregation itself.
    return {
        min: isFinite(lo) ? lo : 0,
        max: isFinite(hi) ? hi : 0,
        avg: isFinite(mean) ? mean : 0,
        current: isFinite(latest) ? latest : 0
    };
}

// Push min/max/avg/current values into a chart's stat elements, if present.
function updateChartStats(gpuId, chartType, stats, unit) {
    // Efficiency is a small ratio shown with two decimals; all other
    // chart types display rounded integers.
    const format = (value) =>
        chartType === 'efficiency' ? value.toFixed(2) : Math.round(value);

    const fields = {
        current: stats.current,
        min: stats.min,
        max: stats.max,
        avg: stats.avg
    };
    for (const [label, value] of Object.entries(fields)) {
        const el = document.getElementById(`stat-${chartType}-${label}-${gpuId}`);
        if (el) el.textContent = `${format(value)}${unit}`;
    }
}
document.getElementById(`stat-pcie-rx-max-${gpuId}`); 193 | const rxAvgEl = document.getElementById(`stat-pcie-rx-avg-${gpuId}`); 194 | 195 | if (rxCurrentEl) rxCurrentEl.textContent = formatBandwidth(statsRX.current); 196 | if (rxMinEl) rxMinEl.textContent = formatBandwidth(statsRX.min); 197 | if (rxMaxEl) rxMaxEl.textContent = formatBandwidth(statsRX.max); 198 | if (rxAvgEl) rxAvgEl.textContent = formatBandwidth(statsRX.avg); 199 | 200 | // Update TX stats 201 | const txCurrentEl = document.getElementById(`stat-pcie-tx-current-${gpuId}`); 202 | const txMinEl = document.getElementById(`stat-pcie-tx-min-${gpuId}`); 203 | const txMaxEl = document.getElementById(`stat-pcie-tx-max-${gpuId}`); 204 | const txAvgEl = document.getElementById(`stat-pcie-tx-avg-${gpuId}`); 205 | 206 | if (txCurrentEl) txCurrentEl.textContent = formatBandwidth(statsTX.current); 207 | if (txMinEl) txMinEl.textContent = formatBandwidth(statsTX.min); 208 | if (txMaxEl) txMaxEl.textContent = formatBandwidth(statsTX.max); 209 | if (txAvgEl) txAvgEl.textContent = formatBandwidth(statsTX.avg); 210 | } 211 | 212 | // Update mobile chart header value display 213 | function updateMobileChartValue(gpuId, chartType, value, unit) { 214 | const chartHeader = document.querySelector(`#chart-${chartType}-${gpuId}`)?.closest('.chart-container')?.querySelector('.chart-header'); 215 | if (chartHeader) { 216 | const formattedValue = chartType === 'efficiency' ? 
value.toFixed(2) : Math.round(value); 217 | chartHeader.setAttribute('data-value', `${formattedValue}${unit}`); 218 | } 219 | } 220 | 221 | // Update chart data 222 | function updateChart(gpuId, chartType, value, value2, value3, value4) { 223 | // Validate inputs 224 | if (!gpuId || !chartType) { 225 | console.warn('updateChart: Missing gpuId or chartType'); 226 | return; 227 | } 228 | 229 | if (!chartData[gpuId]) initGPUData(gpuId); 230 | 231 | const data = chartData[gpuId][chartType]; 232 | if (!data) { 233 | console.warn(`updateChart: Invalid chartType "${chartType}" for GPU ${gpuId}`); 234 | return; 235 | } 236 | 237 | const now = new Date().toLocaleTimeString(); 238 | 239 | data.labels.push(now); 240 | 241 | // Safe number conversion helper 242 | const safeNumber = (val) => { 243 | const num = Number(val); 244 | return (isFinite(num) && num >= 0) ? num : 0; 245 | }; 246 | 247 | // Handle multi-value charts 248 | if (chartType === 'clocks') { 249 | data.graphicsData.push(safeNumber(value)); 250 | data.smData.push(safeNumber(value2)); 251 | data.memoryData.push(safeNumber(value3)); 252 | } else if (chartType === 'pcie') { 253 | data.dataRX.push(safeNumber(value)); 254 | data.dataTX.push(safeNumber(value2)); 255 | } else if (chartType === 'appclocks') { 256 | data.dataGr.push(safeNumber(value)); 257 | data.dataMem.push(safeNumber(value2)); 258 | data.dataSM.push(safeNumber(value3)); 259 | data.dataVideo.push(safeNumber(value4)); 260 | } else { 261 | data.data.push(safeNumber(value)); 262 | } 263 | 264 | // Add threshold data based on chart type 265 | if (chartType === 'utilization') { 266 | data.thresholdData.push(80); // High load threshold at 80% 267 | } else if (chartType === 'temperature') { 268 | data.warningData.push(75); // Warning at 75°C 269 | data.dangerData.push(85); // Danger at 85°C 270 | } else if (chartType === 'memory') { 271 | data.thresholdData.push(90); // High usage at 90% 272 | } 273 | 274 | // Keep only last 120 data points (60 seconds at 
0.5s interval) 275 | if (data.labels.length > 120) { 276 | data.labels.shift(); 277 | if (data.data) data.data.shift(); 278 | if (data.graphicsData) data.graphicsData.shift(); 279 | if (data.smData) data.smData.shift(); 280 | if (data.memoryData) data.memoryData.shift(); 281 | if (data.dataRX) data.dataRX.shift(); 282 | if (data.dataTX) data.dataTX.shift(); 283 | if (data.dataGr) data.dataGr.shift(); 284 | if (data.dataMem) data.dataMem.shift(); 285 | if (data.dataSM) data.dataSM.shift(); 286 | if (data.dataVideo) data.dataVideo.shift(); 287 | if (data.thresholdData) data.thresholdData.shift(); 288 | if (data.warningData) data.warningData.shift(); 289 | if (data.dangerData) data.dangerData.shift(); 290 | } 291 | 292 | // Calculate and update statistics 293 | if (chartType === 'pcie') { 294 | // Handle PCIe separately - need stats for both RX and TX 295 | const statsRX = calculateStats(data.dataRX); 296 | const statsTX = calculateStats(data.dataTX); 297 | updatePCIeChartStats(gpuId, statsRX, statsTX); 298 | } else { 299 | let statsData = data.data; 300 | if (chartType === 'clocks') statsData = data.graphicsData; 301 | else if (chartType === 'appclocks') statsData = data.dataGr; 302 | 303 | const stats = calculateStats(statsData); 304 | const unitMap = { 305 | 'utilization': '%', 306 | 'util': '%', 307 | 'temperature': '°C', 308 | 'temp': '°C', 309 | 'memory': '%', 310 | 'power': 'W', 311 | 'fanSpeed': '%', 312 | 'clocks': ' MHz', 313 | 'efficiency': ' %/W', 314 | 'appclocks': ' MHz' 315 | }; 316 | const unit = unitMap[chartType] || ''; 317 | updateChartStats(gpuId, chartType, stats, unit); 318 | 319 | // Update mobile chart header with current value 320 | if (isMobile()) { 321 | updateMobileChartValue(gpuId, chartType, stats.current, unit); 322 | } 323 | } 324 | 325 | // Update chart if it exists with error handling 326 | if (charts[gpuId] && charts[gpuId][chartType]) { 327 | try { 328 | charts[gpuId][chartType].update('none'); 329 | } catch (error) { 330 | 
console.error(`Error updating chart ${chartType} for GPU ${gpuId}:`, error); 331 | } 332 | } 333 | } 334 | 335 | // Initialize utilization background chart 336 | function initUtilBackgroundChart(gpuId) { 337 | const canvas = document.getElementById(`util-bg-chart-${gpuId}`); 338 | if (!canvas) return; 339 | 340 | if (!charts[gpuId]) charts[gpuId] = {}; 341 | if (charts[gpuId].utilBackground) return; // Already initialized 342 | 343 | charts[gpuId].utilBackground = new Chart(canvas, { 344 | type: 'line', 345 | data: { 346 | labels: chartData[gpuId].utilization.labels, 347 | datasets: [{ 348 | data: chartData[gpuId].utilization.data, 349 | borderColor: 'rgba(79, 172, 254, 0.8)', 350 | backgroundColor: 'rgba(79, 172, 254, 0.3)', 351 | borderWidth: 2, 352 | tension: 0.4, 353 | fill: true, 354 | pointRadius: 0 355 | }] 356 | }, 357 | options: { 358 | responsive: true, 359 | maintainAspectRatio: false, 360 | animation: false, 361 | scales: { 362 | x: { display: false }, 363 | y: { display: false, min: 0, max: 100 } 364 | }, 365 | plugins: { 366 | legend: { display: false }, 367 | tooltip: { enabled: false } 368 | } 369 | } 370 | }); 371 | } 372 | 373 | // Initialize charts for a GPU 374 | function initGPUCharts(gpuId) { 375 | if (!gpuId) { 376 | console.warn('initGPUCharts: Missing gpuId'); 377 | return; 378 | } 379 | 380 | const chartTypes = ['utilization', 'temperature', 'memory', 'power', 'fanSpeed', 'clocks', 'efficiency', 'pcie', 'appclocks']; 381 | if (!charts[gpuId]) charts[gpuId] = {}; 382 | 383 | // Initialize background utilization chart 384 | initUtilBackgroundChart(gpuId); 385 | 386 | chartTypes.forEach(type => { 387 | const canvas = document.getElementById(`chart-${type}-${gpuId}`); 388 | if (!canvas) return; 389 | 390 | // Destroy existing chart to prevent memory leaks 391 | if (charts[gpuId][type]) { 392 | try { 393 | charts[gpuId][type].destroy(); 394 | } catch (error) { 395 | console.warn(`Error destroying existing chart ${type} for GPU ${gpuId}:`, 
error); 396 | } 397 | } 398 | 399 | if (canvas) { 400 | const config = JSON.parse(JSON.stringify(chartConfigs[type])); // Deep clone 401 | 402 | // Link datasets to chartData FIRST 403 | if (type === 'utilization') { 404 | config.data.datasets[0].data = chartData[gpuId][type].data; 405 | if (config.data.datasets[1]) config.data.datasets[1].data = chartData[gpuId][type].thresholdData; 406 | } else if (type === 'temperature') { 407 | config.data.datasets[0].data = chartData[gpuId][type].data; 408 | if (config.data.datasets[1]) config.data.datasets[1].data = chartData[gpuId][type].warningData; 409 | if (config.data.datasets[2]) config.data.datasets[2].data = chartData[gpuId][type].dangerData; 410 | } else if (type === 'memory') { 411 | config.data.datasets[0].data = chartData[gpuId][type].data; 412 | if (config.data.datasets[1]) config.data.datasets[1].data = chartData[gpuId][type].thresholdData; 413 | } else if (type === 'clocks') { 414 | config.data.datasets[0].data = chartData[gpuId][type].graphicsData; 415 | if (config.data.datasets[1]) config.data.datasets[1].data = chartData[gpuId][type].smData; 416 | if (config.data.datasets[2]) config.data.datasets[2].data = chartData[gpuId][type].memoryData; 417 | } else if (type === 'pcie') { 418 | config.data.datasets[0].data = chartData[gpuId][type].dataRX; 419 | if (config.data.datasets[1]) config.data.datasets[1].data = chartData[gpuId][type].dataTX; 420 | } else if (type === 'appclocks') { 421 | config.data.datasets[0].data = chartData[gpuId][type].dataGr; 422 | if (config.data.datasets[1]) config.data.datasets[1].data = chartData[gpuId][type].dataMem; 423 | if (config.data.datasets[2]) config.data.datasets[2].data = chartData[gpuId][type].dataSM; 424 | if (config.data.datasets[3]) config.data.datasets[3].data = chartData[gpuId][type].dataVideo; 425 | } else { 426 | config.data.datasets[0].data = chartData[gpuId][type].data; 427 | } 428 | 429 | config.data.labels = chartData[gpuId][type].labels; 430 | 431 | // Optimize 
dataset appearance for mobile (BEFORE applying options) 432 | if (isMobile() && config.data.datasets) { 433 | // Make first dataset prominent 434 | config.data.datasets[0].borderWidth = 3; 435 | config.data.datasets[0].pointRadius = 0; 436 | config.data.datasets[0].fill = true; 437 | 438 | // Hide other datasets by making them invisible (don't remove them!) 439 | for (let i = 1; i < config.data.datasets.length; i++) { 440 | config.data.datasets[i].hidden = true; 441 | config.data.datasets[i].borderWidth = 0; 442 | } 443 | } 444 | 445 | // Apply mobile optimizations to chart options 446 | config.options = getMobileChartOptions(config.options); 447 | 448 | // Ensure canvas has proper dimensions before creating chart 449 | const parent = canvas.parentElement; 450 | if (parent && parent.clientWidth > 0 && parent.clientHeight > 0) { 451 | // Set canvas dimensions to match container 452 | canvas.style.width = '100%'; 453 | canvas.style.height = '100%'; 454 | } 455 | 456 | // Create chart with error handling 457 | try { 458 | charts[gpuId][type] = new Chart(canvas, config); 459 | } catch (error) { 460 | console.error(`Error creating chart ${type} for GPU ${gpuId}:`, error); 461 | } 462 | } 463 | }); 464 | } 465 | 466 | // Initialize overview mini chart 467 | function initOverviewMiniChart(gpuId, currentValue) { 468 | if (!gpuId) { 469 | console.warn('initOverviewMiniChart: Missing gpuId'); 470 | return; 471 | } 472 | 473 | const canvas = document.getElementById(`overview-chart-${gpuId}`); 474 | if (!canvas) return; 475 | 476 | // Destroy existing chart to prevent memory leaks 477 | if (charts[gpuId] && charts[gpuId].overviewMini) { 478 | try { 479 | charts[gpuId].overviewMini.destroy(); 480 | } catch (error) { 481 | console.warn(`Error destroying existing overview chart for GPU ${gpuId}:`, error); 482 | } 483 | } 484 | 485 | // Initialize with current utilization value if not already initialized 486 | if (!chartData[gpuId]) { 487 | initGPUData(gpuId, { utilization: 
currentValue }); 488 | } 489 | 490 | // Mobile-specific configuration for mini charts 491 | const fontSize = isMobile() ? 8 : 10; 492 | const yAxisDisplay = !isMobile() || window.innerWidth > 480; 493 | 494 | const config = { 495 | type: 'line', 496 | data: { 497 | labels: chartData[gpuId].utilization.labels, 498 | datasets: [{ 499 | data: chartData[gpuId].utilization.data, 500 | borderColor: '#4facfe', 501 | backgroundColor: 'rgba(79, 172, 254, 0.15)', 502 | borderWidth: isMobile() ? 2 : 2.5, 503 | tension: 0.4, 504 | fill: true, 505 | pointRadius: 0, 506 | pointHoverRadius: 3 507 | }] 508 | }, 509 | options: { 510 | responsive: true, 511 | maintainAspectRatio: false, 512 | animation: false, // Disable animations for overview charts 513 | interaction: { mode: 'index', intersect: false }, 514 | scales: { 515 | x: { display: false }, 516 | y: { 517 | min: 0, 518 | max: 100, 519 | display: yAxisDisplay, 520 | grid: { 521 | color: 'rgba(255, 255, 255, 0.08)', 522 | drawBorder: false 523 | }, 524 | ticks: { 525 | color: 'rgba(255, 255, 255, 0.4)', 526 | font: { size: fontSize }, 527 | stepSize: 50, 528 | callback: value => value + '%' 529 | } 530 | } 531 | }, 532 | plugins: { 533 | legend: { display: false }, 534 | tooltip: { 535 | enabled: true, 536 | backgroundColor: 'rgba(0, 0, 0, 0.9)', 537 | padding: isMobile() ? 8 : 12, 538 | cornerRadius: 8, 539 | titleFont: { size: isMobile() ? 11 : 12 }, 540 | bodyFont: { size: isMobile() ? 
10 : 11 }, 541 | callbacks: { 542 | label: context => `GPU: ${context.parsed.y.toFixed(1)}%` 543 | } 544 | } 545 | } 546 | } 547 | }; 548 | 549 | if (!charts[gpuId]) charts[gpuId] = {}; 550 | 551 | try { 552 | charts[gpuId].overviewMini = new Chart(canvas, config); 553 | } catch (error) { 554 | console.error(`Error creating overview mini chart for GPU ${gpuId}:`, error); 555 | } 556 | } 557 | 558 | // System charts 559 | const systemCharts = {}; 560 | const systemData = { 561 | cpu: { labels: [], data: [] }, 562 | memory: { labels: [], data: [] } 563 | }; 564 | 565 | // Initialize system charts 566 | function initSystemCharts() { 567 | const cpuCanvas = document.getElementById('cpu-chart'); 568 | const memCanvas = document.getElementById('memory-chart'); 569 | 570 | if (cpuCanvas && !systemCharts.cpu) { 571 | systemCharts.cpu = new Chart(cpuCanvas, { 572 | type: 'line', 573 | data: { 574 | labels: systemData.cpu.labels, 575 | datasets: [{ 576 | data: systemData.cpu.data, 577 | borderColor: 'rgba(79, 172, 254, 0.8)', 578 | backgroundColor: 'rgba(79, 172, 254, 0.2)', 579 | borderWidth: 2, 580 | tension: 0.4, 581 | fill: true, 582 | pointRadius: 0 583 | }] 584 | }, 585 | options: { 586 | responsive: true, 587 | maintainAspectRatio: false, 588 | animation: false, 589 | scales: { 590 | x: { display: false }, 591 | y: { display: false, min: 0, max: 100 } 592 | }, 593 | plugins: { 594 | legend: { display: false }, 595 | tooltip: { enabled: false } 596 | } 597 | } 598 | }); 599 | } 600 | 601 | if (memCanvas && !systemCharts.memory) { 602 | systemCharts.memory = new Chart(memCanvas, { 603 | type: 'line', 604 | data: { 605 | labels: systemData.memory.labels, 606 | datasets: [{ 607 | data: systemData.memory.data, 608 | borderColor: 'rgba(79, 172, 254, 0.8)', 609 | backgroundColor: 'rgba(79, 172, 254, 0.2)', 610 | borderWidth: 2, 611 | tension: 0.4, 612 | fill: true, 613 | pointRadius: 0 614 | }] 615 | }, 616 | options: { 617 | responsive: true, 618 | maintainAspectRatio: 
false, 619 | animation: false, 620 | scales: { 621 | x: { display: false }, 622 | y: { display: false, min: 0, max: 100 } 623 | }, 624 | plugins: { 625 | legend: { display: false }, 626 | tooltip: { enabled: false } 627 | } 628 | } 629 | }); 630 | } 631 | } 632 | 633 | // Update system info with sparklines 634 | function updateSystemInfo(systemInfo) { 635 | const cpuEl = document.getElementById('cpu-usage'); 636 | const memEl = document.getElementById('memory-usage'); 637 | 638 | if (cpuEl) cpuEl.textContent = `${Math.round(systemInfo.cpu_percent)}%`; 639 | if (memEl) memEl.textContent = `${Math.round(systemInfo.memory_percent)}%`; 640 | 641 | // Update system chart data 642 | const now = new Date().toLocaleTimeString(); 643 | 644 | systemData.cpu.labels.push(now); 645 | systemData.cpu.data.push(systemInfo.cpu_percent); 646 | systemData.memory.labels.push(now); 647 | systemData.memory.data.push(systemInfo.memory_percent); 648 | 649 | // Keep only last 120 points (60 seconds at 0.5s interval) 650 | if (systemData.cpu.labels.length > 120) { 651 | systemData.cpu.labels.shift(); 652 | systemData.cpu.data.shift(); 653 | systemData.memory.labels.shift(); 654 | systemData.memory.data.shift(); 655 | } 656 | 657 | // Initialize charts if needed 658 | if (!systemCharts.cpu || !systemCharts.memory) { 659 | initSystemCharts(); 660 | } 661 | 662 | // Update charts 663 | if (systemCharts.cpu) systemCharts.cpu.update('none'); 664 | if (systemCharts.memory) systemCharts.memory.update('none'); 665 | } 666 | 667 | --------------------------------------------------------------------------------